From dfd402a4c4baae42398ce9180ff424d589b8bffc Mon Sep 17 00:00:00 2001 From: Marco Elver Date: Thu, 14 Nov 2019 19:02:54 +0100 Subject: kcsan: Add Kernel Concurrency Sanitizer infrastructure Kernel Concurrency Sanitizer (KCSAN) is a dynamic data-race detector for kernel space. KCSAN is a sampling watchpoint-based data-race detector. See the included Documentation/dev-tools/kcsan.rst for more details. This patch adds basic infrastructure, but does not yet enable KCSAN for any architecture. Signed-off-by: Marco Elver Acked-by: Paul E. McKenney Signed-off-by: Paul E. McKenney --- include/linux/compiler-clang.h | 9 ++++ include/linux/compiler-gcc.h | 7 +++ include/linux/compiler.h | 37 ++++++++++--- include/linux/kcsan-checks.h | 97 ++++++++++++++++++++++++++++++++++ include/linux/kcsan.h | 115 +++++++++++++++++++++++++++++++++++++++++ include/linux/sched.h | 4 ++ 6 files changed, 261 insertions(+), 8 deletions(-) create mode 100644 include/linux/kcsan-checks.h create mode 100644 include/linux/kcsan.h (limited to 'include/linux') diff --git a/include/linux/compiler-clang.h b/include/linux/compiler-clang.h index 333a6695a918..a213eb55e725 100644 --- a/include/linux/compiler-clang.h +++ b/include/linux/compiler-clang.h @@ -24,6 +24,15 @@ #define __no_sanitize_address #endif +#if __has_feature(thread_sanitizer) +/* emulate gcc's __SANITIZE_THREAD__ flag */ +#define __SANITIZE_THREAD__ +#define __no_sanitize_thread \ + __attribute__((no_sanitize("thread"))) +#else +#define __no_sanitize_thread +#endif + /* * Not all versions of clang implement the the type-generic versions * of the builtin overflow checkers. Fortunately, clang implements diff --git a/include/linux/compiler-gcc.h b/include/linux/compiler-gcc.h index d7ee4c6bad48..0eb2a1cc411d 100644 --- a/include/linux/compiler-gcc.h +++ b/include/linux/compiler-gcc.h @@ -145,6 +145,13 @@ #define __no_sanitize_address #endif +#if defined(__SANITIZE_THREAD__) && __has_attribute(__no_sanitize_thread__) +#define __no_sanitize_thread \ + __attribute__((__noinline__)) __attribute__((no_sanitize_thread)) +#else +#define __no_sanitize_thread +#endif + #if GCC_VERSION >= 50100 #define COMPILER_HAS_GENERIC_BUILTIN_OVERFLOW 1 #endif diff --git a/include/linux/compiler.h b/include/linux/compiler.h index 5e88e7e33abe..c42fa83f23fb 100644 --- a/include/linux/compiler.h +++ b/include/linux/compiler.h @@ -178,6 +178,7 @@ void ftrace_likely_update(struct ftrace_likely_data *f, int val, #endif #include +#include #define __READ_ONCE_SIZE \ ({ \ @@ -193,12 +194,6 @@ void ftrace_likely_update(struct ftrace_likely_data *f, int val, } \ }) -static __always_inline -void __read_once_size(const volatile void *p, void *res, int size) -{ - __READ_ONCE_SIZE; -} - #ifdef CONFIG_KASAN /* * We can't declare function 'inline' because __no_sanitize_address confilcts @@ -207,18 +202,44 @@ void __read_once_size(const volatile void *p, void *res, int size) * '__maybe_unused' allows us to avoid defined-but-not-used warnings. */ # define __no_kasan_or_inline __no_sanitize_address notrace __maybe_unused +# define __no_sanitize_or_inline __no_kasan_or_inline #else # define __no_kasan_or_inline __always_inline #endif -static __no_kasan_or_inline +#ifdef __SANITIZE_THREAD__ +/* + * Rely on __SANITIZE_THREAD__ instead of CONFIG_KCSAN, to avoid not inlining in + * compilation units where instrumentation is disabled. 
+ */ +# define __no_kcsan_or_inline __no_sanitize_thread notrace __maybe_unused +# define __no_sanitize_or_inline __no_kcsan_or_inline +#else +# define __no_kcsan_or_inline __always_inline +#endif + +#ifndef __no_sanitize_or_inline +#define __no_sanitize_or_inline __always_inline +#endif + +static __no_kcsan_or_inline +void __read_once_size(const volatile void *p, void *res, int size) +{ + kcsan_check_atomic_read(p, size); + __READ_ONCE_SIZE; +} + +static __no_sanitize_or_inline void __read_once_size_nocheck(const volatile void *p, void *res, int size) { __READ_ONCE_SIZE; } -static __always_inline void __write_once_size(volatile void *p, void *res, int size) +static __no_kcsan_or_inline +void __write_once_size(volatile void *p, void *res, int size) { + kcsan_check_atomic_write(p, size); + switch (size) { case 1: *(volatile __u8 *)p = *(__u8 *)res; break; case 2: *(volatile __u16 *)p = *(__u16 *)res; break; diff --git a/include/linux/kcsan-checks.h b/include/linux/kcsan-checks.h new file mode 100644 index 000000000000..e78220661086 --- /dev/null +++ b/include/linux/kcsan-checks.h @@ -0,0 +1,97 @@ +/* SPDX-License-Identifier: GPL-2.0 */ + +#ifndef _LINUX_KCSAN_CHECKS_H +#define _LINUX_KCSAN_CHECKS_H + +#include + +/* + * Access type modifiers. + */ +#define KCSAN_ACCESS_WRITE 0x1 +#define KCSAN_ACCESS_ATOMIC 0x2 + +/* + * __kcsan_*: Always calls into runtime when KCSAN is enabled. This may be used + * even in compilation units that selectively disable KCSAN, but must use KCSAN + * to validate access to an address. Never use these in header files! + */ +#ifdef CONFIG_KCSAN +/** + * __kcsan_check_access - check generic access for data race + * + * @ptr address of access + * @size size of access + * @type access type modifier + */ +void __kcsan_check_access(const volatile void *ptr, size_t size, int type); + +#else +static inline void __kcsan_check_access(const volatile void *ptr, size_t size, + int type) { } +#endif + +/* + * kcsan_*: Only calls into runtime when the particular compilation unit has + * KCSAN instrumentation enabled. May be used in header files. + */ +#ifdef __SANITIZE_THREAD__ +#define kcsan_check_access __kcsan_check_access +#else +static inline void kcsan_check_access(const volatile void *ptr, size_t size, + int type) { } +#endif + +/** + * __kcsan_check_read - check regular read access for data races + * + * @ptr address of access + * @size size of access + */ +#define __kcsan_check_read(ptr, size) __kcsan_check_access(ptr, size, 0) + +/** + * __kcsan_check_write - check regular write access for data races + * + * @ptr address of access + * @size size of access + */ +#define __kcsan_check_write(ptr, size) \ + __kcsan_check_access(ptr, size, KCSAN_ACCESS_WRITE) + +/** + * kcsan_check_read - check regular read access for data races + * + * @ptr address of access + * @size size of access + */ +#define kcsan_check_read(ptr, size) kcsan_check_access(ptr, size, 0) + +/** + * kcsan_check_write - check regular write access for data races + * + * @ptr address of access + * @size size of access + */ +#define kcsan_check_write(ptr, size) \ + kcsan_check_access(ptr, size, KCSAN_ACCESS_WRITE) + +/* + * Check for atomic accesses: if atomic access are not ignored, this simply + * aliases to kcsan_check_access, otherwise becomes a no-op. + */ +#ifdef CONFIG_KCSAN_IGNORE_ATOMICS +#define kcsan_check_atomic_read(...) \ + do { \ + } while (0) +#define kcsan_check_atomic_write(...) 
\ + do { \ + } while (0) +#else +#define kcsan_check_atomic_read(ptr, size) \ + kcsan_check_access(ptr, size, KCSAN_ACCESS_ATOMIC) +#define kcsan_check_atomic_write(ptr, size) \ + kcsan_check_access(ptr, size, KCSAN_ACCESS_ATOMIC | KCSAN_ACCESS_WRITE) +#endif + +#endif /* _LINUX_KCSAN_CHECKS_H */ diff --git a/include/linux/kcsan.h b/include/linux/kcsan.h new file mode 100644 index 000000000000..9047048fee84 --- /dev/null +++ b/include/linux/kcsan.h @@ -0,0 +1,115 @@ +/* SPDX-License-Identifier: GPL-2.0 */ + +#ifndef _LINUX_KCSAN_H +#define _LINUX_KCSAN_H + +#include +#include + +#ifdef CONFIG_KCSAN + +/* + * Context for each thread of execution: for tasks, this is stored in + * task_struct, and interrupts access internal per-CPU storage. + */ +struct kcsan_ctx { + int disable_count; /* disable counter */ + int atomic_next; /* number of following atomic ops */ + + /* + * We distinguish between: (a) nestable atomic regions that may contain + * other nestable regions; and (b) flat atomic regions that do not keep + * track of nesting. Both (a) and (b) are entirely independent of each + * other, and a flat region may be started in a nestable region or + * vice-versa. + * + * This is required because, for example, in the annotations for + * seqlocks, we declare seqlock writer critical sections as (a) nestable + * atomic regions, but reader critical sections as (b) flat atomic + * regions, but have encountered cases where seqlock reader critical + * sections are contained within writer critical sections (the opposite + * may be possible, too). + * + * To support these cases, we independently track the depth of nesting + * for (a), and whether the leaf level is flat for (b). + */ + int atomic_nest_count; + bool in_flat_atomic; +}; + +/** + * kcsan_init - initialize KCSAN runtime + */ +void kcsan_init(void); + +/** + * kcsan_disable_current - disable KCSAN for the current context + * + * Supports nesting. + */ +void kcsan_disable_current(void); + +/** + * kcsan_enable_current - re-enable KCSAN for the current context + * + * Supports nesting. + */ +void kcsan_enable_current(void); + +/** + * kcsan_nestable_atomic_begin - begin nestable atomic region + * + * Accesses within the atomic region may appear to race with other accesses but + * should be considered atomic. + */ +void kcsan_nestable_atomic_begin(void); + +/** + * kcsan_nestable_atomic_end - end nestable atomic region + */ +void kcsan_nestable_atomic_end(void); + +/** + * kcsan_flat_atomic_begin - begin flat atomic region + * + * Accesses within the atomic region may appear to race with other accesses but + * should be considered atomic. + */ +void kcsan_flat_atomic_begin(void); + +/** + * kcsan_flat_atomic_end - end flat atomic region + */ +void kcsan_flat_atomic_end(void); + +/** + * kcsan_atomic_next - consider following accesses as atomic + * + * Force treating the next n memory accesses for the current context as atomic + * operations. + * + * @n number of following memory accesses to treat as atomic. 
+ */ +void kcsan_atomic_next(int n); + +#else /* CONFIG_KCSAN */ + +static inline void kcsan_init(void) { } + +static inline void kcsan_disable_current(void) { } + +static inline void kcsan_enable_current(void) { } + +static inline void kcsan_nestable_atomic_begin(void) { } + +static inline void kcsan_nestable_atomic_end(void) { } + +static inline void kcsan_flat_atomic_begin(void) { } + +static inline void kcsan_flat_atomic_end(void) { } + +static inline void kcsan_atomic_next(int n) { } + +#endif /* CONFIG_KCSAN */ + +#endif /* _LINUX_KCSAN_H */ diff --git a/include/linux/sched.h b/include/linux/sched.h index 67a1d86981a9..ae4f341c1db4 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -31,6 +31,7 @@ #include #include #include +#include /* task_struct member predeclarations (sorted alphabetically): */ struct audit_context; @@ -1172,6 +1173,9 @@ struct task_struct { #ifdef CONFIG_KASAN unsigned int kasan_depth; #endif +#ifdef CONFIG_KCSAN + struct kcsan_ctx kcsan_ctx; +#endif #ifdef CONFIG_FUNCTION_GRAPH_TRACER /* Index of current stored address in ret_stack: */ -- cgit v1.2.3 From c48981eeb0d56e107691df590007d6699441a689 Mon Sep 17 00:00:00 2001 From: Marco Elver Date: Thu, 14 Nov 2019 19:02:55 +0100 Subject: include/linux/compiler.h: Introduce data_race(expr) macro This introduces the data_race(expr) macro, which can be used to annotate expressions for purposes of (1) documenting, and (2) giving tooling such as KCSAN information about which data races are deemed "safe". More context: http://lkml.kernel.org/r/CAHk-=wg5CkOEF8DTez1Qu0XTEFw_oHhxN98bDnFqbY7HL5AB2g@mail.gmail.com Signed-off-by: Marco Elver Cc: Alan Stern Cc: Eric Dumazet Cc: Linus Torvalds Cc: Paul E. McKenney Signed-off-by: Paul E. McKenney --- include/linux/compiler.h | 20 ++++++++++++++++++++ 1 file changed, 20 insertions(+) (limited to 'include/linux') diff --git a/include/linux/compiler.h b/include/linux/compiler.h index c42fa83f23fb..7d3e77781578 100644 --- a/include/linux/compiler.h +++ b/include/linux/compiler.h @@ -310,6 +310,26 @@ unsigned long read_word_at_a_time(const void *addr) __u.__val; \ }) +#include + +/* + * data_race: macro to document that accesses in an expression may conflict with + * other concurrent accesses resulting in data races, but the resulting + * behaviour is deemed safe regardless. + * + * This macro *does not* affect normal code generation, but is a hint to tooling + * that data races here should be ignored. + */ +#define data_race(expr) \ + ({ \ + typeof(({ expr; })) __val; \ + kcsan_nestable_atomic_begin(); \ + __val = ({ expr; }); \ + kcsan_nestable_atomic_end(); \ + __val; \ + }) +#else + #endif /* __KERNEL__ */ /* -- cgit v1.2.3 From 88ecd153be9519f259b87a9f6f4c8383a8b3bbf1 Mon Sep 17 00:00:00 2001 From: Marco Elver Date: Thu, 14 Nov 2019 19:02:59 +0100 Subject: seqlock, kcsan: Add annotations for KCSAN Since seqlocks in the Linux kernel do not require the use of marked atomic accesses in critical sections, we teach KCSAN to assume such accesses are atomic. KCSAN currently also pretends that writes to `sequence` are atomic, although currently plain writes are used (their corresponding reads are READ_ONCE). Further, to avoid false positives in the absence of clear ending of a seqlock reader critical section (only when using the raw interface), KCSAN assumes a fixed number of accesses after start of a seqlock critical section are atomic. 
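For illustration, a minimal sketch (not part of the patch) of the balanced reader pattern this heuristic is built around; foo_seq, foo, and read_foo are placeholder names:

	#include <linux/seqlock.h>

	static seqcount_t foo_seq = SEQCNT_ZERO(foo_seq);
	static int foo;	/* updated in a corresponding write-side critical section */

	static int read_foo(void)
	{
		unsigned seq;
		int snap;

		do {
			/* Marks the next KCSAN_SEQLOCK_REGION_MAX accesses as atomic. */
			seq = read_seqcount_begin(&foo_seq);
			snap = foo;	/* plain read, no race reported inside the region */
		} while (read_seqcount_retry(&foo_seq, seq));	/* closes region via kcsan_atomic_next(0) */

		return snap;
	}

Readers that never issue the closing read_seqcount_retry() leave the region open, which is why it is capped at KCSAN_SEQLOCK_REGION_MAX accesses rather than left unbounded.
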
=== Commentary on design around absence of clear begin/end markings === Seqlock usage via seqlock_t follows a predictable usage pattern, where clear critical section begin/end is enforced. With subtle special cases for readers needing to be flat atomic regions, e.g. because usage such as in: - fs/namespace.c:__legitimize_mnt - unbalanced read_seqretry - fs/dcache.c:d_walk - unbalanced need_seqretry But, anything directly accessing seqcount_t seems to be unpredictable. Filtering for usage of read_seqcount_retry not following 'do { .. } while (read_seqcount_retry(..));': $ git grep 'read_seqcount_retry' | grep -Ev 'while \(|seqlock.h|Doc|\* ' => about 1/3 of the total read_seqcount_retry usage. Just looking at fs/namei.c, we conclude that it is non-trivial to prescribe and migrate to an interface that would force clear begin/end seqlock markings for critical sections. As such, we concluded that the best design currently, is to simply ensure that KCSAN works well with the existing code. Signed-off-by: Marco Elver Acked-by: Paul E. McKenney Signed-off-by: Paul E. McKenney --- include/linux/seqlock.h | 40 ++++++++++++++++++++++++++++++++++++++-- 1 file changed, 38 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/seqlock.h b/include/linux/seqlock.h index bcf4cf26b8c8..61232bc223fd 100644 --- a/include/linux/seqlock.h +++ b/include/linux/seqlock.h @@ -37,8 +37,24 @@ #include #include #include +#include #include +/* + * The seqlock interface does not prescribe a precise sequence of read + * begin/retry/end. For readers, typically there is a call to + * read_seqcount_begin() and read_seqcount_retry(), however, there are more + * esoteric cases which do not follow this pattern. + * + * As a consequence, we take the following best-effort approach for raw usage + * via seqcount_t under KCSAN: upon beginning a seq-reader critical section, + * pessimistically mark then next KCSAN_SEQLOCK_REGION_MAX memory accesses as + * atomics; if there is a matching read_seqcount_retry() call, no following + * memory operations are considered atomic. Usage of seqlocks via seqlock_t + * interface is not affected. + */ +#define KCSAN_SEQLOCK_REGION_MAX 1000 + /* * Version using sequence counter only. 
* This can be used when code has its own mutex protecting the @@ -115,6 +131,7 @@ repeat: cpu_relax(); goto repeat; } + kcsan_atomic_next(KCSAN_SEQLOCK_REGION_MAX); return ret; } @@ -131,6 +148,7 @@ static inline unsigned raw_read_seqcount(const seqcount_t *s) { unsigned ret = READ_ONCE(s->sequence); smp_rmb(); + kcsan_atomic_next(KCSAN_SEQLOCK_REGION_MAX); return ret; } @@ -183,6 +201,7 @@ static inline unsigned raw_seqcount_begin(const seqcount_t *s) { unsigned ret = READ_ONCE(s->sequence); smp_rmb(); + kcsan_atomic_next(KCSAN_SEQLOCK_REGION_MAX); return ret & ~1; } @@ -202,7 +221,8 @@ static inline unsigned raw_seqcount_begin(const seqcount_t *s) */ static inline int __read_seqcount_retry(const seqcount_t *s, unsigned start) { - return unlikely(s->sequence != start); + kcsan_atomic_next(0); + return unlikely(READ_ONCE(s->sequence) != start); } /** @@ -225,6 +245,7 @@ static inline int read_seqcount_retry(const seqcount_t *s, unsigned start) static inline void raw_write_seqcount_begin(seqcount_t *s) { + kcsan_nestable_atomic_begin(); s->sequence++; smp_wmb(); } @@ -233,6 +254,7 @@ static inline void raw_write_seqcount_end(seqcount_t *s) { smp_wmb(); s->sequence++; + kcsan_nestable_atomic_end(); } /** @@ -271,9 +293,11 @@ static inline void raw_write_seqcount_end(seqcount_t *s) */ static inline void raw_write_seqcount_barrier(seqcount_t *s) { + kcsan_nestable_atomic_begin(); s->sequence++; smp_wmb(); s->sequence++; + kcsan_nestable_atomic_end(); } static inline int raw_read_seqcount_latch(seqcount_t *s) @@ -398,7 +422,9 @@ static inline void write_seqcount_end(seqcount_t *s) static inline void write_seqcount_invalidate(seqcount_t *s) { smp_wmb(); + kcsan_nestable_atomic_begin(); s->sequence+=2; + kcsan_nestable_atomic_end(); } typedef struct { @@ -430,11 +456,21 @@ typedef struct { */ static inline unsigned read_seqbegin(const seqlock_t *sl) { - return read_seqcount_begin(&sl->seqcount); + unsigned ret = read_seqcount_begin(&sl->seqcount); + + kcsan_atomic_next(0); /* non-raw usage, assume closing read_seqretry */ + kcsan_flat_atomic_begin(); + return ret; } static inline unsigned read_seqretry(const seqlock_t *sl, unsigned start) { + /* + * Assume not nested: read_seqretry may be called multiple times when + * completing read critical section. + */ + kcsan_flat_atomic_end(); + return read_seqcount_retry(&sl->seqcount, start); } -- cgit v1.2.3 From bf07132f96d426bcbf2098227fb680915cf44498 Mon Sep 17 00:00:00 2001 From: Marco Elver Date: Thu, 14 Nov 2019 19:03:00 +0100 Subject: seqlock: Require WRITE_ONCE surrounding raw_seqcount_barrier This patch proposes to require marked atomic accesses surrounding raw_write_seqcount_barrier. We reason that otherwise there is no way to guarantee propagation nor atomicity of writes before/after the barrier [1]. For example, consider the compiler tears stores either before or after the barrier; in this case, readers may observe a partial value, and because readers are unaware that writes are going on (writes are not in a seq-writer critical section), will complete the seq-reader critical section while having observed some partial state. [1] https://lwn.net/Articles/793253/ This came up when designing and implementing KCSAN, because KCSAN would flag these accesses as data-races. After careful analysis, our reasoning as above led us to conclude that the best thing to do is to propose an amendment to the raw_seqcount_barrier usage. Signed-off-by: Marco Elver Acked-by: Paul E. McKenney Signed-off-by: Paul E. 
McKenney --- include/linux/seqlock.h | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/seqlock.h b/include/linux/seqlock.h index 61232bc223fd..f52c91be8939 100644 --- a/include/linux/seqlock.h +++ b/include/linux/seqlock.h @@ -265,6 +265,13 @@ static inline void raw_write_seqcount_end(seqcount_t *s) * usual consistency guarantee. It is one wmb cheaper, because we can * collapse the two back-to-back wmb()s. * + * Note that, writes surrounding the barrier should be declared atomic (e.g. + * via WRITE_ONCE): a) to ensure the writes become visible to other threads + * atomically, avoiding compiler optimizations; b) to document which writes are + * meant to propagate to the reader critical section. This is necessary because + * neither writes before and after the barrier are enclosed in a seq-writer + * critical section that would ensure readers are aware of ongoing writes. + * * seqcount_t seq; * bool X = true, Y = false; * @@ -284,11 +291,11 @@ static inline void raw_write_seqcount_end(seqcount_t *s) * * void write(void) * { - * Y = true; + * WRITE_ONCE(Y, true); * * raw_write_seqcount_barrier(seq); * - * X = false; + * WRITE_ONCE(X, false); * } */ static inline void raw_write_seqcount_barrier(seqcount_t *s) -- cgit v1.2.3 From 5cbaefe9743bf14c9d3106db0cc19f8cb0a3ca22 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Wed, 20 Nov 2019 10:41:43 +0100 Subject: kcsan: Improve various small stylistic details Tidy up a few bits: - Fix typos and grammar, improve wording. - Remove spurious newlines that are col80 warning artifacts where the resulting line-break is worse than the disease it's curing. - Use core kernel coding style to improve readability and reduce spurious code pattern variations. - Use better vertical alignment for structure definitions and initialization sequences. - Misc other small details. No change in functionality intended. Cc: linux-kernel@vger.kernel.org Cc: Marco Elver Cc: Paul E. McKenney Cc: Peter Zijlstra Cc: Linus Torvalds Cc: Andrew Morton Cc: Thomas Gleixner Cc: Paul E. 
McKenney Cc: Will Deacon Signed-off-by: Ingo Molnar --- arch/x86/Kconfig | 2 +- include/linux/compiler-clang.h | 2 +- include/linux/compiler.h | 2 +- include/linux/kcsan-checks.h | 22 ++++++--------- include/linux/kcsan.h | 23 ++++++---------- include/linux/seqlock.h | 8 +++--- kernel/kcsan/atomic.h | 2 +- kernel/kcsan/core.c | 59 ++++++++++++++++++---------------------- kernel/kcsan/debugfs.c | 62 ++++++++++++++++++++---------------------- kernel/kcsan/encoding.h | 25 +++++++++-------- kernel/kcsan/kcsan.h | 11 ++++---- kernel/kcsan/report.c | 42 ++++++++++++++-------------- kernel/kcsan/test.c | 6 ++-- kernel/sched/Makefile | 2 +- lib/Kconfig.kcsan | 16 +++++------ 15 files changed, 131 insertions(+), 153 deletions(-) (limited to 'include/linux') diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 9933ca8ffe16..9cfa4a5c6f42 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -226,7 +226,7 @@ config X86 select VIRT_TO_BUS select X86_FEATURE_NAMES if PROC_FS select PROC_PID_ARCH_STATUS if PROC_FS - select HAVE_ARCH_KCSAN if X86_64 + select HAVE_ARCH_KCSAN if X86_64 config INSTRUCTION_DECODER def_bool y diff --git a/include/linux/compiler-clang.h b/include/linux/compiler-clang.h index a213eb55e725..2cb42d8bdedc 100644 --- a/include/linux/compiler-clang.h +++ b/include/linux/compiler-clang.h @@ -16,7 +16,7 @@ #define KASAN_ABI_VERSION 5 #if __has_feature(address_sanitizer) || __has_feature(hwaddress_sanitizer) -/* emulate gcc's __SANITIZE_ADDRESS__ flag */ +/* Emulate GCC's __SANITIZE_ADDRESS__ flag */ #define __SANITIZE_ADDRESS__ #define __no_sanitize_address \ __attribute__((no_sanitize("address", "hwaddress"))) diff --git a/include/linux/compiler.h b/include/linux/compiler.h index 7d3e77781578..ad8c76144a3c 100644 --- a/include/linux/compiler.h +++ b/include/linux/compiler.h @@ -313,7 +313,7 @@ unsigned long read_word_at_a_time(const void *addr) #include /* - * data_race: macro to document that accesses in an expression may conflict with + * data_race(): macro to document that accesses in an expression may conflict with * other concurrent accesses resulting in data races, but the resulting * behaviour is deemed safe regardless. * diff --git a/include/linux/kcsan-checks.h b/include/linux/kcsan-checks.h index e78220661086..ef3ee233a3fa 100644 --- a/include/linux/kcsan-checks.h +++ b/include/linux/kcsan-checks.h @@ -8,17 +8,17 @@ /* * Access type modifiers. */ -#define KCSAN_ACCESS_WRITE 0x1 +#define KCSAN_ACCESS_WRITE 0x1 #define KCSAN_ACCESS_ATOMIC 0x2 /* - * __kcsan_*: Always calls into runtime when KCSAN is enabled. This may be used + * __kcsan_*: Always calls into the runtime when KCSAN is enabled. This may be used * even in compilation units that selectively disable KCSAN, but must use KCSAN - * to validate access to an address. Never use these in header files! + * to validate access to an address. Never use these in header files! */ #ifdef CONFIG_KCSAN /** - * __kcsan_check_access - check generic access for data race + * __kcsan_check_access - check generic access for data races * * @ptr address of access * @size size of access @@ -32,7 +32,7 @@ static inline void __kcsan_check_access(const volatile void *ptr, size_t size, #endif /* - * kcsan_*: Only calls into runtime when the particular compilation unit has + * kcsan_*: Only calls into the runtime when the particular compilation unit has * KCSAN instrumentation enabled. May be used in header files. 
*/ #ifdef __SANITIZE_THREAD__ @@ -77,16 +77,12 @@ static inline void kcsan_check_access(const volatile void *ptr, size_t size, kcsan_check_access(ptr, size, KCSAN_ACCESS_WRITE) /* - * Check for atomic accesses: if atomic access are not ignored, this simply - * aliases to kcsan_check_access, otherwise becomes a no-op. + * Check for atomic accesses: if atomic accesses are not ignored, this simply + * aliases to kcsan_check_access(), otherwise becomes a no-op. */ #ifdef CONFIG_KCSAN_IGNORE_ATOMICS -#define kcsan_check_atomic_read(...) \ - do { \ - } while (0) -#define kcsan_check_atomic_write(...) \ - do { \ - } while (0) +#define kcsan_check_atomic_read(...) do { } while (0) +#define kcsan_check_atomic_write(...) do { } while (0) #else #define kcsan_check_atomic_read(ptr, size) \ kcsan_check_access(ptr, size, KCSAN_ACCESS_ATOMIC) diff --git a/include/linux/kcsan.h b/include/linux/kcsan.h index 9047048fee84..1019e3a2c689 100644 --- a/include/linux/kcsan.h +++ b/include/linux/kcsan.h @@ -94,21 +94,14 @@ void kcsan_atomic_next(int n); #else /* CONFIG_KCSAN */ -static inline void kcsan_init(void) { } - -static inline void kcsan_disable_current(void) { } - -static inline void kcsan_enable_current(void) { } - -static inline void kcsan_nestable_atomic_begin(void) { } - -static inline void kcsan_nestable_atomic_end(void) { } - -static inline void kcsan_flat_atomic_begin(void) { } - -static inline void kcsan_flat_atomic_end(void) { } - -static inline void kcsan_atomic_next(int n) { } +static inline void kcsan_init(void) { } +static inline void kcsan_disable_current(void) { } +static inline void kcsan_enable_current(void) { } +static inline void kcsan_nestable_atomic_begin(void) { } +static inline void kcsan_nestable_atomic_end(void) { } +static inline void kcsan_flat_atomic_begin(void) { } +static inline void kcsan_flat_atomic_end(void) { } +static inline void kcsan_atomic_next(int n) { } #endif /* CONFIG_KCSAN */ diff --git a/include/linux/seqlock.h b/include/linux/seqlock.h index f52c91be8939..f80d50cac199 100644 --- a/include/linux/seqlock.h +++ b/include/linux/seqlock.h @@ -48,7 +48,7 @@ * * As a consequence, we take the following best-effort approach for raw usage * via seqcount_t under KCSAN: upon beginning a seq-reader critical section, - * pessimistically mark then next KCSAN_SEQLOCK_REGION_MAX memory accesses as + * pessimistically mark the next KCSAN_SEQLOCK_REGION_MAX memory accesses as * atomics; if there is a matching read_seqcount_retry() call, no following * memory operations are considered atomic. Usage of seqlocks via seqlock_t * interface is not affected. @@ -265,7 +265,7 @@ static inline void raw_write_seqcount_end(seqcount_t *s) * usual consistency guarantee. It is one wmb cheaper, because we can * collapse the two back-to-back wmb()s. * - * Note that, writes surrounding the barrier should be declared atomic (e.g. + * Note that writes surrounding the barrier should be declared atomic (e.g. * via WRITE_ONCE): a) to ensure the writes become visible to other threads * atomically, avoiding compiler optimizations; b) to document which writes are * meant to propagate to the reader critical section. 
This is necessary because @@ -465,7 +465,7 @@ static inline unsigned read_seqbegin(const seqlock_t *sl) { unsigned ret = read_seqcount_begin(&sl->seqcount); - kcsan_atomic_next(0); /* non-raw usage, assume closing read_seqretry */ + kcsan_atomic_next(0); /* non-raw usage, assume closing read_seqretry() */ kcsan_flat_atomic_begin(); return ret; } @@ -473,7 +473,7 @@ static inline unsigned read_seqbegin(const seqlock_t *sl) static inline unsigned read_seqretry(const seqlock_t *sl, unsigned start) { /* - * Assume not nested: read_seqretry may be called multiple times when + * Assume not nested: read_seqretry() may be called multiple times when * completing read critical section. */ kcsan_flat_atomic_end(); diff --git a/kernel/kcsan/atomic.h b/kernel/kcsan/atomic.h index c9c3fe628011..576e03ddd6a3 100644 --- a/kernel/kcsan/atomic.h +++ b/kernel/kcsan/atomic.h @@ -6,7 +6,7 @@ #include /* - * Helper that returns true if access to ptr should be considered as an atomic + * Helper that returns true if access to @ptr should be considered an atomic * access, even though it is not explicitly atomic. * * List all volatile globals that have been observed in races, to suppress diff --git a/kernel/kcsan/core.c b/kernel/kcsan/core.c index d9410d58c93e..3314fc29e236 100644 --- a/kernel/kcsan/core.c +++ b/kernel/kcsan/core.c @@ -19,10 +19,10 @@ bool kcsan_enabled; /* Per-CPU kcsan_ctx for interrupts */ static DEFINE_PER_CPU(struct kcsan_ctx, kcsan_cpu_ctx) = { - .disable_count = 0, - .atomic_next = 0, - .atomic_nest_count = 0, - .in_flat_atomic = false, + .disable_count = 0, + .atomic_next = 0, + .atomic_nest_count = 0, + .in_flat_atomic = false, }; /* @@ -50,11 +50,11 @@ static DEFINE_PER_CPU(struct kcsan_ctx, kcsan_cpu_ctx) = { * slot=9: [10, 11, 9] * slot=63: [64, 65, 63] */ -#define NUM_SLOTS (1 + 2 * KCSAN_CHECK_ADJACENT) +#define NUM_SLOTS (1 + 2*KCSAN_CHECK_ADJACENT) #define SLOT_IDX(slot, i) (slot + ((i + KCSAN_CHECK_ADJACENT) % NUM_SLOTS)) /* - * SLOT_IDX_FAST is used in fast-path. Not first checking the address's primary + * SLOT_IDX_FAST is used in the fast-path. Not first checking the address's primary * slot (middle) is fine if we assume that data races occur rarely. The set of * indices {SLOT_IDX(slot, i) | i in [0, NUM_SLOTS)} is equivalent to * {SLOT_IDX_FAST(slot, i) | i in [0, NUM_SLOTS)}. @@ -68,9 +68,9 @@ static DEFINE_PER_CPU(struct kcsan_ctx, kcsan_cpu_ctx) = { * zero-initialized state matches INVALID_WATCHPOINT. * * Add NUM_SLOTS-1 entries to account for overflow; this helps avoid having to - * use more complicated SLOT_IDX_FAST calculation with modulo in fast-path. + * use more complicated SLOT_IDX_FAST calculation with modulo in the fast-path. */ -static atomic_long_t watchpoints[CONFIG_KCSAN_NUM_WATCHPOINTS + NUM_SLOTS - 1]; +static atomic_long_t watchpoints[CONFIG_KCSAN_NUM_WATCHPOINTS + NUM_SLOTS-1]; /* * Instructions to skip watching counter, used in should_watch(). 
We use a @@ -78,7 +78,8 @@ static atomic_long_t watchpoints[CONFIG_KCSAN_NUM_WATCHPOINTS + NUM_SLOTS - 1]; */ static DEFINE_PER_CPU(long, kcsan_skip); -static inline atomic_long_t *find_watchpoint(unsigned long addr, size_t size, +static inline atomic_long_t *find_watchpoint(unsigned long addr, + size_t size, bool expect_write, long *encoded_watchpoint) { @@ -110,8 +111,8 @@ static inline atomic_long_t *find_watchpoint(unsigned long addr, size_t size, return NULL; } -static inline atomic_long_t *insert_watchpoint(unsigned long addr, size_t size, - bool is_write) +static inline atomic_long_t * +insert_watchpoint(unsigned long addr, size_t size, bool is_write) { const int slot = watchpoint_slot(addr); const long encoded_watchpoint = encode_watchpoint(addr, size, is_write); @@ -120,21 +121,16 @@ static inline atomic_long_t *insert_watchpoint(unsigned long addr, size_t size, /* Check slot index logic, ensuring we stay within array bounds. */ BUILD_BUG_ON(SLOT_IDX(0, 0) != KCSAN_CHECK_ADJACENT); - BUILD_BUG_ON(SLOT_IDX(0, KCSAN_CHECK_ADJACENT + 1) != 0); - BUILD_BUG_ON(SLOT_IDX(CONFIG_KCSAN_NUM_WATCHPOINTS - 1, - KCSAN_CHECK_ADJACENT) != - ARRAY_SIZE(watchpoints) - 1); - BUILD_BUG_ON(SLOT_IDX(CONFIG_KCSAN_NUM_WATCHPOINTS - 1, - KCSAN_CHECK_ADJACENT + 1) != - ARRAY_SIZE(watchpoints) - NUM_SLOTS); + BUILD_BUG_ON(SLOT_IDX(0, KCSAN_CHECK_ADJACENT+1) != 0); + BUILD_BUG_ON(SLOT_IDX(CONFIG_KCSAN_NUM_WATCHPOINTS-1, KCSAN_CHECK_ADJACENT) != ARRAY_SIZE(watchpoints)-1); + BUILD_BUG_ON(SLOT_IDX(CONFIG_KCSAN_NUM_WATCHPOINTS-1, KCSAN_CHECK_ADJACENT+1) != ARRAY_SIZE(watchpoints) - NUM_SLOTS); for (i = 0; i < NUM_SLOTS; ++i) { long expect_val = INVALID_WATCHPOINT; /* Try to acquire this slot. */ watchpoint = &watchpoints[SLOT_IDX(slot, i)]; - if (atomic_long_try_cmpxchg_relaxed(watchpoint, &expect_val, - encoded_watchpoint)) + if (atomic_long_try_cmpxchg_relaxed(watchpoint, &expect_val, encoded_watchpoint)) return watchpoint; } @@ -150,11 +146,10 @@ static inline atomic_long_t *insert_watchpoint(unsigned long addr, size_t size, * 2. the thread that set up the watchpoint already removed it; * 3. the watchpoint was removed and then re-used. */ -static inline bool try_consume_watchpoint(atomic_long_t *watchpoint, - long encoded_watchpoint) +static inline bool +try_consume_watchpoint(atomic_long_t *watchpoint, long encoded_watchpoint) { - return atomic_long_try_cmpxchg_relaxed(watchpoint, &encoded_watchpoint, - CONSUMED_WATCHPOINT); + return atomic_long_try_cmpxchg_relaxed(watchpoint, &encoded_watchpoint, CONSUMED_WATCHPOINT); } /* @@ -162,14 +157,13 @@ static inline bool try_consume_watchpoint(atomic_long_t *watchpoint, */ static inline bool remove_watchpoint(atomic_long_t *watchpoint) { - return atomic_long_xchg_relaxed(watchpoint, INVALID_WATCHPOINT) != - CONSUMED_WATCHPOINT; + return atomic_long_xchg_relaxed(watchpoint, INVALID_WATCHPOINT) != CONSUMED_WATCHPOINT; } static inline struct kcsan_ctx *get_ctx(void) { /* - * In interrupt, use raw_cpu_ptr to avoid unnecessary checks, that would + * In interrupts, use raw_cpu_ptr to avoid unnecessary checks, that would * also result in calls that generate warnings in uaccess regions. */ return in_task() ? 
&current->kcsan_ctx : raw_cpu_ptr(&kcsan_cpu_ctx); @@ -260,7 +254,8 @@ static inline unsigned int get_delay(void) */ static noinline void kcsan_found_watchpoint(const volatile void *ptr, - size_t size, bool is_write, + size_t size, + bool is_write, atomic_long_t *watchpoint, long encoded_watchpoint) { @@ -296,8 +291,8 @@ static noinline void kcsan_found_watchpoint(const volatile void *ptr, user_access_restore(flags); } -static noinline void kcsan_setup_watchpoint(const volatile void *ptr, - size_t size, bool is_write) +static noinline void +kcsan_setup_watchpoint(const volatile void *ptr, size_t size, bool is_write) { atomic_long_t *watchpoint; union { @@ -346,8 +341,8 @@ static noinline void kcsan_setup_watchpoint(const volatile void *ptr, watchpoint = insert_watchpoint((unsigned long)ptr, size, is_write); if (watchpoint == NULL) { /* - * Out of capacity: the size of `watchpoints`, and the frequency - * with which `should_watch()` returns true should be tweaked so + * Out of capacity: the size of 'watchpoints', and the frequency + * with which should_watch() returns true should be tweaked so * that this case happens very rarely. */ kcsan_counter_inc(KCSAN_COUNTER_NO_CAPACITY); diff --git a/kernel/kcsan/debugfs.c b/kernel/kcsan/debugfs.c index 041d520a0183..bec42dab32ee 100644 --- a/kernel/kcsan/debugfs.c +++ b/kernel/kcsan/debugfs.c @@ -24,39 +24,31 @@ static atomic_long_t counters[KCSAN_COUNTER_COUNT]; * whitelist or blacklist. */ static struct { - unsigned long *addrs; /* array of addresses */ - size_t size; /* current size */ - int used; /* number of elements used */ - bool sorted; /* if elements are sorted */ - bool whitelist; /* if list is a blacklist or whitelist */ + unsigned long *addrs; /* array of addresses */ + size_t size; /* current size */ + int used; /* number of elements used */ + bool sorted; /* if elements are sorted */ + bool whitelist; /* if list is a blacklist or whitelist */ } report_filterlist = { - .addrs = NULL, - .size = 8, /* small initial size */ - .used = 0, - .sorted = false, - .whitelist = false, /* default is blacklist */ + .addrs = NULL, + .size = 8, /* small initial size */ + .used = 0, + .sorted = false, + .whitelist = false, /* default is blacklist */ }; static DEFINE_SPINLOCK(report_filterlist_lock); static const char *counter_to_name(enum kcsan_counter_id id) { switch (id) { - case KCSAN_COUNTER_USED_WATCHPOINTS: - return "used_watchpoints"; - case KCSAN_COUNTER_SETUP_WATCHPOINTS: - return "setup_watchpoints"; - case KCSAN_COUNTER_DATA_RACES: - return "data_races"; - case KCSAN_COUNTER_NO_CAPACITY: - return "no_capacity"; - case KCSAN_COUNTER_REPORT_RACES: - return "report_races"; - case KCSAN_COUNTER_RACES_UNKNOWN_ORIGIN: - return "races_unknown_origin"; - case KCSAN_COUNTER_UNENCODABLE_ACCESSES: - return "unencodable_accesses"; - case KCSAN_COUNTER_ENCODING_FALSE_POSITIVES: - return "encoding_false_positives"; + case KCSAN_COUNTER_USED_WATCHPOINTS: return "used_watchpoints"; + case KCSAN_COUNTER_SETUP_WATCHPOINTS: return "setup_watchpoints"; + case KCSAN_COUNTER_DATA_RACES: return "data_races"; + case KCSAN_COUNTER_NO_CAPACITY: return "no_capacity"; + case KCSAN_COUNTER_REPORT_RACES: return "report_races"; + case KCSAN_COUNTER_RACES_UNKNOWN_ORIGIN: return "races_unknown_origin"; + case KCSAN_COUNTER_UNENCODABLE_ACCESSES: return "unencodable_accesses"; + case KCSAN_COUNTER_ENCODING_FALSE_POSITIVES: return "encoding_false_positives"; case KCSAN_COUNTER_COUNT: BUG(); } @@ -116,7 +108,7 @@ bool kcsan_skip_report_debugfs(unsigned long func_addr) if
(!kallsyms_lookup_size_offset(func_addr, &symbolsize, &offset)) return false; - func_addr -= offset; /* get function start */ + func_addr -= offset; /* Get function start */ spin_lock_irqsave(&report_filterlist_lock, flags); if (report_filterlist.used == 0) @@ -195,6 +187,7 @@ static ssize_t insert_report_filterlist(const char *func) out: spin_unlock_irqrestore(&report_filterlist_lock, flags); + return ret; } @@ -226,8 +219,8 @@ static int debugfs_open(struct inode *inode, struct file *file) return single_open(file, show_info, NULL); } -static ssize_t debugfs_write(struct file *file, const char __user *buf, - size_t count, loff_t *off) +static ssize_t +debugfs_write(struct file *file, const char __user *buf, size_t count, loff_t *off) { char kbuf[KSYM_NAME_LEN]; char *arg; @@ -264,10 +257,13 @@ static ssize_t debugfs_write(struct file *file, const char __user *buf, return count; } -static const struct file_operations debugfs_ops = { .read = seq_read, - .open = debugfs_open, - .write = debugfs_write, - .release = single_release }; +static const struct file_operations debugfs_ops = +{ + .read = seq_read, + .open = debugfs_open, + .write = debugfs_write, + .release = single_release +}; void __init kcsan_debugfs_init(void) { diff --git a/kernel/kcsan/encoding.h b/kernel/kcsan/encoding.h index e17bdac0e54b..b63890e86449 100644 --- a/kernel/kcsan/encoding.h +++ b/kernel/kcsan/encoding.h @@ -10,7 +10,8 @@ #include "kcsan.h" #define SLOT_RANGE PAGE_SIZE -#define INVALID_WATCHPOINT 0 + +#define INVALID_WATCHPOINT 0 #define CONSUMED_WATCHPOINT 1 /* @@ -34,24 +35,24 @@ * Both these are assumed to be very unlikely. However, in case it still happens * happens, the report logic will filter out the false positive (see report.c). */ -#define WATCHPOINT_ADDR_BITS (BITS_PER_LONG - 1 - WATCHPOINT_SIZE_BITS) +#define WATCHPOINT_ADDR_BITS (BITS_PER_LONG-1 - WATCHPOINT_SIZE_BITS) /* * Masks to set/retrieve the encoded data. */ -#define WATCHPOINT_WRITE_MASK BIT(BITS_PER_LONG - 1) +#define WATCHPOINT_WRITE_MASK BIT(BITS_PER_LONG-1) #define WATCHPOINT_SIZE_MASK \ - GENMASK(BITS_PER_LONG - 2, BITS_PER_LONG - 2 - WATCHPOINT_SIZE_BITS) + GENMASK(BITS_PER_LONG-2, BITS_PER_LONG-2 - WATCHPOINT_SIZE_BITS) #define WATCHPOINT_ADDR_MASK \ - GENMASK(BITS_PER_LONG - 3 - WATCHPOINT_SIZE_BITS, 0) + GENMASK(BITS_PER_LONG-3 - WATCHPOINT_SIZE_BITS, 0) static inline bool check_encodable(unsigned long addr, size_t size) { return size <= MAX_ENCODABLE_SIZE; } -static inline long encode_watchpoint(unsigned long addr, size_t size, - bool is_write) +static inline long +encode_watchpoint(unsigned long addr, size_t size, bool is_write) { return (long)((is_write ? 
WATCHPOINT_WRITE_MASK : 0) | (size << WATCHPOINT_ADDR_BITS) | @@ -59,17 +60,17 @@ static inline long encode_watchpoint(unsigned long addr, size_t size, } static inline bool decode_watchpoint(long watchpoint, - unsigned long *addr_masked, size_t *size, + unsigned long *addr_masked, + size_t *size, bool *is_write) { if (watchpoint == INVALID_WATCHPOINT || watchpoint == CONSUMED_WATCHPOINT) return false; - *addr_masked = (unsigned long)watchpoint & WATCHPOINT_ADDR_MASK; - *size = ((unsigned long)watchpoint & WATCHPOINT_SIZE_MASK) >> - WATCHPOINT_ADDR_BITS; - *is_write = !!((unsigned long)watchpoint & WATCHPOINT_WRITE_MASK); + *addr_masked = (unsigned long)watchpoint & WATCHPOINT_ADDR_MASK; + *size = ((unsigned long)watchpoint & WATCHPOINT_SIZE_MASK) >> WATCHPOINT_ADDR_BITS; + *is_write = !!((unsigned long)watchpoint & WATCHPOINT_WRITE_MASK); return true; } diff --git a/kernel/kcsan/kcsan.h b/kernel/kcsan/kcsan.h index 1bb2f1c0d61e..d3b9a96ac8a4 100644 --- a/kernel/kcsan/kcsan.h +++ b/kernel/kcsan/kcsan.h @@ -72,14 +72,14 @@ enum kcsan_counter_id { /* * Increment/decrement counter with given id; avoid calling these in fast-path. */ -void kcsan_counter_inc(enum kcsan_counter_id id); -void kcsan_counter_dec(enum kcsan_counter_id id); +extern void kcsan_counter_inc(enum kcsan_counter_id id); +extern void kcsan_counter_dec(enum kcsan_counter_id id); /* * Returns true if data races in the function symbol that maps to func_addr * (offsets are ignored) should *not* be reported. */ -bool kcsan_skip_report_debugfs(unsigned long func_addr); +extern bool kcsan_skip_report_debugfs(unsigned long func_addr); enum kcsan_report_type { /* @@ -99,10 +99,11 @@ enum kcsan_report_type { */ KCSAN_REPORT_RACE_UNKNOWN_ORIGIN, }; + /* * Print a race report from thread that encountered the race. */ -void kcsan_report(const volatile void *ptr, size_t size, bool is_write, - bool value_change, int cpu_id, enum kcsan_report_type type); +extern void kcsan_report(const volatile void *ptr, size_t size, bool is_write, + bool value_change, int cpu_id, enum kcsan_report_type type); #endif /* _KERNEL_KCSAN_KCSAN_H */ diff --git a/kernel/kcsan/report.c b/kernel/kcsan/report.c index ead5610bafa7..0eea05a3135b 100644 --- a/kernel/kcsan/report.c +++ b/kernel/kcsan/report.c @@ -22,13 +22,13 @@ * the reports, with reporting being in the slow-path. */ static struct { - const volatile void *ptr; - size_t size; - bool is_write; - int task_pid; - int cpu_id; - unsigned long stack_entries[NUM_STACK_ENTRIES]; - int num_stack_entries; + const volatile void *ptr; + size_t size; + bool is_write; + int task_pid; + int cpu_id; + unsigned long stack_entries[NUM_STACK_ENTRIES]; + int num_stack_entries; } other_info = { .ptr = NULL }; /* @@ -40,8 +40,8 @@ static DEFINE_SPINLOCK(report_lock); /* * Special rules to skip reporting. 
*/ -static bool skip_report(bool is_write, bool value_change, - unsigned long top_frame) +static bool +skip_report(bool is_write, bool value_change, unsigned long top_frame) { if (IS_ENABLED(CONFIG_KCSAN_REPORT_VALUE_CHANGE_ONLY) && is_write && !value_change) { @@ -105,6 +105,7 @@ static int sym_strcmp(void *addr1, void *addr2) snprintf(buf1, sizeof(buf1), "%pS", addr1); snprintf(buf2, sizeof(buf2), "%pS", addr2); + return strncmp(buf1, buf2, sizeof(buf1)); } @@ -116,8 +117,7 @@ static bool print_report(const volatile void *ptr, size_t size, bool is_write, enum kcsan_report_type type) { unsigned long stack_entries[NUM_STACK_ENTRIES] = { 0 }; - int num_stack_entries = - stack_trace_save(stack_entries, NUM_STACK_ENTRIES, 1); + int num_stack_entries = stack_trace_save(stack_entries, NUM_STACK_ENTRIES, 1); int skipnr = get_stack_skipnr(stack_entries, num_stack_entries); int other_skipnr; @@ -131,7 +131,7 @@ static bool print_report(const volatile void *ptr, size_t size, bool is_write, other_skipnr = get_stack_skipnr(other_info.stack_entries, other_info.num_stack_entries); - /* value_change is only known for the other thread */ + /* @value_change is only known for the other thread */ if (skip_report(other_info.is_write, value_change, other_info.stack_entries[other_skipnr])) return false; @@ -241,13 +241,12 @@ retry: if (other_info.ptr != NULL) break; /* still in use, retry */ - other_info.ptr = ptr; - other_info.size = size; - other_info.is_write = is_write; - other_info.task_pid = in_task() ? task_pid_nr(current) : -1; - other_info.cpu_id = cpu_id; - other_info.num_stack_entries = stack_trace_save( - other_info.stack_entries, NUM_STACK_ENTRIES, 1); + other_info.ptr = ptr; + other_info.size = size; + other_info.is_write = is_write; + other_info.task_pid = in_task() ? 
task_pid_nr(current) : -1; + other_info.cpu_id = cpu_id; + other_info.num_stack_entries = stack_trace_save(other_info.stack_entries, NUM_STACK_ENTRIES, 1); spin_unlock_irqrestore(&report_lock, *flags); @@ -299,6 +298,7 @@ retry: } spin_unlock_irqrestore(&report_lock, *flags); + goto retry; } @@ -309,9 +309,7 @@ void kcsan_report(const volatile void *ptr, size_t size, bool is_write, kcsan_disable_current(); if (prepare_report(&flags, ptr, size, is_write, cpu_id, type)) { - if (print_report(ptr, size, is_write, value_change, cpu_id, - type) && - panic_on_warn) + if (print_report(ptr, size, is_write, value_change, cpu_id, type) && panic_on_warn) panic("panic_on_warn set ...\n"); release_report(&flags, type); diff --git a/kernel/kcsan/test.c b/kernel/kcsan/test.c index 0bae63c5ca65..cc6000239dc0 100644 --- a/kernel/kcsan/test.c +++ b/kernel/kcsan/test.c @@ -34,7 +34,7 @@ static bool test_encode_decode(void) if (WARN_ON(!check_encodable(addr, size))) return false; - /* encode and decode */ + /* Encode and decode */ { const long encoded_watchpoint = encode_watchpoint(addr, size, is_write); @@ -42,7 +42,7 @@ static bool test_encode_decode(void) size_t verif_size; bool verif_is_write; - /* check special watchpoints */ + /* Check special watchpoints */ if (WARN_ON(decode_watchpoint( INVALID_WATCHPOINT, &verif_masked_addr, &verif_size, &verif_is_write))) @@ -52,7 +52,7 @@ static bool test_encode_decode(void) &verif_size, &verif_is_write))) return false; - /* check decoding watchpoint returns same data */ + /* Check decoding watchpoint returns same data */ if (WARN_ON(!decode_watchpoint( encoded_watchpoint, &verif_masked_addr, &verif_size, &verif_is_write))) diff --git a/kernel/sched/Makefile b/kernel/sched/Makefile index e9307a9c54e7..5fc9c9b70862 100644 --- a/kernel/sched/Makefile +++ b/kernel/sched/Makefile @@ -7,7 +7,7 @@ endif # that is not a function of syscall inputs. E.g. involuntary context switches. KCOV_INSTRUMENT := n -# There are numerous races here, however, most of them due to plain accesses. +# There are numerous data races here, however, most of them are due to plain accesses. # This would make it even harder for syzbot to find reproducers, because these # bugs trigger without specific input. Disable by default, but should re-enable # eventually. diff --git a/lib/Kconfig.kcsan b/lib/Kconfig.kcsan index 5dd464e52ab4..3f78b1434375 100644 --- a/lib/Kconfig.kcsan +++ b/lib/Kconfig.kcsan @@ -6,7 +6,6 @@ config HAVE_ARCH_KCSAN menuconfig KCSAN bool "KCSAN: watchpoint-based dynamic data race detector" depends on HAVE_ARCH_KCSAN && !KASAN && STACKTRACE - default n help Kernel Concurrency Sanitizer is a dynamic data race detector, which uses a watchpoint-based sampling approach to detect races. See @@ -16,13 +15,12 @@ if KCSAN config KCSAN_DEBUG bool "Debugging of KCSAN internals" - default n config KCSAN_SELFTEST bool "Perform short selftests on boot" default y help - Run KCSAN selftests on boot. On test failure, causes kernel to panic. + Run KCSAN selftests on boot. On test failure, causes the kernel to panic. config KCSAN_EARLY_ENABLE bool "Early enable during boot" @@ -62,7 +60,8 @@ config KCSAN_DELAY_RANDOMIZE default y help If delays should be randomized, where the maximum is KCSAN_UDELAY_*. - If false, the chosen delays are always KCSAN_UDELAY_* defined above. + If false, the chosen delays are always the KCSAN_UDELAY_* values + as defined above. 
config KCSAN_SKIP_WATCH int "Skip instructions before setting up watchpoint" @@ -86,9 +85,9 @@ config KCSAN_SKIP_WATCH_RANDOMIZE # parameters, to optimize for the common use-case, we avoid this because: (a) # it would impact performance (and we want to avoid static branch for all # {READ,WRITE}_ONCE, atomic_*, bitops, etc.), and (b) complicate the design -# without real benefit. The main purpose of the below options are for use in -# fuzzer configs to control reported data races, and are not expected to be -# switched frequently by a user. +# without real benefit. The main purpose of the below options is for use in +# fuzzer configs to control reported data races, and they are not expected +# to be switched frequently by a user. config KCSAN_REPORT_RACE_UNKNOWN_ORIGIN bool "Report races of unknown origin" @@ -103,13 +102,12 @@ config KCSAN_REPORT_VALUE_CHANGE_ONLY bool "Only report races where watcher observed a data value change" default y help - If enabled and a conflicting write is observed via watchpoint, but + If enabled and a conflicting write is observed via a watchpoint, but the data value of the memory location was observed to remain unchanged, do not report the data race. config KCSAN_IGNORE_ATOMICS bool "Do not instrument marked atomic accesses" - default n help If enabled, never instruments marked atomic accesses. This results in not reporting data races where one access is atomic and the other is -- cgit v1.2.3 From 944bc9cca7c392879fa2c3f911bbef7422707679 Mon Sep 17 00:00:00 2001 From: Marco Elver Date: Tue, 26 Nov 2019 15:04:05 +0100 Subject: asm-generic/atomic: Use __always_inline for fallback wrappers Use __always_inline for atomic fallback wrappers. When building for size (CC_OPTIMIZE_FOR_SIZE), some compilers appear to be less inclined to inline even relatively small static inline functions that are assumed to be inlinable such as atomic ops. This can cause problems, for example in UACCESS regions. While the fallback wrappers aren't pure wrappers, they are trivial nonetheless, and the function they wrap should determine the final inlining policy. For x86 tinyconfig we observe: - vmlinux baseline: 1315988 - vmlinux with patch: 1315928 (-60 bytes) Suggested-by: Mark Rutland Signed-off-by: Marco Elver Acked-by: Mark Rutland Signed-off-by: Paul E. 
McKenney --- include/linux/atomic-fallback.h | 340 ++++++++++++++------------- scripts/atomic/fallbacks/acquire | 2 +- scripts/atomic/fallbacks/add_negative | 2 +- scripts/atomic/fallbacks/add_unless | 2 +- scripts/atomic/fallbacks/andnot | 2 +- scripts/atomic/fallbacks/dec | 2 +- scripts/atomic/fallbacks/dec_and_test | 2 +- scripts/atomic/fallbacks/dec_if_positive | 2 +- scripts/atomic/fallbacks/dec_unless_positive | 2 +- scripts/atomic/fallbacks/fence | 2 +- scripts/atomic/fallbacks/fetch_add_unless | 2 +- scripts/atomic/fallbacks/inc | 2 +- scripts/atomic/fallbacks/inc_and_test | 2 +- scripts/atomic/fallbacks/inc_not_zero | 2 +- scripts/atomic/fallbacks/inc_unless_negative | 2 +- scripts/atomic/fallbacks/read_acquire | 2 +- scripts/atomic/fallbacks/release | 2 +- scripts/atomic/fallbacks/set_release | 2 +- scripts/atomic/fallbacks/sub_and_test | 2 +- scripts/atomic/fallbacks/try_cmpxchg | 2 +- scripts/atomic/gen-atomic-fallback.sh | 2 + 21 files changed, 192 insertions(+), 188 deletions(-) (limited to 'include/linux') diff --git a/include/linux/atomic-fallback.h b/include/linux/atomic-fallback.h index a7d240e465c0..656b5489b673 100644 --- a/include/linux/atomic-fallback.h +++ b/include/linux/atomic-fallback.h @@ -6,6 +6,8 @@ #ifndef _LINUX_ATOMIC_FALLBACK_H #define _LINUX_ATOMIC_FALLBACK_H +#include + #ifndef xchg_relaxed #define xchg_relaxed xchg #define xchg_acquire xchg @@ -76,7 +78,7 @@ #endif /* cmpxchg64_relaxed */ #ifndef atomic_read_acquire -static inline int +static __always_inline int atomic_read_acquire(const atomic_t *v) { return smp_load_acquire(&(v)->counter); @@ -85,7 +87,7 @@ atomic_read_acquire(const atomic_t *v) #endif #ifndef atomic_set_release -static inline void +static __always_inline void atomic_set_release(atomic_t *v, int i) { smp_store_release(&(v)->counter, i); @@ -100,7 +102,7 @@ atomic_set_release(atomic_t *v, int i) #else /* atomic_add_return_relaxed */ #ifndef atomic_add_return_acquire -static inline int +static __always_inline int atomic_add_return_acquire(int i, atomic_t *v) { int ret = atomic_add_return_relaxed(i, v); @@ -111,7 +113,7 @@ atomic_add_return_acquire(int i, atomic_t *v) #endif #ifndef atomic_add_return_release -static inline int +static __always_inline int atomic_add_return_release(int i, atomic_t *v) { __atomic_release_fence(); @@ -121,7 +123,7 @@ atomic_add_return_release(int i, atomic_t *v) #endif #ifndef atomic_add_return -static inline int +static __always_inline int atomic_add_return(int i, atomic_t *v) { int ret; @@ -142,7 +144,7 @@ atomic_add_return(int i, atomic_t *v) #else /* atomic_fetch_add_relaxed */ #ifndef atomic_fetch_add_acquire -static inline int +static __always_inline int atomic_fetch_add_acquire(int i, atomic_t *v) { int ret = atomic_fetch_add_relaxed(i, v); @@ -153,7 +155,7 @@ atomic_fetch_add_acquire(int i, atomic_t *v) #endif #ifndef atomic_fetch_add_release -static inline int +static __always_inline int atomic_fetch_add_release(int i, atomic_t *v) { __atomic_release_fence(); @@ -163,7 +165,7 @@ atomic_fetch_add_release(int i, atomic_t *v) #endif #ifndef atomic_fetch_add -static inline int +static __always_inline int atomic_fetch_add(int i, atomic_t *v) { int ret; @@ -184,7 +186,7 @@ atomic_fetch_add(int i, atomic_t *v) #else /* atomic_sub_return_relaxed */ #ifndef atomic_sub_return_acquire -static inline int +static __always_inline int atomic_sub_return_acquire(int i, atomic_t *v) { int ret = atomic_sub_return_relaxed(i, v); @@ -195,7 +197,7 @@ atomic_sub_return_acquire(int i, atomic_t *v) #endif #ifndef 
atomic_sub_return_release -static inline int +static __always_inline int atomic_sub_return_release(int i, atomic_t *v) { __atomic_release_fence(); @@ -205,7 +207,7 @@ atomic_sub_return_release(int i, atomic_t *v) #endif #ifndef atomic_sub_return -static inline int +static __always_inline int atomic_sub_return(int i, atomic_t *v) { int ret; @@ -226,7 +228,7 @@ atomic_sub_return(int i, atomic_t *v) #else /* atomic_fetch_sub_relaxed */ #ifndef atomic_fetch_sub_acquire -static inline int +static __always_inline int atomic_fetch_sub_acquire(int i, atomic_t *v) { int ret = atomic_fetch_sub_relaxed(i, v); @@ -237,7 +239,7 @@ atomic_fetch_sub_acquire(int i, atomic_t *v) #endif #ifndef atomic_fetch_sub_release -static inline int +static __always_inline int atomic_fetch_sub_release(int i, atomic_t *v) { __atomic_release_fence(); @@ -247,7 +249,7 @@ atomic_fetch_sub_release(int i, atomic_t *v) #endif #ifndef atomic_fetch_sub -static inline int +static __always_inline int atomic_fetch_sub(int i, atomic_t *v) { int ret; @@ -262,7 +264,7 @@ atomic_fetch_sub(int i, atomic_t *v) #endif /* atomic_fetch_sub_relaxed */ #ifndef atomic_inc -static inline void +static __always_inline void atomic_inc(atomic_t *v) { atomic_add(1, v); @@ -278,7 +280,7 @@ atomic_inc(atomic_t *v) #endif /* atomic_inc_return */ #ifndef atomic_inc_return -static inline int +static __always_inline int atomic_inc_return(atomic_t *v) { return atomic_add_return(1, v); @@ -287,7 +289,7 @@ atomic_inc_return(atomic_t *v) #endif #ifndef atomic_inc_return_acquire -static inline int +static __always_inline int atomic_inc_return_acquire(atomic_t *v) { return atomic_add_return_acquire(1, v); @@ -296,7 +298,7 @@ atomic_inc_return_acquire(atomic_t *v) #endif #ifndef atomic_inc_return_release -static inline int +static __always_inline int atomic_inc_return_release(atomic_t *v) { return atomic_add_return_release(1, v); @@ -305,7 +307,7 @@ atomic_inc_return_release(atomic_t *v) #endif #ifndef atomic_inc_return_relaxed -static inline int +static __always_inline int atomic_inc_return_relaxed(atomic_t *v) { return atomic_add_return_relaxed(1, v); @@ -316,7 +318,7 @@ atomic_inc_return_relaxed(atomic_t *v) #else /* atomic_inc_return_relaxed */ #ifndef atomic_inc_return_acquire -static inline int +static __always_inline int atomic_inc_return_acquire(atomic_t *v) { int ret = atomic_inc_return_relaxed(v); @@ -327,7 +329,7 @@ atomic_inc_return_acquire(atomic_t *v) #endif #ifndef atomic_inc_return_release -static inline int +static __always_inline int atomic_inc_return_release(atomic_t *v) { __atomic_release_fence(); @@ -337,7 +339,7 @@ atomic_inc_return_release(atomic_t *v) #endif #ifndef atomic_inc_return -static inline int +static __always_inline int atomic_inc_return(atomic_t *v) { int ret; @@ -359,7 +361,7 @@ atomic_inc_return(atomic_t *v) #endif /* atomic_fetch_inc */ #ifndef atomic_fetch_inc -static inline int +static __always_inline int atomic_fetch_inc(atomic_t *v) { return atomic_fetch_add(1, v); @@ -368,7 +370,7 @@ atomic_fetch_inc(atomic_t *v) #endif #ifndef atomic_fetch_inc_acquire -static inline int +static __always_inline int atomic_fetch_inc_acquire(atomic_t *v) { return atomic_fetch_add_acquire(1, v); @@ -377,7 +379,7 @@ atomic_fetch_inc_acquire(atomic_t *v) #endif #ifndef atomic_fetch_inc_release -static inline int +static __always_inline int atomic_fetch_inc_release(atomic_t *v) { return atomic_fetch_add_release(1, v); @@ -386,7 +388,7 @@ atomic_fetch_inc_release(atomic_t *v) #endif #ifndef atomic_fetch_inc_relaxed -static inline int +static 
__always_inline int atomic_fetch_inc_relaxed(atomic_t *v) { return atomic_fetch_add_relaxed(1, v); @@ -397,7 +399,7 @@ atomic_fetch_inc_relaxed(atomic_t *v) #else /* atomic_fetch_inc_relaxed */ #ifndef atomic_fetch_inc_acquire -static inline int +static __always_inline int atomic_fetch_inc_acquire(atomic_t *v) { int ret = atomic_fetch_inc_relaxed(v); @@ -408,7 +410,7 @@ atomic_fetch_inc_acquire(atomic_t *v) #endif #ifndef atomic_fetch_inc_release -static inline int +static __always_inline int atomic_fetch_inc_release(atomic_t *v) { __atomic_release_fence(); @@ -418,7 +420,7 @@ atomic_fetch_inc_release(atomic_t *v) #endif #ifndef atomic_fetch_inc -static inline int +static __always_inline int atomic_fetch_inc(atomic_t *v) { int ret; @@ -433,7 +435,7 @@ atomic_fetch_inc(atomic_t *v) #endif /* atomic_fetch_inc_relaxed */ #ifndef atomic_dec -static inline void +static __always_inline void atomic_dec(atomic_t *v) { atomic_sub(1, v); @@ -449,7 +451,7 @@ atomic_dec(atomic_t *v) #endif /* atomic_dec_return */ #ifndef atomic_dec_return -static inline int +static __always_inline int atomic_dec_return(atomic_t *v) { return atomic_sub_return(1, v); @@ -458,7 +460,7 @@ atomic_dec_return(atomic_t *v) #endif #ifndef atomic_dec_return_acquire -static inline int +static __always_inline int atomic_dec_return_acquire(atomic_t *v) { return atomic_sub_return_acquire(1, v); @@ -467,7 +469,7 @@ atomic_dec_return_acquire(atomic_t *v) #endif #ifndef atomic_dec_return_release -static inline int +static __always_inline int atomic_dec_return_release(atomic_t *v) { return atomic_sub_return_release(1, v); @@ -476,7 +478,7 @@ atomic_dec_return_release(atomic_t *v) #endif #ifndef atomic_dec_return_relaxed -static inline int +static __always_inline int atomic_dec_return_relaxed(atomic_t *v) { return atomic_sub_return_relaxed(1, v); @@ -487,7 +489,7 @@ atomic_dec_return_relaxed(atomic_t *v) #else /* atomic_dec_return_relaxed */ #ifndef atomic_dec_return_acquire -static inline int +static __always_inline int atomic_dec_return_acquire(atomic_t *v) { int ret = atomic_dec_return_relaxed(v); @@ -498,7 +500,7 @@ atomic_dec_return_acquire(atomic_t *v) #endif #ifndef atomic_dec_return_release -static inline int +static __always_inline int atomic_dec_return_release(atomic_t *v) { __atomic_release_fence(); @@ -508,7 +510,7 @@ atomic_dec_return_release(atomic_t *v) #endif #ifndef atomic_dec_return -static inline int +static __always_inline int atomic_dec_return(atomic_t *v) { int ret; @@ -530,7 +532,7 @@ atomic_dec_return(atomic_t *v) #endif /* atomic_fetch_dec */ #ifndef atomic_fetch_dec -static inline int +static __always_inline int atomic_fetch_dec(atomic_t *v) { return atomic_fetch_sub(1, v); @@ -539,7 +541,7 @@ atomic_fetch_dec(atomic_t *v) #endif #ifndef atomic_fetch_dec_acquire -static inline int +static __always_inline int atomic_fetch_dec_acquire(atomic_t *v) { return atomic_fetch_sub_acquire(1, v); @@ -548,7 +550,7 @@ atomic_fetch_dec_acquire(atomic_t *v) #endif #ifndef atomic_fetch_dec_release -static inline int +static __always_inline int atomic_fetch_dec_release(atomic_t *v) { return atomic_fetch_sub_release(1, v); @@ -557,7 +559,7 @@ atomic_fetch_dec_release(atomic_t *v) #endif #ifndef atomic_fetch_dec_relaxed -static inline int +static __always_inline int atomic_fetch_dec_relaxed(atomic_t *v) { return atomic_fetch_sub_relaxed(1, v); @@ -568,7 +570,7 @@ atomic_fetch_dec_relaxed(atomic_t *v) #else /* atomic_fetch_dec_relaxed */ #ifndef atomic_fetch_dec_acquire -static inline int +static __always_inline int 
atomic_fetch_dec_acquire(atomic_t *v) { int ret = atomic_fetch_dec_relaxed(v); @@ -579,7 +581,7 @@ atomic_fetch_dec_acquire(atomic_t *v) #endif #ifndef atomic_fetch_dec_release -static inline int +static __always_inline int atomic_fetch_dec_release(atomic_t *v) { __atomic_release_fence(); @@ -589,7 +591,7 @@ atomic_fetch_dec_release(atomic_t *v) #endif #ifndef atomic_fetch_dec -static inline int +static __always_inline int atomic_fetch_dec(atomic_t *v) { int ret; @@ -610,7 +612,7 @@ atomic_fetch_dec(atomic_t *v) #else /* atomic_fetch_and_relaxed */ #ifndef atomic_fetch_and_acquire -static inline int +static __always_inline int atomic_fetch_and_acquire(int i, atomic_t *v) { int ret = atomic_fetch_and_relaxed(i, v); @@ -621,7 +623,7 @@ atomic_fetch_and_acquire(int i, atomic_t *v) #endif #ifndef atomic_fetch_and_release -static inline int +static __always_inline int atomic_fetch_and_release(int i, atomic_t *v) { __atomic_release_fence(); @@ -631,7 +633,7 @@ atomic_fetch_and_release(int i, atomic_t *v) #endif #ifndef atomic_fetch_and -static inline int +static __always_inline int atomic_fetch_and(int i, atomic_t *v) { int ret; @@ -646,7 +648,7 @@ atomic_fetch_and(int i, atomic_t *v) #endif /* atomic_fetch_and_relaxed */ #ifndef atomic_andnot -static inline void +static __always_inline void atomic_andnot(int i, atomic_t *v) { atomic_and(~i, v); @@ -662,7 +664,7 @@ atomic_andnot(int i, atomic_t *v) #endif /* atomic_fetch_andnot */ #ifndef atomic_fetch_andnot -static inline int +static __always_inline int atomic_fetch_andnot(int i, atomic_t *v) { return atomic_fetch_and(~i, v); @@ -671,7 +673,7 @@ atomic_fetch_andnot(int i, atomic_t *v) #endif #ifndef atomic_fetch_andnot_acquire -static inline int +static __always_inline int atomic_fetch_andnot_acquire(int i, atomic_t *v) { return atomic_fetch_and_acquire(~i, v); @@ -680,7 +682,7 @@ atomic_fetch_andnot_acquire(int i, atomic_t *v) #endif #ifndef atomic_fetch_andnot_release -static inline int +static __always_inline int atomic_fetch_andnot_release(int i, atomic_t *v) { return atomic_fetch_and_release(~i, v); @@ -689,7 +691,7 @@ atomic_fetch_andnot_release(int i, atomic_t *v) #endif #ifndef atomic_fetch_andnot_relaxed -static inline int +static __always_inline int atomic_fetch_andnot_relaxed(int i, atomic_t *v) { return atomic_fetch_and_relaxed(~i, v); @@ -700,7 +702,7 @@ atomic_fetch_andnot_relaxed(int i, atomic_t *v) #else /* atomic_fetch_andnot_relaxed */ #ifndef atomic_fetch_andnot_acquire -static inline int +static __always_inline int atomic_fetch_andnot_acquire(int i, atomic_t *v) { int ret = atomic_fetch_andnot_relaxed(i, v); @@ -711,7 +713,7 @@ atomic_fetch_andnot_acquire(int i, atomic_t *v) #endif #ifndef atomic_fetch_andnot_release -static inline int +static __always_inline int atomic_fetch_andnot_release(int i, atomic_t *v) { __atomic_release_fence(); @@ -721,7 +723,7 @@ atomic_fetch_andnot_release(int i, atomic_t *v) #endif #ifndef atomic_fetch_andnot -static inline int +static __always_inline int atomic_fetch_andnot(int i, atomic_t *v) { int ret; @@ -742,7 +744,7 @@ atomic_fetch_andnot(int i, atomic_t *v) #else /* atomic_fetch_or_relaxed */ #ifndef atomic_fetch_or_acquire -static inline int +static __always_inline int atomic_fetch_or_acquire(int i, atomic_t *v) { int ret = atomic_fetch_or_relaxed(i, v); @@ -753,7 +755,7 @@ atomic_fetch_or_acquire(int i, atomic_t *v) #endif #ifndef atomic_fetch_or_release -static inline int +static __always_inline int atomic_fetch_or_release(int i, atomic_t *v) { __atomic_release_fence(); @@ -763,7 
+765,7 @@ atomic_fetch_or_release(int i, atomic_t *v) #endif #ifndef atomic_fetch_or -static inline int +static __always_inline int atomic_fetch_or(int i, atomic_t *v) { int ret; @@ -784,7 +786,7 @@ atomic_fetch_or(int i, atomic_t *v) #else /* atomic_fetch_xor_relaxed */ #ifndef atomic_fetch_xor_acquire -static inline int +static __always_inline int atomic_fetch_xor_acquire(int i, atomic_t *v) { int ret = atomic_fetch_xor_relaxed(i, v); @@ -795,7 +797,7 @@ atomic_fetch_xor_acquire(int i, atomic_t *v) #endif #ifndef atomic_fetch_xor_release -static inline int +static __always_inline int atomic_fetch_xor_release(int i, atomic_t *v) { __atomic_release_fence(); @@ -805,7 +807,7 @@ atomic_fetch_xor_release(int i, atomic_t *v) #endif #ifndef atomic_fetch_xor -static inline int +static __always_inline int atomic_fetch_xor(int i, atomic_t *v) { int ret; @@ -826,7 +828,7 @@ atomic_fetch_xor(int i, atomic_t *v) #else /* atomic_xchg_relaxed */ #ifndef atomic_xchg_acquire -static inline int +static __always_inline int atomic_xchg_acquire(atomic_t *v, int i) { int ret = atomic_xchg_relaxed(v, i); @@ -837,7 +839,7 @@ atomic_xchg_acquire(atomic_t *v, int i) #endif #ifndef atomic_xchg_release -static inline int +static __always_inline int atomic_xchg_release(atomic_t *v, int i) { __atomic_release_fence(); @@ -847,7 +849,7 @@ atomic_xchg_release(atomic_t *v, int i) #endif #ifndef atomic_xchg -static inline int +static __always_inline int atomic_xchg(atomic_t *v, int i) { int ret; @@ -868,7 +870,7 @@ atomic_xchg(atomic_t *v, int i) #else /* atomic_cmpxchg_relaxed */ #ifndef atomic_cmpxchg_acquire -static inline int +static __always_inline int atomic_cmpxchg_acquire(atomic_t *v, int old, int new) { int ret = atomic_cmpxchg_relaxed(v, old, new); @@ -879,7 +881,7 @@ atomic_cmpxchg_acquire(atomic_t *v, int old, int new) #endif #ifndef atomic_cmpxchg_release -static inline int +static __always_inline int atomic_cmpxchg_release(atomic_t *v, int old, int new) { __atomic_release_fence(); @@ -889,7 +891,7 @@ atomic_cmpxchg_release(atomic_t *v, int old, int new) #endif #ifndef atomic_cmpxchg -static inline int +static __always_inline int atomic_cmpxchg(atomic_t *v, int old, int new) { int ret; @@ -911,7 +913,7 @@ atomic_cmpxchg(atomic_t *v, int old, int new) #endif /* atomic_try_cmpxchg */ #ifndef atomic_try_cmpxchg -static inline bool +static __always_inline bool atomic_try_cmpxchg(atomic_t *v, int *old, int new) { int r, o = *old; @@ -924,7 +926,7 @@ atomic_try_cmpxchg(atomic_t *v, int *old, int new) #endif #ifndef atomic_try_cmpxchg_acquire -static inline bool +static __always_inline bool atomic_try_cmpxchg_acquire(atomic_t *v, int *old, int new) { int r, o = *old; @@ -937,7 +939,7 @@ atomic_try_cmpxchg_acquire(atomic_t *v, int *old, int new) #endif #ifndef atomic_try_cmpxchg_release -static inline bool +static __always_inline bool atomic_try_cmpxchg_release(atomic_t *v, int *old, int new) { int r, o = *old; @@ -950,7 +952,7 @@ atomic_try_cmpxchg_release(atomic_t *v, int *old, int new) #endif #ifndef atomic_try_cmpxchg_relaxed -static inline bool +static __always_inline bool atomic_try_cmpxchg_relaxed(atomic_t *v, int *old, int new) { int r, o = *old; @@ -965,7 +967,7 @@ atomic_try_cmpxchg_relaxed(atomic_t *v, int *old, int new) #else /* atomic_try_cmpxchg_relaxed */ #ifndef atomic_try_cmpxchg_acquire -static inline bool +static __always_inline bool atomic_try_cmpxchg_acquire(atomic_t *v, int *old, int new) { bool ret = atomic_try_cmpxchg_relaxed(v, old, new); @@ -976,7 +978,7 @@ 
atomic_try_cmpxchg_acquire(atomic_t *v, int *old, int new) #endif #ifndef atomic_try_cmpxchg_release -static inline bool +static __always_inline bool atomic_try_cmpxchg_release(atomic_t *v, int *old, int new) { __atomic_release_fence(); @@ -986,7 +988,7 @@ atomic_try_cmpxchg_release(atomic_t *v, int *old, int new) #endif #ifndef atomic_try_cmpxchg -static inline bool +static __always_inline bool atomic_try_cmpxchg(atomic_t *v, int *old, int new) { bool ret; @@ -1010,7 +1012,7 @@ atomic_try_cmpxchg(atomic_t *v, int *old, int new) * true if the result is zero, or false for all * other cases. */ -static inline bool +static __always_inline bool atomic_sub_and_test(int i, atomic_t *v) { return atomic_sub_return(i, v) == 0; @@ -1027,7 +1029,7 @@ atomic_sub_and_test(int i, atomic_t *v) * returns true if the result is 0, or false for all other * cases. */ -static inline bool +static __always_inline bool atomic_dec_and_test(atomic_t *v) { return atomic_dec_return(v) == 0; @@ -1044,7 +1046,7 @@ atomic_dec_and_test(atomic_t *v) * and returns true if the result is zero, or false for all * other cases. */ -static inline bool +static __always_inline bool atomic_inc_and_test(atomic_t *v) { return atomic_inc_return(v) == 0; @@ -1062,7 +1064,7 @@ atomic_inc_and_test(atomic_t *v) * if the result is negative, or false when * result is greater than or equal to zero. */ -static inline bool +static __always_inline bool atomic_add_negative(int i, atomic_t *v) { return atomic_add_return(i, v) < 0; @@ -1080,7 +1082,7 @@ atomic_add_negative(int i, atomic_t *v) * Atomically adds @a to @v, so long as @v was not already @u. * Returns original value of @v */ -static inline int +static __always_inline int atomic_fetch_add_unless(atomic_t *v, int a, int u) { int c = atomic_read(v); @@ -1105,7 +1107,7 @@ atomic_fetch_add_unless(atomic_t *v, int a, int u) * Atomically adds @a to @v, if @v was not already @u. * Returns true if the addition was done. */ -static inline bool +static __always_inline bool atomic_add_unless(atomic_t *v, int a, int u) { return atomic_fetch_add_unless(v, a, u) != u; @@ -1121,7 +1123,7 @@ atomic_add_unless(atomic_t *v, int a, int u) * Atomically increments @v by 1, if @v is non-zero. * Returns true if the increment was done. 
*/ -static inline bool +static __always_inline bool atomic_inc_not_zero(atomic_t *v) { return atomic_add_unless(v, 1, 0); @@ -1130,7 +1132,7 @@ atomic_inc_not_zero(atomic_t *v) #endif #ifndef atomic_inc_unless_negative -static inline bool +static __always_inline bool atomic_inc_unless_negative(atomic_t *v) { int c = atomic_read(v); @@ -1146,7 +1148,7 @@ atomic_inc_unless_negative(atomic_t *v) #endif #ifndef atomic_dec_unless_positive -static inline bool +static __always_inline bool atomic_dec_unless_positive(atomic_t *v) { int c = atomic_read(v); @@ -1162,7 +1164,7 @@ atomic_dec_unless_positive(atomic_t *v) #endif #ifndef atomic_dec_if_positive -static inline int +static __always_inline int atomic_dec_if_positive(atomic_t *v) { int dec, c = atomic_read(v); @@ -1186,7 +1188,7 @@ atomic_dec_if_positive(atomic_t *v) #endif #ifndef atomic64_read_acquire -static inline s64 +static __always_inline s64 atomic64_read_acquire(const atomic64_t *v) { return smp_load_acquire(&(v)->counter); @@ -1195,7 +1197,7 @@ atomic64_read_acquire(const atomic64_t *v) #endif #ifndef atomic64_set_release -static inline void +static __always_inline void atomic64_set_release(atomic64_t *v, s64 i) { smp_store_release(&(v)->counter, i); @@ -1210,7 +1212,7 @@ atomic64_set_release(atomic64_t *v, s64 i) #else /* atomic64_add_return_relaxed */ #ifndef atomic64_add_return_acquire -static inline s64 +static __always_inline s64 atomic64_add_return_acquire(s64 i, atomic64_t *v) { s64 ret = atomic64_add_return_relaxed(i, v); @@ -1221,7 +1223,7 @@ atomic64_add_return_acquire(s64 i, atomic64_t *v) #endif #ifndef atomic64_add_return_release -static inline s64 +static __always_inline s64 atomic64_add_return_release(s64 i, atomic64_t *v) { __atomic_release_fence(); @@ -1231,7 +1233,7 @@ atomic64_add_return_release(s64 i, atomic64_t *v) #endif #ifndef atomic64_add_return -static inline s64 +static __always_inline s64 atomic64_add_return(s64 i, atomic64_t *v) { s64 ret; @@ -1252,7 +1254,7 @@ atomic64_add_return(s64 i, atomic64_t *v) #else /* atomic64_fetch_add_relaxed */ #ifndef atomic64_fetch_add_acquire -static inline s64 +static __always_inline s64 atomic64_fetch_add_acquire(s64 i, atomic64_t *v) { s64 ret = atomic64_fetch_add_relaxed(i, v); @@ -1263,7 +1265,7 @@ atomic64_fetch_add_acquire(s64 i, atomic64_t *v) #endif #ifndef atomic64_fetch_add_release -static inline s64 +static __always_inline s64 atomic64_fetch_add_release(s64 i, atomic64_t *v) { __atomic_release_fence(); @@ -1273,7 +1275,7 @@ atomic64_fetch_add_release(s64 i, atomic64_t *v) #endif #ifndef atomic64_fetch_add -static inline s64 +static __always_inline s64 atomic64_fetch_add(s64 i, atomic64_t *v) { s64 ret; @@ -1294,7 +1296,7 @@ atomic64_fetch_add(s64 i, atomic64_t *v) #else /* atomic64_sub_return_relaxed */ #ifndef atomic64_sub_return_acquire -static inline s64 +static __always_inline s64 atomic64_sub_return_acquire(s64 i, atomic64_t *v) { s64 ret = atomic64_sub_return_relaxed(i, v); @@ -1305,7 +1307,7 @@ atomic64_sub_return_acquire(s64 i, atomic64_t *v) #endif #ifndef atomic64_sub_return_release -static inline s64 +static __always_inline s64 atomic64_sub_return_release(s64 i, atomic64_t *v) { __atomic_release_fence(); @@ -1315,7 +1317,7 @@ atomic64_sub_return_release(s64 i, atomic64_t *v) #endif #ifndef atomic64_sub_return -static inline s64 +static __always_inline s64 atomic64_sub_return(s64 i, atomic64_t *v) { s64 ret; @@ -1336,7 +1338,7 @@ atomic64_sub_return(s64 i, atomic64_t *v) #else /* atomic64_fetch_sub_relaxed */ #ifndef atomic64_fetch_sub_acquire -static 
inline s64 +static __always_inline s64 atomic64_fetch_sub_acquire(s64 i, atomic64_t *v) { s64 ret = atomic64_fetch_sub_relaxed(i, v); @@ -1347,7 +1349,7 @@ atomic64_fetch_sub_acquire(s64 i, atomic64_t *v) #endif #ifndef atomic64_fetch_sub_release -static inline s64 +static __always_inline s64 atomic64_fetch_sub_release(s64 i, atomic64_t *v) { __atomic_release_fence(); @@ -1357,7 +1359,7 @@ atomic64_fetch_sub_release(s64 i, atomic64_t *v) #endif #ifndef atomic64_fetch_sub -static inline s64 +static __always_inline s64 atomic64_fetch_sub(s64 i, atomic64_t *v) { s64 ret; @@ -1372,7 +1374,7 @@ atomic64_fetch_sub(s64 i, atomic64_t *v) #endif /* atomic64_fetch_sub_relaxed */ #ifndef atomic64_inc -static inline void +static __always_inline void atomic64_inc(atomic64_t *v) { atomic64_add(1, v); @@ -1388,7 +1390,7 @@ atomic64_inc(atomic64_t *v) #endif /* atomic64_inc_return */ #ifndef atomic64_inc_return -static inline s64 +static __always_inline s64 atomic64_inc_return(atomic64_t *v) { return atomic64_add_return(1, v); @@ -1397,7 +1399,7 @@ atomic64_inc_return(atomic64_t *v) #endif #ifndef atomic64_inc_return_acquire -static inline s64 +static __always_inline s64 atomic64_inc_return_acquire(atomic64_t *v) { return atomic64_add_return_acquire(1, v); @@ -1406,7 +1408,7 @@ atomic64_inc_return_acquire(atomic64_t *v) #endif #ifndef atomic64_inc_return_release -static inline s64 +static __always_inline s64 atomic64_inc_return_release(atomic64_t *v) { return atomic64_add_return_release(1, v); @@ -1415,7 +1417,7 @@ atomic64_inc_return_release(atomic64_t *v) #endif #ifndef atomic64_inc_return_relaxed -static inline s64 +static __always_inline s64 atomic64_inc_return_relaxed(atomic64_t *v) { return atomic64_add_return_relaxed(1, v); @@ -1426,7 +1428,7 @@ atomic64_inc_return_relaxed(atomic64_t *v) #else /* atomic64_inc_return_relaxed */ #ifndef atomic64_inc_return_acquire -static inline s64 +static __always_inline s64 atomic64_inc_return_acquire(atomic64_t *v) { s64 ret = atomic64_inc_return_relaxed(v); @@ -1437,7 +1439,7 @@ atomic64_inc_return_acquire(atomic64_t *v) #endif #ifndef atomic64_inc_return_release -static inline s64 +static __always_inline s64 atomic64_inc_return_release(atomic64_t *v) { __atomic_release_fence(); @@ -1447,7 +1449,7 @@ atomic64_inc_return_release(atomic64_t *v) #endif #ifndef atomic64_inc_return -static inline s64 +static __always_inline s64 atomic64_inc_return(atomic64_t *v) { s64 ret; @@ -1469,7 +1471,7 @@ atomic64_inc_return(atomic64_t *v) #endif /* atomic64_fetch_inc */ #ifndef atomic64_fetch_inc -static inline s64 +static __always_inline s64 atomic64_fetch_inc(atomic64_t *v) { return atomic64_fetch_add(1, v); @@ -1478,7 +1480,7 @@ atomic64_fetch_inc(atomic64_t *v) #endif #ifndef atomic64_fetch_inc_acquire -static inline s64 +static __always_inline s64 atomic64_fetch_inc_acquire(atomic64_t *v) { return atomic64_fetch_add_acquire(1, v); @@ -1487,7 +1489,7 @@ atomic64_fetch_inc_acquire(atomic64_t *v) #endif #ifndef atomic64_fetch_inc_release -static inline s64 +static __always_inline s64 atomic64_fetch_inc_release(atomic64_t *v) { return atomic64_fetch_add_release(1, v); @@ -1496,7 +1498,7 @@ atomic64_fetch_inc_release(atomic64_t *v) #endif #ifndef atomic64_fetch_inc_relaxed -static inline s64 +static __always_inline s64 atomic64_fetch_inc_relaxed(atomic64_t *v) { return atomic64_fetch_add_relaxed(1, v); @@ -1507,7 +1509,7 @@ atomic64_fetch_inc_relaxed(atomic64_t *v) #else /* atomic64_fetch_inc_relaxed */ #ifndef atomic64_fetch_inc_acquire -static inline s64 +static 
__always_inline s64 atomic64_fetch_inc_acquire(atomic64_t *v) { s64 ret = atomic64_fetch_inc_relaxed(v); @@ -1518,7 +1520,7 @@ atomic64_fetch_inc_acquire(atomic64_t *v) #endif #ifndef atomic64_fetch_inc_release -static inline s64 +static __always_inline s64 atomic64_fetch_inc_release(atomic64_t *v) { __atomic_release_fence(); @@ -1528,7 +1530,7 @@ atomic64_fetch_inc_release(atomic64_t *v) #endif #ifndef atomic64_fetch_inc -static inline s64 +static __always_inline s64 atomic64_fetch_inc(atomic64_t *v) { s64 ret; @@ -1543,7 +1545,7 @@ atomic64_fetch_inc(atomic64_t *v) #endif /* atomic64_fetch_inc_relaxed */ #ifndef atomic64_dec -static inline void +static __always_inline void atomic64_dec(atomic64_t *v) { atomic64_sub(1, v); @@ -1559,7 +1561,7 @@ atomic64_dec(atomic64_t *v) #endif /* atomic64_dec_return */ #ifndef atomic64_dec_return -static inline s64 +static __always_inline s64 atomic64_dec_return(atomic64_t *v) { return atomic64_sub_return(1, v); @@ -1568,7 +1570,7 @@ atomic64_dec_return(atomic64_t *v) #endif #ifndef atomic64_dec_return_acquire -static inline s64 +static __always_inline s64 atomic64_dec_return_acquire(atomic64_t *v) { return atomic64_sub_return_acquire(1, v); @@ -1577,7 +1579,7 @@ atomic64_dec_return_acquire(atomic64_t *v) #endif #ifndef atomic64_dec_return_release -static inline s64 +static __always_inline s64 atomic64_dec_return_release(atomic64_t *v) { return atomic64_sub_return_release(1, v); @@ -1586,7 +1588,7 @@ atomic64_dec_return_release(atomic64_t *v) #endif #ifndef atomic64_dec_return_relaxed -static inline s64 +static __always_inline s64 atomic64_dec_return_relaxed(atomic64_t *v) { return atomic64_sub_return_relaxed(1, v); @@ -1597,7 +1599,7 @@ atomic64_dec_return_relaxed(atomic64_t *v) #else /* atomic64_dec_return_relaxed */ #ifndef atomic64_dec_return_acquire -static inline s64 +static __always_inline s64 atomic64_dec_return_acquire(atomic64_t *v) { s64 ret = atomic64_dec_return_relaxed(v); @@ -1608,7 +1610,7 @@ atomic64_dec_return_acquire(atomic64_t *v) #endif #ifndef atomic64_dec_return_release -static inline s64 +static __always_inline s64 atomic64_dec_return_release(atomic64_t *v) { __atomic_release_fence(); @@ -1618,7 +1620,7 @@ atomic64_dec_return_release(atomic64_t *v) #endif #ifndef atomic64_dec_return -static inline s64 +static __always_inline s64 atomic64_dec_return(atomic64_t *v) { s64 ret; @@ -1640,7 +1642,7 @@ atomic64_dec_return(atomic64_t *v) #endif /* atomic64_fetch_dec */ #ifndef atomic64_fetch_dec -static inline s64 +static __always_inline s64 atomic64_fetch_dec(atomic64_t *v) { return atomic64_fetch_sub(1, v); @@ -1649,7 +1651,7 @@ atomic64_fetch_dec(atomic64_t *v) #endif #ifndef atomic64_fetch_dec_acquire -static inline s64 +static __always_inline s64 atomic64_fetch_dec_acquire(atomic64_t *v) { return atomic64_fetch_sub_acquire(1, v); @@ -1658,7 +1660,7 @@ atomic64_fetch_dec_acquire(atomic64_t *v) #endif #ifndef atomic64_fetch_dec_release -static inline s64 +static __always_inline s64 atomic64_fetch_dec_release(atomic64_t *v) { return atomic64_fetch_sub_release(1, v); @@ -1667,7 +1669,7 @@ atomic64_fetch_dec_release(atomic64_t *v) #endif #ifndef atomic64_fetch_dec_relaxed -static inline s64 +static __always_inline s64 atomic64_fetch_dec_relaxed(atomic64_t *v) { return atomic64_fetch_sub_relaxed(1, v); @@ -1678,7 +1680,7 @@ atomic64_fetch_dec_relaxed(atomic64_t *v) #else /* atomic64_fetch_dec_relaxed */ #ifndef atomic64_fetch_dec_acquire -static inline s64 +static __always_inline s64 atomic64_fetch_dec_acquire(atomic64_t *v) { s64 ret = 
atomic64_fetch_dec_relaxed(v); @@ -1689,7 +1691,7 @@ atomic64_fetch_dec_acquire(atomic64_t *v) #endif #ifndef atomic64_fetch_dec_release -static inline s64 +static __always_inline s64 atomic64_fetch_dec_release(atomic64_t *v) { __atomic_release_fence(); @@ -1699,7 +1701,7 @@ atomic64_fetch_dec_release(atomic64_t *v) #endif #ifndef atomic64_fetch_dec -static inline s64 +static __always_inline s64 atomic64_fetch_dec(atomic64_t *v) { s64 ret; @@ -1720,7 +1722,7 @@ atomic64_fetch_dec(atomic64_t *v) #else /* atomic64_fetch_and_relaxed */ #ifndef atomic64_fetch_and_acquire -static inline s64 +static __always_inline s64 atomic64_fetch_and_acquire(s64 i, atomic64_t *v) { s64 ret = atomic64_fetch_and_relaxed(i, v); @@ -1731,7 +1733,7 @@ atomic64_fetch_and_acquire(s64 i, atomic64_t *v) #endif #ifndef atomic64_fetch_and_release -static inline s64 +static __always_inline s64 atomic64_fetch_and_release(s64 i, atomic64_t *v) { __atomic_release_fence(); @@ -1741,7 +1743,7 @@ atomic64_fetch_and_release(s64 i, atomic64_t *v) #endif #ifndef atomic64_fetch_and -static inline s64 +static __always_inline s64 atomic64_fetch_and(s64 i, atomic64_t *v) { s64 ret; @@ -1756,7 +1758,7 @@ atomic64_fetch_and(s64 i, atomic64_t *v) #endif /* atomic64_fetch_and_relaxed */ #ifndef atomic64_andnot -static inline void +static __always_inline void atomic64_andnot(s64 i, atomic64_t *v) { atomic64_and(~i, v); @@ -1772,7 +1774,7 @@ atomic64_andnot(s64 i, atomic64_t *v) #endif /* atomic64_fetch_andnot */ #ifndef atomic64_fetch_andnot -static inline s64 +static __always_inline s64 atomic64_fetch_andnot(s64 i, atomic64_t *v) { return atomic64_fetch_and(~i, v); @@ -1781,7 +1783,7 @@ atomic64_fetch_andnot(s64 i, atomic64_t *v) #endif #ifndef atomic64_fetch_andnot_acquire -static inline s64 +static __always_inline s64 atomic64_fetch_andnot_acquire(s64 i, atomic64_t *v) { return atomic64_fetch_and_acquire(~i, v); @@ -1790,7 +1792,7 @@ atomic64_fetch_andnot_acquire(s64 i, atomic64_t *v) #endif #ifndef atomic64_fetch_andnot_release -static inline s64 +static __always_inline s64 atomic64_fetch_andnot_release(s64 i, atomic64_t *v) { return atomic64_fetch_and_release(~i, v); @@ -1799,7 +1801,7 @@ atomic64_fetch_andnot_release(s64 i, atomic64_t *v) #endif #ifndef atomic64_fetch_andnot_relaxed -static inline s64 +static __always_inline s64 atomic64_fetch_andnot_relaxed(s64 i, atomic64_t *v) { return atomic64_fetch_and_relaxed(~i, v); @@ -1810,7 +1812,7 @@ atomic64_fetch_andnot_relaxed(s64 i, atomic64_t *v) #else /* atomic64_fetch_andnot_relaxed */ #ifndef atomic64_fetch_andnot_acquire -static inline s64 +static __always_inline s64 atomic64_fetch_andnot_acquire(s64 i, atomic64_t *v) { s64 ret = atomic64_fetch_andnot_relaxed(i, v); @@ -1821,7 +1823,7 @@ atomic64_fetch_andnot_acquire(s64 i, atomic64_t *v) #endif #ifndef atomic64_fetch_andnot_release -static inline s64 +static __always_inline s64 atomic64_fetch_andnot_release(s64 i, atomic64_t *v) { __atomic_release_fence(); @@ -1831,7 +1833,7 @@ atomic64_fetch_andnot_release(s64 i, atomic64_t *v) #endif #ifndef atomic64_fetch_andnot -static inline s64 +static __always_inline s64 atomic64_fetch_andnot(s64 i, atomic64_t *v) { s64 ret; @@ -1852,7 +1854,7 @@ atomic64_fetch_andnot(s64 i, atomic64_t *v) #else /* atomic64_fetch_or_relaxed */ #ifndef atomic64_fetch_or_acquire -static inline s64 +static __always_inline s64 atomic64_fetch_or_acquire(s64 i, atomic64_t *v) { s64 ret = atomic64_fetch_or_relaxed(i, v); @@ -1863,7 +1865,7 @@ atomic64_fetch_or_acquire(s64 i, atomic64_t *v) #endif #ifndef 
atomic64_fetch_or_release -static inline s64 +static __always_inline s64 atomic64_fetch_or_release(s64 i, atomic64_t *v) { __atomic_release_fence(); @@ -1873,7 +1875,7 @@ atomic64_fetch_or_release(s64 i, atomic64_t *v) #endif #ifndef atomic64_fetch_or -static inline s64 +static __always_inline s64 atomic64_fetch_or(s64 i, atomic64_t *v) { s64 ret; @@ -1894,7 +1896,7 @@ atomic64_fetch_or(s64 i, atomic64_t *v) #else /* atomic64_fetch_xor_relaxed */ #ifndef atomic64_fetch_xor_acquire -static inline s64 +static __always_inline s64 atomic64_fetch_xor_acquire(s64 i, atomic64_t *v) { s64 ret = atomic64_fetch_xor_relaxed(i, v); @@ -1905,7 +1907,7 @@ atomic64_fetch_xor_acquire(s64 i, atomic64_t *v) #endif #ifndef atomic64_fetch_xor_release -static inline s64 +static __always_inline s64 atomic64_fetch_xor_release(s64 i, atomic64_t *v) { __atomic_release_fence(); @@ -1915,7 +1917,7 @@ atomic64_fetch_xor_release(s64 i, atomic64_t *v) #endif #ifndef atomic64_fetch_xor -static inline s64 +static __always_inline s64 atomic64_fetch_xor(s64 i, atomic64_t *v) { s64 ret; @@ -1936,7 +1938,7 @@ atomic64_fetch_xor(s64 i, atomic64_t *v) #else /* atomic64_xchg_relaxed */ #ifndef atomic64_xchg_acquire -static inline s64 +static __always_inline s64 atomic64_xchg_acquire(atomic64_t *v, s64 i) { s64 ret = atomic64_xchg_relaxed(v, i); @@ -1947,7 +1949,7 @@ atomic64_xchg_acquire(atomic64_t *v, s64 i) #endif #ifndef atomic64_xchg_release -static inline s64 +static __always_inline s64 atomic64_xchg_release(atomic64_t *v, s64 i) { __atomic_release_fence(); @@ -1957,7 +1959,7 @@ atomic64_xchg_release(atomic64_t *v, s64 i) #endif #ifndef atomic64_xchg -static inline s64 +static __always_inline s64 atomic64_xchg(atomic64_t *v, s64 i) { s64 ret; @@ -1978,7 +1980,7 @@ atomic64_xchg(atomic64_t *v, s64 i) #else /* atomic64_cmpxchg_relaxed */ #ifndef atomic64_cmpxchg_acquire -static inline s64 +static __always_inline s64 atomic64_cmpxchg_acquire(atomic64_t *v, s64 old, s64 new) { s64 ret = atomic64_cmpxchg_relaxed(v, old, new); @@ -1989,7 +1991,7 @@ atomic64_cmpxchg_acquire(atomic64_t *v, s64 old, s64 new) #endif #ifndef atomic64_cmpxchg_release -static inline s64 +static __always_inline s64 atomic64_cmpxchg_release(atomic64_t *v, s64 old, s64 new) { __atomic_release_fence(); @@ -1999,7 +2001,7 @@ atomic64_cmpxchg_release(atomic64_t *v, s64 old, s64 new) #endif #ifndef atomic64_cmpxchg -static inline s64 +static __always_inline s64 atomic64_cmpxchg(atomic64_t *v, s64 old, s64 new) { s64 ret; @@ -2021,7 +2023,7 @@ atomic64_cmpxchg(atomic64_t *v, s64 old, s64 new) #endif /* atomic64_try_cmpxchg */ #ifndef atomic64_try_cmpxchg -static inline bool +static __always_inline bool atomic64_try_cmpxchg(atomic64_t *v, s64 *old, s64 new) { s64 r, o = *old; @@ -2034,7 +2036,7 @@ atomic64_try_cmpxchg(atomic64_t *v, s64 *old, s64 new) #endif #ifndef atomic64_try_cmpxchg_acquire -static inline bool +static __always_inline bool atomic64_try_cmpxchg_acquire(atomic64_t *v, s64 *old, s64 new) { s64 r, o = *old; @@ -2047,7 +2049,7 @@ atomic64_try_cmpxchg_acquire(atomic64_t *v, s64 *old, s64 new) #endif #ifndef atomic64_try_cmpxchg_release -static inline bool +static __always_inline bool atomic64_try_cmpxchg_release(atomic64_t *v, s64 *old, s64 new) { s64 r, o = *old; @@ -2060,7 +2062,7 @@ atomic64_try_cmpxchg_release(atomic64_t *v, s64 *old, s64 new) #endif #ifndef atomic64_try_cmpxchg_relaxed -static inline bool +static __always_inline bool atomic64_try_cmpxchg_relaxed(atomic64_t *v, s64 *old, s64 new) { s64 r, o = *old; @@ -2075,7 +2077,7 @@ 
atomic64_try_cmpxchg_relaxed(atomic64_t *v, s64 *old, s64 new) #else /* atomic64_try_cmpxchg_relaxed */ #ifndef atomic64_try_cmpxchg_acquire -static inline bool +static __always_inline bool atomic64_try_cmpxchg_acquire(atomic64_t *v, s64 *old, s64 new) { bool ret = atomic64_try_cmpxchg_relaxed(v, old, new); @@ -2086,7 +2088,7 @@ atomic64_try_cmpxchg_acquire(atomic64_t *v, s64 *old, s64 new) #endif #ifndef atomic64_try_cmpxchg_release -static inline bool +static __always_inline bool atomic64_try_cmpxchg_release(atomic64_t *v, s64 *old, s64 new) { __atomic_release_fence(); @@ -2096,7 +2098,7 @@ atomic64_try_cmpxchg_release(atomic64_t *v, s64 *old, s64 new) #endif #ifndef atomic64_try_cmpxchg -static inline bool +static __always_inline bool atomic64_try_cmpxchg(atomic64_t *v, s64 *old, s64 new) { bool ret; @@ -2120,7 +2122,7 @@ atomic64_try_cmpxchg(atomic64_t *v, s64 *old, s64 new) * true if the result is zero, or false for all * other cases. */ -static inline bool +static __always_inline bool atomic64_sub_and_test(s64 i, atomic64_t *v) { return atomic64_sub_return(i, v) == 0; @@ -2137,7 +2139,7 @@ atomic64_sub_and_test(s64 i, atomic64_t *v) * returns true if the result is 0, or false for all other * cases. */ -static inline bool +static __always_inline bool atomic64_dec_and_test(atomic64_t *v) { return atomic64_dec_return(v) == 0; @@ -2154,7 +2156,7 @@ atomic64_dec_and_test(atomic64_t *v) * and returns true if the result is zero, or false for all * other cases. */ -static inline bool +static __always_inline bool atomic64_inc_and_test(atomic64_t *v) { return atomic64_inc_return(v) == 0; @@ -2172,7 +2174,7 @@ atomic64_inc_and_test(atomic64_t *v) * if the result is negative, or false when * result is greater than or equal to zero. */ -static inline bool +static __always_inline bool atomic64_add_negative(s64 i, atomic64_t *v) { return atomic64_add_return(i, v) < 0; @@ -2190,7 +2192,7 @@ atomic64_add_negative(s64 i, atomic64_t *v) * Atomically adds @a to @v, so long as @v was not already @u. * Returns original value of @v */ -static inline s64 +static __always_inline s64 atomic64_fetch_add_unless(atomic64_t *v, s64 a, s64 u) { s64 c = atomic64_read(v); @@ -2215,7 +2217,7 @@ atomic64_fetch_add_unless(atomic64_t *v, s64 a, s64 u) * Atomically adds @a to @v, if @v was not already @u. * Returns true if the addition was done. */ -static inline bool +static __always_inline bool atomic64_add_unless(atomic64_t *v, s64 a, s64 u) { return atomic64_fetch_add_unless(v, a, u) != u; @@ -2231,7 +2233,7 @@ atomic64_add_unless(atomic64_t *v, s64 a, s64 u) * Atomically increments @v by 1, if @v is non-zero. * Returns true if the increment was done. 
*/ -static inline bool +static __always_inline bool atomic64_inc_not_zero(atomic64_t *v) { return atomic64_add_unless(v, 1, 0); @@ -2240,7 +2242,7 @@ atomic64_inc_not_zero(atomic64_t *v) #endif #ifndef atomic64_inc_unless_negative -static inline bool +static __always_inline bool atomic64_inc_unless_negative(atomic64_t *v) { s64 c = atomic64_read(v); @@ -2256,7 +2258,7 @@ atomic64_inc_unless_negative(atomic64_t *v) #endif #ifndef atomic64_dec_unless_positive -static inline bool +static __always_inline bool atomic64_dec_unless_positive(atomic64_t *v) { s64 c = atomic64_read(v); @@ -2272,7 +2274,7 @@ atomic64_dec_unless_positive(atomic64_t *v) #endif #ifndef atomic64_dec_if_positive -static inline s64 +static __always_inline s64 atomic64_dec_if_positive(atomic64_t *v) { s64 dec, c = atomic64_read(v); @@ -2292,4 +2294,4 @@ atomic64_dec_if_positive(atomic64_t *v) #define atomic64_cond_read_relaxed(v, c) smp_cond_load_relaxed(&(v)->counter, (c)) #endif /* _LINUX_ATOMIC_FALLBACK_H */ -// 25de4a2804d70f57e994fe3b419148658bb5378a +// baaf45f4c24ed88ceae58baca39d7fd80bb8101b
diff --git a/scripts/atomic/fallbacks/acquire b/scripts/atomic/fallbacks/acquire index e38871e64db6..ea489acc285e 100755 --- a/scripts/atomic/fallbacks/acquire +++ b/scripts/atomic/fallbacks/acquire @@ -1,5 +1,5 @@ cat <counter);
diff --git a/scripts/atomic/fallbacks/release index 3f628a3802d9..730d2a6d3e07 100755 --- a/scripts/atomic/fallbacks/release +++ b/scripts/atomic/fallbacks/release @@ -1,5 +1,5 @@ cat <counter, i);
diff --git a/scripts/atomic/fallbacks/sub_and_test b/scripts/atomic/fallbacks/sub_and_test index 289ef17a2d7a..6cfe4ed49746 100755 --- a/scripts/atomic/fallbacks/sub_and_test +++ b/scripts/atomic/fallbacks/sub_and_test @@ -8,7 +8,7 @@ cat < + EOF for xchg in "xchg" "cmpxchg" "cmpxchg64"; do
-- cgit v1.2.3
From e33f9a169747880a008dd5e7b934fc592e91cd63 Mon Sep 17 00:00:00 2001 From: Marco Elver Date: Thu, 12 Dec 2019 01:07:09 +0100 Subject: kcsan: Add __no_kcsan function attribute
Since the use of -fsanitize=thread is an implementation detail of KCSAN, the name __no_sanitize_thread could be misleading if used widely. Instead, we introduce the __no_kcsan attribute which is shorter and more accurate in the context of KCSAN. This matches the attribute name __no_kcsan_or_inline. The use of __no_kcsan_or_inline itself is still required for __always_inline functions to retain compatibility with older compilers. Signed-off-by: Marco Elver Signed-off-by: Paul E.
McKenney --- include/linux/compiler-gcc.h | 3 +-- include/linux/compiler.h | 7 +++++-- 2 files changed, 6 insertions(+), 4 deletions(-) (limited to 'include/linux')
diff --git a/include/linux/compiler-gcc.h b/include/linux/compiler-gcc.h index 0eb2a1cc411d..cf294faec2f8 100644 --- a/include/linux/compiler-gcc.h +++ b/include/linux/compiler-gcc.h @@ -146,8 +146,7 @@ #endif #if defined(__SANITIZE_THREAD__) && __has_attribute(__no_sanitize_thread__) -#define __no_sanitize_thread \ - __attribute__((__noinline__)) __attribute__((no_sanitize_thread)) +#define __no_sanitize_thread __attribute__((no_sanitize_thread)) #else #define __no_sanitize_thread #endif
diff --git a/include/linux/compiler.h b/include/linux/compiler.h index ad8c76144a3c..8c0beb10c1dd 100644 --- a/include/linux/compiler.h +++ b/include/linux/compiler.h @@ -207,12 +207,15 @@ void ftrace_likely_update(struct ftrace_likely_data *f, int val, # define __no_kasan_or_inline __always_inline #endif +#define __no_kcsan __no_sanitize_thread #ifdef __SANITIZE_THREAD__ /* * Rely on __SANITIZE_THREAD__ instead of CONFIG_KCSAN, to avoid not inlining in - * compilation units where instrumentation is disabled. + * compilation units where instrumentation is disabled. The attribute 'noinline' + * is required for older compilers, where implicit inlining of very small + * functions renders __no_sanitize_thread ineffective. */ -# define __no_kcsan_or_inline __no_sanitize_thread notrace __maybe_unused +# define __no_kcsan_or_inline __no_kcsan noinline notrace __maybe_unused # define __no_sanitize_or_inline __no_kcsan_or_inline #else # define __no_kcsan_or_inline __always_inline
-- cgit v1.2.3
From 33225d7b0ac9903c5701bbede7dfdaeba74ad6c3 Mon Sep 17 00:00:00 2001 From: Benjamin Herrenschmidt Date: Thu, 13 Feb 2020 10:51:33 +0100 Subject: printk: Correctly set CON_CONSDEV even when preferred console was not registered
The CON_CONSDEV flag was historically used to put/keep the preferred console first in the console_drivers list, where the preferred console is the last one specified on the command line. The ordering is important only when opening /dev/console: + tty_kopen() + tty_lookup_driver() + console_device() The flag was originally an implementation detail, but it was later made accessible from userspace via /proc/consoles. It was used, for example, by the tool "showconsole" to show the real tty accessible via /dev/console, see https://github.com/bitstreamout/showconsole Now, the current code sets CON_CONSDEV only for the preferred console or when a fallback console is added. The flag is not set when the preferred console is defined on the command line but is not registered for some reason. A simple solution is to set the CON_CONSDEV flag for the first registered console. It will work most of the time because: + Most real consoles have console->device defined. + Boot consoles are removed in printk_late_init(). + unregister_console() moves the CON_CONSDEV flag to the next console. A clean solution would require checking con->device when the preferred console is registered, and again in unregister_console(). Conclusion: use the simple solution for now. It is better than the current state and good enough. The clean solution is not worth it: it would complicate the already complicated code without much gain, and the code would instead deserve a complete rewrite. Link: https://lore.kernel.org/r/20200213095133.23176-4-pmladek@suse.com Signed-off-by: Benjamin Herrenschmidt [pmladek@suse.com: Correct reasoning in the commit message, comment update.]
Reviewed-by: Petr Mladek Reviewed-by: Sergey Senozhatsky Signed-off-by: Petr Mladek --- include/linux/console.h | 2 +- kernel/printk/printk.c | 2 ++ 2 files changed, 3 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/console.h b/include/linux/console.h index d09951d5a94e..72141f71c014 100644 --- a/include/linux/console.h +++ b/include/linux/console.h @@ -135,7 +135,7 @@ static inline int con_debug_leave(void) */ #define CON_PRINTBUFFER (1) -#define CON_CONSDEV (2) /* Last on the command line */ +#define CON_CONSDEV (2) /* Preferred console, /dev/console */ #define CON_ENABLED (4) #define CON_BOOT (8) #define CON_ANYTIME (16) /* Safe to call when cpu is offline */ diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c index f76ef3f0efca..cf0ceacdae2f 100644 --- a/kernel/printk/printk.c +++ b/kernel/printk/printk.c @@ -2788,6 +2788,8 @@ void register_console(struct console *newcon) console_drivers = newcon; if (newcon->next) newcon->next->flags &= ~CON_CONSDEV; + /* Ensure this flag is always set for the head of the list */ + newcon->flags |= CON_CONSDEV; } else { newcon->next = console_drivers->next; console_drivers->next = newcon; -- cgit v1.2.3 From db751e309ff05461a0c8e114b1238d7a69cc1f18 Mon Sep 17 00:00:00 2001 From: Dave Martin Date: Mon, 16 Mar 2020 16:50:43 +0000 Subject: ELF: UAPI and Kconfig additions for ELF program properties Pull the basic ELF definitions relating to the NT_GNU_PROPERTY_TYPE_0 note from Yu-Cheng Yu's earlier x86 shstk series. Signed-off-by: Mark Brown Signed-off-by: Dave Martin Signed-off-by: Yu-cheng Yu Reviewed-by: Catalin Marinas Reviewed-by: Kees Cook Signed-off-by: Catalin Marinas --- fs/Kconfig.binfmt | 3 +++ include/linux/elf.h | 12 ++++++++++++ include/uapi/linux/elf.h | 1 + 3 files changed, 16 insertions(+) (limited to 'include/linux') diff --git a/fs/Kconfig.binfmt b/fs/Kconfig.binfmt index 62dc4f577ba1..d2cfe0729a73 100644 --- a/fs/Kconfig.binfmt +++ b/fs/Kconfig.binfmt @@ -36,6 +36,9 @@ config COMPAT_BINFMT_ELF config ARCH_BINFMT_ELF_STATE bool +config ARCH_USE_GNU_PROPERTY + bool + config BINFMT_ELF_FDPIC bool "Kernel support for FDPIC ELF binaries" default y if !BINFMT_ELF diff --git a/include/linux/elf.h b/include/linux/elf.h index e3649b3e970e..f7b24c5fcfb6 100644 --- a/include/linux/elf.h +++ b/include/linux/elf.h @@ -2,6 +2,7 @@ #ifndef _LINUX_ELF_H #define _LINUX_ELF_H +#include #include #include @@ -56,4 +57,15 @@ static inline int elf_coredump_extra_notes_write(struct coredump_params *cprm) { extern int elf_coredump_extra_notes_size(void); extern int elf_coredump_extra_notes_write(struct coredump_params *cprm); #endif + +/* + * NT_GNU_PROPERTY_TYPE_0 header: + * Keep this internal until/unless there is an agreed UAPI definition. + * pr_type values (GNU_PROPERTY_*) are public and defined in the UAPI header. 
+ */ +struct gnu_property { + u32 pr_type; + u32 pr_datasz; +}; + #endif /* _LINUX_ELF_H */
diff --git a/include/uapi/linux/elf.h b/include/uapi/linux/elf.h index 34c02e4290fe..c37731407074 100644 --- a/include/uapi/linux/elf.h +++ b/include/uapi/linux/elf.h @@ -36,6 +36,7 @@ typedef __s64 Elf64_Sxword; #define PT_LOPROC 0x70000000 #define PT_HIPROC 0x7fffffff #define PT_GNU_EH_FRAME 0x6474e550 +#define PT_GNU_PROPERTY 0x6474e553 #define PT_GNU_STACK (PT_LOOS + 0x474e551)
-- cgit v1.2.3
From 00e19ceec80b03a43f626f891fcc53e57919f1b3 Mon Sep 17 00:00:00 2001 From: Dave Martin Date: Mon, 16 Mar 2020 16:50:44 +0000 Subject: ELF: Add ELF program property parsing support
ELF program properties will be needed for detecting whether to enable optional architecture or ABI features for a new ELF process. For now, there are no generic properties that we care about, so do nothing unless CONFIG_ARCH_USE_GNU_PROPERTY=y. Otherwise, detect the presence of properties via the PT_GNU_PROPERTY phdr entry (if any), and notify each property to the arch code. For now, the added code is not used. Signed-off-by: Mark Brown Signed-off-by: Dave Martin Reviewed-by: Kees Cook Signed-off-by: Catalin Marinas --- fs/binfmt_elf.c | 127 +++++++++++++++++++++++++++++++++++++++++++++++ fs/compat_binfmt_elf.c | 4 ++ include/linux/elf.h | 19 +++++++ include/uapi/linux/elf.h | 4 ++ 4 files changed, 154 insertions(+) (limited to 'include/linux')
diff --git a/fs/binfmt_elf.c b/fs/binfmt_elf.c index f4713ea76e82..1fb67e506b68 100644 --- a/fs/binfmt_elf.c +++ b/fs/binfmt_elf.c @@ -39,12 +39,18 @@ #include #include #include +#include +#include #include #include #include #include #include +#ifndef ELF_COMPAT +#define ELF_COMPAT 0 +#endif + #ifndef user_long_t #define user_long_t long #endif @@ -681,6 +687,111 @@ out: * libraries. There is no binary dependent code anywhere else. */ +static int parse_elf_property(const char *data, size_t *off, size_t datasz, + struct arch_elf_state *arch, + bool have_prev_type, u32 *prev_type) +{ + size_t o, step; + const struct gnu_property *pr; + int ret; + + if (*off == datasz) + return -ENOENT; + + if (WARN_ON_ONCE(*off > datasz || *off % ELF_GNU_PROPERTY_ALIGN)) + return -EIO; + o = *off; + datasz -= *off; + + if (datasz < sizeof(*pr)) + return -ENOEXEC; + pr = (const struct gnu_property *)(data + o); + o += sizeof(*pr); + datasz -= sizeof(*pr); + + if (pr->pr_datasz > datasz) + return -ENOEXEC; + + WARN_ON_ONCE(o % ELF_GNU_PROPERTY_ALIGN); + step = round_up(pr->pr_datasz, ELF_GNU_PROPERTY_ALIGN); + if (step > datasz) + return -ENOEXEC; + + /* Properties are supposed to be unique and sorted on pr_type: */ + if (have_prev_type && pr->pr_type <= *prev_type) + return -ENOEXEC; + *prev_type = pr->pr_type; + + ret = arch_parse_elf_property(pr->pr_type, data + o, + pr->pr_datasz, ELF_COMPAT, arch); + if (ret) + return ret; + + *off = o + step; + return 0; +} + +#define NOTE_DATA_SZ SZ_1K +#define GNU_PROPERTY_TYPE_0_NAME "GNU" +#define NOTE_NAME_SZ (sizeof(GNU_PROPERTY_TYPE_0_NAME)) + +static int parse_elf_properties(struct file *f, const struct elf_phdr *phdr, + struct arch_elf_state *arch) +{ + union { + struct elf_note nhdr; + char data[NOTE_DATA_SZ]; + } note; + loff_t pos; + ssize_t n; + size_t off, datasz; + int ret; + bool have_prev_type; + u32 prev_type; + + if (!IS_ENABLED(CONFIG_ARCH_USE_GNU_PROPERTY) || !phdr) + return 0; + + /* load_elf_binary() shouldn't call us unless this is true... 
*/ + if (WARN_ON_ONCE(phdr->p_type != PT_GNU_PROPERTY)) + return -ENOEXEC; + + /* If the properties are crazy large, that's too bad (for now): */ + if (phdr->p_filesz > sizeof(note)) + return -ENOEXEC; + + pos = phdr->p_offset; + n = kernel_read(f, ¬e, phdr->p_filesz, &pos); + + BUILD_BUG_ON(sizeof(note) < sizeof(note.nhdr) + NOTE_NAME_SZ); + if (n < 0 || n < sizeof(note.nhdr) + NOTE_NAME_SZ) + return -EIO; + + if (note.nhdr.n_type != NT_GNU_PROPERTY_TYPE_0 || + note.nhdr.n_namesz != NOTE_NAME_SZ || + strncmp(note.data + sizeof(note.nhdr), + GNU_PROPERTY_TYPE_0_NAME, n - sizeof(note.nhdr))) + return -ENOEXEC; + + off = round_up(sizeof(note.nhdr) + NOTE_NAME_SZ, + ELF_GNU_PROPERTY_ALIGN); + if (off > n) + return -ENOEXEC; + + if (note.nhdr.n_descsz > n - off) + return -ENOEXEC; + datasz = off + note.nhdr.n_descsz; + + have_prev_type = false; + do { + ret = parse_elf_property(note.data, &off, datasz, arch, + have_prev_type, &prev_type); + have_prev_type = true; + } while (!ret); + + return ret == -ENOENT ? 0 : ret; +} + static int load_elf_binary(struct linux_binprm *bprm) { struct file *interpreter = NULL; /* to shut gcc up */ @@ -688,6 +799,7 @@ static int load_elf_binary(struct linux_binprm *bprm) int load_addr_set = 0; unsigned long error; struct elf_phdr *elf_ppnt, *elf_phdata, *interp_elf_phdata = NULL; + struct elf_phdr *elf_property_phdata = NULL; unsigned long elf_bss, elf_brk; int bss_prot = 0; int retval, i; @@ -733,6 +845,11 @@ static int load_elf_binary(struct linux_binprm *bprm) for (i = 0; i < elf_ex->e_phnum; i++, elf_ppnt++) { char *elf_interpreter; + if (elf_ppnt->p_type == PT_GNU_PROPERTY) { + elf_property_phdata = elf_ppnt; + continue; + } + if (elf_ppnt->p_type != PT_INTERP) continue; @@ -820,9 +937,14 @@ out_free_interp: goto out_free_dentry; /* Pass PT_LOPROC..PT_HIPROC headers to arch code */ + elf_property_phdata = NULL; elf_ppnt = interp_elf_phdata; for (i = 0; i < loc->interp_elf_ex.e_phnum; i++, elf_ppnt++) switch (elf_ppnt->p_type) { + case PT_GNU_PROPERTY: + elf_property_phdata = elf_ppnt; + break; + case PT_LOPROC ... PT_HIPROC: retval = arch_elf_pt_proc(&loc->interp_elf_ex, elf_ppnt, interpreter, @@ -833,6 +955,11 @@ out_free_interp: } } + retval = parse_elf_properties(interpreter ?: bprm->file, + elf_property_phdata, &arch_state); + if (retval) + goto out_free_dentry; + /* * Allow arch code to reject the ELF at this point, whilst it's * still possible to return an error to the code that invoked diff --git a/fs/compat_binfmt_elf.c b/fs/compat_binfmt_elf.c index aaad4ca1217e..13a087bc816b 100644 --- a/fs/compat_binfmt_elf.c +++ b/fs/compat_binfmt_elf.c @@ -17,6 +17,8 @@ #include #include +#define ELF_COMPAT 1 + /* * Rename the basic ELF layout types to refer to the 32-bit class of files. */ @@ -28,11 +30,13 @@ #undef elf_shdr #undef elf_note #undef elf_addr_t +#undef ELF_GNU_PROPERTY_ALIGN #define elfhdr elf32_hdr #define elf_phdr elf32_phdr #define elf_shdr elf32_shdr #define elf_note elf32_note #define elf_addr_t Elf32_Addr +#define ELF_GNU_PROPERTY_ALIGN ELF32_GNU_PROPERTY_ALIGN /* * Some data types as stored in coredump. 
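For illustration only, an architecture hook consuming these property notifications might look roughly like the sketch below. Only the arch_parse_elf_property() signature comes from this patch; GNU_PROPERTY_EXAMPLE_FEATURE and the example_feature_flags field are made-up placeholder names:

#include <linux/types.h>
#include <linux/errno.h>

/* Placeholder property type in the processor-specific range; invented
 * purely for this sketch. */
#define GNU_PROPERTY_EXAMPLE_FEATURE	0xc0000000

/* Hypothetical per-exec arch state; real architectures define their own. */
struct arch_elf_state {
	u32 example_feature_flags;
};

int arch_parse_elf_property(u32 type, const void *data, size_t datasz,
			    bool compat, struct arch_elf_state *arch)
{
	if (type != GNU_PROPERTY_EXAMPLE_FEATURE)
		return 0;		/* unknown properties are ignored */

	if (datasz != sizeof(u32))
		return -ENOEXEC;	/* malformed property: reject the binary */

	arch->example_feature_flags = *(const u32 *)data;
	return 0;
}

Returning 0 for an unknown pr_type simply skips the property, while a non-zero error such as -ENOEXEC propagates out of parse_elf_properties() and rejects the binary, matching how the parser above treats malformed notes.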
diff --git a/include/linux/elf.h b/include/linux/elf.h index f7b24c5fcfb6..db5113479f5e 100644 --- a/include/linux/elf.h +++ b/include/linux/elf.h @@ -22,6 +22,9 @@ SET_PERSONALITY(ex) #endif +#define ELF32_GNU_PROPERTY_ALIGN 4 +#define ELF64_GNU_PROPERTY_ALIGN 8 + #if ELF_CLASS == ELFCLASS32 extern Elf32_Dyn _DYNAMIC []; @@ -32,6 +35,7 @@ extern Elf32_Dyn _DYNAMIC []; #define elf_addr_t Elf32_Off #define Elf_Half Elf32_Half #define Elf_Word Elf32_Word +#define ELF_GNU_PROPERTY_ALIGN ELF32_GNU_PROPERTY_ALIGN #else @@ -43,6 +47,7 @@ extern Elf64_Dyn _DYNAMIC []; #define elf_addr_t Elf64_Off #define Elf_Half Elf64_Half #define Elf_Word Elf64_Word +#define ELF_GNU_PROPERTY_ALIGN ELF64_GNU_PROPERTY_ALIGN #endif @@ -68,4 +73,18 @@ struct gnu_property { u32 pr_datasz; }; +struct arch_elf_state; + +#ifndef CONFIG_ARCH_USE_GNU_PROPERTY +static inline int arch_parse_elf_property(u32 type, const void *data, + size_t datasz, bool compat, + struct arch_elf_state *arch) +{ + return 0; +} +#else +extern int arch_parse_elf_property(u32 type, const void *data, size_t datasz, + bool compat, struct arch_elf_state *arch); +#endif + #endif /* _LINUX_ELF_H */
diff --git a/include/uapi/linux/elf.h b/include/uapi/linux/elf.h index c37731407074..20900f4496b7 100644 --- a/include/uapi/linux/elf.h +++ b/include/uapi/linux/elf.h @@ -368,6 +368,7 @@ typedef struct elf64_shdr { * Notes used in ET_CORE. Architectures export some of the arch register sets * using the corresponding note types via the PTRACE_GETREGSET and * PTRACE_SETREGSET requests. + * The note name for all these is "LINUX". */ #define NT_PRSTATUS 1 #define NT_PRFPREG 2 @@ -430,6 +431,9 @@ typedef struct elf64_shdr { #define NT_MIPS_FP_MODE 0x801 /* MIPS floating-point mode */ #define NT_MIPS_MSA 0x802 /* MIPS SIMD registers */ +/* Note types with note name "GNU" */ +#define NT_GNU_PROPERTY_TYPE_0 5 + /* Note header in a PT_NOTE section */ typedef struct elf32_note { Elf32_Word n_namesz; /* Name size */
-- cgit v1.2.3
From 8ef8f360cf30be12382f89ff48a57fbbd9b31c14 Mon Sep 17 00:00:00 2001 From: Dave Martin Date: Mon, 16 Mar 2020 16:50:45 +0000 Subject: arm64: Basic Branch Target Identification support
This patch adds the bare minimum required to expose the ARMv8.5 Branch Target Identification feature to userspace. By itself, this does _not_ automatically enable BTI for any initial executable pages mapped by execve(). This will come later, but for now it should be possible to enable BTI manually on those pages by using mprotect() from within the target process. Other arches already using the generic mman.h use 0x10 for arch-specific prot flags, so we use that for PROT_BTI here. For consistency, signal handler entry points in BTI guarded pages are required to be annotated as such, just like any other function. This blocks a relatively minor attack vector, but conforming userspace will have the annotations anyway, so we may as well enforce them. 
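As a rough userspace sketch of that manual opt-in (not part of this patch; the HWCAP2_BTI and PROT_BTI fallback values mirror the uapi headers added below):

#include <stddef.h>
#include <sys/auxv.h>
#include <sys/mman.h>

#ifndef HWCAP2_BTI
#define HWCAP2_BTI	(1 << 17)	/* arch/arm64/include/uapi/asm/hwcap.h */
#endif
#ifndef PROT_BTI
#define PROT_BTI	0x10		/* arch/arm64/include/uapi/asm/mman.h */
#endif

/* Opt an already-mapped executable region into BTI guarding. */
static int enable_bti(void *addr, size_t len)
{
	/* Without CPU/kernel support, arch_validate_prot() rejects PROT_BTI
	 * and mprotect() fails with EINVAL, so gate on the hwcap first. */
	if (!(getauxval(AT_HWCAP2) & HWCAP2_BTI))
		return 0;

	return mprotect(addr, len, PROT_READ | PROT_EXEC | PROT_BTI);
}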
Signed-off-by: Mark Brown Signed-off-by: Dave Martin Reviewed-by: Catalin Marinas Signed-off-by: Catalin Marinas --- Documentation/arm64/cpu-feature-registers.rst | 2 ++ Documentation/arm64/elf_hwcaps.rst | 5 ++++ arch/arm64/include/asm/cpucaps.h | 3 ++- arch/arm64/include/asm/cpufeature.h | 6 +++++ arch/arm64/include/asm/esr.h | 2 +- arch/arm64/include/asm/exception.h | 1 + arch/arm64/include/asm/hwcap.h | 1 + arch/arm64/include/asm/mman.h | 37 +++++++++++++++++++++++++++ arch/arm64/include/asm/pgtable-hwdef.h | 1 + arch/arm64/include/asm/pgtable.h | 2 +- arch/arm64/include/asm/ptrace.h | 1 + arch/arm64/include/asm/sysreg.h | 4 +++ arch/arm64/include/uapi/asm/hwcap.h | 1 + arch/arm64/include/uapi/asm/mman.h | 9 +++++++ arch/arm64/include/uapi/asm/ptrace.h | 9 +++++++ arch/arm64/kernel/cpufeature.c | 33 ++++++++++++++++++++++++ arch/arm64/kernel/cpuinfo.c | 1 + arch/arm64/kernel/entry-common.c | 11 ++++++++ arch/arm64/kernel/ptrace.c | 2 +- arch/arm64/kernel/signal.c | 16 ++++++++++++ arch/arm64/kernel/syscall.c | 18 +++++++++++++ arch/arm64/kernel/traps.c | 8 ++++++ include/linux/mm.h | 3 +++ 23 files changed, 172 insertions(+), 4 deletions(-) create mode 100644 arch/arm64/include/asm/mman.h create mode 100644 arch/arm64/include/uapi/asm/mman.h (limited to 'include/linux')
diff --git a/Documentation/arm64/cpu-feature-registers.rst b/Documentation/arm64/cpu-feature-registers.rst index 41937a8091aa..314fa5bc2655 100644 --- a/Documentation/arm64/cpu-feature-registers.rst +++ b/Documentation/arm64/cpu-feature-registers.rst @@ -176,6 +176,8 @@ infrastructure: +------------------------------+---------+---------+ | SSBS | [7-4] | y | +------------------------------+---------+---------+ + | BT | [3-0] | y | + +------------------------------+---------+---------+ 4) MIDR_EL1 - Main ID Register
diff --git a/Documentation/arm64/elf_hwcaps.rst b/Documentation/arm64/elf_hwcaps.rst index 7dfb97dfe416..84a9fd2d41b4 100644 --- a/Documentation/arm64/elf_hwcaps.rst +++ b/Documentation/arm64/elf_hwcaps.rst @@ -236,6 +236,11 @@ HWCAP2_RNG Functionality implied by ID_AA64ISAR0_EL1.RNDR == 0b0001. +HWCAP2_BTI + + Functionality implied by ID_AA64PFR1_EL1.BT == 0b0001. + + 4. 
Unused AT_HWCAP bits ----------------------- diff --git a/arch/arm64/include/asm/cpucaps.h b/arch/arm64/include/asm/cpucaps.h index 865e0253fc1e..58e776c22aab 100644 --- a/arch/arm64/include/asm/cpucaps.h +++ b/arch/arm64/include/asm/cpucaps.h @@ -58,7 +58,8 @@ #define ARM64_WORKAROUND_SPECULATIVE_AT_NVHE 48 #define ARM64_HAS_E0PD 49 #define ARM64_HAS_RNG 50 +#define ARM64_BTI 51 -#define ARM64_NCAPS 51 +#define ARM64_NCAPS 52 #endif /* __ASM_CPUCAPS_H */ diff --git a/arch/arm64/include/asm/cpufeature.h b/arch/arm64/include/asm/cpufeature.h index 92ef9539874a..e3ebcc59e83b 100644 --- a/arch/arm64/include/asm/cpufeature.h +++ b/arch/arm64/include/asm/cpufeature.h @@ -613,6 +613,12 @@ static inline bool system_has_prio_mask_debugging(void) system_uses_irq_prio_masking(); } +static inline bool system_supports_bti(void) +{ + return IS_ENABLED(CONFIG_ARM64_BTI) && + cpus_have_const_cap(ARM64_BTI); +} + static inline bool system_capabilities_finalized(void) { return static_branch_likely(&arm64_const_caps_ready); diff --git a/arch/arm64/include/asm/esr.h b/arch/arm64/include/asm/esr.h index cb29253ae86b..390b8ba67830 100644 --- a/arch/arm64/include/asm/esr.h +++ b/arch/arm64/include/asm/esr.h @@ -22,7 +22,7 @@ #define ESR_ELx_EC_PAC (0x09) /* EL2 and above */ /* Unallocated EC: 0x0A - 0x0B */ #define ESR_ELx_EC_CP14_64 (0x0C) -/* Unallocated EC: 0x0d */ +#define ESR_ELx_EC_BTI (0x0D) #define ESR_ELx_EC_ILL (0x0E) /* Unallocated EC: 0x0F - 0x10 */ #define ESR_ELx_EC_SVC32 (0x11) diff --git a/arch/arm64/include/asm/exception.h b/arch/arm64/include/asm/exception.h index 7a6e81ca23a8..7577a754d443 100644 --- a/arch/arm64/include/asm/exception.h +++ b/arch/arm64/include/asm/exception.h @@ -34,6 +34,7 @@ static inline u32 disr_to_esr(u64 disr) asmlinkage void enter_from_user_mode(void); void do_mem_abort(unsigned long addr, unsigned int esr, struct pt_regs *regs); void do_undefinstr(struct pt_regs *regs); +void do_bti(struct pt_regs *regs); asmlinkage void bad_mode(struct pt_regs *regs, int reason, unsigned int esr); void do_debug_exception(unsigned long addr_if_watchpoint, unsigned int esr, struct pt_regs *regs); diff --git a/arch/arm64/include/asm/hwcap.h b/arch/arm64/include/asm/hwcap.h index 0f00265248b5..d683bcbf1e7c 100644 --- a/arch/arm64/include/asm/hwcap.h +++ b/arch/arm64/include/asm/hwcap.h @@ -94,6 +94,7 @@ #define KERNEL_HWCAP_BF16 __khwcap2_feature(BF16) #define KERNEL_HWCAP_DGH __khwcap2_feature(DGH) #define KERNEL_HWCAP_RNG __khwcap2_feature(RNG) +#define KERNEL_HWCAP_BTI __khwcap2_feature(BTI) /* * This yields a mask that user programs can use to figure out what diff --git a/arch/arm64/include/asm/mman.h b/arch/arm64/include/asm/mman.h new file mode 100644 index 000000000000..081ec8de9ea6 --- /dev/null +++ b/arch/arm64/include/asm/mman.h @@ -0,0 +1,37 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef __ASM_MMAN_H__ +#define __ASM_MMAN_H__ + +#include +#include +#include + +static inline unsigned long arch_calc_vm_prot_bits(unsigned long prot, + unsigned long pkey __always_unused) +{ + if (system_supports_bti() && (prot & PROT_BTI)) + return VM_ARM64_BTI; + + return 0; +} +#define arch_calc_vm_prot_bits(prot, pkey) arch_calc_vm_prot_bits(prot, pkey) + +static inline pgprot_t arch_vm_get_page_prot(unsigned long vm_flags) +{ + return (vm_flags & VM_ARM64_BTI) ? 
__pgprot(PTE_GP) : __pgprot(0); +} +#define arch_vm_get_page_prot(vm_flags) arch_vm_get_page_prot(vm_flags) + +static inline bool arch_validate_prot(unsigned long prot, + unsigned long addr __always_unused) +{ + unsigned long supported = PROT_READ | PROT_WRITE | PROT_EXEC | PROT_SEM; + + if (system_supports_bti()) + supported |= PROT_BTI; + + return (prot & ~supported) == 0; +} +#define arch_validate_prot(prot, addr) arch_validate_prot(prot, addr) + +#endif /* ! __ASM_MMAN_H__ */ diff --git a/arch/arm64/include/asm/pgtable-hwdef.h b/arch/arm64/include/asm/pgtable-hwdef.h index 6bf5e650da78..167f1d1d48aa 100644 --- a/arch/arm64/include/asm/pgtable-hwdef.h +++ b/arch/arm64/include/asm/pgtable-hwdef.h @@ -151,6 +151,7 @@ #define PTE_SHARED (_AT(pteval_t, 3) << 8) /* SH[1:0], inner shareable */ #define PTE_AF (_AT(pteval_t, 1) << 10) /* Access Flag */ #define PTE_NG (_AT(pteval_t, 1) << 11) /* nG */ +#define PTE_GP (_AT(pteval_t, 1) << 50) /* BTI guarded */ #define PTE_DBM (_AT(pteval_t, 1) << 51) /* Dirty Bit Management */ #define PTE_CONT (_AT(pteval_t, 1) << 52) /* Contiguous range */ #define PTE_PXN (_AT(pteval_t, 1) << 53) /* Privileged XN */ diff --git a/arch/arm64/include/asm/pgtable.h b/arch/arm64/include/asm/pgtable.h index 538c85e62f86..4fbf516d8cb2 100644 --- a/arch/arm64/include/asm/pgtable.h +++ b/arch/arm64/include/asm/pgtable.h @@ -660,7 +660,7 @@ static inline phys_addr_t pgd_page_paddr(pgd_t pgd) static inline pte_t pte_modify(pte_t pte, pgprot_t newprot) { const pteval_t mask = PTE_USER | PTE_PXN | PTE_UXN | PTE_RDONLY | - PTE_PROT_NONE | PTE_VALID | PTE_WRITE; + PTE_PROT_NONE | PTE_VALID | PTE_WRITE | PTE_GP; /* preserve the hardware dirty information */ if (pte_hw_dirty(pte)) pte = pte_mkdirty(pte); diff --git a/arch/arm64/include/asm/ptrace.h b/arch/arm64/include/asm/ptrace.h index bf57308fcd63..2172ec7594ba 100644 --- a/arch/arm64/include/asm/ptrace.h +++ b/arch/arm64/include/asm/ptrace.h @@ -35,6 +35,7 @@ #define GIC_PRIO_PSR_I_SET (1 << 4) /* Additional SPSR bits not exposed in the UABI */ + #define PSR_IL_BIT (1 << 20) /* AArch32-specific ptrace requests */ diff --git a/arch/arm64/include/asm/sysreg.h b/arch/arm64/include/asm/sysreg.h index b91570ff9db1..db08ceb4cc9a 100644 --- a/arch/arm64/include/asm/sysreg.h +++ b/arch/arm64/include/asm/sysreg.h @@ -514,6 +514,8 @@ #endif /* SCTLR_EL1 specific flags. 
*/ +#define SCTLR_EL1_BT1 (BIT(36)) +#define SCTLR_EL1_BT0 (BIT(35)) #define SCTLR_EL1_UCI (BIT(26)) #define SCTLR_EL1_E0E (BIT(24)) #define SCTLR_EL1_SPAN (BIT(23)) @@ -620,10 +622,12 @@ /* id_aa64pfr1 */ #define ID_AA64PFR1_SSBS_SHIFT 4 +#define ID_AA64PFR1_BT_SHIFT 0 #define ID_AA64PFR1_SSBS_PSTATE_NI 0 #define ID_AA64PFR1_SSBS_PSTATE_ONLY 1 #define ID_AA64PFR1_SSBS_PSTATE_INSNS 2 +#define ID_AA64PFR1_BT_BTI 0x1 /* id_aa64zfr0 */ #define ID_AA64ZFR0_F64MM_SHIFT 56 diff --git a/arch/arm64/include/uapi/asm/hwcap.h b/arch/arm64/include/uapi/asm/hwcap.h index 7752d93bb50f..2d6ba1c2592e 100644 --- a/arch/arm64/include/uapi/asm/hwcap.h +++ b/arch/arm64/include/uapi/asm/hwcap.h @@ -73,5 +73,6 @@ #define HWCAP2_BF16 (1 << 14) #define HWCAP2_DGH (1 << 15) #define HWCAP2_RNG (1 << 16) +#define HWCAP2_BTI (1 << 17) #endif /* _UAPI__ASM_HWCAP_H */ diff --git a/arch/arm64/include/uapi/asm/mman.h b/arch/arm64/include/uapi/asm/mman.h new file mode 100644 index 000000000000..6fdd71eb644f --- /dev/null +++ b/arch/arm64/include/uapi/asm/mman.h @@ -0,0 +1,9 @@ +/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */ +#ifndef _UAPI__ASM_MMAN_H +#define _UAPI__ASM_MMAN_H + +#include + +#define PROT_BTI 0x10 /* BTI guarded page */ + +#endif /* ! _UAPI__ASM_MMAN_H */ diff --git a/arch/arm64/include/uapi/asm/ptrace.h b/arch/arm64/include/uapi/asm/ptrace.h index d1bb5b69f1ce..42cbe34d95ce 100644 --- a/arch/arm64/include/uapi/asm/ptrace.h +++ b/arch/arm64/include/uapi/asm/ptrace.h @@ -46,6 +46,7 @@ #define PSR_I_BIT 0x00000080 #define PSR_A_BIT 0x00000100 #define PSR_D_BIT 0x00000200 +#define PSR_BTYPE_MASK 0x00000c00 #define PSR_SSBS_BIT 0x00001000 #define PSR_PAN_BIT 0x00400000 #define PSR_UAO_BIT 0x00800000 @@ -55,6 +56,8 @@ #define PSR_Z_BIT 0x40000000 #define PSR_N_BIT 0x80000000 +#define PSR_BTYPE_SHIFT 10 + /* * Groups of PSR bits */ @@ -63,6 +66,12 @@ #define PSR_x 0x0000ff00 /* Extension */ #define PSR_c 0x000000ff /* Control */ +/* Convenience names for the values of PSTATE.BTYPE */ +#define PSR_BTYPE_NONE (0b00 << PSR_BTYPE_SHIFT) +#define PSR_BTYPE_JC (0b01 << PSR_BTYPE_SHIFT) +#define PSR_BTYPE_C (0b10 << PSR_BTYPE_SHIFT) +#define PSR_BTYPE_J (0b11 << PSR_BTYPE_SHIFT) + /* syscall emulation path in ptrace */ #define PTRACE_SYSEMU 31 #define PTRACE_SYSEMU_SINGLESTEP 32 diff --git a/arch/arm64/kernel/cpufeature.c b/arch/arm64/kernel/cpufeature.c index 0b6715625cf6..e6d31776e49b 100644 --- a/arch/arm64/kernel/cpufeature.c +++ b/arch/arm64/kernel/cpufeature.c @@ -179,6 +179,8 @@ static const struct arm64_ftr_bits ftr_id_aa64pfr0[] = { static const struct arm64_ftr_bits ftr_id_aa64pfr1[] = { ARM64_FTR_BITS(FTR_VISIBLE, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64PFR1_SSBS_SHIFT, 4, ID_AA64PFR1_SSBS_PSTATE_NI), + ARM64_FTR_BITS(FTR_VISIBLE_IF_IS_ENABLED(CONFIG_ARM64_BTI), + FTR_STRICT, FTR_LOWER_SAFE, ID_AA64PFR1_BT_SHIFT, 4, 0), ARM64_FTR_END, }; @@ -1347,6 +1349,21 @@ static bool can_use_gic_priorities(const struct arm64_cpu_capabilities *entry, } #endif +#ifdef CONFIG_ARM64_BTI +static void bti_enable(const struct arm64_cpu_capabilities *__unused) +{ + /* + * Use of X16/X17 for tail-calls and trampolines that jump to + * function entry points using BR is a requirement for + * marking binaries with GNU_PROPERTY_AARCH64_FEATURE_1_BTI. 
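+ * The SCTLR_EL1.BT0/BT1 controls make a PACIxSP instruction in a + * guarded page generate a Branch Target exception when it is reached + * with PSTATE.BTYPE == 0b11, i.e. via a BR from a register other + * than X16/X17.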
+ * So, be strict and forbid other BRs using other registers to + * jump onto a PACIxSP instruction: + */ + sysreg_clear_set(sctlr_el1, 0, SCTLR_EL1_BT0 | SCTLR_EL1_BT1); + isb(); +} +#endif /* CONFIG_ARM64_BTI */ + static const struct arm64_cpu_capabilities arm64_features[] = { { .desc = "GIC system register CPU interface", @@ -1671,6 +1688,19 @@ static const struct arm64_cpu_capabilities arm64_features[] = { .sign = FTR_UNSIGNED, .min_field_value = 1, }, +#endif +#ifdef CONFIG_ARM64_BTI + { + .desc = "Branch Target Identification", + .capability = ARM64_BTI, + .type = ARM64_CPUCAP_SYSTEM_FEATURE, + .matches = has_cpuid_feature, + .cpu_enable = bti_enable, + .sys_reg = SYS_ID_AA64PFR1_EL1, + .field_pos = ID_AA64PFR1_BT_SHIFT, + .min_field_value = ID_AA64PFR1_BT_BTI, + .sign = FTR_UNSIGNED, + }, #endif {}, }; @@ -1781,6 +1811,9 @@ static const struct arm64_cpu_capabilities arm64_elf_hwcaps[] = { HWCAP_CAP(SYS_ID_AA64ZFR0_EL1, ID_AA64ZFR0_F64MM_SHIFT, FTR_UNSIGNED, ID_AA64ZFR0_F64MM, CAP_HWCAP, KERNEL_HWCAP_SVEF64MM), #endif HWCAP_CAP(SYS_ID_AA64PFR1_EL1, ID_AA64PFR1_SSBS_SHIFT, FTR_UNSIGNED, ID_AA64PFR1_SSBS_PSTATE_INSNS, CAP_HWCAP, KERNEL_HWCAP_SSBS), +#ifdef CONFIG_ARM64_BTI + HWCAP_CAP(SYS_ID_AA64PFR1_EL1, ID_AA64PFR1_BT_SHIFT, FTR_UNSIGNED, ID_AA64PFR1_BT_BTI, CAP_HWCAP, KERNEL_HWCAP_BTI), +#endif #ifdef CONFIG_ARM64_PTR_AUTH HWCAP_MULTI_CAP(ptr_auth_hwcap_addr_matches, CAP_HWCAP, KERNEL_HWCAP_PACA), HWCAP_MULTI_CAP(ptr_auth_hwcap_gen_matches, CAP_HWCAP, KERNEL_HWCAP_PACG), diff --git a/arch/arm64/kernel/cpuinfo.c b/arch/arm64/kernel/cpuinfo.c index 86136075ae41..5e47e93b5dc1 100644 --- a/arch/arm64/kernel/cpuinfo.c +++ b/arch/arm64/kernel/cpuinfo.c @@ -92,6 +92,7 @@ static const char *const hwcap_str[] = { "bf16", "dgh", "rng", + "bti", NULL }; diff --git a/arch/arm64/kernel/entry-common.c b/arch/arm64/kernel/entry-common.c index fde59981445c..55ec0627f5a7 100644 --- a/arch/arm64/kernel/entry-common.c +++ b/arch/arm64/kernel/entry-common.c @@ -188,6 +188,14 @@ static void notrace el0_undef(struct pt_regs *regs) } NOKPROBE_SYMBOL(el0_undef); +static void notrace el0_bti(struct pt_regs *regs) +{ + user_exit_irqoff(); + local_daif_restore(DAIF_PROCCTX); + do_bti(regs); +} +NOKPROBE_SYMBOL(el0_bti); + static void notrace el0_inv(struct pt_regs *regs, unsigned long esr) { user_exit_irqoff(); @@ -255,6 +263,9 @@ asmlinkage void notrace el0_sync_handler(struct pt_regs *regs) case ESR_ELx_EC_UNKNOWN: el0_undef(regs); break; + case ESR_ELx_EC_BTI: + el0_bti(regs); + break; case ESR_ELx_EC_BREAKPT_LOW: case ESR_ELx_EC_SOFTSTP_LOW: case ESR_ELx_EC_WATCHPT_LOW: diff --git a/arch/arm64/kernel/ptrace.c b/arch/arm64/kernel/ptrace.c index cd6e5fa48b9c..fd8ac7cf68e7 100644 --- a/arch/arm64/kernel/ptrace.c +++ b/arch/arm64/kernel/ptrace.c @@ -1874,7 +1874,7 @@ void syscall_trace_exit(struct pt_regs *regs) */ #define SPSR_EL1_AARCH64_RES0_BITS \ (GENMASK_ULL(63, 32) | GENMASK_ULL(27, 25) | GENMASK_ULL(23, 22) | \ - GENMASK_ULL(20, 13) | GENMASK_ULL(11, 10) | GENMASK_ULL(5, 5)) + GENMASK_ULL(20, 13) | GENMASK_ULL(5, 5)) #define SPSR_EL1_AARCH32_RES0_BITS \ (GENMASK_ULL(63, 32) | GENMASK_ULL(22, 22) | GENMASK_ULL(20, 20)) diff --git a/arch/arm64/kernel/signal.c b/arch/arm64/kernel/signal.c index 339882db5a91..801d56cdf701 100644 --- a/arch/arm64/kernel/signal.c +++ b/arch/arm64/kernel/signal.c @@ -732,6 +732,22 @@ static void setup_return(struct pt_regs *regs, struct k_sigaction *ka, regs->regs[29] = (unsigned long)&user->next_frame->fp; regs->pc = (unsigned long)ka->sa.sa_handler; + /* + * Signal 
delivery is a (wacky) indirect function call in + * userspace, so simulate the same setting of BTYPE as a BLR + * <register containing the signal handler entry point>. + * Signal delivery to a location in a PROT_BTI guarded page + * that is not a function entry point will now trigger a + * SIGILL in userspace. + * + * If the signal handler entry point is not in a PROT_BTI + * guarded page, this is harmless. + */ + if (system_supports_bti()) { + regs->pstate &= ~PSR_BTYPE_MASK; + regs->pstate |= PSR_BTYPE_C; + } + if (ka->sa.sa_flags & SA_RESTORER) sigtramp = ka->sa.sa_restorer; else diff --git a/arch/arm64/kernel/syscall.c b/arch/arm64/kernel/syscall.c index a12c0c88d345..5f5b868292f5 100644 --- a/arch/arm64/kernel/syscall.c +++ b/arch/arm64/kernel/syscall.c @@ -98,6 +98,24 @@ static void el0_svc_common(struct pt_regs *regs, int scno, int sc_nr, regs->orig_x0 = regs->regs[0]; regs->syscallno = scno; + /* + * BTI note: + * The architecture does not guarantee that SPSR.BTYPE is zero + * on taking an SVC, so we could return to userspace with a + * non-zero BTYPE after the syscall. + * + * This shouldn't matter except when userspace is explicitly + * doing something stupid, such as setting PROT_BTI on a page + * that lacks conforming BTI/PACIxSP instructions, falling + * through from one executable page to another with differing + * PROT_BTI, or messing with BTYPE via ptrace: in such cases, + * userspace should not be surprised if a SIGILL occurs on + * syscall return. + * + * So, don't touch regs->pstate & PSR_BTYPE_MASK here. + * (Similarly for HVC and SMC elsewhere.) + */ + cortex_a76_erratum_1463225_svc_handler(); local_daif_restore(DAIF_PROCCTX); user_exit(); diff --git a/arch/arm64/kernel/traps.c b/arch/arm64/kernel/traps.c index cf402be5c573..b8c714dda851 100644 --- a/arch/arm64/kernel/traps.c +++ b/arch/arm64/kernel/traps.c @@ -411,6 +411,13 @@ void do_undefinstr(struct pt_regs *regs) } NOKPROBE_SYMBOL(do_undefinstr); +void do_bti(struct pt_regs *regs) +{ + BUG_ON(!user_mode(regs)); + force_signal_inject(SIGILL, ILL_ILLOPC, regs->pc); +} +NOKPROBE_SYMBOL(do_bti); + #define __user_cache_maint(insn, address, res) \ if (address >= user_addr_max()) { \ res = -EFAULT; \ @@ -753,6 +760,7 @@ static const char *esr_class_str[] = { [ESR_ELx_EC_CP10_ID] = "CP10 MRC/VMRS", [ESR_ELx_EC_PAC] = "PAC", [ESR_ELx_EC_CP14_64] = "CP14 MCRR/MRRC", + [ESR_ELx_EC_BTI] = "BTI", [ESR_ELx_EC_ILL] = "PSTATE.IL", [ESR_ELx_EC_SVC32] = "SVC (AArch32)", [ESR_ELx_EC_HVC32] = "HVC (AArch32)", diff --git a/include/linux/mm.h b/include/linux/mm.h index 52269e56c514..9e5fce1b2099 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -324,6 +324,9 @@ extern unsigned int kobjsize(const void *objp); #elif defined(CONFIG_SPARC64) # define VM_SPARC_ADI VM_ARCH_1 /* Uses ADI tag for access control */ # define VM_ARCH_CLEAR VM_SPARC_ADI +#elif defined(CONFIG_ARM64) +# define VM_ARM64_BTI VM_ARCH_1 /* BTI guarded page, a.k.a. GP bit */ +# define VM_ARCH_CLEAR VM_ARM64_BTI #elif !defined(CONFIG_MMU) # define VM_MAPPED_COPY VM_ARCH_1 /* T if mapped copy of data (nommu mmap) */ #endif -- cgit v1.2.3 From fe0f67660ee9c99408be5261ae045f8b41953b05 Mon Sep 17 00:00:00 2001 From: Dave Martin Date: Mon, 16 Mar 2020 16:50:46 +0000 Subject: elf: Allow arch to tweak initial mmap prot flags An arch may want to tweak the mmap prot flags for an ELF executable's initial mappings. For example, arm64 is going to need to add PROT_BTI for executable pages in an ELF process whose executable is marked as using Branch Target Identification (an ARMv8.5-A control flow integrity feature).
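For illustration, the kind of arch backend this enables might look roughly as follows (a sketch only, not part of this patch; the ARM64_ELF_BTI state flag stands in for however the arch records the ELF property):

	int arch_elf_adjust_prot(int prot, const struct arch_elf_state *state,
				 bool has_interp, bool is_interp)
	{
		/* Only executable mappings of a BTI-marked binary get PROT_BTI. */
		if ((prot & PROT_EXEC) && (state->flags & ARM64_ELF_BTI))
			prot |= PROT_BTI;
		return prot;
	}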
So that this can be done in a generic way, add a hook arch_elf_adjust_prot() to modify the prot flags as desired: arches can select CONFIG_ARCH_HAVE_ELF_PROT and implement their own backend where necessary. By default, leave the prot flags unchanged. Signed-off-by: Mark Brown Signed-off-by: Dave Martin Reviewed-by: Catalin Marinas Reviewed-by: Kees Cook Signed-off-by: Catalin Marinas --- fs/Kconfig.binfmt | 3 +++ fs/binfmt_elf.c | 18 ++++++++++++------ include/linux/elf.h | 12 ++++++++++++ 3 files changed, 27 insertions(+), 6 deletions(-) (limited to 'include/linux') diff --git a/fs/Kconfig.binfmt b/fs/Kconfig.binfmt index d2cfe0729a73..2358368319b8 100644 --- a/fs/Kconfig.binfmt +++ b/fs/Kconfig.binfmt @@ -36,6 +36,9 @@ config COMPAT_BINFMT_ELF config ARCH_BINFMT_ELF_STATE bool +config ARCH_HAVE_ELF_PROT + bool + config ARCH_USE_GNU_PROPERTY bool diff --git a/fs/binfmt_elf.c b/fs/binfmt_elf.c index 1fb67e506b68..cceb29d6ef1d 100644 --- a/fs/binfmt_elf.c +++ b/fs/binfmt_elf.c @@ -544,7 +544,8 @@ static inline int arch_check_elf(struct elfhdr *ehdr, bool has_interp, #endif /* !CONFIG_ARCH_BINFMT_ELF_STATE */ -static inline int make_prot(u32 p_flags) +static inline int make_prot(u32 p_flags, struct arch_elf_state *arch_state, + bool has_interp, bool is_interp) { int prot = 0; @@ -554,7 +555,8 @@ static inline int make_prot(u32 p_flags) prot |= PROT_WRITE; if (p_flags & PF_X) prot |= PROT_EXEC; - return prot; + + return arch_elf_adjust_prot(prot, arch_state, has_interp, is_interp); } /* This is much more generalized than the library routine read function, @@ -564,7 +566,8 @@ static inline int make_prot(u32 p_flags) static unsigned long load_elf_interp(struct elfhdr *interp_elf_ex, struct file *interpreter, - unsigned long no_base, struct elf_phdr *interp_elf_phdata) + unsigned long no_base, struct elf_phdr *interp_elf_phdata, + struct arch_elf_state *arch_state) { struct elf_phdr *eppnt; unsigned long load_addr = 0; @@ -596,7 +599,8 @@ static unsigned long load_elf_interp(struct elfhdr *interp_elf_ex, for (i = 0; i < interp_elf_ex->e_phnum; i++, eppnt++) { if (eppnt->p_type == PT_LOAD) { int elf_type = MAP_PRIVATE | MAP_DENYWRITE; - int elf_prot = make_prot(eppnt->p_flags); + int elf_prot = make_prot(eppnt->p_flags, arch_state, + true, true); unsigned long vaddr = 0; unsigned long k, map_addr; @@ -1041,7 +1045,8 @@ out_free_interp: } } - elf_prot = make_prot(elf_ppnt->p_flags); + elf_prot = make_prot(elf_ppnt->p_flags, &arch_state, + !!interpreter, false); elf_flags = MAP_PRIVATE | MAP_DENYWRITE | MAP_EXECUTABLE; @@ -1184,7 +1189,8 @@ out_free_interp: if (interpreter) { elf_entry = load_elf_interp(&loc->interp_elf_ex, interpreter, - load_bias, interp_elf_phdata); + load_bias, interp_elf_phdata, + &arch_state); if (!IS_ERR((void *)elf_entry)) { /* * load_elf_interp() returns relocation diff --git a/include/linux/elf.h b/include/linux/elf.h index db5113479f5e..5d5b0321da0b 100644 --- a/include/linux/elf.h +++ b/include/linux/elf.h @@ -87,4 +87,16 @@ extern int arch_parse_elf_property(u32 type, const void *data, size_t datasz, bool compat, struct arch_elf_state *arch); #endif +#ifdef CONFIG_ARCH_HAVE_ELF_PROT +int arch_elf_adjust_prot(int prot, const struct arch_elf_state *state, + bool has_interp, bool is_interp); +#else +static inline int arch_elf_adjust_prot(int prot, + const struct arch_elf_state *state, + bool has_interp, bool is_interp) +{ + return prot; +} +#endif + #endif /* _LINUX_ELF_H */ -- cgit v1.2.3 From 36e4d4dd4fc4f1d99e7966a460a2b12ce438abc2 Mon Sep 17 00:00:00 2001 From: Marco Elver
Date: Tue, 21 Jan 2020 17:05:08 +0100 Subject: include/linux: Add instrumented.h infrastructure This adds instrumented.h, which provides generic wrappers for memory access instrumentation that the compiler cannot emit for various sanitizers. Currently this unifies KASAN and KCSAN instrumentation. In future this will also include KMSAN instrumentation. Note that copy_{to,from}_user should use special instrumentation, since we should be able to instrument both source and destination memory accesses if both are kernel memory. The current patch only instruments the memory access where the address is always in kernel space; however, both may in fact be kernel addresses when a compat syscall passes an argument allocated in the kernel to a real syscall. In a future change, both KASAN and KCSAN should check both addresses in such cases, and KMSAN will make use of both addresses. [It made more sense to provide the completed function signature, rather than updating it and changing all locations again at a later time.] Suggested-by: Arnd Bergmann Signed-off-by: Marco Elver Acked-by: Alexander Potapenko Reviewed-by: Dmitry Vyukov Signed-off-by: Paul E. McKenney Signed-off-by: Ingo Molnar --- include/linux/instrumented.h | 109 +++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 109 insertions(+) create mode 100644 include/linux/instrumented.h (limited to 'include/linux') diff --git a/include/linux/instrumented.h b/include/linux/instrumented.h new file mode 100644 index 000000000000..43e6ea591975 --- /dev/null +++ b/include/linux/instrumented.h @@ -0,0 +1,109 @@ +/* SPDX-License-Identifier: GPL-2.0 */ + +/* + * This header provides generic wrappers for memory access instrumentation that + * the compiler cannot emit for: KASAN, KCSAN. + */ +#ifndef _LINUX_INSTRUMENTED_H +#define _LINUX_INSTRUMENTED_H + +#include <linux/compiler_types.h> +#include <linux/kasan-checks.h> +#include <linux/kcsan-checks.h> +#include <linux/types.h> + +/** + * instrument_read - instrument regular read access + * + * Instrument a regular read access. The instrumentation should be inserted + * before the actual read happens. + * + * @ptr address of access + * @size size of access + */ +static __always_inline void instrument_read(const volatile void *v, size_t size) +{ + kasan_check_read(v, size); + kcsan_check_read(v, size); +} + +/** + * instrument_write - instrument regular write access + * + * Instrument a regular write access. The instrumentation should be inserted + * before the actual write happens. + * + * @ptr address of access + * @size size of access + */ +static __always_inline void instrument_write(const volatile void *v, size_t size) +{ + kasan_check_write(v, size); + kcsan_check_write(v, size); +} + /** + * instrument_atomic_read - instrument atomic read access + * + * Instrument an atomic read access. The instrumentation should be inserted + * before the actual read happens. + * + * @ptr address of access + * @size size of access + */ +static __always_inline void instrument_atomic_read(const volatile void *v, size_t size) +{ + kasan_check_read(v, size); + kcsan_check_atomic_read(v, size); +} + +/** + * instrument_atomic_write - instrument atomic write access + * + * Instrument an atomic write access. The instrumentation should be inserted + * before the actual write happens.
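+ * Note that KASAN still checks the access as a regular write; only + * KCSAN treats it as atomic.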
+ * + * @ptr address of access + * @size size of access + */ +static __always_inline void instrument_atomic_write(const volatile void *v, size_t size) +{ + kasan_check_write(v, size); + kcsan_check_atomic_write(v, size); +} + +/** + * instrument_copy_to_user - instrument reads of copy_to_user + * + * Instrument reads from kernel memory, that are due to copy_to_user (and + * variants). The instrumentation must be inserted before the accesses. + * + * @to destination address + * @from source address + * @n number of bytes to copy + */ +static __always_inline void +instrument_copy_to_user(void __user *to, const void *from, unsigned long n) +{ + kasan_check_read(from, n); + kcsan_check_read(from, n); +} + +/** + * instrument_copy_from_user - instrument writes of copy_from_user + * + * Instrument writes to kernel memory, that are due to copy_from_user (and + * variants). The instrumentation should be inserted before the accesses. + * + * @to destination address + * @from source address + * @n number of bytes to copy + */ +static __always_inline void +instrument_copy_from_user(const void *to, const void __user *from, unsigned long n) +{ + kasan_check_write(to, n); + kcsan_check_write(to, n); +} + +#endif /* _LINUX_INSTRUMENTED_H */ -- cgit v1.2.3 From 76d6f06c36a3b5cc402eeeb709613cb211fdaa8f Mon Sep 17 00:00:00 2001 From: Marco Elver Date: Tue, 21 Jan 2020 17:05:12 +0100 Subject: copy_to_user, copy_from_user: Use generic instrumented.h This replaces the KASAN instrumentation with generic instrumentation, implicitly adding KCSAN instrumentation support. For KASAN no functional change is intended. Suggested-by: Arnd Bergmann Signed-off-by: Marco Elver Signed-off-by: Paul E. McKenney Signed-off-by: Ingo Molnar --- include/linux/uaccess.h | 14 +++++++------- lib/usercopy.c | 7 ++++--- 2 files changed, 11 insertions(+), 10 deletions(-) (limited to 'include/linux') diff --git a/include/linux/uaccess.h b/include/linux/uaccess.h index 67f016010aad..8a215c5c1aed 100644 --- a/include/linux/uaccess.h +++ b/include/linux/uaccess.h @@ -2,9 +2,9 @@ #ifndef __LINUX_UACCESS_H__ #define __LINUX_UACCESS_H__ +#include #include #include -#include #define uaccess_kernel() segment_eq(get_fs(), KERNEL_DS) @@ -58,7 +58,7 @@ static __always_inline __must_check unsigned long __copy_from_user_inatomic(void *to, const void __user *from, unsigned long n) { - kasan_check_write(to, n); + instrument_copy_from_user(to, from, n); check_object_size(to, n, false); return raw_copy_from_user(to, from, n); } @@ -67,7 +67,7 @@ static __always_inline __must_check unsigned long __copy_from_user(void *to, const void __user *from, unsigned long n) { might_fault(); - kasan_check_write(to, n); + instrument_copy_from_user(to, from, n); check_object_size(to, n, false); return raw_copy_from_user(to, from, n); } @@ -88,7 +88,7 @@ __copy_from_user(void *to, const void __user *from, unsigned long n) static __always_inline __must_check unsigned long __copy_to_user_inatomic(void __user *to, const void *from, unsigned long n) { - kasan_check_read(from, n); + instrument_copy_to_user(to, from, n); check_object_size(from, n, true); return raw_copy_to_user(to, from, n); } @@ -97,7 +97,7 @@ static __always_inline __must_check unsigned long __copy_to_user(void __user *to, const void *from, unsigned long n) { might_fault(); - kasan_check_read(from, n); + instrument_copy_to_user(to, from, n); check_object_size(from, n, true); return raw_copy_to_user(to, from, n); } @@ -109,7 +109,7 @@ _copy_from_user(void *to, const void __user *from, unsigned long n) 
unsigned long res = n; might_fault(); if (likely(access_ok(from, n))) { - kasan_check_write(to, n); + instrument_copy_from_user(to, from, n); res = raw_copy_from_user(to, from, n); } if (unlikely(res)) @@ -127,7 +127,7 @@ _copy_to_user(void __user *to, const void *from, unsigned long n) { might_fault(); if (access_ok(to, n)) { - kasan_check_read(from, n); + instrument_copy_to_user(to, from, n); n = raw_copy_to_user(to, from, n); } return n; diff --git a/lib/usercopy.c b/lib/usercopy.c index cbb4d9ec00f2..4bb1c5e7a3eb 100644 --- a/lib/usercopy.c +++ b/lib/usercopy.c @@ -1,6 +1,7 @@ // SPDX-License-Identifier: GPL-2.0 -#include #include +#include +#include /* out-of-line parts */ @@ -10,7 +11,7 @@ unsigned long _copy_from_user(void *to, const void __user *from, unsigned long n unsigned long res = n; might_fault(); if (likely(access_ok(from, n))) { - kasan_check_write(to, n); + instrument_copy_from_user(to, from, n); res = raw_copy_from_user(to, from, n); } if (unlikely(res)) @@ -25,7 +26,7 @@ unsigned long _copy_to_user(void __user *to, const void *from, unsigned long n) { might_fault(); if (likely(access_ok(to, n))) { - kasan_check_read(from, n); + instrument_copy_to_user(to, from, n); n = raw_copy_to_user(to, from, n); } return n; -- cgit v1.2.3 From 7ad900d35b49af5a05f595d2274c32e69e01b055 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Mon, 3 Feb 2020 14:42:18 -0800 Subject: kcsan: Add docbook header for data_race() Signed-off-by: Paul E. McKenney Signed-off-by: Ingo Molnar Cc: Marco Elver Cc: Dmitry Vyukov --- include/linux/compiler.h | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) (limited to 'include/linux') diff --git a/include/linux/compiler.h b/include/linux/compiler.h index 8c0beb10c1dd..c1bdf37571cb 100644 --- a/include/linux/compiler.h +++ b/include/linux/compiler.h @@ -315,13 +315,15 @@ unsigned long read_word_at_a_time(const void *addr) #include -/* - * data_race(): macro to document that accesses in an expression may conflict with - * other concurrent accesses resulting in data races, but the resulting - * behaviour is deemed safe regardless. +/** + * data_race - mark an expression as containing intentional data races + * + * This data_race() macro is useful for situations in which data races + * should be forgiven. One example is diagnostic code that accesses + * shared variables but is not a part of the core synchronization design. * - * This macro *does not* affect normal code generation, but is a hint to tooling - * that data races here should be ignored. + * This macro *does not* affect normal code generation, but is a hint + * to tooling that data races here are to be ignored. */ #define data_race(expr) \ ({ \ -- cgit v1.2.3 From d591ec3db75f9eadfa7976ff8796c674c0027715 Mon Sep 17 00:00:00 2001 From: Marco Elver Date: Thu, 6 Feb 2020 16:46:24 +0100 Subject: kcsan: Introduce KCSAN_ACCESS_ASSERT access type The KCSAN_ACCESS_ASSERT access type may be used to introduce dummy reads and writes to assert certain properties of concurrent code, where bugs could not be detected as normal data races. For example, a variable that is only meant to be written by a single CPU, but may be read (without locking) by other CPUs must still be marked properly to avoid data races. However, concurrent writes, regardless if WRITE_ONCE() or not, would be a bug. Using kcsan_check_access(&x, sizeof(x), KCSAN_ACCESS_ASSERT) would allow catching such bugs. 
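As an illustration (a sketch only; my_cpu_var and new_val are made-up names, not part of this patch), the owning CPU could precede its update with such a dummy access:

	/* Writer side: only this CPU is expected to write my_cpu_var. */
	kcsan_check_access(&my_cpu_var, sizeof(my_cpu_var), KCSAN_ACCESS_ASSERT);
	WRITE_ONCE(my_cpu_var, new_val);

A WRITE_ONCE() from any other CPU then conflicts with the dummy ASSERT access and is reported, even though two marked writes on their own would not constitute a data race.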
To support KCSAN_ACCESS_ASSERT the following notable changes were made: * If an access is of type KCSAN_ACCESS_ASSERT, disable various filters that only apply to data races, so that all races that KCSAN observes are reported. * Bug reports that involve an ASSERT access type will be reported as "KCSAN: assert: race in ..." instead of "data-race"; this will make them easier to distinguish. * Update a few comments to just mention 'races' where we do not always mean pure data races. Signed-off-by: Marco Elver Signed-off-by: Paul E. McKenney Signed-off-by: Ingo Molnar --- include/linux/kcsan-checks.h | 18 ++++++++++++------ kernel/kcsan/core.c | 44 ++++++++++++++++++++++++++++++++++++++------ kernel/kcsan/debugfs.c | 1 + kernel/kcsan/kcsan.h | 7 +++++++ kernel/kcsan/report.c | 43 +++++++++++++++++++++++++++------------ lib/Kconfig.kcsan | 24 ++++++++++++++---------- 6 files changed, 103 insertions(+), 34 deletions(-) (limited to 'include/linux') diff --git a/include/linux/kcsan-checks.h b/include/linux/kcsan-checks.h index ef3ee233a3fa..5dcadc221026 100644 --- a/include/linux/kcsan-checks.h +++ b/include/linux/kcsan-checks.h @@ -6,10 +6,16 @@ #include <linux/types.h> /* - * Access type modifiers. + * ACCESS TYPE MODIFIERS + * + * <none>: normal read access; + * WRITE : write access; + * ATOMIC: access is atomic; + * ASSERT: access is not a regular access, but an assertion; */ #define KCSAN_ACCESS_WRITE 0x1 #define KCSAN_ACCESS_ATOMIC 0x2 +#define KCSAN_ACCESS_ASSERT 0x4 /* * __kcsan_*: Always calls into the runtime when KCSAN is enabled. This may be used @@ -18,7 +24,7 @@ */ #ifdef CONFIG_KCSAN /** - * __kcsan_check_access - check generic access for data races + * __kcsan_check_access - check generic access for races * * @ptr address of access * @size size of access @@ -43,7 +49,7 @@ static inline void kcsan_check_access(const volatile void *ptr, size_t size, #endif /** - * __kcsan_check_read - check regular read access for data races + * __kcsan_check_read - check regular read access for races * * @ptr address of access * @size size of access @@ -51,7 +57,7 @@ static inline void kcsan_check_access(const volatile void *ptr, size_t size, #define __kcsan_check_read(ptr, size) __kcsan_check_access(ptr, size, 0) /** - * __kcsan_check_write - check regular write access for data races + * __kcsan_check_write - check regular write access for races * * @ptr address of access * @size size of access @@ -60,7 +66,7 @@ static inline void kcsan_check_access(const volatile void *ptr, size_t size, __kcsan_check_access(ptr, size, KCSAN_ACCESS_WRITE) /** - * kcsan_check_read - check regular read access for data races + * kcsan_check_read - check regular read access for races * * @ptr address of access * @size size of access @@ -68,7 +74,7 @@ static inline void kcsan_check_access(const volatile void *ptr, size_t size, #define kcsan_check_read(ptr, size) kcsan_check_access(ptr, size, 0) /** - * kcsan_check_write - check regular write access for data races + * kcsan_check_write - check regular write access for races * * @ptr address of access * @size size of access diff --git a/kernel/kcsan/core.c b/kernel/kcsan/core.c index 82c2bef827d4..87ef01e40199 100644 --- a/kernel/kcsan/core.c +++ b/kernel/kcsan/core.c @@ -56,7 +56,7 @@ static DEFINE_PER_CPU(struct kcsan_ctx, kcsan_cpu_ctx) = { /* * SLOT_IDX_FAST is used in the fast-path. Not first checking the address's primary - slot (middle) is fine if we assume that data races occur rarely. The set of + slot (middle) is fine if we assume that races occur rarely.
The set of * indices {SLOT_IDX(slot, i) | i in [0, NUM_SLOTS)} is equivalent to * {SLOT_IDX_FAST(slot, i) | i in [0, NUM_SLOTS)}. */ @@ -178,6 +178,14 @@ is_atomic(const volatile void *ptr, size_t size, int type) if ((type & KCSAN_ACCESS_ATOMIC) != 0) return true; + /* + * Unless explicitly declared atomic, never consider an assertion access + * as atomic. This allows using them also in atomic regions, such as + * seqlocks, without implicitly changing their semantics. + */ + if ((type & KCSAN_ACCESS_ASSERT) != 0) + return false; + if (IS_ENABLED(CONFIG_KCSAN_ASSUME_PLAIN_WRITES_ATOMIC) && (type & KCSAN_ACCESS_WRITE) != 0 && size <= sizeof(long) && IS_ALIGNED((unsigned long)ptr, size)) @@ -298,7 +306,11 @@ static noinline void kcsan_found_watchpoint(const volatile void *ptr, */ kcsan_counter_inc(KCSAN_COUNTER_REPORT_RACES); } - kcsan_counter_inc(KCSAN_COUNTER_DATA_RACES); + + if ((type & KCSAN_ACCESS_ASSERT) != 0) + kcsan_counter_inc(KCSAN_COUNTER_ASSERT_FAILURES); + else + kcsan_counter_inc(KCSAN_COUNTER_DATA_RACES); user_access_restore(flags); } @@ -307,6 +319,7 @@ static noinline void kcsan_setup_watchpoint(const volatile void *ptr, size_t size, int type) { const bool is_write = (type & KCSAN_ACCESS_WRITE) != 0; + const bool is_assert = (type & KCSAN_ACCESS_ASSERT) != 0; atomic_long_t *watchpoint; union { u8 _1; @@ -429,13 +442,32 @@ kcsan_setup_watchpoint(const volatile void *ptr, size_t size, int type) /* * No need to increment 'data_races' counter, as the racing * thread already did. + * + * Count 'assert_failures' for each failed ASSERT access, + * therefore both this thread and the racing thread may + * increment this counter. */ - kcsan_report(ptr, size, type, size > 8 || value_change, - smp_processor_id(), KCSAN_REPORT_RACE_SIGNAL); + if (is_assert) + kcsan_counter_inc(KCSAN_COUNTER_ASSERT_FAILURES); + + /* + * - If we were not able to observe a value change due to size + * constraints, always assume a value change. + * - If the access type is an assertion, we also always assume a + * value change to always report the race. + */ + value_change = value_change || size > 8 || is_assert; + + kcsan_report(ptr, size, type, value_change, smp_processor_id(), + KCSAN_REPORT_RACE_SIGNAL); } else if (value_change) { /* Inferring a race, since the value should not have changed. */ + kcsan_counter_inc(KCSAN_COUNTER_RACES_UNKNOWN_ORIGIN); - if (IS_ENABLED(CONFIG_KCSAN_REPORT_RACE_UNKNOWN_ORIGIN)) + if (is_assert) + kcsan_counter_inc(KCSAN_COUNTER_ASSERT_FAILURES); + + if (IS_ENABLED(CONFIG_KCSAN_REPORT_RACE_UNKNOWN_ORIGIN) || is_assert) kcsan_report(ptr, size, type, true, smp_processor_id(), KCSAN_REPORT_RACE_UNKNOWN_ORIGIN); @@ -471,7 +503,7 @@ static __always_inline void check_access(const volatile void *ptr, size_t size, &encoded_watchpoint); /* * It is safe to check kcsan_is_enabled() after find_watchpoint in the - * slow-path, as long as no state changes that cause a data race to be + * slow-path, as long as no state changes that cause a race to be * detected and reported have occurred until kcsan_is_enabled() is * checked. 
*/ diff --git a/kernel/kcsan/debugfs.c b/kernel/kcsan/debugfs.c index bec42dab32ee..a9dad44130e6 100644 --- a/kernel/kcsan/debugfs.c +++ b/kernel/kcsan/debugfs.c @@ -44,6 +44,7 @@ static const char *counter_to_name(enum kcsan_counter_id id) case KCSAN_COUNTER_USED_WATCHPOINTS: return "used_watchpoints"; case KCSAN_COUNTER_SETUP_WATCHPOINTS: return "setup_watchpoints"; case KCSAN_COUNTER_DATA_RACES: return "data_races"; + case KCSAN_COUNTER_ASSERT_FAILURES: return "assert_failures"; case KCSAN_COUNTER_NO_CAPACITY: return "no_capacity"; case KCSAN_COUNTER_REPORT_RACES: return "report_races"; case KCSAN_COUNTER_RACES_UNKNOWN_ORIGIN: return "races_unknown_origin"; diff --git a/kernel/kcsan/kcsan.h b/kernel/kcsan/kcsan.h index 8492da45494b..50078e7d43c3 100644 --- a/kernel/kcsan/kcsan.h +++ b/kernel/kcsan/kcsan.h @@ -39,6 +39,13 @@ enum kcsan_counter_id { */ KCSAN_COUNTER_DATA_RACES, + /* + * Total number of ASSERT failures due to races. If the observed race is + * due to two conflicting ASSERT type accesses, then both will be + * counted. + */ + KCSAN_COUNTER_ASSERT_FAILURES, + /* * Number of times no watchpoints were available. */ diff --git a/kernel/kcsan/report.c b/kernel/kcsan/report.c index 7cd34285df74..3bc590e6be7e 100644 --- a/kernel/kcsan/report.c +++ b/kernel/kcsan/report.c @@ -34,11 +34,11 @@ static struct { } other_info = { .ptr = NULL }; /* - * Information about reported data races; used to rate limit reporting. + * Information about reported races; used to rate limit reporting. */ struct report_time { /* - * The last time the data race was reported. + * The last time the race was reported. */ unsigned long time; @@ -57,7 +57,7 @@ struct report_time { * * Therefore, we use a fixed-size array, which at most will occupy a page. This * still adequately rate limits reports, assuming that a) number of unique data - * races is not excessive, and b) occurrence of unique data races within the + * races is not excessive, and b) occurrence of unique races within the * same time window is limited. */ #define REPORT_TIMES_MAX (PAGE_SIZE / sizeof(struct report_time)) @@ -74,7 +74,7 @@ static struct report_time report_times[REPORT_TIMES_SIZE]; static DEFINE_SPINLOCK(report_lock); /* - * Checks if the data race identified by thread frames frame1 and frame2 has + * Checks if the race identified by thread frames frame1 and frame2 has * been reported since (now - KCSAN_REPORT_ONCE_IN_MS). */ static bool rate_limit_report(unsigned long frame1, unsigned long frame2) @@ -90,7 +90,7 @@ static bool rate_limit_report(unsigned long frame1, unsigned long frame2) invalid_before = jiffies - msecs_to_jiffies(CONFIG_KCSAN_REPORT_ONCE_IN_MS); - /* Check if a matching data race report exists. */ + /* Check if a matching race report exists. */ for (i = 0; i < REPORT_TIMES_SIZE; ++i) { struct report_time *rt = &report_times[i]; @@ -114,7 +114,7 @@ static bool rate_limit_report(unsigned long frame1, unsigned long frame2) if (time_before(rt->time, invalid_before)) continue; /* before KCSAN_REPORT_ONCE_IN_MS ago */ - /* Reported recently, check if data race matches. */ + /* Reported recently, check if race matches. */ if ((rt->frame1 == frame1 && rt->frame2 == frame2) || (rt->frame1 == frame2 && rt->frame2 == frame1)) return true; @@ -142,11 +142,12 @@ skip_report(bool value_change, unsigned long top_frame) * 3. write watchpoint, conflicting write (value_change==true): report; * 4. write watchpoint, conflicting write (value_change==false): skip; * 5. 
write watchpoint, conflicting read (value_change==false): skip; - * 6. write watchpoint, conflicting read (value_change==true): impossible; + * 6. write watchpoint, conflicting read (value_change==true): report; * * Cases 1-4 are intuitive and expected; case 5 ensures we do not report - * data races where the write may have rewritten the same value; and - * case 6 is simply impossible. + * data races where the write may have rewritten the same value; case 6 + * is possible either if the size is larger than what we check value + * changes for or the access type is KCSAN_ACCESS_ASSERT. */ if (IS_ENABLED(CONFIG_KCSAN_REPORT_VALUE_CHANGE_ONLY) && !value_change) { /* @@ -178,11 +179,27 @@ static const char *get_access_type(int type) return "write"; case KCSAN_ACCESS_WRITE | KCSAN_ACCESS_ATOMIC: return "write (marked)"; + + /* + * ASSERT variants: + */ + case KCSAN_ACCESS_ASSERT: + case KCSAN_ACCESS_ASSERT | KCSAN_ACCESS_ATOMIC: + return "assert no writes"; + case KCSAN_ACCESS_ASSERT | KCSAN_ACCESS_WRITE: + case KCSAN_ACCESS_ASSERT | KCSAN_ACCESS_WRITE | KCSAN_ACCESS_ATOMIC: + return "assert no accesses"; + default: BUG(); } } +static const char *get_bug_type(int type) +{ + return (type & KCSAN_ACCESS_ASSERT) != 0 ? "assert: race" : "data-race"; +} + /* Return thread description: in task or interrupt. */ static const char *get_thread_desc(int task_id) { @@ -268,13 +285,15 @@ static bool print_report(const volatile void *ptr, size_t size, int access_type, * Do not print offset of functions to keep title short. */ cmp = sym_strcmp((void *)other_frame, (void *)this_frame); - pr_err("BUG: KCSAN: data-race in %ps / %ps\n", + pr_err("BUG: KCSAN: %s in %ps / %ps\n", + get_bug_type(access_type | other_info.access_type), (void *)(cmp < 0 ? other_frame : this_frame), (void *)(cmp < 0 ? this_frame : other_frame)); } break; case KCSAN_REPORT_RACE_UNKNOWN_ORIGIN: - pr_err("BUG: KCSAN: data-race in %pS\n", (void *)this_frame); + pr_err("BUG: KCSAN: %s in %pS\n", get_bug_type(access_type), + (void *)this_frame); break; default: @@ -427,7 +446,7 @@ void kcsan_report(const volatile void *ptr, size_t size, int access_type, /* * With TRACE_IRQFLAGS, lockdep's IRQ trace state becomes corrupted if * we do not turn off lockdep here; this could happen due to recursion - * into lockdep via KCSAN if we detect a data race in utilities used by + * into lockdep via KCSAN if we detect a race in utilities used by * lockdep. */ lockdep_off(); diff --git a/lib/Kconfig.kcsan b/lib/Kconfig.kcsan index 9785bbf9a1d1..f0b791143c6a 100644 --- a/lib/Kconfig.kcsan +++ b/lib/Kconfig.kcsan @@ -4,13 +4,17 @@ config HAVE_ARCH_KCSAN bool menuconfig KCSAN - bool "KCSAN: dynamic data race detector" + bool "KCSAN: dynamic race detector" depends on HAVE_ARCH_KCSAN && DEBUG_KERNEL && !KASAN select STACKTRACE help - The Kernel Concurrency Sanitizer (KCSAN) is a dynamic data race - detector, which relies on compile-time instrumentation, and uses a - watchpoint-based sampling approach to detect data races. + The Kernel Concurrency Sanitizer (KCSAN) is a dynamic race detector, + which relies on compile-time instrumentation, and uses a + watchpoint-based sampling approach to detect races. + + KCSAN's primary purpose is to detect data races. KCSAN can also be + used to check properties, with the help of provided assertions, of + concurrent code where bugs do not manifest as data races. See for more details. @@ -85,14 +89,14 @@ config KCSAN_SKIP_WATCH_RANDOMIZE KCSAN_WATCH_SKIP. 
config KCSAN_REPORT_ONCE_IN_MS - int "Duration in milliseconds, in which any given data race is only reported once" + int "Duration in milliseconds, in which any given race is only reported once" default 3000 help - Any given data race is only reported once in the defined time window. - Different data races may still generate reports within a duration - that is smaller than the duration defined here. This allows rate - limiting reporting to avoid flooding the console with reports. - Setting this to 0 disables rate limiting. + Any given race is only reported once in the defined time window. + Different races may still generate reports within a duration that is + smaller than the duration defined here. This allows rate limiting + reporting to avoid flooding the console with reports. Setting this + to 0 disables rate limiting. # The main purpose of the below options is to control reported data races (e.g. # in fuzzer configs), and are not expected to be switched frequently by other -- cgit v1.2.3 From f97f713dc25714ac13f3b5abdeffce2da3f6fe70 Mon Sep 17 00:00:00 2001 From: Marco Elver Date: Thu, 6 Feb 2020 16:46:25 +0100 Subject: kcsan: Introduce ASSERT_EXCLUSIVE_*() macros Introduces ASSERT_EXCLUSIVE_WRITER() and ASSERT_EXCLUSIVE_ACCESS(), which may be used to assert properties of synchronization logic, where violation cannot be detected as a normal data race. Examples of the reports that may be generated: ================================================================== BUG: KCSAN: assert: race in test_thread / test_thread write to 0xffffffffab3d1540 of 8 bytes by task 466 on cpu 2: test_thread+0x8d/0x111 debugfs_write.cold+0x32/0x44 ... assert no writes to 0xffffffffab3d1540 of 8 bytes by task 464 on cpu 0: test_thread+0xa3/0x111 debugfs_write.cold+0x32/0x44 ... ================================================================== ================================================================== BUG: KCSAN: assert: race in test_thread / test_thread assert no accesses to 0xffffffffab3d1540 of 8 bytes by task 465 on cpu 1: test_thread+0xb9/0x111 debugfs_write.cold+0x32/0x44 ... read to 0xffffffffab3d1540 of 8 bytes by task 464 on cpu 0: test_thread+0x77/0x111 debugfs_write.cold+0x32/0x44 ... ================================================================== Suggested-by: Paul E. McKenney Signed-off-by: Marco Elver Signed-off-by: Paul E. McKenney Signed-off-by: Ingo Molnar --- include/linux/kcsan-checks.h | 40 ++++++++++++++++++++++++++++++++++++++++ 1 file changed, 40 insertions(+) (limited to 'include/linux') diff --git a/include/linux/kcsan-checks.h b/include/linux/kcsan-checks.h index 5dcadc221026..cf6961794e9a 100644 --- a/include/linux/kcsan-checks.h +++ b/include/linux/kcsan-checks.h @@ -96,4 +96,44 @@ static inline void kcsan_check_access(const volatile void *ptr, size_t size, kcsan_check_access(ptr, size, KCSAN_ACCESS_ATOMIC | KCSAN_ACCESS_WRITE) #endif +/** + * ASSERT_EXCLUSIVE_WRITER - assert no other threads are writing @var + * + * Assert that there are no other threads writing @var; other readers are + * allowed. This assertion can be used to specify properties of concurrent code, + * where violation cannot be detected as a normal data race. + * + * For example, if a per-CPU variable is only meant to be written by a single + * CPU, but may be read from other CPUs; in this case, reads and writes must be + * marked properly, however, if an off-CPU WRITE_ONCE() races with the owning + * CPU's WRITE_ONCE(), would not constitute a data race but could be a harmful + * race condition. 
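+ * (For example, one of the two updates could silently be lost.)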
Using this macro allows specifying this property in the code + * and catching such bugs. + * + * @var variable to assert on + */ +#define ASSERT_EXCLUSIVE_WRITER(var) \ + __kcsan_check_access(&(var), sizeof(var), KCSAN_ACCESS_ASSERT) + +/** + * ASSERT_EXCLUSIVE_ACCESS - assert no other threads are accessing @var + * + * Assert that no other thread is accessing @var (no readers nor writers). This + * assertion can be used to specify properties of concurrent code, where + * violation cannot be detected as a normal data race. + * + * For example, in a reference-counting algorithm where exclusive access is + * expected after the refcount reaches 0. We can check that this property + * actually holds as follows: + * + * if (refcount_dec_and_test(&obj->refcnt)) { + * ASSERT_EXCLUSIVE_ACCESS(*obj); + * safely_dispose_of(obj); + * } + * + * @var variable to assert on + */ +#define ASSERT_EXCLUSIVE_ACCESS(var) \ + __kcsan_check_access(&(var), sizeof(var), KCSAN_ACCESS_WRITE | KCSAN_ACCESS_ASSERT) + #endif /* _LINUX_KCSAN_CHECKS_H */ -- cgit v1.2.3 From f0f6928c2c4c19ab6171d4f468f542fac1888a8f Mon Sep 17 00:00:00 2001 From: Marco Elver Date: Tue, 11 Feb 2020 17:04:19 +0100 Subject: kcsan: Move interfaces that affect checks to kcsan-checks.h This moves the functions that change state affecting the behaviour of kcsan_check_access() to kcsan-checks.h. Since these are likely used with kcsan_check_access() it makes more sense to have them in kcsan-checks.h, to avoid including all of 'include/linux/kcsan.h'. No functional change intended. Acked-by: John Hubbard Signed-off-by: Marco Elver Signed-off-by: Paul E. McKenney Signed-off-by: Ingo Molnar --- include/linux/kcsan-checks.h | 48 ++++++++++++++++++++++++++++++++++++++++++-- include/linux/kcsan.h | 41 ------------------------------------- 2 files changed, 46 insertions(+), 43 deletions(-) (limited to 'include/linux') diff --git a/include/linux/kcsan-checks.h b/include/linux/kcsan-checks.h index cf6961794e9a..8675411c8dbc 100644 --- a/include/linux/kcsan-checks.h +++ b/include/linux/kcsan-checks.h @@ -32,10 +32,54 @@ */ void __kcsan_check_access(const volatile void *ptr, size_t size, int type); -#else +/** + * kcsan_nestable_atomic_begin - begin nestable atomic region + * + * Accesses within the atomic region may appear to race with other accesses but + * should be considered atomic. + */ +void kcsan_nestable_atomic_begin(void); + +/** + * kcsan_nestable_atomic_end - end nestable atomic region + */ +void kcsan_nestable_atomic_end(void); + +/** + * kcsan_flat_atomic_begin - begin flat atomic region + * + * Accesses within the atomic region may appear to race with other accesses but + * should be considered atomic. + */ +void kcsan_flat_atomic_begin(void); + +/** + * kcsan_flat_atomic_end - end flat atomic region + */ +void kcsan_flat_atomic_end(void); + +/** + * kcsan_atomic_next - consider following accesses as atomic + * + * Force treating the next n memory accesses for the current context as atomic + * operations. + * + * @n number of following memory accesses to treat as atomic.
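+ * + * For example: + * + *	kcsan_atomic_next(1); + *	val = shared_var; + * + * treats the plain read of shared_var as if it were marked atomic + * (val and shared_var being made-up example names).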
+ */ +void kcsan_atomic_next(int n); + +#else /* CONFIG_KCSAN */ + static inline void __kcsan_check_access(const volatile void *ptr, size_t size, int type) { } + +static inline void kcsan_nestable_atomic_begin(void) { } +static inline void kcsan_nestable_atomic_end(void) { } +static inline void kcsan_flat_atomic_begin(void) { } +static inline void kcsan_flat_atomic_end(void) { } +static inline void kcsan_atomic_next(int n) { } + +#endif /* CONFIG_KCSAN */ /* * kcsan_*: Only calls into the runtime when the particular compilation unit has diff --git a/include/linux/kcsan.h b/include/linux/kcsan.h index 1019e3a2c689..7a614ca558f6 100644 --- a/include/linux/kcsan.h +++ b/include/linux/kcsan.h @@ -56,52 +56,11 @@ void kcsan_disable_current(void); */ void kcsan_enable_current(void); -/** - * kcsan_nestable_atomic_begin - begin nestable atomic region - * - * Accesses within the atomic region may appear to race with other accesses but - * should be considered atomic. - */ -void kcsan_nestable_atomic_begin(void); - -/** - * kcsan_nestable_atomic_end - end nestable atomic region - */ -void kcsan_nestable_atomic_end(void); - -/** - * kcsan_flat_atomic_begin - begin flat atomic region - * - * Accesses within the atomic region may appear to race with other accesses but - * should be considered atomic. - */ -void kcsan_flat_atomic_begin(void); - -/** - * kcsan_flat_atomic_end - end flat atomic region - */ -void kcsan_flat_atomic_end(void); - -/** - * kcsan_atomic_next - consider following accesses as atomic - * - * Force treating the next n memory accesses for the current context as atomic - * operations. - * - * @n number of following memory accesses to treat as atomic. - */ -void kcsan_atomic_next(int n); - #else /* CONFIG_KCSAN */ static inline void kcsan_init(void) { } static inline void kcsan_disable_current(void) { } static inline void kcsan_enable_current(void) { } -static inline void kcsan_nestable_atomic_begin(void) { } -static inline void kcsan_nestable_atomic_end(void) { } -static inline void kcsan_flat_atomic_begin(void) { } -static inline void kcsan_flat_atomic_end(void) { } -static inline void kcsan_atomic_next(int n) { } #endif /* CONFIG_KCSAN */ -- cgit v1.2.3 From b968a08f242d51982e46041c506115b5e11a7570 Mon Sep 17 00:00:00 2001 From: Marco Elver Date: Tue, 11 Feb 2020 17:04:20 +0100 Subject: compiler.h, seqlock.h: Remove unnecessary kcsan.h includes We no longer have to include kcsan.h, since the required KCSAN interfaces for both compiler.h and seqlock.h are now provided by kcsan-checks.h. Acked-by: John Hubbard Signed-off-by: Marco Elver Signed-off-by: Paul E.
McKenney Signed-off-by: Ingo Molnar --- include/linux/compiler.h | 2 -- include/linux/seqlock.h | 2 +- 2 files changed, 1 insertion(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/compiler.h b/include/linux/compiler.h index c1bdf37571cb..f504edebd5d7 100644 --- a/include/linux/compiler.h +++ b/include/linux/compiler.h @@ -313,8 +313,6 @@ unsigned long read_word_at_a_time(const void *addr) __u.__val; \ }) -#include - /** * data_race - mark an expression as containing intentional data races * diff --git a/include/linux/seqlock.h b/include/linux/seqlock.h index 239701cae376..8b97204f35a7 100644 --- a/include/linux/seqlock.h +++ b/include/linux/seqlock.h @@ -37,7 +37,7 @@ #include #include #include -#include +#include #include /* -- cgit v1.2.3 From 81af89e15862909881ff010a0adb67148487e88a Mon Sep 17 00:00:00 2001 From: Marco Elver Date: Tue, 11 Feb 2020 17:04:22 +0100 Subject: kcsan: Add kcsan_set_access_mask() support When setting up an access mask with kcsan_set_access_mask(), KCSAN will only report races if concurrent changes to bits set in access_mask are observed. Conveying access_mask via a separate call avoids introducing overhead in the common-case fast-path. Acked-by: John Hubbard Signed-off-by: Marco Elver Signed-off-by: Paul E. McKenney Signed-off-by: Ingo Molnar --- include/linux/kcsan-checks.h | 11 +++++++++++ include/linux/kcsan.h | 5 +++++ init/init_task.c | 1 + kernel/kcsan/core.c | 43 +++++++++++++++++++++++++++++++++++++++---- kernel/kcsan/kcsan.h | 5 +++++ kernel/kcsan/report.c | 13 ++++++++++++- 6 files changed, 73 insertions(+), 5 deletions(-) (limited to 'include/linux') diff --git a/include/linux/kcsan-checks.h b/include/linux/kcsan-checks.h index 8675411c8dbc..4ef5233ff3f0 100644 --- a/include/linux/kcsan-checks.h +++ b/include/linux/kcsan-checks.h @@ -68,6 +68,16 @@ void kcsan_flat_atomic_end(void); */ void kcsan_atomic_next(int n); +/** + * kcsan_set_access_mask - set access mask + * + * Set the access mask for all accesses for the current context if non-zero. + * Only value changes to bits set in the mask will be reported. + * + * @mask bitmask + */ +void kcsan_set_access_mask(unsigned long mask); + #else /* CONFIG_KCSAN */ static inline void __kcsan_check_access(const volatile void *ptr, size_t size, @@ -78,6 +88,7 @@ static inline void kcsan_nestable_atomic_end(void) { } static inline void kcsan_flat_atomic_begin(void) { } static inline void kcsan_flat_atomic_end(void) { } static inline void kcsan_atomic_next(int n) { } +static inline void kcsan_set_access_mask(unsigned long mask) { } #endif /* CONFIG_KCSAN */ diff --git a/include/linux/kcsan.h b/include/linux/kcsan.h index 7a614ca558f6..3b84606e1e67 100644 --- a/include/linux/kcsan.h +++ b/include/linux/kcsan.h @@ -35,6 +35,11 @@ struct kcsan_ctx { */ int atomic_nest_count; bool in_flat_atomic; + + /* + * Access mask for all accesses if non-zero. 
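+ * Only value changes to bits set in the mask will be reported; see + * kcsan_set_access_mask().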
+ */ + unsigned long access_mask; }; /** diff --git a/init/init_task.c b/init/init_task.c index 2b4fe98b0f09..096191d177d5 100644 --- a/init/init_task.c +++ b/init/init_task.c @@ -167,6 +167,7 @@ struct task_struct init_task .atomic_next = 0, .atomic_nest_count = 0, .in_flat_atomic = false, + .access_mask = 0, }, #endif #ifdef CONFIG_TRACE_IRQFLAGS diff --git a/kernel/kcsan/core.c b/kernel/kcsan/core.c index 3f89801161d3..589b1e7f0f25 100644 --- a/kernel/kcsan/core.c +++ b/kernel/kcsan/core.c @@ -39,6 +39,7 @@ static DEFINE_PER_CPU(struct kcsan_ctx, kcsan_cpu_ctx) = { .atomic_next = 0, .atomic_nest_count = 0, .in_flat_atomic = false, + .access_mask = 0, }; /* @@ -298,6 +299,15 @@ static noinline void kcsan_found_watchpoint(const volatile void *ptr, if (!kcsan_is_enabled()) return; + + /* + * The access_mask check relies on value-change comparison. To avoid + * reporting a race where e.g. the writer set up the watchpoint, but the + * reader has access_mask!=0, we have to ignore the found watchpoint. + */ + if (get_ctx()->access_mask != 0) + return; + /* * Consume the watchpoint as soon as possible, to minimize the chances * of !consumed. Consuming the watchpoint must always be guarded by @@ -341,6 +351,7 @@ kcsan_setup_watchpoint(const volatile void *ptr, size_t size, int type) u32 _4; u64 _8; } expect_value; + unsigned long access_mask; enum kcsan_value_change value_change = KCSAN_VALUE_CHANGE_MAYBE; unsigned long ua_flags = user_access_save(); unsigned long irq_flags; @@ -435,18 +446,27 @@ kcsan_setup_watchpoint(const volatile void *ptr, size_t size, int type) * Re-read value, and check if it is as expected; if not, we infer a * racy access. */ + access_mask = get_ctx()->access_mask; switch (size) { case 1: expect_value._1 ^= READ_ONCE(*(const u8 *)ptr); + if (access_mask) + expect_value._1 &= (u8)access_mask; break; case 2: expect_value._2 ^= READ_ONCE(*(const u16 *)ptr); + if (access_mask) + expect_value._2 &= (u16)access_mask; break; case 4: expect_value._4 ^= READ_ONCE(*(const u32 *)ptr); + if (access_mask) + expect_value._4 &= (u32)access_mask; break; case 8: expect_value._8 ^= READ_ONCE(*(const u64 *)ptr); + if (access_mask) + expect_value._8 &= (u64)access_mask; break; default: break; /* ignore; we do not diff the values */ @@ -460,11 +480,20 @@ kcsan_setup_watchpoint(const volatile void *ptr, size_t size, int type) if (!remove_watchpoint(watchpoint)) { /* * Depending on the access type, map a value_change of MAYBE to - * TRUE (require reporting). + * TRUE (always report) or FALSE (never report). */ - if (value_change == KCSAN_VALUE_CHANGE_MAYBE && (size > 8 || is_assert)) { - /* Always assume a value-change. */ - value_change = KCSAN_VALUE_CHANGE_TRUE; + if (value_change == KCSAN_VALUE_CHANGE_MAYBE) { + if (access_mask != 0) { + /* + * For access with access_mask, we require a + * value-change, as it is likely that races on + * ~access_mask bits are expected. + */ + value_change = KCSAN_VALUE_CHANGE_FALSE; + } else if (size > 8 || is_assert) { + /* Always assume a value-change. 
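+ * (Sizes above 8 bytes cannot be value-compared, and ASSERT + * accesses must always be reported.)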
*/ + value_change = KCSAN_VALUE_CHANGE_TRUE; + } } /* @@ -622,6 +651,12 @@ void kcsan_atomic_next(int n) } EXPORT_SYMBOL(kcsan_atomic_next); +void kcsan_set_access_mask(unsigned long mask) +{ + get_ctx()->access_mask = mask; +} +EXPORT_SYMBOL(kcsan_set_access_mask); + void __kcsan_check_access(const volatile void *ptr, size_t size, int type) { check_access(ptr, size, type); diff --git a/kernel/kcsan/kcsan.h b/kernel/kcsan/kcsan.h index 83a79b08b550..892de5120c1b 100644 --- a/kernel/kcsan/kcsan.h +++ b/kernel/kcsan/kcsan.h @@ -98,6 +98,11 @@ enum kcsan_value_change { */ KCSAN_VALUE_CHANGE_MAYBE, + /* + * Did not observe a value-change, and it is invalid to report the race. + */ + KCSAN_VALUE_CHANGE_FALSE, + /* * The value was observed to change, and the race should be reported. */ diff --git a/kernel/kcsan/report.c b/kernel/kcsan/report.c index d871476dc134..11c791b886f3 100644 --- a/kernel/kcsan/report.c +++ b/kernel/kcsan/report.c @@ -132,6 +132,9 @@ static bool rate_limit_report(unsigned long frame1, unsigned long frame2) static bool skip_report(enum kcsan_value_change value_change, unsigned long top_frame) { + /* Should never get here if value_change==FALSE. */ + WARN_ON_ONCE(value_change == KCSAN_VALUE_CHANGE_FALSE); + /* * The first call to skip_report always has value_change==TRUE, since we * cannot know the value written of an instrumented access. For the 2nd @@ -493,7 +496,15 @@ void kcsan_report(const volatile void *ptr, size_t size, int access_type, kcsan_disable_current(); if (prepare_report(&flags, ptr, size, access_type, cpu_id, type)) { + /* + * Never report if value_change is FALSE, only if it is + * either TRUE or MAYBE. In case of MAYBE, further filtering may + * be done once we know the full stack trace in print_report(). + */ - if (print_report(ptr, size, access_type, value_change, cpu_id, type) && panic_on_warn) + bool reported = value_change != KCSAN_VALUE_CHANGE_FALSE && + print_report(ptr, size, access_type, value_change, cpu_id, type); + + if (reported && panic_on_warn) panic("panic_on_warn set ...\n"); release_report(&flags, type); -- cgit v1.2.3 From 703b321501c95c658275fd76775282fe45989641 Mon Sep 17 00:00:00 2001 From: Marco Elver Date: Tue, 11 Feb 2020 17:04:23 +0100 Subject: kcsan: Introduce ASSERT_EXCLUSIVE_BITS(var, mask) This introduces ASSERT_EXCLUSIVE_BITS(var, mask). ASSERT_EXCLUSIVE_BITS(var, mask) will cause KCSAN to assume that the following access is safe w.r.t. data races (however, please see the docbook comment for disclaimer here). For more context on why this was considered necessary, please see: http://lkml.kernel.org/r/1580995070-25139-1-git-send-email-cai@lca.pw In particular, before this patch, data races between reads (that use @mask bits of an access that should not be modified concurrently) and writes (that change ~@mask bits not used by the readers) would have been annotated with "data_race()" (or "READ_ONCE()"). However, doing so would then hide real problems: we would no longer be able to detect harmful races between reads to @mask bits and writes to @mask bits. Therefore, by using ASSERT_EXCLUSIVE_BITS(var, mask), we accomplish: 1. Avoid proliferation of specific macros at the call sites: by including a single mask in the argument list, we can use the same macro in a wide variety of call sites, regardless of how and which bits in a field each call site actually accesses. 2. The existing code does not need to be modified (although READ_ONCE() may still be advisable if we cannot prove that the data race is always safe). 3.
We catch bugs where the exclusive bits are modified concurrently. 4. We document properties of the current code. Acked-by: John Hubbard Signed-off-by: Marco Elver Signed-off-by: Paul E. McKenney Signed-off-by: Ingo Molnar Cc: David Hildenbrand Cc: Jan Kara Cc: Qian Cai --- include/linux/kcsan-checks.h | 69 ++++++++++++++++++++++++++++++++++++++++---- kernel/kcsan/debugfs.c | 15 +++++++++- 2 files changed, 77 insertions(+), 7 deletions(-) (limited to 'include/linux') diff --git a/include/linux/kcsan-checks.h b/include/linux/kcsan-checks.h index 4ef5233ff3f0..8f9f6e2191dc 100644 --- a/include/linux/kcsan-checks.h +++ b/include/linux/kcsan-checks.h @@ -152,9 +152,9 @@ static inline void kcsan_check_access(const volatile void *ptr, size_t size, #endif /** - * ASSERT_EXCLUSIVE_WRITER - assert no other threads are writing @var + * ASSERT_EXCLUSIVE_WRITER - assert no concurrent writes to @var * - * Assert that there are no other threads writing @var; other readers are + * Assert that there are no concurrent writes to @var; other readers are * allowed. This assertion can be used to specify properties of concurrent code, * where violation cannot be detected as a normal data race. * @@ -171,11 +171,11 @@ static inline void kcsan_check_access(const volatile void *ptr, size_t size, __kcsan_check_access(&(var), sizeof(var), KCSAN_ACCESS_ASSERT) /** - * ASSERT_EXCLUSIVE_ACCESS - assert no other threads are accessing @var + * ASSERT_EXCLUSIVE_ACCESS - assert no concurrent accesses to @var * - * Assert that no other thread is accessing @var (no readers nor writers). This - * assertion can be used to specify properties of concurrent code, where - * violation cannot be detected as a normal data race. + * Assert that there are no concurrent accesses to @var (no readers nor + * writers). This assertion can be used to specify properties of concurrent + * code, where violation cannot be detected as a normal data race. * * For example, in a reference-counting algorithm where exclusive access is * expected after the refcount reaches 0. We can check that this property @@ -191,4 +191,61 @@ static inline void kcsan_check_access(const volatile void *ptr, size_t size, #define ASSERT_EXCLUSIVE_ACCESS(var) \ __kcsan_check_access(&(var), sizeof(var), KCSAN_ACCESS_WRITE | KCSAN_ACCESS_ASSERT) +/** + * ASSERT_EXCLUSIVE_BITS - assert no concurrent writes to subset of bits in @var + * + * Bit-granular variant of ASSERT_EXCLUSIVE_WRITER(var). + * + * Assert that there are no concurrent writes to a subset of bits in @var; + * concurrent readers are permitted. This assertion captures more detailed + * bit-level properties, compared to the other (word granularity) assertions. + * Only the bits set in @mask are checked for concurrent modifications, while + * ignoring the remaining bits, i.e. concurrent writes (or reads) to ~@mask bits + * are ignored. + * + * Use this for variables, where some bits must not be modified concurrently, + * yet other bits are expected to be modified concurrently. + * + * For example, variables where, after initialization, some bits are read-only, + * but other bits may still be modified concurrently. 
A reader may wish to + * assert that this is true as follows: + * + * ASSERT_EXCLUSIVE_BITS(flags, READ_ONLY_MASK); + * foo = (READ_ONCE(flags) & READ_ONLY_MASK) >> READ_ONLY_SHIFT; + * + * Note: The access that immediately follows ASSERT_EXCLUSIVE_BITS() is + * assumed to access the masked bits only, and KCSAN optimistically assumes it + * is therefore safe, even in the presence of data races, and marking it with + * READ_ONCE() is optional from KCSAN's point-of-view. We caution, however, + * that it may still be advisable to do so, since we cannot reason about all + * compiler optimizations when it comes to bit manipulations (on the reader + * and writer side). If you are sure nothing can go wrong, we can write the + * above simply as: + * + * ASSERT_EXCLUSIVE_BITS(flags, READ_ONLY_MASK); + * foo = (flags & READ_ONLY_MASK) >> READ_ONLY_SHIFT; + * + * Another example, where this may be used, is when certain bits of @var may + * only be modified when holding the appropriate lock, but other bits may still + * be modified concurrently. Writers, where other bits may change concurrently, + * could use the assertion as follows: + * + * spin_lock(&foo_lock); + * ASSERT_EXCLUSIVE_BITS(flags, FOO_MASK); + * old_flags = READ_ONCE(flags); + * new_flags = (old_flags & ~FOO_MASK) | (new_foo << FOO_SHIFT); + * if (cmpxchg(&flags, old_flags, new_flags) != old_flags) { ... } + * spin_unlock(&foo_lock); + * + * @var variable to assert on + * @mask only check for modifications to bits set in @mask + */ +#define ASSERT_EXCLUSIVE_BITS(var, mask) \ + do { \ + kcsan_set_access_mask(mask); \ + __kcsan_check_access(&(var), sizeof(var), KCSAN_ACCESS_ASSERT);\ + kcsan_set_access_mask(0); \ + kcsan_atomic_next(1); \ + } while (0) + #endif /* _LINUX_KCSAN_CHECKS_H */ diff --git a/kernel/kcsan/debugfs.c b/kernel/kcsan/debugfs.c index 9bbba0e57c9b..2ff196123977 100644 --- a/kernel/kcsan/debugfs.c +++ b/kernel/kcsan/debugfs.c @@ -100,8 +100,10 @@ static noinline void microbenchmark(unsigned long iters) * debugfs file from multiple tasks to generate real conflicts and show reports. */ static long test_dummy; +static long test_flags; static noinline void test_thread(unsigned long iters) { + const long CHANGE_BITS = 0xff00ff00ff00ff00L; const struct kcsan_ctx ctx_save = current->kcsan_ctx; cycles_t cycles; @@ -109,16 +111,27 @@ static noinline void test_thread(unsigned long iters) memset(¤t->kcsan_ctx, 0, sizeof(current->kcsan_ctx)); pr_info("KCSAN: %s begin | iters: %lu\n", __func__, iters); + pr_info("test_dummy@%px, test_flags@%px\n", &test_dummy, &test_flags); cycles = get_cycles(); while (iters--) { + /* These all should generate reports. 
*/ __kcsan_check_read(&test_dummy, sizeof(test_dummy)); - __kcsan_check_write(&test_dummy, sizeof(test_dummy)); ASSERT_EXCLUSIVE_WRITER(test_dummy); ASSERT_EXCLUSIVE_ACCESS(test_dummy); + ASSERT_EXCLUSIVE_BITS(test_flags, ~CHANGE_BITS); /* no report */ + __kcsan_check_read(&test_flags, sizeof(test_flags)); /* no report */ + + ASSERT_EXCLUSIVE_BITS(test_flags, CHANGE_BITS); /* report */ + __kcsan_check_read(&test_flags, sizeof(test_flags)); /* no report */ + /* not actually instrumented */ WRITE_ONCE(test_dummy, iters); /* to observe value-change */ + __kcsan_check_write(&test_dummy, sizeof(test_dummy)); + + test_flags ^= CHANGE_BITS; /* generate value-change */ + __kcsan_check_write(&test_flags, sizeof(test_flags)); } cycles = get_cycles() - cycles; -- cgit v1.2.3 From 143324fd89eff7423c01eba79fc7044003a257e9 Mon Sep 17 00:00:00 2001 From: Rohit Sarkar Date: Wed, 18 Mar 2020 21:41:51 +0530 Subject: iio: core: Make mlock internal to the iio core "mlock" should ideally only be used by the iio core. The mlock implementation may change in the future, which means that no driver should be explicitly using mlock. Signed-off-by: Rohit Sarkar Signed-off-by: Jonathan Cameron --- include/linux/iio/iio.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/iio/iio.h b/include/linux/iio/iio.h index eed58ed2f368..e975020abaa6 100644 --- a/include/linux/iio/iio.h +++ b/include/linux/iio/iio.h @@ -492,7 +492,7 @@ struct iio_buffer_setup_ops { * @buffer: [DRIVER] any buffer present * @buffer_list: [INTERN] list of all buffers currently attached * @scan_bytes: [INTERN] num bytes captured to be fed to buffer demux - * @mlock: [DRIVER] lock used to prevent simultaneous device state + * @mlock: [INTERN] lock used to prevent simultaneous device state * changes * @available_scan_masks: [DRIVER] optional array of allowed bitmasks * @masklength: [INTERN] the length of the mask established from -- cgit v1.2.3 From 1443b8c9e712ef8914a2cab9ae7ce133229ed96c Mon Sep 17 00:00:00 2001 From: Marco Elver Date: Thu, 5 Mar 2020 15:21:09 +0100 Subject: kcsan: Update API documentation in kcsan-checks.h Update the API documentation for ASSERT_EXCLUSIVE_* macros and make them generate readable documentation for the code examples. All @variable short summaries were missing ':'; this was fixed for the whole file. Tested with "make htmldocs". Signed-off-by: Marco Elver Signed-off-by: Paul E. McKenney --- include/linux/kcsan-checks.h | 98 +++++++++++++++++++++++++++----------------- 1 file changed, 61 insertions(+), 37 deletions(-) (limited to 'include/linux') diff --git a/include/linux/kcsan-checks.h b/include/linux/kcsan-checks.h index 8f9f6e2191dc..3cd8bb03eb41 100644 --- a/include/linux/kcsan-checks.h +++ b/include/linux/kcsan-checks.h @@ -26,9 +26,9 @@ /** * __kcsan_check_access - check generic access for races * - * @ptr address of access - * @size size of access - * @type access type modifier + * @ptr: address of access + * @size: size of access + * @type: access type modifier */ void __kcsan_check_access(const volatile void *ptr, size_t size, int type); @@ -64,7 +64,7 @@ void kcsan_flat_atomic_end(void); * Force treating the next n memory accesses for the current context as atomic * operations. * - * @n number of following memory accesses to treat as atomic. + * @n: number of following memory accesses to treat as atomic. 
*/ void kcsan_atomic_next(int n); @@ -74,7 +74,7 @@ void kcsan_atomic_next(int n); * Set the access mask for all accesses for the current context if non-zero. * Only value changes to bits set in the mask will be reported. * - * @mask bitmask + * @mask: bitmask */ void kcsan_set_access_mask(unsigned long mask); @@ -106,16 +106,16 @@ static inline void kcsan_check_access(const volatile void *ptr, size_t size, /** * __kcsan_check_read - check regular read access for races * - * @ptr address of access - * @size size of access + * @ptr: address of access + * @size: size of access */ #define __kcsan_check_read(ptr, size) __kcsan_check_access(ptr, size, 0) /** * __kcsan_check_write - check regular write access for races * - * @ptr address of access - * @size size of access + * @ptr: address of access + * @size: size of access */ #define __kcsan_check_write(ptr, size) \ __kcsan_check_access(ptr, size, KCSAN_ACCESS_WRITE) @@ -123,16 +123,16 @@ static inline void kcsan_check_access(const volatile void *ptr, size_t size, /** * kcsan_check_read - check regular read access for races * - * @ptr address of access - * @size size of access + * @ptr: address of access + * @size: size of access */ #define kcsan_check_read(ptr, size) kcsan_check_access(ptr, size, 0) /** * kcsan_check_write - check regular write access for races * - * @ptr address of access - * @size size of access + * @ptr: address of access + * @size: size of access */ #define kcsan_check_write(ptr, size) \ kcsan_check_access(ptr, size, KCSAN_ACCESS_WRITE) @@ -158,14 +158,26 @@ static inline void kcsan_check_access(const volatile void *ptr, size_t size, * allowed. This assertion can be used to specify properties of concurrent code, * where violation cannot be detected as a normal data race. * - * For example, if a per-CPU variable is only meant to be written by a single - * CPU, but may be read from other CPUs; in this case, reads and writes must be - * marked properly, however, if an off-CPU WRITE_ONCE() races with the owning - * CPU's WRITE_ONCE(), would not constitute a data race but could be a harmful - * race condition. Using this macro allows specifying this property in the code - * and catch such bugs. + * For example, if we only have a single writer, but multiple concurrent + * readers, to avoid data races, all these accesses must be marked; even + * concurrent marked writes racing with the single writer are bugs. + * Unfortunately, due to being marked, they are no longer data races. For cases + * like these, we can use the macro as follows: * - * @var variable to assert on + * .. code-block:: c + * + * void writer(void) { + * spin_lock(&update_foo_lock); + * ASSERT_EXCLUSIVE_WRITER(shared_foo); + * WRITE_ONCE(shared_foo, ...); + * spin_unlock(&update_foo_lock); + * } + * void reader(void) { + * // update_foo_lock does not need to be held! + * ... = READ_ONCE(shared_foo); + * } + * + * @var: variable to assert on */ #define ASSERT_EXCLUSIVE_WRITER(var) \ __kcsan_check_access(&(var), sizeof(var), KCSAN_ACCESS_ASSERT) @@ -177,16 +189,22 @@ static inline void kcsan_check_access(const volatile void *ptr, size_t size, * writers). This assertion can be used to specify properties of concurrent * code, where violation cannot be detected as a normal data race. * - * For example, in a reference-counting algorithm where exclusive access is - * expected after the refcount reaches 0. 
We can check that this property - * actually holds as follows: + * For example, where exclusive access is expected after determining no other + * users of an object are left, but the object is not actually freed. We can + * check that this property actually holds as follows: + * + * .. code-block:: c * * if (refcount_dec_and_test(&obj->refcnt)) { * ASSERT_EXCLUSIVE_ACCESS(*obj); - * safely_dispose_of(obj); + * do_some_cleanup(obj); + * release_for_reuse(obj); * } * - * @var variable to assert on + * Note: For cases where the object is freed, `KASAN `_ is a better + * fit to detect use-after-free bugs. + * + * @var: variable to assert on */ #define ASSERT_EXCLUSIVE_ACCESS(var) \ __kcsan_check_access(&(var), sizeof(var), KCSAN_ACCESS_WRITE | KCSAN_ACCESS_ASSERT) @@ -200,7 +218,7 @@ static inline void kcsan_check_access(const volatile void *ptr, size_t size, * concurrent readers are permitted. This assertion captures more detailed * bit-level properties, compared to the other (word granularity) assertions. * Only the bits set in @mask are checked for concurrent modifications, while - * ignoring the remaining bits, i.e. concurrent writes (or reads) to ~@mask bits + * ignoring the remaining bits, i.e. concurrent writes (or reads) to ~mask bits * are ignored. * * Use this for variables, where some bits must not be modified concurrently, @@ -210,17 +228,21 @@ static inline void kcsan_check_access(const volatile void *ptr, size_t size, * but other bits may still be modified concurrently. A reader may wish to * assert that this is true as follows: * + * .. code-block:: c + * * ASSERT_EXCLUSIVE_BITS(flags, READ_ONLY_MASK); * foo = (READ_ONCE(flags) & READ_ONLY_MASK) >> READ_ONLY_SHIFT; * - * Note: The access that immediately follows ASSERT_EXCLUSIVE_BITS() is - * assumed to access the masked bits only, and KCSAN optimistically assumes it - * is therefore safe, even in the presence of data races, and marking it with - * READ_ONCE() is optional from KCSAN's point-of-view. We caution, however, - * that it may still be advisable to do so, since we cannot reason about all - * compiler optimizations when it comes to bit manipulations (on the reader - * and writer side). If you are sure nothing can go wrong, we can write the - * above simply as: + * Note: The access that immediately follows ASSERT_EXCLUSIVE_BITS() is assumed + * to access the masked bits only, and KCSAN optimistically assumes it is + * therefore safe, even in the presence of data races, and marking it with + * READ_ONCE() is optional from KCSAN's point-of-view. We caution, however, that + * it may still be advisable to do so, since we cannot reason about all compiler + * optimizations when it comes to bit manipulations (on the reader and writer + * side). If you are sure nothing can go wrong, we can write the above simply + * as: + * + * .. code-block:: c * * ASSERT_EXCLUSIVE_BITS(flags, READ_ONLY_MASK); * foo = (flags & READ_ONLY_MASK) >> READ_ONLY_SHIFT; @@ -230,15 +252,17 @@ static inline void kcsan_check_access(const volatile void *ptr, size_t size, * be modified concurrently. Writers, where other bits may change concurrently, * could use the assertion as follows: * + * .. code-block:: c + * * spin_lock(&foo_lock); * ASSERT_EXCLUSIVE_BITS(flags, FOO_MASK); - * old_flags = READ_ONCE(flags); + * old_flags = flags; * new_flags = (old_flags & ~FOO_MASK) | (new_foo << FOO_SHIFT); * if (cmpxchg(&flags, old_flags, new_flags) != old_flags) { ... 
} + * spin_unlock(&foo_lock); + * - * @var variable to assert on - * @mask only check for modifications to bits set in @mask + * @var: variable to assert on + * @mask: only check for modifications to bits set in @mask */ #define ASSERT_EXCLUSIVE_BITS(var, mask) \ do { \ -- cgit v1.2.3 From 09606b5446c25b2c3b16c5bd2977eca730e3a570 Mon Sep 17 00:00:00 2001 From: Christian König Date: Thu, 22 Mar 2018 17:09:42 +0100 Subject: dma-buf: add peer2peer flag MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add a peer2peer flag noting that the importer can deal with device resources which are not backed by pages. Signed-off-by: Christian König Acked-by: Daniel Vetter Acked-by: Sumit Semwal Link: https://patchwork.freedesktop.org/patch/359286/ --- drivers/dma-buf/dma-buf.c | 2 ++ include/linux/dma-buf.h | 10 ++++++++++ 2 files changed, 12 insertions(+) (limited to 'include/linux') diff --git a/drivers/dma-buf/dma-buf.c b/drivers/dma-buf/dma-buf.c index ccc9eda1bc28..570c923023e6 100644 --- a/drivers/dma-buf/dma-buf.c +++ b/drivers/dma-buf/dma-buf.c @@ -690,6 +690,8 @@ dma_buf_dynamic_attach(struct dma_buf *dmabuf, struct device *dev, attach->dev = dev; attach->dmabuf = dmabuf; + if (importer_ops) + attach->peer2peer = importer_ops->allow_peer2peer; attach->importer_ops = importer_ops; attach->importer_priv = importer_priv; diff --git a/include/linux/dma-buf.h b/include/linux/dma-buf.h index 1ade486fc2bb..82e0a4a64601 100644 --- a/include/linux/dma-buf.h +++ b/include/linux/dma-buf.h @@ -334,6 +334,14 @@ struct dma_buf { * Attachment operations implemented by the importer. */ struct dma_buf_attach_ops { + /** + * @allow_peer2peer: + * + * If this is set to true, the importer must be able to handle peer + * resources without struct pages. + */ + bool allow_peer2peer; + /** * @move_notify * @@ -362,6 +370,7 @@ struct dma_buf_attach_ops { * @node: list of dma_buf_attachment, protected by dma_resv lock of the dmabuf. * @sgt: cached mapping. * @dir: direction of cached mapping. + * @peer2peer: true if the importer can handle peer resources without pages. * @priv: exporter specific attachment data. * @importer_ops: importer operations for this attachment, if provided * dma_buf_map/unmap_attachment() must be called with the dma_resv lock held. * @@ -382,6 +391,7 @@ struct dma_buf_attachment { struct list_head node; struct sg_table *sgt; enum dma_data_direction dir; + bool peer2peer; const struct dma_buf_attach_ops *importer_ops; void *importer_priv; void *priv; -- cgit v1.2.3 From 2c758e301ed95aefde68f98584204811d55c9bb8 Mon Sep 17 00:00:00 2001 From: Enric Balletbo i Serra Date: Wed, 25 Mar 2020 18:31:22 +0100 Subject: soc / drm: mediatek: Move routing control to mmsys device Provide mtk_mmsys_ddp_connect() and mtk_mmsys_ddp_disconnect() functions to replace mtk_ddp_add_comp_to_path() and mtk_ddp_remove_comp_from_path(). These functions will allow the DRM driver and others to control the data path routing. 
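As an illustrative sketch (not part of this patch; the DDP_COMPONENT_* IDs come from the mediatek DRM driver's enum mtk_ddp_comp_id), a consumer of the new API wires up and later tears down one link of the display pipeline like this:

	#include <linux/soc/mediatek/mtk-mmsys.h>

	static void example_route_setup(struct device *mmsys_dev)
	{
		/* Program the routing registers for the OVL0 -> COLOR0 link. */
		mtk_mmsys_ddp_connect(mmsys_dev, DDP_COMPONENT_OVL0,
				      DDP_COMPONENT_COLOR0);
	}

	static void example_route_teardown(struct device *mmsys_dev)
	{
		/* Clear the same routing bits when the pipeline is disabled. */
		mtk_mmsys_ddp_disconnect(mmsys_dev, DDP_COMPONENT_OVL0,
					 DDP_COMPONENT_COLOR0);
	}

The mmsys device pointer is the parent of the DRM platform device, as obtained in mtk_drm_probe() below.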
Signed-off-by: Enric Balletbo i Serra Reviewed-by: Matthias Brugger Reviewed-by: CK Hu Acked-by: CK Hu Tested-by: Anders Roxell Reviewed-by: Chun-Kuang Hu Signed-off-by: Matthias Brugger --- drivers/gpu/drm/mediatek/Kconfig | 1 + drivers/gpu/drm/mediatek/mtk_drm_crtc.c | 19 ++- drivers/gpu/drm/mediatek/mtk_drm_ddp.c | 256 ----------------------------- drivers/gpu/drm/mediatek/mtk_drm_ddp.h | 7 - drivers/gpu/drm/mediatek/mtk_drm_drv.c | 14 +- drivers/gpu/drm/mediatek/mtk_drm_drv.h | 2 +- drivers/soc/mediatek/mtk-mmsys.c | 281 ++++++++++++++++++++++++++++++++ include/linux/soc/mediatek/mtk-mmsys.h | 20 +++ 8 files changed, 318 insertions(+), 282 deletions(-) create mode 100644 include/linux/soc/mediatek/mtk-mmsys.h (limited to 'include/linux') diff --git a/drivers/gpu/drm/mediatek/Kconfig b/drivers/gpu/drm/mediatek/Kconfig index fa5ffc4fe823..c420f5a3d33b 100644 --- a/drivers/gpu/drm/mediatek/Kconfig +++ b/drivers/gpu/drm/mediatek/Kconfig @@ -11,6 +11,7 @@ config DRM_MEDIATEK select DRM_MIPI_DSI select DRM_PANEL select MEMORY + select MTK_MMSYS select MTK_SMI select VIDEOMODE_HELPERS help diff --git a/drivers/gpu/drm/mediatek/mtk_drm_crtc.c b/drivers/gpu/drm/mediatek/mtk_drm_crtc.c index fe85e487e477..fe46c4bac64d 100644 --- a/drivers/gpu/drm/mediatek/mtk_drm_crtc.c +++ b/drivers/gpu/drm/mediatek/mtk_drm_crtc.c @@ -6,6 +6,7 @@ #include #include #include +#include #include #include @@ -28,7 +29,7 @@ * @enabled: records whether crtc_enable succeeded * @planes: array of 4 drm_plane structures, one for each overlay plane * @pending_planes: whether any plane has pending changes to be applied - * @config_regs: memory mapped mmsys configuration register space + * @mmsys_dev: pointer to the mmsys device for configuration registers * @mutex: handle to one of the ten disp_mutex streams * @ddp_comp_nr: number of components in ddp_comp * @ddp_comp: array of pointers the mtk_ddp_comp structures used by this crtc @@ -50,7 +51,7 @@ struct mtk_drm_crtc { u32 cmdq_event; #endif - void __iomem *config_regs; + struct device *mmsys_dev; struct mtk_disp_mutex *mutex; unsigned int ddp_comp_nr; struct mtk_ddp_comp **ddp_comp; @@ -300,9 +301,9 @@ static int mtk_crtc_ddp_hw_init(struct mtk_drm_crtc *mtk_crtc) DRM_DEBUG_DRIVER("mediatek_ddp_ddp_path_setup\n"); for (i = 0; i < mtk_crtc->ddp_comp_nr - 1; i++) { - mtk_ddp_add_comp_to_path(mtk_crtc->config_regs, - mtk_crtc->ddp_comp[i]->id, - mtk_crtc->ddp_comp[i + 1]->id); + mtk_mmsys_ddp_connect(mtk_crtc->mmsys_dev, + mtk_crtc->ddp_comp[i]->id, + mtk_crtc->ddp_comp[i + 1]->id); mtk_disp_mutex_add_comp(mtk_crtc->mutex, mtk_crtc->ddp_comp[i]->id); } @@ -360,9 +361,9 @@ static void mtk_crtc_ddp_hw_fini(struct mtk_drm_crtc *mtk_crtc) mtk_crtc->ddp_comp[i]->id); mtk_disp_mutex_disable(mtk_crtc->mutex); for (i = 0; i < mtk_crtc->ddp_comp_nr - 1; i++) { - mtk_ddp_remove_comp_from_path(mtk_crtc->config_regs, - mtk_crtc->ddp_comp[i]->id, - mtk_crtc->ddp_comp[i + 1]->id); + mtk_mmsys_ddp_disconnect(mtk_crtc->mmsys_dev, + mtk_crtc->ddp_comp[i]->id, + mtk_crtc->ddp_comp[i + 1]->id); mtk_disp_mutex_remove_comp(mtk_crtc->mutex, mtk_crtc->ddp_comp[i]->id); } @@ -766,7 +767,7 @@ int mtk_drm_crtc_create(struct drm_device *drm_dev, if (!mtk_crtc) return -ENOMEM; - mtk_crtc->config_regs = priv->config_regs; + mtk_crtc->mmsys_dev = priv->mmsys_dev; mtk_crtc->ddp_comp_nr = path_len; mtk_crtc->ddp_comp = devm_kmalloc_array(dev, mtk_crtc->ddp_comp_nr, sizeof(*mtk_crtc->ddp_comp), diff --git a/drivers/gpu/drm/mediatek/mtk_drm_ddp.c b/drivers/gpu/drm/mediatek/mtk_drm_ddp.c index 
b885f60f474c..014c1bbe1df2 100644 --- a/drivers/gpu/drm/mediatek/mtk_drm_ddp.c +++ b/drivers/gpu/drm/mediatek/mtk_drm_ddp.c @@ -13,26 +13,6 @@ #include "mtk_drm_ddp.h" #include "mtk_drm_ddp_comp.h" -#define DISP_REG_CONFIG_DISP_OVL0_MOUT_EN 0x040 -#define DISP_REG_CONFIG_DISP_OVL1_MOUT_EN 0x044 -#define DISP_REG_CONFIG_DISP_OD_MOUT_EN 0x048 -#define DISP_REG_CONFIG_DISP_GAMMA_MOUT_EN 0x04c -#define DISP_REG_CONFIG_DISP_UFOE_MOUT_EN 0x050 -#define DISP_REG_CONFIG_DISP_COLOR0_SEL_IN 0x084 -#define DISP_REG_CONFIG_DISP_COLOR1_SEL_IN 0x088 -#define DISP_REG_CONFIG_DSIE_SEL_IN 0x0a4 -#define DISP_REG_CONFIG_DSIO_SEL_IN 0x0a8 -#define DISP_REG_CONFIG_DPI_SEL_IN 0x0ac -#define DISP_REG_CONFIG_DISP_RDMA2_SOUT 0x0b8 -#define DISP_REG_CONFIG_DISP_RDMA0_SOUT_EN 0x0c4 -#define DISP_REG_CONFIG_DISP_RDMA1_SOUT_EN 0x0c8 -#define DISP_REG_CONFIG_MMSYS_CG_CON0 0x100 - -#define DISP_REG_CONFIG_DISP_OVL_MOUT_EN 0x030 -#define DISP_REG_CONFIG_OUT_SEL 0x04c -#define DISP_REG_CONFIG_DSI_SEL 0x050 -#define DISP_REG_CONFIG_DPI_SEL 0x064 - #define MT2701_DISP_MUTEX0_MOD0 0x2c #define MT2701_DISP_MUTEX0_SOF0 0x30 @@ -94,48 +74,6 @@ #define MUTEX_SOF_DSI2 5 #define MUTEX_SOF_DSI3 6 -#define OVL0_MOUT_EN_COLOR0 0x1 -#define OD_MOUT_EN_RDMA0 0x1 -#define OD1_MOUT_EN_RDMA1 BIT(16) -#define UFOE_MOUT_EN_DSI0 0x1 -#define COLOR0_SEL_IN_OVL0 0x1 -#define OVL1_MOUT_EN_COLOR1 0x1 -#define GAMMA_MOUT_EN_RDMA1 0x1 -#define RDMA0_SOUT_DPI0 0x2 -#define RDMA0_SOUT_DPI1 0x3 -#define RDMA0_SOUT_DSI1 0x1 -#define RDMA0_SOUT_DSI2 0x4 -#define RDMA0_SOUT_DSI3 0x5 -#define RDMA1_SOUT_DPI0 0x2 -#define RDMA1_SOUT_DPI1 0x3 -#define RDMA1_SOUT_DSI1 0x1 -#define RDMA1_SOUT_DSI2 0x4 -#define RDMA1_SOUT_DSI3 0x5 -#define RDMA2_SOUT_DPI0 0x2 -#define RDMA2_SOUT_DPI1 0x3 -#define RDMA2_SOUT_DSI1 0x1 -#define RDMA2_SOUT_DSI2 0x4 -#define RDMA2_SOUT_DSI3 0x5 -#define DPI0_SEL_IN_RDMA1 0x1 -#define DPI0_SEL_IN_RDMA2 0x3 -#define DPI1_SEL_IN_RDMA1 (0x1 << 8) -#define DPI1_SEL_IN_RDMA2 (0x3 << 8) -#define DSI0_SEL_IN_RDMA1 0x1 -#define DSI0_SEL_IN_RDMA2 0x4 -#define DSI1_SEL_IN_RDMA1 0x1 -#define DSI1_SEL_IN_RDMA2 0x4 -#define DSI2_SEL_IN_RDMA1 (0x1 << 16) -#define DSI2_SEL_IN_RDMA2 (0x4 << 16) -#define DSI3_SEL_IN_RDMA1 (0x1 << 16) -#define DSI3_SEL_IN_RDMA2 (0x4 << 16) -#define COLOR1_SEL_IN_OVL1 0x1 - -#define OVL_MOUT_EN_RDMA 0x1 -#define BLS_TO_DSI_RDMA1_TO_DPI1 0x8 -#define BLS_TO_DPI_RDMA1_TO_DSI 0x2 -#define DSI_SEL_IN_BLS 0x0 -#define DPI_SEL_IN_BLS 0x0 -#define DSI_SEL_IN_RDMA 0x1 struct mtk_disp_mutex { int id; @@ -246,200 +184,6 @@ static const struct mtk_ddp_data mt8173_ddp_driver_data = { .mutex_sof_reg = MT2701_DISP_MUTEX0_SOF0, }; -static unsigned int mtk_ddp_mout_en(enum mtk_ddp_comp_id cur, - enum mtk_ddp_comp_id next, - unsigned int *addr) -{ - unsigned int value; - - if (cur == DDP_COMPONENT_OVL0 && next == DDP_COMPONENT_COLOR0) { - *addr = DISP_REG_CONFIG_DISP_OVL0_MOUT_EN; - value = OVL0_MOUT_EN_COLOR0; - } else if (cur == DDP_COMPONENT_OVL0 && next == DDP_COMPONENT_RDMA0) { - *addr = DISP_REG_CONFIG_DISP_OVL_MOUT_EN; - value = OVL_MOUT_EN_RDMA; - } else if (cur == DDP_COMPONENT_OD0 && next == DDP_COMPONENT_RDMA0) { - *addr = DISP_REG_CONFIG_DISP_OD_MOUT_EN; - value = OD_MOUT_EN_RDMA0; - } else if (cur == DDP_COMPONENT_UFOE && next == DDP_COMPONENT_DSI0) { - *addr = DISP_REG_CONFIG_DISP_UFOE_MOUT_EN; - value = UFOE_MOUT_EN_DSI0; - } else if (cur == DDP_COMPONENT_OVL1 && next == DDP_COMPONENT_COLOR1) { - *addr = DISP_REG_CONFIG_DISP_OVL1_MOUT_EN; - value = OVL1_MOUT_EN_COLOR1; - } else if (cur == DDP_COMPONENT_GAMMA && next 
== DDP_COMPONENT_RDMA1) { - *addr = DISP_REG_CONFIG_DISP_GAMMA_MOUT_EN; - value = GAMMA_MOUT_EN_RDMA1; - } else if (cur == DDP_COMPONENT_OD1 && next == DDP_COMPONENT_RDMA1) { - *addr = DISP_REG_CONFIG_DISP_OD_MOUT_EN; - value = OD1_MOUT_EN_RDMA1; - } else if (cur == DDP_COMPONENT_RDMA0 && next == DDP_COMPONENT_DPI0) { - *addr = DISP_REG_CONFIG_DISP_RDMA0_SOUT_EN; - value = RDMA0_SOUT_DPI0; - } else if (cur == DDP_COMPONENT_RDMA0 && next == DDP_COMPONENT_DPI1) { - *addr = DISP_REG_CONFIG_DISP_RDMA0_SOUT_EN; - value = RDMA0_SOUT_DPI1; - } else if (cur == DDP_COMPONENT_RDMA0 && next == DDP_COMPONENT_DSI1) { - *addr = DISP_REG_CONFIG_DISP_RDMA0_SOUT_EN; - value = RDMA0_SOUT_DSI1; - } else if (cur == DDP_COMPONENT_RDMA0 && next == DDP_COMPONENT_DSI2) { - *addr = DISP_REG_CONFIG_DISP_RDMA0_SOUT_EN; - value = RDMA0_SOUT_DSI2; - } else if (cur == DDP_COMPONENT_RDMA0 && next == DDP_COMPONENT_DSI3) { - *addr = DISP_REG_CONFIG_DISP_RDMA0_SOUT_EN; - value = RDMA0_SOUT_DSI3; - } else if (cur == DDP_COMPONENT_RDMA1 && next == DDP_COMPONENT_DSI1) { - *addr = DISP_REG_CONFIG_DISP_RDMA1_SOUT_EN; - value = RDMA1_SOUT_DSI1; - } else if (cur == DDP_COMPONENT_RDMA1 && next == DDP_COMPONENT_DSI2) { - *addr = DISP_REG_CONFIG_DISP_RDMA1_SOUT_EN; - value = RDMA1_SOUT_DSI2; - } else if (cur == DDP_COMPONENT_RDMA1 && next == DDP_COMPONENT_DSI3) { - *addr = DISP_REG_CONFIG_DISP_RDMA1_SOUT_EN; - value = RDMA1_SOUT_DSI3; - } else if (cur == DDP_COMPONENT_RDMA1 && next == DDP_COMPONENT_DPI0) { - *addr = DISP_REG_CONFIG_DISP_RDMA1_SOUT_EN; - value = RDMA1_SOUT_DPI0; - } else if (cur == DDP_COMPONENT_RDMA1 && next == DDP_COMPONENT_DPI1) { - *addr = DISP_REG_CONFIG_DISP_RDMA1_SOUT_EN; - value = RDMA1_SOUT_DPI1; - } else if (cur == DDP_COMPONENT_RDMA2 && next == DDP_COMPONENT_DPI0) { - *addr = DISP_REG_CONFIG_DISP_RDMA2_SOUT; - value = RDMA2_SOUT_DPI0; - } else if (cur == DDP_COMPONENT_RDMA2 && next == DDP_COMPONENT_DPI1) { - *addr = DISP_REG_CONFIG_DISP_RDMA2_SOUT; - value = RDMA2_SOUT_DPI1; - } else if (cur == DDP_COMPONENT_RDMA2 && next == DDP_COMPONENT_DSI1) { - *addr = DISP_REG_CONFIG_DISP_RDMA2_SOUT; - value = RDMA2_SOUT_DSI1; - } else if (cur == DDP_COMPONENT_RDMA2 && next == DDP_COMPONENT_DSI2) { - *addr = DISP_REG_CONFIG_DISP_RDMA2_SOUT; - value = RDMA2_SOUT_DSI2; - } else if (cur == DDP_COMPONENT_RDMA2 && next == DDP_COMPONENT_DSI3) { - *addr = DISP_REG_CONFIG_DISP_RDMA2_SOUT; - value = RDMA2_SOUT_DSI3; - } else { - value = 0; - } - - return value; -} - -static unsigned int mtk_ddp_sel_in(enum mtk_ddp_comp_id cur, - enum mtk_ddp_comp_id next, - unsigned int *addr) -{ - unsigned int value; - - if (cur == DDP_COMPONENT_OVL0 && next == DDP_COMPONENT_COLOR0) { - *addr = DISP_REG_CONFIG_DISP_COLOR0_SEL_IN; - value = COLOR0_SEL_IN_OVL0; - } else if (cur == DDP_COMPONENT_RDMA1 && next == DDP_COMPONENT_DPI0) { - *addr = DISP_REG_CONFIG_DPI_SEL_IN; - value = DPI0_SEL_IN_RDMA1; - } else if (cur == DDP_COMPONENT_RDMA1 && next == DDP_COMPONENT_DPI1) { - *addr = DISP_REG_CONFIG_DPI_SEL_IN; - value = DPI1_SEL_IN_RDMA1; - } else if (cur == DDP_COMPONENT_RDMA1 && next == DDP_COMPONENT_DSI0) { - *addr = DISP_REG_CONFIG_DSIE_SEL_IN; - value = DSI0_SEL_IN_RDMA1; - } else if (cur == DDP_COMPONENT_RDMA1 && next == DDP_COMPONENT_DSI1) { - *addr = DISP_REG_CONFIG_DSIO_SEL_IN; - value = DSI1_SEL_IN_RDMA1; - } else if (cur == DDP_COMPONENT_RDMA1 && next == DDP_COMPONENT_DSI2) { - *addr = DISP_REG_CONFIG_DSIE_SEL_IN; - value = DSI2_SEL_IN_RDMA1; - } else if (cur == DDP_COMPONENT_RDMA1 && next == DDP_COMPONENT_DSI3) { - *addr = 
DISP_REG_CONFIG_DSIO_SEL_IN; - value = DSI3_SEL_IN_RDMA1; - } else if (cur == DDP_COMPONENT_RDMA2 && next == DDP_COMPONENT_DPI0) { - *addr = DISP_REG_CONFIG_DPI_SEL_IN; - value = DPI0_SEL_IN_RDMA2; - } else if (cur == DDP_COMPONENT_RDMA2 && next == DDP_COMPONENT_DPI1) { - *addr = DISP_REG_CONFIG_DPI_SEL_IN; - value = DPI1_SEL_IN_RDMA2; - } else if (cur == DDP_COMPONENT_RDMA2 && next == DDP_COMPONENT_DSI0) { - *addr = DISP_REG_CONFIG_DSIE_SEL_IN; - value = DSI0_SEL_IN_RDMA2; - } else if (cur == DDP_COMPONENT_RDMA2 && next == DDP_COMPONENT_DSI1) { - *addr = DISP_REG_CONFIG_DSIO_SEL_IN; - value = DSI1_SEL_IN_RDMA2; - } else if (cur == DDP_COMPONENT_RDMA2 && next == DDP_COMPONENT_DSI2) { - *addr = DISP_REG_CONFIG_DSIE_SEL_IN; - value = DSI2_SEL_IN_RDMA2; - } else if (cur == DDP_COMPONENT_RDMA2 && next == DDP_COMPONENT_DSI3) { - *addr = DISP_REG_CONFIG_DSIE_SEL_IN; - value = DSI3_SEL_IN_RDMA2; - } else if (cur == DDP_COMPONENT_OVL1 && next == DDP_COMPONENT_COLOR1) { - *addr = DISP_REG_CONFIG_DISP_COLOR1_SEL_IN; - value = COLOR1_SEL_IN_OVL1; - } else if (cur == DDP_COMPONENT_BLS && next == DDP_COMPONENT_DSI0) { - *addr = DISP_REG_CONFIG_DSI_SEL; - value = DSI_SEL_IN_BLS; - } else { - value = 0; - } - - return value; -} - -static void mtk_ddp_sout_sel(void __iomem *config_regs, - enum mtk_ddp_comp_id cur, - enum mtk_ddp_comp_id next) -{ - if (cur == DDP_COMPONENT_BLS && next == DDP_COMPONENT_DSI0) { - writel_relaxed(BLS_TO_DSI_RDMA1_TO_DPI1, - config_regs + DISP_REG_CONFIG_OUT_SEL); - } else if (cur == DDP_COMPONENT_BLS && next == DDP_COMPONENT_DPI0) { - writel_relaxed(BLS_TO_DPI_RDMA1_TO_DSI, - config_regs + DISP_REG_CONFIG_OUT_SEL); - writel_relaxed(DSI_SEL_IN_RDMA, - config_regs + DISP_REG_CONFIG_DSI_SEL); - writel_relaxed(DPI_SEL_IN_BLS, - config_regs + DISP_REG_CONFIG_DPI_SEL); - } -} - -void mtk_ddp_add_comp_to_path(void __iomem *config_regs, - enum mtk_ddp_comp_id cur, - enum mtk_ddp_comp_id next) -{ - unsigned int addr, value, reg; - - value = mtk_ddp_mout_en(cur, next, &addr); - if (value) { - reg = readl_relaxed(config_regs + addr) | value; - writel_relaxed(reg, config_regs + addr); - } - - mtk_ddp_sout_sel(config_regs, cur, next); - - value = mtk_ddp_sel_in(cur, next, &addr); - if (value) { - reg = readl_relaxed(config_regs + addr) | value; - writel_relaxed(reg, config_regs + addr); - } -} - -void mtk_ddp_remove_comp_from_path(void __iomem *config_regs, - enum mtk_ddp_comp_id cur, - enum mtk_ddp_comp_id next) -{ - unsigned int addr, value, reg; - - value = mtk_ddp_mout_en(cur, next, &addr); - if (value) { - reg = readl_relaxed(config_regs + addr) & ~value; - writel_relaxed(reg, config_regs + addr); - } - - value = mtk_ddp_sel_in(cur, next, &addr); - if (value) { - reg = readl_relaxed(config_regs + addr) & ~value; - writel_relaxed(reg, config_regs + addr); - } -} - struct mtk_disp_mutex *mtk_disp_mutex_get(struct device *dev, unsigned int id) { struct mtk_ddp *ddp = dev_get_drvdata(dev); diff --git a/drivers/gpu/drm/mediatek/mtk_drm_ddp.h b/drivers/gpu/drm/mediatek/mtk_drm_ddp.h index 827be424a148..6b691a57be4a 100644 --- a/drivers/gpu/drm/mediatek/mtk_drm_ddp.h +++ b/drivers/gpu/drm/mediatek/mtk_drm_ddp.h @@ -12,13 +12,6 @@ struct regmap; struct device; struct mtk_disp_mutex; -void mtk_ddp_add_comp_to_path(void __iomem *config_regs, - enum mtk_ddp_comp_id cur, - enum mtk_ddp_comp_id next); -void mtk_ddp_remove_comp_from_path(void __iomem *config_regs, - enum mtk_ddp_comp_id cur, - enum mtk_ddp_comp_id next); - struct mtk_disp_mutex *mtk_disp_mutex_get(struct device *dev, unsigned int 
id); int mtk_disp_mutex_prepare(struct mtk_disp_mutex *mutex); void mtk_disp_mutex_add_comp(struct mtk_disp_mutex *mutex, diff --git a/drivers/gpu/drm/mediatek/mtk_drm_drv.c b/drivers/gpu/drm/mediatek/mtk_drm_drv.c index 0563c6813333..f2f07098265d 100644 --- a/drivers/gpu/drm/mediatek/mtk_drm_drv.c +++ b/drivers/gpu/drm/mediatek/mtk_drm_drv.c @@ -10,6 +10,7 @@ #include #include #include +#include #include #include @@ -425,7 +426,6 @@ static int mtk_drm_probe(struct platform_device *pdev) { struct device *dev = &pdev->dev; struct mtk_drm_private *private; - struct resource *mem; struct device_node *node; struct component_match *match = NULL; int ret; @@ -436,14 +436,10 @@ static int mtk_drm_probe(struct platform_device *pdev) return -ENOMEM; private->data = of_device_get_match_data(dev); - - mem = platform_get_resource(pdev, IORESOURCE_MEM, 0); - private->config_regs = devm_ioremap_resource(dev, mem); - if (IS_ERR(private->config_regs)) { - ret = PTR_ERR(private->config_regs); - dev_err(dev, "Failed to ioremap mmsys-config resource: %d\n", - ret); - return ret; + private->mmsys_dev = dev->parent; + if (!private->mmsys_dev) { + dev_err(dev, "Failed to get MMSYS device\n"); + return -ENODEV; } /* Iterate over sibling DISP function blocks */ diff --git a/drivers/gpu/drm/mediatek/mtk_drm_drv.h b/drivers/gpu/drm/mediatek/mtk_drm_drv.h index 17bc99b9f5d4..b5be63e53176 100644 --- a/drivers/gpu/drm/mediatek/mtk_drm_drv.h +++ b/drivers/gpu/drm/mediatek/mtk_drm_drv.h @@ -39,7 +39,7 @@ struct mtk_drm_private { struct device_node *mutex_node; struct device *mutex_dev; - void __iomem *config_regs; + struct device *mmsys_dev; struct device_node *comp_node[DDP_COMPONENT_ID_MAX]; struct mtk_ddp_comp *ddp_comp[DDP_COMPONENT_ID_MAX]; const struct mtk_mmsys_driver_data *data; diff --git a/drivers/soc/mediatek/mtk-mmsys.c b/drivers/soc/mediatek/mtk-mmsys.c index dbdfedd302fa..703064f45616 100644 --- a/drivers/soc/mediatek/mtk-mmsys.c +++ b/drivers/soc/mediatek/mtk-mmsys.c @@ -5,8 +5,76 @@ */ #include +#include #include #include +#include + +#include "../../gpu/drm/mediatek/mtk_drm_ddp.h" +#include "../../gpu/drm/mediatek/mtk_drm_ddp_comp.h" + +#define DISP_REG_CONFIG_DISP_OVL0_MOUT_EN 0x040 +#define DISP_REG_CONFIG_DISP_OVL1_MOUT_EN 0x044 +#define DISP_REG_CONFIG_DISP_OD_MOUT_EN 0x048 +#define DISP_REG_CONFIG_DISP_GAMMA_MOUT_EN 0x04c +#define DISP_REG_CONFIG_DISP_UFOE_MOUT_EN 0x050 +#define DISP_REG_CONFIG_DISP_COLOR0_SEL_IN 0x084 +#define DISP_REG_CONFIG_DISP_COLOR1_SEL_IN 0x088 +#define DISP_REG_CONFIG_DSIE_SEL_IN 0x0a4 +#define DISP_REG_CONFIG_DSIO_SEL_IN 0x0a8 +#define DISP_REG_CONFIG_DPI_SEL_IN 0x0ac +#define DISP_REG_CONFIG_DISP_RDMA2_SOUT 0x0b8 +#define DISP_REG_CONFIG_DISP_RDMA0_SOUT_EN 0x0c4 +#define DISP_REG_CONFIG_DISP_RDMA1_SOUT_EN 0x0c8 +#define DISP_REG_CONFIG_MMSYS_CG_CON0 0x100 + +#define DISP_REG_CONFIG_DISP_OVL_MOUT_EN 0x030 +#define DISP_REG_CONFIG_OUT_SEL 0x04c +#define DISP_REG_CONFIG_DSI_SEL 0x050 +#define DISP_REG_CONFIG_DPI_SEL 0x064 + +#define OVL0_MOUT_EN_COLOR0 0x1 +#define OD_MOUT_EN_RDMA0 0x1 +#define OD1_MOUT_EN_RDMA1 BIT(16) +#define UFOE_MOUT_EN_DSI0 0x1 +#define COLOR0_SEL_IN_OVL0 0x1 +#define OVL1_MOUT_EN_COLOR1 0x1 +#define GAMMA_MOUT_EN_RDMA1 0x1 +#define RDMA0_SOUT_DPI0 0x2 +#define RDMA0_SOUT_DPI1 0x3 +#define RDMA0_SOUT_DSI1 0x1 +#define RDMA0_SOUT_DSI2 0x4 +#define RDMA0_SOUT_DSI3 0x5 +#define RDMA1_SOUT_DPI0 0x2 +#define RDMA1_SOUT_DPI1 0x3 +#define RDMA1_SOUT_DSI1 0x1 +#define RDMA1_SOUT_DSI2 0x4 +#define RDMA1_SOUT_DSI3 0x5 +#define RDMA2_SOUT_DPI0 0x2 +#define 
RDMA2_SOUT_DPI1 0x3 +#define RDMA2_SOUT_DSI1 0x1 +#define RDMA2_SOUT_DSI2 0x4 +#define RDMA2_SOUT_DSI3 0x5 +#define DPI0_SEL_IN_RDMA1 0x1 +#define DPI0_SEL_IN_RDMA2 0x3 +#define DPI1_SEL_IN_RDMA1 (0x1 << 8) +#define DPI1_SEL_IN_RDMA2 (0x3 << 8) +#define DSI0_SEL_IN_RDMA1 0x1 +#define DSI0_SEL_IN_RDMA2 0x4 +#define DSI1_SEL_IN_RDMA1 0x1 +#define DSI1_SEL_IN_RDMA2 0x4 +#define DSI2_SEL_IN_RDMA1 (0x1 << 16) +#define DSI2_SEL_IN_RDMA2 (0x4 << 16) +#define DSI3_SEL_IN_RDMA1 (0x1 << 16) +#define DSI3_SEL_IN_RDMA2 (0x4 << 16) +#define COLOR1_SEL_IN_OVL1 0x1 + +#define OVL_MOUT_EN_RDMA 0x1 +#define BLS_TO_DSI_RDMA1_TO_DPI1 0x8 +#define BLS_TO_DPI_RDMA1_TO_DSI 0x2 +#define DSI_SEL_IN_BLS 0x0 +#define DPI_SEL_IN_BLS 0x0 +#define DSI_SEL_IN_RDMA 0x1 struct mtk_mmsys_driver_data { const char *clk_driver; @@ -16,10 +84,223 @@ static const struct mtk_mmsys_driver_data mt8173_mmsys_driver_data = { .clk_driver = "clk-mt8173-mm", }; +static unsigned int mtk_mmsys_ddp_mout_en(enum mtk_ddp_comp_id cur, + enum mtk_ddp_comp_id next, + unsigned int *addr) +{ + unsigned int value; + + if (cur == DDP_COMPONENT_OVL0 && next == DDP_COMPONENT_COLOR0) { + *addr = DISP_REG_CONFIG_DISP_OVL0_MOUT_EN; + value = OVL0_MOUT_EN_COLOR0; + } else if (cur == DDP_COMPONENT_OVL0 && next == DDP_COMPONENT_RDMA0) { + *addr = DISP_REG_CONFIG_DISP_OVL_MOUT_EN; + value = OVL_MOUT_EN_RDMA; + } else if (cur == DDP_COMPONENT_OD0 && next == DDP_COMPONENT_RDMA0) { + *addr = DISP_REG_CONFIG_DISP_OD_MOUT_EN; + value = OD_MOUT_EN_RDMA0; + } else if (cur == DDP_COMPONENT_UFOE && next == DDP_COMPONENT_DSI0) { + *addr = DISP_REG_CONFIG_DISP_UFOE_MOUT_EN; + value = UFOE_MOUT_EN_DSI0; + } else if (cur == DDP_COMPONENT_OVL1 && next == DDP_COMPONENT_COLOR1) { + *addr = DISP_REG_CONFIG_DISP_OVL1_MOUT_EN; + value = OVL1_MOUT_EN_COLOR1; + } else if (cur == DDP_COMPONENT_GAMMA && next == DDP_COMPONENT_RDMA1) { + *addr = DISP_REG_CONFIG_DISP_GAMMA_MOUT_EN; + value = GAMMA_MOUT_EN_RDMA1; + } else if (cur == DDP_COMPONENT_OD1 && next == DDP_COMPONENT_RDMA1) { + *addr = DISP_REG_CONFIG_DISP_OD_MOUT_EN; + value = OD1_MOUT_EN_RDMA1; + } else if (cur == DDP_COMPONENT_RDMA0 && next == DDP_COMPONENT_DPI0) { + *addr = DISP_REG_CONFIG_DISP_RDMA0_SOUT_EN; + value = RDMA0_SOUT_DPI0; + } else if (cur == DDP_COMPONENT_RDMA0 && next == DDP_COMPONENT_DPI1) { + *addr = DISP_REG_CONFIG_DISP_RDMA0_SOUT_EN; + value = RDMA0_SOUT_DPI1; + } else if (cur == DDP_COMPONENT_RDMA0 && next == DDP_COMPONENT_DSI1) { + *addr = DISP_REG_CONFIG_DISP_RDMA0_SOUT_EN; + value = RDMA0_SOUT_DSI1; + } else if (cur == DDP_COMPONENT_RDMA0 && next == DDP_COMPONENT_DSI2) { + *addr = DISP_REG_CONFIG_DISP_RDMA0_SOUT_EN; + value = RDMA0_SOUT_DSI2; + } else if (cur == DDP_COMPONENT_RDMA0 && next == DDP_COMPONENT_DSI3) { + *addr = DISP_REG_CONFIG_DISP_RDMA0_SOUT_EN; + value = RDMA0_SOUT_DSI3; + } else if (cur == DDP_COMPONENT_RDMA1 && next == DDP_COMPONENT_DSI1) { + *addr = DISP_REG_CONFIG_DISP_RDMA1_SOUT_EN; + value = RDMA1_SOUT_DSI1; + } else if (cur == DDP_COMPONENT_RDMA1 && next == DDP_COMPONENT_DSI2) { + *addr = DISP_REG_CONFIG_DISP_RDMA1_SOUT_EN; + value = RDMA1_SOUT_DSI2; + } else if (cur == DDP_COMPONENT_RDMA1 && next == DDP_COMPONENT_DSI3) { + *addr = DISP_REG_CONFIG_DISP_RDMA1_SOUT_EN; + value = RDMA1_SOUT_DSI3; + } else if (cur == DDP_COMPONENT_RDMA1 && next == DDP_COMPONENT_DPI0) { + *addr = DISP_REG_CONFIG_DISP_RDMA1_SOUT_EN; + value = RDMA1_SOUT_DPI0; + } else if (cur == DDP_COMPONENT_RDMA1 && next == DDP_COMPONENT_DPI1) { + *addr = DISP_REG_CONFIG_DISP_RDMA1_SOUT_EN; + value = 
RDMA1_SOUT_DPI1; + } else if (cur == DDP_COMPONENT_RDMA2 && next == DDP_COMPONENT_DPI0) { + *addr = DISP_REG_CONFIG_DISP_RDMA2_SOUT; + value = RDMA2_SOUT_DPI0; + } else if (cur == DDP_COMPONENT_RDMA2 && next == DDP_COMPONENT_DPI1) { + *addr = DISP_REG_CONFIG_DISP_RDMA2_SOUT; + value = RDMA2_SOUT_DPI1; + } else if (cur == DDP_COMPONENT_RDMA2 && next == DDP_COMPONENT_DSI1) { + *addr = DISP_REG_CONFIG_DISP_RDMA2_SOUT; + value = RDMA2_SOUT_DSI1; + } else if (cur == DDP_COMPONENT_RDMA2 && next == DDP_COMPONENT_DSI2) { + *addr = DISP_REG_CONFIG_DISP_RDMA2_SOUT; + value = RDMA2_SOUT_DSI2; + } else if (cur == DDP_COMPONENT_RDMA2 && next == DDP_COMPONENT_DSI3) { + *addr = DISP_REG_CONFIG_DISP_RDMA2_SOUT; + value = RDMA2_SOUT_DSI3; + } else { + value = 0; + } + + return value; +} + +static unsigned int mtk_mmsys_ddp_sel_in(enum mtk_ddp_comp_id cur, + enum mtk_ddp_comp_id next, + unsigned int *addr) +{ + unsigned int value; + + if (cur == DDP_COMPONENT_OVL0 && next == DDP_COMPONENT_COLOR0) { + *addr = DISP_REG_CONFIG_DISP_COLOR0_SEL_IN; + value = COLOR0_SEL_IN_OVL0; + } else if (cur == DDP_COMPONENT_RDMA1 && next == DDP_COMPONENT_DPI0) { + *addr = DISP_REG_CONFIG_DPI_SEL_IN; + value = DPI0_SEL_IN_RDMA1; + } else if (cur == DDP_COMPONENT_RDMA1 && next == DDP_COMPONENT_DPI1) { + *addr = DISP_REG_CONFIG_DPI_SEL_IN; + value = DPI1_SEL_IN_RDMA1; + } else if (cur == DDP_COMPONENT_RDMA1 && next == DDP_COMPONENT_DSI0) { + *addr = DISP_REG_CONFIG_DSIE_SEL_IN; + value = DSI0_SEL_IN_RDMA1; + } else if (cur == DDP_COMPONENT_RDMA1 && next == DDP_COMPONENT_DSI1) { + *addr = DISP_REG_CONFIG_DSIO_SEL_IN; + value = DSI1_SEL_IN_RDMA1; + } else if (cur == DDP_COMPONENT_RDMA1 && next == DDP_COMPONENT_DSI2) { + *addr = DISP_REG_CONFIG_DSIE_SEL_IN; + value = DSI2_SEL_IN_RDMA1; + } else if (cur == DDP_COMPONENT_RDMA1 && next == DDP_COMPONENT_DSI3) { + *addr = DISP_REG_CONFIG_DSIO_SEL_IN; + value = DSI3_SEL_IN_RDMA1; + } else if (cur == DDP_COMPONENT_RDMA2 && next == DDP_COMPONENT_DPI0) { + *addr = DISP_REG_CONFIG_DPI_SEL_IN; + value = DPI0_SEL_IN_RDMA2; + } else if (cur == DDP_COMPONENT_RDMA2 && next == DDP_COMPONENT_DPI1) { + *addr = DISP_REG_CONFIG_DPI_SEL_IN; + value = DPI1_SEL_IN_RDMA2; + } else if (cur == DDP_COMPONENT_RDMA2 && next == DDP_COMPONENT_DSI0) { + *addr = DISP_REG_CONFIG_DSIE_SEL_IN; + value = DSI0_SEL_IN_RDMA2; + } else if (cur == DDP_COMPONENT_RDMA2 && next == DDP_COMPONENT_DSI1) { + *addr = DISP_REG_CONFIG_DSIO_SEL_IN; + value = DSI1_SEL_IN_RDMA2; + } else if (cur == DDP_COMPONENT_RDMA2 && next == DDP_COMPONENT_DSI2) { + *addr = DISP_REG_CONFIG_DSIE_SEL_IN; + value = DSI2_SEL_IN_RDMA2; + } else if (cur == DDP_COMPONENT_RDMA2 && next == DDP_COMPONENT_DSI3) { + *addr = DISP_REG_CONFIG_DSIE_SEL_IN; + value = DSI3_SEL_IN_RDMA2; + } else if (cur == DDP_COMPONENT_OVL1 && next == DDP_COMPONENT_COLOR1) { + *addr = DISP_REG_CONFIG_DISP_COLOR1_SEL_IN; + value = COLOR1_SEL_IN_OVL1; + } else if (cur == DDP_COMPONENT_BLS && next == DDP_COMPONENT_DSI0) { + *addr = DISP_REG_CONFIG_DSI_SEL; + value = DSI_SEL_IN_BLS; + } else { + value = 0; + } + + return value; +} + +static void mtk_mmsys_ddp_sout_sel(void __iomem *config_regs, + enum mtk_ddp_comp_id cur, + enum mtk_ddp_comp_id next) +{ + if (cur == DDP_COMPONENT_BLS && next == DDP_COMPONENT_DSI0) { + writel_relaxed(BLS_TO_DSI_RDMA1_TO_DPI1, + config_regs + DISP_REG_CONFIG_OUT_SEL); + } else if (cur == DDP_COMPONENT_BLS && next == DDP_COMPONENT_DPI0) { + writel_relaxed(BLS_TO_DPI_RDMA1_TO_DSI, + config_regs + DISP_REG_CONFIG_OUT_SEL); + writel_relaxed(DSI_SEL_IN_RDMA, 
+ config_regs + DISP_REG_CONFIG_DSI_SEL); + writel_relaxed(DPI_SEL_IN_BLS, + config_regs + DISP_REG_CONFIG_DPI_SEL); + } +} + +void mtk_mmsys_ddp_connect(struct device *dev, + enum mtk_ddp_comp_id cur, + enum mtk_ddp_comp_id next) +{ + void __iomem *config_regs = dev_get_drvdata(dev); + unsigned int addr, value, reg; + + value = mtk_mmsys_ddp_mout_en(cur, next, &addr); + if (value) { + reg = readl_relaxed(config_regs + addr) | value; + writel_relaxed(reg, config_regs + addr); + } + + mtk_mmsys_ddp_sout_sel(config_regs, cur, next); + + value = mtk_mmsys_ddp_sel_in(cur, next, &addr); + if (value) { + reg = readl_relaxed(config_regs + addr) | value; + writel_relaxed(reg, config_regs + addr); + } +} +EXPORT_SYMBOL_GPL(mtk_mmsys_ddp_connect); + +void mtk_mmsys_ddp_disconnect(struct device *dev, + enum mtk_ddp_comp_id cur, + enum mtk_ddp_comp_id next) +{ + void __iomem *config_regs = dev_get_drvdata(dev); + unsigned int addr, value, reg; + + value = mtk_mmsys_ddp_mout_en(cur, next, &addr); + if (value) { + reg = readl_relaxed(config_regs + addr) & ~value; + writel_relaxed(reg, config_regs + addr); + } + + value = mtk_mmsys_ddp_sel_in(cur, next, &addr); + if (value) { + reg = readl_relaxed(config_regs + addr) & ~value; + writel_relaxed(reg, config_regs + addr); + } +} +EXPORT_SYMBOL_GPL(mtk_mmsys_ddp_disconnect); + static int mtk_mmsys_probe(struct platform_device *pdev) { const struct mtk_mmsys_driver_data *data; + struct device *dev = &pdev->dev; struct platform_device *clks; + void __iomem *config_regs; + struct resource *mem; + int ret; + + mem = platform_get_resource(pdev, IORESOURCE_MEM, 0); + config_regs = devm_ioremap_resource(dev, mem); + if (IS_ERR(config_regs)) { + ret = PTR_ERR(config_regs); + dev_err(dev, "Failed to ioremap mmsys-config resource: %d\n", + ret); + return ret; + } + + platform_set_drvdata(pdev, config_regs); data = of_device_get_match_data(&pdev->dev); diff --git a/include/linux/soc/mediatek/mtk-mmsys.h b/include/linux/soc/mediatek/mtk-mmsys.h new file mode 100644 index 000000000000..7bab5d9a3d31 --- /dev/null +++ b/include/linux/soc/mediatek/mtk-mmsys.h @@ -0,0 +1,20 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* + * Copyright (c) 2015 MediaTek Inc. + */ + +#ifndef __MTK_MMSYS_H +#define __MTK_MMSYS_H + +enum mtk_ddp_comp_id; +struct device; + +void mtk_mmsys_ddp_connect(struct device *dev, + enum mtk_ddp_comp_id cur, + enum mtk_ddp_comp_id next); + +void mtk_mmsys_ddp_disconnect(struct device *dev, + enum mtk_ddp_comp_id cur, + enum mtk_ddp_comp_id next); + +#endif /* __MTK_MMSYS_H */ -- cgit v1.2.3 From f454f4d1915b44cc3857148aac7a297a31591501 Mon Sep 17 00:00:00 2001 From: Maciej Grochowski Date: Thu, 9 Apr 2020 01:03:37 -0400 Subject: include/ntb: Fix typo in ntb_unregister_device description Signed-off-by: Maciej Grochowski Signed-off-by: Jon Mason --- include/linux/ntb.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/ntb.h b/include/linux/ntb.h index 8c13538aeffe..a07252f78770 100644 --- a/include/linux/ntb.h +++ b/include/linux/ntb.h @@ -478,7 +478,7 @@ void ntb_unregister_client(struct ntb_client *client); int ntb_register_device(struct ntb_dev *ntb); /** - * ntb_register_device() - unregister a ntb device + * ntb_unregister_device() - unregister a ntb device * @ntb: NTB device context. * * The device will be removed from the list of ntb devices. 
If the ntb device -- cgit v1.2.3 From 757a4cefde76697af2b2c284c8a320912b77e7e6 Mon Sep 17 00:00:00 2001 From: Marco Elver Date: Wed, 25 Mar 2020 17:41:56 +0100 Subject: kcsan: Add support for scoped accesses This adds support for scoped accesses, where the memory range is checked for the duration of the scope. The feature is implemented by inserting the relevant access information into a list of scoped accesses for the current execution context, which are then checked (until removed) on every call (through instrumentation) into the KCSAN runtime. An alternative, more complex, implementation could set up a watchpoint for the scoped access, and keep the watchpoint set up. This, however, would require first exposing a handle to the watchpoint, as well as dealing with cases such as accesses by the same thread while the watchpoint is still set up (and several more cases). It is also doubtful if this would provide any benefit, since the majority of delay where the watchpoint is set up is likely due to the injected delays by KCSAN. Therefore, the implementation in this patch is simpler and avoids hurting KCSAN's main use-case (normal data race detection); it also implicitly increases scoped-access race-detection-ability due to increased probability of setting up watchpoints by repeatedly calling __kcsan_check_access() throughout the scope of the access. The implementation required adding an additional conditional branch to the fast-path. However, the microbenchmark showed a *speedup* of ~5% on the fast-path. This appears to be due to subtly improved codegen by GCC from moving get_ctx() and associated load of preempt_count earlier. Suggested-by: Boqun Feng Suggested-by: Paul E. McKenney Signed-off-by: Marco Elver Signed-off-by: Paul E. McKenney --- include/linux/kcsan-checks.h | 57 ++++++++++++++++++++++++++++++ include/linux/kcsan.h | 3 ++ init/init_task.c | 1 + kernel/kcsan/core.c | 83 +++++++++++++++++++++++++++++++++++++++----- kernel/kcsan/report.c | 33 ++++++++++++------ 5 files changed, 158 insertions(+), 19 deletions(-) (limited to 'include/linux') diff --git a/include/linux/kcsan-checks.h b/include/linux/kcsan-checks.h index 3cd8bb03eb41..b24253d3a442 100644 --- a/include/linux/kcsan-checks.h +++ b/include/linux/kcsan-checks.h @@ -3,6 +3,8 @@ #ifndef _LINUX_KCSAN_CHECKS_H #define _LINUX_KCSAN_CHECKS_H +/* Note: Only include what is already included by compiler.h. */ +#include #include /* @@ -12,10 +14,12 @@ * WRITE : write access; * ATOMIC: access is atomic; * ASSERT: access is not a regular access, but an assertion; + * SCOPED: access is a scoped access; */ #define KCSAN_ACCESS_WRITE 0x1 #define KCSAN_ACCESS_ATOMIC 0x2 #define KCSAN_ACCESS_ASSERT 0x4 +#define KCSAN_ACCESS_SCOPED 0x8 /* * __kcsan_*: Always calls into the runtime when KCSAN is enabled. This may be used @@ -78,6 +82,52 @@ void kcsan_atomic_next(int n); */ void kcsan_set_access_mask(unsigned long mask); +/* Scoped access information. */ +struct kcsan_scoped_access { + struct list_head list; + const volatile void *ptr; + size_t size; + int type; +}; +/* + * Automatically call kcsan_end_scoped_access() when kcsan_scoped_access goes + * out of scope; relies on attribute "cleanup", which is supported by all + * compilers that support KCSAN. 
+ */ +#define __kcsan_cleanup_scoped \ + __maybe_unused __attribute__((__cleanup__(kcsan_end_scoped_access))) + +/** + * kcsan_begin_scoped_access - begin scoped access + * + * Begin scoped access and initialize @sa, which will cause KCSAN to + * continuously check the memory range in the current thread until + * kcsan_end_scoped_access() is called for @sa. + * + * Scoped accesses are implemented by appending @sa to an internal list for the + * current execution context, and then checked on every call into the KCSAN + * runtime. + * + * @ptr: address of access + * @size: size of access + * @type: access type modifier + * @sa: struct kcsan_scoped_access to use for the scope of the access + */ +struct kcsan_scoped_access * +kcsan_begin_scoped_access(const volatile void *ptr, size_t size, int type, + struct kcsan_scoped_access *sa); + +/** + * kcsan_end_scoped_access - end scoped access + * + * End a scoped access, which will stop KCSAN checking the memory range. + * Requires that kcsan_begin_scoped_access() was previously called once for @sa. + * + * @sa: a previously initialized struct kcsan_scoped_access + */ +void kcsan_end_scoped_access(struct kcsan_scoped_access *sa); + + #else /* CONFIG_KCSAN */ static inline void __kcsan_check_access(const volatile void *ptr, size_t size, @@ -90,6 +140,13 @@ static inline void kcsan_flat_atomic_end(void) { } static inline void kcsan_atomic_next(int n) { } static inline void kcsan_set_access_mask(unsigned long mask) { } +struct kcsan_scoped_access { }; +#define __kcsan_cleanup_scoped __maybe_unused +static inline struct kcsan_scoped_access * +kcsan_begin_scoped_access(const volatile void *ptr, size_t size, int type, + struct kcsan_scoped_access *sa) { return sa; } +static inline void kcsan_end_scoped_access(struct kcsan_scoped_access *sa) { } + #endif /* CONFIG_KCSAN */ /* diff --git a/include/linux/kcsan.h b/include/linux/kcsan.h index 3b84606e1e67..17ae59e4b685 100644 --- a/include/linux/kcsan.h +++ b/include/linux/kcsan.h @@ -40,6 +40,9 @@ struct kcsan_ctx { * Access mask for all accesses if non-zero. */ unsigned long access_mask; + + /* List of scoped accesses. */ + struct list_head scoped_accesses; }; /** diff --git a/init/init_task.c b/init/init_task.c index 096191d177d5..198943851caf 100644 --- a/init/init_task.c +++ b/init/init_task.c @@ -168,6 +168,7 @@ struct task_struct init_task .atomic_nest_count = 0, .in_flat_atomic = false, .access_mask = 0, + .scoped_accesses = {LIST_POISON1, NULL}, }, #endif #ifdef CONFIG_TRACE_IRQFLAGS diff --git a/kernel/kcsan/core.c b/kernel/kcsan/core.c index 4d8ea0fca5f1..a572aae61b98 100644 --- a/kernel/kcsan/core.c +++ b/kernel/kcsan/core.c @@ -6,6 +6,7 @@ #include #include #include +#include #include #include #include @@ -42,6 +43,7 @@ static DEFINE_PER_CPU(struct kcsan_ctx, kcsan_cpu_ctx) = { .atomic_nest_count = 0, .in_flat_atomic = false, .access_mask = 0, + .scoped_accesses = {LIST_POISON1, NULL}, }; /* @@ -191,12 +193,23 @@ static __always_inline struct kcsan_ctx *get_ctx(void) return in_task() ? ¤t->kcsan_ctx : raw_cpu_ptr(&kcsan_cpu_ctx); } +/* Check scoped accesses; never inline because this is a slow-path! */ +static noinline void kcsan_check_scoped_accesses(void) +{ + struct kcsan_ctx *ctx = get_ctx(); + struct list_head *prev_save = ctx->scoped_accesses.prev; + struct kcsan_scoped_access *scoped_access; + + ctx->scoped_accesses.prev = NULL; /* Avoid recursion. 
*/ + list_for_each_entry(scoped_access, &ctx->scoped_accesses, list) + __kcsan_check_access(scoped_access->ptr, scoped_access->size, scoped_access->type); + ctx->scoped_accesses.prev = prev_save; +} + /* Rules for generic atomic accesses. Called from fast-path. */ static __always_inline bool -is_atomic(const volatile void *ptr, size_t size, int type) +is_atomic(const volatile void *ptr, size_t size, int type, struct kcsan_ctx *ctx) { - struct kcsan_ctx *ctx; - if (type & KCSAN_ACCESS_ATOMIC) return true; @@ -213,7 +226,6 @@ is_atomic(const volatile void *ptr, size_t size, int type) IS_ALIGNED((unsigned long)ptr, size)) return true; /* Assume aligned writes up to word size are atomic. */ - ctx = get_ctx(); if (ctx->atomic_next > 0) { /* * Because we do not have separate contexts for nested @@ -233,7 +245,7 @@ is_atomic(const volatile void *ptr, size_t size, int type) } static __always_inline bool -should_watch(const volatile void *ptr, size_t size, int type) +should_watch(const volatile void *ptr, size_t size, int type, struct kcsan_ctx *ctx) { /* * Never set up watchpoints when memory operations are atomic. @@ -242,7 +254,7 @@ should_watch(const volatile void *ptr, size_t size, int type) * should not count towards skipped instructions, and (2) to actually * decrement kcsan_atomic_next for consecutive instruction stream. */ - if (is_atomic(ptr, size, type)) + if (is_atomic(ptr, size, type, ctx)) return false; if (this_cpu_dec_return(kcsan_skip) >= 0) @@ -563,8 +575,14 @@ static __always_inline void check_access(const volatile void *ptr, size_t size, if (unlikely(watchpoint != NULL)) kcsan_found_watchpoint(ptr, size, type, watchpoint, encoded_watchpoint); - else if (unlikely(should_watch(ptr, size, type))) - kcsan_setup_watchpoint(ptr, size, type); + else { + struct kcsan_ctx *ctx = get_ctx(); /* Call only once in fast-path. */ + + if (unlikely(should_watch(ptr, size, type, ctx))) + kcsan_setup_watchpoint(ptr, size, type); + else if (unlikely(ctx->scoped_accesses.prev)) + kcsan_check_scoped_accesses(); + } } /* === Public interface ===================================================== */ @@ -660,6 +678,55 @@ void kcsan_set_access_mask(unsigned long mask) } EXPORT_SYMBOL(kcsan_set_access_mask); +struct kcsan_scoped_access * +kcsan_begin_scoped_access(const volatile void *ptr, size_t size, int type, + struct kcsan_scoped_access *sa) +{ + struct kcsan_ctx *ctx = get_ctx(); + + __kcsan_check_access(ptr, size, type); + + ctx->disable_count++; /* Disable KCSAN, in case list debugging is on. */ + + INIT_LIST_HEAD(&sa->list); + sa->ptr = ptr; + sa->size = size; + sa->type = type; + + if (!ctx->scoped_accesses.prev) /* Lazy initialize list head. */ + INIT_LIST_HEAD(&ctx->scoped_accesses); + list_add(&sa->list, &ctx->scoped_accesses); + + ctx->disable_count--; + return sa; +} +EXPORT_SYMBOL(kcsan_begin_scoped_access); + +void kcsan_end_scoped_access(struct kcsan_scoped_access *sa) +{ + struct kcsan_ctx *ctx = get_ctx(); + + if (WARN(!ctx->scoped_accesses.prev, "Unbalanced %s()?", __func__)) + return; + + ctx->disable_count++; /* Disable KCSAN, in case list debugging is on. */ + + list_del(&sa->list); + if (list_empty(&ctx->scoped_accesses)) + /* + * Ensure we do not enter kcsan_check_scoped_accesses() + * slow-path if unnecessary, and avoid requiring list_empty() + * in the fast-path (to avoid a READ_ONCE() and potential + * uaccess warning). 
+ */ + ctx->scoped_accesses.prev = NULL; + + ctx->disable_count--; + + __kcsan_check_access(sa->ptr, sa->size, sa->type); +} +EXPORT_SYMBOL(kcsan_end_scoped_access); + void __kcsan_check_access(const volatile void *ptr, size_t size, int type) { check_access(ptr, size, type); diff --git a/kernel/kcsan/report.c b/kernel/kcsan/report.c index ae0a383238ea..ddc18f1224a4 100644 --- a/kernel/kcsan/report.c +++ b/kernel/kcsan/report.c @@ -205,6 +205,20 @@ skip_report(enum kcsan_value_change value_change, unsigned long top_frame) static const char *get_access_type(int type) { + if (type & KCSAN_ACCESS_ASSERT) { + if (type & KCSAN_ACCESS_SCOPED) { + if (type & KCSAN_ACCESS_WRITE) + return "assert no accesses (scoped)"; + else + return "assert no writes (scoped)"; + } else { + if (type & KCSAN_ACCESS_WRITE) + return "assert no accesses"; + else + return "assert no writes"; + } + } + switch (type) { case 0: return "read"; @@ -214,17 +228,14 @@ static const char *get_access_type(int type) return "write"; case KCSAN_ACCESS_WRITE | KCSAN_ACCESS_ATOMIC: return "write (marked)"; - - /* - * ASSERT variants: - */ - case KCSAN_ACCESS_ASSERT: - case KCSAN_ACCESS_ASSERT | KCSAN_ACCESS_ATOMIC: - return "assert no writes"; - case KCSAN_ACCESS_ASSERT | KCSAN_ACCESS_WRITE: - case KCSAN_ACCESS_ASSERT | KCSAN_ACCESS_WRITE | KCSAN_ACCESS_ATOMIC: - return "assert no accesses"; - + case KCSAN_ACCESS_SCOPED: + return "read (scoped)"; + case KCSAN_ACCESS_SCOPED | KCSAN_ACCESS_ATOMIC: + return "read (marked, scoped)"; + case KCSAN_ACCESS_SCOPED | KCSAN_ACCESS_WRITE: + return "write (scoped)"; + case KCSAN_ACCESS_SCOPED | KCSAN_ACCESS_WRITE | KCSAN_ACCESS_ATOMIC: + return "write (marked, scoped)"; default: BUG(); } -- cgit v1.2.3 From d8949ef1d9f1062848cd068cf369a57ce33dae6f Mon Sep 17 00:00:00 2001 From: Marco Elver Date: Wed, 25 Mar 2020 17:41:58 +0100 Subject: kcsan: Introduce scoped ASSERT_EXCLUSIVE macros Introduce ASSERT_EXCLUSIVE_*_SCOPED(), which provide an intuitive interface to use the scoped-access feature, without having to explicitly mark the start and end of the desired scope. Basing duration of the checks on scope avoids accidental misuse and resulting false positives, which may be hard to debug. See added comments for usage. The macros are implemented using __attribute__((__cleanup__(func))), which is supported by all compilers that currently support KCSAN. Suggested-by: Boqun Feng Suggested-by: Paul E. McKenney Signed-off-by: Marco Elver Signed-off-by: Paul E. McKenney --- Documentation/dev-tools/kcsan.rst | 3 +- include/linux/kcsan-checks.h | 73 ++++++++++++++++++++++++++++++++++++++- kernel/kcsan/debugfs.c | 16 ++++++++- 3 files changed, 89 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/Documentation/dev-tools/kcsan.rst b/Documentation/dev-tools/kcsan.rst index 52a5d6fb9701..f4b5766f12cc 100644 --- a/Documentation/dev-tools/kcsan.rst +++ b/Documentation/dev-tools/kcsan.rst @@ -238,7 +238,8 @@ are defined at the C-language level. The following macros can be used to check properties of concurrent code where bugs would not manifest as data races. .. 
kernel-doc:: include/linux/kcsan-checks.h - :functions: ASSERT_EXCLUSIVE_WRITER ASSERT_EXCLUSIVE_ACCESS + :functions: ASSERT_EXCLUSIVE_WRITER ASSERT_EXCLUSIVE_WRITER_SCOPED + ASSERT_EXCLUSIVE_ACCESS ASSERT_EXCLUSIVE_ACCESS_SCOPED ASSERT_EXCLUSIVE_BITS Implementation Details diff --git a/include/linux/kcsan-checks.h b/include/linux/kcsan-checks.h index b24253d3a442..101df7f46d89 100644 --- a/include/linux/kcsan-checks.h +++ b/include/linux/kcsan-checks.h @@ -234,11 +234,63 @@ static inline void kcsan_check_access(const volatile void *ptr, size_t size, * ... = READ_ONCE(shared_foo); * } * + * Note: ASSERT_EXCLUSIVE_WRITER_SCOPED(), if applicable, performs more thorough + * checking if a clear scope where no concurrent writes are expected exists. + * * @var: variable to assert on */ #define ASSERT_EXCLUSIVE_WRITER(var) \ __kcsan_check_access(&(var), sizeof(var), KCSAN_ACCESS_ASSERT) +/* + * Helper macros for implementation of for ASSERT_EXCLUSIVE_*_SCOPED(). @id is + * expected to be unique for the scope in which instances of kcsan_scoped_access + * are declared. + */ +#define __kcsan_scoped_name(c, suffix) __kcsan_scoped_##c##suffix +#define __ASSERT_EXCLUSIVE_SCOPED(var, type, id) \ + struct kcsan_scoped_access __kcsan_scoped_name(id, _) \ + __kcsan_cleanup_scoped; \ + struct kcsan_scoped_access *__kcsan_scoped_name(id, _dummy_p) \ + __maybe_unused = kcsan_begin_scoped_access( \ + &(var), sizeof(var), KCSAN_ACCESS_SCOPED | (type), \ + &__kcsan_scoped_name(id, _)) + +/** + * ASSERT_EXCLUSIVE_WRITER_SCOPED - assert no concurrent writes to @var in scope + * + * Scoped variant of ASSERT_EXCLUSIVE_WRITER(). + * + * Assert that there are no concurrent writes to @var for the duration of the + * scope in which it is introduced. This provides a better way to fully cover + * the enclosing scope, compared to multiple ASSERT_EXCLUSIVE_WRITER(), and + * increases the likelihood for KCSAN to detect racing accesses. + * + * For example, it allows finding race-condition bugs that only occur due to + * state changes within the scope itself: + * + * .. code-block:: c + * + * void writer(void) { + * spin_lock(&update_foo_lock); + * { + * ASSERT_EXCLUSIVE_WRITER_SCOPED(shared_foo); + * WRITE_ONCE(shared_foo, 42); + * ... + * // shared_foo should still be 42 here! + * } + * spin_unlock(&update_foo_lock); + * } + * void buggy(void) { + * if (READ_ONCE(shared_foo) == 42) + * WRITE_ONCE(shared_foo, 1); // bug! + * } + * + * @var: variable to assert on + */ +#define ASSERT_EXCLUSIVE_WRITER_SCOPED(var) \ + __ASSERT_EXCLUSIVE_SCOPED(var, KCSAN_ACCESS_ASSERT, __COUNTER__) + /** * ASSERT_EXCLUSIVE_ACCESS - assert no concurrent accesses to @var * @@ -258,6 +310,9 @@ static inline void kcsan_check_access(const volatile void *ptr, size_t size, * release_for_reuse(obj); * } * + * Note: ASSERT_EXCLUSIVE_ACCESS_SCOPED(), if applicable, performs more thorough + * checking if a clear scope where no concurrent accesses are expected exists. + * * Note: For cases where the object is freed, `KASAN `_ is a better * fit to detect use-after-free bugs. * @@ -266,10 +321,26 @@ static inline void kcsan_check_access(const volatile void *ptr, size_t size, #define ASSERT_EXCLUSIVE_ACCESS(var) \ __kcsan_check_access(&(var), sizeof(var), KCSAN_ACCESS_WRITE | KCSAN_ACCESS_ASSERT) +/** + * ASSERT_EXCLUSIVE_ACCESS_SCOPED - assert no concurrent accesses to @var in scope + * + * Scoped variant of ASSERT_EXCLUSIVE_ACCESS(). 
+ * + * Assert that there are no concurrent accesses to @var (no readers nor writers) + * for the entire duration of the scope in which it is introduced. This provides + * a better way to fully cover the enclosing scope, compared to multiple + * ASSERT_EXCLUSIVE_ACCESS(), and increases the likelihood for KCSAN to detect + * racing accesses. + * + * @var: variable to assert on + */ +#define ASSERT_EXCLUSIVE_ACCESS_SCOPED(var) \ + __ASSERT_EXCLUSIVE_SCOPED(var, KCSAN_ACCESS_WRITE | KCSAN_ACCESS_ASSERT, __COUNTER__) + /** * ASSERT_EXCLUSIVE_BITS - assert no concurrent writes to subset of bits in @var * - * Bit-granular variant of ASSERT_EXCLUSIVE_WRITER(var). + * Bit-granular variant of ASSERT_EXCLUSIVE_WRITER(). * * Assert that there are no concurrent writes to a subset of bits in @var; * concurrent readers are permitted. This assertion captures more detailed
diff --git a/kernel/kcsan/debugfs.c b/kernel/kcsan/debugfs.c index 72ee188ebc54..1a08664a7fab 100644 --- a/kernel/kcsan/debugfs.c +++ b/kernel/kcsan/debugfs.c @@ -110,6 +110,7 @@ static noinline void microbenchmark(unsigned long iters) */ static long test_dummy; static long test_flags; +static long test_scoped; static noinline void test_thread(unsigned long iters) { const long CHANGE_BITS = 0xff00ff00ff00ff00L; @@ -120,7 +121,8 @@ static noinline void test_thread(unsigned long iters) memset(&current->kcsan_ctx, 0, sizeof(current->kcsan_ctx)); pr_info("KCSAN: %s begin | iters: %lu\n", __func__, iters); - pr_info("test_dummy@%px, test_flags@%px\n", &test_dummy, &test_flags); + pr_info("test_dummy@%px, test_flags@%px, test_scoped@%px,\n", + &test_dummy, &test_flags, &test_scoped); cycles = get_cycles(); while (iters--) { @@ -141,6 +143,18 @@ static noinline void test_thread(unsigned long iters) test_flags ^= CHANGE_BITS; /* generate value-change */ __kcsan_check_write(&test_flags, sizeof(test_flags)); + + BUG_ON(current->kcsan_ctx.scoped_accesses.prev); + { + /* Should generate reports anywhere in this block. */ + ASSERT_EXCLUSIVE_WRITER_SCOPED(test_scoped); + ASSERT_EXCLUSIVE_ACCESS_SCOPED(test_scoped); + BUG_ON(!current->kcsan_ctx.scoped_accesses.prev); + /* Unrelated accesses. */ + __kcsan_check_access(&cycles, sizeof(cycles), 0); + __kcsan_check_access(&cycles, sizeof(cycles), KCSAN_ACCESS_ATOMIC); + } + BUG_ON(current->kcsan_ctx.scoped_accesses.prev); } cycles = get_cycles() - cycles; -- cgit v1.2.3
From 01b4ff58f72dbee926077d9afa0650f6e685e866 Mon Sep 17 00:00:00 2001 From: Marco Elver Date: Tue, 31 Mar 2020 21:32:32 +0200 Subject: kcsan: Move kcsan_{disable,enable}_current() to kcsan-checks.h Both affect access checks, and should therefore be in kcsan-checks.h. This is in preparation to use these in compiler.h. Acked-by: Will Deacon Signed-off-by: Marco Elver Signed-off-by: Paul E. McKenney --- include/linux/kcsan-checks.h | 16 ++++++++++++++++ include/linux/kcsan.h | 16 ---------------- 2 files changed, 16 insertions(+), 16 deletions(-) (limited to 'include/linux')
diff --git a/include/linux/kcsan-checks.h b/include/linux/kcsan-checks.h index 101df7f46d89..ef95ddc49182 100644 --- a/include/linux/kcsan-checks.h +++ b/include/linux/kcsan-checks.h @@ -36,6 +36,20 @@ */ void __kcsan_check_access(const volatile void *ptr, size_t size, int type); +/** + * kcsan_disable_current - disable KCSAN for the current context + * + * Supports nesting. + */ +void kcsan_disable_current(void); + +/** + * kcsan_enable_current - re-enable KCSAN for the current context + * + * Supports nesting.
+ */ +void kcsan_enable_current(void); + /** * kcsan_nestable_atomic_begin - begin nestable atomic region * @@ -133,6 +147,8 @@ void kcsan_end_scoped_access(struct kcsan_scoped_access *sa); static inline void __kcsan_check_access(const volatile void *ptr, size_t size, int type) { } +static inline void kcsan_disable_current(void) { } +static inline void kcsan_enable_current(void) { } static inline void kcsan_nestable_atomic_begin(void) { } static inline void kcsan_nestable_atomic_end(void) { } static inline void kcsan_flat_atomic_begin(void) { }
diff --git a/include/linux/kcsan.h b/include/linux/kcsan.h index 17ae59e4b685..53340d8789f9 100644 --- a/include/linux/kcsan.h +++ b/include/linux/kcsan.h @@ -50,25 +50,9 @@ struct kcsan_ctx { */ void kcsan_init(void); -/** - * kcsan_disable_current - disable KCSAN for the current context - * - * Supports nesting. - */ -void kcsan_disable_current(void); - -/** - * kcsan_enable_current - re-enable KCSAN for the current context - * - * Supports nesting. - */ -void kcsan_enable_current(void); - #else /* CONFIG_KCSAN */ static inline void kcsan_init(void) { } -static inline void kcsan_disable_current(void) { } -static inline void kcsan_enable_current(void) { } #endif /* CONFIG_KCSAN */ -- cgit v1.2.3
From d071e91361bbfef524ae8abf7e560fb294d0ad64 Mon Sep 17 00:00:00 2001 From: Marco Elver Date: Tue, 31 Mar 2020 21:32:33 +0200 Subject: kcsan: Change data_race() to no longer require marking racing accesses Thus far, accesses marked with data_race() would still require the racing access to be marked in some way (be it with READ_ONCE(), WRITE_ONCE(), or data_race() itself), as otherwise KCSAN would still report a data race. This requirement, however, seems to be unintuitive, and some valid use-cases demand *not* marking other accesses, as it might hide more serious bugs (e.g. diagnostic reads). Therefore, this commit changes data_race() to no longer require marking racing accesses (although it's still recommended if possible). The alternative would have been introducing another variant of data_race(), however, since usage of data_race() already needs to be carefully reasoned about, distinguishing between these cases likely adds more complexity in the wrong place. Link: https://lkml.kernel.org/r/20200331131002.GA30975@willie-the-truck Cc: Paul E. McKenney Cc: Will Deacon Cc: Qian Cai Acked-by: Will Deacon Signed-off-by: Marco Elver Signed-off-by: Paul E. McKenney --- include/linux/compiler.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include/linux')
diff --git a/include/linux/compiler.h b/include/linux/compiler.h index f504edebd5d7..1729bd17e9b7 100644 --- a/include/linux/compiler.h +++ b/include/linux/compiler.h @@ -326,9 +326,9 @@ unsigned long read_word_at_a_time(const void *addr) #define data_race(expr) \ ({ \ typeof(({ expr; })) __val; \ - kcsan_nestable_atomic_begin(); \ + kcsan_disable_current(); \ __val = ({ expr; }); \ - kcsan_nestable_atomic_end(); \ + kcsan_enable_current(); \ __val; \ }) #else -- cgit v1.2.3
From 70771c69ab9be6e37618b1ec6c105f370e510f94 Mon Sep 17 00:00:00 2001 From: Sudeep Holla Date: Fri, 3 Apr 2020 18:10:17 +0100 Subject: firmware: arm_scmi: Add include guard to linux/scmi_protocol.h MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit If this header is included twice, it will generate loads of compile-time errors with the error pattern below. It was reported by the 0day kbuild robot on a branch pushed with double inclusion by accident.
error: conflicting types for ‘...’ note: previous declaration of ‘...’ was here error: redefinition of ‘...’ Add a header include guard just in case. Link: https://lore.kernel.org/r/20200403171018.1230-1-sudeep.holla@arm.com Reported-by: kbuild test robot Signed-off-by: Sudeep Holla --- include/linux/scmi_protocol.h | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'include/linux')
diff --git a/include/linux/scmi_protocol.h b/include/linux/scmi_protocol.h index 5c873a59b387..ce2f5c28b2df 100644 --- a/include/linux/scmi_protocol.h +++ b/include/linux/scmi_protocol.h @@ -4,6 +4,10 @@ * * Copyright (C) 2018 ARM Ltd. */ + +#ifndef _LINUX_SCMI_PROTOCOL_H +#define _LINUX_SCMI_PROTOCOL_H + #include #include @@ -319,3 +323,5 @@ static inline void scmi_driver_unregister(struct scmi_driver *driver) {} typedef int (*scmi_prot_init_fn_t)(struct scmi_handle *); int scmi_protocol_register(int protocol_id, scmi_prot_init_fn_t fn); void scmi_protocol_unregister(int protocol_id); + +#endif /* _LINUX_SCMI_PROTOCOL_H */ -- cgit v1.2.3
From 23818b3d8590a79d56af9659cf709ebfae30f832 Mon Sep 17 00:00:00 2001 From: Sudeep Holla Date: Fri, 3 Apr 2020 18:10:18 +0100 Subject: firmware: arm_scpi: Add include guard to linux/scpi_protocol.h MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit If this header is included twice, it will generate loads of compile-time errors with the error pattern below. It was reported by the 0day kbuild robot on a branch pushed with double inclusion by accident. This is based on the similar change in linux/scmi_protocol.h. error: conflicting types for ‘...’ note: previous declaration of ‘...’ was here error: redefinition of ‘...’ Add a header include guard just in case. Link: https://lore.kernel.org/r/20200403171018.1230-2-sudeep.holla@arm.com Reported-by: kbuild test robot Signed-off-by: Sudeep Holla --- include/linux/scpi_protocol.h | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'include/linux')
diff --git a/include/linux/scpi_protocol.h b/include/linux/scpi_protocol.h index ecb004711acf..afbf8037d8db 100644 --- a/include/linux/scpi_protocol.h +++ b/include/linux/scpi_protocol.h @@ -4,6 +4,10 @@ * * Copyright (C) 2014 ARM Ltd. */ + +#ifndef _LINUX_SCPI_PROTOCOL_H +#define _LINUX_SCPI_PROTOCOL_H + #include struct scpi_opp { @@ -71,3 +75,5 @@ struct scpi_ops *get_scpi_ops(void); #else static inline struct scpi_ops *get_scpi_ops(void) { return NULL; } #endif + +#endif /* _LINUX_SCPI_PROTOCOL_H */ -- cgit v1.2.3
From bceb5646a15dad49ceb29ec16b8acc10075d0627 Mon Sep 17 00:00:00 2001 From: Daniel Lezcano Date: Tue, 31 Mar 2020 18:54:48 +0200 Subject: thermal: core: Make thermal_zone_set_trips private The function thermal_zone_set_trips() is used by the thermal core code to update the next trip points; there are no other users. Move the function definition into thermal_core.h, remove the EXPORT_SYMBOL_GPL and document the function.
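As an illustration of the computation this helper performs, here is a minimal sketch (not the in-tree code; the flat trip-temperature array and the function name are assumptions made for this example, and hysteresis handling is omitted):

	/*
	 * Illustrative sketch only: find the trip temperatures closest to
	 * the current temperature, below and above (kernel context assumed).
	 */
	static void example_trip_window(const int *trip_temps, int ntrips,
					int cur_temp, int *low, int *high)
	{
		int i;

		/* Same sentinels the in-tree helper starts from. */
		*low = -INT_MAX;
		*high = INT_MAX;

		for (i = 0; i < ntrips; i++) {
			if (trip_temps[i] < cur_temp && trip_temps[i] > *low)
				*low = trip_temps[i];	/* closest trip below */
			if (trip_temps[i] > cur_temp && trip_temps[i] < *high)
				*high = trip_temps[i];	/* closest trip above */
		}

		/* The resulting window is then handed to the backend driver. */
	}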
Signed-off-by: Daniel Lezcano Reviewed-by: Lukasz Luba Reviewed-by: Amit Kucheria Link: https://lore.kernel.org/r/20200331165449.30355-1-daniel.lezcano@linaro.org --- drivers/thermal/thermal_core.h | 3 +++ drivers/thermal/thermal_helpers.c | 13 ++++++++++++- include/linux/thermal.h | 3 --- 3 files changed, 15 insertions(+), 4 deletions(-) (limited to 'include/linux')
diff --git a/drivers/thermal/thermal_core.h b/drivers/thermal/thermal_core.h index a9bf00e91d64..37cd4e2bead2 100644 --- a/drivers/thermal/thermal_core.h +++ b/drivers/thermal/thermal_core.h @@ -69,6 +69,9 @@ void thermal_zone_device_unbind_exception(struct thermal_zone_device *, int thermal_zone_device_set_policy(struct thermal_zone_device *, char *); int thermal_build_list_of_policies(char *buf); +/* Helpers */ +void thermal_zone_set_trips(struct thermal_zone_device *tz); + /* sysfs I/F */ int thermal_zone_create_device_groups(struct thermal_zone_device *, int); void thermal_zone_destroy_device_groups(struct thermal_zone_device *);
diff --git a/drivers/thermal/thermal_helpers.c b/drivers/thermal/thermal_helpers.c index 2ba756af76b7..59eaf2d0fdb3 100644 --- a/drivers/thermal/thermal_helpers.c +++ b/drivers/thermal/thermal_helpers.c @@ -113,6 +113,18 @@ exit: } EXPORT_SYMBOL_GPL(thermal_zone_get_temp); +/** + * thermal_zone_set_trips - Computes the next trip points for the driver + * @tz: a pointer to a thermal zone device structure + * + * The function computes the next temperature boundaries by browsing + * the trip points. The result is the closer low and high trip points + * to the current temperature. These values are passed to the backend + * driver to let it set its own notification mechanism (usually an + * interrupt). + * + * It does not return a value + */ void thermal_zone_set_trips(struct thermal_zone_device *tz) { int low = -INT_MAX; @@ -161,7 +173,6 @@ void thermal_zone_set_trips(struct thermal_zone_device *tz) exit: mutex_unlock(&tz->lock); } -EXPORT_SYMBOL_GPL(thermal_zone_set_trips); void thermal_cdev_update(struct thermal_cooling_device *cdev) {
diff --git a/include/linux/thermal.h b/include/linux/thermal.h index c91b1e344d56..448841ab0dca 100644 --- a/include/linux/thermal.h +++ b/include/linux/thermal.h @@ -439,7 +439,6 @@ int thermal_zone_unbind_cooling_device(struct thermal_zone_device *, int, struct thermal_cooling_device *); void thermal_zone_device_update(struct thermal_zone_device *, enum thermal_notify_event); -void thermal_zone_set_trips(struct thermal_zone_device *); struct thermal_cooling_device *thermal_cooling_device_register(const char *, void *, const struct thermal_cooling_device_ops *); @@ -497,8 +496,6 @@ static inline int thermal_zone_unbind_cooling_device( static inline void thermal_zone_device_update(struct thermal_zone_device *tz, enum thermal_notify_event event) { } -static inline void thermal_zone_set_trips(struct thermal_zone_device *tz) -{ } static inline struct thermal_cooling_device * thermal_cooling_device_register(char *type, void *devdata, const struct thermal_cooling_device_ops *ops) -- cgit v1.2.3
From 8097db407a08f33236b6fa6ba8b62d669321c720 Mon Sep 17 00:00:00 2001 From: Daniel Lezcano Date: Thu, 2 Apr 2020 16:27:39 +0200 Subject: thermal: Move default governor config option to the internal header The default governor set at compilation time is internal thermal core business; there is no need to export it in the global thermal header. Move the config options to the internal header.
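For context, a rough sketch of how such a compile-time default can be consumed inside the core, resolving the governor by name at init time (the function name is hypothetical and the real lookup logic may differ; the governor table iterator is the one visible in thermal_core.h):

	/* Hypothetical sketch: resolve DEFAULT_THERMAL_GOVERNOR by name. */
	static struct thermal_governor *example_default_governor(void)
	{
		struct thermal_governor **gov;

		for_each_governor_table(gov)
			if (!strncasecmp((*gov)->name, DEFAULT_THERMAL_GOVERNOR,
					 THERMAL_NAME_LENGTH))
				return *gov;

		return NULL;	/* no default governor was built in */
	}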
Signed-off-by: Daniel Lezcano Reviewed-by: Amit Kucheria Acked-by: Zhang Rui Link: https://lore.kernel.org/r/20200402142747.8307-1-daniel.lezcano@linaro.org --- drivers/thermal/thermal_core.h | 11 +++++++++++ include/linux/thermal.h | 11 ----------- 2 files changed, 11 insertions(+), 11 deletions(-) (limited to 'include/linux')
diff --git a/drivers/thermal/thermal_core.h b/drivers/thermal/thermal_core.h index 37cd4e2bead2..828305508556 100644 --- a/drivers/thermal/thermal_core.h +++ b/drivers/thermal/thermal_core.h @@ -12,6 +12,17 @@ #include #include +/* Default Thermal Governor */ +#if defined(CONFIG_THERMAL_DEFAULT_GOV_STEP_WISE) +#define DEFAULT_THERMAL_GOVERNOR "step_wise" +#elif defined(CONFIG_THERMAL_DEFAULT_GOV_FAIR_SHARE) +#define DEFAULT_THERMAL_GOVERNOR "fair_share" +#elif defined(CONFIG_THERMAL_DEFAULT_GOV_USER_SPACE) +#define DEFAULT_THERMAL_GOVERNOR "user_space" +#elif defined(CONFIG_THERMAL_DEFAULT_GOV_POWER_ALLOCATOR) +#define DEFAULT_THERMAL_GOVERNOR "power_allocator" +#endif + /* Initial state of a cooling device during binding */ #define THERMAL_NO_TARGET -1UL
diff --git a/include/linux/thermal.h b/include/linux/thermal.h index 448841ab0dca..71cff87dcb46 100644 --- a/include/linux/thermal.h +++ b/include/linux/thermal.h @@ -32,17 +32,6 @@ /* use value, which < 0K, to indicate an invalid/uninitialized temperature */ #define THERMAL_TEMP_INVALID -274000 -/* Default Thermal Governor */ -#if defined(CONFIG_THERMAL_DEFAULT_GOV_STEP_WISE) -#define DEFAULT_THERMAL_GOVERNOR "step_wise" -#elif defined(CONFIG_THERMAL_DEFAULT_GOV_FAIR_SHARE) -#define DEFAULT_THERMAL_GOVERNOR "fair_share" -#elif defined(CONFIG_THERMAL_DEFAULT_GOV_USER_SPACE) -#define DEFAULT_THERMAL_GOVERNOR "user_space" -#elif defined(CONFIG_THERMAL_DEFAULT_GOV_POWER_ALLOCATOR) -#define DEFAULT_THERMAL_GOVERNOR "power_allocator" -#endif - struct thermal_zone_device; struct thermal_cooling_device; struct thermal_instance; -- cgit v1.2.3
From c68df440b07f88210c5839d4507b5cbfa35e3df9 Mon Sep 17 00:00:00 2001 From: Daniel Lezcano Date: Thu, 2 Apr 2020 16:27:40 +0200 Subject: thermal: Move struct thermal_attr to the private header The structure belongs to the thermal core internals but is exported in include/linux/thermal.h. For better self-encapsulation and less compilation impact when a change is made to it, move the structure into the thermal core internal header file.
Signed-off-by: Daniel Lezcano Reviewed-by: Amit Kucheria Acked-by: Zhang Rui Link: https://lore.kernel.org/r/20200402142747.8307-2-daniel.lezcano@linaro.org --- drivers/thermal/thermal_core.h | 5 +++++ include/linux/thermal.h | 6 +----- 2 files changed, 6 insertions(+), 5 deletions(-) (limited to 'include/linux')
diff --git a/drivers/thermal/thermal_core.h b/drivers/thermal/thermal_core.h index 828305508556..5d08ad60d9df 100644 --- a/drivers/thermal/thermal_core.h +++ b/drivers/thermal/thermal_core.h @@ -41,6 +41,11 @@ extern struct thermal_governor *__governor_thermal_table_end[]; __governor < __governor_thermal_table_end; \ __governor++) +struct thermal_attr { + struct device_attribute attr; + char name[THERMAL_NAME_LENGTH]; +}; + /* * This structure is used to describe the behavior of * a certain cooling device on a certain trip point
diff --git a/include/linux/thermal.h b/include/linux/thermal.h index 71cff87dcb46..5aa80fb2fb61 100644 --- a/include/linux/thermal.h +++ b/include/linux/thermal.h @@ -35,6 +35,7 @@ struct thermal_zone_device; struct thermal_cooling_device; struct thermal_instance; +struct thermal_attr; enum thermal_device_mode { THERMAL_DEVICE_DISABLED = 0, @@ -119,11 +120,6 @@ struct thermal_cooling_device { struct list_head node; }; -struct thermal_attr { - struct device_attribute attr; - char name[THERMAL_NAME_LENGTH]; -}; - /** * struct thermal_zone_device - structure for a thermal zone * @id: unique id number for each thermal zone -- cgit v1.2.3
From 33a88af10944edc7fd390000cd6bc9bbde918bc3 Mon Sep 17 00:00:00 2001 From: Daniel Lezcano Date: Thu, 2 Apr 2020 16:27:41 +0200 Subject: thermal: Move internal IPA functions The exported IPA functions are used only by the IPA, so it is pointless to declare them in the global thermal.h file. For better self-encapsulation and less compilation impact when a change is made, move the code into the thermal core internal header file. As the users depend on THERMAL, the stubs are pointless; remove them. Take also the opportunity to fix checkpatch warnings/errors when moving the code around.
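For context (not part of the patch), cdev_is_power_actor() only reports true when a cooling device implements all three power callbacks; a hedged usage sketch built on the declarations moved by this patch (the function name is hypothetical):

	/*
	 * Hypothetical sketch: a governor-side path that only makes sense
	 * for power actors.
	 */
	static int example_power_budget(struct thermal_zone_device *tz,
					struct thermal_cooling_device *cdev)
	{
		u32 max_power;

		if (!cdev_is_power_actor(cdev))
			return -EINVAL;	/* one of the three power ops is missing */

		return power_actor_get_max_power(cdev, tz, &max_power);
	}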
Signed-off-by: Daniel Lezcano Reviewed-by: Amit Kucheria Acked-by: Zhang Rui Link: https://lore.kernel.org/r/20200402142747.8307-3-daniel.lezcano@linaro.org --- drivers/thermal/thermal_core.h | 13 +++++++++++++ include/linux/thermal.h | 24 ------------------------ 2 files changed, 13 insertions(+), 24 deletions(-) (limited to 'include/linux')
diff --git a/drivers/thermal/thermal_core.h b/drivers/thermal/thermal_core.h index 5d08ad60d9df..f99551ce9838 100644 --- a/drivers/thermal/thermal_core.h +++ b/drivers/thermal/thermal_core.h @@ -46,6 +46,19 @@ struct thermal_attr { char name[THERMAL_NAME_LENGTH]; }; +static inline bool cdev_is_power_actor(struct thermal_cooling_device *cdev) +{ + return cdev->ops->get_requested_power && cdev->ops->state2power && + cdev->ops->power2state; +} + +int power_actor_get_max_power(struct thermal_cooling_device *cdev, + struct thermal_zone_device *tz, u32 *max_power); +int power_actor_get_min_power(struct thermal_cooling_device *cdev, + struct thermal_zone_device *tz, u32 *min_power); +int power_actor_set_power(struct thermal_cooling_device *cdev, + struct thermal_instance *ti, u32 power); + /* * This structure is used to describe the behavior of * a certain cooling device on a certain trip point
diff --git a/include/linux/thermal.h b/include/linux/thermal.h index 5aa80fb2fb61..e0279f7b43f4 100644 --- a/include/linux/thermal.h +++ b/include/linux/thermal.h @@ -399,18 +399,6 @@ void devm_thermal_zone_of_sensor_unregister(struct device *dev, #endif #if IS_ENABLED(CONFIG_THERMAL) -static inline bool cdev_is_power_actor(struct thermal_cooling_device *cdev) -{ - return cdev->ops->get_requested_power && cdev->ops->state2power && - cdev->ops->power2state; -} - -int power_actor_get_max_power(struct thermal_cooling_device *, - struct thermal_zone_device *tz, u32 *max_power); -int power_actor_get_min_power(struct thermal_cooling_device *, - struct thermal_zone_device *tz, u32 *min_power); -int power_actor_set_power(struct thermal_cooling_device *, - struct thermal_instance *, u32); struct thermal_zone_device *thermal_zone_device_register(const char *, int, int, void *, struct thermal_zone_device_ops *, struct thermal_zone_params *, int, int); @@ -447,18 +435,6 @@ struct thermal_instance *get_thermal_instance(struct thermal_zone_device *, void thermal_cdev_update(struct thermal_cooling_device *); void thermal_notify_framework(struct thermal_zone_device *, int); #else -static inline bool cdev_is_power_actor(struct thermal_cooling_device *cdev) -{ return false; } -static inline int power_actor_get_max_power(struct thermal_cooling_device *cdev, - struct thermal_zone_device *tz, u32 *max_power) -{ return 0; } -static inline int power_actor_get_min_power(struct thermal_cooling_device *cdev, - struct thermal_zone_device *tz, - u32 *min_power) -{ return -ENODEV; } -static inline int power_actor_set_power(struct thermal_cooling_device *cdev, - struct thermal_instance *tz, u32 power) -{ return 0; } static inline struct thermal_zone_device *thermal_zone_device_register( const char *type, int trips, int mask, void *devdata, struct thermal_zone_device_ops *ops, -- cgit v1.2.3
From 2e7700dc336dde93cdc9394d10ccd79593fff214 Mon Sep 17 00:00:00 2001 From: Daniel Lezcano Date: Thu, 2 Apr 2020 16:27:42 +0200 Subject: thermal: Move trip point structure definition to private header The struct thermal_trip is only used by the thermal internals; it is pointless to export its definition in the global header. Move the structure to the thermal_core.h internal header.
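To illustrate the data the structure carries, a hypothetical static table (arbitrary example values; in-tree instances are normally populated from devicetree, which is what the @np member points back to):

	/* Hypothetical example; temperature and hysteresis in millidegrees Celsius. */
	static const struct thermal_trip example_trips[] = {
		{ .temperature = 75000, .hysteresis = 2000, .type = THERMAL_TRIP_PASSIVE },
		{ .temperature = 95000, .hysteresis = 0, .type = THERMAL_TRIP_CRITICAL },
	};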
Signed-off-by: Daniel Lezcano Reviewed-by: Amit Kucheria Acked-by: Zhang Rui Link: https://lore.kernel.org/r/20200402142747.8307-4-daniel.lezcano@linaro.org --- drivers/thermal/thermal_core.h | 13 +++++++++++++ include/linux/thermal.h | 15 --------------- 2 files changed, 13 insertions(+), 15 deletions(-) (limited to 'include/linux')
diff --git a/drivers/thermal/thermal_core.h b/drivers/thermal/thermal_core.h index f99551ce9838..d37de708c28a 100644 --- a/drivers/thermal/thermal_core.h +++ b/drivers/thermal/thermal_core.h @@ -58,6 +58,19 @@ int power_actor_get_min_power(struct thermal_cooling_device *cdev, struct thermal_zone_device *tz, u32 *min_power); int power_actor_set_power(struct thermal_cooling_device *cdev, struct thermal_instance *ti, u32 power); +/** + * struct thermal_trip - representation of a point in temperature domain + * @np: pointer to struct device_node that this trip point was created from + * @temperature: temperature value in miliCelsius + * @hysteresis: relative hysteresis in miliCelsius + * @type: trip point type + */ +struct thermal_trip { + struct device_node *np; + int temperature; + int hysteresis; + enum thermal_trip_type type; +}; /* * This structure is used to describe the behavior of * a certain cooling device on a certain trip point
diff --git a/include/linux/thermal.h b/include/linux/thermal.h index e0279f7b43f4..7adbfe092281 100644 --- a/include/linux/thermal.h +++ b/include/linux/thermal.h @@ -332,21 +332,6 @@ struct thermal_zone_of_device_ops { int (*set_trip_temp)(void *, int, int); }; -/** - * struct thermal_trip - representation of a point in temperature domain - * @np: pointer to struct device_node that this trip point was created from - * @temperature: temperature value in miliCelsius - * @hysteresis: relative hysteresis in miliCelsius - * @type: trip point type - */ - -struct thermal_trip { - struct device_node *np; - int temperature; - int hysteresis; - enum thermal_trip_type type; -}; - /* Function declarations */ #ifdef CONFIG_THERMAL_OF int thermal_zone_of_get_sensor_id(struct device_node *tz_np, -- cgit v1.2.3
From f0129c231772a85a726b233756cdc58ea42a9d84 Mon Sep 17 00:00:00 2001 From: Daniel Lezcano Date: Thu, 2 Apr 2020 16:27:43 +0200 Subject: thermal: Move get_tz_trend to the internal header The function is not used anywhere other than the thermal directory, so it makes no sense to export its definition in the global header. Move the definition to the internal header to allow better self-encapsulation. Take the opportunity to add the parameter names to make checkpatch happy and to remove the pointless stubs.
Signed-off-by: Daniel Lezcano Reviewed-by: Amit Kucheria Acked-by: Zhang Rui Link: https://lore.kernel.org/r/20200402142747.8307-5-daniel.lezcano@linaro.org --- drivers/thermal/thermal_core.h | 2 ++ include/linux/thermal.h | 4 +--- 2 files changed, 3 insertions(+), 3 deletions(-) (limited to 'include/linux')
diff --git a/drivers/thermal/thermal_core.h b/drivers/thermal/thermal_core.h index d37de708c28a..5fb2bd9c7034 100644 --- a/drivers/thermal/thermal_core.h +++ b/drivers/thermal/thermal_core.h @@ -72,6 +72,8 @@ struct thermal_trip { enum thermal_trip_type type; }; +int get_tz_trend(struct thermal_zone_device *tz, int trip); + /* * This structure is used to describe the behavior of * a certain cooling device on a certain trip point
diff --git a/include/linux/thermal.h b/include/linux/thermal.h index 7adbfe092281..8006ba5de855 100644 --- a/include/linux/thermal.h +++ b/include/linux/thermal.h @@ -414,7 +414,6 @@ int thermal_zone_get_temp(struct thermal_zone_device *tz, int *temp); int thermal_zone_get_slope(struct thermal_zone_device *tz); int thermal_zone_get_offset(struct thermal_zone_device *tz); -int get_tz_trend(struct thermal_zone_device *, int); struct thermal_instance *get_thermal_instance(struct thermal_zone_device *, struct thermal_cooling_device *, int); void thermal_cdev_update(struct thermal_cooling_device *); @@ -473,8 +472,7 @@ static inline int thermal_zone_get_slope( static inline int thermal_zone_get_offset( struct thermal_zone_device *tz) { return -ENODEV; } -static inline int get_tz_trend(struct thermal_zone_device *tz, int trip) -{ return -ENODEV; } + static inline struct thermal_instance * get_thermal_instance(struct thermal_zone_device *tz, struct thermal_cooling_device *cdev, int trip) -- cgit v1.2.3
From 06f1041f5023c00a54f63c269b997c61d1b3b739 Mon Sep 17 00:00:00 2001 From: Daniel Lezcano Date: Thu, 2 Apr 2020 16:27:44 +0200 Subject: thermal: Move get_thermal_instance to the internal header The function is not used anywhere other than the thermal directory, so it makes no sense to export its definition in the global header. Move the definition to the internal header to allow better self-encapsulation. Take the opportunity to add the parameter names to make checkpatch happy and to remove the pointless stubs.
Signed-off-by: Daniel Lezcano Reviewed-by: Amit Kucheria Acked-by: Zhang Rui Link: https://lore.kernel.org/r/20200402142747.8307-6-daniel.lezcano@linaro.org --- drivers/thermal/thermal_core.h | 5 +++++ include/linux/thermal.h | 6 ------ 2 files changed, 5 insertions(+), 6 deletions(-) (limited to 'include/linux')
diff --git a/drivers/thermal/thermal_core.h b/drivers/thermal/thermal_core.h index 5fb2bd9c7034..c95689586e19 100644 --- a/drivers/thermal/thermal_core.h +++ b/drivers/thermal/thermal_core.h @@ -74,6 +74,11 @@ struct thermal_trip { int get_tz_trend(struct thermal_zone_device *tz, int trip); +struct thermal_instance * +get_thermal_instance(struct thermal_zone_device *tz, + struct thermal_cooling_device *cdev, + int trip); + /* * This structure is used to describe the behavior of * a certain cooling device on a certain trip point
diff --git a/include/linux/thermal.h b/include/linux/thermal.h index 8006ba5de855..47e745c5dfca 100644 --- a/include/linux/thermal.h +++ b/include/linux/thermal.h @@ -414,8 +414,6 @@ int thermal_zone_get_temp(struct thermal_zone_device *tz, int *temp); int thermal_zone_get_slope(struct thermal_zone_device *tz); int thermal_zone_get_offset(struct thermal_zone_device *tz); -struct thermal_instance *get_thermal_instance(struct thermal_zone_device *, - struct thermal_cooling_device *, int); void thermal_cdev_update(struct thermal_cooling_device *); void thermal_notify_framework(struct thermal_zone_device *, int); #else @@ -473,10 +471,6 @@ static inline int thermal_zone_get_offset( struct thermal_zone_device *tz) { return -ENODEV; } -static inline struct thermal_instance * -get_thermal_instance(struct thermal_zone_device *tz, - struct thermal_cooling_device *cdev, int trip) -{ return ERR_PTR(-ENODEV); } static inline void thermal_cdev_update(struct thermal_cooling_device *cdev) { } static inline void thermal_notify_framework(struct thermal_zone_device *tz, -- cgit v1.2.3
From 60518260cab21e749704baa5246ff13f7559fa91 Mon Sep 17 00:00:00 2001 From: Daniel Lezcano Date: Thu, 2 Apr 2020 16:27:45 +0200 Subject: thermal: Change IS_ENABLED to IFDEF in the header file The thermal framework cannot be compiled as a module. The IS_ENABLED macro is useless here and can be replaced by an ifdef. Signed-off-by: Daniel Lezcano Reviewed-by: Amit Kucheria Acked-by: Zhang Rui Link: https://lore.kernel.org/r/20200402142747.8307-7-daniel.lezcano@linaro.org --- include/linux/thermal.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux')
diff --git a/include/linux/thermal.h b/include/linux/thermal.h index 47e745c5dfca..12df9ff0182d 100644 --- a/include/linux/thermal.h +++ b/include/linux/thermal.h @@ -383,7 +383,7 @@ void devm_thermal_zone_of_sensor_unregister(struct device *dev, #endif -#if IS_ENABLED(CONFIG_THERMAL) +#ifdef CONFIG_THERMAL struct thermal_zone_device *thermal_zone_device_register(const char *, int, int, void *, struct thermal_zone_device_ops *, struct thermal_zone_params *, int, int); -- cgit v1.2.3
From 708418500644c248d6f266e4df0bf43ce53bf746 Mon Sep 17 00:00:00 2001 From: Daniel Lezcano Date: Thu, 2 Apr 2020 16:27:46 +0200 Subject: thermal: Remove stubs for thermal_zone_[un]bind_cooling_device All callers of the functions depend on THERMAL, so it is pointless to define stubs. Remove them.
Signed-off-by: Daniel Lezcano Reviewed-by: Amit Kucheria Acked-by: Zhang Rui Link: https://lore.kernel.org/r/20200402142747.8307-8-daniel.lezcano@linaro.org --- include/linux/thermal.h | 10 ---------- 1 file changed, 10 deletions(-) (limited to 'include/linux')
diff --git a/include/linux/thermal.h b/include/linux/thermal.h index 12df9ff0182d..7b3dbfe15b59 100644 --- a/include/linux/thermal.h +++ b/include/linux/thermal.h @@ -426,16 +426,6 @@ static inline struct thermal_zone_device *thermal_zone_device_register( static inline void thermal_zone_device_unregister( struct thermal_zone_device *tz) { } -static inline int thermal_zone_bind_cooling_device( - struct thermal_zone_device *tz, int trip, - struct thermal_cooling_device *cdev, - unsigned long upper, unsigned long lower, - unsigned int weight) -{ return -ENODEV; } -static inline int thermal_zone_unbind_cooling_device( - struct thermal_zone_device *tz, int trip, - struct thermal_cooling_device *cdev) -{ return -ENODEV; } static inline void thermal_zone_device_update(struct thermal_zone_device *tz, enum thermal_notify_event event) { } -- cgit v1.2.3
From 0145f67866b71e8c1da3c1d9412623db7ba8a0c8 Mon Sep 17 00:00:00 2001 From: Daniel Lezcano Date: Thu, 2 Apr 2020 16:27:47 +0200 Subject: thermal: Remove thermal_zone_device_update() stub All users of the function depend on THERMAL; no stub is needed. Remove it. Signed-off-by: Daniel Lezcano Reviewed-by: Amit Kucheria Acked-by: Zhang Rui Link: https://lore.kernel.org/r/20200402142747.8307-9-daniel.lezcano@linaro.org --- include/linux/thermal.h | 3 --- 1 file changed, 3 deletions(-) (limited to 'include/linux')
diff --git a/include/linux/thermal.h b/include/linux/thermal.h index 7b3dbfe15b59..216185bb3014 100644 --- a/include/linux/thermal.h +++ b/include/linux/thermal.h @@ -426,9 +426,6 @@ static inline struct thermal_zone_device *thermal_zone_device_register( static inline void thermal_zone_device_unregister( struct thermal_zone_device *tz) { } -static inline void thermal_zone_device_update(struct thermal_zone_device *tz, - enum thermal_notify_event event) -{ } static inline struct thermal_cooling_device * thermal_cooling_device_register(char *type, void *devdata, const struct thermal_cooling_device_ops *ops) -- cgit v1.2.3
From 9554bfe403bdfc084823df8695a01f28c680af61 Mon Sep 17 00:00:00 2001 From: Tony Luck Date: Fri, 14 Feb 2020 14:27:15 -0800 Subject: x86/mce: Convert the CEC to use the MCE notifier The CEC code has its claws in a couple of routines in mce/core.c. Convert it to just register itself on the normal MCE notifier chain. [ bp: Make cec_add_elem() and cec_init() static. ] Signed-off-by: Tony Luck Signed-off-by: Borislav Petkov Tested-by: Tony Luck Link: https://lkml.kernel.org/r/20200214222720.13168-3-tony.luck@intel.com --- arch/x86/kernel/cpu/mce/core.c | 19 ------------------- drivers/ras/cec.c | 30 ++++++++++++++++++++++++++++-- include/linux/ras.h | 5 ----- 3 files changed, 28 insertions(+), 26 deletions(-) (limited to 'include/linux')
diff --git a/arch/x86/kernel/cpu/mce/core.c b/arch/x86/kernel/cpu/mce/core.c index 43b1519ad4e5..b033b3589630 100644 --- a/arch/x86/kernel/cpu/mce/core.c +++ b/arch/x86/kernel/cpu/mce/core.c @@ -544,21 +544,6 @@ bool mce_is_correctable(struct mce *m) } EXPORT_SYMBOL_GPL(mce_is_correctable); -static bool cec_add_mce(struct mce *m) -{ - if (!m) - return false; - - /* We eat only correctable DRAM errors with usable addresses.
*/ - if (mce_is_memory_error(m) && - mce_is_correctable(m) && - mce_usable_address(m)) - if (!cec_add_elem(m->addr >> PAGE_SHIFT)) - return true; - - return false; -} - static int mce_early_notifier(struct notifier_block *nb, unsigned long val, void *data) { @@ -567,9 +552,6 @@ static int mce_early_notifier(struct notifier_block *nb, unsigned long val, if (!m) return NOTIFY_DONE; - if (cec_add_mce(m)) - return NOTIFY_STOP; - /* Emit the trace record: */ trace_mce_record(m); @@ -2612,7 +2594,6 @@ static int __init mcheck_late_init(void) static_branch_inc(&mcsafe_key); mcheck_debugfs_init(); - cec_init(); /* * Flush out everything that has been logged during early boot, now that diff --git a/drivers/ras/cec.c b/drivers/ras/cec.c index c09cf55e2d20..6b42040bf956 100644 --- a/drivers/ras/cec.c +++ b/drivers/ras/cec.c @@ -309,7 +309,7 @@ static bool sanity_check(struct ce_array *ca) return ret; } -int cec_add_elem(u64 pfn) +static int cec_add_elem(u64 pfn) { struct ce_array *ca = &ce_arr; unsigned int to = 0; @@ -527,7 +527,30 @@ err: return 1; } -void __init cec_init(void) +static int cec_notifier(struct notifier_block *nb, unsigned long val, + void *data) +{ + struct mce *m = (struct mce *)data; + + if (!m) + return NOTIFY_DONE; + + /* We eat only correctable DRAM errors with usable addresses. */ + if (mce_is_memory_error(m) && + mce_is_correctable(m) && + mce_usable_address(m)) + if (!cec_add_elem(m->addr >> PAGE_SHIFT)) + return NOTIFY_STOP; + + return NOTIFY_DONE; +} + +static struct notifier_block cec_nb = { + .notifier_call = cec_notifier, + .priority = MCE_PRIO_CEC, +}; + +static void __init cec_init(void) { if (ce_arr.disabled) return; @@ -546,8 +569,11 @@ void __init cec_init(void) INIT_DELAYED_WORK(&cec_work, cec_work_fn); schedule_delayed_work(&cec_work, CEC_DECAY_DEFAULT_INTERVAL); + mce_register_decode_chain(&cec_nb); + pr_info("Correctable Errors collector initialized.\n"); } +late_initcall(cec_init); int __init parse_cec_param(char *str) { diff --git a/include/linux/ras.h b/include/linux/ras.h index 7c3debb47c87..1f4048bf2674 100644 --- a/include/linux/ras.h +++ b/include/linux/ras.h @@ -17,12 +17,7 @@ static inline int ras_add_daemon_trace(void) { return 0; } #endif #ifdef CONFIG_RAS_CEC -void __init cec_init(void); int __init parse_cec_param(char *str); -int cec_add_elem(u64 pfn); -#else -static inline void __init cec_init(void) { } -static inline int cec_add_elem(u64 pfn) { return -ENODEV; } #endif #ifdef CONFIG_RAS -- cgit v1.2.3 From 7fc0b9b995f222646ece8d5bca528060c098ee88 Mon Sep 17 00:00:00 2001 From: Tony Luck Date: Fri, 14 Feb 2020 14:27:20 -0800 Subject: EDAC: Drop the EDAC report status checks When acpi_extlog was added, we were worried that the same error would be reported more than once by different subsystems. But in the ensuing years I've seen complaints that people could not find an error log (because this mechanism suppressed the log they were looking for). Rip it all out. People are smart enough to notice the same address from different reporting mechanisms. 
Signed-off-by: Tony Luck Signed-off-by: Borislav Petkov Tested-by: Tony Luck Link: https://lkml.kernel.org/r/20200214222720.13168-8-tony.luck@intel.com --- drivers/acpi/acpi_extlog.c | 14 ----------- drivers/edac/edac_mc.c | 61 ---------------------------------------------- drivers/edac/pnd2_edac.c | 3 --- drivers/edac/sb_edac.c | 4 --- drivers/edac/skx_common.c | 3 --- include/linux/edac.h | 8 ------ 6 files changed, 93 deletions(-) (limited to 'include/linux') diff --git a/drivers/acpi/acpi_extlog.c b/drivers/acpi/acpi_extlog.c index 9cc3c1f92db5..f138e12b7b82 100644 --- a/drivers/acpi/acpi_extlog.c +++ b/drivers/acpi/acpi_extlog.c @@ -42,8 +42,6 @@ struct extlog_l1_head { u8 rev1[12]; }; -static int old_edac_report_status; - static u8 extlog_dsm_uuid[] __initdata = "663E35AF-CC10-41A4-88EA-5470AF055295"; /* L1 table related physical address */ @@ -229,11 +227,6 @@ static int __init extlog_init(void) if (!(cap & MCG_ELOG_P) || !extlog_get_l1addr()) return -ENODEV; - if (edac_get_report_status() == EDAC_REPORTING_FORCE) { - pr_warn("Not loading eMCA, error reporting force-enabled through EDAC.\n"); - return -EPERM; - } - rc = -EINVAL; /* get L1 header to fetch necessary information */ l1_hdr_size = sizeof(struct extlog_l1_head); @@ -281,12 +274,6 @@ static int __init extlog_init(void) if (elog_buf == NULL) goto err_release_elog; - /* - * eMCA event report method has higher priority than EDAC method, - * unless EDAC event report method is mandatory. - */ - old_edac_report_status = edac_get_report_status(); - edac_set_report_status(EDAC_REPORTING_DISABLED); mce_register_decode_chain(&extlog_mce_dec); /* enable OS to be involved to take over management from BIOS */ ((struct extlog_l1_head *)extlog_l1_addr)->flags |= FLAG_OS_OPTIN; @@ -308,7 +295,6 @@ err: static void __exit extlog_exit(void) { - edac_set_report_status(old_edac_report_status); mce_unregister_decode_chain(&extlog_mce_dec); ((struct extlog_l1_head *)extlog_l1_addr)->flags &= ~FLAG_OS_OPTIN; if (extlog_l1_addr) diff --git a/drivers/edac/edac_mc.c b/drivers/edac/edac_mc.c index 75ede27bdf6a..5813e931f2f0 100644 --- a/drivers/edac/edac_mc.c +++ b/drivers/edac/edac_mc.c @@ -43,8 +43,6 @@ int edac_op_state = EDAC_OPSTATE_INVAL; EXPORT_SYMBOL_GPL(edac_op_state); -static int edac_report = EDAC_REPORTING_ENABLED; - /* lock to memory controller's control array */ static DEFINE_MUTEX(mem_ctls_mutex); static LIST_HEAD(mc_devices); @@ -60,65 +58,6 @@ static struct mem_ctl_info *error_desc_to_mci(struct edac_raw_error_desc *e) return container_of(e, struct mem_ctl_info, error_desc); } -int edac_get_report_status(void) -{ - return edac_report; -} -EXPORT_SYMBOL_GPL(edac_get_report_status); - -void edac_set_report_status(int new) -{ - if (new == EDAC_REPORTING_ENABLED || - new == EDAC_REPORTING_DISABLED || - new == EDAC_REPORTING_FORCE) - edac_report = new; -} -EXPORT_SYMBOL_GPL(edac_set_report_status); - -static int edac_report_set(const char *str, const struct kernel_param *kp) -{ - if (!str) - return -EINVAL; - - if (!strncmp(str, "on", 2)) - edac_report = EDAC_REPORTING_ENABLED; - else if (!strncmp(str, "off", 3)) - edac_report = EDAC_REPORTING_DISABLED; - else if (!strncmp(str, "force", 5)) - edac_report = EDAC_REPORTING_FORCE; - - return 0; -} - -static int edac_report_get(char *buffer, const struct kernel_param *kp) -{ - int ret = 0; - - switch (edac_report) { - case EDAC_REPORTING_ENABLED: - ret = sprintf(buffer, "on"); - break; - case EDAC_REPORTING_DISABLED: - ret = sprintf(buffer, "off"); - break; - case EDAC_REPORTING_FORCE: - ret 
= sprintf(buffer, "force"); - break; - default: - ret = -EINVAL; - break; - } - - return ret; -} - -static const struct kernel_param_ops edac_report_ops = { - .set = edac_report_set, - .get = edac_report_get, -}; - -module_param_cb(edac_report, &edac_report_ops, &edac_report, 0644); - unsigned int edac_dimm_info_location(struct dimm_info *dimm, char *buf, unsigned int len) {
diff --git a/drivers/edac/pnd2_edac.c b/drivers/edac/pnd2_edac.c index 1929a5dc8f94..c1f2e6deb021 100644 --- a/drivers/edac/pnd2_edac.c +++ b/drivers/edac/pnd2_edac.c @@ -1396,9 +1396,6 @@ static int pnd2_mce_check_error(struct notifier_block *nb, unsigned long val, vo struct dram_addr daddr; char *type; - if (edac_get_report_status() == EDAC_REPORTING_DISABLED) - return NOTIFY_DONE; - mci = pnd2_mci; if (!mci || (mce->kflags & MCE_HANDLED_CEC)) return NOTIFY_DONE;
diff --git a/drivers/edac/sb_edac.c b/drivers/edac/sb_edac.c index f790f7d08688..d414698ca324 100644 --- a/drivers/edac/sb_edac.c +++ b/drivers/edac/sb_edac.c @@ -3134,8 +3134,6 @@ static int sbridge_mce_check_error(struct notifier_block *nb, unsigned long val, struct mem_ctl_info *mci; char *type; - if (edac_get_report_status() == EDAC_REPORTING_DISABLED) - return NOTIFY_DONE; if (mce->kflags & MCE_HANDLED_CEC) return NOTIFY_DONE; @@ -3526,8 +3524,6 @@ static int __init sbridge_init(void) if (rc >= 0) { mce_register_decode_chain(&sbridge_mce_dec); - if (edac_get_report_status() == EDAC_REPORTING_DISABLED) - sbridge_printk(KERN_WARNING, "Loading driver, error reporting disabled.\n"); return 0; }
diff --git a/drivers/edac/skx_common.c b/drivers/edac/skx_common.c index 6f08a12f6b11..423d33aef54f 100644 --- a/drivers/edac/skx_common.c +++ b/drivers/edac/skx_common.c @@ -574,9 +574,6 @@ int skx_mce_check_error(struct notifier_block *nb, unsigned long val, struct mem_ctl_info *mci; char *type; - if (edac_get_report_status() == EDAC_REPORTING_DISABLED) - return NOTIFY_DONE; - if (mce->kflags & MCE_HANDLED_CEC) return NOTIFY_DONE;
diff --git a/include/linux/edac.h b/include/linux/edac.h index 0f20b986b0ab..6eb7d55d7c3d 100644 --- a/include/linux/edac.h +++ b/include/linux/edac.h @@ -31,14 +31,6 @@ struct device; extern int edac_op_state; struct bus_type *edac_get_sysfs_subsys(void); -int edac_get_report_status(void); -void edac_set_report_status(int new); - -enum { - EDAC_REPORTING_ENABLED, - EDAC_REPORTING_DISABLED, - EDAC_REPORTING_FORCE -}; static inline void opstate_init(void) { -- cgit v1.2.3
From 12479382877dcf6623af4676caa8d3c647469a1b Mon Sep 17 00:00:00 2001 From: Michael Walle Date: Thu, 2 Apr 2020 22:36:44 +0200 Subject: regmap-irq: make it possible to add irq_chip to a specific device node Add a new function regmap_add_irq_chip_np() with its corresponding devm_regmap_add_irq_chip_np() variant. Sometimes one wants to register the IRQ domain on a different device node than the one of the regmap, for example when using an MFD where there are different interrupt controllers, and particularly for the generic regmap gpio_chip/irq_chip driver. In this case it is not desirable to have the IRQ domain on the parent node.
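A hedged usage sketch of the new *_np() variant (hypothetical MFD child driver; the chip definition, the child node name and the function names are assumptions):

	/* Hypothetical regmap_irq_chip; register layout omitted for brevity. */
	static const struct regmap_irq_chip example_irq_chip = {
		.name = "example",
	};

	static int example_mfd_child_init(struct device *dev, struct regmap *map,
					  int irq)
	{
		struct device_node *np;
		struct regmap_irq_chip_data *irq_data;
		int ret;

		/* Put the IRQ domain on a child node, not on the regmap's node. */
		np = of_get_child_by_name(dev->of_node, "interrupt-controller");
		if (!np)
			return -ENODEV;

		ret = devm_regmap_add_irq_chip_np(dev, np, map, irq, IRQF_ONESHOT,
						  0, &example_irq_chip, &irq_data);
		of_node_put(np);

		return ret;
	}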
Signed-off-by: Michael Walle Link: https://lore.kernel.org/r/20200402203656.27047-5-michael@walle.cc Signed-off-by: Mark Brown --- drivers/base/regmap/regmap-irq.c | 84 ++++++++++++++++++++++++++++++++-------- include/linux/regmap.h | 10 +++++ 2 files changed, 78 insertions(+), 16 deletions(-) (limited to 'include/linux')
diff --git a/drivers/base/regmap/regmap-irq.c b/drivers/base/regmap/regmap-irq.c index 3d64c9331a82..4340e1d268b6 100644 --- a/drivers/base/regmap/regmap-irq.c +++ b/drivers/base/regmap/regmap-irq.c @@ -541,8 +541,9 @@ static const struct irq_domain_ops regmap_domain_ops = { }; /** - * regmap_add_irq_chip() - Use standard regmap IRQ controller handling + * regmap_add_irq_chip_np() - Use standard regmap IRQ controller handling * + * @np: The device_node where the IRQ domain should be added to. * @map: The regmap for the device. * @irq: The IRQ the device uses to signal interrupts. * @irq_flags: The IRQF_ flags to use for the primary interrupt. @@ -556,9 +557,10 @@ static const struct irq_domain_ops regmap_domain_ops = { * register cache. The chip driver is responsible for restoring the * register values used by the IRQ controller over suspend and resume. */ -int regmap_add_irq_chip(struct regmap *map, int irq, int irq_flags, - int irq_base, const struct regmap_irq_chip *chip, - struct regmap_irq_chip_data **data) +int regmap_add_irq_chip_np(struct device_node *np, struct regmap *map, int irq, + int irq_flags, int irq_base, + const struct regmap_irq_chip *chip, + struct regmap_irq_chip_data **data) { struct regmap_irq_chip_data *d; int i; @@ -769,12 +771,10 @@ int regmap_add_irq_chip(struct regmap *map, int irq, int irq_flags, } if (irq_base) - d->domain = irq_domain_add_legacy(map->dev->of_node, - chip->num_irqs, irq_base, 0, - &regmap_domain_ops, d); + d->domain = irq_domain_add_legacy(np, chip->num_irqs, irq_base, + 0, &regmap_domain_ops, d); else - d->domain = irq_domain_add_linear(map->dev->of_node, - chip->num_irqs, + d->domain = irq_domain_add_linear(np, chip->num_irqs, &regmap_domain_ops, d); if (!d->domain) { dev_err(map->dev, "Failed to create IRQ domain\n"); @@ -808,6 +808,30 @@ err_alloc: kfree(d); return ret; } +EXPORT_SYMBOL_GPL(regmap_add_irq_chip_np); + +/** + * regmap_add_irq_chip() - Use standard regmap IRQ controller handling + * + * @map: The regmap for the device. + * @irq: The IRQ the device uses to signal interrupts. + * @irq_flags: The IRQF_ flags to use for the primary interrupt. + * @irq_base: Allocate at specific IRQ number if irq_base > 0. + * @chip: Configuration for the interrupt controller. + * @data: Runtime data structure for the controller, allocated on success. + * + * Returns 0 on success or an errno on failure. + * + * This is the same as regmap_add_irq_chip_np, except that the device + * node of the regmap is used. + */ +int regmap_add_irq_chip(struct regmap *map, int irq, int irq_flags, + int irq_base, const struct regmap_irq_chip *chip, + struct regmap_irq_chip_data **data) +{ + return regmap_add_irq_chip_np(map->dev->of_node, map, irq, irq_flags, + irq_base, chip, data); +} EXPORT_SYMBOL_GPL(regmap_add_irq_chip); /** @@ -875,9 +899,10 @@ static int devm_regmap_irq_chip_match(struct device *dev, void *res, void *data) } /** - * devm_regmap_add_irq_chip() - Resource manager regmap_add_irq_chip() + * devm_regmap_add_irq_chip_np() - Resource manager regmap_add_irq_chip_np() * * @dev: The device pointer on which irq_chip belongs to. + * @np: The device_node where the IRQ domain should be added to. * @map: The regmap for the device.
* @irq: The IRQ the device uses to signal interrupts * @irq_flags: The IRQF_ flags to use for the primary interrupt. @@ -890,10 +915,11 @@ static int devm_regmap_irq_chip_match(struct device *dev, void *res, void *data) * The &regmap_irq_chip_data will be automatically released when the device is * unbound. */ -int devm_regmap_add_irq_chip(struct device *dev, struct regmap *map, int irq, - int irq_flags, int irq_base, - const struct regmap_irq_chip *chip, - struct regmap_irq_chip_data **data) +int devm_regmap_add_irq_chip_np(struct device *dev, struct device_node *np, + struct regmap *map, int irq, int irq_flags, + int irq_base, + const struct regmap_irq_chip *chip, + struct regmap_irq_chip_data **data) { struct regmap_irq_chip_data **ptr, *d; int ret; @@ -903,8 +929,8 @@ int devm_regmap_add_irq_chip(struct device *dev, struct regmap *map, int irq, if (!ptr) return -ENOMEM; - ret = regmap_add_irq_chip(map, irq, irq_flags, irq_base, - chip, &d); + ret = regmap_add_irq_chip_np(np, map, irq, irq_flags, irq_base, + chip, &d); if (ret < 0) { devres_free(ptr); return ret; } @@ -915,6 +941,32 @@ int devm_regmap_add_irq_chip(struct device *dev, struct regmap *map, int irq, *data = d; return 0; } +EXPORT_SYMBOL_GPL(devm_regmap_add_irq_chip_np); + +/** + * devm_regmap_add_irq_chip() - Resource manager regmap_add_irq_chip() + * + * @dev: The device pointer on which irq_chip belongs to. + * @map: The regmap for the device. + * @irq: The IRQ the device uses to signal interrupts + * @irq_flags: The IRQF_ flags to use for the primary interrupt. + * @irq_base: Allocate at specific IRQ number if irq_base > 0. + * @chip: Configuration for the interrupt controller. + * @data: Runtime data structure for the controller, allocated on success + * + * Returns 0 on success or an errno on failure. + * + * The &regmap_irq_chip_data will be automatically released when the device is + * unbound.
+ */ +int devm_regmap_add_irq_chip(struct device *dev, struct regmap *map, int irq, + int irq_flags, int irq_base, + const struct regmap_irq_chip *chip, + struct regmap_irq_chip_data **data) +{ + return devm_regmap_add_irq_chip_np(dev, map->dev->of_node, map, irq, + irq_flags, irq_base, chip, data); +} EXPORT_SYMBOL_GPL(devm_regmap_add_irq_chip); /**
diff --git a/include/linux/regmap.h b/include/linux/regmap.h index 40b07168fd8e..ae5034b2d7c3 100644 --- a/include/linux/regmap.h +++ b/include/linux/regmap.h @@ -21,6 +21,7 @@ struct module; struct clk; struct device; +struct device_node; struct i2c_client; struct i3c_device; struct irq_domain; @@ -1310,12 +1311,21 @@ struct regmap_irq_chip_data; int regmap_add_irq_chip(struct regmap *map, int irq, int irq_flags, int irq_base, const struct regmap_irq_chip *chip, struct regmap_irq_chip_data **data); +int regmap_add_irq_chip_np(struct device_node *np, struct regmap *map, int irq, + int irq_flags, int irq_base, + const struct regmap_irq_chip *chip, + struct regmap_irq_chip_data **data); void regmap_del_irq_chip(int irq, struct regmap_irq_chip_data *data); int devm_regmap_add_irq_chip(struct device *dev, struct regmap *map, int irq, int irq_flags, int irq_base, const struct regmap_irq_chip *chip, struct regmap_irq_chip_data **data); +int devm_regmap_add_irq_chip_np(struct device *dev, struct device_node *np, + struct regmap *map, int irq, int irq_flags, + int irq_base, + const struct regmap_irq_chip *chip, + struct regmap_irq_chip_data **data); void devm_regmap_del_irq_chip(struct device *dev, int irq, struct regmap_irq_chip_data *data); -- cgit v1.2.3
From bd3ddb495762575ab14e7bd2e4017dc1f9a80b2f Mon Sep 17 00:00:00 2001 From: Marco Felsch Date: Thu, 2 Apr 2020 10:41:11 +0200 Subject: regmap: add reg_sequence helpers Add helpers to make it easier to define a reg_sequence array. Signed-off-by: Marco Felsch Link: https://lore.kernel.org/r/20200402084111.30123-1-m.felsch@pengutronix.de Signed-off-by: Mark Brown --- include/linux/regmap.h | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'include/linux')
diff --git a/include/linux/regmap.h b/include/linux/regmap.h index 40b07168fd8e..0b5582a78df8 100644 --- a/include/linux/regmap.h +++ b/include/linux/regmap.h @@ -71,6 +71,13 @@ struct reg_sequence { unsigned int delay_us; }; +#define REG_SEQ(_reg, _def, _delay_us) { \ + .reg = _reg, \ + .def = _def, \ + .delay_us = _delay_us, \ + } +#define REG_SEQ0(_reg, _def) REG_SEQ(_reg, _def, 0) + #define regmap_update_bits(map, reg, mask, val) \ regmap_update_bits_base(map, reg, mask, val, NULL, false, false) #define regmap_update_bits_async(map, reg, mask, val)\ -- cgit v1.2.3
From 4c5b566c2193e2af82c891daa5303c8899e61044 Mon Sep 17 00:00:00 2001 From: Kairui Song Date: Tue, 31 Mar 2020 02:15:44 +0800 Subject: crash_dump: Remove no longer used saved_max_pfn saved_max_pfn was originally introduced in commit 92aa63a5a1bf ("[PATCH] kdump: Retrieve saved max pfn"). It used to make sure that the user does not try to read the physical memory beyond saved_max_pfn. But since commit 921d58c0e699 ("vmcore: remove saved_max_pfn check") it's no longer used for the check. This variable doesn't have any users anymore so just remove it. [ bp: Drop the Calgary IOMMU reference from the commit message. ] Signed-off-by: Kairui Song Signed-off-by: Borislav Petkov Acked-by: "Eric W.
Biederman" Link: https://lkml.kernel.org/r/20200330181544.1595733-1-kasong@redhat.com --- arch/x86/kernel/e820.c | 8 -------- include/linux/crash_dump.h | 2 -- kernel/crash_dump.c | 6 ------ 3 files changed, 16 deletions(-) (limited to 'include/linux') diff --git a/arch/x86/kernel/e820.c b/arch/x86/kernel/e820.c index c5399e80c59c..4d13c57f370a 100644 --- a/arch/x86/kernel/e820.c +++ b/arch/x86/kernel/e820.c @@ -910,14 +910,6 @@ static int __init parse_memmap_one(char *p) return -EINVAL; if (!strncmp(p, "exactmap", 8)) { -#ifdef CONFIG_CRASH_DUMP - /* - * If we are doing a crash dump, we still need to know - * the real memory size before the original memory map is - * reset. - */ - saved_max_pfn = e820__end_of_ram_pfn(); -#endif e820_table->nr_entries = 0; userdef = 1; return 0; diff --git a/include/linux/crash_dump.h b/include/linux/crash_dump.h index 4664fc1871de..bc156285d097 100644 --- a/include/linux/crash_dump.h +++ b/include/linux/crash_dump.h @@ -97,8 +97,6 @@ extern void unregister_oldmem_pfn_is_ram(void); static inline bool is_kdump_kernel(void) { return 0; } #endif /* CONFIG_CRASH_DUMP */ -extern unsigned long saved_max_pfn; - /* Device Dump information to be filled by drivers */ struct vmcoredd_data { char dump_name[VMCOREDD_MAX_NAME_BYTES]; /* Unique name of the dump */ diff --git a/kernel/crash_dump.c b/kernel/crash_dump.c index 9c23ae074b40..92da32275af5 100644 --- a/kernel/crash_dump.c +++ b/kernel/crash_dump.c @@ -5,12 +5,6 @@ #include #include -/* - * If we have booted due to a crash, max_pfn will be a very low value. We need - * to know the amount of memory that the previous kernel used. - */ -unsigned long saved_max_pfn; - /* * stores the physical address of elf header of crash image * -- cgit v1.2.3 From 5429ef62bcf360aae06740cbe065be01e5cfb6fc Mon Sep 17 00:00:00 2001 From: Will Deacon Date: Wed, 22 Jan 2020 19:38:21 +0000 Subject: compiler/gcc: Raise minimum GCC version for kernel builds to 4.8 It is very rare to see versions of GCC prior to 4.8 being used to build the mainline kernel. These old compilers are also know to have codegen issues which can lead to silent miscompilation: https://gcc.gnu.org/bugzilla/show_bug.cgi?id=58145 Raise the minimum GCC version for kernel build to 4.8 and remove some tautological Kconfig dependencies as a consequence. Cc: Masahiro Yamada Acked-by: Arnd Bergmann Reviewed-by: Nick Desaulniers Signed-off-by: Will Deacon --- Documentation/process/changes.rst | 2 +- arch/arm/crypto/Kconfig | 12 ++++++------ crypto/Kconfig | 1 - include/linux/compiler-gcc.h | 5 ++--- init/Kconfig | 1 - scripts/gcc-plugins/Kconfig | 2 +- 6 files changed, 10 insertions(+), 13 deletions(-) (limited to 'include/linux') diff --git a/Documentation/process/changes.rst b/Documentation/process/changes.rst index 91c5ff8e161e..5cfb54c2aaa6 100644 --- a/Documentation/process/changes.rst +++ b/Documentation/process/changes.rst @@ -29,7 +29,7 @@ you probably needn't concern yourself with pcmciautils. 
====================== =============== ======================================== Program Minimal version Command to check the version ====================== =============== ======================================== -GNU C 4.6 gcc --version +GNU C 4.8 gcc --version GNU make 3.81 make --version binutils 2.23 ld -v flex 2.5.35 flex --version diff --git a/arch/arm/crypto/Kconfig b/arch/arm/crypto/Kconfig index 2674de6ada1f..c9bf2df85cb9 100644 --- a/arch/arm/crypto/Kconfig +++ b/arch/arm/crypto/Kconfig @@ -30,7 +30,7 @@ config CRYPTO_SHA1_ARM_NEON config CRYPTO_SHA1_ARM_CE tristate "SHA1 digest algorithm (ARM v8 Crypto Extensions)" - depends on KERNEL_MODE_NEON && (CC_IS_CLANG || GCC_VERSION >= 40800) + depends on KERNEL_MODE_NEON select CRYPTO_SHA1_ARM select CRYPTO_HASH help @@ -39,7 +39,7 @@ config CRYPTO_SHA1_ARM_CE config CRYPTO_SHA2_ARM_CE tristate "SHA-224/256 digest algorithm (ARM v8 Crypto Extensions)" - depends on KERNEL_MODE_NEON && (CC_IS_CLANG || GCC_VERSION >= 40800) + depends on KERNEL_MODE_NEON select CRYPTO_SHA256_ARM select CRYPTO_HASH help @@ -96,7 +96,7 @@ config CRYPTO_AES_ARM_BS config CRYPTO_AES_ARM_CE tristate "Accelerated AES using ARMv8 Crypto Extensions" - depends on KERNEL_MODE_NEON && (CC_IS_CLANG || GCC_VERSION >= 40800) + depends on KERNEL_MODE_NEON select CRYPTO_SKCIPHER select CRYPTO_LIB_AES select CRYPTO_SIMD @@ -106,7 +106,7 @@ config CRYPTO_AES_ARM_CE config CRYPTO_GHASH_ARM_CE tristate "PMULL-accelerated GHASH using NEON/ARMv8 Crypto Extensions" - depends on KERNEL_MODE_NEON && (CC_IS_CLANG || GCC_VERSION >= 40800) + depends on KERNEL_MODE_NEON select CRYPTO_HASH select CRYPTO_CRYPTD select CRYPTO_GF128MUL @@ -118,13 +118,13 @@ config CRYPTO_GHASH_ARM_CE config CRYPTO_CRCT10DIF_ARM_CE tristate "CRCT10DIF digest algorithm using PMULL instructions" - depends on KERNEL_MODE_NEON && (CC_IS_CLANG || GCC_VERSION >= 40800) + depends on KERNEL_MODE_NEON depends on CRC_T10DIF select CRYPTO_HASH config CRYPTO_CRC32_ARM_CE tristate "CRC32(C) digest algorithm using CRC and/or PMULL instructions" - depends on KERNEL_MODE_NEON && (CC_IS_CLANG || GCC_VERSION >= 40800) + depends on KERNEL_MODE_NEON depends on CRC32 select CRYPTO_HASH diff --git a/crypto/Kconfig b/crypto/Kconfig index c24a47406f8f..34a8c5bfd062 100644 --- a/crypto/Kconfig +++ b/crypto/Kconfig @@ -316,7 +316,6 @@ config CRYPTO_AEGIS128 config CRYPTO_AEGIS128_SIMD bool "Support SIMD acceleration for AEGIS-128" depends on CRYPTO_AEGIS128 && ((ARM || ARM64) && KERNEL_MODE_NEON) - depends on !ARM || CC_IS_CLANG || GCC_VERSION >= 40800 default y config CRYPTO_AEGIS128_AESNI_SSE2 diff --git a/include/linux/compiler-gcc.h b/include/linux/compiler-gcc.h index d7ee4c6bad48..e2f725273261 100644 --- a/include/linux/compiler-gcc.h +++ b/include/linux/compiler-gcc.h @@ -10,7 +10,8 @@ + __GNUC_MINOR__ * 100 \ + __GNUC_PATCHLEVEL__) -#if GCC_VERSION < 40600 +/* https://gcc.gnu.org/bugzilla/show_bug.cgi?id=58145 */ +#if GCC_VERSION < 40800 # error Sorry, your compiler is too old - please upgrade it. 
#endif @@ -126,9 +127,7 @@ #if defined(CONFIG_ARCH_USE_BUILTIN_BSWAP) && !defined(__CHECKER__) #define __HAVE_BUILTIN_BSWAP32__ #define __HAVE_BUILTIN_BSWAP64__ -#if GCC_VERSION >= 40800 #define __HAVE_BUILTIN_BSWAP16__ -#endif #endif /* CONFIG_ARCH_USE_BUILTIN_BSWAP && !__CHECKER__ */ #if GCC_VERSION >= 70000 diff --git a/init/Kconfig b/init/Kconfig index 9e22ee8fbd75..035d38a4f9ad 100644 --- a/init/Kconfig +++ b/init/Kconfig @@ -1285,7 +1285,6 @@ config LD_DEAD_CODE_DATA_ELIMINATION bool "Dead code and data elimination (EXPERIMENTAL)" depends on HAVE_LD_DEAD_CODE_DATA_ELIMINATION depends on EXPERT - depends on !(FUNCTION_TRACER && CC_IS_GCC && GCC_VERSION < 40800) depends on $(cc-option,-ffunction-sections -fdata-sections) depends on $(ld-option,--gc-sections) help diff --git a/scripts/gcc-plugins/Kconfig b/scripts/gcc-plugins/Kconfig index 013ba3a57669..ce0b99fb5847 100644 --- a/scripts/gcc-plugins/Kconfig +++ b/scripts/gcc-plugins/Kconfig @@ -8,7 +8,7 @@ config HAVE_GCC_PLUGINS menuconfig GCC_PLUGINS bool "GCC plugins" depends on HAVE_GCC_PLUGINS - depends on CC_IS_GCC && GCC_VERSION >= 40800 + depends on CC_IS_GCC depends on $(success,$(srctree)/scripts/gcc-plugin.sh $(CC)) default y help -- cgit v1.2.3 From a5460b5e5fb82656807840d40d3deaecad094044 Mon Sep 17 00:00:00 2001 From: Will Deacon Date: Mon, 16 Dec 2019 16:51:45 +0000 Subject: READ_ONCE: Simplify implementations of {READ,WRITE}_ONCE() The implementations of {READ,WRITE}_ONCE() suffer from a significant amount of indirection and complexity due to a historic GCC bug: https://gcc.gnu.org/bugzilla/show_bug.cgi?id=58145 which was originally worked around by 230fa253df63 ("kernel: Provide READ_ONCE and ASSIGN_ONCE"). Since GCC 4.8 is fairly vintage at this point and we emit a warning if we detect it during the build, return {READ,WRITE}_ONCE() to their former glory with an implementation that is easier to understand and, crucially, more amenable to optimisation. A side effect of this simplification is that WRITE_ONCE() no longer returns a value, but nobody seems to be relying on that and the new behaviour is aligned with smp_store_release(). Suggested-by: Linus Torvalds Cc: Peter Zijlstra Cc: Michael Ellerman Cc: Arnd Bergmann Cc: Christian Borntraeger Signed-off-by: Will Deacon --- include/linux/compiler.h | 118 ++++++++++++++++------------------------------- 1 file changed, 39 insertions(+), 79 deletions(-) (limited to 'include/linux') diff --git a/include/linux/compiler.h b/include/linux/compiler.h index 034b0a644efc..338111a448d0 100644 --- a/include/linux/compiler.h +++ b/include/linux/compiler.h @@ -177,60 +177,6 @@ void ftrace_likely_update(struct ftrace_likely_data *f, int val, # define __UNIQUE_ID(prefix) __PASTE(__PASTE(__UNIQUE_ID_, prefix), __LINE__) #endif -#include - -#define __READ_ONCE_SIZE \ -({ \ - switch (size) { \ - case 1: *(__u8 *)res = *(volatile __u8 *)p; break; \ - case 2: *(__u16 *)res = *(volatile __u16 *)p; break; \ - case 4: *(__u32 *)res = *(volatile __u32 *)p; break; \ - case 8: *(__u64 *)res = *(volatile __u64 *)p; break; \ - default: \ - barrier(); \ - __builtin_memcpy((void *)res, (const void *)p, size); \ - barrier(); \ - } \ -}) - -static __always_inline -void __read_once_size(const volatile void *p, void *res, int size) -{ - __READ_ONCE_SIZE; -} - -#ifdef CONFIG_KASAN -/* - * We can't declare function 'inline' because __no_sanitize_address confilcts - * with inlining. Attempt to inline it may cause a build failure. 
- * https://gcc.gnu.org/bugzilla/show_bug.cgi?id=67368 - * '__maybe_unused' allows us to avoid defined-but-not-used warnings. - */ -# define __no_kasan_or_inline __no_sanitize_address notrace __maybe_unused -#else -# define __no_kasan_or_inline __always_inline -#endif - -static __no_kasan_or_inline -void __read_once_size_nocheck(const volatile void *p, void *res, int size) -{ - __READ_ONCE_SIZE; -} - -static __always_inline void __write_once_size(volatile void *p, void *res, int size) -{ - switch (size) { - case 1: *(volatile __u8 *)p = *(__u8 *)res; break; - case 2: *(volatile __u16 *)p = *(__u16 *)res; break; - case 4: *(volatile __u32 *)p = *(__u32 *)res; break; - case 8: *(volatile __u64 *)p = *(__u64 *)res; break; - default: - barrier(); - __builtin_memcpy((void *)p, (const void *)res, size); - barrier(); - } -} - /* * Prevent the compiler from merging or refetching reads or writes. The * compiler is also forbidden from reordering successive instances of @@ -240,11 +186,7 @@ static __always_inline void __write_once_size(volatile void *p, void *res, int s * statements. * * These two macros will also work on aggregate data types like structs or - * unions. If the size of the accessed data type exceeds the word size of - * the machine (e.g., 32 bits or 64 bits) READ_ONCE() and WRITE_ONCE() will - * fall back to memcpy(). There's at least two memcpy()s: one for the - * __builtin_memcpy() and then one for the macro doing the copy of variable - * - '__u' allocated on the stack. + * unions. * * Their two major use cases are: (1) Mediating communication between * process-level code and irq/NMI handlers, all running on the same CPU, @@ -256,23 +198,49 @@ static __always_inline void __write_once_size(volatile void *p, void *res, int s #include #include -#define __READ_ONCE(x, check) \ +#define __READ_ONCE(x) (*(volatile typeof(x) *)&(x)) + +#define READ_ONCE(x) \ ({ \ - union { typeof(x) __val; char __c[1]; } __u; \ - if (check) \ - __read_once_size(&(x), __u.__c, sizeof(x)); \ - else \ - __read_once_size_nocheck(&(x), __u.__c, sizeof(x)); \ - smp_read_barrier_depends(); /* Enforce dependency ordering from x */ \ - __u.__val; \ + typeof(x) __x = __READ_ONCE(x); \ + smp_read_barrier_depends(); \ + __x; \ }) -#define READ_ONCE(x) __READ_ONCE(x, 1) + +#define WRITE_ONCE(x, val) \ +do { \ + *(volatile typeof(x) *)&(x) = (val); \ +} while (0) + +#ifdef CONFIG_KASAN +/* + * We can't declare function 'inline' because __no_sanitize_address conflicts + * with inlining. Attempt to inline it may cause a build failure. + * https://gcc.gnu.org/bugzilla/show_bug.cgi?id=67368 + * '__maybe_unused' allows us to avoid defined-but-not-used warnings. + */ +# define __no_kasan_or_inline __no_sanitize_address notrace __maybe_unused +#else +# define __no_kasan_or_inline __always_inline +#endif + +static __no_kasan_or_inline +unsigned long __read_once_word_nocheck(const void *addr) +{ + return __READ_ONCE(*(unsigned long *)addr); +} /* - * Use READ_ONCE_NOCHECK() instead of READ_ONCE() if you need - * to hide memory access from KASAN. + * Use READ_ONCE_NOCHECK() instead of READ_ONCE() if you need to load a + * word from memory atomically but without telling KASAN. This is usually + * used by unwinding code when walking the stack of a running process. 
*/ -#define READ_ONCE_NOCHECK(x) __READ_ONCE(x, 0) +#define READ_ONCE_NOCHECK(x) \ +({ \ + unsigned long __x = __read_once_word_nocheck(&(x)); \ + smp_read_barrier_depends(); \ + __x; \ +}) static __no_kasan_or_inline unsigned long read_word_at_a_time(const void *addr) @@ -281,14 +249,6 @@ unsigned long read_word_at_a_time(const void *addr) return *(unsigned long *)addr; } -#define WRITE_ONCE(x, val) \ -({ \ - union { typeof(x) __val; char __c[1]; } __u = \ - { .__val = (__force typeof(x)) (val) }; \ - __write_once_size(&(x), __u.__c, sizeof(x)); \ - __u.__val; \ -}) - #endif /* __KERNEL__ */ /* -- cgit v1.2.3 From 9e343b467c70379e66b8b771d96f03ae23eba351 Mon Sep 17 00:00:00 2001 From: Will Deacon Date: Fri, 13 Dec 2019 14:47:02 +0000 Subject: READ_ONCE: Enforce atomicity for {READ,WRITE}_ONCE() memory accesses {READ,WRITE}_ONCE() cannot guarantee atomicity for arbitrary data sizes. This can be surprising to callers that might incorrectly be expecting atomicity for accesses to aggregate structures, although there are other callers where tearing is actually permissible (e.g. if they are using something akin to sequence locking to protect the access). Linus sayeth: | We could also look at being stricter for the normal READ/WRITE_ONCE(), | and require that they are | | (a) regular integer types | | (b) fit in an atomic word | | We actually did (b) for a while, until we noticed that we do it on | loff_t's etc and relaxed the rules. But maybe we could have a | "non-atomic" version of READ/WRITE_ONCE() that is used for the | questionable cases? The slight snag is that we also have to support 64-bit accesses on 32-bit architectures, as these appear to be widespread and tend to work out ok if either the architecture supports atomic 64-bit accesses (x86, armv7) or if the variable being accessed represents a virtual address and therefore only requires 32-bit atomicity in practice. Take a step in that direction by introducing a variant of 'compiletime_assert_atomic_type()' and use it to check the pointer argument to {READ,WRITE}_ONCE(). Expose __{READ,WRITE}_ONCE() variants which are allowed to tear and convert the one broken caller over to the new macros. Suggested-by: Linus Torvalds Cc: Peter Zijlstra Cc: Michael Ellerman Cc: Arnd Bergmann Signed-off-by: Will Deacon --- drivers/xen/time.c | 2 +- include/linux/compiler.h | 33 ++++++++++++++++++++++++++++++--- 2 files changed, 31 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/drivers/xen/time.c b/drivers/xen/time.c index 0968859c29d0..108edbcbc040 100644 --- a/drivers/xen/time.c +++ b/drivers/xen/time.c @@ -64,7 +64,7 @@ static void xen_get_runstate_snapshot_cpu_delta( do { state_time = get64(&state->state_entry_time); rmb(); /* Hypervisor might update data. */ - *res = READ_ONCE(*state); + *res = __READ_ONCE(*state); rmb(); /* Hypervisor might update data. */ } while (get64(&state->state_entry_time) != state_time || (state_time & XEN_RUNSTATE_UPDATE)); diff --git a/include/linux/compiler.h b/include/linux/compiler.h index 338111a448d0..50bb2461648f 100644 --- a/include/linux/compiler.h +++ b/include/linux/compiler.h @@ -198,20 +198,37 @@ void ftrace_likely_update(struct ftrace_likely_data *f, int val, #include #include -#define __READ_ONCE(x) (*(volatile typeof(x) *)&(x)) +/* + * Use __READ_ONCE() instead of READ_ONCE() if you do not require any + * atomicity or dependency ordering guarantees. Note that this may result + * in tears!
+ */ +#define __READ_ONCE(x) (*(const volatile typeof(x) *)&(x)) -#define READ_ONCE(x) \ +#define __READ_ONCE_SCALAR(x) \ ({ \ typeof(x) __x = __READ_ONCE(x); \ smp_read_barrier_depends(); \ __x; \ }) -#define WRITE_ONCE(x, val) \ +#define READ_ONCE(x) \ +({ \ + compiletime_assert_rwonce_type(x); \ + __READ_ONCE_SCALAR(x); \ +}) + +#define __WRITE_ONCE(x, val) \ do { \ *(volatile typeof(x) *)&(x) = (val); \ } while (0) +#define WRITE_ONCE(x, val) \ +do { \ + compiletime_assert_rwonce_type(x); \ + __WRITE_ONCE(x, val); \ +} while (0) + #ifdef CONFIG_KASAN /* * We can't declare function 'inline' because __no_sanitize_address conflicts @@ -313,6 +330,16 @@ static inline void *offset_to_ptr(const int *off) compiletime_assert(__native_word(t), \ "Need native word sized stores/loads for atomicity.") +/* + * Yes, this permits 64-bit accesses on 32-bit architectures. These will + * actually be atomic in many cases (namely x86), but for others we rely on + * the access being split into 2x32-bit accesses for a 32-bit quantity (e.g. + * a virtual address) and a strong prevailing wind. + */ +#define compiletime_assert_rwonce_type(t) \ + compiletime_assert(__native_word(t) || sizeof(t) == sizeof(long long), \ + "Unsupported access size for {READ,WRITE}_ONCE().") + /* &a[0] degrades to a pointer: a different type from an array */ #define __must_be_array(a) BUILD_BUG_ON_ZERO(__same_type((a), &(a)[0])) -- cgit v1.2.3 From dee081bf8f824cabeb7c7495367d5dad0a444eb1 Mon Sep 17 00:00:00 2001 From: Will Deacon Date: Thu, 19 Dec 2019 14:22:31 +0000 Subject: READ_ONCE: Drop pointer qualifiers when reading from scalar types Passing a volatile-qualified pointer to READ_ONCE() is an absolute trainwreck for code generation: the use of 'typeof()' to define a temporary variable inside the macro means that the final evaluation in macro scope ends up forcing a read back from the stack. When stack protector is enabled (the default for arm64, at least), this causes the compiler to vomit up all sorts of junk. Unfortunately, dropping pointer qualifiers inside the macro poses quite a challenge, especially since the pointed-to type is permitted to be an aggregate, and this is relied upon by mm/ code accessing things like 'pmd_t'. Based on numerous hacks and discussions on the mailing list, this is the best I've managed to come up with. Introduce '__unqual_scalar_typeof()' which takes an expression and, if the expression is an optionally qualified 8, 16, 32 or 64-bit scalar type, evaluates to the unqualified type. Other input types, including aggregates, remain unchanged. Hopefully READ_ONCE() on volatile aggregate pointers isn't something we do on a fast-path. Cc: Peter Zijlstra Cc: Arnd Bergmann Suggested-by: Linus Torvalds Reported-by: Michael Ellerman Signed-off-by: Will Deacon --- include/linux/compiler.h | 6 +++--- include/linux/compiler_types.h | 21 +++++++++++++++++++++ 2 files changed, 24 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/compiler.h b/include/linux/compiler.h index 50bb2461648f..c363d8debc43 100644 --- a/include/linux/compiler.h +++ b/include/linux/compiler.h @@ -203,13 +203,13 @@ void ftrace_likely_update(struct ftrace_likely_data *f, int val, * atomicity or dependency ordering guarantees. Note that this may result * in tears! 
*/ -#define __READ_ONCE(x) (*(const volatile typeof(x) *)&(x)) +#define __READ_ONCE(x) (*(const volatile __unqual_scalar_typeof(x) *)&(x)) #define __READ_ONCE_SCALAR(x) \ ({ \ - typeof(x) __x = __READ_ONCE(x); \ + __unqual_scalar_typeof(x) __x = __READ_ONCE(x); \ smp_read_barrier_depends(); \ - __x; \ + (typeof(x))__x; \ }) #define READ_ONCE(x) \ diff --git a/include/linux/compiler_types.h b/include/linux/compiler_types.h index e970f97a7fcb..233066c92f6f 100644 --- a/include/linux/compiler_types.h +++ b/include/linux/compiler_types.h @@ -210,6 +210,27 @@ struct ftrace_likely_data { /* Are two types/vars the same type (ignoring qualifiers)? */ #define __same_type(a, b) __builtin_types_compatible_p(typeof(a), typeof(b)) +/* + * __unqual_scalar_typeof(x) - Declare an unqualified scalar type, leaving + * non-scalar types unchanged. + * + * We build this out of a couple of helper macros in a vain attempt to + * help you keep your lunch down while reading it. + */ +#define __pick_scalar_type(x, type, otherwise) \ + __builtin_choose_expr(__same_type(x, type), (type)0, otherwise) + +#define __pick_integer_type(x, type, otherwise) \ + __pick_scalar_type(x, unsigned type, \ + __pick_scalar_type(x, signed type, otherwise)) + +#define __unqual_scalar_typeof(x) typeof( \ + __pick_integer_type(x, char, \ + __pick_integer_type(x, short, \ + __pick_integer_type(x, int, \ + __pick_integer_type(x, long, \ + __pick_integer_type(x, long long, x)))))) + /* Is this type a native word size -- useful for atomic operations */ #define __native_word(t) \ (sizeof(t) == sizeof(char) || sizeof(t) == sizeof(short) || \ -- cgit v1.2.3 From 6aba6ed879b3471903c8ada28ba968a268df6143 Mon Sep 17 00:00:00 2001 From: Andy Shevchenko Date: Tue, 7 Apr 2020 20:38:41 +0300 Subject: pinctrl: mcp23s08: Get rid of legacy platform data Platform data is a legacy interface to supply device properties to the driver. In this case we even don't have in-kernel users for it. Just remove it for good. 
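To illustrate the replacement pattern, a minimal sketch (the property names match the lookups in the diff below; the helper name and its error handling are illustrative, not part of the patch):

    #include <linux/device.h>
    #include <linux/errno.h>
    #include <linux/property.h>

    /*
     * Read the SPI present mask from device properties, trying the
     * current binding first and the legacy name as a fallback, the way
     * the new probe path does. Up to 8 chips may share one chipselect.
     */
    static int example_get_present_mask(struct device *dev, u32 *mask)
    {
    	if (device_property_read_u32(dev, "microchip,spi-present-mask",
    				     mask) &&
    	    device_property_read_u32(dev, "mcp,spi-present-mask", mask))
    		return -ENODEV;

    	if (!*mask || *mask > 0xff)
    		return -ENODEV;

    	return 0;
    }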
Signed-off-by: Andy Shevchenko Link: https://lore.kernel.org/r/20200407173849.43628-1-andriy.shevchenko@linux.intel.com Signed-off-by: Linus Walleij --- drivers/pinctrl/pinctrl-mcp23s08.c | 42 +++++++++++--------------------------- include/linux/spi/mcp23s08.h | 18 ---------------- 2 files changed, 12 insertions(+), 48 deletions(-) delete mode 100644 include/linux/spi/mcp23s08.h (limited to 'include/linux') diff --git a/drivers/pinctrl/pinctrl-mcp23s08.c b/drivers/pinctrl/pinctrl-mcp23s08.c index 3a235487e38d..2c8b8c45b70e 100644 --- a/drivers/pinctrl/pinctrl-mcp23s08.c +++ b/drivers/pinctrl/pinctrl-mcp23s08.c @@ -8,7 +8,6 @@ #include #include #include -#include #include #include #include @@ -914,16 +913,9 @@ MODULE_DEVICE_TABLE(of, mcp23s08_i2c_of_match); static int mcp230xx_probe(struct i2c_client *client, const struct i2c_device_id *id) { - struct mcp23s08_platform_data *pdata, local_pdata; struct mcp23s08 *mcp; int status; - pdata = dev_get_platdata(&client->dev); - if (!pdata) { - pdata = &local_pdata; - pdata->base = -1; - } - mcp = devm_kzalloc(&client->dev, sizeof(*mcp), GFP_KERNEL); if (!mcp) return -ENOMEM; @@ -937,7 +929,7 @@ static int mcp230xx_probe(struct i2c_client *client, mcp->irq_chip.irq_bus_sync_unlock = mcp23s08_irq_bus_unlock; status = mcp23s08_probe_one(mcp, &client->dev, client, client->addr, - id->driver_data, pdata->base, 0); + id->driver_data, -1, 0); if (status) return status; @@ -986,13 +978,13 @@ static void mcp23s08_i2c_exit(void) { } static int mcp23s08_probe(struct spi_device *spi) { - struct mcp23s08_platform_data *pdata, local_pdata; unsigned addr; int chips = 0; struct mcp23s08_driver_data *data; int status, type; unsigned ngpio = 0; const struct of_device_id *match; + u32 spi_present_mask; match = of_match_device(of_match_ptr(mcp23s08_spi_of_match), &spi->dev); if (match) @@ -1000,32 +992,24 @@ static int mcp23s08_probe(struct spi_device *spi) else type = spi_get_device_id(spi)->driver_data; - pdata = dev_get_platdata(&spi->dev); - if (!pdata) { - pdata = &local_pdata; - pdata->base = -1; - + status = device_property_read_u32(&spi->dev, + "microchip,spi-present-mask", &spi_present_mask); + if (status) { status = device_property_read_u32(&spi->dev, - "microchip,spi-present-mask", &pdata->spi_present_mask); + "mcp,spi-present-mask", &spi_present_mask); if (status) { - status = device_property_read_u32(&spi->dev, - "mcp,spi-present-mask", - &pdata->spi_present_mask); - - if (status) { - dev_err(&spi->dev, "missing spi-present-mask"); - return -ENODEV; - } + dev_err(&spi->dev, "missing spi-present-mask"); + return -ENODEV; } } - if (!pdata->spi_present_mask || pdata->spi_present_mask > 0xff) { + if (!spi_present_mask || spi_present_mask > 0xff) { dev_err(&spi->dev, "invalid spi-present-mask"); return -ENODEV; } for (addr = 0; addr < MCP_MAX_DEV_PER_CS; addr++) { - if (pdata->spi_present_mask & BIT(addr)) + if (spi_present_mask & BIT(addr)) chips++; } @@ -1040,7 +1024,7 @@ static int mcp23s08_probe(struct spi_device *spi) spi_set_drvdata(spi, data); for (addr = 0; addr < MCP_MAX_DEV_PER_CS; addr++) { - if (!(pdata->spi_present_mask & BIT(addr))) + if (!(spi_present_mask & BIT(addr))) continue; chips--; data->mcp[addr] = &data->chip[chips]; @@ -1054,12 +1038,10 @@ static int mcp23s08_probe(struct spi_device *spi) mcp23s08_irq_bus_unlock; status = mcp23s08_probe_one(data->mcp[addr], &spi->dev, spi, 0x40 | (addr << 1), type, - pdata->base, addr); + -1, addr); if (status < 0) return status; - if (pdata->base != -1) - pdata->base += data->mcp[addr]->chip.ngpio; 
ngpio += data->mcp[addr]->chip.ngpio; } data->ngpio = ngpio; diff --git a/include/linux/spi/mcp23s08.h b/include/linux/spi/mcp23s08.h deleted file mode 100644 index 738a45b435f2..000000000000 --- a/include/linux/spi/mcp23s08.h +++ /dev/null @@ -1,18 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -struct mcp23s08_platform_data { - /* For mcp23s08, up to 4 slaves (numbered 0..3) can share one SPI - * chipselect, each providing 1 gpio_chip instance with 8 gpios. - * For mpc23s17, up to 8 slaves (numbered 0..7) can share one SPI - * chipselect, each providing 1 gpio_chip (port A + port B) with - * 16 gpios. - */ - u32 spi_present_mask; - - /* "base" is the number of the first GPIO or -1 for dynamic - * assignment. If there are gaps in chip addressing the GPIO - * numbers are sequential .. so for example if only slaves 0 - * and 3 are present, their GPIOs range from base to base+15 - * (or base+31 for s17 variant). - */ - unsigned base; -}; -- cgit v1.2.3 From 980737282232b752bb14dab96d77665c15889c36 Mon Sep 17 00:00:00 2001 From: Alexey Budankov Date: Thu, 2 Apr 2020 11:45:31 +0300 Subject: capabilities: Introduce CAP_PERFMON to kernel and user space Introduce the CAP_PERFMON capability designed to secure system performance monitoring and observability operations so that CAP_PERFMON can assist CAP_SYS_ADMIN capability in its governing role for performance monitoring and observability subsystems. CAP_PERFMON hardens system security and integrity during performance monitoring and observability operations by decreasing attack surface that is available to a CAP_SYS_ADMIN privileged process [2]. Providing the access to system performance monitoring and observability operations under CAP_PERFMON capability singly, without the rest of CAP_SYS_ADMIN credentials, excludes chances to misuse the credentials and makes the operation more secure. Thus, CAP_PERFMON implements the principle of least privilege for performance monitoring and observability operations (POSIX IEEE 1003.1e: 2.2.2.39 principle of least privilege: A security design principle that states that a process or program be granted only those privileges (e.g., capabilities) necessary to accomplish its legitimate function, and only for the time that such privileges are actually required) CAP_PERFMON meets the demand to secure system performance monitoring and observability operations for adoption in security sensitive, restricted, multiuser production environments (e.g. HPC clusters, cloud and virtual compute environments), where root or CAP_SYS_ADMIN credentials are not available to mass users of a system, and securely unblocks applicability and scalability of system performance monitoring and observability operations beyond root and CAP_SYS_ADMIN use cases. CAP_PERFMON takes over CAP_SYS_ADMIN credentials related to system performance monitoring and observability operations and balances amount of CAP_SYS_ADMIN credentials following the recommendations in the capabilities man page [1] for CAP_SYS_ADMIN: "Note: this capability is overloaded; see Notes to kernel developers, below." For backward compatibility reasons access to system performance monitoring and observability subsystems of the kernel remains open for CAP_SYS_ADMIN privileged processes but CAP_SYS_ADMIN capability usage for secure system performance monitoring and observability operations is discouraged with respect to the designed CAP_PERFMON capability. 
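As a sketch of the conversion this enables in kernel subsystems (perfmon_capable() is the helper added in the diff below; the surrounding function is hypothetical):

    #include <linux/capability.h>
    #include <linux/errno.h>

    /*
     * Gate a monitoring operation on CAP_PERFMON, with CAP_SYS_ADMIN as
     * the backward-compatible fallback; this is the exact policy
     * encoded in perfmon_capable().
     */
    static int example_monitoring_open(void)
    {
    	if (!perfmon_capable())
    		return -EACCES;

    	return 0;
    }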
Although the software running under CAP_PERFMON can not ensure avoidance of related hardware issues, the software can still mitigate these issues following the official hardware issues mitigation procedure [2]. The bugs in the software itself can be fixed following the standard kernel development process [3] to maintain and harden security of system performance monitoring and observability operations. [1] http://man7.org/linux/man-pages/man7/capabilities.7.html [2] https://www.kernel.org/doc/html/latest/process/embargoed-hardware-issues.html [3] https://www.kernel.org/doc/html/latest/admin-guide/security-bugs.html Signed-off-by: Alexey Budankov Acked-by: James Morris Acked-by: Serge E. Hallyn Acked-by: Song Liu Acked-by: Stephen Smalley Tested-by: Arnaldo Carvalho de Melo Cc: Alexei Starovoitov Cc: Andi Kleen Cc: Igor Lubashev Cc: Jiri Olsa Cc: Namhyung Kim Cc: Peter Zijlstra Cc: Stephane Eranian Cc: Thomas Gleixner Cc: intel-gfx@lists.freedesktop.org Cc: linux-doc@vger.kernel.org Cc: linux-man@vger.kernel.org Cc: linux-security-module@vger.kernel.org Cc: selinux@vger.kernel.org Link: http://lore.kernel.org/lkml/5590d543-82c6-490a-6544-08e6a5517db0@linux.intel.com Signed-off-by: Arnaldo Carvalho de Melo --- include/linux/capability.h | 4 ++++ include/uapi/linux/capability.h | 8 +++++++- security/selinux/include/classmap.h | 4 ++-- 3 files changed, 13 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/capability.h b/include/linux/capability.h index ecce0f43c73a..027d7e4a853b 100644 --- a/include/linux/capability.h +++ b/include/linux/capability.h @@ -251,6 +251,10 @@ extern bool privileged_wrt_inode_uidgid(struct user_namespace *ns, const struct extern bool capable_wrt_inode_uidgid(const struct inode *inode, int cap); extern bool file_ns_capable(const struct file *file, struct user_namespace *ns, int cap); extern bool ptracer_capable(struct task_struct *tsk, struct user_namespace *ns); +static inline bool perfmon_capable(void) +{ + return capable(CAP_PERFMON) || capable(CAP_SYS_ADMIN); +} /* audit system wants to get cap info from files as well */ extern int get_vfs_caps_from_disk(const struct dentry *dentry, struct cpu_vfs_cap_data *cpu_caps); diff --git a/include/uapi/linux/capability.h b/include/uapi/linux/capability.h index 272dc69fa080..e58c9636741b 100644 --- a/include/uapi/linux/capability.h +++ b/include/uapi/linux/capability.h @@ -367,8 +367,14 @@ struct vfs_ns_cap_data { #define CAP_AUDIT_READ 37 +/* + * Allow system performance and observability privileged operations + * using perf_events, i915_perf and other kernel subsystems + */ + +#define CAP_PERFMON 38 -#define CAP_LAST_CAP CAP_AUDIT_READ +#define CAP_LAST_CAP CAP_PERFMON #define cap_valid(x) ((x) >= 0 && (x) <= CAP_LAST_CAP) diff --git a/security/selinux/include/classmap.h b/security/selinux/include/classmap.h index 986f3ac14282..d233ab3f1533 100644 --- a/security/selinux/include/classmap.h +++ b/security/selinux/include/classmap.h @@ -27,9 +27,9 @@ "audit_control", "setfcap" #define COMMON_CAP2_PERMS "mac_override", "mac_admin", "syslog", \ - "wake_alarm", "block_suspend", "audit_read" + "wake_alarm", "block_suspend", "audit_read", "perfmon" -#if CAP_LAST_CAP > CAP_AUDIT_READ +#if CAP_LAST_CAP > CAP_PERFMON #error New capability defined, please update COMMON_CAP2_PERMS. 
#endif -- cgit v1.2.3 From 18aa18566218d4a46d940049b835314d2b071cc2 Mon Sep 17 00:00:00 2001 From: Alexey Budankov Date: Thu, 2 Apr 2020 11:46:24 +0300 Subject: perf/core: Open access to the core for CAP_PERFMON privileged process Open access to monitoring of kernel code, CPUs, tracepoints and namespaces data for a CAP_PERFMON privileged process. Providing the access under CAP_PERFMON capability singly, without the rest of CAP_SYS_ADMIN credentials, excludes chances to misuse the credentials and makes operation more secure. CAP_PERFMON implements the principle of least privilege for performance monitoring and observability operations (POSIX IEEE 1003.1e 2.2.2.39 principle of least privilege: A security design principle that states that a process or program be granted only those privileges (e.g., capabilities) necessary to accomplish its legitimate function, and only for the time that such privileges are actually required) For backward compatibility reasons the access to perf_events subsystem remains open for CAP_SYS_ADMIN privileged processes but CAP_SYS_ADMIN usage for secure perf_events monitoring is discouraged with respect to CAP_PERFMON capability. Signed-off-by: Alexey Budankov Reviewed-by: James Morris Tested-by: Arnaldo Carvalho de Melo Cc: Alexei Starovoitov Cc: Andi Kleen Cc: Igor Lubashev Cc: Jiri Olsa Cc: linux-man@vger.kernel.org Cc: Namhyung Kim Cc: Peter Zijlstra Cc: Serge Hallyn Cc: Song Liu Cc: Stephane Eranian Cc: Thomas Gleixner Cc: intel-gfx@lists.freedesktop.org Cc: linux-doc@vger.kernel.org Cc: linux-security-module@vger.kernel.org Cc: selinux@vger.kernel.org Link: http://lore.kernel.org/lkml/471acaef-bb8a-5ce2-923f-90606b78eef9@linux.intel.com Signed-off-by: Arnaldo Carvalho de Melo --- include/linux/perf_event.h | 6 +++--- kernel/events/core.c | 2 +- 2 files changed, 4 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h index 9c3e7619c929..87e21681759c 100644 --- a/include/linux/perf_event.h +++ b/include/linux/perf_event.h @@ -1305,7 +1305,7 @@ static inline int perf_is_paranoid(void) static inline int perf_allow_kernel(struct perf_event_attr *attr) { - if (sysctl_perf_event_paranoid > 1 && !capable(CAP_SYS_ADMIN)) + if (sysctl_perf_event_paranoid > 1 && !perfmon_capable()) return -EACCES; return security_perf_event_open(attr, PERF_SECURITY_KERNEL); @@ -1313,7 +1313,7 @@ static inline int perf_allow_kernel(struct perf_event_attr *attr) static inline int perf_allow_cpu(struct perf_event_attr *attr) { - if (sysctl_perf_event_paranoid > 0 && !capable(CAP_SYS_ADMIN)) + if (sysctl_perf_event_paranoid > 0 && !perfmon_capable()) return -EACCES; return security_perf_event_open(attr, PERF_SECURITY_CPU); @@ -1321,7 +1321,7 @@ static inline int perf_allow_cpu(struct perf_event_attr *attr) static inline int perf_allow_tracepoint(struct perf_event_attr *attr) { - if (sysctl_perf_event_paranoid > -1 && !capable(CAP_SYS_ADMIN)) + if (sysctl_perf_event_paranoid > -1 && !perfmon_capable()) return -EPERM; return security_perf_event_open(attr, PERF_SECURITY_TRACEPOINT); diff --git a/kernel/events/core.c b/kernel/events/core.c index bc9b98a9af9a..74025b7b83a0 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -11504,7 +11504,7 @@ SYSCALL_DEFINE5(perf_event_open, } if (attr.namespaces) { - if (!capable(CAP_SYS_ADMIN)) + if (!perfmon_capable()) return -EACCES; } -- cgit v1.2.3 From ab7c1e163b525316a870a494dd4ea196e7a6c455 Mon Sep 17 00:00:00 2001 From: Takashi Iwai Date: Wed, 15 Apr 2020 18:45:00 
+0200 Subject: firmware: Drop unused pages field from struct firmware The struct firmware contains a page table pointer that was used only internally in the past. Since the actual page tables are referred from struct fw_priv and should be never from struct firmware, we can drop this unused field gracefully. Signed-off-by: Takashi Iwai Acked-by: Luis Chamberlain Link: https://lore.kernel.org/r/20200415164500.28749-1-tiwai@suse.de Signed-off-by: Greg Kroah-Hartman --- drivers/base/firmware_loader/main.c | 3 --- include/linux/firmware.h | 1 - 2 files changed, 4 deletions(-) (limited to 'include/linux') diff --git a/drivers/base/firmware_loader/main.c b/drivers/base/firmware_loader/main.c index 76f79913916d..5296aaca35cf 100644 --- a/drivers/base/firmware_loader/main.c +++ b/drivers/base/firmware_loader/main.c @@ -548,9 +548,6 @@ static void firmware_free_data(const struct firmware *fw) static void fw_set_page_data(struct fw_priv *fw_priv, struct firmware *fw) { fw->priv = fw_priv; -#ifdef CONFIG_FW_LOADER_USER_HELPER - fw->pages = fw_priv->pages; -#endif fw->size = fw_priv->size; fw->data = fw_priv->data; diff --git a/include/linux/firmware.h b/include/linux/firmware.h index 4bbd0afd91b7..cb3e2c06ed8a 100644 --- a/include/linux/firmware.h +++ b/include/linux/firmware.h @@ -12,7 +12,6 @@ struct firmware { size_t size; const u8 *data; - struct page **pages; /* firmware loader private fields */ void *priv; -- cgit v1.2.3 From 699fed4a2d8e32d77d64928e570d7ffd93a62fdf Mon Sep 17 00:00:00 2001 From: Sudarsana Reddy Kalluru Date: Thu, 16 Apr 2020 01:43:10 -0700 Subject: scsi: qed: Send BW update notifications to the protocol drivers Management firmware (MFW) sends a notification whenever there is a change in the bandwidth values. Add driver support for sending this notification to the upper layer drivers (e.g., qedf). Link: https://lore.kernel.org/r/20200416084314.18851-6-skashyap@marvell.com Signed-off-by: Sudarsana Reddy Kalluru Signed-off-by: Saurav Kashyap Signed-off-by: Martin K. 
Petersen --- drivers/net/ethernet/qlogic/qed/qed.h | 1 + drivers/net/ethernet/qlogic/qed/qed_main.c | 9 +++++++++ include/linux/qed/qed_if.h | 1 + 3 files changed, 11 insertions(+) (limited to 'include/linux') diff --git a/drivers/net/ethernet/qlogic/qed/qed.h b/drivers/net/ethernet/qlogic/qed/qed.h index fa41bf08a589..d00663967afe 100644 --- a/drivers/net/ethernet/qlogic/qed/qed.h +++ b/drivers/net/ethernet/qlogic/qed/qed.h @@ -1016,6 +1016,7 @@ int qed_device_num_ports(struct qed_dev *cdev); int qed_fill_dev_info(struct qed_dev *cdev, struct qed_dev_info *dev_info); void qed_link_update(struct qed_hwfn *hwfn, struct qed_ptt *ptt); +void qed_bw_update(struct qed_hwfn *hwfn, struct qed_ptt *ptt); u32 qed_unzip_data(struct qed_hwfn *p_hwfn, u32 input_len, u8 *input_buf, u32 max_size, u8 *unzip_buf); diff --git a/drivers/net/ethernet/qlogic/qed/qed_main.c b/drivers/net/ethernet/qlogic/qed/qed_main.c index 96356e897c80..645d1dbf04ce 100644 --- a/drivers/net/ethernet/qlogic/qed/qed_main.c +++ b/drivers/net/ethernet/qlogic/qed/qed_main.c @@ -1940,6 +1940,15 @@ void qed_link_update(struct qed_hwfn *hwfn, struct qed_ptt *ptt) op->link_update(cookie, &if_link); } +void qed_bw_update(struct qed_hwfn *hwfn, struct qed_ptt *ptt) +{ + void *cookie = hwfn->cdev->ops_cookie; + struct qed_common_cb_ops *op = hwfn->cdev->protocol_ops.common; + + if (IS_LEAD_HWFN(hwfn) && cookie && op && op->bw_update) + op->bw_update(cookie); +} + static int qed_drain(struct qed_dev *cdev) { struct qed_hwfn *hwfn; diff --git a/include/linux/qed/qed_if.h b/include/linux/qed/qed_if.h index 8f29e0d8a7b3..c4956374a444 100644 --- a/include/linux/qed/qed_if.h +++ b/include/linux/qed/qed_if.h @@ -817,6 +817,7 @@ struct qed_common_cb_ops { void (*dcbx_aen)(void *dev, struct qed_dcbx_get *get, u32 mib_type); void (*get_generic_tlv_data)(void *dev, struct qed_generic_tlvs *data); void (*get_protocol_tlv_data)(void *dev, void *data); + void (*bw_update)(void *dev); }; struct qed_selftest_ops { -- cgit v1.2.3 From 2ce0d7f9766f0e49bb54f149c77bae89464932fb Mon Sep 17 00:00:00 2001 From: Mark Brown Date: Thu, 16 Apr 2020 19:24:02 +0100 Subject: x86/asm: Provide a Kconfig symbol for disabling old assembly annotations As x86 was converted to use the modern SYM_ annotations for assembly, ifdefs were added to remove the generic definitions of the old style annotations on x86. Rather than collect a list of architectures in the ifdefs as more architectures are converted over, provide a Kconfig symbol for this and update x86 to use it. 
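The effect on include/linux/linkage.h, condensed to two of the deprecated macros (a sketch of the pattern in the diff below): once an architecture selects ARCH_USE_SYM_ANNOTATIONS, the compatibility fallbacks are never defined, so a stale ENTRY()/ENDPROC() in that architecture fails the build instead of silently expanding.

    /*
     * Generic fallbacks exist only for architectures that have not yet
     * selected ARCH_USE_SYM_ANNOTATIONS, i.e. not yet been converted to
     * the SYM_* annotations.
     */
    #ifndef CONFIG_ARCH_USE_SYM_ANNOTATIONS
    #ifndef ENTRY
    #define ENTRY(name) \
    	SYM_FUNC_START(name)
    #endif
    #ifndef ENDPROC
    #define ENDPROC(name) \
    	SYM_FUNC_END(name)
    #endif
    #endif /* CONFIG_ARCH_USE_SYM_ANNOTATIONS */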
Signed-off-by: Mark Brown Signed-off-by: Borislav Petkov Acked-by: Jiri Slaby Link: https://lkml.kernel.org/r/20200416182402.6206-1-broonie@kernel.org --- arch/x86/Kconfig | 1 + include/linux/linkage.h | 8 ++++---- lib/Kconfig | 3 +++ 3 files changed, 8 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 1d6104ea8af0..e3d22edfd70a 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -91,6 +91,7 @@ config X86 select ARCH_USE_BUILTIN_BSWAP select ARCH_USE_QUEUED_RWLOCKS select ARCH_USE_QUEUED_SPINLOCKS + select ARCH_USE_SYM_ANNOTATIONS select ARCH_WANT_BATCHED_UNMAP_TLB_FLUSH select ARCH_WANT_DEFAULT_BPF_JIT if X86_64 select ARCH_WANTS_DYNAMIC_TASK_STRUCT diff --git a/include/linux/linkage.h b/include/linux/linkage.h index 9280209d1f62..d796ec20d114 100644 --- a/include/linux/linkage.h +++ b/include/linux/linkage.h @@ -105,7 +105,7 @@ /* === DEPRECATED annotations === */ -#ifndef CONFIG_X86 +#ifndef CONFIG_ARCH_USE_SYM_ANNOTATIONS #ifndef GLOBAL /* deprecated, use SYM_DATA*, SYM_ENTRY, or similar */ #define GLOBAL(name) \ @@ -118,10 +118,10 @@ #define ENTRY(name) \ SYM_FUNC_START(name) #endif -#endif /* CONFIG_X86 */ +#endif /* CONFIG_ARCH_USE_SYM_ANNOTATIONS */ #endif /* LINKER_SCRIPT */ -#ifndef CONFIG_X86 +#ifndef CONFIG_ARCH_USE_SYM_ANNOTATIONS #ifndef WEAK /* deprecated, use SYM_FUNC_START_WEAK* */ #define WEAK(name) \ @@ -143,7 +143,7 @@ #define ENDPROC(name) \ SYM_FUNC_END(name) #endif -#endif /* CONFIG_X86 */ +#endif /* CONFIG_ARCH_USE_SYM_ANNOTATIONS */ /* === generic annotations === */ diff --git a/lib/Kconfig b/lib/Kconfig index 5d53f9609c25..e831e1f01767 100644 --- a/lib/Kconfig +++ b/lib/Kconfig @@ -80,6 +80,9 @@ config ARCH_USE_CMPXCHG_LOCKREF config ARCH_HAS_FAST_MULTIPLIER bool +config ARCH_USE_SYM_ANNOTATIONS + bool + config INDIRECT_PIO bool "Access I/O in non-MMIO mode" depends on ARM64 -- cgit v1.2.3 From 123aff2a789c3975c2235653939ff00107d6156c Mon Sep 17 00:00:00 2001 From: Florian Fainelli Date: Fri, 17 Apr 2020 11:38:02 -0700 Subject: net: phy: broadcom: Add support for BCM53125 internal PHYs BCM53125 has internal Gigabit PHYs which support interrupts as well as statistics, make it possible to configure both of those features with a PHY driver entry. Signed-off-by: Florian Fainelli Signed-off-by: David S. 
Miller --- drivers/net/phy/broadcom.c | 14 ++++++++++++++ include/linux/brcmphy.h | 1 + 2 files changed, 15 insertions(+) (limited to 'include/linux') diff --git a/drivers/net/phy/broadcom.c b/drivers/net/phy/broadcom.c index ae4873f2f86e..97201d5cf007 100644 --- a/drivers/net/phy/broadcom.c +++ b/drivers/net/phy/broadcom.c @@ -781,6 +781,19 @@ static struct phy_driver broadcom_drivers[] = { .get_strings = bcm_phy_get_strings, .get_stats = bcm53xx_phy_get_stats, .probe = bcm53xx_phy_probe, +}, { + .phy_id = PHY_ID_BCM53125, + .phy_id_mask = 0xfffffff0, + .name = "Broadcom BCM53125", + .flags = PHY_IS_INTERNAL, + /* PHY_GBIT_FEATURES */ + .get_sset_count = bcm_phy_get_sset_count, + .get_strings = bcm_phy_get_strings, + .get_stats = bcm53xx_phy_get_stats, + .probe = bcm53xx_phy_probe, + .config_init = bcm54xx_config_init, + .ack_interrupt = bcm_phy_ack_intr, + .config_intr = bcm_phy_config_intr, }, { .phy_id = PHY_ID_BCM89610, .phy_id_mask = 0xfffffff0, @@ -810,6 +823,7 @@ static struct mdio_device_id __maybe_unused broadcom_tbl[] = { { PHY_ID_BCMAC131, 0xfffffff0 }, { PHY_ID_BCM5241, 0xfffffff0 }, { PHY_ID_BCM5395, 0xfffffff0 }, + { PHY_ID_BCM53125, 0xfffffff0 }, { PHY_ID_BCM89610, 0xfffffff0 }, { } }; diff --git a/include/linux/brcmphy.h b/include/linux/brcmphy.h index 6462c5447872..7e1d857c8468 100644 --- a/include/linux/brcmphy.h +++ b/include/linux/brcmphy.h @@ -15,6 +15,7 @@ #define PHY_ID_BCMAC131 0x0143bc70 #define PHY_ID_BCM5481 0x0143bca0 #define PHY_ID_BCM5395 0x0143bcf0 +#define PHY_ID_BCM53125 0x03625f20 #define PHY_ID_BCM54810 0x03625d00 #define PHY_ID_BCM5482 0x0143bcb0 #define PHY_ID_BCM5411 0x00206070 -- cgit v1.2.3 From bb7fc863729b45f0fbcdea991d0465d855ffd831 Mon Sep 17 00:00:00 2001 From: Leon Romanovsky Date: Sun, 5 Apr 2020 20:57:00 +0300 Subject: net/mlx5: Provide simplified command interfaces Many mlx5_cmd_exec() callers are not interested in the output from that command or have standard in/out structures. Those callers simply allocate those structures on the stack and use sizeof() to provide in/out arguments. In this naive approach, provide simplified versions of mlx5_cmd_exec().
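A usage sketch of the new wrappers, mirroring how later patches in this series convert the q-counter commands (the function name is illustrative):

    #include <linux/mlx5/driver.h>
    #include <linux/mlx5/mlx5_ifc.h>

    /*
     * Fixed-layout command: mlx5_cmd_exec_in() derives the input size
     * from the mlx5_ifc structure name and keeps the unused output
     * buffer inside the macro, removing the sizeof() boilerplate at
     * every call site.
     */
    static int example_dealloc_q_counter(struct mlx5_core_dev *dev, u16 id)
    {
    	u32 in[MLX5_ST_SZ_DW(dealloc_q_counter_in)] = {};

    	MLX5_SET(dealloc_q_counter_in, in, opcode,
    		 MLX5_CMD_OP_DEALLOC_Q_COUNTER);
    	MLX5_SET(dealloc_q_counter_in, in, counter_set_id, id);
    	return mlx5_cmd_exec_in(dev, dealloc_q_counter, in);
    }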
Reviewed-by: Saeed Mahameed Signed-off-by: Leon Romanovsky --- include/linux/mlx5/driver.h | 13 +++++++++++++ 1 file changed, 13 insertions(+) (limited to 'include/linux') diff --git a/include/linux/mlx5/driver.h b/include/linux/mlx5/driver.h index 6f8f79ef829b..1caddfa85c4d 100644 --- a/include/linux/mlx5/driver.h +++ b/include/linux/mlx5/driver.h @@ -903,6 +903,19 @@ int mlx5_cmd_exec_cb(struct mlx5_async_ctx *ctx, void *in, int in_size, int mlx5_cmd_exec(struct mlx5_core_dev *dev, void *in, int in_size, void *out, int out_size); + +#define mlx5_cmd_exec_inout(dev, ifc_cmd, in, out) \ + ({ \ + mlx5_cmd_exec(dev, in, MLX5_ST_SZ_BYTES(ifc_cmd##_in), out, \ + MLX5_ST_SZ_BYTES(ifc_cmd##_out)); \ + }) + +#define mlx5_cmd_exec_in(dev, ifc_cmd, in) \ + ({ \ + u32 _out[MLX5_ST_SZ_DW(ifc_cmd##_out)] = {}; \ + mlx5_cmd_exec_inout(dev, ifc_cmd, in, _out); \ + }) + int mlx5_cmd_exec_polling(struct mlx5_core_dev *dev, void *in, int in_size, void *out, int out_size); void mlx5_cmd_mbox_status(void *out, u8 *status, u32 *syndrome); -- cgit v1.2.3 From 66247fbb280c2a699a8621708c52dae6acd2e4bc Mon Sep 17 00:00:00 2001 From: Leon Romanovsky Date: Fri, 3 Apr 2020 11:28:28 +0300 Subject: net/mlx5: Remove Q counter low level helper APIs mlx5 core users are encouraged to use low level API (mlx5_cmd_exec) without the need of helper functions, do this for q counters, remove helper functions and call mlx5_cmd_exec directly from users. This will help reduce the total amount of code and reduction of the mlx5_core symbol table. Reviewed-by: Saeed Mahameed Signed-off-by: Leon Romanovsky --- drivers/infiniband/hw/mlx5/main.c | 55 +++++++++++++--------- drivers/net/ethernet/mellanox/mlx5/core/en_main.c | 39 +++++++++------ drivers/net/ethernet/mellanox/mlx5/core/en_stats.c | 35 +++++++++----- drivers/net/ethernet/mellanox/mlx5/core/qp.c | 39 --------------- include/linux/mlx5/qp.h | 4 -- 5 files changed, 80 insertions(+), 92 deletions(-) (limited to 'include/linux') diff --git a/drivers/infiniband/hw/mlx5/main.c b/drivers/infiniband/hw/mlx5/main.c index 6679756506e6..b02d027ebf3b 100644 --- a/drivers/infiniband/hw/mlx5/main.c +++ b/drivers/infiniband/hw/mlx5/main.c @@ -5439,15 +5439,21 @@ static bool is_mdev_switchdev_mode(const struct mlx5_core_dev *mdev) static void mlx5_ib_dealloc_counters(struct mlx5_ib_dev *dev) { + u32 in[MLX5_ST_SZ_DW(dealloc_q_counter_in)] = {}; int num_cnt_ports; int i; num_cnt_ports = is_mdev_switchdev_mode(dev->mdev) ? 
1 : dev->num_ports; + MLX5_SET(dealloc_q_counter_in, in, opcode, + MLX5_CMD_OP_DEALLOC_Q_COUNTER); + for (i = 0; i < num_cnt_ports; i++) { - if (dev->port[i].cnts.set_id_valid) - mlx5_core_dealloc_q_counter(dev->mdev, - dev->port[i].cnts.set_id); + if (dev->port[i].cnts.set_id_valid) { + MLX5_SET(dealloc_q_counter_in, in, counter_set_id, + dev->port[i].cnts.set_id); + mlx5_cmd_exec_in(dev->mdev, dealloc_q_counter, in); + } kfree(dev->port[i].cnts.names); kfree(dev->port[i].cnts.offsets); } @@ -5638,27 +5644,23 @@ static int mlx5_ib_query_q_counters(struct mlx5_core_dev *mdev, struct rdma_hw_stats *stats, u16 set_id) { - int outlen = MLX5_ST_SZ_BYTES(query_q_counter_out); - void *out; + u32 out[MLX5_ST_SZ_DW(query_q_counter_out)] = {}; + u32 in[MLX5_ST_SZ_DW(query_q_counter_in)] = {}; __be32 val; int ret, i; - out = kvzalloc(outlen, GFP_KERNEL); - if (!out) - return -ENOMEM; - - ret = mlx5_core_query_q_counter(mdev, set_id, 0, out, outlen); + MLX5_SET(query_q_counter_in, in, opcode, MLX5_CMD_OP_QUERY_Q_COUNTER); + MLX5_SET(query_q_counter_in, in, counter_set_id, set_id); + ret = mlx5_cmd_exec_inout(mdev, query_q_counter, in, out); if (ret) - goto free; + return ret; for (i = 0; i < cnts->num_q_counters; i++) { - val = *(__be32 *)(out + cnts->offsets[i]); + val = *(__be32 *)((void *)out + cnts->offsets[i]); stats->value[i] = (u64)be32_to_cpu(val); } -free: - kvfree(out); - return ret; + return 0; } static int mlx5_ib_query_ext_ppcnt_counters(struct mlx5_ib_dev *dev, @@ -5765,6 +5767,20 @@ static int mlx5_ib_counter_update_stats(struct rdma_counter *counter) counter->stats, counter->id); } +static int mlx5_ib_counter_dealloc(struct rdma_counter *counter) +{ + struct mlx5_ib_dev *dev = to_mdev(counter->device); + u32 in[MLX5_ST_SZ_DW(dealloc_q_counter_in)] = {}; + + if (!counter->id) + return 0; + + MLX5_SET(dealloc_q_counter_in, in, opcode, + MLX5_CMD_OP_DEALLOC_Q_COUNTER); + MLX5_SET(dealloc_q_counter_in, in, counter_set_id, counter->id); + return mlx5_cmd_exec_in(dev->mdev, dealloc_q_counter, in); +} + static int mlx5_ib_counter_bind_qp(struct rdma_counter *counter, struct ib_qp *qp) { @@ -5788,7 +5804,7 @@ static int mlx5_ib_counter_bind_qp(struct rdma_counter *counter, return 0; fail_set_counter: - mlx5_core_dealloc_q_counter(dev->mdev, cnt_set_id); + mlx5_ib_counter_dealloc(counter); counter->id = 0; return err; @@ -5799,13 +5815,6 @@ static int mlx5_ib_counter_unbind_qp(struct ib_qp *qp) return mlx5_ib_qp_set_counter(qp, NULL); } -static int mlx5_ib_counter_dealloc(struct rdma_counter *counter) -{ - struct mlx5_ib_dev *dev = to_mdev(counter->device); - - return mlx5_core_dealloc_q_counter(dev->mdev, counter->id); -} - static int mlx5_ib_rn_get_params(struct ib_device *device, u8 port_num, enum rdma_netdev_t type, struct rdma_netdev_alloc_params *params) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c index dd7f338425eb..30970b405040 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c @@ -4997,29 +4997,40 @@ static void mlx5e_build_nic_netdev(struct net_device *netdev) void mlx5e_create_q_counters(struct mlx5e_priv *priv) { + u32 out[MLX5_ST_SZ_DW(alloc_q_counter_out)] = {}; + u32 in[MLX5_ST_SZ_DW(alloc_q_counter_in)] = {}; struct mlx5_core_dev *mdev = priv->mdev; int err; - err = mlx5_core_alloc_q_counter(mdev, &priv->q_counter); - if (err) { - mlx5_core_warn(mdev, "alloc queue counter failed, %d\n", err); - priv->q_counter = 0; - } + 
MLX5_SET(alloc_q_counter_in, in, opcode, MLX5_CMD_OP_ALLOC_Q_COUNTER); + err = mlx5_cmd_exec_inout(mdev, alloc_q_counter, in, out); + if (!err) + priv->q_counter = + MLX5_GET(alloc_q_counter_out, out, counter_set_id); - err = mlx5_core_alloc_q_counter(mdev, &priv->drop_rq_q_counter); - if (err) { - mlx5_core_warn(mdev, "alloc drop RQ counter failed, %d\n", err); - priv->drop_rq_q_counter = 0; - } + err = mlx5_cmd_exec_inout(mdev, alloc_q_counter, in, out); + if (!err) + priv->drop_rq_q_counter = + MLX5_GET(alloc_q_counter_out, out, counter_set_id); } void mlx5e_destroy_q_counters(struct mlx5e_priv *priv) { - if (priv->q_counter) - mlx5_core_dealloc_q_counter(priv->mdev, priv->q_counter); + u32 in[MLX5_ST_SZ_DW(dealloc_q_counter_in)] = {}; + + MLX5_SET(dealloc_q_counter_in, in, opcode, + MLX5_CMD_OP_DEALLOC_Q_COUNTER); + if (priv->q_counter) { + MLX5_SET(dealloc_q_counter_in, in, counter_set_id, + priv->q_counter); + mlx5_cmd_exec_in(priv->mdev, dealloc_q_counter, in); + } - if (priv->drop_rq_q_counter) - mlx5_core_dealloc_q_counter(priv->mdev, priv->drop_rq_q_counter); + if (priv->drop_rq_q_counter) { + MLX5_SET(dealloc_q_counter_in, in, counter_set_id, + priv->drop_rq_q_counter); + mlx5_cmd_exec_in(priv->mdev, dealloc_q_counter, in); + } } static int mlx5e_nic_init(struct mlx5_core_dev *mdev, diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_stats.c b/drivers/net/ethernet/mellanox/mlx5/core/en_stats.c index 30b216d9284c..ff4002ebad90 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_stats.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_stats.c @@ -411,18 +411,29 @@ static MLX5E_DECLARE_STATS_GRP_OP_FILL_STATS(qcnt) static MLX5E_DECLARE_STATS_GRP_OP_UPDATE_STATS(qcnt) { struct mlx5e_qcounter_stats *qcnt = &priv->stats.qcnt; - u32 out[MLX5_ST_SZ_DW(query_q_counter_out)]; - - if (priv->q_counter && - !mlx5_core_query_q_counter(priv->mdev, priv->q_counter, 0, out, - sizeof(out))) - qcnt->rx_out_of_buffer = MLX5_GET(query_q_counter_out, - out, out_of_buffer); - if (priv->drop_rq_q_counter && - !mlx5_core_query_q_counter(priv->mdev, priv->drop_rq_q_counter, 0, - out, sizeof(out))) - qcnt->rx_if_down_packets = MLX5_GET(query_q_counter_out, out, - out_of_buffer); + u32 out[MLX5_ST_SZ_DW(query_q_counter_out)] = {}; + u32 in[MLX5_ST_SZ_DW(query_q_counter_in)] = {}; + int ret; + + MLX5_SET(query_q_counter_in, in, opcode, MLX5_CMD_OP_QUERY_Q_COUNTER); + + if (priv->q_counter) { + MLX5_SET(query_q_counter_in, in, counter_set_id, + priv->q_counter); + ret = mlx5_cmd_exec_inout(priv->mdev, query_q_counter, in, out); + if (!ret) + qcnt->rx_out_of_buffer = MLX5_GET(query_q_counter_out, + out, out_of_buffer); + } + + if (priv->drop_rq_q_counter) { + MLX5_SET(query_q_counter_in, in, counter_set_id, + priv->drop_rq_q_counter); + ret = mlx5_cmd_exec_inout(priv->mdev, query_q_counter, in, out); + if (!ret) + qcnt->rx_if_down_packets = MLX5_GET(query_q_counter_out, + out, out_of_buffer); + } } #define VNIC_ENV_OFF(c) MLX5_BYTE_OFF(query_vnic_env_out, c) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/qp.c b/drivers/net/ethernet/mellanox/mlx5/core/qp.c index c3aea4cc2fff..e36790ad5256 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/qp.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/qp.c @@ -680,45 +680,6 @@ void mlx5_core_destroy_sq_tracked(struct mlx5_core_dev *dev, } EXPORT_SYMBOL(mlx5_core_destroy_sq_tracked); -int mlx5_core_alloc_q_counter(struct mlx5_core_dev *dev, u16 *counter_id) -{ - u32 in[MLX5_ST_SZ_DW(alloc_q_counter_in)] = {0}; - u32 out[MLX5_ST_SZ_DW(alloc_q_counter_out)] = 
{0}; - int err; - - MLX5_SET(alloc_q_counter_in, in, opcode, MLX5_CMD_OP_ALLOC_Q_COUNTER); - err = mlx5_cmd_exec(dev, in, sizeof(in), out, sizeof(out)); - if (!err) - *counter_id = MLX5_GET(alloc_q_counter_out, out, - counter_set_id); - return err; -} -EXPORT_SYMBOL_GPL(mlx5_core_alloc_q_counter); - -int mlx5_core_dealloc_q_counter(struct mlx5_core_dev *dev, u16 counter_id) -{ - u32 in[MLX5_ST_SZ_DW(dealloc_q_counter_in)] = {0}; - u32 out[MLX5_ST_SZ_DW(dealloc_q_counter_out)] = {0}; - - MLX5_SET(dealloc_q_counter_in, in, opcode, - MLX5_CMD_OP_DEALLOC_Q_COUNTER); - MLX5_SET(dealloc_q_counter_in, in, counter_set_id, counter_id); - return mlx5_cmd_exec(dev, in, sizeof(in), out, sizeof(out)); -} -EXPORT_SYMBOL_GPL(mlx5_core_dealloc_q_counter); - -int mlx5_core_query_q_counter(struct mlx5_core_dev *dev, u16 counter_id, - int reset, void *out, int out_size) -{ - u32 in[MLX5_ST_SZ_DW(query_q_counter_in)] = {0}; - - MLX5_SET(query_q_counter_in, in, opcode, MLX5_CMD_OP_QUERY_Q_COUNTER); - MLX5_SET(query_q_counter_in, in, clear, reset); - MLX5_SET(query_q_counter_in, in, counter_set_id, counter_id); - return mlx5_cmd_exec(dev, in, sizeof(in), out, out_size); -} -EXPORT_SYMBOL_GPL(mlx5_core_query_q_counter); - struct mlx5_core_rsc_common *mlx5_core_res_hold(struct mlx5_core_dev *dev, int res_num, enum mlx5_res_type res_type) diff --git a/include/linux/mlx5/qp.h b/include/linux/mlx5/qp.h index ae63b1ae9004..4d25a3d24182 100644 --- a/include/linux/mlx5/qp.h +++ b/include/linux/mlx5/qp.h @@ -595,10 +595,6 @@ int mlx5_core_create_sq_tracked(struct mlx5_core_dev *dev, u32 *in, int inlen, struct mlx5_core_qp *sq); void mlx5_core_destroy_sq_tracked(struct mlx5_core_dev *dev, struct mlx5_core_qp *sq); -int mlx5_core_alloc_q_counter(struct mlx5_core_dev *dev, u16 *counter_id); -int mlx5_core_dealloc_q_counter(struct mlx5_core_dev *dev, u16 counter_id); -int mlx5_core_query_q_counter(struct mlx5_core_dev *dev, u16 counter_id, - int reset, void *out, int out_size); struct mlx5_core_rsc_common *mlx5_core_res_hold(struct mlx5_core_dev *dev, int res_num, -- cgit v1.2.3 From 333fbaa0255b8d471fc7ae767ef3a1766c732d6d Mon Sep 17 00:00:00 2001 From: Leon Romanovsky Date: Sat, 4 Apr 2020 10:40:24 +0300 Subject: net/mlx5: Move QP logic to mlx5_ib The mlx5_core doesn't need any functionality coded in qp.c, so move that file to drivers/infiniband/ to be under mlx5_ib responsibility.
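The practical effect is sketched below, mirroring the cq.c hunk in this diff: the QP radix tree becomes a member of struct mlx5_ib_dev, so mlx5_ib consumers look resources up directly rather than through mlx5_core helpers (the function name is illustrative):

    #include <linux/radix-tree.h>

    /*
     * Assumes a struct mlx5_ib_dev with the qp_table member added by
     * this patch; this is the lookup the CQ poll path now performs
     * itself.
     */
    static struct mlx5_core_qp *example_lookup_qp(struct mlx5_ib_dev *dev,
    					      u32 qpn)
    {
    	return radix_tree_lookup(&dev->qp_table.tree, qpn);
    }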
Reviewed-by: Saeed Mahameed Signed-off-by: Leon Romanovsky --- drivers/infiniband/hw/mlx5/Makefile | 1 + drivers/infiniband/hw/mlx5/cq.c | 3 +- drivers/infiniband/hw/mlx5/devx.c | 10 +- drivers/infiniband/hw/mlx5/main.c | 10 +- drivers/infiniband/hw/mlx5/mlx5_ib.h | 2 + drivers/infiniband/hw/mlx5/odp.c | 3 +- drivers/infiniband/hw/mlx5/qp.c | 47 +- drivers/infiniband/hw/mlx5/qp.h | 46 ++ drivers/infiniband/hw/mlx5/qpc.c | 605 +++++++++++++++++++ drivers/infiniband/hw/mlx5/srq_cmd.c | 1 + drivers/net/ethernet/mellanox/mlx5/core/Makefile | 2 +- drivers/net/ethernet/mellanox/mlx5/core/debugfs.c | 6 +- drivers/net/ethernet/mellanox/mlx5/core/main.c | 4 - drivers/net/ethernet/mellanox/mlx5/core/qp.c | 697 ---------------------- include/linux/mlx5/cmd.h | 51 -- include/linux/mlx5/driver.h | 2 - include/linux/mlx5/qp.h | 45 -- 17 files changed, 699 insertions(+), 836 deletions(-) create mode 100644 drivers/infiniband/hw/mlx5/qp.h create mode 100644 drivers/infiniband/hw/mlx5/qpc.c delete mode 100644 drivers/net/ethernet/mellanox/mlx5/core/qp.c delete mode 100644 include/linux/mlx5/cmd.h (limited to 'include/linux') diff --git a/drivers/infiniband/hw/mlx5/Makefile b/drivers/infiniband/hw/mlx5/Makefile index 375b341be8a1..228be05fbaf8 100644 --- a/drivers/infiniband/hw/mlx5/Makefile +++ b/drivers/infiniband/hw/mlx5/Makefile @@ -13,6 +13,7 @@ mlx5_ib-y := ah.o \ mem.o \ mr.o \ qp.o \ + qpc.o \ restrack.o \ srq.o \ srq_cmd.o diff --git a/drivers/infiniband/hw/mlx5/cq.c b/drivers/infiniband/hw/mlx5/cq.c index 146ba2966744..32c05730dfe9 100644 --- a/drivers/infiniband/hw/mlx5/cq.c +++ b/drivers/infiniband/hw/mlx5/cq.c @@ -36,6 +36,7 @@ #include #include "mlx5_ib.h" #include "srq.h" +#include "qp.h" static void mlx5_ib_cq_comp(struct mlx5_core_cq *cq, struct mlx5_eqe *eqe) { @@ -484,7 +485,7 @@ repoll: * because CQs will be locked while QPs are removed * from the table. 
*/ - mqp = __mlx5_qp_lookup(dev->mdev, qpn); + mqp = radix_tree_lookup(&dev->qp_table.tree, qpn); *cur_qp = to_mibqp(mqp); } diff --git a/drivers/infiniband/hw/mlx5/devx.c b/drivers/infiniband/hw/mlx5/devx.c index 46e1ab771f10..35b98c2d64d5 100644 --- a/drivers/infiniband/hw/mlx5/devx.c +++ b/drivers/infiniband/hw/mlx5/devx.c @@ -14,6 +14,7 @@ #include #include #include "mlx5_ib.h" +#include "qp.h" #include #define UVERBS_MODULE_NAME mlx5_ib @@ -1356,7 +1357,7 @@ static int devx_obj_cleanup(struct ib_uobject *uobject, } if (obj->flags & DEVX_OBJ_FLAGS_DCT) - ret = mlx5_core_destroy_dct(obj->ib_dev->mdev, &obj->core_dct); + ret = mlx5_core_destroy_dct(obj->ib_dev, &obj->core_dct); else if (obj->flags & DEVX_OBJ_FLAGS_CQ) ret = mlx5_core_destroy_cq(obj->ib_dev->mdev, &obj->core_cq); else @@ -1450,9 +1451,8 @@ static int UVERBS_HANDLER(MLX5_IB_METHOD_DEVX_OBJ_CREATE)( if (opcode == MLX5_CMD_OP_CREATE_DCT) { obj->flags |= DEVX_OBJ_FLAGS_DCT; - err = mlx5_core_create_dct(dev->mdev, &obj->core_dct, - cmd_in, cmd_in_len, - cmd_out, cmd_out_len); + err = mlx5_core_create_dct(dev, &obj->core_dct, cmd_in, + cmd_in_len, cmd_out, cmd_out_len); } else if (opcode == MLX5_CMD_OP_CREATE_CQ) { obj->flags |= DEVX_OBJ_FLAGS_CQ; obj->core_cq.comp = devx_cq_comp; @@ -1499,7 +1499,7 @@ static int UVERBS_HANDLER(MLX5_IB_METHOD_DEVX_OBJ_CREATE)( obj_destroy: if (obj->flags & DEVX_OBJ_FLAGS_DCT) - mlx5_core_destroy_dct(obj->ib_dev->mdev, &obj->core_dct); + mlx5_core_destroy_dct(obj->ib_dev, &obj->core_dct); else if (obj->flags & DEVX_OBJ_FLAGS_CQ) mlx5_core_destroy_cq(obj->ib_dev->mdev, &obj->core_cq); else diff --git a/drivers/infiniband/hw/mlx5/main.c b/drivers/infiniband/hw/mlx5/main.c index 76ea756d846b..f10675213115 100644 --- a/drivers/infiniband/hw/mlx5/main.c +++ b/drivers/infiniband/hw/mlx5/main.c @@ -59,6 +59,7 @@ #include "ib_rep.h" #include "cmd.h" #include "srq.h" +#include "qp.h" #include #include #include @@ -4632,8 +4633,7 @@ static void delay_drop_handler(struct work_struct *work) atomic_inc(&delay_drop->events_cnt); mutex_lock(&delay_drop->lock); - err = mlx5_core_set_delay_drop(delay_drop->dev->mdev, - delay_drop->timeout); + err = mlx5_core_set_delay_drop(delay_drop->dev, delay_drop->timeout); if (err) { mlx5_ib_warn(delay_drop->dev, "Failed to set delay drop, timeout=%u\n", delay_drop->timeout); @@ -7193,6 +7193,9 @@ static const struct mlx5_ib_profile pf_profile = { STAGE_CREATE(MLX5_IB_STAGE_ROCE, mlx5_ib_stage_roce_init, mlx5_ib_stage_roce_cleanup), + STAGE_CREATE(MLX5_IB_STAGE_QP, + mlx5_init_qp_table, + mlx5_cleanup_qp_table), STAGE_CREATE(MLX5_IB_STAGE_SRQ, mlx5_init_srq_table, mlx5_cleanup_srq_table), @@ -7250,6 +7253,9 @@ const struct mlx5_ib_profile raw_eth_profile = { STAGE_CREATE(MLX5_IB_STAGE_ROCE, mlx5_ib_stage_raw_eth_roce_init, mlx5_ib_stage_raw_eth_roce_cleanup), + STAGE_CREATE(MLX5_IB_STAGE_QP, + mlx5_init_qp_table, + mlx5_cleanup_qp_table), STAGE_CREATE(MLX5_IB_STAGE_SRQ, mlx5_init_srq_table, mlx5_cleanup_srq_table), diff --git a/drivers/infiniband/hw/mlx5/mlx5_ib.h b/drivers/infiniband/hw/mlx5/mlx5_ib.h index cb2a021aa93c..aaabb8a98eed 100644 --- a/drivers/infiniband/hw/mlx5/mlx5_ib.h +++ b/drivers/infiniband/hw/mlx5/mlx5_ib.h @@ -869,6 +869,7 @@ enum mlx5_ib_stages { MLX5_IB_STAGE_CAPS, MLX5_IB_STAGE_NON_DEFAULT_CB, MLX5_IB_STAGE_ROCE, + MLX5_IB_STAGE_QP, MLX5_IB_STAGE_SRQ, MLX5_IB_STAGE_DEVICE_RESOURCES, MLX5_IB_STAGE_DEVICE_NOTIFIER, @@ -1064,6 +1065,7 @@ struct mlx5_ib_dev { struct mlx5_dm dm; u16 devx_whitelist_uid; struct mlx5_srq_table srq_table; + struct 
mlx5_qp_table qp_table; struct mlx5_async_ctx async_ctx; struct mlx5_devx_event_table devx_event_table; struct mlx5_var_table var_table; diff --git a/drivers/infiniband/hw/mlx5/odp.c b/drivers/infiniband/hw/mlx5/odp.c index 3de7606d4a1a..16af1105cfcf 100644 --- a/drivers/infiniband/hw/mlx5/odp.c +++ b/drivers/infiniband/hw/mlx5/odp.c @@ -36,6 +36,7 @@ #include "mlx5_ib.h" #include "cmd.h" +#include "qp.h" #include @@ -1219,7 +1220,7 @@ static inline struct mlx5_core_rsc_common *odp_get_rsc(struct mlx5_ib_dev *dev, case MLX5_WQE_PF_TYPE_REQ_SEND_OR_WRITE: case MLX5_WQE_PF_TYPE_RESP: case MLX5_WQE_PF_TYPE_REQ_READ_OR_ATOMIC: - common = mlx5_core_res_hold(dev->mdev, wq_num, MLX5_RES_QP); + common = mlx5_core_res_hold(dev, wq_num, MLX5_RES_QP); break; default: break; diff --git a/drivers/infiniband/hw/mlx5/qp.c b/drivers/infiniband/hw/mlx5/qp.c index 1456db4b6295..3ecd1864b3c8 100644 --- a/drivers/infiniband/hw/mlx5/qp.c +++ b/drivers/infiniband/hw/mlx5/qp.c @@ -39,6 +39,7 @@ #include "mlx5_ib.h" #include "ib_rep.h" #include "cmd.h" +#include "qp.h" /* not supported currently */ static int wq_signature; @@ -1336,7 +1337,7 @@ static int create_raw_packet_qp_sq(struct mlx5_ib_dev *dev, pas = (__be64 *)MLX5_ADDR_OF(wq, wq, pas); mlx5_ib_populate_pas(dev, sq->ubuffer.umem, page_shift, pas, 0); - err = mlx5_core_create_sq_tracked(dev->mdev, in, inlen, &sq->base.mqp); + err = mlx5_core_create_sq_tracked(dev, in, inlen, &sq->base.mqp); kvfree(in); @@ -1356,7 +1357,7 @@ static void destroy_raw_packet_qp_sq(struct mlx5_ib_dev *dev, struct mlx5_ib_sq *sq) { destroy_flow_rule_vport_sq(sq); - mlx5_core_destroy_sq_tracked(dev->mdev, &sq->base.mqp); + mlx5_core_destroy_sq_tracked(dev, &sq->base.mqp); ib_umem_release(sq->ubuffer.umem); } @@ -1426,7 +1427,7 @@ static int create_raw_packet_qp_rq(struct mlx5_ib_dev *dev, qp_pas = (__be64 *)MLX5_ADDR_OF(create_qp_in, qpin, pas); memcpy(pas, qp_pas, rq_pas_size); - err = mlx5_core_create_rq_tracked(dev->mdev, in, inlen, &rq->base.mqp); + err = mlx5_core_create_rq_tracked(dev, in, inlen, &rq->base.mqp); kvfree(in); @@ -1436,7 +1437,7 @@ static int create_raw_packet_qp_rq(struct mlx5_ib_dev *dev, static void destroy_raw_packet_qp_rq(struct mlx5_ib_dev *dev, struct mlx5_ib_rq *rq) { - mlx5_core_destroy_rq_tracked(dev->mdev, &rq->base.mqp); + mlx5_core_destroy_rq_tracked(dev, &rq->base.mqp); } static bool tunnel_offload_supported(struct mlx5_core_dev *dev) @@ -2347,7 +2348,7 @@ static int create_qp_common(struct mlx5_ib_dev *dev, struct ib_pd *pd, err = create_raw_packet_qp(dev, qp, in, inlen, pd, udata, &resp); } else { - err = mlx5_core_create_qp(dev->mdev, &base->mqp, in, inlen); + err = mlx5_core_create_qp(dev, &base->mqp, in, inlen); } if (err) { @@ -2513,8 +2514,7 @@ static void destroy_qp_common(struct mlx5_ib_dev *dev, struct mlx5_ib_qp *qp, if (qp->state != IB_QPS_RESET) { if (qp->ibqp.qp_type != IB_QPT_RAW_PACKET && !(qp->flags & MLX5_IB_QP_UNDERLAY)) { - err = mlx5_core_qp_modify(dev->mdev, - MLX5_CMD_OP_2RST_QP, 0, + err = mlx5_core_qp_modify(dev, MLX5_CMD_OP_2RST_QP, 0, NULL, &base->mqp); } else { struct mlx5_modify_raw_qp_param raw_qp_param = { @@ -2555,7 +2555,7 @@ static void destroy_qp_common(struct mlx5_ib_dev *dev, struct mlx5_ib_qp *qp, qp->flags & MLX5_IB_QP_UNDERLAY) { destroy_raw_packet_qp(dev, qp); } else { - err = mlx5_core_destroy_qp(dev->mdev, &base->mqp); + err = mlx5_core_destroy_qp(dev, &base->mqp); if (err) mlx5_ib_warn(dev, "failed to destroy QP 0x%x\n", base->mqp.qpn); @@ -2818,7 +2818,7 @@ static int mlx5_ib_destroy_dct(struct 
mlx5_ib_qp *mqp) if (mqp->state == IB_QPS_RTR) { int err; - err = mlx5_core_destroy_dct(dev->mdev, &mqp->dct.mdct); + err = mlx5_core_destroy_dct(dev, &mqp->dct.mdct); if (err) { mlx5_ib_warn(dev, "failed to destroy DCT %d\n", err); return err; @@ -3462,10 +3462,9 @@ static int __mlx5_ib_qp_set_counter(struct ib_qp *qp, base = &mqp->trans_qp.base; context.qp_counter_set_usr_page &= cpu_to_be32(0xffffff); context.qp_counter_set_usr_page |= cpu_to_be32(set_id << 24); - return mlx5_core_qp_modify(dev->mdev, - MLX5_CMD_OP_RTS2RTS_QP, - MLX5_QP_OPTPAR_COUNTER_SET_ID, - &context, &base->mqp); + return mlx5_core_qp_modify(dev, MLX5_CMD_OP_RTS2RTS_QP, + MLX5_QP_OPTPAR_COUNTER_SET_ID, &context, + &base->mqp); } static int __mlx5_ib_modify_qp(struct ib_qp *ibqp, @@ -3752,8 +3751,7 @@ static int __mlx5_ib_modify_qp(struct ib_qp *ibqp, err = modify_raw_packet_qp(dev, qp, &raw_qp_param, tx_affinity); } else { - err = mlx5_core_qp_modify(dev->mdev, op, optpar, context, - &base->mqp); + err = mlx5_core_qp_modify(dev, op, optpar, context, &base->mqp); } if (err) @@ -3927,7 +3925,7 @@ static int mlx5_ib_modify_dct(struct ib_qp *ibqp, struct ib_qp_attr *attr, MLX5_SET(dctc, dctc, my_addr_index, attr->ah_attr.grh.sgid_index); MLX5_SET(dctc, dctc, hop_limit, attr->ah_attr.grh.hop_limit); - err = mlx5_core_create_dct(dev->mdev, &qp->dct.mdct, qp->dct.in, + err = mlx5_core_create_dct(dev, &qp->dct.mdct, qp->dct.in, MLX5_ST_SZ_BYTES(create_dct_in), out, sizeof(out)); if (err) @@ -3935,7 +3933,7 @@ static int mlx5_ib_modify_dct(struct ib_qp *ibqp, struct ib_qp_attr *attr, resp.dctn = qp->dct.mdct.mqp.qpn; err = ib_copy_to_udata(udata, &resp, resp.response_length); if (err) { - mlx5_core_destroy_dct(dev->mdev, &qp->dct.mdct); + mlx5_core_destroy_dct(dev, &qp->dct.mdct); return err; } } else { @@ -5697,8 +5695,7 @@ static int query_qp_attr(struct mlx5_ib_dev *dev, struct mlx5_ib_qp *qp, if (!outb) return -ENOMEM; - err = mlx5_core_qp_query(dev->mdev, &qp->trans_qp.base.mqp, outb, - outlen); + err = mlx5_core_qp_query(dev, &qp->trans_qp.base.mqp, outb, outlen); if (err) goto out; @@ -5776,7 +5773,7 @@ static int mlx5_ib_dct_query_qp(struct mlx5_ib_dev *dev, struct mlx5_ib_qp *mqp, if (!out) return -ENOMEM; - err = mlx5_core_dct_query(dev->mdev, dct, out, outlen); + err = mlx5_core_dct_query(dev, dct, out, outlen); if (err) goto out; @@ -5962,7 +5959,7 @@ static int set_delay_drop(struct mlx5_ib_dev *dev) if (dev->delay_drop.activate) goto out; - err = mlx5_core_set_delay_drop(dev->mdev, dev->delay_drop.timeout); + err = mlx5_core_set_delay_drop(dev, dev->delay_drop.timeout); if (err) goto out; @@ -6068,13 +6065,13 @@ static int create_rq(struct mlx5_ib_rwq *rwq, struct ib_pd *pd, } rq_pas0 = (__be64 *)MLX5_ADDR_OF(wq, wq, pas); mlx5_ib_populate_pas(dev, rwq->umem, rwq->page_shift, rq_pas0, 0); - err = mlx5_core_create_rq_tracked(dev->mdev, in, inlen, &rwq->core_qp); + err = mlx5_core_create_rq_tracked(dev, in, inlen, &rwq->core_qp); if (!err && init_attr->create_flags & IB_WQ_FLAGS_DELAY_DROP) { err = set_delay_drop(dev); if (err) { mlx5_ib_warn(dev, "Failed to enable delay drop err=%d\n", err); - mlx5_core_destroy_rq_tracked(dev->mdev, &rwq->core_qp); + mlx5_core_destroy_rq_tracked(dev, &rwq->core_qp); } else { rwq->create_flags |= MLX5_IB_WQ_FLAGS_DELAY_DROP; } @@ -6256,7 +6253,7 @@ struct ib_wq *mlx5_ib_create_wq(struct ib_pd *pd, return &rwq->ibwq; err_copy: - mlx5_core_destroy_rq_tracked(dev->mdev, &rwq->core_qp); + mlx5_core_destroy_rq_tracked(dev, &rwq->core_qp); err_user_rq: destroy_user_rq(dev, pd, rwq, 
udata); err: @@ -6269,7 +6266,7 @@ void mlx5_ib_destroy_wq(struct ib_wq *wq, struct ib_udata *udata) struct mlx5_ib_dev *dev = to_mdev(wq->device); struct mlx5_ib_rwq *rwq = to_mrwq(wq); - mlx5_core_destroy_rq_tracked(dev->mdev, &rwq->core_qp); + mlx5_core_destroy_rq_tracked(dev, &rwq->core_qp); destroy_user_rq(dev, wq->pd, rwq, udata); kfree(rwq); } diff --git a/drivers/infiniband/hw/mlx5/qp.h b/drivers/infiniband/hw/mlx5/qp.h new file mode 100644 index 000000000000..ad9d76e3e18a --- /dev/null +++ b/drivers/infiniband/hw/mlx5/qp.h @@ -0,0 +1,46 @@ +/* SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB */ +/* + * Copyright (c) 2013-2020, Mellanox Technologies inc. All rights reserved. + */ + +#ifndef _MLX5_IB_QP_H +#define _MLX5_IB_QP_H + +#include "mlx5_ib.h" + +int mlx5_init_qp_table(struct mlx5_ib_dev *dev); +void mlx5_cleanup_qp_table(struct mlx5_ib_dev *dev); + +int mlx5_core_create_dct(struct mlx5_ib_dev *dev, struct mlx5_core_dct *qp, + u32 *in, int inlen, u32 *out, int outlen); +int mlx5_core_create_qp(struct mlx5_ib_dev *dev, struct mlx5_core_qp *qp, + u32 *in, int inlen); +int mlx5_core_qp_modify(struct mlx5_ib_dev *dev, u16 opcode, u32 opt_param_mask, + void *qpc, struct mlx5_core_qp *qp); +int mlx5_core_destroy_qp(struct mlx5_ib_dev *dev, struct mlx5_core_qp *qp); +int mlx5_core_destroy_dct(struct mlx5_ib_dev *dev, struct mlx5_core_dct *dct); +int mlx5_core_qp_query(struct mlx5_ib_dev *dev, struct mlx5_core_qp *qp, + u32 *out, int outlen); +int mlx5_core_dct_query(struct mlx5_ib_dev *dev, struct mlx5_core_dct *dct, + u32 *out, int outlen); + +int mlx5_core_set_delay_drop(struct mlx5_ib_dev *dev, u32 timeout_usec); + +void mlx5_core_destroy_rq_tracked(struct mlx5_ib_dev *dev, + struct mlx5_core_qp *rq); +int mlx5_core_create_sq_tracked(struct mlx5_ib_dev *dev, u32 *in, int inlen, + struct mlx5_core_qp *sq); +void mlx5_core_destroy_sq_tracked(struct mlx5_ib_dev *dev, + struct mlx5_core_qp *sq); + +int mlx5_core_create_rq_tracked(struct mlx5_ib_dev *dev, u32 *in, int inlen, + struct mlx5_core_qp *rq); + +struct mlx5_core_rsc_common *mlx5_core_res_hold(struct mlx5_ib_dev *dev, + int res_num, + enum mlx5_res_type res_type); +void mlx5_core_res_put(struct mlx5_core_rsc_common *res); + +int mlx5_core_xrcd_alloc(struct mlx5_ib_dev *dev, u32 *xrcdn); +int mlx5_core_xrcd_dealloc(struct mlx5_ib_dev *dev, u32 xrcdn); +#endif /* _MLX5_IB_QP_H */ diff --git a/drivers/infiniband/hw/mlx5/qpc.c b/drivers/infiniband/hw/mlx5/qpc.c new file mode 100644 index 000000000000..ea62735042f0 --- /dev/null +++ b/drivers/infiniband/hw/mlx5/qpc.c @@ -0,0 +1,605 @@ +// SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB +/* + * Copyright (c) 2013-2020, Mellanox Technologies inc. All rights reserved. 
+ */ + +#include +#include +#include +#include "mlx5_ib.h" +#include "qp.h" + +static int mlx5_core_drain_dct(struct mlx5_ib_dev *dev, + struct mlx5_core_dct *dct); + +static struct mlx5_core_rsc_common * +mlx5_get_rsc(struct mlx5_qp_table *table, u32 rsn) +{ + struct mlx5_core_rsc_common *common; + unsigned long flags; + + spin_lock_irqsave(&table->lock, flags); + + common = radix_tree_lookup(&table->tree, rsn); + if (common) + refcount_inc(&common->refcount); + + spin_unlock_irqrestore(&table->lock, flags); + + return common; +} + +void mlx5_core_put_rsc(struct mlx5_core_rsc_common *common) +{ + if (refcount_dec_and_test(&common->refcount)) + complete(&common->free); +} + +static u64 qp_allowed_event_types(void) +{ + u64 mask; + + mask = BIT(MLX5_EVENT_TYPE_PATH_MIG) | + BIT(MLX5_EVENT_TYPE_COMM_EST) | + BIT(MLX5_EVENT_TYPE_SQ_DRAINED) | + BIT(MLX5_EVENT_TYPE_SRQ_LAST_WQE) | + BIT(MLX5_EVENT_TYPE_WQ_CATAS_ERROR) | + BIT(MLX5_EVENT_TYPE_PATH_MIG_FAILED) | + BIT(MLX5_EVENT_TYPE_WQ_INVAL_REQ_ERROR) | + BIT(MLX5_EVENT_TYPE_WQ_ACCESS_ERROR); + + return mask; +} + +static u64 rq_allowed_event_types(void) +{ + u64 mask; + + mask = BIT(MLX5_EVENT_TYPE_SRQ_LAST_WQE) | + BIT(MLX5_EVENT_TYPE_WQ_CATAS_ERROR); + + return mask; +} + +static u64 sq_allowed_event_types(void) +{ + return BIT(MLX5_EVENT_TYPE_WQ_CATAS_ERROR); +} + +static u64 dct_allowed_event_types(void) +{ + return BIT(MLX5_EVENT_TYPE_DCT_DRAINED); +} + +static bool is_event_type_allowed(int rsc_type, int event_type) +{ + switch (rsc_type) { + case MLX5_EVENT_QUEUE_TYPE_QP: + return BIT(event_type) & qp_allowed_event_types(); + case MLX5_EVENT_QUEUE_TYPE_RQ: + return BIT(event_type) & rq_allowed_event_types(); + case MLX5_EVENT_QUEUE_TYPE_SQ: + return BIT(event_type) & sq_allowed_event_types(); + case MLX5_EVENT_QUEUE_TYPE_DCT: + return BIT(event_type) & dct_allowed_event_types(); + default: + WARN(1, "Event arrived for unknown resource type"); + return false; + } +} + +static int rsc_event_notifier(struct notifier_block *nb, + unsigned long type, void *data) +{ + struct mlx5_core_rsc_common *common; + struct mlx5_qp_table *table; + struct mlx5_core_dct *dct; + u8 event_type = (u8)type; + struct mlx5_core_qp *qp; + struct mlx5_eqe *eqe; + u32 rsn; + + switch (event_type) { + case MLX5_EVENT_TYPE_DCT_DRAINED: + eqe = data; + rsn = be32_to_cpu(eqe->data.dct.dctn) & 0xffffff; + rsn |= (MLX5_RES_DCT << MLX5_USER_INDEX_LEN); + break; + case MLX5_EVENT_TYPE_PATH_MIG: + case MLX5_EVENT_TYPE_COMM_EST: + case MLX5_EVENT_TYPE_SQ_DRAINED: + case MLX5_EVENT_TYPE_SRQ_LAST_WQE: + case MLX5_EVENT_TYPE_WQ_CATAS_ERROR: + case MLX5_EVENT_TYPE_PATH_MIG_FAILED: + case MLX5_EVENT_TYPE_WQ_INVAL_REQ_ERROR: + case MLX5_EVENT_TYPE_WQ_ACCESS_ERROR: + eqe = data; + rsn = be32_to_cpu(eqe->data.qp_srq.qp_srq_n) & 0xffffff; + rsn |= (eqe->data.qp_srq.type << MLX5_USER_INDEX_LEN); + break; + default: + return NOTIFY_DONE; + } + + table = container_of(nb, struct mlx5_qp_table, nb); + common = mlx5_get_rsc(table, rsn); + if (!common) + return NOTIFY_OK; + + if (!is_event_type_allowed((rsn >> MLX5_USER_INDEX_LEN), event_type)) + goto out; + + switch (common->res) { + case MLX5_RES_QP: + case MLX5_RES_RQ: + case MLX5_RES_SQ: + qp = (struct mlx5_core_qp *)common; + qp->event(qp, event_type); + break; + case MLX5_RES_DCT: + dct = (struct mlx5_core_dct *)common; + if (event_type == MLX5_EVENT_TYPE_DCT_DRAINED) + complete(&dct->drained); + break; + default: + break; + } +out: + mlx5_core_put_rsc(common); + + return NOTIFY_OK; +} + +static int create_resource_common(struct 
mlx5_ib_dev *dev, + struct mlx5_core_qp *qp, int rsc_type) +{ + struct mlx5_qp_table *table = &dev->qp_table; + int err; + + qp->common.res = rsc_type; + spin_lock_irq(&table->lock); + err = radix_tree_insert(&table->tree, + qp->qpn | (rsc_type << MLX5_USER_INDEX_LEN), + qp); + spin_unlock_irq(&table->lock); + if (err) + return err; + + refcount_set(&qp->common.refcount, 1); + init_completion(&qp->common.free); + qp->pid = current->pid; + + return 0; +} + +static void destroy_resource_common(struct mlx5_ib_dev *dev, + struct mlx5_core_qp *qp) +{ + struct mlx5_qp_table *table = &dev->qp_table; + unsigned long flags; + + spin_lock_irqsave(&table->lock, flags); + radix_tree_delete(&table->tree, + qp->qpn | (qp->common.res << MLX5_USER_INDEX_LEN)); + spin_unlock_irqrestore(&table->lock, flags); + mlx5_core_put_rsc((struct mlx5_core_rsc_common *)qp); + wait_for_completion(&qp->common.free); +} + +static int _mlx5_core_destroy_dct(struct mlx5_ib_dev *dev, + struct mlx5_core_dct *dct, bool need_cleanup) +{ + u32 in[MLX5_ST_SZ_DW(destroy_dct_in)] = {}; + struct mlx5_core_qp *qp = &dct->mqp; + int err; + + err = mlx5_core_drain_dct(dev, dct); + if (err) { + if (dev->mdev->state == MLX5_DEVICE_STATE_INTERNAL_ERROR) + goto destroy; + + return err; + } + wait_for_completion(&dct->drained); +destroy: + if (need_cleanup) + destroy_resource_common(dev, &dct->mqp); + MLX5_SET(destroy_dct_in, in, opcode, MLX5_CMD_OP_DESTROY_DCT); + MLX5_SET(destroy_dct_in, in, dctn, qp->qpn); + MLX5_SET(destroy_dct_in, in, uid, qp->uid); + err = mlx5_cmd_exec_in(dev->mdev, destroy_dct, in); + return err; +} + +int mlx5_core_create_dct(struct mlx5_ib_dev *dev, struct mlx5_core_dct *dct, + u32 *in, int inlen, u32 *out, int outlen) +{ + struct mlx5_core_qp *qp = &dct->mqp; + int err; + + init_completion(&dct->drained); + MLX5_SET(create_dct_in, in, opcode, MLX5_CMD_OP_CREATE_DCT); + + err = mlx5_cmd_exec(dev->mdev, in, inlen, out, outlen); + if (err) + return err; + + qp->qpn = MLX5_GET(create_dct_out, out, dctn); + qp->uid = MLX5_GET(create_dct_in, in, uid); + err = create_resource_common(dev, qp, MLX5_RES_DCT); + if (err) + goto err_cmd; + + return 0; +err_cmd: + _mlx5_core_destroy_dct(dev, dct, false); + return err; +} + +int mlx5_core_create_qp(struct mlx5_ib_dev *dev, struct mlx5_core_qp *qp, + u32 *in, int inlen) +{ + u32 out[MLX5_ST_SZ_DW(create_qp_out)] = {}; + u32 din[MLX5_ST_SZ_DW(destroy_qp_in)] = {}; + int err; + + MLX5_SET(create_qp_in, in, opcode, MLX5_CMD_OP_CREATE_QP); + + err = mlx5_cmd_exec(dev->mdev, in, inlen, out, sizeof(out)); + if (err) + return err; + + qp->uid = MLX5_GET(create_qp_in, in, uid); + qp->qpn = MLX5_GET(create_qp_out, out, qpn); + + err = create_resource_common(dev, qp, MLX5_RES_QP); + if (err) + goto err_cmd; + + mlx5_debug_qp_add(dev->mdev, qp); + + return 0; + +err_cmd: + MLX5_SET(destroy_qp_in, din, opcode, MLX5_CMD_OP_DESTROY_QP); + MLX5_SET(destroy_qp_in, din, qpn, qp->qpn); + MLX5_SET(destroy_qp_in, din, uid, qp->uid); + mlx5_cmd_exec_in(dev->mdev, destroy_qp, din); + return err; +} + +static int mlx5_core_drain_dct(struct mlx5_ib_dev *dev, + struct mlx5_core_dct *dct) +{ + u32 in[MLX5_ST_SZ_DW(drain_dct_in)] = {}; + struct mlx5_core_qp *qp = &dct->mqp; + + MLX5_SET(drain_dct_in, in, opcode, MLX5_CMD_OP_DRAIN_DCT); + MLX5_SET(drain_dct_in, in, dctn, qp->qpn); + MLX5_SET(drain_dct_in, in, uid, qp->uid); + return mlx5_cmd_exec_in(dev->mdev, drain_dct, in); +} + +int mlx5_core_destroy_dct(struct mlx5_ib_dev *dev, + struct mlx5_core_dct *dct) +{ + return _mlx5_core_destroy_dct(dev, dct, 
true); +} + +int mlx5_core_destroy_qp(struct mlx5_ib_dev *dev, struct mlx5_core_qp *qp) +{ + u32 in[MLX5_ST_SZ_DW(destroy_qp_in)] = {}; + + mlx5_debug_qp_remove(dev->mdev, qp); + + destroy_resource_common(dev, qp); + + MLX5_SET(destroy_qp_in, in, opcode, MLX5_CMD_OP_DESTROY_QP); + MLX5_SET(destroy_qp_in, in, qpn, qp->qpn); + MLX5_SET(destroy_qp_in, in, uid, qp->uid); + mlx5_cmd_exec_in(dev->mdev, destroy_qp, in); + return 0; +} + +int mlx5_core_set_delay_drop(struct mlx5_ib_dev *dev, + u32 timeout_usec) +{ + u32 in[MLX5_ST_SZ_DW(set_delay_drop_params_in)] = {}; + + MLX5_SET(set_delay_drop_params_in, in, opcode, + MLX5_CMD_OP_SET_DELAY_DROP_PARAMS); + MLX5_SET(set_delay_drop_params_in, in, delay_drop_timeout, + timeout_usec / 100); + return mlx5_cmd_exec_in(dev->mdev, set_delay_drop_params, in); +} + +struct mbox_info { + u32 *in; + u32 *out; + int inlen; + int outlen; +}; + +static int mbox_alloc(struct mbox_info *mbox, int inlen, int outlen) +{ + mbox->inlen = inlen; + mbox->outlen = outlen; + mbox->in = kzalloc(mbox->inlen, GFP_KERNEL); + mbox->out = kzalloc(mbox->outlen, GFP_KERNEL); + if (!mbox->in || !mbox->out) { + kfree(mbox->in); + kfree(mbox->out); + return -ENOMEM; + } + + return 0; +} + +static void mbox_free(struct mbox_info *mbox) +{ + kfree(mbox->in); + kfree(mbox->out); +} + +static int modify_qp_mbox_alloc(struct mlx5_core_dev *dev, u16 opcode, int qpn, + u32 opt_param_mask, void *qpc, + struct mbox_info *mbox, u16 uid) +{ + mbox->out = NULL; + mbox->in = NULL; + +#define MBOX_ALLOC(mbox, typ) \ + mbox_alloc(mbox, MLX5_ST_SZ_BYTES(typ##_in), MLX5_ST_SZ_BYTES(typ##_out)) + +#define MOD_QP_IN_SET(typ, in, _opcode, _qpn, _uid) \ + do { \ + MLX5_SET(typ##_in, in, opcode, _opcode); \ + MLX5_SET(typ##_in, in, qpn, _qpn); \ + MLX5_SET(typ##_in, in, uid, _uid); \ + } while (0) + +#define MOD_QP_IN_SET_QPC(typ, in, _opcode, _qpn, _opt_p, _qpc, _uid) \ + do { \ + MOD_QP_IN_SET(typ, in, _opcode, _qpn, _uid); \ + MLX5_SET(typ##_in, in, opt_param_mask, _opt_p); \ + memcpy(MLX5_ADDR_OF(typ##_in, in, qpc), _qpc, \ + MLX5_ST_SZ_BYTES(qpc)); \ + } while (0) + + switch (opcode) { + /* 2RST & 2ERR */ + case MLX5_CMD_OP_2RST_QP: + if (MBOX_ALLOC(mbox, qp_2rst)) + return -ENOMEM; + MOD_QP_IN_SET(qp_2rst, mbox->in, opcode, qpn, uid); + break; + case MLX5_CMD_OP_2ERR_QP: + if (MBOX_ALLOC(mbox, qp_2err)) + return -ENOMEM; + MOD_QP_IN_SET(qp_2err, mbox->in, opcode, qpn, uid); + break; + + /* MODIFY with QPC */ + case MLX5_CMD_OP_RST2INIT_QP: + if (MBOX_ALLOC(mbox, rst2init_qp)) + return -ENOMEM; + MOD_QP_IN_SET_QPC(rst2init_qp, mbox->in, opcode, qpn, + opt_param_mask, qpc, uid); + break; + case MLX5_CMD_OP_INIT2RTR_QP: + if (MBOX_ALLOC(mbox, init2rtr_qp)) + return -ENOMEM; + MOD_QP_IN_SET_QPC(init2rtr_qp, mbox->in, opcode, qpn, + opt_param_mask, qpc, uid); + break; + case MLX5_CMD_OP_RTR2RTS_QP: + if (MBOX_ALLOC(mbox, rtr2rts_qp)) + return -ENOMEM; + MOD_QP_IN_SET_QPC(rtr2rts_qp, mbox->in, opcode, qpn, + opt_param_mask, qpc, uid); + break; + case MLX5_CMD_OP_RTS2RTS_QP: + if (MBOX_ALLOC(mbox, rts2rts_qp)) + return -ENOMEM; + MOD_QP_IN_SET_QPC(rts2rts_qp, mbox->in, opcode, qpn, + opt_param_mask, qpc, uid); + break; + case MLX5_CMD_OP_SQERR2RTS_QP: + if (MBOX_ALLOC(mbox, sqerr2rts_qp)) + return -ENOMEM; + MOD_QP_IN_SET_QPC(sqerr2rts_qp, mbox->in, opcode, qpn, + opt_param_mask, qpc, uid); + break; + case MLX5_CMD_OP_INIT2INIT_QP: + if (MBOX_ALLOC(mbox, init2init_qp)) + return -ENOMEM; + MOD_QP_IN_SET_QPC(init2init_qp, mbox->in, opcode, qpn, + opt_param_mask, qpc, uid); + break; + default: + return 
-EINVAL; + } + return 0; +} + +int mlx5_core_qp_modify(struct mlx5_ib_dev *dev, u16 opcode, u32 opt_param_mask, + void *qpc, struct mlx5_core_qp *qp) +{ + struct mbox_info mbox; + int err; + + err = modify_qp_mbox_alloc(dev->mdev, opcode, qp->qpn, + opt_param_mask, qpc, &mbox, qp->uid); + if (err) + return err; + + err = mlx5_cmd_exec(dev->mdev, mbox.in, mbox.inlen, mbox.out, + mbox.outlen); + mbox_free(&mbox); + return err; +} + +int mlx5_init_qp_table(struct mlx5_ib_dev *dev) +{ + struct mlx5_qp_table *table = &dev->qp_table; + + spin_lock_init(&table->lock); + INIT_RADIX_TREE(&table->tree, GFP_ATOMIC); + mlx5_qp_debugfs_init(dev->mdev); + + table->nb.notifier_call = rsc_event_notifier; + mlx5_notifier_register(dev->mdev, &table->nb); + + return 0; +} + +void mlx5_cleanup_qp_table(struct mlx5_ib_dev *dev) +{ + struct mlx5_qp_table *table = &dev->qp_table; + + mlx5_notifier_unregister(dev->mdev, &table->nb); + mlx5_qp_debugfs_cleanup(dev->mdev); +} + +int mlx5_core_qp_query(struct mlx5_ib_dev *dev, struct mlx5_core_qp *qp, + u32 *out, int outlen) +{ + u32 in[MLX5_ST_SZ_DW(query_qp_in)] = {}; + + MLX5_SET(query_qp_in, in, opcode, MLX5_CMD_OP_QUERY_QP); + MLX5_SET(query_qp_in, in, qpn, qp->qpn); + return mlx5_cmd_exec(dev->mdev, in, sizeof(in), out, outlen); +} + +int mlx5_core_dct_query(struct mlx5_ib_dev *dev, struct mlx5_core_dct *dct, + u32 *out, int outlen) +{ + u32 in[MLX5_ST_SZ_DW(query_dct_in)] = {}; + struct mlx5_core_qp *qp = &dct->mqp; + + MLX5_SET(query_dct_in, in, opcode, MLX5_CMD_OP_QUERY_DCT); + MLX5_SET(query_dct_in, in, dctn, qp->qpn); + + return mlx5_cmd_exec(dev->mdev, (void *)&in, sizeof(in), (void *)out, + outlen); +} + +int mlx5_core_xrcd_alloc(struct mlx5_ib_dev *dev, u32 *xrcdn) +{ + u32 out[MLX5_ST_SZ_DW(alloc_xrcd_out)] = {}; + u32 in[MLX5_ST_SZ_DW(alloc_xrcd_in)] = {}; + int err; + + MLX5_SET(alloc_xrcd_in, in, opcode, MLX5_CMD_OP_ALLOC_XRCD); + err = mlx5_cmd_exec_inout(dev->mdev, alloc_xrcd, in, out); + if (!err) + *xrcdn = MLX5_GET(alloc_xrcd_out, out, xrcd); + return err; +} + +int mlx5_core_xrcd_dealloc(struct mlx5_ib_dev *dev, u32 xrcdn) +{ + u32 in[MLX5_ST_SZ_DW(dealloc_xrcd_in)] = {}; + + MLX5_SET(dealloc_xrcd_in, in, opcode, MLX5_CMD_OP_DEALLOC_XRCD); + MLX5_SET(dealloc_xrcd_in, in, xrcd, xrcdn); + return mlx5_cmd_exec_in(dev->mdev, dealloc_xrcd, in); +} + +static void destroy_rq_tracked(struct mlx5_ib_dev *dev, u32 rqn, u16 uid) +{ + u32 in[MLX5_ST_SZ_DW(destroy_rq_in)] = {}; + + MLX5_SET(destroy_rq_in, in, opcode, MLX5_CMD_OP_DESTROY_RQ); + MLX5_SET(destroy_rq_in, in, rqn, rqn); + MLX5_SET(destroy_rq_in, in, uid, uid); + mlx5_cmd_exec_in(dev->mdev, destroy_rq, in); +} + +int mlx5_core_create_rq_tracked(struct mlx5_ib_dev *dev, u32 *in, int inlen, + struct mlx5_core_qp *rq) +{ + int err; + u32 rqn; + + err = mlx5_core_create_rq(dev->mdev, in, inlen, &rqn); + if (err) + return err; + + rq->uid = MLX5_GET(create_rq_in, in, uid); + rq->qpn = rqn; + err = create_resource_common(dev, rq, MLX5_RES_RQ); + if (err) + goto err_destroy_rq; + + return 0; + +err_destroy_rq: + destroy_rq_tracked(dev, rq->qpn, rq->uid); + + return err; +} + +void mlx5_core_destroy_rq_tracked(struct mlx5_ib_dev *dev, + struct mlx5_core_qp *rq) +{ + destroy_resource_common(dev, rq); + destroy_rq_tracked(dev, rq->qpn, rq->uid); +} + +static void destroy_sq_tracked(struct mlx5_ib_dev *dev, u32 sqn, u16 uid) +{ + u32 in[MLX5_ST_SZ_DW(destroy_sq_in)] = {}; + + MLX5_SET(destroy_sq_in, in, opcode, MLX5_CMD_OP_DESTROY_SQ); + MLX5_SET(destroy_sq_in, in, sqn, sqn); + MLX5_SET(destroy_sq_in, in, 
uid, uid); + mlx5_cmd_exec_in(dev->mdev, destroy_sq, in); +} + +int mlx5_core_create_sq_tracked(struct mlx5_ib_dev *dev, u32 *in, int inlen, + struct mlx5_core_qp *sq) +{ + u32 out[MLX5_ST_SZ_DW(create_sq_out)] = {}; + int err; + + MLX5_SET(create_sq_in, in, opcode, MLX5_CMD_OP_CREATE_SQ); + err = mlx5_cmd_exec(dev->mdev, in, inlen, out, sizeof(out)); + if (err) + return err; + + sq->qpn = MLX5_GET(create_sq_out, out, sqn); + sq->uid = MLX5_GET(create_sq_in, in, uid); + err = create_resource_common(dev, sq, MLX5_RES_SQ); + if (err) + goto err_destroy_sq; + + return 0; + +err_destroy_sq: + destroy_sq_tracked(dev, sq->qpn, sq->uid); + + return err; +} + +void mlx5_core_destroy_sq_tracked(struct mlx5_ib_dev *dev, + struct mlx5_core_qp *sq) +{ + destroy_resource_common(dev, sq); + destroy_sq_tracked(dev, sq->qpn, sq->uid); +} + +struct mlx5_core_rsc_common *mlx5_core_res_hold(struct mlx5_ib_dev *dev, + int res_num, + enum mlx5_res_type res_type) +{ + u32 rsn = res_num | (res_type << MLX5_USER_INDEX_LEN); + struct mlx5_qp_table *table = &dev->qp_table; + + return mlx5_get_rsc(table, rsn); +} + +void mlx5_core_res_put(struct mlx5_core_rsc_common *res) +{ + mlx5_core_put_rsc(res); +} diff --git a/drivers/infiniband/hw/mlx5/srq_cmd.c b/drivers/infiniband/hw/mlx5/srq_cmd.c index 88c0388f9fc6..c851570791af 100644 --- a/drivers/infiniband/hw/mlx5/srq_cmd.c +++ b/drivers/infiniband/hw/mlx5/srq_cmd.c @@ -7,6 +7,7 @@ #include #include "mlx5_ib.h" #include "srq.h" +#include "qp.h" static int get_pas_size(struct mlx5_srq_attr *in) { diff --git a/drivers/net/ethernet/mellanox/mlx5/core/Makefile b/drivers/net/ethernet/mellanox/mlx5/core/Makefile index 6d32915000fc..d3c7dbd7f1d5 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/Makefile +++ b/drivers/net/ethernet/mellanox/mlx5/core/Makefile @@ -12,7 +12,7 @@ obj-$(CONFIG_MLX5_CORE) += mlx5_core.o # mlx5 core basic # mlx5_core-y := main.o cmd.o debugfs.o fw.o eq.o uar.o pagealloc.o \ - health.o mcg.o cq.o alloc.o qp.o port.o mr.o pd.o \ + health.o mcg.o cq.o alloc.o port.o mr.o pd.o \ transobj.o vport.o sriov.o fs_cmd.o fs_core.o pci_irq.o \ fs_counters.o rl.o lag.o dev.o events.o wq.o lib/gid.o \ lib/devcom.o lib/pci_vsc.o lib/dm.o diag/fs_tracepoint.o \ diff --git a/drivers/net/ethernet/mellanox/mlx5/core/debugfs.c b/drivers/net/ethernet/mellanox/mlx5/core/debugfs.c index d40c3d5bd496..65fef5a86644 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/debugfs.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/debugfs.c @@ -101,15 +101,15 @@ void mlx5_unregister_debugfs(void) void mlx5_qp_debugfs_init(struct mlx5_core_dev *dev) { - atomic_set(&dev->num_qps, 0); - dev->priv.qp_debugfs = debugfs_create_dir("QPs", dev->priv.dbg_root); } +EXPORT_SYMBOL(mlx5_qp_debugfs_init); void mlx5_qp_debugfs_cleanup(struct mlx5_core_dev *dev) { debugfs_remove_recursive(dev->priv.qp_debugfs); } +EXPORT_SYMBOL(mlx5_qp_debugfs_cleanup); void mlx5_eq_debugfs_init(struct mlx5_core_dev *dev) { @@ -450,6 +450,7 @@ int mlx5_debug_qp_add(struct mlx5_core_dev *dev, struct mlx5_core_qp *qp) return err; } +EXPORT_SYMBOL(mlx5_debug_qp_add); void mlx5_debug_qp_remove(struct mlx5_core_dev *dev, struct mlx5_core_qp *qp) { @@ -459,6 +460,7 @@ void mlx5_debug_qp_remove(struct mlx5_core_dev *dev, struct mlx5_core_qp *qp) if (qp->dbg) rem_res_tree(qp->dbg); } +EXPORT_SYMBOL(mlx5_debug_qp_remove); int mlx5_debug_eq_add(struct mlx5_core_dev *dev, struct mlx5_eq *eq) { diff --git a/drivers/net/ethernet/mellanox/mlx5/core/main.c b/drivers/net/ethernet/mellanox/mlx5/core/main.c index 
7af4210c1b96..6e19fa4d1310 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/main.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/main.c @@ -836,8 +836,6 @@ static int mlx5_init_once(struct mlx5_core_dev *dev) mlx5_cq_debugfs_init(dev); - mlx5_init_qp_table(dev); - mlx5_init_reserved_gids(dev); mlx5_init_clock(dev); @@ -896,7 +894,6 @@ err_rl_cleanup: err_tables_cleanup: mlx5_geneve_destroy(dev->geneve); mlx5_vxlan_destroy(dev->vxlan); - mlx5_cleanup_qp_table(dev); mlx5_cq_debugfs_cleanup(dev); mlx5_events_cleanup(dev); err_eq_cleanup: @@ -924,7 +921,6 @@ static void mlx5_cleanup_once(struct mlx5_core_dev *dev) mlx5_vxlan_destroy(dev->vxlan); mlx5_cleanup_clock(dev); mlx5_cleanup_reserved_gids(dev); - mlx5_cleanup_qp_table(dev); mlx5_cq_debugfs_cleanup(dev); mlx5_events_cleanup(dev); mlx5_eq_table_cleanup(dev); diff --git a/drivers/net/ethernet/mellanox/mlx5/core/qp.c b/drivers/net/ethernet/mellanox/mlx5/core/qp.c deleted file mode 100644 index d9df3a5dd532..000000000000 --- a/drivers/net/ethernet/mellanox/mlx5/core/qp.c +++ /dev/null @@ -1,697 +0,0 @@ -/* - * Copyright (c) 2013-2015, Mellanox Technologies. All rights reserved. - * - * This software is available to you under a choice of one of two - * licenses. You may choose to be licensed under the terms of the GNU - * General Public License (GPL) Version 2, available from the file - * COPYING in the main directory of this source tree, or the - * OpenIB.org BSD license below: - * - * Redistribution and use in source and binary forms, with or - * without modification, are permitted provided that the following - * conditions are met: - * - * - Redistributions of source code must retain the above - * copyright notice, this list of conditions and the following - * disclaimer. - * - * - Redistributions in binary form must reproduce the above - * copyright notice, this list of conditions and the following - * disclaimer in the documentation and/or other materials - * provided with the distribution. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF - * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS - * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN - * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN - * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. 
- */ - -#include -#include -#include -#include -#include - -#include "mlx5_core.h" -#include "lib/eq.h" - -static int mlx5_core_drain_dct(struct mlx5_core_dev *dev, - struct mlx5_core_dct *dct); - -static struct mlx5_core_rsc_common * -mlx5_get_rsc(struct mlx5_qp_table *table, u32 rsn) -{ - struct mlx5_core_rsc_common *common; - unsigned long flags; - - spin_lock_irqsave(&table->lock, flags); - - common = radix_tree_lookup(&table->tree, rsn); - if (common) - refcount_inc(&common->refcount); - - spin_unlock_irqrestore(&table->lock, flags); - - return common; -} - -void mlx5_core_put_rsc(struct mlx5_core_rsc_common *common) -{ - if (refcount_dec_and_test(&common->refcount)) - complete(&common->free); -} - -static u64 qp_allowed_event_types(void) -{ - u64 mask; - - mask = BIT(MLX5_EVENT_TYPE_PATH_MIG) | - BIT(MLX5_EVENT_TYPE_COMM_EST) | - BIT(MLX5_EVENT_TYPE_SQ_DRAINED) | - BIT(MLX5_EVENT_TYPE_SRQ_LAST_WQE) | - BIT(MLX5_EVENT_TYPE_WQ_CATAS_ERROR) | - BIT(MLX5_EVENT_TYPE_PATH_MIG_FAILED) | - BIT(MLX5_EVENT_TYPE_WQ_INVAL_REQ_ERROR) | - BIT(MLX5_EVENT_TYPE_WQ_ACCESS_ERROR); - - return mask; -} - -static u64 rq_allowed_event_types(void) -{ - u64 mask; - - mask = BIT(MLX5_EVENT_TYPE_SRQ_LAST_WQE) | - BIT(MLX5_EVENT_TYPE_WQ_CATAS_ERROR); - - return mask; -} - -static u64 sq_allowed_event_types(void) -{ - return BIT(MLX5_EVENT_TYPE_WQ_CATAS_ERROR); -} - -static u64 dct_allowed_event_types(void) -{ - return BIT(MLX5_EVENT_TYPE_DCT_DRAINED); -} - -static bool is_event_type_allowed(int rsc_type, int event_type) -{ - switch (rsc_type) { - case MLX5_EVENT_QUEUE_TYPE_QP: - return BIT(event_type) & qp_allowed_event_types(); - case MLX5_EVENT_QUEUE_TYPE_RQ: - return BIT(event_type) & rq_allowed_event_types(); - case MLX5_EVENT_QUEUE_TYPE_SQ: - return BIT(event_type) & sq_allowed_event_types(); - case MLX5_EVENT_QUEUE_TYPE_DCT: - return BIT(event_type) & dct_allowed_event_types(); - default: - WARN(1, "Event arrived for unknown resource type"); - return false; - } -} - -static int rsc_event_notifier(struct notifier_block *nb, - unsigned long type, void *data) -{ - struct mlx5_core_rsc_common *common; - struct mlx5_qp_table *table; - struct mlx5_core_dev *dev; - struct mlx5_core_dct *dct; - u8 event_type = (u8)type; - struct mlx5_core_qp *qp; - struct mlx5_priv *priv; - struct mlx5_eqe *eqe; - u32 rsn; - - switch (event_type) { - case MLX5_EVENT_TYPE_DCT_DRAINED: - eqe = data; - rsn = be32_to_cpu(eqe->data.dct.dctn) & 0xffffff; - rsn |= (MLX5_RES_DCT << MLX5_USER_INDEX_LEN); - break; - case MLX5_EVENT_TYPE_PATH_MIG: - case MLX5_EVENT_TYPE_COMM_EST: - case MLX5_EVENT_TYPE_SQ_DRAINED: - case MLX5_EVENT_TYPE_SRQ_LAST_WQE: - case MLX5_EVENT_TYPE_WQ_CATAS_ERROR: - case MLX5_EVENT_TYPE_PATH_MIG_FAILED: - case MLX5_EVENT_TYPE_WQ_INVAL_REQ_ERROR: - case MLX5_EVENT_TYPE_WQ_ACCESS_ERROR: - eqe = data; - rsn = be32_to_cpu(eqe->data.qp_srq.qp_srq_n) & 0xffffff; - rsn |= (eqe->data.qp_srq.type << MLX5_USER_INDEX_LEN); - break; - default: - return NOTIFY_DONE; - } - - table = container_of(nb, struct mlx5_qp_table, nb); - priv = container_of(table, struct mlx5_priv, qp_table); - dev = container_of(priv, struct mlx5_core_dev, priv); - - mlx5_core_dbg(dev, "event (%d) arrived on resource 0x%x\n", eqe->type, rsn); - - common = mlx5_get_rsc(table, rsn); - if (!common) { - mlx5_core_dbg(dev, "Async event for unknown resource 0x%x\n", rsn); - return NOTIFY_OK; - } - - if (!is_event_type_allowed((rsn >> MLX5_USER_INDEX_LEN), event_type)) { - mlx5_core_warn(dev, "event 0x%.2x is not allowed on resource 0x%.8x\n", - event_type, 
rsn); - goto out; - } - - switch (common->res) { - case MLX5_RES_QP: - case MLX5_RES_RQ: - case MLX5_RES_SQ: - qp = (struct mlx5_core_qp *)common; - qp->event(qp, event_type); - break; - case MLX5_RES_DCT: - dct = (struct mlx5_core_dct *)common; - if (event_type == MLX5_EVENT_TYPE_DCT_DRAINED) - complete(&dct->drained); - break; - default: - mlx5_core_warn(dev, "invalid resource type for 0x%x\n", rsn); - } -out: - mlx5_core_put_rsc(common); - - return NOTIFY_OK; -} - -static int create_resource_common(struct mlx5_core_dev *dev, - struct mlx5_core_qp *qp, - int rsc_type) -{ - struct mlx5_qp_table *table = &dev->priv.qp_table; - int err; - - qp->common.res = rsc_type; - spin_lock_irq(&table->lock); - err = radix_tree_insert(&table->tree, - qp->qpn | (rsc_type << MLX5_USER_INDEX_LEN), - qp); - spin_unlock_irq(&table->lock); - if (err) - return err; - - refcount_set(&qp->common.refcount, 1); - init_completion(&qp->common.free); - qp->pid = current->pid; - - return 0; -} - -static void destroy_resource_common(struct mlx5_core_dev *dev, - struct mlx5_core_qp *qp) -{ - struct mlx5_qp_table *table = &dev->priv.qp_table; - unsigned long flags; - - spin_lock_irqsave(&table->lock, flags); - radix_tree_delete(&table->tree, - qp->qpn | (qp->common.res << MLX5_USER_INDEX_LEN)); - spin_unlock_irqrestore(&table->lock, flags); - mlx5_core_put_rsc((struct mlx5_core_rsc_common *)qp); - wait_for_completion(&qp->common.free); -} - -static int _mlx5_core_destroy_dct(struct mlx5_core_dev *dev, - struct mlx5_core_dct *dct, bool need_cleanup) -{ - u32 out[MLX5_ST_SZ_DW(destroy_dct_out)] = {0}; - u32 in[MLX5_ST_SZ_DW(destroy_dct_in)] = {0}; - struct mlx5_core_qp *qp = &dct->mqp; - int err; - - err = mlx5_core_drain_dct(dev, dct); - if (err) { - if (dev->state == MLX5_DEVICE_STATE_INTERNAL_ERROR) { - goto destroy; - } else { - mlx5_core_warn( - dev, "failed drain DCT 0x%x with error 0x%x\n", - qp->qpn, err); - return err; - } - } - wait_for_completion(&dct->drained); -destroy: - if (need_cleanup) - destroy_resource_common(dev, &dct->mqp); - MLX5_SET(destroy_dct_in, in, opcode, MLX5_CMD_OP_DESTROY_DCT); - MLX5_SET(destroy_dct_in, in, dctn, qp->qpn); - MLX5_SET(destroy_dct_in, in, uid, qp->uid); - err = mlx5_cmd_exec(dev, (void *)&in, sizeof(in), - (void *)&out, sizeof(out)); - return err; -} - -int mlx5_core_create_dct(struct mlx5_core_dev *dev, - struct mlx5_core_dct *dct, - u32 *in, int inlen, - u32 *out, int outlen) -{ - struct mlx5_core_qp *qp = &dct->mqp; - int err; - - init_completion(&dct->drained); - MLX5_SET(create_dct_in, in, opcode, MLX5_CMD_OP_CREATE_DCT); - - err = mlx5_cmd_exec(dev, in, inlen, out, outlen); - if (err) { - mlx5_core_warn(dev, "create DCT failed, ret %d\n", err); - return err; - } - - qp->qpn = MLX5_GET(create_dct_out, out, dctn); - qp->uid = MLX5_GET(create_dct_in, in, uid); - err = create_resource_common(dev, qp, MLX5_RES_DCT); - if (err) - goto err_cmd; - - return 0; -err_cmd: - _mlx5_core_destroy_dct(dev, dct, false); - return err; -} -EXPORT_SYMBOL_GPL(mlx5_core_create_dct); - -int mlx5_core_create_qp(struct mlx5_core_dev *dev, - struct mlx5_core_qp *qp, - u32 *in, int inlen) -{ - u32 out[MLX5_ST_SZ_DW(create_qp_out)] = {0}; - u32 dout[MLX5_ST_SZ_DW(destroy_qp_out)]; - u32 din[MLX5_ST_SZ_DW(destroy_qp_in)]; - int err; - - MLX5_SET(create_qp_in, in, opcode, MLX5_CMD_OP_CREATE_QP); - - err = mlx5_cmd_exec(dev, in, inlen, out, sizeof(out)); - if (err) - return err; - - qp->uid = MLX5_GET(create_qp_in, in, uid); - qp->qpn = MLX5_GET(create_qp_out, out, qpn); - mlx5_core_dbg(dev, "qpn = 
0x%x\n", qp->qpn); - - err = create_resource_common(dev, qp, MLX5_RES_QP); - if (err) - goto err_cmd; - - err = mlx5_debug_qp_add(dev, qp); - if (err) - mlx5_core_dbg(dev, "failed adding QP 0x%x to debug file system\n", - qp->qpn); - - atomic_inc(&dev->num_qps); - - return 0; - -err_cmd: - memset(din, 0, sizeof(din)); - memset(dout, 0, sizeof(dout)); - MLX5_SET(destroy_qp_in, din, opcode, MLX5_CMD_OP_DESTROY_QP); - MLX5_SET(destroy_qp_in, din, qpn, qp->qpn); - MLX5_SET(destroy_qp_in, din, uid, qp->uid); - mlx5_cmd_exec(dev, din, sizeof(din), dout, sizeof(dout)); - return err; -} -EXPORT_SYMBOL_GPL(mlx5_core_create_qp); - -static int mlx5_core_drain_dct(struct mlx5_core_dev *dev, - struct mlx5_core_dct *dct) -{ - u32 out[MLX5_ST_SZ_DW(drain_dct_out)] = {0}; - u32 in[MLX5_ST_SZ_DW(drain_dct_in)] = {0}; - struct mlx5_core_qp *qp = &dct->mqp; - - MLX5_SET(drain_dct_in, in, opcode, MLX5_CMD_OP_DRAIN_DCT); - MLX5_SET(drain_dct_in, in, dctn, qp->qpn); - MLX5_SET(drain_dct_in, in, uid, qp->uid); - return mlx5_cmd_exec(dev, (void *)&in, sizeof(in), - (void *)&out, sizeof(out)); -} - -int mlx5_core_destroy_dct(struct mlx5_core_dev *dev, - struct mlx5_core_dct *dct) -{ - return _mlx5_core_destroy_dct(dev, dct, true); -} -EXPORT_SYMBOL_GPL(mlx5_core_destroy_dct); - -int mlx5_core_destroy_qp(struct mlx5_core_dev *dev, - struct mlx5_core_qp *qp) -{ - u32 out[MLX5_ST_SZ_DW(destroy_qp_out)] = {0}; - u32 in[MLX5_ST_SZ_DW(destroy_qp_in)] = {0}; - int err; - - mlx5_debug_qp_remove(dev, qp); - - destroy_resource_common(dev, qp); - - MLX5_SET(destroy_qp_in, in, opcode, MLX5_CMD_OP_DESTROY_QP); - MLX5_SET(destroy_qp_in, in, qpn, qp->qpn); - MLX5_SET(destroy_qp_in, in, uid, qp->uid); - err = mlx5_cmd_exec(dev, in, sizeof(in), out, sizeof(out)); - if (err) - return err; - - atomic_dec(&dev->num_qps); - return 0; -} -EXPORT_SYMBOL_GPL(mlx5_core_destroy_qp); - -int mlx5_core_set_delay_drop(struct mlx5_core_dev *dev, - u32 timeout_usec) -{ - u32 out[MLX5_ST_SZ_DW(set_delay_drop_params_out)] = {0}; - u32 in[MLX5_ST_SZ_DW(set_delay_drop_params_in)] = {0}; - - MLX5_SET(set_delay_drop_params_in, in, opcode, - MLX5_CMD_OP_SET_DELAY_DROP_PARAMS); - MLX5_SET(set_delay_drop_params_in, in, delay_drop_timeout, - timeout_usec / 100); - return mlx5_cmd_exec(dev, in, sizeof(in), out, sizeof(out)); -} -EXPORT_SYMBOL_GPL(mlx5_core_set_delay_drop); - -struct mbox_info { - u32 *in; - u32 *out; - int inlen; - int outlen; -}; - -static int mbox_alloc(struct mbox_info *mbox, int inlen, int outlen) -{ - mbox->inlen = inlen; - mbox->outlen = outlen; - mbox->in = kzalloc(mbox->inlen, GFP_KERNEL); - mbox->out = kzalloc(mbox->outlen, GFP_KERNEL); - if (!mbox->in || !mbox->out) { - kfree(mbox->in); - kfree(mbox->out); - return -ENOMEM; - } - - return 0; -} - -static void mbox_free(struct mbox_info *mbox) -{ - kfree(mbox->in); - kfree(mbox->out); -} - -static int modify_qp_mbox_alloc(struct mlx5_core_dev *dev, u16 opcode, int qpn, - u32 opt_param_mask, void *qpc, - struct mbox_info *mbox, u16 uid) -{ - mbox->out = NULL; - mbox->in = NULL; - -#define MBOX_ALLOC(mbox, typ) \ - mbox_alloc(mbox, MLX5_ST_SZ_BYTES(typ##_in), MLX5_ST_SZ_BYTES(typ##_out)) - -#define MOD_QP_IN_SET(typ, in, _opcode, _qpn, _uid) \ - do { \ - MLX5_SET(typ##_in, in, opcode, _opcode); \ - MLX5_SET(typ##_in, in, qpn, _qpn); \ - MLX5_SET(typ##_in, in, uid, _uid); \ - } while (0) - -#define MOD_QP_IN_SET_QPC(typ, in, _opcode, _qpn, _opt_p, _qpc, _uid) \ - do { \ - MOD_QP_IN_SET(typ, in, _opcode, _qpn, _uid); \ - MLX5_SET(typ##_in, in, opt_param_mask, _opt_p); \ - 
memcpy(MLX5_ADDR_OF(typ##_in, in, qpc), _qpc, \ - MLX5_ST_SZ_BYTES(qpc)); \ - } while (0) - - switch (opcode) { - /* 2RST & 2ERR */ - case MLX5_CMD_OP_2RST_QP: - if (MBOX_ALLOC(mbox, qp_2rst)) - return -ENOMEM; - MOD_QP_IN_SET(qp_2rst, mbox->in, opcode, qpn, uid); - break; - case MLX5_CMD_OP_2ERR_QP: - if (MBOX_ALLOC(mbox, qp_2err)) - return -ENOMEM; - MOD_QP_IN_SET(qp_2err, mbox->in, opcode, qpn, uid); - break; - - /* MODIFY with QPC */ - case MLX5_CMD_OP_RST2INIT_QP: - if (MBOX_ALLOC(mbox, rst2init_qp)) - return -ENOMEM; - MOD_QP_IN_SET_QPC(rst2init_qp, mbox->in, opcode, qpn, - opt_param_mask, qpc, uid); - break; - case MLX5_CMD_OP_INIT2RTR_QP: - if (MBOX_ALLOC(mbox, init2rtr_qp)) - return -ENOMEM; - MOD_QP_IN_SET_QPC(init2rtr_qp, mbox->in, opcode, qpn, - opt_param_mask, qpc, uid); - break; - case MLX5_CMD_OP_RTR2RTS_QP: - if (MBOX_ALLOC(mbox, rtr2rts_qp)) - return -ENOMEM; - MOD_QP_IN_SET_QPC(rtr2rts_qp, mbox->in, opcode, qpn, - opt_param_mask, qpc, uid); - break; - case MLX5_CMD_OP_RTS2RTS_QP: - if (MBOX_ALLOC(mbox, rts2rts_qp)) - return -ENOMEM; - MOD_QP_IN_SET_QPC(rts2rts_qp, mbox->in, opcode, qpn, - opt_param_mask, qpc, uid); - break; - case MLX5_CMD_OP_SQERR2RTS_QP: - if (MBOX_ALLOC(mbox, sqerr2rts_qp)) - return -ENOMEM; - MOD_QP_IN_SET_QPC(sqerr2rts_qp, mbox->in, opcode, qpn, - opt_param_mask, qpc, uid); - break; - case MLX5_CMD_OP_INIT2INIT_QP: - if (MBOX_ALLOC(mbox, init2init_qp)) - return -ENOMEM; - MOD_QP_IN_SET_QPC(init2init_qp, mbox->in, opcode, qpn, - opt_param_mask, qpc, uid); - break; - default: - mlx5_core_err(dev, "Unknown transition for modify QP: OP(0x%x) QPN(0x%x)\n", - opcode, qpn); - return -EINVAL; - } - return 0; -} - -int mlx5_core_qp_modify(struct mlx5_core_dev *dev, u16 opcode, - u32 opt_param_mask, void *qpc, - struct mlx5_core_qp *qp) -{ - struct mbox_info mbox; - int err; - - err = modify_qp_mbox_alloc(dev, opcode, qp->qpn, - opt_param_mask, qpc, &mbox, qp->uid); - if (err) - return err; - - err = mlx5_cmd_exec(dev, mbox.in, mbox.inlen, mbox.out, mbox.outlen); - mbox_free(&mbox); - return err; -} -EXPORT_SYMBOL_GPL(mlx5_core_qp_modify); - -void mlx5_init_qp_table(struct mlx5_core_dev *dev) -{ - struct mlx5_qp_table *table = &dev->priv.qp_table; - - memset(table, 0, sizeof(*table)); - spin_lock_init(&table->lock); - INIT_RADIX_TREE(&table->tree, GFP_ATOMIC); - mlx5_qp_debugfs_init(dev); - - table->nb.notifier_call = rsc_event_notifier; - mlx5_notifier_register(dev, &table->nb); -} - -void mlx5_cleanup_qp_table(struct mlx5_core_dev *dev) -{ - struct mlx5_qp_table *table = &dev->priv.qp_table; - - mlx5_notifier_unregister(dev, &table->nb); - mlx5_qp_debugfs_cleanup(dev); -} - -int mlx5_core_qp_query(struct mlx5_core_dev *dev, struct mlx5_core_qp *qp, - u32 *out, int outlen) -{ - u32 in[MLX5_ST_SZ_DW(query_qp_in)] = {0}; - - MLX5_SET(query_qp_in, in, opcode, MLX5_CMD_OP_QUERY_QP); - MLX5_SET(query_qp_in, in, qpn, qp->qpn); - return mlx5_cmd_exec(dev, in, sizeof(in), out, outlen); -} -EXPORT_SYMBOL_GPL(mlx5_core_qp_query); - -int mlx5_core_dct_query(struct mlx5_core_dev *dev, struct mlx5_core_dct *dct, - u32 *out, int outlen) -{ - u32 in[MLX5_ST_SZ_DW(query_dct_in)] = {0}; - struct mlx5_core_qp *qp = &dct->mqp; - - MLX5_SET(query_dct_in, in, opcode, MLX5_CMD_OP_QUERY_DCT); - MLX5_SET(query_dct_in, in, dctn, qp->qpn); - - return mlx5_cmd_exec(dev, (void *)&in, sizeof(in), - (void *)out, outlen); -} -EXPORT_SYMBOL_GPL(mlx5_core_dct_query); - -int mlx5_core_xrcd_alloc(struct mlx5_core_dev *dev, u32 *xrcdn) -{ - u32 out[MLX5_ST_SZ_DW(alloc_xrcd_out)] = {0}; - u32 
in[MLX5_ST_SZ_DW(alloc_xrcd_in)] = {0}; - int err; - - MLX5_SET(alloc_xrcd_in, in, opcode, MLX5_CMD_OP_ALLOC_XRCD); - err = mlx5_cmd_exec(dev, in, sizeof(in), out, sizeof(out)); - if (!err) - *xrcdn = MLX5_GET(alloc_xrcd_out, out, xrcd); - return err; -} -EXPORT_SYMBOL_GPL(mlx5_core_xrcd_alloc); - -int mlx5_core_xrcd_dealloc(struct mlx5_core_dev *dev, u32 xrcdn) -{ - u32 out[MLX5_ST_SZ_DW(dealloc_xrcd_out)] = {0}; - u32 in[MLX5_ST_SZ_DW(dealloc_xrcd_in)] = {0}; - - MLX5_SET(dealloc_xrcd_in, in, opcode, MLX5_CMD_OP_DEALLOC_XRCD); - MLX5_SET(dealloc_xrcd_in, in, xrcd, xrcdn); - return mlx5_cmd_exec(dev, in, sizeof(in), out, sizeof(out)); -} -EXPORT_SYMBOL_GPL(mlx5_core_xrcd_dealloc); - -static void destroy_rq_tracked(struct mlx5_core_dev *dev, u32 rqn, u16 uid) -{ - u32 in[MLX5_ST_SZ_DW(destroy_rq_in)] = {}; - u32 out[MLX5_ST_SZ_DW(destroy_rq_out)] = {}; - - MLX5_SET(destroy_rq_in, in, opcode, MLX5_CMD_OP_DESTROY_RQ); - MLX5_SET(destroy_rq_in, in, rqn, rqn); - MLX5_SET(destroy_rq_in, in, uid, uid); - mlx5_cmd_exec(dev, in, sizeof(in), out, sizeof(out)); -} - -int mlx5_core_create_rq_tracked(struct mlx5_core_dev *dev, u32 *in, int inlen, - struct mlx5_core_qp *rq) -{ - int err; - u32 rqn; - - err = mlx5_core_create_rq(dev, in, inlen, &rqn); - if (err) - return err; - - rq->uid = MLX5_GET(create_rq_in, in, uid); - rq->qpn = rqn; - err = create_resource_common(dev, rq, MLX5_RES_RQ); - if (err) - goto err_destroy_rq; - - return 0; - -err_destroy_rq: - destroy_rq_tracked(dev, rq->qpn, rq->uid); - - return err; -} -EXPORT_SYMBOL(mlx5_core_create_rq_tracked); - -void mlx5_core_destroy_rq_tracked(struct mlx5_core_dev *dev, - struct mlx5_core_qp *rq) -{ - destroy_resource_common(dev, rq); - destroy_rq_tracked(dev, rq->qpn, rq->uid); -} -EXPORT_SYMBOL(mlx5_core_destroy_rq_tracked); - -static void destroy_sq_tracked(struct mlx5_core_dev *dev, u32 sqn, u16 uid) -{ - u32 in[MLX5_ST_SZ_DW(destroy_sq_in)] = {}; - u32 out[MLX5_ST_SZ_DW(destroy_sq_out)] = {}; - - MLX5_SET(destroy_sq_in, in, opcode, MLX5_CMD_OP_DESTROY_SQ); - MLX5_SET(destroy_sq_in, in, sqn, sqn); - MLX5_SET(destroy_sq_in, in, uid, uid); - mlx5_cmd_exec(dev, in, sizeof(in), out, sizeof(out)); -} - -int mlx5_core_create_sq_tracked(struct mlx5_core_dev *dev, u32 *in, int inlen, - struct mlx5_core_qp *sq) -{ - int err; - u32 sqn; - - err = mlx5_core_create_sq(dev, in, inlen, &sqn); - if (err) - return err; - - sq->uid = MLX5_GET(create_sq_in, in, uid); - sq->qpn = sqn; - err = create_resource_common(dev, sq, MLX5_RES_SQ); - if (err) - goto err_destroy_sq; - - return 0; - -err_destroy_sq: - destroy_sq_tracked(dev, sq->qpn, sq->uid); - - return err; -} -EXPORT_SYMBOL(mlx5_core_create_sq_tracked); - -void mlx5_core_destroy_sq_tracked(struct mlx5_core_dev *dev, - struct mlx5_core_qp *sq) -{ - destroy_resource_common(dev, sq); - destroy_sq_tracked(dev, sq->qpn, sq->uid); -} -EXPORT_SYMBOL(mlx5_core_destroy_sq_tracked); - -struct mlx5_core_rsc_common *mlx5_core_res_hold(struct mlx5_core_dev *dev, - int res_num, - enum mlx5_res_type res_type) -{ - u32 rsn = res_num | (res_type << MLX5_USER_INDEX_LEN); - struct mlx5_qp_table *table = &dev->priv.qp_table; - - return mlx5_get_rsc(table, rsn); -} -EXPORT_SYMBOL_GPL(mlx5_core_res_hold); - -void mlx5_core_res_put(struct mlx5_core_rsc_common *res) -{ - mlx5_core_put_rsc(res); -} -EXPORT_SYMBOL_GPL(mlx5_core_res_put); diff --git a/include/linux/mlx5/cmd.h b/include/linux/mlx5/cmd.h deleted file mode 100644 index 68cd08f02c2f..000000000000 --- a/include/linux/mlx5/cmd.h +++ /dev/null @@ -1,51 +0,0 @@ -/* - * 
Copyright (c) 2013-2015, Mellanox Technologies. All rights reserved. - * - * This software is available to you under a choice of one of two - * licenses. You may choose to be licensed under the terms of the GNU - * General Public License (GPL) Version 2, available from the file - * COPYING in the main directory of this source tree, or the - * OpenIB.org BSD license below: - * - * Redistribution and use in source and binary forms, with or - * without modification, are permitted provided that the following - * conditions are met: - * - * - Redistributions of source code must retain the above - * copyright notice, this list of conditions and the following - * disclaimer. - * - * - Redistributions in binary form must reproduce the above - * copyright notice, this list of conditions and the following - * disclaimer in the documentation and/or other materials - * provided with the distribution. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF - * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS - * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN - * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN - * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ - -#ifndef MLX5_CMD_H -#define MLX5_CMD_H - -#include - -struct manage_pages_layout { - u64 ptr; - u32 reserved; - u16 num_entries; - u16 func_id; -}; - - -struct mlx5_cmd_alloc_uar_imm_out { - u32 rsvd[3]; - u32 uarn; -}; - -#endif /* MLX5_CMD_H */ diff --git a/include/linux/mlx5/driver.h b/include/linux/mlx5/driver.h index 1caddfa85c4d..b60e5ab7906b 100644 --- a/include/linux/mlx5/driver.h +++ b/include/linux/mlx5/driver.h @@ -541,7 +541,6 @@ struct mlx5_priv { struct mlx5_core_health health; /* start: qp staff */ - struct mlx5_qp_table qp_table; struct dentry *qp_debugfs; struct dentry *eq_debugfs; struct dentry *cq_debugfs; @@ -687,7 +686,6 @@ struct mlx5_core_dev { unsigned long intf_state; struct mlx5_priv priv; struct mlx5_profile *profile; - atomic_t num_qps; u32 issi; struct mlx5e_resources mlx5e_res; struct mlx5_dm *dm; diff --git a/include/linux/mlx5/qp.h b/include/linux/mlx5/qp.h index 4d25a3d24182..ef127a156a62 100644 --- a/include/linux/mlx5/qp.h +++ b/include/linux/mlx5/qp.h @@ -553,53 +553,8 @@ struct mlx5_qp_context { u8 rsvd1[24]; }; -static inline struct mlx5_core_qp *__mlx5_qp_lookup(struct mlx5_core_dev *dev, u32 qpn) -{ - return radix_tree_lookup(&dev->priv.qp_table.tree, qpn); -} - -int mlx5_core_create_dct(struct mlx5_core_dev *dev, - struct mlx5_core_dct *qp, - u32 *in, int inlen, - u32 *out, int outlen); -int mlx5_core_create_qp(struct mlx5_core_dev *dev, - struct mlx5_core_qp *qp, - u32 *in, - int inlen); -int mlx5_core_qp_modify(struct mlx5_core_dev *dev, u16 opcode, - u32 opt_param_mask, void *qpc, - struct mlx5_core_qp *qp); -int mlx5_core_destroy_qp(struct mlx5_core_dev *dev, - struct mlx5_core_qp *qp); -int mlx5_core_destroy_dct(struct mlx5_core_dev *dev, - struct mlx5_core_dct *dct); -int mlx5_core_qp_query(struct mlx5_core_dev *dev, struct mlx5_core_qp *qp, - u32 *out, int outlen); -int mlx5_core_dct_query(struct mlx5_core_dev *dev, struct mlx5_core_dct *dct, - u32 *out, int outlen); - -int mlx5_core_set_delay_drop(struct mlx5_core_dev *dev, - u32 timeout_usec); - -int mlx5_core_xrcd_alloc(struct mlx5_core_dev *dev, u32 *xrcdn); -int mlx5_core_xrcd_dealloc(struct mlx5_core_dev 
*dev, u32 xrcdn); -void mlx5_init_qp_table(struct mlx5_core_dev *dev); -void mlx5_cleanup_qp_table(struct mlx5_core_dev *dev); int mlx5_debug_qp_add(struct mlx5_core_dev *dev, struct mlx5_core_qp *qp); void mlx5_debug_qp_remove(struct mlx5_core_dev *dev, struct mlx5_core_qp *qp); -int mlx5_core_create_rq_tracked(struct mlx5_core_dev *dev, u32 *in, int inlen, - struct mlx5_core_qp *rq); -void mlx5_core_destroy_rq_tracked(struct mlx5_core_dev *dev, - struct mlx5_core_qp *rq); -int mlx5_core_create_sq_tracked(struct mlx5_core_dev *dev, u32 *in, int inlen, - struct mlx5_core_qp *sq); -void mlx5_core_destroy_sq_tracked(struct mlx5_core_dev *dev, - struct mlx5_core_qp *sq); - -struct mlx5_core_rsc_common *mlx5_core_res_hold(struct mlx5_core_dev *dev, - int res_num, - enum mlx5_res_type res_type); -void mlx5_core_res_put(struct mlx5_core_rsc_common *res); static inline const char *mlx5_qp_type_str(int type) { -- cgit v1.2.3 From 59e9e8e4fe83f68e599b87c06aaf239dcc64887b Mon Sep 17 00:00:00 2001 From: Mark Zhang Date: Tue, 14 Jan 2020 05:06:25 +0200 Subject: net/mlx5: Enable SW-defined RoCEv2 UDP source port When this is enabled, the UDP source port for RoCEv2 packets is defined by software instead of firmware. Signed-off-by: Mark Zhang Reviewed-by: Maor Gottlieb Signed-off-by: Leon Romanovsky --- drivers/net/ethernet/mellanox/mlx5/core/main.c | 32 ++++++++++++++++++++++++++ include/linux/mlx5/mlx5_ifc.h | 5 +++- 2 files changed, 36 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/drivers/net/ethernet/mellanox/mlx5/core/main.c b/drivers/net/ethernet/mellanox/mlx5/core/main.c index a000cd820ace..0044aa5cc676 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/main.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/main.c @@ -556,6 +556,31 @@ static int handle_hca_cap(struct mlx5_core_dev *dev, void *set_ctx) return set_caps(dev, set_ctx, MLX5_SET_HCA_CAP_OP_MOD_GENERAL_DEVICE); } +static int handle_hca_cap_roce(struct mlx5_core_dev *dev, void *set_ctx) +{ + void *set_hca_cap; + int err; + + if (!MLX5_CAP_GEN(dev, roce)) + return 0; + + err = mlx5_core_get_caps(dev, MLX5_CAP_ROCE); + if (err) + return err; + + if (MLX5_CAP_ROCE(dev, sw_r_roce_src_udp_port) || + !MLX5_CAP_ROCE_MAX(dev, sw_r_roce_src_udp_port)) + return 0; + + set_hca_cap = MLX5_ADDR_OF(set_hca_cap_in, set_ctx, capability); + memcpy(set_hca_cap, dev->caps.hca_cur[MLX5_CAP_ROCE], + MLX5_ST_SZ_BYTES(roce_cap)); + MLX5_SET(roce_cap, set_hca_cap, sw_r_roce_src_udp_port, 1); + + err = set_caps(dev, set_ctx, MLX5_SET_HCA_CAP_OP_MOD_ROCE); + return err; +} + static int set_hca_cap(struct mlx5_core_dev *dev) { int set_sz = MLX5_ST_SZ_BYTES(set_hca_cap_in); @@ -586,6 +611,13 @@ static int set_hca_cap(struct mlx5_core_dev *dev) goto out; } + memset(set_ctx, 0, set_sz); + err = handle_hca_cap_roce(dev, set_ctx); + if (err) { + mlx5_core_err(dev, "handle_hca_cap_roce failed\n"); + goto out; + } + out: kfree(set_ctx); return err; diff --git a/include/linux/mlx5/mlx5_ifc.h b/include/linux/mlx5/mlx5_ifc.h index 69b27c7dfc3e..6fa24918eade 100644 --- a/include/linux/mlx5/mlx5_ifc.h +++ b/include/linux/mlx5/mlx5_ifc.h @@ -74,6 +74,7 @@ enum { MLX5_SET_HCA_CAP_OP_MOD_GENERAL_DEVICE = 0x0, MLX5_SET_HCA_CAP_OP_MOD_ODP = 0x2, MLX5_SET_HCA_CAP_OP_MOD_ATOMIC = 0x3, + MLX5_SET_HCA_CAP_OP_MOD_ROCE = 0x4, }; enum { @@ -903,7 +904,9 @@ struct mlx5_ifc_per_protocol_networking_offload_caps_bits { struct mlx5_ifc_roce_cap_bits { u8 roce_apm[0x1]; - u8 reserved_at_1[0x1f]; + u8 reserved_at_1[0x3]; + u8 sw_r_roce_src_udp_port[0x1]; + u8
reserved_at_5[0x1b]; u8 reserved_at_20[0x60]; -- cgit v1.2.3 From 501d3e5dd5bd33a7119f35ee6d365a5787e32a30 Mon Sep 17 00:00:00 2001 From: Alexandru Ardelean Date: Sat, 21 Mar 2020 11:08:01 +0200 Subject: iio: ad_sigma_delta: remove unused IIO channel macros Now that all SigmaDelta IIO channel macros have been localized, remove the generic ones. Signed-off-by: Alexandru Ardelean Signed-off-by: Jonathan Cameron --- include/linux/iio/adc/ad_sigma_delta.h | 58 ---------------------------------- 1 file changed, 58 deletions(-) (limited to 'include/linux') diff --git a/include/linux/iio/adc/ad_sigma_delta.h b/include/linux/iio/adc/ad_sigma_delta.h index 5a127c0ed200..a3a838dcf8e4 100644 --- a/include/linux/iio/adc/ad_sigma_delta.h +++ b/include/linux/iio/adc/ad_sigma_delta.h @@ -133,62 +133,4 @@ void ad_sd_cleanup_buffer_and_trigger(struct iio_dev *indio_dev); int ad_sd_validate_trigger(struct iio_dev *indio_dev, struct iio_trigger *trig); -#define __AD_SD_CHANNEL(_si, _channel1, _channel2, _address, _bits, \ - _storagebits, _shift, _extend_name, _type, _mask_all) \ - { \ - .type = (_type), \ - .differential = (_channel2 == -1 ? 0 : 1), \ - .indexed = 1, \ - .channel = (_channel1), \ - .channel2 = (_channel2), \ - .address = (_address), \ - .extend_name = (_extend_name), \ - .info_mask_separate = BIT(IIO_CHAN_INFO_RAW) | \ - BIT(IIO_CHAN_INFO_OFFSET), \ - .info_mask_shared_by_type = BIT(IIO_CHAN_INFO_SCALE), \ - .info_mask_shared_by_all = _mask_all, \ - .scan_index = (_si), \ - .scan_type = { \ - .sign = 'u', \ - .realbits = (_bits), \ - .storagebits = (_storagebits), \ - .shift = (_shift), \ - .endianness = IIO_BE, \ - }, \ - } - -#define AD_SD_DIFF_CHANNEL(_si, _channel1, _channel2, _address, _bits, \ - _storagebits, _shift) \ - __AD_SD_CHANNEL(_si, _channel1, _channel2, _address, _bits, \ - _storagebits, _shift, NULL, IIO_VOLTAGE, \ - BIT(IIO_CHAN_INFO_SAMP_FREQ)) - -#define AD_SD_SHORTED_CHANNEL(_si, _channel, _address, _bits, \ - _storagebits, _shift) \ - __AD_SD_CHANNEL(_si, _channel, _channel, _address, _bits, \ - _storagebits, _shift, "shorted", IIO_VOLTAGE, \ - BIT(IIO_CHAN_INFO_SAMP_FREQ)) - -#define AD_SD_CHANNEL(_si, _channel, _address, _bits, \ - _storagebits, _shift) \ - __AD_SD_CHANNEL(_si, _channel, -1, _address, _bits, \ - _storagebits, _shift, NULL, IIO_VOLTAGE, \ - BIT(IIO_CHAN_INFO_SAMP_FREQ)) - -#define AD_SD_CHANNEL_NO_SAMP_FREQ(_si, _channel, _address, _bits, \ - _storagebits, _shift) \ - __AD_SD_CHANNEL(_si, _channel, -1, _address, _bits, \ - _storagebits, _shift, NULL, IIO_VOLTAGE, 0) - -#define AD_SD_TEMP_CHANNEL(_si, _address, _bits, _storagebits, _shift) \ - __AD_SD_CHANNEL(_si, 0, -1, _address, _bits, \ - _storagebits, _shift, NULL, IIO_TEMP, \ - BIT(IIO_CHAN_INFO_SAMP_FREQ)) - -#define AD_SD_SUPPLY_CHANNEL(_si, _channel, _address, _bits, _storagebits, \ - _shift) \ - __AD_SD_CHANNEL(_si, _channel, -1, _address, _bits, \ - _storagebits, _shift, "supply", IIO_VOLTAGE, \ - BIT(IIO_CHAN_INFO_SAMP_FREQ)) - #endif -- cgit v1.2.3 From 9159c7c06cebfab4bebca0a930b924ad29437638 Mon Sep 17 00:00:00 2001 From: Alexandru Ardelean Date: Sat, 28 Mar 2020 18:34:21 +0200 Subject: iio: buffer: drop left-over 'stufftoread' field This seems like a left-over from a7348347ba8a4 ("staging:iio: Add polling of events on the ring access chrdev."). Then it was moved into the sca3000 driver around 9dd4694dafbd8 ("iio: staging: sca3000: hide stufftoread logic"), and that one seemed to be the only user of this. 
Then it eventually was no longer used after 152a6a884ae1 ("staging:iio:accel:sca3000 move to hybrid hard / soft buffer design.") Signed-off-by: Alexandru Ardelean Signed-off-by: Jonathan Cameron --- include/linux/iio/buffer_impl.h | 3 --- 1 file changed, 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/iio/buffer_impl.h b/include/linux/iio/buffer_impl.h index a4d2d8061ef6..1e7edf6bed96 100644 --- a/include/linux/iio/buffer_impl.h +++ b/include/linux/iio/buffer_impl.h @@ -115,9 +115,6 @@ struct iio_buffer { */ struct attribute_group scan_el_group; - /* @stufftoread: Flag to indicate new data. */ - bool stufftoread; - /* @attrs: Standard attributes of the buffer. */ const struct attribute **attrs; -- cgit v1.2.3 From 641dedd50c488739b56ef0f716988e646295dce4 Mon Sep 17 00:00:00 2001 From: Alexandru Ardelean Date: Tue, 24 Mar 2020 15:46:29 +0200 Subject: include: fpga: adi-axi-common.h: fixup whitespace tab -> space The initial version used a tab between '#define' & 'ADI_AXI_REG_VERSION'. This changes it to a space. The change is purely cosmetic. Acked-by: Moritz Fischer Signed-off-by: Alexandru Ardelean Signed-off-by: Jonathan Cameron --- include/linux/fpga/adi-axi-common.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/fpga/adi-axi-common.h b/include/linux/fpga/adi-axi-common.h index 7fc95d5c95bb..ebd4e07ae3d8 100644 --- a/include/linux/fpga/adi-axi-common.h +++ b/include/linux/fpga/adi-axi-common.h @@ -11,7 +11,7 @@ #ifndef ADI_AXI_COMMON_H_ #define ADI_AXI_COMMON_H_ -#define ADI_AXI_REG_VERSION 0x0000 +#define ADI_AXI_REG_VERSION 0x0000 #define ADI_AXI_PCORE_VER(major, minor, patch) \ (((major) << 16) | ((minor) << 8) | (patch)) -- cgit v1.2.3 From 20d5fa48d333670ffbc08d387a80710be91259a0 Mon Sep 17 00:00:00 2001 From: Alexandru Ardelean Date: Tue, 24 Mar 2020 15:46:30 +0200 Subject: include: fpga: adi-axi-common.h: add version helper macros The format for all ADI AXI IP cores is the same, i.e. 'major.minor.patch'. This patch adds the helper macros to be re-used in ADI AXI drivers. Acked-by: Moritz Fischer Signed-off-by: Alexandru Ardelean Signed-off-by: Jonathan Cameron --- include/linux/fpga/adi-axi-common.h | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'include/linux') diff --git a/include/linux/fpga/adi-axi-common.h b/include/linux/fpga/adi-axi-common.h index ebd4e07ae3d8..141ac3f251e6 100644 --- a/include/linux/fpga/adi-axi-common.h +++ b/include/linux/fpga/adi-axi-common.h @@ -16,4 +16,8 @@ #define ADI_AXI_PCORE_VER(major, minor, patch) \ (((major) << 16) | ((minor) << 8) | (patch)) +#define ADI_AXI_PCORE_VER_MAJOR(version) (((version) >> 16) & 0xff) +#define ADI_AXI_PCORE_VER_MINOR(version) (((version) >> 8) & 0xff) +#define ADI_AXI_PCORE_VER_PATCH(version) ((version) & 0xff) + #endif /* ADI_AXI_COMMON_H_ */ -- cgit v1.2.3 From e0fcca9fbd99e959855aa1d66c125d696f969e68 Mon Sep 17 00:00:00 2001 From: Alexandru Ardelean Date: Tue, 24 Mar 2020 15:46:32 +0200 Subject: iio: buffer-dmaengine: add dev-managed calls for buffer alloc Currently, when using 'iio_dmaengine_buffer_alloc()', a matching call to 'iio_dmaengine_buffer_free()' must be made. With this change, this can be avoided by using 'devm_iio_dmaengine_buffer_alloc()'. The buffer will get freed via the device's devres handling. 
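To illustrate the pattern this enables (a minimal sketch, not part of the patch; the function name, state handling and error paths are illustrative), a consumer driver can tie the buffer lifetime entirely to devres:

	/* Hypothetical consumer: no explicit free on error or remove paths. */
	static int example_adc_setup_buffer(struct device *dev,
					    struct iio_dev *indio_dev)
	{
		struct iio_buffer *buffer;

		/* "rx" is the conventional channel name; freed via devres. */
		buffer = devm_iio_dmaengine_buffer_alloc(dev, "rx");
		if (IS_ERR(buffer))
			return PTR_ERR(buffer);

		indio_dev->modes |= INDIO_BUFFER_HARDWARE;
		iio_device_attach_buffer(indio_dev, buffer);

		return 0;
	}
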
Signed-off-by: Alexandru Ardelean Signed-off-by: Jonathan Cameron --- drivers/iio/buffer/industrialio-buffer-dmaengine.c | 39 ++++++++++++++++++++++ include/linux/iio/buffer-dmaengine.h | 3 ++ 2 files changed, 42 insertions(+) (limited to 'include/linux') diff --git a/drivers/iio/buffer/industrialio-buffer-dmaengine.c b/drivers/iio/buffer/industrialio-buffer-dmaengine.c index 94da3b1ca3a2..6dedf12b69a4 100644 --- a/drivers/iio/buffer/industrialio-buffer-dmaengine.c +++ b/drivers/iio/buffer/industrialio-buffer-dmaengine.c @@ -229,6 +229,45 @@ void iio_dmaengine_buffer_free(struct iio_buffer *buffer) } EXPORT_SYMBOL_GPL(iio_dmaengine_buffer_free); +static void __devm_iio_dmaengine_buffer_free(struct device *dev, void *res) +{ + iio_dmaengine_buffer_free(*(struct iio_buffer **)res); +} + +/** + * devm_iio_dmaengine_buffer_alloc() - Resource-managed iio_dmaengine_buffer_alloc() + * @dev: Parent device for the buffer + * @channel: DMA channel name, typically "rx". + * + * This allocates a new IIO buffer which internally uses the DMAengine framework + * to perform its transfers. The parent device will be used to request the DMA + * channel. + * + * The buffer will be automatically de-allocated once the device gets destroyed. + */ +struct iio_buffer *devm_iio_dmaengine_buffer_alloc(struct device *dev, + const char *channel) +{ + struct iio_buffer **bufferp, *buffer; + + bufferp = devres_alloc(__devm_iio_dmaengine_buffer_free, + sizeof(*bufferp), GFP_KERNEL); + if (!bufferp) + return ERR_PTR(-ENOMEM); + + buffer = iio_dmaengine_buffer_alloc(dev, channel); + if (IS_ERR(buffer)) { + devres_free(bufferp); + return buffer; + } + + *bufferp = buffer; + devres_add(dev, bufferp); + + return buffer; +} +EXPORT_SYMBOL_GPL(devm_iio_dmaengine_buffer_alloc); + MODULE_AUTHOR("Lars-Peter Clausen "); MODULE_DESCRIPTION("DMA buffer for the IIO framework"); MODULE_LICENSE("GPL"); diff --git a/include/linux/iio/buffer-dmaengine.h b/include/linux/iio/buffer-dmaengine.h index b3a57444a886..0e503db71289 100644 --- a/include/linux/iio/buffer-dmaengine.h +++ b/include/linux/iio/buffer-dmaengine.h @@ -14,4 +14,7 @@ struct iio_buffer *iio_dmaengine_buffer_alloc(struct device *dev, const char *channel); void iio_dmaengine_buffer_free(struct iio_buffer *buffer); +struct iio_buffer *devm_iio_dmaengine_buffer_alloc(struct device *dev, + const char *channel); + #endif -- cgit v1.2.3 From ef04070692a21633ec6a60f80c19b6af44b3cf47 Mon Sep 17 00:00:00 2001 From: Michael Hennerich Date: Tue, 24 Mar 2020 15:46:33 +0200 Subject: iio: adc: adi-axi-adc: add support for AXI ADC IP core This change adds support for the Analog Devices Generic AXI ADC IP core. The IP core is used for interfacing with analog-to-digital (ADC) converters that require either a high-speed serial interface (JESD204B/C) or a source synchronous parallel interface (LVDS/CMOS). Usually, some other interface type (i.e. SPI) is used as a control interface for the actual ADC, while the IP core (controlled via this driver) will interface to the data-lines of the ADC and handle the streaming of data into memory via DMA. Because of this, the AXI ADC driver needs the SPI-ADC driver to register with it. The SPI-ADC needs to be registered via the SPI framework, while the AXI ADC registers as a platform driver. The two cannot be ordered in a hierarchy, as both drivers have their own registers, and trying to organize this in a hierarchy becomes problematic when trying to map memory/registers. 
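As a sketch of the resulting split (illustrative only; the 'example_*' names and state struct are not part of this patch), the SPI-side driver registers a conversion handle with the AXI ADC core and keeps its control-path hooks there:

	/* Hypothetical SPI-ADC client attaching to the AXI ADC core. */
	static int example_spi_adc_probe(struct spi_device *spi)
	{
		struct adi_axi_adc_conv *conv;
		struct example_state *st;

		conv = devm_adi_axi_adc_conv_register(&spi->dev, sizeof(*st));
		if (IS_ERR(conv))
			return PTR_ERR(conv);

		st = adi_axi_adc_conv_priv(conv);	/* client-private area */
		st->spi = spi;

		conv->chip_info = &example_chip_info;	/* channels, scales, ... */
		conv->read_raw = example_read_raw;	/* control path over SPI */
		conv->write_raw = example_write_raw;

		return 0;
	}
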
There are some modes where the AXI ADC can operate as standalone ADC, but those will be implemented at a later point in time. DocLink: https://wiki.analog.com/resources/fpga/docs/axi_adc_ip Signed-off-by: Michael Hennerich Signed-off-by: Lars-Peter Clausen Signed-off-by: Alexandru Ardelean Signed-off-by: Jonathan Cameron --- drivers/iio/adc/Kconfig | 20 ++ drivers/iio/adc/Makefile | 1 + drivers/iio/adc/adi-axi-adc.c | 482 ++++++++++++++++++++++++++++++++++++ include/linux/iio/adc/adi-axi-adc.h | 64 +++++ 4 files changed, 567 insertions(+) create mode 100644 drivers/iio/adc/adi-axi-adc.c create mode 100644 include/linux/iio/adc/adi-axi-adc.h (limited to 'include/linux') diff --git a/drivers/iio/adc/Kconfig b/drivers/iio/adc/Kconfig index f4da821c4022..445070abf376 100644 --- a/drivers/iio/adc/Kconfig +++ b/drivers/iio/adc/Kconfig @@ -246,6 +246,26 @@ config AD799X To compile this driver as a module, choose M here: the module will be called ad799x. +config ADI_AXI_ADC + tristate "Analog Devices Generic AXI ADC IP core driver" + select IIO_BUFFER + select IIO_BUFFER_HW_CONSUMER + select IIO_BUFFER_DMAENGINE + help + Say yes here to build support for Analog Devices Generic + AXI ADC IP core. The IP core is used for interfacing with + analog-to-digital (ADC) converters that require either a high-speed + serial interface (JESD204B/C) or a source synchronous parallel + interface (LVDS/CMOS). + Typically (for such devices) SPI will be used for configuration only, + while this IP core handles the streaming of data into memory via DMA. + + Link: https://wiki.analog.com/resources/fpga/docs/axi_adc_ip + If unsure, say N (but it's safe to say "Y"). + + To compile this driver as a module, choose M here: the + module will be called adi-axi-adc. + config ASPEED_ADC tristate "Aspeed ADC" depends on ARCH_ASPEED || COMPILE_TEST diff --git a/drivers/iio/adc/Makefile b/drivers/iio/adc/Makefile index 8462455b4228..7c6594d049f9 100644 --- a/drivers/iio/adc/Makefile +++ b/drivers/iio/adc/Makefile @@ -26,6 +26,7 @@ obj-$(CONFIG_AD7793) += ad7793.o obj-$(CONFIG_AD7887) += ad7887.o obj-$(CONFIG_AD7949) += ad7949.o obj-$(CONFIG_AD799X) += ad799x.o +obj-$(CONFIG_ADI_AXI_ADC) += adi-axi-adc.o obj-$(CONFIG_ASPEED_ADC) += aspeed_adc.o obj-$(CONFIG_AT91_ADC) += at91_adc.o obj-$(CONFIG_AT91_SAMA5D2_ADC) += at91-sama5d2_adc.o diff --git a/drivers/iio/adc/adi-axi-adc.c b/drivers/iio/adc/adi-axi-adc.c new file mode 100644 index 000000000000..c24c8da99eb4 --- /dev/null +++ b/drivers/iio/adc/adi-axi-adc.c @@ -0,0 +1,482 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Analog Devices Generic AXI ADC IP core + * Link: https://wiki.analog.com/resources/fpga/docs/axi_adc_ip + * + * Copyright 2012-2020 Analog Devices Inc. 
+ */ + +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include + +#include +#include + +/** + * Register definitions: + * https://wiki.analog.com/resources/fpga/docs/axi_adc_ip#register_map + */ + +/* ADC controls */ + +#define ADI_AXI_REG_RSTN 0x0040 +#define ADI_AXI_REG_RSTN_CE_N BIT(2) +#define ADI_AXI_REG_RSTN_MMCM_RSTN BIT(1) +#define ADI_AXI_REG_RSTN_RSTN BIT(0) + +/* ADC Channel controls */ + +#define ADI_AXI_REG_CHAN_CTRL(c) (0x0400 + (c) * 0x40) +#define ADI_AXI_REG_CHAN_CTRL_LB_OWR BIT(11) +#define ADI_AXI_REG_CHAN_CTRL_PN_SEL_OWR BIT(10) +#define ADI_AXI_REG_CHAN_CTRL_IQCOR_EN BIT(9) +#define ADI_AXI_REG_CHAN_CTRL_DCFILT_EN BIT(8) +#define ADI_AXI_REG_CHAN_CTRL_FMT_SIGNEXT BIT(6) +#define ADI_AXI_REG_CHAN_CTRL_FMT_TYPE BIT(5) +#define ADI_AXI_REG_CHAN_CTRL_FMT_EN BIT(4) +#define ADI_AXI_REG_CHAN_CTRL_PN_TYPE_OWR BIT(1) +#define ADI_AXI_REG_CHAN_CTRL_ENABLE BIT(0) + +#define ADI_AXI_REG_CHAN_CTRL_DEFAULTS \ + (ADI_AXI_REG_CHAN_CTRL_FMT_SIGNEXT | \ + ADI_AXI_REG_CHAN_CTRL_FMT_EN | \ + ADI_AXI_REG_CHAN_CTRL_ENABLE) + +struct adi_axi_adc_core_info { + unsigned int version; +}; + +struct adi_axi_adc_state { + struct mutex lock; + + struct adi_axi_adc_client *client; + void __iomem *regs; +}; + +struct adi_axi_adc_client { + struct list_head entry; + struct adi_axi_adc_conv conv; + struct adi_axi_adc_state *state; + struct device *dev; + const struct adi_axi_adc_core_info *info; +}; + +static LIST_HEAD(registered_clients); +static DEFINE_MUTEX(registered_clients_lock); + +static struct adi_axi_adc_client *conv_to_client(struct adi_axi_adc_conv *conv) +{ + return container_of(conv, struct adi_axi_adc_client, conv); +} + +void *adi_axi_adc_conv_priv(struct adi_axi_adc_conv *conv) +{ + struct adi_axi_adc_client *cl = conv_to_client(conv); + + return (char *)cl + ALIGN(sizeof(struct adi_axi_adc_client), IIO_ALIGN); +} +EXPORT_SYMBOL_GPL(adi_axi_adc_conv_priv); + +static void adi_axi_adc_write(struct adi_axi_adc_state *st, + unsigned int reg, + unsigned int val) +{ + iowrite32(val, st->regs + reg); +} + +static unsigned int adi_axi_adc_read(struct adi_axi_adc_state *st, + unsigned int reg) +{ + return ioread32(st->regs + reg); +} + +static int adi_axi_adc_config_dma_buffer(struct device *dev, + struct iio_dev *indio_dev) +{ + struct iio_buffer *buffer; + const char *dma_name; + + if (!device_property_present(dev, "dmas")) + return 0; + + if (device_property_read_string(dev, "dma-names", &dma_name)) + dma_name = "rx"; + + buffer = devm_iio_dmaengine_buffer_alloc(indio_dev->dev.parent, + dma_name); + if (IS_ERR(buffer)) + return PTR_ERR(buffer); + + indio_dev->modes |= INDIO_BUFFER_HARDWARE; + iio_device_attach_buffer(indio_dev, buffer); + + return 0; +} + +static int adi_axi_adc_read_raw(struct iio_dev *indio_dev, + struct iio_chan_spec const *chan, + int *val, int *val2, long mask) +{ + struct adi_axi_adc_state *st = iio_priv(indio_dev); + struct adi_axi_adc_conv *conv = &st->client->conv; + + if (!conv->read_raw) + return -EOPNOTSUPP; + + return conv->read_raw(conv, chan, val, val2, mask); +} + +static int adi_axi_adc_write_raw(struct iio_dev *indio_dev, + struct iio_chan_spec const *chan, + int val, int val2, long mask) +{ + struct adi_axi_adc_state *st = iio_priv(indio_dev); + struct adi_axi_adc_conv *conv = &st->client->conv; + + if (!conv->write_raw) + return -EOPNOTSUPP; + + return conv->write_raw(conv, chan, val, val2, mask); +} + +static int adi_axi_adc_update_scan_mode(struct iio_dev *indio_dev, + const unsigned 
long *scan_mask) +{ + struct adi_axi_adc_state *st = iio_priv(indio_dev); + struct adi_axi_adc_conv *conv = &st->client->conv; + unsigned int i, ctrl; + + for (i = 0; i < conv->chip_info->num_channels; i++) { + ctrl = adi_axi_adc_read(st, ADI_AXI_REG_CHAN_CTRL(i)); + + if (test_bit(i, scan_mask)) + ctrl |= ADI_AXI_REG_CHAN_CTRL_ENABLE; + else + ctrl &= ~ADI_AXI_REG_CHAN_CTRL_ENABLE; + + adi_axi_adc_write(st, ADI_AXI_REG_CHAN_CTRL(i), ctrl); + } + + return 0; +} + +static struct adi_axi_adc_conv *adi_axi_adc_conv_register(struct device *dev, + size_t sizeof_priv) +{ + struct adi_axi_adc_client *cl; + size_t alloc_size; + + alloc_size = ALIGN(sizeof(struct adi_axi_adc_client), IIO_ALIGN); + if (sizeof_priv) + alloc_size += ALIGN(sizeof_priv, IIO_ALIGN); + + cl = kzalloc(alloc_size, GFP_KERNEL); + if (!cl) + return ERR_PTR(-ENOMEM); + + mutex_lock(®istered_clients_lock); + + cl->dev = get_device(dev); + + list_add_tail(&cl->entry, ®istered_clients); + + mutex_unlock(®istered_clients_lock); + + return &cl->conv; +} + +static void adi_axi_adc_conv_unregister(struct adi_axi_adc_conv *conv) +{ + struct adi_axi_adc_client *cl = conv_to_client(conv); + + mutex_lock(®istered_clients_lock); + + list_del(&cl->entry); + put_device(cl->dev); + + mutex_unlock(®istered_clients_lock); + + kfree(cl); +} + +static void devm_adi_axi_adc_conv_release(struct device *dev, void *res) +{ + adi_axi_adc_conv_unregister(*(struct adi_axi_adc_conv **)res); +} + +struct adi_axi_adc_conv *devm_adi_axi_adc_conv_register(struct device *dev, + size_t sizeof_priv) +{ + struct adi_axi_adc_conv **ptr, *conv; + + ptr = devres_alloc(devm_adi_axi_adc_conv_release, sizeof(*ptr), + GFP_KERNEL); + if (!ptr) + return ERR_PTR(-ENOMEM); + + conv = adi_axi_adc_conv_register(dev, sizeof_priv); + if (IS_ERR(conv)) { + devres_free(ptr); + return ERR_CAST(conv); + } + + *ptr = conv; + devres_add(dev, ptr); + + return conv; +} +EXPORT_SYMBOL_GPL(devm_adi_axi_adc_conv_register); + +static ssize_t in_voltage_scale_available_show(struct device *dev, + struct device_attribute *attr, + char *buf) +{ + struct iio_dev *indio_dev = dev_to_iio_dev(dev); + struct adi_axi_adc_state *st = iio_priv(indio_dev); + struct adi_axi_adc_conv *conv = &st->client->conv; + size_t len = 0; + int i; + + for (i = 0; i < conv->chip_info->num_scales; i++) { + const unsigned int *s = conv->chip_info->scale_table[i]; + + len += scnprintf(buf + len, PAGE_SIZE - len, + "%u.%06u ", s[0], s[1]); + } + buf[len - 1] = '\n'; + + return len; +} + +static IIO_DEVICE_ATTR_RO(in_voltage_scale_available, 0); + +enum { + ADI_AXI_ATTR_SCALE_AVAIL, +}; + +#define ADI_AXI_ATTR(_en_, _file_) \ + [ADI_AXI_ATTR_##_en_] = &iio_dev_attr_##_file_.dev_attr.attr + +static struct attribute *adi_axi_adc_attributes[] = { + ADI_AXI_ATTR(SCALE_AVAIL, in_voltage_scale_available), + NULL +}; + +static umode_t axi_adc_attr_is_visible(struct kobject *kobj, + struct attribute *attr, int n) +{ + struct device *dev = container_of(kobj, struct device, kobj); + struct iio_dev *indio_dev = dev_to_iio_dev(dev); + struct adi_axi_adc_state *st = iio_priv(indio_dev); + struct adi_axi_adc_conv *conv = &st->client->conv; + + switch (n) { + case ADI_AXI_ATTR_SCALE_AVAIL: + if (!conv->chip_info->num_scales) + return 0; + return attr->mode; + default: + return attr->mode; + } +} + +static const struct attribute_group adi_axi_adc_attribute_group = { + .attrs = adi_axi_adc_attributes, + .is_visible = axi_adc_attr_is_visible, +}; + +static const struct iio_info adi_axi_adc_info = { + .read_raw = &adi_axi_adc_read_raw, + 
.write_raw = &adi_axi_adc_write_raw, + .attrs = &adi_axi_adc_attribute_group, + .update_scan_mode = &adi_axi_adc_update_scan_mode, +}; + +static const struct adi_axi_adc_core_info adi_axi_adc_10_0_a_info = { + .version = ADI_AXI_PCORE_VER(10, 0, 'a'), +}; + +static struct adi_axi_adc_client *adi_axi_adc_attach_client(struct device *dev) +{ + const struct adi_axi_adc_core_info *info; + struct adi_axi_adc_client *cl; + struct device_node *cln; + + info = of_device_get_match_data(dev); + if (!info) + return ERR_PTR(-ENODEV); + + cln = of_parse_phandle(dev->of_node, "adi,adc-dev", 0); + if (!cln) { + dev_err(dev, "No 'adi,adc-dev' node defined\n"); + return ERR_PTR(-ENODEV); + } + + mutex_lock(®istered_clients_lock); + + list_for_each_entry(cl, ®istered_clients, entry) { + if (!cl->dev) + continue; + + if (cl->dev->of_node != cln) + continue; + + if (!try_module_get(dev->driver->owner)) { + mutex_unlock(®istered_clients_lock); + return ERR_PTR(-ENODEV); + } + + get_device(dev); + cl->info = info; + mutex_unlock(®istered_clients_lock); + return cl; + } + + mutex_unlock(®istered_clients_lock); + + return ERR_PTR(-EPROBE_DEFER); +} + +static int adi_axi_adc_setup_channels(struct device *dev, + struct adi_axi_adc_state *st) +{ + struct adi_axi_adc_conv *conv = &st->client->conv; + int i, ret; + + if (conv->preenable_setup) { + ret = conv->preenable_setup(conv); + if (ret) + return ret; + } + + for (i = 0; i < conv->chip_info->num_channels; i++) { + adi_axi_adc_write(st, ADI_AXI_REG_CHAN_CTRL(i), + ADI_AXI_REG_CHAN_CTRL_DEFAULTS); + } + + return 0; +} + +static void axi_adc_reset(struct adi_axi_adc_state *st) +{ + adi_axi_adc_write(st, ADI_AXI_REG_RSTN, 0); + mdelay(10); + adi_axi_adc_write(st, ADI_AXI_REG_RSTN, ADI_AXI_REG_RSTN_MMCM_RSTN); + mdelay(10); + adi_axi_adc_write(st, ADI_AXI_REG_RSTN, + ADI_AXI_REG_RSTN_RSTN | ADI_AXI_REG_RSTN_MMCM_RSTN); +} + +static void adi_axi_adc_cleanup(void *data) +{ + struct adi_axi_adc_client *cl = data; + + put_device(cl->dev); + module_put(cl->dev->driver->owner); +} + +static int adi_axi_adc_probe(struct platform_device *pdev) +{ + struct adi_axi_adc_conv *conv; + struct iio_dev *indio_dev; + struct adi_axi_adc_client *cl; + struct adi_axi_adc_state *st; + unsigned int ver; + int ret; + + cl = adi_axi_adc_attach_client(&pdev->dev); + if (IS_ERR(cl)) + return PTR_ERR(cl); + + ret = devm_add_action_or_reset(&pdev->dev, adi_axi_adc_cleanup, cl); + if (ret) + return ret; + + indio_dev = devm_iio_device_alloc(&pdev->dev, sizeof(*st)); + if (indio_dev == NULL) + return -ENOMEM; + + st = iio_priv(indio_dev); + st->client = cl; + cl->state = st; + mutex_init(&st->lock); + + st->regs = devm_platform_ioremap_resource(pdev, 0); + if (IS_ERR(st->regs)) + return PTR_ERR(st->regs); + + conv = &st->client->conv; + + axi_adc_reset(st); + + ver = adi_axi_adc_read(st, ADI_AXI_REG_VERSION); + + if (cl->info->version > ver) { + dev_err(&pdev->dev, + "IP core version is too old. 
Expected %d.%.2d.%c, Reported %d.%.2d.%c\n", + ADI_AXI_PCORE_VER_MAJOR(cl->info->version), + ADI_AXI_PCORE_VER_MINOR(cl->info->version), + ADI_AXI_PCORE_VER_PATCH(cl->info->version), + ADI_AXI_PCORE_VER_MAJOR(ver), + ADI_AXI_PCORE_VER_MINOR(ver), + ADI_AXI_PCORE_VER_PATCH(ver)); + return -ENODEV; + } + + indio_dev->info = &adi_axi_adc_info; + indio_dev->dev.parent = &pdev->dev; + indio_dev->name = "adi-axi-adc"; + indio_dev->modes = INDIO_DIRECT_MODE; + indio_dev->num_channels = conv->chip_info->num_channels; + indio_dev->channels = conv->chip_info->channels; + + ret = adi_axi_adc_config_dma_buffer(&pdev->dev, indio_dev); + if (ret) + return ret; + + ret = adi_axi_adc_setup_channels(&pdev->dev, st); + if (ret) + return ret; + + ret = devm_iio_device_register(&pdev->dev, indio_dev); + if (ret) + return ret; + + dev_info(&pdev->dev, "AXI ADC IP core (%d.%.2d.%c) probed\n", + ADI_AXI_PCORE_VER_MAJOR(ver), + ADI_AXI_PCORE_VER_MINOR(ver), + ADI_AXI_PCORE_VER_PATCH(ver)); + + return 0; +} + +/* Match table for of_platform binding */ +static const struct of_device_id adi_axi_adc_of_match[] = { + { .compatible = "adi,axi-adc-10.0.a", .data = &adi_axi_adc_10_0_a_info }, + { /* end of list */ } +}; +MODULE_DEVICE_TABLE(of, adi_axi_adc_of_match); + +static struct platform_driver adi_axi_adc_driver = { + .driver = { + .name = KBUILD_MODNAME, + .of_match_table = adi_axi_adc_of_match, + }, + .probe = adi_axi_adc_probe, +}; +module_platform_driver(adi_axi_adc_driver); + +MODULE_AUTHOR("Michael Hennerich "); +MODULE_DESCRIPTION("Analog Devices Generic AXI ADC IP core driver"); +MODULE_LICENSE("GPL v2"); diff --git a/include/linux/iio/adc/adi-axi-adc.h b/include/linux/iio/adc/adi-axi-adc.h new file mode 100644 index 000000000000..c5d48e1c2d36 --- /dev/null +++ b/include/linux/iio/adc/adi-axi-adc.h @@ -0,0 +1,64 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Analog Devices Generic AXI ADC IP core driver/library + * Link: https://wiki.analog.com/resources/fpga/docs/axi_adc_ip + * + * Copyright 2012-2020 Analog Devices Inc. 
+ */ +#ifndef __ADI_AXI_ADC_H__ +#define __ADI_AXI_ADC_H__ + +struct device; +struct iio_chan_spec; + +/** + * struct adi_axi_adc_chip_info - Chip specific information + * @name Chip name + * @id Chip ID (usually product ID) + * @channels Channel specifications of type @struct axi_adc_chan_spec + * @num_channels Number of @channels + * @scale_table Supported scales by the chip; tuples of 2 ints + * @num_scales Number of scales in the table + * @max_rate Maximum sampling rate supported by the device + */ +struct adi_axi_adc_chip_info { + const char *name; + unsigned int id; + + const struct iio_chan_spec *channels; + unsigned int num_channels; + + const unsigned int (*scale_table)[2]; + int num_scales; + + unsigned long max_rate; +}; + +/** + * struct adi_axi_adc_conv - data of the ADC attached to the AXI ADC + * @chip_info chip info details for the client ADC + * @preenable_setup op to run in the client before enabling the AXI ADC + * @reg_access IIO debugfs_reg_access hook for the client ADC + * @read_raw IIO read_raw hook for the client ADC + * @write_raw IIO write_raw hook for the client ADC + */ +struct adi_axi_adc_conv { + const struct adi_axi_adc_chip_info *chip_info; + + int (*preenable_setup)(struct adi_axi_adc_conv *conv); + int (*reg_access)(struct adi_axi_adc_conv *conv, unsigned int reg, + unsigned int writeval, unsigned int *readval); + int (*read_raw)(struct adi_axi_adc_conv *conv, + struct iio_chan_spec const *chan, + int *val, int *val2, long mask); + int (*write_raw)(struct adi_axi_adc_conv *conv, + struct iio_chan_spec const *chan, + int val, int val2, long mask); +}; + +struct adi_axi_adc_conv *devm_adi_axi_adc_conv_register(struct device *dev, + size_t sizeof_priv); + +void *adi_axi_adc_conv_priv(struct adi_axi_adc_conv *conv); + +#endif -- cgit v1.2.3 From c0ae3591d900c0d1a54a86666c2102dd39b3d4f7 Mon Sep 17 00:00:00 2001 From: Lars-Peter Clausen Date: Wed, 1 Apr 2020 18:57:06 +0200 Subject: iio: dma-buffer: Cleanup buffer.h/buffer_impl.h includes The IIO DMA buffer is a DMA buffer implementation. As such it should include buffer_impl.h rather than buffer.h. The include to buffer.h in buffer-dma.h should be buffer_impl.h so it has access to the struct iio_buffer definition. The code currently only works because all places that use buffer-dma.h include buffer_impl.h before it. The include to buffer.h in industrialio-buffer-dma.c can be removed since that file does not reference any of the buffer consumer functions. 
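For context, the distinction is that consumers only need the declarations from buffer.h, while an implementation like this one embeds struct iio_buffer and therefore needs the full definition; a schematic illustration (not part of the change, names hypothetical):

	#include <linux/iio/buffer_impl.h>	/* full struct iio_buffer definition */

	/* A buffer implementation embeds the struct, so a forward
	 * declaration from buffer.h would not be enough here. */
	struct example_dma_buffer {
		struct iio_buffer buffer;
		/* implementation-private bookkeeping follows */
	};
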
Signed-off-by: Lars-Peter Clausen Tested-by: Alexandru Ardelean Signed-off-by: Jonathan Cameron --- drivers/iio/buffer/industrialio-buffer-dma.c | 1 - include/linux/iio/buffer-dma.h | 2 +- 2 files changed, 1 insertion(+), 2 deletions(-) (limited to 'include/linux') diff --git a/drivers/iio/buffer/industrialio-buffer-dma.c b/drivers/iio/buffer/industrialio-buffer-dma.c index a74bd9c0587c..d348af8b9705 100644 --- a/drivers/iio/buffer/industrialio-buffer-dma.c +++ b/drivers/iio/buffer/industrialio-buffer-dma.c @@ -12,7 +12,6 @@ #include #include #include -#include #include #include #include diff --git a/include/linux/iio/buffer-dma.h b/include/linux/iio/buffer-dma.h index 016d8a068353..ff15c61bf319 100644 --- a/include/linux/iio/buffer-dma.h +++ b/include/linux/iio/buffer-dma.h @@ -11,7 +11,7 @@ #include #include #include -#include +#include struct iio_dma_buffer_queue; struct iio_dma_buffer_ops; -- cgit v1.2.3 From 2e036804d773e00ea19c6f6b8b7a4b8232c77da1 Mon Sep 17 00:00:00 2001 From: Alexandru Ardelean Date: Fri, 10 Apr 2020 12:36:07 +0300 Subject: iio: buffer: remove 'scan_el_attrs' attribute group from buffer struct This field doesn't seem to be used. It seems that only 'buffer->attrs' was ever used to extend sysfs attributes for an IIO buffer. Moving forward, it may not make sense to keep it. This patch removes the field and its initialization code. Since we want to rework the IIO buffer code to be able to add more buffers per IIO device, we will merge [somehow] the 'buffer' & 'scan_elements' groups, and we will continue to add the attributes to the 'buffer' group. Removing it here will also make the rework a bit smaller, since this code will not be present. Signed-off-by: Alexandru Ardelean Signed-off-by: Jonathan Cameron --- drivers/iio/industrialio-buffer.c | 8 -------- include/linux/iio/buffer_impl.h | 6 ------ 2 files changed, 14 deletions(-) (limited to 'include/linux') diff --git a/drivers/iio/industrialio-buffer.c b/drivers/iio/industrialio-buffer.c index e6fa1a4e135d..221157136af6 100644 --- a/drivers/iio/industrialio-buffer.c +++ b/drivers/iio/industrialio-buffer.c @@ -1283,11 +1283,6 @@ int iio_buffer_alloc_sysfs_and_mask(struct iio_dev *indio_dev) indio_dev->groups[indio_dev->groupcounter++] = &buffer->buffer_group; - if (buffer->scan_el_attrs != NULL) { - attr = buffer->scan_el_attrs->attrs; - while (*attr++ != NULL) - attrcount_orig++; - } attrcount = attrcount_orig; INIT_LIST_HEAD(&buffer->scan_el_dev_attr_list); channels = indio_dev->channels; @@ -1325,9 +1320,6 @@ int iio_buffer_alloc_sysfs_and_mask(struct iio_dev *indio_dev) ret = -ENOMEM; goto error_free_scan_mask; } - if (buffer->scan_el_attrs) - memcpy(buffer->scan_el_group.attrs, buffer->scan_el_attrs, - sizeof(buffer->scan_el_group.attrs[0])*attrcount_orig); attrn = attrcount_orig; list_for_each_entry(p, &buffer->scan_el_dev_attr_list, l) diff --git a/include/linux/iio/buffer_impl.h b/include/linux/iio/buffer_impl.h index 1e7edf6bed96..a63dc07b7350 100644 --- a/include/linux/iio/buffer_impl.h +++ b/include/linux/iio/buffer_impl.h @@ -94,12 +94,6 @@ struct iio_buffer { unsigned int watermark; /* private: */ - /* - * @scan_el_attrs: Control of scan elements if that scan mode - * control method is used. - */ - struct attribute_group *scan_el_attrs; - /* @scan_timestamp: Does the scan mode include a timestamp. 
*/ bool scan_timestamp; -- cgit v1.2.3 From 83af573e980aafa7edf56a449a6e70f6c8ac9792 Mon Sep 17 00:00:00 2001 From: Alexandru Ardelean Date: Thu, 27 Feb 2020 15:52:20 +0200 Subject: iio: core: drop devm_iio_device_unregister() API call It's unused so far, so it can be removed. Also makes sense to remove it to discourage weird uses of this call during review. Signed-off-by: Alexandru Ardelean Signed-off-by: Jonathan Cameron --- Documentation/driver-api/driver-model/devres.rst | 1 - drivers/iio/industrialio-core.c | 17 ----------------- include/linux/iio/iio.h | 4 ---- 3 files changed, 22 deletions(-) (limited to 'include/linux') diff --git a/Documentation/driver-api/driver-model/devres.rst b/Documentation/driver-api/driver-model/devres.rst index 46c13780994c..0580c64ebdfd 100644 --- a/Documentation/driver-api/driver-model/devres.rst +++ b/Documentation/driver-api/driver-model/devres.rst @@ -286,7 +286,6 @@ IIO devm_iio_device_alloc() devm_iio_device_free() devm_iio_device_register() - devm_iio_device_unregister() devm_iio_kfifo_allocate() devm_iio_kfifo_free() devm_iio_triggered_buffer_setup() diff --git a/drivers/iio/industrialio-core.c b/drivers/iio/industrialio-core.c index 56ff24d7a174..0f278da12bcb 100644 --- a/drivers/iio/industrialio-core.c +++ b/drivers/iio/industrialio-core.c @@ -1830,23 +1830,6 @@ int __devm_iio_device_register(struct device *dev, struct iio_dev *indio_dev, } EXPORT_SYMBOL_GPL(__devm_iio_device_register); -/** - * devm_iio_device_unregister - Resource-managed iio_device_unregister() - * @dev: Device this iio_dev belongs to - * @indio_dev: the iio_dev associated with the device - * - * Unregister iio_dev registered with devm_iio_device_register(). - */ -void devm_iio_device_unregister(struct device *dev, struct iio_dev *indio_dev) -{ - int rc; - - rc = devres_release(dev, devm_iio_device_unreg, - devm_iio_device_match, indio_dev); - WARN_ON(rc); -} -EXPORT_SYMBOL_GPL(devm_iio_device_unregister); - /** * iio_device_claim_direct_mode - Keep device in direct mode * @indio_dev: the iio_dev associated with the device diff --git a/include/linux/iio/iio.h b/include/linux/iio/iio.h index e975020abaa6..1b2630bc327e 100644 --- a/include/linux/iio/iio.h +++ b/include/linux/iio/iio.h @@ -593,9 +593,6 @@ void iio_device_unregister(struct iio_dev *indio_dev); * calls iio_device_register() internally. Refer to that function for more * information. * - * If an iio_dev registered with this function needs to be unregistered - * separately, devm_iio_device_unregister() must be used. - * * RETURNS: * 0 on success, negative error number on failure. */ @@ -603,7 +600,6 @@ void iio_device_unregister(struct iio_dev *indio_dev); __devm_iio_device_register((dev), (indio_dev), THIS_MODULE); int __devm_iio_device_register(struct device *dev, struct iio_dev *indio_dev, struct module *this_mod); -void devm_iio_device_unregister(struct device *dev, struct iio_dev *indio_dev); int iio_push_event(struct iio_dev *indio_dev, u64 ev_code, s64 timestamp); int iio_device_claim_direct_mode(struct iio_dev *indio_dev); void iio_device_release_direct_mode(struct iio_dev *indio_dev); -- cgit v1.2.3 From 666e4de43d942fbe0d7f1d9dbc993f5b5c81760c Mon Sep 17 00:00:00 2001 From: Alexandru Ardelean Date: Thu, 27 Feb 2020 15:52:21 +0200 Subject: iio: core: drop devm_iio_triggered_buffer_cleanup() API call It's unused so far, so it can be removed. Also makes sense to remove it to discourage weird uses of this call during review. 
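The remaining, encouraged pattern is fully managed registration, where unregistration happens automatically on driver detach (a minimal sketch; the probe shape and state struct are illustrative):

	static int example_probe(struct device *dev)
	{
		struct iio_dev *indio_dev;

		indio_dev = devm_iio_device_alloc(dev, sizeof(struct example_state));
		if (!indio_dev)
			return -ENOMEM;

		/* ... fill in indio_dev->name, ->info, ->channels ... */

		/* Unregistered (and freed) automatically on detach. */
		return devm_iio_device_register(dev, indio_dev);
	}
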
Signed-off-by: Alexandru Ardelean Signed-off-by: Jonathan Cameron --- Documentation/driver-api/driver-model/devres.rst | 1 - drivers/iio/buffer/industrialio-triggered-buffer.c | 11 ----------- include/linux/iio/triggered_buffer.h | 2 -- 3 files changed, 14 deletions(-) (limited to 'include/linux') diff --git a/Documentation/driver-api/driver-model/devres.rst b/Documentation/driver-api/driver-model/devres.rst index 0580c64ebdfd..1431d5e44abc 100644 --- a/Documentation/driver-api/driver-model/devres.rst +++ b/Documentation/driver-api/driver-model/devres.rst @@ -289,7 +289,6 @@ IIO devm_iio_kfifo_allocate() devm_iio_kfifo_free() devm_iio_triggered_buffer_setup() - devm_iio_triggered_buffer_cleanup() devm_iio_trigger_alloc() devm_iio_trigger_free() devm_iio_trigger_register() diff --git a/drivers/iio/buffer/industrialio-triggered-buffer.c b/drivers/iio/buffer/industrialio-triggered-buffer.c index cb322b2f09cd..e8046c1ecd6b 100644 --- a/drivers/iio/buffer/industrialio-triggered-buffer.c +++ b/drivers/iio/buffer/industrialio-triggered-buffer.c @@ -126,17 +126,6 @@ int devm_iio_triggered_buffer_setup(struct device *dev, } EXPORT_SYMBOL_GPL(devm_iio_triggered_buffer_setup); -void devm_iio_triggered_buffer_cleanup(struct device *dev, - struct iio_dev *indio_dev) -{ - int rc; - - rc = devres_release(dev, devm_iio_triggered_buffer_clean, - devm_iio_device_match, indio_dev); - WARN_ON(rc); -} -EXPORT_SYMBOL_GPL(devm_iio_triggered_buffer_cleanup); - MODULE_AUTHOR("Lars-Peter Clausen "); MODULE_DESCRIPTION("IIO helper functions for setting up triggered buffers"); MODULE_LICENSE("GPL"); diff --git a/include/linux/iio/triggered_buffer.h b/include/linux/iio/triggered_buffer.h index 238ad30ce166..e99c91799359 100644 --- a/include/linux/iio/triggered_buffer.h +++ b/include/linux/iio/triggered_buffer.h @@ -18,7 +18,5 @@ int devm_iio_triggered_buffer_setup(struct device *dev, irqreturn_t (*h)(int irq, void *p), irqreturn_t (*thread)(int irq, void *p), const struct iio_buffer_setup_ops *ops); -void devm_iio_triggered_buffer_cleanup(struct device *dev, - struct iio_dev *indio_dev); #endif -- cgit v1.2.3 From 66be392a48f9d5256b4ba582ff06303fe97b5ca2 Mon Sep 17 00:00:00 2001 From: Alexandru Ardelean Date: Thu, 27 Feb 2020 15:52:22 +0200 Subject: iio: core: drop devm_iio_device_free() API call It's unused so far, so it can be removed. Also makes sense to remove it to discourage weird uses of this call during review. This is the last user of 'devm_iio_device_match()', so it can be removed as well in this patch. 
Signed-off-by: Alexandru Ardelean Signed-off-by: Jonathan Cameron --- Documentation/driver-api/driver-model/devres.rst | 1 - drivers/iio/industrialio-core.c | 31 ------------------------ include/linux/iio/iio.h | 2 -- 3 files changed, 34 deletions(-) (limited to 'include/linux') diff --git a/Documentation/driver-api/driver-model/devres.rst b/Documentation/driver-api/driver-model/devres.rst index 1431d5e44abc..6ae6c67dfec0 100644 --- a/Documentation/driver-api/driver-model/devres.rst +++ b/Documentation/driver-api/driver-model/devres.rst @@ -284,7 +284,6 @@ I2C IIO devm_iio_device_alloc() - devm_iio_device_free() devm_iio_device_register() devm_iio_kfifo_allocate() devm_iio_kfifo_free() diff --git a/drivers/iio/industrialio-core.c b/drivers/iio/industrialio-core.c index 0f278da12bcb..f4daf19f2a3b 100644 --- a/drivers/iio/industrialio-core.c +++ b/drivers/iio/industrialio-core.c @@ -1546,17 +1546,6 @@ static void devm_iio_device_release(struct device *dev, void *res) iio_device_free(*(struct iio_dev **)res); } -int devm_iio_device_match(struct device *dev, void *res, void *data) -{ - struct iio_dev **r = res; - if (!r || !*r) { - WARN_ON(!r || !*r); - return 0; - } - return *r == data; -} -EXPORT_SYMBOL_GPL(devm_iio_device_match); - /** * devm_iio_device_alloc - Resource-managed iio_device_alloc() * @dev: Device to allocate iio_dev for @@ -1565,9 +1554,6 @@ EXPORT_SYMBOL_GPL(devm_iio_device_match); * Managed iio_device_alloc. iio_dev allocated with this function is * automatically freed on driver detach. * - * If an iio_dev allocated with this function needs to be freed separately, - * devm_iio_device_free() must be used. - * * RETURNS: * Pointer to allocated iio_dev on success, NULL on failure. */ @@ -1592,23 +1578,6 @@ struct iio_dev *devm_iio_device_alloc(struct device *dev, int sizeof_priv) } EXPORT_SYMBOL_GPL(devm_iio_device_alloc); -/** - * devm_iio_device_free - Resource-managed iio_device_free() - * @dev: Device this iio_dev belongs to - * @iio_dev: the iio_dev associated with the device - * - * Free iio_dev allocated with devm_iio_device_alloc(). - */ -void devm_iio_device_free(struct device *dev, struct iio_dev *iio_dev) -{ - int rc; - - rc = devres_release(dev, devm_iio_device_release, - devm_iio_device_match, iio_dev); - WARN_ON(rc); -} -EXPORT_SYMBOL_GPL(devm_iio_device_free); - /** * iio_chrdev_open() - chrdev file open for buffer access and ioctls * @inode: Inode structure for identifying the device in the file system diff --git a/include/linux/iio/iio.h b/include/linux/iio/iio.h index 1b2630bc327e..4c1e320ef7ed 100644 --- a/include/linux/iio/iio.h +++ b/include/linux/iio/iio.h @@ -688,9 +688,7 @@ static inline struct iio_dev *iio_priv_to_dev(void *priv) } void iio_device_free(struct iio_dev *indio_dev); -int devm_iio_device_match(struct device *dev, void *res, void *data); struct iio_dev *devm_iio_device_alloc(struct device *dev, int sizeof_priv); -void devm_iio_device_free(struct device *dev, struct iio_dev *indio_dev); struct iio_trigger *devm_iio_trigger_alloc(struct device *dev, const char *fmt, ...); void devm_iio_trigger_free(struct device *dev, struct iio_trigger *iio_trig); -- cgit v1.2.3 From 83381c9803a00d534f32ab52ed91588601eba43f Mon Sep 17 00:00:00 2001 From: Alexandru Ardelean Date: Thu, 27 Feb 2020 15:52:23 +0200 Subject: iio: core: drop devm_iio_trigger_unregister() API call It's unused so far, so it can be removed. Also makes sense to remove it to discourage weird uses of this call during review. 
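The same holds for triggers: once allocated and registered through the devm_ variants, teardown is ordered by devres, so a separate unregister hook mostly invites misuse (a sketch; the format string is just the usual convention, not mandated by the API):

	static int example_trigger_setup(struct device *dev,
					 struct iio_dev *indio_dev)
	{
		struct iio_trigger *trig;

		trig = devm_iio_trigger_alloc(dev, "%s-dev%d",
					      indio_dev->name, indio_dev->id);
		if (!trig)
			return -ENOMEM;

		/* Unregistered automatically, before the trigger is freed. */
		return devm_iio_trigger_register(dev, trig);
	}
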
Signed-off-by: Alexandru Ardelean Signed-off-by: Jonathan Cameron --- Documentation/driver-api/driver-model/devres.rst | 1 - Documentation/driver-api/iio/triggers.rst | 1 - drivers/iio/industrialio-trigger.c | 21 --------------------- include/linux/iio/trigger.h | 3 --- 4 files changed, 26 deletions(-) (limited to 'include/linux') diff --git a/Documentation/driver-api/driver-model/devres.rst b/Documentation/driver-api/driver-model/devres.rst index 6ae6c67dfec0..f638a035e6d2 100644 --- a/Documentation/driver-api/driver-model/devres.rst +++ b/Documentation/driver-api/driver-model/devres.rst @@ -291,7 +291,6 @@ IIO devm_iio_trigger_alloc() devm_iio_trigger_free() devm_iio_trigger_register() - devm_iio_trigger_unregister() devm_iio_channel_get() devm_iio_channel_release() devm_iio_channel_get_all() diff --git a/Documentation/driver-api/iio/triggers.rst b/Documentation/driver-api/iio/triggers.rst index 5c2156de6284..160faa810d12 100644 --- a/Documentation/driver-api/iio/triggers.rst +++ b/Documentation/driver-api/iio/triggers.rst @@ -6,7 +6,6 @@ Triggers * :c:func:`devm_iio_trigger_alloc` — Resource-managed iio_trigger_alloc * :c:func:`devm_iio_trigger_free` — Resource-managed iio_trigger_free * :c:func:`devm_iio_trigger_register` — Resource-managed iio_trigger_register -* :c:func:`devm_iio_trigger_unregister` — Resource-managed iio_trigger_unregister * :c:func:`iio_trigger_validate_own_device` — Check if a trigger and IIO device belong to the same device diff --git a/drivers/iio/industrialio-trigger.c b/drivers/iio/industrialio-trigger.c index 3908a9a90035..611f608a9da2 100644 --- a/drivers/iio/industrialio-trigger.c +++ b/drivers/iio/industrialio-trigger.c @@ -673,9 +673,6 @@ static void devm_iio_trigger_unreg(struct device *dev, void *res) * calls iio_trigger_register() internally. Refer to that function for more * information. * - * If an iio_trigger registered with this function needs to be unregistered - * separately, devm_iio_trigger_unregister() must be used. - * * RETURNS: * 0 on success, negative error number on failure. */ @@ -701,24 +698,6 @@ int __devm_iio_trigger_register(struct device *dev, } EXPORT_SYMBOL_GPL(__devm_iio_trigger_register); -/** - * devm_iio_trigger_unregister - Resource-managed iio_trigger_unregister() - * @dev: device this iio_trigger belongs to - * @trig_info: the trigger associated with the device - * - * Unregister trigger registered with devm_iio_trigger_register(). - */ -void devm_iio_trigger_unregister(struct device *dev, - struct iio_trigger *trig_info) -{ - int rc; - - rc = devres_release(dev, devm_iio_trigger_unreg, devm_iio_trigger_match, - trig_info); - WARN_ON(rc); -} -EXPORT_SYMBOL_GPL(devm_iio_trigger_unregister); - bool iio_trigger_using_own(struct iio_dev *indio_dev) { return indio_dev->trig->attached_own_device; diff --git a/include/linux/iio/trigger.h b/include/linux/iio/trigger.h index 84995e2967ac..cad8325903f9 100644 --- a/include/linux/iio/trigger.h +++ b/include/linux/iio/trigger.h @@ -141,9 +141,6 @@ int __devm_iio_trigger_register(struct device *dev, **/ void iio_trigger_unregister(struct iio_trigger *trig_info); -void devm_iio_trigger_unregister(struct device *dev, - struct iio_trigger *trig_info); - /** * iio_trigger_set_immutable() - set an immutable trigger on destination * -- cgit v1.2.3 From 92b7ed7fe40dd26de62ff8f5aa1605909a14e3ef Mon Sep 17 00:00:00 2001 From: Alexandru Ardelean Date: Thu, 27 Feb 2020 15:52:24 +0200 Subject: iio: core: drop devm_iio_trigger_free() API call It's unused so far, so it can be removed. 
Also makes sense to remove it to discourage weird uses of this call during review. Signed-off-by: Alexandru Ardelean Signed-off-by: Jonathan Cameron --- Documentation/driver-api/driver-model/devres.rst | 1 - Documentation/driver-api/iio/triggers.rst | 1 - drivers/iio/industrialio-trigger.c | 32 ------------------------ include/linux/iio/iio.h | 2 -- 4 files changed, 36 deletions(-) (limited to 'include/linux') diff --git a/Documentation/driver-api/driver-model/devres.rst b/Documentation/driver-api/driver-model/devres.rst index f638a035e6d2..6c401c74e480 100644 --- a/Documentation/driver-api/driver-model/devres.rst +++ b/Documentation/driver-api/driver-model/devres.rst @@ -289,7 +289,6 @@ IIO devm_iio_kfifo_free() devm_iio_triggered_buffer_setup() devm_iio_trigger_alloc() - devm_iio_trigger_free() devm_iio_trigger_register() devm_iio_channel_get() devm_iio_channel_release() diff --git a/Documentation/driver-api/iio/triggers.rst b/Documentation/driver-api/iio/triggers.rst index 160faa810d12..dfd7ba3eabde 100644 --- a/Documentation/driver-api/iio/triggers.rst +++ b/Documentation/driver-api/iio/triggers.rst @@ -4,7 +4,6 @@ Triggers * struct :c:type:`iio_trigger` — industrial I/O trigger device * :c:func:`devm_iio_trigger_alloc` — Resource-managed iio_trigger_alloc -* :c:func:`devm_iio_trigger_free` — Resource-managed iio_trigger_free * :c:func:`devm_iio_trigger_register` — Resource-managed iio_trigger_register iio_trigger_unregister * :c:func:`iio_trigger_validate_own_device` — Check if a trigger and IIO diff --git a/drivers/iio/industrialio-trigger.c b/drivers/iio/industrialio-trigger.c index 611f608a9da2..53d1931f6be8 100644 --- a/drivers/iio/industrialio-trigger.c +++ b/drivers/iio/industrialio-trigger.c @@ -585,18 +585,6 @@ static void devm_iio_trigger_release(struct device *dev, void *res) iio_trigger_free(*(struct iio_trigger **)res); } -static int devm_iio_trigger_match(struct device *dev, void *res, void *data) -{ - struct iio_trigger **r = res; - - if (!r || !*r) { - WARN_ON(!r || !*r); - return 0; - } - - return *r == data; -} - /** * devm_iio_trigger_alloc - Resource-managed iio_trigger_alloc() * @dev: Device to allocate iio_trigger for @@ -608,9 +596,6 @@ static int devm_iio_trigger_match(struct device *dev, void *res, void *data) * Managed iio_trigger_alloc. iio_trigger allocated with this function is * automatically freed on driver detach. * - * If an iio_trigger allocated with this function needs to be freed separately, - * devm_iio_trigger_free() must be used. - * * RETURNS: * Pointer to allocated iio_trigger on success, NULL on failure. */ @@ -640,23 +625,6 @@ struct iio_trigger *devm_iio_trigger_alloc(struct device *dev, } EXPORT_SYMBOL_GPL(devm_iio_trigger_alloc); -/** - * devm_iio_trigger_free - Resource-managed iio_trigger_free() - * @dev: Device this iio_dev belongs to - * @iio_trig: the iio_trigger associated with the device - * - * Free iio_trigger allocated with devm_iio_trigger_alloc(). 
- */ -void devm_iio_trigger_free(struct device *dev, struct iio_trigger *iio_trig) -{ - int rc; - - rc = devres_release(dev, devm_iio_trigger_release, - devm_iio_trigger_match, iio_trig); - WARN_ON(rc); -} -EXPORT_SYMBOL_GPL(devm_iio_trigger_free); - static void devm_iio_trigger_unreg(struct device *dev, void *res) { iio_trigger_unregister(*(struct iio_trigger **)res); diff --git a/include/linux/iio/iio.h b/include/linux/iio/iio.h index 4c1e320ef7ed..d63884a54939 100644 --- a/include/linux/iio/iio.h +++ b/include/linux/iio/iio.h @@ -691,8 +691,6 @@ void iio_device_free(struct iio_dev *indio_dev); struct iio_dev *devm_iio_device_alloc(struct device *dev, int sizeof_priv); struct iio_trigger *devm_iio_trigger_alloc(struct device *dev, const char *fmt, ...); -void devm_iio_trigger_free(struct device *dev, struct iio_trigger *iio_trig); - /** * iio_buffer_enabled() - helper function to test if the buffer is enabled * @indio_dev: IIO device structure for device -- cgit v1.2.3 From fc1f75a0347aecc8b967c8c154564f520d6f4412 Mon Sep 17 00:00:00 2001 From: Alexandru Ardelean Date: Thu, 27 Feb 2020 15:52:25 +0200 Subject: iio: inkern: drop devm_iio_channel_release{_all} API calls It's unused so far, so it can be removed. Also makes sense to remove it to discourage weird uses of this call during review. Signed-off-by: Alexandru Ardelean Signed-off-by: Jonathan Cameron --- Documentation/driver-api/driver-model/devres.rst | 2 -- drivers/iio/inkern.c | 27 ------------------------ include/linux/iio/consumer.h | 18 ---------------- 3 files changed, 47 deletions(-) (limited to 'include/linux') diff --git a/Documentation/driver-api/driver-model/devres.rst b/Documentation/driver-api/driver-model/devres.rst index 6c401c74e480..10ccebe9f7c1 100644 --- a/Documentation/driver-api/driver-model/devres.rst +++ b/Documentation/driver-api/driver-model/devres.rst @@ -291,9 +291,7 @@ IIO devm_iio_trigger_alloc() devm_iio_trigger_register() devm_iio_channel_get() - devm_iio_channel_release() devm_iio_channel_get_all() - devm_iio_channel_release_all() INPUT devm_input_allocate_device() diff --git a/drivers/iio/inkern.c b/drivers/iio/inkern.c index 5a8351c9a426..ede99e0d5371 100644 --- a/drivers/iio/inkern.c +++ b/drivers/iio/inkern.c @@ -360,18 +360,6 @@ static void devm_iio_channel_free(struct device *dev, void *res) iio_channel_release(channel); } -static int devm_iio_channel_match(struct device *dev, void *res, void *data) -{ - struct iio_channel **r = res; - - if (!r || !*r) { - WARN_ON(!r || !*r); - return 0; - } - - return *r == data; -} - struct iio_channel *devm_iio_channel_get(struct device *dev, const char *channel_name) { @@ -394,13 +382,6 @@ struct iio_channel *devm_iio_channel_get(struct device *dev, } EXPORT_SYMBOL_GPL(devm_iio_channel_get); -void devm_iio_channel_release(struct device *dev, struct iio_channel *channel) -{ - WARN_ON(devres_release(dev, devm_iio_channel_free, - devm_iio_channel_match, channel)); -} -EXPORT_SYMBOL_GPL(devm_iio_channel_release); - struct iio_channel *iio_channel_get_all(struct device *dev) { const char *name; @@ -514,14 +495,6 @@ struct iio_channel *devm_iio_channel_get_all(struct device *dev) } EXPORT_SYMBOL_GPL(devm_iio_channel_get_all); -void devm_iio_channel_release_all(struct device *dev, - struct iio_channel *channels) -{ - WARN_ON(devres_release(dev, devm_iio_channel_free_all, - devm_iio_channel_match, channels)); -} -EXPORT_SYMBOL_GPL(devm_iio_channel_release_all); - static int iio_channel_read(struct iio_channel *chan, int *val, int *val2, enum iio_chan_info_enum info) 
{ diff --git a/include/linux/iio/consumer.h b/include/linux/iio/consumer.h index 2bde8c912d4d..c4118dcb8e05 100644 --- a/include/linux/iio/consumer.h +++ b/include/linux/iio/consumer.h @@ -63,15 +63,6 @@ void iio_channel_release(struct iio_channel *chan); */ struct iio_channel *devm_iio_channel_get(struct device *dev, const char *consumer_channel); -/** - * devm_iio_channel_release() - Resource managed version of - * iio_channel_release(). - * @dev: Pointer to consumer device for which resource - * is allocared. - * @chan: The channel to be released. - */ -void devm_iio_channel_release(struct device *dev, struct iio_channel *chan); - /** * iio_channel_get_all() - get all channels associated with a client * @dev: Pointer to consumer device. @@ -106,15 +97,6 @@ void iio_channel_release_all(struct iio_channel *chan); */ struct iio_channel *devm_iio_channel_get_all(struct device *dev); -/** - * devm_iio_channel_release_all() - Resource managed version of - * iio_channel_release_all(). - * @dev: Pointer to consumer device for which resource - * is allocared. - * @chan: Array channel to be released. - */ -void devm_iio_channel_release_all(struct device *dev, struct iio_channel *chan); - struct iio_cb_buffer; /** * iio_channel_get_all_cb() - register callback for triggered capture -- cgit v1.2.3 From 05c09e3cee0aa97a51c2eac1a2f453cac0ba5683 Mon Sep 17 00:00:00 2001 From: Alexandru Ardelean Date: Thu, 27 Feb 2020 15:52:26 +0200 Subject: iio: buffer: drop devm_iio_hw_consumer_free() API call It's unused so far, so it can be removed. Also makes sense to remove it to discourage weird uses of this call during review. Signed-off-by: Alexandru Ardelean Signed-off-by: Jonathan Cameron --- drivers/iio/buffer/industrialio-hw-consumer.c | 31 --------------------------- include/linux/iio/hw-consumer.h | 1 - 2 files changed, 32 deletions(-) (limited to 'include/linux') diff --git a/drivers/iio/buffer/industrialio-hw-consumer.c b/drivers/iio/buffer/industrialio-hw-consumer.c index 95165697d8ae..f2d27788f666 100644 --- a/drivers/iio/buffer/industrialio-hw-consumer.c +++ b/drivers/iio/buffer/industrialio-hw-consumer.c @@ -142,17 +142,6 @@ static void devm_iio_hw_consumer_release(struct device *dev, void *res) iio_hw_consumer_free(*(struct iio_hw_consumer **)res); } -static int devm_iio_hw_consumer_match(struct device *dev, void *res, void *data) -{ - struct iio_hw_consumer **r = res; - - if (!r || !*r) { - WARN_ON(!r || !*r); - return 0; - } - return *r == data; -} - /** * devm_iio_hw_consumer_alloc - Resource-managed iio_hw_consumer_alloc() * @dev: Pointer to consumer device. @@ -160,9 +149,6 @@ static int devm_iio_hw_consumer_match(struct device *dev, void *res, void *data) * Managed iio_hw_consumer_alloc. iio_hw_consumer allocated with this function * is automatically freed on driver detach. * - * If an iio_hw_consumer allocated with this function needs to be freed - * separately, devm_iio_hw_consumer_free() must be used. - * * returns pointer to allocated iio_hw_consumer on success, NULL on failure. */ struct iio_hw_consumer *devm_iio_hw_consumer_alloc(struct device *dev) @@ -186,23 +172,6 @@ struct iio_hw_consumer *devm_iio_hw_consumer_alloc(struct device *dev) } EXPORT_SYMBOL_GPL(devm_iio_hw_consumer_alloc); -/** - * devm_iio_hw_consumer_free - Resource-managed iio_hw_consumer_free() - * @dev: Pointer to consumer device. - * @hwc: iio_hw_consumer to free. - * - * Free iio_hw_consumer allocated with devm_iio_hw_consumer_alloc(). 
- */ -void devm_iio_hw_consumer_free(struct device *dev, struct iio_hw_consumer *hwc) -{ - int rc; - - rc = devres_release(dev, devm_iio_hw_consumer_release, - devm_iio_hw_consumer_match, hwc); - WARN_ON(rc); -} -EXPORT_SYMBOL_GPL(devm_iio_hw_consumer_free); - /** * iio_hw_consumer_enable() - Enable IIO hardware consumer * @hwc: iio_hw_consumer to enable. diff --git a/include/linux/iio/hw-consumer.h b/include/linux/iio/hw-consumer.h index 44d48bb1d39f..e8255c2e33bc 100644 --- a/include/linux/iio/hw-consumer.h +++ b/include/linux/iio/hw-consumer.h @@ -14,7 +14,6 @@ struct iio_hw_consumer; struct iio_hw_consumer *iio_hw_consumer_alloc(struct device *dev); void iio_hw_consumer_free(struct iio_hw_consumer *hwc); struct iio_hw_consumer *devm_iio_hw_consumer_alloc(struct device *dev); -void devm_iio_hw_consumer_free(struct device *dev, struct iio_hw_consumer *hwc); int iio_hw_consumer_enable(struct iio_hw_consumer *hwc); void iio_hw_consumer_disable(struct iio_hw_consumer *hwc); -- cgit v1.2.3 From 608d98a2c4a0a72c15fa16ef1542df67afe41fee Mon Sep 17 00:00:00 2001 From: Alexandru Ardelean Date: Thu, 27 Feb 2020 15:52:27 +0200 Subject: iio: buffer: drop devm_iio_kfifo_free() API call It's unused so far, so it can be removed. Also makes sense to remove it to discourage weird uses of this call during review. Signed-off-by: Alexandru Ardelean Signed-off-by: Jonathan Cameron --- Documentation/driver-api/driver-model/devres.rst | 1 - drivers/iio/buffer/kfifo_buf.c | 22 ---------------------- include/linux/iio/kfifo_buf.h | 1 - 3 files changed, 24 deletions(-) (limited to 'include/linux') diff --git a/Documentation/driver-api/driver-model/devres.rst b/Documentation/driver-api/driver-model/devres.rst index 10ccebe9f7c1..91b0b8e5556c 100644 --- a/Documentation/driver-api/driver-model/devres.rst +++ b/Documentation/driver-api/driver-model/devres.rst @@ -286,7 +286,6 @@ IIO devm_iio_device_alloc() devm_iio_device_register() devm_iio_kfifo_allocate() - devm_iio_kfifo_free() devm_iio_triggered_buffer_setup() devm_iio_trigger_alloc() devm_iio_trigger_register() diff --git a/drivers/iio/buffer/kfifo_buf.c b/drivers/iio/buffer/kfifo_buf.c index 3150f8ab984b..1359abed3b31 100644 --- a/drivers/iio/buffer/kfifo_buf.c +++ b/drivers/iio/buffer/kfifo_buf.c @@ -179,16 +179,6 @@ static void devm_iio_kfifo_release(struct device *dev, void *res) iio_kfifo_free(*(struct iio_buffer **)res); } -static int devm_iio_kfifo_match(struct device *dev, void *res, void *data) -{ - struct iio_buffer **r = res; - - if (WARN_ON(!r || !*r)) - return 0; - - return *r == data; -} - /** * devm_iio_fifo_allocate - Resource-managed iio_kfifo_allocate() * @dev: Device to allocate kfifo buffer for @@ -216,16 +206,4 @@ struct iio_buffer *devm_iio_kfifo_allocate(struct device *dev) } EXPORT_SYMBOL(devm_iio_kfifo_allocate); -/** - * devm_iio_fifo_free - Resource-managed iio_kfifo_free() - * @dev: Device the buffer belongs to - * @r: The buffer associated with the device - */ -void devm_iio_kfifo_free(struct device *dev, struct iio_buffer *r) -{ - WARN_ON(devres_release(dev, devm_iio_kfifo_release, - devm_iio_kfifo_match, r)); -} -EXPORT_SYMBOL(devm_iio_kfifo_free); - MODULE_LICENSE("GPL"); diff --git a/include/linux/iio/kfifo_buf.h b/include/linux/iio/kfifo_buf.h index 764659e01b68..1fc1efa7799d 100644 --- a/include/linux/iio/kfifo_buf.h +++ b/include/linux/iio/kfifo_buf.h @@ -9,6 +9,5 @@ struct iio_buffer *iio_kfifo_allocate(void); void iio_kfifo_free(struct iio_buffer *r); struct iio_buffer *devm_iio_kfifo_allocate(struct device *dev); -void 
devm_iio_kfifo_free(struct device *dev, struct iio_buffer *r); #endif -- cgit v1.2.3 From 418fd78771220f5e522d02676758ed824c349fd5 Mon Sep 17 00:00:00 2001 From: Clement Leger Date: Fri, 10 Apr 2020 12:24:32 +0200 Subject: remoteproc: add rproc_coredump_set_elf_info This function allows drivers to correctly setup the coredump output elf information. Reviewed-by: Bjorn Andersson Reviewed-by: Mathieu Poirier Signed-off-by: Clement Leger Link: https://lore.kernel.org/r/20200410102433.2672-2-cleger@kalray.eu Signed-off-by: Bjorn Andersson --- drivers/remoteproc/remoteproc_core.c | 32 ++++++++++++++++++++++++++++-- drivers/remoteproc/remoteproc_elf_loader.c | 3 --- include/linux/remoteproc.h | 2 ++ 3 files changed, 32 insertions(+), 5 deletions(-) (limited to 'include/linux') diff --git a/drivers/remoteproc/remoteproc_core.c b/drivers/remoteproc/remoteproc_core.c index 9899467fa1cf..d9e6949e4ac1 100644 --- a/drivers/remoteproc/remoteproc_core.c +++ b/drivers/remoteproc/remoteproc_core.c @@ -1565,6 +1565,28 @@ int rproc_coredump_add_custom_segment(struct rproc *rproc, } EXPORT_SYMBOL(rproc_coredump_add_custom_segment); +/** + * rproc_coredump_set_elf_info() - set coredump elf information + * @rproc: handle of a remote processor + * @class: elf class for coredump elf file + * @machine: elf machine for coredump elf file + * + * Set elf information which will be used for coredump elf file. + * + * Return: 0 on success, negative errno on error. + */ +int rproc_coredump_set_elf_info(struct rproc *rproc, u8 class, u16 machine) +{ + if (class != ELFCLASS64 && class != ELFCLASS32) + return -EINVAL; + + rproc->elf_class = class; + rproc->elf_machine = machine; + + return 0; +} +EXPORT_SYMBOL(rproc_coredump_set_elf_info); + /** * rproc_coredump() - perform coredump * @rproc: rproc handle @@ -1587,6 +1609,11 @@ static void rproc_coredump(struct rproc *rproc) if (list_empty(&rproc->dump_segments)) return; + if (class == ELFCLASSNONE) { + dev_err(&rproc->dev, "Elf class is not set\n"); + return; + } + data_size = elf_size_of_hdr(class); list_for_each_entry(segment, &rproc->dump_segments, node) { data_size += elf_size_of_phdr(class) + segment->size; @@ -1605,7 +1632,7 @@ static void rproc_coredump(struct rproc *rproc) elf_hdr_init_ident(ehdr, class); elf_hdr_set_e_type(class, ehdr, ET_CORE); - elf_hdr_set_e_machine(class, ehdr, EM_NONE); + elf_hdr_set_e_machine(class, ehdr, rproc->elf_machine); elf_hdr_set_e_version(class, ehdr, EV_CURRENT); elf_hdr_set_e_entry(class, ehdr, rproc->bootaddr); elf_hdr_set_e_phoff(class, ehdr, elf_size_of_hdr(class)); @@ -2047,7 +2074,8 @@ struct rproc *rproc_alloc(struct device *dev, const char *name, rproc->name = name; rproc->priv = &rproc[1]; rproc->auto_boot = true; - rproc->elf_class = ELFCLASS32; + rproc->elf_class = ELFCLASSNONE; + rproc->elf_machine = EM_NONE; device_initialize(&rproc->dev); rproc->dev.parent = dev; diff --git a/drivers/remoteproc/remoteproc_elf_loader.c b/drivers/remoteproc/remoteproc_elf_loader.c index 16e2c496fd45..4869fb7d8fe4 100644 --- a/drivers/remoteproc/remoteproc_elf_loader.c +++ b/drivers/remoteproc/remoteproc_elf_loader.c @@ -248,9 +248,6 @@ int rproc_elf_load_segments(struct rproc *rproc, const struct firmware *fw) memset(ptr + filesz, 0, memsz - filesz); } - if (ret == 0) - rproc->elf_class = class; - return ret; } EXPORT_SYMBOL(rproc_elf_load_segments); diff --git a/include/linux/remoteproc.h b/include/linux/remoteproc.h index 9c07d7958c53..0547676479d3 100644 --- a/include/linux/remoteproc.h +++ b/include/linux/remoteproc.h @@ -518,6 
+518,7 @@ struct rproc { struct list_head dump_segments; int nb_vdev; u8 elf_class; + u16 elf_machine; }; /** @@ -622,6 +623,7 @@ int rproc_coredump_add_custom_segment(struct rproc *rproc, struct rproc_dump_segment *segment, void *dest), void *priv); +int rproc_coredump_set_elf_info(struct rproc *rproc, u8 class, u16 machine); static inline struct rproc_vdev *vdev_to_rvdev(struct virtio_device *vdev) { -- cgit v1.2.3 From bb15aded5144d36b2a050e1dad415876c0b94234 Mon Sep 17 00:00:00 2001 From: Sergei Shtylyov Date: Tue, 7 Apr 2020 23:56:43 +0300 Subject: mtd: spi-nor: move #define SPINOR_OP_WRDI The write disable (WRDI) opcode is not really specific to the SST flashes (anymore?) -- move the #define to the main opcode group, just before WREN. Signed-off-by: Sergei Shtylyov Signed-off-by: Tudor Ambarus --- include/linux/mtd/spi-nor.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/mtd/spi-nor.h b/include/linux/mtd/spi-nor.h index 1e2af0ec1f03..1cc8ed5d59ed 100644 --- a/include/linux/mtd/spi-nor.h +++ b/include/linux/mtd/spi-nor.h @@ -20,6 +20,7 @@ */ /* Flash opcodes. */ +#define SPINOR_OP_WRDI 0x04 /* Write disable */ #define SPINOR_OP_WREN 0x06 /* Write enable */ #define SPINOR_OP_RDSR 0x05 /* Read status register */ #define SPINOR_OP_WRSR 0x01 /* Write status register 1 byte */ @@ -80,7 +81,6 @@ /* Used for SST flashes only. */ #define SPINOR_OP_BP 0x02 /* Byte program */ -#define SPINOR_OP_WRDI 0x04 /* Write disable */ #define SPINOR_OP_AAI_WP 0xad /* Auto address increment word program */ /* Used for S3AN flashes only */ -- cgit v1.2.3 From c84dc6e68a1d2464e050d9694be4e4ff49e32bfd Mon Sep 17 00:00:00 2001 From: David Rientjes Date: Tue, 14 Apr 2020 17:04:55 -0700 Subject: dma-pool: add additional coherent pools to map to gfp mask The single atomic pool is allocated from the lowest zone possible since it is guaranteed to be applicable for any DMA allocation. Devices may allocate through the DMA API but not have a strict reliance on GFP_DMA memory. Since the atomic pool will be used for all non-blockable allocations, returning all memory from ZONE_DMA may unnecessarily deplete the zone. Provision for multiple atomic pools that will map to the optimal gfp mask of the device. When allocating non-blockable memory, determine the optimal gfp mask of the device and use the appropriate atomic pool. The coherent DMA mask will remain the same between allocation and free and, thus, memory will be freed to the same atomic pool it was allocated from. __dma_atomic_pool_init() will be changed to return struct gen_pool * later once dynamic expansion is added. Signed-off-by: David Rientjes Signed-off-by: Christoph Hellwig --- drivers/iommu/dma-iommu.c | 5 +- include/linux/dma-direct.h | 2 + include/linux/dma-mapping.h | 6 +-- kernel/dma/direct.c | 12 ++--- kernel/dma/pool.c | 120 ++++++++++++++++++++++++++++---------------- 5 files changed, 91 insertions(+), 54 deletions(-) (limited to 'include/linux') diff --git a/drivers/iommu/dma-iommu.c b/drivers/iommu/dma-iommu.c index ba128d1cdaee..4959f5df21bd 100644 --- a/drivers/iommu/dma-iommu.c +++ b/drivers/iommu/dma-iommu.c @@ -952,7 +952,7 @@ static void __iommu_dma_free(struct device *dev, size_t size, void *cpu_addr) /* Non-coherent atomic allocation? 
Easy */ if (IS_ENABLED(CONFIG_DMA_DIRECT_REMAP) && - dma_free_from_pool(cpu_addr, alloc_size)) + dma_free_from_pool(dev, cpu_addr, alloc_size)) return; if (IS_ENABLED(CONFIG_DMA_REMAP) && is_vmalloc_addr(cpu_addr)) { @@ -1035,7 +1035,8 @@ static void *iommu_dma_alloc(struct device *dev, size_t size, if (IS_ENABLED(CONFIG_DMA_DIRECT_REMAP) && !gfpflags_allow_blocking(gfp) && !coherent) - cpu_addr = dma_alloc_from_pool(PAGE_ALIGN(size), &page, gfp); + cpu_addr = dma_alloc_from_pool(dev, PAGE_ALIGN(size), &page, + gfp); else cpu_addr = iommu_dma_alloc_pages(dev, size, &page, gfp, attrs); if (!cpu_addr) diff --git a/include/linux/dma-direct.h b/include/linux/dma-direct.h index 24b8684aa21d..136f984df0d9 100644 --- a/include/linux/dma-direct.h +++ b/include/linux/dma-direct.h @@ -67,6 +67,8 @@ static inline bool dma_capable(struct device *dev, dma_addr_t addr, size_t size, } u64 dma_direct_get_required_mask(struct device *dev); +gfp_t dma_direct_optimal_gfp_mask(struct device *dev, u64 dma_mask, + u64 *phys_mask); void *dma_direct_alloc(struct device *dev, size_t size, dma_addr_t *dma_handle, gfp_t gfp, unsigned long attrs); void dma_direct_free(struct device *dev, size_t size, void *cpu_addr, diff --git a/include/linux/dma-mapping.h b/include/linux/dma-mapping.h index 330ad58fbf4d..b43116a6405d 100644 --- a/include/linux/dma-mapping.h +++ b/include/linux/dma-mapping.h @@ -630,9 +630,9 @@ void *dma_common_pages_remap(struct page **pages, size_t size, pgprot_t prot, const void *caller); void dma_common_free_remap(void *cpu_addr, size_t size); -bool dma_in_atomic_pool(void *start, size_t size); -void *dma_alloc_from_pool(size_t size, struct page **ret_page, gfp_t flags); -bool dma_free_from_pool(void *start, size_t size); +void *dma_alloc_from_pool(struct device *dev, size_t size, + struct page **ret_page, gfp_t flags); +bool dma_free_from_pool(struct device *dev, void *start, size_t size); int dma_common_get_sgtable(struct device *dev, struct sg_table *sgt, void *cpu_addr, diff --git a/kernel/dma/direct.c b/kernel/dma/direct.c index 8f4bbdaf965e..a834ee22f8ff 100644 --- a/kernel/dma/direct.c +++ b/kernel/dma/direct.c @@ -45,8 +45,8 @@ u64 dma_direct_get_required_mask(struct device *dev) return (1ULL << (fls64(max_dma) - 1)) * 2 - 1; } -static gfp_t __dma_direct_optimal_gfp_mask(struct device *dev, u64 dma_mask, - u64 *phys_limit) +gfp_t dma_direct_optimal_gfp_mask(struct device *dev, u64 dma_mask, + u64 *phys_limit) { u64 dma_limit = min_not_zero(dma_mask, dev->bus_dma_limit); @@ -89,8 +89,8 @@ struct page *__dma_direct_alloc_pages(struct device *dev, size_t size, /* we always manually zero the memory once we are done: */ gfp &= ~__GFP_ZERO; - gfp |= __dma_direct_optimal_gfp_mask(dev, dev->coherent_dma_mask, - &phys_limit); + gfp |= dma_direct_optimal_gfp_mask(dev, dev->coherent_dma_mask, + &phys_limit); page = dma_alloc_contiguous(dev, alloc_size, gfp); if (page && !dma_coherent_ok(dev, page_to_phys(page), size)) { dma_free_contiguous(dev, page, alloc_size); @@ -128,7 +128,7 @@ void *dma_direct_alloc_pages(struct device *dev, size_t size, if (IS_ENABLED(CONFIG_DMA_DIRECT_REMAP) && dma_alloc_need_uncached(dev, attrs) && !gfpflags_allow_blocking(gfp)) { - ret = dma_alloc_from_pool(PAGE_ALIGN(size), &page, gfp); + ret = dma_alloc_from_pool(dev, PAGE_ALIGN(size), &page, gfp); if (!ret) return NULL; goto done; @@ -212,7 +212,7 @@ void dma_direct_free_pages(struct device *dev, size_t size, void *cpu_addr, } if (IS_ENABLED(CONFIG_DMA_DIRECT_REMAP) && - dma_free_from_pool(cpu_addr, PAGE_ALIGN(size))) + 
dma_free_from_pool(dev, cpu_addr, PAGE_ALIGN(size))) return; if (force_dma_unencrypted(dev)) diff --git a/kernel/dma/pool.c b/kernel/dma/pool.c index 3df5d9d39922..db4f89ac5f5f 100644 --- a/kernel/dma/pool.c +++ b/kernel/dma/pool.c @@ -10,7 +10,9 @@ #include #include -static struct gen_pool *atomic_pool __ro_after_init; +static struct gen_pool *atomic_pool_dma __ro_after_init; +static struct gen_pool *atomic_pool_dma32 __ro_after_init; +static struct gen_pool *atomic_pool_kernel __ro_after_init; #define DEFAULT_DMA_COHERENT_POOL_SIZE SZ_256K static size_t atomic_pool_size __initdata = DEFAULT_DMA_COHERENT_POOL_SIZE; @@ -22,89 +24,119 @@ static int __init early_coherent_pool(char *p) } early_param("coherent_pool", early_coherent_pool); -static gfp_t dma_atomic_pool_gfp(void) +static int __init __dma_atomic_pool_init(struct gen_pool **pool, + size_t pool_size, gfp_t gfp) { - if (IS_ENABLED(CONFIG_ZONE_DMA)) - return GFP_DMA; - if (IS_ENABLED(CONFIG_ZONE_DMA32)) - return GFP_DMA32; - return GFP_KERNEL; -} - -static int __init dma_atomic_pool_init(void) -{ - unsigned int pool_size_order = get_order(atomic_pool_size); - unsigned long nr_pages = atomic_pool_size >> PAGE_SHIFT; + const unsigned int order = get_order(pool_size); + const unsigned long nr_pages = pool_size >> PAGE_SHIFT; struct page *page; void *addr; int ret; if (dev_get_cma_area(NULL)) - page = dma_alloc_from_contiguous(NULL, nr_pages, - pool_size_order, false); + page = dma_alloc_from_contiguous(NULL, nr_pages, order, false); else - page = alloc_pages(dma_atomic_pool_gfp(), pool_size_order); + page = alloc_pages(gfp, order); if (!page) goto out; - arch_dma_prep_coherent(page, atomic_pool_size); + arch_dma_prep_coherent(page, pool_size); - atomic_pool = gen_pool_create(PAGE_SHIFT, -1); - if (!atomic_pool) + *pool = gen_pool_create(PAGE_SHIFT, -1); + if (!*pool) goto free_page; - addr = dma_common_contiguous_remap(page, atomic_pool_size, + addr = dma_common_contiguous_remap(page, pool_size, pgprot_dmacoherent(PAGE_KERNEL), __builtin_return_address(0)); if (!addr) goto destroy_genpool; - ret = gen_pool_add_virt(atomic_pool, (unsigned long)addr, - page_to_phys(page), atomic_pool_size, -1); + ret = gen_pool_add_virt(*pool, (unsigned long)addr, page_to_phys(page), + pool_size, -1); if (ret) goto remove_mapping; - gen_pool_set_algo(atomic_pool, gen_pool_first_fit_order_align, NULL); + gen_pool_set_algo(*pool, gen_pool_first_fit_order_align, NULL); - pr_info("DMA: preallocated %zu KiB pool for atomic allocations\n", - atomic_pool_size / 1024); + pr_info("DMA: preallocated %zu KiB %pGg pool for atomic allocations\n", + pool_size >> 10, &gfp); return 0; remove_mapping: - dma_common_free_remap(addr, atomic_pool_size); + dma_common_free_remap(addr, pool_size); destroy_genpool: - gen_pool_destroy(atomic_pool); - atomic_pool = NULL; + gen_pool_destroy(*pool); + *pool = NULL; free_page: if (!dma_release_from_contiguous(NULL, page, nr_pages)) - __free_pages(page, pool_size_order); + __free_pages(page, order); out: - pr_err("DMA: failed to allocate %zu KiB pool for atomic coherent allocation\n", - atomic_pool_size / 1024); + pr_err("DMA: failed to allocate %zu KiB %pGg pool for atomic allocation\n", + pool_size >> 10, &gfp); return -ENOMEM; } + +static int __init dma_atomic_pool_init(void) +{ + int ret = 0; + int err; + + ret = __dma_atomic_pool_init(&atomic_pool_kernel, atomic_pool_size, + GFP_KERNEL); + if (IS_ENABLED(CONFIG_ZONE_DMA)) { + err = __dma_atomic_pool_init(&atomic_pool_dma, + atomic_pool_size, GFP_DMA); + if (!ret && err) + ret = 
err; + } + if (IS_ENABLED(CONFIG_ZONE_DMA32)) { + err = __dma_atomic_pool_init(&atomic_pool_dma32, + atomic_pool_size, GFP_DMA32); + if (!ret && err) + ret = err; + } + return ret; +} postcore_initcall(dma_atomic_pool_init); -bool dma_in_atomic_pool(void *start, size_t size) +static inline struct gen_pool *dev_to_pool(struct device *dev) { - if (unlikely(!atomic_pool)) - return false; + u64 phys_mask; + gfp_t gfp; + + gfp = dma_direct_optimal_gfp_mask(dev, dev->coherent_dma_mask, + &phys_mask); + if (IS_ENABLED(CONFIG_ZONE_DMA) && gfp == GFP_DMA) + return atomic_pool_dma; + if (IS_ENABLED(CONFIG_ZONE_DMA32) && gfp == GFP_DMA32) + return atomic_pool_dma32; + return atomic_pool_kernel; +} - return gen_pool_has_addr(atomic_pool, (unsigned long)start, size); +static bool dma_in_atomic_pool(struct device *dev, void *start, size_t size) +{ + struct gen_pool *pool = dev_to_pool(dev); + + if (unlikely(!pool)) + return false; + return gen_pool_has_addr(pool, (unsigned long)start, size); } -void *dma_alloc_from_pool(size_t size, struct page **ret_page, gfp_t flags) +void *dma_alloc_from_pool(struct device *dev, size_t size, + struct page **ret_page, gfp_t flags) { + struct gen_pool *pool = dev_to_pool(dev); unsigned long val; void *ptr = NULL; - if (!atomic_pool) { - WARN(1, "coherent pool not initialised!\n"); + if (!pool) { + WARN(1, "%pGg atomic pool not initialised!\n", &flags); return NULL; } - val = gen_pool_alloc(atomic_pool, size); + val = gen_pool_alloc(pool, size); if (val) { - phys_addr_t phys = gen_pool_virt_to_phys(atomic_pool, val); + phys_addr_t phys = gen_pool_virt_to_phys(pool, val); *ret_page = pfn_to_page(__phys_to_pfn(phys)); ptr = (void *)val; @@ -114,10 +146,12 @@ void *dma_alloc_from_pool(size_t size, struct page **ret_page, gfp_t flags) return ptr; } -bool dma_free_from_pool(void *start, size_t size) +bool dma_free_from_pool(struct device *dev, void *start, size_t size) { - if (!dma_in_atomic_pool(start, size)) + struct gen_pool *pool = dev_to_pool(dev); + + if (!dma_in_atomic_pool(dev, start, size)) return false; - gen_pool_free(atomic_pool, (unsigned long)start, size); + gen_pool_free(pool, (unsigned long)start, size); return true; } -- cgit v1.2.3 From e9d7144597b10ff13ff2264c059f7d4a7fbc89ac Mon Sep 17 00:00:00 2001 From: Mark Gross Date: Thu, 16 Apr 2020 17:23:10 +0200 Subject: x86/cpu: Add a steppings field to struct x86_cpu_id Intel uses the same family/model for several CPUs. Sometimes the stepping must be checked to tell them apart. On x86 there can be at most 16 steppings. Add a steppings bitmask to x86_cpu_id and a X86_MATCH_VENDOR_FAMILY_MODEL_STEPPING_FEATURE macro and support for matching against family/model/stepping. [ bp: Massage. 
] Signed-off-by: Mark Gross Signed-off-by: Borislav Petkov Signed-off-by: Thomas Gleixner Reviewed-by: Tony Luck Reviewed-by: Josh Poimboeuf --- arch/x86/include/asm/cpu_device_id.h | 27 ++++++++++++++++++++++++--- arch/x86/kernel/cpu/match.c | 7 ++++++- include/linux/mod_devicetable.h | 2 ++ 3 files changed, 32 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/arch/x86/include/asm/cpu_device_id.h b/arch/x86/include/asm/cpu_device_id.h index cf3d621c6892..10426cd56dca 100644 --- a/arch/x86/include/asm/cpu_device_id.h +++ b/arch/x86/include/asm/cpu_device_id.h @@ -20,12 +20,14 @@ #define X86_CENTAUR_FAM6_C7_D 0xd #define X86_CENTAUR_FAM6_NANO 0xf +#define X86_STEPPINGS(mins, maxs) GENMASK(maxs, mins) /** - * X86_MATCH_VENDOR_FAM_MODEL_FEATURE - Base macro for CPU matching + * X86_MATCH_VENDOR_FAM_MODEL_STEPPINGS_FEATURE - Base macro for CPU matching * @_vendor: The vendor name, e.g. INTEL, AMD, HYGON, ..., ANY * The name is expanded to X86_VENDOR_@_vendor * @_family: The family number or X86_FAMILY_ANY * @_model: The model number, model constant or X86_MODEL_ANY + * @_steppings: Bitmask for steppings, stepping constant or X86_STEPPING_ANY * @_feature: A X86_FEATURE bit or X86_FEATURE_ANY * @_data: Driver specific data or NULL. The internal storage * format is unsigned long. The supplied value, pointer @@ -37,15 +39,34 @@ * into another macro at the usage site for good reasons, then please * start this local macro with X86_MATCH to allow easy grepping. */ -#define X86_MATCH_VENDOR_FAM_MODEL_FEATURE(_vendor, _family, _model, \ - _feature, _data) { \ +#define X86_MATCH_VENDOR_FAM_MODEL_STEPPINGS_FEATURE(_vendor, _family, _model, \ + _steppings, _feature, _data) { \ .vendor = X86_VENDOR_##_vendor, \ .family = _family, \ .model = _model, \ + .steppings = _steppings, \ .feature = _feature, \ .driver_data = (unsigned long) _data \ } +/** + * X86_MATCH_VENDOR_FAM_MODEL_FEATURE - Macro for CPU matching + * @_vendor: The vendor name, e.g. INTEL, AMD, HYGON, ..., ANY + * The name is expanded to X86_VENDOR_@_vendor + * @_family: The family number or X86_FAMILY_ANY + * @_model: The model number, model constant or X86_MODEL_ANY + * @_feature: A X86_FEATURE bit or X86_FEATURE_ANY + * @_data: Driver specific data or NULL. The internal storage + * format is unsigned long. The supplied value, pointer + * etc. is casted to unsigned long internally. + * + * The steppings arguments of X86_MATCH_VENDOR_FAM_MODEL_STEPPINGS_FEATURE() is + * set to wildcards. + */ +#define X86_MATCH_VENDOR_FAM_MODEL_FEATURE(vendor, family, model, feature, data) \ + X86_MATCH_VENDOR_FAM_MODEL_STEPPINGS_FEATURE(vendor, family, model, \ + X86_STEPPING_ANY, feature, data) + /** * X86_MATCH_VENDOR_FAM_FEATURE - Macro for matching vendor, family and CPU feature * @vendor: The vendor name, e.g. 
INTEL, AMD, HYGON, ..., ANY diff --git a/arch/x86/kernel/cpu/match.c b/arch/x86/kernel/cpu/match.c index d3482eb43ff3..ad6776081e60 100644 --- a/arch/x86/kernel/cpu/match.c +++ b/arch/x86/kernel/cpu/match.c @@ -39,13 +39,18 @@ const struct x86_cpu_id *x86_match_cpu(const struct x86_cpu_id *match) const struct x86_cpu_id *m; struct cpuinfo_x86 *c = &boot_cpu_data; - for (m = match; m->vendor | m->family | m->model | m->feature; m++) { + for (m = match; + m->vendor | m->family | m->model | m->steppings | m->feature; + m++) { if (m->vendor != X86_VENDOR_ANY && c->x86_vendor != m->vendor) continue; if (m->family != X86_FAMILY_ANY && c->x86 != m->family) continue; if (m->model != X86_MODEL_ANY && c->x86_model != m->model) continue; + if (m->steppings != X86_STEPPING_ANY && + !(BIT(c->x86_stepping) & m->steppings)) + continue; if (m->feature != X86_FEATURE_ANY && !cpu_has(c, m->feature)) continue; return m; diff --git a/include/linux/mod_devicetable.h b/include/linux/mod_devicetable.h index 4c2ddd0941a7..0754b8d71262 100644 --- a/include/linux/mod_devicetable.h +++ b/include/linux/mod_devicetable.h @@ -663,6 +663,7 @@ struct x86_cpu_id { __u16 vendor; __u16 family; __u16 model; + __u16 steppings; __u16 feature; /* bit index */ kernel_ulong_t driver_data; }; @@ -671,6 +672,7 @@ struct x86_cpu_id { #define X86_VENDOR_ANY 0xffff #define X86_FAMILY_ANY 0 #define X86_MODEL_ANY 0 +#define X86_STEPPING_ANY 0 #define X86_FEATURE_ANY 0 /* Same as FPU, you can't test for that */ /* -- cgit v1.2.3 From 02094d54870590a667f822e4032bbd1ba6c48d00 Mon Sep 17 00:00:00 2001 From: Andy Shevchenko Date: Wed, 8 Apr 2020 19:09:02 +0300 Subject: software node: Allow register and unregister software node groups Sometimes it's more convenient to register a set of individual software nodes grouped together. Add couple of functions for that. Signed-off-by: Andy Shevchenko Acked-by: Rafael J. Wysocki Reviewed-by: Hans de Goede Tested-by: Hans de Goede Reviewed-by: Heikki Krogerus --- drivers/base/swnode.c | 48 ++++++++++++++++++++++++++++++++++++++++++++++++ include/linux/property.h | 3 +++ 2 files changed, 51 insertions(+) (limited to 'include/linux') diff --git a/drivers/base/swnode.c b/drivers/base/swnode.c index de8d3543e8fe..2079937ddb51 100644 --- a/drivers/base/swnode.c +++ b/drivers/base/swnode.c @@ -726,6 +726,54 @@ void software_node_unregister_nodes(const struct software_node *nodes) } EXPORT_SYMBOL_GPL(software_node_unregister_nodes); +/** + * software_node_register_node_group - Register a group of software nodes + * @node_group: NULL terminated array of software node pointers to be registered + * + * Register multiple software nodes at once. + */ +int software_node_register_node_group(const struct software_node **node_group) +{ + unsigned int i; + int ret; + + if (!node_group) + return 0; + + for (i = 0; node_group[i]; i++) { + ret = software_node_register(node_group[i]); + if (ret) { + software_node_unregister_node_group(node_group); + return ret; + } + } + + return 0; +} +EXPORT_SYMBOL_GPL(software_node_register_node_group); + +/** + * software_node_unregister_node_group - Unregister a group of software nodes + * @node_group: NULL terminated array of software node pointers to be unregistered + * + * Unregister multiple software nodes at once. 
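 *
 * Illustrative sketch, not part of this patch: registering and later
 * dropping a hypothetical two-node group (the array must be
 * NULL-terminated):
 *
 *	static const struct software_node node_a = { .name = "node-a" };
 *	static const struct software_node node_b = { .name = "node-b" };
 *	static const struct software_node *foo_group[] = {
 *		&node_a, &node_b, NULL,
 *	};
 *
 *	ret = software_node_register_node_group(foo_group);
 *	if (ret)
 *		return ret;
 *	...
 *	software_node_unregister_node_group(foo_group);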
+ */ +void software_node_unregister_node_group(const struct software_node **node_group) +{ + struct swnode *swnode; + unsigned int i; + + if (!node_group) + return; + + for (i = 0; node_group[i]; i++) { + swnode = software_node_to_swnode(node_group[i]); + if (swnode) + fwnode_remove_software_node(&swnode->fwnode); + } +} +EXPORT_SYMBOL_GPL(software_node_unregister_node_group); + /** * software_node_register - Register static software node * @node: The software node to be registered diff --git a/include/linux/property.h b/include/linux/property.h index d86de017c689..c7b5f3db36aa 100644 --- a/include/linux/property.h +++ b/include/linux/property.h @@ -440,6 +440,9 @@ software_node_find_by_name(const struct software_node *parent, int software_node_register_nodes(const struct software_node *nodes); void software_node_unregister_nodes(const struct software_node *nodes); +int software_node_register_node_group(const struct software_node **node_group); +void software_node_unregister_node_group(const struct software_node **node_group); + int software_node_register(const struct software_node *node); int software_node_notify(struct device *dev, unsigned long action); -- cgit v1.2.3 From 2a6ba3f794e892c37d67b8ebb19487ce105eabc2 Mon Sep 17 00:00:00 2001 From: Sumit Garg Date: Fri, 27 Mar 2020 10:59:47 +0530 Subject: tee: enable support to register kernel memory Enable support to register kernel memory reference with TEE. This change will allow TEE bus drivers to register memory references. Signed-off-by: Sumit Garg Signed-off-by: Jens Wiklander --- drivers/tee/tee_shm.c | 28 +++++++++++++++++++++++++--- include/linux/tee_drv.h | 1 + 2 files changed, 26 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/drivers/tee/tee_shm.c b/drivers/tee/tee_shm.c index bd679b72bd05..c259271d4d5f 100644 --- a/drivers/tee/tee_shm.c +++ b/drivers/tee/tee_shm.c @@ -9,6 +9,7 @@ #include #include #include +#include #include "tee_private.h" static void tee_shm_release(struct tee_shm *shm) @@ -185,14 +186,15 @@ struct tee_shm *tee_shm_register(struct tee_context *ctx, unsigned long addr, size_t length, u32 flags) { struct tee_device *teedev = ctx->teedev; - const u32 req_flags = TEE_SHM_DMA_BUF | TEE_SHM_USER_MAPPED; + const u32 req_user_flags = TEE_SHM_DMA_BUF | TEE_SHM_USER_MAPPED; + const u32 req_kernel_flags = TEE_SHM_DMA_BUF | TEE_SHM_KERNEL_MAPPED; struct tee_shm *shm; void *ret; int rc; int num_pages; unsigned long start; - if (flags != req_flags) + if (flags != req_user_flags && flags != req_kernel_flags) return ERR_PTR(-ENOTSUPP); if (!tee_device_get(teedev)) @@ -226,7 +228,27 @@ struct tee_shm *tee_shm_register(struct tee_context *ctx, unsigned long addr, goto err; } - rc = get_user_pages_fast(start, num_pages, FOLL_WRITE, shm->pages); + if (flags & TEE_SHM_USER_MAPPED) { + rc = get_user_pages_fast(start, num_pages, FOLL_WRITE, + shm->pages); + } else { + struct kvec *kiov; + int i; + + kiov = kcalloc(num_pages, sizeof(*kiov), GFP_KERNEL); + if (!kiov) { + ret = ERR_PTR(-ENOMEM); + goto err; + } + + for (i = 0; i < num_pages; i++) { + kiov[i].iov_base = (void *)(start + i * PAGE_SIZE); + kiov[i].iov_len = PAGE_SIZE; + } + + rc = get_kernel_pages(kiov, num_pages, 0, shm->pages); + kfree(kiov); + } if (rc > 0) shm->num_pages = rc; if (rc != num_pages) { diff --git a/include/linux/tee_drv.h b/include/linux/tee_drv.h index 1412e9cc79ce..e96154d92b1e 100644 --- a/include/linux/tee_drv.h +++ b/include/linux/tee_drv.h @@ -26,6 +26,7 @@ #define TEE_SHM_REGISTER BIT(3) /* Memory registered in secure 
world */ #define TEE_SHM_USER_MAPPED BIT(4) /* Memory mapped in user space */ #define TEE_SHM_POOL BIT(5) /* Memory allocated from pool */ +#define TEE_SHM_KERNEL_MAPPED BIT(6) /* Memory mapped in kernel space */ struct device; struct tee_device; -- cgit v1.2.3 From e44ab4e14d6f4c448ae555132090c1a116b19e5c Mon Sep 17 00:00:00 2001 From: Dejin Zheng Date: Mon, 20 Apr 2020 21:46:46 +0800 Subject: regmap: Simplify implementation of the regmap_read_poll_timeout() macro Simplify the implementation of the macro regmap_read_poll_timeout() by using the macro read_poll_timeout(). Signed-off-by: Dejin Zheng Link: https://lore.kernel.org/r/20200420134647.9121-2-zhengdejin5@gmail.com Signed-off-by: Mark Brown --- include/linux/regmap.h | 25 +++++-------------------- 1 file changed, 5 insertions(+), 20 deletions(-) (limited to 'include/linux') diff --git a/include/linux/regmap.h b/include/linux/regmap.h index 91420af1c30d..b50e930a9c18 100644 --- a/include/linux/regmap.h +++ b/include/linux/regmap.h @@ -17,6 +17,7 @@ #include #include #include +#include struct module; struct clk; @@ -130,26 +131,10 @@ struct reg_sequence { */ #define regmap_read_poll_timeout(map, addr, val, cond, sleep_us, timeout_us) \ ({ \ - u64 __timeout_us = (timeout_us); \ - unsigned long __sleep_us = (sleep_us); \ - ktime_t __timeout = ktime_add_us(ktime_get(), __timeout_us); \ - int __ret; \ - might_sleep_if(__sleep_us); \ - for (;;) { \ - __ret = regmap_read((map), (addr), &(val)); \ - if (__ret) \ - break; \ - if (cond) \ - break; \ - if ((__timeout_us) && \ - ktime_compare(ktime_get(), __timeout) > 0) { \ - __ret = regmap_read((map), (addr), &(val)); \ - break; \ - } \ - if (__sleep_us) \ - usleep_range((__sleep_us >> 2) + 1, __sleep_us); \ - } \ - __ret ?: ((cond) ? 0 : -ETIMEDOUT); \ + int __ret, __tmp; \ + __tmp = read_poll_timeout(regmap_read, __ret, __ret || (cond), \ + sleep_us, timeout_us, false, (map), (addr), &(val)); \ + __ret ?: __tmp; \ }) /** -- cgit v1.2.3 From 148c01d176237115d9c2805f6d29c0b6a72fbd10 Mon Sep 17 00:00:00 2001 From: Dejin Zheng Date: Mon, 20 Apr 2020 21:46:47 +0800 Subject: regmap: Simplify implementation of the regmap_field_read_poll_timeout() macro Simplify the implementation of the macro regmap_field_read_poll_timeout() by using the macro read_poll_timeout(). Signed-off-by: Dejin Zheng Link: https://lore.kernel.org/r/20200420134647.9121-3-zhengdejin5@gmail.com Signed-off-by: Mark Brown --- include/linux/regmap.h | 23 ++++------------------- 1 file changed, 4 insertions(+), 19 deletions(-) (limited to 'include/linux') diff --git a/include/linux/regmap.h b/include/linux/regmap.h index b50e930a9c18..4c4fbe4d41a6 100644 --- a/include/linux/regmap.h +++ b/include/linux/regmap.h @@ -202,25 +202,10 @@ struct reg_sequence { */ #define regmap_field_read_poll_timeout(field, val, cond, sleep_us, timeout_us) \ ({ \ - u64 __timeout_us = (timeout_us); \ - unsigned long __sleep_us = (sleep_us); \ - ktime_t timeout = ktime_add_us(ktime_get(), __timeout_us); \ - int pollret; \ - might_sleep_if(__sleep_us); \ - for (;;) { \ - pollret = regmap_field_read((field), &(val)); \ - if (pollret) \ - break; \ - if (cond) \ - break; \ - if (__timeout_us && ktime_compare(ktime_get(), timeout) > 0) { \ - pollret = regmap_field_read((field), &(val)); \ - break; \ - } \ - if (__sleep_us) \ - usleep_range((__sleep_us >> 2) + 1, __sleep_us); \ - } \ - pollret ?: ((cond) ? 
0 : -ETIMEDOUT); \ + int __ret, __tmp; \ + __tmp = read_poll_timeout(regmap_field_read, __ret, __ret || (cond), \ + sleep_us, timeout_us, false, (field), &(val)); \ + __ret ?: __tmp; \ }) #ifdef CONFIG_REGMAP -- cgit v1.2.3 From b9151e7bca82a17ff7aa442a168e0f924a07443c Mon Sep 17 00:00:00 2001 From: Douglas Anderson Date: Mon, 20 Apr 2020 09:24:52 -0700 Subject: blk-mq: Add blk_mq_delay_run_hw_queues() API call We have: * blk_mq_run_hw_queue() * blk_mq_delay_run_hw_queue() * blk_mq_run_hw_queues() ...but not blk_mq_delay_run_hw_queues(), presumably because nobody needed it before now. Since we need it for a later patch in this series, add it. Signed-off-by: Douglas Anderson Reviewed-by: Ming Lei Signed-off-by: Jens Axboe --- block/blk-mq.c | 19 +++++++++++++++++++ include/linux/blk-mq.h | 1 + 2 files changed, 20 insertions(+) (limited to 'include/linux') diff --git a/block/blk-mq.c b/block/blk-mq.c index 1c4bedf500c5..cf95e8e0881a 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -1545,6 +1545,25 @@ void blk_mq_run_hw_queues(struct request_queue *q, bool async) } EXPORT_SYMBOL(blk_mq_run_hw_queues); +/** + * blk_mq_delay_run_hw_queues - Run all hardware queues asynchronously. + * @q: Pointer to the request queue to run. + * @msecs: Milliseconds of delay to wait before running the queues. + */ +void blk_mq_delay_run_hw_queues(struct request_queue *q, unsigned long msecs) +{ + struct blk_mq_hw_ctx *hctx; + int i; + + queue_for_each_hw_ctx(q, hctx, i) { + if (blk_mq_hctx_stopped(hctx)) + continue; + + blk_mq_delay_run_hw_queue(hctx, msecs); + } +} +EXPORT_SYMBOL(blk_mq_delay_run_hw_queues); + /** * blk_mq_queue_stopped() - check whether one or more hctxs have been stopped * @q: request queue. diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h index b45148ba3291..51fbf6f76593 100644 --- a/include/linux/blk-mq.h +++ b/include/linux/blk-mq.h @@ -508,6 +508,7 @@ void blk_mq_unquiesce_queue(struct request_queue *q); void blk_mq_delay_run_hw_queue(struct blk_mq_hw_ctx *hctx, unsigned long msecs); void blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx, bool async); void blk_mq_run_hw_queues(struct request_queue *q, bool async); +void blk_mq_delay_run_hw_queues(struct request_queue *q, unsigned long msecs); void blk_mq_tagset_busy_iter(struct blk_mq_tag_set *tagset, busy_tag_iter_fn *fn, void *priv); void blk_mq_tagset_wait_completed_request(struct blk_mq_tag_set *tagset); -- cgit v1.2.3 From d46430bf5a2298f55e20f59a90ebe3545d273b2f Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 14 Apr 2020 09:28:57 +0200 Subject: block: remove the disk argument from blk_drop_partitions The gendisk can be trivially deduced from the block_device. 
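An illustrative before/after sketch of a call site (it mirrors the bdev_disk_changed() hunk in the diff below):

	/* before: the caller had to pass both objects */
	ret = blk_drop_partitions(disk, bdev);

	/* after: the helper recovers the gendisk as bdev->bd_disk */
	ret = blk_drop_partitions(bdev);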
Signed-off-by: Christoph Hellwig Reviewed-by: Johannes Thumshirn Signed-off-by: Jens Axboe --- block/partitions/core.c | 10 +++++----- fs/block_dev.c | 2 +- include/linux/genhd.h | 2 +- 3 files changed, 7 insertions(+), 7 deletions(-) (limited to 'include/linux') diff --git a/block/partitions/core.c b/block/partitions/core.c index 7d062599aa33..deccc3fbcd37 100644 --- a/block/partitions/core.c +++ b/block/partitions/core.c @@ -603,23 +603,23 @@ static bool disk_unlock_native_capacity(struct gendisk *disk) } } -int blk_drop_partitions(struct gendisk *disk, struct block_device *bdev) +int blk_drop_partitions(struct block_device *bdev) { struct disk_part_iter piter; struct hd_struct *part; int res; - if (!disk_part_scan_enabled(disk)) + if (!disk_part_scan_enabled(bdev->bd_disk)) return 0; if (bdev->bd_part_count || bdev->bd_openers > 1) return -EBUSY; - res = invalidate_partition(disk, 0); + res = invalidate_partition(bdev->bd_disk, 0); if (res) return res; - disk_part_iter_init(&piter, disk, DISK_PITER_INCL_EMPTY); + disk_part_iter_init(&piter, bdev->bd_disk, DISK_PITER_INCL_EMPTY); while ((part = disk_part_iter_next(&piter))) - delete_partition(disk, part); + delete_partition(bdev->bd_disk, part); disk_part_iter_exit(&piter); return 0; diff --git a/fs/block_dev.c b/fs/block_dev.c index 52b6f646cdbd..9c8de54fa0c9 100644 --- a/fs/block_dev.c +++ b/fs/block_dev.c @@ -1517,7 +1517,7 @@ int bdev_disk_changed(struct block_device *bdev, bool invalidate) lockdep_assert_held(&bdev->bd_mutex); rescan: - ret = blk_drop_partitions(disk, bdev); + ret = blk_drop_partitions(bdev); if (ret) return ret; diff --git a/include/linux/genhd.h b/include/linux/genhd.h index 9b3fffdf4011..058d895544c7 100644 --- a/include/linux/genhd.h +++ b/include/linux/genhd.h @@ -339,7 +339,7 @@ extern dev_t blk_lookup_devt(const char *name, int partno); int bdev_disk_changed(struct block_device *bdev, bool invalidate); int blk_add_partitions(struct gendisk *disk, struct block_device *bdev); -int blk_drop_partitions(struct gendisk *disk, struct block_device *bdev); +int blk_drop_partitions(struct block_device *bdev); extern void printk_all_partitions(void); extern struct gendisk *__alloc_disk_node(int minors, int node_id); -- cgit v1.2.3 From 02d33b6771fcc63c98cb48cad0cd8b8fb033837a Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 14 Apr 2020 09:29:01 +0200 Subject: block: mark invalidate_partition static invalidate_partition is only used in genhd.c, so mark it static. Also drop the return value given that it is always ignored. 
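The remaining callers live in genhd.c; condensed from the del_gendisk() hunk below, the pattern the now-static helper serves is:

	disk_part_iter_init(&piter, disk,
			    DISK_PITER_INCL_EMPTY | DISK_PITER_REVERSE);
	while ((part = disk_part_iter_next(&piter)))
		invalidate_partition(disk, part->partno);
	disk_part_iter_exit(&piter);
	invalidate_partition(disk, 0);

with the return value checked nowhere.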
Signed-off-by: Christoph Hellwig Reviewed-by: Johannes Thumshirn Signed-off-by: Jens Axboe --- block/genhd.c | 27 +++++++++++++-------------- include/linux/fs.h | 1 - 2 files changed, 13 insertions(+), 15 deletions(-) (limited to 'include/linux') diff --git a/block/genhd.c b/block/genhd.c index 1cc50ad5b191..980a4609d4a5 100644 --- a/block/genhd.c +++ b/block/genhd.c @@ -878,6 +878,19 @@ void device_add_disk_no_queue_reg(struct device *parent, struct gendisk *disk) } EXPORT_SYMBOL(device_add_disk_no_queue_reg); +static void invalidate_partition(struct gendisk *disk, int partno) +{ + struct block_device *bdev; + + bdev = bdget_disk(disk, partno); + if (!bdev) + return; + + fsync_bdev(bdev); + __invalidate_device(bdev, true); + bdput(bdev); +} + void del_gendisk(struct gendisk *disk) { struct disk_part_iter piter; @@ -1806,20 +1819,6 @@ int bdev_read_only(struct block_device *bdev) EXPORT_SYMBOL(bdev_read_only); -int invalidate_partition(struct gendisk *disk, int partno) -{ - int res = 0; - struct block_device *bdev = bdget_disk(disk, partno); - if (bdev) { - fsync_bdev(bdev); - res = __invalidate_device(bdev, true); - bdput(bdev); - } - return res; -} - -EXPORT_SYMBOL(invalidate_partition); - /* * Disk events - monitor disk events like media change and eject request. */ diff --git a/include/linux/fs.h b/include/linux/fs.h index 4f6f59b4f22a..2b4e9f86b151 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -2723,7 +2723,6 @@ extern bool is_bad_inode(struct inode *); extern int revalidate_disk(struct gendisk *); extern int check_disk_change(struct block_device *); extern int __invalidate_device(struct block_device *, bool); -extern int invalidate_partition(struct gendisk *, int); #endif unsigned long invalidate_mapping_pages(struct address_space *mapping, pgoff_t start, pgoff_t end); -- cgit v1.2.3 From 9bc5c397d8384b50c8202f4400bf2f87fe8291d9 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 14 Apr 2020 09:29:02 +0200 Subject: block: fold bdev_unhash_inode into invalidate_partition invalidate_partition and bdev_unhash_inode are always paired, and invalidate_partition already does an icache lookup for the block device inode. Piggy back on that to remove the inode from the hash. Signed-off-by: Christoph Hellwig Reviewed-by: Johannes Thumshirn Signed-off-by: Jens Axboe --- block/genhd.c | 8 ++++++-- fs/block_dev.c | 15 --------------- include/linux/fs.h | 1 - 3 files changed, 6 insertions(+), 18 deletions(-) (limited to 'include/linux') diff --git a/block/genhd.c b/block/genhd.c index 980a4609d4a5..c05d509877fa 100644 --- a/block/genhd.c +++ b/block/genhd.c @@ -888,6 +888,12 @@ static void invalidate_partition(struct gendisk *disk, int partno) fsync_bdev(bdev); __invalidate_device(bdev, true); + + /* + * Unhash the bdev inode for this device so that it gets evicted as soon + * as last inode reference is dropped. 
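 * (This piggybacks on the icache lookup that bdget_disk() performed
 * above: the inode is unhashed while this reference is still held, and
 * the bdput() below drops it, which is what made the separate
 * bdev_unhash_inode() lookup redundant.)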
+ */ + remove_inode_hash(bdev->bd_inode); bdput(bdev); } @@ -909,13 +915,11 @@ void del_gendisk(struct gendisk *disk) DISK_PITER_INCL_EMPTY | DISK_PITER_REVERSE); while ((part = disk_part_iter_next(&piter))) { invalidate_partition(disk, part->partno); - bdev_unhash_inode(part_devt(part)); delete_partition(disk, part); } disk_part_iter_exit(&piter); invalidate_partition(disk, 0); - bdev_unhash_inode(disk_devt(disk)); set_capacity(disk, 0); disk->flags &= ~GENHD_FL_UP; up_write(&disk->lookup_sem); diff --git a/fs/block_dev.c b/fs/block_dev.c index 9c8de54fa0c9..998820174d3e 100644 --- a/fs/block_dev.c +++ b/fs/block_dev.c @@ -883,21 +883,6 @@ static int bdev_set(struct inode *inode, void *data) static LIST_HEAD(all_bdevs); -/* - * If there is a bdev inode for this device, unhash it so that it gets evicted - * as soon as last inode reference is dropped. - */ -void bdev_unhash_inode(dev_t dev) -{ - struct inode *inode; - - inode = ilookup5(blockdev_superblock, hash(dev), bdev_test, &dev); - if (inode) { - remove_inode_hash(inode); - iput(inode); - } -} - struct block_device *bdget(dev_t dev) { struct block_device *bdev; diff --git a/include/linux/fs.h b/include/linux/fs.h index 2b4e9f86b151..1a95e5158811 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -2581,7 +2581,6 @@ extern struct kmem_cache *names_cachep; #ifdef CONFIG_BLOCK extern int register_blkdev(unsigned int, const char *); extern void unregister_blkdev(unsigned int, const char *); -extern void bdev_unhash_inode(dev_t dev); extern struct block_device *bdget(dev_t); extern struct block_device *bdgrab(struct block_device *bdev); extern void bd_set_size(struct block_device *, loff_t size); -- cgit v1.2.3 From eec517cdb4810b3843eb7707971de3164088bff1 Mon Sep 17 00:00:00 2001 From: Andrew Lunn Date: Mon, 20 Apr 2020 00:11:50 +0200 Subject: net: Add IF_OPER_TESTING RFC 2863 defines the operational state testing. Add support for this state, both as a IF_LINK_MODE_ and __LINK_STATE_. Signed-off-by: Andrew Lunn Signed-off-by: David S. Miller --- include/linux/netdevice.h | 41 +++++++++++++++++++++++++++++++++++++++++ include/uapi/linux/if.h | 1 + net/core/dev.c | 5 +++++ net/core/link_watch.c | 12 ++++++++++-- net/core/rtnetlink.c | 9 ++++++++- 5 files changed, 65 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 130a668049ab..0750b54b3765 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -288,6 +288,7 @@ enum netdev_state_t { __LINK_STATE_NOCARRIER, __LINK_STATE_LINKWATCH_PENDING, __LINK_STATE_DORMANT, + __LINK_STATE_TESTING, }; @@ -3907,6 +3908,46 @@ static inline bool netif_dormant(const struct net_device *dev) } +/** + * netif_testing_on - mark device as under test. + * @dev: network device + * + * Mark device as under test (as per RFC2863). + * + * The testing state indicates that some test(s) must be performed on + * the interface. After completion, of the test, the interface state + * will change to up, dormant, or down, as appropriate. + */ +static inline void netif_testing_on(struct net_device *dev) +{ + if (!test_and_set_bit(__LINK_STATE_TESTING, &dev->state)) + linkwatch_fire_event(dev); +} + +/** + * netif_testing_off - set device as not under test. + * @dev: network device + * + * Device is not in testing state. 
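 *
 * A usage sketch (hypothetical driver code, not part of this patch),
 * bracketing a self-test so that operstate can report IF_OPER_TESTING
 * for its duration:
 *
 *	netif_testing_on(dev);
 *	foo_run_selftest(dev);
 *	netif_testing_off(dev);
 *
 * where foo_run_selftest() stands in for the driver's own test logic.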
+ */ +static inline void netif_testing_off(struct net_device *dev) +{ + if (test_and_clear_bit(__LINK_STATE_TESTING, &dev->state)) + linkwatch_fire_event(dev); +} + +/** + * netif_testing - test if device is under test + * @dev: network device + * + * Check if device is under test + */ +static inline bool netif_testing(const struct net_device *dev) +{ + return test_bit(__LINK_STATE_TESTING, &dev->state); +} + + /** * netif_oper_up - test if device is operational * @dev: network device diff --git a/include/uapi/linux/if.h b/include/uapi/linux/if.h index be714cd8c826..797ba2c1562a 100644 --- a/include/uapi/linux/if.h +++ b/include/uapi/linux/if.h @@ -178,6 +178,7 @@ enum { enum { IF_LINK_MODE_DEFAULT, IF_LINK_MODE_DORMANT, /* limit upward transition to dormant */ + IF_LINK_MODE_TESTING, /* limit upward transition to testing */ }; /* diff --git a/net/core/dev.c b/net/core/dev.c index 522288177bbd..fb61522b1ce1 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -9136,6 +9136,11 @@ void netif_stacked_transfer_operstate(const struct net_device *rootdev, else netif_dormant_off(dev); + if (rootdev->operstate == IF_OPER_TESTING) + netif_testing_on(dev); + else + netif_testing_off(dev); + if (netif_carrier_ok(rootdev)) netif_carrier_on(dev); else diff --git a/net/core/link_watch.c b/net/core/link_watch.c index f153e0601838..75431ca9300f 100644 --- a/net/core/link_watch.c +++ b/net/core/link_watch.c @@ -34,6 +34,9 @@ static DEFINE_SPINLOCK(lweventlist_lock); static unsigned char default_operstate(const struct net_device *dev) { + if (netif_testing(dev)) + return IF_OPER_TESTING; + if (!netif_carrier_ok(dev)) return (dev->ifindex != dev_get_iflink(dev) ? IF_OPER_LOWERLAYERDOWN : IF_OPER_DOWN); @@ -55,11 +58,15 @@ static void rfc2863_policy(struct net_device *dev) write_lock_bh(&dev_base_lock); switch(dev->link_mode) { + case IF_LINK_MODE_TESTING: + if (operstate == IF_OPER_UP) + operstate = IF_OPER_TESTING; + break; + case IF_LINK_MODE_DORMANT: if (operstate == IF_OPER_UP) operstate = IF_OPER_DORMANT; break; - case IF_LINK_MODE_DEFAULT: default: break; @@ -74,7 +81,8 @@ static void rfc2863_policy(struct net_device *dev) void linkwatch_init_dev(struct net_device *dev) { /* Handle pre-registration link state changes */ - if (!netif_carrier_ok(dev) || netif_dormant(dev)) + if (!netif_carrier_ok(dev) || netif_dormant(dev) || + netif_testing(dev)) rfc2863_policy(dev); } diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c index 709ebbf8ab5b..d6f4f4a9e8ba 100644 --- a/net/core/rtnetlink.c +++ b/net/core/rtnetlink.c @@ -829,11 +829,18 @@ static void set_operstate(struct net_device *dev, unsigned char transition) switch (transition) { case IF_OPER_UP: if ((operstate == IF_OPER_DORMANT || + operstate == IF_OPER_TESTING || operstate == IF_OPER_UNKNOWN) && - !netif_dormant(dev)) + !netif_dormant(dev) && !netif_testing(dev)) operstate = IF_OPER_UP; break; + case IF_OPER_TESTING: + if (operstate == IF_OPER_UP || + operstate == IF_OPER_UNKNOWN) + operstate = IF_OPER_TESTING; + break; + case IF_OPER_DORMANT: if (operstate == IF_OPER_UP || operstate == IF_OPER_UNKNOWN) -- cgit v1.2.3 From 1dbd51d0a71a561056579e2d4f406e5ce5343af0 Mon Sep 17 00:00:00 2001 From: Raed Salem Date: Tue, 10 Dec 2019 13:20:55 +0200 Subject: net/mlx5: Refactor mlx5_accel_esp_create_hw_context parameter list Currently the FPGA IPsec is the only hw implementation of the IPsec acceleration api, and so the mlx5_accel_esp_create_hw_context was wrongly made to suit this HW api, among other in its parameter list and some of its parameter 
endianness. This implementation might not be suitable for different HW. Refactor by group and pass all function arguments of mlx5_accel_esp_create_hw_context in common mlx5_accel_esp_xfrm_attrs struct field of mlx5_accel_esp_xfrm struct and correct the endianness according to the HW being called. Signed-off-by: Raed Salem Reviewed-by: Boris Pismenny Reviewed-by: Huy Nguyen Signed-off-by: Saeed Mahameed --- .../net/ethernet/mellanox/mlx5/core/accel/ipsec.c | 20 +++++++++++++------ .../net/ethernet/mellanox/mlx5/core/accel/ipsec.h | 10 ++-------- .../ethernet/mellanox/mlx5/core/en_accel/ipsec.c | 23 +++++++++------------- include/linux/mlx5/accel.h | 12 +++++++++++ 4 files changed, 37 insertions(+), 28 deletions(-) (limited to 'include/linux') diff --git a/drivers/net/ethernet/mellanox/mlx5/core/accel/ipsec.c b/drivers/net/ethernet/mellanox/mlx5/core/accel/ipsec.c index eddc34e4a762..a92cd88d369c 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/accel/ipsec.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/accel/ipsec.c @@ -57,13 +57,21 @@ int mlx5_accel_ipsec_counters_read(struct mlx5_core_dev *mdev, u64 *counters, } void *mlx5_accel_esp_create_hw_context(struct mlx5_core_dev *mdev, - struct mlx5_accel_esp_xfrm *xfrm, - const __be32 saddr[4], - const __be32 daddr[4], - const __be32 spi, bool is_ipv6) + struct mlx5_accel_esp_xfrm *xfrm) { - return mlx5_fpga_ipsec_create_sa_ctx(mdev, xfrm, saddr, daddr, - spi, is_ipv6); + __be32 saddr[4] = {}, daddr[4] = {}; + + if (!xfrm->attrs.is_ipv6) { + saddr[3] = xfrm->attrs.saddr.a4; + daddr[3] = xfrm->attrs.daddr.a4; + } else { + memcpy(saddr, xfrm->attrs.saddr.a6, sizeof(saddr)); + memcpy(daddr, xfrm->attrs.daddr.a6, sizeof(daddr)); + } + + return mlx5_fpga_ipsec_create_sa_ctx(mdev, xfrm, saddr, + daddr, xfrm->attrs.spi, + xfrm->attrs.is_ipv6); } void mlx5_accel_esp_free_hw_context(void *context) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/accel/ipsec.h b/drivers/net/ethernet/mellanox/mlx5/core/accel/ipsec.h index 530e428d46ab..f9b8e2a041c1 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/accel/ipsec.h +++ b/drivers/net/ethernet/mellanox/mlx5/core/accel/ipsec.h @@ -47,10 +47,7 @@ int mlx5_accel_ipsec_counters_read(struct mlx5_core_dev *mdev, u64 *counters, unsigned int count); void *mlx5_accel_esp_create_hw_context(struct mlx5_core_dev *mdev, - struct mlx5_accel_esp_xfrm *xfrm, - const __be32 saddr[4], - const __be32 daddr[4], - const __be32 spi, bool is_ipv6); + struct mlx5_accel_esp_xfrm *xfrm); void mlx5_accel_esp_free_hw_context(void *context); int mlx5_accel_ipsec_init(struct mlx5_core_dev *mdev); @@ -63,10 +60,7 @@ void mlx5_accel_ipsec_cleanup(struct mlx5_core_dev *mdev); static inline void * mlx5_accel_esp_create_hw_context(struct mlx5_core_dev *mdev, - struct mlx5_accel_esp_xfrm *xfrm, - const __be32 saddr[4], - const __be32 daddr[4], - const __be32 spi, bool is_ipv6) + struct mlx5_accel_esp_xfrm *xfrm) { return NULL; } diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_accel/ipsec.c b/drivers/net/ethernet/mellanox/mlx5/core/en_accel/ipsec.c index 29626c6c9c25..9e6c2216c93e 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_accel/ipsec.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_accel/ipsec.c @@ -199,6 +199,14 @@ mlx5e_ipsec_build_accel_xfrm_attrs(struct mlx5e_ipsec_sa_entry *sa_entry, attrs->flags |= (x->props.mode == XFRM_MODE_TRANSPORT) ? 
MLX5_ACCEL_ESP_FLAGS_TRANSPORT : MLX5_ACCEL_ESP_FLAGS_TUNNEL; + + /* spi */ + attrs->spi = x->id.spi; + + /* source , destination ips */ + memcpy(&attrs->saddr, x->props.saddr.a6, sizeof(attrs->saddr)); + memcpy(&attrs->daddr, x->id.daddr.a6, sizeof(attrs->daddr)); + attrs->is_ipv6 = (x->props.family != AF_INET); } static inline int mlx5e_xfrm_validate_state(struct xfrm_state *x) @@ -284,8 +292,6 @@ static int mlx5e_xfrm_add_state(struct xfrm_state *x) struct net_device *netdev = x->xso.dev; struct mlx5_accel_esp_xfrm_attrs attrs; struct mlx5e_priv *priv; - __be32 saddr[4] = {0}, daddr[4] = {0}, spi; - bool is_ipv6 = false; int err; priv = netdev_priv(netdev); @@ -331,20 +337,9 @@ static int mlx5e_xfrm_add_state(struct xfrm_state *x) } /* create hw context */ - if (x->props.family == AF_INET) { - saddr[3] = x->props.saddr.a4; - daddr[3] = x->id.daddr.a4; - } else { - memcpy(saddr, x->props.saddr.a6, sizeof(saddr)); - memcpy(daddr, x->id.daddr.a6, sizeof(daddr)); - is_ipv6 = true; - } - spi = x->id.spi; sa_entry->hw_context = mlx5_accel_esp_create_hw_context(priv->mdev, - sa_entry->xfrm, - saddr, daddr, spi, - is_ipv6); + sa_entry->xfrm); if (IS_ERR(sa_entry->hw_context)) { err = PTR_ERR(sa_entry->hw_context); goto err_xfrm; diff --git a/include/linux/mlx5/accel.h b/include/linux/mlx5/accel.h index 5613e677a5f9..b919d143a9a6 100644 --- a/include/linux/mlx5/accel.h +++ b/include/linux/mlx5/accel.h @@ -92,6 +92,18 @@ struct mlx5_accel_esp_xfrm_attrs { union { struct aes_gcm_keymat aes_gcm; } keymat; + + union { + __be32 a4; + __be32 a6[4]; + } saddr; + + union { + __be32 a4; + __be32 a6[4]; + } daddr; + + u8 is_ipv6; }; struct mlx5_accel_esp_xfrm { -- cgit v1.2.3 From 72ef5e52b3f74c0be47b20f5c434b7ecc830cf40 Mon Sep 17 00:00:00 2001 From: Mauro Carvalho Chehab Date: Tue, 14 Apr 2020 18:48:35 +0200 Subject: docs: fix broken references to text files Several references got broken due to txt to ReST conversion. Several of them can be automatically fixed with: scripts/documentation-file-ref-check --fix Reviewed-by: Mathieu Poirier # hwtracing/coresight/Kconfig Reviewed-by: Paul E. 
McKenney # memory-barrier.txt Acked-by: Alex Shi # translations/zh_CN Acked-by: Federico Vaga # translations/it_IT Acked-by: Marc Zyngier # kvm/arm64 Signed-off-by: Mauro Carvalho Chehab Link: https://lore.kernel.org/r/6f919ddb83a33b5f2a63b6b5f0575737bb2b36aa.1586881715.git.mchehab+huawei@kernel.org Signed-off-by: Jonathan Corbet --- Documentation/memory-barriers.txt | 2 +- Documentation/process/submit-checklist.rst | 2 +- .../translations/it_IT/process/submit-checklist.rst | 2 +- Documentation/translations/ko_KR/memory-barriers.txt | 2 +- Documentation/translations/zh_CN/filesystems/sysfs.txt | 2 +- .../translations/zh_CN/process/submit-checklist.rst | 2 +- Documentation/virt/kvm/arm/pvtime.rst | 2 +- Documentation/virt/kvm/devices/vcpu.rst | 2 +- Documentation/virt/kvm/hypercalls.rst | 4 ++-- arch/powerpc/include/uapi/asm/kvm_para.h | 2 +- drivers/gpu/drm/Kconfig | 2 +- drivers/gpu/drm/drm_ioctl.c | 2 +- drivers/hwtracing/coresight/Kconfig | 2 +- fs/fat/Kconfig | 8 ++++---- fs/fuse/Kconfig | 2 +- fs/fuse/dev.c | 2 +- fs/overlayfs/Kconfig | 6 +++--- include/linux/mm.h | 4 ++-- include/uapi/linux/ethtool_netlink.h | 2 +- include/uapi/rdma/rdma_user_ioctl_cmds.h | 2 +- mm/gup.c | 12 ++++++------ virt/kvm/arm/vgic/vgic-mmio-v3.c | 2 +- virt/kvm/arm/vgic/vgic.h | 4 ++-- 23 files changed, 36 insertions(+), 36 deletions(-) (limited to 'include/linux') diff --git a/Documentation/memory-barriers.txt b/Documentation/memory-barriers.txt index e1c355e84edd..eaabc3134294 100644 --- a/Documentation/memory-barriers.txt +++ b/Documentation/memory-barriers.txt @@ -620,7 +620,7 @@ because the CPUs that the Linux kernel supports don't do writes until they are certain (1) that the write will actually happen, (2) of the location of the write, and (3) of the value to be written. But please carefully read the "CONTROL DEPENDENCIES" section and the -Documentation/RCU/rcu_dereference.txt file: The compiler can and does +Documentation/RCU/rcu_dereference.rst file: The compiler can and does break dependencies in a great many highly creative ways. CPU 1 CPU 2 diff --git a/Documentation/process/submit-checklist.rst b/Documentation/process/submit-checklist.rst index 8e56337d422d..3f8e9d5d95c2 100644 --- a/Documentation/process/submit-checklist.rst +++ b/Documentation/process/submit-checklist.rst @@ -107,7 +107,7 @@ and elsewhere regarding submitting Linux kernel patches. and why. 26) If any ioctl's are added by the patch, then also update - ``Documentation/ioctl/ioctl-number.rst``. + ``Documentation/userspace-api/ioctl/ioctl-number.rst``. 27) If your modified source code depends on or uses any of the kernel APIs or features that are related to the following ``Kconfig`` symbols, diff --git a/Documentation/translations/it_IT/process/submit-checklist.rst b/Documentation/translations/it_IT/process/submit-checklist.rst index 995ee69fab11..3e575502690f 100644 --- a/Documentation/translations/it_IT/process/submit-checklist.rst +++ b/Documentation/translations/it_IT/process/submit-checklist.rst @@ -117,7 +117,7 @@ sottomissione delle patch, in particolare sorgenti che ne spieghi la logica: cosa fanno e perché. 25) Se la patch aggiunge nuove chiamate ioctl, allora aggiornate - ``Documentation/ioctl/ioctl-number.rst``. + ``Documentation/userspace-api/ioctl/ioctl-number.rst``. 
26) Se il codice che avete modificato dipende o usa una qualsiasi interfaccia o funzionalità del kernel che è associata a uno dei seguenti simboli diff --git a/Documentation/translations/ko_KR/memory-barriers.txt b/Documentation/translations/ko_KR/memory-barriers.txt index 2e831ece6e26..e50fe6541335 100644 --- a/Documentation/translations/ko_KR/memory-barriers.txt +++ b/Documentation/translations/ko_KR/memory-barriers.txt @@ -641,7 +641,7 @@ P 는 짝수 번호 캐시 라인에 저장되어 있고, 변수 B 는 홀수 리눅스 커널이 지원하는 CPU 들은 (1) 쓰기가 정말로 일어날지, (2) 쓰기가 어디에 이루어질지, 그리고 (3) 쓰여질 값을 확실히 알기 전까지는 쓰기를 수행하지 않기 때문입니다. 하지만 "컨트롤 의존성" 섹션과 -Documentation/RCU/rcu_dereference.txt 파일을 주의 깊게 읽어 주시기 바랍니다: +Documentation/RCU/rcu_dereference.rst 파일을 주의 깊게 읽어 주시기 바랍니다: 컴파일러는 매우 창의적인 많은 방법으로 종속성을 깰 수 있습니다. CPU 1 CPU 2 diff --git a/Documentation/translations/zh_CN/filesystems/sysfs.txt b/Documentation/translations/zh_CN/filesystems/sysfs.txt index ee1f37da5b23..a15c3ebdfa82 100644 --- a/Documentation/translations/zh_CN/filesystems/sysfs.txt +++ b/Documentation/translations/zh_CN/filesystems/sysfs.txt @@ -281,7 +281,7 @@ drivers/ 包含了每个已为特定总线上的设备而挂载的驱动程序 假定驱动没有跨越多个总线类型)。 fs/ 包含了一个为文件系统设立的目录。现在每个想要导出属性的文件系统必须 -在 fs/ 下创建自己的层次结构(参见Documentation/filesystems/fuse.txt)。 +在 fs/ 下创建自己的层次结构(参见Documentation/filesystems/fuse.rst)。 dev/ 包含两个子目录: char/ 和 block/。在这两个子目录中,有以 : 格式命名的符号链接。这些符号链接指向 sysfs 目录 diff --git a/Documentation/translations/zh_CN/process/submit-checklist.rst b/Documentation/translations/zh_CN/process/submit-checklist.rst index 8738c55e42a2..50386e0e42e7 100644 --- a/Documentation/translations/zh_CN/process/submit-checklist.rst +++ b/Documentation/translations/zh_CN/process/submit-checklist.rst @@ -97,7 +97,7 @@ Linux内核补丁提交清单 24) 所有内存屏障例如 ``barrier()``, ``rmb()``, ``wmb()`` 都需要源代码中的注 释来解释它们正在执行的操作及其原因的逻辑。 -25) 如果补丁添加了任何ioctl,那么也要更新 ``Documentation/ioctl/ioctl-number.rst`` +25) 如果补丁添加了任何ioctl,那么也要更新 ``Documentation/userspace-api/ioctl/ioctl-number.rst`` 26) 如果修改后的源代码依赖或使用与以下 ``Kconfig`` 符号相关的任何内核API或 功能,则在禁用相关 ``Kconfig`` 符号和/或 ``=m`` (如果该选项可用)的情况 diff --git a/Documentation/virt/kvm/arm/pvtime.rst b/Documentation/virt/kvm/arm/pvtime.rst index 2357dd2d8655..687b60d76ca9 100644 --- a/Documentation/virt/kvm/arm/pvtime.rst +++ b/Documentation/virt/kvm/arm/pvtime.rst @@ -76,5 +76,5 @@ It is advisable that one or more 64k pages are set aside for the purpose of these structures and not used for other purposes, this enables the guest to map the region using 64k pages and avoids conflicting attributes with other memory. -For the user space interface see Documentation/virt/kvm/devices/vcpu.txt +For the user space interface see Documentation/virt/kvm/devices/vcpu.rst section "3. GROUP: KVM_ARM_VCPU_PVTIME_CTRL". diff --git a/Documentation/virt/kvm/devices/vcpu.rst b/Documentation/virt/kvm/devices/vcpu.rst index 9963e680770a..ca374d3fe085 100644 --- a/Documentation/virt/kvm/devices/vcpu.rst +++ b/Documentation/virt/kvm/devices/vcpu.rst @@ -110,5 +110,5 @@ Returns: Specifies the base address of the stolen time structure for this VCPU. The base address must be 64 byte aligned and exist within a valid guest memory -region. See Documentation/virt/kvm/arm/pvtime.txt for more information +region. See Documentation/virt/kvm/arm/pvtime.rst for more information including the layout of the stolen time structure. diff --git a/Documentation/virt/kvm/hypercalls.rst b/Documentation/virt/kvm/hypercalls.rst index dbaf207e560d..ed4fddd364ea 100644 --- a/Documentation/virt/kvm/hypercalls.rst +++ b/Documentation/virt/kvm/hypercalls.rst @@ -22,7 +22,7 @@ S390: number in R1. 
For further information on the S390 diagnose call as supported by KVM, - refer to Documentation/virt/kvm/s390-diag.txt. + refer to Documentation/virt/kvm/s390-diag.rst. PowerPC: It uses R3-R10 and hypercall number in R11. R4-R11 are used as output registers. @@ -30,7 +30,7 @@ PowerPC: KVM hypercalls uses 4 byte opcode, that are patched with 'hypercall-instructions' property inside the device tree's /hypervisor node. - For more information refer to Documentation/virt/kvm/ppc-pv.txt + For more information refer to Documentation/virt/kvm/ppc-pv.rst MIPS: KVM hypercalls use the HYPCALL instruction with code 0 and the hypercall diff --git a/arch/powerpc/include/uapi/asm/kvm_para.h b/arch/powerpc/include/uapi/asm/kvm_para.h index be48c2215fa2..a809b1b44ddf 100644 --- a/arch/powerpc/include/uapi/asm/kvm_para.h +++ b/arch/powerpc/include/uapi/asm/kvm_para.h @@ -31,7 +31,7 @@ * Struct fields are always 32 or 64 bit aligned, depending on them being 32 * or 64 bit wide respectively. * - * See Documentation/virt/kvm/ppc-pv.txt + * See Documentation/virt/kvm/ppc-pv.rst */ struct kvm_vcpu_arch_shared { __u64 scratch1; diff --git a/drivers/gpu/drm/Kconfig b/drivers/gpu/drm/Kconfig index 43594978958e..fb92be7e8aa7 100644 --- a/drivers/gpu/drm/Kconfig +++ b/drivers/gpu/drm/Kconfig @@ -161,7 +161,7 @@ config DRM_LOAD_EDID_FIRMWARE monitor are unable to provide appropriate EDID data. Since this feature is provided as a workaround for broken hardware, the default case is N. Details and instructions how to build your own - EDID data are given in Documentation/driver-api/edid.rst. + EDID data are given in Documentation/admin-guide/edid.rst. config DRM_DP_CEC bool "Enable DisplayPort CEC-Tunneling-over-AUX HDMI support" diff --git a/drivers/gpu/drm/drm_ioctl.c b/drivers/gpu/drm/drm_ioctl.c index 9e41972c4bbc..c2b8d2a953ae 100644 --- a/drivers/gpu/drm/drm_ioctl.c +++ b/drivers/gpu/drm/drm_ioctl.c @@ -741,7 +741,7 @@ static const struct drm_ioctl_desc drm_ioctls[] = { * }; * * Please make sure that you follow all the best practices from - * ``Documentation/ioctl/botching-up-ioctls.rst``. Note that drm_ioctl() + * ``Documentation/process/botching-up-ioctls.rst``. Note that drm_ioctl() * automatically zero-extends structures, hence make sure you can add more stuff * at the end, i.e. don't put a variable sized array there. * diff --git a/drivers/hwtracing/coresight/Kconfig b/drivers/hwtracing/coresight/Kconfig index 83e841be1081..02dbb5ca3bcf 100644 --- a/drivers/hwtracing/coresight/Kconfig +++ b/drivers/hwtracing/coresight/Kconfig @@ -107,7 +107,7 @@ config CORESIGHT_CPU_DEBUG can quickly get to know program counter (PC), secure state, exception level, etc. Before use debugging functionality, platform needs to ensure the clock domain and power domain are enabled - properly, please refer Documentation/trace/coresight-cpu-debug.rst + properly, please refer Documentation/trace/coresight/coresight-cpu-debug.rst for detailed description and the example for usage. config CORESIGHT_CTI diff --git a/fs/fat/Kconfig b/fs/fat/Kconfig index 718163d0c621..ca31993dcb47 100644 --- a/fs/fat/Kconfig +++ b/fs/fat/Kconfig @@ -69,7 +69,7 @@ config VFAT_FS The VFAT support enlarges your kernel by about 10 KB and it only works if you said Y to the "DOS FAT fs support" above. Please read - the file for details. If + the file for details. If unsure, say Y. 
To compile this as a module, choose M here: the module will be called @@ -82,7 +82,7 @@ config FAT_DEFAULT_CODEPAGE help This option should be set to the codepage of your FAT filesystems. It can be overridden with the "codepage" mount option. - See for more information. + See for more information. config FAT_DEFAULT_IOCHARSET string "Default iocharset for FAT" @@ -96,7 +96,7 @@ config FAT_DEFAULT_IOCHARSET Note that "utf8" is not recommended for FAT filesystems. If unsure, you shouldn't set "utf8" here - select the next option instead if you would like to use UTF-8 encoded file names by default. - See for more information. + See for more information. Enable any character sets you need in File Systems/Native Language Support. @@ -114,4 +114,4 @@ config FAT_DEFAULT_UTF8 Say Y if you use UTF-8 encoding for file names, N otherwise. - See for more information. + See for more information. diff --git a/fs/fuse/Kconfig b/fs/fuse/Kconfig index eb2a585572dc..774b2618018a 100644 --- a/fs/fuse/Kconfig +++ b/fs/fuse/Kconfig @@ -12,7 +12,7 @@ config FUSE_FS although chances are your distribution already has that library installed if you've installed the "fuse" package itself. - See for more information. + See for more information. See for needed library/utility version. If you want to develop a userspace FS, or if you want to use diff --git a/fs/fuse/dev.c b/fs/fuse/dev.c index 97eec7522bf2..c7a65cf2bcca 100644 --- a/fs/fuse/dev.c +++ b/fs/fuse/dev.c @@ -2081,7 +2081,7 @@ static void end_polls(struct fuse_conn *fc) * The same effect is usually achievable through killing the filesystem daemon * and all users of the filesystem. The exception is the combination of an * asynchronous request and the tricky deadlock (see - * Documentation/filesystems/fuse.txt). + * Documentation/filesystems/fuse.rst). * * Aborting requests under I/O goes as follows: 1: Separate out unlocked * requests, they should be finished off immediately. Locked requests will be diff --git a/fs/overlayfs/Kconfig b/fs/overlayfs/Kconfig index 714c14c47ca5..dd188c7996b3 100644 --- a/fs/overlayfs/Kconfig +++ b/fs/overlayfs/Kconfig @@ -9,7 +9,7 @@ config OVERLAY_FS 'lower' filesystem is either hidden or, in the case of directories, merged with the 'upper' object. - For more information see Documentation/filesystems/overlayfs.txt + For more information see Documentation/filesystems/overlayfs.rst config OVERLAY_FS_REDIRECT_DIR bool "Overlayfs: turn on redirect directory feature by default" @@ -38,7 +38,7 @@ config OVERLAY_FS_REDIRECT_ALWAYS_FOLLOW If backward compatibility is not an issue, then it is safe and recommended to say N here. - For more information, see Documentation/filesystems/overlayfs.txt + For more information, see Documentation/filesystems/overlayfs.rst If unsure, say Y. @@ -103,7 +103,7 @@ config OVERLAY_FS_XINO_AUTO If compatibility with applications that expect 32bit inodes is not an issue, then it is safe and recommended to say Y here. - For more information, see Documentation/filesystems/overlayfs.txt + For more information, see Documentation/filesystems/overlayfs.rst If unsure, say N. diff --git a/include/linux/mm.h b/include/linux/mm.h index 5a323422d783..1f2850465f59 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -1219,7 +1219,7 @@ void unpin_user_pages(struct page **pages, unsigned long npages); * used to track the pincount (instead using of the GUP_PIN_COUNTING_BIAS * scheme). * - * For more information, please see Documentation/vm/pin_user_pages.rst. 
+ * For more information, please see Documentation/core-api/pin_user_pages.rst. * * @page: pointer to page to be queried. * @Return: True, if it is likely that the page has been "dma-pinned". @@ -2834,7 +2834,7 @@ struct page *follow_page(struct vm_area_struct *vma, unsigned long address, * releasing pages: get_user_pages*() pages must be released via put_page(), * while pin_user_pages*() pages must be released via unpin_user_page(). * - * Please see Documentation/vm/pin_user_pages.rst for more information. + * Please see Documentation/core-api/pin_user_pages.rst for more information. */ static inline int vm_fault_to_errno(vm_fault_t vm_fault, int foll_flags) diff --git a/include/uapi/linux/ethtool_netlink.h b/include/uapi/linux/ethtool_netlink.h index 7fde76366ba4..1711e57f7848 100644 --- a/include/uapi/linux/ethtool_netlink.h +++ b/include/uapi/linux/ethtool_netlink.h @@ -2,7 +2,7 @@ /* * include/uapi/linux/ethtool_netlink.h - netlink interface for ethtool * - * See Documentation/networking/ethtool-netlink.txt in kernel source tree for + * See Documentation/networking/ethtool-netlink.rst in kernel source tree for * doucumentation of the interface. */ diff --git a/include/uapi/rdma/rdma_user_ioctl_cmds.h b/include/uapi/rdma/rdma_user_ioctl_cmds.h index 7b1ec806f8f9..38ab7accb7be 100644 --- a/include/uapi/rdma/rdma_user_ioctl_cmds.h +++ b/include/uapi/rdma/rdma_user_ioctl_cmds.h @@ -36,7 +36,7 @@ #include #include -/* Documentation/ioctl/ioctl-number.rst */ +/* Documentation/userspace-api/ioctl/ioctl-number.rst */ #define RDMA_IOCTL_MAGIC 0x1b #define RDMA_VERBS_IOCTL \ _IOWR(RDMA_IOCTL_MAGIC, 1, struct ib_uverbs_ioctl_hdr) diff --git a/mm/gup.c b/mm/gup.c index 6076df8e04a4..81e4d0b377fd 100644 --- a/mm/gup.c +++ b/mm/gup.c @@ -2843,9 +2843,9 @@ EXPORT_SYMBOL_GPL(get_user_pages_fast); * the arguments here are identical. * * FOLL_PIN means that the pages must be released via unpin_user_page(). Please - * see Documentation/vm/pin_user_pages.rst for further details. + * see Documentation/core-api/pin_user_pages.rst for further details. * - * This is intended for Case 1 (DIO) in Documentation/vm/pin_user_pages.rst. It + * This is intended for Case 1 (DIO) in Documentation/core-api/pin_user_pages.rst. It * is NOT intended for Case 2 (RDMA: long-term pins). */ int pin_user_pages_fast(unsigned long start, int nr_pages, @@ -2883,9 +2883,9 @@ EXPORT_SYMBOL_GPL(pin_user_pages_fast); * the arguments here are identical. * * FOLL_PIN means that the pages must be released via unpin_user_page(). Please - * see Documentation/vm/pin_user_pages.rst for details. + * see Documentation/core-api/pin_user_pages.rst for details. * - * This is intended for Case 1 (DIO) in Documentation/vm/pin_user_pages.rst. It + * This is intended for Case 1 (DIO) in Documentation/core-api/pin_user_pages.rst. It * is NOT intended for Case 2 (RDMA: long-term pins). */ long pin_user_pages_remote(struct task_struct *tsk, struct mm_struct *mm, @@ -2919,9 +2919,9 @@ EXPORT_SYMBOL(pin_user_pages_remote); * FOLL_PIN is set. * * FOLL_PIN means that the pages must be released via unpin_user_page(). Please - * see Documentation/vm/pin_user_pages.rst for details. + * see Documentation/core-api/pin_user_pages.rst for details. * - * This is intended for Case 1 (DIO) in Documentation/vm/pin_user_pages.rst. It + * This is intended for Case 1 (DIO) in Documentation/core-api/pin_user_pages.rst. It * is NOT intended for Case 2 (RDMA: long-term pins). 
*/ long pin_user_pages(unsigned long start, unsigned long nr_pages, diff --git a/virt/kvm/arm/vgic/vgic-mmio-v3.c b/virt/kvm/arm/vgic/vgic-mmio-v3.c index e72dcc454247..859464fd413f 100644 --- a/virt/kvm/arm/vgic/vgic-mmio-v3.c +++ b/virt/kvm/arm/vgic/vgic-mmio-v3.c @@ -301,7 +301,7 @@ static unsigned long vgic_v3_uaccess_read_pending(struct kvm_vcpu *vcpu, * pending state of interrupt is latched in pending_latch variable. * Userspace will save and restore pending state and line_level * separately. - * Refer to Documentation/virt/kvm/devices/arm-vgic-v3.txt + * Refer to Documentation/virt/kvm/devices/arm-vgic-v3.rst * for handling of ISPENDR and ICPENDR. */ for (i = 0; i < len * 8; i++) { diff --git a/virt/kvm/arm/vgic/vgic.h b/virt/kvm/arm/vgic/vgic.h index 769e4802645e..64fcd7511110 100644 --- a/virt/kvm/arm/vgic/vgic.h +++ b/virt/kvm/arm/vgic/vgic.h @@ -42,7 +42,7 @@ VGIC_AFFINITY_LEVEL(val, 3)) /* - * As per Documentation/virt/kvm/devices/arm-vgic-v3.txt, + * As per Documentation/virt/kvm/devices/arm-vgic-v3.rst, * below macros are defined for CPUREG encoding. */ #define KVM_REG_ARM_VGIC_SYSREG_OP0_MASK 0x000000000000c000 @@ -63,7 +63,7 @@ KVM_REG_ARM_VGIC_SYSREG_OP2_MASK) /* - * As per Documentation/virt/kvm/devices/arm-vgic-its.txt, + * As per Documentation/virt/kvm/devices/arm-vgic-its.rst, * below macros are defined for ITS table entry encoding. */ #define KVM_ITS_CTE_VALID_SHIFT 63 -- cgit v1.2.3 From 0c1bc6b84525b96aa9fb8f6fbe8c5cb26a5c0ead Mon Sep 17 00:00:00 2001 From: Mauro Carvalho Chehab Date: Tue, 14 Apr 2020 18:48:37 +0200 Subject: docs: filesystems: fix renamed references Some filesystem references got broken by a previous patch series I submitted. Address those. Signed-off-by: Mauro Carvalho Chehab Acked-by: David Sterba # fs/affs/Kconfig Link: https://lore.kernel.org/r/57318c53008dbda7f6f4a5a9e5787f4d37e8565a.1586881715.git.mchehab+huawei@kernel.org Signed-off-by: Jonathan Corbet --- Documentation/ABI/stable/sysfs-devices-node | 2 +- Documentation/ABI/testing/procfs-smaps_rollup | 2 +- Documentation/admin-guide/cpu-load.rst | 2 +- Documentation/admin-guide/nfs/nfsroot.rst | 2 +- Documentation/driver-api/driver-model/device.rst | 4 ++-- Documentation/driver-api/driver-model/overview.rst | 2 +- Documentation/filesystems/dax.txt | 2 +- Documentation/filesystems/dnotify.txt | 2 +- Documentation/filesystems/ramfs-rootfs-initramfs.rst | 2 +- Documentation/filesystems/sysfs.rst | 2 +- Documentation/powerpc/firmware-assisted-dump.rst | 2 +- Documentation/process/adding-syscalls.rst | 2 +- Documentation/translations/it_IT/process/adding-syscalls.rst | 2 +- Documentation/translations/zh_CN/filesystems/sysfs.txt | 6 +++--- drivers/base/core.c | 2 +- drivers/gpu/drm/msm/disp/dpu1/dpu_kms.h | 2 +- fs/Kconfig | 2 +- fs/Kconfig.binfmt | 2 +- fs/adfs/Kconfig | 2 +- fs/affs/Kconfig | 2 +- fs/afs/Kconfig | 6 +++--- fs/bfs/Kconfig | 2 +- fs/cramfs/Kconfig | 2 +- fs/ecryptfs/Kconfig | 2 +- fs/hfs/Kconfig | 2 +- fs/hpfs/Kconfig | 2 +- fs/isofs/Kconfig | 2 +- fs/namespace.c | 2 +- fs/notify/inotify/Kconfig | 2 +- fs/ntfs/Kconfig | 2 +- fs/ocfs2/Kconfig | 2 +- fs/proc/Kconfig | 4 ++-- fs/romfs/Kconfig | 2 +- fs/sysfs/dir.c | 2 +- fs/sysfs/file.c | 2 +- fs/sysfs/mount.c | 2 +- fs/sysfs/symlink.c | 2 +- fs/sysv/Kconfig | 2 +- fs/udf/Kconfig | 2 +- include/linux/kobject.h | 2 +- include/linux/kobject_ns.h | 2 +- include/linux/relay.h | 2 +- include/linux/sysfs.h | 2 +- kernel/relay.c | 2 +- lib/kobject.c | 4 ++-- 45 files changed, 52 insertions(+), 52 deletions(-) (limited to 'include/linux') 
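A note on the pin_user_pages*() comments updated in the previous patch: pages obtained with FOLL_PIN must be released through unpin_user_page()/unpin_user_pages(), never put_page(). A minimal sketch of the Case 1 (DIO) calling pattern those comments describe (user_addr and DIO_PAGES are hypothetical, error handling trimmed):

	struct page *pages[DIO_PAGES];	/* DIO_PAGES: caller-chosen batch size */
	int nr;

	nr = pin_user_pages_fast(user_addr, DIO_PAGES, FOLL_WRITE, pages);
	if (nr < 0)
		return nr;

	/* ... drive the direct I/O against the pinned pages ... */

	/* FOLL_PIN pages: release via unpin_user_pages(), not put_page() */
	unpin_user_pages(pages, nr);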
diff --git a/Documentation/ABI/stable/sysfs-devices-node b/Documentation/ABI/stable/sysfs-devices-node index df8413cf1468..484fc04bcc25 100644 --- a/Documentation/ABI/stable/sysfs-devices-node +++ b/Documentation/ABI/stable/sysfs-devices-node @@ -54,7 +54,7 @@ Date: October 2002 Contact: Linux Memory Management list Description: Provides information about the node's distribution and memory - utilization. Similar to /proc/meminfo, see Documentation/filesystems/proc.txt + utilization. Similar to /proc/meminfo, see Documentation/filesystems/proc.rst What: /sys/devices/system/node/nodeX/numastat Date: October 2002 diff --git a/Documentation/ABI/testing/procfs-smaps_rollup b/Documentation/ABI/testing/procfs-smaps_rollup index 274df44d8b1b..046978193368 100644 --- a/Documentation/ABI/testing/procfs-smaps_rollup +++ b/Documentation/ABI/testing/procfs-smaps_rollup @@ -11,7 +11,7 @@ Description: Additionally, the fields Pss_Anon, Pss_File and Pss_Shmem are not present in /proc/pid/smaps. These fields represent the sum of the Pss field of each type (anon, file, shmem). - For more details, see Documentation/filesystems/proc.txt + For more details, see Documentation/filesystems/proc.rst and the procfs man page. Typical output looks like this: diff --git a/Documentation/admin-guide/cpu-load.rst b/Documentation/admin-guide/cpu-load.rst index 2d01ce43d2a2..ebdecf864080 100644 --- a/Documentation/admin-guide/cpu-load.rst +++ b/Documentation/admin-guide/cpu-load.rst @@ -105,7 +105,7 @@ References ---------- - http://lkml.org/lkml/2007/2/12/6 -- Documentation/filesystems/proc.txt (1.8) +- Documentation/filesystems/proc.rst (1.8) Thanks diff --git a/Documentation/admin-guide/nfs/nfsroot.rst b/Documentation/admin-guide/nfs/nfsroot.rst index 82a4fda057f9..c6772075c80c 100644 --- a/Documentation/admin-guide/nfs/nfsroot.rst +++ b/Documentation/admin-guide/nfs/nfsroot.rst @@ -18,7 +18,7 @@ Mounting the root filesystem via NFS (nfsroot) In order to use a diskless system, such as an X-terminal or printer server for example, it is necessary for the root filesystem to be present on a non-disk device. This may be an initramfs (see -Documentation/filesystems/ramfs-rootfs-initramfs.txt), a ramdisk (see +Documentation/filesystems/ramfs-rootfs-initramfs.rst), a ramdisk (see Documentation/admin-guide/initrd.rst) or a filesystem mounted via NFS. The following text describes on how to use NFS for the root filesystem. For the rest of this text 'client' means the diskless system, and 'server' means the NFS diff --git a/Documentation/driver-api/driver-model/device.rst b/Documentation/driver-api/driver-model/device.rst index 2b868d49d349..b9b022371e85 100644 --- a/Documentation/driver-api/driver-model/device.rst +++ b/Documentation/driver-api/driver-model/device.rst @@ -50,10 +50,10 @@ Attributes Attributes of devices can be exported by a device driver through sysfs. -Please see Documentation/filesystems/sysfs.txt for more information +Please see Documentation/filesystems/sysfs.rst for more information on how sysfs works. -As explained in Documentation/kobject.txt, device attributes must be +As explained in Documentation/core-api/kobject.rst, device attributes must be created before the KOBJ_ADD uevent is generated. The only way to realize that is by defining an attribute group. 
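The device.rst hunk above ends on a point worth illustrating: because attributes must exist before the KOBJ_ADD uevent fires, drivers define them through an attribute group. A sketch of that pattern (hypothetical "foo" names, not taken from this patch):

	static ssize_t foo_show(struct device *dev,
				struct device_attribute *attr, char *buf)
	{
		return sprintf(buf, "%d\n", 42);	/* placeholder value */
	}
	static DEVICE_ATTR_RO(foo);

	static struct attribute *foo_attrs[] = {
		&dev_attr_foo.attr,
		NULL
	};
	ATTRIBUTE_GROUPS(foo);

	/* point dev->groups at foo_groups before device_add()/device_register() */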
diff --git a/Documentation/driver-api/driver-model/overview.rst b/Documentation/driver-api/driver-model/overview.rst index d4d1e9b40e0c..e98d0ab4a9b6 100644 --- a/Documentation/driver-api/driver-model/overview.rst +++ b/Documentation/driver-api/driver-model/overview.rst @@ -121,4 +121,4 @@ device-specific data or tunable interfaces. More information about the sysfs directory layout can be found in the other documents in this directory and in the file -Documentation/filesystems/sysfs.txt. +Documentation/filesystems/sysfs.rst. diff --git a/Documentation/filesystems/dax.txt b/Documentation/filesystems/dax.txt index 679729442fd2..735f3859b19f 100644 --- a/Documentation/filesystems/dax.txt +++ b/Documentation/filesystems/dax.txt @@ -74,7 +74,7 @@ are zeroed out and converted to written extents before being returned to avoid exposure of uninitialized data through mmap. These filesystems may be used for inspiration: -- ext2: see Documentation/filesystems/ext2.txt +- ext2: see Documentation/filesystems/ext2.rst - ext4: see Documentation/filesystems/ext4/ - xfs: see Documentation/admin-guide/xfs.rst diff --git a/Documentation/filesystems/dnotify.txt b/Documentation/filesystems/dnotify.txt index 15156883d321..08d575ece45d 100644 --- a/Documentation/filesystems/dnotify.txt +++ b/Documentation/filesystems/dnotify.txt @@ -67,4 +67,4 @@ See tools/testing/selftests/filesystems/dnotify_test.c for an example. NOTE ---- Beginning with Linux 2.6.13, dnotify has been replaced by inotify. -See Documentation/filesystems/inotify.txt for more information on it. +See Documentation/filesystems/inotify.rst for more information on it. diff --git a/Documentation/filesystems/ramfs-rootfs-initramfs.rst b/Documentation/filesystems/ramfs-rootfs-initramfs.rst index 6c576e241d86..3fddacc6bf14 100644 --- a/Documentation/filesystems/ramfs-rootfs-initramfs.rst +++ b/Documentation/filesystems/ramfs-rootfs-initramfs.rst @@ -71,7 +71,7 @@ be allowed write access to a ramfs mount. A ramfs derivative called tmpfs was created to add size limits, and the ability to write the data to swap space. Normal users can be allowed write access to -tmpfs mounts. See Documentation/filesystems/tmpfs.txt for more information. +tmpfs mounts. See Documentation/filesystems/tmpfs.rst for more information. What is rootfs? --------------- diff --git a/Documentation/filesystems/sysfs.rst b/Documentation/filesystems/sysfs.rst index 290891c3fecb..ab0f7795792b 100644 --- a/Documentation/filesystems/sysfs.rst +++ b/Documentation/filesystems/sysfs.rst @@ -20,7 +20,7 @@ a means to export kernel data structures, their attributes, and the linkages between them to userspace. sysfs is tied inherently to the kobject infrastructure. Please read -Documentation/kobject.txt for more information concerning the kobject +Documentation/core-api/kobject.rst for more information concerning the kobject interface. diff --git a/Documentation/powerpc/firmware-assisted-dump.rst b/Documentation/powerpc/firmware-assisted-dump.rst index b3f3ee135dbe..20ea8cdee0aa 100644 --- a/Documentation/powerpc/firmware-assisted-dump.rst +++ b/Documentation/powerpc/firmware-assisted-dump.rst @@ -344,7 +344,7 @@ Here is the list of files under powerpc debugfs: NOTE: - Please refer to Documentation/filesystems/debugfs.txt on + Please refer to Documentation/filesystems/debugfs.rst on how to mount the debugfs filesystem. 
diff --git a/Documentation/process/adding-syscalls.rst b/Documentation/process/adding-syscalls.rst index 1c3a840d06b9..a6b4a3a5bf3f 100644 --- a/Documentation/process/adding-syscalls.rst +++ b/Documentation/process/adding-syscalls.rst @@ -33,7 +33,7 @@ interface. to a somewhat opaque API. - If you're just exposing runtime system information, a new node in sysfs - (see ``Documentation/filesystems/sysfs.txt``) or the ``/proc`` filesystem may + (see ``Documentation/filesystems/sysfs.rst``) or the ``/proc`` filesystem may be more appropriate. However, access to these mechanisms requires that the relevant filesystem is mounted, which might not always be the case (e.g. in a namespaced/sandboxed/chrooted environment). Avoid adding any API to diff --git a/Documentation/translations/it_IT/process/adding-syscalls.rst b/Documentation/translations/it_IT/process/adding-syscalls.rst index c3a3439595a6..bff0a82bf127 100644 --- a/Documentation/translations/it_IT/process/adding-syscalls.rst +++ b/Documentation/translations/it_IT/process/adding-syscalls.rst @@ -39,7 +39,7 @@ vostra interfaccia. un qualche modo opaca. - Se dovete esporre solo delle informazioni sul sistema, un nuovo nodo in - sysfs (vedere ``Documentation/filesystems/sysfs.txt``) o + sysfs (vedere ``Documentation/filesystems/sysfs.rst``) o in procfs potrebbe essere sufficiente. Tuttavia, l'accesso a questi meccanismi richiede che il filesystem sia montato, il che potrebbe non essere sempre vero (per esempio, in ambienti come namespace/sandbox/chroot). diff --git a/Documentation/translations/zh_CN/filesystems/sysfs.txt b/Documentation/translations/zh_CN/filesystems/sysfs.txt index a15c3ebdfa82..fcf620049d11 100644 --- a/Documentation/translations/zh_CN/filesystems/sysfs.txt +++ b/Documentation/translations/zh_CN/filesystems/sysfs.txt @@ -1,4 +1,4 @@ -Chinese translated version of Documentation/filesystems/sysfs.txt +Chinese translated version of Documentation/filesystems/sysfs.rst If you have any comment or update to the content, please contact the original document maintainer directly. However, if you have a problem @@ -10,7 +10,7 @@ Maintainer: Patrick Mochel Mike Murphy Chinese maintainer: Fu Wei --------------------------------------------------------------------- -Documentation/filesystems/sysfs.txt 的中文翻译 +Documentation/filesystems/sysfs.rst 的中文翻译 如果想评论或更新本文的内容,请直接联系原文档的维护者。如果你使用英文 交流有困难的话,也可以向中文版维护者求助。如果本翻译更新不及时或者翻 @@ -40,7 +40,7 @@ sysfs 是一个最初基于 ramfs 且位于内存的文件系统。它提供导 数据结构及其属性,以及它们之间的关联到用户空间的方法。 sysfs 始终与 kobject 的底层结构紧密相关。请阅读 -Documentation/kobject.txt 文档以获得更多关于 kobject 接口的 +Documentation/core-api/kobject.rst 文档以获得更多关于 kobject 接口的 信息。 diff --git a/drivers/base/core.c b/drivers/base/core.c index 139cdf7e7327..89fcc0a09f94 100644 --- a/drivers/base/core.c +++ b/drivers/base/core.c @@ -1374,7 +1374,7 @@ static void device_release(struct kobject *kobj) else if (dev->class && dev->class->dev_release) dev->class->dev_release(dev); else - WARN(1, KERN_ERR "Device '%s' does not have a release() function, it is broken and must be fixed. See Documentation/kobject.txt.\n", + WARN(1, KERN_ERR "Device '%s' does not have a release() function, it is broken and must be fixed. 
See Documentation/core-api/kobject.rst.\n", dev_name(dev)); kfree(p); } diff --git a/drivers/gpu/drm/msm/disp/dpu1/dpu_kms.h b/drivers/gpu/drm/msm/disp/dpu1/dpu_kms.h index 211f5de99a44..9aba2910d83a 100644 --- a/drivers/gpu/drm/msm/disp/dpu1/dpu_kms.h +++ b/drivers/gpu/drm/msm/disp/dpu1/dpu_kms.h @@ -170,7 +170,7 @@ struct dpu_global_state * * Main debugfs documentation is located at, * - * Documentation/filesystems/debugfs.txt + * Documentation/filesystems/debugfs.rst * * @dpu_debugfs_setup_regset32: Initialize data for dpu_debugfs_create_regset32 * @dpu_debugfs_create_regset32: Create 32-bit register dump file diff --git a/fs/Kconfig b/fs/Kconfig index f08fbbfafd9a..d1ad3935fb85 100644 --- a/fs/Kconfig +++ b/fs/Kconfig @@ -166,7 +166,7 @@ config TMPFS space. If you unmount a tmpfs instance, everything stored therein is lost. - See for details. + See for details. config TMPFS_POSIX_ACL bool "Tmpfs POSIX Access Control Lists" diff --git a/fs/Kconfig.binfmt b/fs/Kconfig.binfmt index 62dc4f577ba1..3fbbd54f50fd 100644 --- a/fs/Kconfig.binfmt +++ b/fs/Kconfig.binfmt @@ -72,7 +72,7 @@ config CORE_DUMP_DEFAULT_ELF_HEADERS The core dump behavior can be controlled per process using the /proc/PID/coredump_filter pseudo-file; this setting is - inherited. See Documentation/filesystems/proc.txt for details. + inherited. See Documentation/filesystems/proc.rst for details. This config option changes the default setting of coredump_filter seen at boot time. If unsure, say Y. diff --git a/fs/adfs/Kconfig b/fs/adfs/Kconfig index df4650dccf68..44738fed6625 100644 --- a/fs/adfs/Kconfig +++ b/fs/adfs/Kconfig @@ -12,7 +12,7 @@ config ADFS_FS The ADFS partition should be the first partition (i.e., /dev/[hs]d?1) on each of your drives. Please read the file - for further details. + for further details. To compile this code as a module, choose M here: the module will be called adfs. diff --git a/fs/affs/Kconfig b/fs/affs/Kconfig index 84c46b9025c5..eb9d0ab850cb 100644 --- a/fs/affs/Kconfig +++ b/fs/affs/Kconfig @@ -9,7 +9,7 @@ config AFFS_FS FFS partition on your hard drive. Amiga floppies however cannot be read with this driver due to an incompatibility of the floppy controller used in an Amiga and the standard floppy controller in - PCs and workstations. Read + PCs and workstations. Read and . With this driver you can also mount disk files used by Bernd diff --git a/fs/afs/Kconfig b/fs/afs/Kconfig index 3fb1f559e317..1ad211d72b3b 100644 --- a/fs/afs/Kconfig +++ b/fs/afs/Kconfig @@ -8,7 +8,7 @@ config AFS_FS If you say Y here, you will get an experimental Andrew File System driver. It currently only supports unsecured read-only AFS access. - See for more information. + See for more information. If unsure, say N. @@ -18,7 +18,7 @@ config AFS_DEBUG help Say Y here to make runtime controllable debugging messages appear. - See for more information. + See for more information. If unsure, say N. @@ -37,6 +37,6 @@ config AFS_DEBUG_CURSOR the dmesg log if the server rotation algorithm fails to successfully contact a server. - See for more information. + See for more information. If unsure, say N. diff --git a/fs/bfs/Kconfig b/fs/bfs/Kconfig index 3e1247f07913..3a757805b585 100644 --- a/fs/bfs/Kconfig +++ b/fs/bfs/Kconfig @@ -11,7 +11,7 @@ config BFS_FS on your /stand slice from within Linux. You then also need to say Y to "UnixWare slices support", below. More information about the BFS file system is contained in the file - . + . If you don't know what this is about, say N. 
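Returning to the release() warning in the drivers/base/core.c hunk above: the fix on the driver side is to install a release callback before the last reference can drop. A sketch for a device embedded in a driver-private structure (hypothetical struct foo, illustrative only):

	static void foo_release(struct device *dev)
	{
		struct foo *foo = container_of(dev, struct foo, dev);

		kfree(foo);	/* last reference gone: free the enclosing object */
	}

	/* at setup time, before device_add(): */
	foo->dev.release = foo_release;
	device_initialize(&foo->dev);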
diff --git a/fs/cramfs/Kconfig b/fs/cramfs/Kconfig index c8bebb70a971..d98cef0dbb6b 100644 --- a/fs/cramfs/Kconfig +++ b/fs/cramfs/Kconfig @@ -9,7 +9,7 @@ config CRAMFS limited to 256MB file systems (with 16MB files), and doesn't support 16/32 bits uid/gid, hard links and timestamps. - See and + See and for further information. To compile this as a module, choose M here: the module will be called diff --git a/fs/ecryptfs/Kconfig b/fs/ecryptfs/Kconfig index 522c35d5292b..1bdeaa6d5790 100644 --- a/fs/ecryptfs/Kconfig +++ b/fs/ecryptfs/Kconfig @@ -7,7 +7,7 @@ config ECRYPT_FS select CRYPTO_MD5 help Encrypted filesystem that operates on the VFS layer. See - to learn more about + to learn more about eCryptfs. Userspace components are required and can be obtained from . diff --git a/fs/hfs/Kconfig b/fs/hfs/Kconfig index 44f6e89bcb75..129926b5142d 100644 --- a/fs/hfs/Kconfig +++ b/fs/hfs/Kconfig @@ -6,7 +6,7 @@ config HFS_FS help If you say Y here, you will be able to mount Macintosh-formatted floppy disks and hard drive partitions with full read-write access. - Please read to learn about + Please read to learn about the available mount options. To compile this file system support as a module, choose M here: the diff --git a/fs/hpfs/Kconfig b/fs/hpfs/Kconfig index 56aa0336254a..2b36dc6f0a10 100644 --- a/fs/hpfs/Kconfig +++ b/fs/hpfs/Kconfig @@ -9,7 +9,7 @@ config HPFS_FS write files to an OS/2 HPFS partition on your hard drive. OS/2 floppies however are in regular MSDOS format, so you don't need this option in order to be able to read them. Read - . + . To compile this file system support as a module, choose M here: the module will be called hpfs. If unsure, say N. diff --git a/fs/isofs/Kconfig b/fs/isofs/Kconfig index 5e7419599f50..08ffd37b9bb8 100644 --- a/fs/isofs/Kconfig +++ b/fs/isofs/Kconfig @@ -8,7 +8,7 @@ config ISO9660_FS long Unix filenames and symbolic links are also supported by this driver. If you have a CD-ROM drive and want to do more with it than just listen to audio CDs and watch its LEDs, say Y (and read - and the CD-ROM-HOWTO, + and the CD-ROM-HOWTO, available from ), thereby enlarging your kernel by about 27 KB; otherwise say N. diff --git a/fs/namespace.c b/fs/namespace.c index a28e4db075ed..5f036dc711b6 100644 --- a/fs/namespace.c +++ b/fs/namespace.c @@ -3595,7 +3595,7 @@ EXPORT_SYMBOL(path_is_under); * file system may be mounted on put_old. After all, new_root is a mountpoint. * * Also, the current root cannot be on the 'rootfs' (initial ramfs) filesystem. - * See Documentation/filesystems/ramfs-rootfs-initramfs.txt for alternatives + * See Documentation/filesystems/ramfs-rootfs-initramfs.rst for alternatives * in this situation. * * Notes: diff --git a/fs/notify/inotify/Kconfig b/fs/notify/inotify/Kconfig index 6736e47d94d8..7715fadd5fff 100644 --- a/fs/notify/inotify/Kconfig +++ b/fs/notify/inotify/Kconfig @@ -12,6 +12,6 @@ config INOTIFY_USER new features including multiple file events, one-shot support, and unmount notification. - For more information, see + For more information, see If unsure, say Y. diff --git a/fs/ntfs/Kconfig b/fs/ntfs/Kconfig index de9fb5cff226..1667a7e590d8 100644 --- a/fs/ntfs/Kconfig +++ b/fs/ntfs/Kconfig @@ -18,7 +18,7 @@ config NTFS_FS the Linux 2.4 kernel series is separately available as a patch from the project web site. - For more information see + For more information see and . 
To compile this file system support as a module, choose M here: the diff --git a/fs/ocfs2/Kconfig b/fs/ocfs2/Kconfig index 46bba20da6b5..1177c33df895 100644 --- a/fs/ocfs2/Kconfig +++ b/fs/ocfs2/Kconfig @@ -21,7 +21,7 @@ config OCFS2_FS OCFS2 mailing lists: http://oss.oracle.com/projects/ocfs2/mailman/ For more information on OCFS2, see the file - . + . config OCFS2_FS_O2CB tristate "O2CB Kernelspace Clustering" diff --git a/fs/proc/Kconfig b/fs/proc/Kconfig index 27ef84d99f59..971a42f6357d 100644 --- a/fs/proc/Kconfig +++ b/fs/proc/Kconfig @@ -23,7 +23,7 @@ config PROC_FS /proc" or the equivalent line in /etc/fstab does the job. The /proc file system is explained in the file - and on the proc(5) manpage + and on the proc(5) manpage ("man 5 proc"). This option will enlarge your kernel by about 67 KB. Several @@ -95,7 +95,7 @@ config PROC_CHILDREN default n help Provides a fast way to retrieve first level children pids of a task. See - for more information. + for more information. Say Y if you are running any user-space software which takes benefit from this interface. For example, rkt is such a piece of software. diff --git a/fs/romfs/Kconfig b/fs/romfs/Kconfig index ad4c45788896..9737b8e68878 100644 --- a/fs/romfs/Kconfig +++ b/fs/romfs/Kconfig @@ -6,7 +6,7 @@ config ROMFS_FS This is a very small read-only file system mainly intended for initial ram disks of installation disks, but it could be used for other read-only media as well. Read - for details. + for details. To compile this file system support as a module, choose M here: the module will be called romfs. Note that the file system of your diff --git a/fs/sysfs/dir.c b/fs/sysfs/dir.c index aa85f2874a9f..59dffd5ca517 100644 --- a/fs/sysfs/dir.c +++ b/fs/sysfs/dir.c @@ -6,7 +6,7 @@ * Copyright (c) 2007 SUSE Linux Products GmbH * Copyright (c) 2007 Tejun Heo * - * Please see Documentation/filesystems/sysfs.txt for more information. + * Please see Documentation/filesystems/sysfs.rst for more information. */ #define pr_fmt(fmt) "sysfs: " fmt diff --git a/fs/sysfs/file.c b/fs/sysfs/file.c index 26bbf960e2a2..f275fcda62fb 100644 --- a/fs/sysfs/file.c +++ b/fs/sysfs/file.c @@ -6,7 +6,7 @@ * Copyright (c) 2007 SUSE Linux Products GmbH * Copyright (c) 2007 Tejun Heo * - * Please see Documentation/filesystems/sysfs.txt for more information. + * Please see Documentation/filesystems/sysfs.rst for more information. */ #include diff --git a/fs/sysfs/mount.c b/fs/sysfs/mount.c index db81cfbab9d6..e747c135c1d1 100644 --- a/fs/sysfs/mount.c +++ b/fs/sysfs/mount.c @@ -6,7 +6,7 @@ * Copyright (c) 2007 SUSE Linux Products GmbH * Copyright (c) 2007 Tejun Heo * - * Please see Documentation/filesystems/sysfs.txt for more information. + * Please see Documentation/filesystems/sysfs.rst for more information. */ #include diff --git a/fs/sysfs/symlink.c b/fs/sysfs/symlink.c index c4deecc80f67..5603530a1a52 100644 --- a/fs/sysfs/symlink.c +++ b/fs/sysfs/symlink.c @@ -6,7 +6,7 @@ * Copyright (c) 2007 SUSE Linux Products GmbH * Copyright (c) 2007 Tejun Heo * - * Please see Documentation/filesystems/sysfs.txt for more information. + * Please see Documentation/filesystems/sysfs.rst for more information. */ #include diff --git a/fs/sysv/Kconfig b/fs/sysv/Kconfig index d4edf7d9ae10..b4e23e03fbeb 100644 --- a/fs/sysv/Kconfig +++ b/fs/sysv/Kconfig @@ -28,7 +28,7 @@ config SYSV_FS tar" or preferably "info tar"). Note also that this option has nothing whatsoever to do with the option "System V IPC". Read about the System V file system in - . + . 
Saying Y here will enlarge your kernel by about 27 KB. To compile this as a module, choose M here: the module will be called diff --git a/fs/udf/Kconfig b/fs/udf/Kconfig index 6848de581ce1..26e1a49f3ba7 100644 --- a/fs/udf/Kconfig +++ b/fs/udf/Kconfig @@ -9,7 +9,7 @@ config UDF_FS compatible with standard unix file systems, it is also suitable for removable USB disks. Say Y if you intend to mount DVD discs or CDRW's written in packet mode, or if you want to use UDF for removable USB - disks. Please read . + disks. Please read . To compile this file system support as a module, choose M here: the module will be called udf. diff --git a/include/linux/kobject.h b/include/linux/kobject.h index e2ca0a292e21..fc8d83e91379 100644 --- a/include/linux/kobject.h +++ b/include/linux/kobject.h @@ -7,7 +7,7 @@ * Copyright (c) 2006-2008 Greg Kroah-Hartman * Copyright (c) 2006-2008 Novell Inc. * - * Please read Documentation/kobject.txt before using the kobject + * Please read Documentation/core-api/kobject.rst before using the kobject * interface, ESPECIALLY the parts about reference counts and object * destructors. */ diff --git a/include/linux/kobject_ns.h b/include/linux/kobject_ns.h index 069aa2ebef90..2b5b64256cf4 100644 --- a/include/linux/kobject_ns.h +++ b/include/linux/kobject_ns.h @@ -8,7 +8,7 @@ * * Split from kobject.h by David Howells (dhowells@redhat.com) * - * Please read Documentation/kobject.txt before using the kobject + * Please read Documentation/core-api/kobject.rst before using the kobject * interface, ESPECIALLY the parts about reference counts and object * destructors. */ diff --git a/include/linux/relay.h b/include/linux/relay.h index c759f96e39c1..e13a333e7c37 100644 --- a/include/linux/relay.h +++ b/include/linux/relay.h @@ -141,7 +141,7 @@ struct rchan_callbacks * cause relay_open() to create a single global buffer rather * than the default set of per-cpu buffers. * - * See Documentation/filesystems/relay.txt for more info. + * See Documentation/filesystems/relay.rst for more info. */ struct dentry *(*create_buf_file)(const char *filename, struct dentry *parent, diff --git a/include/linux/sysfs.h b/include/linux/sysfs.h index 80bb865b3a33..86067dbe7745 100644 --- a/include/linux/sysfs.h +++ b/include/linux/sysfs.h @@ -7,7 +7,7 @@ * Copyright (c) 2007 SUSE Linux Products GmbH * Copyright (c) 2007 Tejun Heo * - * Please see Documentation/filesystems/sysfs.txt for more information. + * Please see Documentation/filesystems/sysfs.rst for more information. */ #ifndef _SYSFS_H_ diff --git a/kernel/relay.c b/kernel/relay.c index ade14fb7ce2e..d0c9c287680a 100644 --- a/kernel/relay.c +++ b/kernel/relay.c @@ -1,7 +1,7 @@ /* * Public API and common code for kernel->userspace relay file support. * - * See Documentation/filesystems/relay.txt for an overview. + * See Documentation/filesystems/relay.rst for an overview. * * Copyright (C) 2002-2005 - Tom Zanussi (zanussi@us.ibm.com), IBM Corp * Copyright (C) 1999-2005 - Karim Yaghmour (karim@opersys.com) diff --git a/lib/kobject.c b/lib/kobject.c index 83198cb37d8d..65fa7bf70c57 100644 --- a/lib/kobject.c +++ b/lib/kobject.c @@ -6,7 +6,7 @@ * Copyright (c) 2006-2007 Greg Kroah-Hartman * Copyright (c) 2006-2007 Novell Inc. * - * Please see the file Documentation/kobject.txt for critical information + * Please see the file Documentation/core-api/kobject.rst for critical information * about using the kobject interface. 
*/ @@ -670,7 +670,7 @@ static void kobject_cleanup(struct kobject *kobj) kobject_name(kobj), kobj, __func__, kobj->parent); if (t && !t->release) - pr_debug("kobject: '%s' (%p): does not have a release() function, it is broken and must be fixed. See Documentation/kobject.txt.\n", + pr_debug("kobject: '%s' (%p): does not have a release() function, it is broken and must be fixed. See Documentation/core-api/kobject.rst.\n", kobject_name(kobj), kobj); /* send "remove" if the caller did not do it but sent "add" */ -- cgit v1.2.3 From ad89c8852fde7ab0c4007f5b753517cf508d12be Mon Sep 17 00:00:00 2001 From: Mauro Carvalho Chehab Date: Tue, 14 Apr 2020 18:48:44 +0200 Subject: docs: spi: spi.h: fix a doc building warning We need to add a blank line to avoid this warning: ./include/linux/spi/spi.h:401: WARNING: Unexpected indentation. Signed-off-by: Mauro Carvalho Chehab Link: https://lore.kernel.org/r/1c701b3ac903dc0bc304dca958fbdee53bd38dc3.1586881715.git.mchehab+huawei@kernel.org Signed-off-by: Jonathan Corbet --- include/linux/spi/spi.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/spi/spi.h b/include/linux/spi/spi.h index 38286de779e3..aac57b5b7c21 100644 --- a/include/linux/spi/spi.h +++ b/include/linux/spi/spi.h @@ -394,6 +394,7 @@ static inline void spi_unregister_driver(struct spi_driver *sdrv) * for example doing DMA mapping. Called from threaded * context. * @transfer_one: transfer a single spi_transfer. + * * - return 0 if the transfer is finished, * - return 1 if the transfer is still in progress. When * the driver is finished with this transfer it must -- cgit v1.2.3 From 4951d27b099bd24f87a093f67d91618742bd3a65 Mon Sep 17 00:00:00 2001 From: Bumsik Kim Date: Fri, 3 Apr 2020 12:15:07 +0900 Subject: watchdog: clarify that stop() is optional The commit d0684c8a9354 ("watchdog: Make stop function optional") made stop function not mandatory, but the comments and the doc weren't reflected. Fix it to clarify. Signed-off-by: Bumsik Kim Reviewed-by: Guenter Roeck Link: https://lore.kernel.org/r/20200403031507.63487-1-k.bumsik@gmail.com Signed-off-by: Jonathan Corbet --- Documentation/watchdog/convert_drivers_to_kernel_api.rst | 2 +- Documentation/watchdog/watchdog-kernel-api.rst | 2 +- include/linux/watchdog.h | 4 ++-- 3 files changed, 4 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/Documentation/watchdog/convert_drivers_to_kernel_api.rst b/Documentation/watchdog/convert_drivers_to_kernel_api.rst index dd934cc08e40..51b999b5551a 100644 --- a/Documentation/watchdog/convert_drivers_to_kernel_api.rst +++ b/Documentation/watchdog/convert_drivers_to_kernel_api.rst @@ -115,7 +115,7 @@ Add the watchdog operations --------------------------- All possible callbacks are defined in 'struct watchdog_ops'. You can find it -explained in 'watchdog-kernel-api.txt' in this directory. start(), stop() and +explained in 'watchdog-kernel-api.txt' in this directory. start() and owner must be set, the rest are optional. You will easily find corresponding functions in the old driver. 
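With stop() no longer mandatory, a minimal ops table shrinks to something like the following sketch (hypothetical foo_wdt names; the hardware kick is elided):

	static int foo_wdt_start(struct watchdog_device *wdd)
	{
		/* arm/kick the hardware here */
		return 0;
	}

	static const struct watchdog_ops foo_wdt_ops = {
		.owner = THIS_MODULE,
		.start = foo_wdt_start,
		/* .stop may now be omitted, e.g. if the hardware cannot stop */
	};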
Note that you will now get a pointer to the watchdog_device as a parameter to these functions, so you probably have to diff --git a/Documentation/watchdog/watchdog-kernel-api.rst b/Documentation/watchdog/watchdog-kernel-api.rst index 864edbe932c1..068a55ee0d4a 100644 --- a/Documentation/watchdog/watchdog-kernel-api.rst +++ b/Documentation/watchdog/watchdog-kernel-api.rst @@ -123,8 +123,8 @@ The list of watchdog operations is defined as:: struct module *owner; /* mandatory operations */ int (*start)(struct watchdog_device *); - int (*stop)(struct watchdog_device *); /* optional operations */ + int (*stop)(struct watchdog_device *); int (*ping)(struct watchdog_device *); unsigned int (*status)(struct watchdog_device *); int (*set_timeout)(struct watchdog_device *, unsigned int); diff --git a/include/linux/watchdog.h b/include/linux/watchdog.h index 417d9f37077a..1464ce6ffa31 100644 --- a/include/linux/watchdog.h +++ b/include/linux/watchdog.h @@ -37,15 +37,15 @@ struct watchdog_governor; * * The watchdog_ops structure contains a list of low-level operations * that control a watchdog device. It also contains the module that owns - * these operations. The start and stop function are mandatory, all other + * these operations. The start function is mandatory, all other * functions are optional. */ struct watchdog_ops { struct module *owner; /* mandatory operations */ int (*start)(struct watchdog_device *); - int (*stop)(struct watchdog_device *); /* optional operations */ + int (*stop)(struct watchdog_device *); int (*ping)(struct watchdog_device *); unsigned int (*status)(struct watchdog_device *); int (*set_timeout)(struct watchdog_device *, unsigned int); -- cgit v1.2.3 From 90c165f0de3adad4719e65ab0c31d59edf5bd481 Mon Sep 17 00:00:00 2001 From: Ricardo Cañuelo Date: Fri, 3 Apr 2020 11:36:17 +0200 Subject: docs: pr_*() kerneldocs and basic printk docs MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add kerneldocs comments to the pr_*() macros in printk.h. Add a new rst node in the core-api manual describing the basic usage of printk and the related macro aliases. Signed-off-by: Ricardo Cañuelo Link: https://lore.kernel.org/r/20200403093617.18003-1-ricardo.canuelo@collabora.com Signed-off-by: Jonathan Corbet --- Documentation/core-api/index.rst | 1 + Documentation/core-api/printk-basics.rst | 115 ++++++++++++++++++++++++++++++ Documentation/core-api/printk-formats.rst | 2 + include/linux/printk.h | 112 +++++++++++++++++++++++++---- 4 files changed, 218 insertions(+), 12 deletions(-) create mode 100644 Documentation/core-api/printk-basics.rst (limited to 'include/linux') diff --git a/Documentation/core-api/index.rst b/Documentation/core-api/index.rst index 0897ad12c119..49e3da910d9e 100644 --- a/Documentation/core-api/index.rst +++ b/Documentation/core-api/index.rst @@ -18,6 +18,7 @@ it. kernel-api workqueue + printk-basics printk-formats symbol-namespaces diff --git a/Documentation/core-api/printk-basics.rst b/Documentation/core-api/printk-basics.rst new file mode 100644 index 000000000000..563a9ce5fe1d --- /dev/null +++ b/Documentation/core-api/printk-basics.rst @@ -0,0 +1,115 @@ +.. SPDX-License-Identifier: GPL-2.0 + +=========================== +Message logging with printk +=========================== + +printk() is one of the most widely known functions in the Linux kernel. It's the +standard tool we have for printing messages and usually the most basic way of +tracing and debugging. 
If you're familiar with printf(3) you can tell printk() +is based on it, although it has some functional differences: + + - printk() messages can specify a log level. + + - the format string, while largely compatible with C99, doesn't follow the + exact same specification. It has some extensions and a few limitations + (no ``%n`` or floating point conversion specifiers). See :ref:`How to get + printk format specifiers right `. + +All printk() messages are printed to the kernel log buffer, which is a ring +buffer exported to userspace through /dev/kmsg. The usual way to read it is +using ``dmesg``. + +printk() is typically used like this:: + + printk(KERN_INFO "Message: %s\n", arg); + +where ``KERN_INFO`` is the log level (note that it's concatenated to the format +string, the log level is not a separate argument). The available log levels are: + ++----------------+--------+-----------------------------------------------+ +| Name | String | Alias function | ++================+========+===============================================+ +| KERN_EMERG | "0" | pr_emerg() | ++----------------+--------+-----------------------------------------------+ +| KERN_ALERT | "1" | pr_alert() | ++----------------+--------+-----------------------------------------------+ +| KERN_CRIT | "2" | pr_crit() | ++----------------+--------+-----------------------------------------------+ +| KERN_ERR | "3" | pr_err() | ++----------------+--------+-----------------------------------------------+ +| KERN_WARNING | "4" | pr_warn() | ++----------------+--------+-----------------------------------------------+ +| KERN_NOTICE | "5" | pr_notice() | ++----------------+--------+-----------------------------------------------+ +| KERN_INFO | "6" | pr_info() | ++----------------+--------+-----------------------------------------------+ +| KERN_DEBUG | "7" | pr_debug() and pr_devel() if DEBUG is defined | ++----------------+--------+-----------------------------------------------+ +| KERN_DEFAULT | "" | | ++----------------+--------+-----------------------------------------------+ +| KERN_CONT | "c" | pr_cont() | ++----------------+--------+-----------------------------------------------+ + + +The log level specifies the importance of a message. The kernel decides whether +to show the message immediately (printing it to the current console) depending +on its log level and the current *console_loglevel* (a kernel variable). If the +message priority is higher (lower log level value) than the *console_loglevel* +the message will be printed to the console. + +If the log level is omitted, the message is printed with ``KERN_DEFAULT`` +level. + +You can check the current *console_loglevel* with:: + + $ cat /proc/sys/kernel/printk + 4 4 1 7 + +The result shows the *current*, *default*, *minimum* and *boot-time-default* log +levels. + +To change the current console_loglevel simply write the desired level to +``/proc/sys/kernel/printk``. For example, to print all messages to the console:: + + # echo 8 > /proc/sys/kernel/printk + +Another way, using ``dmesg``:: + + # dmesg -n 5 + +sets the console_loglevel to print KERN_WARNING (4) or more severe messages to +the console. See ``dmesg(1)`` for more information. + +As an alternative to printk() you can use the ``pr_*()`` aliases for +logging. This family of macros embeds the log level in the macro names. For +example:: + + pr_info("Info message no. %d\n", msg_num); + +prints a ``KERN_INFO`` message.
+ +Besides being more concise than the equivalent printk() calls, they can use a +common definition for the format string through the pr_fmt() macro. For +instance, defining this at the top of a source file (before any ``#include`` +directive):: + + #define pr_fmt(fmt) "%s:%s: " fmt, KBUILD_MODNAME, __func__ + +would prefix every pr_*() message in that file with the module and function name +that originated the message. + +For debugging purposes there are also two conditionally-compiled macros: +pr_debug() and pr_devel(), which are compiled-out unless ``DEBUG`` (or +also ``CONFIG_DYNAMIC_DEBUG`` in the case of pr_debug()) is defined. + + +Function reference +================== + +.. kernel-doc:: kernel/printk/printk.c + :functions: printk + +.. kernel-doc:: include/linux/printk.h + :functions: pr_emerg pr_alert pr_crit pr_err pr_warn pr_notice pr_info + pr_fmt pr_debug pr_devel pr_cont diff --git a/Documentation/core-api/printk-formats.rst b/Documentation/core-api/printk-formats.rst index 8ebe46b1af39..1e3838652348 100644 --- a/Documentation/core-api/printk-formats.rst +++ b/Documentation/core-api/printk-formats.rst @@ -2,6 +2,8 @@ How to get printk format specifiers right ========================================= +.. _printk-specifiers: + :Author: Randy Dunlap :Author: Andrew Murray diff --git a/include/linux/printk.h b/include/linux/printk.h index e061635e0409..ccea5d4a509e 100644 --- a/include/linux/printk.h +++ b/include/linux/printk.h @@ -279,39 +279,116 @@ static inline void printk_safe_flush_on_panic(void) extern int kptr_restrict; +/** + * pr_fmt - used by the pr_*() macros to generate the printk format string + * @fmt: format string passed from a pr_*() macro + * + * This macro can be used to generate a unified format string for pr_*() + * macros. A common use is to prefix all pr_*() messages in a file with a common + * string. For example, defining this at the top of a source file: + * + * #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + * + * would prefix all pr_info, pr_emerg... messages in the file with the module + * name. + */ #ifndef pr_fmt #define pr_fmt(fmt) fmt #endif -/* - * These can be used to print at the various log levels. - * All of these will print unconditionally, although note that pr_debug() - * and other debug macros are compiled out unless either DEBUG is defined - * or CONFIG_DYNAMIC_DEBUG is set. +/** + * pr_emerg - Print an emergency-level message + * @fmt: format string + * @...: arguments for the format string + * + * This macro expands to a printk with KERN_EMERG loglevel. It uses pr_fmt() to + * generate the format string. */ #define pr_emerg(fmt, ...) \ printk(KERN_EMERG pr_fmt(fmt), ##__VA_ARGS__) +/** + * pr_alert - Print an alert-level message + * @fmt: format string + * @...: arguments for the format string + * + * This macro expands to a printk with KERN_ALERT loglevel. It uses pr_fmt() to + * generate the format string. + */ #define pr_alert(fmt, ...) \ printk(KERN_ALERT pr_fmt(fmt), ##__VA_ARGS__) +/** + * pr_crit - Print a critical-level message + * @fmt: format string + * @...: arguments for the format string + * + * This macro expands to a printk with KERN_CRIT loglevel. It uses pr_fmt() to + * generate the format string. + */ #define pr_crit(fmt, ...) \ printk(KERN_CRIT pr_fmt(fmt), ##__VA_ARGS__) +/** + * pr_err - Print an error-level message + * @fmt: format string + * @...: arguments for the format string + * + * This macro expands to a printk with KERN_ERR loglevel. It uses pr_fmt() to + * generate the format string. 
+ */ #define pr_err(fmt, ...) \ printk(KERN_ERR pr_fmt(fmt), ##__VA_ARGS__) +/** + * pr_warn - Print a warning-level message + * @fmt: format string + * @...: arguments for the format string + * + * This macro expands to a printk with KERN_WARNING loglevel. It uses pr_fmt() + * to generate the format string. + */ #define pr_warn(fmt, ...) \ printk(KERN_WARNING pr_fmt(fmt), ##__VA_ARGS__) +/** + * pr_notice - Print a notice-level message + * @fmt: format string + * @...: arguments for the format string + * + * This macro expands to a printk with KERN_NOTICE loglevel. It uses pr_fmt() to + * generate the format string. + */ #define pr_notice(fmt, ...) \ printk(KERN_NOTICE pr_fmt(fmt), ##__VA_ARGS__) +/** + * pr_info - Print an info-level message + * @fmt: format string + * @...: arguments for the format string + * + * This macro expands to a printk with KERN_INFO loglevel. It uses pr_fmt() to + * generate the format string. + */ #define pr_info(fmt, ...) \ printk(KERN_INFO pr_fmt(fmt), ##__VA_ARGS__) -/* - * Like KERN_CONT, pr_cont() should only be used when continuing - * a line with no newline ('\n') enclosed. Otherwise it defaults - * back to KERN_DEFAULT. + +/** + * pr_cont - Continues a previous log message in the same line. + * @fmt: format string + * @...: arguments for the format string + * + * This macro expands to a printk with KERN_CONT loglevel. It should only be + * used when continuing a log message with no newline ('\n') enclosed. Otherwise + * it defaults back to KERN_DEFAULT loglevel. */ #define pr_cont(fmt, ...) \ printk(KERN_CONT fmt, ##__VA_ARGS__) -/* pr_devel() should produce zero code unless DEBUG is defined */ +/** + * pr_devel - Print a debug-level message conditionally + * @fmt: format string + * @...: arguments for the format string + * + * This macro expands to a printk with KERN_DEBUG loglevel if DEBUG is + * defined. Otherwise it does nothing. + * + * It uses pr_fmt() to generate the format string. + */ #ifdef DEBUG #define pr_devel(fmt, ...) \ printk(KERN_DEBUG pr_fmt(fmt), ##__VA_ARGS__) @@ -325,8 +402,19 @@ extern int kptr_restrict; #if defined(CONFIG_DYNAMIC_DEBUG) #include -/* dynamic_pr_debug() uses pr_fmt() internally so we don't need it here */ -#define pr_debug(fmt, ...) \ +/** + * pr_debug - Print a debug-level message conditionally + * @fmt: format string + * @...: arguments for the format string + * + * This macro expands to dynamic_pr_debug() if CONFIG_DYNAMIC_DEBUG is + * set. Otherwise, if DEBUG is defined, it's equivalent to a printk with + * KERN_DEBUG loglevel. If DEBUG is not defined it does nothing. + * + * It uses pr_fmt() to generate the format string (dynamic_pr_debug() uses + * pr_fmt() internally). + */ +#define pr_debug(fmt, ...) \ dynamic_pr_debug(fmt, ##__VA_ARGS__) #elif defined(DEBUG) #define pr_debug(fmt, ...) \ -- cgit v1.2.3 From 1487deda19c82d30d1867277e89bc2d515b9d2d4 Mon Sep 17 00:00:00 2001 From: Mathieu Poirier Date: Mon, 20 Apr 2020 17:15:58 -0600 Subject: remoteproc: Use kstrdup_const() rather than kstrdup() For cases where @firmware is declared "const char *", use function kstrdup_const() to avoid needlessly creating another copy on the heap. 
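To sketch the semantics this relies on (illustrative, not code from the patch): kstrdup_const() skips the copy when the source string lives in .rodata, and kfree_const() frees only what was actually duplicated:

	static const char fixed_fw[] = "foo-fw.elf";	/* hypothetical, in .rodata */

	const char *fw = kstrdup_const(fixed_fw, GFP_KERNEL);
	/* no allocation happened above: kstrdup_const() spotted a .rodata string */

	kfree_const(fw);	/* pairs with kstrdup_const(); a no-op free here */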
Suggested-by: Bjorn Andersson Signed-off-by: Mathieu Poirier Reviewed-by: Alex Elder Reviewed-by: Bjorn Andersson Link: https://lore.kernel.org/r/20200420231601.16781-2-mathieu.poirier@linaro.org Signed-off-by: Bjorn Andersson --- drivers/remoteproc/remoteproc_core.c | 8 ++++---- include/linux/remoteproc.h | 2 +- 2 files changed, 5 insertions(+), 5 deletions(-) (limited to 'include/linux') diff --git a/drivers/remoteproc/remoteproc_core.c b/drivers/remoteproc/remoteproc_core.c index d9e6949e4ac1..db8a15fc1e4a 100644 --- a/drivers/remoteproc/remoteproc_core.c +++ b/drivers/remoteproc/remoteproc_core.c @@ -1996,7 +1996,7 @@ static void rproc_type_release(struct device *dev) if (rproc->index >= 0) ida_simple_remove(&rproc_dev_index, rproc->index); - kfree(rproc->firmware); + kfree_const(rproc->firmware); kfree(rproc->ops); kfree(rproc); } @@ -2009,7 +2009,7 @@ static const struct device_type rproc_type = { static int rproc_alloc_firmware(struct rproc *rproc, const char *name, const char *firmware) { - char *p; + const char *p; if (!firmware) /* @@ -2018,7 +2018,7 @@ static int rproc_alloc_firmware(struct rproc *rproc, */ p = kasprintf(GFP_KERNEL, "rproc-%s-fw", name); else - p = kstrdup(firmware, GFP_KERNEL); + p = kstrdup_const(firmware, GFP_KERNEL); if (!p) return -ENOMEM; @@ -2122,7 +2122,7 @@ struct rproc *rproc_alloc(struct device *dev, const char *name, return rproc; free_firmware: - kfree(rproc->firmware); + kfree_const(rproc->firmware); free_rproc: kfree(rproc); return NULL; diff --git a/include/linux/remoteproc.h b/include/linux/remoteproc.h index 0547676479d3..800b4f09dc98 100644 --- a/include/linux/remoteproc.h +++ b/include/linux/remoteproc.h @@ -489,7 +489,7 @@ struct rproc { struct list_head node; struct iommu_domain *domain; const char *name; - char *firmware; + const char *firmware; void *priv; struct rproc_ops *ops; struct device dev; -- cgit v1.2.3 From 305ac5a766b1d0dd8a4052c8c92e5464888eaa10 Mon Sep 17 00:00:00 2001 From: Paul Cercueil Date: Fri, 17 Apr 2020 19:00:37 +0200 Subject: remoteproc: Add device-managed variants of rproc_alloc/rproc_add Add API functions devm_rproc_alloc() and devm_rproc_add(), which behave like rproc_alloc() and rproc_add() respectively, but register their respective cleanup function to be called on driver detach. Reviewed-by: Bjorn Andersson Signed-off-by: Paul Cercueil Link: https://lore.kernel.org/r/20200417170040.174319-2-paul@crapouillou.net Signed-off-by: Bjorn Andersson --- drivers/remoteproc/remoteproc_core.c | 67 ++++++++++++++++++++++++++++++++++++ include/linux/remoteproc.h | 5 +++ 2 files changed, 72 insertions(+) (limited to 'include/linux') diff --git a/drivers/remoteproc/remoteproc_core.c b/drivers/remoteproc/remoteproc_core.c index a352362d8e48..448262470fc7 100644 --- a/drivers/remoteproc/remoteproc_core.c +++ b/drivers/remoteproc/remoteproc_core.c @@ -1976,6 +1976,33 @@ int rproc_add(struct rproc *rproc) } EXPORT_SYMBOL(rproc_add); +static void devm_rproc_remove(void *rproc) +{ + rproc_del(rproc); +} + +/** + * devm_rproc_add() - resource managed rproc_add() + * @dev: the underlying device + * @rproc: the remote processor handle to register + * + * This function performs like rproc_add() but the registered rproc device will + * automatically be removed on driver detach. 
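+ * A usage sketch (hypothetical "foo" platform driver probe, not mandated by
+ * this API): both allocation and registration can now be devres-bound:
+ *
+ *	rproc = devm_rproc_alloc(&pdev->dev, "foo", &foo_rproc_ops,
+ *				 "foo-fw.elf", sizeof(struct foo_priv));
+ *	if (!rproc)
+ *		return -ENOMEM;
+ *	err = devm_rproc_add(&pdev->dev, rproc);
+ *	if (err)
+ *		return err;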
+ * + * Returns: 0 on success, negative errno on failure + */ +int devm_rproc_add(struct device *dev, struct rproc *rproc) +{ + int err; + + err = rproc_add(rproc); + if (err) + return err; + + return devm_add_action_or_reset(dev, devm_rproc_remove, rproc); +} +EXPORT_SYMBOL(devm_rproc_add); + /** * rproc_type_release() - release a remote processor instance * @dev: the rproc's device @@ -2215,6 +2242,46 @@ int rproc_del(struct rproc *rproc) } EXPORT_SYMBOL(rproc_del); +static void devm_rproc_free(struct device *dev, void *res) +{ + rproc_free(*(struct rproc **)res); +} + +/** + * devm_rproc_alloc() - resource managed rproc_alloc() + * @dev: the underlying device + * @name: name of this remote processor + * @ops: platform-specific handlers (mainly start/stop) + * @firmware: name of firmware file to load, can be NULL + * @len: length of private data needed by the rproc driver (in bytes) + * + * This function performs like rproc_alloc() but the acquired rproc device will + * automatically be released on driver detach. + * + * Returns: new rproc instance, or NULL on failure + */ +struct rproc *devm_rproc_alloc(struct device *dev, const char *name, + const struct rproc_ops *ops, + const char *firmware, int len) +{ + struct rproc **ptr, *rproc; + + ptr = devres_alloc(devm_rproc_free, sizeof(*ptr), GFP_KERNEL); + if (!ptr) + return ERR_PTR(-ENOMEM); + + rproc = rproc_alloc(dev, name, ops, firmware, len); + if (rproc) { + *ptr = rproc; + devres_add(dev, ptr); + } else { + devres_free(ptr); + } + + return rproc; +} +EXPORT_SYMBOL(devm_rproc_alloc); + /** * rproc_add_subdev() - add a subdevice to a remoteproc * @rproc: rproc handle to add the subdevice to diff --git a/include/linux/remoteproc.h b/include/linux/remoteproc.h index 800b4f09dc98..ac4082f12e8b 100644 --- a/include/linux/remoteproc.h +++ b/include/linux/remoteproc.h @@ -600,6 +600,11 @@ int rproc_add(struct rproc *rproc); int rproc_del(struct rproc *rproc); void rproc_free(struct rproc *rproc); +struct rproc *devm_rproc_alloc(struct device *dev, const char *name, + const struct rproc_ops *ops, + const char *firmware, int len); +int devm_rproc_add(struct device *dev, struct rproc *rproc); + void rproc_add_carveout(struct rproc *rproc, struct rproc_mem_entry *mem); struct rproc_mem_entry * -- cgit v1.2.3 From 812756a82ea51e3c7ff7ba5e6fa3f34345234bc7 Mon Sep 17 00:00:00 2001 From: Emanuele Giuseppe Esposito Date: Tue, 14 Apr 2020 17:56:25 +0200 Subject: kvm_host: unify VM_STAT and VCPU_STAT definitions in a single place The macros VM_STAT and VCPU_STAT are redundantly implemented in multiple files, each used by a different architecture to initialize the debugfs entries for statistics.
Since they all have the same purpose, they can be unified in a single common definition in include/linux/kvm_host.h Signed-off-by: Emanuele Giuseppe Esposito Message-Id: <20200414155625.20559-1-eesposit@redhat.com> Acked-by: Cornelia Huck Signed-off-by: Paolo Bonzini --- arch/arm64/kvm/guest.c | 23 +++--- arch/mips/kvm/mips.c | 61 +++++++------- arch/powerpc/kvm/book3s.c | 61 +++++++------- arch/powerpc/kvm/booke.c | 41 +++++----- arch/s390/kvm/kvm-s390.c | 203 +++++++++++++++++++++++----------------------- arch/x86/kvm/x86.c | 80 +++++++++--------- include/linux/kvm_host.h | 5 ++ 7 files changed, 231 insertions(+), 243 deletions(-) (limited to 'include/linux') diff --git a/arch/arm64/kvm/guest.c b/arch/arm64/kvm/guest.c index 23ebe51410f0..8417b200bec9 100644 --- a/arch/arm64/kvm/guest.c +++ b/arch/arm64/kvm/guest.c @@ -29,20 +29,17 @@ #include "trace.h" -#define VM_STAT(x) { #x, offsetof(struct kvm, stat.x), KVM_STAT_VM } -#define VCPU_STAT(x) { #x, offsetof(struct kvm_vcpu, stat.x), KVM_STAT_VCPU } - struct kvm_stats_debugfs_item debugfs_entries[] = { - VCPU_STAT(halt_successful_poll), - VCPU_STAT(halt_attempted_poll), - VCPU_STAT(halt_poll_invalid), - VCPU_STAT(halt_wakeup), - VCPU_STAT(hvc_exit_stat), - VCPU_STAT(wfe_exit_stat), - VCPU_STAT(wfi_exit_stat), - VCPU_STAT(mmio_exit_user), - VCPU_STAT(mmio_exit_kernel), - VCPU_STAT(exits), + VCPU_STAT("halt_successful_poll", halt_successful_poll), + VCPU_STAT("halt_attempted_poll", halt_attempted_poll), + VCPU_STAT("halt_poll_invalid", halt_poll_invalid), + VCPU_STAT("halt_wakeup", halt_wakeup), + VCPU_STAT("hvc_exit_stat", hvc_exit_stat), + VCPU_STAT("wfe_exit_stat", wfe_exit_stat), + VCPU_STAT("wfi_exit_stat", wfi_exit_stat), + VCPU_STAT("mmio_exit_user", mmio_exit_user), + VCPU_STAT("mmio_exit_kernel", mmio_exit_kernel), + VCPU_STAT("exits", exits), { NULL } }; diff --git a/arch/mips/kvm/mips.c b/arch/mips/kvm/mips.c index 8f05dd0a0f4e..fdf1c14d9205 100644 --- a/arch/mips/kvm/mips.c +++ b/arch/mips/kvm/mips.c @@ -39,40 +39,39 @@ #define VECTORSPACING 0x100 /* for EI/VI mode */ #endif -#define VCPU_STAT(x) offsetof(struct kvm_vcpu, stat.x) struct kvm_stats_debugfs_item debugfs_entries[] = { - { "wait", VCPU_STAT(wait_exits), KVM_STAT_VCPU }, - { "cache", VCPU_STAT(cache_exits), KVM_STAT_VCPU }, - { "signal", VCPU_STAT(signal_exits), KVM_STAT_VCPU }, - { "interrupt", VCPU_STAT(int_exits), KVM_STAT_VCPU }, - { "cop_unusable", VCPU_STAT(cop_unusable_exits), KVM_STAT_VCPU }, - { "tlbmod", VCPU_STAT(tlbmod_exits), KVM_STAT_VCPU }, - { "tlbmiss_ld", VCPU_STAT(tlbmiss_ld_exits), KVM_STAT_VCPU }, - { "tlbmiss_st", VCPU_STAT(tlbmiss_st_exits), KVM_STAT_VCPU }, - { "addrerr_st", VCPU_STAT(addrerr_st_exits), KVM_STAT_VCPU }, - { "addrerr_ld", VCPU_STAT(addrerr_ld_exits), KVM_STAT_VCPU }, - { "syscall", VCPU_STAT(syscall_exits), KVM_STAT_VCPU }, - { "resvd_inst", VCPU_STAT(resvd_inst_exits), KVM_STAT_VCPU }, - { "break_inst", VCPU_STAT(break_inst_exits), KVM_STAT_VCPU }, - { "trap_inst", VCPU_STAT(trap_inst_exits), KVM_STAT_VCPU }, - { "msa_fpe", VCPU_STAT(msa_fpe_exits), KVM_STAT_VCPU }, - { "fpe", VCPU_STAT(fpe_exits), KVM_STAT_VCPU }, - { "msa_disabled", VCPU_STAT(msa_disabled_exits), KVM_STAT_VCPU }, - { "flush_dcache", VCPU_STAT(flush_dcache_exits), KVM_STAT_VCPU }, + VCPU_STAT("wait", wait_exits), + VCPU_STAT("cache", cache_exits), + VCPU_STAT("signal", signal_exits), + VCPU_STAT("interrupt", int_exits), + VCPU_STAT("cop_unusable", cop_unusable_exits), + VCPU_STAT("tlbmod", tlbmod_exits), + VCPU_STAT("tlbmiss_ld", tlbmiss_ld_exits), + 
VCPU_STAT("tlbmiss_st", tlbmiss_st_exits), + VCPU_STAT("addrerr_st", addrerr_st_exits), + VCPU_STAT("addrerr_ld", addrerr_ld_exits), + VCPU_STAT("syscall", syscall_exits), + VCPU_STAT("resvd_inst", resvd_inst_exits), + VCPU_STAT("break_inst", break_inst_exits), + VCPU_STAT("trap_inst", trap_inst_exits), + VCPU_STAT("msa_fpe", msa_fpe_exits), + VCPU_STAT("fpe", fpe_exits), + VCPU_STAT("msa_disabled", msa_disabled_exits), + VCPU_STAT("flush_dcache", flush_dcache_exits), #ifdef CONFIG_KVM_MIPS_VZ - { "vz_gpsi", VCPU_STAT(vz_gpsi_exits), KVM_STAT_VCPU }, - { "vz_gsfc", VCPU_STAT(vz_gsfc_exits), KVM_STAT_VCPU }, - { "vz_hc", VCPU_STAT(vz_hc_exits), KVM_STAT_VCPU }, - { "vz_grr", VCPU_STAT(vz_grr_exits), KVM_STAT_VCPU }, - { "vz_gva", VCPU_STAT(vz_gva_exits), KVM_STAT_VCPU }, - { "vz_ghfc", VCPU_STAT(vz_ghfc_exits), KVM_STAT_VCPU }, - { "vz_gpa", VCPU_STAT(vz_gpa_exits), KVM_STAT_VCPU }, - { "vz_resvd", VCPU_STAT(vz_resvd_exits), KVM_STAT_VCPU }, + VCPU_STAT("vz_gpsi", vz_gpsi_exits), + VCPU_STAT("vz_gsfc", vz_gsfc_exits), + VCPU_STAT("vz_hc", vz_hc_exits), + VCPU_STAT("vz_grr", vz_grr_exits), + VCPU_STAT("vz_gva", vz_gva_exits), + VCPU_STAT("vz_ghfc", vz_ghfc_exits), + VCPU_STAT("vz_gpa", vz_gpa_exits), + VCPU_STAT("vz_resvd", vz_resvd_exits), #endif - { "halt_successful_poll", VCPU_STAT(halt_successful_poll), KVM_STAT_VCPU }, - { "halt_attempted_poll", VCPU_STAT(halt_attempted_poll), KVM_STAT_VCPU }, - { "halt_poll_invalid", VCPU_STAT(halt_poll_invalid), KVM_STAT_VCPU }, - { "halt_wakeup", VCPU_STAT(halt_wakeup), KVM_STAT_VCPU }, + VCPU_STAT("halt_successful_poll", halt_successful_poll), + VCPU_STAT("halt_attempted_poll", halt_attempted_poll), + VCPU_STAT("halt_poll_invalid", halt_poll_invalid), + VCPU_STAT("halt_wakeup", halt_wakeup), {NULL} }; diff --git a/arch/powerpc/kvm/book3s.c b/arch/powerpc/kvm/book3s.c index 5690a1f9b976..37508a356f28 100644 --- a/arch/powerpc/kvm/book3s.c +++ b/arch/powerpc/kvm/book3s.c @@ -36,41 +36,38 @@ #include "book3s.h" #include "trace.h" -#define VM_STAT(x, ...) offsetof(struct kvm, stat.x), KVM_STAT_VM, ## __VA_ARGS__ -#define VCPU_STAT(x, ...) 
offsetof(struct kvm_vcpu, stat.x), KVM_STAT_VCPU, ## __VA_ARGS__ - /* #define EXIT_DEBUG */ struct kvm_stats_debugfs_item debugfs_entries[] = { - { "exits", VCPU_STAT(sum_exits) }, - { "mmio", VCPU_STAT(mmio_exits) }, - { "sig", VCPU_STAT(signal_exits) }, - { "sysc", VCPU_STAT(syscall_exits) }, - { "inst_emu", VCPU_STAT(emulated_inst_exits) }, - { "dec", VCPU_STAT(dec_exits) }, - { "ext_intr", VCPU_STAT(ext_intr_exits) }, - { "queue_intr", VCPU_STAT(queue_intr) }, - { "halt_poll_success_ns", VCPU_STAT(halt_poll_success_ns) }, - { "halt_poll_fail_ns", VCPU_STAT(halt_poll_fail_ns) }, - { "halt_wait_ns", VCPU_STAT(halt_wait_ns) }, - { "halt_successful_poll", VCPU_STAT(halt_successful_poll), }, - { "halt_attempted_poll", VCPU_STAT(halt_attempted_poll), }, - { "halt_successful_wait", VCPU_STAT(halt_successful_wait) }, - { "halt_poll_invalid", VCPU_STAT(halt_poll_invalid) }, - { "halt_wakeup", VCPU_STAT(halt_wakeup) }, - { "pf_storage", VCPU_STAT(pf_storage) }, - { "sp_storage", VCPU_STAT(sp_storage) }, - { "pf_instruc", VCPU_STAT(pf_instruc) }, - { "sp_instruc", VCPU_STAT(sp_instruc) }, - { "ld", VCPU_STAT(ld) }, - { "ld_slow", VCPU_STAT(ld_slow) }, - { "st", VCPU_STAT(st) }, - { "st_slow", VCPU_STAT(st_slow) }, - { "pthru_all", VCPU_STAT(pthru_all) }, - { "pthru_host", VCPU_STAT(pthru_host) }, - { "pthru_bad_aff", VCPU_STAT(pthru_bad_aff) }, - { "largepages_2M", VM_STAT(num_2M_pages, .mode = 0444) }, - { "largepages_1G", VM_STAT(num_1G_pages, .mode = 0444) }, + VCPU_STAT("exits", sum_exits), + VCPU_STAT("mmio", mmio_exits), + VCPU_STAT("sig", signal_exits), + VCPU_STAT("sysc", syscall_exits), + VCPU_STAT("inst_emu", emulated_inst_exits), + VCPU_STAT("dec", dec_exits), + VCPU_STAT("ext_intr", ext_intr_exits), + VCPU_STAT("queue_intr", queue_intr), + VCPU_STAT("halt_poll_success_ns", halt_poll_success_ns), + VCPU_STAT("halt_poll_fail_ns", halt_poll_fail_ns), + VCPU_STAT("halt_wait_ns", halt_wait_ns), + VCPU_STAT("halt_successful_poll", halt_successful_poll), + VCPU_STAT("halt_attempted_poll", halt_attempted_poll), + VCPU_STAT("halt_successful_wait", halt_successful_wait), + VCPU_STAT("halt_poll_invalid", halt_poll_invalid), + VCPU_STAT("halt_wakeup", halt_wakeup), + VCPU_STAT("pf_storage", pf_storage), + VCPU_STAT("sp_storage", sp_storage), + VCPU_STAT("pf_instruc", pf_instruc), + VCPU_STAT("sp_instruc", sp_instruc), + VCPU_STAT("ld", ld), + VCPU_STAT("ld_slow", ld_slow), + VCPU_STAT("st", st), + VCPU_STAT("st_slow", st_slow), + VCPU_STAT("pthru_all", pthru_all), + VCPU_STAT("pthru_host", pthru_host), + VCPU_STAT("pthru_bad_aff", pthru_bad_aff), + VM_STAT("largepages_2M", num_2M_pages, .mode = 0444), + VM_STAT("largepages_1G", num_1G_pages, .mode = 0444), { NULL } }; diff --git a/arch/powerpc/kvm/booke.c b/arch/powerpc/kvm/booke.c index 6c18ea88fd25..c2984cb6dfa7 100644 --- a/arch/powerpc/kvm/booke.c +++ b/arch/powerpc/kvm/booke.c @@ -35,29 +35,26 @@ unsigned long kvmppc_booke_handlers; -#define VM_STAT(x) offsetof(struct kvm, stat.x), KVM_STAT_VM -#define VCPU_STAT(x) offsetof(struct kvm_vcpu, stat.x), KVM_STAT_VCPU - struct kvm_stats_debugfs_item debugfs_entries[] = { - { "mmio", VCPU_STAT(mmio_exits) }, - { "sig", VCPU_STAT(signal_exits) }, - { "itlb_r", VCPU_STAT(itlb_real_miss_exits) }, - { "itlb_v", VCPU_STAT(itlb_virt_miss_exits) }, - { "dtlb_r", VCPU_STAT(dtlb_real_miss_exits) }, - { "dtlb_v", VCPU_STAT(dtlb_virt_miss_exits) }, - { "sysc", VCPU_STAT(syscall_exits) }, - { "isi", VCPU_STAT(isi_exits) }, - { "dsi", VCPU_STAT(dsi_exits) }, - { "inst_emu", VCPU_STAT(emulated_inst_exits) }, - 
{ "dec", VCPU_STAT(dec_exits) }, - { "ext_intr", VCPU_STAT(ext_intr_exits) }, - { "halt_successful_poll", VCPU_STAT(halt_successful_poll) }, - { "halt_attempted_poll", VCPU_STAT(halt_attempted_poll) }, - { "halt_poll_invalid", VCPU_STAT(halt_poll_invalid) }, - { "halt_wakeup", VCPU_STAT(halt_wakeup) }, - { "doorbell", VCPU_STAT(dbell_exits) }, - { "guest doorbell", VCPU_STAT(gdbell_exits) }, - { "remote_tlb_flush", VM_STAT(remote_tlb_flush) }, + VCPU_STAT("mmio", mmio_exits), + VCPU_STAT("sig", signal_exits), + VCPU_STAT("itlb_r", itlb_real_miss_exits), + VCPU_STAT("itlb_v", itlb_virt_miss_exits), + VCPU_STAT("dtlb_r", dtlb_real_miss_exits), + VCPU_STAT("dtlb_v", dtlb_virt_miss_exits), + VCPU_STAT("sysc", syscall_exits), + VCPU_STAT("isi", isi_exits), + VCPU_STAT("dsi", dsi_exits), + VCPU_STAT("inst_emu", emulated_inst_exits), + VCPU_STAT("dec", dec_exits), + VCPU_STAT("ext_intr", ext_intr_exits), + VCPU_STAT("halt_successful_poll", halt_successful_poll), + VCPU_STAT("halt_attempted_poll", halt_attempted_poll), + VCPU_STAT("halt_poll_invalid", halt_poll_invalid), + VCPU_STAT("halt_wakeup", halt_wakeup), + VCPU_STAT("doorbell", dbell_exits), + VCPU_STAT("guest doorbell", gdbell_exits), + VM_STAT("remote_tlb_flush", remote_tlb_flush), { NULL } }; diff --git a/arch/s390/kvm/kvm-s390.c b/arch/s390/kvm/kvm-s390.c index 5dcf9ff12828..5307929a6c89 100644 --- a/arch/s390/kvm/kvm-s390.c +++ b/arch/s390/kvm/kvm-s390.c @@ -57,110 +57,107 @@ #define VCPU_IRQS_MAX_BUF (sizeof(struct kvm_s390_irq) * \ (KVM_MAX_VCPUS + LOCAL_IRQS)) -#define VCPU_STAT(x) offsetof(struct kvm_vcpu, stat.x), KVM_STAT_VCPU -#define VM_STAT(x) offsetof(struct kvm, stat.x), KVM_STAT_VM - struct kvm_stats_debugfs_item debugfs_entries[] = { - { "userspace_handled", VCPU_STAT(exit_userspace) }, - { "exit_null", VCPU_STAT(exit_null) }, - { "exit_validity", VCPU_STAT(exit_validity) }, - { "exit_stop_request", VCPU_STAT(exit_stop_request) }, - { "exit_external_request", VCPU_STAT(exit_external_request) }, - { "exit_io_request", VCPU_STAT(exit_io_request) }, - { "exit_external_interrupt", VCPU_STAT(exit_external_interrupt) }, - { "exit_instruction", VCPU_STAT(exit_instruction) }, - { "exit_pei", VCPU_STAT(exit_pei) }, - { "exit_program_interruption", VCPU_STAT(exit_program_interruption) }, - { "exit_instr_and_program_int", VCPU_STAT(exit_instr_and_program) }, - { "exit_operation_exception", VCPU_STAT(exit_operation_exception) }, - { "halt_successful_poll", VCPU_STAT(halt_successful_poll) }, - { "halt_attempted_poll", VCPU_STAT(halt_attempted_poll) }, - { "halt_poll_invalid", VCPU_STAT(halt_poll_invalid) }, - { "halt_no_poll_steal", VCPU_STAT(halt_no_poll_steal) }, - { "halt_wakeup", VCPU_STAT(halt_wakeup) }, - { "instruction_lctlg", VCPU_STAT(instruction_lctlg) }, - { "instruction_lctl", VCPU_STAT(instruction_lctl) }, - { "instruction_stctl", VCPU_STAT(instruction_stctl) }, - { "instruction_stctg", VCPU_STAT(instruction_stctg) }, - { "deliver_ckc", VCPU_STAT(deliver_ckc) }, - { "deliver_cputm", VCPU_STAT(deliver_cputm) }, - { "deliver_emergency_signal", VCPU_STAT(deliver_emergency_signal) }, - { "deliver_external_call", VCPU_STAT(deliver_external_call) }, - { "deliver_service_signal", VCPU_STAT(deliver_service_signal) }, - { "deliver_virtio", VCPU_STAT(deliver_virtio) }, - { "deliver_stop_signal", VCPU_STAT(deliver_stop_signal) }, - { "deliver_prefix_signal", VCPU_STAT(deliver_prefix_signal) }, - { "deliver_restart_signal", VCPU_STAT(deliver_restart_signal) }, - { "deliver_program", VCPU_STAT(deliver_program) }, - { "deliver_io", 
VCPU_STAT(deliver_io) }, - { "deliver_machine_check", VCPU_STAT(deliver_machine_check) }, - { "exit_wait_state", VCPU_STAT(exit_wait_state) }, - { "inject_ckc", VCPU_STAT(inject_ckc) }, - { "inject_cputm", VCPU_STAT(inject_cputm) }, - { "inject_external_call", VCPU_STAT(inject_external_call) }, - { "inject_float_mchk", VM_STAT(inject_float_mchk) }, - { "inject_emergency_signal", VCPU_STAT(inject_emergency_signal) }, - { "inject_io", VM_STAT(inject_io) }, - { "inject_mchk", VCPU_STAT(inject_mchk) }, - { "inject_pfault_done", VM_STAT(inject_pfault_done) }, - { "inject_program", VCPU_STAT(inject_program) }, - { "inject_restart", VCPU_STAT(inject_restart) }, - { "inject_service_signal", VM_STAT(inject_service_signal) }, - { "inject_set_prefix", VCPU_STAT(inject_set_prefix) }, - { "inject_stop_signal", VCPU_STAT(inject_stop_signal) }, - { "inject_pfault_init", VCPU_STAT(inject_pfault_init) }, - { "inject_virtio", VM_STAT(inject_virtio) }, - { "instruction_epsw", VCPU_STAT(instruction_epsw) }, - { "instruction_gs", VCPU_STAT(instruction_gs) }, - { "instruction_io_other", VCPU_STAT(instruction_io_other) }, - { "instruction_lpsw", VCPU_STAT(instruction_lpsw) }, - { "instruction_lpswe", VCPU_STAT(instruction_lpswe) }, - { "instruction_pfmf", VCPU_STAT(instruction_pfmf) }, - { "instruction_ptff", VCPU_STAT(instruction_ptff) }, - { "instruction_stidp", VCPU_STAT(instruction_stidp) }, - { "instruction_sck", VCPU_STAT(instruction_sck) }, - { "instruction_sckpf", VCPU_STAT(instruction_sckpf) }, - { "instruction_spx", VCPU_STAT(instruction_spx) }, - { "instruction_stpx", VCPU_STAT(instruction_stpx) }, - { "instruction_stap", VCPU_STAT(instruction_stap) }, - { "instruction_iske", VCPU_STAT(instruction_iske) }, - { "instruction_ri", VCPU_STAT(instruction_ri) }, - { "instruction_rrbe", VCPU_STAT(instruction_rrbe) }, - { "instruction_sske", VCPU_STAT(instruction_sske) }, - { "instruction_ipte_interlock", VCPU_STAT(instruction_ipte_interlock) }, - { "instruction_essa", VCPU_STAT(instruction_essa) }, - { "instruction_stsi", VCPU_STAT(instruction_stsi) }, - { "instruction_stfl", VCPU_STAT(instruction_stfl) }, - { "instruction_tb", VCPU_STAT(instruction_tb) }, - { "instruction_tpi", VCPU_STAT(instruction_tpi) }, - { "instruction_tprot", VCPU_STAT(instruction_tprot) }, - { "instruction_tsch", VCPU_STAT(instruction_tsch) }, - { "instruction_sthyi", VCPU_STAT(instruction_sthyi) }, - { "instruction_sie", VCPU_STAT(instruction_sie) }, - { "instruction_sigp_sense", VCPU_STAT(instruction_sigp_sense) }, - { "instruction_sigp_sense_running", VCPU_STAT(instruction_sigp_sense_running) }, - { "instruction_sigp_external_call", VCPU_STAT(instruction_sigp_external_call) }, - { "instruction_sigp_emergency", VCPU_STAT(instruction_sigp_emergency) }, - { "instruction_sigp_cond_emergency", VCPU_STAT(instruction_sigp_cond_emergency) }, - { "instruction_sigp_start", VCPU_STAT(instruction_sigp_start) }, - { "instruction_sigp_stop", VCPU_STAT(instruction_sigp_stop) }, - { "instruction_sigp_stop_store_status", VCPU_STAT(instruction_sigp_stop_store_status) }, - { "instruction_sigp_store_status", VCPU_STAT(instruction_sigp_store_status) }, - { "instruction_sigp_store_adtl_status", VCPU_STAT(instruction_sigp_store_adtl_status) }, - { "instruction_sigp_set_arch", VCPU_STAT(instruction_sigp_arch) }, - { "instruction_sigp_set_prefix", VCPU_STAT(instruction_sigp_prefix) }, - { "instruction_sigp_restart", VCPU_STAT(instruction_sigp_restart) }, - { "instruction_sigp_cpu_reset", VCPU_STAT(instruction_sigp_cpu_reset) }, - { 
"instruction_sigp_init_cpu_reset", VCPU_STAT(instruction_sigp_init_cpu_reset) }, - { "instruction_sigp_unknown", VCPU_STAT(instruction_sigp_unknown) }, - { "instruction_diag_10", VCPU_STAT(diagnose_10) }, - { "instruction_diag_44", VCPU_STAT(diagnose_44) }, - { "instruction_diag_9c", VCPU_STAT(diagnose_9c) }, - { "diag_9c_ignored", VCPU_STAT(diagnose_9c_ignored) }, - { "instruction_diag_258", VCPU_STAT(diagnose_258) }, - { "instruction_diag_308", VCPU_STAT(diagnose_308) }, - { "instruction_diag_500", VCPU_STAT(diagnose_500) }, - { "instruction_diag_other", VCPU_STAT(diagnose_other) }, + VCPU_STAT("userspace_handled", exit_userspace), + VCPU_STAT("exit_null", exit_null), + VCPU_STAT("exit_validity", exit_validity), + VCPU_STAT("exit_stop_request", exit_stop_request), + VCPU_STAT("exit_external_request", exit_external_request), + VCPU_STAT("exit_io_request", exit_io_request), + VCPU_STAT("exit_external_interrupt", exit_external_interrupt), + VCPU_STAT("exit_instruction", exit_instruction), + VCPU_STAT("exit_pei", exit_pei), + VCPU_STAT("exit_program_interruption", exit_program_interruption), + VCPU_STAT("exit_instr_and_program_int", exit_instr_and_program), + VCPU_STAT("exit_operation_exception", exit_operation_exception), + VCPU_STAT("halt_successful_poll", halt_successful_poll), + VCPU_STAT("halt_attempted_poll", halt_attempted_poll), + VCPU_STAT("halt_poll_invalid", halt_poll_invalid), + VCPU_STAT("halt_no_poll_steal", halt_no_poll_steal), + VCPU_STAT("halt_wakeup", halt_wakeup), + VCPU_STAT("instruction_lctlg", instruction_lctlg), + VCPU_STAT("instruction_lctl", instruction_lctl), + VCPU_STAT("instruction_stctl", instruction_stctl), + VCPU_STAT("instruction_stctg", instruction_stctg), + VCPU_STAT("deliver_ckc", deliver_ckc), + VCPU_STAT("deliver_cputm", deliver_cputm), + VCPU_STAT("deliver_emergency_signal", deliver_emergency_signal), + VCPU_STAT("deliver_external_call", deliver_external_call), + VCPU_STAT("deliver_service_signal", deliver_service_signal), + VCPU_STAT("deliver_virtio", deliver_virtio), + VCPU_STAT("deliver_stop_signal", deliver_stop_signal), + VCPU_STAT("deliver_prefix_signal", deliver_prefix_signal), + VCPU_STAT("deliver_restart_signal", deliver_restart_signal), + VCPU_STAT("deliver_program", deliver_program), + VCPU_STAT("deliver_io", deliver_io), + VCPU_STAT("deliver_machine_check", deliver_machine_check), + VCPU_STAT("exit_wait_state", exit_wait_state), + VCPU_STAT("inject_ckc", inject_ckc), + VCPU_STAT("inject_cputm", inject_cputm), + VCPU_STAT("inject_external_call", inject_external_call), + VM_STAT("inject_float_mchk", inject_float_mchk), + VCPU_STAT("inject_emergency_signal", inject_emergency_signal), + VM_STAT("inject_io", inject_io), + VCPU_STAT("inject_mchk", inject_mchk), + VM_STAT("inject_pfault_done", inject_pfault_done), + VCPU_STAT("inject_program", inject_program), + VCPU_STAT("inject_restart", inject_restart), + VM_STAT("inject_service_signal", inject_service_signal), + VCPU_STAT("inject_set_prefix", inject_set_prefix), + VCPU_STAT("inject_stop_signal", inject_stop_signal), + VCPU_STAT("inject_pfault_init", inject_pfault_init), + VM_STAT("inject_virtio", inject_virtio), + VCPU_STAT("instruction_epsw", instruction_epsw), + VCPU_STAT("instruction_gs", instruction_gs), + VCPU_STAT("instruction_io_other", instruction_io_other), + VCPU_STAT("instruction_lpsw", instruction_lpsw), + VCPU_STAT("instruction_lpswe", instruction_lpswe), + VCPU_STAT("instruction_pfmf", instruction_pfmf), + VCPU_STAT("instruction_ptff", instruction_ptff), + 
VCPU_STAT("instruction_stidp", instruction_stidp), + VCPU_STAT("instruction_sck", instruction_sck), + VCPU_STAT("instruction_sckpf", instruction_sckpf), + VCPU_STAT("instruction_spx", instruction_spx), + VCPU_STAT("instruction_stpx", instruction_stpx), + VCPU_STAT("instruction_stap", instruction_stap), + VCPU_STAT("instruction_iske", instruction_iske), + VCPU_STAT("instruction_ri", instruction_ri), + VCPU_STAT("instruction_rrbe", instruction_rrbe), + VCPU_STAT("instruction_sske", instruction_sske), + VCPU_STAT("instruction_ipte_interlock", instruction_ipte_interlock), + VCPU_STAT("instruction_essa", instruction_essa), + VCPU_STAT("instruction_stsi", instruction_stsi), + VCPU_STAT("instruction_stfl", instruction_stfl), + VCPU_STAT("instruction_tb", instruction_tb), + VCPU_STAT("instruction_tpi", instruction_tpi), + VCPU_STAT("instruction_tprot", instruction_tprot), + VCPU_STAT("instruction_tsch", instruction_tsch), + VCPU_STAT("instruction_sthyi", instruction_sthyi), + VCPU_STAT("instruction_sie", instruction_sie), + VCPU_STAT("instruction_sigp_sense", instruction_sigp_sense), + VCPU_STAT("instruction_sigp_sense_running", instruction_sigp_sense_running), + VCPU_STAT("instruction_sigp_external_call", instruction_sigp_external_call), + VCPU_STAT("instruction_sigp_emergency", instruction_sigp_emergency), + VCPU_STAT("instruction_sigp_cond_emergency", instruction_sigp_cond_emergency), + VCPU_STAT("instruction_sigp_start", instruction_sigp_start), + VCPU_STAT("instruction_sigp_stop", instruction_sigp_stop), + VCPU_STAT("instruction_sigp_stop_store_status", instruction_sigp_stop_store_status), + VCPU_STAT("instruction_sigp_store_status", instruction_sigp_store_status), + VCPU_STAT("instruction_sigp_store_adtl_status", instruction_sigp_store_adtl_status), + VCPU_STAT("instruction_sigp_set_arch", instruction_sigp_arch), + VCPU_STAT("instruction_sigp_set_prefix", instruction_sigp_prefix), + VCPU_STAT("instruction_sigp_restart", instruction_sigp_restart), + VCPU_STAT("instruction_sigp_cpu_reset", instruction_sigp_cpu_reset), + VCPU_STAT("instruction_sigp_init_cpu_reset", instruction_sigp_init_cpu_reset), + VCPU_STAT("instruction_sigp_unknown", instruction_sigp_unknown), + VCPU_STAT("instruction_diag_10", diagnose_10), + VCPU_STAT("instruction_diag_44", diagnose_44), + VCPU_STAT("instruction_diag_9c", diagnose_9c), + VCPU_STAT("diag_9c_ignored", diagnose_9c_ignored), + VCPU_STAT("instruction_diag_258", diagnose_258), + VCPU_STAT("instruction_diag_308", diagnose_308), + VCPU_STAT("instruction_diag_500", diagnose_500), + VCPU_STAT("instruction_diag_other", diagnose_other), { NULL } }; diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index de77bc9bd0d7..8104dc9ff5b2 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -97,9 +97,6 @@ static u64 __read_mostly efer_reserved_bits = ~((u64)EFER_SCE); static u64 __read_mostly cr4_reserved_bits = CR4_RESERVED_BITS; -#define VM_STAT(x, ...) offsetof(struct kvm, stat.x), KVM_STAT_VM, ## __VA_ARGS__ -#define VCPU_STAT(x, ...) 
offsetof(struct kvm_vcpu, stat.x), KVM_STAT_VCPU, ## __VA_ARGS__ - #define KVM_X2APIC_API_VALID_FLAGS (KVM_X2APIC_API_USE_32BIT_IDS | \ KVM_X2APIC_API_DISABLE_BROADCAST_QUIRK) @@ -194,45 +191,44 @@ u64 __read_mostly supported_xss; EXPORT_SYMBOL_GPL(supported_xss); struct kvm_stats_debugfs_item debugfs_entries[] = { - { "pf_fixed", VCPU_STAT(pf_fixed) }, - { "pf_guest", VCPU_STAT(pf_guest) }, - { "tlb_flush", VCPU_STAT(tlb_flush) }, - { "invlpg", VCPU_STAT(invlpg) }, - { "exits", VCPU_STAT(exits) }, - { "io_exits", VCPU_STAT(io_exits) }, - { "mmio_exits", VCPU_STAT(mmio_exits) }, - { "signal_exits", VCPU_STAT(signal_exits) }, - { "irq_window", VCPU_STAT(irq_window_exits) }, - { "nmi_window", VCPU_STAT(nmi_window_exits) }, - { "halt_exits", VCPU_STAT(halt_exits) }, - { "halt_successful_poll", VCPU_STAT(halt_successful_poll) }, - { "halt_attempted_poll", VCPU_STAT(halt_attempted_poll) }, - { "halt_poll_invalid", VCPU_STAT(halt_poll_invalid) }, - { "halt_wakeup", VCPU_STAT(halt_wakeup) }, - { "hypercalls", VCPU_STAT(hypercalls) }, - { "request_irq", VCPU_STAT(request_irq_exits) }, - { "irq_exits", VCPU_STAT(irq_exits) }, - { "host_state_reload", VCPU_STAT(host_state_reload) }, - { "fpu_reload", VCPU_STAT(fpu_reload) }, - { "insn_emulation", VCPU_STAT(insn_emulation) }, - { "insn_emulation_fail", VCPU_STAT(insn_emulation_fail) }, - { "irq_injections", VCPU_STAT(irq_injections) }, - { "nmi_injections", VCPU_STAT(nmi_injections) }, - { "req_event", VCPU_STAT(req_event) }, - { "l1d_flush", VCPU_STAT(l1d_flush) }, - { "mmu_shadow_zapped", VM_STAT(mmu_shadow_zapped) }, - { "mmu_pte_write", VM_STAT(mmu_pte_write) }, - { "mmu_pte_updated", VM_STAT(mmu_pte_updated) }, - { "mmu_pde_zapped", VM_STAT(mmu_pde_zapped) }, - { "mmu_flooded", VM_STAT(mmu_flooded) }, - { "mmu_recycled", VM_STAT(mmu_recycled) }, - { "mmu_cache_miss", VM_STAT(mmu_cache_miss) }, - { "mmu_unsync", VM_STAT(mmu_unsync) }, - { "remote_tlb_flush", VM_STAT(remote_tlb_flush) }, - { "largepages", VM_STAT(lpages, .mode = 0444) }, - { "nx_largepages_splitted", VM_STAT(nx_lpage_splits, .mode = 0444) }, - { "max_mmu_page_hash_collisions", - VM_STAT(max_mmu_page_hash_collisions) }, + VCPU_STAT("pf_fixed", pf_fixed), + VCPU_STAT("pf_guest", pf_guest), + VCPU_STAT("tlb_flush", tlb_flush), + VCPU_STAT("invlpg", invlpg), + VCPU_STAT("exits", exits), + VCPU_STAT("io_exits", io_exits), + VCPU_STAT("mmio_exits", mmio_exits), + VCPU_STAT("signal_exits", signal_exits), + VCPU_STAT("irq_window", irq_window_exits), + VCPU_STAT("nmi_window", nmi_window_exits), + VCPU_STAT("halt_exits", halt_exits), + VCPU_STAT("halt_successful_poll", halt_successful_poll), + VCPU_STAT("halt_attempted_poll", halt_attempted_poll), + VCPU_STAT("halt_poll_invalid", halt_poll_invalid), + VCPU_STAT("halt_wakeup", halt_wakeup), + VCPU_STAT("hypercalls", hypercalls), + VCPU_STAT("request_irq", request_irq_exits), + VCPU_STAT("irq_exits", irq_exits), + VCPU_STAT("host_state_reload", host_state_reload), + VCPU_STAT("fpu_reload", fpu_reload), + VCPU_STAT("insn_emulation", insn_emulation), + VCPU_STAT("insn_emulation_fail", insn_emulation_fail), + VCPU_STAT("irq_injections", irq_injections), + VCPU_STAT("nmi_injections", nmi_injections), + VCPU_STAT("req_event", req_event), + VCPU_STAT("l1d_flush", l1d_flush), + VM_STAT("mmu_shadow_zapped", mmu_shadow_zapped), + VM_STAT("mmu_pte_write", mmu_pte_write), + VM_STAT("mmu_pte_updated", mmu_pte_updated), + VM_STAT("mmu_pde_zapped", mmu_pde_zapped), + VM_STAT("mmu_flooded", mmu_flooded), + VM_STAT("mmu_recycled", mmu_recycled), + 
VM_STAT("mmu_cache_miss", mmu_cache_miss), + VM_STAT("mmu_unsync", mmu_unsync), + VM_STAT("remote_tlb_flush", remote_tlb_flush), + VM_STAT("largepages", lpages, .mode = 0444), + VM_STAT("nx_largepages_splitted", nx_lpage_splits, .mode = 0444), + VM_STAT("max_mmu_page_hash_collisions", max_mmu_page_hash_collisions), { NULL } }; diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index 01276e3d01b9..658215f6102c 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -1130,6 +1130,11 @@ struct kvm_stats_debugfs_item { #define KVM_DBGFS_GET_MODE(dbgfs_item) \ ((dbgfs_item)->mode ? (dbgfs_item)->mode : 0644) +#define VM_STAT(n, x, ...) \ + { n, offsetof(struct kvm, stat.x), KVM_STAT_VM, ## __VA_ARGS__ } +#define VCPU_STAT(n, x, ...) \ + { n, offsetof(struct kvm_vcpu, stat.x), KVM_STAT_VCPU, ## __VA_ARGS__ } + extern struct kvm_stats_debugfs_item debugfs_entries[]; extern struct dentry *kvm_debugfs_dir; -- cgit v1.2.3 From c36b71503a2268206ebeda6697094ffb4e7e94c2 Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Thu, 16 Apr 2020 09:48:07 -0400 Subject: KVM: x86/mmu: Avoid an extra memslot lookup in try_async_pf() for L2 Create a new function kvm_is_visible_memslot() and use it from kvm_is_visible_gfn(); use the new function in try_async_pf() too, to avoid an extra memslot lookup. Opportunistically squish a multi-line comment into a single-line comment. Note, the end result, KVM_PFN_NOSLOT, is unchanged. Cc: Jim Mattson Cc: Rick Edgecombe Suggested-by: Sean Christopherson Signed-off-by: Paolo Bonzini --- arch/x86/kvm/mmu/mmu.c | 9 +++------ include/linux/kvm_host.h | 6 ++++++ virt/kvm/kvm_main.c | 6 +----- 3 files changed, 10 insertions(+), 11 deletions(-) (limited to 'include/linux') diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c index b67046fa0ac4..e618472c572b 100644 --- a/arch/x86/kvm/mmu/mmu.c +++ b/arch/x86/kvm/mmu/mmu.c @@ -4082,19 +4082,16 @@ static bool try_async_pf(struct kvm_vcpu *vcpu, bool prefault, gfn_t gfn, gpa_t cr2_or_gpa, kvm_pfn_t *pfn, bool write, bool *writable) { - struct kvm_memory_slot *slot; + struct kvm_memory_slot *slot = kvm_vcpu_gfn_to_memslot(vcpu, gfn); bool async; - /* - * Don't expose private memslots to L2. - */ - if (is_guest_mode(vcpu) && !kvm_is_visible_gfn(vcpu->kvm, gfn)) { + /* Don't expose private memslots to L2. 
*/ + if (is_guest_mode(vcpu) && !kvm_is_visible_memslot(slot)) { *pfn = KVM_PFN_NOSLOT; *writable = false; return false; } - slot = kvm_vcpu_gfn_to_memslot(vcpu, gfn); async = false; *pfn = __gfn_to_pfn_memslot(slot, gfn, false, &async, write, writable); if (!async) diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index 658215f6102c..7d4f1eb70274 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -1357,6 +1357,12 @@ static inline void kvm_vcpu_set_dy_eligible(struct kvm_vcpu *vcpu, bool val) } #endif /* CONFIG_HAVE_KVM_CPU_RELAX_INTERCEPT */ +static inline bool kvm_is_visible_memslot(struct kvm_memory_slot *memslot) +{ + return (memslot && memslot->id < KVM_USER_MEM_SLOTS && + !(memslot->flags & KVM_MEMSLOT_INVALID)); +} + struct kvm_vcpu *kvm_get_running_vcpu(void); struct kvm_vcpu * __percpu *kvm_get_running_vcpus(void); diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index da8fd45e0e3e..8aa577db131e 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -1607,11 +1607,7 @@ bool kvm_is_visible_gfn(struct kvm *kvm, gfn_t gfn) { struct kvm_memory_slot *memslot = gfn_to_memslot(kvm, gfn); - if (!memslot || memslot->id >= KVM_USER_MEM_SLOTS || - memslot->flags & KVM_MEMSLOT_INVALID) - return false; - - return true; + return kvm_is_visible_memslot(memslot); } EXPORT_SYMBOL_GPL(kvm_is_visible_gfn); -- cgit v1.2.3 From 1b94f6f81007b4afaea3480ec018bc9236148961 Mon Sep 17 00:00:00 2001 From: Tianjia Zhang Date: Thu, 16 Apr 2020 13:10:57 +0800 Subject: KVM: Remove redundant argument to kvm_arch_vcpu_ioctl_run In earlier versions of kvm, 'kvm_run' was an independent structure and was not included in the vcpu structure. At present, 'kvm_run' is already included in the vcpu structure, so the parameter 'kvm_run' is redundant. This patch simplifies the function definition, removes the extra 'kvm_run' parameter, and extracts it from the 'kvm_vcpu' structure if necessary. 
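As a minimal, self-contained analogue of this refactoring (hypothetical types, not KVM code): when one argument is always reachable from another, the extra parameter can be dropped and derived at the top of the function instead:

    struct run_state { int exit_reason; };
    struct vcpu_ctx { struct run_state *run; };

    /* Before: static int handle_run(struct vcpu_ctx *v, struct run_state *run); */
    static int handle_run(struct vcpu_ctx *v)
    {
            struct run_state *run = v->run; /* fetch from the owning struct */

            run->exit_reason = 0;
            return 0;
    }

Callers then shrink from handle_run(v, v->run) to handle_run(v), mirroring the kvm_arch_vcpu_ioctl_run() changes below.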
Signed-off-by: Tianjia Zhang Message-Id: <20200416051057.26526-1-tianjia.zhang@linux.alibaba.com> Signed-off-by: Paolo Bonzini --- arch/mips/kvm/mips.c | 3 ++- arch/powerpc/kvm/powerpc.c | 3 ++- arch/s390/kvm/kvm-s390.c | 3 ++- arch/x86/kvm/x86.c | 11 ++++++----- include/linux/kvm_host.h | 2 +- virt/kvm/arm/arm.c | 6 +++--- virt/kvm/kvm_main.c | 2 +- 7 files changed, 17 insertions(+), 13 deletions(-) (limited to 'include/linux') diff --git a/arch/mips/kvm/mips.c b/arch/mips/kvm/mips.c index fdf1c14d9205..9f50ceef9978 100644 --- a/arch/mips/kvm/mips.c +++ b/arch/mips/kvm/mips.c @@ -438,8 +438,9 @@ int kvm_arch_vcpu_ioctl_set_guest_debug(struct kvm_vcpu *vcpu, return -ENOIOCTLCMD; } -int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *run) +int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu) { + struct kvm_run *run = vcpu->run; int r = -EINTR; vcpu_load(vcpu); diff --git a/arch/powerpc/kvm/powerpc.c b/arch/powerpc/kvm/powerpc.c index e15166b0a16d..7e24691e138a 100644 --- a/arch/powerpc/kvm/powerpc.c +++ b/arch/powerpc/kvm/powerpc.c @@ -1764,8 +1764,9 @@ int kvm_vcpu_ioctl_set_one_reg(struct kvm_vcpu *vcpu, struct kvm_one_reg *reg) return r; } -int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *run) +int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu) { + struct kvm_run *run = vcpu->run; int r; vcpu_load(vcpu); diff --git a/arch/s390/kvm/kvm-s390.c b/arch/s390/kvm/kvm-s390.c index 5307929a6c89..75471b646fd7 100644 --- a/arch/s390/kvm/kvm-s390.c +++ b/arch/s390/kvm/kvm-s390.c @@ -4333,8 +4333,9 @@ static void store_regs(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) store_regs_fmt2(vcpu, kvm_run); } -int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) +int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu) { + struct kvm_run *kvm_run = vcpu->run; int rc; if (kvm_run->immediate_exit) diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 8d9bcd5faac5..59958ce2b681 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -8710,8 +8710,9 @@ static void kvm_put_guest_fpu(struct kvm_vcpu *vcpu) trace_kvm_fpu(0); } -int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) +int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu) { + struct kvm_run *kvm_run = vcpu->run; int r; vcpu_load(vcpu); @@ -8729,18 +8730,18 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) r = -EAGAIN; if (signal_pending(current)) { r = -EINTR; - vcpu->run->exit_reason = KVM_EXIT_INTR; + kvm_run->exit_reason = KVM_EXIT_INTR; ++vcpu->stat.signal_exits; } goto out; } - if (vcpu->run->kvm_valid_regs & ~KVM_SYNC_X86_VALID_FIELDS) { + if (kvm_run->kvm_valid_regs & ~KVM_SYNC_X86_VALID_FIELDS) { r = -EINVAL; goto out; } - if (vcpu->run->kvm_dirty_regs) { + if (kvm_run->kvm_dirty_regs) { r = sync_regs(vcpu); if (r != 0) goto out; @@ -8770,7 +8771,7 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) out: kvm_put_guest_fpu(vcpu); - if (vcpu->run->kvm_valid_regs) + if (kvm_run->kvm_valid_regs) store_regs(vcpu); post_kvm_run_save(vcpu); kvm_sigset_deactivate(vcpu); diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index 7d4f1eb70274..5285a5568208 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -866,7 +866,7 @@ int kvm_arch_vcpu_ioctl_set_mpstate(struct kvm_vcpu *vcpu, struct kvm_mp_state *mp_state); int kvm_arch_vcpu_ioctl_set_guest_debug(struct kvm_vcpu *vcpu, struct kvm_guest_debug *dbg); -int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run); +int 
kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu); int kvm_arch_init(void *opaque); void kvm_arch_exit(void); diff --git a/virt/kvm/arm/arm.c b/virt/kvm/arm/arm.c index 48d0ec44ad77..f5390ac2165b 100644 --- a/virt/kvm/arm/arm.c +++ b/virt/kvm/arm/arm.c @@ -639,7 +639,6 @@ static void check_vcpu_requests(struct kvm_vcpu *vcpu) /** * kvm_arch_vcpu_ioctl_run - the main VCPU run function to execute guest code * @vcpu: The VCPU pointer - * @run: The kvm_run structure pointer used for userspace state exchange * * This function is called through the VCPU_RUN ioctl called from user space. It * will execute VM code in a loop until the time slice for the process is used @@ -647,8 +646,9 @@ static void check_vcpu_requests(struct kvm_vcpu *vcpu) * return with return value 0 and with the kvm_run structure filled in with the * required data for the requested emulation. */ -int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *run) +int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu) { + struct kvm_run *run = vcpu->run; int ret; if (unlikely(!kvm_vcpu_initialized(vcpu))) @@ -659,7 +659,7 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *run) return ret; if (run->exit_reason == KVM_EXIT_MMIO) { - ret = kvm_handle_mmio_return(vcpu, vcpu->run); + ret = kvm_handle_mmio_return(vcpu, run); if (ret) return ret; } diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index 8aa577db131e..e2f60e313c87 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -3129,7 +3129,7 @@ static long kvm_vcpu_ioctl(struct file *filp, synchronize_rcu(); put_pid(oldpid); } - r = kvm_arch_vcpu_ioctl_run(vcpu, vcpu->run); + r = kvm_arch_vcpu_ioctl_run(vcpu); trace_kvm_userspace_exit(vcpu->run->exit_reason, r); break; } -- cgit v1.2.3 From 14bbe3e33710be52f21d61253a94c5f44a696d02 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Wed, 1 Apr 2020 10:33:43 -0700 Subject: docs: Add rbtree documentation to the core-api This file is close enough to being in rst format that I didn't feel the need to alter it in any way. Signed-off-by: Matthew Wilcox (Oracle) Acked-by: Michel Lespinasse Link: https://lore.kernel.org/r/20200401173343.17472-1-willy@infradead.org Signed-off-by: Jonathan Corbet --- Documentation/core-api/index.rst | 1 + Documentation/core-api/rbtree.rst | 429 +++++++++++++++++++++++++++++++++ Documentation/rbtree.txt | 429 --------------------------------- include/linux/rbtree.h | 2 +- include/linux/rbtree_augmented.h | 2 +- lib/Kconfig | 2 +- tools/include/linux/rbtree.h | 2 +- tools/include/linux/rbtree_augmented.h | 2 +- 8 files changed, 435 insertions(+), 434 deletions(-) create mode 100644 Documentation/core-api/rbtree.rst delete mode 100644 Documentation/rbtree.txt (limited to 'include/linux') diff --git a/Documentation/core-api/index.rst b/Documentation/core-api/index.rst index 49e3da910d9e..b29c4a07beda 100644 --- a/Documentation/core-api/index.rst +++ b/Documentation/core-api/index.rst @@ -35,6 +35,7 @@ Library functionality that is used throughout the kernel. xarray idr circular-buffers + rbtree generic-radix-tree packing timekeeping diff --git a/Documentation/core-api/rbtree.rst b/Documentation/core-api/rbtree.rst new file mode 100644 index 000000000000..523d54b60087 --- /dev/null +++ b/Documentation/core-api/rbtree.rst @@ -0,0 +1,429 @@ +================================= +Red-black Trees (rbtree) in Linux +================================= + + +:Date: January 18, 2007 +:Author: Rob Landley + +What are red-black trees, and what are they for? 
+------------------------------------------------ + +Red-black trees are a type of self-balancing binary search tree, used for +storing sortable key/value data pairs. This differs from radix trees (which +are used to efficiently store sparse arrays and thus use long integer indexes +to insert/access/delete nodes) and hash tables (which are not kept sorted to +be easily traversed in order, and must be tuned for a specific size and +hash function where rbtrees scale gracefully storing arbitrary keys). + +Red-black trees are similar to AVL trees, but provide faster real-time bounded +worst case performance for insertion and deletion (at most two rotations and +three rotations, respectively, to balance the tree), with slightly slower +(but still O(log n)) lookup time. + +To quote Linux Weekly News: + + There are a number of red-black trees in use in the kernel. + The deadline and CFQ I/O schedulers employ rbtrees to + track requests; the packet CD/DVD driver does the same. + The high-resolution timer code uses an rbtree to organize outstanding + timer requests. The ext3 filesystem tracks directory entries in a + red-black tree. Virtual memory areas (VMAs) are tracked with red-black + trees, as are epoll file descriptors, cryptographic keys, and network + packets in the "hierarchical token bucket" scheduler. + +This document covers use of the Linux rbtree implementation. For more +information on the nature and implementation of Red Black Trees, see: + + Linux Weekly News article on red-black trees + http://lwn.net/Articles/184495/ + + Wikipedia entry on red-black trees + http://en.wikipedia.org/wiki/Red-black_tree + +Linux implementation of red-black trees +--------------------------------------- + +Linux's rbtree implementation lives in the file "lib/rbtree.c". To use it, +"#include ". + +The Linux rbtree implementation is optimized for speed, and thus has one +less layer of indirection (and better cache locality) than more traditional +tree implementations. Instead of using pointers to separate rb_node and data +structures, each instance of struct rb_node is embedded in the data structure +it organizes. And instead of using a comparison callback function pointer, +users are expected to write their own tree search and insert functions +which call the provided rbtree functions. Locking is also left up to the +user of the rbtree code. + +Creating a new rbtree +--------------------- + +Data nodes in an rbtree tree are structures containing a struct rb_node member:: + + struct mytype { + struct rb_node node; + char *keystring; + }; + +When dealing with a pointer to the embedded struct rb_node, the containing data +structure may be accessed with the standard container_of() macro. In addition, +individual members may be accessed directly via rb_entry(node, type, member). + +At the root of each rbtree is an rb_root structure, which is initialized to be +empty via: + + struct rb_root mytree = RB_ROOT; + +Searching for a value in an rbtree +---------------------------------- + +Writing a search function for your tree is fairly straightforward: start at the +root, compare each value, and follow the left or right branch as necessary. 
+ +Example:: + + struct mytype *my_search(struct rb_root *root, char *string) + { + struct rb_node *node = root->rb_node; + + while (node) { + struct mytype *data = container_of(node, struct mytype, node); + int result; + + result = strcmp(string, data->keystring); + + if (result < 0) + node = node->rb_left; + else if (result > 0) + node = node->rb_right; + else + return data; + } + return NULL; + } + +Inserting data into an rbtree +----------------------------- + +Inserting data in the tree involves first searching for the place to insert the +new node, then inserting the node and rebalancing ("recoloring") the tree. + +The search for insertion differs from the previous search by finding the +location of the pointer on which to graft the new node. The new node also +needs a link to its parent node for rebalancing purposes. + +Example:: + + int my_insert(struct rb_root *root, struct mytype *data) + { + struct rb_node **new = &(root->rb_node), *parent = NULL; + + /* Figure out where to put new node */ + while (*new) { + struct mytype *this = container_of(*new, struct mytype, node); + int result = strcmp(data->keystring, this->keystring); + + parent = *new; + if (result < 0) + new = &((*new)->rb_left); + else if (result > 0) + new = &((*new)->rb_right); + else + return FALSE; + } + + /* Add new node and rebalance tree. */ + rb_link_node(&data->node, parent, new); + rb_insert_color(&data->node, root); + + return TRUE; + } + +Removing or replacing existing data in an rbtree +------------------------------------------------ + +To remove an existing node from a tree, call:: + + void rb_erase(struct rb_node *victim, struct rb_root *tree); + +Example:: + + struct mytype *data = mysearch(&mytree, "walrus"); + + if (data) { + rb_erase(&data->node, &mytree); + myfree(data); + } + +To replace an existing node in a tree with a new one with the same key, call:: + + void rb_replace_node(struct rb_node *old, struct rb_node *new, + struct rb_root *tree); + +Replacing a node this way does not re-sort the tree: If the new node doesn't +have the same key as the old node, the rbtree will probably become corrupted. + +Iterating through the elements stored in an rbtree (in sort order) +------------------------------------------------------------------ + +Four functions are provided for iterating through an rbtree's contents in +sorted order. These work on arbitrary trees, and should not need to be +modified or wrapped (except for locking purposes):: + + struct rb_node *rb_first(struct rb_root *tree); + struct rb_node *rb_last(struct rb_root *tree); + struct rb_node *rb_next(struct rb_node *node); + struct rb_node *rb_prev(struct rb_node *node); + +To start iterating, call rb_first() or rb_last() with a pointer to the root +of the tree, which will return a pointer to the node structure contained in +the first or last element in the tree. To continue, fetch the next or previous +node by calling rb_next() or rb_prev() on the current node. This will return +NULL when there are no more nodes left. + +The iterator functions return a pointer to the embedded struct rb_node, from +which the containing data structure may be accessed with the container_of() +macro, and individual members may be accessed directly via +rb_entry(node, type, member). 
+ +Example:: + + struct rb_node *node; + for (node = rb_first(&mytree); node; node = rb_next(node)) + printk("key=%s\n", rb_entry(node, struct mytype, node)->keystring); + +Cached rbtrees +-------------- + +Computing the leftmost (smallest) node is quite a common task for binary +search trees, such as for traversals or users relying on the particular +order for their own logic. To this end, users can use 'struct rb_root_cached' +to optimize O(logN) rb_first() calls to a simple pointer fetch avoiding +potentially expensive tree iterations. This is done at negligible runtime +overhead for maintenance, albeit with a larger memory footprint. + +Similar to the rb_root structure, cached rbtrees are initialized to be +empty via:: + + struct rb_root_cached mytree = RB_ROOT_CACHED; + +Cached rbtree is simply a regular rb_root with an extra pointer to cache the +leftmost node. This allows rb_root_cached to exist wherever rb_root does, +which permits augmented trees to be supported with only a few extra +interfaces:: + + struct rb_node *rb_first_cached(struct rb_root_cached *tree); + void rb_insert_color_cached(struct rb_node *, struct rb_root_cached *, bool); + void rb_erase_cached(struct rb_node *node, struct rb_root_cached *); + +Both insert and erase calls have their respective counterparts for augmented +trees:: + + void rb_insert_augmented_cached(struct rb_node *node, struct rb_root_cached *, + bool, struct rb_augment_callbacks *); + void rb_erase_augmented_cached(struct rb_node *, struct rb_root_cached *, + struct rb_augment_callbacks *); + + +Support for Augmented rbtrees +----------------------------- + +Augmented rbtree is an rbtree with "some" additional data stored in +each node, where the additional data for node N must be a function of +the contents of all nodes in the subtree rooted at N. This data can +be used to add new functionality to the rbtree. Augmented rbtree +is an optional feature built on top of basic rbtree infrastructure. +An rbtree user who wants this feature will have to call the augmentation +functions with the user provided augmentation callback when inserting +and erasing nodes. + +C files implementing augmented rbtree manipulation must include + instead of . Note that +linux/rbtree_augmented.h exposes some rbtree implementation details +you are not expected to rely on; please stick to the documented APIs +there and do not include from header files +either so as to minimize chances of your users accidentally relying on +such implementation details. + +On insertion, the user must update the augmented information on the path +leading to the inserted node, then call rb_link_node() as usual and +rb_augment_inserted() instead of the usual rb_insert_color() call. +If rb_augment_inserted() rebalances the rbtree, it will call back into +a user provided function to update the augmented information on the +affected subtrees. + +When erasing a node, the user must call rb_erase_augmented() instead of +rb_erase(). rb_erase_augmented() calls back into user provided functions +to update the augmented information on affected subtrees. + +In both cases, the callbacks are provided through struct rb_augment_callbacks. +3 callbacks must be defined: + +- A propagation callback, which updates the augmented value for a given + node and its ancestors, up to a given stop point (or NULL to update + all the way to the root). + +- A copy callback, which copies the augmented value for a given subtree + to a newly assigned subtree root.
+ +- A tree rotation callback, which copies the augmented value for a given + subtree to a newly assigned subtree root AND recomputes the augmented + information for the former subtree root. + +The compiled code for rb_erase_augmented() may inline the propagation and +copy callbacks, which results in a large function, so each augmented rbtree +user should have a single rb_erase_augmented() call site in order to limit +compiled code size. + + +Sample usage +^^^^^^^^^^^^ + +Interval tree is an example of an augmented rbtree. Reference - +"Introduction to Algorithms" by Cormen, Leiserson, Rivest and Stein. +More details about interval trees: + +Classical rbtree has a single key and it cannot be directly used to store +interval ranges like [lo:hi] and do a quick lookup for any overlap with a new +lo:hi or to find whether there is an exact match for a new lo:hi. + +However, rbtree can be augmented to store such interval ranges in a structured +way making it possible to do efficient lookup and exact match. + +This "extra information" stored in each node is the maximum hi +(max_hi) value among all the nodes that are its descendants. This +information can be maintained at each node just by looking at the node +and its immediate children. And this will be used in O(log n) lookup +for lowest match (lowest start address among all possible matches) +with something like:: + + struct interval_tree_node * + interval_tree_first_match(struct rb_root *root, + unsigned long start, unsigned long last) + { + struct interval_tree_node *node; + + if (!root->rb_node) + return NULL; + node = rb_entry(root->rb_node, struct interval_tree_node, rb); + + while (true) { + if (node->rb.rb_left) { + struct interval_tree_node *left = + rb_entry(node->rb.rb_left, + struct interval_tree_node, rb); + if (left->__subtree_last >= start) { + /* + * Some nodes in left subtree satisfy Cond2. + * Iterate to find the leftmost such node N. + * If it also satisfies Cond1, that's the match + * we are looking for. Otherwise, there is no + * matching interval as nodes to the right of N + * can't satisfy Cond1 either.
+ */ + node = left; + continue; + } + } + if (node->start <= last) { /* Cond1 */ + if (node->last >= start) /* Cond2 */ + return node; /* node is leftmost match */ + if (node->rb.rb_right) { + node = rb_entry(node->rb.rb_right, + struct interval_tree_node, rb); + if (node->__subtree_last >= start) + continue; + } + } + return NULL; /* No match */ + } + } + +Insertion/removal are defined using the following augmented callbacks:: + + static inline unsigned long + compute_subtree_last(struct interval_tree_node *node) + { + unsigned long max = node->last, subtree_last; + if (node->rb.rb_left) { + subtree_last = rb_entry(node->rb.rb_left, + struct interval_tree_node, rb)->__subtree_last; + if (max < subtree_last) + max = subtree_last; + } + if (node->rb.rb_right) { + subtree_last = rb_entry(node->rb.rb_right, + struct interval_tree_node, rb)->__subtree_last; + if (max < subtree_last) + max = subtree_last; + } + return max; + } + + static void augment_propagate(struct rb_node *rb, struct rb_node *stop) + { + while (rb != stop) { + struct interval_tree_node *node = + rb_entry(rb, struct interval_tree_node, rb); + unsigned long subtree_last = compute_subtree_last(node); + if (node->__subtree_last == subtree_last) + break; + node->__subtree_last = subtree_last; + rb = rb_parent(&node->rb); + } + } + + static void augment_copy(struct rb_node *rb_old, struct rb_node *rb_new) + { + struct interval_tree_node *old = + rb_entry(rb_old, struct interval_tree_node, rb); + struct interval_tree_node *new = + rb_entry(rb_new, struct interval_tree_node, rb); + + new->__subtree_last = old->__subtree_last; + } + + static void augment_rotate(struct rb_node *rb_old, struct rb_node *rb_new) + { + struct interval_tree_node *old = + rb_entry(rb_old, struct interval_tree_node, rb); + struct interval_tree_node *new = + rb_entry(rb_new, struct interval_tree_node, rb); + + new->__subtree_last = old->__subtree_last; + old->__subtree_last = compute_subtree_last(old); + } + + static const struct rb_augment_callbacks augment_callbacks = { + augment_propagate, augment_copy, augment_rotate + }; + + void interval_tree_insert(struct interval_tree_node *node, + struct rb_root *root) + { + struct rb_node **link = &root->rb_node, *rb_parent = NULL; + unsigned long start = node->start, last = node->last; + struct interval_tree_node *parent; + + while (*link) { + rb_parent = *link; + parent = rb_entry(rb_parent, struct interval_tree_node, rb); + if (parent->__subtree_last < last) + parent->__subtree_last = last; + if (start < parent->start) + link = &parent->rb.rb_left; + else + link = &parent->rb.rb_right; + } + + node->__subtree_last = last; + rb_link_node(&node->rb, rb_parent, link); + rb_insert_augmented(&node->rb, root, &augment_callbacks); + } + + void interval_tree_remove(struct interval_tree_node *node, + struct rb_root *root) + { + rb_erase_augmented(&node->rb, root, &augment_callbacks); + } diff --git a/Documentation/rbtree.txt b/Documentation/rbtree.txt deleted file mode 100644 index 523d54b60087..000000000000 --- a/Documentation/rbtree.txt +++ /dev/null @@ -1,429 +0,0 @@ -================================= -Red-black Trees (rbtree) in Linux -================================= - - -:Date: January 18, 2007 -:Author: Rob Landley - -What are red-black trees, and what are they for? ------------------------------------------------- - -Red-black trees are a type of self-balancing binary search tree, used for -storing sortable key/value data pairs. 
This differs from radix trees (which -are used to efficiently store sparse arrays and thus use long integer indexes -to insert/access/delete nodes) and hash tables (which are not kept sorted to -be easily traversed in order, and must be tuned for a specific size and -hash function where rbtrees scale gracefully storing arbitrary keys). - -Red-black trees are similar to AVL trees, but provide faster real-time bounded -worst case performance for insertion and deletion (at most two rotations and -three rotations, respectively, to balance the tree), with slightly slower -(but still O(log n)) lookup time. - -To quote Linux Weekly News: - - There are a number of red-black trees in use in the kernel. - The deadline and CFQ I/O schedulers employ rbtrees to - track requests; the packet CD/DVD driver does the same. - The high-resolution timer code uses an rbtree to organize outstanding - timer requests. The ext3 filesystem tracks directory entries in a - red-black tree. Virtual memory areas (VMAs) are tracked with red-black - trees, as are epoll file descriptors, cryptographic keys, and network - packets in the "hierarchical token bucket" scheduler. - -This document covers use of the Linux rbtree implementation. For more -information on the nature and implementation of Red Black Trees, see: - - Linux Weekly News article on red-black trees - http://lwn.net/Articles/184495/ - - Wikipedia entry on red-black trees - http://en.wikipedia.org/wiki/Red-black_tree - -Linux implementation of red-black trees ---------------------------------------- - -Linux's rbtree implementation lives in the file "lib/rbtree.c". To use it, -"#include ". - -The Linux rbtree implementation is optimized for speed, and thus has one -less layer of indirection (and better cache locality) than more traditional -tree implementations. Instead of using pointers to separate rb_node and data -structures, each instance of struct rb_node is embedded in the data structure -it organizes. And instead of using a comparison callback function pointer, -users are expected to write their own tree search and insert functions -which call the provided rbtree functions. Locking is also left up to the -user of the rbtree code. - -Creating a new rbtree ---------------------- - -Data nodes in an rbtree tree are structures containing a struct rb_node member:: - - struct mytype { - struct rb_node node; - char *keystring; - }; - -When dealing with a pointer to the embedded struct rb_node, the containing data -structure may be accessed with the standard container_of() macro. In addition, -individual members may be accessed directly via rb_entry(node, type, member). - -At the root of each rbtree is an rb_root structure, which is initialized to be -empty via: - - struct rb_root mytree = RB_ROOT; - -Searching for a value in an rbtree ----------------------------------- - -Writing a search function for your tree is fairly straightforward: start at the -root, compare each value, and follow the left or right branch as necessary. 
-
-Example::
-
- struct mytype *my_search(struct rb_root *root, char *string)
- {
- 	struct rb_node *node = root->rb_node;
-
- 	while (node) {
- 		struct mytype *data = container_of(node, struct mytype, node);
- 		int result;
-
- 		result = strcmp(string, data->keystring);
-
- 		if (result < 0)
- 			node = node->rb_left;
- 		else if (result > 0)
- 			node = node->rb_right;
- 		else
- 			return data;
- 	}
- 	return NULL;
- }
-
-Inserting data into an rbtree
-----------------------------
-
-Inserting data in the tree involves first searching for the place to insert the
-new node, then inserting the node and rebalancing ("recoloring") the tree.
-
-The search for insertion differs from the previous search by finding the
-location of the pointer on which to graft the new node. The new node also
-needs a link to its parent node for rebalancing purposes.
-
-Example::
-
- int my_insert(struct rb_root *root, struct mytype *data)
- {
- 	struct rb_node **new = &(root->rb_node), *parent = NULL;
-
- 	/* Figure out where to put new node */
- 	while (*new) {
- 		struct mytype *this = container_of(*new, struct mytype, node);
- 		int result = strcmp(data->keystring, this->keystring);
-
- 		parent = *new;
- 		if (result < 0)
- 			new = &((*new)->rb_left);
- 		else if (result > 0)
- 			new = &((*new)->rb_right);
- 		else
- 			return FALSE;
- 	}
-
- 	/* Add new node and rebalance tree. */
- 	rb_link_node(&data->node, parent, new);
- 	rb_insert_color(&data->node, root);
-
- 	return TRUE;
- }
-
-Removing or replacing existing data in an rbtree
------------------------------------------------
-
-To remove an existing node from a tree, call::
-
- void rb_erase(struct rb_node *victim, struct rb_root *tree);
-
-Example::
-
- struct mytype *data = my_search(&mytree, "walrus");
-
- if (data) {
- 	rb_erase(&data->node, &mytree);
- 	myfree(data);
- }
-
-To replace an existing node in a tree with a new one with the same key, call::
-
- void rb_replace_node(struct rb_node *old, struct rb_node *new,
- 			struct rb_root *tree);
-
-Replacing a node this way does not re-sort the tree: if the new node doesn't
-have the same key as the old node, the rbtree will probably become corrupted.
-
-Iterating through the elements stored in an rbtree (in sort order)
------------------------------------------------------------------
-
-Four functions are provided for iterating through an rbtree's contents in
-sorted order. These work on arbitrary trees, and should not need to be
-modified or wrapped (except for locking purposes)::
-
- struct rb_node *rb_first(struct rb_root *tree);
- struct rb_node *rb_last(struct rb_root *tree);
- struct rb_node *rb_next(struct rb_node *node);
- struct rb_node *rb_prev(struct rb_node *node);
-
-To start iterating, call rb_first() or rb_last() with a pointer to the root
-of the tree, which will return a pointer to the node structure contained in
-the first or last element in the tree. To continue, fetch the next or previous
-node by calling rb_next() or rb_prev() on the current node. This will return
-NULL when there are no more nodes left.
-
-The iterator functions return a pointer to the embedded struct rb_node, from
-which the containing data structure may be accessed with the container_of()
-macro, and individual members may be accessed directly via
-rb_entry(node, type, member).
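-
-A related teardown pattern is worth a note here. The in-order iterators above
-must not be used to free every node while walking the tree, because rb_next()
-reads the node it is given. For destroying a whole tree, linux/rbtree.h also
-provides a postorder iterator that is safe for this; a minimal sketch, reusing
-struct mytype and myfree() from the examples above::
-
- struct mytype *pos, *n;
-
- rbtree_postorder_for_each_entry_safe(pos, n, &mytree, node)
- 	myfree(pos);
-
- mytree = RB_ROOT;	/* all nodes are gone; reset the root */
-
-This helper performs no rebalancing, so it is only suitable when the entire
-tree is being thrown away, not for erasing individual nodes from a live tree.
-Returning to in-order iteration, a complete traversal that prints every key
-follows.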
-
-Example::
-
- struct rb_node *node;
- for (node = rb_first(&mytree); node; node = rb_next(node))
- 	printk("key=%s\n", rb_entry(node, struct mytype, node)->keystring);
-
-Cached rbtrees
---------------
-
-Computing the leftmost (smallest) node is quite a common task for binary
-search trees, such as for traversals or users relying on the particular
-order for their own logic. To this end, users can use 'struct rb_root_cached'
-to optimize O(logN) rb_first() calls to a simple pointer fetch, avoiding
-potentially expensive tree iterations. This is done at negligible runtime
-maintenance overhead, albeit with a larger memory footprint.
-
-Similar to the rb_root structure, cached rbtrees are initialized to be
-empty via::
-
- struct rb_root_cached mytree = RB_ROOT_CACHED;
-
-A cached rbtree is simply a regular rb_root with an extra pointer to cache the
-leftmost node. This allows rb_root_cached to exist wherever rb_root does,
-which permits augmented trees to be supported, along with only a few extra
-interfaces::
-
- struct rb_node *rb_first_cached(struct rb_root_cached *tree);
- void rb_insert_color_cached(struct rb_node *, struct rb_root_cached *, bool);
- void rb_erase_cached(struct rb_node *node, struct rb_root_cached *);
-
-Both insert and erase calls have their respective counterparts for augmented
-trees::
-
- void rb_insert_augmented_cached(struct rb_node *node, struct rb_root_cached *,
- 				bool, struct rb_augment_callbacks *);
- void rb_erase_augmented_cached(struct rb_node *, struct rb_root_cached *,
- 				struct rb_augment_callbacks *);
-
-
-Support for Augmented rbtrees
------------------------------
-
-Augmented rbtree is an rbtree with "some" additional data stored in
-each node, where the additional data for node N must be a function of
-the contents of all nodes in the subtree rooted at N. This data can
-be used to build new functionality on top of the rbtree. Augmented rbtree
-is an optional feature built on top of basic rbtree infrastructure.
-An rbtree user who wants this feature will have to call the augmentation
-functions with the user provided augmentation callback when inserting
-and erasing nodes.
-
-C files implementing augmented rbtree manipulation must include
-<linux/rbtree_augmented.h> instead of <linux/rbtree.h>. Note that
-linux/rbtree_augmented.h exposes some rbtree implementation details
-you are not expected to rely on; please stick to the documented APIs
-there and do not include <linux/rbtree_augmented.h> from header files
-either so as to minimize chances of your users accidentally relying on
-such implementation details.
-
-On insertion, the user must update the augmented information on the path
-leading to the inserted node, then call rb_link_node() as usual and
-rb_augment_inserted() instead of the usual rb_insert_color() call.
-If rb_augment_inserted() rebalances the rbtree, it will call back into
-a user-provided function to update the augmented information on the
-affected subtrees.
-
-When erasing a node, the user must call rb_erase_augmented() instead of
-rb_erase(). rb_erase_augmented() calls back into user-provided functions
-to update the augmented information on affected subtrees.
-
-In both cases, the callbacks are provided through struct rb_augment_callbacks.
-Three callbacks must be defined:
-
-- A propagation callback, which updates the augmented value for a given
- node and its ancestors, up to a given stop point (or NULL to update
- all the way to the root).
-
-- A copy callback, which copies the augmented value for a given subtree
- to a newly assigned subtree root.
-
-- A tree rotation callback, which copies the augmented value for a given
- subtree to a newly assigned subtree root AND recomputes the augmented
- information for the former subtree root.
-
-The compiled code for rb_erase_augmented() may inline the propagation and
-copy callbacks, which results in a large function, so each augmented rbtree
-user should have a single rb_erase_augmented() call site in order to limit
-compiled code size.
-
-
-Sample usage
-^^^^^^^^^^^^
-
-An interval tree is an example of an augmented rb tree. Reference -
-"Introduction to Algorithms" by Cormen, Leiserson, Rivest and Stein.
-More details about interval trees:
-
-A classical rbtree has only a single key, so it cannot be directly used to
-store interval ranges like [lo:hi], to quickly look up any overlap with a new
-lo:hi, or to find whether there is an exact match for a new lo:hi.
-
-However, an rbtree can be augmented to store such interval ranges in a
-structured way, making efficient lookup and exact-match queries possible.
-
-The "extra information" stored in each node is the maximum hi
-(max_hi) value among all the nodes that are its descendants. This
-information can be maintained at each node just by looking at the node
-and its immediate children. This is used for an O(log n) lookup of the
-lowest match (lowest start address among all possible matches),
-with something like::
-
- struct interval_tree_node *
- interval_tree_first_match(struct rb_root *root,
- 			unsigned long start, unsigned long last)
- {
- 	struct interval_tree_node *node;
-
- 	if (!root->rb_node)
- 		return NULL;
- 	node = rb_entry(root->rb_node, struct interval_tree_node, rb);
-
- 	while (true) {
- 		if (node->rb.rb_left) {
- 			struct interval_tree_node *left =
- 				rb_entry(node->rb.rb_left,
- 					struct interval_tree_node, rb);
- 			if (left->__subtree_last >= start) {
- 				/*
- 				 * Some nodes in left subtree satisfy Cond2.
- 				 * Iterate to find the leftmost such node N.
- 				 * If it also satisfies Cond1, that's the match
- 				 * we are looking for. Otherwise, there is no
- 				 * matching interval as nodes to the right of N
- 				 * can't satisfy Cond1 either.
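- 				 * (For clarity: in this code "Cond1" is
- 				 * node->start <= last and "Cond2" is
- 				 * node->last >= start, i.e. the node's
- 				 * interval overlaps the queried
- 				 * [start;last] range.)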
- */ - node = left; - continue; - } - } - if (node->start <= last) { /* Cond1 */ - if (node->last >= start) /* Cond2 */ - return node; /* node is leftmost match */ - if (node->rb.rb_right) { - node = rb_entry(node->rb.rb_right, - struct interval_tree_node, rb); - if (node->__subtree_last >= start) - continue; - } - } - return NULL; /* No match */ - } - } - -Insertion/removal are defined using the following augmented callbacks:: - - static inline unsigned long - compute_subtree_last(struct interval_tree_node *node) - { - unsigned long max = node->last, subtree_last; - if (node->rb.rb_left) { - subtree_last = rb_entry(node->rb.rb_left, - struct interval_tree_node, rb)->__subtree_last; - if (max < subtree_last) - max = subtree_last; - } - if (node->rb.rb_right) { - subtree_last = rb_entry(node->rb.rb_right, - struct interval_tree_node, rb)->__subtree_last; - if (max < subtree_last) - max = subtree_last; - } - return max; - } - - static void augment_propagate(struct rb_node *rb, struct rb_node *stop) - { - while (rb != stop) { - struct interval_tree_node *node = - rb_entry(rb, struct interval_tree_node, rb); - unsigned long subtree_last = compute_subtree_last(node); - if (node->__subtree_last == subtree_last) - break; - node->__subtree_last = subtree_last; - rb = rb_parent(&node->rb); - } - } - - static void augment_copy(struct rb_node *rb_old, struct rb_node *rb_new) - { - struct interval_tree_node *old = - rb_entry(rb_old, struct interval_tree_node, rb); - struct interval_tree_node *new = - rb_entry(rb_new, struct interval_tree_node, rb); - - new->__subtree_last = old->__subtree_last; - } - - static void augment_rotate(struct rb_node *rb_old, struct rb_node *rb_new) - { - struct interval_tree_node *old = - rb_entry(rb_old, struct interval_tree_node, rb); - struct interval_tree_node *new = - rb_entry(rb_new, struct interval_tree_node, rb); - - new->__subtree_last = old->__subtree_last; - old->__subtree_last = compute_subtree_last(old); - } - - static const struct rb_augment_callbacks augment_callbacks = { - augment_propagate, augment_copy, augment_rotate - }; - - void interval_tree_insert(struct interval_tree_node *node, - struct rb_root *root) - { - struct rb_node **link = &root->rb_node, *rb_parent = NULL; - unsigned long start = node->start, last = node->last; - struct interval_tree_node *parent; - - while (*link) { - rb_parent = *link; - parent = rb_entry(rb_parent, struct interval_tree_node, rb); - if (parent->__subtree_last < last) - parent->__subtree_last = last; - if (start < parent->start) - link = &parent->rb.rb_left; - else - link = &parent->rb.rb_right; - } - - node->__subtree_last = last; - rb_link_node(&node->rb, rb_parent, link); - rb_insert_augmented(&node->rb, root, &augment_callbacks); - } - - void interval_tree_remove(struct interval_tree_node *node, - struct rb_root *root) - { - rb_erase_augmented(&node->rb, root, &augment_callbacks); - } diff --git a/include/linux/rbtree.h b/include/linux/rbtree.h index 1fd61a9af45c..d7db17996322 100644 --- a/include/linux/rbtree.h +++ b/include/linux/rbtree.h @@ -11,7 +11,7 @@ I know it's not the cleaner way, but in C (not in C++) to get performances and genericity... - See Documentation/rbtree.txt for documentation and samples. + See Documentation/core-api/rbtree.rst for documentation and samples. 
*/
#ifndef _LINUX_RBTREE_H
diff --git a/include/linux/rbtree_augmented.h b/include/linux/rbtree_augmented.h
index 724b0d036b57..d1c53e9d8c75 100644
--- a/include/linux/rbtree_augmented.h
+++ b/include/linux/rbtree_augmented.h
@@ -21,7 +21,7 @@
 * rb_insert_augmented() and rb_erase_augmented() are intended to be public.
 * The rest are implementation details you are not expected to depend on.
 *
- * See Documentation/rbtree.txt for documentation and samples.
+ * See Documentation/core-api/rbtree.rst for documentation and samples.
 */
 struct rb_augment_callbacks {
diff --git a/lib/Kconfig b/lib/Kconfig
index 5d53f9609c25..8005e36c3459 100644
--- a/lib/Kconfig
+++ b/lib/Kconfig
@@ -427,7 +427,7 @@ config INTERVAL_TREE
 See:
- Documentation/rbtree.txt
+ Documentation/core-api/rbtree.rst
 for more information.
diff --git a/tools/include/linux/rbtree.h b/tools/include/linux/rbtree.h
index e03b1ea23e0e..30dd21f976c3 100644
--- a/tools/include/linux/rbtree.h
+++ b/tools/include/linux/rbtree.h
@@ -11,7 +11,7 @@
 I know it's not the cleaner way, but in C (not in C++) to get performances and genericity...
- See Documentation/rbtree.txt for documentation and samples.
+ See Documentation/core-api/rbtree.rst for documentation and samples.
 */
#ifndef __TOOLS_LINUX_PERF_RBTREE_H
diff --git a/tools/include/linux/rbtree_augmented.h b/tools/include/linux/rbtree_augmented.h
index 381aa948610d..570bb9794421 100644
--- a/tools/include/linux/rbtree_augmented.h
+++ b/tools/include/linux/rbtree_augmented.h
@@ -23,7 +23,7 @@
 * rb_insert_augmented() and rb_erase_augmented() are intended to be public.
 * The rest are implementation details you are not expected to depend on.
 *
- * See Documentation/rbtree.txt for documentation and samples.
+ * See Documentation/core-api/rbtree.rst for documentation and samples.
 */
 struct rb_augment_callbacks {
-- cgit v1.2.3
From 51161bfc66a68d21f13d15a689b3ea7980457790 Mon Sep 17 00:00:00 2001
From: Leon Romanovsky
Date: Sun, 19 Apr 2020 18:55:06 +0300
Subject: kernel/module: Hide vermagic header file from general use
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

VERMAGIC* definitions are not supposed to be used by drivers, see this [1] bug report, so introduce a special define to guard inclusion of this header file and define it in kernel/module.c and in the internal script that generates *.mod.c files.

In-tree module build:
➜ kernel git:(vermagic) ✗ make clean
➜ kernel git:(vermagic) ✗ make M=drivers/infiniband/hw/mlx5
➜ kernel git:(vermagic) ✗ modinfo drivers/infiniband/hw/mlx5/mlx5_ib.ko
filename: /images/leonro/src/kernel/drivers/infiniband/hw/mlx5/mlx5_ib.ko
<...>
vermagic: 5.6.0+ SMP mod_unload modversions

Out-of-tree module build:
➜ mlx5 make -C /images/leonro/src/kernel clean M=/tmp/mlx5
➜ mlx5 make -C /images/leonro/src/kernel M=/tmp/mlx5
➜ mlx5 modinfo /tmp/mlx5/mlx5_ib.ko
filename: /tmp/mlx5/mlx5_ib.ko
<...>
vermagic: 5.6.0+ SMP mod_unload modversions

[1] https://lore.kernel.org/lkml/20200411155623.GA22175@zn.tnic
Reported-by: Borislav Petkov
Acked-by: Borislav Petkov
Acked-by: Jessica Yu
Co-developed-by: Masahiro Yamada
Signed-off-by: Masahiro Yamada
Signed-off-by: Leon Romanovsky
Signed-off-by: David S.
Miller
---
 include/linux/vermagic.h | 5 +++++
 kernel/module.c | 3 +++
 scripts/mod/modpost.c | 1 +
 3 files changed, 9 insertions(+)
(limited to 'include/linux')

diff --git a/include/linux/vermagic.h b/include/linux/vermagic.h
index 9aced11e9000..7768d20ada39 100644
--- a/include/linux/vermagic.h
+++ b/include/linux/vermagic.h
@@ -1,4 +1,9 @@
 /* SPDX-License-Identifier: GPL-2.0 */
+
+#ifndef INCLUDE_VERMAGIC
+#error "This header can be included from kernel/module.c or *.mod.c only"
+#endif
+
 #include <generated/utsrelease.h>
 /* Simply sanity version stamp for modules. */
diff --git a/kernel/module.c b/kernel/module.c
index 646f1e2330d2..8833e848b73c 100644
--- a/kernel/module.c
+++ b/kernel/module.c
@@ -4,6 +4,9 @@
 Copyright (C) 2001 Rusty Russell, 2002, 2010 Rusty Russell IBM.
 */
+
+#define INCLUDE_VERMAGIC
+
 #include
 #include
 #include
diff --git a/scripts/mod/modpost.c b/scripts/mod/modpost.c
index 5c3c50c5ec52..7f7d4ee7b652 100644
--- a/scripts/mod/modpost.c
+++ b/scripts/mod/modpost.c
@@ -2251,6 +2251,7 @@ static void add_header(struct buffer *b, struct module *mod)
 * Include build-salt.h after module.h in order to
 * inherit the definitions.
 */
+ buf_printf(b, "#define INCLUDE_VERMAGIC\n");
 buf_printf(b, "#include \n");
 buf_printf(b, "#include \n");
 buf_printf(b, "#include \n");
-- cgit v1.2.3
From fa10fed30f2550313a8284365b3e2398526eb42c Mon Sep 17 00:00:00 2001
From: Alexey Gladkov
Date: Sun, 19 Apr 2020 16:10:52 +0200
Subject: proc: allow to mount many instances of proc in one pid namespace

This patch allows multiple procfs instances inside the same pid namespace. The aim here is lightweight sandboxes, and to allow that we have to modernize procfs internals.

1) The main aim of this work is to have one supervisor for apps on embedded systems. Right now we have some lightweight sandbox support; however, if we create pid namespaces we have to manage all the processes inside too, where our goal is to be able to run a bunch of apps, each one inside its own mount namespace, without being able to notice each other. We only want to use mount namespaces, and we want procfs to behave more like a real mount point.

2) Linux Security Modules have multiple ptrace paths inside some subsystems; however, inside procfs the implementation does not guarantee that the ptrace() check which triggers the security_ptrace_check() hook will always run. We have the 'hidepid' mount option that can be used to force the ptrace_may_access() check inside has_pid_permissions() to run. The problem is that 'hidepid' is per pid namespace and not attached to the mount point, so any remount or modification of 'hidepid' propagates to all other procfs mounts.

This also does not allow Yama LSM to be supported easily in desktop and user sessions. Yama ptrace scope, which restricts ptrace and some other syscalls to inferiors only, can be updated to have a per-task context, where the context is inherited during fork() and clone() and preserved across execve(). If we support multiple private procfs instances, then we may force the ptrace_may_access() on /proc/<pid>/ to always run inside those new procfs instances. This will allow specifying, per user session, whether procfs should be populated only with pids that the user can ptrace.

By using Yama ptrace scope, some restricted users will only be able to see inferiors inside /proc; they won't even be able to see their other processes. Some software like Chromium, Firefox's crash handler, Wine and others are already using Yama to restrict which processes are ptraceable.
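(As an illustration of the intended behaviour, not part of the original commit message: with per-mount options, two procfs instances in one pid namespace can carry different hidepid settings. A minimal sketch, with hypothetical mount-point paths:

	#include <sys/mount.h>

	int main(void)
	{
		/* Each mount now has its own proc_fs_info, so these options
		 * no longer overwrite each other via the shared pid namespace. */
		mount("proc", "/sandbox/a/proc", "proc", 0, "hidepid=1");
		mount("proc", "/sandbox/b/proc", "proc", 0, "hidepid=2");
		return 0;
	}

Before this change both mounts shared the pid namespace's hide_pid, so the second mount's options would have applied to the first as well; the selftest added further below verifies the new behaviour via distinct st_dev values.)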
With this change it becomes possible to restrict /proc/<pid>/, but more importantly it gives desktop users a generic and usable way to specify which users should see all processes and which should not.

Side notes:
* This covers a case seccomp cannot handle, since seccomp is unable to parse arguments: it is easy to install a seccomp filter on direct syscalls that operate on pids, but /proc/<pid>/ is a Linux ABI that uses filesystem syscalls. With this change LSMs should be able to analyze open/read/write/close...

In the new patch set version I removed the 'newinstance' option as suggested by Eric W. Biederman.

A selftest has been added to verify the new behavior.

Signed-off-by: Alexey Gladkov
Reviewed-by: Alexey Dobriyan
Reviewed-by: Kees Cook
Signed-off-by: Eric W. Biederman
---
 fs/proc/base.c | 31 ++++++++------
 fs/proc/inode.c | 11 +++--
 fs/proc/root.c | 49 +++++++++++-----------
 fs/proc/self.c | 6 +--
 fs/proc/thread_self.c | 6 +--
 include/linux/pid_namespace.h | 12 ------
 include/linux/proc_fs.h | 22 +++++++++-
 tools/testing/selftests/proc/.gitignore | 1 +
 tools/testing/selftests/proc/Makefile | 1 +
 .../testing/selftests/proc/proc-multiple-procfs.c | 48 +++++++++++++++++++++
 10 files changed, 124 insertions(+), 63 deletions(-)
 create mode 100644 tools/testing/selftests/proc/proc-multiple-procfs.c
(limited to 'include/linux')

diff --git a/fs/proc/base.c b/fs/proc/base.c
index 6042b646ab27..93b5d05c142c 100644
--- a/fs/proc/base.c
+++ b/fs/proc/base.c
@@ -697,13 +697,13 @@ int proc_setattr(struct dentry *dentry, struct iattr *attr)
 * May current process learn task's sched/cmdline info (for hide_pid_min=1)
 * or euid/egid (for hide_pid_min=2)?
 */
-static bool has_pid_permissions(struct pid_namespace *pid,
+static bool has_pid_permissions(struct proc_fs_info *fs_info,
 				struct task_struct *task,
 				int hide_pid_min)
 {
-	if (pid->hide_pid < hide_pid_min)
+	if (fs_info->hide_pid < hide_pid_min)
 		return true;
-	if (in_group_p(pid->pid_gid))
+	if (in_group_p(fs_info->pid_gid))
 		return true;
 	return ptrace_may_access(task, PTRACE_MODE_READ_FSCREDS);
 }
@@ -711,18 +711,18 @@ static bool has_pid_permissions(struct pid_namespace *pid,
 static int proc_pid_permission(struct inode *inode, int mask)
 {
-	struct pid_namespace *pid = proc_pid_ns(inode);
+	struct proc_fs_info *fs_info = proc_sb_info(inode->i_sb);
 	struct task_struct *task;
 	bool has_perms;
 	task = get_proc_task(inode);
 	if (!task)
 		return -ESRCH;
-	has_perms = has_pid_permissions(pid, task, HIDEPID_NO_ACCESS);
+	has_perms = has_pid_permissions(fs_info, task, HIDEPID_NO_ACCESS);
 	put_task_struct(task);
 	if (!has_perms) {
-		if (pid->hide_pid == HIDEPID_INVISIBLE) {
+		if (fs_info->hide_pid == HIDEPID_INVISIBLE) {
 			/*
 			 * Let's make getdents(), stat(), and open()
 			 * consistent with each other.
If a process @@ -1897,7 +1897,7 @@ int pid_getattr(const struct path *path, struct kstat *stat, u32 request_mask, unsigned int query_flags) { struct inode *inode = d_inode(path->dentry); - struct pid_namespace *pid = proc_pid_ns(inode); + struct proc_fs_info *fs_info = proc_sb_info(inode->i_sb); struct task_struct *task; generic_fillattr(inode, stat); @@ -1907,7 +1907,7 @@ int pid_getattr(const struct path *path, struct kstat *stat, rcu_read_lock(); task = pid_task(proc_pid(inode), PIDTYPE_PID); if (task) { - if (!has_pid_permissions(pid, task, HIDEPID_INVISIBLE)) { + if (!has_pid_permissions(fs_info, task, HIDEPID_INVISIBLE)) { rcu_read_unlock(); /* * This doesn't prevent learning whether PID exists, @@ -3301,6 +3301,7 @@ struct dentry *proc_pid_lookup(struct dentry *dentry, unsigned int flags) { struct task_struct *task; unsigned tgid; + struct proc_fs_info *fs_info; struct pid_namespace *ns; struct dentry *result = ERR_PTR(-ENOENT); @@ -3308,7 +3309,8 @@ struct dentry *proc_pid_lookup(struct dentry *dentry, unsigned int flags) if (tgid == ~0U) goto out; - ns = dentry->d_sb->s_fs_info; + fs_info = proc_sb_info(dentry->d_sb); + ns = fs_info->pid_ns; rcu_read_lock(); task = find_task_by_pid_ns(tgid, ns); if (task) @@ -3372,6 +3374,7 @@ retry: int proc_pid_readdir(struct file *file, struct dir_context *ctx) { struct tgid_iter iter; + struct proc_fs_info *fs_info = proc_sb_info(file_inode(file)->i_sb); struct pid_namespace *ns = proc_pid_ns(file_inode(file)); loff_t pos = ctx->pos; @@ -3379,13 +3382,13 @@ int proc_pid_readdir(struct file *file, struct dir_context *ctx) return 0; if (pos == TGID_OFFSET - 2) { - struct inode *inode = d_inode(ns->proc_self); + struct inode *inode = d_inode(fs_info->proc_self); if (!dir_emit(ctx, "self", 4, inode->i_ino, DT_LNK)) return 0; ctx->pos = pos = pos + 1; } if (pos == TGID_OFFSET - 1) { - struct inode *inode = d_inode(ns->proc_thread_self); + struct inode *inode = d_inode(fs_info->proc_thread_self); if (!dir_emit(ctx, "thread-self", 11, inode->i_ino, DT_LNK)) return 0; ctx->pos = pos = pos + 1; @@ -3399,7 +3402,7 @@ int proc_pid_readdir(struct file *file, struct dir_context *ctx) unsigned int len; cond_resched(); - if (!has_pid_permissions(ns, iter.task, HIDEPID_INVISIBLE)) + if (!has_pid_permissions(fs_info, iter.task, HIDEPID_INVISIBLE)) continue; len = snprintf(name, sizeof(name), "%u", iter.tgid); @@ -3599,6 +3602,7 @@ static struct dentry *proc_task_lookup(struct inode *dir, struct dentry * dentry struct task_struct *task; struct task_struct *leader = get_proc_task(dir); unsigned tid; + struct proc_fs_info *fs_info; struct pid_namespace *ns; struct dentry *result = ERR_PTR(-ENOENT); @@ -3609,7 +3613,8 @@ static struct dentry *proc_task_lookup(struct inode *dir, struct dentry * dentry if (tid == ~0U) goto out; - ns = dentry->d_sb->s_fs_info; + fs_info = proc_sb_info(dentry->d_sb); + ns = fs_info->pid_ns; rcu_read_lock(); task = find_task_by_pid_ns(tid, ns); if (task) diff --git a/fs/proc/inode.c b/fs/proc/inode.c index fb4cace9ea41..9c756531282a 100644 --- a/fs/proc/inode.c +++ b/fs/proc/inode.c @@ -167,13 +167,12 @@ void proc_invalidate_siblings_dcache(struct hlist_head *inodes, spinlock_t *lock static int proc_show_options(struct seq_file *seq, struct dentry *root) { - struct super_block *sb = root->d_sb; - struct pid_namespace *pid = sb->s_fs_info; + struct proc_fs_info *fs_info = proc_sb_info(root->d_sb); - if (!gid_eq(pid->pid_gid, GLOBAL_ROOT_GID)) - seq_printf(seq, ",gid=%u", from_kgid_munged(&init_user_ns, pid->pid_gid)); - if 
(pid->hide_pid != HIDEPID_OFF) - seq_printf(seq, ",hidepid=%u", pid->hide_pid); + if (!gid_eq(fs_info->pid_gid, GLOBAL_ROOT_GID)) + seq_printf(seq, ",gid=%u", from_kgid_munged(&init_user_ns, fs_info->pid_gid)); + if (fs_info->hide_pid != HIDEPID_OFF) + seq_printf(seq, ",hidepid=%u", fs_info->hide_pid); return 0; } diff --git a/fs/proc/root.c b/fs/proc/root.c index cdbe9293ea55..208989274923 100644 --- a/fs/proc/root.c +++ b/fs/proc/root.c @@ -77,26 +77,31 @@ static int proc_parse_param(struct fs_context *fc, struct fs_parameter *param) return 0; } -static void proc_apply_options(struct super_block *s, +static void proc_apply_options(struct proc_fs_info *fs_info, struct fs_context *fc, - struct pid_namespace *pid_ns, struct user_namespace *user_ns) { struct proc_fs_context *ctx = fc->fs_private; if (ctx->mask & (1 << Opt_gid)) - pid_ns->pid_gid = make_kgid(user_ns, ctx->gid); + fs_info->pid_gid = make_kgid(user_ns, ctx->gid); if (ctx->mask & (1 << Opt_hidepid)) - pid_ns->hide_pid = ctx->hidepid; + fs_info->hide_pid = ctx->hidepid; } static int proc_fill_super(struct super_block *s, struct fs_context *fc) { - struct pid_namespace *pid_ns = get_pid_ns(s->s_fs_info); + struct proc_fs_context *ctx = fc->fs_private; struct inode *root_inode; + struct proc_fs_info *fs_info; int ret; - proc_apply_options(s, fc, pid_ns, current_user_ns()); + fs_info = kzalloc(sizeof(*fs_info), GFP_KERNEL); + if (!fs_info) + return -ENOMEM; + + fs_info->pid_ns = get_pid_ns(ctx->pid_ns); + proc_apply_options(fs_info, fc, current_user_ns()); /* User space would break if executables or devices appear on proc */ s->s_iflags |= SB_I_USERNS_VISIBLE | SB_I_NOEXEC | SB_I_NODEV; @@ -106,6 +111,7 @@ static int proc_fill_super(struct super_block *s, struct fs_context *fc) s->s_magic = PROC_SUPER_MAGIC; s->s_op = &proc_sops; s->s_time_gran = 1; + s->s_fs_info = fs_info; /* * procfs isn't actually a stacking filesystem; however, there is @@ -113,7 +119,7 @@ static int proc_fill_super(struct super_block *s, struct fs_context *fc) * top of it */ s->s_stack_depth = FILESYSTEM_MAX_STACK_DEPTH; - + /* procfs dentries and inodes don't require IO to create */ s->s_shrink.seeks = 0; @@ -140,19 +146,17 @@ static int proc_fill_super(struct super_block *s, struct fs_context *fc) static int proc_reconfigure(struct fs_context *fc) { struct super_block *sb = fc->root->d_sb; - struct pid_namespace *pid = sb->s_fs_info; + struct proc_fs_info *fs_info = proc_sb_info(sb); sync_filesystem(sb); - proc_apply_options(sb, fc, pid, current_user_ns()); + proc_apply_options(fs_info, fc, current_user_ns()); return 0; } static int proc_get_tree(struct fs_context *fc) { - struct proc_fs_context *ctx = fc->fs_private; - - return get_tree_keyed(fc, proc_fill_super, ctx->pid_ns); + return get_tree_nodev(fc, proc_fill_super); } static void proc_fs_context_free(struct fs_context *fc) @@ -188,22 +192,17 @@ static int proc_init_fs_context(struct fs_context *fc) static void proc_kill_sb(struct super_block *sb) { - struct pid_namespace *ns; + struct proc_fs_info *fs_info = proc_sb_info(sb); - ns = (struct pid_namespace *)sb->s_fs_info; - if (ns->proc_self) - dput(ns->proc_self); - if (ns->proc_thread_self) - dput(ns->proc_thread_self); - kill_anon_super(sb); + if (fs_info->proc_self) + dput(fs_info->proc_self); - /* Make the pid namespace safe for the next mount of proc */ - ns->proc_self = NULL; - ns->proc_thread_self = NULL; - ns->pid_gid = GLOBAL_ROOT_GID; - ns->hide_pid = 0; + if (fs_info->proc_thread_self) + dput(fs_info->proc_thread_self); - put_pid_ns(ns); + 
kill_anon_super(sb); + put_pid_ns(fs_info->pid_ns); + kfree(fs_info); } static struct file_system_type proc_fs_type = { diff --git a/fs/proc/self.c b/fs/proc/self.c index 57c0a1047250..309301ac0136 100644 --- a/fs/proc/self.c +++ b/fs/proc/self.c @@ -36,10 +36,10 @@ static unsigned self_inum __ro_after_init; int proc_setup_self(struct super_block *s) { struct inode *root_inode = d_inode(s->s_root); - struct pid_namespace *ns = proc_pid_ns(root_inode); + struct proc_fs_info *fs_info = proc_sb_info(s); struct dentry *self; int ret = -ENOMEM; - + inode_lock(root_inode); self = d_alloc_name(s->s_root, "self"); if (self) { @@ -62,7 +62,7 @@ int proc_setup_self(struct super_block *s) if (ret) pr_err("proc_fill_super: can't allocate /proc/self\n"); else - ns->proc_self = self; + fs_info->proc_self = self; return ret; } diff --git a/fs/proc/thread_self.c b/fs/proc/thread_self.c index f61ae53533f5..2493cbbdfa6f 100644 --- a/fs/proc/thread_self.c +++ b/fs/proc/thread_self.c @@ -36,7 +36,7 @@ static unsigned thread_self_inum __ro_after_init; int proc_setup_thread_self(struct super_block *s) { struct inode *root_inode = d_inode(s->s_root); - struct pid_namespace *ns = proc_pid_ns(root_inode); + struct proc_fs_info *fs_info = proc_sb_info(s); struct dentry *thread_self; int ret = -ENOMEM; @@ -60,9 +60,9 @@ int proc_setup_thread_self(struct super_block *s) inode_unlock(root_inode); if (ret) - pr_err("proc_fill_super: can't allocate /proc/thread_self\n"); + pr_err("proc_fill_super: can't allocate /proc/thread-self\n"); else - ns->proc_thread_self = thread_self; + fs_info->proc_thread_self = thread_self; return ret; } diff --git a/include/linux/pid_namespace.h b/include/linux/pid_namespace.h index 4956e362e55e..5a5cb45ac57e 100644 --- a/include/linux/pid_namespace.h +++ b/include/linux/pid_namespace.h @@ -17,12 +17,6 @@ struct fs_pin; -enum { /* definitions for pid_namespace's hide_pid field */ - HIDEPID_OFF = 0, - HIDEPID_NO_ACCESS = 1, - HIDEPID_INVISIBLE = 2, -}; - struct pid_namespace { struct kref kref; struct idr idr; @@ -32,17 +26,11 @@ struct pid_namespace { struct kmem_cache *pid_cachep; unsigned int level; struct pid_namespace *parent; -#ifdef CONFIG_PROC_FS - struct dentry *proc_self; - struct dentry *proc_thread_self; -#endif #ifdef CONFIG_BSD_PROCESS_ACCT struct fs_pin *bacct; #endif struct user_namespace *user_ns; struct ucounts *ucounts; - kgid_t pid_gid; - int hide_pid; int reboot; /* group exit code if this pidns was rebooted */ struct ns_common ns; } __randomize_layout; diff --git a/include/linux/proc_fs.h b/include/linux/proc_fs.h index 45c05fd9c99d..1b98a41fdd8a 100644 --- a/include/linux/proc_fs.h +++ b/include/linux/proc_fs.h @@ -42,6 +42,26 @@ struct proc_ops { unsigned long (*proc_get_unmapped_area)(struct file *, unsigned long, unsigned long, unsigned long, unsigned long); } __randomize_layout; +/* definitions for hide_pid field */ +enum { + HIDEPID_OFF = 0, + HIDEPID_NO_ACCESS = 1, + HIDEPID_INVISIBLE = 2, +}; + +struct proc_fs_info { + struct pid_namespace *pid_ns; + struct dentry *proc_self; /* For /proc/self */ + struct dentry *proc_thread_self; /* For /proc/thread-self */ + kgid_t pid_gid; + int hide_pid; +}; + +static inline struct proc_fs_info *proc_sb_info(struct super_block *sb) +{ + return sb->s_fs_info; +} + #ifdef CONFIG_PROC_FS typedef int (*proc_write_t)(struct file *, char *, size_t); @@ -176,7 +196,7 @@ int open_related_ns(struct ns_common *ns, /* get the associated pid namespace for a file in procfs */ static inline struct pid_namespace *proc_pid_ns(const struct 
inode *inode)
 {
-	return inode->i_sb->s_fs_info;
+	return proc_sb_info(inode->i_sb)->pid_ns;
 }
 #endif /* _LINUX_PROC_FS_H */
diff --git a/tools/testing/selftests/proc/.gitignore b/tools/testing/selftests/proc/.gitignore
index 4bca5a9327a4..126901c5d6f4 100644
--- a/tools/testing/selftests/proc/.gitignore
+++ b/tools/testing/selftests/proc/.gitignore
@@ -3,6 +3,7 @@
 /fd-002-posix-eq
 /fd-003-kthread
 /proc-loadavg-001
+/proc-multiple-procfs
 /proc-pid-vm
 /proc-self-map-files-001
 /proc-self-map-files-002
diff --git a/tools/testing/selftests/proc/Makefile b/tools/testing/selftests/proc/Makefile
index a8ed0f684829..bf22457256be 100644
--- a/tools/testing/selftests/proc/Makefile
+++ b/tools/testing/selftests/proc/Makefile
@@ -19,5 +19,6 @@ TEST_GEN_PROGS += self
 TEST_GEN_PROGS += setns-dcache
 TEST_GEN_PROGS += setns-sysvipc
 TEST_GEN_PROGS += thread-self
+TEST_GEN_PROGS += proc-multiple-procfs

 include ../lib.mk
diff --git a/tools/testing/selftests/proc/proc-multiple-procfs.c b/tools/testing/selftests/proc/proc-multiple-procfs.c
new file mode 100644
index 000000000000..ab912ad95dab
--- /dev/null
+++ b/tools/testing/selftests/proc/proc-multiple-procfs.c
@@ -0,0 +1,48 @@
+/*
+ * Copyright © 2020 Alexey Gladkov
+ *
+ * Permission to use, copy, modify, and distribute this software for any
+ * purpose with or without fee is hereby granted, provided that the above
+ * copyright notice and this permission notice appear in all copies.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
+ * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
+ * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
+ * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+ * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
+ * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
+ * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+ */
+#include <sys/mount.h>
+#include <sys/stat.h>
+#include <assert.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <unistd.h>
+
+int main(void)
+{
+	struct stat proc_st1, proc_st2;
+	char procbuff[] = "/tmp/proc.XXXXXX/meminfo";
+	char procdir1[] = "/tmp/proc.XXXXXX";
+	char procdir2[] = "/tmp/proc.XXXXXX";
+
+	assert(mkdtemp(procdir1) != NULL);
+	assert(mkdtemp(procdir2) != NULL);
+
+	assert(!mount("proc", procdir1, "proc", 0, "hidepid=1"));
+	assert(!mount("proc", procdir2, "proc", 0, "hidepid=2"));
+
+	snprintf(procbuff, sizeof(procbuff), "%s/meminfo", procdir1);
+	assert(!stat(procbuff, &proc_st1));
+
+	snprintf(procbuff, sizeof(procbuff), "%s/meminfo", procdir2);
+	assert(!stat(procbuff, &proc_st2));
+
+	umount(procdir1);
+	umount(procdir2);
+
+	assert(proc_st1.st_dev != proc_st2.st_dev);
+
+	return 0;
+}
-- cgit v1.2.3
From 24a71ce5c47f6b1b3cdacf544cb24220f5c3b7ef Mon Sep 17 00:00:00 2001
From: Alexey Gladkov
Date: Sun, 19 Apr 2020 16:10:53 +0200
Subject: proc: instantiate only pids that we can ptrace on 'hidepid=4' mount option

If the "hidepid=4" mount option is set, do not instantiate pids that we cannot ptrace. "hidepid=4" means that procfs should only contain pids that the caller can ptrace.

Signed-off-by: Djalal Harouni
Signed-off-by: Alexey Gladkov
Reviewed-by: Alexey Dobriyan
Reviewed-by: Kees Cook
Signed-off-by: Eric W.
Biederman
---
 fs/proc/base.c | 15 +++++++++++++++
 fs/proc/root.c | 13 ++++++++++---
 include/linux/proc_fs.h | 1 +
 3 files changed, 26 insertions(+), 3 deletions(-)
(limited to 'include/linux')

diff --git a/fs/proc/base.c b/fs/proc/base.c
index 93b5d05c142c..a52a91e90c25 100644
--- a/fs/proc/base.c
+++ b/fs/proc/base.c
@@ -701,6 +701,14 @@ static bool has_pid_permissions(struct proc_fs_info *fs_info,
 				struct task_struct *task,
 				int hide_pid_min)
 {
+	/*
+	 * If the 'hidepid' mount option is set, force a ptrace check;
+	 * we indicate that we are using a filesystem syscall
+	 * by passing PTRACE_MODE_READ_FSCREDS
+	 */
+	if (fs_info->hide_pid == HIDEPID_NOT_PTRACEABLE)
+		return ptrace_may_access(task, PTRACE_MODE_READ_FSCREDS);
+
 	if (fs_info->hide_pid < hide_pid_min)
 		return true;
 	if (in_group_p(fs_info->pid_gid))
@@ -3319,7 +3327,14 @@ struct dentry *proc_pid_lookup(struct dentry *dentry, unsigned int flags)
 	if (!task)
 		goto out;
+	/* Limit procfs to only ptraceable tasks */
+	if (fs_info->hide_pid == HIDEPID_NOT_PTRACEABLE) {
+		if (!has_pid_permissions(fs_info, task, HIDEPID_NO_ACCESS))
+			goto out_put_task;
+	}
+
 	result = proc_pid_instantiate(dentry, task, NULL);
+out_put_task:
 	put_task_struct(task);
 out:
 	return result;
diff --git a/fs/proc/root.c b/fs/proc/root.c
index 208989274923..8f23b951d685 100644
--- a/fs/proc/root.c
+++ b/fs/proc/root.c
@@ -47,6 +47,14 @@ static const struct fs_parameter_spec proc_fs_parameters[] = {
 	{}
 };
+static inline int valid_hidepid(unsigned int value)
+{
+	return (value == HIDEPID_OFF ||
+		value == HIDEPID_NO_ACCESS ||
+		value == HIDEPID_INVISIBLE ||
+		value == HIDEPID_NOT_PTRACEABLE);
+}
+
 static int proc_parse_param(struct fs_context *fc, struct fs_parameter *param)
 {
 	struct proc_fs_context *ctx = fc->fs_private;
@@ -63,10 +71,9 @@ static int proc_parse_param(struct fs_context *fc, struct fs_parameter *param)
 		break;
 	case Opt_hidepid:
+		if (!valid_hidepid(result.uint_32))
+			return invalf(fc, "proc: unknown value of hidepid.\n");
 		ctx->hidepid = result.uint_32;
-		if (ctx->hidepid < HIDEPID_OFF ||
-		    ctx->hidepid > HIDEPID_INVISIBLE)
-			return invalfc(fc, "hidepid value must be between 0 and 2.\n");
 		break;
 	default:
diff --git a/include/linux/proc_fs.h b/include/linux/proc_fs.h
index 1b98a41fdd8a..5bdc117ae947 100644
--- a/include/linux/proc_fs.h
+++ b/include/linux/proc_fs.h
@@ -47,6 +47,7 @@ enum {
 	HIDEPID_OFF = 0,
 	HIDEPID_NO_ACCESS = 1,
 	HIDEPID_INVISIBLE = 2,
+	HIDEPID_NOT_PTRACEABLE = 4, /* Limit pids to only ptraceable pids */
 };
 struct proc_fs_info {
-- cgit v1.2.3
From 6814ef2d992af09451bbeda4770daa204461329e Mon Sep 17 00:00:00 2001
From: Alexey Gladkov
Date: Sun, 19 Apr 2020 16:10:54 +0200
Subject: proc: add option to mount only a pids subset

This allows hiding all files and directories in procfs that are not related to tasks.

Signed-off-by: Alexey Gladkov
Reviewed-by: Alexey Dobriyan
Reviewed-by: Kees Cook
Signed-off-by: Eric W.
Biederman --- fs/proc/generic.c | 9 +++++++++ fs/proc/inode.c | 6 ++++++ fs/proc/root.c | 33 +++++++++++++++++++++++++++++++++ include/linux/proc_fs.h | 7 +++++++ 4 files changed, 55 insertions(+) (limited to 'include/linux') diff --git a/fs/proc/generic.c b/fs/proc/generic.c index 4ed6dabdf6ff..2f9fa179194d 100644 --- a/fs/proc/generic.c +++ b/fs/proc/generic.c @@ -269,6 +269,11 @@ struct dentry *proc_lookup_de(struct inode *dir, struct dentry *dentry, struct dentry *proc_lookup(struct inode *dir, struct dentry *dentry, unsigned int flags) { + struct proc_fs_info *fs_info = proc_sb_info(dir->i_sb); + + if (fs_info->pidonly == PROC_PIDONLY_ON) + return ERR_PTR(-ENOENT); + return proc_lookup_de(dir, dentry, PDE(dir)); } @@ -325,6 +330,10 @@ int proc_readdir_de(struct file *file, struct dir_context *ctx, int proc_readdir(struct file *file, struct dir_context *ctx) { struct inode *inode = file_inode(file); + struct proc_fs_info *fs_info = proc_sb_info(inode->i_sb); + + if (fs_info->pidonly == PROC_PIDONLY_ON) + return 1; return proc_readdir_de(file, ctx, PDE(inode)); } diff --git a/fs/proc/inode.c b/fs/proc/inode.c index 9c756531282a..0d5e68fa842f 100644 --- a/fs/proc/inode.c +++ b/fs/proc/inode.c @@ -173,6 +173,8 @@ static int proc_show_options(struct seq_file *seq, struct dentry *root) seq_printf(seq, ",gid=%u", from_kgid_munged(&init_user_ns, fs_info->pid_gid)); if (fs_info->hide_pid != HIDEPID_OFF) seq_printf(seq, ",hidepid=%u", fs_info->hide_pid); + if (fs_info->pidonly != PROC_PIDONLY_OFF) + seq_printf(seq, ",subset=pid"); return 0; } @@ -463,6 +465,7 @@ proc_reg_get_unmapped_area(struct file *file, unsigned long orig_addr, static int proc_reg_open(struct inode *inode, struct file *file) { + struct proc_fs_info *fs_info = proc_sb_info(inode->i_sb); struct proc_dir_entry *pde = PDE(inode); int rv = 0; typeof_member(struct proc_ops, proc_open) open; @@ -476,6 +479,9 @@ static int proc_reg_open(struct inode *inode, struct file *file) return rv; } + if (fs_info->pidonly == PROC_PIDONLY_ON) + return -ENOENT; + /* * Ensure that * 1) PDE's ->release hook will be called no matter what diff --git a/fs/proc/root.c b/fs/proc/root.c index 8f23b951d685..baff006a918f 100644 --- a/fs/proc/root.c +++ b/fs/proc/root.c @@ -34,16 +34,19 @@ struct proc_fs_context { unsigned int mask; int hidepid; int gid; + int pidonly; }; enum proc_param { Opt_gid, Opt_hidepid, + Opt_subset, }; static const struct fs_parameter_spec proc_fs_parameters[] = { fsparam_u32("gid", Opt_gid), fsparam_u32("hidepid", Opt_hidepid), + fsparam_string("subset", Opt_subset), {} }; @@ -55,6 +58,29 @@ static inline int valid_hidepid(unsigned int value) value == HIDEPID_NOT_PTRACEABLE); } +static int proc_parse_subset_param(struct fs_context *fc, char *value) +{ + struct proc_fs_context *ctx = fc->fs_private; + + while (value) { + char *ptr = strchr(value, ','); + + if (ptr != NULL) + *ptr++ = '\0'; + + if (*value != '\0') { + if (!strcmp(value, "pid")) { + ctx->pidonly = PROC_PIDONLY_ON; + } else { + return invalf(fc, "proc: unsupported subset option - %s\n", value); + } + } + value = ptr; + } + + return 0; +} + static int proc_parse_param(struct fs_context *fc, struct fs_parameter *param) { struct proc_fs_context *ctx = fc->fs_private; @@ -76,6 +102,11 @@ static int proc_parse_param(struct fs_context *fc, struct fs_parameter *param) ctx->hidepid = result.uint_32; break; + case Opt_subset: + if (proc_parse_subset_param(fc, param->string) < 0) + return -EINVAL; + break; + default: return -EINVAL; } @@ -94,6 +125,8 @@ static void 
proc_apply_options(struct proc_fs_info *fs_info, fs_info->pid_gid = make_kgid(user_ns, ctx->gid); if (ctx->mask & (1 << Opt_hidepid)) fs_info->hide_pid = ctx->hidepid; + if (ctx->mask & (1 << Opt_subset)) + fs_info->pidonly = ctx->pidonly; } static int proc_fill_super(struct super_block *s, struct fs_context *fc) diff --git a/include/linux/proc_fs.h b/include/linux/proc_fs.h index 5bdc117ae947..8bc31ba5cd9c 100644 --- a/include/linux/proc_fs.h +++ b/include/linux/proc_fs.h @@ -50,12 +50,19 @@ enum { HIDEPID_NOT_PTRACEABLE = 4, /* Limit pids to only ptraceable pids */ }; +/* definitions for proc mount option pidonly */ +enum { + PROC_PIDONLY_OFF = 0, + PROC_PIDONLY_ON = 1, +}; + struct proc_fs_info { struct pid_namespace *pid_ns; struct dentry *proc_self; /* For /proc/self */ struct dentry *proc_thread_self; /* For /proc/thread-self */ kgid_t pid_gid; int hide_pid; + int pidonly; }; static inline struct proc_fs_info *proc_sb_info(struct super_block *sb) -- cgit v1.2.3 From e61bb8b36a287dddc71bdf30be775e7abcaa595c Mon Sep 17 00:00:00 2001 From: Alexey Gladkov Date: Sun, 19 Apr 2020 16:10:57 +0200 Subject: proc: use named enums for better readability Signed-off-by: Alexey Gladkov Reviewed-by: Alexey Dobriyan Reviewed-by: Kees Cook Signed-off-by: Eric W. Biederman --- fs/proc/base.c | 2 +- fs/proc/inode.c | 2 +- fs/proc/root.c | 4 ++-- include/linux/proc_fs.h | 8 ++++---- 4 files changed, 8 insertions(+), 8 deletions(-) (limited to 'include/linux') diff --git a/fs/proc/base.c b/fs/proc/base.c index a52a91e90c25..2868bff1a142 100644 --- a/fs/proc/base.c +++ b/fs/proc/base.c @@ -699,7 +699,7 @@ int proc_setattr(struct dentry *dentry, struct iattr *attr) */ static bool has_pid_permissions(struct proc_fs_info *fs_info, struct task_struct *task, - int hide_pid_min) + enum proc_hidepid hide_pid_min) { /* * If 'hidpid' mount option is set force a ptrace check, diff --git a/fs/proc/inode.c b/fs/proc/inode.c index cbacac2e892b..f40c2532c057 100644 --- a/fs/proc/inode.c +++ b/fs/proc/inode.c @@ -166,7 +166,7 @@ void proc_invalidate_siblings_dcache(struct hlist_head *inodes, spinlock_t *lock deactivate_super(old_sb); } -static inline const char *hidepid2str(int v) +static inline const char *hidepid2str(enum proc_hidepid v) { switch (v) { case HIDEPID_OFF: return "off"; diff --git a/fs/proc/root.c b/fs/proc/root.c index 288093261b7f..ffebed1999e5 100644 --- a/fs/proc/root.c +++ b/fs/proc/root.c @@ -32,9 +32,9 @@ struct proc_fs_context { struct pid_namespace *pid_ns; unsigned int mask; - int hidepid; + enum proc_hidepid hidepid; int gid; - int pidonly; + enum proc_pidonly pidonly; }; enum proc_param { diff --git a/include/linux/proc_fs.h b/include/linux/proc_fs.h index 8bc31ba5cd9c..2cb424e6f36a 100644 --- a/include/linux/proc_fs.h +++ b/include/linux/proc_fs.h @@ -43,7 +43,7 @@ struct proc_ops { } __randomize_layout; /* definitions for hide_pid field */ -enum { +enum proc_hidepid { HIDEPID_OFF = 0, HIDEPID_NO_ACCESS = 1, HIDEPID_INVISIBLE = 2, @@ -51,7 +51,7 @@ enum { }; /* definitions for proc mount option pidonly */ -enum { +enum proc_pidonly { PROC_PIDONLY_OFF = 0, PROC_PIDONLY_ON = 1, }; @@ -61,8 +61,8 @@ struct proc_fs_info { struct dentry *proc_self; /* For /proc/self */ struct dentry *proc_thread_self; /* For /proc/thread-self */ kgid_t pid_gid; - int hide_pid; - int pidonly; + enum proc_hidepid hide_pid; + enum proc_pidonly pidonly; }; static inline struct proc_fs_info *proc_sb_info(struct super_block *sb) -- cgit v1.2.3 From e64a0e16928415648d53d721b3d6fc3635eddf92 Mon Sep 17 00:00:00 2001 From: 
Christoph Hellwig
Date: Tue, 14 Apr 2020 09:42:21 +0200
Subject: block: remove RQF_COPY_USER

The RQF_COPY_USER flag is set for bios where the passthrough request mapping helpers decided that bounce buffering is required. It is then used to pad the scatterlist for drivers that require it. But given that non-passthrough requests are by definition aligned, and directly mapped pass-through requests must be aligned as well, it is not actually required at all.

Signed-off-by: Christoph Hellwig
Signed-off-by: Jens Axboe
---
 block/blk-map.c | 9 +--------
 block/blk-merge.c | 3 +--
 block/blk-mq-debugfs.c | 1 -
 include/linux/blkdev.h | 2 --
 4 files changed, 2 insertions(+), 13 deletions(-)
(limited to 'include/linux')

diff --git a/block/blk-map.c b/block/blk-map.c
index b72c361911a4..b6fa343fea9f 100644
--- a/block/blk-map.c
+++ b/block/blk-map.c
@@ -654,8 +654,6 @@ int blk_rq_map_user_iov(struct request_queue *q, struct request *rq,
 		bio = rq->bio;
 	} while (iov_iter_count(&i));
-	if (!bio_flagged(bio, BIO_USER_MAPPED))
-		rq->rq_flags |= RQF_COPY_USER;
 	return 0;
 unmap_rq:
@@ -731,7 +729,6 @@ int blk_rq_map_kern(struct request_queue *q, struct request *rq, void *kbuf,
 {
 	int reading = rq_data_dir(rq) == READ;
 	unsigned long addr = (unsigned long) kbuf;
-	int do_copy = 0;
 	struct bio *bio, *orig_bio;
 	int ret;
@@ -740,8 +737,7 @@ int blk_rq_map_kern(struct request_queue *q, struct request *rq, void *kbuf,
 	if (!len || !kbuf)
 		return -EINVAL;
-	do_copy = !blk_rq_aligned(q, addr, len) || object_is_on_stack(kbuf);
-	if (do_copy)
+	if (!blk_rq_aligned(q, addr, len) || object_is_on_stack(kbuf))
 		bio = bio_copy_kern(q, kbuf, len, gfp_mask, reading);
 	else
 		bio = bio_map_kern(q, kbuf, len, gfp_mask);
@@ -752,9 +748,6 @@ int blk_rq_map_kern(struct request_queue *q, struct request *rq, void *kbuf,
 	bio->bi_opf &= ~REQ_OP_MASK;
 	bio->bi_opf |= req_op(rq);
-	if (do_copy)
-		rq->rq_flags |= RQF_COPY_USER;
-
 	orig_bio = bio;
 	ret = blk_rq_append_bio(rq, &bio);
 	if (unlikely(ret)) {
diff --git a/block/blk-merge.c b/block/blk-merge.c
index 1534ed736363..99c9759f3a8a 100644
--- a/block/blk-merge.c
+++ b/block/blk-merge.c
@@ -532,8 +532,7 @@ int blk_rq_map_sg(struct request_queue *q, struct request *rq,
 	else if (rq->bio)
 		nsegs = __blk_bios_map_sg(q, rq->bio, sglist, &sg);
-	if (unlikely(rq->rq_flags & RQF_COPY_USER) &&
-	    (blk_rq_bytes(rq) & q->dma_pad_mask)) {
+	if (blk_rq_bytes(rq) && (blk_rq_bytes(rq) & q->dma_pad_mask)) {
 		unsigned int pad_len =
 			(q->dma_pad_mask & ~blk_rq_bytes(rq)) + 1;
diff --git a/block/blk-mq-debugfs.c b/block/blk-mq-debugfs.c
index b3f2ba483992..96b7a35c898a 100644
--- a/block/blk-mq-debugfs.c
+++ b/block/blk-mq-debugfs.c
@@ -292,7 +292,6 @@ static const char *const rqf_name[] = {
 	RQF_NAME(MQ_INFLIGHT),
 	RQF_NAME(DONTPREP),
 	RQF_NAME(PREEMPT),
-	RQF_NAME(COPY_USER),
 	RQF_NAME(FAILED),
 	RQF_NAME(QUIET),
 	RQF_NAME(ELVPRIV),
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index 32868fbedc9e..76da162b6ae9 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -82,8 +82,6 @@ typedef __u32 __bitwise req_flags_t;
 /* set for "ide_preempt" requests and also for requests for which the SCSI "quiesce" state must be ignored. */
 #define RQF_PREEMPT ((__force req_flags_t)(1 << 8))
-/* contains copies of user pages */
-#define RQF_COPY_USER ((__force req_flags_t)(1 << 9))
 /* vaguely specified driver internal error.
Ignored by the block layer */ #define RQF_FAILED ((__force req_flags_t)(1 << 10)) /* don't warn about errors */ -- cgit v1.2.3 From 89de1504d53b59b12bfff227328ee3e63dd3a112 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 14 Apr 2020 09:42:22 +0200 Subject: block: provide a blk_rq_map_sg variant that returns the last element To be able to move some of the special purpose hacks in blk_rq_map_sg into the callers we need a variant that returns the last mapped S/G list element to the caller. Add that variant as __blk_rq_map_sg and make blk_rq_map_sg a trivial inline wrapper around it. Signed-off-by: Christoph Hellwig Signed-off-by: Jens Axboe --- block/blk-merge.c | 25 ++++++++++++------------- include/linux/blkdev.h | 10 +++++++++- 2 files changed, 21 insertions(+), 14 deletions(-) (limited to 'include/linux') diff --git a/block/blk-merge.c b/block/blk-merge.c index 99c9759f3a8a..ee618cdb141e 100644 --- a/block/blk-merge.c +++ b/block/blk-merge.c @@ -519,24 +519,23 @@ static int __blk_bios_map_sg(struct request_queue *q, struct bio *bio, * map a request to scatterlist, return number of sg entries setup. Caller * must make sure sg can hold rq->nr_phys_segments entries */ -int blk_rq_map_sg(struct request_queue *q, struct request *rq, - struct scatterlist *sglist) +int __blk_rq_map_sg(struct request_queue *q, struct request *rq, + struct scatterlist *sglist, struct scatterlist **last_sg) { - struct scatterlist *sg = NULL; int nsegs = 0; if (rq->rq_flags & RQF_SPECIAL_PAYLOAD) - nsegs = __blk_bvec_map_sg(rq->special_vec, sglist, &sg); + nsegs = __blk_bvec_map_sg(rq->special_vec, sglist, last_sg); else if (rq->bio && bio_op(rq->bio) == REQ_OP_WRITE_SAME) - nsegs = __blk_bvec_map_sg(bio_iovec(rq->bio), sglist, &sg); + nsegs = __blk_bvec_map_sg(bio_iovec(rq->bio), sglist, last_sg); else if (rq->bio) - nsegs = __blk_bios_map_sg(q, rq->bio, sglist, &sg); + nsegs = __blk_bios_map_sg(q, rq->bio, sglist, last_sg); if (blk_rq_bytes(rq) && (blk_rq_bytes(rq) & q->dma_pad_mask)) { unsigned int pad_len = (q->dma_pad_mask & ~blk_rq_bytes(rq)) + 1; - sg->length += pad_len; + (*last_sg)->length += pad_len; rq->extra_len += pad_len; } @@ -544,9 +543,9 @@ int blk_rq_map_sg(struct request_queue *q, struct request *rq, if (op_is_write(req_op(rq))) memset(q->dma_drain_buffer, 0, q->dma_drain_size); - sg_unmark_end(sg); - sg = sg_next(sg); - sg_set_page(sg, virt_to_page(q->dma_drain_buffer), + sg_unmark_end(*last_sg); + *last_sg = sg_next(*last_sg); + sg_set_page(*last_sg, virt_to_page(q->dma_drain_buffer), q->dma_drain_size, ((unsigned long)q->dma_drain_buffer) & (PAGE_SIZE - 1)); @@ -554,8 +553,8 @@ int blk_rq_map_sg(struct request_queue *q, struct request *rq, rq->extra_len += q->dma_drain_size; } - if (sg) - sg_mark_end(sg); + if (*last_sg) + sg_mark_end(*last_sg); /* * Something must have been wrong if the figured number of @@ -565,7 +564,7 @@ int blk_rq_map_sg(struct request_queue *q, struct request *rq, return nsegs; } -EXPORT_SYMBOL(blk_rq_map_sg); +EXPORT_SYMBOL(__blk_rq_map_sg); static inline int ll_new_hw_segment(struct request *req, struct bio *bio, unsigned int nr_phys_segs) diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 76da162b6ae9..496dc9491026 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -1136,7 +1136,15 @@ static inline unsigned short blk_rq_nr_discard_segments(struct request *rq) return max_t(unsigned short, rq->nr_phys_segments, 1); } -extern int blk_rq_map_sg(struct request_queue *, struct request *, struct scatterlist *); +int 
__blk_rq_map_sg(struct request_queue *q, struct request *rq,
+		struct scatterlist *sglist, struct scatterlist **last_sg);
+static inline int blk_rq_map_sg(struct request_queue *q, struct request *rq,
+		struct scatterlist *sglist)
+{
+	struct scatterlist *last_sg = NULL;
+
+	return __blk_rq_map_sg(q, rq, sglist, &last_sg);
+}
 extern void blk_dump_rq_flags(struct request *, char *);
 extern long nr_blockdev_pages(void);
-- cgit v1.2.3
From cc97923a5bccc776851c242b61015faf288d5c22 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig
Date: Tue, 14 Apr 2020 09:42:24 +0200
Subject: block: move dma drain handling to scsi

Don't burden the common block code with the specifics of the libata DMA draining mechanism. Instead move most of the code to the scsi midlayer. That also means the nr_phys_segments adjustments in the blk-mq fast path can go away entirely, given that SCSI never looks at nr_phys_segments after mapping the request to a scatterlist.

Signed-off-by: Christoph Hellwig
Signed-off-by: Jens Axboe
---
 block/blk-merge.c | 14 --------------
 block/blk-mq.c | 11 -----------
 block/blk-settings.c | 37 -------------------------------------
 drivers/ata/libata-scsi.c | 28 ++++++++++------------------
 drivers/scsi/scsi_lib.c | 39 ++++++++++++++++++++++++++++++++++-----
 include/linux/blkdev.h | 7 -------
 include/linux/libata.h | 2 ++
 include/scsi/scsi_device.h | 3 +++
 include/scsi/scsi_host.h | 7 +++++++
 9 files changed, 56 insertions(+), 92 deletions(-)
(limited to 'include/linux')

diff --git a/block/blk-merge.c b/block/blk-merge.c
index ee618cdb141e..25f5a5e00ee6 100644
--- a/block/blk-merge.c
+++ b/block/blk-merge.c
@@ -539,20 +539,6 @@ int __blk_rq_map_sg(struct request_queue *q, struct request *rq,
 		rq->extra_len += pad_len;
 	}
-	if (q->dma_drain_size && q->dma_drain_needed(rq)) {
-		if (op_is_write(req_op(rq)))
-			memset(q->dma_drain_buffer, 0, q->dma_drain_size);
-
-		sg_unmark_end(*last_sg);
-		*last_sg = sg_next(*last_sg);
-		sg_set_page(*last_sg, virt_to_page(q->dma_drain_buffer),
-			    q->dma_drain_size,
-			    ((unsigned long)q->dma_drain_buffer) &
-			    (PAGE_SIZE - 1));
-		nsegs++;
-		rq->extra_len += q->dma_drain_size;
-	}
-
 	if (*last_sg)
 		sg_mark_end(*last_sg);
diff --git a/block/blk-mq.c b/block/blk-mq.c
index cf95e8e0881a..2c105cb2a75b 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -667,15 +667,6 @@ void blk_mq_start_request(struct request *rq)
 	blk_add_timer(rq);
 	WRITE_ONCE(rq->state, MQ_RQ_IN_FLIGHT);
-	if (q->dma_drain_size && blk_rq_bytes(rq)) {
-		/*
-		 * Make sure space for the drain appears. We know we can do
-		 * this because max_hw_segments has been adjusted to be one
-		 * fewer than the device can handle.
-		 */
-		rq->nr_phys_segments++;
-	}
-
 #ifdef CONFIG_BLK_DEV_INTEGRITY
 	if (blk_integrity_rq(rq) && req_op(rq) == REQ_OP_WRITE)
 		q->integrity.profile->prepare_fn(rq);
@@ -695,8 +686,6 @@ static void __blk_mq_requeue_request(struct request *rq)
 	if (blk_mq_request_started(rq)) {
 		WRITE_ONCE(rq->state, MQ_RQ_IDLE);
 		rq->rq_flags &= ~RQF_TIMED_OUT;
-		if (q->dma_drain_size && blk_rq_bytes(rq))
-			rq->nr_phys_segments--;
 	}
 }
diff --git a/block/blk-settings.c b/block/blk-settings.c
index 14397b4c4b53..2ab1967b9716 100644
--- a/block/blk-settings.c
+++ b/block/blk-settings.c
@@ -651,43 +651,6 @@ void blk_queue_update_dma_pad(struct request_queue *q, unsigned int mask)
 }
 EXPORT_SYMBOL(blk_queue_update_dma_pad);
-/**
- * blk_queue_dma_drain - Set up a drain buffer for excess dma.
- * @q: the request queue for the device - * @dma_drain_needed: fn which returns non-zero if drain is necessary - * @buf: physically contiguous buffer - * @size: size of the buffer in bytes - * - * Some devices have excess DMA problems and can't simply discard (or - * zero fill) the unwanted piece of the transfer. They have to have a - * real area of memory to transfer it into. The use case for this is - * ATAPI devices in DMA mode. If the packet command causes a transfer - * bigger than the transfer size some HBAs will lock up if there - * aren't DMA elements to contain the excess transfer. What this API - * does is adjust the queue so that the buf is always appended - * silently to the scatterlist. - * - * Note: This routine adjusts max_hw_segments to make room for appending - * the drain buffer. If you call blk_queue_max_segments() after calling - * this routine, you must set the limit to one fewer than your device - * can support otherwise there won't be room for the drain buffer. - */ -int blk_queue_dma_drain(struct request_queue *q, - dma_drain_needed_fn *dma_drain_needed, - void *buf, unsigned int size) -{ - if (queue_max_segments(q) < 2) - return -EINVAL; - /* make room for appending the drain */ - blk_queue_max_segments(q, queue_max_segments(q) - 1); - q->dma_drain_needed = dma_drain_needed; - q->dma_drain_buffer = buf; - q->dma_drain_size = size; - - return 0; -} -EXPORT_SYMBOL_GPL(blk_queue_dma_drain); - /** * blk_queue_segment_boundary - set boundary rules for segment merging * @q: the request queue for the device diff --git a/drivers/ata/libata-scsi.c b/drivers/ata/libata-scsi.c index 36e588d88b95..feb13b8f93d7 100644 --- a/drivers/ata/libata-scsi.c +++ b/drivers/ata/libata-scsi.c @@ -1017,16 +1017,11 @@ void ata_scsi_sdev_config(struct scsi_device *sdev) * RETURNS: * 1 if ; otherwise, 0. 
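 * (i.e. 1 if the ATAPI command may need the drain buffer appended to
 * its scatterlist; otherwise 0.)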
*/ -static int atapi_drain_needed(struct request *rq) +bool ata_scsi_dma_need_drain(struct request *rq) { - if (likely(!blk_rq_is_passthrough(rq))) - return 0; - - if (!blk_rq_bytes(rq) || op_is_write(req_op(rq))) - return 0; - return atapi_cmd_type(scsi_req(rq)->cmd[0]) == ATAPI_MISC; } +EXPORT_SYMBOL_GPL(ata_scsi_dma_need_drain); int ata_scsi_dev_config(struct scsi_device *sdev, struct ata_device *dev) { @@ -1039,21 +1034,21 @@ int ata_scsi_dev_config(struct scsi_device *sdev, struct ata_device *dev) blk_queue_max_hw_sectors(q, dev->max_sectors); if (dev->class == ATA_DEV_ATAPI) { - void *buf; - sdev->sector_size = ATA_SECT_SIZE; /* set DMA padding */ blk_queue_update_dma_pad(q, ATA_DMA_PAD_SZ - 1); - /* configure draining */ - buf = kmalloc(ATAPI_MAX_DRAIN, q->bounce_gfp | GFP_KERNEL); - if (!buf) { + /* make room for appending the drain */ + blk_queue_max_segments(q, queue_max_segments(q) - 1); + + sdev->dma_drain_len = ATAPI_MAX_DRAIN; + sdev->dma_drain_buf = kmalloc(sdev->dma_drain_len, + q->bounce_gfp | GFP_KERNEL); + if (!sdev->dma_drain_buf) { ata_dev_err(dev, "drain buffer allocation failed\n"); return -ENOMEM; } - - blk_queue_dma_drain(q, atapi_drain_needed, buf, ATAPI_MAX_DRAIN); } else { sdev->sector_size = ata_id_logical_sector_size(dev->id); sdev->manage_start_stop = 1; @@ -1135,7 +1130,6 @@ EXPORT_SYMBOL_GPL(ata_scsi_slave_config); void ata_scsi_slave_destroy(struct scsi_device *sdev) { struct ata_port *ap = ata_shost_to_port(sdev->host); - struct request_queue *q = sdev->request_queue; unsigned long flags; struct ata_device *dev; @@ -1152,9 +1146,7 @@ void ata_scsi_slave_destroy(struct scsi_device *sdev) } spin_unlock_irqrestore(ap->lock, flags); - kfree(q->dma_drain_buffer); - q->dma_drain_buffer = NULL; - q->dma_drain_size = 0; + kfree(sdev->dma_drain_buf); } EXPORT_SYMBOL_GPL(ata_scsi_slave_destroy); diff --git a/drivers/scsi/scsi_lib.c b/drivers/scsi/scsi_lib.c index 4e42acbb3f32..88cac92fc153 100644 --- a/drivers/scsi/scsi_lib.c +++ b/drivers/scsi/scsi_lib.c @@ -978,6 +978,14 @@ void scsi_io_completion(struct scsi_cmnd *cmd, unsigned int good_bytes) scsi_io_completion_action(cmd, result); } +static inline bool scsi_cmd_needs_dma_drain(struct scsi_device *sdev, + struct request *rq) +{ + return sdev->dma_drain_len && blk_rq_is_passthrough(rq) && + !op_is_write(req_op(rq)) && + sdev->host->hostt->dma_need_drain(rq); +} + /* * Function: scsi_init_io() * @@ -991,26 +999,47 @@ void scsi_io_completion(struct scsi_cmnd *cmd, unsigned int good_bytes) */ blk_status_t scsi_init_io(struct scsi_cmnd *cmd) { + struct scsi_device *sdev = cmd->device; struct request *rq = cmd->request; + unsigned short nr_segs = blk_rq_nr_phys_segments(rq); + struct scatterlist *last_sg = NULL; blk_status_t ret; + bool need_drain = scsi_cmd_needs_dma_drain(sdev, rq); int count; - if (WARN_ON_ONCE(!blk_rq_nr_phys_segments(rq))) + if (WARN_ON_ONCE(!nr_segs)) return BLK_STS_IOERR; + /* + * Make sure there is space for the drain. The driver must adjust + * max_hw_segments to be prepared for this. + */ + if (need_drain) + nr_segs++; + /* * If sg table allocation fails, requeue request later. */ - if (unlikely(sg_alloc_table_chained(&cmd->sdb.table, - blk_rq_nr_phys_segments(rq), cmd->sdb.table.sgl, - SCSI_INLINE_SG_CNT))) + if (unlikely(sg_alloc_table_chained(&cmd->sdb.table, nr_segs, + cmd->sdb.table.sgl, SCSI_INLINE_SG_CNT))) return BLK_STS_RESOURCE; /* * Next, walk the list, and fill in the addresses and sizes of * each segment. 
*/ - count = blk_rq_map_sg(rq->q, rq, cmd->sdb.table.sgl); + count = __blk_rq_map_sg(rq->q, rq, cmd->sdb.table.sgl, &last_sg); + + if (need_drain) { + sg_unmark_end(last_sg); + last_sg = sg_next(last_sg); + sg_set_buf(last_sg, sdev->dma_drain_buf, sdev->dma_drain_len); + sg_mark_end(last_sg); + + rq->extra_len += sdev->dma_drain_len; + count++; + } + BUG_ON(count > cmd->sdb.table.nents); cmd->sdb.table.nents = count; cmd->sdb.length = blk_rq_payload_bytes(rq); diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 496dc9491026..8e4726bce498 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -288,7 +288,6 @@ struct blk_queue_ctx; typedef blk_qc_t (make_request_fn) (struct request_queue *q, struct bio *bio); struct bio_vec; -typedef int (dma_drain_needed_fn)(struct request *); enum blk_eh_timer_return { BLK_EH_DONE, /* drivers has completed the command */ @@ -397,7 +396,6 @@ struct request_queue { struct rq_qos *rq_qos; make_request_fn *make_request_fn; - dma_drain_needed_fn *dma_drain_needed; const struct blk_mq_ops *mq_ops; @@ -467,8 +465,6 @@ struct request_queue { */ unsigned long nr_requests; /* Max # of requests */ - unsigned int dma_drain_size; - void *dma_drain_buffer; unsigned int dma_pad_mask; unsigned int dma_alignment; @@ -1097,9 +1093,6 @@ extern void disk_stack_limits(struct gendisk *disk, struct block_device *bdev, sector_t offset); extern void blk_queue_stack_limits(struct request_queue *t, struct request_queue *b); extern void blk_queue_update_dma_pad(struct request_queue *, unsigned int); -extern int blk_queue_dma_drain(struct request_queue *q, - dma_drain_needed_fn *dma_drain_needed, - void *buf, unsigned int size); extern void blk_queue_segment_boundary(struct request_queue *, unsigned long); extern void blk_queue_virt_boundary(struct request_queue *, unsigned long); extern void blk_queue_dma_alignment(struct request_queue *, int); diff --git a/include/linux/libata.h b/include/linux/libata.h index cffa4714bfa8..af832852e620 100644 --- a/include/linux/libata.h +++ b/include/linux/libata.h @@ -1092,6 +1092,7 @@ extern int ata_scsi_ioctl(struct scsi_device *dev, unsigned int cmd, #define ATA_SCSI_COMPAT_IOCTL /* empty */ #endif extern int ata_scsi_queuecmd(struct Scsi_Host *h, struct scsi_cmnd *cmd); +bool ata_scsi_dma_need_drain(struct request *rq); extern int ata_sas_scsi_ioctl(struct ata_port *ap, struct scsi_device *dev, unsigned int cmd, void __user *arg); extern bool ata_link_online(struct ata_link *link); @@ -1387,6 +1388,7 @@ extern struct device_attribute *ata_common_sdev_attrs[]; .ioctl = ata_scsi_ioctl, \ ATA_SCSI_COMPAT_IOCTL \ .queuecommand = ata_scsi_queuecmd, \ + .dma_need_drain = ata_scsi_dma_need_drain, \ .can_queue = ATA_DEF_QUEUE, \ .tag_alloc_policy = BLK_TAG_ALLOC_RR, \ .this_id = ATA_SHT_THIS_ID, \ diff --git a/include/scsi/scsi_device.h b/include/scsi/scsi_device.h index c3cba2aaf934..bc5909033d13 100644 --- a/include/scsi/scsi_device.h +++ b/include/scsi/scsi_device.h @@ -229,6 +229,9 @@ struct scsi_device { struct scsi_device_handler *handler; void *handler_data; + size_t dma_drain_len; + void *dma_drain_buf; + unsigned char access_state; struct mutex state_mutex; enum scsi_device_state sdev_state; diff --git a/include/scsi/scsi_host.h b/include/scsi/scsi_host.h index 822e8cda8d9b..46ef8cccc982 100644 --- a/include/scsi/scsi_host.h +++ b/include/scsi/scsi_host.h @@ -270,6 +270,13 @@ struct scsi_host_template { */ int (* map_queues)(struct Scsi_Host *shost); + /* + * Check if scatterlists need to be padded for DMA 
draining. + * + * Status: OPTIONAL + */ + bool (* dma_need_drain)(struct request *rq); + /* * This function determines the BIOS parameters for a given * harddisk. These tend to be numbers that are made up by -- cgit v1.2.3 From bdf8710d69f82ee6fd41b0166300c3306898b3c1 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 14 Apr 2020 09:42:25 +0200 Subject: block: move dma_pad handling from blk_rq_map_sg into the callers There are only two callers of blk_rq_map_sg/__blk_rq_map_sg that set the dma_pad value in the queue. Move the handling into those callers instead of burdening the common code, and move the ->extra_len field from struct request to struct scsi_cmnd. Signed-off-by: Christoph Hellwig Signed-off-by: Jens Axboe --- block/blk-core.c | 1 - block/blk-merge.c | 8 -------- block/blk-mq.c | 1 - drivers/ata/libata-scsi.c | 2 +- drivers/ide/ide-io.c | 7 +++++-- drivers/scsi/scsi_lib.c | 10 +++++++++- include/linux/blkdev.h | 2 -- include/scsi/scsi_cmnd.h | 1 + 8 files changed, 16 insertions(+), 16 deletions(-) (limited to 'include/linux') diff --git a/block/blk-core.c b/block/blk-core.c index 7e4a1da0715e..311596d5dbc4 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -1638,7 +1638,6 @@ int blk_rq_prep_clone(struct request *rq, struct request *rq_src, } rq->nr_phys_segments = rq_src->nr_phys_segments; rq->ioprio = rq_src->ioprio; - rq->extra_len = rq_src->extra_len; return 0; diff --git a/block/blk-merge.c b/block/blk-merge.c index 25f5a5e00ee6..c49eb3bdd0be 100644 --- a/block/blk-merge.c +++ b/block/blk-merge.c @@ -531,14 +531,6 @@ int __blk_rq_map_sg(struct request_queue *q, struct request *rq, else if (rq->bio) nsegs = __blk_bios_map_sg(q, rq->bio, sglist, last_sg); - if (blk_rq_bytes(rq) && (blk_rq_bytes(rq) & q->dma_pad_mask)) { - unsigned int pad_len = - (q->dma_pad_mask & ~blk_rq_bytes(rq)) + 1; - - (*last_sg)->length += pad_len; - rq->extra_len += pad_len; - } - if (*last_sg) sg_mark_end(*last_sg); diff --git a/block/blk-mq.c b/block/blk-mq.c index 2c105cb2a75b..71d0894ce1c5 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -318,7 +318,6 @@ static struct request *blk_mq_rq_ctx_init(struct blk_mq_alloc_data *data, rq->nr_integrity_segments = 0; #endif /* tag was already set */ - rq->extra_len = 0; WRITE_ONCE(rq->deadline, 0); rq->timeout = 0; diff --git a/drivers/ata/libata-scsi.c b/drivers/ata/libata-scsi.c index feb13b8f93d7..435781a16875 100644 --- a/drivers/ata/libata-scsi.c +++ b/drivers/ata/libata-scsi.c @@ -649,7 +649,7 @@ static void ata_qc_set_pc_nbytes(struct ata_queued_cmd *qc) { struct scsi_cmnd *scmd = qc->scsicmd; - qc->extrabytes = scmd->request->extra_len; + qc->extrabytes = scmd->extra_len; qc->nbytes = scsi_bufflen(scmd) + qc->extrabytes; } diff --git a/drivers/ide/ide-io.c b/drivers/ide/ide-io.c index b137f27a34d5..c31f1d2b3b07 100644 --- a/drivers/ide/ide-io.c +++ b/drivers/ide/ide-io.c @@ -233,10 +233,13 @@ static ide_startstop_t do_special(ide_drive_t *drive) void ide_map_sg(ide_drive_t *drive, struct ide_cmd *cmd) { ide_hwif_t *hwif = drive->hwif; - struct scatterlist *sg = hwif->sg_table; + struct scatterlist *sg = hwif->sg_table, *last_sg = NULL; struct request *rq = cmd->rq; - cmd->sg_nents = blk_rq_map_sg(drive->queue, rq, sg); + cmd->sg_nents = __blk_rq_map_sg(drive->queue, rq, sg, &last_sg); + if (blk_rq_bytes(rq) && (blk_rq_bytes(rq) & rq->q->dma_pad_mask)) + last_sg->length += + (rq->q->dma_pad_mask & ~blk_rq_bytes(rq)) + 1; } EXPORT_SYMBOL_GPL(ide_map_sg); diff --git a/drivers/scsi/scsi_lib.c b/drivers/scsi/scsi_lib.c index 
88cac92fc153..0a73230a8f16 100644 --- a/drivers/scsi/scsi_lib.c +++ b/drivers/scsi/scsi_lib.c @@ -1030,13 +1030,21 @@ blk_status_t scsi_init_io(struct scsi_cmnd *cmd) */ count = __blk_rq_map_sg(rq->q, rq, cmd->sdb.table.sgl, &last_sg); + if (blk_rq_bytes(rq) & rq->q->dma_pad_mask) { + unsigned int pad_len = + (rq->q->dma_pad_mask & ~blk_rq_bytes(rq)) + 1; + + last_sg->length += pad_len; + cmd->extra_len += pad_len; + } + if (need_drain) { sg_unmark_end(last_sg); last_sg = sg_next(last_sg); sg_set_buf(last_sg, sdev->dma_drain_buf, sdev->dma_drain_len); sg_mark_end(last_sg); - rq->extra_len += sdev->dma_drain_len; + cmd->extra_len += sdev->dma_drain_len; count++; } diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 8e4726bce498..f00bd4042295 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -224,8 +224,6 @@ struct request { unsigned short write_hint; unsigned short ioprio; - unsigned int extra_len; /* length of alignment and padding */ - enum mq_rq_state state; refcount_t ref; diff --git a/include/scsi/scsi_cmnd.h b/include/scsi/scsi_cmnd.h index 80ac89e47b47..f93c0b800790 100644 --- a/include/scsi/scsi_cmnd.h +++ b/include/scsi/scsi_cmnd.h @@ -142,6 +142,7 @@ struct scsi_cmnd { unsigned long state; /* Command completion state */ unsigned char tag; /* SCSI-II queued command tag */ + unsigned int extra_len; /* length of alignment and padding */ }; /* -- cgit v1.2.3 From 66648766ef38d8d6c48ded2f59cf98420aec2cdb Mon Sep 17 00:00:00 2001 From: Jimmy Assarsson Date: Thu, 2 Apr 2020 19:25:07 +0200 Subject: mm: Remove MPX leftovers Remove MPX leftovers in generic code. Fixes: 45fc24e89b7c ("x86/mpx: remove MPX from arch/x86") Signed-off-by: Jimmy Assarsson Signed-off-by: Borislav Petkov Acked-by: Dave Hansen Link: https://lkml.kernel.org/r/20200402172507.2786-1-jimmyassarsson@gmail.com --- fs/proc/task_mmu.c | 3 --- include/linux/mm.h | 7 ------- 2 files changed, 10 deletions(-) (limited to 'include/linux') diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c index 8d382d4ec067..e12ad2e87942 100644 --- a/fs/proc/task_mmu.c +++ b/fs/proc/task_mmu.c @@ -622,9 +622,6 @@ static void show_smap_vma_flags(struct seq_file *m, struct vm_area_struct *vma) [ilog2(VM_GROWSDOWN)] = "gd", [ilog2(VM_PFNMAP)] = "pf", [ilog2(VM_DENYWRITE)] = "dw", -#ifdef CONFIG_X86_INTEL_MPX - [ilog2(VM_MPX)] = "mp", -#endif [ilog2(VM_LOCKED)] = "lo", [ilog2(VM_IO)] = "io", [ilog2(VM_SEQ_READ)] = "sr", diff --git a/include/linux/mm.h b/include/linux/mm.h index 5a323422d783..e1882eec1752 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -329,13 +329,6 @@ extern unsigned int kobjsize(const void *objp); # define VM_MAPPED_COPY VM_ARCH_1 /* T if mapped copy of data (nommu mmap) */ #endif -#if defined(CONFIG_X86_INTEL_MPX) -/* MPX specific bounds table or bounds directory */ -# define VM_MPX VM_HIGH_ARCH_4 -#else -# define VM_MPX VM_NONE -#endif - #ifndef VM_GROWSUP # define VM_GROWSUP VM_NONE #endif -- cgit v1.2.3 From 0a32f1ff2a2e41404deaba5fb32f8a0d640c0974 Mon Sep 17 00:00:00 2001 From: Michael Walle Date: Mon, 20 Apr 2020 20:21:11 +0200 Subject: net: phy: broadcom: add helper to write/read RDB registers RDB (Register Data Base) registers are used on newer Broadcom PHYs. Add helper to read, write and modify these registers. Signed-off-by: Michael Walle Reviewed-by: Florian Fainelli Signed-off-by: David S. 
Miller --- drivers/net/phy/bcm-phy-lib.c | 80 +++++++++++++++++++++++++++++++++++++++++++ drivers/net/phy/bcm-phy-lib.h | 9 +++++ include/linux/brcmphy.h | 3 ++ 3 files changed, 92 insertions(+) (limited to 'include/linux') diff --git a/drivers/net/phy/bcm-phy-lib.c b/drivers/net/phy/bcm-phy-lib.c index e77b274a09fd..d5f9a2701989 100644 --- a/drivers/net/phy/bcm-phy-lib.c +++ b/drivers/net/phy/bcm-phy-lib.c @@ -155,6 +155,86 @@ int bcm_phy_write_shadow(struct phy_device *phydev, u16 shadow, } EXPORT_SYMBOL_GPL(bcm_phy_write_shadow); +int __bcm_phy_read_rdb(struct phy_device *phydev, u16 rdb) +{ + int val; + + val = __phy_write(phydev, MII_BCM54XX_RDB_ADDR, rdb); + if (val < 0) + return val; + + return __phy_read(phydev, MII_BCM54XX_RDB_DATA); +} +EXPORT_SYMBOL_GPL(__bcm_phy_read_rdb); + +int bcm_phy_read_rdb(struct phy_device *phydev, u16 rdb) +{ + int ret; + + phy_lock_mdio_bus(phydev); + ret = __bcm_phy_read_rdb(phydev, rdb); + phy_unlock_mdio_bus(phydev); + + return ret; +} +EXPORT_SYMBOL_GPL(bcm_phy_read_rdb); + +int __bcm_phy_write_rdb(struct phy_device *phydev, u16 rdb, u16 val) +{ + int ret; + + ret = __phy_write(phydev, MII_BCM54XX_RDB_ADDR, rdb); + if (ret < 0) + return ret; + + return __phy_write(phydev, MII_BCM54XX_RDB_DATA, val); +} +EXPORT_SYMBOL_GPL(__bcm_phy_write_rdb); + +int bcm_phy_write_rdb(struct phy_device *phydev, u16 rdb, u16 val) +{ + int ret; + + phy_lock_mdio_bus(phydev); + ret = __bcm_phy_write_rdb(phydev, rdb, val); + phy_unlock_mdio_bus(phydev); + + return ret; +} +EXPORT_SYMBOL_GPL(bcm_phy_write_rdb); + +int __bcm_phy_modify_rdb(struct phy_device *phydev, u16 rdb, u16 mask, u16 set) +{ + int new, ret; + + ret = __phy_write(phydev, MII_BCM54XX_RDB_ADDR, rdb); + if (ret < 0) + return ret; + + ret = __phy_read(phydev, MII_BCM54XX_RDB_DATA); + if (ret < 0) + return ret; + + new = (ret & ~mask) | set; + if (new == ret) + return 0; + + return __phy_write(phydev, MII_BCM54XX_RDB_DATA, new); +} +EXPORT_SYMBOL_GPL(__bcm_phy_modify_rdb); + +int bcm_phy_modify_rdb(struct phy_device *phydev, u16 rdb, u16 mask, u16 set) +{ + int ret; + + phy_lock_mdio_bus(phydev); + ret = __bcm_phy_modify_rdb(phydev, rdb, mask, set); + phy_unlock_mdio_bus(phydev); + + return ret; +} +EXPORT_SYMBOL_GPL(bcm_phy_modify_rdb); + int bcm_phy_enable_apd(struct phy_device *phydev, bool dll_pwr_down) { int val; diff --git a/drivers/net/phy/bcm-phy-lib.h b/drivers/net/phy/bcm-phy-lib.h index 129df819be8c..4d3de91cda6c 100644 --- a/drivers/net/phy/bcm-phy-lib.h +++ b/drivers/net/phy/bcm-phy-lib.h @@ -48,6 +48,15 @@ int bcm_phy_write_shadow(struct phy_device *phydev, u16 shadow, u16 val); int bcm_phy_read_shadow(struct phy_device *phydev, u16 shadow); +int __bcm_phy_write_rdb(struct phy_device *phydev, u16 rdb, u16 val); +int bcm_phy_write_rdb(struct phy_device *phydev, u16 rdb, u16 val); +int __bcm_phy_read_rdb(struct phy_device *phydev, u16 rdb); +int bcm_phy_read_rdb(struct phy_device *phydev, u16 rdb); +int __bcm_phy_modify_rdb(struct phy_device *phydev, u16 rdb, u16 mask, + u16 set); +int bcm_phy_modify_rdb(struct phy_device *phydev, u16 rdb, u16 mask, + u16 set); + int bcm_phy_ack_intr(struct phy_device *phydev); int bcm_phy_config_intr(struct phy_device *phydev); diff --git a/include/linux/brcmphy.h b/include/linux/brcmphy.h index 7e1d857c8468..897b69309964 100644 --- a/include/linux/brcmphy.h +++ b/include/linux/brcmphy.h @@ -115,6 +115,9 @@ #define MII_BCM54XX_SHD_VAL(x) ((x & 0x1f) << 10) #define MII_BCM54XX_SHD_DATA(x) ((x & 0x3ff) << 0) +#define MII_BCM54XX_RDB_ADDR 0x1e +#define 
MII_BCM54XX_RDB_DATA 0x1f + /* * AUXILIARY CONTROL SHADOW ACCESS REGISTERS. (PHY REG 0x18) */ -- cgit v1.2.3
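Usage of the new accessors is a single call for PHY drivers: the helpers latch the register number into MII_BCM54XX_RDB_ADDR (0x1e) and then transfer the value through MII_BCM54XX_RDB_DATA (0x1f), holding the MDIO bus lock across the two-step sequence. A minimal sketch of a locked read-modify-write (illustrative only; the offset 0x012 and BIT(2) correspond to the BCM54140 "spare control 1" register and its link speed LED mode bit used by the next patch):

#include <linux/bits.h>
#include <linux/phy.h>

#include "bcm-phy-lib.h"

/* Illustrative sketch, not from the patch: enable the link speed LED
 * mode bit via a locked RDB read-modify-write. */
static int example_enable_led_mode(struct phy_device *phydev)
{
	/* mask = bits to clear, set = bits to set; the helper skips the
	 * final write when the value is unchanged. */
	return bcm_phy_modify_rdb(phydev, 0x012, 0, BIT(2));
}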
From 6937602ed3f9ebd46ed6a6b5e609c0ae4ed99008 Mon Sep 17 00:00:00 2001 From: Michael Walle Date: Mon, 20 Apr 2020 20:21:12 +0200 Subject: net: phy: add Broadcom BCM54140 support The Broadcom BCM54140 is a Quad SGMII/QSGMII Copper/Fiber Gigabit Ethernet transceiver. This also adds support for tunables to set and get downshift and energy detect auto power-down. The PHY has four ports and each port has its own PHY address. There are per-port registers as well as global registers. Unfortunately, the global registers can only be accessed by reading and writing from/to the PHY address of the first port. Further, there is no way to find out what port you actually are by just reading the per-port registers. We therefore have to scan the bus on the PHY probe to determine the port and thus what address we need to access the global registers. Signed-off-by: Michael Walle Reviewed-by: Andrew Lunn Signed-off-by: David S. Miller --- drivers/net/phy/Kconfig | 10 + drivers/net/phy/Makefile | 1 + drivers/net/phy/bcm54140.c | 481 +++++++++++++++++++++++++++++++++++++++++++++ include/linux/brcmphy.h | 1 + 4 files changed, 493 insertions(+) create mode 100644 drivers/net/phy/bcm54140.c (limited to 'include/linux') diff --git a/drivers/net/phy/Kconfig b/drivers/net/phy/Kconfig index 3fa33d27eeba..cb7936b577de 100644 --- a/drivers/net/phy/Kconfig +++ b/drivers/net/phy/Kconfig @@ -346,6 +346,16 @@ config BROADCOM_PHY Currently supports the BCM5411, BCM5421, BCM5461, BCM54616S, BCM5464, BCM5481, BCM54810 and BCM5482 PHYs. +config BCM54140_PHY + tristate "Broadcom BCM54140 PHY" + depends on PHYLIB + select BCM_NET_PHYLIB + help + Support the Broadcom BCM54140 Quad SGMII/QSGMII PHY. + + This driver also supports the hardware monitoring of this PHY and + exposes voltage and temperature sensors. + config BCM84881_PHY tristate "Broadcom BCM84881 PHY" depends on PHYLIB diff --git a/drivers/net/phy/Makefile b/drivers/net/phy/Makefile index 2f5c7093a65b..cd345b75d127 100644 --- a/drivers/net/phy/Makefile +++ b/drivers/net/phy/Makefile @@ -68,6 +68,7 @@ obj-$(CONFIG_BCM87XX_PHY) += bcm87xx.o obj-$(CONFIG_BCM_CYGNUS_PHY) += bcm-cygnus.o obj-$(CONFIG_BCM_NET_PHYLIB) += bcm-phy-lib.o obj-$(CONFIG_BROADCOM_PHY) += broadcom.o +obj-$(CONFIG_BCM54140_PHY) += bcm54140.o obj-$(CONFIG_BCM84881_PHY) += bcm84881.o obj-$(CONFIG_CICADA_PHY) += cicada.o obj-$(CONFIG_CORTINA_PHY) += cortina.o diff --git a/drivers/net/phy/bcm54140.c b/drivers/net/phy/bcm54140.c new file mode 100644 index 000000000000..0eeb60de67f8 --- /dev/null +++ b/drivers/net/phy/bcm54140.c @@ -0,0 +1,481 @@ +// SPDX-License-Identifier: GPL-2.0+ +/* Broadcom BCM54140 Quad SGMII/QSGMII Copper/Fiber Gigabit PHY + * + * Copyright (c) 2020 Michael Walle + */ + +#include +#include +#include +#include + +#include "bcm-phy-lib.h" + +/* RDB per-port registers + */ +#define BCM54140_RDB_ISR 0x00a /* interrupt status */ +#define BCM54140_RDB_IMR 0x00b /* interrupt mask */ +#define BCM54140_RDB_INT_LINK BIT(1) /* link status changed */ +#define BCM54140_RDB_INT_SPEED BIT(2) /* link speed change */ +#define BCM54140_RDB_INT_DUPLEX BIT(3) /* duplex mode changed */ +#define BCM54140_RDB_SPARE1 0x012 /* spare control 1 */ +#define BCM54140_RDB_SPARE1_LSLM BIT(2) /* link speed LED mode */ +#define BCM54140_RDB_SPARE2 0x014 /* spare control 2 */ +#define BCM54140_RDB_SPARE2_WS_RTRY_DIS BIT(8) /* wirespeed retry disable */ +#define BCM54140_RDB_SPARE2_WS_RTRY_LIMIT GENMASK(4, 2) /* retry limit */ +#define BCM54140_RDB_SPARE3 0x015 /* spare control 3 */ +#define BCM54140_RDB_SPARE3_BIT0 BIT(0) +#define BCM54140_RDB_LED_CTRL 0x019 /* LED control */ +#define BCM54140_RDB_LED_CTRL_ACTLINK0 BIT(4) +#define BCM54140_RDB_LED_CTRL_ACTLINK1 BIT(8) +#define BCM54140_RDB_C_APWR 0x01a /* auto power down control */ +#define BCM54140_RDB_C_APWR_SINGLE_PULSE BIT(8) /* single pulse */ +#define BCM54140_RDB_C_APWR_APD_MODE_DIS 0 /* APD disable */ +#define BCM54140_RDB_C_APWR_APD_MODE_EN 1 /* APD enable */ +#define BCM54140_RDB_C_APWR_APD_MODE_DIS2 2 /* APD disable */ +#define BCM54140_RDB_C_APWR_APD_MODE_EN_ANEG 3 /* APD enable w/ aneg */ +#define BCM54140_RDB_C_APWR_APD_MODE_MASK GENMASK(6, 5) +#define BCM54140_RDB_C_APWR_SLP_TIM_MASK BIT(4) /* sleep timer */ +#define BCM54140_RDB_C_APWR_SLP_TIM_2_7 0 /* 2.7s */ +#define BCM54140_RDB_C_APWR_SLP_TIM_5_4 1 /* 5.4s */ +#define BCM54140_RDB_C_PWR 0x02a /* copper power control */ +#define BCM54140_RDB_C_PWR_ISOLATE BIT(5) /* super isolate mode */ +#define BCM54140_RDB_C_MISC_CTRL 0x02f /* misc copper control */ +#define BCM54140_RDB_C_MISC_CTRL_WS_EN BIT(4) /* wirespeed enable */ + /* RDB global registers */ +#define BCM54140_RDB_TOP_IMR 0x82d /* interrupt mask */ +#define BCM54140_RDB_TOP_IMR_PORT0 BIT(4) +#define BCM54140_RDB_TOP_IMR_PORT1 BIT(5) +#define BCM54140_RDB_TOP_IMR_PORT2 BIT(6) +#define BCM54140_RDB_TOP_IMR_PORT3 BIT(7) + +#define BCM54140_DEFAULT_DOWNSHIFT 5 +#define BCM54140_MAX_DOWNSHIFT 9 + +struct bcm54140_priv { + int port; + int base_addr; +}; + +static int bcm54140_base_read_rdb(struct phy_device *phydev, u16 rdb) +{ + struct bcm54140_priv *priv = phydev->priv; + struct mii_bus *bus = phydev->mdio.bus; + int ret; + + mutex_lock(&bus->mdio_lock); + ret = __mdiobus_write(bus, priv->base_addr, MII_BCM54XX_RDB_ADDR, rdb); + if (ret < 0) + goto out; + + ret = __mdiobus_read(bus, priv->base_addr,
MII_BCM54XX_RDB_DATA); + +out: + mutex_unlock(&bus->mdio_lock); + return ret; +} + +static int bcm54140_base_write_rdb(struct phy_device *phydev, + u16 rdb, u16 val) +{ + struct bcm54140_priv *priv = phydev->priv; + struct mii_bus *bus = phydev->mdio.bus; + int ret; + + mutex_lock(&bus->mdio_lock); + ret = __mdiobus_write(bus, priv->base_addr, MII_BCM54XX_RDB_ADDR, rdb); + if (ret < 0) + goto out; + + ret = __mdiobus_write(bus, priv->base_addr, MII_BCM54XX_RDB_DATA, val); + +out: + mutex_unlock(&bus->mdio_lock); + return ret; +} + +/* Under some circumstances a core PLL may not lock, this will then prevent + * a successful link establishment. Restart the PLL after the voltages are + * stable to workaround this issue. + */ +static int bcm54140_b0_workaround(struct phy_device *phydev) +{ + int spare3; + int ret; + + spare3 = bcm_phy_read_rdb(phydev, BCM54140_RDB_SPARE3); + if (spare3 < 0) + return spare3; + + spare3 &= ~BCM54140_RDB_SPARE3_BIT0; + + ret = bcm_phy_write_rdb(phydev, BCM54140_RDB_SPARE3, spare3); + if (ret) + return ret; + + ret = phy_modify(phydev, MII_BMCR, 0, BMCR_PDOWN); + if (ret) + return ret; + + ret = phy_modify(phydev, MII_BMCR, BMCR_PDOWN, 0); + if (ret) + return ret; + + spare3 |= BCM54140_RDB_SPARE3_BIT0; + + return bcm_phy_write_rdb(phydev, BCM54140_RDB_SPARE3, spare3); +} + +/* The BCM54140 is a quad PHY where only the first port has access to the + * global register. Thus we need to find out its PHY address. + * + */ +static int bcm54140_get_base_addr_and_port(struct phy_device *phydev) +{ + struct bcm54140_priv *priv = phydev->priv; + struct mii_bus *bus = phydev->mdio.bus; + int addr, min_addr, max_addr; + int step = 1; + u32 phy_id; + int tmp; + + min_addr = phydev->mdio.addr; + max_addr = phydev->mdio.addr; + addr = phydev->mdio.addr; + + /* We scan forward and backwards and look for PHYs which have the + * same phy_id like we do. Step 1 will scan forward, step 2 + * backwards. Once we are finished, we have a min_addr and + * max_addr which resembles the range of PHY addresses of the same + * type of PHY. There is one caveat; there may be many PHYs of + * the same type, but we know that each PHY takes exactly 4 + * consecutive addresses. Therefore we can deduce our offset + * to the base address of this quad PHY. + */ + + while (1) { + if (step == 3) { + break; + } else if (step == 1) { + max_addr = addr; + addr++; + } else { + min_addr = addr; + addr--; + } + + if (addr < 0 || addr >= PHY_MAX_ADDR) { + addr = phydev->mdio.addr; + step++; + continue; + } + + /* read the PHY id */ + tmp = mdiobus_read(bus, addr, MII_PHYSID1); + if (tmp < 0) + return tmp; + phy_id = tmp << 16; + tmp = mdiobus_read(bus, addr, MII_PHYSID2); + if (tmp < 0) + return tmp; + phy_id |= tmp; + + /* see if it is still the same PHY */ + if ((phy_id & phydev->drv->phy_id_mask) != + (phydev->drv->phy_id & phydev->drv->phy_id_mask)) { + addr = phydev->mdio.addr; + step++; + } + } + + /* The range we get should be a multiple of four. Please note that both + * the min_addr and max_addr are inclusive. So we have to add one if we + * subtract them. 
+ */ + if ((max_addr - min_addr + 1) % 4) { + dev_err(&phydev->mdio.dev, + "Detected Quad PHY IDs %d..%d doesn't make sense.\n", + min_addr, max_addr); + return -EINVAL; + } + + priv->port = (phydev->mdio.addr - min_addr) % 4; + priv->base_addr = phydev->mdio.addr - priv->port; + + return 0; +} + +static int bcm54140_probe(struct phy_device *phydev) +{ + struct bcm54140_priv *priv; + int ret; + + priv = devm_kzalloc(&phydev->mdio.dev, sizeof(*priv), GFP_KERNEL); + if (!priv) + return -ENOMEM; + + phydev->priv = priv; + + ret = bcm54140_get_base_addr_and_port(phydev); + if (ret) + return ret; + + phydev_dbg(phydev, "probed (port %d, base PHY address %d)\n", + priv->port, priv->base_addr); + + return 0; +} + +static int bcm54140_config_init(struct phy_device *phydev) +{ + u16 reg = 0xffff; + int ret; + + /* Apply hardware errata */ + ret = bcm54140_b0_workaround(phydev); + if (ret) + return ret; + + /* Unmask events we are interested in. */ + reg &= ~(BCM54140_RDB_INT_DUPLEX | + BCM54140_RDB_INT_SPEED | + BCM54140_RDB_INT_LINK); + ret = bcm_phy_write_rdb(phydev, BCM54140_RDB_IMR, reg); + if (ret) + return ret; + + /* LED1=LINKSPD[1], LED2=LINKSPD[2], LED3=LINK/ACTIVITY */ + ret = bcm_phy_modify_rdb(phydev, BCM54140_RDB_SPARE1, + 0, BCM54140_RDB_SPARE1_LSLM); + if (ret) + return ret; + + ret = bcm_phy_modify_rdb(phydev, BCM54140_RDB_LED_CTRL, + 0, BCM54140_RDB_LED_CTRL_ACTLINK0); + if (ret) + return ret; + + /* disable super isolate mode */ + return bcm_phy_modify_rdb(phydev, BCM54140_RDB_C_PWR, + BCM54140_RDB_C_PWR_ISOLATE, 0); +} + +int bcm54140_did_interrupt(struct phy_device *phydev) +{ + int ret; + + ret = bcm_phy_read_rdb(phydev, BCM54140_RDB_ISR); + + return (ret < 0) ? 0 : ret; +} + +int bcm54140_ack_intr(struct phy_device *phydev) +{ + int reg; + + /* clear pending interrupts */ + reg = bcm_phy_read_rdb(phydev, BCM54140_RDB_ISR); + if (reg < 0) + return reg; + + return 0; +} + +int bcm54140_config_intr(struct phy_device *phydev) +{ + struct bcm54140_priv *priv = phydev->priv; + static const u16 port_to_imr_bit[] = { + BCM54140_RDB_TOP_IMR_PORT0, BCM54140_RDB_TOP_IMR_PORT1, + BCM54140_RDB_TOP_IMR_PORT2, BCM54140_RDB_TOP_IMR_PORT3, + }; + int reg; + + if (priv->port >= ARRAY_SIZE(port_to_imr_bit)) + return -EINVAL; + + reg = bcm54140_base_read_rdb(phydev, BCM54140_RDB_TOP_IMR); + if (reg < 0) + return reg; + + if (phydev->interrupts == PHY_INTERRUPT_ENABLED) + reg &= ~port_to_imr_bit[priv->port]; + else + reg |= port_to_imr_bit[priv->port]; + + return bcm54140_base_write_rdb(phydev, BCM54140_RDB_TOP_IMR, reg); +} + +static int bcm54140_get_downshift(struct phy_device *phydev, u8 *data) +{ + int val; + + val = bcm_phy_read_rdb(phydev, BCM54140_RDB_C_MISC_CTRL); + if (val < 0) + return val; + + if (!(val & BCM54140_RDB_C_MISC_CTRL_WS_EN)) { + *data = DOWNSHIFT_DEV_DISABLE; + return 0; + } + + val = bcm_phy_read_rdb(phydev, BCM54140_RDB_SPARE2); + if (val < 0) + return val; + + if (val & BCM54140_RDB_SPARE2_WS_RTRY_DIS) + *data = 1; + else + *data = FIELD_GET(BCM54140_RDB_SPARE2_WS_RTRY_LIMIT, val) + 2; + + return 0; +} + +static int bcm54140_set_downshift(struct phy_device *phydev, u8 cnt) +{ + u16 mask, set; + int ret; + + if (cnt > BCM54140_MAX_DOWNSHIFT && cnt != DOWNSHIFT_DEV_DEFAULT_COUNT) + return -EINVAL; + + if (!cnt) + return bcm_phy_modify_rdb(phydev, BCM54140_RDB_C_MISC_CTRL, + BCM54140_RDB_C_MISC_CTRL_WS_EN, 0); + + if (cnt == DOWNSHIFT_DEV_DEFAULT_COUNT) + cnt = BCM54140_DEFAULT_DOWNSHIFT; + + if (cnt == 1) { + mask = 0; + set = BCM54140_RDB_SPARE2_WS_RTRY_DIS; + } else { + 
mask = BCM54140_RDB_SPARE2_WS_RTRY_DIS; + mask |= BCM54140_RDB_SPARE2_WS_RTRY_LIMIT; + set = FIELD_PREP(BCM54140_RDB_SPARE2_WS_RTRY_LIMIT, cnt - 2); + } + ret = bcm_phy_modify_rdb(phydev, BCM54140_RDB_SPARE2, + mask, set); + if (ret) + return ret; + + return bcm_phy_modify_rdb(phydev, BCM54140_RDB_C_MISC_CTRL, + 0, BCM54140_RDB_C_MISC_CTRL_WS_EN); +} + +static int bcm54140_get_edpd(struct phy_device *phydev, u16 *tx_interval) +{ + int val; + + val = bcm_phy_read_rdb(phydev, BCM54140_RDB_C_APWR); + if (val < 0) + return val; + + switch (FIELD_GET(BCM54140_RDB_C_APWR_APD_MODE_MASK, val)) { + case BCM54140_RDB_C_APWR_APD_MODE_DIS: + case BCM54140_RDB_C_APWR_APD_MODE_DIS2: + *tx_interval = ETHTOOL_PHY_EDPD_DISABLE; + break; + case BCM54140_RDB_C_APWR_APD_MODE_EN: + case BCM54140_RDB_C_APWR_APD_MODE_EN_ANEG: + switch (FIELD_GET(BCM54140_RDB_C_APWR_SLP_TIM_MASK, val)) { + case BCM54140_RDB_C_APWR_SLP_TIM_2_7: + *tx_interval = 2700; + break; + case BCM54140_RDB_C_APWR_SLP_TIM_5_4: + *tx_interval = 5400; + break; + } + } + + return 0; +} + +static int bcm54140_set_edpd(struct phy_device *phydev, u16 tx_interval) +{ + u16 mask, set; + + mask = BCM54140_RDB_C_APWR_APD_MODE_MASK; + if (tx_interval == ETHTOOL_PHY_EDPD_DISABLE) + set = FIELD_PREP(BCM54140_RDB_C_APWR_APD_MODE_MASK, + BCM54140_RDB_C_APWR_APD_MODE_DIS); + else + set = FIELD_PREP(BCM54140_RDB_C_APWR_APD_MODE_MASK, + BCM54140_RDB_C_APWR_APD_MODE_EN_ANEG); + + /* enable single pulse mode */ + set |= BCM54140_RDB_C_APWR_SINGLE_PULSE; + + /* set sleep timer */ + mask |= BCM54140_RDB_C_APWR_SLP_TIM_MASK; + switch (tx_interval) { + case ETHTOOL_PHY_EDPD_DFLT_TX_MSECS: + case ETHTOOL_PHY_EDPD_DISABLE: + case 2700: + set |= BCM54140_RDB_C_APWR_SLP_TIM_2_7; + break; + case 5400: + set |= BCM54140_RDB_C_APWR_SLP_TIM_5_4; + break; + default: + return -EINVAL; + } + + return bcm_phy_modify_rdb(phydev, BCM54140_RDB_C_APWR, mask, set); +} + +static int bcm54140_get_tunable(struct phy_device *phydev, + struct ethtool_tunable *tuna, void *data) +{ + switch (tuna->id) { + case ETHTOOL_PHY_DOWNSHIFT: + return bcm54140_get_downshift(phydev, data); + case ETHTOOL_PHY_EDPD: + return bcm54140_get_edpd(phydev, data); + default: + return -EOPNOTSUPP; + } +} + +static int bcm54140_set_tunable(struct phy_device *phydev, + struct ethtool_tunable *tuna, const void *data) +{ + switch (tuna->id) { + case ETHTOOL_PHY_DOWNSHIFT: + return bcm54140_set_downshift(phydev, *(const u8 *)data); + case ETHTOOL_PHY_EDPD: + return bcm54140_set_edpd(phydev, *(const u16 *)data); + default: + return -EOPNOTSUPP; + } +} + +static struct phy_driver bcm54140_drivers[] = { + { + .phy_id = PHY_ID_BCM54140, + .phy_id_mask = 0xfffffff0, + .name = "Broadcom BCM54140", + .features = PHY_GBIT_FEATURES, + .config_init = bcm54140_config_init, + .did_interrupt = bcm54140_did_interrupt, + .ack_interrupt = bcm54140_ack_intr, + .config_intr = bcm54140_config_intr, + .probe = bcm54140_probe, + .suspend = genphy_suspend, + .resume = genphy_resume, + .get_tunable = bcm54140_get_tunable, + .set_tunable = bcm54140_set_tunable, + }, +}; +module_phy_driver(bcm54140_drivers); + +static struct mdio_device_id __maybe_unused bcm54140_tbl[] = { + { PHY_ID_BCM54140, 0xfffffff0 }, + { } +}; + +MODULE_AUTHOR("Michael Walle"); +MODULE_DESCRIPTION("Broadcom BCM54140 PHY driver"); +MODULE_DEVICE_TABLE(mdio, bcm54140_tbl); +MODULE_LICENSE("GPL"); diff --git a/include/linux/brcmphy.h b/include/linux/brcmphy.h index 897b69309964..8be150e69c7c 100644 --- a/include/linux/brcmphy.h +++ b/include/linux/brcmphy.h @@ -25,6 
+25,7 @@ #define PHY_ID_BCM5461 0x002060c0 #define PHY_ID_BCM54612E 0x03625e60 #define PHY_ID_BCM54616S 0x03625d10 +#define PHY_ID_BCM54140 0xae025019 #define PHY_ID_BCM57780 0x03625d90 #define PHY_ID_BCM89610 0x03625cd0 -- cgit v1.2.3 From 38f961e744840db9044af68f4773ae5feae60a89 Mon Sep 17 00:00:00 2001 From: Heiner Kallweit Date: Mon, 20 Apr 2020 23:29:05 +0200 Subject: net: phy: add device-managed devm_mdiobus_register If there's no special ordering requirement for mdiobus_unregister(), then driver code can be simplified by using a device-managed version of mdiobus_register(). A prerequisite is that the bus allocation has been done device-managed too; otherwise mdiobus_free() may be called while the bus is still registered, resulting in a BUG_ON(). Therefore let devm_mdiobus_register() return -EPERM if the bus was allocated non-managed. Signed-off-by: Heiner Kallweit Signed-off-by: David S. Miller --- drivers/net/phy/mdio_bus.c | 8 +++++++- include/linux/phy.h | 17 +++++++++++++++++ 2 files changed, 24 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/drivers/net/phy/mdio_bus.c b/drivers/net/phy/mdio_bus.c index 346e88435d29..26b00af94573 100644 --- a/drivers/net/phy/mdio_bus.c +++ b/drivers/net/phy/mdio_bus.c @@ -170,7 +170,12 @@ EXPORT_SYMBOL(mdiobus_alloc_size); static void _devm_mdiobus_free(struct device *dev, void *res) { - mdiobus_free(*(struct mii_bus **)res); + struct mii_bus *bus = *(struct mii_bus **)res; + + if (bus->is_managed_registered && bus->state == MDIOBUS_REGISTERED) + mdiobus_unregister(bus); + + mdiobus_free(bus); } static int devm_mdiobus_match(struct device *dev, void *res, void *data) @@ -210,6 +215,7 @@ struct mii_bus *devm_mdiobus_alloc_size(struct device *dev, int sizeof_priv) if (bus) { *ptr = bus; devres_add(dev, ptr); + bus->is_managed = 1; } else { devres_free(ptr); } diff --git a/include/linux/phy.h b/include/linux/phy.h index 2432ca463ddc..3941a6bcba10 100644 --- a/include/linux/phy.h +++ b/include/linux/phy.h @@ -241,6 +241,9 @@ struct mii_bus { int (*reset)(struct mii_bus *bus); struct mdio_bus_stats stats[PHY_MAX_ADDR]; + unsigned int is_managed:1; /* is device-managed */ + unsigned int is_managed_registered:1; + /* * A lock to ensure that only one thing can read/write * the MDIO bus at a time @@ -286,6 +289,20 @@ static inline struct mii_bus *mdiobus_alloc(void) int __mdiobus_register(struct mii_bus *bus, struct module *owner); #define mdiobus_register(bus) __mdiobus_register(bus, THIS_MODULE) +static inline int devm_mdiobus_register(struct mii_bus *bus) +{ + int ret; + + if (!bus->is_managed) + return -EPERM; + + ret = mdiobus_register(bus); + if (!ret) + bus->is_managed_registered = 1; + + return ret; +} + void mdiobus_unregister(struct mii_bus *bus); void mdiobus_free(struct mii_bus *bus); struct mii_bus *devm_mdiobus_alloc_size(struct device *dev, int sizeof_priv); -- cgit v1.2.3
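A sketch of the intended use in a bus driver's probe path (illustrative only; example_mdio_probe() and the bus name are made up, not from this patch): allocate the bus with a devm variant so that is_managed is set, then hand it to devm_mdiobus_register(); unregistration and freeing then happen automatically, in the right order, when the driver detaches.

#include <linux/phy.h>

static int example_mdio_probe(struct device *dev)
{
	struct mii_bus *bus;

	/* devm_mdiobus_alloc() marks the bus as device-managed. */
	bus = devm_mdiobus_alloc(dev);
	if (!bus)
		return -ENOMEM;

	bus->name = "example-mdio";
	bus->parent = dev;
	/* ... fill in bus->read, bus->write, bus->id ... */

	/* Returns -EPERM if the bus was not allocated via devm. */
	return devm_mdiobus_register(bus);
}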
From 5972157c2dde11698d7bcfc55621107d97121c87 Mon Sep 17 00:00:00 2001 From: Oleksij Rempel Date: Wed, 22 Apr 2020 11:24:55 +0200 Subject: net: mdio: of: export part of of_mdiobus_register_phy() This function will be needed in the tja11xx driver for secondary PHY support. Signed-off-by: Oleksij Rempel Reviewed-by: Andrew Lunn Signed-off-by: David S. Miller --- drivers/of/of_mdio.c | 73 ++++++++++++++++++++++++++++--------------------- include/linux/of_mdio.h | 11 +++++++- 2 files changed, 52 insertions(+), 32 deletions(-) (limited to 'include/linux') diff --git a/drivers/of/of_mdio.c b/drivers/of/of_mdio.c index 9f982c0627a0..a04afe79529c 100644 --- a/drivers/of/of_mdio.c +++ b/drivers/of/of_mdio.c @@ -60,39 +60,15 @@ static struct mii_timestamper *of_find_mii_timestamper(struct device_node *node) return register_mii_timestamper(arg.np, arg.args[0]); } -static int of_mdiobus_register_phy(struct mii_bus *mdio, - struct device_node *child, u32 addr) +int of_mdiobus_phy_device_register(struct mii_bus *mdio, struct phy_device *phy, + struct device_node *child, u32 addr) { - struct mii_timestamper *mii_ts; - struct phy_device *phy; - bool is_c45; int rc; - u32 phy_id; - - mii_ts = of_find_mii_timestamper(child); - if (IS_ERR(mii_ts)) - return PTR_ERR(mii_ts); - - is_c45 = of_device_is_compatible(child, - "ethernet-phy-ieee802.3-c45"); - - if (!is_c45 && !of_get_phy_id(child, &phy_id)) - phy = phy_device_create(mdio, addr, phy_id, 0, NULL); - else - phy = get_phy_device(mdio, addr, is_c45); - if (IS_ERR(phy)) { - if (mii_ts) - unregister_mii_timestamper(mii_ts); - return PTR_ERR(phy); - } rc = of_irq_get(child, 0); - if (rc == -EPROBE_DEFER) { - if (mii_ts) - unregister_mii_timestamper(mii_ts); - phy_device_free(phy); + if (rc == -EPROBE_DEFER) return rc; - } + if (rc > 0) { phy->irq = rc; mdio->irq[addr] = rc; @@ -117,11 +93,48 @@ static int of_mdiobus_register_phy(struct mii_bus *mdio, /* All data is now stored in the phy struct; * register it */ rc = phy_device_register(phy); + if (rc) { + of_node_put(child); + return rc; + } + + dev_dbg(&mdio->dev, "registered phy %pOFn at address %i\n", + child, addr); + return 0; +} +EXPORT_SYMBOL(of_mdiobus_phy_device_register); + +static int of_mdiobus_register_phy(struct mii_bus *mdio, + struct device_node *child, u32 addr) +{ + struct mii_timestamper *mii_ts; + struct phy_device *phy; + bool is_c45; + int rc; + u32 phy_id; + + mii_ts = of_find_mii_timestamper(child); + if (IS_ERR(mii_ts)) + return PTR_ERR(mii_ts); + + is_c45 = of_device_is_compatible(child, + "ethernet-phy-ieee802.3-c45"); + + if (!is_c45 && !of_get_phy_id(child, &phy_id)) + phy = phy_device_create(mdio, addr, phy_id, 0, NULL); + else + phy = get_phy_device(mdio, addr, is_c45); + if (IS_ERR(phy)) { + if (mii_ts) + unregister_mii_timestamper(mii_ts); + return PTR_ERR(phy); + } + + rc = of_mdiobus_phy_device_register(mdio, phy, child, addr); if (rc) { if (mii_ts) unregister_mii_timestamper(mii_ts); phy_device_free(phy); - of_node_put(child); return rc; } @@ -132,8 +145,6 @@ static int of_mdiobus_register_phy(struct mii_bus *mdio, if (mii_ts) phy->mii_ts = mii_ts; - dev_dbg(&mdio->dev, "registered phy %pOFn at address %i\n", - child, addr); return 0; } diff --git a/include/linux/of_mdio.h b/include/linux/of_mdio.h index 491a2b7e77c1..0f61a4ac6bcf 100644 --- a/include/linux/of_mdio.h +++ b/include/linux/of_mdio.h @@ -30,7 +30,9 @@ extern struct mii_bus *of_mdio_find_bus(struct device_node *mdio_np); extern int of_phy_register_fixed_link(struct device_node *np); extern void of_phy_deregister_fixed_link(struct device_node *np); extern bool of_phy_is_fixed_link(struct device_node *np); - +extern int of_mdiobus_phy_device_register(struct mii_bus *mdio, + struct phy_device *phy, + struct device_node *child, u32 addr); static inline int of_mdio_parse_addr(struct device *dev, const struct device_node *np) @@ -118,6 +120,13 @@ static inline
bool of_phy_is_fixed_link(struct device_node *np) { return false; } + +static inline int of_mdiobus_phy_device_register(struct mii_bus *mdio, + struct phy_device *phy, + struct device_node *child, u32 addr) +{ + return -ENOSYS; +} #endif -- cgit v1.2.3
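The export enables a pattern like the following sketch (illustrative only; the function and parameter names are made up, loosely modeled on the secondary-PHY case mentioned above): create a phy_device by hand, then run the common OF registration steps on it.

#include <linux/err.h>
#include <linux/of_mdio.h>
#include <linux/phy.h>

static int example_register_secondary_phy(struct mii_bus *bus,
					  struct device_node *child,
					  u32 addr, u32 phy_id)
{
	struct phy_device *phy;
	int ret;

	/* Instantiate the device manually instead of scanning the bus. */
	phy = phy_device_create(bus, addr, phy_id, 0, NULL);
	if (IS_ERR(phy))
		return PTR_ERR(phy);

	/* Reuse the now-exported OF path: IRQ lookup from the DT node,
	 * phy_device_register(), debug output. */
	ret = of_mdiobus_phy_device_register(bus, phy, child, addr);
	if (ret)
		phy_device_free(phy);

	return ret;
}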
From 33467ac3c8dc80575e1b158e78d0c30ab33c1b08 Mon Sep 17 00:00:00 2001 From: Loic Pallardy Date: Thu, 16 Apr 2020 19:20:35 -0500 Subject: remoteproc: Add prepare and unprepare ops On some SoC architectures, it is necessary to enable hardware (clocks, buses, regulators, memory regions, etc.) before loading the co-processor firmware. This patch introduces prepare and unprepare ops to execute platform-specific functions before firmware loading and after stop execution. Signed-off-by: Loic Pallardy Signed-off-by: Suman Anna Reviewed-by: Bjorn Andersson Reviewed-by: Mathieu Poirier Link: https://lore.kernel.org/r/20200417002036.24359-2-s-anna@ti.com Signed-off-by: Bjorn Andersson --- drivers/remoteproc/remoteproc_core.c | 15 ++++++++++++++- drivers/remoteproc/remoteproc_internal.h | 16 ++++++++++++++++ include/linux/remoteproc.h | 4 ++++ 3 files changed, 34 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/drivers/remoteproc/remoteproc_core.c b/drivers/remoteproc/remoteproc_core.c index 206363723071..4bd0f45a00c0 100644 --- a/drivers/remoteproc/remoteproc_core.c +++ b/drivers/remoteproc/remoteproc_core.c @@ -1394,12 +1394,19 @@ static int rproc_fw_boot(struct rproc *rproc, const struct firmware *fw) return ret; } + /* Prepare rproc for firmware loading if needed */ + ret = rproc_prepare_device(rproc); + if (ret) { + dev_err(dev, "can't prepare rproc %s: %d\n", rproc->name, ret); + goto disable_iommu; + } + rproc->bootaddr = rproc_get_boot_addr(rproc, fw); /* Load resource table, core dump segment list etc from the firmware */ ret = rproc_parse_fw(rproc, fw); if (ret) - goto disable_iommu; + goto unprepare_rproc; /* reset max_notifyid */ rproc->max_notifyid = -1; @@ -1433,6 +1440,9 @@ clean_up_resources: kfree(rproc->cached_table); rproc->cached_table = NULL; rproc->table_ptr = NULL; +unprepare_rproc: + /* release HW resources if needed */ + rproc_unprepare_device(rproc); disable_iommu: rproc_disable_iommu(rproc); return ret; @@ -1865,6 +1875,9 @@ void rproc_shutdown(struct rproc *rproc) /* clean up all acquired resources */ rproc_resource_cleanup(rproc); + /* release HW resources if needed */ + rproc_unprepare_device(rproc); + rproc_disable_iommu(rproc); /* Free the copy of the resource table */ diff --git a/drivers/remoteproc/remoteproc_internal.h b/drivers/remoteproc/remoteproc_internal.h index 31994715fd43..4ba7cb59d3e8 100644 --- a/drivers/remoteproc/remoteproc_internal.h +++ b/drivers/remoteproc/remoteproc_internal.h @@ -63,6 +63,22 @@ struct resource_table *rproc_elf_find_loaded_rsc_table(struct rproc *rproc, struct rproc_mem_entry * rproc_find_carveout_by_name(struct rproc *rproc, const char *name, ...); +static inline int rproc_prepare_device(struct rproc *rproc) +{ + if (rproc->ops->prepare) + return rproc->ops->prepare(rproc); + + return 0; +} + +static inline int rproc_unprepare_device(struct rproc *rproc) +{ + if (rproc->ops->unprepare) + return rproc->ops->unprepare(rproc); + + return 0; +} + static inline int rproc_fw_sanity_check(struct rproc *rproc, const struct firmware *fw) { diff --git a/include/linux/remoteproc.h b/include/linux/remoteproc.h index ac4082f12e8b..0468be4d02f4 100644 --- a/include/linux/remoteproc.h +++ b/include/linux/remoteproc.h @@ -355,6 +355,8 @@ enum rsc_handling_status { /** * struct rproc_ops - platform-specific device handlers + * @prepare: prepare device for code loading + * @unprepare: unprepare device after stop * @start: power on the device and boot it * @stop: power off the device * @kick: kick a virtqueue (virtqueue id given as a parameter) @@ -373,6 +375,8 @@ enum rsc_handling_status { * panic at least the returned number of milliseconds */ struct rproc_ops { + int (*prepare)(struct rproc *rproc); + int (*unprepare)(struct rproc *rproc); int (*start)(struct rproc *rproc); int (*stop)(struct rproc *rproc); void (*kick)(struct rproc *rproc, int vqid); -- cgit v1.2.3 From 5cc415001bca8fe0e3f0ee6d58a953a314dd9751 Mon Sep 17 00:00:00 2001 From: Vitaly Kuznetsov Date: Mon, 6 Apr 2020 12:41:52 +0200 Subject: Drivers: hv: avoid passing opaque pointer to vmbus_onmessage() vmbus_onmessage() doesn't need the header of the message; it only uses it to get to the payload, so we can pass the pointer to the payload directly. Signed-off-by: Vitaly Kuznetsov Reviewed-by: Michael Kelley Link: https://lore.kernel.org/r/20200406104154.45010-4-vkuznets@redhat.com Signed-off-by: Wei Liu --- drivers/hv/channel_mgmt.c | 7 +------ drivers/hv/vmbus_drv.c | 3 ++- include/linux/hyperv.h | 2 +- 3 files changed, 4 insertions(+), 8 deletions(-) (limited to 'include/linux') diff --git a/drivers/hv/channel_mgmt.c b/drivers/hv/channel_mgmt.c index 501c43c5851d..da93b5f1b143 100644 --- a/drivers/hv/channel_mgmt.c +++ b/drivers/hv/channel_mgmt.c @@ -1363,13 +1363,8 @@ channel_message_table[CHANNELMSG_COUNT] = { * * This is invoked in the vmbus worker thread context. */ -void vmbus_onmessage(void *context) +void vmbus_onmessage(struct vmbus_channel_message_header *hdr) { - struct hv_message *msg = context; - struct vmbus_channel_message_header *hdr; - - hdr = (struct vmbus_channel_message_header *)msg->u.payload; - trace_vmbus_on_message(hdr); /* diff --git a/drivers/hv/vmbus_drv.c b/drivers/hv/vmbus_drv.c index e356ea4752c0..8f08e8273dd8 100644 --- a/drivers/hv/vmbus_drv.c +++ b/drivers/hv/vmbus_drv.c @@ -1035,7 +1035,8 @@ static void vmbus_onmessage_work(struct work_struct *work) ctx = container_of(work, struct onmessage_work_context, work); - vmbus_onmessage(&ctx->msg); + vmbus_onmessage((struct vmbus_channel_message_header *) + &ctx->msg.payload); kfree(ctx); } diff --git a/include/linux/hyperv.h b/include/linux/hyperv.h index 692c89ccf5df..cbd24f4e68d1 100644 --- a/include/linux/hyperv.h +++ b/include/linux/hyperv.h @@ -1017,7 +1017,7 @@ static inline void clear_low_latency_mode(struct vmbus_channel *c) c->low_latency = false; } -void vmbus_onmessage(void *context); +void vmbus_onmessage(struct vmbus_channel_message_header *hdr); int vmbus_request_offers(void); -- cgit v1.2.3 From 8b6a877c060ed6b86878fe66c7c6493a6054cf23 Mon Sep 17 00:00:00 2001 From: "Andrea Parri (Microsoft)" Date: Mon, 6 Apr 2020 02:15:06 +0200 Subject: Drivers: hv: vmbus: Replace the per-CPU channel lists with a global array of channels When Hyper-V sends an interrupt to the guest, the guest has to figure out which channel the interrupt is associated with. Hyper-V sets a bit in a memory page that is shared with the guest, indicating a particular "relid" that the interrupt is associated with. The current Linux code then uses a set of per-CPU linked lists to map a given "relid" to a pointer to a channel structure. This design introduces a synchronization problem if the CPU that Hyper-V will interrupt for a certain channel is changed.
If the interrupt comes on the "old CPU" and the channel was already moved to the per-CPU list of the "new CPU", then the relid -> channel mapping will fail and the interrupt is dropped. Similarly, if the interrupt comes on the new CPU but the channel was not moved to the per-CPU list of the new CPU, then the mapping will fail and the interrupt is dropped. Relids are integers ranging from 0 to 2047. The mapping from relids to channel structures can be done by setting up an array with 2048 entries, each entry being a pointer to a channel structure (hence total size ~16K bytes, which is not a problem). The array is global, so there are no per-CPU linked lists to update. The array can be searched and updated by loading from/storing to the array at the specified index. With no per-CPU data structures, the above mentioned synchronization problem is avoided and the relid2channel() function gets simpler. Suggested-by: Michael Kelley Signed-off-by: Andrea Parri (Microsoft) Link: https://lore.kernel.org/r/20200406001514.19876-4-parri.andrea@gmail.com Reviewed-by: Michael Kelley Signed-off-by: Wei Liu --- drivers/hv/channel_mgmt.c | 179 +++++++++++++++++++++++++++++----------------- drivers/hv/connection.c | 38 +++------- drivers/hv/hv.c | 2 - drivers/hv/hyperv_vmbus.h | 14 ++-- drivers/hv/vmbus_drv.c | 48 ++++++++----- include/linux/hyperv.h | 5 -- 6 files changed, 160 insertions(+), 126 deletions(-) (limited to 'include/linux') diff --git a/drivers/hv/channel_mgmt.c b/drivers/hv/channel_mgmt.c index 0ced32ef2715..bbd0eee4362f 100644 --- a/drivers/hv/channel_mgmt.c +++ b/drivers/hv/channel_mgmt.c @@ -319,7 +319,6 @@ static struct vmbus_channel *alloc_channel(void) init_completion(&channel->rescind_event); INIT_LIST_HEAD(&channel->sc_list); - INIT_LIST_HEAD(&channel->percpu_list); tasklet_init(&channel->callback_event, vmbus_on_event, (unsigned long)channel); @@ -340,23 +339,49 @@ static void free_channel(struct vmbus_channel *channel) kobject_put(&channel->kobj); } -static void percpu_channel_enq(void *arg) +void vmbus_channel_map_relid(struct vmbus_channel *channel) { - struct vmbus_channel *channel = arg; - struct hv_per_cpu_context *hv_cpu - = this_cpu_ptr(hv_context.cpu_context); - - list_add_tail_rcu(&channel->percpu_list, &hv_cpu->chan_list); + if (WARN_ON(channel->offermsg.child_relid >= MAX_CHANNEL_RELIDS)) + return; + /* + * The mapping of the channel's relid is visible from the CPUs that + * execute vmbus_chan_sched() by the time that vmbus_chan_sched() will + * execute: + * + * (a) In the "normal (i.e., not resuming from hibernation)" path, + * the full barrier in smp_store_mb() guarantees that the store + * is propagated to all CPUs before the add_channel_work work + * is queued. In turn, add_channel_work is queued before the + * channel's ring buffer is allocated/initialized and the + * OPENCHANNEL message for the channel is sent in vmbus_open(). + * Hyper-V won't start sending the interrupts for the channel + * before the OPENCHANNEL message is acked. The memory barrier + * in vmbus_chan_sched() -> sync_test_and_clear_bit() ensures + * that vmbus_chan_sched() must find the channel's relid in + * recv_int_page before retrieving the channel pointer from the + * array of channels. + * + * (b) In the "resuming from hibernation" path, the smp_store_mb() + * guarantees that the store is propagated to all CPUs before + * the VMBus connection is marked as ready for the resume event + * (cf. check_ready_for_resume_event()). 
The interrupt handler + * of the VMBus driver and vmbus_chan_sched() cannot run before + * vmbus_bus_resume() has completed execution (cf. resume_noirq). + */ + smp_store_mb( + vmbus_connection.channels[channel->offermsg.child_relid], + channel); } -static void percpu_channel_deq(void *arg) +void vmbus_channel_unmap_relid(struct vmbus_channel *channel) { - struct vmbus_channel *channel = arg; - - list_del_rcu(&channel->percpu_list); + if (WARN_ON(channel->offermsg.child_relid >= MAX_CHANNEL_RELIDS)) + return; + WRITE_ONCE( + vmbus_connection.channels[channel->offermsg.child_relid], + NULL); } - static void vmbus_release_relid(u32 relid) { struct vmbus_channel_relid_released msg; @@ -376,17 +401,25 @@ void hv_process_channel_removal(struct vmbus_channel *channel) struct vmbus_channel *primary_channel; unsigned long flags; - BUG_ON(!mutex_is_locked(&vmbus_connection.channel_mutex)); + lockdep_assert_held(&vmbus_connection.channel_mutex); BUG_ON(!channel->rescind); - if (channel->target_cpu != get_cpu()) { - put_cpu(); - smp_call_function_single(channel->target_cpu, - percpu_channel_deq, channel, true); - } else { - percpu_channel_deq(channel); - put_cpu(); - } + /* + * hv_process_channel_removal() could find INVALID_RELID only for + * hv_sock channels. See the inline comments in vmbus_onoffer(). + */ + WARN_ON(channel->offermsg.child_relid == INVALID_RELID && + !is_hvsock_channel(channel)); + + /* + * Upon suspend, an in-use hv_sock channel is removed from the array of + * channels and the relid is invalidated. After hibernation, when the + * user-space application destroys the channel, it's unnecessary and + * unsafe to remove the channel from the array of channels. See also + * the inline comments before the call of vmbus_release_relid() below. + */ + if (channel->offermsg.child_relid != INVALID_RELID) + vmbus_channel_unmap_relid(channel); if (channel->primary_channel == NULL) { list_del(&channel->listentry); @@ -447,16 +480,6 @@ static void vmbus_add_channel_work(struct work_struct *work) init_vp_index(newchannel, dev_type); - if (newchannel->target_cpu != get_cpu()) { - put_cpu(); - smp_call_function_single(newchannel->target_cpu, - percpu_channel_enq, - newchannel, true); - } else { - percpu_channel_enq(newchannel); - put_cpu(); - } - /* * This state is used to indicate a successful open * so that when we do close the channel normally, we @@ -523,17 +546,10 @@ err_deq_chan: spin_unlock_irqrestore(&primary_channel->lock, flags); } - mutex_unlock(&vmbus_connection.channel_mutex); + /* vmbus_process_offer() has mapped the channel.
*/ + vmbus_channel_unmap_relid(newchannel); - if (newchannel->target_cpu != get_cpu()) { - put_cpu(); - smp_call_function_single(newchannel->target_cpu, - percpu_channel_deq, - newchannel, true); - } else { - percpu_channel_deq(newchannel); - put_cpu(); - } + mutex_unlock(&vmbus_connection.channel_mutex); vmbus_release_relid(newchannel->offermsg.child_relid); @@ -599,6 +615,8 @@ static void vmbus_process_offer(struct vmbus_channel *newchannel) spin_unlock_irqrestore(&channel->lock, flags); } + vmbus_channel_map_relid(newchannel); + mutex_unlock(&vmbus_connection.channel_mutex); /* @@ -940,8 +958,6 @@ static void vmbus_onoffer(struct vmbus_channel_message_header *hdr) oldchannel = find_primary_channel_by_offer(offer); if (oldchannel != NULL) { - atomic_dec(&vmbus_connection.offer_in_progress); - /* * We're resuming from hibernation: all the sub-channel and * hv_sock channels we had before the hibernation should have @@ -949,36 +965,65 @@ static void vmbus_onoffer(struct vmbus_channel_message_header *hdr) * primary channel that we had before the hibernation. */ + /* + * { Initially: channel relid = INVALID_RELID, + * channels[valid_relid] = NULL } + * + * CPU1 CPU2 + * + * [vmbus_onoffer()] [vmbus_device_release()] + * + * LOCK channel_mutex LOCK channel_mutex + * STORE channel relid = valid_relid LOAD r1 = channel relid + * MAP_RELID channel if (r1 != INVALID_RELID) + * UNLOCK channel_mutex UNMAP_RELID channel + * UNLOCK channel_mutex + * + * Forbids: r1 == valid_relid && + * channels[valid_relid] == channel + * + * Note. r1 can be INVALID_RELID only for an hv_sock channel. + * None of the hv_sock channels which were present before the + * suspend are re-offered upon the resume. See the WARN_ON() + * in hv_process_channel_removal(). + */ + mutex_lock(&vmbus_connection.channel_mutex); + + atomic_dec(&vmbus_connection.offer_in_progress); + WARN_ON(oldchannel->offermsg.child_relid != INVALID_RELID); /* Fix up the relid. */ oldchannel->offermsg.child_relid = offer->child_relid; offer_sz = sizeof(*offer); - if (memcmp(offer, &oldchannel->offermsg, offer_sz) == 0) { - check_ready_for_resume_event(); - return; + if (memcmp(offer, &oldchannel->offermsg, offer_sz) != 0) { + /* + * This is not an error, since the host can also change + * the other field(s) of the offer, e.g. on WS RS5 + * (Build 17763), the offer->connection_id of the + * Mellanox VF vmbus device can change when the host + * reoffers the device upon resume. + */ + pr_debug("vmbus offer changed: relid=%d\n", + offer->child_relid); + + print_hex_dump_debug("Old vmbus offer: ", + DUMP_PREFIX_OFFSET, 16, 4, + &oldchannel->offermsg, offer_sz, + false); + print_hex_dump_debug("New vmbus offer: ", + DUMP_PREFIX_OFFSET, 16, 4, + offer, offer_sz, false); + + /* Fix up the old channel. */ + vmbus_setup_channel_state(oldchannel, offer); } - /* - * This is not an error, since the host can also change the - * other field(s) of the offer, e.g. on WS RS5 (Build 17763), - * the offer->connection_id of the Mellanox VF vmbus device - * can change when the host reoffers the device upon resume. - */ - pr_debug("vmbus offer changed: relid=%d\n", - offer->child_relid); - - print_hex_dump_debug("Old vmbus offer: ", DUMP_PREFIX_OFFSET, - 16, 4, &oldchannel->offermsg, offer_sz, - false); - print_hex_dump_debug("New vmbus offer: ", DUMP_PREFIX_OFFSET, - 16, 4, offer, offer_sz, false); - - /* Fix up the old channel. */ - vmbus_setup_channel_state(oldchannel, offer); - + /* Add the channel back to the array of channels. 
*/ + vmbus_channel_map_relid(oldchannel); check_ready_for_resume_event(); + mutex_unlock(&vmbus_connection.channel_mutex); return; } @@ -1036,14 +1081,14 @@ static void vmbus_onoffer_rescind(struct vmbus_channel_message_header *hdr) * * CPU1 CPU2 * - * [vmbus_process_offer()] [vmbus_onoffer_rescind()] + * [vmbus_onoffer()] [vmbus_onoffer_rescind()] * * LOCK channel_mutex WAIT_ON offer_in_progress == 0 * DECREMENT offer_in_progress LOCK channel_mutex - * INSERT chn_list SEARCH chn_list + * STORE channels[] LOAD channels[] * UNLOCK channel_mutex UNLOCK channel_mutex * - * Forbids: CPU2's SEARCH from *not* seeing CPU1's INSERT + * Forbids: CPU2's LOAD from *not* seeing CPU1's STORE */ while (atomic_read(&vmbus_connection.offer_in_progress) != 0) { diff --git a/drivers/hv/connection.c b/drivers/hv/connection.c index f4bd306d2cef..11170d9a2e1a 100644 --- a/drivers/hv/connection.c +++ b/drivers/hv/connection.c @@ -248,6 +248,14 @@ int vmbus_connect(void) pr_info("Vmbus version:%d.%d\n", version >> 16, version & 0xFFFF); + vmbus_connection.channels = kcalloc(MAX_CHANNEL_RELIDS, + sizeof(struct vmbus_channel *), + GFP_KERNEL); + if (vmbus_connection.channels == NULL) { + ret = -ENOMEM; + goto cleanup; + } + kfree(msginfo); return 0; @@ -295,33 +303,9 @@ void vmbus_disconnect(void) */ struct vmbus_channel *relid2channel(u32 relid) { - struct vmbus_channel *channel; - struct vmbus_channel *found_channel = NULL; - struct list_head *cur, *tmp; - struct vmbus_channel *cur_sc; - - BUG_ON(!mutex_is_locked(&vmbus_connection.channel_mutex)); - - list_for_each_entry(channel, &vmbus_connection.chn_list, listentry) { - if (channel->offermsg.child_relid == relid) { - found_channel = channel; - break; - } else if (!list_empty(&channel->sc_list)) { - /* - * Deal with sub-channels. - */ - list_for_each_safe(cur, tmp, &channel->sc_list) { - cur_sc = list_entry(cur, struct vmbus_channel, - sc_list); - if (cur_sc->offermsg.child_relid == relid) { - found_channel = cur_sc; - break; - } - } - } - } - - return found_channel; + if (WARN_ON(relid >= MAX_CHANNEL_RELIDS)) + return NULL; + return READ_ONCE(vmbus_connection.channels[relid]); } /* diff --git a/drivers/hv/hv.c b/drivers/hv/hv.c index e2b331045464..17bf1f229152 100644 --- a/drivers/hv/hv.c +++ b/drivers/hv/hv.c @@ -117,8 +117,6 @@ int hv_synic_alloc(void) pr_err("Unable to allocate post msg page\n"); goto err; } - - INIT_LIST_HEAD(&hv_cpu->chan_list); } return 0; diff --git a/drivers/hv/hyperv_vmbus.h b/drivers/hv/hyperv_vmbus.h index 000740d053a2..3edf993b0fd9 100644 --- a/drivers/hv/hyperv_vmbus.h +++ b/drivers/hv/hyperv_vmbus.h @@ -132,12 +132,6 @@ struct hv_per_cpu_context { * basis. */ struct tasklet_struct msg_dpc; - - /* - * To optimize the mapping of relid to channel, maintain - * per-cpu list of the channels based on their CPU affinity. 
- */ - struct list_head chan_list; }; struct hv_context { @@ -202,6 +196,8 @@ int hv_ringbuffer_read(struct vmbus_channel *channel, /* TODO: Need to make this configurable */ #define MAX_NUM_CHANNELS_SUPPORTED 256 +#define MAX_CHANNEL_RELIDS \ + max(MAX_NUM_CHANNELS_SUPPORTED, HV_EVENT_FLAGS_COUNT) enum vmbus_connect_state { DISCONNECTED, @@ -251,6 +247,9 @@ struct vmbus_connection { struct list_head chn_list; struct mutex channel_mutex; + /* Array of channels */ + struct vmbus_channel **channels; + /* * An offer message is handled first on the work_queue, and then * is further handled on handle_primary_chan_wq or @@ -338,6 +337,9 @@ int vmbus_add_channel_kobj(struct hv_device *device_obj, void vmbus_remove_channel_attr_group(struct vmbus_channel *channel); +void vmbus_channel_map_relid(struct vmbus_channel *channel); +void vmbus_channel_unmap_relid(struct vmbus_channel *channel); + struct vmbus_channel *relid2channel(u32 relid); void vmbus_free_channels(void); diff --git a/drivers/hv/vmbus_drv.c b/drivers/hv/vmbus_drv.c index 1f3e69d18d35..21a01bca7fcd 100644 --- a/drivers/hv/vmbus_drv.c +++ b/drivers/hv/vmbus_drv.c @@ -1252,33 +1252,39 @@ static void vmbus_chan_sched(struct hv_per_cpu_context *hv_cpu) if (relid == 0) continue; + /* + * Pairs with the kfree_rcu() in vmbus_chan_release(). + * Guarantees that the channel data structure doesn't + * get freed while the channel pointer below is being + * dereferenced. + */ rcu_read_lock(); /* Find channel based on relid */ - list_for_each_entry_rcu(channel, &hv_cpu->chan_list, percpu_list) { - if (channel->offermsg.child_relid != relid) - continue; + channel = relid2channel(relid); + if (channel == NULL) + goto sched_unlock_rcu; - if (channel->rescind) - continue; + if (channel->rescind) + goto sched_unlock_rcu; - trace_vmbus_chan_sched(channel); + trace_vmbus_chan_sched(channel); - ++channel->interrupts; + ++channel->interrupts; - switch (channel->callback_mode) { - case HV_CALL_ISR: - vmbus_channel_isr(channel); - break; + switch (channel->callback_mode) { + case HV_CALL_ISR: + vmbus_channel_isr(channel); + break; - case HV_CALL_BATCHED: - hv_begin_read(&channel->inbound); - /* fallthrough */ - case HV_CALL_DIRECT: - tasklet_schedule(&channel->callback_event); - } + case HV_CALL_BATCHED: + hv_begin_read(&channel->inbound); + fallthrough; + case HV_CALL_DIRECT: + tasklet_schedule(&channel->callback_event); } +sched_unlock_rcu: rcu_read_unlock(); } } @@ -2264,9 +2270,12 @@ static int vmbus_bus_suspend(struct device *dev) list_for_each_entry(channel, &vmbus_connection.chn_list, listentry) { /* - * Invalidate the field. Upon resume, vmbus_onoffer() will fix - * up the field, and the other fields (if necessary). + * Remove the channel from the array of channels and invalidate + * the channel's relid. Upon resume, vmbus_onoffer() will fix + * up the relid (and other fields, if necessary) and add the + * channel back to the array. */ + vmbus_channel_unmap_relid(channel); channel->offermsg.child_relid = INVALID_RELID; if (is_hvsock_channel(channel)) { @@ -2502,6 +2511,7 @@ static void __exit vmbus_exit(void) hv_debug_rm_all_dir(); vmbus_free_channels(); + kfree(vmbus_connection.channels); if (ms_hyperv.misc_features & HV_FEATURE_GUEST_CRASH_MSR_AVAILABLE) { kmsg_dump_unregister(&hv_kmsg_dumper); diff --git a/include/linux/hyperv.h b/include/linux/hyperv.h index cbd24f4e68d1..f5b3f008c55a 100644 --- a/include/linux/hyperv.h +++ b/include/linux/hyperv.h @@ -854,11 +854,6 @@ struct vmbus_channel { * Support per-channel state for use by vmbus drivers. 
*/ void *per_channel_state; - /* - * To support per-cpu lookup mapping of relid to channel, - * link up channels based on their CPU affinity. - */ - struct list_head percpu_list; /* * Defer freeing channel until after all cpu's have -- cgit v1.2.3 From 9403b66e6161130ebae7e55a97491c84c1ad6f9f Mon Sep 17 00:00:00 2001 From: "Andrea Parri (Microsoft)" Date: Mon, 6 Apr 2020 02:15:09 +0200 Subject: Drivers: hv: vmbus: Use a spin lock for synchronizing channel scheduling vs. channel removal Since vmbus_chan_sched() dereferences the ring buffer pointer, we have to make sure that the ring buffer data structures don't get freed while such dereferencing is happening. Current code does this by sending an IPI to the CPU that is allowed to access that ring buffer from interrupt level, cf., vmbus_reset_channel_cb(). But with the new functionality to allow changing the CPU that a channel will interrupt, we can't be sure what CPU will be running the vmbus_chan_sched() function for a particular channel, so the current IPI mechanism is infeasible. Instead synchronize vmbus_chan_sched() and vmbus_reset_channel_cb() by using the (newly introduced) per-channel spin lock "sched_lock". Move the test for onchannel_callback being NULL before the "switch" control statement in vmbus_chan_sched(), in order to not access the ring buffer if the vmbus_reset_channel_cb() has been completed on the channel. Suggested-by: Michael Kelley Signed-off-by: Andrea Parri (Microsoft) Link: https://lore.kernel.org/r/20200406001514.19876-7-parri.andrea@gmail.com Reviewed-by: Michael Kelley Signed-off-by: Wei Liu --- drivers/hv/channel.c | 24 +++++++----------------- drivers/hv/channel_mgmt.c | 1 + drivers/hv/vmbus_drv.c | 30 +++++++++++++++++------------- include/linux/hyperv.h | 6 ++++++ 4 files changed, 31 insertions(+), 30 deletions(-) (limited to 'include/linux') diff --git a/drivers/hv/channel.c b/drivers/hv/channel.c index 256ee90c7446..132e476f87b2 100644 --- a/drivers/hv/channel.c +++ b/drivers/hv/channel.c @@ -594,15 +594,10 @@ post_msg_err: } EXPORT_SYMBOL_GPL(vmbus_teardown_gpadl); -static void reset_channel_cb(void *arg) -{ - struct vmbus_channel *channel = arg; - - channel->onchannel_callback = NULL; -} - void vmbus_reset_channel_cb(struct vmbus_channel *channel) { + unsigned long flags; + /* * vmbus_on_event(), running in the per-channel tasklet, can race * with vmbus_close_internal() in the case of SMP guest, e.g., when @@ -618,17 +613,12 @@ void vmbus_reset_channel_cb(struct vmbus_channel *channel) */ tasklet_disable(&channel->callback_event); - channel->sc_creation_callback = NULL; + /* See the inline comments in vmbus_chan_sched(). 
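+	 * In short: the callback pointer is cleared inside the
+	 * sched_lock critical section, and vmbus_chan_sched() reads it
+	 * inside the same lock, so the scheduler either sees the old
+	 * callback while it is still safe to invoke, or sees NULL and
+	 * bails out before touching the ring buffer.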
*/ + spin_lock_irqsave(&channel->sched_lock, flags); + channel->onchannel_callback = NULL; + spin_unlock_irqrestore(&channel->sched_lock, flags); - /* Stop the callback asap */ - if (channel->target_cpu != get_cpu()) { - put_cpu(); - smp_call_function_single(channel->target_cpu, reset_channel_cb, - channel, true); - } else { - reset_channel_cb(channel); - put_cpu(); - } + channel->sc_creation_callback = NULL; /* Re-enable tasklet for use on re-open */ tasklet_enable(&channel->callback_event); diff --git a/drivers/hv/channel_mgmt.c b/drivers/hv/channel_mgmt.c index bbd0eee4362f..2e730f84654b 100644 --- a/drivers/hv/channel_mgmt.c +++ b/drivers/hv/channel_mgmt.c @@ -315,6 +315,7 @@ static struct vmbus_channel *alloc_channel(void) if (!channel) return NULL; + spin_lock_init(&channel->sched_lock); spin_lock_init(&channel->lock); init_completion(&channel->rescind_event); diff --git a/drivers/hv/vmbus_drv.c b/drivers/hv/vmbus_drv.c index 21a01bca7fcd..0f7dfa507a40 100644 --- a/drivers/hv/vmbus_drv.c +++ b/drivers/hv/vmbus_drv.c @@ -1201,18 +1201,6 @@ static void vmbus_force_channel_rescinded(struct vmbus_channel *channel) } #endif /* CONFIG_PM_SLEEP */ -/* - * Direct callback for channels using other deferred processing - */ -static void vmbus_channel_isr(struct vmbus_channel *channel) -{ - void (*callback_fn)(void *); - - callback_fn = READ_ONCE(channel->onchannel_callback); - if (likely(callback_fn != NULL)) - (*callback_fn)(channel->channel_callback_context); -} - /* * Schedule all channels with events pending */ @@ -1243,6 +1231,7 @@ static void vmbus_chan_sched(struct hv_per_cpu_context *hv_cpu) return; for_each_set_bit(relid, recv_int_page, maxbits) { + void (*callback_fn)(void *context); struct vmbus_channel *channel; if (!sync_test_and_clear_bit(relid, recv_int_page)) @@ -1268,13 +1257,26 @@ static void vmbus_chan_sched(struct hv_per_cpu_context *hv_cpu) if (channel->rescind) goto sched_unlock_rcu; + /* + * Make sure that the ring buffer data structure doesn't get + * freed while we dereference the ring buffer pointer. Test + * for the channel's onchannel_callback being NULL within a + * sched_lock critical section. See also the inline comments + * in vmbus_reset_channel_cb(). + */ + spin_lock(&channel->sched_lock); + + callback_fn = channel->onchannel_callback; + if (unlikely(callback_fn == NULL)) + goto sched_unlock; + trace_vmbus_chan_sched(channel); ++channel->interrupts; switch (channel->callback_mode) { case HV_CALL_ISR: - vmbus_channel_isr(channel); + (*callback_fn)(channel->channel_callback_context); break; case HV_CALL_BATCHED: @@ -1284,6 +1286,8 @@ static void vmbus_chan_sched(struct hv_per_cpu_context *hv_cpu) tasklet_schedule(&channel->callback_event); } +sched_unlock: + spin_unlock(&channel->sched_lock); sched_unlock_rcu: rcu_read_unlock(); } diff --git a/include/linux/hyperv.h b/include/linux/hyperv.h index f5b3f008c55a..62b2c11d0875 100644 --- a/include/linux/hyperv.h +++ b/include/linux/hyperv.h @@ -771,6 +771,12 @@ struct vmbus_channel { void (*onchannel_callback)(void *context); void *channel_callback_context; + /* + * Synchronize channel scheduling and channel removal; see the inline + * comments in vmbus_chan_sched() and vmbus_reset_channel_cb(). 
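+	 * The lock is initialized in alloc_channel().  vmbus_chan_sched()
+	 * takes it with a plain spin_lock(), since it runs in interrupt
+	 * context, while vmbus_reset_channel_cb() uses
+	 * spin_lock_irqsave().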
+ */ + spinlock_t sched_lock; + /* * A channel can be marked for one of three modes of reading: * BATCHED - callback called from taslket and should read -- cgit v1.2.3 From 8ef4c4abbbcdcd9d4bc0fd9454df03e6dac24b73 Mon Sep 17 00:00:00 2001 From: "Andrea Parri (Microsoft)" Date: Mon, 6 Apr 2020 02:15:11 +0200 Subject: Drivers: hv: vmbus: Remove the unused HV_LOCALIZED channel affinity logic The logic is unused since commit 509879bdb30b8 ("Drivers: hv: Introduce a policy for controlling channel affinity"). This logic assumes that a channel target_cpu doesn't change during the lifetime of a channel, but this assumption is incompatible with the new functionality that allows changing the vCPU a channel will interrupt. Signed-off-by: Andrea Parri (Microsoft) Link: https://lore.kernel.org/r/20200406001514.19876-9-parri.andrea@gmail.com Reviewed-by: Michael Kelley Signed-off-by: Wei Liu --- drivers/hv/channel_mgmt.c | 105 +++++++++++----------------------------------- include/linux/hyperv.h | 27 ------------ 2 files changed, 25 insertions(+), 107 deletions(-) (limited to 'include/linux') diff --git a/drivers/hv/channel_mgmt.c b/drivers/hv/channel_mgmt.c index 2e730f84654b..a19dc28fdf77 100644 --- a/drivers/hv/channel_mgmt.c +++ b/drivers/hv/channel_mgmt.c @@ -433,14 +433,6 @@ void hv_process_channel_removal(struct vmbus_channel *channel) spin_unlock_irqrestore(&primary_channel->lock, flags); } - /* - * We need to free the bit for init_vp_index() to work in the case - * of sub-channel, when we reload drivers like hv_netvsc. - */ - if (channel->affinity_policy == HV_LOCALIZED) - cpumask_clear_cpu(channel->target_cpu, - &primary_channel->alloced_cpus_in_node); - /* * Upon suspend, an in-use hv_sock channel is marked as "rescinded" and * the relid is invalidated; after hibernation, when the user-space app @@ -662,20 +654,21 @@ static DEFINE_SPINLOCK(bind_channel_to_cpu_lock); /* * Starting with Win8, we can statically distribute the incoming * channel interrupt load by binding a channel to VCPU. - * We distribute the interrupt loads to one or more NUMA nodes based on - * the channel's affinity_policy. * * For pre-win8 hosts or non-performance critical channels we assign the * first CPU in the first NUMA node. + * + * Starting with win8, performance critical channels will be distributed + * evenly among all the available NUMA nodes. Once the node is assigned, + * we will assign the CPU based on a simple round robin scheme. */ static void init_vp_index(struct vmbus_channel *channel, u16 dev_type) { - u32 cur_cpu; bool perf_chn = vmbus_devs[dev_type].perf_device; - struct vmbus_channel *primary = channel->primary_channel; - int next_node; cpumask_var_t available_mask; struct cpumask *alloced_mask; + u32 target_cpu; + int numa_node; if ((vmbus_proto_version == VERSION_WS2008) || (vmbus_proto_version == VERSION_WIN7) || (!perf_chn) || @@ -693,31 +686,27 @@ static void init_vp_index(struct vmbus_channel *channel, u16 dev_type) return; } - spin_lock(&bind_channel_to_cpu_lock); - /* - * Based on the channel affinity policy, we will assign the NUMA - * nodes. + * Serializes the accesses to the global variable next_numa_node_id. + * See also the header comment of the spin lock declaration. 
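+	 * For example, with two populated NUMA nodes the loop below
+	 * assigns nodes 0, 1, 0, 1, ... to successive channels; a node
+	 * with an empty cpumask is skipped, and the counter wraps
+	 * around when it reaches nr_node_ids.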
*/ + spin_lock(&bind_channel_to_cpu_lock); - if ((channel->affinity_policy == HV_BALANCED) || (!primary)) { - while (true) { - next_node = next_numa_node_id++; - if (next_node == nr_node_ids) { - next_node = next_numa_node_id = 0; - continue; - } - if (cpumask_empty(cpumask_of_node(next_node))) - continue; - break; + while (true) { + numa_node = next_numa_node_id++; + if (numa_node == nr_node_ids) { + next_numa_node_id = 0; + continue; } - channel->numa_node = next_node; - primary = channel; + if (cpumask_empty(cpumask_of_node(numa_node))) + continue; + break; } - alloced_mask = &hv_context.hv_numa_map[primary->numa_node]; + channel->numa_node = numa_node; + alloced_mask = &hv_context.hv_numa_map[numa_node]; if (cpumask_weight(alloced_mask) == - cpumask_weight(cpumask_of_node(primary->numa_node))) { + cpumask_weight(cpumask_of_node(numa_node))) { /* * We have cycled through all the CPUs in the node; * reset the alloced map. @@ -725,57 +714,13 @@ static void init_vp_index(struct vmbus_channel *channel, u16 dev_type) cpumask_clear(alloced_mask); } - cpumask_xor(available_mask, alloced_mask, - cpumask_of_node(primary->numa_node)); + cpumask_xor(available_mask, alloced_mask, cpumask_of_node(numa_node)); - cur_cpu = -1; - - if (primary->affinity_policy == HV_LOCALIZED) { - /* - * Normally Hyper-V host doesn't create more subchannels - * than there are VCPUs on the node but it is possible when not - * all present VCPUs on the node are initialized by guest. - * Clear the alloced_cpus_in_node to start over. - */ - if (cpumask_equal(&primary->alloced_cpus_in_node, - cpumask_of_node(primary->numa_node))) - cpumask_clear(&primary->alloced_cpus_in_node); - } - - while (true) { - cur_cpu = cpumask_next(cur_cpu, available_mask); - if (cur_cpu >= nr_cpu_ids) { - cur_cpu = -1; - cpumask_copy(available_mask, - cpumask_of_node(primary->numa_node)); - continue; - } - - if (primary->affinity_policy == HV_LOCALIZED) { - /* - * NOTE: in the case of sub-channel, we clear the - * sub-channel related bit(s) in - * primary->alloced_cpus_in_node in - * hv_process_channel_removal(), so when we - * reload drivers like hv_netvsc in SMP guest, here - * we're able to re-allocate - * bit from primary->alloced_cpus_in_node. - */ - if (!cpumask_test_cpu(cur_cpu, - &primary->alloced_cpus_in_node)) { - cpumask_set_cpu(cur_cpu, - &primary->alloced_cpus_in_node); - cpumask_set_cpu(cur_cpu, alloced_mask); - break; - } - } else { - cpumask_set_cpu(cur_cpu, alloced_mask); - break; - } - } + target_cpu = cpumask_first(available_mask); + cpumask_set_cpu(target_cpu, alloced_mask); - channel->target_cpu = cur_cpu; - channel->target_vp = hv_cpu_number_to_vp_number(cur_cpu); + channel->target_cpu = target_cpu; + channel->target_vp = hv_cpu_number_to_vp_number(target_cpu); spin_unlock(&bind_channel_to_cpu_lock); diff --git a/include/linux/hyperv.h b/include/linux/hyperv.h index 62b2c11d0875..247356dbd742 100644 --- a/include/linux/hyperv.h +++ b/include/linux/hyperv.h @@ -689,11 +689,6 @@ union hv_connection_id { } u; }; -enum hv_numa_policy { - HV_BALANCED = 0, - HV_LOCALIZED, -}; - enum vmbus_device_type { HV_IDE = 0, HV_SCSI, @@ -808,10 +803,6 @@ struct vmbus_channel { u32 target_vp; /* The corresponding CPUID in the guest */ u32 target_cpu; - /* - * State to manage the CPU affiliation of channels. - */ - struct cpumask alloced_cpus_in_node; int numa_node; /* * Support for sub-channels. 
For high performance devices, @@ -898,18 +889,6 @@ struct vmbus_channel { */ bool low_latency; - /* - * NUMA distribution policy: - * We support two policies: - * 1) Balanced: Here all performance critical channels are - * distributed evenly amongst all the NUMA nodes. - * This policy will be the default policy. - * 2) Localized: All channels of a given instance of a - * performance critical service will be assigned CPUs - * within a selected NUMA node. - */ - enum hv_numa_policy affinity_policy; - bool probe_done; /* @@ -965,12 +944,6 @@ static inline bool is_sub_channel(const struct vmbus_channel *c) return c->offermsg.offer.sub_channel_index != 0; } -static inline void set_channel_affinity_state(struct vmbus_channel *c, - enum hv_numa_policy policy) -{ - c->affinity_policy = policy; -} - static inline void set_channel_read_mode(struct vmbus_channel *c, enum hv_callback_mode mode) { -- cgit v1.2.3 From 7527810573436f00e582d3d5ef2eb3c027c98d7d Mon Sep 17 00:00:00 2001 From: "Andrea Parri (Microsoft)" Date: Mon, 6 Apr 2020 02:15:13 +0200 Subject: Drivers: hv: vmbus: Introduce the CHANNELMSG_MODIFYCHANNEL message type VMBus version 4.1 and later support the CHANNELMSG_MODIFYCHANNEL(22) message type which can be used to request Hyper-V to change the vCPU that a channel will interrupt. Introduce the CHANNELMSG_MODIFYCHANNEL message type, and define the vmbus_send_modifychannel() function to send CHANNELMSG_MODIFYCHANNEL requests to the host via a hypercall. The function is then used to define a sysfs "store" operation, which allows to change the (v)CPU the channel will interrupt by using the sysfs interface. The feature can be used for load balancing or other purposes. One interesting catch here is that Hyper-V can *not* currently ACK CHANNELMSG_MODIFYCHANNEL messages with the promise that (after the ACK is sent) the channel won't send any more interrupts to the "old" CPU. The peculiarity of the CHANNELMSG_MODIFYCHANNEL messages is problematic if the user want to take a CPU offline, since we don't want to take a CPU offline (and, potentially, "lose" channel interrupts on such CPU) if the host is still processing a CHANNELMSG_MODIFYCHANNEL message associated to that CPU. It is worth mentioning, however, that we have been unable to observe the above mentioned "race": in all our tests, CHANNELMSG_MODIFYCHANNEL requests appeared *as if* they were processed synchronously by the host. Suggested-by: Michael Kelley Signed-off-by: Andrea Parri (Microsoft) Link: https://lore.kernel.org/r/20200406001514.19876-11-parri.andrea@gmail.com Reviewed-by: Michael Kelley [ wei: fix conflict in channel_mgmt.c ] Signed-off-by: Wei Liu --- drivers/hv/channel.c | 28 ++++++++++++ drivers/hv/channel_mgmt.c | 2 +- drivers/hv/hv_trace.h | 19 ++++++++ drivers/hv/vmbus_drv.c | 108 +++++++++++++++++++++++++++++++++++++++++++++- include/linux/hyperv.h | 10 ++++- 5 files changed, 163 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/drivers/hv/channel.c b/drivers/hv/channel.c index 132e476f87b2..90070b337c10 100644 --- a/drivers/hv/channel.c +++ b/drivers/hv/channel.c @@ -289,6 +289,34 @@ int vmbus_send_tl_connect_request(const guid_t *shv_guest_servie_id, } EXPORT_SYMBOL_GPL(vmbus_send_tl_connect_request); +/* + * Set/change the vCPU (@target_vp) the channel (@child_relid) will interrupt. + * + * CHANNELMSG_MODIFYCHANNEL messages are aynchronous. Also, Hyper-V does not + * ACK such messages. 
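+ * (The host does process channel messages in the order they are
+ * received, on a per-partition basis, but no completion for this
+ * particular message is signaled back to the guest.)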
IOW we can't know when the host will stop interrupting + * the "old" vCPU and start interrupting the "new" vCPU for the given channel. + * + * The CHANNELMSG_MODIFYCHANNEL message type is supported since VMBus version + * VERSION_WIN10_V4_1. + */ +int vmbus_send_modifychannel(u32 child_relid, u32 target_vp) +{ + struct vmbus_channel_modifychannel conn_msg; + int ret; + + memset(&conn_msg, 0, sizeof(conn_msg)); + conn_msg.header.msgtype = CHANNELMSG_MODIFYCHANNEL; + conn_msg.child_relid = child_relid; + conn_msg.target_vp = target_vp; + + ret = vmbus_post_msg(&conn_msg, sizeof(conn_msg), true); + + trace_vmbus_send_modifychannel(&conn_msg, ret); + + return ret; +} +EXPORT_SYMBOL_GPL(vmbus_send_modifychannel); + /* * create_gpadl_header - Creates a gpadl for the specified buffer */ diff --git a/drivers/hv/channel_mgmt.c b/drivers/hv/channel_mgmt.c index 2db3823f0e59..ffd7fffa5f83 100644 --- a/drivers/hv/channel_mgmt.c +++ b/drivers/hv/channel_mgmt.c @@ -1383,7 +1383,7 @@ channel_message_table[CHANNELMSG_COUNT] = { { CHANNELMSG_19, 0, NULL, 0}, { CHANNELMSG_20, 0, NULL, 0}, { CHANNELMSG_TL_CONNECT_REQUEST, 0, NULL, 0}, - { CHANNELMSG_22, 0, NULL, 0}, + { CHANNELMSG_MODIFYCHANNEL, 0, NULL, 0}, { CHANNELMSG_TL_CONNECT_RESULT, 0, NULL, 0}, }; diff --git a/drivers/hv/hv_trace.h b/drivers/hv/hv_trace.h index e70783e33680..a43bc76c2d5d 100644 --- a/drivers/hv/hv_trace.h +++ b/drivers/hv/hv_trace.h @@ -296,6 +296,25 @@ TRACE_EVENT(vmbus_send_tl_connect_request, ) ); +TRACE_EVENT(vmbus_send_modifychannel, + TP_PROTO(const struct vmbus_channel_modifychannel *msg, + int ret), + TP_ARGS(msg, ret), + TP_STRUCT__entry( + __field(u32, child_relid) + __field(u32, target_vp) + __field(int, ret) + ), + TP_fast_assign( + __entry->child_relid = msg->child_relid; + __entry->target_vp = msg->target_vp; + __entry->ret = ret; + ), + TP_printk("binding child_relid 0x%x to target_vp 0x%x, ret %d", + __entry->child_relid, __entry->target_vp, __entry->ret + ) + ); + DECLARE_EVENT_CLASS(vmbus_channel, TP_PROTO(const struct vmbus_channel *channel), TP_ARGS(channel), diff --git a/drivers/hv/vmbus_drv.c b/drivers/hv/vmbus_drv.c index 0f7dfa507a40..5d24b25fb5aa 100644 --- a/drivers/hv/vmbus_drv.c +++ b/drivers/hv/vmbus_drv.c @@ -1606,8 +1606,24 @@ static ssize_t vmbus_chan_attr_show(struct kobject *kobj, return attribute->show(chan, buf); } +static ssize_t vmbus_chan_attr_store(struct kobject *kobj, + struct attribute *attr, const char *buf, + size_t count) +{ + const struct vmbus_chan_attribute *attribute + = container_of(attr, struct vmbus_chan_attribute, attr); + struct vmbus_channel *chan + = container_of(kobj, struct vmbus_channel, kobj); + + if (!attribute->store) + return -EIO; + + return attribute->store(chan, buf, count); +} + static const struct sysfs_ops vmbus_chan_sysfs_ops = { .show = vmbus_chan_attr_show, + .store = vmbus_chan_attr_store, }; static ssize_t out_mask_show(struct vmbus_channel *channel, char *buf) @@ -1678,11 +1694,99 @@ static ssize_t write_avail_show(struct vmbus_channel *channel, char *buf) } static VMBUS_CHAN_ATTR_RO(write_avail); -static ssize_t show_target_cpu(struct vmbus_channel *channel, char *buf) +static ssize_t target_cpu_show(struct vmbus_channel *channel, char *buf) { return sprintf(buf, "%u\n", channel->target_cpu); } -static VMBUS_CHAN_ATTR(cpu, S_IRUGO, show_target_cpu, NULL); +static ssize_t target_cpu_store(struct vmbus_channel *channel, + const char *buf, size_t count) +{ + ssize_t ret = count; + u32 target_cpu; + + if (vmbus_proto_version < VERSION_WIN10_V4_1) + return -EIO; 
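+	/*
+	 * MODIFYCHANNEL is only supported by VMBus 4.1 and later hosts,
+	 * hence the version gate above.  Everything below parses and
+	 * validates the requested CPU before any locking is done.
+	 */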
+ + if (sscanf(buf, "%uu", &target_cpu) != 1) + return -EIO; + + /* Validate target_cpu for the cpumask_test_cpu() operation below. */ + if (target_cpu >= nr_cpumask_bits) + return -EINVAL; + + /* No CPUs should come up or down during this. */ + cpus_read_lock(); + + if (!cpumask_test_cpu(target_cpu, cpu_online_mask)) { + cpus_read_unlock(); + return -EINVAL; + } + + /* + * Synchronizes target_cpu_store() and channel closure: + * + * { Initially: state = CHANNEL_OPENED } + * + * CPU1 CPU2 + * + * [target_cpu_store()] [vmbus_disconnect_ring()] + * + * LOCK channel_mutex LOCK channel_mutex + * LOAD r1 = state LOAD r2 = state + * IF (r1 == CHANNEL_OPENED) IF (r2 == CHANNEL_OPENED) + * SEND MODIFYCHANNEL STORE state = CHANNEL_OPEN + * [...] SEND CLOSECHANNEL + * UNLOCK channel_mutex UNLOCK channel_mutex + * + * Forbids: r1 == r2 == CHANNEL_OPENED (i.e., CPU1's LOCK precedes + * CPU2's LOCK) && CPU2's SEND precedes CPU1's SEND + * + * Note. The host processes the channel messages "sequentially", in + * the order in which they are received on a per-partition basis. + */ + mutex_lock(&vmbus_connection.channel_mutex); + + /* + * Hyper-V will ignore MODIFYCHANNEL messages for "non-open" channels; + * avoid sending the message and fail here for such channels. + */ + if (channel->state != CHANNEL_OPENED_STATE) { + ret = -EIO; + goto cpu_store_unlock; + } + + if (channel->target_cpu == target_cpu) + goto cpu_store_unlock; + + if (vmbus_send_modifychannel(channel->offermsg.child_relid, + hv_cpu_number_to_vp_number(target_cpu))) { + ret = -EIO; + goto cpu_store_unlock; + } + + /* + * Warning. At this point, there is *no* guarantee that the host will + * have successfully processed the vmbus_send_modifychannel() request. + * See the header comment of vmbus_send_modifychannel() for more info. + * + * Lags in the processing of the above vmbus_send_modifychannel() can + * result in missed interrupts if the "old" target CPU is taken offline + * before Hyper-V starts sending interrupts to the "new" target CPU. + * But apart from this offlining scenario, the code tolerates such + * lags. It will function correctly even if a channel interrupt comes + * in on a CPU that is different from the channel target_cpu value. + */ + + channel->target_cpu = target_cpu; + channel->target_vp = hv_cpu_number_to_vp_number(target_cpu); + channel->numa_node = cpu_to_node(target_cpu); + +cpu_store_unlock: + mutex_unlock(&vmbus_connection.channel_mutex); + cpus_read_unlock(); + return ret; +} +static VMBUS_CHAN_ATTR(cpu, 0644, target_cpu_show, target_cpu_store); static ssize_t channel_pending_show(struct vmbus_channel *channel, char *buf) diff --git a/include/linux/hyperv.h b/include/linux/hyperv.h index 247356dbd742..b85d7580f2c1 100644 --- a/include/linux/hyperv.h +++ b/include/linux/hyperv.h @@ -425,7 +425,7 @@ enum vmbus_channel_message_type { CHANNELMSG_19 = 19, CHANNELMSG_20 = 20, CHANNELMSG_TL_CONNECT_REQUEST = 21, - CHANNELMSG_22 = 22, + CHANNELMSG_MODIFYCHANNEL = 22, CHANNELMSG_TL_CONNECT_RESULT = 23, CHANNELMSG_COUNT }; @@ -620,6 +620,13 @@ struct vmbus_channel_tl_connect_request { guid_t host_service_id; } __packed; +/* Modify Channel parameters, cf. 
vmbus_send_modifychannel() */ +struct vmbus_channel_modifychannel { + struct vmbus_channel_message_header header; + u32 child_relid; + u32 target_vp; +} __packed; + struct vmbus_channel_version_response { struct vmbus_channel_message_header header; u8 version_supported; @@ -1505,6 +1512,7 @@ extern __u32 vmbus_proto_version; int vmbus_send_tl_connect_request(const guid_t *shv_guest_servie_id, const guid_t *shv_host_servie_id); +int vmbus_send_modifychannel(u32 child_relid, u32 target_vp); void vmbus_set_event(struct vmbus_channel *channel); /* Get the start of the ring buffer. */ -- cgit v1.2.3 From 6824f0ce38cb4b903a3cbf00dd528861f6c2ea7c Mon Sep 17 00:00:00 2001 From: Sudip Mukherjee Date: Fri, 3 Apr 2020 14:43:23 +0100 Subject: parport: Add comments for parport_register_dev_model() In preparation to remove parport_register_device(), copy the comments to parport_register_dev_model() and modify the parameters according to what parport_register_dev_model() has. Signed-off-by: Sudip Mukherjee Link: https://lore.kernel.org/r/20200403134325.11523-9-sudipm.mukherjee@gmail.com Signed-off-by: Greg Kroah-Hartman --- drivers/parport/share.c | 62 +++++++++++++++++++++++++++++++++++++++++++++++++ include/linux/parport.h | 4 ++++ 2 files changed, 66 insertions(+) (limited to 'include/linux') diff --git a/drivers/parport/share.c b/drivers/parport/share.c index 3169feebdc19..ee2892a935d6 100644 --- a/drivers/parport/share.c +++ b/drivers/parport/share.c @@ -841,6 +841,68 @@ static void free_pardevice(struct device *dev) kfree(par_dev); } +/** + * parport_register_dev_model - register a device on a parallel port + * @port: port to which the device is attached + * @name: a name to refer to the device + * @par_dev_cb: struct containing callbacks + * @id: device number to be given to the device + * + * This function, called by parallel port device drivers, + * declares that a device is connected to a port, and tells the + * system all it needs to know. + * + * The struct pardev_cb contains pointer to callbacks. preemption + * callback function, @preempt, is called when this device driver + * has claimed access to the port but another device driver wants + * to use it. It is given, @private, as its parameter, and should + * return zero if it is willing for the system to release the port + * to another driver on its behalf. If it wants to keep control of + * the port it should return non-zero, and no action will be taken. + * It is good manners for the driver to try to release the port at + * the earliest opportunity after its preemption callback rejects a + * preemption attempt. Note that if a preemption callback is happy + * for preemption to go ahead, there is no need to release the + * port; it is done automatically. This function may not block, as + * it may be called from interrupt context. If the device driver + * does not support preemption, @preempt can be %NULL. + * + * The wake-up ("kick") callback function, @wakeup, is called when + * the port is available to be claimed for exclusive access; that + * is, parport_claim() is guaranteed to succeed when called from + * inside the wake-up callback function. If the driver wants to + * claim the port it should do so; otherwise, it need not take + * any action. This function may not block, as it may be called + * from interrupt context. If the device driver does not want to + * be explicitly invited to claim the port in this way, @wakeup can + * be %NULL. 
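+ *
+ * Like the preemption callback, the wake-up callback receives the
+ * @private member of @par_dev_cb as its argument.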
+ * + * The interrupt handler, @irq_func, is called when an interrupt + * arrives from the parallel port. Note that if a device driver + * wants to use interrupts it should use parport_enable_irq(), + * and can also check the irq member of the parport structure + * representing the port. + * + * The parallel port (lowlevel) driver is the one that has called + * request_irq() and whose interrupt handler is called first. + * This handler does whatever needs to be done to the hardware to + * acknowledge the interrupt (for PC-style ports there is nothing + * special to be done). It then tells the IEEE 1284 code about + * the interrupt, which may involve reacting to an IEEE 1284 + * event depending on the current IEEE 1284 phase. After this, + * it calls @irq_func. Needless to say, @irq_func will be called + * from interrupt context, and may not block. + * + * The %PARPORT_DEV_EXCL flag is for preventing port sharing, and + * so should only be used when sharing the port with other device + * drivers is impossible and would lead to incorrect behaviour. + * Use it sparingly! Normally, @flags will be zero. + * + * This function returns a pointer to a structure that represents + * the device on the port, or %NULL if there is not enough memory + * to allocate space for that structure. + **/ + struct pardevice * parport_register_dev_model(struct parport *port, const char *name, const struct pardev_cb *par_dev_cb, int id) diff --git a/include/linux/parport.h b/include/linux/parport.h index 13932ce8b37b..36a0f6270238 100644 --- a/include/linux/parport.h +++ b/include/linux/parport.h @@ -337,6 +337,10 @@ struct pardevice *parport_register_device(struct parport *port, void (*irq_func)(void *), int flags, void *handle); +/* + * parport_register_dev_model declares that a device is connected to a + * port, and tells the kernel all it needs to know. + */ struct pardevice * parport_register_dev_model(struct parport *port, const char *name, const struct pardev_cb *par_dev_cb, int cnt); -- cgit v1.2.3 From bae9defb06a781083844cafd0a359f423cd98388 Mon Sep 17 00:00:00 2001 From: Sudip Mukherjee Date: Fri, 3 Apr 2020 14:43:24 +0100 Subject: parport: remove unused parport_register_device() All the drivers that are using parallel port has been converted to use the new device model api, and parport_register_device() is no longer used. Signed-off-by: Sudip Mukherjee Link: https://lore.kernel.org/r/20200403134325.11523-10-sudipm.mukherjee@gmail.com Signed-off-by: Greg Kroah-Hartman --- drivers/parport/share.c | 192 ------------------------------------------------ include/linux/parport.h | 12 --- 2 files changed, 204 deletions(-) (limited to 'include/linux') diff --git a/drivers/parport/share.c b/drivers/parport/share.c index ee2892a935d6..661f623b3129 100644 --- a/drivers/parport/share.c +++ b/drivers/parport/share.c @@ -641,198 +641,6 @@ void parport_remove_port(struct parport *port) } EXPORT_SYMBOL(parport_remove_port); -/** - * parport_register_device - register a device on a parallel port - * @port: port to which the device is attached - * @name: a name to refer to the device - * @pf: preemption callback - * @kf: kick callback (wake-up) - * @irq_func: interrupt handler - * @flags: registration flags - * @handle: data for callback functions - * - * This function, called by parallel port device drivers, - * declares that a device is connected to a port, and tells the - * system all it needs to know. 
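/*
 * Aside: a hedged usage sketch for the device-model API documented
 * above.  The names example_preempt(), example_wakeup() and
 * example_attach() are hypothetical, not part of the parport API.
 */
#include <linux/parport.h>

static int example_preempt(void *private)
{
	/* Zero means we are willing to release the port on request. */
	return 0;
}

static void example_wakeup(void *private)
{
	/* The port can now be claimed: parport_claim() would succeed. */
}

static struct pardevice *example_attach(struct parport *port)
{
	struct pardev_cb cb = {
		.preempt = example_preempt,
		.wakeup  = example_wakeup,
		.private = NULL,	/* handed back to the callbacks */
		.flags   = 0,		/* no PARPORT_DEV_EXCL: share the port */
	};

	return parport_register_dev_model(port, "example", &cb, 0);
}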
- * - * The @name is allocated by the caller and must not be - * deallocated until the caller calls @parport_unregister_device - * for that device. - * - * The preemption callback function, @pf, is called when this - * device driver has claimed access to the port but another - * device driver wants to use it. It is given @handle as its - * parameter, and should return zero if it is willing for the - * system to release the port to another driver on its behalf. - * If it wants to keep control of the port it should return - * non-zero, and no action will be taken. It is good manners for - * the driver to try to release the port at the earliest - * opportunity after its preemption callback rejects a preemption - * attempt. Note that if a preemption callback is happy for - * preemption to go ahead, there is no need to release the port; - * it is done automatically. This function may not block, as it - * may be called from interrupt context. If the device driver - * does not support preemption, @pf can be %NULL. - * - * The wake-up ("kick") callback function, @kf, is called when - * the port is available to be claimed for exclusive access; that - * is, parport_claim() is guaranteed to succeed when called from - * inside the wake-up callback function. If the driver wants to - * claim the port it should do so; otherwise, it need not take - * any action. This function may not block, as it may be called - * from interrupt context. If the device driver does not want to - * be explicitly invited to claim the port in this way, @kf can - * be %NULL. - * - * The interrupt handler, @irq_func, is called when an interrupt - * arrives from the parallel port. Note that if a device driver - * wants to use interrupts it should use parport_enable_irq(), - * and can also check the irq member of the parport structure - * representing the port. - * - * The parallel port (lowlevel) driver is the one that has called - * request_irq() and whose interrupt handler is called first. - * This handler does whatever needs to be done to the hardware to - * acknowledge the interrupt (for PC-style ports there is nothing - * special to be done). It then tells the IEEE 1284 code about - * the interrupt, which may involve reacting to an IEEE 1284 - * event depending on the current IEEE 1284 phase. After this, - * it calls @irq_func. Needless to say, @irq_func will be called - * from interrupt context, and may not block. - * - * The %PARPORT_DEV_EXCL flag is for preventing port sharing, and - * so should only be used when sharing the port with other device - * drivers is impossible and would lead to incorrect behaviour. - * Use it sparingly! Normally, @flags will be zero. - * - * This function returns a pointer to a structure that represents - * the device on the port, or %NULL if there is not enough memory - * to allocate space for that structure. - **/ - -struct pardevice * -parport_register_device(struct parport *port, const char *name, - int (*pf)(void *), void (*kf)(void *), - void (*irq_func)(void *), - int flags, void *handle) -{ - struct pardevice *tmp; - - if (port->physport->flags & PARPORT_FLAG_EXCL) { - /* An exclusive device is registered. 
*/ - printk(KERN_DEBUG "%s: no more devices allowed\n", port->name); - return NULL; - } - - if (flags & PARPORT_DEV_LURK) { - if (!pf || !kf) { - pr_info("%s: refused to register lurking device (%s) without callbacks\n", - port->name, name); - return NULL; - } - } - - if (flags & PARPORT_DEV_EXCL) { - if (port->physport->devices) { - /* - * If a device is already registered and this new - * device wants exclusive access, then no need to - * continue as we can not grant exclusive access to - * this device. - */ - pr_err("%s: cannot grant exclusive access for device %s\n", - port->name, name); - return NULL; - } - } - - /* - * We up our own module reference count, and that of the port - * on which a device is to be registered, to ensure that - * neither of us gets unloaded while we sleep in (e.g.) - * kmalloc. - */ - if (!try_module_get(port->ops->owner)) - return NULL; - - parport_get_port(port); - - tmp = kmalloc(sizeof(struct pardevice), GFP_KERNEL); - if (!tmp) - goto out; - - tmp->state = kmalloc(sizeof(struct parport_state), GFP_KERNEL); - if (!tmp->state) - goto out_free_pardevice; - - tmp->name = name; - tmp->port = port; - tmp->daisy = -1; - tmp->preempt = pf; - tmp->wakeup = kf; - tmp->private = handle; - tmp->flags = flags; - tmp->irq_func = irq_func; - tmp->waiting = 0; - tmp->timeout = 5 * HZ; - tmp->devmodel = false; - - /* Chain this onto the list */ - tmp->prev = NULL; - /* - * This function must not run from an irq handler so we don' t need - * to clear irq on the local CPU. -arca - */ - spin_lock(&port->physport->pardevice_lock); - - if (flags & PARPORT_DEV_EXCL) { - if (port->physport->devices) { - spin_unlock(&port->physport->pardevice_lock); - printk(KERN_DEBUG "%s: cannot grant exclusive access for device %s\n", - port->name, name); - goto out_free_all; - } - port->flags |= PARPORT_FLAG_EXCL; - } - - tmp->next = port->physport->devices; - wmb(); /* - * Make sure that tmp->next is written before it's - * added to the list; see comments marked 'no locking - * required' - */ - if (port->physport->devices) - port->physport->devices->prev = tmp; - port->physport->devices = tmp; - spin_unlock(&port->physport->pardevice_lock); - - init_waitqueue_head(&tmp->wait_q); - tmp->timeslice = parport_default_timeslice; - tmp->waitnext = tmp->waitprev = NULL; - - /* - * This has to be run as last thing since init_state may need other - * pardevice fields. -arca - */ - port->ops->init_state(tmp, tmp->state); - if (!test_and_set_bit(PARPORT_DEVPROC_REGISTERED, &port->devflags)) { - port->proc_device = tmp; - parport_device_proc_register(tmp); - } - return tmp; - - out_free_all: - kfree(tmp->state); - out_free_pardevice: - kfree(tmp); - out: - parport_put_port(port); - module_put(port->ops->owner); - - return NULL; -} -EXPORT_SYMBOL(parport_register_device); - static void free_pardevice(struct device *dev) { struct pardevice *par_dev = to_pardevice(dev); diff --git a/include/linux/parport.h b/include/linux/parport.h index 36a0f6270238..1fb508c19e83 100644 --- a/include/linux/parport.h +++ b/include/linux/parport.h @@ -325,18 +325,6 @@ struct pardev_cb { unsigned int flags; }; -/* parport_register_device declares that a device is connected to a - port, and tells the kernel all it needs to know. - - pf is the preemption function (may be NULL for no callback) - - kf is the wake-up function (may be NULL for no callback) - - irq_func is the interrupt handler (may be NULL for no interrupts) - - handle is a user pointer that gets handed to callback functions. 
*/ -struct pardevice *parport_register_device(struct parport *port, - const char *name, - int (*pf)(void *), void (*kf)(void *), - void (*irq_func)(void *), - int flags, void *handle); - /* * parport_register_dev_model declares that a device is connected to a * port, and tells the kernel all it needs to know. -- cgit v1.2.3 From 5d1c9a114a6efba2c8391e39d4ac3e4e5c7b6d32 Mon Sep 17 00:00:00 2001 From: Leon Romanovsky Date: Tue, 7 Apr 2020 18:59:51 +0300 Subject: net/mlx5: Update vport.c to new cmd interface Do mass update of vport.c to reuse newly introduced mlx5_cmd_exec_in*() interfaces. Signed-off-by: Leon Romanovsky --- drivers/infiniband/hw/mlx5/ib_virt.c | 2 +- drivers/infiniband/hw/mlx5/mad.c | 4 +- drivers/net/ethernet/mellanox/mlx5/core/vport.c | 142 +++++++++++------------- include/linux/mlx5/vport.h | 3 +- 4 files changed, 71 insertions(+), 80 deletions(-) (limited to 'include/linux') diff --git a/drivers/infiniband/hw/mlx5/ib_virt.c b/drivers/infiniband/hw/mlx5/ib_virt.c index b61165359954..46b2d370fb3f 100644 --- a/drivers/infiniband/hw/mlx5/ib_virt.c +++ b/drivers/infiniband/hw/mlx5/ib_virt.c @@ -134,7 +134,7 @@ int mlx5_ib_get_vf_stats(struct ib_device *device, int vf, if (!out) return -ENOMEM; - err = mlx5_core_query_vport_counter(mdev, true, vf, port, out, out_sz); + err = mlx5_core_query_vport_counter(mdev, true, vf, port, out); if (err) goto ex; diff --git a/drivers/infiniband/hw/mlx5/mad.c b/drivers/infiniband/hw/mlx5/mad.c index f0ab6d7d8497..454ce5de2de7 100644 --- a/drivers/infiniband/hw/mlx5/mad.c +++ b/drivers/infiniband/hw/mlx5/mad.c @@ -187,8 +187,8 @@ static int process_pma_cmd(struct mlx5_ib_dev *dev, u8 port_num, goto done; } - err = mlx5_core_query_vport_counter(mdev, 0, 0, - mdev_port_num, out_cnt, sz); + err = mlx5_core_query_vport_counter(mdev, 0, 0, mdev_port_num, + out_cnt); if (!err) pma_cnt_ext_assign(pma_cnt_ext, out_cnt); } else { diff --git a/drivers/net/ethernet/mellanox/mlx5/core/vport.c b/drivers/net/ethernet/mellanox/mlx5/core/vport.c index 23f879da9104..c107d92dc118 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/vport.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/vport.c @@ -40,10 +40,11 @@ /* Mutex to hold while enabling or disabling RoCE */ static DEFINE_MUTEX(mlx5_roce_en_lock); -static int _mlx5_query_vport_state(struct mlx5_core_dev *mdev, u8 opmod, - u16 vport, u32 *out, int outlen) +u8 mlx5_query_vport_state(struct mlx5_core_dev *mdev, u8 opmod, u16 vport) { - u32 in[MLX5_ST_SZ_DW(query_vport_state_in)] = {0}; + u32 out[MLX5_ST_SZ_DW(query_vport_state_out)] = {}; + u32 in[MLX5_ST_SZ_DW(query_vport_state_in)] = {}; + int err; MLX5_SET(query_vport_state_in, in, opcode, MLX5_CMD_OP_QUERY_VPORT_STATE); @@ -52,14 +53,9 @@ static int _mlx5_query_vport_state(struct mlx5_core_dev *mdev, u8 opmod, if (vport) MLX5_SET(query_vport_state_in, in, other_vport, 1); - return mlx5_cmd_exec(mdev, in, sizeof(in), out, outlen); -} - -u8 mlx5_query_vport_state(struct mlx5_core_dev *mdev, u8 opmod, u16 vport) -{ - u32 out[MLX5_ST_SZ_DW(query_vport_state_out)] = {0}; - - _mlx5_query_vport_state(mdev, opmod, vport, out, sizeof(out)); + err = mlx5_cmd_exec_inout(mdev, query_vport_state, in, out); + if (err) + return 0; return MLX5_GET(query_vport_state_out, out, state); } @@ -67,8 +63,7 @@ u8 mlx5_query_vport_state(struct mlx5_core_dev *mdev, u8 opmod, u16 vport) int mlx5_modify_vport_admin_state(struct mlx5_core_dev *mdev, u8 opmod, u16 vport, u8 other_vport, u8 state) { - u32 in[MLX5_ST_SZ_DW(modify_vport_state_in)] = {0}; - u32 
out[MLX5_ST_SZ_DW(modify_vport_state_out)] = {0}; + u32 in[MLX5_ST_SZ_DW(modify_vport_state_in)] = {}; MLX5_SET(modify_vport_state_in, in, opcode, MLX5_CMD_OP_MODIFY_VPORT_STATE); @@ -77,13 +72,13 @@ int mlx5_modify_vport_admin_state(struct mlx5_core_dev *mdev, u8 opmod, MLX5_SET(modify_vport_state_in, in, other_vport, other_vport); MLX5_SET(modify_vport_state_in, in, admin_state, state); - return mlx5_cmd_exec(mdev, in, sizeof(in), out, sizeof(out)); + return mlx5_cmd_exec_in(mdev, modify_vport_state, in); } static int mlx5_query_nic_vport_context(struct mlx5_core_dev *mdev, u16 vport, - u32 *out, int outlen) + u32 *out) { - u32 in[MLX5_ST_SZ_DW(query_nic_vport_context_in)] = {0}; + u32 in[MLX5_ST_SZ_DW(query_nic_vport_context_in)] = {}; MLX5_SET(query_nic_vport_context_in, in, opcode, MLX5_CMD_OP_QUERY_NIC_VPORT_CONTEXT); @@ -91,26 +86,16 @@ static int mlx5_query_nic_vport_context(struct mlx5_core_dev *mdev, u16 vport, if (vport) MLX5_SET(query_nic_vport_context_in, in, other_vport, 1); - return mlx5_cmd_exec(mdev, in, sizeof(in), out, outlen); -} - -static int mlx5_modify_nic_vport_context(struct mlx5_core_dev *mdev, void *in, - int inlen) -{ - u32 out[MLX5_ST_SZ_DW(modify_nic_vport_context_out)] = {0}; - - MLX5_SET(modify_nic_vport_context_in, in, opcode, - MLX5_CMD_OP_MODIFY_NIC_VPORT_CONTEXT); - return mlx5_cmd_exec(mdev, in, inlen, out, sizeof(out)); + return mlx5_cmd_exec_inout(mdev, query_nic_vport_context, in, out); } int mlx5_query_nic_vport_min_inline(struct mlx5_core_dev *mdev, u16 vport, u8 *min_inline) { - u32 out[MLX5_ST_SZ_DW(query_nic_vport_context_out)] = {0}; + u32 out[MLX5_ST_SZ_DW(query_nic_vport_context_out)] = {}; int err; - err = mlx5_query_nic_vport_context(mdev, vport, out, sizeof(out)); + err = mlx5_query_nic_vport_context(mdev, vport, out); if (!err) *min_inline = MLX5_GET(query_nic_vport_context_out, out, nic_vport_context.min_wqe_inline_mode); @@ -139,8 +124,7 @@ EXPORT_SYMBOL_GPL(mlx5_query_min_inline); int mlx5_modify_nic_vport_min_inline(struct mlx5_core_dev *mdev, u16 vport, u8 min_inline) { - u32 in[MLX5_ST_SZ_DW(modify_nic_vport_context_in)] = {0}; - int inlen = MLX5_ST_SZ_BYTES(modify_nic_vport_context_in); + u32 in[MLX5_ST_SZ_DW(modify_nic_vport_context_in)] = {}; void *nic_vport_ctx; MLX5_SET(modify_nic_vport_context_in, in, @@ -152,23 +136,20 @@ int mlx5_modify_nic_vport_min_inline(struct mlx5_core_dev *mdev, in, nic_vport_context); MLX5_SET(nic_vport_context, nic_vport_ctx, min_wqe_inline_mode, min_inline); + MLX5_SET(modify_nic_vport_context_in, in, opcode, + MLX5_CMD_OP_MODIFY_NIC_VPORT_CONTEXT); - return mlx5_modify_nic_vport_context(mdev, in, inlen); + return mlx5_cmd_exec_in(mdev, modify_nic_vport_context, in); } int mlx5_query_nic_vport_mac_address(struct mlx5_core_dev *mdev, u16 vport, bool other, u8 *addr) { - int outlen = MLX5_ST_SZ_BYTES(query_nic_vport_context_out); + u32 out[MLX5_ST_SZ_DW(query_nic_vport_context_out)] = {}; u32 in[MLX5_ST_SZ_DW(query_nic_vport_context_in)] = {}; u8 *out_addr; - u32 *out; int err; - out = kvzalloc(outlen, GFP_KERNEL); - if (!out) - return -ENOMEM; - out_addr = MLX5_ADDR_OF(query_nic_vport_context_out, out, nic_vport_context.permanent_address); @@ -177,11 +158,10 @@ int mlx5_query_nic_vport_mac_address(struct mlx5_core_dev *mdev, MLX5_SET(query_nic_vport_context_in, in, vport_number, vport); MLX5_SET(query_nic_vport_context_in, in, other_vport, other); - err = mlx5_cmd_exec(mdev, in, sizeof(in), out, outlen); + err = mlx5_cmd_exec_inout(mdev, query_nic_vport_context, in, out); if (!err) 
ether_addr_copy(addr, &out_addr[2]); - kvfree(out); return err; } EXPORT_SYMBOL_GPL(mlx5_query_nic_vport_mac_address); @@ -216,8 +196,10 @@ int mlx5_modify_nic_vport_mac_address(struct mlx5_core_dev *mdev, permanent_address); ether_addr_copy(&perm_mac[2], addr); + MLX5_SET(modify_nic_vport_context_in, in, opcode, + MLX5_CMD_OP_MODIFY_NIC_VPORT_CONTEXT); - err = mlx5_modify_nic_vport_context(mdev, in, inlen); + err = mlx5_cmd_exec_in(mdev, modify_nic_vport_context, in); kvfree(in); @@ -235,7 +217,7 @@ int mlx5_query_nic_vport_mtu(struct mlx5_core_dev *mdev, u16 *mtu) if (!out) return -ENOMEM; - err = mlx5_query_nic_vport_context(mdev, 0, out, outlen); + err = mlx5_query_nic_vport_context(mdev, 0, out); if (!err) *mtu = MLX5_GET(query_nic_vport_context_out, out, nic_vport_context.mtu); @@ -257,8 +239,10 @@ int mlx5_modify_nic_vport_mtu(struct mlx5_core_dev *mdev, u16 mtu) MLX5_SET(modify_nic_vport_context_in, in, field_select.mtu, 1); MLX5_SET(modify_nic_vport_context_in, in, nic_vport_context.mtu, mtu); + MLX5_SET(modify_nic_vport_context_in, in, opcode, + MLX5_CMD_OP_MODIFY_NIC_VPORT_CONTEXT); - err = mlx5_modify_nic_vport_context(mdev, in, inlen); + err = mlx5_cmd_exec_in(mdev, modify_nic_vport_context, in); kvfree(in); return err; @@ -292,7 +276,7 @@ int mlx5_query_nic_vport_mac_list(struct mlx5_core_dev *dev, req_list_size = max_list_size; } - out_sz = MLX5_ST_SZ_BYTES(modify_nic_vport_context_in) + + out_sz = MLX5_ST_SZ_BYTES(query_nic_vport_context_in) + req_list_size * MLX5_ST_SZ_BYTES(mac_address_layout); out = kzalloc(out_sz, GFP_KERNEL); @@ -332,7 +316,7 @@ int mlx5_modify_nic_vport_mac_list(struct mlx5_core_dev *dev, u8 addr_list[][ETH_ALEN], int list_size) { - u32 out[MLX5_ST_SZ_DW(modify_nic_vport_context_out)]; + u32 out[MLX5_ST_SZ_DW(modify_nic_vport_context_out)] = {}; void *nic_vport_ctx; int max_list_size; int in_sz; @@ -350,7 +334,6 @@ int mlx5_modify_nic_vport_mac_list(struct mlx5_core_dev *dev, in_sz = MLX5_ST_SZ_BYTES(modify_nic_vport_context_in) + list_size * MLX5_ST_SZ_BYTES(mac_address_layout); - memset(out, 0, sizeof(out)); in = kzalloc(in_sz, GFP_KERNEL); if (!in) return -ENOMEM; @@ -442,7 +425,7 @@ int mlx5_query_nic_vport_system_image_guid(struct mlx5_core_dev *mdev, if (!out) return -ENOMEM; - mlx5_query_nic_vport_context(mdev, 0, out, outlen); + mlx5_query_nic_vport_context(mdev, 0, out); *system_image_guid = MLX5_GET64(query_nic_vport_context_out, out, nic_vport_context.system_image_guid); @@ -462,7 +445,7 @@ int mlx5_query_nic_vport_node_guid(struct mlx5_core_dev *mdev, u64 *node_guid) if (!out) return -ENOMEM; - mlx5_query_nic_vport_context(mdev, 0, out, outlen); + mlx5_query_nic_vport_context(mdev, 0, out); *node_guid = MLX5_GET64(query_nic_vport_context_out, out, nic_vport_context.node_guid); @@ -498,8 +481,10 @@ int mlx5_modify_nic_vport_node_guid(struct mlx5_core_dev *mdev, nic_vport_context = MLX5_ADDR_OF(modify_nic_vport_context_in, in, nic_vport_context); MLX5_SET64(nic_vport_context, nic_vport_context, node_guid, node_guid); + MLX5_SET(modify_nic_vport_context_in, in, opcode, + MLX5_CMD_OP_MODIFY_NIC_VPORT_CONTEXT); - err = mlx5_modify_nic_vport_context(mdev, in, inlen); + err = mlx5_cmd_exec_in(mdev, modify_nic_vport_context, in); kvfree(in); @@ -516,7 +501,7 @@ int mlx5_query_nic_vport_qkey_viol_cntr(struct mlx5_core_dev *mdev, if (!out) return -ENOMEM; - mlx5_query_nic_vport_context(mdev, 0, out, outlen); + mlx5_query_nic_vport_context(mdev, 0, out); *qkey_viol_cntr = MLX5_GET(query_nic_vport_context_out, out, 
nic_vport_context.qkey_violation_counter); @@ -664,7 +649,7 @@ int mlx5_query_hca_vport_context(struct mlx5_core_dev *dev, struct mlx5_hca_vport_context *rep) { int out_sz = MLX5_ST_SZ_BYTES(query_hca_vport_context_out); - int in[MLX5_ST_SZ_DW(query_hca_vport_context_in)] = {0}; + int in[MLX5_ST_SZ_DW(query_hca_vport_context_in)] = {}; int is_group_manager; void *out; void *ctx; @@ -691,7 +676,7 @@ int mlx5_query_hca_vport_context(struct mlx5_core_dev *dev, if (MLX5_CAP_GEN(dev, num_ports) == 2) MLX5_SET(query_hca_vport_context_in, in, port_num, port_num); - err = mlx5_cmd_exec(dev, in, sizeof(in), out, out_sz); + err = mlx5_cmd_exec_inout(dev, query_hca_vport_context, in, out); if (err) goto ex; @@ -788,7 +773,7 @@ int mlx5_query_nic_vport_promisc(struct mlx5_core_dev *mdev, if (!out) return -ENOMEM; - err = mlx5_query_nic_vport_context(mdev, vport, out, outlen); + err = mlx5_query_nic_vport_context(mdev, vport, out); if (err) goto out; @@ -825,8 +810,10 @@ int mlx5_modify_nic_vport_promisc(struct mlx5_core_dev *mdev, nic_vport_context.promisc_mc, promisc_mc); MLX5_SET(modify_nic_vport_context_in, in, nic_vport_context.promisc_all, promisc_all); + MLX5_SET(modify_nic_vport_context_in, in, opcode, + MLX5_CMD_OP_MODIFY_NIC_VPORT_CONTEXT); - err = mlx5_modify_nic_vport_context(mdev, in, inlen); + err = mlx5_cmd_exec_in(mdev, modify_nic_vport_context, in); kvfree(in); @@ -865,8 +852,10 @@ int mlx5_nic_vport_update_local_lb(struct mlx5_core_dev *mdev, bool enable) if (MLX5_CAP_GEN(mdev, disable_local_lb_uc)) MLX5_SET(modify_nic_vport_context_in, in, field_select.disable_uc_local_lb, 1); + MLX5_SET(modify_nic_vport_context_in, in, opcode, + MLX5_CMD_OP_MODIFY_NIC_VPORT_CONTEXT); - err = mlx5_modify_nic_vport_context(mdev, in, inlen); + err = mlx5_cmd_exec_in(mdev, modify_nic_vport_context, in); if (!err) mlx5_core_dbg(mdev, "%s local_lb\n", @@ -888,7 +877,7 @@ int mlx5_nic_vport_query_local_lb(struct mlx5_core_dev *mdev, bool *status) if (!out) return -ENOMEM; - err = mlx5_query_nic_vport_context(mdev, 0, out, outlen); + err = mlx5_query_nic_vport_context(mdev, 0, out); if (err) goto out; @@ -925,8 +914,10 @@ static int mlx5_nic_vport_update_roce_state(struct mlx5_core_dev *mdev, MLX5_SET(modify_nic_vport_context_in, in, field_select.roce_en, 1); MLX5_SET(modify_nic_vport_context_in, in, nic_vport_context.roce_en, state); + MLX5_SET(modify_nic_vport_context_in, in, opcode, + MLX5_CMD_OP_MODIFY_NIC_VPORT_CONTEXT); - err = mlx5_modify_nic_vport_context(mdev, in, inlen); + err = mlx5_cmd_exec_in(mdev, modify_nic_vport_context, in); kvfree(in); @@ -965,16 +956,15 @@ int mlx5_nic_vport_disable_roce(struct mlx5_core_dev *mdev) mutex_unlock(&mlx5_roce_en_lock); return err; } -EXPORT_SYMBOL_GPL(mlx5_nic_vport_disable_roce); +EXPORT_SYMBOL(mlx5_nic_vport_disable_roce); int mlx5_core_query_vport_counter(struct mlx5_core_dev *dev, u8 other_vport, - int vf, u8 port_num, void *out, - size_t out_sz) + int vf, u8 port_num, void *out) { - int in_sz = MLX5_ST_SZ_BYTES(query_vport_counter_in); - int is_group_manager; - void *in; - int err; + int in_sz = MLX5_ST_SZ_BYTES(query_vport_counter_in); + int is_group_manager; + void *in; + int err; is_group_manager = MLX5_CAP_GEN(dev, vport_group_manager); in = kvzalloc(in_sz, GFP_KERNEL); @@ -997,7 +987,7 @@ int mlx5_core_query_vport_counter(struct mlx5_core_dev *dev, u8 other_vport, if (MLX5_CAP_GEN(dev, num_ports) == 2) MLX5_SET(query_vport_counter_in, in, port_num, port_num); - err = mlx5_cmd_exec(dev, in, in_sz, out, out_sz); + err = mlx5_cmd_exec_inout(dev, 
query_vport_counter, in, out); free: kvfree(in); return err; @@ -1008,8 +998,8 @@ int mlx5_query_vport_down_stats(struct mlx5_core_dev *mdev, u16 vport, u8 other_vport, u64 *rx_discard_vport_down, u64 *tx_discard_vport_down) { - u32 out[MLX5_ST_SZ_DW(query_vnic_env_out)] = {0}; - u32 in[MLX5_ST_SZ_DW(query_vnic_env_in)] = {0}; + u32 out[MLX5_ST_SZ_DW(query_vnic_env_out)] = {}; + u32 in[MLX5_ST_SZ_DW(query_vnic_env_in)] = {}; int err; MLX5_SET(query_vnic_env_in, in, opcode, @@ -1018,7 +1008,7 @@ int mlx5_query_vport_down_stats(struct mlx5_core_dev *mdev, u16 vport, MLX5_SET(query_vnic_env_in, in, vport_number, vport); MLX5_SET(query_vnic_env_in, in, other_vport, other_vport); - err = mlx5_cmd_exec(mdev, in, sizeof(in), out, sizeof(out)); + err = mlx5_cmd_exec_inout(mdev, query_vnic_env, in, out); if (err) return err; @@ -1035,11 +1025,10 @@ int mlx5_core_modify_hca_vport_context(struct mlx5_core_dev *dev, struct mlx5_hca_vport_context *req) { int in_sz = MLX5_ST_SZ_BYTES(modify_hca_vport_context_in); - u8 out[MLX5_ST_SZ_BYTES(modify_hca_vport_context_out)]; int is_group_manager; + void *ctx; void *in; int err; - void *ctx; mlx5_core_dbg(dev, "vf %d\n", vf); is_group_manager = MLX5_CAP_GEN(dev, vport_group_manager); @@ -1047,7 +1036,6 @@ int mlx5_core_modify_hca_vport_context(struct mlx5_core_dev *dev, if (!in) return -ENOMEM; - memset(out, 0, sizeof(out)); MLX5_SET(modify_hca_vport_context_in, in, opcode, MLX5_CMD_OP_MODIFY_HCA_VPORT_CONTEXT); if (other_vport) { if (is_group_manager) { @@ -1074,7 +1062,7 @@ int mlx5_core_modify_hca_vport_context(struct mlx5_core_dev *dev, MLX5_SET(hca_vport_context, ctx, cap_mask1, req->cap_mask1); MLX5_SET(hca_vport_context, ctx, cap_mask1_field_select, req->cap_mask1_perm); - err = mlx5_cmd_exec(dev, in, in_sz, out, sizeof(out)); + err = mlx5_cmd_exec_in(dev, modify_hca_vport_context, in); ex: kfree(in); return err; @@ -1103,8 +1091,10 @@ int mlx5_nic_vport_affiliate_multiport(struct mlx5_core_dev *master_mdev, MLX5_SET(modify_nic_vport_context_in, in, nic_vport_context.affiliation_criteria, MLX5_CAP_GEN(port_mdev, affiliate_nic_vport_criteria)); + MLX5_SET(modify_nic_vport_context_in, in, opcode, + MLX5_CMD_OP_MODIFY_NIC_VPORT_CONTEXT); - err = mlx5_modify_nic_vport_context(port_mdev, in, inlen); + err = mlx5_cmd_exec_in(port_mdev, modify_nic_vport_context, in); if (err) mlx5_nic_vport_disable_roce(port_mdev); @@ -1129,8 +1119,10 @@ int mlx5_nic_vport_unaffiliate_multiport(struct mlx5_core_dev *port_mdev) nic_vport_context.affiliated_vhca_id, 0); MLX5_SET(modify_nic_vport_context_in, in, nic_vport_context.affiliation_criteria, 0); + MLX5_SET(modify_nic_vport_context_in, in, opcode, + MLX5_CMD_OP_MODIFY_NIC_VPORT_CONTEXT); - err = mlx5_modify_nic_vport_context(port_mdev, in, inlen); + err = mlx5_cmd_exec_in(port_mdev, modify_nic_vport_context, in); if (!err) mlx5_nic_vport_disable_roce(port_mdev); @@ -1170,4 +1162,4 @@ u16 mlx5_eswitch_get_total_vports(const struct mlx5_core_dev *dev) { return MLX5_SPECIAL_VPORTS(dev) + mlx5_core_max_vfs(dev); } -EXPORT_SYMBOL(mlx5_eswitch_get_total_vports); +EXPORT_SYMBOL_GPL(mlx5_eswitch_get_total_vports); diff --git a/include/linux/mlx5/vport.h b/include/linux/mlx5/vport.h index 16060fb9b5e5..8170da1e9f70 100644 --- a/include/linux/mlx5/vport.h +++ b/include/linux/mlx5/vport.h @@ -127,8 +127,7 @@ int mlx5_query_vport_down_stats(struct mlx5_core_dev *mdev, u16 vport, u8 other_vport, u64 *rx_discard_vport_down, u64 *tx_discard_vport_down); int mlx5_core_query_vport_counter(struct mlx5_core_dev *dev, u8 other_vport, - int 
vf, u8 port_num, void *out, - size_t out_sz); + int vf, u8 port_num, void *out); int mlx5_core_modify_hca_vport_context(struct mlx5_core_dev *dev, u8 other_vport, u8 port_num, int vf, -- cgit v1.2.3 From d1f620500cde5c72c7b96a19474733c4c6c67f38 Mon Sep 17 00:00:00 2001 From: Leon Romanovsky Date: Thu, 9 Apr 2020 11:39:14 +0300 Subject: net/mlx5: Update cq.c to new cmd interface Do mass update of cq.c to reuse newly introduced mlx5_cmd_exec_in*() interfaces. Reviewed-by: Moshe Shemesh Signed-off-by: Leon Romanovsky --- drivers/net/ethernet/mellanox/mlx5/core/cq.c | 22 +++++++++------------- drivers/net/ethernet/mellanox/mlx5/core/debugfs.c | 2 +- .../net/ethernet/mellanox/mlx5/core/en/health.c | 2 +- include/linux/mlx5/cq.h | 2 +- 4 files changed, 12 insertions(+), 16 deletions(-) (limited to 'include/linux') diff --git a/drivers/net/ethernet/mellanox/mlx5/core/cq.c b/drivers/net/ethernet/mellanox/mlx5/core/cq.c index 4477a590b308..8379b24cb838 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/cq.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/cq.c @@ -90,8 +90,7 @@ int mlx5_core_create_cq(struct mlx5_core_dev *dev, struct mlx5_core_cq *cq, u32 *in, int inlen, u32 *out, int outlen) { int eqn = MLX5_GET(cqc, MLX5_ADDR_OF(create_cq_in, in, cq_context), c_eqn); - u32 dout[MLX5_ST_SZ_DW(destroy_cq_out)]; - u32 din[MLX5_ST_SZ_DW(destroy_cq_in)]; + u32 din[MLX5_ST_SZ_DW(destroy_cq_in)] = {}; struct mlx5_eq_comp *eq; int err; @@ -141,20 +140,17 @@ int mlx5_core_create_cq(struct mlx5_core_dev *dev, struct mlx5_core_cq *cq, err_cq_add: mlx5_eq_del_cq(&eq->core, cq); err_cmd: - memset(din, 0, sizeof(din)); - memset(dout, 0, sizeof(dout)); MLX5_SET(destroy_cq_in, din, opcode, MLX5_CMD_OP_DESTROY_CQ); MLX5_SET(destroy_cq_in, din, cqn, cq->cqn); MLX5_SET(destroy_cq_in, din, uid, cq->uid); - mlx5_cmd_exec(dev, din, sizeof(din), dout, sizeof(dout)); + mlx5_cmd_exec_in(dev, destroy_cq, din); return err; } EXPORT_SYMBOL(mlx5_core_create_cq); int mlx5_core_destroy_cq(struct mlx5_core_dev *dev, struct mlx5_core_cq *cq) { - u32 out[MLX5_ST_SZ_DW(destroy_cq_out)] = {0}; - u32 in[MLX5_ST_SZ_DW(destroy_cq_in)] = {0}; + u32 in[MLX5_ST_SZ_DW(destroy_cq_in)] = {}; int err; mlx5_eq_del_cq(mlx5_get_async_eq(dev), cq); @@ -163,7 +159,7 @@ int mlx5_core_destroy_cq(struct mlx5_core_dev *dev, struct mlx5_core_cq *cq) MLX5_SET(destroy_cq_in, in, opcode, MLX5_CMD_OP_DESTROY_CQ); MLX5_SET(destroy_cq_in, in, cqn, cq->cqn); MLX5_SET(destroy_cq_in, in, uid, cq->uid); - err = mlx5_cmd_exec(dev, in, sizeof(in), out, sizeof(out)); + err = mlx5_cmd_exec_in(dev, destroy_cq, in); if (err) return err; @@ -178,20 +174,20 @@ int mlx5_core_destroy_cq(struct mlx5_core_dev *dev, struct mlx5_core_cq *cq) EXPORT_SYMBOL(mlx5_core_destroy_cq); int mlx5_core_query_cq(struct mlx5_core_dev *dev, struct mlx5_core_cq *cq, - u32 *out, int outlen) + u32 *out) { - u32 in[MLX5_ST_SZ_DW(query_cq_in)] = {0}; + u32 in[MLX5_ST_SZ_DW(query_cq_in)] = {}; MLX5_SET(query_cq_in, in, opcode, MLX5_CMD_OP_QUERY_CQ); MLX5_SET(query_cq_in, in, cqn, cq->cqn); - return mlx5_cmd_exec(dev, in, sizeof(in), out, outlen); + return mlx5_cmd_exec_inout(dev, query_cq, in, out); } EXPORT_SYMBOL(mlx5_core_query_cq); int mlx5_core_modify_cq(struct mlx5_core_dev *dev, struct mlx5_core_cq *cq, u32 *in, int inlen) { - u32 out[MLX5_ST_SZ_DW(modify_cq_out)] = {0}; + u32 out[MLX5_ST_SZ_DW(modify_cq_out)] = {}; MLX5_SET(modify_cq_in, in, opcode, MLX5_CMD_OP_MODIFY_CQ); MLX5_SET(modify_cq_in, in, uid, cq->uid); @@ -204,7 +200,7 @@ int mlx5_core_modify_cq_moderation(struct 
mlx5_core_dev *dev, u16 cq_period, u16 cq_max_count) { - u32 in[MLX5_ST_SZ_DW(modify_cq_in)] = {0}; + u32 in[MLX5_ST_SZ_DW(modify_cq_in)] = {}; void *cqc; MLX5_SET(modify_cq_in, in, cqn, cq->cqn); diff --git a/drivers/net/ethernet/mellanox/mlx5/core/debugfs.c b/drivers/net/ethernet/mellanox/mlx5/core/debugfs.c index 65fef5a86644..c05e6a2c9126 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/debugfs.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/debugfs.c @@ -333,7 +333,7 @@ static u64 cq_read_field(struct mlx5_core_dev *dev, struct mlx5_core_cq *cq, if (!out) return param; - err = mlx5_core_query_cq(dev, cq, out, outlen); + err = mlx5_core_query_cq(dev, cq, out); if (err) { mlx5_core_warn(dev, "failed to query cq\n"); goto out; diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en/health.c b/drivers/net/ethernet/mellanox/mlx5/core/en/health.c index 3a199a03d929..7283443868f3 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en/health.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en/health.c @@ -43,7 +43,7 @@ int mlx5e_reporter_cq_diagnose(struct mlx5e_cq *cq, struct devlink_fmsg *fmsg) void *cqc; int err; - err = mlx5_core_query_cq(priv->mdev, &cq->mcq, out, sizeof(out)); + err = mlx5_core_query_cq(priv->mdev, &cq->mcq, out); if (err) return err; diff --git a/include/linux/mlx5/cq.h b/include/linux/mlx5/cq.h index 40748fc1b11b..b5a9399e07ee 100644 --- a/include/linux/mlx5/cq.h +++ b/include/linux/mlx5/cq.h @@ -188,7 +188,7 @@ int mlx5_core_create_cq(struct mlx5_core_dev *dev, struct mlx5_core_cq *cq, u32 *in, int inlen, u32 *out, int outlen); int mlx5_core_destroy_cq(struct mlx5_core_dev *dev, struct mlx5_core_cq *cq); int mlx5_core_query_cq(struct mlx5_core_dev *dev, struct mlx5_core_cq *cq, - u32 *out, int outlen); + u32 *out); int mlx5_core_modify_cq(struct mlx5_core_dev *dev, struct mlx5_core_cq *cq, u32 *in, int inlen); int mlx5_core_modify_cq_moderation(struct mlx5_core_dev *dev, -- cgit v1.2.3 From e0b4b4722dfac09658d1519b296cf8dc349a2451 Mon Sep 17 00:00:00 2001 From: Leon Romanovsky Date: Thu, 9 Apr 2020 21:03:33 +0300 Subject: net/mlx5: Update transobj.c to new cmd interface Do mass update of transobj.c to reuse newly introduced mlx5_cmd_exec_in*() interfaces.
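For readers of this archive: the mlx5_cmd_exec_in()/mlx5_cmd_exec_inout() helpers themselves are not shown in these patches. A plausible sketch of their shape, inferred from call sites such as mlx5_cmd_exec_in(dev, destroy_cq, in) and mlx5_cmd_exec_inout(dev, query_cq, in, out); the real definitions live in the mlx5 driver headers and may differ in detail:

    /* Sketch only: the in/out lengths are derived from the IFC command
     * name by token pasting, so callers no longer pass inlen/outlen.
     */
    #define mlx5_cmd_exec_inout(dev, ifc_cmd, _in, _out)                  \
            mlx5_cmd_exec(dev, _in, MLX5_ST_SZ_BYTES(ifc_cmd##_in),       \
                          _out, MLX5_ST_SZ_BYTES(ifc_cmd##_out))

    #define mlx5_cmd_exec_in(dev, ifc_cmd, _in)                           \
    ({                                                                    \
            u32 _out[MLX5_ST_SZ_DW(ifc_cmd##_out)] = {};                  \
            mlx5_cmd_exec_inout(dev, ifc_cmd, _in, _out);                 \
    })

This is why the diffs below can drop the explicit out[] buffers and the sizeof() bookkeeping at every call site.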
Signed-off-by: Leon Romanovsky --- drivers/infiniband/hw/mlx5/qp.c | 32 +++--- drivers/net/ethernet/mellanox/mlx5/core/en.h | 6 +- .../net/ethernet/mellanox/mlx5/core/en_common.c | 7 +- .../net/ethernet/mellanox/mlx5/core/en_ethtool.c | 2 +- .../ethernet/mellanox/mlx5/core/en_fs_ethtool.c | 2 +- drivers/net/ethernet/mellanox/mlx5/core/en_main.c | 29 +++--- drivers/net/ethernet/mellanox/mlx5/core/en_tc.c | 6 +- drivers/net/ethernet/mellanox/mlx5/core/transobj.c | 113 ++++++++------------- include/linux/mlx5/transobj.h | 19 ++-- 9 files changed, 85 insertions(+), 131 deletions(-) (limited to 'include/linux') diff --git a/drivers/infiniband/hw/mlx5/qp.c b/drivers/infiniband/hw/mlx5/qp.c index 3ecd1864b3c8..af599c8b88aa 100644 --- a/drivers/infiniband/hw/mlx5/qp.c +++ b/drivers/infiniband/hw/mlx5/qp.c @@ -1255,7 +1255,7 @@ static int create_raw_packet_qp_tis(struct mlx5_ib_dev *dev, struct mlx5_ib_sq *sq, u32 tdn, struct ib_pd *pd) { - u32 in[MLX5_ST_SZ_DW(create_tis_in)] = {0}; + u32 in[MLX5_ST_SZ_DW(create_tis_in)] = {}; void *tisc = MLX5_ADDR_OF(create_tis_in, in, ctx); MLX5_SET(create_tis_in, in, uid, to_mpd(pd)->uid); @@ -1263,7 +1263,7 @@ static int create_raw_packet_qp_tis(struct mlx5_ib_dev *dev, if (qp->flags & MLX5_IB_QP_UNDERLAY) MLX5_SET(tisc, tisc, underlay_qpn, qp->underlay_qpn); - return mlx5_core_create_tis(dev->mdev, in, sizeof(in), &sq->tisn); + return mlx5_core_create_tis(dev->mdev, in, &sq->tisn); } static void destroy_raw_packet_qp_tis(struct mlx5_ib_dev *dev, @@ -1460,9 +1460,8 @@ static void destroy_raw_packet_qp_tir(struct mlx5_ib_dev *dev, static int create_raw_packet_qp_tir(struct mlx5_ib_dev *dev, struct mlx5_ib_rq *rq, u32 tdn, - u32 *qp_flags_en, - struct ib_pd *pd, - u32 *out, int outlen) + u32 *qp_flags_en, struct ib_pd *pd, + u32 *out) { u8 lb_flag = 0; u32 *in; @@ -1495,9 +1494,8 @@ static int create_raw_packet_qp_tir(struct mlx5_ib_dev *dev, } MLX5_SET(tirc, tirc, self_lb_block, lb_flag); - - err = mlx5_core_create_tir_out(dev->mdev, in, inlen, out, outlen); - + MLX5_SET(create_tir_in, in, opcode, MLX5_CMD_OP_CREATE_TIR); + err = mlx5_cmd_exec_inout(dev->mdev, create_tir, in, out); rq->tirn = MLX5_GET(create_tir_out, out, tirn); if (!err && MLX5_GET(tirc, tirc, self_lb_block)) { err = mlx5_ib_enable_lb(dev, false, true); @@ -1557,9 +1555,8 @@ static int create_raw_packet_qp(struct mlx5_ib_dev *dev, struct mlx5_ib_qp *qp, if (err) goto err_destroy_sq; - err = create_raw_packet_qp_tir( - dev, rq, tdn, &qp->flags_en, pd, out, - MLX5_ST_SZ_BYTES(create_tir_out)); + err = create_raw_packet_qp_tir(dev, rq, tdn, &qp->flags_en, pd, + out); if (err) goto err_destroy_rq; @@ -1854,7 +1851,8 @@ static int create_rss_raw_qp_tir(struct mlx5_ib_dev *dev, struct mlx5_ib_qp *qp, MLX5_SET(rx_hash_field_select, hfso, selected_fields, selected_fields); create_tir: - err = mlx5_core_create_tir_out(dev->mdev, in, inlen, out, outlen); + MLX5_SET(create_tir_in, in, opcode, MLX5_CMD_OP_CREATE_TIR); + err = mlx5_cmd_exec_inout(dev->mdev, create_tir, in, out); qp->rss_qp.tirn = MLX5_GET(create_tir_out, out, tirn); if (!err && MLX5_GET(tirc, tirc, self_lb_block)) { @@ -2933,7 +2931,7 @@ static int modify_raw_packet_eth_prio(struct mlx5_core_dev *dev, tisc = MLX5_ADDR_OF(modify_tis_in, in, ctx); MLX5_SET(tisc, tisc, prio, ((sl & 0x7) << 1)); - err = mlx5_core_modify_tis(dev, sq->tisn, in, inlen); + err = mlx5_core_modify_tis(dev, sq->tisn, in); kvfree(in); @@ -2960,7 +2958,7 @@ static int modify_raw_packet_tx_affinity(struct mlx5_core_dev *dev, tisc = MLX5_ADDR_OF(modify_tis_in, in, ctx); 
MLX5_SET(tisc, tisc, lag_tx_port_affinity, tx_affinity); - err = mlx5_core_modify_tis(dev, sq->tisn, in, inlen); + err = mlx5_core_modify_tis(dev, sq->tisn, in); kvfree(in); @@ -3240,7 +3238,7 @@ static int modify_raw_packet_qp_rq( "RAW PACKET QP counters are not supported on current FW\n"); } - err = mlx5_core_modify_rq(dev->mdev, rq->base.mqp.qpn, in, inlen); + err = mlx5_core_modify_rq(dev->mdev, rq->base.mqp.qpn, in); if (err) goto out; @@ -3303,7 +3301,7 @@ static int modify_raw_packet_qp_sq( MLX5_SET(sqc, sqc, packet_pacing_rate_limit_index, rl_index); } - err = mlx5_core_modify_sq(dev, sq->base.mqp.qpn, in, inlen); + err = mlx5_core_modify_sq(dev, sq->base.mqp.qpn, in); if (err) { /* Remove new rate from table if failed */ if (new_rate_added) @@ -6444,7 +6442,7 @@ int mlx5_ib_modify_wq(struct ib_wq *wq, struct ib_wq_attr *wq_attr, "Receive WQ counters are not supported on current FW\n"); } - err = mlx5_core_modify_rq(dev->mdev, rwq->core_qp.qpn, in, inlen); + err = mlx5_core_modify_rq(dev->mdev, rwq->core_qp.qpn, in); if (!err) rwq->ibwq.state = (wq_state == MLX5_RQC_STATE_ERR) ? IB_WQS_ERR : wq_state; diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en.h b/drivers/net/ethernet/mellanox/mlx5/core/en.h index 12a61bf82c14..1599b05f3c5a 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en.h +++ b/drivers/net/ethernet/mellanox/mlx5/core/en.h @@ -1012,7 +1012,7 @@ int mlx5e_redirect_rqt(struct mlx5e_priv *priv, u32 rqtn, int sz, void mlx5e_build_indir_tir_ctx_hash(struct mlx5e_rss_params *rss_params, const struct mlx5e_tirc_config *ttconfig, void *tirc, bool inner); -void mlx5e_modify_tirs_hash(struct mlx5e_priv *priv, void *in, int inlen); +void mlx5e_modify_tirs_hash(struct mlx5e_priv *priv, void *in); struct mlx5e_tirc_config mlx5e_tirc_get_default_config(enum mlx5e_traffic_types tt); struct mlx5e_xsk_param; @@ -1102,8 +1102,8 @@ void mlx5e_dcbnl_init_app(struct mlx5e_priv *priv); void mlx5e_dcbnl_delete_app(struct mlx5e_priv *priv); #endif -int mlx5e_create_tir(struct mlx5_core_dev *mdev, - struct mlx5e_tir *tir, u32 *in, int inlen); +int mlx5e_create_tir(struct mlx5_core_dev *mdev, struct mlx5e_tir *tir, + u32 *in); void mlx5e_destroy_tir(struct mlx5_core_dev *mdev, struct mlx5e_tir *tir); int mlx5e_create_mdev_resources(struct mlx5_core_dev *mdev); diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_common.c b/drivers/net/ethernet/mellanox/mlx5/core/en_common.c index f7890e0ce96c..af3228b3f303 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_common.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_common.c @@ -36,12 +36,11 @@ * Global resources are common to all the netdevices crated on the same nic. 
*/ -int mlx5e_create_tir(struct mlx5_core_dev *mdev, - struct mlx5e_tir *tir, u32 *in, int inlen) +int mlx5e_create_tir(struct mlx5_core_dev *mdev, struct mlx5e_tir *tir, u32 *in) { int err; - err = mlx5_core_create_tir(mdev, in, inlen, &tir->tirn); + err = mlx5_core_create_tir(mdev, in, &tir->tirn); if (err) return err; @@ -167,7 +166,7 @@ int mlx5e_refresh_tirs(struct mlx5e_priv *priv, bool enable_uc_lb) mutex_lock(&mdev->mlx5e_res.td.list_lock); list_for_each_entry(tir, &mdev->mlx5e_res.td.tirs_list, list) { tirn = tir->tirn; - err = mlx5_core_modify_tir(mdev, tirn, in, inlen); + err = mlx5_core_modify_tir(mdev, tirn, in); if (err) goto out; } diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_ethtool.c b/drivers/net/ethernet/mellanox/mlx5/core/en_ethtool.c index 6d703ddee4e2..de8250820b06 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_ethtool.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_ethtool.c @@ -1204,7 +1204,7 @@ int mlx5e_set_rxfh(struct net_device *dev, const u32 *indir, } if (hash_changed) - mlx5e_modify_tirs_hash(priv, in, inlen); + mlx5e_modify_tirs_hash(priv, in); mutex_unlock(&priv->state_lock); diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_fs_ethtool.c b/drivers/net/ethernet/mellanox/mlx5/core/en_fs_ethtool.c index 3bc2ac3d53fc..83c9b2bbc4af 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_fs_ethtool.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_fs_ethtool.c @@ -858,7 +858,7 @@ static int mlx5e_set_rss_hash_opt(struct mlx5e_priv *priv, goto out; priv->rss_params.rx_hash_fields[tt] = rx_hash_field; - mlx5e_modify_tirs_hash(priv, in, inlen); + mlx5e_modify_tirs_hash(priv, in); out: mutex_unlock(&priv->state_lock); diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c index 30970b405040..05dbe8b9caac 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c @@ -721,7 +721,7 @@ int mlx5e_modify_rq_state(struct mlx5e_rq *rq, int curr_state, int next_state) MLX5_SET(modify_rq_in, in, rq_state, curr_state); MLX5_SET(rqc, rqc, state, next_state); - err = mlx5_core_modify_rq(mdev, rq->rqn, in, inlen); + err = mlx5_core_modify_rq(mdev, rq->rqn, in); kvfree(in); @@ -752,7 +752,7 @@ static int mlx5e_modify_rq_scatter_fcs(struct mlx5e_rq *rq, bool enable) MLX5_SET(rqc, rqc, scatter_fcs, enable); MLX5_SET(rqc, rqc, state, MLX5_RQC_STATE_RDY); - err = mlx5_core_modify_rq(mdev, rq->rqn, in, inlen); + err = mlx5_core_modify_rq(mdev, rq->rqn, in); kvfree(in); @@ -781,7 +781,7 @@ static int mlx5e_modify_rq_vsd(struct mlx5e_rq *rq, bool vsd) MLX5_SET(rqc, rqc, vsd, vsd); MLX5_SET(rqc, rqc, state, MLX5_RQC_STATE_RDY); - err = mlx5_core_modify_rq(mdev, rq->rqn, in, inlen); + err = mlx5_core_modify_rq(mdev, rq->rqn, in); kvfree(in); @@ -1259,7 +1259,7 @@ int mlx5e_modify_sq(struct mlx5_core_dev *mdev, u32 sqn, MLX5_SET(sqc, sqc, packet_pacing_rate_limit_index, p->rl_index); } - err = mlx5_core_modify_sq(mdev, sqn, in, inlen); + err = mlx5_core_modify_sq(mdev, sqn, in); kvfree(in); @@ -2698,7 +2698,7 @@ static void mlx5e_update_rx_hash_fields(struct mlx5e_tirc_config *ttconfig, ttconfig->rx_hash_fields = rx_hash_fields; } -void mlx5e_modify_tirs_hash(struct mlx5e_priv *priv, void *in, int inlen) +void mlx5e_modify_tirs_hash(struct mlx5e_priv *priv, void *in) { void *tirc = MLX5_ADDR_OF(modify_tir_in, in, ctx); struct mlx5e_rss_params *rss = &priv->rss_params; @@ -2714,7 +2714,7 @@ void mlx5e_modify_tirs_hash(struct mlx5e_priv *priv, 
void *in, int inlen) mlx5e_update_rx_hash_fields(&ttconfig, tt, rss->rx_hash_fields[tt]); mlx5e_build_indir_tir_ctx_hash(rss, &ttconfig, tirc, false); - mlx5_core_modify_tir(mdev, priv->indir_tir[tt].tirn, in, inlen); + mlx5_core_modify_tir(mdev, priv->indir_tir[tt].tirn, in); } if (!mlx5e_tunnel_inner_ft_supported(priv->mdev)) @@ -2725,8 +2725,7 @@ void mlx5e_modify_tirs_hash(struct mlx5e_priv *priv, void *in, int inlen) mlx5e_update_rx_hash_fields(&ttconfig, tt, rss->rx_hash_fields[tt]); mlx5e_build_indir_tir_ctx_hash(rss, &ttconfig, tirc, true); - mlx5_core_modify_tir(mdev, priv->inner_indir_tir[tt].tirn, in, - inlen); + mlx5_core_modify_tir(mdev, priv->inner_indir_tir[tt].tirn, in); } } @@ -2752,15 +2751,13 @@ static int mlx5e_modify_tirs_lro(struct mlx5e_priv *priv) mlx5e_build_tir_ctx_lro(&priv->channels.params, tirc); for (tt = 0; tt < MLX5E_NUM_INDIR_TIRS; tt++) { - err = mlx5_core_modify_tir(mdev, priv->indir_tir[tt].tirn, in, - inlen); + err = mlx5_core_modify_tir(mdev, priv->indir_tir[tt].tirn, in); if (err) goto free_in; } for (ix = 0; ix < priv->max_nch; ix++) { - err = mlx5_core_modify_tir(mdev, priv->direct_tir[ix].tirn, - in, inlen); + err = mlx5_core_modify_tir(mdev, priv->direct_tir[ix].tirn, in); if (err) goto free_in; } @@ -3214,7 +3211,7 @@ int mlx5e_create_tis(struct mlx5_core_dev *mdev, void *in, u32 *tisn) if (mlx5_lag_is_lacp_owner(mdev)) MLX5_SET(tisc, tisc, strict_lag_tx_port_affinity, 1); - return mlx5_core_create_tis(mdev, in, MLX5_ST_SZ_BYTES(create_tis_in), tisn); + return mlx5_core_create_tis(mdev, in, tisn); } void mlx5e_destroy_tis(struct mlx5_core_dev *mdev, u32 tisn) @@ -3332,7 +3329,7 @@ int mlx5e_create_indirect_tirs(struct mlx5e_priv *priv, bool inner_ttc) tir = &priv->indir_tir[tt]; tirc = MLX5_ADDR_OF(create_tir_in, in, ctx); mlx5e_build_indir_tir_ctx(priv, tt, tirc); - err = mlx5e_create_tir(priv->mdev, tir, in, inlen); + err = mlx5e_create_tir(priv->mdev, tir, in); if (err) { mlx5_core_warn(priv->mdev, "create indirect tirs failed, %d\n", err); goto err_destroy_inner_tirs; @@ -3347,7 +3344,7 @@ int mlx5e_create_indirect_tirs(struct mlx5e_priv *priv, bool inner_ttc) tir = &priv->inner_indir_tir[i]; tirc = MLX5_ADDR_OF(create_tir_in, in, ctx); mlx5e_build_inner_indir_tir_ctx(priv, i, tirc); - err = mlx5e_create_tir(priv->mdev, tir, in, inlen); + err = mlx5e_create_tir(priv->mdev, tir, in); if (err) { mlx5_core_warn(priv->mdev, "create inner indirect tirs failed, %d\n", err); goto err_destroy_inner_tirs; @@ -3390,7 +3387,7 @@ int mlx5e_create_direct_tirs(struct mlx5e_priv *priv, struct mlx5e_tir *tirs) tir = &tirs[ix]; tirc = MLX5_ADDR_OF(create_tir_in, in, ctx); mlx5e_build_direct_tir_ctx(priv, tir->rqt.rqtn, tirc); - err = mlx5e_create_tir(priv->mdev, tir, in, inlen); + err = mlx5e_create_tir(priv->mdev, tir, in); if (unlikely(err)) goto err_destroy_ch_tirs; } diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c b/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c index 438128dde187..88c0e460e995 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c @@ -568,7 +568,7 @@ struct mlx5_core_dev *mlx5e_hairpin_get_mdev(struct net *net, int ifindex) static int mlx5e_hairpin_create_transport(struct mlx5e_hairpin *hp) { - u32 in[MLX5_ST_SZ_DW(create_tir_in)] = {0}; + u32 in[MLX5_ST_SZ_DW(create_tir_in)] = {}; void *tirc; int err; @@ -582,7 +582,7 @@ static int mlx5e_hairpin_create_transport(struct mlx5e_hairpin *hp) MLX5_SET(tirc, tirc, inline_rqn, hp->pair->rqn[0]); MLX5_SET(tirc, tirc, 
transport_domain, hp->tdn); - err = mlx5_core_create_tir(hp->func_mdev, in, MLX5_ST_SZ_BYTES(create_tir_in), &hp->tirn); + err = mlx5_core_create_tir(hp->func_mdev, in, &hp->tirn); if (err) goto create_tir_err; @@ -666,7 +666,7 @@ static int mlx5e_hairpin_create_indirect_tirs(struct mlx5e_hairpin *hp) mlx5e_build_indir_tir_ctx_hash(&priv->rss_params, &ttconfig, tirc, false); err = mlx5_core_create_tir(hp->func_mdev, in, - MLX5_ST_SZ_BYTES(create_tir_in), &hp->indir_tirn[tt]); + &hp->indir_tirn[tt]); if (err) { mlx5_core_warn(hp->func_mdev, "create indirect tirs failed, %d\n", err); goto err_destroy_tirs; diff --git a/drivers/net/ethernet/mellanox/mlx5/core/transobj.c b/drivers/net/ethernet/mellanox/mlx5/core/transobj.c index b1068500f1df..01cc00ad8acf 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/transobj.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/transobj.c @@ -36,14 +36,14 @@ int mlx5_core_alloc_transport_domain(struct mlx5_core_dev *dev, u32 *tdn) { - u32 in[MLX5_ST_SZ_DW(alloc_transport_domain_in)] = {0}; - u32 out[MLX5_ST_SZ_DW(alloc_transport_domain_out)] = {0}; + u32 out[MLX5_ST_SZ_DW(alloc_transport_domain_out)] = {}; + u32 in[MLX5_ST_SZ_DW(alloc_transport_domain_in)] = {}; int err; MLX5_SET(alloc_transport_domain_in, in, opcode, MLX5_CMD_OP_ALLOC_TRANSPORT_DOMAIN); - err = mlx5_cmd_exec(dev, in, sizeof(in), out, sizeof(out)); + err = mlx5_cmd_exec_inout(dev, alloc_transport_domain, in, out); if (!err) *tdn = MLX5_GET(alloc_transport_domain_out, out, transport_domain); @@ -54,19 +54,18 @@ EXPORT_SYMBOL(mlx5_core_alloc_transport_domain); void mlx5_core_dealloc_transport_domain(struct mlx5_core_dev *dev, u32 tdn) { - u32 in[MLX5_ST_SZ_DW(dealloc_transport_domain_in)] = {0}; - u32 out[MLX5_ST_SZ_DW(dealloc_transport_domain_out)] = {0}; + u32 in[MLX5_ST_SZ_DW(dealloc_transport_domain_in)] = {}; MLX5_SET(dealloc_transport_domain_in, in, opcode, MLX5_CMD_OP_DEALLOC_TRANSPORT_DOMAIN); MLX5_SET(dealloc_transport_domain_in, in, transport_domain, tdn); - mlx5_cmd_exec(dev, in, sizeof(in), out, sizeof(out)); + mlx5_cmd_exec_in(dev, dealloc_transport_domain, in); } EXPORT_SYMBOL(mlx5_core_dealloc_transport_domain); int mlx5_core_create_rq(struct mlx5_core_dev *dev, u32 *in, int inlen, u32 *rqn) { - u32 out[MLX5_ST_SZ_DW(create_rq_out)] = {0}; + u32 out[MLX5_ST_SZ_DW(create_rq_out)] = {}; int err; MLX5_SET(create_rq_in, in, opcode, MLX5_CMD_OP_CREATE_RQ); @@ -78,44 +77,39 @@ int mlx5_core_create_rq(struct mlx5_core_dev *dev, u32 *in, int inlen, u32 *rqn) } EXPORT_SYMBOL(mlx5_core_create_rq); -int mlx5_core_modify_rq(struct mlx5_core_dev *dev, u32 rqn, u32 *in, int inlen) +int mlx5_core_modify_rq(struct mlx5_core_dev *dev, u32 rqn, u32 *in) { - u32 out[MLX5_ST_SZ_DW(modify_rq_out)]; - MLX5_SET(modify_rq_in, in, rqn, rqn); MLX5_SET(modify_rq_in, in, opcode, MLX5_CMD_OP_MODIFY_RQ); - memset(out, 0, sizeof(out)); - return mlx5_cmd_exec(dev, in, inlen, out, sizeof(out)); + return mlx5_cmd_exec_in(dev, modify_rq, in); } EXPORT_SYMBOL(mlx5_core_modify_rq); void mlx5_core_destroy_rq(struct mlx5_core_dev *dev, u32 rqn) { - u32 in[MLX5_ST_SZ_DW(destroy_rq_in)] = {0}; - u32 out[MLX5_ST_SZ_DW(destroy_rq_out)] = {0}; + u32 in[MLX5_ST_SZ_DW(destroy_rq_in)] = {}; MLX5_SET(destroy_rq_in, in, opcode, MLX5_CMD_OP_DESTROY_RQ); MLX5_SET(destroy_rq_in, in, rqn, rqn); - mlx5_cmd_exec(dev, in, sizeof(in), out, sizeof(out)); + mlx5_cmd_exec_in(dev, destroy_rq, in); } EXPORT_SYMBOL(mlx5_core_destroy_rq); int mlx5_core_query_rq(struct mlx5_core_dev *dev, u32 rqn, u32 *out) { - u32 
in[MLX5_ST_SZ_DW(query_rq_in)] = {0}; - int outlen = MLX5_ST_SZ_BYTES(query_rq_out); + u32 in[MLX5_ST_SZ_DW(query_rq_in)] = {}; MLX5_SET(query_rq_in, in, opcode, MLX5_CMD_OP_QUERY_RQ); MLX5_SET(query_rq_in, in, rqn, rqn); - return mlx5_cmd_exec(dev, in, sizeof(in), out, outlen); + return mlx5_cmd_exec_inout(dev, query_rq, in, out); } EXPORT_SYMBOL(mlx5_core_query_rq); int mlx5_core_create_sq(struct mlx5_core_dev *dev, u32 *in, int inlen, u32 *sqn) { - u32 out[MLX5_ST_SZ_DW(create_sq_out)] = {0}; + u32 out[MLX5_ST_SZ_DW(create_sq_out)] = {}; int err; MLX5_SET(create_sq_in, in, opcode, MLX5_CMD_OP_CREATE_SQ); @@ -126,34 +120,30 @@ int mlx5_core_create_sq(struct mlx5_core_dev *dev, u32 *in, int inlen, u32 *sqn) return err; } -int mlx5_core_modify_sq(struct mlx5_core_dev *dev, u32 sqn, u32 *in, int inlen) +int mlx5_core_modify_sq(struct mlx5_core_dev *dev, u32 sqn, u32 *in) { - u32 out[MLX5_ST_SZ_DW(modify_sq_out)] = {0}; - MLX5_SET(modify_sq_in, in, sqn, sqn); MLX5_SET(modify_sq_in, in, opcode, MLX5_CMD_OP_MODIFY_SQ); - return mlx5_cmd_exec(dev, in, inlen, out, sizeof(out)); + return mlx5_cmd_exec_in(dev, modify_sq, in); } EXPORT_SYMBOL(mlx5_core_modify_sq); void mlx5_core_destroy_sq(struct mlx5_core_dev *dev, u32 sqn) { - u32 in[MLX5_ST_SZ_DW(destroy_sq_in)] = {0}; - u32 out[MLX5_ST_SZ_DW(destroy_sq_out)] = {0}; + u32 in[MLX5_ST_SZ_DW(destroy_sq_in)] = {}; MLX5_SET(destroy_sq_in, in, opcode, MLX5_CMD_OP_DESTROY_SQ); MLX5_SET(destroy_sq_in, in, sqn, sqn); - mlx5_cmd_exec(dev, in, sizeof(in), out, sizeof(out)); + mlx5_cmd_exec_in(dev, destroy_sq, in); } int mlx5_core_query_sq(struct mlx5_core_dev *dev, u32 sqn, u32 *out) { - u32 in[MLX5_ST_SZ_DW(query_sq_in)] = {0}; - int outlen = MLX5_ST_SZ_BYTES(query_sq_out); + u32 in[MLX5_ST_SZ_DW(query_sq_in)] = {}; MLX5_SET(query_sq_in, in, opcode, MLX5_CMD_OP_QUERY_SQ); MLX5_SET(query_sq_in, in, sqn, sqn); - return mlx5_cmd_exec(dev, in, sizeof(in), out, outlen); + return mlx5_cmd_exec_inout(dev, query_sq, in, out); } EXPORT_SYMBOL(mlx5_core_query_sq); @@ -182,24 +172,13 @@ out: } EXPORT_SYMBOL_GPL(mlx5_core_query_sq_state); -int mlx5_core_create_tir_out(struct mlx5_core_dev *dev, - u32 *in, int inlen, - u32 *out, int outlen) -{ - MLX5_SET(create_tir_in, in, opcode, MLX5_CMD_OP_CREATE_TIR); - - return mlx5_cmd_exec(dev, in, inlen, out, outlen); -} -EXPORT_SYMBOL(mlx5_core_create_tir_out); - -int mlx5_core_create_tir(struct mlx5_core_dev *dev, u32 *in, int inlen, - u32 *tirn) +int mlx5_core_create_tir(struct mlx5_core_dev *dev, u32 *in, u32 *tirn) { u32 out[MLX5_ST_SZ_DW(create_tir_out)] = {}; int err; - err = mlx5_core_create_tir_out(dev, in, inlen, - out, sizeof(out)); + MLX5_SET(create_tir_in, in, opcode, MLX5_CMD_OP_CREATE_TIR); + err = mlx5_cmd_exec_inout(dev, create_tir, in, out); if (!err) *tirn = MLX5_GET(create_tir_out, out, tirn); @@ -207,35 +186,30 @@ int mlx5_core_create_tir(struct mlx5_core_dev *dev, u32 *in, int inlen, } EXPORT_SYMBOL(mlx5_core_create_tir); -int mlx5_core_modify_tir(struct mlx5_core_dev *dev, u32 tirn, u32 *in, - int inlen) +int mlx5_core_modify_tir(struct mlx5_core_dev *dev, u32 tirn, u32 *in) { - u32 out[MLX5_ST_SZ_DW(modify_tir_out)] = {0}; - MLX5_SET(modify_tir_in, in, tirn, tirn); MLX5_SET(modify_tir_in, in, opcode, MLX5_CMD_OP_MODIFY_TIR); - return mlx5_cmd_exec(dev, in, inlen, out, sizeof(out)); + return mlx5_cmd_exec_in(dev, modify_tir, in); } void mlx5_core_destroy_tir(struct mlx5_core_dev *dev, u32 tirn) { - u32 in[MLX5_ST_SZ_DW(destroy_tir_in)] = {0}; - u32 out[MLX5_ST_SZ_DW(destroy_tir_out)] = {0}; + u32 
in[MLX5_ST_SZ_DW(destroy_tir_in)] = {}; MLX5_SET(destroy_tir_in, in, opcode, MLX5_CMD_OP_DESTROY_TIR); MLX5_SET(destroy_tir_in, in, tirn, tirn); - mlx5_cmd_exec(dev, in, sizeof(in), out, sizeof(out)); + mlx5_cmd_exec_in(dev, destroy_tir, in); } EXPORT_SYMBOL(mlx5_core_destroy_tir); -int mlx5_core_create_tis(struct mlx5_core_dev *dev, u32 *in, int inlen, - u32 *tisn) +int mlx5_core_create_tis(struct mlx5_core_dev *dev, u32 *in, u32 *tisn) { - u32 out[MLX5_ST_SZ_DW(create_tis_out)] = {0}; + u32 out[MLX5_ST_SZ_DW(create_tis_out)] = {}; int err; MLX5_SET(create_tis_in, in, opcode, MLX5_CMD_OP_CREATE_TIS); - err = mlx5_cmd_exec(dev, in, inlen, out, sizeof(out)); + err = mlx5_cmd_exec_inout(dev, create_tis, in, out); if (!err) *tisn = MLX5_GET(create_tis_out, out, tisn); @@ -243,33 +217,29 @@ int mlx5_core_create_tis(struct mlx5_core_dev *dev, u32 *in, int inlen, } EXPORT_SYMBOL(mlx5_core_create_tis); -int mlx5_core_modify_tis(struct mlx5_core_dev *dev, u32 tisn, u32 *in, - int inlen) +int mlx5_core_modify_tis(struct mlx5_core_dev *dev, u32 tisn, u32 *in) { - u32 out[MLX5_ST_SZ_DW(modify_tis_out)] = {0}; - MLX5_SET(modify_tis_in, in, tisn, tisn); MLX5_SET(modify_tis_in, in, opcode, MLX5_CMD_OP_MODIFY_TIS); - return mlx5_cmd_exec(dev, in, inlen, out, sizeof(out)); + return mlx5_cmd_exec_in(dev, modify_tis, in); } EXPORT_SYMBOL(mlx5_core_modify_tis); void mlx5_core_destroy_tis(struct mlx5_core_dev *dev, u32 tisn) { - u32 in[MLX5_ST_SZ_DW(destroy_tis_in)] = {0}; - u32 out[MLX5_ST_SZ_DW(destroy_tis_out)] = {0}; + u32 in[MLX5_ST_SZ_DW(destroy_tis_in)] = {}; MLX5_SET(destroy_tis_in, in, opcode, MLX5_CMD_OP_DESTROY_TIS); MLX5_SET(destroy_tis_in, in, tisn, tisn); - mlx5_cmd_exec(dev, in, sizeof(in), out, sizeof(out)); + mlx5_cmd_exec_in(dev, destroy_tis, in); } EXPORT_SYMBOL(mlx5_core_destroy_tis); int mlx5_core_create_rqt(struct mlx5_core_dev *dev, u32 *in, int inlen, u32 *rqtn) { - u32 out[MLX5_ST_SZ_DW(create_rqt_out)] = {0}; + u32 out[MLX5_ST_SZ_DW(create_rqt_out)] = {}; int err; MLX5_SET(create_rqt_in, in, opcode, MLX5_CMD_OP_CREATE_RQT); @@ -284,7 +254,7 @@ EXPORT_SYMBOL(mlx5_core_create_rqt); int mlx5_core_modify_rqt(struct mlx5_core_dev *dev, u32 rqtn, u32 *in, int inlen) { - u32 out[MLX5_ST_SZ_DW(modify_rqt_out)] = {0}; + u32 out[MLX5_ST_SZ_DW(modify_rqt_out)] = {}; MLX5_SET(modify_rqt_in, in, rqtn, rqtn); MLX5_SET(modify_rqt_in, in, opcode, MLX5_CMD_OP_MODIFY_RQT); @@ -293,12 +263,11 @@ int mlx5_core_modify_rqt(struct mlx5_core_dev *dev, u32 rqtn, u32 *in, void mlx5_core_destroy_rqt(struct mlx5_core_dev *dev, u32 rqtn) { - u32 in[MLX5_ST_SZ_DW(destroy_rqt_in)] = {0}; - u32 out[MLX5_ST_SZ_DW(destroy_rqt_out)] = {0}; + u32 in[MLX5_ST_SZ_DW(destroy_rqt_in)] = {}; MLX5_SET(destroy_rqt_in, in, opcode, MLX5_CMD_OP_DESTROY_RQT); MLX5_SET(destroy_rqt_in, in, rqtn, rqtn); - mlx5_cmd_exec(dev, in, sizeof(in), out, sizeof(out)); + mlx5_cmd_exec_in(dev, destroy_rqt, in); } EXPORT_SYMBOL(mlx5_core_destroy_rqt); @@ -383,7 +352,7 @@ static int mlx5_hairpin_modify_rq(struct mlx5_core_dev *func_mdev, u32 rqn, int curr_state, int next_state, u16 peer_vhca, u32 peer_sq) { - u32 in[MLX5_ST_SZ_DW(modify_rq_in)] = {0}; + u32 in[MLX5_ST_SZ_DW(modify_rq_in)] = {}; void *rqc; rqc = MLX5_ADDR_OF(modify_rq_in, in, ctx); @@ -396,8 +365,7 @@ static int mlx5_hairpin_modify_rq(struct mlx5_core_dev *func_mdev, u32 rqn, MLX5_SET(modify_rq_in, in, rq_state, curr_state); MLX5_SET(rqc, rqc, state, next_state); - return mlx5_core_modify_rq(func_mdev, rqn, - in, MLX5_ST_SZ_BYTES(modify_rq_in)); + return 
mlx5_core_modify_rq(func_mdev, rqn, in); } static int mlx5_hairpin_modify_sq(struct mlx5_core_dev *peer_mdev, u32 sqn, @@ -417,8 +385,7 @@ static int mlx5_hairpin_modify_sq(struct mlx5_core_dev *peer_mdev, u32 sqn, MLX5_SET(modify_sq_in, in, sq_state, curr_state); MLX5_SET(sqc, sqc, state, next_state); - return mlx5_core_modify_sq(peer_mdev, sqn, - in, MLX5_ST_SZ_BYTES(modify_sq_in)); + return mlx5_core_modify_sq(peer_mdev, sqn, in); } static int mlx5_hairpin_pair_queues(struct mlx5_hairpin *hp) diff --git a/include/linux/mlx5/transobj.h b/include/linux/mlx5/transobj.h index dc6b1e7cb8c4..028f442530cf 100644 --- a/include/linux/mlx5/transobj.h +++ b/include/linux/mlx5/transobj.h @@ -39,27 +39,20 @@ int mlx5_core_alloc_transport_domain(struct mlx5_core_dev *dev, u32 *tdn); void mlx5_core_dealloc_transport_domain(struct mlx5_core_dev *dev, u32 tdn); int mlx5_core_create_rq(struct mlx5_core_dev *dev, u32 *in, int inlen, u32 *rqn); -int mlx5_core_modify_rq(struct mlx5_core_dev *dev, u32 rqn, u32 *in, int inlen); +int mlx5_core_modify_rq(struct mlx5_core_dev *dev, u32 rqn, u32 *in); void mlx5_core_destroy_rq(struct mlx5_core_dev *dev, u32 rqn); int mlx5_core_query_rq(struct mlx5_core_dev *dev, u32 rqn, u32 *out); int mlx5_core_create_sq(struct mlx5_core_dev *dev, u32 *in, int inlen, u32 *sqn); -int mlx5_core_modify_sq(struct mlx5_core_dev *dev, u32 sqn, u32 *in, int inlen); +int mlx5_core_modify_sq(struct mlx5_core_dev *dev, u32 sqn, u32 *in); void mlx5_core_destroy_sq(struct mlx5_core_dev *dev, u32 sqn); int mlx5_core_query_sq(struct mlx5_core_dev *dev, u32 sqn, u32 *out); int mlx5_core_query_sq_state(struct mlx5_core_dev *dev, u32 sqn, u8 *state); -int mlx5_core_create_tir(struct mlx5_core_dev *dev, u32 *in, int inlen, - u32 *tirn); -int mlx5_core_create_tir_out(struct mlx5_core_dev *dev, - u32 *in, int inlen, - u32 *out, int outlen); -int mlx5_core_modify_tir(struct mlx5_core_dev *dev, u32 tirn, u32 *in, - int inlen); +int mlx5_core_create_tir(struct mlx5_core_dev *dev, u32 *in, u32 *tirn); +int mlx5_core_modify_tir(struct mlx5_core_dev *dev, u32 tirn, u32 *in); void mlx5_core_destroy_tir(struct mlx5_core_dev *dev, u32 tirn); -int mlx5_core_create_tis(struct mlx5_core_dev *dev, u32 *in, int inlen, - u32 *tisn); -int mlx5_core_modify_tis(struct mlx5_core_dev *dev, u32 tisn, u32 *in, - int inlen); +int mlx5_core_create_tis(struct mlx5_core_dev *dev, u32 *in, u32 *tisn); +int mlx5_core_modify_tis(struct mlx5_core_dev *dev, u32 tisn, u32 *in); void mlx5_core_destroy_tis(struct mlx5_core_dev *dev, u32 tisn); int mlx5_core_create_rqt(struct mlx5_core_dev *dev, u32 *in, int inlen, u32 *rqtn); -- cgit v1.2.3 From 6f8b12d661d09b488b9ac879b8eafbd2cc4a1450 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Wed, 22 Apr 2020 09:13:27 -0700 Subject: net: napi: add hard irqs deferral feature Back in commit 3b47d30396ba ("net: gro: add a per device gro flush timer") we added the ability to arm one high-resolution timer that we used to keep not-yet-complete packets in the GRO engine a bit longer, hoping that further frames might be added to them. Since then, we added the napi_complete_done() interface, and commit 364b6055738b ("net: busy-poll: return busypolling status to drivers") allowed drivers to avoid re-arming NIC interrupts if we made a promise that their NAPI poll() handler would be called in the near future. This infrastructure can be leveraged, thanks to a new device parameter, which allows arming the napi hrtimer instead of re-arming the device hard IRQ.
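As a sketch of the driver-side contract (the mydrv_* helpers are hypothetical; napi_complete_done() and the deferral logic are what this patch touches), a NAPI driver re-arms its hard IRQ only when napi_complete_done() returns true, so a false return is what keeps the IRQ masked while the hrtimer drives further polls:

    static int mydrv_napi_poll(struct napi_struct *napi, int budget)
    {
            int work_done = mydrv_clean_rx_irq(napi, budget); /* hypothetical */

            /* When napi_complete_done() returns false, skip re-arming the
             * hard IRQ; the deferral hrtimer will reschedule this poll.
             */
            if (work_done < budget && napi_complete_done(napi, work_done))
                    mydrv_rearm_rx_irq(napi); /* hypothetical IRQ unmask */

            return work_done;
    }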
We have noticed that on some servers with 32 RX queues or more, the chit-chat between the NIC and the host caused by IRQ delivery and re-arming could hurt throughput by ~20% on a 100Gbit NIC. In contrast, hrtimers use local (per-cpu) resources and might have lower cost. The new tunable, named napi_defer_hard_irqs, is placed in the same hierarchy as gro_flush_timeout (/sys/class/net/ethX/). By default, both gro_flush_timeout and napi_defer_hard_irqs are zero. This patch does not change the prior behavior of gro_flush_timeout if used alone: NIC hard irqs should be rearmed as before. One concrete usage can be: echo 20000 >/sys/class/net/eth1/gro_flush_timeout echo 10 >/sys/class/net/eth1/napi_defer_hard_irqs If at least one packet is retired, then we will reset the napi counter to 10 (napi_defer_hard_irqs), ensuring at least 10 periodic scans of the queue. On busy queues, this should avoid NIC hard IRQs, while before this patch IRQ avoidance was only possible if napi->poll() was exhausting its budget and not calling napi_complete_done(). This feature can also be used to work around some non-optimal NIC irq coalescing strategies. Having the ability to insert XX usec delays between each napi->poll() can increase cache efficiency, since we increase batch sizes. It also keeps the serving CPUs from idling for too long, reducing tail latencies. Co-developed-by: Luigi Rizzo Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- include/linux/netdevice.h | 2 ++ net/core/dev.c | 29 ++++++++++++++++++----------- net/core/net-sysfs.c | 18 ++++++++++++++++++ 3 files changed, 38 insertions(+), 11 deletions(-) (limited to 'include/linux') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 0750b54b3765..5a8d40f1ffe2 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -329,6 +329,7 @@ struct napi_struct { unsigned long state; int weight; + int defer_hard_irqs_count; unsigned long gro_bitmask; int (*poll)(struct napi_struct *, int); #ifdef CONFIG_NETPOLL @@ -1995,6 +1996,7 @@ struct net_device { struct bpf_prog __rcu *xdp_prog; unsigned long gro_flush_timeout; + int napi_defer_hard_irqs; rx_handler_func_t __rcu *rx_handler; void __rcu *rx_handler_data; diff --git a/net/core/dev.c b/net/core/dev.c index fb61522b1ce1..67585484ad32 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -6227,7 +6227,8 @@ EXPORT_SYMBOL(__napi_schedule_irqoff); bool napi_complete_done(struct napi_struct *n, int work_done) { - unsigned long flags, val, new; + unsigned long flags, val, new, timeout = 0; + bool ret = true; /* * 1) Don't let napi dequeue from the cpu poll list @@ -6239,20 +6240,23 @@ bool napi_complete_done(struct napi_struct *n, int work_done) NAPIF_STATE_IN_BUSY_POLL))) return false; - if (n->gro_bitmask) { - unsigned long timeout = 0; - - if (work_done) + if (work_done) { + if (n->gro_bitmask) timeout = n->dev->gro_flush_timeout; - + n->defer_hard_irqs_count = n->dev->napi_defer_hard_irqs; + } + if (n->defer_hard_irqs_count > 0) { + n->defer_hard_irqs_count--; + timeout = n->dev->gro_flush_timeout; + if (timeout) + ret = false; + } + if (n->gro_bitmask) { /* When the NAPI instance uses a timeout and keeps postponing * it, we need to bound somehow the time packets are kept in * the GRO layer */ napi_gro_flush(n, !!timeout); - if (timeout) - hrtimer_start(&n->timer, ns_to_ktime(timeout), - HRTIMER_MODE_REL_PINNED); } gro_normal_list(n); @@ -6284,7 +6288,10 @@ bool napi_complete_done(struct napi_struct *n, int work_done) return false; } - return true; + if (timeout) +
hrtimer_start(&n->timer, ns_to_ktime(timeout), + HRTIMER_MODE_REL_PINNED); + return ret; } EXPORT_SYMBOL(napi_complete_done); @@ -6464,7 +6471,7 @@ static enum hrtimer_restart napi_watchdog(struct hrtimer *timer) /* Note : we use a relaxed variant of napi_schedule_prep() not setting * NAPI_STATE_MISSED, since we do not react to a device IRQ. */ - if (napi->gro_bitmask && !napi_disable_pending(napi) && + if (!napi_disable_pending(napi) && !test_and_set_bit(NAPI_STATE_SCHED, &napi->state)) __napi_schedule_irqoff(napi); diff --git a/net/core/net-sysfs.c b/net/core/net-sysfs.c index 0d9e46de205e..f3b650cd0923 100644 --- a/net/core/net-sysfs.c +++ b/net/core/net-sysfs.c @@ -382,6 +382,23 @@ static ssize_t gro_flush_timeout_store(struct device *dev, } NETDEVICE_SHOW_RW(gro_flush_timeout, fmt_ulong); +static int change_napi_defer_hard_irqs(struct net_device *dev, unsigned long val) +{ + dev->napi_defer_hard_irqs = val; + return 0; +} + +static ssize_t napi_defer_hard_irqs_store(struct device *dev, + struct device_attribute *attr, + const char *buf, size_t len) +{ + if (!capable(CAP_NET_ADMIN)) + return -EPERM; + + return netdev_store(dev, attr, buf, len, change_napi_defer_hard_irqs); +} +NETDEVICE_SHOW_RW(napi_defer_hard_irqs, fmt_dec); + static ssize_t ifalias_store(struct device *dev, struct device_attribute *attr, const char *buf, size_t len) { @@ -545,6 +562,7 @@ static struct attribute *net_class_attrs[] __ro_after_init = { &dev_attr_flags.attr, &dev_attr_tx_queue_len.attr, &dev_attr_gro_flush_timeout.attr, + &dev_attr_napi_defer_hard_irqs.attr, &dev_attr_phys_port_id.attr, &dev_attr_phys_port_name.attr, &dev_attr_phys_switch_id.attr, -- cgit v1.2.3 From e7d3c33c58e0108e38e969239cd3b3678a5d8ac5 Mon Sep 17 00:00:00 2001 From: Mika Westerberg Date: Thu, 16 Apr 2020 11:15:42 +0300 Subject: mfd: intel_soc_pmic: Add SCU IPC member to struct intel_soc_pmic Both PMIC drivers (intel_soc_pmic_mrfld and intel_soc_pmic_bxtwc) will be using this field going forward to access the SCU IPC instance. While there add kernel-doc for the intel_soc_pmic structure. 
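A minimal sketch of the intended use (the child-driver name and the parent-drvdata lookup are assumptions based on how Intel PMIC cell drivers are conventionally wired; intel_scu_ipc_dev_simple_command() is the same helper used by the PMC driver later in this series):

    static int example_cell_probe(struct platform_device *pdev)
    {
            /* Assumption: the PMIC MFD core stores its drvdata in the
             * parent device, as the Intel SoC PMIC drivers do.
             */
            struct intel_soc_pmic *pmic = dev_get_drvdata(pdev->dev.parent);

            /* 0x42 and 1 are placeholder command/subcommand values */
            return intel_scu_ipc_dev_simple_command(pmic->scu, 0x42, 1);
    }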
Signed-off-by: Mika Westerberg Reviewed-by: Andy Shevchenko Signed-off-by: Lee Jones --- include/linux/mfd/intel_soc_pmic.h | 15 +++++++++++++++ 1 file changed, 15 insertions(+) (limited to 'include/linux') diff --git a/include/linux/mfd/intel_soc_pmic.h b/include/linux/mfd/intel_soc_pmic.h index bfecd6bd4990..6a88e34cb955 100644 --- a/include/linux/mfd/intel_soc_pmic.h +++ b/include/linux/mfd/intel_soc_pmic.h @@ -13,6 +13,20 @@ #include +/** + * struct intel_soc_pmic - Intel SoC PMIC data + * @irq: Master interrupt number of the parent PMIC device + * @regmap: Pointer to the parent PMIC device regmap structure + * @irq_chip_data: IRQ chip data for the PMIC itself + * @irq_chip_data_pwrbtn: Chained IRQ chip data for the Power Button + * @irq_chip_data_tmu: Chained IRQ chip data for the Time Management Unit + * @irq_chip_data_bcu: Chained IRQ chip data for the Burst Control Unit + * @irq_chip_data_adc: Chained IRQ chip data for the General Purpose ADC + * @irq_chip_data_chgr: Chained IRQ chip data for the External Charger + * @irq_chip_data_crit: Chained IRQ chip data for the Critical Event Handler + * @dev: Pointer to the parent PMIC device + * @scu: Pointer to the SCU IPC device data structure + */ struct intel_soc_pmic { int irq; struct regmap *regmap; @@ -24,6 +38,7 @@ struct intel_soc_pmic { struct regmap_irq_chip_data *irq_chip_data_chgr; struct regmap_irq_chip_data *irq_chip_data_crit; struct device *dev; + struct intel_scu_ipc_dev *scu; }; int intel_soc_pmic_exec_mipi_pmic_seq_element(u16 i2c_address, u32 reg_address, -- cgit v1.2.3 From 25f1ca31e230598eaf3c38d387a355a64bd772a7 Mon Sep 17 00:00:00 2001 From: Mika Westerberg Date: Thu, 16 Apr 2020 11:15:51 +0300 Subject: platform/x86: intel_pmc_ipc: Convert to MFD This driver only creates a bunch of platform devices sharing resources belonging to the PMC device. This is pretty much what the MFD subsystem is for, so move the driver there, renaming it to intel_pmc_bxt.c, which makes it clearer what it is. The MFD subsystem provides nice helper APIs for subdevice creation, so convert the driver to use those. Unfortunately, the ACPI device includes separate resources for most of the subdevices, so we cannot simply call mfd_add_devices() once to create all of them; instead we need to call it separately for each device. The new MFD driver continues to expose two sysfs attributes that allow userspace to send IPC commands to the PMC/SCU to avoid breaking any existing applications that may use these. Generally this is a bad idea, so document this in the ABI documentation.
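To illustrate the resulting shape (condensed from the driver added below; most fields and error handling trimmed), each PMC function becomes an mfd_cell registered with its own slice of the ACPI-provided resources, one devm_mfd_add_devices() call per function:

    static const struct mfd_cell example_tco = { /* condensed from 'tco' below */
            .name = "iTCO_wdt",
            .ignore_resource_conflicts = true,
            .resources = tco_res,
            .num_resources = ARRAY_SIZE(tco_res),
    };

    static int example_register_tco(struct intel_pmc_dev *pmc)
    {
            /* Separate calls per cell, because the cells do not share
             * one contiguous resource block.
             */
            return devm_mfd_add_devices(pmc->dev, PLATFORM_DEVID_AUTO,
                                        &example_tco, 1, NULL, 0, NULL);
    }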
Signed-off-by: Mika Westerberg Reviewed-by: Andy Shevchenko Signed-off-by: Lee Jones --- .../ABI/obsolete/sysfs-driver-intel_pmc_bxt | 22 + arch/x86/include/asm/intel_pmc_ipc.h | 47 -- arch/x86/include/asm/intel_telemetry.h | 1 + drivers/mfd/Kconfig | 16 +- drivers/mfd/Makefile | 1 + drivers/mfd/intel_pmc_bxt.c | 468 +++++++++++++++ drivers/platform/x86/Kconfig | 16 +- drivers/platform/x86/Makefile | 1 - drivers/platform/x86/intel_pmc_ipc.c | 645 --------------------- drivers/platform/x86/intel_telemetry_debugfs.c | 12 +- drivers/platform/x86/intel_telemetry_pltdrv.c | 2 + drivers/usb/typec/tcpm/Kconfig | 2 +- drivers/watchdog/iTCO_wdt.c | 25 +- include/linux/mfd/intel_pmc_bxt.h | 53 ++ include/linux/platform_data/itco_wdt.h | 11 +- 15 files changed, 602 insertions(+), 720 deletions(-) create mode 100644 Documentation/ABI/obsolete/sysfs-driver-intel_pmc_bxt delete mode 100644 arch/x86/include/asm/intel_pmc_ipc.h create mode 100644 drivers/mfd/intel_pmc_bxt.c delete mode 100644 drivers/platform/x86/intel_pmc_ipc.c create mode 100644 include/linux/mfd/intel_pmc_bxt.h (limited to 'include/linux') diff --git a/Documentation/ABI/obsolete/sysfs-driver-intel_pmc_bxt b/Documentation/ABI/obsolete/sysfs-driver-intel_pmc_bxt new file mode 100644 index 000000000000..39d5659f388b --- /dev/null +++ b/Documentation/ABI/obsolete/sysfs-driver-intel_pmc_bxt @@ -0,0 +1,22 @@ +These files allow sending arbitrary IPC commands to the PMC/SCU which +may be dangerous. These will be removed eventually and should not be +used in any new applications. + +What: /sys/bus/platform/devices/INT34D2:00/simplecmd +Date: Jun 2015 +KernelVersion: 4.1 +Contact: Mika Westerberg +Description: This interface allows userspace to send an arbitrary + IPC command to the PMC/SCU. + + Format: %d %d where first number is command and + second number is subcommand. + +What: /sys/bus/platform/devices/INT34D2:00/northpeak +Date: Jun 2015 +KernelVersion: 4.1 +Contact: Mika Westerberg +Description: This interface allows userspace to enable and disable + Northpeak through the PMC/SCU. + + Format: %u. 
diff --git a/arch/x86/include/asm/intel_pmc_ipc.h b/arch/x86/include/asm/intel_pmc_ipc.h deleted file mode 100644 index 22848df5faaf..000000000000 --- a/arch/x86/include/asm/intel_pmc_ipc.h +++ /dev/null @@ -1,47 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -#ifndef _ASM_X86_INTEL_PMC_IPC_H_ -#define _ASM_X86_INTEL_PMC_IPC_H_ - -/* Commands */ -#define PMC_IPC_USB_PWR_CTRL 0xF0 -#define PMC_IPC_PMIC_BLACKLIST_SEL 0xEF -#define PMC_IPC_PHY_CONFIG 0xEE -#define PMC_IPC_NORTHPEAK_CTRL 0xED -#define PMC_IPC_PM_DEBUG 0xEC -#define PMC_IPC_PMC_FW_MSG_CTRL 0xEA - -/* IPC return code */ -#define IPC_ERR_NONE 0 -#define IPC_ERR_CMD_NOT_SUPPORTED 1 -#define IPC_ERR_CMD_NOT_SERVICED 2 -#define IPC_ERR_UNABLE_TO_SERVICE 3 -#define IPC_ERR_CMD_INVALID 4 -#define IPC_ERR_CMD_FAILED 5 -#define IPC_ERR_EMSECURITY 6 -#define IPC_ERR_UNSIGNEDKERNEL 7 - -/* GCR reg offsets from gcr base*/ -#define PMC_GCR_PMC_CFG_REG 0x08 -#define PMC_GCR_TELEM_DEEP_S0IX_REG 0x78 -#define PMC_GCR_TELEM_SHLW_S0IX_REG 0x80 - -#if IS_ENABLED(CONFIG_INTEL_PMC_IPC) - -int intel_pmc_s0ix_counter_read(u64 *data); -int intel_pmc_gcr_read64(u32 offset, u64 *data); - -#else - -static inline int intel_pmc_s0ix_counter_read(u64 *data) -{ - return -EINVAL; -} - -static inline int intel_pmc_gcr_read64(u32 offset, u64 *data) -{ - return -EINVAL; -} - -#endif /*CONFIG_INTEL_PMC_IPC*/ - -#endif diff --git a/arch/x86/include/asm/intel_telemetry.h b/arch/x86/include/asm/intel_telemetry.h index 2c0e7d7a10e9..8046e70dfd7c 100644 --- a/arch/x86/include/asm/intel_telemetry.h +++ b/arch/x86/include/asm/intel_telemetry.h @@ -53,6 +53,7 @@ struct telemetry_plt_config { struct telemetry_unit_config ioss_config; struct mutex telem_trace_lock; struct mutex telem_lock; + struct intel_pmc_dev *pmc; struct intel_scu_ipc_dev *scu; bool telem_in_use; }; diff --git a/drivers/mfd/Kconfig b/drivers/mfd/Kconfig index 5e21a78b6923..54b6aa4aaab1 100644 --- a/drivers/mfd/Kconfig +++ b/drivers/mfd/Kconfig @@ -551,7 +551,7 @@ config INTEL_SOC_PMIC config INTEL_SOC_PMIC_BXTWC tristate "Support for Intel Broxton Whiskey Cove PMIC" - depends on INTEL_PMC_IPC + depends on MFD_INTEL_PMC_BXT select MFD_CORE select REGMAP_IRQ help @@ -632,6 +632,20 @@ config MFD_INTEL_MSIC Passage) chip. This chip embeds audio, battery, GPIO, etc. devices used in Intel Medfield platforms. +config MFD_INTEL_PMC_BXT + tristate "Intel PMC Driver for Broxton" + depends on X86 + depends on X86_PLATFORM_DEVICES + depends on ACPI + select INTEL_SCU_IPC + select MFD_CORE + help + This driver provides support for the PMC (Power Management + Controller) on Intel Broxton and Apollo Lake. The PMC is a + multi-function device that exposes IPC, General Control + Register and P-unit access. In addition this creates devices + for iTCO watchdog and telemetry that are part of the PMC. 
+ config MFD_IPAQ_MICRO bool "Atmel Micro ASIC (iPAQ h3100/h3600/h3700) Support" depends on SA1100_H3100 || SA1100_H3600 diff --git a/drivers/mfd/Makefile b/drivers/mfd/Makefile index f935d10cbf0f..7761f84a96eb 100644 --- a/drivers/mfd/Makefile +++ b/drivers/mfd/Makefile @@ -212,6 +212,7 @@ obj-$(CONFIG_MFD_INTEL_LPSS) += intel-lpss.o obj-$(CONFIG_MFD_INTEL_LPSS_PCI) += intel-lpss-pci.o obj-$(CONFIG_MFD_INTEL_LPSS_ACPI) += intel-lpss-acpi.o obj-$(CONFIG_MFD_INTEL_MSIC) += intel_msic.o +obj-$(CONFIG_MFD_INTEL_PMC_BXT) += intel_pmc_bxt.o obj-$(CONFIG_MFD_PALMAS) += palmas.o obj-$(CONFIG_MFD_VIPERBOARD) += viperboard.o obj-$(CONFIG_MFD_RC5T583) += rc5t583.o rc5t583-irq.o diff --git a/drivers/mfd/intel_pmc_bxt.c b/drivers/mfd/intel_pmc_bxt.c new file mode 100644 index 000000000000..9f01d38acc7f --- /dev/null +++ b/drivers/mfd/intel_pmc_bxt.c @@ -0,0 +1,468 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Driver for the Intel Broxton PMC + * + * (C) Copyright 2014 - 2020 Intel Corporation + * + * This driver is based on Intel SCU IPC driver (intel_scu_ipc.c) by + * Sreedhara DS + * + * The PMC (Power Management Controller) running on the ARC processor + * communicates with another entity running in the IA (Intel Architecture) + * core through an IPC (Intel Processor Communications) mechanism which in + * turn sends messages between the IA and the PMC. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +/* Residency with clock rate at 19.2MHz to usecs */ +#define S0IX_RESIDENCY_IN_USECS(d, s) \ +({ \ + u64 result = 10ull * ((d) + (s)); \ + do_div(result, 192); \ + result; \ +}) + +/* Resources exported from IFWI */ +#define PLAT_RESOURCE_IPC_INDEX 0 +#define PLAT_RESOURCE_IPC_SIZE 0x1000 +#define PLAT_RESOURCE_GCR_OFFSET 0x1000 +#define PLAT_RESOURCE_GCR_SIZE 0x1000 +#define PLAT_RESOURCE_BIOS_DATA_INDEX 1 +#define PLAT_RESOURCE_BIOS_IFACE_INDEX 2 +#define PLAT_RESOURCE_TELEM_SSRAM_INDEX 3 +#define PLAT_RESOURCE_ISP_DATA_INDEX 4 +#define PLAT_RESOURCE_ISP_IFACE_INDEX 5 +#define PLAT_RESOURCE_GTD_DATA_INDEX 6 +#define PLAT_RESOURCE_GTD_IFACE_INDEX 7 +#define PLAT_RESOURCE_ACPI_IO_INDEX 0 + +/* + * BIOS does not create an ACPI device for each PMC function, but + * exports multiple resources from one ACPI device (IPC) for multiple + * functions. This driver is responsible for creating a child device and + * to export resources for those functions. + */ +#define SMI_EN_OFFSET 0x0040 +#define SMI_EN_SIZE 4 +#define TCO_BASE_OFFSET 0x0060 +#define TCO_REGS_SIZE 16 +#define TELEM_SSRAM_SIZE 240 +#define TELEM_PMC_SSRAM_OFFSET 0x1b00 +#define TELEM_PUNIT_SSRAM_OFFSET 0x1a00 + +/* Commands */ +#define PMC_NORTHPEAK_CTRL 0xed + +static inline bool is_gcr_valid(u32 offset) +{ + return offset < PLAT_RESOURCE_GCR_SIZE - 8; +} + +/** + * intel_pmc_gcr_read64() - Read a 64-bit PMC GCR register + * @pmc: PMC device pointer + * @offset: offset of GCR register from GCR address base + * @data: data pointer for storing the register output + * + * Reads the 64-bit PMC GCR register at given offset. + * + * Return: Negative value on error or 0 on success. 
+ */ +int intel_pmc_gcr_read64(struct intel_pmc_dev *pmc, u32 offset, u64 *data) +{ + if (!is_gcr_valid(offset)) + return -EINVAL; + + spin_lock(&pmc->gcr_lock); + *data = readq(pmc->gcr_mem_base + offset); + spin_unlock(&pmc->gcr_lock); + + return 0; +} +EXPORT_SYMBOL_GPL(intel_pmc_gcr_read64); + +/** + * intel_pmc_gcr_update() - Update PMC GCR register bits + * @pmc: PMC device pointer + * @offset: offset of GCR register from GCR address base + * @mask: bit mask for update operation + * @val: update value + * + * Updates the bits of given GCR register as specified by + * @mask and @val. + * + * Return: Negative value on error or 0 on success. + */ +int intel_pmc_gcr_update(struct intel_pmc_dev *pmc, u32 offset, u32 mask, u32 val) +{ + u32 new_val; + + if (!is_gcr_valid(offset)) + return -EINVAL; + + spin_lock(&pmc->gcr_lock); + new_val = readl(pmc->gcr_mem_base + offset); + + new_val = (new_val & ~mask) | (val & mask); + writel(new_val, pmc->gcr_mem_base + offset); + + new_val = readl(pmc->gcr_mem_base + offset); + spin_unlock(&pmc->gcr_lock); + + /* Check whether the bit update is successful */ + return (new_val & mask) != (val & mask) ? -EIO : 0; +} +EXPORT_SYMBOL_GPL(intel_pmc_gcr_update); + +/** + * intel_pmc_s0ix_counter_read() - Read S0ix residency + * @pmc: PMC device pointer + * @data: Out param that contains current S0ix residency count. + * + * Writes to @data how many usecs the system has been in low-power S0ix + * state. + * + * Return: An error code or 0 on success. + */ +int intel_pmc_s0ix_counter_read(struct intel_pmc_dev *pmc, u64 *data) +{ + u64 deep, shlw; + + spin_lock(&pmc->gcr_lock); + deep = readq(pmc->gcr_mem_base + PMC_GCR_TELEM_DEEP_S0IX_REG); + shlw = readq(pmc->gcr_mem_base + PMC_GCR_TELEM_SHLW_S0IX_REG); + spin_unlock(&pmc->gcr_lock); + + *data = S0IX_RESIDENCY_IN_USECS(deep, shlw); + return 0; +} +EXPORT_SYMBOL_GPL(intel_pmc_s0ix_counter_read); + +/** + * simplecmd_store() - Send a simple IPC command + * @dev: Device the attribute belongs to + * @attr: Attribute in question + * @buf: Buffer holding data to be stored to the attribute + * @count: Number of bytes in @buf + * + * Expects a string with two integers separated by a space. These two + * values hold the command and subcommand that are sent to the PMC. + * + * Return: Number of bytes written (@count) or negative errno in + * case of error. + */ +static ssize_t simplecmd_store(struct device *dev, struct device_attribute *attr, + const char *buf, size_t count) +{ + struct intel_pmc_dev *pmc = dev_get_drvdata(dev); + struct intel_scu_ipc_dev *scu = pmc->scu; + int subcmd; + int cmd; + int ret; + + ret = sscanf(buf, "%d %d", &cmd, &subcmd); + if (ret != 2) { + dev_err(dev, "Invalid values, expected: cmd subcmd\n"); + return -EINVAL; + } + + ret = intel_scu_ipc_dev_simple_command(scu, cmd, subcmd); + if (ret) + return ret; + + return count; +} +static DEVICE_ATTR_WO(simplecmd); + +/** + * northpeak_store() - Enable or disable Northpeak + * @dev: Device the attribute belongs to + * @attr: Attribute in question + * @buf: Buffer holding data to be stored to the attribute + * @count: Number of bytes in @buf + * + * Expects an unsigned integer. Non-zero enables Northpeak and zero + * disables it. + * + * Return: Number of bytes written (@count) or negative errno in + * case of error.
+ */ +static ssize_t northpeak_store(struct device *dev, struct device_attribute *attr, + const char *buf, size_t count) +{ + struct intel_pmc_dev *pmc = dev_get_drvdata(dev); + struct intel_scu_ipc_dev *scu = pmc->scu; + unsigned long val; + int subcmd; + int ret; + + ret = kstrtoul(buf, 0, &val); + if (ret) + return ret; + + /* Northpeak is enabled if subcmd == 1 and disabled if it is 0 */ + if (val) + subcmd = 1; + else + subcmd = 0; + + ret = intel_scu_ipc_dev_simple_command(scu, PMC_NORTHPEAK_CTRL, subcmd); + if (ret) + return ret; + + return count; +} +static DEVICE_ATTR_WO(northpeak); + +static struct attribute *intel_pmc_attrs[] = { + &dev_attr_northpeak.attr, + &dev_attr_simplecmd.attr, + NULL +}; + +static const struct attribute_group intel_pmc_group = { + .attrs = intel_pmc_attrs, +}; + +static const struct attribute_group *intel_pmc_groups[] = { + &intel_pmc_group, + NULL +}; + +static struct resource punit_res[6]; + +static struct mfd_cell punit = { + .name = "intel_punit_ipc", + .resources = punit_res, +}; + +static struct itco_wdt_platform_data tco_pdata = { + .name = "Apollo Lake SoC", + .version = 5, + .no_reboot_use_pmc = true, +}; + +static struct resource tco_res[2]; + +static const struct mfd_cell tco = { + .name = "iTCO_wdt", + .ignore_resource_conflicts = true, + .resources = tco_res, + .num_resources = ARRAY_SIZE(tco_res), + .platform_data = &tco_pdata, + .pdata_size = sizeof(tco_pdata), +}; + +static const struct resource telem_res[] = { + DEFINE_RES_MEM(TELEM_PUNIT_SSRAM_OFFSET, TELEM_SSRAM_SIZE), + DEFINE_RES_MEM(TELEM_PMC_SSRAM_OFFSET, TELEM_SSRAM_SIZE), +}; + +static const struct mfd_cell telem = { + .name = "intel_telemetry", + .resources = telem_res, + .num_resources = ARRAY_SIZE(telem_res), +}; + +static int intel_pmc_get_tco_resources(struct platform_device *pdev) +{ + struct resource *res; + + if (acpi_has_watchdog()) + return 0; + + res = platform_get_resource(pdev, IORESOURCE_IO, + PLAT_RESOURCE_ACPI_IO_INDEX); + if (!res) { + dev_err(&pdev->dev, "Failed to get IO resource\n"); + return -EINVAL; + } + + tco_res[0].flags = IORESOURCE_IO; + tco_res[0].start = res->start + TCO_BASE_OFFSET; + tco_res[0].end = tco_res[0].start + TCO_REGS_SIZE - 1; + tco_res[1].flags = IORESOURCE_IO; + tco_res[1].start = res->start + SMI_EN_OFFSET; + tco_res[1].end = tco_res[1].start + SMI_EN_SIZE - 1; + + return 0; +} + +static int intel_pmc_get_resources(struct platform_device *pdev, + struct intel_pmc_dev *pmc, + struct intel_scu_ipc_data *scu_data) +{ + struct resource gcr_res; + size_t npunit_res = 0; + struct resource *res; + int ret; + + scu_data->irq = platform_get_irq_optional(pdev, 0); + + res = platform_get_resource(pdev, IORESOURCE_MEM, + PLAT_RESOURCE_IPC_INDEX); + if (!res) { + dev_err(&pdev->dev, "Failed to get IPC resource\n"); + return -EINVAL; + } + + /* IPC registers */ + scu_data->mem.flags = res->flags; + scu_data->mem.start = res->start; + scu_data->mem.end = res->start + PLAT_RESOURCE_IPC_SIZE - 1; + + /* GCR registers */ + gcr_res.flags = res->flags; + gcr_res.start = res->start + PLAT_RESOURCE_GCR_OFFSET; + gcr_res.end = gcr_res.start + PLAT_RESOURCE_GCR_SIZE - 1; + + pmc->gcr_mem_base = devm_ioremap_resource(&pdev->dev, &gcr_res); + if (IS_ERR(pmc->gcr_mem_base)) + return PTR_ERR(pmc->gcr_mem_base); + + /* Only register iTCO watchdog if there is no WDAT ACPI table */ + ret = intel_pmc_get_tco_resources(pdev); + if (ret) + return ret; + + /* BIOS data register */ + res = platform_get_resource(pdev, IORESOURCE_MEM, + PLAT_RESOURCE_BIOS_DATA_INDEX); + if 
(!res) { + dev_err(&pdev->dev, "Failed to get resource of P-unit BIOS data\n"); + return -EINVAL; + } + punit_res[npunit_res++] = *res; + + /* BIOS interface register */ + res = platform_get_resource(pdev, IORESOURCE_MEM, + PLAT_RESOURCE_BIOS_IFACE_INDEX); + if (!res) { + dev_err(&pdev->dev, "Failed to get resource of P-unit BIOS interface\n"); + return -EINVAL; + } + punit_res[npunit_res++] = *res; + + /* ISP data register, optional */ + res = platform_get_resource(pdev, IORESOURCE_MEM, + PLAT_RESOURCE_ISP_DATA_INDEX); + if (res) + punit_res[npunit_res++] = *res; + + /* ISP interface register, optional */ + res = platform_get_resource(pdev, IORESOURCE_MEM, + PLAT_RESOURCE_ISP_IFACE_INDEX); + if (res) + punit_res[npunit_res++] = *res; + + /* GTD data register, optional */ + res = platform_get_resource(pdev, IORESOURCE_MEM, + PLAT_RESOURCE_GTD_DATA_INDEX); + if (res) + punit_res[npunit_res++] = *res; + + /* GTD interface register, optional */ + res = platform_get_resource(pdev, IORESOURCE_MEM, + PLAT_RESOURCE_GTD_IFACE_INDEX); + if (res) + punit_res[npunit_res++] = *res; + + punit.num_resources = npunit_res; + + /* Telemetry SSRAM is optional */ + res = platform_get_resource(pdev, IORESOURCE_MEM, + PLAT_RESOURCE_TELEM_SSRAM_INDEX); + if (res) + pmc->telem_base = res; + + return 0; +} + +static int intel_pmc_create_devices(struct intel_pmc_dev *pmc) +{ + int ret; + + if (!acpi_has_watchdog()) { + ret = devm_mfd_add_devices(pmc->dev, PLATFORM_DEVID_AUTO, &tco, + 1, NULL, 0, NULL); + if (ret) + return ret; + } + + ret = devm_mfd_add_devices(pmc->dev, PLATFORM_DEVID_AUTO, &punit, 1, + NULL, 0, NULL); + if (ret) + return ret; + + if (pmc->telem_base) { + ret = devm_mfd_add_devices(pmc->dev, PLATFORM_DEVID_AUTO, + &telem, 1, pmc->telem_base, 0, NULL); + } + + return ret; +} + +static const struct acpi_device_id intel_pmc_acpi_ids[] = { + { "INT34D2" }, + { } +}; +MODULE_DEVICE_TABLE(acpi, intel_pmc_acpi_ids); + +static int intel_pmc_probe(struct platform_device *pdev) +{ + struct intel_scu_ipc_data scu_data = {}; + struct intel_pmc_dev *pmc; + int ret; + + pmc = devm_kzalloc(&pdev->dev, sizeof(*pmc), GFP_KERNEL); + if (!pmc) + return -ENOMEM; + + pmc->dev = &pdev->dev; + spin_lock_init(&pmc->gcr_lock); + + ret = intel_pmc_get_resources(pdev, pmc, &scu_data); + if (ret) { + dev_err(&pdev->dev, "Failed to request resources\n"); + return ret; + } + + pmc->scu = devm_intel_scu_ipc_register(&pdev->dev, &scu_data); + if (IS_ERR(pmc->scu)) + return PTR_ERR(pmc->scu); + + platform_set_drvdata(pdev, pmc); + + ret = intel_pmc_create_devices(pmc); + if (ret) + dev_err(&pdev->dev, "Failed to create PMC devices\n"); + + return ret; +} + +static struct platform_driver intel_pmc_driver = { + .probe = intel_pmc_probe, + .driver = { + .name = "intel_pmc_bxt", + .acpi_match_table = intel_pmc_acpi_ids, + .dev_groups = intel_pmc_groups, + }, +}; +module_platform_driver(intel_pmc_driver); + +MODULE_AUTHOR("Mika Westerberg "); +MODULE_AUTHOR("Zha Qipeng "); +MODULE_DESCRIPTION("Intel Broxton PMC driver"); +MODULE_LICENSE("GPL v2"); diff --git a/drivers/platform/x86/Kconfig b/drivers/platform/x86/Kconfig index 303514af2e0d..642316761443 100644 --- a/drivers/platform/x86/Kconfig +++ b/drivers/platform/x86/Kconfig @@ -1269,7 +1269,8 @@ config INTEL_UNCORE_FREQ_CONTROL config INTEL_BXTWC_PMIC_TMU tristate "Intel BXT Whiskey Cove TMU Driver" depends on REGMAP - depends on INTEL_SOC_PMIC_BXTWC && INTEL_PMC_IPC + depends on MFD_INTEL_PMC_BXT + depends on INTEL_SOC_PMIC_BXTWC ---help--- Select this driver to use Intel BXT 
Whiskey Cove PMIC TMU feature. This driver enables the alarm wakeup functionality in the TMU unit @@ -1327,15 +1328,6 @@ config INTEL_PMC_CORE - LTR Ignore - MPHY/PLL gating status (Sunrisepoint PCH only) -config INTEL_PMC_IPC - tristate "Intel PMC IPC Driver" - depends on ACPI - select INTEL_SCU_IPC - ---help--- - This driver provides support for PMC control on some Intel platforms. - The PMC is an ARC processor which defines IPC commands for communication - with other entities in the CPU. - config INTEL_PUNIT_IPC tristate "Intel P-Unit IPC Driver" ---help--- @@ -1374,7 +1366,9 @@ config INTEL_SCU_IPC_UTIL config INTEL_TELEMETRY tristate "Intel SoC Telemetry Driver" - depends on INTEL_PMC_IPC && INTEL_PUNIT_IPC && X86_64 + depends on X86_64 + depends on MFD_INTEL_PMC_BXT + depends on INTEL_PUNIT_IPC ---help--- This driver provides interfaces to configure and use telemetry for INTEL SoC from APL onwards. It is also diff --git a/drivers/platform/x86/Makefile b/drivers/platform/x86/Makefile index ef995a0f04f0..04db27a25946 100644 --- a/drivers/platform/x86/Makefile +++ b/drivers/platform/x86/Makefile @@ -138,7 +138,6 @@ obj-$(CONFIG_INTEL_MFLD_THERMAL) += intel_mid_thermal.o obj-$(CONFIG_INTEL_MID_POWER_BUTTON) += intel_mid_powerbtn.o obj-$(CONFIG_INTEL_MRFLD_PWRBTN) += intel_mrfld_pwrbtn.o obj-$(CONFIG_INTEL_PMC_CORE) += intel_pmc_core.o intel_pmc_core_pltdrv.o -obj-$(CONFIG_INTEL_PMC_IPC) += intel_pmc_ipc.o obj-$(CONFIG_INTEL_PUNIT_IPC) += intel_punit_ipc.o obj-$(CONFIG_INTEL_SCU_IPC) += intel_scu_ipc.o obj-$(CONFIG_INTEL_SCU_PCI) += intel_scu_pcidrv.o diff --git a/drivers/platform/x86/intel_pmc_ipc.c b/drivers/platform/x86/intel_pmc_ipc.c deleted file mode 100644 index 16ca4ee9cdd5..000000000000 --- a/drivers/platform/x86/intel_pmc_ipc.c +++ /dev/null @@ -1,645 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* - * Driver for the Intel PMC IPC mechanism - * - * (C) Copyright 2014-2015 Intel Corporation - * - * This driver is based on Intel SCU IPC driver(intel_scu_ipc.c) by - * Sreedhara DS - * - * PMC running in ARC processor communicates with other entity running in IA - * core through IPC mechanism which in turn messaging between IA core ad PMC. - */ - -#include -#include -#include -#include -#include -#include -#include - -#include -#include - -#include - -/* Residency with clock rate at 19.2MHz to usecs */ -#define S0IX_RESIDENCY_IN_USECS(d, s) \ -({ \ - u64 result = 10ull * ((d) + (s)); \ - do_div(result, 192); \ - result; \ -}) - -/* exported resources from IFWI */ -#define PLAT_RESOURCE_IPC_INDEX 0 -#define PLAT_RESOURCE_IPC_SIZE 0x1000 -#define PLAT_RESOURCE_GCR_OFFSET 0x1000 -#define PLAT_RESOURCE_GCR_SIZE 0x1000 -#define PLAT_RESOURCE_BIOS_DATA_INDEX 1 -#define PLAT_RESOURCE_BIOS_IFACE_INDEX 2 -#define PLAT_RESOURCE_TELEM_SSRAM_INDEX 3 -#define PLAT_RESOURCE_ISP_DATA_INDEX 4 -#define PLAT_RESOURCE_ISP_IFACE_INDEX 5 -#define PLAT_RESOURCE_GTD_DATA_INDEX 6 -#define PLAT_RESOURCE_GTD_IFACE_INDEX 7 -#define PLAT_RESOURCE_ACPI_IO_INDEX 0 - -/* - * BIOS does not create an ACPI device for each PMC function, - * but exports multiple resources from one ACPI device(IPC) for - * multiple functions. This driver is responsible to create a - * platform device and to export resources for those functions. 
- */ -#define TCO_DEVICE_NAME "iTCO_wdt" -#define SMI_EN_OFFSET 0x40 -#define SMI_EN_SIZE 4 -#define TCO_BASE_OFFSET 0x60 -#define TCO_REGS_SIZE 16 -#define PUNIT_DEVICE_NAME "intel_punit_ipc" -#define TELEMETRY_DEVICE_NAME "intel_telemetry" -#define TELEM_SSRAM_SIZE 240 -#define TELEM_PMC_SSRAM_OFFSET 0x1B00 -#define TELEM_PUNIT_SSRAM_OFFSET 0x1A00 -#define TCO_PMC_OFFSET 0x08 -#define TCO_PMC_SIZE 0x04 - -/* PMC register bit definitions */ - -/* PMC_CFG_REG bit masks */ -#define PMC_CFG_NO_REBOOT_MASK BIT_MASK(4) -#define PMC_CFG_NO_REBOOT_EN (1 << 4) -#define PMC_CFG_NO_REBOOT_DIS (0 << 4) - -static struct intel_pmc_ipc_dev { - struct device *dev; - - /* The following PMC BARs share the same ACPI device with the IPC */ - resource_size_t acpi_io_base; - int acpi_io_size; - struct platform_device *tco_dev; - - /* gcr */ - void __iomem *gcr_mem_base; - bool has_gcr_regs; - spinlock_t gcr_lock; - - /* punit */ - struct platform_device *punit_dev; - unsigned int punit_res_count; - - /* Telemetry */ - resource_size_t telem_pmc_ssram_base; - resource_size_t telem_punit_ssram_base; - int telem_pmc_ssram_size; - int telem_punit_ssram_size; - u8 telem_res_inval; - struct platform_device *telemetry_dev; -} ipcdev; - -static inline u64 gcr_data_readq(u32 offset) -{ - return readq(ipcdev.gcr_mem_base + offset); -} - -static inline int is_gcr_valid(u32 offset) -{ - if (!ipcdev.has_gcr_regs) - return -EACCES; - - if (offset > PLAT_RESOURCE_GCR_SIZE) - return -EINVAL; - - return 0; -} - -/** - * intel_pmc_gcr_read64() - Read a 64-bit PMC GCR register - * @offset: offset of GCR register from GCR address base - * @data: data pointer for storing the register output - * - * Reads the 64-bit PMC GCR register at given offset. - * - * Return: negative value on error or 0 on success. - */ -int intel_pmc_gcr_read64(u32 offset, u64 *data) -{ - int ret; - - spin_lock(&ipcdev.gcr_lock); - - ret = is_gcr_valid(offset); - if (ret < 0) { - spin_unlock(&ipcdev.gcr_lock); - return ret; - } - - *data = readq(ipcdev.gcr_mem_base + offset); - - spin_unlock(&ipcdev.gcr_lock); - - return 0; -} -EXPORT_SYMBOL_GPL(intel_pmc_gcr_read64); - -/** - * intel_pmc_gcr_update() - Update PMC GCR register bits - * @offset: offset of GCR register from GCR address base - * @mask: bit mask for update operation - * @val: update value - * - * Updates the bits of given GCR register as specified by - * @mask and @val. - * - * Return: negative value on error or 0 on success. - */ -static int intel_pmc_gcr_update(u32 offset, u32 mask, u32 val) -{ - u32 new_val; - int ret = 0; - - spin_lock(&ipcdev.gcr_lock); - - ret = is_gcr_valid(offset); - if (ret < 0) - goto gcr_ipc_unlock; - - new_val = readl(ipcdev.gcr_mem_base + offset); - - new_val &= ~mask; - new_val |= val & mask; - - writel(new_val, ipcdev.gcr_mem_base + offset); - - new_val = readl(ipcdev.gcr_mem_base + offset); - - /* check whether the bit update is successful */ - if ((new_val & mask) != (val & mask)) { - ret = -EIO; - goto gcr_ipc_unlock; - } - -gcr_ipc_unlock: - spin_unlock(&ipcdev.gcr_lock); - return ret; -} - -static int update_no_reboot_bit(void *priv, bool set) -{ - u32 value = set ? 
PMC_CFG_NO_REBOOT_EN : PMC_CFG_NO_REBOOT_DIS; - - return intel_pmc_gcr_update(PMC_GCR_PMC_CFG_REG, - PMC_CFG_NO_REBOOT_MASK, value); -} - -static ssize_t intel_pmc_ipc_simple_cmd_store(struct device *dev, - struct device_attribute *attr, - const char *buf, size_t count) -{ - struct intel_scu_ipc_dev *scu = dev_get_drvdata(dev); - int subcmd; - int cmd; - int ret; - - ret = sscanf(buf, "%d %d", &cmd, &subcmd); - if (ret != 2) { - dev_err(dev, "Error args\n"); - return -EINVAL; - } - - ret = intel_scu_ipc_dev_simple_command(scu, cmd, subcmd); - if (ret) { - dev_err(dev, "command %d error with %d\n", cmd, ret); - return ret; - } - return (ssize_t)count; -} -static DEVICE_ATTR(simplecmd, 0200, NULL, intel_pmc_ipc_simple_cmd_store); - -static ssize_t intel_pmc_ipc_northpeak_store(struct device *dev, - struct device_attribute *attr, - const char *buf, size_t count) -{ - struct intel_scu_ipc_dev *scu = dev_get_drvdata(dev); - unsigned long val; - int subcmd; - int ret; - - ret = kstrtoul(buf, 0, &val); - if (ret) - return ret; - - if (val) - subcmd = 1; - else - subcmd = 0; - ret = intel_scu_ipc_dev_simple_command(scu, PMC_IPC_NORTHPEAK_CTRL, subcmd); - if (ret) { - dev_err(dev, "command north %d error with %d\n", subcmd, ret); - return ret; - } - return (ssize_t)count; -} -static DEVICE_ATTR(northpeak, 0200, NULL, intel_pmc_ipc_northpeak_store); - -static struct attribute *intel_ipc_attrs[] = { - &dev_attr_northpeak.attr, - &dev_attr_simplecmd.attr, - NULL -}; - -static const struct attribute_group intel_ipc_group = { - .attrs = intel_ipc_attrs, -}; - -static const struct attribute_group *intel_ipc_groups[] = { - &intel_ipc_group, - NULL -}; - -static struct resource punit_res_array[] = { - /* Punit BIOS */ - { - .flags = IORESOURCE_MEM, - }, - { - .flags = IORESOURCE_MEM, - }, - /* Punit ISP */ - { - .flags = IORESOURCE_MEM, - }, - { - .flags = IORESOURCE_MEM, - }, - /* Punit GTD */ - { - .flags = IORESOURCE_MEM, - }, - { - .flags = IORESOURCE_MEM, - }, -}; - -#define TCO_RESOURCE_ACPI_IO 0 -#define TCO_RESOURCE_SMI_EN_IO 1 -#define TCO_RESOURCE_GCR_MEM 2 -static struct resource tco_res[] = { - /* ACPI - TCO */ - { - .flags = IORESOURCE_IO, - }, - /* ACPI - SMI */ - { - .flags = IORESOURCE_IO, - }, -}; - -static struct itco_wdt_platform_data tco_info = { - .name = "Apollo Lake SoC", - .version = 5, - .no_reboot_priv = &ipcdev, - .update_no_reboot_bit = update_no_reboot_bit, -}; - -#define TELEMETRY_RESOURCE_PUNIT_SSRAM 0 -#define TELEMETRY_RESOURCE_PMC_SSRAM 1 -static struct resource telemetry_res[] = { - /*Telemetry*/ - { - .flags = IORESOURCE_MEM, - }, - { - .flags = IORESOURCE_MEM, - }, -}; - -static int ipc_create_punit_device(void) -{ - struct platform_device *pdev; - const struct platform_device_info pdevinfo = { - .parent = ipcdev.dev, - .name = PUNIT_DEVICE_NAME, - .id = -1, - .res = punit_res_array, - .num_res = ipcdev.punit_res_count, - }; - - pdev = platform_device_register_full(&pdevinfo); - if (IS_ERR(pdev)) - return PTR_ERR(pdev); - - ipcdev.punit_dev = pdev; - - return 0; -} - -static int ipc_create_tco_device(void) -{ - struct platform_device *pdev; - struct resource *res; - const struct platform_device_info pdevinfo = { - .parent = ipcdev.dev, - .name = TCO_DEVICE_NAME, - .id = -1, - .res = tco_res, - .num_res = ARRAY_SIZE(tco_res), - .data = &tco_info, - .size_data = sizeof(tco_info), - }; - - res = tco_res + TCO_RESOURCE_ACPI_IO; - res->start = ipcdev.acpi_io_base + TCO_BASE_OFFSET; - res->end = res->start + TCO_REGS_SIZE - 1; - - res = tco_res + TCO_RESOURCE_SMI_EN_IO; - 
res->start = ipcdev.acpi_io_base + SMI_EN_OFFSET; - res->end = res->start + SMI_EN_SIZE - 1; - - pdev = platform_device_register_full(&pdevinfo); - if (IS_ERR(pdev)) - return PTR_ERR(pdev); - - ipcdev.tco_dev = pdev; - - return 0; -} - -static int ipc_create_telemetry_device(void) -{ - struct platform_device *pdev; - struct resource *res; - const struct platform_device_info pdevinfo = { - .parent = ipcdev.dev, - .name = TELEMETRY_DEVICE_NAME, - .id = -1, - .res = telemetry_res, - .num_res = ARRAY_SIZE(telemetry_res), - }; - - res = telemetry_res + TELEMETRY_RESOURCE_PUNIT_SSRAM; - res->start = ipcdev.telem_punit_ssram_base; - res->end = res->start + ipcdev.telem_punit_ssram_size - 1; - - res = telemetry_res + TELEMETRY_RESOURCE_PMC_SSRAM; - res->start = ipcdev.telem_pmc_ssram_base; - res->end = res->start + ipcdev.telem_pmc_ssram_size - 1; - - pdev = platform_device_register_full(&pdevinfo); - if (IS_ERR(pdev)) - return PTR_ERR(pdev); - - ipcdev.telemetry_dev = pdev; - - return 0; -} - -static int ipc_create_pmc_devices(void) -{ - int ret; - - /* If we have ACPI based watchdog use that instead */ - if (!acpi_has_watchdog()) { - ret = ipc_create_tco_device(); - if (ret) { - dev_err(ipcdev.dev, "Failed to add tco platform device\n"); - return ret; - } - } - - ret = ipc_create_punit_device(); - if (ret) { - dev_err(ipcdev.dev, "Failed to add punit platform device\n"); - platform_device_unregister(ipcdev.tco_dev); - return ret; - } - - if (!ipcdev.telem_res_inval) { - ret = ipc_create_telemetry_device(); - if (ret) { - dev_warn(ipcdev.dev, - "Failed to add telemetry platform device\n"); - platform_device_unregister(ipcdev.punit_dev); - platform_device_unregister(ipcdev.tco_dev); - } - } - - return ret; -} - -static int ipc_plat_get_res(struct platform_device *pdev, - struct intel_scu_ipc_data *scu_data) -{ - struct resource *res, *punit_res = punit_res_array; - resource_size_t start; - void __iomem *addr; - int size; - - res = platform_get_resource(pdev, IORESOURCE_IO, - PLAT_RESOURCE_ACPI_IO_INDEX); - if (!res) { - dev_err(&pdev->dev, "Failed to get io resource\n"); - return -ENXIO; - } - size = resource_size(res); - ipcdev.acpi_io_base = res->start; - ipcdev.acpi_io_size = size; - dev_info(&pdev->dev, "io res: %pR\n", res); - - ipcdev.punit_res_count = 0; - - /* This is index 0 to cover BIOS data register */ - res = platform_get_resource(pdev, IORESOURCE_MEM, - PLAT_RESOURCE_BIOS_DATA_INDEX); - if (!res) { - dev_err(&pdev->dev, "Failed to get res of punit BIOS data\n"); - return -ENXIO; - } - punit_res[ipcdev.punit_res_count++] = *res; - dev_info(&pdev->dev, "punit BIOS data res: %pR\n", res); - - /* This is index 1 to cover BIOS interface register */ - res = platform_get_resource(pdev, IORESOURCE_MEM, - PLAT_RESOURCE_BIOS_IFACE_INDEX); - if (!res) { - dev_err(&pdev->dev, "Failed to get res of punit BIOS iface\n"); - return -ENXIO; - } - punit_res[ipcdev.punit_res_count++] = *res; - dev_info(&pdev->dev, "punit BIOS interface res: %pR\n", res); - - /* This is index 2 to cover ISP data register, optional */ - res = platform_get_resource(pdev, IORESOURCE_MEM, - PLAT_RESOURCE_ISP_DATA_INDEX); - if (res) { - punit_res[ipcdev.punit_res_count++] = *res; - dev_info(&pdev->dev, "punit ISP data res: %pR\n", res); - } - - /* This is index 3 to cover ISP interface register, optional */ - res = platform_get_resource(pdev, IORESOURCE_MEM, - PLAT_RESOURCE_ISP_IFACE_INDEX); - if (res) { - punit_res[ipcdev.punit_res_count++] = *res; - dev_info(&pdev->dev, "punit ISP interface res: %pR\n", res); - } - - /* 
This is index 4 to cover GTD data register, optional */ - res = platform_get_resource(pdev, IORESOURCE_MEM, - PLAT_RESOURCE_GTD_DATA_INDEX); - if (res) { - punit_res[ipcdev.punit_res_count++] = *res; - dev_info(&pdev->dev, "punit GTD data res: %pR\n", res); - } - - /* This is index 5 to cover GTD interface register, optional */ - res = platform_get_resource(pdev, IORESOURCE_MEM, - PLAT_RESOURCE_GTD_IFACE_INDEX); - if (res) { - punit_res[ipcdev.punit_res_count++] = *res; - dev_info(&pdev->dev, "punit GTD interface res: %pR\n", res); - } - - scu_data->irq = platform_get_irq(pdev, 0); - - res = platform_get_resource(pdev, IORESOURCE_MEM, - PLAT_RESOURCE_IPC_INDEX); - if (!res) { - dev_err(&pdev->dev, "Failed to get ipc resource\n"); - return -ENXIO; - } - dev_info(&pdev->dev, "ipc res: %pR\n", res); - - scu_data->mem.flags = res->flags; - scu_data->mem.start = res->start; - scu_data->mem.end = res->start + PLAT_RESOURCE_IPC_SIZE - 1; - - start = res->start + PLAT_RESOURCE_GCR_OFFSET; - if (!devm_request_mem_region(&pdev->dev, start, PLAT_RESOURCE_GCR_SIZE, - "pmc_ipc_plat")) - return -EBUSY; - - addr = devm_ioremap(&pdev->dev, start, PLAT_RESOURCE_GCR_SIZE); - if (!addr) - return -ENOMEM; - - ipcdev.gcr_mem_base = addr; - - ipcdev.telem_res_inval = 0; - res = platform_get_resource(pdev, IORESOURCE_MEM, - PLAT_RESOURCE_TELEM_SSRAM_INDEX); - if (!res) { - dev_err(&pdev->dev, "Failed to get telemetry ssram resource\n"); - ipcdev.telem_res_inval = 1; - } else { - ipcdev.telem_punit_ssram_base = res->start + - TELEM_PUNIT_SSRAM_OFFSET; - ipcdev.telem_punit_ssram_size = TELEM_SSRAM_SIZE; - ipcdev.telem_pmc_ssram_base = res->start + - TELEM_PMC_SSRAM_OFFSET; - ipcdev.telem_pmc_ssram_size = TELEM_SSRAM_SIZE; - dev_info(&pdev->dev, "telemetry ssram res: %pR\n", res); - } - - return 0; -} - -/** - * intel_pmc_s0ix_counter_read() - Read S0ix residency. - * @data: Out param that contains current S0ix residency count. - * - * Return: an error code or 0 on success. 
- */ -int intel_pmc_s0ix_counter_read(u64 *data) -{ - u64 deep, shlw; - - if (!ipcdev.has_gcr_regs) - return -EACCES; - - deep = gcr_data_readq(PMC_GCR_TELEM_DEEP_S0IX_REG); - shlw = gcr_data_readq(PMC_GCR_TELEM_SHLW_S0IX_REG); - - *data = S0IX_RESIDENCY_IN_USECS(deep, shlw); - - return 0; -} -EXPORT_SYMBOL_GPL(intel_pmc_s0ix_counter_read); - -#ifdef CONFIG_ACPI -static const struct acpi_device_id ipc_acpi_ids[] = { - { "INT34D2", 0}, - { } -}; -MODULE_DEVICE_TABLE(acpi, ipc_acpi_ids); -#endif - -static int ipc_plat_probe(struct platform_device *pdev) -{ - struct intel_scu_ipc_data scu_data = {}; - struct intel_scu_ipc_dev *scu; - int ret; - - ipcdev.dev = &pdev->dev; - spin_lock_init(&ipcdev.gcr_lock); - - ret = ipc_plat_get_res(pdev, &scu_data); - if (ret) { - dev_err(&pdev->dev, "Failed to request resource\n"); - return ret; - } - - scu = devm_intel_scu_ipc_register(&pdev->dev, &scu_data); - if (IS_ERR(scu)) - return PTR_ERR(scu); - - platform_set_drvdata(pdev, scu); - - ret = ipc_create_pmc_devices(); - if (ret) { - dev_err(&pdev->dev, "Failed to create pmc devices\n"); - return ret; - } - - ipcdev.has_gcr_regs = true; - - return 0; -} - -static int ipc_plat_remove(struct platform_device *pdev) -{ - platform_device_unregister(ipcdev.tco_dev); - platform_device_unregister(ipcdev.punit_dev); - platform_device_unregister(ipcdev.telemetry_dev); - ipcdev.dev = NULL; - return 0; -} - -static struct platform_driver ipc_plat_driver = { - .remove = ipc_plat_remove, - .probe = ipc_plat_probe, - .driver = { - .name = "pmc-ipc-plat", - .acpi_match_table = ACPI_PTR(ipc_acpi_ids), - .dev_groups = intel_ipc_groups, - }, -}; - -static int __init intel_pmc_ipc_init(void) -{ - return platform_driver_register(&ipc_plat_driver); -} - -static void __exit intel_pmc_ipc_exit(void) -{ - platform_driver_unregister(&ipc_plat_driver); -} - -MODULE_AUTHOR("Zha Qipeng "); -MODULE_DESCRIPTION("Intel PMC IPC driver"); -MODULE_LICENSE("GPL v2"); - -/* Some modules are dependent on this, so init earlier */ -fs_initcall(intel_pmc_ipc_init); -module_exit(intel_pmc_ipc_exit); diff --git a/drivers/platform/x86/intel_telemetry_debugfs.c b/drivers/platform/x86/intel_telemetry_debugfs.c index 6cac3e05b817..1d4d0fbfd63c 100644 --- a/drivers/platform/x86/intel_telemetry_debugfs.c +++ b/drivers/platform/x86/intel_telemetry_debugfs.c @@ -15,6 +15,7 @@ */ #include #include +#include #include #include #include @@ -22,7 +23,6 @@ #include #include -#include #include #define DRIVER_NAME "telemetry_soc_debugfs" @@ -647,10 +647,11 @@ DEFINE_SHOW_ATTRIBUTE(telem_soc_states); static int telem_s0ix_res_get(void *data, u64 *val) { + struct telemetry_plt_config *plt_config = telemetry_get_pltdata(); u64 s0ix_total_res; int ret; - ret = intel_pmc_s0ix_counter_read(&s0ix_total_res); + ret = intel_pmc_s0ix_counter_read(plt_config->pmc, &s0ix_total_res); if (ret) { pr_err("Failed to read S0ix residency"); return ret; @@ -837,12 +838,15 @@ static int pm_suspend_exit_cb(void) */ if (suspend_shlw_ctr_exit == suspend_shlw_ctr_temp && suspend_deep_ctr_exit == suspend_deep_ctr_temp) { - ret = intel_pmc_gcr_read64(PMC_GCR_TELEM_SHLW_S0IX_REG, + struct telemetry_plt_config *plt_config = telemetry_get_pltdata(); + struct intel_pmc_dev *pmc = plt_config->pmc; + + ret = intel_pmc_gcr_read64(pmc, PMC_GCR_TELEM_SHLW_S0IX_REG, &suspend_shlw_res_exit); if (ret < 0) goto out; - ret = intel_pmc_gcr_read64(PMC_GCR_TELEM_DEEP_S0IX_REG, + ret = intel_pmc_gcr_read64(pmc, PMC_GCR_TELEM_DEEP_S0IX_REG, &suspend_deep_res_exit); if (ret < 0) goto out; diff --git 
a/drivers/platform/x86/intel_telemetry_pltdrv.c b/drivers/platform/x86/intel_telemetry_pltdrv.c index e45303f99303..405dea87de6b 100644 --- a/drivers/platform/x86/intel_telemetry_pltdrv.c +++ b/drivers/platform/x86/intel_telemetry_pltdrv.c @@ -1115,6 +1115,8 @@ static int telemetry_pltdrv_probe(struct platform_device *pdev) telm_conf = (struct telemetry_plt_config *)id->driver_data; + telm_conf->pmc = dev_get_drvdata(pdev->dev.parent); + mem = devm_platform_ioremap_resource(pdev, 0); if (IS_ERR(mem)) return PTR_ERR(mem); diff --git a/drivers/usb/typec/tcpm/Kconfig b/drivers/usb/typec/tcpm/Kconfig index 5b986d6c801d..fa3f39336246 100644 --- a/drivers/usb/typec/tcpm/Kconfig +++ b/drivers/usb/typec/tcpm/Kconfig @@ -41,8 +41,8 @@ config TYPEC_FUSB302 config TYPEC_WCOVE tristate "Intel WhiskeyCove PMIC USB Type-C PHY driver" depends on ACPI + depends on MFD_INTEL_PMC_BXT depends on INTEL_SOC_PMIC - depends on INTEL_PMC_IPC depends on BXT_WC_PMIC_OPREGION help This driver adds support for USB Type-C on Intel Broxton platforms diff --git a/drivers/watchdog/iTCO_wdt.c b/drivers/watchdog/iTCO_wdt.c index e707c4797f76..a370a185a41c 100644 --- a/drivers/watchdog/iTCO_wdt.c +++ b/drivers/watchdog/iTCO_wdt.c @@ -64,6 +64,7 @@ #include /* For copy_to_user/put_user/... */ #include /* For inb/outb/... */ #include +#include #include "iTCO_vendor.h" @@ -233,12 +234,24 @@ static int update_no_reboot_bit_cnt(void *priv, bool set) return val != newval ? -EIO : 0; } +static int update_no_reboot_bit_pmc(void *priv, bool set) +{ + struct intel_pmc_dev *pmc = priv; + u32 bits = PMC_CFG_NO_REBOOT_EN; + u32 value = set ? bits : 0; + + return intel_pmc_gcr_update(pmc, PMC_GCR_PMC_CFG_REG, bits, value); +} + static void iTCO_wdt_no_reboot_bit_setup(struct iTCO_wdt_private *p, - struct itco_wdt_platform_data *pdata) + struct platform_device *pdev, + struct itco_wdt_platform_data *pdata) { - if (pdata->update_no_reboot_bit) { - p->update_no_reboot_bit = pdata->update_no_reboot_bit; - p->no_reboot_priv = pdata->no_reboot_priv; + if (pdata->no_reboot_use_pmc) { + struct intel_pmc_dev *pmc = dev_get_drvdata(pdev->dev.parent); + + p->update_no_reboot_bit = update_no_reboot_bit_pmc; + p->no_reboot_priv = pmc; return; } @@ -478,14 +491,14 @@ static int iTCO_wdt_probe(struct platform_device *pdev) return -ENODEV; } - iTCO_wdt_no_reboot_bit_setup(p, pdata); + iTCO_wdt_no_reboot_bit_setup(p, pdev, pdata); /* * Get the Memory-Mapped GCS or PMC register, we need it for the * NO_REBOOT flag (TCO v2 and v3). 
*/ if (p->iTCO_version >= 2 && p->iTCO_version < 6 && - !pdata->update_no_reboot_bit) { + !pdata->no_reboot_use_pmc) { p->gcs_pmc_res = platform_get_resource(pdev, IORESOURCE_MEM, ICH_RES_MEM_GCS_PMC); diff --git a/include/linux/mfd/intel_pmc_bxt.h b/include/linux/mfd/intel_pmc_bxt.h new file mode 100644 index 000000000000..f51a43d25ffd --- /dev/null +++ b/include/linux/mfd/intel_pmc_bxt.h @@ -0,0 +1,53 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef MFD_INTEL_PMC_BXT_H +#define MFD_INTEL_PMC_BXT_H + +/* GCR reg offsets from GCR base */ +#define PMC_GCR_PMC_CFG_REG 0x08 +#define PMC_GCR_TELEM_DEEP_S0IX_REG 0x78 +#define PMC_GCR_TELEM_SHLW_S0IX_REG 0x80 + +/* PMC_CFG_REG bit masks */ +#define PMC_CFG_NO_REBOOT_EN BIT(4) + +/** + * struct intel_pmc_dev - Intel PMC device structure + * @dev: Pointer to the parent PMC device + * @scu: Pointer to the SCU IPC device data structure + * @gcr_mem_base: Virtual base address of GCR (Global Configuration Registers) + * @gcr_lock: Lock used to serialize access to GCR registers + * @telem_base: Pointer to telemetry SSRAM base resource or %NULL if not + * available + */ +struct intel_pmc_dev { + struct device *dev; + struct intel_scu_ipc_dev *scu; + void __iomem *gcr_mem_base; + spinlock_t gcr_lock; + struct resource *telem_base; +}; + +#if IS_ENABLED(CONFIG_MFD_INTEL_PMC_BXT) +int intel_pmc_gcr_read64(struct intel_pmc_dev *pmc, u32 offset, u64 *data); +int intel_pmc_gcr_update(struct intel_pmc_dev *pmc, u32 offset, u32 mask, u32 val); +int intel_pmc_s0ix_counter_read(struct intel_pmc_dev *pmc, u64 *data); +#else +static inline int intel_pmc_gcr_read64(struct intel_pmc_dev *pmc, u32 offset, + u64 *data) +{ + return -ENOTSUPP; +} + +static inline int intel_pmc_gcr_update(struct intel_pmc_dev *pmc, u32 offset, + u32 mask, u32 val) +{ + return -ENOTSUPP; +} + +static inline int intel_pmc_s0ix_counter_read(struct intel_pmc_dev *pmc, u64 *data) +{ + return -ENOTSUPP; +} +#endif + +#endif /* MFD_INTEL_PMC_BXT_H */ diff --git a/include/linux/platform_data/itco_wdt.h b/include/linux/platform_data/itco_wdt.h index 2ccdce6a4e27..45d860cac2b0 100644 --- a/include/linux/platform_data/itco_wdt.h +++ b/include/linux/platform_data/itco_wdt.h @@ -12,13 +12,16 @@ #define ICH_RES_MEM_OFF 2 #define ICH_RES_MEM_GCS_PMC 0 +/** + * struct itco_wdt_platform_data - iTCO_wdt platform data + * @name: Name of the platform + * @version: iTCO version + * @no_reboot_use_pmc: Use PMC BXT API to set and clear NO_REBOOT bit + */ struct itco_wdt_platform_data { char name[32]; unsigned int version; - /* private data to be passed to update_no_reboot_bit API */ - void *no_reboot_priv; - /* pointer for platform specific no reboot update function */ - int (*update_no_reboot_bit)(void *priv, bool set); + bool no_reboot_use_pmc; }; #endif /* _ITCO_WDT_H_ */ -- cgit v1.2.3 From 9166cc49767a646990a73380480356416b7794eb Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Thu, 26 Mar 2020 15:09:32 +0200 Subject: mac80211: implement Operating Mode Notification extended NSS support Somehow we missed this for a long time, but similar to the extended NSS support in VHT capabilities, we need to have this in Operating Mode notification. Implement it by * parsing the 160/80+80 bit there and setting the bandwidth appropriately * having callers of ieee80211_get_vht_max_nss() pass in the current max NSS value as received in the operating mode notification in order to modify it appropriately depending on the extended NSS bits. This updates all drivers that use it, i.e. only iwlwifi/mvm. 
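For illustration only, a minimal sketch of how a caller could combine the new opmode bits with the extended ieee80211_get_vht_max_nss() signature. The helper below is hypothetical and not part of this patch; only the IEEE80211_OPMODE_NOTIF_* names and the new prototype come from the changes that follow.

/*
 * Hypothetical example (not part of the patch): derive bandwidth and
 * RX NSS from an operating mode notification field and pass the NSS
 * through to ieee80211_get_vht_max_nss().
 */
static int example_opmode_max_nss(struct ieee80211_vht_cap *cap, u8 opmode)
{
	enum ieee80211_vht_chanwidth bw;
	/* the RX NSS field encodes the stream count minus one */
	unsigned int nss = ((opmode & IEEE80211_OPMODE_NOTIF_RX_NSS_MASK) >>
			    IEEE80211_OPMODE_NOTIF_RX_NSS_SHIFT) + 1;

	switch (opmode & IEEE80211_OPMODE_NOTIF_CHANWIDTH_MASK) {
	case IEEE80211_OPMODE_NOTIF_CHANWIDTH_80MHZ:
		/* the new flag widens 80 MHz to 160/80+80 MHz */
		bw = (opmode & IEEE80211_OPMODE_NOTIF_BW_160_80P80) ?
			IEEE80211_VHT_CHANWIDTH_160MHZ :
			IEEE80211_VHT_CHANWIDTH_80MHZ;
		break;
	case IEEE80211_OPMODE_NOTIF_CHANWIDTH_160MHZ:
		/* legacy encoding, kept for older peers */
		bw = IEEE80211_VHT_CHANWIDTH_160MHZ;
		break;
	default:
		/* 20/40 MHz folded into the HT case for brevity */
		bw = IEEE80211_VHT_CHANWIDTH_USE_HT;
		break;
	}

	/* MCS 0 and ext_nss_bw_capable == true are arbitrary example values */
	return ieee80211_get_vht_max_nss(cap, bw, 0, true, nss);
}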
Signed-off-by: Johannes Berg Signed-off-by: Luca Coelho Link: https://lore.kernel.org/r/iwlwifi.20200326150855.098483728cfa.I4e8c25d3288441759c2793247197229f0696a37d@changeid Signed-off-by: Johannes Berg --- drivers/net/wireless/intel/iwlwifi/mvm/rs.c | 6 +++--- include/linux/ieee80211.h | 12 +++++++++--- net/mac80211/vht.c | 10 ++++++++-- net/wireless/util.c | 26 ++++++++++++++------------ 4 files changed, 34 insertions(+), 20 deletions(-) (limited to 'include/linux') diff --git a/drivers/net/wireless/intel/iwlwifi/mvm/rs.c b/drivers/net/wireless/intel/iwlwifi/mvm/rs.c index c1aba2bf73cf..a8c13f6fbce0 100644 --- a/drivers/net/wireless/intel/iwlwifi/mvm/rs.c +++ b/drivers/net/wireless/intel/iwlwifi/mvm/rs.c @@ -1,10 +1,9 @@ // SPDX-License-Identifier: GPL-2.0-only /****************************************************************************** * - * Copyright(c) 2005 - 2014 Intel Corporation. All rights reserved. + * Copyright(c) 2005 - 2014, 2018 - 2020 Intel Corporation. All rights reserved. * Copyright(c) 2013 - 2015 Intel Mobile Communications GmbH * Copyright(c) 2016 - 2017 Intel Deutschland GmbH - * Copyright(c) 2018 - 2019 Intel Corporation * * Contact Information: * Intel Linux Wireless @@ -1430,7 +1429,8 @@ static u32 rs_bw_from_sta_bw(struct ieee80211_sta *sta) */ if (ieee80211_get_vht_max_nss(&vht_cap, IEEE80211_VHT_CHANWIDTH_160MHZ, - 0, true) < sta->rx_nss) + 0, true, + sta->rx_nss) < sta->rx_nss) return RATE_MCS_CHAN_WIDTH_80; return RATE_MCS_CHAN_WIDTH_160; case IEEE80211_STA_RX_BW_80: diff --git a/include/linux/ieee80211.h b/include/linux/ieee80211.h index 16268ef1cbcc..c326aec535c6 100644 --- a/include/linux/ieee80211.h +++ b/include/linux/ieee80211.h @@ -9,7 +9,7 @@ * Copyright (c) 2006, Michael Wu * Copyright (c) 2013 - 2014 Intel Mobile Communications GmbH * Copyright (c) 2016 - 2017 Intel Deutschland GmbH - * Copyright (c) 2018 - 2019 Intel Corporation + * Copyright (c) 2018 - 2020 Intel Corporation */ #ifndef LINUX_IEEE80211_H @@ -859,6 +859,7 @@ enum ieee80211_ht_chanwidth_values { * @IEEE80211_OPMODE_NOTIF_CHANWIDTH_40MHZ: 40 MHz channel width * @IEEE80211_OPMODE_NOTIF_CHANWIDTH_80MHZ: 80 MHz channel width * @IEEE80211_OPMODE_NOTIF_CHANWIDTH_160MHZ: 160 MHz or 80+80 MHz channel width + * @IEEE80211_OPMODE_NOTIF_BW_160_80P80: 160 / 80+80 MHz indicator flag * @IEEE80211_OPMODE_NOTIF_RX_NSS_MASK: number of spatial streams mask * (the NSS value is the value of this field + 1) * @IEEE80211_OPMODE_NOTIF_RX_NSS_SHIFT: number of spatial streams shift @@ -866,11 +867,12 @@ enum ieee80211_ht_chanwidth_values { * using a beamforming steering matrix */ enum ieee80211_vht_opmode_bits { - IEEE80211_OPMODE_NOTIF_CHANWIDTH_MASK = 3, + IEEE80211_OPMODE_NOTIF_CHANWIDTH_MASK = 0x03, IEEE80211_OPMODE_NOTIF_CHANWIDTH_20MHZ = 0, IEEE80211_OPMODE_NOTIF_CHANWIDTH_40MHZ = 1, IEEE80211_OPMODE_NOTIF_CHANWIDTH_80MHZ = 2, IEEE80211_OPMODE_NOTIF_CHANWIDTH_160MHZ = 3, + IEEE80211_OPMODE_NOTIF_BW_160_80P80 = 0x04, IEEE80211_OPMODE_NOTIF_RX_NSS_MASK = 0x70, IEEE80211_OPMODE_NOTIF_RX_NSS_SHIFT = 4, IEEE80211_OPMODE_NOTIF_RX_NSS_TYPE_BF = 0x80, @@ -1731,6 +1733,9 @@ struct ieee80211_mu_edca_param_set { * @ext_nss_bw_capable: indicates whether or not the local transmitter * (rate scaling algorithm) can deal with the new logic * (dot11VHTExtendedNSSBWCapable) + * @max_vht_nss: current maximum NSS as advertised by the STA in + * operating mode notification, can be 0 in which case the + * capability data will be used to derive this (from MCS support) * * Due to the VHT Extended NSS Bandwidth Support, the 
maximum NSS can * vary for a given BW/MCS. This function parses the data. @@ -1739,7 +1744,8 @@ struct ieee80211_mu_edca_param_set { */ int ieee80211_get_vht_max_nss(struct ieee80211_vht_cap *cap, enum ieee80211_vht_chanwidth bw, - int mcs, bool ext_nss_bw_capable); + int mcs, bool ext_nss_bw_capable, + unsigned int max_vht_nss); /* 802.11ax HE MAC capabilities */ #define IEEE80211_HE_MAC_CAP0_HTC_HE 0x01 diff --git a/net/mac80211/vht.c b/net/mac80211/vht.c index 632f07401850..9c6045f9c24d 100644 --- a/net/mac80211/vht.c +++ b/net/mac80211/vht.c @@ -4,7 +4,7 @@ * * Portions of this file * Copyright(c) 2015 - 2016 Intel Deutschland GmbH - * Copyright (C) 2018 - 2019 Intel Corporation + * Copyright (C) 2018 - 2020 Intel Corporation */ #include @@ -575,15 +575,21 @@ u32 __ieee80211_vht_handle_opmode(struct ieee80211_sub_if_data *sdata, switch (opmode & IEEE80211_OPMODE_NOTIF_CHANWIDTH_MASK) { case IEEE80211_OPMODE_NOTIF_CHANWIDTH_20MHZ: + /* ignore IEEE80211_OPMODE_NOTIF_BW_160_80P80 must not be set */ sta->cur_max_bandwidth = IEEE80211_STA_RX_BW_20; break; case IEEE80211_OPMODE_NOTIF_CHANWIDTH_40MHZ: + /* ignore IEEE80211_OPMODE_NOTIF_BW_160_80P80 must not be set */ sta->cur_max_bandwidth = IEEE80211_STA_RX_BW_40; break; case IEEE80211_OPMODE_NOTIF_CHANWIDTH_80MHZ: - sta->cur_max_bandwidth = IEEE80211_STA_RX_BW_80; + if (opmode & IEEE80211_OPMODE_NOTIF_BW_160_80P80) + sta->cur_max_bandwidth = IEEE80211_STA_RX_BW_160; + else + sta->cur_max_bandwidth = IEEE80211_STA_RX_BW_80; break; case IEEE80211_OPMODE_NOTIF_CHANWIDTH_160MHZ: + /* legacy only, no longer used by newer spec */ sta->cur_max_bandwidth = IEEE80211_STA_RX_BW_160; break; } diff --git a/net/wireless/util.c b/net/wireless/util.c index 6590efbbcbb9..123d6ce79b8e 100644 --- a/net/wireless/util.c +++ b/net/wireless/util.c @@ -5,7 +5,7 @@ * Copyright 2007-2009 Johannes Berg * Copyright 2013-2014 Intel Mobile Communications GmbH * Copyright 2017 Intel Deutschland GmbH - * Copyright (C) 2018-2019 Intel Corporation + * Copyright (C) 2018-2020 Intel Corporation */ #include #include @@ -2030,10 +2030,10 @@ EXPORT_SYMBOL(cfg80211_send_layer2_update); int ieee80211_get_vht_max_nss(struct ieee80211_vht_cap *cap, enum ieee80211_vht_chanwidth bw, - int mcs, bool ext_nss_bw_capable) + int mcs, bool ext_nss_bw_capable, + unsigned int max_vht_nss) { u16 map = le16_to_cpu(cap->supp_mcs.rx_mcs_map); - int max_vht_nss = 0; int ext_nss_bw; int supp_width; int i, mcs_encoding; @@ -2041,7 +2041,7 @@ int ieee80211_get_vht_max_nss(struct ieee80211_vht_cap *cap, if (map == 0xffff) return 0; - if (WARN_ON(mcs > 9)) + if (WARN_ON(mcs > 9 || max_vht_nss > 8)) return 0; if (mcs <= 7) mcs_encoding = 0; @@ -2050,16 +2050,18 @@ int ieee80211_get_vht_max_nss(struct ieee80211_vht_cap *cap, else mcs_encoding = 2; - /* find max_vht_nss for the given MCS */ - for (i = 7; i >= 0; i--) { - int supp = (map >> (2 * i)) & 3; + if (!max_vht_nss) { + /* find max_vht_nss for the given MCS */ + for (i = 7; i >= 0; i--) { + int supp = (map >> (2 * i)) & 3; - if (supp == 3) - continue; + if (supp == 3) + continue; - if (supp >= mcs_encoding) { - max_vht_nss = i + 1; - break; + if (supp >= mcs_encoding) { + max_vht_nss = i + 1; + break; + } } } -- cgit v1.2.3 From 2a392596d8811c6d58c014ec881b159c75a0cf45 Mon Sep 17 00:00:00 2001 From: Ilan Peer Date: Thu, 26 Mar 2020 15:09:35 +0200 Subject: cfg80211: Parse HE membership selector This extends the support for drivers that rebuild IEs in the FW (same as with HT/VHT). 
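For illustration, membership selectors travel inside the (Extended) Supported Rates element next to real rates, so a check for the HE selector could look roughly like the sketch below. The helper name is made up, and masking off the basic-rate bit (0x80) is an assumption about how the element was parsed, not something this patch mandates.

/*
 * Hypothetical sketch: return true if a supported-rates buffer carries
 * the HE PHY membership selector. Real code may or may not need the
 * 0x7f mask, depending on whether the basic-rate bit was stripped.
 */
#include <linux/ieee80211.h>

static bool example_he_required(const u8 *rates, int n_rates)
{
	int i;

	for (i = 0; i < n_rates; i++)
		if ((rates[i] & 0x7f) == BSS_MEMBERSHIP_SELECTOR_HE_PHY)
			return true;

	return false;
}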
Signed-off-by: Ilan Peer Signed-off-by: Luca Coelho Link: https://lore.kernel.org/r/iwlwifi.20200326150855.20feaabfb484.I886252639604c8e3e84b8ef97962f1b0e4beec81@changeid Signed-off-by: Johannes Berg --- include/linux/ieee80211.h | 1 + include/net/cfg80211.h | 3 ++- net/wireless/nl80211.c | 2 ++ 3 files changed, 5 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/ieee80211.h b/include/linux/ieee80211.h index c326aec535c6..38f513ce7528 100644 --- a/include/linux/ieee80211.h +++ b/include/linux/ieee80211.h @@ -1067,6 +1067,7 @@ struct ieee80211_mgmt { /* Supported rates membership selectors */ #define BSS_MEMBERSHIP_SELECTOR_HT_PHY 127 #define BSS_MEMBERSHIP_SELECTOR_VHT_PHY 126 +#define BSS_MEMBERSHIP_SELECTOR_HE_PHY 122 /* mgmt header + 1 byte category code */ #define IEEE80211_MIN_ACTION_SIZE offsetof(struct ieee80211_mgmt, u.action.u) diff --git a/include/net/cfg80211.h b/include/net/cfg80211.h index dbb9675fe38f..e288fdcb3df2 100644 --- a/include/net/cfg80211.h +++ b/include/net/cfg80211.h @@ -1054,6 +1054,7 @@ enum cfg80211_ap_settings_flags { * @ht_required: stations must support HT * @vht_required: stations must support VHT * @twt_responder: Enable Target Wait Time + * @he_required: stations must support HE * @flags: flags, as defined in enum cfg80211_ap_settings_flags * @he_obss_pd: OBSS Packet Detection settings * @he_bss_color: BSS Color settings @@ -1083,7 +1084,7 @@ struct cfg80211_ap_settings { const struct ieee80211_vht_cap *vht_cap; const struct ieee80211_he_cap_elem *he_cap; const struct ieee80211_he_operation *he_oper; - bool ht_required, vht_required; + bool ht_required, vht_required, he_required; bool twt_responder; u32 flags; struct ieee80211_he_obss_pd he_obss_pd; diff --git a/net/wireless/nl80211.c b/net/wireless/nl80211.c index d470d77d2eb6..3d27b24c68b2 100644 --- a/net/wireless/nl80211.c +++ b/net/wireless/nl80211.c @@ -4738,6 +4738,8 @@ static void nl80211_check_ap_rate_selectors(struct cfg80211_ap_settings *params, params->ht_required = true; if (rates[2 + i] == BSS_MEMBERSHIP_SELECTOR_VHT_PHY) params->vht_required = true; + if (rates[2 + i] == BSS_MEMBERSHIP_SELECTOR_HE_PHY) + params->he_required = true; } } -- cgit v1.2.3 From b572510100165ba037ba43dbbb0f05e8da12c741 Mon Sep 17 00:00:00 2001 From: Thomas Pedersen Date: Wed, 1 Apr 2020 18:18:02 -0700 Subject: ieee80211: share 802.11 unit conversion helpers MHZ_TO_KHZ and KHZ_TO_MHZ are useful to drivers and elsewhere, so export these in the common ieee80211 header. Move the power helpers also because we might as well. 
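A few illustrative conversions with the now-shared helpers; the values are example data only, not taken from this patch:

/* Example usage of the shared unit conversion macros: */
static void example_conversions(void)
{
	u32 khz = MHZ_TO_KHZ(2437);	/* 2.4 GHz channel 6 -> 2437000 kHz */
	u32 mhz = KHZ_TO_MHZ(5180000);	/* 5180000 kHz -> 5180 MHz */
	int mbm = DBM_TO_MBM(20);	/* 20 dBm -> 2000 mBm */
	int dbm = MBM_TO_DBM(2000);	/* 2000 mBm -> 20 dBm */
}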
Signed-off-by: Thomas Pedersen Link: https://lore.kernel.org/r/20200402011810.22947-2-thomas@adapt-ip.com Signed-off-by: Johannes Berg --- include/linux/ieee80211.h | 10 ++++++++++ include/net/regulatory.h | 7 ------- 2 files changed, 10 insertions(+), 7 deletions(-) (limited to 'include/linux') diff --git a/include/linux/ieee80211.h b/include/linux/ieee80211.h index 38f513ce7528..a561db435a4b 100644 --- a/include/linux/ieee80211.h +++ b/include/linux/ieee80211.h @@ -3330,6 +3330,16 @@ static inline int ieee80211_get_tdls_action(struct sk_buff *skb, u32 hdr_size) #define TU_TO_JIFFIES(x) (usecs_to_jiffies((x) * 1024)) #define TU_TO_EXP_TIME(x) (jiffies + TU_TO_JIFFIES(x)) +/* convert frequencies */ +#define MHZ_TO_KHZ(freq) ((freq) * 1000) +#define KHZ_TO_MHZ(freq) ((freq) / 1000) + +/* convert powers */ +#define DBI_TO_MBI(gain) ((gain) * 100) +#define MBI_TO_DBI(gain) ((gain) / 100) +#define DBM_TO_MBM(gain) ((gain) * 100) +#define MBM_TO_DBM(gain) ((gain) / 100) + /** * ieee80211_action_contains_tpc - checks if the frame contains TPC element * @skb: the skb containing the frame, length will be checked diff --git a/include/net/regulatory.h b/include/net/regulatory.h index 3469750df0f4..09a3099886e5 100644 --- a/include/net/regulatory.h +++ b/include/net/regulatory.h @@ -231,13 +231,6 @@ struct ieee80211_regdomain { struct ieee80211_reg_rule reg_rules[]; }; -#define MHZ_TO_KHZ(freq) ((freq) * 1000) -#define KHZ_TO_MHZ(freq) ((freq) / 1000) -#define DBI_TO_MBI(gain) ((gain) * 100) -#define MBI_TO_DBI(gain) ((gain) / 100) -#define DBM_TO_MBM(gain) ((gain) * 100) -#define MBM_TO_DBM(gain) ((gain) / 100) - #define REG_RULE_EXT(start, end, bw, gain, eirp, dfs_cac, reg_flags) \ { \ .freq_range.start_freq_khz = MHZ_TO_KHZ(start), \ -- cgit v1.2.3 From 4e9a0f73f030e19a9259b69a7079021048e1f904 Mon Sep 17 00:00:00 2001 From: Ard Biesheuvel Date: Thu, 26 Mar 2020 09:24:14 +0100 Subject: efi: Clean up config table description arrays Increase legibility by adding whitespace to the efi_config_table_type_t arrays that describe which EFI config tables we look for when going over the firmware provided list. 
While at it, replace the 'name' char pointer with a char array, which is more space efficient on relocatable 64-bit kernels, as it avoids a 8 byte pointer and the associated relocation data (24 bytes when using RELA format) Signed-off-by: Ard Biesheuvel --- arch/ia64/kernel/efi.c | 12 ++++++------ arch/x86/platform/efi/efi.c | 8 ++++---- drivers/firmware/efi/arm-init.c | 4 ++-- drivers/firmware/efi/efi.c | 28 ++++++++++++++-------------- include/linux/efi.h | 2 +- 5 files changed, 27 insertions(+), 27 deletions(-) (limited to 'include/linux') diff --git a/arch/ia64/kernel/efi.c b/arch/ia64/kernel/efi.c index f69f3fe0532e..a54eacbc61a9 100644 --- a/arch/ia64/kernel/efi.c +++ b/arch/ia64/kernel/efi.c @@ -57,12 +57,12 @@ unsigned long hcdp_phys = EFI_INVALID_TABLE_ADDR; unsigned long sal_systab_phys = EFI_INVALID_TABLE_ADDR; static const efi_config_table_type_t arch_tables[] __initconst = { - {ESI_TABLE_GUID, "ESI", &esi_phys}, - {HCDP_TABLE_GUID, "HCDP", &hcdp_phys}, - {MPS_TABLE_GUID, "MPS", &mps_phys}, - {PROCESSOR_ABSTRACTION_LAYER_OVERWRITE_GUID, "PALO", &palo_phys}, - {SAL_SYSTEM_TABLE_GUID, "SALsystab", &sal_systab_phys}, - {NULL_GUID, NULL, 0}, + {ESI_TABLE_GUID, &esi_phys, "ESI" }, + {HCDP_TABLE_GUID, &hcdp_phys, "HCDP" }, + {MPS_TABLE_GUID, &mps_phys, "MPS" }, + {PROCESSOR_ABSTRACTION_LAYER_OVERWRITE_GUID, &palo_phys, "PALO" }, + {SAL_SYSTEM_TABLE_GUID, &sal_systab_phys, "SALsystab" }, + {}, }; extern efi_status_t efi_call_phys (void *, ...); diff --git a/arch/x86/platform/efi/efi.c b/arch/x86/platform/efi/efi.c index 1aae5302501d..e966115d105c 100644 --- a/arch/x86/platform/efi/efi.c +++ b/arch/x86/platform/efi/efi.c @@ -62,12 +62,12 @@ static unsigned long efi_runtime, efi_nr_tables; unsigned long efi_fw_vendor, efi_config_table; static const efi_config_table_type_t arch_tables[] __initconst = { - {EFI_PROPERTIES_TABLE_GUID, "PROP", &prop_phys}, - {UGA_IO_PROTOCOL_GUID, "UGA", &uga_phys}, + {EFI_PROPERTIES_TABLE_GUID, &prop_phys, "PROP" }, + {UGA_IO_PROTOCOL_GUID, &uga_phys, "UGA" }, #ifdef CONFIG_X86_UV - {UV_SYSTEM_TABLE_GUID, "UVsystab", &uv_systab_phys}, + {UV_SYSTEM_TABLE_GUID, &uv_systab_phys, "UVsystab" }, #endif - {NULL_GUID, NULL, NULL}, + {}, }; static const unsigned long * const efi_tables[] = { diff --git a/drivers/firmware/efi/arm-init.c b/drivers/firmware/efi/arm-init.c index 9e5e62f5f94d..c697e70ca7e7 100644 --- a/drivers/firmware/efi/arm-init.c +++ b/drivers/firmware/efi/arm-init.c @@ -54,8 +54,8 @@ static phys_addr_t __init efi_to_phys(unsigned long addr) static __initdata unsigned long screen_info_table = EFI_INVALID_TABLE_ADDR; static const efi_config_table_type_t arch_tables[] __initconst = { - {LINUX_EFI_ARM_SCREEN_INFO_TABLE_GUID, NULL, &screen_info_table}, - {NULL_GUID, NULL, NULL} + {LINUX_EFI_ARM_SCREEN_INFO_TABLE_GUID, &screen_info_table}, + {} }; static void __init init_screen_info(void) diff --git a/drivers/firmware/efi/efi.c b/drivers/firmware/efi/efi.c index 911a2bd0f6b7..e49c0b6db988 100644 --- a/drivers/firmware/efi/efi.c +++ b/drivers/firmware/efi/efi.c @@ -502,21 +502,21 @@ void __init efi_mem_reserve(phys_addr_t addr, u64 size) } static const efi_config_table_type_t common_tables[] __initconst = { - {ACPI_20_TABLE_GUID, "ACPI 2.0", &efi.acpi20}, - {ACPI_TABLE_GUID, "ACPI", &efi.acpi}, - {SMBIOS_TABLE_GUID, "SMBIOS", &efi.smbios}, - {SMBIOS3_TABLE_GUID, "SMBIOS 3.0", &efi.smbios3}, - {EFI_SYSTEM_RESOURCE_TABLE_GUID, "ESRT", &efi.esrt}, - {EFI_MEMORY_ATTRIBUTES_TABLE_GUID, "MEMATTR", &efi_mem_attr_table}, - {LINUX_EFI_RANDOM_SEED_TABLE_GUID, "RNG", 
&efi_rng_seed}, - {LINUX_EFI_TPM_EVENT_LOG_GUID, "TPMEventLog", &efi.tpm_log}, - {LINUX_EFI_TPM_FINAL_LOG_GUID, "TPMFinalLog", &efi.tpm_final_log}, - {LINUX_EFI_MEMRESERVE_TABLE_GUID, "MEMRESERVE", &mem_reserve}, - {EFI_RT_PROPERTIES_TABLE_GUID, "RTPROP", &rt_prop}, + {ACPI_20_TABLE_GUID, &efi.acpi20, "ACPI 2.0" }, + {ACPI_TABLE_GUID, &efi.acpi, "ACPI" }, + {SMBIOS_TABLE_GUID, &efi.smbios, "SMBIOS" }, + {SMBIOS3_TABLE_GUID, &efi.smbios3, "SMBIOS 3.0" }, + {EFI_SYSTEM_RESOURCE_TABLE_GUID, &efi.esrt, "ESRT" }, + {EFI_MEMORY_ATTRIBUTES_TABLE_GUID, &efi_mem_attr_table, "MEMATTR" }, + {LINUX_EFI_RANDOM_SEED_TABLE_GUID, &efi_rng_seed, "RNG" }, + {LINUX_EFI_TPM_EVENT_LOG_GUID, &efi.tpm_log, "TPMEventLog" }, + {LINUX_EFI_TPM_FINAL_LOG_GUID, &efi.tpm_final_log, "TPMFinalLog" }, + {LINUX_EFI_MEMRESERVE_TABLE_GUID, &mem_reserve, "MEMRESERVE" }, + {EFI_RT_PROPERTIES_TABLE_GUID, &rt_prop, "RTPROP" }, #ifdef CONFIG_EFI_RCI2_TABLE - {DELLEMC_EFI_RCI2_TABLE_GUID, NULL, &rci2_table_phys}, + {DELLEMC_EFI_RCI2_TABLE_GUID, &rci2_table_phys }, #endif - {NULL_GUID, NULL, NULL}, + {}, }; static __init int match_config_table(const efi_guid_t *guid, @@ -529,7 +529,7 @@ static __init int match_config_table(const efi_guid_t *guid, for (i = 0; efi_guidcmp(table_types[i].guid, NULL_GUID); i++) { if (!efi_guidcmp(*guid, table_types[i].guid)) { *(table_types[i].ptr) = table; - if (table_types[i].name) + if (table_types[i].name[0]) pr_cont(" %s=0x%lx ", table_types[i].name, table); return 1; diff --git a/include/linux/efi.h b/include/linux/efi.h index 251f1f783cdf..9b7c7ec319ac 100644 --- a/include/linux/efi.h +++ b/include/linux/efi.h @@ -379,8 +379,8 @@ typedef union { typedef struct { efi_guid_t guid; - const char *name; unsigned long *ptr; + const char name[16]; } efi_config_table_type_t; #define EFI_SYSTEM_TABLE_SIGNATURE ((u64)0x5453595320494249ULL) -- cgit v1.2.3 From acd05785e48c01edb2c4f4d014d28478b5f19fb5 Mon Sep 17 00:00:00 2001 From: David Matlack Date: Fri, 17 Apr 2020 15:14:46 -0700 Subject: kvm: add capability for halt polling KVM_CAP_HALT_POLL is a per-VM capability that lets userspace control the halt-polling time, allowing halt-polling to be tuned or disabled on particular VMs. With dynamic halt-polling, a VM's VCPUs can poll from anywhere from [0, halt_poll_ns] on each halt. KVM_CAP_HALT_POLL sets the upper limit on the poll time. Signed-off-by: David Matlack Signed-off-by: Jon Cargille Reviewed-by: Jim Mattson Message-Id: <20200417221446.108733-1-jcargill@google.com> Signed-off-by: Paolo Bonzini --- Documentation/virt/kvm/api.rst | 17 +++++++++++++++++ include/linux/kvm_host.h | 1 + include/uapi/linux/kvm.h | 1 + virt/kvm/kvm_main.c | 19 +++++++++++++++---- 4 files changed, 34 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/Documentation/virt/kvm/api.rst b/Documentation/virt/kvm/api.rst index efbbe570aa9b..d871dacb984e 100644 --- a/Documentation/virt/kvm/api.rst +++ b/Documentation/virt/kvm/api.rst @@ -5802,6 +5802,23 @@ If present, this capability can be enabled for a VM, meaning that KVM will allow the transition to secure guest mode. Otherwise KVM will veto the transition. +7.20 KVM_CAP_HALT_POLL +---------------------- + +:Architectures: all +:Target: VM +:Parameters: args[0] is the maximum poll time in nanoseconds +:Returns: 0 on success; -1 on error + +This capability overrides the kvm module parameter halt_poll_ns for the +target VM. + +VCPU polling allows a VCPU to poll for wakeup events instead of immediately +scheduling during guest halts. 
The maximum time a VCPU can spend polling is +controlled by the kvm module parameter halt_poll_ns. This capability allows +the maximum halt time to be specified on a per-VM basis, effectively overriding +the module parameter for the target VM. + 8. Other capabilities. ====================== diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index 5285a5568208..3cc6ccbb1183 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -503,6 +503,7 @@ struct kvm { struct srcu_struct srcu; struct srcu_struct irq_srcu; pid_t userspace_pid; + unsigned int max_halt_poll_ns; }; #define kvm_err(fmt, ...) \ diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h index 428c7dde6b4b..ac9eba0289d1 100644 --- a/include/uapi/linux/kvm.h +++ b/include/uapi/linux/kvm.h @@ -1017,6 +1017,7 @@ struct kvm_ppc_resize_hpt { #define KVM_CAP_S390_VCPU_RESETS 179 #define KVM_CAP_S390_PROTECTED 180 #define KVM_CAP_PPC_SECURE_GUEST 181 +#define KVM_CAP_HALT_POLL 182 #ifdef KVM_CAP_IRQ_ROUTING diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index e7436d054305..33e1eee96f75 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -710,6 +710,8 @@ static struct kvm *kvm_create_vm(unsigned long type) goto out_err_no_arch_destroy_vm; } + kvm->max_halt_poll_ns = halt_poll_ns; + r = kvm_arch_init_vm(kvm, type); if (r) goto out_err_no_arch_destroy_vm; @@ -2713,15 +2715,16 @@ out: if (!kvm_arch_no_poll(vcpu)) { if (!vcpu_valid_wakeup(vcpu)) { shrink_halt_poll_ns(vcpu); - } else if (halt_poll_ns) { + } else if (vcpu->kvm->max_halt_poll_ns) { if (block_ns <= vcpu->halt_poll_ns) ; /* we had a long block, shrink polling */ - else if (vcpu->halt_poll_ns && block_ns > halt_poll_ns) + else if (vcpu->halt_poll_ns && + block_ns > vcpu->kvm->max_halt_poll_ns) shrink_halt_poll_ns(vcpu); /* we had a short halt and our poll time is too small */ - else if (vcpu->halt_poll_ns < halt_poll_ns && - block_ns < halt_poll_ns) + else if (vcpu->halt_poll_ns < vcpu->kvm->max_halt_poll_ns && + block_ns < vcpu->kvm->max_halt_poll_ns) grow_halt_poll_ns(vcpu); } else { vcpu->halt_poll_ns = 0; @@ -3510,6 +3513,7 @@ static long kvm_vm_ioctl_check_extension_generic(struct kvm *kvm, long arg) case KVM_CAP_IOEVENTFD_ANY_LENGTH: case KVM_CAP_CHECK_EXTENSION_VM: case KVM_CAP_ENABLE_CAP_VM: + case KVM_CAP_HALT_POLL: return 1; #ifdef CONFIG_KVM_MMIO case KVM_CAP_COALESCED_MMIO: @@ -3560,6 +3564,13 @@ static int kvm_vm_ioctl_enable_cap_generic(struct kvm *kvm, return 0; } #endif + case KVM_CAP_HALT_POLL: { + if (cap->flags || cap->args[0] != (unsigned int)cap->args[0]) + return -EINVAL; + + kvm->max_halt_poll_ns = cap->args[0]; + return 0; + } default: return kvm_vm_ioctl_enable_cap(kvm, cap); } -- cgit v1.2.3 From 76c70cb58ce30264af4b714109ee756da25d830a Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Sat, 18 Apr 2020 18:52:30 +0200 Subject: PM: sleep: core: Rename dev_pm_may_skip_resume() The name of dev_pm_may_skip_resume() may be easily confused with the power.may_skip_resume flag which is not checked by that function, so rename the former as dev_pm_skip_resume(). No functional impact. Suggested-by: Alan Stern Signed-off-by: Rafael J. 
Wysocki Acked-by: Alan Stern Acked-by: Bjorn Helgaas --- Documentation/power/pci.rst | 2 +- drivers/acpi/acpi_lpss.c | 4 ++-- drivers/acpi/device_pm.c | 4 ++-- drivers/base/power/main.c | 8 ++++---- drivers/pci/pci-driver.c | 4 ++-- include/linux/pm.h | 2 +- 6 files changed, 12 insertions(+), 12 deletions(-) (limited to 'include/linux') diff --git a/Documentation/power/pci.rst b/Documentation/power/pci.rst index a39b2461919a..aa1c7fce6cd0 100644 --- a/Documentation/power/pci.rst +++ b/Documentation/power/pci.rst @@ -1034,7 +1034,7 @@ device to be left in suspend after system-wide transitions to the working state. This flag is checked by the PM core, but the PCI bus type informs the PM core which devices may be left in suspend from its perspective (that happens during the "noirq" phase of system-wide suspend and analogous transitions) and next it -uses the dev_pm_may_skip_resume() helper to decide whether or not to return from +uses the dev_pm_skip_resume() helper to decide whether or not to return from pci_pm_resume_noirq() and pci_pm_resume_early() upfront. 3.2. Device Runtime Power Management diff --git a/drivers/acpi/acpi_lpss.c b/drivers/acpi/acpi_lpss.c index c4a84df6cc98..7632df1a5be3 100644 --- a/drivers/acpi/acpi_lpss.c +++ b/drivers/acpi/acpi_lpss.c @@ -1093,7 +1093,7 @@ static int acpi_lpss_resume_early(struct device *dev) if (pdata->dev_desc->resume_from_noirq) return 0; - if (dev_pm_may_skip_resume(dev)) + if (dev_pm_skip_resume(dev)) return 0; return acpi_lpss_do_resume_early(dev); @@ -1105,7 +1105,7 @@ static int acpi_lpss_resume_noirq(struct device *dev) int ret; /* Follow acpi_subsys_resume_noirq(). */ - if (dev_pm_may_skip_resume(dev)) + if (dev_pm_skip_resume(dev)) return 0; ret = pm_generic_resume_noirq(dev); diff --git a/drivers/acpi/device_pm.c b/drivers/acpi/device_pm.c index 1b02d7dc7d34..8c2a091728a9 100644 --- a/drivers/acpi/device_pm.c +++ b/drivers/acpi/device_pm.c @@ -1127,7 +1127,7 @@ EXPORT_SYMBOL_GPL(acpi_subsys_suspend_noirq); */ static int acpi_subsys_resume_noirq(struct device *dev) { - if (dev_pm_may_skip_resume(dev)) + if (dev_pm_skip_resume(dev)) return 0; return pm_generic_resume_noirq(dev); @@ -1145,7 +1145,7 @@ static int acpi_subsys_resume_early(struct device *dev) { int ret; - if (dev_pm_may_skip_resume(dev)) + if (dev_pm_skip_resume(dev)) return 0; ret = acpi_dev_resume(dev); diff --git a/drivers/base/power/main.c b/drivers/base/power/main.c index 5adf0be6aa47..f98eced0f200 100644 --- a/drivers/base/power/main.c +++ b/drivers/base/power/main.c @@ -562,7 +562,7 @@ static void dpm_watchdog_clear(struct dpm_watchdog *wd) /*------------------------- Resume routines -------------------------*/ /** - * dev_pm_may_skip_resume - System-wide device resume optimization check. + * dev_pm_skip_resume - System-wide device resume optimization check. * @dev: Target device. * * Return: @@ -572,7 +572,7 @@ static void dpm_watchdog_clear(struct dpm_watchdog *wd) * - The logical negation of %power.must_resume otherwise (that is, when the * transition under way is RESUME). 
*/ -bool dev_pm_may_skip_resume(struct device *dev) +bool dev_pm_skip_resume(struct device *dev) { if (pm_transition.event == PM_EVENT_RESTORE) return false; @@ -611,7 +611,7 @@ static int device_resume_noirq(struct device *dev, pm_message_t state, bool asyn if (!dpm_wait_for_superior(dev, async)) goto Out; - skip_resume = dev_pm_may_skip_resume(dev); + skip_resume = dev_pm_skip_resume(dev); /* * If the driver callback is skipped below or by the middle layer * callback and device_resume_early() also skips the driver callback for @@ -797,7 +797,7 @@ static int device_resume_early(struct device *dev, pm_message_t state, bool asyn if (callback) goto Run; - if (dev_pm_may_skip_resume(dev)) + if (dev_pm_skip_resume(dev)) goto Skip; if (dev->driver && dev->driver->pm) { diff --git a/drivers/pci/pci-driver.c b/drivers/pci/pci-driver.c index ce220b1987df..decf82595340 100644 --- a/drivers/pci/pci-driver.c +++ b/drivers/pci/pci-driver.c @@ -891,7 +891,7 @@ static int pci_pm_resume_noirq(struct device *dev) pci_power_t prev_state = pci_dev->current_state; bool skip_bus_pm = pci_dev->skip_bus_pm; - if (dev_pm_may_skip_resume(dev)) + if (dev_pm_skip_resume(dev)) return 0; /* @@ -920,7 +920,7 @@ static int pci_pm_resume_noirq(struct device *dev) static int pci_pm_resume_early(struct device *dev) { - if (dev_pm_may_skip_resume(dev)) + if (dev_pm_skip_resume(dev)) return 0; return pm_generic_resume_early(dev); diff --git a/include/linux/pm.h b/include/linux/pm.h index e057d1fa2469..d89b7099f241 100644 --- a/include/linux/pm.h +++ b/include/linux/pm.h @@ -758,7 +758,7 @@ extern int pm_generic_poweroff_late(struct device *dev); extern int pm_generic_poweroff(struct device *dev); extern void pm_generic_complete(struct device *dev); -extern bool dev_pm_may_skip_resume(struct device *dev); +extern bool dev_pm_skip_resume(struct device *dev); extern bool dev_pm_smart_suspend_and_suspended(struct device *dev); #else /* !CONFIG_PM_SLEEP */ -- cgit v1.2.3 From fa2bfead910322e44e7e0bb74364ac198a2abd32 Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Sat, 18 Apr 2020 18:52:48 +0200 Subject: PM: sleep: core: Rename dev_pm_smart_suspend_and_suspended() Because all callers of dev_pm_smart_suspend_and_suspended use it only for checking whether or not to skip driver suspend callbacks for a device, rename it to dev_pm_skip_suspend() in analogy with dev_pm_skip_resume(). No functional impact. Suggested-by: Alan Stern Signed-off-by: Rafael J. 
Wysocki Acked-by: Alan Stern Acked-by: Bjorn Helgaas --- drivers/acpi/acpi_lpss.c | 6 +++--- drivers/acpi/device_pm.c | 8 ++++---- drivers/base/power/main.c | 13 ++++++------- drivers/pci/hotplug/pciehp_core.c | 2 +- drivers/pci/pci-driver.c | 8 ++++---- include/linux/pm.h | 2 +- 6 files changed, 19 insertions(+), 20 deletions(-) (limited to 'include/linux') diff --git a/drivers/acpi/acpi_lpss.c b/drivers/acpi/acpi_lpss.c index 7632df1a5be3..5e2bfbcf526f 100644 --- a/drivers/acpi/acpi_lpss.c +++ b/drivers/acpi/acpi_lpss.c @@ -1041,7 +1041,7 @@ static int acpi_lpss_do_suspend_late(struct device *dev) { int ret; - if (dev_pm_smart_suspend_and_suspended(dev)) + if (dev_pm_skip_suspend(dev)) return 0; ret = pm_generic_suspend_late(dev); @@ -1169,7 +1169,7 @@ static int acpi_lpss_poweroff_late(struct device *dev) { struct lpss_private_data *pdata = acpi_driver_data(ACPI_COMPANION(dev)); - if (dev_pm_smart_suspend_and_suspended(dev)) + if (dev_pm_skip_suspend(dev)) return 0; if (pdata->dev_desc->resume_from_noirq) @@ -1182,7 +1182,7 @@ static int acpi_lpss_poweroff_noirq(struct device *dev) { struct lpss_private_data *pdata = acpi_driver_data(ACPI_COMPANION(dev)); - if (dev_pm_smart_suspend_and_suspended(dev)) + if (dev_pm_skip_suspend(dev)) return 0; if (pdata->dev_desc->resume_from_noirq) { diff --git a/drivers/acpi/device_pm.c b/drivers/acpi/device_pm.c index 8c2a091728a9..ae234d731d42 100644 --- a/drivers/acpi/device_pm.c +++ b/drivers/acpi/device_pm.c @@ -1084,7 +1084,7 @@ int acpi_subsys_suspend_late(struct device *dev) { int ret; - if (dev_pm_smart_suspend_and_suspended(dev)) + if (dev_pm_skip_suspend(dev)) return 0; ret = pm_generic_suspend_late(dev); @@ -1100,7 +1100,7 @@ int acpi_subsys_suspend_noirq(struct device *dev) { int ret; - if (dev_pm_smart_suspend_and_suspended(dev)) + if (dev_pm_skip_suspend(dev)) return 0; ret = pm_generic_suspend_noirq(dev); @@ -1213,7 +1213,7 @@ static int acpi_subsys_poweroff_late(struct device *dev) { int ret; - if (dev_pm_smart_suspend_and_suspended(dev)) + if (dev_pm_skip_suspend(dev)) return 0; ret = pm_generic_poweroff_late(dev); @@ -1229,7 +1229,7 @@ static int acpi_subsys_poweroff_late(struct device *dev) */ static int acpi_subsys_poweroff_noirq(struct device *dev) { - if (dev_pm_smart_suspend_and_suspended(dev)) + if (dev_pm_skip_suspend(dev)) return 0; return pm_generic_poweroff_noirq(dev); diff --git a/drivers/base/power/main.c b/drivers/base/power/main.c index f98eced0f200..3170d93e29f9 100644 --- a/drivers/base/power/main.c +++ b/drivers/base/power/main.c @@ -567,8 +567,7 @@ static void dpm_watchdog_clear(struct dpm_watchdog *wd) * * Return: * - %false if the transition under way is RESTORE. - * - The return value of dev_pm_smart_suspend_and_suspended() if the transition - * under way is THAW. + * - Return value of dev_pm_skip_suspend() if the transition under way is THAW. * - The logical negation of %power.must_resume otherwise (that is, when the * transition under way is RESUME). 
*/ @@ -578,7 +577,7 @@ bool dev_pm_skip_resume(struct device *dev) return false; if (pm_transition.event == PM_EVENT_THAW) - return dev_pm_smart_suspend_and_suspended(dev); + return dev_pm_skip_suspend(dev); return !dev->power.must_resume; } @@ -624,7 +623,7 @@ static int device_resume_noirq(struct device *dev, pm_message_t state, bool asyn */ if (skip_resume) pm_runtime_set_suspended(dev); - else if (dev_pm_smart_suspend_and_suspended(dev)) + else if (dev_pm_skip_suspend(dev)) pm_runtime_set_active(dev); if (dev->pm_domain) { @@ -1223,7 +1222,7 @@ static int __device_suspend_noirq(struct device *dev, pm_message_t state, bool a if (callback) goto Run; - if (dev_pm_smart_suspend_and_suspended(dev)) + if (dev_pm_skip_suspend(dev)) goto Skip; if (dev->driver && dev->driver->pm) { @@ -1415,7 +1414,7 @@ static int __device_suspend_late(struct device *dev, pm_message_t state, bool as if (callback) goto Run; - if (dev_pm_smart_suspend_and_suspended(dev)) + if (dev_pm_skip_suspend(dev)) goto Skip; if (dev->driver && dev->driver->pm) { @@ -2003,7 +2002,7 @@ void device_pm_check_callbacks(struct device *dev) spin_unlock_irq(&dev->power.lock); } -bool dev_pm_smart_suspend_and_suspended(struct device *dev) +bool dev_pm_skip_suspend(struct device *dev) { return dev_pm_test_driver_flags(dev, DPM_FLAG_SMART_SUSPEND) && pm_runtime_status_suspended(dev); diff --git a/drivers/pci/hotplug/pciehp_core.c b/drivers/pci/hotplug/pciehp_core.c index 312cc45c44c7..bf779f291f15 100644 --- a/drivers/pci/hotplug/pciehp_core.c +++ b/drivers/pci/hotplug/pciehp_core.c @@ -275,7 +275,7 @@ static int pciehp_suspend(struct pcie_device *dev) * If the port is already runtime suspended we can keep it that * way. */ - if (dev_pm_smart_suspend_and_suspended(&dev->port->dev)) + if (dev_pm_skip_suspend(&dev->port->dev)) return 0; pciehp_disable_interrupt(dev); diff --git a/drivers/pci/pci-driver.c b/drivers/pci/pci-driver.c index decf82595340..da6510af1221 100644 --- a/drivers/pci/pci-driver.c +++ b/drivers/pci/pci-driver.c @@ -776,7 +776,7 @@ static int pci_pm_suspend(struct device *dev) static int pci_pm_suspend_late(struct device *dev) { - if (dev_pm_smart_suspend_and_suspended(dev)) + if (dev_pm_skip_suspend(dev)) return 0; pci_fixup_device(pci_fixup_suspend, to_pci_dev(dev)); @@ -789,7 +789,7 @@ static int pci_pm_suspend_noirq(struct device *dev) struct pci_dev *pci_dev = to_pci_dev(dev); const struct dev_pm_ops *pm = dev->driver ? dev->driver->pm : NULL; - if (dev_pm_smart_suspend_and_suspended(dev)) + if (dev_pm_skip_suspend(dev)) return 0; if (pci_has_legacy_pm_support(pci_dev)) @@ -1126,7 +1126,7 @@ static int pci_pm_poweroff(struct device *dev) static int pci_pm_poweroff_late(struct device *dev) { - if (dev_pm_smart_suspend_and_suspended(dev)) + if (dev_pm_skip_suspend(dev)) return 0; pci_fixup_device(pci_fixup_suspend, to_pci_dev(dev)); @@ -1139,7 +1139,7 @@ static int pci_pm_poweroff_noirq(struct device *dev) struct pci_dev *pci_dev = to_pci_dev(dev); const struct dev_pm_ops *pm = dev->driver ? 
dev->driver->pm : NULL; - if (dev_pm_smart_suspend_and_suspended(dev)) + if (dev_pm_skip_suspend(dev)) return 0; if (pci_has_legacy_pm_support(pci_dev)) diff --git a/include/linux/pm.h b/include/linux/pm.h index d89b7099f241..8c59a7f0bcf4 100644 --- a/include/linux/pm.h +++ b/include/linux/pm.h @@ -759,7 +759,7 @@ extern int pm_generic_poweroff(struct device *dev); extern void pm_generic_complete(struct device *dev); extern bool dev_pm_skip_resume(struct device *dev); -extern bool dev_pm_smart_suspend_and_suspended(struct device *dev); +extern bool dev_pm_skip_suspend(struct device *dev); #else /* !CONFIG_PM_SLEEP */ -- cgit v1.2.3 From e07515563d010d8b32967634e8dc2fdc732c1aa6 Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Sat, 18 Apr 2020 18:53:01 +0200 Subject: PM: sleep: core: Rename DPM_FLAG_NEVER_SKIP Rename DPM_FLAG_NEVER_SKIP to DPM_FLAG_NO_DIRECT_COMPLETE which matches its purpose more closely. No functional impact. Suggested-by: Alan Stern Signed-off-by: Rafael J. Wysocki Acked-by: Bjorn Helgaas # for PCI parts Acked-by: Jeff Kirsher Acked-by: Alan Stern Acked-by: Bjorn Helgaas Acked-by: Alex Deucher --- Documentation/driver-api/pm/devices.rst | 6 +++--- Documentation/power/pci.rst | 10 +++++----- drivers/base/power/main.c | 2 +- drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c | 2 +- drivers/gpu/drm/i915/intel_runtime_pm.c | 2 +- drivers/gpu/drm/radeon/radeon_kms.c | 2 +- drivers/misc/mei/pci-me.c | 2 +- drivers/misc/mei/pci-txe.c | 2 +- drivers/net/ethernet/intel/e1000e/netdev.c | 2 +- drivers/net/ethernet/intel/igb/igb_main.c | 2 +- drivers/net/ethernet/intel/igc/igc_main.c | 2 +- drivers/pci/pcie/portdrv_pci.c | 2 +- include/linux/pm.h | 6 +++--- 13 files changed, 21 insertions(+), 21 deletions(-) (limited to 'include/linux') diff --git a/Documentation/driver-api/pm/devices.rst b/Documentation/driver-api/pm/devices.rst index f66c7b9126ea..4ace0eba4506 100644 --- a/Documentation/driver-api/pm/devices.rst +++ b/Documentation/driver-api/pm/devices.rst @@ -361,9 +361,9 @@ the phases are: ``prepare``, ``suspend``, ``suspend_late``, ``suspend_noirq``. runtime PM disabled. This feature also can be controlled by device drivers by using the - ``DPM_FLAG_NEVER_SKIP`` and ``DPM_FLAG_SMART_PREPARE`` driver power - management flags. [Typically, they are set at the time the driver is - probed against the device in question by passing them to the + ``DPM_FLAG_NO_DIRECT_COMPLETE`` and ``DPM_FLAG_SMART_PREPARE`` driver + power management flags. [Typically, they are set at the time the driver + is probed against the device in question by passing them to the :c:func:`dev_pm_set_driver_flags` helper function.] If the first of these flags is set, the PM core will not apply the direct-complete procedure described above to the given device and, consequenty, to any diff --git a/Documentation/power/pci.rst b/Documentation/power/pci.rst index aa1c7fce6cd0..9e1408121bea 100644 --- a/Documentation/power/pci.rst +++ b/Documentation/power/pci.rst @@ -1004,11 +1004,11 @@ including the PCI bus type. The flags should be set once at the driver probe time with the help of the dev_pm_set_driver_flags() function and they should not be updated directly afterwards. -The DPM_FLAG_NEVER_SKIP flag prevents the PM core from using the direct-complete -mechanism allowing device suspend/resume callbacks to be skipped if the device -is in runtime suspend when the system suspend starts. That also affects all of -the ancestors of the device, so this flag should only be used if absolutely -necessary. 
+The DPM_FLAG_NO_DIRECT_COMPLETE flag prevents the PM core from using the +direct-complete mechanism allowing device suspend/resume callbacks to be skipped +if the device is in runtime suspend when the system suspend starts. That also +affects all of the ancestors of the device, so this flag should only be used if +absolutely necessary. The DPM_FLAG_SMART_PREPARE flag instructs the PCI bus type to only return a positive value from pci_pm_prepare() if the ->prepare callback provided by the diff --git a/drivers/base/power/main.c b/drivers/base/power/main.c index 3170d93e29f9..dbc1e5e7346b 100644 --- a/drivers/base/power/main.c +++ b/drivers/base/power/main.c @@ -1844,7 +1844,7 @@ unlock: spin_lock_irq(&dev->power.lock); dev->power.direct_complete = state.event == PM_EVENT_SUSPEND && (ret > 0 || dev->power.no_pm_callbacks) && - !dev_pm_test_driver_flags(dev, DPM_FLAG_NEVER_SKIP); + !dev_pm_test_driver_flags(dev, DPM_FLAG_NO_DIRECT_COMPLETE); spin_unlock_irq(&dev->power.lock); return 0; } diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c index fd1dc3236eca..a9086ea1ab60 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c @@ -191,7 +191,7 @@ int amdgpu_driver_load_kms(struct drm_device *dev, unsigned long flags) } if (adev->runpm) { - dev_pm_set_driver_flags(dev->dev, DPM_FLAG_NEVER_SKIP); + dev_pm_set_driver_flags(dev->dev, DPM_FLAG_NO_DIRECT_COMPLETE); pm_runtime_use_autosuspend(dev->dev); pm_runtime_set_autosuspend_delay(dev->dev, 5000); pm_runtime_set_active(dev->dev); diff --git a/drivers/gpu/drm/i915/intel_runtime_pm.c b/drivers/gpu/drm/i915/intel_runtime_pm.c index ad719c9602af..9cb2d7548daa 100644 --- a/drivers/gpu/drm/i915/intel_runtime_pm.c +++ b/drivers/gpu/drm/i915/intel_runtime_pm.c @@ -549,7 +549,7 @@ void intel_runtime_pm_enable(struct intel_runtime_pm *rpm) * becaue the HDA driver may require us to enable the audio power * domain during system suspend. */ - dev_pm_set_driver_flags(kdev, DPM_FLAG_NEVER_SKIP); + dev_pm_set_driver_flags(kdev, DPM_FLAG_NO_DIRECT_COMPLETE); pm_runtime_set_autosuspend_delay(kdev, 10000); /* 10s */ pm_runtime_mark_last_busy(kdev); diff --git a/drivers/gpu/drm/radeon/radeon_kms.c b/drivers/gpu/drm/radeon/radeon_kms.c index 58176db85952..372962358a18 100644 --- a/drivers/gpu/drm/radeon/radeon_kms.c +++ b/drivers/gpu/drm/radeon/radeon_kms.c @@ -158,7 +158,7 @@ int radeon_driver_load_kms(struct drm_device *dev, unsigned long flags) } if (radeon_is_px(dev)) { - dev_pm_set_driver_flags(dev->dev, DPM_FLAG_NEVER_SKIP); + dev_pm_set_driver_flags(dev->dev, DPM_FLAG_NO_DIRECT_COMPLETE); pm_runtime_use_autosuspend(dev->dev); pm_runtime_set_autosuspend_delay(dev->dev, 5000); pm_runtime_set_active(dev->dev); diff --git a/drivers/misc/mei/pci-me.c b/drivers/misc/mei/pci-me.c index 3d21c38e2dbb..53f16f3bd091 100644 --- a/drivers/misc/mei/pci-me.c +++ b/drivers/misc/mei/pci-me.c @@ -240,7 +240,7 @@ static int mei_me_probe(struct pci_dev *pdev, const struct pci_device_id *ent) * MEI requires to resume from runtime suspend mode * in order to perform link reset flow upon system suspend. 
*/ - dev_pm_set_driver_flags(&pdev->dev, DPM_FLAG_NEVER_SKIP); + dev_pm_set_driver_flags(&pdev->dev, DPM_FLAG_NO_DIRECT_COMPLETE); /* * ME maps runtime suspend/resume to D0i states, diff --git a/drivers/misc/mei/pci-txe.c b/drivers/misc/mei/pci-txe.c index beacf2a2f2b5..4bf26ce61044 100644 --- a/drivers/misc/mei/pci-txe.c +++ b/drivers/misc/mei/pci-txe.c @@ -128,7 +128,7 @@ static int mei_txe_probe(struct pci_dev *pdev, const struct pci_device_id *ent) * MEI requires to resume from runtime suspend mode * in order to perform link reset flow upon system suspend. */ - dev_pm_set_driver_flags(&pdev->dev, DPM_FLAG_NEVER_SKIP); + dev_pm_set_driver_flags(&pdev->dev, DPM_FLAG_NO_DIRECT_COMPLETE); /* * TXE maps runtime suspend/resume to own power gating states, diff --git a/drivers/net/ethernet/intel/e1000e/netdev.c b/drivers/net/ethernet/intel/e1000e/netdev.c index 177c6da80c57..2730b1c7dddb 100644 --- a/drivers/net/ethernet/intel/e1000e/netdev.c +++ b/drivers/net/ethernet/intel/e1000e/netdev.c @@ -7549,7 +7549,7 @@ static int e1000_probe(struct pci_dev *pdev, const struct pci_device_id *ent) e1000_print_device_info(adapter); - dev_pm_set_driver_flags(&pdev->dev, DPM_FLAG_NEVER_SKIP); + dev_pm_set_driver_flags(&pdev->dev, DPM_FLAG_NO_DIRECT_COMPLETE); if (pci_dev_run_wake(pdev) && hw->mac.type < e1000_pch_cnp) pm_runtime_put_noidle(&pdev->dev); diff --git a/drivers/net/ethernet/intel/igb/igb_main.c b/drivers/net/ethernet/intel/igb/igb_main.c index b46bff8fe056..8bb3db2cbd41 100644 --- a/drivers/net/ethernet/intel/igb/igb_main.c +++ b/drivers/net/ethernet/intel/igb/igb_main.c @@ -3445,7 +3445,7 @@ static int igb_probe(struct pci_dev *pdev, const struct pci_device_id *ent) } } - dev_pm_set_driver_flags(&pdev->dev, DPM_FLAG_NEVER_SKIP); + dev_pm_set_driver_flags(&pdev->dev, DPM_FLAG_NO_DIRECT_COMPLETE); pm_runtime_put_noidle(&pdev->dev); return 0; diff --git a/drivers/net/ethernet/intel/igc/igc_main.c b/drivers/net/ethernet/intel/igc/igc_main.c index 69fa1ce1f927..59fc0097438f 100644 --- a/drivers/net/ethernet/intel/igc/igc_main.c +++ b/drivers/net/ethernet/intel/igc/igc_main.c @@ -4825,7 +4825,7 @@ static int igc_probe(struct pci_dev *pdev, pcie_print_link_status(pdev); netdev_info(netdev, "MAC: %pM\n", netdev->dev_addr); - dev_pm_set_driver_flags(&pdev->dev, DPM_FLAG_NEVER_SKIP); + dev_pm_set_driver_flags(&pdev->dev, DPM_FLAG_NO_DIRECT_COMPLETE); pm_runtime_put_noidle(&pdev->dev); diff --git a/drivers/pci/pcie/portdrv_pci.c b/drivers/pci/pcie/portdrv_pci.c index 160d67c59310..3acf151ae015 100644 --- a/drivers/pci/pcie/portdrv_pci.c +++ b/drivers/pci/pcie/portdrv_pci.c @@ -115,7 +115,7 @@ static int pcie_portdrv_probe(struct pci_dev *dev, pci_save_state(dev); - dev_pm_set_driver_flags(&dev->dev, DPM_FLAG_NEVER_SKIP | + dev_pm_set_driver_flags(&dev->dev, DPM_FLAG_NO_DIRECT_COMPLETE | DPM_FLAG_SMART_SUSPEND); if (pci_bridge_d3_possible(dev)) { diff --git a/include/linux/pm.h b/include/linux/pm.h index 8c59a7f0bcf4..cdb8fbd6ab18 100644 --- a/include/linux/pm.h +++ b/include/linux/pm.h @@ -544,7 +544,7 @@ struct pm_subsys_data { * These flags can be set by device drivers at the probe time. They need not be * cleared by the drivers as the driver core will take care of that. * - * NEVER_SKIP: Do not skip all system suspend/resume callbacks for the device. + * NO_DIRECT_COMPLETE: Do not apply direct-complete optimization to the device. * SMART_PREPARE: Check the return value of the driver's ->prepare callback. * SMART_SUSPEND: No need to resume the device from runtime suspend. 
* LEAVE_SUSPENDED: Avoid resuming the device during system resume if possible. @@ -554,7 +554,7 @@ struct pm_subsys_data { * their ->prepare callbacks if the driver's ->prepare callback returns 0 (in * other words, the system suspend/resume callbacks can only be skipped for the * device if its driver doesn't object against that). This flag has no effect - * if NEVER_SKIP is set. + * if NO_DIRECT_COMPLETE is set. * * Setting SMART_SUSPEND instructs bus types and PM domains which may want to * runtime resume the device upfront during system suspend that doing so is not @@ -565,7 +565,7 @@ struct pm_subsys_data { * Setting LEAVE_SUSPENDED informs the PM core and middle-layer code that the * driver prefers the device to be left in suspend after system resume. */ -#define DPM_FLAG_NEVER_SKIP BIT(0) +#define DPM_FLAG_NO_DIRECT_COMPLETE BIT(0) #define DPM_FLAG_SMART_PREPARE BIT(1) #define DPM_FLAG_SMART_SUSPEND BIT(2) #define DPM_FLAG_LEAVE_SUSPENDED BIT(3) -- cgit v1.2.3 From 2a3f34750b8b07df42ab4b30b70e029d46e0d7f3 Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Sat, 18 Apr 2020 18:53:20 +0200 Subject: PM: sleep: core: Rename DPM_FLAG_LEAVE_SUSPENDED Rename DPM_FLAG_LEAVE_SUSPENDED to DPM_FLAG_MAY_SKIP_RESUME which matches its purpose more closely. No functional impact. Suggested-by: Alan Stern Signed-off-by: Rafael J. Wysocki Acked-by: Wolfram Sang # for I2C Acked-by: Alan Stern Acked-by: Bjorn Helgaas --- Documentation/driver-api/pm/devices.rst | 4 ++-- Documentation/power/pci.rst | 2 +- drivers/acpi/acpi_tad.c | 2 +- drivers/base/power/main.c | 2 +- drivers/i2c/busses/i2c-designware-platdrv.c | 4 ++-- include/linux/pm.h | 6 +++--- 6 files changed, 10 insertions(+), 10 deletions(-) (limited to 'include/linux') diff --git a/Documentation/driver-api/pm/devices.rst b/Documentation/driver-api/pm/devices.rst index 4ace0eba4506..f342c7549b4c 100644 --- a/Documentation/driver-api/pm/devices.rst +++ b/Documentation/driver-api/pm/devices.rst @@ -803,7 +803,7 @@ general.] However, it often is desirable to leave devices in suspend after system transitions to the working state, especially if those devices had been in runtime suspend before the preceding system-wide suspend (or analogous) -transition. Device drivers can use the ``DPM_FLAG_LEAVE_SUSPENDED`` flag to +transition. Device drivers can use the ``DPM_FLAG_MAY_SKIP_RESUME`` flag to indicate to the PM core (and middle-layer code) that they prefer the specific devices handled by them to be left suspended and they have no problems with skipping their system-wide resume callbacks for this reason. Whether or not the @@ -825,7 +825,7 @@ device really can be left in suspend. 
For devices whose "noirq", "late" and "early" driver callbacks are invoked directly by the PM core, all of the system-wide resume callbacks are skipped if -``DPM_FLAG_LEAVE_SUSPENDED`` is set and the device is in runtime suspend during +``DPM_FLAG_MAY_SKIP_RESUME`` is set and the device is in runtime suspend during the ``suspend_noirq`` (or analogous) phase or the transition under way is a proper system suspend (rather than anything related to hibernation) and the device's wakeup settings are suitable for runtime PM (that is, it cannot diff --git a/Documentation/power/pci.rst b/Documentation/power/pci.rst index 9e1408121bea..f09b382b4621 100644 --- a/Documentation/power/pci.rst +++ b/Documentation/power/pci.rst @@ -1029,7 +1029,7 @@ into D0 going forward), but if it is in runtime suspend in pci_pm_thaw_noirq(), the function will set the power.direct_complete flag for it (to make the PM core skip the subsequent "thaw" callbacks for it) and return. -Setting the DPM_FLAG_LEAVE_SUSPENDED flag means that the driver prefers the +Setting the DPM_FLAG_MAY_SKIP_RESUME flag means that the driver prefers the device to be left in suspend after system-wide transitions to the working state. This flag is checked by the PM core, but the PCI bus type informs the PM core which devices may be left in suspend from its perspective (that happens during diff --git a/drivers/acpi/acpi_tad.c b/drivers/acpi/acpi_tad.c index 33a4bcdaa4d7..7d45cce0c3c1 100644 --- a/drivers/acpi/acpi_tad.c +++ b/drivers/acpi/acpi_tad.c @@ -624,7 +624,7 @@ static int acpi_tad_probe(struct platform_device *pdev) */ device_init_wakeup(dev, true); dev_pm_set_driver_flags(dev, DPM_FLAG_SMART_SUSPEND | - DPM_FLAG_LEAVE_SUSPENDED); + DPM_FLAG_MAY_SKIP_RESUME); /* * The platform bus type layer tells the ACPI PM domain powers up the * device, so set the runtime PM status of it to "active". diff --git a/drivers/base/power/main.c b/drivers/base/power/main.c index dbc1e5e7346b..aaa4aaf41d27 100644 --- a/drivers/base/power/main.c +++ b/drivers/base/power/main.c @@ -1247,7 +1247,7 @@ Skip: * to be skipped. */ if (atomic_read(&dev->power.usage_count) > 1 || - !(dev_pm_test_driver_flags(dev, DPM_FLAG_LEAVE_SUSPENDED) && + !(dev_pm_test_driver_flags(dev, DPM_FLAG_MAY_SKIP_RESUME) && dev->power.may_skip_resume)) dev->power.must_resume = true; diff --git a/drivers/i2c/busses/i2c-designware-platdrv.c b/drivers/i2c/busses/i2c-designware-platdrv.c index 5536673060cc..c429d664f655 100644 --- a/drivers/i2c/busses/i2c-designware-platdrv.c +++ b/drivers/i2c/busses/i2c-designware-platdrv.c @@ -357,12 +357,12 @@ static int dw_i2c_plat_probe(struct platform_device *pdev) if (dev->flags & ACCESS_NO_IRQ_SUSPEND) { dev_pm_set_driver_flags(&pdev->dev, DPM_FLAG_SMART_PREPARE | - DPM_FLAG_LEAVE_SUSPENDED); + DPM_FLAG_MAY_SKIP_RESUME); } else { dev_pm_set_driver_flags(&pdev->dev, DPM_FLAG_SMART_PREPARE | DPM_FLAG_SMART_SUSPEND | - DPM_FLAG_LEAVE_SUSPENDED); + DPM_FLAG_MAY_SKIP_RESUME); } /* The code below assumes runtime PM to be disabled. */ diff --git a/include/linux/pm.h b/include/linux/pm.h index cdb8fbd6ab18..35796fc49e7a 100644 --- a/include/linux/pm.h +++ b/include/linux/pm.h @@ -547,7 +547,7 @@ struct pm_subsys_data { * NO_DIRECT_COMPLETE: Do not apply direct-complete optimization to the device. * SMART_PREPARE: Check the return value of the driver's ->prepare callback. * SMART_SUSPEND: No need to resume the device from runtime suspend. - * LEAVE_SUSPENDED: Avoid resuming the device during system resume if possible. 
+ * MAY_SKIP_RESUME: Avoid resuming the device during system resume if possible. * * Setting SMART_PREPARE instructs bus types and PM domains which may want * system suspend/resume callbacks to be skipped for the device to return 0 from @@ -562,13 +562,13 @@ struct pm_subsys_data { * invocations of the ->suspend_late and ->suspend_noirq callbacks provided by * the driver if they decide to leave the device in runtime suspend. * - * Setting LEAVE_SUSPENDED informs the PM core and middle-layer code that the + * Setting MAY_SKIP_RESUME informs the PM core and middle-layer code that the * driver prefers the device to be left in suspend after system resume. */ #define DPM_FLAG_NO_DIRECT_COMPLETE BIT(0) #define DPM_FLAG_SMART_PREPARE BIT(1) #define DPM_FLAG_SMART_SUSPEND BIT(2) -#define DPM_FLAG_LEAVE_SUSPENDED BIT(3) +#define DPM_FLAG_MAY_SKIP_RESUME BIT(3) struct dev_pm_info { pm_message_t power_state; -- cgit v1.2.3 From 2fff3f73e8c27801b84d2315e1a49bce96b00eff Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Sat, 18 Apr 2020 18:55:32 +0200 Subject: Documentation: PM: sleep: Update driver flags documentation Update the documentation of the driver flags for system-wide power management to reflect the current code flows and be more consistent. Signed-off-by: Rafael J. Wysocki --- Documentation/driver-api/pm/devices.rst | 139 +++++++++++++++++++++----------- Documentation/power/pci.rst | 43 +++++----- include/linux/pm.h | 24 ++---- 3 files changed, 119 insertions(+), 87 deletions(-) (limited to 'include/linux') diff --git a/Documentation/driver-api/pm/devices.rst b/Documentation/driver-api/pm/devices.rst index f342c7549b4c..782cb37073a3 100644 --- a/Documentation/driver-api/pm/devices.rst +++ b/Documentation/driver-api/pm/devices.rst @@ -772,62 +772,107 @@ the state of devices (possibly except for resuming them from runtime suspend) from their ``->prepare`` and ``->suspend`` callbacks (or equivalent) *before* invoking device drivers' ``->suspend`` callbacks (or equivalent). +.. _smart_suspend_flag: + +The ``DPM_FLAG_SMART_SUSPEND`` Driver Flag +------------------------------------------ + Some bus types and PM domains have a policy to resume all devices from runtime suspend upfront in their ``->suspend`` callbacks, but that may not be really necessary if the driver of the device can cope with runtime-suspended devices. The driver can indicate that by setting ``DPM_FLAG_SMART_SUSPEND`` in -:c:member:`power.driver_flags` at the probe time, by passing it to the -:c:func:`dev_pm_set_driver_flags` helper. That also may cause middle-layer code +:c:member:`power.driver_flags` at the probe time with the help of the +:c:func:`dev_pm_set_driver_flags` helper routine. + +However, setting that flag also causes the PM core and middle-layer code (bus types, PM domains etc.) to skip the ``->suspend_late`` and ``->suspend_noirq`` callbacks provided by the driver if the device remains in -runtime suspend at the beginning of the ``suspend_late`` phase of system-wide -suspend (or in the ``poweroff_late`` phase of hibernation), when runtime PM -has been disabled for it, under the assumption that its state should not change -after that point until the system-wide transition is over (the PM core itself -does that for devices whose "noirq", "late" and "early" system-wide PM callbacks -are executed directly by it). 
If that happens, the driver's system-wide resume -callbacks, if present, may still be invoked during the subsequent system-wide -resume transition and the device's runtime power management status may be set -to "active" before enabling runtime PM for it, so the driver must be prepared to -cope with the invocation of its system-wide resume callbacks back-to-back with -its ``->runtime_suspend`` one (without the intervening ``->runtime_resume`` and -so on) and the final state of the device must reflect the "active" runtime PM -status in that case. +runtime suspend during the ``suspend_late`` phase of system-wide suspend (or +during the ``poweroff_late`` or ``freeze_late`` phase of hibernation), +after runtime PM was disabled for it. [Without doing that, the same driver +callback might be executed twice in a row for the same device, which would not +be valid in general.] If the middle-layer system-wide PM callbacks are present +for the device, they are responsible for doing the above, and the PM core takes +care of it otherwise. + +In addition, with ``DPM_FLAG_SMART_SUSPEND`` set, the driver's ``->thaw_late`` +and ``->thaw_noirq`` callbacks are skipped if the device remained in runtime +suspend during the preceding "freeze" transition related to hibernation. +Again, if the middle-layer callbacks are present for the device, they are +responsible for doing that, or the PM core takes care of it otherwise. + + +The ``DPM_FLAG_MAY_SKIP_RESUME`` Driver Flag +-------------------------------------------- During system-wide resume from a sleep state it's easiest to put devices into the full-power state, as explained in :file:`Documentation/power/runtime_pm.rst`. [Refer to that document for more information regarding this particular issue as well as for information on the device runtime power management framework in -general.] - -However, it often is desirable to leave devices in suspend after system -transitions to the working state, especially if those devices had been in +general.] However, it often is desirable to leave devices in suspend after +system transitions to the working state, especially if those devices had been in runtime suspend before the preceding system-wide suspend (or analogous) -transition. Device drivers can use the ``DPM_FLAG_MAY_SKIP_RESUME`` flag to -indicate to the PM core (and middle-layer code) that they prefer the specific -devices handled by them to be left suspended and they have no problems with -skipping their system-wide resume callbacks for this reason. Whether or not the -devices will actually be left in suspend may depend on their state before the -given system suspend-resume cycle and on the type of the system transition under -way. In particular, devices are not left suspended if that transition is a -restore from hibernation, as device states are not guaranteed to be reflected -by the information stored in the hibernation image in that case. - -The middle-layer code involved in the handling of the device is expected to -indicate to the PM core if the device may be left in suspend by setting its -:c:member:`power.may_skip_resume` status bit which is checked by the PM core -during the "noirq" phase of the preceding system-wide suspend (or analogous) -transition. 
The middle layer is then responsible for handling the device as -appropriate in its "noirq" resume callback, which is executed regardless of -whether or not the device is left suspended, but the other resume callbacks -(except for ``->complete``) will be skipped automatically by the PM core if the -device really can be left in suspend. - -For devices whose "noirq", "late" and "early" driver callbacks are invoked -directly by the PM core, all of the system-wide resume callbacks are skipped if -``DPM_FLAG_MAY_SKIP_RESUME`` is set and the device is in runtime suspend during -the ``suspend_noirq`` (or analogous) phase or the transition under way is a -proper system suspend (rather than anything related to hibernation) and the -device's wakeup settings are suitable for runtime PM (that is, it cannot -generate wakeup signals at all or it is allowed to wake up the system from -sleep). +transition. + +To that end, device drivers can use the ``DPM_FLAG_MAY_SKIP_RESUME`` flag to +indicate to the PM core and middle-layer code that they allow their "noirq" and +"early" resume callbacks to be skipped if the device can be left in suspend +after system-wide PM transitions to the working state. Whether or not that is +the case generally depends on the state of the device before the given system +suspend-resume cycle and on the type of the system transition under way. +In particular, the "restore" and "thaw" transitions related to hibernation are +not affected by ``DPM_FLAG_MAY_SKIP_RESUME`` at all. [All devices are always +resumed during the "restore" transition and whether or not any driver callbacks +are skipped during the "freeze" transition depends whether or not the +``DPM_FLAG_SMART_SUSPEND`` flag is set (see `above `_).] + +The ``DPM_FLAG_MAY_SKIP_RESUME`` flag is taken into account in combination with +the :c:member:`power.may_skip_resume` status bit set by the PM core during the +"suspend" phase of suspend-type transitions. If the driver or the middle layer +has a reason to prevent the driver's "noirq" and "early" resume callbacks from +being skipped during the subsequent resume transition of the system, it should +clear :c:member:`power.may_skip_resume` in its ``->suspend``, ``->suspend_late`` +or ``->suspend_noirq`` callback. [Note that the drivers setting +``DPM_FLAG_SMART_SUSPEND`` need to clear :c:member:`power.may_skip_resume` in +their ``->suspend`` callback in case the other two are skipped.] + +Setting the :c:member:`power.may_skip_resume` status bit along with the +``DPM_FLAG_MAY_SKIP_RESUME`` flag is necessary, but generally not sufficient, +for the driver's "noirq" and "early" resume callbacks to be skipped. Whether or +not they should be skipped can be determined by evaluating the +:c:func:`dev_pm_skip_resume` helper function. + +If that function returns ``true``, the driver's "noirq" and "early" resume +callbacks should be skipped and the device's runtime PM status will be set to +"suspended" by the PM core. Otherwise, if the device was runtime-suspended +during the preceding system-wide suspend transition and +``DPM_FLAG_SMART_SUSPEND`` is set for it, its runtime PM status will be set to +"active" by the PM core. [Hence, the drivers that do not set +``DPM_FLAG_SMART_SUSPEND`` should not expect the runtime PM status of their +devices to be changed from "suspended" to "active" by the PM core during +system-wide resume-type transitions.] 
+ +If the ``DPM_FLAG_MAY_SKIP_RESUME`` flag is not set for a device, but +``DPM_FLAG_SMART_SUSPEND`` is set and the driver's "late" and "noirq" suspend +callbacks are skipped, its system-wide "noirq" and "early" resume callbacks, if +present, are invoked as usual and the device's runtime PM status is set to +"active" by the PM core before enabling runtime PM for it. In that case, the +driver must be prepared to cope with the invocation of its system-wide resume +callbacks back-to-back with its ``->runtime_suspend`` one (without the +intervening ``->runtime_resume`` and system-wide suspend callbacks) and the +final state of the device must reflect the "active" runtime PM status in that +case. [Note that this is not a problem at all if the driver's +``->suspend_late`` callback pointer points to the same function as its +``->runtime_suspend`` one and its ``->resume_early`` callback pointer points to +the same function as the ``->runtime_resume`` one, while none of the other +system-wide suspend-resume callbacks of the driver are present, for example.] + +Likewise, if ``DPM_FLAG_MAY_SKIP_RESUME`` is set for a device, its driver's +system-wide "noirq" and "early" resume callbacks may be skipped while its "late" +and "noirq" suspend callbacks may have been executed (in principle, regardless +of whether or not ``DPM_FLAG_SMART_SUSPEND`` is set). In that case, the driver +needs to be able to cope with the invocation of its ``->runtime_resume`` +callback back-to-back with its "late" and "noirq" suspend ones. [For instance, +that is not a concern if the driver sets both ``DPM_FLAG_SMART_SUSPEND`` and +``DPM_FLAG_MAY_SKIP_RESUME`` and uses the same pair of suspend/resume callback +functions for runtime PM and system-wide suspend/resume.] diff --git a/Documentation/power/pci.rst b/Documentation/power/pci.rst index f09b382b4621..1831e431f725 100644 --- a/Documentation/power/pci.rst +++ b/Documentation/power/pci.rst @@ -1010,32 +1010,33 @@ if the device is in runtime suspend when the system suspend starts. That also affects all of the ancestors of the device, so this flag should only be used if absolutely necessary. -The DPM_FLAG_SMART_PREPARE flag instructs the PCI bus type to only return a -positive value from pci_pm_prepare() if the ->prepare callback provided by the +The DPM_FLAG_SMART_PREPARE flag causes the PCI bus type to return a positive +value from pci_pm_prepare() only if the ->prepare callback provided by the driver of the device returns a positive value. That allows the driver to opt -out from using the direct-complete mechanism dynamically. +out from using the direct-complete mechanism dynamically (whereas setting +DPM_FLAG_NO_DIRECT_COMPLETE means permanent opt-out). The DPM_FLAG_SMART_SUSPEND flag tells the PCI bus type that from the driver's perspective the device can be safely left in runtime suspend during system suspend. That causes pci_pm_suspend(), pci_pm_freeze() and pci_pm_poweroff() -to skip resuming the device from runtime suspend unless there are PCI-specific -reasons for doing that. Also, it causes pci_pm_suspend_late/noirq(), -pci_pm_freeze_late/noirq() and pci_pm_poweroff_late/noirq() to return early -if the device remains in runtime suspend in the beginning of the "late" phase -of the system-wide transition under way. 
Moreover, if the device is in -runtime suspend in pci_pm_resume_noirq() or pci_pm_restore_noirq(), its runtime -power management status will be changed to "active" (as it is going to be put -into D0 going forward), but if it is in runtime suspend in pci_pm_thaw_noirq(), -the function will set the power.direct_complete flag for it (to make the PM core -skip the subsequent "thaw" callbacks for it) and return. - -Setting the DPM_FLAG_MAY_SKIP_RESUME flag means that the driver prefers the -device to be left in suspend after system-wide transitions to the working state. -This flag is checked by the PM core, but the PCI bus type informs the PM core -which devices may be left in suspend from its perspective (that happens during -the "noirq" phase of system-wide suspend and analogous transitions) and next it -uses the dev_pm_skip_resume() helper to decide whether or not to return from -pci_pm_resume_noirq() and pci_pm_resume_early() upfront. +to avoid resuming the device from runtime suspend unless there are PCI-specific +reasons for doing that. Also, it causes pci_pm_suspend_late/noirq() and +pci_pm_poweroff_late/noirq() to return early if the device remains in runtime +suspend during the "late" phase of the system-wide transition under way. +Moreover, if the device is in runtime suspend in pci_pm_resume_noirq() or +pci_pm_restore_noirq(), its runtime PM status will be changed to "active" (as it +is going to be put into D0 going forward). + +Setting the DPM_FLAG_MAY_SKIP_RESUME flag means that the driver allows its +"noirq" and "early" resume callbacks to be skipped if the device can be left +in suspend after a system-wide transition into the working state. This flag is +taken into consideration by the PM core along with the power.may_skip_resume +status bit of the device which is set by pci_pm_suspend_noirq() in certain +situations. If the PM core determines that the driver's "noirq" and "early" +resume callbacks should be skipped, the dev_pm_skip_resume() helper function +will return "true" and that will cause pci_pm_resume_noirq() and +pci_pm_resume_early() to return upfront without touching the device and +executing the driver callbacks. 3.2. Device Runtime Power Management ------------------------------------ diff --git a/include/linux/pm.h b/include/linux/pm.h index 35796fc49e7a..121c104a4090 100644 --- a/include/linux/pm.h +++ b/include/linux/pm.h @@ -545,25 +545,11 @@ struct pm_subsys_data { * cleared by the drivers as the driver core will take care of that. * * NO_DIRECT_COMPLETE: Do not apply direct-complete optimization to the device. - * SMART_PREPARE: Check the return value of the driver's ->prepare callback. - * SMART_SUSPEND: No need to resume the device from runtime suspend. - * MAY_SKIP_RESUME: Avoid resuming the device during system resume if possible. - * - * Setting SMART_PREPARE instructs bus types and PM domains which may want - * system suspend/resume callbacks to be skipped for the device to return 0 from - * their ->prepare callbacks if the driver's ->prepare callback returns 0 (in - * other words, the system suspend/resume callbacks can only be skipped for the - * device if its driver doesn't object against that). This flag has no effect - * if NO_DIRECT_COMPLETE is set. - * - * Setting SMART_SUSPEND instructs bus types and PM domains which may want to - * runtime resume the device upfront during system suspend that doing so is not - * necessary from the driver's perspective. 
It also may cause them to skip - * invocations of the ->suspend_late and ->suspend_noirq callbacks provided by - * the driver if they decide to leave the device in runtime suspend. - * - * Setting MAY_SKIP_RESUME informs the PM core and middle-layer code that the - * driver prefers the device to be left in suspend after system resume. + * SMART_PREPARE: Take the driver ->prepare callback return value into account. + * SMART_SUSPEND: Avoid resuming the device from runtime suspend. + * MAY_SKIP_RESUME: Allow driver "noirq" and "early" callbacks to be skipped. + * + * See Documentation/driver-api/pm/devices.rst for details. */ #define DPM_FLAG_NO_DIRECT_COMPLETE BIT(0) #define DPM_FLAG_SMART_PREPARE BIT(1) -- cgit v1.2.3 From 5c05c1dbb177293636a3f5ea4caa872dfcf50ccd Mon Sep 17 00:00:00 2001 From: Russell King Date: Thu, 23 Apr 2020 17:02:56 +0100 Subject: net: phylink, dsa: eliminate phylink_fixed_state_cb() Move the callback into the phylink_config structure, rather than providing a callback to set this up. Signed-off-by: Russell King Tested-by: Florian Fainelli Reviewed-by: Florian Fainelli Signed-off-by: David S. Miller --- drivers/net/phy/phylink.c | 46 +++++++++++++++------------------------------- include/linux/phylink.h | 6 +++--- net/dsa/slave.c | 20 +++++++++++--------- 3 files changed, 29 insertions(+), 43 deletions(-) (limited to 'include/linux') diff --git a/drivers/net/phy/phylink.c b/drivers/net/phy/phylink.c index 34ca12aec61b..0f23bec431c1 100644 --- a/drivers/net/phy/phylink.c +++ b/drivers/net/phy/phylink.c @@ -480,8 +480,8 @@ static void phylink_get_fixed_state(struct phylink *pl, struct phylink_link_state *state) { *state = pl->link_config; - if (pl->get_fixed_state) - pl->get_fixed_state(pl->netdev, state); + if (pl->config->get_fixed_state) + pl->config->get_fixed_state(pl->config, state); else if (pl->link_gpio) state->link = !!gpiod_get_value_cansleep(pl->link_gpio); @@ -1044,32 +1044,6 @@ void phylink_disconnect_phy(struct phylink *pl) } EXPORT_SYMBOL_GPL(phylink_disconnect_phy); -/** - * phylink_fixed_state_cb() - allow setting a fixed link callback - * @pl: a pointer to a &struct phylink returned from phylink_create() - * @cb: callback to execute to determine the fixed link state. - * - * The MAC driver should call this driver when the state of its link - * can be determined through e.g: an out of band MMIO register. 
- */ -int phylink_fixed_state_cb(struct phylink *pl, - void (*cb)(struct net_device *dev, - struct phylink_link_state *state)) -{ - /* It does not make sense to let the link be overriden unless we use - * MLO_AN_FIXED - */ - if (pl->cfg_link_an_mode != MLO_AN_FIXED) - return -EINVAL; - - mutex_lock(&pl->state_mutex); - pl->get_fixed_state = cb; - mutex_unlock(&pl->state_mutex); - - return 0; -} -EXPORT_SYMBOL_GPL(phylink_fixed_state_cb); - /** * phylink_mac_change() - notify phylink of a change in MAC state * @pl: a pointer to a &struct phylink returned from phylink_create() @@ -1106,6 +1080,8 @@ static irqreturn_t phylink_link_handler(int irq, void *data) */ void phylink_start(struct phylink *pl) { + bool poll = false; + ASSERT_RTNL(); phylink_info(pl, "configuring for %s/%s link mode\n", @@ -1142,10 +1118,18 @@ void phylink_start(struct phylink *pl) irq = 0; } if (irq <= 0) - mod_timer(&pl->link_poll, jiffies + HZ); + poll = true; + } + + switch (pl->cfg_link_an_mode) { + case MLO_AN_FIXED: + poll |= pl->config->poll_fixed_state; + break; + case MLO_AN_INBAND: + poll |= pl->config->pcs_poll; + break; } - if ((pl->cfg_link_an_mode == MLO_AN_FIXED && pl->get_fixed_state) || - pl->config->pcs_poll) + if (poll) mod_timer(&pl->link_poll, jiffies + HZ); if (pl->phydev) phy_start(pl->phydev); diff --git a/include/linux/phylink.h b/include/linux/phylink.h index 3f8d37ec5503..cc5b452a184e 100644 --- a/include/linux/phylink.h +++ b/include/linux/phylink.h @@ -67,6 +67,9 @@ struct phylink_config { struct device *dev; enum phylink_op_type type; bool pcs_poll; + bool poll_fixed_state; + void (*get_fixed_state)(struct phylink_config *config, + struct phylink_link_state *state); }; /** @@ -366,9 +369,6 @@ void phylink_destroy(struct phylink *); int phylink_connect_phy(struct phylink *, struct phy_device *); int phylink_of_phy_connect(struct phylink *, struct device_node *, u32 flags); void phylink_disconnect_phy(struct phylink *); -int phylink_fixed_state_cb(struct phylink *, - void (*cb)(struct net_device *dev, - struct phylink_link_state *)); void phylink_mac_change(struct phylink *, bool up); diff --git a/net/dsa/slave.c b/net/dsa/slave.c index f2c241cf3a80..1035230771ae 100644 --- a/net/dsa/slave.c +++ b/net/dsa/slave.c @@ -1590,10 +1590,10 @@ void dsa_port_phylink_mac_change(struct dsa_switch *ds, int port, bool up) } EXPORT_SYMBOL_GPL(dsa_port_phylink_mac_change); -static void dsa_slave_phylink_fixed_state(struct net_device *dev, +static void dsa_slave_phylink_fixed_state(struct phylink_config *config, struct phylink_link_state *state) { - struct dsa_port *dp = dsa_slave_to_port(dev); + struct dsa_port *dp = container_of(config, struct dsa_port, pl_config); struct dsa_switch *ds = dp->ds; /* No need to check that this operation is valid, the callback would @@ -1633,6 +1633,15 @@ static int dsa_slave_phy_setup(struct net_device *slave_dev) dp->pl_config.dev = &slave_dev->dev; dp->pl_config.type = PHYLINK_NETDEV; + /* The get_fixed_state callback takes precedence over polling the + * link GPIO in PHYLINK (see phylink_get_fixed_state). Only set + * this if the switch provides such a callback. 
+ */ + if (ds->ops->phylink_fixed_state) { + dp->pl_config.get_fixed_state = dsa_slave_phylink_fixed_state; + dp->pl_config.poll_fixed_state = true; + } + dp->pl = phylink_create(&dp->pl_config, of_fwnode_handle(port_dn), mode, &dsa_port_phylink_mac_ops); if (IS_ERR(dp->pl)) { @@ -1641,13 +1650,6 @@ static int dsa_slave_phy_setup(struct net_device *slave_dev) return PTR_ERR(dp->pl); } - /* Register only if the switch provides such a callback, since this - * callback takes precedence over polling the link GPIO in PHYLINK - * (see phylink_get_fixed_state). - */ - if (ds->ops->phylink_fixed_state) - phylink_fixed_state_cb(dp->pl, dsa_slave_phylink_fixed_state); - if (ds->ops->get_phy_flags) phy_flags = ds->ops->get_phy_flags(ds, dp->index); -- cgit v1.2.3 From 3194915486b2bc3f77745774f1731b78f32ff688 Mon Sep 17 00:00:00 2001 From: Heiner Kallweit Date: Thu, 23 Apr 2020 21:35:36 +0200 Subject: net: phy: remove genphy_no_soft_reset Since 6e2d85ec0559 ("net: phy: Stop with excessive soft reset") we don't need genphy_no_soft_reset() any longer. Not setting callback soft_reset results in a no-op now. Signed-off-by: Heiner Kallweit Signed-off-by: David S. Miller --- drivers/net/phy/cortina.c | 1 - drivers/net/phy/marvell10g.c | 2 -- drivers/net/phy/phy-c45.c | 1 - drivers/net/phy/phy_device.c | 1 - drivers/net/phy/teranetics.c | 1 - include/linux/phy.h | 4 ---- 6 files changed, 10 deletions(-) (limited to 'include/linux') diff --git a/drivers/net/phy/cortina.c b/drivers/net/phy/cortina.c index 856cdc36aacd..aac51362c0fe 100644 --- a/drivers/net/phy/cortina.c +++ b/drivers/net/phy/cortina.c @@ -82,7 +82,6 @@ static struct phy_driver cortina_driver[] = { .features = PHY_10GBIT_FEATURES, .config_aneg = gen10g_config_aneg, .read_status = cortina_read_status, - .soft_reset = genphy_no_soft_reset, .probe = cortina_probe, }, }; diff --git a/drivers/net/phy/marvell10g.c b/drivers/net/phy/marvell10g.c index 95e3f4644aeb..80cbc77ffd55 100644 --- a/drivers/net/phy/marvell10g.c +++ b/drivers/net/phy/marvell10g.c @@ -727,7 +727,6 @@ static struct phy_driver mv3310_drivers[] = { .phy_id_mask = MARVELL_PHY_ID_MASK, .name = "mv88x3310", .get_features = mv3310_get_features, - .soft_reset = genphy_no_soft_reset, .config_init = mv3310_config_init, .probe = mv3310_probe, .suspend = mv3310_suspend, @@ -745,7 +744,6 @@ static struct phy_driver mv3310_drivers[] = { .probe = mv3310_probe, .suspend = mv3310_suspend, .resume = mv3310_resume, - .soft_reset = genphy_no_soft_reset, .config_init = mv3310_config_init, .config_aneg = mv3310_config_aneg, .aneg_done = mv3310_aneg_done, diff --git a/drivers/net/phy/phy-c45.c b/drivers/net/phy/phy-c45.c index 67ba47ae5284..defe09d94422 100644 --- a/drivers/net/phy/phy-c45.c +++ b/drivers/net/phy/phy-c45.c @@ -564,6 +564,5 @@ struct phy_driver genphy_c45_driver = { .phy_id = 0xffffffff, .phy_id_mask = 0xffffffff, .name = "Generic Clause 45 PHY", - .soft_reset = genphy_no_soft_reset, .read_status = genphy_c45_read_status, }; diff --git a/drivers/net/phy/phy_device.c b/drivers/net/phy/phy_device.c index 206d98502b13..c8f8fd9908fe 100644 --- a/drivers/net/phy/phy_device.c +++ b/drivers/net/phy/phy_device.c @@ -2630,7 +2630,6 @@ static struct phy_driver genphy_driver = { .phy_id = 0xffffffff, .phy_id_mask = 0xffffffff, .name = "Generic PHY", - .soft_reset = genphy_no_soft_reset, .get_features = genphy_read_abilities, .suspend = genphy_suspend, .resume = genphy_resume, diff --git a/drivers/net/phy/teranetics.c b/drivers/net/phy/teranetics.c index beb054b931ee..8057ea8dbc21 100644 --- 
a/drivers/net/phy/teranetics.c +++ b/drivers/net/phy/teranetics.c @@ -78,7 +78,6 @@ static struct phy_driver teranetics_driver[] = { .phy_id_mask = 0xffffffff, .name = "Teranetics TN2020", .features = PHY_10GBIT_FEATURES, - .soft_reset = genphy_no_soft_reset, .aneg_done = teranetics_aneg_done, .config_aneg = gen10g_config_aneg, .read_status = teranetics_read_status, diff --git a/include/linux/phy.h b/include/linux/phy.h index 3941a6bcba10..e2bfb9240587 100644 --- a/include/linux/phy.h +++ b/include/linux/phy.h @@ -1251,10 +1251,6 @@ static inline int genphy_config_aneg(struct phy_device *phydev) return __genphy_config_aneg(phydev, false); } -static inline int genphy_no_soft_reset(struct phy_device *phydev) -{ - return 0; -} static inline int genphy_no_ack_interrupt(struct phy_device *phydev) { return 0; -- cgit v1.2.3 From fec86c6b8369b5dd8e3a1ad3752578a737163b25 Mon Sep 17 00:00:00 2001 From: Nuno Sá Date: Mon, 13 Apr 2020 10:24:40 +0200 Subject: iio: imu: adis: Add Managed device functions MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This patch adds support for a managed device version of adis_setup_buffer_and_trigger. It works exactly as the original one but it calls all the devm_iio_* functions to set up an iio buffer and trigger. Hence we do not need to care about cleaning those and we do not need to support a remove() callback for every driver using the adis library. Signed-off-by: Nuno Sá Signed-off-by: Jonathan Cameron --- drivers/iio/imu/adis_buffer.c | 45 ++++++++++++++++++++++++++++++++++++++++++ drivers/iio/imu/adis_trigger.c | 41 +++++++++++++++++++++++++++++++++++--- include/linux/iio/imu/adis.h | 17 ++++++++++++++++ 3 files changed, 100 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/drivers/iio/imu/adis_buffer.c b/drivers/iio/imu/adis_buffer.c index 04e5e2a0fd6b..8fdc4dca10c0 100644 --- a/drivers/iio/imu/adis_buffer.c +++ b/drivers/iio/imu/adis_buffer.c @@ -156,6 +156,14 @@ static irqreturn_t adis_trigger_handler(int irq, void *p) return IRQ_HANDLED; } +static void adis_buffer_cleanup(void *arg) +{ + struct adis *adis = arg; + + kfree(adis->buffer); + kfree(adis->xfer); +} + /** * adis_setup_buffer_and_trigger() - Sets up buffer and trigger for the adis device * @adis: The adis device. @@ -198,6 +206,43 @@ error_buffer_cleanup: } EXPORT_SYMBOL_GPL(adis_setup_buffer_and_trigger); +/** + * devm_adis_setup_buffer_and_trigger() - Sets up buffer and trigger for + * the managed adis device + * @adis: The adis device + * @indio_dev: The IIO device + * @trigger_handler: Optional trigger handler, may be NULL. + * + * Returns 0 on success, a negative error code otherwise. + * + * This function performs exactly the same as adis_setup_buffer_and_trigger() + */ +int +devm_adis_setup_buffer_and_trigger(struct adis *adis, struct iio_dev *indio_dev, + irq_handler_t trigger_handler) +{ + int ret; + + if (!trigger_handler) + trigger_handler = adis_trigger_handler; + + ret = devm_iio_triggered_buffer_setup(&adis->spi->dev, indio_dev, + &iio_pollfunc_store_time, + trigger_handler, NULL); + if (ret) + return ret; + + if (adis->spi->irq) { + ret = devm_adis_probe_trigger(adis, indio_dev); + if (ret) + return ret; + } + + return devm_add_action_or_reset(&adis->spi->dev, adis_buffer_cleanup, + adis); +} +EXPORT_SYMBOL_GPL(devm_adis_setup_buffer_and_trigger); + /** * adis_cleanup_buffer_and_trigger() - Free buffer and trigger resources + * @adis: The adis device.
diff --git a/drivers/iio/imu/adis_trigger.c b/drivers/iio/imu/adis_trigger.c index 8b9cd02c0f9f..a36810e0b1ab 100644 --- a/drivers/iio/imu/adis_trigger.c +++ b/drivers/iio/imu/adis_trigger.c @@ -27,6 +27,13 @@ static const struct iio_trigger_ops adis_trigger_ops = { .set_trigger_state = &adis_data_rdy_trigger_set_state, }; +static void adis_trigger_setup(struct adis *adis) +{ + adis->trig->dev.parent = &adis->spi->dev; + adis->trig->ops = &adis_trigger_ops; + iio_trigger_set_drvdata(adis->trig, adis); +} + /** * adis_probe_trigger() - Sets up trigger for a adis device * @adis: The adis device @@ -45,9 +52,7 @@ int adis_probe_trigger(struct adis *adis, struct iio_dev *indio_dev) if (adis->trig == NULL) return -ENOMEM; - adis->trig->dev.parent = &adis->spi->dev; - adis->trig->ops = &adis_trigger_ops; - iio_trigger_set_drvdata(adis->trig, adis); + adis_trigger_setup(adis); ret = request_irq(adis->spi->irq, &iio_trigger_generic_data_rdy_poll, @@ -73,6 +78,36 @@ error_free_trig: } EXPORT_SYMBOL_GPL(adis_probe_trigger); +/** + * devm_adis_probe_trigger() - Sets up trigger for a managed adis device + * @adis: The adis device + * @indio_dev: The IIO device + * + * Returns 0 on success or a negative error code + */ +int devm_adis_probe_trigger(struct adis *adis, struct iio_dev *indio_dev) +{ + int ret; + + adis->trig = devm_iio_trigger_alloc(&adis->spi->dev, "%s-dev%d", + indio_dev->name, indio_dev->id); + if (!adis->trig) + return -ENOMEM; + + adis_trigger_setup(adis); + + ret = devm_request_irq(&adis->spi->dev, adis->spi->irq, + &iio_trigger_generic_data_rdy_poll, + IRQF_TRIGGER_RISING, + indio_dev->name, + adis->trig); + if (ret) + return ret; + + return devm_iio_trigger_register(&adis->spi->dev, adis->trig); +} +EXPORT_SYMBOL_GPL(devm_adis_probe_trigger); + /** * adis_remove_trigger() - Remove trigger for a adis devices * @adis: The adis device diff --git a/include/linux/iio/imu/adis.h b/include/linux/iio/imu/adis.h index dd8219138c2e..e70814d96e1c 100644 --- a/include/linux/iio/imu/adis.h +++ b/include/linux/iio/imu/adis.h @@ -448,11 +448,15 @@ struct adis_burst { unsigned int extra_len; }; +int +devm_adis_setup_buffer_and_trigger(struct adis *adis, struct iio_dev *indio_dev, + irq_handler_t trigger_handler); int adis_setup_buffer_and_trigger(struct adis *adis, struct iio_dev *indio_dev, irqreturn_t (*trigger_handler)(int, void *)); void adis_cleanup_buffer_and_trigger(struct adis *adis, struct iio_dev *indio_dev); +int devm_adis_probe_trigger(struct adis *adis, struct iio_dev *indio_dev); int adis_probe_trigger(struct adis *adis, struct iio_dev *indio_dev); void adis_remove_trigger(struct adis *adis); @@ -461,6 +465,13 @@ int adis_update_scan_mode(struct iio_dev *indio_dev, #else /* CONFIG_IIO_BUFFER */ +static inline int +devm_adis_setup_buffer_and_trigger(struct adis *adis, struct iio_dev *indio_dev, + irq_handler_t trigger_handler) +{ + return 0; +} + static inline int adis_setup_buffer_and_trigger(struct adis *adis, struct iio_dev *indio_dev, irqreturn_t (*trigger_handler)(int, void *)) { @@ -472,6 +483,12 @@ static inline void adis_cleanup_buffer_and_trigger(struct adis *adis, { } +static inline int devm_adis_probe_trigger(struct adis *adis, + struct iio_dev *indio_dev) +{ + return 0; +} + static inline int adis_probe_trigger(struct adis *adis, struct iio_dev *indio_dev) { -- cgit v1.2.3 From 698211065d4aa71ce599b38c23452664a5a8e370 Mon Sep 17 00:00:00 2001 From: Nuno Sá Date: Mon, 13 Apr 2020 10:24:41 +0200 Subject: iio: imu: adis: Add irq flag variable MIME-Version: 1.0 Content-Type: 
text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit There are some ADIS devices that can configure the data ready pin polarity. Hence, we cannot hardcode our IRQ mask as IRQF_TRIGGER_RISING since we might want to have it as IRQF_TRIGGER_FALLING. Signed-off-by: Nuno Sá Signed-off-by: Jonathan Cameron --- drivers/iio/imu/adis_trigger.c | 33 +++++++++++++++++++++++++++++++-- include/linux/iio/imu/adis.h | 2 ++ 2 files changed, 33 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/drivers/iio/imu/adis_trigger.c b/drivers/iio/imu/adis_trigger.c index a36810e0b1ab..8afe71947c00 100644 --- a/drivers/iio/imu/adis_trigger.c +++ b/drivers/iio/imu/adis_trigger.c @@ -34,6 +34,27 @@ static void adis_trigger_setup(struct adis *adis) iio_trigger_set_drvdata(adis->trig, adis); } +static int adis_validate_irq_flag(struct adis *adis) +{ + /* + * Typically these devices have data ready either on the rising edge or + * on the falling edge of the data ready pin. This check enforces that + * one of those is set in the drivers... It defaults to + * IRQF_TRIGGER_RISING for backward compatibility with devices that + * don't support changing the pin polarity. + */ + if (!adis->irq_flag) { + adis->irq_flag = IRQF_TRIGGER_RISING; + return 0; + } else if (adis->irq_flag != IRQF_TRIGGER_RISING && + adis->irq_flag != IRQF_TRIGGER_FALLING) { + dev_err(&adis->spi->dev, "Invalid IRQ mask: %08lx\n", + adis->irq_flag); + return -EINVAL; + } + + return 0; +} /** * adis_probe_trigger() - Sets up trigger for a adis device * @adis: The adis device @@ -54,9 +75,13 @@ int adis_probe_trigger(struct adis *adis, struct iio_dev *indio_dev) adis_trigger_setup(adis); + ret = adis_validate_irq_flag(adis); + if (ret) + return ret; + ret = request_irq(adis->spi->irq, &iio_trigger_generic_data_rdy_poll, - IRQF_TRIGGER_RISING, + adis->irq_flag, indio_dev->name, adis->trig); if (ret) @@ -96,9 +121,13 @@ int devm_adis_probe_trigger(struct adis *adis, struct iio_dev *indio_dev) adis_trigger_setup(adis); + ret = adis_validate_irq_flag(adis); + if (ret) + return ret; + ret = devm_request_irq(&adis->spi->dev, adis->spi->irq, &iio_trigger_generic_data_rdy_poll, - IRQF_TRIGGER_RISING, + adis->irq_flag, indio_dev->name, adis->trig); if (ret) diff --git a/include/linux/iio/imu/adis.h b/include/linux/iio/imu/adis.h index e70814d96e1c..70e4d1d262f5 100644 --- a/include/linux/iio/imu/adis.h +++ b/include/linux/iio/imu/adis.h @@ -87,6 +87,7 @@ struct adis_data { * @msg: SPI message object * @xfer: SPI transfer objects to be used for a @msg * @current_page: Some ADIS devices have registers, this selects current page + * @irq_flag: IRQ handling flags as passed to request_irq() * @buffer: Data buffer for information read from the device * @tx: DMA safe TX buffer for SPI transfers * @rx: DMA safe RX buffer for SPI transfers @@ -113,6 +114,7 @@ struct adis { struct spi_message msg; struct spi_transfer *xfer; unsigned int current_page; + unsigned long irq_flag; void *buffer; uint8_t tx[10] ____cacheline_aligned; -- cgit v1.2.3 From b9c5eec725d67b548e4dfdc406d6ca2c6d30d1c2 Mon Sep 17 00:00:00 2001 From: Nuno Sá Date: Mon, 13 Apr 2020 10:24:42 +0200 Subject: iio: adis: Add adis_update_bits() APIs MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This patch adds a `regmap_update_bits()` like API to the ADIS library. It provides locked and unlocked variants.
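As an illustration only (this snippet is not part of the patch; the register, field, and driver-state names are made up), a driver built on the library could then toggle a bit in a 16-bit register with the locked variant:

	/* Hypothetical 16-bit control register and bit, for illustration only. */
	#define ADIS_REG_CTRL		0x36
	#define ADIS_CTRL_SELF_TEST	BIT(2)

	u16 val = enable ? ADIS_CTRL_SELF_TEST : 0;
	int ret;

	/* Locked read-modify-write; sizeof(val) == 2 selects the 16-bit access. */
	ret = adis_update_bits(&st->adis, ADIS_REG_CTRL, ADIS_CTRL_SELF_TEST, val);
	if (ret)
		return ret;

Note that @val is deliberately a u16 variable rather than BIT(2) itself: sizeof(BIT(2)) is the size of an unsigned long, which would trip the BUILD_BUG_ON() or pick the wrong access width, exactly the pitfall the kernel-doc below warns about. The unlocked __adis_update_bits() form behaves the same but expects the caller to already hold adis->state_lock.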
Signed-off-by: Nuno Sá Signed-off-by: Jonathan Cameron --- drivers/iio/imu/adis.c | 25 +++++++++++++++++++ include/linux/iio/imu/adis.h | 59 ++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 84 insertions(+) (limited to 'include/linux') diff --git a/drivers/iio/imu/adis.c b/drivers/iio/imu/adis.c index 2e7d0d337f8f..c539dfa3b8d3 100644 --- a/drivers/iio/imu/adis.c +++ b/drivers/iio/imu/adis.c @@ -223,6 +223,31 @@ int __adis_read_reg(struct adis *adis, unsigned int reg, return ret; } EXPORT_SYMBOL_GPL(__adis_read_reg); +/** + * __adis_update_bits_base() - ADIS Update bits function - Unlocked version + * @adis: The adis device + * @reg: The address of the lower of the two registers + * @mask: Bitmask to change + * @val: Value to be written + * @size: Size of the register to update + * + * Updates the desired bits of @reg in accordance with @mask and @val. + */ +int __adis_update_bits_base(struct adis *adis, unsigned int reg, const u32 mask, + const u32 val, u8 size) +{ + int ret; + u32 __val; + + ret = __adis_read_reg(adis, reg, &__val, size); + if (ret) + return ret; + + __val = (__val & ~mask) | (val & mask); + + return __adis_write_reg(adis, reg, __val, size); +} +EXPORT_SYMBOL_GPL(__adis_update_bits_base); #ifdef CONFIG_DEBUG_FS diff --git a/include/linux/iio/imu/adis.h b/include/linux/iio/imu/adis.h index 70e4d1d262f5..247fc4c7185c 100644 --- a/include/linux/iio/imu/adis.h +++ b/include/linux/iio/imu/adis.h @@ -333,6 +333,65 @@ static inline int adis_read_reg_32(struct adis *adis, unsigned int reg, return ret; } +int __adis_update_bits_base(struct adis *adis, unsigned int reg, const u32 mask, + const u32 val, u8 size); +/** + * adis_update_bits_base() - ADIS Update bits function - Locked version + * @adis: The adis device + * @reg: The address of the lower of the two registers + * @mask: Bitmask to change + * @val: Value to be written + * @size: Size of the register to update + * + * Updates the desired bits of @reg in accordance with @mask and @val. + */ +static inline int adis_update_bits_base(struct adis *adis, unsigned int reg, + const u32 mask, const u32 val, u8 size) +{ + int ret; + + mutex_lock(&adis->state_lock); + ret = __adis_update_bits_base(adis, reg, mask, val, size); + mutex_unlock(&adis->state_lock); + return ret; +} + +/** + * adis_update_bits() - Wrapper macro for adis_update_bits_base - Locked version + * @adis: The adis device + * @reg: The address of the lower of the two registers + * @mask: Bitmask to change + * @val: Value to be written + * + * This macro evaluates the sizeof() of @val at compile time and calls + * adis_update_bits_base() accordingly. Be aware that using MACROS/DEFINES for + * @val can lead to undesired behavior if the register to update is 16bit. + */ +#define adis_update_bits(adis, reg, mask, val) ({ \ + BUILD_BUG_ON(sizeof(val) == 1 || sizeof(val) == 8); \ + __builtin_choose_expr(sizeof(val) == 4, \ + adis_update_bits_base(adis, reg, mask, val, 4), \ + adis_update_bits_base(adis, reg, mask, val, 2)); \ +}) + +/** + * __adis_update_bits() - Wrapper macro for __adis_update_bits_base - Unlocked version + * @adis: The adis device + * @reg: The address of the lower of the two registers + * @mask: Bitmask to change + * @val: Value to be written + * + * This macro evaluates the sizeof() of @val at compile time and calls + * __adis_update_bits_base() accordingly. Be aware that using MACROS/DEFINES for + * @val can lead to undesired behavior if the register to update is 16bit.
+ */ +#define __adis_update_bits(adis, reg, mask, val) ({ \ + BUILD_BUG_ON(sizeof(val) == 1 || sizeof(val) == 8); \ + __builtin_choose_expr(sizeof(val) == 4, \ + __adis_update_bits_base(adis, reg, mask, val, 4), \ + __adis_update_bits_base(adis, reg, mask, val, 2)); \ +}) + int adis_enable_irq(struct adis *adis, bool enable); int __adis_check_status(struct adis *adis); int __adis_initial_startup(struct adis *adis); -- cgit v1.2.3 From 3e04cb60e872b9433e523d62c39addf0bd1a18f1 Mon Sep 17 00:00:00 2001 From: Nuno Sá Date: Mon, 13 Apr 2020 10:24:43 +0200 Subject: iio: adis: Support different burst sizes MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add burst_max_len to `adis_burst`. This is useful for devices which support different burst modes with different sizes. The buffer to be used in the spi transfer is allocated with this variable, making sure that it has space for all burst modes. The spi transfer length should hold the "real" burst length depending on the current burst mode configured in the device. Moreover, `extra_len` in `adis_burst` is made const and it should contain the smallest extra length necessary for a burst transfer. A new `burst_extra_len` field was added to `struct adis`; it should hold the extra bytes needed depending on the device instance being used. Signed-off-by: Nuno Sá Signed-off-by: Jonathan Cameron --- drivers/iio/imu/adis16400.c | 2 +- drivers/iio/imu/adis_buffer.c | 13 +++++++++---- include/linux/iio/imu/adis.h | 9 +++++++-- 3 files changed, 17 insertions(+), 7 deletions(-) (limited to 'include/linux') diff --git a/drivers/iio/imu/adis16400.c b/drivers/iio/imu/adis16400.c index 4445c242709c..439feb755d82 100644 --- a/drivers/iio/imu/adis16400.c +++ b/drivers/iio/imu/adis16400.c @@ -1195,7 +1195,7 @@ static int adis16400_probe(struct spi_device *spi) indio_dev->available_scan_masks = st->avail_scan_mask; st->adis.burst = &adis16400_burst; if (st->variant->flags & ADIS16400_BURST_DIAG_STAT) - st->adis.burst->extra_len = sizeof(u16); + st->adis.burst_extra_len = sizeof(u16); } adis16400_data = &st->variant->adis_data; diff --git a/drivers/iio/imu/adis_buffer.c b/drivers/iio/imu/adis_buffer.c index 8fdc4dca10c0..5b4225ee09b9 100644 --- a/drivers/iio/imu/adis_buffer.c +++ b/drivers/iio/imu/adis_buffer.c @@ -23,25 +23,30 @@ static int adis_update_scan_mode_burst(struct iio_dev *indio_dev, const unsigned long *scan_mask) { struct adis *adis = iio_device_get_drvdata(indio_dev); - unsigned int burst_length; + unsigned int burst_length, burst_max_length; u8 *tx; /* All but the timestamp channel */ burst_length = (indio_dev->num_channels - 1) * sizeof(u16); - burst_length += adis->burst->extra_len; + burst_length += adis->burst->extra_len + adis->burst_extra_len; + + if (adis->burst->burst_max_len) + burst_max_length = adis->burst->burst_max_len; + else + burst_max_length = burst_length; adis->xfer = kcalloc(2, sizeof(*adis->xfer), GFP_KERNEL); if (!adis->xfer) return -ENOMEM; - adis->buffer = kzalloc(burst_length + sizeof(u16), GFP_KERNEL); + adis->buffer = kzalloc(burst_max_length + sizeof(u16), GFP_KERNEL); if (!adis->buffer) { kfree(adis->xfer); adis->xfer = NULL; return -ENOMEM; } - tx = adis->buffer + burst_length; + tx = adis->buffer + burst_max_length; tx[0] = ADIS_READ_REG(adis->burst->reg_cmd); tx[1] = 0; diff --git a/include/linux/iio/imu/adis.h b/include/linux/iio/imu/adis.h index 247fc4c7185c..2df67448f0d1 100644 --- a/include/linux/iio/imu/adis.h +++ b/include/linux/iio/imu/adis.h @@ -83,6 +83,8 @@ struct adis_data { *
@trig: IIO trigger object data * @data: ADIS chip variant specific data * @burst: ADIS burst transfer information + * @burst_extra_len: Burst extra length. Should only be used by devices that can + * dynamically change their burst mode length. * @state_lock: Lock used by the device to protect state * @msg: SPI message object * @xfer: SPI transfer objects to be used for a @msg @@ -98,7 +100,7 @@ struct adis { const struct adis_data *data; struct adis_burst *burst; - + unsigned int burst_extra_len; /** * The state_lock is meant to be used during operations that require * a sequence of SPI R/W in order to protect the SPI transfer @@ -502,11 +504,14 @@ int adis_single_conversion(struct iio_dev *indio_dev, * @en burst mode enabled * @reg_cmd register command that triggers burst * @extra_len extra length to account in the SPI RX buffer + * @burst_max_len holds the maximum burst size when the device supports + * more than one burst mode with different sizes */ struct adis_burst { bool en; unsigned int reg_cmd; - unsigned int extra_len; + const u32 extra_len; + const u32 burst_max_len; }; int -- cgit v1.2.3 From 8cf7961dab42c9177a556b719c15f5b9449c24d1 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Sat, 25 Apr 2020 09:53:36 +0200 Subject: block: bypass ->make_request_fn for blk-mq drivers Call blk_mq_make_request when no ->make_request_fn is set. This is safe now that blk_alloc_queue always sets up the pointer for make_request based drivers. This avoids an indirect call in the blk-mq driver I/O fast path, which is rather expensive due to spectre mitigations. Signed-off-by: Christoph Hellwig Signed-off-by: Jens Axboe --- block/blk-core.c | 26 +++++++++++++++++--------- block/blk-mq.c | 4 ++-- drivers/md/dm.c | 3 +++ include/linux/blk-mq.h | 2 ++ 4 files changed, 24 insertions(+), 11 deletions(-) (limited to 'include/linux') diff --git a/block/blk-core.c b/block/blk-core.c index 38e984d95e84..dffff2100888 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -1073,7 +1073,10 @@ blk_qc_t generic_make_request(struct bio *bio) /* Create a fresh bio_list for all subordinate requests */ bio_list_on_stack[1] = bio_list_on_stack[0]; bio_list_init(&bio_list_on_stack[0]); - ret = q->make_request_fn(q, bio); + if (q->make_request_fn) + ret = q->make_request_fn(q, bio); + else + ret = blk_mq_make_request(q, bio); blk_queue_exit(q); @@ -1113,9 +1116,7 @@ EXPORT_SYMBOL(generic_make_request); * * This function behaves like generic_make_request(), but does not protect * against recursion. Must only be used if the called driver is known - * to not call generic_make_request (or direct_make_request) again from - * its make_request function. (Calling direct_make_request again from - * a workqueue is perfectly fine as that doesn't recurse). + * to be blk-mq based. */ blk_qc_t direct_make_request(struct bio *bio) { @@ -1123,20 +1124,27 @@ blk_qc_t direct_make_request(struct bio *bio) bool nowait = bio->bi_opf & REQ_NOWAIT; blk_qc_t ret; + if (WARN_ON_ONCE(q->make_request_fn)) + goto io_error; if (!generic_make_request_checks(bio)) return BLK_QC_T_NONE; if (unlikely(blk_queue_enter(q, nowait ? 
BLK_MQ_REQ_NOWAIT : 0))) { if (nowait && !blk_queue_dying(q)) - bio_wouldblock_error(bio); - else - bio_io_error(bio); - return BLK_QC_T_NONE; + goto would_block; + goto io_error; } - ret = q->make_request_fn(q, bio); + ret = blk_mq_make_request(q, bio); blk_queue_exit(q); return ret; + +would_block: + bio_wouldblock_error(bio); + return BLK_QC_T_NONE; +io_error: + bio_io_error(bio); + return BLK_QC_T_NONE; } EXPORT_SYMBOL_GPL(direct_make_request); diff --git a/block/blk-mq.c b/block/blk-mq.c index 71d0894ce1c5..bcc3a2397d4a 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -1984,7 +1984,7 @@ static void blk_add_rq_to_plug(struct blk_plug *plug, struct request *rq) * * Returns: Request queue cookie. */ -static blk_qc_t blk_mq_make_request(struct request_queue *q, struct bio *bio) +blk_qc_t blk_mq_make_request(struct request_queue *q, struct bio *bio) { const int is_sync = op_is_sync(bio->bi_opf); const int is_flush_fua = op_is_flush(bio->bi_opf); @@ -2096,6 +2096,7 @@ static blk_qc_t blk_mq_make_request(struct request_queue *q, struct bio *bio) return cookie; } +EXPORT_SYMBOL_GPL(blk_mq_make_request); /* only for request based dm */ void blk_mq_free_rqs(struct blk_mq_tag_set *set, struct blk_mq_tags *tags, unsigned int hctx_idx) @@ -2955,7 +2956,6 @@ struct request_queue *blk_mq_init_allocated_queue(struct blk_mq_tag_set *set, INIT_LIST_HEAD(&q->requeue_list); spin_lock_init(&q->requeue_lock); - q->make_request_fn = blk_mq_make_request; q->nr_requests = set->queue_depth; /* diff --git a/drivers/md/dm.c b/drivers/md/dm.c index db9e46114653..0eb93da44ea2 100644 --- a/drivers/md/dm.c +++ b/drivers/md/dm.c @@ -1788,6 +1788,9 @@ static blk_qc_t dm_make_request(struct request_queue *q, struct bio *bio) int srcu_idx; struct dm_table *map; + if (dm_get_md_type(md) == DM_TYPE_REQUEST_BASED) + return blk_mq_make_request(q, bio); + map = dm_get_live_table(md, &srcu_idx); /* if we're suspended, we have to queue this io for later */ diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h index 51fbf6f76593..d7307795439a 100644 --- a/include/linux/blk-mq.h +++ b/include/linux/blk-mq.h @@ -578,4 +578,6 @@ static inline void blk_mq_cleanup_rq(struct request *rq) rq->q->mq_ops->cleanup_rq(rq); } +blk_qc_t blk_mq_make_request(struct request_queue *q, struct bio *bio); + #endif -- cgit v1.2.3 From 0456ea170cd665ddbb9503be92e39f96055dd5fa Mon Sep 17 00:00:00 2001 From: Stanislav Fomichev Date: Mon, 20 Apr 2020 10:46:10 -0700 Subject: bpf: Enable more helpers for BPF_PROG_TYPE_CGROUP_{DEVICE,SYSCTL,SOCKOPT} Currently the following prog types don't fall back to bpf_base_func_proto() (instead they have cgroup_base_func_proto which has a limited set of helpers from bpf_base_func_proto): * BPF_PROG_TYPE_CGROUP_DEVICE * BPF_PROG_TYPE_CGROUP_SYSCTL * BPF_PROG_TYPE_CGROUP_SOCKOPT I don't see any specific reason why we shouldn't use bpf_base_func_proto(), every other type of program (except bpf-lirc and, understandably, tracing) use it, so let's fall back to bpf_base_func_proto for those prog types as well. This basically boils down to adding access to the following helpers: * BPF_FUNC_get_prandom_u32 * BPF_FUNC_get_smp_processor_id * BPF_FUNC_get_numa_node_id * BPF_FUNC_tail_call * BPF_FUNC_ktime_get_ns * BPF_FUNC_spin_lock (CAP_SYS_ADMIN) * BPF_FUNC_spin_unlock (CAP_SYS_ADMIN) * BPF_FUNC_jiffies64 (CAP_SYS_ADMIN) I've also added bpf_perf_event_output() because it's really handy for logging and debugging. 
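

As a rough illustration of what this change enables (not part of the patch; the program name and sampling policy below are invented, and a libbpf-style build is assumed), a BPF_PROG_TYPE_CGROUP_SYSCTL program can now call base helpers such as bpf_get_prandom_u32() directly:

/* Hypothetical sketch only: sysctl_sample is a made-up example program. */
#include <linux/bpf.h>
#include <bpf/bpf_helpers.h>

SEC("cgroup/sysctl")
int sysctl_sample(struct bpf_sysctl *ctx)
{
	/* bpf_get_prandom_u32() is now resolved via bpf_base_func_proto() */
	if ((bpf_get_prandom_u32() & 0xf) == 0)
		bpf_printk("sysctl access, write=%d", ctx->write);
	return 1;	/* allow the access */
}

char _license[] SEC("license") = "GPL";
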
Signed-off-by: Stanislav Fomichev Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/20200420174610.77494-1-sdf@google.com --- include/linux/bpf.h | 1 + kernel/bpf/cgroup.c | 20 +++--------------- net/core/filter.c | 2 +- .../testing/selftests/bpf/verifier/event_output.c | 24 ++++++++++++++++++++++ 4 files changed, 29 insertions(+), 18 deletions(-) (limited to 'include/linux') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index fd2b2322412d..25da6ff2a880 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -1523,6 +1523,7 @@ extern const struct bpf_func_proto bpf_strtoul_proto; extern const struct bpf_func_proto bpf_tcp_sock_proto; extern const struct bpf_func_proto bpf_jiffies64_proto; extern const struct bpf_func_proto bpf_get_ns_current_pid_tgid_proto; +extern const struct bpf_func_proto bpf_event_output_data_proto; const struct bpf_func_proto *bpf_tracing_func_proto( enum bpf_func_id func_id, const struct bpf_prog *prog); diff --git a/kernel/bpf/cgroup.c b/kernel/bpf/cgroup.c index cb305e71e7de..4d748c5785bc 100644 --- a/kernel/bpf/cgroup.c +++ b/kernel/bpf/cgroup.c @@ -1060,30 +1060,16 @@ static const struct bpf_func_proto * cgroup_base_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) { switch (func_id) { - case BPF_FUNC_map_lookup_elem: - return &bpf_map_lookup_elem_proto; - case BPF_FUNC_map_update_elem: - return &bpf_map_update_elem_proto; - case BPF_FUNC_map_delete_elem: - return &bpf_map_delete_elem_proto; - case BPF_FUNC_map_push_elem: - return &bpf_map_push_elem_proto; - case BPF_FUNC_map_pop_elem: - return &bpf_map_pop_elem_proto; - case BPF_FUNC_map_peek_elem: - return &bpf_map_peek_elem_proto; case BPF_FUNC_get_current_uid_gid: return &bpf_get_current_uid_gid_proto; case BPF_FUNC_get_local_storage: return &bpf_get_local_storage_proto; case BPF_FUNC_get_current_cgroup_id: return &bpf_get_current_cgroup_id_proto; - case BPF_FUNC_trace_printk: - if (capable(CAP_SYS_ADMIN)) - return bpf_get_trace_printk_proto(); - /* fall through */ + case BPF_FUNC_perf_event_output: + return &bpf_event_output_data_proto; default: - return NULL; + return bpf_base_func_proto(func_id); } } diff --git a/net/core/filter.c b/net/core/filter.c index 7d6ceaa54d21..a943df3ad8b0 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -4214,7 +4214,7 @@ BPF_CALL_5(bpf_event_output_data, void *, ctx, struct bpf_map *, map, u64, flags return bpf_event_output(map, flags, data, size, NULL, 0, NULL); } -static const struct bpf_func_proto bpf_event_output_data_proto = { +const struct bpf_func_proto bpf_event_output_data_proto = { .func = bpf_event_output_data, .gpl_only = true, .ret_type = RET_INTEGER, diff --git a/tools/testing/selftests/bpf/verifier/event_output.c b/tools/testing/selftests/bpf/verifier/event_output.c index 130553e19eca..99f8f582c02b 100644 --- a/tools/testing/selftests/bpf/verifier/event_output.c +++ b/tools/testing/selftests/bpf/verifier/event_output.c @@ -92,3 +92,27 @@ .result = ACCEPT, .retval = 1, }, +{ + "perfevent for cgroup dev", + .insns = { __PERF_EVENT_INSNS__ }, + .prog_type = BPF_PROG_TYPE_CGROUP_DEVICE, + .fixup_map_event_output = { 4 }, + .result = ACCEPT, + .retval = 1, +}, +{ + "perfevent for cgroup sysctl", + .insns = { __PERF_EVENT_INSNS__ }, + .prog_type = BPF_PROG_TYPE_CGROUP_SYSCTL, + .fixup_map_event_output = { 4 }, + .result = ACCEPT, + .retval = 1, +}, +{ + "perfevent for cgroup sockopt", + .insns = { __PERF_EVENT_INSNS__ }, + .prog_type = BPF_PROG_TYPE_CGROUP_SOCKOPT, + .fixup_map_event_output = { 4 }, + .result = 
ACCEPT, + .retval = 1, +}, -- cgit v1.2.3 From 6890896bd765b0504761c61901c9804fca23bfb2 Mon Sep 17 00:00:00 2001 From: Stanislav Fomichev Date: Fri, 24 Apr 2020 16:59:41 -0700 Subject: bpf: Fix missing bpf_base_func_proto in cgroup_base_func_proto for CGROUP_NET=n linux-next build bot reported compile issue [1] with one of its configs. It looks like when we have CONFIG_NET=n and CONFIG_BPF{,_SYSCALL}=y, we are missing the bpf_base_func_proto definition (from net/core/filter.c) in cgroup_base_func_proto. I'm reshuffling the code a bit to make it work. The common helpers are moved into kernel/bpf/helpers.c and the bpf_base_func_proto is exported from there. Also, bpf_get_raw_cpu_id goes into kernel/bpf/core.c akin to existing bpf_user_rnd_u32. [1] https://lore.kernel.org/linux-next/CAKH8qBsBvKHswiX1nx40LgO+BGeTmb1NX8tiTttt_0uu6T3dCA@mail.gmail.com/T/#mff8b0c083314c68c2e2ef0211cb11bc20dc13c72 Fixes: 0456ea170cd6 ("bpf: Enable more helpers for BPF_PROG_TYPE_CGROUP_{DEVICE,SYSCTL,SOCKOPT}") Signed-off-by: Stanislav Fomichev Signed-off-by: Alexei Starovoitov Cc: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20200424235941.58382-1-sdf@google.com --- include/linux/bpf.h | 8 ++++++ include/linux/filter.h | 2 -- kernel/bpf/core.c | 5 ++++ kernel/bpf/helpers.c | 73 ++++++++++++++++++++++++++++++++++++++++++++++ net/core/filter.c | 78 +------------------------------------------------- 5 files changed, 87 insertions(+), 79 deletions(-) (limited to 'include/linux') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 25da6ff2a880..5147e11e53ff 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -1215,6 +1215,7 @@ int btf_check_type_match(struct bpf_verifier_env *env, struct bpf_prog *prog, struct bpf_prog *bpf_prog_by_id(u32 id); +const struct bpf_func_proto *bpf_base_func_proto(enum bpf_func_id func_id); #else /* !CONFIG_BPF_SYSCALL */ static inline struct bpf_prog *bpf_prog_get(u32 ufd) { @@ -1365,6 +1366,12 @@ static inline struct bpf_prog *bpf_prog_by_id(u32 id) { return ERR_PTR(-ENOTSUPP); } + +static inline const struct bpf_func_proto * +bpf_base_func_proto(enum bpf_func_id func_id) +{ + return NULL; +} #endif /* CONFIG_BPF_SYSCALL */ static inline struct bpf_prog *bpf_prog_get_type(u32 ufd, @@ -1531,6 +1538,7 @@ const struct bpf_func_proto *bpf_tracing_func_proto( /* Shared helpers among cBPF and eBPF. 
*/ void bpf_user_rnd_init_once(void); u64 bpf_user_rnd_u32(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5); +u64 bpf_get_raw_cpu_id(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5); #if defined(CONFIG_NET) bool bpf_sock_common_is_valid_access(int off, int size, diff --git a/include/linux/filter.h b/include/linux/filter.h index 9b5aa5c483cc..af37318bb1c5 100644 --- a/include/linux/filter.h +++ b/include/linux/filter.h @@ -863,8 +863,6 @@ int bpf_prog_create(struct bpf_prog **pfp, struct sock_fprog_kern *fprog); int bpf_prog_create_from_user(struct bpf_prog **pfp, struct sock_fprog *fprog, bpf_aux_classic_check_t trans, bool save_orig); void bpf_prog_destroy(struct bpf_prog *fp); -const struct bpf_func_proto * -bpf_base_func_proto(enum bpf_func_id func_id); int sk_attach_filter(struct sock_fprog *fprog, struct sock *sk); int sk_attach_bpf(u32 ufd, struct sock *sk); diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c index 916f5132a984..0cc91805069a 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c @@ -2136,6 +2136,11 @@ BPF_CALL_0(bpf_user_rnd_u32) return res; } +BPF_CALL_0(bpf_get_raw_cpu_id) +{ + return raw_smp_processor_id(); +} + /* Weak definitions of helper functions in case we don't have bpf syscall. */ const struct bpf_func_proto bpf_map_lookup_elem_proto __weak; const struct bpf_func_proto bpf_map_update_elem_proto __weak; diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c index bafc53ddd350..dbba4f41d508 100644 --- a/kernel/bpf/helpers.c +++ b/kernel/bpf/helpers.c @@ -562,3 +562,76 @@ const struct bpf_func_proto bpf_get_ns_current_pid_tgid_proto = { .arg3_type = ARG_PTR_TO_UNINIT_MEM, .arg4_type = ARG_CONST_SIZE, }; + +static const struct bpf_func_proto bpf_get_raw_smp_processor_id_proto = { + .func = bpf_get_raw_cpu_id, + .gpl_only = false, + .ret_type = RET_INTEGER, +}; + +BPF_CALL_5(bpf_event_output_data, void *, ctx, struct bpf_map *, map, + u64, flags, void *, data, u64, size) +{ + if (unlikely(flags & ~(BPF_F_INDEX_MASK))) + return -EINVAL; + + return bpf_event_output(map, flags, data, size, NULL, 0, NULL); +} + +const struct bpf_func_proto bpf_event_output_data_proto = { + .func = bpf_event_output_data, + .gpl_only = true, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_CTX, + .arg2_type = ARG_CONST_MAP_PTR, + .arg3_type = ARG_ANYTHING, + .arg4_type = ARG_PTR_TO_MEM, + .arg5_type = ARG_CONST_SIZE_OR_ZERO, +}; + +const struct bpf_func_proto * +bpf_base_func_proto(enum bpf_func_id func_id) +{ + switch (func_id) { + case BPF_FUNC_map_lookup_elem: + return &bpf_map_lookup_elem_proto; + case BPF_FUNC_map_update_elem: + return &bpf_map_update_elem_proto; + case BPF_FUNC_map_delete_elem: + return &bpf_map_delete_elem_proto; + case BPF_FUNC_map_push_elem: + return &bpf_map_push_elem_proto; + case BPF_FUNC_map_pop_elem: + return &bpf_map_pop_elem_proto; + case BPF_FUNC_map_peek_elem: + return &bpf_map_peek_elem_proto; + case BPF_FUNC_get_prandom_u32: + return &bpf_get_prandom_u32_proto; + case BPF_FUNC_get_smp_processor_id: + return &bpf_get_raw_smp_processor_id_proto; + case BPF_FUNC_get_numa_node_id: + return &bpf_get_numa_node_id_proto; + case BPF_FUNC_tail_call: + return &bpf_tail_call_proto; + case BPF_FUNC_ktime_get_ns: + return &bpf_ktime_get_ns_proto; + default: + break; + } + + if (!capable(CAP_SYS_ADMIN)) + return NULL; + + switch (func_id) { + case BPF_FUNC_spin_lock: + return &bpf_spin_lock_proto; + case BPF_FUNC_spin_unlock: + return &bpf_spin_unlock_proto; + case BPF_FUNC_trace_printk: + return bpf_get_trace_printk_proto(); + case BPF_FUNC_jiffies64: + return 
&bpf_jiffies64_proto; + default: + return NULL; + } +} diff --git a/net/core/filter.c b/net/core/filter.c index a943df3ad8b0..a605626142b6 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -256,17 +256,6 @@ BPF_CALL_2(bpf_skb_load_helper_32_no_cache, const struct sk_buff *, skb, offset); } -BPF_CALL_0(bpf_get_raw_cpu_id) -{ - return raw_smp_processor_id(); -} - -static const struct bpf_func_proto bpf_get_raw_smp_processor_id_proto = { - .func = bpf_get_raw_cpu_id, - .gpl_only = false, - .ret_type = RET_INTEGER, -}; - static u32 convert_skb_access(int skb_field, int dst_reg, int src_reg, struct bpf_insn *insn_buf) { @@ -4205,26 +4194,6 @@ static const struct bpf_func_proto bpf_get_socket_uid_proto = { .arg1_type = ARG_PTR_TO_CTX, }; -BPF_CALL_5(bpf_event_output_data, void *, ctx, struct bpf_map *, map, u64, flags, - void *, data, u64, size) -{ - if (unlikely(flags & ~(BPF_F_INDEX_MASK))) - return -EINVAL; - - return bpf_event_output(map, flags, data, size, NULL, 0, NULL); -} - -const struct bpf_func_proto bpf_event_output_data_proto = { - .func = bpf_event_output_data, - .gpl_only = true, - .ret_type = RET_INTEGER, - .arg1_type = ARG_PTR_TO_CTX, - .arg2_type = ARG_CONST_MAP_PTR, - .arg3_type = ARG_ANYTHING, - .arg4_type = ARG_PTR_TO_MEM, - .arg5_type = ARG_CONST_SIZE_OR_ZERO, -}; - BPF_CALL_5(bpf_setsockopt, struct bpf_sock_ops_kern *, bpf_sock, int, level, int, optname, char *, optval, int, optlen) { @@ -5983,52 +5952,7 @@ bool bpf_helper_changes_pkt_data(void *func) return false; } -const struct bpf_func_proto * -bpf_base_func_proto(enum bpf_func_id func_id) -{ - switch (func_id) { - case BPF_FUNC_map_lookup_elem: - return &bpf_map_lookup_elem_proto; - case BPF_FUNC_map_update_elem: - return &bpf_map_update_elem_proto; - case BPF_FUNC_map_delete_elem: - return &bpf_map_delete_elem_proto; - case BPF_FUNC_map_push_elem: - return &bpf_map_push_elem_proto; - case BPF_FUNC_map_pop_elem: - return &bpf_map_pop_elem_proto; - case BPF_FUNC_map_peek_elem: - return &bpf_map_peek_elem_proto; - case BPF_FUNC_get_prandom_u32: - return &bpf_get_prandom_u32_proto; - case BPF_FUNC_get_smp_processor_id: - return &bpf_get_raw_smp_processor_id_proto; - case BPF_FUNC_get_numa_node_id: - return &bpf_get_numa_node_id_proto; - case BPF_FUNC_tail_call: - return &bpf_tail_call_proto; - case BPF_FUNC_ktime_get_ns: - return &bpf_ktime_get_ns_proto; - default: - break; - } - - if (!capable(CAP_SYS_ADMIN)) - return NULL; - - switch (func_id) { - case BPF_FUNC_spin_lock: - return &bpf_spin_lock_proto; - case BPF_FUNC_spin_unlock: - return &bpf_spin_unlock_proto; - case BPF_FUNC_trace_printk: - return bpf_get_trace_printk_proto(); - case BPF_FUNC_jiffies64: - return &bpf_jiffies64_proto; - default: - return NULL; - } -} +const struct bpf_func_proto bpf_event_output_data_proto __weak; static const struct bpf_func_proto * sock_filter_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) -- cgit v1.2.3 From 71d19214776e61b33da48f7c1b46e522c7f78221 Mon Sep 17 00:00:00 2001 From: Maciej Żenczykowski Date: Sun, 26 Apr 2020 09:15:25 -0700 Subject: bpf: add bpf_ktime_get_boot_ns() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit On a device like a cellphone which is constantly suspending and resuming CLOCK_MONOTONIC is not particularly useful for keeping track of or reacting to external network events. Instead you want to use CLOCK_BOOTTIME. Hence add bpf_ktime_get_boot_ns() as a mirror of bpf_ktime_get_ns() based around CLOCK_BOOTTIME instead of CLOCK_MONOTONIC. 
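

For intuition, the distinction the new helper exposes is the same one clock_gettime() shows in userspace; the snippet below is a hedged userspace analogue, not kernel code: CLOCK_BOOTTIME keeps advancing across suspend while CLOCK_MONOTONIC does not, so the two drift apart on devices that suspend often.

/* Userspace illustration (Linux-specific clocks assumed). */
#include <stdio.h>
#include <time.h>

int main(void)
{
	struct timespec mono, boot;

	clock_gettime(CLOCK_MONOTONIC, &mono);
	clock_gettime(CLOCK_BOOTTIME, &boot);
	/* boot - mono is roughly the total time spent suspended */
	printf("monotonic=%ld.%09ld boottime=%ld.%09ld\n",
	       (long)mono.tv_sec, mono.tv_nsec,
	       (long)boot.tv_sec, boot.tv_nsec);
	return 0;
}
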
Signed-off-by: Maciej Żenczykowski Signed-off-by: Alexei Starovoitov --- drivers/media/rc/bpf-lirc.c | 2 ++ include/linux/bpf.h | 1 + include/uapi/linux/bpf.h | 13 ++++++++++++- kernel/bpf/core.c | 1 + kernel/bpf/helpers.c | 14 ++++++++++++++ kernel/trace/bpf_trace.c | 2 ++ tools/include/uapi/linux/bpf.h | 13 ++++++++++++- 7 files changed, 44 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/drivers/media/rc/bpf-lirc.c b/drivers/media/rc/bpf-lirc.c index 0f3417d161b8..069c42f22a8c 100644 --- a/drivers/media/rc/bpf-lirc.c +++ b/drivers/media/rc/bpf-lirc.c @@ -103,6 +103,8 @@ lirc_mode2_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) return &bpf_map_peek_elem_proto; case BPF_FUNC_ktime_get_ns: return &bpf_ktime_get_ns_proto; + case BPF_FUNC_ktime_get_boot_ns: + return &bpf_ktime_get_boot_ns_proto; case BPF_FUNC_tail_call: return &bpf_tail_call_proto; case BPF_FUNC_get_prandom_u32: diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 5147e11e53ff..10960cfabea4 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -1509,6 +1509,7 @@ extern const struct bpf_func_proto bpf_get_smp_processor_id_proto; extern const struct bpf_func_proto bpf_get_numa_node_id_proto; extern const struct bpf_func_proto bpf_tail_call_proto; extern const struct bpf_func_proto bpf_ktime_get_ns_proto; +extern const struct bpf_func_proto bpf_ktime_get_boot_ns_proto; extern const struct bpf_func_proto bpf_get_current_pid_tgid_proto; extern const struct bpf_func_proto bpf_get_current_uid_gid_proto; extern const struct bpf_func_proto bpf_get_current_comm_proto; diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 7bbf1b65be10..4a6c47f3febe 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -652,6 +652,8 @@ union bpf_attr { * u64 bpf_ktime_get_ns(void) * Description * Return the time elapsed since system boot, in nanoseconds. + * Does not include time the system was suspended. + * See: clock_gettime(CLOCK_MONOTONIC) * Return * Current *ktime*. * @@ -3025,6 +3027,14 @@ union bpf_attr { * * **-EOPNOTSUPP** Unsupported operation, for example a * call from outside of TC ingress. * * **-ESOCKTNOSUPPORT** Socket type not supported (reuseport). + * + * u64 bpf_ktime_get_boot_ns(void) + * Description + * Return the time elapsed since system boot, in nanoseconds. + * Does include the time the system was suspended. + * See: clock_gettime(CLOCK_BOOTTIME) + * Return + * Current *ktime*. 
*/ #define __BPF_FUNC_MAPPER(FN) \ FN(unspec), \ @@ -3151,7 +3161,8 @@ union bpf_attr { FN(xdp_output), \ FN(get_netns_cookie), \ FN(get_current_ancestor_cgroup_id), \ - FN(sk_assign), + FN(sk_assign), \ + FN(ktime_get_boot_ns), /* integer value in 'imm' field of BPF_CALL instruction selects which helper * function eBPF program intends to call diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c index 0cc91805069a..6aa11de67315 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c @@ -2156,6 +2156,7 @@ const struct bpf_func_proto bpf_get_prandom_u32_proto __weak; const struct bpf_func_proto bpf_get_smp_processor_id_proto __weak; const struct bpf_func_proto bpf_get_numa_node_id_proto __weak; const struct bpf_func_proto bpf_ktime_get_ns_proto __weak; +const struct bpf_func_proto bpf_ktime_get_boot_ns_proto __weak; const struct bpf_func_proto bpf_get_current_pid_tgid_proto __weak; const struct bpf_func_proto bpf_get_current_uid_gid_proto __weak; diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c index 9a6b23387d02..5c0290e0696e 100644 --- a/kernel/bpf/helpers.c +++ b/kernel/bpf/helpers.c @@ -155,6 +155,18 @@ const struct bpf_func_proto bpf_ktime_get_ns_proto = { .ret_type = RET_INTEGER, }; +BPF_CALL_0(bpf_ktime_get_boot_ns) +{ + /* NMI safe access to clock boottime */ + return ktime_get_boot_fast_ns(); +} + +const struct bpf_func_proto bpf_ktime_get_boot_ns_proto = { + .func = bpf_ktime_get_boot_ns, + .gpl_only = false, + .ret_type = RET_INTEGER, +}; + BPF_CALL_0(bpf_get_current_pid_tgid) { struct task_struct *task = current; @@ -615,6 +627,8 @@ bpf_base_func_proto(enum bpf_func_id func_id) return &bpf_tail_call_proto; case BPF_FUNC_ktime_get_ns: return &bpf_ktime_get_ns_proto; + case BPF_FUNC_ktime_get_boot_ns: + return &bpf_ktime_get_boot_ns_proto; default: break; } diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c index ca1796747a77..e875c95d3ced 100644 --- a/kernel/trace/bpf_trace.c +++ b/kernel/trace/bpf_trace.c @@ -797,6 +797,8 @@ bpf_tracing_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) return &bpf_map_peek_elem_proto; case BPF_FUNC_ktime_get_ns: return &bpf_ktime_get_ns_proto; + case BPF_FUNC_ktime_get_boot_ns: + return &bpf_ktime_get_boot_ns_proto; case BPF_FUNC_tail_call: return &bpf_tail_call_proto; case BPF_FUNC_get_current_pid_tgid: diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h index 7bbf1b65be10..4a6c47f3febe 100644 --- a/tools/include/uapi/linux/bpf.h +++ b/tools/include/uapi/linux/bpf.h @@ -652,6 +652,8 @@ union bpf_attr { * u64 bpf_ktime_get_ns(void) * Description * Return the time elapsed since system boot, in nanoseconds. + * Does not include time the system was suspended. + * See: clock_gettime(CLOCK_MONOTONIC) * Return * Current *ktime*. * @@ -3025,6 +3027,14 @@ union bpf_attr { * * **-EOPNOTSUPP** Unsupported operation, for example a * call from outside of TC ingress. * * **-ESOCKTNOSUPPORT** Socket type not supported (reuseport). + * + * u64 bpf_ktime_get_boot_ns(void) + * Description + * Return the time elapsed since system boot, in nanoseconds. + * Does include the time the system was suspended. + * See: clock_gettime(CLOCK_BOOTTIME) + * Return + * Current *ktime*. 
*/ #define __BPF_FUNC_MAPPER(FN) \ FN(unspec), \ @@ -3151,7 +3161,8 @@ union bpf_attr { FN(xdp_output), \ FN(get_netns_cookie), \ FN(get_current_ancestor_cgroup_id), \ - FN(sk_assign), + FN(sk_assign), \ + FN(ktime_get_boot_ns), /* integer value in 'imm' field of BPF_CALL instruction selects which helper * function eBPF program intends to call -- cgit v1.2.3 From 26363af5643490a817272e1cc6f1d3f1d550a699 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Fri, 24 Apr 2020 08:43:35 +0200 Subject: mm: remove watermark_boost_factor_sysctl_handler watermark_boost_factor_sysctl_handler is just a pointless wrapper for proc_dointvec_minmax, so remove it and use proc_dointvec_minmax directly. Signed-off-by: Christoph Hellwig Acked-by: David Rientjes Signed-off-by: Al Viro --- include/linux/mmzone.h | 2 -- kernel/sysctl.c | 2 +- mm/page_alloc.c | 12 ------------ 3 files changed, 1 insertion(+), 15 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index 1b9de7d220fb..f37bb8f187fc 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -911,8 +911,6 @@ static inline int is_highmem(struct zone *zone) struct ctl_table; int min_free_kbytes_sysctl_handler(struct ctl_table *, int, void __user *, size_t *, loff_t *); -int watermark_boost_factor_sysctl_handler(struct ctl_table *, int, - void __user *, size_t *, loff_t *); int watermark_scale_factor_sysctl_handler(struct ctl_table *, int, void __user *, size_t *, loff_t *); extern int sysctl_lowmem_reserve_ratio[MAX_NR_ZONES]; diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 8a176d8727a3..99d27acf4646 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -1491,7 +1491,7 @@ static struct ctl_table vm_table[] = { .data = &watermark_boost_factor, .maxlen = sizeof(watermark_boost_factor), .mode = 0644, - .proc_handler = watermark_boost_factor_sysctl_handler, + .proc_handler = proc_dointvec_minmax, .extra1 = SYSCTL_ZERO, }, { diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 69827d4fa052..62c1550cd43e 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -7978,18 +7978,6 @@ int min_free_kbytes_sysctl_handler(struct ctl_table *table, int write, return 0; } -int watermark_boost_factor_sysctl_handler(struct ctl_table *table, int write, - void __user *buffer, size_t *length, loff_t *ppos) -{ - int rc; - - rc = proc_dointvec_minmax(table, write, buffer, length, ppos); - if (rc) - return rc; - - return 0; -} - int watermark_scale_factor_sysctl_handler(struct ctl_table *table, int write, void __user *buffer, size_t *length, loff_t *ppos) { -- cgit v1.2.3 From 2374c09b1c8a883bb9b4b2fc3756703eeb618f4a Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Fri, 24 Apr 2020 08:43:36 +0200 Subject: sysctl: remove all extern declaration from sysctl.c Extern declarations in .c files are a bad style and can lead to mismatches. Use existing definitions in headers where they exist, and otherwise move the external declarations to suitable header files. 
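

A minimal sketch of the mismatch risk being removed (the file and variable names here are invented for illustration):

/* foo.c - owns the definition */
int foo_limit = 42;

/* bar.c - bad style: a local extern duplicates the declaration; if
 * foo.c later changes the type to long, this still compiles but now
 * refers to a mismatched object. */
extern int foo_limit;

/* foo.h - the fix: a single declaration the compiler checks against
 * both the definition and every user. */
extern int foo_limit;
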
Signed-off-by: Christoph Hellwig Signed-off-by: Al Viro --- include/linux/coredump.h | 4 ++++ include/linux/file.h | 2 ++ include/linux/mm.h | 2 ++ include/linux/mmzone.h | 2 ++ include/linux/pid.h | 3 +++ include/linux/sysctl.h | 8 ++++++++ kernel/sysctl.c | 45 +++------------------------------------------ 7 files changed, 24 insertions(+), 42 deletions(-) (limited to 'include/linux') diff --git a/include/linux/coredump.h b/include/linux/coredump.h index abf4b4e65dbb..7a899e83835d 100644 --- a/include/linux/coredump.h +++ b/include/linux/coredump.h @@ -22,4 +22,8 @@ extern void do_coredump(const kernel_siginfo_t *siginfo); static inline void do_coredump(const kernel_siginfo_t *siginfo) {} #endif +extern int core_uses_pid; +extern char core_pattern[]; +extern unsigned int core_pipe_limit; + #endif /* _LINUX_COREDUMP_H */ diff --git a/include/linux/file.h b/include/linux/file.h index 142d102f285e..122f80084a3e 100644 --- a/include/linux/file.h +++ b/include/linux/file.h @@ -94,4 +94,6 @@ extern void fd_install(unsigned int fd, struct file *file); extern void flush_delayed_fput(void); extern void __fput_sync(struct file *); +extern unsigned int sysctl_nr_open_min, sysctl_nr_open_max; + #endif /* __LINUX_FILE_H */ diff --git a/include/linux/mm.h b/include/linux/mm.h index 5a323422d783..9c4e7e76dedd 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -3140,5 +3140,7 @@ unsigned long wp_shared_mapping_range(struct address_space *mapping, pgoff_t first_index, pgoff_t nr); #endif +extern int sysctl_nr_trim_pages; + #endif /* __KERNEL__ */ #endif /* _LINUX_MM_H */ diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index f37bb8f187fc..b2af594ef0f7 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -909,6 +909,7 @@ static inline int is_highmem(struct zone *zone) /* These two functions are used to setup the per zone pages min values */ struct ctl_table; + int min_free_kbytes_sysctl_handler(struct ctl_table *, int, void __user *, size_t *, loff_t *); int watermark_scale_factor_sysctl_handler(struct ctl_table *, int, @@ -925,6 +926,7 @@ int sysctl_min_slab_ratio_sysctl_handler(struct ctl_table *, int, extern int numa_zonelist_order_handler(struct ctl_table *, int, void __user *, size_t *, loff_t *); +extern int percpu_pagelist_fraction; extern char numa_zonelist_order[]; #define NUMA_ZONELIST_ORDER_LEN 16 diff --git a/include/linux/pid.h b/include/linux/pid.h index cc896f0fc4e3..93543cbc0e6b 100644 --- a/include/linux/pid.h +++ b/include/linux/pid.h @@ -108,6 +108,9 @@ extern void transfer_pid(struct task_struct *old, struct task_struct *new, struct pid_namespace; extern struct pid_namespace init_pid_ns; +extern int pid_max; +extern int pid_max_min, pid_max_max; + /* * look up a PID in the hash table. Must be called with the tasklist_lock * or rcu_read_lock() held. 
diff --git a/include/linux/sysctl.h b/include/linux/sysctl.h index 02fa84493f23..36143ca40b56 100644 --- a/include/linux/sysctl.h +++ b/include/linux/sysctl.h @@ -207,7 +207,15 @@ void unregister_sysctl_table(struct ctl_table_header * table); extern int sysctl_init(void); +extern int pwrsw_enabled; +extern int unaligned_enabled; +extern int unaligned_dump_stack; +extern int no_unaligned_warning; + extern struct ctl_table sysctl_mount_point[]; +extern struct ctl_table random_table[]; +extern struct ctl_table firmware_config_table[]; +extern struct ctl_table epoll_table[]; #else /* CONFIG_SYSCTL */ static inline struct ctl_table_header *register_sysctl_table(struct ctl_table * table) diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 99d27acf4646..31b934865ebc 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -68,6 +68,9 @@ #include #include #include +#include +#include +#include #include "../lib/kstrtox.h" @@ -103,22 +106,6 @@ #if defined(CONFIG_SYSCTL) -/* External variables not in a header file. */ -extern int suid_dumpable; -#ifdef CONFIG_COREDUMP -extern int core_uses_pid; -extern char core_pattern[]; -extern unsigned int core_pipe_limit; -#endif -extern int pid_max; -extern int pid_max_min, pid_max_max; -extern int percpu_pagelist_fraction; -extern int latencytop_enabled; -extern unsigned int sysctl_nr_open_min, sysctl_nr_open_max; -#ifndef CONFIG_MMU -extern int sysctl_nr_trim_pages; -#endif - /* Constants used for minimum and maximum */ #ifdef CONFIG_LOCKUP_DETECTOR static int sixty = 60; @@ -160,24 +147,6 @@ static unsigned long hung_task_timeout_max = (LONG_MAX/HZ); #ifdef CONFIG_INOTIFY_USER #include #endif -#ifdef CONFIG_SPARC -#endif - -#ifdef CONFIG_PARISC -extern int pwrsw_enabled; -#endif - -#ifdef CONFIG_SYSCTL_ARCH_UNALIGN_ALLOW -extern int unaligned_enabled; -#endif - -#ifdef CONFIG_IA64 -extern int unaligned_dump_stack; -#endif - -#ifdef CONFIG_SYSCTL_ARCH_UNALIGN_NO_WARN -extern int no_unaligned_warning; -#endif #ifdef CONFIG_PROC_SYSCTL @@ -243,14 +212,6 @@ static struct ctl_table vm_table[]; static struct ctl_table fs_table[]; static struct ctl_table debug_table[]; static struct ctl_table dev_table[]; -extern struct ctl_table random_table[]; -#ifdef CONFIG_EPOLL -extern struct ctl_table epoll_table[]; -#endif - -#ifdef CONFIG_FW_LOADER_USER_HELPER -extern struct ctl_table firmware_config_table[]; -#endif #if defined(HAVE_ARCH_PICK_MMAP_LAYOUT) || \ defined(CONFIG_ARCH_WANT_DEFAULT_TOPDOWN_MMAP_LAYOUT) -- cgit v1.2.3 From 32927393dc1ccd60fb2bdc05b9e8e88753761469 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Fri, 24 Apr 2020 08:43:38 +0200 Subject: sysctl: pass kernel pointers to ->proc_handler Instead of having all the sysctl handlers deal with user pointers, which is rather hairy in terms of the BPF interaction, copy the input to and from userspace in common code. This also means that the strings are always NUL-terminated by the common code, making the API a little bit safer. As most handlers just pass the data through to one of the common handlers, a lot of the changes are mechanical. 
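

A minimal sketch of a string handler under the new calling convention (my_string and the handler name are invented, and newline handling is omitted for brevity); because buffer is now kernel memory, plain memcpy()/strscpy() replace copy_{to,from}_user():

static char my_string[16];

static int my_string_handler(struct ctl_table *table, int write,
			     void *buffer, size_t *lenp, loff_t *ppos)
{
	if (write) {
		/* buffer is kernel memory, NUL-terminated by the core */
		strscpy(my_string, buffer, sizeof(my_string));
		*ppos += *lenp;
	} else {
		size_t len = min(strlen(my_string), *lenp);

		/* plain memcpy instead of copy_to_user() */
		memcpy(buffer, my_string, len);
		*lenp = len;
		*ppos += len;
	}
	return 0;
}
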
Signed-off-by: Christoph Hellwig Acked-by: Andrey Ignatov Signed-off-by: Al Viro --- arch/arm64/kernel/armv8_deprecated.c | 2 +- arch/arm64/kernel/fpsimd.c | 3 +- arch/mips/lasat/sysctl.c | 13 +- arch/s390/appldata/appldata_base.c | 11 +- arch/s390/kernel/debug.c | 2 +- arch/s390/kernel/topology.c | 2 +- arch/s390/mm/cmm.c | 12 +- arch/x86/kernel/itmt.c | 3 +- drivers/cdrom/cdrom.c | 2 +- drivers/char/random.c | 2 +- drivers/macintosh/mac_hid.c | 3 +- drivers/parport/procfs.c | 39 +++--- fs/dcache.c | 2 +- fs/drop_caches.c | 2 +- fs/file_table.c | 4 +- fs/fscache/main.c | 3 +- fs/inode.c | 2 +- fs/proc/proc_sysctl.c | 47 ++++--- fs/quota/dquot.c | 2 +- fs/xfs/xfs_sysctl.c | 4 +- include/linux/bpf-cgroup.h | 9 +- include/linux/compaction.h | 2 +- include/linux/fs.h | 6 +- include/linux/ftrace.h | 3 +- include/linux/hugetlb.h | 15 +- include/linux/kprobes.h | 2 +- include/linux/latencytop.h | 4 +- include/linux/mm.h | 12 +- include/linux/mmzone.h | 23 ++- include/linux/nmi.h | 15 +- include/linux/perf_event.h | 13 +- include/linux/printk.h | 2 +- include/linux/sched/sysctl.h | 44 ++---- include/linux/security.h | 2 +- include/linux/sysctl.h | 53 +++---- include/linux/timer.h | 3 +- include/linux/vmstat.h | 8 +- include/linux/writeback.h | 28 ++-- ipc/ipc_sysctl.c | 10 +- ipc/mq_sysctl.c | 4 +- kernel/bpf/cgroup.c | 35 ++--- kernel/events/callchain.c | 2 +- kernel/events/core.c | 6 +- kernel/kprobes.c | 2 +- kernel/latencytop.c | 4 +- kernel/pid_namespace.c | 2 +- kernel/printk/printk.c | 2 +- kernel/sched/core.c | 9 +- kernel/sched/fair.c | 3 +- kernel/sched/rt.c | 10 +- kernel/sched/topology.c | 2 +- kernel/seccomp.c | 2 +- kernel/sysctl.c | 239 ++++++++++++-------------------- kernel/time/timer.c | 3 +- kernel/trace/trace.c | 2 +- kernel/umh.c | 2 +- kernel/utsname_sysctl.c | 2 +- kernel/watchdog.c | 12 +- mm/compaction.c | 2 +- mm/hugetlb.c | 9 +- mm/page-writeback.c | 16 +-- mm/page_alloc.c | 30 ++-- mm/util.c | 10 +- mm/vmstat.c | 4 +- net/bridge/br_netfilter_hooks.c | 2 +- net/core/neighbour.c | 28 ++-- net/core/sysctl_net_core.c | 27 ++-- net/decnet/dn_dev.c | 7 +- net/decnet/sysctl_net_decnet.c | 27 ++-- net/ipv4/devinet.c | 9 +- net/ipv4/route.c | 3 +- net/ipv4/sysctl_net_ipv4.c | 38 ++--- net/ipv6/addrconf.c | 33 ++--- net/ipv6/ndisc.c | 3 +- net/ipv6/route.c | 5 +- net/ipv6/sysctl_net_ipv6.c | 3 +- net/mpls/af_mpls.c | 5 +- net/netfilter/ipvs/ip_vs_ctl.c | 6 +- net/netfilter/nf_conntrack_standalone.c | 2 +- net/netfilter/nf_log.c | 2 +- net/phonet/sysctl.c | 3 +- net/rds/tcp.c | 6 +- net/sctp/sysctl.c | 32 ++--- net/sunrpc/sysctl.c | 29 ++-- net/sunrpc/xprtrdma/svc_rdma.c | 7 +- security/apparmor/lsm.c | 2 +- security/min_addr.c | 2 +- security/yama/yama_lsm.c | 2 +- 88 files changed, 458 insertions(+), 653 deletions(-) (limited to 'include/linux') diff --git a/arch/arm64/kernel/armv8_deprecated.c b/arch/arm64/kernel/armv8_deprecated.c index c19aa81ddc8c..7364de008bab 100644 --- a/arch/arm64/kernel/armv8_deprecated.c +++ b/arch/arm64/kernel/armv8_deprecated.c @@ -203,7 +203,7 @@ static void __init register_insn_emulation(struct insn_emulation_ops *ops) } static int emulation_proc_handler(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, + void *buffer, size_t *lenp, loff_t *ppos) { int ret = 0; diff --git a/arch/arm64/kernel/fpsimd.c b/arch/arm64/kernel/fpsimd.c index 94289d126993..35cb5e66c504 100644 --- a/arch/arm64/kernel/fpsimd.c +++ b/arch/arm64/kernel/fpsimd.c @@ -341,8 +341,7 @@ static unsigned int find_supported_vector_length(unsigned int vl) #ifdef 
CONFIG_SYSCTL static int sve_proc_do_default_vl(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, - loff_t *ppos) + void *buffer, size_t *lenp, loff_t *ppos) { int ret; int vl = sve_default_vl; diff --git a/arch/mips/lasat/sysctl.c b/arch/mips/lasat/sysctl.c index e666fe26c50d..2119541a5b8b 100644 --- a/arch/mips/lasat/sysctl.c +++ b/arch/mips/lasat/sysctl.c @@ -95,16 +95,15 @@ int proc_lasat_ip(struct ctl_table *table, int write, len = 0; p = buffer; while (len < *lenp) { - if (get_user(c, p++)) - return -EFAULT; + c = *p; + p++; if (c == 0 || c == '\n') break; len++; } if (len >= sizeof(ipbuf)-1) len = sizeof(ipbuf) - 1; - if (copy_from_user(ipbuf, buffer, len)) - return -EFAULT; + memcpy(ipbuf, buffer, len); ipbuf[len] = 0; *ppos += *lenp; /* Now see if we can convert it to a valid IP */ @@ -122,11 +121,9 @@ int proc_lasat_ip(struct ctl_table *table, int write, if (len > *lenp) len = *lenp; if (len) - if (copy_to_user(buffer, ipbuf, len)) - return -EFAULT; + memcpy(buffer, ipbuf, len); if (len < *lenp) { - if (put_user('\n', ((char *) buffer) + len)) - return -EFAULT; + *((char *)buffer + len) = '\n'; len++; } *lenp = len; diff --git a/arch/s390/appldata/appldata_base.c b/arch/s390/appldata/appldata_base.c index aa738cad1338..d74a4c7d5df6 100644 --- a/arch/s390/appldata/appldata_base.c +++ b/arch/s390/appldata/appldata_base.c @@ -51,10 +51,9 @@ static struct platform_device *appldata_pdev; */ static const char appldata_proc_name[APPLDATA_PROC_NAME_LENGTH] = "appldata"; static int appldata_timer_handler(struct ctl_table *ctl, int write, - void __user *buffer, size_t *lenp, loff_t *ppos); + void *buffer, size_t *lenp, loff_t *ppos); static int appldata_interval_handler(struct ctl_table *ctl, int write, - void __user *buffer, - size_t *lenp, loff_t *ppos); + void *buffer, size_t *lenp, loff_t *ppos); static struct ctl_table_header *appldata_sysctl_header; static struct ctl_table appldata_table[] = { @@ -217,7 +216,7 @@ static void __appldata_vtimer_setup(int cmd) */ static int appldata_timer_handler(struct ctl_table *ctl, int write, - void __user *buffer, size_t *lenp, loff_t *ppos) + void *buffer, size_t *lenp, loff_t *ppos) { int timer_active = appldata_timer_active; int rc; @@ -250,7 +249,7 @@ appldata_timer_handler(struct ctl_table *ctl, int write, */ static int appldata_interval_handler(struct ctl_table *ctl, int write, - void __user *buffer, size_t *lenp, loff_t *ppos) + void *buffer, size_t *lenp, loff_t *ppos) { int interval = appldata_interval; int rc; @@ -280,7 +279,7 @@ appldata_interval_handler(struct ctl_table *ctl, int write, */ static int appldata_generic_handler(struct ctl_table *ctl, int write, - void __user *buffer, size_t *lenp, loff_t *ppos) + void *buffer, size_t *lenp, loff_t *ppos) { struct appldata_ops *ops = NULL, *tmp_ops; struct list_head *lh; diff --git a/arch/s390/kernel/debug.c b/arch/s390/kernel/debug.c index 6d321f5f101d..636446003a06 100644 --- a/arch/s390/kernel/debug.c +++ b/arch/s390/kernel/debug.c @@ -867,7 +867,7 @@ static int debug_active = 1; * if debug_active is already off */ static int s390dbf_procactive(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, loff_t *ppos) + void *buffer, size_t *lenp, loff_t *ppos) { if (!write || debug_stoppable || !debug_active) return proc_dointvec(table, write, buffer, lenp, ppos); diff --git a/arch/s390/kernel/topology.c b/arch/s390/kernel/topology.c index 5f70cefc13e4..332b542548cd 100644 --- a/arch/s390/kernel/topology.c +++ b/arch/s390/kernel/topology.c @@ -594,7 
+594,7 @@ static int __init topology_setup(char *str) early_param("topology", topology_setup); static int topology_ctl_handler(struct ctl_table *ctl, int write, - void __user *buffer, size_t *lenp, loff_t *ppos) + void *buffer, size_t *lenp, loff_t *ppos) { int enabled = topology_is_enabled(); int new_mode; diff --git a/arch/s390/mm/cmm.c b/arch/s390/mm/cmm.c index ae989b740376..36bce727897b 100644 --- a/arch/s390/mm/cmm.c +++ b/arch/s390/mm/cmm.c @@ -245,7 +245,7 @@ static int cmm_skip_blanks(char *cp, char **endp) } static int cmm_pages_handler(struct ctl_table *ctl, int write, - void __user *buffer, size_t *lenp, loff_t *ppos) + void *buffer, size_t *lenp, loff_t *ppos) { long nr = cmm_get_pages(); struct ctl_table ctl_entry = { @@ -264,7 +264,7 @@ static int cmm_pages_handler(struct ctl_table *ctl, int write, } static int cmm_timed_pages_handler(struct ctl_table *ctl, int write, - void __user *buffer, size_t *lenp, + void *buffer, size_t *lenp, loff_t *ppos) { long nr = cmm_get_timed_pages(); @@ -284,7 +284,7 @@ static int cmm_timed_pages_handler(struct ctl_table *ctl, int write, } static int cmm_timeout_handler(struct ctl_table *ctl, int write, - void __user *buffer, size_t *lenp, loff_t *ppos) + void *buffer, size_t *lenp, loff_t *ppos) { char buf[64], *p; long nr, seconds; @@ -297,8 +297,7 @@ static int cmm_timeout_handler(struct ctl_table *ctl, int write, if (write) { len = min(*lenp, sizeof(buf)); - if (copy_from_user(buf, buffer, len)) - return -EFAULT; + memcpy(buf, buffer, len); buf[len - 1] = '\0'; cmm_skip_blanks(buf, &p); nr = simple_strtoul(p, &p, 0); @@ -311,8 +310,7 @@ static int cmm_timeout_handler(struct ctl_table *ctl, int write, cmm_timeout_pages, cmm_timeout_seconds); if (len > *lenp) len = *lenp; - if (copy_to_user(buffer, buf, len)) - return -EFAULT; + memcpy(buffer, buf, len); *lenp = len; *ppos += len; } diff --git a/arch/x86/kernel/itmt.c b/arch/x86/kernel/itmt.c index 1cb3ca9bba49..1afbdd1dd777 100644 --- a/arch/x86/kernel/itmt.c +++ b/arch/x86/kernel/itmt.c @@ -39,8 +39,7 @@ static bool __read_mostly sched_itmt_capable; unsigned int __read_mostly sysctl_sched_itmt_enabled; static int sched_itmt_update_handler(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, - loff_t *ppos) + void *buffer, size_t *lenp, loff_t *ppos) { unsigned int old_sysctl; int ret; diff --git a/drivers/cdrom/cdrom.c b/drivers/cdrom/cdrom.c index faca0f346fff..e3bbe108eb54 100644 --- a/drivers/cdrom/cdrom.c +++ b/drivers/cdrom/cdrom.c @@ -3631,7 +3631,7 @@ static void cdrom_update_settings(void) } static int cdrom_sysctl_handler(struct ctl_table *ctl, int write, - void __user *buffer, size_t *lenp, loff_t *ppos) + void *buffer, size_t *lenp, loff_t *ppos) { int ret; diff --git a/drivers/char/random.c b/drivers/char/random.c index 0d10e31fd342..1e0db78b83ba 100644 --- a/drivers/char/random.c +++ b/drivers/char/random.c @@ -2057,7 +2057,7 @@ static char sysctl_bootid[16]; * sysctl system call, as 16 bytes of binary data. 
*/ static int proc_do_uuid(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, loff_t *ppos) + void *buffer, size_t *lenp, loff_t *ppos) { struct ctl_table fake_table; unsigned char buf[64], tmp_uuid[16], *uuid; diff --git a/drivers/macintosh/mac_hid.c b/drivers/macintosh/mac_hid.c index 7af0c536d568..28b8581b44dd 100644 --- a/drivers/macintosh/mac_hid.c +++ b/drivers/macintosh/mac_hid.c @@ -183,8 +183,7 @@ static void mac_hid_stop_emulation(void) } static int mac_hid_toggle_emumouse(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, - loff_t *ppos) + void *buffer, size_t *lenp, loff_t *ppos) { int *valp = table->data; int old_val = *valp; diff --git a/drivers/parport/procfs.c b/drivers/parport/procfs.c index 48804049d697..ee7b5daabfd4 100644 --- a/drivers/parport/procfs.c +++ b/drivers/parport/procfs.c @@ -34,7 +34,7 @@ #define PARPORT_MAX_SPINTIME_VALUE 1000 static int do_active_device(struct ctl_table *table, int write, - void __user *result, size_t *lenp, loff_t *ppos) + void *result, size_t *lenp, loff_t *ppos) { struct parport *port = (struct parport *)table->extra1; char buffer[256]; @@ -65,13 +65,13 @@ static int do_active_device(struct ctl_table *table, int write, *lenp = len; *ppos += len; - - return copy_to_user(result, buffer, len) ? -EFAULT : 0; + memcpy(result, buffer, len); + return 0; } #ifdef CONFIG_PARPORT_1284 static int do_autoprobe(struct ctl_table *table, int write, - void __user *result, size_t *lenp, loff_t *ppos) + void *result, size_t *lenp, loff_t *ppos) { struct parport_device_info *info = table->extra2; const char *str; @@ -108,13 +108,13 @@ static int do_autoprobe(struct ctl_table *table, int write, *ppos += len; - return copy_to_user (result, buffer, len) ? -EFAULT : 0; + memcpy(result, buffer, len); + return 0; } #endif /* IEEE1284.3 support. */ static int do_hardware_base_addr(struct ctl_table *table, int write, - void __user *result, - size_t *lenp, loff_t *ppos) + void *result, size_t *lenp, loff_t *ppos) { struct parport *port = (struct parport *)table->extra1; char buffer[20]; @@ -136,13 +136,12 @@ static int do_hardware_base_addr(struct ctl_table *table, int write, *lenp = len; *ppos += len; - - return copy_to_user(result, buffer, len) ? -EFAULT : 0; + memcpy(result, buffer, len); + return 0; } static int do_hardware_irq(struct ctl_table *table, int write, - void __user *result, - size_t *lenp, loff_t *ppos) + void *result, size_t *lenp, loff_t *ppos) { struct parport *port = (struct parport *)table->extra1; char buffer[20]; @@ -164,13 +163,12 @@ static int do_hardware_irq(struct ctl_table *table, int write, *lenp = len; *ppos += len; - - return copy_to_user(result, buffer, len) ? -EFAULT : 0; + memcpy(result, buffer, len); + return 0; } static int do_hardware_dma(struct ctl_table *table, int write, - void __user *result, - size_t *lenp, loff_t *ppos) + void *result, size_t *lenp, loff_t *ppos) { struct parport *port = (struct parport *)table->extra1; char buffer[20]; @@ -192,13 +190,12 @@ static int do_hardware_dma(struct ctl_table *table, int write, *lenp = len; *ppos += len; - - return copy_to_user(result, buffer, len) ? 
-EFAULT : 0; + memcpy(result, buffer, len); + return 0; } static int do_hardware_modes(struct ctl_table *table, int write, - void __user *result, - size_t *lenp, loff_t *ppos) + void *result, size_t *lenp, loff_t *ppos) { struct parport *port = (struct parport *)table->extra1; char buffer[40]; @@ -231,8 +228,8 @@ static int do_hardware_modes(struct ctl_table *table, int write, *lenp = len; *ppos += len; - - return copy_to_user(result, buffer, len) ? -EFAULT : 0; + memcpy(result, buffer, len); + return 0; } #define PARPORT_PORT_DIR(CHILD) { .procname = NULL, .mode = 0555, .child = CHILD } diff --git a/fs/dcache.c b/fs/dcache.c index b280e07e162b..8dd4d8d7bd0b 100644 --- a/fs/dcache.c +++ b/fs/dcache.c @@ -165,7 +165,7 @@ static long get_nr_dentry_negative(void) return sum < 0 ? 0 : sum; } -int proc_nr_dentry(struct ctl_table *table, int write, void __user *buffer, +int proc_nr_dentry(struct ctl_table *table, int write, void *buffer, size_t *lenp, loff_t *ppos) { dentry_stat.nr_dentry = get_nr_dentry(); diff --git a/fs/drop_caches.c b/fs/drop_caches.c index dc1a1d5d825b..f00fcc4a4f72 100644 --- a/fs/drop_caches.c +++ b/fs/drop_caches.c @@ -47,7 +47,7 @@ static void drop_pagecache_sb(struct super_block *sb, void *unused) } int drop_caches_sysctl_handler(struct ctl_table *table, int write, - void __user *buffer, size_t *length, loff_t *ppos) + void *buffer, size_t *length, loff_t *ppos) { int ret; diff --git a/fs/file_table.c b/fs/file_table.c index 30d55c9a1744..3b612535391f 100644 --- a/fs/file_table.c +++ b/fs/file_table.c @@ -80,14 +80,14 @@ EXPORT_SYMBOL_GPL(get_max_files); */ #if defined(CONFIG_SYSCTL) && defined(CONFIG_PROC_FS) int proc_nr_files(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, loff_t *ppos) + void *buffer, size_t *lenp, loff_t *ppos) { files_stat.nr_files = get_nr_files(); return proc_doulongvec_minmax(table, write, buffer, lenp, ppos); } #else int proc_nr_files(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, loff_t *ppos) + void *buffer, size_t *lenp, loff_t *ppos) { return -ENOSYS; } diff --git a/fs/fscache/main.c b/fs/fscache/main.c index 59c2494efda3..c1e6cc9091aa 100644 --- a/fs/fscache/main.c +++ b/fs/fscache/main.c @@ -51,8 +51,7 @@ static unsigned fscache_op_max_active = 2; static struct ctl_table_header *fscache_sysctl_header; static int fscache_max_active_sysctl(struct ctl_table *table, int write, - void __user *buffer, - size_t *lenp, loff_t *ppos) + void *buffer, size_t *lenp, loff_t *ppos) { struct workqueue_struct **wqp = table->extra1; unsigned int *datap = table->data; diff --git a/fs/inode.c b/fs/inode.c index 93d9252a00ab..cc6e701b7e5d 100644 --- a/fs/inode.c +++ b/fs/inode.c @@ -108,7 +108,7 @@ long get_nr_dirty_inodes(void) */ #ifdef CONFIG_SYSCTL int proc_nr_inodes(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, loff_t *ppos) + void *buffer, size_t *lenp, loff_t *ppos) { inodes_stat.nr_inodes = get_nr_inodes(); inodes_stat.nr_unused = get_nr_inodes_unused(); diff --git a/fs/proc/proc_sysctl.c b/fs/proc/proc_sysctl.c index b6f5d459b087..df2143e05c57 100644 --- a/fs/proc/proc_sysctl.c +++ b/fs/proc/proc_sysctl.c @@ -539,13 +539,13 @@ out: return err; } -static ssize_t proc_sys_call_handler(struct file *filp, void __user *buf, +static ssize_t proc_sys_call_handler(struct file *filp, void __user *ubuf, size_t count, loff_t *ppos, int write) { struct inode *inode = file_inode(filp); struct ctl_table_header *head = grab_header(inode); struct ctl_table *table = 
PROC_I(inode)->sysctl_entry; - void *new_buf = NULL; + void *kbuf; ssize_t error; if (IS_ERR(head)) @@ -564,27 +564,38 @@ static ssize_t proc_sys_call_handler(struct file *filp, void __user *buf, if (!table->proc_handler) goto out; - error = BPF_CGROUP_RUN_PROG_SYSCTL(head, table, write, buf, &count, - ppos, &new_buf); + if (write) { + kbuf = memdup_user_nul(ubuf, count); + if (IS_ERR(kbuf)) { + error = PTR_ERR(kbuf); + goto out; + } + } else { + error = -ENOMEM; + kbuf = kzalloc(count, GFP_KERNEL); + if (!kbuf) + goto out; + } + + error = BPF_CGROUP_RUN_PROG_SYSCTL(head, table, write, &kbuf, &count, + ppos); if (error) - goto out; + goto out_free_buf; /* careful: calling conventions are nasty here */ - if (new_buf) { - mm_segment_t old_fs; - - old_fs = get_fs(); - set_fs(KERNEL_DS); - error = table->proc_handler(table, write, (void __user *)new_buf, - &count, ppos); - set_fs(old_fs); - kfree(new_buf); - } else { - error = table->proc_handler(table, write, buf, &count, ppos); + error = table->proc_handler(table, write, kbuf, &count, ppos); + if (error) + goto out_free_buf; + + if (!write) { + error = -EFAULT; + if (copy_to_user(ubuf, kbuf, count)) + goto out_free_buf; } - if (!error) - error = count; + error = count; +out_free_buf: + kfree(kbuf); out: sysctl_head_finish(head); diff --git a/fs/quota/dquot.c b/fs/quota/dquot.c index b6a4f692d345..7b4bac91146b 100644 --- a/fs/quota/dquot.c +++ b/fs/quota/dquot.c @@ -2841,7 +2841,7 @@ const struct quotactl_ops dquot_quotactl_sysfile_ops = { EXPORT_SYMBOL(dquot_quotactl_sysfile_ops); static int do_proc_dqstats(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, loff_t *ppos) + void *buffer, size_t *lenp, loff_t *ppos) { unsigned int type = (unsigned long *)table->data - dqstats.stat; s64 value = percpu_counter_sum(&dqstats.counter[type]); diff --git a/fs/xfs/xfs_sysctl.c b/fs/xfs/xfs_sysctl.c index 31b3bdbd2eba..021ef96d0542 100644 --- a/fs/xfs/xfs_sysctl.c +++ b/fs/xfs/xfs_sysctl.c @@ -13,7 +13,7 @@ STATIC int xfs_stats_clear_proc_handler( struct ctl_table *ctl, int write, - void __user *buffer, + void *buffer, size_t *lenp, loff_t *ppos) { @@ -33,7 +33,7 @@ STATIC int xfs_panic_mask_proc_handler( struct ctl_table *ctl, int write, - void __user *buffer, + void *buffer, size_t *lenp, loff_t *ppos) { diff --git a/include/linux/bpf-cgroup.h b/include/linux/bpf-cgroup.h index c11b413d5b1a..0b41fd5fc96b 100644 --- a/include/linux/bpf-cgroup.h +++ b/include/linux/bpf-cgroup.h @@ -138,8 +138,7 @@ int __cgroup_bpf_check_dev_permission(short dev_type, u32 major, u32 minor, int __cgroup_bpf_run_filter_sysctl(struct ctl_table_header *head, struct ctl_table *table, int write, - void __user *buf, size_t *pcount, - loff_t *ppos, void **new_buf, + void **buf, size_t *pcount, loff_t *ppos, enum bpf_attach_type type); int __cgroup_bpf_run_filter_setsockopt(struct sock *sock, int *level, @@ -302,12 +301,12 @@ int bpf_percpu_cgroup_storage_update(struct bpf_map *map, void *key, }) -#define BPF_CGROUP_RUN_PROG_SYSCTL(head, table, write, buf, count, pos, nbuf) \ +#define BPF_CGROUP_RUN_PROG_SYSCTL(head, table, write, buf, count, pos) \ ({ \ int __ret = 0; \ if (cgroup_bpf_enabled) \ __ret = __cgroup_bpf_run_filter_sysctl(head, table, write, \ - buf, count, pos, nbuf, \ + buf, count, pos, \ BPF_CGROUP_SYSCTL); \ __ret; \ }) @@ -429,7 +428,7 @@ static inline int bpf_percpu_cgroup_storage_update(struct bpf_map *map, #define BPF_CGROUP_RUN_PROG_UDP6_RECVMSG_LOCK(sk, uaddr) ({ 0; }) #define BPF_CGROUP_RUN_PROG_SOCK_OPS(sock_ops) ({ 0; }) #define 
BPF_CGROUP_RUN_PROG_DEVICE_CGROUP(type,major,minor,access) ({ 0; }) -#define BPF_CGROUP_RUN_PROG_SYSCTL(head,table,write,buf,count,pos,nbuf) ({ 0; }) +#define BPF_CGROUP_RUN_PROG_SYSCTL(head,table,write,buf,count,pos) ({ 0; }) #define BPF_CGROUP_GETSOCKOPT_MAX_OPTLEN(optlen) ({ 0; }) #define BPF_CGROUP_RUN_PROG_GETSOCKOPT(sock, level, optname, optval, \ optlen, max_optlen, retval) ({ retval; }) diff --git a/include/linux/compaction.h b/include/linux/compaction.h index 4b898cdbdf05..a0eabfbeb0e1 100644 --- a/include/linux/compaction.h +++ b/include/linux/compaction.h @@ -86,7 +86,7 @@ static inline unsigned long compact_gap(unsigned int order) #ifdef CONFIG_COMPACTION extern int sysctl_compact_memory; extern int sysctl_compaction_handler(struct ctl_table *table, int write, - void __user *buffer, size_t *length, loff_t *ppos); + void *buffer, size_t *length, loff_t *ppos); extern int sysctl_extfrag_threshold; extern int sysctl_compact_unevictable_allowed; diff --git a/include/linux/fs.h b/include/linux/fs.h index 4f6f59b4f22a..9b028d260649 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -3536,11 +3536,11 @@ ssize_t simple_attr_write(struct file *file, const char __user *buf, struct ctl_table; int proc_nr_files(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, loff_t *ppos); + void *buffer, size_t *lenp, loff_t *ppos); int proc_nr_dentry(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, loff_t *ppos); + void *buffer, size_t *lenp, loff_t *ppos); int proc_nr_inodes(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, loff_t *ppos); + void *buffer, size_t *lenp, loff_t *ppos); int __init get_filesystem_list(char *buf); #define __FMODE_EXEC ((__force int) FMODE_EXEC) diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h index db95244a62d4..ddfc377de0d2 100644 --- a/include/linux/ftrace.h +++ b/include/linux/ftrace.h @@ -1005,8 +1005,7 @@ extern void disable_trace_on_warning(void); extern int __disable_trace_on_warning; int tracepoint_printk_sysctl(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, - loff_t *ppos); + void *buffer, size_t *lenp, loff_t *ppos); #else /* CONFIG_TRACING */ static inline void disable_trace_on_warning(void) { } diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h index 43a1cef8f0f1..92c21c5ccc58 100644 --- a/include/linux/hugetlb.h +++ b/include/linux/hugetlb.h @@ -105,14 +105,13 @@ struct hugepage_subpool *hugepage_new_subpool(struct hstate *h, long max_hpages, void hugepage_put_subpool(struct hugepage_subpool *spool); void reset_vma_resv_huge_pages(struct vm_area_struct *vma); -int hugetlb_sysctl_handler(struct ctl_table *, int, void __user *, size_t *, loff_t *); -int hugetlb_overcommit_handler(struct ctl_table *, int, void __user *, size_t *, loff_t *); -int hugetlb_treat_movable_handler(struct ctl_table *, int, void __user *, size_t *, loff_t *); - -#ifdef CONFIG_NUMA -int hugetlb_mempolicy_sysctl_handler(struct ctl_table *, int, - void __user *, size_t *, loff_t *); -#endif +int hugetlb_sysctl_handler(struct ctl_table *, int, void *, size_t *, loff_t *); +int hugetlb_overcommit_handler(struct ctl_table *, int, void *, size_t *, + loff_t *); +int hugetlb_treat_movable_handler(struct ctl_table *, int, void *, size_t *, + loff_t *); +int hugetlb_mempolicy_sysctl_handler(struct ctl_table *, int, void *, size_t *, + loff_t *); int copy_hugetlb_page_range(struct mm_struct *, struct mm_struct *, struct vm_area_struct *); long follow_hugetlb_page(struct 
mm_struct *, struct vm_area_struct *, diff --git a/include/linux/kprobes.h b/include/linux/kprobes.h index 04bdaf01112c..594265bfd390 100644 --- a/include/linux/kprobes.h +++ b/include/linux/kprobes.h @@ -312,7 +312,7 @@ DEFINE_INSN_CACHE_OPS(optinsn); #ifdef CONFIG_SYSCTL extern int sysctl_kprobes_optimization; extern int proc_kprobes_optimization_handler(struct ctl_table *table, - int write, void __user *buffer, + int write, void *buffer, size_t *length, loff_t *ppos); #endif extern void wait_for_kprobe_optimizer(void); diff --git a/include/linux/latencytop.h b/include/linux/latencytop.h index 9022f0c2e2e4..abe3d95f795b 100644 --- a/include/linux/latencytop.h +++ b/include/linux/latencytop.h @@ -38,8 +38,8 @@ account_scheduler_latency(struct task_struct *task, int usecs, int inter) void clear_tsk_latency_tracing(struct task_struct *p); -extern int sysctl_latencytop(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, loff_t *ppos); +int sysctl_latencytop(struct ctl_table *table, int write, void *buffer, + size_t *lenp, loff_t *ppos); #else diff --git a/include/linux/mm.h b/include/linux/mm.h index 9c4e7e76dedd..a7b1ef8ed970 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -201,10 +201,10 @@ extern int sysctl_overcommit_memory; extern int sysctl_overcommit_ratio; extern unsigned long sysctl_overcommit_kbytes; -extern int overcommit_ratio_handler(struct ctl_table *, int, void __user *, - size_t *, loff_t *); -extern int overcommit_kbytes_handler(struct ctl_table *, int, void __user *, - size_t *, loff_t *); +int overcommit_ratio_handler(struct ctl_table *, int, void *, size_t *, + loff_t *); +int overcommit_kbytes_handler(struct ctl_table *, int, void *, size_t *, + loff_t *); #define nth_page(page,n) pfn_to_page(page_to_pfn((page)) + (n)) @@ -2957,8 +2957,8 @@ extern bool process_shares_mm(struct task_struct *p, struct mm_struct *mm); #ifdef CONFIG_SYSCTL extern int sysctl_drop_caches; -int drop_caches_sysctl_handler(struct ctl_table *, int, - void __user *, size_t *, loff_t *); +int drop_caches_sysctl_handler(struct ctl_table *, int, void *, size_t *, + loff_t *); #endif void drop_slab(void); diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index b2af594ef0f7..93cf20f41e26 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -910,22 +910,21 @@ static inline int is_highmem(struct zone *zone) /* These two functions are used to setup the per zone pages min values */ struct ctl_table; -int min_free_kbytes_sysctl_handler(struct ctl_table *, int, - void __user *, size_t *, loff_t *); -int watermark_scale_factor_sysctl_handler(struct ctl_table *, int, - void __user *, size_t *, loff_t *); +int min_free_kbytes_sysctl_handler(struct ctl_table *, int, void *, size_t *, + loff_t *); +int watermark_scale_factor_sysctl_handler(struct ctl_table *, int, void *, + size_t *, loff_t *); extern int sysctl_lowmem_reserve_ratio[MAX_NR_ZONES]; -int lowmem_reserve_ratio_sysctl_handler(struct ctl_table *, int, - void __user *, size_t *, loff_t *); +int lowmem_reserve_ratio_sysctl_handler(struct ctl_table *, int, void *, + size_t *, loff_t *); int percpu_pagelist_fraction_sysctl_handler(struct ctl_table *, int, - void __user *, size_t *, loff_t *); + void *, size_t *, loff_t *); int sysctl_min_unmapped_ratio_sysctl_handler(struct ctl_table *, int, - void __user *, size_t *, loff_t *); + void *, size_t *, loff_t *); int sysctl_min_slab_ratio_sysctl_handler(struct ctl_table *, int, - void __user *, size_t *, loff_t *); - -extern int 
numa_zonelist_order_handler(struct ctl_table *, int, - void __user *, size_t *, loff_t *); + void *, size_t *, loff_t *); +int numa_zonelist_order_handler(struct ctl_table *, int, + void *, size_t *, loff_t *); extern int percpu_pagelist_fraction; extern char numa_zonelist_order[]; #define NUMA_ZONELIST_ORDER_LEN 16 diff --git a/include/linux/nmi.h b/include/linux/nmi.h index 9003e29cde46..750c7f395ca9 100644 --- a/include/linux/nmi.h +++ b/include/linux/nmi.h @@ -202,16 +202,11 @@ static inline void watchdog_update_hrtimer_threshold(u64 period) { } #endif struct ctl_table; -extern int proc_watchdog(struct ctl_table *, int , - void __user *, size_t *, loff_t *); -extern int proc_nmi_watchdog(struct ctl_table *, int , - void __user *, size_t *, loff_t *); -extern int proc_soft_watchdog(struct ctl_table *, int , - void __user *, size_t *, loff_t *); -extern int proc_watchdog_thresh(struct ctl_table *, int , - void __user *, size_t *, loff_t *); -extern int proc_watchdog_cpumask(struct ctl_table *, int, - void __user *, size_t *, loff_t *); +int proc_watchdog(struct ctl_table *, int, void *, size_t *, loff_t *); +int proc_nmi_watchdog(struct ctl_table *, int , void *, size_t *, loff_t *); +int proc_soft_watchdog(struct ctl_table *, int , void *, size_t *, loff_t *); +int proc_watchdog_thresh(struct ctl_table *, int , void *, size_t *, loff_t *); +int proc_watchdog_cpumask(struct ctl_table *, int, void *, size_t *, loff_t *); #ifdef CONFIG_HAVE_ACPI_APEI_NMI #include diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h index 9c3e7619c929..347ea379622a 100644 --- a/include/linux/perf_event.h +++ b/include/linux/perf_event.h @@ -1280,15 +1280,12 @@ extern int sysctl_perf_cpu_time_max_percent; extern void perf_sample_event_took(u64 sample_len_ns); -extern int perf_proc_update_handler(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, - loff_t *ppos); -extern int perf_cpu_time_max_percent_handler(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, - loff_t *ppos); - +int perf_proc_update_handler(struct ctl_table *table, int write, + void *buffer, size_t *lenp, loff_t *ppos); +int perf_cpu_time_max_percent_handler(struct ctl_table *table, int write, + void *buffer, size_t *lenp, loff_t *ppos); int perf_event_max_stack_handler(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, loff_t *ppos); + void *buffer, size_t *lenp, loff_t *ppos); /* Access to perf_event_open(2) syscall. 
*/ #define PERF_SECURITY_OPEN 0 diff --git a/include/linux/printk.h b/include/linux/printk.h index e061635e0409..fcde0772ec98 100644 --- a/include/linux/printk.h +++ b/include/linux/printk.h @@ -189,7 +189,7 @@ extern int printk_delay_msec; extern int dmesg_restrict; extern int -devkmsg_sysctl_set_loglvl(struct ctl_table *table, int write, void __user *buf, +devkmsg_sysctl_set_loglvl(struct ctl_table *table, int write, void *buf, size_t *lenp, loff_t *ppos); extern void wake_up_klogd(void); diff --git a/include/linux/sched/sysctl.h b/include/linux/sched/sysctl.h index d4f6215ee03f..7b4d3a49b6c5 100644 --- a/include/linux/sched/sysctl.h +++ b/include/linux/sched/sysctl.h @@ -12,9 +12,8 @@ extern unsigned int sysctl_hung_task_panic; extern unsigned long sysctl_hung_task_timeout_secs; extern unsigned long sysctl_hung_task_check_interval_secs; extern int sysctl_hung_task_warnings; -extern int proc_dohung_task_timeout_secs(struct ctl_table *table, int write, - void __user *buffer, - size_t *lenp, loff_t *ppos); +int proc_dohung_task_timeout_secs(struct ctl_table *table, int write, + void *buffer, size_t *lenp, loff_t *ppos); #else /* Avoid need for ifdefs elsewhere in the code */ enum { sysctl_hung_task_timeout_secs = 0 }; @@ -43,8 +42,7 @@ extern __read_mostly unsigned int sysctl_sched_migration_cost; extern __read_mostly unsigned int sysctl_sched_nr_migrate; int sched_proc_update_handler(struct ctl_table *table, int write, - void __user *buffer, size_t *length, - loff_t *ppos); + void *buffer, size_t *length, loff_t *ppos); #endif /* @@ -72,33 +70,21 @@ extern unsigned int sysctl_sched_autogroup_enabled; extern int sysctl_sched_rr_timeslice; extern int sched_rr_timeslice; -extern int sched_rr_handler(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, - loff_t *ppos); - -extern int sched_rt_handler(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, - loff_t *ppos); - -#ifdef CONFIG_UCLAMP_TASK -extern int sysctl_sched_uclamp_handler(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, - loff_t *ppos); -#endif - -extern int sysctl_numa_balancing(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, - loff_t *ppos); - -extern int sysctl_schedstats(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, - loff_t *ppos); +int sched_rr_handler(struct ctl_table *table, int write, void *buffer, + size_t *lenp, loff_t *ppos); +int sched_rt_handler(struct ctl_table *table, int write, void *buffer, + size_t *lenp, loff_t *ppos); +int sysctl_sched_uclamp_handler(struct ctl_table *table, int write, + void *buffer, size_t *lenp, loff_t *ppos); +int sysctl_numa_balancing(struct ctl_table *table, int write, void *buffer, + size_t *lenp, loff_t *ppos); +int sysctl_schedstats(struct ctl_table *table, int write, void *buffer, + size_t *lenp, loff_t *ppos); #if defined(CONFIG_ENERGY_MODEL) && defined(CONFIG_CPU_FREQ_GOV_SCHEDUTIL) extern unsigned int sysctl_sched_energy_aware; -extern int sched_energy_aware_handler(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, - loff_t *ppos); +int sched_energy_aware_handler(struct ctl_table *table, int write, + void *buffer, size_t *lenp, loff_t *ppos); #endif #endif /* _LINUX_SCHED_SYSCTL_H */ diff --git a/include/linux/security.h b/include/linux/security.h index a8d9310472df..6aa229b252ce 100644 --- a/include/linux/security.h +++ b/include/linux/security.h @@ -211,7 +211,7 @@ struct request_sock; #ifdef CONFIG_MMU extern int 
mmap_min_addr_handler(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, loff_t *ppos); + void *buffer, size_t *lenp, loff_t *ppos); #endif /* security_inode_init_security callback function to write xattrs */ diff --git a/include/linux/sysctl.h b/include/linux/sysctl.h index 36143ca40b56..f2401e45a3c2 100644 --- a/include/linux/sysctl.h +++ b/include/linux/sysctl.h @@ -44,35 +44,26 @@ struct ctl_dir; extern const int sysctl_vals[]; -typedef int proc_handler (struct ctl_table *ctl, int write, - void __user *buffer, size_t *lenp, loff_t *ppos); - -extern int proc_dostring(struct ctl_table *, int, - void __user *, size_t *, loff_t *); -extern int proc_dointvec(struct ctl_table *, int, - void __user *, size_t *, loff_t *); -extern int proc_douintvec(struct ctl_table *, int, - void __user *, size_t *, loff_t *); -extern int proc_dointvec_minmax(struct ctl_table *, int, - void __user *, size_t *, loff_t *); -extern int proc_douintvec_minmax(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, - loff_t *ppos); -extern int proc_dointvec_jiffies(struct ctl_table *, int, - void __user *, size_t *, loff_t *); -extern int proc_dointvec_userhz_jiffies(struct ctl_table *, int, - void __user *, size_t *, loff_t *); -extern int proc_dointvec_ms_jiffies(struct ctl_table *, int, - void __user *, size_t *, loff_t *); -extern int proc_doulongvec_minmax(struct ctl_table *, int, - void __user *, size_t *, loff_t *); -extern int proc_doulongvec_ms_jiffies_minmax(struct ctl_table *table, int, - void __user *, size_t *, loff_t *); -extern int proc_do_large_bitmap(struct ctl_table *, int, - void __user *, size_t *, loff_t *); -extern int proc_do_static_key(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, - loff_t *ppos); +typedef int proc_handler(struct ctl_table *ctl, int write, void *buffer, + size_t *lenp, loff_t *ppos); + +int proc_dostring(struct ctl_table *, int, void *, size_t *, loff_t *); +int proc_dointvec(struct ctl_table *, int, void *, size_t *, loff_t *); +int proc_douintvec(struct ctl_table *, int, void *, size_t *, loff_t *); +int proc_dointvec_minmax(struct ctl_table *, int, void *, size_t *, loff_t *); +int proc_douintvec_minmax(struct ctl_table *table, int write, void *buffer, + size_t *lenp, loff_t *ppos); +int proc_dointvec_jiffies(struct ctl_table *, int, void *, size_t *, loff_t *); +int proc_dointvec_userhz_jiffies(struct ctl_table *, int, void *, size_t *, + loff_t *); +int proc_dointvec_ms_jiffies(struct ctl_table *, int, void *, size_t *, + loff_t *); +int proc_doulongvec_minmax(struct ctl_table *, int, void *, size_t *, loff_t *); +int proc_doulongvec_ms_jiffies_minmax(struct ctl_table *table, int, void *, + size_t *, loff_t *); +int proc_do_large_bitmap(struct ctl_table *, int, void *, size_t *, loff_t *); +int proc_do_static_key(struct ctl_table *table, int write, void *buffer, + size_t *lenp, loff_t *ppos); /* * Register a set of sysctl names by calling register_sysctl_table @@ -246,7 +237,7 @@ static inline void setup_sysctl_set(struct ctl_table_set *p, #endif /* CONFIG_SYSCTL */ -int sysctl_max_threads(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, loff_t *ppos); +int sysctl_max_threads(struct ctl_table *table, int write, void *buffer, + size_t *lenp, loff_t *ppos); #endif /* _LINUX_SYSCTL_H */ diff --git a/include/linux/timer.h b/include/linux/timer.h index 0dc19a8c39c9..07910ae5ddd9 100644 --- a/include/linux/timer.h +++ b/include/linux/timer.h @@ -201,8 +201,7 @@ struct ctl_table; extern 
unsigned int sysctl_timer_migration; int timer_migration_handler(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, - loff_t *ppos); + void *buffer, size_t *lenp, loff_t *ppos); #endif unsigned long __round_jiffies(unsigned long j, int cpu); diff --git a/include/linux/vmstat.h b/include/linux/vmstat.h index 292485f3d24d..cb507151710f 100644 --- a/include/linux/vmstat.h +++ b/include/linux/vmstat.h @@ -16,8 +16,8 @@ extern int sysctl_stat_interval; #define DISABLE_NUMA_STAT 0 extern int sysctl_vm_numa_stat; DECLARE_STATIC_KEY_TRUE(vm_numa_stat_key); -extern int sysctl_vm_numa_stat_handler(struct ctl_table *table, - int write, void __user *buffer, size_t *length, loff_t *ppos); +int sysctl_vm_numa_stat_handler(struct ctl_table *table, int write, + void *buffer, size_t *length, loff_t *ppos); #endif struct reclaim_stat { @@ -274,8 +274,8 @@ void cpu_vm_stats_fold(int cpu); void refresh_zone_stat_thresholds(void); struct ctl_table; -int vmstat_refresh(struct ctl_table *, int write, - void __user *buffer, size_t *lenp, loff_t *ppos); +int vmstat_refresh(struct ctl_table *, int write, void *buffer, size_t *lenp, + loff_t *ppos); void drain_zonestat(struct zone *zone, struct per_cpu_pageset *); diff --git a/include/linux/writeback.h b/include/linux/writeback.h index a19d845dd7eb..f8a7e1a850fb 100644 --- a/include/linux/writeback.h +++ b/include/linux/writeback.h @@ -362,24 +362,18 @@ extern int vm_highmem_is_dirtyable; extern int block_dump; extern int laptop_mode; -extern int dirty_background_ratio_handler(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, - loff_t *ppos); -extern int dirty_background_bytes_handler(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, - loff_t *ppos); -extern int dirty_ratio_handler(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, - loff_t *ppos); -extern int dirty_bytes_handler(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, - loff_t *ppos); +int dirty_background_ratio_handler(struct ctl_table *table, int write, + void *buffer, size_t *lenp, loff_t *ppos); +int dirty_background_bytes_handler(struct ctl_table *table, int write, + void *buffer, size_t *lenp, loff_t *ppos); +int dirty_ratio_handler(struct ctl_table *table, int write, + void *buffer, size_t *lenp, loff_t *ppos); +int dirty_bytes_handler(struct ctl_table *table, int write, + void *buffer, size_t *lenp, loff_t *ppos); int dirtytime_interval_handler(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, loff_t *ppos); - -struct ctl_table; -int dirty_writeback_centisecs_handler(struct ctl_table *, int, - void __user *, size_t *, loff_t *); + void *buffer, size_t *lenp, loff_t *ppos); +int dirty_writeback_centisecs_handler(struct ctl_table *table, int write, + void *buffer, size_t *lenp, loff_t *ppos); void global_dirty_limits(unsigned long *pbackground, unsigned long *pdirty); unsigned long wb_calc_thresh(struct bdi_writeback *wb, unsigned long thresh); diff --git a/ipc/ipc_sysctl.c b/ipc/ipc_sysctl.c index affd66537e87..d1b8644bfb88 100644 --- a/ipc/ipc_sysctl.c +++ b/ipc/ipc_sysctl.c @@ -24,7 +24,7 @@ static void *get_ipc(struct ctl_table *table) #ifdef CONFIG_PROC_SYSCTL static int proc_ipc_dointvec(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, loff_t *ppos) + void *buffer, size_t *lenp, loff_t *ppos) { struct ctl_table ipc_table; @@ -35,7 +35,7 @@ static int proc_ipc_dointvec(struct ctl_table *table, int write, } static int 
proc_ipc_dointvec_minmax(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, loff_t *ppos) + void *buffer, size_t *lenp, loff_t *ppos) { struct ctl_table ipc_table; @@ -46,7 +46,7 @@ static int proc_ipc_dointvec_minmax(struct ctl_table *table, int write, } static int proc_ipc_dointvec_minmax_orphans(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, loff_t *ppos) + void *buffer, size_t *lenp, loff_t *ppos) { struct ipc_namespace *ns = current->nsproxy->ipc_ns; int err = proc_ipc_dointvec_minmax(table, write, buffer, lenp, ppos); @@ -59,7 +59,7 @@ static int proc_ipc_dointvec_minmax_orphans(struct ctl_table *table, int write, } static int proc_ipc_doulongvec_minmax(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, loff_t *ppos) + void *buffer, size_t *lenp, loff_t *ppos) { struct ctl_table ipc_table; memcpy(&ipc_table, table, sizeof(ipc_table)); @@ -70,7 +70,7 @@ static int proc_ipc_doulongvec_minmax(struct ctl_table *table, int write, } static int proc_ipc_auto_msgmni(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, loff_t *ppos) + void *buffer, size_t *lenp, loff_t *ppos) { struct ctl_table ipc_table; int dummy = 0; diff --git a/ipc/mq_sysctl.c b/ipc/mq_sysctl.c index 7c00f28923a8..72a92a08c848 100644 --- a/ipc/mq_sysctl.c +++ b/ipc/mq_sysctl.c @@ -19,7 +19,7 @@ static void *get_mq(struct ctl_table *table) } static int proc_mq_dointvec(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, loff_t *ppos) + void *buffer, size_t *lenp, loff_t *ppos) { struct ctl_table mq_table; memcpy(&mq_table, table, sizeof(mq_table)); @@ -29,7 +29,7 @@ static int proc_mq_dointvec(struct ctl_table *table, int write, } static int proc_mq_dointvec_minmax(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, loff_t *ppos) + void *buffer, size_t *lenp, loff_t *ppos) { struct ctl_table mq_table; memcpy(&mq_table, table, sizeof(mq_table)); diff --git a/kernel/bpf/cgroup.c b/kernel/bpf/cgroup.c index cb305e71e7de..977bc69bb1c5 100644 --- a/kernel/bpf/cgroup.c +++ b/kernel/bpf/cgroup.c @@ -1137,16 +1137,13 @@ const struct bpf_verifier_ops cg_dev_verifier_ops = { * @head: sysctl table header * @table: sysctl table * @write: sysctl is being read (= 0) or written (= 1) - * @buf: pointer to buffer passed by user space + * @buf: pointer to buffer (in and out) * @pcount: value-result argument: value is size of buffer pointed to by @buf, * result is size of the new buffer if the program set a new value, initial value * otherwise * @ppos: value-result argument: value is position at which read from or write * to sysctl is happening, result is new position if program overrode it, * initial value otherwise - * @new_buf: pointer to pointer to new buffer that will be allocated if program - * overrides new value provided by user space on sysctl write - * NOTE: it's caller responsibility to free *new_buf if it was set * @type: type of program to be executed * * Program is run when sysctl is being accessed, either read or written, and @@ -1157,8 +1154,7 @@ const struct bpf_verifier_ops cg_dev_verifier_ops = { */ int __cgroup_bpf_run_filter_sysctl(struct ctl_table_header *head, struct ctl_table *table, int write, - void __user *buf, size_t *pcount, - loff_t *ppos, void **new_buf, + void **buf, size_t *pcount, loff_t *ppos, enum bpf_attach_type type) { struct bpf_sysctl_kern ctx = { @@ -1173,36 +1169,28 @@ int __cgroup_bpf_run_filter_sysctl(struct ctl_table_header *head, .new_updated = 0, }; struct cgroup *cgrp; +
loff_t pos = 0; int ret; ctx.cur_val = kmalloc_track_caller(ctx.cur_len, GFP_KERNEL); - if (ctx.cur_val) { - mm_segment_t old_fs; - loff_t pos = 0; - - old_fs = get_fs(); - set_fs(KERNEL_DS); - if (table->proc_handler(table, 0, (void __user *)ctx.cur_val, - &ctx.cur_len, &pos)) { - /* Let BPF program decide how to proceed. */ - ctx.cur_len = 0; - } - set_fs(old_fs); - } else { + if (!ctx.cur_val || + table->proc_handler(table, 0, ctx.cur_val, &ctx.cur_len, &pos)) { /* Let BPF program decide how to proceed. */ ctx.cur_len = 0; } - if (write && buf && *pcount) { + if (write && *buf && *pcount) { /* BPF program should be able to override new value with a * buffer bigger than provided by user. */ ctx.new_val = kmalloc_track_caller(PAGE_SIZE, GFP_KERNEL); ctx.new_len = min_t(size_t, PAGE_SIZE, *pcount); - if (!ctx.new_val || - copy_from_user(ctx.new_val, buf, ctx.new_len)) + if (ctx.new_val) { + memcpy(ctx.new_val, *buf, ctx.new_len); + } else { /* Let BPF program decide how to proceed. */ ctx.new_len = 0; + } } rcu_read_lock(); @@ -1213,7 +1201,8 @@ int __cgroup_bpf_run_filter_sysctl(struct ctl_table_header *head, kfree(ctx.cur_val); if (ret == 1 && ctx.new_updated) { - *new_buf = ctx.new_val; + kfree(*buf); + *buf = ctx.new_val; *pcount = ctx.new_len; } else { kfree(ctx.new_val); diff --git a/kernel/events/callchain.c b/kernel/events/callchain.c index c2b41a263166..bdb1533ada81 100644 --- a/kernel/events/callchain.c +++ b/kernel/events/callchain.c @@ -236,7 +236,7 @@ exit_put: * sysctl_perf_event_max_contexts_per_stack. */ int perf_event_max_stack_handler(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, loff_t *ppos) + void *buffer, size_t *lenp, loff_t *ppos) { int *value = table->data; int new_value = *value, ret; diff --git a/kernel/events/core.c b/kernel/events/core.c index bc9b98a9af9a..f86d46f2c4d9 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -437,8 +437,7 @@ static void update_perf_cpu_limits(void) static bool perf_rotate_context(struct perf_cpu_context *cpuctx); int perf_proc_update_handler(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, - loff_t *ppos) + void *buffer, size_t *lenp, loff_t *ppos) { int ret; int perf_cpu = sysctl_perf_cpu_time_max_percent; @@ -462,8 +461,7 @@ int perf_proc_update_handler(struct ctl_table *table, int write, int sysctl_perf_cpu_time_max_percent __read_mostly = DEFAULT_CPU_TIME_MAX_PERCENT; int perf_cpu_time_max_percent_handler(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, - loff_t *ppos) + void *buffer, size_t *lenp, loff_t *ppos) { int ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos); diff --git a/kernel/kprobes.c b/kernel/kprobes.c index 2625c241ac00..ffbe03a45c16 100644 --- a/kernel/kprobes.c +++ b/kernel/kprobes.c @@ -892,7 +892,7 @@ static void unoptimize_all_kprobes(void) static DEFINE_MUTEX(kprobe_sysctl_mutex); int sysctl_kprobes_optimization; int proc_kprobes_optimization_handler(struct ctl_table *table, int write, - void __user *buffer, size_t *length, + void *buffer, size_t *length, loff_t *ppos) { int ret; diff --git a/kernel/latencytop.c b/kernel/latencytop.c index 8d1c15832e55..166d7bf49666 100644 --- a/kernel/latencytop.c +++ b/kernel/latencytop.c @@ -269,8 +269,8 @@ static int __init init_lstats_procfs(void) return 0; } -int sysctl_latencytop(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, loff_t *ppos) +int sysctl_latencytop(struct ctl_table *table, int write, void *buffer, + size_t *lenp, loff_t *ppos) { 
int err; diff --git a/kernel/pid_namespace.c b/kernel/pid_namespace.c index 01f8ba32cc0c..3ccaba5f15c0 100644 --- a/kernel/pid_namespace.c +++ b/kernel/pid_namespace.c @@ -263,7 +263,7 @@ void zap_pid_ns_processes(struct pid_namespace *pid_ns) #ifdef CONFIG_CHECKPOINT_RESTORE static int pid_ns_ctl_handler(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, loff_t *ppos) + void *buffer, size_t *lenp, loff_t *ppos) { struct pid_namespace *pid_ns = task_active_pid_ns(current); struct ctl_table tmp = *table; diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c index 9a9b6156270b..471f649b5868 100644 --- a/kernel/printk/printk.c +++ b/kernel/printk/printk.c @@ -173,7 +173,7 @@ __setup("printk.devkmsg=", control_devkmsg); char devkmsg_log_str[DEVKMSG_STR_MAX_SIZE] = "ratelimit"; int devkmsg_sysctl_set_loglvl(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, loff_t *ppos) + void *buffer, size_t *lenp, loff_t *ppos) { char old_str[DEVKMSG_STR_MAX_SIZE]; unsigned int old; diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 3a61a3b8eaa9..5c589a2e4d19 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -1110,8 +1110,7 @@ static void uclamp_update_root_tg(void) { } #endif int sysctl_sched_uclamp_handler(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, - loff_t *ppos) + void *buffer, size_t *lenp, loff_t *ppos) { bool update_root_tg = false; int old_min, old_max; @@ -2723,7 +2722,7 @@ void set_numabalancing_state(bool enabled) #ifdef CONFIG_PROC_SYSCTL int sysctl_numa_balancing(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, loff_t *ppos) + void *buffer, size_t *lenp, loff_t *ppos) { struct ctl_table t; int err; @@ -2797,8 +2796,8 @@ static void __init init_schedstats(void) } #ifdef CONFIG_PROC_SYSCTL -int sysctl_schedstats(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, loff_t *ppos) +int sysctl_schedstats(struct ctl_table *table, int write, void *buffer, + size_t *lenp, loff_t *ppos) { struct ctl_table t; int err; diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 02f323b85b6d..b6077fd5b32f 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -645,8 +645,7 @@ struct sched_entity *__pick_last_entity(struct cfs_rq *cfs_rq) */ int sched_proc_update_handler(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, - loff_t *ppos) + void *buffer, size_t *lenp, loff_t *ppos) { int ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos); unsigned int factor = get_update_sysctl_factor(); diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c index df11d88c9895..45da29de3ecc 100644 --- a/kernel/sched/rt.c +++ b/kernel/sched/rt.c @@ -2714,9 +2714,8 @@ static void sched_rt_do_global(void) def_rt_bandwidth.rt_period = ns_to_ktime(global_rt_period()); } -int sched_rt_handler(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, - loff_t *ppos) +int sched_rt_handler(struct ctl_table *table, int write, void *buffer, + size_t *lenp, loff_t *ppos) { int old_period, old_runtime; static DEFINE_MUTEX(mutex); @@ -2754,9 +2753,8 @@ undo: return ret; } -int sched_rr_handler(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, - loff_t *ppos) +int sched_rr_handler(struct ctl_table *table, int write, void *buffer, + size_t *lenp, loff_t *ppos) { int ret; static DEFINE_MUTEX(mutex); diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c index 8344757bba6e..fa64b2ee9fe6 100644 --- 
a/kernel/sched/topology.c +++ b/kernel/sched/topology.c @@ -209,7 +209,7 @@ bool sched_energy_update; #ifdef CONFIG_PROC_SYSCTL int sched_energy_aware_handler(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, loff_t *ppos) + void *buffer, size_t *lenp, loff_t *ppos) { int ret, state; diff --git a/kernel/seccomp.c b/kernel/seccomp.c index 55a6184f5990..d653d8426de9 100644 --- a/kernel/seccomp.c +++ b/kernel/seccomp.c @@ -1776,7 +1776,7 @@ static void audit_actions_logged(u32 actions_logged, u32 old_actions_logged, } static int seccomp_actions_logged_handler(struct ctl_table *ro_table, int write, - void __user *buffer, size_t *lenp, + void *buffer, size_t *lenp, loff_t *ppos) { int ret; diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 3fafca3ced98..e961286d0e14 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -208,12 +208,10 @@ static int max_extfrag_threshold = 1000; #ifdef CONFIG_PROC_SYSCTL static int _proc_do_string(char *data, int maxlen, int write, - char __user *buffer, - size_t *lenp, loff_t *ppos) + char *buffer, size_t *lenp, loff_t *ppos) { size_t len; - char __user *p; - char c; + char c, *p; if (!data || !maxlen || !*lenp) { *lenp = 0; @@ -238,8 +236,7 @@ static int _proc_do_string(char *data, int maxlen, int write, *ppos += *lenp; p = buffer; while ((p - buffer) < *lenp && len < maxlen - 1) { - if (get_user(c, p++)) - return -EFAULT; + c = *(p++); if (c == 0 || c == '\n') break; data[len++] = c; @@ -261,11 +258,9 @@ static int _proc_do_string(char *data, int maxlen, int write, if (len > *lenp) len = *lenp; if (len) - if (copy_to_user(buffer, data, len)) - return -EFAULT; + memcpy(buffer, data, len); if (len < *lenp) { - if (put_user('\n', buffer + len)) - return -EFAULT; + buffer[len] = '\n'; len++; } *lenp = len; @@ -326,13 +321,13 @@ static bool proc_first_pos_non_zero_ignore(loff_t *ppos, * Returns 0 on success. */ int proc_dostring(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, loff_t *ppos) + void *buffer, size_t *lenp, loff_t *ppos) { if (write) proc_first_pos_non_zero_ignore(ppos, table); - return _proc_do_string((char *)(table->data), table->maxlen, write, - (char __user *)buffer, lenp, ppos); + return _proc_do_string(table->data, table->maxlen, write, buffer, lenp, + ppos); } static size_t proc_skip_spaces(char **buf) @@ -463,11 +458,10 @@ static int proc_get_long(char **buf, size_t *size, * @val: the integer to be converted * @neg: sign of the number, %TRUE for negative * - * In case of success %0 is returned and @buf and @size are updated with - * the amount of bytes written. + * In case of success @buf and @size are updated with the amount of bytes + * written. 
*/ -static int proc_put_long(void __user **buf, size_t *size, unsigned long val, - bool neg) +static void proc_put_long(void **buf, size_t *size, unsigned long val, bool neg) { int len; char tmp[TMPBUFLEN], *p = tmp; @@ -476,24 +470,22 @@ static int proc_put_long(void __user **buf, size_t *size, unsigned long val, len = strlen(tmp); if (len > *size) len = *size; - if (copy_to_user(*buf, tmp, len)) - return -EFAULT; + memcpy(*buf, tmp, len); *size -= len; *buf += len; - return 0; } #undef TMPBUFLEN -static int proc_put_char(void __user **buf, size_t *size, char c) +static void proc_put_char(void **buf, size_t *size, char c) { if (*size) { - char __user **buffer = (char __user **)buf; - if (put_user(c, *buffer)) - return -EFAULT; - (*size)--, (*buffer)++; + char **buffer = (char **)buf; + **buffer = c; + + (*size)--; + (*buffer)++; *buf = *buffer; } - return 0; } static int do_proc_dointvec_conv(bool *negp, unsigned long *lvalp, @@ -541,7 +533,7 @@ static int do_proc_douintvec_conv(unsigned long *lvalp, static const char proc_wspace_sep[] = { ' ', '\t', '\n' }; static int __do_proc_dointvec(void *tbl_data, struct ctl_table *table, - int write, void __user *buffer, + int write, void *buffer, size_t *lenp, loff_t *ppos, int (*conv)(bool *negp, unsigned long *lvalp, int *valp, int write, void *data), @@ -549,7 +541,7 @@ static int __do_proc_dointvec(void *tbl_data, struct ctl_table *table, { int *i, vleft, first = 1, err = 0; size_t left; - char *kbuf = NULL, *p; + char *p; if (!tbl_data || !table->maxlen || !*lenp || (*ppos && !write)) { *lenp = 0; @@ -569,9 +561,7 @@ static int __do_proc_dointvec(void *tbl_data, struct ctl_table *table, if (left > PAGE_SIZE - 1) left = PAGE_SIZE - 1; - p = kbuf = memdup_user_nul(buffer, left); - if (IS_ERR(kbuf)) - return PTR_ERR(kbuf); + p = buffer; } for (; left && vleft--; i++, first=0) { @@ -598,24 +588,17 @@ static int __do_proc_dointvec(void *tbl_data, struct ctl_table *table, break; } if (!first) - err = proc_put_char(&buffer, &left, '\t'); - if (err) - break; - err = proc_put_long(&buffer, &left, lval, neg); - if (err) - break; + proc_put_char(&buffer, &left, '\t'); + proc_put_long(&buffer, &left, lval, neg); } } if (!write && !first && left && !err) - err = proc_put_char(&buffer, &left, '\n'); + proc_put_char(&buffer, &left, '\n'); if (write && !err && left) left -= proc_skip_spaces(&p); - if (write) { - kfree(kbuf); - if (first) - return err ? : -EINVAL; - } + if (write && first) + return err ? 
: -EINVAL; *lenp -= left; out: *ppos += *lenp; @@ -623,7 +606,7 @@ out: } static int do_proc_dointvec(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, loff_t *ppos, + void *buffer, size_t *lenp, loff_t *ppos, int (*conv)(bool *negp, unsigned long *lvalp, int *valp, int write, void *data), void *data) @@ -634,7 +617,7 @@ static int do_proc_dointvec(struct ctl_table *table, int write, static int do_proc_douintvec_w(unsigned int *tbl_data, struct ctl_table *table, - void __user *buffer, + void *buffer, size_t *lenp, loff_t *ppos, int (*conv)(unsigned long *lvalp, unsigned int *valp, @@ -645,7 +628,7 @@ static int do_proc_douintvec_w(unsigned int *tbl_data, int err = 0; size_t left; bool neg; - char *kbuf = NULL, *p; + char *p = buffer; left = *lenp; @@ -655,10 +638,6 @@ static int do_proc_douintvec_w(unsigned int *tbl_data, if (left > PAGE_SIZE - 1) left = PAGE_SIZE - 1; - p = kbuf = memdup_user_nul(buffer, left); - if (IS_ERR(kbuf)) - return -EINVAL; - left -= proc_skip_spaces(&p); if (!left) { err = -EINVAL; @@ -682,7 +661,6 @@ static int do_proc_douintvec_w(unsigned int *tbl_data, left -= proc_skip_spaces(&p); out_free: - kfree(kbuf); if (err) return -EINVAL; @@ -694,7 +672,7 @@ bail_early: return err; } -static int do_proc_douintvec_r(unsigned int *tbl_data, void __user *buffer, +static int do_proc_douintvec_r(unsigned int *tbl_data, void *buffer, size_t *lenp, loff_t *ppos, int (*conv)(unsigned long *lvalp, unsigned int *valp, @@ -712,11 +690,11 @@ static int do_proc_douintvec_r(unsigned int *tbl_data, void __user *buffer, goto out; } - err = proc_put_long(&buffer, &left, lval, false); - if (err || !left) + proc_put_long(&buffer, &left, lval, false); + if (!left) goto out; - err = proc_put_char(&buffer, &left, '\n'); + proc_put_char(&buffer, &left, '\n'); out: *lenp -= left; @@ -726,7 +704,7 @@ out: } static int __do_proc_douintvec(void *tbl_data, struct ctl_table *table, - int write, void __user *buffer, + int write, void *buffer, size_t *lenp, loff_t *ppos, int (*conv)(unsigned long *lvalp, unsigned int *valp, @@ -762,7 +740,7 @@ static int __do_proc_douintvec(void *tbl_data, struct ctl_table *table, } static int do_proc_douintvec(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, loff_t *ppos, + void *buffer, size_t *lenp, loff_t *ppos, int (*conv)(unsigned long *lvalp, unsigned int *valp, int write, void *data), @@ -785,16 +763,15 @@ static int do_proc_douintvec(struct ctl_table *table, int write, * * Returns 0 on success. */ -int proc_dointvec(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, loff_t *ppos) +int proc_dointvec(struct ctl_table *table, int write, void *buffer, + size_t *lenp, loff_t *ppos) { return do_proc_dointvec(table, write, buffer, lenp, ppos, NULL, NULL); } #ifdef CONFIG_COMPACTION static int proc_dointvec_minmax_warn_RT_change(struct ctl_table *table, - int write, void __user *buffer, - size_t *lenp, loff_t *ppos) + int write, void *buffer, size_t *lenp, loff_t *ppos) { int ret, old; @@ -826,8 +803,8 @@ static int proc_dointvec_minmax_warn_RT_change(struct ctl_table *table, * * Returns 0 on success. 
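[To make the pattern of these kernel/sysctl.c conversions concrete: under the new calling convention a handler never touches the user address space at all. The sketch below is a hypothetical handler, not part of this patch; example_dointvec and its knob are invented, and it assumes what proc_sys_call_handler above now guarantees, namely that on writes `buffer` is a NUL-terminated kernel copy of at most PAGE_SIZE - 1 bytes.]

#include <linux/kernel.h>       /* kstrtoint() */
#include <linux/string.h>       /* strim() */
#include <linux/sysctl.h>

static int example_setting;     /* invented knob, for illustration only */

static int example_dointvec(struct ctl_table *table, int write,
                void *buffer, size_t *lenp, loff_t *ppos)
{
        if (write) {
                int val;

                /* the core already NUL-terminated this kernel copy */
                if (kstrtoint(strim(buffer), 0, &val))
                        return -EINVAL;
                example_setting = val;
                *ppos += *lenp;         /* consume the whole write */
                return 0;
        }

        /* reads can simply defer to the generic helper */
        return proc_dointvec(table, write, buffer, lenp, ppos);
}

[Nothing __user-qualified appears anywhere in such a handler, which is what lets the set_fs(KERNEL_DS) dance elsewhere in this patch be deleted.]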
*/ -int proc_douintvec(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, loff_t *ppos) +int proc_douintvec(struct ctl_table *table, int write, void *buffer, + size_t *lenp, loff_t *ppos) { return do_proc_douintvec(table, write, buffer, lenp, ppos, do_proc_douintvec_conv, NULL); @@ -838,7 +815,7 @@ int proc_douintvec(struct ctl_table *table, int write, * This means we can safely use a temporary. */ static int proc_taint(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, loff_t *ppos) + void *buffer, size_t *lenp, loff_t *ppos) { struct ctl_table t; unsigned long tmptaint = get_taint(); @@ -870,7 +847,7 @@ static int proc_taint(struct ctl_table *table, int write, #ifdef CONFIG_PRINTK static int proc_dointvec_minmax_sysadmin(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, loff_t *ppos) + void *buffer, size_t *lenp, loff_t *ppos) { if (write && !capable(CAP_SYS_ADMIN)) return -EPERM; @@ -936,7 +913,7 @@ static int do_proc_dointvec_minmax_conv(bool *negp, unsigned long *lvalp, * Returns 0 on success or -EINVAL on write when the range check fails. */ int proc_dointvec_minmax(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, loff_t *ppos) + void *buffer, size_t *lenp, loff_t *ppos) { struct do_proc_dointvec_minmax_conv_param param = { .min = (int *) table->extra1, @@ -1005,7 +982,7 @@ static int do_proc_douintvec_minmax_conv(unsigned long *lvalp, * Returns 0 on success or -ERANGE on write when the range check fails. */ int proc_douintvec_minmax(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, loff_t *ppos) + void *buffer, size_t *lenp, loff_t *ppos) { struct do_proc_douintvec_minmax_conv_param param = { .min = (unsigned int *) table->extra1, @@ -1036,7 +1013,7 @@ static int do_proc_dopipe_max_size_conv(unsigned long *lvalp, } static int proc_dopipe_max_size(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, loff_t *ppos) + void *buffer, size_t *lenp, loff_t *ppos) { return do_proc_douintvec(table, write, buffer, lenp, ppos, do_proc_dopipe_max_size_conv, NULL); @@ -1057,7 +1034,7 @@ static void validate_coredump_safety(void) } static int proc_dointvec_minmax_coredump(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, loff_t *ppos) + void *buffer, size_t *lenp, loff_t *ppos) { int error = proc_dointvec_minmax(table, write, buffer, lenp, ppos); if (!error) @@ -1067,7 +1044,7 @@ static int proc_dointvec_minmax_coredump(struct ctl_table *table, int write, #ifdef CONFIG_COREDUMP static int proc_dostring_coredump(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, loff_t *ppos) + void *buffer, size_t *lenp, loff_t *ppos) { int error = proc_dostring(table, write, buffer, lenp, ppos); if (!error) @@ -1078,7 +1055,7 @@ static int proc_dostring_coredump(struct ctl_table *table, int write, #ifdef CONFIG_MAGIC_SYSRQ static int sysrq_sysctl_handler(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, loff_t *ppos) + void *buffer, size_t *lenp, loff_t *ppos) { int tmp, ret; @@ -1096,16 +1073,14 @@ static int sysrq_sysctl_handler(struct ctl_table *table, int write, } #endif -static int __do_proc_doulongvec_minmax(void *data, struct ctl_table *table, int write, - void __user *buffer, - size_t *lenp, loff_t *ppos, - unsigned long convmul, - unsigned long convdiv) +static int __do_proc_doulongvec_minmax(void *data, struct ctl_table *table, + int write, void *buffer, size_t *lenp, loff_t *ppos, + unsigned long 
convmul, unsigned long convdiv) { unsigned long *i, *min, *max; int vleft, first = 1, err = 0; size_t left; - char *kbuf = NULL, *p; + char *p; if (!data || !table->maxlen || !*lenp || (*ppos && !write)) { *lenp = 0; @@ -1124,9 +1099,7 @@ static int __do_proc_doulongvec_minmax(void *data, struct ctl_table *table, int if (left > PAGE_SIZE - 1) left = PAGE_SIZE - 1; - p = kbuf = memdup_user_nul(buffer, left); - if (IS_ERR(kbuf)) - return PTR_ERR(kbuf); + p = buffer; } for (; left && vleft--; i++, first = 0) { @@ -1154,26 +1127,18 @@ static int __do_proc_doulongvec_minmax(void *data, struct ctl_table *table, int *i = val; } else { val = convdiv * (*i) / convmul; - if (!first) { - err = proc_put_char(&buffer, &left, '\t'); - if (err) - break; - } - err = proc_put_long(&buffer, &left, val, false); - if (err) - break; + if (!first) + proc_put_char(&buffer, &left, '\t'); + proc_put_long(&buffer, &left, val, false); } } if (!write && !first && left && !err) - err = proc_put_char(&buffer, &left, '\n'); + proc_put_char(&buffer, &left, '\n'); if (write && !err) left -= proc_skip_spaces(&p); - if (write) { - kfree(kbuf); - if (first) - return err ? : -EINVAL; - } + if (write && first) + return err ? : -EINVAL; *lenp -= left; out: *ppos += *lenp; @@ -1181,10 +1146,8 @@ out: } static int do_proc_doulongvec_minmax(struct ctl_table *table, int write, - void __user *buffer, - size_t *lenp, loff_t *ppos, - unsigned long convmul, - unsigned long convdiv) + void *buffer, size_t *lenp, loff_t *ppos, unsigned long convmul, + unsigned long convdiv) { return __do_proc_doulongvec_minmax(table->data, table, write, buffer, lenp, ppos, convmul, convdiv); @@ -1207,7 +1170,7 @@ static int do_proc_doulongvec_minmax(struct ctl_table *table, int write, * Returns 0 on success. */ int proc_doulongvec_minmax(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, loff_t *ppos) + void *buffer, size_t *lenp, loff_t *ppos) { return do_proc_doulongvec_minmax(table, write, buffer, lenp, ppos, 1l, 1l); } @@ -1230,8 +1193,7 @@ int proc_doulongvec_minmax(struct ctl_table *table, int write, * Returns 0 on success. */ int proc_doulongvec_ms_jiffies_minmax(struct ctl_table *table, int write, - void __user *buffer, - size_t *lenp, loff_t *ppos) + void *buffer, size_t *lenp, loff_t *ppos) { return do_proc_doulongvec_minmax(table, write, buffer, lenp, ppos, HZ, 1000l); @@ -1325,7 +1287,7 @@ static int do_proc_dointvec_ms_jiffies_conv(bool *negp, unsigned long *lvalp, * Returns 0 on success. */ int proc_dointvec_jiffies(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, loff_t *ppos) + void *buffer, size_t *lenp, loff_t *ppos) { return do_proc_dointvec(table,write,buffer,lenp,ppos, do_proc_dointvec_jiffies_conv,NULL); @@ -1347,7 +1309,7 @@ int proc_dointvec_jiffies(struct ctl_table *table, int write, * Returns 0 on success. */ int proc_dointvec_userhz_jiffies(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, loff_t *ppos) + void *buffer, size_t *lenp, loff_t *ppos) { return do_proc_dointvec(table,write,buffer,lenp,ppos, do_proc_dointvec_userhz_jiffies_conv,NULL); @@ -1369,15 +1331,15 @@ int proc_dointvec_userhz_jiffies(struct ctl_table *table, int write, * * Returns 0 on success. 
*/ -int proc_dointvec_ms_jiffies(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, loff_t *ppos) +int proc_dointvec_ms_jiffies(struct ctl_table *table, int write, void *buffer, + size_t *lenp, loff_t *ppos) { return do_proc_dointvec(table, write, buffer, lenp, ppos, do_proc_dointvec_ms_jiffies_conv, NULL); } -static int proc_do_cad_pid(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, loff_t *ppos) +static int proc_do_cad_pid(struct ctl_table *table, int write, void *buffer, + size_t *lenp, loff_t *ppos) { struct pid *new_pid; pid_t tmp; @@ -1416,7 +1378,7 @@ static int proc_do_cad_pid(struct ctl_table *table, int write, * Returns 0 on success. */ int proc_do_large_bitmap(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, loff_t *ppos) + void *buffer, size_t *lenp, loff_t *ppos) { int err = 0; bool first = 1; @@ -1432,7 +1394,7 @@ int proc_do_large_bitmap(struct ctl_table *table, int write, } if (write) { - char *kbuf, *p; + char *p = buffer; size_t skipped = 0; if (left > PAGE_SIZE - 1) { @@ -1441,15 +1403,9 @@ int proc_do_large_bitmap(struct ctl_table *table, int write, skipped = *lenp - left; } - p = kbuf = memdup_user_nul(buffer, left); - if (IS_ERR(kbuf)) - return PTR_ERR(kbuf); - tmp_bitmap = bitmap_zalloc(bitmap_len, GFP_KERNEL); - if (!tmp_bitmap) { - kfree(kbuf); + if (!tmp_bitmap) return -ENOMEM; - } proc_skip_char(&p, &left, '\n'); while (!err && left) { unsigned long val_a, val_b; @@ -1513,7 +1469,6 @@ int proc_do_large_bitmap(struct ctl_table *table, int write, first = 0; proc_skip_char(&p, &left, '\n'); } - kfree(kbuf); left += skipped; } else { unsigned long bit_a, bit_b = 0; @@ -1525,27 +1480,17 @@ int proc_do_large_bitmap(struct ctl_table *table, int write, bit_b = find_next_zero_bit(bitmap, bitmap_len, bit_a + 1) - 1; - if (!first) { - err = proc_put_char(&buffer, &left, ','); - if (err) - break; - } - err = proc_put_long(&buffer, &left, bit_a, false); - if (err) - break; + if (!first) + proc_put_char(&buffer, &left, ','); + proc_put_long(&buffer, &left, bit_a, false); if (bit_a != bit_b) { - err = proc_put_char(&buffer, &left, '-'); - if (err) - break; - err = proc_put_long(&buffer, &left, bit_b, false); - if (err) - break; + proc_put_char(&buffer, &left, '-'); + proc_put_long(&buffer, &left, bit_b, false); } first = 0; bit_b++; } - if (!err) - err = proc_put_char(&buffer, &left, '\n'); + proc_put_char(&buffer, &left, '\n'); } if (!err) { @@ -1566,68 +1511,67 @@ int proc_do_large_bitmap(struct ctl_table *table, int write, #else /* CONFIG_PROC_SYSCTL */ int proc_dostring(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, loff_t *ppos) + void *buffer, size_t *lenp, loff_t *ppos) { return -ENOSYS; } int proc_dointvec(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, loff_t *ppos) + void *buffer, size_t *lenp, loff_t *ppos) { return -ENOSYS; } int proc_douintvec(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, loff_t *ppos) + void *buffer, size_t *lenp, loff_t *ppos) { return -ENOSYS; } int proc_dointvec_minmax(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, loff_t *ppos) + void *buffer, size_t *lenp, loff_t *ppos) { return -ENOSYS; } int proc_douintvec_minmax(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, loff_t *ppos) + void *buffer, size_t *lenp, loff_t *ppos) { return -ENOSYS; } int proc_dointvec_jiffies(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, loff_t 
*ppos) + void *buffer, size_t *lenp, loff_t *ppos) { return -ENOSYS; } int proc_dointvec_userhz_jiffies(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, loff_t *ppos) + void *buffer, size_t *lenp, loff_t *ppos) { return -ENOSYS; } int proc_dointvec_ms_jiffies(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, loff_t *ppos) + void *buffer, size_t *lenp, loff_t *ppos) { return -ENOSYS; } int proc_doulongvec_minmax(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, loff_t *ppos) + void *buffer, size_t *lenp, loff_t *ppos) { return -ENOSYS; } int proc_doulongvec_ms_jiffies_minmax(struct ctl_table *table, int write, - void __user *buffer, - size_t *lenp, loff_t *ppos) + void *buffer, size_t *lenp, loff_t *ppos) { - return -ENOSYS; + return -ENOSYS; } int proc_do_large_bitmap(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, loff_t *ppos) + void *buffer, size_t *lenp, loff_t *ppos) { return -ENOSYS; } @@ -1636,8 +1580,7 @@ int proc_do_large_bitmap(struct ctl_table *table, int write, #if defined(CONFIG_SYSCTL) int proc_do_static_key(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, - loff_t *ppos) + void *buffer, size_t *lenp, loff_t *ppos) { struct static_key *key = (struct static_key *)table->data; static DEFINE_MUTEX(static_key_mutex); diff --git a/kernel/time/timer.c b/kernel/time/timer.c index a5221abb4594..398e6eadb861 100644 --- a/kernel/time/timer.c +++ b/kernel/time/timer.c @@ -249,8 +249,7 @@ void timers_update_nohz(void) } int timer_migration_handler(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, - loff_t *ppos) + void *buffer, size_t *lenp, loff_t *ppos) { int ret; diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 8d2b98812625..167a74a15b1a 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -2661,7 +2661,7 @@ static void output_printk(struct trace_event_buffer *fbuffer) } int tracepoint_printk_sysctl(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, + void *buffer, size_t *lenp, loff_t *ppos) { int save_tracepoint_printk; diff --git a/kernel/umh.c b/kernel/umh.c index 7f255b5a8845..9788ed481a6a 100644 --- a/kernel/umh.c +++ b/kernel/umh.c @@ -630,7 +630,7 @@ int call_usermodehelper(const char *path, char **argv, char **envp, int wait) EXPORT_SYMBOL(call_usermodehelper); static int proc_cap_handler(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, loff_t *ppos) + void *buffer, size_t *lenp, loff_t *ppos) { struct ctl_table t; unsigned long cap_array[_KERNEL_CAPABILITY_U32S]; diff --git a/kernel/utsname_sysctl.c b/kernel/utsname_sysctl.c index 3732c888a949..4ca61d49885b 100644 --- a/kernel/utsname_sysctl.c +++ b/kernel/utsname_sysctl.c @@ -30,7 +30,7 @@ static void *get_uts(struct ctl_table *table) * to observe. Should this be in kernel/sys.c ???? 
*/ static int proc_do_uts_string(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, loff_t *ppos) + void *buffer, size_t *lenp, loff_t *ppos) { struct ctl_table uts_table; int r; diff --git a/kernel/watchdog.c b/kernel/watchdog.c index b6b1f54a7837..53ff2c81b084 100644 --- a/kernel/watchdog.c +++ b/kernel/watchdog.c @@ -661,7 +661,7 @@ static void proc_watchdog_update(void) * proc_soft_watchdog | soft_watchdog_user_enabled | SOFT_WATCHDOG_ENABLED */ static int proc_watchdog_common(int which, struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, loff_t *ppos) + void *buffer, size_t *lenp, loff_t *ppos) { int err, old, *param = table->data; @@ -688,7 +688,7 @@ static int proc_watchdog_common(int which, struct ctl_table *table, int write, * /proc/sys/kernel/watchdog */ int proc_watchdog(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, loff_t *ppos) + void *buffer, size_t *lenp, loff_t *ppos) { return proc_watchdog_common(NMI_WATCHDOG_ENABLED|SOFT_WATCHDOG_ENABLED, table, write, buffer, lenp, ppos); @@ -698,7 +698,7 @@ int proc_watchdog(struct ctl_table *table, int write, * /proc/sys/kernel/nmi_watchdog */ int proc_nmi_watchdog(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, loff_t *ppos) + void *buffer, size_t *lenp, loff_t *ppos) { if (!nmi_watchdog_available && write) return -ENOTSUPP; @@ -710,7 +710,7 @@ int proc_nmi_watchdog(struct ctl_table *table, int write, * /proc/sys/kernel/soft_watchdog */ int proc_soft_watchdog(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, loff_t *ppos) + void *buffer, size_t *lenp, loff_t *ppos) { return proc_watchdog_common(SOFT_WATCHDOG_ENABLED, table, write, buffer, lenp, ppos); @@ -720,7 +720,7 @@ int proc_soft_watchdog(struct ctl_table *table, int write, * /proc/sys/kernel/watchdog_thresh */ int proc_watchdog_thresh(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, loff_t *ppos) + void *buffer, size_t *lenp, loff_t *ppos) { int err, old; @@ -743,7 +743,7 @@ int proc_watchdog_thresh(struct ctl_table *table, int write, * been brought online, if desired. 
*/ int proc_watchdog_cpumask(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, loff_t *ppos) + void *buffer, size_t *lenp, loff_t *ppos) { int err; diff --git a/mm/compaction.c b/mm/compaction.c index 46f0fcc93081..d8cfb7b99a83 100644 --- a/mm/compaction.c +++ b/mm/compaction.c @@ -2463,7 +2463,7 @@ int sysctl_compact_memory; * /proc/sys/vm/compact_memory */ int sysctl_compaction_handler(struct ctl_table *table, int write, - void __user *buffer, size_t *length, loff_t *ppos) + void *buffer, size_t *length, loff_t *ppos) { if (write) compact_nodes(); diff --git a/mm/hugetlb.c b/mm/hugetlb.c index cd459155d28a..2277c5728b1f 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -3352,7 +3352,7 @@ static unsigned int cpuset_mems_nr(unsigned int *array) #ifdef CONFIG_SYSCTL static int hugetlb_sysctl_handler_common(bool obey_mempolicy, struct ctl_table *table, int write, - void __user *buffer, size_t *length, loff_t *ppos) + void *buffer, size_t *length, loff_t *ppos) { struct hstate *h = &default_hstate; unsigned long tmp = h->max_huge_pages; @@ -3375,7 +3375,7 @@ out: } int hugetlb_sysctl_handler(struct ctl_table *table, int write, - void __user *buffer, size_t *length, loff_t *ppos) + void *buffer, size_t *length, loff_t *ppos) { return hugetlb_sysctl_handler_common(false, table, write, @@ -3384,7 +3384,7 @@ int hugetlb_sysctl_handler(struct ctl_table *table, int write, #ifdef CONFIG_NUMA int hugetlb_mempolicy_sysctl_handler(struct ctl_table *table, int write, - void __user *buffer, size_t *length, loff_t *ppos) + void *buffer, size_t *length, loff_t *ppos) { return hugetlb_sysctl_handler_common(true, table, write, buffer, length, ppos); @@ -3392,8 +3392,7 @@ int hugetlb_mempolicy_sysctl_handler(struct ctl_table *table, int write, #endif /* CONFIG_NUMA */ int hugetlb_overcommit_handler(struct ctl_table *table, int write, - void __user *buffer, - size_t *length, loff_t *ppos) + void *buffer, size_t *length, loff_t *ppos) { struct hstate *h = &default_hstate; unsigned long tmp; diff --git a/mm/page-writeback.c b/mm/page-writeback.c index 7326b54ab728..d3ee4c4dafac 100644 --- a/mm/page-writeback.c +++ b/mm/page-writeback.c @@ -512,8 +512,7 @@ bool node_dirty_ok(struct pglist_data *pgdat) } int dirty_background_ratio_handler(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, - loff_t *ppos) + void *buffer, size_t *lenp, loff_t *ppos) { int ret; @@ -524,8 +523,7 @@ int dirty_background_ratio_handler(struct ctl_table *table, int write, } int dirty_background_bytes_handler(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, - loff_t *ppos) + void *buffer, size_t *lenp, loff_t *ppos) { int ret; @@ -535,9 +533,8 @@ int dirty_background_bytes_handler(struct ctl_table *table, int write, return ret; } -int dirty_ratio_handler(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, - loff_t *ppos) +int dirty_ratio_handler(struct ctl_table *table, int write, void *buffer, + size_t *lenp, loff_t *ppos) { int old_ratio = vm_dirty_ratio; int ret; @@ -551,8 +548,7 @@ int dirty_ratio_handler(struct ctl_table *table, int write, } int dirty_bytes_handler(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, - loff_t *ppos) + void *buffer, size_t *lenp, loff_t *ppos) { unsigned long old_bytes = vm_dirty_bytes; int ret; @@ -1972,7 +1968,7 @@ bool wb_over_bg_thresh(struct bdi_writeback *wb) * sysctl handler for /proc/sys/vm/dirty_writeback_centisecs */ int dirty_writeback_centisecs_handler(struct ctl_table *table, int 
write, - void __user *buffer, size_t *length, loff_t *ppos) + void *buffer, size_t *length, loff_t *ppos) { unsigned int old_interval = dirty_writeback_interval; int ret; diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 62c1550cd43e..0c43e9ae5004 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -5546,21 +5546,11 @@ char numa_zonelist_order[] = "Node"; * sysctl handler for numa_zonelist_order */ int numa_zonelist_order_handler(struct ctl_table *table, int write, - void __user *buffer, size_t *length, - loff_t *ppos) + void *buffer, size_t *length, loff_t *ppos) { - char *str; - int ret; - - if (!write) - return proc_dostring(table, write, buffer, length, ppos); - str = memdup_user_nul(buffer, 16); - if (IS_ERR(str)) - return PTR_ERR(str); - - ret = __parse_numa_zonelist_order(str); - kfree(str); - return ret; + if (write) + return __parse_numa_zonelist_order(buffer); + return proc_dostring(table, write, buffer, length, ppos); } @@ -7963,7 +7953,7 @@ core_initcall(init_per_zone_wmark_min) * changes. */ int min_free_kbytes_sysctl_handler(struct ctl_table *table, int write, - void __user *buffer, size_t *length, loff_t *ppos) + void *buffer, size_t *length, loff_t *ppos) { int rc; @@ -7979,7 +7969,7 @@ int min_free_kbytes_sysctl_handler(struct ctl_table *table, int write, } int watermark_scale_factor_sysctl_handler(struct ctl_table *table, int write, - void __user *buffer, size_t *length, loff_t *ppos) + void *buffer, size_t *length, loff_t *ppos) { int rc; @@ -8009,7 +7999,7 @@ static void setup_min_unmapped_ratio(void) int sysctl_min_unmapped_ratio_sysctl_handler(struct ctl_table *table, int write, - void __user *buffer, size_t *length, loff_t *ppos) + void *buffer, size_t *length, loff_t *ppos) { int rc; @@ -8036,7 +8026,7 @@ static void setup_min_slab_ratio(void) } int sysctl_min_slab_ratio_sysctl_handler(struct ctl_table *table, int write, - void __user *buffer, size_t *length, loff_t *ppos) + void *buffer, size_t *length, loff_t *ppos) { int rc; @@ -8060,7 +8050,7 @@ int sysctl_min_slab_ratio_sysctl_handler(struct ctl_table *table, int write, * if in function of the boot time zone sizes. */ int lowmem_reserve_ratio_sysctl_handler(struct ctl_table *table, int write, - void __user *buffer, size_t *length, loff_t *ppos) + void *buffer, size_t *length, loff_t *ppos) { proc_dointvec_minmax(table, write, buffer, length, ppos); setup_per_zone_lowmem_reserve(); @@ -8082,7 +8072,7 @@ static void __zone_pcp_update(struct zone *zone) * pagelist can have before it gets flushed back to buddy allocator. 
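[The numa_zonelist_order conversion above shows the write side collapsing to a direct parse of `buffer`; the read side is just as direct, because the single copy_to_user() now happens in proc_sys_call_handler after the handler returns. A hypothetical read-only handler, names invented, same includes as the earlier sketch:]

static char example_mode[16] = "auto";  /* invented state, for illustration */

static int example_mode_handler(struct ctl_table *table, int write,
                void *buffer, size_t *lenp, loff_t *ppos)
{
        size_t len;

        if (write)
                return -EPERM;
        if (*ppos) {
                /* everything was produced by the first read */
                *lenp = 0;
                return 0;
        }
        /* format straight into the kernel buffer handed to us */
        len = scnprintf(buffer, *lenp, "%s\n", example_mode);
        *lenp = len;
        *ppos += len;
        return 0;
}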
*/ int percpu_pagelist_fraction_sysctl_handler(struct ctl_table *table, int write, - void __user *buffer, size_t *length, loff_t *ppos) + void *buffer, size_t *length, loff_t *ppos) { struct zone *zone; int old_percpu_pagelist_fraction; diff --git a/mm/util.c b/mm/util.c index 988d11e6c17c..8defc8ec141f 100644 --- a/mm/util.c +++ b/mm/util.c @@ -717,9 +717,8 @@ int sysctl_max_map_count __read_mostly = DEFAULT_MAX_MAP_COUNT; unsigned long sysctl_user_reserve_kbytes __read_mostly = 1UL << 17; /* 128MB */ unsigned long sysctl_admin_reserve_kbytes __read_mostly = 1UL << 13; /* 8MB */ -int overcommit_ratio_handler(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, - loff_t *ppos) +int overcommit_ratio_handler(struct ctl_table *table, int write, void *buffer, + size_t *lenp, loff_t *ppos) { int ret; @@ -729,9 +728,8 @@ int overcommit_ratio_handler(struct ctl_table *table, int write, return ret; } -int overcommit_kbytes_handler(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, - loff_t *ppos) +int overcommit_kbytes_handler(struct ctl_table *table, int write, void *buffer, + size_t *lenp, loff_t *ppos) { int ret; diff --git a/mm/vmstat.c b/mm/vmstat.c index 96d21a792b57..c03a8c914922 100644 --- a/mm/vmstat.c +++ b/mm/vmstat.c @@ -76,7 +76,7 @@ static void invalid_numa_statistics(void) static DEFINE_MUTEX(vm_numa_stat_lock); int sysctl_vm_numa_stat_handler(struct ctl_table *table, int write, - void __user *buffer, size_t *length, loff_t *ppos) + void *buffer, size_t *length, loff_t *ppos) { int ret, oldval; @@ -1751,7 +1751,7 @@ static void refresh_vm_stats(struct work_struct *work) } int vmstat_refresh(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, loff_t *ppos) + void *buffer, size_t *lenp, loff_t *ppos) { long val; int err; diff --git a/net/bridge/br_netfilter_hooks.c b/net/bridge/br_netfilter_hooks.c index 59980ecfc962..04c3f9a82650 100644 --- a/net/bridge/br_netfilter_hooks.c +++ b/net/bridge/br_netfilter_hooks.c @@ -1027,7 +1027,7 @@ int br_nf_hook_thresh(unsigned int hook, struct net *net, #ifdef CONFIG_SYSCTL static int brnf_sysctl_call_tables(struct ctl_table *ctl, int write, - void __user *buffer, size_t *lenp, loff_t *ppos) + void *buffer, size_t *lenp, loff_t *ppos) { int ret; diff --git a/net/core/neighbour.c b/net/core/neighbour.c index 39d37d0ef575..3f2263e79e4b 100644 --- a/net/core/neighbour.c +++ b/net/core/neighbour.c @@ -3379,7 +3379,7 @@ EXPORT_SYMBOL(neigh_app_ns); static int unres_qlen_max = INT_MAX / SKB_TRUESIZE(ETH_FRAME_LEN); static int proc_unres_qlen(struct ctl_table *ctl, int write, - void __user *buffer, size_t *lenp, loff_t *ppos) + void *buffer, size_t *lenp, loff_t *ppos) { int size, ret; struct ctl_table tmp = *ctl; @@ -3443,8 +3443,8 @@ static void neigh_proc_update(struct ctl_table *ctl, int write) } static int neigh_proc_dointvec_zero_intmax(struct ctl_table *ctl, int write, - void __user *buffer, - size_t *lenp, loff_t *ppos) + void *buffer, size_t *lenp, + loff_t *ppos) { struct ctl_table tmp = *ctl; int ret; @@ -3457,8 +3457,8 @@ static int neigh_proc_dointvec_zero_intmax(struct ctl_table *ctl, int write, return ret; } -int neigh_proc_dointvec(struct ctl_table *ctl, int write, - void __user *buffer, size_t *lenp, loff_t *ppos) +int neigh_proc_dointvec(struct ctl_table *ctl, int write, void *buffer, + size_t *lenp, loff_t *ppos) { int ret = proc_dointvec(ctl, write, buffer, lenp, ppos); @@ -3467,8 +3467,7 @@ int neigh_proc_dointvec(struct ctl_table *ctl, int write, } 
EXPORT_SYMBOL(neigh_proc_dointvec); -int neigh_proc_dointvec_jiffies(struct ctl_table *ctl, int write, - void __user *buffer, +int neigh_proc_dointvec_jiffies(struct ctl_table *ctl, int write, void *buffer, size_t *lenp, loff_t *ppos) { int ret = proc_dointvec_jiffies(ctl, write, buffer, lenp, ppos); @@ -3479,8 +3478,8 @@ int neigh_proc_dointvec_jiffies(struct ctl_table *ctl, int write, EXPORT_SYMBOL(neigh_proc_dointvec_jiffies); static int neigh_proc_dointvec_userhz_jiffies(struct ctl_table *ctl, int write, - void __user *buffer, - size_t *lenp, loff_t *ppos) + void *buffer, size_t *lenp, + loff_t *ppos) { int ret = proc_dointvec_userhz_jiffies(ctl, write, buffer, lenp, ppos); @@ -3489,8 +3488,7 @@ static int neigh_proc_dointvec_userhz_jiffies(struct ctl_table *ctl, int write, } int neigh_proc_dointvec_ms_jiffies(struct ctl_table *ctl, int write, - void __user *buffer, - size_t *lenp, loff_t *ppos) + void *buffer, size_t *lenp, loff_t *ppos) { int ret = proc_dointvec_ms_jiffies(ctl, write, buffer, lenp, ppos); @@ -3500,8 +3498,8 @@ int neigh_proc_dointvec_ms_jiffies(struct ctl_table *ctl, int write, EXPORT_SYMBOL(neigh_proc_dointvec_ms_jiffies); static int neigh_proc_dointvec_unres_qlen(struct ctl_table *ctl, int write, - void __user *buffer, - size_t *lenp, loff_t *ppos) + void *buffer, size_t *lenp, + loff_t *ppos) { int ret = proc_unres_qlen(ctl, write, buffer, lenp, ppos); @@ -3510,8 +3508,8 @@ static int neigh_proc_dointvec_unres_qlen(struct ctl_table *ctl, int write, } static int neigh_proc_base_reachable_time(struct ctl_table *ctl, int write, - void __user *buffer, - size_t *lenp, loff_t *ppos) + void *buffer, size_t *lenp, + loff_t *ppos) { struct neigh_parms *p = ctl->extra2; int ret; diff --git a/net/core/sysctl_net_core.c b/net/core/sysctl_net_core.c index 9f9e00ba3ad7..0ddb13a6282b 100644 --- a/net/core/sysctl_net_core.c +++ b/net/core/sysctl_net_core.c @@ -45,7 +45,7 @@ EXPORT_SYMBOL(sysctl_devconf_inherit_init_net); #ifdef CONFIG_RPS static int rps_sock_flow_sysctl(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, loff_t *ppos) + void *buffer, size_t *lenp, loff_t *ppos) { unsigned int orig_size, size; int ret, i; @@ -115,8 +115,7 @@ static int rps_sock_flow_sysctl(struct ctl_table *table, int write, static DEFINE_MUTEX(flow_limit_update_mutex); static int flow_limit_cpu_sysctl(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, - loff_t *ppos) + void *buffer, size_t *lenp, loff_t *ppos) { struct sd_flow_limit *cur; struct softnet_data *sd; @@ -180,10 +179,7 @@ write_unlock: } if (len < *lenp) kbuf[len++] = '\n'; - if (copy_to_user(buffer, kbuf, len)) { - ret = -EFAULT; - goto done; - } + memcpy(buffer, kbuf, len); *lenp = len; *ppos += len; } @@ -194,8 +190,7 @@ done: } static int flow_limit_table_len_sysctl(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, - loff_t *ppos) + void *buffer, size_t *lenp, loff_t *ppos) { unsigned int old, *ptr; int ret; @@ -217,7 +212,7 @@ static int flow_limit_table_len_sysctl(struct ctl_table *table, int write, #ifdef CONFIG_NET_SCHED static int set_default_qdisc(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, loff_t *ppos) + void *buffer, size_t *lenp, loff_t *ppos) { char id[IFNAMSIZ]; struct ctl_table tbl = { @@ -236,7 +231,7 @@ static int set_default_qdisc(struct ctl_table *table, int write, #endif static int proc_do_dev_weight(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, loff_t *ppos) + void *buffer, size_t *lenp, 
loff_t *ppos) { int ret; @@ -251,7 +246,7 @@ static int proc_do_dev_weight(struct ctl_table *table, int write, } static int proc_do_rss_key(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, loff_t *ppos) + void *buffer, size_t *lenp, loff_t *ppos) { struct ctl_table fake_table; char buf[NETDEV_RSS_KEY_LEN * 3]; @@ -264,7 +259,7 @@ static int proc_do_rss_key(struct ctl_table *table, int write, #ifdef CONFIG_BPF_JIT static int proc_dointvec_minmax_bpf_enable(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, + void *buffer, size_t *lenp, loff_t *ppos) { int ret, jit_enable = *(int *)table->data; @@ -291,8 +286,7 @@ static int proc_dointvec_minmax_bpf_enable(struct ctl_table *table, int write, # ifdef CONFIG_HAVE_EBPF_JIT static int proc_dointvec_minmax_bpf_restricted(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, - loff_t *ppos) + void *buffer, size_t *lenp, loff_t *ppos) { if (!capable(CAP_SYS_ADMIN)) return -EPERM; @@ -303,8 +297,7 @@ proc_dointvec_minmax_bpf_restricted(struct ctl_table *table, int write, static int proc_dolongvec_minmax_bpf_restricted(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, - loff_t *ppos) + void *buffer, size_t *lenp, loff_t *ppos) { if (!capable(CAP_SYS_ADMIN)) return -EPERM; diff --git a/net/decnet/dn_dev.c b/net/decnet/dn_dev.c index cca7ae712995..65abcf1b3210 100644 --- a/net/decnet/dn_dev.c +++ b/net/decnet/dn_dev.c @@ -160,8 +160,8 @@ static int max_t3[] = { 8191 }; /* Must fit in 16 bits when multiplied by BCT3MU static int min_priority[1]; static int max_priority[] = { 127 }; /* From DECnet spec */ -static int dn_forwarding_proc(struct ctl_table *, int, - void __user *, size_t *, loff_t *); +static int dn_forwarding_proc(struct ctl_table *, int, void *, size_t *, + loff_t *); static struct dn_dev_sysctl_table { struct ctl_table_header *sysctl_header; struct ctl_table dn_dev_vars[5]; @@ -245,8 +245,7 @@ static void dn_dev_sysctl_unregister(struct dn_dev_parms *parms) } static int dn_forwarding_proc(struct ctl_table *table, int write, - void __user *buffer, - size_t *lenp, loff_t *ppos) + void *buffer, size_t *lenp, loff_t *ppos) { #ifdef CONFIG_DECNET_ROUTER struct net_device *dev = table->extra1; diff --git a/net/decnet/sysctl_net_decnet.c b/net/decnet/sysctl_net_decnet.c index 55bf64a22b59..deae519bdeec 100644 --- a/net/decnet/sysctl_net_decnet.c +++ b/net/decnet/sysctl_net_decnet.c @@ -134,8 +134,7 @@ static int parse_addr(__le16 *addr, char *str) } static int dn_node_address_handler(struct ctl_table *table, int write, - void __user *buffer, - size_t *lenp, loff_t *ppos) + void *buffer, size_t *lenp, loff_t *ppos) { char addr[DN_ASCBUF_LEN]; size_t len; @@ -148,10 +147,7 @@ static int dn_node_address_handler(struct ctl_table *table, int write, if (write) { len = (*lenp < DN_ASCBUF_LEN) ? 
*lenp : (DN_ASCBUF_LEN-1); - - if (copy_from_user(addr, buffer, len)) - return -EFAULT; - + memcpy(addr, buffer, len); addr[len] = 0; strip_it(addr); @@ -173,11 +169,9 @@ static int dn_node_address_handler(struct ctl_table *table, int write, len = strlen(addr); addr[len++] = '\n'; - if (len > *lenp) len = *lenp; - - if (copy_to_user(buffer, addr, len)) - return -EFAULT; - + if (len > *lenp) + len = *lenp; + memcpy(buffer, addr, len); *lenp = len; *ppos += len; @@ -185,8 +179,7 @@ static int dn_node_address_handler(struct ctl_table *table, int write, } static int dn_def_dev_handler(struct ctl_table *table, int write, - void __user *buffer, - size_t *lenp, loff_t *ppos) + void *buffer, size_t *lenp, loff_t *ppos) { size_t len; struct net_device *dev; @@ -201,9 +194,7 @@ static int dn_def_dev_handler(struct ctl_table *table, int write, if (*lenp > 16) return -E2BIG; - if (copy_from_user(devname, buffer, *lenp)) - return -EFAULT; - + memcpy(devname, buffer, *lenp); devname[*lenp] = 0; strip_it(devname); @@ -238,9 +229,7 @@ static int dn_def_dev_handler(struct ctl_table *table, int write, if (len > *lenp) len = *lenp; - if (copy_to_user(buffer, devname, len)) - return -EFAULT; - + memcpy(buffer, devname, len); *lenp = len; *ppos += len; diff --git a/net/ipv4/devinet.c b/net/ipv4/devinet.c index 30fa42f5997d..a118978d222c 100644 --- a/net/ipv4/devinet.c +++ b/net/ipv4/devinet.c @@ -2361,8 +2361,7 @@ static int devinet_conf_ifindex(struct net *net, struct ipv4_devconf *cnf) } static int devinet_conf_proc(struct ctl_table *ctl, int write, - void __user *buffer, - size_t *lenp, loff_t *ppos) + void *buffer, size_t *lenp, loff_t *ppos) { int old_value = *(int *)ctl->data; int ret = proc_dointvec(ctl, write, buffer, lenp, ppos); @@ -2414,8 +2413,7 @@ static int devinet_conf_proc(struct ctl_table *ctl, int write, } static int devinet_sysctl_forward(struct ctl_table *ctl, int write, - void __user *buffer, - size_t *lenp, loff_t *ppos) + void *buffer, size_t *lenp, loff_t *ppos) { int *valp = ctl->data; int val = *valp; @@ -2458,8 +2456,7 @@ static int devinet_sysctl_forward(struct ctl_table *ctl, int write, } static int ipv4_doint_and_flush(struct ctl_table *ctl, int write, - void __user *buffer, - size_t *lenp, loff_t *ppos) + void *buffer, size_t *lenp, loff_t *ppos) { int *valp = ctl->data; int val = *valp; diff --git a/net/ipv4/route.c b/net/ipv4/route.c index 788c69d9bfe0..041f4dcac440 100644 --- a/net/ipv4/route.c +++ b/net/ipv4/route.c @@ -3336,8 +3336,7 @@ static int ip_rt_gc_elasticity __read_mostly = 8; static int ip_min_valid_pmtu __read_mostly = IPV4_MIN_MTU; static int ipv4_sysctl_rtcache_flush(struct ctl_table *__ctl, int write, - void __user *buffer, - size_t *lenp, loff_t *ppos) + void *buffer, size_t *lenp, loff_t *ppos) { struct net *net = (struct net *)__ctl->extra1; diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c index 81b267e990a1..868e317cc324 100644 --- a/net/ipv4/sysctl_net_ipv4.c +++ b/net/ipv4/sysctl_net_ipv4.c @@ -71,8 +71,7 @@ static void set_local_port_range(struct net *net, int range[2]) /* Validate changes from /proc interface. */ static int ipv4_local_port_range(struct ctl_table *table, int write, - void __user *buffer, - size_t *lenp, loff_t *ppos) + void *buffer, size_t *lenp, loff_t *ppos) { struct net *net = container_of(table->data, struct net, ipv4.ip_local_ports.range); @@ -107,7 +106,7 @@ static int ipv4_local_port_range(struct ctl_table *table, int write, /* Validate changes from /proc interface. 
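The read side follows the same rule: output is formatted into a stack buffer and memcpy()ed straight into the kernel-space @buffer, as dn_node_address_handler() above now does. A minimal sketch of that output pattern, with example_value and the read-only policy invented for illustration:

static int example_value;

static int example_show_handler(struct ctl_table *table, int write,
		void *buffer, size_t *lenp, loff_t *ppos)
{
	char tmp[32];
	size_t len;

	if (write)
		return -EPERM;		/* read-only in this sketch */
	if (*ppos || !*lenp) {		/* nothing (more) to read */
		*lenp = 0;
		return 0;
	}
	len = scnprintf(tmp, sizeof(tmp), "%d\n", example_value);
	if (len > *lenp)
		len = *lenp;
	memcpy(buffer, tmp, len);	/* was: copy_to_user() */
	*lenp = len;
	*ppos += len;
	return 0;
}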
*/ static int ipv4_privileged_ports(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, loff_t *ppos) + void *buffer, size_t *lenp, loff_t *ppos) { struct net *net = container_of(table->data, struct net, ipv4.sysctl_ip_prot_sock); @@ -168,8 +167,7 @@ static void set_ping_group_range(struct ctl_table *table, kgid_t low, kgid_t hig /* Validate changes from /proc interface. */ static int ipv4_ping_group_range(struct ctl_table *table, int write, - void __user *buffer, - size_t *lenp, loff_t *ppos) + void *buffer, size_t *lenp, loff_t *ppos) { struct user_namespace *user_ns = current_user_ns(); int ret; @@ -204,8 +202,7 @@ static int ipv4_ping_group_range(struct ctl_table *table, int write, } static int ipv4_fwd_update_priority(struct ctl_table *table, int write, - void __user *buffer, - size_t *lenp, loff_t *ppos) + void *buffer, size_t *lenp, loff_t *ppos) { struct net *net; int ret; @@ -221,7 +218,7 @@ static int ipv4_fwd_update_priority(struct ctl_table *table, int write, } static int proc_tcp_congestion_control(struct ctl_table *ctl, int write, - void __user *buffer, size_t *lenp, loff_t *ppos) + void *buffer, size_t *lenp, loff_t *ppos) { struct net *net = container_of(ctl->data, struct net, ipv4.tcp_congestion_control); @@ -241,9 +238,8 @@ static int proc_tcp_congestion_control(struct ctl_table *ctl, int write, } static int proc_tcp_available_congestion_control(struct ctl_table *ctl, - int write, - void __user *buffer, size_t *lenp, - loff_t *ppos) + int write, void *buffer, + size_t *lenp, loff_t *ppos) { struct ctl_table tbl = { .maxlen = TCP_CA_BUF_MAX, }; int ret; @@ -258,9 +254,8 @@ static int proc_tcp_available_congestion_control(struct ctl_table *ctl, } static int proc_allowed_congestion_control(struct ctl_table *ctl, - int write, - void __user *buffer, size_t *lenp, - loff_t *ppos) + int write, void *buffer, + size_t *lenp, loff_t *ppos) { struct ctl_table tbl = { .maxlen = TCP_CA_BUF_MAX }; int ret; @@ -296,8 +291,7 @@ static int sscanf_key(char *buf, __le32 *key) } static int proc_tcp_fastopen_key(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, - loff_t *ppos) + void *buffer, size_t *lenp, loff_t *ppos) { struct net *net = container_of(table->data, struct net, ipv4.sysctl_tcp_fastopen); @@ -399,7 +393,7 @@ static void proc_configure_early_demux(int enabled, int protocol) } static int proc_tcp_early_demux(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, loff_t *ppos) + void *buffer, size_t *lenp, loff_t *ppos) { int ret = 0; @@ -415,7 +409,7 @@ static int proc_tcp_early_demux(struct ctl_table *table, int write, } static int proc_udp_early_demux(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, loff_t *ppos) + void *buffer, size_t *lenp, loff_t *ppos) { int ret = 0; @@ -431,8 +425,7 @@ static int proc_udp_early_demux(struct ctl_table *table, int write, } static int proc_tfo_blackhole_detect_timeout(struct ctl_table *table, - int write, - void __user *buffer, + int write, void *buffer, size_t *lenp, loff_t *ppos) { struct net *net = container_of(table->data, struct net, @@ -447,8 +440,7 @@ static int proc_tfo_blackhole_detect_timeout(struct ctl_table *table, } static int proc_tcp_available_ulp(struct ctl_table *ctl, - int write, - void __user *buffer, size_t *lenp, + int write, void *buffer, size_t *lenp, loff_t *ppos) { struct ctl_table tbl = { .maxlen = TCP_ULP_BUF_MAX, }; @@ -466,7 +458,7 @@ static int proc_tcp_available_ulp(struct ctl_table *ctl, #ifdef CONFIG_IP_ROUTE_MULTIPATH static 
int proc_fib_multipath_hash_policy(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, + void *buffer, size_t *lenp, loff_t *ppos) { struct net *net = container_of(table->data, struct net, diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c index 24e319dfb510..9d0e89bccb90 100644 --- a/net/ipv6/addrconf.c +++ b/net/ipv6/addrconf.c @@ -6108,9 +6108,8 @@ static void ipv6_ifa_notify(int event, struct inet6_ifaddr *ifp) #ifdef CONFIG_SYSCTL -static -int addrconf_sysctl_forward(struct ctl_table *ctl, int write, - void __user *buffer, size_t *lenp, loff_t *ppos) +static int addrconf_sysctl_forward(struct ctl_table *ctl, int write, + void *buffer, size_t *lenp, loff_t *ppos) { int *valp = ctl->data; int val = *valp; @@ -6134,9 +6133,8 @@ int addrconf_sysctl_forward(struct ctl_table *ctl, int write, return ret; } -static -int addrconf_sysctl_mtu(struct ctl_table *ctl, int write, - void __user *buffer, size_t *lenp, loff_t *ppos) +static int addrconf_sysctl_mtu(struct ctl_table *ctl, int write, + void *buffer, size_t *lenp, loff_t *ppos) { struct inet6_dev *idev = ctl->extra1; int min_mtu = IPV6_MIN_MTU; @@ -6206,9 +6204,8 @@ static int addrconf_disable_ipv6(struct ctl_table *table, int *p, int newf) return 0; } -static -int addrconf_sysctl_disable(struct ctl_table *ctl, int write, - void __user *buffer, size_t *lenp, loff_t *ppos) +static int addrconf_sysctl_disable(struct ctl_table *ctl, int write, + void *buffer, size_t *lenp, loff_t *ppos) { int *valp = ctl->data; int val = *valp; @@ -6232,9 +6229,8 @@ int addrconf_sysctl_disable(struct ctl_table *ctl, int write, return ret; } -static -int addrconf_sysctl_proxy_ndp(struct ctl_table *ctl, int write, - void __user *buffer, size_t *lenp, loff_t *ppos) +static int addrconf_sysctl_proxy_ndp(struct ctl_table *ctl, int write, + void *buffer, size_t *lenp, loff_t *ppos) { int *valp = ctl->data; int ret; @@ -6275,7 +6271,7 @@ int addrconf_sysctl_proxy_ndp(struct ctl_table *ctl, int write, } static int addrconf_sysctl_addr_gen_mode(struct ctl_table *ctl, int write, - void __user *buffer, size_t *lenp, + void *buffer, size_t *lenp, loff_t *ppos) { int ret = 0; @@ -6337,7 +6333,7 @@ out: } static int addrconf_sysctl_stable_secret(struct ctl_table *ctl, int write, - void __user *buffer, size_t *lenp, + void *buffer, size_t *lenp, loff_t *ppos) { int err; @@ -6404,8 +6400,7 @@ out: static int addrconf_sysctl_ignore_routes_with_linkdown(struct ctl_table *ctl, - int write, - void __user *buffer, + int write, void *buffer, size_t *lenp, loff_t *ppos) { @@ -6505,10 +6500,8 @@ int addrconf_disable_policy(struct ctl_table *ctl, int *valp, int val) return 0; } -static -int addrconf_sysctl_disable_policy(struct ctl_table *ctl, int write, - void __user *buffer, size_t *lenp, - loff_t *ppos) +static int addrconf_sysctl_disable_policy(struct ctl_table *ctl, int write, + void *buffer, size_t *lenp, loff_t *ppos) { int *valp = ctl->data; int val = *valp; diff --git a/net/ipv6/ndisc.c b/net/ipv6/ndisc.c index 1ecd4e9b0bdf..58f1255295d3 100644 --- a/net/ipv6/ndisc.c +++ b/net/ipv6/ndisc.c @@ -1835,7 +1835,8 @@ static void ndisc_warn_deprecated_sysctl(struct ctl_table *ctl, } } -int ndisc_ifinfo_sysctl_change(struct ctl_table *ctl, int write, void __user *buffer, size_t *lenp, loff_t *ppos) +int ndisc_ifinfo_sysctl_change(struct ctl_table *ctl, int write, void *buffer, + size_t *lenp, loff_t *ppos) { struct net_device *dev = ctl->extra1; struct inet6_dev *idev; diff --git a/net/ipv6/route.c b/net/ipv6/route.c index 310cbddaa533..acdb31e38412 100644 
--- a/net/ipv6/route.c +++ b/net/ipv6/route.c @@ -6088,9 +6088,8 @@ static int rt6_stats_seq_show(struct seq_file *seq, void *v) #ifdef CONFIG_SYSCTL -static -int ipv6_sysctl_rtcache_flush(struct ctl_table *ctl, int write, - void __user *buffer, size_t *lenp, loff_t *ppos) +static int ipv6_sysctl_rtcache_flush(struct ctl_table *ctl, int write, + void *buffer, size_t *lenp, loff_t *ppos) { struct net *net; int delay; diff --git a/net/ipv6/sysctl_net_ipv6.c b/net/ipv6/sysctl_net_ipv6.c index 63b657aa8d29..fac2135aa47b 100644 --- a/net/ipv6/sysctl_net_ipv6.c +++ b/net/ipv6/sysctl_net_ipv6.c @@ -26,8 +26,7 @@ static int auto_flowlabels_min; static int auto_flowlabels_max = IP6_AUTO_FLOW_LABEL_MAX; static int proc_rt6_multipath_hash_policy(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, - loff_t *ppos) + void *buffer, size_t *lenp, loff_t *ppos) { struct net *net; int ret; diff --git a/net/mpls/af_mpls.c b/net/mpls/af_mpls.c index 4701edffb1f7..a42e4ed5ab0e 100644 --- a/net/mpls/af_mpls.c +++ b/net/mpls/af_mpls.c @@ -1362,8 +1362,7 @@ done: (&((struct mpls_dev *)0)->field) static int mpls_conf_proc(struct ctl_table *ctl, int write, - void __user *buffer, - size_t *lenp, loff_t *ppos) + void *buffer, size_t *lenp, loff_t *ppos) { int oval = *(int *)ctl->data; int ret = proc_dointvec(ctl, write, buffer, lenp, ppos); @@ -2594,7 +2593,7 @@ nolabels: } static int mpls_platform_labels(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, loff_t *ppos) + void *buffer, size_t *lenp, loff_t *ppos) { struct net *net = table->data; int platform_labels = net->mpls.platform_labels; diff --git a/net/netfilter/ipvs/ip_vs_ctl.c b/net/netfilter/ipvs/ip_vs_ctl.c index 8d14a1acbc37..412656c34f20 100644 --- a/net/netfilter/ipvs/ip_vs_ctl.c +++ b/net/netfilter/ipvs/ip_vs_ctl.c @@ -1736,7 +1736,7 @@ static int three = 3; static int proc_do_defense_mode(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, loff_t *ppos) + void *buffer, size_t *lenp, loff_t *ppos) { struct netns_ipvs *ipvs = table->extra2; int *valp = table->data; @@ -1763,7 +1763,7 @@ proc_do_defense_mode(struct ctl_table *table, int write, static int proc_do_sync_threshold(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, loff_t *ppos) + void *buffer, size_t *lenp, loff_t *ppos) { int *valp = table->data; int val[2]; @@ -1788,7 +1788,7 @@ proc_do_sync_threshold(struct ctl_table *table, int write, static int proc_do_sync_ports(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, loff_t *ppos) + void *buffer, size_t *lenp, loff_t *ppos) { int *valp = table->data; int val = *valp; diff --git a/net/netfilter/nf_conntrack_standalone.c b/net/netfilter/nf_conntrack_standalone.c index 9b57330c81f8..31b027b12ff3 100644 --- a/net/netfilter/nf_conntrack_standalone.c +++ b/net/netfilter/nf_conntrack_standalone.c @@ -517,7 +517,7 @@ static unsigned int nf_conntrack_htable_size_user __read_mostly; static int nf_conntrack_hash_sysctl(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, loff_t *ppos) + void *buffer, size_t *lenp, loff_t *ppos) { int ret; diff --git a/net/netfilter/nf_log.c b/net/netfilter/nf_log.c index bb25d4c794c7..6cb9f9474b05 100644 --- a/net/netfilter/nf_log.c +++ b/net/netfilter/nf_log.c @@ -414,7 +414,7 @@ static struct ctl_table nf_log_sysctl_ftable[] = { }; static int nf_log_proc_dostring(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, loff_t *ppos) + void *buffer, size_t *lenp, loff_t 
*ppos) { const struct nf_logger *logger; char buf[NFLOGGER_NAME_LEN]; diff --git a/net/phonet/sysctl.c b/net/phonet/sysctl.c index 251e750fd9aa..0d0bf41381c2 100644 --- a/net/phonet/sysctl.c +++ b/net/phonet/sysctl.c @@ -49,8 +49,7 @@ void phonet_get_local_port_range(int *min, int *max) } static int proc_local_port_range(struct ctl_table *table, int write, - void __user *buffer, - size_t *lenp, loff_t *ppos) + void *buffer, size_t *lenp, loff_t *ppos) { int ret; int range[2] = {local_port_range[0], local_port_range[1]}; diff --git a/net/rds/tcp.c b/net/rds/tcp.c index 66121bc6f34e..46782fac4c16 100644 --- a/net/rds/tcp.c +++ b/net/rds/tcp.c @@ -62,8 +62,7 @@ static atomic_t rds_tcp_unloading = ATOMIC_INIT(0); static struct kmem_cache *rds_tcp_conn_slab; static int rds_tcp_skbuf_handler(struct ctl_table *ctl, int write, - void __user *buffer, size_t *lenp, - loff_t *fpos); + void *buffer, size_t *lenp, loff_t *fpos); static int rds_tcp_min_sndbuf = SOCK_MIN_SNDBUF; static int rds_tcp_min_rcvbuf = SOCK_MIN_RCVBUF; @@ -676,8 +675,7 @@ static void rds_tcp_sysctl_reset(struct net *net) } static int rds_tcp_skbuf_handler(struct ctl_table *ctl, int write, - void __user *buffer, size_t *lenp, - loff_t *fpos) + void *buffer, size_t *lenp, loff_t *fpos) { struct net *net = current->nsproxy->net_ns; int err; diff --git a/net/sctp/sysctl.c b/net/sctp/sysctl.c index 4740aa70e652..c16c80963e55 100644 --- a/net/sctp/sysctl.c +++ b/net/sctp/sysctl.c @@ -43,20 +43,15 @@ static unsigned long max_autoclose_max = ? UINT_MAX : MAX_SCHEDULE_TIMEOUT / HZ; static int proc_sctp_do_hmac_alg(struct ctl_table *ctl, int write, - void __user *buffer, size_t *lenp, - loff_t *ppos); + void *buffer, size_t *lenp, loff_t *ppos); static int proc_sctp_do_rto_min(struct ctl_table *ctl, int write, - void __user *buffer, size_t *lenp, - loff_t *ppos); -static int proc_sctp_do_rto_max(struct ctl_table *ctl, int write, - void __user *buffer, size_t *lenp, - loff_t *ppos); + void *buffer, size_t *lenp, loff_t *ppos); +static int proc_sctp_do_rto_max(struct ctl_table *ctl, int write, void *buffer, + size_t *lenp, loff_t *ppos); static int proc_sctp_do_alpha_beta(struct ctl_table *ctl, int write, - void __user *buffer, size_t *lenp, - loff_t *ppos); + void *buffer, size_t *lenp, loff_t *ppos); static int proc_sctp_do_auth(struct ctl_table *ctl, int write, - void __user *buffer, size_t *lenp, - loff_t *ppos); + void *buffer, size_t *lenp, loff_t *ppos); static struct ctl_table sctp_table[] = { { @@ -343,8 +338,7 @@ static struct ctl_table sctp_net_table[] = { }; static int proc_sctp_do_hmac_alg(struct ctl_table *ctl, int write, - void __user *buffer, size_t *lenp, - loff_t *ppos) + void *buffer, size_t *lenp, loff_t *ppos) { struct net *net = current->nsproxy->net_ns; struct ctl_table tbl; @@ -389,8 +383,7 @@ static int proc_sctp_do_hmac_alg(struct ctl_table *ctl, int write, } static int proc_sctp_do_rto_min(struct ctl_table *ctl, int write, - void __user *buffer, size_t *lenp, - loff_t *ppos) + void *buffer, size_t *lenp, loff_t *ppos) { struct net *net = current->nsproxy->net_ns; unsigned int min = *(unsigned int *) ctl->extra1; @@ -418,8 +411,7 @@ static int proc_sctp_do_rto_min(struct ctl_table *ctl, int write, } static int proc_sctp_do_rto_max(struct ctl_table *ctl, int write, - void __user *buffer, size_t *lenp, - loff_t *ppos) + void *buffer, size_t *lenp, loff_t *ppos) { struct net *net = current->nsproxy->net_ns; unsigned int min = *(unsigned int *) ctl->extra1; @@ -447,8 +439,7 @@ static int proc_sctp_do_rto_max(struct 
ctl_table *ctl, int write, } static int proc_sctp_do_alpha_beta(struct ctl_table *ctl, int write, - void __user *buffer, size_t *lenp, - loff_t *ppos) + void *buffer, size_t *lenp, loff_t *ppos) { if (write) pr_warn_once("Changing rto_alpha or rto_beta may lead to " @@ -458,8 +449,7 @@ static int proc_sctp_do_alpha_beta(struct ctl_table *ctl, int write, } static int proc_sctp_do_auth(struct ctl_table *ctl, int write, - void __user *buffer, size_t *lenp, - loff_t *ppos) + void *buffer, size_t *lenp, loff_t *ppos) { struct net *net = current->nsproxy->net_ns; struct ctl_table tbl; diff --git a/net/sunrpc/sysctl.c b/net/sunrpc/sysctl.c index d75f17b56f0e..999eee1ed61c 100644 --- a/net/sunrpc/sysctl.c +++ b/net/sunrpc/sysctl.c @@ -60,7 +60,7 @@ rpc_unregister_sysctl(void) } static int proc_do_xprt(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, loff_t *ppos) + void *buffer, size_t *lenp, loff_t *ppos) { char tmpbuf[256]; size_t len; @@ -70,15 +70,15 @@ static int proc_do_xprt(struct ctl_table *table, int write, return 0; } len = svc_print_xprts(tmpbuf, sizeof(tmpbuf)); - return simple_read_from_buffer(buffer, *lenp, ppos, tmpbuf, len); + return memory_read_from_buffer(buffer, *lenp, ppos, tmpbuf, len); } static int -proc_dodebug(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, loff_t *ppos) +proc_dodebug(struct ctl_table *table, int write, void *buffer, size_t *lenp, + loff_t *ppos) { - char tmpbuf[20], c, *s = NULL; - char __user *p; + char tmpbuf[20], *s = NULL; + char *p; unsigned int value; size_t left, len; @@ -90,18 +90,17 @@ proc_dodebug(struct ctl_table *table, int write, left = *lenp; if (write) { - if (!access_ok(buffer, left)) - return -EFAULT; p = buffer; - while (left && __get_user(c, p) >= 0 && isspace(c)) - left--, p++; + while (left && isspace(*p)) { + left--; + p++; + } if (!left) goto done; if (left > sizeof(tmpbuf) - 1) return -EINVAL; - if (copy_from_user(tmpbuf, p, left)) - return -EFAULT; + memcpy(tmpbuf, p, left); tmpbuf[left] = '\0'; value = simple_strtol(tmpbuf, &s, 0); @@ -121,11 +120,9 @@ proc_dodebug(struct ctl_table *table, int write, len = sprintf(tmpbuf, "0x%04x", *(unsigned int *) table->data); if (len > left) len = left; - if (copy_to_user(buffer, tmpbuf, len)) - return -EFAULT; + memcpy(buffer, tmpbuf, len); if ((left -= len) > 0) { - if (put_user('\n', (char __user *)buffer + len)) - return -EFAULT; + *((char *)buffer + len) = '\n'; left--; } } diff --git a/net/sunrpc/xprtrdma/svc_rdma.c b/net/sunrpc/xprtrdma/svc_rdma.c index 97bca509a391..526da5d4710b 100644 --- a/net/sunrpc/xprtrdma/svc_rdma.c +++ b/net/sunrpc/xprtrdma/svc_rdma.c @@ -80,8 +80,7 @@ atomic_t rdma_stat_sq_prod; * current value. 
*/ static int read_reset_stat(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, - loff_t *ppos) + void *buffer, size_t *lenp, loff_t *ppos) { atomic_t *stat = (atomic_t *)table->data; @@ -103,8 +102,8 @@ static int read_reset_stat(struct ctl_table *table, int write, len -= *ppos; if (len > *lenp) len = *lenp; - if (len && copy_to_user(buffer, str_buf, len)) - return -EFAULT; + if (len) + memcpy(buffer, str_buf, len); *lenp = len; *ppos += len; } diff --git a/security/apparmor/lsm.c b/security/apparmor/lsm.c index b621ad74f54a..27e371b44dad 100644 --- a/security/apparmor/lsm.c +++ b/security/apparmor/lsm.c @@ -1696,7 +1696,7 @@ static int __init alloc_buffers(void) #ifdef CONFIG_SYSCTL static int apparmor_dointvec(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, loff_t *ppos) + void *buffer, size_t *lenp, loff_t *ppos) { if (!policy_admin_capable(NULL)) return -EPERM; diff --git a/security/min_addr.c b/security/min_addr.c index 94d2b0cf0e7b..88c9a6a21f47 100644 --- a/security/min_addr.c +++ b/security/min_addr.c @@ -30,7 +30,7 @@ static void update_mmap_min_addr(void) * calls update_mmap_min_addr() so non MAP_FIXED hints get rounded properly */ int mmap_min_addr_handler(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, loff_t *ppos) + void *buffer, size_t *lenp, loff_t *ppos) { int ret; diff --git a/security/yama/yama_lsm.c b/security/yama/yama_lsm.c index 94dc346370b1..536c99646f6a 100644 --- a/security/yama/yama_lsm.c +++ b/security/yama/yama_lsm.c @@ -430,7 +430,7 @@ static struct security_hook_list yama_hooks[] __lsm_ro_after_init = { #ifdef CONFIG_SYSCTL static int yama_dointvec_minmax(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, loff_t *ppos) + void *buffer, size_t *lenp, loff_t *ppos) { struct ctl_table table_copy; -- cgit v1.2.3 From ddc465936643108d5ba61f88594a2868d6a156ab Mon Sep 17 00:00:00 2001 From: Jonathan Neuschäfer Date: Thu, 5 Mar 2020 23:22:55 +0100 Subject: Revert "rculist: Describe variadic macro argument in a Sphinx-compatible way" MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This reverts commit f452ee096d95482892b101bde4fd037fa025d3cc. The workaround became unnecessary with commit 43756e347f21 ("scripts/kernel-doc: Add support for named variable macro arguments"). Signed-off-by: Jonathan Neuschäfer Signed-off-by: Paul E. McKenney --- include/linux/rculist.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/rculist.h b/include/linux/rculist.h index 8214cdc715f2..7375bb3da140 100644 --- a/include/linux/rculist.h +++ b/include/linux/rculist.h @@ -371,7 +371,7 @@ static inline void list_splice_tail_init_rcu(struct list_head *list, * @pos: the type * to use as a loop cursor. * @head: the head for your list. * @member: the name of the list_head within the struct. - * @cond...: optional lockdep expression if called from non-RCU protection. + * @cond: optional lockdep expression if called from non-RCU protection. * * This list-traversal primitive may safely run concurrently with * the _rcu list-mutation primitives such as list_add_rcu() @@ -646,7 +646,7 @@ static inline void hlist_add_behind_rcu(struct hlist_node *n, * @pos: the type * to use as a loop cursor. * @head: the head for your list. * @member: the name of the hlist_node within the struct. - * @cond...: optional lockdep expression if called from non-RCU protection. 
+ * @cond: optional lockdep expression if called from non-RCU protection. * * This list-traversal primitive may safely run concurrently with * the _rcu list-mutation primitives such as hlist_add_head_rcu() -- cgit v1.2.3 From 6be7436d2245d3dd8b9a8f949367c13841c23308 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Fri, 10 Apr 2020 13:47:41 -0700 Subject: rcu: Add rcu_gp_might_be_stalled() This commit adds rcu_gp_might_be_stalled(), which returns true if there is some reason to believe that the RCU grace period is stalled. The use case is where an RCU free-memory path needs to allocate memory in order to free it, a situation that should be avoided where possible. But where it is necessary, there is always the alternative of using synchronize_rcu() to wait for a grace period in order to avoid the allocation. And if the grace period is stalled, allocating memory to asynchronously wait for it is a bad idea of epic proportions: Far better to let others use the memory, because these others might actually be able to free that memory before the grace period ends. Thus, rcu_gp_might_be_stalled() can be used to help decide whether allocating memory on an RCU free path is a semi-reasonable course of action. Cc: Joel Fernandes Cc: Uladzislau Rezki Signed-off-by: Paul E. McKenney --- include/linux/rcutiny.h | 1 + include/linux/rcutree.h | 1 + kernel/rcu/tree_stall.h | 40 ++++++++++++++++++++++++++++++++++++---- 3 files changed, 38 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/rcutiny.h b/include/linux/rcutiny.h index 045c28b71f4f..dbf5ac439594 100644 --- a/include/linux/rcutiny.h +++ b/include/linux/rcutiny.h @@ -87,6 +87,7 @@ static inline bool rcu_inkernel_boot_has_ended(void) { return true; } static inline bool rcu_is_watching(void) { return true; } static inline void rcu_momentary_dyntick_idle(void) { } static inline void kfree_rcu_scheduler_running(void) { } +static inline bool rcu_gp_might_be_stalled(void) { return false; } /* Avoid RCU read-side critical sections leaking across. */ static inline void rcu_all_qs(void) { barrier(); } diff --git a/include/linux/rcutree.h b/include/linux/rcutree.h index 45f3f66bb04d..fbc26274af4d 100644 --- a/include/linux/rcutree.h +++ b/include/linux/rcutree.h @@ -39,6 +39,7 @@ void rcu_barrier(void); bool rcu_eqs_special_set(int cpu); void rcu_momentary_dyntick_idle(void); void kfree_rcu_scheduler_running(void); +bool rcu_gp_might_be_stalled(void); unsigned long get_state_synchronize_rcu(void); void cond_synchronize_rcu(unsigned long oldstate); diff --git a/kernel/rcu/tree_stall.h b/kernel/rcu/tree_stall.h index 119ed6afd20f..4dede00e2936 100644 --- a/kernel/rcu/tree_stall.h +++ b/kernel/rcu/tree_stall.h @@ -15,10 +15,12 @@ int sysctl_panic_on_rcu_stall __read_mostly; #ifdef CONFIG_PROVE_RCU -#define RCU_STALL_DELAY_DELTA (5 * HZ) +#define RCU_STALL_DELAY_DELTA (5 * HZ) #else -#define RCU_STALL_DELAY_DELTA 0 +#define RCU_STALL_DELAY_DELTA 0 #endif +#define RCU_STALL_MIGHT_DIV 8 +#define RCU_STALL_MIGHT_MIN (2 * HZ) /* Limit-check stall timeouts specified at boottime and runtime. */ int rcu_jiffies_till_stall_check(void) @@ -40,6 +42,36 @@ int rcu_jiffies_till_stall_check(void) } EXPORT_SYMBOL_GPL(rcu_jiffies_till_stall_check); +/** + * rcu_gp_might_be_stalled - Is it likely that the grace period is stalled? + * + * Returns @true if the current grace period is sufficiently old that + * it is reasonable to assume that it might be stalled. 
This can be + useful when deciding whether to allocate memory to enable RCU-mediated + * freeing on the one hand or just invoke synchronize_rcu() on the other. + * The latter is preferable when the grace period is stalled. + * + * Note that sampling of the .gp_start and .gp_seq fields must be done + * carefully to avoid false positives at the beginnings and ends of + * grace periods. + */ +bool rcu_gp_might_be_stalled(void) +{ + unsigned long d = rcu_jiffies_till_stall_check() / RCU_STALL_MIGHT_DIV; + unsigned long j = jiffies; + + if (d < RCU_STALL_MIGHT_MIN) + d = RCU_STALL_MIGHT_MIN; + smp_mb(); // jiffies before .gp_seq to avoid false positives. + if (!rcu_gp_in_progress()) + return false; + // Long delays at this point avoid false positives, but a delay + // of ULONG_MAX/4 jiffies voids your no-false-positive warranty. + smp_mb(); // .gp_seq before second .gp_start + // And ditto here. + return !time_before(j, READ_ONCE(rcu_state.gp_start) + d); +} + /* Don't do RCU CPU stall warnings during long sysrq printouts. */ void rcu_sysrq_start(void) { @@ -104,8 +136,8 @@ static void record_gp_stall_check_time(void) WRITE_ONCE(rcu_state.gp_start, j); j1 = rcu_jiffies_till_stall_check(); - /* Record ->gp_start before ->jiffies_stall. */ - smp_store_release(&rcu_state.jiffies_stall, j + j1); /* ^^^ */ + smp_mb(); // ->gp_start before ->jiffies_stall and caller's ->gp_seq. + WRITE_ONCE(rcu_state.jiffies_stall, j + j1); rcu_state.jiffies_resched = j + j1 / 2; rcu_state.n_force_qs_gpstart = READ_ONCE(rcu_state.n_force_qs); } -- cgit v1.2.3
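To make the intended use concrete, here is a minimal sketch of a free path that consults the new helper before allocating; struct my_obj, struct batch_rec, and queue_for_async_free() are invented for illustration and are not part of this commit:

/* Hypothetical free path: do not allocate behind a stalled grace period. */
static void example_free_object(struct my_obj *p)
{
	struct batch_rec *rec = NULL;

	if (!rcu_gp_might_be_stalled())
		rec = kmalloc(sizeof(*rec), GFP_NOWAIT | __GFP_NOWARN);
	if (rec) {
		rec->obj = p;			/* batch for asynchronous freeing */
		queue_for_async_free(rec);	/* invented helper */
	} else {
		synchronize_rcu();		/* wait for the grace period... */
		kfree(p);			/* ...then free synchronously */
	}
}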
From f0bdf6d473cf12a488a78422e15aafdfe77cf853 Mon Sep 17 00:00:00 2001 From: Lai Jiangshan Date: Sat, 15 Feb 2020 14:52:32 -0800 Subject: rcu: Remove unused ->rcu_read_unlock_special.b.deferred_qs field The ->rcu_read_unlock_special.b.deferred_qs field is set to true in rcu_read_unlock_special() but never set to false. This is not particularly useful, so this commit removes this field. The only possible justification for this field is to ease debugging of RCU deferred quiescent states, but the combination of the other ->rcu_read_unlock_special fields plus ->rcu_blocked_node and of course ->rcu_read_lock_nesting should cover debugging needs. And if this last proves incorrect, this patch can always be reverted, along with the required setting of ->rcu_read_unlock_special.b.deferred_qs to false in rcu_preempt_deferred_qs_irqrestore(). Signed-off-by: Lai Jiangshan Signed-off-by: Paul E. McKenney --- include/linux/sched.h | 2 +- kernel/rcu/tree_plugin.h | 1 - 2 files changed, 1 insertion(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/sched.h b/include/linux/sched.h index 4418f5cb8324..a4b727f57095 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -613,7 +613,7 @@ union rcu_special { u8 blocked; u8 need_qs; u8 exp_hint; /* Hint for performance. */ - u8 deferred_qs; + u8 pad; /* No garbage from compiler! */ } b; /* Bits. */ u32 s; /* Set of bits. */ }; diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h index 263c766b9dc1..f31c5992f842 100644 --- a/kernel/rcu/tree_plugin.h +++ b/kernel/rcu/tree_plugin.h @@ -634,7 +634,6 @@ static void rcu_read_unlock_special(struct task_struct *t) irq_work_queue_on(&rdp->defer_qs_iw, rdp->cpu); } } - t->rcu_read_unlock_special.b.deferred_qs = true; local_irq_restore(flags); return; } -- cgit v1.2.3 From 2beaf3280e57bb891f8012dca49c87ed0f01e2f3 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Wed, 11 Mar 2020 14:23:21 -0700 Subject: sched/core: Add function to sample state of locked-down task A running task's state can be sampled in a consistent manner (for example, for diagnostic purposes) simply by invoking smp_call_function_single() on its CPU, which may be obtained using task_cpu(), then having the IPI handler verify that the desired task is in fact still running. However, if the task is not running, this sampling can in theory be done immediately and directly. In practice, the task might start running at any time, including during the sampling period. Gaining a consistent sample of a not-running task therefore requires that something be done to lock down the target task's state. This commit therefore adds a try_invoke_on_locked_down_task() function that invokes a specified function if the specified task can be locked down, returning true if successful and if the specified function returns true. Otherwise this function simply returns false. Given that the function passed to try_invoke_on_locked_down_task() might be invoked with a runqueue lock held, that function had better be quite lightweight. The function is passed the target task's task_struct pointer and the argument passed to try_invoke_on_locked_down_task(), allowing easy access to task state and to a location for further variables to be passed in and out. Note that the specified function will be called even if the specified task is currently running. The function can use ->on_rq and task_curr() to quickly and easily determine the task's state, and can return false if this state is not to the function's liking. The caller of try_invoke_on_locked_down_task() would then see the false return value, and could take appropriate action, for example, trying again later or sending an IPI if matters are more urgent. It is expected that use cases such as the RCU CPU stall warning code will simply return false if the task is currently running. However, there are use cases involving nohz_full CPUs where the specified function might instead fall back to an alternative sampling scheme that relies on heavier synchronization (such as memory barriers) in the target task. Cc: Ingo Molnar Cc: Peter Zijlstra Cc: Juri Lelli Cc: Vincent Guittot Cc: Dietmar Eggemann Cc: Ben Segall Cc: Mel Gorman [ paulmck: Apply feedback from Peter Zijlstra and Steven Rostedt. ] [ paulmck: Invoke if running to handle feedback from Mathieu Desnoyers. ] Reviewed-by: Steven Rostedt (VMware) Reviewed-by: Joel Fernandes (Google) Signed-off-by: Paul E. McKenney --- include/linux/wait.h | 2 ++ kernel/sched/core.c | 48 ++++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 50 insertions(+) (limited to 'include/linux') diff --git a/include/linux/wait.h b/include/linux/wait.h index feeb6be5cad6..898c890fc153 100644 --- a/include/linux/wait.h +++ b/include/linux/wait.h @@ -1149,4 +1149,6 @@ int autoremove_wake_function(struct wait_queue_entry *wq_entry, unsigned mode, i (wait)->flags = 0; \ } while (0) +bool try_invoke_on_locked_down_task(struct task_struct *p, bool (*func)(struct task_struct *t, void *arg), void *arg); + #endif /* _LINUX_WAIT_H */ diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 3a61a3b8eaa9..5ca567adfcb9 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -2566,6 +2566,8 @@ try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags) * * Pairs with the LOCK+smp_mb__after_spinlock() on rq->lock in * __schedule(). See the comment for smp_mb__after_spinlock().
+ * + * A similar smp_rmb() lives in try_invoke_on_locked_down_task(). */ smp_rmb(); if (p->on_rq && ttwu_remote(p, wake_flags)) @@ -2639,6 +2641,52 @@ out: return success; } +/** + * try_invoke_on_locked_down_task - Invoke a function on task in fixed state + * @p: Process for which the function is to be invoked. + * @func: Function to invoke. + * @arg: Argument to function. + * + * If the specified task can be quickly locked into a definite state + * (either sleeping or on a given runqueue), arrange to keep it in that + * state while invoking @func(@arg). This function can use ->on_rq and + * task_curr() to work out what the state is, if required. Given that + * @func can be invoked with a runqueue lock held, it had better be quite + * lightweight. + * + * Returns: + * @false if the task slipped out from under the locks. + * @true if the task was locked onto a runqueue or is sleeping. + * However, @func can override this by returning @false. + */ +bool try_invoke_on_locked_down_task(struct task_struct *p, bool (*func)(struct task_struct *t, void *arg), void *arg) +{ + bool ret = false; + struct rq_flags rf; + struct rq *rq; + + lockdep_assert_irqs_enabled(); + raw_spin_lock_irq(&p->pi_lock); + if (p->on_rq) { + rq = __task_rq_lock(p, &rf); + if (task_rq(p) == rq) + ret = func(p, arg); + rq_unlock(rq, &rf); + } else { + switch (p->state) { + case TASK_RUNNING: + case TASK_WAKING: + break; + default: + smp_rmb(); // See smp_rmb() comment in try_to_wake_up(). + if (!p->on_rq) + ret = func(p, arg); + } + } + raw_spin_unlock_irq(&p->pi_lock); + return ret; +} + /** * wake_up_process - Wake up a specific process * @p: The process to be woken up. -- cgit v1.2.3
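The calling convention is easiest to see from a caller's perspective; a minimal sketch of a diagnostic sampler, where struct state_sample and the retry policy are invented for illustration:

struct state_sample {
	long state;
	int on_rq;
};

/* May run under a runqueue lock, so keep it lightweight. */
static bool sample_task_state(struct task_struct *t, void *arg)
{
	struct state_sample *s = arg;

	if (task_curr(t))
		return false;	/* still running: caller will retry or send an IPI */
	s->state = t->state;
	s->on_rq = t->on_rq;
	return true;
}

static void example_report_state(struct task_struct *p)
{
	struct state_sample s;

	if (try_invoke_on_locked_down_task(p, sample_task_state, &s))
		pr_info("%s: state=%ld on_rq=%d\n", p->comm, s.state, s.on_rq);
	else
		pr_info("%s: running or slipped away, retry later\n", p->comm);
}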
From b3d73156b075014ce5b2609f4f47723d6c0c23d6 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Fri, 6 Mar 2020 13:58:27 -0800 Subject: rcu: Reinstate synchronize_rcu_mult() With the advent and likely usage of synchronize_rcu_rude(), there is again a need to wait on multiple types of RCU grace periods, for example, call_rcu_tasks() and call_rcu_tasks_rude(). This commit therefore reinstates synchronize_rcu_mult() in order to allow these grace periods to be straightforwardly waited on concurrently. Signed-off-by: Paul E. McKenney --- include/linux/rcupdate_wait.h | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) (limited to 'include/linux') diff --git a/include/linux/rcupdate_wait.h b/include/linux/rcupdate_wait.h index c0578ba23c1a..699b938358bf 100644 --- a/include/linux/rcupdate_wait.h +++ b/include/linux/rcupdate_wait.h @@ -31,4 +31,23 @@ do { \ #define wait_rcu_gp(...) _wait_rcu_gp(false, __VA_ARGS__) +/** + * synchronize_rcu_mult - Wait concurrently for multiple grace periods + * @...: List of call_rcu() functions for different grace periods to wait on + * + * This macro waits concurrently for multiple types of RCU grace periods. + * For example, synchronize_rcu_mult(call_rcu, call_rcu_tasks) would wait + * on concurrent RCU and RCU-tasks grace periods. Waiting on a given SRCU + * domain requires you to write a wrapper function for that SRCU domain's + * call_srcu() function, with this wrapper supplying the pointer to the + * corresponding srcu_struct. + * + * The first argument tells Tiny RCU's _wait_rcu_gp() not to + * bother waiting for RCU. The reason for this is that anywhere + * synchronize_rcu_mult() can be called is automatically already a full + * grace period. + */ +#define synchronize_rcu_mult(...) \ + _wait_rcu_gp(IS_ENABLED(CONFIG_TINY_RCU), __VA_ARGS__) + #endif /* _LINUX_SCHED_RCUPDATE_WAIT_H */ -- cgit v1.2.3
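The kernel-doc above calls for a wrapper when an SRCU domain is involved; a minimal sketch of that wrapper, with my_srcu standing in for whatever srcu_struct the caller actually uses:

DEFINE_STATIC_SRCU(my_srcu);	/* stand-in for the caller's SRCU domain */

/* Supplies the srcu_struct pointer that call_srcu() requires. */
static void call_my_srcu(struct rcu_head *head, rcu_callback_t func)
{
	call_srcu(&my_srcu, head, func);
}

static void example_wait_for_all_flavors(void)
{
	/* Waits for normal RCU, RCU-tasks, and my_srcu grace periods. */
	synchronize_rcu_mult(call_rcu, call_rcu_tasks, call_my_srcu);
}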
From 5873b8a94e5dae04b8e11fc798df512614e6d1e7 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Tue, 3 Mar 2020 11:49:21 -0800 Subject: rcu-tasks: Refactor RCU-tasks to allow variants to be added This commit splits out generic processing from RCU-tasks-specific processing in order to allow additional flavors to be added. It also adds a def_bool TASKS_RCU_GENERIC to enable the common RCU-tasks infrastructure code. This is primarily, but not entirely, a code-movement commit. Signed-off-by: Paul E. McKenney --- include/linux/rcupdate.h | 6 +- kernel/rcu/Kconfig | 10 +- kernel/rcu/tasks.h | 491 +++++++++++++++++++++++++---------------- kernel/rcu/update.c | 4 + 4 files changed, 272 insertions(+), 239 deletions(-) (limited to 'include/linux') diff --git a/include/linux/rcupdate.h b/include/linux/rcupdate.h index 2678a37c3169..5523145e0a78 100644 --- a/include/linux/rcupdate.h +++ b/include/linux/rcupdate.h @@ -129,7 +129,7 @@ static inline void rcu_init_nohz(void) { } * Note a quasi-voluntary context switch for RCU-tasks's benefit. * This is a macro rather than an inline function to avoid #include hell. */ -#ifdef CONFIG_TASKS_RCU +#ifdef CONFIG_TASKS_RCU_GENERIC #define rcu_tasks_qs(t) \ do { \ if (READ_ONCE((t)->rcu_tasks_holdout)) \ @@ -140,14 +140,14 @@ void call_rcu_tasks(struct rcu_head *head, rcu_callback_t func); void synchronize_rcu_tasks(void); void exit_tasks_rcu_start(void); void exit_tasks_rcu_finish(void); -#else /* #ifdef CONFIG_TASKS_RCU */ +#else /* #ifdef CONFIG_TASKS_RCU_GENERIC */ #define rcu_tasks_qs(t) do { } while (0) #define rcu_note_voluntary_context_switch(t) do { } while (0) #define call_rcu_tasks call_rcu #define synchronize_rcu_tasks synchronize_rcu static inline void exit_tasks_rcu_start(void) { } static inline void exit_tasks_rcu_finish(void) { } -#endif /* #else #ifdef CONFIG_TASKS_RCU */ +#endif /* #else #ifdef CONFIG_TASKS_RCU_GENERIC */ /** * cond_resched_tasks_rcu_qs - Report potential quiescent states to RCU diff --git a/kernel/rcu/Kconfig b/kernel/rcu/Kconfig index 1cc940fef17c..38475d0bc634 100644 --- a/kernel/rcu/Kconfig +++ b/kernel/rcu/Kconfig @@ -70,13 +70,19 @@ config TREE_SRCU help This option selects the full-fledged version of SRCU. +config TASKS_RCU_GENERIC + def_bool TASKS_RCU + select SRCU + help + This option enables generic infrastructure code supporting + task-based RCU implementations. Not for manual selection. + config TASKS_RCU def_bool PREEMPTION - select SRCU help This option enables a task-based RCU implementation that uses only voluntary context switch (not preemption!), idle, and - user-mode execution as quiescent states. + user-mode execution as quiescent states. Not for manual selection. config RCU_STALL_COMMON def_bool TREE_RCU diff --git a/kernel/rcu/tasks.h b/kernel/rcu/tasks.h index 5ccfe0d64e6a..d77921ee5a6e 100644 --- a/kernel/rcu/tasks.h +++ b/kernel/rcu/tasks.h @@ -5,7 +5,13 @@ * Copyright (C) 2020 Paul E. McKenney */ -#ifdef CONFIG_TASKS_RCU + +//////////////////////////////////////////////////////////////////////// +// +// Generic data structures. + +struct rcu_tasks; +typedef void (*rcu_tasks_gp_func_t)(struct rcu_tasks *rtp); /** * Definition for a Tasks-RCU-like mechanism. @@ -14,6 +20,8 @@ * @cbs_wq: Wait queue allowing new callback to get kthread's attention. * @cbs_lock: Lock protecting callback list. * @kthread_ptr: This flavor's grace-period/callback-invocation kthread. + * @gp_func: This flavor's grace-period-wait function. + * @call_func: This flavor's call_rcu()-equivalent function. */ struct rcu_tasks { struct rcu_head *cbs_head; @@ -21,29 +29,20 @@ struct rcu_tasks { struct wait_queue_head cbs_wq; raw_spinlock_t cbs_lock; struct task_struct *kthread_ptr; + rcu_tasks_gp_func_t gp_func; + call_rcu_func_t call_func; }; -#define DEFINE_RCU_TASKS(name) \ +#define DEFINE_RCU_TASKS(name, gp, call) \ static struct rcu_tasks name = \ { \ .cbs_tail = &name.cbs_head, \ .cbs_wq = __WAIT_QUEUE_HEAD_INITIALIZER(name.cbs_wq), \ .cbs_lock = __RAW_SPIN_LOCK_UNLOCKED(name.cbs_lock), \ + .gp_func = gp, \ + .call_func = call, \ } -/* - * Simple variant of RCU whose quiescent states are voluntary context - * switch, cond_resched_rcu_qs(), user-space execution, and idle. - * As such, grace periods can take one good long time. There are no - * read-side primitives similar to rcu_read_lock() and rcu_read_unlock() - * because this implementation is intended to get the system into a safe - * state for some of the manipulations involved in tracing and the like. - * Finally, this implementation does not support high call_rcu_tasks() - * rates from multiple CPUs. If this is required, per-CPU callback lists - * will be needed. - */ -DEFINE_RCU_TASKS(rcu_tasks); - /* Track exiting tasks in order to allow them to be waited for. */ DEFINE_STATIC_SRCU(tasks_rcu_exit_srcu); @@ -52,29 +51,16 @@ DEFINE_STATIC_SRCU(tasks_rcu_exit_srcu); static int rcu_task_stall_timeout __read_mostly = RCU_TASK_STALL_TIMEOUT; module_param(rcu_task_stall_timeout, int, 0644); -/** - * call_rcu_tasks() - Queue an RCU for invocation task-based grace period - * @rhp: structure to be used for queueing the RCU updates. - * @func: actual callback function to be invoked after the grace period - * - * The callback function will be invoked some time after a full grace - * period elapses, in other words after all currently executing RCU - * read-side critical sections have completed. call_rcu_tasks() assumes - * that the read-side critical sections end at a voluntary context - * switch (not a preemption!), cond_resched_rcu_qs(), entry into idle, - * or transition to usermode execution. As such, there are no read-side - * primitives analogous to rcu_read_lock() and rcu_read_unlock() because - * this primitive is intended to determine that all tasks have passed - * through a safe state, not so much for data-structure synchronization. - * - * See the description of call_rcu() for more detailed information on - * memory ordering guarantees. - */ -void call_rcu_tasks(struct rcu_head *rhp, rcu_callback_t func) +//////////////////////////////////////////////////////////////////////// +// +// Generic code. + +// Enqueue a callback for the specified flavor of Tasks RCU. +static void call_rcu_tasks_generic(struct rcu_head *rhp, rcu_callback_t func, + struct rcu_tasks *rtp) { unsigned long flags; bool needwake; - struct rcu_tasks *rtp = &rcu_tasks; rhp->next = NULL; rhp->func = func; @@ -87,108 +73,25 @@ void call_rcu_tasks(struct rcu_head *rhp, rcu_callback_t func) if (needwake && READ_ONCE(rtp->kthread_ptr)) wake_up(&rtp->cbs_wq); } -EXPORT_SYMBOL_GPL(call_rcu_tasks); -/** - * synchronize_rcu_tasks - wait until an rcu-tasks grace period has elapsed. - * - * Control will return to the caller some time after a full rcu-tasks - * grace period has elapsed, in other words after all currently - * executing rcu-tasks read-side critical sections have elapsed.
These - * read-side critical sections are delimited by calls to schedule(), - * cond_resched_tasks_rcu_qs(), idle execution, userspace execution, calls - * to synchronize_rcu_tasks(), and (in theory, anyway) cond_resched(). - * - * This is a very specialized primitive, intended only for a few uses in - * tracing and other situations requiring manipulation of function - * preambles and profiling hooks. The synchronize_rcu_tasks() function - * is not (yet) intended for heavy use from multiple CPUs. - * - * Note that this guarantee implies further memory-ordering guarantees. - * On systems with more than one CPU, when synchronize_rcu_tasks() returns, - * each CPU is guaranteed to have executed a full memory barrier since the - * end of its last RCU-tasks read-side critical section whose beginning - * preceded the call to synchronize_rcu_tasks(). In addition, each CPU - * having an RCU-tasks read-side critical section that extends beyond - * the return from synchronize_rcu_tasks() is guaranteed to have executed - * a full memory barrier after the beginning of synchronize_rcu_tasks() - * and before the beginning of that RCU-tasks read-side critical section. - * Note that these guarantees include CPUs that are offline, idle, or - * executing in user mode, as well as CPUs that are executing in the kernel. - * - * Furthermore, if CPU A invoked synchronize_rcu_tasks(), which returned - * to its caller on CPU B, then both CPU A and CPU B are guaranteed - * to have executed a full memory barrier during the execution of - * synchronize_rcu_tasks() -- even if CPU A and CPU B are the same CPU - * (but again only if the system has more than one CPU). - */ -void synchronize_rcu_tasks(void) +// Wait for a grace period for the specified flavor of Tasks RCU. +static void synchronize_rcu_tasks_generic(struct rcu_tasks *rtp) { /* Complain if the scheduler has not started. */ RCU_LOCKDEP_WARN(rcu_scheduler_active == RCU_SCHEDULER_INACTIVE, "synchronize_rcu_tasks called too soon"); /* Wait for the grace period. */ - wait_rcu_gp(call_rcu_tasks); -} -EXPORT_SYMBOL_GPL(synchronize_rcu_tasks); - -/** - * rcu_barrier_tasks - Wait for in-flight call_rcu_tasks() callbacks. - * - * Although the current implementation is guaranteed to wait, it is not - * obligated to, for example, if there are no pending callbacks. - */ -void rcu_barrier_tasks(void) -{ - /* There is only one callback queue, so this is easy. ;-) */ - synchronize_rcu_tasks(); -} -EXPORT_SYMBOL_GPL(rcu_barrier_tasks); - -/* See if tasks are still holding out, complain if so. */ -static void check_holdout_task(struct task_struct *t, - bool needreport, bool *firstreport) -{ - int cpu; - - if (!READ_ONCE(t->rcu_tasks_holdout) || - t->rcu_tasks_nvcsw != READ_ONCE(t->nvcsw) || - !READ_ONCE(t->on_rq) || - (IS_ENABLED(CONFIG_NO_HZ_FULL) && - !is_idle_task(t) && t->rcu_tasks_idle_cpu >= 0)) { - WRITE_ONCE(t->rcu_tasks_holdout, false); - list_del_init(&t->rcu_tasks_holdout_list); - put_task_struct(t); - return; - } - rcu_request_urgent_qs_task(t); - if (!needreport) - return; - if (*firstreport) { - pr_err("INFO: rcu_tasks detected stalls on tasks:\n"); - *firstreport = false; - } - cpu = task_cpu(t); - pr_alert("%p: %c%c nvcsw: %lu/%lu holdout: %d idle_cpu: %d/%d\n", - t, ".I"[is_idle_task(t)], - "N."[cpu < 0 || !tick_nohz_full_cpu(cpu)], - t->rcu_tasks_nvcsw, t->nvcsw, t->rcu_tasks_holdout, - t->rcu_tasks_idle_cpu, cpu); - sched_show_task(t); + wait_rcu_gp(rtp->call_func); } /* RCU-tasks kthread that detects grace periods and invokes callbacks. 
*/ static int __noreturn rcu_tasks_kthread(void *arg) { unsigned long flags; - struct task_struct *g, *t; - unsigned long lastreport; struct rcu_head *list; struct rcu_head *next; - LIST_HEAD(rcu_tasks_holdouts); struct rcu_tasks *rtp = arg; - int fract; /* Run on housekeeping CPUs by default. Sysadm can move if desired. */ housekeeping_affine(current, HK_FLAG_RCU); @@ -220,111 +123,8 @@ static int __noreturn rcu_tasks_kthread(void *arg) continue; } - /* - * Wait for all pre-existing t->on_rq and t->nvcsw - * transitions to complete. Invoking synchronize_rcu() - * suffices because all these transitions occur with - * interrupts disabled. Without this synchronize_rcu(), - * a read-side critical section that started before the - * grace period might be incorrectly seen as having started - * after the grace period. - * - * This synchronize_rcu() also dispenses with the - * need for a memory barrier on the first store to - * t->rcu_tasks_holdout, as it forces the store to happen - * after the beginning of the grace period. - */ - synchronize_rcu(); - - /* - * There were callbacks, so we need to wait for an - * RCU-tasks grace period. Start off by scanning - * the task list for tasks that are not already - * voluntarily blocked. Mark these tasks and make - * a list of them in rcu_tasks_holdouts. - */ - rcu_read_lock(); - for_each_process_thread(g, t) { - if (t != current && READ_ONCE(t->on_rq) && - !is_idle_task(t)) { - get_task_struct(t); - t->rcu_tasks_nvcsw = READ_ONCE(t->nvcsw); - WRITE_ONCE(t->rcu_tasks_holdout, true); - list_add(&t->rcu_tasks_holdout_list, - &rcu_tasks_holdouts); - } - } - rcu_read_unlock(); - - /* - * Wait for tasks that are in the process of exiting. - * This does only part of the job, ensuring that all - * tasks that were previously exiting reach the point - * where they have disabled preemption, allowing the - * later synchronize_rcu() to finish the job. - */ - synchronize_srcu(&tasks_rcu_exit_srcu); - - /* - * Each pass through the following loop scans the list - * of holdout tasks, removing any that are no longer - * holdouts. When the list is empty, we are done. - */ - lastreport = jiffies; - - /* Start off with HZ/10 wait and slowly back off to 1 HZ wait*/ - fract = 10; - - for (;;) { - bool firstreport; - bool needreport; - int rtst; - struct task_struct *t1; - - if (list_empty(&rcu_tasks_holdouts)) - break; - - /* Slowly back off waiting for holdouts */ - schedule_timeout_interruptible(HZ/fract); - - if (fract > 1) - fract--; - - rtst = READ_ONCE(rcu_task_stall_timeout); - needreport = rtst > 0 && - time_after(jiffies, lastreport + rtst); - if (needreport) - lastreport = jiffies; - firstreport = true; - WARN_ON(signal_pending(current)); - list_for_each_entry_safe(t, t1, &rcu_tasks_holdouts, - rcu_tasks_holdout_list) { - check_holdout_task(t, needreport, &firstreport); - cond_resched(); - } - } - - /* - * Because ->on_rq and ->nvcsw are not guaranteed - * to have a full memory barriers prior to them in the - * schedule() path, memory reordering on other CPUs could - * cause their RCU-tasks read-side critical sections to - * extend past the end of the grace period. However, - * because these ->nvcsw updates are carried out with - * interrupts disabled, we can use synchronize_rcu() - * to force the needed ordering on all such CPUs. - * - * This synchronize_rcu() also confines all - * ->rcu_tasks_holdout accesses to be within the grace - * period, avoiding the need for memory barriers for - * ->rcu_tasks_holdout accesses. 
- * - * In addition, this synchronize_rcu() waits for exiting - * tasks to complete their final preempt_disable() region - * of execution, cleaning up after the synchronize_srcu() - * above. - */ - synchronize_rcu(); + // Wait for one grace period. + rtp->gp_func(rtp); /* Invoke the callbacks. */ while (list) { @@ -340,18 +140,16 @@ static int __noreturn rcu_tasks_kthread(void *arg) } } -/* Spawn rcu_tasks_kthread() at core_initcall() time. */ -static int __init rcu_spawn_tasks_kthread(void) +/* Spawn RCU-tasks grace-period kthread, e.g., at core_initcall() time. */ +static void __init rcu_spawn_tasks_kthread_generic(struct rcu_tasks *rtp) { struct task_struct *t; - t = kthread_run(rcu_tasks_kthread, &rcu_tasks, "rcu_tasks_kthread"); + t = kthread_run(rcu_tasks_kthread, rtp, "rcu_tasks_kthread"); if (WARN_ONCE(IS_ERR(t), "%s: Could not start Tasks-RCU grace-period kthread, OOM is now expected behavior\n", __func__)) - return 0; + return; smp_mb(); /* Ensure others see full kthread. */ - return 0; } -core_initcall(rcu_spawn_tasks_kthread); /* Do the srcu_read_lock() for the above synchronize_srcu(). */ void exit_tasks_rcu_start(void) __acquires(&tasks_rcu_exit_srcu) @@ -369,8 +167,6 @@ void exit_tasks_rcu_finish(void) __releases(&tasks_rcu_exit_srcu) preempt_enable(); } -#endif /* #ifdef CONFIG_TASKS_RCU */ - #ifndef CONFIG_TINY_RCU /* @@ -387,3 +183,230 @@ static void __init rcu_tasks_bootup_oddness(void) } #endif /* #ifndef CONFIG_TINY_RCU */ + +#ifdef CONFIG_TASKS_RCU + +//////////////////////////////////////////////////////////////////////// +// +// Simple variant of RCU whose quiescent states are voluntary context +// switch, cond_resched_rcu_qs(), user-space execution, and idle. +// As such, grace periods can take one good long time. There are no +// read-side primitives similar to rcu_read_lock() and rcu_read_unlock() +// because this implementation is intended to get the system into a safe +// state for some of the manipulations involved in tracing and the like. +// Finally, this implementation does not support high call_rcu_tasks() +// rates from multiple CPUs. If this is required, per-CPU callback lists +// will be needed. + +/* See if tasks are still holding out, complain if so. */ +static void check_holdout_task(struct task_struct *t, + bool needreport, bool *firstreport) +{ + int cpu; + + if (!READ_ONCE(t->rcu_tasks_holdout) || + t->rcu_tasks_nvcsw != READ_ONCE(t->nvcsw) || + !READ_ONCE(t->on_rq) || + (IS_ENABLED(CONFIG_NO_HZ_FULL) && + !is_idle_task(t) && t->rcu_tasks_idle_cpu >= 0)) { + WRITE_ONCE(t->rcu_tasks_holdout, false); + list_del_init(&t->rcu_tasks_holdout_list); + put_task_struct(t); + return; + } + rcu_request_urgent_qs_task(t); + if (!needreport) + return; + if (*firstreport) { + pr_err("INFO: rcu_tasks detected stalls on tasks:\n"); + *firstreport = false; + } + cpu = task_cpu(t); + pr_alert("%p: %c%c nvcsw: %lu/%lu holdout: %d idle_cpu: %d/%d\n", + t, ".I"[is_idle_task(t)], + "N."[cpu < 0 || !tick_nohz_full_cpu(cpu)], + t->rcu_tasks_nvcsw, t->nvcsw, t->rcu_tasks_holdout, + t->rcu_tasks_idle_cpu, cpu); + sched_show_task(t); +} + +/* Wait for one RCU-tasks grace period. */ +static void rcu_tasks_wait_gp(struct rcu_tasks *rtp) +{ + struct task_struct *g, *t; + unsigned long lastreport; + LIST_HEAD(rcu_tasks_holdouts); + int fract; + + /* + * Wait for all pre-existing t->on_rq and t->nvcsw transitions + * to complete. Invoking synchronize_rcu() suffices because all + * these transitions occur with interrupts disabled. 
Without this + * synchronize_rcu(), a read-side critical section that started + * before the grace period might be incorrectly seen as having + * started after the grace period. + * + * This synchronize_rcu() also dispenses with the need for a + * memory barrier on the first store to t->rcu_tasks_holdout, + * as it forces the store to happen after the beginning of the + * grace period. + */ + synchronize_rcu(); + + /* + * There were callbacks, so we need to wait for an RCU-tasks + * grace period. Start off by scanning the task list for tasks + * that are not already voluntarily blocked. Mark these tasks + * and make a list of them in rcu_tasks_holdouts. + */ + rcu_read_lock(); + for_each_process_thread(g, t) { + if (t != current && READ_ONCE(t->on_rq) && !is_idle_task(t)) { + get_task_struct(t); + t->rcu_tasks_nvcsw = READ_ONCE(t->nvcsw); + WRITE_ONCE(t->rcu_tasks_holdout, true); + list_add(&t->rcu_tasks_holdout_list, + &rcu_tasks_holdouts); + } + } + rcu_read_unlock(); + + /* + * Wait for tasks that are in the process of exiting. This + * does only part of the job, ensuring that all tasks that were + * previously exiting reach the point where they have disabled + * preemption, allowing the later synchronize_rcu() to finish + * the job. + */ + synchronize_srcu(&tasks_rcu_exit_srcu); + + /* + * Each pass through the following loop scans the list of holdout + * tasks, removing any that are no longer holdouts. When the list + * is empty, we are done. + */ + lastreport = jiffies; + + /* Start off with HZ/10 wait and slowly back off to 1 HZ wait. */ + fract = 10; + + for (;;) { + bool firstreport; + bool needreport; + int rtst; + struct task_struct *t1; + + if (list_empty(&rcu_tasks_holdouts)) + break; + + /* Slowly back off waiting for holdouts */ + schedule_timeout_interruptible(HZ/fract); + + if (fract > 1) + fract--; + + rtst = READ_ONCE(rcu_task_stall_timeout); + needreport = rtst > 0 && time_after(jiffies, lastreport + rtst); + if (needreport) + lastreport = jiffies; + firstreport = true; + WARN_ON(signal_pending(current)); + list_for_each_entry_safe(t, t1, &rcu_tasks_holdouts, + rcu_tasks_holdout_list) { + check_holdout_task(t, needreport, &firstreport); + cond_resched(); + } + } + + /* + * Because ->on_rq and ->nvcsw are not guaranteed to have full + * memory barriers prior to them in the schedule() path, memory + * reordering on other CPUs could cause their RCU-tasks read-side + * critical sections to extend past the end of the grace period. + * However, because these ->nvcsw updates are carried out with + * interrupts disabled, we can use synchronize_rcu() to force the + * needed ordering on all such CPUs. + * + * This synchronize_rcu() also confines all ->rcu_tasks_holdout + * accesses to be within the grace period, avoiding the need for + * memory barriers for ->rcu_tasks_holdout accesses. + * + * In addition, this synchronize_rcu() waits for exiting tasks + * to complete their final preempt_disable() region of execution, + * cleaning up after the synchronize_srcu() above. + */ + synchronize_rcu(); +} + +void call_rcu_tasks(struct rcu_head *rhp, rcu_callback_t func); +DEFINE_RCU_TASKS(rcu_tasks, rcu_tasks_wait_gp, call_rcu_tasks); + +/** + * call_rcu_tasks() - Queue an RCU callback for invocation after a task-based grace period + * @rhp: structure to be used for queueing the RCU updates.
+ * @func: actual callback function to be invoked after the grace period + * + * The callback function will be invoked some time after a full grace + * period elapses, in other words after all currently executing RCU + * read-side critical sections have completed. call_rcu_tasks() assumes + * that the read-side critical sections end at a voluntary context + * switch (not a preemption!), cond_resched_rcu_qs(), entry into idle, + * or transition to usermode execution. As such, there are no read-side + * primitives analogous to rcu_read_lock() and rcu_read_unlock() because + * this primitive is intended to determine that all tasks have passed + * through a safe state, not so much for data-structure synchronization. + * + * See the description of call_rcu() for more detailed information on + * memory ordering guarantees. + */ +void call_rcu_tasks(struct rcu_head *rhp, rcu_callback_t func) +{ + call_rcu_tasks_generic(rhp, func, &rcu_tasks); +} +EXPORT_SYMBOL_GPL(call_rcu_tasks); + +/** + * synchronize_rcu_tasks - wait until an rcu-tasks grace period has elapsed. + * + * Control will return to the caller some time after a full rcu-tasks + * grace period has elapsed, in other words after all currently + * executing rcu-tasks read-side critical sections have completed. These + * read-side critical sections are delimited by calls to schedule(), + * cond_resched_tasks_rcu_qs(), idle execution, userspace execution, calls + * to synchronize_rcu_tasks(), and (in theory, anyway) cond_resched(). + * + * This is a very specialized primitive, intended only for a few uses in + * tracing and other situations requiring manipulation of function + * preambles and profiling hooks. The synchronize_rcu_tasks() function + * is not (yet) intended for heavy use from multiple CPUs. + * + * See the description of synchronize_rcu() for more detailed information + * on memory ordering guarantees. + */ +void synchronize_rcu_tasks(void) +{ + synchronize_rcu_tasks_generic(&rcu_tasks); +} +EXPORT_SYMBOL_GPL(synchronize_rcu_tasks); + +/** + * rcu_barrier_tasks - Wait for in-flight call_rcu_tasks() callbacks. + * + * Although the current implementation is guaranteed to wait, it is not + * obligated to, for example, if there are no pending callbacks. + */ +void rcu_barrier_tasks(void) +{ + /* There is only one callback queue, so this is easy. ;-) */ + synchronize_rcu_tasks(); +} +EXPORT_SYMBOL_GPL(rcu_barrier_tasks); + +static int __init rcu_spawn_tasks_kthread(void) +{ + rcu_spawn_tasks_kthread_generic(&rcu_tasks); + return 0; +} +core_initcall(rcu_spawn_tasks_kthread); + +#endif /* #ifdef CONFIG_TASKS_RCU */ diff --git a/kernel/rcu/update.c b/kernel/rcu/update.c index c5799349ff31..30dce20e1644 100644 --- a/kernel/rcu/update.c +++ b/kernel/rcu/update.c @@ -584,7 +584,11 @@ late_initcall(rcu_verify_early_boot_tests); void rcu_early_boot_tests(void) {} #endif /* CONFIG_PROVE_RCU */ +#ifdef CONFIG_TASKS_RCU_GENERIC #include "tasks.h" +#else /* #ifdef CONFIG_TASKS_RCU_GENERIC */ +static inline void rcu_tasks_bootup_oddness(void) {} +#endif /* #else #ifdef CONFIG_TASKS_RCU_GENERIC */ #ifndef CONFIG_TINY_RCU -- cgit v1.2.3 From c84aad765406c4c7573ce449e8a9977ebb8f4cb9 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Mon, 2 Mar 2020 21:06:43 -0800 Subject: rcu-tasks: Add an RCU-tasks rude variant This commit adds a "rude" variant of RCU-tasks that has as quiescent states schedule(), cond_resched_tasks_rcu_qs(), userspace execution, and (in theory, anyway) cond_resched(). In other words, RCU-tasks rude readers are regions of code with preemption disabled, but excluding code early in the CPU-online sequence and late in the CPU-offline sequence. Updates make use of IPIs and force an IPI and a context switch on each online CPU. This variant is useful in some situations in tracing.
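By way of illustration, here is a hedged sketch of an updater relying on this variant (an editorial example, not part of the patch; struct tramp and global_tramp are invented names):

	static struct tramp *global_tramp;	/* hypothetical trampoline pointer */

	static void remove_tramp(void)
	{
		struct tramp *old = global_tramp;

		WRITE_ONCE(global_tramp, NULL);	/* No new readers can pick it up. */
		synchronize_rcu_tasks_rude();	/* IPI and context switch on every online CPU. */
		kfree(old);			/* No preempt-disabled reader can still be in old. */
	}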
Suggested-by: Steven Rostedt [ paulmck: Apply EXPORT_SYMBOL_GPL() feedback from Qiujun Huang. ] Signed-off-by: Paul E. McKenney [ paulmck: Apply review feedback from Steve Rostedt. ] --- include/linux/rcupdate.h | 3 ++ kernel/rcu/Kconfig | 11 +++++- kernel/rcu/tasks.h | 98 ++++++++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 111 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/rcupdate.h b/include/linux/rcupdate.h index 5523145e0a78..2be97a83f266 100644 --- a/include/linux/rcupdate.h +++ b/include/linux/rcupdate.h @@ -37,6 +37,7 @@ /* Exported common interfaces */ void call_rcu(struct rcu_head *head, rcu_callback_t func); void rcu_barrier_tasks(void); +void rcu_barrier_tasks_rude(void); void synchronize_rcu(void); #ifdef CONFIG_PREEMPT_RCU @@ -138,6 +139,8 @@ static inline void rcu_init_nohz(void) { } #define rcu_note_voluntary_context_switch(t) rcu_tasks_qs(t) void call_rcu_tasks(struct rcu_head *head, rcu_callback_t func); void synchronize_rcu_tasks(void); +void call_rcu_tasks_rude(struct rcu_head *head, rcu_callback_t func); +void synchronize_rcu_tasks_rude(void); void exit_tasks_rcu_start(void); void exit_tasks_rcu_finish(void); #else /* #ifdef CONFIG_TASKS_RCU_GENERIC */ diff --git a/kernel/rcu/Kconfig b/kernel/rcu/Kconfig index 38475d0bc634..6ee6372a4459 100644 --- a/kernel/rcu/Kconfig +++ b/kernel/rcu/Kconfig @@ -71,7 +71,7 @@ config TREE_SRCU This option selects the full-fledged version of SRCU. config TASKS_RCU_GENERIC - def_bool TASKS_RCU + def_bool TASKS_RCU || TASKS_RUDE_RCU select SRCU help This option enables generic infrastructure code supporting @@ -84,6 +84,15 @@ config TASKS_RCU only voluntary context switch (not preemption!), idle, and user-mode execution as quiescent states. Not for manual selection. +config TASKS_RUDE_RCU + def_bool 0 + help + This option enables a task-based RCU implementation that uses + only context switch (including preemption) and user-mode + execution as quiescent states. It forces IPIs and context + switches on all online CPUs, including idle ones, so use + with caution. + config RCU_STALL_COMMON def_bool TREE_RCU help diff --git a/kernel/rcu/tasks.h b/kernel/rcu/tasks.h index d77921ee5a6e..7f9ed20c26c7 100644 --- a/kernel/rcu/tasks.h +++ b/kernel/rcu/tasks.h @@ -180,6 +180,9 @@ static void __init rcu_tasks_bootup_oddness(void) else pr_info("\tTasks RCU enabled.\n"); #endif /* #ifdef CONFIG_TASKS_RCU */ +#ifdef CONFIG_TASKS_RUDE_RCU + pr_info("\tRude variant of Tasks RCU enabled.\n"); +#endif /* #ifdef CONFIG_TASKS_RUDE_RCU */ } #endif /* #ifndef CONFIG_TINY_RCU */ @@ -410,3 +413,98 @@ static int __init rcu_spawn_tasks_kthread(void) core_initcall(rcu_spawn_tasks_kthread); #endif /* #ifdef CONFIG_TASKS_RCU */ + +#ifdef CONFIG_TASKS_RUDE_RCU + +//////////////////////////////////////////////////////////////////////// +// +// "Rude" variant of Tasks RCU, inspired by Steve Rostedt's trick of +// passing an empty function to schedule_on_each_cpu(). This approach +// provides an asynchronous call_rcu_tasks_rude() API and batching +// of concurrent calls to the synchronous synchronize_rcu_tasks_rude() API.
+// This sends IPIs far and wide and induces otherwise unnecessary context +// switches on all online CPUs, whether idle or not. + +// Empty function to allow workqueues to force a context switch. +static void rcu_tasks_be_rude(struct work_struct *work) +{ +} + +// Wait for one rude RCU-tasks grace period. +static void rcu_tasks_rude_wait_gp(struct rcu_tasks *rtp) +{ + schedule_on_each_cpu(rcu_tasks_be_rude); +} + +void call_rcu_tasks_rude(struct rcu_head *rhp, rcu_callback_t func); +DEFINE_RCU_TASKS(rcu_tasks_rude, rcu_tasks_rude_wait_gp, call_rcu_tasks_rude); + +/** + * call_rcu_tasks_rude() - Queue a callback for a rude task-based grace period + * @rhp: structure to be used for queueing the RCU updates. + * @func: actual callback function to be invoked after the grace period + * + * The callback function will be invoked some time after a full grace + * period elapses, in other words after all currently executing RCU + * read-side critical sections have completed. call_rcu_tasks_rude() + * assumes that the read-side critical sections end at context switch, + * cond_resched_rcu_qs(), or transition to usermode execution. As such, + * there are no read-side primitives analogous to rcu_read_lock() and + * rcu_read_unlock() because this primitive is intended to determine + * that all tasks have passed through a safe state, not so much for + * data-structure synchronization. + * + * See the description of call_rcu() for more detailed information on + * memory ordering guarantees. + */ +void call_rcu_tasks_rude(struct rcu_head *rhp, rcu_callback_t func) +{ + call_rcu_tasks_generic(rhp, func, &rcu_tasks_rude); +} +EXPORT_SYMBOL_GPL(call_rcu_tasks_rude); + +/** + * synchronize_rcu_tasks_rude - wait for a rude rcu-tasks grace period + * + * Control will return to the caller some time after a rude rcu-tasks + * grace period has elapsed, in other words after all currently + * executing rcu-tasks read-side critical sections have completed. These + * read-side critical sections are delimited by calls to schedule(), + * cond_resched_tasks_rcu_qs(), userspace execution, and (in theory, + * anyway) cond_resched(). + * + * This is a very specialized primitive, intended only for a few uses in + * tracing and other situations requiring manipulation of function preambles + * and profiling hooks. The synchronize_rcu_tasks_rude() function is not + * (yet) intended for heavy use from multiple CPUs. + * + * See the description of synchronize_rcu() for more detailed information + * on memory ordering guarantees. + */ +void synchronize_rcu_tasks_rude(void) +{ + synchronize_rcu_tasks_generic(&rcu_tasks_rude); +} +EXPORT_SYMBOL_GPL(synchronize_rcu_tasks_rude); + +/** + * rcu_barrier_tasks_rude - Wait for in-flight call_rcu_tasks_rude() callbacks. + * + * Although the current implementation is guaranteed to wait, it is not + * obligated to, for example, if there are no pending callbacks. + */ +void rcu_barrier_tasks_rude(void) +{ + /* There is only one callback queue, so this is easy. ;-) */ + synchronize_rcu_tasks_rude(); +} +EXPORT_SYMBOL_GPL(rcu_barrier_tasks_rude); + +static int __init rcu_spawn_tasks_rude_kthread(void) +{ + rcu_spawn_tasks_kthread_generic(&rcu_tasks_rude); + return 0; +} +core_initcall(rcu_spawn_tasks_rude_kthread); + +#endif /* #ifdef CONFIG_TASKS_RUDE_RCU */ -- cgit v1.2.3 From d5f177d35c24429c87db2567d20563fc16f7e8f6 Mon Sep 17 00:00:00 2001 From: "Paul E.
McKenney" Date: Mon, 9 Mar 2020 19:56:53 -0700 Subject: rcu-tasks: Add an RCU Tasks Trace to simplify protection of tracing hooks Because RCU does not watch exception early-entry/late-exit, idle-loop, or CPU-hotplug execution, protection of tracing and BPF operations is needlessly complicated. This commit therefore adds a variant of Tasks RCU that: o Has explicit read-side markers to allow finite grace periods in the face of in-kernel loops for PREEMPT=n builds. These markers are rcu_read_lock_trace() and rcu_read_unlock_trace(). o Protects code in the idle loop, exception entry/exit, and CPU-hotplug code paths. In this respect, RCU-tasks trace is similar to SRCU, but with lighter-weight readers. o Avoids expensive read-side instructions, having overhead similar to that of Preemptible RCU. There are of course downsides: o The grace-period code can send IPIs to CPUs, even when those CPUs are in the idle loop or in nohz_full userspace. This is mitigated by later commits. o It is necessary to scan the full tasklist, much as for Tasks RCU. o There is a single callback queue guarded by a single lock, again, much as for Tasks RCU. However, those early use cases that request multiple grace periods in quick succession are expected to do so from a single task, which makes the single lock almost irrelevant. If needed, multiple callback queues can be provided using any number of schemes. Perhaps most important, this variant of RCU does not affect the vanilla flavors, rcu_preempt and rcu_sched. The fact that RCU Tasks Trace readers can operate from idle, offline, and exception entry/exit in no way enables rcu_preempt and rcu_sched readers to do so. The memory ordering was outlined here: https://lore.kernel.org/lkml/20200319034030.GX3199@paulmck-ThinkPad-P72/ This effort benefited greatly from off-list discussions of BPF requirements with Alexei Starovoitov and Andrii Nakryiko. At least some of the on-list discussions are captured in the Link: tags below. In addition, KCSAN was quite helpful in finding some early bugs.
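For orientation, a hedged sketch of how such readers and updaters pair up (editorial example only; my_hook, struct hook, and its func field are invented names):

	/* Reader, legal even from the idle loop and CPU-hotplug paths: */
	rcu_read_lock_trace();
	p = READ_ONCE(my_hook);		/* hypothetical hook pointer */
	if (p)
		p->func(p);
	rcu_read_unlock_trace();

	/* Updater unhooking and freeing: */
	p = my_hook;
	WRITE_ONCE(my_hook, NULL);
	synchronize_rcu_tasks_trace();	/* Wait for all rcu_read_lock_trace() readers. */
	kfree(p);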
Link: https://lore.kernel.org/lkml/20200219150744.428764577@infradead.org/ Link: https://lore.kernel.org/lkml/87mu8p797b.fsf@nanos.tec.linutronix.de/ Link: https://lore.kernel.org/lkml/20200225221305.605144982@linutronix.de/ Cc: Alexei Starovoitov Cc: Andrii Nakryiko [ paulmck: Apply feedback from Steve Rostedt and Joel Fernandes. ] [ paulmck: Decrement trc_n_readers_need_end upon IPI failure. ] [ paulmck: Fix locking issue reported by rcutorture. ] Signed-off-by: Paul E. McKenney --- include/linux/rcupdate_trace.h | 84 ++++++++++ include/linux/sched.h | 8 + init/init_task.c | 4 + kernel/fork.c | 4 + kernel/rcu/Kconfig | 11 +- kernel/rcu/tasks.h | 361 ++++++++++++++++++++++++++++++++++++++++- 6 files changed, 467 insertions(+), 5 deletions(-) create mode 100644 include/linux/rcupdate_trace.h (limited to 'include/linux') diff --git a/include/linux/rcupdate_trace.h b/include/linux/rcupdate_trace.h new file mode 100644 index 000000000000..ed97e10817bd --- /dev/null +++ b/include/linux/rcupdate_trace.h @@ -0,0 +1,84 @@ +/* SPDX-License-Identifier: GPL-2.0+ */ +/* + * Read-Copy Update mechanism for mutual exclusion, adapted for tracing. + * + * Copyright (C) 2020 Paul E. McKenney. + */ + +#ifndef __LINUX_RCUPDATE_TRACE_H +#define __LINUX_RCUPDATE_TRACE_H + +#include <linux/sched.h> +#include <linux/rcupdate.h> + +#ifdef CONFIG_DEBUG_LOCK_ALLOC + +extern struct lockdep_map rcu_trace_lock_map; + +static inline int rcu_read_lock_trace_held(void) +{ + return lock_is_held(&rcu_trace_lock_map); +} + +#else /* #ifdef CONFIG_DEBUG_LOCK_ALLOC */ + +static inline int rcu_read_lock_trace_held(void) +{ + return 1; +} + +#endif /* #else #ifdef CONFIG_DEBUG_LOCK_ALLOC */ + +#ifdef CONFIG_TASKS_TRACE_RCU + +void rcu_read_unlock_trace_special(struct task_struct *t); + +/** + * rcu_read_lock_trace - mark beginning of RCU-trace read-side critical section + * + * When synchronize_rcu_tasks_trace() is invoked by one task, then that task + * is guaranteed to block until all other tasks exit their read-side + * critical sections. Similarly, if call_rcu_tasks_trace() is invoked on one + * task while other tasks are within RCU read-side critical sections, + * invocation of the corresponding RCU callback is deferred until after + * all the other tasks exit their critical sections. + * + * For more details, please see the documentation for rcu_read_lock(). + */ +static inline void rcu_read_lock_trace(void) +{ + struct task_struct *t = current; + + WRITE_ONCE(t->trc_reader_nesting, READ_ONCE(t->trc_reader_nesting) + 1); + rcu_lock_acquire(&rcu_trace_lock_map); +} + +/** + * rcu_read_unlock_trace - mark end of RCU-trace read-side critical section + * + * Pairs with a preceding call to rcu_read_lock_trace(), and nesting is + * allowed. Invoking rcu_read_unlock_trace() when there is no matching + * rcu_read_lock_trace() is verboten, and will result in lockdep complaints. + * + * For more details, please see the documentation for rcu_read_unlock(). + */ +static inline void rcu_read_unlock_trace(void) +{ + int nesting; + struct task_struct *t = current; + + rcu_lock_release(&rcu_trace_lock_map); + nesting = READ_ONCE(t->trc_reader_nesting) - 1; + WRITE_ONCE(t->trc_reader_nesting, nesting); + if (likely(!READ_ONCE(t->trc_reader_need_end)) || nesting) + return; // We assume shallow reader nesting.
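+ // Slow path: the grace-period kthread is waiting on this reader.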
+ rcu_read_unlock_trace_special(t); +} + +void call_rcu_tasks_trace(struct rcu_head *rhp, rcu_callback_t func); +void synchronize_rcu_tasks_trace(void); +void rcu_barrier_tasks_trace(void); + +#endif /* #ifdef CONFIG_TASKS_TRACE_RCU */ + +#endif /* __LINUX_RCUPDATE_TRACE_H */ diff --git a/include/linux/sched.h b/include/linux/sched.h index a4b727f57095..864f60e51c41 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -724,6 +724,14 @@ struct task_struct { struct list_head rcu_tasks_holdout_list; #endif /* #ifdef CONFIG_TASKS_RCU */ +#ifdef CONFIG_TASKS_TRACE_RCU + int trc_reader_nesting; + int trc_ipi_to_cpu; + bool trc_reader_need_end; + bool trc_reader_checked; + struct list_head trc_holdout_list; +#endif /* #ifdef CONFIG_TASKS_TRACE_RCU */ + struct sched_info sched_info; struct list_head tasks; diff --git a/init/init_task.c b/init/init_task.c index bd403ed3e418..e8b3740ee598 100644 --- a/init/init_task.c +++ b/init/init_task.c @@ -141,6 +141,10 @@ struct task_struct init_task .rcu_tasks_holdout_list = LIST_HEAD_INIT(init_task.rcu_tasks_holdout_list), .rcu_tasks_idle_cpu = -1, #endif +#ifdef CONFIG_TASKS_TRACE_RCU + .trc_reader_nesting = 0, + .trc_holdout_list = LIST_HEAD_INIT(init_task.trc_holdout_list), +#endif #ifdef CONFIG_CPUSETS .mems_allowed_seq = SEQCNT_ZERO(init_task.mems_allowed_seq), #endif diff --git a/kernel/fork.c b/kernel/fork.c index 8c700f881d92..72e9396235b4 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -1683,6 +1683,10 @@ static inline void rcu_copy_process(struct task_struct *p) INIT_LIST_HEAD(&p->rcu_tasks_holdout_list); p->rcu_tasks_idle_cpu = -1; #endif /* #ifdef CONFIG_TASKS_RCU */ +#ifdef CONFIG_TASKS_TRACE_RCU + p->trc_reader_nesting = 0; + INIT_LIST_HEAD(&p->trc_holdout_list); +#endif /* #ifdef CONFIG_TASKS_TRACE_RCU */ } struct pid *pidfd_pid(const struct file *file) diff --git a/kernel/rcu/Kconfig b/kernel/rcu/Kconfig index 6ee6372a4459..cb1d18ef343c 100644 --- a/kernel/rcu/Kconfig +++ b/kernel/rcu/Kconfig @@ -71,7 +71,7 @@ config TREE_SRCU This option selects the full-fledged version of SRCU. config TASKS_RCU_GENERIC - def_bool TASKS_RCU || TASKS_RUDE_RCU + def_bool TASKS_RCU || TASKS_RUDE_RCU || TASKS_TRACE_RCU select SRCU help This option enables generic infrastructure code supporting @@ -93,6 +93,15 @@ config TASKS_RUDE_RCU switches on all online CPUs, including idle ones, so use with caution. +config TASKS_TRACE_RCU + def_bool 0 + help + This option enables a task-based RCU implementation that uses + explicit rcu_read_lock_trace() read-side markers, and allows + these readers to appear in the idle loop as well as on the CPU + hotplug code paths. It can force IPIs on online CPUs, including + idle ones, so use with caution. + config RCU_STALL_COMMON def_bool TREE_RCU help diff --git a/kernel/rcu/tasks.h b/kernel/rcu/tasks.h index d8b09d5d1db1..fd34fd673a8c 100644 --- a/kernel/rcu/tasks.h +++ b/kernel/rcu/tasks.h @@ -181,12 +181,17 @@ void exit_tasks_rcu_start(void) __acquires(&tasks_rcu_exit_srcu) preempt_enable(); } +static void exit_tasks_rcu_finish_trace(struct task_struct *t); + /* Do the srcu_read_unlock() for the above synchronize_srcu(). 
*/ void exit_tasks_rcu_finish(void) __releases(&tasks_rcu_exit_srcu) { + struct task_struct *t = current; + preempt_disable(); - __srcu_read_unlock(&tasks_rcu_exit_srcu, current->rcu_tasks_idx); + __srcu_read_unlock(&tasks_rcu_exit_srcu, t->rcu_tasks_idx); preempt_enable(); + exit_tasks_rcu_finish_trace(t); } #ifndef CONFIG_TINY_RCU @@ -196,15 +201,19 @@ void exit_tasks_rcu_finish(void) __releases(&tasks_rcu_exit_srcu) */ static void __init rcu_tasks_bootup_oddness(void) { -#ifdef CONFIG_TASKS_RCU +#if defined(CONFIG_TASKS_RCU) || defined(CONFIG_TASKS_TRACE_RCU) if (rcu_task_stall_timeout != RCU_TASK_STALL_TIMEOUT) pr_info("\tTasks-RCU CPU stall warnings timeout set to %d (rcu_task_stall_timeout).\n", rcu_task_stall_timeout); - else - pr_info("\tTasks RCU enabled.\n"); +#endif /* #if defined(CONFIG_TASKS_RCU) || defined(CONFIG_TASKS_TRACE_RCU) */ +#ifdef CONFIG_TASKS_RCU + pr_info("\tTrampoline variant of Tasks RCU enabled.\n"); #endif /* #ifdef CONFIG_TASKS_RCU */ #ifdef CONFIG_TASKS_RUDE_RCU pr_info("\tRude variant of Tasks RCU enabled.\n"); #endif /* #ifdef CONFIG_TASKS_RUDE_RCU */ +#ifdef CONFIG_TASKS_TRACE_RCU + pr_info("\tTracing variant of Tasks RCU enabled.\n"); +#endif /* #ifdef CONFIG_TASKS_TRACE_RCU */ } #endif /* #ifndef CONFIG_TINY_RCU */ @@ -569,3 +578,347 @@ static int __init rcu_spawn_tasks_rude_kthread(void) core_initcall(rcu_spawn_tasks_rude_kthread); #endif /* #ifdef CONFIG_TASKS_RUDE_RCU */ + +//////////////////////////////////////////////////////////////////////// +// +// Tracing variant of Tasks RCU. This variant is designed to be used +// to protect tracing hooks, including those of BPF. This variant +// therefore: +// +// 1. Has explicit read-side markers to allow finite grace periods +// in the face of in-kernel loops for PREEMPT=n builds. +// +// 2. Protects code in the idle loop, exception entry/exit, and +// CPU-hotplug code paths, similar to the capabilities of SRCU. +// +// 3. Avoids expensive read-side instructions, having overhead similar +// to that of Preemptible RCU. +// +// There are of course downsides. The grace-period code can send IPIs to +// CPUs, even when those CPUs are in the idle loop or in nohz_full userspace. +// It is necessary to scan the full tasklist, much as for Tasks RCU. There +// is a single callback queue guarded by a single lock, again, much as for +// Tasks RCU. If needed, these downsides can be at least partially remedied. +// +// Perhaps most important, this variant of RCU does not affect the vanilla +// flavors, rcu_preempt and rcu_sched. The fact that RCU Tasks Trace +// readers can operate from idle, offline, and exception entry/exit in no +// way allows rcu_preempt and rcu_sched readers to also do so. + +// The lockdep state must be outside of #ifdef to be useful. +#ifdef CONFIG_DEBUG_LOCK_ALLOC +static struct lock_class_key rcu_lock_trace_key; +struct lockdep_map rcu_trace_lock_map = + STATIC_LOCKDEP_MAP_INIT("rcu_read_lock_trace", &rcu_lock_trace_key); +EXPORT_SYMBOL_GPL(rcu_trace_lock_map); +#endif /* #ifdef CONFIG_DEBUG_LOCK_ALLOC */ + +#ifdef CONFIG_TASKS_TRACE_RCU + +atomic_t trc_n_readers_need_end; // Number of waited-for readers. +DECLARE_WAIT_QUEUE_HEAD(trc_wait); // Wait queue for the grace-period kthread. + +// Record outstanding IPIs to each CPU. No point in sending two... +static DEFINE_PER_CPU(bool, trc_ipi_to_cpu); + +/* If we are the last reader, wake up the grace-period kthread.
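+ * (Reached only when ->trc_reader_need_end is set, that is, when the grace-period kthread really is waiting on this reader; the common-case unlock thus remains free of atomic operations.)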
*/ +void rcu_read_unlock_trace_special(struct task_struct *t) +{ + WRITE_ONCE(t->trc_reader_need_end, false); + if (atomic_dec_and_test(&trc_n_readers_need_end)) + wake_up(&trc_wait); +} +EXPORT_SYMBOL_GPL(rcu_read_unlock_trace_special); + +/* Add a task to the holdout list, if it is not already on the list. */ +static void trc_add_holdout(struct task_struct *t, struct list_head *bhp) +{ + if (list_empty(&t->trc_holdout_list)) { + get_task_struct(t); + list_add(&t->trc_holdout_list, bhp); + } +} + +/* Remove a task from the holdout list, if it is in fact present. */ +static void trc_del_holdout(struct task_struct *t) +{ + if (!list_empty(&t->trc_holdout_list)) { + list_del_init(&t->trc_holdout_list); + put_task_struct(t); + } +} + +/* IPI handler to check task state. */ +static void trc_read_check_handler(void *t_in) +{ + struct task_struct *t = current; + struct task_struct *texp = t_in; + + // If the task is no longer running on this CPU, leave. + if (unlikely(texp != t)) { + if (WARN_ON_ONCE(atomic_dec_and_test(&trc_n_readers_need_end))) + wake_up(&trc_wait); + goto reset_ipi; // Already on holdout list, so will check later. + } + + // If the task is not in a read-side critical section, and + // if this is the last reader, awaken the grace-period kthread. + if (likely(!t->trc_reader_nesting)) { + if (WARN_ON_ONCE(atomic_dec_and_test(&trc_n_readers_need_end))) + wake_up(&trc_wait); + // Mark as checked after decrement to avoid false + // positives on the above WARN_ON_ONCE(). + WRITE_ONCE(t->trc_reader_checked, true); + goto reset_ipi; + } + WRITE_ONCE(t->trc_reader_checked, true); + + // Get here if the task is in a read-side critical section. Set + // its state so that it will awaken the grace-period kthread upon + // exit from that critical section. + WARN_ON_ONCE(t->trc_reader_need_end); + WRITE_ONCE(t->trc_reader_need_end, true); + +reset_ipi: + // Allow future IPIs to be sent on CPU and for task. + // Also order this IPI handler against any later manipulations of + // the intended task. + smp_store_release(&per_cpu(trc_ipi_to_cpu, smp_processor_id()), false); // ^^^ + smp_store_release(&texp->trc_ipi_to_cpu, -1); // ^^^ +} + +/* Callback function for scheduler to check locked-down task. */ +static bool trc_inspect_reader(struct task_struct *t, void *arg) +{ + if (task_curr(t)) + return false; // It is running, so decline to inspect it. + + // Mark as checked. Because this is called from the grace-period + // kthread, also remove the task from the holdout list. + t->trc_reader_checked = true; + trc_del_holdout(t); + + // If the task is in a read-side critical section, set up its + // state so that it will awaken the grace-period kthread upon + // exit from that critical section. + if (unlikely(t->trc_reader_nesting)) { + atomic_inc(&trc_n_readers_need_end); // One more to wait on. + WARN_ON_ONCE(t->trc_reader_need_end); + WRITE_ONCE(t->trc_reader_need_end, true); + } + return true; +} + +/* Attempt to extract the state for the specified task. */ +static void trc_wait_for_one_reader(struct task_struct *t, + struct list_head *bhp) +{ + int cpu; + + // If a previous IPI is still in flight, let it complete. + if (smp_load_acquire(&t->trc_ipi_to_cpu) != -1) // Order IPI + return; + + // The current task had better be in a quiescent state. + if (t == current) { + t->trc_reader_checked = true; + trc_del_holdout(t); + WARN_ON_ONCE(t->trc_reader_nesting); + return; + } + + // Attempt to nail down the task for inspection.
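+ // Take a reference so that t's task_struct cannot be freed while trc_inspect_reader() examines it.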
+ get_task_struct(t); + if (try_invoke_on_locked_down_task(t, trc_inspect_reader, NULL)) { + put_task_struct(t); + return; + } + put_task_struct(t); + + // If currently running, send an IPI, either way, add to list. + trc_add_holdout(t, bhp); + if (task_curr(t) && time_after(jiffies, rcu_tasks_trace.gp_start + rcu_task_ipi_delay)) { + // The task is currently running, so try IPIing it. + cpu = task_cpu(t); + + // If there is already an IPI outstanding, let it happen. + if (per_cpu(trc_ipi_to_cpu, cpu) || t->trc_ipi_to_cpu >= 0) + return; + + atomic_inc(&trc_n_readers_need_end); + per_cpu(trc_ipi_to_cpu, cpu) = true; + t->trc_ipi_to_cpu = cpu; + if (smp_call_function_single(cpu, + trc_read_check_handler, t, 0)) { + // Just in case there is some other reason for + // failure than the target CPU being offline. + per_cpu(trc_ipi_to_cpu, cpu) = false; + t->trc_ipi_to_cpu = cpu; + if (atomic_dec_and_test(&trc_n_readers_need_end)) { + WARN_ON_ONCE(1); + wake_up(&trc_wait); + } + } + } +} + +/* Initialize for a new RCU-tasks-trace grace period. */ +static void rcu_tasks_trace_pregp_step(void) +{ + int cpu; + + // Wait for CPU-hotplug paths to complete. + cpus_read_lock(); + cpus_read_unlock(); + + // Allow for fast-acting IPIs. + atomic_set(&trc_n_readers_need_end, 1); + + // There shouldn't be any old IPIs, but... + for_each_possible_cpu(cpu) + WARN_ON_ONCE(per_cpu(trc_ipi_to_cpu, cpu)); +} + +/* Do first-round processing for the specified task. */ +static void rcu_tasks_trace_pertask(struct task_struct *t, + struct list_head *hop) +{ + WRITE_ONCE(t->trc_reader_need_end, false); + t->trc_reader_checked = false; + t->trc_ipi_to_cpu = -1; + trc_wait_for_one_reader(t, hop); +} + +/* Do intermediate processing between task and holdout scans. */ +static void rcu_tasks_trace_postscan(void) +{ + // Wait for late-stage exiting tasks to finish exiting. + // These might have passed the call to exit_tasks_rcu_finish(). + synchronize_rcu(); + // Any tasks that exit after this point will set ->trc_reader_checked. +} + +/* Do one scan of the holdout list. */ +static void check_all_holdout_tasks_trace(struct list_head *hop, + bool ndrpt, bool *frptp) +{ + struct task_struct *g, *t; + + list_for_each_entry_safe(t, g, hop, trc_holdout_list) { + // If safe and needed, try to check the current task. + if (READ_ONCE(t->trc_ipi_to_cpu) == -1 && + !READ_ONCE(t->trc_reader_checked)) + trc_wait_for_one_reader(t, hop); + + // If check succeeded, remove this task from the list. + if (READ_ONCE(t->trc_reader_checked)) + trc_del_holdout(t); + } +} + +/* Wait for grace period to complete and provide ordering. */ +static void rcu_tasks_trace_postgp(void) +{ + // Remove the safety count. + smp_mb__before_atomic(); // Order vs. earlier atomics + atomic_dec(&trc_n_readers_need_end); + smp_mb__after_atomic(); // Order vs. later atomics + + // Wait for readers. + wait_event_idle_exclusive(trc_wait, + atomic_read(&trc_n_readers_need_end) == 0); + + smp_mb(); // Caller's code must be ordered after wakeup. +} + +/* Report any needed quiescent state for this exiting task. 
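+ * (Invoked from exit_tasks_rcu_finish(), so an exiting task marks itself checked and cannot stall the grace period after it leaves the task list.)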
*/ +void exit_tasks_rcu_finish_trace(struct task_struct *t) +{ + WRITE_ONCE(t->trc_reader_checked, true); + WARN_ON_ONCE(t->trc_reader_nesting); + WRITE_ONCE(t->trc_reader_nesting, 0); + if (WARN_ON_ONCE(READ_ONCE(t->trc_reader_need_end))) + rcu_read_unlock_trace_special(t); +} + +void call_rcu_tasks_trace(struct rcu_head *rhp, rcu_callback_t func); +DEFINE_RCU_TASKS(rcu_tasks_trace, rcu_tasks_wait_gp, call_rcu_tasks_trace, + "RCU Tasks Trace"); + +/** + * call_rcu_tasks_trace() - Queue a callback for a trace task-based grace period + * @rhp: structure to be used for queueing the RCU updates. + * @func: actual callback function to be invoked after the grace period + * + * The callback function will be invoked some time after a full grace + * period elapses, in other words after all currently executing RCU + * read-side critical sections have completed. Unlike the other flavors + * of Tasks RCU, these read-side critical sections are delimited by + * explicit calls to rcu_read_lock_trace() and rcu_read_unlock_trace(). + * This primitive is intended to determine that all tasks have passed + * through a safe state, not so much for data-structure synchronization. + * + * See the description of call_rcu() for more detailed information on + * memory ordering guarantees. + */ +void call_rcu_tasks_trace(struct rcu_head *rhp, rcu_callback_t func) +{ + call_rcu_tasks_generic(rhp, func, &rcu_tasks_trace); +} +EXPORT_SYMBOL_GPL(call_rcu_tasks_trace); + +/** + * synchronize_rcu_tasks_trace - wait for a trace rcu-tasks grace period + * + * Control will return to the caller some time after a trace rcu-tasks + * grace period has elapsed, in other words after all currently + * executing rcu-tasks read-side critical sections have completed. These + * read-side critical sections are delimited by calls to + * rcu_read_lock_trace() and rcu_read_unlock_trace(). + * + * This is a very specialized primitive, intended only for a few uses in + * tracing and other situations requiring manipulation of function preambles + * and profiling hooks. The synchronize_rcu_tasks_trace() function is not + * (yet) intended for heavy use from multiple CPUs. + * + * See the description of synchronize_rcu() for more detailed information + * on memory ordering guarantees. + */ +void synchronize_rcu_tasks_trace(void) +{ + RCU_LOCKDEP_WARN(lock_is_held(&rcu_trace_lock_map), "Illegal synchronize_rcu_tasks_trace() in RCU Tasks Trace read-side critical section"); + synchronize_rcu_tasks_generic(&rcu_tasks_trace); +} +EXPORT_SYMBOL_GPL(synchronize_rcu_tasks_trace); + +/** + * rcu_barrier_tasks_trace - Wait for in-flight call_rcu_tasks_trace() callbacks. + * + * Although the current implementation is guaranteed to wait, it is not + * obligated to, for example, if there are no pending callbacks. + */ +void rcu_barrier_tasks_trace(void) +{ + /* There is only one callback queue, so this is easy.
;-) */ + synchronize_rcu_tasks_trace(); +} +EXPORT_SYMBOL_GPL(rcu_barrier_tasks_trace); + +static int __init rcu_spawn_tasks_trace_kthread(void) +{ + rcu_tasks_trace.pregp_func = rcu_tasks_trace_pregp_step; + rcu_tasks_trace.pertask_func = rcu_tasks_trace_pertask; + rcu_tasks_trace.postscan_func = rcu_tasks_trace_postscan; + rcu_tasks_trace.holdouts_func = check_all_holdout_tasks_trace; + rcu_tasks_trace.postgp_func = rcu_tasks_trace_postgp; + rcu_spawn_tasks_kthread_generic(&rcu_tasks_trace); + return 0; +} +core_initcall(rcu_spawn_tasks_trace_kthread); + +#else /* #ifdef CONFIG_TASKS_TRACE_RCU */ +void exit_tasks_rcu_finish_trace(struct task_struct *t) { } +#endif /* #else #ifdef CONFIG_TASKS_TRACE_RCU */ -- cgit v1.2.3 From 43766c3eadcf6033c92eb953f88801aebac0f785 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Mon, 16 Mar 2020 20:38:29 -0700 Subject: rcu-tasks: Make RCU Tasks Trace make use of RCU scheduler hooks This commit makes the calls to rcu_tasks_qs() detect and report quiescent states for RCU tasks trace. If the task is in a quiescent state and if ->trc_reader_checked is not yet set, the task sets its own ->trc_reader_checked. This will cause the grace-period kthread to remove it from the holdout list if it still remains there. [ paulmck: Fix conditional compilation per kbuild test robot feedback. ] Signed-off-by: Paul E. McKenney --- include/linux/rcupdate.h | 44 +++++++++++++++++++++++++++++++++++++------- include/linux/rcutiny.h | 2 +- kernel/rcu/tasks.h | 5 +++-- kernel/rcu/tree_plugin.h | 6 ++---- 4 files changed, 43 insertions(+), 14 deletions(-) (limited to 'include/linux') diff --git a/include/linux/rcupdate.h b/include/linux/rcupdate.h index 2be97a83f266..659cbfa7581a 100644 --- a/include/linux/rcupdate.h +++ b/include/linux/rcupdate.h @@ -131,20 +131,50 @@ static inline void rcu_init_nohz(void) { } * This is a macro rather than an inline function to avoid #include hell. */ #ifdef CONFIG_TASKS_RCU_GENERIC -#define rcu_tasks_qs(t) \ - do { \ - if (READ_ONCE((t)->rcu_tasks_holdout)) \ - WRITE_ONCE((t)->rcu_tasks_holdout, false); \ + +# ifdef CONFIG_TASKS_RCU +# define rcu_tasks_classic_qs(t, preempt) \ + do { \ + if (!(preempt) && READ_ONCE((t)->rcu_tasks_holdout)) \ + WRITE_ONCE((t)->rcu_tasks_holdout, false); \ } while (0) -#define rcu_note_voluntary_context_switch(t) rcu_tasks_qs(t) void call_rcu_tasks(struct rcu_head *head, rcu_callback_t func); void synchronize_rcu_tasks(void); +# else +# define rcu_tasks_classic_qs(t, preempt) do { } while (0) +# define call_rcu_tasks call_rcu +# define synchronize_rcu_tasks synchronize_rcu +# endif + +# ifdef CONFIG_TASKS_TRACE_RCU +# define rcu_tasks_trace_qs(t) \ + do { \ + if (!likely(READ_ONCE((t)->trc_reader_checked)) && \ + !unlikely(READ_ONCE((t)->trc_reader_nesting))) { \ + smp_store_release(&(t)->trc_reader_checked, true); \ + smp_mb(); /* Readers partitioned by store.
*/ \ + } \ + } while (0) +# else +# define rcu_tasks_trace_qs(t) do { } while (0) +# endif + +#define rcu_tasks_qs(t, preempt) \ +do { \ + rcu_tasks_classic_qs((t), (preempt)); \ + rcu_tasks_trace_qs((t)); \ +} while (0) + +# ifdef CONFIG_TASKS_RUDE_RCU void call_rcu_tasks_rude(struct rcu_head *head, rcu_callback_t func); void synchronize_rcu_tasks_rude(void); +# endif + +#define rcu_note_voluntary_context_switch(t) rcu_tasks_qs(t, false) void exit_tasks_rcu_start(void); void exit_tasks_rcu_finish(void); #else /* #ifdef CONFIG_TASKS_RCU_GENERIC */ -#define rcu_tasks_qs(t) do { } while (0) +#define rcu_tasks_qs(t, preempt) do { } while (0) #define rcu_note_voluntary_context_switch(t) do { } while (0) #define call_rcu_tasks call_rcu #define synchronize_rcu_tasks synchronize_rcu @@ -161,7 +191,7 @@ static inline void exit_tasks_rcu_finish(void) { } */ #define cond_resched_tasks_rcu_qs() \ do { \ - rcu_tasks_qs(current); \ + rcu_tasks_qs(current, false); \ cond_resched(); \ } while (0) diff --git a/include/linux/rcutiny.h b/include/linux/rcutiny.h index 045c28b71f4f..d77e11186afd 100644 --- a/include/linux/rcutiny.h +++ b/include/linux/rcutiny.h @@ -49,7 +49,7 @@ static inline void rcu_softirq_qs(void) #define rcu_note_context_switch(preempt) \ do { \ rcu_qs(); \ - rcu_tasks_qs(current); \ + rcu_tasks_qs(current, (preempt)); \ } while (0) static inline int rcu_needs_cpu(u64 basemono, u64 *nextevt) diff --git a/kernel/rcu/tasks.h b/kernel/rcu/tasks.h index c93fb29b460c..6f8a4040fbdd 100644 --- a/kernel/rcu/tasks.h +++ b/kernel/rcu/tasks.h @@ -180,7 +180,7 @@ static int __noreturn rcu_tasks_kthread(void *arg) /* Pick up any new callbacks. */ raw_spin_lock_irqsave(&rtp->cbs_lock, flags); - smp_mb__after_unlock_lock(); // Order updates vs. GP. + smp_mb__after_spinlock(); // Order updates vs. GP. list = rtp->cbs_head; rtp->cbs_head = NULL; rtp->cbs_tail = &rtp->cbs_head; @@ -874,7 +874,7 @@ static void rcu_tasks_trace_pertask(struct task_struct *t, struct list_head *hop) { WRITE_ONCE(t->trc_reader_need_end, false); - t->trc_reader_checked = false; + WRITE_ONCE(t->trc_reader_checked, false); t->trc_ipi_to_cpu = -1; trc_wait_for_one_reader(t, hop); } @@ -983,6 +983,7 @@ static void rcu_tasks_trace_postgp(struct rcu_tasks *rtp) pr_err("\t%d holdouts\n", atomic_read(&trc_n_readers_need_end)); } smp_mb(); // Caller's code must be ordered after wakeup. + // Pairs with pretty much every ordering primitive. } /* Report any needed quiescent state for this exiting task. */ diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h index 4f34c325dd90..37e02812d18f 100644 --- a/kernel/rcu/tree_plugin.h +++ b/kernel/rcu/tree_plugin.h @@ -331,8 +331,7 @@ void rcu_note_context_switch(bool preempt) rcu_qs(); if (rdp->exp_deferred_qs) rcu_report_exp_rdp(rdp); - if (!preempt) - rcu_tasks_qs(current); + rcu_tasks_qs(current, preempt); trace_rcu_utilization(TPS("End context switch")); } EXPORT_SYMBOL_GPL(rcu_note_context_switch); @@ -841,8 +840,7 @@ void rcu_note_context_switch(bool preempt) this_cpu_write(rcu_data.rcu_urgent_qs, false); if (unlikely(raw_cpu_read(rcu_data.rcu_need_heavy_qs))) rcu_momentary_dyntick_idle(); - if (!preempt) - rcu_tasks_qs(current); + rcu_tasks_qs(current, preempt); out: trace_rcu_utilization(TPS("End context switch")); } -- cgit v1.2.3 From 276c410448dbca357a2bc3539acfe04862e5f172 Mon Sep 17 00:00:00 2001 From: "Paul E. 
McKenney" Date: Tue, 17 Mar 2020 16:02:06 -0700 Subject: rcu-tasks: Split ->trc_reader_need_end This commit splits ->trc_reader_need_end by using the rcu_special union. This change permits readers to check to see if a memory barrier is required without any added overhead in the common case where no such barrier is required. This commit also adds the read-side checking. Later commits will add the machinery to properly set the new ->trc_reader_special.b.need_mb field. This commit also makes rcu_read_unlock_trace_special() tolerate nested read-side critical sections within interrupt and NMI handlers. Signed-off-by: Paul E. McKenney --- include/linux/rcupdate_trace.h | 11 +++++++---- include/linux/sched.h | 4 ++-- init/init_task.c | 1 + kernel/fork.c | 1 + kernel/rcu/tasks.h | 33 ++++++++++++++++++++------------- 5 files changed, 31 insertions(+), 19 deletions(-) (limited to 'include/linux') diff --git a/include/linux/rcupdate_trace.h b/include/linux/rcupdate_trace.h index ed97e10817bd..c42b365ca176 100644 --- a/include/linux/rcupdate_trace.h +++ b/include/linux/rcupdate_trace.h @@ -31,7 +31,7 @@ static inline int rcu_read_lock_trace_held(void) #ifdef CONFIG_TASKS_TRACE_RCU -void rcu_read_unlock_trace_special(struct task_struct *t); +void rcu_read_unlock_trace_special(struct task_struct *t, int nesting); /** * rcu_read_lock_trace - mark beginning of RCU-trace read-side critical section @@ -50,6 +50,8 @@ static inline void rcu_read_lock_trace(void) struct task_struct *t = current; WRITE_ONCE(t->trc_reader_nesting, READ_ONCE(t->trc_reader_nesting) + 1); + if (t->trc_reader_special.b.need_mb) + smp_mb(); // Pairs with update-side barriers rcu_lock_acquire(&rcu_trace_lock_map); } @@ -69,10 +71,11 @@ static inline void rcu_read_unlock_trace(void) rcu_lock_release(&rcu_trace_lock_map); nesting = READ_ONCE(t->trc_reader_nesting) - 1; - WRITE_ONCE(t->trc_reader_nesting, nesting); - if (likely(!READ_ONCE(t->trc_reader_need_end)) || nesting) + if (likely(!READ_ONCE(t->trc_reader_special.s)) || nesting) { + WRITE_ONCE(t->trc_reader_nesting, nesting); return; // We assume shallow reader nesting. - rcu_read_unlock_trace_special(t); + } + rcu_read_unlock_trace_special(t, nesting); } void call_rcu_tasks_trace(struct rcu_head *rhp, rcu_callback_t func); diff --git a/include/linux/sched.h b/include/linux/sched.h index 864f60e51c41..9437b53cc603 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -613,7 +613,7 @@ union rcu_special { u8 blocked; u8 need_qs; u8 exp_hint; /* Hint for performance. */ - u8 pad; /* No garbage from compiler! */ + u8 need_mb; /* Readers need smp_mb(). */ } b; /* Bits. */ u32 s; /* Set of bits. 
*/ }; @@ -727,7 +727,7 @@ struct task_struct { #ifdef CONFIG_TASKS_TRACE_RCU int trc_reader_nesting; int trc_ipi_to_cpu; - bool trc_reader_need_end; + union rcu_special trc_reader_special; bool trc_reader_checked; struct list_head trc_holdout_list; #endif /* #ifdef CONFIG_TASKS_TRACE_RCU */ diff --git a/init/init_task.c b/init/init_task.c index e8b3740ee598..825972daec32 100644 --- a/init/init_task.c +++ b/init/init_task.c @@ -143,6 +143,7 @@ struct task_struct init_task #endif #ifdef CONFIG_TASKS_TRACE_RCU .trc_reader_nesting = 0, + .trc_reader_special.s = 0, .trc_holdout_list = LIST_HEAD_INIT(init_task.trc_holdout_list), #endif #ifdef CONFIG_CPUSETS diff --git a/kernel/fork.c b/kernel/fork.c index 72e9396235b4..96eb4b535ced 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -1685,6 +1685,7 @@ static inline void rcu_copy_process(struct task_struct *p) #endif /* #ifdef CONFIG_TASKS_RCU */ #ifdef CONFIG_TASKS_TRACE_RCU p->trc_reader_nesting = 0; + p->trc_reader_special.s = 0; INIT_LIST_HEAD(&p->trc_holdout_list); #endif /* #ifdef CONFIG_TASKS_TRACE_RCU */ } diff --git a/kernel/rcu/tasks.h b/kernel/rcu/tasks.h index eeac4a122234..17b1b9a31071 100644 --- a/kernel/rcu/tasks.h +++ b/kernel/rcu/tasks.h @@ -723,10 +723,17 @@ DEFINE_RCU_TASKS(rcu_tasks_trace, rcu_tasks_wait_gp, call_rcu_tasks_trace, "RCU Tasks Trace"); /* If we are the last reader, wake up the grace-period kthread. */ -void rcu_read_unlock_trace_special(struct task_struct *t) +void rcu_read_unlock_trace_special(struct task_struct *t, int nesting) { - WRITE_ONCE(t->trc_reader_need_end, false); - if (atomic_dec_and_test(&trc_n_readers_need_end)) + int nq = t->trc_reader_special.b.need_qs; + + if (t->trc_reader_special.b.need_mb) + smp_mb(); // Pairs with update-side barriers. + // Update .need_qs before ->trc_reader_nesting for irq/NMI handlers. + if (nq) + WRITE_ONCE(t->trc_reader_special.b.need_qs, false); + WRITE_ONCE(t->trc_reader_nesting, nesting); + if (nq && atomic_dec_and_test(&trc_n_readers_need_end)) wake_up(&trc_wait); } EXPORT_SYMBOL_GPL(rcu_read_unlock_trace_special); @@ -777,8 +784,8 @@ static void trc_read_check_handler(void *t_in) // Get here if the task is in a read-side critical section. Set // its state so that it will awaken the grace-period kthread upon // exit from that critical section. - WARN_ON_ONCE(t->trc_reader_need_end); - WRITE_ONCE(t->trc_reader_need_end, true); + WARN_ON_ONCE(t->trc_reader_special.b.need_qs); + WRITE_ONCE(t->trc_reader_special.b.need_qs, true); reset_ipi: // Allow future IPIs to be sent on CPU and for task. @@ -804,8 +811,8 @@ static bool trc_inspect_reader(struct task_struct *t, void *arg) // exit from that critical section. if (unlikely(t->trc_reader_nesting)) { atomic_inc(&trc_n_readers_need_end); // One more to wait on. 
- WARN_ON_ONCE(t->trc_reader_need_end); - WRITE_ONCE(t->trc_reader_need_end, true); + WARN_ON_ONCE(t->trc_reader_special.b.need_qs); + WRITE_ONCE(t->trc_reader_special.b.need_qs, true); } return true; } @@ -884,7 +891,7 @@ static void rcu_tasks_trace_pregp_step(void) static void rcu_tasks_trace_pertask(struct task_struct *t, struct list_head *hop) { - WRITE_ONCE(t->trc_reader_need_end, false); + WRITE_ONCE(t->trc_reader_special.b.need_qs, false); WRITE_ONCE(t->trc_reader_checked, false); t->trc_ipi_to_cpu = -1; trc_wait_for_one_reader(t, hop); @@ -916,7 +923,7 @@ static void show_stalled_task_trace(struct task_struct *t, bool *firstreport) ".i"[is_idle_task(t)], ".N"[cpu > 0 && tick_nohz_full_cpu(cpu)], t->trc_reader_nesting, - " N"[!!t->trc_reader_need_end], + " N"[!!t->trc_reader_special.b.need_qs], cpu); sched_show_task(t); } @@ -980,11 +987,11 @@ static void rcu_tasks_trace_postgp(struct rcu_tasks *rtp) break; // Count reached zero. // Stall warning time, so make a list of the offenders. for_each_process_thread(g, t) - if (READ_ONCE(t->trc_reader_need_end)) + if (READ_ONCE(t->trc_reader_special.b.need_qs)) trc_add_holdout(t, &holdouts); firstreport = true; list_for_each_entry_safe(t, g, &holdouts, trc_holdout_list) - if (READ_ONCE(t->trc_reader_need_end)) { + if (READ_ONCE(t->trc_reader_special.b.need_qs)) { show_stalled_task_trace(t, &firstreport); trc_del_holdout(t); } @@ -1003,8 +1010,8 @@ void exit_tasks_rcu_finish_trace(struct task_struct *t) WRITE_ONCE(t->trc_reader_checked, true); WARN_ON_ONCE(t->trc_reader_nesting); WRITE_ONCE(t->trc_reader_nesting, 0); - if (WARN_ON_ONCE(READ_ONCE(t->trc_reader_need_end))) - rcu_read_unlock_trace_special(t); + if (WARN_ON_ONCE(READ_ONCE(t->trc_reader_special.b.need_qs))) + rcu_read_unlock_trace_special(t, 0); } /** -- cgit v1.2.3 From 9ae58d7bd11f1fc4c96389df11751f8593d8bd33 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Wed, 18 Mar 2020 17:16:37 -0700 Subject: rcu-tasks: Add Kconfig option to mediate smp_mb() vs. IPI This commit provides a new TASKS_TRACE_RCU_READ_MB Kconfig option that enables use of read-side memory barriers by both rcu_read_lock_trace() and rcu_read_unlock_trace() when the are executed with the current->trc_reader_special.b.need_mb flag set. This flag is currently never set. Doing that is the subject of a later commit. Signed-off-by: Paul E. McKenney --- include/linux/rcupdate_trace.h | 3 ++- kernel/rcu/Kconfig | 18 ++++++++++++++++++ kernel/rcu/tasks.h | 3 ++- 3 files changed, 22 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/rcupdate_trace.h b/include/linux/rcupdate_trace.h index c42b365ca176..4c25a41f8b27 100644 --- a/include/linux/rcupdate_trace.h +++ b/include/linux/rcupdate_trace.h @@ -50,7 +50,8 @@ static inline void rcu_read_lock_trace(void) struct task_struct *t = current; WRITE_ONCE(t->trc_reader_nesting, READ_ONCE(t->trc_reader_nesting) + 1); - if (t->trc_reader_special.b.need_mb) + if (IS_ENABLED(CONFIG_TASKS_TRACE_RCU_READ_MB) && + t->trc_reader_special.b.need_mb) smp_mb(); // Pairs with update-side barriers rcu_lock_acquire(&rcu_trace_lock_map); } diff --git a/kernel/rcu/Kconfig b/kernel/rcu/Kconfig index cb1d18ef343c..0ebe15a84985 100644 --- a/kernel/rcu/Kconfig +++ b/kernel/rcu/Kconfig @@ -234,4 +234,22 @@ config RCU_NOCB_CPU Say Y here if you want to help to debug reduced OS jitter. Say N here if you are unsure. 
+config TASKS_TRACE_RCU_READ_MB + bool "Tasks Trace RCU readers use memory barriers in user and idle" + depends on RCU_EXPERT + default PREEMPT_RT || NR_CPUS < 8 + help + Use this option to further reduce the number of IPIs sent + to CPUs executing in userspace or idle during tasks trace + RCU grace periods. Given that a reasonable setting of + the rcupdate.rcu_task_ipi_delay kernel boot parameter + eliminates such IPIs for many workloads, proper setting + of this Kconfig option is important mostly for aggressive + real-time installations and for battery-powered devices, + hence the default chosen above. + + Say Y here if you hate IPIs. + Say N here if you hate read-side memory barriers. + Take the default if you are unsure. + endmenu # "RCU Subsystem" diff --git a/kernel/rcu/tasks.h b/kernel/rcu/tasks.h index 4857450c0430..4147857007d7 100644 --- a/kernel/rcu/tasks.h +++ b/kernel/rcu/tasks.h @@ -734,7 +734,8 @@ void rcu_read_unlock_trace_special(struct task_struct *t, int nesting) { int nq = t->trc_reader_special.b.need_qs; - if (t->trc_reader_special.b.need_mb) + if (IS_ENABLED(CONFIG_TASKS_TRACE_RCU_READ_MB) && + t->trc_reader_special.b.need_mb) smp_mb(); // Pairs with update-side barriers. // Update .need_qs before ->trc_reader_nesting for irq/NMI handlers. if (nq) -- cgit v1.2.3 From be44ae62431196ac2a55198c0855028fff3ccfb4 Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Tue, 25 Feb 2020 21:09:19 -0800 Subject: locktorture.c: Fix if-statement empty body warnings MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit When using -Wextra, gcc complains about torture_preempt_schedule() when its definition is empty (i.e., when CONFIG_PREEMPTION is not set/enabled). Fix these warnings by adding an empty do-while block for that macro when CONFIG_PREEMPTION is not set. Fixes these build warnings: ../kernel/locking/locktorture.c:119:29: warning: suggest braces around empty body in an ‘if’ statement [-Wempty-body] ../kernel/locking/locktorture.c:166:29: warning: suggest braces around empty body in an ‘if’ statement [-Wempty-body] ../kernel/locking/locktorture.c:337:29: warning: suggest braces around empty body in an ‘if’ statement [-Wempty-body] ../kernel/locking/locktorture.c:490:29: warning: suggest braces around empty body in an ‘if’ statement [-Wempty-body] ../kernel/locking/locktorture.c:528:29: warning: suggest braces around empty body in an ‘if’ statement [-Wempty-body] ../kernel/locking/locktorture.c:553:29: warning: suggest braces around empty body in an ‘if’ statement [-Wempty-body] I have verified that there is no object code change (with gcc 7.5.0). Signed-off-by: Randy Dunlap Cc: Davidlohr Bueso Cc: Josh Triplett Signed-off-by: "Paul E. 
McKenney" --- include/linux/torture.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/torture.h b/include/linux/torture.h index 6241f59e2d6f..629b66e6c161 100644 --- a/include/linux/torture.h +++ b/include/linux/torture.h @@ -89,7 +89,7 @@ void _torture_stop_kthread(char *m, struct task_struct **tp); #ifdef CONFIG_PREEMPTION #define torture_preempt_schedule() preempt_schedule() #else -#define torture_preempt_schedule() +#define torture_preempt_schedule() do { } while (0) #endif #endif /* __LINUX_TORTURE_H */ -- cgit v1.2.3 From 4b8d7d4c599182393421c190bae3604b4db9629a Mon Sep 17 00:00:00 2001 From: Horatiu Vultur Date: Sun, 26 Apr 2020 15:22:00 +0200 Subject: bridge: mrp: Extend bridge interface To integrate MRP into the bridge, first the bridge needs to be aware of ports that are part of an MRP ring and which rings are on the bridge. Therefore extend bridge interface with the following: - add new flag(BR_MPP_AWARE) to the net bridge ports, this bit will be set when the port is added to an MRP instance. In this way it knows if the frame was received on MRP ring port - add new flag(BR_MRP_LOST_CONT) to the net bridge ports, this bit will be set when the port lost the continuity of MRP Test frames. - add a list of MRP instances Reviewed-by: Nikolay Aleksandrov Signed-off-by: Horatiu Vultur Signed-off-by: David S. Miller --- include/linux/if_bridge.h | 2 ++ net/bridge/br_private.h | 4 ++++ 2 files changed, 6 insertions(+) (limited to 'include/linux') diff --git a/include/linux/if_bridge.h b/include/linux/if_bridge.h index 9e57c4411734..b3a8d3054af0 100644 --- a/include/linux/if_bridge.h +++ b/include/linux/if_bridge.h @@ -47,6 +47,8 @@ struct br_ip_list { #define BR_BCAST_FLOOD BIT(14) #define BR_NEIGH_SUPPRESS BIT(15) #define BR_ISOLATED BIT(16) +#define BR_MRP_AWARE BIT(17) +#define BR_MRP_LOST_CONT BIT(18) #define BR_DEFAULT_AGEING_TIME (300 * HZ) diff --git a/net/bridge/br_private.h b/net/bridge/br_private.h index 1f97703a52ff..835a70f8d3ea 100644 --- a/net/bridge/br_private.h +++ b/net/bridge/br_private.h @@ -428,6 +428,10 @@ struct net_bridge { int offload_fwd_mark; #endif struct hlist_head fdb_list; + +#if IS_ENABLED(CONFIG_BRIDGE_MRP) + struct list_head __rcu mrp_list; +#endif }; struct br_input_skb_cb { -- cgit v1.2.3 From e145d9a184f23639edc864e2fa08fcc1853361a4 Mon Sep 17 00:00:00 2001 From: Akash Asthana Date: Wed, 15 Apr 2020 15:53:10 +0530 Subject: interconnect: Add devm_of_icc_get() as exported API for users Users can use devm version of of_icc_get() to benefit from automatic resource release. 
Signed-off-by: Akash Asthana Reviewed-by: Matthias Kaehlcke Reviewed-by: Bjorn Andersson Link: https://lore.kernel.org/r/1586946198-13912-2-git-send-email-akashast@codeaurora.org Signed-off-by: Georgi Djakov --- drivers/interconnect/core.c | 25 +++++++++++++++++++++++++ include/linux/interconnect.h | 7 +++++++ 2 files changed, 32 insertions(+) (limited to 'include/linux') diff --git a/drivers/interconnect/core.c b/drivers/interconnect/core.c index 2c6515e3ecf1..f5699ed34e43 100644 --- a/drivers/interconnect/core.c +++ b/drivers/interconnect/core.c @@ -350,6 +350,31 @@ static struct icc_node *of_icc_get_from_provider(struct of_phandle_args *spec) return node; } +static void devm_icc_release(struct device *dev, void *res) +{ + icc_put(*(struct icc_path **)res); +} + +struct icc_path *devm_of_icc_get(struct device *dev, const char *name) +{ + struct icc_path **ptr, *path; + + ptr = devres_alloc(devm_icc_release, sizeof(**ptr), GFP_KERNEL); + if (!ptr) + return ERR_PTR(-ENOMEM); + + path = of_icc_get(dev, name); + if (!IS_ERR(path)) { + *ptr = path; + devres_add(dev, ptr); + } else { + devres_free(ptr); + } + + return path; +} +EXPORT_SYMBOL_GPL(devm_of_icc_get); + /** * of_icc_get() - get a path handle from a DT node based on name * @dev: device pointer for the consumer device diff --git a/include/linux/interconnect.h b/include/linux/interconnect.h index d70a914cba11..770692421f4c 100644 --- a/include/linux/interconnect.h +++ b/include/linux/interconnect.h @@ -28,6 +28,7 @@ struct device; struct icc_path *icc_get(struct device *dev, const int src_id, const int dst_id); struct icc_path *of_icc_get(struct device *dev, const char *name); +struct icc_path *devm_of_icc_get(struct device *dev, const char *name); void icc_put(struct icc_path *path); int icc_set_bw(struct icc_path *path, u32 avg_bw, u32 peak_bw); void icc_set_tag(struct icc_path *path, u32 tag); @@ -46,6 +47,12 @@ static inline struct icc_path *of_icc_get(struct device *dev, return NULL; } +static inline struct icc_path *devm_of_icc_get(struct device *dev, + const char *name) +{ + return NULL; +} + static inline void icc_put(struct icc_path *path) { } -- cgit v1.2.3 From 40a571bc408bf10ac8be0eadf838483b9cf90141 Mon Sep 17 00:00:00 2001 From: Sergei Shtylyov Date: Fri, 3 Apr 2020 23:41:27 +0300 Subject: mtd: spi-nor: fix kernel-doc for 'struct spi_nor' When introducing 'struct spi_nor', a number of issues were added in its kernel-doc comment: - double article in the heading kernel-doc comment; - "point" instead of "pointer" for the 'mtd' and 'dev' fields; - "a" articles instead of "an" for the 'dev' field; - acronyms in lower case for the 'dev' field; - missing "pointer to" for the 'priv' field. Fix all of those at once... 
Fixes: 6e602ef73334 ("mtd: spi-nor: add the basic data structures") Signed-off-by: Sergei Shtylyov Signed-off-by: Tudor Ambarus --- include/linux/mtd/spi-nor.h | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mtd/spi-nor.h b/include/linux/mtd/spi-nor.h index 1cc8ed5d59ed..d3fcf7654040 100644 --- a/include/linux/mtd/spi-nor.h +++ b/include/linux/mtd/spi-nor.h @@ -327,10 +327,10 @@ struct spi_nor_manufacturer; struct spi_nor_flash_parameter; /** - * struct spi_nor - Structure for defining a the SPI NOR layer - * @mtd: point to a mtd_info structure + * struct spi_nor - Structure for defining the SPI NOR layer + * @mtd: pointer to an mtd_info structure * @lock: the lock for the read/write/erase/lock/unlock operations - * @dev: point to a spi device, or a spi nor controller device. + * @dev: pointer to an SPI device or an SPI NOR controller device * @spimem: point to the spi mem device * @bouncebuf: bounce buffer used when the buffer passed by the MTD * layer is not DMA-able @@ -354,7 +354,7 @@ struct spi_nor_flash_parameter; * settings that can be overwritten by the spi_nor_fixups * hooks, or dynamically when parsing the SFDP tables. * @dirmap: pointers to struct spi_mem_dirmap_desc for reads/writes. - * @priv: the private data + * @priv: pointer to the private data */ struct spi_nor { struct mtd_info mtd; -- cgit v1.2.3 From ba0aa311b0eb9d0a66a1b6b6fdeaa3b7584b798a Mon Sep 17 00:00:00 2001 From: Sergei Shtylyov Date: Fri, 3 Apr 2020 23:44:16 +0300 Subject: mtd: spi-nor: fix kernel-doc for spi_nor::mtd When embedding 'struct mtd_info' within 'struct spi_nor', the kernel-doc comment was not updated. Fix it by dropping the "pointer to" part from the comment. Fixes: 1976367173a4 ("mtd: spi-nor: embed struct mtd_info within struct spi_nor") Signed-off-by: Sergei Shtylyov Signed-off-by: Tudor Ambarus --- include/linux/mtd/spi-nor.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/mtd/spi-nor.h b/include/linux/mtd/spi-nor.h index d3fcf7654040..20077881c955 100644 --- a/include/linux/mtd/spi-nor.h +++ b/include/linux/mtd/spi-nor.h @@ -328,7 +328,7 @@ struct spi_nor_flash_parameter; /** * struct spi_nor - Structure for defining the SPI NOR layer - * @mtd: pointer to an mtd_info structure + * @mtd: an mtd_info structure * @lock: the lock for the read/write/erase/lock/unlock operations * @dev: pointer to an SPI device or an SPI NOR controller device * @spimem: point to the spi mem device -- cgit v1.2.3 From ba053dd3b4d841e17e910d0b91e22d9ea813fa39 Mon Sep 17 00:00:00 2001 From: Sergei Shtylyov Date: Fri, 3 Apr 2020 23:45:49 +0300 Subject: mtd: spi-nor: fix kernel-doc for spi_nor::reg_proto When adding the '{read|write|reg}_proto' fields to 'struct spi_nor', a colon was missed in the comment for 'spi_nor::reg_proto' -- add it. 
Fixes: cfc5604c488c ("mtd: spi-nor: introduce SPI 1-2-2 and SPI 1-4-4 protocols") Signed-off-by: Sergei Shtylyov Signed-off-by: Tudor Ambarus --- include/linux/mtd/spi-nor.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/mtd/spi-nor.h b/include/linux/mtd/spi-nor.h index 20077881c955..1b2143d921ac 100644 --- a/include/linux/mtd/spi-nor.h +++ b/include/linux/mtd/spi-nor.h @@ -347,7 +347,7 @@ struct spi_nor_flash_parameter; * @flags: flag options for the current SPI-NOR (SNOR_F_*) * @read_proto: the SPI protocol for read operations * @write_proto: the SPI protocol for write operations - * @reg_proto the SPI protocol for read_reg/write_reg/erase operations + * @reg_proto: the SPI protocol for read_reg/write_reg/erase operations * @controller_ops: SPI NOR controller driver specific operations. * @params: [FLASH-SPECIFIC] SPI-NOR flash parameters and settings. * The structure includes legacy flash parameters and -- cgit v1.2.3 From 80cb801144265866ce29fabe5ea7ac8588aa673c Mon Sep 17 00:00:00 2001 From: Sergei Shtylyov Date: Fri, 3 Apr 2020 23:49:48 +0300 Subject: mtd: spi-nor: fix kernel-doc for spi_nor::info When adding the 'info' field to 'struct spi_nor', some acronyms were in lower case, some in upper case, and the JEDEC acronym was mistyped -- fix these issues. 
Fixes: b35b9a10362 ("mtd: spi-nor: Move m25p80 code in spi-nor.c") Signed-off-by: Sergei Shtylyov Signed-off-by: Tudor Ambarus --- include/linux/mtd/spi-nor.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/mtd/spi-nor.h b/include/linux/mtd/spi-nor.h index 3333103c519b..bebff2729c18 100644 --- a/include/linux/mtd/spi-nor.h +++ b/include/linux/mtd/spi-nor.h @@ -331,7 +331,7 @@ struct spi_nor_flash_parameter; * @mtd: an mtd_info structure * @lock: the lock for the read/write/erase/lock/unlock operations * @dev: pointer to an SPI device or an SPI NOR controller device - * @spimem: point to the spi mem device + * @spimem: pointer to the SPI memory device * @bouncebuf: bounce buffer used when the buffer passed by the MTD * layer is not DMA-able * @bouncebuf_size: size of the bounce buffer -- cgit v1.2.3 From 767dea211cd0c68d8116d8c3b5104e82454fb44b Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 28 Apr 2020 07:17:03 +0200 Subject: x86/tboot: Mark tboot static This structure is only really used in tboot.c. The only exception is a single tboot_enabled check, but for that we don't need an inline function. Signed-off-by: Christoph Hellwig Signed-off-by: Borislav Petkov Link: https://lkml.kernel.org/r/20200428051703.1625952-1-hch@lst.de --- arch/x86/kernel/tboot.c | 8 ++++++-- include/linux/tboot.h | 8 +------- 2 files changed, 7 insertions(+), 9 deletions(-) (limited to 'include/linux') diff --git a/arch/x86/kernel/tboot.c b/arch/x86/kernel/tboot.c index b89f6ac6a0c0..b2942b2dbfcf 100644 --- a/arch/x86/kernel/tboot.c +++ b/arch/x86/kernel/tboot.c @@ -35,8 +35,7 @@ #include "../realmode/rm/wakeup.h" /* Global pointer to shared data; NULL means no measured launch. */ -struct tboot *tboot __read_mostly; -EXPORT_SYMBOL(tboot); +static struct tboot *tboot __read_mostly; /* timeout for APs (in secs) to enter wait-for-SIPI state during shutdown */ #define AP_WAIT_TIMEOUT 1 @@ -46,6 +45,11 @@ EXPORT_SYMBOL(tboot); static u8 tboot_uuid[16] __initdata = TBOOT_UUID; +bool tboot_enabled(void) +{ + return tboot != NULL; +} + void __init tboot_probe(void) { /* Look for valid page-aligned address for shared page. */ diff --git a/include/linux/tboot.h b/include/linux/tboot.h index 5424bc6feac8..c7e424766360 100644 --- a/include/linux/tboot.h +++ b/include/linux/tboot.h @@ -121,13 +121,7 @@ struct tboot { #define TBOOT_UUID {0xff, 0x8d, 0x3c, 0x66, 0xb3, 0xe8, 0x82, 0x4b, 0xbf,\ 0xaa, 0x19, 0xea, 0x4d, 0x5, 0x7a, 0x8} -extern struct tboot *tboot; - -static inline int tboot_enabled(void) -{ - return tboot != NULL; -} - +bool tboot_enabled(void); extern void tboot_probe(void); extern void tboot_shutdown(u32 shutdown_type); extern struct acpi_table_header *tboot_get_dmar_table( -- cgit v1.2.3 From 0c6b20a1d720ad1d43a2aca0ffa1af07c201b55d Mon Sep 17 00:00:00 2001 From: Manivannan Sadhasivam Date: Mon, 27 Apr 2020 13:28:27 +0530 Subject: bus: mhi: core: Add support for MHI suspend and resume Add support for MHI suspend and resume states. While at it, the mhi_notify() function needs to be exported as well. 
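As a rough sketch of how a controller driver might hook the new exports into its power management callbacks (the foo_* names and the drvdata wiring are assumptions, not part of this patch):

static int foo_mhi_suspend(struct device *dev)
{
        struct mhi_controller *mhi_cntrl = dev_get_drvdata(dev);

        /* Moves MHI to M3; returns -EBUSY while device wake is held. */
        return mhi_pm_suspend(mhi_cntrl);
}

static int foo_mhi_resume(struct device *dev)
{
        struct mhi_controller *mhi_cntrl = dev_get_drvdata(dev);

        /* Returns MHI to M0 and notifies clients with MHI_CB_LPM_EXIT. */
        return mhi_pm_resume(mhi_cntrl);
}

static SIMPLE_DEV_PM_OPS(foo_mhi_pm_ops, foo_mhi_suspend, foo_mhi_resume);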
Signed-off-by: Manivannan Sadhasivam Link: https://lore.kernel.org/r/20200427075829.9304-2-manivannan.sadhasivam@linaro.org Signed-off-by: Greg Kroah-Hartman --- drivers/bus/mhi/core/main.c | 3 +- drivers/bus/mhi/core/pm.c | 143 ++++++++++++++++++++++++++++++++++++++++++++ include/linux/mhi.h | 19 ++++++ 3 files changed, 164 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/drivers/bus/mhi/core/main.c b/drivers/bus/mhi/core/main.c index eb4256b81406..3e9aa3b2da77 100644 --- a/drivers/bus/mhi/core/main.c +++ b/drivers/bus/mhi/core/main.c @@ -267,7 +267,7 @@ int mhi_destroy_device(struct device *dev, void *data) return 0; } -static void mhi_notify(struct mhi_device *mhi_dev, enum mhi_callback cb_reason) +void mhi_notify(struct mhi_device *mhi_dev, enum mhi_callback cb_reason) { struct mhi_driver *mhi_drv; @@ -279,6 +279,7 @@ static void mhi_notify(struct mhi_device *mhi_dev, enum mhi_callback cb_reason) if (mhi_drv->status_cb) mhi_drv->status_cb(mhi_dev, cb_reason); } +EXPORT_SYMBOL_GPL(mhi_notify); /* Bind MHI channels to MHI devices */ void mhi_create_devices(struct mhi_controller *mhi_cntrl) diff --git a/drivers/bus/mhi/core/pm.c b/drivers/bus/mhi/core/pm.c index 52690cb5c89c..3529419d076b 100644 --- a/drivers/bus/mhi/core/pm.c +++ b/drivers/bus/mhi/core/pm.c @@ -669,6 +669,149 @@ void mhi_pm_st_worker(struct work_struct *work) } } +int mhi_pm_suspend(struct mhi_controller *mhi_cntrl) +{ + struct mhi_chan *itr, *tmp; + struct device *dev = &mhi_cntrl->mhi_dev->dev; + enum mhi_pm_state new_state; + int ret; + + if (mhi_cntrl->pm_state == MHI_PM_DISABLE) + return -EINVAL; + + if (MHI_PM_IN_ERROR_STATE(mhi_cntrl->pm_state)) + return -EIO; + + /* Return busy if there are any pending resources */ + if (atomic_read(&mhi_cntrl->dev_wake)) + return -EBUSY; + + /* Take MHI out of M2 state */ + read_lock_bh(&mhi_cntrl->pm_lock); + mhi_cntrl->wake_get(mhi_cntrl, false); + read_unlock_bh(&mhi_cntrl->pm_lock); + + ret = wait_event_timeout(mhi_cntrl->state_event, + mhi_cntrl->dev_state == MHI_STATE_M0 || + mhi_cntrl->dev_state == MHI_STATE_M1 || + MHI_PM_IN_ERROR_STATE(mhi_cntrl->pm_state), + msecs_to_jiffies(mhi_cntrl->timeout_ms)); + + read_lock_bh(&mhi_cntrl->pm_lock); + mhi_cntrl->wake_put(mhi_cntrl, false); + read_unlock_bh(&mhi_cntrl->pm_lock); + + if (!ret || MHI_PM_IN_ERROR_STATE(mhi_cntrl->pm_state)) { + dev_err(dev, + "Could not enter M0/M1 state"); + return -EIO; + } + + write_lock_irq(&mhi_cntrl->pm_lock); + + if (atomic_read(&mhi_cntrl->dev_wake)) { + write_unlock_irq(&mhi_cntrl->pm_lock); + return -EBUSY; + } + + dev_info(dev, "Allowing M3 transition\n"); + new_state = mhi_tryset_pm_state(mhi_cntrl, MHI_PM_M3_ENTER); + if (new_state != MHI_PM_M3_ENTER) { + write_unlock_irq(&mhi_cntrl->pm_lock); + dev_err(dev, + "Error setting to PM state: %s from: %s\n", + to_mhi_pm_state_str(MHI_PM_M3_ENTER), + to_mhi_pm_state_str(mhi_cntrl->pm_state)); + return -EIO; + } + + /* Set MHI to M3 and wait for completion */ + mhi_set_mhi_state(mhi_cntrl, MHI_STATE_M3); + write_unlock_irq(&mhi_cntrl->pm_lock); + dev_info(dev, "Wait for M3 completion\n"); + + ret = wait_event_timeout(mhi_cntrl->state_event, + mhi_cntrl->dev_state == MHI_STATE_M3 || + MHI_PM_IN_ERROR_STATE(mhi_cntrl->pm_state), + msecs_to_jiffies(mhi_cntrl->timeout_ms)); + + if (!ret || MHI_PM_IN_ERROR_STATE(mhi_cntrl->pm_state)) { + dev_err(dev, + "Did not enter M3 state, MHI state: %s, PM state: %s\n", + TO_MHI_STATE_STR(mhi_cntrl->dev_state), + to_mhi_pm_state_str(mhi_cntrl->pm_state)); + return -EIO; + } + + /* Notify 
clients about entering LPM */ + list_for_each_entry_safe(itr, tmp, &mhi_cntrl->lpm_chans, node) { + mutex_lock(&itr->mutex); + if (itr->mhi_dev) + mhi_notify(itr->mhi_dev, MHI_CB_LPM_ENTER); + mutex_unlock(&itr->mutex); + } + + return 0; +} +EXPORT_SYMBOL_GPL(mhi_pm_suspend); + +int mhi_pm_resume(struct mhi_controller *mhi_cntrl) +{ + struct mhi_chan *itr, *tmp; + struct device *dev = &mhi_cntrl->mhi_dev->dev; + enum mhi_pm_state cur_state; + int ret; + + dev_info(dev, "Entered with PM state: %s, MHI state: %s\n", + to_mhi_pm_state_str(mhi_cntrl->pm_state), + TO_MHI_STATE_STR(mhi_cntrl->dev_state)); + + if (mhi_cntrl->pm_state == MHI_PM_DISABLE) + return 0; + + if (MHI_PM_IN_ERROR_STATE(mhi_cntrl->pm_state)) + return -EIO; + + /* Notify clients about exiting LPM */ + list_for_each_entry_safe(itr, tmp, &mhi_cntrl->lpm_chans, node) { + mutex_lock(&itr->mutex); + if (itr->mhi_dev) + mhi_notify(itr->mhi_dev, MHI_CB_LPM_EXIT); + mutex_unlock(&itr->mutex); + } + + write_lock_irq(&mhi_cntrl->pm_lock); + cur_state = mhi_tryset_pm_state(mhi_cntrl, MHI_PM_M3_EXIT); + if (cur_state != MHI_PM_M3_EXIT) { + write_unlock_irq(&mhi_cntrl->pm_lock); + dev_info(dev, + "Error setting to PM state: %s from: %s\n", + to_mhi_pm_state_str(MHI_PM_M3_EXIT), + to_mhi_pm_state_str(mhi_cntrl->pm_state)); + return -EIO; + } + + /* Set MHI to M0 and wait for completion */ + mhi_set_mhi_state(mhi_cntrl, MHI_STATE_M0); + write_unlock_irq(&mhi_cntrl->pm_lock); + + ret = wait_event_timeout(mhi_cntrl->state_event, + mhi_cntrl->dev_state == MHI_STATE_M0 || + MHI_PM_IN_ERROR_STATE(mhi_cntrl->pm_state), + msecs_to_jiffies(mhi_cntrl->timeout_ms)); + + if (!ret || MHI_PM_IN_ERROR_STATE(mhi_cntrl->pm_state)) { + dev_err(dev, + "Did not enter M0 state, MHI state: %s, PM state: %s\n", + TO_MHI_STATE_STR(mhi_cntrl->dev_state), + to_mhi_pm_state_str(mhi_cntrl->pm_state)); + return -EIO; + } + + return 0; +} +EXPORT_SYMBOL_GPL(mhi_pm_resume); + int __mhi_device_get_sync(struct mhi_controller *mhi_cntrl) { int ret; diff --git a/include/linux/mhi.h b/include/linux/mhi.h index ad1996001965..a4288f4d656f 100644 --- a/include/linux/mhi.h +++ b/include/linux/mhi.h @@ -568,6 +568,13 @@ void mhi_driver_unregister(struct mhi_driver *mhi_drv); void mhi_set_mhi_state(struct mhi_controller *mhi_cntrl, enum mhi_state state); +/** + * mhi_notify - Notify the MHI client driver about client device status + * @mhi_dev: MHI device instance + * @cb_reason: MHI callback reason + */ +void mhi_notify(struct mhi_device *mhi_dev, enum mhi_callback cb_reason); + /** * mhi_prepare_for_power_up - Do pre-initialization before power up. * This is optional, call this before power up if @@ -604,6 +611,18 @@ void mhi_power_down(struct mhi_controller *mhi_cntrl, bool graceful); */ void mhi_unprepare_after_power_down(struct mhi_controller *mhi_cntrl); +/** + * mhi_pm_suspend - Move MHI into a suspended state + * @mhi_cntrl: MHI controller + */ +int mhi_pm_suspend(struct mhi_controller *mhi_cntrl); + +/** + * mhi_pm_resume - Resume MHI from suspended state + * @mhi_cntrl: MHI controller + */ +int mhi_pm_resume(struct mhi_controller *mhi_cntrl); + /** * mhi_download_rddm_img - Download ramdump image from device for * debugging purpose. 
-- cgit v1.2.3 From 7536ad8dbfcfd56cd04d005b76cd9ecf2036e220 Mon Sep 17 00:00:00 2001 From: Richard Gong Date: Tue, 14 Apr 2020 15:47:54 -0500 Subject: firmware: fpga: replace the error codes with the standard ones MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The Intel service layer driver has defined error codes for specific services, starting with FPGA configuration and later RSU (Remote Status Update). The Intel service layer driver should define standard error codes rather than keep adding more error codes for each new service. The standard error codes will be used by all clients of the Intel service layer driver. Replace the FPGA- and RSU-specific error codes with the Intel service layer’s common error codes. Signed-off-by: Richard Gong Link: https://lore.kernel.org/r/1586897274-307-2-git-send-email-richard.gong@linux.intel.com Signed-off-by: Greg Kroah-Hartman --- drivers/firmware/stratix10-rsu.c | 10 +-- drivers/firmware/stratix10-svc.c | 62 ++++++------------- drivers/fpga/stratix10-soc.c | 25 +++----- include/linux/firmware/intel/stratix10-smc.h | 49 +++++++-------- .../linux/firmware/intel/stratix10-svc-client.h | 71 +++++++++------------- 5 files changed, 84 insertions(+), 133 deletions(-) (limited to 'include/linux') diff --git a/drivers/firmware/stratix10-rsu.c b/drivers/firmware/stratix10-rsu.c index f8533338b018..4379475c99ed 100644 --- a/drivers/firmware/stratix10-rsu.c +++ b/drivers/firmware/stratix10-rsu.c @@ -72,7 +72,7 @@ static void rsu_status_callback(struct stratix10_svc_client *client, struct stratix10_rsu_priv *priv = client->priv; struct arm_smccc_res *res = (struct arm_smccc_res *)data->kaddr1; - if (data->status == BIT(SVC_STATUS_RSU_OK)) { + if (data->status == BIT(SVC_STATUS_OK)) { priv->status.version = FIELD_GET(RSU_VERSION_MASK, res->a2); priv->status.state = FIELD_GET(RSU_STATE_MASK, res->a2); @@ -108,9 +108,9 @@ static void rsu_command_callback(struct stratix10_svc_client *client, { struct stratix10_rsu_priv *priv = client->priv; - if (data->status == BIT(SVC_STATUS_RSU_NO_SUPPORT)) + if (data->status == BIT(SVC_STATUS_NO_SUPPORT)) dev_warn(client->dev, "Secure FW doesn't support notify\n"); - else if (data->status == BIT(SVC_STATUS_RSU_ERROR)) + else if (data->status == BIT(SVC_STATUS_ERROR)) dev_err(client->dev, "Failure, returned status is %lu\n", BIT(data->status)); @@ -133,9 +133,9 @@ static void rsu_retry_callback(struct stratix10_svc_client *client, struct stratix10_rsu_priv *priv = client->priv; unsigned int *counter = (unsigned int *)data->kaddr1; - if (data->status == BIT(SVC_STATUS_RSU_OK)) + if (data->status == BIT(SVC_STATUS_OK)) priv->retry_counter = *counter; - else if (data->status == BIT(SVC_STATUS_RSU_NO_SUPPORT)) + else if (data->status == BIT(SVC_STATUS_NO_SUPPORT)) dev_warn(client->dev, "Secure FW doesn't support retry\n"); else dev_err(client->dev, "Failed to get retry counter %lu\n", diff --git a/drivers/firmware/stratix10-svc.c b/drivers/firmware/stratix10-svc.c index d5f0769f3761..e0db8dbfc9d1 100644 --- a/drivers/firmware/stratix10-svc.c +++ b/drivers/firmware/stratix10-svc.c @@ -214,7 +214,7 @@ static void svc_thread_cmd_data_claim(struct stratix10_svc_controller *ctrl, complete(&ctrl->complete_status); break; } - cb_data->status = BIT(SVC_STATUS_RECONFIG_BUFFER_DONE); + cb_data->status = BIT(SVC_STATUS_BUFFER_DONE); cb_data->kaddr1 = svc_pa_to_va(res.a1); cb_data->kaddr2 = (res.a2) ? 
svc_pa_to_va(res.a2) : NULL; @@ -227,7 +227,7 @@ static void svc_thread_cmd_data_claim(struct stratix10_svc_controller *ctrl, __func__); } } while (res.a0 == INTEL_SIP_SMC_STATUS_OK || - res.a0 == INTEL_SIP_SMC_FPGA_CONFIG_STATUS_BUSY || + res.a0 == INTEL_SIP_SMC_STATUS_BUSY || wait_for_completion_timeout(&ctrl->complete_status, timeout)); } @@ -250,7 +250,7 @@ static void svc_thread_cmd_config_status(struct stratix10_svc_controller *ctrl, cb_data->kaddr1 = NULL; cb_data->kaddr2 = NULL; cb_data->kaddr3 = NULL; - cb_data->status = BIT(SVC_STATUS_RECONFIG_ERROR); + cb_data->status = BIT(SVC_STATUS_ERROR); pr_debug("%s: polling config status\n", __func__); @@ -259,7 +259,7 @@ static void svc_thread_cmd_config_status(struct stratix10_svc_controller *ctrl, ctrl->invoke_fn(INTEL_SIP_SMC_FPGA_CONFIG_ISDONE, 0, 0, 0, 0, 0, 0, 0, &res); if ((res.a0 == INTEL_SIP_SMC_STATUS_OK) || - (res.a0 == INTEL_SIP_SMC_FPGA_CONFIG_STATUS_ERROR)) + (res.a0 == INTEL_SIP_SMC_STATUS_ERROR)) break; /* @@ -271,7 +271,7 @@ static void svc_thread_cmd_config_status(struct stratix10_svc_controller *ctrl, } if (res.a0 == INTEL_SIP_SMC_STATUS_OK && count_in_sec) - cb_data->status = BIT(SVC_STATUS_RECONFIG_COMPLETED); + cb_data->status = BIT(SVC_STATUS_COMPLETED); p_data->chan->scl->receive_cb(p_data->chan->scl, cb_data); } @@ -294,24 +294,18 @@ static void svc_thread_recv_status_ok(struct stratix10_svc_data *p_data, switch (p_data->command) { case COMMAND_RECONFIG: - cb_data->status = BIT(SVC_STATUS_RECONFIG_REQUEST_OK); + case COMMAND_RSU_UPDATE: + case COMMAND_RSU_NOTIFY: + cb_data->status = BIT(SVC_STATUS_OK); break; case COMMAND_RECONFIG_DATA_SUBMIT: - cb_data->status = BIT(SVC_STATUS_RECONFIG_BUFFER_SUBMITTED); - break; - case COMMAND_NOOP: - cb_data->status = BIT(SVC_STATUS_RECONFIG_BUFFER_SUBMITTED); - cb_data->kaddr1 = svc_pa_to_va(res.a1); + cb_data->status = BIT(SVC_STATUS_BUFFER_SUBMITTED); break; case COMMAND_RECONFIG_STATUS: - cb_data->status = BIT(SVC_STATUS_RECONFIG_COMPLETED); - break; - case COMMAND_RSU_UPDATE: - case COMMAND_RSU_NOTIFY: - cb_data->status = BIT(SVC_STATUS_RSU_OK); + cb_data->status = BIT(SVC_STATUS_COMPLETED); break; case COMMAND_RSU_RETRY: - cb_data->status = BIT(SVC_STATUS_RSU_OK); + cb_data->status = BIT(SVC_STATUS_OK); cb_data->kaddr1 = &res.a1; break; default: @@ -430,9 +424,9 @@ static int svc_normal_to_secure_thread(void *data) if (pdata->command == COMMAND_RSU_STATUS) { if (res.a0 == INTEL_SIP_SMC_RSU_ERROR) - cbdata->status = BIT(SVC_STATUS_RSU_ERROR); + cbdata->status = BIT(SVC_STATUS_ERROR); else - cbdata->status = BIT(SVC_STATUS_RSU_OK); + cbdata->status = BIT(SVC_STATUS_OK); cbdata->kaddr1 = &res; cbdata->kaddr2 = NULL; @@ -445,7 +439,7 @@ static int svc_normal_to_secure_thread(void *data) case INTEL_SIP_SMC_STATUS_OK: svc_thread_recv_status_ok(pdata, cbdata, res); break; - case INTEL_SIP_SMC_FPGA_CONFIG_STATUS_BUSY: + case INTEL_SIP_SMC_STATUS_BUSY: switch (pdata->command) { case COMMAND_RECONFIG_DATA_SUBMIT: svc_thread_cmd_data_claim(ctrl, @@ -460,33 +454,13 @@ static int svc_normal_to_secure_thread(void *data) break; } break; - case INTEL_SIP_SMC_FPGA_CONFIG_STATUS_REJECTED: + case INTEL_SIP_SMC_STATUS_REJECTED: pr_debug("%s: STATUS_REJECTED\n", __func__); break; - case INTEL_SIP_SMC_FPGA_CONFIG_STATUS_ERROR: + case INTEL_SIP_SMC_STATUS_ERROR: case INTEL_SIP_SMC_RSU_ERROR: pr_err("%s: STATUS_ERROR\n", __func__); - switch (pdata->command) { - /* for FPGA mgr */ - case COMMAND_RECONFIG_DATA_CLAIM: - case COMMAND_RECONFIG: - case COMMAND_RECONFIG_DATA_SUBMIT: - case 
COMMAND_RECONFIG_STATUS: - cbdata->status = - BIT(SVC_STATUS_RECONFIG_ERROR); - break; - - /* for RSU */ - case COMMAND_RSU_STATUS: - case COMMAND_RSU_UPDATE: - case COMMAND_RSU_NOTIFY: - case COMMAND_RSU_RETRY: - cbdata->status = - BIT(SVC_STATUS_RSU_ERROR); - break; - } - - cbdata->status = BIT(SVC_STATUS_RECONFIG_ERROR); + cbdata->status = BIT(SVC_STATUS_ERROR); cbdata->kaddr1 = NULL; cbdata->kaddr2 = NULL; cbdata->kaddr3 = NULL; @@ -502,7 +476,7 @@ static int svc_normal_to_secure_thread(void *data) if ((pdata->command == COMMAND_RSU_RETRY) || (pdata->command == COMMAND_RSU_NOTIFY)) { cbdata->status = - BIT(SVC_STATUS_RSU_NO_SUPPORT); + BIT(SVC_STATUS_NO_SUPPORT); cbdata->kaddr1 = NULL; cbdata->kaddr2 = NULL; cbdata->kaddr3 = NULL; diff --git a/drivers/fpga/stratix10-soc.c b/drivers/fpga/stratix10-soc.c index 215d33789c74..bae98c8ce362 100644 --- a/drivers/fpga/stratix10-soc.c +++ b/drivers/fpga/stratix10-soc.c @@ -154,11 +154,11 @@ static void s10_receive_callback(struct stratix10_svc_client *client, * Here we set status bits as we receive them. Elsewhere, we always use * test_and_clear_bit() to check status in priv->status */ - for (i = 0; i <= SVC_STATUS_RECONFIG_ERROR; i++) + for (i = 0; i <= SVC_STATUS_ERROR; i++) if (status & (1 << i)) set_bit(i, &priv->status); - if (status & BIT(SVC_STATUS_RECONFIG_BUFFER_DONE)) { + if (status & BIT(SVC_STATUS_BUFFER_DONE)) { s10_unlock_bufs(priv, data->kaddr1); s10_unlock_bufs(priv, data->kaddr2); s10_unlock_bufs(priv, data->kaddr3); @@ -209,8 +209,7 @@ static int s10_ops_write_init(struct fpga_manager *mgr, } ret = 0; - if (!test_and_clear_bit(SVC_STATUS_RECONFIG_REQUEST_OK, - &priv->status)) { + if (!test_and_clear_bit(SVC_STATUS_OK, &priv->status)) { ret = -ETIMEDOUT; goto init_done; } @@ -323,17 +322,15 @@ static int s10_ops_write(struct fpga_manager *mgr, const char *buf, &priv->status_return_completion, S10_BUFFER_TIMEOUT); - if (test_and_clear_bit(SVC_STATUS_RECONFIG_BUFFER_DONE, - &priv->status) || - test_and_clear_bit(SVC_STATUS_RECONFIG_BUFFER_SUBMITTED, + if (test_and_clear_bit(SVC_STATUS_BUFFER_DONE, &priv->status) || + test_and_clear_bit(SVC_STATUS_BUFFER_SUBMITTED, &priv->status)) { ret = 0; continue; } - if (test_and_clear_bit(SVC_STATUS_RECONFIG_ERROR, - &priv->status)) { - dev_err(dev, "ERROR - giving up - SVC_STATUS_RECONFIG_ERROR\n"); + if (test_and_clear_bit(SVC_STATUS_ERROR, &priv->status)) { + dev_err(dev, "ERROR - giving up - SVC_STATUS_ERROR\n"); ret = -EFAULT; break; } @@ -393,13 +390,11 @@ static int s10_ops_write_complete(struct fpga_manager *mgr, timeout = ret; ret = 0; - if (test_and_clear_bit(SVC_STATUS_RECONFIG_COMPLETED, - &priv->status)) + if (test_and_clear_bit(SVC_STATUS_COMPLETED, &priv->status)) break; - if (test_and_clear_bit(SVC_STATUS_RECONFIG_ERROR, - &priv->status)) { - dev_err(dev, "ERROR - giving up - SVC_STATUS_RECONFIG_ERROR\n"); + if (test_and_clear_bit(SVC_STATUS_ERROR, &priv->status)) { + dev_err(dev, "ERROR - giving up - SVC_STATUS_ERROR\n"); ret = -EFAULT; break; } diff --git a/include/linux/firmware/intel/stratix10-smc.h b/include/linux/firmware/intel/stratix10-smc.h index 013ae4819deb..682dbf694007 100644 --- a/include/linux/firmware/intel/stratix10-smc.h +++ b/include/linux/firmware/intel/stratix10-smc.h @@ -54,32 +54,25 @@ * Secure monitor software doesn't recognize the request. 
* * INTEL_SIP_SMC_STATUS_OK: - * FPGA configuration completed successfully, - * In case of FPGA configuration write operation, it means secure monitor - * software can accept the next chunk of FPGA configuration data. + * Secure monitor software accepts the service client's request. * - * INTEL_SIP_SMC_FPGA_CONFIG_STATUS_BUSY: - * In case of FPGA configuration write operation, it means secure monitor - * software is still processing previous data & can't accept the next chunk - * of data. Service driver needs to issue - * INTEL_SIP_SMC_FPGA_CONFIG_COMPLETED_WRITE call to query the - * completed block(s). + * INTEL_SIP_SMC_STATUS_BUSY: + * Secure monitor software is still processing service client's request. * - * INTEL_SIP_SMC_FPGA_CONFIG_STATUS_ERROR: - * There is error during the FPGA configuration process. + * INTEL_SIP_SMC_STATUS_REJECTED: + * Secure monitor software reject the service client's request. * - * INTEL_SIP_SMC_REG_ERROR: - * There is error during a read or write operation of the protected registers. + * INTEL_SIP_SMC_STATUS_ERROR: + * There is error during the process of service request. * * INTEL_SIP_SMC_RSU_ERROR: - * There is error during a remote status update. + * There is error during the process of remote status update request. */ #define INTEL_SIP_SMC_RETURN_UNKNOWN_FUNCTION 0xFFFFFFFF #define INTEL_SIP_SMC_STATUS_OK 0x0 -#define INTEL_SIP_SMC_FPGA_CONFIG_STATUS_BUSY 0x1 -#define INTEL_SIP_SMC_FPGA_CONFIG_STATUS_REJECTED 0x2 -#define INTEL_SIP_SMC_FPGA_CONFIG_STATUS_ERROR 0x4 -#define INTEL_SIP_SMC_REG_ERROR 0x5 +#define INTEL_SIP_SMC_STATUS_BUSY 0x1 +#define INTEL_SIP_SMC_STATUS_REJECTED 0x2 +#define INTEL_SIP_SMC_STATUS_ERROR 0x4 #define INTEL_SIP_SMC_RSU_ERROR 0x7 /** @@ -95,7 +88,7 @@ * a2-7: not used. * * Return status: - * a0: INTEL_SIP_SMC_STATUS_OK, or INTEL_SIP_SMC_FPGA_CONFIG_STATUS_ERROR. + * a0: INTEL_SIP_SMC_STATUS_OK, or INTEL_SIP_SMC_STATUS_ERROR. * a1-3: not used. */ #define INTEL_SIP_SMC_FUNCID_FPGA_CONFIG_START 1 @@ -115,8 +108,8 @@ * a3-7: not used. * * Return status: - * a0: INTEL_SIP_SMC_STATUS_OK, INTEL_SIP_SMC_FPGA_CONFIG_STATUS_BUSY or - * INTEL_SIP_SMC_FPGA_CONFIG_STATUS_ERROR. + * a0: INTEL_SIP_SMC_STATUS_OK, INTEL_SIP_SMC_STATUS_BUSY or + * INTEL_SIP_SMC_STATUS_ERROR. * a1: 64bit physical address of 1st completed memory block if any completed * block, otherwise zero value. * a2: 64bit physical address of 2nd completed memory block if any completed @@ -133,15 +126,15 @@ * * Sync call used by service driver at EL1 to track the completed write * transactions. This request is called after INTEL_SIP_SMC_FPGA_CONFIG_WRITE - * call returns INTEL_SIP_SMC_FPGA_CONFIG_STATUS_BUSY. + * call returns INTEL_SIP_SMC_STATUS_BUSY. * * Call register usage: * a0: INTEL_SIP_SMC_FPGA_CONFIG_COMPLETED_WRITE. * a1-7: not used. * * Return status: - * a0: INTEL_SIP_SMC_STATUS_OK, INTEL_SIP_SMC_FPGA_CONFIG_STATUS_BUSY or - * INTEL_SIP_SMC_FPGA_CONFIG_STATUS_ERROR. + * a0: INTEL_SIP_SMC_STATUS_OK, INTEL_SIP_SMC_FPGA_BUSY or + * INTEL_SIP_SMC_STATUS_ERROR. * a1: 64bit physical address of 1st completed memory block. * a2: 64bit physical address of 2nd completed memory block if * any completed block, otherwise zero value. @@ -164,8 +157,8 @@ INTEL_SIP_SMC_FAST_CALL_VAL(INTEL_SIP_SMC_FUNCID_FPGA_CONFIG_COMPLETED_WRITE) * a1-7: not used. * * Return status: - * a0: INTEL_SIP_SMC_STATUS_OK, INTEL_SIP_SMC_FPGA_CONFIG_STATUS_BUSY or - * INTEL_SIP_SMC_FPGA_CONFIG_STATUS_ERROR. + * a0: INTEL_SIP_SMC_STATUS_OK, INTEL_SIP_SMC_STATUS_BUSY or + * INTEL_SIP_SMC_STATUS_ERROR. 
* a1-3: not used. */ #define INTEL_SIP_SMC_FUNCID_FPGA_CONFIG_ISDONE 4 @@ -183,7 +176,7 @@ INTEL_SIP_SMC_FAST_CALL_VAL(INTEL_SIP_SMC_FUNCID_FPGA_CONFIG_COMPLETED_WRITE) * a1-7: not used. * * Return status: - * a0: INTEL_SIP_SMC_STATUS_OK or INTEL_SIP_SMC_FPGA_CONFIG_STATUS_ERROR. + * a0: INTEL_SIP_SMC_STATUS_OK or INTEL_SIP_SMC_STATUS_ERROR. * a1: start of physical address of reserved memory block. * a2: size of reserved memory block. * a3: not used. @@ -203,7 +196,7 @@ INTEL_SIP_SMC_FAST_CALL_VAL(INTEL_SIP_SMC_FUNCID_FPGA_CONFIG_COMPLETED_WRITE) * a1-7: not used. * * Return status: - * a0: INTEL_SIP_SMC_STATUS_OK or INTEL_SIP_SMC_FPGA_CONFIG_STATUS_ERROR. + * a0: INTEL_SIP_SMC_STATUS_OK or INTEL_SIP_SMC_STATUS_ERROR. * a1-3: not used. */ #define INTEL_SIP_SMC_FUNCID_FPGA_CONFIG_LOOPBACK 6 diff --git a/include/linux/firmware/intel/stratix10-svc-client.h b/include/linux/firmware/intel/stratix10-svc-client.h index 59bc6e2af693..64213c3e82f5 100644 --- a/include/linux/firmware/intel/stratix10-svc-client.h +++ b/include/linux/firmware/intel/stratix10-svc-client.h @@ -18,45 +18,37 @@ /** * Status of the sent command, in bit number * - * SVC_COMMAND_STATUS_RECONFIG_REQUEST_OK: - * Secure firmware accepts the request of FPGA reconfiguration. + * SVC_STATUS_OK: + * Secure firmware accepts the request issued by one of service clients. * - * SVC_STATUS_RECONFIG_BUFFER_SUBMITTED: - * Service client successfully submits FPGA configuration - * data buffer to secure firmware. + * SVC_STATUS_BUFFER_SUBMITTED: + * Service client successfully submits data buffer to secure firmware. * - * SVC_COMMAND_STATUS_RECONFIG_BUFFER_DONE: + * SVC_STATUS_BUFFER_DONE: * Secure firmware completes data process, ready to accept the * next WRITE transaction. * - * SVC_COMMAND_STATUS_RECONFIG_COMPLETED: - * Secure firmware completes FPGA configuration successfully, FPGA should - * be in user mode. + * SVC_STATUS_COMPLETED: + * Secure firmware completes service request successfully. In case of + * FPGA configuration, FPGA should be in user mode. * - * SVC_COMMAND_STATUS_RECONFIG_BUSY: - * FPGA configuration is still in process. + * SVC_COMMAND_STATUS_BUSY: + * Service request is still in process. * - * SVC_COMMAND_STATUS_RECONFIG_ERROR: - * Error encountered during FPGA configuration. + * SVC_COMMAND_STATUS_ERROR: + * Error encountered during the process of the service request. * - * SVC_STATUS_RSU_OK: - * Secure firmware accepts the request of remote status update (RSU). - * - * SVC_STATUS_RSU_ERROR: - * Error encountered during remote system update. - * - * SVC_STATUS_RSU_NO_SUPPORT: - * Secure firmware doesn't support RSU retry or notify feature. + * SVC_STATUS_NO_SUPPORT: + * Secure firmware doesn't support requested features such as RSU retry + * or RSU notify. 
*/ -#define SVC_STATUS_RECONFIG_REQUEST_OK 0 -#define SVC_STATUS_RECONFIG_BUFFER_SUBMITTED 1 -#define SVC_STATUS_RECONFIG_BUFFER_DONE 2 -#define SVC_STATUS_RECONFIG_COMPLETED 3 -#define SVC_STATUS_RECONFIG_BUSY 4 -#define SVC_STATUS_RECONFIG_ERROR 5 -#define SVC_STATUS_RSU_OK 6 -#define SVC_STATUS_RSU_ERROR 7 -#define SVC_STATUS_RSU_NO_SUPPORT 8 +#define SVC_STATUS_OK 0 +#define SVC_STATUS_BUFFER_SUBMITTED 1 +#define SVC_STATUS_BUFFER_DONE 2 +#define SVC_STATUS_COMPLETED 3 +#define SVC_STATUS_BUSY 4 +#define SVC_STATUS_ERROR 5 +#define SVC_STATUS_NO_SUPPORT 6 /** * Flag bit for COMMAND_RECONFIG @@ -84,32 +76,29 @@ struct stratix10_svc_chan; * @COMMAND_NOOP: do 'dummy' request for integration/debug/trouble-shooting * * @COMMAND_RECONFIG: ask for FPGA configuration preparation, return status - * is SVC_STATUS_RECONFIG_REQUEST_OK + * is SVC_STATUS_OK * * @COMMAND_RECONFIG_DATA_SUBMIT: submit buffer(s) of bit-stream data for the - * FPGA configuration, return status is SVC_STATUS_RECONFIG_BUFFER_SUBMITTED, - * or SVC_STATUS_RECONFIG_ERROR + * FPGA configuration, return status is SVC_STATUS_SUBMITTED or SVC_STATUS_ERROR * * @COMMAND_RECONFIG_DATA_CLAIM: check the status of the configuration, return - * status is SVC_STATUS_RECONFIG_COMPLETED, or SVC_STATUS_RECONFIG_BUSY, or - * SVC_STATUS_RECONFIG_ERROR + * status is SVC_STATUS_COMPLETED, or SVC_STATUS_BUSY, or SVC_STATUS_ERROR * * @COMMAND_RECONFIG_STATUS: check the status of the configuration, return - * status is SVC_STATUS_RECONFIG_COMPLETED, or SVC_STATUS_RECONFIG_BUSY, or - * SVC_STATUS_RECONFIG_ERROR + * status is SVC_STATUS_COMPLETED, or SVC_STATUS_BUSY, or SVC_STATUS_ERROR * * @COMMAND_RSU_STATUS: request remote system update boot log, return status * is log data or SVC_STATUS_RSU_ERROR * * @COMMAND_RSU_UPDATE: set the offset of the bitstream to boot after reboot, - * return status is SVC_STATUS_RSU_OK or SVC_STATUS_RSU_ERROR + * return status is SVC_STATUS_OK or SVC_STATUS_ERROR * * @COMMAND_RSU_NOTIFY: report the status of hard processor system - * software to firmware, return status is SVC_STATUS_RSU_OK or - * SVC_STATUS_RSU_ERROR + * software to firmware, return status is SVC_STATUS_OK or + * SVC_STATUS_ERROR * * @COMMAND_RSU_RETRY: query firmware for the current image's retry counter, - * return status is SVC_STATUS_RSU_OK or SVC_STATUS_RSU_ERROR + * return status is SVC_STATUS_OK or SVC_STATUS_ERROR */ enum stratix10_svc_command_code { COMMAND_NOOP = 0, -- cgit v1.2.3 From b9b3a8be28b31a3dbcb0ced07aa0d869f45cdb69 Mon Sep 17 00:00:00 2001 From: Rajan Vaja Date: Fri, 24 Apr 2020 13:57:43 -0700 Subject: firmware: xilinx: Remove eemi ops for get_api_version Use direct function calls instead of using eemi ops. So remove eemi ops for get_api_version and use direct function call. 
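A minimal sketch of the resulting call style, mirroring the zynqmp_power.c hunk below (the foo_report_pm_version() wrapper and its error handling are illustrative assumptions):

#include <linux/firmware/xlnx-zynqmp.h>

static int foo_report_pm_version(void)
{
        u32 pm_api_version;
        int ret;

        /* Direct call; no zynqmp_pm_get_eemi_ops() lookup is needed. */
        ret = zynqmp_pm_get_api_version(&pm_api_version);
        if (ret)
                return ret;

        pr_info("PM-API Version = %d.%d\n",
                pm_api_version >> 16, pm_api_version & 0xffff);
        return 0;
}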
Signed-off-by: Rajan Vaja Signed-off-by: Jolly Shah Link: https://lore.kernel.org/r/1587761887-4279-2-git-send-email-jolly.shah@xilinx.com Signed-off-by: Greg Kroah-Hartman --- drivers/firmware/xilinx/zynqmp-debug.c | 2 +- drivers/firmware/xilinx/zynqmp.c | 4 ++-- drivers/soc/xilinx/zynqmp_power.c | 4 ++-- include/linux/firmware/xlnx-zynqmp.h | 6 +++++- 4 files changed, 10 insertions(+), 6 deletions(-) (limited to 'include/linux') diff --git a/drivers/firmware/xilinx/zynqmp-debug.c b/drivers/firmware/xilinx/zynqmp-debug.c index 43bc6cfdab45..06a21efe6829 100644 --- a/drivers/firmware/xilinx/zynqmp-debug.c +++ b/drivers/firmware/xilinx/zynqmp-debug.c @@ -92,7 +92,7 @@ static int process_api_request(u32 pm_id, u64 *pm_api_arg, u32 *pm_api_ret) switch (pm_id) { case PM_GET_API_VERSION: - ret = eemi_ops->get_api_version(&pm_api_version); + ret = zynqmp_pm_get_api_version(&pm_api_version); sprintf(debugfs_buf, "PM-API Version = %d.%d\n", pm_api_version >> 16, pm_api_version & 0xffff); break; diff --git a/drivers/firmware/xilinx/zynqmp.c b/drivers/firmware/xilinx/zynqmp.c index 41b65164a367..36dab68344df 100644 --- a/drivers/firmware/xilinx/zynqmp.c +++ b/drivers/firmware/xilinx/zynqmp.c @@ -219,7 +219,7 @@ static u32 pm_tz_version; * * Return: Returns status, either success or error+reason */ -static int zynqmp_pm_get_api_version(u32 *version) +int zynqmp_pm_get_api_version(u32 *version) { u32 ret_payload[PAYLOAD_ARG_CNT]; int ret; @@ -237,6 +237,7 @@ static int zynqmp_pm_get_api_version(u32 *version) return ret; } +EXPORT_SYMBOL_GPL(zynqmp_pm_get_api_version); /** * zynqmp_pm_get_chipid - Get silicon ID registers @@ -734,7 +735,6 @@ static int zynqmp_pm_aes_engine(const u64 address, u32 *out) } static const struct zynqmp_eemi_ops eemi_ops = { - .get_api_version = zynqmp_pm_get_api_version, .get_chipid = zynqmp_pm_get_chipid, .query_data = zynqmp_pm_query_data, .clock_enable = zynqmp_pm_clock_enable, diff --git a/drivers/soc/xilinx/zynqmp_power.c b/drivers/soc/xilinx/zynqmp_power.c index 09227895d216..d327d9ef0e81 100644 --- a/drivers/soc/xilinx/zynqmp_power.c +++ b/drivers/soc/xilinx/zynqmp_power.c @@ -186,11 +186,11 @@ static int zynqmp_pm_probe(struct platform_device *pdev) if (IS_ERR(eemi_ops)) return PTR_ERR(eemi_ops); - if (!eemi_ops->get_api_version || !eemi_ops->init_finalize) + if (!eemi_ops->init_finalize) return -ENXIO; eemi_ops->init_finalize(); - eemi_ops->get_api_version(&pm_api_version); + zynqmp_pm_get_api_version(&pm_api_version); /* Check PM API version number */ if (pm_api_version < ZYNQMP_PM_VERSION) diff --git a/include/linux/firmware/xlnx-zynqmp.h b/include/linux/firmware/xlnx-zynqmp.h index 8efa5ac22d7e..a21abcdb1020 100644 --- a/include/linux/firmware/xlnx-zynqmp.h +++ b/include/linux/firmware/xlnx-zynqmp.h @@ -294,7 +294,6 @@ struct zynqmp_pm_query_data { }; struct zynqmp_eemi_ops { - int (*get_api_version)(u32 *version); int (*get_chipid)(u32 *idcode, u32 *version); int (*fpga_load)(const u64 address, const u32 size, const u32 flags); int (*fpga_get_status)(u32 *value); @@ -331,11 +330,16 @@ int zynqmp_pm_invoke_fn(u32 pm_api_id, u32 arg0, u32 arg1, #if IS_REACHABLE(CONFIG_ZYNQMP_FIRMWARE) const struct zynqmp_eemi_ops *zynqmp_pm_get_eemi_ops(void); +int zynqmp_pm_get_api_version(u32 *version); #else static inline struct zynqmp_eemi_ops *zynqmp_pm_get_eemi_ops(void) { return ERR_PTR(-ENODEV); } +static inline int zynqmp_pm_get_api_version(u32 *version) +{ + return -ENODEV; +} #endif #endif /* __FIRMWARE_ZYNQMP_H__ */ -- cgit v1.2.3 From 
21cd93bab92b2818c9391465123cd4be6431c69e Mon Sep 17 00:00:00 2001 From: Rajan Vaja Date: Fri, 24 Apr 2020 13:57:44 -0700 Subject: firmware: xilinx: Remove eemi ops for get_chipid Use direct function call instead of eemi ops for get_chipid. Signed-off-by: Rajan Vaja Signed-off-by: Jolly Shah Link: https://lore.kernel.org/r/1587761887-4279-3-git-send-email-jolly.shah@xilinx.com Signed-off-by: Greg Kroah-Hartman --- drivers/firmware/xilinx/zynqmp.c | 4 ++-- drivers/nvmem/zynqmp_nvmem.c | 11 +---------- include/linux/firmware/xlnx-zynqmp.h | 6 +++++- 3 files changed, 8 insertions(+), 13 deletions(-) (limited to 'include/linux') diff --git a/drivers/firmware/xilinx/zynqmp.c b/drivers/firmware/xilinx/zynqmp.c index 36dab68344df..36ab9ac1729a 100644 --- a/drivers/firmware/xilinx/zynqmp.c +++ b/drivers/firmware/xilinx/zynqmp.c @@ -247,7 +247,7 @@ EXPORT_SYMBOL_GPL(zynqmp_pm_get_api_version); * Return: Returns the status of the operation and the idcode and version * registers in @idcode and @version. */ -static int zynqmp_pm_get_chipid(u32 *idcode, u32 *version) +int zynqmp_pm_get_chipid(u32 *idcode, u32 *version) { u32 ret_payload[PAYLOAD_ARG_CNT]; int ret; @@ -261,6 +261,7 @@ static int zynqmp_pm_get_chipid(u32 *idcode, u32 *version) return ret; } +EXPORT_SYMBOL_GPL(zynqmp_pm_get_chipid); /** * zynqmp_pm_get_trustzone_version() - Get secure trustzone firmware version @@ -735,7 +736,6 @@ static int zynqmp_pm_aes_engine(const u64 address, u32 *out) } static const struct zynqmp_eemi_ops eemi_ops = { - .get_chipid = zynqmp_pm_get_chipid, .query_data = zynqmp_pm_query_data, .clock_enable = zynqmp_pm_clock_enable, .clock_disable = zynqmp_pm_clock_disable, diff --git a/drivers/nvmem/zynqmp_nvmem.c b/drivers/nvmem/zynqmp_nvmem.c index 5893543918c8..e28d7b133e11 100644 --- a/drivers/nvmem/zynqmp_nvmem.c +++ b/drivers/nvmem/zynqmp_nvmem.c @@ -16,8 +16,6 @@ struct zynqmp_nvmem_data { struct nvmem_device *nvmem; }; -static const struct zynqmp_eemi_ops *eemi_ops; - static int zynqmp_nvmem_read(void *context, unsigned int offset, void *val, size_t bytes) { @@ -25,10 +23,7 @@ static int zynqmp_nvmem_read(void *context, unsigned int offset, int idcode, version; struct zynqmp_nvmem_data *priv = context; - if (!eemi_ops->get_chipid) - return -ENXIO; - - ret = eemi_ops->get_chipid(&idcode, &version); + ret = zynqmp_pm_get_chipid(&idcode, &version); if (ret < 0) return ret; @@ -61,10 +56,6 @@ static int zynqmp_nvmem_probe(struct platform_device *pdev) if (!priv) return -ENOMEM; - eemi_ops = zynqmp_pm_get_eemi_ops(); - if (IS_ERR(eemi_ops)) - return PTR_ERR(eemi_ops); - priv->dev = dev; econfig.dev = dev; econfig.reg_read = zynqmp_nvmem_read; diff --git a/include/linux/firmware/xlnx-zynqmp.h b/include/linux/firmware/xlnx-zynqmp.h index a21abcdb1020..89f6a5394a65 100644 --- a/include/linux/firmware/xlnx-zynqmp.h +++ b/include/linux/firmware/xlnx-zynqmp.h @@ -294,7 +294,6 @@ struct zynqmp_pm_query_data { }; struct zynqmp_eemi_ops { - int (*get_chipid)(u32 *idcode, u32 *version); int (*fpga_load)(const u64 address, const u32 size, const u32 flags); int (*fpga_get_status)(u32 *value); int (*query_data)(struct zynqmp_pm_query_data qdata, u32 *out); @@ -331,6 +330,7 @@ int zynqmp_pm_invoke_fn(u32 pm_api_id, u32 arg0, u32 arg1, #if IS_REACHABLE(CONFIG_ZYNQMP_FIRMWARE) const struct zynqmp_eemi_ops *zynqmp_pm_get_eemi_ops(void); int zynqmp_pm_get_api_version(u32 *version); +int zynqmp_pm_get_chipid(u32 *idcode, u32 *version); #else static inline struct zynqmp_eemi_ops *zynqmp_pm_get_eemi_ops(void) { @@ -340,6 +340,10 @@ 
static inline int zynqmp_pm_get_api_version(u32 *version) { return -ENODEV; } +static inline int zynqmp_pm_get_chipid(u32 *idcode, u32 *version) +{ + return -ENODEV; +} #endif #endif /* __FIRMWARE_ZYNQMP_H__ */ -- cgit v1.2.3 From 6366c1bac3149c63752c03ea1c731d461d5349a7 Mon Sep 17 00:00:00 2001 From: Rajan Vaja Date: Fri, 24 Apr 2020 13:57:45 -0700 Subject: firmware: xilinx: Remove eemi ops for query_data Use direct function call for query_data instead of using eemi ops. Signed-off-by: Rajan Vaja Signed-off-by: Jolly Shah Link: https://lore.kernel.org/r/1587761887-4279-4-git-send-email-jolly.shah@xilinx.com Signed-off-by: Greg Kroah-Hartman --- drivers/clk/zynqmp/clkc.c | 17 ++++++----------- drivers/clk/zynqmp/divider.c | 3 +-- drivers/firmware/xilinx/zynqmp-debug.c | 3 +-- drivers/firmware/xilinx/zynqmp.c | 4 ++-- include/linux/firmware/xlnx-zynqmp.h | 7 ++++++- 5 files changed, 16 insertions(+), 18 deletions(-) (limited to 'include/linux') diff --git a/drivers/clk/zynqmp/clkc.c b/drivers/clk/zynqmp/clkc.c index 10e89f23880b..5eed5ce10179 100644 --- a/drivers/clk/zynqmp/clkc.c +++ b/drivers/clk/zynqmp/clkc.c @@ -134,7 +134,6 @@ static struct clk_hw *(* const clk_topology[]) (const char *name, u32 clk_id, static struct zynqmp_clock *clock; static struct clk_hw_onecell_data *zynqmp_data; static unsigned int clock_max_idx; -static const struct zynqmp_eemi_ops *eemi_ops; /** * zynqmp_is_valid_clock() - Check whether clock is valid or not @@ -206,7 +205,7 @@ static int zynqmp_pm_clock_get_num_clocks(u32 *nclocks) qdata.qid = PM_QID_CLOCK_GET_NUM_CLOCKS; - ret = eemi_ops->query_data(qdata, ret_payload); + ret = zynqmp_pm_query_data(qdata, ret_payload); *nclocks = ret_payload[1]; return ret; @@ -231,7 +230,7 @@ static int zynqmp_pm_clock_get_name(u32 clock_id, qdata.qid = PM_QID_CLOCK_GET_NAME; qdata.arg1 = clock_id; - eemi_ops->query_data(qdata, ret_payload); + zynqmp_pm_query_data(qdata, ret_payload); memcpy(response, ret_payload, sizeof(*response)); return 0; @@ -265,7 +264,7 @@ static int zynqmp_pm_clock_get_topology(u32 clock_id, u32 index, qdata.arg1 = clock_id; qdata.arg2 = index; - ret = eemi_ops->query_data(qdata, ret_payload); + ret = zynqmp_pm_query_data(qdata, ret_payload); memcpy(response, &ret_payload[1], sizeof(*response)); return ret; @@ -296,7 +295,7 @@ struct clk_hw *zynqmp_clk_register_fixed_factor(const char *name, u32 clk_id, qdata.qid = PM_QID_CLOCK_GET_FIXEDFACTOR_PARAMS; qdata.arg1 = clk_id; - ret = eemi_ops->query_data(qdata, ret_payload); + ret = zynqmp_pm_query_data(qdata, ret_payload); if (ret) return ERR_PTR(ret); @@ -339,7 +338,7 @@ static int zynqmp_pm_clock_get_parents(u32 clock_id, u32 index, qdata.arg1 = clock_id; qdata.arg2 = index; - ret = eemi_ops->query_data(qdata, ret_payload); + ret = zynqmp_pm_query_data(qdata, ret_payload); memcpy(response, &ret_payload[1], sizeof(*response)); return ret; @@ -364,7 +363,7 @@ static int zynqmp_pm_clock_get_attributes(u32 clock_id, qdata.qid = PM_QID_CLOCK_GET_ATTRIBUTES; qdata.arg1 = clock_id; - ret = eemi_ops->query_data(qdata, ret_payload); + ret = zynqmp_pm_query_data(qdata, ret_payload); memcpy(response, &ret_payload[1], sizeof(*response)); return ret; @@ -738,10 +737,6 @@ static int zynqmp_clock_probe(struct platform_device *pdev) int ret; struct device *dev = &pdev->dev; - eemi_ops = zynqmp_pm_get_eemi_ops(); - if (IS_ERR(eemi_ops)) - return PTR_ERR(eemi_ops); - ret = zynqmp_clk_setup(dev->of_node); return ret; diff --git a/drivers/clk/zynqmp/divider.c b/drivers/clk/zynqmp/divider.c index 
4be2cc76aa2e..e21f4ea421f5 100644 --- a/drivers/clk/zynqmp/divider.c +++ b/drivers/clk/zynqmp/divider.c @@ -258,7 +258,6 @@ static const struct clk_ops zynqmp_clk_divider_ops = { */ u32 zynqmp_clk_get_max_divisor(u32 clk_id, u32 type) { - const struct zynqmp_eemi_ops *eemi_ops = zynqmp_pm_get_eemi_ops(); struct zynqmp_pm_query_data qdata = {0}; u32 ret_payload[PAYLOAD_ARG_CNT]; int ret; @@ -266,7 +265,7 @@ u32 zynqmp_clk_get_max_divisor(u32 clk_id, u32 type) qdata.qid = PM_QID_CLOCK_GET_MAX_DIVISOR; qdata.arg1 = clk_id; qdata.arg2 = type; - ret = eemi_ops->query_data(qdata, ret_payload); + ret = zynqmp_pm_query_data(qdata, ret_payload); /* * To maintain backward compatibility return maximum possible value * (0xFFFF) if query for max divisor is not successful. diff --git a/drivers/firmware/xilinx/zynqmp-debug.c b/drivers/firmware/xilinx/zynqmp-debug.c index 06a21efe6829..99606b34975e 100644 --- a/drivers/firmware/xilinx/zynqmp-debug.c +++ b/drivers/firmware/xilinx/zynqmp-debug.c @@ -85,7 +85,6 @@ static int get_pm_api_id(char *pm_api_req, u32 *pm_id) static int process_api_request(u32 pm_id, u64 *pm_api_arg, u32 *pm_api_ret) { - const struct zynqmp_eemi_ops *eemi_ops = zynqmp_pm_get_eemi_ops(); u32 pm_api_version; int ret; struct zynqmp_pm_query_data qdata = {0}; @@ -102,7 +101,7 @@ static int process_api_request(u32 pm_id, u64 *pm_api_arg, u32 *pm_api_ret) qdata.arg2 = pm_api_arg[2]; qdata.arg3 = pm_api_arg[3]; - ret = eemi_ops->query_data(qdata, pm_api_ret); + ret = zynqmp_pm_query_data(qdata, pm_api_ret); if (ret) break; diff --git a/drivers/firmware/xilinx/zynqmp.c b/drivers/firmware/xilinx/zynqmp.c index 36ab9ac1729a..3fbab2947c3b 100644 --- a/drivers/firmware/xilinx/zynqmp.c +++ b/drivers/firmware/xilinx/zynqmp.c @@ -326,7 +326,7 @@ static int get_set_conduit_method(struct device_node *np) * * Return: Returns status, either success or error+reason */ -static int zynqmp_pm_query_data(struct zynqmp_pm_query_data qdata, u32 *out) +int zynqmp_pm_query_data(struct zynqmp_pm_query_data qdata, u32 *out) { int ret; @@ -340,6 +340,7 @@ static int zynqmp_pm_query_data(struct zynqmp_pm_query_data qdata, u32 *out) */ return qdata.qid == PM_QID_CLOCK_GET_NAME ? 
0 : ret; } +EXPORT_SYMBOL_GPL(zynqmp_pm_query_data); /** * zynqmp_pm_clock_enable() - Enable the clock for given id @@ -736,7 +737,6 @@ static int zynqmp_pm_aes_engine(const u64 address, u32 *out) } static const struct zynqmp_eemi_ops eemi_ops = { - .query_data = zynqmp_pm_query_data, .clock_enable = zynqmp_pm_clock_enable, .clock_disable = zynqmp_pm_clock_disable, .clock_getstate = zynqmp_pm_clock_getstate, diff --git a/include/linux/firmware/xlnx-zynqmp.h b/include/linux/firmware/xlnx-zynqmp.h index 89f6a5394a65..fa1195c72976 100644 --- a/include/linux/firmware/xlnx-zynqmp.h +++ b/include/linux/firmware/xlnx-zynqmp.h @@ -296,7 +296,6 @@ struct zynqmp_pm_query_data { struct zynqmp_eemi_ops { int (*fpga_load)(const u64 address, const u32 size, const u32 flags); int (*fpga_get_status)(u32 *value); - int (*query_data)(struct zynqmp_pm_query_data qdata, u32 *out); int (*clock_enable)(u32 clock_id); int (*clock_disable)(u32 clock_id); int (*clock_getstate)(u32 clock_id, u32 *state); @@ -331,6 +330,7 @@ int zynqmp_pm_invoke_fn(u32 pm_api_id, u32 arg0, u32 arg1, const struct zynqmp_eemi_ops *zynqmp_pm_get_eemi_ops(void); int zynqmp_pm_get_api_version(u32 *version); int zynqmp_pm_get_chipid(u32 *idcode, u32 *version); +int zynqmp_pm_query_data(struct zynqmp_pm_query_data qdata, u32 *out); #else static inline struct zynqmp_eemi_ops *zynqmp_pm_get_eemi_ops(void) { @@ -344,6 +344,11 @@ static inline int zynqmp_pm_get_chipid(u32 *idcode, u32 *version) { return -ENODEV; } +static inline int zynqmp_pm_query_data(struct zynqmp_pm_query_data qdata, + u32 *out) +{ + return -ENODEV; +} #endif #endif /* __FIRMWARE_ZYNQMP_H__ */ -- cgit v1.2.3 From 3637e84cd2e910f84835fac9650316dce53218ef Mon Sep 17 00:00:00 2001 From: Rajan Vaja Date: Fri, 24 Apr 2020 13:57:46 -0700 Subject: firmware: xilinx: Remove eemi ops for clock_enable Use direct function call for clock_enable instead of eemi ops. 
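Sketch of a caller after this change (the foo_enable_clock() wrapper and the warning text are illustrative assumptions, not part of the patch):

#include <linux/firmware/xlnx-zynqmp.h>

static int foo_enable_clock(u32 clk_id)
{
        /* Direct call replaces eemi_ops->clock_enable(clk_id). */
        int ret = zynqmp_pm_clock_enable(clk_id);

        if (ret)
                pr_warn_once("clock enable failed for id %u, ret = %d\n",
                             clk_id, ret);
        return ret;
}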
Signed-off-by: Rajan Vaja Signed-off-by: Jolly Shah Link: https://lore.kernel.org/r/1587761887-4279-5-git-send-email-jolly.shah@xilinx.com Signed-off-by: Greg Kroah-Hartman --- drivers/clk/zynqmp/clk-gate-zynqmp.c | 2 +- drivers/clk/zynqmp/pll.c | 3 +-- drivers/firmware/xilinx/zynqmp.c | 4 ++-- include/linux/firmware/xlnx-zynqmp.h | 6 +++++- 4 files changed, 9 insertions(+), 6 deletions(-) (limited to 'include/linux') diff --git a/drivers/clk/zynqmp/clk-gate-zynqmp.c b/drivers/clk/zynqmp/clk-gate-zynqmp.c index 83b236f20fff..437b921f33b7 100644 --- a/drivers/clk/zynqmp/clk-gate-zynqmp.c +++ b/drivers/clk/zynqmp/clk-gate-zynqmp.c @@ -39,7 +39,7 @@ static int zynqmp_clk_gate_enable(struct clk_hw *hw) int ret; const struct zynqmp_eemi_ops *eemi_ops = zynqmp_pm_get_eemi_ops(); - ret = eemi_ops->clock_enable(clk_id); + ret = zynqmp_pm_clock_enable(clk_id); if (ret) pr_warn_once("%s() clock enabled failed for %s, ret = %d\n", diff --git a/drivers/clk/zynqmp/pll.c b/drivers/clk/zynqmp/pll.c index 89b599530105..153aa6720619 100644 --- a/drivers/clk/zynqmp/pll.c +++ b/drivers/clk/zynqmp/pll.c @@ -246,12 +246,11 @@ static int zynqmp_pll_enable(struct clk_hw *hw) const char *clk_name = clk_hw_get_name(hw); u32 clk_id = clk->clk_id; int ret; - const struct zynqmp_eemi_ops *eemi_ops = zynqmp_pm_get_eemi_ops(); if (zynqmp_pll_is_enabled(hw)) return 0; - ret = eemi_ops->clock_enable(clk_id); + ret = zynqmp_pm_clock_enable(clk_id); if (ret) pr_warn_once("%s() clock enable failed for %s, ret = %d\n", __func__, clk_name, ret); diff --git a/drivers/firmware/xilinx/zynqmp.c b/drivers/firmware/xilinx/zynqmp.c index 3fbab2947c3b..da13627c1ca1 100644 --- a/drivers/firmware/xilinx/zynqmp.c +++ b/drivers/firmware/xilinx/zynqmp.c @@ -351,10 +351,11 @@ EXPORT_SYMBOL_GPL(zynqmp_pm_query_data); * * Return: Returns status, either success or error+reason */ -static int zynqmp_pm_clock_enable(u32 clock_id) +int zynqmp_pm_clock_enable(u32 clock_id) { return zynqmp_pm_invoke_fn(PM_CLOCK_ENABLE, clock_id, 0, 0, 0, NULL); } +EXPORT_SYMBOL_GPL(zynqmp_pm_clock_enable); /** * zynqmp_pm_clock_disable() - Disable the clock for given id @@ -737,7 +738,6 @@ static int zynqmp_pm_aes_engine(const u64 address, u32 *out) } static const struct zynqmp_eemi_ops eemi_ops = { - .clock_enable = zynqmp_pm_clock_enable, .clock_disable = zynqmp_pm_clock_disable, .clock_getstate = zynqmp_pm_clock_getstate, .clock_setdivider = zynqmp_pm_clock_setdivider, diff --git a/include/linux/firmware/xlnx-zynqmp.h b/include/linux/firmware/xlnx-zynqmp.h index fa1195c72976..77365d1d246c 100644 --- a/include/linux/firmware/xlnx-zynqmp.h +++ b/include/linux/firmware/xlnx-zynqmp.h @@ -296,7 +296,6 @@ struct zynqmp_pm_query_data { struct zynqmp_eemi_ops { int (*fpga_load)(const u64 address, const u32 size, const u32 flags); int (*fpga_get_status)(u32 *value); - int (*clock_enable)(u32 clock_id); int (*clock_disable)(u32 clock_id); int (*clock_getstate)(u32 clock_id, u32 *state); int (*clock_setdivider)(u32 clock_id, u32 divider); @@ -331,6 +330,7 @@ const struct zynqmp_eemi_ops *zynqmp_pm_get_eemi_ops(void); int zynqmp_pm_get_api_version(u32 *version); int zynqmp_pm_get_chipid(u32 *idcode, u32 *version); int zynqmp_pm_query_data(struct zynqmp_pm_query_data qdata, u32 *out); +int zynqmp_pm_clock_enable(u32 clock_id); #else static inline struct zynqmp_eemi_ops *zynqmp_pm_get_eemi_ops(void) { @@ -349,6 +349,10 @@ static inline int zynqmp_pm_query_data(struct zynqmp_pm_query_data qdata, { return -ENODEV; } +static inline int zynqmp_pm_clock_enable(u32 clock_id) +{ + 
return -ENODEV; +} #endif #endif /* __FIRMWARE_ZYNQMP_H__ */ -- cgit v1.2.3 From f5ccd54b67b3f029de9d3818efa70f210d189019 Mon Sep 17 00:00:00 2001 From: Rajan Vaja Date: Fri, 24 Apr 2020 13:57:47 -0700 Subject: firmware: xilinx: Remove eemi ops for clock_disable Use direct function call for clock_disable instead of using eemi ops. Signed-off-by: Rajan Vaja Signed-off-by: Jolly Shah Link: https://lore.kernel.org/r/1587761887-4279-6-git-send-email-jolly.shah@xilinx.com Signed-off-by: Greg Kroah-Hartman --- drivers/clk/zynqmp/clk-gate-zynqmp.c | 4 +--- drivers/clk/zynqmp/pll.c | 3 +-- drivers/firmware/xilinx/zynqmp.c | 4 ++-- include/linux/firmware/xlnx-zynqmp.h | 6 +++++- 4 files changed, 9 insertions(+), 8 deletions(-) (limited to 'include/linux') diff --git a/drivers/clk/zynqmp/clk-gate-zynqmp.c b/drivers/clk/zynqmp/clk-gate-zynqmp.c index 437b921f33b7..11f1b9718c3d 100644 --- a/drivers/clk/zynqmp/clk-gate-zynqmp.c +++ b/drivers/clk/zynqmp/clk-gate-zynqmp.c @@ -37,7 +37,6 @@ static int zynqmp_clk_gate_enable(struct clk_hw *hw) const char *clk_name = clk_hw_get_name(hw); u32 clk_id = gate->clk_id; int ret; - const struct zynqmp_eemi_ops *eemi_ops = zynqmp_pm_get_eemi_ops(); ret = zynqmp_pm_clock_enable(clk_id); @@ -58,9 +57,8 @@ static void zynqmp_clk_gate_disable(struct clk_hw *hw) const char *clk_name = clk_hw_get_name(hw); u32 clk_id = gate->clk_id; int ret; - const struct zynqmp_eemi_ops *eemi_ops = zynqmp_pm_get_eemi_ops(); - ret = eemi_ops->clock_disable(clk_id); + ret = zynqmp_pm_clock_disable(clk_id); if (ret) pr_warn_once("%s() clock disable failed for %s, ret = %d\n", diff --git a/drivers/clk/zynqmp/pll.c b/drivers/clk/zynqmp/pll.c index 153aa6720619..38b8dbc693f4 100644 --- a/drivers/clk/zynqmp/pll.c +++ b/drivers/clk/zynqmp/pll.c @@ -268,12 +268,11 @@ static void zynqmp_pll_disable(struct clk_hw *hw) const char *clk_name = clk_hw_get_name(hw); u32 clk_id = clk->clk_id; int ret; - const struct zynqmp_eemi_ops *eemi_ops = zynqmp_pm_get_eemi_ops(); if (!zynqmp_pll_is_enabled(hw)) return; - ret = eemi_ops->clock_disable(clk_id); + ret = zynqmp_pm_clock_disable(clk_id); if (ret) pr_warn_once("%s() clock disable failed for %s, ret = %d\n", __func__, clk_name, ret); diff --git a/drivers/firmware/xilinx/zynqmp.c b/drivers/firmware/xilinx/zynqmp.c index da13627c1ca1..d884805b7d6b 100644 --- a/drivers/firmware/xilinx/zynqmp.c +++ b/drivers/firmware/xilinx/zynqmp.c @@ -366,10 +366,11 @@ EXPORT_SYMBOL_GPL(zynqmp_pm_clock_enable); * * Return: Returns status, either success or error+reason */ -static int zynqmp_pm_clock_disable(u32 clock_id) +int zynqmp_pm_clock_disable(u32 clock_id) { return zynqmp_pm_invoke_fn(PM_CLOCK_DISABLE, clock_id, 0, 0, 0, NULL); } +EXPORT_SYMBOL_GPL(zynqmp_pm_clock_disable); /** * zynqmp_pm_clock_getstate() - Get the clock state for given id @@ -738,7 +739,6 @@ static int zynqmp_pm_aes_engine(const u64 address, u32 *out) } static const struct zynqmp_eemi_ops eemi_ops = { - .clock_disable = zynqmp_pm_clock_disable, .clock_getstate = zynqmp_pm_clock_getstate, .clock_setdivider = zynqmp_pm_clock_setdivider, .clock_getdivider = zynqmp_pm_clock_getdivider, diff --git a/include/linux/firmware/xlnx-zynqmp.h b/include/linux/firmware/xlnx-zynqmp.h index 77365d1d246c..f9a84d974b5c 100644 --- a/include/linux/firmware/xlnx-zynqmp.h +++ b/include/linux/firmware/xlnx-zynqmp.h @@ -296,7 +296,6 @@ struct zynqmp_pm_query_data { struct zynqmp_eemi_ops { int (*fpga_load)(const u64 address, const u32 size, const u32 flags); int (*fpga_get_status)(u32 *value); - int (*clock_disable)(u32 
clock_id); int (*clock_getstate)(u32 clock_id, u32 *state); int (*clock_setdivider)(u32 clock_id, u32 divider); int (*clock_getdivider)(u32 clock_id, u32 *divider); @@ -331,6 +330,7 @@ int zynqmp_pm_get_api_version(u32 *version); int zynqmp_pm_get_chipid(u32 *idcode, u32 *version); int zynqmp_pm_query_data(struct zynqmp_pm_query_data qdata, u32 *out); int zynqmp_pm_clock_enable(u32 clock_id); +int zynqmp_pm_clock_disable(u32 clock_id); #else static inline struct zynqmp_eemi_ops *zynqmp_pm_get_eemi_ops(void) { @@ -353,6 +353,10 @@ static inline int zynqmp_pm_clock_enable(u32 clock_id) { return -ENODEV; } +static inline int zynqmp_pm_clock_disable(u32 clock_id) +{ + return -ENODEV; +} #endif #endif /* __FIRMWARE_ZYNQMP_H__ */ -- cgit v1.2.3 From 5e76731dd370ca3217fceaa3e2e84e992e6b7b7f Mon Sep 17 00:00:00 2001 From: Rajan Vaja Date: Fri, 24 Apr 2020 13:57:48 -0700 Subject: firmware: xilinx: Remove eemi ops for clock_getstate Use direct function call instead of eemi ops for clock_getstate. Signed-off-by: Rajan Vaja Signed-off-by: Jolly Shah Link: https://lore.kernel.org/r/1587761887-4279-7-git-send-email-jolly.shah@xilinx.com Signed-off-by: Greg Kroah-Hartman --- drivers/clk/zynqmp/clk-gate-zynqmp.c | 3 +-- drivers/clk/zynqmp/pll.c | 3 +-- drivers/firmware/xilinx/zynqmp.c | 4 ++-- include/linux/firmware/xlnx-zynqmp.h | 6 +++++- 4 files changed, 9 insertions(+), 7 deletions(-) (limited to 'include/linux') diff --git a/drivers/clk/zynqmp/clk-gate-zynqmp.c b/drivers/clk/zynqmp/clk-gate-zynqmp.c index 11f1b9718c3d..10c9b889324f 100644 --- a/drivers/clk/zynqmp/clk-gate-zynqmp.c +++ b/drivers/clk/zynqmp/clk-gate-zynqmp.c @@ -77,9 +77,8 @@ static int zynqmp_clk_gate_is_enabled(struct clk_hw *hw) const char *clk_name = clk_hw_get_name(hw); u32 clk_id = gate->clk_id; int state, ret; - const struct zynqmp_eemi_ops *eemi_ops = zynqmp_pm_get_eemi_ops(); - ret = eemi_ops->clock_getstate(clk_id, &state); + ret = zynqmp_pm_clock_getstate(clk_id, &state); if (ret) { pr_warn_once("%s() clock get state failed for %s, ret = %d\n", __func__, clk_name, ret); diff --git a/drivers/clk/zynqmp/pll.c b/drivers/clk/zynqmp/pll.c index 38b8dbc693f4..41f376aad801 100644 --- a/drivers/clk/zynqmp/pll.c +++ b/drivers/clk/zynqmp/pll.c @@ -222,9 +222,8 @@ static int zynqmp_pll_is_enabled(struct clk_hw *hw) u32 clk_id = clk->clk_id; unsigned int state; int ret; - const struct zynqmp_eemi_ops *eemi_ops = zynqmp_pm_get_eemi_ops(); - ret = eemi_ops->clock_getstate(clk_id, &state); + ret = zynqmp_pm_clock_getstate(clk_id, &state); if (ret) { pr_warn_once("%s() clock get state failed for %s, ret = %d\n", __func__, clk_name, ret); diff --git a/drivers/firmware/xilinx/zynqmp.c b/drivers/firmware/xilinx/zynqmp.c index d884805b7d6b..c11b528f4a37 100644 --- a/drivers/firmware/xilinx/zynqmp.c +++ b/drivers/firmware/xilinx/zynqmp.c @@ -382,7 +382,7 @@ EXPORT_SYMBOL_GPL(zynqmp_pm_clock_disable); * * Return: Returns status, either success or error+reason */ -static int zynqmp_pm_clock_getstate(u32 clock_id, u32 *state) +int zynqmp_pm_clock_getstate(u32 clock_id, u32 *state) { u32 ret_payload[PAYLOAD_ARG_CNT]; int ret; @@ -393,6 +393,7 @@ static int zynqmp_pm_clock_getstate(u32 clock_id, u32 *state) return ret; } +EXPORT_SYMBOL_GPL(zynqmp_pm_clock_getstate); /** * zynqmp_pm_clock_setdivider() - Set the clock divider for given id @@ -739,7 +740,6 @@ static int zynqmp_pm_aes_engine(const u64 address, u32 *out) } static const struct zynqmp_eemi_ops eemi_ops = { - .clock_getstate = zynqmp_pm_clock_getstate, .clock_setdivider = 
zynqmp_pm_clock_setdivider, .clock_getdivider = zynqmp_pm_clock_getdivider, .clock_setrate = zynqmp_pm_clock_setrate, diff --git a/include/linux/firmware/xlnx-zynqmp.h b/include/linux/firmware/xlnx-zynqmp.h index f9a84d974b5c..e874f0c5e7ab 100644 --- a/include/linux/firmware/xlnx-zynqmp.h +++ b/include/linux/firmware/xlnx-zynqmp.h @@ -296,7 +296,6 @@ struct zynqmp_pm_query_data { struct zynqmp_eemi_ops { int (*fpga_load)(const u64 address, const u32 size, const u32 flags); int (*fpga_get_status)(u32 *value); - int (*clock_getstate)(u32 clock_id, u32 *state); int (*clock_setdivider)(u32 clock_id, u32 divider); int (*clock_getdivider)(u32 clock_id, u32 *divider); int (*clock_setrate)(u32 clock_id, u64 rate); @@ -331,6 +330,7 @@ int zynqmp_pm_get_chipid(u32 *idcode, u32 *version); int zynqmp_pm_query_data(struct zynqmp_pm_query_data qdata, u32 *out); int zynqmp_pm_clock_enable(u32 clock_id); int zynqmp_pm_clock_disable(u32 clock_id); +int zynqmp_pm_clock_getstate(u32 clock_id, u32 *state); #else static inline struct zynqmp_eemi_ops *zynqmp_pm_get_eemi_ops(void) { @@ -357,6 +357,10 @@ static inline int zynqmp_pm_clock_disable(u32 clock_id) { return -ENODEV; } +static inline int zynqmp_pm_clock_getstate(u32 clock_id, u32 *state) +{ + return -ENODEV; +} #endif #endif /* __FIRMWARE_ZYNQMP_H__ */ -- cgit v1.2.3 From fc9fb8fb985c092f9cf01c7c50269c132efc4d58 Mon Sep 17 00:00:00 2001 From: Rajan Vaja Date: Fri, 24 Apr 2020 13:57:49 -0700 Subject: firmware: xilinx: Remove eemi ops for clock_setdivider Use direct function call instead of using eemi ops for clock_setdivider. Signed-off-by: Rajan Vaja Signed-off-by: Jolly Shah Link: https://lore.kernel.org/r/1587761887-4279-8-git-send-email-jolly.shah@xilinx.com Signed-off-by: Greg Kroah-Hartman --- drivers/clk/zynqmp/divider.c | 3 +-- drivers/clk/zynqmp/pll.c | 4 ++-- drivers/firmware/xilinx/zynqmp.c | 4 ++-- include/linux/firmware/xlnx-zynqmp.h | 6 +++++- 4 files changed, 10 insertions(+), 7 deletions(-) (limited to 'include/linux') diff --git a/drivers/clk/zynqmp/divider.c b/drivers/clk/zynqmp/divider.c index e21f4ea421f5..13041cd93968 100644 --- a/drivers/clk/zynqmp/divider.c +++ b/drivers/clk/zynqmp/divider.c @@ -219,7 +219,6 @@ static int zynqmp_clk_divider_set_rate(struct clk_hw *hw, unsigned long rate, u32 div_type = divider->div_type; u32 value, div; int ret; - const struct zynqmp_eemi_ops *eemi_ops = zynqmp_pm_get_eemi_ops(); value = zynqmp_divider_get_val(parent_rate, rate, divider->flags); if (div_type == TYPE_DIV1) { @@ -233,7 +232,7 @@ static int zynqmp_clk_divider_set_rate(struct clk_hw *hw, unsigned long rate, if (divider->flags & CLK_DIVIDER_POWER_OF_TWO) div = __ffs(div); - ret = eemi_ops->clock_setdivider(clk_id, div); + ret = zynqmp_pm_clock_setdivider(clk_id, div); if (ret) pr_warn_once("%s() set divider failed for %s, ret = %d\n", diff --git a/drivers/clk/zynqmp/pll.c b/drivers/clk/zynqmp/pll.c index 41f376aad801..95fad06eb542 100644 --- a/drivers/clk/zynqmp/pll.c +++ b/drivers/clk/zynqmp/pll.c @@ -187,7 +187,7 @@ static int zynqmp_pll_set_rate(struct clk_hw *hw, unsigned long rate, rate = parent_rate * m; frac = (parent_rate * f) / FRAC_DIV; - ret = eemi_ops->clock_setdivider(clk_id, m); + ret = zynqmp_pm_clock_setdivider(clk_id, m); if (ret == -EUSERS) WARN(1, "More than allowed devices are using the %s, which is forbidden\n", clk_name); @@ -201,7 +201,7 @@ static int zynqmp_pll_set_rate(struct clk_hw *hw, unsigned long rate, fbdiv = DIV_ROUND_CLOSEST(rate, parent_rate); fbdiv = clamp_t(u32, fbdiv, PLL_FBDIV_MIN, PLL_FBDIV_MAX); - 
ret = eemi_ops->clock_setdivider(clk_id, fbdiv); + ret = zynqmp_pm_clock_setdivider(clk_id, fbdiv); if (ret) pr_warn_once("%s() set divider failed for %s, ret = %d\n", __func__, clk_name, ret); diff --git a/drivers/firmware/xilinx/zynqmp.c b/drivers/firmware/xilinx/zynqmp.c index c11b528f4a37..ef2412dc7d9f 100644 --- a/drivers/firmware/xilinx/zynqmp.c +++ b/drivers/firmware/xilinx/zynqmp.c @@ -405,11 +405,12 @@ EXPORT_SYMBOL_GPL(zynqmp_pm_clock_getstate); * * Return: Returns status, either success or error+reason */ -static int zynqmp_pm_clock_setdivider(u32 clock_id, u32 divider) +int zynqmp_pm_clock_setdivider(u32 clock_id, u32 divider) { return zynqmp_pm_invoke_fn(PM_CLOCK_SETDIVIDER, clock_id, divider, 0, 0, NULL); } +EXPORT_SYMBOL_GPL(zynqmp_pm_clock_setdivider); /** * zynqmp_pm_clock_getdivider() - Get the clock divider for given id @@ -740,7 +741,6 @@ static int zynqmp_pm_aes_engine(const u64 address, u32 *out) } static const struct zynqmp_eemi_ops eemi_ops = { - .clock_setdivider = zynqmp_pm_clock_setdivider, .clock_getdivider = zynqmp_pm_clock_getdivider, .clock_setrate = zynqmp_pm_clock_setrate, .clock_getrate = zynqmp_pm_clock_getrate, diff --git a/include/linux/firmware/xlnx-zynqmp.h b/include/linux/firmware/xlnx-zynqmp.h index e874f0c5e7ab..023f1f9807ed 100644 --- a/include/linux/firmware/xlnx-zynqmp.h +++ b/include/linux/firmware/xlnx-zynqmp.h @@ -296,7 +296,6 @@ struct zynqmp_pm_query_data { struct zynqmp_eemi_ops { int (*fpga_load)(const u64 address, const u32 size, const u32 flags); int (*fpga_get_status)(u32 *value); - int (*clock_setdivider)(u32 clock_id, u32 divider); int (*clock_getdivider)(u32 clock_id, u32 *divider); int (*clock_setrate)(u32 clock_id, u64 rate); int (*clock_getrate)(u32 clock_id, u64 *rate); @@ -331,6 +330,7 @@ int zynqmp_pm_query_data(struct zynqmp_pm_query_data qdata, u32 *out); int zynqmp_pm_clock_enable(u32 clock_id); int zynqmp_pm_clock_disable(u32 clock_id); int zynqmp_pm_clock_getstate(u32 clock_id, u32 *state); +int zynqmp_pm_clock_setdivider(u32 clock_id, u32 divider); #else static inline struct zynqmp_eemi_ops *zynqmp_pm_get_eemi_ops(void) { @@ -361,6 +361,10 @@ static inline int zynqmp_pm_clock_getstate(u32 clock_id, u32 *state) { return -ENODEV; } +static inline int zynqmp_pm_clock_setdivider(u32 clock_id, u32 divider) +{ + return -ENODEV; +} #endif #endif /* __FIRMWARE_ZYNQMP_H__ */ -- cgit v1.2.3 From 0667a8d144bc830d0a752f079c9789735cd4f1f8 Mon Sep 17 00:00:00 2001 From: Rajan Vaja Date: Fri, 24 Apr 2020 13:57:50 -0700 Subject: firmware: xilinx: Remove eemi ops for clock_getdivider Use direct function call instead of using eemi ops for clock_getdivider. 
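As a usage sketch (not part of the patch): a caller that reads the hardware divider through the now-exported API, mirroring the read-only path in zynqmp_clk_divider_round_rate() in the diff below; the fallback value here is hypothetical.

	#include <linux/firmware/xlnx-zynqmp.h>

	static u32 example_read_divider(u32 clk_id)
	{
		u32 div;

		/* Direct call replaces eemi_ops->clock_getdivider(clk_id, &div). */
		if (zynqmp_pm_clock_getdivider(clk_id, &div))
			return 1;	/* hypothetical fallback on error */

		return div;
	}
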
Signed-off-by: Rajan Vaja Signed-off-by: Jolly Shah Link: https://lore.kernel.org/r/1587761887-4279-9-git-send-email-jolly.shah@xilinx.com Signed-off-by: Greg Kroah-Hartman --- drivers/clk/zynqmp/divider.c | 6 ++---- drivers/clk/zynqmp/pll.c | 2 +- drivers/firmware/xilinx/zynqmp.c | 4 ++-- include/linux/firmware/xlnx-zynqmp.h | 6 +++++- 4 files changed, 10 insertions(+), 8 deletions(-) (limited to 'include/linux') diff --git a/drivers/clk/zynqmp/divider.c b/drivers/clk/zynqmp/divider.c index 13041cd93968..8eed715707e3 100644 --- a/drivers/clk/zynqmp/divider.c +++ b/drivers/clk/zynqmp/divider.c @@ -83,9 +83,8 @@ static unsigned long zynqmp_clk_divider_recalc_rate(struct clk_hw *hw, u32 div_type = divider->div_type; u32 div, value; int ret; - const struct zynqmp_eemi_ops *eemi_ops = zynqmp_pm_get_eemi_ops(); - ret = eemi_ops->clock_getdivider(clk_id, &div); + ret = zynqmp_pm_clock_getdivider(clk_id, &div); if (ret) pr_warn_once("%s() get divider failed for %s, ret = %d\n", @@ -163,11 +162,10 @@ static long zynqmp_clk_divider_round_rate(struct clk_hw *hw, u32 div_type = divider->div_type; u32 bestdiv; int ret; - const struct zynqmp_eemi_ops *eemi_ops = zynqmp_pm_get_eemi_ops(); /* if read only, just return current value */ if (divider->flags & CLK_DIVIDER_READ_ONLY) { - ret = eemi_ops->clock_getdivider(clk_id, &bestdiv); + ret = zynqmp_pm_clock_getdivider(clk_id, &bestdiv); if (ret) pr_warn_once("%s() get divider failed for %s, ret = %d\n", diff --git a/drivers/clk/zynqmp/pll.c b/drivers/clk/zynqmp/pll.c index 95fad06eb542..73fb5bb8eb55 100644 --- a/drivers/clk/zynqmp/pll.c +++ b/drivers/clk/zynqmp/pll.c @@ -141,7 +141,7 @@ static unsigned long zynqmp_pll_recalc_rate(struct clk_hw *hw, int ret; const struct zynqmp_eemi_ops *eemi_ops = zynqmp_pm_get_eemi_ops(); - ret = eemi_ops->clock_getdivider(clk_id, &fbdiv); + ret = zynqmp_pm_clock_getdivider(clk_id, &fbdiv); if (ret) pr_warn_once("%s() get divider failed for %s, ret = %d\n", __func__, clk_name, ret); diff --git a/drivers/firmware/xilinx/zynqmp.c b/drivers/firmware/xilinx/zynqmp.c index ef2412dc7d9f..5782b5c3bf9e 100644 --- a/drivers/firmware/xilinx/zynqmp.c +++ b/drivers/firmware/xilinx/zynqmp.c @@ -422,7 +422,7 @@ EXPORT_SYMBOL_GPL(zynqmp_pm_clock_setdivider); * * Return: Returns status, either success or error+reason */ -static int zynqmp_pm_clock_getdivider(u32 clock_id, u32 *divider) +int zynqmp_pm_clock_getdivider(u32 clock_id, u32 *divider) { u32 ret_payload[PAYLOAD_ARG_CNT]; int ret; @@ -433,6 +433,7 @@ static int zynqmp_pm_clock_getdivider(u32 clock_id, u32 *divider) return ret; } +EXPORT_SYMBOL_GPL(zynqmp_pm_clock_getdivider); /** * zynqmp_pm_clock_setrate() - Set the clock rate for given id @@ -741,7 +742,6 @@ static int zynqmp_pm_aes_engine(const u64 address, u32 *out) } static const struct zynqmp_eemi_ops eemi_ops = { - .clock_getdivider = zynqmp_pm_clock_getdivider, .clock_setrate = zynqmp_pm_clock_setrate, .clock_getrate = zynqmp_pm_clock_getrate, .clock_setparent = zynqmp_pm_clock_setparent, diff --git a/include/linux/firmware/xlnx-zynqmp.h b/include/linux/firmware/xlnx-zynqmp.h index 023f1f9807ed..3bda22f5262b 100644 --- a/include/linux/firmware/xlnx-zynqmp.h +++ b/include/linux/firmware/xlnx-zynqmp.h @@ -296,7 +296,6 @@ struct zynqmp_pm_query_data { struct zynqmp_eemi_ops { int (*fpga_load)(const u64 address, const u32 size, const u32 flags); int (*fpga_get_status)(u32 *value); - int (*clock_getdivider)(u32 clock_id, u32 *divider); int (*clock_setrate)(u32 clock_id, u64 rate); int (*clock_getrate)(u32 clock_id, u64 
*rate); int (*clock_setparent)(u32 clock_id, u32 parent_id); @@ -331,6 +330,7 @@ int zynqmp_pm_clock_enable(u32 clock_id); int zynqmp_pm_clock_disable(u32 clock_id); int zynqmp_pm_clock_getstate(u32 clock_id, u32 *state); int zynqmp_pm_clock_setdivider(u32 clock_id, u32 divider); +int zynqmp_pm_clock_getdivider(u32 clock_id, u32 *divider); #else static inline struct zynqmp_eemi_ops *zynqmp_pm_get_eemi_ops(void) { @@ -365,6 +365,10 @@ static inline int zynqmp_pm_clock_setdivider(u32 clock_id, u32 divider) { return -ENODEV; } +static inline int zynqmp_pm_clock_getdivider(u32 clock_id, u32 *divider) +{ + return -ENODEV; +} #endif #endif /* __FIRMWARE_ZYNQMP_H__ */ -- cgit v1.2.3 From 7a1e10621a215dad4727aaa82396c4c19ce6593f Mon Sep 17 00:00:00 2001 From: Rajan Vaja Date: Fri, 24 Apr 2020 13:57:51 -0700 Subject: firmware: xilinx: Remove eemi ops for clock set/get rate Use direct function call instead of eemi ops for clock set/get rate. Signed-off-by: Rajan Vaja Signed-off-by: Jolly Shah Link: https://lore.kernel.org/r/1587761887-4279-10-git-send-email-jolly.shah@xilinx.com Signed-off-by: Greg Kroah-Hartman --- drivers/firmware/xilinx/zynqmp.c | 8 ++++---- include/linux/firmware/xlnx-zynqmp.h | 12 ++++++++++-- 2 files changed, 14 insertions(+), 6 deletions(-) (limited to 'include/linux') diff --git a/drivers/firmware/xilinx/zynqmp.c b/drivers/firmware/xilinx/zynqmp.c index 5782b5c3bf9e..84622014601e 100644 --- a/drivers/firmware/xilinx/zynqmp.c +++ b/drivers/firmware/xilinx/zynqmp.c @@ -444,13 +444,14 @@ EXPORT_SYMBOL_GPL(zynqmp_pm_clock_getdivider); * * Return: Returns status, either success or error+reason */ -static int zynqmp_pm_clock_setrate(u32 clock_id, u64 rate) +int zynqmp_pm_clock_setrate(u32 clock_id, u64 rate) { return zynqmp_pm_invoke_fn(PM_CLOCK_SETRATE, clock_id, lower_32_bits(rate), upper_32_bits(rate), 0, NULL); } +EXPORT_SYMBOL_GPL(zynqmp_pm_clock_setrate); /** * zynqmp_pm_clock_getrate() - Get the clock rate for given id @@ -462,7 +463,7 @@ static int zynqmp_pm_clock_setrate(u32 clock_id, u64 rate) * * Return: Returns status, either success or error+reason */ -static int zynqmp_pm_clock_getrate(u32 clock_id, u64 *rate) +int zynqmp_pm_clock_getrate(u32 clock_id, u64 *rate) { u32 ret_payload[PAYLOAD_ARG_CNT]; int ret; @@ -473,6 +474,7 @@ static int zynqmp_pm_clock_getrate(u32 clock_id, u64 *rate) return ret; } +EXPORT_SYMBOL_GPL(zynqmp_pm_clock_getrate); /** * zynqmp_pm_clock_setparent() - Set the clock parent for given id @@ -742,8 +744,6 @@ static int zynqmp_pm_aes_engine(const u64 address, u32 *out) } static const struct zynqmp_eemi_ops eemi_ops = { - .clock_setrate = zynqmp_pm_clock_setrate, - .clock_getrate = zynqmp_pm_clock_getrate, .clock_setparent = zynqmp_pm_clock_setparent, .clock_getparent = zynqmp_pm_clock_getparent, .ioctl = zynqmp_pm_ioctl, diff --git a/include/linux/firmware/xlnx-zynqmp.h b/include/linux/firmware/xlnx-zynqmp.h index 3bda22f5262b..a71f52c6173e 100644 --- a/include/linux/firmware/xlnx-zynqmp.h +++ b/include/linux/firmware/xlnx-zynqmp.h @@ -296,8 +296,6 @@ struct zynqmp_pm_query_data { struct zynqmp_eemi_ops { int (*fpga_load)(const u64 address, const u32 size, const u32 flags); int (*fpga_get_status)(u32 *value); - int (*clock_setrate)(u32 clock_id, u64 rate); - int (*clock_getrate)(u32 clock_id, u64 *rate); int (*clock_setparent)(u32 clock_id, u32 parent_id); int (*clock_getparent)(u32 clock_id, u32 *parent_id); int (*ioctl)(u32 node_id, u32 ioctl_id, u32 arg1, u32 arg2, u32 *out); @@ -331,6 +329,8 @@ int zynqmp_pm_clock_disable(u32 clock_id); int 
zynqmp_pm_clock_getstate(u32 clock_id, u32 *state); int zynqmp_pm_clock_setdivider(u32 clock_id, u32 divider); int zynqmp_pm_clock_getdivider(u32 clock_id, u32 *divider); +int zynqmp_pm_clock_setrate(u32 clock_id, u64 rate); +int zynqmp_pm_clock_getrate(u32 clock_id, u64 *rate); #else static inline struct zynqmp_eemi_ops *zynqmp_pm_get_eemi_ops(void) { @@ -369,6 +369,14 @@ static inline int zynqmp_pm_clock_getdivider(u32 clock_id, u32 *divider) { return -ENODEV; } +static inline int zynqmp_pm_clock_setrate(u32 clock_id, u64 rate) +{ + return -ENODEV; +} +static inline int zynqmp_pm_clock_getrate(u32 clock_id, u64 *rate) +{ + return -ENODEV; +} #endif #endif /* __FIRMWARE_ZYNQMP_H__ */ -- cgit v1.2.3 From 70c0d36462ca5be8fc05176d11bec832dbb355b2 Mon Sep 17 00:00:00 2001 From: Rajan Vaja Date: Fri, 24 Apr 2020 13:57:52 -0700 Subject: firmware: xilinx: Remove eemi ops for clock set/get parent Use direct function call instead of eemi ops for clock set/get parent. Signed-off-by: Rajan Vaja Signed-off-by: Jolly Shah Link: https://lore.kernel.org/r/1587761887-4279-11-git-send-email-jolly.shah@xilinx.com Signed-off-by: Greg Kroah-Hartman --- drivers/clk/zynqmp/clk-mux-zynqmp.c | 6 ++---- drivers/firmware/xilinx/zynqmp.c | 8 ++++---- include/linux/firmware/xlnx-zynqmp.h | 12 ++++++++++-- 3 files changed, 16 insertions(+), 10 deletions(-) (limited to 'include/linux') diff --git a/drivers/clk/zynqmp/clk-mux-zynqmp.c b/drivers/clk/zynqmp/clk-mux-zynqmp.c index 0af8f74c5fa5..06194149be83 100644 --- a/drivers/clk/zynqmp/clk-mux-zynqmp.c +++ b/drivers/clk/zynqmp/clk-mux-zynqmp.c @@ -47,9 +47,8 @@ static u8 zynqmp_clk_mux_get_parent(struct clk_hw *hw) u32 clk_id = mux->clk_id; u32 val; int ret; - const struct zynqmp_eemi_ops *eemi_ops = zynqmp_pm_get_eemi_ops(); - ret = eemi_ops->clock_getparent(clk_id, &val); + ret = zynqmp_pm_clock_getparent(clk_id, &val); if (ret) pr_warn_once("%s() getparent failed for clock: %s, ret = %d\n", @@ -71,9 +70,8 @@ static int zynqmp_clk_mux_set_parent(struct clk_hw *hw, u8 index) const char *clk_name = clk_hw_get_name(hw); u32 clk_id = mux->clk_id; int ret; - const struct zynqmp_eemi_ops *eemi_ops = zynqmp_pm_get_eemi_ops(); - ret = eemi_ops->clock_setparent(clk_id, index); + ret = zynqmp_pm_clock_setparent(clk_id, index); if (ret) pr_warn_once("%s() set parent failed for clock: %s, ret = %d\n", diff --git a/drivers/firmware/xilinx/zynqmp.c b/drivers/firmware/xilinx/zynqmp.c index 84622014601e..b0aa967d9b7c 100644 --- a/drivers/firmware/xilinx/zynqmp.c +++ b/drivers/firmware/xilinx/zynqmp.c @@ -485,11 +485,12 @@ EXPORT_SYMBOL_GPL(zynqmp_pm_clock_getrate); * * Return: Returns status, either success or error+reason */ -static int zynqmp_pm_clock_setparent(u32 clock_id, u32 parent_id) +int zynqmp_pm_clock_setparent(u32 clock_id, u32 parent_id) { return zynqmp_pm_invoke_fn(PM_CLOCK_SETPARENT, clock_id, parent_id, 0, 0, NULL); } +EXPORT_SYMBOL_GPL(zynqmp_pm_clock_setparent); /** * zynqmp_pm_clock_getparent() - Get the clock parent for given id @@ -501,7 +502,7 @@ static int zynqmp_pm_clock_setparent(u32 clock_id, u32 parent_id) * * Return: Returns status, either success or error+reason */ -static int zynqmp_pm_clock_getparent(u32 clock_id, u32 *parent_id) +int zynqmp_pm_clock_getparent(u32 clock_id, u32 *parent_id) { u32 ret_payload[PAYLOAD_ARG_CNT]; int ret; @@ -512,6 +513,7 @@ static int zynqmp_pm_clock_getparent(u32 clock_id, u32 *parent_id) return ret; } +EXPORT_SYMBOL_GPL(zynqmp_pm_clock_getparent); /** * zynqmp_is_valid_ioctl() - Check whether IOCTL ID is valid or not @@ 
-744,8 +746,6 @@ static int zynqmp_pm_aes_engine(const u64 address, u32 *out) } static const struct zynqmp_eemi_ops eemi_ops = { - .clock_setparent = zynqmp_pm_clock_setparent, - .clock_getparent = zynqmp_pm_clock_getparent, .ioctl = zynqmp_pm_ioctl, .reset_assert = zynqmp_pm_reset_assert, .reset_get_status = zynqmp_pm_reset_get_status, diff --git a/include/linux/firmware/xlnx-zynqmp.h b/include/linux/firmware/xlnx-zynqmp.h index a71f52c6173e..7abb68311f17 100644 --- a/include/linux/firmware/xlnx-zynqmp.h +++ b/include/linux/firmware/xlnx-zynqmp.h @@ -296,8 +296,6 @@ struct zynqmp_pm_query_data { struct zynqmp_eemi_ops { int (*fpga_load)(const u64 address, const u32 size, const u32 flags); int (*fpga_get_status)(u32 *value); - int (*clock_setparent)(u32 clock_id, u32 parent_id); - int (*clock_getparent)(u32 clock_id, u32 *parent_id); int (*ioctl)(u32 node_id, u32 ioctl_id, u32 arg1, u32 arg2, u32 *out); int (*reset_assert)(const enum zynqmp_pm_reset reset, const enum zynqmp_pm_reset_action assert_flag); @@ -331,6 +329,8 @@ int zynqmp_pm_clock_setdivider(u32 clock_id, u32 divider); int zynqmp_pm_clock_getdivider(u32 clock_id, u32 *divider); int zynqmp_pm_clock_setrate(u32 clock_id, u64 rate); int zynqmp_pm_clock_getrate(u32 clock_id, u64 *rate); +int zynqmp_pm_clock_setparent(u32 clock_id, u32 parent_id); +int zynqmp_pm_clock_getparent(u32 clock_id, u32 *parent_id); #else static inline struct zynqmp_eemi_ops *zynqmp_pm_get_eemi_ops(void) { @@ -377,6 +377,14 @@ static inline int zynqmp_pm_clock_getrate(u32 clock_id, u64 *rate) { return -ENODEV; } +static inline int zynqmp_pm_clock_setparent(u32 clock_id, u32 parent_id) +{ + return -ENODEV; +} +static inline int zynqmp_pm_clock_getparent(u32 clock_id, u32 *parent_id) +{ + return -ENODEV; +} #endif #endif /* __FIRMWARE_ZYNQMP_H__ */ -- cgit v1.2.3 From 426c8d85df7a7b8337e09eab2806e802311778fd Mon Sep 17 00:00:00 2001 From: Rajan Vaja Date: Fri, 24 Apr 2020 13:57:53 -0700 Subject: firmware: xilinx: Use APIs instead of IOCTLs Remove IOCTL API and use individual APIs for better readability. 
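A sketch of the readability gain (the call shapes are taken from the pll.c hunk below; example_set_frac_mode() itself is hypothetical):

	#include <linux/firmware/xlnx-zynqmp.h>

	static int example_set_frac_mode(u32 clk_id, u32 mode)
	{
		/*
		 * Before: a generic IOCTL where the caller must know the
		 * argument layout:
		 *   ret = eemi_ops->ioctl(0, IOCTL_SET_PLL_FRAC_MODE,
		 *                         clk_id, mode, NULL);
		 * After: a typed, self-documenting wrapper:
		 */
		return zynqmp_pm_set_pll_frac_mode(clk_id, mode);
	}
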
Signed-off-by: Rajan Vaja Signed-off-by: Jolly Shah Link: https://lore.kernel.org/r/1587761887-4279-12-git-send-email-jolly.shah@xilinx.com Signed-off-by: Greg Kroah-Hartman --- drivers/clk/zynqmp/pll.c | 14 ++--- drivers/firmware/xilinx/zynqmp.c | 118 ++++++++++++++++++++++++++--------- drivers/mmc/host/sdhci-of-arasan.c | 38 +---------- include/linux/firmware/xlnx-zynqmp.h | 31 ++++++++- 4 files changed, 126 insertions(+), 75 deletions(-) (limited to 'include/linux') diff --git a/drivers/clk/zynqmp/pll.c b/drivers/clk/zynqmp/pll.c index 73fb5bb8eb55..92f449ed38e5 100644 --- a/drivers/clk/zynqmp/pll.c +++ b/drivers/clk/zynqmp/pll.c @@ -50,10 +50,8 @@ static inline enum pll_mode zynqmp_pll_get_mode(struct clk_hw *hw) const char *clk_name = clk_hw_get_name(hw); u32 ret_payload[PAYLOAD_ARG_CNT]; int ret; - const struct zynqmp_eemi_ops *eemi_ops = zynqmp_pm_get_eemi_ops(); - ret = eemi_ops->ioctl(0, IOCTL_GET_PLL_FRAC_MODE, clk_id, 0, - ret_payload); + ret = zynqmp_pm_get_pll_frac_mode(clk_id, ret_payload); if (ret) pr_warn_once("%s() PLL get frac mode failed for %s, ret = %d\n", __func__, clk_name, ret); @@ -73,14 +71,13 @@ static inline void zynqmp_pll_set_mode(struct clk_hw *hw, bool on) const char *clk_name = clk_hw_get_name(hw); int ret; u32 mode; - const struct zynqmp_eemi_ops *eemi_ops = zynqmp_pm_get_eemi_ops(); if (on) mode = PLL_MODE_FRAC; else mode = PLL_MODE_INT; - ret = eemi_ops->ioctl(0, IOCTL_SET_PLL_FRAC_MODE, clk_id, mode, NULL); + ret = zynqmp_pm_set_pll_frac_mode(clk_id, mode); if (ret) pr_warn_once("%s() PLL set frac mode failed for %s, ret = %d\n", __func__, clk_name, ret); @@ -139,7 +136,6 @@ static unsigned long zynqmp_pll_recalc_rate(struct clk_hw *hw, unsigned long rate, frac; u32 ret_payload[PAYLOAD_ARG_CNT]; int ret; - const struct zynqmp_eemi_ops *eemi_ops = zynqmp_pm_get_eemi_ops(); ret = zynqmp_pm_clock_getdivider(clk_id, &fbdiv); if (ret) @@ -148,8 +144,7 @@ static unsigned long zynqmp_pll_recalc_rate(struct clk_hw *hw, rate = parent_rate * fbdiv; if (zynqmp_pll_get_mode(hw) == PLL_MODE_FRAC) { - eemi_ops->ioctl(0, IOCTL_GET_PLL_FRAC_DATA, clk_id, 0, - ret_payload); + zynqmp_pm_get_pll_frac_data(clk_id, ret_payload); data = ret_payload[1]; frac = (parent_rate * data) / FRAC_DIV; rate = rate + frac; @@ -177,7 +172,6 @@ static int zynqmp_pll_set_rate(struct clk_hw *hw, unsigned long rate, u32 fbdiv; long rate_div, frac, m, f; int ret; - const struct zynqmp_eemi_ops *eemi_ops = zynqmp_pm_get_eemi_ops(); if (zynqmp_pll_get_mode(hw) == PLL_MODE_FRAC) { rate_div = (rate * FRAC_DIV) / parent_rate; @@ -194,7 +188,7 @@ static int zynqmp_pll_set_rate(struct clk_hw *hw, unsigned long rate, else if (ret) pr_warn_once("%s() set divider failed for %s, ret = %d\n", __func__, clk_name, ret); - eemi_ops->ioctl(0, IOCTL_SET_PLL_FRAC_DATA, clk_id, f, NULL); + zynqmp_pm_set_pll_frac_data(clk_id, f); return rate + frac; } diff --git a/drivers/firmware/xilinx/zynqmp.c b/drivers/firmware/xilinx/zynqmp.c index b0aa967d9b7c..94fd755b19c6 100644 --- a/drivers/firmware/xilinx/zynqmp.c +++ b/drivers/firmware/xilinx/zynqmp.c @@ -516,47 +516,108 @@ int zynqmp_pm_clock_getparent(u32 clock_id, u32 *parent_id) EXPORT_SYMBOL_GPL(zynqmp_pm_clock_getparent); /** - * zynqmp_is_valid_ioctl() - Check whether IOCTL ID is valid or not - * @ioctl_id: IOCTL ID + * zynqmp_pm_set_pll_frac_mode() - PM API for set PLL mode * - * Return: 1 if IOCTL is valid else 0 + * @clk_id: PLL clock ID + * @mode: PLL mode (PLL_MODE_FRAC/PLL_MODE_INT) + * + * This function sets PLL mode + * + * Return: Returns status, 
either success or error+reason */ -static inline int zynqmp_is_valid_ioctl(u32 ioctl_id) +int zynqmp_pm_set_pll_frac_mode(u32 clk_id, u32 mode) { - switch (ioctl_id) { - case IOCTL_SD_DLL_RESET: - case IOCTL_SET_SD_TAPDELAY: - case IOCTL_SET_PLL_FRAC_MODE: - case IOCTL_GET_PLL_FRAC_MODE: - case IOCTL_SET_PLL_FRAC_DATA: - case IOCTL_GET_PLL_FRAC_DATA: - return 1; - default: - return 0; - } + return zynqmp_pm_invoke_fn(PM_IOCTL, 0, IOCTL_SET_PLL_FRAC_MODE, + clk_id, mode, NULL); } +EXPORT_SYMBOL_GPL(zynqmp_pm_set_pll_frac_mode); /** - * zynqmp_pm_ioctl() - PM IOCTL API for device control and configs - * @node_id: Node ID of the device - * @ioctl_id: ID of the requested IOCTL - * @arg1: Argument 1 to requested IOCTL call - * @arg2: Argument 2 to requested IOCTL call - * @out: Returned output value + * zynqmp_pm_get_pll_frac_mode() - PM API for get PLL mode + * + * @clk_id: PLL clock ID + * @mode: PLL mode * - * This function calls IOCTL to firmware for device control and configuration. + * This function returns the current PLL mode * * Return: Returns status, either success or error+reason */ -static int zynqmp_pm_ioctl(u32 node_id, u32 ioctl_id, u32 arg1, u32 arg2, - u32 *out) +int zynqmp_pm_get_pll_frac_mode(u32 clk_id, u32 *mode) { - if (!zynqmp_is_valid_ioctl(ioctl_id)) - return -EINVAL; + return zynqmp_pm_invoke_fn(PM_IOCTL, 0, IOCTL_GET_PLL_FRAC_MODE, + clk_id, 0, mode); +} +EXPORT_SYMBOL_GPL(zynqmp_pm_get_pll_frac_mode); + +/** + * zynqmp_pm_set_pll_frac_data() - PM API for setting pll fraction data + * + * @clk_id: PLL clock ID + * @data: fraction data + * + * This function sets fraction data. + * It is valid for fraction mode only. + * + * Return: Returns status, either success or error+reason + */ +int zynqmp_pm_set_pll_frac_data(u32 clk_id, u32 data) +{ + return zynqmp_pm_invoke_fn(PM_IOCTL, 0, IOCTL_SET_PLL_FRAC_DATA, + clk_id, data, NULL); +} +EXPORT_SYMBOL_GPL(zynqmp_pm_set_pll_frac_data); + +/** + * zynqmp_pm_get_pll_frac_data() - PM API for getting pll fraction data + * + * @clk_id: PLL clock ID + * @data: fraction data + * + * This function returns fraction data value. + * + * Return: Returns status, either success or error+reason + */ +int zynqmp_pm_get_pll_frac_data(u32 clk_id, u32 *data) +{ + return zynqmp_pm_invoke_fn(PM_IOCTL, 0, IOCTL_GET_PLL_FRAC_DATA, + clk_id, 0, data); +} +EXPORT_SYMBOL_GPL(zynqmp_pm_get_pll_frac_data); - return zynqmp_pm_invoke_fn(PM_IOCTL, node_id, ioctl_id, - arg1, arg2, out); +/** + * zynqmp_pm_set_sd_tapdelay() - Set tap delay for the SD device + * + * @node_id Node ID of the device + * @type Type of tap delay to set (input/output) + * @value Value to set for the tap delay + * + * This function sets input/output tap delay for the SD device. + * + * @return Returns status, either success or error+reason + */ +int zynqmp_pm_set_sd_tapdelay(u32 node_id, u32 type, u32 value) +{ + return zynqmp_pm_invoke_fn(PM_IOCTL, node_id, IOCTL_SET_SD_TAPDELAY, + type, value, NULL); +} +EXPORT_SYMBOL_GPL(zynqmp_pm_set_sd_tapdelay); + +/** + * zynqmp_pm_sd_dll_reset() - Reset DLL logic + * + * @node_id Node ID of the device + * @type Reset type + * + * This function resets DLL logic for the SD device. 
+ * + * @return Returns status, either success or error+reason + */ +int zynqmp_pm_sd_dll_reset(u32 node_id, u32 type) +{ + return zynqmp_pm_invoke_fn(PM_IOCTL, node_id, IOCTL_SD_DLL_RESET, + type, 0, NULL); } +EXPORT_SYMBOL_GPL(zynqmp_pm_sd_dll_reset); /** * zynqmp_pm_reset_assert - Request setting of reset (1 - assert, 0 - release) @@ -746,7 +807,6 @@ static int zynqmp_pm_aes_engine(const u64 address, u32 *out) } static const struct zynqmp_eemi_ops eemi_ops = { - .ioctl = zynqmp_pm_ioctl, .reset_assert = zynqmp_pm_reset_assert, .reset_get_status = zynqmp_pm_reset_get_status, .init_finalize = zynqmp_pm_init_finalize, diff --git a/drivers/mmc/host/sdhci-of-arasan.c b/drivers/mmc/host/sdhci-of-arasan.c index d4905c106c06..d01f76223387 100644 --- a/drivers/mmc/host/sdhci-of-arasan.c +++ b/drivers/mmc/host/sdhci-of-arasan.c @@ -98,10 +98,6 @@ struct sdhci_arasan_clk_data { void *clk_of_data; }; -struct sdhci_arasan_zynqmp_clk_data { - const struct zynqmp_eemi_ops *eemi_ops; -}; - /** * struct sdhci_arasan_data * @host: Pointer to the main SDHCI host structure. @@ -630,9 +626,6 @@ static int sdhci_zynqmp_sdcardclk_set_phase(struct clk_hw *hw, int degrees) struct sdhci_arasan_data *sdhci_arasan = container_of(clk_data, struct sdhci_arasan_data, clk_data); struct sdhci_host *host = sdhci_arasan->host; - struct sdhci_arasan_zynqmp_clk_data *zynqmp_clk_data = - clk_data->clk_of_data; - const struct zynqmp_eemi_ops *eemi_ops = zynqmp_clk_data->eemi_ops; const char *clk_name = clk_hw_get_name(hw); u32 node_id = !strcmp(clk_name, "clk_out_sd0") ? NODE_SD_0 : NODE_SD_1; u8 tap_delay, tap_max = 0; @@ -672,8 +665,7 @@ static int sdhci_zynqmp_sdcardclk_set_phase(struct clk_hw *hw, int degrees) tap_delay = (degrees * tap_max) / 360; /* Set the Clock Phase */ - ret = eemi_ops->ioctl(node_id, IOCTL_SET_SD_TAPDELAY, - PM_TAPDELAY_OUTPUT, tap_delay, NULL); + ret = zynqmp_pm_set_sd_tapdelay(node_id, PM_TAPDELAY_OUTPUT, tap_delay); if (ret) pr_err("Error setting Output Tap Delay\n"); @@ -702,9 +694,6 @@ static int sdhci_zynqmp_sampleclk_set_phase(struct clk_hw *hw, int degrees) struct sdhci_arasan_data *sdhci_arasan = container_of(clk_data, struct sdhci_arasan_data, clk_data); struct sdhci_host *host = sdhci_arasan->host; - struct sdhci_arasan_zynqmp_clk_data *zynqmp_clk_data = - clk_data->clk_of_data; - const struct zynqmp_eemi_ops *eemi_ops = zynqmp_clk_data->eemi_ops; const char *clk_name = clk_hw_get_name(hw); u32 node_id = !strcmp(clk_name, "clk_in_sd0") ? 
NODE_SD_0 : NODE_SD_1; u8 tap_delay, tap_max = 0; @@ -744,8 +733,7 @@ static int sdhci_zynqmp_sampleclk_set_phase(struct clk_hw *hw, int degrees) tap_delay = (degrees * tap_max) / 360; /* Set the Clock Phase */ - ret = eemi_ops->ioctl(node_id, IOCTL_SET_SD_TAPDELAY, - PM_TAPDELAY_INPUT, tap_delay, NULL); + ret = zynqmp_pm_set_sd_tapdelay(node_id, PM_TAPDELAY_INPUT, tap_delay); if (ret) pr_err("Error setting Input Tap Delay\n"); @@ -759,11 +747,6 @@ static const struct clk_ops zynqmp_sampleclk_ops = { static void arasan_zynqmp_dll_reset(struct sdhci_host *host, u32 deviceid) { - struct sdhci_pltfm_host *pltfm_host = sdhci_priv(host); - struct sdhci_arasan_data *sdhci_arasan = sdhci_pltfm_priv(pltfm_host); - struct sdhci_arasan_zynqmp_clk_data *zynqmp_clk_data = - sdhci_arasan->clk_data.clk_of_data; - const struct zynqmp_eemi_ops *eemi_ops = zynqmp_clk_data->eemi_ops; u16 clk; clk = sdhci_readw(host, SDHCI_CLOCK_CONTROL); @@ -771,8 +754,7 @@ static void arasan_zynqmp_dll_reset(struct sdhci_host *host, u32 deviceid) sdhci_writew(host, clk, SDHCI_CLOCK_CONTROL); /* Issue DLL Reset */ - eemi_ops->ioctl(deviceid, IOCTL_SD_DLL_RESET, - PM_DLL_RESET_PULSE, 0, NULL); + zynqmp_pm_sd_dll_reset(deviceid, PM_DLL_RESET_PULSE); clk = sdhci_readw(host, SDHCI_CLOCK_CONTROL); @@ -1277,20 +1259,6 @@ static int sdhci_arasan_probe(struct platform_device *pdev) goto clk_disable_all; if (of_device_is_compatible(np, "xlnx,zynqmp-8.9a")) { - struct sdhci_arasan_zynqmp_clk_data *zynqmp_clk_data; - const struct zynqmp_eemi_ops *eemi_ops; - - zynqmp_clk_data = devm_kzalloc(&pdev->dev, - sizeof(*zynqmp_clk_data), - GFP_KERNEL); - eemi_ops = zynqmp_pm_get_eemi_ops(); - if (IS_ERR(eemi_ops)) { - ret = PTR_ERR(eemi_ops); - goto unreg_clk; - } - - zynqmp_clk_data->eemi_ops = eemi_ops; - sdhci_arasan->clk_data.clk_of_data = zynqmp_clk_data; host->mmc_host_ops.execute_tuning = arasan_zynqmp_execute_tuning; } diff --git a/include/linux/firmware/xlnx-zynqmp.h b/include/linux/firmware/xlnx-zynqmp.h index 7abb68311f17..5aff8963c11a 100644 --- a/include/linux/firmware/xlnx-zynqmp.h +++ b/include/linux/firmware/xlnx-zynqmp.h @@ -296,7 +296,6 @@ struct zynqmp_pm_query_data { struct zynqmp_eemi_ops { int (*fpga_load)(const u64 address, const u32 size, const u32 flags); int (*fpga_get_status)(u32 *value); - int (*ioctl)(u32 node_id, u32 ioctl_id, u32 arg1, u32 arg2, u32 *out); int (*reset_assert)(const enum zynqmp_pm_reset reset, const enum zynqmp_pm_reset_action assert_flag); int (*reset_get_status)(const enum zynqmp_pm_reset reset, u32 *status); @@ -331,6 +330,12 @@ int zynqmp_pm_clock_setrate(u32 clock_id, u64 rate); int zynqmp_pm_clock_getrate(u32 clock_id, u64 *rate); int zynqmp_pm_clock_setparent(u32 clock_id, u32 parent_id); int zynqmp_pm_clock_getparent(u32 clock_id, u32 *parent_id); +int zynqmp_pm_set_pll_frac_mode(u32 clk_id, u32 mode); +int zynqmp_pm_get_pll_frac_mode(u32 clk_id, u32 *mode); +int zynqmp_pm_set_pll_frac_data(u32 clk_id, u32 data); +int zynqmp_pm_get_pll_frac_data(u32 clk_id, u32 *data); +int zynqmp_pm_set_sd_tapdelay(u32 node_id, u32 type, u32 value); +int zynqmp_pm_sd_dll_reset(u32 node_id, u32 type); #else static inline struct zynqmp_eemi_ops *zynqmp_pm_get_eemi_ops(void) { @@ -385,6 +390,30 @@ static inline int zynqmp_pm_clock_getparent(u32 clock_id, u32 *parent_id) { return -ENODEV; } +static inline int zynqmp_pm_set_pll_frac_mode(u32 clk_id, u32 mode) +{ + return -ENODEV; +} +static inline int zynqmp_pm_get_pll_frac_mode(u32 clk_id, u32 *mode) +{ + return -ENODEV; +} +static inline int 
zynqmp_pm_set_pll_frac_data(u32 clk_id, u32 data) +{ + return -ENODEV; +} +static inline int zynqmp_pm_get_pll_frac_data(u32 clk_id, u32 *data) +{ + return -ENODEV; +} +static inline int zynqmp_pm_set_sd_tapdelay(u32 node_id, u32 type, u32 value) +{ + return -ENODEV; +} +static inline int zynqmp_pm_sd_dll_reset(u32 node_id, u32 type) +{ + return -ENODEV; +} #endif #endif /* __FIRMWARE_ZYNQMP_H__ */ -- cgit v1.2.3 From cf23ec3531462376ef48237235daea577e1194e7 Mon Sep 17 00:00:00 2001 From: Rajan Vaja Date: Fri, 24 Apr 2020 13:57:54 -0700 Subject: firmware: xilinx: Remove eemi ops for reset_assert Use direct function call instead of using eemi ops for reset_assert. Signed-off-by: Rajan Vaja Signed-off-by: Jolly Shah Link: https://lore.kernel.org/r/1587761887-4279-13-git-send-email-jolly.shah@xilinx.com Signed-off-by: Greg Kroah-Hartman --- drivers/firmware/xilinx/zynqmp.c | 6 +++--- drivers/reset/reset-zynqmp.c | 18 ++++++------------ include/linux/firmware/xlnx-zynqmp.h | 9 +++++++-- 3 files changed, 16 insertions(+), 17 deletions(-) (limited to 'include/linux') diff --git a/drivers/firmware/xilinx/zynqmp.c b/drivers/firmware/xilinx/zynqmp.c index 94fd755b19c6..2a790917ef8e 100644 --- a/drivers/firmware/xilinx/zynqmp.c +++ b/drivers/firmware/xilinx/zynqmp.c @@ -627,12 +627,13 @@ EXPORT_SYMBOL_GPL(zynqmp_pm_sd_dll_reset); * * Return: Returns status, either success or error+reason */ -static int zynqmp_pm_reset_assert(const enum zynqmp_pm_reset reset, - const enum zynqmp_pm_reset_action assert_flag) +int zynqmp_pm_reset_assert(const enum zynqmp_pm_reset reset, + const enum zynqmp_pm_reset_action assert_flag) { return zynqmp_pm_invoke_fn(PM_RESET_ASSERT, reset, assert_flag, 0, 0, NULL); } +EXPORT_SYMBOL_GPL(zynqmp_pm_reset_assert); /** * zynqmp_pm_reset_get_status - Get status of the reset @@ -807,7 +808,6 @@ static int zynqmp_pm_aes_engine(const u64 address, u32 *out) } static const struct zynqmp_eemi_ops eemi_ops = { - .reset_assert = zynqmp_pm_reset_assert, .reset_get_status = zynqmp_pm_reset_get_status, .init_finalize = zynqmp_pm_init_finalize, .set_suspend_mode = zynqmp_pm_set_suspend_mode, diff --git a/drivers/reset/reset-zynqmp.c b/drivers/reset/reset-zynqmp.c index 0144075b11a6..4a01b7e94549 100644 --- a/drivers/reset/reset-zynqmp.c +++ b/drivers/reset/reset-zynqmp.c @@ -27,19 +27,15 @@ to_zynqmp_reset_data(struct reset_controller_dev *rcdev) static int zynqmp_reset_assert(struct reset_controller_dev *rcdev, unsigned long id) { - struct zynqmp_reset_data *priv = to_zynqmp_reset_data(rcdev); - - return priv->eemi_ops->reset_assert(ZYNQMP_RESET_ID + id, - PM_RESET_ACTION_ASSERT); + return zynqmp_pm_reset_assert(ZYNQMP_RESET_ID + id, + PM_RESET_ACTION_ASSERT); } static int zynqmp_reset_deassert(struct reset_controller_dev *rcdev, unsigned long id) { - struct zynqmp_reset_data *priv = to_zynqmp_reset_data(rcdev); - - return priv->eemi_ops->reset_assert(ZYNQMP_RESET_ID + id, - PM_RESET_ACTION_RELEASE); + return zynqmp_pm_reset_assert(ZYNQMP_RESET_ID + id, + PM_RESET_ACTION_RELEASE); } static int zynqmp_reset_status(struct reset_controller_dev *rcdev, @@ -58,10 +54,8 @@ static int zynqmp_reset_status(struct reset_controller_dev *rcdev, static int zynqmp_reset_reset(struct reset_controller_dev *rcdev, unsigned long id) { - struct zynqmp_reset_data *priv = to_zynqmp_reset_data(rcdev); - - return priv->eemi_ops->reset_assert(ZYNQMP_RESET_ID + id, - PM_RESET_ACTION_PULSE); + return zynqmp_pm_reset_assert(ZYNQMP_RESET_ID + id, + PM_RESET_ACTION_PULSE); } static const struct reset_control_ops 
zynqmp_reset_ops = { diff --git a/include/linux/firmware/xlnx-zynqmp.h b/include/linux/firmware/xlnx-zynqmp.h index 5aff8963c11a..22b2bbe0faea 100644 --- a/include/linux/firmware/xlnx-zynqmp.h +++ b/include/linux/firmware/xlnx-zynqmp.h @@ -296,8 +296,6 @@ struct zynqmp_pm_query_data { struct zynqmp_eemi_ops { int (*fpga_load)(const u64 address, const u32 size, const u32 flags); int (*fpga_get_status)(u32 *value); - int (*reset_assert)(const enum zynqmp_pm_reset reset, - const enum zynqmp_pm_reset_action assert_flag); int (*reset_get_status)(const enum zynqmp_pm_reset reset, u32 *status); int (*init_finalize)(void); int (*set_suspend_mode)(u32 mode); @@ -336,6 +334,8 @@ int zynqmp_pm_set_pll_frac_data(u32 clk_id, u32 data); int zynqmp_pm_get_pll_frac_data(u32 clk_id, u32 *data); int zynqmp_pm_set_sd_tapdelay(u32 node_id, u32 type, u32 value); int zynqmp_pm_sd_dll_reset(u32 node_id, u32 type); +int zynqmp_pm_reset_assert(const enum zynqmp_pm_reset reset, + const enum zynqmp_pm_reset_action assert_flag); #else static inline struct zynqmp_eemi_ops *zynqmp_pm_get_eemi_ops(void) { @@ -414,6 +414,11 @@ static inline int zynqmp_pm_sd_dll_reset(u32 node_id, u32 type) { return -ENODEV; } +static inline int zynqmp_pm_reset_assert(const enum zynqmp_pm_reset reset, + const enum zynqmp_pm_reset_action assert_flag) +{ + return -ENODEV; +} #endif #endif /* __FIRMWARE_ZYNQMP_H__ */ -- cgit v1.2.3 From 1b413581fe26404217a2160d444642440bb0e9e7 Mon Sep 17 00:00:00 2001 From: Rajan Vaja Date: Fri, 24 Apr 2020 13:57:55 -0700 Subject: firmware: xilinx: Remove eemi ops for reset_get_status Use direct function call instead of using eemi ops for reset_get_status. Signed-off-by: Rajan Vaja Signed-off-by: Jolly Shah Link: https://lore.kernel.org/r/1587761887-4279-14-git-send-email-jolly.shah@xilinx.com Signed-off-by: Greg Kroah-Hartman --- drivers/firmware/xilinx/zynqmp.c | 5 ++--- drivers/reset/reset-zynqmp.c | 8 +------- include/linux/firmware/xlnx-zynqmp.h | 7 ++++++- 3 files changed, 9 insertions(+), 11 deletions(-) (limited to 'include/linux') diff --git a/drivers/firmware/xilinx/zynqmp.c b/drivers/firmware/xilinx/zynqmp.c index 2a790917ef8e..4380853526eb 100644 --- a/drivers/firmware/xilinx/zynqmp.c +++ b/drivers/firmware/xilinx/zynqmp.c @@ -642,8 +642,7 @@ EXPORT_SYMBOL_GPL(zynqmp_pm_reset_assert); * * Return: Returns status, either success or error+reason */ -static int zynqmp_pm_reset_get_status(const enum zynqmp_pm_reset reset, - u32 *status) +int zynqmp_pm_reset_get_status(const enum zynqmp_pm_reset reset, u32 *status) { u32 ret_payload[PAYLOAD_ARG_CNT]; int ret; @@ -657,6 +656,7 @@ static int zynqmp_pm_reset_get_status(const enum zynqmp_pm_reset reset, return ret; } +EXPORT_SYMBOL_GPL(zynqmp_pm_reset_get_status); /** * zynqmp_pm_fpga_load - Perform the fpga load @@ -808,7 +808,6 @@ static int zynqmp_pm_aes_engine(const u64 address, u32 *out) } static const struct zynqmp_eemi_ops eemi_ops = { - .reset_get_status = zynqmp_pm_reset_get_status, .init_finalize = zynqmp_pm_init_finalize, .set_suspend_mode = zynqmp_pm_set_suspend_mode, .request_node = zynqmp_pm_request_node, diff --git a/drivers/reset/reset-zynqmp.c b/drivers/reset/reset-zynqmp.c index 4a01b7e94549..373ea8d4f7a1 100644 --- a/drivers/reset/reset-zynqmp.c +++ b/drivers/reset/reset-zynqmp.c @@ -15,7 +15,6 @@ struct zynqmp_reset_data { struct reset_controller_dev rcdev; - const struct zynqmp_eemi_ops *eemi_ops; }; static inline struct zynqmp_reset_data * @@ -41,10 +40,9 @@ static int zynqmp_reset_deassert(struct reset_controller_dev *rcdev, static 
int zynqmp_reset_status(struct reset_controller_dev *rcdev, unsigned long id) { - struct zynqmp_reset_data *priv = to_zynqmp_reset_data(rcdev); int val, err; - err = priv->eemi_ops->reset_get_status(ZYNQMP_RESET_ID + id, &val); + err = zynqmp_pm_reset_get_status(ZYNQMP_RESET_ID + id, &val); if (err) return err; @@ -73,10 +71,6 @@ static int zynqmp_reset_probe(struct platform_device *pdev) if (!priv) return -ENOMEM; - priv->eemi_ops = zynqmp_pm_get_eemi_ops(); - if (IS_ERR(priv->eemi_ops)) - return PTR_ERR(priv->eemi_ops); - platform_set_drvdata(pdev, priv); priv->rcdev.ops = &zynqmp_reset_ops; diff --git a/include/linux/firmware/xlnx-zynqmp.h b/include/linux/firmware/xlnx-zynqmp.h index 22b2bbe0faea..200f9e0dc406 100644 --- a/include/linux/firmware/xlnx-zynqmp.h +++ b/include/linux/firmware/xlnx-zynqmp.h @@ -296,7 +296,6 @@ struct zynqmp_pm_query_data { struct zynqmp_eemi_ops { int (*fpga_load)(const u64 address, const u32 size, const u32 flags); int (*fpga_get_status)(u32 *value); - int (*reset_get_status)(const enum zynqmp_pm_reset reset, u32 *status); int (*init_finalize)(void); int (*set_suspend_mode)(u32 mode); int (*request_node)(const u32 node, @@ -336,6 +335,7 @@ int zynqmp_pm_set_sd_tapdelay(u32 node_id, u32 type, u32 value); int zynqmp_pm_sd_dll_reset(u32 node_id, u32 type); int zynqmp_pm_reset_assert(const enum zynqmp_pm_reset reset, const enum zynqmp_pm_reset_action assert_flag); +int zynqmp_pm_reset_get_status(const enum zynqmp_pm_reset reset, u32 *status); #else static inline struct zynqmp_eemi_ops *zynqmp_pm_get_eemi_ops(void) { @@ -419,6 +419,11 @@ static inline int zynqmp_pm_reset_assert(const enum zynqmp_pm_reset reset, { return -ENODEV; } +static inline int zynqmp_pm_reset_get_status(const enum zynqmp_pm_reset reset, + u32 *status) +{ + return -ENODEV; +} #endif #endif /* __FIRMWARE_ZYNQMP_H__ */ -- cgit v1.2.3 From 9474da950d1e39f71cbfeaba87367fa146635a88 Mon Sep 17 00:00:00 2001 From: Rajan Vaja Date: Fri, 24 Apr 2020 13:57:56 -0700 Subject: firmware: xilinx: Remove eemi ops for init_finalize Use direct function call instead of eemi ops for init_finalize. 
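Sketched effect on the caller (compare the zynqmp_power.c hunk below; example_probe_step() is hypothetical):

	#include <linux/firmware/xlnx-zynqmp.h>

	static int example_probe_step(void)
	{
		/*
		 * No ops lookup or NULL-callback check is needed any more;
		 * when the firmware driver is absent, the header stub
		 * returns -ENODEV.
		 */
		return zynqmp_pm_init_finalize();
	}
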
Signed-off-by: Rajan Vaja Signed-off-by: Jolly Shah Link: https://lore.kernel.org/r/1587761887-4279-15-git-send-email-jolly.shah@xilinx.com Signed-off-by: Greg Kroah-Hartman --- drivers/firmware/xilinx/zynqmp.c | 4 ++-- drivers/soc/xilinx/zynqmp_power.c | 9 +-------- include/linux/firmware/xlnx-zynqmp.h | 6 +++++- 3 files changed, 8 insertions(+), 11 deletions(-) (limited to 'include/linux') diff --git a/drivers/firmware/xilinx/zynqmp.c b/drivers/firmware/xilinx/zynqmp.c index 4380853526eb..869366f3fbf7 100644 --- a/drivers/firmware/xilinx/zynqmp.c +++ b/drivers/firmware/xilinx/zynqmp.c @@ -710,10 +710,11 @@ static int zynqmp_pm_fpga_get_status(u32 *value) * * Return: Returns status, either success or error+reason */ -static int zynqmp_pm_init_finalize(void) +int zynqmp_pm_init_finalize(void) { return zynqmp_pm_invoke_fn(PM_PM_INIT_FINALIZE, 0, 0, 0, 0, NULL); } +EXPORT_SYMBOL_GPL(zynqmp_pm_init_finalize); /** * zynqmp_pm_set_suspend_mode() - Set system suspend mode @@ -808,7 +809,6 @@ static int zynqmp_pm_aes_engine(const u64 address, u32 *out) } static const struct zynqmp_eemi_ops eemi_ops = { - .init_finalize = zynqmp_pm_init_finalize, .set_suspend_mode = zynqmp_pm_set_suspend_mode, .request_node = zynqmp_pm_request_node, .release_node = zynqmp_pm_release_node, diff --git a/drivers/soc/xilinx/zynqmp_power.c b/drivers/soc/xilinx/zynqmp_power.c index d327d9ef0e81..f4a937143098 100644 --- a/drivers/soc/xilinx/zynqmp_power.c +++ b/drivers/soc/xilinx/zynqmp_power.c @@ -182,14 +182,7 @@ static int zynqmp_pm_probe(struct platform_device *pdev) u32 pm_api_version; struct mbox_client *client; - eemi_ops = zynqmp_pm_get_eemi_ops(); - if (IS_ERR(eemi_ops)) - return PTR_ERR(eemi_ops); - - if (!eemi_ops->init_finalize) - return -ENXIO; - - eemi_ops->init_finalize(); + zynqmp_pm_init_finalize(); zynqmp_pm_get_api_version(&pm_api_version); /* Check PM API version number */ diff --git a/include/linux/firmware/xlnx-zynqmp.h b/include/linux/firmware/xlnx-zynqmp.h index 200f9e0dc406..9aa5fe8acbea 100644 --- a/include/linux/firmware/xlnx-zynqmp.h +++ b/include/linux/firmware/xlnx-zynqmp.h @@ -296,7 +296,6 @@ struct zynqmp_pm_query_data { struct zynqmp_eemi_ops { int (*fpga_load)(const u64 address, const u32 size, const u32 flags); int (*fpga_get_status)(u32 *value); - int (*init_finalize)(void); int (*set_suspend_mode)(u32 mode); int (*request_node)(const u32 node, const u32 capabilities, @@ -336,6 +335,7 @@ int zynqmp_pm_sd_dll_reset(u32 node_id, u32 type); int zynqmp_pm_reset_assert(const enum zynqmp_pm_reset reset, const enum zynqmp_pm_reset_action assert_flag); int zynqmp_pm_reset_get_status(const enum zynqmp_pm_reset reset, u32 *status); +int zynqmp_pm_init_finalize(void); #else static inline struct zynqmp_eemi_ops *zynqmp_pm_get_eemi_ops(void) { @@ -424,6 +424,10 @@ static inline int zynqmp_pm_reset_get_status(const enum zynqmp_pm_reset reset, { return -ENODEV; } +static inline int zynqmp_pm_init_finalize(void) +{ + return -ENODEV; +} #endif #endif /* __FIRMWARE_ZYNQMP_H__ */ -- cgit v1.2.3 From 951d0a97e41caff96c180ca416093276bfd9a4fd Mon Sep 17 00:00:00 2001 From: Rajan Vaja Date: Fri, 24 Apr 2020 13:57:57 -0700 Subject: firmware: xilinx: Remove eemi ops for set_suspend_mode Use direct function call instead of eemi ops for set_suspend_mode. 
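A hedged caller sketch (example_store_mode() and its pr_warn() handling are illustrative; the API and its -ENODEV stub come from the diffs below):

	#include <linux/firmware/xlnx-zynqmp.h>
	#include <linux/printk.h>

	static void example_store_mode(u32 md)
	{
		int ret = zynqmp_pm_set_suspend_mode(md);

		/* -ENODEV indicates the firmware interface is not available. */
		if (ret)
			pr_warn("setting suspend mode failed: %d\n", ret);
	}
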
Signed-off-by: Rajan Vaja Signed-off-by: Jolly Shah Link: https://lore.kernel.org/r/1587761887-4279-16-git-send-email-jolly.shah@xilinx.com Signed-off-by: Greg Kroah-Hartman --- drivers/firmware/xilinx/zynqmp.c | 4 ++-- drivers/soc/xilinx/zynqmp_power.c | 6 +----- include/linux/firmware/xlnx-zynqmp.h | 6 +++++- 3 files changed, 8 insertions(+), 8 deletions(-) (limited to 'include/linux') diff --git a/drivers/firmware/xilinx/zynqmp.c b/drivers/firmware/xilinx/zynqmp.c index 869366f3fbf7..cc74165b845b 100644 --- a/drivers/firmware/xilinx/zynqmp.c +++ b/drivers/firmware/xilinx/zynqmp.c @@ -724,10 +724,11 @@ EXPORT_SYMBOL_GPL(zynqmp_pm_init_finalize); * * Return: Returns status, either success or error+reason */ -static int zynqmp_pm_set_suspend_mode(u32 mode) +int zynqmp_pm_set_suspend_mode(u32 mode) { return zynqmp_pm_invoke_fn(PM_SET_SUSPEND_MODE, mode, 0, 0, 0, NULL); } +EXPORT_SYMBOL_GPL(zynqmp_pm_set_suspend_mode); /** * zynqmp_pm_request_node() - Request a node with specific capabilities @@ -809,7 +810,6 @@ static int zynqmp_pm_aes_engine(const u64 address, u32 *out) } static const struct zynqmp_eemi_ops eemi_ops = { - .set_suspend_mode = zynqmp_pm_set_suspend_mode, .request_node = zynqmp_pm_request_node, .release_node = zynqmp_pm_release_node, .set_requirement = zynqmp_pm_set_requirement, diff --git a/drivers/soc/xilinx/zynqmp_power.c b/drivers/soc/xilinx/zynqmp_power.c index f4a937143098..31ff49fcd078 100644 --- a/drivers/soc/xilinx/zynqmp_power.c +++ b/drivers/soc/xilinx/zynqmp_power.c @@ -30,7 +30,6 @@ struct zynqmp_pm_work_struct { static struct zynqmp_pm_work_struct *zynqmp_pm_init_suspend_work; static struct mbox_chan *rx_chan; -static const struct zynqmp_eemi_ops *eemi_ops; enum pm_suspend_mode { PM_SUSPEND_MODE_FIRST = 0, @@ -155,9 +154,6 @@ static ssize_t suspend_mode_store(struct device *dev, { int md, ret = -EINVAL; - if (!eemi_ops->set_suspend_mode) - return ret; - for (md = PM_SUSPEND_MODE_FIRST; md < ARRAY_SIZE(suspend_modes); md++) if (suspend_modes[md] && sysfs_streq(suspend_modes[md], buf)) { @@ -166,7 +162,7 @@ static ssize_t suspend_mode_store(struct device *dev, } if (!ret && md != suspend_mode) { - ret = eemi_ops->set_suspend_mode(md); + ret = zynqmp_pm_set_suspend_mode(md); if (likely(!ret)) suspend_mode = md; } diff --git a/include/linux/firmware/xlnx-zynqmp.h b/include/linux/firmware/xlnx-zynqmp.h index 9aa5fe8acbea..761caede5a51 100644 --- a/include/linux/firmware/xlnx-zynqmp.h +++ b/include/linux/firmware/xlnx-zynqmp.h @@ -296,7 +296,6 @@ struct zynqmp_pm_query_data { struct zynqmp_eemi_ops { int (*fpga_load)(const u64 address, const u32 size, const u32 flags); int (*fpga_get_status)(u32 *value); - int (*set_suspend_mode)(u32 mode); int (*request_node)(const u32 node, const u32 capabilities, const u32 qos, @@ -336,6 +335,7 @@ int zynqmp_pm_reset_assert(const enum zynqmp_pm_reset reset, const enum zynqmp_pm_reset_action assert_flag); int zynqmp_pm_reset_get_status(const enum zynqmp_pm_reset reset, u32 *status); int zynqmp_pm_init_finalize(void); +int zynqmp_pm_set_suspend_mode(u32 mode); #else static inline struct zynqmp_eemi_ops *zynqmp_pm_get_eemi_ops(void) { @@ -428,6 +428,10 @@ static inline int zynqmp_pm_init_finalize(void) { return -ENODEV; } +static inline int zynqmp_pm_set_suspend_mode(u32 mode) +{ + return -ENODEV; +} #endif #endif /* __FIRMWARE_ZYNQMP_H__ */ -- cgit v1.2.3 From bf8b27ed2324b5108439593dcfb9ab264a745ee7 Mon Sep 17 00:00:00 2001 From: Rajan Vaja Date: Fri, 24 Apr 2020 13:57:58 -0700 Subject: firmware: xilinx: Remove eemi ops for 
request_node Use direct function call instead of using eemi ops for request_node. Signed-off-by: Rajan Vaja Signed-off-by: Jolly Shah Link: https://lore.kernel.org/r/1587761887-4279-17-git-send-email-jolly.shah@xilinx.com Signed-off-by: Greg Kroah-Hartman --- drivers/firmware/xilinx/zynqmp.c | 7 +++---- drivers/soc/xilinx/zynqmp_pm_domains.c | 5 +---- include/linux/firmware/xlnx-zynqmp.h | 12 ++++++++---- 3 files changed, 12 insertions(+), 12 deletions(-) (limited to 'include/linux') diff --git a/drivers/firmware/xilinx/zynqmp.c b/drivers/firmware/xilinx/zynqmp.c index cc74165b845b..e65ee76880ac 100644 --- a/drivers/firmware/xilinx/zynqmp.c +++ b/drivers/firmware/xilinx/zynqmp.c @@ -742,13 +742,13 @@ EXPORT_SYMBOL_GPL(zynqmp_pm_set_suspend_mode); * * Return: Returns status, either success or error+reason */ -static int zynqmp_pm_request_node(const u32 node, const u32 capabilities, - const u32 qos, - const enum zynqmp_pm_request_ack ack) +int zynqmp_pm_request_node(const u32 node, const u32 capabilities, + const u32 qos, const enum zynqmp_pm_request_ack ack) { return zynqmp_pm_invoke_fn(PM_REQUEST_NODE, node, capabilities, qos, ack, NULL); } +EXPORT_SYMBOL_GPL(zynqmp_pm_request_node); /** * zynqmp_pm_release_node() - Release a node @@ -810,7 +810,6 @@ static int zynqmp_pm_aes_engine(const u64 address, u32 *out) } static const struct zynqmp_eemi_ops eemi_ops = { - .request_node = zynqmp_pm_request_node, .release_node = zynqmp_pm_release_node, .set_requirement = zynqmp_pm_set_requirement, .fpga_load = zynqmp_pm_fpga_load, diff --git a/drivers/soc/xilinx/zynqmp_pm_domains.c b/drivers/soc/xilinx/zynqmp_pm_domains.c index 23d90cb12ba9..cf4eed0e3011 100644 --- a/drivers/soc/xilinx/zynqmp_pm_domains.c +++ b/drivers/soc/xilinx/zynqmp_pm_domains.c @@ -163,16 +163,13 @@ static int zynqmp_gpd_attach_dev(struct generic_pm_domain *domain, int ret; struct zynqmp_pm_domain *pd; - if (!eemi_ops->request_node) - return -ENXIO; - pd = container_of(domain, struct zynqmp_pm_domain, gpd); /* If this is not the first device to attach there is nothing to do */ if (domain->device_count) return 0; - ret = eemi_ops->request_node(pd->node_id, 0, 0, + ret = zynqmp_pm_request_node(pd->node_id, 0, 0, ZYNQMP_PM_REQUEST_ACK_BLOCKING); /* If requesting a node fails print and return the error */ if (ret) { diff --git a/include/linux/firmware/xlnx-zynqmp.h b/include/linux/firmware/xlnx-zynqmp.h index 761caede5a51..fb7e5c91c816 100644 --- a/include/linux/firmware/xlnx-zynqmp.h +++ b/include/linux/firmware/xlnx-zynqmp.h @@ -296,10 +296,6 @@ struct zynqmp_pm_query_data { struct zynqmp_eemi_ops { int (*fpga_load)(const u64 address, const u32 size, const u32 flags); int (*fpga_get_status)(u32 *value); - int (*request_node)(const u32 node, - const u32 capabilities, - const u32 qos, - const enum zynqmp_pm_request_ack ack); int (*release_node)(const u32 node); int (*set_requirement)(const u32 node, const u32 capabilities, @@ -336,6 +332,8 @@ int zynqmp_pm_reset_assert(const enum zynqmp_pm_reset reset, int zynqmp_pm_reset_get_status(const enum zynqmp_pm_reset reset, u32 *status); int zynqmp_pm_init_finalize(void); int zynqmp_pm_set_suspend_mode(u32 mode); +int zynqmp_pm_request_node(const u32 node, const u32 capabilities, + const u32 qos, const enum zynqmp_pm_request_ack ack); #else static inline struct zynqmp_eemi_ops *zynqmp_pm_get_eemi_ops(void) { @@ -432,6 +430,12 @@ static inline int zynqmp_pm_set_suspend_mode(u32 mode) { return -ENODEV; } +static inline int zynqmp_pm_request_node(const u32 node, const u32 capabilities, + const 
u32 qos, + const enum zynqmp_pm_request_ack ack) +{ + return -ENODEV; +} #endif #endif /* __FIRMWARE_ZYNQMP_H__ */ -- cgit v1.2.3 From 07fb1a4619fcb35f79d0adc13c8678f7726337ef Mon Sep 17 00:00:00 2001 From: Rajan Vaja Date: Fri, 24 Apr 2020 13:57:59 -0700 Subject: firmware: xilinx: Remove eemi ops for release_node Use direct function call instead of using eemi ops for release_node. Signed-off-by: Rajan Vaja Signed-off-by: Jolly Shah Link: https://lore.kernel.org/r/1587761887-4279-18-git-send-email-jolly.shah@xilinx.com Signed-off-by: Greg Kroah-Hartman --- drivers/firmware/xilinx/zynqmp.c | 4 ++-- drivers/soc/xilinx/zynqmp_pm_domains.c | 5 +---- include/linux/firmware/xlnx-zynqmp.h | 6 +++++- 3 files changed, 8 insertions(+), 7 deletions(-) (limited to 'include/linux') diff --git a/drivers/firmware/xilinx/zynqmp.c b/drivers/firmware/xilinx/zynqmp.c index e65ee76880ac..ce65bafd2fcf 100644 --- a/drivers/firmware/xilinx/zynqmp.c +++ b/drivers/firmware/xilinx/zynqmp.c @@ -760,10 +760,11 @@ EXPORT_SYMBOL_GPL(zynqmp_pm_request_node); * * Return: Returns status, either success or error+reason */ -static int zynqmp_pm_release_node(const u32 node) +int zynqmp_pm_release_node(const u32 node) { return zynqmp_pm_invoke_fn(PM_RELEASE_NODE, node, 0, 0, 0, NULL); } +EXPORT_SYMBOL_GPL(zynqmp_pm_release_node); /** * zynqmp_pm_set_requirement() - PM call to set requirement for PM slaves @@ -810,7 +811,6 @@ static int zynqmp_pm_aes_engine(const u64 address, u32 *out) } static const struct zynqmp_eemi_ops eemi_ops = { - .release_node = zynqmp_pm_release_node, .set_requirement = zynqmp_pm_set_requirement, .fpga_load = zynqmp_pm_fpga_load, .fpga_get_status = zynqmp_pm_fpga_get_status, diff --git a/drivers/soc/xilinx/zynqmp_pm_domains.c b/drivers/soc/xilinx/zynqmp_pm_domains.c index cf4eed0e3011..20bee26535d1 100644 --- a/drivers/soc/xilinx/zynqmp_pm_domains.c +++ b/drivers/soc/xilinx/zynqmp_pm_domains.c @@ -196,16 +196,13 @@ static void zynqmp_gpd_detach_dev(struct generic_pm_domain *domain, int ret; struct zynqmp_pm_domain *pd; - if (!eemi_ops->release_node) - return; - pd = container_of(domain, struct zynqmp_pm_domain, gpd); /* If this is not the last device to detach there is nothing to do */ if (domain->device_count) return; - ret = eemi_ops->release_node(pd->node_id); + ret = zynqmp_pm_release_node(pd->node_id); /* If releasing a node fails print the error and return */ if (ret) { pr_err("%s() %s release failed for node %d: %d\n", diff --git a/include/linux/firmware/xlnx-zynqmp.h b/include/linux/firmware/xlnx-zynqmp.h index fb7e5c91c816..bfa8cca64455 100644 --- a/include/linux/firmware/xlnx-zynqmp.h +++ b/include/linux/firmware/xlnx-zynqmp.h @@ -296,7 +296,6 @@ struct zynqmp_pm_query_data { struct zynqmp_eemi_ops { int (*fpga_load)(const u64 address, const u32 size, const u32 flags); int (*fpga_get_status)(u32 *value); - int (*release_node)(const u32 node); int (*set_requirement)(const u32 node, const u32 capabilities, const u32 qos, @@ -334,6 +333,7 @@ int zynqmp_pm_init_finalize(void); int zynqmp_pm_set_suspend_mode(u32 mode); int zynqmp_pm_request_node(const u32 node, const u32 capabilities, const u32 qos, const enum zynqmp_pm_request_ack ack); +int zynqmp_pm_release_node(const u32 node); #else static inline struct zynqmp_eemi_ops *zynqmp_pm_get_eemi_ops(void) { @@ -436,6 +436,10 @@ static inline int zynqmp_pm_request_node(const u32 node, const u32 capabilities, { return -ENODEV; } +static inline int zynqmp_pm_release_node(const u32 node) +{ + return -ENODEV; +} #endif #endif /* __FIRMWARE_ZYNQMP_H__ */ 
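Every helper converted by this series follows the same header idiom seen in the hunk above: a real prototype when the firmware driver is reachable, and a static inline stub returning -ENODEV otherwise, so callers build without any #ifdef. A generic sketch of the idiom, with a hypothetical function name:

    #if IS_REACHABLE(CONFIG_ZYNQMP_FIRMWARE)
    int zynqmp_pm_do_something(u32 arg);
    #else
    static inline int zynqmp_pm_do_something(u32 arg)
    {
            /* Firmware interface not built in; callers see a clean error */
            return -ENODEV;
    }
    #endif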
-- cgit v1.2.3 From cbbbda71fe37fe70e610d5ec3977fc6a096280ed Mon Sep 17 00:00:00 2001 From: Rajan Vaja Date: Fri, 24 Apr 2020 13:58:00 -0700 Subject: firmware: xilinx: Remove eemi ops for set_requirement Use direct function call instead of using eemi ops for set_requirement. Signed-off-by: Rajan Vaja Signed-off-by: Jolly Shah Link: https://lore.kernel.org/r/1587761887-4279-19-git-send-email-jolly.shah@xilinx.com Signed-off-by: Greg Kroah-Hartman --- drivers/firmware/xilinx/zynqmp.c | 8 ++++---- drivers/soc/xilinx/zynqmp_pm_domains.c | 16 ++-------------- include/linux/firmware/xlnx-zynqmp.h | 14 ++++++++++---- 3 files changed, 16 insertions(+), 22 deletions(-) (limited to 'include/linux') diff --git a/drivers/firmware/xilinx/zynqmp.c b/drivers/firmware/xilinx/zynqmp.c index ce65bafd2fcf..0c5c8bccbeba 100644 --- a/drivers/firmware/xilinx/zynqmp.c +++ b/drivers/firmware/xilinx/zynqmp.c @@ -778,13 +778,14 @@ EXPORT_SYMBOL_GPL(zynqmp_pm_release_node); * * Return: Returns status, either success or error+reason */ -static int zynqmp_pm_set_requirement(const u32 node, const u32 capabilities, - const u32 qos, - const enum zynqmp_pm_request_ack ack) +int zynqmp_pm_set_requirement(const u32 node, const u32 capabilities, + const u32 qos, + const enum zynqmp_pm_request_ack ack) { return zynqmp_pm_invoke_fn(PM_SET_REQUIREMENT, node, capabilities, qos, ack, NULL); } +EXPORT_SYMBOL_GPL(zynqmp_pm_set_requirement); /** * zynqmp_pm_aes - Access AES hardware to encrypt/decrypt the data using @@ -811,7 +812,6 @@ static int zynqmp_pm_aes_engine(const u64 address, u32 *out) } static const struct zynqmp_eemi_ops eemi_ops = { - .set_requirement = zynqmp_pm_set_requirement, .fpga_load = zynqmp_pm_fpga_load, .fpga_get_status = zynqmp_pm_fpga_get_status, .aes = zynqmp_pm_aes_engine, diff --git a/drivers/soc/xilinx/zynqmp_pm_domains.c b/drivers/soc/xilinx/zynqmp_pm_domains.c index 20bee26535d1..226d343f0a6a 100644 --- a/drivers/soc/xilinx/zynqmp_pm_domains.c +++ b/drivers/soc/xilinx/zynqmp_pm_domains.c @@ -23,8 +23,6 @@ /* Flag stating if PM nodes mapped to the PM domain has been requested */ #define ZYNQMP_PM_DOMAIN_REQUESTED BIT(0) -static const struct zynqmp_eemi_ops *eemi_ops; - static int min_capability; /** @@ -76,11 +74,8 @@ static int zynqmp_gpd_power_on(struct generic_pm_domain *domain) int ret; struct zynqmp_pm_domain *pd; - if (!eemi_ops->set_requirement) - return -ENXIO; - pd = container_of(domain, struct zynqmp_pm_domain, gpd); - ret = eemi_ops->set_requirement(pd->node_id, + ret = zynqmp_pm_set_requirement(pd->node_id, ZYNQMP_PM_CAPABILITY_ACCESS, ZYNQMP_PM_MAX_QOS, ZYNQMP_PM_REQUEST_ACK_BLOCKING); @@ -111,9 +106,6 @@ static int zynqmp_gpd_power_off(struct generic_pm_domain *domain) u32 capabilities = min_capability; bool may_wakeup; - if (!eemi_ops->set_requirement) - return -ENXIO; - pd = container_of(domain, struct zynqmp_pm_domain, gpd); /* If domain is already released there is nothing to be done */ @@ -134,7 +126,7 @@ static int zynqmp_gpd_power_off(struct generic_pm_domain *domain) } } - ret = eemi_ops->set_requirement(pd->node_id, capabilities, 0, + ret = zynqmp_pm_set_requirement(pd->node_id, capabilities, 0, ZYNQMP_PM_REQUEST_ACK_NO); /** * If powering down of any node inside this domain fails, @@ -260,10 +252,6 @@ static int zynqmp_gpd_probe(struct platform_device *pdev) struct zynqmp_pm_domain *pd; struct device *dev = &pdev->dev; - eemi_ops = zynqmp_pm_get_eemi_ops(); - if (IS_ERR(eemi_ops)) - return PTR_ERR(eemi_ops); - pd = devm_kcalloc(dev, ZYNQMP_NUM_DOMAINS, sizeof(*pd), GFP_KERNEL); 
if (!pd) return -ENOMEM; diff --git a/include/linux/firmware/xlnx-zynqmp.h b/include/linux/firmware/xlnx-zynqmp.h index bfa8cca64455..5927f6f49980 100644 --- a/include/linux/firmware/xlnx-zynqmp.h +++ b/include/linux/firmware/xlnx-zynqmp.h @@ -296,10 +296,6 @@ struct zynqmp_pm_query_data { struct zynqmp_eemi_ops { int (*fpga_load)(const u64 address, const u32 size, const u32 flags); int (*fpga_get_status)(u32 *value); - int (*set_requirement)(const u32 node, - const u32 capabilities, - const u32 qos, - const enum zynqmp_pm_request_ack ack); int (*aes)(const u64 address, u32 *out); }; @@ -334,6 +330,9 @@ int zynqmp_pm_set_suspend_mode(u32 mode); int zynqmp_pm_request_node(const u32 node, const u32 capabilities, const u32 qos, const enum zynqmp_pm_request_ack ack); int zynqmp_pm_release_node(const u32 node); +int zynqmp_pm_set_requirement(const u32 node, const u32 capabilities, + const u32 qos, + const enum zynqmp_pm_request_ack ack); #else static inline struct zynqmp_eemi_ops *zynqmp_pm_get_eemi_ops(void) { @@ -440,6 +439,13 @@ static inline int zynqmp_pm_release_node(const u32 node) { return -ENODEV; } +static inline int zynqmp_pm_set_requirement(const u32 node, + const u32 capabilities, + const u32 qos, + const enum zynqmp_pm_request_ack ack) +{ + return -ENODEV; +} #endif #endif /* __FIRMWARE_ZYNQMP_H__ */ -- cgit v1.2.3 From bc86f9c546160afce631fae223c3772d9375ee25 Mon Sep 17 00:00:00 2001 From: Rajan Vaja Date: Fri, 24 Apr 2020 13:58:01 -0700 Subject: firmware: xilinx: Remove eemi ops for aes engine Use direct function call for aes engine instead of using eemi ops. Signed-off-by: Rajan Vaja Signed-off-by: Jolly Shah Link: https://lore.kernel.org/r/1587761887-4279-20-git-send-email-jolly.shah@xilinx.com Signed-off-by: Greg Kroah-Hartman --- drivers/crypto/xilinx/zynqmp-aes-gcm.c | 12 +----------- drivers/firmware/xilinx/zynqmp.c | 4 ++-- include/linux/firmware/xlnx-zynqmp.h | 6 +++++- 3 files changed, 8 insertions(+), 14 deletions(-) (limited to 'include/linux') diff --git a/drivers/crypto/xilinx/zynqmp-aes-gcm.c b/drivers/crypto/xilinx/zynqmp-aes-gcm.c index 09f7f468eef8..d0a0daf3ea08 100644 --- a/drivers/crypto/xilinx/zynqmp-aes-gcm.c +++ b/drivers/crypto/xilinx/zynqmp-aes-gcm.c @@ -46,7 +46,6 @@ struct zynqmp_aead_drv_ctx { } alg; struct device *dev; struct crypto_engine *engine; - const struct zynqmp_eemi_ops *eemi_ops; }; struct zynqmp_aead_hw_req { @@ -92,9 +91,6 @@ static int zynqmp_aes_aead_cipher(struct aead_request *req) drv_ctx = container_of(alg, struct zynqmp_aead_drv_ctx, alg.aead); - if (!drv_ctx->eemi_ops->aes) - return -ENOTSUPP; - if (tfm_ctx->keysrc == ZYNQMP_AES_KUP_KEY) dma_size = req->cryptlen + ZYNQMP_AES_KEY_SIZE + GCM_AES_IV_SIZE; @@ -136,7 +132,7 @@ static int zynqmp_aes_aead_cipher(struct aead_request *req) hwreq->key = 0; } - drv_ctx->eemi_ops->aes(dma_addr_hw_req, &status); + zynqmp_pm_aes_engine(dma_addr_hw_req, &status); if (status) { switch (status) { @@ -388,12 +384,6 @@ static int zynqmp_aes_aead_probe(struct platform_device *pdev) else return -ENODEV; - aes_drv_ctx.eemi_ops = zynqmp_pm_get_eemi_ops(); - if (IS_ERR(aes_drv_ctx.eemi_ops)) { - dev_err(dev, "Failed to get ZynqMP EEMI interface\n"); - return PTR_ERR(aes_drv_ctx.eemi_ops); - } - err = dma_set_mask_and_coherent(dev, DMA_BIT_MASK(ZYNQMP_DMA_BIT_MASK)); if (err < 0) { dev_err(dev, "No usable DMA configuration\n"); diff --git a/drivers/firmware/xilinx/zynqmp.c b/drivers/firmware/xilinx/zynqmp.c index 0c5c8bccbeba..e6e7b63cde98 100644 --- a/drivers/firmware/xilinx/zynqmp.c +++ 
b/drivers/firmware/xilinx/zynqmp.c @@ -795,7 +795,7 @@ EXPORT_SYMBOL_GPL(zynqmp_pm_set_requirement); * * Return: Returns status, either success or error code. */ -static int zynqmp_pm_aes_engine(const u64 address, u32 *out) +int zynqmp_pm_aes_engine(const u64 address, u32 *out) { u32 ret_payload[PAYLOAD_ARG_CNT]; int ret; @@ -810,11 +810,11 @@ static int zynqmp_pm_aes_engine(const u64 address, u32 *out) return ret; } +EXPORT_SYMBOL_GPL(zynqmp_pm_aes_engine); static const struct zynqmp_eemi_ops eemi_ops = { .fpga_load = zynqmp_pm_fpga_load, .fpga_get_status = zynqmp_pm_fpga_get_status, - .aes = zynqmp_pm_aes_engine, }; /** diff --git a/include/linux/firmware/xlnx-zynqmp.h b/include/linux/firmware/xlnx-zynqmp.h index 5927f6f49980..11d7aef2cc3c 100644 --- a/include/linux/firmware/xlnx-zynqmp.h +++ b/include/linux/firmware/xlnx-zynqmp.h @@ -296,7 +296,6 @@ struct zynqmp_pm_query_data { struct zynqmp_eemi_ops { int (*fpga_load)(const u64 address, const u32 size, const u32 flags); int (*fpga_get_status)(u32 *value); - int (*aes)(const u64 address, u32 *out); }; int zynqmp_pm_invoke_fn(u32 pm_api_id, u32 arg0, u32 arg1, @@ -333,6 +332,7 @@ int zynqmp_pm_release_node(const u32 node); int zynqmp_pm_set_requirement(const u32 node, const u32 capabilities, const u32 qos, const enum zynqmp_pm_request_ack ack); +int zynqmp_pm_aes_engine(const u64 address, u32 *out); #else static inline struct zynqmp_eemi_ops *zynqmp_pm_get_eemi_ops(void) { @@ -446,6 +446,10 @@ static inline int zynqmp_pm_set_requirement(const u32 node, { return -ENODEV; } +static inline int zynqmp_pm_aes_engine(const u64 address, u32 *out) +{ + return -ENODEV; +} #endif #endif /* __FIRMWARE_ZYNQMP_H__ */ -- cgit v1.2.3 From 4db8180ffe7c07bc4a602c82d6d9c1c04751583d Mon Sep 17 00:00:00 2001 From: Rajan Vaja Date: Fri, 24 Apr 2020 13:58:02 -0700 Subject: firmware: xilinx: Remove eemi ops for fpga related APIs Use direct function call instead of using eemi ops for fpga related APIs. Also remove eemi ops structure. 
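As a sketch of what callers look like after this change (mirroring the zynqmp-fpga.c hunks below; dma_addr, size, eemi_flags and the buffer setup are context, not shown):

    u32 status = 0;
    int ret;

    /* Program the bitstream, then poll the DONE bit directly */
    ret = zynqmp_pm_fpga_load(dma_addr, size, eemi_flags);
    if (ret)
            return ret;

    zynqmp_pm_fpga_get_status(&status);
    if (status & IXR_FPGA_DONE_MASK)
            state = FPGA_MGR_STATE_OPERATING;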
Signed-off-by: Rajan Vaja Signed-off-by: Jolly Shah Link: https://lore.kernel.org/r/1587761887-4279-21-git-send-email-jolly.shah@xilinx.com Signed-off-by: Greg Kroah-Hartman --- drivers/firmware/xilinx/zynqmp.c | 30 ++---------------------------- drivers/fpga/zynqmp-fpga.c | 14 +++----------- drivers/spi/spi-zynqmp-gqspi.c | 5 ----- include/linux/firmware/xlnx-zynqmp.h | 16 +++++++++++----- 4 files changed, 16 insertions(+), 49 deletions(-) (limited to 'include/linux') diff --git a/drivers/firmware/xilinx/zynqmp.c b/drivers/firmware/xilinx/zynqmp.c index e6e7b63cde98..ef7ba32e017a 100644 --- a/drivers/firmware/xilinx/zynqmp.c +++ b/drivers/firmware/xilinx/zynqmp.c @@ -24,8 +24,6 @@ #include #include "zynqmp-debug.h" -static const struct zynqmp_eemi_ops *eemi_ops_tbl; - static bool feature_check_enabled; static u32 zynqmp_pm_features[PM_API_MAX]; @@ -671,8 +669,7 @@ EXPORT_SYMBOL_GPL(zynqmp_pm_reset_get_status); * * Return: Returns status, either success or error+reason */ -static int zynqmp_pm_fpga_load(const u64 address, const u32 size, - const u32 flags) +int zynqmp_pm_fpga_load(const u64 address, const u32 size, const u32 flags) { return zynqmp_pm_invoke_fn(PM_FPGA_LOAD, lower_32_bits(address), upper_32_bits(address), size, flags, NULL); @@ -687,7 +684,7 @@ static int zynqmp_pm_fpga_load(const u64 address, const u32 size, * * Return: Returns status, either success or error+reason */ -static int zynqmp_pm_fpga_get_status(u32 *value) +int zynqmp_pm_fpga_get_status(u32 *value) { u32 ret_payload[PAYLOAD_ARG_CNT]; int ret; @@ -812,26 +809,6 @@ int zynqmp_pm_aes_engine(const u64 address, u32 *out) } EXPORT_SYMBOL_GPL(zynqmp_pm_aes_engine); -static const struct zynqmp_eemi_ops eemi_ops = { - .fpga_load = zynqmp_pm_fpga_load, - .fpga_get_status = zynqmp_pm_fpga_get_status, -}; - -/** - * zynqmp_pm_get_eemi_ops - Get eemi ops functions - * - * Return: Pointer of eemi_ops structure - */ -const struct zynqmp_eemi_ops *zynqmp_pm_get_eemi_ops(void) -{ - if (eemi_ops_tbl) - return eemi_ops_tbl; - else - return ERR_PTR(-EPROBE_DEFER); - -} -EXPORT_SYMBOL_GPL(zynqmp_pm_get_eemi_ops); - static int zynqmp_firmware_probe(struct platform_device *pdev) { struct device *dev = &pdev->dev; @@ -878,9 +855,6 @@ static int zynqmp_firmware_probe(struct platform_device *pdev) pr_info("%s Trustzone version v%d.%d\n", __func__, pm_tz_version >> 16, pm_tz_version & 0xFFFF); - /* Assign eemi_ops_table */ - eemi_ops_tbl = &eemi_ops; - zynqmp_pm_api_debugfs_init(); ret = mfd_add_devices(&pdev->dev, PLATFORM_DEVID_NONE, firmware_devs, diff --git a/drivers/fpga/zynqmp-fpga.c b/drivers/fpga/zynqmp-fpga.c index b8a88d21d038..4a1139e05280 100644 --- a/drivers/fpga/zynqmp-fpga.c +++ b/drivers/fpga/zynqmp-fpga.c @@ -40,16 +40,12 @@ static int zynqmp_fpga_ops_write_init(struct fpga_manager *mgr, static int zynqmp_fpga_ops_write(struct fpga_manager *mgr, const char *buf, size_t size) { - const struct zynqmp_eemi_ops *eemi_ops = zynqmp_pm_get_eemi_ops(); struct zynqmp_fpga_priv *priv; dma_addr_t dma_addr; u32 eemi_flags = 0; char *kbuf; int ret; - if (IS_ERR_OR_NULL(eemi_ops) || !eemi_ops->fpga_load) - return -ENXIO; - priv = mgr->priv; kbuf = dma_alloc_coherent(priv->dev, size, &dma_addr, GFP_KERNEL); @@ -63,7 +59,7 @@ static int zynqmp_fpga_ops_write(struct fpga_manager *mgr, if (priv->flags & FPGA_MGR_PARTIAL_RECONFIG) eemi_flags |= XILINX_ZYNQMP_PM_FPGA_PARTIAL; - ret = eemi_ops->fpga_load(dma_addr, size, eemi_flags); + ret = zynqmp_pm_fpga_load(dma_addr, size, eemi_flags); dma_free_coherent(priv->dev, size, kbuf, dma_addr); 
@@ -78,13 +74,9 @@ static int zynqmp_fpga_ops_write_complete(struct fpga_manager *mgr, static enum fpga_mgr_states zynqmp_fpga_ops_state(struct fpga_manager *mgr) { - const struct zynqmp_eemi_ops *eemi_ops = zynqmp_pm_get_eemi_ops(); - u32 status; - - if (IS_ERR_OR_NULL(eemi_ops) || !eemi_ops->fpga_get_status) - return FPGA_MGR_STATE_UNKNOWN; + u32 status = 0; - eemi_ops->fpga_get_status(&status); + zynqmp_pm_fpga_get_status(&status); if (status & IXR_FPGA_DONE_MASK) return FPGA_MGR_STATE_OPERATING; diff --git a/drivers/spi/spi-zynqmp-gqspi.c b/drivers/spi/spi-zynqmp-gqspi.c index 7412a3042a8d..811c97a7c858 100644 --- a/drivers/spi/spi-zynqmp-gqspi.c +++ b/drivers/spi/spi-zynqmp-gqspi.c @@ -135,7 +135,6 @@ #define SPI_AUTOSUSPEND_TIMEOUT 3000 enum mode_type {GQSPI_MODE_IO, GQSPI_MODE_DMA}; -static const struct zynqmp_eemi_ops *eemi_ops; /** * struct zynqmp_qspi - Defines qspi driver instance @@ -1015,10 +1014,6 @@ static int zynqmp_qspi_probe(struct platform_device *pdev) struct zynqmp_qspi *xqspi; struct device *dev = &pdev->dev; - eemi_ops = zynqmp_pm_get_eemi_ops(); - if (IS_ERR(eemi_ops)) - return PTR_ERR(eemi_ops); - master = spi_alloc_master(&pdev->dev, sizeof(*xqspi)); if (!master) return -ENOMEM; diff --git a/include/linux/firmware/xlnx-zynqmp.h b/include/linux/firmware/xlnx-zynqmp.h index 11d7aef2cc3c..44ffb4f0d8be 100644 --- a/include/linux/firmware/xlnx-zynqmp.h +++ b/include/linux/firmware/xlnx-zynqmp.h @@ -293,16 +293,11 @@ struct zynqmp_pm_query_data { u32 arg3; }; -struct zynqmp_eemi_ops { - int (*fpga_load)(const u64 address, const u32 size, const u32 flags); - int (*fpga_get_status)(u32 *value); -}; int zynqmp_pm_invoke_fn(u32 pm_api_id, u32 arg0, u32 arg1, u32 arg2, u32 arg3, u32 *ret_payload); #if IS_REACHABLE(CONFIG_ZYNQMP_FIRMWARE) -const struct zynqmp_eemi_ops *zynqmp_pm_get_eemi_ops(void); int zynqmp_pm_get_api_version(u32 *version); int zynqmp_pm_get_chipid(u32 *idcode, u32 *version); int zynqmp_pm_query_data(struct zynqmp_pm_query_data qdata, u32 *out); @@ -333,6 +328,8 @@ int zynqmp_pm_set_requirement(const u32 node, const u32 capabilities, const u32 qos, const enum zynqmp_pm_request_ack ack); int zynqmp_pm_aes_engine(const u64 address, u32 *out); +int zynqmp_pm_fpga_load(const u64 address, const u32 size, const u32 flags); +int zynqmp_pm_fpga_get_status(u32 *value); #else static inline struct zynqmp_eemi_ops *zynqmp_pm_get_eemi_ops(void) { @@ -450,6 +447,15 @@ static inline int zynqmp_pm_aes_engine(const u64 address, u32 *out) { return -ENODEV; } +static inline int zynqmp_pm_fpga_load(const u64 address, const u32 size, + const u32 flags) +{ + return -ENODEV; +} +static inline int zynqmp_pm_fpga_get_status(u32 *value) +{ + return -ENODEV; +} #endif #endif /* __FIRMWARE_ZYNQMP_H__ */ -- cgit v1.2.3 From 4f680b72ea07a3e161ce1167a97ee09b1369f7de Mon Sep 17 00:00:00 2001 From: Rajan Vaja Date: Fri, 24 Apr 2020 13:58:03 -0700 Subject: firmware: xilinx: Add APIs to read/write GGS/PGGS registers Add APIs to read/write PGGS and GGS registers. 
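A usage sketch, mirroring the sysfs code added later in this series; note that the read helpers are handed a full SMC return-payload buffer and the register value comes back in word 1:

    u32 ret_payload[PAYLOAD_ARG_CNT];
    int ret;

    ret = zynqmp_pm_write_ggs(0, 0x1234ABCD);   /* write GGS0 */
    if (ret)
            return ret;

    ret = zynqmp_pm_read_ggs(0, ret_payload);   /* read GGS0 back */
    if (ret)
            return ret;
    pr_info("ggs0 = 0x%x\n", ret_payload[1]);   /* value is in word 1 */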
Signed-off-by: Rajan Vaja Signed-off-by: Jolly Shah Link: https://lore.kernel.org/r/1587761887-4279-22-git-send-email-jolly.shah@xilinx.com Signed-off-by: Greg Kroah-Hartman --- drivers/firmware/xilinx/zynqmp.c | 66 ++++++++++++++++++++++++++++++++++++ include/linux/firmware/xlnx-zynqmp.h | 24 +++++++++++++ 2 files changed, 90 insertions(+) (limited to 'include/linux') diff --git a/drivers/firmware/xilinx/zynqmp.c b/drivers/firmware/xilinx/zynqmp.c index ef7ba32e017a..3518456bf73d 100644 --- a/drivers/firmware/xilinx/zynqmp.c +++ b/drivers/firmware/xilinx/zynqmp.c @@ -617,6 +617,72 @@ int zynqmp_pm_sd_dll_reset(u32 node_id, u32 type) } EXPORT_SYMBOL_GPL(zynqmp_pm_sd_dll_reset); +/** + * zynqmp_pm_write_ggs() - PM API for writing global general storage (ggs) + * @index GGS register index + * @value Register value to be written + * + * This function writes value to GGS register. + * + * @return Returns status, either success or error+reason + */ +int zynqmp_pm_write_ggs(u32 index, u32 value) +{ + return zynqmp_pm_invoke_fn(PM_IOCTL, 0, IOCTL_WRITE_GGS, + index, value, NULL); +} +EXPORT_SYMBOL_GPL(zynqmp_pm_write_ggs); + +/** + * zynqmp_pm_read_ggs() - PM API for reading global general storage (ggs) + * @index GGS register index + * @value Register value read + * + * This function returns GGS register value. + * + * @return Returns status, either success or error+reason + */ +int zynqmp_pm_read_ggs(u32 index, u32 *value) +{ + return zynqmp_pm_invoke_fn(PM_IOCTL, 0, IOCTL_READ_GGS, + index, 0, value); +} +EXPORT_SYMBOL_GPL(zynqmp_pm_read_ggs); + +/** + * zynqmp_pm_write_pggs() - PM API for writing persistent global general + * storage (pggs) + * @index PGGS register index + * @value Register value to be written + * + * This function writes value to PGGS register. + * + * @return Returns status, either success or error+reason + */ +int zynqmp_pm_write_pggs(u32 index, u32 value) +{ + return zynqmp_pm_invoke_fn(PM_IOCTL, 0, IOCTL_WRITE_PGGS, index, value, + NULL); +} +EXPORT_SYMBOL_GPL(zynqmp_pm_write_pggs); + +/** + * zynqmp_pm_read_pggs() - PM API for reading persistent global general + * storage (pggs) + * @index PGGS register index + * @value Register value read + * + * This function returns PGGS register value.
+ * + * @return Returns status, either success or error+reason + */ +int zynqmp_pm_read_pggs(u32 index, u32 *value) +{ + return zynqmp_pm_invoke_fn(PM_IOCTL, 0, IOCTL_READ_PGGS, index, 0, + value); +} +EXPORT_SYMBOL_GPL(zynqmp_pm_read_pggs); + /** * zynqmp_pm_reset_assert - Request setting of reset (1 - assert, 0 - release) * @reset: Reset to be configured diff --git a/include/linux/firmware/xlnx-zynqmp.h b/include/linux/firmware/xlnx-zynqmp.h index 44ffb4f0d8be..e23251d93176 100644 --- a/include/linux/firmware/xlnx-zynqmp.h +++ b/include/linux/firmware/xlnx-zynqmp.h @@ -107,6 +107,10 @@ enum pm_ioctl_id { IOCTL_GET_PLL_FRAC_MODE, IOCTL_SET_PLL_FRAC_DATA, IOCTL_GET_PLL_FRAC_DATA, + IOCTL_WRITE_GGS = 12, + IOCTL_READ_GGS = 13, + IOCTL_WRITE_PGGS = 14, + IOCTL_READ_PGGS = 15, }; enum pm_query_id { @@ -330,6 +334,10 @@ int zynqmp_pm_set_requirement(const u32 node, const u32 capabilities, int zynqmp_pm_aes_engine(const u64 address, u32 *out); int zynqmp_pm_fpga_load(const u64 address, const u32 size, const u32 flags); int zynqmp_pm_fpga_get_status(u32 *value); +int zynqmp_pm_write_ggs(u32 index, u32 value); +int zynqmp_pm_read_ggs(u32 index, u32 *value); +int zynqmp_pm_write_pggs(u32 index, u32 value); +int zynqmp_pm_read_pggs(u32 index, u32 *value); #else static inline struct zynqmp_eemi_ops *zynqmp_pm_get_eemi_ops(void) { @@ -456,6 +464,22 @@ static inline int zynqmp_pm_fpga_get_status(u32 *value) { return -ENODEV; } +static inline int zynqmp_pm_write_ggs(u32 index, u32 value) +{ + return -ENODEV; +} +static inline int zynqmp_pm_read_ggs(u32 index, u32 *value) +{ + return -ENODEV; +} +static inline int zynqmp_pm_write_pggs(u32 index, u32 value) +{ + return -ENODEV; +} +static inline int zynqmp_pm_read_pggs(u32 index, u32 *value) +{ + return -ENODEV; +} #endif #endif /* __FIRMWARE_ZYNQMP_H__ */ -- cgit v1.2.3 From ae5c961da6483b413dcb36d8bf6a881ffd2fb33f Mon Sep 17 00:00:00 2001 From: Rajan Vaja Date: Fri, 24 Apr 2020 13:58:04 -0700 Subject: firmware: xilinx: Add sysfs interface Add firmware-ggs sysfs interface which provides read/write interface to global storage registers. Signed-off-by: Rajan Vaja Signed-off-by: Michal Simek Signed-off-by: Tejas Patel Signed-off-by: Jolly Shah Link: https://lore.kernel.org/r/1587761887-4279-23-git-send-email-jolly.shah@xilinx.com Signed-off-by: Greg Kroah-Hartman --- .../ABI/stable/sysfs-driver-firmware-zynqmp | 50 ++++++ drivers/firmware/xilinx/zynqmp.c | 167 ++++++++++++++++++++- include/linux/firmware/xlnx-zynqmp.h | 2 + 3 files changed, 218 insertions(+), 1 deletion(-) create mode 100644 Documentation/ABI/stable/sysfs-driver-firmware-zynqmp (limited to 'include/linux') diff --git a/Documentation/ABI/stable/sysfs-driver-firmware-zynqmp b/Documentation/ABI/stable/sysfs-driver-firmware-zynqmp new file mode 100644 index 000000000000..2e3aebd01d16 --- /dev/null +++ b/Documentation/ABI/stable/sysfs-driver-firmware-zynqmp @@ -0,0 +1,50 @@ +What: /sys/devices/platform/firmware\:zynqmp-firmware/ggs* +Date: March 2020 +KernelVersion: 5.6 +Contact: "Jolly Shah" +Description: + Read/Write PMU global general storage register value, + GLOBAL_GEN_STORAGE{0:3}. + Global general storage register that can be used + by system to pass information between masters. + + The register is reset during system or power-on + resets. Three registers are used by the FSBL and + other Xilinx software products: GLOBAL_GEN_STORAGE{4:6}. 
+ + Usage: + # cat /sys/devices/platform/firmware\:zynqmp-firmware/ggs0 + # echo > /sys/devices/platform/firmware\:zynqmp-firmware/ggs0 + + Example: + # cat /sys/devices/platform/firmware\:zynqmp-firmware/ggs0 + # echo 0x1234ABCD > /sys/devices/platform/firmware\:zynqmp-firmware/ggs0 + +Users: Xilinx + +What: /sys/devices/platform/firmware\:zynqmp-firmware/pggs* +Date: March 2020 +KernelVersion: 5.6 +Contact: "Jolly Shah" +Description: + Read/Write PMU persistent global general storage register + value, PERS_GLOB_GEN_STORAGE{0:3}. + Persistent global general storage register that + can be used by system to pass information between + masters. + + This register is only reset by the power-on reset + and maintains its value through a system reset. + Four registers are used by the FSBL and other Xilinx + software products: PERS_GLOB_GEN_STORAGE{4:7}. + Register is reset only by a POR reset. + + Usage: + # cat /sys/devices/platform/firmware\:zynqmp-firmware/pggs0 + # echo > /sys/devices/platform/firmware\:zynqmp-firmware/pggs0 + + Example: + # cat /sys/devices/platform/firmware\:zynqmp-firmware/pggs0 + # echo 0x1234ABCD > /sys/devices/platform/firmware\:zynqmp-firmware/pggs0 + +Users: Xilinx diff --git a/drivers/firmware/xilinx/zynqmp.c b/drivers/firmware/xilinx/zynqmp.c index 3518456bf73d..2fe4f57c005a 100644 --- a/drivers/firmware/xilinx/zynqmp.c +++ b/drivers/firmware/xilinx/zynqmp.c @@ -2,7 +2,7 @@ /* * Xilinx Zynq MPSoC Firmware layer * - * Copyright (C) 2014-2018 Xilinx, Inc. + * Copyright (C) 2014-2020 Xilinx, Inc. * * Michal Simek * Davorin Mista @@ -875,6 +875,170 @@ int zynqmp_pm_aes_engine(const u64 address, u32 *out) } EXPORT_SYMBOL_GPL(zynqmp_pm_aes_engine); +static ssize_t ggs_show(struct device *device, + struct device_attribute *attr, + char *buf, + u32 reg) +{ + int ret; + u32 ret_payload[PAYLOAD_ARG_CNT]; + + ret = zynqmp_pm_read_ggs(reg, ret_payload); + if (ret) + return ret; + + return sprintf(buf, "0x%x\n", ret_payload[1]); +} + +static ssize_t ggs_store(struct device *device, + struct device_attribute *attr, + const char *buf, size_t count, + u32 reg) +{ + long value; + int ret; + + if (reg >= GSS_NUM_REGS) + return -EINVAL; + + ret = kstrtol(buf, 16, &value); + if (ret) { + count = -EFAULT; + goto err; + } + + ret = zynqmp_pm_write_ggs(reg, value); + if (ret) + count = -EFAULT; + +err: + return count; +} + +/* GGS register show functions */ +#define GGS0_SHOW(N) \ + ssize_t ggs##N##_show(struct device *device, \ + struct device_attribute *attr, \ + char *buf) \ + { \ + return ggs_show(device, attr, buf, N); \ + } + +static GGS0_SHOW(0); +static GGS0_SHOW(1); +static GGS0_SHOW(2); +static GGS0_SHOW(3); + +/* GGS register store function */ +#define GGS0_STORE(N) \ + ssize_t ggs##N##_store(struct device *device, \ + struct device_attribute *attr, \ + const char *buf, \ + size_t count) \ + { \ + return ggs_store(device, attr, buf, count, N); \ + } + +static GGS0_STORE(0); +static GGS0_STORE(1); +static GGS0_STORE(2); +static GGS0_STORE(3); + +static ssize_t pggs_show(struct device *device, + struct device_attribute *attr, + char *buf, + u32 reg) +{ + int ret; + u32 ret_payload[PAYLOAD_ARG_CNT]; + + ret = zynqmp_pm_read_pggs(reg, ret_payload); + if (ret) + return ret; + + return sprintf(buf, "0x%x\n", ret_payload[1]); +} + +static ssize_t pggs_store(struct device *device, + struct device_attribute *attr, + const char *buf, size_t count, + u32 reg) +{ + long value; + int ret; + + if (reg >= GSS_NUM_REGS) + return -EINVAL; + + ret = kstrtol(buf, 16, &value); + if (ret) { + count 
= -EFAULT; + goto err; + } + + ret = zynqmp_pm_write_pggs(reg, value); + if (ret) + count = -EFAULT; + +err: + return count; +} + +#define PGGS0_SHOW(N) \ + ssize_t pggs##N##_show(struct device *device, \ + struct device_attribute *attr, \ + char *buf) \ + { \ + return pggs_show(device, attr, buf, N); \ + } + +#define PGGS0_STORE(N) \ + ssize_t pggs##N##_store(struct device *device, \ + struct device_attribute *attr, \ + const char *buf, \ + size_t count) \ + { \ + return pggs_store(device, attr, buf, count, N); \ + } + +/* PGGS register show functions */ +static PGGS0_SHOW(0); +static PGGS0_SHOW(1); +static PGGS0_SHOW(2); +static PGGS0_SHOW(3); + +/* PGGS register store functions */ +static PGGS0_STORE(0); +static PGGS0_STORE(1); +static PGGS0_STORE(2); +static PGGS0_STORE(3); + +/* GGS register attributes */ +static DEVICE_ATTR_RW(ggs0); +static DEVICE_ATTR_RW(ggs1); +static DEVICE_ATTR_RW(ggs2); +static DEVICE_ATTR_RW(ggs3); + +/* PGGS register attributes */ +static DEVICE_ATTR_RW(pggs0); +static DEVICE_ATTR_RW(pggs1); +static DEVICE_ATTR_RW(pggs2); +static DEVICE_ATTR_RW(pggs3); + +static struct attribute *zynqmp_firmware_attrs[] = { + &dev_attr_ggs0.attr, + &dev_attr_ggs1.attr, + &dev_attr_ggs2.attr, + &dev_attr_ggs3.attr, + &dev_attr_pggs0.attr, + &dev_attr_pggs1.attr, + &dev_attr_pggs2.attr, + &dev_attr_pggs3.attr, + NULL, +}; + +ATTRIBUTE_GROUPS(zynqmp_firmware); + static int zynqmp_firmware_probe(struct platform_device *pdev) { struct device *dev = &pdev->dev; @@ -952,6 +1116,7 @@ static struct platform_driver zynqmp_firmware_driver = { .driver = { .name = "zynqmp_firmware", .of_match_table = zynqmp_firmware_of_match, + .dev_groups = zynqmp_firmware_groups, }, .probe = zynqmp_firmware_probe, .remove = zynqmp_firmware_remove, diff --git a/include/linux/firmware/xlnx-zynqmp.h b/include/linux/firmware/xlnx-zynqmp.h index e23251d93176..c1356e93dcb2 100644 --- a/include/linux/firmware/xlnx-zynqmp.h +++ b/include/linux/firmware/xlnx-zynqmp.h @@ -42,6 +42,8 @@ #define ZYNQMP_PM_MAX_QOS 100U +#define GSS_NUM_REGS (4) + /* Node capabilities */ #define ZYNQMP_PM_CAPABILITY_ACCESS 0x1U #define ZYNQMP_PM_CAPABILITY_CONTEXT 0x2U -- cgit v1.2.3 From fdd2ed88ca971376669bce145f1a1bcf028d775e Mon Sep 17 00:00:00 2001 From: Rajan Vaja Date: Fri, 24 Apr 2020 13:58:05 -0700 Subject: firmware: xilinx: Add system shutdown API interface Add system shutdown API interface which asks firmware to perform system shutdown/restart. Signed-off-by: Rajan Vaja Signed-off-by: Michal Simek Signed-off-by: Jolly Shah Link: https://lore.kernel.org/r/1587761887-4279-24-git-send-email-jolly.shah@xilinx.com Signed-off-by: Greg Kroah-Hartman --- drivers/firmware/xilinx/zynqmp.c | 13 +++++++++++++ include/linux/firmware/xlnx-zynqmp.h | 6 ++++++ 2 files changed, 19 insertions(+) (limited to 'include/linux') diff --git a/drivers/firmware/xilinx/zynqmp.c b/drivers/firmware/xilinx/zynqmp.c index 2fe4f57c005a..9ba376cfc81f 100644 --- a/drivers/firmware/xilinx/zynqmp.c +++ b/drivers/firmware/xilinx/zynqmp.c @@ -875,6 +875,19 @@ int zynqmp_pm_aes_engine(const u64 address, u32 *out) } EXPORT_SYMBOL_GPL(zynqmp_pm_aes_engine); +/** + * zynqmp_pm_system_shutdown - PM call to request a system shutdown or restart + * @type: Shutdown or restart? 
0 for shutdown, 1 for restart + * @subtype: Specifies which system should be restarted or shut down + * + * Return: Returns status, either success or error+reason + */ +int zynqmp_pm_system_shutdown(const u32 type, const u32 subtype) +{ + return zynqmp_pm_invoke_fn(PM_SYSTEM_SHUTDOWN, type, subtype, + 0, 0, NULL); +} + static ssize_t ggs_show(struct device *device, struct device_attribute *attr, char *buf, diff --git a/include/linux/firmware/xlnx-zynqmp.h b/include/linux/firmware/xlnx-zynqmp.h index c1356e93dcb2..2254c7cc3362 100644 --- a/include/linux/firmware/xlnx-zynqmp.h +++ b/include/linux/firmware/xlnx-zynqmp.h @@ -64,6 +64,7 @@ enum pm_api_id { PM_GET_API_VERSION = 1, + PM_SYSTEM_SHUTDOWN = 12, PM_REQUEST_NODE = 13, PM_RELEASE_NODE, PM_SET_REQUIREMENT, @@ -340,6 +341,7 @@ int zynqmp_pm_write_ggs(u32 index, u32 value); int zynqmp_pm_read_ggs(u32 index, u32 *value); int zynqmp_pm_write_pggs(u32 index, u32 value); int zynqmp_pm_read_pggs(u32 index, u32 *value); +int zynqmp_pm_system_shutdown(const u32 type, const u32 subtype); #else static inline struct zynqmp_eemi_ops *zynqmp_pm_get_eemi_ops(void) { @@ -482,6 +484,10 @@ static inline int zynqmp_pm_read_pggs(u32 index, u32 *value) { return -ENODEV; } +static inline int zynqmp_pm_system_shutdown(const u32 type, const u32 subtype) +{ + return -ENODEV; +} #endif #endif /* __FIRMWARE_ZYNQMP_H__ */ -- cgit v1.2.3 From b3ae24c44848c6403fb2333d7cbe494565058352 Mon Sep 17 00:00:00 2001 From: Rajan Vaja Date: Fri, 24 Apr 2020 13:58:06 -0700 Subject: firmware: xilinx: Add sysfs to set shutdown scope The Linux shutdown functionality implemented via PSCI system_off does not include an option to set a scope, i.e. which parts of the system to shut down. This patch creates sysfs that allows to set the shutdown scope for the next shutdown request. When the next shutdown is performed, the platform specific portion of PSCI-system_off can use the chosen shutdown scope. Signed-off-by: Rajan Vaja Signed-off-by: Stefan Krsmanovic Signed-off-by: Michal Simek Signed-off-by: Tejas Patel Signed-off-by: Jolly Shah Link: https://lore.kernel.org/r/1587761887-4279-25-git-send-email-jolly.shah@xilinx.com Signed-off-by: Greg Kroah-Hartman --- .../ABI/stable/sysfs-driver-firmware-zynqmp | 32 +++++++ drivers/firmware/xilinx/zynqmp.c | 98 +++++++++++++++++++++- include/linux/firmware/xlnx-zynqmp.h | 12 +++ 3 files changed, 141 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/Documentation/ABI/stable/sysfs-driver-firmware-zynqmp b/Documentation/ABI/stable/sysfs-driver-firmware-zynqmp index 2e3aebd01d16..554f30c3b398 100644 --- a/Documentation/ABI/stable/sysfs-driver-firmware-zynqmp +++ b/Documentation/ABI/stable/sysfs-driver-firmware-zynqmp @@ -48,3 +48,35 @@ Description: # echo 0x1234ABCD > /sys/devices/platform/firmware\:zynqmp-firmware/pggs0 Users: Xilinx + +What: /sys/devices/platform/firmware\:zynqmp-firmware/shutdown_scope +Date: March 2020 +KernelVersion: 5.6 +Contact: "Jolly Shah" +Description: + This sysfs interface allows to set the shutdown scope for the + next shutdown request. When the next shutdown is performed, the + platform specific portion of PSCI-system_off can use the chosen + shutdown scope. + + Following are available shutdown scopes(subtypes): + + subsystem: Only the APU along with all of its peripherals + not used by other processing units will be + shut down. This may result in the FPD power + domain being shut down provided that no other + processing unit uses FPD peripherals or DRAM. 
+ ps_only: The complete PS will be shut down, including the + RPU, PMU, etc. Only the PL domain (FPGA) + remains untouched. + system: The complete system/device is shut down. + + Usage: + # cat /sys/devices/platform/firmware\:zynqmp-firmware/shutdown_scope + # echo > /sys/devices/platform/firmware\:zynqmp-firmware/shutdown_scope + + Example: + # cat /sys/devices/platform/firmware\:zynqmp-firmware/shutdown_scope + # echo "subsystem" > /sys/devices/platform/firmware\:zynqmp-firmware/shutdown_scope + +Users: Xilinx diff --git a/drivers/firmware/xilinx/zynqmp.c b/drivers/firmware/xilinx/zynqmp.c index 9ba376cfc81f..8d3661877670 100644 --- a/drivers/firmware/xilinx/zynqmp.c +++ b/drivers/firmware/xilinx/zynqmp.c @@ -888,6 +888,102 @@ int zynqmp_pm_system_shutdown(const u32 type, const u32 subtype) 0, 0, NULL); } +/** + * struct zynqmp_pm_shutdown_scope - Struct for shutdown scope + * @subtype: Shutdown subtype + * @name: Matching string for scope argument + * + * This struct encapsulates mapping between shutdown scope ID and string. + */ +struct zynqmp_pm_shutdown_scope { + const enum zynqmp_pm_shutdown_subtype subtype; + const char *name; +}; + +static struct zynqmp_pm_shutdown_scope shutdown_scopes[] = { + [ZYNQMP_PM_SHUTDOWN_SUBTYPE_SUBSYSTEM] = { + .subtype = ZYNQMP_PM_SHUTDOWN_SUBTYPE_SUBSYSTEM, + .name = "subsystem", + }, + [ZYNQMP_PM_SHUTDOWN_SUBTYPE_PS_ONLY] = { + .subtype = ZYNQMP_PM_SHUTDOWN_SUBTYPE_PS_ONLY, + .name = "ps_only", + }, + [ZYNQMP_PM_SHUTDOWN_SUBTYPE_SYSTEM] = { + .subtype = ZYNQMP_PM_SHUTDOWN_SUBTYPE_SYSTEM, + .name = "system", + }, +}; + +static struct zynqmp_pm_shutdown_scope *selected_scope = + &shutdown_scopes[ZYNQMP_PM_SHUTDOWN_SUBTYPE_SYSTEM]; + +/** + * zynqmp_pm_is_shutdown_scope_valid - Check if shutdown scope string is valid + * @scope_string: Shutdown scope string + * + * Return: Return pointer to matching shutdown scope struct from + * array of available options in system if string is valid, + * otherwise returns NULL. 
+ */ +static struct zynqmp_pm_shutdown_scope* + zynqmp_pm_is_shutdown_scope_valid(const char *scope_string) +{ + int count; + + for (count = 0; count < ARRAY_SIZE(shutdown_scopes); count++) + if (sysfs_streq(scope_string, shutdown_scopes[count].name)) + return &shutdown_scopes[count]; + + return NULL; +} + +static ssize_t shutdown_scope_show(struct device *device, + struct device_attribute *attr, + char *buf) +{ + int i; + + for (i = 0; i < ARRAY_SIZE(shutdown_scopes); i++) { + if (&shutdown_scopes[i] == selected_scope) { + strcat(buf, "["); + strcat(buf, shutdown_scopes[i].name); + strcat(buf, "]"); + } else { + strcat(buf, shutdown_scopes[i].name); + } + strcat(buf, " "); + } + strcat(buf, "\n"); + + return strlen(buf); +} + +static ssize_t shutdown_scope_store(struct device *device, + struct device_attribute *attr, + const char *buf, size_t count) +{ + int ret; + struct zynqmp_pm_shutdown_scope *scope; + + scope = zynqmp_pm_is_shutdown_scope_valid(buf); + if (!scope) + return -EINVAL; + + ret = zynqmp_pm_system_shutdown(ZYNQMP_PM_SHUTDOWN_TYPE_SETSCOPE_ONLY, + scope->subtype); + if (ret) { + pr_err("unable to set shutdown scope %s\n", buf); + return ret; + } + + selected_scope = scope; + + return count; +} + +static DEVICE_ATTR_RW(shutdown_scope); + static ssize_t ggs_show(struct device *device, struct device_attribute *attr, char *buf, @@ -923,7 +1019,6 @@ static ssize_t ggs_store(struct device *device, ret = zynqmp_pm_write_ggs(reg, value); if (ret) count = -EFAULT; - err: return count; } @@ -1047,6 +1142,7 @@ static struct attribute *zynqmp_firmware_attrs[] = { &dev_attr_pggs1.attr, &dev_attr_pggs2.attr, &dev_attr_pggs3.attr, + &dev_attr_shutdown_scope.attr, NULL, }; diff --git a/include/linux/firmware/xlnx-zynqmp.h b/include/linux/firmware/xlnx-zynqmp.h index 2254c7cc3362..c297333677d4 100644 --- a/include/linux/firmware/xlnx-zynqmp.h +++ b/include/linux/firmware/xlnx-zynqmp.h @@ -286,6 +286,18 @@ enum dll_reset_type { PM_DLL_RESET_PULSE, }; +enum zynqmp_pm_shutdown_type { + ZYNQMP_PM_SHUTDOWN_TYPE_SHUTDOWN, + ZYNQMP_PM_SHUTDOWN_TYPE_RESET, + ZYNQMP_PM_SHUTDOWN_TYPE_SETSCOPE_ONLY, +}; + +enum zynqmp_pm_shutdown_subtype { + ZYNQMP_PM_SHUTDOWN_SUBTYPE_SUBSYSTEM, + ZYNQMP_PM_SHUTDOWN_SUBTYPE_PS_ONLY, + ZYNQMP_PM_SHUTDOWN_SUBTYPE_SYSTEM, +}; + /** * struct zynqmp_pm_query_data - PM query data * @qid: query ID -- cgit v1.2.3 From a2cc220a9a9227b2c5c9b39f57340bce64f040fd Mon Sep 17 00:00:00 2001 From: Rajan Vaja Date: Fri, 24 Apr 2020 13:58:07 -0700 Subject: firmware: xilinx: Add sysfs and API to set boot health status Add sysfs interface to set boot health status from user space. Add API used by this interface to communicate with firmware. If PMUFW is compiled with CHECK_HEALTHY_BOOT, it will check the healthy bit on FPD WDT expiration. If healthy bit is set by a user application running in Linux, PMUFW will do APU only restart. If healthy bit is not set during FPD WDT expiration, PMUFW will do system restart. 
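For illustration, the kernel-side call added below is a one-liner; a hypothetical consumer that wants an APU-only restart on watchdog expiry would do something like this (the device pointer and placement are assumptions):

    /* Declare this boot healthy so an FPD WDT timeout triggers an
     * APU-only restart instead of a full system restart.
     */
    ret = zynqmp_pm_set_boot_health_status(1);
    if (ret)
            dev_warn(dev, "failed to set healthy bit: %d\n", ret);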
Signed-off-by: Rajan Vaja Signed-off-by: Michal Simek Signed-off-by: Tejas Patel Signed-off-by: Jolly Shah Link: https://lore.kernel.org/r/1587761887-4279-26-git-send-email-jolly.shah@xilinx.com Signed-off-by: Greg Kroah-Hartman --- .../ABI/stable/sysfs-driver-firmware-zynqmp | 21 ++++++++++++ drivers/firmware/xilinx/zynqmp.c | 39 ++++++++++++++++++++++ include/linux/firmware/xlnx-zynqmp.h | 7 ++++ 3 files changed, 67 insertions(+) (limited to 'include/linux') diff --git a/Documentation/ABI/stable/sysfs-driver-firmware-zynqmp b/Documentation/ABI/stable/sysfs-driver-firmware-zynqmp index 554f30c3b398..00fa04c76ff3 100644 --- a/Documentation/ABI/stable/sysfs-driver-firmware-zynqmp +++ b/Documentation/ABI/stable/sysfs-driver-firmware-zynqmp @@ -80,3 +80,24 @@ Description: # echo "subsystem" > /sys/devices/platform/firmware\:zynqmp-firmware/shutdown_scope Users: Xilinx + +What: /sys/devices/platform/firmware\:zynqmp-firmware/health_status +Date: March 2020 +KernelVersion: 5.6 +Contact: "Jolly Shah" +Description: + This sysfs interface allows to set the health status. If PMUFW + is compiled with CHECK_HEALTHY_BOOT, it will check the healthy + bit on FPD WDT expiration. If healthy bit is set by a user + application running in Linux, PMUFW will do APU only restart. If + healthy bit is not set during FPD WDT expiration, PMUFW will do + system restart. + + Usage: + Set healthy bit + # echo 1 > /sys/devices/platform/firmware\:zynqmp-firmware/health_status + + Unset healthy bit + # echo 0 > /sys/devices/platform/firmware\:zynqmp-firmware/health_status + +Users: Xilinx diff --git a/drivers/firmware/xilinx/zynqmp.c b/drivers/firmware/xilinx/zynqmp.c index 8d3661877670..bfaf29a58eac 100644 --- a/drivers/firmware/xilinx/zynqmp.c +++ b/drivers/firmware/xilinx/zynqmp.c @@ -683,6 +683,21 @@ int zynqmp_pm_read_pggs(u32 index, u32 *value) } EXPORT_SYMBOL_GPL(zynqmp_pm_read_pggs); +/** + * zynqmp_pm_set_boot_health_status() - PM API for setting healthy boot status + * @value Status value to be written + * + * This function sets healthy bit value to indicate boot health status + * to firmware. 
+ * + * @return Returns status, either success or error+reason + */ +int zynqmp_pm_set_boot_health_status(u32 value) +{ + return zynqmp_pm_invoke_fn(PM_IOCTL, 0, IOCTL_SET_BOOT_HEALTH_STATUS, + value, 0, NULL); +} + /** * zynqmp_pm_reset_assert - Request setting of reset (1 - assert, 0 - release) * @reset: Reset to be configured @@ -984,6 +999,29 @@ static ssize_t shutdown_scope_store(struct device *device, static DEVICE_ATTR_RW(shutdown_scope); +static ssize_t health_status_store(struct device *device, + struct device_attribute *attr, + const char *buf, size_t count) +{ + int ret; + unsigned int value; + + ret = kstrtouint(buf, 10, &value); + if (ret) + return ret; + + ret = zynqmp_pm_set_boot_health_status(value); + if (ret) { + dev_err(device, "unable to set healthy bit value to %u\n", + value); + return ret; + } + + return count; +} + +static DEVICE_ATTR_WO(health_status); + static ssize_t ggs_show(struct device *device, struct device_attribute *attr, char *buf, @@ -1143,6 +1181,7 @@ static struct attribute *zynqmp_firmware_attrs[] = { &dev_attr_pggs2.attr, &dev_attr_pggs3.attr, &dev_attr_shutdown_scope.attr, + &dev_attr_health_status.attr, NULL, }; diff --git a/include/linux/firmware/xlnx-zynqmp.h b/include/linux/firmware/xlnx-zynqmp.h index c297333677d4..5968df82b991 100644 --- a/include/linux/firmware/xlnx-zynqmp.h +++ b/include/linux/firmware/xlnx-zynqmp.h @@ -114,6 +114,8 @@ enum pm_ioctl_id { IOCTL_READ_GGS = 13, IOCTL_WRITE_PGGS = 14, IOCTL_READ_PGGS = 15, + /* Set healthy bit value */ + IOCTL_SET_BOOT_HEALTH_STATUS = 17, }; enum pm_query_id { @@ -354,6 +356,7 @@ int zynqmp_pm_read_ggs(u32 index, u32 *value); int zynqmp_pm_write_pggs(u32 index, u32 value); int zynqmp_pm_read_pggs(u32 index, u32 *value); int zynqmp_pm_system_shutdown(const u32 type, const u32 subtype); +int zynqmp_pm_set_boot_health_status(u32 value); #else static inline struct zynqmp_eemi_ops *zynqmp_pm_get_eemi_ops(void) { @@ -500,6 +503,10 @@ static inline int zynqmp_pm_system_shutdown(const u32 type, const u32 subtype) { return -ENODEV; } +static inline int zynqmp_pm_set_boot_health_status(u32 value) +{ + return -ENODEV; +} #endif #endif /* __FIRMWARE_ZYNQMP_H__ */ -- cgit v1.2.3 From d65dbedfd298344747033f17c1efd2afc8082bc7 Mon Sep 17 00:00:00 2001 From: Huy Nguyen Date: Fri, 24 Apr 2020 12:45:02 -0700 Subject: net/mlx5: Add support for COPY steering action Add COPY type to modify_header action. IPsec feature is the first feature that needs COPY steering action. 
Signed-off-by: Huy Nguyen Signed-off-by: Raed Salem Signed-off-by: Saeed Mahameed Acked-by: Leon Romanovsky --- drivers/infiniband/hw/mlx5/flow.c | 4 ++-- drivers/net/ethernet/mellanox/mlx5/core/en/tc_ct.c | 2 +- drivers/net/ethernet/mellanox/mlx5/core/en_tc.c | 6 +++--- drivers/net/ethernet/mellanox/mlx5/core/esw/chains.c | 2 +- drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads.c | 4 ++-- drivers/net/ethernet/mellanox/mlx5/core/fs_cmd.c | 2 +- drivers/net/ethernet/mellanox/mlx5/core/steering/fs_dr.c | 2 +- include/linux/mlx5/mlx5_ifc.h | 8 ++++---- 8 files changed, 15 insertions(+), 15 deletions(-) (limited to 'include/linux') diff --git a/drivers/infiniband/hw/mlx5/flow.c b/drivers/infiniband/hw/mlx5/flow.c index 862b7bf3e646..69cb7e6e8955 100644 --- a/drivers/infiniband/hw/mlx5/flow.c +++ b/drivers/infiniband/hw/mlx5/flow.c @@ -427,7 +427,7 @@ static int UVERBS_HANDLER(MLX5_IB_METHOD_FLOW_ACTION_CREATE_MODIFY_HEADER)( num_actions = uverbs_attr_ptr_get_array_size( attrs, MLX5_IB_ATTR_CREATE_MODIFY_HEADER_ACTIONS_PRM, - MLX5_UN_SZ_BYTES(set_action_in_add_action_in_auto)); + MLX5_UN_SZ_BYTES(set_add_copy_action_in_auto)); if (num_actions < 0) return num_actions; @@ -648,7 +648,7 @@ DECLARE_UVERBS_NAMED_METHOD( UA_MANDATORY), UVERBS_ATTR_PTR_IN(MLX5_IB_ATTR_CREATE_MODIFY_HEADER_ACTIONS_PRM, UVERBS_ATTR_MIN_SIZE(MLX5_UN_SZ_BYTES( - set_action_in_add_action_in_auto)), + set_add_copy_action_in_auto)), UA_MANDATORY, UA_ALLOC_AND_COPY), UVERBS_ATTR_CONST_IN(MLX5_IB_ATTR_CREATE_MODIFY_HEADER_FT_TYPE, diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en/tc_ct.c b/drivers/net/ethernet/mellanox/mlx5/core/en/tc_ct.c index ad3e3a65d403..91464f70a3fa 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en/tc_ct.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en/tc_ct.c @@ -385,7 +385,7 @@ mlx5_tc_ct_entry_create_nat(struct mlx5_tc_ct_priv *ct_priv, char *modact; int err, i; - action_size = MLX5_UN_SZ_BYTES(set_action_in_add_action_in_auto); + action_size = MLX5_UN_SZ_BYTES(set_add_copy_action_in_auto); flow_action_for_each(i, act, flow_action) { switch (act->id) { diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c b/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c index 88c0e460e995..12c5ca5b93ca 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c @@ -61,7 +61,7 @@ #include "lib/geneve.h" #include "diag/en_tc_tracepoint.h" -#define MLX5_MH_ACT_SZ MLX5_UN_SZ_BYTES(set_action_in_add_action_in_auto) +#define MLX5_MH_ACT_SZ MLX5_UN_SZ_BYTES(set_add_copy_action_in_auto) struct mlx5_nic_flow_attr { u32 action; @@ -2660,7 +2660,7 @@ static int offload_pedit_fields(struct mlx5e_priv *priv, set_vals = &hdrs[0].vals; add_vals = &hdrs[1].vals; - action_size = MLX5_UN_SZ_BYTES(set_action_in_add_action_in_auto); + action_size = MLX5_UN_SZ_BYTES(set_add_copy_action_in_auto); for (i = 0; i < ARRAY_SIZE(fields); i++) { bool skip; @@ -2793,7 +2793,7 @@ int alloc_mod_hdr_actions(struct mlx5_core_dev *mdev, if (mod_hdr_acts->num_actions < mod_hdr_acts->max_actions) return 0; - action_size = MLX5_UN_SZ_BYTES(set_action_in_add_action_in_auto); + action_size = MLX5_UN_SZ_BYTES(set_add_copy_action_in_auto); max_hw_actions = mlx5e_flow_namespace_max_modify_action(mdev, namespace); diff --git a/drivers/net/ethernet/mellanox/mlx5/core/esw/chains.c b/drivers/net/ethernet/mellanox/mlx5/core/esw/chains.c index 029001040737..d5bf908dfecd 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/esw/chains.c +++ 
b/drivers/net/ethernet/mellanox/mlx5/core/esw/chains.c @@ -274,7 +274,7 @@ mlx5_esw_chains_destroy_fdb_table(struct mlx5_eswitch *esw, static int create_fdb_chain_restore(struct fdb_chain *fdb_chain) { - char modact[MLX5_UN_SZ_BYTES(set_action_in_add_action_in_auto)]; + char modact[MLX5_UN_SZ_BYTES(set_add_copy_action_in_auto)]; struct mlx5_eswitch *esw = fdb_chain->esw; struct mlx5_modify_hdr *mod_hdr; u32 index; diff --git a/drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads.c b/drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads.c index dc098bb58973..703f307c5967 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads.c @@ -1490,7 +1490,7 @@ static void esw_destroy_restore_table(struct mlx5_eswitch *esw) static int esw_create_restore_table(struct mlx5_eswitch *esw) { - u8 modact[MLX5_UN_SZ_BYTES(set_action_in_add_action_in_auto)] = {}; + u8 modact[MLX5_UN_SZ_BYTES(set_add_copy_action_in_auto)] = {}; int inlen = MLX5_ST_SZ_BYTES(create_flow_group_in); struct mlx5_flow_table_attr ft_attr = {}; struct mlx5_core_dev *dev = esw->dev; @@ -1900,7 +1900,7 @@ static int esw_vport_ingress_prio_tag_config(struct mlx5_eswitch *esw, static int esw_vport_add_ingress_acl_modify_metadata(struct mlx5_eswitch *esw, struct mlx5_vport *vport) { - u8 action[MLX5_UN_SZ_BYTES(set_action_in_add_action_in_auto)] = {}; + u8 action[MLX5_UN_SZ_BYTES(set_add_copy_action_in_auto)] = {}; struct mlx5_flow_act flow_act = {}; int err = 0; u32 key; diff --git a/drivers/net/ethernet/mellanox/mlx5/core/fs_cmd.c b/drivers/net/ethernet/mellanox/mlx5/core/fs_cmd.c index 304d1e4f0541..1a8e826ac86b 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/fs_cmd.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/fs_cmd.c @@ -791,7 +791,7 @@ static int mlx5_cmd_modify_header_alloc(struct mlx5_flow_root_namespace *ns, return -EOPNOTSUPP; } - actions_size = MLX5_UN_SZ_BYTES(set_action_in_add_action_in_auto) * num_actions; + actions_size = MLX5_UN_SZ_BYTES(set_add_copy_action_in_auto) * num_actions; inlen = MLX5_ST_SZ_BYTES(alloc_modify_header_context_in) + actions_size; in = kzalloc(inlen, GFP_KERNEL); diff --git a/drivers/net/ethernet/mellanox/mlx5/core/steering/fs_dr.c b/drivers/net/ethernet/mellanox/mlx5/core/steering/fs_dr.c index 3b3f5b9d4f95..8887b2440c7d 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/steering/fs_dr.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/steering/fs_dr.c @@ -576,7 +576,7 @@ static int mlx5_cmd_dr_modify_header_alloc(struct mlx5_flow_root_namespace *ns, struct mlx5dr_action *action; size_t actions_sz; - actions_sz = MLX5_UN_SZ_BYTES(set_action_in_add_action_in_auto) * + actions_sz = MLX5_UN_SZ_BYTES(set_add_copy_action_in_auto) * num_actions; action = mlx5dr_action_create_modify_header(dr_domain, 0, actions_sz, diff --git a/include/linux/mlx5/mlx5_ifc.h b/include/linux/mlx5/mlx5_ifc.h index 6fa24918eade..3ad2c51ccde9 100644 --- a/include/linux/mlx5/mlx5_ifc.h +++ b/include/linux/mlx5/mlx5_ifc.h @@ -5670,9 +5670,9 @@ struct mlx5_ifc_copy_action_in_bits { u8 reserved_at_38[0x8]; }; -union mlx5_ifc_set_action_in_add_action_in_auto_bits { - struct mlx5_ifc_set_action_in_bits set_action_in; - struct mlx5_ifc_add_action_in_bits add_action_in; +union mlx5_ifc_set_add_copy_action_in_auto_bits { + struct mlx5_ifc_set_action_in_bits set_action_in; + struct mlx5_ifc_add_action_in_bits add_action_in; struct mlx5_ifc_copy_action_in_bits copy_action_in; u8 reserved_at_0[0x40]; }; @@ -5746,7 +5746,7 @@ struct 
mlx5_ifc_alloc_modify_header_context_in_bits { u8 reserved_at_68[0x10]; u8 num_of_actions[0x8]; - union mlx5_ifc_set_action_in_add_action_in_auto_bits actions[0]; + union mlx5_ifc_set_add_copy_action_in_auto_bits actions[0]; }; struct mlx5_ifc_dealloc_modify_header_context_out_bits { -- cgit v1.2.3 From 2b58f6d9df50f534fe465113b69de60a2ef0e74a Mon Sep 17 00:00:00 2001 From: Raed Salem Date: Fri, 24 Apr 2020 12:45:03 -0700 Subject: net/mlx5: Introduce IPsec Connect-X offload hardware bits and structures Add IPsec offload related IFC structs, layouts and enumerations. Signed-off-by: Raed Salem Reviewed-by: Tariq Toukan Signed-off-by: Saeed Mahameed --- include/linux/mlx5/device.h | 4 +++ include/linux/mlx5/mlx5_ifc.h | 78 +++++++++++++++++++++++++++++++++++++++++-- 2 files changed, 79 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mlx5/device.h b/include/linux/mlx5/device.h index 2b90097a6cf9..7b57877e501e 100644 --- a/include/linux/mlx5/device.h +++ b/include/linux/mlx5/device.h @@ -1107,6 +1107,7 @@ enum mlx5_cap_type { MLX5_CAP_TLS, MLX5_CAP_VDPA_EMULATION = 0x13, MLX5_CAP_DEV_EVENT = 0x14, + MLX5_CAP_IPSEC, /* NUM OF CAP Types */ MLX5_CAP_NUM }; @@ -1324,6 +1325,9 @@ enum mlx5_qcam_feature_groups { MLX5_GET64(device_virtio_emulation_cap, \ (mdev)->caps.hca_cur[MLX5_CAP_VDPA_EMULATION], cap) +#define MLX5_CAP_IPSEC(mdev, cap)\ + MLX5_GET(ipsec_cap, (mdev)->caps.hca_cur[MLX5_CAP_IPSEC], cap) + enum { MLX5_CMD_STAT_OK = 0x0, MLX5_CMD_STAT_INT_ERR = 0x1, diff --git a/include/linux/mlx5/mlx5_ifc.h b/include/linux/mlx5/mlx5_ifc.h index 3ad2c51ccde9..cf971d341189 100644 --- a/include/linux/mlx5/mlx5_ifc.h +++ b/include/linux/mlx5/mlx5_ifc.h @@ -886,7 +886,8 @@ struct mlx5_ifc_per_protocol_networking_offload_caps_bits { u8 tunnel_stateless_vxlan_gpe[0x1]; u8 tunnel_stateless_ipv4_over_vxlan[0x1]; u8 tunnel_stateless_ip_over_ip[0x1]; - u8 reserved_at_2a[0x6]; + u8 insert_trailer[0x1]; + u8 reserved_at_2b[0x5]; u8 max_vxlan_udp_ports[0x8]; u8 reserved_at_38[0x6]; u8 max_geneve_opt_len[0x1]; @@ -1100,6 +1101,23 @@ struct mlx5_ifc_tls_cap_bits { u8 reserved_at_20[0x7e0]; }; +struct mlx5_ifc_ipsec_cap_bits { + u8 ipsec_full_offload[0x1]; + u8 ipsec_crypto_offload[0x1]; + u8 ipsec_esn[0x1]; + u8 ipsec_crypto_esp_aes_gcm_256_encrypt[0x1]; + u8 ipsec_crypto_esp_aes_gcm_128_encrypt[0x1]; + u8 ipsec_crypto_esp_aes_gcm_256_decrypt[0x1]; + u8 ipsec_crypto_esp_aes_gcm_128_decrypt[0x1]; + u8 reserved_at_7[0x4]; + u8 log_max_ipsec_offload[0x5]; + u8 reserved_at_10[0x10]; + + u8 min_log_ipsec_full_replay_window[0x8]; + u8 max_log_ipsec_full_replay_window[0x8]; + u8 reserved_at_30[0x7d0]; +}; + enum { MLX5_WQ_TYPE_LINKED_LIST = 0x0, MLX5_WQ_TYPE_CYCLIC = 0x1, @@ -1464,7 +1482,8 @@ struct mlx5_ifc_cmd_hca_cap_bits { u8 reserved_at_460[0x3]; u8 log_max_uctx[0x5]; - u8 reserved_at_468[0x3]; + u8 reserved_at_468[0x2]; + u8 ipsec_offload[0x1]; u8 log_max_umem[0x5]; u8 max_num_eqs[0x10]; @@ -4143,7 +4162,8 @@ enum { MLX5_SET_FTE_MODIFY_ENABLE_MASK_ACTION = 0x0, MLX5_SET_FTE_MODIFY_ENABLE_MASK_FLOW_TAG = 0x1, MLX5_SET_FTE_MODIFY_ENABLE_MASK_DESTINATION_LIST = 0x2, - MLX5_SET_FTE_MODIFY_ENABLE_MASK_FLOW_COUNTERS = 0x3 + MLX5_SET_FTE_MODIFY_ENABLE_MASK_FLOW_COUNTERS = 0x3, + MLX5_SET_FTE_MODIFY_ENABLE_MASK_IPSEC_OBJ_ID = 0x4 }; struct mlx5_ifc_set_fte_out_bits { @@ -10468,10 +10488,62 @@ struct mlx5_ifc_affiliated_event_header_bits { enum { MLX5_HCA_CAP_GENERAL_OBJECT_TYPES_ENCRYPTION_KEY = BIT(0xc), + MLX5_HCA_CAP_GENERAL_OBJECT_TYPES_IPSEC = BIT(0x13), }; enum { 
MLX5_GENERAL_OBJECT_TYPES_ENCRYPTION_KEY = 0xc, + MLX5_GENERAL_OBJECT_TYPES_IPSEC = 0x13, +}; + +enum { + MLX5_IPSEC_OBJECT_ICV_LEN_16B, + MLX5_IPSEC_OBJECT_ICV_LEN_12B, + MLX5_IPSEC_OBJECT_ICV_LEN_8B, +}; + +struct mlx5_ifc_ipsec_obj_bits { + u8 modify_field_select[0x40]; + u8 full_offload[0x1]; + u8 reserved_at_41[0x1]; + u8 esn_en[0x1]; + u8 esn_overlap[0x1]; + u8 reserved_at_44[0x2]; + u8 icv_length[0x2]; + u8 reserved_at_48[0x4]; + u8 aso_return_reg[0x4]; + u8 reserved_at_50[0x10]; + + u8 esn_msb[0x20]; + + u8 reserved_at_80[0x8]; + u8 dekn[0x18]; + + u8 salt[0x20]; + + u8 implicit_iv[0x40]; + + u8 reserved_at_100[0x700]; +}; + +struct mlx5_ifc_create_ipsec_obj_in_bits { + struct mlx5_ifc_general_obj_in_cmd_hdr_bits general_obj_in_cmd_hdr; + struct mlx5_ifc_ipsec_obj_bits ipsec_object; +}; + +enum { + MLX5_MODIFY_IPSEC_BITMASK_ESN_OVERLAP = BIT(0), + MLX5_MODIFY_IPSEC_BITMASK_ESN_MSB = BIT(1), +}; + +struct mlx5_ifc_query_ipsec_obj_out_bits { + struct mlx5_ifc_general_obj_out_cmd_hdr_bits general_obj_out_cmd_hdr; + struct mlx5_ifc_ipsec_obj_bits ipsec_object; +}; + +struct mlx5_ifc_modify_ipsec_obj_in_bits { + struct mlx5_ifc_general_obj_in_cmd_hdr_bits general_obj_in_cmd_hdr; + struct mlx5_ifc_ipsec_obj_bits ipsec_object; }; struct mlx5_ifc_encryption_key_obj_bits { -- cgit v1.2.3 From dff8e2d15283dd92582ddeec25ca86e4cf2618c7 Mon Sep 17 00:00:00 2001 From: Erez Shitrit Date: Fri, 24 Apr 2020 12:45:04 -0700 Subject: net/mlx5: Use aligned variable while allocating ICM memory The alignment value is part of the input structure, so use it and spare an extra memory allocation when it is not needed. Now, use the new ability when allocating ICM for Direct-Rule insertion. Signed-off-by: Ariel Levkovich Signed-off-by: Erez Shitrit Signed-off-by: Saeed Mahameed --- drivers/infiniband/hw/mlx5/main.c | 2 +- drivers/net/ethernet/mellanox/mlx5/core/lib/dm.c | 15 ++++-- .../mellanox/mlx5/core/steering/dr_icm_pool.c | 53 ++++++++++------------ include/linux/mlx5/driver.h | 3 +- 4 files changed, 38 insertions(+), 35 deletions(-) (limited to 'include/linux') diff --git a/drivers/infiniband/hw/mlx5/main.c b/drivers/infiniband/hw/mlx5/main.c index f10675213115..65e0e24d463b 100644 --- a/drivers/infiniband/hw/mlx5/main.c +++ b/drivers/infiniband/hw/mlx5/main.c @@ -2444,7 +2444,7 @@ static int handle_alloc_dm_sw_icm(struct ib_ucontext *ctx, act_size = roundup_pow_of_two(act_size); dm->size = act_size; - err = mlx5_dm_sw_icm_alloc(dev, type, act_size, + err = mlx5_dm_sw_icm_alloc(dev, type, act_size, attr->alignment, to_mucontext(ctx)->devx_uid, &dm->dev_addr, &dm->icm_dm.obj_id); if (err) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/lib/dm.c b/drivers/net/ethernet/mellanox/mlx5/core/lib/dm.c index 6cbccba56f70..3d5e57ff558c 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/lib/dm.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/lib/dm.c @@ -90,7 +90,8 @@ void mlx5_dm_cleanup(struct mlx5_core_dev *dev) } int mlx5_dm_sw_icm_alloc(struct mlx5_core_dev *dev, enum mlx5_sw_icm_type type, - u64 length, u16 uid, phys_addr_t *addr, u32 *obj_id) + u64 length, u32 log_alignment, u16 uid, + phys_addr_t *addr, u32 *obj_id) { u32 num_blocks = DIV_ROUND_UP_ULL(length, MLX5_SW_ICM_BLOCK_SIZE(dev)); u32 out[MLX5_ST_SZ_DW(general_obj_out_cmd_hdr)] = {}; @@ -99,6 +100,7 @@ int mlx5_dm_sw_icm_alloc(struct mlx5_core_dev *dev, enum mlx5_sw_icm_type type, unsigned long *block_map; u64 icm_start_addr; u32 log_icm_size; + u64 align_mask; u32 max_blocks; u64 block_idx; void *sw_icm; @@ -136,11 +138,14 @@ int
mlx5_dm_sw_icm_alloc(struct mlx5_core_dev *dev, enum mlx5_sw_icm_type type, return -EOPNOTSUPP; max_blocks = BIT(log_icm_size - MLX5_LOG_SW_ICM_BLOCK_SIZE(dev)); + + if (log_alignment < MLX5_LOG_SW_ICM_BLOCK_SIZE(dev)) + log_alignment = MLX5_LOG_SW_ICM_BLOCK_SIZE(dev); + align_mask = BIT(log_alignment - MLX5_LOG_SW_ICM_BLOCK_SIZE(dev)) - 1; + spin_lock(&dm->lock); - block_idx = bitmap_find_next_zero_area(block_map, - max_blocks, - 0, - num_blocks, 0); + block_idx = bitmap_find_next_zero_area(block_map, max_blocks, 0, + num_blocks, align_mask); if (block_idx < max_blocks) bitmap_set(block_map, diff --git a/drivers/net/ethernet/mellanox/mlx5/core/steering/dr_icm_pool.c b/drivers/net/ethernet/mellanox/mlx5/core/steering/dr_icm_pool.c index 30d2d7376f56..cc33515b9aba 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/steering/dr_icm_pool.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/steering/dr_icm_pool.c @@ -95,13 +95,12 @@ static int dr_icm_create_dm_mkey(struct mlx5_core_dev *mdev, } static struct mlx5dr_icm_mr * -dr_icm_pool_mr_create(struct mlx5dr_icm_pool *pool, - enum mlx5_sw_icm_type type, - size_t align_base) +dr_icm_pool_mr_create(struct mlx5dr_icm_pool *pool) { struct mlx5_core_dev *mdev = pool->dmn->mdev; + enum mlx5_sw_icm_type dm_type; struct mlx5dr_icm_mr *icm_mr; - size_t align_diff; + size_t log_align_base; int err; icm_mr = kvzalloc(sizeof(*icm_mr), GFP_KERNEL); @@ -111,14 +110,22 @@ dr_icm_pool_mr_create(struct mlx5dr_icm_pool *pool, icm_mr->pool = pool; INIT_LIST_HEAD(&icm_mr->mr_list); - icm_mr->dm.type = type; - - /* 2^log_biggest_table * entry-size * double-for-alignment */ icm_mr->dm.length = mlx5dr_icm_pool_chunk_size_to_byte(pool->max_log_chunk_sz, - pool->icm_type) * 2; + pool->icm_type); + + if (pool->icm_type == DR_ICM_TYPE_STE) { + dm_type = MLX5_SW_ICM_TYPE_STEERING; + log_align_base = ilog2(icm_mr->dm.length); + } else { + dm_type = MLX5_SW_ICM_TYPE_HEADER_MODIFY; + /* Align base is 64B */ + log_align_base = ilog2(DR_ICM_MODIFY_HDR_ALIGN_BASE); + } + icm_mr->dm.type = dm_type; - err = mlx5_dm_sw_icm_alloc(mdev, icm_mr->dm.type, icm_mr->dm.length, 0, - &icm_mr->dm.addr, &icm_mr->dm.obj_id); + err = mlx5_dm_sw_icm_alloc(mdev, icm_mr->dm.type, icm_mr->dm.length, + log_align_base, 0, &icm_mr->dm.addr, + &icm_mr->dm.obj_id); if (err) { mlx5dr_err(pool->dmn, "Failed to allocate SW ICM memory, err (%d)\n", err); goto free_icm_mr; @@ -137,15 +144,18 @@ dr_icm_pool_mr_create(struct mlx5dr_icm_pool *pool, icm_mr->icm_start_addr = icm_mr->dm.addr; - /* align_base is always a power of 2 */ - align_diff = icm_mr->icm_start_addr & (align_base - 1); - if (align_diff) - icm_mr->used_length = align_base - align_diff; + if (icm_mr->icm_start_addr & (BIT(log_align_base) - 1)) { + mlx5dr_err(pool->dmn, "Failed to get Aligned ICM mem (asked: %zu)\n", + log_align_base); + goto free_mkey; + } list_add_tail(&icm_mr->mr_list, &pool->icm_mr_list); return icm_mr; +free_mkey: + mlx5_core_destroy_mkey(mdev, &icm_mr->mkey); free_dm: mlx5_dm_sw_icm_dealloc(mdev, icm_mr->dm.type, icm_mr->dm.length, 0, icm_mr->dm.addr, icm_mr->dm.obj_id); @@ -200,24 +210,11 @@ static int dr_icm_chunks_create(struct mlx5dr_icm_bucket *bucket) struct mlx5dr_icm_pool *pool = bucket->pool; struct mlx5dr_icm_mr *icm_mr = NULL; struct mlx5dr_icm_chunk *chunk; - enum mlx5_sw_icm_type dm_type; - size_t align_base; int i, err = 0; mr_req_size = bucket->num_of_entries * bucket->entry_size; mr_row_size = mlx5dr_icm_pool_chunk_size_to_byte(pool->max_log_chunk_sz, pool->icm_type); - - if (pool->icm_type == 
DR_ICM_TYPE_STE) { - dm_type = MLX5_SW_ICM_TYPE_STEERING; - /* Align base is the biggest chunk size / row size */ - align_base = mr_row_size; - } else { - dm_type = MLX5_SW_ICM_TYPE_HEADER_MODIFY; - /* Align base is 64B */ - align_base = DR_ICM_MODIFY_HDR_ALIGN_BASE; - } - mutex_lock(&pool->mr_mutex); if (!list_empty(&pool->icm_mr_list)) { icm_mr = list_last_entry(&pool->icm_mr_list, @@ -228,7 +225,7 @@ static int dr_icm_chunks_create(struct mlx5dr_icm_bucket *bucket) } if (!icm_mr || mr_free_size < mr_row_size) { - icm_mr = dr_icm_pool_mr_create(pool, dm_type, align_base); + icm_mr = dr_icm_pool_mr_create(pool); if (!icm_mr) { err = -ENOMEM; goto out_err; diff --git a/include/linux/mlx5/driver.h b/include/linux/mlx5/driver.h index b60e5ab7906b..b46537a81703 100644 --- a/include/linux/mlx5/driver.h +++ b/include/linux/mlx5/driver.h @@ -1080,7 +1080,8 @@ int mlx5_lag_query_cong_counters(struct mlx5_core_dev *dev, struct mlx5_uars_page *mlx5_get_uars_page(struct mlx5_core_dev *mdev); void mlx5_put_uars_page(struct mlx5_core_dev *mdev, struct mlx5_uars_page *up); int mlx5_dm_sw_icm_alloc(struct mlx5_core_dev *dev, enum mlx5_sw_icm_type type, - u64 length, u16 uid, phys_addr_t *addr, u32 *obj_id); + u64 length, u32 log_alignment, u16 uid, + phys_addr_t *addr, u32 *obj_id); int mlx5_dm_sw_icm_dealloc(struct mlx5_core_dev *dev, enum mlx5_sw_icm_type type, u64 length, u16 uid, phys_addr_t addr, u32 obj_id); -- cgit v1.2.3 From 244faedfd4d8e8c8e9f3c628d29bb74196b49743 Mon Sep 17 00:00:00 2001 From: Raed Salem Date: Fri, 24 Apr 2020 12:45:05 -0700 Subject: net/mlx5: Refactor imm_inval_pkey field in cqe struct The imm_inval_pkey field can hold four different types of data, depending on the usage; the data can be one of the below: - Immediate field of the received message - Invalidate rkey - Pkey of the packet - Flow table metadata The current implementation doesn't reflect the intended usage of the field at the point of use. Reflect the different types by replacing this field with a union, and modify the code where this field is used to reflect its intended usage.
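For readers unfamiliar with the pattern, the idea can be shown with a standalone C sketch (plain uint32_t stands in for the kernel's __be32; this is an illustration, not the mlx5 structure): one dword of storage, one name per interpretation.

#include <stdint.h>
#include <stdio.h>

struct cqe_sketch {
        /* One 32-bit dword, four possible interpretations. */
        union {
                uint32_t immediate;
                uint32_t inval_rkey;
                uint32_t pkey;
                uint32_t ft_metadata;
        };
};

int main(void)
{
        struct cqe_sketch cqe = { .ft_metadata = 0xcafe };

        /* Same storage either way; the accessor now documents intent. */
        printf("ft_metadata=0x%x immediate=0x%x\n",
               cqe.ft_metadata, cqe.immediate);
        return 0;
}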
Signed-off-by: Raed Salem Reviewed-by: Huy Nguyen Reviewed-by: Tariq Toukan Signed-off-by: Saeed Mahameed --- drivers/infiniband/hw/mlx5/cq.c | 8 ++++---- drivers/net/ethernet/mellanox/mlx5/core/en_tc.c | 2 +- include/linux/mlx5/device.h | 7 ++++++- 3 files changed, 11 insertions(+), 6 deletions(-) (limited to 'include/linux') diff --git a/drivers/infiniband/hw/mlx5/cq.c b/drivers/infiniband/hw/mlx5/cq.c index 32c05730dfe9..0c18cb6a2f14 100644 --- a/drivers/infiniband/hw/mlx5/cq.c +++ b/drivers/infiniband/hw/mlx5/cq.c @@ -202,7 +202,7 @@ static void handle_responder(struct ib_wc *wc, struct mlx5_cqe64 *cqe, case MLX5_CQE_RESP_WR_IMM: wc->opcode = IB_WC_RECV_RDMA_WITH_IMM; wc->wc_flags = IB_WC_WITH_IMM; - wc->ex.imm_data = cqe->imm_inval_pkey; + wc->ex.imm_data = cqe->immediate; break; case MLX5_CQE_RESP_SEND: wc->opcode = IB_WC_RECV; @@ -214,12 +214,12 @@ static void handle_responder(struct ib_wc *wc, struct mlx5_cqe64 *cqe, case MLX5_CQE_RESP_SEND_IMM: wc->opcode = IB_WC_RECV; wc->wc_flags = IB_WC_WITH_IMM; - wc->ex.imm_data = cqe->imm_inval_pkey; + wc->ex.imm_data = cqe->immediate; break; case MLX5_CQE_RESP_SEND_INV: wc->opcode = IB_WC_RECV; wc->wc_flags = IB_WC_WITH_INVALIDATE; - wc->ex.invalidate_rkey = be32_to_cpu(cqe->imm_inval_pkey); + wc->ex.invalidate_rkey = be32_to_cpu(cqe->inval_rkey); break; } wc->src_qp = be32_to_cpu(cqe->flags_rqpn) & 0xffffff; @@ -227,7 +227,7 @@ static void handle_responder(struct ib_wc *wc, struct mlx5_cqe64 *cqe, g = (be32_to_cpu(cqe->flags_rqpn) >> 28) & 3; wc->wc_flags |= g ? IB_WC_GRH : 0; if (unlikely(is_qp1(qp->ibqp.qp_type))) { - u16 pkey = be32_to_cpu(cqe->imm_inval_pkey) & 0xffff; + u16 pkey = be32_to_cpu(cqe->pkey) & 0xffff; ib_find_cached_pkey(&dev->ib_dev, qp->port, pkey, &wc->pkey_index); diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c b/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c index 12c5ca5b93ca..5b632434866f 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c @@ -4891,7 +4891,7 @@ bool mlx5e_tc_rep_update_skb(struct mlx5_cqe64 *cqe, reg_c0 = (be32_to_cpu(cqe->sop_drop_qpn) & MLX5E_TC_FLOW_ID_MASK); if (reg_c0 == MLX5_FS_DEFAULT_FLOW_TAG) reg_c0 = 0; - reg_c1 = be32_to_cpu(cqe->imm_inval_pkey); + reg_c1 = be32_to_cpu(cqe->ft_metadata); if (!reg_c0) return true; diff --git a/include/linux/mlx5/device.h b/include/linux/mlx5/device.h index 7b57877e501e..746e17473d72 100644 --- a/include/linux/mlx5/device.h +++ b/include/linux/mlx5/device.h @@ -767,7 +767,12 @@ struct mlx5_cqe64 { u8 l4_l3_hdr_type; __be16 vlan_info; __be32 srqn; /* [31:24]: lro_num_seg, [23:0]: srqn */ - __be32 imm_inval_pkey; + union { + __be32 immediate; + __be32 inval_rkey; + __be32 pkey; + __be32 ft_metadata; + }; u8 rsvd40[4]; __be32 byte_cnt; __be32 timestamp_h; -- cgit v1.2.3 From 06939536263d684073a30543930622eede633af1 Mon Sep 17 00:00:00 2001 From: Moshe Shemesh Date: Fri, 24 Apr 2020 12:45:06 -0700 Subject: net/mlx5: Add structure layout and defines for MFRL register Add needed structure layouts and defines for MFRL (Management Firmware Reset Level) register. This structure will be used for the firmware upgrade and reset flow in the downstream patches. 
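As a rough illustration of how such BIT()-valued defines are consumed, assuming reset_level is a bitmask of supported levels as the defines below suggest (the helper is invented for this sketch, and reading the register itself is out of scope here):

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

#define MLX5_MFRL_REG_RESET_LEVEL0 (1u << 0)
#define MLX5_MFRL_REG_RESET_LEVEL3 (1u << 3)
#define MLX5_MFRL_REG_RESET_LEVEL6 (1u << 6)

/* Each set bit in reset_level is assumed to be a level firmware supports. */
static bool mfrl_level_supported(uint8_t reset_level, uint8_t level)
{
        return (reset_level & level) != 0;
}

int main(void)
{
        uint8_t reset_level = MLX5_MFRL_REG_RESET_LEVEL0 |
                              MLX5_MFRL_REG_RESET_LEVEL3;

        printf("level3 supported: %d\n",
               mfrl_level_supported(reset_level, MLX5_MFRL_REG_RESET_LEVEL3));
        return 0;
}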
Signed-off-by: Moshe Shemesh Reviewed-by: Tariq Toukan Signed-off-by: Saeed Mahameed --- include/linux/mlx5/driver.h | 1 + include/linux/mlx5/mlx5_ifc.h | 24 ++++++++++++++++++++++++ 2 files changed, 25 insertions(+) (limited to 'include/linux') diff --git a/include/linux/mlx5/driver.h b/include/linux/mlx5/driver.h index b46537a81703..d82dbbab8179 100644 --- a/include/linux/mlx5/driver.h +++ b/include/linux/mlx5/driver.h @@ -130,6 +130,7 @@ enum { MLX5_REG_NODE_DESC = 0x6001, MLX5_REG_HOST_ENDIANNESS = 0x7004, MLX5_REG_MCIA = 0x9014, + MLX5_REG_MFRL = 0x9028, MLX5_REG_MLCR = 0x902b, MLX5_REG_MTRC_CAP = 0x9040, MLX5_REG_MTRC_CONF = 0x9041, diff --git a/include/linux/mlx5/mlx5_ifc.h b/include/linux/mlx5/mlx5_ifc.h index cf971d341189..9e6a3cec1e32 100644 --- a/include/linux/mlx5/mlx5_ifc.h +++ b/include/linux/mlx5/mlx5_ifc.h @@ -9703,6 +9703,29 @@ struct mlx5_ifc_mcda_reg_bits { u8 data[0][0x20]; }; +enum { + MLX5_MFRL_REG_RESET_TYPE_FULL_CHIP = BIT(0), + MLX5_MFRL_REG_RESET_TYPE_NET_PORT_ALIVE = BIT(1), +}; + +enum { + MLX5_MFRL_REG_RESET_LEVEL0 = BIT(0), + MLX5_MFRL_REG_RESET_LEVEL3 = BIT(3), + MLX5_MFRL_REG_RESET_LEVEL6 = BIT(6), +}; + +struct mlx5_ifc_mfrl_reg_bits { + u8 reserved_at_0[0x20]; + + u8 reserved_at_20[0x2]; + u8 pci_sync_for_fw_update_start[0x1]; + u8 pci_sync_for_fw_update_resp[0x2]; + u8 rst_type_sel[0x3]; + u8 reserved_at_28[0x8]; + u8 reset_type[0x8]; + u8 reset_level[0x8]; +}; + struct mlx5_ifc_mirc_reg_bits { u8 reserved_at_0[0x18]; u8 status_code[0x8]; @@ -9766,6 +9789,7 @@ union mlx5_ifc_ports_control_registers_document_bits { struct mlx5_ifc_mcc_reg_bits mcc_reg; struct mlx5_ifc_mcda_reg_bits mcda_reg; struct mlx5_ifc_mirc_reg_bits mirc_reg; + struct mlx5_ifc_mfrl_reg_bits mfrl_reg; u8 reserved_at_0[0x60e0]; }; -- cgit v1.2.3 From 3df0107784ceb388039b1fe510a8c7b8816de8f0 Mon Sep 17 00:00:00 2001 From: Moshe Shemesh Date: Fri, 24 Apr 2020 12:45:07 -0700 Subject: net/mlx5: Add structure and defines for pci sync for fw update event Add needed structure layouts and defines for pci sync for fw update event. The downstream patches will include event handlers for this event type. 
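To make the event layout concrete, here is a self-contained sketch of decoding the state field, mirroring the SYNC_RST_STATE_MASK and sync_rst_state_type values added below (illustration only, not the kernel handler):

#include <stdint.h>
#include <stdio.h>

#define SYNC_RST_STATE_MASK 0xf

enum sync_rst_state_type {
        MLX5_SYNC_RST_STATE_RESET_REQUEST = 0x0,
        MLX5_SYNC_RST_STATE_RESET_NOW = 0x1,
        MLX5_SYNC_RST_STATE_RESET_ABORT = 0x2,
};

struct sync_fw_update_eqe_sketch {
        uint8_t reserved_at_0[3];
        uint8_t sync_rst_state; /* state lives in the low nibble */
};

int main(void)
{
        struct sync_fw_update_eqe_sketch eqe = { .sync_rst_state = 0x21 };

        /* The mask discards the upper nibble before the state compare. */
        switch (eqe.sync_rst_state & SYNC_RST_STATE_MASK) {
        case MLX5_SYNC_RST_STATE_RESET_REQUEST:
                printf("firmware requests a synchronized reset\n");
                break;
        case MLX5_SYNC_RST_STATE_RESET_NOW:
                printf("reset now\n");
                break;
        case MLX5_SYNC_RST_STATE_RESET_ABORT:
                printf("reset aborted\n");
                break;
        }
        return 0;
}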
Signed-off-by: Moshe Shemesh Reviewed-by: Tariq Toukan Signed-off-by: Saeed Mahameed --- include/linux/mlx5/device.h | 15 +++++++++++++++ include/linux/mlx5/mlx5_ifc.h | 4 +++- 2 files changed, 18 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/mlx5/device.h b/include/linux/mlx5/device.h index 746e17473d72..de93f0b67973 100644 --- a/include/linux/mlx5/device.h +++ b/include/linux/mlx5/device.h @@ -364,6 +364,7 @@ enum { enum { MLX5_GENERAL_SUBTYPE_DELAY_DROP_TIMEOUT = 0x1, MLX5_GENERAL_SUBTYPE_PCI_POWER_CHANGE_EVENT = 0x5, + MLX5_GENERAL_SUBTYPE_PCI_SYNC_FOR_FW_UPDATE_EVENT = 0x8, }; enum { @@ -689,6 +690,19 @@ struct mlx5_eqe_temp_warning { __be64 sensor_warning_lsb; } __packed; +#define SYNC_RST_STATE_MASK 0xf + +enum sync_rst_state_type { + MLX5_SYNC_RST_STATE_RESET_REQUEST = 0x0, + MLX5_SYNC_RST_STATE_RESET_NOW = 0x1, + MLX5_SYNC_RST_STATE_RESET_ABORT = 0x2, +}; + +struct mlx5_eqe_sync_fw_update { + u8 reserved_at_0[3]; + u8 sync_rst_state; +}; + union ev_data { __be32 raw[7]; struct mlx5_eqe_cmd cmd; @@ -707,6 +721,7 @@ union ev_data { struct mlx5_eqe_dct dct; struct mlx5_eqe_temp_warning temp_warning; struct mlx5_eqe_xrq_err xrq_err; + struct mlx5_eqe_sync_fw_update sync_fw_update; } __packed; struct mlx5_eqe { diff --git a/include/linux/mlx5/mlx5_ifc.h b/include/linux/mlx5/mlx5_ifc.h index 9e6a3cec1e32..058ded202b65 100644 --- a/include/linux/mlx5/mlx5_ifc.h +++ b/include/linux/mlx5/mlx5_ifc.h @@ -1317,7 +1317,9 @@ struct mlx5_ifc_cmd_hca_cap_bits { u8 wol_p[0x1]; u8 stat_rate_support[0x10]; - u8 reserved_at_1f0[0xc]; + u8 reserved_at_1f0[0x1]; + u8 pci_sync_for_fw_update_event[0x1]; + u8 reserved_at_1f2[0xa]; u8 cqe_version[0x4]; u8 compact_address_vector[0x1]; -- cgit v1.2.3 From ee5cdf7a5e8945372c7496e98de2b364e095b60b Mon Sep 17 00:00:00 2001 From: Tariq Toukan Date: Fri, 24 Apr 2020 12:45:08 -0700 Subject: net/mlx5: Introduce TLS RX offload hardware bits Add TLS RX offload related IFC hardware fields and enumerations. 
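One of the additions below packs a TLS-offload status into bits 3-4 of the renamed tls_outer_l3_tunneled CQE byte. A standalone sketch of the same decode, with plain uint8_t in place of the CQE struct:

#include <stdint.h>
#include <stdio.h>

enum {
        CQE_TLS_OFFLOAD_NOT_DECRYPTED = 0x0,
        CQE_TLS_OFFLOAD_DECRYPTED = 0x1,
        CQE_TLS_OFFLOAD_RESYNC = 0x2,
        CQE_TLS_OFFLOAD_ERROR = 0x3,
};

/* Mirrors get_cqe_tls_offload(): bit 0 is the tunnel flag, bits 3-4
 * carry the TLS offload status. */
static uint8_t get_tls_offload(uint8_t tls_outer_l3_tunneled)
{
        return (tls_outer_l3_tunneled >> 3) & 0x3;
}

int main(void)
{
        uint8_t byte = CQE_TLS_OFFLOAD_DECRYPTED << 3;

        printf("decrypted: %d\n",
               get_tls_offload(byte) == CQE_TLS_OFFLOAD_DECRYPTED);
        return 0;
}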
Signed-off-by: Tariq Toukan Reviewed-by: Maxim Mikityanskiy Reviewed-by: Boris Pismenny Signed-off-by: Saeed Mahameed --- include/linux/mlx5/device.h | 18 ++++++++++++++++-- include/linux/mlx5/mlx5_ifc.h | 5 +++-- 2 files changed, 19 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mlx5/device.h b/include/linux/mlx5/device.h index de93f0b67973..1bc27aca648b 100644 --- a/include/linux/mlx5/device.h +++ b/include/linux/mlx5/device.h @@ -450,10 +450,12 @@ enum { enum { MLX5_OPC_MOD_TLS_TIS_STATIC_PARAMS = 0x1, + MLX5_OPC_MOD_TLS_TIR_STATIC_PARAMS = 0x2, }; enum { MLX5_OPC_MOD_TLS_TIS_PROGRESS_PARAMS = 0x1, + MLX5_OPC_MOD_TLS_TIR_PROGRESS_PARAMS = 0x2, }; enum { @@ -764,7 +766,7 @@ struct mlx5_err_cqe { }; struct mlx5_cqe64 { - u8 outer_l3_tunneled; + u8 tls_outer_l3_tunneled; u8 rsvd0; __be16 wqe_id; u8 lro_tcppsh_abort_dupack; @@ -854,7 +856,12 @@ static inline u8 get_cqe_l3_hdr_type(struct mlx5_cqe64 *cqe) static inline bool cqe_is_tunneled(struct mlx5_cqe64 *cqe) { - return cqe->outer_l3_tunneled & 0x1; + return cqe->tls_outer_l3_tunneled & 0x1; +} + +static inline u8 get_cqe_tls_offload(struct mlx5_cqe64 *cqe) +{ + return (cqe->tls_outer_l3_tunneled >> 3) & 0x3; } static inline bool cqe_has_vlan(struct mlx5_cqe64 *cqe) @@ -942,6 +949,13 @@ enum { CQE_L4_OK = 1 << 2, }; +enum { + CQE_TLS_OFFLOAD_NOT_DECRYPTED = 0x0, + CQE_TLS_OFFLOAD_DECRYPTED = 0x1, + CQE_TLS_OFFLOAD_RESYNC = 0x2, + CQE_TLS_OFFLOAD_ERROR = 0x3, +}; + struct mlx5_sig_err_cqe { u8 rsvd0[16]; __be32 expected_trans_sig; diff --git a/include/linux/mlx5/mlx5_ifc.h b/include/linux/mlx5/mlx5_ifc.h index 058ded202b65..6a6bb5dc7916 100644 --- a/include/linux/mlx5/mlx5_ifc.h +++ b/include/linux/mlx5/mlx5_ifc.h @@ -1491,7 +1491,7 @@ struct mlx5_ifc_cmd_hca_cap_bits { u8 reserved_at_480[0x1]; u8 tls_tx[0x1]; - u8 reserved_at_482[0x1]; + u8 tls_rx[0x1]; u8 log_max_l2_table[0x5]; u8 reserved_at_488[0x8]; u8 log_uar_page_sz[0x10]; @@ -3136,7 +3136,8 @@ struct mlx5_ifc_tirc_bits { u8 reserved_at_0[0x20]; u8 disp_type[0x4]; - u8 reserved_at_24[0x1c]; + u8 tls_en[0x1]; + u8 reserved_at_25[0x1b]; u8 reserved_at_40[0x40]; -- cgit v1.2.3 From 0e1533bb9cce2c6b2aecdfddfcc0de3beeaddc7b Mon Sep 17 00:00:00 2001 From: Eran Ben Elisha Date: Fri, 24 Apr 2020 12:45:09 -0700 Subject: net/mlx5: Add release all pages capability bit Add a bit in HCA capabilities layout to indicate if release all pages is supported. Signed-off-by: Eran Ben Elisha Reviewed-by: Moshe Shemesh Signed-off-by: Saeed Mahameed --- include/linux/mlx5/mlx5_ifc.h | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/mlx5/mlx5_ifc.h b/include/linux/mlx5/mlx5_ifc.h index 6a6bb5dc7916..fb243848132d 100644 --- a/include/linux/mlx5/mlx5_ifc.h +++ b/include/linux/mlx5/mlx5_ifc.h @@ -1244,7 +1244,9 @@ struct mlx5_ifc_cmd_hca_cap_bits { u8 reserved_at_130[0xa]; u8 log_max_ra_res_dc[0x6]; - u8 reserved_at_140[0x9]; + u8 reserved_at_140[0x6]; + u8 release_all_pages[0x1]; + u8 reserved_at_147[0x2]; u8 roce_accl[0x1]; u8 log_max_ra_req_qp[0x6]; u8 reserved_at_150[0xa]; -- cgit v1.2.3 From 2dc8b5246d2c94f732c02e7a688d8a9c0c65361f Mon Sep 17 00:00:00 2001 From: Raed Salem Date: Fri, 24 Apr 2020 12:45:10 -0700 Subject: net/mlx5: TX WQE Add trailer insertion field Add a new TX WQE field for ConnectX-6 Dx trailer insertion support; when set, the HW adds a trailer to the packet. The WQE trailer association flags are used to indicate to the HW which header the trailer belongs to.
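A hypothetical sketch of composing these flags on a send path (the flag values are copied from the enum added below; everything else is illustrative):

#include <stdint.h>
#include <stdio.h>

#define MLX5_ETH_WQE_TRAILER_HDR_OUTER_IP_ASSOC (1u << 26)
#define MLX5_ETH_WQE_TRAILER_HDR_OUTER_L4_ASSOC (1u << 27)
#define MLX5_ETH_WQE_INSERT_TRAILER (1u << 30)

int main(void)
{
        /* Ask HW to insert a trailer associated with the outer L4 header. */
        uint32_t flags = MLX5_ETH_WQE_INSERT_TRAILER |
                         MLX5_ETH_WQE_TRAILER_HDR_OUTER_L4_ASSOC;

        printf("wqe eth-segment trailer flags: 0x%08x\n", (unsigned)flags);
        return 0;
}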
Signed-off-by: Raed Salem Signed-off-by: Saeed Mahameed --- include/linux/mlx5/qp.h | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'include/linux') diff --git a/include/linux/mlx5/qp.h b/include/linux/mlx5/qp.h index ef127a156a62..f23eb18526fe 100644 --- a/include/linux/mlx5/qp.h +++ b/include/linux/mlx5/qp.h @@ -229,6 +229,11 @@ enum { enum { MLX5_ETH_WQE_SVLAN = 1 << 0, + MLX5_ETH_WQE_TRAILER_HDR_OUTER_IP_ASSOC = 1 << 26, + MLX5_ETH_WQE_TRAILER_HDR_OUTER_L4_ASSOC = 1 << 27, + MLX5_ETH_WQE_TRAILER_HDR_INNER_IP_ASSOC = 3 << 26, + MLX5_ETH_WQE_TRAILER_HDR_INNER_L4_ASSOC = 1 << 28, + MLX5_ETH_WQE_INSERT_TRAILER = 1 << 30, MLX5_ETH_WQE_INSERT_VLAN = 1 << 15, }; @@ -257,6 +262,7 @@ struct mlx5_wqe_eth_seg { __be16 type; __be16 vlan_tci; } insert; + __be32 trailer; }; }; -- cgit v1.2.3 From 35fc0e3b0bd5be3b059e53ae90c4536ee4922330 Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Fri, 24 Apr 2020 11:19:10 -0500 Subject: rculist: Add hlists_swap_heads_rcu Using the struct pid to refer to two tasks in de_thread was a clever idea and ultimately too clever, as it has led to proc_flush_task being called inconsistently. To support rectifying this, add hlists_swap_heads_rcu, an hlist primitive that just swaps the hlist heads of two lists. This is exactly what is needed for exchanging the pids of two tasks. Only the correctness of the code has been considered, as the caller is expected to be a slow path. Link: https://lore.kernel.org/lkml/87mu6vajnq.fsf_-_@x220.int.ebiederm.org/ Acked-by: Linus Torvalds Acked-by: Oleg Nesterov Signed-off-by: "Eric W. Biederman" --- include/linux/rculist.h | 21 +++++++++++++++++++++ 1 file changed, 21 insertions(+) (limited to 'include/linux') diff --git a/include/linux/rculist.h b/include/linux/rculist.h index 8214cdc715f2..67867e0d4cec 100644 --- a/include/linux/rculist.h +++ b/include/linux/rculist.h @@ -506,6 +506,27 @@ static inline void hlist_replace_rcu(struct hlist_node *old, WRITE_ONCE(old->pprev, LIST_POISON2); } +/** + * hlists_swap_heads_rcu - swap the lists the hlist heads point to + * @left: The hlist head on the left + * @right: The hlist head on the right + * + * The lists start out as [@left ][node1 ... ] and + [@right ][node2 ... ] + * The lists end up as [@left ][node2 ... ] + * [@right ][node1 ... ] + */ +static inline void hlists_swap_heads_rcu(struct hlist_head *left, struct hlist_head *right) +{ + struct hlist_node *node1 = left->first; + struct hlist_node *node2 = right->first; + + rcu_assign_pointer(left->first, node2); + rcu_assign_pointer(right->first, node1); + WRITE_ONCE(node2->pprev, &left->first); + WRITE_ONCE(node1->pprev, &right->first); +} + /* * return the first or the next element in an RCU protected hlist */ -- cgit v1.2.3 From 6b03d1304a32dc8450c7516000a0fe07bef7c446 Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Sun, 19 Apr 2020 06:35:02 -0500 Subject: proc: Ensure we see the exit of each process tid exactly once When the thread group leader changes during exec and the old leader's thread is reaped, proc_flush_pid will flush the dentries for the entire process because the leader still has its original pid. Fix this by exchanging the pids in an rcu safe manner, and wrapping the code to do that up in a helper exchange_tids. When I removed switch_exec_pids and introduced this behavior in d73d65293e3e ("[PATCH] pidhash: kill switch_exec_pids") there really was nothing that cared as flushing happened with the cached dentry and de_thread flushed both of them on exec.
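Before the history below, the swap at the heart of this fix (hlists_swap_heads_rcu above, and exchange_tids later in this patch) can be pictured with ordinary pointers. A standalone sketch; the kernel versions use rcu_assign_pointer() and WRITE_ONCE() for ordering, and, as with the single-entry tid lists, both lists are assumed non-empty:

#include <stdio.h>

struct node { struct node *next; struct node **pprev; };
struct head { struct node *first; };

static void swap_heads(struct head *left, struct head *right)
{
        struct node *node1 = left->first;
        struct node *node2 = right->first;

        /* Exchange the chains... */
        left->first = node2;
        right->first = node1;
        /* ...and point each first node's pprev back at its new head. */
        node2->pprev = &left->first;
        node1->pprev = &right->first;
}

int main(void)
{
        struct node n1 = { 0 }, n2 = { 0 };
        struct head a = { &n1 }, b = { &n2 };

        swap_heads(&a, &b);
        printf("a.first==&n2: %d, b.first==&n1: %d\n",
               a.first == &n2, b.first == &n1);
        return 0;
}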
This lack of fully exchanging pids became a problem a few months later when I introduced 48e6484d4902 ("[PATCH] proc: Rewrite the proc dentry flush on exit optimization"). That change overlooked that the de_thread case was no longer swapping pids, and I was looking up proc dentries by task->pid. The current behavior isn't properly a bug, as everything in proc will continue to work correctly, just a little bit less efficiently. Fix this just so there are no little surprise corner cases waiting to bite people. -- Oleg points out this could be an issue in next_tgid in proc where has_group_leader_pid is called, and reordering some of the assignments should fix that. -- Oleg points out this will break the 10-year-old hack in __exit_signal.c > /* > * This can only happen if the caller is de_thread(). > * FIXME: this is the temporary hack, we should teach > * posix-cpu-timers to handle this case correctly. > */ > if (unlikely(has_group_leader_pid(tsk))) > posix_cpu_timers_exit_group(tsk); The code in next_tgid has been changed to use PIDTYPE_TGID, and the posix cpu timers code has been fixed so it does not need the 10-year-old hack, so this should be safe to merge now. Link: https://lore.kernel.org/lkml/87h7x3ajll.fsf_-_@x220.int.ebiederm.org/ Acked-by: Linus Torvalds Acked-by: Oleg Nesterov Fixes: 48e6484d4902 ("[PATCH] proc: Rewrite the proc dentry flush on exit optimization"). Signed-off-by: Eric W. Biederman --- fs/exec.c | 5 +---- include/linux/pid.h | 1 + kernel/pid.c | 19 +++++++++++++++++++ 3 files changed, 21 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/fs/exec.c b/fs/exec.c index 06b4c550af5d..9b60f927afd7 100644 --- a/fs/exec.c +++ b/fs/exec.c @@ -1186,11 +1186,8 @@ static int de_thread(struct task_struct *tsk) /* Become a process group leader with the old leader's pid. * The old leader becomes a thread of the this thread group. - * Note: The old leader also uses this pid until release_task - * is called. Odd but simple and correct.
*/ - tsk->pid = leader->pid; - change_pid(tsk, PIDTYPE_PID, task_pid(leader)); + exchange_tids(tsk, leader); transfer_pid(leader, tsk, PIDTYPE_TGID); transfer_pid(leader, tsk, PIDTYPE_PGID); transfer_pid(leader, tsk, PIDTYPE_SID); diff --git a/include/linux/pid.h b/include/linux/pid.h index cc896f0fc4e3..2159ffca63fc 100644 --- a/include/linux/pid.h +++ b/include/linux/pid.h @@ -102,6 +102,7 @@ extern void attach_pid(struct task_struct *task, enum pid_type); extern void detach_pid(struct task_struct *task, enum pid_type); extern void change_pid(struct task_struct *task, enum pid_type, struct pid *pid); +extern void exchange_tids(struct task_struct *task, struct task_struct *old); extern void transfer_pid(struct task_struct *old, struct task_struct *new, enum pid_type); diff --git a/kernel/pid.c b/kernel/pid.c index c835b844aca7..6d5d0a5bda82 100644 --- a/kernel/pid.c +++ b/kernel/pid.c @@ -363,6 +363,25 @@ void change_pid(struct task_struct *task, enum pid_type type, attach_pid(task, type); } +void exchange_tids(struct task_struct *left, struct task_struct *right) +{ + struct pid *pid1 = left->thread_pid; + struct pid *pid2 = right->thread_pid; + struct hlist_head *head1 = &pid1->tasks[PIDTYPE_PID]; + struct hlist_head *head2 = &pid2->tasks[PIDTYPE_PID]; + + /* Swap the single entry tid lists */ + hlists_swap_heads_rcu(head1, head2); + + /* Swap the per task_struct pid */ + rcu_assign_pointer(left->thread_pid, pid2); + rcu_assign_pointer(right->thread_pid, pid1); + + /* Swap the cached value */ + WRITE_ONCE(left->pid, pid_nr(pid2)); + WRITE_ONCE(right->pid, pid_nr(pid1)); +} + /* transfer_pid is an optimization of attach_pid(new), detach_pid(old) */ void transfer_pid(struct task_struct *old, struct task_struct *new, enum pid_type type) -- cgit v1.2.3 From c4dad0aab3fca0c1f0baa4cc84b6ec91b7ebf426 Mon Sep 17 00:00:00 2001 From: Richard Guy Briggs Date: Wed, 22 Apr 2020 17:39:28 -0400 Subject: audit: tidy and extend netfilter_cfg x_tables NETFILTER_CFG record generation was inconsistent for x_tables and ebtables configuration changes. The call was needlessly messy; supporting records were sometimes missing, while at other times they were produced when not requested. Simplify the logging call into a new audit_log_nfcfg call. Honour the audit_enabled setting and, by tidying up the dummy checks, record information (including supporting records) more consistently. Add an op= field that indicates the operation being performed (register or replace). Here is the enhanced sample record: type=NETFILTER_CFG msg=audit(1580905834.919:82970): table=filter family=2 entries=83 op=replace Generate audit NETFILTER_CFG records on ebtables table registration. Previously this was being done for x_tables registration and replacement operations and ebtables table replacement only.
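A userspace sketch of the resulting record text, mirroring the op-to-string mapping the patch introduces (the audit_log_start()/audit_log_format() plumbing is elided):

#include <stdio.h>

enum audit_nfcfgop { AUDIT_XT_OP_REGISTER, AUDIT_XT_OP_REPLACE };

static const char * const audit_nfcfgs[] = {
        [AUDIT_XT_OP_REGISTER] = "register",
        [AUDIT_XT_OP_REPLACE] = "replace",
};

static void log_nfcfg(const char *name, unsigned int af,
                      unsigned int nentries, enum audit_nfcfgop op)
{
        /* Same fields as the sample record above. */
        printf("table=%s family=%u entries=%u op=%s\n",
               name, af, nentries, audit_nfcfgs[op]);
}

int main(void)
{
        log_nfcfg("filter", 2 /* AF_INET */, 83, AUDIT_XT_OP_REPLACE);
        return 0;
}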
See: https://github.com/linux-audit/audit-kernel/issues/25 See: https://github.com/linux-audit/audit-kernel/issues/35 See: https://github.com/linux-audit/audit-kernel/issues/43 Signed-off-by: Richard Guy Briggs Signed-off-by: Paul Moore --- include/linux/audit.h | 21 +++++++++++++++++++++ kernel/auditsc.c | 24 ++++++++++++++++++++++++ net/bridge/netfilter/ebtables.c | 12 ++++-------- net/netfilter/x_tables.c | 12 +++--------- 4 files changed, 52 insertions(+), 17 deletions(-) (limited to 'include/linux') diff --git a/include/linux/audit.h b/include/linux/audit.h index f9ceae57ca8d..5c7f07a9776d 100644 --- a/include/linux/audit.h +++ b/include/linux/audit.h @@ -94,6 +94,11 @@ struct audit_ntp_data { struct audit_ntp_data {}; #endif +enum audit_nfcfgop { + AUDIT_XT_OP_REGISTER, + AUDIT_XT_OP_REPLACE, +}; + extern int is_audit_feature_set(int which); extern int __init audit_register_class(int class, unsigned *list); @@ -379,6 +384,8 @@ extern void __audit_log_kern_module(char *name); extern void __audit_fanotify(unsigned int response); extern void __audit_tk_injoffset(struct timespec64 offset); extern void __audit_ntp_log(const struct audit_ntp_data *ad); +extern void __audit_log_nfcfg(const char *name, u8 af, unsigned int nentries, + enum audit_nfcfgop op); static inline void audit_ipc_obj(struct kern_ipc_perm *ipcp) { @@ -514,6 +521,14 @@ static inline void audit_ntp_log(const struct audit_ntp_data *ad) __audit_ntp_log(ad); } +static inline void audit_log_nfcfg(const char *name, u8 af, + unsigned int nentries, + enum audit_nfcfgop op) +{ + if (audit_enabled) + __audit_log_nfcfg(name, af, nentries, op); +} + extern int audit_n_rules; extern int audit_signals; #else /* CONFIG_AUDITSYSCALL */ @@ -646,6 +661,12 @@ static inline void audit_ntp_log(const struct audit_ntp_data *ad) static inline void audit_ptrace(struct task_struct *t) { } + +static inline void audit_log_nfcfg(const char *name, u8 af, + unsigned int nentries, + enum audit_nfcfgop op) +{ } + #define audit_n_rules 0 #define audit_signals 0 #endif /* CONFIG_AUDITSYSCALL */ diff --git a/kernel/auditsc.c b/kernel/auditsc.c index 814406a35db1..705beac0ce29 100644 --- a/kernel/auditsc.c +++ b/kernel/auditsc.c @@ -130,6 +130,16 @@ struct audit_tree_refs { struct audit_chunk *c[31]; }; +struct audit_nfcfgop_tab { + enum audit_nfcfgop op; + const char *s; +}; + +const struct audit_nfcfgop_tab audit_nfcfgs[] = { + { AUDIT_XT_OP_REGISTER, "register" }, + { AUDIT_XT_OP_REPLACE, "replace" }, +}; + static int audit_match_perm(struct audit_context *ctx, int mask) { unsigned n; @@ -2542,6 +2552,20 @@ void __audit_ntp_log(const struct audit_ntp_data *ad) audit_log_ntp_val(ad, "adjust", AUDIT_NTP_ADJUST); } +void __audit_log_nfcfg(const char *name, u8 af, unsigned int nentries, + enum audit_nfcfgop op) +{ + struct audit_buffer *ab; + + ab = audit_log_start(audit_context(), GFP_KERNEL, AUDIT_NETFILTER_CFG); + if (!ab) + return; + audit_log_format(ab, "table=%s family=%u entries=%u op=%s", + name, af, nentries, audit_nfcfgs[op].s); + audit_log_end(ab); +} +EXPORT_SYMBOL_GPL(__audit_log_nfcfg); + static void audit_log_task(struct audit_buffer *ab) { kuid_t auid, uid; diff --git a/net/bridge/netfilter/ebtables.c b/net/bridge/netfilter/ebtables.c index 78db58c7aec2..0a148e68b6e1 100644 --- a/net/bridge/netfilter/ebtables.c +++ b/net/bridge/netfilter/ebtables.c @@ -1046,14 +1046,8 @@ static int do_replace_finish(struct net *net, struct ebt_replace *repl, vfree(table); vfree(counterstmp); -#ifdef CONFIG_AUDIT - if (audit_enabled) { - 
audit_log(audit_context(), GFP_KERNEL, - AUDIT_NETFILTER_CFG, - "table=%s family=%u entries=%u", - repl->name, AF_BRIDGE, repl->nentries); - } -#endif + audit_log_nfcfg(repl->name, AF_BRIDGE, repl->nentries, + AUDIT_XT_OP_REPLACE); return ret; free_unlock: @@ -1223,6 +1217,8 @@ int ebt_register_table(struct net *net, const struct ebt_table *input_table, *res = NULL; } + audit_log_nfcfg(repl->name, AF_BRIDGE, repl->nentries, + AUDIT_XT_OP_REGISTER); return ret; free_unlock: mutex_unlock(&ebt_mutex); diff --git a/net/netfilter/x_tables.c b/net/netfilter/x_tables.c index cd2b034eef59..8f8c5dbf603d 100644 --- a/net/netfilter/x_tables.c +++ b/net/netfilter/x_tables.c @@ -1408,15 +1408,9 @@ xt_replace_table(struct xt_table *table, } } -#ifdef CONFIG_AUDIT - if (audit_enabled) { - audit_log(audit_context(), GFP_KERNEL, - AUDIT_NETFILTER_CFG, - "table=%s family=%u entries=%u", - table->name, table->af, private->number); - } -#endif - + audit_log_nfcfg(table->name, table->af, private->number, + !private->number ? AUDIT_XT_OP_REGISTER : + AUDIT_XT_OP_REPLACE); return private; } EXPORT_SYMBOL_GPL(xt_replace_table); -- cgit v1.2.3 From bbd40fc4816d5329a89397206ece9f1763787a88 Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Sun, 26 Apr 2020 07:59:55 -0500 Subject: signal: Remove has_group_leader_pid After the introduction of exchange_tids has_group_leader_pid is equivalent to thread_group_leader. After the last couple of cleanups has_group_leader_pid has no more callers. So remove the now unused and redundant has_group_leader_pid. Signed-off-by: "Eric W. Biederman" --- include/linux/sched/signal.h | 11 ----------- 1 file changed, 11 deletions(-) (limited to 'include/linux') diff --git a/include/linux/sched/signal.h b/include/linux/sched/signal.h index 3e5b090c16d4..0ee5e696c5d8 100644 --- a/include/linux/sched/signal.h +++ b/include/linux/sched/signal.h @@ -654,17 +654,6 @@ static inline bool thread_group_leader(struct task_struct *p) return p->exit_signal >= 0; } -/* Do to the insanities of de_thread it is possible for a process - * to have the pid of the thread group leader without actually being - * the thread group leader. For iteration through the pids in proc - * all we care about is that we have a task with the appropriate - * pid, we don't actually care if we have the right task. - */ -static inline bool has_group_leader_pid(struct task_struct *p) -{ - return task_pid(p) == task_tgid(p); -} - static inline bool same_thread_group(struct task_struct *p1, struct task_struct *p2) { -- cgit v1.2.3 From a45d88530b2552ad5ea0da18861600b4ecc9d0c7 Mon Sep 17 00:00:00 2001 From: Richard Guy Briggs Date: Wed, 22 Apr 2020 17:39:29 -0400 Subject: netfilter: add audit table unregister actions Audit the action of unregistering ebtables and x_tables. 
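A standalone sketch of the op table after this change; as in the kernel's audit_nfcfgs[], each op value doubles as its own table index:

#include <stdio.h>

enum audit_nfcfgop {
        AUDIT_XT_OP_REGISTER,
        AUDIT_XT_OP_REPLACE,
        AUDIT_XT_OP_UNREGISTER,
};

struct audit_nfcfgop_tab {
        enum audit_nfcfgop op;
        const char *s;
};

static const struct audit_nfcfgop_tab audit_nfcfgs[] = {
        { AUDIT_XT_OP_REGISTER,   "register"   },
        { AUDIT_XT_OP_REPLACE,    "replace"    },
        { AUDIT_XT_OP_UNREGISTER, "unregister" },
};

int main(void)
{
        printf("op=%s\n", audit_nfcfgs[AUDIT_XT_OP_UNREGISTER].s);
        return 0;
}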
See: https://github.com/linux-audit/audit-kernel/issues/44 Signed-off-by: Richard Guy Briggs Signed-off-by: Paul Moore --- include/linux/audit.h | 1 + kernel/auditsc.c | 5 +++-- net/bridge/netfilter/ebtables.c | 2 ++ net/netfilter/x_tables.c | 2 ++ 4 files changed, 8 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/audit.h b/include/linux/audit.h index 5c7f07a9776d..1a2351508211 100644 --- a/include/linux/audit.h +++ b/include/linux/audit.h @@ -97,6 +97,7 @@ struct audit_ntp_data {}; enum audit_nfcfgop { AUDIT_XT_OP_REGISTER, AUDIT_XT_OP_REPLACE, + AUDIT_XT_OP_UNREGISTER, }; extern int is_audit_feature_set(int which); diff --git a/kernel/auditsc.c b/kernel/auditsc.c index 705beac0ce29..d281c18d1771 100644 --- a/kernel/auditsc.c +++ b/kernel/auditsc.c @@ -136,8 +136,9 @@ struct audit_nfcfgop_tab { }; const struct audit_nfcfgop_tab audit_nfcfgs[] = { - { AUDIT_XT_OP_REGISTER, "register" }, - { AUDIT_XT_OP_REPLACE, "replace" }, + { AUDIT_XT_OP_REGISTER, "register" }, + { AUDIT_XT_OP_REPLACE, "replace" }, + { AUDIT_XT_OP_UNREGISTER, "unregister" }, }; static int audit_match_perm(struct audit_context *ctx, int mask) diff --git a/net/bridge/netfilter/ebtables.c b/net/bridge/netfilter/ebtables.c index 0a148e68b6e1..4778db5601b0 100644 --- a/net/bridge/netfilter/ebtables.c +++ b/net/bridge/netfilter/ebtables.c @@ -1124,6 +1124,8 @@ static void __ebt_unregister_table(struct net *net, struct ebt_table *table) mutex_lock(&ebt_mutex); list_del(&table->list); mutex_unlock(&ebt_mutex); + audit_log_nfcfg(table->name, AF_BRIDGE, table->private->nentries, + AUDIT_XT_OP_UNREGISTER); EBT_ENTRY_ITERATE(table->private->entries, table->private->entries_size, ebt_cleanup_entry, net, NULL); if (table->private->nentries) diff --git a/net/netfilter/x_tables.c b/net/netfilter/x_tables.c index 8f8c5dbf603d..99a468be4a59 100644 --- a/net/netfilter/x_tables.c +++ b/net/netfilter/x_tables.c @@ -1472,6 +1472,8 @@ void *xt_unregister_table(struct xt_table *table) private = table->private; list_del(&table->list); mutex_unlock(&xt[table->af].mutex); + audit_log_nfcfg(table->name, table->af, private->number, + AUDIT_XT_OP_UNREGISTER); kfree(table); return private; -- cgit v1.2.3 From f9d041271cf44ca02eed0cc82e1a6d8c814c53ed Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Tue, 28 Apr 2020 17:16:05 -0700 Subject: bpf: Refactor bpf_link update handling Make bpf_link update support more generic by making it another bpf_link_ops method. This allows generic syscall handling code to be agnostic to various conditionally compiled features (e.g., the case of CONFIG_CGROUP_BPF). This also allows link type-specific code to remain static within its respective code base. Refactor existing bpf_cgroup_link code and take advantage of this.
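The core of the refactor is ordinary ops-table dispatch. A standalone sketch (the names mirror the patch; the stub bodies are invented):

#include <stddef.h>
#include <stdio.h>

#define EINVAL 22

struct bpf_link;

struct bpf_link_ops {
        /* Optional: link types that can't be updated leave this NULL. */
        int (*update_prog)(struct bpf_link *link, void *new_prog,
                           void *old_prog);
};

struct bpf_link {
        const struct bpf_link_ops *ops;
};

/* Generic syscall-side code: no #ifdefs, no knowledge of link types. */
static int link_update(struct bpf_link *link, void *new_prog, void *old_prog)
{
        if (link->ops->update_prog)
                return link->ops->update_prog(link, new_prog, old_prog);
        return -EINVAL;
}

static int cgroup_update_prog(struct bpf_link *link, void *new_prog,
                              void *old_prog)
{
        return 0; /* stand-in for cgroup_bpf_replace() */
}

static const struct bpf_link_ops cgroup_link_ops = {
        .update_prog = cgroup_update_prog,
};

int main(void)
{
        struct bpf_link link = { .ops = &cgroup_link_ops };

        printf("update rc=%d\n", link_update(&link, NULL, NULL));
        return 0;
}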
Signed-off-by: Andrii Nakryiko Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/20200429001614.1544-2-andriin@fb.com --- include/linux/bpf-cgroup.h | 12 ------------ include/linux/bpf.h | 3 ++- kernel/bpf/cgroup.c | 30 ++++++++++++++++++++++++++++-- kernel/bpf/syscall.c | 11 ++++------- kernel/cgroup/cgroup.c | 27 --------------------------- 5 files changed, 34 insertions(+), 49 deletions(-) (limited to 'include/linux') diff --git a/include/linux/bpf-cgroup.h b/include/linux/bpf-cgroup.h index 0b41fd5fc96b..a9cb9a5bf8e9 100644 --- a/include/linux/bpf-cgroup.h +++ b/include/linux/bpf-cgroup.h @@ -100,8 +100,6 @@ int __cgroup_bpf_attach(struct cgroup *cgrp, int __cgroup_bpf_detach(struct cgroup *cgrp, struct bpf_prog *prog, struct bpf_cgroup_link *link, enum bpf_attach_type type); -int __cgroup_bpf_replace(struct cgroup *cgrp, struct bpf_cgroup_link *link, - struct bpf_prog *new_prog); int __cgroup_bpf_query(struct cgroup *cgrp, const union bpf_attr *attr, union bpf_attr __user *uattr); @@ -112,8 +110,6 @@ int cgroup_bpf_attach(struct cgroup *cgrp, u32 flags); int cgroup_bpf_detach(struct cgroup *cgrp, struct bpf_prog *prog, enum bpf_attach_type type); -int cgroup_bpf_replace(struct bpf_link *link, struct bpf_prog *old_prog, - struct bpf_prog *new_prog); int cgroup_bpf_query(struct cgroup *cgrp, const union bpf_attr *attr, union bpf_attr __user *uattr); @@ -353,7 +349,6 @@ int cgroup_bpf_prog_query(const union bpf_attr *attr, #else struct bpf_prog; -struct bpf_link; struct cgroup_bpf {}; static inline int cgroup_bpf_inherit(struct cgroup *cgrp) { return 0; } static inline void cgroup_bpf_offline(struct cgroup *cgrp) {} @@ -377,13 +372,6 @@ static inline int cgroup_bpf_link_attach(const union bpf_attr *attr, return -EINVAL; } -static inline int cgroup_bpf_replace(struct bpf_link *link, - struct bpf_prog *old_prog, - struct bpf_prog *new_prog) -{ - return -EINVAL; -} - static inline int cgroup_bpf_prog_query(const union bpf_attr *attr, union bpf_attr __user *uattr) { diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 10960cfabea4..81c8620cb4c4 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -1093,7 +1093,8 @@ struct bpf_link { struct bpf_link_ops { void (*release)(struct bpf_link *link); void (*dealloc)(struct bpf_link *link); - + int (*update_prog)(struct bpf_link *link, struct bpf_prog *new_prog, + struct bpf_prog *old_prog); }; void bpf_link_init(struct bpf_link *link, const struct bpf_link_ops *ops, diff --git a/kernel/bpf/cgroup.c b/kernel/bpf/cgroup.c index bf634959885c..da6e48e802b2 100644 --- a/kernel/bpf/cgroup.c +++ b/kernel/bpf/cgroup.c @@ -557,8 +557,9 @@ found: * * Must be called with cgroup_mutex held. 
*/ -int __cgroup_bpf_replace(struct cgroup *cgrp, struct bpf_cgroup_link *link, - struct bpf_prog *new_prog) +static int __cgroup_bpf_replace(struct cgroup *cgrp, + struct bpf_cgroup_link *link, + struct bpf_prog *new_prog) { struct list_head *progs = &cgrp->bpf.progs[link->type]; struct bpf_prog *old_prog; @@ -583,6 +584,30 @@ int __cgroup_bpf_replace(struct cgroup *cgrp, struct bpf_cgroup_link *link, return 0; } +static int cgroup_bpf_replace(struct bpf_link *link, struct bpf_prog *new_prog, + struct bpf_prog *old_prog) +{ + struct bpf_cgroup_link *cg_link; + int ret; + + cg_link = container_of(link, struct bpf_cgroup_link, link); + + mutex_lock(&cgroup_mutex); + /* link might have been auto-released by dying cgroup, so fail */ + if (!cg_link->cgroup) { + ret = -EINVAL; + goto out_unlock; + } + if (old_prog && link->prog != old_prog) { + ret = -EPERM; + goto out_unlock; + } + ret = __cgroup_bpf_replace(cg_link->cgroup, cg_link, new_prog); +out_unlock: + mutex_unlock(&cgroup_mutex); + return ret; +} + static struct bpf_prog_list *find_detach_entry(struct list_head *progs, struct bpf_prog *prog, struct bpf_cgroup_link *link, @@ -811,6 +836,7 @@ static void bpf_cgroup_link_dealloc(struct bpf_link *link) const struct bpf_link_ops bpf_cgroup_link_lops = { .release = bpf_cgroup_link_release, .dealloc = bpf_cgroup_link_dealloc, + .update_prog = cgroup_bpf_replace, }; int cgroup_bpf_link_attach(const union bpf_attr *attr, struct bpf_prog *prog) diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 7626b8024471..f5358e1462eb 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -3645,13 +3645,10 @@ static int link_update(union bpf_attr *attr) goto out_put_progs; } -#ifdef CONFIG_CGROUP_BPF - if (link->ops == &bpf_cgroup_link_lops) { - ret = cgroup_bpf_replace(link, old_prog, new_prog); - goto out_put_progs; - } -#endif - ret = -EINVAL; + if (link->ops->update_prog) + ret = link->ops->update_prog(link, new_prog, old_prog); + else + ret = EINVAL; out_put_progs: if (old_prog) diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c index 06b5ea9d899d..557a9b9d2244 100644 --- a/kernel/cgroup/cgroup.c +++ b/kernel/cgroup/cgroup.c @@ -6508,33 +6508,6 @@ int cgroup_bpf_attach(struct cgroup *cgrp, return ret; } -int cgroup_bpf_replace(struct bpf_link *link, struct bpf_prog *old_prog, - struct bpf_prog *new_prog) -{ - struct bpf_cgroup_link *cg_link; - int ret; - - if (link->ops != &bpf_cgroup_link_lops) - return -EINVAL; - - cg_link = container_of(link, struct bpf_cgroup_link, link); - - mutex_lock(&cgroup_mutex); - /* link might have been auto-released by dying cgroup, so fail */ - if (!cg_link->cgroup) { - ret = -EINVAL; - goto out_unlock; - } - if (old_prog && link->prog != old_prog) { - ret = -EPERM; - goto out_unlock; - } - ret = __cgroup_bpf_replace(cg_link->cgroup, cg_link, new_prog); -out_unlock: - mutex_unlock(&cgroup_mutex); - return ret; -} - int cgroup_bpf_detach(struct cgroup *cgrp, struct bpf_prog *prog, enum bpf_attach_type type) { -- cgit v1.2.3 From a3b80e1078943dc12553166fb08e258463dec013 Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Tue, 28 Apr 2020 17:16:06 -0700 Subject: bpf: Allocate ID for bpf_link Generate ID for each bpf_link using IDR, similarly to bpf_map and bpf_prog. bpf_link creation, initialization, attachment, and exposing to user-space through FD and ID is a complicated multi-step process, abstract it away through bpf_link_primer and bpf_link_prime(), bpf_link_settle(), and bpf_link_cleanup() internal API. 
They guarantee that until bpf_link is properly attached, user-space won't be able to access partially-initialized bpf_link either from FD or ID. All this allows to simplify bpf_link attachment and error handling code. Signed-off-by: Andrii Nakryiko Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/20200429001614.1544-3-andriin@fb.com --- include/linux/bpf.h | 17 ++++-- include/uapi/linux/bpf.h | 1 + kernel/bpf/cgroup.c | 14 ++--- kernel/bpf/syscall.c | 143 ++++++++++++++++++++++++++++++++--------------- 4 files changed, 118 insertions(+), 57 deletions(-) (limited to 'include/linux') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 81c8620cb4c4..875d1f0af803 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -1085,11 +1085,19 @@ int bpf_prog_new_fd(struct bpf_prog *prog); struct bpf_link { atomic64_t refcnt; + u32 id; const struct bpf_link_ops *ops; struct bpf_prog *prog; struct work_struct work; }; +struct bpf_link_primer { + struct bpf_link *link; + struct file *file; + int fd; + u32 id; +}; + struct bpf_link_ops { void (*release)(struct bpf_link *link); void (*dealloc)(struct bpf_link *link); @@ -1097,10 +1105,11 @@ struct bpf_link_ops { struct bpf_prog *old_prog); }; -void bpf_link_init(struct bpf_link *link, const struct bpf_link_ops *ops, - struct bpf_prog *prog); -void bpf_link_cleanup(struct bpf_link *link, struct file *link_file, - int link_fd); +void bpf_link_init(struct bpf_link *link, + const struct bpf_link_ops *ops, struct bpf_prog *prog); +int bpf_link_prime(struct bpf_link *link, struct bpf_link_primer *primer); +int bpf_link_settle(struct bpf_link_primer *primer); +void bpf_link_cleanup(struct bpf_link_primer *primer); void bpf_link_inc(struct bpf_link *link); void bpf_link_put(struct bpf_link *link); int bpf_link_new_fd(struct bpf_link *link); diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 4a6c47f3febe..6121aa487465 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -523,6 +523,7 @@ union bpf_attr { __u32 prog_id; __u32 map_id; __u32 btf_id; + __u32 link_id; }; __u32 next_id; __u32 open_flags; diff --git a/kernel/bpf/cgroup.c b/kernel/bpf/cgroup.c index da6e48e802b2..1bdf37fca879 100644 --- a/kernel/bpf/cgroup.c +++ b/kernel/bpf/cgroup.c @@ -841,10 +841,10 @@ const struct bpf_link_ops bpf_cgroup_link_lops = { int cgroup_bpf_link_attach(const union bpf_attr *attr, struct bpf_prog *prog) { + struct bpf_link_primer link_primer; struct bpf_cgroup_link *link; - struct file *link_file; struct cgroup *cgrp; - int err, link_fd; + int err; if (attr->link_create.flags) return -EINVAL; @@ -862,22 +862,20 @@ int cgroup_bpf_link_attach(const union bpf_attr *attr, struct bpf_prog *prog) link->cgroup = cgrp; link->type = attr->link_create.attach_type; - link_file = bpf_link_new_file(&link->link, &link_fd); - if (IS_ERR(link_file)) { + err = bpf_link_prime(&link->link, &link_primer); + if (err) { kfree(link); - err = PTR_ERR(link_file); goto out_put_cgroup; } err = cgroup_bpf_attach(cgrp, NULL, NULL, link, link->type, BPF_F_ALLOW_MULTI); if (err) { - bpf_link_cleanup(&link->link, link_file, link_fd); + bpf_link_cleanup(&link_primer); goto out_put_cgroup; } - fd_install(link_fd, link_file); - return link_fd; + return bpf_link_settle(&link_primer); out_put_cgroup: cgroup_put(cgrp); diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index f5358e1462eb..5439e05e3d25 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -42,6 +42,8 @@ static DEFINE_IDR(prog_idr); static 
DEFINE_SPINLOCK(prog_idr_lock); static DEFINE_IDR(map_idr); static DEFINE_SPINLOCK(map_idr_lock); +static DEFINE_IDR(link_idr); +static DEFINE_SPINLOCK(link_idr_lock); int sysctl_unprivileged_bpf_disabled __read_mostly; @@ -2181,25 +2183,38 @@ static int bpf_obj_get(const union bpf_attr *attr) attr->file_flags); } -void bpf_link_init(struct bpf_link *link, const struct bpf_link_ops *ops, - struct bpf_prog *prog) +void bpf_link_init(struct bpf_link *link, + const struct bpf_link_ops *ops, struct bpf_prog *prog) { atomic64_set(&link->refcnt, 1); + link->id = 0; link->ops = ops; link->prog = prog; } +static void bpf_link_free_id(int id) +{ + if (!id) + return; + + spin_lock_bh(&link_idr_lock); + idr_remove(&link_idr, id); + spin_unlock_bh(&link_idr_lock); +} + /* Clean up bpf_link and corresponding anon_inode file and FD. After * anon_inode is created, bpf_link can't be just kfree()'d due to deferred - * anon_inode's release() call. This helper manages marking bpf_link as - * defunct, releases anon_inode file and puts reserved FD. + * anon_inode's release() call. This helper marksbpf_link as + * defunct, releases anon_inode file and puts reserved FD. bpf_prog's refcnt + * is not decremented, it's the responsibility of a calling code that failed + * to complete bpf_link initialization. */ -void bpf_link_cleanup(struct bpf_link *link, struct file *link_file, - int link_fd) +void bpf_link_cleanup(struct bpf_link_primer *primer) { - link->prog = NULL; - fput(link_file); - put_unused_fd(link_fd); + primer->link->prog = NULL; + bpf_link_free_id(primer->id); + fput(primer->file); + put_unused_fd(primer->fd); } void bpf_link_inc(struct bpf_link *link) @@ -2210,6 +2225,7 @@ void bpf_link_inc(struct bpf_link *link) /* bpf_link_free is guaranteed to be called from process context */ static void bpf_link_free(struct bpf_link *link) { + bpf_link_free_id(link->id); if (link->prog) { /* detach BPF program, clean up used resources */ link->ops->release(link); @@ -2275,9 +2291,11 @@ static void bpf_link_show_fdinfo(struct seq_file *m, struct file *filp) bin2hex(prog_tag, prog->tag, sizeof(prog->tag)); seq_printf(m, "link_type:\t%s\n" + "link_id:\t%u\n" "prog_tag:\t%s\n" "prog_id:\t%u\n", link_type, + link->id, prog_tag, prog->aux->id); } @@ -2292,36 +2310,76 @@ static const struct file_operations bpf_link_fops = { .write = bpf_dummy_write, }; -int bpf_link_new_fd(struct bpf_link *link) +static int bpf_link_alloc_id(struct bpf_link *link) { - return anon_inode_getfd("bpf-link", &bpf_link_fops, link, O_CLOEXEC); -} + int id; + + idr_preload(GFP_KERNEL); + spin_lock_bh(&link_idr_lock); + id = idr_alloc_cyclic(&link_idr, link, 1, INT_MAX, GFP_ATOMIC); + spin_unlock_bh(&link_idr_lock); + idr_preload_end(); -/* Similar to bpf_link_new_fd, create anon_inode for given bpf_link, but - * instead of immediately installing fd in fdtable, just reserve it and - * return. Caller then need to either install it with fd_install(fd, file) or - * release with put_unused_fd(fd). - * This is useful for cases when bpf_link attachment/detachment are - * complicated and expensive operations and should be delayed until all the fd - * reservation and anon_inode creation succeeds. + return id; +} + +/* Prepare bpf_link to be exposed to user-space by allocating anon_inode file, + * reserving unused FD and allocating ID from link_idr. This is to be paired + * with bpf_link_settle() to install FD and ID and expose bpf_link to + * user-space, if bpf_link is successfully attached. 
If not, bpf_link and + * pre-allocated resources are to be freed with bpf_cleanup() call. All the + * transient state is passed around in struct bpf_link_primer. + * This is preferred way to create and initialize bpf_link, especially when + * there are complicated and expensive operations inbetween creating bpf_link + * itself and attaching it to BPF hook. By using bpf_link_prime() and + * bpf_link_settle() kernel code using bpf_link doesn't have to perform + * expensive (and potentially failing) roll back operations in a rare case + * that file, FD, or ID can't be allocated. */ -struct file *bpf_link_new_file(struct bpf_link *link, int *reserved_fd) +int bpf_link_prime(struct bpf_link *link, struct bpf_link_primer *primer) { struct file *file; - int fd; + int fd, id; fd = get_unused_fd_flags(O_CLOEXEC); if (fd < 0) - return ERR_PTR(fd); + return fd; file = anon_inode_getfile("bpf_link", &bpf_link_fops, link, O_CLOEXEC); if (IS_ERR(file)) { put_unused_fd(fd); - return file; + return PTR_ERR(file); } - *reserved_fd = fd; - return file; + id = bpf_link_alloc_id(link); + if (id < 0) { + put_unused_fd(fd); + fput(file); + return id; + } + + primer->link = link; + primer->file = file; + primer->fd = fd; + primer->id = id; + return 0; +} + +int bpf_link_settle(struct bpf_link_primer *primer) +{ + /* make bpf_link fetchable by ID */ + spin_lock_bh(&link_idr_lock); + primer->link->id = primer->id; + spin_unlock_bh(&link_idr_lock); + /* make bpf_link fetchable by FD */ + fd_install(primer->fd, primer->file); + /* pass through installed FD */ + return primer->fd; +} + +int bpf_link_new_fd(struct bpf_link *link) +{ + return anon_inode_getfd("bpf-link", &bpf_link_fops, link, O_CLOEXEC); } struct bpf_link *bpf_link_get_from_fd(u32 ufd) @@ -2367,9 +2425,9 @@ static const struct bpf_link_ops bpf_tracing_link_lops = { static int bpf_tracing_prog_attach(struct bpf_prog *prog) { + struct bpf_link_primer link_primer; struct bpf_tracing_link *link; - struct file *link_file; - int link_fd, err; + int err; switch (prog->type) { case BPF_PROG_TYPE_TRACING: @@ -2404,22 +2462,19 @@ static int bpf_tracing_prog_attach(struct bpf_prog *prog) } bpf_link_init(&link->link, &bpf_tracing_link_lops, prog); - link_file = bpf_link_new_file(&link->link, &link_fd); - if (IS_ERR(link_file)) { + err = bpf_link_prime(&link->link, &link_primer); + if (err) { kfree(link); - err = PTR_ERR(link_file); goto out_put_prog; } err = bpf_trampoline_link_prog(prog); if (err) { - bpf_link_cleanup(&link->link, link_file, link_fd); + bpf_link_cleanup(&link_primer); goto out_put_prog; } - fd_install(link_fd, link_file); - return link_fd; - + return bpf_link_settle(&link_primer); out_put_prog: bpf_prog_put(prog); return err; @@ -2447,7 +2502,7 @@ static void bpf_raw_tp_link_dealloc(struct bpf_link *link) kfree(raw_tp); } -static const struct bpf_link_ops bpf_raw_tp_lops = { +static const struct bpf_link_ops bpf_raw_tp_link_lops = { .release = bpf_raw_tp_link_release, .dealloc = bpf_raw_tp_link_dealloc, }; @@ -2456,13 +2511,13 @@ static const struct bpf_link_ops bpf_raw_tp_lops = { static int bpf_raw_tracepoint_open(const union bpf_attr *attr) { + struct bpf_link_primer link_primer; struct bpf_raw_tp_link *link; struct bpf_raw_event_map *btp; - struct file *link_file; struct bpf_prog *prog; const char *tp_name; char buf[128]; - int link_fd, err; + int err; if (CHECK_ATTR(BPF_RAW_TRACEPOINT_OPEN)) return -EINVAL; @@ -2515,24 +2570,22 @@ static int bpf_raw_tracepoint_open(const union bpf_attr *attr) err = -ENOMEM; goto out_put_btp; } - 
bpf_link_init(&link->link, &bpf_raw_tp_lops, prog); + bpf_link_init(&link->link, &bpf_raw_tp_link_lops, prog); link->btp = btp; - link_file = bpf_link_new_file(&link->link, &link_fd); - if (IS_ERR(link_file)) { + err = bpf_link_prime(&link->link, &link_primer); + if (err) { kfree(link); - err = PTR_ERR(link_file); goto out_put_btp; } err = bpf_probe_register(link->btp, prog); if (err) { - bpf_link_cleanup(&link->link, link_file, link_fd); + bpf_link_cleanup(&link_primer); goto out_put_btp; } - fd_install(link_fd, link_file); - return link_fd; + return bpf_link_settle(&link_primer); out_put_btp: bpf_put_raw_tracepoint(btp); @@ -3464,7 +3517,7 @@ static int bpf_task_fd_query(const union bpf_attr *attr, if (file->f_op == &bpf_link_fops) { struct bpf_link *link = file->private_data; - if (link->ops == &bpf_raw_tp_lops) { + if (link->ops == &bpf_raw_tp_link_lops) { struct bpf_raw_tp_link *raw_tp = container_of(link, struct bpf_raw_tp_link, link); struct bpf_raw_event_map *btp = raw_tp->btp; -- cgit v1.2.3 From f2e10bff16a0fdd41ba278c84da9813700e356af Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Tue, 28 Apr 2020 17:16:08 -0700 Subject: bpf: Add support for BPF_OBJ_GET_INFO_BY_FD for bpf_link Add ability to fetch bpf_link details through BPF_OBJ_GET_INFO_BY_FD command. Also enhance show_fdinfo to potentially include bpf_link type-specific information (similarly to obj_info). Also introduce enum bpf_link_type stored in bpf_link itself and expose it in UAPI. bpf_link_tracing also now will store and return bpf_attach_type. Signed-off-by: Andrii Nakryiko Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/20200429001614.1544-5-andriin@fb.com --- include/linux/bpf-cgroup.h | 2 - include/linux/bpf.h | 8 ++- include/linux/bpf_types.h | 6 ++ include/uapi/linux/bpf.h | 28 ++++++++ kernel/bpf/btf.c | 2 + kernel/bpf/cgroup.c | 43 +++++++++++- kernel/bpf/syscall.c | 155 ++++++++++++++++++++++++++++++++++++----- kernel/bpf/verifier.c | 2 + tools/include/uapi/linux/bpf.h | 31 +++++++++ 9 files changed, 253 insertions(+), 24 deletions(-) (limited to 'include/linux') diff --git a/include/linux/bpf-cgroup.h b/include/linux/bpf-cgroup.h index a9cb9a5bf8e9..272626cc3fc9 100644 --- a/include/linux/bpf-cgroup.h +++ b/include/linux/bpf-cgroup.h @@ -57,8 +57,6 @@ struct bpf_cgroup_link { enum bpf_attach_type type; }; -extern const struct bpf_link_ops bpf_cgroup_link_lops; - struct bpf_prog_list { struct list_head node; struct bpf_prog *prog; diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 875d1f0af803..c07b1d2f3824 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -1026,9 +1026,11 @@ extern const struct file_operations bpf_prog_fops; extern const struct bpf_verifier_ops _name ## _verifier_ops; #define BPF_MAP_TYPE(_id, _ops) \ extern const struct bpf_map_ops _ops; +#define BPF_LINK_TYPE(_id, _name) #include #undef BPF_PROG_TYPE #undef BPF_MAP_TYPE +#undef BPF_LINK_TYPE extern const struct bpf_prog_ops bpf_offload_prog_ops; extern const struct bpf_verifier_ops tc_cls_act_analyzer_ops; @@ -1086,6 +1088,7 @@ int bpf_prog_new_fd(struct bpf_prog *prog); struct bpf_link { atomic64_t refcnt; u32 id; + enum bpf_link_type type; const struct bpf_link_ops *ops; struct bpf_prog *prog; struct work_struct work; @@ -1103,9 +1106,12 @@ struct bpf_link_ops { void (*dealloc)(struct bpf_link *link); int (*update_prog)(struct bpf_link *link, struct bpf_prog *new_prog, struct bpf_prog *old_prog); + void (*show_fdinfo)(const struct bpf_link *link, struct seq_file *seq); + int 
(*fill_link_info)(const struct bpf_link *link, + struct bpf_link_info *info); }; -void bpf_link_init(struct bpf_link *link, +void bpf_link_init(struct bpf_link *link, enum bpf_link_type type, const struct bpf_link_ops *ops, struct bpf_prog *prog); int bpf_link_prime(struct bpf_link *link, struct bpf_link_primer *primer); int bpf_link_settle(struct bpf_link_primer *primer); diff --git a/include/linux/bpf_types.h b/include/linux/bpf_types.h index ba0c2d56f8a3..8345cdf553b8 100644 --- a/include/linux/bpf_types.h +++ b/include/linux/bpf_types.h @@ -118,3 +118,9 @@ BPF_MAP_TYPE(BPF_MAP_TYPE_STACK, stack_map_ops) #if defined(CONFIG_BPF_JIT) BPF_MAP_TYPE(BPF_MAP_TYPE_STRUCT_OPS, bpf_struct_ops_map_ops) #endif + +BPF_LINK_TYPE(BPF_LINK_TYPE_RAW_TRACEPOINT, raw_tracepoint) +BPF_LINK_TYPE(BPF_LINK_TYPE_TRACING, tracing) +#ifdef CONFIG_CGROUP_BPF +BPF_LINK_TYPE(BPF_LINK_TYPE_CGROUP, cgroup) +#endif diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 7e6541fceade..0eccafae55bb 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -222,6 +222,15 @@ enum bpf_attach_type { #define MAX_BPF_ATTACH_TYPE __MAX_BPF_ATTACH_TYPE +enum bpf_link_type { + BPF_LINK_TYPE_UNSPEC = 0, + BPF_LINK_TYPE_RAW_TRACEPOINT = 1, + BPF_LINK_TYPE_TRACING = 2, + BPF_LINK_TYPE_CGROUP = 3, + + MAX_BPF_LINK_TYPE, +}; + /* cgroup-bpf attach flags used in BPF_PROG_ATTACH command * * NONE(default): No further bpf programs allowed in the subtree. @@ -3612,6 +3621,25 @@ struct bpf_btf_info { __u32 id; } __attribute__((aligned(8))); +struct bpf_link_info { + __u32 type; + __u32 id; + __u32 prog_id; + union { + struct { + __aligned_u64 tp_name; /* in/out: tp_name buffer ptr */ + __u32 tp_name_len; /* in/out: tp_name buffer len */ + } raw_tracepoint; + struct { + __u32 attach_type; + } tracing; + struct { + __u64 cgroup_id; + __u32 attach_type; + } cgroup; + }; +} __attribute__((aligned(8))); + /* User bpf_sock_addr struct to access socket fields and sockaddr struct passed * by user and intended to be used by socket (e.g. to bind to, depends on * attach attach type). 
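For a sense of how this UAPI is meant to be consumed, here is a minimal user-space sketch that queries the new bpf_link_info for an existing link FD (the function name and output format are illustrative, not part of the patch; error handling is trimmed):

#include <stdio.h>
#include <string.h>
#include <unistd.h>
#include <sys/syscall.h>
#include <linux/bpf.h>

/* Sketch: fetch bpf_link_info for an already-created link FD. */
static int get_link_info(int link_fd)
{
	struct bpf_link_info info;
	union bpf_attr attr;

	memset(&info, 0, sizeof(info));
	memset(&attr, 0, sizeof(attr));
	attr.info.bpf_fd = link_fd;
	attr.info.info_len = sizeof(info);
	attr.info.info = (__u64)(unsigned long)&info;

	if (syscall(__NR_bpf, BPF_OBJ_GET_INFO_BY_FD, &attr, sizeof(attr)))
		return -1;

	printf("type=%u id=%u prog_id=%u\n", info.type, info.id, info.prog_id);
	return 0;
}
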
diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c index d65c6912bdaf..a2cfba89a8e1 100644 --- a/kernel/bpf/btf.c +++ b/kernel/bpf/btf.c @@ -3482,6 +3482,7 @@ extern char __weak __stop_BTF[]; extern struct btf *btf_vmlinux; #define BPF_MAP_TYPE(_id, _ops) +#define BPF_LINK_TYPE(_id, _name) static union { struct bpf_ctx_convert { #define BPF_PROG_TYPE(_id, _name, prog_ctx_type, kern_ctx_type) \ @@ -3508,6 +3509,7 @@ static u8 bpf_ctx_convert_map[] = { 0, /* avoid empty array */ }; #undef BPF_MAP_TYPE +#undef BPF_LINK_TYPE static const struct btf_member * btf_get_prog_ctx_type(struct bpf_verifier_log *log, struct btf *btf, diff --git a/kernel/bpf/cgroup.c b/kernel/bpf/cgroup.c index 1bdf37fca879..5c0e964105ac 100644 --- a/kernel/bpf/cgroup.c +++ b/kernel/bpf/cgroup.c @@ -833,10 +833,48 @@ static void bpf_cgroup_link_dealloc(struct bpf_link *link) kfree(cg_link); } -const struct bpf_link_ops bpf_cgroup_link_lops = { +static void bpf_cgroup_link_show_fdinfo(const struct bpf_link *link, + struct seq_file *seq) +{ + struct bpf_cgroup_link *cg_link = + container_of(link, struct bpf_cgroup_link, link); + u64 cg_id = 0; + + mutex_lock(&cgroup_mutex); + if (cg_link->cgroup) + cg_id = cgroup_id(cg_link->cgroup); + mutex_unlock(&cgroup_mutex); + + seq_printf(seq, + "cgroup_id:\t%llu\n" + "attach_type:\t%d\n", + cg_id, + cg_link->type); +} + +static int bpf_cgroup_link_fill_link_info(const struct bpf_link *link, + struct bpf_link_info *info) +{ + struct bpf_cgroup_link *cg_link = + container_of(link, struct bpf_cgroup_link, link); + u64 cg_id = 0; + + mutex_lock(&cgroup_mutex); + if (cg_link->cgroup) + cg_id = cgroup_id(cg_link->cgroup); + mutex_unlock(&cgroup_mutex); + + info->cgroup.cgroup_id = cg_id; + info->cgroup.attach_type = cg_link->type; + return 0; +} + +static const struct bpf_link_ops bpf_cgroup_link_lops = { .release = bpf_cgroup_link_release, .dealloc = bpf_cgroup_link_dealloc, .update_prog = cgroup_bpf_replace, + .show_fdinfo = bpf_cgroup_link_show_fdinfo, + .fill_link_info = bpf_cgroup_link_fill_link_info, }; int cgroup_bpf_link_attach(const union bpf_attr *attr, struct bpf_prog *prog) @@ -858,7 +896,8 @@ int cgroup_bpf_link_attach(const union bpf_attr *attr, struct bpf_prog *prog) err = -ENOMEM; goto out_put_cgroup; } - bpf_link_init(&link->link, &bpf_cgroup_link_lops, prog); + bpf_link_init(&link->link, BPF_LINK_TYPE_CGROUP, &bpf_cgroup_link_lops, + prog); link->cgroup = cgrp; link->type = attr->link_create.attach_type; diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 1c213a730502..d23c04cbe14f 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -51,9 +51,11 @@ static const struct bpf_map_ops * const bpf_map_types[] = { #define BPF_PROG_TYPE(_id, _name, prog_ctx_type, kern_ctx_type) #define BPF_MAP_TYPE(_id, _ops) \ [_id] = &_ops, +#define BPF_LINK_TYPE(_id, _name) #include #undef BPF_PROG_TYPE #undef BPF_MAP_TYPE +#undef BPF_LINK_TYPE }; /* @@ -1548,9 +1550,11 @@ static const struct bpf_prog_ops * const bpf_prog_types[] = { #define BPF_PROG_TYPE(_id, _name, prog_ctx_type, kern_ctx_type) \ [_id] = & _name ## _prog_ops, #define BPF_MAP_TYPE(_id, _ops) +#define BPF_LINK_TYPE(_id, _name) #include #undef BPF_PROG_TYPE #undef BPF_MAP_TYPE +#undef BPF_LINK_TYPE }; static int find_prog_type(enum bpf_prog_type type, struct bpf_prog *prog) @@ -2183,10 +2187,11 @@ static int bpf_obj_get(const union bpf_attr *attr) attr->file_flags); } -void bpf_link_init(struct bpf_link *link, +void bpf_link_init(struct bpf_link *link, enum bpf_link_type type, const struct bpf_link_ops 
*ops, struct bpf_prog *prog) { atomic64_set(&link->refcnt, 1); + link->type = type; link->id = 0; link->ops = ops; link->prog = prog; @@ -2266,27 +2271,23 @@ static int bpf_link_release(struct inode *inode, struct file *filp) return 0; } -#ifdef CONFIG_PROC_FS -static const struct bpf_link_ops bpf_raw_tp_lops; -static const struct bpf_link_ops bpf_tracing_link_lops; +#define BPF_PROG_TYPE(_id, _name, prog_ctx_type, kern_ctx_type) +#define BPF_MAP_TYPE(_id, _ops) +#define BPF_LINK_TYPE(_id, _name) [_id] = #_name, +static const char *bpf_link_type_strs[] = { + [BPF_LINK_TYPE_UNSPEC] = "", +#include +}; +#undef BPF_PROG_TYPE +#undef BPF_MAP_TYPE +#undef BPF_LINK_TYPE +#ifdef CONFIG_PROC_FS static void bpf_link_show_fdinfo(struct seq_file *m, struct file *filp) { const struct bpf_link *link = filp->private_data; const struct bpf_prog *prog = link->prog; char prog_tag[sizeof(prog->tag) * 2 + 1] = { }; - const char *link_type; - - if (link->ops == &bpf_raw_tp_lops) - link_type = "raw_tracepoint"; - else if (link->ops == &bpf_tracing_link_lops) - link_type = "tracing"; -#ifdef CONFIG_CGROUP_BPF - else if (link->ops == &bpf_cgroup_link_lops) - link_type = "cgroup"; -#endif - else - link_type = "unknown"; bin2hex(prog_tag, prog->tag, sizeof(prog->tag)); seq_printf(m, @@ -2294,10 +2295,12 @@ static void bpf_link_show_fdinfo(struct seq_file *m, struct file *filp) "link_id:\t%u\n" "prog_tag:\t%s\n" "prog_id:\t%u\n", - link_type, + bpf_link_type_strs[link->type], link->id, prog_tag, prog->aux->id); + if (link->ops->show_fdinfo) + link->ops->show_fdinfo(link, m); } #endif @@ -2403,6 +2406,7 @@ struct bpf_link *bpf_link_get_from_fd(u32 ufd) struct bpf_tracing_link { struct bpf_link link; + enum bpf_attach_type attach_type; }; static void bpf_tracing_link_release(struct bpf_link *link) @@ -2418,9 +2422,33 @@ static void bpf_tracing_link_dealloc(struct bpf_link *link) kfree(tr_link); } +static void bpf_tracing_link_show_fdinfo(const struct bpf_link *link, + struct seq_file *seq) +{ + struct bpf_tracing_link *tr_link = + container_of(link, struct bpf_tracing_link, link); + + seq_printf(seq, + "attach_type:\t%d\n", + tr_link->attach_type); +} + +static int bpf_tracing_link_fill_link_info(const struct bpf_link *link, + struct bpf_link_info *info) +{ + struct bpf_tracing_link *tr_link = + container_of(link, struct bpf_tracing_link, link); + + info->tracing.attach_type = tr_link->attach_type; + + return 0; +} + static const struct bpf_link_ops bpf_tracing_link_lops = { .release = bpf_tracing_link_release, .dealloc = bpf_tracing_link_dealloc, + .show_fdinfo = bpf_tracing_link_show_fdinfo, + .fill_link_info = bpf_tracing_link_fill_link_info, }; static int bpf_tracing_prog_attach(struct bpf_prog *prog) @@ -2460,7 +2488,9 @@ static int bpf_tracing_prog_attach(struct bpf_prog *prog) err = -ENOMEM; goto out_put_prog; } - bpf_link_init(&link->link, &bpf_tracing_link_lops, prog); + bpf_link_init(&link->link, BPF_LINK_TYPE_TRACING, + &bpf_tracing_link_lops, prog); + link->attach_type = prog->expected_attach_type; err = bpf_link_prime(&link->link, &link_primer); if (err) { @@ -2502,9 +2532,56 @@ static void bpf_raw_tp_link_dealloc(struct bpf_link *link) kfree(raw_tp); } +static void bpf_raw_tp_link_show_fdinfo(const struct bpf_link *link, + struct seq_file *seq) +{ + struct bpf_raw_tp_link *raw_tp_link = + container_of(link, struct bpf_raw_tp_link, link); + + seq_printf(seq, + "tp_name:\t%s\n", + raw_tp_link->btp->tp->name); +} + +static int bpf_raw_tp_link_fill_link_info(const struct bpf_link *link, + struct 
bpf_link_info *info) +{ + struct bpf_raw_tp_link *raw_tp_link = + container_of(link, struct bpf_raw_tp_link, link); + char __user *ubuf = u64_to_user_ptr(info->raw_tracepoint.tp_name); + const char *tp_name = raw_tp_link->btp->tp->name; + u32 ulen = info->raw_tracepoint.tp_name_len; + size_t tp_len = strlen(tp_name); + + if (ulen && !ubuf) + return -EINVAL; + + info->raw_tracepoint.tp_name_len = tp_len + 1; + + if (!ubuf) + return 0; + + if (ulen >= tp_len + 1) { + if (copy_to_user(ubuf, tp_name, tp_len + 1)) + return -EFAULT; + } else { + char zero = '\0'; + + if (copy_to_user(ubuf, tp_name, ulen - 1)) + return -EFAULT; + if (put_user(zero, ubuf + ulen - 1)) + return -EFAULT; + return -ENOSPC; + } + + return 0; +} + static const struct bpf_link_ops bpf_raw_tp_link_lops = { .release = bpf_raw_tp_link_release, .dealloc = bpf_raw_tp_link_dealloc, + .show_fdinfo = bpf_raw_tp_link_show_fdinfo, + .fill_link_info = bpf_raw_tp_link_fill_link_info, }; #define BPF_RAW_TRACEPOINT_OPEN_LAST_FIELD raw_tracepoint.prog_fd @@ -2570,7 +2647,8 @@ static int bpf_raw_tracepoint_open(const union bpf_attr *attr) err = -ENOMEM; goto out_put_btp; } - bpf_link_init(&link->link, &bpf_raw_tp_link_lops, prog); + bpf_link_init(&link->link, BPF_LINK_TYPE_RAW_TRACEPOINT, + &bpf_raw_tp_link_lops, prog); link->btp = btp; err = bpf_link_prime(&link->link, &link_primer); @@ -3366,6 +3444,42 @@ static int bpf_btf_get_info_by_fd(struct btf *btf, return btf_get_info_by_fd(btf, attr, uattr); } +static int bpf_link_get_info_by_fd(struct bpf_link *link, + const union bpf_attr *attr, + union bpf_attr __user *uattr) +{ + struct bpf_link_info __user *uinfo = u64_to_user_ptr(attr->info.info); + struct bpf_link_info info; + u32 info_len = attr->info.info_len; + int err; + + err = bpf_check_uarg_tail_zero(uinfo, sizeof(info), info_len); + if (err) + return err; + info_len = min_t(u32, sizeof(info), info_len); + + memset(&info, 0, sizeof(info)); + if (copy_from_user(&info, uinfo, info_len)) + return -EFAULT; + + info.type = link->type; + info.id = link->id; + info.prog_id = link->prog->aux->id; + + if (link->ops->fill_link_info) { + err = link->ops->fill_link_info(link, &info); + if (err) + return err; + } + + if (copy_to_user(uinfo, &info, info_len) || + put_user(info_len, &uattr->info.info_len)) + return -EFAULT; + + return 0; +} + + #define BPF_OBJ_GET_INFO_BY_FD_LAST_FIELD info.info static int bpf_obj_get_info_by_fd(const union bpf_attr *attr, @@ -3390,6 +3504,9 @@ static int bpf_obj_get_info_by_fd(const union bpf_attr *attr, uattr); else if (f.file->f_op == &btf_fops) err = bpf_btf_get_info_by_fd(f.file->private_data, attr, uattr); + else if (f.file->f_op == &bpf_link_fops) + err = bpf_link_get_info_by_fd(f.file->private_data, + attr, uattr); else err = -EINVAL; diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 91728e0f27eb..2b337e32aa94 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -28,9 +28,11 @@ static const struct bpf_verifier_ops * const bpf_verifier_ops[] = { #define BPF_PROG_TYPE(_id, _name, prog_ctx_type, kern_ctx_type) \ [_id] = & _name ## _verifier_ops, #define BPF_MAP_TYPE(_id, _ops) +#define BPF_LINK_TYPE(_id, _name) #include #undef BPF_PROG_TYPE #undef BPF_MAP_TYPE +#undef BPF_LINK_TYPE }; /* bpf_check() is a static code analyzer that walks eBPF program diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h index 4a6c47f3febe..0eccafae55bb 100644 --- a/tools/include/uapi/linux/bpf.h +++ b/tools/include/uapi/linux/bpf.h @@ -113,6 +113,8 @@ enum bpf_cmd { 
BPF_MAP_DELETE_BATCH, BPF_LINK_CREATE, BPF_LINK_UPDATE, + BPF_LINK_GET_FD_BY_ID, + BPF_LINK_GET_NEXT_ID, }; enum bpf_map_type { @@ -220,6 +222,15 @@ enum bpf_attach_type { #define MAX_BPF_ATTACH_TYPE __MAX_BPF_ATTACH_TYPE +enum bpf_link_type { + BPF_LINK_TYPE_UNSPEC = 0, + BPF_LINK_TYPE_RAW_TRACEPOINT = 1, + BPF_LINK_TYPE_TRACING = 2, + BPF_LINK_TYPE_CGROUP = 3, + + MAX_BPF_LINK_TYPE, +}; + /* cgroup-bpf attach flags used in BPF_PROG_ATTACH command * * NONE(default): No further bpf programs allowed in the subtree. @@ -523,6 +534,7 @@ union bpf_attr { __u32 prog_id; __u32 map_id; __u32 btf_id; + __u32 link_id; }; __u32 next_id; __u32 open_flags; @@ -3609,6 +3621,25 @@ struct bpf_btf_info { __u32 id; } __attribute__((aligned(8))); +struct bpf_link_info { + __u32 type; + __u32 id; + __u32 prog_id; + union { + struct { + __aligned_u64 tp_name; /* in/out: tp_name buffer ptr */ + __u32 tp_name_len; /* in/out: tp_name buffer len */ + } raw_tracepoint; + struct { + __u32 attach_type; + } tracing; + struct { + __u64 cgroup_id; + __u32 attach_type; + } cgroup; + }; +} __attribute__((aligned(8))); + /* User bpf_sock_addr struct to access socket fields and sockaddr struct passed * by user and intended to be used by socket (e.g. to bind to, depends on * attach attach type). -- cgit v1.2.3 From 61365ca7b24f1cb106a4c619c94610862bbec778 Mon Sep 17 00:00:00 2001 From: Linus Walleij Date: Wed, 15 Apr 2020 14:14:49 +0200 Subject: backlight: l4f00242t03: Convert to GPIO descriptors This converts the l4f00242t03 backlight driver to use GPIO descriptors and switches the two Freescale i.MX boards over to passing descriptors instead of global GPIO numbers. We use the typical names "enable" and "reset" as found in the device tree bindings for panel GPIOs. This saves a lot of code in the driver and makes it possible to get rid of the platform data header altogether. Signed-off-by: Linus Walleij Reviewed-by: Daniel Thompson Acked-by: Shawn Guo Signed-off-by: Lee Jones --- arch/arm/mach-imx/mach-mx27_3ds.c | 21 +++++++++++----- arch/arm/mach-imx/mach-mx31_3ds.c | 24 +++++++++++++++---- drivers/video/backlight/l4f00242t03.c | 45 ++++++++++++++--------------------- include/linux/spi/l4f00242t03.h | 17 ------------- 4 files changed, 52 insertions(+), 55 deletions(-) delete mode 100644 include/linux/spi/l4f00242t03.h (limited to 'include/linux') diff --git a/arch/arm/mach-imx/mach-mx27_3ds.c b/arch/arm/mach-imx/mach-mx27_3ds.c index 7b8325fb5b41..1da5f07952ac 100644 --- a/arch/arm/mach-imx/mach-mx27_3ds.c +++ b/arch/arm/mach-imx/mach-mx27_3ds.c @@ -13,6 +13,7 @@ #include #include +#include #include #include #include @@ -20,8 +21,6 @@ #include #include #include -#include - #include #include @@ -351,9 +350,19 @@ static const struct imx_fb_platform_data mx27_3ds_fb_data __initconst = { }; /* LCD */ -static struct l4f00242t03_pdata mx27_3ds_lcd_pdata = { - .reset_gpio = LCD_RESET, - .data_enable_gpio = LCD_ENABLE, +static struct gpiod_lookup_table mx27_3ds_lcd_gpiod_table = { + .dev_id = "spi0.0", /* Bus 0 chipselect 0 */ + .table = { + /* + * The i.MX27 has the i.MX21 GPIO controller, the GPIOs + * numbered IMX_GPIO_NR(1, 3) and IMX_GPIO_NR(1, 31) + * are in "bank 1" which is subtracted by one in the macro + * so these are actually bank 0 on "imx21-gpio.0". 
+ */ + GPIO_LOOKUP("imx21-gpio.0", 3, "reset", GPIO_ACTIVE_HIGH), + GPIO_LOOKUP("imx21-gpio.0", 31, "enable", GPIO_ACTIVE_HIGH), + { }, + }, }; static struct spi_board_info mx27_3ds_spi_devs[] __initdata = { @@ -370,7 +379,6 @@ static struct spi_board_info mx27_3ds_spi_devs[] __initdata = { .max_speed_hz = 5000000, .bus_num = 0, .chip_select = 0, /* SS0 */ - .platform_data = &mx27_3ds_lcd_pdata, }, }; @@ -416,6 +424,7 @@ static void __init mx27pdk_late_init(void) if (!otg_mode_host) imx27_add_fsl_usb2_udc(&otg_device_pdata); + gpiod_add_lookup_table(&mx27_3ds_lcd_gpiod_table); mx27_3ds_spi_devs[0].irq = gpio_to_irq(PMIC_INT); spi_register_board_info(mx27_3ds_spi_devs, ARRAY_SIZE(mx27_3ds_spi_devs)); diff --git a/arch/arm/mach-imx/mach-mx31_3ds.c b/arch/arm/mach-imx/mach-mx31_3ds.c index 716d2ad51103..e81386190479 100644 --- a/arch/arm/mach-imx/mach-mx31_3ds.c +++ b/arch/arm/mach-imx/mach-mx31_3ds.c @@ -10,10 +10,10 @@ #include #include #include +#include #include #include #include -#include #include #include #include @@ -160,9 +160,23 @@ static struct mx3fb_platform_data mx3fb_pdata __initdata = { }; /* LCD */ -static struct l4f00242t03_pdata mx31_3ds_l4f00242t03_pdata = { - .reset_gpio = IOMUX_TO_GPIO(MX31_PIN_LCS1), - .data_enable_gpio = IOMUX_TO_GPIO(MX31_PIN_SER_RS), +static struct gpiod_lookup_table mx31_3ds_lcd_gpiod_table = { + .dev_id = "spi0.2", /* Bus 0 chipselect 2 */ + .table = { + /* + * "reset" has IOMUX_TO_GPIO(IOMUX_PIN(88, 28)). + * The macro only shifts 88 to bits 9..16 and then + * masks it and shifts it back. The GPIO number is 88, + * i.e. 2*32+24. + */ + GPIO_LOOKUP("imx31-gpio.2", 24, "reset", GPIO_ACTIVE_HIGH), + /* + * Same reasoning as above for + * IOMUX_TO_GPIO(IOMUX_PIN(89, 27)), pin 89 is 2*32+25. + */ + GPIO_LOOKUP("imx31-gpio.2", 25, "enable", GPIO_ACTIVE_HIGH), + { }, + }, }; /* @@ -387,7 +401,6 @@ static struct spi_board_info mx31_3ds_spi_devs[] __initdata = { .max_speed_hz = 5000000, .bus_num = 0, .chip_select = 2, /* SS2 */ - .platform_data = &mx31_3ds_l4f00242t03_pdata, }, }; @@ -566,6 +579,7 @@ static void __init mx31_3ds_init(void) static void __init mx31_3ds_late(void) { + gpiod_add_lookup_table(&mx31_3ds_lcd_gpiod_table); mx31_3ds_spi_devs[0].irq = gpio_to_irq(IOMUX_TO_GPIO(MX31_PIN_GPIO1_3)); spi_register_board_info(mx31_3ds_spi_devs, ARRAY_SIZE(mx31_3ds_spi_devs)); diff --git a/drivers/video/backlight/l4f00242t03.c b/drivers/video/backlight/l4f00242t03.c index 8554b4aa980c..46f97d1c3d21 100644 --- a/drivers/video/backlight/l4f00242t03.c +++ b/drivers/video/backlight/l4f00242t03.c @@ -14,13 +14,11 @@ #include #include #include -#include +#include #include #include #include - #include -#include struct l4f00242t03_priv { struct spi_device *spi; @@ -28,16 +26,18 @@ struct l4f00242t03_priv { int lcd_state; struct regulator *io_reg; struct regulator *core_reg; + struct gpio_desc *reset; + struct gpio_desc *enable; }; -static void l4f00242t03_reset(unsigned int gpio) +static void l4f00242t03_reset(struct gpio_desc *gpiod) { pr_debug("l4f00242t03_reset.\n"); - gpio_set_value(gpio, 1); + gpiod_set_value(gpiod, 1); mdelay(100); - gpio_set_value(gpio, 0); + gpiod_set_value(gpiod, 0); mdelay(10); /* tRES >= 100us */ - gpio_set_value(gpio, 1); + gpiod_set_value(gpiod, 1); mdelay(20); } @@ -45,7 +45,6 @@ static void l4f00242t03_reset(unsigned int gpio) static void l4f00242t03_lcd_init(struct spi_device *spi) { - struct l4f00242t03_pdata *pdata = dev_get_platdata(&spi->dev); struct l4f00242t03_priv *priv = spi_get_drvdata(spi); const u16 cmd[] = { 0x36, param(0), 
0x3A, param(0x60) }; int ret; @@ -76,21 +75,20 @@ static void l4f00242t03_lcd_init(struct spi_device *spi) return; } - l4f00242t03_reset(pdata->reset_gpio); + l4f00242t03_reset(priv->reset); - gpio_set_value(pdata->data_enable_gpio, 1); + gpiod_set_value(priv->enable, 1); msleep(60); spi_write(spi, (const u8 *)cmd, ARRAY_SIZE(cmd) * sizeof(u16)); } static void l4f00242t03_lcd_powerdown(struct spi_device *spi) { - struct l4f00242t03_pdata *pdata = dev_get_platdata(&spi->dev); struct l4f00242t03_priv *priv = spi_get_drvdata(spi); dev_dbg(&spi->dev, "Powering down LCD\n"); - gpio_set_value(pdata->data_enable_gpio, 0); + gpiod_set_value(priv->enable, 0); regulator_disable(priv->io_reg); regulator_disable(priv->core_reg); @@ -168,13 +166,6 @@ static struct lcd_ops l4f_ops = { static int l4f00242t03_probe(struct spi_device *spi) { struct l4f00242t03_priv *priv; - struct l4f00242t03_pdata *pdata = dev_get_platdata(&spi->dev); - int ret; - - if (pdata == NULL) { - dev_err(&spi->dev, "Uninitialized platform data.\n"); - return -EINVAL; - } priv = devm_kzalloc(&spi->dev, sizeof(struct l4f00242t03_priv), GFP_KERNEL); @@ -187,21 +178,21 @@ static int l4f00242t03_probe(struct spi_device *spi) priv->spi = spi; - ret = devm_gpio_request_one(&spi->dev, pdata->reset_gpio, - GPIOF_OUT_INIT_HIGH, "lcd l4f00242t03 reset"); - if (ret) { + priv->reset = devm_gpiod_get(&spi->dev, "reset", GPIOD_OUT_HIGH); + if (IS_ERR(priv->reset)) { dev_err(&spi->dev, "Unable to get the lcd l4f00242t03 reset gpio.\n"); - return ret; + return PTR_ERR(priv->reset); } + gpiod_set_consumer_name(priv->reset, "lcd l4f00242t03 reset"); - ret = devm_gpio_request_one(&spi->dev, pdata->data_enable_gpio, - GPIOF_OUT_INIT_LOW, "lcd l4f00242t03 data enable"); - if (ret) { + priv->enable = devm_gpiod_get(&spi->dev, "enable", GPIOD_OUT_LOW); + if (IS_ERR(priv->enable)) { dev_err(&spi->dev, "Unable to get the lcd l4f00242t03 data en gpio.\n"); - return ret; + return PTR_ERR(priv->enable); } + gpiod_set_consumer_name(priv->enable, "lcd l4f00242t03 data enable"); priv->io_reg = devm_regulator_get(&spi->dev, "vdd"); if (IS_ERR(priv->io_reg)) { diff --git a/include/linux/spi/l4f00242t03.h b/include/linux/spi/l4f00242t03.h deleted file mode 100644 index 831a5de7a0e2..000000000000 --- a/include/linux/spi/l4f00242t03.h +++ /dev/null @@ -1,17 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0-only */ -/* - * l4f00242t03.h -- Platform glue for Epson L4F00242T03 LCD - * - * Copyright (c) 2009 Alberto Panizzo - * Based on Marek Vasut work in lms283gf05.h -*/ - -#ifndef _INCLUDE_LINUX_SPI_L4F00242T03_H_ -#define _INCLUDE_LINUX_SPI_L4F00242T03_H_ - -struct l4f00242t03_pdata { - unsigned int reset_gpio; - unsigned int data_enable_gpio; -}; - -#endif /* _INCLUDE_LINUX_SPI_L4F00242T03_H_ */ -- cgit v1.2.3 From 36b5215436ad115551e12478ceb3be4a2279e415 Mon Sep 17 00:00:00 2001 From: Douglas Anderson Date: Tue, 28 Apr 2020 17:23:27 -0700 Subject: gpio: Document proper return value for gpio drivers The legacy defines GPIOF_DIR_XXX are only for consumers. Document the proper ones. Also: don't use "_XXX" since that's harder to find with "git grep". Just list both of the values. 
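For instance, a driver callback following the documented convention could be sketched as below (foo_gpio, FOO_DIR_REG and the register semantics are hypothetical, for illustration only):

static int foo_get_direction(struct gpio_chip *gc, unsigned int offset)
{
	struct foo_gpio *priv = gpiochip_get_data(gc);

	/* hypothetical direction register: bit set = line is an output */
	if (readl(priv->base + FOO_DIR_REG) & BIT(offset))
		return GPIO_LINE_DIRECTION_OUT;

	return GPIO_LINE_DIRECTION_IN;
}
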
Signed-off-by: Douglas Anderson Link: https://lore.kernel.org/r/20200428172322.1.I396f351e364f3c09df7c7606e79abefb8682c092@changeid Signed-off-by: Linus Walleij --- include/linux/gpio/driver.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/gpio/driver.h b/include/linux/gpio/driver.h index b8fc92c177eb..7b5f5681b7e4 100644 --- a/include/linux/gpio/driver.h +++ b/include/linux/gpio/driver.h @@ -267,9 +267,9 @@ struct gpio_irq_chip { * @free: optional hook for chip-specific deactivation, such as * disabling module power and clock; may sleep * @get_direction: returns direction for signal "offset", 0=out, 1=in, - * (same as GPIOF_DIR_XXX), or negative error. - * It is recommended to always implement this function, even on - * input-only or output-only gpio chips. + * (same as GPIO_LINE_DIRECTION_OUT / GPIO_LINE_DIRECTION_IN), + * or negative error. It is recommended to always implement this + * function, even on input-only or output-only gpio chips. * @direction_input: configures signal "offset" as input, or returns error This can be omitted on input-only or output-only gpio chips. * @direction_output: configures signal "offset" as output, or returns error -- cgit v1.2.3 From 8d0910121b070b4edf4239d91cf9a4523b33ca0c Mon Sep 17 00:00:00 2001 From: Douglas Anderson Date: Tue, 28 Apr 2020 17:23:28 -0700 Subject: gpio: Make "offset" an "unsigned int", not just "unsigned" When I copied the function prototypes from the GPIO header file into my own driver, checkpatch yelled at me saying that I shouldn't use "unsigned" but instead should say "unsigned int". Let's make the header file use "unsigned int" so others who copy like I did won't get yelled at. Signed-off-by: Douglas Anderson Link: https://lore.kernel.org/r/20200428172322.2.Iacb3c8152c3cf9015a91308678155a578b0cc050@changeid Signed-off-by: Linus Walleij --- include/linux/gpio/driver.h | 26 +++++++++++++------------- 1 file changed, 13 insertions(+), 13 deletions(-) (limited to 'include/linux') diff --git a/include/linux/gpio/driver.h b/include/linux/gpio/driver.h index 7b5f5681b7e4..8c41ae41b6bb 100644 --- a/include/linux/gpio/driver.h +++ b/include/linux/gpio/driver.h @@ -349,30 +349,30 @@ struct gpio_chip { struct module *owner; int (*request)(struct gpio_chip *gc, - unsigned offset); + unsigned int offset); void (*free)(struct gpio_chip *gc, - unsigned offset); + unsigned int offset); int (*get_direction)(struct gpio_chip *gc, - unsigned offset); + unsigned int offset); int (*direction_input)(struct gpio_chip *gc, - unsigned offset); + unsigned int offset); int (*direction_output)(struct gpio_chip *gc, - unsigned offset, int value); + unsigned int offset, int value); int (*get)(struct gpio_chip *gc, - unsigned offset); + unsigned int offset); int (*get_multiple)(struct gpio_chip *gc, unsigned long *mask, unsigned long *bits); void (*set)(struct gpio_chip *gc, - unsigned offset, int value); + unsigned int offset, int value); void (*set_multiple)(struct gpio_chip *gc, unsigned long *mask, unsigned long *bits); int (*set_config)(struct gpio_chip *gc, - unsigned offset, + unsigned int offset, unsigned long config); int (*to_irq)(struct gpio_chip *gc, - unsigned offset); + unsigned int offset); void (*dbg_show)(struct seq_file *s, struct gpio_chip *gc); @@ -459,7 +459,7 @@ struct gpio_chip { }; extern const char *gpiochip_is_requested(struct gpio_chip *gc, - unsigned offset); + unsigned int offset); /* add/remove chips */ extern int gpiochip_add_data_with_key(struct 
gpio_chip *gc, void *data, @@ -657,9 +657,9 @@ static inline int gpiochip_irqchip_add_nested(struct gpio_chip *gc, } #endif /* CONFIG_LOCKDEP */ -int gpiochip_generic_request(struct gpio_chip *gc, unsigned offset); -void gpiochip_generic_free(struct gpio_chip *gc, unsigned offset); -int gpiochip_generic_config(struct gpio_chip *gc, unsigned offset, +int gpiochip_generic_request(struct gpio_chip *gc, unsigned int offset); +void gpiochip_generic_free(struct gpio_chip *gc, unsigned int offset); +int gpiochip_generic_config(struct gpio_chip *gc, unsigned int offset, unsigned long config); /** -- cgit v1.2.3 From 0376e9efe18388bd486a65edbc16d34b84bddc8f Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 28 Apr 2020 13:27:55 +0200 Subject: block: replace BIO_QUEUE_ENTERED with BIO_CGROUP_ACCT BIO_QUEUE_ENTERED is only used for cgroup accounting now, so rename the flag and move setting it into the cgroup code. Signed-off-by: Christoph Hellwig Signed-off-by: Jens Axboe --- block/blk-merge.c | 10 ---------- include/linux/blk-cgroup.h | 10 ++++++---- include/linux/blk_types.h | 2 +- 3 files changed, 7 insertions(+), 15 deletions(-) (limited to 'include/linux') diff --git a/block/blk-merge.c b/block/blk-merge.c index c49eb3bdd0be..a04e991b5ded 100644 --- a/block/blk-merge.c +++ b/block/blk-merge.c @@ -336,16 +336,6 @@ void __blk_queue_split(struct request_queue *q, struct bio **bio, /* there isn't chance to merge the splitted bio */ split->bi_opf |= REQ_NOMERGE; - /* - * Since we're recursing into make_request here, ensure - * that we mark this bio as already having entered the queue. - * If not, and the queue is going away, we can get stuck - * forever on waiting for the queue reference to drop. But - * that will never happen, as we're already holding a - * reference to it. - */ - bio_set_flag(*bio, BIO_QUEUE_ENTERED); - bio_chain(split, *bio); trace_block_split(q, split, (*bio)->bi_iter.bi_sector); generic_make_request(*bio); diff --git a/include/linux/blk-cgroup.h b/include/linux/blk-cgroup.h index 35f8ffe92b70..4deb8bb7b6af 100644 --- a/include/linux/blk-cgroup.h +++ b/include/linux/blk-cgroup.h @@ -607,12 +607,14 @@ static inline bool blkcg_bio_issue_check(struct request_queue *q, u64_stats_update_begin(&bis->sync); /* - * If the bio is flagged with BIO_QUEUE_ENTERED it means this - * is a split bio and we would have already accounted for the - * size of the bio. + * If the bio is flagged with BIO_CGROUP_ACCT it means this is a + * split bio and we would have already accounted for the size of + * the bio. */ - if (!bio_flagged(bio, BIO_QUEUE_ENTERED)) + if (!bio_flagged(bio, BIO_CGROUP_ACCT)) { + bio_set_flag(bio, BIO_CGROUP_ACCT); bis->cur.bytes[rwd] += bio->bi_iter.bi_size; + } bis->cur.ios[rwd]++; u64_stats_update_end(&bis->sync); diff --git a/include/linux/blk_types.h b/include/linux/blk_types.h index 31eb92876be7..90895d594e64 100644 --- a/include/linux/blk_types.h +++ b/include/linux/blk_types.h @@ -220,7 +220,7 @@ enum { * throttling rules. Don't do it again. */ BIO_TRACE_COMPLETION, /* bio_endio() should trace the final completion * of this bio. 
*/ - BIO_QUEUE_ENTERED, /* can use blk_queue_enter_live() */ + BIO_CGROUP_ACCT, /* has been accounted to a cgroup */ BIO_TRACKED, /* set if bio goes through the rq_qos path */ BIO_FLAG_LAST }; -- cgit v1.2.3 From 3c2214b6027ff37945799de717c417212e1a8c54 Mon Sep 17 00:00:00 2001 From: Daniel Jordan Date: Tue, 21 Apr 2020 12:34:55 -0400 Subject: padata: add separate cpuhp node for CPUHP_PADATA_DEAD Removing the pcrypt module triggers this: general protection fault, probably for non-canonical address 0xdead000000000122 CPU: 5 PID: 264 Comm: modprobe Not tainted 5.6.0+ #2 Hardware name: QEMU Standard PC RIP: 0010:__cpuhp_state_remove_instance+0xcc/0x120 Call Trace: padata_sysfs_release+0x74/0xce kobject_put+0x81/0xd0 padata_free+0x12/0x20 pcrypt_exit+0x43/0x8ee [pcrypt] padata instances wrongly use the same hlist node for the online and dead states, so __padata_free()'s second cpuhp remove call chokes on the node that the first poisoned. cpuhp multi-instance callbacks only walk forward in cpuhp_step->list and the same node is linked in both the online and dead lists, so the list corruption that results from padata_alloc() adding the node to a second list without removing it from the first doesn't cause problems as long as no instances are freed. Avoid the issue by giving each state its own node. Fixes: 894c9ef9780c ("padata: validate cpumask without removed CPU during offline") Signed-off-by: Daniel Jordan Cc: Herbert Xu Cc: Steffen Klassert Cc: linux-crypto@vger.kernel.org Cc: linux-kernel@vger.kernel.org Cc: stable@vger.kernel.org # v5.4+ Signed-off-by: Herbert Xu --- include/linux/padata.h | 6 ++++-- kernel/padata.c | 14 ++++++++------ 2 files changed, 12 insertions(+), 8 deletions(-) (limited to 'include/linux') diff --git a/include/linux/padata.h b/include/linux/padata.h index a0d8b41850b2..693cae9bfe66 100644 --- a/include/linux/padata.h +++ b/include/linux/padata.h @@ -139,7 +139,8 @@ struct padata_shell { /** * struct padata_instance - The overall control structure. * - * @node: Used by CPU hotplug. + * @cpu_online_node: Linkage for CPU online callback. + * @cpu_dead_node: Linkage for CPU offline callback. * @parallel_wq: The workqueue used for parallel work. * @serial_wq: The workqueue used for serial work. * @pslist: List of padata_shell objects attached to this instance. @@ -150,7 +151,8 @@ struct padata_shell { * @flags: padata flags. 
*/ struct padata_instance { - struct hlist_node node; + struct hlist_node cpu_online_node; + struct hlist_node cpu_dead_node; struct workqueue_struct *parallel_wq; struct workqueue_struct *serial_wq; struct list_head pslist; diff --git a/kernel/padata.c b/kernel/padata.c index a6afa12fb75e..aae789896616 100644 --- a/kernel/padata.c +++ b/kernel/padata.c @@ -703,7 +703,7 @@ static int padata_cpu_online(unsigned int cpu, struct hlist_node *node) struct padata_instance *pinst; int ret; - pinst = hlist_entry_safe(node, struct padata_instance, node); + pinst = hlist_entry_safe(node, struct padata_instance, cpu_online_node); if (!pinst_has_cpu(pinst, cpu)) return 0; @@ -718,7 +718,7 @@ static int padata_cpu_dead(unsigned int cpu, struct hlist_node *node) struct padata_instance *pinst; int ret; - pinst = hlist_entry_safe(node, struct padata_instance, node); + pinst = hlist_entry_safe(node, struct padata_instance, cpu_dead_node); if (!pinst_has_cpu(pinst, cpu)) return 0; @@ -734,8 +734,9 @@ static enum cpuhp_state hp_online; static void __padata_free(struct padata_instance *pinst) { #ifdef CONFIG_HOTPLUG_CPU - cpuhp_state_remove_instance_nocalls(CPUHP_PADATA_DEAD, &pinst->node); - cpuhp_state_remove_instance_nocalls(hp_online, &pinst->node); + cpuhp_state_remove_instance_nocalls(CPUHP_PADATA_DEAD, + &pinst->cpu_dead_node); + cpuhp_state_remove_instance_nocalls(hp_online, &pinst->cpu_online_node); #endif WARN_ON(!list_empty(&pinst->pslist)); @@ -939,9 +940,10 @@ static struct padata_instance *padata_alloc(const char *name, mutex_init(&pinst->lock); #ifdef CONFIG_HOTPLUG_CPU - cpuhp_state_add_instance_nocalls_cpuslocked(hp_online, &pinst->node); + cpuhp_state_add_instance_nocalls_cpuslocked(hp_online, + &pinst->cpu_online_node); cpuhp_state_add_instance_nocalls_cpuslocked(CPUHP_PADATA_DEAD, - &pinst->node); + &pinst->cpu_dead_node); #endif put_online_cpus(); -- cgit v1.2.3 From 97f9ac3db6612f14ac0c509e1a63ce14fd4cc0eb Mon Sep 17 00:00:00 2001 From: Tom Lendacky Date: Tue, 21 Apr 2020 12:44:49 -0500 Subject: crypto: ccp - Add support for SEV-ES to the PSP driver To provide support for SEV-ES, the hypervisor must provide an area of memory to the PSP. Once this Trusted Memory Region (TMR) is provided to the PSP, the contents of this area of memory are no longer available to the x86. Update the PSP driver to allocate a 1MB region for the TMR that is 1MB aligned and then provide it to the PSP through the SEV INIT command. Signed-off-by: Tom Lendacky Reviewed-by: Brijesh Singh Reviewed-by: Joerg Roedel Signed-off-by: Herbert Xu --- drivers/crypto/ccp/sev-dev.c | 43 +++++++++++++++++++++++++++++++++++++++++++ include/linux/psp-sev.h | 2 ++ include/uapi/linux/psp-sev.h | 2 ++ 3 files changed, 47 insertions(+) (limited to 'include/linux') diff --git a/drivers/crypto/ccp/sev-dev.c b/drivers/crypto/ccp/sev-dev.c index 896f190b9a50..439cd737076e 100644 --- a/drivers/crypto/ccp/sev-dev.c +++ b/drivers/crypto/ccp/sev-dev.c @@ -20,6 +20,7 @@ #include #include #include +#include #include @@ -44,6 +45,14 @@ MODULE_PARM_DESC(psp_probe_timeout, " default timeout value, in seconds, during static bool psp_dead; static int psp_timeout; +/* Trusted Memory Region (TMR): + * The TMR is a 1MB area that must be 1MB aligned. Use the page allocator + * to allocate the memory, which will return aligned memory for the specified + * allocation order. 
+ */ +#define SEV_ES_TMR_SIZE (1024 * 1024) +static void *sev_es_tmr; + static inline bool sev_version_greater_or_equal(u8 maj, u8 min) { struct sev_device *sev = psp_master->sev_data; @@ -214,6 +223,20 @@ static int __sev_platform_init_locked(int *error) if (sev->state == SEV_STATE_INIT) return 0; + if (sev_es_tmr) { + u64 tmr_pa; + + /* + * Do not include the encryption mask on the physical + * address of the TMR (firmware should clear it anyway). + */ + tmr_pa = __pa(sev_es_tmr); + + sev->init_cmd_buf.flags |= SEV_INIT_FLAGS_SEV_ES; + sev->init_cmd_buf.tmr_address = tmr_pa; + sev->init_cmd_buf.tmr_len = SEV_ES_TMR_SIZE; + } + rc = __sev_do_cmd_locked(SEV_CMD_INIT, &sev->init_cmd_buf, error); if (rc) return rc; @@ -1012,6 +1035,7 @@ EXPORT_SYMBOL_GPL(sev_issue_cmd_external_user); void sev_pci_init(void) { struct sev_device *sev = psp_master->sev_data; + struct page *tmr_page; int error, rc; if (!sev) @@ -1041,6 +1065,16 @@ void sev_pci_init(void) sev_update_firmware(sev->dev) == 0) sev_get_api_version(); + /* Obtain the TMR memory area for SEV-ES use */ + tmr_page = alloc_pages(GFP_KERNEL, get_order(SEV_ES_TMR_SIZE)); + if (tmr_page) { + sev_es_tmr = page_address(tmr_page); + } else { + sev_es_tmr = NULL; + dev_warn(sev->dev, + "SEV: TMR allocation failed, SEV-ES support unavailable\n"); + } + /* Initialize the platform */ rc = sev_platform_init(&error); if (rc && (error == SEV_RET_SECURE_DATA_INVALID)) { @@ -1075,4 +1109,13 @@ void sev_pci_exit(void) return; sev_platform_shutdown(NULL); + + if (sev_es_tmr) { + /* The TMR area was encrypted, flush it from the cache */ + wbinvd_on_all_cpus(); + + free_pages((unsigned long)sev_es_tmr, + get_order(SEV_ES_TMR_SIZE)); + sev_es_tmr = NULL; + } } diff --git a/include/linux/psp-sev.h b/include/linux/psp-sev.h index 5167bf2bfc75..7fbc8679145c 100644 --- a/include/linux/psp-sev.h +++ b/include/linux/psp-sev.h @@ -100,6 +100,8 @@ struct sev_data_init { u32 tmr_len; /* In */ } __packed; +#define SEV_INIT_FLAGS_SEV_ES 0x01 + /** * struct sev_data_pek_csr - PEK_CSR command parameters * diff --git a/include/uapi/linux/psp-sev.h b/include/uapi/linux/psp-sev.h index 0549a5c622bf..91b4c63d5cbf 100644 --- a/include/uapi/linux/psp-sev.h +++ b/include/uapi/linux/psp-sev.h @@ -83,6 +83,8 @@ struct sev_user_data_status { __u32 guest_count; /* Out */ } __packed; +#define SEV_STATUS_FLAGS_CONFIG_ES 0x0100 + /** * struct sev_user_data_pek_csr - PEK_CSR command parameters * -- cgit v1.2.3 From 0da0e31600e8a42c6f1dfaa7a06211c8bb243ea7 Mon Sep 17 00:00:00 2001 From: Thierry Reding Date: Fri, 3 Apr 2020 19:44:52 +0200 Subject: of: reserved-memory: Support lookup of regions by name Add support for looking up memory regions by name. This looks up the given name in the newly introduced memory-region-names property and returns the memory region at the corresponding index in the memory- region(s) property. 
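As an illustrative sketch of the intended usage (node and region names are invented), a consumer node would list and name its reserved regions:

	device@80000000 {
		memory-region = <&fw_mem>, <&dma_mem>;
		memory-region-names = "firmware", "dma";
	};

and its driver would then select one by name rather than by index:

	err = of_reserved_mem_device_init_by_name(&pdev->dev,
						  pdev->dev.of_node,
						  "firmware");
	if (err)
		return err;
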
Reviewed-by: Rob Herring Signed-off-by: Thierry Reding --- drivers/of/of_reserved_mem.c | 19 +++++++++++++++++++ include/linux/of_reserved_mem.h | 11 +++++++++++ 2 files changed, 30 insertions(+) (limited to 'include/linux') diff --git a/drivers/of/of_reserved_mem.c b/drivers/of/of_reserved_mem.c index 1a84bc0d5fa8..ed2ff6f01d32 100644 --- a/drivers/of/of_reserved_mem.c +++ b/drivers/of/of_reserved_mem.c @@ -357,6 +357,25 @@ int of_reserved_mem_device_init_by_idx(struct device *dev, } EXPORT_SYMBOL_GPL(of_reserved_mem_device_init_by_idx); +/** + * of_reserved_mem_device_init_by_name() - assign named reserved memory region + * to given device + * @dev: pointer to the device to configure + * @np: pointer to the device node with 'memory-region' property + * @name: name of the selected memory region + * + * Returns: 0 on success or a negative error-code on failure. + */ +int of_reserved_mem_device_init_by_name(struct device *dev, + struct device_node *np, + const char *name) +{ + int idx = of_property_match_string(np, "memory-region-names", name); + + return of_reserved_mem_device_init_by_idx(dev, np, idx); +} +EXPORT_SYMBOL_GPL(of_reserved_mem_device_init_by_name); + /** * of_reserved_mem_device_release() - release reserved memory device structures * @dev: Pointer to the device to deconfigure diff --git a/include/linux/of_reserved_mem.h b/include/linux/of_reserved_mem.h index 60f541912ccf..a1b427ac291b 100644 --- a/include/linux/of_reserved_mem.h +++ b/include/linux/of_reserved_mem.h @@ -33,6 +33,9 @@ typedef int (*reservedmem_of_init_fn)(struct reserved_mem *rmem); int of_reserved_mem_device_init_by_idx(struct device *dev, struct device_node *np, int idx); +int of_reserved_mem_device_init_by_name(struct device *dev, + struct device_node *np, + const char *name); void of_reserved_mem_device_release(struct device *dev); void fdt_init_reserved_mem(void); @@ -45,6 +48,14 @@ static inline int of_reserved_mem_device_init_by_idx(struct device *dev, { return -ENOSYS; } + +static inline int of_reserved_mem_device_init_by_name(struct device *dev, + struct device_node *np, + const char *name) +{ + return -ENOSYS; +} + static inline void of_reserved_mem_device_release(struct device *pdev) { } static inline void fdt_init_reserved_mem(void) { } -- cgit v1.2.3 From 8aa8eb2a8f5b3305a95f39957dd2b715fa668e21 Mon Sep 17 00:00:00 2001 From: Alexandre Chartre Date: Tue, 14 Apr 2020 12:36:12 +0200 Subject: objtool: Add support for intra-function calls Change objtool to support intra-function calls. On x86, an intra-function call is represented in objtool as a push onto the stack (of the return address), and a jump to the destination address. That way the stack information is correctly updated and the call flow is still accurate. 
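To illustrate, an annotated intra-function call in assembly would look roughly like this (labels are illustrative; the sequence is the classic load-the-instruction-pointer trick):

	ANNOTATE_INTRA_FUNCTION_CALL
	call	1f
1:	pop	%rax			/* %rax = address of label 1 */

Without the annotation, objtool would flag the "call 1f", since its destination is not the start of a function.
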
Signed-off-by: Alexandre Chartre Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Miroslav Benes Acked-by: Josh Poimboeuf Link: https://lkml.kernel.org/r/20200414103618.12657-4-alexandre.chartre@oracle.com --- include/linux/frame.h | 11 ++++ tools/objtool/Documentation/stack-validation.txt | 8 +++ tools/objtool/arch/x86/decode.c | 8 +++ tools/objtool/check.c | 79 ++++++++++++++++++++++-- 4 files changed, 102 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/frame.h b/include/linux/frame.h index 02d3ca2d9598..303cda600e56 100644 --- a/include/linux/frame.h +++ b/include/linux/frame.h @@ -15,9 +15,20 @@ static void __used __section(.discard.func_stack_frame_non_standard) \ *__func_stack_frame_non_standard_##func = func +/* + * This macro indicates that the following intra-function call is valid. + * Any non-annotated intra-function call will cause objtool to issue a warning. + */ +#define ANNOTATE_INTRA_FUNCTION_CALL \ + 999: \ + .pushsection .discard.intra_function_calls; \ + .long 999b; \ + .popsection; + #else /* !CONFIG_STACK_VALIDATION */ #define STACK_FRAME_NON_STANDARD(func) +#define ANNOTATE_INTRA_FUNCTION_CALL #endif /* CONFIG_STACK_VALIDATION */ diff --git a/tools/objtool/Documentation/stack-validation.txt b/tools/objtool/Documentation/stack-validation.txt index 0189039489e9..0542e46c7552 100644 --- a/tools/objtool/Documentation/stack-validation.txt +++ b/tools/objtool/Documentation/stack-validation.txt @@ -323,6 +323,14 @@ they mean, and suggestions for how to fix them. The easiest way to enforce this is to ensure alternatives do not contain any ORC entries, which in turn implies the above constraint. +11. file.o: warning: unannotated intra-function call + + This warning means that a direct call is done to a destination which + is not at the beginning of a function. If this is a legit call, you + can remove this warning by putting the ANNOTATE_INTRA_FUNCTION_CALL + directive right before the call. + + If the error doesn't seem to make sense, it could be a bug in objtool. Feel free to ask the objtool maintainer for help. diff --git a/tools/objtool/arch/x86/decode.c b/tools/objtool/arch/x86/decode.c index d7b5d1096913..4b504fc90bbb 100644 --- a/tools/objtool/arch/x86/decode.c +++ b/tools/objtool/arch/x86/decode.c @@ -496,6 +496,14 @@ int arch_decode_instruction(const struct elf *elf, const struct section *sec, case 0xe8: *type = INSN_CALL; + /* + * For the impact on the stack, a CALL behaves like + * a PUSH of an immediate value (the return address). + */ + ADD_OP(op) { + op->src.type = OP_SRC_CONST; + op->dest.type = OP_DEST_PUSH; + } break; case 0xfc: diff --git a/tools/objtool/check.c b/tools/objtool/check.c index d822858764fe..32dea5f3feed 100644 --- a/tools/objtool/check.c +++ b/tools/objtool/check.c @@ -674,6 +674,16 @@ static int add_jump_destinations(struct objtool_file *file) return 0; } +static void remove_insn_ops(struct instruction *insn) +{ + struct stack_op *op, *tmp; + + list_for_each_entry_safe(op, tmp, &insn->stack_ops, list) { + list_del(&op->list); + free(op); + } +} + /* * Find the destination instructions for all calls. 
*/ @@ -699,10 +709,7 @@ static int add_call_destinations(struct objtool_file *file) continue; if (!insn->call_dest) { - WARN_FUNC("unsupported intra-function call", - insn->sec, insn->offset); - if (retpoline) - WARN("If this is a retpoline, please patch it in with alternatives and annotate it with ANNOTATE_NOSPEC_ALTERNATIVE."); + WARN_FUNC("unannotated intra-function call", insn->sec, insn->offset); return -1; } @@ -725,6 +732,15 @@ static int add_call_destinations(struct objtool_file *file) } } else insn->call_dest = rela->sym; + + /* + * Whatever stack impact regular CALLs have, should be undone + * by the RETURN of the called function. + * + * Annotated intra-function calls retain the stack_ops but + * are converted to JUMP, see read_intra_function_calls(). + */ + remove_insn_ops(insn); } return 0; @@ -1404,6 +1420,57 @@ static int read_instr_hints(struct objtool_file *file) return 0; } +static int read_intra_function_calls(struct objtool_file *file) +{ + struct instruction *insn; + struct section *sec; + struct rela *rela; + + sec = find_section_by_name(file->elf, ".rela.discard.intra_function_calls"); + if (!sec) + return 0; + + list_for_each_entry(rela, &sec->rela_list, list) { + unsigned long dest_off; + + if (rela->sym->type != STT_SECTION) { + WARN("unexpected relocation symbol type in %s", + sec->name); + return -1; + } + + insn = find_insn(file, rela->sym->sec, rela->addend); + if (!insn) { + WARN("bad .discard.intra_function_call entry"); + return -1; + } + + if (insn->type != INSN_CALL) { + WARN_FUNC("intra_function_call not a direct call", + insn->sec, insn->offset); + return -1; + } + + /* + * Treat intra-function CALLs as JMPs, but with a stack_op. + * See add_call_destinations(), which strips stack_ops from + * normal CALLs. + */ + insn->type = INSN_JUMP_UNCONDITIONAL; + + dest_off = insn->offset + insn->len + insn->immediate; + insn->jump_dest = find_insn(file, insn->sec, dest_off); + if (!insn->jump_dest) { + WARN_FUNC("can't find call dest at %s+0x%lx", + insn->sec, insn->offset, + insn->sec->name, dest_off); + return -1; + } + } + + return 0; +} + static void mark_rodata(struct objtool_file *file) { struct section *sec; @@ -1459,6 +1526,10 @@ static int decode_sections(struct objtool_file *file) if (ret) return ret; + ret = read_intra_function_calls(file); + if (ret) + return ret; + ret = add_call_destinations(file); if (ret) return ret; -- cgit v1.2.3 From 36c5bdc4387056af3840adb4478c752faeb9d15e Mon Sep 17 00:00:00 2001 From: Valentin Schneider Date: Wed, 15 Apr 2020 22:05:07 +0100 Subject: sched/topology: Kill SD_LOAD_BALANCE That flag is set unconditionally in sd_init(), and no one checks for it anymore. Remove it. Signed-off-by: Valentin Schneider Signed-off-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/20200415210512.805-5-valentin.schneider@arm.com --- include/linux/sched/topology.h | 29 ++++++++++++++--------------- kernel/sched/topology.c | 3 +-- 2 files changed, 15 insertions(+), 17 deletions(-) (limited to 'include/linux') diff --git a/include/linux/sched/topology.h b/include/linux/sched/topology.h index 95253ad792b0..fb11091129b3 100644 --- a/include/linux/sched/topology.h +++ b/include/linux/sched/topology.h @@ -11,21 +11,20 @@ */ #ifdef CONFIG_SMP -#define SD_LOAD_BALANCE 0x0001 /* Do load balancing on this domain. 
*/ -#define SD_BALANCE_NEWIDLE 0x0002 /* Balance when about to become idle */ -#define SD_BALANCE_EXEC 0x0004 /* Balance on exec */ -#define SD_BALANCE_FORK 0x0008 /* Balance on fork, clone */ -#define SD_BALANCE_WAKE 0x0010 /* Balance on wakeup */ -#define SD_WAKE_AFFINE 0x0020 /* Wake task to waking CPU */ -#define SD_ASYM_CPUCAPACITY 0x0040 /* Domain members have different CPU capacities */ -#define SD_SHARE_CPUCAPACITY 0x0080 /* Domain members share CPU capacity */ -#define SD_SHARE_POWERDOMAIN 0x0100 /* Domain members share power domain */ -#define SD_SHARE_PKG_RESOURCES 0x0200 /* Domain members share CPU pkg resources */ -#define SD_SERIALIZE 0x0400 /* Only a single load balancing instance */ -#define SD_ASYM_PACKING 0x0800 /* Place busy groups earlier in the domain */ -#define SD_PREFER_SIBLING 0x1000 /* Prefer to place tasks in a sibling domain */ -#define SD_OVERLAP 0x2000 /* sched_domains of this level overlap */ -#define SD_NUMA 0x4000 /* cross-node balancing */ +#define SD_BALANCE_NEWIDLE 0x0001 /* Balance when about to become idle */ +#define SD_BALANCE_EXEC 0x0002 /* Balance on exec */ +#define SD_BALANCE_FORK 0x0004 /* Balance on fork, clone */ +#define SD_BALANCE_WAKE 0x0008 /* Balance on wakeup */ +#define SD_WAKE_AFFINE 0x0010 /* Wake task to waking CPU */ +#define SD_ASYM_CPUCAPACITY 0x0020 /* Domain members have different CPU capacities */ +#define SD_SHARE_CPUCAPACITY 0x0040 /* Domain members share CPU capacity */ +#define SD_SHARE_POWERDOMAIN 0x0080 /* Domain members share power domain */ +#define SD_SHARE_PKG_RESOURCES 0x0100 /* Domain members share CPU pkg resources */ +#define SD_SERIALIZE 0x0200 /* Only a single load balancing instance */ +#define SD_ASYM_PACKING 0x0400 /* Place busy groups earlier in the domain */ +#define SD_PREFER_SIBLING 0x0800 /* Prefer to place tasks in a sibling domain */ +#define SD_OVERLAP 0x1000 /* sched_domains of this level overlap */ +#define SD_NUMA 0x2000 /* cross-node balancing */ #ifdef CONFIG_SCHED_SMT static inline int cpu_smt_flags(void) diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c index a9dc34a0ebc1..1d7b446fac7d 100644 --- a/kernel/sched/topology.c +++ b/kernel/sched/topology.c @@ -1341,8 +1341,7 @@ sd_init(struct sched_domain_topology_level *tl, .cache_nice_tries = 0, - .flags = 1*SD_LOAD_BALANCE - | 1*SD_BALANCE_NEWIDLE + .flags = 1*SD_BALANCE_NEWIDLE | 1*SD_BALANCE_EXEC | 1*SD_BALANCE_FORK | 0*SD_BALANCE_WAKE -- cgit v1.2.3 From bf2c59fce4074e55d622089b34be3a6bc95484fb Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Wed, 1 Apr 2020 17:40:33 -0400 Subject: sched/core: Fix illegal RCU from offline CPUs In the CPU-offline process, it calls mmdrop() after idle entry and the subsequent call to cpuhp_report_idle_dead(). Once execution passes the call to rcu_report_dead(), RCU is ignoring the CPU, which results in lockdep complaining when mmdrop() uses RCU from either memcg or debugobjects below. Fix it by cleaning up the active_mm state from BP instead. Every arch which has CONFIG_HOTPLUG_CPU should have already called idle_task_exit() from AP. The only exception is parisc because it switches them to &init_mm unconditionally (see smp_boot_one_cpu() and smp_cpu_init()), but the patch will still work there because it calls mmgrab(&init_mm) in smp_cpu_init() and then should call mmdrop(&init_mm) in finish_cpu(). WARNING: suspicious RCU usage ----------------------------- kernel/workqueue.c:710 RCU or wq_pool_mutex should be held! other info that might help us debug this: RCU used illegally from offline CPU! 
Call Trace: dump_stack+0xf4/0x164 (unreliable) lockdep_rcu_suspicious+0x140/0x164 get_work_pool+0x110/0x150 __queue_work+0x1bc/0xca0 queue_work_on+0x114/0x120 css_release+0x9c/0xc0 percpu_ref_put_many+0x204/0x230 free_pcp_prepare+0x264/0x570 free_unref_page+0x38/0xf0 __mmdrop+0x21c/0x2c0 idle_task_exit+0x170/0x1b0 pnv_smp_cpu_kill_self+0x38/0x2e0 cpu_die+0x48/0x64 arch_cpu_idle_dead+0x30/0x50 do_idle+0x2f4/0x470 cpu_startup_entry+0x38/0x40 start_secondary+0x7a8/0xa80 start_secondary_resume+0x10/0x14 Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Qian Cai Signed-off-by: Peter Zijlstra (Intel) Acked-by: Michael Ellerman (powerpc) Link: https://lkml.kernel.org/r/20200401214033.8448-1-cai@lca.pw --- arch/powerpc/platforms/powernv/smp.c | 1 - include/linux/sched/mm.h | 2 ++ kernel/cpu.c | 18 +++++++++++++++++- kernel/sched/core.c | 5 +++-- 4 files changed, 22 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/arch/powerpc/platforms/powernv/smp.c b/arch/powerpc/platforms/powernv/smp.c index 13e251699346..b2ba3e95bda7 100644 --- a/arch/powerpc/platforms/powernv/smp.c +++ b/arch/powerpc/platforms/powernv/smp.c @@ -167,7 +167,6 @@ static void pnv_smp_cpu_kill_self(void) /* Standard hot unplug procedure */ idle_task_exit(); - current->active_mm = NULL; /* for sanity */ cpu = smp_processor_id(); DBG("CPU%d offline\n", cpu); generic_set_cpu_dead(cpu); diff --git a/include/linux/sched/mm.h b/include/linux/sched/mm.h index c49257a3b510..a132d875d351 100644 --- a/include/linux/sched/mm.h +++ b/include/linux/sched/mm.h @@ -49,6 +49,8 @@ static inline void mmdrop(struct mm_struct *mm) __mmdrop(mm); } +void mmdrop(struct mm_struct *mm); + /* * This has to be called after a get_task_mm()/mmget_not_zero() * followed by taking the mmap_sem for writing before modifying the diff --git a/kernel/cpu.c b/kernel/cpu.c index 2371292f30b0..244d30544377 100644 --- a/kernel/cpu.c +++ b/kernel/cpu.c @@ -3,6 +3,7 @@ * * This code is licenced under the GPL. */ +#include #include #include #include @@ -564,6 +565,21 @@ static int bringup_cpu(unsigned int cpu) return bringup_wait_for_ap(cpu); } +static int finish_cpu(unsigned int cpu) +{ + struct task_struct *idle = idle_thread_get(cpu); + struct mm_struct *mm = idle->active_mm; + + /* + * idle_task_exit() will have switched to &init_mm, now + * clean up any remaining active_mm state. 
+ */ + if (mm != &init_mm) + idle->active_mm = &init_mm; + mmdrop(mm); + return 0; +} + /* * Hotplug state machine related functions */ @@ -1549,7 +1565,7 @@ static struct cpuhp_step cpuhp_hp_states[] = { [CPUHP_BRINGUP_CPU] = { .name = "cpu:bringup", .startup.single = bringup_cpu, - .teardown.single = NULL, + .teardown.single = finish_cpu, .cant_stop = true, }, /* Final state before CPU kills itself */ diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 2e6ba9e301f0..f6ae2629f4e1 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -6197,13 +6197,14 @@ void idle_task_exit(void) struct mm_struct *mm = current->active_mm; BUG_ON(cpu_online(smp_processor_id())); + BUG_ON(current != this_rq()->idle); if (mm != &init_mm) { switch_mm(mm, &init_mm, current); - current->active_mm = &init_mm; finish_arch_post_lock_switch(); } - mmdrop(mm); + + /* finish_cpu(), as ran on the BP, will clean up the active_mm state */ } /* -- cgit v1.2.3 From 12ac6782a40ad7636b6ef45680741825b64ab221 Mon Sep 17 00:00:00 2001 From: Davidlohr Bueso Date: Tue, 21 Apr 2020 21:07:39 -0700 Subject: sched/swait: Reword some of the main description With both the increased use of swait and kvm no longer using it, we can reword some of the comments. While removing Linus' valid rant, I've also cared to explicitly mention that swait is very different than regular wait. In addition it is mentioned against using swait in favor of the regular flavor. Signed-off-by: Davidlohr Bueso Signed-off-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/20200422040739.18601-6-dave@stgolabs.net --- include/linux/swait.h | 23 +++++------------------ 1 file changed, 5 insertions(+), 18 deletions(-) (limited to 'include/linux') diff --git a/include/linux/swait.h b/include/linux/swait.h index 73e06e9986d4..6a8c22b8c2a5 100644 --- a/include/linux/swait.h +++ b/include/linux/swait.h @@ -9,23 +9,10 @@ #include /* - * BROKEN wait-queues. - * - * These "simple" wait-queues are broken garbage, and should never be - * used. The comments below claim that they are "similar" to regular - * wait-queues, but the semantics are actually completely different, and - * every single user we have ever had has been buggy (or pointless). - * - * A "swake_up_one()" only wakes up _one_ waiter, which is not at all what - * "wake_up()" does, and has led to problems. In other cases, it has - * been fine, because there's only ever one waiter (kvm), but in that - * case gthe whole "simple" wait-queue is just pointless to begin with, - * since there is no "queue". Use "wake_up_process()" with a direct - * pointer instead. - * - * While these are very similar to regular wait queues (wait.h) the most - * important difference is that the simple waitqueue allows for deterministic - * behaviour -- IOW it has strictly bounded IRQ and lock hold times. + * Simple waitqueues are semantically very different to regular wait queues + * (wait.h). The most important difference is that the simple waitqueue allows + * for deterministic behaviour -- IOW it has strictly bounded IRQ and lock hold + * times. * * Mainly, this is accomplished by two things. Firstly not allowing swake_up_all * from IRQ disabled, and dropping the lock upon every wakeup, giving a higher @@ -39,7 +26,7 @@ * sleeper state. * * - the !exclusive mode; because that leads to O(n) wakeups, everything is - * exclusive. + * exclusive. As such swake_up_one will only ever awake _one_ waiter. * * - custom wake callback functions; because you cannot give any guarantees * about random code. 
This also allows swait to be used in RT, such that -- cgit v1.2.3 From 6e3a401fc8af01828bcdc92d713195d318b36e7e Mon Sep 17 00:00:00 2001 From: Dmitry Yakunin Date: Thu, 30 Apr 2020 18:51:14 +0300 Subject: inet_diag: add cgroup id attribute This patch adds cgroup v2 ID to common inet diag message attributes. Cgroup v2 ID is kernfs ID (ino or ino+gen). This attribute allows filtering inet diag output by cgroup ID obtained via the name_to_handle_at() syscall. When net_cls or net_prio cgroup is activated this ID is equal to 1 (root cgroup ID) for newly created sockets. Some notes about this ID: 1) gets initialized in socket() syscall 2) incoming socket gets ID from listening socket (not during accept() syscall) 3) not changed when a process gets moved to another cgroup 4) can point to a deleted cgroup (refcounting) v2: - use CONFIG_SOCK_CGROUP_DATA instead of CONFIG_CGROUPS v3: - fix attr size by using nla_total_size_64bit() (Eric Dumazet) - more detailed commit message (Konstantin Khlebnikov) Signed-off-by: Dmitry Yakunin Reviewed-by: Konstantin Khlebnikov Acked-By: Tejun Heo Signed-off-by: David S. Miller --- include/linux/inet_diag.h | 6 +++++- include/uapi/linux/inet_diag.h | 1 + net/ipv4/inet_diag.c | 7 +++++++ 3 files changed, 13 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/inet_diag.h b/include/linux/inet_diag.h index ce9ed1c0602f..0ef2d800fda7 100644 --- a/include/linux/inet_diag.h +++ b/include/linux/inet_diag.h @@ -71,7 +71,11 @@ static inline size_t inet_diag_msg_attrs_size(void) + nla_total_size(1) /* INET_DIAG_SKV6ONLY */ #endif + nla_total_size(4) /* INET_DIAG_MARK */ - + nla_total_size(4); /* INET_DIAG_CLASS_ID */ + + nla_total_size(4) /* INET_DIAG_CLASS_ID */ +#ifdef CONFIG_SOCK_CGROUP_DATA + + nla_total_size_64bit(sizeof(u64)) /* INET_DIAG_CGROUP_ID */ +#endif + ; } int inet_diag_msg_attrs_fill(struct sock *sk, struct sk_buff *skb, struct inet_diag_msg *r, int ext, diff --git a/include/uapi/linux/inet_diag.h b/include/uapi/linux/inet_diag.h index 57cc429a9177..c9b1e551792c 100644 --- a/include/uapi/linux/inet_diag.h +++ b/include/uapi/linux/inet_diag.h @@ -157,6 +157,7 @@ enum { INET_DIAG_MD5SIG, INET_DIAG_ULP_INFO, INET_DIAG_SK_BPF_STORAGES, + INET_DIAG_CGROUP_ID, __INET_DIAG_MAX, }; diff --git a/net/ipv4/inet_diag.c b/net/ipv4/inet_diag.c index 5d50aad3cdbf..9c4c315cbc10 100644 --- a/net/ipv4/inet_diag.c +++ b/net/ipv4/inet_diag.c @@ -162,6 +162,13 @@ int inet_diag_msg_attrs_fill(struct sock *sk, struct sk_buff *skb, goto errout; } +#ifdef CONFIG_SOCK_CGROUP_DATA + if (nla_put_u64_64bit(skb, INET_DIAG_CGROUP_ID, + cgroup_id(sock_cgroup_ptr(&sk->sk_cgrp_data)), + INET_DIAG_PAD)) + goto errout; +#endif + r->idiag_uid = from_kuid_munged(user_ns, sock_i_uid(sk)); r->idiag_inode = sock_i_ino(sk); -- cgit v1.2.3 From ea5bacaa2cec6967ed337f4d0ad6034123ca737b Mon Sep 17 00:00:00 2001 From: Mauro Carvalho Chehab Date: Thu, 30 Apr 2020 18:04:03 +0200 Subject: docs: networking: convert netdev-features.txt to ReST Not much to be done here: - add SPDX header; - adjust titles and chapters, adding proper markups; - add to networking/index.rst. Signed-off-by: Mauro Carvalho Chehab Signed-off-by: David S. 
Miller --- Documentation/networking/checksum-offloads.rst | 2 +- Documentation/networking/index.rst | 1 + Documentation/networking/netdev-features.rst | 184 +++++++++++++++++++++++++ Documentation/networking/netdev-features.txt | 181 ------------------------ include/linux/netdev_features.h | 2 +- 5 files changed, 187 insertions(+), 183 deletions(-) create mode 100644 Documentation/networking/netdev-features.rst delete mode 100644 Documentation/networking/netdev-features.txt (limited to 'include/linux') diff --git a/Documentation/networking/checksum-offloads.rst b/Documentation/networking/checksum-offloads.rst index 905c8a84b103..69b23cf6879e 100644 --- a/Documentation/networking/checksum-offloads.rst +++ b/Documentation/networking/checksum-offloads.rst @@ -59,7 +59,7 @@ recomputed for each resulting segment. See the skbuff.h comment (section 'E') for more details. A driver declares its offload capabilities in netdev->hw_features; see -Documentation/networking/netdev-features.txt for more. Note that a device +Documentation/networking/netdev-features.rst for more. Note that a device which only advertises NETIF_F_IP[V6]_CSUM must still obey the csum_start and csum_offset given in the SKB; if it tries to deduce these itself in hardware (as some NICs do) the driver should check that the values in the SKB match diff --git a/Documentation/networking/index.rst b/Documentation/networking/index.rst index e58f872d401d..4c6aa3db97d4 100644 --- a/Documentation/networking/index.rst +++ b/Documentation/networking/index.rst @@ -81,6 +81,7 @@ Contents: mpls-sysctl multiqueue netconsole + netdev-features .. only:: subproject and html diff --git a/Documentation/networking/netdev-features.rst b/Documentation/networking/netdev-features.rst new file mode 100644 index 000000000000..a2d7d7160e39 --- /dev/null +++ b/Documentation/networking/netdev-features.rst @@ -0,0 +1,184 @@ +.. SPDX-License-Identifier: GPL-2.0 + +===================================================== +Netdev features mess and how to get out from it alive +===================================================== + +Author: + Michał Mirosław + + + +Part I: Feature sets +==================== + +Long gone are the days when a network card would just take and give packets +verbatim. Today's devices add multiple features and bugs (read: offloads) +that relieve an OS of various tasks like generating and checking checksums, +splitting packets, classifying them. Those capabilities and their state +are commonly referred to as netdev features in Linux kernel world. + +There are currently three sets of features relevant to the driver, and +one used internally by network core: + + 1. netdev->hw_features set contains features whose state may possibly + be changed (enabled or disabled) for a particular device by user's + request. This set should be initialized in ndo_init callback and not + changed later. + + 2. netdev->features set contains features which are currently enabled + for a device. This should be changed only by network core or in + error paths of ndo_set_features callback. + + 3. netdev->vlan_features set contains features whose state is inherited + by child VLAN devices (limits netdev->features set). This is currently + used for all VLAN devices whether tags are stripped or inserted in + hardware or software. + + 4. netdev->wanted_features set contains feature set requested by user. + This set is filtered by ndo_fix_features callback whenever it or + some device-specific conditions change. 
This set is internal to + networking core and should not be referenced in drivers. + + + +Part II: Controlling enabled features +===================================== + +When current feature set (netdev->features) is to be changed, new set +is calculated and filtered by calling ndo_fix_features callback +and netdev_fix_features(). If the resulting set differs from current +set, it is passed to ndo_set_features callback and (if the callback +returns success) replaces value stored in netdev->features. +NETDEV_FEAT_CHANGE notification is issued after that whenever current +set might have changed. + +The following events trigger recalculation: + 1. device's registration, after ndo_init returned success + 2. user requested changes in features state + 3. netdev_update_features() is called + +ndo_*_features callbacks are called with rtnl_lock held. Missing callbacks +are treated as always returning success. + +A driver that wants to trigger recalculation must do so by calling +netdev_update_features() while holding rtnl_lock. This should not be done +from ndo_*_features callbacks. netdev->features should not be modified by +driver except by means of ndo_fix_features callback. + + + +Part III: Implementation hints +============================== + + * ndo_fix_features: + +All dependencies between features should be resolved here. The resulting +set can be reduced further by networking core imposed limitations (as coded +in netdev_fix_features()). For this reason it is safer to disable a feature +when its dependencies are not met instead of forcing the dependency on. + +This callback should not modify hardware nor driver state (should be +stateless). It can be called multiple times between successive +ndo_set_features calls. + +Callback must not alter features contained in NETIF_F_SOFT_FEATURES or +NETIF_F_NEVER_CHANGE sets. The exception is NETIF_F_VLAN_CHALLENGED but +care must be taken as the change won't affect already configured VLANs. + + * ndo_set_features: + +Hardware should be reconfigured to match passed feature set. The set +should not be altered unless some error condition happens that can't +be reliably detected in ndo_fix_features. In this case, the callback +should update netdev->features to match resulting hardware state. +Errors returned are not (and cannot be) propagated anywhere except dmesg. +(Note: successful return is zero, >0 means silent error.) + + + +Part IV: Features +================= + +For current list of features, see include/linux/netdev_features.h. +This section describes semantics of some of them. + + * Transmit checksumming + +For complete description, see comments near the top of include/linux/skbuff.h. + +Note: NETIF_F_HW_CSUM is a superset of NETIF_F_IP_CSUM + NETIF_F_IPV6_CSUM. +It means that device can fill TCP/UDP-like checksum anywhere in the packets +whatever headers there might be. + + * Transmit TCP segmentation offload + +NETIF_F_TSO_ECN means that hardware can properly split packets with CWR bit +set, be it TCPv4 (when NETIF_F_TSO is enabled) or TCPv6 (NETIF_F_TSO6). + + * Transmit UDP segmentation offload + +NETIF_F_GSO_UDP_L4 accepts a single UDP header with a payload that exceeds +gso_size. On segmentation, it segments the payload on gso_size boundaries and +replicates the network and UDP headers (fixing up the last one if less than +gso_size). + + * Transmit DMA from high memory + +On platforms where this is relevant, NETIF_F_HIGHDMA signals that +ndo_start_xmit can handle skbs with frags in high memory. 
+ + * Transmit scatter-gather + +Those features say that ndo_start_xmit can handle fragmented skbs: +NETIF_F_SG --- paged skbs (skb_shinfo()->frags), NETIF_F_FRAGLIST --- +chained skbs (skb->next/prev list). + + * Software features + +Features contained in NETIF_F_SOFT_FEATURES are features of networking +stack. Driver should not change behaviour based on them. + + * LLTX driver (deprecated for hardware drivers) + +NETIF_F_LLTX is meant to be used by drivers that don't need locking at all, +e.g. software tunnels. + +This is also used in a few legacy drivers that implement their +own locking, don't use it for new (hardware) drivers. + + * netns-local device + +NETIF_F_NETNS_LOCAL is set for devices that are not allowed to move between +network namespaces (e.g. loopback). + +Don't use it in drivers. + + * VLAN challenged + +NETIF_F_VLAN_CHALLENGED should be set for devices which can't cope with VLAN +headers. Some drivers set this because the cards can't handle the bigger MTU. +[FIXME: Those cases could be fixed in VLAN code by allowing only reduced-MTU +VLANs. This may be not useful, though.] + +* rx-fcs + +This requests that the NIC append the Ethernet Frame Checksum (FCS) +to the end of the skb data. This allows sniffers and other tools to +read the CRC recorded by the NIC on receipt of the packet. + +* rx-all + +This requests that the NIC receive all possible frames, including errored +frames (such as bad FCS, etc). This can be helpful when sniffing a link with +bad packets on it. Some NICs may receive more packets if also put into normal +PROMISC mode. + +* rx-gro-hw + +This requests that the NIC enables Hardware GRO (generic receive offload). +Hardware GRO is basically the exact reverse of TSO, and is generally +stricter than Hardware LRO. A packet stream merged by Hardware GRO must +be re-segmentable by GSO or TSO back to the exact original packet stream. +Hardware GRO is dependent on RXCSUM since every packet successfully merged +by hardware must also have the checksum verified by hardware. diff --git a/Documentation/networking/netdev-features.txt b/Documentation/networking/netdev-features.txt deleted file mode 100644 index 58dd1c1e3c65..000000000000 --- a/Documentation/networking/netdev-features.txt +++ /dev/null @@ -1,181 +0,0 @@ -Netdev features mess and how to get out from it alive -===================================================== - -Author: - Michał Mirosław - - - - Part I: Feature sets -====================== - -Long gone are the days when a network card would just take and give packets -verbatim. Today's devices add multiple features and bugs (read: offloads) -that relieve an OS of various tasks like generating and checking checksums, -splitting packets, classifying them. Those capabilities and their state -are commonly referred to as netdev features in Linux kernel world. - -There are currently three sets of features relevant to the driver, and -one used internally by network core: - - 1. netdev->hw_features set contains features whose state may possibly - be changed (enabled or disabled) for a particular device by user's - request. This set should be initialized in ndo_init callback and not - changed later. - - 2. netdev->features set contains features which are currently enabled - for a device. This should be changed only by network core or in - error paths of ndo_set_features callback. - - 3. netdev->vlan_features set contains features whose state is inherited - by child VLAN devices (limits netdev->features set). 
This is currently - used for all VLAN devices whether tags are stripped or inserted in - hardware or software. - - 4. netdev->wanted_features set contains feature set requested by user. - This set is filtered by ndo_fix_features callback whenever it or - some device-specific conditions change. This set is internal to - networking core and should not be referenced in drivers. - - - - Part II: Controlling enabled features -======================================= - -When current feature set (netdev->features) is to be changed, new set -is calculated and filtered by calling ndo_fix_features callback -and netdev_fix_features(). If the resulting set differs from current -set, it is passed to ndo_set_features callback and (if the callback -returns success) replaces value stored in netdev->features. -NETDEV_FEAT_CHANGE notification is issued after that whenever current -set might have changed. - -The following events trigger recalculation: - 1. device's registration, after ndo_init returned success - 2. user requested changes in features state - 3. netdev_update_features() is called - -ndo_*_features callbacks are called with rtnl_lock held. Missing callbacks -are treated as always returning success. - -A driver that wants to trigger recalculation must do so by calling -netdev_update_features() while holding rtnl_lock. This should not be done -from ndo_*_features callbacks. netdev->features should not be modified by -driver except by means of ndo_fix_features callback. - - - - Part III: Implementation hints -================================ - - * ndo_fix_features: - -All dependencies between features should be resolved here. The resulting -set can be reduced further by networking core imposed limitations (as coded -in netdev_fix_features()). For this reason it is safer to disable a feature -when its dependencies are not met instead of forcing the dependency on. - -This callback should not modify hardware nor driver state (should be -stateless). It can be called multiple times between successive -ndo_set_features calls. - -Callback must not alter features contained in NETIF_F_SOFT_FEATURES or -NETIF_F_NEVER_CHANGE sets. The exception is NETIF_F_VLAN_CHALLENGED but -care must be taken as the change won't affect already configured VLANs. - - * ndo_set_features: - -Hardware should be reconfigured to match passed feature set. The set -should not be altered unless some error condition happens that can't -be reliably detected in ndo_fix_features. In this case, the callback -should update netdev->features to match resulting hardware state. -Errors returned are not (and cannot be) propagated anywhere except dmesg. -(Note: successful return is zero, >0 means silent error.) - - - - Part IV: Features -=================== - -For current list of features, see include/linux/netdev_features.h. -This section describes semantics of some of them. - - * Transmit checksumming - -For complete description, see comments near the top of include/linux/skbuff.h. - -Note: NETIF_F_HW_CSUM is a superset of NETIF_F_IP_CSUM + NETIF_F_IPV6_CSUM. -It means that device can fill TCP/UDP-like checksum anywhere in the packets -whatever headers there might be. - - * Transmit TCP segmentation offload - -NETIF_F_TSO_ECN means that hardware can properly split packets with CWR bit -set, be it TCPv4 (when NETIF_F_TSO is enabled) or TCPv6 (NETIF_F_TSO6). - - * Transmit UDP segmentation offload - -NETIF_F_GSO_UDP_L4 accepts a single UDP header with a payload that exceeds -gso_size. 
On segmentation, it segments the payload on gso_size boundaries and -replicates the network and UDP headers (fixing up the last one if less than -gso_size). - - * Transmit DMA from high memory - -On platforms where this is relevant, NETIF_F_HIGHDMA signals that -ndo_start_xmit can handle skbs with frags in high memory. - - * Transmit scatter-gather - -Those features say that ndo_start_xmit can handle fragmented skbs: -NETIF_F_SG --- paged skbs (skb_shinfo()->frags), NETIF_F_FRAGLIST --- -chained skbs (skb->next/prev list). - - * Software features - -Features contained in NETIF_F_SOFT_FEATURES are features of networking -stack. Driver should not change behaviour based on them. - - * LLTX driver (deprecated for hardware drivers) - -NETIF_F_LLTX is meant to be used by drivers that don't need locking at all, -e.g. software tunnels. - -This is also used in a few legacy drivers that implement their -own locking, don't use it for new (hardware) drivers. - - * netns-local device - -NETIF_F_NETNS_LOCAL is set for devices that are not allowed to move between -network namespaces (e.g. loopback). - -Don't use it in drivers. - - * VLAN challenged - -NETIF_F_VLAN_CHALLENGED should be set for devices which can't cope with VLAN -headers. Some drivers set this because the cards can't handle the bigger MTU. -[FIXME: Those cases could be fixed in VLAN code by allowing only reduced-MTU -VLANs. This may be not useful, though.] - -* rx-fcs - -This requests that the NIC append the Ethernet Frame Checksum (FCS) -to the end of the skb data. This allows sniffers and other tools to -read the CRC recorded by the NIC on receipt of the packet. - -* rx-all - -This requests that the NIC receive all possible frames, including errored -frames (such as bad FCS, etc). This can be helpful when sniffing a link with -bad packets on it. Some NICs may receive more packets if also put into normal -PROMISC mode. - -* rx-gro-hw - -This requests that the NIC enables Hardware GRO (generic receive offload). -Hardware GRO is basically the exact reverse of TSO, and is generally -stricter than Hardware LRO. A packet stream merged by Hardware GRO must -be re-segmentable by GSO or TSO back to the exact original packet stream. -Hardware GRO is dependent on RXCSUM since every packet successfully merged -by hardware must also have the checksum verified by hardware. diff --git a/include/linux/netdev_features.h b/include/linux/netdev_features.h index 9d53c5ad272c..2cc3cf80b49a 100644 --- a/include/linux/netdev_features.h +++ b/include/linux/netdev_features.h @@ -89,7 +89,7 @@ enum { * Add your fresh new feature above and remember to update * netdev_features_strings[] in net/core/ethtool.c and maybe * some feature mask #defines below. Please also describe it - * in Documentation/networking/netdev-features.txt. + * in Documentation/networking/netdev-features.rst. */ /**/NETDEV_FEATURE_COUNT -- cgit v1.2.3 From 2b195850128f5bafde177b12489d9fa27962cc1e Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Thu, 30 Apr 2020 10:35:41 -0700 Subject: tcp: add tp->dup_ack_counter In commit 86de5921a3d5 ("tcp: defer SACK compression after DupThresh") I added a TCP_FASTRETRANS_THRESH bias to tp->compressed_ack in order to enable sack compression only after 3 dupacks. Since we plan to relax this rule for flows that involve stacks not requiring this old rule, this patch adds a distinct tp->dup_ack_counter. 
This means the TCP_FASTRETRANS_THRESH value is now used in a single location that a future patch can adjust: if (tp->dup_ack_counter < TCP_FASTRETRANS_THRESH) { tp->dup_ack_counter++; goto send_now; } This patch also introduces the tcp_sack_compress_send_ack() helper to make the following patches easier to follow. This patch refines LINUX_MIB_TCPACKCOMPRESSED to not count the acks that we had to send if the timer expires or tcp_sack_compress_send_ack() is sending an ack. Signed-off-by: Eric Dumazet Acked-by: Soheil Hassas Yeganeh Acked-by: Neal Cardwell Signed-off-by: David S. Miller --- include/linux/tcp.h | 1 + net/ipv4/tcp_input.c | 36 +++++++++++++++++++++++++++--------- net/ipv4/tcp_output.c | 6 +++--- net/ipv4/tcp_timer.c | 8 +++++++- 4 files changed, 38 insertions(+), 13 deletions(-) (limited to 'include/linux') diff --git a/include/linux/tcp.h b/include/linux/tcp.h index 421c99c12291..2c6f87e9f0cf 100644 --- a/include/linux/tcp.h +++ b/include/linux/tcp.h @@ -268,6 +268,7 @@ struct tcp_sock { } rack; u16 advmss; /* Advertised MSS */ u8 compressed_ack; + u8 dup_ack_counter; u32 chrono_start; /* Start time in jiffies of a TCP chrono */ u32 chrono_stat[3]; /* Time in jiffies for chrono_stat stats */ u8 chrono_type:2, /* current chronograph type */ diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index bf4ced9273e8..da777df0a0ba 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -4327,6 +4327,27 @@ static void tcp_sack_maybe_coalesce(struct tcp_sock *tp) } } +static void tcp_sack_compress_send_ack(struct sock *sk) +{ + struct tcp_sock *tp = tcp_sk(sk); + + if (!tp->compressed_ack) + return; + + if (hrtimer_try_to_cancel(&tp->compressed_ack_timer) == 1) + __sock_put(sk); + + /* Since we have to send one ack finally, + * subtract one from tp->compressed_ack to keep + * LINUX_MIB_TCPACKCOMPRESSED accurate. + */ + NET_ADD_STATS(sock_net(sk), LINUX_MIB_TCPACKCOMPRESSED, + tp->compressed_ack - 1); + + tp->compressed_ack = 0; + tcp_send_ack(sk); +} + static void tcp_sack_new_ofo_skb(struct sock *sk, u32 seq, u32 end_seq) { struct tcp_sock *tp = tcp_sk(sk); @@ -4355,8 +4376,7 @@ static void tcp_sack_new_ofo_skb(struct sock *sk, u32 seq, u32 end_seq) * If the sack array is full, forget about the last one.
*/ if (this_sack >= TCP_NUM_SACKS) { - if (tp->compressed_ack > TCP_FASTRETRANS_THRESH) - tcp_send_ack(sk); + tcp_sack_compress_send_ack(sk); this_sack--; tp->rx_opt.num_sacks--; sp--; @@ -5275,15 +5295,13 @@ send_now: if (tp->compressed_ack_rcv_nxt != tp->rcv_nxt) { tp->compressed_ack_rcv_nxt = tp->rcv_nxt; - if (tp->compressed_ack > TCP_FASTRETRANS_THRESH) - NET_ADD_STATS(sock_net(sk), LINUX_MIB_TCPACKCOMPRESSED, - tp->compressed_ack - TCP_FASTRETRANS_THRESH); - tp->compressed_ack = 0; + tp->dup_ack_counter = 0; } - - if (++tp->compressed_ack <= TCP_FASTRETRANS_THRESH) + if (tp->dup_ack_counter < TCP_FASTRETRANS_THRESH) { + tp->dup_ack_counter++; goto send_now; - + } + tp->compressed_ack++; if (hrtimer_is_queued(&tp->compressed_ack_timer)) return; diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index ba4482130f08..c414aeb1efa9 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -184,10 +184,10 @@ static inline void tcp_event_ack_sent(struct sock *sk, unsigned int pkts, { struct tcp_sock *tp = tcp_sk(sk); - if (unlikely(tp->compressed_ack > TCP_FASTRETRANS_THRESH)) { + if (unlikely(tp->compressed_ack)) { NET_ADD_STATS(sock_net(sk), LINUX_MIB_TCPACKCOMPRESSED, - tp->compressed_ack - TCP_FASTRETRANS_THRESH); - tp->compressed_ack = TCP_FASTRETRANS_THRESH; + tp->compressed_ack); + tp->compressed_ack = 0; if (hrtimer_try_to_cancel(&tp->compressed_ack_timer) == 1) __sock_put(sk); } diff --git a/net/ipv4/tcp_timer.c b/net/ipv4/tcp_timer.c index c3f26dcd6704..ada046f425d2 100644 --- a/net/ipv4/tcp_timer.c +++ b/net/ipv4/tcp_timer.c @@ -753,8 +753,14 @@ static enum hrtimer_restart tcp_compressed_ack_kick(struct hrtimer *timer) bh_lock_sock(sk); if (!sock_owned_by_user(sk)) { - if (tp->compressed_ack > TCP_FASTRETRANS_THRESH) + if (tp->compressed_ack) { + /* Since we have to send one ack finally, + * subtract one from tp->compressed_ack to keep + * LINUX_MIB_TCPACKCOMPRESSED accurate. + */ + tp->compressed_ack--; tcp_send_ack(sk); + } } else { if (!test_and_set_bit(TCP_DELACK_TIMER_DEFERRED, &sk->sk_tsq_flags)) -- cgit v1.2.3 From 54c52e10dc9b939084a7e6e3d32ce8fd8dee7898 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Mon, 13 Apr 2020 12:27:55 -0400 Subject: blk-iocost: switch to fixed non-auto-decaying use_delay The use_delay mechanism was introduced by blk-iolatency to hold memory allocators accountable for the reclaim and other shared IOs they cause. The duration of the delay is dynamically balanced between iolatency increasing the value on each target miss and it auto-decaying as time passes and threads get delayed on it. While this works well for iolatency, iocost's control model isn't compatible with it. There are no repeated "violation" events which can be balanced against auto-decaying. iocost instead knows how much a given cgroup is over budget and wants to prevent that cgroup from issuing IOs while over budget. Until now, iocost has been adding the cost of force-issued IOs. However, this doesn't reflect the amount which is already over budget and is simply not enough to counter the auto-decaying, allowing an anon-memory-leaking low-priority cgroup to go over its allotted share of IOs. As auto-decaying doesn't make much sense for iocost, this patch introduces a different mode of operation for use_delay - when blkcg_set_delay() is used instead of blkcg_add/use_delay(), the delay duration is not auto-decayed until it is explicitly cleared with blkcg_clear_delay(). iocost is updated to keep the delay duration synchronized to the budget overage amount.
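For orientation, the two resulting modes of use_delay can be sketched as follows (illustrative pseudo-usage distilled from the hunks below, not a verbatim excerpt; blkg, now_ns, delta_ns and delay_ns are placeholder variables):

/* Decaying mode, as used by blk-iolatency: use_delay acts as a
 * positive refcount and delay_nsec is scaled down as time passes. */
blkcg_use_delay(blkg);
blkcg_add_delay(blkg, now_ns, delta_ns);

/* Fixed mode, as used by blk-iocost after this patch: use_delay is
 * flipped to -1 and delay_nsec stays put until explicitly cleared. */
blkcg_set_delay(blkg, delay_ns);
/* ... later, once the cgroup is no longer over budget ... */
blkcg_clear_delay(blkg);

Mixing the two styles on the same blkg is a bug; the WARN_ON_ONCE() checks added to blkcg_use_delay() and blkcg_add_delay() below enforce that.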
With this change, iocost can effectively police cgroups which generate a significant amount of force-issued IOs. Signed-off-by: Tejun Heo Cc: Josef Bacik Signed-off-by: Jens Axboe --- block/blk-cgroup.c | 6 ++++++ block/blk-iocost.c | 23 +++++++++-------------- include/linux/blk-cgroup.h | 43 +++++++++++++++++++++++++++++++++---------- 3 files changed, 48 insertions(+), 24 deletions(-) (limited to 'include/linux') diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c index c5dc833212e1..0a63c6cbbcb1 100644 --- a/block/blk-cgroup.c +++ b/block/blk-cgroup.c @@ -1530,6 +1530,10 @@ static void blkcg_scale_delay(struct blkcg_gq *blkg, u64 now) { u64 old = atomic64_read(&blkg->delay_start); + /* negative use_delay means no scaling, see blkcg_set_delay() */ + if (atomic_read(&blkg->use_delay) < 0) + return; + /* * We only want to scale down every second. The idea here is that we * want to delay people for min(delay_nsec, NSEC_PER_SEC) in a certain @@ -1717,6 +1721,8 @@ void blkcg_schedule_throttle(struct request_queue *q, bool use_memdelay) */ void blkcg_add_delay(struct blkcg_gq *blkg, u64 now, u64 delta) { + if (WARN_ON_ONCE(atomic_read(&blkg->use_delay) < 0)) + return; blkcg_scale_delay(blkg, now); atomic64_add(delta, &blkg->delay_nsec); } diff --git a/block/blk-iocost.c b/block/blk-iocost.c index db35ee682294..a8e99ef76a08 100644 --- a/block/blk-iocost.c +++ b/block/blk-iocost.c @@ -1209,14 +1209,14 @@ static enum hrtimer_restart iocg_waitq_timer_fn(struct hrtimer *timer) return HRTIMER_NORESTART; } -static bool iocg_kick_delay(struct ioc_gq *iocg, struct ioc_now *now, u64 cost) +static bool iocg_kick_delay(struct ioc_gq *iocg, struct ioc_now *now) { struct ioc *ioc = iocg->ioc; struct blkcg_gq *blkg = iocg_to_blkg(iocg); u64 vtime = atomic64_read(&iocg->vtime); u64 vmargin = ioc->margin_us * now->vrate; u64 margin_ns = ioc->margin_us * NSEC_PER_USEC; - u64 expires, oexpires; + u64 delta_ns, expires, oexpires; u32 hw_inuse; /* debt-adjust vtime */ @@ -1233,15 +1233,10 @@ static bool iocg_kick_delay(struct ioc_gq *iocg, struct ioc_now *now, u64 cost) return false; /* use delay */ - if (cost) { - u64 cost_ns = DIV64_U64_ROUND_UP(cost * NSEC_PER_USEC, - now->vrate); - blkcg_add_delay(blkg, now->now_ns, cost_ns); - } - blkcg_use_delay(blkg); - - expires = now->now_ns + DIV64_U64_ROUND_UP(vtime - now->vnow, - now->vrate) * NSEC_PER_USEC; + delta_ns = DIV64_U64_ROUND_UP(vtime - now->vnow, + now->vrate) * NSEC_PER_USEC; + blkcg_set_delay(blkg, delta_ns); + expires = now->now_ns + delta_ns; /* if already active and close enough, don't bother */ oexpires = ktime_to_ns(hrtimer_get_softexpires(&iocg->delay_timer)); @@ -1260,7 +1255,7 @@ static enum hrtimer_restart iocg_delay_timer_fn(struct hrtimer *timer) struct ioc_now now; ioc_now(iocg->ioc, &now); - iocg_kick_delay(iocg, &now, 0); + iocg_kick_delay(iocg, &now); return HRTIMER_NORESTART; } @@ -1378,7 +1373,7 @@ static void ioc_timer_fn(struct timer_list *timer) atomic64_read(&iocg->abs_vdebt)) { /* might be oversleeping vtime / hweight changes, kick */ iocg_kick_waitq(iocg, &now); - iocg_kick_delay(iocg, &now, 0); + iocg_kick_delay(iocg, &now); } else if (iocg_is_idle(iocg)) { /* no waiter and idle, deactivate */ iocg->last_inuse = iocg->inuse; @@ -1737,7 +1732,7 @@ static void ioc_rqos_throttle(struct rq_qos *rqos, struct bio *bio) */ if (bio_issue_as_root_blkg(bio) || fatal_signal_pending(current)) { atomic64_add(abs_cost, &iocg->abs_vdebt); - if (iocg_kick_delay(iocg, &now, cost)) + if (iocg_kick_delay(iocg, &now)) blkcg_schedule_throttle(rqos->q,
(bio->bi_opf & REQ_SWAP) == REQ_SWAP); return; diff --git a/include/linux/blk-cgroup.h b/include/linux/blk-cgroup.h index 4deb8bb7b6af..a57ebe2f00ab 100644 --- a/include/linux/blk-cgroup.h +++ b/include/linux/blk-cgroup.h @@ -631,6 +631,8 @@ static inline bool blkcg_bio_issue_check(struct request_queue *q, static inline void blkcg_use_delay(struct blkcg_gq *blkg) { + if (WARN_ON_ONCE(atomic_read(&blkg->use_delay) < 0)) + return; if (atomic_add_return(1, &blkg->use_delay) == 1) atomic_inc(&blkg->blkcg->css.cgroup->congestion_count); } @@ -639,6 +641,8 @@ static inline int blkcg_unuse_delay(struct blkcg_gq *blkg) { int old = atomic_read(&blkg->use_delay); + if (WARN_ON_ONCE(old < 0)) + return 0; if (old == 0) return 0; @@ -663,20 +667,39 @@ static inline int blkcg_unuse_delay(struct blkcg_gq *blkg) return 1; } +/** + * blkcg_set_delay - Enable allocator delay mechanism with the specified delay amount + * @blkg: target blkg + * @delay: delay duration in nsecs + * + * When enabled with this function, the delay is not decayed and must be + * explicitly cleared with blkcg_clear_delay(). Must not be mixed with + * blkcg_[un]use_delay() and blkcg_add_delay() usages. + */ +static inline void blkcg_set_delay(struct blkcg_gq *blkg, u64 delay) +{ + int old = atomic_read(&blkg->use_delay); + + /* We only want 1 person setting the congestion count for this blkg. */ + if (!old && atomic_cmpxchg(&blkg->use_delay, old, -1) == old) + atomic_inc(&blkg->blkcg->css.cgroup->congestion_count); + + atomic64_set(&blkg->delay_nsec, delay); +} + +/** + * blkcg_clear_delay - Disable allocator delay mechanism + * @blkg: target blkg + * + * Disable use_delay mechanism. See blkcg_set_delay(). + */ static inline void blkcg_clear_delay(struct blkcg_gq *blkg) { int old = atomic_read(&blkg->use_delay); - if (!old) - return; + /* We only want 1 person clearing the congestion count for this blkg. */ - while (old) { - int cur = atomic_cmpxchg(&blkg->use_delay, old, 0); - if (cur == old) { - atomic_dec(&blkg->blkcg->css.cgroup->congestion_count); - break; - } - old = cur; - } + if (old && atomic_cmpxchg(&blkg->use_delay, old, 0) == old) + atomic_dec(&blkg->blkcg->css.cgroup->congestion_count); } void blkcg_add_delay(struct blkcg_gq *blkg, u64 now, u64 delta); -- cgit v1.2.3 From c100beb9ccfb98e2474586a4006483cbf770c823 Mon Sep 17 00:00:00 2001 From: Alexandru Gagniuc Date: Mon, 27 Apr 2020 18:25:13 -0500 Subject: PCI/AER: Use only _OSC to determine AER ownership Per the PCI Firmware spec, r3.2, sec 4.5.1, the OS can request control of AER via bit 3 of the _OSC Control Field. In the returned value of the Control Field: The firmware sets [bit 3] to 1 to grant control over PCI Express Advanced Error Reporting. ... after control is transferred to the operating system, firmware must not modify the Advanced Error Reporting Capability. If control of this feature was requested and denied or was not requested, firmware returns this bit set to 0. Previously the pci_root driver looked at the HEST FIRMWARE_FIRST bit to determine whether to request ownership of the AER Capability. This was based on ACPI spec v6.3, sec 18.3.2.4, and similar sections, which say things like: Bit [0] - FIRMWARE_FIRST: If set, indicates that system firmware will handle errors from this source first. Bit [1] - GLOBAL: If set, indicates that the settings contained in this structure apply globally to all PCI Express Devices. These ACPI references don't say anything about ownership of the AER Capability. 
Remove use of the FIRMWARE_FIRST bit and rely only on the _OSC bit to determine whether we have control of the AER Capability. Link: https://lore.kernel.org/r/20181115231605.24352-1-mr.nuke.me@gmail.com/ v1 Link: https://lore.kernel.org/r/20190326172343.28946-1-mr.nuke.me@gmail.com/ v2 Link: https://lore.kernel.org/r/67af2931705bed9a588b5a39d369cb70b9942190.1587925636.git.sathyanarayanan.kuppuswamy@linux.intel.com [bhelgaas: commit log, note: Alex posted this identical patch 18 months ago, and I failed to apply it then, so I made him the author, added links to his postings, and added his Signed-off-by] Signed-off-by: Alexandru Gagniuc Signed-off-by: Kuppuswamy Sathyanarayanan Signed-off-by: Bjorn Helgaas Reviewed-by: Jon Derrick --- drivers/acpi/pci_root.c | 9 ++------- drivers/pci/pcie/aer.c | 26 +------------------------- include/linux/pci-acpi.h | 6 ------ 3 files changed, 3 insertions(+), 38 deletions(-) (limited to 'include/linux') diff --git a/drivers/acpi/pci_root.c b/drivers/acpi/pci_root.c index ac8ad6cb82aa..9e235c1a75ff 100644 --- a/drivers/acpi/pci_root.c +++ b/drivers/acpi/pci_root.c @@ -483,13 +483,8 @@ static void negotiate_os_control(struct acpi_pci_root *root, int *no_aspm, if (IS_ENABLED(CONFIG_HOTPLUG_PCI_SHPC)) control |= OSC_PCI_SHPC_NATIVE_HP_CONTROL; - if (pci_aer_available()) { - if (aer_acpi_firmware_first()) - dev_info(&device->dev, - "PCIe AER handled by firmware\n"); - else - control |= OSC_PCI_EXPRESS_AER_CONTROL; - } + if (pci_aer_available()) + control |= OSC_PCI_EXPRESS_AER_CONTROL; /* * Per the Downstream Port Containment Related Enhancements ECN to diff --git a/drivers/pci/pcie/aer.c b/drivers/pci/pcie/aer.c index f4274d301235..efc26773cc6d 100644 --- a/drivers/pci/pcie/aer.c +++ b/drivers/pci/pcie/aer.c @@ -318,30 +318,6 @@ int pcie_aer_get_firmware_first(struct pci_dev *dev) aer_set_firmware_first(dev); return dev->__aer_firmware_first; } - -static bool aer_firmware_first; - -/** - * aer_acpi_firmware_first - Check if APEI should control AER. 
- */ -bool aer_acpi_firmware_first(void) -{ - static bool parsed = false; - struct aer_hest_parse_info info = { - .pci_dev = NULL, /* Check all PCIe devices */ - .firmware_first = 0, - }; - - if (pcie_ports_native) - return false; - - if (!parsed) { - apei_hest_parse(aer_hest_parse, &info); - aer_firmware_first = info.firmware_first; - parsed = true; - } - return aer_firmware_first; -} #endif #define PCI_EXP_AER_FLAGS (PCI_EXP_DEVCTL_CERE | PCI_EXP_DEVCTL_NFERE | \ @@ -1523,7 +1499,7 @@ static struct pcie_port_service_driver aerdriver = { */ int __init pcie_aer_init(void) { - if (!pci_aer_available() || aer_acpi_firmware_first()) + if (!pci_aer_available()) return -ENXIO; return pcie_port_service_register(&aerdriver); } diff --git a/include/linux/pci-acpi.h b/include/linux/pci-acpi.h index 2d155bfb8fbf..11c98875538a 100644 --- a/include/linux/pci-acpi.h +++ b/include/linux/pci-acpi.h @@ -125,10 +125,4 @@ static inline void acpi_pci_add_bus(struct pci_bus *bus) { } static inline void acpi_pci_remove_bus(struct pci_bus *bus) { } #endif /* CONFIG_ACPI */ -#ifdef CONFIG_ACPI_APEI -extern bool aer_acpi_firmware_first(void); -#else -static inline bool aer_acpi_firmware_first(void) { return false; } -#endif - #endif /* _PCI_ACPI_H_ */ -- cgit v1.2.3 From 999a22890cb183b918e4372395d24426a755cef2 Mon Sep 17 00:00:00 2001 From: Christophe Leroy Date: Fri, 3 Apr 2020 07:20:50 +0000 Subject: uaccess: Add user_read_access_begin/end and user_write_access_begin/end Some architectures like powerpc64 have the capability to separate read access and write access protection. For get_user() and copy_from_user(), powerpc64 only opens read access. For put_user() and copy_to_user(), powerpc64 only opens write access. But when using unsafe_get_user() or unsafe_put_user(), user_access_begin opens both read and write. Other architectures like powerpc book3s 32 bits only allow write access protection, and on this architecture protection is a heavy operation as it requires locking/unlocking per segment of 256Mbytes. On those architectures it is therefore desirable to do the unlocking only for write access. (Note that book3s/32 ranges from very old PowerMacs from the 90's with the PowerPC 601 processor to modern ADSL boxes with PowerQuicc II processors for instance, so it is still worth considering.) To avoid the risk of an attacker manipulating the variable parameters passed to user_access_begin/end in a way that leaves user access open, or opens more than intended, it is preferable to use dedicated static functions that can't be overridden. Add user_read_access_begin and user_read_access_end to only open read access. Add user_write_access_begin and user_write_access_end to only open write access. By default, when undefined, those new access helpers default to the existing user_access_begin and user_access_end.
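To illustrate the intended pairing, a minimal sketch (the function, label and variable names are invented for this example, and it assumes an architecture where the helpers are wired up; otherwise they fall back to user_access_begin/end):

static int read_one(const int __user *uptr, int *val)
{
	if (!user_read_access_begin(uptr, sizeof(*uptr)))
		return -EFAULT;
	unsafe_get_user(*val, uptr, Efault);
	user_read_access_end();
	return 0;
Efault:
	user_read_access_end();
	return -EFAULT;
}

On powerpc64 this opens only read access for the critical section; a symmetric user_write_access_begin()/user_write_access_end() pair around unsafe_put_user() opens only write access.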
Signed-off-by: Christophe Leroy Reviewed-by: Kees Cook Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/36e43241c7f043a24b5069e78c6a7edd11043be5.1585898438.git.christophe.leroy@c-s.fr --- include/linux/uaccess.h | 8 ++++++++ 1 file changed, 8 insertions(+) (limited to 'include/linux') diff --git a/include/linux/uaccess.h b/include/linux/uaccess.h index 67f016010aad..9861c89f93be 100644 --- a/include/linux/uaccess.h +++ b/include/linux/uaccess.h @@ -378,6 +378,14 @@ extern long strnlen_unsafe_user(const void __user *unsafe_addr, long count); static inline unsigned long user_access_save(void) { return 0UL; } static inline void user_access_restore(unsigned long flags) { } #endif +#ifndef user_write_access_begin +#define user_write_access_begin user_access_begin +#define user_write_access_end user_access_end +#endif +#ifndef user_read_access_begin +#define user_read_access_begin user_access_begin +#define user_read_access_end user_access_end +#endif #ifdef CONFIG_HARDENED_USERCOPY void usercopy_warn(const char *name, const char *detail, bool to_user, -- cgit v1.2.3 From f256356f65e6449a9fcf6089ea25882c91768665 Mon Sep 17 00:00:00 2001 From: Yangbo Lu Date: Mon, 27 Apr 2020 11:39:03 +0800 Subject: ptp_qoriq: output PPS signal on FIPER2 by default Output the PPS signal on FIPER2 (Fixed Period Interval Pulse) by default, which is what users generally want. Signed-off-by: Yangbo Lu Signed-off-by: David S. Miller --- include/linux/fsl/ptp_qoriq.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/fsl/ptp_qoriq.h b/include/linux/fsl/ptp_qoriq.h index 75884563059f..884b8f8ca06d 100644 --- a/include/linux/fsl/ptp_qoriq.h +++ b/include/linux/fsl/ptp_qoriq.h @@ -135,7 +135,7 @@ struct ptp_qoriq_registers { #define DEFAULT_CKSEL 1 #define DEFAULT_TMR_PRSC 2 #define DEFAULT_FIPER1_PERIOD 1000000000 -#define DEFAULT_FIPER2_PERIOD 100000 +#define DEFAULT_FIPER2_PERIOD 1000000000 struct ptp_qoriq { void __iomem *base; -- cgit v1.2.3 From e4e51da66dc812176cca16b0f8a5b87b173deb5d Mon Sep 17 00:00:00 2001 From: Michael Walle Date: Wed, 29 Apr 2020 01:06:59 +0200 Subject: net: phy: bcm54140: add second PHY ID This PHY has two PHY IDs depending on its mode. Adjust the mask so that it includes both IDs. Signed-off-by: Michael Walle Reviewed-by: Andrew Lunn Reviewed-by: Florian Fainelli Signed-off-by: David S. Miller --- drivers/net/phy/bcm54140.c | 11 +++++++++-- include/linux/brcmphy.h | 2 +- 2 files changed, 10 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/drivers/net/phy/bcm54140.c b/drivers/net/phy/bcm54140.c index d0498ed47878..400d7c3c405a 100644 --- a/drivers/net/phy/bcm54140.c +++ b/drivers/net/phy/bcm54140.c @@ -115,6 +115,13 @@ #define BCM54140_HWMON_IN_ALARM_BIT(ch) ((ch) ? BCM54140_RDB_MON_ISR_3V3 \ : BCM54140_RDB_MON_ISR_1V0) +/* This PHY has two different PHY IDs depending on its MODE_SEL pin. This + * pin chooses between 4x SGMII and QSGMII mode: + * AE02_5009 4x SGMII + * AE02_5019 QSGMII + */ +#define BCM54140_PHY_ID_MASK 0xffffffe8 + #define BCM54140_PHY_ID_REV(phy_id) ((phy_id) & 0x7) #define BCM54140_REV_B0 1 @@ -857,7 +864,7 @@ static int bcm54140_set_tunable(struct phy_device *phydev, static struct phy_driver bcm54140_drivers[] = { { .phy_id = PHY_ID_BCM54140, - .phy_id_mask = 0xfffffff8, + .phy_id_mask = BCM54140_PHY_ID_MASK, .name = "Broadcom BCM54140", .features = PHY_GBIT_FEATURES, .config_init = bcm54140_config_init, @@ -875,7 +882,7 @@ static struct phy_driver bcm54140_drivers[] = { module_phy_driver(bcm54140_drivers); static struct mdio_device_id __maybe_unused bcm54140_tbl[] = { - { PHY_ID_BCM54140, 0xfffffff8 }, + { PHY_ID_BCM54140, BCM54140_PHY_ID_MASK }, { } }; diff --git a/include/linux/brcmphy.h b/include/linux/brcmphy.h index 8be150e69c7c..58d0150acc3e 100644 --- a/include/linux/brcmphy.h +++ b/include/linux/brcmphy.h @@ -25,7 +25,7 @@ #define PHY_ID_BCM5461 0x002060c0 #define PHY_ID_BCM54612E 0x03625e60 #define PHY_ID_BCM54616S 0x03625d10 -#define PHY_ID_BCM54140 0xae025019 +#define PHY_ID_BCM54140 0xae025009 #define PHY_ID_BCM57780 0x03625d90 #define PHY_ID_BCM89610 0x03625cd0
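A quick sanity check of the relaxed mask (worked arithmetic, not part of the patch itself):

	0xae025009 & 0xffffffe8 == 0xae025008	/* 4x SGMII ID */
	0xae025019 & 0xffffffe8 == 0xae025008	/* QSGMII ID   */

Under the old mask 0xfffffff8 the two IDs masked to different values (0xae025008 vs 0xae025018), so only one mode could match. The new mask additionally clears bit 4, which follows the MODE_SEL pin, on top of the three revision bits already exposed via BCM54140_PHY_ID_REV().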
-- cgit v1.2.3 From 9ba2353b2cc58ba13f7a7369208107f133f6a27b Mon Sep 17 00:00:00 2001 From: Michał Mirosław Date: Fri, 3 Apr 2020 22:20:32 +0200 Subject: power: supply: core: allow to constify property lists MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Since tables pointed to by power_supply_desc->properties and ->usb_types are not expected to change after registration, mark the pointers accordingly. Signed-off-by: Michał Mirosław Signed-off-by: Sebastian Reichel --- include/linux/power_supply.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/power_supply.h b/include/linux/power_supply.h index dcd5a71e6c67..6a34df65d4d1 100644 --- a/include/linux/power_supply.h +++ b/include/linux/power_supply.h @@ -223,9 +223,9 @@ struct power_supply_config { struct power_supply_desc { const char *name; enum power_supply_type type; - enum power_supply_usb_type *usb_types; + const enum power_supply_usb_type *usb_types; size_t num_usb_types; - enum power_supply_property *properties; + const enum power_supply_property *properties; size_t num_properties; /* -- cgit v1.2.3 From 0b104773b4f72ccd8af98a2f1efe69b174c344d3 Mon Sep 17 00:00:00 2001 From: Rob Herring Date: Thu, 9 Apr 2020 17:49:21 -0600 Subject: PCI: Constify struct pci_ecam_ops struct pci_ecam_ops is typically DT match table data which is defined to be const. It's also best practice for ops structs to be const. Ideally, we'd make struct pci_ops const as well, but that becomes pretty invasive, so for now we just cast it where needed. Link: https://lore.kernel.org/r/20200409234923.21598-2-robh@kernel.org Signed-off-by: Rob Herring Signed-off-by: Lorenzo Pieralisi Acked-by: Bjorn Helgaas Acked-by: Catalin Marinas Cc: Catalin Marinas Cc: Will Deacon Cc: Lorenzo Pieralisi Cc: Andrew Murray Cc: Bjorn Helgaas Cc: "Rafael J.
Wysocki" Cc: Len Brown Cc: Jonathan Chocron Cc: Zhou Wang Cc: Robert Richter Cc: Toan Le Cc: Marc Gonzalez Cc: Mans Rullgard Cc: linux-acpi@vger.kernel.org --- arch/arm64/kernel/pci.c | 4 ++-- drivers/acpi/pci_mcfg.c | 8 ++++---- drivers/pci/controller/dwc/pcie-al.c | 2 +- drivers/pci/controller/dwc/pcie-hisi.c | 8 ++++---- drivers/pci/controller/pci-host-common.c | 6 +++--- drivers/pci/controller/pci-host-generic.c | 4 ++-- drivers/pci/controller/pci-thunder-ecam.c | 2 +- drivers/pci/controller/pci-thunder-pem.c | 4 ++-- drivers/pci/controller/pci-xgene.c | 4 ++-- drivers/pci/controller/pcie-tango.c | 2 +- drivers/pci/ecam.c | 6 +++--- include/linux/pci-acpi.h | 2 +- include/linux/pci-ecam.h | 22 +++++++++++----------- 13 files changed, 37 insertions(+), 37 deletions(-) (limited to 'include/linux') diff --git a/arch/arm64/kernel/pci.c b/arch/arm64/kernel/pci.c index 570988c7a7ff..1006ed2d7c60 100644 --- a/arch/arm64/kernel/pci.c +++ b/arch/arm64/kernel/pci.c @@ -117,7 +117,7 @@ pci_acpi_setup_ecam_mapping(struct acpi_pci_root *root) struct device *dev = &root->device->dev; struct resource *bus_res = &root->secondary; u16 seg = root->segment; - struct pci_ecam_ops *ecam_ops; + const struct pci_ecam_ops *ecam_ops; struct resource cfgres; struct acpi_device *adev; struct pci_config_window *cfg; @@ -185,7 +185,7 @@ struct pci_bus *pci_acpi_scan_root(struct acpi_pci_root *root) root_ops->release_info = pci_acpi_generic_release_info; root_ops->prepare_resources = pci_acpi_root_prepare_resources; - root_ops->pci_ops = &ri->cfg->ops->pci_ops; + root_ops->pci_ops = (struct pci_ops *)&ri->cfg->ops->pci_ops; bus = acpi_pci_root_create(root, root_ops, &ri->common, ri->cfg); if (!bus) return NULL; diff --git a/drivers/acpi/pci_mcfg.c b/drivers/acpi/pci_mcfg.c index 6b347d9920cc..54b36b7ad47d 100644 --- a/drivers/acpi/pci_mcfg.c +++ b/drivers/acpi/pci_mcfg.c @@ -29,7 +29,7 @@ struct mcfg_fixup { u32 oem_revision; u16 segment; struct resource bus_range; - struct pci_ecam_ops *ops; + const struct pci_ecam_ops *ops; struct resource cfgres; }; @@ -165,7 +165,7 @@ static int pci_mcfg_quirk_matches(struct mcfg_fixup *f, u16 segment, static void pci_mcfg_apply_quirks(struct acpi_pci_root *root, struct resource *cfgres, - struct pci_ecam_ops **ecam_ops) + const struct pci_ecam_ops **ecam_ops) { #ifdef CONFIG_PCI_QUIRKS u16 segment = root->segment; @@ -191,9 +191,9 @@ static void pci_mcfg_apply_quirks(struct acpi_pci_root *root, static LIST_HEAD(pci_mcfg_list); int pci_mcfg_lookup(struct acpi_pci_root *root, struct resource *cfgres, - struct pci_ecam_ops **ecam_ops) + const struct pci_ecam_ops **ecam_ops) { - struct pci_ecam_ops *ops = &pci_generic_ecam_ops; + const struct pci_ecam_ops *ops = &pci_generic_ecam_ops; struct resource *bus_res = &root->secondary; u16 seg = root->segment; struct mcfg_entry *e; diff --git a/drivers/pci/controller/dwc/pcie-al.c b/drivers/pci/controller/dwc/pcie-al.c index 1eeda2f6371f..270868f3859a 100644 --- a/drivers/pci/controller/dwc/pcie-al.c +++ b/drivers/pci/controller/dwc/pcie-al.c @@ -80,7 +80,7 @@ static int al_pcie_init(struct pci_config_window *cfg) return 0; } -struct pci_ecam_ops al_pcie_ops = { +const struct pci_ecam_ops al_pcie_ops = { .bus_shift = 20, .init = al_pcie_init, .pci_ops = { diff --git a/drivers/pci/controller/dwc/pcie-hisi.c b/drivers/pci/controller/dwc/pcie-hisi.c index 6d9e1b2b8f7b..90017045334d 100644 --- a/drivers/pci/controller/dwc/pcie-hisi.c +++ b/drivers/pci/controller/dwc/pcie-hisi.c @@ -104,7 +104,7 @@ static int hisi_pcie_init(struct 
pci_config_window *cfg) return 0; } -struct pci_ecam_ops hisi_pcie_ops = { +const struct pci_ecam_ops hisi_pcie_ops = { .bus_shift = 20, .init = hisi_pcie_init, .pci_ops = { @@ -362,7 +362,7 @@ static int hisi_pcie_platform_init(struct pci_config_window *cfg) return 0; } -struct pci_ecam_ops hisi_pcie_platform_ops = { +const struct pci_ecam_ops hisi_pcie_platform_ops = { .bus_shift = 20, .init = hisi_pcie_platform_init, .pci_ops = { @@ -375,11 +375,11 @@ struct pci_ecam_ops hisi_pcie_platform_ops = { static const struct of_device_id hisi_pcie_almost_ecam_of_match[] = { { .compatible = "hisilicon,hip06-pcie-ecam", - .data = (void *) &hisi_pcie_platform_ops, + .data = &hisi_pcie_platform_ops, }, { .compatible = "hisilicon,hip07-pcie-ecam", - .data = (void *) &hisi_pcie_platform_ops, + .data = &hisi_pcie_platform_ops, }, {}, }; diff --git a/drivers/pci/controller/pci-host-common.c b/drivers/pci/controller/pci-host-common.c index 250a3fc80ec6..f6d5dc068488 100644 --- a/drivers/pci/controller/pci-host-common.c +++ b/drivers/pci/controller/pci-host-common.c @@ -19,7 +19,7 @@ static void gen_pci_unmap_cfg(void *ptr) } static struct pci_config_window *gen_pci_init(struct device *dev, - struct list_head *resources, struct pci_ecam_ops *ops) + struct list_head *resources, const struct pci_ecam_ops *ops) { int err; struct resource cfgres; @@ -55,7 +55,7 @@ err_out: } int pci_host_common_probe(struct platform_device *pdev, - struct pci_ecam_ops *ops) + const struct pci_ecam_ops *ops) { struct device *dev = &pdev->dev; struct pci_host_bridge *bridge; @@ -82,7 +82,7 @@ int pci_host_common_probe(struct platform_device *pdev, bridge->dev.parent = dev; bridge->sysdata = cfg; bridge->busnr = cfg->busr.start; - bridge->ops = &ops->pci_ops; + bridge->ops = (struct pci_ops *)&ops->pci_ops; bridge->map_irq = of_irq_parse_and_map_pci; bridge->swizzle_irq = pci_common_swizzle; diff --git a/drivers/pci/controller/pci-host-generic.c b/drivers/pci/controller/pci-host-generic.c index 75a2fb930d4b..7e9a7c0833b1 100644 --- a/drivers/pci/controller/pci-host-generic.c +++ b/drivers/pci/controller/pci-host-generic.c @@ -15,7 +15,7 @@ #include #include -static struct pci_ecam_ops gen_pci_cfg_cam_bus_ops = { +static const struct pci_ecam_ops gen_pci_cfg_cam_bus_ops = { .bus_shift = 16, .pci_ops = { .map_bus = pci_ecam_map_bus, @@ -49,7 +49,7 @@ static void __iomem *pci_dw_ecam_map_bus(struct pci_bus *bus, return pci_ecam_map_bus(bus, devfn, where); } -static struct pci_ecam_ops pci_dw_ecam_bus_ops = { +static const struct pci_ecam_ops pci_dw_ecam_bus_ops = { .bus_shift = 20, .pci_ops = { .map_bus = pci_dw_ecam_map_bus, diff --git a/drivers/pci/controller/pci-thunder-ecam.c b/drivers/pci/controller/pci-thunder-ecam.c index 32d1d7b81ef4..c3fdd3e6b21c 100644 --- a/drivers/pci/controller/pci-thunder-ecam.c +++ b/drivers/pci/controller/pci-thunder-ecam.c @@ -345,7 +345,7 @@ static int thunder_ecam_config_write(struct pci_bus *bus, unsigned int devfn, return pci_generic_config_write(bus, devfn, where, size, val); } -struct pci_ecam_ops pci_thunder_ecam_ops = { +const struct pci_ecam_ops pci_thunder_ecam_ops = { .bus_shift = 20, .pci_ops = { .map_bus = pci_ecam_map_bus, diff --git a/drivers/pci/controller/pci-thunder-pem.c b/drivers/pci/controller/pci-thunder-pem.c index 9491e266b1ea..2e792707ceab 100644 --- a/drivers/pci/controller/pci-thunder-pem.c +++ b/drivers/pci/controller/pci-thunder-pem.c @@ -403,7 +403,7 @@ static int thunder_pem_acpi_init(struct pci_config_window *cfg) return thunder_pem_init(dev, cfg, res_pem); } 
-struct pci_ecam_ops thunder_pem_ecam_ops = { +const struct pci_ecam_ops thunder_pem_ecam_ops = { .bus_shift = 24, .init = thunder_pem_acpi_init, .pci_ops = { @@ -440,7 +440,7 @@ static int thunder_pem_platform_init(struct pci_config_window *cfg) return thunder_pem_init(dev, cfg, res_pem); } -static struct pci_ecam_ops pci_thunder_pem_ops = { +static const struct pci_ecam_ops pci_thunder_pem_ops = { .bus_shift = 24, .init = thunder_pem_platform_init, .pci_ops = { diff --git a/drivers/pci/controller/pci-xgene.c b/drivers/pci/controller/pci-xgene.c index de195fd430dc..d1efa8ffbae1 100644 --- a/drivers/pci/controller/pci-xgene.c +++ b/drivers/pci/controller/pci-xgene.c @@ -256,7 +256,7 @@ static int xgene_v1_pcie_ecam_init(struct pci_config_window *cfg) return xgene_pcie_ecam_init(cfg, XGENE_PCIE_IP_VER_1); } -struct pci_ecam_ops xgene_v1_pcie_ecam_ops = { +const struct pci_ecam_ops xgene_v1_pcie_ecam_ops = { .bus_shift = 16, .init = xgene_v1_pcie_ecam_init, .pci_ops = { @@ -271,7 +271,7 @@ static int xgene_v2_pcie_ecam_init(struct pci_config_window *cfg) return xgene_pcie_ecam_init(cfg, XGENE_PCIE_IP_VER_2); } -struct pci_ecam_ops xgene_v2_pcie_ecam_ops = { +const struct pci_ecam_ops xgene_v2_pcie_ecam_ops = { .bus_shift = 16, .init = xgene_v2_pcie_ecam_init, .pci_ops = { diff --git a/drivers/pci/controller/pcie-tango.c b/drivers/pci/controller/pcie-tango.c index 21a208da3f59..3b2b10906fdd 100644 --- a/drivers/pci/controller/pcie-tango.c +++ b/drivers/pci/controller/pcie-tango.c @@ -207,7 +207,7 @@ static int smp8759_config_write(struct pci_bus *bus, unsigned int devfn, return ret; } -static struct pci_ecam_ops smp8759_ecam_ops = { +static const struct pci_ecam_ops smp8759_ecam_ops = { .bus_shift = 20, .pci_ops = { .map_bus = pci_ecam_map_bus, diff --git a/drivers/pci/ecam.c b/drivers/pci/ecam.c index 1a81af0ba961..1b05ff627859 100644 --- a/drivers/pci/ecam.c +++ b/drivers/pci/ecam.c @@ -26,7 +26,7 @@ static const bool per_bus_mapping = !IS_ENABLED(CONFIG_64BIT); */ struct pci_config_window *pci_ecam_create(struct device *dev, struct resource *cfgres, struct resource *busr, - struct pci_ecam_ops *ops) + const struct pci_ecam_ops *ops) { struct pci_config_window *cfg; unsigned int bus_range, bus_range_max, bsz; @@ -145,7 +145,7 @@ void __iomem *pci_ecam_map_bus(struct pci_bus *bus, unsigned int devfn, } /* ECAM ops */ -struct pci_ecam_ops pci_generic_ecam_ops = { +const struct pci_ecam_ops pci_generic_ecam_ops = { .bus_shift = 20, .pci_ops = { .map_bus = pci_ecam_map_bus, @@ -156,7 +156,7 @@ struct pci_ecam_ops pci_generic_ecam_ops = { #if defined(CONFIG_ACPI) && defined(CONFIG_PCI_QUIRKS) /* ECAM ops for 32-bit access only (non-compliant) */ -struct pci_ecam_ops pci_32b_ops = { +const struct pci_ecam_ops pci_32b_ops = { .bus_shift = 20, .pci_ops = { .map_bus = pci_ecam_map_bus, diff --git a/include/linux/pci-acpi.h b/include/linux/pci-acpi.h index 2d155bfb8fbf..81f5535ca1b5 100644 --- a/include/linux/pci-acpi.h +++ b/include/linux/pci-acpi.h @@ -27,7 +27,7 @@ extern phys_addr_t acpi_pci_root_get_mcfg_addr(acpi_handle handle); struct pci_ecam_ops; extern int pci_mcfg_lookup(struct acpi_pci_root *root, struct resource *cfgres, - struct pci_ecam_ops **ecam_ops); + const struct pci_ecam_ops **ecam_ops); static inline acpi_handle acpi_find_root_bridge_handle(struct pci_dev *pdev) { diff --git a/include/linux/pci-ecam.h b/include/linux/pci-ecam.h index a73164c85e78..6c21dd0901ab 100644 --- a/include/linux/pci-ecam.h +++ b/include/linux/pci-ecam.h @@ -29,7 +29,7 @@ struct pci_config_window { struct 
resource res; struct resource busr; void *priv; - struct pci_ecam_ops *ops; + const struct pci_ecam_ops *ops; union { void __iomem *win; /* 64-bit single mapping */ void __iomem **winp; /* 32-bit per-bus mapping */ @@ -40,29 +40,29 @@ struct pci_config_window { /* create and free pci_config_window */ struct pci_config_window *pci_ecam_create(struct device *dev, struct resource *cfgres, struct resource *busr, - struct pci_ecam_ops *ops); + const struct pci_ecam_ops *ops); void pci_ecam_free(struct pci_config_window *cfg); /* map_bus when ->sysdata is an instance of pci_config_window */ void __iomem *pci_ecam_map_bus(struct pci_bus *bus, unsigned int devfn, int where); /* default ECAM ops */ -extern struct pci_ecam_ops pci_generic_ecam_ops; +extern const struct pci_ecam_ops pci_generic_ecam_ops; #if defined(CONFIG_ACPI) && defined(CONFIG_PCI_QUIRKS) -extern struct pci_ecam_ops pci_32b_ops; /* 32-bit accesses only */ -extern struct pci_ecam_ops hisi_pcie_ops; /* HiSilicon */ -extern struct pci_ecam_ops thunder_pem_ecam_ops; /* Cavium ThunderX 1.x & 2.x */ -extern struct pci_ecam_ops pci_thunder_ecam_ops; /* Cavium ThunderX 1.x */ -extern struct pci_ecam_ops xgene_v1_pcie_ecam_ops; /* APM X-Gene PCIe v1 */ -extern struct pci_ecam_ops xgene_v2_pcie_ecam_ops; /* APM X-Gene PCIe v2.x */ -extern struct pci_ecam_ops al_pcie_ops; /* Amazon Annapurna Labs PCIe */ +extern const struct pci_ecam_ops pci_32b_ops; /* 32-bit accesses only */ +extern const struct pci_ecam_ops hisi_pcie_ops; /* HiSilicon */ +extern const struct pci_ecam_ops thunder_pem_ecam_ops; /* Cavium ThunderX 1.x & 2.x */ +extern const struct pci_ecam_ops pci_thunder_ecam_ops; /* Cavium ThunderX 1.x */ +extern const struct pci_ecam_ops xgene_v1_pcie_ecam_ops; /* APM X-Gene PCIe v1 */ +extern const struct pci_ecam_ops xgene_v2_pcie_ecam_ops; /* APM X-Gene PCIe v2.x */ +extern const struct pci_ecam_ops al_pcie_ops; /* Amazon Annapurna Labs PCIe */ #endif #ifdef CONFIG_PCI_HOST_COMMON /* for DT-based PCI controllers that support ECAM */ int pci_host_common_probe(struct platform_device *pdev, - struct pci_ecam_ops *ops); + const struct pci_ecam_ops *ops); int pci_host_common_remove(struct platform_device *pdev); #endif #endif -- cgit v1.2.3 From 0c59c06a7c90390c3985c9acd58a73320781c15e Mon Sep 17 00:00:00 2001 From: Rob Herring Date: Thu, 9 Apr 2020 17:49:22 -0600 Subject: PCI: host-generic: Support building as modules Enable building host-generic and its host-common dependency as a module. Link: https://lore.kernel.org/r/20200409234923.21598-3-robh@kernel.org Signed-off-by: Rob Herring Signed-off-by: Lorenzo Pieralisi Acked-by: Will Deacon Acked-by: Bjorn Helgaas Cc: Lorenzo Pieralisi Cc: Andrew Murray Cc: Bjorn Helgaas Cc: Will Deacon Cc: linux-pci@vger.kernel.org Cc: linux-arm-kernel@lists.infradead.org --- drivers/pci/controller/Kconfig | 4 ++-- drivers/pci/controller/pci-host-common.c | 5 +++++ drivers/pci/controller/pci-host-generic.c | 7 +++++-- drivers/pci/ecam.c | 4 ++++ drivers/pci/setup-bus.c | 1 + include/linux/pci-ecam.h | 2 +- 6 files changed, 18 insertions(+), 5 deletions(-) (limited to 'include/linux') diff --git a/drivers/pci/controller/Kconfig b/drivers/pci/controller/Kconfig index 91bfdb784829..416a53414728 100644 --- a/drivers/pci/controller/Kconfig +++ b/drivers/pci/controller/Kconfig @@ -62,11 +62,11 @@ config PCIE_RCAR Say Y here if you want PCIe controller support on R-Car SoCs. 
config PCI_HOST_COMMON - bool + tristate select PCI_ECAM config PCI_HOST_GENERIC - bool "Generic PCI host controller" + tristate "Generic PCI host controller" depends on OF select PCI_HOST_COMMON select IRQ_DOMAIN diff --git a/drivers/pci/controller/pci-host-common.c b/drivers/pci/controller/pci-host-common.c index f6d5dc068488..6d15bc12e726 100644 --- a/drivers/pci/controller/pci-host-common.c +++ b/drivers/pci/controller/pci-host-common.c @@ -8,6 +8,7 @@ */ #include +#include #include #include #include @@ -95,6 +96,7 @@ int pci_host_common_probe(struct platform_device *pdev, platform_set_drvdata(pdev, bridge->bus); return 0; } +EXPORT_SYMBOL_GPL(pci_host_common_probe); int pci_host_common_remove(struct platform_device *pdev) { @@ -107,3 +109,6 @@ int pci_host_common_remove(struct platform_device *pdev) return 0; } +EXPORT_SYMBOL_GPL(pci_host_common_remove); + +MODULE_LICENSE("GPL v2"); diff --git a/drivers/pci/controller/pci-host-generic.c b/drivers/pci/controller/pci-host-generic.c index 7e9a7c0833b1..fd8cff61de14 100644 --- a/drivers/pci/controller/pci-host-generic.c +++ b/drivers/pci/controller/pci-host-generic.c @@ -10,6 +10,7 @@ #include #include +#include #include #include #include @@ -76,6 +77,7 @@ static const struct of_device_id gen_pci_of_match[] = { { }, }; +MODULE_DEVICE_TABLE(of, gen_pci_of_match); static int gen_pci_probe(struct platform_device *pdev) { @@ -92,9 +94,10 @@ static struct platform_driver gen_pci_driver = { .driver = { .name = "pci-host-generic", .of_match_table = gen_pci_of_match, - .suppress_bind_attrs = true, }, .probe = gen_pci_probe, .remove = pci_host_common_remove, }; -builtin_platform_driver(gen_pci_driver); +module_platform_driver(gen_pci_driver); + +MODULE_LICENSE("GPL v2"); diff --git a/drivers/pci/ecam.c b/drivers/pci/ecam.c index 1b05ff627859..8f065a42fc1a 100644 --- a/drivers/pci/ecam.c +++ b/drivers/pci/ecam.c @@ -101,6 +101,7 @@ err_exit: pci_ecam_free(cfg); return ERR_PTR(err); } +EXPORT_SYMBOL_GPL(pci_ecam_create); void pci_ecam_free(struct pci_config_window *cfg) { @@ -121,6 +122,7 @@ void pci_ecam_free(struct pci_config_window *cfg) release_resource(&cfg->res); kfree(cfg); } +EXPORT_SYMBOL_GPL(pci_ecam_free); /* * Function to implement the pci_ops ->map_bus method @@ -143,6 +145,7 @@ void __iomem *pci_ecam_map_bus(struct pci_bus *bus, unsigned int devfn, base = cfg->win + (busn << cfg->ops->bus_shift); return base + (devfn << devfn_shift) + where; } +EXPORT_SYMBOL_GPL(pci_ecam_map_bus); /* ECAM ops */ const struct pci_ecam_ops pci_generic_ecam_ops = { @@ -153,6 +156,7 @@ const struct pci_ecam_ops pci_generic_ecam_ops = { .write = pci_generic_config_write, } }; +EXPORT_SYMBOL_GPL(pci_generic_ecam_ops); #if defined(CONFIG_ACPI) && defined(CONFIG_PCI_QUIRKS) /* ECAM ops for 32-bit access only (non-compliant) */ diff --git a/drivers/pci/setup-bus.c b/drivers/pci/setup-bus.c index bbcef1a053ab..5b35f7fc2ace 100644 --- a/drivers/pci/setup-bus.c +++ b/drivers/pci/setup-bus.c @@ -26,6 +26,7 @@ #include "pci.h" unsigned int pci_flags; +EXPORT_SYMBOL_GPL(pci_flags); struct pci_dev_resource { struct list_head list; diff --git a/include/linux/pci-ecam.h b/include/linux/pci-ecam.h index 6c21dd0901ab..fd0edb8b8a00 100644 --- a/include/linux/pci-ecam.h +++ b/include/linux/pci-ecam.h @@ -59,7 +59,7 @@ extern const struct pci_ecam_ops xgene_v2_pcie_ecam_ops; /* APM X-Gene PCIe v2.x extern const struct pci_ecam_ops al_pcie_ops; /* Amazon Annapurna Labs PCIe */ #endif -#ifdef CONFIG_PCI_HOST_COMMON +#if IS_ENABLED(CONFIG_PCI_HOST_COMMON) /* for DT-based PCI 
controllers that support ECAM */ int pci_host_common_probe(struct platform_device *pdev, const struct pci_ecam_ops *ops); -- cgit v1.2.3 From d46edd671a147032e22cfeb271a5734703093649 Mon Sep 17 00:00:00 2001 From: Song Liu Date: Thu, 30 Apr 2020 00:15:04 -0700 Subject: bpf: Sharing bpf runtime stats with BPF_ENABLE_STATS Currently, sysctl kernel.bpf_stats_enabled controls BPF runtime stats. Typical userspace tools use kernel.bpf_stats_enabled as follows: 1. Enable kernel.bpf_stats_enabled; 2. Check program run_time_ns; 3. Sleep for the monitoring period; 4. Check program run_time_ns again, calculate the difference; 5. Disable kernel.bpf_stats_enabled. The problem with this approach is that only one userspace tool can toggle this sysctl. If multiple tools toggle the sysctl at the same time, the measurement may be inaccurate. To fix this problem while keeping backward compatibility, introduce a new bpf command BPF_ENABLE_STATS. On success, this command enables stats and returns a valid fd. BPF_ENABLE_STATS takes an argument "type". Currently, only one type, BPF_STATS_RUN_TIME, is supported. We can extend the command to support other types of stats in the future. With BPF_ENABLE_STATS, a user space tool would have the following flow: 1. Get a fd with BPF_ENABLE_STATS, and make sure it is valid; 2. Check program run_time_ns; 3. Sleep for the monitoring period; 4. Check program run_time_ns again, calculate the difference; 5. Close the fd. Signed-off-by: Song Liu Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/20200430071506.1408910-2-songliubraving@fb.com --- include/linux/bpf.h | 1 + include/uapi/linux/bpf.h | 11 ++++++++ kernel/bpf/syscall.c | 57 ++++++++++++++++++++++++++++++++++++++++++ kernel/sysctl.c | 36 +++++++++++++++++++++++++- tools/include/uapi/linux/bpf.h | 11 ++++++++ 5 files changed, 115 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index c07b1d2f3824..1262ec460ab3 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -987,6 +987,7 @@ _out: \ #ifdef CONFIG_BPF_SYSCALL DECLARE_PER_CPU(int, bpf_prog_active); +extern struct mutex bpf_stats_enabled_mutex; /* * Block execution of BPF programs attached to instrumentation (perf, diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 0eccafae55bb..705e4822f997 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -115,6 +115,7 @@ enum bpf_cmd { BPF_LINK_UPDATE, BPF_LINK_GET_FD_BY_ID, BPF_LINK_GET_NEXT_ID, + BPF_ENABLE_STATS, }; enum bpf_map_type { @@ -390,6 +391,12 @@ enum { */ #define BPF_F_QUERY_EFFECTIVE (1U << 0) +/* type for BPF_ENABLE_STATS */ +enum bpf_stats_type { + /* enabled run_time_ns and run_cnt */ + BPF_STATS_RUN_TIME = 0, +}; + enum bpf_stack_build_id_status { /* user space need an empty entry to identify end of a trace */ BPF_STACK_BUILD_ID_EMPTY = 0, @@ -601,6 +608,10 @@ union bpf_attr { __u32 old_prog_fd; } link_update; + struct { /* struct used by BPF_ENABLE_STATS command */ + __u32 type; + } enable_stats; + } __attribute__((aligned(8))); /* The description below is an attempt at providing documentation to eBPF diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index c75b2dd2459c..4f34eecec9ce 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -3872,6 +3872,60 @@ static int bpf_link_get_fd_by_id(const union bpf_attr *attr) return fd; } +DEFINE_MUTEX(bpf_stats_enabled_mutex); + +static int bpf_stats_release(struct inode *inode, struct file *file) +{ + 
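+	/* Each "bpf-stats" fd holds one count on the static key; releasing
+	 * the fd drops that count, so stats switch back off once the last
+	 * holder closes. bpf_stats_enabled_mutex serializes this against
+	 * the sysctl handler. */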
mutex_lock(&bpf_stats_enabled_mutex); + static_key_slow_dec(&bpf_stats_enabled_key.key); + mutex_unlock(&bpf_stats_enabled_mutex); + return 0; +} + +static const struct file_operations bpf_stats_fops = { + .release = bpf_stats_release, +}; + +static int bpf_enable_runtime_stats(void) +{ + int fd; + + mutex_lock(&bpf_stats_enabled_mutex); + + /* Set a very high limit to avoid overflow */ + if (static_key_count(&bpf_stats_enabled_key.key) > INT_MAX / 2) { + mutex_unlock(&bpf_stats_enabled_mutex); + return -EBUSY; + } + + fd = anon_inode_getfd("bpf-stats", &bpf_stats_fops, NULL, O_CLOEXEC); + if (fd >= 0) + static_key_slow_inc(&bpf_stats_enabled_key.key); + + mutex_unlock(&bpf_stats_enabled_mutex); + return fd; +} + +#define BPF_ENABLE_STATS_LAST_FIELD enable_stats.type + +static int bpf_enable_stats(union bpf_attr *attr) +{ + + if (CHECK_ATTR(BPF_ENABLE_STATS)) + return -EINVAL; + + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + + switch (attr->enable_stats.type) { + case BPF_STATS_RUN_TIME: + return bpf_enable_runtime_stats(); + default: + break; + } + return -EINVAL; +} + SYSCALL_DEFINE3(bpf, int, cmd, union bpf_attr __user *, uattr, unsigned int, size) { union bpf_attr attr; @@ -3996,6 +4050,9 @@ SYSCALL_DEFINE3(bpf, int, cmd, union bpf_attr __user *, uattr, unsigned int, siz err = bpf_obj_get_next_id(&attr, uattr, &link_idr, &link_idr_lock); break; + case BPF_ENABLE_STATS: + err = bpf_enable_stats(&attr); + break; default: err = -EINVAL; break; diff --git a/kernel/sysctl.c b/kernel/sysctl.c index e961286d0e14..7adfe5dbce9d 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -201,6 +201,40 @@ static int max_extfrag_threshold = 1000; #endif /* CONFIG_SYSCTL */ +#ifdef CONFIG_BPF_SYSCALL +static int bpf_stats_handler(struct ctl_table *table, int write, + void __user *buffer, size_t *lenp, + loff_t *ppos) +{ + struct static_key *key = (struct static_key *)table->data; + static int saved_val; + int val, ret; + struct ctl_table tmp = { + .data = &val, + .maxlen = sizeof(val), + .mode = table->mode, + .extra1 = SYSCTL_ZERO, + .extra2 = SYSCTL_ONE, + }; + + if (write && !capable(CAP_SYS_ADMIN)) + return -EPERM; + + mutex_lock(&bpf_stats_enabled_mutex); + val = saved_val; + ret = proc_dointvec_minmax(&tmp, write, buffer, lenp, ppos); + if (write && !ret && val != saved_val) { + if (val) + static_key_slow_inc(key); + else + static_key_slow_dec(key); + saved_val = val; + } + mutex_unlock(&bpf_stats_enabled_mutex); + return ret; +} +#endif + /* * /proc/sys support */ @@ -2549,7 +2583,7 @@ static struct ctl_table kern_table[] = { .data = &bpf_stats_enabled_key.key, .maxlen = sizeof(bpf_stats_enabled_key), .mode = 0644, - .proc_handler = proc_do_static_key, + .proc_handler = bpf_stats_handler, }, #endif #if defined(CONFIG_TREE_RCU) diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h index 0eccafae55bb..705e4822f997 100644 --- a/tools/include/uapi/linux/bpf.h +++ b/tools/include/uapi/linux/bpf.h @@ -115,6 +115,7 @@ enum bpf_cmd { BPF_LINK_UPDATE, BPF_LINK_GET_FD_BY_ID, BPF_LINK_GET_NEXT_ID, + BPF_ENABLE_STATS, }; enum bpf_map_type { @@ -390,6 +391,12 @@ enum { */ #define BPF_F_QUERY_EFFECTIVE (1U << 0) +/* type for BPF_ENABLE_STATS */ +enum bpf_stats_type { + /* enabled run_time_ns and run_cnt */ + BPF_STATS_RUN_TIME = 0, +}; + enum bpf_stack_build_id_status { /* user space need an empty entry to identify end of a trace */ BPF_STACK_BUILD_ID_EMPTY = 0, @@ -601,6 +608,10 @@ union bpf_attr { __u32 old_prog_fd; } link_update; + struct { /* struct used by BPF_ENABLE_STATS 
command */ + __u32 type; + } enable_stats; + } __attribute__((aligned(8))); /* The description below is an attempt at providing documentation to eBPF -- cgit v1.2.3 From cff9f12b18915d957a2130885a00f8ab15cff7e4 Mon Sep 17 00:00:00 2001 From: Maor Gottlieb Date: Thu, 30 Apr 2020 22:21:31 +0300 Subject: net/core: Introduce netdev_get_xmit_slave Add a new ndo to get the xmit slave of a master device. The reference counters are not incremented, so the caller must be careful with locks. The user can ask to get the xmit slave on the assumption that all the slaves can transmit by setting the all_slaves arg to true. Signed-off-by: Maor Gottlieb Reviewed-by: Jiri Pirko Reviewed-by: David Ahern Acked-by: David S. Miller Signed-off-by: Saeed Mahameed --- include/linux/netdevice.h | 12 ++++++++++++ net/core/dev.c | 22 ++++++++++++++++++++++ 2 files changed, 34 insertions(+) (limited to 'include/linux') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 130a668049ab..26bc0f11b7ad 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -1146,6 +1146,12 @@ struct netdev_net_notifier { * int (*ndo_del_slave)(struct net_device *dev, struct net_device *slave_dev); * Called to release previously enslaved netdev. * + * struct net_device *(*ndo_get_xmit_slave)(struct net_device *dev, + * struct sk_buff *skb, + * bool all_slaves); + * Get the xmit slave of the master device. If all_slaves is true, the + * function assumes all the slaves can transmit. + * * Feature/offload setting functions. * netdev_features_t (*ndo_fix_features)(struct net_device *dev, * netdev_features_t features); @@ -1389,6 +1395,9 @@ struct net_device_ops { struct netlink_ext_ack *extack); int (*ndo_del_slave)(struct net_device *dev, struct net_device *slave_dev); + struct net_device* (*ndo_get_xmit_slave)(struct net_device *dev, + struct sk_buff *skb, + bool all_slaves); netdev_features_t (*ndo_fix_features)(struct net_device *dev, netdev_features_t features); int (*ndo_set_features)(struct net_device *dev, @@ -2731,6 +2740,9 @@ void netdev_freemem(struct net_device *dev); void synchronize_net(void); int init_dummy_netdev(struct net_device *dev); +struct net_device *netdev_get_xmit_slave(struct net_device *dev, + struct sk_buff *skb, + bool all_slaves); struct net_device *dev_get_by_index(struct net *net, int ifindex); struct net_device *__dev_get_by_index(struct net *net, int ifindex); struct net_device *dev_get_by_index_rcu(struct net *net, int ifindex); diff --git a/net/core/dev.c b/net/core/dev.c index 9c9e763bfe0e..e6c10980abfd 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -7785,6 +7785,28 @@ void netdev_bonding_info_change(struct net_device *dev, } EXPORT_SYMBOL(netdev_bonding_info_change); +/** + * netdev_get_xmit_slave - Get the xmit slave of a master device + * @dev: master device + * @skb: The packet + * @all_slaves: assume all the slaves are active + * + * The reference counters are not incremented so the caller must be + * careful with locks. The caller must hold RCU lock. + * %NULL is returned if no slave is found. 
+ */ + +struct net_device *netdev_get_xmit_slave(struct net_device *dev, + struct sk_buff *skb, + bool all_slaves) +{ + const struct net_device_ops *ops = dev->netdev_ops; + + if (!ops->ndo_get_xmit_slave) + return NULL; + return ops->ndo_get_xmit_slave(dev, skb, all_slaves); +} +EXPORT_SYMBOL(netdev_get_xmit_slave); + static void netdev_adjacent_add_links(struct net_device *dev) { struct netdev_adjacent *iter; -- cgit v1.2.3 From c6bc6041b10f70b617f2d13894311fe62027d292 Mon Sep 17 00:00:00 2001 From: Maor Gottlieb Date: Thu, 30 Apr 2020 22:21:41 +0300 Subject: net/mlx5: Add support to get lag physical port Add a function to get the device physical port of the LAG slave. Signed-off-by: Maor Gottlieb Reviewed-by: Leon Romanovsky Acked-by: David S. Miller Signed-off-by: Saeed Mahameed --- drivers/net/ethernet/mellanox/mlx5/core/lag.c | 24 ++++++++++++++++++++++ include/linux/mlx5/driver.h | 2 ++ 2 files changed, 26 insertions(+) (limited to 'include/linux') diff --git a/drivers/net/ethernet/mellanox/mlx5/core/lag.c b/drivers/net/ethernet/mellanox/mlx5/core/lag.c index b17b80bcd045..874c70e8cc54 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/lag.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/lag.c @@ -687,6 +687,30 @@ unlock: } EXPORT_SYMBOL(mlx5_lag_get_roce_netdev); +u8 mlx5_lag_get_slave_port(struct mlx5_core_dev *dev, + struct net_device *slave) +{ + struct mlx5_lag *ldev; + u8 port = 0; + + spin_lock(&lag_lock); + ldev = mlx5_lag_dev_get(dev); + if (!(ldev && __mlx5_lag_is_roce(ldev))) + goto unlock; + + if (ldev->pf[MLX5_LAG_P1].netdev == slave) + port = MLX5_LAG_P1; + else + port = MLX5_LAG_P2; + + port = ldev->v2p_map[port]; + +unlock: + spin_unlock(&lag_lock); + return port; +} +EXPORT_SYMBOL(mlx5_lag_get_slave_port); + bool mlx5_lag_intf_add(struct mlx5_interface *intf, struct mlx5_priv *priv) { struct mlx5_core_dev *dev = container_of(priv, struct mlx5_core_dev, diff --git a/include/linux/mlx5/driver.h b/include/linux/mlx5/driver.h index d82dbbab8179..267dfcc5493e 100644 --- a/include/linux/mlx5/driver.h +++ b/include/linux/mlx5/driver.h @@ -1074,6 +1074,8 @@ bool mlx5_lag_is_sriov(struct mlx5_core_dev *dev); bool mlx5_lag_is_multipath(struct mlx5_core_dev *dev); bool mlx5_lag_is_active(struct mlx5_core_dev *dev); struct net_device *mlx5_lag_get_roce_netdev(struct mlx5_core_dev *dev); +u8 mlx5_lag_get_slave_port(struct mlx5_core_dev *dev, + struct net_device *slave); int mlx5_lag_query_cong_counters(struct mlx5_core_dev *dev, u64 *values, int num_counters, -- cgit v1.2.3 From cfc1a89e449c02207952c72a4c0394691fdedf43 Mon Sep 17 00:00:00 2001 From: Maor Gottlieb Date: Thu, 30 Apr 2020 22:21:46 +0300 Subject: RDMA/mlx5: Set lag tx affinity according to slave The patch sets the lag tx affinity of the data QPs and the GSI QPs according to the LAG xmit slave. For GSI QPs, in case the link layer is Ethernet (RoCE), we create two GSI QPs, one for each physical port. When the driver selects the GSI QP, it will consider the port affinity result. For connected QPs, the driver sets the affinity of the xmit slave. The above ensures that an RC QP and its corresponding GSI QP will transmit from the same physical port. 
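As a rough sketch of how the helpers introduced by the two patches above compose (the wrapper pick_lag_tx_port() below is hypothetical; netdev_get_xmit_slave() and mlx5_lag_get_slave_port() are the exported helpers from the diffs above):

#include <linux/netdevice.h>
#include <linux/mlx5/driver.h>

/* Resolve the bond's xmit slave for this packet, then map that slave to
 * its physical LAG port (1-based; 0 when it cannot be resolved). No
 * reference is taken on the slave, so the whole lookup must stay under
 * rcu_read_lock(). */
static u8 pick_lag_tx_port(struct mlx5_core_dev *mdev,
			   struct net_device *bond_dev,
			   struct sk_buff *skb)
{
	struct net_device *slave;
	u8 port = 0;

	rcu_read_lock();
	slave = netdev_get_xmit_slave(bond_dev, skb, true);
	if (slave)
		port = mlx5_lag_get_slave_port(mdev, slave);
	rcu_read_unlock();

	return port;
}

Passing all_slaves as true mirrors the RDMA_LAG_FLAGS_HASH_ALL_SLAVES behaviour that the mlx5 main.c hunk below opts into.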
Link: https://lore.kernel.org/r/20200430192146.12863-17-maorg@mellanox.com Signed-off-by: Maor Gottlieb Reviewed-by: Leon Romanovsky Signed-off-by: Jason Gunthorpe --- drivers/infiniband/hw/mlx5/ah.c | 9 +++++-- drivers/infiniband/hw/mlx5/gsi.c | 33 ++++++++++++++++++----- drivers/infiniband/hw/mlx5/main.c | 2 ++ drivers/infiniband/hw/mlx5/mlx5_ib.h | 1 + drivers/infiniband/hw/mlx5/qp.c | 52 ++++++++++++++++++++++++------------ include/linux/mlx5/mlx5_ifc.h | 4 ++- include/linux/mlx5/qp.h | 2 ++ 7 files changed, 76 insertions(+), 27 deletions(-) (limited to 'include/linux') diff --git a/drivers/infiniband/hw/mlx5/ah.c b/drivers/infiniband/hw/mlx5/ah.c index 9b59348d51b5..cc858f658567 100644 --- a/drivers/infiniband/hw/mlx5/ah.c +++ b/drivers/infiniband/hw/mlx5/ah.c @@ -33,8 +33,9 @@ #include "mlx5_ib.h" static void create_ib_ah(struct mlx5_ib_dev *dev, struct mlx5_ib_ah *ah, - struct rdma_ah_attr *ah_attr) + struct rdma_ah_init_attr *init_attr) { + struct rdma_ah_attr *ah_attr = init_attr->ah_attr; enum ib_gid_type gid_type; if (rdma_ah_get_ah_flags(ah_attr) & IB_AH_GRH) { @@ -51,6 +52,10 @@ static void create_ib_ah(struct mlx5_ib_dev *dev, struct mlx5_ib_ah *ah, ah->av.stat_rate_sl = (rdma_ah_get_static_rate(ah_attr) << 4); if (ah_attr->type == RDMA_AH_ATTR_TYPE_ROCE) { + if (init_attr->xmit_slave) + ah->xmit_port = + mlx5_lag_get_slave_port(dev->mdev, + init_attr->xmit_slave); gid_type = ah_attr->grh.sgid_attr->gid_type; memcpy(ah->av.rmac, ah_attr->roce.dmac, @@ -98,7 +103,7 @@ int mlx5_ib_create_ah(struct ib_ah *ibah, struct rdma_ah_init_attr *init_attr, return err; } - create_ib_ah(dev, ah, ah_attr); + create_ib_ah(dev, ah, init_attr); return 0; } diff --git a/drivers/infiniband/hw/mlx5/gsi.c b/drivers/infiniband/hw/mlx5/gsi.c index 1afbf03d1a98..40d418153891 100644 --- a/drivers/infiniband/hw/mlx5/gsi.c +++ b/drivers/infiniband/hw/mlx5/gsi.c @@ -119,10 +119,17 @@ struct ib_qp *mlx5_ib_gsi_create_qp(struct ib_pd *pd, struct mlx5_ib_gsi_qp *gsi; struct ib_qp_init_attr hw_init_attr = *init_attr; const u8 port_num = init_attr->port_num; - const int num_pkeys = pd->device->attrs.max_pkeys; - const int num_qps = mlx5_ib_deth_sqpn_cap(dev) ? 
num_pkeys : 0; + int num_qps = 0; int ret; + if (mlx5_ib_deth_sqpn_cap(dev)) { + if (MLX5_CAP_GEN(dev->mdev, + port_type) == MLX5_CAP_PORT_TYPE_IB) + num_qps = pd->device->attrs.max_pkeys; + else if (dev->lag_active) + num_qps = MLX5_MAX_PORTS; + } + gsi = kzalloc(sizeof(*gsi), GFP_KERNEL); if (!gsi) return ERR_PTR(-ENOMEM); @@ -261,7 +268,7 @@ static struct ib_qp *create_gsi_ud_qp(struct mlx5_ib_gsi_qp *gsi) } static int modify_to_rts(struct mlx5_ib_gsi_qp *gsi, struct ib_qp *qp, - u16 qp_index) + u16 pkey_index) { struct mlx5_ib_dev *dev = to_mdev(qp->device); struct ib_qp_attr attr; @@ -270,7 +277,7 @@ static int modify_to_rts(struct mlx5_ib_gsi_qp *gsi, struct ib_qp *qp, mask = IB_QP_STATE | IB_QP_PKEY_INDEX | IB_QP_QKEY | IB_QP_PORT; attr.qp_state = IB_QPS_INIT; - attr.pkey_index = qp_index; + attr.pkey_index = pkey_index; attr.qkey = IB_QP1_QKEY; attr.port_num = gsi->port_num; ret = ib_modify_qp(qp, &attr, mask); @@ -304,12 +311,17 @@ static void setup_qp(struct mlx5_ib_gsi_qp *gsi, u16 qp_index) { struct ib_device *device = gsi->rx_qp->device; struct mlx5_ib_dev *dev = to_mdev(device); + int pkey_index = qp_index; + struct mlx5_ib_qp *mqp; struct ib_qp *qp; unsigned long flags; u16 pkey; int ret; - ret = ib_query_pkey(device, gsi->port_num, qp_index, &pkey); + if (MLX5_CAP_GEN(dev->mdev, port_type) != MLX5_CAP_PORT_TYPE_IB) + pkey_index = 0; + + ret = ib_query_pkey(device, gsi->port_num, pkey_index, &pkey); if (ret) { mlx5_ib_warn(dev, "unable to read P_Key at port %d, index %d\n", gsi->port_num, qp_index); @@ -338,7 +350,10 @@ static void setup_qp(struct mlx5_ib_gsi_qp *gsi, u16 qp_index) return; } - ret = modify_to_rts(gsi, qp, qp_index); + mqp = to_mqp(qp); + if (dev->lag_active) + mqp->gsi_lag_port = qp_index + 1; + ret = modify_to_rts(gsi, qp, pkey_index); if (ret) goto err_destroy_qp; @@ -457,11 +472,15 @@ static int mlx5_ib_gsi_silent_drop(struct mlx5_ib_gsi_qp *gsi, static struct ib_qp *get_tx_qp(struct mlx5_ib_gsi_qp *gsi, struct ib_ud_wr *wr) { struct mlx5_ib_dev *dev = to_mdev(gsi->rx_qp->device); + struct mlx5_ib_ah *ah = to_mah(wr->ah); int qp_index = wr->pkey_index; - if (!mlx5_ib_deth_sqpn_cap(dev)) + if (!gsi->num_qps) return gsi->rx_qp; + if (dev->lag_active && ah->xmit_port) + qp_index = ah->xmit_port - 1; + if (qp_index >= gsi->num_qps) return NULL; diff --git a/drivers/infiniband/hw/mlx5/main.c b/drivers/infiniband/hw/mlx5/main.c index 80ae8f04bfd5..e7fb290c9d8d 100644 --- a/drivers/infiniband/hw/mlx5/main.c +++ b/drivers/infiniband/hw/mlx5/main.c @@ -53,6 +53,7 @@ #include #include #include +#include #include #include #include "mlx5_ib.h" @@ -6567,6 +6568,7 @@ static int mlx5_ib_stage_init_init(struct mlx5_ib_dev *dev) dev->ib_dev.phys_port_cnt = dev->num_ports; dev->ib_dev.num_comp_vectors = mlx5_comp_vectors_count(mdev); dev->ib_dev.dev.parent = mdev->device; + dev->ib_dev.lag_flags = RDMA_LAG_FLAGS_HASH_ALL_SLAVES; mutex_init(&dev->cap_mask_mutex); INIT_LIST_HEAD(&dev->qp_list); diff --git a/drivers/infiniband/hw/mlx5/mlx5_ib.h b/drivers/infiniband/hw/mlx5/mlx5_ib.h index 7dffc87601eb..f250753319d0 100644 --- a/drivers/infiniband/hw/mlx5/mlx5_ib.h +++ b/drivers/infiniband/hw/mlx5/mlx5_ib.h @@ -461,6 +461,7 @@ struct mlx5_ib_qp { * but not take effective */ u32 counter_pending; + u16 gsi_lag_port; }; struct mlx5_ib_cq_buf { diff --git a/drivers/infiniband/hw/mlx5/qp.c b/drivers/infiniband/hw/mlx5/qp.c index 14bfdfc8ab96..810bbd52daec 100644 --- a/drivers/infiniband/hw/mlx5/qp.c +++ b/drivers/infiniband/hw/mlx5/qp.c @@ -3218,10 +3218,12 @@ static enum 
mlx5_qp_optpar opt_mask[MLX5_QP_NUM_STATE][MLX5_QP_NUM_STATE][MLX5_Q MLX5_QP_OPTPAR_RAE | MLX5_QP_OPTPAR_RWE | MLX5_QP_OPTPAR_PKEY_INDEX | - MLX5_QP_OPTPAR_PRI_PORT, + MLX5_QP_OPTPAR_PRI_PORT | + MLX5_QP_OPTPAR_LAG_TX_AFF, [MLX5_QP_ST_UC] = MLX5_QP_OPTPAR_RWE | MLX5_QP_OPTPAR_PKEY_INDEX | - MLX5_QP_OPTPAR_PRI_PORT, + MLX5_QP_OPTPAR_PRI_PORT | + MLX5_QP_OPTPAR_LAG_TX_AFF, [MLX5_QP_ST_UD] = MLX5_QP_OPTPAR_PKEY_INDEX | MLX5_QP_OPTPAR_Q_KEY | MLX5_QP_OPTPAR_PRI_PORT, @@ -3229,17 +3231,20 @@ static enum mlx5_qp_optpar opt_mask[MLX5_QP_NUM_STATE][MLX5_QP_NUM_STATE][MLX5_Q MLX5_QP_OPTPAR_RAE | MLX5_QP_OPTPAR_RWE | MLX5_QP_OPTPAR_PKEY_INDEX | - MLX5_QP_OPTPAR_PRI_PORT, + MLX5_QP_OPTPAR_PRI_PORT | + MLX5_QP_OPTPAR_LAG_TX_AFF, }, [MLX5_QP_STATE_RTR] = { [MLX5_QP_ST_RC] = MLX5_QP_OPTPAR_ALT_ADDR_PATH | MLX5_QP_OPTPAR_RRE | MLX5_QP_OPTPAR_RAE | MLX5_QP_OPTPAR_RWE | - MLX5_QP_OPTPAR_PKEY_INDEX, + MLX5_QP_OPTPAR_PKEY_INDEX | + MLX5_QP_OPTPAR_LAG_TX_AFF, [MLX5_QP_ST_UC] = MLX5_QP_OPTPAR_ALT_ADDR_PATH | MLX5_QP_OPTPAR_RWE | - MLX5_QP_OPTPAR_PKEY_INDEX, + MLX5_QP_OPTPAR_PKEY_INDEX | + MLX5_QP_OPTPAR_LAG_TX_AFF, [MLX5_QP_ST_UD] = MLX5_QP_OPTPAR_PKEY_INDEX | MLX5_QP_OPTPAR_Q_KEY, [MLX5_QP_ST_MLX] = MLX5_QP_OPTPAR_PKEY_INDEX | @@ -3248,7 +3253,8 @@ static enum mlx5_qp_optpar opt_mask[MLX5_QP_NUM_STATE][MLX5_QP_NUM_STATE][MLX5_Q MLX5_QP_OPTPAR_RRE | MLX5_QP_OPTPAR_RAE | MLX5_QP_OPTPAR_RWE | - MLX5_QP_OPTPAR_PKEY_INDEX, + MLX5_QP_OPTPAR_PKEY_INDEX | + MLX5_QP_OPTPAR_LAG_TX_AFF, }, }, [MLX5_QP_STATE_RTR] = { @@ -3601,11 +3607,8 @@ static unsigned int get_tx_affinity_rr(struct mlx5_ib_dev *dev, static bool qp_supports_affinity(struct ib_qp *qp) { - struct mlx5_ib_qp *mqp = to_mqp(qp); - if ((qp->qp_type == IB_QPT_RC) || - (qp->qp_type == IB_QPT_UD && - !(mqp->flags & MLX5_IB_QP_CREATE_SQPN_QP1)) || + (qp->qp_type == IB_QPT_UD) || (qp->qp_type == IB_QPT_UC) || (qp->qp_type == IB_QPT_RAW_PACKET) || (qp->qp_type == IB_QPT_XRC_INI) || @@ -3614,7 +3617,9 @@ static bool qp_supports_affinity(struct ib_qp *qp) return false; } -static unsigned int get_tx_affinity(struct ib_qp *qp, u8 init, +static unsigned int get_tx_affinity(struct ib_qp *qp, + const struct ib_qp_attr *attr, + int attr_mask, u8 init, struct ib_udata *udata) { struct mlx5_ib_ucontext *ucontext = rdma_udata_to_drv_context( @@ -3624,10 +3629,18 @@ static unsigned int get_tx_affinity(struct ib_qp *qp, u8 init, struct mlx5_ib_qp_base *qp_base; unsigned int tx_affinity; - if (!(dev->lag_active && init && qp_supports_affinity(qp))) + if (!(dev->lag_active && qp_supports_affinity(qp))) return 0; - tx_affinity = get_tx_affinity_rr(dev, udata); + if (mqp->flags & MLX5_IB_QP_CREATE_SQPN_QP1) + tx_affinity = mqp->gsi_lag_port; + else if (init) + tx_affinity = get_tx_affinity_rr(dev, udata); + else if ((attr_mask & IB_QP_AV) && attr->xmit_slave) + tx_affinity = + mlx5_lag_get_slave_port(dev->mdev, attr->xmit_slave); + else + return 0; qp_base = &mqp->trans_qp.base; if (ucontext) @@ -3712,7 +3725,7 @@ static int __mlx5_ib_modify_qp(struct ib_qp *ibqp, struct mlx5_qp_context *context; struct mlx5_ib_pd *pd; enum mlx5_qp_state mlx5_cur, mlx5_new; - enum mlx5_qp_optpar optpar; + enum mlx5_qp_optpar optpar = 0; u32 set_id = 0; int mlx5_st; int err; @@ -3746,10 +3759,15 @@ static int __mlx5_ib_modify_qp(struct ib_qp *ibqp, } } - tx_affinity = get_tx_affinity(ibqp, + tx_affinity = get_tx_affinity(ibqp, attr, attr_mask, cur_state == IB_QPS_RESET && new_state == IB_QPS_INIT, udata); - context->flags |= cpu_to_be32(tx_affinity << 24); + if (tx_affinity) { + context->flags |= 
cpu_to_be32(tx_affinity << 24); + if (new_state == IB_QPS_RTR && + MLX5_CAP_GEN(dev->mdev, init2_lag_tx_port_affinity)) + optpar |= MLX5_QP_OPTPAR_LAG_TX_AFF; + } if (is_sqp(ibqp->qp_type)) { context->mtu_msgmax = (IB_MTU_256 << 5) | 8; @@ -3886,7 +3904,7 @@ static int __mlx5_ib_modify_qp(struct ib_qp *ibqp, } op = optab[mlx5_cur][mlx5_new]; - optpar = ib_mask_to_mlx5_opt(attr_mask); + optpar |= ib_mask_to_mlx5_opt(attr_mask); optpar &= opt_mask[mlx5_cur][mlx5_new][mlx5_st]; if (qp->ibqp.qp_type == IB_QPT_RAW_PACKET || diff --git a/include/linux/mlx5/mlx5_ifc.h b/include/linux/mlx5/mlx5_ifc.h index fb243848132d..c1ba89198335 100644 --- a/include/linux/mlx5/mlx5_ifc.h +++ b/include/linux/mlx5/mlx5_ifc.h @@ -1321,7 +1321,9 @@ struct mlx5_ifc_cmd_hca_cap_bits { u8 stat_rate_support[0x10]; u8 reserved_at_1f0[0x1]; u8 pci_sync_for_fw_update_event[0x1]; - u8 reserved_at_1f2[0xa]; + u8 reserved_at_1f2[0x6]; + u8 init2_lag_tx_port_affinity[0x1]; + u8 reserved_at_1fa[0x3]; u8 cqe_version[0x4]; u8 compact_address_vector[0x1]; diff --git a/include/linux/mlx5/qp.h b/include/linux/mlx5/qp.h index f23eb18526fe..b9facdb9b9bd 100644 --- a/include/linux/mlx5/qp.h +++ b/include/linux/mlx5/qp.h @@ -66,6 +66,7 @@ enum mlx5_qp_optpar { MLX5_QP_OPTPAR_RETRY_COUNT = 1 << 12, MLX5_QP_OPTPAR_RNR_RETRY = 1 << 13, MLX5_QP_OPTPAR_ACK_TIMEOUT = 1 << 14, + MLX5_QP_OPTPAR_LAG_TX_AFF = 1 << 15, MLX5_QP_OPTPAR_PRI_PORT = 1 << 16, MLX5_QP_OPTPAR_SRQN = 1 << 18, MLX5_QP_OPTPAR_CQN_RCV = 1 << 19, @@ -321,6 +322,7 @@ struct mlx5_av { struct mlx5_ib_ah { struct ib_ah ibah; struct mlx5_av av; + u8 xmit_port; }; static inline struct mlx5_ib_ah *to_mah(struct ib_ah *ibah) -- cgit v1.2.3 From 184ecc9eb260d5a3bcdddc5bebd18f285ac004e9 Mon Sep 17 00:00:00 2001 From: Vincent Cheng Date: Fri, 1 May 2020 23:35:36 -0400 Subject: ptp: Add adjphase function to support phase offset control. Adds an adjust-phase function to take advantage of a PHC clock's hardware filtering capability, which uses a phase offset control word instead of a frequency offset control word. Signed-off-by: Vincent Cheng Reviewed-by: Richard Cochran Signed-off-by: David S. Miller --- drivers/ptp/ptp_clock.c | 3 +++ include/linux/ptp_clock_kernel.h | 6 +++++- 2 files changed, 8 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/drivers/ptp/ptp_clock.c b/drivers/ptp/ptp_clock.c index acabbe72e55e..fc984a8828fb 100644 --- a/drivers/ptp/ptp_clock.c +++ b/drivers/ptp/ptp_clock.c @@ -146,6 +146,9 @@ static int ptp_clock_adjtime(struct posix_clock *pc, struct __kernel_timex *tx) else err = ops->adjfreq(ops, ppb); ptp->dialed_frequency = tx->freq; + } else if (tx->modes & ADJ_OFFSET) { + if (ops->adjphase) + err = ops->adjphase(ops, tx->offset); + } else if (tx->modes == 0) { tx->freq = ptp->dialed_frequency; err = 0; diff --git a/include/linux/ptp_clock_kernel.h b/include/linux/ptp_clock_kernel.h index 121a7eda4593..31144d954d89 100644 --- a/include/linux/ptp_clock_kernel.h +++ b/include/linux/ptp_clock_kernel.h @@ -36,7 +36,7 @@ struct ptp_system_timestamp { }; /** - * struct ptp_clock_info - decribes a PTP hardware clock + * struct ptp_clock_info - describes a PTP hardware clock * * @owner: The clock driver should set to THIS_MODULE. * @name: A short "friendly name" to identify the clock and to @@ -65,6 +65,9 @@ struct ptp_system_timestamp { * parameter delta: Desired frequency offset from nominal frequency * in parts per billion * + * @adjphase: Adjusts the phase offset of the hardware clock. + * parameter delta: Desired change in nanoseconds. 
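+ *                  (Reached from clock_adjtime() with ADJ_OFFSET set in
+ *                  modes; the offset is fed to the PHC's hardware filter
+ *                  rather than stepping the clock as @adjtime does.)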
+ * + * @adjtime: Shifts the time of the hardware clock. * parameter delta: Desired change in nanoseconds. * @@ -128,6 +131,7 @@ struct ptp_clock_info { struct ptp_pin_desc *pin_config; int (*adjfine)(struct ptp_clock_info *ptp, long scaled_ppm); int (*adjfreq)(struct ptp_clock_info *ptp, s32 delta); + int (*adjphase)(struct ptp_clock_info *ptp, s32 phase); int (*adjtime)(struct ptp_clock_info *ptp, s64 delta); int (*gettime64)(struct ptp_clock_info *ptp, struct timespec64 *ts); int (*gettimex64)(struct ptp_clock_info *ptp, struct timespec64 *ts, -- cgit v1.2.3 From efbe3c2493d2f7a1e1a753780fe727b34709ebd2 Mon Sep 17 00:00:00 2001 From: Ira Weiny Date: Thu, 30 Apr 2020 07:41:33 -0700 Subject: fs: Remove unneeded IS_DAX() check in io_is_direct() Remove the check because DAX now has its own read/write methods, and file systems that support DAX check IS_DAX() prior to IOCB_DIRECT on their own. Therefore, it does not matter if the file state is DAX when the iocb flags are created. Also remove io_is_direct() as it is just a simple flag check. Reviewed-by: Dave Chinner Reviewed-by: Jan Kara Reviewed-by: Darrick J. Wong Reviewed-by: Christoph Hellwig Signed-off-by: Ira Weiny Signed-off-by: Darrick J. Wong --- drivers/block/loop.c | 6 +++--- include/linux/fs.h | 7 +------ 2 files changed, 4 insertions(+), 9 deletions(-) (limited to 'include/linux') diff --git a/drivers/block/loop.c b/drivers/block/loop.c index da693e6a834e..14372df0f354 100644 --- a/drivers/block/loop.c +++ b/drivers/block/loop.c @@ -634,8 +634,8 @@ static int do_req_filebacked(struct loop_device *lo, struct request *rq) static inline void loop_update_dio(struct loop_device *lo) { - __loop_update_dio(lo, io_is_direct(lo->lo_backing_file) | - lo->use_dio); + __loop_update_dio(lo, (lo->lo_backing_file->f_flags & O_DIRECT) | + lo->use_dio); } static void loop_reread_partitions(struct loop_device *lo, @@ -1028,7 +1028,7 @@ static int loop_set_fd(struct loop_device *lo, fmode_t mode, if (!(lo_flags & LO_FLAGS_READ_ONLY) && file->f_op->fsync) blk_queue_write_cache(lo->lo_queue, true, false); - if (io_is_direct(lo->lo_backing_file) && inode->i_sb->s_bdev) { + if ((lo->lo_backing_file->f_flags & O_DIRECT) && inode->i_sb->s_bdev) { /* In case of direct I/O, match underlying block size */ unsigned short bsize = bdev_logical_block_size( inode->i_sb->s_bdev); diff --git a/include/linux/fs.h b/include/linux/fs.h index 4f6f59b4f22a..a87cc5845a02 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -3394,11 +3394,6 @@ extern void setattr_copy(struct inode *inode, const struct iattr *attr); extern int file_update_time(struct file *file); -static inline bool io_is_direct(struct file *filp) -{ - return (filp->f_flags & O_DIRECT) || IS_DAX(filp->f_mapping->host); -} - static inline bool vma_is_dax(const struct vm_area_struct *vma) { return vma->vm_file && IS_DAX(vma->vm_file->f_mapping->host); @@ -3423,7 +3418,7 @@ static inline int iocb_flags(struct file *file) int res = 0; if (file->f_flags & O_APPEND) res |= IOCB_APPEND; - if (io_is_direct(file)) + if (file->f_flags & O_DIRECT) res |= IOCB_DIRECT; if ((file->f_flags & O_DSYNC) || IS_SYNC(file->f_mapping->host)) res |= IOCB_DSYNC; -- cgit v1.2.3 From a711d91cd97e6c9a554ccd1652527a7f36661857 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Sat, 25 Apr 2020 09:57:00 +0200 Subject: block: add a cdrom_device_info pointer to struct gendisk Add a pointer to the CDROM information structure to struct gendisk. 
This will allow various removable media file systems to call directly into the CDROM layer instead of abusing ioctls with kernel pointers. Signed-off-by: Christoph Hellwig Reviewed-by: Damien Le Moal Signed-off-by: Jens Axboe --- drivers/block/paride/pcd.c | 2 +- drivers/cdrom/cdrom.c | 5 ++++- drivers/cdrom/gdrom.c | 2 +- drivers/ide/ide-cd.c | 3 +-- drivers/scsi/sr.c | 3 +-- include/linux/cdrom.h | 2 +- include/linux/genhd.h | 9 +++++++++ 7 files changed, 18 insertions(+), 8 deletions(-) (limited to 'include/linux') diff --git a/drivers/block/paride/pcd.c b/drivers/block/paride/pcd.c index cda5cf917e9a..5124eca90e83 100644 --- a/drivers/block/paride/pcd.c +++ b/drivers/block/paride/pcd.c @@ -1032,7 +1032,7 @@ static int __init pcd_init(void) for (unit = 0, cd = pcd; unit < PCD_UNITS; unit++, cd++) { if (cd->present) { - register_cdrom(&cd->info); + register_cdrom(cd->disk, &cd->info); cd->disk->private_data = cd; add_disk(cd->disk); } diff --git a/drivers/cdrom/cdrom.c b/drivers/cdrom/cdrom.c index faca0f346fff..a1d2112fd283 100644 --- a/drivers/cdrom/cdrom.c +++ b/drivers/cdrom/cdrom.c @@ -586,7 +586,7 @@ static int cdrom_mrw_set_lba_space(struct cdrom_device_info *cdi, int space) return 0; } -int register_cdrom(struct cdrom_device_info *cdi) +int register_cdrom(struct gendisk *disk, struct cdrom_device_info *cdi) { static char banner_printed; const struct cdrom_device_ops *cdo = cdi->ops; @@ -601,6 +601,9 @@ int register_cdrom(struct cdrom_device_info *cdi) cdrom_sysctl_register(); } + cdi->disk = disk; + disk->cdi = cdi; + ENSURE(cdo, drive_status, CDC_DRIVE_STATUS); if (cdo->check_events == NULL && cdo->media_changed == NULL) WARN_ON_ONCE(cdo->capability & (CDC_MEDIA_CHANGED | CDC_SELECT_DISC)); diff --git a/drivers/cdrom/gdrom.c b/drivers/cdrom/gdrom.c index c51292c2a131..09b0cd292720 100644 --- a/drivers/cdrom/gdrom.c +++ b/drivers/cdrom/gdrom.c @@ -770,7 +770,7 @@ static int probe_gdrom(struct platform_device *devptr) goto probe_fail_no_disk; } probe_gdrom_setupdisk(); - if (register_cdrom(gd.cd_info)) { + if (register_cdrom(gd.disk, gd.cd_info)) { err = -ENODEV; goto probe_fail_cdrom_register; } diff --git a/drivers/ide/ide-cd.c b/drivers/ide/ide-cd.c index dcf8b51b47fd..40e124eb918a 100644 --- a/drivers/ide/ide-cd.c +++ b/drivers/ide/ide-cd.c @@ -1305,8 +1305,7 @@ static int ide_cdrom_register(ide_drive_t *drive, int nslots) if (drive->atapi_flags & IDE_AFLAG_NO_SPEED_SELECT) devinfo->mask |= CDC_SELECT_SPEED; - devinfo->disk = info->disk; - return register_cdrom(devinfo); + return register_cdrom(info->disk, devinfo); } static int ide_cdrom_probe_capabilities(ide_drive_t *drive) diff --git a/drivers/scsi/sr.c b/drivers/scsi/sr.c index d2fe3fa470f9..f9b589d60a46 100644 --- a/drivers/scsi/sr.c +++ b/drivers/scsi/sr.c @@ -794,9 +794,8 @@ static int sr_probe(struct device *dev) set_capacity(disk, cd->capacity); disk->private_data = &cd->driver; disk->queue = sdev->request_queue; - cd->cdi.disk = disk; - if (register_cdrom(&cd->cdi)) + if (register_cdrom(disk, &cd->cdi)) goto fail_put; /* diff --git a/include/linux/cdrom.h b/include/linux/cdrom.h index 528271c60018..4f74ce050253 100644 --- a/include/linux/cdrom.h +++ b/include/linux/cdrom.h @@ -104,7 +104,7 @@ extern unsigned int cdrom_check_events(struct cdrom_device_info *cdi, unsigned int clearing); extern int cdrom_media_changed(struct cdrom_device_info *); -extern int register_cdrom(struct cdrom_device_info *cdi); +extern int register_cdrom(struct gendisk *disk, struct cdrom_device_info *cdi); extern void unregister_cdrom(struct 
cdrom_device_info *cdi); typedef struct { diff --git a/include/linux/genhd.h b/include/linux/genhd.h index 058d895544c7..f9c226f9546a 100644 --- a/include/linux/genhd.h +++ b/include/linux/genhd.h @@ -217,11 +217,20 @@ struct gendisk { #ifdef CONFIG_BLK_DEV_INTEGRITY struct kobject integrity_kobj; #endif /* CONFIG_BLK_DEV_INTEGRITY */ +#if IS_ENABLED(CONFIG_CDROM) + struct cdrom_device_info *cdi; +#endif int node_id; struct badblocks *bb; struct lockdep_map lockdep_map; }; +#if IS_REACHABLE(CONFIG_CDROM) +#define disk_to_cdi(disk) ((disk)->cdi) +#else +#define disk_to_cdi(disk) NULL +#endif + static inline struct gendisk *part_to_disk(struct hd_struct *part) { if (likely(part)) { -- cgit v1.2.3 From 4c3cfcce45152309c2031b3b87bcf10e4f3899e1 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Sat, 25 Apr 2020 09:57:02 +0200 Subject: cdrom: factor out a cdrom_read_tocentry helper Factor out a version of the CDROMREADTOCENTRY ioctl handler that can be called directly from kernel space. Signed-off-by: Christoph Hellwig Reviewed-by: Damien Le Moal Reviewed-by: Hannes Reinecke Signed-off-by: Jens Axboe --- drivers/cdrom/cdrom.c | 39 ++++++++++++++++++++++----------------- include/linux/cdrom.h | 3 +++ 2 files changed, 25 insertions(+), 17 deletions(-) (limited to 'include/linux') diff --git a/drivers/cdrom/cdrom.c b/drivers/cdrom/cdrom.c index a1d2112fd283..c91d1e138214 100644 --- a/drivers/cdrom/cdrom.c +++ b/drivers/cdrom/cdrom.c @@ -2666,32 +2666,37 @@ static int cdrom_ioctl_read_tochdr(struct cdrom_device_info *cdi, return 0; } +int cdrom_read_tocentry(struct cdrom_device_info *cdi, + struct cdrom_tocentry *entry) +{ + u8 requested_format = entry->cdte_format; + int ret; + + if (requested_format != CDROM_MSF && requested_format != CDROM_LBA) + return -EINVAL; + + /* make interface to low-level uniform */ + entry->cdte_format = CDROM_MSF; + ret = cdi->ops->audio_ioctl(cdi, CDROMREADTOCENTRY, entry); + if (!ret) + sanitize_format(&entry->cdte_addr, &entry->cdte_format, + requested_format); + return ret; +} +EXPORT_SYMBOL_GPL(cdrom_read_tocentry); + static int cdrom_ioctl_read_tocentry(struct cdrom_device_info *cdi, void __user *argp) { struct cdrom_tocentry entry; - u8 requested_format; int ret; - /* cd_dbg(CD_DO_IOCTL, "entering CDROMREADTOCENTRY\n"); */ - if (copy_from_user(&entry, argp, sizeof(entry))) return -EFAULT; - - requested_format = entry.cdte_format; - if (requested_format != CDROM_MSF && requested_format != CDROM_LBA) - return -EINVAL; - /* make interface to low-level uniform */ - entry.cdte_format = CDROM_MSF; - ret = cdi->ops->audio_ioctl(cdi, CDROMREADTOCENTRY, &entry); - if (ret) - return ret; - sanitize_format(&entry.cdte_addr, &entry.cdte_format, requested_format); - - if (copy_to_user(argp, &entry, sizeof(entry))) + ret = cdrom_read_tocentry(cdi, &entry); + if (!ret && copy_to_user(argp, &entry, sizeof(entry))) return -EFAULT; - /* cd_dbg(CD_DO_IOCTL, "CDROMREADTOCENTRY successful\n"); */ - return 0; + return ret; } static int cdrom_ioctl_play_msf(struct cdrom_device_info *cdi, diff --git a/include/linux/cdrom.h b/include/linux/cdrom.h index 4f74ce050253..008c4d79fa33 100644 --- a/include/linux/cdrom.h +++ b/include/linux/cdrom.h @@ -94,6 +94,9 @@ struct cdrom_device_ops { struct packet_command *); }; +int cdrom_read_tocentry(struct cdrom_device_info *cdi, + struct cdrom_tocentry *entry); + /* the general block_device operations structure: */ extern int cdrom_open(struct cdrom_device_info *cdi, struct block_device *bdev, fmode_t mode); -- cgit v1.2.3 From 
eaf8e3e4b54a8b778257fc3921ad8f793c941882 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Sat, 25 Apr 2020 09:57:03 +0200 Subject: cdrom: factor out a cdrom_multisession helper Factor out a version of the CDROMMULTISESSION ioctl handler that can be called directly from kernel space. Signed-off-by: Christoph Hellwig Reviewed-by: Damien Le Moal Reviewed-by: Hannes Reinecke Signed-off-by: Jens Axboe --- drivers/cdrom/cdrom.c | 41 +++++++++++++++++++++++++---------------- include/linux/cdrom.h | 2 ++ 2 files changed, 27 insertions(+), 16 deletions(-) (limited to 'include/linux') diff --git a/drivers/cdrom/cdrom.c b/drivers/cdrom/cdrom.c index c91d1e138214..06896c07b133 100644 --- a/drivers/cdrom/cdrom.c +++ b/drivers/cdrom/cdrom.c @@ -2295,37 +2295,46 @@ retry: return cdrom_read_cdda_old(cdi, ubuf, lba, nframes); } -static int cdrom_ioctl_multisession(struct cdrom_device_info *cdi, - void __user *argp) +int cdrom_multisession(struct cdrom_device_info *cdi, + struct cdrom_multisession *info) { - struct cdrom_multisession ms_info; u8 requested_format; int ret; - cd_dbg(CD_DO_IOCTL, "entering CDROMMULTISESSION\n"); - if (!(cdi->ops->capability & CDC_MULTI_SESSION)) return -ENOSYS; - if (copy_from_user(&ms_info, argp, sizeof(ms_info))) - return -EFAULT; - - requested_format = ms_info.addr_format; + requested_format = info->addr_format; if (requested_format != CDROM_MSF && requested_format != CDROM_LBA) return -EINVAL; - ms_info.addr_format = CDROM_LBA; + info->addr_format = CDROM_LBA; - ret = cdi->ops->get_last_session(cdi, &ms_info); - if (ret) - return ret; + ret = cdi->ops->get_last_session(cdi, info); + if (!ret) + sanitize_format(&info->addr, &info->addr_format, + requested_format); + return ret; +} +EXPORT_SYMBOL_GPL(cdrom_multisession); - sanitize_format(&ms_info.addr, &ms_info.addr_format, requested_format); +static int cdrom_ioctl_multisession(struct cdrom_device_info *cdi, + void __user *argp) +{ + struct cdrom_multisession info; + int ret; + + cd_dbg(CD_DO_IOCTL, "entering CDROMMULTISESSION\n"); - if (copy_to_user(argp, &ms_info, sizeof(ms_info))) + if (copy_from_user(&info, argp, sizeof(info))) + return -EFAULT; + ret = cdrom_multisession(cdi, &info); + if (ret) + return ret; + if (copy_to_user(argp, &info, sizeof(info))) return -EFAULT; cd_dbg(CD_DO_IOCTL, "CDROMMULTISESSION successful\n"); - return 0; + return ret; } static int cdrom_ioctl_eject(struct cdrom_device_info *cdi) diff --git a/include/linux/cdrom.h b/include/linux/cdrom.h index 008c4d79fa33..8543fa59da72 100644 --- a/include/linux/cdrom.h +++ b/include/linux/cdrom.h @@ -94,6 +94,8 @@ struct cdrom_device_ops { struct packet_command *); }; +int cdrom_multisession(struct cdrom_device_info *cdi, + struct cdrom_multisession *info); int cdrom_read_tocentry(struct cdrom_device_info *cdi, struct cdrom_tocentry *entry); -- cgit v1.2.3 From b86cd700edd3bfe27f631649727b7796067bb3fd Mon Sep 17 00:00:00 2001 From: Heiner Kallweit Date: Mon, 4 May 2020 19:27:00 +0200 Subject: net: add helper eth_hw_addr_crc Several drivers use the same code as the basis for filter hashes. Therefore, let's factor it out to a helper. This way drivers don't have to access struct netdev_hw_addr internals. Signed-off-by: Heiner Kallweit Signed-off-by: David S. 
Miller --- include/linux/etherdevice.h | 12 ++++++++++++ 1 file changed, 12 insertions(+) (limited to 'include/linux') diff --git a/include/linux/etherdevice.h b/include/linux/etherdevice.h index 8801f1f986e5..2e5debc0373c 100644 --- a/include/linux/etherdevice.h +++ b/include/linux/etherdevice.h @@ -20,6 +20,7 @@ #include #include #include +#include #include #include @@ -265,6 +266,17 @@ static inline void eth_hw_addr_random(struct net_device *dev) eth_random_addr(dev->dev_addr); } +/** + * eth_hw_addr_crc - Calculate CRC from netdev_hw_addr + * @ha: pointer to hardware address + * + * Calculate CRC from a hardware address as the basis for filter hashes. + */ +static inline u32 eth_hw_addr_crc(struct netdev_hw_addr *ha) +{ + return ether_crc(ETH_ALEN, ha->addr); +} + /** * ether_addr_copy - Copy an Ethernet address * @dst: Pointer to a six-byte array Ethernet address destination -- cgit v1.2.3 From 1a33e10e4a95cb109ff1145098175df3113313ef Mon Sep 17 00:00:00 2001 From: Cong Wang Date: Sat, 2 May 2020 22:22:19 -0700 Subject: net: partially revert dynamic lockdep key changes This patch reverts the following commits: commit 064ff66e2bef84f1153087612032b5b9eab005bd "bonding: add missing netdev_update_lockdep_key()" commit 53d374979ef147ab51f5d632dfe20b14aebeccd0 "net: avoid updating qdisc_xmit_lock_key in netdev_update_lockdep_key()" commit 1f26c0d3d24125992ab0026b0dab16c08df947c7 "net: fix kernel-doc warning in " commit ab92d68fc22f9afab480153bd82a20f6e2533769 "net: core: add generic lockdep keys" but keeps the addr_list_lock_key because we still lock addr_list_lock nestedly on stack devices; unlike xmit_lock, this is safe because we don't take addr_list_lock on any fast path. Reported-and-tested-by: syzbot+aaa6fa4949cc5d9b7b25@syzkaller.appspotmail.com Cc: Dmitry Vyukov Cc: Taehee Yoo Signed-off-by: Cong Wang Acked-by: Taehee Yoo Signed-off-by: David S. 
Miller --- drivers/net/bonding/bond_main.c | 1 + drivers/net/ethernet/netronome/nfp/nfp_net_repr.c | 16 ++++ drivers/net/hamradio/bpqether.c | 20 +++++ drivers/net/hyperv/netvsc_drv.c | 2 + drivers/net/ipvlan/ipvlan_main.c | 2 + drivers/net/macsec.c | 2 + drivers/net/macvlan.c | 2 + drivers/net/ppp/ppp_generic.c | 2 + drivers/net/team/team.c | 1 + drivers/net/vrf.c | 1 + drivers/net/wireless/intersil/hostap/hostap_hw.c | 22 ++++++ include/linux/netdevice.h | 27 +++++-- net/8021q/vlan_dev.c | 21 ++++++ net/batman-adv/soft-interface.c | 30 ++++++++ net/bluetooth/6lowpan.c | 8 ++ net/core/dev.c | 90 ++++++++++++++++++----- net/dsa/slave.c | 12 +++ net/ieee802154/6lowpan/core.c | 8 ++ net/l2tp/l2tp_eth.c | 1 + net/netrom/af_netrom.c | 21 ++++++ net/rose/af_rose.c | 21 ++++++ net/sched/sch_generic.c | 17 +++-- 22 files changed, 294 insertions(+), 33 deletions(-) (limited to 'include/linux') diff --git a/drivers/net/bonding/bond_main.c b/drivers/net/bonding/bond_main.c index 2e70e43c5df5..d01871321d22 100644 --- a/drivers/net/bonding/bond_main.c +++ b/drivers/net/bonding/bond_main.c @@ -4898,6 +4898,7 @@ static int bond_init(struct net_device *bond_dev) spin_lock_init(&bond->stats_lock); lockdep_register_key(&bond->stats_lock_key); lockdep_set_class(&bond->stats_lock, &bond->stats_lock_key); + netdev_lockdep_set_classes(bond_dev); list_add_tail(&bond->bond_list, &bn->dev_list); diff --git a/drivers/net/ethernet/netronome/nfp/nfp_net_repr.c b/drivers/net/ethernet/netronome/nfp/nfp_net_repr.c index 79d72c88bbef..b3cabc274121 100644 --- a/drivers/net/ethernet/netronome/nfp/nfp_net_repr.c +++ b/drivers/net/ethernet/netronome/nfp/nfp_net_repr.c @@ -299,6 +299,20 @@ static void nfp_repr_clean(struct nfp_repr *repr) nfp_port_free(repr->port); } +static struct lock_class_key nfp_repr_netdev_xmit_lock_key; + +static void nfp_repr_set_lockdep_class_one(struct net_device *dev, + struct netdev_queue *txq, + void *_unused) +{ + lockdep_set_class(&txq->_xmit_lock, &nfp_repr_netdev_xmit_lock_key); +} + +static void nfp_repr_set_lockdep_class(struct net_device *dev) +{ + netdev_for_each_tx_queue(dev, nfp_repr_set_lockdep_class_one, NULL); +} + int nfp_repr_init(struct nfp_app *app, struct net_device *netdev, u32 cmsg_port_id, struct nfp_port *port, struct net_device *pf_netdev) @@ -308,6 +322,8 @@ int nfp_repr_init(struct nfp_app *app, struct net_device *netdev, u32 repr_cap = nn->tlv_caps.repr_cap; int err; + nfp_repr_set_lockdep_class(netdev); + repr->port = port; repr->dst = metadata_dst_alloc(0, METADATA_HW_PORT_MUX, GFP_KERNEL); if (!repr->dst) diff --git a/drivers/net/hamradio/bpqether.c b/drivers/net/hamradio/bpqether.c index fbea6f232819..206688154fdf 100644 --- a/drivers/net/hamradio/bpqether.c +++ b/drivers/net/hamradio/bpqether.c @@ -107,6 +107,25 @@ struct bpqdev { static LIST_HEAD(bpq_devices); +/* + * bpqether network devices are paired with ethernet devices below them, so + * form a special "super class" of normal ethernet devices; split their locks + * off into a separate class since they always nest. 
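+ * Only the per-queue _xmit_lock needs this static class; addr_list_lock
+ * keeps the per-device key registered in alloc_netdev_mqs().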
+ */ +static struct lock_class_key bpq_netdev_xmit_lock_key; + +static void bpq_set_lockdep_class_one(struct net_device *dev, + struct netdev_queue *txq, + void *_unused) +{ + lockdep_set_class(&txq->_xmit_lock, &bpq_netdev_xmit_lock_key); +} + +static void bpq_set_lockdep_class(struct net_device *dev) +{ + netdev_for_each_tx_queue(dev, bpq_set_lockdep_class_one, NULL); +} + /* ------------------------------------------------------------------------ */ @@ -477,6 +496,7 @@ static int bpq_new_device(struct net_device *edev) err = register_netdevice(ndev); if (err) goto error; + bpq_set_lockdep_class(ndev); /* List protected by RTNL */ list_add_rcu(&bpq->bpq_list, &bpq_devices); diff --git a/drivers/net/hyperv/netvsc_drv.c b/drivers/net/hyperv/netvsc_drv.c index d8e86bdbfba1..c0b647a4c893 100644 --- a/drivers/net/hyperv/netvsc_drv.c +++ b/drivers/net/hyperv/netvsc_drv.c @@ -2456,6 +2456,8 @@ static int netvsc_probe(struct hv_device *dev, NETIF_F_HW_VLAN_CTAG_RX; net->vlan_features = net->features; + netdev_lockdep_set_classes(net); + /* MTU range: 68 - 1500 or 65521 */ net->min_mtu = NETVSC_MTU_MIN; if (nvdev->nvsp_version >= NVSP_PROTOCOL_VERSION_2) diff --git a/drivers/net/ipvlan/ipvlan_main.c b/drivers/net/ipvlan/ipvlan_main.c index f195f278a83a..15e87c097b0b 100644 --- a/drivers/net/ipvlan/ipvlan_main.c +++ b/drivers/net/ipvlan/ipvlan_main.c @@ -131,6 +131,8 @@ static int ipvlan_init(struct net_device *dev) dev->gso_max_segs = phy_dev->gso_max_segs; dev->hard_header_len = phy_dev->hard_header_len; + netdev_lockdep_set_classes(dev); + ipvlan->pcpu_stats = netdev_alloc_pcpu_stats(struct ipvl_pcpu_stats); if (!ipvlan->pcpu_stats) return -ENOMEM; diff --git a/drivers/net/macsec.c b/drivers/net/macsec.c index 758baf7cb8a1..ea3f25cc79ef 100644 --- a/drivers/net/macsec.c +++ b/drivers/net/macsec.c @@ -4047,6 +4047,8 @@ static int macsec_newlink(struct net *net, struct net_device *dev, if (err < 0) return err; + netdev_lockdep_set_classes(dev); + err = netdev_upper_dev_link(real_dev, dev, extack); if (err < 0) goto unregister; diff --git a/drivers/net/macvlan.c b/drivers/net/macvlan.c index d45600e0a38c..34eb073cdd74 100644 --- a/drivers/net/macvlan.c +++ b/drivers/net/macvlan.c @@ -890,6 +890,8 @@ static int macvlan_init(struct net_device *dev) dev->gso_max_segs = lowerdev->gso_max_segs; dev->hard_header_len = lowerdev->hard_header_len; + netdev_lockdep_set_classes(dev); + vlan->pcpu_stats = netdev_alloc_pcpu_stats(struct vlan_pcpu_stats); if (!vlan->pcpu_stats) return -ENOMEM; diff --git a/drivers/net/ppp/ppp_generic.c b/drivers/net/ppp/ppp_generic.c index 22cc2cb9d878..7d005896a0f9 100644 --- a/drivers/net/ppp/ppp_generic.c +++ b/drivers/net/ppp/ppp_generic.c @@ -1410,6 +1410,8 @@ static int ppp_dev_init(struct net_device *dev) { struct ppp *ppp; + netdev_lockdep_set_classes(dev); + ppp = netdev_priv(dev); /* Let the netdevice take a reference on the ppp file. 
This ensures * that ppp_destroy_interface() won't run before the device gets diff --git a/drivers/net/team/team.c b/drivers/net/team/team.c index 04845a4017f9..8c1e02752ff6 100644 --- a/drivers/net/team/team.c +++ b/drivers/net/team/team.c @@ -1647,6 +1647,7 @@ static int team_init(struct net_device *dev) lockdep_register_key(&team->team_lock_key); __mutex_init(&team->lock, "team->team_lock_key", &team->team_lock_key); + netdev_lockdep_set_classes(dev); return 0; diff --git a/drivers/net/vrf.c b/drivers/net/vrf.c index 56f8aab46f89..43928a1c2f2a 100644 --- a/drivers/net/vrf.c +++ b/drivers/net/vrf.c @@ -867,6 +867,7 @@ static int vrf_dev_init(struct net_device *dev) /* similarly, oper state is irrelevant; set to up to avoid confusion */ dev->operstate = IF_OPER_UP; + netdev_lockdep_set_classes(dev); return 0; out_rth: diff --git a/drivers/net/wireless/intersil/hostap/hostap_hw.c b/drivers/net/wireless/intersil/hostap/hostap_hw.c index 58212c532c90..aadf3dec5bf3 100644 --- a/drivers/net/wireless/intersil/hostap/hostap_hw.c +++ b/drivers/net/wireless/intersil/hostap/hostap_hw.c @@ -3041,6 +3041,27 @@ static void prism2_clear_set_tim_queue(local_info_t *local) } } + +/* + * HostAP uses two layers of net devices, where the inner + * layer gets called all the time from the outer layer. + * This is a natural nesting, which needs a split lock type. + */ +static struct lock_class_key hostap_netdev_xmit_lock_key; + +static void prism2_set_lockdep_class_one(struct net_device *dev, + struct netdev_queue *txq, + void *_unused) +{ + lockdep_set_class(&txq->_xmit_lock, + &hostap_netdev_xmit_lock_key); +} + +static void prism2_set_lockdep_class(struct net_device *dev) +{ + netdev_for_each_tx_queue(dev, prism2_set_lockdep_class_one, NULL); +} + static struct net_device * prism2_init_local_data(struct prism2_helper_functions *funcs, int card_idx, struct device *sdev) @@ -3199,6 +3220,7 @@ while (0) if (ret >= 0) ret = register_netdevice(dev); + prism2_set_lockdep_class(dev); rtnl_unlock(); if (ret < 0) { printk(KERN_WARNING "%s: register netdevice failed!\n", diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 5a8d40f1ffe2..7725efd6e48a 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -1805,13 +1805,11 @@ enum netdev_priv_flags { * @phydev: Physical device may attach itself * for hardware timestamping * @sfp_bus: attached &struct sfp_bus structure. 
- * @qdisc_tx_busylock_key: lockdep class annotating Qdisc->busylock - * spinlock - * @qdisc_running_key: lockdep class annotating Qdisc->running seqcount - * @qdisc_xmit_lock_key: lockdep class annotating - * netdev_queue->_xmit_lock spinlock + * * @addr_list_lock_key: lockdep class annotating * net_device->addr_list_lock spinlock + * @qdisc_tx_busylock: lockdep class annotating Qdisc->busylock spinlock + * @qdisc_running_key: lockdep class annotating Qdisc->running seqcount * * @proto_down: protocol port state information can be sent to the * switch driver and used to set the phys state of the @@ -2112,10 +2110,9 @@ struct net_device { #endif struct phy_device *phydev; struct sfp_bus *sfp_bus; - struct lock_class_key qdisc_tx_busylock_key; - struct lock_class_key qdisc_running_key; - struct lock_class_key qdisc_xmit_lock_key; struct lock_class_key addr_list_lock_key; + struct lock_class_key *qdisc_tx_busylock; + struct lock_class_key *qdisc_running_key; bool proto_down; unsigned wol_enabled:1; @@ -2200,6 +2197,20 @@ static inline void netdev_for_each_tx_queue(struct net_device *dev, f(dev, &dev->_tx[i], arg); } +#define netdev_lockdep_set_classes(dev) \ +{ \ + static struct lock_class_key qdisc_tx_busylock_key; \ + static struct lock_class_key qdisc_running_key; \ + static struct lock_class_key qdisc_xmit_lock_key; \ + unsigned int i; \ + \ + (dev)->qdisc_tx_busylock = &qdisc_tx_busylock_key; \ + (dev)->qdisc_running_key = &qdisc_running_key; \ + for (i = 0; i < (dev)->num_tx_queues; i++) \ + lockdep_set_class(&(dev)->_tx[i]._xmit_lock, \ + &qdisc_xmit_lock_key); \ +} + u16 netdev_pick_tx(struct net_device *dev, struct sk_buff *skb, struct net_device *sb_dev); struct netdev_queue *netdev_core_pick_tx(struct net_device *dev, diff --git a/net/8021q/vlan_dev.c b/net/8021q/vlan_dev.c index 990b9fde28c6..319220b2341d 100644 --- a/net/8021q/vlan_dev.c +++ b/net/8021q/vlan_dev.c @@ -489,6 +489,25 @@ static void vlan_dev_set_rx_mode(struct net_device *vlan_dev) dev_uc_sync(vlan_dev_priv(vlan_dev)->real_dev, vlan_dev); } +/* + * vlan network devices have devices nesting below it, and are a special + * "super class" of normal network devices; split their locks off into a + * separate class since they always nest. + */ +static struct lock_class_key vlan_netdev_xmit_lock_key; + +static void vlan_dev_set_lockdep_one(struct net_device *dev, + struct netdev_queue *txq, + void *unused) +{ + lockdep_set_class(&txq->_xmit_lock, &vlan_netdev_xmit_lock_key); +} + +static void vlan_dev_set_lockdep_class(struct net_device *dev) +{ + netdev_for_each_tx_queue(dev, vlan_dev_set_lockdep_one, NULL); +} + static const struct header_ops vlan_header_ops = { .create = vlan_dev_hard_header, .parse = eth_header_parse, @@ -579,6 +598,8 @@ static int vlan_dev_init(struct net_device *dev) SET_NETDEV_DEVTYPE(dev, &vlan_type); + vlan_dev_set_lockdep_class(dev); + vlan->vlan_pcpu_stats = netdev_alloc_pcpu_stats(struct vlan_pcpu_stats); if (!vlan->vlan_pcpu_stats) return -ENOMEM; diff --git a/net/batman-adv/soft-interface.c b/net/batman-adv/soft-interface.c index 5f05a728f347..822af540b854 100644 --- a/net/batman-adv/soft-interface.c +++ b/net/batman-adv/soft-interface.c @@ -739,6 +739,34 @@ static int batadv_interface_kill_vid(struct net_device *dev, __be16 proto, return 0; } +/* batman-adv network devices have devices nesting below it and are a special + * "super class" of normal network devices; split their locks off into a + * separate class since they always nest. 
+ */ +static struct lock_class_key batadv_netdev_xmit_lock_key; + +/** + * batadv_set_lockdep_class_one() - Set lockdep class for a single tx queue + * @dev: device which owns the tx queue + * @txq: tx queue to modify + * @_unused: always NULL + */ +static void batadv_set_lockdep_class_one(struct net_device *dev, + struct netdev_queue *txq, + void *_unused) +{ + lockdep_set_class(&txq->_xmit_lock, &batadv_netdev_xmit_lock_key); +} + +/** + * batadv_set_lockdep_class() - Set txq and addr_list lockdep class + * @dev: network device to modify + */ +static void batadv_set_lockdep_class(struct net_device *dev) +{ + netdev_for_each_tx_queue(dev, batadv_set_lockdep_class_one, NULL); +} + /** * batadv_softif_init_late() - late stage initialization of soft interface * @dev: registered network device to modify @@ -752,6 +780,8 @@ static int batadv_softif_init_late(struct net_device *dev) int ret; size_t cnt_len = sizeof(u64) * BATADV_CNT_NUM; + batadv_set_lockdep_class(dev); + bat_priv = netdev_priv(dev); bat_priv->soft_iface = dev; diff --git a/net/bluetooth/6lowpan.c b/net/bluetooth/6lowpan.c index 4febc82a7c76..bb55d92691b0 100644 --- a/net/bluetooth/6lowpan.c +++ b/net/bluetooth/6lowpan.c @@ -571,7 +571,15 @@ static netdev_tx_t bt_xmit(struct sk_buff *skb, struct net_device *netdev) return err < 0 ? NET_XMIT_DROP : err; } +static int bt_dev_init(struct net_device *dev) +{ + netdev_lockdep_set_classes(dev); + + return 0; +} + static const struct net_device_ops netdev_ops = { + .ndo_init = bt_dev_init, .ndo_start_xmit = bt_xmit, }; diff --git a/net/core/dev.c b/net/core/dev.c index afff16849c26..f8d83922a6af 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -398,6 +398,74 @@ static RAW_NOTIFIER_HEAD(netdev_chain); DEFINE_PER_CPU_ALIGNED(struct softnet_data, softnet_data); EXPORT_PER_CPU_SYMBOL(softnet_data); +#ifdef CONFIG_LOCKDEP +/* + * register_netdevice() inits txq->_xmit_lock and sets lockdep class + * according to dev->type + */ +static const unsigned short netdev_lock_type[] = { + ARPHRD_NETROM, ARPHRD_ETHER, ARPHRD_EETHER, ARPHRD_AX25, + ARPHRD_PRONET, ARPHRD_CHAOS, ARPHRD_IEEE802, ARPHRD_ARCNET, + ARPHRD_APPLETLK, ARPHRD_DLCI, ARPHRD_ATM, ARPHRD_METRICOM, + ARPHRD_IEEE1394, ARPHRD_EUI64, ARPHRD_INFINIBAND, ARPHRD_SLIP, + ARPHRD_CSLIP, ARPHRD_SLIP6, ARPHRD_CSLIP6, ARPHRD_RSRVD, + ARPHRD_ADAPT, ARPHRD_ROSE, ARPHRD_X25, ARPHRD_HWX25, + ARPHRD_PPP, ARPHRD_CISCO, ARPHRD_LAPB, ARPHRD_DDCMP, + ARPHRD_RAWHDLC, ARPHRD_TUNNEL, ARPHRD_TUNNEL6, ARPHRD_FRAD, + ARPHRD_SKIP, ARPHRD_LOOPBACK, ARPHRD_LOCALTLK, ARPHRD_FDDI, + ARPHRD_BIF, ARPHRD_SIT, ARPHRD_IPDDP, ARPHRD_IPGRE, + ARPHRD_PIMREG, ARPHRD_HIPPI, ARPHRD_ASH, ARPHRD_ECONET, + ARPHRD_IRDA, ARPHRD_FCPP, ARPHRD_FCAL, ARPHRD_FCPL, + ARPHRD_FCFABRIC, ARPHRD_IEEE80211, ARPHRD_IEEE80211_PRISM, + ARPHRD_IEEE80211_RADIOTAP, ARPHRD_PHONET, ARPHRD_PHONET_PIPE, + ARPHRD_IEEE802154, ARPHRD_VOID, ARPHRD_NONE}; + +static const char *const netdev_lock_name[] = { + "_xmit_NETROM", "_xmit_ETHER", "_xmit_EETHER", "_xmit_AX25", + "_xmit_PRONET", "_xmit_CHAOS", "_xmit_IEEE802", "_xmit_ARCNET", + "_xmit_APPLETLK", "_xmit_DLCI", "_xmit_ATM", "_xmit_METRICOM", + "_xmit_IEEE1394", "_xmit_EUI64", "_xmit_INFINIBAND", "_xmit_SLIP", + "_xmit_CSLIP", "_xmit_SLIP6", "_xmit_CSLIP6", "_xmit_RSRVD", + "_xmit_ADAPT", "_xmit_ROSE", "_xmit_X25", "_xmit_HWX25", + "_xmit_PPP", "_xmit_CISCO", "_xmit_LAPB", "_xmit_DDCMP", + "_xmit_RAWHDLC", "_xmit_TUNNEL", "_xmit_TUNNEL6", "_xmit_FRAD", + "_xmit_SKIP", "_xmit_LOOPBACK", "_xmit_LOCALTLK", "_xmit_FDDI", + "_xmit_BIF", "_xmit_SIT", 
"_xmit_IPDDP", "_xmit_IPGRE", + "_xmit_PIMREG", "_xmit_HIPPI", "_xmit_ASH", "_xmit_ECONET", + "_xmit_IRDA", "_xmit_FCPP", "_xmit_FCAL", "_xmit_FCPL", + "_xmit_FCFABRIC", "_xmit_IEEE80211", "_xmit_IEEE80211_PRISM", + "_xmit_IEEE80211_RADIOTAP", "_xmit_PHONET", "_xmit_PHONET_PIPE", + "_xmit_IEEE802154", "_xmit_VOID", "_xmit_NONE"}; + +static struct lock_class_key netdev_xmit_lock_key[ARRAY_SIZE(netdev_lock_type)]; + +static inline unsigned short netdev_lock_pos(unsigned short dev_type) +{ + int i; + + for (i = 0; i < ARRAY_SIZE(netdev_lock_type); i++) + if (netdev_lock_type[i] == dev_type) + return i; + /* the last key is used by default */ + return ARRAY_SIZE(netdev_lock_type) - 1; +} + +static inline void netdev_set_xmit_lockdep_class(spinlock_t *lock, + unsigned short dev_type) +{ + int i; + + i = netdev_lock_pos(dev_type); + lockdep_set_class_and_name(lock, &netdev_xmit_lock_key[i], + netdev_lock_name[i]); +} +#else +static inline void netdev_set_xmit_lockdep_class(spinlock_t *lock, + unsigned short dev_type) +{ +} +#endif + /******************************************************************************* * * Protocol management and registration routines @@ -9208,7 +9276,7 @@ static void netdev_init_one_queue(struct net_device *dev, { /* Initialize queue lock */ spin_lock_init(&queue->_xmit_lock); - lockdep_set_class(&queue->_xmit_lock, &dev->qdisc_xmit_lock_key); + netdev_set_xmit_lockdep_class(&queue->_xmit_lock, dev->type); queue->xmit_lock_owner = -1; netdev_queue_numa_node_write(queue, NUMA_NO_NODE); queue->dev = dev; @@ -9255,22 +9323,6 @@ void netif_tx_stop_all_queues(struct net_device *dev) } EXPORT_SYMBOL(netif_tx_stop_all_queues); -static void netdev_register_lockdep_key(struct net_device *dev) -{ - lockdep_register_key(&dev->qdisc_tx_busylock_key); - lockdep_register_key(&dev->qdisc_running_key); - lockdep_register_key(&dev->qdisc_xmit_lock_key); - lockdep_register_key(&dev->addr_list_lock_key); -} - -static void netdev_unregister_lockdep_key(struct net_device *dev) -{ - lockdep_unregister_key(&dev->qdisc_tx_busylock_key); - lockdep_unregister_key(&dev->qdisc_running_key); - lockdep_unregister_key(&dev->qdisc_xmit_lock_key); - lockdep_unregister_key(&dev->addr_list_lock_key); -} - void netdev_update_lockdep_key(struct net_device *dev) { lockdep_unregister_key(&dev->addr_list_lock_key); @@ -9837,7 +9889,7 @@ struct net_device *alloc_netdev_mqs(int sizeof_priv, const char *name, dev_net_set(dev, &init_net); - netdev_register_lockdep_key(dev); + lockdep_register_key(&dev->addr_list_lock_key); dev->gso_max_size = GSO_MAX_SIZE; dev->gso_max_segs = GSO_MAX_SEGS; @@ -9926,7 +9978,7 @@ void free_netdev(struct net_device *dev) free_percpu(dev->xdp_bulkq); dev->xdp_bulkq = NULL; - netdev_unregister_lockdep_key(dev); + lockdep_unregister_key(&dev->addr_list_lock_key); /* Compatibility with error handling in drivers */ if (dev->reg_state == NETREG_UNINITIALIZED) { diff --git a/net/dsa/slave.c b/net/dsa/slave.c index ba8bf90dc0cc..fa2634043751 100644 --- a/net/dsa/slave.c +++ b/net/dsa/slave.c @@ -1671,6 +1671,15 @@ static int dsa_slave_phy_setup(struct net_device *slave_dev) return ret; } +static struct lock_class_key dsa_slave_netdev_xmit_lock_key; +static void dsa_slave_set_lockdep_class_one(struct net_device *dev, + struct netdev_queue *txq, + void *_unused) +{ + lockdep_set_class(&txq->_xmit_lock, + &dsa_slave_netdev_xmit_lock_key); +} + int dsa_slave_suspend(struct net_device *slave_dev) { struct dsa_port *dp = dsa_slave_to_port(slave_dev); @@ -1754,6 +1763,9 @@ int 
dsa_slave_create(struct dsa_port *port) slave_dev->max_mtu = ETH_MAX_MTU; SET_NETDEV_DEVTYPE(slave_dev, &dsa_type); + netdev_for_each_tx_queue(slave_dev, dsa_slave_set_lockdep_class_one, + NULL); + SET_NETDEV_DEV(slave_dev, port->ds->dev); slave_dev->dev.of_node = port->dn; slave_dev->vlan_features = master->vlan_features; diff --git a/net/ieee802154/6lowpan/core.c b/net/ieee802154/6lowpan/core.c index c0b107cdd715..3297e7fa9945 100644 --- a/net/ieee802154/6lowpan/core.c +++ b/net/ieee802154/6lowpan/core.c @@ -58,6 +58,13 @@ static const struct header_ops lowpan_header_ops = { .create = lowpan_header_create, }; +static int lowpan_dev_init(struct net_device *ldev) +{ + netdev_lockdep_set_classes(ldev); + + return 0; +} + static int lowpan_open(struct net_device *dev) { if (!open_count) @@ -89,6 +96,7 @@ static int lowpan_get_iflink(const struct net_device *dev) } static const struct net_device_ops lowpan_netdev_ops = { + .ndo_init = lowpan_dev_init, .ndo_start_xmit = lowpan_xmit, .ndo_open = lowpan_open, .ndo_stop = lowpan_stop, diff --git a/net/l2tp/l2tp_eth.c b/net/l2tp/l2tp_eth.c index d3b520b9b2c9..fd5ac2788e45 100644 --- a/net/l2tp/l2tp_eth.c +++ b/net/l2tp/l2tp_eth.c @@ -56,6 +56,7 @@ static int l2tp_eth_dev_init(struct net_device *dev) { eth_hw_addr_random(dev); eth_broadcast_addr(dev->broadcast); + netdev_lockdep_set_classes(dev); return 0; } diff --git a/net/netrom/af_netrom.c b/net/netrom/af_netrom.c index 7b1a74f74aad..eccc7d366e17 100644 --- a/net/netrom/af_netrom.c +++ b/net/netrom/af_netrom.c @@ -63,6 +63,26 @@ static DEFINE_SPINLOCK(nr_list_lock); static const struct proto_ops nr_proto_ops; +/* + * NETROM network devices are virtual network devices encapsulating NETROM + * frames into AX.25 which will be sent through an AX.25 device, so form a + * special "super class" of normal net devices; split their locks off into a + * separate class since they always nest. + */ +static struct lock_class_key nr_netdev_xmit_lock_key; + +static void nr_set_lockdep_one(struct net_device *dev, + struct netdev_queue *txq, + void *_unused) +{ + lockdep_set_class(&txq->_xmit_lock, &nr_netdev_xmit_lock_key); +} + +static void nr_set_lockdep_key(struct net_device *dev) +{ + netdev_for_each_tx_queue(dev, nr_set_lockdep_one, NULL); +} + /* * Socket removal during an interrupt is now safe. */ @@ -1394,6 +1414,7 @@ static int __init nr_proto_init(void) free_netdev(dev); goto fail; } + nr_set_lockdep_key(dev); dev_nr[i] = dev; } diff --git a/net/rose/af_rose.c b/net/rose/af_rose.c index 1e8eeb044b07..e7a872207b46 100644 --- a/net/rose/af_rose.c +++ b/net/rose/af_rose.c @@ -64,6 +64,26 @@ static const struct proto_ops rose_proto_ops; ax25_address rose_callsign; +/* + * ROSE network devices are virtual network devices encapsulating ROSE + * frames into AX.25 which will be sent through an AX.25 device, so form a + * special "super class" of normal net devices; split their locks off into a + * separate class since they always nest. + */ +static struct lock_class_key rose_netdev_xmit_lock_key; + +static void rose_set_lockdep_one(struct net_device *dev, + struct netdev_queue *txq, + void *_unused) +{ + lockdep_set_class(&txq->_xmit_lock, &rose_netdev_xmit_lock_key); +} + +static void rose_set_lockdep_key(struct net_device *dev) +{ + netdev_for_each_tx_queue(dev, rose_set_lockdep_one, NULL); +} + /* * Convert a ROSE address into text. 
*/ @@ -1511,6 +1531,7 @@ static int __init rose_proto_init(void) free_netdev(dev); goto fail; } + rose_set_lockdep_key(dev); dev_rose[i] = dev; } diff --git a/net/sched/sch_generic.c b/net/sched/sch_generic.c index ad24fa1a51e6..ebc55d884247 100644 --- a/net/sched/sch_generic.c +++ b/net/sched/sch_generic.c @@ -794,6 +794,9 @@ struct Qdisc_ops pfifo_fast_ops __read_mostly = { }; EXPORT_SYMBOL(pfifo_fast_ops); +static struct lock_class_key qdisc_tx_busylock; +static struct lock_class_key qdisc_running_key; + struct Qdisc *qdisc_alloc(struct netdev_queue *dev_queue, const struct Qdisc_ops *ops, struct netlink_ext_ack *extack) @@ -846,9 +849,17 @@ struct Qdisc *qdisc_alloc(struct netdev_queue *dev_queue, } spin_lock_init(&sch->busylock); + lockdep_set_class(&sch->busylock, + dev->qdisc_tx_busylock ?: &qdisc_tx_busylock); + /* seqlock has the same scope of busylock, for NOLOCK qdisc */ spin_lock_init(&sch->seqlock); + lockdep_set_class(&sch->busylock, + dev->qdisc_tx_busylock ?: &qdisc_tx_busylock); + seqcount_init(&sch->running); + lockdep_set_class(&sch->running, + dev->qdisc_running_key ?: &qdisc_running_key); sch->ops = ops; sch->flags = ops->static_flags; @@ -859,12 +870,6 @@ struct Qdisc *qdisc_alloc(struct netdev_queue *dev_queue, dev_hold(dev); refcount_set(&sch->refcnt, 1); - if (sch != &noop_qdisc) { - lockdep_set_class(&sch->busylock, &dev->qdisc_tx_busylock_key); - lockdep_set_class(&sch->seqlock, &dev->qdisc_tx_busylock_key); - lockdep_set_class(&sch->running, &dev->qdisc_running_key); - } - return sch; errout1: kfree(p); -- cgit v1.2.3 From d26c0cc53950464a24adfa76867f1d71f0cbbea6 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Thu, 30 Apr 2020 23:30:47 +0200 Subject: bpf: Avoid gcc-10 stringop-overflow warning in struct bpf_prog gcc-10 warns about accesses to zero-length arrays: kernel/bpf/core.c: In function 'bpf_patch_insn_single': cc1: warning: writing 8 bytes into a region of size 0 [-Wstringop-overflow=] In file included from kernel/bpf/core.c:21: include/linux/filter.h:550:20: note: at offset 0 to object 'insnsi' with size 0 declared here 550 | struct bpf_insn insnsi[0]; | ^~~~~~ In this case, we really want to have two flexible-array members, but that is not possible. Removing the union to make insnsi a flexible-array member while leaving insns as a zero-length array fixes the warning, as nothing writes to the other one in that way. This trick only works on linux-3.18 or higher, as older versions had additional members in the union. 
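To make the distinction concrete, here is a minimal stand-alone C sketch (struct names invented, not kernel code): gcc-10's -Wstringop-overflow treats a store through a zero-length array member as a write into an object of declared size 0, while a C99 flexible array member carries no assumed size, so the same store is accepted:

        #include <stdlib.h>

        struct zla_prog {
                unsigned int len;
                long insns[0];          /* GNU zero-length array: declared size 0 */
        };

        struct fam_prog {
                unsigned int len;
                long insns[];           /* C99 flexible array member */
        };

        int main(void)
        {
                struct fam_prog *p = malloc(sizeof(*p) + 4 * sizeof(long));

                if (!p)
                        return 1;
                p->insns[0] = 42;       /* ok: the size comes from the allocation */
                free(p);
                return 0;
        }

With struct zla_prog in place of struct fam_prog, newer gcc may emit the stringop-overflow warning quoted above for the same store.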
Fixes: 60a3b2253c41 ("net: bpf: make eBPF interpreter images read-only") Signed-off-by: Arnd Bergmann Signed-off-by: Daniel Borkmann Link: https://lore.kernel.org/bpf/20200430213101.135134-6-arnd@arndb.de --- include/linux/filter.h | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/filter.h b/include/linux/filter.h index af37318bb1c5..73d06a39e2d6 100644 --- a/include/linux/filter.h +++ b/include/linux/filter.h @@ -545,10 +545,8 @@ struct bpf_prog { unsigned int (*bpf_func)(const void *ctx, const struct bpf_insn *insn); /* Instructions for interpreter */ - union { - struct sock_filter insns[0]; - struct bpf_insn insnsi[0]; - }; + struct sock_filter insns[0]; + struct bpf_insn insnsi[]; }; struct sk_filter { -- cgit v1.2.3 From d8e81bc3e87c9e7994f5bf24e215a607899ca470 Mon Sep 17 00:00:00 2001 From: Mauro Carvalho Chehab Date: Wed, 15 Apr 2020 16:45:16 +0200 Subject: docs: dt: convert usage-model.txt to ReST - Add a SPDX header; - Adjust document title; - Use footnoote markups; - Some whitespace fixes and new line breaks; - Mark literal blocks as such; - Add it to devicetree/index.rst. Signed-off-by: Mauro Carvalho Chehab Acked-by: Lee Jones Signed-off-by: Rob Herring --- Documentation/devicetree/index.rst | 1 + Documentation/devicetree/of_unittest.txt | 2 +- Documentation/devicetree/usage-model.rst | 420 +++++++++++++++++++++++++++++++ Documentation/devicetree/usage-model.txt | 415 ------------------------------ include/linux/mfd/core.h | 2 +- 5 files changed, 423 insertions(+), 417 deletions(-) create mode 100644 Documentation/devicetree/usage-model.rst delete mode 100644 Documentation/devicetree/usage-model.txt (limited to 'include/linux') diff --git a/Documentation/devicetree/index.rst b/Documentation/devicetree/index.rst index a11efe26f205..7a6aad7d384a 100644 --- a/Documentation/devicetree/index.rst +++ b/Documentation/devicetree/index.rst @@ -7,4 +7,5 @@ Open Firmware and Device Tree .. toctree:: :maxdepth: 1 + usage-model writing-schema diff --git a/Documentation/devicetree/of_unittest.txt b/Documentation/devicetree/of_unittest.txt index 3e4e7d48ae93..9fdd2de9b770 100644 --- a/Documentation/devicetree/of_unittest.txt +++ b/Documentation/devicetree/of_unittest.txt @@ -11,7 +11,7 @@ architecture. It is recommended to read the following documents before moving ahead. -[1] Documentation/devicetree/usage-model.txt +[1] Documentation/devicetree/usage-model.rst [2] http://www.devicetree.org/Device_Tree_Usage OF Selftest has been designed to test the interface (include/linux/of.h) diff --git a/Documentation/devicetree/usage-model.rst b/Documentation/devicetree/usage-model.rst new file mode 100644 index 000000000000..326d7af10c5b --- /dev/null +++ b/Documentation/devicetree/usage-model.rst @@ -0,0 +1,420 @@ +.. SPDX-License-Identifier: GPL-2.0 + +========================= +Linux and the Device Tree +========================= + +The Linux usage model for device tree data + +:Author: Grant Likely + +This article describes how Linux uses the device tree. An overview of +the device tree data format can be found on the device tree usage page +at devicetree.org\ [1]_. + +.. [1] http://devicetree.org/Device_Tree_Usage + +The "Open Firmware Device Tree", or simply Device Tree (DT), is a data +structure and language for describing hardware. More specifically, it +is a description of hardware that is readable by an operating system +so that the operating system doesn't need to hard code details of the +machine. 
+ +Structurally, the DT is a tree, or acyclic graph with named nodes, and +nodes may have an arbitrary number of named properties encapsulating +arbitrary data. A mechanism also exists to create arbitrary +links from one node to another outside of the natural tree structure. + +Conceptually, a common set of usage conventions, called 'bindings', +is defined for how data should appear in the tree to describe typical +hardware characteristics including data busses, interrupt lines, GPIO +connections, and peripheral devices. + +As much as possible, hardware is described using existing bindings to +maximize use of existing support code, but since property and node +names are simply text strings, it is easy to extend existing bindings +or create new ones by defining new nodes and properties. Be wary, +however, of creating a new binding without first doing some homework +about what already exists. There are currently two different, +incompatible, bindings for i2c busses that came about because the new +binding was created without first investigating how i2c devices were +already being enumerated in existing systems. + +1. History +---------- +The DT was originally created by Open Firmware as part of the +communication method for passing data from Open Firmware to a client +program (like to an operating system). An operating system used the +Device Tree to discover the topology of the hardware at runtime, and +thereby support a majority of available hardware without hard coded +information (assuming drivers were available for all devices). + +Since Open Firmware is commonly used on PowerPC and SPARC platforms, +the Linux support for those architectures has for a long time used the +Device Tree. + +In 2005, when PowerPC Linux began a major cleanup and to merge 32-bit +and 64-bit support, the decision was made to require DT support on all +powerpc platforms, regardless of whether or not they used Open +Firmware. To do this, a DT representation called the Flattened Device +Tree (FDT) was created which could be passed to the kernel as a binary +blob without requiring a real Open Firmware implementation. U-Boot, +kexec, and other bootloaders were modified to support both passing a +Device Tree Binary (dtb) and to modify a dtb at boot time. DT was +also added to the PowerPC boot wrapper (``arch/powerpc/boot/*``) so that +a dtb could be wrapped up with the kernel image to support booting +existing non-DT aware firmware. + +Some time later, FDT infrastructure was generalized to be usable by +all architectures. At the time of this writing, 6 mainlined +architectures (arm, microblaze, mips, powerpc, sparc, and x86) and 1 +out of mainline (nios) have some level of DT support. + +2. Data Model +------------- +If you haven't already read the Device Tree Usage\ [1]_ page, +then go read it now. It's okay, I'll wait.... + +2.1 High Level View +------------------- +The most important thing to understand is that the DT is simply a data +structure that describes the hardware. There is nothing magical about +it, and it doesn't magically make all hardware configuration problems +go away. What it does do is provide a language for decoupling the +hardware configuration from the board and device driver support in the +Linux kernel (or any other operating system for that matter). Using +it allows board and device support to become data driven; to make +setup decisions based on data passed into the kernel instead of on +per-machine hard coded selections. 
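As a hedged sketch of what "data driven" means in practice (of_machine_is_compatible(), of_find_node_by_path() and of_property_read_u32() are the real DT query helpers; the compatible string and node path below are invented for the example):

        #include <linux/kernel.h>
        #include <linux/of.h>

        static void __init board_setup_from_dt(void)
        {
                struct device_node *np;
                u32 freq;

                /* a setup decision keyed off DT data, not a hard coded board list */
                if (of_machine_is_compatible("acme,example-board"))
                        pr_info("applying example-board quirks\n");

                np = of_find_node_by_path("/soc/serial@70006300");
                if (np && !of_property_read_u32(np, "clock-frequency", &freq))
                        pr_info("uart input clock: %u Hz\n", freq);
                of_node_put(np);        /* NULL-safe */
        }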
+ +Ideally, data driven platform setup should result in less code +duplication and make it easier to support a wide range of hardware +with a single kernel image. + +Linux uses DT data for three major purposes: + +1) platform identification, +2) runtime configuration, and +3) device population. + +2.2 Platform Identification +--------------------------- +First and foremost, the kernel will use data in the DT to identify the +specific machine. In a perfect world, the specific platform shouldn't +matter to the kernel because all platform details would be described +perfectly by the device tree in a consistent and reliable manner. +Hardware is not perfect though, and so the kernel must identify the +machine during early boot so that it has the opportunity to run +machine-specific fixups. + +In the majority of cases, the machine identity is irrelevant, and the +kernel will instead select setup code based on the machine's core +CPU or SoC. On ARM for example, setup_arch() in +arch/arm/kernel/setup.c will call setup_machine_fdt() in +arch/arm/kernel/devtree.c which searches through the machine_desc +table and selects the machine_desc which best matches the device tree +data. It determines the best match by looking at the 'compatible' +property in the root device tree node, and comparing it with the +dt_compat list in struct machine_desc (which is defined in +arch/arm/include/asm/mach/arch.h if you're curious). + +The 'compatible' property contains a sorted list of strings starting +with the exact name of the machine, followed by an optional list of +boards it is compatible with sorted from most compatible to least. For +example, the root compatible properties for the TI BeagleBoard and its +successor, the BeagleBoard xM board might look like, respectively:: + + compatible = "ti,omap3-beagleboard", "ti,omap3450", "ti,omap3"; + compatible = "ti,omap3-beagleboard-xm", "ti,omap3450", "ti,omap3"; + +Where "ti,omap3-beagleboard-xm" specifies the exact model, it also +claims that it compatible with the OMAP 3450 SoC, and the omap3 family +of SoCs in general. You'll notice that the list is sorted from most +specific (exact board) to least specific (SoC family). + +Astute readers might point out that the Beagle xM could also claim +compatibility with the original Beagle board. However, one should be +cautioned about doing so at the board level since there is typically a +high level of change from one board to another, even within the same +product line, and it is hard to nail down exactly what is meant when one +board claims to be compatible with another. For the top level, it is +better to err on the side of caution and not claim one board is +compatible with another. The notable exception would be when one +board is a carrier for another, such as a CPU module attached to a +carrier board. + +One more note on compatible values. Any string used in a compatible +property must be documented as to what it indicates. Add +documentation for compatible strings in Documentation/devicetree/bindings. + +Again on ARM, for each machine_desc, the kernel looks to see if +any of the dt_compat list entries appear in the compatible property. +If one does, then that machine_desc is a candidate for driving the +machine. After searching the entire table of machine_descs, +setup_machine_fdt() returns the 'most compatible' machine_desc based +on which entry in the compatible property each machine_desc matches +against. If no matching machine_desc is found, then it returns NULL. 
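A minimal sketch of the ARM side of this matching, closely modelled on arch/arm/mach-omap2/board-generic.c (DT_MACHINE_START and the dt_compat field are the real mechanism; the init function body here is a stand-in):

        #include <asm/mach/arch.h>

        static void __init omap3_dt_init(void)
        {
                /* board specific setup would go here */
        }

        static const char *const omap3_dt_compat[] __initconst = {
                "ti,omap3",             /* matched against the root 'compatible' */
                NULL,
        };

        DT_MACHINE_START(OMAP3_DT, "Generic OMAP3 (Flattened Device Tree)")
                .dt_compat      = omap3_dt_compat,
                .init_machine   = omap3_dt_init,
        MACHINE_END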
+ +The reasoning behind this scheme is the observation that in the majority +of cases, a single machine_desc can support a large number of boards +if they all use the same SoC, or same family of SoCs. However, +invariably there will be some exceptions where a specific board will +require special setup code that is not useful in the generic case. +Special cases could be handled by explicitly checking for the +troublesome board(s) in generic setup code, but doing so very quickly +becomes ugly and/or unmaintainable if it is more than just a couple of +cases. + +Instead, the compatible list allows a generic machine_desc to provide +support for a wide common set of boards by specifying "less +compatible" values in the dt_compat list. In the example above, +generic board support can claim compatibility with "ti,omap3" or +"ti,omap3450". If a bug was discovered on the original beagleboard +that required special workaround code during early boot, then a new +machine_desc could be added which implements the workarounds and only +matches on "ti,omap3-beagleboard". + +PowerPC uses a slightly different scheme where it calls the .probe() +hook from each machine_desc, and the first one returning TRUE is used. +However, this approach does not take into account the priority of the +compatible list, and probably should be avoided for new architecture +support. + +2.3 Runtime configuration +------------------------- +In most cases, a DT will be the sole method of communicating data from +firmware to the kernel, so also gets used to pass in runtime and +configuration data like the kernel parameters string and the location +of an initrd image. + +Most of this data is contained in the /chosen node, and when booting +Linux it will look something like this:: + + chosen { + bootargs = "console=ttyS0,115200 loglevel=8"; + initrd-start = <0xc8000000>; + initrd-end = <0xc8200000>; + }; + +The bootargs property contains the kernel arguments, and the initrd-* +properties define the address and size of an initrd blob. Note that +initrd-end is the first address after the initrd image, so this doesn't +match the usual semantic of struct resource. The chosen node may also +optionally contain an arbitrary number of additional properties for +platform-specific configuration data. + +During early boot, the architecture setup code calls of_scan_flat_dt() +several times with different helper callbacks to parse device tree +data before paging is setup. The of_scan_flat_dt() code scans through +the device tree and uses the helpers to extract information required +during early boot. Typically the early_init_dt_scan_chosen() helper +is used to parse the chosen node including kernel parameters, +early_init_dt_scan_root() to initialize the DT address space model, +and early_init_dt_scan_memory() to determine the size and +location of usable RAM. + +On ARM, the function setup_machine_fdt() is responsible for early +scanning of the device tree after selecting the correct machine_desc +that supports the board. + +2.4 Device population +--------------------- +After the board has been identified, and after the early configuration data +has been parsed, then kernel initialization can proceed in the normal +way. At some point in this process, unflatten_device_tree() is called +to convert the data into a more efficient runtime representation. +This is also when machine-specific setup hooks will get called, like +the machine_desc .init_early(), .init_irq() and .init_machine() hooks +on ARM. 
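For the shape of those hooks, a hedged sketch (the myboard_* names and bodies are invented; irqchip_init() is the real DT-driven interrupt controller probe). Such functions would be wired into a machine_desc like the one sketched in section 2.2:

        #include <linux/irqchip.h>

        static void __init myboard_init_early(void)
        {
                /* runs early in boot; DT query functions are already usable here */
        }

        static void __init myboard_init_irq(void)
        {
                irqchip_init();         /* probe the interrupt controller(s) from DT */
        }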
The remainder of this section uses examples from the ARM +implementation, but all architectures will do pretty much the same +thing when using a DT. + +As can be guessed by the names, .init_early() is used for any machine- +specific setup that needs to be executed early in the boot process, +and .init_irq() is used to set up interrupt handling. Using a DT +doesn't materially change the behaviour of either of these functions. +If a DT is provided, then both .init_early() and .init_irq() are able +to call any of the DT query functions (of_* in include/linux/of*.h) to +get additional data about the platform. + +The most interesting hook in the DT context is .init_machine() which +is primarily responsible for populating the Linux device model with +data about the platform. Historically this has been implemented on +embedded platforms by defining a set of static clock structures, +platform_devices, and other data in the board support .c file, and +registering it en-masse in .init_machine(). When DT is used, then +instead of hard coding static devices for each platform, the list of +devices can be obtained by parsing the DT, and allocating device +structures dynamically. + +The simplest case is when .init_machine() is only responsible for +registering a block of platform_devices. A platform_device is a concept +used by Linux for memory or I/O mapped devices which cannot be detected +by hardware, and for 'composite' or 'virtual' devices (more on those +later). While there is no 'platform device' terminology for the DT, +platform devices roughly correspond to device nodes at the root of the +tree and children of simple memory mapped bus nodes. + +About now is a good time to lay out an example. Here is part of the +device tree for the NVIDIA Tegra board:: + + /{ + compatible = "nvidia,harmony", "nvidia,tegra20"; + #address-cells = <1>; + #size-cells = <1>; + interrupt-parent = <&intc>; + + chosen { }; + aliases { }; + + memory { + device_type = "memory"; + reg = <0x00000000 0x40000000>; + }; + + soc { + compatible = "nvidia,tegra20-soc", "simple-bus"; + #address-cells = <1>; + #size-cells = <1>; + ranges; + + intc: interrupt-controller@50041000 { + compatible = "nvidia,tegra20-gic"; + interrupt-controller; + #interrupt-cells = <1>; + reg = <0x50041000 0x1000>, < 0x50040100 0x0100 >; + }; + + serial@70006300 { + compatible = "nvidia,tegra20-uart"; + reg = <0x70006300 0x100>; + interrupts = <122>; + }; + + i2s1: i2s@70002800 { + compatible = "nvidia,tegra20-i2s"; + reg = <0x70002800 0x100>; + interrupts = <77>; + codec = <&wm8903>; + }; + + i2c@7000c000 { + compatible = "nvidia,tegra20-i2c"; + #address-cells = <1>; + #size-cells = <0>; + reg = <0x7000c000 0x100>; + interrupts = <70>; + + wm8903: codec@1a { + compatible = "wlf,wm8903"; + reg = <0x1a>; + interrupts = <347>; + }; + }; + }; + + sound { + compatible = "nvidia,harmony-sound"; + i2s-controller = <&i2s1>; + i2s-codec = <&wm8903>; + }; + }; + +At .init_machine() time, Tegra board support code will need to look at +this DT and decide which nodes to create platform_devices for. +However, looking at the tree, it is not immediately obvious what kind +of device each node represents, or even if a node represents a device +at all. The /chosen, /aliases, and /memory nodes are informational +nodes that don't describe devices (although arguably memory could be +considered a device). 
The children of the /soc node are memory mapped +devices, but the codec@1a is an i2c device, and the sound node +represents not a device, but rather how other devices are connected +together to create the audio subsystem. I know what each device is +because I'm familiar with the board design, but how does the kernel +know what to do with each node? + +The trick is that the kernel starts at the root of the tree and looks +for nodes that have a 'compatible' property. First, it is generally +assumed that any node with a 'compatible' property represents a device +of some kind, and second, it can be assumed that any node at the root +of the tree is either directly attached to the processor bus, or is a +miscellaneous system device that cannot be described any other way. +For each of these nodes, Linux allocates and registers a +platform_device, which in turn may get bound to a platform_driver. + +Why is using a platform_device for these nodes a safe assumption? +Well, for the way that Linux models devices, just about all bus_types +assume that its devices are children of a bus controller. For +example, each i2c_client is a child of an i2c_master. Each spi_device +is a child of an SPI bus. Similarly for USB, PCI, MDIO, etc. The +same hierarchy is also found in the DT, where I2C device nodes only +ever appear as children of an I2C bus node. Ditto for SPI, MDIO, USB, +etc. The only devices which do not require a specific type of parent +device are platform_devices (and amba_devices, but more on that +later), which will happily live at the base of the Linux /sys/devices +tree. Therefore, if a DT node is at the root of the tree, then it +really probably is best registered as a platform_device. + +Linux board support code calls of_platform_populate(NULL, NULL, NULL, NULL) +to kick off discovery of devices at the root of the tree. The +parameters are all NULL because when starting from the root of the +tree, there is no need to provide a starting node (the first NULL), a +parent struct device (the last NULL), and we're not using a match +table (yet). For a board that only needs to register devices, +.init_machine() can be completely empty except for the +of_platform_populate() call. + +In the Tegra example, this accounts for the /soc and /sound nodes, but +what about the children of the SoC node? Shouldn't they be registered +as platform devices too? For Linux DT support, the generic behaviour +is for child devices to be registered by the parent's device driver at +driver .probe() time. So, an i2c bus device driver will register a +i2c_client for each child node, an SPI bus driver will register +its spi_device children, and similarly for other bus_types. +According to that model, a driver could be written that binds to the +SoC node and simply registers platform_devices for each of its +children. The board support code would allocate and register an SoC +device, a (theoretical) SoC device driver could bind to the SoC device, +and register platform_devices for /soc/interrupt-controller, /soc/serial, +/soc/i2s, and /soc/i2c in its .probe() hook. Easy, right? + +Actually, it turns out that registering children of some +platform_devices as more platform_devices is a common pattern, and the +device tree support code reflects that and makes the above example +simpler. The second argument to of_platform_populate() is an +of_device_id table, and any node that matches an entry in that table +will also get its child nodes registered. 
In the Tegra case, the code +can look something like this:: + + static void __init harmony_init_machine(void) + { + /* ... */ + of_platform_populate(NULL, of_default_bus_match_table, NULL, NULL); + } + +"simple-bus" is defined in the Devicetree Specification as a property +meaning a simple memory mapped bus, so the of_platform_populate() code +could be written to just assume simple-bus compatible nodes will +always be traversed. However, we pass it in as an argument so that +board support code can always override the default behaviour. + +[Need to add discussion of adding i2c/spi/etc child devices] + +Appendix A: AMBA devices +------------------------ + +ARM Primecells are a certain kind of device attached to the ARM AMBA +bus which include some support for hardware detection and power +management. In Linux, struct amba_device and the amba_bus_type is +used to represent Primecell devices. However, the fiddly bit is that +not all devices on an AMBA bus are Primecells, and for Linux it is +typical for both amba_device and platform_device instances to be +siblings of the same bus segment. + +When using the DT, this creates problems for of_platform_populate() +because it must decide whether to register each node as either a +platform_device or an amba_device. This unfortunately complicates the +device creation model a little bit, but the solution turns out not to +be too invasive. If a node is compatible with "arm,amba-primecell", then +of_platform_populate() will register it as an amba_device instead of a +platform_device. diff --git a/Documentation/devicetree/usage-model.txt b/Documentation/devicetree/usage-model.txt deleted file mode 100644 index 33a8aaac02a8..000000000000 --- a/Documentation/devicetree/usage-model.txt +++ /dev/null @@ -1,415 +0,0 @@ -Linux and the Device Tree -------------------------- -The Linux usage model for device tree data - -Author: Grant Likely - -This article describes how Linux uses the device tree. An overview of -the device tree data format can be found on the device tree usage page -at devicetree.org[1]. - -[1] http://devicetree.org/Device_Tree_Usage - -The "Open Firmware Device Tree", or simply Device Tree (DT), is a data -structure and language for describing hardware. More specifically, it -is a description of hardware that is readable by an operating system -so that the operating system doesn't need to hard code details of the -machine. - -Structurally, the DT is a tree, or acyclic graph with named nodes, and -nodes may have an arbitrary number of named properties encapsulating -arbitrary data. A mechanism also exists to create arbitrary -links from one node to another outside of the natural tree structure. - -Conceptually, a common set of usage conventions, called 'bindings', -is defined for how data should appear in the tree to describe typical -hardware characteristics including data busses, interrupt lines, GPIO -connections, and peripheral devices. - -As much as possible, hardware is described using existing bindings to -maximize use of existing support code, but since property and node -names are simply text strings, it is easy to extend existing bindings -or create new ones by defining new nodes and properties. Be wary, -however, of creating a new binding without first doing some homework -about what already exists. There are currently two different, -incompatible, bindings for i2c busses that came about because the new -binding was created without first investigating how i2c devices were -already being enumerated in existing systems. - -1. 
History ----------- -The DT was originally created by Open Firmware as part of the -communication method for passing data from Open Firmware to a client -program (like to an operating system). An operating system used the -Device Tree to discover the topology of the hardware at runtime, and -thereby support a majority of available hardware without hard coded -information (assuming drivers were available for all devices). - -Since Open Firmware is commonly used on PowerPC and SPARC platforms, -the Linux support for those architectures has for a long time used the -Device Tree. - -In 2005, when PowerPC Linux began a major cleanup and to merge 32-bit -and 64-bit support, the decision was made to require DT support on all -powerpc platforms, regardless of whether or not they used Open -Firmware. To do this, a DT representation called the Flattened Device -Tree (FDT) was created which could be passed to the kernel as a binary -blob without requiring a real Open Firmware implementation. U-Boot, -kexec, and other bootloaders were modified to support both passing a -Device Tree Binary (dtb) and to modify a dtb at boot time. DT was -also added to the PowerPC boot wrapper (arch/powerpc/boot/*) so that -a dtb could be wrapped up with the kernel image to support booting -existing non-DT aware firmware. - -Some time later, FDT infrastructure was generalized to be usable by -all architectures. At the time of this writing, 6 mainlined -architectures (arm, microblaze, mips, powerpc, sparc, and x86) and 1 -out of mainline (nios) have some level of DT support. - -2. Data Model -------------- -If you haven't already read the Device Tree Usage[1] page, -then go read it now. It's okay, I'll wait.... - -2.1 High Level View -------------------- -The most important thing to understand is that the DT is simply a data -structure that describes the hardware. There is nothing magical about -it, and it doesn't magically make all hardware configuration problems -go away. What it does do is provide a language for decoupling the -hardware configuration from the board and device driver support in the -Linux kernel (or any other operating system for that matter). Using -it allows board and device support to become data driven; to make -setup decisions based on data passed into the kernel instead of on -per-machine hard coded selections. - -Ideally, data driven platform setup should result in less code -duplication and make it easier to support a wide range of hardware -with a single kernel image. - -Linux uses DT data for three major purposes: -1) platform identification, -2) runtime configuration, and -3) device population. - -2.2 Platform Identification ---------------------------- -First and foremost, the kernel will use data in the DT to identify the -specific machine. In a perfect world, the specific platform shouldn't -matter to the kernel because all platform details would be described -perfectly by the device tree in a consistent and reliable manner. -Hardware is not perfect though, and so the kernel must identify the -machine during early boot so that it has the opportunity to run -machine-specific fixups. - -In the majority of cases, the machine identity is irrelevant, and the -kernel will instead select setup code based on the machine's core -CPU or SoC. On ARM for example, setup_arch() in -arch/arm/kernel/setup.c will call setup_machine_fdt() in -arch/arm/kernel/devtree.c which searches through the machine_desc -table and selects the machine_desc which best matches the device tree -data. 
It determines the best match by looking at the 'compatible' -property in the root device tree node, and comparing it with the -dt_compat list in struct machine_desc (which is defined in -arch/arm/include/asm/mach/arch.h if you're curious). - -The 'compatible' property contains a sorted list of strings starting -with the exact name of the machine, followed by an optional list of -boards it is compatible with sorted from most compatible to least. For -example, the root compatible properties for the TI BeagleBoard and its -successor, the BeagleBoard xM board might look like, respectively: - - compatible = "ti,omap3-beagleboard", "ti,omap3450", "ti,omap3"; - compatible = "ti,omap3-beagleboard-xm", "ti,omap3450", "ti,omap3"; - -Where "ti,omap3-beagleboard-xm" specifies the exact model, it also -claims that it compatible with the OMAP 3450 SoC, and the omap3 family -of SoCs in general. You'll notice that the list is sorted from most -specific (exact board) to least specific (SoC family). - -Astute readers might point out that the Beagle xM could also claim -compatibility with the original Beagle board. However, one should be -cautioned about doing so at the board level since there is typically a -high level of change from one board to another, even within the same -product line, and it is hard to nail down exactly what is meant when one -board claims to be compatible with another. For the top level, it is -better to err on the side of caution and not claim one board is -compatible with another. The notable exception would be when one -board is a carrier for another, such as a CPU module attached to a -carrier board. - -One more note on compatible values. Any string used in a compatible -property must be documented as to what it indicates. Add -documentation for compatible strings in Documentation/devicetree/bindings. - -Again on ARM, for each machine_desc, the kernel looks to see if -any of the dt_compat list entries appear in the compatible property. -If one does, then that machine_desc is a candidate for driving the -machine. After searching the entire table of machine_descs, -setup_machine_fdt() returns the 'most compatible' machine_desc based -on which entry in the compatible property each machine_desc matches -against. If no matching machine_desc is found, then it returns NULL. - -The reasoning behind this scheme is the observation that in the majority -of cases, a single machine_desc can support a large number of boards -if they all use the same SoC, or same family of SoCs. However, -invariably there will be some exceptions where a specific board will -require special setup code that is not useful in the generic case. -Special cases could be handled by explicitly checking for the -troublesome board(s) in generic setup code, but doing so very quickly -becomes ugly and/or unmaintainable if it is more than just a couple of -cases. - -Instead, the compatible list allows a generic machine_desc to provide -support for a wide common set of boards by specifying "less -compatible" values in the dt_compat list. In the example above, -generic board support can claim compatibility with "ti,omap3" or -"ti,omap3450". If a bug was discovered on the original beagleboard -that required special workaround code during early boot, then a new -machine_desc could be added which implements the workarounds and only -matches on "ti,omap3-beagleboard". - -PowerPC uses a slightly different scheme where it calls the .probe() -hook from each machine_desc, and the first one returning TRUE is used. 
-However, this approach does not take into account the priority of the -compatible list, and probably should be avoided for new architecture -support. - -2.3 Runtime configuration -------------------------- -In most cases, a DT will be the sole method of communicating data from -firmware to the kernel, so also gets used to pass in runtime and -configuration data like the kernel parameters string and the location -of an initrd image. - -Most of this data is contained in the /chosen node, and when booting -Linux it will look something like this: - - chosen { - bootargs = "console=ttyS0,115200 loglevel=8"; - initrd-start = <0xc8000000>; - initrd-end = <0xc8200000>; - }; - -The bootargs property contains the kernel arguments, and the initrd-* -properties define the address and size of an initrd blob. Note that -initrd-end is the first address after the initrd image, so this doesn't -match the usual semantic of struct resource. The chosen node may also -optionally contain an arbitrary number of additional properties for -platform-specific configuration data. - -During early boot, the architecture setup code calls of_scan_flat_dt() -several times with different helper callbacks to parse device tree -data before paging is setup. The of_scan_flat_dt() code scans through -the device tree and uses the helpers to extract information required -during early boot. Typically the early_init_dt_scan_chosen() helper -is used to parse the chosen node including kernel parameters, -early_init_dt_scan_root() to initialize the DT address space model, -and early_init_dt_scan_memory() to determine the size and -location of usable RAM. - -On ARM, the function setup_machine_fdt() is responsible for early -scanning of the device tree after selecting the correct machine_desc -that supports the board. - -2.4 Device population ---------------------- -After the board has been identified, and after the early configuration data -has been parsed, then kernel initialization can proceed in the normal -way. At some point in this process, unflatten_device_tree() is called -to convert the data into a more efficient runtime representation. -This is also when machine-specific setup hooks will get called, like -the machine_desc .init_early(), .init_irq() and .init_machine() hooks -on ARM. The remainder of this section uses examples from the ARM -implementation, but all architectures will do pretty much the same -thing when using a DT. - -As can be guessed by the names, .init_early() is used for any machine- -specific setup that needs to be executed early in the boot process, -and .init_irq() is used to set up interrupt handling. Using a DT -doesn't materially change the behaviour of either of these functions. -If a DT is provided, then both .init_early() and .init_irq() are able -to call any of the DT query functions (of_* in include/linux/of*.h) to -get additional data about the platform. - -The most interesting hook in the DT context is .init_machine() which -is primarily responsible for populating the Linux device model with -data about the platform. Historically this has been implemented on -embedded platforms by defining a set of static clock structures, -platform_devices, and other data in the board support .c file, and -registering it en-masse in .init_machine(). When DT is used, then -instead of hard coding static devices for each platform, the list of -devices can be obtained by parsing the DT, and allocating device -structures dynamically. 
- -The simplest case is when .init_machine() is only responsible for -registering a block of platform_devices. A platform_device is a concept -used by Linux for memory or I/O mapped devices which cannot be detected -by hardware, and for 'composite' or 'virtual' devices (more on those -later). While there is no 'platform device' terminology for the DT, -platform devices roughly correspond to device nodes at the root of the -tree and children of simple memory mapped bus nodes. - -About now is a good time to lay out an example. Here is part of the -device tree for the NVIDIA Tegra board. - -/{ - compatible = "nvidia,harmony", "nvidia,tegra20"; - #address-cells = <1>; - #size-cells = <1>; - interrupt-parent = <&intc>; - - chosen { }; - aliases { }; - - memory { - device_type = "memory"; - reg = <0x00000000 0x40000000>; - }; - - soc { - compatible = "nvidia,tegra20-soc", "simple-bus"; - #address-cells = <1>; - #size-cells = <1>; - ranges; - - intc: interrupt-controller@50041000 { - compatible = "nvidia,tegra20-gic"; - interrupt-controller; - #interrupt-cells = <1>; - reg = <0x50041000 0x1000>, < 0x50040100 0x0100 >; - }; - - serial@70006300 { - compatible = "nvidia,tegra20-uart"; - reg = <0x70006300 0x100>; - interrupts = <122>; - }; - - i2s1: i2s@70002800 { - compatible = "nvidia,tegra20-i2s"; - reg = <0x70002800 0x100>; - interrupts = <77>; - codec = <&wm8903>; - }; - - i2c@7000c000 { - compatible = "nvidia,tegra20-i2c"; - #address-cells = <1>; - #size-cells = <0>; - reg = <0x7000c000 0x100>; - interrupts = <70>; - - wm8903: codec@1a { - compatible = "wlf,wm8903"; - reg = <0x1a>; - interrupts = <347>; - }; - }; - }; - - sound { - compatible = "nvidia,harmony-sound"; - i2s-controller = <&i2s1>; - i2s-codec = <&wm8903>; - }; -}; - -At .init_machine() time, Tegra board support code will need to look at -this DT and decide which nodes to create platform_devices for. -However, looking at the tree, it is not immediately obvious what kind -of device each node represents, or even if a node represents a device -at all. The /chosen, /aliases, and /memory nodes are informational -nodes that don't describe devices (although arguably memory could be -considered a device). The children of the /soc node are memory mapped -devices, but the codec@1a is an i2c device, and the sound node -represents not a device, but rather how other devices are connected -together to create the audio subsystem. I know what each device is -because I'm familiar with the board design, but how does the kernel -know what to do with each node? - -The trick is that the kernel starts at the root of the tree and looks -for nodes that have a 'compatible' property. First, it is generally -assumed that any node with a 'compatible' property represents a device -of some kind, and second, it can be assumed that any node at the root -of the tree is either directly attached to the processor bus, or is a -miscellaneous system device that cannot be described any other way. -For each of these nodes, Linux allocates and registers a -platform_device, which in turn may get bound to a platform_driver. - -Why is using a platform_device for these nodes a safe assumption? -Well, for the way that Linux models devices, just about all bus_types -assume that its devices are children of a bus controller. For -example, each i2c_client is a child of an i2c_master. Each spi_device -is a child of an SPI bus. Similarly for USB, PCI, MDIO, etc. The -same hierarchy is also found in the DT, where I2C device nodes only -ever appear as children of an I2C bus node. 
Ditto for SPI, MDIO, USB, -etc. The only devices which do not require a specific type of parent -device are platform_devices (and amba_devices, but more on that -later), which will happily live at the base of the Linux /sys/devices -tree. Therefore, if a DT node is at the root of the tree, then it -really probably is best registered as a platform_device. - -Linux board support code calls of_platform_populate(NULL, NULL, NULL, NULL) -to kick off discovery of devices at the root of the tree. The -parameters are all NULL because when starting from the root of the -tree, there is no need to provide a starting node (the first NULL), a -parent struct device (the last NULL), and we're not using a match -table (yet). For a board that only needs to register devices, -.init_machine() can be completely empty except for the -of_platform_populate() call. - -In the Tegra example, this accounts for the /soc and /sound nodes, but -what about the children of the SoC node? Shouldn't they be registered -as platform devices too? For Linux DT support, the generic behaviour -is for child devices to be registered by the parent's device driver at -driver .probe() time. So, an i2c bus device driver will register a -i2c_client for each child node, an SPI bus driver will register -its spi_device children, and similarly for other bus_types. -According to that model, a driver could be written that binds to the -SoC node and simply registers platform_devices for each of its -children. The board support code would allocate and register an SoC -device, a (theoretical) SoC device driver could bind to the SoC device, -and register platform_devices for /soc/interrupt-controller, /soc/serial, -/soc/i2s, and /soc/i2c in its .probe() hook. Easy, right? - -Actually, it turns out that registering children of some -platform_devices as more platform_devices is a common pattern, and the -device tree support code reflects that and makes the above example -simpler. The second argument to of_platform_populate() is an -of_device_id table, and any node that matches an entry in that table -will also get its child nodes registered. In the Tegra case, the code -can look something like this: - -static void __init harmony_init_machine(void) -{ - /* ... */ - of_platform_populate(NULL, of_default_bus_match_table, NULL, NULL); -} - -"simple-bus" is defined in the Devicetree Specification as a property -meaning a simple memory mapped bus, so the of_platform_populate() code -could be written to just assume simple-bus compatible nodes will -always be traversed. However, we pass it in as an argument so that -board support code can always override the default behaviour. - -[Need to add discussion of adding i2c/spi/etc child devices] - -Appendix A: AMBA devices ------------------------- - -ARM Primecells are a certain kind of device attached to the ARM AMBA -bus which include some support for hardware detection and power -management. In Linux, struct amba_device and the amba_bus_type is -used to represent Primecell devices. However, the fiddly bit is that -not all devices on an AMBA bus are Primecells, and for Linux it is -typical for both amba_device and platform_device instances to be -siblings of the same bus segment. - -When using the DT, this creates problems for of_platform_populate() -because it must decide whether to register each node as either a -platform_device or an amba_device. This unfortunately complicates the -device creation model a little bit, but the solution turns out not to -be too invasive. 
If a node is compatible with "arm,amba-primecell", then -of_platform_populate() will register it as an amba_device instead of a -platform_device. diff --git a/include/linux/mfd/core.h b/include/linux/mfd/core.h index d01d1299e49d..21718c8b2b48 100644 --- a/include/linux/mfd/core.h +++ b/include/linux/mfd/core.h @@ -74,7 +74,7 @@ struct mfd_cell { /* * Device Tree compatible string - * See: Documentation/devicetree/usage-model.txt Chapter 2.2 for details + * See: Documentation/devicetree/usage-model.rst Chapter 2.2 for details */ const char *of_compatible; -- cgit v1.2.3 From 17309a6a43561bd7f4d4b51d7987225eb2b13d05 Mon Sep 17 00:00:00 2001 From: Tao Ren Date: Sun, 15 Mar 2020 12:16:28 -0700 Subject: usb: gadget: add "usb_validate_langid" function The USB LANGID validation code in "check_user_usb_string" function is moved to "usb_validate_langid" function which can be used by other usb gadget drivers. Signed-off-by: Tao Ren Signed-off-by: Felipe Balbi --- drivers/usb/gadget/configfs.c | 14 +------------- drivers/usb/gadget/usbstring.c | 24 ++++++++++++++++++++++++ include/linux/usb/gadget.h | 3 +++ 3 files changed, 28 insertions(+), 13 deletions(-) (limited to 'include/linux') diff --git a/drivers/usb/gadget/configfs.c b/drivers/usb/gadget/configfs.c index 32b637e3e1fa..822ef0470c5f 100644 --- a/drivers/usb/gadget/configfs.c +++ b/drivers/usb/gadget/configfs.c @@ -13,8 +13,6 @@ int check_user_usb_string(const char *name, struct usb_gadget_strings *stringtab_dev) { - unsigned primary_lang; - unsigned sub_lang; u16 num; int ret; @@ -22,17 +20,7 @@ int check_user_usb_string(const char *name, if (ret) return ret; - primary_lang = num & 0x3ff; - sub_lang = num >> 10; - - /* simple sanity check for valid langid */ - switch (primary_lang) { - case 0: - case 0x62 ... 0xfe: - case 0x100 ... 0x3ff: - return -EINVAL; - } - if (!sub_lang) + if (!usb_validate_langid(num)) return -EINVAL; stringtab_dev->language = num; diff --git a/drivers/usb/gadget/usbstring.c b/drivers/usb/gadget/usbstring.c index 7c24d1ce1088..58a4d3325090 100644 --- a/drivers/usb/gadget/usbstring.c +++ b/drivers/usb/gadget/usbstring.c @@ -65,3 +65,27 @@ usb_gadget_get_string (const struct usb_gadget_strings *table, int id, u8 *buf) return buf [0]; } EXPORT_SYMBOL_GPL(usb_gadget_get_string); + +/** + * usb_validate_langid - validate usb language identifiers + * @lang: usb language identifier + * + * Returns true for valid language identifier, otherwise false. + */ +bool usb_validate_langid(u16 langid) +{ + u16 primary_lang = langid & 0x3ff; /* bit [9:0] */ + u16 sub_lang = langid >> 10; /* bit [15:10] */ + + switch (primary_lang) { + case 0: + case 0x62 ... 0xfe: + case 0x100 ... 
0x3ff: + return false; + } + if (!sub_lang) + return false; + + return true; +} +EXPORT_SYMBOL_GPL(usb_validate_langid); diff --git a/include/linux/usb/gadget.h b/include/linux/usb/gadget.h index 9411c08a5c7e..e959c09a97c9 100644 --- a/include/linux/usb/gadget.h +++ b/include/linux/usb/gadget.h @@ -773,6 +773,9 @@ struct usb_gadget_string_container { /* put descriptor for string with that id into buf (buflen >= 256) */ int usb_gadget_get_string(const struct usb_gadget_strings *table, int id, u8 *buf); +/* check if the given language identifier is valid */ +bool usb_validate_langid(u16 langid); + /*-------------------------------------------------------------------------*/ /* utility to simplify managing config descriptors */ -- cgit v1.2.3 From 4cbf38511a007867def958872203ae8adb8e2351 Mon Sep 17 00:00:00 2001 From: Sai Praneeth Prakhya Date: Wed, 29 Apr 2020 15:36:40 +0200 Subject: iommu: Add def_domain_type() callback in iommu_ops Some devices are required to use a specific type (identity or dma) of default domain when they are used with a vendor iommu. When the system level default domain type is different from it, the vendor iommu driver has to request a new default domain with iommu_request_dma_domain_for_dev() and iommu_request_dm_for_dev() in the add_dev() callback. Unfortunately, these two helpers only work when the group hasn't been assigned to any other devices, hence, some vendor iommu drivers have to use a private domain if they fail to request a new default one. This adds a def_domain_type() callback in iommu_ops, so that any special default domain requirement for a device can be made known to the generic iommu layer. Signed-off-by: Sai Praneeth Prakhya Signed-off-by: Lu Baolu [ jroedel@suse.de: Added iommu_get_def_domain_type() function and use it to allocate the default domain ] Co-developed-by: Joerg Roedel Signed-off-by: Joerg Roedel Tested-by: Marek Szyprowski Acked-by: Marek Szyprowski Link: https://lore.kernel.org/r/20200429133712.31431-3-joro@8bytes.org Signed-off-by: Joerg Roedel --- drivers/iommu/iommu.c | 20 +++++++++++++++++--- include/linux/iommu.h | 6 ++++++ 2 files changed, 23 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/drivers/iommu/iommu.c b/drivers/iommu/iommu.c index bfe011760ed1..5877abd9b693 100644 --- a/drivers/iommu/iommu.c +++ b/drivers/iommu/iommu.c @@ -1361,21 +1361,35 @@ struct iommu_group *fsl_mc_device_group(struct device *dev) } EXPORT_SYMBOL_GPL(fsl_mc_device_group); +static int iommu_get_def_domain_type(struct device *dev) +{ + const struct iommu_ops *ops = dev->bus->iommu_ops; + unsigned int type = 0; + + if (ops->def_domain_type) + type = ops->def_domain_type(dev); + + return (type == 0) ?
iommu_def_domain_type : type; +} + static int iommu_alloc_default_domain(struct device *dev, struct iommu_group *group) { struct iommu_domain *dom; + unsigned int type; if (group->default_domain) return 0; - dom = __iommu_domain_alloc(dev->bus, iommu_def_domain_type); - if (!dom && iommu_def_domain_type != IOMMU_DOMAIN_DMA) { + type = iommu_get_def_domain_type(dev); + + dom = __iommu_domain_alloc(dev->bus, type); + if (!dom && type != IOMMU_DOMAIN_DMA) { dom = __iommu_domain_alloc(dev->bus, IOMMU_DOMAIN_DMA); if (dom) { dev_warn(dev, "failed to allocate default IOMMU domain of type %u; falling back to IOMMU_DOMAIN_DMA", - iommu_def_domain_type); + type); } } diff --git a/include/linux/iommu.h b/include/linux/iommu.h index 7ef8b0bda695..1f027b07e499 100644 --- a/include/linux/iommu.h +++ b/include/linux/iommu.h @@ -248,6 +248,10 @@ struct iommu_iotlb_gather { * @cache_invalidate: invalidate translation caches * @sva_bind_gpasid: bind guest pasid and mm * @sva_unbind_gpasid: unbind guest pasid and mm + * @def_domain_type: device default domain type, return value: + * - IOMMU_DOMAIN_IDENTITY: must use an identity domain + * - IOMMU_DOMAIN_DMA: must use a dma domain + * - 0: use the default setting * @pgsize_bitmap: bitmap of all possible supported page sizes * @owner: Driver module providing these ops */ @@ -318,6 +322,8 @@ struct iommu_ops { int (*sva_unbind_gpasid)(struct device *dev, int pasid); + int (*def_domain_type)(struct device *dev); + unsigned long pgsize_bitmap; struct module *owner; }; -- cgit v1.2.3 From a6a4c7e2c5b8b981d1c546a393ff21f2112468c3 Mon Sep 17 00:00:00 2001 From: Joerg Roedel Date: Wed, 29 Apr 2020 15:36:45 +0200 Subject: iommu: Add probe_device() and release_device() call-backs Add call-backs to 'struct iommu_ops' as an alternative to the add_device() and remove_device() call-backs, which will be removed when all drivers are converted. The new call-backs will not setup IOMMU groups and domains anymore, so also add a probe_finalize() call-back where the IOMMU driver can do per-device setup work which require the device to be set up with a group and a domain. 
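A hedged sketch of how a converted driver might implement the new call-backs (all mydrv_* names are invented for illustration; only the iommu_ops members come from this patch):

        #include <linux/iommu.h>

        struct mydrv_iommu {
                struct iommu_device iommu;      /* registered via iommu_device_register() */
                /* driver private state ... */
        };

        /* invented helper: find the IOMMU instance that translates @dev */
        static struct mydrv_iommu *mydrv_iommu_for_dev(struct device *dev)
        {
                return NULL;    /* placeholder; a real driver matches fwspec/HW here */
        }

        static struct iommu_device *mydrv_probe_device(struct device *dev)
        {
                struct mydrv_iommu *mi = mydrv_iommu_for_dev(dev);

                if (!mi)
                        return ERR_PTR(-ENODEV);        /* dev not translated by this IOMMU */

                return &mi->iommu;
        }

        static void mydrv_release_device(struct device *dev)
        {
                /* undo any per-device state created in mydrv_probe_device() */
        }

        static void mydrv_probe_finalize(struct device *dev)
        {
                /* dev now has a group and a default domain; e.g. set up DMA ops */
        }

        static const struct iommu_ops mydrv_ops = {
                .probe_device   = mydrv_probe_device,
                .release_device = mydrv_release_device,
                .probe_finalize = mydrv_probe_finalize,
                /* remaining call-backs omitted */
        };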
Signed-off-by: Joerg Roedel Tested-by: Marek Szyprowski Acked-by: Marek Szyprowski Link: https://lore.kernel.org/r/20200429133712.31431-8-joro@8bytes.org Signed-off-by: Joerg Roedel --- drivers/iommu/iommu.c | 63 ++++++++++++++++++++++++++++++++++++++++++++++----- include/linux/iommu.h | 9 ++++++++ 2 files changed, 66 insertions(+), 6 deletions(-) (limited to 'include/linux') diff --git a/drivers/iommu/iommu.c b/drivers/iommu/iommu.c index 5877abd9b693..6cfe7799dc8c 100644 --- a/drivers/iommu/iommu.c +++ b/drivers/iommu/iommu.c @@ -174,6 +174,36 @@ static void dev_iommu_free(struct device *dev) dev->iommu = NULL; } +static int __iommu_probe_device(struct device *dev) +{ + const struct iommu_ops *ops = dev->bus->iommu_ops; + struct iommu_device *iommu_dev; + struct iommu_group *group; + int ret; + + iommu_dev = ops->probe_device(dev); + if (IS_ERR(iommu_dev)) + return PTR_ERR(iommu_dev); + + dev->iommu->iommu_dev = iommu_dev; + + group = iommu_group_get_for_dev(dev); + if (IS_ERR(group)) { + ret = PTR_ERR(group); + goto out_release; + } + iommu_group_put(group); + + iommu_device_link(iommu_dev, dev); + + return 0; + +out_release: + ops->release_device(dev); + + return ret; +} + int iommu_probe_device(struct device *dev) { const struct iommu_ops *ops = dev->bus->iommu_ops; @@ -191,10 +221,17 @@ int iommu_probe_device(struct device *dev) goto err_free_dev_param; } - ret = ops->add_device(dev); + if (ops->probe_device) + ret = __iommu_probe_device(dev); + else + ret = ops->add_device(dev); + if (ret) goto err_module_put; + if (ops->probe_finalize) + ops->probe_finalize(dev); + return 0; err_module_put: @@ -204,17 +241,31 @@ err_free_dev_param: return ret; } +static void __iommu_release_device(struct device *dev) +{ + const struct iommu_ops *ops = dev->bus->iommu_ops; + + iommu_device_unlink(dev->iommu->iommu_dev, dev); + + iommu_group_remove_device(dev); + + ops->release_device(dev); +} + void iommu_release_device(struct device *dev) { const struct iommu_ops *ops = dev->bus->iommu_ops; - if (dev->iommu_group) + if (!dev->iommu) + return; + + if (ops->release_device) + __iommu_release_device(dev); + else if (dev->iommu_group) ops->remove_device(dev); - if (dev->iommu) { - module_put(ops->owner); - dev_iommu_free(dev); - } + module_put(ops->owner); + dev_iommu_free(dev); } static struct iommu_domain *__iommu_domain_alloc(struct bus_type *bus, diff --git a/include/linux/iommu.h b/include/linux/iommu.h index 1f027b07e499..30170d191e5e 100644 --- a/include/linux/iommu.h +++ b/include/linux/iommu.h @@ -225,6 +225,10 @@ struct iommu_iotlb_gather { * @iova_to_phys: translate iova to physical address * @add_device: add device to iommu grouping * @remove_device: remove device from iommu grouping + * @probe_device: Add device to iommu driver handling + * @release_device: Remove device from iommu driver handling + * @probe_finalize: Do final setup work after the device is added to an IOMMU + * group and attached to the group's domain * @device_group: find iommu group for a particular device * @domain_get_attr: Query domain attributes * @domain_set_attr: Change domain attributes @@ -275,6 +279,9 @@ struct iommu_ops { phys_addr_t (*iova_to_phys)(struct iommu_domain *domain, dma_addr_t iova); int (*add_device)(struct device *dev); void (*remove_device)(struct device *dev); + struct iommu_device *(*probe_device)(struct device *dev); + void (*release_device)(struct device *dev); + void (*probe_finalize)(struct device *dev); struct iommu_group *(*device_group)(struct device *dev); int (*domain_get_attr)(struct 
iommu_domain *domain, enum iommu_attr attr, void *data); @@ -375,6 +382,7 @@ struct iommu_fault_param { * * @fault_param: IOMMU detected device fault reporting data * @fwspec: IOMMU fwspec data + * @iommu_dev: IOMMU device this device is linked to * @priv: IOMMU Driver private data * * TODO: migrate other per device data pointers under iommu_dev_data, e.g. @@ -384,6 +392,7 @@ struct dev_iommu { struct mutex lock; struct iommu_fault_param *fault_param; struct iommu_fwspec *fwspec; + struct iommu_device *iommu_dev; void *priv; }; -- cgit v1.2.3 From 5012c3968537e2ffecbdb2eba3479bf9fb9e5597 Mon Sep 17 00:00:00 2001 From: Joerg Roedel Date: Wed, 29 Apr 2020 15:36:51 +0200 Subject: iommu: Export bus_iommu_probe() and make it safe for re-probing Add a check to the bus_iommu_probe() call-path to make sure it ignores devices which have already been successfully probed. Then export the bus_iommu_probe() function so it can be used by IOMMU drivers. Signed-off-by: Joerg Roedel Tested-by: Marek Szyprowski Acked-by: Marek Szyprowski Link: https://lore.kernel.org/r/20200429133712.31431-14-joro@8bytes.org Signed-off-by: Joerg Roedel --- drivers/iommu/iommu.c | 10 +++++++++- include/linux/iommu.h | 1 + 2 files changed, 10 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/drivers/iommu/iommu.c b/drivers/iommu/iommu.c index 834a45da0ed0..397fd4fd0c32 100644 --- a/drivers/iommu/iommu.c +++ b/drivers/iommu/iommu.c @@ -1610,11 +1610,19 @@ static int probe_iommu_group(struct device *dev, void *data) { const struct iommu_ops *ops = dev->bus->iommu_ops; struct list_head *group_list = data; + struct iommu_group *group; int ret; if (!dev_iommu_get(dev)) return -ENOMEM; + /* Device is probed already if in a group */ + group = iommu_group_get(dev); + if (group) { + iommu_group_put(group); + return 0; + } + if (!try_module_get(ops->owner)) { ret = -EINVAL; goto err_free_dev_iommu; @@ -1783,7 +1791,7 @@ static int iommu_group_create_direct_mappings(struct iommu_group *group) iommu_do_create_direct_mappings); } -static int bus_iommu_probe(struct bus_type *bus) +int bus_iommu_probe(struct bus_type *bus) { const struct iommu_ops *ops = bus->iommu_ops; int ret; diff --git a/include/linux/iommu.h b/include/linux/iommu.h index 30170d191e5e..fea1622408ad 100644 --- a/include/linux/iommu.h +++ b/include/linux/iommu.h @@ -445,6 +445,7 @@ static inline void iommu_iotlb_gather_init(struct iommu_iotlb_gather *gather) #define IOMMU_GROUP_NOTIFY_UNBOUND_DRIVER 6 /* Post Driver unbind */ extern int bus_set_iommu(struct bus_type *bus, const struct iommu_ops *ops); +extern int bus_iommu_probe(struct bus_type *bus); extern bool iommu_present(struct bus_type *bus); extern bool iommu_capable(struct bus_type *bus, enum iommu_cap cap); extern struct iommu_domain *iommu_domain_alloc(struct bus_type *bus); -- cgit v1.2.3 From 3eeeb45c6d0444b368cdeba9bdafa8bbcf5370d1 Mon Sep 17 00:00:00 2001 From: Joerg Roedel Date: Wed, 29 Apr 2020 15:37:10 +0200 Subject: iommu: Remove add_device()/remove_device() code-paths All drivers are converted to use the probe/release_device() call-backs, so the add_device/remove_device() pointers are unused and the code using them can be removed. 
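With the legacy path gone, everything funnels through __iommu_probe_device(). Combined with the bus_iommu_probe() export above, an IOMMU driver whose hardware comes up late can then safely (re-)scan its bus. A minimal sketch, assuming a my_iommu_ops definition like the one sketched earlier:

	static int __init my_iommu_init(void)
	{
		int ret;

		ret = bus_set_iommu(&pci_bus_type, &my_iommu_ops);
		if (ret)
			return ret;

		/*
		 * Safe to repeat: probe_iommu_group() skips any device that
		 * already belongs to an IOMMU group.
		 */
		return bus_iommu_probe(&pci_bus_type);
	}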
Signed-off-by: Joerg Roedel Tested-by: Marek Szyprowski Acked-by: Marek Szyprowski Link: https://lore.kernel.org/r/20200429133712.31431-33-joro@8bytes.org Signed-off-by: Joerg Roedel --- drivers/iommu/iommu.c | 158 ++++++++++++-------------------------------------- include/linux/iommu.h | 4 -- 2 files changed, 38 insertions(+), 124 deletions(-) (limited to 'include/linux') diff --git a/drivers/iommu/iommu.c b/drivers/iommu/iommu.c index 397fd4fd0c32..7f99e5ae432c 100644 --- a/drivers/iommu/iommu.c +++ b/drivers/iommu/iommu.c @@ -220,12 +220,20 @@ out_release: return ret; } -static int __iommu_probe_device_helper(struct device *dev) +int iommu_probe_device(struct device *dev) { const struct iommu_ops *ops = dev->bus->iommu_ops; struct iommu_group *group; int ret; + if (!dev_iommu_get(dev)) + return -ENOMEM; + + if (!try_module_get(ops->owner)) { + ret = -EINVAL; + goto err_out; + } + ret = __iommu_probe_device(dev, NULL); if (ret) goto err_out; @@ -259,75 +267,23 @@ static int __iommu_probe_device_helper(struct device *dev) err_release: iommu_release_device(dev); + err_out: return ret; } -int iommu_probe_device(struct device *dev) +void iommu_release_device(struct device *dev) { const struct iommu_ops *ops = dev->bus->iommu_ops; - struct iommu_group *group; - int ret; - - WARN_ON(dev->iommu_group); - - if (!ops) - return -EINVAL; - - if (!dev_iommu_get(dev)) - return -ENOMEM; - - if (!try_module_get(ops->owner)) { - ret = -EINVAL; - goto err_free_dev_param; - } - - if (ops->probe_device) - return __iommu_probe_device_helper(dev); - - ret = ops->add_device(dev); - if (ret) - goto err_module_put; - group = iommu_group_get(dev); - iommu_create_device_direct_mappings(group, dev); - iommu_group_put(group); - - if (ops->probe_finalize) - ops->probe_finalize(dev); - - return 0; - -err_module_put: - module_put(ops->owner); -err_free_dev_param: - dev_iommu_free(dev); - return ret; -} - -static void __iommu_release_device(struct device *dev) -{ - const struct iommu_ops *ops = dev->bus->iommu_ops; + if (!dev->iommu) + return; iommu_device_unlink(dev->iommu->iommu_dev, dev); - iommu_group_remove_device(dev); ops->release_device(dev); -} - -void iommu_release_device(struct device *dev) -{ - const struct iommu_ops *ops = dev->bus->iommu_ops; - - if (!dev->iommu) - return; - - if (ops->release_device) - __iommu_release_device(dev); - else if (dev->iommu_group) - ops->remove_device(dev); module_put(ops->owner); dev_iommu_free(dev); @@ -1560,23 +1516,6 @@ struct iommu_group *iommu_group_get_for_dev(struct device *dev) if (ret) goto out_put_group; - /* - * Try to allocate a default domain - needs support from the - * IOMMU driver. There are still some drivers which don't support - * default domains, so the return value is not yet checked. Only - * allocate the domain here when the driver still has the - * add_device/remove_device call-backs implemented. - */ - if (!ops->probe_device) { - iommu_alloc_default_domain(dev); - - if (group->default_domain) - ret = __iommu_attach_device(group->default_domain, dev); - - if (ret) - goto out_put_group; - } - return group; out_put_group: @@ -1591,21 +1530,6 @@ struct iommu_domain *iommu_group_default_domain(struct iommu_group *group) return group->default_domain; } -static int add_iommu_group(struct device *dev, void *data) -{ - int ret = iommu_probe_device(dev); - - /* - * We ignore -ENODEV errors for now, as they just mean that the - * device is not translated by an IOMMU. We still care about - * other errors and fail to initialize when they happen. 
- */ - if (ret == -ENODEV) - ret = 0; - - return ret; -} - static int probe_iommu_group(struct device *dev, void *data) { const struct iommu_ops *ops = dev->bus->iommu_ops; @@ -1793,47 +1717,41 @@ static int iommu_group_create_direct_mappings(struct iommu_group *group) int bus_iommu_probe(struct bus_type *bus) { - const struct iommu_ops *ops = bus->iommu_ops; + struct iommu_group *group, *next; + LIST_HEAD(group_list); int ret; - if (ops->probe_device) { - struct iommu_group *group, *next; - LIST_HEAD(group_list); - - /* - * This code-path does not allocate the default domain when - * creating the iommu group, so do it after the groups are - * created. - */ - ret = bus_for_each_dev(bus, NULL, &group_list, probe_iommu_group); - if (ret) - return ret; + /* + * This code-path does not allocate the default domain when + * creating the iommu group, so do it after the groups are + * created. + */ + ret = bus_for_each_dev(bus, NULL, &group_list, probe_iommu_group); + if (ret) + return ret; - list_for_each_entry_safe(group, next, &group_list, entry) { - /* Remove item from the list */ - list_del_init(&group->entry); + list_for_each_entry_safe(group, next, &group_list, entry) { + /* Remove item from the list */ + list_del_init(&group->entry); - mutex_lock(&group->mutex); + mutex_lock(&group->mutex); - /* Try to allocate default domain */ - probe_alloc_default_domain(bus, group); + /* Try to allocate default domain */ + probe_alloc_default_domain(bus, group); - if (!group->default_domain) { - mutex_unlock(&group->mutex); - continue; - } + if (!group->default_domain) { + mutex_unlock(&group->mutex); + continue; + } - iommu_group_create_direct_mappings(group); + iommu_group_create_direct_mappings(group); - ret = __iommu_group_dma_attach(group); + ret = __iommu_group_dma_attach(group); - mutex_unlock(&group->mutex); + mutex_unlock(&group->mutex); - if (ret) - break; - } - } else { - ret = bus_for_each_dev(bus, NULL, NULL, add_iommu_group); + if (ret) + break; } return ret; diff --git a/include/linux/iommu.h b/include/linux/iommu.h index fea1622408ad..dd076366383f 100644 --- a/include/linux/iommu.h +++ b/include/linux/iommu.h @@ -223,8 +223,6 @@ struct iommu_iotlb_gather { * @iotlb_sync: Flush all queued ranges from the hardware TLBs and empty flush * queue * @iova_to_phys: translate iova to physical address - * @add_device: add device to iommu grouping - * @remove_device: remove device from iommu grouping * @probe_device: Add device to iommu driver handling * @release_device: Remove device from iommu driver handling * @probe_finalize: Do final setup work after the device is added to an IOMMU @@ -277,8 +275,6 @@ struct iommu_ops { void (*iotlb_sync)(struct iommu_domain *domain, struct iommu_iotlb_gather *iotlb_gather); phys_addr_t (*iova_to_phys)(struct iommu_domain *domain, dma_addr_t iova); - int (*add_device)(struct device *dev); - void (*remove_device)(struct device *dev); struct iommu_device *(*probe_device)(struct device *dev); void (*release_device)(struct device *dev); void (*probe_finalize)(struct device *dev); -- cgit v1.2.3 From 1b032ec1ecbce6047af7d11c9db432e237cb17d8 Mon Sep 17 00:00:00 2001 From: Joerg Roedel Date: Wed, 29 Apr 2020 15:37:12 +0200 Subject: iommu: Unexport iommu_group_get_for_dev() The function is now only used in IOMMU core code and shouldn't be used outside of it anyway, so remove the export for it. 
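Users outside the core that need a device's group are expected to go through iommu_group_get()/iommu_group_put() instead, along the lines of this sketch (my_driver_show_group() is invented for illustration):

	static void my_driver_show_group(struct device *dev)
	{
		struct iommu_group *group = iommu_group_get(dev);

		if (!group)
			return;	/* device is not managed by an IOMMU */

		dev_info(dev, "device is in IOMMU group %d\n",
			 iommu_group_id(group));
		iommu_group_put(group);	/* drop the reference taken above */
	}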
Signed-off-by: Joerg Roedel Tested-by: Marek Szyprowski Acked-by: Marek Szyprowski Link: https://lore.kernel.org/r/20200429133712.31431-35-joro@8bytes.org Signed-off-by: Joerg Roedel --- drivers/iommu/iommu.c | 4 ++-- include/linux/iommu.h | 1 - 2 files changed, 2 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/drivers/iommu/iommu.c b/drivers/iommu/iommu.c index 48a95f7d7999..a9e5618cde80 100644 --- a/drivers/iommu/iommu.c +++ b/drivers/iommu/iommu.c @@ -91,6 +91,7 @@ static void __iommu_detach_group(struct iommu_domain *domain, struct iommu_group *group); static int iommu_create_device_direct_mappings(struct iommu_group *group, struct device *dev); +static struct iommu_group *iommu_group_get_for_dev(struct device *dev); #define IOMMU_GROUP_ATTR(_name, _mode, _show, _store) \ struct iommu_group_attribute iommu_group_attr_##_name = \ @@ -1500,7 +1501,7 @@ static int iommu_alloc_default_domain(struct device *dev) * to the returned IOMMU group, which will already include the provided * device. The reference should be released with iommu_group_put(). */ -struct iommu_group *iommu_group_get_for_dev(struct device *dev) +static struct iommu_group *iommu_group_get_for_dev(struct device *dev) { const struct iommu_ops *ops = dev->bus->iommu_ops; struct iommu_group *group; @@ -1531,7 +1532,6 @@ out_put_group: return ERR_PTR(ret); } -EXPORT_SYMBOL(iommu_group_get_for_dev); struct iommu_domain *iommu_group_default_domain(struct iommu_group *group) { diff --git a/include/linux/iommu.h b/include/linux/iommu.h index dd076366383f..7cfd2dddb49d 100644 --- a/include/linux/iommu.h +++ b/include/linux/iommu.h @@ -527,7 +527,6 @@ extern int iommu_page_response(struct device *dev, struct iommu_page_response *msg); extern int iommu_group_id(struct iommu_group *group); -extern struct iommu_group *iommu_group_get_for_dev(struct device *dev); extern struct iommu_domain *iommu_group_default_domain(struct iommu_group *); extern int iommu_domain_get_attr(struct iommu_domain *domain, enum iommu_attr, -- cgit v1.2.3 From 79622f372b8679916bf4e38aabc6df2659d1e109 Mon Sep 17 00:00:00 2001 From: Russell King Date: Mon, 27 Apr 2020 19:49:12 +0100 Subject: i2c: pxa: move private definitions to i2c-pxa.c Move driver-private definitions out of the i2c-pxa.h platform data header file into the driver itself. Nothing outside of the driver makes use of these constants. 
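For context, the driver applies these values when it resets the serial unit, roughly as in the following sketch. This is illustrative only: it assumes the driver's private struct pxa_i2c and its _ICR()/_ISR() register accessors, and the real reset sequence is more involved:

	static void i2c_pxa_reset_sketch(struct pxa_i2c *i2c)
	{
		/* Clear all latched status bits from a previous transfer. */
		writel(I2C_ISR_INIT, _ISR(i2c));

		/* Load the default control value, then enable the unit. */
		writel(I2C_ICR_INIT | ICR_IUE, _ICR(i2c));
	}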
Signed-off-by: Russell King Signed-off-by: Wolfram Sang --- drivers/i2c/busses/i2c-pxa.c | 43 +++++++++++++++++++++++++++++++ include/linux/platform_data/i2c-pxa.h | 48 ----------------------------------- 2 files changed, 43 insertions(+), 48 deletions(-) (limited to 'include/linux') diff --git a/drivers/i2c/busses/i2c-pxa.c b/drivers/i2c/busses/i2c-pxa.c index 3286b5f8bf17..9c811a09eac9 100644 --- a/drivers/i2c/busses/i2c-pxa.c +++ b/drivers/i2c/busses/i2c-pxa.c @@ -85,6 +85,49 @@ #define IWCR_HS_CNT2_SHIFT 10 #define IWCR_HS_CNT2_MASK (0x1F << IWCR_HS_CNT2_SHIFT) +/* need a longer timeout if we're dealing with the fact we may well be + * looking at a multi-master environment + */ +#define DEF_TIMEOUT 32 + +#define BUS_ERROR (-EREMOTEIO) +#define XFER_NAKED (-ECONNREFUSED) +#define I2C_RETRY (-2000) /* an error has occurred retry transmit */ + +/* ICR initialize bit values + * + * 15 FM 0 (100 kHz operation) + * 14 UR 0 (No unit reset) + * 13 SADIE 0 (Disables the unit from interrupting on slave addresses + * matching its slave address) + * 12 ALDIE 0 (Disables the unit from interrupt when it loses arbitration + * in master mode) + * 11 SSDIE 0 (Disables interrupts from a slave stop detected, in slave mode) + * 10 BEIE 1 (Enable interrupts from detected bus errors, no ACK sent) + * 9 IRFIE 1 (Enable interrupts from full buffer received) + * 8 ITEIE 1 (Enables the I2C unit to interrupt when transmit buffer empty) + * 7 GCD 1 (Disables i2c unit response to general call messages as a slave) + * 6 IUE 0 (Disable unit until we change settings) + * 5 SCLE 1 (Enables the i2c clock output for master mode (drives SCL) + * 4 MA 0 (Only send stop with the ICR stop bit) + * 3 TB 0 (We are not transmitting a byte initially) + * 2 ACKNAK 0 (Send an ACK after the unit receives a byte) + * 1 STOP 0 (Do not send a STOP) + * 0 START 0 (Do not send a START) + */ +#define I2C_ICR_INIT (ICR_BEIE | ICR_IRFIE | ICR_ITEIE | ICR_GCD | ICR_SCLE) + +/* I2C status register init values + * + * 10 BED 1 (Clear bus error detected) + * 9 SAD 1 (Clear slave address detected) + * 7 IRF 1 (Clear IDBR Receive Full) + * 6 ITE 1 (Clear IDBR Transmit Empty) + * 5 ALD 1 (Clear Arbitration Loss Detected) + * 4 SSD 1 (Clear Slave Stop Detected) + */ +#define I2C_ISR_INIT 0x7FF /* status register init */ + struct pxa_reg_layout { u32 ibmr; u32 idbr; diff --git a/include/linux/platform_data/i2c-pxa.h b/include/linux/platform_data/i2c-pxa.h index 6a9b28399b39..24953981bd9f 100644 --- a/include/linux/platform_data/i2c-pxa.h +++ b/include/linux/platform_data/i2c-pxa.h @@ -7,54 +7,6 @@ #ifndef _I2C_PXA_H_ #define _I2C_PXA_H_ -#if 0 -#define DEF_TIMEOUT 3 -#else -/* need a longer timeout if we're dealing with the fact we may well be - * looking at a multi-master environment -*/ -#define DEF_TIMEOUT 32 -#endif - -#define BUS_ERROR (-EREMOTEIO) -#define XFER_NAKED (-ECONNREFUSED) -#define I2C_RETRY (-2000) /* an error has occurred retry transmit */ - -/* ICR initialize bit values -* -* 15. FM 0 (100 Khz operation) -* 14. UR 0 (No unit reset) -* 13. SADIE 0 (Disables the unit from interrupting on slave addresses -* matching its slave address) -* 12. ALDIE 0 (Disables the unit from interrupt when it loses arbitration -* in master mode) -* 11. SSDIE 0 (Disables interrupts from a slave stop detected, in slave mode) -* 10. BEIE 1 (Enable interrupts from detected bus errors, no ACK sent) -* 9. IRFIE 1 (Enable interrupts from full buffer received) -* 8. ITEIE 1 (Enables the I2C unit to interrupt when transmit buffer empty) -* 7. 
GCD 1 (Disables i2c unit response to general call messages as a slave) -* 6. IUE 0 (Disable unit until we change settings) -* 5. SCLE 1 (Enables the i2c clock output for master mode (drives SCL) -* 4. MA 0 (Only send stop with the ICR stop bit) -* 3. TB 0 (We are not transmitting a byte initially) -* 2. ACKNAK 0 (Send an ACK after the unit receives a byte) -* 1. STOP 0 (Do not send a STOP) -* 0. START 0 (Do not send a START) -* -*/ -#define I2C_ICR_INIT (ICR_BEIE | ICR_IRFIE | ICR_ITEIE | ICR_GCD | ICR_SCLE) - -/* I2C status register init values - * - * 10. BED 1 (Clear bus error detected) - * 9. SAD 1 (Clear slave address detected) - * 7. IRF 1 (Clear IDBR Receive Full) - * 6. ITE 1 (Clear IDBR Transmit Empty) - * 5. ALD 1 (Clear Arbitration Loss Detected) - * 4. SSD 1 (Clear Slave Stop Detected) - */ -#define I2C_ISR_INIT 0x7FF /* status register init */ - struct i2c_pxa_platform_data { unsigned int class; unsigned int use_pio :1; -- cgit v1.2.3 From 3d9231e69831551da3a78a618b31702ea4dedd5e Mon Sep 17 00:00:00 2001 From: Rajendra Nayak Date: Sun, 3 May 2020 17:34:24 +0530 Subject: tty: serial: qcom_geni_serial: Use OPP API to set clk/perf state The geni serial driver needs to express a performance state requirement on the CX power domain, depending on the clock rate. Use the OPP table from DT to register with the OPP framework and use dev_pm_opp_set_rate() to set the clk/perf state. Signed-off-by: Rajendra Nayak Reviewed-by: Matthias Kaehlcke Cc: Greg Kroah-Hartman Cc: Akash Asthana Cc: linux-serial@vger.kernel.org Link: https://lore.kernel.org/r/1588507469-31889-2-git-send-email-rnayak@codeaurora.org Signed-off-by: Greg Kroah-Hartman --- drivers/tty/serial/qcom_geni_serial.c | 34 +++++++++++++++++++++++++++++----- include/linux/qcom-geni-se.h | 4 ++++ 2 files changed, 33 insertions(+), 5 deletions(-) (limited to 'include/linux') diff --git a/drivers/tty/serial/qcom_geni_serial.c b/drivers/tty/serial/qcom_geni_serial.c index 6119090ce045..dd3d1ba38bd9 100644 --- a/drivers/tty/serial/qcom_geni_serial.c +++ b/drivers/tty/serial/qcom_geni_serial.c @@ -9,6 +9,7 @@ #include #include #include +#include #include #include #include @@ -961,7 +962,7 @@ static void qcom_geni_serial_set_termios(struct uart_port *uport, goto out_restart_rx; uport->uartclk = clk_rate; - clk_set_rate(port->se.clk, clk_rate); + dev_pm_opp_set_rate(uport->dev, clk_rate); ser_clk_cfg = SER_CLK_EN; ser_clk_cfg |= clk_div << CLK_DIV_SHFT; @@ -1198,8 +1199,11 @@ static void qcom_geni_serial_pm(struct uart_port *uport, if (new_state == UART_PM_STATE_ON && old_state == UART_PM_STATE_OFF) geni_se_resources_on(&port->se); else if (new_state == UART_PM_STATE_OFF && - old_state == UART_PM_STATE_ON) + old_state == UART_PM_STATE_ON) { + /* Drop the performance state vote */ + dev_pm_opp_set_rate(uport->dev, 0); geni_se_resources_off(&port->se); + } } static const struct uart_ops qcom_geni_console_pops = { @@ -1318,13 +1322,25 @@ static int qcom_geni_serial_probe(struct platform_device *pdev) if (of_property_read_bool(pdev->dev.of_node, "cts-rts-swap")) port->cts_rts_swap = true; + port->se.opp_table = dev_pm_opp_set_clkname(&pdev->dev, "se"); + if (IS_ERR(port->se.opp_table)) + return PTR_ERR(port->se.opp_table); + /* OPP table is optional */ + ret = dev_pm_opp_of_add_table(&pdev->dev); + if (!ret) { + port->se.has_opp_table = true; + } else if (ret != -ENODEV) { + dev_err(&pdev->dev, "invalid OPP table in device tree\n"); + return ret; + } + uport->private_data = drv; platform_set_drvdata(pdev, port); port->handle_rx = console ? 
handle_rx_console : handle_rx_uart; ret = uart_add_one_port(drv, uport); if (ret) - return ret; + goto err; irq_set_status_flags(uport->irq, IRQ_NOAUTOEN); ret = devm_request_irq(uport->dev, uport->irq, qcom_geni_serial_isr, @@ -1332,7 +1348,7 @@ static int qcom_geni_serial_probe(struct platform_device *pdev) if (ret) { dev_err(uport->dev, "Failed to get IRQ ret %d\n", ret); uart_remove_one_port(drv, uport); - return ret; + goto err; } /* @@ -1349,11 +1365,16 @@ static int qcom_geni_serial_probe(struct platform_device *pdev) if (ret) { device_init_wakeup(&pdev->dev, false); uart_remove_one_port(drv, uport); - return ret; + goto err; } } return 0; +err: + if (port->se.has_opp_table) + dev_pm_opp_of_remove_table(&pdev->dev); + dev_pm_opp_put_clkname(port->se.opp_table); + return ret; } static int qcom_geni_serial_remove(struct platform_device *pdev) @@ -1361,6 +1382,9 @@ static int qcom_geni_serial_remove(struct platform_device *pdev) struct qcom_geni_serial_port *port = platform_get_drvdata(pdev); struct uart_driver *drv = port->uport.private_data; + if (port->se.has_opp_table) + dev_pm_opp_of_remove_table(&pdev->dev); + dev_pm_opp_put_clkname(port->se.opp_table); dev_pm_clear_wake_irq(&pdev->dev); device_init_wakeup(&pdev->dev, false); uart_remove_one_port(drv, &port->uport); diff --git a/include/linux/qcom-geni-se.h b/include/linux/qcom-geni-se.h index dd464943f717..6b78094a4e23 100644 --- a/include/linux/qcom-geni-se.h +++ b/include/linux/qcom-geni-se.h @@ -33,6 +33,8 @@ struct clk; * @clk: Handle to the core serial engine clock * @num_clk_levels: Number of valid clock levels in clk_perf_tbl * @clk_perf_tbl: Table of clock frequency input to serial engine clock + * @opp_table: Pointer to the OPP table + * @has_opp_table: Specifies if the SE has an OPP table */ struct geni_se { void __iomem *base; @@ -41,6 +43,8 @@ struct geni_se { struct clk *clk; unsigned int num_clk_levels; unsigned long *clk_perf_tbl; + struct opp_table *opp_table; + bool has_opp_table; }; /* Common SE registers */ -- cgit v1.2.3 From efc930fa1d844869eb65586f8cdd6a7837db3607 Mon Sep 17 00:00:00 2001 From: Mauro Carvalho Chehab Date: Mon, 27 Apr 2020 23:16:55 +0200 Subject: docs: filesystems: caching/netfs-api.txt: convert it to ReST - Add a SPDX header; - Adjust document and section titles; - Some whitespace fixes and new line breaks; - Mark literal blocks as such; - Add it to filesystems/caching/index.rst. Signed-off-by: Mauro Carvalho Chehab Link: https://lore.kernel.org/r/cfe4cb1bf8e1f0093d44c30801ec42e74721e543.1588021877.git.mchehab+huawei@kernel.org Signed-off-by: Jonathan Corbet --- Documentation/filesystems/caching/fscache.rst | 2 +- Documentation/filesystems/caching/index.rst | 1 + Documentation/filesystems/caching/netfs-api.rst | 896 +++++++++++++++++++++++ Documentation/filesystems/caching/netfs-api.txt | 910 ------------------------ fs/fscache/cookie.c | 2 +- include/linux/fscache.h | 42 +- 6 files changed, 920 insertions(+), 933 deletions(-) create mode 100644 Documentation/filesystems/caching/netfs-api.rst delete mode 100644 Documentation/filesystems/caching/netfs-api.txt (limited to 'include/linux') diff --git a/Documentation/filesystems/caching/fscache.rst b/Documentation/filesystems/caching/fscache.rst index 950602147aa5..dd1297d884d0 100644 --- a/Documentation/filesystems/caching/fscache.rst +++ b/Documentation/filesystems/caching/fscache.rst @@ -183,7 +183,7 @@ reside in one cache. 
The netfs API to FS-Cache can be found in: - Documentation/filesystems/caching/netfs-api.txt + Documentation/filesystems/caching/netfs-api.rst The cache backend API to FS-Cache can be found in: diff --git a/Documentation/filesystems/caching/index.rst b/Documentation/filesystems/caching/index.rst index f488747630aa..d0651db450fb 100644 --- a/Documentation/filesystems/caching/index.rst +++ b/Documentation/filesystems/caching/index.rst @@ -8,3 +8,4 @@ Filesystem Caching fscache object + netfs-api diff --git a/Documentation/filesystems/caching/netfs-api.rst b/Documentation/filesystems/caching/netfs-api.rst new file mode 100644 index 000000000000..d9f14b8610ba --- /dev/null +++ b/Documentation/filesystems/caching/netfs-api.rst @@ -0,0 +1,896 @@ +.. SPDX-License-Identifier: GPL-2.0 + +=============================== +FS-Cache Network Filesystem API +=============================== + +There's an API by which a network filesystem can make use of the FS-Cache +facilities. This is based around a number of principles: + + (1) Caches can store a number of different object types. There are two main + object types: indices and files. The first is a special type used by + FS-Cache to make finding objects faster and to make retiring of groups of + objects easier. + + (2) Every index, file or other object is represented by a cookie. This cookie + may or may not have anything associated with it, but the netfs doesn't + need to care. + + (3) Barring the top-level index (one entry per cached netfs), the index + hierarchy for each netfs is structured according to the whim of the netfs. + +This API is declared in <linux/fscache.h>. + +.. This document contains the following sections: + + (1) Network filesystem definition + (2) Index definition + (3) Object definition + (4) Network filesystem (un)registration + (5) Cache tag lookup + (6) Index registration + (7) Data file registration + (8) Miscellaneous object registration + (9) Setting the data file size + (10) Page alloc/read/write + (11) Page uncaching + (12) Index and data file consistency + (13) Cookie enablement + (14) Miscellaneous cookie operations + (15) Cookie unregistration + (16) Index invalidation + (17) Data file invalidation + (18) FS-Cache specific page flags. + + +Network Filesystem Definition +============================= + +FS-Cache needs a description of the network filesystem. This is specified +using a record of the following structure:: + + struct fscache_netfs { + uint32_t version; + const char *name; + struct fscache_cookie *primary_index; + ... + }; + +The first two fields should be filled in before registration, and the third +will be filled in by the registration function; any other fields should just be +ignored and are for internal use only. + +The fields are: + + (1) The name of the netfs (used as the key in the toplevel index). + + (2) The version of the netfs (if the name matches but the version doesn't, the + entire in-cache hierarchy for this netfs will be scrapped and begun + afresh). + + (3) The cookie representing the primary index will be allocated according to + another parameter passed into the registration function. + +For example, kAFS (linux/fs/afs/) uses the following definitions to describe +itself:: + + struct fscache_netfs afs_cache_netfs = { + .version = 0, + .name = "afs", + }; + + +Index Definition +================ + +Indices are used for two purposes: + + (1) To aid the finding of a file based on a series of keys (such as AFS's + "cell", "volume ID", "vnode ID"). 
+ + (2) To make it easier to discard a subset of all the files cached based around + a particular key - for instance to mirror the removal of an AFS volume. + +However, since it's unlikely that any two netfs's are going to want to define +their index hierarchies in quite the same way, FS-Cache tries to impose as few +restraints as possible on how an index is structured and where it is placed in +the tree. The netfs can even mix indices and data files at the same level, but +it's not recommended. + +Each index entry consists of a key of indeterminate length plus some auxiliary +data, also of indeterminate length. + +There are some limits on indices: + + (1) Any index containing non-index objects should be restricted to a single + cache. Any such objects created within an index will be created in the + first cache only. The cache in which an index is created can be + controlled by cache tags (see below). + + (2) The entry data must be atomically journallable, so it is limited to about + 400 bytes at present. At least 400 bytes will be available. + + (3) The depth of the index tree should be judged with care as the search + function is recursive. Too many layers will run the kernel out of stack. + + +Object Definition +================= + +To define an object, a structure of the following type should be filled out:: + + struct fscache_cookie_def + { + uint8_t name[16]; + uint8_t type; + + struct fscache_cache_tag *(*select_cache)( + const void *parent_netfs_data, + const void *cookie_netfs_data); + + enum fscache_checkaux (*check_aux)(void *cookie_netfs_data, + const void *data, + uint16_t datalen, + loff_t object_size); + + void (*get_context)(void *cookie_netfs_data, void *context); + + void (*put_context)(void *cookie_netfs_data, void *context); + + void (*mark_pages_cached)(void *cookie_netfs_data, + struct address_space *mapping, + struct pagevec *cached_pvec); + }; + +This has the following fields: + + (1) The type of the object [mandatory]. + + This is one of the following values: + + FSCACHE_COOKIE_TYPE_INDEX + This defines an index, which is a special FS-Cache type. + + FSCACHE_COOKIE_TYPE_DATAFILE + This defines an ordinary data file. + + Any other value between 2 and 255 + This defines an extraordinary object such as an XATTR. + + (2) The name of the object type (NUL terminated unless all 16 chars are used) + [optional]. + + (3) A function to select the cache in which to store an index [optional]. + + This function is invoked when an index needs to be instantiated in a cache + during the instantiation of a non-index object. Only the immediate index + parent for the non-index object will be queried. Any indices above that + in the hierarchy may be stored in multiple caches. This function does not + need to be supplied for any non-index object or any index that will only + have index children. + + If this function is not supplied or if it returns NULL then the first + cache in the parent's list will be chosen, or failing that, the first + cache in the master list. + + (4) A function to check the auxiliary data [optional]. + + This function will be called to check that a match found in the cache for + this object is valid. For instance with AFS it could check the auxiliary + data against the data version number returned by the server to determine + whether the index entry in a cache is still valid. + + If this function is absent, it will be assumed that matching objects in a + cache are always valid. 
+ + The function is also passed the cache's idea of the object size and may + use this to manage coherency also. + + If present, the function should return one of the following values: + + FSCACHE_CHECKAUX_OKAY + - the entry is okay as is + + FSCACHE_CHECKAUX_NEEDS_UPDATE + - the entry requires update + + FSCACHE_CHECKAUX_OBSOLETE + - the entry should be deleted + + This function can also be used to extract data from the auxiliary data in + the cache and copy it into the netfs's structures. + + (5) A pair of functions to manage contexts for the completion callback + [optional]. + + The cache read/write functions are passed a context which is then passed + to the I/O completion callback function. To ensure this context remains + valid until after the I/O completion is called, two functions may be + provided: one to get an extra reference on the context, and one to drop a + reference to it. + + If the context is not used or is a type of object that won't go out of + scope, then these functions are not required. These functions are not + required for indices as indices may not contain data. These functions may + be called in interrupt context and so may not sleep. + + (6) A function to mark a page as retaining cache metadata [optional]. + + This is called by the cache to indicate that it is retaining in-memory + information for this page and that the netfs should uncache the page when + it has finished. This does not indicate whether there's data on the disk + or not. Note that several pages at once may be presented for marking. + + The PG_fscache bit is set on the pages before this function would be + called, so the function need not be provided if this is sufficient. + + This function is not required for indices as they're not permitted data. + + (7) A function to unmark all the pages retaining cache metadata [mandatory]. + + This is called by FS-Cache to indicate that a backing store is being + unbound from a cookie and that all the marks on the pages should be + cleared to prevent confusion. Note that the cache will have torn down all + its tracking information so that the pages don't need to be explicitly + uncached. + + This function is not required for indices as they're not permitted data. + + +Network Filesystem (Un)registration +=================================== + +The first step is to declare the network filesystem to the cache. This also +involves specifying the layout of the primary index (for AFS, this would be the +"cell" level). + +The registration function is:: + + int fscache_register_netfs(struct fscache_netfs *netfs); + +It just takes a pointer to the netfs definition. It returns 0 or an error as +appropriate. + +For kAFS, registration is done as follows:: + + ret = fscache_register_netfs(&afs_cache_netfs); + +The last step is, of course, unregistration:: + + void fscache_unregister_netfs(struct fscache_netfs *netfs); + + +Cache Tag Lookup +================ + +FS-Cache permits the use of more than one cache. To permit particular index +subtrees to be bound to particular caches, the second step is to look up cache +representation tags. This step is optional; it can be left entirely up to +FS-Cache as to which cache should be used. The problem with doing that is that +FS-Cache will always pick the first cache that was registered. + +To get the representation for a named tag:: + + struct fscache_cache_tag *fscache_lookup_cache_tag(const char *name); + +This takes a text string as the name and returns a representation of a tag. It +will never return an error. 
It may return a dummy tag, however, if it runs out +of memory; this will inhibit caching with this tag. + +Any representation so obtained must be released by passing it to this function:: + + void fscache_release_cache_tag(struct fscache_cache_tag *tag); + +The tag will be retrieved by FS-Cache when it calls the object definition +operation select_cache(). + + +Index Registration +================== + +The third step is to inform FS-Cache about part of an index hierarchy that can +be used to locate files. This is done by requesting a cookie for each index in +the path to the file:: + + struct fscache_cookie * + fscache_acquire_cookie(struct fscache_cookie *parent, + const struct fscache_object_def *def, + const void *index_key, + size_t index_key_len, + const void *aux_data, + size_t aux_data_len, + void *netfs_data, + loff_t object_size, + bool enable); + +This function creates an index entry in the index represented by parent, +filling in the index entry by calling the operations pointed to by def. + +A unique key that represents the object within the parent must be pointed to by +index_key and is of length index_key_len. + +An optional blob of auxiliary data that is to be stored within the cache can be +pointed to with aux_data and should be of length aux_data_len. This would +typically be used for storing coherency data. + +The netfs may pass an arbitrary value in netfs_data and this will be presented +to it in the event of any calling back. This may also be used in tracing or +logging of messages. + +The cache tracks the size of the data attached to an object and this set to be +object_size. For indices, this should be 0. This value will be passed to the +->check_aux() callback. + +Note that this function never returns an error - all errors are handled +internally. It may, however, return NULL to indicate no cookie. It is quite +acceptable to pass this token back to this function as the parent to another +acquisition (or even to the relinquish cookie, read page and write page +functions - see below). + +Note also that no indices are actually created in a cache until a non-index +object needs to be created somewhere down the hierarchy. Furthermore, an index +may be created in several different caches independently at different times. +This is all handled transparently, and the netfs doesn't see any of it. + +A cookie will be created in the disabled state if enabled is false. A cookie +must be enabled to do anything with it. A disabled cookie can be enabled by +calling fscache_enable_cookie() (see below). + +For example, with AFS, a cell would be added to the primary index. This index +entry would have a dependent inode containing volume mappings within this cell:: + + cell->cache = + fscache_acquire_cookie(afs_cache_netfs.primary_index, + &afs_cell_cache_index_def, + cell->name, strlen(cell->name), + NULL, 0, + cell, 0, true); + +And then a particular volume could be added to that index by ID, creating +another index for vnodes (AFS inode equivalents):: + + volume->cache = + fscache_acquire_cookie(volume->cell->cache, + &afs_volume_cache_index_def, + &volume->vid, sizeof(volume->vid), + NULL, 0, + volume, 0, true); + + +Data File Registration +====================== + +The fourth step is to request a data file be created in the cache. This is +identical to index cookie acquisition. 
The only difference is that the type in +the object definition should be something other than index type:: + + vnode->cache = + fscache_acquire_cookie(volume->cache, + &afs_vnode_cache_object_def, + &key, sizeof(key), + &aux, sizeof(aux), + vnode, vnode->status.size, true); + + +Miscellaneous Object Registration +================================= + +An optional step is to request an object of miscellaneous type be created in +the cache. This is almost identical to index cookie acquisition. The only +difference is that the type in the object definition should be something other +than index type. While the parent object could be an index, it's more likely +it would be some other type of object such as a data file:: + + xattr->cache = + fscache_acquire_cookie(vnode->cache, + &afs_xattr_cache_object_def, + &xattr->name, strlen(xattr->name), + NULL, 0, + xattr, strlen(xattr->val), true); + +Miscellaneous objects might be used to store extended attributes or directory +entries for example. + + +Setting the Data File Size +========================== + +The fifth step is to set the physical attributes of the file, such as its size. +This doesn't automatically reserve any space in the cache, but permits the +cache to adjust its metadata for data tracking appropriately:: + + int fscache_attr_changed(struct fscache_cookie *cookie); + +The cache will return -ENOBUFS if there is no backing cache or if there is no +space to allocate any extra metadata required in the cache. + +Note that attempts to read or write data pages in the cache over this size may +be rebuffed with -ENOBUFS. + +This operation schedules an attribute adjustment to happen asynchronously at +some point in the future, and as such, it may happen after the function returns +to the caller. The attribute adjustment excludes read and write operations. + + +Page alloc/read/write +===================== + +And the sixth step is to store and retrieve pages in the cache. There are +three functions that are used to do this. + +Note: + + (1) A page should not be re-read or re-allocated without uncaching it first. + + (2) A read or allocated page must be uncached when the netfs page is released + from the pagecache. + + (3) A page should only be written to the cache if previous read or allocated. + +This permits the cache to maintain its page tracking in proper order. + + +PAGE READ +--------- + +Firstly, the netfs should ask FS-Cache to examine the caches and read the +contents cached for a particular page of a particular file if present, or else +allocate space to store the contents if not:: + + typedef + void (*fscache_rw_complete_t)(struct page *page, + void *context, + int error); + + int fscache_read_or_alloc_page(struct fscache_cookie *cookie, + struct page *page, + fscache_rw_complete_t end_io_func, + void *context, + gfp_t gfp); + +The cookie argument must specify a cookie for an object that isn't an index, +the page specified will have the data loaded into it (and is also used to +specify the page number), and the gfp argument is used to control how any +memory allocations made are satisfied. + +If the cookie indicates the inode is not cached: + + (1) The function will return -ENOBUFS. + +Else if there's a copy of the page resident in the cache: + + (1) The mark_pages_cached() cookie operation will be called on that page. + + (2) The function will submit a request to read the data from the cache's + backing device directly into the page specified. + + (3) The function will return 0. 
+ + (4) When the read is complete, end_io_func() will be invoked with: + + * The netfs data supplied when the cookie was created. + + * The page descriptor. + + * The context argument passed to the above function. This will be + maintained with the get_context/put_context functions mentioned above. + + * An argument that's 0 on success or negative for an error code. + + If an error occurs, it should be assumed that the page contains no usable + data. fscache_readpages_cancel() may need to be called. + + end_io_func() will be called in process context if the read results in + an error, but it might be called in interrupt context if the read is + successful. + +Otherwise, if there's not a copy available in cache, but the cache may be able +to store the page: + + (1) The mark_pages_cached() cookie operation will be called on that page. + + (2) A block may be reserved in the cache and attached to the object at the + appropriate place. + + (3) The function will return -ENODATA. + +This function may also return -ENOMEM or -EINTR, in which case it won't have +read any data from the cache. + + +Page Allocate +------------- + +Alternatively, if there's not expected to be any data in the cache for a page +because the file has been extended, a block can simply be allocated instead:: + + int fscache_alloc_page(struct fscache_cookie *cookie, + struct page *page, + gfp_t gfp); + +This is similar to the fscache_read_or_alloc_page() function, except that it +never reads from the cache. It will return 0 if a block has been allocated, +rather than -ENODATA as the other would. One or the other must be performed +before writing to the cache. + +The mark_pages_cached() cookie operation will be called on the page if +successful. + + +Page Write +---------- + +Secondly, if the netfs changes the contents of the page (either due to an +initial download or if a user performs a write), then the page should be +written back to the cache:: + + int fscache_write_page(struct fscache_cookie *cookie, + struct page *page, + loff_t object_size, + gfp_t gfp); + +The cookie argument must specify a data file cookie, the page specified should +contain the data to be written (and is also used to specify the page number), +object_size is the revised size of the object and the gfp argument is used to +control how any memory allocations made are satisfied. + +The page must have first been read or allocated successfully and must not have +been uncached before writing is performed. + +If the cookie indicates the inode is not cached then: + + (1) The function will return -ENOBUFS. + +Else if space can be allocated in the cache to hold this page: + + (1) PG_fscache_write will be set on the page. + + (2) The function will submit a request to write the data to the cache's + backing device directly from the page specified. + + (3) The function will return 0. + + (4) When the write is complete PG_fscache_write is cleared on the page and + anyone waiting for that bit will be woken up. + +Else if there's no space available in the cache, -ENOBUFS will be returned. It +is also possible for the PG_fscache_write bit to be cleared when no write took +place if unforeseen circumstances arose (such as a disk error). + +Writing takes place asynchronously. 
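Taken together, the read side of a netfs might drive these functions from its readpage() method roughly as follows. This is a minimal sketch rather than code from any real filesystem: the my_* and MY_I() names are invented, my_inode is assumed to hold the FS-Cache cookie in ->cache, and locking and error handling are trimmed::

    static void my_readpage_done(struct page *page, void *context, int error)
    {
	    /* May be called in interrupt context on success - keep it short. */
	    if (!error)
		    SetPageUptodate(page);
	    unlock_page(page);
    }

    static int my_readpage(struct file *file, struct page *page)
    {
	    struct my_inode *inode = MY_I(file_inode(file));
	    int ret;

	    ret = fscache_read_or_alloc_page(inode->cache, page,
					     my_readpage_done, NULL,
					     GFP_KERNEL);
	    switch (ret) {
	    case 0:		/* read submitted; my_readpage_done() finishes it */
		    return 0;

	    case -ENODATA:	/* block allocated: fetch, then fill the cache */
		    ret = my_fetch_from_server(file, page);
		    if (ret == 0)
			    fscache_write_page(inode->cache, page,
					       i_size_read(file_inode(file)),
					       GFP_KERNEL);
		    else
			    fscache_uncache_page(inode->cache, page);
		    return ret;

	    default:		/* -ENOBUFS etc.: the cache is bypassed */
		    return my_fetch_from_server(file, page);
	    }
    }

On the -ENODATA path the block has been allocated, so the freshly fetched data may be written back with fscache_write_page(); on -ENOBUFS the cache is bypassed entirely, matching note (3) above.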
+ + +Multiple Page Read +------------------ + +A facility is provided to read several pages at once, as requested by the +readpages() address space operation:: + + int fscache_read_or_alloc_pages(struct fscache_cookie *cookie, + struct address_space *mapping, + struct list_head *pages, + int *nr_pages, + fscache_rw_complete_t end_io_func, + void *context, + gfp_t gfp); + +This works in a similar way to fscache_read_or_alloc_page(), except: + + (1) Any page it can retrieve data for is removed from pages and nr_pages and + dispatched for reading to the disk. Reads of adjacent pages on disk may + be merged for greater efficiency. + + (2) The mark_pages_cached() cookie operation will be called on several pages + at once if they're being read or allocated. + + (3) If there was a general error, then that error will be returned. + + Else if some pages couldn't be allocated or read, then -ENOBUFS will be + returned. + + Else if some pages couldn't be read but were allocated, then -ENODATA will + be returned. + + Otherwise, if all pages had reads dispatched, then 0 will be returned, the + list will be empty and ``*nr_pages`` will be 0. + + (4) end_io_func will be called once for each page being read as the reads + complete. It will be called in process context if error != 0, but it may + be called in interrupt context if there is no error. + +Note that a return of -ENODATA, -ENOBUFS or any other error does not preclude +some of the pages being read and some being allocated. Those pages will have +been marked appropriately and will need uncaching. + + +Cancellation of Unread Pages +---------------------------- + +If one or more pages are passed to fscache_read_or_alloc_pages() but not then +read from the cache and also not read from the underlying filesystem then +those pages will need to have any marks and reservations removed. This can be +done by calling:: + + void fscache_readpages_cancel(struct fscache_cookie *cookie, + struct list_head *pages); + +prior to returning to the caller. The cookie argument should be as passed to +fscache_read_or_alloc_pages(). Every page in the pages list will be examined +and any that have PG_fscache set will be uncached. + + +Page Uncaching +============== + +To uncache a page, this function should be called:: + + void fscache_uncache_page(struct fscache_cookie *cookie, + struct page *page); + +This function permits the cache to release any in-memory representation it +might be holding for this netfs page. This function must be called once for +each page on which the read or write page functions above have been called to +make sure the cache's in-memory tracking information gets torn down. + +Note that pages can't be explicitly deleted from a data file. The whole +data file must be retired (see the relinquish cookie function below). + +Furthermore, note that this does not cancel the asynchronous read or write +operation started by the read/alloc and write functions, so the page +invalidation functions must use:: + + bool fscache_check_page_write(struct fscache_cookie *cookie, + struct page *page); + +to see if a page is being written to the cache, and:: + + void fscache_wait_on_page_write(struct fscache_cookie *cookie, + struct page *page); + +to wait for it to finish if it is. 
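For example, an invalidatepage() implementation might synchronise with the cache like this (again a sketch; the my_* names are invented)::

    static void my_invalidatepage(struct page *page, unsigned int offset,
				  unsigned int length)
    {
	    struct my_inode *inode = MY_I(page->mapping->host);

	    if (PageFsCache(page)) {
		    /* Let any outstanding write to the cache finish first. */
		    fscache_wait_on_page_write(inode->cache, page);
		    fscache_uncache_page(inode->cache, page);
	    }

	    /* ... the usual pagecache invalidation work follows ... */
    }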
+ + +When releasepage() is being implemented, a special FS-Cache function exists to +manage the heuristics of coping with vmscan trying to eject pages, which may +conflict with the cache trying to write pages to the cache (which may itself +need to allocate memory):: + + bool fscache_maybe_release_page(struct fscache_cookie *cookie, + struct page *page, + gfp_t gfp); + +This takes the netfs cookie, and the page and gfp arguments as supplied to +releasepage(). It will return false if the page cannot be released yet for +some reason and if it returns true, the page has been uncached and can now be +released. + +To make a page available for release, this function may wait for an outstanding +storage request to complete, or it may attempt to cancel the storage request - +in which case the page will not be stored in the cache this time. + + +Bulk Image Page Uncache +----------------------- + +A convenience routine is provided to perform an uncache on all the pages +attached to an inode. This assumes that the pages on the inode correspond on a +1:1 basis with the pages in the cache:: + + void fscache_uncache_all_inode_pages(struct fscache_cookie *cookie, + struct inode *inode); + +This takes the netfs cookie that the pages were cached with and the inode that +the pages are attached to. This function will wait for pages to finish being +written to the cache and for the cache to finish with the page generally. No +error is returned. + + +Index and Data File Consistency +=============================== + +To find out whether auxiliary data for an object is up to date within the +cache, the following function can be called:: + + int fscache_check_consistency(struct fscache_cookie *cookie, + const void *aux_data); + +This will call back to the netfs to check whether the auxiliary data associated +with a cookie is correct; if aux_data is non-NULL, it will update the auxiliary +data buffer first. It returns 0 if it is and -ESTALE if it isn't; it may also +return -ENOMEM and -ERESTARTSYS. + +To request an update of the index data for an index or other object, the +following function should be called:: + + void fscache_update_cookie(struct fscache_cookie *cookie, + const void *aux_data); + +This function will update the cookie's auxiliary data buffer from aux_data if +that is non-NULL and then schedule this to be stored on disk. The update +method in the parent index definition will be called to transfer the data. + +Note that partial updates may happen automatically at other times, such as when +data blocks are added to a data file object. + + +Cookie Enablement +================= + +Cookies exist in one of two states: enabled and disabled. If a cookie is +disabled, it ignores all attempts to acquire child cookies; check, update or +invalidate its state; allocate, read or write backing pages - though it is +still possible to uncache pages and relinquish the cookie. + +The initial enablement state is set by fscache_acquire_cookie(), but the cookie +can be enabled or disabled later. To disable a cookie, call:: + + void fscache_disable_cookie(struct fscache_cookie *cookie, + const void *aux_data, + bool invalidate); + +If the cookie is not already disabled, this locks the cookie against other +enable and disable ops, marks the cookie as being disabled, discards or +invalidates any backing objects and waits for cessation of activity on any +associated object before unlocking the cookie. + +All possible failures are handled internally. 
The caller should consider +calling fscache_uncache_all_inode_pages() afterwards to make sure all page +markings are cleared up. + +Cookies can be enabled or reenabled with:: + + void fscache_enable_cookie(struct fscache_cookie *cookie, + const void *aux_data, + loff_t object_size, + bool (*can_enable)(void *data), + void *data); + +If the cookie is not already enabled, this locks the cookie against other +enable and disable ops, invokes can_enable() and, if the cookie is not an index +cookie, will begin the procedure of acquiring backing objects. + +The optional can_enable() function is passed the data argument and returns a +ruling as to whether or not enablement should actually be permitted to begin. + +All possible failures are handled internally. The cookie will only be marked +as enabled if provisional backing objects are allocated. + +The object's data size is updated from object_size and is passed to the +->check_aux() function. + +In both cases, the cookie's auxiliary data buffer is updated from aux_data if +that is non-NULL inside the enablement lock before proceeding. + + +Miscellaneous Cookie Operations +=============================== + +There are a number of operations that can be used to control cookies: + + * Cookie pinning:: + + int fscache_pin_cookie(struct fscache_cookie *cookie); + void fscache_unpin_cookie(struct fscache_cookie *cookie); + + These operations permit data cookies to be pinned into the cache and to + have the pinning removed. They are not permitted on index cookies. + + The pinning function will return 0 if successful, -ENOBUFS if the cookie + isn't backed by a cache, -EOPNOTSUPP if the cache doesn't support pinning, + -ENOSPC if there isn't enough space to honour the operation, -ENOMEM or + -EIO if there's any other problem. + + * Data space reservation:: + + int fscache_reserve_space(struct fscache_cookie *cookie, loff_t size); + + This permits a netfs to request cache space be reserved to store up to the + given amount of a file. It is permitted to ask for more than the current + size of the file to allow for future file expansion. + + If size is given as zero then the reservation will be cancelled. + + The function will return 0 if successful, -ENOBUFS if the cookie isn't + backed by a cache, -EOPNOTSUPP if the cache doesn't support reservations, + -ENOSPC if there isn't enough space to honour the operation, -ENOMEM or + -EIO if there's any other problem. + + Note that this doesn't pin an object in a cache; it can still be culled to + make space if it's not in use. + + +Cookie Unregistration +===================== + +To get rid of a cookie, this function should be called:: + + void fscache_relinquish_cookie(struct fscache_cookie *cookie, + const void *aux_data, + bool retire); + +If retire is non-zero, then the object will be marked for recycling, and all +copies of it will be removed from all active caches in which it is present. +Not only that but all child objects will also be retired. + +If retire is zero, then the object may be available again when next the +acquisition function is called. Retirement here will overrule the pinning on a +cookie. + +The cookie's auxiliary data will be updated from aux_data if that is non-NULL +so that the cache can lazily update it on disk. + +One very important note - relinquish must NOT be called for a cookie unless all +the cookies for "child" indices, objects and pages have been relinquished +first. + + +Index Invalidation +================== + +There is no direct way to invalidate an index subtree. 
+To do this, the caller should relinquish and retire the cookie they have, and
+then acquire a new one.
+
+
+Data File Invalidation
+======================
+
+Sometimes it will be necessary to invalidate an object that contains data.
+Typically this will be necessary when the server tells the netfs of a foreign
+change - at which point the netfs has to throw away all the state it had for an
+inode and reload from the server.
+
+To indicate that a cache object should be invalidated, the following function
+can be called::
+
+	void fscache_invalidate(struct fscache_cookie *cookie);
+
+This can be called with spinlocks held as it defers the work to a thread pool.
+All extant storage, retrieval and attribute change ops at this point are
+cancelled and discarded.  Some future operations will be rejected until the
+cache has had a chance to insert a barrier in the operations queue.  After
+that, operations will be queued again behind the invalidation operation.
+
+The invalidation operation will perform an attribute change operation and an
+auxiliary data update operation as it is very likely these will have changed.
+
+Using the following function, the netfs can wait for the invalidation operation
+to have reached a point at which it can start submitting ordinary operations
+once again::
+
+	void fscache_wait_on_invalidate(struct fscache_cookie *cookie);
+
+
+FS-Cache Specific Page Flag
+===========================
+
+FS-Cache makes use of a page flag, PG_private_2, for its own purpose.  This is
+given the alternative name PG_fscache.
+
+PG_fscache is used to indicate that the page is known by the cache, and that
+the cache must be informed if the page is going to go away.  It's an indication
+to the netfs that the cache has an interest in this page, where an interest may
+be a pointer to it, resources allocated or reserved for it, or I/O in progress
+upon it.
+
+The netfs can use this information in methods such as releasepage() to
+determine whether it needs to uncache a page or update it.
+
+Furthermore, if this bit is set, releasepage() and invalidatepage() operations
+will be called on a page to get rid of it, even if PG_private is not set.  This
+allows caching to be attempted on a page before read_cache_pages() is called
+after fscache_read_or_alloc_pages(), as the former will try to release pages it
+was given under certain circumstances.
+
+This bit does not overlap with other flags such as PG_private.  This means that
+FS-Cache can be used with a filesystem that uses the block buffering code.
+
+There are a number of operations defined on this flag::
+
+	int PageFsCache(struct page *page);
+	void SetPageFsCache(struct page *page);
+	void ClearPageFsCache(struct page *page);
+	int TestSetPageFsCache(struct page *page);
+	int TestClearPageFsCache(struct page *page);
+
+These functions are bit test, bit set, bit clear, bit test and set and bit
+test and clear operations on PG_fscache.
diff --git a/Documentation/filesystems/caching/netfs-api.txt b/Documentation/filesystems/caching/netfs-api.txt
deleted file mode 100644
index ba968e8f5704..000000000000
--- a/Documentation/filesystems/caching/netfs-api.txt
+++ /dev/null
@@ -1,910 +0,0 @@
- ===============================
- FS-CACHE NETWORK FILESYSTEM API
- ===============================
-
-There's an API by which a network filesystem can make use of the FS-Cache
-facilities.  This is based around a number of principles:
-
- (1) Caches can store a number of different object types.  There are two main
-     object types: indices and files.
The first is a special type used by - FS-Cache to make finding objects faster and to make retiring of groups of - objects easier. - - (2) Every index, file or other object is represented by a cookie. This cookie - may or may not have anything associated with it, but the netfs doesn't - need to care. - - (3) Barring the top-level index (one entry per cached netfs), the index - hierarchy for each netfs is structured according the whim of the netfs. - -This API is declared in . - -This document contains the following sections: - - (1) Network filesystem definition - (2) Index definition - (3) Object definition - (4) Network filesystem (un)registration - (5) Cache tag lookup - (6) Index registration - (7) Data file registration - (8) Miscellaneous object registration - (9) Setting the data file size - (10) Page alloc/read/write - (11) Page uncaching - (12) Index and data file consistency - (13) Cookie enablement - (14) Miscellaneous cookie operations - (15) Cookie unregistration - (16) Index invalidation - (17) Data file invalidation - (18) FS-Cache specific page flags. - - -============================= -NETWORK FILESYSTEM DEFINITION -============================= - -FS-Cache needs a description of the network filesystem. This is specified -using a record of the following structure: - - struct fscache_netfs { - uint32_t version; - const char *name; - struct fscache_cookie *primary_index; - ... - }; - -This first two fields should be filled in before registration, and the third -will be filled in by the registration function; any other fields should just be -ignored and are for internal use only. - -The fields are: - - (1) The name of the netfs (used as the key in the toplevel index). - - (2) The version of the netfs (if the name matches but the version doesn't, the - entire in-cache hierarchy for this netfs will be scrapped and begun - afresh). - - (3) The cookie representing the primary index will be allocated according to - another parameter passed into the registration function. - -For example, kAFS (linux/fs/afs/) uses the following definitions to describe -itself: - - struct fscache_netfs afs_cache_netfs = { - .version = 0, - .name = "afs", - }; - - -================ -INDEX DEFINITION -================ - -Indices are used for two purposes: - - (1) To aid the finding of a file based on a series of keys (such as AFS's - "cell", "volume ID", "vnode ID"). - - (2) To make it easier to discard a subset of all the files cached based around - a particular key - for instance to mirror the removal of an AFS volume. - -However, since it's unlikely that any two netfs's are going to want to define -their index hierarchies in quite the same way, FS-Cache tries to impose as few -restraints as possible on how an index is structured and where it is placed in -the tree. The netfs can even mix indices and data files at the same level, but -it's not recommended. - -Each index entry consists of a key of indeterminate length plus some auxiliary -data, also of indeterminate length. - -There are some limits on indices: - - (1) Any index containing non-index objects should be restricted to a single - cache. Any such objects created within an index will be created in the - first cache only. The cache in which an index is created can be - controlled by cache tags (see below). - - (2) The entry data must be atomically journallable, so it is limited to about - 400 bytes at present. At least 400 bytes will be available. 
- - (3) The depth of the index tree should be judged with care as the search - function is recursive. Too many layers will run the kernel out of stack. - - -================= -OBJECT DEFINITION -================= - -To define an object, a structure of the following type should be filled out: - - struct fscache_cookie_def - { - uint8_t name[16]; - uint8_t type; - - struct fscache_cache_tag *(*select_cache)( - const void *parent_netfs_data, - const void *cookie_netfs_data); - - enum fscache_checkaux (*check_aux)(void *cookie_netfs_data, - const void *data, - uint16_t datalen, - loff_t object_size); - - void (*get_context)(void *cookie_netfs_data, void *context); - - void (*put_context)(void *cookie_netfs_data, void *context); - - void (*mark_pages_cached)(void *cookie_netfs_data, - struct address_space *mapping, - struct pagevec *cached_pvec); - }; - -This has the following fields: - - (1) The type of the object [mandatory]. - - This is one of the following values: - - (*) FSCACHE_COOKIE_TYPE_INDEX - - This defines an index, which is a special FS-Cache type. - - (*) FSCACHE_COOKIE_TYPE_DATAFILE - - This defines an ordinary data file. - - (*) Any other value between 2 and 255 - - This defines an extraordinary object such as an XATTR. - - (2) The name of the object type (NUL terminated unless all 16 chars are used) - [optional]. - - (3) A function to select the cache in which to store an index [optional]. - - This function is invoked when an index needs to be instantiated in a cache - during the instantiation of a non-index object. Only the immediate index - parent for the non-index object will be queried. Any indices above that - in the hierarchy may be stored in multiple caches. This function does not - need to be supplied for any non-index object or any index that will only - have index children. - - If this function is not supplied or if it returns NULL then the first - cache in the parent's list will be chosen, or failing that, the first - cache in the master list. - - (4) A function to check the auxiliary data [optional]. - - This function will be called to check that a match found in the cache for - this object is valid. For instance with AFS it could check the auxiliary - data against the data version number returned by the server to determine - whether the index entry in a cache is still valid. - - If this function is absent, it will be assumed that matching objects in a - cache are always valid. - - The function is also passed the cache's idea of the object size and may - use this to manage coherency also. - - If present, the function should return one of the following values: - - (*) FSCACHE_CHECKAUX_OKAY - the entry is okay as is - (*) FSCACHE_CHECKAUX_NEEDS_UPDATE - the entry requires update - (*) FSCACHE_CHECKAUX_OBSOLETE - the entry should be deleted - - This function can also be used to extract data from the auxiliary data in - the cache and copy it into the netfs's structures. - - (5) A pair of functions to manage contexts for the completion callback - [optional]. - - The cache read/write functions are passed a context which is then passed - to the I/O completion callback function. To ensure this context remains - valid until after the I/O completion is called, two functions may be - provided: one to get an extra reference on the context, and one to drop a - reference to it. - - If the context is not used or is a type of object that won't go out of - scope, then these functions are not required. These functions are not - required for indices as indices may not contain data. 
These functions may - be called in interrupt context and so may not sleep. - - (6) A function to mark a page as retaining cache metadata [optional]. - - This is called by the cache to indicate that it is retaining in-memory - information for this page and that the netfs should uncache the page when - it has finished. This does not indicate whether there's data on the disk - or not. Note that several pages at once may be presented for marking. - - The PG_fscache bit is set on the pages before this function would be - called, so the function need not be provided if this is sufficient. - - This function is not required for indices as they're not permitted data. - - (7) A function to unmark all the pages retaining cache metadata [mandatory]. - - This is called by FS-Cache to indicate that a backing store is being - unbound from a cookie and that all the marks on the pages should be - cleared to prevent confusion. Note that the cache will have torn down all - its tracking information so that the pages don't need to be explicitly - uncached. - - This function is not required for indices as they're not permitted data. - - -=================================== -NETWORK FILESYSTEM (UN)REGISTRATION -=================================== - -The first step is to declare the network filesystem to the cache. This also -involves specifying the layout of the primary index (for AFS, this would be the -"cell" level). - -The registration function is: - - int fscache_register_netfs(struct fscache_netfs *netfs); - -It just takes a pointer to the netfs definition. It returns 0 or an error as -appropriate. - -For kAFS, registration is done as follows: - - ret = fscache_register_netfs(&afs_cache_netfs); - -The last step is, of course, unregistration: - - void fscache_unregister_netfs(struct fscache_netfs *netfs); - - -================ -CACHE TAG LOOKUP -================ - -FS-Cache permits the use of more than one cache. To permit particular index -subtrees to be bound to particular caches, the second step is to look up cache -representation tags. This step is optional; it can be left entirely up to -FS-Cache as to which cache should be used. The problem with doing that is that -FS-Cache will always pick the first cache that was registered. - -To get the representation for a named tag: - - struct fscache_cache_tag *fscache_lookup_cache_tag(const char *name); - -This takes a text string as the name and returns a representation of a tag. It -will never return an error. It may return a dummy tag, however, if it runs out -of memory; this will inhibit caching with this tag. - -Any representation so obtained must be released by passing it to this function: - - void fscache_release_cache_tag(struct fscache_cache_tag *tag); - -The tag will be retrieved by FS-Cache when it calls the object definition -operation select_cache(). - - -================== -INDEX REGISTRATION -================== - -The third step is to inform FS-Cache about part of an index hierarchy that can -be used to locate files. This is done by requesting a cookie for each index in -the path to the file: - - struct fscache_cookie * - fscache_acquire_cookie(struct fscache_cookie *parent, - const struct fscache_object_def *def, - const void *index_key, - size_t index_key_len, - const void *aux_data, - size_t aux_data_len, - void *netfs_data, - loff_t object_size, - bool enable); - -This function creates an index entry in the index represented by parent, -filling in the index entry by calling the operations pointed to by def. 
- -A unique key that represents the object within the parent must be pointed to by -index_key and is of length index_key_len. - -An optional blob of auxiliary data that is to be stored within the cache can be -pointed to with aux_data and should be of length aux_data_len. This would -typically be used for storing coherency data. - -The netfs may pass an arbitrary value in netfs_data and this will be presented -to it in the event of any calling back. This may also be used in tracing or -logging of messages. - -The cache tracks the size of the data attached to an object and this set to be -object_size. For indices, this should be 0. This value will be passed to the -->check_aux() callback. - -Note that this function never returns an error - all errors are handled -internally. It may, however, return NULL to indicate no cookie. It is quite -acceptable to pass this token back to this function as the parent to another -acquisition (or even to the relinquish cookie, read page and write page -functions - see below). - -Note also that no indices are actually created in a cache until a non-index -object needs to be created somewhere down the hierarchy. Furthermore, an index -may be created in several different caches independently at different times. -This is all handled transparently, and the netfs doesn't see any of it. - -A cookie will be created in the disabled state if enabled is false. A cookie -must be enabled to do anything with it. A disabled cookie can be enabled by -calling fscache_enable_cookie() (see below). - -For example, with AFS, a cell would be added to the primary index. This index -entry would have a dependent inode containing volume mappings within this cell: - - cell->cache = - fscache_acquire_cookie(afs_cache_netfs.primary_index, - &afs_cell_cache_index_def, - cell->name, strlen(cell->name), - NULL, 0, - cell, 0, true); - -And then a particular volume could be added to that index by ID, creating -another index for vnodes (AFS inode equivalents): - - volume->cache = - fscache_acquire_cookie(volume->cell->cache, - &afs_volume_cache_index_def, - &volume->vid, sizeof(volume->vid), - NULL, 0, - volume, 0, true); - - -====================== -DATA FILE REGISTRATION -====================== - -The fourth step is to request a data file be created in the cache. This is -identical to index cookie acquisition. The only difference is that the type in -the object definition should be something other than index type. - - vnode->cache = - fscache_acquire_cookie(volume->cache, - &afs_vnode_cache_object_def, - &key, sizeof(key), - &aux, sizeof(aux), - vnode, vnode->status.size, true); - - -================================= -MISCELLANEOUS OBJECT REGISTRATION -================================= - -An optional step is to request an object of miscellaneous type be created in -the cache. This is almost identical to index cookie acquisition. The only -difference is that the type in the object definition should be something other -than index type. While the parent object could be an index, it's more likely -it would be some other type of object such as a data file. - - xattr->cache = - fscache_acquire_cookie(vnode->cache, - &afs_xattr_cache_object_def, - &xattr->name, strlen(xattr->name), - NULL, 0, - xattr, strlen(xattr->val), true); - -Miscellaneous objects might be used to store extended attributes or directory -entries for example. 
- - -========================== -SETTING THE DATA FILE SIZE -========================== - -The fifth step is to set the physical attributes of the file, such as its size. -This doesn't automatically reserve any space in the cache, but permits the -cache to adjust its metadata for data tracking appropriately: - - int fscache_attr_changed(struct fscache_cookie *cookie); - -The cache will return -ENOBUFS if there is no backing cache or if there is no -space to allocate any extra metadata required in the cache. - -Note that attempts to read or write data pages in the cache over this size may -be rebuffed with -ENOBUFS. - -This operation schedules an attribute adjustment to happen asynchronously at -some point in the future, and as such, it may happen after the function returns -to the caller. The attribute adjustment excludes read and write operations. - - -===================== -PAGE ALLOC/READ/WRITE -===================== - -And the sixth step is to store and retrieve pages in the cache. There are -three functions that are used to do this. - -Note: - - (1) A page should not be re-read or re-allocated without uncaching it first. - - (2) A read or allocated page must be uncached when the netfs page is released - from the pagecache. - - (3) A page should only be written to the cache if previous read or allocated. - -This permits the cache to maintain its page tracking in proper order. - - -PAGE READ ---------- - -Firstly, the netfs should ask FS-Cache to examine the caches and read the -contents cached for a particular page of a particular file if present, or else -allocate space to store the contents if not: - - typedef - void (*fscache_rw_complete_t)(struct page *page, - void *context, - int error); - - int fscache_read_or_alloc_page(struct fscache_cookie *cookie, - struct page *page, - fscache_rw_complete_t end_io_func, - void *context, - gfp_t gfp); - -The cookie argument must specify a cookie for an object that isn't an index, -the page specified will have the data loaded into it (and is also used to -specify the page number), and the gfp argument is used to control how any -memory allocations made are satisfied. - -If the cookie indicates the inode is not cached: - - (1) The function will return -ENOBUFS. - -Else if there's a copy of the page resident in the cache: - - (1) The mark_pages_cached() cookie operation will be called on that page. - - (2) The function will submit a request to read the data from the cache's - backing device directly into the page specified. - - (3) The function will return 0. - - (4) When the read is complete, end_io_func() will be invoked with: - - (*) The netfs data supplied when the cookie was created. - - (*) The page descriptor. - - (*) The context argument passed to the above function. This will be - maintained with the get_context/put_context functions mentioned above. - - (*) An argument that's 0 on success or negative for an error code. - - If an error occurs, it should be assumed that the page contains no usable - data. fscache_readpages_cancel() may need to be called. - - end_io_func() will be called in process context if the read is results in - an error, but it might be called in interrupt context if the read is - successful. - -Otherwise, if there's not a copy available in cache, but the cache may be able -to store the page: - - (1) The mark_pages_cached() cookie operation will be called on that page. - - (2) A block may be reserved in the cache and attached to the object at the - appropriate place. - - (3) The function will return -ENODATA. 
- -This function may also return -ENOMEM or -EINTR, in which case it won't have -read any data from the cache. - - -PAGE ALLOCATE -------------- - -Alternatively, if there's not expected to be any data in the cache for a page -because the file has been extended, a block can simply be allocated instead: - - int fscache_alloc_page(struct fscache_cookie *cookie, - struct page *page, - gfp_t gfp); - -This is similar to the fscache_read_or_alloc_page() function, except that it -never reads from the cache. It will return 0 if a block has been allocated, -rather than -ENODATA as the other would. One or the other must be performed -before writing to the cache. - -The mark_pages_cached() cookie operation will be called on the page if -successful. - - -PAGE WRITE ----------- - -Secondly, if the netfs changes the contents of the page (either due to an -initial download or if a user performs a write), then the page should be -written back to the cache: - - int fscache_write_page(struct fscache_cookie *cookie, - struct page *page, - loff_t object_size, - gfp_t gfp); - -The cookie argument must specify a data file cookie, the page specified should -contain the data to be written (and is also used to specify the page number), -object_size is the revised size of the object and the gfp argument is used to -control how any memory allocations made are satisfied. - -The page must have first been read or allocated successfully and must not have -been uncached before writing is performed. - -If the cookie indicates the inode is not cached then: - - (1) The function will return -ENOBUFS. - -Else if space can be allocated in the cache to hold this page: - - (1) PG_fscache_write will be set on the page. - - (2) The function will submit a request to write the data to cache's backing - device directly from the page specified. - - (3) The function will return 0. - - (4) When the write is complete PG_fscache_write is cleared on the page and - anyone waiting for that bit will be woken up. - -Else if there's no space available in the cache, -ENOBUFS will be returned. It -is also possible for the PG_fscache_write bit to be cleared when no write took -place if unforeseen circumstances arose (such as a disk error). - -Writing takes place asynchronously. - - -MULTIPLE PAGE READ ------------------- - -A facility is provided to read several pages at once, as requested by the -readpages() address space operation: - - int fscache_read_or_alloc_pages(struct fscache_cookie *cookie, - struct address_space *mapping, - struct list_head *pages, - int *nr_pages, - fscache_rw_complete_t end_io_func, - void *context, - gfp_t gfp); - -This works in a similar way to fscache_read_or_alloc_page(), except: - - (1) Any page it can retrieve data for is removed from pages and nr_pages and - dispatched for reading to the disk. Reads of adjacent pages on disk may - be merged for greater efficiency. - - (2) The mark_pages_cached() cookie operation will be called on several pages - at once if they're being read or allocated. - - (3) If there was an general error, then that error will be returned. - - Else if some pages couldn't be allocated or read, then -ENOBUFS will be - returned. - - Else if some pages couldn't be read but were allocated, then -ENODATA will - be returned. - - Otherwise, if all pages had reads dispatched, then 0 will be returned, the - list will be empty and *nr_pages will be 0. - - (4) end_io_func will be called once for each page being read as the reads - complete. 
It will be called in process context if error != 0, but it may - be called in interrupt context if there is no error. - -Note that a return of -ENODATA, -ENOBUFS or any other error does not preclude -some of the pages being read and some being allocated. Those pages will have -been marked appropriately and will need uncaching. - - -CANCELLATION OF UNREAD PAGES ----------------------------- - -If one or more pages are passed to fscache_read_or_alloc_pages() but not then -read from the cache and also not read from the underlying filesystem then -those pages will need to have any marks and reservations removed. This can be -done by calling: - - void fscache_readpages_cancel(struct fscache_cookie *cookie, - struct list_head *pages); - -prior to returning to the caller. The cookie argument should be as passed to -fscache_read_or_alloc_pages(). Every page in the pages list will be examined -and any that have PG_fscache set will be uncached. - - -============== -PAGE UNCACHING -============== - -To uncache a page, this function should be called: - - void fscache_uncache_page(struct fscache_cookie *cookie, - struct page *page); - -This function permits the cache to release any in-memory representation it -might be holding for this netfs page. This function must be called once for -each page on which the read or write page functions above have been called to -make sure the cache's in-memory tracking information gets torn down. - -Note that pages can't be explicitly deleted from the a data file. The whole -data file must be retired (see the relinquish cookie function below). - -Furthermore, note that this does not cancel the asynchronous read or write -operation started by the read/alloc and write functions, so the page -invalidation functions must use: - - bool fscache_check_page_write(struct fscache_cookie *cookie, - struct page *page); - -to see if a page is being written to the cache, and: - - void fscache_wait_on_page_write(struct fscache_cookie *cookie, - struct page *page); - -to wait for it to finish if it is. - - -When releasepage() is being implemented, a special FS-Cache function exists to -manage the heuristics of coping with vmscan trying to eject pages, which may -conflict with the cache trying to write pages to the cache (which may itself -need to allocate memory): - - bool fscache_maybe_release_page(struct fscache_cookie *cookie, - struct page *page, - gfp_t gfp); - -This takes the netfs cookie, and the page and gfp arguments as supplied to -releasepage(). It will return false if the page cannot be released yet for -some reason and if it returns true, the page has been uncached and can now be -released. - -To make a page available for release, this function may wait for an outstanding -storage request to complete, or it may attempt to cancel the storage request - -in which case the page will not be stored in the cache this time. - - -BULK INODE PAGE UNCACHE ------------------------ - -A convenience routine is provided to perform an uncache on all the pages -attached to an inode. This assumes that the pages on the inode correspond on a -1:1 basis with the pages in the cache. - - void fscache_uncache_all_inode_pages(struct fscache_cookie *cookie, - struct inode *inode); - -This takes the netfs cookie that the pages were cached with and the inode that -the pages are attached to. This function will wait for pages to finish being -written to the cache and for the cache to finish with the page generally. No -error is returned. 
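
Putting the uncaching pieces above together, a netfs's releasepage()
implementation might look something like this sketch; the myfs_vnode structure
and the MYFS_I() inode accessor are illustrative assumptions only:

	static int myfs_releasepage(struct page *page, gfp_t gfp)
	{
		struct myfs_vnode *vnode = MYFS_I(page->mapping->host);

		if (PageFsCache(page))
			/* Returns false if the cache still needs the page,
			 * e.g. because a write to the cache is in flight. */
			return fscache_maybe_release_page(vnode->cache,
							  page, gfp);

		return 1;	/* nothing prevents release */
	}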
- - -=============================== -INDEX AND DATA FILE CONSISTENCY -=============================== - -To find out whether auxiliary data for an object is up to data within the -cache, the following function can be called: - - int fscache_check_consistency(struct fscache_cookie *cookie, - const void *aux_data); - -This will call back to the netfs to check whether the auxiliary data associated -with a cookie is correct; if aux_data is non-NULL, it will update the auxiliary -data buffer first. It returns 0 if it is and -ESTALE if it isn't; it may also -return -ENOMEM and -ERESTARTSYS. - -To request an update of the index data for an index or other object, the -following function should be called: - - void fscache_update_cookie(struct fscache_cookie *cookie, - const void *aux_data); - -This function will update the cookie's auxiliary data buffer from aux_data if -that is non-NULL and then schedule this to be stored on disk. The update -method in the parent index definition will be called to transfer the data. - -Note that partial updates may happen automatically at other times, such as when -data blocks are added to a data file object. - - -================= -COOKIE ENABLEMENT -================= - -Cookies exist in one of two states: enabled and disabled. If a cookie is -disabled, it ignores all attempts to acquire child cookies; check, update or -invalidate its state; allocate, read or write backing pages - though it is -still possible to uncache pages and relinquish the cookie. - -The initial enablement state is set by fscache_acquire_cookie(), but the cookie -can be enabled or disabled later. To disable a cookie, call: - - void fscache_disable_cookie(struct fscache_cookie *cookie, - const void *aux_data, - bool invalidate); - -If the cookie is not already disabled, this locks the cookie against other -enable and disable ops, marks the cookie as being disabled, discards or -invalidates any backing objects and waits for cessation of activity on any -associated object before unlocking the cookie. - -All possible failures are handled internally. The caller should consider -calling fscache_uncache_all_inode_pages() afterwards to make sure all page -markings are cleared up. - -Cookies can be enabled or reenabled with: - - void fscache_enable_cookie(struct fscache_cookie *cookie, - const void *aux_data, - loff_t object_size, - bool (*can_enable)(void *data), - void *data) - -If the cookie is not already enabled, this locks the cookie against other -enable and disable ops, invokes can_enable() and, if the cookie is not an index -cookie, will begin the procedure of acquiring backing objects. - -The optional can_enable() function is passed the data argument and returns a -ruling as to whether or not enablement should actually be permitted to begin. - -All possible failures are handled internally. The cookie will only be marked -as enabled if provisional backing objects are allocated. - -The object's data size is updated from object_size and is passed to the -->check_aux() function. - -In both cases, the cookie's auxiliary data buffer is updated from aux_data if -that is non-NULL inside the enablement lock before proceeding. 
- - -=============================== -MISCELLANEOUS COOKIE OPERATIONS -=============================== - -There are a number of operations that can be used to control cookies: - - (*) Cookie pinning: - - int fscache_pin_cookie(struct fscache_cookie *cookie); - void fscache_unpin_cookie(struct fscache_cookie *cookie); - - These operations permit data cookies to be pinned into the cache and to - have the pinning removed. They are not permitted on index cookies. - - The pinning function will return 0 if successful, -ENOBUFS in the cookie - isn't backed by a cache, -EOPNOTSUPP if the cache doesn't support pinning, - -ENOSPC if there isn't enough space to honour the operation, -ENOMEM or - -EIO if there's any other problem. - - (*) Data space reservation: - - int fscache_reserve_space(struct fscache_cookie *cookie, loff_t size); - - This permits a netfs to request cache space be reserved to store up to the - given amount of a file. It is permitted to ask for more than the current - size of the file to allow for future file expansion. - - If size is given as zero then the reservation will be cancelled. - - The function will return 0 if successful, -ENOBUFS in the cookie isn't - backed by a cache, -EOPNOTSUPP if the cache doesn't support reservations, - -ENOSPC if there isn't enough space to honour the operation, -ENOMEM or - -EIO if there's any other problem. - - Note that this doesn't pin an object in a cache; it can still be culled to - make space if it's not in use. - - -===================== -COOKIE UNREGISTRATION -===================== - -To get rid of a cookie, this function should be called. - - void fscache_relinquish_cookie(struct fscache_cookie *cookie, - const void *aux_data, - bool retire); - -If retire is non-zero, then the object will be marked for recycling, and all -copies of it will be removed from all active caches in which it is present. -Not only that but all child objects will also be retired. - -If retire is zero, then the object may be available again when next the -acquisition function is called. Retirement here will overrule the pinning on a -cookie. - -The cookie's auxiliary data will be updated from aux_data if that is non-NULL -so that the cache can lazily update it on disk. - -One very important note - relinquish must NOT be called for a cookie unless all -the cookies for "child" indices, objects and pages have been relinquished -first. - - -================== -INDEX INVALIDATION -================== - -There is no direct way to invalidate an index subtree. To do this, the caller -should relinquish and retire the cookie they have, and then acquire a new one. - - -====================== -DATA FILE INVALIDATION -====================== - -Sometimes it will be necessary to invalidate an object that contains data. -Typically this will be necessary when the server tells the netfs of a foreign -change - at which point the netfs has to throw away all the state it had for an -inode and reload from the server. - -To indicate that a cache object should be invalidated, the following function -can be called: - - void fscache_invalidate(struct fscache_cookie *cookie); - -This can be called with spinlocks held as it defers the work to a thread pool. -All extant storage, retrieval and attribute change ops at this point are -cancelled and discarded. Some future operations will be rejected until the -cache has had a chance to insert a barrier in the operations queue. After -that, operations will be queued again behind the invalidation operation. 
- -The invalidation operation will perform an attribute change operation and an -auxiliary data update operation as it is very likely these will have changed. - -Using the following function, the netfs can wait for the invalidation operation -to have reached a point at which it can start submitting ordinary operations -once again: - - void fscache_wait_on_invalidate(struct fscache_cookie *cookie); - - -=========================== -FS-CACHE SPECIFIC PAGE FLAG -=========================== - -FS-Cache makes use of a page flag, PG_private_2, for its own purpose. This is -given the alternative name PG_fscache. - -PG_fscache is used to indicate that the page is known by the cache, and that -the cache must be informed if the page is going to go away. It's an indication -to the netfs that the cache has an interest in this page, where an interest may -be a pointer to it, resources allocated or reserved for it, or I/O in progress -upon it. - -The netfs can use this information in methods such as releasepage() to -determine whether it needs to uncache a page or update it. - -Furthermore, if this bit is set, releasepage() and invalidatepage() operations -will be called on a page to get rid of it, even if PG_private is not set. This -allows caching to attempted on a page before read_cache_pages() to be called -after fscache_read_or_alloc_pages() as the former will try and release pages it -was given under certain circumstances. - -This bit does not overlap with such as PG_private. This means that FS-Cache -can be used with a filesystem that uses the block buffering code. - -There are a number of operations defined on this flag: - - int PageFsCache(struct page *page); - void SetPageFsCache(struct page *page) - void ClearPageFsCache(struct page *page) - int TestSetPageFsCache(struct page *page) - int TestClearPageFsCache(struct page *page) - -These functions are bit test, bit set, bit clear, bit test and set and bit -test and clear operations on PG_fscache. diff --git a/fs/fscache/cookie.c b/fs/fscache/cookie.c index 0ce39658a620..751bc5b1cddf 100644 --- a/fs/fscache/cookie.c +++ b/fs/fscache/cookie.c @@ -4,7 +4,7 @@ * Copyright (C) 2004-2007 Red Hat, Inc. All Rights Reserved. * Written by David Howells (dhowells@redhat.com) * - * See Documentation/filesystems/caching/netfs-api.txt for more information on + * See Documentation/filesystems/caching/netfs-api.rst for more information on * the netfs API. */ diff --git a/include/linux/fscache.h b/include/linux/fscache.h index ad044c0cb1f3..a1c928fe98e7 100644 --- a/include/linux/fscache.h +++ b/include/linux/fscache.h @@ -6,7 +6,7 @@ * * NOTE!!! See: * - * Documentation/filesystems/caching/netfs-api.txt + * Documentation/filesystems/caching/netfs-api.rst * * for a description of the network filesystem interface declared here. */ @@ -233,7 +233,7 @@ extern void __fscache_enable_cookie(struct fscache_cookie *, const void *, loff_ * * Register a filesystem as desiring caching services if they're available. * - * See Documentation/filesystems/caching/netfs-api.txt for a complete + * See Documentation/filesystems/caching/netfs-api.rst for a complete * description. */ static inline @@ -253,7 +253,7 @@ int fscache_register_netfs(struct fscache_netfs *netfs) * Indicate that a filesystem no longer desires caching services for the * moment. * - * See Documentation/filesystems/caching/netfs-api.txt for a complete + * See Documentation/filesystems/caching/netfs-api.rst for a complete * description. 
*/ static inline @@ -270,7 +270,7 @@ void fscache_unregister_netfs(struct fscache_netfs *netfs) * Acquire a specific cache referral tag that can be used to select a specific * cache in which to cache an index. * - * See Documentation/filesystems/caching/netfs-api.txt for a complete + * See Documentation/filesystems/caching/netfs-api.rst for a complete * description. */ static inline @@ -288,7 +288,7 @@ struct fscache_cache_tag *fscache_lookup_cache_tag(const char *name) * * Release a reference to a cache referral tag previously looked up. * - * See Documentation/filesystems/caching/netfs-api.txt for a complete + * See Documentation/filesystems/caching/netfs-api.rst for a complete * description. */ static inline @@ -315,7 +315,7 @@ void fscache_release_cache_tag(struct fscache_cache_tag *tag) * that can be used to locate files. This is done by requesting a cookie for * each index in the path to the file. * - * See Documentation/filesystems/caching/netfs-api.txt for a complete + * See Documentation/filesystems/caching/netfs-api.rst for a complete * description. */ static inline @@ -351,7 +351,7 @@ struct fscache_cookie *fscache_acquire_cookie( * provided to update the auxiliary data in the cache before the object is * disconnected. * - * See Documentation/filesystems/caching/netfs-api.txt for a complete + * See Documentation/filesystems/caching/netfs-api.rst for a complete * description. */ static inline @@ -394,7 +394,7 @@ int fscache_check_consistency(struct fscache_cookie *cookie, * cookie. The auxiliary data on the cookie will be updated first if @aux_data * is set. * - * See Documentation/filesystems/caching/netfs-api.txt for a complete + * See Documentation/filesystems/caching/netfs-api.rst for a complete * description. */ static inline @@ -410,7 +410,7 @@ void fscache_update_cookie(struct fscache_cookie *cookie, const void *aux_data) * * Permit data-storage cache objects to be pinned in the cache. * - * See Documentation/filesystems/caching/netfs-api.txt for a complete + * See Documentation/filesystems/caching/netfs-api.rst for a complete * description. */ static inline @@ -425,7 +425,7 @@ int fscache_pin_cookie(struct fscache_cookie *cookie) * * Permit data-storage cache objects to be unpinned from the cache. * - * See Documentation/filesystems/caching/netfs-api.txt for a complete + * See Documentation/filesystems/caching/netfs-api.rst for a complete * description. */ static inline @@ -441,7 +441,7 @@ void fscache_unpin_cookie(struct fscache_cookie *cookie) * changed. This includes the data size. These attributes will be obtained * through the get_attr() cookie definition op. * - * See Documentation/filesystems/caching/netfs-api.txt for a complete + * See Documentation/filesystems/caching/netfs-api.rst for a complete * description. */ static inline @@ -463,7 +463,7 @@ int fscache_attr_changed(struct fscache_cookie *cookie) * * This can be called with spinlocks held. * - * See Documentation/filesystems/caching/netfs-api.txt for a complete + * See Documentation/filesystems/caching/netfs-api.rst for a complete * description. */ static inline @@ -479,7 +479,7 @@ void fscache_invalidate(struct fscache_cookie *cookie) * * Wait for the invalidation of an object to complete. * - * See Documentation/filesystems/caching/netfs-api.txt for a complete + * See Documentation/filesystems/caching/netfs-api.rst for a complete * description. 
*/ static inline @@ -498,7 +498,7 @@ void fscache_wait_on_invalidate(struct fscache_cookie *cookie) * cookie so that a write to that object within the space can always be * honoured. * - * See Documentation/filesystems/caching/netfs-api.txt for a complete + * See Documentation/filesystems/caching/netfs-api.rst for a complete * description. */ static inline @@ -533,7 +533,7 @@ int fscache_reserve_space(struct fscache_cookie *cookie, loff_t size) * Else, if the page is unbacked, -ENODATA is returned and a block may have * been allocated in the cache. * - * See Documentation/filesystems/caching/netfs-api.txt for a complete + * See Documentation/filesystems/caching/netfs-api.rst for a complete * description. */ static inline @@ -582,7 +582,7 @@ int fscache_read_or_alloc_page(struct fscache_cookie *cookie, * regard to different pages, the return values are prioritised in that order. * Any pages submitted for reading are removed from the pages list. * - * See Documentation/filesystems/caching/netfs-api.txt for a complete + * See Documentation/filesystems/caching/netfs-api.rst for a complete * description. */ static inline @@ -617,7 +617,7 @@ int fscache_read_or_alloc_pages(struct fscache_cookie *cookie, * Else, a block will be allocated if one wasn't already, and 0 will be * returned * - * See Documentation/filesystems/caching/netfs-api.txt for a complete + * See Documentation/filesystems/caching/netfs-api.rst for a complete * description. */ static inline @@ -667,7 +667,7 @@ void fscache_readpages_cancel(struct fscache_cookie *cookie, * be cleared at the completion of the write to indicate the success or failure * of the operation. Note that the completion may happen before the return. * - * See Documentation/filesystems/caching/netfs-api.txt for a complete + * See Documentation/filesystems/caching/netfs-api.rst for a complete * description. */ static inline @@ -693,7 +693,7 @@ int fscache_write_page(struct fscache_cookie *cookie, * Note that this cannot cancel any outstanding I/O operations between this * page and the cache. * - * See Documentation/filesystems/caching/netfs-api.txt for a complete + * See Documentation/filesystems/caching/netfs-api.rst for a complete * description. */ static inline @@ -711,7 +711,7 @@ void fscache_uncache_page(struct fscache_cookie *cookie, * * Ask the cache if a page is being written to the cache. * - * See Documentation/filesystems/caching/netfs-api.txt for a complete + * See Documentation/filesystems/caching/netfs-api.rst for a complete * description. */ static inline @@ -731,7 +731,7 @@ bool fscache_check_page_write(struct fscache_cookie *cookie, * Ask the cache to wake us up when a page is no longer being written to the * cache. * - * See Documentation/filesystems/caching/netfs-api.txt for a complete + * See Documentation/filesystems/caching/netfs-api.rst for a complete * description. */ static inline -- cgit v1.2.3 From 0e822145b564204cd5e9dd67a7fd37d4a7b8253b Mon Sep 17 00:00:00 2001 From: Mauro Carvalho Chehab Date: Mon, 27 Apr 2020 23:16:58 +0200 Subject: docs: filesystems: caching/backend-api.txt: convert it to ReST - Add a SPDX header; - Adjust document and section titles; - Some whitespace fixes and new line breaks; - Mark literal blocks as such; - Add table markups; - Add it to filesystems/caching/index.rst. 
Signed-off-by: Mauro Carvalho Chehab Link: https://lore.kernel.org/r/5d0a61abaa87bfe913b9e2f321e74ef7af0f3dfc.1588021877.git.mchehab+huawei@kernel.org Signed-off-by: Jonathan Corbet --- Documentation/filesystems/caching/backend-api.rst | 727 ++++++++++++++++++++++ Documentation/filesystems/caching/backend-api.txt | 726 --------------------- Documentation/filesystems/caching/fscache.rst | 2 +- Documentation/filesystems/caching/index.rst | 1 + fs/fscache/cache.c | 8 +- fs/fscache/object.c | 2 +- include/linux/fscache-cache.h | 4 +- 7 files changed, 736 insertions(+), 734 deletions(-) create mode 100644 Documentation/filesystems/caching/backend-api.rst delete mode 100644 Documentation/filesystems/caching/backend-api.txt (limited to 'include/linux') diff --git a/Documentation/filesystems/caching/backend-api.rst b/Documentation/filesystems/caching/backend-api.rst new file mode 100644 index 000000000000..19fbf6b9aa36 --- /dev/null +++ b/Documentation/filesystems/caching/backend-api.rst @@ -0,0 +1,727 @@ +.. SPDX-License-Identifier: GPL-2.0 + +========================== +FS-Cache Cache backend API +========================== + +The FS-Cache system provides an API by which actual caches can be supplied to +FS-Cache for it to then serve out to network filesystems and other interested +parties. + +This API is declared in . + + +Initialising and Registering a Cache +==================================== + +To start off, a cache definition must be initialised and registered for each +cache the backend wants to make available. For instance, CacheFS does this in +the fill_super() operation on mounting. + +The cache definition (struct fscache_cache) should be initialised by calling:: + + void fscache_init_cache(struct fscache_cache *cache, + struct fscache_cache_ops *ops, + const char *idfmt, + ...); + +Where: + + * "cache" is a pointer to the cache definition; + + * "ops" is a pointer to the table of operations that the backend supports on + this cache; and + + * "idfmt" is a format and printf-style arguments for constructing a label + for the cache. + + +The cache should then be registered with FS-Cache by passing a pointer to the +previously initialised cache definition to:: + + int fscache_add_cache(struct fscache_cache *cache, + struct fscache_object *fsdef, + const char *tagname); + +Two extra arguments should also be supplied: + + * "fsdef" which should point to the object representation for the FS-Cache + master index in this cache. Netfs primary index entries will be created + here. FS-Cache keeps the caller's reference to the index object if + successful and will release it upon withdrawal of the cache. + + * "tagname" which, if given, should be a text string naming this cache. If + this is NULL, the identifier will be used instead. For CacheFS, the + identifier is set to name the underlying block device and the tag can be + supplied by mount. + +This function may return -ENOMEM if it ran out of memory or -EEXIST if the tag +is already in use. 0 will be returned on success. + + +Unregistering a Cache +===================== + +A cache can be withdrawn from the system by calling this function with a +pointer to the cache definition:: + + void fscache_withdraw_cache(struct fscache_cache *cache); + +In CacheFS's case, this is called by put_super(). 
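+For illustration, a backend might wire the two calls together roughly as
+follows.  The mycache names and the device identifier are assumptions made up
+for this sketch, not part of the API::
+
+	#include <linux/fscache-cache.h>
+
+	/* The methods themselves are described in "Cache Operations" below. */
+	static struct fscache_cache_ops mycache_cache_ops = {
+		.name = "mycache",
+		/* ... mandatory operations ... */
+	};
+
+	static struct fscache_cache mycache_cache;
+
+	static int mycache_register(struct fscache_object *fsdef_object)
+	{
+		/* Label the cache; the arguments are printf-style. */
+		fscache_init_cache(&mycache_cache, &mycache_cache_ops,
+				   "mycache-%s", "vda1");
+
+		/* fsdef_object backs the FS-Cache master index in this
+		 * cache; passing a NULL tag would make FS-Cache fall back
+		 * to the identifier built above. */
+		return fscache_add_cache(&mycache_cache, fsdef_object,
+					 "mytag");
+	}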
+
+
+Security
+========
+
+The cache methods are executed in one of two contexts:
+
+ (1) that of the userspace process that issued the netfs operation that caused
+     the cache method to be invoked, or
+
+ (2) that of one of the processes in the FS-Cache thread pool.
+
+In either case, this may not be an appropriate context in which to access the
+cache.
+
+The calling process's fsuid, fsgid and SELinux security identities may need to
+be masqueraded for the duration of the cache driver's access to the cache.
+This is left to the cache to handle; FS-Cache makes no effort in this regard.
+
+
+Control and Statistics Presentation
+===================================
+
+The cache may present data to the outside world through FS-Cache's interfaces
+in sysfs and procfs - the former for control and the latter for statistics.
+
+A sysfs directory called /sys/fs/fscache/<cachetag>/ is created if CONFIG_SYSFS
+is enabled.  This is accessible through the kobject struct fscache_cache::kobj
+and is for use by the cache as it sees fit.
+
+
+Relevant Data Structures
+========================
+
+ * Index/Data file FS-Cache representation cookie::
+
+	struct fscache_cookie {
+		struct fscache_object_def *def;
+		struct fscache_netfs *netfs;
+		void *netfs_data;
+		...
+	};
+
+   The fields that might be of use to the backend describe the object
+   definition, the netfs definition and the netfs's data for this cookie.
+   The object definition contains functions supplied by the netfs for loading
+   and matching index entries; these are required to provide some of the
+   cache operations.
+
+
+ * In-cache object representation::
+
+	struct fscache_object {
+		int debug_id;
+		enum {
+			FSCACHE_OBJECT_RECYCLING,
+			...
+		} state;
+		spinlock_t lock;
+		struct fscache_cache *cache;
+		struct fscache_cookie *cookie;
+		...
+	};
+
+   Structures of this type should be allocated by the cache backend and
+   passed to FS-Cache when requested by the appropriate cache operation.  In
+   the case of CacheFS, they're embedded in CacheFS's internal object
+   structures.
+
+   The debug_id is a simple integer that can be used in debugging messages
+   that refer to a particular object.  In such a case it should be printed
+   using "OBJ%x" to be consistent with FS-Cache.
+
+   Each object contains a pointer to the cookie that represents the object it
+   is backing.  An object should be retired when put_object() is called if it
+   is in state FSCACHE_OBJECT_RECYCLING.  The fscache_object struct should be
+   initialised by calling fscache_object_init(object).
+
+
+ * FS-Cache operation record::
+
+	struct fscache_operation {
+		atomic_t usage;
+		struct fscache_object *object;
+		unsigned long flags;
+	#define FSCACHE_OP_EXCLUSIVE
+		void (*processor)(struct fscache_operation *op);
+		void (*release)(struct fscache_operation *op);
+		...
+	};
+
+   FS-Cache has a pool of threads that it uses to give CPU time to the
+   various asynchronous operations that need to be done as part of driving
+   the cache.  These are represented by the above structure.  The processor
+   method is called to give the op CPU time, and the release method to get
+   rid of it when its usage count reaches 0.
+
+   An operation can be made exclusive upon an object by setting the
+   appropriate flag before enqueuing it with fscache_enqueue_operation().  If
+   an operation needs more processing time, it should be enqueued again.
+
+
+ * FS-Cache retrieval operation record::
+
+	struct fscache_retrieval {
+		struct fscache_operation op;
+		struct address_space *mapping;
+		struct list_head *to_do;
+		...
+ }; + + A structure of this type is allocated by FS-Cache to record retrieval and + allocation requests made by the netfs. This struct is then passed to the + backend to do the operation. The backend may get extra refs to it by + calling fscache_get_retrieval() and refs may be discarded by calling + fscache_put_retrieval(). + + A retrieval operation can be used by the backend to do retrieval work. To + do this, the retrieval->op.processor method pointer should be set + appropriately by the backend and fscache_enqueue_retrieval() called to + submit it to the thread pool. CacheFiles, for example, uses this to queue + page examination when it detects PG_lock being cleared. + + The to_do field is an empty list available for the cache backend to use as + it sees fit. + + + * FS-Cache storage operation record:: + + struct fscache_storage { + struct fscache_operation op; + pgoff_t store_limit; + ... + }; + + A structure of this type is allocated by FS-Cache to record outstanding + writes to be made. FS-Cache itself enqueues this operation and invokes + the write_page() method on the object at appropriate times to effect + storage. + + +Cache Operations +================ + +The cache backend provides FS-Cache with a table of operations that can be +performed on the denizens of the cache. These are held in a structure of type: + + :: + + struct fscache_cache_ops + + * Name of cache provider [mandatory]:: + + const char *name + + This isn't strictly an operation, but should be pointed at a string naming + the backend. + + + * Allocate a new object [mandatory]:: + + struct fscache_object *(*alloc_object)(struct fscache_cache *cache, + struct fscache_cookie *cookie) + + This method is used to allocate a cache object representation to back a + cookie in a particular cache. fscache_object_init() should be called on + the object to initialise it prior to returning. + + This function may also be used to parse the index key to be used for + multiple lookup calls to turn it into a more convenient form. FS-Cache + will call the lookup_complete() method to allow the cache to release the + form once lookup is complete or aborted. + + + * Look up and create object [mandatory]:: + + void (*lookup_object)(struct fscache_object *object) + + This method is used to look up an object, given that the object is already + allocated and attached to the cookie. This should instantiate that object + in the cache if it can. + + The method should call fscache_object_lookup_negative() as soon as + possible if it determines the object doesn't exist in the cache. If the + object is found to exist and the netfs indicates that it is valid then + fscache_obtained_object() should be called once the object is in a + position to have data stored in it. Similarly, fscache_obtained_object() + should also be called once a non-present object has been created. + + If a lookup error occurs, fscache_object_lookup_error() should be called + to abort the lookup of that object. + + + * Release lookup data [mandatory]:: + + void (*lookup_complete)(struct fscache_object *object) + + This method is called to ask the cache to release any resources it was + using to perform a lookup. + + + * Increment object refcount [mandatory]:: + + struct fscache_object *(*grab_object)(struct fscache_object *object) + + This method is called to increment the reference count on an object. It + may fail (for instance if the cache is being withdrawn) by returning NULL. + It should return the object pointer if successful. 
+ + + * Lock/Unlock object [mandatory]:: + + void (*lock_object)(struct fscache_object *object) + void (*unlock_object)(struct fscache_object *object) + + These methods are used to exclusively lock an object. It must be possible + to schedule with the lock held, so a spinlock isn't sufficient. + + + * Pin/Unpin object [optional]:: + + int (*pin_object)(struct fscache_object *object) + void (*unpin_object)(struct fscache_object *object) + + These methods are used to pin an object into the cache. Once pinned an + object cannot be reclaimed to make space. Return -ENOSPC if there's not + enough space in the cache to permit this. + + + * Check coherency state of an object [mandatory]:: + + int (*check_consistency)(struct fscache_object *object) + + This method is called to have the cache check the saved auxiliary data of + the object against the netfs's idea of the state. 0 should be returned + if they're consistent and -ESTALE otherwise. -ENOMEM and -ERESTARTSYS + may also be returned. + + * Update object [mandatory]:: + + int (*update_object)(struct fscache_object *object) + + This is called to update the index entry for the specified object. The + new information should be in object->cookie->netfs_data. This can be + obtained by calling object->cookie->def->get_aux()/get_attr(). + + + * Invalidate data object [mandatory]:: + + int (*invalidate_object)(struct fscache_operation *op) + + This is called to invalidate a data object (as pointed to by op->object). + All the data stored for this object should be discarded and an + attr_changed operation should be performed. The caller will follow up + with an object update operation. + + fscache_op_complete() must be called on op before returning. + + + * Discard object [mandatory]:: + + void (*drop_object)(struct fscache_object *object) + + This method is called to indicate that an object has been unbound from its + cookie, and that the cache should release the object's resources and + retire it if it's in state FSCACHE_OBJECT_RECYCLING. + + This method should not attempt to release any references held by the + caller. The caller will invoke the put_object() method as appropriate. + + + * Release object reference [mandatory]:: + + void (*put_object)(struct fscache_object *object) + + This method is used to discard a reference to an object. The object may + be freed when all the references to it are released. + + + * Synchronise a cache [mandatory]:: + + void (*sync)(struct fscache_cache *cache) + + This is called to ask the backend to synchronise a cache with its backing + device. + + + * Dissociate a cache [mandatory]:: + + void (*dissociate_pages)(struct fscache_cache *cache) + + This is called to ask a cache to perform any page dissociations as part of + cache withdrawal. + + + * Notification that the attributes on a netfs file changed [mandatory]:: + + int (*attr_changed)(struct fscache_object *object); + + This is called to indicate to the cache that certain attributes on a netfs + file have changed (for example the maximum size a file may reach). The + cache can read these from the netfs by calling the cookie's get_attr() + method. + + The cache may use the file size information to reserve space on the cache. + It should also call fscache_set_store_limit() to indicate to FS-Cache the + highest byte it's willing to store for an object. + + This method may return -ve if an error occurred or the cache object cannot + be expanded. In such a case, the object will be withdrawn from service. 
+
+   This operation is run asynchronously from FS-Cache's thread pool, and
+   storage and retrieval operations from the netfs are excluded during the
+   execution of this operation.
+
+
+ * Reserve cache space for an object's data [optional]::
+
+	int (*reserve_space)(struct fscache_object *object, loff_t size);
+
+   This is called to request that cache space be reserved to hold the data
+   for an object and the metadata used to track it.  Zero size should be
+   taken as a request to cancel a reservation.
+
+   This should return 0 if successful, -ENOSPC if there isn't enough space
+   available, or -ENOMEM or -EIO on other errors.
+
+   The reservation may exceed the current size of the object, thus permitting
+   future expansion.  If the amount of space consumed by an object would
+   exceed the reservation, it's permitted to refuse requests to allocate
+   pages, but not required.  An object may be pruned down to its reservation
+   size if larger than that already.
+
+
+ * Request page be read from cache [mandatory]::
+
+	int (*read_or_alloc_page)(struct fscache_retrieval *op,
+				  struct page *page,
+				  gfp_t gfp)
+
+   This is called to attempt to read a netfs page from the cache, or to
+   reserve a backing block if there is no data.  FS-Cache will have done as
+   much checking as it can before calling, but most of the work belongs to
+   the backend.
+
+   If there's no page in the cache, then -ENODATA should be returned if the
+   backend managed to reserve a backing block; -ENOBUFS or -ENOMEM if it
+   didn't.
+
+   If there is suitable data in the cache, then a read operation should be
+   queued and 0 returned.  When the read finishes, fscache_end_io() should be
+   called.
+
+   fscache_mark_pages_cached() should be called for the page if any cache
+   metadata is retained.  This will indicate to the netfs that the page needs
+   explicit uncaching.  This operation takes a pagevec, thus allowing several
+   pages to be marked at once.
+
+   The retrieval record pointed to by op should be retained for each page
+   queued and released when I/O on the page has been formally ended.
+   fscache_get/put_retrieval() are available for this purpose.
+
+   The retrieval record may be used to get CPU time via the FS-Cache thread
+   pool.  If this is desired, op->op.processor should be set to point to
+   the appropriate processing routine, and fscache_enqueue_retrieval() should
+   be called at an appropriate point to request CPU time.  For instance, the
+   retrieval routine could be enqueued upon the completion of a disk read.
+   The to_do field in the retrieval record is provided to aid in this.
+
+   If an I/O error occurs, fscache_io_error() should be called and -ENOBUFS
+   returned if possible or fscache_end_io() called with a suitable error
+   code.
+
+   fscache_put_retrieval() should be called after a page or pages are dealt
+   with.  This will complete the operation when all pages are dealt with.
+
+
+ * Request pages be read from cache [mandatory]::
+
+	int (*read_or_alloc_pages)(struct fscache_retrieval *op,
+				   struct list_head *pages,
+				   unsigned *nr_pages,
+				   gfp_t gfp)
+
+   This is like the read_or_alloc_page() method, except it is handed a list
+   of pages instead of one page.  Any pages on which a read operation is
+   started must be added to the page cache for the specified mapping and also
+   to the LRU.  Such pages must also be removed from the pages list and
+   ``*nr_pages`` decremented per page.
+
+   If there was an error such as -ENOMEM, then that should be returned; else
+   if one or more pages couldn't be read or allocated, then -ENOBUFS should
+   be returned; else if one or more pages couldn't be read, then -ENODATA
+   should be returned.  If all the pages are dispatched then 0 should be
+   returned.
+
+
+ * Request page be allocated in the cache [mandatory]::
+
+	int (*allocate_page)(struct fscache_retrieval *op,
+			     struct page *page,
+			     gfp_t gfp)
+
+   This is like the read_or_alloc_page() method, except that it shouldn't
+   read from the cache, even if there's data there that could be retrieved.
+   It should, however, set up any internal metadata required such that
+   the write_page() method can write to the cache.
+
+   If there's no backing block available, then -ENOBUFS should be returned
+   (or -ENOMEM if there were other problems).  If a block is successfully
+   allocated, then the netfs page should be marked and 0 returned.
+
+
+ * Request pages be allocated in the cache [mandatory]::
+
+	int (*allocate_pages)(struct fscache_retrieval *op,
+			      struct list_head *pages,
+			      unsigned *nr_pages,
+			      gfp_t gfp)
+
+   This is a multiple-page version of the allocate_page() method.  pages and
+   nr_pages should be treated as for the read_or_alloc_pages() method.
+
+
+ * Request page be written to cache [mandatory]::
+
+	int (*write_page)(struct fscache_storage *op,
+			  struct page *page);
+
+   This is called to write from a page on which there was a previously
+   successful read_or_alloc_page() call or similar.  FS-Cache filters out
+   pages that don't have mappings.
+
+   This method is called asynchronously from the FS-Cache thread pool.  It is
+   not required to actually store anything, provided -ENODATA is then
+   returned to the next read of this page.
+
+   If an error occurred, then a negative error code should be returned,
+   otherwise zero should be returned.  FS-Cache will take appropriate action
+   in response to an error, such as withdrawing this object.
+
+   If this method returns success then FS-Cache will inform the netfs
+   appropriately.
+
+
+ * Discard retained per-page metadata [mandatory]::
+
+	void (*uncache_page)(struct fscache_object *object, struct page *page)
+
+   This is called when a netfs page is being evicted from the pagecache.  The
+   cache backend should tear down any internal representation or tracking it
+   maintains for this page.
+
+
+FS-Cache Utilities
+==================
+
+FS-Cache provides some utilities that a cache backend may make use of:
+
+ * Note occurrence of an I/O error in a cache::
+
+	void fscache_io_error(struct fscache_cache *cache)
+
+   This tells FS-Cache that an I/O error occurred in the cache.  After this
+   has been called, only resource dissociation operations (object and page
+   release) will be passed from the netfs to the cache backend for the
+   specified cache.
+
+   This does not actually withdraw the cache.  That must be done separately.
+
+
+ * Invoke the retrieval I/O completion function::
+
+	void fscache_end_io(struct fscache_retrieval *op, struct page *page,
+			    int error);
+
+   This is called to note the end of an attempt to retrieve a page.  The
+   error value should be 0 if successful and an error otherwise.
+
+
+ * Record that one or more pages being retrieved or allocated have been dealt
+   with::
+
+	void fscache_retrieval_complete(struct fscache_retrieval *op,
+					int n_pages);
+
+   This is called to record the fact that one or more pages have been dealt
+   with and are no longer the concern of this operation.
When the number of + pages remaining in the operation reaches 0, the operation will be + completed. + + + * Record operation completion:: + + void fscache_op_complete(struct fscache_operation *op); + + This is called to record the completion of an operation. This deducts + this operation from the parent object's run state, potentially permitting + one or more pending operations to start running. + + + * Set highest store limit:: + + void fscache_set_store_limit(struct fscache_object *object, + loff_t i_size); + + This sets the limit FS-Cache imposes on the highest byte it's willing to + try and store for a netfs. Any page over this limit is automatically + rejected by fscache_read_alloc_page() and co with -ENOBUFS. + + + * Mark pages as being cached:: + + void fscache_mark_pages_cached(struct fscache_retrieval *op, + struct pagevec *pagevec); + + This marks a set of pages as being cached. After this has been called, + the netfs must call fscache_uncache_page() to unmark the pages. + + + * Perform coherency check on an object:: + + enum fscache_checkaux fscache_check_aux(struct fscache_object *object, + const void *data, + uint16_t datalen); + + This asks the netfs to perform a coherency check on an object that has + just been looked up. The cookie attached to the object will determine the + netfs to use. data and datalen should specify where the auxiliary data + retrieved from the cache can be found. + + One of three values will be returned: + + FSCACHE_CHECKAUX_OKAY + The coherency data indicates the object is valid as is. + + FSCACHE_CHECKAUX_NEEDS_UPDATE + The coherency data needs updating, but otherwise the object is + valid. + + FSCACHE_CHECKAUX_OBSOLETE + The coherency data indicates that the object is obsolete and should + be discarded. + + + * Initialise a freshly allocated object:: + + void fscache_object_init(struct fscache_object *object); + + This initialises all the fields in an object representation. + + + * Indicate the destruction of an object:: + + void fscache_object_destroyed(struct fscache_cache *cache); + + This must be called to inform FS-Cache that an object that belonged to a + cache has been destroyed and deallocated. This will allow continuation + of the cache withdrawal process when it is stopped pending destruction of + all the objects. + + + * Indicate negative lookup on an object:: + + void fscache_object_lookup_negative(struct fscache_object *object); + + This is called to indicate to FS-Cache that a lookup process for an object + found a negative result. + + This changes the state of an object to permit reads pending on lookup + completion to go off and start fetching data from the netfs server as it's + known at this point that there can't be any data in the cache. + + This may be called multiple times on an object. Only the first call is + significant - all subsequent calls are ignored. + + + * Indicate an object has been obtained:: + + void fscache_obtained_object(struct fscache_object *object); + + This is called to indicate to FS-Cache that a lookup process for an object + produced a positive result, or that an object was created. This should + only be called once for any particular object. + + This changes the state of an object to indicate: + + (1) if no call to fscache_object_lookup_negative() has been made on + this object, that there may be data available, and that reads can + now go and look for it; and + + (2) that writes may now proceed against this object. 
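+
+   Taken together with fscache_object_lookup_negative() and
+   fscache_object_lookup_error() above, a backend's lookup routine might be
+   shaped like this (a hypothetical sketch; the mycache_* helpers are
+   invented stand-ins for the backend's own lookup and creation code)::
+
+	static void mycache_lookup_object(struct fscache_object *object)
+	{
+		int ret;
+
+		/* Backend-specific lookup: 0, -ENOENT or other -ve error. */
+		ret = mycache_find_entry(object);
+		if (ret == -ENOENT) {
+			/* Nothing in the cache: let reads pending on lookup
+			 * go straight to the netfs server.
+			 */
+			fscache_object_lookup_negative(object);
+			ret = mycache_create_entry(object);
+		}
+		if (ret < 0) {
+			fscache_object_lookup_error(object);
+			return;
+		}
+
+		/* Positive lookup or fresh creation: the object is now in a
+		 * position to have data stored in it.
+		 */
+		fscache_obtained_object(object);
+	}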
+ + + * Indicate that object lookup failed:: + + void fscache_object_lookup_error(struct fscache_object *object); + + This marks an object as having encountered a fatal error (usually EIO) + and causes it to move into a state whereby it will be withdrawn as soon + as possible. + + + * Indicate that a stale object was found and discarded:: + + void fscache_object_retrying_stale(struct fscache_object *object); + + This is called to indicate that the lookup procedure found an object in + the cache that the netfs decided was stale. The object has been + discarded from the cache and the lookup will be performed again. + + + * Indicate that the caching backend killed an object:: + + void fscache_object_mark_killed(struct fscache_object *object, + enum fscache_why_object_killed why); + + This is called to indicate that the cache backend preemptively killed an + object. The why parameter should be set to indicate the reason: + + FSCACHE_OBJECT_IS_STALE + - the object was stale and needs discarding. + + FSCACHE_OBJECT_NO_SPACE + - there was insufficient cache space + + FSCACHE_OBJECT_WAS_RETIRED + - the object was retired when relinquished. + + FSCACHE_OBJECT_WAS_CULLED + - the object was culled to make space. + + + * Get and release references on a retrieval record:: + + void fscache_get_retrieval(struct fscache_retrieval *op); + void fscache_put_retrieval(struct fscache_retrieval *op); + + These two functions are used to retain a retrieval record while doing + asynchronous data retrieval and block allocation. + + + * Enqueue a retrieval record for processing:: + + void fscache_enqueue_retrieval(struct fscache_retrieval *op); + + This enqueues a retrieval record for processing by the FS-Cache thread + pool. One of the threads in the pool will invoke the retrieval record's + op->op.processor callback function. This function may be called from + within the callback function. + + + * List of object state names:: + + const char *fscache_object_states[]; + + For debugging purposes, this may be used to turn the state that an object + is in into a text string for display purposes. diff --git a/Documentation/filesystems/caching/backend-api.txt b/Documentation/filesystems/caching/backend-api.txt deleted file mode 100644 index c418280c915f..000000000000 --- a/Documentation/filesystems/caching/backend-api.txt +++ /dev/null @@ -1,726 +0,0 @@ - ========================== - FS-CACHE CACHE BACKEND API - ========================== - -The FS-Cache system provides an API by which actual caches can be supplied to -FS-Cache for it to then serve out to network filesystems and other interested -parties. - -This API is declared in . - - -==================================== -INITIALISING AND REGISTERING A CACHE -==================================== - -To start off, a cache definition must be initialised and registered for each -cache the backend wants to make available. For instance, CacheFS does this in -the fill_super() operation on mounting. - -The cache definition (struct fscache_cache) should be initialised by calling: - - void fscache_init_cache(struct fscache_cache *cache, - struct fscache_cache_ops *ops, - const char *idfmt, - ...); - -Where: - - (*) "cache" is a pointer to the cache definition; - - (*) "ops" is a pointer to the table of operations that the backend supports on - this cache; and - - (*) "idfmt" is a format and printf-style arguments for constructing a label - for the cache. 
- - -The cache should then be registered with FS-Cache by passing a pointer to the -previously initialised cache definition to: - - int fscache_add_cache(struct fscache_cache *cache, - struct fscache_object *fsdef, - const char *tagname); - -Two extra arguments should also be supplied: - - (*) "fsdef" which should point to the object representation for the FS-Cache - master index in this cache. Netfs primary index entries will be created - here. FS-Cache keeps the caller's reference to the index object if - successful and will release it upon withdrawal of the cache. - - (*) "tagname" which, if given, should be a text string naming this cache. If - this is NULL, the identifier will be used instead. For CacheFS, the - identifier is set to name the underlying block device and the tag can be - supplied by mount. - -This function may return -ENOMEM if it ran out of memory or -EEXIST if the tag -is already in use. 0 will be returned on success. - - -===================== -UNREGISTERING A CACHE -===================== - -A cache can be withdrawn from the system by calling this function with a -pointer to the cache definition: - - void fscache_withdraw_cache(struct fscache_cache *cache); - -In CacheFS's case, this is called by put_super(). - - -======== -SECURITY -======== - -The cache methods are executed one of two contexts: - - (1) that of the userspace process that issued the netfs operation that caused - the cache method to be invoked, or - - (2) that of one of the processes in the FS-Cache thread pool. - -In either case, this may not be an appropriate context in which to access the -cache. - -The calling process's fsuid, fsgid and SELinux security identities may need to -be masqueraded for the duration of the cache driver's access to the cache. -This is left to the cache to handle; FS-Cache makes no effort in this regard. - - -=================================== -CONTROL AND STATISTICS PRESENTATION -=================================== - -The cache may present data to the outside world through FS-Cache's interfaces -in sysfs and procfs - the former for control and the latter for statistics. - -A sysfs directory called /sys/fs/fscache// is created if CONFIG_SYSFS -is enabled. This is accessible through the kobject struct fscache_cache::kobj -and is for use by the cache as it sees fit. - - -======================== -RELEVANT DATA STRUCTURES -======================== - - (*) Index/Data file FS-Cache representation cookie: - - struct fscache_cookie { - struct fscache_object_def *def; - struct fscache_netfs *netfs; - void *netfs_data; - ... - }; - - The fields that might be of use to the backend describe the object - definition, the netfs definition and the netfs's data for this cookie. - The object definition contain functions supplied by the netfs for loading - and matching index entries; these are required to provide some of the - cache operations. - - - (*) In-cache object representation: - - struct fscache_object { - int debug_id; - enum { - FSCACHE_OBJECT_RECYCLING, - ... - } state; - spinlock_t lock - struct fscache_cache *cache; - struct fscache_cookie *cookie; - ... - }; - - Structures of this type should be allocated by the cache backend and - passed to FS-Cache when requested by the appropriate cache operation. In - the case of CacheFS, they're embedded in CacheFS's internal object - structures. - - The debug_id is a simple integer that can be used in debugging messages - that refer to a particular object. 
In such a case it should be printed - using "OBJ%x" to be consistent with FS-Cache. - - Each object contains a pointer to the cookie that represents the object it - is backing. An object should retired when put_object() is called if it is - in state FSCACHE_OBJECT_RECYCLING. The fscache_object struct should be - initialised by calling fscache_object_init(object). - - - (*) FS-Cache operation record: - - struct fscache_operation { - atomic_t usage; - struct fscache_object *object; - unsigned long flags; - #define FSCACHE_OP_EXCLUSIVE - void (*processor)(struct fscache_operation *op); - void (*release)(struct fscache_operation *op); - ... - }; - - FS-Cache has a pool of threads that it uses to give CPU time to the - various asynchronous operations that need to be done as part of driving - the cache. These are represented by the above structure. The processor - method is called to give the op CPU time, and the release method to get - rid of it when its usage count reaches 0. - - An operation can be made exclusive upon an object by setting the - appropriate flag before enqueuing it with fscache_enqueue_operation(). If - an operation needs more processing time, it should be enqueued again. - - - (*) FS-Cache retrieval operation record: - - struct fscache_retrieval { - struct fscache_operation op; - struct address_space *mapping; - struct list_head *to_do; - ... - }; - - A structure of this type is allocated by FS-Cache to record retrieval and - allocation requests made by the netfs. This struct is then passed to the - backend to do the operation. The backend may get extra refs to it by - calling fscache_get_retrieval() and refs may be discarded by calling - fscache_put_retrieval(). - - A retrieval operation can be used by the backend to do retrieval work. To - do this, the retrieval->op.processor method pointer should be set - appropriately by the backend and fscache_enqueue_retrieval() called to - submit it to the thread pool. CacheFiles, for example, uses this to queue - page examination when it detects PG_lock being cleared. - - The to_do field is an empty list available for the cache backend to use as - it sees fit. - - - (*) FS-Cache storage operation record: - - struct fscache_storage { - struct fscache_operation op; - pgoff_t store_limit; - ... - }; - - A structure of this type is allocated by FS-Cache to record outstanding - writes to be made. FS-Cache itself enqueues this operation and invokes - the write_page() method on the object at appropriate times to effect - storage. - - -================ -CACHE OPERATIONS -================ - -The cache backend provides FS-Cache with a table of operations that can be -performed on the denizens of the cache. These are held in a structure of type: - - struct fscache_cache_ops - - (*) Name of cache provider [mandatory]: - - const char *name - - This isn't strictly an operation, but should be pointed at a string naming - the backend. - - - (*) Allocate a new object [mandatory]: - - struct fscache_object *(*alloc_object)(struct fscache_cache *cache, - struct fscache_cookie *cookie) - - This method is used to allocate a cache object representation to back a - cookie in a particular cache. fscache_object_init() should be called on - the object to initialise it prior to returning. - - This function may also be used to parse the index key to be used for - multiple lookup calls to turn it into a more convenient form. FS-Cache - will call the lookup_complete() method to allow the cache to release the - form once lookup is complete or aborted. 
- - - (*) Look up and create object [mandatory]: - - void (*lookup_object)(struct fscache_object *object) - - This method is used to look up an object, given that the object is already - allocated and attached to the cookie. This should instantiate that object - in the cache if it can. - - The method should call fscache_object_lookup_negative() as soon as - possible if it determines the object doesn't exist in the cache. If the - object is found to exist and the netfs indicates that it is valid then - fscache_obtained_object() should be called once the object is in a - position to have data stored in it. Similarly, fscache_obtained_object() - should also be called once a non-present object has been created. - - If a lookup error occurs, fscache_object_lookup_error() should be called - to abort the lookup of that object. - - - (*) Release lookup data [mandatory]: - - void (*lookup_complete)(struct fscache_object *object) - - This method is called to ask the cache to release any resources it was - using to perform a lookup. - - - (*) Increment object refcount [mandatory]: - - struct fscache_object *(*grab_object)(struct fscache_object *object) - - This method is called to increment the reference count on an object. It - may fail (for instance if the cache is being withdrawn) by returning NULL. - It should return the object pointer if successful. - - - (*) Lock/Unlock object [mandatory]: - - void (*lock_object)(struct fscache_object *object) - void (*unlock_object)(struct fscache_object *object) - - These methods are used to exclusively lock an object. It must be possible - to schedule with the lock held, so a spinlock isn't sufficient. - - - (*) Pin/Unpin object [optional]: - - int (*pin_object)(struct fscache_object *object) - void (*unpin_object)(struct fscache_object *object) - - These methods are used to pin an object into the cache. Once pinned an - object cannot be reclaimed to make space. Return -ENOSPC if there's not - enough space in the cache to permit this. - - - (*) Check coherency state of an object [mandatory]: - - int (*check_consistency)(struct fscache_object *object) - - This method is called to have the cache check the saved auxiliary data of - the object against the netfs's idea of the state. 0 should be returned - if they're consistent and -ESTALE otherwise. -ENOMEM and -ERESTARTSYS - may also be returned. - - (*) Update object [mandatory]: - - int (*update_object)(struct fscache_object *object) - - This is called to update the index entry for the specified object. The - new information should be in object->cookie->netfs_data. This can be - obtained by calling object->cookie->def->get_aux()/get_attr(). - - - (*) Invalidate data object [mandatory]: - - int (*invalidate_object)(struct fscache_operation *op) - - This is called to invalidate a data object (as pointed to by op->object). - All the data stored for this object should be discarded and an - attr_changed operation should be performed. The caller will follow up - with an object update operation. - - fscache_op_complete() must be called on op before returning. - - - (*) Discard object [mandatory]: - - void (*drop_object)(struct fscache_object *object) - - This method is called to indicate that an object has been unbound from its - cookie, and that the cache should release the object's resources and - retire it if it's in state FSCACHE_OBJECT_RECYCLING. - - This method should not attempt to release any references held by the - caller. The caller will invoke the put_object() method as appropriate. 
- - - (*) Release object reference [mandatory]: - - void (*put_object)(struct fscache_object *object) - - This method is used to discard a reference to an object. The object may - be freed when all the references to it are released. - - - (*) Synchronise a cache [mandatory]: - - void (*sync)(struct fscache_cache *cache) - - This is called to ask the backend to synchronise a cache with its backing - device. - - - (*) Dissociate a cache [mandatory]: - - void (*dissociate_pages)(struct fscache_cache *cache) - - This is called to ask a cache to perform any page dissociations as part of - cache withdrawal. - - - (*) Notification that the attributes on a netfs file changed [mandatory]: - - int (*attr_changed)(struct fscache_object *object); - - This is called to indicate to the cache that certain attributes on a netfs - file have changed (for example the maximum size a file may reach). The - cache can read these from the netfs by calling the cookie's get_attr() - method. - - The cache may use the file size information to reserve space on the cache. - It should also call fscache_set_store_limit() to indicate to FS-Cache the - highest byte it's willing to store for an object. - - This method may return -ve if an error occurred or the cache object cannot - be expanded. In such a case, the object will be withdrawn from service. - - This operation is run asynchronously from FS-Cache's thread pool, and - storage and retrieval operations from the netfs are excluded during the - execution of this operation. - - - (*) Reserve cache space for an object's data [optional]: - - int (*reserve_space)(struct fscache_object *object, loff_t size); - - This is called to request that cache space be reserved to hold the data - for an object and the metadata used to track it. Zero size should be - taken as request to cancel a reservation. - - This should return 0 if successful, -ENOSPC if there isn't enough space - available, or -ENOMEM or -EIO on other errors. - - The reservation may exceed the current size of the object, thus permitting - future expansion. If the amount of space consumed by an object would - exceed the reservation, it's permitted to refuse requests to allocate - pages, but not required. An object may be pruned down to its reservation - size if larger than that already. - - - (*) Request page be read from cache [mandatory]: - - int (*read_or_alloc_page)(struct fscache_retrieval *op, - struct page *page, - gfp_t gfp) - - This is called to attempt to read a netfs page from the cache, or to - reserve a backing block if not. FS-Cache will have done as much checking - as it can before calling, but most of the work belongs to the backend. - - If there's no page in the cache, then -ENODATA should be returned if the - backend managed to reserve a backing block; -ENOBUFS or -ENOMEM if it - didn't. - - If there is suitable data in the cache, then a read operation should be - queued and 0 returned. When the read finishes, fscache_end_io() should be - called. - - The fscache_mark_pages_cached() should be called for the page if any cache - metadata is retained. This will indicate to the netfs that the page needs - explicit uncaching. This operation takes a pagevec, thus allowing several - pages to be marked at once. - - The retrieval record pointed to by op should be retained for each page - queued and released when I/O on the page has been formally ended. - fscache_get/put_retrieval() are available for this purpose. - - The retrieval record may be used to get CPU time via the FS-Cache thread - pool. 
If this is desired, the op->op.processor should be set to point to - the appropriate processing routine, and fscache_enqueue_retrieval() should - be called at an appropriate point to request CPU time. For instance, the - retrieval routine could be enqueued upon the completion of a disk read. - The to_do field in the retrieval record is provided to aid in this. - - If an I/O error occurs, fscache_io_error() should be called and -ENOBUFS - returned if possible or fscache_end_io() called with a suitable error - code. - - fscache_put_retrieval() should be called after a page or pages are dealt - with. This will complete the operation when all pages are dealt with. - - - (*) Request pages be read from cache [mandatory]: - - int (*read_or_alloc_pages)(struct fscache_retrieval *op, - struct list_head *pages, - unsigned *nr_pages, - gfp_t gfp) - - This is like the read_or_alloc_page() method, except it is handed a list - of pages instead of one page. Any pages on which a read operation is - started must be added to the page cache for the specified mapping and also - to the LRU. Such pages must also be removed from the pages list and - *nr_pages decremented per page. - - If there was an error such as -ENOMEM, then that should be returned; else - if one or more pages couldn't be read or allocated, then -ENOBUFS should - be returned; else if one or more pages couldn't be read, then -ENODATA - should be returned. If all the pages are dispatched then 0 should be - returned. - - - (*) Request page be allocated in the cache [mandatory]: - - int (*allocate_page)(struct fscache_retrieval *op, - struct page *page, - gfp_t gfp) - - This is like the read_or_alloc_page() method, except that it shouldn't - read from the cache, even if there's data there that could be retrieved. - It should, however, set up any internal metadata required such that - the write_page() method can write to the cache. - - If there's no backing block available, then -ENOBUFS should be returned - (or -ENOMEM if there were other problems). If a block is successfully - allocated, then the netfs page should be marked and 0 returned. - - - (*) Request pages be allocated in the cache [mandatory]: - - int (*allocate_pages)(struct fscache_retrieval *op, - struct list_head *pages, - unsigned *nr_pages, - gfp_t gfp) - - This is an multiple page version of the allocate_page() method. pages and - nr_pages should be treated as for the read_or_alloc_pages() method. - - - (*) Request page be written to cache [mandatory]: - - int (*write_page)(struct fscache_storage *op, - struct page *page); - - This is called to write from a page on which there was a previously - successful read_or_alloc_page() call or similar. FS-Cache filters out - pages that don't have mappings. - - This method is called asynchronously from the FS-Cache thread pool. It is - not required to actually store anything, provided -ENODATA is then - returned to the next read of this page. - - If an error occurred, then a negative error code should be returned, - otherwise zero should be returned. FS-Cache will take appropriate action - in response to an error, such as withdrawing this object. - - If this method returns success then FS-Cache will inform the netfs - appropriately. - - - (*) Discard retained per-page metadata [mandatory]: - - void (*uncache_page)(struct fscache_object *object, struct page *page) - - This is called when a netfs page is being evicted from the pagecache. The - cache backend should tear down any internal representation or tracking it - maintains for this page. 
- - -================== -FS-CACHE UTILITIES -================== - -FS-Cache provides some utilities that a cache backend may make use of: - - (*) Note occurrence of an I/O error in a cache: - - void fscache_io_error(struct fscache_cache *cache) - - This tells FS-Cache that an I/O error occurred in the cache. After this - has been called, only resource dissociation operations (object and page - release) will be passed from the netfs to the cache backend for the - specified cache. - - This does not actually withdraw the cache. That must be done separately. - - - (*) Invoke the retrieval I/O completion function: - - void fscache_end_io(struct fscache_retrieval *op, struct page *page, - int error); - - This is called to note the end of an attempt to retrieve a page. The - error value should be 0 if successful and an error otherwise. - - - (*) Record that one or more pages being retrieved or allocated have been dealt - with: - - void fscache_retrieval_complete(struct fscache_retrieval *op, - int n_pages); - - This is called to record the fact that one or more pages have been dealt - with and are no longer the concern of this operation. When the number of - pages remaining in the operation reaches 0, the operation will be - completed. - - - (*) Record operation completion: - - void fscache_op_complete(struct fscache_operation *op); - - This is called to record the completion of an operation. This deducts - this operation from the parent object's run state, potentially permitting - one or more pending operations to start running. - - - (*) Set highest store limit: - - void fscache_set_store_limit(struct fscache_object *object, - loff_t i_size); - - This sets the limit FS-Cache imposes on the highest byte it's willing to - try and store for a netfs. Any page over this limit is automatically - rejected by fscache_read_alloc_page() and co with -ENOBUFS. - - - (*) Mark pages as being cached: - - void fscache_mark_pages_cached(struct fscache_retrieval *op, - struct pagevec *pagevec); - - This marks a set of pages as being cached. After this has been called, - the netfs must call fscache_uncache_page() to unmark the pages. - - - (*) Perform coherency check on an object: - - enum fscache_checkaux fscache_check_aux(struct fscache_object *object, - const void *data, - uint16_t datalen); - - This asks the netfs to perform a coherency check on an object that has - just been looked up. The cookie attached to the object will determine the - netfs to use. data and datalen should specify where the auxiliary data - retrieved from the cache can be found. - - One of three values will be returned: - - (*) FSCACHE_CHECKAUX_OKAY - - The coherency data indicates the object is valid as is. - - (*) FSCACHE_CHECKAUX_NEEDS_UPDATE - - The coherency data needs updating, but otherwise the object is - valid. - - (*) FSCACHE_CHECKAUX_OBSOLETE - - The coherency data indicates that the object is obsolete and should - be discarded. - - - (*) Initialise a freshly allocated object: - - void fscache_object_init(struct fscache_object *object); - - This initialises all the fields in an object representation. - - - (*) Indicate the destruction of an object: - - void fscache_object_destroyed(struct fscache_cache *cache); - - This must be called to inform FS-Cache that an object that belonged to a - cache has been destroyed and deallocated. This will allow continuation - of the cache withdrawal process when it is stopped pending destruction of - all the objects. 
- - - (*) Indicate negative lookup on an object: - - void fscache_object_lookup_negative(struct fscache_object *object); - - This is called to indicate to FS-Cache that a lookup process for an object - found a negative result. - - This changes the state of an object to permit reads pending on lookup - completion to go off and start fetching data from the netfs server as it's - known at this point that there can't be any data in the cache. - - This may be called multiple times on an object. Only the first call is - significant - all subsequent calls are ignored. - - - (*) Indicate an object has been obtained: - - void fscache_obtained_object(struct fscache_object *object); - - This is called to indicate to FS-Cache that a lookup process for an object - produced a positive result, or that an object was created. This should - only be called once for any particular object. - - This changes the state of an object to indicate: - - (1) if no call to fscache_object_lookup_negative() has been made on - this object, that there may be data available, and that reads can - now go and look for it; and - - (2) that writes may now proceed against this object. - - - (*) Indicate that object lookup failed: - - void fscache_object_lookup_error(struct fscache_object *object); - - This marks an object as having encountered a fatal error (usually EIO) - and causes it to move into a state whereby it will be withdrawn as soon - as possible. - - - (*) Indicate that a stale object was found and discarded: - - void fscache_object_retrying_stale(struct fscache_object *object); - - This is called to indicate that the lookup procedure found an object in - the cache that the netfs decided was stale. The object has been - discarded from the cache and the lookup will be performed again. - - - (*) Indicate that the caching backend killed an object: - - void fscache_object_mark_killed(struct fscache_object *object, - enum fscache_why_object_killed why); - - This is called to indicate that the cache backend preemptively killed an - object. The why parameter should be set to indicate the reason: - - FSCACHE_OBJECT_IS_STALE - the object was stale and needs discarding. - FSCACHE_OBJECT_NO_SPACE - there was insufficient cache space - FSCACHE_OBJECT_WAS_RETIRED - the object was retired when relinquished. - FSCACHE_OBJECT_WAS_CULLED - the object was culled to make space. - - - (*) Get and release references on a retrieval record: - - void fscache_get_retrieval(struct fscache_retrieval *op); - void fscache_put_retrieval(struct fscache_retrieval *op); - - These two functions are used to retain a retrieval record while doing - asynchronous data retrieval and block allocation. - - - (*) Enqueue a retrieval record for processing. - - void fscache_enqueue_retrieval(struct fscache_retrieval *op); - - This enqueues a retrieval record for processing by the FS-Cache thread - pool. One of the threads in the pool will invoke the retrieval record's - op->op.processor callback function. This function may be called from - within the callback function. - - - (*) List of object state names: - - const char *fscache_object_states[]; - - For debugging purposes, this may be used to turn the state that an object - is in into a text string for display purposes. 
diff --git a/Documentation/filesystems/caching/fscache.rst b/Documentation/filesystems/caching/fscache.rst index dd1297d884d0..70de86922b6a 100644 --- a/Documentation/filesystems/caching/fscache.rst +++ b/Documentation/filesystems/caching/fscache.rst @@ -187,7 +187,7 @@ The netfs API to FS-Cache can be found in: The cache backend API to FS-Cache can be found in: - Documentation/filesystems/caching/backend-api.txt + Documentation/filesystems/caching/backend-api.rst A description of the internal representations and object state machine can be found in: diff --git a/Documentation/filesystems/caching/index.rst b/Documentation/filesystems/caching/index.rst index a2cf35f89e28..033da7ac7c6e 100644 --- a/Documentation/filesystems/caching/index.rst +++ b/Documentation/filesystems/caching/index.rst @@ -8,6 +8,7 @@ Filesystem Caching fscache object + backend-api cachefiles netfs-api operations diff --git a/fs/fscache/cache.c b/fs/fscache/cache.c index f78793f3d21e..fcc136361415 100644 --- a/fs/fscache/cache.c +++ b/fs/fscache/cache.c @@ -172,7 +172,7 @@ no_preference: * * Initialise a record of a cache and fill in the name. * - * See Documentation/filesystems/caching/backend-api.txt for a complete + * See Documentation/filesystems/caching/backend-api.rst for a complete * description. */ void fscache_init_cache(struct fscache_cache *cache, @@ -207,7 +207,7 @@ EXPORT_SYMBOL(fscache_init_cache); * * Add a cache to the system, making it available for netfs's to use. * - * See Documentation/filesystems/caching/backend-api.txt for a complete + * See Documentation/filesystems/caching/backend-api.rst for a complete * description. */ int fscache_add_cache(struct fscache_cache *cache, @@ -307,7 +307,7 @@ EXPORT_SYMBOL(fscache_add_cache); * Note that an I/O error occurred in a cache and that it should no longer be * used for anything. This also reports the error into the kernel log. * - * See Documentation/filesystems/caching/backend-api.txt for a complete + * See Documentation/filesystems/caching/backend-api.rst for a complete * description. */ void fscache_io_error(struct fscache_cache *cache) @@ -355,7 +355,7 @@ static void fscache_withdraw_all_objects(struct fscache_cache *cache, * Withdraw a cache from service, unbinding all its cache objects from the * netfs cookies they're currently representing. * - * See Documentation/filesystems/caching/backend-api.txt for a complete + * See Documentation/filesystems/caching/backend-api.rst for a complete * description. */ void fscache_withdraw_cache(struct fscache_cache *cache) diff --git a/fs/fscache/object.c b/fs/fscache/object.c index efaa003b8323..cb2146e02cd5 100644 --- a/fs/fscache/object.c +++ b/fs/fscache/object.c @@ -295,7 +295,7 @@ static void fscache_object_work_func(struct work_struct *work) * * Initialise a cache object description to its basic values. * - * See Documentation/filesystems/caching/backend-api.txt for a complete + * See Documentation/filesystems/caching/backend-api.rst for a complete * description. */ void fscache_object_init(struct fscache_object *object, diff --git a/include/linux/fscache-cache.h b/include/linux/fscache-cache.h index d5ba431b5d63..ce0b5fbf239d 100644 --- a/include/linux/fscache-cache.h +++ b/include/linux/fscache-cache.h @@ -6,7 +6,7 @@ * * NOTE!!! See: * - * Documentation/filesystems/caching/backend-api.txt + * Documentation/filesystems/caching/backend-api.rst * * for a description of the cache backend interface declared here. 
 */
@@ -454,7 +454,7 @@ static inline void fscache_object_lookup_error(struct fscache_object *object)
  * Set the maximum size an object is permitted to reach, implying the highest
  * byte that may be written.  Intended to be called by the attr_changed() op.
  *
- * See Documentation/filesystems/caching/backend-api.txt for a complete
+ * See Documentation/filesystems/caching/backend-api.rst for a complete
  * description.
  */
 static inline
-- 
cgit v1.2.3


From 791a17ee19736fc6d4e35c4bf9f8efd1001be77c Mon Sep 17 00:00:00 2001
From: Mauro Carvalho Chehab
Date: Mon, 27 Apr 2020 23:17:09 +0200
Subject: docs: filesystems: convert mount_api.txt to ReST

- Add a SPDX header;
- Adjust document and section titles;
- Some whitespace fixes and new line breaks;
- Mark literal blocks as such;
- Add table markups;
- Add lists markups;
- Add it to filesystems/index.rst.

Signed-off-by: Mauro Carvalho Chehab
Link: https://lore.kernel.org/r/32332c1659a28c22561cb5e64162c959856066b4.1588021877.git.mchehab+huawei@kernel.org
Signed-off-by: Jonathan Corbet
---
 Documentation/filesystems/index.rst     |   1 +
 Documentation/filesystems/mount_api.rst | 825 ++++++++++++++++++++++++++++++++
 Documentation/filesystems/mount_api.txt | 724 ----------------------------
 include/linux/fs_context.h              |   2 +-
 include/linux/lsm_hooks.h               |   2 +-
 5 files changed, 828 insertions(+), 726 deletions(-)
 create mode 100644 Documentation/filesystems/mount_api.rst
 delete mode 100644 Documentation/filesystems/mount_api.txt

(limited to 'include/linux')

diff --git a/Documentation/filesystems/index.rst b/Documentation/filesystems/index.rst
index df1a474ad9b9..4532b4d2631f 100644
--- a/Documentation/filesystems/index.rst
+++ b/Documentation/filesystems/index.rst
@@ -30,6 +30,7 @@ algorithms work.
    files
    locks
    mandatory-locking
+   mount_api
 
    automount-support

diff --git a/Documentation/filesystems/mount_api.rst b/Documentation/filesystems/mount_api.rst
new file mode 100644
index 000000000000..dea22d64f060
--- /dev/null
+++ b/Documentation/filesystems/mount_api.rst
@@ -0,0 +1,825 @@
+.. SPDX-License-Identifier: GPL-2.0
+
+====================
+Filesystem Mount API
+====================
+
+.. CONTENTS
+
+ (1) Overview.
+
+ (2) The filesystem context.
+
+ (3) The filesystem context operations.
+
+ (4) Filesystem context security.
+
+ (5) VFS filesystem context API.
+
+ (6) Superblock creation helpers.
+
+ (7) Parameter description.
+
+ (8) Parameter helper functions.
+
+
+Overview
+========
+
+The creation of new mounts is now to be done in a multistep process:
+
+ (1) Create a filesystem context.
+
+ (2) Parse the parameters and attach them to the context.  Parameters are
+     expected to be passed individually from userspace, though legacy binary
+     parameters can also be handled.
+
+ (3) Validate and pre-process the context.
+
+ (4) Get or create a superblock and mountable root.
+
+ (5) Perform the mount.
+
+ (6) Return an error message attached to the context.
+
+ (7) Destroy the context.
+
+To support this, the file_system_type struct gains two new fields::
+
+	int (*init_fs_context)(struct fs_context *fc);
+	const struct fs_parameter_description *parameters;
+
+The first is invoked to set up the filesystem-specific parts of a filesystem
+context, including the additional space, and the second points to the
+parameter description for validation at registration time and querying by a
+future system call.
+
+Note that security initialisation is done *after* the filesystem is called so
+that the namespaces may be adjusted first.
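+
+For illustration, the two fields might be wired up like this (a hypothetical
+sketch; the myfs_* names are invented, and the referenced operations table
+and parameter description are of the forms described in the sections that
+follow)::
+
+	struct myfs_fs_context {
+		unsigned int	opt_flags;	/* parsed mount options */
+	};
+
+	static int myfs_init_fs_context(struct fs_context *fc)
+	{
+		struct myfs_fs_context *ctx;
+
+		ctx = kzalloc(sizeof(*ctx), GFP_KERNEL);
+		if (!ctx)
+			return -ENOMEM;
+
+		fc->fs_private = ctx;
+		fc->ops = &myfs_context_ops;	/* fs_context_operations */
+		return 0;
+	}
+
+	static struct file_system_type myfs_fs_type = {
+		.owner			= THIS_MODULE,
+		.name			= "myfs",
+		.init_fs_context	= myfs_init_fs_context,
+		.parameters		= &myfs_fs_parameters,
+		.kill_sb		= kill_anon_super,
+	};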
+
+
+The Filesystem context
+======================
+
+The creation and reconfiguration of a superblock is governed by a filesystem
+context.  This is represented by the fs_context structure::
+
+	struct fs_context {
+		const struct fs_context_operations *ops;
+		struct file_system_type *fs_type;
+		void *fs_private;
+		struct dentry *root;
+		struct user_namespace *user_ns;
+		struct net *net_ns;
+		const struct cred *cred;
+		char *source;
+		char *subtype;
+		void *security;
+		void *s_fs_info;
+		unsigned int sb_flags;
+		unsigned int sb_flags_mask;
+		unsigned int s_iflags;
+		unsigned int lsm_flags;
+		enum fs_context_purpose purpose:8;
+		...
+	};
+
+The fs_context fields are as follows:
+
+ * ::
+
+       const struct fs_context_operations *ops
+
+   These are operations that can be done on a filesystem context (see
+   below).  This must be set by the ->init_fs_context() file_system_type
+   operation.
+
+ * ::
+
+       struct file_system_type *fs_type
+
+   A pointer to the file_system_type of the filesystem that is being
+   constructed or reconfigured.  This retains a reference on the type owner.
+
+ * ::
+
+       void *fs_private
+
+   A pointer to the file system's private data.  This is where the filesystem
+   will need to store any options it parses.
+
+ * ::
+
+       struct dentry *root
+
+   A pointer to the root of the mountable tree (and indirectly, the
+   superblock thereof).  This is filled in by the ->get_tree() op.  If this
+   is set, an active reference on root->d_sb must also be held.
+
+ * ::
+
+       struct user_namespace *user_ns
+       struct net *net_ns
+
+   These are a subset of the namespaces in use by the invoking process.  They
+   retain references on each namespace.  The subscribed namespaces may be
+   replaced by the filesystem to reflect other sources, such as the parent
+   mount superblock on an automount.
+
+ * ::
+
+       const struct cred *cred
+
+   The mounter's credentials.  This retains a reference on the credentials.
+
+ * ::
+
+       char *source
+
+   This specifies the source.  It may be a block device (e.g. /dev/sda1) or
+   something more exotic, such as the "host:/path" that NFS desires.
+
+ * ::
+
+       char *subtype
+
+   This is a string to be added to the type displayed in /proc/mounts to
+   qualify it (used by FUSE).  This is available for the filesystem to set if
+   desired.
+
+ * ::
+
+       void *security
+
+   A place for the LSMs to hang their security data for the superblock.  The
+   relevant security operations are described below.
+
+ * ::
+
+       void *s_fs_info
+
+   The proposed s_fs_info for a new superblock, set in the superblock by
+   sget_fc().  This can be used to distinguish superblocks.
+
+ * ::
+
+       unsigned int sb_flags
+       unsigned int sb_flags_mask
+
+   Which bits of the SB_* flags are to be set/cleared in
+   super_block::s_flags.
+
+ * ::
+
+       unsigned int s_iflags
+
+   These will be bitwise-OR'd with s->s_iflags when a superblock is created.
+
+ * ::
+
+       enum fs_context_purpose
+
+   This indicates the purpose for which the context is intended.  The
+   available values are:
+
+	========================== ======================================
+	FS_CONTEXT_FOR_MOUNT       New superblock for explicit mount
+	FS_CONTEXT_FOR_SUBMOUNT    New automatic submount of extant mount
+	FS_CONTEXT_FOR_RECONFIGURE Change an existing mount
+	========================== ======================================
+
+The filesystem context is created by calling one of the fs_context_for_*()
+helpers described below or vfs_dup_fs_context(), and is destroyed with
+put_fs_context().  Note that the structure is not refcounted.
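+
+A typical in-kernel user might drive that lifecycle as follows (a minimal
+sketch using the helpers described in the "VFS Filesystem context API"
+section below; myfs_fs_type is the hypothetical type from the overview
+example and error handling is abbreviated)::
+
+	static struct vfsmount *myfs_kern_mount(void)
+	{
+		struct fs_context *fc;
+		struct vfsmount *mnt = ERR_PTR(-EINVAL);
+
+		fc = fs_context_for_mount(&myfs_fs_type, 0);
+		if (IS_ERR(fc))
+			return ERR_CAST(fc);
+
+		if (!vfs_parse_fs_string(fc, "source", "myfs", 4) &&
+		    !vfs_get_tree(fc))
+			mnt = vfs_create_mount(fc);
+
+		put_fs_context(fc);	/* unconditionally destroys the context */
+		return mnt;
+	}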
+
+VFS, security and filesystem mount options are set individually with
+vfs_parse_fs_param().  Options provided by the old mount(2) system call as
+a page of data can be parsed with generic_parse_monolithic().
+
+When mounting, the filesystem is allowed to take data from any of the pointers
+and attach it to the superblock (or whatever), provided it clears the pointer
+in the mount context.
+
+The filesystem is also allowed to allocate resources and pin them with the
+mount context.  For instance, NFS might pin the appropriate protocol version
+module.
+
+
+The Filesystem Context Operations
+=================================
+
+The filesystem context points to a table of operations::
+
+	struct fs_context_operations {
+		void (*free)(struct fs_context *fc);
+		int (*dup)(struct fs_context *fc, struct fs_context *src_fc);
+		int (*parse_param)(struct fs_context *fc,
+				   struct fs_parameter *param);
+		int (*parse_monolithic)(struct fs_context *fc, void *data);
+		int (*get_tree)(struct fs_context *fc);
+		int (*reconfigure)(struct fs_context *fc);
+	};
+
+These operations are invoked by the various stages of the mount procedure to
+manage the filesystem context.  They are as follows:
+
+ * ::
+
+	void (*free)(struct fs_context *fc);
+
+   Called to clean up the filesystem-specific part of the filesystem context
+   when the context is destroyed.  It should be aware that parts of the
+   context may have been removed and NULL'd out by ->get_tree().
+
+ * ::
+
+	int (*dup)(struct fs_context *fc, struct fs_context *src_fc);
+
+   Called when a filesystem context has been duplicated to duplicate the
+   filesystem-private data.  An error may be returned to indicate failure to
+   do this.
+
+   .. Warning::
+
+        Note that even if this fails, put_fs_context() will be called
+        immediately thereafter, so ->dup() *must* make the
+        filesystem-private data safe for ->free().
+
+ * ::
+
+	int (*parse_param)(struct fs_context *fc,
+			   struct fs_parameter *param);
+
+   Called when a parameter is being added to the filesystem context.  param
+   points to the key name and maybe a value object.  VFS-specific options
+   will have been weeded out and fc->sb_flags updated in the context.
+   Security options will also have been weeded out and fc->security updated.
+
+   The parameter can be parsed with fs_parse() and fs_lookup_param().  Note
+   that the source(s) are presented as parameters named "source".
+
+   If successful, 0 should be returned or a negative error code otherwise.
+
+ * ::
+
+	int (*parse_monolithic)(struct fs_context *fc, void *data);
+
+   Called when the mount(2) system call is invoked to pass the entire data
+   page in one go.  If this is expected to be just a list of "key[=val]"
+   items separated by commas, then this may be set to NULL.
+
+   The return value is as for ->parse_param().
+
+   If the filesystem (e.g. NFS) needs to examine the data first and then
+   finds it's the standard key-val list then it may pass it off to
+   generic_parse_monolithic().
+
+ * ::
+
+	int (*get_tree)(struct fs_context *fc);
+
+   Called to get or create the mountable root and superblock, using the
+   information stored in the filesystem context (reconfiguration goes via a
+   different vector).  It may detach any resources it desires from the
+   filesystem context and transfer them to the superblock it creates.
+
+   On success it should set fc->root to the mountable root and return 0.  In
+   the case of an error, it should return a negative error code.
+ + The phase on a userspace-driven context will be set to only allow this to + be called once on any particular context. + + * :: + + int (*reconfigure)(struct fs_context *fc); + + Called to effect reconfiguration of a superblock using information stored + in the filesystem context. It may detach any resources it desires from + the filesystem context and transfer them to the superblock. The + superblock can be found from fc->root->d_sb. + + On success it should return 0. In the case of an error, it should return + a negative error code. + + .. Note:: reconfigure is intended as a replacement for remount_fs. + + +Filesystem context Security +=========================== + +The filesystem context contains a security pointer that the LSMs can use for +building up a security context for the superblock to be mounted. There are a +number of operations used by the new mount code for this purpose: + + * :: + + int security_fs_context_alloc(struct fs_context *fc, + struct dentry *reference); + + Called to initialise fc->security (which is preset to NULL) and allocate + any resources needed. It should return 0 on success or a negative error + code on failure. + + reference will be non-NULL if the context is being created for superblock + reconfiguration (FS_CONTEXT_FOR_RECONFIGURE) in which case it indicates + the root dentry of the superblock to be reconfigured. It will also be + non-NULL in the case of a submount (FS_CONTEXT_FOR_SUBMOUNT) in which case + it indicates the automount point. + + * :: + + int security_fs_context_dup(struct fs_context *fc, + struct fs_context *src_fc); + + Called to initialise fc->security (which is preset to NULL) and allocate + any resources needed. The original filesystem context is pointed to by + src_fc and may be used for reference. It should return 0 on success or a + negative error code on failure. + + * :: + + void security_fs_context_free(struct fs_context *fc); + + Called to clean up anything attached to fc->security. Note that the + contents may have been transferred to a superblock and the pointer cleared + during get_tree. + + * :: + + int security_fs_context_parse_param(struct fs_context *fc, + struct fs_parameter *param); + + Called for each mount parameter, including the source. The arguments are + as for the ->parse_param() method. It should return 0 to indicate that + the parameter should be passed on to the filesystem, 1 to indicate that + the parameter should be discarded or an error to indicate that the + parameter should be rejected. + + The value pointed to by param may be modified (if a string) or stolen + (provided the value pointer is NULL'd out). If it is stolen, 1 must be + returned to prevent it being passed to the filesystem. + + * :: + + int security_fs_context_validate(struct fs_context *fc); + + Called after all the options have been parsed to validate the collection + as a whole and to do any necessary allocation so that + security_sb_get_tree() and security_sb_reconfigure() are less likely to + fail. It should return 0 or a negative error code. + + In the case of reconfiguration, the target superblock will be accessible + via fc->root. + + * :: + + int security_sb_get_tree(struct fs_context *fc); + + Called during the mount procedure to verify that the specified superblock + is allowed to be mounted and to transfer the security data there. It + should return 0 or a negative error code. + + * :: + + void security_sb_reconfigure(struct fs_context *fc); + + Called to apply any reconfiguration to an LSM's context. It must not + fail. 
Error checking and resource allocation must be done in advance by
+   the parameter parsing and validation hooks.
+
+ * ::
+
+	int security_sb_mountpoint(struct fs_context *fc,
+				   struct path *mountpoint,
+				   unsigned int mnt_flags);
+
+   Called during the mount procedure to verify that the root dentry attached
+   to the context is permitted to be attached to the specified mountpoint.
+   It should return 0 on success or a negative error code on failure.
+
+
+VFS Filesystem context API
+==========================
+
+There are four operations for creating a filesystem context and one for
+destroying a context:
+
+ * ::
+
+       struct fs_context *fs_context_for_mount(struct file_system_type *fs_type,
+						unsigned int sb_flags);
+
+   Allocate a filesystem context for the purpose of setting up a new mount,
+   whether that be with a new superblock or sharing an existing one.  This
+   sets the superblock flags, initialises the security and calls
+   fs_type->init_fs_context() to initialise the filesystem private data.
+
+   fs_type specifies the filesystem type that will manage the context and
+   sb_flags presets the superblock flags stored therein.
+
+ * ::
+
+       struct fs_context *fs_context_for_reconfigure(
+		struct dentry *dentry,
+		unsigned int sb_flags,
+		unsigned int sb_flags_mask);
+
+   Allocate a filesystem context for the purpose of reconfiguring an
+   existing superblock.  dentry provides a reference to the superblock to be
+   configured.  sb_flags and sb_flags_mask indicate which superblock flags
+   need changing and to what.
+
+ * ::
+
+       struct fs_context *fs_context_for_submount(
+		struct file_system_type *fs_type,
+		struct dentry *reference);
+
+   Allocate a filesystem context for the purpose of creating a new mount for
+   an automount point or other derived superblock.  fs_type specifies the
+   filesystem type that will manage the context and the reference dentry
+   supplies the parameters.  Namespaces are propagated from the reference
+   dentry's superblock also.
+
+   Note that it's not a requirement that the reference dentry be of the same
+   filesystem type as fs_type.
+
+ * ::
+
+       struct fs_context *vfs_dup_fs_context(struct fs_context *src_fc);
+
+   Duplicate a filesystem context, copying any options noted and duplicating
+   or additionally referencing any resources held therein.  This is available
+   for use where a filesystem has to get a mount within a mount, such as NFS4
+   does by internally mounting the root of the target server and then doing a
+   private pathwalk to the target directory.
+
+   The purpose in the new context is inherited from the old one.
+
+ * ::
+
+       void put_fs_context(struct fs_context *fc);
+
+   Destroy a filesystem context, releasing any resources it holds.  This
+   calls the ->free() operation.  This is intended to be called by anyone who
+   created a filesystem context.
+
+   .. Warning::
+
+      filesystem contexts are not refcounted, so this causes unconditional
+      destruction.
+
+In all the above operations, apart from the put op, the return is a
+filesystem context pointer or a negative error code.
+
+For the remaining operations, if an error occurs, a negative error code will be
+returned.
+
+ * ::
+
+       int vfs_parse_fs_param(struct fs_context *fc,
+			      struct fs_parameter *param);
+
+   Supply a single mount parameter to the filesystem context.  This includes
+   the specification of the source/device which is specified as the "source"
+   parameter (which may be specified multiple times if the filesystem
+   supports that).
+
+   param specifies the parameter key name and the value.
The parameter is + first checked to see if it corresponds to a standard mount flag (in which + case it is used to set an SB_xxx flag and consumed) or a security option + (in which case the LSM consumes it) before it is passed on to the + filesystem. + + The parameter value is typed and can be one of: + + ==================== ============================= + fs_value_is_flag Parameter not given a value + fs_value_is_string Value is a string + fs_value_is_blob Value is a binary blob + fs_value_is_filename Value is a filename* + dirfd + fs_value_is_file Value is an open file (file*) + ==================== ============================= + + If there is a value, that value is stored in a union in the struct in one + of param->{string,blob,name,file}. Note that the function may steal and + clear the pointer, but then becomes responsible for disposing of the + object. + + * :: + + int vfs_parse_fs_string(struct fs_context *fc, const char *key, + const char *value, size_t v_size); + + A wrapper around vfs_parse_fs_param() that copies the value string it is + passed. + + * :: + + int generic_parse_monolithic(struct fs_context *fc, void *data); + + Parse a sys_mount() data page, assuming the form to be a text list + consisting of key[=val] options separated by commas. Each item in the + list is passed to vfs_mount_option(). This is the default when the + ->parse_monolithic() method is NULL. + + * :: + + int vfs_get_tree(struct fs_context *fc); + + Get or create the mountable root and superblock, using the parameters in + the filesystem context to select/configure the superblock. This invokes + the ->get_tree() method. + + * :: + + struct vfsmount *vfs_create_mount(struct fs_context *fc); + + Create a mount given the parameters in the specified filesystem context. + Note that this does not attach the mount to anything. + + +Superblock Creation Helpers +=========================== + +A number of VFS helpers are available for use by filesystems for the creation +or looking up of superblocks. + + * :: + + struct super_block * + sget_fc(struct fs_context *fc, + int (*test)(struct super_block *sb, struct fs_context *fc), + int (*set)(struct super_block *sb, struct fs_context *fc)); + + This is the core routine. If test is non-NULL, it searches for an + existing superblock matching the criteria held in the fs_context, using + the test function to match them. If no match is found, a new superblock + is created and the set function is called to set it up. + + Prior to the set function being called, fc->s_fs_info will be transferred + to sb->s_fs_info - and fc->s_fs_info will be cleared if set returns + success (ie. 0). + +The following helpers all wrap sget_fc(): + + * :: + + int vfs_get_super(struct fs_context *fc, + enum vfs_get_super_keying keying, + int (*fill_super)(struct super_block *sb, + struct fs_context *fc)) + + This creates/looks up a deviceless superblock. The keying indicates how + many superblocks of this type may exist and in what manner they may be + shared: + + (1) vfs_get_single_super + + Only one such superblock may exist in the system. Any further + attempt to get a new superblock gets this one (and any parameter + differences are ignored). + + (2) vfs_get_keyed_super + + Multiple superblocks of this type may exist and they're keyed on + their s_fs_info pointer (for example this may refer to a + namespace). + + (3) vfs_get_independent_super + + Multiple independent superblocks of this type may exist. This + function never matches an existing one and always creates a new + one. 
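+
+   As an illustration (the myfs_* names are hypothetical, not taken from any
+   real filesystem), a deviceless filesystem's ->get_tree() op might do no
+   more than wrap this helper, choosing the keying that suits it::
+
+       static int myfs_fill_super(struct super_block *sb,
+                                  struct fs_context *fc)
+       {
+               /* Allocate the root inode, set sb->s_root, s_op, etc. */
+               return 0;
+       }
+
+       static int myfs_get_tree(struct fs_context *fc)
+       {
+               /* Every mount gets its own, never-shared superblock. */
+               return vfs_get_super(fc, vfs_get_independent_super,
+                                    myfs_fill_super);
+       }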
+ + +===================== +PARAMETER DESCRIPTION +===================== + +Parameters are described using structures defined in linux/fs_parser.h. +There's a core description struct that links everything together:: + + struct fs_parameter_description { + const struct fs_parameter_spec *specs; + const struct fs_parameter_enum *enums; + }; + +For example:: + + enum { + Opt_autocell, + Opt_bar, + Opt_dyn, + Opt_foo, + Opt_source, + }; + + static const struct fs_parameter_description afs_fs_parameters = { + .specs = afs_param_specs, + .enums = afs_param_enums, + }; + +The members are as follows: + + (1) :: + + const struct fs_parameter_specification *specs; + + Table of parameter specifications, terminated with a null entry, where the + entries are of type:: + + struct fs_parameter_spec { + const char *name; + u8 opt; + enum fs_parameter_type type:8; + unsigned short flags; + }; + + The 'name' field is a string to match exactly to the parameter key (no + wildcards, patterns and no case-independence) and 'opt' is the value that + will be returned by the fs_parser() function in the case of a successful + match. + + The 'type' field indicates the desired value type and must be one of: + + ======================= ======================= ===================== + TYPE NAME EXPECTED VALUE RESULT IN + ======================= ======================= ===================== + fs_param_is_flag No value n/a + fs_param_is_bool Boolean value result->boolean + fs_param_is_u32 32-bit unsigned int result->uint_32 + fs_param_is_u32_octal 32-bit octal int result->uint_32 + fs_param_is_u32_hex 32-bit hex int result->uint_32 + fs_param_is_s32 32-bit signed int result->int_32 + fs_param_is_u64 64-bit unsigned int result->uint_64 + fs_param_is_enum Enum value name result->uint_32 + fs_param_is_string Arbitrary string param->string + fs_param_is_blob Binary blob param->blob + fs_param_is_blockdev Blockdev path * Needs lookup + fs_param_is_path Path * Needs lookup + fs_param_is_fd File descriptor result->int_32 + ======================= ======================= ===================== + + Note that if the value is of fs_param_is_bool type, fs_parse() will try + to match any string value against "0", "1", "no", "yes", "false", "true". + + Each parameter can also be qualified with 'flags': + + ======================= ================================================ + fs_param_v_optional The value is optional + fs_param_neg_with_no result->negated set if key is prefixed with "no" + fs_param_neg_with_empty result->negated set if value is "" + fs_param_deprecated The parameter is deprecated. 
+   ======================= ================================================
+
+   These are wrapped with a number of convenience wrappers:
+
+   ======================= ===============================================
+   MACRO                   SPECIFIES
+   ======================= ===============================================
+   fsparam_flag()          fs_param_is_flag
+   fsparam_flag_no()       fs_param_is_flag, fs_param_neg_with_no
+   fsparam_bool()          fs_param_is_bool
+   fsparam_u32()           fs_param_is_u32
+   fsparam_u32oct()        fs_param_is_u32_octal
+   fsparam_u32hex()        fs_param_is_u32_hex
+   fsparam_s32()           fs_param_is_s32
+   fsparam_u64()           fs_param_is_u64
+   fsparam_enum()          fs_param_is_enum
+   fsparam_string()        fs_param_is_string
+   fsparam_blob()          fs_param_is_blob
+   fsparam_bdev()          fs_param_is_blockdev
+   fsparam_path()          fs_param_is_path
+   fsparam_fd()            fs_param_is_fd
+   ======================= ===============================================
+
+   all of which take two arguments, a name string and an option number - for
+   example::
+
+       static const struct fs_parameter_spec afs_param_specs[] = {
+               fsparam_flag    ("autocell",    Opt_autocell),
+               fsparam_flag    ("dyn",         Opt_dyn),
+               fsparam_string  ("source",      Opt_source),
+               fsparam_flag_no ("foo",         Opt_foo),
+               {}
+       };
+
+   An additional macro, __fsparam(), is provided that takes an extra pair
+   of arguments to specify the type and the flags for anything that doesn't
+   match one of the above macros.
+
+ (2) ::
+
+       const struct fs_parameter_enum *enums;
+
+     Table of enum value names to integer mappings, terminated with a null
+     entry. This is of type::
+
+       struct fs_parameter_enum {
+               u8              opt;
+               char            name[14];
+               u8              value;
+       };
+
+     Where the array is an unsorted list of { parameter ID, name }-keyed
+     elements that indicate the value to map to, e.g.::
+
+       static const struct fs_parameter_enum afs_param_enums[] = {
+               { Opt_bar,   "x",      1},
+               { Opt_bar,   "y",      23},
+               { Opt_bar,   "z",      42},
+       };
+
+     If a parameter of type fs_param_is_enum is encountered, fs_parse() will
+     try to look the value up in the enum table and the result will be stored
+     in the parse result.
+
+The parser should be pointed to by the parser pointer in the file_system_type
+struct as this will provide validation on registration (if
+CONFIG_VALIDATE_FS_PARSER=y) and will allow the description to be queried from
+userspace using the fsinfo() syscall.
+
+
+Parameter Helper Functions
+==========================
+
+A number of helper functions are provided to help a filesystem or an LSM
+process the parameters it is given.
+
+ * ::
+
+       int lookup_constant(const struct constant_table tbl[],
+                           const char *name, int not_found);
+
+   Look up a constant by name in a table of name -> integer mappings. The
+   table is an array of elements of the following type::
+
+       struct constant_table {
+               const char      *name;
+               int             value;
+       };
+
+   If a match is found, the corresponding value is returned. If a match
+   isn't found, the not_found value is returned instead.
+
+ * ::
+
+       bool validate_constant_table(const struct constant_table *tbl,
+                                    size_t tbl_size,
+                                    int low, int high, int special);
+
+   Validate a constant table. Checks that all the elements are appropriately
+   ordered, that there are no duplicates and that the values are between low
+   and high inclusive, though provision is made for one allowable special
+   value outside of that range. If no special value is required, special
+   should just be set to lie inside the low-to-high range.
+
+   If all is good, true is returned. If the table is invalid, errors are
+   logged to dmesg and false is returned.
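+
+   As a sketch of how these two helpers might be used together (the myfs_*
+   names are hypothetical), a filesystem could define a mode table, check it
+   once at initialisation time with validate_constant_table(), and interpret
+   parameter values with lookup_constant()::
+
+       static const struct constant_table myfs_modes[] = {
+               { "relaxed",    0 },
+               { "strict",     1 },
+               { "paranoid",   2 },
+               {}
+       };
+
+       static bool myfs_check_modes(void)
+       {
+               /* Size excludes the empty terminator; no special value is
+                * needed, so special is set inside the 0..2 range. */
+               return validate_constant_table(myfs_modes,
+                                              ARRAY_SIZE(myfs_modes) - 1,
+                                              0, 2, 0);
+       }
+
+       static int myfs_mode_from_param(const struct fs_parameter *param)
+       {
+               /* -1 tells the caller the mode name wasn't recognised. */
+               return lookup_constant(myfs_modes, param->string, -1);
+       }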
+
+ * ::
+
+       bool fs_validate_description(const struct fs_parameter_description *desc);
+
+   This performs some validation checks on a parameter description. It
+   returns true if the description is good and false if it is not. It will
+   log errors to dmesg if validation fails.
+
+ * ::
+
+       int fs_parse(struct fs_context *fc,
+                    const struct fs_parameter_description *desc,
+                    struct fs_parameter *param,
+                    struct fs_parse_result *result);
+
+   This is the main interpreter of parameters. It uses the parameter
+   description to look up a parameter by key name and to convert that to an
+   option number (which it returns).
+
+   If successful, and if the parameter type indicates the result is a
+   boolean, integer or enum type, the value is converted by this function and
+   the result stored in result->{boolean,int_32,uint_32,uint_64}.
+
+   If a match isn't initially made, but the key is prefixed with "no" and no
+   value is present, then an attempt will be made to look up the key with the
+   prefix removed. If this matches a parameter for which the type has flag
+   fs_param_neg_with_no set, then a match will be made and result->negated
+   will be set to true.
+
+   If the parameter isn't matched, -ENOPARAM will be returned; if the
+   parameter is matched, but the value is erroneous, -EINVAL will be
+   returned; otherwise the parameter's option number will be returned.
+
+ * ::
+
+       int fs_lookup_param(struct fs_context *fc,
+                           struct fs_parameter *value,
+                           bool want_bdev,
+                           struct path *_path);
+
+   This takes a parameter that carries a string or filename type and attempts
+   to do a path lookup on it. If the parameter expects a blockdev, a check
+   is made that the inode actually represents one.
+
+   Returns 0 if successful and ``*_path`` will be set; returns a negative
+   error code if not.
diff --git a/Documentation/filesystems/mount_api.txt b/Documentation/filesystems/mount_api.txt
deleted file mode 100644
index 87c14bbb2b35..000000000000
--- a/Documentation/filesystems/mount_api.txt
+++ /dev/null
@@ -1,724 +0,0 @@
-			     ====================
-			     FILESYSTEM MOUNT API
-			     ====================
-
-CONTENTS
-
- (1) Overview.
-
- (2) The filesystem context.
-
- (3) The filesystem context operations.
-
- (4) Filesystem context security.
-
- (5) VFS filesystem context API.
-
- (6) Superblock creation helpers.
-
- (7) Parameter description.
-
- (8) Parameter helper functions.
-
-
-========
-OVERVIEW
-========
-
-The creation of new mounts is now to be done in a multistep process:
-
- (1) Create a filesystem context.
-
- (2) Parse the parameters and attach them to the context.  Parameters are
-     expected to be passed individually from userspace, though legacy binary
-     parameters can also be handled.
-
- (3) Validate and pre-process the context.
-
- (4) Get or create a superblock and mountable root.
-
- (5) Perform the mount.
-
- (6) Return an error message attached to the context.
-
- (7) Destroy the context.
-
-To support this, the file_system_type struct gains two new fields:
-
-	int (*init_fs_context)(struct fs_context *fc);
-	const struct fs_parameter_description *parameters;
-
-The first is invoked to set up the filesystem-specific parts of a filesystem
-context, including the additional space, and the second points to the
-parameter description for validation at registration time and querying by a
-future system call.
-
-Note that security initialisation is done *after* the filesystem is called so
-that the namespaces may be adjusted first.
- - -====================== -THE FILESYSTEM CONTEXT -====================== - -The creation and reconfiguration of a superblock is governed by a filesystem -context. This is represented by the fs_context structure: - - struct fs_context { - const struct fs_context_operations *ops; - struct file_system_type *fs_type; - void *fs_private; - struct dentry *root; - struct user_namespace *user_ns; - struct net *net_ns; - const struct cred *cred; - char *source; - char *subtype; - void *security; - void *s_fs_info; - unsigned int sb_flags; - unsigned int sb_flags_mask; - unsigned int s_iflags; - unsigned int lsm_flags; - enum fs_context_purpose purpose:8; - ... - }; - -The fs_context fields are as follows: - - (*) const struct fs_context_operations *ops - - These are operations that can be done on a filesystem context (see - below). This must be set by the ->init_fs_context() file_system_type - operation. - - (*) struct file_system_type *fs_type - - A pointer to the file_system_type of the filesystem that is being - constructed or reconfigured. This retains a reference on the type owner. - - (*) void *fs_private - - A pointer to the file system's private data. This is where the filesystem - will need to store any options it parses. - - (*) struct dentry *root - - A pointer to the root of the mountable tree (and indirectly, the - superblock thereof). This is filled in by the ->get_tree() op. If this - is set, an active reference on root->d_sb must also be held. - - (*) struct user_namespace *user_ns - (*) struct net *net_ns - - There are a subset of the namespaces in use by the invoking process. They - retain references on each namespace. The subscribed namespaces may be - replaced by the filesystem to reflect other sources, such as the parent - mount superblock on an automount. - - (*) const struct cred *cred - - The mounter's credentials. This retains a reference on the credentials. - - (*) char *source - - This specifies the source. It may be a block device (e.g. /dev/sda1) or - something more exotic, such as the "host:/path" that NFS desires. - - (*) char *subtype - - This is a string to be added to the type displayed in /proc/mounts to - qualify it (used by FUSE). This is available for the filesystem to set if - desired. - - (*) void *security - - A place for the LSMs to hang their security data for the superblock. The - relevant security operations are described below. - - (*) void *s_fs_info - - The proposed s_fs_info for a new superblock, set in the superblock by - sget_fc(). This can be used to distinguish superblocks. - - (*) unsigned int sb_flags - (*) unsigned int sb_flags_mask - - Which bits SB_* flags are to be set/cleared in super_block::s_flags. - - (*) unsigned int s_iflags - - These will be bitwise-OR'd with s->s_iflags when a superblock is created. - - (*) enum fs_context_purpose - - This indicates the purpose for which the context is intended. The - available values are: - - FS_CONTEXT_FOR_MOUNT, -- New superblock for explicit mount - FS_CONTEXT_FOR_SUBMOUNT -- New automatic submount of extant mount - FS_CONTEXT_FOR_RECONFIGURE -- Change an existing mount - -The mount context is created by calling vfs_new_fs_context() or -vfs_dup_fs_context() and is destroyed with put_fs_context(). Note that the -structure is not refcounted. - -VFS, security and filesystem mount options are set individually with -vfs_parse_mount_option(). Options provided by the old mount(2) system call as -a page of data can be parsed with generic_parse_monolithic(). 
- -When mounting, the filesystem is allowed to take data from any of the pointers -and attach it to the superblock (or whatever), provided it clears the pointer -in the mount context. - -The filesystem is also allowed to allocate resources and pin them with the -mount context. For instance, NFS might pin the appropriate protocol version -module. - - -================================= -THE FILESYSTEM CONTEXT OPERATIONS -================================= - -The filesystem context points to a table of operations: - - struct fs_context_operations { - void (*free)(struct fs_context *fc); - int (*dup)(struct fs_context *fc, struct fs_context *src_fc); - int (*parse_param)(struct fs_context *fc, - struct struct fs_parameter *param); - int (*parse_monolithic)(struct fs_context *fc, void *data); - int (*get_tree)(struct fs_context *fc); - int (*reconfigure)(struct fs_context *fc); - }; - -These operations are invoked by the various stages of the mount procedure to -manage the filesystem context. They are as follows: - - (*) void (*free)(struct fs_context *fc); - - Called to clean up the filesystem-specific part of the filesystem context - when the context is destroyed. It should be aware that parts of the - context may have been removed and NULL'd out by ->get_tree(). - - (*) int (*dup)(struct fs_context *fc, struct fs_context *src_fc); - - Called when a filesystem context has been duplicated to duplicate the - filesystem-private data. An error may be returned to indicate failure to - do this. - - [!] Note that even if this fails, put_fs_context() will be called - immediately thereafter, so ->dup() *must* make the - filesystem-private data safe for ->free(). - - (*) int (*parse_param)(struct fs_context *fc, - struct struct fs_parameter *param); - - Called when a parameter is being added to the filesystem context. param - points to the key name and maybe a value object. VFS-specific options - will have been weeded out and fc->sb_flags updated in the context. - Security options will also have been weeded out and fc->security updated. - - The parameter can be parsed with fs_parse() and fs_lookup_param(). Note - that the source(s) are presented as parameters named "source". - - If successful, 0 should be returned or a negative error code otherwise. - - (*) int (*parse_monolithic)(struct fs_context *fc, void *data); - - Called when the mount(2) system call is invoked to pass the entire data - page in one go. If this is expected to be just a list of "key[=val]" - items separated by commas, then this may be set to NULL. - - The return value is as for ->parse_param(). - - If the filesystem (e.g. NFS) needs to examine the data first and then - finds it's the standard key-val list then it may pass it off to - generic_parse_monolithic(). - - (*) int (*get_tree)(struct fs_context *fc); - - Called to get or create the mountable root and superblock, using the - information stored in the filesystem context (reconfiguration goes via a - different vector). It may detach any resources it desires from the - filesystem context and transfer them to the superblock it creates. - - On success it should set fc->root to the mountable root and return 0. In - the case of an error, it should return a negative error code. - - The phase on a userspace-driven context will be set to only allow this to - be called once on any particular context. - - (*) int (*reconfigure)(struct fs_context *fc); - - Called to effect reconfiguration of a superblock using information stored - in the filesystem context. 
It may detach any resources it desires from - the filesystem context and transfer them to the superblock. The - superblock can be found from fc->root->d_sb. - - On success it should return 0. In the case of an error, it should return - a negative error code. - - [NOTE] reconfigure is intended as a replacement for remount_fs. - - -=========================== -FILESYSTEM CONTEXT SECURITY -=========================== - -The filesystem context contains a security pointer that the LSMs can use for -building up a security context for the superblock to be mounted. There are a -number of operations used by the new mount code for this purpose: - - (*) int security_fs_context_alloc(struct fs_context *fc, - struct dentry *reference); - - Called to initialise fc->security (which is preset to NULL) and allocate - any resources needed. It should return 0 on success or a negative error - code on failure. - - reference will be non-NULL if the context is being created for superblock - reconfiguration (FS_CONTEXT_FOR_RECONFIGURE) in which case it indicates - the root dentry of the superblock to be reconfigured. It will also be - non-NULL in the case of a submount (FS_CONTEXT_FOR_SUBMOUNT) in which case - it indicates the automount point. - - (*) int security_fs_context_dup(struct fs_context *fc, - struct fs_context *src_fc); - - Called to initialise fc->security (which is preset to NULL) and allocate - any resources needed. The original filesystem context is pointed to by - src_fc and may be used for reference. It should return 0 on success or a - negative error code on failure. - - (*) void security_fs_context_free(struct fs_context *fc); - - Called to clean up anything attached to fc->security. Note that the - contents may have been transferred to a superblock and the pointer cleared - during get_tree. - - (*) int security_fs_context_parse_param(struct fs_context *fc, - struct fs_parameter *param); - - Called for each mount parameter, including the source. The arguments are - as for the ->parse_param() method. It should return 0 to indicate that - the parameter should be passed on to the filesystem, 1 to indicate that - the parameter should be discarded or an error to indicate that the - parameter should be rejected. - - The value pointed to by param may be modified (if a string) or stolen - (provided the value pointer is NULL'd out). If it is stolen, 1 must be - returned to prevent it being passed to the filesystem. - - (*) int security_fs_context_validate(struct fs_context *fc); - - Called after all the options have been parsed to validate the collection - as a whole and to do any necessary allocation so that - security_sb_get_tree() and security_sb_reconfigure() are less likely to - fail. It should return 0 or a negative error code. - - In the case of reconfiguration, the target superblock will be accessible - via fc->root. - - (*) int security_sb_get_tree(struct fs_context *fc); - - Called during the mount procedure to verify that the specified superblock - is allowed to be mounted and to transfer the security data there. It - should return 0 or a negative error code. - - (*) void security_sb_reconfigure(struct fs_context *fc); - - Called to apply any reconfiguration to an LSM's context. It must not - fail. Error checking and resource allocation must be done in advance by - the parameter parsing and validation hooks. 
- - (*) int security_sb_mountpoint(struct fs_context *fc, struct path *mountpoint, - unsigned int mnt_flags); - - Called during the mount procedure to verify that the root dentry attached - to the context is permitted to be attached to the specified mountpoint. - It should return 0 on success or a negative error code on failure. - - -========================== -VFS FILESYSTEM CONTEXT API -========================== - -There are four operations for creating a filesystem context and one for -destroying a context: - - (*) struct fs_context *fs_context_for_mount( - struct file_system_type *fs_type, - unsigned int sb_flags); - - Allocate a filesystem context for the purpose of setting up a new mount, - whether that be with a new superblock or sharing an existing one. This - sets the superblock flags, initialises the security and calls - fs_type->init_fs_context() to initialise the filesystem private data. - - fs_type specifies the filesystem type that will manage the context and - sb_flags presets the superblock flags stored therein. - - (*) struct fs_context *fs_context_for_reconfigure( - struct dentry *dentry, - unsigned int sb_flags, - unsigned int sb_flags_mask); - - Allocate a filesystem context for the purpose of reconfiguring an - existing superblock. dentry provides a reference to the superblock to be - configured. sb_flags and sb_flags_mask indicate which superblock flags - need changing and to what. - - (*) struct fs_context *fs_context_for_submount( - struct file_system_type *fs_type, - struct dentry *reference); - - Allocate a filesystem context for the purpose of creating a new mount for - an automount point or other derived superblock. fs_type specifies the - filesystem type that will manage the context and the reference dentry - supplies the parameters. Namespaces are propagated from the reference - dentry's superblock also. - - Note that it's not a requirement that the reference dentry be of the same - filesystem type as fs_type. - - (*) struct fs_context *vfs_dup_fs_context(struct fs_context *src_fc); - - Duplicate a filesystem context, copying any options noted and duplicating - or additionally referencing any resources held therein. This is available - for use where a filesystem has to get a mount within a mount, such as NFS4 - does by internally mounting the root of the target server and then doing a - private pathwalk to the target directory. - - The purpose in the new context is inherited from the old one. - - (*) void put_fs_context(struct fs_context *fc); - - Destroy a filesystem context, releasing any resources it holds. This - calls the ->free() operation. This is intended to be called by anyone who - created a filesystem context. - - [!] filesystem contexts are not refcounted, so this causes unconditional - destruction. - -In all the above operations, apart from the put op, the return is a mount -context pointer or a negative error code. - -For the remaining operations, if an error occurs, a negative error code will be -returned. - - (*) int vfs_parse_fs_param(struct fs_context *fc, - struct fs_parameter *param); - - Supply a single mount parameter to the filesystem context. This include - the specification of the source/device which is specified as the "source" - parameter (which may be specified multiple times if the filesystem - supports that). - - param specifies the parameter key name and the value. 
The parameter is - first checked to see if it corresponds to a standard mount flag (in which - case it is used to set an SB_xxx flag and consumed) or a security option - (in which case the LSM consumes it) before it is passed on to the - filesystem. - - The parameter value is typed and can be one of: - - fs_value_is_flag, Parameter not given a value. - fs_value_is_string, Value is a string - fs_value_is_blob, Value is a binary blob - fs_value_is_filename, Value is a filename* + dirfd - fs_value_is_file, Value is an open file (file*) - - If there is a value, that value is stored in a union in the struct in one - of param->{string,blob,name,file}. Note that the function may steal and - clear the pointer, but then becomes responsible for disposing of the - object. - - (*) int vfs_parse_fs_string(struct fs_context *fc, const char *key, - const char *value, size_t v_size); - - A wrapper around vfs_parse_fs_param() that copies the value string it is - passed. - - (*) int generic_parse_monolithic(struct fs_context *fc, void *data); - - Parse a sys_mount() data page, assuming the form to be a text list - consisting of key[=val] options separated by commas. Each item in the - list is passed to vfs_mount_option(). This is the default when the - ->parse_monolithic() method is NULL. - - (*) int vfs_get_tree(struct fs_context *fc); - - Get or create the mountable root and superblock, using the parameters in - the filesystem context to select/configure the superblock. This invokes - the ->get_tree() method. - - (*) struct vfsmount *vfs_create_mount(struct fs_context *fc); - - Create a mount given the parameters in the specified filesystem context. - Note that this does not attach the mount to anything. - - -=========================== -SUPERBLOCK CREATION HELPERS -=========================== - -A number of VFS helpers are available for use by filesystems for the creation -or looking up of superblocks. - - (*) struct super_block * - sget_fc(struct fs_context *fc, - int (*test)(struct super_block *sb, struct fs_context *fc), - int (*set)(struct super_block *sb, struct fs_context *fc)); - - This is the core routine. If test is non-NULL, it searches for an - existing superblock matching the criteria held in the fs_context, using - the test function to match them. If no match is found, a new superblock - is created and the set function is called to set it up. - - Prior to the set function being called, fc->s_fs_info will be transferred - to sb->s_fs_info - and fc->s_fs_info will be cleared if set returns - success (ie. 0). - -The following helpers all wrap sget_fc(): - - (*) int vfs_get_super(struct fs_context *fc, - enum vfs_get_super_keying keying, - int (*fill_super)(struct super_block *sb, - struct fs_context *fc)) - - This creates/looks up a deviceless superblock. The keying indicates how - many superblocks of this type may exist and in what manner they may be - shared: - - (1) vfs_get_single_super - - Only one such superblock may exist in the system. Any further - attempt to get a new superblock gets this one (and any parameter - differences are ignored). - - (2) vfs_get_keyed_super - - Multiple superblocks of this type may exist and they're keyed on - their s_fs_info pointer (for example this may refer to a - namespace). - - (3) vfs_get_independent_super - - Multiple independent superblocks of this type may exist. This - function never matches an existing one and always creates a new - one. 
- - -===================== -PARAMETER DESCRIPTION -===================== - -Parameters are described using structures defined in linux/fs_parser.h. -There's a core description struct that links everything together: - - struct fs_parameter_description { - const struct fs_parameter_spec *specs; - const struct fs_parameter_enum *enums; - }; - -For example: - - enum { - Opt_autocell, - Opt_bar, - Opt_dyn, - Opt_foo, - Opt_source, - }; - - static const struct fs_parameter_description afs_fs_parameters = { - .specs = afs_param_specs, - .enums = afs_param_enums, - }; - -The members are as follows: - - (1) const struct fs_parameter_specification *specs; - - Table of parameter specifications, terminated with a null entry, where the - entries are of type: - - struct fs_parameter_spec { - const char *name; - u8 opt; - enum fs_parameter_type type:8; - unsigned short flags; - }; - - The 'name' field is a string to match exactly to the parameter key (no - wildcards, patterns and no case-independence) and 'opt' is the value that - will be returned by the fs_parser() function in the case of a successful - match. - - The 'type' field indicates the desired value type and must be one of: - - TYPE NAME EXPECTED VALUE RESULT IN - ======================= ======================= ===================== - fs_param_is_flag No value n/a - fs_param_is_bool Boolean value result->boolean - fs_param_is_u32 32-bit unsigned int result->uint_32 - fs_param_is_u32_octal 32-bit octal int result->uint_32 - fs_param_is_u32_hex 32-bit hex int result->uint_32 - fs_param_is_s32 32-bit signed int result->int_32 - fs_param_is_u64 64-bit unsigned int result->uint_64 - fs_param_is_enum Enum value name result->uint_32 - fs_param_is_string Arbitrary string param->string - fs_param_is_blob Binary blob param->blob - fs_param_is_blockdev Blockdev path * Needs lookup - fs_param_is_path Path * Needs lookup - fs_param_is_fd File descriptor result->int_32 - - Note that if the value is of fs_param_is_bool type, fs_parse() will try - to match any string value against "0", "1", "no", "yes", "false", "true". - - Each parameter can also be qualified with 'flags': - - fs_param_v_optional The value is optional - fs_param_neg_with_no result->negated set if key is prefixed with "no" - fs_param_neg_with_empty result->negated set if value is "" - fs_param_deprecated The parameter is deprecated. - - These are wrapped with a number of convenience wrappers: - - MACRO SPECIFIES - ======================= =============================================== - fsparam_flag() fs_param_is_flag - fsparam_flag_no() fs_param_is_flag, fs_param_neg_with_no - fsparam_bool() fs_param_is_bool - fsparam_u32() fs_param_is_u32 - fsparam_u32oct() fs_param_is_u32_octal - fsparam_u32hex() fs_param_is_u32_hex - fsparam_s32() fs_param_is_s32 - fsparam_u64() fs_param_is_u64 - fsparam_enum() fs_param_is_enum - fsparam_string() fs_param_is_string - fsparam_blob() fs_param_is_blob - fsparam_bdev() fs_param_is_blockdev - fsparam_path() fs_param_is_path - fsparam_fd() fs_param_is_fd - - all of which take two arguments, name string and option number - for - example: - - static const struct fs_parameter_spec afs_param_specs[] = { - fsparam_flag ("autocell", Opt_autocell), - fsparam_flag ("dyn", Opt_dyn), - fsparam_string ("source", Opt_source), - fsparam_flag_no ("foo", Opt_foo), - {} - }; - - An addition macro, __fsparam() is provided that takes an additional pair - of arguments to specify the type and the flags for anything that doesn't - match one of the above macros. 
- - (2) const struct fs_parameter_enum *enums; - - Table of enum value names to integer mappings, terminated with a null - entry. This is of type: - - struct fs_parameter_enum { - u8 opt; - char name[14]; - u8 value; - }; - - Where the array is an unsorted list of { parameter ID, name }-keyed - elements that indicate the value to map to, e.g.: - - static const struct fs_parameter_enum afs_param_enums[] = { - { Opt_bar, "x", 1}, - { Opt_bar, "y", 23}, - { Opt_bar, "z", 42}, - }; - - If a parameter of type fs_param_is_enum is encountered, fs_parse() will - try to look the value up in the enum table and the result will be stored - in the parse result. - -The parser should be pointed to by the parser pointer in the file_system_type -struct as this will provide validation on registration (if -CONFIG_VALIDATE_FS_PARSER=y) and will allow the description to be queried from -userspace using the fsinfo() syscall. - - -========================== -PARAMETER HELPER FUNCTIONS -========================== - -A number of helper functions are provided to help a filesystem or an LSM -process the parameters it is given. - - (*) int lookup_constant(const struct constant_table tbl[], - const char *name, int not_found); - - Look up a constant by name in a table of name -> integer mappings. The - table is an array of elements of the following type: - - struct constant_table { - const char *name; - int value; - }; - - If a match is found, the corresponding value is returned. If a match - isn't found, the not_found value is returned instead. - - (*) bool validate_constant_table(const struct constant_table *tbl, - size_t tbl_size, - int low, int high, int special); - - Validate a constant table. Checks that all the elements are appropriately - ordered, that there are no duplicates and that the values are between low - and high inclusive, though provision is made for one allowable special - value outside of that range. If no special value is required, special - should just be set to lie inside the low-to-high range. - - If all is good, true is returned. If the table is invalid, errors are - logged to dmesg and false is returned. - - (*) bool fs_validate_description(const struct fs_parameter_description *desc); - - This performs some validation checks on a parameter description. It - returns true if the description is good and false if it is not. It will - log errors to dmesg if validation fails. - - (*) int fs_parse(struct fs_context *fc, - const struct fs_parameter_description *desc, - struct fs_parameter *param, - struct fs_parse_result *result); - - This is the main interpreter of parameters. It uses the parameter - description to look up a parameter by key name and to convert that to an - option number (which it returns). - - If successful, and if the parameter type indicates the result is a - boolean, integer or enum type, the value is converted by this function and - the result stored in result->{boolean,int_32,uint_32,uint_64}. - - If a match isn't initially made, the key is prefixed with "no" and no - value is present then an attempt will be made to look up the key with the - prefix removed. If this matches a parameter for which the type has flag - fs_param_neg_with_no set, then a match will be made and result->negated - will be set to true. - - If the parameter isn't matched, -ENOPARAM will be returned; if the - parameter is matched, but the value is erroneous, -EINVAL will be - returned; otherwise the parameter's option number will be returned. 
- - (*) int fs_lookup_param(struct fs_context *fc, - struct fs_parameter *value, - bool want_bdev, - struct path *_path); - - This takes a parameter that carries a string or filename type and attempts - to do a path lookup on it. If the parameter expects a blockdev, a check - is made that the inode actually represents one. - - Returns 0 if successful and *_path will be set; returns a negative error - code if not. diff --git a/include/linux/fs_context.h b/include/linux/fs_context.h index e6c3e4c61dad..5f24fcbfbfb4 100644 --- a/include/linux/fs_context.h +++ b/include/linux/fs_context.h @@ -85,7 +85,7 @@ struct p_log { * Superblock creation fills in ->root whereas reconfiguration begins with this * already set. * - * See Documentation/filesystems/mount_api.txt + * See Documentation/filesystems/mount_api.rst */ struct fs_context { const struct fs_context_operations *ops; diff --git a/include/linux/lsm_hooks.h b/include/linux/lsm_hooks.h index 988ca0df7824..44d5422c18e4 100644 --- a/include/linux/lsm_hooks.h +++ b/include/linux/lsm_hooks.h @@ -77,7 +77,7 @@ * state. This is called immediately after commit_creds(). * * Security hooks for mount using fs_context. - * [See also Documentation/filesystems/mount_api.txt] + * [See also Documentation/filesystems/mount_api.rst] * * @fs_context_dup: * Allocate and attach a security structure to sc->security. This pointer -- cgit v1.2.3 From 982649915d626bba8753c04e994e5a6650523c64 Mon Sep 17 00:00:00 2001 From: Mauro Carvalho Chehab Date: Mon, 27 Apr 2020 23:17:21 +0200 Subject: docs: filesystems: convert configfs.txt to ReST - Add a SPDX header; - Adjust document and section titles; - Use copyright symbol; - Some whitespace fixes and new line breaks; - Mark literal blocks as such; - Add it to filesystems/index.rst. Also, as this file is alone on its own dir, and it doesn't seem too likely that other documents will follow it, let's move it to the filesystems/ root documentation dir. Signed-off-by: Mauro Carvalho Chehab Link: https://lore.kernel.org/r/c2424ec2ad4d735751434ff7f52144c44aa02d5a.1588021877.git.mchehab+huawei@kernel.org Signed-off-by: Jonathan Corbet --- Documentation/filesystems/configfs.rst | 535 ++++++++++++++++++++++++ Documentation/filesystems/configfs/configfs.txt | 508 ---------------------- Documentation/filesystems/index.rst | 1 + Documentation/iio/iio_configfs.rst | 2 +- Documentation/usb/gadget_configfs.rst | 4 +- fs/configfs/inode.c | 2 +- fs/configfs/item.c | 2 +- include/linux/configfs.h | 2 +- 8 files changed, 542 insertions(+), 514 deletions(-) create mode 100644 Documentation/filesystems/configfs.rst delete mode 100644 Documentation/filesystems/configfs/configfs.txt (limited to 'include/linux') diff --git a/Documentation/filesystems/configfs.rst b/Documentation/filesystems/configfs.rst new file mode 100644 index 000000000000..f8941954c667 --- /dev/null +++ b/Documentation/filesystems/configfs.rst @@ -0,0 +1,535 @@ +======================================================= +Configfs - Userspace-driven Kernel Object Configuration +======================================================= + +Joel Becker + +Updated: 31 March 2005 + +Copyright (c) 2005 Oracle Corporation, + Joel Becker + + +What is configfs? +================= + +configfs is a ram-based filesystem that provides the converse of +sysfs's functionality. Where sysfs is a filesystem-based view of +kernel objects, configfs is a filesystem-based manager of kernel +objects, or config_items. 
+
+With sysfs, an object is created in kernel (for example, when a device
+is discovered) and it is registered with sysfs. Its attributes then
+appear in sysfs, allowing userspace to read the attributes via
+readdir(3)/read(2). It may allow some attributes to be modified via
+write(2). The important point is that the object is created and
+destroyed in kernel, the kernel controls the lifecycle of the sysfs
+representation, and sysfs is merely a window on all this.
+
+A configfs config_item is created via an explicit userspace operation:
+mkdir(2). It is destroyed via rmdir(2). The attributes appear at
+mkdir(2) time, and can be read or modified via read(2) and write(2).
+As with sysfs, readdir(3) queries the list of items and/or attributes.
+symlink(2) can be used to group items together. Unlike sysfs, the
+lifetime of the representation is completely driven by userspace. The
+kernel modules backing the items must respond to this.
+
+Both sysfs and configfs can and should exist together on the same
+system. One is not a replacement for the other.
+
+Using configfs
+==============
+
+configfs can be compiled as a module or into the kernel. You can access
+it by doing::
+
+    mount -t configfs none /config
+
+The configfs tree will be empty unless client modules are also loaded.
+These are modules that register their item types with configfs as
+subsystems. Once a client subsystem is loaded, it will appear as a
+subdirectory (or more than one) under /config. Like sysfs, the
+configfs tree is always there, whether mounted on /config or not.
+
+An item is created via mkdir(2). The item's attributes will also
+appear at this time. readdir(3) can determine what the attributes are,
+read(2) can query their default values, and write(2) can store new
+values. Don't mix more than one attribute in one attribute file.
+
+There are two types of configfs attributes:
+
+* Normal attributes, which, similar to sysfs attributes, are small ASCII text
+  files, with a maximum size of one page (PAGE_SIZE, 4096 on i386). Preferably
+  only one value per file should be used, and the same caveats from sysfs apply.
+  Configfs expects write(2) to store the entire buffer at once. When writing to
+  normal configfs attributes, userspace processes should first read the entire
+  file, modify the portions they wish to change, and then write the entire
+  buffer back.
+
+* Binary attributes, which are somewhat similar to sysfs binary attributes,
+  but with a few slight changes to semantics. The PAGE_SIZE limitation does not
+  apply, but the whole binary item must fit in a single kernel vmalloc'ed
+  buffer. The write(2) calls from user space are buffered, and the attribute's
+  write_bin_attribute method will be invoked on the final close, therefore it is
+  imperative for user-space to check the return code of close(2) in order to
+  verify that the operation finished successfully.
+  To avoid a malicious user OOMing the kernel, there's a per-binary attribute
+  maximum buffer value.
+
+When an item needs to be destroyed, remove it with rmdir(2). An
+item cannot be destroyed if any other item has a link to it (via
+symlink(2)). Links can be removed via unlink(2).
+
+Configuring FakeNBD: an Example
+===============================
+
+Imagine there's a Network Block Device (NBD) driver that allows you to
+access remote block devices. Call it FakeNBD. FakeNBD uses configfs
+for its configuration. Obviously, there will be a nice program that
+sysadmins use to configure FakeNBD, but somehow that program has to tell
+the driver about it. 
Here's where configfs comes in.
+
+When the FakeNBD driver is loaded, it registers itself with configfs.
+readdir(3) sees this just fine::
+
+    # ls /config
+    fakenbd
+
+A fakenbd connection can be created with mkdir(2). The name is
+arbitrary, but likely the tool will make some use of the name. Perhaps
+it is a uuid or a disk name::
+
+    # mkdir /config/fakenbd/disk1
+    # ls /config/fakenbd/disk1
+    target device rw
+
+The target attribute contains the IP address of the server FakeNBD will
+connect to. The device attribute is the device on the server.
+Predictably, the rw attribute determines whether the connection is
+read-only or read-write::
+
+    # echo 10.0.0.1 > /config/fakenbd/disk1/target
+    # echo /dev/sda1 > /config/fakenbd/disk1/device
+    # echo 1 > /config/fakenbd/disk1/rw
+
+That's it. That's all there is. Now the device is configured, via the
+shell no less.
+
+Coding With configfs
+====================
+
+Every object in configfs is a config_item. A config_item reflects an
+object in the subsystem. It has attributes that match values on that
+object. configfs handles the filesystem representation of that object
+and its attributes, allowing the subsystem to ignore all but the
+basic show/store interaction.
+
+Items are created and destroyed inside a config_group. A group is a
+collection of items that share the same attributes and operations.
+Items are created by mkdir(2) and removed by rmdir(2), but configfs
+handles that. The group has a set of operations to perform these tasks.
+
+A subsystem is the top level of a client module. During initialization,
+the client module registers the subsystem with configfs, and the
+subsystem appears as a directory at the top of the configfs filesystem.
+A subsystem is also a config_group, and can do everything a config_group
+can.
+
+struct config_item
+==================
+
+::
+
+    struct config_item {
+        char *ci_name;
+        char ci_namebuf[UOBJ_NAME_LEN];
+        struct kref ci_kref;
+        struct list_head ci_entry;
+        struct config_item *ci_parent;
+        struct config_group *ci_group;
+        struct config_item_type *ci_type;
+        struct dentry *ci_dentry;
+    };
+
+    void config_item_init(struct config_item *);
+    void config_item_init_type_name(struct config_item *,
+                                    const char *name,
+                                    struct config_item_type *type);
+    struct config_item *config_item_get(struct config_item *);
+    void config_item_put(struct config_item *);
+
+Generally, struct config_item is embedded in a container structure, a
+structure that actually represents what the subsystem is doing. The
+config_item portion of that structure is how the object interacts with
+configfs.
+
+Whether statically defined in a source file or created by a parent
+config_group, a config_item must have one of the _init() functions
+called on it. This initializes the reference count and sets up the
+appropriate fields.
+
+All users of a config_item should have a reference on it via
+config_item_get(), and drop the reference when they are done via
+config_item_put().
+
+By itself, a config_item cannot do much more than appear in configfs.
+Usually a subsystem wants the item to display and/or store attributes,
+among other things. For that, it needs a type.
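+
+For illustration, here is a minimal sketch, in the style of
+samples/configfs/configfs_sample.c, of a container structure embedding a
+config_item together with the customary helper that casts back to the
+container (the simple_child name and its stored_value field are
+hypothetical)::
+
+    struct simple_child {
+        struct config_item item;
+        int stored_value;
+    };
+
+    static inline struct simple_child *to_simple_child(struct config_item *item)
+    {
+        return container_of(item, struct simple_child, item);
+    }
+
+Attribute ->show() and ->store() methods would typically use such a helper
+to get from the config_item they are handed back to the object that
+actually carries the data.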
+
+struct config_item_type
+=======================
+
+::
+
+    struct configfs_item_operations {
+        void (*release)(struct config_item *);
+        int (*allow_link)(struct config_item *src,
+                          struct config_item *target);
+        void (*drop_link)(struct config_item *src,
+                          struct config_item *target);
+    };
+
+    struct config_item_type {
+        struct module *ct_owner;
+        struct configfs_item_operations *ct_item_ops;
+        struct configfs_group_operations *ct_group_ops;
+        struct configfs_attribute **ct_attrs;
+        struct configfs_bin_attribute **ct_bin_attrs;
+    };
+
+The most basic function of a config_item_type is to define what
+operations can be performed on a config_item. All items that have been
+allocated dynamically will need to provide the ct_item_ops->release()
+method. This method is called when the config_item's reference count
+reaches zero.
+
+struct configfs_attribute
+=========================
+
+::
+
+    struct configfs_attribute {
+        char *ca_name;
+        struct module *ca_owner;
+        umode_t ca_mode;
+        ssize_t (*show)(struct config_item *, char *);
+        ssize_t (*store)(struct config_item *, const char *, size_t);
+    };
+
+When a config_item wants an attribute to appear as a file in the item's
+configfs directory, it must define a configfs_attribute describing it.
+It then adds the attribute to the NULL-terminated array
+config_item_type->ct_attrs. When the item appears in configfs, the
+attribute file will appear with the configfs_attribute->ca_name
+filename. configfs_attribute->ca_mode specifies the file permissions.
+
+If an attribute is readable and provides a ->show method, that method will
+be called whenever userspace asks for a read(2) on the attribute. If an
+attribute is writable and provides a ->store method, that method will be
+called whenever userspace asks for a write(2) on the attribute.
+
+struct configfs_bin_attribute
+=============================
+
+::
+
+    struct configfs_bin_attribute {
+        struct configfs_attribute cb_attr;
+        void *cb_private;
+        size_t cb_max_size;
+    };
+
+A binary attribute is used when one needs a binary blob to appear as the
+contents of a file in the item's configfs directory. To do so, add the
+binary attribute to the NULL-terminated array
+config_item_type->ct_bin_attrs; when the item appears in configfs, the
+attribute file will appear with the configfs_bin_attribute->cb_attr.ca_name
+filename. configfs_bin_attribute->cb_attr.ca_mode specifies the file
+permissions.
+
+The cb_private member is provided for use by the driver, while the
+cb_max_size member specifies the maximum amount of vmalloc buffer
+to be used.
+
+If the binary attribute is readable and the config_item provides a
+ct_item_ops->read_bin_attribute() method, that method will be called
+whenever userspace asks for a read(2) on the attribute. The converse
+will happen for write(2). The reads/writes are buffered so only a
+single read/write will occur; the attribute need not concern itself
+with it.
+
+struct config_group
+===================
+
+A config_item cannot live in a vacuum. The only way one can be created
+is via mkdir(2) on a config_group. 
This will trigger creation of a +child item:: + + struct config_group { + struct config_item cg_item; + struct list_head cg_children; + struct configfs_subsystem *cg_subsys; + struct list_head default_groups; + struct list_head group_entry; + }; + + void config_group_init(struct config_group *group); + void config_group_init_type_name(struct config_group *group, + const char *name, + struct config_item_type *type); + + +The config_group structure contains a config_item. Properly configuring +that item means that a group can behave as an item in its own right. +However, it can do more: it can create child items or groups. This is +accomplished via the group operations specified on the group's +config_item_type:: + + struct configfs_group_operations { + struct config_item *(*make_item)(struct config_group *group, + const char *name); + struct config_group *(*make_group)(struct config_group *group, + const char *name); + int (*commit_item)(struct config_item *item); + void (*disconnect_notify)(struct config_group *group, + struct config_item *item); + void (*drop_item)(struct config_group *group, + struct config_item *item); + }; + +A group creates child items by providing the +ct_group_ops->make_item() method. If provided, this method is called from +mkdir(2) in the group's directory. The subsystem allocates a new +config_item (or more likely, its container structure), initializes it, +and returns it to configfs. Configfs will then populate the filesystem +tree to reflect the new item. + +If the subsystem wants the child to be a group itself, the subsystem +provides ct_group_ops->make_group(). Everything else behaves the same, +using the group _init() functions on the group. + +Finally, when userspace calls rmdir(2) on the item or group, +ct_group_ops->drop_item() is called. As a config_group is also a +config_item, it is not necessary for a separate drop_group() method. +The subsystem must config_item_put() the reference that was initialized +upon item allocation. If a subsystem has no work to do, it may omit +the ct_group_ops->drop_item() method, and configfs will call +config_item_put() on the item on behalf of the subsystem. + +Important: + drop_item() is void, and as such cannot fail. When rmdir(2) + is called, configfs WILL remove the item from the filesystem tree + (assuming that it has no children to keep it busy). The subsystem is + responsible for responding to this. If the subsystem has references to + the item in other threads, the memory is safe. It may take some time + for the item to actually disappear from the subsystem's usage. But it + is gone from configfs. + +When drop_item() is called, the item's linkage has already been torn +down. It no longer has a reference on its parent and has no place in +the item hierarchy. If a client needs to do some cleanup before this +teardown happens, the subsystem can implement the +ct_group_ops->disconnect_notify() method. The method is called after +configfs has removed the item from the filesystem view but before the +item is removed from its parent group. Like drop_item(), +disconnect_notify() is void and cannot fail. Client subsystems should +not drop any references here, as they still must do it in drop_item(). + +A config_group cannot be removed while it still has child items. This +is implemented in the configfs rmdir(2) code. ->drop_item() will not be +called, as the item has not been dropped. rmdir(2) will fail, as the +directory is not empty. 
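+
+To make that lifecycle concrete, here is a minimal sketch of a group's
+make_item()/drop_item() pair, modelled on the sample code and assuming the
+hypothetical simple_child container and a simple_child_type
+config_item_type defined elsewhere::
+
+    static struct config_item *simple_make_item(struct config_group *group,
+                                                const char *name)
+    {
+        struct simple_child *child;
+
+        child = kzalloc(sizeof(*child), GFP_KERNEL);
+        if (!child)
+                return ERR_PTR(-ENOMEM);
+
+        /* Sets the reference count to 1 and attaches the item type. */
+        config_item_init_type_name(&child->item, name, &simple_child_type);
+        return &child->item;
+    }
+
+    static void simple_drop_item(struct config_group *group,
+                                 struct config_item *item)
+    {
+        /* Give back the reference taken when the item was allocated. */
+        config_item_put(item);
+    }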
+ +struct configfs_subsystem +========================= + +A subsystem must register itself, usually at module_init time. This +tells configfs to make the subsystem appear in the file tree:: + + struct configfs_subsystem { + struct config_group su_group; + struct mutex su_mutex; + }; + + int configfs_register_subsystem(struct configfs_subsystem *subsys); + void configfs_unregister_subsystem(struct configfs_subsystem *subsys); + +A subsystem consists of a toplevel config_group and a mutex. +The group is where child config_items are created. For a subsystem, +this group is usually defined statically. Before calling +configfs_register_subsystem(), the subsystem must have initialized the +group via the usual group _init() functions, and it must also have +initialized the mutex. + +When the register call returns, the subsystem is live, and it +will be visible via configfs. At that point, mkdir(2) can be called and +the subsystem must be ready for it. + +An Example +========== + +The best example of these basic concepts is the simple_children +subsystem/group and the simple_child item in +samples/configfs/configfs_sample.c. It shows a trivial object displaying +and storing an attribute, and a simple group creating and destroying +these children. + +Hierarchy Navigation and the Subsystem Mutex +============================================ + +There is an extra bonus that configfs provides. The config_groups and +config_items are arranged in a hierarchy due to the fact that they +appear in a filesystem. A subsystem is NEVER to touch the filesystem +parts, but the subsystem might be interested in this hierarchy. For +this reason, the hierarchy is mirrored via the config_group->cg_children +and config_item->ci_parent structure members. + +A subsystem can navigate the cg_children list and the ci_parent pointer +to see the tree created by the subsystem. This can race with configfs' +management of the hierarchy, so configfs uses the subsystem mutex to +protect modifications. Whenever a subsystem wants to navigate the +hierarchy, it must do so under the protection of the subsystem +mutex. + +A subsystem will be prevented from acquiring the mutex while a newly +allocated item has not been linked into this hierarchy. Similarly, it +will not be able to acquire the mutex while a dropping item has not +yet been unlinked. This means that an item's ci_parent pointer will +never be NULL while the item is in configfs, and that an item will only +be in its parent's cg_children list for the same duration. This allows +a subsystem to trust ci_parent and cg_children while they hold the +mutex. + +Item Aggregation Via symlink(2) +=============================== + +configfs provides a simple group via the group->item parent/child +relationship. Often, however, a larger environment requires aggregation +outside of the parent/child connection. This is implemented via +symlink(2). + +A config_item may provide the ct_item_ops->allow_link() and +ct_item_ops->drop_link() methods. If the ->allow_link() method exists, +symlink(2) may be called with the config_item as the source of the link. +These links are only allowed between configfs config_items. Any +symlink(2) attempt outside the configfs filesystem will be denied. + +When symlink(2) is called, the source config_item's ->allow_link() +method is called with itself and a target item. If the source item +allows linking to target item, it returns 0. A source item may wish to +reject a link if it only wants links to a certain type of object (say, +in its own subsystem). 
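+
+For instance, an ->allow_link() method that only accepts link targets
+backed by one particular item type might look like the following sketch
+(again, simple_child_type is a hypothetical config_item_type)::
+
+    static int simple_allow_link(struct config_item *src,
+                                 struct config_item *target)
+    {
+        /* Refuse links to items of any other type. */
+        if (target->ci_type != &simple_child_type)
+                return -EPERM;
+        return 0;
+    }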
+
+When unlink(2) is called on the symbolic link, the source item is
+notified via the ->drop_link() method. Like the ->drop_item() method,
+this is a void function and cannot return failure. The subsystem is
+responsible for responding to the change.
+
+A config_item cannot be removed while it links to any other item, nor
+can it be removed while an item links to it. Dangling symlinks are not
+allowed in configfs.
+
+Automatically Created Subgroups
+===============================
+
+A new config_group may want to have two types of child config_items.
+While this could be codified by magic names in ->make_item(), it is much
+more explicit to have a method whereby userspace sees this divergence.
+
+Rather than have a group where some items behave differently than
+others, configfs provides a method whereby one or many subgroups are
+automatically created inside the parent at its creation. Thus,
+mkdir("parent") results in "parent", "parent/subgroup1", up through
+"parent/subgroupN". Items of type 1 can now be created in
+"parent/subgroup1", and items of type N can be created in
+"parent/subgroupN".
+
+These automatic subgroups, or default groups, do not preclude other
+children of the parent group. If ct_group_ops->make_group() exists,
+other child groups can be created on the parent group directly.
+
+A configfs subsystem specifies default groups by adding them to the
+parent config_group structure using the configfs_add_default_group()
+function. Each added group is populated in the configfs tree at the same
+time as the parent group. Similarly, they are removed at the same time
+as the parent. No extra notification is provided. When a ->drop_item()
+method call notifies the subsystem that the parent group is going away,
+it also means that every default group child associated with that parent
+group is going away.
+
+As a consequence of this, default groups cannot be removed directly via
+rmdir(2). They also are not considered when rmdir(2) on the parent
+group is checking for children.
+
+Dependent Subsystems
+====================
+
+Sometimes other drivers depend on particular configfs items. For
+example, ocfs2 mounts depend on a heartbeat region item. If that
+region item is removed with rmdir(2), the ocfs2 mount must BUG or go
+readonly. Not happy.
+
+configfs provides two additional API calls: configfs_depend_item() and
+configfs_undepend_item(). A client driver can call
+configfs_depend_item() on an existing item to tell configfs that it is
+depended on. configfs will then return -EBUSY from rmdir(2) for that
+item. When the item is no longer depended on, the client driver calls
+configfs_undepend_item() on it.
+
+These APIs cannot be called underneath any configfs callbacks, as
+they will conflict. They can block and allocate. A client driver
+probably shouldn't call them of its own gumption. Rather, it should
+provide an API that external subsystems call.
+
+How does this work? Imagine the ocfs2 mount process. When it mounts,
+it asks for a heartbeat region item. This is done via a call into the
+heartbeat code. Inside the heartbeat code, the region item is looked
+up. Here, the heartbeat code calls configfs_depend_item(). If it
+succeeds, then heartbeat knows the region is safe to give to ocfs2.
+If it fails, it was being torn down anyway, and heartbeat can gracefully
+pass up an error.
+
+Committable Items
+=================
+
+Note:
+    Committable items are currently unimplemented.
+
+Some config_items cannot have a valid initial state.
That is, no
+default values can be specified for the item's attributes such that the
+item can do its work. Userspace must configure one or more attributes,
+after which the subsystem can start whatever entity this item
+represents.
+
+Consider the FakeNBD device from above. Without a target address *and*
+a target device, the subsystem has no idea what block device to import.
+The simple example assumes that the subsystem merely waits until all the
+appropriate attributes are configured, and then connects. This will,
+indeed, work, but now every attribute store must check if the attributes
+are initialized. Every attribute store must fire off the connection if
+that condition is met.
+
+Far better would be an explicit action notifying the subsystem that the
+config_item is ready to go. More importantly, an explicit action allows
+the subsystem to provide feedback as to whether the attributes are
+initialized in a way that makes sense. configfs provides this as
+committable items.
+
+configfs still uses only normal filesystem operations. An item is
+committed via rename(2). The item is moved from a directory where it
+can be modified to a directory where it cannot.
+
+Any group that provides the ct_group_ops->commit_item() method has
+committable items. When this group appears in configfs, mkdir(2) will
+not work directly in the group. Instead, the group will have two
+subdirectories: "live" and "pending". The "live" directory does not
+support mkdir(2) or rmdir(2) either. It only allows rename(2). The
+"pending" directory does allow mkdir(2) and rmdir(2). An item is
+created in the "pending" directory. Its attributes can be modified at
+will. Userspace commits the item by renaming it into the "live"
+directory. At this point, the subsystem receives the ->commit_item()
+callback. If all required attributes are filled to satisfaction, the
+method returns zero and the item is moved to the "live" directory.
+
+As rmdir(2) does not work in the "live" directory, an item must be
+shut down, or "uncommitted". Again, this is done via rename(2), this
+time from the "live" directory back to the "pending" one. The subsystem
+is notified by the ct_group_ops->uncommit_object() method.
diff --git a/Documentation/filesystems/configfs/configfs.txt b/Documentation/filesystems/configfs/configfs.txt
deleted file mode 100644
index 16e606c11f40..000000000000
--- a/Documentation/filesystems/configfs/configfs.txt
+++ /dev/null
@@ -1,508 +0,0 @@
-
-configfs - Userspace-driven kernel object configuration.
-
-Joel Becker
-
-Updated: 31 March 2005
-
-Copyright (c) 2005 Oracle Corporation,
-	Joel Becker
-
-
-[What is configfs?]
-
-configfs is a ram-based filesystem that provides the converse of
-sysfs's functionality. Where sysfs is a filesystem-based view of
-kernel objects, configfs is a filesystem-based manager of kernel
-objects, or config_items.
-
-With sysfs, an object is created in kernel (for example, when a device
-is discovered) and it is registered with sysfs. Its attributes then
-appear in sysfs, allowing userspace to read the attributes via
-readdir(3)/read(2). It may allow some attributes to be modified via
-write(2). The important point is that the object is created and
-destroyed in kernel, the kernel controls the lifecycle of the sysfs
-representation, and sysfs is merely a window on all this.
-
-A configfs config_item is created via an explicit userspace operation:
-mkdir(2). It is destroyed via rmdir(2). The attributes appear at
-mkdir(2) time, and can be read or modified via read(2) and write(2).
-As with sysfs, readdir(3) queries the list of items and/or attributes. -symlink(2) can be used to group items together. Unlike sysfs, the -lifetime of the representation is completely driven by userspace. The -kernel modules backing the items must respond to this. - -Both sysfs and configfs can and should exist together on the same -system. One is not a replacement for the other. - -[Using configfs] - -configfs can be compiled as a module or into the kernel. You can access -it by doing - - mount -t configfs none /config - -The configfs tree will be empty unless client modules are also loaded. -These are modules that register their item types with configfs as -subsystems. Once a client subsystem is loaded, it will appear as a -subdirectory (or more than one) under /config. Like sysfs, the -configfs tree is always there, whether mounted on /config or not. - -An item is created via mkdir(2). The item's attributes will also -appear at this time. readdir(3) can determine what the attributes are, -read(2) can query their default values, and write(2) can store new -values. Don't mix more than one attribute in one attribute file. - -There are two types of configfs attributes: - -* Normal attributes, which similar to sysfs attributes, are small ASCII text -files, with a maximum size of one page (PAGE_SIZE, 4096 on i386). Preferably -only one value per file should be used, and the same caveats from sysfs apply. -Configfs expects write(2) to store the entire buffer at once. When writing to -normal configfs attributes, userspace processes should first read the entire -file, modify the portions they wish to change, and then write the entire -buffer back. - -* Binary attributes, which are somewhat similar to sysfs binary attributes, -but with a few slight changes to semantics. The PAGE_SIZE limitation does not -apply, but the whole binary item must fit in single kernel vmalloc'ed buffer. -The write(2) calls from user space are buffered, and the attributes' -write_bin_attribute method will be invoked on the final close, therefore it is -imperative for user-space to check the return code of close(2) in order to -verify that the operation finished successfully. -To avoid a malicious user OOMing the kernel, there's a per-binary attribute -maximum buffer value. - -When an item needs to be destroyed, remove it with rmdir(2). An -item cannot be destroyed if any other item has a link to it (via -symlink(2)). Links can be removed via unlink(2). - -[Configuring FakeNBD: an Example] - -Imagine there's a Network Block Device (NBD) driver that allows you to -access remote block devices. Call it FakeNBD. FakeNBD uses configfs -for its configuration. Obviously, there will be a nice program that -sysadmins use to configure FakeNBD, but somehow that program has to tell -the driver about it. Here's where configfs comes in. - -When the FakeNBD driver is loaded, it registers itself with configfs. -readdir(3) sees this just fine: - - # ls /config - fakenbd - -A fakenbd connection can be created with mkdir(2). The name is -arbitrary, but likely the tool will make some use of the name. Perhaps -it is a uuid or a disk name: - - # mkdir /config/fakenbd/disk1 - # ls /config/fakenbd/disk1 - target device rw - -The target attribute contains the IP address of the server FakeNBD will -connect to. The device attribute is the device on the server. -Predictably, the rw attribute determines whether the connection is -read-only or read-write. 
- - # echo 10.0.0.1 > /config/fakenbd/disk1/target - # echo /dev/sda1 > /config/fakenbd/disk1/device - # echo 1 > /config/fakenbd/disk1/rw - -That's it. That's all there is. Now the device is configured, via the -shell no less. - -[Coding With configfs] - -Every object in configfs is a config_item. A config_item reflects an -object in the subsystem. It has attributes that match values on that -object. configfs handles the filesystem representation of that object -and its attributes, allowing the subsystem to ignore all but the -basic show/store interaction. - -Items are created and destroyed inside a config_group. A group is a -collection of items that share the same attributes and operations. -Items are created by mkdir(2) and removed by rmdir(2), but configfs -handles that. The group has a set of operations to perform these tasks - -A subsystem is the top level of a client module. During initialization, -the client module registers the subsystem with configfs, the subsystem -appears as a directory at the top of the configfs filesystem. A -subsystem is also a config_group, and can do everything a config_group -can. - -[struct config_item] - - struct config_item { - char *ci_name; - char ci_namebuf[UOBJ_NAME_LEN]; - struct kref ci_kref; - struct list_head ci_entry; - struct config_item *ci_parent; - struct config_group *ci_group; - struct config_item_type *ci_type; - struct dentry *ci_dentry; - }; - - void config_item_init(struct config_item *); - void config_item_init_type_name(struct config_item *, - const char *name, - struct config_item_type *type); - struct config_item *config_item_get(struct config_item *); - void config_item_put(struct config_item *); - -Generally, struct config_item is embedded in a container structure, a -structure that actually represents what the subsystem is doing. The -config_item portion of that structure is how the object interacts with -configfs. - -Whether statically defined in a source file or created by a parent -config_group, a config_item must have one of the _init() functions -called on it. This initializes the reference count and sets up the -appropriate fields. - -All users of a config_item should have a reference on it via -config_item_get(), and drop the reference when they are done via -config_item_put(). - -By itself, a config_item cannot do much more than appear in configfs. -Usually a subsystem wants the item to display and/or store attributes, -among other things. For that, it needs a type. - -[struct config_item_type] - - struct configfs_item_operations { - void (*release)(struct config_item *); - int (*allow_link)(struct config_item *src, - struct config_item *target); - void (*drop_link)(struct config_item *src, - struct config_item *target); - }; - - struct config_item_type { - struct module *ct_owner; - struct configfs_item_operations *ct_item_ops; - struct configfs_group_operations *ct_group_ops; - struct configfs_attribute **ct_attrs; - struct configfs_bin_attribute **ct_bin_attrs; - }; - -The most basic function of a config_item_type is to define what -operations can be performed on a config_item. All items that have been -allocated dynamically will need to provide the ct_item_ops->release() -method. This method is called when the config_item's reference count -reaches zero. 
- -[struct configfs_attribute] - - struct configfs_attribute { - char *ca_name; - struct module *ca_owner; - umode_t ca_mode; - ssize_t (*show)(struct config_item *, char *); - ssize_t (*store)(struct config_item *, const char *, size_t); - }; - -When a config_item wants an attribute to appear as a file in the item's -configfs directory, it must define a configfs_attribute describing it. -It then adds the attribute to the NULL-terminated array -config_item_type->ct_attrs. When the item appears in configfs, the -attribute file will appear with the configfs_attribute->ca_name -filename. configfs_attribute->ca_mode specifies the file permissions. - -If an attribute is readable and provides a ->show method, that method will -be called whenever userspace asks for a read(2) on the attribute. If an -attribute is writable and provides a ->store method, that method will be -be called whenever userspace asks for a write(2) on the attribute. - -[struct configfs_bin_attribute] - - struct configfs_bin_attribute { - struct configfs_attribute cb_attr; - void *cb_private; - size_t cb_max_size; - }; - -The binary attribute is used when the one needs to use binary blob to -appear as the contents of a file in the item's configfs directory. -To do so add the binary attribute to the NULL-terminated array -config_item_type->ct_bin_attrs, and the item appears in configfs, the -attribute file will appear with the configfs_bin_attribute->cb_attr.ca_name -filename. configfs_bin_attribute->cb_attr.ca_mode specifies the file -permissions. -The cb_private member is provided for use by the driver, while the -cb_max_size member specifies the maximum amount of vmalloc buffer -to be used. - -If binary attribute is readable and the config_item provides a -ct_item_ops->read_bin_attribute() method, that method will be called -whenever userspace asks for a read(2) on the attribute. The converse -will happen for write(2). The reads/writes are bufferred so only a -single read/write will occur; the attributes' need not concern itself -with it. - -[struct config_group] - -A config_item cannot live in a vacuum. The only way one can be created -is via mkdir(2) on a config_group. This will trigger creation of a -child item. - - struct config_group { - struct config_item cg_item; - struct list_head cg_children; - struct configfs_subsystem *cg_subsys; - struct list_head default_groups; - struct list_head group_entry; - }; - - void config_group_init(struct config_group *group); - void config_group_init_type_name(struct config_group *group, - const char *name, - struct config_item_type *type); - - -The config_group structure contains a config_item. Properly configuring -that item means that a group can behave as an item in its own right. -However, it can do more: it can create child items or groups. This is -accomplished via the group operations specified on the group's -config_item_type. - - struct configfs_group_operations { - struct config_item *(*make_item)(struct config_group *group, - const char *name); - struct config_group *(*make_group)(struct config_group *group, - const char *name); - int (*commit_item)(struct config_item *item); - void (*disconnect_notify)(struct config_group *group, - struct config_item *item); - void (*drop_item)(struct config_group *group, - struct config_item *item); - }; - -A group creates child items by providing the -ct_group_ops->make_item() method. If provided, this method is called from mkdir(2) in the group's directory. 
The subsystem allocates a new -config_item (or more likely, its container structure), initializes it, -and returns it to configfs. Configfs will then populate the filesystem -tree to reflect the new item. - -If the subsystem wants the child to be a group itself, the subsystem -provides ct_group_ops->make_group(). Everything else behaves the same, -using the group _init() functions on the group. - -Finally, when userspace calls rmdir(2) on the item or group, -ct_group_ops->drop_item() is called. As a config_group is also a -config_item, it is not necessary for a separate drop_group() method. -The subsystem must config_item_put() the reference that was initialized -upon item allocation. If a subsystem has no work to do, it may omit -the ct_group_ops->drop_item() method, and configfs will call -config_item_put() on the item on behalf of the subsystem. - -IMPORTANT: drop_item() is void, and as such cannot fail. When rmdir(2) -is called, configfs WILL remove the item from the filesystem tree -(assuming that it has no children to keep it busy). The subsystem is -responsible for responding to this. If the subsystem has references to -the item in other threads, the memory is safe. It may take some time -for the item to actually disappear from the subsystem's usage. But it -is gone from configfs. - -When drop_item() is called, the item's linkage has already been torn -down. It no longer has a reference on its parent and has no place in -the item hierarchy. If a client needs to do some cleanup before this -teardown happens, the subsystem can implement the -ct_group_ops->disconnect_notify() method. The method is called after -configfs has removed the item from the filesystem view but before the -item is removed from its parent group. Like drop_item(), -disconnect_notify() is void and cannot fail. Client subsystems should -not drop any references here, as they still must do it in drop_item(). - -A config_group cannot be removed while it still has child items. This -is implemented in the configfs rmdir(2) code. ->drop_item() will not be -called, as the item has not been dropped. rmdir(2) will fail, as the -directory is not empty. - -[struct configfs_subsystem] - -A subsystem must register itself, usually at module_init time. This -tells configfs to make the subsystem appear in the file tree. - - struct configfs_subsystem { - struct config_group su_group; - struct mutex su_mutex; - }; - - int configfs_register_subsystem(struct configfs_subsystem *subsys); - void configfs_unregister_subsystem(struct configfs_subsystem *subsys); - - A subsystem consists of a toplevel config_group and a mutex. -The group is where child config_items are created. For a subsystem, -this group is usually defined statically. Before calling -configfs_register_subsystem(), the subsystem must have initialized the -group via the usual group _init() functions, and it must also have -initialized the mutex. - When the register call returns, the subsystem is live, and it -will be visible via configfs. At that point, mkdir(2) can be called and -the subsystem must be ready for it. - -[An Example] - -The best example of these basic concepts is the simple_children -subsystem/group and the simple_child item in -samples/configfs/configfs_sample.c. It shows a trivial object displaying -and storing an attribute, and a simple group creating and destroying -these children. - -[Hierarchy Navigation and the Subsystem Mutex] - -There is an extra bonus that configfs provides. 
The config_groups and -config_items are arranged in a hierarchy due to the fact that they -appear in a filesystem. A subsystem is NEVER to touch the filesystem -parts, but the subsystem might be interested in this hierarchy. For -this reason, the hierarchy is mirrored via the config_group->cg_children -and config_item->ci_parent structure members. - -A subsystem can navigate the cg_children list and the ci_parent pointer -to see the tree created by the subsystem. This can race with configfs' -management of the hierarchy, so configfs uses the subsystem mutex to -protect modifications. Whenever a subsystem wants to navigate the -hierarchy, it must do so under the protection of the subsystem -mutex. - -A subsystem will be prevented from acquiring the mutex while a newly -allocated item has not been linked into this hierarchy. Similarly, it -will not be able to acquire the mutex while a dropping item has not -yet been unlinked. This means that an item's ci_parent pointer will -never be NULL while the item is in configfs, and that an item will only -be in its parent's cg_children list for the same duration. This allows -a subsystem to trust ci_parent and cg_children while they hold the -mutex. - -[Item Aggregation Via symlink(2)] - -configfs provides a simple group via the group->item parent/child -relationship. Often, however, a larger environment requires aggregation -outside of the parent/child connection. This is implemented via -symlink(2). - -A config_item may provide the ct_item_ops->allow_link() and -ct_item_ops->drop_link() methods. If the ->allow_link() method exists, -symlink(2) may be called with the config_item as the source of the link. -These links are only allowed between configfs config_items. Any -symlink(2) attempt outside the configfs filesystem will be denied. - -When symlink(2) is called, the source config_item's ->allow_link() -method is called with itself and a target item. If the source item -allows linking to target item, it returns 0. A source item may wish to -reject a link if it only wants links to a certain type of object (say, -in its own subsystem). - -When unlink(2) is called on the symbolic link, the source item is -notified via the ->drop_link() method. Like the ->drop_item() method, -this is a void function and cannot return failure. The subsystem is -responsible for responding to the change. - -A config_item cannot be removed while it links to any other item, nor -can it be removed while an item links to it. Dangling symlinks are not -allowed in configfs. - -[Automatically Created Subgroups] - -A new config_group may want to have two types of child config_items. -While this could be codified by magic names in ->make_item(), it is much -more explicit to have a method whereby userspace sees this divergence. - -Rather than have a group where some items behave differently than -others, configfs provides a method whereby one or many subgroups are -automatically created inside the parent at its creation. Thus, -mkdir("parent") results in "parent", "parent/subgroup1", up through -"parent/subgroupN". Items of type 1 can now be created in -"parent/subgroup1", and items of type N can be created in -"parent/subgroupN". - -These automatic subgroups, or default groups, do not preclude other -children of the parent group. If ct_group_ops->make_group() exists, -other child groups can be created on the parent group directly. - -A configfs subsystem specifies default groups by adding them using the -configfs_add_default_group() function to the parent config_group -structure. 
Each added group is populated in the configfs tree at the same -time as the parent group. Similarly, they are removed at the same time -as the parent. No extra notification is provided. When a ->drop_item() -method call notifies the subsystem the parent group is going away, it -also means every default group child associated with that parent group. - -As a consequence of this, default groups cannot be removed directly via -rmdir(2). They also are not considered when rmdir(2) on the parent -group is checking for children. - -[Dependent Subsystems] - -Sometimes other drivers depend on particular configfs items. For -example, ocfs2 mounts depend on a heartbeat region item. If that -region item is removed with rmdir(2), the ocfs2 mount must BUG or go -readonly. Not happy. - -configfs provides two additional API calls: configfs_depend_item() and -configfs_undepend_item(). A client driver can call -configfs_depend_item() on an existing item to tell configfs that it is -depended on. configfs will then return -EBUSY from rmdir(2) for that -item. When the item is no longer depended on, the client driver calls -configfs_undepend_item() on it. - -These API cannot be called underneath any configfs callbacks, as -they will conflict. They can block and allocate. A client driver -probably shouldn't calling them of its own gumption. Rather it should -be providing an API that external subsystems call. - -How does this work? Imagine the ocfs2 mount process. When it mounts, -it asks for a heartbeat region item. This is done via a call into the -heartbeat code. Inside the heartbeat code, the region item is looked -up. Here, the heartbeat code calls configfs_depend_item(). If it -succeeds, then heartbeat knows the region is safe to give to ocfs2. -If it fails, it was being torn down anyway, and heartbeat can gracefully -pass up an error. - -[Committable Items] - -NOTE: Committable items are currently unimplemented. - -Some config_items cannot have a valid initial state. That is, no -default values can be specified for the item's attributes such that the -item can do its work. Userspace must configure one or more attributes, -after which the subsystem can start whatever entity this item -represents. - -Consider the FakeNBD device from above. Without a target address *and* -a target device, the subsystem has no idea what block device to import. -The simple example assumes that the subsystem merely waits until all the -appropriate attributes are configured, and then connects. This will, -indeed, work, but now every attribute store must check if the attributes -are initialized. Every attribute store must fire off the connection if -that condition is met. - -Far better would be an explicit action notifying the subsystem that the -config_item is ready to go. More importantly, an explicit action allows -the subsystem to provide feedback as to whether the attributes are -initialized in a way that makes sense. configfs provides this as -committable items. - -configfs still uses only normal filesystem operations. An item is -committed via rename(2). The item is moved from a directory where it -can be modified to a directory where it cannot. - -Any group that provides the ct_group_ops->commit_item() method has -committable items. When this group appears in configfs, mkdir(2) will -not work directly in the group. Instead, the group will have two -subdirectories: "live" and "pending". The "live" directory does not -support mkdir(2) or rmdir(2) either. It only allows rename(2). 
The -"pending" directory does allow mkdir(2) and rmdir(2). An item is -created in the "pending" directory. Its attributes can be modified at -will. Userspace commits the item by renaming it into the "live" -directory. At this point, the subsystem receives the ->commit_item() -callback. If all required attributes are filled to satisfaction, the -method returns zero and the item is moved to the "live" directory. - -As rmdir(2) does not work in the "live" directory, an item must be -shutdown, or "uncommitted". Again, this is done via rename(2), this -time from the "live" directory back to the "pending" one. The subsystem -is notified by the ct_group_ops->uncommit_object() method. - - diff --git a/Documentation/filesystems/index.rst b/Documentation/filesystems/index.rst index e242cb75f9f2..17795341e0a3 100644 --- a/Documentation/filesystems/index.rst +++ b/Documentation/filesystems/index.rst @@ -76,6 +76,7 @@ Documentation for filesystem implementations. cifs/cifsroot ceph coda + configfs cramfs debugfs dlmfs diff --git a/Documentation/iio/iio_configfs.rst b/Documentation/iio/iio_configfs.rst index ecbfdb3afef7..6e38cbbd2981 100644 --- a/Documentation/iio/iio_configfs.rst +++ b/Documentation/iio/iio_configfs.rst @@ -9,7 +9,7 @@ Configfs is a filesystem-based manager of kernel objects. IIO uses some objects that could be easily configured using configfs (e.g.: devices, triggers). -See Documentation/filesystems/configfs/configfs.txt for more information +See Documentation/filesystems/configfs.rst for more information about how configfs works. 2. Usage diff --git a/Documentation/usb/gadget_configfs.rst b/Documentation/usb/gadget_configfs.rst index 54fb08baae22..158e48dab586 100644 --- a/Documentation/usb/gadget_configfs.rst +++ b/Documentation/usb/gadget_configfs.rst @@ -24,7 +24,7 @@ Linux provides a number of functions for gadgets to use. Creating a gadget means deciding what configurations there will be and which functions each configuration will provide. -Configfs (please see `Documentation/filesystems/configfs/*`) lends itself nicely +Configfs (please see `Documentation/filesystems/configfs.rst`) lends itself nicely for the purpose of telling the kernel about the above mentioned decision. This document is about how to do it. @@ -354,7 +354,7 @@ the directories in general can be named at will. A group can have a number of its default sub-groups created automatically. For more information on configfs please see -`Documentation/filesystems/configfs/*`. +`Documentation/filesystems/configfs.rst`. The concepts described above translate to USB gadgets like this: diff --git a/fs/configfs/inode.c b/fs/configfs/inode.c index fd0b5dd68f9e..8bd6a883c94c 100644 --- a/fs/configfs/inode.c +++ b/fs/configfs/inode.c @@ -9,7 +9,7 @@ * * configfs Copyright (C) 2005 Oracle. All rights reserved. * - * Please see Documentation/filesystems/configfs/configfs.txt for more + * Please see Documentation/filesystems/configfs.rst for more * information. */ diff --git a/fs/configfs/item.c b/fs/configfs/item.c index 6e0f1fcb8a5b..704a4356f137 100644 --- a/fs/configfs/item.c +++ b/fs/configfs/item.c @@ -9,7 +9,7 @@ * * configfs Copyright (C) 2005 Oracle. All rights reserved. * - * Please see the file Documentation/filesystems/configfs/configfs.txt for + * Please see the file Documentation/filesystems/configfs.rst for * critical information about using the config_item interface. 
*/ diff --git a/include/linux/configfs.h b/include/linux/configfs.h index fa9490a8874c..2e8c69b43c64 100644 --- a/include/linux/configfs.h +++ b/include/linux/configfs.h @@ -13,7 +13,7 @@ * * configfs Copyright (C) 2005 Oracle. All rights reserved. * - * Please read Documentation/filesystems/configfs/configfs.txt before using + * Please read Documentation/filesystems/configfs.rst before using * the configfs interface, ESPECIALLY the parts about reference counts and * item destructors. */ -- cgit v1.2.3 From 9b06fc39084e161da84a399b6b5dc524e673f51e Mon Sep 17 00:00:00 2001 From: Rob Herring Date: Wed, 29 Apr 2020 15:58:10 -0500 Subject: ARM: vexpress: Move vexpress_flags_set() into arch code vexpress_flags_set() is only used by the platform SMP related code and has nothing to do with the vexpress-sysreg MFD driver other than both access the same h/w block. It's also only needed for 32-bit systems and must be built-in for them. Let's move vexpress_flags_set() closer to where it is being used. This will allow for vexpress-sysreg to be built as a module. Cc: Lorenzo Pieralisi Cc: Linus Walleij Reviewed-by: Sudeep Holla Acked-by: Arnd Bergmann Acked-by: Liviu Dudau Acked-by: Lee Jones Signed-off-by: Rob Herring --- arch/arm/mach-vexpress/Kconfig | 1 - arch/arm/mach-vexpress/core.h | 1 + arch/arm/mach-vexpress/dcscb.c | 1 + arch/arm/mach-vexpress/v2m.c | 23 +++++++++++++++++++++++ drivers/mfd/vexpress-sysreg.c | 19 ------------------- include/linux/vexpress.h | 4 ---- 6 files changed, 25 insertions(+), 24 deletions(-) (limited to 'include/linux') diff --git a/arch/arm/mach-vexpress/Kconfig b/arch/arm/mach-vexpress/Kconfig index 726a68085c3b..18951cd20d9d 100644 --- a/arch/arm/mach-vexpress/Kconfig +++ b/arch/arm/mach-vexpress/Kconfig @@ -22,7 +22,6 @@ menuconfig ARCH_VEXPRESS select REGULATOR_FIXED_VOLTAGE if REGULATOR select VEXPRESS_CONFIG select VEXPRESS_SYSCFG - select MFD_VEXPRESS_SYSREG help This option enables support for systems using Cortex processor based ARM core and logic (FPGA) tiles on the Versatile Express motherboard, diff --git a/arch/arm/mach-vexpress/core.h b/arch/arm/mach-vexpress/core.h index f4a7519084f1..bda78675c55d 100644 --- a/arch/arm/mach-vexpress/core.h +++ b/arch/arm/mach-vexpress/core.h @@ -1,3 +1,4 @@ bool vexpress_smp_init_ops(void); +void vexpress_flags_set(u32 data); extern const struct smp_operations vexpress_smp_dt_ops; diff --git a/arch/arm/mach-vexpress/dcscb.c b/arch/arm/mach-vexpress/dcscb.c index 46a903c88c6a..a0554d7d04f7 100644 --- a/arch/arm/mach-vexpress/dcscb.c +++ b/arch/arm/mach-vexpress/dcscb.c @@ -20,6 +20,7 @@ #include #include +#include "core.h" #define RST_HOLD0 0x0 #define RST_HOLD1 0x4 diff --git a/arch/arm/mach-vexpress/v2m.c b/arch/arm/mach-vexpress/v2m.c index 95886b3bb9dd..ffe7c7a85ae9 100644 --- a/arch/arm/mach-vexpress/v2m.c +++ b/arch/arm/mach-vexpress/v2m.c @@ -1,8 +1,31 @@ // SPDX-License-Identifier: GPL-2.0 +#include +#include #include #include "core.h" +#define SYS_FLAGSSET 0x030 +#define SYS_FLAGSCLR 0x034 + +void vexpress_flags_set(u32 data) +{ + static void __iomem *base; + + if (!base) { + struct device_node *node = of_find_compatible_node(NULL, NULL, + "arm,vexpress-sysreg"); + + base = of_iomap(node, 0); + } + + if (WARN_ON(!base)) + return; + + writel(~0, base + SYS_FLAGSCLR); + writel(data, base + SYS_FLAGSSET); +} + static const char * const v2m_dt_match[] __initconst = { "arm,vexpress", NULL, diff --git a/drivers/mfd/vexpress-sysreg.c b/drivers/mfd/vexpress-sysreg.c index c68ff56dbdb1..0b9cc67706c7 100644 --- 
a/drivers/mfd/vexpress-sysreg.c +++ b/drivers/mfd/vexpress-sysreg.c @@ -8,7 +8,6 @@ #include #include #include -#include #include #include #include @@ -42,24 +41,6 @@ #define SYS_MISC_MASTERSITE (1 << 14) -void vexpress_flags_set(u32 data) -{ - static void __iomem *base; - - if (!base) { - struct device_node *node = of_find_compatible_node(NULL, NULL, - "arm,vexpress-sysreg"); - - base = of_iomap(node, 0); - } - - if (WARN_ON(!base)) - return; - - writel(~0, base + SYS_FLAGSCLR); - writel(data, base + SYS_FLAGSSET); -} - /* The sysreg block is just a random collection of various functions... */ static struct syscon_platform_data vexpress_sysreg_sys_id_pdata = { diff --git a/include/linux/vexpress.h b/include/linux/vexpress.h index 0e130b5077a5..2ec7992b054c 100644 --- a/include/linux/vexpress.h +++ b/include/linux/vexpress.h @@ -40,8 +40,4 @@ struct device *vexpress_config_bridge_register(struct device *parent, struct regmap *devm_regmap_init_vexpress_config(struct device *dev); -/* Platform control */ - -void vexpress_flags_set(u32 data); - #endif -- cgit v1.2.3 From c3b3f52476412a3899f2c65b220075aceb18dd2c Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 5 May 2020 12:12:53 +0200 Subject: signal: refactor copy_siginfo_to_user32 Factor out a copy_siginfo_to_external32 helper from copy_siginfo_to_user32 that fills out the compat_siginfo, but does so on a kernel space data structure. With that we can let architectures override copy_siginfo_to_user32 with their own implementations using copy_siginfo_to_external32. That allows moving the x32 SIGCHLD purely to x86 architecture code. As a nice side effect copy_siginfo_to_external32 also comes in handy for avoiding a set_fs() call in the coredump code later on. Contains improvements from Eric W. Biederman and Arnd Bergmann . 
Signed-off-by: Christoph Hellwig Signed-off-by: Al Viro --- arch/x86/ia32/ia32_signal.c | 2 +- arch/x86/include/asm/compat.h | 8 +++- arch/x86/kernel/signal.c | 28 ++++++++++- include/linux/compat.h | 11 ++++- kernel/signal.c | 106 +++++++++++++++++++++--------------------- 5 files changed, 96 insertions(+), 59 deletions(-) (limited to 'include/linux') diff --git a/arch/x86/ia32/ia32_signal.c b/arch/x86/ia32/ia32_signal.c index f9d8804144d0..81cf22398cd1 100644 --- a/arch/x86/ia32/ia32_signal.c +++ b/arch/x86/ia32/ia32_signal.c @@ -350,7 +350,7 @@ int ia32_setup_rt_frame(int sig, struct ksignal *ksig, unsafe_put_user(*(__u64 *)set, (__u64 *)&frame->uc.uc_sigmask, Efault); user_access_end(); - if (__copy_siginfo_to_user32(&frame->info, &ksig->info, false)) + if (__copy_siginfo_to_user32(&frame->info, &ksig->info)) return -EFAULT; /* Set up registers for signal handler */ diff --git a/arch/x86/include/asm/compat.h b/arch/x86/include/asm/compat.h index 52e9f3480f69..d4edf281fff4 100644 --- a/arch/x86/include/asm/compat.h +++ b/arch/x86/include/asm/compat.h @@ -214,7 +214,11 @@ static inline bool in_compat_syscall(void) #endif struct compat_siginfo; -int __copy_siginfo_to_user32(struct compat_siginfo __user *to, - const kernel_siginfo_t *from, bool x32_ABI); + +#ifdef CONFIG_X86_X32_ABI +int copy_siginfo_to_user32(struct compat_siginfo __user *to, + const kernel_siginfo_t *from); +#define copy_siginfo_to_user32 copy_siginfo_to_user32 +#endif /* CONFIG_X86_X32_ABI */ #endif /* _ASM_X86_COMPAT_H */ diff --git a/arch/x86/kernel/signal.c b/arch/x86/kernel/signal.c index 83b74fb38c8f..f3df262e370b 100644 --- a/arch/x86/kernel/signal.c +++ b/arch/x86/kernel/signal.c @@ -37,6 +37,7 @@ #include #ifdef CONFIG_X86_64 +#include #include #include #endif /* CONFIG_X86_64 */ @@ -511,6 +512,31 @@ Efault: } #endif /* CONFIG_X86_32 */ +#ifdef CONFIG_X86_X32_ABI +static int x32_copy_siginfo_to_user(struct compat_siginfo __user *to, + const struct kernel_siginfo *from) +{ + struct compat_siginfo new; + + copy_siginfo_to_external32(&new, from); + if (from->si_signo == SIGCHLD) { + new._sifields._sigchld_x32._utime = from->si_utime; + new._sifields._sigchld_x32._stime = from->si_stime; + } + if (copy_to_user(to, &new, sizeof(struct compat_siginfo))) + return -EFAULT; + return 0; +} + +int copy_siginfo_to_user32(struct compat_siginfo __user *to, + const struct kernel_siginfo *from) +{ + if (in_x32_syscall()) + return x32_copy_siginfo_to_user(to, from); + return __copy_siginfo_to_user32(to, from); +} +#endif /* CONFIG_X86_X32_ABI */ + static int x32_setup_rt_frame(struct ksignal *ksig, compat_sigset_t *set, struct pt_regs *regs) @@ -543,7 +569,7 @@ static int x32_setup_rt_frame(struct ksignal *ksig, user_access_end(); if (ksig->ka.sa.sa_flags & SA_SIGINFO) { - if (__copy_siginfo_to_user32(&frame->info, &ksig->info, true)) + if (x32_copy_siginfo_to_user(&frame->info, &ksig->info)) return -EFAULT; } diff --git a/include/linux/compat.h b/include/linux/compat.h index 0480ba4db592..e90100c0de72 100644 --- a/include/linux/compat.h +++ b/include/linux/compat.h @@ -402,8 +402,15 @@ long compat_get_bitmap(unsigned long *mask, const compat_ulong_t __user *umask, unsigned long bitmap_size); long compat_put_bitmap(compat_ulong_t __user *umask, unsigned long *mask, unsigned long bitmap_size); -int copy_siginfo_from_user32(kernel_siginfo_t *to, const struct compat_siginfo __user *from); -int copy_siginfo_to_user32(struct compat_siginfo __user *to, const kernel_siginfo_t *from); +void copy_siginfo_to_external32(struct 
compat_siginfo *to, + const struct kernel_siginfo *from); +int copy_siginfo_from_user32(kernel_siginfo_t *to, + const struct compat_siginfo __user *from); +int __copy_siginfo_to_user32(struct compat_siginfo __user *to, + const kernel_siginfo_t *from); +#ifndef copy_siginfo_to_user32 +#define copy_siginfo_to_user32 __copy_siginfo_to_user32 +#endif int get_compat_sigevent(struct sigevent *event, const struct compat_sigevent __user *u_event); diff --git a/kernel/signal.c b/kernel/signal.c index e58a6c619824..2bc9458d40bd 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -3235,94 +3235,94 @@ int copy_siginfo_from_user(kernel_siginfo_t *to, const siginfo_t __user *from) } #ifdef CONFIG_COMPAT -int copy_siginfo_to_user32(struct compat_siginfo __user *to, - const struct kernel_siginfo *from) -#if defined(CONFIG_X86_X32_ABI) || defined(CONFIG_IA32_EMULATION) -{ - return __copy_siginfo_to_user32(to, from, in_x32_syscall()); -} -int __copy_siginfo_to_user32(struct compat_siginfo __user *to, - const struct kernel_siginfo *from, bool x32_ABI) -#endif +/** + * copy_siginfo_to_external32 - copy a kernel siginfo into a compat user siginfo + * @to: compat siginfo destination + * @from: kernel siginfo source + * + * Note: This function does not work properly for the SIGCHLD on x32, but + * fortunately it doesn't have to. The only valid callers for this function are + * copy_siginfo_to_user32, which is overriden for x32 and the coredump code. + * The latter does not care because SIGCHLD will never cause a coredump. + */ +void copy_siginfo_to_external32(struct compat_siginfo *to, + const struct kernel_siginfo *from) { - struct compat_siginfo new; - memset(&new, 0, sizeof(new)); + memset(to, 0, sizeof(*to)); - new.si_signo = from->si_signo; - new.si_errno = from->si_errno; - new.si_code = from->si_code; + to->si_signo = from->si_signo; + to->si_errno = from->si_errno; + to->si_code = from->si_code; switch(siginfo_layout(from->si_signo, from->si_code)) { case SIL_KILL: - new.si_pid = from->si_pid; - new.si_uid = from->si_uid; + to->si_pid = from->si_pid; + to->si_uid = from->si_uid; break; case SIL_TIMER: - new.si_tid = from->si_tid; - new.si_overrun = from->si_overrun; - new.si_int = from->si_int; + to->si_tid = from->si_tid; + to->si_overrun = from->si_overrun; + to->si_int = from->si_int; break; case SIL_POLL: - new.si_band = from->si_band; - new.si_fd = from->si_fd; + to->si_band = from->si_band; + to->si_fd = from->si_fd; break; case SIL_FAULT: - new.si_addr = ptr_to_compat(from->si_addr); + to->si_addr = ptr_to_compat(from->si_addr); #ifdef __ARCH_SI_TRAPNO - new.si_trapno = from->si_trapno; + to->si_trapno = from->si_trapno; #endif break; case SIL_FAULT_MCEERR: - new.si_addr = ptr_to_compat(from->si_addr); + to->si_addr = ptr_to_compat(from->si_addr); #ifdef __ARCH_SI_TRAPNO - new.si_trapno = from->si_trapno; + to->si_trapno = from->si_trapno; #endif - new.si_addr_lsb = from->si_addr_lsb; + to->si_addr_lsb = from->si_addr_lsb; break; case SIL_FAULT_BNDERR: - new.si_addr = ptr_to_compat(from->si_addr); + to->si_addr = ptr_to_compat(from->si_addr); #ifdef __ARCH_SI_TRAPNO - new.si_trapno = from->si_trapno; + to->si_trapno = from->si_trapno; #endif - new.si_lower = ptr_to_compat(from->si_lower); - new.si_upper = ptr_to_compat(from->si_upper); + to->si_lower = ptr_to_compat(from->si_lower); + to->si_upper = ptr_to_compat(from->si_upper); break; case SIL_FAULT_PKUERR: - new.si_addr = ptr_to_compat(from->si_addr); + to->si_addr = ptr_to_compat(from->si_addr); #ifdef __ARCH_SI_TRAPNO - new.si_trapno = 
from->si_trapno;
+		to->si_trapno = from->si_trapno;
 #endif
-		new.si_pkey = from->si_pkey;
+		to->si_pkey = from->si_pkey;
 		break;
 	case SIL_CHLD:
-		new.si_pid = from->si_pid;
-		new.si_uid = from->si_uid;
-		new.si_status = from->si_status;
-#ifdef CONFIG_X86_X32_ABI
-		if (x32_ABI) {
-			new._sifields._sigchld_x32._utime = from->si_utime;
-			new._sifields._sigchld_x32._stime = from->si_stime;
-		} else
-#endif
-		{
-			new.si_utime = from->si_utime;
-			new.si_stime = from->si_stime;
-		}
+		to->si_pid = from->si_pid;
+		to->si_uid = from->si_uid;
+		to->si_status = from->si_status;
+		to->si_utime = from->si_utime;
+		to->si_stime = from->si_stime;
 		break;
 	case SIL_RT:
-		new.si_pid = from->si_pid;
-		new.si_uid = from->si_uid;
-		new.si_int = from->si_int;
+		to->si_pid = from->si_pid;
+		to->si_uid = from->si_uid;
+		to->si_int = from->si_int;
 		break;
 	case SIL_SYS:
-		new.si_call_addr = ptr_to_compat(from->si_call_addr);
-		new.si_syscall = from->si_syscall;
-		new.si_arch = from->si_arch;
+		to->si_call_addr = ptr_to_compat(from->si_call_addr);
+		to->si_syscall = from->si_syscall;
+		to->si_arch = from->si_arch;
 		break;
 	}
+}
+int __copy_siginfo_to_user32(struct compat_siginfo __user *to,
+			     const struct kernel_siginfo *from)
+{
+	struct compat_siginfo new;
+
+	copy_siginfo_to_external32(&new, from);
 	if (copy_to_user(to, &new, sizeof(struct compat_siginfo)))
 		return -EFAULT;
-
 	return 0;
 }
--
cgit v1.2.3

From fa4751f454e6b51ef93babfd8b6c8b43a65c9db2 Mon Sep 17 00:00:00 2001
From: "Eric W. Biederman"
Date: Tue, 5 May 2020 12:12:54 +0200
Subject: binfmt_elf: remove the set_fs in fill_siginfo_note

The code in binfmt_elf.c is different from the rest of the code that
processes siginfo, as it sends siginfo from a kernel buffer to a file
rather than from kernel memory to userspace buffers. To remove its use
of set_fs the code needs some different siginfo helpers.

Add the helper copy_siginfo_to_external to copy from the kernel's
internal siginfo layout to a buffer in the siginfo layout that userspace
expects.

Modify fill_siginfo_note to use copy_siginfo_to_external instead of
set_fs and copy_siginfo_to_user.

Update compat_binfmt_elf.c to use the previously added
copy_siginfo_to_external32 to handle the compat case.

Signed-off-by: "Eric W.
Biederman" Signed-off-by: Christoph Hellwig Signed-off-by: Al Viro --- fs/binfmt_elf.c | 5 +---- fs/compat_binfmt_elf.c | 2 +- include/linux/signal.h | 8 ++++++++ 3 files changed, 10 insertions(+), 5 deletions(-) (limited to 'include/linux') diff --git a/fs/binfmt_elf.c b/fs/binfmt_elf.c index 13f25e241ac4..a1f57e20c3cf 100644 --- a/fs/binfmt_elf.c +++ b/fs/binfmt_elf.c @@ -1556,10 +1556,7 @@ static void fill_auxv_note(struct memelfnote *note, struct mm_struct *mm) static void fill_siginfo_note(struct memelfnote *note, user_siginfo_t *csigdata, const kernel_siginfo_t *siginfo) { - mm_segment_t old_fs = get_fs(); - set_fs(KERNEL_DS); - copy_siginfo_to_user((user_siginfo_t __user *) csigdata, siginfo); - set_fs(old_fs); + copy_siginfo_to_external(csigdata, siginfo); fill_note(note, "CORE", NT_SIGINFO, sizeof(*csigdata), csigdata); } diff --git a/fs/compat_binfmt_elf.c b/fs/compat_binfmt_elf.c index aaad4ca1217e..fa0e24e1b726 100644 --- a/fs/compat_binfmt_elf.c +++ b/fs/compat_binfmt_elf.c @@ -39,7 +39,7 @@ */ #define user_long_t compat_long_t #define user_siginfo_t compat_siginfo_t -#define copy_siginfo_to_user copy_siginfo_to_user32 +#define copy_siginfo_to_external copy_siginfo_to_external32 /* * The machine-dependent core note format types are defined in elfcore-compat.h, diff --git a/include/linux/signal.h b/include/linux/signal.h index 05bacd2ab135..6bb1a3f0258c 100644 --- a/include/linux/signal.h +++ b/include/linux/signal.h @@ -24,6 +24,14 @@ static inline void clear_siginfo(kernel_siginfo_t *info) #define SI_EXPANSION_SIZE (sizeof(struct siginfo) - sizeof(struct kernel_siginfo)) +static inline void copy_siginfo_to_external(siginfo_t *to, + const kernel_siginfo_t *from) +{ + memcpy(to, from, sizeof(*from)); + memset(((char *)to) + sizeof(struct kernel_siginfo), 0, + SI_EXPANSION_SIZE); +} + int copy_siginfo_to_user(siginfo_t __user *to, const kernel_siginfo_t *from); int copy_siginfo_from_user(kernel_siginfo_t *to, const siginfo_t __user *from); -- cgit v1.2.3 From d755cdb1b9d7e1b645e176b97eb137194bbe8cf9 Mon Sep 17 00:00:00 2001 From: Peter Chen Date: Thu, 23 Jan 2020 14:00:26 +0800 Subject: usb: chipidea: introduce CI_HDRC_CONTROLLER_VBUS_EVENT glue layer use Some vendors glue layer need to handle some events for vbus, eg, some i.mx platforms (imx7d, imx8mm, imx8mn, etc) needs vbus event to handle charger detection, its charger detection is finished at glue layer code, but not at USB PHY driver. Signed-off-by: Peter Chen --- drivers/usb/chipidea/udc.c | 7 ++++++- include/linux/usb/chipidea.h | 1 + 2 files changed, 7 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/drivers/usb/chipidea/udc.c b/drivers/usb/chipidea/udc.c index 921bcf14dc06..da70fbe7ca4c 100644 --- a/drivers/usb/chipidea/udc.c +++ b/drivers/usb/chipidea/udc.c @@ -1561,6 +1561,7 @@ static int ci_udc_vbus_session(struct usb_gadget *_gadget, int is_active) { struct ci_hdrc *ci = container_of(_gadget, struct ci_hdrc, gadget); unsigned long flags; + int ret = 0; spin_lock_irqsave(&ci->lock, flags); ci->vbus_active = is_active; @@ -1570,10 +1571,14 @@ static int ci_udc_vbus_session(struct usb_gadget *_gadget, int is_active) usb_phy_set_charger_state(ci->usb_phy, is_active ? 
USB_CHARGER_PRESENT : USB_CHARGER_ABSENT); + if (ci->platdata->notify_event) + ret = ci->platdata->notify_event(ci, + CI_HDRC_CONTROLLER_VBUS_EVENT); + if (ci->driver) ci_hdrc_gadget_connect(_gadget, is_active); - return 0; + return ret; } static int ci_udc_wakeup(struct usb_gadget *_gadget) diff --git a/include/linux/usb/chipidea.h b/include/linux/usb/chipidea.h index edd89b7c8f18..54167a2d28ea 100644 --- a/include/linux/usb/chipidea.h +++ b/include/linux/usb/chipidea.h @@ -67,6 +67,7 @@ struct ci_hdrc_platform_data { #define CI_HDRC_CONTROLLER_STOPPED_EVENT 1 #define CI_HDRC_IMX_HSIC_ACTIVE_EVENT 2 #define CI_HDRC_IMX_HSIC_SUSPEND_EVENT 3 +#define CI_HDRC_CONTROLLER_VBUS_EVENT 4 int (*notify_event) (struct ci_hdrc *ci, unsigned event); struct regulator *reg_vbus; struct usb_otg_caps ci_otg_caps; -- cgit v1.2.3 From 57a29df341466b5cca43ba3d2d7064426727d7c3 Mon Sep 17 00:00:00 2001 From: Kai-Heng Feng Date: Sat, 25 Apr 2020 02:49:14 +0800 Subject: iopoll: Introduce read_poll_timeout_atomic macro Like read_poll_timeout, an atomic variant for multiple parameter read function can be useful. Will be used by a later patch. Signed-off-by: Kai-Heng Feng Signed-off-by: Kalle Valo Link: https://lore.kernel.org/r/20200424184918.30360-1-kai.heng.feng@canonical.com --- include/linux/iopoll.h | 62 ++++++++++++++++++++++++++++++++++---------------- 1 file changed, 43 insertions(+), 19 deletions(-) (limited to 'include/linux') diff --git a/include/linux/iopoll.h b/include/linux/iopoll.h index cb20c733b15a..bc89ac625f26 100644 --- a/include/linux/iopoll.h +++ b/include/linux/iopoll.h @@ -57,6 +57,48 @@ (cond) ? 0 : -ETIMEDOUT; \ }) +/** + * read_poll_timeout_atomic - Periodically poll an address until a condition is + * met or a timeout occurs + * @op: accessor function (takes @addr as its only argument) + * @addr: Address to poll + * @val: Variable to read the value into + * @cond: Break condition (usually involving @val) + * @delay_us: Time to udelay between reads in us (0 tight-loops). Should + * be less than ~10us since udelay is used (see + * Documentation/timers/timers-howto.rst). + * @timeout_us: Timeout in us, 0 means never timeout + * @delay_before_read: if it is true, delay @delay_us before read. + * + * Returns 0 on success and -ETIMEDOUT upon a timeout. In either + * case, the last read value at @args is stored in @val. + * + * When available, you'll probably want to use one of the specialized + * macros defined below rather than this macro directly. + */ +#define read_poll_timeout_atomic(op, val, cond, delay_us, timeout_us, \ + delay_before_read, args...) \ +({ \ + u64 __timeout_us = (timeout_us); \ + unsigned long __delay_us = (delay_us); \ + ktime_t __timeout = ktime_add_us(ktime_get(), __timeout_us); \ + if (delay_before_read && __delay_us) \ + udelay(__delay_us); \ + for (;;) { \ + (val) = op(args); \ + if (cond) \ + break; \ + if (__timeout_us && \ + ktime_compare(ktime_get(), __timeout) > 0) { \ + (val) = op(args); \ + break; \ + } \ + if (__delay_us) \ + udelay(__delay_us); \ + } \ + (cond) ? 0 : -ETIMEDOUT; \ +}) + /** * readx_poll_timeout - Periodically poll an address until a condition is met or a timeout occurs * @op: accessor function (takes @addr as its only argument) @@ -96,25 +138,7 @@ * macros defined below rather than this macro directly. 
*/ #define readx_poll_timeout_atomic(op, addr, val, cond, delay_us, timeout_us) \ -({ \ - u64 __timeout_us = (timeout_us); \ - unsigned long __delay_us = (delay_us); \ - ktime_t __timeout = ktime_add_us(ktime_get(), __timeout_us); \ - for (;;) { \ - (val) = op(addr); \ - if (cond) \ - break; \ - if (__timeout_us && \ - ktime_compare(ktime_get(), __timeout) > 0) { \ - (val) = op(addr); \ - break; \ - } \ - if (__delay_us) \ - udelay(__delay_us); \ - } \ - (cond) ? 0 : -ETIMEDOUT; \ -}) - + read_poll_timeout_atomic(op, val, cond, delay_us, timeout_us, false, addr) #define readb_poll_timeout(addr, val, cond, delay_us, timeout_us) \ readx_poll_timeout(readb, addr, val, cond, delay_us, timeout_us) -- cgit v1.2.3 From 0a64f38037cc230e2ad888a74263db6f6588cd90 Mon Sep 17 00:00:00 2001 From: Greg Kroah-Hartman Date: Wed, 6 May 2020 14:29:02 +0200 Subject: Revert "tty: serial: qcom_geni_serial: Use OPP API to set clk/perf state" This reverts commit 3d9231e69831551da3a78a618b31702ea4dedd5e Rajendra writes: Greg, there are other patches in the series which have a dependency on this patch [1] would it be possible for you to drop this patch and instead ack it so it can be taken via the msm tree? So dropping it from here. Reported-by: Rajendra Nayak Cc: Matthias Kaehlcke Cc: Akash Asthana Signed-off-by: Greg Kroah-Hartman --- drivers/tty/serial/qcom_geni_serial.c | 34 +++++----------------------------- include/linux/qcom-geni-se.h | 4 ---- 2 files changed, 5 insertions(+), 33 deletions(-) (limited to 'include/linux') diff --git a/drivers/tty/serial/qcom_geni_serial.c b/drivers/tty/serial/qcom_geni_serial.c index dd3d1ba38bd9..6119090ce045 100644 --- a/drivers/tty/serial/qcom_geni_serial.c +++ b/drivers/tty/serial/qcom_geni_serial.c @@ -9,7 +9,6 @@ #include #include #include -#include #include #include #include @@ -962,7 +961,7 @@ static void qcom_geni_serial_set_termios(struct uart_port *uport, goto out_restart_rx; uport->uartclk = clk_rate; - dev_pm_opp_set_rate(uport->dev, clk_rate); + clk_set_rate(port->se.clk, clk_rate); ser_clk_cfg = SER_CLK_EN; ser_clk_cfg |= clk_div << CLK_DIV_SHFT; @@ -1199,11 +1198,8 @@ static void qcom_geni_serial_pm(struct uart_port *uport, if (new_state == UART_PM_STATE_ON && old_state == UART_PM_STATE_OFF) geni_se_resources_on(&port->se); else if (new_state == UART_PM_STATE_OFF && - old_state == UART_PM_STATE_ON) { - /* Drop the performance state vote */ - dev_pm_opp_set_rate(uport->dev, 0); + old_state == UART_PM_STATE_ON) geni_se_resources_off(&port->se); - } } static const struct uart_ops qcom_geni_console_pops = { @@ -1322,25 +1318,13 @@ static int qcom_geni_serial_probe(struct platform_device *pdev) if (of_property_read_bool(pdev->dev.of_node, "cts-rts-swap")) port->cts_rts_swap = true; - port->se.opp_table = dev_pm_opp_set_clkname(&pdev->dev, "se"); - if (IS_ERR(port->se.opp_table)) - return PTR_ERR(port->se.opp_table); - /* OPP table is optional */ - ret = dev_pm_opp_of_add_table(&pdev->dev); - if (!ret) { - port->se.has_opp_table = true; - } else if (ret != -ENODEV) { - dev_err(&pdev->dev, "invalid OPP table in device tree\n"); - return ret; - } - uport->private_data = drv; platform_set_drvdata(pdev, port); port->handle_rx = console ? 
handle_rx_console : handle_rx_uart;
 	ret = uart_add_one_port(drv, uport);
 	if (ret)
-		goto err;
+		return ret;

 	irq_set_status_flags(uport->irq, IRQ_NOAUTOEN);
 	ret = devm_request_irq(uport->dev, uport->irq, qcom_geni_serial_isr,
@@ -1348,7 +1332,7 @@ static int qcom_geni_serial_probe(struct platform_device *pdev)
 	if (ret) {
 		dev_err(uport->dev, "Failed to get IRQ ret %d\n", ret);
 		uart_remove_one_port(drv, uport);
-		goto err;
+		return ret;
 	}

 	/*
@@ -1365,16 +1349,11 @@ static int qcom_geni_serial_probe(struct platform_device *pdev)
 		if (ret) {
 			device_init_wakeup(&pdev->dev, false);
 			uart_remove_one_port(drv, uport);
-			goto err;
+			return ret;
 		}
 	}

 	return 0;
-err:
-	if (port->se.has_opp_table)
-		dev_pm_opp_of_remove_table(&pdev->dev);
-	dev_pm_opp_put_clkname(port->se.opp_table);
-	return ret;
 }

 static int qcom_geni_serial_remove(struct platform_device *pdev)
@@ -1382,9 +1361,6 @@ static int qcom_geni_serial_remove(struct platform_device *pdev)
 	struct qcom_geni_serial_port *port = platform_get_drvdata(pdev);
 	struct uart_driver *drv = port->uport.private_data;

-	if (port->se.has_opp_table)
-		dev_pm_opp_of_remove_table(&pdev->dev);
-	dev_pm_opp_put_clkname(port->se.opp_table);
 	dev_pm_clear_wake_irq(&pdev->dev);
 	device_init_wakeup(&pdev->dev, false);
 	uart_remove_one_port(drv, &port->uport);
diff --git a/include/linux/qcom-geni-se.h b/include/linux/qcom-geni-se.h
index 6b78094a4e23..dd464943f717 100644
--- a/include/linux/qcom-geni-se.h
+++ b/include/linux/qcom-geni-se.h
@@ -33,8 +33,6 @@ struct clk;
  * @clk: Handle to the core serial engine clock
  * @num_clk_levels: Number of valid clock levels in clk_perf_tbl
  * @clk_perf_tbl: Table of clock frequency input to serial engine clock
- * @opp_table: Pointer to the OPP table
- * @has_opp_table: Specifies if the SE has an OPP table
  */
 struct geni_se {
 	void __iomem *base;
@@ -43,8 +41,6 @@ struct geni_se {
 	struct clk *clk;
 	unsigned int num_clk_levels;
 	unsigned long *clk_perf_tbl;
-	struct opp_table *opp_table;
-	bool has_opp_table;
 };

 /* Common SE registers */
--
cgit v1.2.3

From b720aaa347f227c416e8aed2f12ca62ea4f1cd4e Mon Sep 17 00:00:00 2001
From: Dmitry Osipenko
Date: Wed, 25 Mar 2020 01:43:34 +0300
Subject: firmware: tf: Different way of L2 cache enabling after LP2 suspend
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The ASUS TF300T device may not work properly if the firmware is asked to
fully re-initialize the L2 cache after resume from LP2 suspend. The
downstream kernel of the TF300T uses a different opcode to enable the
cache after resuming from LP2; this opcode also works fine on Nexus 7
and Ouya devices. Supposedly, this may be needed by older firmware
versions.
Reported-by: Michał Mirosław Tested-by: Michał Mirosław Tested-by: Jasper Korten Tested-by: David Heidelberg Tested-by: Peter Geis Signed-off-by: Dmitry Osipenko Signed-off-by: Thierry Reding --- drivers/firmware/trusted_foundations.c | 21 +++++++++++++++++++-- include/linux/firmware/trusted_foundations.h | 1 + 2 files changed, 20 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/drivers/firmware/trusted_foundations.c b/drivers/firmware/trusted_foundations.c index fc544e19b0a1..1389fa9418a7 100644 --- a/drivers/firmware/trusted_foundations.c +++ b/drivers/firmware/trusted_foundations.c @@ -19,6 +19,7 @@ #define TF_CACHE_ENABLE 1 #define TF_CACHE_DISABLE 2 +#define TF_CACHE_REENABLE 4 #define TF_SET_CPU_BOOT_ADDR_SMC 0xfffff200 @@ -29,6 +30,7 @@ #define TF_CPU_PM_S1 0xffffffe4 #define TF_CPU_PM_S1_NOFLUSH_L2 0xffffffe7 +static unsigned long tf_idle_mode = TF_PM_MODE_NONE; static unsigned long cpu_boot_addr; static void tf_generic_smc(u32 type, u32 arg1, u32 arg2) @@ -85,25 +87,40 @@ static int tf_prepare_idle(unsigned long mode) cpu_boot_addr); break; + case TF_PM_MODE_NONE: + break; + default: return -EINVAL; } + tf_idle_mode = mode; + return 0; } #ifdef CONFIG_CACHE_L2X0 static void tf_cache_write_sec(unsigned long val, unsigned int reg) { - u32 l2x0_way_mask = 0xff; + u32 enable_op, l2x0_way_mask = 0xff; switch (reg) { case L2X0_CTRL: if (l2x0_saved_regs.aux_ctrl & L310_AUX_CTRL_ASSOCIATIVITY_16) l2x0_way_mask = 0xffff; + switch (tf_idle_mode) { + case TF_PM_MODE_LP2: + enable_op = TF_CACHE_REENABLE; + break; + + default: + enable_op = TF_CACHE_ENABLE; + break; + } + if (val == L2X0_CTRL_EN) - tf_generic_smc(TF_CACHE_MAINT, TF_CACHE_ENABLE, + tf_generic_smc(TF_CACHE_MAINT, enable_op, l2x0_saved_regs.aux_ctrl); else tf_generic_smc(TF_CACHE_MAINT, TF_CACHE_DISABLE, diff --git a/include/linux/firmware/trusted_foundations.h b/include/linux/firmware/trusted_foundations.h index 2549a2db56aa..be5984bda592 100644 --- a/include/linux/firmware/trusted_foundations.h +++ b/include/linux/firmware/trusted_foundations.h @@ -32,6 +32,7 @@ #define TF_PM_MODE_LP1_NO_MC_CLK 2 #define TF_PM_MODE_LP2 3 #define TF_PM_MODE_LP2_NOFLUSH_L2 4 +#define TF_PM_MODE_NONE 5 struct trusted_foundations_platform_data { unsigned int version_major; -- cgit v1.2.3 From 19acd03d95dad1f50d06f28179a1866fca431fed Mon Sep 17 00:00:00 2001 From: Marco Elver Date: Fri, 24 Apr 2020 17:47:29 +0200 Subject: kcsan: Add __kcsan_{enable,disable}_current() variants The __kcsan_{enable,disable}_current() variants only call into KCSAN if KCSAN is enabled for the current compilation unit. Note: This is typically not what we want, as we usually want to ensure that even calls into other functions still have KCSAN disabled. These variants may safely be used in header files that are shared between regular kernel code and code that does not link the KCSAN runtime. Signed-off-by: Marco Elver Signed-off-by: Paul E. McKenney --- include/linux/kcsan-checks.h | 17 ++++++++++++++--- kernel/kcsan/core.c | 7 +++++++ 2 files changed, 21 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/kcsan-checks.h b/include/linux/kcsan-checks.h index ef95ddc49182..7b0b9c44f5f3 100644 --- a/include/linux/kcsan-checks.h +++ b/include/linux/kcsan-checks.h @@ -49,6 +49,7 @@ void kcsan_disable_current(void); * Supports nesting. */ void kcsan_enable_current(void); +void kcsan_enable_current_nowarn(void); /* Safe in uaccess regions. 
*/ /** * kcsan_nestable_atomic_begin - begin nestable atomic region @@ -149,6 +150,7 @@ static inline void __kcsan_check_access(const volatile void *ptr, size_t size, static inline void kcsan_disable_current(void) { } static inline void kcsan_enable_current(void) { } +static inline void kcsan_enable_current_nowarn(void) { } static inline void kcsan_nestable_atomic_begin(void) { } static inline void kcsan_nestable_atomic_end(void) { } static inline void kcsan_flat_atomic_begin(void) { } @@ -165,15 +167,24 @@ static inline void kcsan_end_scoped_access(struct kcsan_scoped_access *sa) { } #endif /* CONFIG_KCSAN */ +#ifdef __SANITIZE_THREAD__ /* - * kcsan_*: Only calls into the runtime when the particular compilation unit has - * KCSAN instrumentation enabled. May be used in header files. + * Only calls into the runtime when the particular compilation unit has KCSAN + * instrumentation enabled. May be used in header files. */ -#ifdef __SANITIZE_THREAD__ #define kcsan_check_access __kcsan_check_access + +/* + * Only use these to disable KCSAN for accesses in the current compilation unit; + * calls into libraries may still perform KCSAN checks. + */ +#define __kcsan_disable_current kcsan_disable_current +#define __kcsan_enable_current kcsan_enable_current_nowarn #else static inline void kcsan_check_access(const volatile void *ptr, size_t size, int type) { } +static inline void __kcsan_enable_current(void) { } +static inline void __kcsan_disable_current(void) { } #endif /** diff --git a/kernel/kcsan/core.c b/kernel/kcsan/core.c index a572aae61b98..a73a66cf79df 100644 --- a/kernel/kcsan/core.c +++ b/kernel/kcsan/core.c @@ -625,6 +625,13 @@ void kcsan_enable_current(void) } EXPORT_SYMBOL(kcsan_enable_current); +void kcsan_enable_current_nowarn(void) +{ + if (get_ctx()->disable_count-- == 0) + kcsan_disable_current(); +} +EXPORT_SYMBOL(kcsan_enable_current_nowarn); + void kcsan_nestable_atomic_begin(void) { /* -- cgit v1.2.3 From 6349084746ff4f5f7ebc748e4b2a890f8c57b129 Mon Sep 17 00:00:00 2001 From: Michael Walle Date: Wed, 6 May 2020 16:53:13 +0200 Subject: net: phy: add concept of shared storage for PHYs There are packages which contain multiple PHY devices, eg. a quad PHY transceiver. Provide functions to allocate and free shared storage. Usually, a quad PHY contains global registers, which don't belong to any PHY. Provide convenience functions to access these registers. Signed-off-by: Michael Walle Reviewed-by: Andrew Lunn Reviewed-by: Florian Fainelli Signed-off-by: David S. 
Miller --- drivers/net/phy/mdio_bus.c | 1 + drivers/net/phy/phy_device.c | 138 +++++++++++++++++++++++++++++++++++++++++++ include/linux/phy.h | 89 ++++++++++++++++++++++++++++ 3 files changed, 228 insertions(+) (limited to 'include/linux') diff --git a/drivers/net/phy/mdio_bus.c b/drivers/net/phy/mdio_bus.c index 3e79b96fa344..255fdfcc13a6 100644 --- a/drivers/net/phy/mdio_bus.c +++ b/drivers/net/phy/mdio_bus.c @@ -614,6 +614,7 @@ int __mdiobus_register(struct mii_bus *bus, struct module *owner) } mutex_init(&bus->mdio_lock); + mutex_init(&bus->shared_lock); /* de-assert bus level PHY GPIO reset */ gpiod = devm_gpiod_get_optional(&bus->dev, "reset", GPIOD_OUT_LOW); diff --git a/drivers/net/phy/phy_device.c b/drivers/net/phy/phy_device.c index 7e1ddd5745d2..b1c5e4503bc4 100644 --- a/drivers/net/phy/phy_device.c +++ b/drivers/net/phy/phy_device.c @@ -1461,6 +1461,144 @@ bool phy_driver_is_genphy_10g(struct phy_device *phydev) } EXPORT_SYMBOL_GPL(phy_driver_is_genphy_10g); +/** + * phy_package_join - join a common PHY group + * @phydev: target phy_device struct + * @addr: cookie and PHY address for global register access + * @priv_size: if non-zero allocate this amount of bytes for private data + * + * This joins a PHY group and provides a shared storage for all phydevs in + * this group. This is intended to be used for packages which contain + * more than one PHY, for example a quad PHY transceiver. + * + * The addr parameter serves as a cookie which has to have the same value + * for all members of one group and as a PHY address to access generic + * registers of a PHY package. Usually, one of the PHY addresses of the + * different PHYs in the package provides access to these global registers. + * The address which is given here, will be used in the phy_package_read() + * and phy_package_write() convenience functions. If your PHY doesn't have + * global registers you can just pick any of the PHY addresses. + * + * This will set the shared pointer of the phydev to the shared storage. + * If this is the first call for a this cookie the shared storage will be + * allocated. If priv_size is non-zero, the given amount of bytes are + * allocated for the priv member. + * + * Returns < 1 on error, 0 on success. Esp. calling phy_package_join() + * with the same cookie but a different priv_size is an error. + */ +int phy_package_join(struct phy_device *phydev, int addr, size_t priv_size) +{ + struct mii_bus *bus = phydev->mdio.bus; + struct phy_package_shared *shared; + int ret; + + if (addr < 0 || addr >= PHY_MAX_ADDR) + return -EINVAL; + + mutex_lock(&bus->shared_lock); + shared = bus->shared[addr]; + if (!shared) { + ret = -ENOMEM; + shared = kzalloc(sizeof(*shared), GFP_KERNEL); + if (!shared) + goto err_unlock; + if (priv_size) { + shared->priv = kzalloc(priv_size, GFP_KERNEL); + if (!shared->priv) + goto err_free; + shared->priv_size = priv_size; + } + shared->addr = addr; + refcount_set(&shared->refcnt, 1); + bus->shared[addr] = shared; + } else { + ret = -EINVAL; + if (priv_size && priv_size != shared->priv_size) + goto err_unlock; + refcount_inc(&shared->refcnt); + } + mutex_unlock(&bus->shared_lock); + + phydev->shared = shared; + + return 0; + +err_free: + kfree(shared); +err_unlock: + mutex_unlock(&bus->shared_lock); + return ret; +} +EXPORT_SYMBOL_GPL(phy_package_join); + +/** + * phy_package_leave - leave a common PHY group + * @phydev: target phy_device struct + * + * This leaves a PHY group created by phy_package_join(). 
If this phydev + * was the last user of the shared data between the group, this data is + * freed. Resets the phydev->shared pointer to NULL. + */ +void phy_package_leave(struct phy_device *phydev) +{ + struct phy_package_shared *shared = phydev->shared; + struct mii_bus *bus = phydev->mdio.bus; + + if (!shared) + return; + + if (refcount_dec_and_mutex_lock(&shared->refcnt, &bus->shared_lock)) { + bus->shared[shared->addr] = NULL; + mutex_unlock(&bus->shared_lock); + kfree(shared->priv); + kfree(shared); + } + + phydev->shared = NULL; +} +EXPORT_SYMBOL_GPL(phy_package_leave); + +static void devm_phy_package_leave(struct device *dev, void *res) +{ + phy_package_leave(*(struct phy_device **)res); +} + +/** + * devm_phy_package_join - resource managed phy_package_join() + * @dev: device that is registering this PHY package + * @phydev: target phy_device struct + * @addr: cookie and PHY address for global register access + * @priv_size: if non-zero allocate this amount of bytes for private data + * + * Managed phy_package_join(). Shared storage fetched by this function, + * phy_package_leave() is automatically called on driver detach. See + * phy_package_join() for more information. + */ +int devm_phy_package_join(struct device *dev, struct phy_device *phydev, + int addr, size_t priv_size) +{ + struct phy_device **ptr; + int ret; + + ptr = devres_alloc(devm_phy_package_leave, sizeof(*ptr), + GFP_KERNEL); + if (!ptr) + return -ENOMEM; + + ret = phy_package_join(phydev, addr, priv_size); + + if (!ret) { + *ptr = phydev; + devres_add(dev, ptr); + } else { + devres_free(ptr); + } + + return ret; +} +EXPORT_SYMBOL_GPL(devm_phy_package_join); + /** * phy_detach - detach a PHY device from its network device * @phydev: target phy_device struct diff --git a/include/linux/phy.h b/include/linux/phy.h index e2bfb9240587..1d36ac608159 100644 --- a/include/linux/phy.h +++ b/include/linux/phy.h @@ -25,6 +25,7 @@ #include #include #include +#include #include @@ -227,6 +228,28 @@ struct mdio_bus_stats { struct u64_stats_sync syncp; }; +/* Represents a shared structure between different phydev's in the same + * package, for example a quad PHY. See phy_package_join() and + * phy_package_leave(). + */ +struct phy_package_shared { + int addr; + refcount_t refcnt; + unsigned long flags; + size_t priv_size; + + /* private data pointer */ + /* note that this pointer is shared between different phydevs and + * the user has to take care of appropriate locking. It is allocated + * and freed automatically by phy_package_join() and + * phy_package_leave(). + */ + void *priv; +}; + +/* used as bit number in atomic bitops */ +#define PHY_SHARED_F_INIT_DONE 0 + /* * The Bus class for PHYs. Devices which provide access to * PHYs should register using this structure @@ -278,6 +301,12 @@ struct mii_bus { int reset_delay_us; /* RESET GPIO descriptor pointer */ struct gpio_desc *reset_gpiod; + + /* protect access to the shared element */ + struct mutex shared_lock; + + /* shared state across different PHYs */ + struct phy_package_shared *shared[PHY_MAX_ADDR]; }; #define to_mii_bus(d) container_of(d, struct mii_bus, dev) @@ -478,6 +507,10 @@ struct phy_device { /* For use by PHYs to maintain extra state */ void *priv; + /* shared data pointer */ + /* For use by PHYs inside the same package that need a shared state. 
*/ + struct phy_package_shared *shared; + /* Interrupt and Polling infrastructure */ struct delayed_work state_queue; @@ -1354,6 +1387,10 @@ int phy_ethtool_get_link_ksettings(struct net_device *ndev, int phy_ethtool_set_link_ksettings(struct net_device *ndev, const struct ethtool_link_ksettings *cmd); int phy_ethtool_nway_reset(struct net_device *ndev); +int phy_package_join(struct phy_device *phydev, int addr, size_t priv_size); +void phy_package_leave(struct phy_device *phydev); +int devm_phy_package_join(struct device *dev, struct phy_device *phydev, + int addr, size_t priv_size); #if IS_ENABLED(CONFIG_PHYLIB) int __init mdio_bus_init(void); @@ -1406,6 +1443,58 @@ static inline int phy_ethtool_get_stats(struct phy_device *phydev, return 0; } +static inline int phy_package_read(struct phy_device *phydev, u32 regnum) +{ + struct phy_package_shared *shared = phydev->shared; + + if (!shared) + return -EIO; + + return mdiobus_read(phydev->mdio.bus, shared->addr, regnum); +} + +static inline int __phy_package_read(struct phy_device *phydev, u32 regnum) +{ + struct phy_package_shared *shared = phydev->shared; + + if (!shared) + return -EIO; + + return __mdiobus_read(phydev->mdio.bus, shared->addr, regnum); +} + +static inline int phy_package_write(struct phy_device *phydev, + u32 regnum, u16 val) +{ + struct phy_package_shared *shared = phydev->shared; + + if (!shared) + return -EIO; + + return mdiobus_write(phydev->mdio.bus, shared->addr, regnum, val); +} + +static inline int __phy_package_write(struct phy_device *phydev, + u32 regnum, u16 val) +{ + struct phy_package_shared *shared = phydev->shared; + + if (!shared) + return -EIO; + + return __mdiobus_write(phydev->mdio.bus, shared->addr, regnum, val); +} + +static inline bool phy_package_init_once(struct phy_device *phydev) +{ + struct phy_package_shared *shared = phydev->shared; + + if (!shared) + return false; + + return !test_and_set_bit(PHY_SHARED_F_INIT_DONE, &shared->flags); +} + extern struct bus_type mdio_bus_type; struct mdio_board_info { -- cgit v1.2.3 From c6af13d334759c33c14b6fad4c676c6d1dbf9564 Mon Sep 17 00:00:00 2001 From: Heiner Kallweit Date: Fri, 1 May 2020 23:27:21 +0200 Subject: timer: add fsleep for flexible sleeping Sleeping for a certain amount of time requires the use of different functions, depending on the time period. Documentation/timers/timers-howto.rst explains when to use which function, and checkpatch also checks for some potentially problematic cases. So let's create a helper that automatically chooses the appropriate sleep function -> fsleep(), for flexible sleeping. If the delay is a constant, then the compiler should be able to ensure that the new helper doesn't create overhead. If the delay is not constant, then the new helper can save some code. Signed-off-by: Heiner Kallweit Signed-off-by: David S. Miller --- Documentation/timers/timers-howto.rst | 3 +++ include/linux/delay.h | 11 +++++++++++ 2 files changed, 14 insertions(+) (limited to 'include/linux') diff --git a/Documentation/timers/timers-howto.rst b/Documentation/timers/timers-howto.rst index 7e3167bec2b1..afb0a43b8cdf 100644 --- a/Documentation/timers/timers-howto.rst +++ b/Documentation/timers/timers-howto.rst @@ -110,3 +110,6 @@ NON-ATOMIC CONTEXT: short, the difference is whether the sleep can be ended early by a signal. In general, just use msleep unless you know you have a need for the interruptible variant.
+ + FLEXIBLE SLEEPING (any delay, uninterruptible) + * Use fsleep diff --git a/include/linux/delay.h b/include/linux/delay.h index 8e6828094c1e..5e016a4029d9 100644 --- a/include/linux/delay.h +++ b/include/linux/delay.h @@ -65,4 +65,15 @@ static inline void ssleep(unsigned int seconds) msleep(seconds * 1000); } +/* see Documentation/timers/timers-howto.rst for the thresholds */ +static inline void fsleep(unsigned long usecs) +{ + if (usecs <= 10) + udelay(usecs); + else if (usecs <= 20000) + usleep_range(usecs, 2 * usecs); + else + msleep(DIV_ROUND_UP(usecs, 1000)); +} + #endif /* defined(_LINUX_DELAY_H) */ -- cgit v1.2.3 From bdbdac7649fac05f88c9f7ab18121a17fb591687 Mon Sep 17 00:00:00 2001 From: Oleksij Rempel Date: Tue, 5 May 2020 08:35:05 +0200 Subject: ethtool: provide UAPI for PHY master/slave configuration. This UAPI is needed for BroadR-Reach 100BASE-T1 devices. Due to the lack of auto-negotiation support, we needed to be able to configure the MASTER-SLAVE role of the port manually or from an application in user space. The same UAPI can be used for 1000BASE-T or MultiGBASE-T devices to force the MASTER or SLAVE role. See IEEE 802.3-2018: 22.2.4.3.7 MASTER-SLAVE control register (Register 9) 22.2.4.3.8 MASTER-SLAVE status register (Register 10) 40.5.2 MASTER-SLAVE configuration resolution 45.2.1.185.1 MASTER-SLAVE config value (1.2100.14) 45.2.7.10 MultiGBASE-T AN control 1 register (Register 7.32) The MASTER-SLAVE role affects the clock configuration: ------------------------------------------------------------------------------- When the PHY is configured as MASTER, the PMA Transmit function shall source TX_TCLK from a local clock source. When configured as SLAVE, the PMA Transmit function shall source TX_TCLK from the clock recovered from data stream provided by MASTER. iMX6Q KSZ9031 XXX ------\ /-----------\ /------------\ | | | | | MAC |<----RGMII----->| PHY Slave |<------>| PHY Master | |<--- 125 MHz ---+-<------/ | | \ | ------/ \-----------/ \------------/ ^ \-TX_TCLK ------------------------------------------------------------------------------- Since some clock or link related issues are only reproducible in a specific MASTER-SLAVE role, MAC, and PHY configuration, it is beneficial to provide a generic (not 100BASE-T1 specific) interface to user space for configuration flexibility and troubleshooting. Signed-off-by: Oleksij Rempel Reviewed-by: Andrew Lunn Signed-off-by: David S.
Miller --- Documentation/networking/ethtool-netlink.rst | 35 ++++++----- drivers/net/phy/phy.c | 4 +- drivers/net/phy/phy_device.c | 94 ++++++++++++++++++++++++++++ include/linux/phy.h | 3 + include/uapi/linux/ethtool.h | 16 ++++- include/uapi/linux/ethtool_netlink.h | 2 + include/uapi/linux/mii.h | 2 + net/ethtool/ioctl.c | 6 ++ net/ethtool/linkmodes.c | 53 ++++++++++++++++ 9 files changed, 197 insertions(+), 18 deletions(-) (limited to 'include/linux') diff --git a/Documentation/networking/ethtool-netlink.rst b/Documentation/networking/ethtool-netlink.rst index 567326491f80..8f5cefc539cf 100644 --- a/Documentation/networking/ethtool-netlink.rst +++ b/Documentation/networking/ethtool-netlink.rst @@ -392,14 +392,16 @@ Request contents: Kernel response contents: - ==================================== ====== ========================== - ``ETHTOOL_A_LINKMODES_HEADER`` nested reply header - ``ETHTOOL_A_LINKMODES_AUTONEG`` u8 autonegotiation status - ``ETHTOOL_A_LINKMODES_OURS`` bitset advertised link modes - ``ETHTOOL_A_LINKMODES_PEER`` bitset partner link modes - ``ETHTOOL_A_LINKMODES_SPEED`` u32 link speed (Mb/s) - ``ETHTOOL_A_LINKMODES_DUPLEX`` u8 duplex mode - ==================================== ====== ========================== + ========================================== ====== ========================== + ``ETHTOOL_A_LINKMODES_HEADER`` nested reply header + ``ETHTOOL_A_LINKMODES_AUTONEG`` u8 autonegotiation status + ``ETHTOOL_A_LINKMODES_OURS`` bitset advertised link modes + ``ETHTOOL_A_LINKMODES_PEER`` bitset partner link modes + ``ETHTOOL_A_LINKMODES_SPEED`` u32 link speed (Mb/s) + ``ETHTOOL_A_LINKMODES_DUPLEX`` u8 duplex mode + ``ETHTOOL_A_LINKMODES_MASTER_SLAVE_CFG`` u8 Master/slave port mode + ``ETHTOOL_A_LINKMODES_MASTER_SLAVE_STATE`` u8 Master/slave port state + ========================================== ====== ========================== For ``ETHTOOL_A_LINKMODES_OURS``, value represents advertised modes and mask represents supported modes. ``ETHTOOL_A_LINKMODES_PEER`` in the reply is a bit @@ -414,14 +416,15 @@ LINKMODES_SET Request contents: - ==================================== ====== ========================== - ``ETHTOOL_A_LINKMODES_HEADER`` nested request header - ``ETHTOOL_A_LINKMODES_AUTONEG`` u8 autonegotiation status - ``ETHTOOL_A_LINKMODES_OURS`` bitset advertised link modes - ``ETHTOOL_A_LINKMODES_PEER`` bitset partner link modes - ``ETHTOOL_A_LINKMODES_SPEED`` u32 link speed (Mb/s) - ``ETHTOOL_A_LINKMODES_DUPLEX`` u8 duplex mode - ==================================== ====== ========================== + ========================================== ====== ========================== + ``ETHTOOL_A_LINKMODES_HEADER`` nested request header + ``ETHTOOL_A_LINKMODES_AUTONEG`` u8 autonegotiation status + ``ETHTOOL_A_LINKMODES_OURS`` bitset advertised link modes + ``ETHTOOL_A_LINKMODES_PEER`` bitset partner link modes + ``ETHTOOL_A_LINKMODES_SPEED`` u32 link speed (Mb/s) + ``ETHTOOL_A_LINKMODES_DUPLEX`` u8 duplex mode + ``ETHTOOL_A_LINKMODES_MASTER_SLAVE_CFG`` u8 Master/slave port mode + ========================================== ====== ========================== ``ETHTOOL_A_LINKMODES_OURS`` bit set allows setting advertised link modes. 
If autonegotiation is on (either set now or kept from before), advertised modes diff --git a/drivers/net/phy/phy.c b/drivers/net/phy/phy.c index 72c69a9c8a98..8c22d02b4218 100644 --- a/drivers/net/phy/phy.c +++ b/drivers/net/phy/phy.c @@ -295,7 +295,7 @@ int phy_ethtool_ksettings_set(struct phy_device *phydev, phydev->advertising, autoneg == AUTONEG_ENABLE); phydev->duplex = duplex; - + phydev->master_slave_set = cmd->base.master_slave_cfg; phydev->mdix_ctrl = cmd->base.eth_tp_mdix_ctrl; /* Restart the PHY */ @@ -314,6 +314,8 @@ void phy_ethtool_ksettings_get(struct phy_device *phydev, cmd->base.speed = phydev->speed; cmd->base.duplex = phydev->duplex; + cmd->base.master_slave_cfg = phydev->master_slave_get; + cmd->base.master_slave_state = phydev->master_slave_state; if (phydev->interface == PHY_INTERFACE_MODE_MOCA) cmd->base.port = PORT_BNC; else diff --git a/drivers/net/phy/phy_device.c b/drivers/net/phy/phy_device.c index b1c5e4503bc4..83fc8e1b5793 100644 --- a/drivers/net/phy/phy_device.c +++ b/drivers/net/phy/phy_device.c @@ -1913,6 +1913,90 @@ int genphy_setup_forced(struct phy_device *phydev) } EXPORT_SYMBOL(genphy_setup_forced); +static int genphy_setup_master_slave(struct phy_device *phydev) +{ + u16 ctl = 0; + + if (!phydev->is_gigabit_capable) + return 0; + + switch (phydev->master_slave_set) { + case MASTER_SLAVE_CFG_MASTER_PREFERRED: + ctl |= CTL1000_PREFER_MASTER; + break; + case MASTER_SLAVE_CFG_SLAVE_PREFERRED: + break; + case MASTER_SLAVE_CFG_MASTER_FORCE: + ctl |= CTL1000_AS_MASTER; + /* fallthrough */ + case MASTER_SLAVE_CFG_SLAVE_FORCE: + ctl |= CTL1000_ENABLE_MASTER; + break; + case MASTER_SLAVE_CFG_UNKNOWN: + case MASTER_SLAVE_CFG_UNSUPPORTED: + return 0; + default: + phydev_warn(phydev, "Unsupported Master/Slave mode\n"); + return -EOPNOTSUPP; + } + + return phy_modify_changed(phydev, MII_CTRL1000, + (CTL1000_ENABLE_MASTER | CTL1000_AS_MASTER | + CTL1000_PREFER_MASTER), ctl); +} + +static int genphy_read_master_slave(struct phy_device *phydev) +{ + int cfg, state; + u16 val; + + if (!phydev->is_gigabit_capable) { + phydev->master_slave_get = MASTER_SLAVE_CFG_UNSUPPORTED; + phydev->master_slave_state = MASTER_SLAVE_STATE_UNSUPPORTED; + return 0; + } + + phydev->master_slave_get = MASTER_SLAVE_CFG_UNKNOWN; + phydev->master_slave_state = MASTER_SLAVE_STATE_UNKNOWN; + + val = phy_read(phydev, MII_CTRL1000); + if (val < 0) + return val; + + if (val & CTL1000_ENABLE_MASTER) { + if (val & CTL1000_AS_MASTER) + cfg = MASTER_SLAVE_CFG_MASTER_FORCE; + else + cfg = MASTER_SLAVE_CFG_SLAVE_FORCE; + } else { + if (val & CTL1000_PREFER_MASTER) + cfg = MASTER_SLAVE_CFG_MASTER_PREFERRED; + else + cfg = MASTER_SLAVE_CFG_SLAVE_PREFERRED; + } + + val = phy_read(phydev, MII_STAT1000); + if (val < 0) + return val; + + if (val & LPA_1000MSFAIL) { + state = MASTER_SLAVE_STATE_ERR; + } else if (phydev->link) { + /* this bits are valid only for active link */ + if (val & LPA_1000MSRES) + state = MASTER_SLAVE_STATE_MASTER; + else + state = MASTER_SLAVE_STATE_SLAVE; + } else { + state = MASTER_SLAVE_STATE_UNKNOWN; + } + + phydev->master_slave_get = cfg; + phydev->master_slave_state = state; + + return 0; +} + /** * genphy_restart_aneg - Enable and Restart Autonegotiation * @phydev: target phy_device struct @@ -1971,6 +2055,12 @@ int __genphy_config_aneg(struct phy_device *phydev, bool changed) if (genphy_config_eee_advert(phydev)) changed = true; + err = genphy_setup_master_slave(phydev); + if (err < 0) + return err; + else if (err) + changed = true; + if (AUTONEG_ENABLE != phydev->autoneg) 
return genphy_setup_forced(phydev); @@ -2205,6 +2295,10 @@ int genphy_read_status(struct phy_device *phydev) phydev->pause = 0; phydev->asym_pause = 0; + err = genphy_read_master_slave(phydev); + if (err < 0) + return err; + err = genphy_read_lpa(phydev); if (err < 0) return err; diff --git a/include/linux/phy.h b/include/linux/phy.h index 1d36ac608159..a2b91b5f9d0a 100644 --- a/include/linux/phy.h +++ b/include/linux/phy.h @@ -477,6 +477,9 @@ struct phy_device { int duplex; int pause; int asym_pause; + u8 master_slave_get; + u8 master_slave_set; + u8 master_slave_state; /* Union of PHY and Attached devices' supported link modes */ /* See ethtool.h for more info */ diff --git a/include/uapi/linux/ethtool.h b/include/uapi/linux/ethtool.h index 92f737f10117..f4662b3a9e1e 100644 --- a/include/uapi/linux/ethtool.h +++ b/include/uapi/linux/ethtool.h @@ -1666,6 +1666,18 @@ static inline int ethtool_validate_duplex(__u8 duplex) return 0; } +#define MASTER_SLAVE_CFG_UNSUPPORTED 0 +#define MASTER_SLAVE_CFG_UNKNOWN 1 +#define MASTER_SLAVE_CFG_MASTER_PREFERRED 2 +#define MASTER_SLAVE_CFG_SLAVE_PREFERRED 3 +#define MASTER_SLAVE_CFG_MASTER_FORCE 4 +#define MASTER_SLAVE_CFG_SLAVE_FORCE 5 +#define MASTER_SLAVE_STATE_UNSUPPORTED 0 +#define MASTER_SLAVE_STATE_UNKNOWN 1 +#define MASTER_SLAVE_STATE_MASTER 2 +#define MASTER_SLAVE_STATE_SLAVE 3 +#define MASTER_SLAVE_STATE_ERR 4 + /* Which connector port. */ #define PORT_TP 0x00 #define PORT_AUI 0x01 @@ -1904,7 +1916,9 @@ struct ethtool_link_settings { __u8 eth_tp_mdix_ctrl; __s8 link_mode_masks_nwords; __u8 transceiver; - __u8 reserved1[3]; + __u8 master_slave_cfg; + __u8 master_slave_state; + __u8 reserved1[1]; __u32 reserved[7]; __u32 link_mode_masks[0]; /* layout of link_mode_masks fields: diff --git a/include/uapi/linux/ethtool_netlink.h b/include/uapi/linux/ethtool_netlink.h index 7fde76366ba4..bf1d310e20bc 100644 --- a/include/uapi/linux/ethtool_netlink.h +++ b/include/uapi/linux/ethtool_netlink.h @@ -216,6 +216,8 @@ enum { ETHTOOL_A_LINKMODES_PEER, /* bitset */ ETHTOOL_A_LINKMODES_SPEED, /* u32 */ ETHTOOL_A_LINKMODES_DUPLEX, /* u8 */ + ETHTOOL_A_LINKMODES_MASTER_SLAVE_CFG, /* u8 */ + ETHTOOL_A_LINKMODES_MASTER_SLAVE_STATE, /* u8 */ /* add new constants above here */ __ETHTOOL_A_LINKMODES_CNT, diff --git a/include/uapi/linux/mii.h b/include/uapi/linux/mii.h index 90f9b4e1ba27..39f7c44baf53 100644 --- a/include/uapi/linux/mii.h +++ b/include/uapi/linux/mii.h @@ -151,11 +151,13 @@ /* 1000BASE-T Control register */ #define ADVERTISE_1000FULL 0x0200 /* Advertise 1000BASE-T full duplex */ #define ADVERTISE_1000HALF 0x0100 /* Advertise 1000BASE-T half duplex */ +#define CTL1000_PREFER_MASTER 0x0400 /* prefer to operate as master */ #define CTL1000_AS_MASTER 0x0800 #define CTL1000_ENABLE_MASTER 0x1000 /* 1000BASE-T Status register */ #define LPA_1000MSFAIL 0x8000 /* Master/Slave resolution failure */ +#define LPA_1000MSRES 0x4000 /* Master/Slave resolution status */ #define LPA_1000LOCALRXOK 0x2000 /* Link partner local receiver status */ #define LPA_1000REMRXOK 0x1000 /* Link partner remote receiver status */ #define LPA_1000FULL 0x0800 /* Link partner 1000BASE-T full duplex */ diff --git a/net/ethtool/ioctl.c b/net/ethtool/ioctl.c index 226d5ecdd567..52102ab1709b 100644 --- a/net/ethtool/ioctl.c +++ b/net/ethtool/ioctl.c @@ -552,6 +552,8 @@ static int ethtool_get_link_ksettings(struct net_device *dev, link_ksettings.base.cmd = ETHTOOL_GLINKSETTINGS; link_ksettings.base.link_mode_masks_nwords = __ETHTOOL_LINK_MODE_MASK_NU32; + 
link_ksettings.base.master_slave_cfg = MASTER_SLAVE_CFG_UNSUPPORTED; + link_ksettings.base.master_slave_state = MASTER_SLAVE_STATE_UNSUPPORTED; return store_link_ksettings_for_user(useraddr, &link_ksettings); } @@ -589,6 +591,10 @@ static int ethtool_set_link_ksettings(struct net_device *dev, != link_ksettings.base.link_mode_masks_nwords) return -EINVAL; + if (link_ksettings.base.master_slave_cfg || + link_ksettings.base.master_slave_state) + return -EINVAL; + err = dev->ethtool_ops->set_link_ksettings(dev, &link_ksettings); if (err >= 0) { ethtool_notify(dev, ETHTOOL_MSG_LINKINFO_NTF, NULL); diff --git a/net/ethtool/linkmodes.c b/net/ethtool/linkmodes.c index 452608c6d856..fd4f3e58c6f6 100644 --- a/net/ethtool/linkmodes.c +++ b/net/ethtool/linkmodes.c @@ -27,6 +27,8 @@ linkmodes_get_policy[ETHTOOL_A_LINKMODES_MAX + 1] = { [ETHTOOL_A_LINKMODES_PEER] = { .type = NLA_REJECT }, [ETHTOOL_A_LINKMODES_SPEED] = { .type = NLA_REJECT }, [ETHTOOL_A_LINKMODES_DUPLEX] = { .type = NLA_REJECT }, + [ETHTOOL_A_LINKMODES_MASTER_SLAVE_CFG] = { .type = NLA_REJECT }, + [ETHTOOL_A_LINKMODES_MASTER_SLAVE_STATE] = { .type = NLA_REJECT }, }; static int linkmodes_prepare_data(const struct ethnl_req_info *req_base, @@ -63,6 +65,7 @@ static int linkmodes_reply_size(const struct ethnl_req_info *req_base, { const struct linkmodes_reply_data *data = LINKMODES_REPDATA(reply_base); const struct ethtool_link_ksettings *ksettings = &data->ksettings; + const struct ethtool_link_settings *lsettings = &ksettings->base; bool compact = req_base->flags & ETHTOOL_FLAG_COMPACT_BITSETS; int len, ret; @@ -86,6 +89,12 @@ static int linkmodes_reply_size(const struct ethnl_req_info *req_base, len += ret; } + if (lsettings->master_slave_cfg != MASTER_SLAVE_CFG_UNSUPPORTED) + len += nla_total_size(sizeof(u8)); + + if (lsettings->master_slave_state != MASTER_SLAVE_STATE_UNSUPPORTED) + len += nla_total_size(sizeof(u8)); + return len; } @@ -122,6 +131,16 @@ static int linkmodes_fill_reply(struct sk_buff *skb, nla_put_u8(skb, ETHTOOL_A_LINKMODES_DUPLEX, lsettings->duplex)) return -EMSGSIZE; + if (lsettings->master_slave_cfg != MASTER_SLAVE_CFG_UNSUPPORTED && + nla_put_u8(skb, ETHTOOL_A_LINKMODES_MASTER_SLAVE_CFG, + lsettings->master_slave_cfg)) + return -EMSGSIZE; + + if (lsettings->master_slave_state != MASTER_SLAVE_STATE_UNSUPPORTED && + nla_put_u8(skb, ETHTOOL_A_LINKMODES_MASTER_SLAVE_STATE, + lsettings->master_slave_state)) + return -EMSGSIZE; + return 0; } @@ -249,6 +268,8 @@ linkmodes_set_policy[ETHTOOL_A_LINKMODES_MAX + 1] = { [ETHTOOL_A_LINKMODES_PEER] = { .type = NLA_REJECT }, [ETHTOOL_A_LINKMODES_SPEED] = { .type = NLA_U32 }, [ETHTOOL_A_LINKMODES_DUPLEX] = { .type = NLA_U8 }, + [ETHTOOL_A_LINKMODES_MASTER_SLAVE_CFG] = { .type = NLA_U8 }, + [ETHTOOL_A_LINKMODES_MASTER_SLAVE_STATE] = { .type = NLA_REJECT }, }; /* Set advertised link modes to all supported modes matching requested speed @@ -287,14 +308,45 @@ static bool ethnl_auto_linkmodes(struct ethtool_link_ksettings *ksettings, __ETHTOOL_LINK_MODE_MASK_NBITS); } +static bool ethnl_validate_master_slave_cfg(u8 cfg) +{ + switch (cfg) { + case MASTER_SLAVE_CFG_MASTER_PREFERRED: + case MASTER_SLAVE_CFG_SLAVE_PREFERRED: + case MASTER_SLAVE_CFG_MASTER_FORCE: + case MASTER_SLAVE_CFG_SLAVE_FORCE: + return true; + } + + return false; +} + static int ethnl_update_linkmodes(struct genl_info *info, struct nlattr **tb, struct ethtool_link_ksettings *ksettings, bool *mod) { struct ethtool_link_settings *lsettings = &ksettings->base; bool req_speed, req_duplex; + const struct nlattr 
*master_slave_cfg; int ret; + master_slave_cfg = tb[ETHTOOL_A_LINKMODES_MASTER_SLAVE_CFG]; + if (master_slave_cfg) { + u8 cfg = nla_get_u8(master_slave_cfg); + + if (lsettings->master_slave_cfg == MASTER_SLAVE_CFG_UNSUPPORTED) { + NL_SET_ERR_MSG_ATTR(info->extack, master_slave_cfg, + "master/slave configuration not supported by device"); + return -EOPNOTSUPP; + } + + if (!ethnl_validate_master_slave_cfg(cfg)) { + NL_SET_ERR_MSG_ATTR(info->extack, master_slave_cfg, + "master/slave value is invalid"); + return -EOPNOTSUPP; + } + } + *mod = false; req_speed = tb[ETHTOOL_A_LINKMODES_SPEED]; req_duplex = tb[ETHTOOL_A_LINKMODES_DUPLEX]; @@ -311,6 +363,7 @@ static int ethnl_update_linkmodes(struct genl_info *info, struct nlattr **tb, mod); ethnl_update_u8(&lsettings->duplex, tb[ETHTOOL_A_LINKMODES_DUPLEX], mod); + ethnl_update_u8(&lsettings->master_slave_cfg, master_slave_cfg, mod); if (!tb[ETHTOOL_A_LINKMODES_OURS] && lsettings->autoneg && (req_speed || req_duplex) && -- cgit v1.2.3 From b2f75a41eaa6bfc4aa6f6a1faefbf21c2c8d1588 Mon Sep 17 00:00:00 2001 From: Rob Herring Date: Thu, 9 Apr 2020 17:49:23 -0600 Subject: PCI: host-generic: Eliminate pci_host_common_probe wrappers Most ECAM host drivers are just different pci_ecam_ops which can be DT match table data. That's already the case for some of them, but let's do it for all the ECAM drivers. Then we can use of_device_get_match_data() in pci_host_common_probe(), eliminate the probe wrapper functions, and use pci_host_common_probe() directly for probe. Link: https://lore.kernel.org/r/20200409234923.21598-4-robh@kernel.org Signed-off-by: Rob Herring Signed-off-by: Lorenzo Pieralisi Acked-by: Bjorn Helgaas Cc: Zhou Wang Cc: Lorenzo Pieralisi Cc: Andrew Murray Cc: Bjorn Helgaas Cc: Will Deacon Cc: Robert Richter Cc: Marc Gonzalez Cc: Mans Rullgard Cc: linux-pci@vger.kernel.org Cc: linux-arm-kernel@lists.infradead.org --- drivers/pci/controller/dwc/pcie-hisi.c | 11 +---------- drivers/pci/controller/pci-host-common.c | 9 +++++++-- drivers/pci/controller/pci-host-generic.c | 15 +-------------- drivers/pci/controller/pci-thunder-ecam.c | 12 +++++------- drivers/pci/controller/pci-thunder-pem.c | 12 +++++------- drivers/pci/controller/pcie-tango.c | 7 +++++-- include/linux/pci-ecam.h | 3 +-- 7 files changed, 25 insertions(+), 44 deletions(-) (limited to 'include/linux') diff --git a/drivers/pci/controller/dwc/pcie-hisi.c b/drivers/pci/controller/dwc/pcie-hisi.c index 90017045334d..0ba50fb473b1 100644 --- a/drivers/pci/controller/dwc/pcie-hisi.c +++ b/drivers/pci/controller/dwc/pcie-hisi.c @@ -332,15 +332,6 @@ static struct platform_driver hisi_pcie_driver = { }; builtin_platform_driver(hisi_pcie_driver); -static int hisi_pcie_almost_ecam_probe(struct platform_device *pdev) -{ - struct device *dev = &pdev->dev; - struct pci_ecam_ops *ops; - - ops = (struct pci_ecam_ops *)of_device_get_match_data(dev); - return pci_host_common_probe(pdev, ops); -} - static int hisi_pcie_platform_init(struct pci_config_window *cfg) { struct device *dev = cfg->parent; @@ -385,7 +376,7 @@ static const struct of_device_id hisi_pcie_almost_ecam_of_match[] = { }; static struct platform_driver hisi_pcie_almost_ecam_driver = { - .probe = hisi_pcie_almost_ecam_probe, + .probe = pci_host_common_probe, .driver = { .name = "hisi-pcie-almost-ecam", .of_match_table = hisi_pcie_almost_ecam_of_match, diff --git a/drivers/pci/controller/pci-host-common.c b/drivers/pci/controller/pci-host-common.c index 6d15bc12e726..953de57f6c57 100644 --- a/drivers/pci/controller/pci-host-common.c +++
b/drivers/pci/controller/pci-host-common.c @@ -10,6 +10,7 @@ #include #include #include +#include #include #include #include @@ -55,15 +56,19 @@ err_out: return ERR_PTR(err); } -int pci_host_common_probe(struct platform_device *pdev, - const struct pci_ecam_ops *ops) +int pci_host_common_probe(struct platform_device *pdev) { struct device *dev = &pdev->dev; struct pci_host_bridge *bridge; struct pci_config_window *cfg; struct list_head resources; + const struct pci_ecam_ops *ops; int ret; + ops = of_device_get_match_data(&pdev->dev); + if (!ops) + return -ENODEV; + bridge = devm_pci_alloc_host_bridge(dev, 0); if (!bridge) return -ENOMEM; diff --git a/drivers/pci/controller/pci-host-generic.c b/drivers/pci/controller/pci-host-generic.c index fd8cff61de14..b51977abfdf1 100644 --- a/drivers/pci/controller/pci-host-generic.c +++ b/drivers/pci/controller/pci-host-generic.c @@ -11,8 +11,6 @@ #include #include #include -#include -#include #include #include @@ -79,23 +77,12 @@ static const struct of_device_id gen_pci_of_match[] = { }; MODULE_DEVICE_TABLE(of, gen_pci_of_match); -static int gen_pci_probe(struct platform_device *pdev) -{ - const struct of_device_id *of_id; - struct pci_ecam_ops *ops; - - of_id = of_match_node(gen_pci_of_match, pdev->dev.of_node); - ops = (struct pci_ecam_ops *)of_id->data; - - return pci_host_common_probe(pdev, ops); -} - static struct platform_driver gen_pci_driver = { .driver = { .name = "pci-host-generic", .of_match_table = gen_pci_of_match, }, - .probe = gen_pci_probe, + .probe = pci_host_common_probe, .remove = pci_host_common_remove, }; module_platform_driver(gen_pci_driver); diff --git a/drivers/pci/controller/pci-thunder-ecam.c b/drivers/pci/controller/pci-thunder-ecam.c index c3fdd3e6b21c..7e8835fee5f7 100644 --- a/drivers/pci/controller/pci-thunder-ecam.c +++ b/drivers/pci/controller/pci-thunder-ecam.c @@ -357,22 +357,20 @@ const struct pci_ecam_ops pci_thunder_ecam_ops = { #ifdef CONFIG_PCI_HOST_THUNDER_ECAM static const struct of_device_id thunder_ecam_of_match[] = { - { .compatible = "cavium,pci-host-thunder-ecam" }, + { + .compatible = "cavium,pci-host-thunder-ecam", + .data = &pci_thunder_ecam_ops, + }, { }, }; -static int thunder_ecam_probe(struct platform_device *pdev) -{ - return pci_host_common_probe(pdev, &pci_thunder_ecam_ops); -} - static struct platform_driver thunder_ecam_driver = { .driver = { .name = KBUILD_MODNAME, .of_match_table = thunder_ecam_of_match, .suppress_bind_attrs = true, }, - .probe = thunder_ecam_probe, + .probe = pci_host_common_probe, }; builtin_platform_driver(thunder_ecam_driver); diff --git a/drivers/pci/controller/pci-thunder-pem.c b/drivers/pci/controller/pci-thunder-pem.c index 2e792707ceab..3f847969143e 100644 --- a/drivers/pci/controller/pci-thunder-pem.c +++ b/drivers/pci/controller/pci-thunder-pem.c @@ -451,22 +451,20 @@ static const struct pci_ecam_ops pci_thunder_pem_ops = { }; static const struct of_device_id thunder_pem_of_match[] = { - { .compatible = "cavium,pci-host-thunder-pem" }, + { + .compatible = "cavium,pci-host-thunder-pem", + .data = &pci_thunder_pem_ops, + }, { }, }; -static int thunder_pem_probe(struct platform_device *pdev) -{ - return pci_host_common_probe(pdev, &pci_thunder_pem_ops); -} - static struct platform_driver thunder_pem_driver = { .driver = { .name = KBUILD_MODNAME, .of_match_table = thunder_pem_of_match, .suppress_bind_attrs = true, }, - .probe = thunder_pem_probe, + .probe = pci_host_common_probe, }; builtin_platform_driver(thunder_pem_driver); diff --git 
a/drivers/pci/controller/pcie-tango.c b/drivers/pci/controller/pcie-tango.c index 3b2b10906fdd..c13367c30fc6 100644 --- a/drivers/pci/controller/pcie-tango.c +++ b/drivers/pci/controller/pcie-tango.c @@ -295,11 +295,14 @@ static int tango_pcie_probe(struct platform_device *pdev) spin_lock_init(&pcie->used_msi_lock); irq_set_chained_handler_and_data(virq, tango_msi_isr, pcie); - return pci_host_common_probe(pdev, &smp8759_ecam_ops); + return pci_host_common_probe(pdev); } static const struct of_device_id tango_pcie_ids[] = { - { .compatible = "sigma,smp8759-pcie" }, + { + .compatible = "sigma,smp8759-pcie", + .data = &smp8759_ecam_ops, + }, { }, }; diff --git a/include/linux/pci-ecam.h b/include/linux/pci-ecam.h index fd0edb8b8a00..1af5cb02ef7f 100644 --- a/include/linux/pci-ecam.h +++ b/include/linux/pci-ecam.h @@ -61,8 +61,7 @@ extern const struct pci_ecam_ops al_pcie_ops; /* Amazon Annapurna Labs PCIe */ #if IS_ENABLED(CONFIG_PCI_HOST_COMMON) /* for DT-based PCI controllers that support ECAM */ -int pci_host_common_probe(struct platform_device *pdev, - const struct pci_ecam_ops *ops); +int pci_host_common_probe(struct platform_device *pdev); int pci_host_common_remove(struct platform_device *pdev); #endif #endif -- cgit v1.2.3 From 565558558985b1d7cd43b21f18c1ad6b232788d0 Mon Sep 17 00:00:00 2001 From: Qais Yousef Date: Thu, 30 Apr 2020 12:40:03 +0100 Subject: cpu/hotplug: Remove disable_nonboot_cpus() The single user could have called freeze_secondary_cpus() directly. Since this function was a source of confusion, remove it as it's just a pointless wrapper. While at it, rename enable_nonboot_cpus() to thaw_secondary_cpus() to preserve the naming symmetry. Done automatically via: git grep -l enable_nonboot_cpus | xargs sed -i 's/enable_nonboot_cpus/thaw_secondary_cpus/g' Signed-off-by: Qais Yousef Signed-off-by: Thomas Gleixner Cc: "Rafael J. 
Wysocki" Link: https://lkml.kernel.org/r/20200430114004.17477-1-qais.yousef@arm.com --- Documentation/power/suspend-and-cpuhotplug.rst | 6 +++--- arch/x86/kernel/smpboot.c | 4 ++-- arch/x86/power/cpu.c | 2 +- include/linux/cpu.h | 12 +++--------- include/linux/smp.h | 4 ++-- kernel/cpu.c | 14 +++++++------- tools/power/pm-graph/config/custom-timeline-functions.cfg | 2 +- tools/power/pm-graph/sleepgraph.py | 2 +- 8 files changed, 20 insertions(+), 26 deletions(-) (limited to 'include/linux') diff --git a/Documentation/power/suspend-and-cpuhotplug.rst b/Documentation/power/suspend-and-cpuhotplug.rst index 572d968c5375..ebedb6c75db9 100644 --- a/Documentation/power/suspend-and-cpuhotplug.rst +++ b/Documentation/power/suspend-and-cpuhotplug.rst @@ -48,7 +48,7 @@ More details follow:: | | v - disable_nonboot_cpus() + freeze_secondary_cpus() /* start */ | v @@ -83,7 +83,7 @@ More details follow:: Release cpu_add_remove_lock | v - /* disable_nonboot_cpus() complete */ + /* freeze_secondary_cpus() complete */ | v Do suspend @@ -93,7 +93,7 @@ More details follow:: Resuming back is likewise, with the counterparts being (in the order of execution during resume): -* enable_nonboot_cpus() which involves:: +* thaw_secondary_cpus() which involves:: | Acquire cpu_add_remove_lock | Decrease cpu_hotplug_disabled, thereby enabling regular cpu hotplug diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c index fe3ab9632f3b..997b66c18154 100644 --- a/arch/x86/kernel/smpboot.c +++ b/arch/x86/kernel/smpboot.c @@ -1376,12 +1376,12 @@ void __init native_smp_prepare_cpus(unsigned int max_cpus) speculative_store_bypass_ht_init(); } -void arch_enable_nonboot_cpus_begin(void) +void arch_thaw_secondary_cpus_begin(void) { set_mtrr_aps_delayed_init(); } -void arch_enable_nonboot_cpus_end(void) +void arch_thaw_secondary_cpus_end(void) { mtrr_aps_init(); } diff --git a/arch/x86/power/cpu.c b/arch/x86/power/cpu.c index aaff9ed7ff45..fc3b757afb2c 100644 --- a/arch/x86/power/cpu.c +++ b/arch/x86/power/cpu.c @@ -307,7 +307,7 @@ int hibernate_resume_nonboot_cpu_disable(void) if (ret) return ret; smp_ops.play_dead = resume_play_dead; - ret = disable_nonboot_cpus(); + ret = freeze_secondary_cpus(0); smp_ops.play_dead = play_dead; return ret; } diff --git a/include/linux/cpu.h b/include/linux/cpu.h index beaed2dc269e..9d34dc3b859f 100644 --- a/include/linux/cpu.h +++ b/include/linux/cpu.h @@ -150,12 +150,7 @@ static inline int freeze_secondary_cpus(int primary) return __freeze_secondary_cpus(primary, true); } -static inline int disable_nonboot_cpus(void) -{ - return __freeze_secondary_cpus(0, false); -} - -void enable_nonboot_cpus(void); +extern void thaw_secondary_cpus(void); static inline int suspend_disable_secondary_cpus(void) { @@ -168,12 +163,11 @@ static inline int suspend_disable_secondary_cpus(void) } static inline void suspend_enable_secondary_cpus(void) { - return enable_nonboot_cpus(); + return thaw_secondary_cpus(); } #else /* !CONFIG_PM_SLEEP_SMP */ -static inline int disable_nonboot_cpus(void) { return 0; } -static inline void enable_nonboot_cpus(void) {} +static inline void thaw_secondary_cpus(void) {} static inline int suspend_disable_secondary_cpus(void) { return 0; } static inline void suspend_enable_secondary_cpus(void) { } #endif /* !CONFIG_PM_SLEEP_SMP */ diff --git a/include/linux/smp.h b/include/linux/smp.h index cbc9162689d0..04019872c7bc 100644 --- a/include/linux/smp.h +++ b/include/linux/smp.h @@ -227,8 +227,8 @@ static inline int get_boot_cpu_id(void) */ extern void 
arch_disable_smp_support(void); -extern void arch_enable_nonboot_cpus_begin(void); -extern void arch_enable_nonboot_cpus_end(void); +extern void arch_thaw_secondary_cpus_begin(void); +extern void arch_thaw_secondary_cpus_end(void); void smp_setup_processor_id(void); diff --git a/kernel/cpu.c b/kernel/cpu.c index 6a02d4434cf7..d766929e0b7e 100644 --- a/kernel/cpu.c +++ b/kernel/cpu.c @@ -1376,8 +1376,8 @@ int __freeze_secondary_cpus(int primary, bool suspend) /* * Make sure the CPUs won't be enabled by someone else. We need to do - * this even in case of failure as all disable_nonboot_cpus() users are - * supposed to do enable_nonboot_cpus() on the failure path. + * this even in case of failure as all freeze_secondary_cpus() users are + * supposed to do thaw_secondary_cpus() on the failure path. */ cpu_hotplug_disabled++; @@ -1385,15 +1385,15 @@ int __freeze_secondary_cpus(int primary, bool suspend) return error; } -void __weak arch_enable_nonboot_cpus_begin(void) +void __weak arch_thaw_secondary_cpus_begin(void) { } -void __weak arch_enable_nonboot_cpus_end(void) +void __weak arch_thaw_secondary_cpus_end(void) { } -void enable_nonboot_cpus(void) +void thaw_secondary_cpus(void) { int cpu, error; @@ -1405,7 +1405,7 @@ void enable_nonboot_cpus(void) pr_info("Enabling non-boot CPUs ...\n"); - arch_enable_nonboot_cpus_begin(); + arch_thaw_secondary_cpus_begin(); for_each_cpu(cpu, frozen_cpus) { trace_suspend_resume(TPS("CPU_ON"), cpu, true); @@ -1418,7 +1418,7 @@ void enable_nonboot_cpus(void) pr_warn("Error taking CPU%d up: %d\n", cpu, error); } - arch_enable_nonboot_cpus_end(); + arch_thaw_secondary_cpus_end(); cpumask_clear(frozen_cpus); out: diff --git a/tools/power/pm-graph/config/custom-timeline-functions.cfg b/tools/power/pm-graph/config/custom-timeline-functions.cfg index 4f80ad7d7275..962e5768681c 100644 --- a/tools/power/pm-graph/config/custom-timeline-functions.cfg +++ b/tools/power/pm-graph/config/custom-timeline-functions.cfg @@ -125,7 +125,7 @@ acpi_suspend_begin: suspend_console: acpi_pm_prepare: syscore_suspend: -arch_enable_nonboot_cpus_end: +arch_thaw_secondary_cpus_end: syscore_resume: acpi_pm_finish: resume_console: diff --git a/tools/power/pm-graph/sleepgraph.py b/tools/power/pm-graph/sleepgraph.py index f7d1c1f62f86..530a9f681410 100755 --- a/tools/power/pm-graph/sleepgraph.py +++ b/tools/power/pm-graph/sleepgraph.py @@ -192,7 +192,7 @@ class SystemValues: 'suspend_console': {}, 'acpi_pm_prepare': {}, 'syscore_suspend': {}, - 'arch_enable_nonboot_cpus_end': {}, + 'arch_thaw_secondary_cpus_end': {}, 'syscore_resume': {}, 'acpi_pm_finish': {}, 'resume_console': {}, -- cgit v1.2.3 From fb7fb84a0c4e8021ddecb157802d58241a3f1a40 Mon Sep 17 00:00:00 2001 From: Qais Yousef Date: Thu, 30 Apr 2020 12:40:04 +0100 Subject: cpu/hotplug: Remove __freeze_secondary_cpus() The refactored function is no longer required as the codepaths that call freeze_secondary_cpus() are all suspend/resume related now. Signed-off-by: Qais Yousef Signed-off-by: Thomas Gleixner Cc: "Rafael J. 
Wysocki" Link: https://lkml.kernel.org/r/20200430114004.17477-2-qais.yousef@arm.com --- include/linux/cpu.h | 7 +------ kernel/cpu.c | 4 ++-- 2 files changed, 3 insertions(+), 8 deletions(-) (limited to 'include/linux') diff --git a/include/linux/cpu.h b/include/linux/cpu.h index 9d34dc3b859f..52692587f7fe 100644 --- a/include/linux/cpu.h +++ b/include/linux/cpu.h @@ -144,12 +144,7 @@ static inline void get_online_cpus(void) { cpus_read_lock(); } static inline void put_online_cpus(void) { cpus_read_unlock(); } #ifdef CONFIG_PM_SLEEP_SMP -int __freeze_secondary_cpus(int primary, bool suspend); -static inline int freeze_secondary_cpus(int primary) -{ - return __freeze_secondary_cpus(primary, true); -} - +extern int freeze_secondary_cpus(int primary); extern void thaw_secondary_cpus(void); static inline int suspend_disable_secondary_cpus(void) diff --git a/kernel/cpu.c b/kernel/cpu.c index d766929e0b7e..9f892144db6b 100644 --- a/kernel/cpu.c +++ b/kernel/cpu.c @@ -1327,7 +1327,7 @@ void bringup_nonboot_cpus(unsigned int setup_max_cpus) #ifdef CONFIG_PM_SLEEP_SMP static cpumask_var_t frozen_cpus; -int __freeze_secondary_cpus(int primary, bool suspend) +int freeze_secondary_cpus(int primary) { int cpu, error = 0; @@ -1352,7 +1352,7 @@ int __freeze_secondary_cpus(int primary, bool suspend) if (cpu == primary) continue; - if (suspend && pm_wakeup_pending()) { + if (pm_wakeup_pending()) { pr_info("Wakeup pending. Abort CPU freeze\n"); error = -EBUSY; break; -- cgit v1.2.3 From 3fec4aecb311995189217e64d725cfe84a568de3 Mon Sep 17 00:00:00 2001 From: Daniel Thompson Date: Wed, 6 May 2020 17:42:23 +0100 Subject: kgdb: Fix spurious true from in_dbg_master() Currently there is a small window where a badly timed migration could cause in_dbg_master() to spuriously return true. Specifically if we migrate to a new core after reading the processor id and the previous core takes a breakpoint then we will evaluate true if we read kgdb_active before we get the IPI to bring us to halt. Fix this by checking irqs_disabled() first. Interrupts are always disabled when we are executing the kgdb trap so this is an acceptable prerequisite. This also allows us to replace raw_smp_processor_id() with smp_processor_id() since the short circuit logic will prevent warnings from PREEMPT_DEBUG. 
Fixes: dcc7871128e9 ("kgdb: core changes to support kdb") Suggested-by: Will Deacon Link: https://lore.kernel.org/r/20200506164223.2875760-1-daniel.thompson@linaro.org Reviewed-by: Douglas Anderson Signed-off-by: Daniel Thompson --- include/linux/kgdb.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/kgdb.h b/include/linux/kgdb.h index b072aeb1fd78..4d6fe87fd38f 100644 --- a/include/linux/kgdb.h +++ b/include/linux/kgdb.h @@ -323,7 +323,7 @@ extern void gdbstub_exit(int status); extern int kgdb_single_step; extern atomic_t kgdb_active; #define in_dbg_master() \ - (raw_smp_processor_id() == atomic_read(&kgdb_active)) + (irqs_disabled() && (smp_processor_id() == atomic_read(&kgdb_active))) extern bool dbg_is_early; extern void __init dbg_late_init(void); extern void kgdb_panic(const char *msg); -- cgit v1.2.3 From 1137a96f9b5a8615296bd319151896c859ead292 Mon Sep 17 00:00:00 2001 From: Jason Yan Date: Thu, 7 May 2020 19:06:49 +0800 Subject: kgdb: Return true in kgdb_nmi_poll_knock() Fix the following coccicheck warning: include/linux/kgdb.h:301:54-55: WARNING: return of 0/1 in function 'kgdb_nmi_poll_knock' with return type bool Signed-off-by: Jason Yan Link: https://lore.kernel.org/r/20200507110649.37426-1-yanaijie@huawei.com Signed-off-by: Daniel Thompson --- include/linux/kgdb.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/kgdb.h b/include/linux/kgdb.h index 4d6fe87fd38f..c2caee08e418 100644 --- a/include/linux/kgdb.h +++ b/include/linux/kgdb.h @@ -298,7 +298,7 @@ extern bool kgdb_nmi_poll_knock(void); #else static inline int kgdb_register_nmi_console(void) { return 0; } static inline int kgdb_unregister_nmi_console(void) { return 0; } -static inline bool kgdb_nmi_poll_knock(void) { return 1; } +static inline bool kgdb_nmi_poll_knock(void) { return true; } #endif extern int kgdb_register_io_module(struct kgdb_io *local_kgdb_io_ops); -- cgit v1.2.3 From 5d1f2e3c8090c0769ee4a1b920e82277613327fc Mon Sep 17 00:00:00 2001 From: Bjorn Andersson Date: Wed, 22 Apr 2020 17:37:34 -0700 Subject: soc: qcom: glink_ssr: Internalize ssr_notifiers Rather than carrying a special-purpose blocking notifier for glink_ssr in remoteproc's qcom_common.c, move it into glink_ssr to allow wider reuse of the common one. The rpmsg glink header file is used in preparation for the next patch.
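For illustration, a sketch of the caller side of the new entry point (the function name here is hypothetical; the concrete hook added below is glink_subdev_unprepare() in qcom_common.c):

        /*
         * Sketch: once the remoteproc has been stopped, report the name of
         * the edge that went down; glink_ssr fans it out over its internal
         * notifier chain and sends the do_cleanup message on each
         * registered edge.
         */
        static void example_glink_unprepare(struct rproc_subdev *subdev)
        {
                struct qcom_rproc_glink *glink = to_glink_subdev(subdev);

                qcom_glink_ssr_notify(glink->ssr_name);
        }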
Acked-by: Chris Lew Acked-by: Rishabh Bhatnagar Link: https://lore.kernel.org/r/20200423003736.2027371-3-bjorn.andersson@linaro.org Signed-off-by: Bjorn Andersson --- drivers/remoteproc/qcom_common.c | 8 ++++++++ drivers/soc/qcom/glink_ssr.c | 24 +++++++++++++++++++----- include/linux/rpmsg/qcom_glink.h | 6 ++++++ 3 files changed, 33 insertions(+), 5 deletions(-) (limited to 'include/linux') diff --git a/drivers/remoteproc/qcom_common.c b/drivers/remoteproc/qcom_common.c index ff26f2b68752..9028cea2d81e 100644 --- a/drivers/remoteproc/qcom_common.c +++ b/drivers/remoteproc/qcom_common.c @@ -42,6 +42,13 @@ static void glink_subdev_stop(struct rproc_subdev *subdev, bool crashed) glink->edge = NULL; } +static void glink_subdev_unprepare(struct rproc_subdev *subdev) +{ + struct qcom_rproc_glink *glink = to_glink_subdev(subdev); + + qcom_glink_ssr_notify(glink->ssr_name); +} + /** * qcom_add_glink_subdev() - try to add a GLINK subdevice to rproc * @rproc: rproc handle to parent the subdevice @@ -64,6 +71,7 @@ void qcom_add_glink_subdev(struct rproc *rproc, struct qcom_rproc_glink *glink, glink->dev = dev; glink->subdev.start = glink_subdev_start; glink->subdev.stop = glink_subdev_stop; + glink->subdev.unprepare = glink_subdev_unprepare; rproc_add_subdev(rproc, &glink->subdev); } diff --git a/drivers/soc/qcom/glink_ssr.c b/drivers/soc/qcom/glink_ssr.c index d7babe3d67bc..847d79c935f1 100644 --- a/drivers/soc/qcom/glink_ssr.c +++ b/drivers/soc/qcom/glink_ssr.c @@ -54,6 +54,19 @@ struct glink_ssr { struct completion completion; }; +/* Notifier list for all registered glink_ssr instances */ +static BLOCKING_NOTIFIER_HEAD(ssr_notifiers); + +/** + * qcom_glink_ssr_notify() - notify GLINK SSR about stopped remoteproc + * @ssr_name: name of the remoteproc that has been stopped + */ +void qcom_glink_ssr_notify(const char *ssr_name) +{ + blocking_notifier_call_chain(&ssr_notifiers, 0, (void *)ssr_name); +} +EXPORT_SYMBOL_GPL(qcom_glink_ssr_notify); + static int qcom_glink_ssr_callback(struct rpmsg_device *rpdev, void *data, int len, void *priv, u32 addr) { @@ -81,8 +94,9 @@ static int qcom_glink_ssr_callback(struct rpmsg_device *rpdev, return 0; } -static int qcom_glink_ssr_notify(struct notifier_block *nb, unsigned long event, - void *data) +static int qcom_glink_ssr_notifier_call(struct notifier_block *nb, + unsigned long event, + void *data) { struct glink_ssr *ssr = container_of(nb, struct glink_ssr, nb); struct do_cleanup_msg msg; @@ -121,18 +135,18 @@ static int qcom_glink_ssr_probe(struct rpmsg_device *rpdev) ssr->dev = &rpdev->dev; ssr->ept = rpdev->ept; - ssr->nb.notifier_call = qcom_glink_ssr_notify; + ssr->nb.notifier_call = qcom_glink_ssr_notifier_call; dev_set_drvdata(&rpdev->dev, ssr); - return qcom_register_ssr_notifier(&ssr->nb); + return blocking_notifier_chain_register(&ssr_notifiers, &ssr->nb); } static void qcom_glink_ssr_remove(struct rpmsg_device *rpdev) { struct glink_ssr *ssr = dev_get_drvdata(&rpdev->dev); - qcom_unregister_ssr_notifier(&ssr->nb); + blocking_notifier_chain_unregister(&ssr_notifiers, &ssr->nb); } static const struct rpmsg_device_id qcom_glink_ssr_match[] = { diff --git a/include/linux/rpmsg/qcom_glink.h b/include/linux/rpmsg/qcom_glink.h index 96e26d94719f..09daa0acde2c 100644 --- a/include/linux/rpmsg/qcom_glink.h +++ b/include/linux/rpmsg/qcom_glink.h @@ -26,4 +26,10 @@ static inline void qcom_glink_smem_unregister(struct qcom_glink *glink) {} #endif +#if IS_ENABLED(CONFIG_RPMSG_QCOM_GLINK_SSR) +void qcom_glink_ssr_notify(const char *ssr_name); +#else +static 
inline void qcom_glink_ssr_notify(const char *ssr_name) {} +#endif + #endif -- cgit v1.2.3 From 93bc3feee8bd5fbe29ad27779c5e7b369fd7c80b Mon Sep 17 00:00:00 2001 From: Bjorn Andersson Date: Wed, 22 Apr 2020 17:37:35 -0700 Subject: rpmsg: glink: Integrate glink_ssr in qcom_glink In all but the very special case of a system with _only_ glink_rpm, GLINK is dependent on glink_ssr, so move it to rpmsg and combine it with qcom_glink_native in the new qcom_glink kernel module. Acked-by: Chris Lew Acked-by: Rishabh Bhatnagar Link: https://lore.kernel.org/r/20200423003736.2027371-4-bjorn.andersson@linaro.org Signed-off-by: Bjorn Andersson --- drivers/rpmsg/Kconfig | 6 +- drivers/rpmsg/Makefile | 3 +- drivers/rpmsg/qcom_glink_ssr.c | 166 ++++++++++++++++++++++++++++++++++++++ drivers/soc/qcom/Kconfig | 9 --- drivers/soc/qcom/Makefile | 1 - drivers/soc/qcom/glink_ssr.c | 170 --------------------------------------- include/linux/rpmsg/qcom_glink.h | 7 +- 7 files changed, 172 insertions(+), 190 deletions(-) create mode 100644 drivers/rpmsg/qcom_glink_ssr.c delete mode 100644 drivers/soc/qcom/glink_ssr.c (limited to 'include/linux') diff --git a/drivers/rpmsg/Kconfig b/drivers/rpmsg/Kconfig index a9108ff563dc..f96716893c2a 100644 --- a/drivers/rpmsg/Kconfig +++ b/drivers/rpmsg/Kconfig @@ -24,13 +24,13 @@ config RPMSG_MTK_SCP remote processors in MediaTek platforms. This use IPI and IPC to communicate with remote processors. -config RPMSG_QCOM_GLINK_NATIVE +config RPMSG_QCOM_GLINK tristate select RPMSG config RPMSG_QCOM_GLINK_RPM tristate "Qualcomm RPM Glink driver" - select RPMSG_QCOM_GLINK_NATIVE + select RPMSG_QCOM_GLINK depends on HAS_IOMEM depends on MAILBOX help @@ -40,7 +40,7 @@ config RPMSG_QCOM_GLINK_RPM config RPMSG_QCOM_GLINK_SMEM tristate "Qualcomm SMEM Glink driver" - select RPMSG_QCOM_GLINK_NATIVE + select RPMSG_QCOM_GLINK depends on MAILBOX depends on QCOM_SMEM help diff --git a/drivers/rpmsg/Makefile b/drivers/rpmsg/Makefile index ae92a7fb08f6..ffe932ef6050 100644 --- a/drivers/rpmsg/Makefile +++ b/drivers/rpmsg/Makefile @@ -2,8 +2,9 @@ obj-$(CONFIG_RPMSG) += rpmsg_core.o obj-$(CONFIG_RPMSG_CHAR) += rpmsg_char.o obj-$(CONFIG_RPMSG_MTK_SCP) += mtk_rpmsg.o +qcom_glink-objs := qcom_glink_native.o qcom_glink_ssr.o +obj-$(CONFIG_RPMSG_QCOM_GLINK) += qcom_glink.o obj-$(CONFIG_RPMSG_QCOM_GLINK_RPM) += qcom_glink_rpm.o -obj-$(CONFIG_RPMSG_QCOM_GLINK_NATIVE) += qcom_glink_native.o obj-$(CONFIG_RPMSG_QCOM_GLINK_SMEM) += qcom_glink_smem.o obj-$(CONFIG_RPMSG_QCOM_SMD) += qcom_smd.o obj-$(CONFIG_RPMSG_VIRTIO) += virtio_rpmsg_bus.o diff --git a/drivers/rpmsg/qcom_glink_ssr.c b/drivers/rpmsg/qcom_glink_ssr.c new file mode 100644 index 000000000000..dcd1ce616974 --- /dev/null +++ b/drivers/rpmsg/qcom_glink_ssr.c @@ -0,0 +1,166 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Copyright (c) 2014-2017, The Linux Foundation. All rights reserved. + * Copyright (c) 2017, Linaro Ltd. 
+ */ + +#include +#include +#include +#include +#include + +/** + * struct do_cleanup_msg - The data structure for an SSR do_cleanup message + * version: The G-Link SSR protocol version + * command: The G-Link SSR command - do_cleanup + * seq_num: Sequence number + * name_len: Length of the name of the subsystem being restarted + * name: G-Link edge name of the subsystem being restarted + */ +struct do_cleanup_msg { + __le32 version; + __le32 command; + __le32 seq_num; + __le32 name_len; + char name[32]; +}; + +/** + * struct cleanup_done_msg - The data structure for an SSR cleanup_done message + * version: The G-Link SSR protocol version + * response: The G-Link SSR response to a do_cleanup command, cleanup_done + * seq_num: Sequence number + */ +struct cleanup_done_msg { + __le32 version; + __le32 response; + __le32 seq_num; +}; + +/** + * G-Link SSR protocol commands + */ +#define GLINK_SSR_DO_CLEANUP 0 +#define GLINK_SSR_CLEANUP_DONE 1 + +struct glink_ssr { + struct device *dev; + struct rpmsg_endpoint *ept; + + struct notifier_block nb; + + u32 seq_num; + struct completion completion; +}; + +/* Notifier list for all registered glink_ssr instances */ +static BLOCKING_NOTIFIER_HEAD(ssr_notifiers); + +/** + * qcom_glink_ssr_notify() - notify GLINK SSR about stopped remoteproc + * @ssr_name: name of the remoteproc that has been stopped + */ +void qcom_glink_ssr_notify(const char *ssr_name) +{ + blocking_notifier_call_chain(&ssr_notifiers, 0, (void *)ssr_name); +} +EXPORT_SYMBOL_GPL(qcom_glink_ssr_notify); + +static int qcom_glink_ssr_callback(struct rpmsg_device *rpdev, + void *data, int len, void *priv, u32 addr) +{ + struct cleanup_done_msg *msg = data; + struct glink_ssr *ssr = dev_get_drvdata(&rpdev->dev); + + if (len < sizeof(*msg)) { + dev_err(ssr->dev, "message too short\n"); + return -EINVAL; + } + + if (le32_to_cpu(msg->version) != 0) + return -EINVAL; + + if (le32_to_cpu(msg->response) != GLINK_SSR_CLEANUP_DONE) + return 0; + + if (le32_to_cpu(msg->seq_num) != ssr->seq_num) { + dev_err(ssr->dev, "invalid sequence number of response\n"); + return -EINVAL; + } + + complete(&ssr->completion); + + return 0; +} + +static int qcom_glink_ssr_notifier_call(struct notifier_block *nb, + unsigned long event, + void *data) +{ + struct glink_ssr *ssr = container_of(nb, struct glink_ssr, nb); + struct do_cleanup_msg msg; + char *ssr_name = data; + int ret; + + ssr->seq_num++; + reinit_completion(&ssr->completion); + + memset(&msg, 0, sizeof(msg)); + msg.command = cpu_to_le32(GLINK_SSR_DO_CLEANUP); + msg.seq_num = cpu_to_le32(ssr->seq_num); + msg.name_len = cpu_to_le32(strlen(ssr_name)); + strlcpy(msg.name, ssr_name, sizeof(msg.name)); + + ret = rpmsg_send(ssr->ept, &msg, sizeof(msg)); + if (ret < 0) + dev_err(ssr->dev, "failed to send cleanup message\n"); + + ret = wait_for_completion_timeout(&ssr->completion, HZ); + if (!ret) + dev_err(ssr->dev, "timeout waiting for cleanup done message\n"); + + return NOTIFY_DONE; +} + +static int qcom_glink_ssr_probe(struct rpmsg_device *rpdev) +{ + struct glink_ssr *ssr; + + ssr = devm_kzalloc(&rpdev->dev, sizeof(*ssr), GFP_KERNEL); + if (!ssr) + return -ENOMEM; + + init_completion(&ssr->completion); + + ssr->dev = &rpdev->dev; + ssr->ept = rpdev->ept; + ssr->nb.notifier_call = qcom_glink_ssr_notifier_call; + + dev_set_drvdata(&rpdev->dev, ssr); + + return blocking_notifier_chain_register(&ssr_notifiers, &ssr->nb); +} + +static void qcom_glink_ssr_remove(struct rpmsg_device *rpdev) +{ + struct glink_ssr *ssr = dev_get_drvdata(&rpdev->dev); + + 
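/* Drop this channel from the shared SSR notifier list so it no longer sends do_cleanup messages */ +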
blocking_notifier_chain_unregister(&ssr_notifiers, &ssr->nb); +} + +static const struct rpmsg_device_id qcom_glink_ssr_match[] = { + { "glink_ssr" }, + {} +}; + +static struct rpmsg_driver qcom_glink_ssr_driver = { + .probe = qcom_glink_ssr_probe, + .remove = qcom_glink_ssr_remove, + .callback = qcom_glink_ssr_callback, + .id_table = qcom_glink_ssr_match, + .drv = { + .name = "qcom_glink_ssr", + }, +}; +module_rpmsg_driver(qcom_glink_ssr_driver); diff --git a/drivers/soc/qcom/Kconfig b/drivers/soc/qcom/Kconfig index bf42a17a45de..811c91289cbf 100644 --- a/drivers/soc/qcom/Kconfig +++ b/drivers/soc/qcom/Kconfig @@ -35,15 +35,6 @@ config QCOM_GENI_SE driver is also used to manage the common aspects of multiple Serial Engines present in the QUP. -config QCOM_GLINK_SSR - tristate "Qualcomm Glink SSR driver" - depends on RPMSG - depends on QCOM_RPROC_COMMON - help - Say y here to enable GLINK SSR support. The GLINK SSR driver - implements the SSR protocol for notifying the remote processor about - neighboring subsystems going up or down. - config QCOM_GSBI tristate "QCOM General Serial Bus Interface" depends on ARCH_QCOM || COMPILE_TEST diff --git a/drivers/soc/qcom/Makefile b/drivers/soc/qcom/Makefile index 5d6b83dc58e8..e9cacc9ad401 100644 --- a/drivers/soc/qcom/Makefile +++ b/drivers/soc/qcom/Makefile @@ -3,7 +3,6 @@ CFLAGS_rpmh-rsc.o := -I$(src) obj-$(CONFIG_QCOM_AOSS_QMP) += qcom_aoss.o obj-$(CONFIG_QCOM_GENI_SE) += qcom-geni-se.o obj-$(CONFIG_QCOM_COMMAND_DB) += cmd-db.o -obj-$(CONFIG_QCOM_GLINK_SSR) += glink_ssr.o obj-$(CONFIG_QCOM_GSBI) += qcom_gsbi.o obj-$(CONFIG_QCOM_MDT_LOADER) += mdt_loader.o obj-$(CONFIG_QCOM_OCMEM) += ocmem.o diff --git a/drivers/soc/qcom/glink_ssr.c b/drivers/soc/qcom/glink_ssr.c deleted file mode 100644 index 847d79c935f1..000000000000 --- a/drivers/soc/qcom/glink_ssr.c +++ /dev/null @@ -1,170 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-only -/* - * Copyright (c) 2014-2017, The Linux Foundation. All rights reserved. - * Copyright (c) 2017, Linaro Ltd. 
- */ - -#include -#include -#include -#include -#include - -/** - * struct do_cleanup_msg - The data structure for an SSR do_cleanup message - * version: The G-Link SSR protocol version - * command: The G-Link SSR command - do_cleanup - * seq_num: Sequence number - * name_len: Length of the name of the subsystem being restarted - * name: G-Link edge name of the subsystem being restarted - */ -struct do_cleanup_msg { - __le32 version; - __le32 command; - __le32 seq_num; - __le32 name_len; - char name[32]; -}; - -/** - * struct cleanup_done_msg - The data structure for an SSR cleanup_done message - * version: The G-Link SSR protocol version - * response: The G-Link SSR response to a do_cleanup command, cleanup_done - * seq_num: Sequence number - */ -struct cleanup_done_msg { - __le32 version; - __le32 response; - __le32 seq_num; -}; - -/** - * G-Link SSR protocol commands - */ -#define GLINK_SSR_DO_CLEANUP 0 -#define GLINK_SSR_CLEANUP_DONE 1 - -struct glink_ssr { - struct device *dev; - struct rpmsg_endpoint *ept; - - struct notifier_block nb; - - u32 seq_num; - struct completion completion; -}; - -/* Notifier list for all registered glink_ssr instances */ -static BLOCKING_NOTIFIER_HEAD(ssr_notifiers); - -/** - * qcom_glink_ssr_notify() - notify GLINK SSR about stopped remoteproc - * @ssr_name: name of the remoteproc that has been stopped - */ -void qcom_glink_ssr_notify(const char *ssr_name) -{ - blocking_notifier_call_chain(&ssr_notifiers, 0, (void *)ssr_name); -} -EXPORT_SYMBOL_GPL(qcom_glink_ssr_notify); - -static int qcom_glink_ssr_callback(struct rpmsg_device *rpdev, - void *data, int len, void *priv, u32 addr) -{ - struct cleanup_done_msg *msg = data; - struct glink_ssr *ssr = dev_get_drvdata(&rpdev->dev); - - if (len < sizeof(*msg)) { - dev_err(ssr->dev, "message too short\n"); - return -EINVAL; - } - - if (le32_to_cpu(msg->version) != 0) - return -EINVAL; - - if (le32_to_cpu(msg->response) != GLINK_SSR_CLEANUP_DONE) - return 0; - - if (le32_to_cpu(msg->seq_num) != ssr->seq_num) { - dev_err(ssr->dev, "invalid sequence number of response\n"); - return -EINVAL; - } - - complete(&ssr->completion); - - return 0; -} - -static int qcom_glink_ssr_notifier_call(struct notifier_block *nb, - unsigned long event, - void *data) -{ - struct glink_ssr *ssr = container_of(nb, struct glink_ssr, nb); - struct do_cleanup_msg msg; - char *ssr_name = data; - int ret; - - ssr->seq_num++; - reinit_completion(&ssr->completion); - - memset(&msg, 0, sizeof(msg)); - msg.command = cpu_to_le32(GLINK_SSR_DO_CLEANUP); - msg.seq_num = cpu_to_le32(ssr->seq_num); - msg.name_len = cpu_to_le32(strlen(ssr_name)); - strlcpy(msg.name, ssr_name, sizeof(msg.name)); - - ret = rpmsg_send(ssr->ept, &msg, sizeof(msg)); - if (ret < 0) - dev_err(ssr->dev, "failed to send cleanup message\n"); - - ret = wait_for_completion_timeout(&ssr->completion, HZ); - if (!ret) - dev_err(ssr->dev, "timeout waiting for cleanup done message\n"); - - return NOTIFY_DONE; -} - -static int qcom_glink_ssr_probe(struct rpmsg_device *rpdev) -{ - struct glink_ssr *ssr; - - ssr = devm_kzalloc(&rpdev->dev, sizeof(*ssr), GFP_KERNEL); - if (!ssr) - return -ENOMEM; - - init_completion(&ssr->completion); - - ssr->dev = &rpdev->dev; - ssr->ept = rpdev->ept; - ssr->nb.notifier_call = qcom_glink_ssr_notifier_call; - - dev_set_drvdata(&rpdev->dev, ssr); - - return blocking_notifier_chain_register(&ssr_notifiers, &ssr->nb); -} - -static void qcom_glink_ssr_remove(struct rpmsg_device *rpdev) -{ - struct glink_ssr *ssr = dev_get_drvdata(&rpdev->dev); - - 
blocking_notifier_chain_unregister(&ssr_notifiers, &ssr->nb); -} - -static const struct rpmsg_device_id qcom_glink_ssr_match[] = { - { "glink_ssr" }, - {} -}; - -static struct rpmsg_driver qcom_glink_ssr_driver = { - .probe = qcom_glink_ssr_probe, - .remove = qcom_glink_ssr_remove, - .callback = qcom_glink_ssr_callback, - .id_table = qcom_glink_ssr_match, - .drv = { - .name = "qcom_glink_ssr", - }, -}; -module_rpmsg_driver(qcom_glink_ssr_driver); - -MODULE_ALIAS("rpmsg:glink_ssr"); -MODULE_DESCRIPTION("Qualcomm GLINK SSR notifier"); -MODULE_LICENSE("GPL v2"); diff --git a/include/linux/rpmsg/qcom_glink.h b/include/linux/rpmsg/qcom_glink.h index 09daa0acde2c..daded9fddf36 100644 --- a/include/linux/rpmsg/qcom_glink.h +++ b/include/linux/rpmsg/qcom_glink.h @@ -12,6 +12,7 @@ struct qcom_glink; struct qcom_glink *qcom_glink_smem_register(struct device *parent, struct device_node *node); void qcom_glink_smem_unregister(struct qcom_glink *glink); +void qcom_glink_ssr_notify(const char *ssr_name); #else @@ -23,12 +24,6 @@ qcom_glink_smem_register(struct device *parent, } static inline void qcom_glink_smem_unregister(struct qcom_glink *glink) {} - -#endif - -#if IS_ENABLED(CONFIG_RPMSG_QCOM_GLINK_SSR) -void qcom_glink_ssr_notify(const char *ssr_name); -#else static inline void qcom_glink_ssr_notify(const char *ssr_name) {} #endif -- cgit v1.2.3 From 89826cce37542f7950e8f4b9258284805e98430c Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Thu, 2 Apr 2020 18:04:54 -0500 Subject: exec: Make unlocking exec_update_mutex explict With install_exec_creds updated to follow immediately after setup_new_exec, the failure of unshare_sighand is the only code path where exec_update_mutex is held but not explicitly unlocked. Update that code path to explicitly unlock exec_update_mutex. Remove the unlocking of exec_update_mutex from free_bprm. Reviewed-by: Kees Cook Reviewed-by: Greg Ungerer Signed-off-by: "Eric W. Biederman" --- fs/exec.c | 6 +++--- include/linux/binfmts.h | 3 +-- 2 files changed, 4 insertions(+), 5 deletions(-) (limited to 'include/linux') diff --git a/fs/exec.c b/fs/exec.c index 06b4c550af5d..6bd82a007bfc 100644 --- a/fs/exec.c +++ b/fs/exec.c @@ -1344,7 +1344,7 @@ int flush_old_exec(struct linux_binprm * bprm) */ retval = unshare_sighand(me); if (retval) - goto out; + goto out_unlock; set_fs(USER_DS); me->flags &= ~(PF_RANDOMIZE | PF_FORKNOEXEC | PF_KTHREAD | @@ -1361,6 +1361,8 @@ int flush_old_exec(struct linux_binprm * bprm) do_close_on_exec(me->files); return 0; +out_unlock: + mutex_unlock(&me->signal->exec_update_mutex); out: return retval; } @@ -1477,8 +1479,6 @@ static void free_bprm(struct linux_binprm *bprm) { free_arg_pages(bprm); if (bprm->cred) { - if (bprm->called_exec_mmap) - mutex_unlock(¤t->signal->exec_update_mutex); mutex_unlock(¤t->signal->cred_guard_mutex); abort_creds(bprm->cred); } diff --git a/include/linux/binfmts.h b/include/linux/binfmts.h index a345d9fed3d8..6f564b9ad882 100644 --- a/include/linux/binfmts.h +++ b/include/linux/binfmts.h @@ -47,8 +47,7 @@ struct linux_binprm { secureexec:1, /* * Set by flush_old_exec, when exec_mmap has been called. - * This is past the point of no return, when the - * exec_update_mutex has been taken. + * This is past the point of no return. */ called_exec_mmap:1; #ifdef __alpha__ -- cgit v1.2.3 From 1507b7a30ad284a2a136ee79c214c0e86c62da64 Mon Sep 17 00:00:00 2001 From: "Eric W. 
Biederman" Date: Thu, 2 Apr 2020 18:17:50 -0500 Subject: exec: Rename the flag called_exec_mmap point_of_no_return Update the comments and make the code easier to understand by renaming this flag. Reviewed-by: Kees Cook Reviewed-by: Greg Ungerer Signed-off-by: "Eric W. Biederman" --- fs/exec.c | 12 ++++++------ include/linux/binfmts.h | 6 +++--- 2 files changed, 9 insertions(+), 9 deletions(-) (limited to 'include/linux') diff --git a/fs/exec.c b/fs/exec.c index 6bd82a007bfc..71de9f57ae09 100644 --- a/fs/exec.c +++ b/fs/exec.c @@ -1326,12 +1326,12 @@ int flush_old_exec(struct linux_binprm * bprm) goto out; /* - * After setting bprm->called_exec_mmap (to mark that current is - * using the prepared mm now), we have nothing left of the original - * process. If anything from here on returns an error, the check - * in search_binary_handler() will SEGV current. + * With the new mm installed it is completely impossible to + * fail and return to the original process. If anything from + * here on returns an error, the check in + * search_binary_handler() will SEGV current. */ - bprm->called_exec_mmap = 1; + bprm->point_of_no_return = true; bprm->mm = NULL; #ifdef CONFIG_POSIX_TIMERS @@ -1720,7 +1720,7 @@ int search_binary_handler(struct linux_binprm *bprm) read_lock(&binfmt_lock); put_binfmt(fmt); - if (retval < 0 && bprm->called_exec_mmap) { + if (retval < 0 && bprm->point_of_no_return) { /* we got to flush_old_exec() and failed after it */ read_unlock(&binfmt_lock); force_sigsegv(SIGSEGV); diff --git a/include/linux/binfmts.h b/include/linux/binfmts.h index 6f564b9ad882..8f479dad7931 100644 --- a/include/linux/binfmts.h +++ b/include/linux/binfmts.h @@ -46,10 +46,10 @@ struct linux_binprm { */ secureexec:1, /* - * Set by flush_old_exec, when exec_mmap has been called. - * This is past the point of no return. + * Set when errors can no longer be returned to the + * original userspace. */ - called_exec_mmap:1; + point_of_no_return:1; #ifdef __alpha__ unsigned int taso:1; #endif -- cgit v1.2.3 From 96ecee29b0b560662ec082ee9b6f2049f2a79090 Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Sun, 3 May 2020 06:48:17 -0500 Subject: exec: Merge install_exec_creds into setup_new_exec The two functions are now always called one right after the other so merge them together to make future maintenance easier. Reviewed-by: Kees Cook Reviewed-by: Greg Ungerer Signed-off-by: "Eric W. 
Biederman" --- arch/x86/ia32/ia32_aout.c | 1 - fs/binfmt_aout.c | 1 - fs/binfmt_elf.c | 1 - fs/binfmt_elf_fdpic.c | 1 - fs/binfmt_flat.c | 1 - fs/exec.c | 56 ++++++++++++++++++++++------------------------- include/linux/binfmts.h | 1 - kernel/events/core.c | 2 +- 8 files changed, 27 insertions(+), 37 deletions(-) (limited to 'include/linux') diff --git a/arch/x86/ia32/ia32_aout.c b/arch/x86/ia32/ia32_aout.c index 37b36a8ce5fa..8255fdc3a027 100644 --- a/arch/x86/ia32/ia32_aout.c +++ b/arch/x86/ia32/ia32_aout.c @@ -140,7 +140,6 @@ static int load_aout_binary(struct linux_binprm *bprm) set_personality_ia32(false); setup_new_exec(bprm); - install_exec_creds(bprm); regs->cs = __USER32_CS; regs->r8 = regs->r9 = regs->r10 = regs->r11 = regs->r12 = diff --git a/fs/binfmt_aout.c b/fs/binfmt_aout.c index ace587b66904..c8ba28f285e5 100644 --- a/fs/binfmt_aout.c +++ b/fs/binfmt_aout.c @@ -162,7 +162,6 @@ static int load_aout_binary(struct linux_binprm * bprm) set_personality(PER_LINUX); #endif setup_new_exec(bprm); - install_exec_creds(bprm); current->mm->end_code = ex.a_text + (current->mm->start_code = N_TXTADDR(ex)); diff --git a/fs/binfmt_elf.c b/fs/binfmt_elf.c index 13f25e241ac4..e6b586623035 100644 --- a/fs/binfmt_elf.c +++ b/fs/binfmt_elf.c @@ -858,7 +858,6 @@ out_free_interp: current->flags |= PF_RANDOMIZE; setup_new_exec(bprm); - install_exec_creds(bprm); /* Do this so that we can load the interpreter, if need be. We will change some of these later */ diff --git a/fs/binfmt_elf_fdpic.c b/fs/binfmt_elf_fdpic.c index 6c94c6d53d97..9a1aa61b4cc3 100644 --- a/fs/binfmt_elf_fdpic.c +++ b/fs/binfmt_elf_fdpic.c @@ -353,7 +353,6 @@ static int load_elf_fdpic_binary(struct linux_binprm *bprm) current->personality |= READ_IMPLIES_EXEC; setup_new_exec(bprm); - install_exec_creds(bprm); set_binfmt(&elf_fdpic_format); diff --git a/fs/binfmt_flat.c b/fs/binfmt_flat.c index 1a1d1fcb893f..252878969582 100644 --- a/fs/binfmt_flat.c +++ b/fs/binfmt_flat.c @@ -541,7 +541,6 @@ static int load_flat_file(struct linux_binprm *bprm, /* OK, This is the point of no return */ set_personality(PER_LINUX_32BIT); setup_new_exec(bprm); - install_exec_creds(bprm); } /* diff --git a/fs/exec.c b/fs/exec.c index 71de9f57ae09..93e40f865523 100644 --- a/fs/exec.c +++ b/fs/exec.c @@ -1443,6 +1443,31 @@ void setup_new_exec(struct linux_binprm * bprm) group */ WRITE_ONCE(current->self_exec_id, current->self_exec_id + 1); flush_signal_handlers(current, 0); + + /* + * install the new credentials for this executable + */ + security_bprm_committing_creds(bprm); + + commit_creds(bprm->cred); + bprm->cred = NULL; + + /* + * Disable monitoring for regular users + * when executing setuid binaries. Must + * wait until new credentials are committed + * by commit_creds() above + */ + if (get_dumpable(current->mm) != SUID_DUMP_USER) + perf_event_exit_task(current); + /* + * cred_guard_mutex must be held at least to this point to prevent + * ptrace_attach() from altering our determination of the task's + * credentials; any time after this it may be unlocked. + */ + security_bprm_committed_creds(bprm); + mutex_unlock(¤t->signal->exec_update_mutex); + mutex_unlock(¤t->signal->cred_guard_mutex); } EXPORT_SYMBOL(setup_new_exec); @@ -1458,7 +1483,7 @@ EXPORT_SYMBOL(finalize_exec); /* * Prepare credentials and lock ->cred_guard_mutex. - * install_exec_creds() commits the new creds and drops the lock. + * setup_new_exec() commits the new creds and drops the lock. * Or, if exec fails before, free_bprm() should release ->cred and * and unlock. 
*/ @@ -1504,35 +1529,6 @@ int bprm_change_interp(const char *interp, struct linux_binprm *bprm) } EXPORT_SYMBOL(bprm_change_interp); -/* - * install the new credentials for this executable - */ -void install_exec_creds(struct linux_binprm *bprm) -{ - security_bprm_committing_creds(bprm); - - commit_creds(bprm->cred); - bprm->cred = NULL; - - /* - * Disable monitoring for regular users - * when executing setuid binaries. Must - * wait until new credentials are committed - * by commit_creds() above - */ - if (get_dumpable(current->mm) != SUID_DUMP_USER) - perf_event_exit_task(current); - /* - * cred_guard_mutex must be held at least to this point to prevent - * ptrace_attach() from altering our determination of the task's - * credentials; any time after this it may be unlocked. - */ - security_bprm_committed_creds(bprm); - mutex_unlock(¤t->signal->exec_update_mutex); - mutex_unlock(¤t->signal->cred_guard_mutex); -} -EXPORT_SYMBOL(install_exec_creds); - /* * determine how safe it is to execute the proposed program * - the caller must hold ->cred_guard_mutex to protect against diff --git a/include/linux/binfmts.h b/include/linux/binfmts.h index 8f479dad7931..2a8fddf3574a 100644 --- a/include/linux/binfmts.h +++ b/include/linux/binfmts.h @@ -145,7 +145,6 @@ extern int transfer_args_to_stack(struct linux_binprm *bprm, extern int bprm_change_interp(const char *interp, struct linux_binprm *bprm); extern int copy_strings_kernel(int argc, const char *const *argv, struct linux_binprm *bprm); -extern void install_exec_creds(struct linux_binprm *bprm); extern void set_binfmt(struct linux_binfmt *new); extern ssize_t read_code(struct file *, unsigned long, loff_t, size_t); diff --git a/kernel/events/core.c b/kernel/events/core.c index 633b4ae72ed5..169449b5e56b 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -12217,7 +12217,7 @@ static void perf_event_exit_task_context(struct task_struct *child, int ctxn) * When a child task exits, feed back event values to parent events. * * Can be called with exec_update_mutex held when called from - * install_exec_creds(). + * setup_new_exec(). */ void perf_event_exit_task(struct task_struct *child) { -- cgit v1.2.3 From 2388777a0a5957a10b3d78677216530a9b3bd09f Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Sun, 3 May 2020 07:54:10 -0500 Subject: exec: Rename flush_old_exec begin_new_exec There is and has been for a very long time been a lot more going on in flush_old_exec than just flushing the old state. After the movement of code from setup_new_exec there is a whole lot more going on than just flushing the old executables state. Rename flush_old_exec to begin_new_exec to more accurately reflect what this function does. Reviewed-by: Kees Cook Reviewed-by: Greg Ungerer Signed-off-by: "Eric W. 
Biederman" --- Documentation/trace/ftrace.rst | 2 +- arch/x86/ia32/ia32_aout.c | 2 +- fs/binfmt_aout.c | 2 +- fs/binfmt_elf.c | 2 +- fs/binfmt_elf_fdpic.c | 2 +- fs/binfmt_flat.c | 2 +- fs/exec.c | 4 ++-- include/linux/binfmts.h | 2 +- 8 files changed, 9 insertions(+), 9 deletions(-) (limited to 'include/linux') diff --git a/Documentation/trace/ftrace.rst b/Documentation/trace/ftrace.rst index 3b5614b1d1a5..430a16283103 100644 --- a/Documentation/trace/ftrace.rst +++ b/Documentation/trace/ftrace.rst @@ -1524,7 +1524,7 @@ display-graph option:: => remove_vma => exit_mmap => mmput - => flush_old_exec + => begin_new_exec => load_elf_binary => search_binary_handler => __do_execve_file.isra.32 diff --git a/arch/x86/ia32/ia32_aout.c b/arch/x86/ia32/ia32_aout.c index 8255fdc3a027..385d3d172ee1 100644 --- a/arch/x86/ia32/ia32_aout.c +++ b/arch/x86/ia32/ia32_aout.c @@ -131,7 +131,7 @@ static int load_aout_binary(struct linux_binprm *bprm) return -ENOMEM; /* Flush all traces of the currently running executable */ - retval = flush_old_exec(bprm); + retval = begin_new_exec(bprm); if (retval) return retval; diff --git a/fs/binfmt_aout.c b/fs/binfmt_aout.c index c8ba28f285e5..3e84e9bb9084 100644 --- a/fs/binfmt_aout.c +++ b/fs/binfmt_aout.c @@ -151,7 +151,7 @@ static int load_aout_binary(struct linux_binprm * bprm) return -ENOMEM; /* Flush all traces of the currently running executable */ - retval = flush_old_exec(bprm); + retval = begin_new_exec(bprm); if (retval) return retval; diff --git a/fs/binfmt_elf.c b/fs/binfmt_elf.c index e6b586623035..396d5c2e6b5e 100644 --- a/fs/binfmt_elf.c +++ b/fs/binfmt_elf.c @@ -844,7 +844,7 @@ out_free_interp: goto out_free_dentry; /* Flush all traces of the currently running executable */ - retval = flush_old_exec(bprm); + retval = begin_new_exec(bprm); if (retval) goto out_free_dentry; diff --git a/fs/binfmt_elf_fdpic.c b/fs/binfmt_elf_fdpic.c index 9a1aa61b4cc3..896e3ca9bf85 100644 --- a/fs/binfmt_elf_fdpic.c +++ b/fs/binfmt_elf_fdpic.c @@ -338,7 +338,7 @@ static int load_elf_fdpic_binary(struct linux_binprm *bprm) interp_params.flags |= ELF_FDPIC_FLAG_CONSTDISP; /* flush all traces of the currently running executable */ - retval = flush_old_exec(bprm); + retval = begin_new_exec(bprm); if (retval) goto error; diff --git a/fs/binfmt_flat.c b/fs/binfmt_flat.c index 252878969582..9b82bc111d0a 100644 --- a/fs/binfmt_flat.c +++ b/fs/binfmt_flat.c @@ -534,7 +534,7 @@ static int load_flat_file(struct linux_binprm *bprm, /* Flush all traces of the currently running executable */ if (id == 0) { - ret = flush_old_exec(bprm); + ret = begin_new_exec(bprm); if (ret) goto err; diff --git a/fs/exec.c b/fs/exec.c index 0eff20558735..3cc40048cc65 100644 --- a/fs/exec.c +++ b/fs/exec.c @@ -1298,7 +1298,7 @@ void __set_task_comm(struct task_struct *tsk, const char *buf, bool exec) * signal (via de_thread() or coredump), or will have SEGV raised * (after exec_mmap()) by search_binary_handlers (see below). 
*/ -int flush_old_exec(struct linux_binprm * bprm) +int begin_new_exec(struct linux_binprm * bprm) { struct task_struct *me = current; int retval; @@ -1433,7 +1433,7 @@ out_unlock: out: return retval; } -EXPORT_SYMBOL(flush_old_exec); +EXPORT_SYMBOL(begin_new_exec); void would_dump(struct linux_binprm *bprm, struct file *file) { diff --git a/include/linux/binfmts.h b/include/linux/binfmts.h index 2a8fddf3574a..1b48e2154766 100644 --- a/include/linux/binfmts.h +++ b/include/linux/binfmts.h @@ -125,7 +125,7 @@ extern void unregister_binfmt(struct linux_binfmt *); extern int prepare_binprm(struct linux_binprm *); extern int __must_check remove_arg_zero(struct linux_binprm *); extern int search_binary_handler(struct linux_binprm *); -extern int flush_old_exec(struct linux_binprm * bprm); +extern int begin_new_exec(struct linux_binprm * bprm); extern void setup_new_exec(struct linux_binprm * bprm); extern void finalize_exec(struct linux_binprm *bprm); extern void would_dump(struct linux_binprm *, struct file *); -- cgit v1.2.3 From 7c8e2bdd5f0d990e2398ee3deafc626dd469fc2d Mon Sep 17 00:00:00 2001 From: Josh Poimboeuf Date: Wed, 29 Apr 2020 10:24:44 -0500 Subject: livepatch: Apply vmlinux-specific KLP relocations early KLP relocations are livepatch-specific relocations which are applied to a KLP module's text or data. They exist for two reasons: 1) Unexported symbols: replacement functions often need to access unexported symbols (e.g. static functions), which "normal" relocations don't allow. 2) Late module patching: this is the ability for a KLP module to bypass normal module dependencies, such that the KLP module can be loaded *before* a to-be-patched module. This means that relocations which need to access symbols in the to-be-patched module might need to be applied to the KLP module well after it has been loaded. Non-late-patched KLP relocations are applied from the KLP module's init function. That usually works fine, unless the patched code wants to use alternatives, paravirt patching, jump tables, or some other special section which needs relocations. Then we run into ordering issues and crashes. In order for those special sections to work properly, the KLP relocations should be applied *before* the special section init code runs, such as apply_paravirt(), apply_alternatives(), or jump_label_apply_nops(). You might think the obvious solution would be to move the KLP relocation initialization earlier, but it's not necessarily that simple. The problem is the above-mentioned late module patching, for which KLP relocations can get applied well after the KLP module is loaded. To "fix" this issue in the past, we created .klp.arch sections: .klp.arch.{module}..altinstructions .klp.arch.{module}..parainstructions Those sections allow KLP late module patching code to call apply_paravirt() and apply_alternatives() after the module-specific KLP relocations (.klp.rela.{module}.{section}) have been applied. But that has a lot of drawbacks, including code complexity, the need for arch-specific code, and the (per-arch) danger that we missed some special section -- for example the __jump_table section which is used for jump labels. It turns out there's a simpler and more functional approach. There are two kinds of KLP relocation sections: 1) vmlinux-specific KLP relocation sections .klp.rela.vmlinux.{sec} These are relocations (applied to the KLP module) which reference unexported vmlinux symbols. 
2) module-specific KLP relocation sections .klp.rela.{module}.{sec}: These are relocations (applied to the KLP module) which reference unexported or exported module symbols. Up until now, these have been treated the same. However, they're inherently different. Because of late module patching, module-specific KLP relocations can be applied very late, thus they can create the ordering headaches described above. But vmlinux-specific KLP relocations don't have that problem. There's nothing to prevent them from being applied earlier. So apply them at the same time as normal relocations, when the KLP module is being loaded. This means that for vmlinux-specific KLP relocations, we no longer have any ordering issues. vmlinux-referencing jump labels, alternatives, and paravirt patching will work automatically, without the need for the .klp.arch hacks. All that said, for module-specific KLP relocations, the ordering problems still exist and we *do* still need .klp.arch. Or do we? Stay tuned. Suggested-by: Peter Zijlstra Signed-off-by: Josh Poimboeuf Acked-by: Peter Zijlstra (Intel) Acked-by: Joe Lawrence Acked-by: Miroslav Benes Acked-by: Jessica Yu Signed-off-by: Jiri Kosina --- include/linux/livepatch.h | 14 +++++ kernel/livepatch/core.c | 137 +++++++++++++++++++++++++++++----------------- kernel/module.c | 10 ++-- 3 files changed, 106 insertions(+), 55 deletions(-) (limited to 'include/linux') diff --git a/include/linux/livepatch.h b/include/linux/livepatch.h index e894e74905f3..c4302e9a5905 100644 --- a/include/linux/livepatch.h +++ b/include/linux/livepatch.h @@ -234,6 +234,11 @@ void klp_shadow_free_all(unsigned long id, klp_shadow_dtor_t dtor); struct klp_state *klp_get_state(struct klp_patch *patch, unsigned long id); struct klp_state *klp_get_prev_state(unsigned long id); +int klp_apply_section_relocs(struct module *pmod, Elf_Shdr *sechdrs, + const char *shstrtab, const char *strtab, + unsigned int symindex, unsigned int secindex, + const char *objname); + #else /* !CONFIG_LIVEPATCH */ static inline int klp_module_coming(struct module *mod) { return 0; } @@ -242,6 +247,15 @@ static inline bool klp_patch_pending(struct task_struct *task) { return false; } static inline void klp_update_patch_state(struct task_struct *task) {} static inline void klp_copy_process(struct task_struct *child) {} +static inline +int klp_apply_section_relocs(struct module *pmod, Elf_Shdr *sechdrs, + const char *shstrtab, const char *strtab, + unsigned int symindex, unsigned int secindex, + const char *objname) +{ + return 0; +} + #endif /* CONFIG_LIVEPATCH */ #endif /* _LINUX_LIVEPATCH_H_ */ diff --git a/kernel/livepatch/core.c b/kernel/livepatch/core.c index 40cfac8156fd..c02791e5c75b 100644 --- a/kernel/livepatch/core.c +++ b/kernel/livepatch/core.c @@ -191,12 +191,12 @@ static int klp_find_object_symbol(const char *objname, const char *name, return -EINVAL; } -static int klp_resolve_symbols(Elf_Shdr *relasec, struct module *pmod) +static int klp_resolve_symbols(Elf64_Shdr *sechdrs, const char *strtab, + unsigned int symndx, Elf_Shdr *relasec) { int i, cnt, vmlinux, ret; char objname[MODULE_NAME_LEN]; char symname[KSYM_NAME_LEN]; - char *strtab = pmod->core_kallsyms.strtab; Elf_Rela *relas; Elf_Sym *sym; unsigned long sympos, addr; @@ -216,7 +216,7 @@ static int klp_resolve_symbols(Elf_Shdr *relasec, struct module *pmod) relas = (Elf_Rela *) relasec->sh_addr; /* For each rela in this klp relocation section */ for (i = 0; i < relasec->sh_size / sizeof(Elf_Rela); i++) { - sym = pmod->core_kallsyms.symtab + 
ELF_R_SYM(relas[i].r_info); + sym = (Elf64_Sym *)sechdrs[symndx].sh_addr + ELF_R_SYM(relas[i].r_info); if (sym->st_shndx != SHN_LIVEPATCH) { pr_err("symbol %s is not marked as a livepatch symbol\n", strtab + sym->st_name); @@ -246,54 +246,59 @@ static int klp_resolve_symbols(Elf_Shdr *relasec, struct module *pmod) return 0; } -static int klp_write_object_relocations(struct module *pmod, - struct klp_object *obj) +/* + * At a high-level, there are two types of klp relocation sections: those which + * reference symbols which live in vmlinux; and those which reference symbols + * which live in other modules. This function is called for both types: + * + * 1) When a klp module itself loads, the module code calls this function to + * write vmlinux-specific klp relocations (.klp.rela.vmlinux.* sections). + * These relocations are written to the klp module text to allow the patched + * code/data to reference unexported vmlinux symbols. They're written as + * early as possible to ensure that other module init code (.e.g., + * jump_label_apply_nops) can access any unexported vmlinux symbols which + * might be referenced by the klp module's special sections. + * + * 2) When a to-be-patched module loads -- or is already loaded when a + * corresponding klp module loads -- klp code calls this function to write + * module-specific klp relocations (.klp.rela.{module}.* sections). These + * are written to the klp module text to allow the patched code/data to + * reference symbols which live in the to-be-patched module or one of its + * module dependencies. Exported symbols are supported, in addition to + * unexported symbols, in order to enable late module patching, which allows + * the to-be-patched module to be loaded and patched sometime *after* the + * klp module is loaded. + */ +int klp_apply_section_relocs(struct module *pmod, Elf_Shdr *sechdrs, + const char *shstrtab, const char *strtab, + unsigned int symndx, unsigned int secndx, + const char *objname) { - int i, cnt, ret = 0; - const char *objname, *secname; + int cnt, ret; char sec_objname[MODULE_NAME_LEN]; - Elf_Shdr *sec; + Elf_Shdr *sec = sechdrs + secndx; - if (WARN_ON(!klp_is_object_loaded(obj))) + /* + * Format: .klp.rela.sec_objname.section_name + * See comment in klp_resolve_symbols() for an explanation + * of the selected field width value. + */ + cnt = sscanf(shstrtab + sec->sh_name, ".klp.rela.%55[^.]", + sec_objname); + if (cnt != 1) { + pr_err("section %s has an incorrectly formatted name\n", + shstrtab + sec->sh_name); return -EINVAL; + } - objname = klp_is_module(obj) ? obj->name : "vmlinux"; - - /* For each klp relocation section */ - for (i = 1; i < pmod->klp_info->hdr.e_shnum; i++) { - sec = pmod->klp_info->sechdrs + i; - secname = pmod->klp_info->secstrings + sec->sh_name; - if (!(sec->sh_flags & SHF_RELA_LIVEPATCH)) - continue; - - /* - * Format: .klp.rela.sec_objname.section_name - * See comment in klp_resolve_symbols() for an explanation - * of the selected field width value. - */ - cnt = sscanf(secname, ".klp.rela.%55[^.]", sec_objname); - if (cnt != 1) { - pr_err("section %s has an incorrectly formatted name\n", - secname); - ret = -EINVAL; - break; - } - - if (strcmp(objname, sec_objname)) - continue; - - ret = klp_resolve_symbols(sec, pmod); - if (ret) - break; + if (strcmp(objname ? 
objname : "vmlinux", sec_objname)) + return 0; - ret = apply_relocate_add(pmod->klp_info->sechdrs, - pmod->core_kallsyms.strtab, - pmod->klp_info->symndx, i, pmod); - if (ret) - break; - } + ret = klp_resolve_symbols(sechdrs, strtab, symndx, sec); + if (ret) + return ret; - return ret; + return apply_relocate_add(sechdrs, strtab, symndx, secndx, pmod); } /* @@ -730,6 +735,28 @@ void __weak arch_klp_init_object_loaded(struct klp_patch *patch, { } +int klp_apply_object_relocs(struct klp_patch *patch, struct klp_object *obj) +{ + int i, ret; + struct klp_modinfo *info = patch->mod->klp_info; + + for (i = 1; i < info->hdr.e_shnum; i++) { + Elf_Shdr *sec = info->sechdrs + i; + + if (!(sec->sh_flags & SHF_RELA_LIVEPATCH)) + continue; + + ret = klp_apply_section_relocs(patch->mod, info->sechdrs, + info->secstrings, + patch->mod->core_kallsyms.strtab, + info->symndx, i, obj->name); + if (ret) + return ret; + } + + return 0; +} + /* parts of the initialization that is done only when the object is loaded */ static int klp_init_object_loaded(struct klp_patch *patch, struct klp_object *obj) @@ -738,18 +765,26 @@ static int klp_init_object_loaded(struct klp_patch *patch, int ret; mutex_lock(&text_mutex); - module_disable_ro(patch->mod); - ret = klp_write_object_relocations(patch->mod, obj); - if (ret) { - module_enable_ro(patch->mod, true); - mutex_unlock(&text_mutex); - return ret; + + if (klp_is_module(obj)) { + /* + * Only write module-specific relocations here + * (.klp.rela.{module}.*). vmlinux-specific relocations were + * written earlier during the initialization of the klp module + * itself. + */ + ret = klp_apply_object_relocs(patch, obj); + if (ret) { + module_enable_ro(patch->mod, true); + mutex_unlock(&text_mutex); + return ret; + } } arch_klp_init_object_loaded(patch, obj); - module_enable_ro(patch->mod, true); + module_enable_ro(patch->mod, true); mutex_unlock(&text_mutex); klp_for_each_func(obj, func) { diff --git a/kernel/module.c b/kernel/module.c index 646f1e2330d2..fdd9f6970e9a 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -2334,11 +2334,13 @@ static int apply_relocations(struct module *mod, const struct load_info *info) if (!(info->sechdrs[infosec].sh_flags & SHF_ALLOC)) continue; - /* Livepatch relocation sections are applied by livepatch */ if (info->sechdrs[i].sh_flags & SHF_RELA_LIVEPATCH) - continue; - - if (info->sechdrs[i].sh_type == SHT_REL) + err = klp_apply_section_relocs(mod, info->sechdrs, + info->secstrings, + info->strtab, + info->index.sym, i, + NULL); + else if (info->sechdrs[i].sh_type == SHT_REL) err = apply_relocate(info->sechdrs, info->strtab, info->index.sym, i, mod); else if (info->sechdrs[i].sh_type == SHT_RELA) -- cgit v1.2.3 From 1d05334d2899bd3ecdf01beb53f0a70884a7f471 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Wed, 29 Apr 2020 10:24:45 -0500 Subject: livepatch: Remove .klp.arch After the previous patch, vmlinux-specific KLP relocations are now applied early during KLP module load. This means that .klp.arch sections are no longer needed for *vmlinux-specific* KLP relocations. One might think they're still needed for *module-specific* KLP relocations. If a to-be-patched module is loaded *after* its corresponding KLP module is loaded, any corresponding KLP relocations will be delayed until the to-be-patched module is loaded. If any special sections (.parainstructions, for example) rely on those relocations, their initializations (apply_paravirt) need to be done afterwards. 
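To make the naming convention concrete: a hypothetical classifier (a sketch only, not part of this series; it assumes the ".klp.rela.<objname>.<section>" format and the 55-character field width used by klp_apply_section_relocs() in the previous patch) could look like:

static bool klp_rela_targets_vmlinux(const char *secname)
{
	char sec_objname[MODULE_NAME_LEN];

	/* ".klp.rela.<objname>.<section>": pull out <objname> */
	if (sscanf(secname, ".klp.rela.%55[^.]", sec_objname) != 1)
		return false;	/* malformed section name */

	/* "vmlinux" relocations can be applied at klp module load time */
	return !strcmp(sec_objname, "vmlinux");
}

Any other <objname> means the section must wait for that module to load, which is the delayed case described above.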
Thus the apparent need for arch_klp_init_object_loaded() and its corresponding .klp.arch sections -- it allows some of the special section initializations to be done at a later time. But... if you look closer, that dependency between the special sections and the module-specific KLP relocations doesn't actually exist in reality. Looking at the contents of the .altinstructions and .parainstructions sections, there's not a realistic scenario in which a KLP module's .altinstructions or .parainstructions section needs to access a symbol in a to-be-patched module. It might need to access a local symbol or even a vmlinux symbol; but not another module's symbol. When a special section needs to reference a local or vmlinux symbol, a normal rela can be used instead of a KLP rela. Since the special section initializations don't actually have any real dependency on module-specific KLP relocations, .klp.arch and arch_klp_init_object_loaded() no longer have a reason to exist. So remove them. As Peter said much more succinctly: So the reason for .klp.arch was that .klp.rela.* stuff would overwrite paravirt instructions. If that happens you're doing it wrong. Those RELAs are core kernel, not module, and thus should've happened in .rela.* sections at patch-module loading time. Reverting this removes the two apply_{paravirt,alternatives}() calls from the late patching path, and means we don't have to worry about them when removing module_disable_ro(). [ jpoimboe: Rewrote patch description. Tweaked klp_init_object_loaded() error path. ] Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Josh Poimboeuf Acked-by: Peter Zijlstra (Intel) Acked-by: Joe Lawrence Acked-by: Miroslav Benes Signed-off-by: Jiri Kosina --- Documentation/livepatch/module-elf-format.rst | 15 +------- arch/x86/kernel/Makefile | 1 - arch/x86/kernel/livepatch.c | 53 --------------------------- include/linux/livepatch.h | 3 -- kernel/livepatch/core.c | 27 +++++--------- 5 files changed, 11 insertions(+), 88 deletions(-) delete mode 100644 arch/x86/kernel/livepatch.c (limited to 'include/linux') diff --git a/Documentation/livepatch/module-elf-format.rst b/Documentation/livepatch/module-elf-format.rst index 2a591e6f8e6c..8c6b894c4661 100644 --- a/Documentation/livepatch/module-elf-format.rst +++ b/Documentation/livepatch/module-elf-format.rst @@ -14,8 +14,7 @@ This document outlines the Elf format requirements that livepatch modules must f 4. Livepatch symbols 4.1 A livepatch module's symbol table 4.2 Livepatch symbol format - 5. Architecture-specific sections - 6. Symbol table and Elf section access + 5. Symbol table and Elf section access 1. Background and motivation ============================ @@ -298,17 +297,7 @@ Examples: Note that the 'Ndx' (Section index) for these symbols is SHN_LIVEPATCH (0xff20). "OS" means OS-specific. -5. Architecture-specific sections -================================= -Architectures may override arch_klp_init_object_loaded() to perform -additional arch-specific tasks when a target module loads, such as applying -arch-specific sections. On x86 for example, we must apply per-object -.altinstructions and .parainstructions sections when a target module loads. -These sections must be prefixed with ".klp.arch.$objname." so that they can -be easily identified when iterating through a patch module's Elf sections -(See arch/x86/kernel/livepatch.c for a complete example). - -6. Symbol table and Elf section access +5. 
Symbol table and Elf section access ====================================== A livepatch module's symbol table is accessible through module->symtab. diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile index ba89cabe5fcf..bae9f9033734 100644 --- a/arch/x86/kernel/Makefile +++ b/arch/x86/kernel/Makefile @@ -90,7 +90,6 @@ obj-$(CONFIG_X86_MPPARSE) += mpparse.o obj-y += apic/ obj-$(CONFIG_X86_REBOOTFIXUPS) += reboot_fixups_32.o obj-$(CONFIG_DYNAMIC_FTRACE) += ftrace.o -obj-$(CONFIG_LIVEPATCH) += livepatch.o obj-$(CONFIG_FUNCTION_TRACER) += ftrace_$(BITS).o obj-$(CONFIG_FUNCTION_GRAPH_TRACER) += ftrace.o obj-$(CONFIG_FTRACE_SYSCALLS) += ftrace.o diff --git a/arch/x86/kernel/livepatch.c b/arch/x86/kernel/livepatch.c deleted file mode 100644 index 6a68e41206e7..000000000000 --- a/arch/x86/kernel/livepatch.c +++ /dev/null @@ -1,53 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-or-later -/* - * livepatch.c - x86-specific Kernel Live Patching Core - */ - -#include -#include -#include -#include - -/* Apply per-object alternatives. Based on x86 module_finalize() */ -void arch_klp_init_object_loaded(struct klp_patch *patch, - struct klp_object *obj) -{ - int cnt; - struct klp_modinfo *info; - Elf_Shdr *s, *alt = NULL, *para = NULL; - void *aseg, *pseg; - const char *objname; - char sec_objname[MODULE_NAME_LEN]; - char secname[KSYM_NAME_LEN]; - - info = patch->mod->klp_info; - objname = obj->name ? obj->name : "vmlinux"; - - /* See livepatch core code for BUILD_BUG_ON() explanation */ - BUILD_BUG_ON(MODULE_NAME_LEN < 56 || KSYM_NAME_LEN != 128); - - for (s = info->sechdrs; s < info->sechdrs + info->hdr.e_shnum; s++) { - /* Apply per-object .klp.arch sections */ - cnt = sscanf(info->secstrings + s->sh_name, - ".klp.arch.%55[^.].%127s", - sec_objname, secname); - if (cnt != 2) - continue; - if (strcmp(sec_objname, objname)) - continue; - if (!strcmp(".altinstructions", secname)) - alt = s; - if (!strcmp(".parainstructions", secname)) - para = s; - } - - if (alt) { - aseg = (void *) alt->sh_addr; - apply_alternatives(aseg, aseg + alt->sh_size); - } - - if (para) { - pseg = (void *) para->sh_addr; - apply_paravirt(pseg, pseg + para->sh_size); - } -} diff --git a/include/linux/livepatch.h b/include/linux/livepatch.h index c4302e9a5905..2614247a9781 100644 --- a/include/linux/livepatch.h +++ b/include/linux/livepatch.h @@ -195,9 +195,6 @@ struct klp_patch { int klp_enable_patch(struct klp_patch *); -void arch_klp_init_object_loaded(struct klp_patch *patch, - struct klp_object *obj); - /* Called from the module loader during module coming/going states */ int klp_module_coming(struct module *mod); void klp_module_going(struct module *mod); diff --git a/kernel/livepatch/core.c b/kernel/livepatch/core.c index c02791e5c75b..16632e75112a 100644 --- a/kernel/livepatch/core.c +++ b/kernel/livepatch/core.c @@ -729,12 +729,6 @@ static int klp_init_func(struct klp_object *obj, struct klp_func *func) func->old_sympos ? 
func->old_sympos : 1); } -/* Arches may override this to finish any remaining arch-specific tasks */ -void __weak arch_klp_init_object_loaded(struct klp_patch *patch, - struct klp_object *obj) -{ -} - int klp_apply_object_relocs(struct klp_patch *patch, struct klp_object *obj) { int i, ret; @@ -764,10 +758,11 @@ static int klp_init_object_loaded(struct klp_patch *patch, struct klp_func *func; int ret; - mutex_lock(&text_mutex); - module_disable_ro(patch->mod); - if (klp_is_module(obj)) { + + mutex_lock(&text_mutex); + module_disable_ro(patch->mod); + /* * Only write module-specific relocations here * (.klp.rela.{module}.*). vmlinux-specific relocations were @@ -775,17 +770,13 @@ static int klp_init_object_loaded(struct klp_patch *patch, * itself. */ ret = klp_apply_object_relocs(patch, obj); - if (ret) { - module_enable_ro(patch->mod, true); - mutex_unlock(&text_mutex); - return ret; - } - } - arch_klp_init_object_loaded(patch, obj); + module_enable_ro(patch->mod, true); + mutex_unlock(&text_mutex); - module_enable_ro(patch->mod, true); - mutex_unlock(&text_mutex); + if (ret) + return ret; + } klp_for_each_func(obj, func) { ret = klp_find_object_symbol(obj->name, func->old_name, -- cgit v1.2.3 From 0d9fbf78fefb421a3af97394ce80bba0db4f046a Mon Sep 17 00:00:00 2001 From: Josh Poimboeuf Date: Wed, 29 Apr 2020 10:24:51 -0500 Subject: module: Remove module_disable_ro() module_disable_ro() has no more users. Remove it. Signed-off-by: Josh Poimboeuf Acked-by: Peter Zijlstra (Intel) Acked-by: Joe Lawrence Acked-by: Miroslav Benes Acked-by: Jessica Yu Signed-off-by: Jiri Kosina --- include/linux/module.h | 2 -- kernel/module.c | 13 ------------- 2 files changed, 15 deletions(-) (limited to 'include/linux') diff --git a/include/linux/module.h b/include/linux/module.h index 1ad393e62bef..e4ef7b36feda 100644 --- a/include/linux/module.h +++ b/include/linux/module.h @@ -860,10 +860,8 @@ extern int module_sysfs_initialized; #ifdef CONFIG_STRICT_MODULE_RWX extern void module_enable_ro(const struct module *mod, bool after_init); -extern void module_disable_ro(const struct module *mod); #else static inline void module_enable_ro(const struct module *mod, bool after_init) { } -static inline void module_disable_ro(const struct module *mod) { } #endif #ifdef CONFIG_GENERIC_BUG diff --git a/kernel/module.c b/kernel/module.c index fdd9f6970e9a..3ba024afe379 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -1997,19 +1997,6 @@ static void frob_writable_data(const struct module_layout *layout, (layout->size - layout->ro_after_init_size) >> PAGE_SHIFT); } -/* livepatching wants to disable read-only so it can frob module. */ -void module_disable_ro(const struct module *mod) -{ - if (!rodata_enabled) - return; - - frob_text(&mod->core_layout, set_memory_rw); - frob_rodata(&mod->core_layout, set_memory_rw); - frob_ro_after_init(&mod->core_layout, set_memory_rw); - frob_text(&mod->init_layout, set_memory_rw); - frob_rodata(&mod->init_layout, set_memory_rw); -} - void module_enable_ro(const struct module *mod, bool after_init) { if (!rodata_enabled) -- cgit v1.2.3 From e6eff4376e2897c2e14b70d87bf7284cdb093830 Mon Sep 17 00:00:00 2001 From: Josh Poimboeuf Date: Wed, 29 Apr 2020 10:24:53 -0500 Subject: module: Make module_enable_ro() static again Now that module_enable_ro() has no more external users, make it static again. 
Suggested-by: Jessica Yu Signed-off-by: Josh Poimboeuf Acked-by: Jessica Yu Signed-off-by: Jiri Kosina --- include/linux/module.h | 6 ------ kernel/module.c | 3 ++- 2 files changed, 2 insertions(+), 7 deletions(-) (limited to 'include/linux') diff --git a/include/linux/module.h b/include/linux/module.h index e4ef7b36feda..2c2e988bcf10 100644 --- a/include/linux/module.h +++ b/include/linux/module.h @@ -858,12 +858,6 @@ extern int module_sysfs_initialized; #define __MODULE_STRING(x) __stringify(x) -#ifdef CONFIG_STRICT_MODULE_RWX -extern void module_enable_ro(const struct module *mod, bool after_init); -#else -static inline void module_enable_ro(const struct module *mod, bool after_init) { } -#endif - #ifdef CONFIG_GENERIC_BUG void module_bug_finalize(const Elf_Ehdr *, const Elf_Shdr *, struct module *); diff --git a/kernel/module.c b/kernel/module.c index 3ba024afe379..a26343ea4d50 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -1997,7 +1997,7 @@ static void frob_writable_data(const struct module_layout *layout, (layout->size - layout->ro_after_init_size) >> PAGE_SHIFT); } -void module_enable_ro(const struct module *mod, bool after_init) +static void module_enable_ro(const struct module *mod, bool after_init) { if (!rodata_enabled) return; @@ -2025,6 +2025,7 @@ static void module_enable_nx(const struct module *mod) #else /* !CONFIG_STRICT_MODULE_RWX */ static void module_enable_nx(const struct module *mod) { } +static void module_enable_ro(const struct module *mod, bool after_init) {} #endif /* CONFIG_STRICT_MODULE_RWX */ static void module_enable_x(const struct module *mod) { -- cgit v1.2.3 From 307f660d056b5eb8f5bb2328fac3915ab75b5007 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Thu, 7 May 2020 09:32:18 -0700 Subject: netpoll: remove dev argument from netpoll_send_skb_on_dev() netpoll_send_skb_on_dev() can get the device pointer directly from np->dev Rename it to __netpoll_send_skb() Following patch will move netpoll_send_skb() out-of-line. Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- include/linux/netpoll.h | 5 ++--- net/core/netpoll.c | 10 ++++++---- 2 files changed, 8 insertions(+), 7 deletions(-) (limited to 'include/linux') diff --git a/include/linux/netpoll.h b/include/linux/netpoll.h index 676f1ff161a9..00e0bae3d402 100644 --- a/include/linux/netpoll.h +++ b/include/linux/netpoll.h @@ -63,13 +63,12 @@ int netpoll_setup(struct netpoll *np); void __netpoll_cleanup(struct netpoll *np); void __netpoll_free(struct netpoll *np); void netpoll_cleanup(struct netpoll *np); -void netpoll_send_skb_on_dev(struct netpoll *np, struct sk_buff *skb, - struct net_device *dev); +void __netpoll_send_skb(struct netpoll *np, struct sk_buff *skb); static inline void netpoll_send_skb(struct netpoll *np, struct sk_buff *skb) { unsigned long flags; local_irq_save(flags); - netpoll_send_skb_on_dev(np, skb, np->dev); + __netpoll_send_skb(np, skb); local_irq_restore(flags); } diff --git a/net/core/netpoll.c b/net/core/netpoll.c index 15b366a1a958..c5059b7ffc94 100644 --- a/net/core/netpoll.c +++ b/net/core/netpoll.c @@ -305,17 +305,19 @@ static int netpoll_owner_active(struct net_device *dev) } /* call with IRQ disabled */ -void netpoll_send_skb_on_dev(struct netpoll *np, struct sk_buff *skb, - struct net_device *dev) +void __netpoll_send_skb(struct netpoll *np, struct sk_buff *skb) { netdev_tx_t status = NETDEV_TX_BUSY; + struct net_device *dev; unsigned long tries; /* It is up to the caller to keep npinfo alive. 
*/ struct netpoll_info *npinfo; lockdep_assert_irqs_disabled(); - npinfo = rcu_dereference_bh(np->dev->npinfo); + dev = np->dev; + npinfo = rcu_dereference_bh(dev->npinfo); + if (!npinfo || !netif_running(dev) || !netif_device_present(dev)) { dev_kfree_skb_irq(skb); return; @@ -358,7 +360,7 @@ void netpoll_send_skb_on_dev(struct netpoll *np, struct sk_buff *skb, schedule_delayed_work(&npinfo->tx_work,0); } } -EXPORT_SYMBOL(netpoll_send_skb_on_dev); +EXPORT_SYMBOL(__netpoll_send_skb); void netpoll_send_udp(struct netpoll *np, const char *msg, int len) { -- cgit v1.2.3 From fb1eee476b0d3be3e58dac1a3a96f726c6278bed Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Thu, 7 May 2020 09:32:19 -0700 Subject: netpoll: move netpoll_send_skb() out of line There is no need to inline this helper, as we intend to add more code in this function. Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- include/linux/netpoll.h | 9 +-------- net/core/netpoll.c | 13 +++++++++++-- 2 files changed, 12 insertions(+), 10 deletions(-) (limited to 'include/linux') diff --git a/include/linux/netpoll.h b/include/linux/netpoll.h index 00e0bae3d402..e466ddffef61 100644 --- a/include/linux/netpoll.h +++ b/include/linux/netpoll.h @@ -63,14 +63,7 @@ int netpoll_setup(struct netpoll *np); void __netpoll_cleanup(struct netpoll *np); void __netpoll_free(struct netpoll *np); void netpoll_cleanup(struct netpoll *np); -void __netpoll_send_skb(struct netpoll *np, struct sk_buff *skb); -static inline void netpoll_send_skb(struct netpoll *np, struct sk_buff *skb) -{ - unsigned long flags; - local_irq_save(flags); - __netpoll_send_skb(np, skb); - local_irq_restore(flags); -} +void netpoll_send_skb(struct netpoll *np, struct sk_buff *skb); #ifdef CONFIG_NETPOLL static inline void *netpoll_poll_lock(struct napi_struct *napi) diff --git a/net/core/netpoll.c b/net/core/netpoll.c index c5059b7ffc94..34cd34f24423 100644 --- a/net/core/netpoll.c +++ b/net/core/netpoll.c @@ -305,7 +305,7 @@ static int netpoll_owner_active(struct net_device *dev) } /* call with IRQ disabled */ -void __netpoll_send_skb(struct netpoll *np, struct sk_buff *skb) +static void __netpoll_send_skb(struct netpoll *np, struct sk_buff *skb) { netdev_tx_t status = NETDEV_TX_BUSY; struct net_device *dev; @@ -360,7 +360,16 @@ void __netpoll_send_skb(struct netpoll *np, struct sk_buff *skb) schedule_delayed_work(&npinfo->tx_work,0); } } -EXPORT_SYMBOL(__netpoll_send_skb); + +void netpoll_send_skb(struct netpoll *np, struct sk_buff *skb) +{ + unsigned long flags; + + local_irq_save(flags); + __netpoll_send_skb(np, skb); + local_irq_restore(flags); +} +EXPORT_SYMBOL(netpoll_send_skb); void netpoll_send_udp(struct netpoll *np, const char *msg, int len) { -- cgit v1.2.3 From 1ddabdfaf70c202b88925edd74c66f4707dbd92e Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Thu, 7 May 2020 09:32:20 -0700 Subject: netpoll: netpoll_send_skb() returns transmit status Some callers want to know if the packet has been sent or dropped, to inform upper stacks. Signed-off-by: Eric Dumazet Signed-off-by: David S. 
Miller --- include/linux/netpoll.h | 2 +- net/core/netpoll.c | 11 +++++++---- 2 files changed, 8 insertions(+), 5 deletions(-) (limited to 'include/linux') diff --git a/include/linux/netpoll.h b/include/linux/netpoll.h index e466ddffef61..f47af135bd56 100644 --- a/include/linux/netpoll.h +++ b/include/linux/netpoll.h @@ -63,7 +63,7 @@ int netpoll_setup(struct netpoll *np); void __netpoll_cleanup(struct netpoll *np); void __netpoll_free(struct netpoll *np); void netpoll_cleanup(struct netpoll *np); -void netpoll_send_skb(struct netpoll *np, struct sk_buff *skb); +netdev_tx_t netpoll_send_skb(struct netpoll *np, struct sk_buff *skb); #ifdef CONFIG_NETPOLL static inline void *netpoll_poll_lock(struct napi_struct *napi) diff --git a/net/core/netpoll.c b/net/core/netpoll.c index 34cd34f24423..40d2753aa47d 100644 --- a/net/core/netpoll.c +++ b/net/core/netpoll.c @@ -305,7 +305,7 @@ static int netpoll_owner_active(struct net_device *dev) } /* call with IRQ disabled */ -static void __netpoll_send_skb(struct netpoll *np, struct sk_buff *skb) +static netdev_tx_t __netpoll_send_skb(struct netpoll *np, struct sk_buff *skb) { netdev_tx_t status = NETDEV_TX_BUSY; struct net_device *dev; @@ -320,7 +320,7 @@ static void __netpoll_send_skb(struct netpoll *np, struct sk_buff *skb) if (!npinfo || !netif_running(dev) || !netif_device_present(dev)) { dev_kfree_skb_irq(skb); - return; + return NET_XMIT_DROP; } /* don't get messages out of order, and no recursion */ @@ -359,15 +359,18 @@ static void __netpoll_send_skb(struct netpoll *np, struct sk_buff *skb) skb_queue_tail(&npinfo->txq, skb); schedule_delayed_work(&npinfo->tx_work,0); } + return NETDEV_TX_OK; } -void netpoll_send_skb(struct netpoll *np, struct sk_buff *skb) +netdev_tx_t netpoll_send_skb(struct netpoll *np, struct sk_buff *skb) { unsigned long flags; + netdev_tx_t ret; local_irq_save(flags); - __netpoll_send_skb(np, skb); + ret = __netpoll_send_skb(np, skb); local_irq_restore(flags); + return ret; } EXPORT_SYMBOL(netpoll_send_skb); -- cgit v1.2.3 From f78ed2204db9fc35b545d693865bddbe0149aa1f Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Thu, 7 May 2020 09:32:21 -0700 Subject: netpoll: accept NULL np argument in netpoll_send_skb() netpoll_send_skb() callers seem to leak skb if the np pointer is NULL. While this should not happen, we can make the code more robust. Signed-off-by: Eric Dumazet Signed-off-by: David S. 
Miller --- drivers/net/macvlan.c | 5 ++--- include/linux/if_team.h | 5 +---- include/net/bonding.h | 5 +---- net/8021q/vlan_dev.c | 5 ++--- net/bridge/br_private.h | 5 +---- net/core/netpoll.c | 11 ++++++++--- net/dsa/slave.c | 5 ++--- 7 files changed, 17 insertions(+), 24 deletions(-) (limited to 'include/linux') diff --git a/drivers/net/macvlan.c b/drivers/net/macvlan.c index 34eb073cdd74..9a419d5102ce 100644 --- a/drivers/net/macvlan.c +++ b/drivers/net/macvlan.c @@ -542,12 +542,11 @@ xmit_world: static inline netdev_tx_t macvlan_netpoll_send_skb(struct macvlan_dev *vlan, struct sk_buff *skb) { #ifdef CONFIG_NET_POLL_CONTROLLER - if (vlan->netpoll) - netpoll_send_skb(vlan->netpoll, skb); + return netpoll_send_skb(vlan->netpoll, skb); #else BUG(); -#endif return NETDEV_TX_OK; +#endif } static netdev_tx_t macvlan_start_xmit(struct sk_buff *skb, diff --git a/include/linux/if_team.h b/include/linux/if_team.h index ec7e4bd07f82..537dc2b8c879 100644 --- a/include/linux/if_team.h +++ b/include/linux/if_team.h @@ -102,10 +102,7 @@ static inline bool team_port_dev_txable(const struct net_device *port_dev) static inline void team_netpoll_send_skb(struct team_port *port, struct sk_buff *skb) { - struct netpoll *np = port->np; - - if (np) - netpoll_send_skb(np, skb); + netpoll_send_skb(port->np, skb); } #else static inline void team_netpoll_send_skb(struct team_port *port, diff --git a/include/net/bonding.h b/include/net/bonding.h index 0b696da5c115..f211983cd52a 100644 --- a/include/net/bonding.h +++ b/include/net/bonding.h @@ -507,10 +507,7 @@ static inline unsigned long slave_last_rx(struct bonding *bond, static inline void bond_netpoll_send_skb(const struct slave *slave, struct sk_buff *skb) { - struct netpoll *np = slave->np; - - if (np) - netpoll_send_skb(np, skb); + netpoll_send_skb(slave->np, skb); } #else static inline void bond_netpoll_send_skb(const struct slave *slave, diff --git a/net/8021q/vlan_dev.c b/net/8021q/vlan_dev.c index 319220b2341d..f00bb57f0f60 100644 --- a/net/8021q/vlan_dev.c +++ b/net/8021q/vlan_dev.c @@ -88,12 +88,11 @@ static int vlan_dev_hard_header(struct sk_buff *skb, struct net_device *dev, static inline netdev_tx_t vlan_netpoll_send_skb(struct vlan_dev_priv *vlan, struct sk_buff *skb) { #ifdef CONFIG_NET_POLL_CONTROLLER - if (vlan->netpoll) - netpoll_send_skb(vlan->netpoll, skb); + return netpoll_send_skb(vlan->netpoll, skb); #else BUG(); -#endif return NETDEV_TX_OK; +#endif } static netdev_tx_t vlan_dev_hard_start_xmit(struct sk_buff *skb, diff --git a/net/bridge/br_private.h b/net/bridge/br_private.h index 78d3a951180d..4dc21e8f7e33 100644 --- a/net/bridge/br_private.h +++ b/net/bridge/br_private.h @@ -598,10 +598,7 @@ netdev_tx_t br_dev_xmit(struct sk_buff *skb, struct net_device *dev); static inline void br_netpoll_send_skb(const struct net_bridge_port *p, struct sk_buff *skb) { - struct netpoll *np = p->np; - - if (np) - netpoll_send_skb(np, skb); + netpoll_send_skb(p->np, skb); } int br_netpoll_enable(struct net_bridge_port *p); diff --git a/net/core/netpoll.c b/net/core/netpoll.c index 40d2753aa47d..093e90e52bc2 100644 --- a/net/core/netpoll.c +++ b/net/core/netpoll.c @@ -367,9 +367,14 @@ netdev_tx_t netpoll_send_skb(struct netpoll *np, struct sk_buff *skb) unsigned long flags; netdev_tx_t ret; - local_irq_save(flags); - ret = __netpoll_send_skb(np, skb); - local_irq_restore(flags); + if (unlikely(!np)) { + dev_kfree_skb_irq(skb); + ret = NET_XMIT_DROP; + } else { + local_irq_save(flags); + ret = __netpoll_send_skb(np, skb); + local_irq_restore(flags); + } 
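+	/* ret is NET_XMIT_DROP when np was NULL, otherwise the status from __netpoll_send_skb() */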
return ret; } EXPORT_SYMBOL(netpoll_send_skb); diff --git a/net/dsa/slave.c b/net/dsa/slave.c index dfb4282fc339..61b0de52040a 100644 --- a/net/dsa/slave.c +++ b/net/dsa/slave.c @@ -445,12 +445,11 @@ static inline netdev_tx_t dsa_slave_netpoll_send_skb(struct net_device *dev, #ifdef CONFIG_NET_POLL_CONTROLLER struct dsa_slave_priv *p = netdev_priv(dev); - if (p->netpoll) - netpoll_send_skb(p->netpoll, skb); + return netpoll_send_skb(p->netpoll, skb); #else BUG(); -#endif return NETDEV_TX_OK; +#endif } static void dsa_skb_tx_timestamp(struct dsa_slave_priv *p, -- cgit v1.2.3 From 0090c1edebf464f34629e14ae03d764cca7e0a3b Mon Sep 17 00:00:00 2001 From: "Gustavo A. R. Silva" Date: Thu, 7 May 2020 13:50:41 -0500 Subject: audit: Replace zero-length array with flexible-array The current codebase makes use of the zero-length array language extension to the C90 standard, but the preferred mechanism to declare variable-length types such as these is a flexible array member[1][2], introduced in C99: struct foo { int stuff; struct boo array[]; }; By making use of the mechanism above, we will get a compiler warning in case the flexible array does not occur last in the structure, which will help us prevent some kinds of undefined-behavior bugs from being inadvertently introduced[3] to the codebase from now on. Also, notice that dynamic memory allocations won't be affected by this change: "Flexible array members have incomplete type, and so the sizeof operator may not be applied. As a quirk of the original implementation of zero-length arrays, sizeof evaluates to zero."[1] sizeof(flexible-array-member) triggers a warning because flexible array members have incomplete type[1]. There are some instances of code in which the sizeof operator is erroneously applied to zero-length arrays, where the result is always zero. Such instances may be hiding some bugs. So, this work (flexible-array member conversions) will also help to get rid of those sorts of issues completely. This issue was found with the help of Coccinelle. [1] https://gcc.gnu.org/onlinedocs/gcc/Zero-Length.html [2] https://github.com/KSPP/linux/issues/21 [3] commit 76497732932f ("cxgb3/l2t: Fix undefined behaviour") Signed-off-by: Gustavo A. R. Silva Reviewed-by: Richard Guy Briggs Signed-off-by: Paul Moore --- include/linux/audit.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/audit.h b/include/linux/audit.h index 1a2351508211..3fcd9ee49734 100644 --- a/include/linux/audit.h +++ b/include/linux/audit.h @@ -19,7 +19,7 @@ struct audit_sig_info { uid_t uid; pid_t pid; - char ctx[0]; + char ctx[]; }; struct audit_buffer; -- cgit v1.2.3 From 6b0b0fa2bce61db790efc8070ae6e5696435b0a8 Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Sat, 2 May 2020 11:24:25 -0700 Subject: crypto: lib/sha1 - rename "sha" to "sha1" The library implementation of the SHA-1 compression function is confusingly called just "sha_transform()". Alongside it are some "SHA_" constants and "sha_init()". Presumably these are left over from a time when SHA just meant SHA-1. But now there are also SHA-2 and SHA-3, and moreover SHA-1 is considered insecure and thus shouldn't be used. Therefore, rename these functions and constants to make it very clear that they are for SHA-1. Also add a comment to make it clear that these shouldn't be used.
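To illustrate, a typical call site changes like this (a sketch of the pattern only; the real conversions are in the hunks below):

	u32 ws[SHA_WORKSPACE_WORDS];

	sha_init(digest);
	sha_transform(digest, data, ws);

becomes:

	u32 ws[SHA1_WORKSPACE_WORDS];

	sha1_init(digest);
	sha1_transform(digest, data, ws);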
For the extra-misleadingly named "SHA_MESSAGE_BYTES", rename it to SHA1_BLOCK_SIZE and define it to just '64' rather than '(512/8)' so that it matches the same definition in <crypto/sha.h>. This prepares for merging <linux/cryptohash.h> into <crypto/sha.h>. Signed-off-by: Eric Biggers Signed-off-by: Herbert Xu --- Documentation/security/siphash.rst | 2 +- crypto/sha1_generic.c | 4 ++-- drivers/char/random.c | 6 +++--- include/linux/cryptohash.h | 16 ++++++++++------ include/linux/filter.h | 2 +- kernel/bpf/core.c | 18 +++++++++--------- lib/sha1.c | 22 ++++++++++++---------- net/ipv6/addrconf.c | 10 +++++----- 8 files changed, 43 insertions(+), 37 deletions(-) (limited to 'include/linux') diff --git a/Documentation/security/siphash.rst b/Documentation/security/siphash.rst index 4eba68cdf0a1..bd9363025fcb 100644 --- a/Documentation/security/siphash.rst +++ b/Documentation/security/siphash.rst @@ -7,7 +7,7 @@ SipHash - a short input PRF SipHash is a cryptographically secure PRF -- a keyed hash function -- that performs very well for short inputs, hence the name. It was designed by cryptographers Daniel J. Bernstein and Jean-Philippe Aumasson. It is intended -as a replacement for some uses of: `jhash`, `md5_transform`, `sha_transform`, +as a replacement for some uses of: `jhash`, `md5_transform`, `sha1_transform`, and so forth. SipHash takes a secret key filled with randomly generated numbers and either diff --git a/crypto/sha1_generic.c b/crypto/sha1_generic.c index 7c57b844c382..a16d9787dcd2 100644 --- a/crypto/sha1_generic.c +++ b/crypto/sha1_generic.c @@ -31,10 +31,10 @@ EXPORT_SYMBOL_GPL(sha1_zero_message_hash); static void sha1_generic_block_fn(struct sha1_state *sst, u8 const *src, int blocks) { - u32 temp[SHA_WORKSPACE_WORDS]; + u32 temp[SHA1_WORKSPACE_WORDS]; while (blocks--) { - sha_transform(sst->state, src, temp); + sha1_transform(sst->state, src, temp); src += SHA1_BLOCK_SIZE; } memzero_explicit(temp, sizeof(temp)); diff --git a/drivers/char/random.c b/drivers/char/random.c index 0d10e31fd342..a19a8984741b 100644 --- a/drivers/char/random.c +++ b/drivers/char/random.c @@ -1397,14 +1397,14 @@ static void extract_buf(struct entropy_store *r, __u8 *out) __u32 w[5]; unsigned long l[LONGS(20)]; } hash; - __u32 workspace[SHA_WORKSPACE_WORDS]; + __u32 workspace[SHA1_WORKSPACE_WORDS]; unsigned long flags; /* * If we have an architectural hardware random number * generator, use it for SHA's initial vector */ - sha_init(hash.w); + sha1_init(hash.w); for (i = 0; i < LONGS(20); i++) { unsigned long v; if (!arch_get_random_long(&v)) @@ -1415,7 +1415,7 @@ static void extract_buf(struct entropy_store *r, __u8 *out) /* Generate a hash across the pool, 16 words (512 bits) at a time */ spin_lock_irqsave(&r->lock, flags); for (i = 0; i < r->poolinfo->poolwords; i += 16) - sha_transform(hash.w, (__u8 *)(r->pool + i), workspace); + sha1_transform(hash.w, (__u8 *)(r->pool + i), workspace); /* * We mix the hash back into the pool to prevent backtracking diff --git a/include/linux/cryptohash.h b/include/linux/cryptohash.h index f6ba4c3e60d7..c324ffca96e0 100644 --- a/include/linux/cryptohash.h +++ b/include/linux/cryptohash.h @@ -4,11 +4,15 @@ #include -#define SHA_DIGEST_WORDS 5 -#define SHA_MESSAGE_BYTES (512 /*bits*/ / 8) -#define SHA_WORKSPACE_WORDS 16 - -void sha_init(__u32 *buf); -void sha_transform(__u32 *digest, const char *data, __u32 *W); +/* + * An implementation of SHA-1's compression function. Don't use in new code!
+ * You shouldn't be using SHA-1, and even if you *have* to use SHA-1, this isn't + * the correct way to hash something with SHA-1 (use crypto_shash instead). + */ +#define SHA1_DIGEST_WORDS 5 +#define SHA1_BLOCK_SIZE 64 +#define SHA1_WORKSPACE_WORDS 16 +void sha1_init(__u32 *buf); +void sha1_transform(__u32 *digest, const char *data, __u32 *W); #endif diff --git a/include/linux/filter.h b/include/linux/filter.h index 9b5aa5c483cc..f42662adffe4 100644 --- a/include/linux/filter.h +++ b/include/linux/filter.h @@ -746,7 +746,7 @@ static inline u32 bpf_prog_insn_size(const struct bpf_prog *prog) static inline u32 bpf_prog_tag_scratch_size(const struct bpf_prog *prog) { return round_up(bpf_prog_insn_size(prog) + - sizeof(__be64) + 1, SHA_MESSAGE_BYTES); + sizeof(__be64) + 1, SHA1_BLOCK_SIZE); } static inline unsigned int bpf_prog_size(unsigned int proglen) diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c index 916f5132a984..14aa1f74dd10 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c @@ -262,10 +262,10 @@ void __bpf_prog_free(struct bpf_prog *fp) int bpf_prog_calc_tag(struct bpf_prog *fp) { - const u32 bits_offset = SHA_MESSAGE_BYTES - sizeof(__be64); + const u32 bits_offset = SHA1_BLOCK_SIZE - sizeof(__be64); u32 raw_size = bpf_prog_tag_scratch_size(fp); - u32 digest[SHA_DIGEST_WORDS]; - u32 ws[SHA_WORKSPACE_WORDS]; + u32 digest[SHA1_DIGEST_WORDS]; + u32 ws[SHA1_WORKSPACE_WORDS]; u32 i, bsize, psize, blocks; struct bpf_insn *dst; bool was_ld_map; @@ -277,7 +277,7 @@ int bpf_prog_calc_tag(struct bpf_prog *fp) if (!raw) return -ENOMEM; - sha_init(digest); + sha1_init(digest); memset(ws, 0, sizeof(ws)); /* We need to take out the map fd for the digest calculation @@ -308,8 +308,8 @@ int bpf_prog_calc_tag(struct bpf_prog *fp) memset(&raw[psize], 0, raw_size - psize); raw[psize++] = 0x80; - bsize = round_up(psize, SHA_MESSAGE_BYTES); - blocks = bsize / SHA_MESSAGE_BYTES; + bsize = round_up(psize, SHA1_BLOCK_SIZE); + blocks = bsize / SHA1_BLOCK_SIZE; todo = raw; if (bsize - psize >= sizeof(__be64)) { bits = (__be64 *)(todo + bsize - sizeof(__be64)); @@ -320,12 +320,12 @@ int bpf_prog_calc_tag(struct bpf_prog *fp) *bits = cpu_to_be64((psize - 1) << 3); while (blocks--) { - sha_transform(digest, todo, ws); - todo += SHA_MESSAGE_BYTES; + sha1_transform(digest, todo, ws); + todo += SHA1_BLOCK_SIZE; } result = (__force __be32 *)digest; - for (i = 0; i < SHA_DIGEST_WORDS; i++) + for (i = 0; i < SHA1_DIGEST_WORDS; i++) result[i] = cpu_to_be32(digest[i]); memcpy(fp->tag, result, sizeof(fp->tag)); diff --git a/lib/sha1.c b/lib/sha1.c index 1d96d2c02b82..b381e8cd4fe4 100644 --- a/lib/sha1.c +++ b/lib/sha1.c @@ -64,22 +64,24 @@ #define T_60_79(t, A, B, C, D, E) SHA_ROUND(t, SHA_MIX, (B^C^D) , 0xca62c1d6, A, B, C, D, E ) /** - * sha_transform - single block SHA1 transform + * sha1_transform - single block SHA1 transform (deprecated) * * @digest: 160 bit digest to update * @data: 512 bits of data to hash * @array: 16 words of workspace (see note) * - * This function generates a SHA1 digest for a single 512-bit block. - * Be warned, it does not handle padding and message digest, do not - * confuse it with the full FIPS 180-1 digest algorithm for variable - * length messages. + * This function executes SHA-1's internal compression function. It updates the + * 160-bit internal state (@digest) with a single 512-bit data block (@data). + * + * Don't use this function. SHA-1 is no longer considered secure. 
And even if + you do have to use SHA-1, this isn't the correct way to hash something with + SHA-1 as this doesn't handle padding and finalization. * * Note: If the hash is security sensitive, the caller should be sure * to clear the workspace. This is left to the caller to avoid * unnecessary clears between chained hashing operations. */ -void sha_transform(__u32 *digest, const char *data, __u32 *array) +void sha1_transform(__u32 *digest, const char *data, __u32 *array) { __u32 A, B, C, D, E; @@ -185,13 +187,13 @@ void sha_transform(__u32 *digest, const char *data, __u32 *array) digest[3] += D; digest[4] += E; } -EXPORT_SYMBOL(sha_transform); +EXPORT_SYMBOL(sha1_transform); /** - * sha_init - initialize the vectors for a SHA1 digest + * sha1_init - initialize the vectors for a SHA1 digest * @buf: vector to initialize */ -void sha_init(__u32 *buf) +void sha1_init(__u32 *buf) { buf[0] = 0x67452301; buf[1] = 0xefcdab89; @@ -199,4 +201,4 @@ void sha_init(__u32 *buf) buf[3] = 0x10325476; buf[4] = 0xc3d2e1f0; } -EXPORT_SYMBOL(sha_init); +EXPORT_SYMBOL(sha1_init); diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c index 24e319dfb510..f131cedf5ba6 100644 --- a/net/ipv6/addrconf.c +++ b/net/ipv6/addrconf.c @@ -3222,11 +3222,11 @@ static int ipv6_generate_stable_address(struct in6_addr *address, const struct inet6_dev *idev) { static DEFINE_SPINLOCK(lock); - static __u32 digest[SHA_DIGEST_WORDS]; - static __u32 workspace[SHA_WORKSPACE_WORDS]; + static __u32 digest[SHA1_DIGEST_WORDS]; + static __u32 workspace[SHA1_WORKSPACE_WORDS]; static union { - char __data[SHA_MESSAGE_BYTES]; + char __data[SHA1_BLOCK_SIZE]; struct { struct in6_addr secret; __be32 prefix[2]; @@ -3251,7 +3251,7 @@ static int ipv6_generate_stable_address(struct in6_addr *address, retry: spin_lock_bh(&lock); - sha_init(digest); + sha1_init(digest); memset(&data, 0, sizeof(data)); memset(workspace, 0, sizeof(workspace)); memcpy(data.hwaddr, idev->dev->perm_addr, idev->dev->addr_len); @@ -3260,7 +3260,7 @@ retry: data.secret = secret; data.dad_count = dad_count; - sha_transform(digest, data.__data, workspace); + sha1_transform(digest, data.__data, workspace); temp = *address; temp.s6_addr32[2] = (__force __be32)digest[0]; -- cgit v1.2.3 From 228c4f265c6eb60eaa4ed0edb3bf7c113173576c Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Sat, 2 May 2020 11:24:27 -0700 Subject: crypto: lib/sha1 - fold linux/cryptohash.h into crypto/sha.h <linux/cryptohash.h> sounds very generic and important, like it's the header to include if you're doing cryptographic hashing in the kernel. But actually it only includes the library implementation of the SHA-1 compression function (not even the full SHA-1). This should basically never be used anymore; SHA-1 is no longer considered secure, and there are much better ways to do cryptographic hashing in the kernel. Remove this header and fold it into <crypto/sha.h>, which already contains constants and functions for SHA-1 (along with SHA-2).
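For reference, new code that needs cryptographic hashing should go through the crypto API instead. A minimal sketch of a one-shot digest (illustrative only - the function names do_sha256/example_sha256 are hypothetical and error handling is abbreviated):

	#include <crypto/hash.h>
	#include <crypto/sha.h>
	#include <linux/err.h>

	/* Illustrative sketch, not part of this patch. */
	static int do_sha256(struct crypto_shash *tfm, const u8 *data,
			     unsigned int len, u8 *out)
	{
		SHASH_DESC_ON_STACK(desc, tfm);

		desc->tfm = tfm;
		return crypto_shash_digest(desc, data, len, out);
	}

	static int example_sha256(const u8 *data, unsigned int len,
				  u8 out[SHA256_DIGEST_SIZE])
	{
		struct crypto_shash *tfm;
		int err;

		tfm = crypto_alloc_shash("sha256", 0, 0);
		if (IS_ERR(tfm))
			return PTR_ERR(tfm);

		err = do_sha256(tfm, data, len, out);
		crypto_free_shash(tfm);
		return err;
	}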
Signed-off-by: Eric Biggers Signed-off-by: Herbert Xu --- crypto/sha1_generic.c | 1 - drivers/char/random.c | 2 +- include/crypto/sha.h | 10 ++++++++++ include/linux/cryptohash.h | 18 ------------------ include/linux/filter.h | 2 +- lib/sha1.c | 2 +- 6 files changed, 13 insertions(+), 22 deletions(-) delete mode 100644 include/linux/cryptohash.h (limited to 'include/linux') diff --git a/crypto/sha1_generic.c b/crypto/sha1_generic.c index a16d9787dcd2..1d43472fecbd 100644 --- a/crypto/sha1_generic.c +++ b/crypto/sha1_generic.c @@ -15,7 +15,6 @@ #include #include #include -#include #include #include #include diff --git a/drivers/char/random.c b/drivers/char/random.c index a19a8984741b..cae02b2a871c 100644 --- a/drivers/char/random.c +++ b/drivers/char/random.c @@ -327,7 +326,6 @@ #include #include #include -#include #include #include #include @@ -337,6 +336,7 @@ #include #include #include +#include #include #include diff --git a/include/crypto/sha.h b/include/crypto/sha.h index 67aec7245cb7..10753ff71d46 100644 --- a/include/crypto/sha.h +++ b/include/crypto/sha.h @@ -113,6 +113,16 @@ extern int crypto_sha512_update(struct shash_desc *desc, const u8 *data, extern int crypto_sha512_finup(struct shash_desc *desc, const u8 *data, unsigned int len, u8 *hash); +/* + * An implementation of SHA-1's compression function. Don't use in new code! + * You shouldn't be using SHA-1, and even if you *have* to use SHA-1, this isn't + * the correct way to hash something with SHA-1 (use crypto_shash instead). + */ +#define SHA1_DIGEST_WORDS (SHA1_DIGEST_SIZE / 4) +#define SHA1_WORKSPACE_WORDS 16 +void sha1_init(__u32 *buf); +void sha1_transform(__u32 *digest, const char *data, __u32 *W); + /* * Stand-alone implementation of the SHA256 algorithm. It is designed to * have as little dependencies as possible so it can be used in the diff --git a/include/linux/cryptohash.h b/include/linux/cryptohash.h deleted file mode 100644 index c324ffca96e0..000000000000 --- a/include/linux/cryptohash.h +++ /dev/null @@ -1,18 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -#ifndef __CRYPTOHASH_H -#define __CRYPTOHASH_H - -#include - -/* - * An implementation of SHA-1's compression function. Don't use in new code! - * You shouldn't be using SHA-1, and even if you *have* to use SHA-1, this isn't - * the correct way to hash something with SHA-1 (use crypto_shash instead). - */ -#define SHA1_DIGEST_WORDS 5 -#define SHA1_BLOCK_SIZE 64 -#define SHA1_WORKSPACE_WORDS 16 -void sha1_init(__u32 *buf); -void sha1_transform(__u32 *digest, const char *data, __u32 *W); - -#endif diff --git a/include/linux/filter.h b/include/linux/filter.h index f42662adffe4..ec45fd7992c9 100644 --- a/include/linux/filter.h +++ b/include/linux/filter.h @@ -16,11 +16,11 @@ #include #include #include -#include #include #include #include #include +#include #include diff --git a/lib/sha1.c b/lib/sha1.c index b381e8cd4fe4..49257a915bb6 100644 --- a/lib/sha1.c +++ b/lib/sha1.c @@ -9,7 +9,7 @@ #include #include #include -#include +#include #include /* -- cgit v1.2.3 From d2218d4e4a65f25bd2d38489567012c7db50233c Mon Sep 17 00:00:00 2001 From: Matti Vaittinen Date: Fri, 8 May 2020 18:39:35 +0300 Subject: lib: add linear ranges helpers Many devices have control registers which control some measurable property. Often a register contains a control field such that a change in this field causes a linear change in the controlled property. It is common that the user wants to give 'meaningful' control values and the driver needs to convert them to register field values.
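For example (with illustrative numbers): if selector 0 encodes 700000 uV and each selector step adds 6250 uV, then a request for 712500 uV maps to the register field value

	selector = (712500 - 700000) / 6250 = 2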
Even more often the user wants to 'see' the currently set value - again in meaningful units - and the driver needs to convert the values it reads from the register to these meaningful units. Examples of this include: - regulators, voltage/current configurations - power, voltage/current configurations - clk(?) NCOs and maybe others I can't think of right now. Provide a linear_range helper which can do the conversion from a user value to a register value 'selector'. The idea here is borrowed from the regulator framework, and patches refactoring the regulator helpers to use this follow. The current implementation does not support inversely proportional ranges, though supporting them might be useful. Signed-off-by: Matti Vaittinen Reviewed-by: Mark Brown Reviewed-by: Linus Walleij Reviewed-by: Andy Shevchenko Link: https://lore.kernel.org/r/59259bc475e0c800eb4bb163f02528c7c01f7b3a.1588944082.git.matti.vaittinen@fi.rohmeurope.com Signed-off-by: Mark Brown --- include/linux/linear_range.h | 48 +++++++++ lib/Kconfig | 3 + lib/Makefile | 1 + lib/linear_ranges.c | 241 +++++++++++++++++++++++++++++++++++++++++++ 4 files changed, 293 insertions(+) create mode 100644 include/linux/linear_range.h create mode 100644 lib/linear_ranges.c (limited to 'include/linux') diff --git a/include/linux/linear_range.h b/include/linux/linear_range.h new file mode 100644 index 000000000000..17b5943727d5 --- /dev/null +++ b/include/linux/linear_range.h @@ -0,0 +1,48 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* Copyright (C) 2020 ROHM Semiconductors */ + +#ifndef LINEAR_RANGE_H +#define LINEAR_RANGE_H + +#include + +/** + * struct linear_range - table of selector - value pairs + * + * Define a lookup table for a range of values. Intended to help when looking + * for a register value matching a certain physical measure (like voltage). + * Usable when an increment of one in the register always results in a constant + * increment of the physical measure (like voltage). + * + * @min: Lowest value in range + * @min_sel: Lowest selector for range + * @max_sel: Highest selector for range + * @step: Value step size + */ +struct linear_range { + unsigned int min; + unsigned int min_sel; + unsigned int max_sel; + unsigned int step; +}; + +unsigned int linear_range_values_in_range(const struct linear_range *r); +unsigned int linear_range_values_in_range_array(const struct linear_range *r, + int ranges); +unsigned int linear_range_get_max_value(const struct linear_range *r); + +int linear_range_get_value(const struct linear_range *r, unsigned int selector, + unsigned int *val); +int linear_range_get_value_array(const struct linear_range *r, int ranges, + unsigned int selector, unsigned int *val); +int linear_range_get_selector_low(const struct linear_range *r, + unsigned int val, unsigned int *selector, + bool *found); +int linear_range_get_selector_high(const struct linear_range *r, + unsigned int val, unsigned int *selector, + bool *found); +int linear_range_get_selector_low_array(const struct linear_range *r, + int ranges, unsigned int val, + unsigned int *selector, bool *found); + +#endif diff --git a/lib/Kconfig b/lib/Kconfig index 5d53f9609c25..8ec05335426c 100644 --- a/lib/Kconfig +++ b/lib/Kconfig @@ -19,6 +19,9 @@ config RAID6_PQ_BENCHMARK Benchmark all available RAID6 PQ functions on init and choose the fastest one.
+config LINEAR_RANGES + tristate + config PACKING bool "Generic bitfield packing and unpacking" default n diff --git a/lib/Makefile b/lib/Makefile index 685aee60de1d..20b9cfdcad69 100644 --- a/lib/Makefile +++ b/lib/Makefile @@ -125,6 +125,7 @@ obj-$(CONFIG_DEBUG_LIST) += list_debug.o obj-$(CONFIG_DEBUG_OBJECTS) += debugobjects.o obj-$(CONFIG_BITREVERSE) += bitrev.o +obj-$(CONFIG_LINEAR_RANGES) += linear_ranges.o obj-$(CONFIG_PACKING) += packing.o obj-$(CONFIG_CRC_CCITT) += crc-ccitt.o obj-$(CONFIG_CRC16) += crc16.o diff --git a/lib/linear_ranges.c b/lib/linear_ranges.c new file mode 100644 index 000000000000..d1336c75ccd7 --- /dev/null +++ b/lib/linear_ranges.c @@ -0,0 +1,241 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * helpers to map values in a linear range to range index + * + * Original idea borrowed from regulator framework + * + * It might be useful to also support inversely proportional ranges. + * Copyright 2020 ROHM Semiconductors + */ + +#include +#include +#include +#include + +/** + * linear_range_values_in_range - return the number of values in a range + * @r: pointer to linear range where values are counted + * + * Compute the number of values in the range pointed to by @r. Note, the + * values can all be equal - a range with selectors 0,...,2 and step 0 still + * contains 3 values even though they are all equal. + * + * Return: the number of values in the range pointed to by @r + */ +unsigned int linear_range_values_in_range(const struct linear_range *r) +{ + if (!r) + return 0; + return r->max_sel - r->min_sel + 1; +} +EXPORT_SYMBOL_GPL(linear_range_values_in_range); + +/** + * linear_range_values_in_range_array - return the number of values in ranges + * @r: pointer to array of linear ranges where values are counted + * @ranges: number of ranges included in the computation. + * + * Compute the number of values in the ranges pointed to by @r. Note, the + * values can all be equal - a range with selectors 0,...,2 and step 0 still + * contains 3 values even though they are all equal. + * + * Return: the number of values in the first @ranges ranges pointed to by @r + */ +unsigned int linear_range_values_in_range_array(const struct linear_range *r, + int ranges) +{ + int i, values_in_range = 0; + + for (i = 0; i < ranges; i++) { + int values; + + values = linear_range_values_in_range(&r[i]); + if (!values) + return values; + + values_in_range += values; + } + return values_in_range; +} +EXPORT_SYMBOL_GPL(linear_range_values_in_range_array); + +/** + * linear_range_get_max_value - return the largest value in a range + * @r: pointer to linear range where the value is looked up + * + * Return: the largest value in the given range + */ +unsigned int linear_range_get_max_value(const struct linear_range *r) +{ + return r->min + (r->max_sel - r->min_sel) * r->step; +} +EXPORT_SYMBOL_GPL(linear_range_get_max_value); + +/** + * linear_range_get_value - fetch a value from given range + * @r: pointer to linear range where the value is looked up + * @selector: selector for which the value is searched + * @val: address where the found value is stored + * + * Search the given range for the value which matches the given selector. + * + * Return: 0 on success, -EINVAL if the given selector is not in the + * range.
+ */ +int linear_range_get_value(const struct linear_range *r, unsigned int selector, + unsigned int *val) +{ + if (r->min_sel > selector || r->max_sel < selector) + return -EINVAL; + + *val = r->min + (selector - r->min_sel) * r->step; + + return 0; +} +EXPORT_SYMBOL_GPL(linear_range_get_value); + +/** + * linear_range_get_value_array - fetch a value from array of ranges + * @r: pointer to array of linear ranges where the value is looked up + * @ranges: number of ranges in the array + * @selector: selector for which the value is searched + * @val: address where the found value is stored + * + * Search through an array of ranges for the value which matches the given + * selector. + * + * Return: 0 on success, -EINVAL if the given selector is not found in any of + * the ranges. + */ +int linear_range_get_value_array(const struct linear_range *r, int ranges, + unsigned int selector, unsigned int *val) +{ + int i; + + for (i = 0; i < ranges; i++) + if (r[i].min_sel <= selector && r[i].max_sel >= selector) + return linear_range_get_value(&r[i], selector, val); + + return -EINVAL; +} +EXPORT_SYMBOL_GPL(linear_range_get_value_array); + +/** + * linear_range_get_selector_low - return linear range selector for value + * @r: pointer to linear range where the selector is looked up + * @val: value for which the selector is searched + * @selector: address where the found selector value is stored + * @found: flag to indicate that the given value was in the range + * + * Return the selector whose range value is the closest match for the given + * input value. A value matches if it is equal to or smaller than the given + * value. If the given value is in the range, then @found is set true. + * + * Return: 0 on success, -EINVAL if the range is invalid or does not contain + * a value smaller than or equal to the given value + */ +int linear_range_get_selector_low(const struct linear_range *r, + unsigned int val, unsigned int *selector, + bool *found) +{ + *found = false; + + if (r->min > val) + return -EINVAL; + + if (linear_range_get_max_value(r) < val) { + *selector = r->max_sel; + return 0; + } + + *found = true; + + if (r->step == 0) + *selector = r->min_sel; + else + *selector = (val - r->min) / r->step + r->min_sel; + + return 0; +} +EXPORT_SYMBOL_GPL(linear_range_get_selector_low); + +/** + * linear_range_get_selector_low_array - return linear range selector for value + * @r: pointer to array of linear ranges where the selector is looked up + * @ranges: number of ranges to scan from the array + * @val: value for which the selector is searched + * @selector: address where the found selector value is stored + * @found: flag to indicate that the given value was in the range + * + * Scan the array of ranges for a selector whose range value matches the given + * input value. A value matches if it is equal to or smaller than the given + * value. If the given value is found to be in a range, scanning stops and + * @found is set true. If a range with values smaller than the given value is + * found, but that range's maximum is still smaller than the given value, the + * range's biggest selector is written to @selector, scanning continues, + * and @found is set to false.
+ * + * Return: 0 on success, -EINVAL if the range array is invalid or does not + * contain a range with a value smaller than or equal to the given value + */ +int linear_range_get_selector_low_array(const struct linear_range *r, + int ranges, unsigned int val, + unsigned int *selector, bool *found) +{ + int i; + int ret = -EINVAL; + + for (i = 0; i < ranges; i++) { + int tmpret; + + tmpret = linear_range_get_selector_low(&r[i], val, selector, + found); + if (!tmpret) + ret = 0; + + if (*found) + break; + } + + return ret; +} +EXPORT_SYMBOL_GPL(linear_range_get_selector_low_array); + +/** + * linear_range_get_selector_high - return linear range selector for value + * @r: pointer to linear range where the selector is looked up + * @val: value for which the selector is searched + * @selector: address where the found selector value is stored + * @found: flag to indicate that the given value was in the range + * + * Return the selector whose range value is the closest match for the given + * input value. A value matches if it is equal to or higher than the given + * value. If the given value is in the range, then @found is set true. + * + * Return: 0 on success, -EINVAL if the range is invalid or does not contain + * a value greater than or equal to the given value + */ +int linear_range_get_selector_high(const struct linear_range *r, + unsigned int val, unsigned int *selector, + bool *found) +{ + *found = false; + + if (linear_range_get_max_value(r) < val) + return -EINVAL; + + if (r->min > val) { + *selector = r->min_sel; + return 0; + } + + *found = true; + + if (r->step == 0) + *selector = r->max_sel; + else + *selector = DIV_ROUND_UP(val - r->min, r->step) + r->min_sel; + + return 0; +} +EXPORT_SYMBOL_GPL(linear_range_get_selector_high); -- cgit v1.2.3 From 60ab7f4153b6af461c90d572c31104086b44639f Mon Sep 17 00:00:00 2001 From: Matti Vaittinen Date: Fri, 8 May 2020 18:43:36 +0300 Subject: regulator: use linear_ranges helper Change the regulator helpers to use common linear_ranges code.
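The conversion is mostly mechanical: the range tables keep their REGULATOR_LINEAR_RANGE() initializers but become arrays of the generic struct linear_range (fields min, min_sel, max_sel, step), and the helpers delegate the selector/value math to the lib code. As an illustrative sketch (values taken from one of the tables converted below):

	static const struct linear_range buck_volt_range1[] = {
		/* 700000 uV at selector 0, plus 6250 uV per step up to 0x7f */
		REGULATOR_LINEAR_RANGE(700000, 0, 0x7f, 6250),
	};

	unsigned int uV;
	int ret;

	/* selector 2 -> 700000 + 2 * 6250 = 712500 uV */
	ret = linear_range_get_value(&buck_volt_range1[0], 2, &uV);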
Signed-off-by: Matti Vaittinen Reviewed-by: Mark Brown Acked-by: Charles Keepax Acked-by: Adam Thomson Link: https://lore.kernel.org/r/64f01d5e381b8631a271616b7790f9d5640974fb.1588944082.git.matti.vaittinen@fi.rohmeurope.com Signed-off-by: Mark Brown --- drivers/regulator/88pg86x.c | 4 +- drivers/regulator/88pm800-regulator.c | 4 +- drivers/regulator/Kconfig | 1 + drivers/regulator/act8865-regulator.c | 4 +- drivers/regulator/act8945a-regulator.c | 2 +- drivers/regulator/arizona-ldo1.c | 2 +- drivers/regulator/arizona-micsupp.c | 4 +- drivers/regulator/as3711-regulator.c | 6 +- drivers/regulator/as3722-regulator.c | 4 +- drivers/regulator/axp20x-regulator.c | 16 ++-- drivers/regulator/bcm590xx-regulator.c | 8 +- drivers/regulator/bd70528-regulator.c | 8 +- drivers/regulator/bd71828-regulator.c | 10 +-- drivers/regulator/bd718x7-regulator.c | 26 +++---- drivers/regulator/da903x.c | 2 +- drivers/regulator/helpers.c | 130 ++++++++++++++------------------ drivers/regulator/hi6421-regulator.c | 4 +- drivers/regulator/lochnagar-regulator.c | 4 +- drivers/regulator/lp873x-regulator.c | 4 +- drivers/regulator/lp87565-regulator.c | 2 +- drivers/regulator/lp8788-buck.c | 2 +- drivers/regulator/max77650-regulator.c | 2 +- drivers/regulator/mcp16502.c | 4 +- drivers/regulator/mp8859.c | 2 +- drivers/regulator/mt6323-regulator.c | 6 +- drivers/regulator/mt6358-regulator.c | 8 +- drivers/regulator/mt6380-regulator.c | 6 +- drivers/regulator/mt6397-regulator.c | 6 +- drivers/regulator/palmas-regulator.c | 4 +- drivers/regulator/qcom-rpmh-regulator.c | 2 +- drivers/regulator/qcom_rpm-regulator.c | 14 ++-- drivers/regulator/qcom_smd-regulator.c | 78 +++++++++---------- drivers/regulator/rk808-regulator.c | 10 +-- drivers/regulator/s2mps11.c | 14 ++-- drivers/regulator/sky81452-regulator.c | 2 +- drivers/regulator/stpmic1_regulator.c | 18 ++--- drivers/regulator/tps65086-regulator.c | 10 +-- drivers/regulator/tps65217-regulator.c | 4 +- drivers/regulator/tps65218-regulator.c | 6 +- drivers/regulator/tps65912-regulator.c | 4 +- drivers/regulator/twl-regulator.c | 4 +- drivers/regulator/twl6030-regulator.c | 2 +- drivers/regulator/wm831x-dcdc.c | 2 +- drivers/regulator/wm831x-ldo.c | 4 +- drivers/regulator/wm8350-regulator.c | 2 +- drivers/regulator/wm8400-regulator.c | 2 +- include/linux/regulator/driver.h | 27 ++----- 47 files changed, 229 insertions(+), 261 deletions(-) (limited to 'include/linux') diff --git a/drivers/regulator/88pg86x.c b/drivers/regulator/88pg86x.c index d5ef55c81185..71cfa2c5de5e 100644 --- a/drivers/regulator/88pg86x.c +++ b/drivers/regulator/88pg86x.c @@ -11,13 +11,13 @@ static const struct regulator_ops pg86x_ops = { .list_voltage = regulator_list_voltage_linear_range, }; -static const struct regulator_linear_range pg86x_buck1_ranges[] = { +static const struct linear_range pg86x_buck1_ranges[] = { REGULATOR_LINEAR_RANGE( 0, 0, 10, 0), REGULATOR_LINEAR_RANGE(1000000, 11, 34, 25000), REGULATOR_LINEAR_RANGE(1600000, 35, 47, 50000), }; -static const struct regulator_linear_range pg86x_buck2_ranges[] = { +static const struct linear_range pg86x_buck2_ranges[] = { REGULATOR_LINEAR_RANGE( 0, 0, 15, 0), REGULATOR_LINEAR_RANGE(1000000, 16, 39, 25000), REGULATOR_LINEAR_RANGE(1600000, 40, 52, 50000), diff --git a/drivers/regulator/88pm800-regulator.c b/drivers/regulator/88pm800-regulator.c index 69ae25886181..d08ee81ed1ac 100644 --- a/drivers/regulator/88pm800-regulator.c +++ b/drivers/regulator/88pm800-regulator.c @@ -134,13 +134,13 @@ struct pm800_regulator_info { } /* Ranges are sorted in 
ascending order. */ -static const struct regulator_linear_range buck1_volt_range[] = { +static const struct linear_range buck1_volt_range[] = { REGULATOR_LINEAR_RANGE(600000, 0, 0x4f, 12500), REGULATOR_LINEAR_RANGE(1600000, 0x50, 0x54, 50000), }; /* BUCK 2~5 have same ranges. */ -static const struct regulator_linear_range buck2_5_volt_range[] = { +static const struct linear_range buck2_5_volt_range[] = { REGULATOR_LINEAR_RANGE(600000, 0, 0x4f, 12500), REGULATOR_LINEAR_RANGE(1600000, 0x50, 0x72, 50000), }; diff --git a/drivers/regulator/Kconfig b/drivers/regulator/Kconfig index f4b72cb098ef..f4447f5d940f 100644 --- a/drivers/regulator/Kconfig +++ b/drivers/regulator/Kconfig @@ -1,6 +1,7 @@ # SPDX-License-Identifier: GPL-2.0-only menuconfig REGULATOR bool "Voltage and Current Regulator Support" + select LINEAR_RANGES help Generic Voltage and Current Regulator support. diff --git a/drivers/regulator/act8865-regulator.c b/drivers/regulator/act8865-regulator.c index 0fa97f934df4..19b9742c9ecc 100644 --- a/drivers/regulator/act8865-regulator.c +++ b/drivers/regulator/act8865-regulator.c @@ -220,13 +220,13 @@ static const struct regmap_config act8865_regmap_config = { .val_bits = 8, }; -static const struct regulator_linear_range act8865_voltage_ranges[] = { +static const struct linear_range act8865_voltage_ranges[] = { REGULATOR_LINEAR_RANGE(600000, 0, 23, 25000), REGULATOR_LINEAR_RANGE(1200000, 24, 47, 50000), REGULATOR_LINEAR_RANGE(2400000, 48, 63, 100000), }; -static const struct regulator_linear_range act8600_sudcdc_voltage_ranges[] = { +static const struct linear_range act8600_sudcdc_voltage_ranges[] = { REGULATOR_LINEAR_RANGE(3000000, 0, 63, 0), REGULATOR_LINEAR_RANGE(3000000, 64, 159, 100000), REGULATOR_LINEAR_RANGE(12600000, 160, 191, 200000), diff --git a/drivers/regulator/act8945a-regulator.c b/drivers/regulator/act8945a-regulator.c index d2f804dbc785..6a62f946ccae 100644 --- a/drivers/regulator/act8945a-regulator.c +++ b/drivers/regulator/act8945a-regulator.c @@ -73,7 +73,7 @@ struct act8945a_pmic { u32 op_mode[ACT8945A_ID_MAX]; }; -static const struct regulator_linear_range act8945a_voltage_ranges[] = { +static const struct linear_range act8945a_voltage_ranges[] = { REGULATOR_LINEAR_RANGE(600000, 0, 23, 25000), REGULATOR_LINEAR_RANGE(1200000, 24, 47, 50000), REGULATOR_LINEAR_RANGE(2400000, 48, 63, 100000), diff --git a/drivers/regulator/arizona-ldo1.c b/drivers/regulator/arizona-ldo1.c index 1a3d7b720f5e..ade0bef4569d 100644 --- a/drivers/regulator/arizona-ldo1.c +++ b/drivers/regulator/arizona-ldo1.c @@ -87,7 +87,7 @@ static const struct regulator_ops arizona_ldo1_hc_ops = { .set_bypass = regulator_set_bypass_regmap, }; -static const struct regulator_linear_range arizona_ldo1_hc_ranges[] = { +static const struct linear_range arizona_ldo1_hc_ranges[] = { REGULATOR_LINEAR_RANGE(900000, 0, 0x6, 50000), REGULATOR_LINEAR_RANGE(1800000, 0x7, 0x7, 0), }; diff --git a/drivers/regulator/arizona-micsupp.c b/drivers/regulator/arizona-micsupp.c index ae1a5de3e57d..f6cfd3f6f0dd 100644 --- a/drivers/regulator/arizona-micsupp.c +++ b/drivers/regulator/arizona-micsupp.c @@ -125,7 +125,7 @@ static const struct regulator_ops arizona_micsupp_ops = { .set_bypass = arizona_micsupp_set_bypass, }; -static const struct regulator_linear_range arizona_micsupp_ranges[] = { +static const struct linear_range arizona_micsupp_ranges[] = { REGULATOR_LINEAR_RANGE(1700000, 0, 0x1e, 50000), REGULATOR_LINEAR_RANGE(3300000, 0x1f, 0x1f, 0), }; @@ -152,7 +152,7 @@ static const struct regulator_desc arizona_micsupp = { 
.owner = THIS_MODULE, }; -static const struct regulator_linear_range arizona_micsupp_ext_ranges[] = { +static const struct linear_range arizona_micsupp_ext_ranges[] = { REGULATOR_LINEAR_RANGE(900000, 0, 0x14, 25000), REGULATOR_LINEAR_RANGE(1500000, 0x15, 0x27, 100000), }; diff --git a/drivers/regulator/as3711-regulator.c b/drivers/regulator/as3711-regulator.c index ece88103f2fd..b6b9206969ae 100644 --- a/drivers/regulator/as3711-regulator.c +++ b/drivers/regulator/as3711-regulator.c @@ -103,18 +103,18 @@ static const struct regulator_ops as3711_dldo_ops = { .map_voltage = regulator_map_voltage_linear_range, }; -static const struct regulator_linear_range as3711_sd_ranges[] = { +static const struct linear_range as3711_sd_ranges[] = { REGULATOR_LINEAR_RANGE(612500, 0x1, 0x40, 12500), REGULATOR_LINEAR_RANGE(1425000, 0x41, 0x70, 25000), REGULATOR_LINEAR_RANGE(2650000, 0x71, 0x7f, 50000), }; -static const struct regulator_linear_range as3711_aldo_ranges[] = { +static const struct linear_range as3711_aldo_ranges[] = { REGULATOR_LINEAR_RANGE(1200000, 0, 0xf, 50000), REGULATOR_LINEAR_RANGE(1800000, 0x10, 0x1f, 100000), }; -static const struct regulator_linear_range as3711_dldo_ranges[] = { +static const struct linear_range as3711_dldo_ranges[] = { REGULATOR_LINEAR_RANGE(900000, 0, 0x10, 50000), REGULATOR_LINEAR_RANGE(1750000, 0x20, 0x3f, 50000), }; diff --git a/drivers/regulator/as3722-regulator.c b/drivers/regulator/as3722-regulator.c index bd5d0bacb08d..33ca197860b3 100644 --- a/drivers/regulator/as3722-regulator.c +++ b/drivers/regulator/as3722-regulator.c @@ -389,7 +389,7 @@ static const struct regulator_ops as3722_ldo6_extcntrl_ops = { .set_bypass = regulator_set_bypass_regmap, }; -static const struct regulator_linear_range as3722_ldo_ranges[] = { +static const struct linear_range as3722_ldo_ranges[] = { REGULATOR_LINEAR_RANGE(0, 0x00, 0x00, 0), REGULATOR_LINEAR_RANGE(825000, 0x01, 0x24, 25000), REGULATOR_LINEAR_RANGE(1725000, 0x40, 0x7F, 25000), @@ -487,7 +487,7 @@ static bool as3722_sd0_is_low_voltage(struct as3722_regulators *as3722_regs) return false; } -static const struct regulator_linear_range as3722_sd2345_ranges[] = { +static const struct linear_range as3722_sd2345_ranges[] = { REGULATOR_LINEAR_RANGE(0, 0x00, 0x00, 0), REGULATOR_LINEAR_RANGE(612500, 0x01, 0x40, 12500), REGULATOR_LINEAR_RANGE(1425000, 0x41, 0x70, 25000), diff --git a/drivers/regulator/axp20x-regulator.c b/drivers/regulator/axp20x-regulator.c index 1e6eb5b1f8d8..fbc95cadaf53 100644 --- a/drivers/regulator/axp20x-regulator.c +++ b/drivers/regulator/axp20x-regulator.c @@ -510,7 +510,7 @@ static const struct regulator_ops axp20x_ops_sw = { .is_enabled = regulator_is_enabled_regmap, }; -static const struct regulator_linear_range axp20x_ldo4_ranges[] = { +static const struct linear_range axp20x_ldo4_ranges[] = { REGULATOR_LINEAR_RANGE(1250000, AXP20X_LDO4_V_OUT_1250mV_START, AXP20X_LDO4_V_OUT_1250mV_END, @@ -638,7 +638,7 @@ static const struct regulator_desc axp22x_drivevbus_regulator = { }; /* DCDC ranges shared with AXP813 */ -static const struct regulator_linear_range axp803_dcdc234_ranges[] = { +static const struct linear_range axp803_dcdc234_ranges[] = { REGULATOR_LINEAR_RANGE(500000, AXP803_DCDC234_500mV_START, AXP803_DCDC234_500mV_END, @@ -649,7 +649,7 @@ static const struct regulator_linear_range axp803_dcdc234_ranges[] = { 20000), }; -static const struct regulator_linear_range axp803_dcdc5_ranges[] = { +static const struct linear_range axp803_dcdc5_ranges[] = { REGULATOR_LINEAR_RANGE(800000, 
AXP803_DCDC5_800mV_START, AXP803_DCDC5_800mV_END, @@ -660,7 +660,7 @@ static const struct regulator_linear_range axp803_dcdc5_ranges[] = { 20000), }; -static const struct regulator_linear_range axp803_dcdc6_ranges[] = { +static const struct linear_range axp803_dcdc6_ranges[] = { REGULATOR_LINEAR_RANGE(600000, AXP803_DCDC6_600mV_START, AXP803_DCDC6_600mV_END, @@ -672,7 +672,7 @@ static const struct regulator_linear_range axp803_dcdc6_ranges[] = { }; /* AXP806's CLDO2 and AXP809's DLDO1 share the same range */ -static const struct regulator_linear_range axp803_dldo2_ranges[] = { +static const struct linear_range axp803_dldo2_ranges[] = { REGULATOR_LINEAR_RANGE(700000, AXP803_DLDO2_700mV_START, AXP803_DLDO2_700mV_END, @@ -758,7 +758,7 @@ static const struct regulator_desc axp803_regulators[] = { AXP_DESC_FIXED(AXP803, RTC_LDO, "rtc-ldo", "ips", 3000), }; -static const struct regulator_linear_range axp806_dcdca_ranges[] = { +static const struct linear_range axp806_dcdca_ranges[] = { REGULATOR_LINEAR_RANGE(600000, AXP806_DCDCA_600mV_START, AXP806_DCDCA_600mV_END, @@ -769,7 +769,7 @@ static const struct regulator_linear_range axp806_dcdca_ranges[] = { 20000), }; -static const struct regulator_linear_range axp806_dcdcd_ranges[] = { +static const struct linear_range axp806_dcdcd_ranges[] = { REGULATOR_LINEAR_RANGE(600000, AXP806_DCDCD_600mV_START, AXP806_DCDCD_600mV_END, @@ -834,7 +834,7 @@ static const struct regulator_desc axp806_regulators[] = { AXP806_PWR_OUT_CTRL2, AXP806_PWR_OUT_SW_MASK), }; -static const struct regulator_linear_range axp809_dcdc4_ranges[] = { +static const struct linear_range axp809_dcdc4_ranges[] = { REGULATOR_LINEAR_RANGE(600000, AXP809_DCDC4_600mV_START, AXP809_DCDC4_600mV_END, diff --git a/drivers/regulator/bcm590xx-regulator.c b/drivers/regulator/bcm590xx-regulator.c index 8c98c3f07660..65e23fc5f9c3 100644 --- a/drivers/regulator/bcm590xx-regulator.c +++ b/drivers/regulator/bcm590xx-regulator.c @@ -116,14 +116,14 @@ static const unsigned int ldo_vbus[] = { }; /* DCDC group CSR: supported voltages in microvolts */ -static const struct regulator_linear_range dcdc_csr_ranges[] = { +static const struct linear_range dcdc_csr_ranges[] = { REGULATOR_LINEAR_RANGE(860000, 2, 50, 10000), REGULATOR_LINEAR_RANGE(1360000, 51, 55, 20000), REGULATOR_LINEAR_RANGE(900000, 56, 63, 0), }; /* DCDC group IOSR1: supported voltages in microvolts */ -static const struct regulator_linear_range dcdc_iosr1_ranges[] = { +static const struct linear_range dcdc_iosr1_ranges[] = { REGULATOR_LINEAR_RANGE(860000, 2, 51, 10000), REGULATOR_LINEAR_RANGE(1500000, 52, 52, 0), REGULATOR_LINEAR_RANGE(1800000, 53, 53, 0), @@ -131,7 +131,7 @@ static const struct regulator_linear_range dcdc_iosr1_ranges[] = { }; /* DCDC group SDSR1: supported voltages in microvolts */ -static const struct regulator_linear_range dcdc_sdsr1_ranges[] = { +static const struct linear_range dcdc_sdsr1_ranges[] = { REGULATOR_LINEAR_RANGE(860000, 2, 50, 10000), REGULATOR_LINEAR_RANGE(1340000, 51, 51, 0), REGULATOR_LINEAR_RANGE(900000, 52, 63, 0), @@ -143,7 +143,7 @@ struct bcm590xx_info { u8 n_voltages; const unsigned int *volt_table; u8 n_linear_ranges; - const struct regulator_linear_range *linear_ranges; + const struct linear_range *linear_ranges; }; #define BCM590XX_REG_TABLE(_name, _table) \ diff --git a/drivers/regulator/bd70528-regulator.c b/drivers/regulator/bd70528-regulator.c index 5bf8a2dc5fe7..d44adf7e875a 100644 --- a/drivers/regulator/bd70528-regulator.c +++ b/drivers/regulator/bd70528-regulator.c @@ -20,22 +20,22 @@ 
#define BUCK_RAMPRATE_125MV 1 #define BUCK_RAMP_MAX 250 -static const struct regulator_linear_range bd70528_buck1_volts[] = { +static const struct linear_range bd70528_buck1_volts[] = { REGULATOR_LINEAR_RANGE(1200000, 0x00, 0x1, 600000), REGULATOR_LINEAR_RANGE(2750000, 0x2, 0xf, 50000), }; -static const struct regulator_linear_range bd70528_buck2_volts[] = { +static const struct linear_range bd70528_buck2_volts[] = { REGULATOR_LINEAR_RANGE(1200000, 0x00, 0x1, 300000), REGULATOR_LINEAR_RANGE(1550000, 0x2, 0xd, 50000), REGULATOR_LINEAR_RANGE(3000000, 0xe, 0xf, 300000), }; -static const struct regulator_linear_range bd70528_buck3_volts[] = { +static const struct linear_range bd70528_buck3_volts[] = { REGULATOR_LINEAR_RANGE(800000, 0x00, 0xd, 50000), REGULATOR_LINEAR_RANGE(1800000, 0xe, 0xf, 0), }; /* All LDOs have same voltage ranges */ -static const struct regulator_linear_range bd70528_ldo_volts[] = { +static const struct linear_range bd70528_ldo_volts[] = { REGULATOR_LINEAR_RANGE(1650000, 0x0, 0x07, 50000), REGULATOR_LINEAR_RANGE(2100000, 0x8, 0x0f, 100000), REGULATOR_LINEAR_RANGE(2850000, 0x10, 0x19, 50000), diff --git a/drivers/regulator/bd71828-regulator.c b/drivers/regulator/bd71828-regulator.c index b2fa17be4988..85c0b9000963 100644 --- a/drivers/regulator/bd71828-regulator.c +++ b/drivers/regulator/bd71828-regulator.c @@ -65,27 +65,27 @@ static const struct reg_init buck7_inits[] = { }, }; -static const struct regulator_linear_range bd71828_buck1267_volts[] = { +static const struct linear_range bd71828_buck1267_volts[] = { REGULATOR_LINEAR_RANGE(500000, 0x00, 0xef, 6250), REGULATOR_LINEAR_RANGE(2000000, 0xf0, 0xff, 0), }; -static const struct regulator_linear_range bd71828_buck3_volts[] = { +static const struct linear_range bd71828_buck3_volts[] = { REGULATOR_LINEAR_RANGE(1200000, 0x00, 0x0f, 50000), REGULATOR_LINEAR_RANGE(2000000, 0x10, 0x1f, 0), }; -static const struct regulator_linear_range bd71828_buck4_volts[] = { +static const struct linear_range bd71828_buck4_volts[] = { REGULATOR_LINEAR_RANGE(1000000, 0x00, 0x1f, 25000), REGULATOR_LINEAR_RANGE(1800000, 0x20, 0x3f, 0), }; -static const struct regulator_linear_range bd71828_buck5_volts[] = { +static const struct linear_range bd71828_buck5_volts[] = { REGULATOR_LINEAR_RANGE(2500000, 0x00, 0x0f, 50000), REGULATOR_LINEAR_RANGE(3300000, 0x10, 0x1f, 0), }; -static const struct regulator_linear_range bd71828_ldo_volts[] = { +static const struct linear_range bd71828_ldo_volts[] = { REGULATOR_LINEAR_RANGE(800000, 0x00, 0x31, 50000), REGULATOR_LINEAR_RANGE(3300000, 0x32, 0x3f, 0), }; diff --git a/drivers/regulator/bd718x7-regulator.c b/drivers/regulator/bd718x7-regulator.c index cf3872837abc..819573610ee2 100644 --- a/drivers/regulator/bd718x7-regulator.c +++ b/drivers/regulator/bd718x7-regulator.c @@ -152,7 +152,7 @@ static const struct regulator_ops bd718xx_dvs_buck_regulator_ops = { * BD71847 BUCK1/2 * 0.70 to 1.30V (10mV step) */ -static const struct regulator_linear_range bd718xx_dvs_buck_volts[] = { +static const struct linear_range bd718xx_dvs_buck_volts[] = { REGULATOR_LINEAR_RANGE(700000, 0x00, 0x3C, 10000), REGULATOR_LINEAR_RANGE(1300000, 0x3D, 0x3F, 0), }; @@ -163,7 +163,7 @@ static const struct regulator_linear_range bd718xx_dvs_buck_volts[] = { * and * 0.675 to 1.325 (range 1) */ -static const struct regulator_linear_range bd71837_buck5_volts[] = { +static const struct linear_range bd71837_buck5_volts[] = { /* Ranges when VOLT_SEL bit is 0 */ REGULATOR_LINEAR_RANGE(700000, 0x00, 0x03, 100000), 
REGULATOR_LINEAR_RANGE(1050000, 0x04, 0x05, 50000), @@ -185,7 +185,7 @@ static const unsigned int bd71837_buck5_volt_range_sel[] = { /* * BD71847 BUCK3 */ -static const struct regulator_linear_range bd71847_buck3_volts[] = { +static const struct linear_range bd71847_buck3_volts[] = { /* Ranges when VOLT_SEL bits are 00 */ REGULATOR_LINEAR_RANGE(700000, 0x00, 0x03, 100000), REGULATOR_LINEAR_RANGE(1050000, 0x04, 0x05, 50000), @@ -202,7 +202,7 @@ static const unsigned int bd71847_buck3_volt_range_sel[] = { 0x0, 0x0, 0x0, 0x40, 0x80, 0x80, 0x80 }; -static const struct regulator_linear_range bd71847_buck4_volts[] = { +static const struct linear_range bd71847_buck4_volts[] = { REGULATOR_LINEAR_RANGE(3000000, 0x00, 0x03, 100000), REGULATOR_LINEAR_RANGE(2600000, 0x00, 0x03, 100000), }; @@ -213,7 +213,7 @@ static const unsigned int bd71847_buck4_volt_range_sel[] = { 0x0, 0x40 }; * BUCK6 * 3.0V to 3.3V (step 100mV) */ -static const struct regulator_linear_range bd71837_buck6_volts[] = { +static const struct linear_range bd71837_buck6_volts[] = { REGULATOR_LINEAR_RANGE(3000000, 0x00, 0x03, 100000), }; @@ -237,7 +237,7 @@ static const unsigned int bd718xx_3rd_nodvs_buck_volts[] = { * BUCK8 * 0.8V to 1.40V (step 10mV) */ -static const struct regulator_linear_range bd718xx_4th_nodvs_buck_volts[] = { +static const struct linear_range bd718xx_4th_nodvs_buck_volts[] = { REGULATOR_LINEAR_RANGE(800000, 0x00, 0x3C, 10000), }; @@ -245,7 +245,7 @@ static const struct regulator_linear_range bd718xx_4th_nodvs_buck_volts[] = { * LDO1 * 3.0 to 3.3V (100mV step) */ -static const struct regulator_linear_range bd718xx_ldo1_volts[] = { +static const struct linear_range bd718xx_ldo1_volts[] = { REGULATOR_LINEAR_RANGE(3000000, 0x00, 0x03, 100000), REGULATOR_LINEAR_RANGE(1600000, 0x00, 0x03, 100000), }; @@ -264,7 +264,7 @@ static const unsigned int ldo_2_volts[] = { * LDO3 * 1.8 to 3.3V (100mV step) */ -static const struct regulator_linear_range bd718xx_ldo3_volts[] = { +static const struct linear_range bd718xx_ldo3_volts[] = { REGULATOR_LINEAR_RANGE(1800000, 0x00, 0x0F, 100000), }; @@ -272,7 +272,7 @@ static const struct regulator_linear_range bd718xx_ldo3_volts[] = { * LDO4 * 0.9 to 1.8V (100mV step) */ -static const struct regulator_linear_range bd718xx_ldo4_volts[] = { +static const struct linear_range bd718xx_ldo4_volts[] = { REGULATOR_LINEAR_RANGE(900000, 0x00, 0x09, 100000), }; @@ -280,7 +280,7 @@ static const struct regulator_linear_range bd718xx_ldo4_volts[] = { * LDO5 for BD71837 * 1.8 to 3.3V (100mV step) */ -static const struct regulator_linear_range bd71837_ldo5_volts[] = { +static const struct linear_range bd71837_ldo5_volts[] = { REGULATOR_LINEAR_RANGE(1800000, 0x00, 0x0F, 100000), }; @@ -288,7 +288,7 @@ static const struct regulator_linear_range bd71837_ldo5_volts[] = { * LDO5 for BD71837 * 1.8 to 3.3V (100mV step) */ -static const struct regulator_linear_range bd71847_ldo5_volts[] = { +static const struct linear_range bd71847_ldo5_volts[] = { REGULATOR_LINEAR_RANGE(1800000, 0x00, 0x0F, 100000), REGULATOR_LINEAR_RANGE(800000, 0x00, 0x0F, 100000), }; @@ -299,7 +299,7 @@ static const unsigned int bd71847_ldo5_volt_range_sel[] = { 0x0, 0x20 }; * LDO6 * 0.9 to 1.8V (100mV step) */ -static const struct regulator_linear_range bd718xx_ldo6_volts[] = { +static const struct linear_range bd718xx_ldo6_volts[] = { REGULATOR_LINEAR_RANGE(900000, 0x00, 0x09, 100000), }; @@ -307,7 +307,7 @@ static const struct regulator_linear_range bd718xx_ldo6_volts[] = { * LDO7 * 1.8 to 3.3V (100mV step) */ -static const struct 
regulator_linear_range bd71837_ldo7_volts[] = { +static const struct linear_range bd71837_ldo7_volts[] = { REGULATOR_LINEAR_RANGE(1800000, 0x00, 0x0F, 100000), }; diff --git a/drivers/regulator/da903x.c b/drivers/regulator/da903x.c index 5493c3a86426..770e694824ac 100644 --- a/drivers/regulator/da903x.c +++ b/drivers/regulator/da903x.c @@ -248,7 +248,7 @@ static int da9034_set_dvc_voltage_sel(struct regulator_dev *rdev, return ret; } -static const struct regulator_linear_range da9034_ldo12_ranges[] = { +static const struct linear_range da9034_ldo12_ranges[] = { REGULATOR_LINEAR_RANGE(1700000, 0, 7, 50000), REGULATOR_LINEAR_RANGE(2700000, 8, 15, 50000), }; diff --git a/drivers/regulator/helpers.c b/drivers/regulator/helpers.c index bb16c465426e..e970e9d2f8be 100644 --- a/drivers/regulator/helpers.c +++ b/drivers/regulator/helpers.c @@ -131,10 +131,11 @@ int regulator_get_voltage_sel_pickable_regmap(struct regulator_dev *rdev) unsigned int r_val; int range; unsigned int val; - int ret, i; - unsigned int voltages_in_range = 0; + int ret; + unsigned int voltages = 0; + const struct linear_range *r = rdev->desc->linear_ranges; - if (!rdev->desc->linear_ranges) + if (!r) return -EINVAL; ret = regmap_read(rdev->regmap, rdev->desc->vsel_reg, &val); @@ -152,11 +153,9 @@ int regulator_get_voltage_sel_pickable_regmap(struct regulator_dev *rdev) if (range < 0) return -EINVAL; - for (i = 0; i < range; i++) - voltages_in_range += (rdev->desc->linear_ranges[i].max_sel - - rdev->desc->linear_ranges[i].min_sel) + 1; + voltages = linear_range_values_in_range_array(r, range); - return val + voltages_in_range; + return val + voltages; } EXPORT_SYMBOL_GPL(regulator_get_voltage_sel_pickable_regmap); @@ -179,8 +178,11 @@ int regulator_set_voltage_sel_pickable_regmap(struct regulator_dev *rdev, unsigned int voltages_in_range = 0; for (i = 0; i < rdev->desc->n_linear_ranges; i++) { - voltages_in_range = (rdev->desc->linear_ranges[i].max_sel - - rdev->desc->linear_ranges[i].min_sel) + 1; + const struct linear_range *r; + + r = &rdev->desc->linear_ranges[i]; + voltages_in_range = linear_range_values_in_range(r); + if (sel < voltages_in_range) break; sel -= voltages_in_range; @@ -405,8 +407,10 @@ EXPORT_SYMBOL_GPL(regulator_map_voltage_linear); int regulator_map_voltage_linear_range(struct regulator_dev *rdev, int min_uV, int max_uV) { - const struct regulator_linear_range *range; + const struct linear_range *range; int ret = -EINVAL; + unsigned int sel; + bool found; int voltage, i; if (!rdev->desc->n_linear_ranges) { @@ -415,35 +419,19 @@ int regulator_map_voltage_linear_range(struct regulator_dev *rdev, } for (i = 0; i < rdev->desc->n_linear_ranges; i++) { - int linear_max_uV; - range = &rdev->desc->linear_ranges[i]; - linear_max_uV = range->min_uV + - (range->max_sel - range->min_sel) * range->uV_step; - if (!(min_uV <= linear_max_uV && max_uV >= range->min_uV)) + ret = linear_range_get_selector_high(range, min_uV, &sel, + &found); + if (ret) continue; - - if (min_uV <= range->min_uV) - min_uV = range->min_uV; - - /* range->uV_step == 0 means fixed voltage range */ - if (range->uV_step == 0) { - ret = 0; - } else { - ret = DIV_ROUND_UP(min_uV - range->min_uV, - range->uV_step); - if (ret < 0) - return ret; - } - - ret += range->min_sel; + ret = sel; /* * Map back into a voltage to verify we're still in bounds. * If we are not, then continue checking rest of the ranges. 
*/ - voltage = rdev->desc->ops->list_voltage(rdev, ret); + voltage = rdev->desc->ops->list_voltage(rdev, sel); if (voltage >= min_uV && voltage <= max_uV) break; } @@ -468,7 +456,7 @@ EXPORT_SYMBOL_GPL(regulator_map_voltage_linear_range); int regulator_map_voltage_pickable_linear_range(struct regulator_dev *rdev, int min_uV, int max_uV) { - const struct regulator_linear_range *range; + const struct linear_range *range; int ret = -EINVAL; int voltage, i; unsigned int selector = 0; @@ -480,30 +468,25 @@ int regulator_map_voltage_pickable_linear_range(struct regulator_dev *rdev, for (i = 0; i < rdev->desc->n_linear_ranges; i++) { int linear_max_uV; + bool found; + unsigned int sel; range = &rdev->desc->linear_ranges[i]; - linear_max_uV = range->min_uV + - (range->max_sel - range->min_sel) * range->uV_step; + linear_max_uV = linear_range_get_max_value(range); - if (!(min_uV <= linear_max_uV && max_uV >= range->min_uV)) { - selector += (range->max_sel - range->min_sel + 1); + if (!(min_uV <= linear_max_uV && max_uV >= range->min)) { + selector += linear_range_values_in_range(range); continue; } - if (min_uV <= range->min_uV) - min_uV = range->min_uV; - - /* range->uV_step == 0 means fixed voltage range */ - if (range->uV_step == 0) { - ret = 0; - } else { - ret = DIV_ROUND_UP(min_uV - range->min_uV, - range->uV_step); - if (ret < 0) - return ret; + ret = linear_range_get_selector_high(range, min_uV, &sel, + &found); + if (ret) { + selector += linear_range_values_in_range(range); + continue; } - ret += selector; + ret = selector + sel; voltage = rdev->desc->ops->list_voltage(rdev, ret); @@ -513,7 +496,7 @@ int regulator_map_voltage_pickable_linear_range(struct regulator_dev *rdev, * exit but retry until we have checked all ranges. */ if (voltage < min_uV || voltage > max_uV) - selector += (range->max_sel - range->min_sel + 1); + selector += linear_range_values_in_range(range); else break; } @@ -561,7 +544,7 @@ EXPORT_SYMBOL_GPL(regulator_list_voltage_linear); int regulator_list_voltage_pickable_linear_range(struct regulator_dev *rdev, unsigned int selector) { - const struct regulator_linear_range *range; + const struct linear_range *range; int i; unsigned int all_sels = 0; @@ -571,18 +554,28 @@ int regulator_list_voltage_pickable_linear_range(struct regulator_dev *rdev, for (i = 0; i < rdev->desc->n_linear_ranges; i++) { - unsigned int sels_in_range; + unsigned int sel_indexes; range = &rdev->desc->linear_ranges[i]; - sels_in_range = range->max_sel - range->min_sel; + sel_indexes = linear_range_values_in_range(range) - 1; - if (all_sels + sels_in_range >= selector) { + if (all_sels + sel_indexes >= selector) { selector -= all_sels; - return range->min_uV + (range->uV_step * selector); + /* + * As we see here, pickable ranges work only as + * long as the first selector for each pickable + * range is 0, and each subsequent range for + * this 'pick' follows immediately at the next unused + * selector (i.e. there are no gaps between ranges). + * I think this is fine but it probably should be + * documented.
OTOH, whole pickable range stuff + * might benefit from some documentation + */ + return range->min + (range->step * selector); } - all_sels += (sels_in_range + 1); + all_sels += (sel_indexes + 1); } return -EINVAL; @@ -604,27 +597,18 @@ EXPORT_SYMBOL_GPL(regulator_list_voltage_pickable_linear_range); int regulator_desc_list_voltage_linear_range(const struct regulator_desc *desc, unsigned int selector) { - const struct regulator_linear_range *range; - int i; - - if (!desc->n_linear_ranges) { - BUG_ON(!desc->n_linear_ranges); - return -EINVAL; - } - - for (i = 0; i < desc->n_linear_ranges; i++) { - range = &desc->linear_ranges[i]; - - if (!(selector >= range->min_sel && - selector <= range->max_sel)) - continue; + unsigned int val; + int ret; - selector -= range->min_sel; + BUG_ON(!desc->n_linear_ranges); - return range->min_uV + (range->uV_step * selector); - } + ret = linear_range_get_value_array(desc->linear_ranges, + desc->n_linear_ranges, selector, + &val); + if (ret) + return ret; - return -EINVAL; + return val; } EXPORT_SYMBOL_GPL(regulator_desc_list_voltage_linear_range); diff --git a/drivers/regulator/hi6421-regulator.c b/drivers/regulator/hi6421-regulator.c index 5ac3d7c29725..66219d8dfc1a 100644 --- a/drivers/regulator/hi6421-regulator.c +++ b/drivers/regulator/hi6421-regulator.c @@ -87,7 +87,7 @@ static const unsigned int ldo_8_voltages[] = { }; /* Ranges are sorted in ascending order. */ -static const struct regulator_linear_range ldo_audio_volt_range[] = { +static const struct linear_range ldo_audio_volt_range[] = { REGULATOR_LINEAR_RANGE(2800000, 0, 3, 50000), REGULATOR_LINEAR_RANGE(3000000, 4, 7, 100000), }; @@ -195,7 +195,7 @@ static const struct regulator_ops hi6421_buck345_ops; * _id - LDO id name string * _match - of match name string * n_volt - number of votages available - * volt_ranges - array of regulator_linear_range + * volt_ranges - array of linear_range * vstep - voltage increase in each linear step in uV * vreg - voltage select register * vmask - voltage select mask diff --git a/drivers/regulator/lochnagar-regulator.c b/drivers/regulator/lochnagar-regulator.c index 9b05e03ba830..5ea3e4141684 100644 --- a/drivers/regulator/lochnagar-regulator.c +++ b/drivers/regulator/lochnagar-regulator.c @@ -36,7 +36,7 @@ static const struct regulator_ops lochnagar_micvdd_ops = { .set_voltage_sel = regulator_set_voltage_sel_regmap, }; -static const struct regulator_linear_range lochnagar_micvdd_ranges[] = { +static const struct linear_range lochnagar_micvdd_ranges[] = { REGULATOR_LINEAR_RANGE(1000000, 0, 0xC, 50000), REGULATOR_LINEAR_RANGE(1700000, 0xD, 0x1F, 100000), }; @@ -97,7 +97,7 @@ static const struct regulator_ops lochnagar_vddcore_ops = { .set_voltage_sel = regulator_set_voltage_sel_regmap, }; -static const struct regulator_linear_range lochnagar_vddcore_ranges[] = { +static const struct linear_range lochnagar_vddcore_ranges[] = { REGULATOR_LINEAR_RANGE(600000, 0x8, 0x41, 12500), }; diff --git a/drivers/regulator/lp873x-regulator.c b/drivers/regulator/lp873x-regulator.c index b55de293ca7a..fe049b67e7d5 100644 --- a/drivers/regulator/lp873x-regulator.c +++ b/drivers/regulator/lp873x-regulator.c @@ -54,14 +54,14 @@ struct lp873x_regulator { static const struct lp873x_regulator regulators[]; -static const struct regulator_linear_range buck0_buck1_ranges[] = { +static const struct linear_range buck0_buck1_ranges[] = { REGULATOR_LINEAR_RANGE(0, 0x0, 0x13, 0), REGULATOR_LINEAR_RANGE(700000, 0x14, 0x17, 10000), REGULATOR_LINEAR_RANGE(735000, 0x18, 0x9d, 5000), 
REGULATOR_LINEAR_RANGE(1420000, 0x9e, 0xff, 20000), }; -static const struct regulator_linear_range ldo0_ldo1_ranges[] = { +static const struct linear_range ldo0_ldo1_ranges[] = { REGULATOR_LINEAR_RANGE(800000, 0x0, 0x19, 100000), }; diff --git a/drivers/regulator/lp87565-regulator.c b/drivers/regulator/lp87565-regulator.c index 4ae12ac1f4c6..5d525dacf959 100644 --- a/drivers/regulator/lp87565-regulator.c +++ b/drivers/regulator/lp87565-regulator.c @@ -46,7 +46,7 @@ struct lp87565_regulator { static const struct lp87565_regulator regulators[]; -static const struct regulator_linear_range buck0_1_2_3_ranges[] = { +static const struct linear_range buck0_1_2_3_ranges[] = { REGULATOR_LINEAR_RANGE(600000, 0xA, 0x17, 10000), REGULATOR_LINEAR_RANGE(735000, 0x18, 0x9d, 5000), REGULATOR_LINEAR_RANGE(1420000, 0x9e, 0xff, 20000), diff --git a/drivers/regulator/lp8788-buck.c b/drivers/regulator/lp8788-buck.c index 222502a29658..74b7b496b12d 100644 --- a/drivers/regulator/lp8788-buck.c +++ b/drivers/regulator/lp8788-buck.c @@ -92,7 +92,7 @@ struct lp8788_buck { }; /* BUCK 1 ~ 4 voltage ranges */ -static const struct regulator_linear_range buck_volt_ranges[] = { +static const struct linear_range buck_volt_ranges[] = { REGULATOR_LINEAR_RANGE(500000, 0, 0, 0), REGULATOR_LINEAR_RANGE(800000, 1, 25, 50000), }; diff --git a/drivers/regulator/max77650-regulator.c b/drivers/regulator/max77650-regulator.c index ac89a412f665..ca08f94a368d 100644 --- a/drivers/regulator/max77650-regulator.c +++ b/drivers/regulator/max77650-regulator.c @@ -49,7 +49,7 @@ static const unsigned int max77651_sbb1_volt_range_sel[] = { 0x0, 0x1, 0x2, 0x3 }; -static const struct regulator_linear_range max77651_sbb1_volt_ranges[] = { +static const struct linear_range max77651_sbb1_volt_ranges[] = { /* range index 0 */ REGULATOR_LINEAR_RANGE(2400000, 0x00, 0x0f, 50000), /* range index 1 */ diff --git a/drivers/regulator/mcp16502.c b/drivers/regulator/mcp16502.c index e5a02711cb46..6d0ad74935b3 100644 --- a/drivers/regulator/mcp16502.c +++ b/drivers/regulator/mcp16502.c @@ -391,11 +391,11 @@ static const struct of_device_id mcp16502_ids[] = { }; MODULE_DEVICE_TABLE(of, mcp16502_ids); -static const struct regulator_linear_range b1l12_ranges[] = { +static const struct linear_range b1l12_ranges[] = { REGULATOR_LINEAR_RANGE(1200000, VDD_LOW_SEL, VDD_HIGH_SEL, 50000), }; -static const struct regulator_linear_range b234_ranges[] = { +static const struct linear_range b234_ranges[] = { REGULATOR_LINEAR_RANGE(600000, VDD_LOW_SEL, VDD_HIGH_SEL, 25000), }; diff --git a/drivers/regulator/mp8859.c b/drivers/regulator/mp8859.c index 6ed987648188..f2300714d5a9 100644 --- a/drivers/regulator/mp8859.c +++ b/drivers/regulator/mp8859.c @@ -73,7 +73,7 @@ static int mp8859_get_voltage_sel(struct regulator_dev *rdev) return val; } -static const struct regulator_linear_range mp8859_dcdc_ranges[] = { +static const struct linear_range mp8859_dcdc_ranges[] = { REGULATOR_LINEAR_RANGE(0, VOL_MIN_IDX, VOL_MAX_IDX, 10000), }; diff --git a/drivers/regulator/mt6323-regulator.c b/drivers/regulator/mt6323-regulator.c index 893ea190788a..ff9016170db3 100644 --- a/drivers/regulator/mt6323-regulator.c +++ b/drivers/regulator/mt6323-regulator.c @@ -102,15 +102,15 @@ struct mt6323_regulator_info { .modeset_mask = _modeset_mask, \ } -static const struct regulator_linear_range buck_volt_range1[] = { +static const struct linear_range buck_volt_range1[] = { REGULATOR_LINEAR_RANGE(700000, 0, 0x7f, 6250), }; -static const struct regulator_linear_range buck_volt_range2[] = { +static 
const struct linear_range buck_volt_range2[] = { REGULATOR_LINEAR_RANGE(1400000, 0, 0x7f, 12500), }; -static const struct regulator_linear_range buck_volt_range3[] = { +static const struct linear_range buck_volt_range3[] = { REGULATOR_LINEAR_RANGE(500000, 0, 0x3f, 50000), }; diff --git a/drivers/regulator/mt6358-regulator.c b/drivers/regulator/mt6358-regulator.c index ba42682e06f3..13cb6ac9a892 100644 --- a/drivers/regulator/mt6358-regulator.c +++ b/drivers/regulator/mt6358-regulator.c @@ -137,19 +137,19 @@ struct mt6358_regulator_info { .qi = BIT(15), \ } -static const struct regulator_linear_range buck_volt_range1[] = { +static const struct linear_range buck_volt_range1[] = { REGULATOR_LINEAR_RANGE(500000, 0, 0x7f, 6250), }; -static const struct regulator_linear_range buck_volt_range2[] = { +static const struct linear_range buck_volt_range2[] = { REGULATOR_LINEAR_RANGE(500000, 0, 0x7f, 12500), }; -static const struct regulator_linear_range buck_volt_range3[] = { +static const struct linear_range buck_volt_range3[] = { REGULATOR_LINEAR_RANGE(500000, 0, 0x3f, 50000), }; -static const struct regulator_linear_range buck_volt_range4[] = { +static const struct linear_range buck_volt_range4[] = { REGULATOR_LINEAR_RANGE(1000000, 0, 0x7f, 12500), }; diff --git a/drivers/regulator/mt6380-regulator.c b/drivers/regulator/mt6380-regulator.c index b6aed090b5e0..9efd8710a6f3 100644 --- a/drivers/regulator/mt6380-regulator.c +++ b/drivers/regulator/mt6380-regulator.c @@ -152,15 +152,15 @@ struct mt6380_regulator_info { .modeset_mask = _modeset_mask, \ } -static const struct regulator_linear_range buck_volt_range1[] = { +static const struct linear_range buck_volt_range1[] = { REGULATOR_LINEAR_RANGE(600000, 0, 0xfe, 6250), }; -static const struct regulator_linear_range buck_volt_range2[] = { +static const struct linear_range buck_volt_range2[] = { REGULATOR_LINEAR_RANGE(600000, 0, 0xfe, 6250), }; -static const struct regulator_linear_range buck_volt_range3[] = { +static const struct linear_range buck_volt_range3[] = { REGULATOR_LINEAR_RANGE(1200000, 0, 0x3c, 25000), }; diff --git a/drivers/regulator/mt6397-regulator.c b/drivers/regulator/mt6397-regulator.c index fd9ed864a0c1..269c2a6028e8 100644 --- a/drivers/regulator/mt6397-regulator.c +++ b/drivers/regulator/mt6397-regulator.c @@ -102,15 +102,15 @@ struct mt6397_regulator_info { .qi = BIT(15), \ } -static const struct regulator_linear_range buck_volt_range1[] = { +static const struct linear_range buck_volt_range1[] = { REGULATOR_LINEAR_RANGE(700000, 0, 0x7f, 6250), }; -static const struct regulator_linear_range buck_volt_range2[] = { +static const struct linear_range buck_volt_range2[] = { REGULATOR_LINEAR_RANGE(800000, 0, 0x7f, 6250), }; -static const struct regulator_linear_range buck_volt_range3[] = { +static const struct linear_range buck_volt_range3[] = { REGULATOR_LINEAR_RANGE(1500000, 0, 0x1f, 20000), }; diff --git a/drivers/regulator/palmas-regulator.c b/drivers/regulator/palmas-regulator.c index 31325912d311..337dd614695e 100644 --- a/drivers/regulator/palmas-regulator.c +++ b/drivers/regulator/palmas-regulator.c @@ -22,14 +22,14 @@ #include #include -static const struct regulator_linear_range smps_low_ranges[] = { +static const struct linear_range smps_low_ranges[] = { REGULATOR_LINEAR_RANGE(0, 0x0, 0x0, 0), REGULATOR_LINEAR_RANGE(500000, 0x1, 0x6, 0), REGULATOR_LINEAR_RANGE(510000, 0x7, 0x79, 10000), REGULATOR_LINEAR_RANGE(1650000, 0x7A, 0x7f, 0), }; -static const struct regulator_linear_range smps_high_ranges[] = { +static const struct 
linear_range smps_high_ranges[] = { REGULATOR_LINEAR_RANGE(0, 0x0, 0x0, 0), REGULATOR_LINEAR_RANGE(1000000, 0x1, 0x6, 0), REGULATOR_LINEAR_RANGE(1020000, 0x7, 0x79, 20000), diff --git a/drivers/regulator/qcom-rpmh-regulator.c b/drivers/regulator/qcom-rpmh-regulator.c index c86ad40015ce..953c7fba8d9d 100644 --- a/drivers/regulator/qcom-rpmh-regulator.c +++ b/drivers/regulator/qcom-rpmh-regulator.c @@ -86,7 +86,7 @@ enum rpmh_regulator_type { struct rpmh_vreg_hw_data { enum rpmh_regulator_type regulator_type; const struct regulator_ops *ops; - const struct regulator_linear_range voltage_range; + const struct linear_range voltage_range; int n_voltages; int hpm_min_load_uA; const int *pmic_mode_map; diff --git a/drivers/regulator/qcom_rpm-regulator.c b/drivers/regulator/qcom_rpm-regulator.c index 7fc97f23fcf4..a4562a23aa57 100644 --- a/drivers/regulator/qcom_rpm-regulator.c +++ b/drivers/regulator/qcom_rpm-regulator.c @@ -148,41 +148,41 @@ static const struct rpm_reg_parts rpm8960_ncp_parts = { /* * Physically available PMIC regulator voltage ranges */ -static const struct regulator_linear_range pldo_ranges[] = { +static const struct linear_range pldo_ranges[] = { REGULATOR_LINEAR_RANGE( 750000, 0, 59, 12500), REGULATOR_LINEAR_RANGE(1500000, 60, 123, 25000), REGULATOR_LINEAR_RANGE(3100000, 124, 160, 50000), }; -static const struct regulator_linear_range nldo_ranges[] = { +static const struct linear_range nldo_ranges[] = { REGULATOR_LINEAR_RANGE( 750000, 0, 63, 12500), }; -static const struct regulator_linear_range nldo1200_ranges[] = { +static const struct linear_range nldo1200_ranges[] = { REGULATOR_LINEAR_RANGE( 375000, 0, 59, 6250), REGULATOR_LINEAR_RANGE( 750000, 60, 123, 12500), }; -static const struct regulator_linear_range smps_ranges[] = { +static const struct linear_range smps_ranges[] = { REGULATOR_LINEAR_RANGE( 375000, 0, 29, 12500), REGULATOR_LINEAR_RANGE( 750000, 30, 89, 12500), REGULATOR_LINEAR_RANGE(1500000, 90, 153, 25000), }; -static const struct regulator_linear_range ftsmps_ranges[] = { +static const struct linear_range ftsmps_ranges[] = { REGULATOR_LINEAR_RANGE( 350000, 0, 6, 50000), REGULATOR_LINEAR_RANGE( 700000, 7, 63, 12500), REGULATOR_LINEAR_RANGE(1500000, 64, 100, 50000), }; -static const struct regulator_linear_range smb208_ranges[] = { +static const struct linear_range smb208_ranges[] = { REGULATOR_LINEAR_RANGE( 375000, 0, 29, 12500), REGULATOR_LINEAR_RANGE( 750000, 30, 89, 12500), REGULATOR_LINEAR_RANGE(1500000, 90, 153, 25000), REGULATOR_LINEAR_RANGE(3100000, 154, 234, 25000), }; -static const struct regulator_linear_range ncp_ranges[] = { +static const struct linear_range ncp_ranges[] = { REGULATOR_LINEAR_RANGE(1500000, 0, 31, 50000), }; diff --git a/drivers/regulator/qcom_smd-regulator.c b/drivers/regulator/qcom_smd-regulator.c index fdde4195cefb..53a64d856926 100644 --- a/drivers/regulator/qcom_smd-regulator.c +++ b/drivers/regulator/qcom_smd-regulator.c @@ -199,7 +199,7 @@ static const struct regulator_ops rpm_bob_ops = { }; static const struct regulator_desc pma8084_hfsmps = { - .linear_ranges = (struct regulator_linear_range[]) { + .linear_ranges = (struct linear_range[]) { REGULATOR_LINEAR_RANGE(375000, 0, 95, 12500), REGULATOR_LINEAR_RANGE(1550000, 96, 158, 25000), }, @@ -209,7 +209,7 @@ static const struct regulator_desc pma8084_hfsmps = { }; static const struct regulator_desc pma8084_ftsmps = { - .linear_ranges = (struct regulator_linear_range[]) { + .linear_ranges = (struct linear_range[]) { REGULATOR_LINEAR_RANGE(350000, 0, 184, 5000), 
REGULATOR_LINEAR_RANGE(1280000, 185, 261, 10000), }, @@ -219,7 +219,7 @@ static const struct regulator_desc pma8084_ftsmps = { }; static const struct regulator_desc pma8084_pldo = { - .linear_ranges = (struct regulator_linear_range[]) { + .linear_ranges = (struct linear_range[]) { REGULATOR_LINEAR_RANGE( 750000, 0, 63, 12500), REGULATOR_LINEAR_RANGE(1550000, 64, 126, 25000), REGULATOR_LINEAR_RANGE(3100000, 127, 163, 50000), @@ -230,7 +230,7 @@ static const struct regulator_desc pma8084_pldo = { }; static const struct regulator_desc pma8084_nldo = { - .linear_ranges = (struct regulator_linear_range[]) { + .linear_ranges = (struct linear_range[]) { REGULATOR_LINEAR_RANGE(750000, 0, 63, 12500), }, .n_linear_ranges = 1, @@ -243,7 +243,7 @@ static const struct regulator_desc pma8084_switch = { }; static const struct regulator_desc pm8x41_hfsmps = { - .linear_ranges = (struct regulator_linear_range[]) { + .linear_ranges = (struct linear_range[]) { REGULATOR_LINEAR_RANGE( 375000, 0, 95, 12500), REGULATOR_LINEAR_RANGE(1575000, 96, 158, 25000), }, @@ -253,7 +253,7 @@ static const struct regulator_desc pm8x41_hfsmps = { }; static const struct regulator_desc pm8841_ftsmps = { - .linear_ranges = (struct regulator_linear_range[]) { + .linear_ranges = (struct linear_range[]) { REGULATOR_LINEAR_RANGE(350000, 0, 184, 5000), REGULATOR_LINEAR_RANGE(1280000, 185, 261, 10000), }, @@ -263,7 +263,7 @@ static const struct regulator_desc pm8841_ftsmps = { }; static const struct regulator_desc pm8941_boost = { - .linear_ranges = (struct regulator_linear_range[]) { + .linear_ranges = (struct linear_range[]) { REGULATOR_LINEAR_RANGE(4000000, 0, 30, 50000), }, .n_linear_ranges = 1, @@ -272,7 +272,7 @@ static const struct regulator_desc pm8941_boost = { }; static const struct regulator_desc pm8941_pldo = { - .linear_ranges = (struct regulator_linear_range[]) { + .linear_ranges = (struct linear_range[]) { REGULATOR_LINEAR_RANGE( 750000, 0, 63, 12500), REGULATOR_LINEAR_RANGE(1550000, 64, 126, 25000), REGULATOR_LINEAR_RANGE(3100000, 127, 163, 50000), @@ -283,7 +283,7 @@ static const struct regulator_desc pm8941_pldo = { }; static const struct regulator_desc pm8941_nldo = { - .linear_ranges = (struct regulator_linear_range[]) { + .linear_ranges = (struct linear_range[]) { REGULATOR_LINEAR_RANGE(750000, 0, 63, 12500), }, .n_linear_ranges = 1, @@ -302,7 +302,7 @@ static const struct regulator_desc pm8941_switch = { }; static const struct regulator_desc pm8916_pldo = { - .linear_ranges = (struct regulator_linear_range[]) { + .linear_ranges = (struct linear_range[]) { REGULATOR_LINEAR_RANGE(750000, 0, 208, 12500), }, .n_linear_ranges = 1, @@ -311,7 +311,7 @@ static const struct regulator_desc pm8916_pldo = { }; static const struct regulator_desc pm8916_nldo = { - .linear_ranges = (struct regulator_linear_range[]) { + .linear_ranges = (struct linear_range[]) { REGULATOR_LINEAR_RANGE(375000, 0, 93, 12500), }, .n_linear_ranges = 1, @@ -320,7 +320,7 @@ static const struct regulator_desc pm8916_nldo = { }; static const struct regulator_desc pm8916_buck_lvo_smps = { - .linear_ranges = (struct regulator_linear_range[]) { + .linear_ranges = (struct linear_range[]) { REGULATOR_LINEAR_RANGE(375000, 0, 95, 12500), REGULATOR_LINEAR_RANGE(750000, 96, 127, 25000), }, @@ -330,7 +330,7 @@ static const struct regulator_desc pm8916_buck_lvo_smps = { }; static const struct regulator_desc pm8916_buck_hvo_smps = { - .linear_ranges = (struct regulator_linear_range[]) { + .linear_ranges = (struct linear_range[]) { REGULATOR_LINEAR_RANGE(1550000, 0, 
31, 25000), }, .n_linear_ranges = 1, @@ -339,7 +339,7 @@ static const struct regulator_desc pm8916_buck_hvo_smps = { }; static const struct regulator_desc pm8950_hfsmps = { - .linear_ranges = (struct regulator_linear_range[]) { + .linear_ranges = (struct linear_range[]) { REGULATOR_LINEAR_RANGE(375000, 0, 95, 12500), REGULATOR_LINEAR_RANGE(1550000, 96, 127, 25000), }, @@ -349,7 +349,7 @@ static const struct regulator_desc pm8950_hfsmps = { }; static const struct regulator_desc pm8950_ftsmps2p5 = { - .linear_ranges = (struct regulator_linear_range[]) { + .linear_ranges = (struct linear_range[]) { REGULATOR_LINEAR_RANGE(80000, 0, 255, 5000), REGULATOR_LINEAR_RANGE(160000, 256, 460, 10000), }, @@ -359,7 +359,7 @@ static const struct regulator_desc pm8950_ftsmps2p5 = { }; static const struct regulator_desc pm8950_ult_nldo = { - .linear_ranges = (struct regulator_linear_range[]) { + .linear_ranges = (struct linear_range[]) { REGULATOR_LINEAR_RANGE(375000, 0, 202, 12500), }, .n_linear_ranges = 1, @@ -368,7 +368,7 @@ static const struct regulator_desc pm8950_ult_nldo = { }; static const struct regulator_desc pm8950_ult_pldo = { - .linear_ranges = (struct regulator_linear_range[]) { + .linear_ranges = (struct linear_range[]) { REGULATOR_LINEAR_RANGE(1750000, 0, 127, 12500), }, .n_linear_ranges = 1, @@ -377,7 +377,7 @@ static const struct regulator_desc pm8950_ult_pldo = { }; static const struct regulator_desc pm8950_pldo_lv = { - .linear_ranges = (struct regulator_linear_range[]) { + .linear_ranges = (struct linear_range[]) { REGULATOR_LINEAR_RANGE(1500000, 0, 16, 25000), }, .n_linear_ranges = 1, @@ -386,7 +386,7 @@ static const struct regulator_desc pm8950_pldo_lv = { }; static const struct regulator_desc pm8950_pldo = { - .linear_ranges = (struct regulator_linear_range[]) { + .linear_ranges = (struct linear_range[]) { REGULATOR_LINEAR_RANGE(975000, 0, 164, 12500), }, .n_linear_ranges = 1, @@ -396,7 +396,7 @@ static const struct regulator_desc pm8950_pldo = { static const struct regulator_desc pm8994_hfsmps = { - .linear_ranges = (struct regulator_linear_range[]) { + .linear_ranges = (struct linear_range[]) { REGULATOR_LINEAR_RANGE( 375000, 0, 95, 12500), REGULATOR_LINEAR_RANGE(1550000, 96, 158, 25000), }, @@ -406,7 +406,7 @@ static const struct regulator_desc pm8994_hfsmps = { }; static const struct regulator_desc pm8994_ftsmps = { - .linear_ranges = (struct regulator_linear_range[]) { + .linear_ranges = (struct linear_range[]) { REGULATOR_LINEAR_RANGE(350000, 0, 199, 5000), REGULATOR_LINEAR_RANGE(700000, 200, 349, 10000), }, @@ -416,7 +416,7 @@ static const struct regulator_desc pm8994_ftsmps = { }; static const struct regulator_desc pm8994_nldo = { - .linear_ranges = (struct regulator_linear_range[]) { + .linear_ranges = (struct linear_range[]) { REGULATOR_LINEAR_RANGE(750000, 0, 63, 12500), }, .n_linear_ranges = 1, @@ -425,7 +425,7 @@ static const struct regulator_desc pm8994_nldo = { }; static const struct regulator_desc pm8994_pldo = { - .linear_ranges = (struct regulator_linear_range[]) { + .linear_ranges = (struct linear_range[]) { REGULATOR_LINEAR_RANGE( 750000, 0, 63, 12500), REGULATOR_LINEAR_RANGE(1550000, 64, 126, 25000), REGULATOR_LINEAR_RANGE(3100000, 127, 163, 50000), @@ -446,7 +446,7 @@ static const struct regulator_desc pm8994_lnldo = { }; static const struct regulator_desc pmi8994_ftsmps = { - .linear_ranges = (struct regulator_linear_range[]) { + .linear_ranges = (struct linear_range[]) { REGULATOR_LINEAR_RANGE(350000, 0, 199, 5000), REGULATOR_LINEAR_RANGE(700000, 200, 349, 
10000), }, @@ -456,7 +456,7 @@ static const struct regulator_desc pmi8994_ftsmps = { }; static const struct regulator_desc pmi8994_hfsmps = { - .linear_ranges = (struct regulator_linear_range[]) { + .linear_ranges = (struct linear_range[]) { REGULATOR_LINEAR_RANGE(350000, 0, 80, 12500), REGULATOR_LINEAR_RANGE(700000, 81, 141, 25000), }, @@ -466,7 +466,7 @@ static const struct regulator_desc pmi8994_hfsmps = { }; static const struct regulator_desc pmi8994_bby = { - .linear_ranges = (struct regulator_linear_range[]) { + .linear_ranges = (struct linear_range[]) { REGULATOR_LINEAR_RANGE(3000000, 0, 44, 50000), }, .n_linear_ranges = 1, @@ -475,7 +475,7 @@ static const struct regulator_desc pmi8994_bby = { }; static const struct regulator_desc pmi8994_boost = { - .linear_ranges = (struct regulator_linear_range[]) { + .linear_ranges = (struct linear_range[]) { REGULATOR_LINEAR_RANGE(4000000, 0, 30, 50000), }, .n_linear_ranges = 1, @@ -484,7 +484,7 @@ static const struct regulator_desc pmi8994_boost = { }; static const struct regulator_desc pm8998_ftsmps = { - .linear_ranges = (struct regulator_linear_range[]) { + .linear_ranges = (struct linear_range[]) { REGULATOR_LINEAR_RANGE(320000, 0, 258, 4000), }, .n_linear_ranges = 1, @@ -493,7 +493,7 @@ static const struct regulator_desc pm8998_ftsmps = { }; static const struct regulator_desc pm8998_hfsmps = { - .linear_ranges = (struct regulator_linear_range[]) { + .linear_ranges = (struct linear_range[]) { REGULATOR_LINEAR_RANGE(320000, 0, 215, 8000), }, .n_linear_ranges = 1, @@ -502,7 +502,7 @@ static const struct regulator_desc pm8998_hfsmps = { }; static const struct regulator_desc pm8998_nldo = { - .linear_ranges = (struct regulator_linear_range[]) { + .linear_ranges = (struct linear_range[]) { REGULATOR_LINEAR_RANGE(312000, 0, 127, 8000), }, .n_linear_ranges = 1, @@ -511,7 +511,7 @@ static const struct regulator_desc pm8998_nldo = { }; static const struct regulator_desc pm8998_pldo = { - .linear_ranges = (struct regulator_linear_range[]) { + .linear_ranges = (struct linear_range[]) { REGULATOR_LINEAR_RANGE(1664000, 0, 255, 8000), }, .n_linear_ranges = 1, @@ -520,7 +520,7 @@ static const struct regulator_desc pm8998_pldo = { }; static const struct regulator_desc pm8998_pldo_lv = { - .linear_ranges = (struct regulator_linear_range[]) { + .linear_ranges = (struct linear_range[]) { REGULATOR_LINEAR_RANGE(1256000, 0, 127, 8000), }, .n_linear_ranges = 1, @@ -533,7 +533,7 @@ static const struct regulator_desc pm8998_switch = { }; static const struct regulator_desc pmi8998_bob = { - .linear_ranges = (struct regulator_linear_range[]) { + .linear_ranges = (struct linear_range[]) { REGULATOR_LINEAR_RANGE(1824000, 0, 83, 32000), }, .n_linear_ranges = 1, @@ -542,7 +542,7 @@ static const struct regulator_desc pmi8998_bob = { }; static const struct regulator_desc pms405_hfsmps3 = { - .linear_ranges = (struct regulator_linear_range[]) { + .linear_ranges = (struct linear_range[]) { REGULATOR_LINEAR_RANGE(320000, 0, 215, 8000), }, .n_linear_ranges = 1, @@ -551,7 +551,7 @@ static const struct regulator_desc pms405_hfsmps3 = { }; static const struct regulator_desc pms405_nldo300 = { - .linear_ranges = (struct regulator_linear_range[]) { + .linear_ranges = (struct linear_range[]) { REGULATOR_LINEAR_RANGE(312000, 0, 127, 8000), }, .n_linear_ranges = 1, @@ -560,7 +560,7 @@ static const struct regulator_desc pms405_nldo300 = { }; static const struct regulator_desc pms405_nldo1200 = { - .linear_ranges = (struct regulator_linear_range[]) { + .linear_ranges = (struct 
linear_range[]) { REGULATOR_LINEAR_RANGE(312000, 0, 127, 8000), }, .n_linear_ranges = 1, @@ -569,7 +569,7 @@ static const struct regulator_desc pms405_nldo1200 = { }; static const struct regulator_desc pms405_pldo50 = { - .linear_ranges = (struct regulator_linear_range[]) { + .linear_ranges = (struct linear_range[]) { REGULATOR_LINEAR_RANGE(1664000, 0, 128, 16000), }, .n_linear_ranges = 1, @@ -578,7 +578,7 @@ static const struct regulator_desc pms405_pldo50 = { }; static const struct regulator_desc pms405_pldo150 = { - .linear_ranges = (struct regulator_linear_range[]) { + .linear_ranges = (struct linear_range[]) { REGULATOR_LINEAR_RANGE(1664000, 0, 128, 16000), }, .n_linear_ranges = 1, @@ -587,7 +587,7 @@ static const struct regulator_desc pms405_pldo150 = { }; static const struct regulator_desc pms405_pldo600 = { - .linear_ranges = (struct regulator_linear_range[]) { + .linear_ranges = (struct linear_range[]) { REGULATOR_LINEAR_RANGE(1256000, 0, 98, 8000), }, .n_linear_ranges = 1, diff --git a/drivers/regulator/rk808-regulator.c b/drivers/regulator/rk808-regulator.c index 31f79fda3238..e926c1a85846 100644 --- a/drivers/regulator/rk808-regulator.c +++ b/drivers/regulator/rk808-regulator.c @@ -165,14 +165,14 @@ static const int rk808_buck_config_regs[] = { RK808_BUCK4_CONFIG_REG, }; -static const struct regulator_linear_range rk808_ldo3_voltage_ranges[] = { +static const struct linear_range rk808_ldo3_voltage_ranges[] = { REGULATOR_LINEAR_RANGE(800000, 0, 13, 100000), REGULATOR_LINEAR_RANGE(2500000, 15, 15, 0), }; #define RK809_BUCK5_SEL_CNT (8) -static const struct regulator_linear_range rk809_buck5_voltage_ranges[] = { +static const struct linear_range rk809_buck5_voltage_ranges[] = { REGULATOR_LINEAR_RANGE(1500000, 0, 0, 0), REGULATOR_LINEAR_RANGE(1800000, 1, 3, 200000), REGULATOR_LINEAR_RANGE(2800000, 4, 5, 200000), @@ -201,14 +201,14 @@ static const struct regulator_linear_range rk809_buck5_voltage_ranges[] = { #define RK817_BUCK1_SEL_CNT (RK817_BUCK1_SEL0 + RK817_BUCK1_SEL1 + 1) #define RK817_BUCK3_SEL_CNT (RK817_BUCK1_SEL0 + RK817_BUCK3_SEL1 + 1) -static const struct regulator_linear_range rk817_buck1_voltage_ranges[] = { +static const struct linear_range rk817_buck1_voltage_ranges[] = { REGULATOR_LINEAR_RANGE(RK817_BUCK1_MIN0, 0, RK817_BUCK1_SEL0, RK817_BUCK1_STP0), REGULATOR_LINEAR_RANGE(RK817_BUCK1_MIN1, RK817_BUCK1_SEL0 + 1, RK817_BUCK1_SEL_CNT, RK817_BUCK1_STP1), }; -static const struct regulator_linear_range rk817_buck3_voltage_ranges[] = { +static const struct linear_range rk817_buck3_voltage_ranges[] = { REGULATOR_LINEAR_RANGE(RK817_BUCK1_MIN0, 0, RK817_BUCK1_SEL0, RK817_BUCK1_STP0), REGULATOR_LINEAR_RANGE(RK817_BUCK1_MIN1, RK817_BUCK1_SEL0 + 1, @@ -665,7 +665,7 @@ static const struct regulator_ops rk808_switch_ops = { .set_suspend_disable = rk808_set_suspend_disable, }; -static const struct regulator_linear_range rk805_buck_1_2_voltage_ranges[] = { +static const struct linear_range rk805_buck_1_2_voltage_ranges[] = { REGULATOR_LINEAR_RANGE(712500, 0, 59, 12500), REGULATOR_LINEAR_RANGE(1800000, 60, 62, 200000), REGULATOR_LINEAR_RANGE(2300000, 63, 63, 0), diff --git a/drivers/regulator/s2mps11.c b/drivers/regulator/s2mps11.c index 23d288278957..33cf84bce05a 100644 --- a/drivers/regulator/s2mps11.c +++ b/drivers/regulator/s2mps11.c @@ -749,37 +749,37 @@ static const struct regulator_ops s2mps15_reg_buck_ops = { } /* voltage range for s2mps15 LDO 3, 5, 15, 16, 18, 20, 23 and 27 */ -static const struct regulator_linear_range s2mps15_ldo_voltage_ranges1[] = { +static const struct 
linear_range s2mps15_ldo_voltage_ranges1[] = { REGULATOR_LINEAR_RANGE(1000000, 0xc, 0x38, 25000), }; /* voltage range for s2mps15 LDO 2, 6, 14, 17, 19, 21, 24 and 25 */ -static const struct regulator_linear_range s2mps15_ldo_voltage_ranges2[] = { +static const struct linear_range s2mps15_ldo_voltage_ranges2[] = { REGULATOR_LINEAR_RANGE(1800000, 0x0, 0x3f, 25000), }; /* voltage range for s2mps15 LDO 4, 11, 12, 13, 22 and 26 */ -static const struct regulator_linear_range s2mps15_ldo_voltage_ranges3[] = { +static const struct linear_range s2mps15_ldo_voltage_ranges3[] = { REGULATOR_LINEAR_RANGE(700000, 0x0, 0x34, 12500), }; /* voltage range for s2mps15 LDO 7, 8, 9 and 10 */ -static const struct regulator_linear_range s2mps15_ldo_voltage_ranges4[] = { +static const struct linear_range s2mps15_ldo_voltage_ranges4[] = { REGULATOR_LINEAR_RANGE(700000, 0x10, 0x20, 25000), }; /* voltage range for s2mps15 LDO 1 */ -static const struct regulator_linear_range s2mps15_ldo_voltage_ranges5[] = { +static const struct linear_range s2mps15_ldo_voltage_ranges5[] = { REGULATOR_LINEAR_RANGE(500000, 0x0, 0x20, 12500), }; /* voltage range for s2mps15 BUCK 1, 2, 3, 4, 5, 6 and 7 */ -static const struct regulator_linear_range s2mps15_buck_voltage_ranges1[] = { +static const struct linear_range s2mps15_buck_voltage_ranges1[] = { REGULATOR_LINEAR_RANGE(500000, 0x20, 0xc0, 6250), }; /* voltage range for s2mps15 BUCK 8, 9 and 10 */ -static const struct regulator_linear_range s2mps15_buck_voltage_ranges2[] = { +static const struct linear_range s2mps15_buck_voltage_ranges2[] = { REGULATOR_LINEAR_RANGE(1000000, 0x20, 0x78, 12500), }; diff --git a/drivers/regulator/sky81452-regulator.c b/drivers/regulator/sky81452-regulator.c index 177dede82a61..37658affe072 100644 --- a/drivers/regulator/sky81452-regulator.c +++ b/drivers/regulator/sky81452-regulator.c @@ -32,7 +32,7 @@ static const struct regulator_ops sky81452_reg_ops = { .is_enabled = regulator_is_enabled_regmap, }; -static const struct regulator_linear_range sky81452_reg_ranges[] = { +static const struct linear_range sky81452_reg_ranges[] = { REGULATOR_LINEAR_RANGE(4500000, 0, 14, 250000), REGULATOR_LINEAR_RANGE(9000000, 15, 31, 1000000), }; diff --git a/drivers/regulator/stpmic1_regulator.c b/drivers/regulator/stpmic1_regulator.c index f3d7d007ecbb..adc9973d1b2f 100644 --- a/drivers/regulator/stpmic1_regulator.c +++ b/drivers/regulator/stpmic1_regulator.c @@ -57,13 +57,13 @@ enum { /* Ramp delay worst case is (2250uV/uS) */ #define PMIC_RAMP_DELAY 2200 -static const struct regulator_linear_range buck1_ranges[] = { +static const struct linear_range buck1_ranges[] = { REGULATOR_LINEAR_RANGE(725000, 0, 4, 0), REGULATOR_LINEAR_RANGE(725000, 5, 36, 25000), REGULATOR_LINEAR_RANGE(1500000, 37, 63, 0), }; -static const struct regulator_linear_range buck2_ranges[] = { +static const struct linear_range buck2_ranges[] = { REGULATOR_LINEAR_RANGE(1000000, 0, 17, 0), REGULATOR_LINEAR_RANGE(1050000, 18, 19, 0), REGULATOR_LINEAR_RANGE(1100000, 20, 21, 0), @@ -77,7 +77,7 @@ static const struct regulator_linear_range buck2_ranges[] = { REGULATOR_LINEAR_RANGE(1500000, 36, 63, 0), }; -static const struct regulator_linear_range buck3_ranges[] = { +static const struct linear_range buck3_ranges[] = { REGULATOR_LINEAR_RANGE(1000000, 0, 19, 0), REGULATOR_LINEAR_RANGE(1100000, 20, 23, 0), REGULATOR_LINEAR_RANGE(1200000, 24, 27, 0), @@ -87,7 +87,7 @@ static const struct regulator_linear_range buck3_ranges[] = { REGULATOR_LINEAR_RANGE(3400000, 56, 63, 0), }; -static const struct 
regulator_linear_range buck4_ranges[] = { +static const struct linear_range buck4_ranges[] = { REGULATOR_LINEAR_RANGE(600000, 0, 27, 25000), REGULATOR_LINEAR_RANGE(1300000, 28, 29, 0), REGULATOR_LINEAR_RANGE(1350000, 30, 31, 0), @@ -97,19 +97,19 @@ static const struct regulator_linear_range buck4_ranges[] = { REGULATOR_LINEAR_RANGE(3900000, 61, 63, 0), }; -static const struct regulator_linear_range ldo1_ranges[] = { +static const struct linear_range ldo1_ranges[] = { REGULATOR_LINEAR_RANGE(1700000, 0, 7, 0), REGULATOR_LINEAR_RANGE(1700000, 8, 24, 100000), REGULATOR_LINEAR_RANGE(3300000, 25, 31, 0), }; -static const struct regulator_linear_range ldo2_ranges[] = { +static const struct linear_range ldo2_ranges[] = { REGULATOR_LINEAR_RANGE(1700000, 0, 7, 0), REGULATOR_LINEAR_RANGE(1700000, 8, 24, 100000), REGULATOR_LINEAR_RANGE(3300000, 25, 30, 0), }; -static const struct regulator_linear_range ldo3_ranges[] = { +static const struct linear_range ldo3_ranges[] = { REGULATOR_LINEAR_RANGE(1700000, 0, 7, 0), REGULATOR_LINEAR_RANGE(1700000, 8, 24, 100000), REGULATOR_LINEAR_RANGE(3300000, 25, 30, 0), @@ -117,13 +117,13 @@ static const struct regulator_linear_range ldo3_ranges[] = { REGULATOR_LINEAR_RANGE(500000, 31, 31, 0), }; -static const struct regulator_linear_range ldo5_ranges[] = { +static const struct linear_range ldo5_ranges[] = { REGULATOR_LINEAR_RANGE(1700000, 0, 7, 0), REGULATOR_LINEAR_RANGE(1700000, 8, 30, 100000), REGULATOR_LINEAR_RANGE(3900000, 31, 31, 0), }; -static const struct regulator_linear_range ldo6_ranges[] = { +static const struct linear_range ldo6_ranges[] = { REGULATOR_LINEAR_RANGE(900000, 0, 24, 100000), REGULATOR_LINEAR_RANGE(3300000, 25, 31, 0), }; diff --git a/drivers/regulator/tps65086-regulator.c b/drivers/regulator/tps65086-regulator.c index 5a5e9b5bf4be..9910e949373c 100644 --- a/drivers/regulator/tps65086-regulator.c +++ b/drivers/regulator/tps65086-regulator.c @@ -71,23 +71,23 @@ struct tps65086_regulator { unsigned int decay_mask; }; -static const struct regulator_linear_range tps65086_10mv_ranges[] = { +static const struct linear_range tps65086_10mv_ranges[] = { REGULATOR_LINEAR_RANGE(0, 0x0, 0x0, 0), REGULATOR_LINEAR_RANGE(410000, 0x1, 0x7F, 10000), }; -static const struct regulator_linear_range tps65086_buck126_25mv_ranges[] = { +static const struct linear_range tps65086_buck126_25mv_ranges[] = { REGULATOR_LINEAR_RANGE(0, 0x0, 0x0, 0), REGULATOR_LINEAR_RANGE(1000000, 0x1, 0x18, 0), REGULATOR_LINEAR_RANGE(1025000, 0x19, 0x7F, 25000), }; -static const struct regulator_linear_range tps65086_buck345_25mv_ranges[] = { +static const struct linear_range tps65086_buck345_25mv_ranges[] = { REGULATOR_LINEAR_RANGE(0, 0x0, 0x0, 0), REGULATOR_LINEAR_RANGE(425000, 0x1, 0x7F, 25000), }; -static const struct regulator_linear_range tps65086_ldoa1_ranges[] = { +static const struct linear_range tps65086_ldoa1_ranges[] = { REGULATOR_LINEAR_RANGE(1350000, 0x0, 0x0, 0), REGULATOR_LINEAR_RANGE(1500000, 0x1, 0x7, 100000), REGULATOR_LINEAR_RANGE(2300000, 0x8, 0xB, 100000), @@ -95,7 +95,7 @@ static const struct regulator_linear_range tps65086_ldoa1_ranges[] = { REGULATOR_LINEAR_RANGE(3300000, 0xE, 0xE, 0), }; -static const struct regulator_linear_range tps65086_ldoa23_ranges[] = { +static const struct linear_range tps65086_ldoa23_ranges[] = { REGULATOR_LINEAR_RANGE(700000, 0x0, 0xD, 50000), REGULATOR_LINEAR_RANGE(1400000, 0xE, 0xF, 100000), }; diff --git a/drivers/regulator/tps65217-regulator.c b/drivers/regulator/tps65217-regulator.c index 67ba78da77ec..d27dbbafcf72 100644 --- 
a/drivers/regulator/tps65217-regulator.c +++ b/drivers/regulator/tps65217-regulator.c @@ -56,14 +56,14 @@ static const unsigned int LDO1_VSEL_table[] = { 2800000, 3000000, 3100000, 3300000, }; -static const struct regulator_linear_range tps65217_uv1_ranges[] = { +static const struct linear_range tps65217_uv1_ranges[] = { REGULATOR_LINEAR_RANGE(900000, 0, 24, 25000), REGULATOR_LINEAR_RANGE(1550000, 25, 52, 50000), REGULATOR_LINEAR_RANGE(3000000, 53, 55, 100000), REGULATOR_LINEAR_RANGE(3300000, 56, 63, 0), }; -static const struct regulator_linear_range tps65217_uv2_ranges[] = { +static const struct linear_range tps65217_uv2_ranges[] = { REGULATOR_LINEAR_RANGE(1500000, 0, 8, 50000), REGULATOR_LINEAR_RANGE(2000000, 9, 13, 100000), REGULATOR_LINEAR_RANGE(2450000, 14, 31, 50000), diff --git a/drivers/regulator/tps65218-regulator.c b/drivers/regulator/tps65218-regulator.c index b72035610013..05d13f807918 100644 --- a/drivers/regulator/tps65218-regulator.c +++ b/drivers/regulator/tps65218-regulator.c @@ -56,17 +56,17 @@ .bypass_mask = _sm, \ } \ -static const struct regulator_linear_range dcdc1_dcdc2_ranges[] = { +static const struct linear_range dcdc1_dcdc2_ranges[] = { REGULATOR_LINEAR_RANGE(850000, 0x0, 0x32, 10000), REGULATOR_LINEAR_RANGE(1375000, 0x33, 0x3f, 25000), }; -static const struct regulator_linear_range ldo1_dcdc3_ranges[] = { +static const struct linear_range ldo1_dcdc3_ranges[] = { REGULATOR_LINEAR_RANGE(900000, 0x0, 0x1a, 25000), REGULATOR_LINEAR_RANGE(1600000, 0x1b, 0x3f, 50000), }; -static const struct regulator_linear_range dcdc4_ranges[] = { +static const struct linear_range dcdc4_ranges[] = { REGULATOR_LINEAR_RANGE(1175000, 0x0, 0xf, 25000), REGULATOR_LINEAR_RANGE(1600000, 0x10, 0x34, 50000), }; diff --git a/drivers/regulator/tps65912-regulator.c b/drivers/regulator/tps65912-regulator.c index 276faeddc370..15c79931ea89 100644 --- a/drivers/regulator/tps65912-regulator.c +++ b/drivers/regulator/tps65912-regulator.c @@ -46,11 +46,11 @@ enum tps65912_regulators { DCDC1, DCDC2, DCDC3, DCDC4, LDO1, LDO2, LDO3, .n_linear_ranges = ARRAY_SIZE(_lr), \ } -static const struct regulator_linear_range tps65912_dcdc_ranges[] = { +static const struct linear_range tps65912_dcdc_ranges[] = { REGULATOR_LINEAR_RANGE(500000, 0x0, 0x3f, 50000), }; -static const struct regulator_linear_range tps65912_ldo_ranges[] = { +static const struct linear_range tps65912_ldo_ranges[] = { REGULATOR_LINEAR_RANGE(800000, 0x0, 0x20, 25000), REGULATOR_LINEAR_RANGE(1650000, 0x21, 0x3c, 50000), REGULATOR_LINEAR_RANGE(3100000, 0x3d, 0x3f, 100000), diff --git a/drivers/regulator/twl-regulator.c b/drivers/regulator/twl-regulator.c index 866b4dd01da9..4a51cfea45ac 100644 --- a/drivers/regulator/twl-regulator.c +++ b/drivers/regulator/twl-regulator.c @@ -360,12 +360,12 @@ static const u16 VINTANA2_VSEL_table[] = { }; /* 600mV to 1450mV in 12.5 mV steps */ -static const struct regulator_linear_range VDD1_ranges[] = { +static const struct linear_range VDD1_ranges[] = { REGULATOR_LINEAR_RANGE(600000, 0, 68, 12500) }; /* 600mV to 1450mV in 12.5 mV steps, everything above = 1500mV */ -static const struct regulator_linear_range VDD2_ranges[] = { +static const struct linear_range VDD2_ranges[] = { REGULATOR_LINEAR_RANGE(600000, 0, 68, 12500), REGULATOR_LINEAR_RANGE(1500000, 69, 69, 12500) }; diff --git a/drivers/regulator/twl6030-regulator.c b/drivers/regulator/twl6030-regulator.c index b8100c3cedad..f7db250a7583 100644 --- a/drivers/regulator/twl6030-regulator.c +++ b/drivers/regulator/twl6030-regulator.c @@ -495,7 +495,7 @@ 
static const struct regulator_ops twlsmps_ops = { }; /*----------------------------------------------------------------------*/ -static const struct regulator_linear_range twl6030ldo_linear_range[] = { +static const struct linear_range twl6030ldo_linear_range[] = { REGULATOR_LINEAR_RANGE(0, 0, 0, 0), REGULATOR_LINEAR_RANGE(1000000, 1, 24, 100000), REGULATOR_LINEAR_RANGE(2750000, 31, 31, 0), diff --git a/drivers/regulator/wm831x-dcdc.c b/drivers/regulator/wm831x-dcdc.c index 018dbbd96771..ad2203d11a88 100644 --- a/drivers/regulator/wm831x-dcdc.c +++ b/drivers/regulator/wm831x-dcdc.c @@ -204,7 +204,7 @@ static irqreturn_t wm831x_dcdc_oc_irq(int irq, void *data) * BUCKV specifics */ -static const struct regulator_linear_range wm831x_buckv_ranges[] = { +static const struct linear_range wm831x_buckv_ranges[] = { REGULATOR_LINEAR_RANGE(600000, 0, 0x7, 0), REGULATOR_LINEAR_RANGE(600000, 0x8, 0x68, 12500), }; diff --git a/drivers/regulator/wm831x-ldo.c b/drivers/regulator/wm831x-ldo.c index 56754686c982..7b6cf4810cb7 100644 --- a/drivers/regulator/wm831x-ldo.c +++ b/drivers/regulator/wm831x-ldo.c @@ -59,7 +59,7 @@ static irqreturn_t wm831x_ldo_uv_irq(int irq, void *data) * General purpose LDOs */ -static const struct regulator_linear_range wm831x_gp_ldo_ranges[] = { +static const struct linear_range wm831x_gp_ldo_ranges[] = { REGULATOR_LINEAR_RANGE(900000, 0, 14, 50000), REGULATOR_LINEAR_RANGE(1700000, 15, 31, 100000), }; @@ -312,7 +312,7 @@ static struct platform_driver wm831x_gp_ldo_driver = { * Analogue LDOs */ -static const struct regulator_linear_range wm831x_aldo_ranges[] = { +static const struct linear_range wm831x_aldo_ranges[] = { REGULATOR_LINEAR_RANGE(1000000, 0, 12, 50000), REGULATOR_LINEAR_RANGE(1700000, 13, 31, 100000), }; diff --git a/drivers/regulator/wm8350-regulator.c b/drivers/regulator/wm8350-regulator.c index 56d6168a888d..ae5f0e7fce8b 100644 --- a/drivers/regulator/wm8350-regulator.c +++ b/drivers/regulator/wm8350-regulator.c @@ -470,7 +470,7 @@ static int wm8350_dcdc_set_suspend_mode(struct regulator_dev *rdev, return 0; } -static const struct regulator_linear_range wm8350_ldo_ranges[] = { +static const struct linear_range wm8350_ldo_ranges[] = { REGULATOR_LINEAR_RANGE(900000, 0, 15, 50000), REGULATOR_LINEAR_RANGE(1800000, 16, 31, 100000), }; diff --git a/drivers/regulator/wm8400-regulator.c b/drivers/regulator/wm8400-regulator.c index 6f331b51e479..4cb1fbb59722 100644 --- a/drivers/regulator/wm8400-regulator.c +++ b/drivers/regulator/wm8400-regulator.c @@ -13,7 +13,7 @@ #include #include -static const struct regulator_linear_range wm8400_ldo_ranges[] = { +static const struct linear_range wm8400_ldo_ranges[] = { REGULATOR_LINEAR_RANGE(900000, 0, 14, 50000), REGULATOR_LINEAR_RANGE(1700000, 15, 31, 100000), }; diff --git a/include/linux/regulator/driver.h b/include/linux/regulator/driver.h index 29d920516e0b..7eb9fea8e482 100644 --- a/include/linux/regulator/driver.h +++ b/include/linux/regulator/driver.h @@ -13,6 +13,7 @@ #define __LINUX_REGULATOR_DRIVER_H_ #include +#include #include #include #include @@ -39,31 +40,13 @@ enum regulator_status { REGULATOR_STATUS_UNDEFINED, }; -/** - * struct regulator_linear_range - specify linear voltage ranges - * - * Specify a range of voltages for regulator_map_linear_range() and - * regulator_list_linear_range(). 
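 * (Editorial note, not part of the original patch: the generic
 *  struct linear_range that replaces this struct uses .min and .step
 *  in place of .min_uV and .uV_step, so, as the updated macro below
 *  shows,
 *
 *	REGULATOR_LINEAR_RANGE(900000, 0, 14, 50000)
 *
 *  now initializes { .min = 900000, .min_sel = 0, .max_sel = 14,
 *  .step = 50000 }.)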
- * - * @min_uV: Lowest voltage in range - * @min_sel: Lowest selector for range - * @max_sel: Highest selector for range - * @uV_step: Step size - */ -struct regulator_linear_range { - unsigned int min_uV; - unsigned int min_sel; - unsigned int max_sel; - unsigned int uV_step; -}; - -/* Initialize struct regulator_linear_range */ +/* Initialize struct linear_range for regulators */ #define REGULATOR_LINEAR_RANGE(_min_uV, _min_sel, _max_sel, _step_uV) \ { \ - .min_uV = _min_uV, \ + .min = _min_uV, \ .min_sel = _min_sel, \ .max_sel = _max_sel, \ - .uV_step = _step_uV, \ + .step = _step_uV, \ } /** @@ -348,7 +331,7 @@ struct regulator_desc { unsigned int ramp_delay; int min_dropout_uV; - const struct regulator_linear_range *linear_ranges; + const struct linear_range *linear_ranges; const unsigned int *linear_range_selectors; int n_linear_ranges; -- cgit v1.2.3 From 62a7f3009a460001eb46984395280dd900bc4ef4 Mon Sep 17 00:00:00 2001 From: Kai-Heng Feng Date: Fri, 8 May 2020 14:53:40 +0800 Subject: serial: 8250_pci: Move Pericom IDs to pci_ids.h Move the IDs to pci_ids.h so it can be used by next patch. Link: https://lore.kernel.org/r/20200508065343.32751-1-kai.heng.feng@canonical.com Signed-off-by: Kai-Heng Feng Signed-off-by: Bjorn Helgaas Acked-by: Greg Kroah-Hartman Cc: stable@vger.kernel.org --- drivers/tty/serial/8250/8250_pci.c | 6 ------ include/linux/pci_ids.h | 6 ++++++ 2 files changed, 6 insertions(+), 6 deletions(-) (limited to 'include/linux') diff --git a/drivers/tty/serial/8250/8250_pci.c b/drivers/tty/serial/8250/8250_pci.c index 0804469ff052..1a74d511b02a 100644 --- a/drivers/tty/serial/8250/8250_pci.c +++ b/drivers/tty/serial/8250/8250_pci.c @@ -1869,12 +1869,6 @@ pci_moxa_setup(struct serial_private *priv, #define PCIE_DEVICE_ID_WCH_CH384_4S 0x3470 #define PCIE_DEVICE_ID_WCH_CH382_2S 0x3253 -#define PCI_VENDOR_ID_PERICOM 0x12D8 -#define PCI_DEVICE_ID_PERICOM_PI7C9X7951 0x7951 -#define PCI_DEVICE_ID_PERICOM_PI7C9X7952 0x7952 -#define PCI_DEVICE_ID_PERICOM_PI7C9X7954 0x7954 -#define PCI_DEVICE_ID_PERICOM_PI7C9X7958 0x7958 - #define PCI_VENDOR_ID_ACCESIO 0x494f #define PCI_DEVICE_ID_ACCESIO_PCIE_COM_2SDB 0x1051 #define PCI_DEVICE_ID_ACCESIO_MPCIE_COM_2S 0x1053 diff --git a/include/linux/pci_ids.h b/include/linux/pci_ids.h index 1dfc4e1dcb94..9a57e6717e5c 100644 --- a/include/linux/pci_ids.h +++ b/include/linux/pci_ids.h @@ -1832,6 +1832,12 @@ #define PCI_VENDOR_ID_NVIDIA_SGS 0x12d2 #define PCI_DEVICE_ID_NVIDIA_SGS_RIVA128 0x0018 +#define PCI_VENDOR_ID_PERICOM 0x12D8 +#define PCI_DEVICE_ID_PERICOM_PI7C9X7951 0x7951 +#define PCI_DEVICE_ID_PERICOM_PI7C9X7952 0x7952 +#define PCI_DEVICE_ID_PERICOM_PI7C9X7954 0x7954 +#define PCI_DEVICE_ID_PERICOM_PI7C9X7958 0x7958 + #define PCI_SUBVENDOR_ID_CHASE_PCIFAST 0x12E0 #define PCI_SUBDEVICE_ID_CHASE_PCIFAST4 0x0031 #define PCI_SUBDEVICE_ID_CHASE_PCIFAST8 0x0021 -- cgit v1.2.3 From 52782c92ac85c4e393eb4a903a62e6c24afa633f Mon Sep 17 00:00:00 2001 From: "J. Bruce Fields" Date: Wed, 6 May 2020 12:09:34 -0400 Subject: kthread: save thread function It's handy to keep the kthread_fn just as a unique cookie to identify classes of kthreads. E.g. if you can verify that a given task is running your thread_fn, then you may know what sort of type kthread_data points to. We'll use this in nfsd to pass some information into the vfs. Note it will need kthread_data() exported too. Original-patch-by: Tejun Heo Signed-off-by: J. 
Bruce Fields --- include/linux/kthread.h | 1 + kernel/kthread.c | 17 +++++++++++++++++ 2 files changed, 18 insertions(+) (limited to 'include/linux') diff --git a/include/linux/kthread.h b/include/linux/kthread.h index 8bbcaad7ef0f..c2a274b79c42 100644 --- a/include/linux/kthread.h +++ b/include/linux/kthread.h @@ -57,6 +57,7 @@ bool kthread_should_stop(void); bool kthread_should_park(void); bool __kthread_should_park(struct task_struct *k); bool kthread_freezable_should_stop(bool *was_frozen); +void *kthread_func(struct task_struct *k); void *kthread_data(struct task_struct *k); void *kthread_probe_data(struct task_struct *k); int kthread_park(struct task_struct *k); diff --git a/kernel/kthread.c b/kernel/kthread.c index bfbfa481be3a..b84fc7eec035 100644 --- a/kernel/kthread.c +++ b/kernel/kthread.c @@ -46,6 +46,7 @@ struct kthread_create_info struct kthread { unsigned long flags; unsigned int cpu; + int (*threadfn)(void *); void *data; struct completion parked; struct completion exited; @@ -152,6 +153,20 @@ bool kthread_freezable_should_stop(bool *was_frozen) } EXPORT_SYMBOL_GPL(kthread_freezable_should_stop); +/** + * kthread_func - return the function specified on kthread creation + * @task: kthread task in question + * + * Returns NULL if the task is not a kthread. + */ +void *kthread_func(struct task_struct *task) +{ + if (task->flags & PF_KTHREAD) + return to_kthread(task)->threadfn; + return NULL; +} +EXPORT_SYMBOL_GPL(kthread_func); + /** * kthread_data - return data value specified on kthread creation * @task: kthread task in question @@ -164,6 +179,7 @@ void *kthread_data(struct task_struct *task) { return to_kthread(task)->data; } +EXPORT_SYMBOL_GPL(kthread_data); /** * kthread_probe_data - speculative version of kthread_data() @@ -244,6 +260,7 @@ static int kthread(void *_create) do_exit(-ENOMEM); } + self->threadfn = threadfn; self->data = data; init_completion(&self->exited); init_completion(&self->parked); -- cgit v1.2.3 From 28df3d1539de5090f7916f6fff03891b67f366f4 Mon Sep 17 00:00:00 2001 From: "J. Bruce Fields" Date: Fri, 28 Jul 2017 16:35:15 -0400 Subject: nfsd: clients don't need to break their own delegations We currently revoke read delegations on any write open or any operation that modifies file data or metadata (including rename, link, and unlink). But if the delegation in question is the only read delegation and is held by the client performing the operation, that's not really necessary. It's not always possible to prevent this in the NFSv4.0 case, because there's not always a way to determine which client an NFSv4.0 delegation came from. (In theory we could try to guess this from the transport layer, e.g., by assuming all traffic on a given TCP connection comes from the same client. But that's not really correct.) In the NFSv4.1 case the session layer always tells us the client. This patch should remove such self-conflicts in all cases where we can reliably determine the client from the compound. To do that we need to track "who" is performing a given (possibly lease-breaking) file operation. We're doing that by storing the information in the svc_rqst and using kthread_data() to map the current task back to a svc_rqst. Signed-off-by: J. 
Bruce Fields --- Documentation/filesystems/locking.rst | 2 ++ fs/locks.c | 3 +++ fs/nfsd/nfs4proc.c | 2 ++ fs/nfsd/nfs4state.c | 14 ++++++++++++++ fs/nfsd/nfsd.h | 2 ++ fs/nfsd/nfssvc.c | 6 ++++++ include/linux/fs.h | 1 + include/linux/sunrpc/svc.h | 1 + 8 files changed, 31 insertions(+) (limited to 'include/linux') diff --git a/Documentation/filesystems/locking.rst b/Documentation/filesystems/locking.rst index 5057e4d9dcd1..9fdcec416614 100644 --- a/Documentation/filesystems/locking.rst +++ b/Documentation/filesystems/locking.rst @@ -425,6 +425,7 @@ prototypes:: int (*lm_grant)(struct file_lock *, struct file_lock *, int); void (*lm_break)(struct file_lock *); /* break_lease callback */ int (*lm_change)(struct file_lock **, int); + bool (*lm_breaker_owns_lease)(struct file_lock *); locking rules: @@ -435,6 +436,7 @@ lm_notify: yes yes no lm_grant: no no no lm_break: yes no no lm_change yes no no +lm_breaker_owns_lease: no no no ========== ============= ================= ========= buffer_head diff --git a/fs/locks.c b/fs/locks.c index b8a31c1c4fff..a3f186846e93 100644 --- a/fs/locks.c +++ b/fs/locks.c @@ -1557,6 +1557,9 @@ static bool leases_conflict(struct file_lock *lease, struct file_lock *breaker) { bool rc; + if (lease->fl_lmops->lm_breaker_owns_lease + && lease->fl_lmops->lm_breaker_owns_lease(lease)) + return false; if ((breaker->fl_flags & FL_LAYOUT) != (lease->fl_flags & FL_LAYOUT)) { rc = false; goto trace; diff --git a/fs/nfsd/nfs4proc.c b/fs/nfsd/nfs4proc.c index 0e75f7fb5fec..a6d73aa51ce4 100644 --- a/fs/nfsd/nfs4proc.c +++ b/fs/nfsd/nfs4proc.c @@ -2302,6 +2302,8 @@ nfsd4_proc_compound(struct svc_rqst *rqstp) } check_if_stalefh_allowed(args); + rqstp->rq_lease_breaker = (void **)&cstate->clp; + trace_nfsd_compound(rqstp, args->opcnt); while (!status && resp->opcnt < args->opcnt) { op = &args->ops[resp->opcnt++]; diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c index fe88f5f23cf4..31388046c6fe 100644 --- a/fs/nfsd/nfs4state.c +++ b/fs/nfsd/nfs4state.c @@ -4579,6 +4579,19 @@ nfsd_break_deleg_cb(struct file_lock *fl) return ret; } +static bool nfsd_breaker_owns_lease(struct file_lock *fl) +{ + struct nfs4_delegation *dl = fl->fl_owner; + struct svc_rqst *rqst; + struct nfs4_client *clp; + + if (!i_am_nfsd()) + return NULL; + rqst = kthread_data(current); + clp = *(rqst->rq_lease_breaker); + return dl->dl_stid.sc_client == clp; +} + static int nfsd_change_deleg_cb(struct file_lock *onlist, int arg, struct list_head *dispose) @@ -4590,6 +4603,7 @@ nfsd_change_deleg_cb(struct file_lock *onlist, int arg, } static const struct lock_manager_operations nfsd_lease_mng_ops = { + .lm_breaker_owns_lease = nfsd_breaker_owns_lease, .lm_break = nfsd_break_deleg_cb, .lm_change = nfsd_change_deleg_cb, }; diff --git a/fs/nfsd/nfsd.h b/fs/nfsd/nfsd.h index 2ab5569126b8..36cdd81b6688 100644 --- a/fs/nfsd/nfsd.h +++ b/fs/nfsd/nfsd.h @@ -88,6 +88,8 @@ int nfsd_pool_stats_release(struct inode *, struct file *); void nfsd_destroy(struct net *net); +bool i_am_nfsd(void); + struct nfsdfs_client { struct kref cl_ref; void (*cl_release)(struct kref *kref); diff --git a/fs/nfsd/nfssvc.c b/fs/nfsd/nfssvc.c index ca9fd348548b..4f588c0eaaf4 100644 --- a/fs/nfsd/nfssvc.c +++ b/fs/nfsd/nfssvc.c @@ -601,6 +601,11 @@ static const struct svc_serv_ops nfsd_thread_sv_ops = { .svo_module = THIS_MODULE, }; +bool i_am_nfsd() +{ + return kthread_func(current) == nfsd; +} + int nfsd_create_serv(struct net *net) { int error; @@ -1011,6 +1016,7 @@ nfsd_dispatch(struct svc_rqst *rqstp, __be32 *statp) *statp = 
rpc_garbage_args; return 1; } + rqstp->rq_lease_breaker = NULL; /* * Give the xdr decoder a chance to change this if it wants * (necessary in the NFSv4.0 compound case) diff --git a/include/linux/fs.h b/include/linux/fs.h index 4f6f59b4f22a..4b784560ffaf 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -1045,6 +1045,7 @@ struct lock_manager_operations { bool (*lm_break)(struct file_lock *); int (*lm_change)(struct file_lock *, int, struct list_head *); void (*lm_setup)(struct file_lock *, void **); + bool (*lm_breaker_owns_lease)(struct file_lock *); }; struct lock_manager { diff --git a/include/linux/sunrpc/svc.h b/include/linux/sunrpc/svc.h index fd390894a584..abf4a57ce4a7 100644 --- a/include/linux/sunrpc/svc.h +++ b/include/linux/sunrpc/svc.h @@ -299,6 +299,7 @@ struct svc_rqst { struct net *rq_bc_net; /* pointer to backchannel's * net namespace */ + void ** rq_lease_breaker; /* The v4 client breaking a lease */ }; #define SVC_NET(rqst) (rqst->rq_xprt ? rqst->rq_xprt->xpt_net : rqst->rq_bc_net) -- cgit v1.2.3 From f2a8d52e0a4db968c346c4332630a71cba377567 Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Tue, 5 May 2020 16:04:30 +0200 Subject: nsproxy: add struct nsset Add a simple struct nsset. It holds all necessary pieces to switch to a new set of namespaces without leaving a task in a half-switched state which we will make use of in the next patch. This patch switches the existing setns logic over without causing a change in setns() behavior. This brings setns() closer to how unshare() works(). The prepare_ns() function is responsible to prepare all necessary information. This has two reasons. First it minimizes dependencies between individual namespaces, i.e. all install handler can expect that all fields are properly initialized independent in what order they are called in. Second, this makes the code easier to maintain and easier to follow if it needs to be changed. The prepare_ns() helper will only be switched over to use a flags argument in the next patch. Here it will still use nstype as a simple integer argument which was argued would be clearer. I'm not particularly opinionated about this if it really helps or not. The struct nsset itself already contains the flags field since its name already indicates that it can contain information required by different namespaces. None of this should have functional consequences. Signed-off-by: Christian Brauner Reviewed-by: Serge Hallyn Cc: Eric W. 
Biederman Cc: Serge Hallyn Cc: Jann Horn Cc: Michael Kerrisk Cc: Aleksa Sarai Link: https://lore.kernel.org/r/20200505140432.181565-2-christian.brauner@ubuntu.com --- fs/namespace.c | 10 +++-- include/linux/mnt_namespace.h | 1 + include/linux/nsproxy.h | 24 ++++++++++++ include/linux/proc_ns.h | 4 +- ipc/namespace.c | 7 ++-- kernel/cgroup/namespace.c | 5 ++- kernel/nsproxy.c | 90 ++++++++++++++++++++++++++++++++++++------- kernel/pid_namespace.c | 5 ++- kernel/time/namespace.c | 5 ++- kernel/user_namespace.c | 8 ++-- kernel/utsname.c | 5 ++- net/core/net_namespace.c | 5 ++- 12 files changed, 132 insertions(+), 37 deletions(-) (limited to 'include/linux') diff --git a/fs/namespace.c b/fs/namespace.c index a28e4db075ed..62899fad4a04 100644 --- a/fs/namespace.c +++ b/fs/namespace.c @@ -3954,16 +3954,18 @@ static void mntns_put(struct ns_common *ns) put_mnt_ns(to_mnt_ns(ns)); } -static int mntns_install(struct nsproxy *nsproxy, struct ns_common *ns) +static int mntns_install(struct nsset *nsset, struct ns_common *ns) { - struct fs_struct *fs = current->fs; + struct nsproxy *nsproxy = nsset->nsproxy; + struct fs_struct *fs = nsset->fs; struct mnt_namespace *mnt_ns = to_mnt_ns(ns), *old_mnt_ns; + struct user_namespace *user_ns = nsset->cred->user_ns; struct path root; int err; if (!ns_capable(mnt_ns->user_ns, CAP_SYS_ADMIN) || - !ns_capable(current_user_ns(), CAP_SYS_CHROOT) || - !ns_capable(current_user_ns(), CAP_SYS_ADMIN)) + !ns_capable(user_ns, CAP_SYS_CHROOT) || + !ns_capable(user_ns, CAP_SYS_ADMIN)) return -EPERM; if (is_anon_ns(mnt_ns)) diff --git a/include/linux/mnt_namespace.h b/include/linux/mnt_namespace.h index 35942084cd40..007cfa52efb2 100644 --- a/include/linux/mnt_namespace.h +++ b/include/linux/mnt_namespace.h @@ -6,6 +6,7 @@ struct mnt_namespace; struct fs_struct; struct user_namespace; +struct ns_common; extern struct mnt_namespace *copy_mnt_ns(unsigned long, struct mnt_namespace *, struct user_namespace *, struct fs_struct *); diff --git a/include/linux/nsproxy.h b/include/linux/nsproxy.h index 074f395b9ad2..cdb171efc7cb 100644 --- a/include/linux/nsproxy.h +++ b/include/linux/nsproxy.h @@ -41,6 +41,30 @@ struct nsproxy { }; extern struct nsproxy init_nsproxy; +/* + * A structure to encompass all bits needed to install + * a partial or complete new set of namespaces. + * + * If a new user namespace is requested cred will + * point to a modifiable set of credentials. If a pointer + * to a modifiable set is needed nsset_cred() must be + * used and tested. 
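 * (Editorial illustration, not part of the original patch: an install
 *  handler that needs to modify credentials does, as the
 *  userns_install() conversion later in this patch shows,
 *
 *	struct cred *cred = nsset_cred(nsset);
 *
 *	if (!cred)
 *		return -EINVAL;
 *
 *  relying on the NULL return to reject an nsset that was not
 *  prepared with CLONE_NEWUSER.)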
+ */ +struct nsset { + unsigned flags; + struct nsproxy *nsproxy; + struct fs_struct *fs; + const struct cred *cred; +}; + +static inline struct cred *nsset_cred(struct nsset *set) +{ + if (set->flags & CLONE_NEWUSER) + return (struct cred *)set->cred; + + return NULL; +} + /* * the namespaces access rules are: * diff --git a/include/linux/proc_ns.h b/include/linux/proc_ns.h index 6abe85c34681..75807ecef880 100644 --- a/include/linux/proc_ns.h +++ b/include/linux/proc_ns.h @@ -8,7 +8,7 @@ #include struct pid_namespace; -struct nsproxy; +struct nsset; struct path; struct task_struct; struct inode; @@ -19,7 +19,7 @@ struct proc_ns_operations { int type; struct ns_common *(*get)(struct task_struct *task); void (*put)(struct ns_common *ns); - int (*install)(struct nsproxy *nsproxy, struct ns_common *ns); + int (*install)(struct nsset *nsset, struct ns_common *ns); struct user_namespace *(*owner)(struct ns_common *ns); struct ns_common *(*get_parent)(struct ns_common *ns); } __randomize_layout; diff --git a/ipc/namespace.c b/ipc/namespace.c index b3ca1476ca51..fdc3b5f3f53a 100644 --- a/ipc/namespace.c +++ b/ipc/namespace.c @@ -177,15 +177,14 @@ static void ipcns_put(struct ns_common *ns) return put_ipc_ns(to_ipc_ns(ns)); } -static int ipcns_install(struct nsproxy *nsproxy, struct ns_common *new) +static int ipcns_install(struct nsset *nsset, struct ns_common *new) { + struct nsproxy *nsproxy = nsset->nsproxy; struct ipc_namespace *ns = to_ipc_ns(new); if (!ns_capable(ns->user_ns, CAP_SYS_ADMIN) || - !ns_capable(current_user_ns(), CAP_SYS_ADMIN)) + !ns_capable(nsset->cred->user_ns, CAP_SYS_ADMIN)) return -EPERM; - /* Ditch state from the old ipc namespace */ - exit_sem(current); put_ipc_ns(nsproxy->ipc_ns); nsproxy->ipc_ns = get_ipc_ns(ns); return 0; diff --git a/kernel/cgroup/namespace.c b/kernel/cgroup/namespace.c index b05f1dd58a62..812a61afd538 100644 --- a/kernel/cgroup/namespace.c +++ b/kernel/cgroup/namespace.c @@ -95,11 +95,12 @@ static inline struct cgroup_namespace *to_cg_ns(struct ns_common *ns) return container_of(ns, struct cgroup_namespace, ns); } -static int cgroupns_install(struct nsproxy *nsproxy, struct ns_common *ns) +static int cgroupns_install(struct nsset *nsset, struct ns_common *ns) { + struct nsproxy *nsproxy = nsset->nsproxy; struct cgroup_namespace *cgroup_ns = to_cg_ns(ns); - if (!ns_capable(current_user_ns(), CAP_SYS_ADMIN) || + if (!ns_capable(nsset->cred->user_ns, CAP_SYS_ADMIN) || !ns_capable(cgroup_ns->user_ns, CAP_SYS_ADMIN)) return -EPERM; diff --git a/kernel/nsproxy.c b/kernel/nsproxy.c index ed9882108cd2..b7954fd60475 100644 --- a/kernel/nsproxy.c +++ b/kernel/nsproxy.c @@ -19,6 +19,7 @@ #include #include #include +#include #include #include #include @@ -257,12 +258,79 @@ void exit_task_namespaces(struct task_struct *p) switch_task_namespaces(p, NULL); } +static void put_nsset(struct nsset *nsset) +{ + unsigned flags = nsset->flags; + + if (flags & CLONE_NEWUSER) + put_cred(nsset_cred(nsset)); + if (nsset->nsproxy) + free_nsproxy(nsset->nsproxy); +} + +static int prepare_nsset(int nstype, struct nsset *nsset) +{ + struct task_struct *me = current; + + nsset->nsproxy = create_new_namespaces(0, me, current_user_ns(), me->fs); + if (IS_ERR(nsset->nsproxy)) + return PTR_ERR(nsset->nsproxy); + + if (nstype == CLONE_NEWUSER) + nsset->cred = prepare_creds(); + else + nsset->cred = current_cred(); + if (!nsset->cred) + goto out; + + if (nstype == CLONE_NEWNS) + nsset->fs = me->fs; + + nsset->flags = nstype; + return 0; + +out: + put_nsset(nsset); + return -ENOMEM; +} 
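/*
 * (Editorial sketch, not part of the original patch: the lifecycle
 * these helpers implement is prepare -> install -> commit, with
 * put_nsset() releasing whatever commit_nsset() did not transfer.
 * The setns() rewrite below therefore reduces to:
 *
 *	struct nsset nsset = {};
 *	int err = prepare_nsset(ns->ops->type, &nsset);
 *
 *	if (!err) {
 *		err = ns->ops->install(&nsset, ns);
 *		if (!err)
 *			commit_nsset(&nsset);
 *	}
 *	put_nsset(&nsset);
 *
 * so a failed install leaves the task fully in its old namespaces.)
 */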
+ +/* + * This is the point of no return. There are just a few namespaces + * that do some actual work here and it's sufficiently minimal that + * a separate ns_common operation seems unnecessary for now. + * Unshare is doing the same thing. If we'll end up needing to do + * more in a given namespace or a helper here is ultimately not + * exported anymore a simple commit handler for each namespace + * should be added to ns_common. + */ +static void commit_nsset(struct nsset *nsset) +{ + unsigned flags = nsset->flags; + struct task_struct *me = current; + +#ifdef CONFIG_USER_NS + if (flags & CLONE_NEWUSER) { + /* transfer ownership */ + commit_creds(nsset_cred(nsset)); + nsset->cred = NULL; + } +#endif + +#ifdef CONFIG_IPC_NS + if (flags & CLONE_NEWIPC) + exit_sem(me); +#endif + + /* transfer ownership */ + switch_task_namespaces(me, nsset->nsproxy); + nsset->nsproxy = NULL; +} + SYSCALL_DEFINE2(setns, int, fd, int, nstype) { - struct task_struct *tsk = current; - struct nsproxy *new_nsproxy; struct file *file; struct ns_common *ns; + struct nsset nsset = {}; int err; file = proc_ns_fget(fd); @@ -274,20 +342,16 @@ SYSCALL_DEFINE2(setns, int, fd, int, nstype) if (nstype && (ns->ops->type != nstype)) goto out; - new_nsproxy = create_new_namespaces(0, tsk, current_user_ns(), tsk->fs); - if (IS_ERR(new_nsproxy)) { - err = PTR_ERR(new_nsproxy); + err = prepare_nsset(ns->ops->type, &nsset); + if (err) goto out; - } - err = ns->ops->install(new_nsproxy, ns); - if (err) { - free_nsproxy(new_nsproxy); - goto out; + err = ns->ops->install(&nsset, ns); + if (!err) { + commit_nsset(&nsset); + perf_event_namespaces(current); } - switch_task_namespaces(tsk, new_nsproxy); - - perf_event_namespaces(tsk); + put_nsset(&nsset); out: fput(file); return err; diff --git a/kernel/pid_namespace.c b/kernel/pid_namespace.c index 01f8ba32cc0c..11db2bdbb41e 100644 --- a/kernel/pid_namespace.c +++ b/kernel/pid_namespace.c @@ -378,13 +378,14 @@ static void pidns_put(struct ns_common *ns) put_pid_ns(to_pid_ns(ns)); } -static int pidns_install(struct nsproxy *nsproxy, struct ns_common *ns) +static int pidns_install(struct nsset *nsset, struct ns_common *ns) { + struct nsproxy *nsproxy = nsset->nsproxy; struct pid_namespace *active = task_active_pid_ns(current); struct pid_namespace *ancestor, *new = to_pid_ns(ns); if (!ns_capable(new->user_ns, CAP_SYS_ADMIN) || - !ns_capable(current_user_ns(), CAP_SYS_ADMIN)) + !ns_capable(nsset->cred->user_ns, CAP_SYS_ADMIN)) return -EPERM; /* diff --git a/kernel/time/namespace.c b/kernel/time/namespace.c index 53bce347cd50..5d9fc22d836a 100644 --- a/kernel/time/namespace.c +++ b/kernel/time/namespace.c @@ -280,8 +280,9 @@ static void timens_put(struct ns_common *ns) put_time_ns(to_time_ns(ns)); } -static int timens_install(struct nsproxy *nsproxy, struct ns_common *new) +static int timens_install(struct nsset *nsset, struct ns_common *new) { + struct nsproxy *nsproxy = nsset->nsproxy; struct time_namespace *ns = to_time_ns(new); int err; @@ -289,7 +290,7 @@ static int timens_install(struct nsproxy *nsproxy, struct ns_common *new) return -EUSERS; if (!ns_capable(ns->user_ns, CAP_SYS_ADMIN) || - !ns_capable(current_user_ns(), CAP_SYS_ADMIN)) + !ns_capable(nsset->cred->user_ns, CAP_SYS_ADMIN)) return -EPERM; timens_set_vvar_page(current, ns); diff --git a/kernel/user_namespace.c b/kernel/user_namespace.c index 8eadadc478f9..87804e0371fe 100644 --- a/kernel/user_namespace.c +++ b/kernel/user_namespace.c @@ -1253,7 +1253,7 @@ static void userns_put(struct ns_common *ns) 
put_user_ns(to_user_ns(ns)); } -static int userns_install(struct nsproxy *nsproxy, struct ns_common *ns) +static int userns_install(struct nsset *nsset, struct ns_common *ns) { struct user_namespace *user_ns = to_user_ns(ns); struct cred *cred; @@ -1274,14 +1274,14 @@ static int userns_install(struct nsproxy *nsproxy, struct ns_common *ns) if (!ns_capable(user_ns, CAP_SYS_ADMIN)) return -EPERM; - cred = prepare_creds(); + cred = nsset_cred(nsset); if (!cred) - return -ENOMEM; + return -EINVAL; put_user_ns(cred->user_ns); set_cred_user_ns(cred, get_user_ns(user_ns)); - return commit_creds(cred); + return 0; } struct ns_common *ns_get_owner(struct ns_common *ns) diff --git a/kernel/utsname.c b/kernel/utsname.c index f0e491193009..e488d0e2ab45 100644 --- a/kernel/utsname.c +++ b/kernel/utsname.c @@ -140,12 +140,13 @@ static void utsns_put(struct ns_common *ns) put_uts_ns(to_uts_ns(ns)); } -static int utsns_install(struct nsproxy *nsproxy, struct ns_common *new) +static int utsns_install(struct nsset *nsset, struct ns_common *new) { + struct nsproxy *nsproxy = nsset->nsproxy; struct uts_namespace *ns = to_uts_ns(new); if (!ns_capable(ns->user_ns, CAP_SYS_ADMIN) || - !ns_capable(current_user_ns(), CAP_SYS_ADMIN)) + !ns_capable(nsset->cred->user_ns, CAP_SYS_ADMIN)) return -EPERM; get_uts_ns(ns); diff --git a/net/core/net_namespace.c b/net/core/net_namespace.c index 190ca66a383b..dcd61aca343e 100644 --- a/net/core/net_namespace.c +++ b/net/core/net_namespace.c @@ -1353,12 +1353,13 @@ static void netns_put(struct ns_common *ns) put_net(to_net_ns(ns)); } -static int netns_install(struct nsproxy *nsproxy, struct ns_common *ns) +static int netns_install(struct nsset *nsset, struct ns_common *ns) { + struct nsproxy *nsproxy = nsset->nsproxy; struct net *net = to_net_ns(ns); if (!ns_capable(net->user_ns, CAP_SYS_ADMIN) || - !ns_capable(current_user_ns(), CAP_SYS_ADMIN)) + !ns_capable(nsset->cred->user_ns, CAP_SYS_ADMIN)) return -EPERM; put_net(nsproxy->net_ns); -- cgit v1.2.3 From 4c74746625dedb62ce00ac86ac4faedb640536d0 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 4 May 2020 14:47:57 +0200 Subject: driver core: remove device_create_vargs All external users of device_create_vargs are gone, so remove it and open code it in the only caller. Signed-off-by: Christoph Hellwig Reviewed-by: Greg Kroah-Hartman Reviewed-by: Jan Kara Signed-off-by: Jens Axboe --- drivers/base/core.c | 37 ++----------------------------------- include/linux/device.h | 4 ---- 2 files changed, 2 insertions(+), 39 deletions(-) (limited to 'include/linux') diff --git a/drivers/base/core.c b/drivers/base/core.c index 139cdf7e7327..fb8ae248e5aa 100644 --- a/drivers/base/core.c +++ b/drivers/base/core.c @@ -3188,40 +3188,6 @@ error: return ERR_PTR(retval); } -/** - * device_create_vargs - creates a device and registers it with sysfs - * @class: pointer to the struct class that this device should be registered to - * @parent: pointer to the parent struct device of this new device, if any - * @devt: the dev_t for the char device to be added - * @drvdata: the data to be added to the device for callbacks - * @fmt: string for the device's name - * @args: va_list for the device's name - * - * This function can be used by char device classes. A struct device - * will be created in sysfs, registered to the specified class. - * - * A "dev" file will be created, showing the dev_t for the device, if - * the dev_t is not 0,0. 
- * If a pointer to a parent struct device is passed in, the newly created - * struct device will be a child of that device in sysfs. - * The pointer to the struct device will be returned from the call. - * Any further sysfs files that might be required can be created using this - * pointer. - * - * Returns &struct device pointer on success, or ERR_PTR() on error. - * - * Note: the struct class passed to this function must have previously - * been created with a call to class_create(). - */ -struct device *device_create_vargs(struct class *class, struct device *parent, - dev_t devt, void *drvdata, const char *fmt, - va_list args) -{ - return device_create_groups_vargs(class, parent, devt, drvdata, NULL, - fmt, args); -} -EXPORT_SYMBOL_GPL(device_create_vargs); - /** * device_create - creates a device and registers it with sysfs * @class: pointer to the struct class that this device should be registered to @@ -3253,7 +3219,8 @@ struct device *device_create(struct class *class, struct device *parent, struct device *dev; va_start(vargs, fmt); - dev = device_create_vargs(class, parent, devt, drvdata, fmt, vargs); + dev = device_create_groups_vargs(class, parent, devt, drvdata, NULL, + fmt, vargs); va_end(vargs); return dev; } diff --git a/include/linux/device.h b/include/linux/device.h index ac8e37cd716a..15460a5ac024 100644 --- a/include/linux/device.h +++ b/include/linux/device.h @@ -884,10 +884,6 @@ extern bool device_is_bound(struct device *dev); /* * Easy functions for dynamically creating devices on the fly */ -extern __printf(5, 0) -struct device *device_create_vargs(struct class *cls, struct device *parent, - dev_t devt, void *drvdata, - const char *fmt, va_list vargs); extern __printf(5, 6) struct device *device_create(struct class *cls, struct device *parent, dev_t devt, void *drvdata, -- cgit v1.2.3 From 3c5d202b55d3fa9106607f99cdd5044b02b5929b Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 4 May 2020 14:47:59 +0200 Subject: bdi: remove bdi_register_owner Split out a new bdi_set_owner helper to set the owner, and move the policy for creating the bdi name back into genhd.c, where it belongs. 
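For illustration, the effect on the single caller is roughly the following sketch (condensed from the genhd.c hunk below; error handling as in the real code):

	/* before: one helper chose the name and took the owner reference */
	ret = bdi_register_owner(disk->queue->backing_dev_info, disk_to_dev(disk));

	/* after: the caller applies the naming policy, then attaches the owner */
	ret = bdi_register(bdi, "%u:%u", MAJOR(devt), MINOR(devt));
	bdi_set_owner(bdi, dev);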
Signed-off-by: Christoph Hellwig Reviewed-by: Jan Kara Reviewed-by: Greg Kroah-Hartman Reviewed-by: Bart Van Assche Signed-off-by: Jens Axboe --- block/genhd.c | 8 +++++--- include/linux/backing-dev.h | 2 +- mm/backing-dev.c | 12 ++---------- 3 files changed, 8 insertions(+), 14 deletions(-) (limited to 'include/linux') diff --git a/block/genhd.c b/block/genhd.c index c05d509877fa..27511b3d164d 100644 --- a/block/genhd.c +++ b/block/genhd.c @@ -840,13 +840,15 @@ static void __device_add_disk(struct device *parent, struct gendisk *disk, disk->flags |= GENHD_FL_SUPPRESS_PARTITION_INFO; disk->flags |= GENHD_FL_NO_PART_SCAN; } else { + struct backing_dev_info *bdi = disk->queue->backing_dev_info; + struct device *dev = disk_to_dev(disk); int ret; /* Register BDI before referencing it from bdev */ - disk_to_dev(disk)->devt = devt; - ret = bdi_register_owner(disk->queue->backing_dev_info, - disk_to_dev(disk)); + dev->devt = devt; + ret = bdi_register(bdi, "%u:%u", MAJOR(devt), MINOR(devt)); WARN_ON(ret); + bdi_set_owner(bdi, dev); blk_register_region(disk_devt(disk), disk->minors, NULL, exact_match, exact_lock, disk); } diff --git a/include/linux/backing-dev.h b/include/linux/backing-dev.h index c9ad5c3b7b4b..4098ed6ba6b4 100644 --- a/include/linux/backing-dev.h +++ b/include/linux/backing-dev.h @@ -33,7 +33,7 @@ int bdi_register(struct backing_dev_info *bdi, const char *fmt, ...); __printf(2, 0) int bdi_register_va(struct backing_dev_info *bdi, const char *fmt, va_list args); -int bdi_register_owner(struct backing_dev_info *bdi, struct device *owner); +void bdi_set_owner(struct backing_dev_info *bdi, struct device *owner); void bdi_unregister(struct backing_dev_info *bdi); struct backing_dev_info *bdi_alloc_node(gfp_t gfp_mask, int node_id); diff --git a/mm/backing-dev.c b/mm/backing-dev.c index eb6b51e49d11..bb993f99d424 100644 --- a/mm/backing-dev.c +++ b/mm/backing-dev.c @@ -977,20 +977,12 @@ int bdi_register(struct backing_dev_info *bdi, const char *fmt, ...) } EXPORT_SYMBOL(bdi_register); -int bdi_register_owner(struct backing_dev_info *bdi, struct device *owner) +void bdi_set_owner(struct backing_dev_info *bdi, struct device *owner) { - int rc; - - rc = bdi_register(bdi, "%u:%u", MAJOR(owner->devt), MINOR(owner->devt)); - if (rc) - return rc; - /* Leaking owner reference... */ - WARN_ON(bdi->owner); + WARN_ON_ONCE(bdi->owner); bdi->owner = owner; get_device(owner); - return 0; } -EXPORT_SYMBOL(bdi_register_owner); /* * Remove bdi from bdi_list, and ensure that it is no longer visible -- cgit v1.2.3 From aef33c2ff8aa5e24f3f7d93806aa84ca1c2b6832 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 4 May 2020 14:48:00 +0200 Subject: bdi: simplify bdi_alloc Merge the _node vs normal version and drop the superfluous gfp_t argument.
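Sketched from the hunks below, the change as seen by callers:

	/* before: allocation took a gfp mask, with an optional node variant */
	bdi = bdi_alloc_node(GFP_KERNEL, node_id);
	bdi = bdi_alloc(GFP_KERNEL);		/* inline wrapper using NUMA_NO_NODE */

	/* after: GFP_KERNEL is implied and only the node id remains */
	bdi = bdi_alloc(node_id);
	bdi = bdi_alloc(NUMA_NO_NODE);		/* callers without a node preference */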
Signed-off-by: Christoph Hellwig Reviewed-by: Jan Kara Reviewed-by: Greg Kroah-Hartman Reviewed-by: Bart Van Assche Signed-off-by: Jens Axboe --- block/blk-core.c | 2 +- drivers/mtd/mtdcore.c | 2 +- fs/super.c | 2 +- include/linux/backing-dev.h | 6 +----- mm/backing-dev.c | 7 +++---- 5 files changed, 7 insertions(+), 12 deletions(-) (limited to 'include/linux') diff --git a/block/blk-core.c b/block/blk-core.c index 7f11560bfddb..285a2f8ee8d3 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -501,7 +501,7 @@ struct request_queue *__blk_alloc_queue(int node_id) if (ret) goto fail_id; - q->backing_dev_info = bdi_alloc_node(GFP_KERNEL, node_id); + q->backing_dev_info = bdi_alloc(node_id); if (!q->backing_dev_info) goto fail_split; diff --git a/drivers/mtd/mtdcore.c b/drivers/mtd/mtdcore.c index 2916674208b3..39ec563d9a14 100644 --- a/drivers/mtd/mtdcore.c +++ b/drivers/mtd/mtdcore.c @@ -2036,7 +2036,7 @@ static struct backing_dev_info * __init mtd_bdi_init(char *name) struct backing_dev_info *bdi; int ret; - bdi = bdi_alloc(GFP_KERNEL); + bdi = bdi_alloc(NUMA_NO_NODE); if (!bdi) return ERR_PTR(-ENOMEM); diff --git a/fs/super.c b/fs/super.c index cd352530eca9..dd28fcd706ff 100644 --- a/fs/super.c +++ b/fs/super.c @@ -1598,7 +1598,7 @@ int super_setup_bdi_name(struct super_block *sb, char *fmt, ...) int err; va_list args; - bdi = bdi_alloc(GFP_KERNEL); + bdi = bdi_alloc(NUMA_NO_NODE); if (!bdi) return -ENOMEM; diff --git a/include/linux/backing-dev.h b/include/linux/backing-dev.h index 4098ed6ba6b4..6b3504bf7a42 100644 --- a/include/linux/backing-dev.h +++ b/include/linux/backing-dev.h @@ -36,11 +36,7 @@ int bdi_register_va(struct backing_dev_info *bdi, const char *fmt, void bdi_set_owner(struct backing_dev_info *bdi, struct device *owner); void bdi_unregister(struct backing_dev_info *bdi); -struct backing_dev_info *bdi_alloc_node(gfp_t gfp_mask, int node_id); -static inline struct backing_dev_info *bdi_alloc(gfp_t gfp_mask) -{ - return bdi_alloc_node(gfp_mask, NUMA_NO_NODE); -} +struct backing_dev_info *bdi_alloc(int node_id); void wb_start_background_writeback(struct bdi_writeback *wb); void wb_workfn(struct work_struct *work); diff --git a/mm/backing-dev.c b/mm/backing-dev.c index bb993f99d424..1f55d5b8269f 100644 --- a/mm/backing-dev.c +++ b/mm/backing-dev.c @@ -865,12 +865,11 @@ static int bdi_init(struct backing_dev_info *bdi) return ret; } -struct backing_dev_info *bdi_alloc_node(gfp_t gfp_mask, int node_id) +struct backing_dev_info *bdi_alloc(int node_id) { struct backing_dev_info *bdi; - bdi = kmalloc_node(sizeof(struct backing_dev_info), - gfp_mask | __GFP_ZERO, node_id); + bdi = kzalloc_node(sizeof(*bdi), GFP_KERNEL, node_id); if (!bdi) return NULL; @@ -880,7 +879,7 @@ struct backing_dev_info *bdi_alloc_node(gfp_t gfp_mask, int node_id) return bdi; } -EXPORT_SYMBOL(bdi_alloc_node); +EXPORT_SYMBOL(bdi_alloc); static struct rb_node **bdi_lookup_rb_node(u64 id, struct rb_node **parentp) { -- cgit v1.2.3 From 1cd925d583857ee3ead6cfbf1e4b1cd067d28591 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 4 May 2020 14:48:01 +0200 Subject: bdi: remove the name field in struct backing_dev_info The name is only printed for an unregistered bdi in writeback. Use the device name there instead, as it is more useful anyway for the unlikely case that the warning triggers.
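The one remaining user, sketched from the fs-writeback.c hunk below (bdi_dev_name() is expected to fall back to a placeholder string when no device has been registered):

	/* before: a statically assigned bdi->name */
	WARN(..., "bdi-%s not registered\n", wb->bdi->name);

	/* after: the name is derived from the registered device */
	WARN(..., "bdi-%s not registered\n", bdi_dev_name(wb->bdi));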
Signed-off-by: Christoph Hellwig Reviewed-by: Jan Kara Reviewed-by: Greg Kroah-Hartman Reviewed-by: Bart Van Assche Signed-off-by: Jens Axboe --- block/blk-core.c | 1 - drivers/mtd/mtdcore.c | 1 - fs/fs-writeback.c | 2 +- fs/super.c | 2 -- include/linux/backing-dev-defs.h | 2 -- mm/backing-dev.c | 1 - 6 files changed, 1 insertion(+), 8 deletions(-) (limited to 'include/linux') diff --git a/block/blk-core.c b/block/blk-core.c index 285a2f8ee8d3..538cbc725620 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -511,7 +511,6 @@ struct request_queue *__blk_alloc_queue(int node_id) q->backing_dev_info->ra_pages = VM_READAHEAD_PAGES; q->backing_dev_info->capabilities = BDI_CAP_CGROUP_WRITEBACK; - q->backing_dev_info->name = "block"; q->node = node_id; timer_setup(&q->backing_dev_info->laptop_mode_wb_timer, diff --git a/drivers/mtd/mtdcore.c b/drivers/mtd/mtdcore.c index 39ec563d9a14..fcb018ce17c3 100644 --- a/drivers/mtd/mtdcore.c +++ b/drivers/mtd/mtdcore.c @@ -2040,7 +2040,6 @@ static struct backing_dev_info * __init mtd_bdi_init(char *name) if (!bdi) return ERR_PTR(-ENOMEM); - bdi->name = name; /* * We put '-0' suffix to the name to get the same name format as we * used to get. Since this is called only once, we get a unique name. diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c index 76ac9c7d32ec..d85323607b49 100644 --- a/fs/fs-writeback.c +++ b/fs/fs-writeback.c @@ -2320,7 +2320,7 @@ void __mark_inode_dirty(struct inode *inode, int flags) WARN(bdi_cap_writeback_dirty(wb->bdi) && !test_bit(WB_registered, &wb->state), - "bdi-%s not registered\n", wb->bdi->name); + "bdi-%s not registered\n", bdi_dev_name(wb->bdi)); inode->dirtied_when = jiffies; if (dirtytime) diff --git a/fs/super.c b/fs/super.c index dd28fcd706ff..4991f441988e 100644 --- a/fs/super.c +++ b/fs/super.c @@ -1602,8 +1602,6 @@ int super_setup_bdi_name(struct super_block *sb, char *fmt, ...) 
if (!bdi) return -ENOMEM; - bdi->name = sb->s_type->name; - va_start(args, fmt); err = bdi_register_va(bdi, fmt, args); va_end(args); diff --git a/include/linux/backing-dev-defs.h b/include/linux/backing-dev-defs.h index 7367150f962a..90a7e844a098 100644 --- a/include/linux/backing-dev-defs.h +++ b/include/linux/backing-dev-defs.h @@ -193,8 +193,6 @@ struct backing_dev_info { congested_fn *congested_fn; /* Function pointer if device is md/dm */ void *congested_data; /* Pointer to aux data for congested func */ - const char *name; - struct kref refcnt; /* Reference counter for the structure */ unsigned int capabilities; /* Device capabilities */ unsigned int min_ratio; diff --git a/mm/backing-dev.c b/mm/backing-dev.c index 1f55d5b8269f..d382272bcc31 100644 --- a/mm/backing-dev.c +++ b/mm/backing-dev.c @@ -15,7 +15,6 @@ #include struct backing_dev_info noop_backing_dev_info = { - .name = "noop", .capabilities = BDI_CAP_NO_ACCT_AND_WRITEBACK, }; EXPORT_SYMBOL_GPL(noop_backing_dev_info); -- cgit v1.2.3 From 615399896ca3787728c56c499c99be79e40ac125 Mon Sep 17 00:00:00 2001 From: James Smart Date: Tue, 31 Mar 2020 09:49:46 -0700 Subject: nvme-fc: Sync header to FC-NVME-2 rev 1.08 A couple of minor changes occurred between 1.06 and 1.08: - Addition of NVME_SR_RSP opcode - Change of SR_RSP status code 1 to Reserved Signed-off-by: James Smart Reviewed-by: Sagi Grimberg Reviewed-by: Himanshu Madhani Reviewed-by: Hannes Reinecke Signed-off-by: Christoph Hellwig Signed-off-by: Jens Axboe --- include/linux/nvme-fc.h | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) (limited to 'include/linux') diff --git a/include/linux/nvme-fc.h b/include/linux/nvme-fc.h index e8c30b39bb27..51fe44e0328b 100644 --- a/include/linux/nvme-fc.h +++ b/include/linux/nvme-fc.h @@ -4,8 +4,8 @@ */ /* - * This file contains definitions relative to FC-NVME-2 r1.06 - * (T11-2019-00210-v001). + * This file contains definitions relative to FC-NVME-2 r1.08 + * (T11-2019-00210-v004). */ #ifndef _NVME_FC_H @@ -81,7 +81,8 @@ struct nvme_fc_ersp_iu { }; -#define FCNVME_NVME_SR_OPCODE 0x01 +#define FCNVME_NVME_SR_OPCODE 0x01 +#define FCNVME_NVME_SR_RSP_OPCODE 0x02 struct nvme_fc_nvme_sr_iu { __u8 fc_id; @@ -94,7 +95,7 @@ struct nvme_fc_nvme_sr_iu { enum { FCNVME_SRSTAT_ACC = 0x0, - FCNVME_SRSTAT_INV_FCID = 0x1, + /* reserved 0x1 */ /* reserved 0x2 */ FCNVME_SRSTAT_LOGICAL_ERR = 0x3, FCNVME_SRSTAT_INV_QUALIF = 0x4, @@ -397,7 +398,7 @@ struct fcnvme_ls_disconnect_conn_rqst { struct fcnvme_ls_rqst_w0 w0; __be32 desc_list_len; struct fcnvme_lsdesc_assoc_id associd; - struct fcnvme_lsdesc_disconn_cmd connectid; + struct fcnvme_lsdesc_conn_id connectid; }; struct fcnvme_ls_disconnect_conn_acc { -- cgit v1.2.3 From 72e6329f86c714785ac195d293cb19dd24507880 Mon Sep 17 00:00:00 2001 From: James Smart Date: Tue, 31 Mar 2020 09:49:47 -0700 Subject: nvme-fc and nvmet-fc: revise LLDD api for LS reception and LS request The current LLDD api has: nvme-fc: contains api for transport to do LS requests (and aborts of them). However, there is no interface for reception of LS's and sending responses for them. nvmet-fc: contains api for transport to do reception of LS's and sending of responses for them. However, there is no interface for doing LS requests. Revise the api's so that both nvme-fc and nvmet-fc can send LS's, as well as receive LS's and send their responses. Change the name of the rcv_ls_req struct to better reflect its generic use as a context used to send an ls rsp.
Specifically: nvmefc_tgt_ls_req -> nvmefc_ls_rsp nvmefc_tgt_ls_req.nvmet_fc_private -> nvmefc_ls_rsp.nvme_fc_private Change nvmet_fc_rcv_ls_req() calling sequence to provide handle that can be used by transport in later LS request sequences for an association. nvme-fc nvmet_fc nvme_fcloop: Revise to adapt to changed names in api header. Change calling sequence to nvmet_fc_rcv_ls_req() for hosthandle. Add stubs for new interfaces: host/fc.c: nvme_fc_rcv_ls_req() target/fc.c: nvmet_fc_invalidate_host() lpfc: Revise to adapt code to changed names in api header. Change calling sequence to nvmet_fc_rcv_ls_req() for hosthandle. Signed-off-by: James Smart Reviewed-by: Sagi Grimberg Reviewed-by: Himanshu Madhani Signed-off-by: Christoph Hellwig Signed-off-by: Jens Axboe --- drivers/nvme/host/fc.c | 35 ++++ drivers/nvme/target/fc.c | 77 ++++++--- drivers/nvme/target/fcloop.c | 20 +-- drivers/scsi/lpfc/lpfc_nvmet.c | 10 +- drivers/scsi/lpfc/lpfc_nvmet.h | 2 +- include/linux/nvme-fc-driver.h | 368 ++++++++++++++++++++++++++++++----------- 6 files changed, 378 insertions(+), 134 deletions(-) (limited to 'include/linux') diff --git a/drivers/nvme/host/fc.c b/drivers/nvme/host/fc.c index 7dfc4a2ecf1e..8012099fc3ee 100644 --- a/drivers/nvme/host/fc.c +++ b/drivers/nvme/host/fc.c @@ -1464,6 +1464,41 @@ nvme_fc_xmt_disconnect_assoc(struct nvme_fc_ctrl *ctrl) kfree(lsop); } +/** + * nvme_fc_rcv_ls_req - transport entry point called by an LLDD + * upon the reception of a NVME LS request. + * + * The nvme-fc layer will copy payload to an internal structure for + * processing. As such, upon completion of the routine, the LLDD may + * immediately free/reuse the LS request buffer passed in the call. + * + * If this routine returns error, the LLDD should abort the exchange. + * + * @remoteport: pointer to the (registered) remote port that the LS + * was received from. The remoteport is associated with + * a specific localport. + * @lsrsp: pointer to a nvmefc_ls_rsp response structure to be + * used to reference the exchange corresponding to the LS + * when issuing an ls response. + * @lsreqbuf: pointer to the buffer containing the LS Request + * @lsreqbuf_len: length, in bytes, of the received LS request + */ +int +nvme_fc_rcv_ls_req(struct nvme_fc_remote_port *portptr, + struct nvmefc_ls_rsp *lsrsp, + void *lsreqbuf, u32 lsreqbuf_len) +{ + struct nvme_fc_rport *rport = remoteport_to_rport(portptr); + struct nvme_fc_lport *lport = rport->lport; + + /* validate there's a routine to transmit a response */ + if (!lport->ops->xmt_ls_rsp) + return(-EINVAL); + + return 0; +} +EXPORT_SYMBOL_GPL(nvme_fc_rcv_ls_req); + /* *********************** NVME Ctrl Routines **************************** */ diff --git a/drivers/nvme/target/fc.c b/drivers/nvme/target/fc.c index a8ceb7721640..aac7869a70bb 100644 --- a/drivers/nvme/target/fc.c +++ b/drivers/nvme/target/fc.c @@ -28,7 +28,7 @@ struct nvmet_fc_tgtport; struct nvmet_fc_tgt_assoc; struct nvmet_fc_ls_iod { - struct nvmefc_tgt_ls_req *lsreq; + struct nvmefc_ls_rsp *lsrsp; struct nvmefc_tgt_fcp_req *fcpreq; /* only if RS */ struct list_head ls_list; /* tgtport->ls_list */ @@ -1146,6 +1146,42 @@ __nvmet_fc_free_assocs(struct nvmet_fc_tgtport *tgtport) spin_unlock_irqrestore(&tgtport->lock, flags); } +/** + * nvmet_fc_invalidate_host - transport entry point called by an LLDD + * to remove references to a hosthandle for LS's. + * + * The nvmet-fc layer ensures that any references to the hosthandle + * on the targetport are forgotten (set to NULL). 
The LLDD will + * typically call this when a login with a remote host port has been + * lost, thus LS's for the remote host port are no longer possible. + * + * If an LS request is outstanding to the targetport/hosthandle (or + * issued concurrently with the call to invalidate the host), the + * LLDD is responsible for terminating/aborting the LS and completing + * the LS request. It is recommended that these terminations/aborts + * occur after calling to invalidate the host handle to avoid additional + * retries by the nvmet-fc transport. The nvmet-fc transport may + * continue to reference host handle while it cleans up outstanding + * NVME associations. The nvmet-fc transport will call the + * ops->host_release() callback to notify the LLDD that all references + * are complete and the related host handle can be recovered. + * Note: if there are no references, the callback may be called before + * the invalidate host call returns. + * + * @target_port: pointer to the (registered) target port that a prior + * LS was received on and which supplied the transport the + * hosthandle. + * @hosthandle: the handle (pointer) that represents the host port + * that no longer has connectivity and that LS's should + * no longer be directed to. + */ +void +nvmet_fc_invalidate_host(struct nvmet_fc_target_port *target_port, + void *hosthandle) +{ +} +EXPORT_SYMBOL_GPL(nvmet_fc_invalidate_host); + /* * nvmet layer has called to terminate an association */ @@ -1371,7 +1407,7 @@ nvmet_fc_ls_create_association(struct nvmet_fc_tgtport *tgtport, dev_err(tgtport->dev, "Create Association LS failed: %s\n", validation_errors[ret]); - iod->lsreq->rsplen = nvmet_fc_format_rjt(acc, + iod->lsrsp->rsplen = nvmet_fc_format_rjt(acc, NVME_FC_MAX_LS_BUFFER_SIZE, rqst->w0.ls_cmd, FCNVME_RJT_RC_LOGIC, FCNVME_RJT_EXP_NONE, 0); @@ -1384,7 +1420,7 @@ nvmet_fc_ls_create_association(struct nvmet_fc_tgtport *tgtport, /* format a response */ - iod->lsreq->rsplen = sizeof(*acc); + iod->lsrsp->rsplen = sizeof(*acc); nvmet_fc_format_rsp_hdr(acc, FCNVME_LS_ACC, fcnvme_lsdesc_len( @@ -1462,7 +1498,7 @@ nvmet_fc_ls_create_connection(struct nvmet_fc_tgtport *tgtport, dev_err(tgtport->dev, "Create Connection LS failed: %s\n", validation_errors[ret]); - iod->lsreq->rsplen = nvmet_fc_format_rjt(acc, + iod->lsrsp->rsplen = nvmet_fc_format_rjt(acc, NVME_FC_MAX_LS_BUFFER_SIZE, rqst->w0.ls_cmd, (ret == VERR_NO_ASSOC) ? FCNVME_RJT_RC_INV_ASSOC : @@ -1477,7 +1513,7 @@ nvmet_fc_ls_create_connection(struct nvmet_fc_tgtport *tgtport, /* format a response */ - iod->lsreq->rsplen = sizeof(*acc); + iod->lsrsp->rsplen = sizeof(*acc); nvmet_fc_format_rsp_hdr(acc, FCNVME_LS_ACC, fcnvme_lsdesc_len(sizeof(struct fcnvme_ls_cr_conn_acc)), @@ -1542,7 +1578,7 @@ nvmet_fc_ls_disconnect(struct nvmet_fc_tgtport *tgtport, dev_err(tgtport->dev, "Disconnect LS failed: %s\n", validation_errors[ret]); - iod->lsreq->rsplen = nvmet_fc_format_rjt(acc, + iod->lsrsp->rsplen = nvmet_fc_format_rjt(acc, NVME_FC_MAX_LS_BUFFER_SIZE, rqst->w0.ls_cmd, (ret == VERR_NO_ASSOC) ? 
FCNVME_RJT_RC_INV_ASSOC : @@ -1555,7 +1591,7 @@ nvmet_fc_ls_disconnect(struct nvmet_fc_tgtport *tgtport, /* format a response */ - iod->lsreq->rsplen = sizeof(*acc); + iod->lsrsp->rsplen = sizeof(*acc); nvmet_fc_format_rsp_hdr(acc, FCNVME_LS_ACC, fcnvme_lsdesc_len( @@ -1577,9 +1613,9 @@ static void nvmet_fc_fcp_nvme_cmd_done(struct nvmet_req *nvme_req); static const struct nvmet_fabrics_ops nvmet_fc_tgt_fcp_ops; static void -nvmet_fc_xmt_ls_rsp_done(struct nvmefc_tgt_ls_req *lsreq) +nvmet_fc_xmt_ls_rsp_done(struct nvmefc_ls_rsp *lsrsp) { - struct nvmet_fc_ls_iod *iod = lsreq->nvmet_fc_private; + struct nvmet_fc_ls_iod *iod = lsrsp->nvme_fc_private; struct nvmet_fc_tgtport *tgtport = iod->tgtport; fc_dma_sync_single_for_cpu(tgtport->dev, iod->rspdma, @@ -1597,9 +1633,9 @@ nvmet_fc_xmt_ls_rsp(struct nvmet_fc_tgtport *tgtport, fc_dma_sync_single_for_device(tgtport->dev, iod->rspdma, NVME_FC_MAX_LS_BUFFER_SIZE, DMA_TO_DEVICE); - ret = tgtport->ops->xmt_ls_rsp(&tgtport->fc_target_port, iod->lsreq); + ret = tgtport->ops->xmt_ls_rsp(&tgtport->fc_target_port, iod->lsrsp); if (ret) - nvmet_fc_xmt_ls_rsp_done(iod->lsreq); + nvmet_fc_xmt_ls_rsp_done(iod->lsrsp); } /* @@ -1612,12 +1648,12 @@ nvmet_fc_handle_ls_rqst(struct nvmet_fc_tgtport *tgtport, struct fcnvme_ls_rqst_w0 *w0 = (struct fcnvme_ls_rqst_w0 *)iod->rqstbuf; - iod->lsreq->nvmet_fc_private = iod; - iod->lsreq->rspbuf = iod->rspbuf; - iod->lsreq->rspdma = iod->rspdma; - iod->lsreq->done = nvmet_fc_xmt_ls_rsp_done; + iod->lsrsp->nvme_fc_private = iod; + iod->lsrsp->rspbuf = iod->rspbuf; + iod->lsrsp->rspdma = iod->rspdma; + iod->lsrsp->done = nvmet_fc_xmt_ls_rsp_done; /* Be preventative. handlers will later set to valid length */ - iod->lsreq->rsplen = 0; + iod->lsrsp->rsplen = 0; iod->assoc = NULL; @@ -1640,7 +1676,7 @@ nvmet_fc_handle_ls_rqst(struct nvmet_fc_tgtport *tgtport, nvmet_fc_ls_disconnect(tgtport, iod); break; default: - iod->lsreq->rsplen = nvmet_fc_format_rjt(iod->rspbuf, + iod->lsrsp->rsplen = nvmet_fc_format_rjt(iod->rspbuf, NVME_FC_MAX_LS_BUFFER_SIZE, w0->ls_cmd, FCNVME_RJT_RC_INVAL, FCNVME_RJT_EXP_NONE, 0); } @@ -1674,14 +1710,15 @@ nvmet_fc_handle_ls_rqst_work(struct work_struct *work) * * @target_port: pointer to the (registered) target port the LS was * received on. - * @lsreq: pointer to a lsreq request structure to be used to reference + * @lsrsp: pointer to a lsrsp structure to be used to reference * the exchange corresponding to the LS. 
* @lsreqbuf: pointer to the buffer containing the LS Request * @lsreqbuf_len: length, in bytes, of the received LS request */ int nvmet_fc_rcv_ls_req(struct nvmet_fc_target_port *target_port, - struct nvmefc_tgt_ls_req *lsreq, + void *hosthandle, + struct nvmefc_ls_rsp *lsrsp, void *lsreqbuf, u32 lsreqbuf_len) { struct nvmet_fc_tgtport *tgtport = targetport_to_tgtport(target_port); @@ -1699,7 +1736,7 @@ nvmet_fc_rcv_ls_req(struct nvmet_fc_target_port *target_port, return -ENOENT; } - iod->lsreq = lsreq; + iod->lsrsp = lsrsp; iod->fcpreq = NULL; memcpy(iod->rqstbuf, lsreqbuf, lsreqbuf_len); iod->rqstdatalen = lsreqbuf_len; diff --git a/drivers/nvme/target/fcloop.c b/drivers/nvme/target/fcloop.c index f69ce66e2d44..c11805a155e8 100644 --- a/drivers/nvme/target/fcloop.c +++ b/drivers/nvme/target/fcloop.c @@ -228,7 +228,7 @@ struct fcloop_nport { struct fcloop_lsreq { struct nvmefc_ls_req *lsreq; - struct nvmefc_tgt_ls_req tgt_ls_req; + struct nvmefc_ls_rsp ls_rsp; int status; struct list_head ls_list; /* fcloop_rport->ls_list */ }; @@ -267,9 +267,9 @@ struct fcloop_ini_fcpreq { }; static inline struct fcloop_lsreq * -tgt_ls_req_to_lsreq(struct nvmefc_tgt_ls_req *tgt_lsreq) +ls_rsp_to_lsreq(struct nvmefc_ls_rsp *lsrsp) { - return container_of(tgt_lsreq, struct fcloop_lsreq, tgt_ls_req); + return container_of(lsrsp, struct fcloop_lsreq, ls_rsp); } static inline struct fcloop_fcpreq * @@ -344,7 +344,7 @@ fcloop_ls_req(struct nvme_fc_local_port *localport, } tls_req->status = 0; - ret = nvmet_fc_rcv_ls_req(rport->targetport, &tls_req->tgt_ls_req, + ret = nvmet_fc_rcv_ls_req(rport->targetport, NULL, &tls_req->ls_rsp, lsreq->rqstaddr, lsreq->rqstlen); return ret; @@ -352,19 +352,19 @@ fcloop_ls_req(struct nvme_fc_local_port *localport, static int fcloop_xmt_ls_rsp(struct nvmet_fc_target_port *targetport, - struct nvmefc_tgt_ls_req *tgt_lsreq) + struct nvmefc_ls_rsp *lsrsp) { - struct fcloop_lsreq *tls_req = tgt_ls_req_to_lsreq(tgt_lsreq); + struct fcloop_lsreq *tls_req = ls_rsp_to_lsreq(lsrsp); struct nvmefc_ls_req *lsreq = tls_req->lsreq; struct fcloop_tport *tport = targetport->private; struct nvme_fc_remote_port *remoteport = tport->remoteport; struct fcloop_rport *rport; - memcpy(lsreq->rspaddr, tgt_lsreq->rspbuf, - ((lsreq->rsplen < tgt_lsreq->rsplen) ? - lsreq->rsplen : tgt_lsreq->rsplen)); + memcpy(lsreq->rspaddr, lsrsp->rspbuf, + ((lsreq->rsplen < lsrsp->rsplen) ? 
+ lsreq->rsplen : lsrsp->rsplen)); - tgt_lsreq->done(tgt_lsreq); + lsrsp->done(lsrsp); if (remoteport) { rport = remoteport->private; diff --git a/drivers/scsi/lpfc/lpfc_nvmet.c b/drivers/scsi/lpfc/lpfc_nvmet.c index 565419bf8d74..3b25bcb150ea 100644 --- a/drivers/scsi/lpfc/lpfc_nvmet.c +++ b/drivers/scsi/lpfc/lpfc_nvmet.c @@ -302,7 +302,7 @@ lpfc_nvmet_xmt_ls_rsp_cmp(struct lpfc_hba *phba, struct lpfc_iocbq *cmdwqe, struct lpfc_wcqe_complete *wcqe) { struct lpfc_nvmet_tgtport *tgtp; - struct nvmefc_tgt_ls_req *rsp; + struct nvmefc_ls_rsp *rsp; struct lpfc_nvmet_rcv_ctx *ctxp; uint32_t status, result; @@ -335,7 +335,7 @@ lpfc_nvmet_xmt_ls_rsp_cmp(struct lpfc_hba *phba, struct lpfc_iocbq *cmdwqe, } out: - rsp = &ctxp->ctx.ls_req; + rsp = &ctxp->ctx.ls_rsp; lpfc_nvmeio_data(phba, "NVMET LS CMPL: xri x%x stat x%x result x%x\n", ctxp->oxid, status, result); @@ -828,10 +828,10 @@ lpfc_nvmet_xmt_fcp_op_cmp(struct lpfc_hba *phba, struct lpfc_iocbq *cmdwqe, static int lpfc_nvmet_xmt_ls_rsp(struct nvmet_fc_target_port *tgtport, - struct nvmefc_tgt_ls_req *rsp) + struct nvmefc_ls_rsp *rsp) { struct lpfc_nvmet_rcv_ctx *ctxp = - container_of(rsp, struct lpfc_nvmet_rcv_ctx, ctx.ls_req); + container_of(rsp, struct lpfc_nvmet_rcv_ctx, ctx.ls_rsp); struct lpfc_hba *phba = ctxp->phba; struct hbq_dmabuf *nvmebuf = (struct hbq_dmabuf *)ctxp->rqb_buffer; @@ -1999,7 +1999,7 @@ dropit: * lpfc_nvmet_xmt_ls_rsp_cmp should free the allocated ctxp. */ atomic_inc(&tgtp->rcv_ls_req_in); - rc = nvmet_fc_rcv_ls_req(phba->targetport, &ctxp->ctx.ls_req, + rc = nvmet_fc_rcv_ls_req(phba->targetport, NULL, &ctxp->ctx.ls_rsp, payload, size); lpfc_printf_log(phba, KERN_INFO, LOG_NVME_DISC, diff --git a/drivers/scsi/lpfc/lpfc_nvmet.h b/drivers/scsi/lpfc/lpfc_nvmet.h index b80b1639b9a7..f0196f3ef90d 100644 --- a/drivers/scsi/lpfc/lpfc_nvmet.h +++ b/drivers/scsi/lpfc/lpfc_nvmet.h @@ -105,7 +105,7 @@ struct lpfc_nvmet_ctx_info { struct lpfc_nvmet_rcv_ctx { union { - struct nvmefc_tgt_ls_req ls_req; + struct nvmefc_ls_rsp ls_rsp; struct nvmefc_tgt_fcp_req fcp_req; } ctx; struct list_head list; diff --git a/include/linux/nvme-fc-driver.h b/include/linux/nvme-fc-driver.h index 10f81629b9ce..41e7795a3ee4 100644 --- a/include/linux/nvme-fc-driver.h +++ b/include/linux/nvme-fc-driver.h @@ -10,47 +10,26 @@ /* - * ********************** LLDD FC-NVME Host API ******************** + * ********************** FC-NVME LS API ******************** * - * For FC LLDD's that are the NVME Host role. + * Data structures used by both FC-NVME hosts and FC-NVME + * targets to perform FC-NVME LS requests or transmit + * responses. * - * ****************************************************************** + * *********************************************************** */ - - /** - * struct nvme_fc_port_info - port-specific ids and FC connection-specific - * data element used during NVME Host role - * registrations - * - * Static fields describing the port being registered: - * @node_name: FC WWNN for the port - * @port_name: FC WWPN for the port - * @port_role: What NVME roles are supported (see FC_PORT_ROLE_xxx) - * @dev_loss_tmo: maximum delay for reconnects to an association on - * this device. Used only on a remoteport. + * struct nvmefc_ls_req - Request structure passed from the transport + * to the LLDD to perform a NVME-FC LS request and obtain + * a response. + * Used by nvme-fc transport (host) to send LS's such as + * Create Association, Create Connection and Disconnect + * Association. 
+ * Used by the nvmet-fc transport (controller) to send + * LS's such as Disconnect Association. * - * Initialization values for dynamic port fields: - * @port_id: FC N_Port_ID currently assigned the port. Upper 8 bits must - * be set to 0. - */ -struct nvme_fc_port_info { - u64 node_name; - u64 port_name; - u32 port_role; - u32 port_id; - u32 dev_loss_tmo; -}; - - -/** - * struct nvmefc_ls_req - Request structure passed from NVME-FC transport - * to LLDD in order to perform a NVME FC-4 LS - * request and obtain a response. - * - * Values set by the NVME-FC layer prior to calling the LLDD ls_req - * entrypoint. + * Values set by the requestor prior to calling the LLDD ls_req entrypoint: * @rqstaddr: pointer to request buffer * @rqstdma: PCI DMA address of request buffer * @rqstlen: Length, in bytes, of request buffer @@ -63,8 +42,8 @@ struct nvme_fc_port_info { * @private: pointer to memory allocated alongside the ls request structure * that is specifically for the LLDD to use while processing the * request. The length of the buffer corresponds to the - * lsrqst_priv_sz value specified in the nvme_fc_port_template - * supplied by the LLDD. + * lsrqst_priv_sz value specified in the xxx_template supplied + * by the LLDD. * @done: The callback routine the LLDD is to invoke upon completion of * the LS request. req argument is the pointer to the original LS * request structure. Status argument must be 0 upon success, a @@ -86,6 +65,101 @@ struct nvmefc_ls_req { } __aligned(sizeof(u64)); /* alignment for other things alloc'd with */ +/** + * struct nvmefc_ls_rsp - Structure passed from the transport to the LLDD + * to request the transmit the NVME-FC LS response to a + * NVME-FC LS request. The structure originates in the LLDD + * and is given to the transport via the xxx_rcv_ls_req() + * transport routine. As such, the structure represents the + * FC exchange context for the NVME-FC LS request that was + * received and which the response is to be sent for. + * Used by the LLDD to pass the nvmet-fc transport (controller) + * received LS's such as Create Association, Create Connection + * and Disconnect Association. + * Used by the LLDD to pass the nvme-fc transport (host) + * received LS's such as Disconnect Association or Disconnect + * Connection. + * + * The structure is allocated by the LLDD whenever a LS Request is received + * from the FC link. The address of the structure is passed to the nvmet-fc + * or nvme-fc layer via the xxx_rcv_ls_req() transport routines. + * + * The address of the structure is to be passed back to the LLDD + * when the response is to be transmit. The LLDD will use the address to + * map back to the LLDD exchange structure which maintains information such + * the remote N_Port that sent the LS as well as any FC exchange context. + * Upon completion of the LS response transmit, the LLDD will pass the + * address of the structure back to the transport LS rsp done() routine, + * allowing the transport release dma resources. Upon completion of + * the done() routine, no further access to the structure will be made by + * the transport and the LLDD can de-allocate the structure. + * + * Field initialization: + * At the time of the xxx_rcv_ls_req() call, there is no content that + * is valid in the structure. + * + * When the structure is used for the LLDD->xmt_ls_rsp() call, the + * transport layer will fully set the fields in order to specify the + * response payload buffer and its length as well as the done routine + * to be called upon completion of the transmit. 
The transport layer + * will also set a private pointer for its own use in the done routine. + * + * Values set by the transport layer prior to calling the LLDD xmt_ls_rsp + * entrypoint: + * @rspbuf: pointer to the LS response buffer + * @rspdma: PCI DMA address of the LS response buffer + * @rsplen: Length, in bytes, of the LS response buffer + * @done: The callback routine the LLDD is to invoke upon completion of + * transmitting the LS response. req argument is the pointer to + * the original ls request. + * @nvme_fc_private: pointer to an internal transport-specific structure + * used as part of the transport done() processing. The LLDD is + * not to access this pointer. + */ +struct nvmefc_ls_rsp { + void *rspbuf; + dma_addr_t rspdma; + u16 rsplen; + + void (*done)(struct nvmefc_ls_rsp *rsp); + void *nvme_fc_private; /* LLDD is not to access !! */ +}; + + + +/* + * ********************** LLDD FC-NVME Host API ******************** + * + * For FC LLDD's that are the NVME Host role. + * + * ****************************************************************** + */ + + +/** + * struct nvme_fc_port_info - port-specific ids and FC connection-specific + * data element used during NVME Host role + * registrations + * + * Static fields describing the port being registered: + * @node_name: FC WWNN for the port + * @port_name: FC WWPN for the port + * @port_role: What NVME roles are supported (see FC_PORT_ROLE_xxx) + * @dev_loss_tmo: maximum delay for reconnects to an association on + * this device. Used only on a remoteport. + * + * Initialization values for dynamic port fields: + * @port_id: FC N_Port_ID currently assigned the port. Upper 8 bits must + * be set to 0. + */ +struct nvme_fc_port_info { + u64 node_name; + u64 port_name; + u32 port_role; + u32 port_id; + u32 dev_loss_tmo; +}; + enum nvmefc_fcp_datadir { NVMEFC_FCP_NODATA, /* payload_length and sg_cnt will be zero */ NVMEFC_FCP_WRITE, @@ -337,6 +411,21 @@ struct nvme_fc_remote_port { * indicating an FC transport Aborted status. * Entrypoint is Mandatory. * + * @xmt_ls_rsp: Called to transmit the response to a FC-NVME FC-4 LS service. + * The nvmefc_ls_rsp structure is the same LLDD-supplied exchange + * structure specified in the nvme_fc_rcv_ls_req() call made when + * the LS request was received. The structure will fully describe + * the buffers for the response payload and the dma address of the + * payload. The LLDD is to transmit the response (or return a + * non-zero errno status), and upon completion of the transmit, call + * the "done" routine specified in the nvmefc_ls_rsp structure + * (argument to done is the address of the nvmefc_ls_rsp structure + * itself). Upon the completion of the done routine, the LLDD shall + * consider the LS handling complete and the nvmefc_ls_rsp structure + * may be freed/released. + * Entrypoint is mandatory if the LLDD calls the nvme_fc_rcv_ls_req() + * entrypoint. + * * @max_hw_queues: indicates the maximum number of hw queues the LLDD * supports for cpu affinitization. * Value is Mandatory. Must be at least 1. @@ -371,7 +460,7 @@ struct nvme_fc_remote_port { * @lsrqst_priv_sz: The LLDD sets this field to the amount of additional * memory that it would like fc nvme layer to allocate on the LLDD's * behalf whenever a ls request structure is allocated. The additional - * memory area solely for the of the LLDD and its location is + * memory area is solely for use by the LLDD and its location is * specified by the ls_request->private pointer. * Value is Mandatory. Allowed to be zero. 
* @@ -405,6 +494,9 @@ struct nvme_fc_port_template { struct nvme_fc_remote_port *, void *hw_queue_handle, struct nvmefc_fcp_req *); + int (*xmt_ls_rsp)(struct nvme_fc_local_port *localport, + struct nvme_fc_remote_port *rport, + struct nvmefc_ls_rsp *ls_rsp); u32 max_hw_queues; u16 max_sgl_segments; @@ -441,6 +533,34 @@ void nvme_fc_rescan_remoteport(struct nvme_fc_remote_port *remoteport); int nvme_fc_set_remoteport_devloss(struct nvme_fc_remote_port *remoteport, u32 dev_loss_tmo); +/* + * Routine called to pass a NVME-FC LS request, received by the lldd, + * to the nvme-fc transport. + * + * If the return value is zero: the LS was successfully accepted by the + * transport. + * If the return value is non-zero: the transport has not accepted the + * LS. The lldd should ABTS-LS the LS. + * + * Note: if the LLDD receives and ABTS for the LS prior to the transport + * calling the ops->xmt_ls_rsp() routine to transmit a response, the LLDD + * shall mark the LS as aborted, and when the xmt_ls_rsp() is called: the + * response shall not be transmit and the struct nvmefc_ls_rsp() done + * routine shall be called. The LLDD may transmit the ABTS response as + * soon as the LS was marked or can delay until the xmt_ls_rsp() call is + * made. + * Note: if an RCV LS was successfully posted to the transport and the + * remoteport is then unregistered before xmt_ls_rsp() was called for + * the lsrsp structure, the transport will still call xmt_ls_rsp() + * afterward to cleanup the outstanding lsrsp structure. The LLDD should + * noop the transmission of the rsp and call the lsrsp->done() routine + * to allow the lsrsp structure to be released. + */ +int nvme_fc_rcv_ls_req(struct nvme_fc_remote_port *remoteport, + struct nvmefc_ls_rsp *lsrsp, + void *lsreqbuf, u32 lsreqbuf_len); + + /* * *************** LLDD FC-NVME Target/Subsystem API *************** @@ -470,55 +590,6 @@ struct nvmet_fc_port_info { }; -/** - * struct nvmefc_tgt_ls_req - Structure used between LLDD and NVMET-FC - * layer to represent the exchange context for - * a FC-NVME Link Service (LS). - * - * The structure is allocated by the LLDD whenever a LS Request is received - * from the FC link. The address of the structure is passed to the nvmet-fc - * layer via the nvmet_fc_rcv_ls_req() call. The address of the structure - * will be passed back to the LLDD when the response is to be transmit. - * The LLDD is to use the address to map back to the LLDD exchange structure - * which maintains information such as the targetport the LS was received - * on, the remote FC NVME initiator that sent the LS, and any FC exchange - * context. Upon completion of the LS response transmit, the address of the - * structure will be passed back to the LS rsp done() routine, allowing the - * nvmet-fc layer to release dma resources. Upon completion of the done() - * routine, no further access will be made by the nvmet-fc layer and the - * LLDD can de-allocate the structure. - * - * Field initialization: - * At the time of the nvmet_fc_rcv_ls_req() call, there is no content that - * is valid in the structure. - * - * When the structure is used for the LLDD->xmt_ls_rsp() call, the nvmet-fc - * layer will fully set the fields in order to specify the response - * payload buffer and its length as well as the done routine to be called - * upon compeletion of the transmit. The nvmet-fc layer will also set a - * private pointer for its own use in the done routine. - * - * Values set by the NVMET-FC layer prior to calling the LLDD xmt_ls_rsp - * entrypoint. 
- * @rspbuf: pointer to the LS response buffer - * @rspdma: PCI DMA address of the LS response buffer - * @rsplen: Length, in bytes, of the LS response buffer - * @done: The callback routine the LLDD is to invoke upon completion of - * transmitting the LS response. req argument is the pointer to - * the original ls request. - * @nvmet_fc_private: pointer to an internal NVMET-FC layer structure used - * as part of the NVMET-FC processing. The LLDD is not to access - * this pointer. - */ -struct nvmefc_tgt_ls_req { - void *rspbuf; - dma_addr_t rspdma; - u16 rsplen; - - void (*done)(struct nvmefc_tgt_ls_req *req); - void *nvmet_fc_private; /* LLDD is not to access !! */ -}; - /* Operations that NVME-FC layer may request the LLDD to perform for FCP */ enum { NVMET_FCOP_READDATA = 1, /* xmt data to initiator */ @@ -693,17 +764,19 @@ struct nvmet_fc_target_port { * Entrypoint is Mandatory. * * @xmt_ls_rsp: Called to transmit the response to a FC-NVME FC-4 LS service. - * The nvmefc_tgt_ls_req structure is the same LLDD-supplied exchange + * The nvmefc_ls_rsp structure is the same LLDD-supplied exchange * structure specified in the nvmet_fc_rcv_ls_req() call made when - * the LS request was received. The structure will fully describe + * the LS request was received. The structure will fully describe * the buffers for the response payload and the dma address of the - * payload. The LLDD is to transmit the response (or return a non-zero - * errno status), and upon completion of the transmit, call the - * "done" routine specified in the nvmefc_tgt_ls_req structure - * (argument to done is the ls reqwuest structure itself). - * After calling the done routine, the LLDD shall consider the - * LS handling complete and the nvmefc_tgt_ls_req structure may - * be freed/released. + * payload. The LLDD is to transmit the response (or return a + * non-zero errno status), and upon completion of the transmit, call + * the "done" routine specified in the nvmefc_ls_rsp structure + * (argument to done is the address of the nvmefc_ls_rsp structure + * itself). Upon the completion of the done() routine, the LLDD shall + * consider the LS handling complete and the nvmefc_ls_rsp structure + * may be freed/released. + * The transport will always call the xmt_ls_rsp() routine for any + * LS received. * Entrypoint is Mandatory. * * @fcp_op: Called to perform a data transfer or transmit a response. @@ -798,6 +871,39 @@ struct nvmet_fc_target_port { * should cause the initiator to rescan the discovery controller * on the targetport. * + * @ls_req: Called to issue a FC-NVME FC-4 LS service request. + * The nvme_fc_ls_req structure will fully describe the buffers for + * the request payload and where to place the response payload. + * The targetport that is to issue the LS request is identified by + * the targetport argument. The remote port that is to receive the + * LS request is identified by the hosthandle argument. The nvmet-fc + * transport is only allowed to issue FC-NVME LS's on behalf of an + * association that was created prior by a Create Association LS. + * The hosthandle will originate from the LLDD in the struct + * nvmefc_ls_rsp structure for the Create Association LS that + * was delivered to the transport. The transport will save the + * hosthandle as an attribute of the association. If the LLDD + * loses connectivity with the remote port, it must call the + * nvmet_fc_invalidate_host() routine to remove any references to + * the remote port in the transport. 
+ * The LLDD is to allocate an exchange, issue the LS request, obtain + * the LS response, and call the "done" routine specified in the + * request structure (argument to done is the ls request structure + * itself). + * Entrypoint is Optional - but highly recommended. + * + * @ls_abort: called to request the LLDD to abort the indicated ls request. + * The call may return before the abort has completed. After aborting + * the request, the LLDD must still call the ls request done routine + * indicating an FC transport Aborted status. + * Entrypoint is Mandatory if the ls_req entry point is specified. + * + * @host_release: called to inform the LLDD that the request to invalidate + * the host port indicated by the hosthandle has been fully completed. + * No associations exist with the host port and there will be no + * further references to hosthandle. + * Entrypoint is Mandatory if the lldd calls nvmet_fc_invalidate_host(). + * * @max_hw_queues: indicates the maximum number of hw queues the LLDD * supports for cpu affinitization. * Value is Mandatory. Must be at least 1. @@ -826,11 +932,19 @@ struct nvmet_fc_target_port { * area solely for the of the LLDD and its location is specified by * the targetport->private pointer. * Value is Mandatory. Allowed to be zero. + * + * @lsrqst_priv_sz: The LLDD sets this field to the amount of additional + * memory that it would like nvmet-fc layer to allocate on the LLDD's + * behalf whenever a ls request structure is allocated. The additional + * memory area is solely for use by the LLDD and its location is + * specified by the ls_request->private pointer. + * Value is Mandatory. Allowed to be zero. + * */ struct nvmet_fc_target_template { void (*targetport_delete)(struct nvmet_fc_target_port *tgtport); int (*xmt_ls_rsp)(struct nvmet_fc_target_port *tgtport, - struct nvmefc_tgt_ls_req *tls_req); + struct nvmefc_ls_rsp *ls_rsp); int (*fcp_op)(struct nvmet_fc_target_port *tgtport, struct nvmefc_tgt_fcp_req *fcpreq); void (*fcp_abort)(struct nvmet_fc_target_port *tgtport, @@ -840,6 +954,11 @@ struct nvmet_fc_target_template { void (*defer_rcv)(struct nvmet_fc_target_port *tgtport, struct nvmefc_tgt_fcp_req *fcpreq); void (*discovery_event)(struct nvmet_fc_target_port *tgtport); + int (*ls_req)(struct nvmet_fc_target_port *targetport, + void *hosthandle, struct nvmefc_ls_req *lsreq); + void (*ls_abort)(struct nvmet_fc_target_port *targetport, + void *hosthandle, struct nvmefc_ls_req *lsreq); + void (*host_release)(void *hosthandle); u32 max_hw_queues; u16 max_sgl_segments; @@ -848,7 +967,9 @@ struct nvmet_fc_target_template { u32 target_features; + /* sizes of additional private data for data structures */ u32 target_priv_sz; + u32 lsrqst_priv_sz; }; @@ -859,10 +980,61 @@ int nvmet_fc_register_targetport(struct nvmet_fc_port_info *portinfo, int nvmet_fc_unregister_targetport(struct nvmet_fc_target_port *tgtport); +/* + * Routine called to pass a NVME-FC LS request, received by the lldd, + * to the nvmet-fc transport. + * + * If the return value is zero: the LS was successfully accepted by the + * transport. + * If the return value is non-zero: the transport has not accepted the + * LS. The lldd should ABTS-LS the LS. + * + * Note: if the LLDD receives and ABTS for the LS prior to the transport + * calling the ops->xmt_ls_rsp() routine to transmit a response, the LLDD + * shall mark the LS as aborted, and when the xmt_ls_rsp() is called: the + * response shall not be transmit and the struct nvmefc_ls_rsp() done + * routine shall be called. 
The LLDD may transmit the ABTS response as + * soon as the LS was marked or can delay until the xmt_ls_rsp() call is + * made. + * Note: if an RCV LS was successfully posted to the transport and the + * targetport is then unregistered before xmt_ls_rsp() was called for + * the lsrsp structure, the transport will still call xmt_ls_rsp() + * afterward to cleanup the outstanding lsrsp structure. The LLDD should + * noop the transmission of the rsp and call the lsrsp->done() routine + * to allow the lsrsp structure to be released. + */ int nvmet_fc_rcv_ls_req(struct nvmet_fc_target_port *tgtport, - struct nvmefc_tgt_ls_req *lsreq, + void *hosthandle, + struct nvmefc_ls_rsp *rsp, void *lsreqbuf, u32 lsreqbuf_len); +/* + * Routine called by the LLDD whenever it has a logout or loss of + * connectivity to a NVME-FC host port which there had been active + * NVMe controllers for. The host port is indicated by the + * hosthandle. The hosthandle is given to the nvmet-fc transport + * when a NVME LS was received, typically to create a new association. + * The nvmet-fc transport will cache the hostport value with the + * association for use in LS requests for the association. + * When the LLDD calls this routine, the nvmet-fc transport will + * immediately terminate all associations that were created with + * the hosthandle host port. + * The LLDD, after calling this routine and having control returned, + * must assume the transport may subsequently utilize hosthandle as + * part of sending LS's to terminate the association. The LLDD + * should reject the LS's if they are attempted. + * Once the last association has terminated for the hosthandle host + * port, the nvmet-fc transport will call the ops->host_release() + * callback. As of the callback, the nvmet-fc transport will no + * longer reference hosthandle. + */ +void nvmet_fc_invalidate_host(struct nvmet_fc_target_port *tgtport, + void *hosthandle); + +/* + * If nvmet_fc_rcv_fcp_req returns non-zero, the transport has not accepted + * the FCP cmd. The lldd should ABTS-LS the cmd. + */ int nvmet_fc_rcv_fcp_req(struct nvmet_fc_target_port *tgtport, struct nvmefc_tgt_fcp_req *fcpreq, void *cmdiubuf, u32 cmdiubuf_len); -- cgit v1.2.3 From d02abd198633a4c40411b9a5994111452720470d Mon Sep 17 00:00:00 2001 From: Chaitanya Kulkarni Date: Mon, 4 May 2020 01:56:48 -0700 Subject: nvmet: align addrfam list to spec With reference to the NVMeOF Specification (page 44, Figure 38), the discovery log page entry provides an address family field. We do set the transport type field, but the adrfam field is not set when using the loop transport, and it also has no support in nvme-cli. So reading the discovery log page with a loop transport leads to confusing output. As per the spec, adrfam value 254 is reserved for Intra Host Transport (i.e. loopback), so we add the required macro in the protocol header file, set the default port disc addr entry's adrfam to NVMF_ADDR_FAMILY_MAX, and update the nvmet_addr_family configfs array for the show/store attribute. Without this patch, setting adrfam to (ipv4/ipv6/ib/fc/loop/" ") we get the following output for the nvme discover command from nvme-cli, which is confusing.
trtype: loop adrfam: ipv4 trtype: loop adrfam: ipv6 trtype: loop adrfam: infiniband trtype: loop adrfam: fibre-channel trtype: loop # ${CFGFS_HOME}/nvmet/ports/1/addr_adrfam = loop adrfam: pci # <----- pci for loop trtype: loop # ${CFGFS_HOME}/nvmet/ports/1/addr_adrfam = " " adrfam: pci # <----- pci for unrecognized This patch fixes above output :- trtype: loop adrfam: ipv4 trtype: loop adrfam: ipv6 trtype: loop adrfam: infiniband trtype: loop adrfam: fibre-channel trtype: loop # ${CFGFS_HOME}/nvmet/ports/1/addr_adrfam = loop adrfam: loop # <----- loop for loop trtype: loop # ${CFGFS_HOME}/config/nvmet/ports/adrfam = " " adrfam: unrecognized # <----- unrecognized when invalid value Signed-off-by: Chaitanya Kulkarni Signed-off-by: Christoph Hellwig Signed-off-by: Jens Axboe --- drivers/nvme/target/configfs.c | 14 ++++++++------ include/linux/nvme.h | 2 ++ 2 files changed, 10 insertions(+), 6 deletions(-) (limited to 'include/linux') diff --git a/drivers/nvme/target/configfs.c b/drivers/nvme/target/configfs.c index 20779587eefe..ae8fb4489a10 100644 --- a/drivers/nvme/target/configfs.c +++ b/drivers/nvme/target/configfs.c @@ -33,11 +33,12 @@ static struct nvmet_type_name_map nvmet_transport[] = { }; static const struct nvmet_type_name_map nvmet_addr_family[] = { - { NVMF_ADDR_FAMILY_PCI, "pcie" }, - { NVMF_ADDR_FAMILY_IP4, "ipv4" }, - { NVMF_ADDR_FAMILY_IP6, "ipv6" }, - { NVMF_ADDR_FAMILY_IB, "ib" }, - { NVMF_ADDR_FAMILY_FC, "fc" }, + { NVMF_ADDR_FAMILY_PCI, "pcie" }, + { NVMF_ADDR_FAMILY_IP4, "ipv4" }, + { NVMF_ADDR_FAMILY_IP6, "ipv6" }, + { NVMF_ADDR_FAMILY_IB, "ib" }, + { NVMF_ADDR_FAMILY_FC, "fc" }, + { NVMF_ADDR_FAMILY_LOOP, "loop" }, }; static bool nvmet_is_port_enabled(struct nvmet_port *p, const char *caller) @@ -83,7 +84,7 @@ static ssize_t nvmet_addr_adrfam_store(struct config_item *item, return -EINVAL; found: - port->disc_addr.adrfam = i; + port->disc_addr.adrfam = nvmet_addr_family[i].type; return count; } @@ -1338,6 +1339,7 @@ static struct config_group *nvmet_ports_make(struct config_group *group, port->inline_data_size = -1; /* < 0 == let the transport choose */ port->disc_addr.portid = cpu_to_le16(portid); + port->disc_addr.adrfam = NVMF_ADDR_FAMILY_MAX; port->disc_addr.treq = NVMF_TREQ_DISABLE_SQFLOW; config_group_init_type_name(&port->group, name, &nvmet_port_type); diff --git a/include/linux/nvme.h b/include/linux/nvme.h index 3d5189f46cb1..2d978d0cbde6 100644 --- a/include/linux/nvme.h +++ b/include/linux/nvme.h @@ -38,6 +38,8 @@ enum { NVMF_ADDR_FAMILY_IP6 = 2, /* IP6 */ NVMF_ADDR_FAMILY_IB = 3, /* InfiniBand */ NVMF_ADDR_FAMILY_FC = 4, /* Fibre Channel */ + NVMF_ADDR_FAMILY_LOOP = 254, /* Reserved for host usage */ + NVMF_ADDR_FAMILY_MAX, }; /* Transport Type codes for Discovery Log Page entry TRTYPE field */ -- cgit v1.2.3 From 92decf118f1da4c866515f80387f9cf4d48611d6 Mon Sep 17 00:00:00 2001 From: Keith Busch Date: Fri, 3 Apr 2020 10:53:46 -0700 Subject: nvme: define constants for identification values Improve code readability by defining the specification's constants that the driver is using when decoding identification payloads. 
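As a sketch of the readability win (the constants and the hunks using them follow below):

	/* before: the reader has to know what bit 4 of NSFEAT means */
	if (id->nsfeat & (1 << 4))
		...

	/* after: the named constant carries the meaning */
	if (id->nsfeat & NVME_NS_FEAT_IO_OPT)
		...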
Signed-off-by: Keith Busch Reviewed-by: Bart van Assche Reviewed-by: Chaitanya Kulkarni Acked-by: Sagi Grimberg Signed-off-by: Christoph Hellwig Signed-off-by: Jens Axboe --- drivers/nvme/host/core.c | 10 +++++----- drivers/nvme/host/multipath.c | 5 +++-- include/linux/nvme.h | 6 ++++++ 3 files changed, 14 insertions(+), 7 deletions(-) (limited to 'include/linux') diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c index 4a124a28118f..805d289e6cd9 100644 --- a/drivers/nvme/host/core.c +++ b/drivers/nvme/host/core.c @@ -1859,13 +1859,13 @@ static void nvme_update_disk_info(struct gendisk *disk, * and whether it should be used instead of AWUPF. If NAWUPF == * 0 then AWUPF must be used instead. */ - if (id->nsfeat & (1 << 1) && id->nawupf) + if (id->nsfeat & NVME_NS_FEAT_ATOMICS && id->nawupf) atomic_bs = (1 + le16_to_cpu(id->nawupf)) * bs; else atomic_bs = (1 + ns->ctrl->subsys->awupf) * bs; } - if (id->nsfeat & (1 << 4)) { + if (id->nsfeat & NVME_NS_FEAT_IO_OPT) { /* NPWG = Namespace Preferred Write Granularity */ phys_bs = bs * (1 + le16_to_cpu(id->npwg)); /* NOWS = Namespace Optimal Write Size */ @@ -1894,7 +1894,7 @@ static void nvme_update_disk_info(struct gendisk *disk, nvme_config_discard(disk, ns); nvme_config_write_zeroes(disk, ns); - if (id->nsattr & (1 << 0)) + if (id->nsattr & NVME_NS_ATTR_RO) set_disk_ro(disk, true); else set_disk_ro(disk, false); @@ -2685,7 +2685,7 @@ static bool nvme_validate_cntlid(struct nvme_subsystem *subsys, return false; } - if ((id->cmic & (1 << 1)) || + if ((id->cmic & NVME_CTRL_CMIC_MULTI_CTRL) || (ctrl->opts && ctrl->opts->discovery_nqn)) continue; @@ -3497,7 +3497,7 @@ static int nvme_init_ns_head(struct nvme_ns *ns, unsigned nsid, struct nvme_id_ns *id) { struct nvme_ctrl *ctrl = ns->ctrl; - bool is_shared = id->nmic & (1 << 0); + bool is_shared = id->nmic & NVME_NS_NMIC_SHARED; struct nvme_ns_head *head = NULL; struct nvme_ns_ids ids; int ret = 0; diff --git a/drivers/nvme/host/multipath.c b/drivers/nvme/host/multipath.c index 5f5537d0a024..da78e499947a 100644 --- a/drivers/nvme/host/multipath.c +++ b/drivers/nvme/host/multipath.c @@ -372,7 +372,7 @@ int nvme_mpath_alloc_disk(struct nvme_ctrl *ctrl, struct nvme_ns_head *head) * We also do this for private namespaces as the namespace sharing data could * change after a rescan. 
*/ - if (!(ctrl->subsys->cmic & (1 << 1)) || !multipath) + if (!(ctrl->subsys->cmic & NVME_CTRL_CMIC_MULTI_CTRL) || !multipath) return 0; q = blk_alloc_queue(nvme_ns_head_make_request, ctrl->numa_node); @@ -694,7 +694,8 @@ int nvme_mpath_init(struct nvme_ctrl *ctrl, struct nvme_id_ctrl *id) int error; /* check if multipath is enabled and we have the capability */ - if (!multipath || !ctrl->subsys || !(ctrl->subsys->cmic & (1 << 3))) + if (!multipath || !ctrl->subsys || + !(ctrl->subsys->cmic & NVME_CTRL_CMIC_ANA)) return 0; ctrl->anacap = id->anacap; diff --git a/include/linux/nvme.h b/include/linux/nvme.h index 2d978d0cbde6..b235a48eac8c 100644 --- a/include/linux/nvme.h +++ b/include/linux/nvme.h @@ -301,6 +301,8 @@ struct nvme_id_ctrl { }; enum { + NVME_CTRL_CMIC_MULTI_CTRL = 1 << 1, + NVME_CTRL_CMIC_ANA = 1 << 3, NVME_CTRL_ONCS_COMPARE = 1 << 0, NVME_CTRL_ONCS_WRITE_UNCORRECTABLE = 1 << 1, NVME_CTRL_ONCS_DSM = 1 << 2, @@ -396,8 +398,12 @@ enum { enum { NVME_NS_FEAT_THIN = 1 << 0, + NVME_NS_FEAT_ATOMICS = 1 << 1, + NVME_NS_FEAT_IO_OPT = 1 << 4, + NVME_NS_ATTR_RO = 1 << 0, NVME_NS_FLBAS_LBA_MASK = 0xf, NVME_NS_FLBAS_META_EXT = 0x10, + NVME_NS_NMIC_SHARED = 1 << 0, NVME_LBAF_RP_BEST = 0, NVME_LBAF_RP_BETTER = 1, NVME_LBAF_RP_GOOD = 2, -- cgit v1.2.3 From ae24345da54e452880808b011fa2d8a0bbd191ba Mon Sep 17 00:00:00 2001 From: Yonghong Song Date: Sat, 9 May 2020 10:58:59 -0700 Subject: bpf: Implement an interface to register bpf_iter targets The target can call bpf_iter_reg_target() to register itself. The needed information: target: target name seq_ops: the seq_file operations for the target init_seq_private: target callback to initialize seq_priv during file open fini_seq_private: target callback to clean up seq_priv during file release seq_priv_size: the private_data size needed by the seq_file operations The target name represents a target which provides a seq_ops for iterating objects. The target can provide two callback functions, init_seq_private and fini_seq_private, called during file open/release time. For example, for /proc/net/{tcp6, ipv6_route, netlink, ...}, the net namespace needs to be set up properly during file open and released properly during file release. Function bpf_iter_unreg_target() is also implemented to unregister a particular target.
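Illustration (not part of the patch): a hedged sketch of how a target might register itself with this interface. The "foo" target name, its foo_seq_* callbacks and struct foo_iter_info are hypothetical placeholders; the shape follows the bpf_iter_reg structure added below.

        static const struct seq_operations foo_seq_ops = {
                .start  = foo_seq_start,
                .next   = foo_seq_next,
                .stop   = foo_seq_stop,
                .show   = foo_seq_show,
        };

        static int __init foo_iter_init(void)
        {
                struct bpf_iter_reg reg_info = {
                        .target           = "foo",
                        .seq_ops          = &foo_seq_ops,
                        .init_seq_private = NULL, /* no per-open setup needed */
                        .fini_seq_private = NULL,
                        .seq_priv_size    = sizeof(struct foo_iter_info),
                };

                return bpf_iter_reg_target(&reg_info);
        }
        late_initcall(foo_iter_init);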
Signed-off-by: Yonghong Song Signed-off-by: Alexei Starovoitov Acked-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20200509175859.2474669-1-yhs@fb.com --- include/linux/bpf.h | 15 +++++++++++++ kernel/bpf/Makefile | 2 +- kernel/bpf/bpf_iter.c | 59 +++++++++++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 75 insertions(+), 1 deletion(-) create mode 100644 kernel/bpf/bpf_iter.c (limited to 'include/linux') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 1262ec460ab3..40c78b86fe38 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -31,6 +31,7 @@ struct seq_file; struct btf; struct btf_type; struct exception_table_entry; +struct seq_operations; extern struct idr btf_idr; extern spinlock_t btf_idr_lock; @@ -1126,6 +1127,20 @@ struct bpf_link *bpf_link_get_from_fd(u32 ufd); int bpf_obj_pin_user(u32 ufd, const char __user *pathname); int bpf_obj_get_user(const char __user *pathname, int flags); +typedef int (*bpf_iter_init_seq_priv_t)(void *private_data); +typedef void (*bpf_iter_fini_seq_priv_t)(void *private_data); + +struct bpf_iter_reg { + const char *target; + const struct seq_operations *seq_ops; + bpf_iter_init_seq_priv_t init_seq_private; + bpf_iter_fini_seq_priv_t fini_seq_private; + u32 seq_priv_size; +}; + +int bpf_iter_reg_target(struct bpf_iter_reg *reg_info); +void bpf_iter_unreg_target(const char *target); + int bpf_percpu_hash_copy(struct bpf_map *map, void *key, void *value); int bpf_percpu_array_copy(struct bpf_map *map, void *key, void *value); int bpf_percpu_hash_update(struct bpf_map *map, void *key, void *value, diff --git a/kernel/bpf/Makefile b/kernel/bpf/Makefile index f2d7be596966..6a8b0febd3f6 100644 --- a/kernel/bpf/Makefile +++ b/kernel/bpf/Makefile @@ -2,7 +2,7 @@ obj-y := core.o CFLAGS_core.o += $(call cc-disable-warning, override-init) -obj-$(CONFIG_BPF_SYSCALL) += syscall.o verifier.o inode.o helpers.o tnum.o +obj-$(CONFIG_BPF_SYSCALL) += syscall.o verifier.o inode.o helpers.o tnum.o bpf_iter.o obj-$(CONFIG_BPF_SYSCALL) += hashtab.o arraymap.o percpu_freelist.o bpf_lru_list.o lpm_trie.o map_in_map.o obj-$(CONFIG_BPF_SYSCALL) += local_storage.o queue_stack_maps.o obj-$(CONFIG_BPF_SYSCALL) += disasm.o diff --git a/kernel/bpf/bpf_iter.c b/kernel/bpf/bpf_iter.c new file mode 100644 index 000000000000..5a8119d17d14 --- /dev/null +++ b/kernel/bpf/bpf_iter.c @@ -0,0 +1,59 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* Copyright (c) 2020 Facebook */ + +#include +#include +#include + +struct bpf_iter_target_info { + struct list_head list; + const char *target; + const struct seq_operations *seq_ops; + bpf_iter_init_seq_priv_t init_seq_private; + bpf_iter_fini_seq_priv_t fini_seq_private; + u32 seq_priv_size; +}; + +static struct list_head targets = LIST_HEAD_INIT(targets); +static DEFINE_MUTEX(targets_mutex); + +int bpf_iter_reg_target(struct bpf_iter_reg *reg_info) +{ + struct bpf_iter_target_info *tinfo; + + tinfo = kmalloc(sizeof(*tinfo), GFP_KERNEL); + if (!tinfo) + return -ENOMEM; + + tinfo->target = reg_info->target; + tinfo->seq_ops = reg_info->seq_ops; + tinfo->init_seq_private = reg_info->init_seq_private; + tinfo->fini_seq_private = reg_info->fini_seq_private; + tinfo->seq_priv_size = reg_info->seq_priv_size; + INIT_LIST_HEAD(&tinfo->list); + + mutex_lock(&targets_mutex); + list_add(&tinfo->list, &targets); + mutex_unlock(&targets_mutex); + + return 0; +} + +void bpf_iter_unreg_target(const char *target) +{ + struct bpf_iter_target_info *tinfo; + bool found = false; + + mutex_lock(&targets_mutex); + 
list_for_each_entry(tinfo, &targets, list) { + if (!strcmp(target, tinfo->target)) { + list_del(&tinfo->list); + kfree(tinfo); + found = true; + break; + } + } + mutex_unlock(&targets_mutex); + + WARN_ON(found == false); +} -- cgit v1.2.3 From 15d83c4d7cef5c067a8b075ce59e97df4f60706e Mon Sep 17 00:00:00 2001 From: Yonghong Song Date: Sat, 9 May 2020 10:59:00 -0700 Subject: bpf: Allow loading of a bpf_iter program A bpf_iter program is a tracing program with attach type BPF_TRACE_ITER. The load attribute attach_btf_id is used by the verifier against a particular kernel function, which represents a target, e.g., __bpf_iter__bpf_map for target bpf_map which is implemented later. The program return value must be 0 or 1 for now. 0 : successful, except potential seq_file buffer overflow which is handled by seq_file reader. 1 : request to restart the same object In the future, other return values may be used for filtering or terminating the iterator. Signed-off-by: Yonghong Song Signed-off-by: Alexei Starovoitov Acked-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20200509175900.2474947-1-yhs@fb.com --- include/linux/bpf.h | 3 +++ include/uapi/linux/bpf.h | 1 + kernel/bpf/bpf_iter.c | 36 ++++++++++++++++++++++++++++++++++ kernel/bpf/verifier.c | 21 +++++++++++++++++++++ tools/include/uapi/linux/bpf.h | 1 + 5 files changed, 62 insertions(+) (limited to 'include/linux') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 40c78b86fe38..f28bdd714754 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -1127,6 +1127,8 @@ struct bpf_link *bpf_link_get_from_fd(u32 ufd); int bpf_obj_pin_user(u32 ufd, const char __user *pathname); int bpf_obj_get_user(const char __user *pathname, int flags); +#define BPF_ITER_FUNC_PREFIX "__bpf_iter__" + typedef int (*bpf_iter_init_seq_priv_t)(void *private_data); typedef void (*bpf_iter_fini_seq_priv_t)(void *private_data); @@ -1140,6 +1142,7 @@ struct bpf_iter_reg { int bpf_iter_reg_target(struct bpf_iter_reg *reg_info); void bpf_iter_unreg_target(const char *target); +bool bpf_iter_prog_supported(struct bpf_prog *prog); int bpf_percpu_hash_copy(struct bpf_map *map, void *key, void *value); int bpf_percpu_array_copy(struct bpf_map *map, void *key, void *value); diff --git a/kernel/bpf/bpf_iter.c b/kernel/bpf/bpf_iter.c index 5a8119d17d14..dec182d8395a 100644 --- a/kernel/bpf/bpf_iter.c +++ b/kernel/bpf/bpf_iter.c @@ -12,6 +12,7 @@ struct bpf_iter_target_info { bpf_iter_init_seq_priv_t init_seq_private; bpf_iter_fini_seq_priv_t fini_seq_private; u32 seq_priv_size; + u32 btf_id; /* cached value */ }; static struct list_head targets = LIST_HEAD_INIT(targets); @@ -57,3 +58,38 @@ void bpf_iter_unreg_target(const char *target) WARN_ON(found == false); } + +static void cache_btf_id(struct bpf_iter_target_info *tinfo, + struct bpf_prog *prog) +{ + tinfo->btf_id = prog->aux->attach_btf_id; +} + +bool bpf_iter_prog_supported(struct bpf_prog *prog) +{ + const char *attach_fname = prog->aux->attach_func_name; + u32 prog_btf_id = prog->aux->attach_btf_id; + const char *prefix = BPF_ITER_FUNC_PREFIX; + struct bpf_iter_target_info *tinfo; + int prefix_len = strlen(prefix); + bool supported = false; + + if (strncmp(attach_fname, prefix, prefix_len)) + return
false; + + mutex_lock(&targets_mutex); + list_for_each_entry(tinfo, &targets, list) { + if (tinfo->btf_id && tinfo->btf_id == prog_btf_id) { + supported = true; + break; + } + if (!strcmp(attach_fname + prefix_len, tinfo->target)) { + cache_btf_id(tinfo, prog); + supported = true; + break; + } + } + mutex_unlock(&targets_mutex); + + return supported; +} diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 70ad009577f8..d725ff7d11db 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -7101,6 +7101,10 @@ static int check_return_code(struct bpf_verifier_env *env) return 0; range = tnum_const(0); break; + case BPF_PROG_TYPE_TRACING: + if (env->prog->expected_attach_type != BPF_TRACE_ITER) + return 0; + break; default: return 0; } @@ -10481,6 +10485,7 @@ static int check_attach_btf_id(struct bpf_verifier_env *env) struct bpf_prog *tgt_prog = prog->aux->linked_prog; u32 btf_id = prog->aux->attach_btf_id; const char prefix[] = "btf_trace_"; + struct btf_func_model fmodel; int ret = 0, subprog = -1, i; struct bpf_trampoline *tr; const struct btf_type *t; @@ -10622,6 +10627,22 @@ static int check_attach_btf_id(struct bpf_verifier_env *env) prog->aux->attach_func_proto = t; prog->aux->attach_btf_trace = true; return 0; + case BPF_TRACE_ITER: + if (!btf_type_is_func(t)) { + verbose(env, "attach_btf_id %u is not a function\n", + btf_id); + return -EINVAL; + } + t = btf_type_by_id(btf, t->type); + if (!btf_type_is_func_proto(t)) + return -EINVAL; + prog->aux->attach_func_name = tname; + prog->aux->attach_func_proto = t; + if (!bpf_iter_prog_supported(prog)) + return -EINVAL; + ret = btf_distill_func_proto(&env->log, btf, t, + tname, &fmodel); + return ret; default: if (!prog_extension) return -EINVAL; diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h index 6e5e7caa3739..c8a5325cc8d0 100644 --- a/tools/include/uapi/linux/bpf.h +++ b/tools/include/uapi/linux/bpf.h @@ -218,6 +218,7 @@ enum bpf_attach_type { BPF_TRACE_FEXIT, BPF_MODIFY_RETURN, BPF_LSM_MAC, + BPF_TRACE_ITER, __MAX_BPF_ATTACH_TYPE }; -- cgit v1.2.3 From de4e05cac46d206f9090051ef09930514bff73e4 Mon Sep 17 00:00:00 2001 From: Yonghong Song Date: Sat, 9 May 2020 10:59:01 -0700 Subject: bpf: Support bpf tracing/iter programs for BPF_LINK_CREATE Given a bpf program, the steps to create an anonymous bpf iterator are: - create a bpf_iter_link, which combines bpf program and the target. In the future, there could be more information recorded in the link. A link_fd will be returned to the user space. - create an anonymous bpf iterator with the given link_fd. The bpf_iter_link can be pinned to bpffs mount file system to create a file based bpf iterator as well. The benefits of using bpf_iter_link: - using bpf link simplifies design and implementation as bpf link is used for other tracing bpf programs. - for file based bpf iterator, bpf_iter_link provides a standard way to replace underlying bpf programs. - for both anonymous and file based iterators, bpf link query capability can be leveraged. This patch adds support for tracing/iter programs to BPF_LINK_CREATE. A new link type BPF_LINK_TYPE_ITER is added to facilitate link querying. Currently, only prog_id is needed, so no additional in-kernel show_fdinfo() or fill_link_info() hook is needed for the BPF_LINK_TYPE_ITER link.
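Illustration (not part of the patch): a hedged userspace sketch of the new attach step. prog_fd is assumed to be the fd of a loaded BPF_PROG_TYPE_TRACING program with expected_attach_type BPF_TRACE_ITER; the raw bpf(2) syscall is used since no library wrapper is part of this patch.

        #include <unistd.h>
        #include <sys/syscall.h>
        #include <string.h>
        #include <linux/bpf.h>

        int create_iter_link(int prog_fd)
        {
                union bpf_attr attr;

                memset(&attr, 0, sizeof(attr));
                attr.link_create.prog_fd = prog_fd;
                attr.link_create.attach_type = BPF_TRACE_ITER;
                /* target_fd and flags must be zero for iter links */
                return syscall(__NR_bpf, BPF_LINK_CREATE, &attr, sizeof(attr));
        }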
Signed-off-by: Yonghong Song Signed-off-by: Alexei Starovoitov Acked-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20200509175901.2475084-1-yhs@fb.com --- include/linux/bpf.h | 1 + include/linux/bpf_types.h | 1 + include/uapi/linux/bpf.h | 1 + kernel/bpf/bpf_iter.c | 62 ++++++++++++++++++++++++++++++++++++++++++ kernel/bpf/syscall.c | 14 ++++++++++ tools/include/uapi/linux/bpf.h | 1 + 6 files changed, 80 insertions(+) (limited to 'include/linux') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index f28bdd714754..e93d2d33c82c 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -1143,6 +1143,7 @@ struct bpf_iter_reg { int bpf_iter_reg_target(struct bpf_iter_reg *reg_info); void bpf_iter_unreg_target(const char *target); bool bpf_iter_prog_supported(struct bpf_prog *prog); +int bpf_iter_link_attach(const union bpf_attr *attr, struct bpf_prog *prog); int bpf_percpu_hash_copy(struct bpf_map *map, void *key, void *value); int bpf_percpu_array_copy(struct bpf_map *map, void *key, void *value); diff --git a/include/linux/bpf_types.h b/include/linux/bpf_types.h index 8345cdf553b8..29d22752fc87 100644 --- a/include/linux/bpf_types.h +++ b/include/linux/bpf_types.h @@ -124,3 +124,4 @@ BPF_LINK_TYPE(BPF_LINK_TYPE_TRACING, tracing) #ifdef CONFIG_CGROUP_BPF BPF_LINK_TYPE(BPF_LINK_TYPE_CGROUP, cgroup) #endif +BPF_LINK_TYPE(BPF_LINK_TYPE_ITER, iter) diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index c8a5325cc8d0..1e8dfff5d5d4 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -229,6 +229,7 @@ enum bpf_link_type { BPF_LINK_TYPE_RAW_TRACEPOINT = 1, BPF_LINK_TYPE_TRACING = 2, BPF_LINK_TYPE_CGROUP = 3, + BPF_LINK_TYPE_ITER = 4, MAX_BPF_LINK_TYPE, }; diff --git a/kernel/bpf/bpf_iter.c b/kernel/bpf/bpf_iter.c index dec182d8395a..03f5832909db 100644 --- a/kernel/bpf/bpf_iter.c +++ b/kernel/bpf/bpf_iter.c @@ -15,6 +15,11 @@ struct bpf_iter_target_info { u32 btf_id; /* cached value */ }; +struct bpf_iter_link { + struct bpf_link link; + struct bpf_iter_target_info *tinfo; +}; + static struct list_head targets = LIST_HEAD_INIT(targets); static DEFINE_MUTEX(targets_mutex); @@ -93,3 +98,60 @@ bool bpf_iter_prog_supported(struct bpf_prog *prog) return supported; } + +static void bpf_iter_link_release(struct bpf_link *link) +{ +} + +static void bpf_iter_link_dealloc(struct bpf_link *link) +{ + struct bpf_iter_link *iter_link = + container_of(link, struct bpf_iter_link, link); + + kfree(iter_link); +} + +static const struct bpf_link_ops bpf_iter_link_lops = { + .release = bpf_iter_link_release, + .dealloc = bpf_iter_link_dealloc, +}; + +int bpf_iter_link_attach(const union bpf_attr *attr, struct bpf_prog *prog) +{ + struct bpf_link_primer link_primer; + struct bpf_iter_target_info *tinfo; + struct bpf_iter_link *link; + bool existed = false; + u32 prog_btf_id; + int err; + + if (attr->link_create.target_fd || attr->link_create.flags) + return -EINVAL; + + prog_btf_id = prog->aux->attach_btf_id; + mutex_lock(&targets_mutex); + list_for_each_entry(tinfo, &targets, list) { + if (tinfo->btf_id == prog_btf_id) { + existed = true; + break; + } + } + mutex_unlock(&targets_mutex); + if (!existed) + return -ENOENT; + + link = kzalloc(sizeof(*link), GFP_USER | __GFP_NOWARN); + if (!link) + return -ENOMEM; + + bpf_link_init(&link->link, BPF_LINK_TYPE_ITER, &bpf_iter_link_lops, prog); + link->tinfo = tinfo; + + err = bpf_link_prime(&link->link, &link_primer); + if (err) { + kfree(link); + return err; + } + + return bpf_link_settle(&link_primer); +} diff --git 
a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index bb1ab7da6103..6ffe2d8fb6c7 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -2729,6 +2729,8 @@ attach_type_to_prog_type(enum bpf_attach_type attach_type) case BPF_CGROUP_GETSOCKOPT: case BPF_CGROUP_SETSOCKOPT: return BPF_PROG_TYPE_CGROUP_SOCKOPT; + case BPF_TRACE_ITER: + return BPF_PROG_TYPE_TRACING; default: return BPF_PROG_TYPE_UNSPEC; } @@ -3729,6 +3731,15 @@ err_put: return err; } +static int tracing_bpf_link_attach(const union bpf_attr *attr, struct bpf_prog *prog) +{ + if (attr->link_create.attach_type == BPF_TRACE_ITER && + prog->expected_attach_type == BPF_TRACE_ITER) + return bpf_iter_link_attach(attr, prog); + + return -EINVAL; +} + #define BPF_LINK_CREATE_LAST_FIELD link_create.flags static int link_create(union bpf_attr *attr) { @@ -3765,6 +3776,9 @@ static int link_create(union bpf_attr *attr) case BPF_PROG_TYPE_CGROUP_SOCKOPT: ret = cgroup_bpf_link_attach(attr, prog); break; + case BPF_PROG_TYPE_TRACING: + ret = tracing_bpf_link_attach(attr, prog); + break; default: ret = -EINVAL; } diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h index c8a5325cc8d0..1e8dfff5d5d4 100644 --- a/tools/include/uapi/linux/bpf.h +++ b/tools/include/uapi/linux/bpf.h @@ -229,6 +229,7 @@ enum bpf_link_type { BPF_LINK_TYPE_RAW_TRACEPOINT = 1, BPF_LINK_TYPE_TRACING = 2, BPF_LINK_TYPE_CGROUP = 3, + BPF_LINK_TYPE_ITER = 4, MAX_BPF_LINK_TYPE, }; -- cgit v1.2.3 From ac51d99bf81caac8d8881fe52098948110d0de68 Mon Sep 17 00:00:00 2001 From: Yonghong Song Date: Sat, 9 May 2020 10:59:05 -0700 Subject: bpf: Create anonymous bpf iterator A new bpf command BPF_ITER_CREATE is added. The anonymous bpf iterator is seq_file based. The seq_file private data is referenced by targets. The bpf_iter infrastructure allocates additional space at seq_file->private before the space used by targets to store some meta data, e.g., prog: prog to run session_id: a unique id for each opened seq_file seq_num: how many times bpf programs are queried in this session done_stop: an internal state to decide whether the bpf program should be called in seq_ops->stop() or not The seq_num will start from 0 for valid objects. The bpf program may see the same seq_num more than once if - seq_file buffer overflow happens and the same object is retried by bpf_seq_read(), or - the bpf program explicitly requests a retry of the same object Since modules are not supported for bpf_iter, all target registration happens at __init time, so there is no need to change bpf_iter_unreg_target(), as it is used mostly in the error path of the init function, at which time no bpf iterators have been created yet.
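Illustration (not part of the patch): a hedged userspace sketch of creating and draining an anonymous iterator. link_fd is assumed to come from a prior BPF_LINK_CREATE with attach_type BPF_TRACE_ITER (previous patch).

        #include <unistd.h>
        #include <sys/syscall.h>
        #include <string.h>
        #include <linux/bpf.h>

        int run_iter(int link_fd)
        {
                union bpf_attr attr;
                char buf[4096];
                ssize_t n;
                int iter_fd;

                memset(&attr, 0, sizeof(attr));
                attr.iter_create.link_fd = link_fd;
                attr.iter_create.flags = 0; /* must be zero */
                iter_fd = syscall(__NR_bpf, BPF_ITER_CREATE, &attr, sizeof(attr));
                if (iter_fd < 0)
                        return iter_fd;

                /* each read() drives the seq_file, which runs the bpf program */
                while ((n = read(iter_fd, buf, sizeof(buf))) > 0)
                        write(STDOUT_FILENO, buf, n);

                close(iter_fd);
                return 0;
        }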
Signed-off-by: Yonghong Song Signed-off-by: Alexei Starovoitov Acked-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20200509175905.2475770-1-yhs@fb.com --- include/linux/bpf.h | 1 + include/uapi/linux/bpf.h | 6 ++ kernel/bpf/bpf_iter.c | 129 +++++++++++++++++++++++++++++++++++++++++ kernel/bpf/syscall.c | 26 +++++++++ tools/include/uapi/linux/bpf.h | 6 ++ 5 files changed, 168 insertions(+) (limited to 'include/linux') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index e93d2d33c82c..80b1b9d8a638 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -1144,6 +1144,7 @@ int bpf_iter_reg_target(struct bpf_iter_reg *reg_info); void bpf_iter_unreg_target(const char *target); bool bpf_iter_prog_supported(struct bpf_prog *prog); int bpf_iter_link_attach(const union bpf_attr *attr, struct bpf_prog *prog); +int bpf_iter_new_fd(struct bpf_link *link); int bpf_percpu_hash_copy(struct bpf_map *map, void *key, void *value); int bpf_percpu_array_copy(struct bpf_map *map, void *key, void *value); diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 1e8dfff5d5d4..708763f702e1 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -116,6 +116,7 @@ enum bpf_cmd { BPF_LINK_GET_FD_BY_ID, BPF_LINK_GET_NEXT_ID, BPF_ENABLE_STATS, + BPF_ITER_CREATE, }; enum bpf_map_type { @@ -614,6 +615,11 @@ union bpf_attr { __u32 type; } enable_stats; + struct { /* struct used by BPF_ITER_CREATE command */ + __u32 link_fd; + __u32 flags; + } iter_create; + } __attribute__((aligned(8))); /* The description below is an attempt at providing documentation to eBPF diff --git a/kernel/bpf/bpf_iter.c b/kernel/bpf/bpf_iter.c index 832973ee80fa..e7129b57865f 100644 --- a/kernel/bpf/bpf_iter.c +++ b/kernel/bpf/bpf_iter.c @@ -2,6 +2,7 @@ /* Copyright (c) 2020 Facebook */ #include +#include #include #include @@ -20,12 +21,24 @@ struct bpf_iter_link { struct bpf_iter_target_info *tinfo; }; +struct bpf_iter_priv_data { + struct bpf_iter_target_info *tinfo; + struct bpf_prog *prog; + u64 session_id; + u64 seq_num; + bool done_stop; + u8 target_private[] __aligned(8); +}; + static struct list_head targets = LIST_HEAD_INIT(targets); static DEFINE_MUTEX(targets_mutex); /* protect bpf_iter_link changes */ static DEFINE_MUTEX(link_mutex); +/* incremented on every opened seq_file */ +static atomic64_t session_id; + /* bpf_seq_read, a customized and simpler version for bpf iterator. * no_llseek is assumed for this file. 
* The following are differences from seq_read(): @@ -149,6 +162,33 @@ done: return copied; } +static int iter_release(struct inode *inode, struct file *file) +{ + struct bpf_iter_priv_data *iter_priv; + struct seq_file *seq; + + seq = file->private_data; + if (!seq) + return 0; + + iter_priv = container_of(seq->private, struct bpf_iter_priv_data, + target_private); + + if (iter_priv->tinfo->fini_seq_private) + iter_priv->tinfo->fini_seq_private(seq->private); + + bpf_prog_put(iter_priv->prog); + seq->private = iter_priv; + + return seq_release_private(inode, file); +} + +static const struct file_operations bpf_iter_fops = { + .llseek = no_llseek, + .read = bpf_seq_read, + .release = iter_release, +}; + int bpf_iter_reg_target(struct bpf_iter_reg *reg_info) { struct bpf_iter_target_info *tinfo; @@ -309,3 +349,92 @@ int bpf_iter_link_attach(const union bpf_attr *attr, struct bpf_prog *prog) return bpf_link_settle(&link_primer); } + +static void init_seq_meta(struct bpf_iter_priv_data *priv_data, + struct bpf_iter_target_info *tinfo, + struct bpf_prog *prog) +{ + priv_data->tinfo = tinfo; + priv_data->prog = prog; + priv_data->session_id = atomic64_inc_return(&session_id); + priv_data->seq_num = 0; + priv_data->done_stop = false; +} + +static int prepare_seq_file(struct file *file, struct bpf_iter_link *link) +{ + struct bpf_iter_priv_data *priv_data; + struct bpf_iter_target_info *tinfo; + struct bpf_prog *prog; + u32 total_priv_dsize; + struct seq_file *seq; + int err = 0; + + mutex_lock(&link_mutex); + prog = link->link.prog; + bpf_prog_inc(prog); + mutex_unlock(&link_mutex); + + tinfo = link->tinfo; + total_priv_dsize = offsetof(struct bpf_iter_priv_data, target_private) + + tinfo->seq_priv_size; + priv_data = __seq_open_private(file, tinfo->seq_ops, total_priv_dsize); + if (!priv_data) { + err = -ENOMEM; + goto release_prog; + } + + if (tinfo->init_seq_private) { + err = tinfo->init_seq_private(priv_data->target_private); + if (err) + goto release_seq_file; + } + + init_seq_meta(priv_data, tinfo, prog); + seq = file->private_data; + seq->private = priv_data->target_private; + + return 0; + +release_seq_file: + seq_release_private(file->f_inode, file); + file->private_data = NULL; +release_prog: + bpf_prog_put(prog); + return err; +} + +int bpf_iter_new_fd(struct bpf_link *link) +{ + struct file *file; + unsigned int flags; + int err, fd; + + if (link->ops != &bpf_iter_link_lops) + return -EINVAL; + + flags = O_RDONLY | O_CLOEXEC; + fd = get_unused_fd_flags(flags); + if (fd < 0) + return fd; + + file = anon_inode_getfile("bpf_iter", &bpf_iter_fops, NULL, flags); + if (IS_ERR(file)) { + err = PTR_ERR(file); + goto free_fd; + } + + err = prepare_seq_file(file, + container_of(link, struct bpf_iter_link, link)); + if (err) + goto free_file; + + fd_install(fd, file); + return fd; + +free_file: + fput(file); +free_fd: + put_unused_fd(fd); + return err; +} diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 6ffe2d8fb6c7..a293e88ee01a 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -3941,6 +3941,29 @@ static int bpf_enable_stats(union bpf_attr *attr) return -EINVAL; } +#define BPF_ITER_CREATE_LAST_FIELD iter_create.flags + +static int bpf_iter_create(union bpf_attr *attr) +{ + struct bpf_link *link; + int err; + + if (CHECK_ATTR(BPF_ITER_CREATE)) + return -EINVAL; + + if (attr->iter_create.flags) + return -EINVAL; + + link = bpf_link_get_from_fd(attr->iter_create.link_fd); + if (IS_ERR(link)) + return PTR_ERR(link); + + err = bpf_iter_new_fd(link); + bpf_link_put(link); 
+ + return err; +} + SYSCALL_DEFINE3(bpf, int, cmd, union bpf_attr __user *, uattr, unsigned int, size) { union bpf_attr attr; @@ -4068,6 +4091,9 @@ SYSCALL_DEFINE3(bpf, int, cmd, union bpf_attr __user *, uattr, unsigned int, siz case BPF_ENABLE_STATS: err = bpf_enable_stats(&attr); break; + case BPF_ITER_CREATE: + err = bpf_iter_create(&attr); + break; default: err = -EINVAL; break; diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h index 1e8dfff5d5d4..708763f702e1 100644 --- a/tools/include/uapi/linux/bpf.h +++ b/tools/include/uapi/linux/bpf.h @@ -116,6 +116,7 @@ enum bpf_cmd { BPF_LINK_GET_FD_BY_ID, BPF_LINK_GET_NEXT_ID, BPF_ENABLE_STATS, + BPF_ITER_CREATE, }; enum bpf_map_type { @@ -614,6 +615,11 @@ union bpf_attr { __u32 type; } enable_stats; + struct { /* struct used by BPF_ITER_CREATE command */ + __u32 link_fd; + __u32 flags; + } iter_create; + } __attribute__((aligned(8))); /* The description below is an attempt at providing documentation to eBPF -- cgit v1.2.3 From 367ec3e4834cbd611401c2c40a23c22c825474f1 Mon Sep 17 00:00:00 2001 From: Yonghong Song Date: Sat, 9 May 2020 10:59:06 -0700 Subject: bpf: Create file bpf iterator To produce a file based bpf iterator, the fd must correspond to a link_fd associated with a trace/iter program. When the pinned file is opened, a seq_file will be generated. Signed-off-by: Yonghong Song Signed-off-by: Alexei Starovoitov Acked-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20200509175906.2475893-1-yhs@fb.com --- include/linux/bpf.h | 2 ++ kernel/bpf/bpf_iter.c | 17 ++++++++++++++++- kernel/bpf/inode.c | 5 ++++- 3 files changed, 22 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 80b1b9d8a638..b06653ab3476 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -1022,6 +1022,7 @@ static inline void bpf_enable_instrumentation(void) extern const struct file_operations bpf_map_fops; extern const struct file_operations bpf_prog_fops; +extern const struct file_operations bpf_iter_fops; #define BPF_PROG_TYPE(_id, _name, prog_ctx_type, kern_ctx_type) \ extern const struct bpf_prog_ops _name ## _prog_ops; \ @@ -1145,6 +1146,7 @@ void bpf_iter_unreg_target(const char *target); bool bpf_iter_prog_supported(struct bpf_prog *prog); int bpf_iter_link_attach(const union bpf_attr *attr, struct bpf_prog *prog); int bpf_iter_new_fd(struct bpf_link *link); +bool bpf_link_is_iter(struct bpf_link *link); int bpf_percpu_hash_copy(struct bpf_map *map, void *key, void *value); int bpf_percpu_array_copy(struct bpf_map *map, void *key, void *value); diff --git a/kernel/bpf/bpf_iter.c b/kernel/bpf/bpf_iter.c index e7129b57865f..090f09b0eacb 100644 --- a/kernel/bpf/bpf_iter.c +++ b/kernel/bpf/bpf_iter.c @@ -39,6 +39,8 @@ static DEFINE_MUTEX(link_mutex); /* incremented on every opened seq_file */ static atomic64_t session_id; +static int prepare_seq_file(struct file *file, struct bpf_iter_link *link); + /* bpf_seq_read, a customized and simpler version for bpf iterator. * no_llseek is assumed for this file.
* The following are differences from seq_read(): @@ -162,6 +164,13 @@ done: return copied; } +static int iter_open(struct inode *inode, struct file *file) +{ + struct bpf_iter_link *link = inode->i_private; + + return prepare_seq_file(file, link); +} + static int iter_release(struct inode *inode, struct file *file) { struct bpf_iter_priv_data *iter_priv; @@ -183,7 +192,8 @@ static int iter_release(struct inode *inode, struct file *file) return seq_release_private(inode, file); } -static const struct file_operations bpf_iter_fops = { +const struct file_operations bpf_iter_fops = { + .open = iter_open, .llseek = no_llseek, .read = bpf_seq_read, .release = iter_release, @@ -310,6 +320,11 @@ static const struct bpf_link_ops bpf_iter_link_lops = { .update_prog = bpf_iter_link_replace, }; +bool bpf_link_is_iter(struct bpf_link *link) +{ + return link->ops == &bpf_iter_link_lops; +} + int bpf_iter_link_attach(const union bpf_attr *attr, struct bpf_prog *prog) { struct bpf_link_primer link_primer; diff --git a/kernel/bpf/inode.c b/kernel/bpf/inode.c index 95087d9f4ed3..fb878ba3f22f 100644 --- a/kernel/bpf/inode.c +++ b/kernel/bpf/inode.c @@ -358,8 +358,11 @@ static int bpf_mkmap(struct dentry *dentry, umode_t mode, void *arg) static int bpf_mklink(struct dentry *dentry, umode_t mode, void *arg) { + struct bpf_link *link = arg; + return bpf_mkobj_ops(dentry, mode, arg, &bpf_link_iops, - &bpffs_obj_fops); + bpf_link_is_iter(link) ? + &bpf_iter_fops : &bpffs_obj_fops); } static struct dentry * -- cgit v1.2.3 From e5158d987b72c3f318b4b52a01ac6f3997bd0c00 Mon Sep 17 00:00:00 2001 From: Yonghong Song Date: Sat, 9 May 2020 10:59:07 -0700 Subject: bpf: Implement common macros/helpers for target iterators Macro DEFINE_BPF_ITER_FUNC is implemented so a target can define an init function to capture the BTF type which represents the target. The bpf_iter_meta is a structure holding meta data, common to all targets in the bpf program. Additional marker functions are called before or after the bpf_seq_read() show()/next()/stop() callback functions to help calculate a precise seq_num and to decide whether to call the bpf_prog inside stop(). Two functions, bpf_iter_get_info() and bpf_iter_run_prog(), are implemented so a target can get the needed information from the bpf_iter infrastructure and run the program. Signed-off-by: Yonghong Song Signed-off-by: Alexei Starovoitov Acked-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20200509175907.2475956-1-yhs@fb.com --- include/linux/bpf.h | 11 +++++++++ kernel/bpf/bpf_iter.c | 85 ++++++++++++++++++++++++++++++++++++++++++++++++--- 2 files changed, 91 insertions(+), 5 deletions(-) (limited to 'include/linux') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index b06653ab3476..ffe0b9b669bf 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -1129,6 +1129,9 @@ int bpf_obj_pin_user(u32 ufd, const char __user *pathname); int bpf_obj_get_user(const char __user *pathname, int flags); #define BPF_ITER_FUNC_PREFIX "__bpf_iter__" +#define DEFINE_BPF_ITER_FUNC(target, args...)
\ + extern int __bpf_iter__ ## target(args); \ + int __init __bpf_iter__ ## target(args) { return 0; } typedef int (*bpf_iter_init_seq_priv_t)(void *private_data); typedef void (*bpf_iter_fini_seq_priv_t)(void *private_data); @@ -1141,12 +1144,20 @@ struct bpf_iter_reg { u32 seq_priv_size; }; +struct bpf_iter_meta { + __bpf_md_ptr(struct seq_file *, seq); + u64 session_id; + u64 seq_num; +}; + int bpf_iter_reg_target(struct bpf_iter_reg *reg_info); void bpf_iter_unreg_target(const char *target); bool bpf_iter_prog_supported(struct bpf_prog *prog); int bpf_iter_link_attach(const union bpf_attr *attr, struct bpf_prog *prog); int bpf_iter_new_fd(struct bpf_link *link); bool bpf_link_is_iter(struct bpf_link *link); +struct bpf_prog *bpf_iter_get_info(struct bpf_iter_meta *meta, bool in_stop); +int bpf_iter_run_prog(struct bpf_prog *prog, void *ctx); int bpf_percpu_hash_copy(struct bpf_map *map, void *key, void *value); int bpf_percpu_array_copy(struct bpf_map *map, void *key, void *value); diff --git a/kernel/bpf/bpf_iter.c b/kernel/bpf/bpf_iter.c index 090f09b0eacb..30efd15cd4a0 100644 --- a/kernel/bpf/bpf_iter.c +++ b/kernel/bpf/bpf_iter.c @@ -41,6 +41,33 @@ static atomic64_t session_id; static int prepare_seq_file(struct file *file, struct bpf_iter_link *link); +static void bpf_iter_inc_seq_num(struct seq_file *seq) +{ + struct bpf_iter_priv_data *iter_priv; + + iter_priv = container_of(seq->private, struct bpf_iter_priv_data, + target_private); + iter_priv->seq_num++; +} + +static void bpf_iter_dec_seq_num(struct seq_file *seq) +{ + struct bpf_iter_priv_data *iter_priv; + + iter_priv = container_of(seq->private, struct bpf_iter_priv_data, + target_private); + iter_priv->seq_num--; +} + +static void bpf_iter_done_stop(struct seq_file *seq) +{ + struct bpf_iter_priv_data *iter_priv; + + iter_priv = container_of(seq->private, struct bpf_iter_priv_data, + target_private); + iter_priv->done_stop = true; +} + /* bpf_seq_read, a customized and simpler version for bpf iterator. * no_llseek is assumed for this file. * The following are differences from seq_read(): @@ -93,6 +120,10 @@ static ssize_t bpf_seq_read(struct file *file, char __user *buf, size_t size, err = seq->op->show(seq, p); if (err > 0) { + /* object is skipped, decrease seq_num, so next + * valid object can reuse the same seq_num. 
+ */ + bpf_iter_dec_seq_num(seq); seq->count = 0; } else if (err < 0 || seq_has_overflowed(seq)) { if (!err) @@ -117,11 +148,15 @@ static ssize_t bpf_seq_read(struct file *file, char __user *buf, size_t size, if (IS_ERR_OR_NULL(p)) break; + /* got a valid next object, increase seq_num */ + bpf_iter_inc_seq_num(seq); + if (seq->count >= size) break; err = seq->op->show(seq, p); if (err > 0) { + bpf_iter_dec_seq_num(seq); seq->count = offs; } else if (err < 0 || seq_has_overflowed(seq)) { seq->count = offs; @@ -138,11 +173,15 @@ stop: offs = seq->count; /* bpf program called if !p */ seq->op->stop(seq, p); - if (!p && seq_has_overflowed(seq)) { - seq->count = offs; - if (offs == 0) { - err = -E2BIG; - goto done; + if (!p) { + if (!seq_has_overflowed(seq)) { + bpf_iter_done_stop(seq); + } else { + seq->count = offs; + if (offs == 0) { + err = -E2BIG; + goto done; + } } } @@ -453,3 +492,39 @@ free_fd: put_unused_fd(fd); return err; } + +struct bpf_prog *bpf_iter_get_info(struct bpf_iter_meta *meta, bool in_stop) +{ + struct bpf_iter_priv_data *iter_priv; + struct seq_file *seq; + void *seq_priv; + + seq = meta->seq; + if (seq->file->f_op != &bpf_iter_fops) + return NULL; + + seq_priv = seq->private; + iter_priv = container_of(seq_priv, struct bpf_iter_priv_data, + target_private); + + if (in_stop && iter_priv->done_stop) + return NULL; + + meta->session_id = iter_priv->session_id; + meta->seq_num = iter_priv->seq_num; + + return iter_priv->prog; +} + +int bpf_iter_run_prog(struct bpf_prog *prog, void *ctx) +{ + int ret; + + rcu_read_lock(); + migrate_disable(); + ret = BPF_PROG_RUN(prog, ctx); + migrate_enable(); + rcu_read_unlock(); + + return ret == 0 ? 0 : -EAGAIN; +} -- cgit v1.2.3 From 6086d29def80edd78f9832ea6eafa74e3818f6a7 Mon Sep 17 00:00:00 2001 From: Yonghong Song Date: Sat, 9 May 2020 10:59:09 -0700 Subject: bpf: Add bpf_map iterator Implement seq_file operations to traverse all bpf_maps. 
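Illustration (not part of the patch): a hedged sketch of a bpf program for this target. The SEC("iter/bpf_map") section name and the bpf_seq_printf() helper are assumptions taken from elsewhere in this patch series, and the context layout follows struct bpf_iter__bpf_map in the diff below.

        SEC("iter/bpf_map")
        int dump_bpf_map(struct bpf_iter__bpf_map *ctx)
        {
                struct seq_file *seq = ctx->meta->seq;
                struct bpf_map *map = ctx->map;
                char fmt[] = "map id: %u\n";
                __u64 id;

                /* a NULL map signals that iteration is finished */
                if (!map)
                        return 0;

                id = map->id;
                bpf_seq_printf(seq, fmt, sizeof(fmt), &id, sizeof(id));
                return 0;
        }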
Signed-off-by: Yonghong Song Signed-off-by: Alexei Starovoitov Acked-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20200509175909.2476096-1-yhs@fb.com --- include/linux/bpf.h | 1 + kernel/bpf/Makefile | 2 +- kernel/bpf/map_iter.c | 97 +++++++++++++++++++++++++++++++++++++++++++++++++++ kernel/bpf/syscall.c | 19 ++++++++++ 4 files changed, 118 insertions(+), 1 deletion(-) create mode 100644 kernel/bpf/map_iter.c (limited to 'include/linux') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index ffe0b9b669bf..363ab0751967 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -1082,6 +1082,7 @@ int generic_map_update_batch(struct bpf_map *map, int generic_map_delete_batch(struct bpf_map *map, const union bpf_attr *attr, union bpf_attr __user *uattr); +struct bpf_map *bpf_map_get_curr_or_next(u32 *id); extern int sysctl_unprivileged_bpf_disabled; diff --git a/kernel/bpf/Makefile b/kernel/bpf/Makefile index 6a8b0febd3f6..b2b5eefc5254 100644 --- a/kernel/bpf/Makefile +++ b/kernel/bpf/Makefile @@ -2,7 +2,7 @@ obj-y := core.o CFLAGS_core.o += $(call cc-disable-warning, override-init) -obj-$(CONFIG_BPF_SYSCALL) += syscall.o verifier.o inode.o helpers.o tnum.o bpf_iter.o +obj-$(CONFIG_BPF_SYSCALL) += syscall.o verifier.o inode.o helpers.o tnum.o bpf_iter.o map_iter.o obj-$(CONFIG_BPF_SYSCALL) += hashtab.o arraymap.o percpu_freelist.o bpf_lru_list.o lpm_trie.o map_in_map.o obj-$(CONFIG_BPF_SYSCALL) += local_storage.o queue_stack_maps.o obj-$(CONFIG_BPF_SYSCALL) += disasm.o diff --git a/kernel/bpf/map_iter.c b/kernel/bpf/map_iter.c new file mode 100644 index 000000000000..8162e0c00b9f --- /dev/null +++ b/kernel/bpf/map_iter.c @@ -0,0 +1,97 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* Copyright (c) 2020 Facebook */ +#include +#include +#include +#include + +struct bpf_iter_seq_map_info { + u32 mid; +}; + +static void *bpf_map_seq_start(struct seq_file *seq, loff_t *pos) +{ + struct bpf_iter_seq_map_info *info = seq->private; + struct bpf_map *map; + + map = bpf_map_get_curr_or_next(&info->mid); + if (!map) + return NULL; + + ++*pos; + return map; +} + +static void *bpf_map_seq_next(struct seq_file *seq, void *v, loff_t *pos) +{ + struct bpf_iter_seq_map_info *info = seq->private; + struct bpf_map *map; + + ++*pos; + ++info->mid; + bpf_map_put((struct bpf_map *)v); + map = bpf_map_get_curr_or_next(&info->mid); + if (!map) + return NULL; + + return map; +} + +struct bpf_iter__bpf_map { + __bpf_md_ptr(struct bpf_iter_meta *, meta); + __bpf_md_ptr(struct bpf_map *, map); +}; + +DEFINE_BPF_ITER_FUNC(bpf_map, struct bpf_iter_meta *meta, struct bpf_map *map) + +static int __bpf_map_seq_show(struct seq_file *seq, void *v, bool in_stop) +{ + struct bpf_iter__bpf_map ctx; + struct bpf_iter_meta meta; + struct bpf_prog *prog; + int ret = 0; + + ctx.meta = &meta; + ctx.map = v; + meta.seq = seq; + prog = bpf_iter_get_info(&meta, in_stop); + if (prog) + ret = bpf_iter_run_prog(prog, &ctx); + + return ret; +} + +static int bpf_map_seq_show(struct seq_file *seq, void *v) +{ + return __bpf_map_seq_show(seq, v, false); +} + +static void bpf_map_seq_stop(struct seq_file *seq, void *v) +{ + if (!v) + (void)__bpf_map_seq_show(seq, v, true); + else + bpf_map_put((struct bpf_map *)v); +} + +static const struct seq_operations bpf_map_seq_ops = { + .start = bpf_map_seq_start, + .next = bpf_map_seq_next, + .stop = bpf_map_seq_stop, + .show = bpf_map_seq_show, +}; + +static int __init bpf_map_iter_init(void) +{ + struct bpf_iter_reg reg_info = { + .target = "bpf_map", + .seq_ops = &bpf_map_seq_ops, + 
.init_seq_private = NULL, + .fini_seq_private = NULL, + .seq_priv_size = sizeof(struct bpf_iter_seq_map_info), + }; + + return bpf_iter_reg_target(&reg_info); +} + +late_initcall(bpf_map_iter_init); diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index a293e88ee01a..de2a75500233 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -2934,6 +2934,25 @@ static int bpf_obj_get_next_id(const union bpf_attr *attr, return err; } +struct bpf_map *bpf_map_get_curr_or_next(u32 *id) +{ + struct bpf_map *map; + + spin_lock_bh(&map_idr_lock); +again: + map = idr_get_next(&map_idr, id); + if (map) { + map = __bpf_map_inc_not_zero(map, false); + if (IS_ERR(map)) { + (*id)++; + goto again; + } + } + spin_unlock_bh(&map_idr_lock); + + return map; +} + #define BPF_PROG_GET_FD_BY_ID_LAST_FIELD prog_id struct bpf_prog *bpf_prog_by_id(u32 id) -- cgit v1.2.3 From 138d0be35b141e09f6b267c6ae4094318d4e4491 Mon Sep 17 00:00:00 2001 From: Yonghong Song Date: Sat, 9 May 2020 10:59:10 -0700 Subject: net: bpf: Add netlink and ipv6_route bpf_iter targets This patch adds netlink and ipv6_route targets, using the same seq_ops (except show() and minor changes for stop()) for /proc/net/{netlink,ipv6_route}. The net namespace for these targets is the current net namespace at file open stage, similar to /proc/net/{netlink,ipv6_route} reference counting the net namespace at seq_file open stage. Since modules are not supported for now, ipv6_route is supported only if IPV6 is built-in, i.e., not compiled as a module. The restriction can be lifted once modules are properly supported for bpf_iter. Signed-off-by: Yonghong Song Signed-off-by: Alexei Starovoitov Acked-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20200509175910.2476329-1-yhs@fb.com --- fs/proc/proc_net.c | 19 +++++++++++ include/linux/proc_fs.h | 3 ++ net/ipv6/ip6_fib.c | 65 ++++++++++++++++++++++++++++++++++-- net/ipv6/route.c | 37 ++++++++++++++++++++ net/netlink/af_netlink.c | 87 ++++++++++++++++++++++++++++++++++++++++++++++-- 5 files changed, 207 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/fs/proc/proc_net.c b/fs/proc/proc_net.c index 4888c5224442..dba63b2429f0 100644 --- a/fs/proc/proc_net.c +++ b/fs/proc/proc_net.c @@ -98,6 +98,25 @@ static const struct proc_ops proc_net_seq_ops = { .proc_release = seq_release_net, }; +int bpf_iter_init_seq_net(void *priv_data) +{ +#ifdef CONFIG_NET_NS + struct seq_net_private *p = priv_data; + + p->net = get_net(current->nsproxy->net_ns); +#endif + return 0; +} + +void bpf_iter_fini_seq_net(void *priv_data) +{ +#ifdef CONFIG_NET_NS + struct seq_net_private *p = priv_data; + + put_net(p->net); +#endif +} + struct proc_dir_entry *proc_create_net_data(const char *name, umode_t mode, struct proc_dir_entry *parent, const struct seq_operations *ops, unsigned int state_size, void *data) diff --git a/include/linux/proc_fs.h b/include/linux/proc_fs.h index 45c05fd9c99d..03953c59807d 100644 --- a/include/linux/proc_fs.h +++ b/include/linux/proc_fs.h @@ -105,6 +105,9 @@ struct proc_dir_entry *proc_create_net_single_write(const char *name, umode_t mo void *data); extern struct pid *tgid_pidfd_to_pid(const struct file *file); +extern int bpf_iter_init_seq_net(void *priv_data); +extern void bpf_iter_fini_seq_net(void *priv_data); + #ifdef CONFIG_PROC_PID_ARCH_STATUS /* * The architecture which selects CONFIG_PROC_PID_ARCH_STATUS must diff --git a/net/ipv6/ip6_fib.c b/net/ipv6/ip6_fib.c index 46ed56719476..a1fcc0ca21af 100644 --- a/net/ipv6/ip6_fib.c +++ b/net/ipv6/ip6_fib.c @@ -2467,7
+2467,7 @@ void fib6_gc_cleanup(void) } #ifdef CONFIG_PROC_FS -static int ipv6_route_seq_show(struct seq_file *seq, void *v) +static int ipv6_route_native_seq_show(struct seq_file *seq, void *v) { struct fib6_info *rt = v; struct ipv6_route_iter *iter = seq->private; @@ -2625,7 +2625,7 @@ static bool ipv6_route_iter_active(struct ipv6_route_iter *iter) return w->node && !(w->state == FWS_U && w->node == w->root); } -static void ipv6_route_seq_stop(struct seq_file *seq, void *v) +static void ipv6_route_native_seq_stop(struct seq_file *seq, void *v) __releases(RCU_BH) { struct net *net = seq_file_net(seq); @@ -2637,6 +2637,67 @@ static void ipv6_route_seq_stop(struct seq_file *seq, void *v) rcu_read_unlock_bh(); } +#if IS_BUILTIN(CONFIG_IPV6) && defined(CONFIG_BPF_SYSCALL) +struct bpf_iter__ipv6_route { + __bpf_md_ptr(struct bpf_iter_meta *, meta); + __bpf_md_ptr(struct fib6_info *, rt); +}; + +static int ipv6_route_prog_seq_show(struct bpf_prog *prog, + struct bpf_iter_meta *meta, + void *v) +{ + struct bpf_iter__ipv6_route ctx; + + ctx.meta = meta; + ctx.rt = v; + return bpf_iter_run_prog(prog, &ctx); +} + +static int ipv6_route_seq_show(struct seq_file *seq, void *v) +{ + struct ipv6_route_iter *iter = seq->private; + struct bpf_iter_meta meta; + struct bpf_prog *prog; + int ret; + + meta.seq = seq; + prog = bpf_iter_get_info(&meta, false); + if (!prog) + return ipv6_route_native_seq_show(seq, v); + + ret = ipv6_route_prog_seq_show(prog, &meta, v); + iter->w.leaf = NULL; + + return ret; +} + +static void ipv6_route_seq_stop(struct seq_file *seq, void *v) +{ + struct bpf_iter_meta meta; + struct bpf_prog *prog; + + if (!v) { + meta.seq = seq; + prog = bpf_iter_get_info(&meta, true); + if (prog) + (void)ipv6_route_prog_seq_show(prog, &meta, v); + } + + ipv6_route_native_seq_stop(seq, v); +} +#else +static int ipv6_route_seq_show(struct seq_file *seq, void *v) +{ + return ipv6_route_native_seq_show(seq, v); +} + +static void ipv6_route_seq_stop(struct seq_file *seq, void *v) +{ + ipv6_route_native_seq_stop(seq, v); +} +#endif + const struct seq_operations ipv6_route_seq_ops = { .start = ipv6_route_seq_start, .next = ipv6_route_seq_next, diff --git a/net/ipv6/route.c b/net/ipv6/route.c index 3912aac7854d..25f6d3e619d0 100644 --- a/net/ipv6/route.c +++ b/net/ipv6/route.c @@ -6393,6 +6393,30 @@ void __init ip6_route_init_special_entries(void) #endif } +#if IS_BUILTIN(CONFIG_IPV6) +#if defined(CONFIG_BPF_SYSCALL) && defined(CONFIG_PROC_FS) +DEFINE_BPF_ITER_FUNC(ipv6_route, struct bpf_iter_meta *meta, struct fib6_info *rt) + +static int __init bpf_iter_register(void) +{ + struct bpf_iter_reg reg_info = { + .target = "ipv6_route", + .seq_ops = &ipv6_route_seq_ops, + .init_seq_private = bpf_iter_init_seq_net, + .fini_seq_private = bpf_iter_fini_seq_net, + .seq_priv_size = sizeof(struct ipv6_route_iter), + }; + + return bpf_iter_reg_target(&reg_info); +} + +static void bpf_iter_unregister(void) +{ + bpf_iter_unreg_target("ipv6_route"); +} +#endif +#endif + int __init ip6_route_init(void) { int ret; @@ -6455,6 +6479,14 @@ int __init ip6_route_init(void) if (ret) goto out_register_late_subsys; +#if IS_BUILTIN(CONFIG_IPV6) +#if defined(CONFIG_BPF_SYSCALL) && defined(CONFIG_PROC_FS) + ret = bpf_iter_register(); + if (ret) + goto out_register_late_subsys; +#endif +#endif + for_each_possible_cpu(cpu) { struct uncached_list *ul = per_cpu_ptr(&rt6_uncached_list, cpu); @@ -6487,6 +6519,11 @@ out_kmem_cache: void ip6_route_cleanup(void) { +#if IS_BUILTIN(CONFIG_IPV6) +#if defined(CONFIG_BPF_SYSCALL) &&
defined(CONFIG_PROC_FS) + bpf_iter_unregister(); +#endif +#endif unregister_netdevice_notifier(&ip6_route_dev_notifier); unregister_pernet_subsys(&ip6_route_net_late_ops); fib6_rules_cleanup(); diff --git a/net/netlink/af_netlink.c b/net/netlink/af_netlink.c index 5ded01ca8b20..33cda9baa979 100644 --- a/net/netlink/af_netlink.c +++ b/net/netlink/af_netlink.c @@ -2596,7 +2596,7 @@ static void *netlink_seq_next(struct seq_file *seq, void *v, loff_t *pos) return __netlink_seq_next(seq); } -static void netlink_seq_stop(struct seq_file *seq, void *v) +static void netlink_native_seq_stop(struct seq_file *seq, void *v) { struct nl_seq_iter *iter = seq->private; @@ -2607,7 +2607,7 @@ static void netlink_seq_stop(struct seq_file *seq, void *v) } -static int netlink_seq_show(struct seq_file *seq, void *v) +static int netlink_native_seq_show(struct seq_file *seq, void *v) { if (v == SEQ_START_TOKEN) { seq_puts(seq, @@ -2634,6 +2634,68 @@ static int netlink_seq_show(struct seq_file *seq, void *v) return 0; } +#ifdef CONFIG_BPF_SYSCALL +struct bpf_iter__netlink { + __bpf_md_ptr(struct bpf_iter_meta *, meta); + __bpf_md_ptr(struct netlink_sock *, sk); +}; + +DEFINE_BPF_ITER_FUNC(netlink, struct bpf_iter_meta *meta, struct netlink_sock *sk) + +static int netlink_prog_seq_show(struct bpf_prog *prog, + struct bpf_iter_meta *meta, + void *v) +{ + struct bpf_iter__netlink ctx; + + meta->seq_num--; /* skip SEQ_START_TOKEN */ + ctx.meta = meta; + ctx.sk = nlk_sk((struct sock *)v); + return bpf_iter_run_prog(prog, &ctx); +} + +static int netlink_seq_show(struct seq_file *seq, void *v) +{ + struct bpf_iter_meta meta; + struct bpf_prog *prog; + + meta.seq = seq; + prog = bpf_iter_get_info(&meta, false); + if (!prog) + return netlink_native_seq_show(seq, v); + + if (v != SEQ_START_TOKEN) + return netlink_prog_seq_show(prog, &meta, v); + + return 0; +} + +static void netlink_seq_stop(struct seq_file *seq, void *v) +{ + struct bpf_iter_meta meta; + struct bpf_prog *prog; + + if (!v) { + meta.seq = seq; + prog = bpf_iter_get_info(&meta, true); + if (prog) + (void)netlink_prog_seq_show(prog, &meta, v); + } + + netlink_native_seq_stop(seq, v); +} +#else +static int netlink_seq_show(struct seq_file *seq, void *v) +{ + return netlink_native_seq_show(seq, v); +} + +static void netlink_seq_stop(struct seq_file *seq, void *v) +{ + netlink_native_seq_stop(seq, v); +} +#endif + static const struct seq_operations netlink_seq_ops = { .start = netlink_seq_start, .next = netlink_seq_next, @@ -2740,6 +2802,21 @@ static const struct rhashtable_params netlink_rhashtable_params = { .automatic_shrinking = true, }; +#if defined(CONFIG_BPF_SYSCALL) && defined(CONFIG_PROC_FS) +static int __init bpf_iter_register(void) +{ + struct bpf_iter_reg reg_info = { + .target = "netlink", + .seq_ops = &netlink_seq_ops, + .init_seq_private = bpf_iter_init_seq_net, + .fini_seq_private = bpf_iter_fini_seq_net, + .seq_priv_size = sizeof(struct nl_seq_iter), + }; + + return bpf_iter_reg_target(&reg_info); +} +#endif + static int __init netlink_proto_init(void) { int i; @@ -2748,6 +2825,12 @@ static int __init netlink_proto_init(void) if (err != 0) goto out; +#if defined(CONFIG_BPF_SYSCALL) && defined(CONFIG_PROC_FS) + err = bpf_iter_register(); + if (err) + goto out; +#endif + BUILD_BUG_ON(sizeof(struct netlink_skb_parms) > sizeof_field(struct sk_buff, cb)); nl_table = kcalloc(MAX_LINKS, sizeof(*nl_table), GFP_KERNEL); -- cgit v1.2.3 From b121b341e5983bdccf7a5d6cf9236a45c965a31f Mon Sep 17 00:00:00 2001 From: Yonghong Song Date: Sat, 9 May 2020 10:59:12
-0700 Subject: bpf: Add PTR_TO_BTF_ID_OR_NULL support Add bpf_reg_type PTR_TO_BTF_ID_OR_NULL support. For tracing/iter program, the bpf program context definition, e.g., for previous bpf_map target, looks like struct bpf_iter__bpf_map { struct bpf_iter_meta *meta; struct bpf_map *map; }; The kernel guarantees that meta is not NULL, but the map pointer may be NULL. The NULL map indicates that all objects have been traversed, so the bpf program can take proper action, e.g., do final aggregation and/or send final report to user space. Add btf_id_or_null_non0_off to prog->aux structure, to indicate that if the context access offset is not 0, set to PTR_TO_BTF_ID_OR_NULL instead of PTR_TO_BTF_ID. This bit is set for tracing/iter programs. Signed-off-by: Yonghong Song Signed-off-by: Alexei Starovoitov Acked-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20200509175912.2476576-1-yhs@fb.com --- include/linux/bpf.h | 2 ++ kernel/bpf/btf.c | 5 ++++- kernel/bpf/verifier.c | 16 ++++++++++++---- 3 files changed, 18 insertions(+), 5 deletions(-) (limited to 'include/linux') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 363ab0751967..cf4b6e44f2bc 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -320,6 +320,7 @@ enum bpf_reg_type { PTR_TO_TP_BUFFER, /* reg points to a writable raw tp's buffer */ PTR_TO_XDP_SOCK, /* reg points to struct xdp_sock */ PTR_TO_BTF_ID, /* reg points to kernel struct */ + PTR_TO_BTF_ID_OR_NULL, /* reg points to kernel struct or NULL */ }; /* The information passed from prog-specific *_is_valid_access @@ -658,6 +659,7 @@ struct bpf_prog_aux { bool offload_requested; bool attach_btf_trace; /* true if attaching to BTF-enabled raw tp */ bool func_proto_unreliable; + bool btf_id_or_null_non0_off; enum bpf_tramp_prog_type trampoline_prog_type; struct bpf_trampoline *trampoline; struct hlist_node tramp_hlist; diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c index a2cfba89a8e1..c490fbde22d4 100644 --- a/kernel/bpf/btf.c +++ b/kernel/bpf/btf.c @@ -3790,7 +3790,10 @@ bool btf_ctx_access(int off, int size, enum bpf_access_type type, return true; /* this is a pointer to another type */ - info->reg_type = PTR_TO_BTF_ID; + if (off != 0 && prog->aux->btf_id_or_null_non0_off) + info->reg_type = PTR_TO_BTF_ID_OR_NULL; + else + info->reg_type = PTR_TO_BTF_ID; if (tgt_prog) { ret = btf_translate_to_vmlinux(log, btf, t, tgt_prog->type, arg); diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index d725ff7d11db..36b2a38a06fe 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -398,7 +398,8 @@ static bool reg_type_may_be_null(enum bpf_reg_type type) return type == PTR_TO_MAP_VALUE_OR_NULL || type == PTR_TO_SOCKET_OR_NULL || type == PTR_TO_SOCK_COMMON_OR_NULL || - type == PTR_TO_TCP_SOCK_OR_NULL; + type == PTR_TO_TCP_SOCK_OR_NULL || + type == PTR_TO_BTF_ID_OR_NULL; } static bool reg_may_point_to_spin_lock(const struct bpf_reg_state *reg) @@ -483,6 +484,7 @@ static const char * const reg_type_str[] = { [PTR_TO_TP_BUFFER] = "tp_buffer", [PTR_TO_XDP_SOCK] = "xdp_sock", [PTR_TO_BTF_ID] = "ptr_", + [PTR_TO_BTF_ID_OR_NULL] = "ptr_or_null_", }; static char slot_type_char[] = { @@ -543,7 +545,7 @@ static void print_verifier_state(struct bpf_verifier_env *env, /* reg->off should be 0 for SCALAR_VALUE */ verbose(env, "%lld", reg->var_off.value + reg->off); } else { - if (t == PTR_TO_BTF_ID) + if (t == PTR_TO_BTF_ID || t == PTR_TO_BTF_ID_OR_NULL) verbose(env, "%s", kernel_type_name(reg->btf_id)); verbose(env, "(id=%d", reg->id); if
(reg_type_may_be_refcounted_or_null(t)) @@ -2139,6 +2141,7 @@ static bool is_spillable_regtype(enum bpf_reg_type type) case PTR_TO_TCP_SOCK_OR_NULL: case PTR_TO_XDP_SOCK: case PTR_TO_BTF_ID: + case PTR_TO_BTF_ID_OR_NULL: return true; default: return false; @@ -2659,7 +2662,7 @@ static int check_ctx_access(struct bpf_verifier_env *env, int insn_idx, int off, */ *reg_type = info.reg_type; - if (*reg_type == PTR_TO_BTF_ID) + if (*reg_type == PTR_TO_BTF_ID || *reg_type == PTR_TO_BTF_ID_OR_NULL) *btf_id = info.btf_id; else env->insn_aux_data[insn_idx].ctx_field_size = info.ctx_field_size; @@ -3243,7 +3246,8 @@ static int check_mem_access(struct bpf_verifier_env *env, int insn_idx, u32 regn * a sub-register. */ regs[value_regno].subreg_def = DEF_NOT_SUBREG; - if (reg_type == PTR_TO_BTF_ID) + if (reg_type == PTR_TO_BTF_ID || + reg_type == PTR_TO_BTF_ID_OR_NULL) regs[value_regno].btf_id = btf_id; } regs[value_regno].type = reg_type; @@ -6572,6 +6576,8 @@ static void mark_ptr_or_null_reg(struct bpf_func_state *state, reg->type = PTR_TO_SOCK_COMMON; } else if (reg->type == PTR_TO_TCP_SOCK_OR_NULL) { reg->type = PTR_TO_TCP_SOCK; + } else if (reg->type == PTR_TO_BTF_ID_OR_NULL) { + reg->type = PTR_TO_BTF_ID; } if (is_null) { /* We don't need id and ref_obj_id from this point @@ -8429,6 +8435,7 @@ static bool reg_type_mismatch_ok(enum bpf_reg_type type) case PTR_TO_TCP_SOCK_OR_NULL: case PTR_TO_XDP_SOCK: case PTR_TO_BTF_ID: + case PTR_TO_BTF_ID_OR_NULL: return false; default: return true; @@ -10640,6 +10647,7 @@ static int check_attach_btf_id(struct bpf_verifier_env *env) prog->aux->attach_func_proto = t; if (!bpf_iter_prog_supported(prog)) return -EINVAL; + prog->aux->btf_id_or_null_non0_off = true; ret = btf_distill_func_proto(&env->log, btf, t, tname, &fmodel); return ret; -- cgit v1.2.3 From 5a63b7ba50fd6b7a897bf9353dbf31d579cfe116 Mon Sep 17 00:00:00 2001 From: Matti Vaittinen Date: Fri, 8 May 2020 18:47:20 +0300 Subject: power: supply: add battery parameters Add parsing of new device-tree battery bindings. 
- trickle-charge-current-microamp - precharge-upper-limit-microvolt - re-charge-voltage-microvolt - over-voltage-threshold-microvolt Signed-off-by: Matti Vaittinen Signed-off-by: Sebastian Reichel --- drivers/power/supply/power_supply_core.c | 8 ++++++++ include/linux/power_supply.h | 4 ++++ 2 files changed, 12 insertions(+) (limited to 'include/linux') diff --git a/drivers/power/supply/power_supply_core.c b/drivers/power/supply/power_supply_core.c index 1a9a9fae73d3..02b37fe6061c 100644 --- a/drivers/power/supply/power_supply_core.c +++ b/drivers/power/supply/power_supply_core.c @@ -620,10 +620,18 @@ int power_supply_get_battery_info(struct power_supply *psy, &info->voltage_min_design_uv); of_property_read_u32(battery_np, "voltage-max-design-microvolt", &info->voltage_max_design_uv); + of_property_read_u32(battery_np, "trickle-charge-current-microamp", + &info->tricklecharge_current_ua); of_property_read_u32(battery_np, "precharge-current-microamp", &info->precharge_current_ua); + of_property_read_u32(battery_np, "precharge-upper-limit-microvolt", + &info->precharge_voltage_max_uv); of_property_read_u32(battery_np, "charge-term-current-microamp", &info->charge_term_current_ua); + of_property_read_u32(battery_np, "re-charge-voltage-microvolt", + &info->charge_restart_voltage_uv); + of_property_read_u32(battery_np, "over-voltage-threshold-microvolt", + &info->overvoltage_limit_uv); of_property_read_u32(battery_np, "constant-charge-current-max-microamp", &info->constant_charge_current_max_ua); of_property_read_u32(battery_np, "constant-charge-voltage-max-microvolt", diff --git a/include/linux/power_supply.h b/include/linux/power_supply.h index 6a34df65d4d1..1f60731ec7fe 100644 --- a/include/linux/power_supply.h +++ b/include/linux/power_supply.h @@ -346,8 +346,12 @@ struct power_supply_battery_info { int charge_full_design_uah; /* microAmp-hours */ int voltage_min_design_uv; /* microVolts */ int voltage_max_design_uv; /* microVolts */ + int tricklecharge_current_ua; /* microAmps */ int precharge_current_ua; /* microAmps */ + int precharge_voltage_max_uv; /* microVolts */ int charge_term_current_ua; /* microAmps */ + int charge_restart_voltage_uv; /* microVolts */ + int overvoltage_limit_uv; /* microVolts */ int constant_charge_current_max_ua; /* microAmps */ int constant_charge_voltage_max_uv; /* microVolts */ int factory_internal_resistance_uohm; /* microOhms */ -- cgit v1.2.3 From e7bb7ecefa817543e11fa3c1c3e55deb90b02e6c Mon Sep 17 00:00:00 2001 From: "Gustavo A. R. Silva" Date: Thu, 7 May 2020 13:59:21 -0500 Subject: IB/mlx4: Replace zero-length array with flexible-array The current codebase makes use of the zero-length array language extension to the C90 standard, but the preferred mechanism to declare variable-length types such as these ones is a flexible array member[1][2], introduced in C99: struct foo { int stuff; struct boo array[]; }; By making use of the mechanism above, we will get a compiler warning in case the flexible array does not occur last in the structure, which will help us prevent some kind of undefined behavior bugs from being inadvertently introduced[3] to the codebase from now on. Also, notice that, dynamic memory allocations won't be affected by this change: "Flexible array members have incomplete type, and so the sizeof operator may not be applied. As a quirk of the original implementation of zero-length arrays, sizeof evaluates to zero."[1] sizeof(flexible-array-member) triggers a warning because flexible array members have incomplete type[1]. 
There are some instances of code in which the sizeof operator is being incorrectly/erroneously applied to zero-length arrays and the result is zero. Such instances may be hiding some bugs. So, this work (flexible-array member conversions) will also help to get completely rid of those sorts of issues. This issue was found with the help of Coccinelle. [1] https://gcc.gnu.org/onlinedocs/gcc/Zero-Length.html [2] https://github.com/KSPP/linux/issues/21 [3] commit 76497732932f ("cxgb3/l2t: Fix undefined behaviour") Signed-off-by: Gustavo A. R. Silva Signed-off-by: Jakub Kicinski --- include/linux/mlx4/qp.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/mlx4/qp.h b/include/linux/mlx4/qp.h index 8e2828d48d7f..9db93e487496 100644 --- a/include/linux/mlx4/qp.h +++ b/include/linux/mlx4/qp.h @@ -362,7 +362,7 @@ struct mlx4_wqe_datagram_seg { struct mlx4_wqe_lso_seg { __be32 mss_hdr_size; - __be32 header[0]; + __be32 header[]; }; enum mlx4_wqe_bind_seg_flags2 { -- cgit v1.2.3 From 7d374b20908338c9fbb03ea8022a11f3b3e0e55f Mon Sep 17 00:00:00 2001 From: Georgi Djakov Date: Sun, 10 May 2020 18:30:37 +0300 Subject: interconnect: Add helpers for enabling/disabling a path There is a repeated pattern in multiple drivers where they want to switch the bandwidth between zero and some other value. This often happens in the suspend/resume callbacks. Let's add helper functions to enable and disable the path, so that callers don't have to take care of remembering the bandwidth values; this is handled in the framework instead. With this patch the users can call icc_disable() and icc_enable() to lower their bandwidth request to zero and then restore it to its previous value. Suggested-by: Evan Green Suggested-by: Bjorn Andersson Signed-off-by: Georgi Djakov Reviewed-by: Matthias Kaehlcke Link: https://lore.kernel.org/r/20200507120846.8354-1-georgi.djakov@linaro.org Signed-off-by: Georgi Djakov --- drivers/interconnect/core.c | 39 ++++++++++++++++++++++++++++++++++++++- drivers/interconnect/internal.h | 2 ++ include/linux/interconnect.h | 12 ++++++++++++ 3 files changed, 52 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/drivers/interconnect/core.c b/drivers/interconnect/core.c index f5699ed34e43..f9dd55a632dd 100644 --- a/drivers/interconnect/core.c +++ b/drivers/interconnect/core.c @@ -158,6 +158,7 @@ static struct icc_path *path_init(struct device *dev, struct icc_node *dst, hlist_add_head(&path->reqs[i].req_node, &node->req_list); path->reqs[i].node = node; path->reqs[i].dev = dev; + path->reqs[i].enabled = true; /* reference to previous node was saved during path traversal */ node = node->reverse; } @@ -249,9 +250,12 @@ static int aggregate_requests(struct icc_node *node) if (p->pre_aggregate) p->pre_aggregate(node); - hlist_for_each_entry(r, &node->req_list, req_node) + hlist_for_each_entry(r, &node->req_list, req_node) { + if (!r->enabled) + continue; p->aggregate(node, r->tag, r->avg_bw, r->peak_bw, &node->avg_bw, &node->peak_bw); + } return 0; } @@ -571,6 +575,39 @@ int icc_set_bw(struct icc_path *path, u32 avg_bw, u32 peak_bw) } EXPORT_SYMBOL_GPL(icc_set_bw); +static int __icc_enable(struct icc_path *path, bool enable) +{ + int i; + + if (!path) + return 0; + + if (WARN_ON(IS_ERR(path) || !path->num_nodes)) + return -EINVAL; + + mutex_lock(&icc_lock); + + for (i = 0; i < path->num_nodes; i++) + path->reqs[i].enabled = enable; + + mutex_unlock(&icc_lock); + + return icc_set_bw(path, path->reqs[0].avg_bw,
path->reqs[0].peak_bw); +} + +int icc_enable(struct icc_path *path) +{ + return __icc_enable(path, true); +} +EXPORT_SYMBOL_GPL(icc_enable); + +int icc_disable(struct icc_path *path) +{ + return __icc_enable(path, false); +} +EXPORT_SYMBOL_GPL(icc_disable); + /** * icc_get() - return a handle for path between two endpoints * @dev: the device requesting the path diff --git a/drivers/interconnect/internal.h b/drivers/interconnect/internal.h index bf18cb7239df..f5f82a5c939e 100644 --- a/drivers/interconnect/internal.h +++ b/drivers/interconnect/internal.h @@ -14,6 +14,7 @@ * @req_node: entry in list of requests for the particular @node * @node: the interconnect node to which this constraint applies * @dev: reference to the device that sets the constraints + * @enabled: indicates whether the path with this request is enabled * @tag: path tag (optional) * @avg_bw: an integer describing the average bandwidth in kBps * @peak_bw: an integer describing the peak bandwidth in kBps @@ -22,6 +23,7 @@ struct icc_req { struct hlist_node req_node; struct icc_node *node; struct device *dev; + bool enabled; u32 tag; u32 avg_bw; u32 peak_bw; diff --git a/include/linux/interconnect.h b/include/linux/interconnect.h index 770692421f4c..e56a95e3cf25 100644 --- a/include/linux/interconnect.h +++ b/include/linux/interconnect.h @@ -30,6 +30,8 @@ struct icc_path *icc_get(struct device *dev, const int src_id, struct icc_path *of_icc_get(struct device *dev, const char *name); struct icc_path *devm_of_icc_get(struct device *dev, const char *name); void icc_put(struct icc_path *path); +int icc_enable(struct icc_path *path); +int icc_disable(struct icc_path *path); int icc_set_bw(struct icc_path *path, u32 avg_bw, u32 peak_bw); void icc_set_tag(struct icc_path *path, u32 tag); @@ -57,6 +59,16 @@ static inline void icc_put(struct icc_path *path) { } +static inline int icc_enable(struct icc_path *path) +{ + return 0; +} + +static inline int icc_disable(struct icc_path *path) +{ + return 0; +} + static inline int icc_set_bw(struct icc_path *path, u32 avg_bw, u32 peak_bw) { return 0; -- cgit v1.2.3 From a68a813836e12b15715d9101309899123c250302 Mon Sep 17 00:00:00 2001 From: Andrew Lunn Date: Sun, 10 May 2020 21:12:30 +0200 Subject: net: phy: Add cable test support to state machine Running a cable test is disruptive to normal operation of the PHY and can take 5 to 10 seconds to complete. The RTNL lock cannot be held for this amount of time, so add a new state to the state machine for running a cable test. The driver is expected to implement two functions. The first is used to start a cable test. Once the test has started, it should return. The second function is called once per second, or on interrupt, to check if the cable test is complete, and to allow the PHY to report the status.
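For illustration only (not part of this patch), a driver would wire the two hooks up roughly as in the sketch below; the foo_* names and the FOO_* register/bit definitions are hypothetical, while phy_read() and phy_set_bits() are the usual phylib accessors:

/* Hypothetical driver sketch; the FOO_* register and bits are made up. */
static int foo_cable_test_start(struct phy_device *phydev)
{
	/* Kick off the test in hardware and return immediately;
	 * completion is reported later via cable_test_get_status().
	 */
	return phy_set_bits(phydev, FOO_CABLE_TEST_REG, FOO_CABLE_TEST_START);
}

static int foo_cable_test_get_status(struct phy_device *phydev,
				     bool *finished)
{
	/* Called once per second, or on interrupt, by the state machine. */
	int val = phy_read(phydev, FOO_CABLE_TEST_REG);

	if (val < 0)
		return val;

	*finished = !(val & FOO_CABLE_TEST_BUSY);
	return 0;
}

static struct phy_driver foo_phy_driver = {
	/* ... usual phy_driver fields ... */
	.cable_test_start	= foo_cable_test_start,
	.cable_test_get_status	= foo_cable_test_get_status,
};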
v2: Rename phy_cable_test_abort to phy_abort_cable_test Return different extack when already running test Use phy_init_hw() to reset the PHY Reviewed-by: Florian Fainelli Signed-off-by: Andrew Lunn Signed-off-by: Jakub Kicinski --- drivers/net/phy/phy.c | 76 +++++++++++++++++++++++++++++++++++++++++++++++++++ include/linux/phy.h | 28 +++++++++++++++++++ 2 files changed, 104 insertions(+) (limited to 'include/linux') diff --git a/drivers/net/phy/phy.c b/drivers/net/phy/phy.c index 8c22d02b4218..0f4b27215429 100644 --- a/drivers/net/phy/phy.c +++ b/drivers/net/phy/phy.c @@ -15,6 +15,7 @@ #include #include #include +#include #include #include #include @@ -44,6 +45,7 @@ static const char *phy_state_to_str(enum phy_state st) PHY_STATE_STR(UP) PHY_STATE_STR(RUNNING) PHY_STATE_STR(NOLINK) + PHY_STATE_STR(CABLETEST) PHY_STATE_STR(HALTED) } @@ -472,6 +474,62 @@ static void phy_trigger_machine(struct phy_device *phydev) phy_queue_state_machine(phydev, 0); } +static void phy_abort_cable_test(struct phy_device *phydev) +{ + int err; + + err = phy_init_hw(phydev); + if (err) + phydev_err(phydev, "Error while aborting cable test"); +} + +int phy_start_cable_test(struct phy_device *phydev, + struct netlink_ext_ack *extack) +{ + int err; + + if (!(phydev->drv && + phydev->drv->cable_test_start && + phydev->drv->cable_test_get_status)) { + NL_SET_ERR_MSG(extack, + "PHY driver does not support cable testing"); + return -EOPNOTSUPP; + } + + mutex_lock(&phydev->lock); + if (phydev->state == PHY_CABLETEST) { + NL_SET_ERR_MSG(extack, + "PHY already performing a test"); + err = -EBUSY; + goto out; + } + + if (phydev->state < PHY_UP || + phydev->state > PHY_CABLETEST) { + NL_SET_ERR_MSG(extack, + "PHY not configured. Try setting interface up"); + err = -EBUSY; + goto out; + } + + /* Mark the carrier down until the test is complete */ + phy_link_down(phydev, true); + + err = phydev->drv->cable_test_start(phydev); + if (err) { + phy_link_up(phydev); + goto out; + } + + phydev->state = PHY_CABLETEST; + +out: + mutex_unlock(&phydev->lock); + + return err; +} +EXPORT_SYMBOL(phy_start_cable_test); + static int phy_config_aneg(struct phy_device *phydev) { if (phydev->drv->config_aneg) @@ -810,6 +868,9 @@ void phy_stop(struct phy_device *phydev) mutex_lock(&phydev->lock); + if (phydev->state == PHY_CABLETEST) + phy_abort_cable_test(phydev); + if (phydev->sfp_bus) sfp_upstream_stop(phydev->sfp_bus); @@ -872,6 +933,7 @@ void phy_state_machine(struct work_struct *work) container_of(dwork, struct phy_device, state_queue); bool needs_aneg = false, do_suspend = false; enum phy_state old_state; + bool finished = false; int err = 0; mutex_lock(&phydev->lock); @@ -890,6 +952,20 @@ void phy_state_machine(struct work_struct *work) case PHY_RUNNING: err = phy_check_link_status(phydev); break; + case PHY_CABLETEST: + err = phydev->drv->cable_test_get_status(phydev, &finished); + if (err) { + phy_abort_cable_test(phydev); + needs_aneg = true; + phydev->state = PHY_UP; + break; + } + + if (finished) { + needs_aneg = true; + phydev->state = PHY_UP; + } + break; case PHY_HALTED: if (phydev->link) { phydev->link = 0; diff --git a/include/linux/phy.h b/include/linux/phy.h index a2b91b5f9d0a..632403fc34f4 100644 --- a/include/linux/phy.h +++ b/include/linux/phy.h @@ -15,6 +15,7 @@ #include #include #include +#include #include #include #include @@ -372,6 +373,12 @@ struct phy_device *mdiobus_scan(struct mii_bus *bus, int addr); * - irq or timer will set NOLINK if link goes down * - phy_stop moves to HALTED * + * CABLETEST: PHY is performing 
a cable test. Packet reception/sending + * is not expected to work, carrier will be indicated as down. PHY will be + * polled once per second, or on interrupt, for its current state. + * Once complete, move to UP to restart the PHY. + * - phy_stop aborts the running test and moves to HALTED + * * HALTED: PHY is up, but no polling or interrupts are done. Or * PHY is in an error state. * - phy_start moves to UP @@ -383,6 +390,7 @@ enum phy_state { PHY_UP, PHY_RUNNING, PHY_NOLINK, + PHY_CABLETEST, }; /** @@ -689,6 +697,13 @@ struct phy_driver { int (*module_eeprom)(struct phy_device *dev, struct ethtool_eeprom *ee, u8 *data); + /* Start a cable test */ + int (*cable_test_start)(struct phy_device *dev); + /* Once per second, or on interrupt, request the status of the + * test. + */ + int (*cable_test_get_status)(struct phy_device *dev, bool *finished); + /* Get statistics from the phy using ethtool */ int (*get_sset_count)(struct phy_device *dev); void (*get_strings)(struct phy_device *dev, u8 *data); @@ -1227,6 +1242,19 @@ int phy_speed_up(struct phy_device *phydev); int phy_restart_aneg(struct phy_device *phydev); int phy_reset_after_clk_enable(struct phy_device *phydev); +#if IS_ENABLED(CONFIG_PHYLIB) +int phy_start_cable_test(struct phy_device *phydev, + struct netlink_ext_ack *extack); +#else +static inline +int phy_start_cable_test(struct phy_device *phydev, + struct netlink_ext_ack *extack) +{ + NL_SET_ERR_MSG(extack, "Kernel not compiled with PHYLIB support"); + return -EOPNOTSUPP; +} +#endif + static inline void phy_device_reset(struct phy_device *phydev, int value) { mdio_device_reset(&phydev->mdio, value); -- cgit v1.2.3 From 97c22438963a7484c05c59ab6654e30f0a3e9288 Mon Sep 17 00:00:00 2001 From: Andrew Lunn Date: Sun, 10 May 2020 21:12:32 +0200 Subject: net: phy: Add support for polling cable test Some PHYs are not capable of generating interrupts when a cable test has finished. They do however support interrupts for normal operations, like link up/down. As such, the PHY state machine would normally not poll the PHY. Add support for indicating that the PHY state machine must poll the PHY when performing a cable test.
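As a sketch (reusing the hypothetical foo_* hooks from the earlier example), a driver for such a PHY only needs to OR the new flag into its flags field:

static struct phy_driver foo_phy_driver = {
	/* Interrupts work for link changes, but the PHY must be polled
	 * while a cable test is in progress.
	 */
	.flags			= PHY_POLL_CABLE_TEST,
	.cable_test_start	= foo_cable_test_start,
	.cable_test_get_status	= foo_cable_test_get_status,
};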
Signed-off-by: Andrew Lunn Reviewed-by: Florian Fainelli Signed-off-by: Jakub Kicinski --- drivers/net/phy/phy.c | 2 ++ include/linux/phy.h | 5 +++++ 2 files changed, 7 insertions(+) (limited to 'include/linux') diff --git a/drivers/net/phy/phy.c b/drivers/net/phy/phy.c index 0f4b27215429..9fa61019533f 100644 --- a/drivers/net/phy/phy.c +++ b/drivers/net/phy/phy.c @@ -523,6 +523,8 @@ int phy_start_cable_test(struct phy_device *phydev, phydev->state = PHY_CABLETEST; + if (phy_polling_mode(phydev)) + phy_trigger_machine(phydev); out: mutex_unlock(&phydev->lock); diff --git a/include/linux/phy.h b/include/linux/phy.h index 632403fc34f4..f58eee735a45 100644 --- a/include/linux/phy.h +++ b/include/linux/phy.h @@ -79,6 +79,7 @@ extern const int phy_10gbit_features_array[1]; #define PHY_IS_INTERNAL 0x00000001 #define PHY_RST_AFTER_CLK_EN 0x00000002 +#define PHY_POLL_CABLE_TEST 0x00000004 #define MDIO_DEVICE_IS_PHY 0x80000000 /* Interface Mode definitions */ @@ -1061,6 +1062,10 @@ static inline bool phy_interrupt_is_valid(struct phy_device *phydev) */ static inline bool phy_polling_mode(struct phy_device *phydev) { + if (phydev->state == PHY_CABLETEST) + if (phydev->drv->flags & PHY_POLL_CABLE_TEST) + return true; + return phydev->irq == PHY_POLL; } -- cgit v1.2.3 From 1dd3f212af30b42c90ba252c165f2f6d2ddf5230 Mon Sep 17 00:00:00 2001 From: Andrew Lunn Date: Sun, 10 May 2020 21:12:36 +0200 Subject: net: ethtool: Add infrastructure for reporting cable test results Provide infrastructure for PHY drivers to report the cable test results. A netlink skb is associated with the phydev. Helpers will be added which can add results to this skb. Once the test has finished, the results are sent to user space. When netlink ethtool is not part of the kernel configuration, stubs are provided. In that case it is also impossible to trigger a cable test, so the error code returned by the alloc function is of no consequence.
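Condensed from the phy.c hunks in this and the previous patches, the intended lifecycle of these helpers is roughly the following sketch (error paths shortened):

err = ethnl_cable_test_alloc(phydev);	/* pre-allocate the netlink skb */
if (err)
	return err;

err = phydev->drv->cable_test_start(phydev);
if (err) {
	ethnl_cable_test_free(phydev);	/* test never started, drop the skb */
	return err;
}

/* ... later, once cable_test_get_status() reports completion (or the
 * test is aborted), the notification is sent to user space:
 */
ethnl_cable_test_finished(phydev);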
v2: Include the status complete in the netlink notification message v4: Replace -EINVAL with -EMSGSIZE Signed-off-by: Andrew Lunn Reviewed-by: Michal Kubecek Signed-off-by: Jakub Kicinski --- drivers/net/phy/phy.c | 22 +++++++++++++++-- include/linux/ethtool_netlink.h | 20 +++++++++++++++ include/linux/phy.h | 5 ++++ net/ethtool/cabletest.c | 55 +++++++++++++++++++++++++++++++++++++++++ 4 files changed, 100 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/drivers/net/phy/phy.c b/drivers/net/phy/phy.c index 9fa61019533f..afdc1c2146ee 100644 --- a/drivers/net/phy/phy.c +++ b/drivers/net/phy/phy.c @@ -22,6 +22,7 @@ #include #include #include +#include #include #include #include @@ -30,6 +31,9 @@ #include #include #include +#include +#include +#include #define PHY_STATE_TIME HZ @@ -478,6 +482,8 @@ static void phy_abort_cable_test(struct phy_device *phydev) { int err; + ethnl_cable_test_finished(phydev); + err = phy_init_hw(phydev); if (err) phydev_err(phydev, "Error while aborting cable test"); @@ -486,7 +492,7 @@ static void phy_abort_cable_test(struct phy_device *phydev) int phy_start_cable_test(struct phy_device *phydev, struct netlink_ext_ack *extack) { - int err; + int err = -ENOMEM; if (!(phydev->drv && phydev->drv->cable_test_start && @@ -512,19 +518,30 @@ int phy_start_cable_test(struct phy_device *phydev, goto out; } + err = ethnl_cable_test_alloc(phydev); + if (err) + goto out; + /* Mark the carrier down until the test is complete */ phy_link_down(phydev, true); err = phydev->drv->cable_test_start(phydev); if (err) { phy_link_up(phydev); - goto out; + goto out_free; } phydev->state = PHY_CABLETEST; if (phy_polling_mode(phydev)) phy_trigger_machine(phydev); + + mutex_unlock(&phydev->lock); + + return 0; + +out_free: + ethnl_cable_test_free(phydev); out: mutex_unlock(&phydev->lock); @@ -964,6 +981,7 @@ void phy_state_machine(struct work_struct *work) } if (finished) { + ethnl_cable_test_finished(phydev); needs_aneg = true; phydev->state = PHY_UP; } diff --git a/include/linux/ethtool_netlink.h b/include/linux/ethtool_netlink.h index d01b77887f82..7d763ba22f6f 100644 --- a/include/linux/ethtool_netlink.h +++ b/include/linux/ethtool_netlink.h @@ -14,4 +14,24 @@ enum ethtool_multicast_groups { ETHNL_MCGRP_MONITOR, }; +struct phy_device; + +#if IS_ENABLED(CONFIG_ETHTOOL_NETLINK) +int ethnl_cable_test_alloc(struct phy_device *phydev); +void ethnl_cable_test_free(struct phy_device *phydev); +void ethnl_cable_test_finished(struct phy_device *phydev); +#else +static inline int ethnl_cable_test_alloc(struct phy_device *phydev) +{ + return -ENOTSUPP; +} + +static inline void ethnl_cable_test_free(struct phy_device *phydev) +{ +} + +static inline void ethnl_cable_test_finished(struct phy_device *phydev) +{ +} +#endif /* IS_ENABLED(ETHTOOL_NETLINK) */ #endif /* _LINUX_ETHTOOL_NETLINK_H_ */ diff --git a/include/linux/phy.h b/include/linux/phy.h index f58eee735a45..169fae4249a9 100644 --- a/include/linux/phy.h +++ b/include/linux/phy.h @@ -523,6 +523,11 @@ struct phy_device { /* For use by PHYs inside the same package that need a shared state. 
*/ struct phy_package_shared *shared; + /* Reporting cable test results */ + struct sk_buff *skb; + void *ehdr; + struct nlattr *nest; + /* Interrupt and Polling infrastructure */ struct delayed_work state_queue; diff --git a/net/ethtool/cabletest.c b/net/ethtool/cabletest.c index aeb6672a46d0..ae8e63647663 100644 --- a/net/ethtool/cabletest.c +++ b/net/ethtool/cabletest.c @@ -1,6 +1,7 @@ // SPDX-License-Identifier: GPL-2.0-only #include +#include #include "netlink.h" #include "common.h" @@ -52,3 +53,57 @@ out_dev_put: dev_put(dev); return ret; } + +int ethnl_cable_test_alloc(struct phy_device *phydev) +{ + int err = -ENOMEM; + + phydev->skb = genlmsg_new(NLMSG_GOODSIZE, GFP_KERNEL); + if (!phydev->skb) + goto out; + + phydev->ehdr = ethnl_bcastmsg_put(phydev->skb, + ETHTOOL_MSG_CABLE_TEST_NTF); + if (!phydev->ehdr) { + err = -EMSGSIZE; + goto out; + } + + err = ethnl_fill_reply_header(phydev->skb, phydev->attached_dev, + ETHTOOL_A_CABLE_TEST_NTF_HEADER); + if (err) + goto out; + + err = nla_put_u8(phydev->skb, ETHTOOL_A_CABLE_TEST_NTF_STATUS, + ETHTOOL_A_CABLE_TEST_NTF_STATUS_COMPLETED); + if (err) + goto out; + + phydev->nest = nla_nest_start(phydev->skb, + ETHTOOL_A_CABLE_TEST_NTF_NEST); + if (!phydev->nest) + goto out; + + return 0; + +out: + nlmsg_free(phydev->skb); + return err; +} +EXPORT_SYMBOL_GPL(ethnl_cable_test_alloc); + +void ethnl_cable_test_free(struct phy_device *phydev) +{ + nlmsg_free(phydev->skb); +} +EXPORT_SYMBOL_GPL(ethnl_cable_test_free); + +void ethnl_cable_test_finished(struct phy_device *phydev) +{ + nla_nest_end(phydev->skb, phydev->nest); + + genlmsg_end(phydev->skb, phydev->ehdr); + + ethnl_multicast(phydev->skb, phydev->attached_dev); +} +EXPORT_SYMBOL_GPL(ethnl_cable_test_finished); -- cgit v1.2.3 From 1e2dc14509fd072739e4bab98ac42317267dbad6 Mon Sep 17 00:00:00 2001 From: Andrew Lunn Date: Sun, 10 May 2020 21:12:37 +0200 Subject: net: ethtool: Add helpers for reporting test results The PHY drivers can use these helpers for reporting the results. The results get translated into netlink attributes which are added to the pre-allocated skbuf. 
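For example, a driver that found an open circuit on pair A at 170 cm would report it roughly as below. This is only a sketch; it assumes the ETHTOOL_A_CABLE_* pair and result-code constants from the ethtool netlink UAPI header:

/* Sketch: report an open circuit on pair A, 170 cm from the PHY. */
static int foo_report_open_pair_a(struct phy_device *phydev)
{
	int err;

	err = ethnl_cable_test_result(phydev, ETHTOOL_A_CABLE_PAIR_A,
				      ETHTOOL_A_CABLE_RESULT_CODE_OPEN);
	if (err)
		return err;

	return ethnl_cable_test_fault_length(phydev, ETHTOOL_A_CABLE_PAIR_A,
					     170);
}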
v3: Poison phydev->skb Return -EMSGSIZE when ethnl_bcastmsg_put() fails Return valid error code when nla_nest_start() fails Use u8 for results Actually put u32 length into message v4: s/ENOTSUPP/EOPNOTSUPP/g Signed-off-by: Andrew Lunn Reviewed-by: Florian Fainelli Reviewed-by: Michal Kubecek Signed-off-by: Jakub Kicinski --- include/linux/ethtool_netlink.h | 15 +++++++++++- include/linux/phy.h | 4 ++++ net/ethtool/cabletest.c | 53 ++++++++++++++++++++++++++++++++++++++++- 3 files changed, 70 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/ethtool_netlink.h b/include/linux/ethtool_netlink.h index 7d763ba22f6f..e317fc99565e 100644 --- a/include/linux/ethtool_netlink.h +++ b/include/linux/ethtool_netlink.h @@ -20,10 +20,12 @@ struct phy_device; int ethnl_cable_test_alloc(struct phy_device *phydev); void ethnl_cable_test_free(struct phy_device *phydev); void ethnl_cable_test_finished(struct phy_device *phydev); +int ethnl_cable_test_result(struct phy_device *phydev, u8 pair, u8 result); +int ethnl_cable_test_fault_length(struct phy_device *phydev, u8 pair, u32 cm); #else static inline int ethnl_cable_test_alloc(struct phy_device *phydev) { - return -ENOTSUPP; + return -EOPNOTSUPP; } static inline void ethnl_cable_test_free(struct phy_device *phydev) @@ -33,5 +35,16 @@ static inline void ethnl_cable_test_free(struct phy_device *phydev) static inline void ethnl_cable_test_finished(struct phy_device *phydev) { } +static inline int ethnl_cable_test_result(struct phy_device *phydev, u8 pair, + u8 result) +{ + return -EOPNOTSUPP; +} + +static inline int ethnl_cable_test_fault_length(struct phy_device *phydev, + u8 pair, u32 cm) +{ + return -EOPNOTSUPP; +} #endif /* IS_ENABLED(ETHTOOL_NETLINK) */ #endif /* _LINUX_ETHTOOL_NETLINK_H_ */ diff --git a/include/linux/phy.h b/include/linux/phy.h index 169fae4249a9..5d8ff5428010 100644 --- a/include/linux/phy.h +++ b/include/linux/phy.h @@ -1265,6 +1265,10 @@ int phy_start_cable_test(struct phy_device *phydev, } #endif +int phy_cable_test_result(struct phy_device *phydev, u8 pair, u16 result); +int phy_cable_test_fault_length(struct phy_device *phydev, u8 pair, + u16 cm); + static inline void phy_device_reset(struct phy_device *phydev, int value) { mdio_device_reset(&phydev->mdio, value); diff --git a/net/ethtool/cabletest.c b/net/ethtool/cabletest.c index ae8e63647663..e0c917918c70 100644 --- a/net/ethtool/cabletest.c +++ b/net/ethtool/cabletest.c @@ -81,13 +81,16 @@ int ethnl_cable_test_alloc(struct phy_device *phydev) phydev->nest = nla_nest_start(phydev->skb, ETHTOOL_A_CABLE_TEST_NTF_NEST); - if (!phydev->nest) + if (!phydev->nest) { + err = -EMSGSIZE; goto out; + } return 0; out: nlmsg_free(phydev->skb); + phydev->skb = NULL; return err; } EXPORT_SYMBOL_GPL(ethnl_cable_test_alloc); @@ -95,6 +98,7 @@ EXPORT_SYMBOL_GPL(ethnl_cable_test_alloc); void ethnl_cable_test_free(struct phy_device *phydev) { nlmsg_free(phydev->skb); + phydev->skb = NULL; } EXPORT_SYMBOL_GPL(ethnl_cable_test_free); @@ -107,3 +111,50 @@ void ethnl_cable_test_finished(struct phy_device *phydev) ethnl_multicast(phydev->skb, phydev->attached_dev); } EXPORT_SYMBOL_GPL(ethnl_cable_test_finished); + +int ethnl_cable_test_result(struct phy_device *phydev, u8 pair, u8 result) +{ + struct nlattr *nest; + int ret = -EMSGSIZE; + + nest = nla_nest_start(phydev->skb, ETHTOOL_A_CABLE_NEST_RESULT); + if (!nest) + return -EMSGSIZE; + + if (nla_put_u8(phydev->skb, ETHTOOL_A_CABLE_RESULT_PAIR, pair)) + goto err; + if (nla_put_u8(phydev->skb, 
ETHTOOL_A_CABLE_RESULT_CODE, result)) + goto err; + + nla_nest_end(phydev->skb, nest); + return 0; + +err: + nla_nest_cancel(phydev->skb, nest); + return ret; +} +EXPORT_SYMBOL_GPL(ethnl_cable_test_result); + +int ethnl_cable_test_fault_length(struct phy_device *phydev, u8 pair, u32 cm) +{ + struct nlattr *nest; + int ret = -EMSGSIZE; + + nest = nla_nest_start(phydev->skb, + ETHTOOL_A_CABLE_NEST_FAULT_LENGTH); + if (!nest) + return -EMSGSIZE; + + if (nla_put_u8(phydev->skb, ETHTOOL_A_CABLE_FAULT_LENGTH_PAIR, pair)) + goto err; + if (nla_put_u32(phydev->skb, ETHTOOL_A_CABLE_FAULT_LENGTH_CM, cm)) + goto err; + + nla_nest_end(phydev->skb, nest); + return 0; + +err: + nla_nest_cancel(phydev->skb, nest); + return ret; +} +EXPORT_SYMBOL_GPL(ethnl_cable_test_fault_length); -- cgit v1.2.3 From b6ca09cb156d349e6fdde8a8466ec15b902d1419 Mon Sep 17 00:00:00 2001 From: "Gustavo A. R. Silva" Date: Thu, 7 May 2020 13:59:35 -0500 Subject: net/mlx5: Replace zero-length array with flexible-array The current codebase makes use of the zero-length array language extension to the C90 standard, but the preferred mechanism to declare variable-length types such as these ones is a flexible array member[1][2], introduced in C99: struct foo { int stuff; struct boo array[]; }; By making use of the mechanism above, we will get a compiler warning in case the flexible array does not occur last in the structure, which will help us prevent some kind of undefined behavior bugs from being inadvertently introduced[3] to the codebase from now on. Also, notice that, dynamic memory allocations won't be affected by this change: "Flexible array members have incomplete type, and so the sizeof operator may not be applied. As a quirk of the original implementation of zero-length arrays, sizeof evaluates to zero."[1] sizeof(flexible-array-member) triggers a warning because flexible array members have incomplete type[1]. There are some instances of code in which the sizeof operator is being incorrectly/erroneously applied to zero-length arrays and the result is zero. Such instances may be hiding some bugs. So, this work (flexible-array member conversions) will also help to get completely rid of those sorts of issues. This issue was found with the help of Coccinelle. [1] https://gcc.gnu.org/onlinedocs/gcc/Zero-Length.html [2] https://github.com/KSPP/linux/issues/21 [3] commit 76497732932f ("cxgb3/l2t: Fix undefined behaviour") Signed-off-by: Gustavo A. R. 
Silva Signed-off-by: Saeed Mahameed --- include/linux/mlx5/driver.h | 2 +- include/linux/mlx5/mlx5_ifc.h | 66 +++++++++++++++++++++---------------------- include/linux/mlx5/qp.h | 2 +- 3 files changed, 35 insertions(+), 35 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mlx5/driver.h b/include/linux/mlx5/driver.h index 267dfcc5493e..24e04901f92e 100644 --- a/include/linux/mlx5/driver.h +++ b/include/linux/mlx5/driver.h @@ -201,7 +201,7 @@ struct mlx5_rsc_debug { void *object; enum dbg_rsc_type type; struct dentry *root; - struct mlx5_field_desc fields[0]; + struct mlx5_field_desc fields[]; }; enum mlx5_dev_event { diff --git a/include/linux/mlx5/mlx5_ifc.h b/include/linux/mlx5/mlx5_ifc.h index fb243848132d..c9dd6e99ad56 100644 --- a/include/linux/mlx5/mlx5_ifc.h +++ b/include/linux/mlx5/mlx5_ifc.h @@ -1703,7 +1703,7 @@ struct mlx5_ifc_wq_bits { u8 reserved_at_140[0x4c0]; - struct mlx5_ifc_cmd_pas_bits pas[0]; + struct mlx5_ifc_cmd_pas_bits pas[]; }; struct mlx5_ifc_rq_num_bits { @@ -1921,7 +1921,7 @@ struct mlx5_ifc_resource_dump_menu_segment_bits { u8 reserved_at_20[0x10]; u8 num_of_records[0x10]; - struct mlx5_ifc_resource_dump_menu_record_bits record[0]; + struct mlx5_ifc_resource_dump_menu_record_bits record[]; }; struct mlx5_ifc_resource_dump_resource_segment_bits { @@ -1933,7 +1933,7 @@ struct mlx5_ifc_resource_dump_resource_segment_bits { u8 index2[0x20]; - u8 payload[0][0x20]; + u8 payload[][0x20]; }; struct mlx5_ifc_resource_dump_terminate_segment_bits { @@ -3010,7 +3010,7 @@ struct mlx5_ifc_flow_context_bits { u8 reserved_at_1200[0x600]; - union mlx5_ifc_dest_format_struct_flow_counter_list_auto_bits destination[0]; + union mlx5_ifc_dest_format_struct_flow_counter_list_auto_bits destination[]; }; enum { @@ -3303,7 +3303,7 @@ struct mlx5_ifc_rqtc_bits { u8 reserved_at_e0[0x6a0]; - struct mlx5_ifc_rq_num_bits rq_num[0]; + struct mlx5_ifc_rq_num_bits rq_num[]; }; enum { @@ -3415,7 +3415,7 @@ struct mlx5_ifc_nic_vport_context_bits { u8 reserved_at_7e0[0x20]; - u8 current_uc_mac_address[0][0x40]; + u8 current_uc_mac_address[][0x40]; }; enum { @@ -4338,7 +4338,7 @@ struct mlx5_ifc_query_xrc_srq_out_bits { u8 reserved_at_280[0x600]; - u8 pas[0][0x40]; + u8 pas[][0x40]; }; struct mlx5_ifc_query_xrc_srq_in_bits { @@ -4616,7 +4616,7 @@ struct mlx5_ifc_query_srq_out_bits { u8 reserved_at_280[0x600]; - u8 pas[0][0x40]; + u8 pas[][0x40]; }; struct mlx5_ifc_query_srq_in_bits { @@ -4827,7 +4827,7 @@ struct mlx5_ifc_query_qp_out_bits { u8 reserved_at_800[0x80]; - u8 pas[0][0x40]; + u8 pas[][0x40]; }; struct mlx5_ifc_query_qp_in_bits { @@ -5160,7 +5160,7 @@ struct mlx5_ifc_query_hca_vport_pkey_out_bits { u8 reserved_at_40[0x40]; - struct mlx5_ifc_pkey_bits pkey[0]; + struct mlx5_ifc_pkey_bits pkey[]; }; struct mlx5_ifc_query_hca_vport_pkey_in_bits { @@ -5196,7 +5196,7 @@ struct mlx5_ifc_query_hca_vport_gid_out_bits { u8 gids_num[0x10]; u8 reserved_at_70[0x10]; - struct mlx5_ifc_array128_auto_bits gid[0]; + struct mlx5_ifc_array128_auto_bits gid[]; }; struct mlx5_ifc_query_hca_vport_gid_in_bits { @@ -5464,7 +5464,7 @@ struct mlx5_ifc_query_flow_counter_out_bits { u8 reserved_at_40[0x40]; - struct mlx5_ifc_traffic_counter_bits flow_statistics[0]; + struct mlx5_ifc_traffic_counter_bits flow_statistics[]; }; struct mlx5_ifc_query_flow_counter_in_bits { @@ -5558,7 +5558,7 @@ struct mlx5_ifc_query_eq_out_bits { u8 reserved_at_300[0x580]; - u8 pas[0][0x40]; + u8 pas[][0x40]; }; struct mlx5_ifc_query_eq_in_bits { @@ -5583,7 +5583,7 @@ struct mlx5_ifc_packet_reformat_context_in_bits { 
u8 reserved_at_20[0x10]; u8 reformat_data[2][0x8]; - u8 more_reformat_data[0][0x8]; + u8 more_reformat_data[][0x8]; }; struct mlx5_ifc_query_packet_reformat_context_out_bits { @@ -5594,7 +5594,7 @@ struct mlx5_ifc_query_packet_reformat_context_out_bits { u8 reserved_at_40[0xa0]; - struct mlx5_ifc_packet_reformat_context_in_bits packet_reformat_context[0]; + struct mlx5_ifc_packet_reformat_context_in_bits packet_reformat_context[]; }; struct mlx5_ifc_query_packet_reformat_context_in_bits { @@ -5833,7 +5833,7 @@ struct mlx5_ifc_query_cq_out_bits { u8 reserved_at_280[0x600]; - u8 pas[0][0x40]; + u8 pas[][0x40]; }; struct mlx5_ifc_query_cq_in_bits { @@ -6440,7 +6440,7 @@ struct mlx5_ifc_modify_cq_in_bits { u8 reserved_at_300[0x580]; - u8 pas[0][0x40]; + u8 pas[][0x40]; }; struct mlx5_ifc_modify_cong_status_out_bits { @@ -6504,7 +6504,7 @@ struct mlx5_ifc_manage_pages_out_bits { u8 reserved_at_60[0x20]; - u8 pas[0][0x40]; + u8 pas[][0x40]; }; enum { @@ -6526,7 +6526,7 @@ struct mlx5_ifc_manage_pages_in_bits { u8 input_num_entries[0x20]; - u8 pas[0][0x40]; + u8 pas[][0x40]; }; struct mlx5_ifc_mad_ifc_out_bits { @@ -7481,7 +7481,7 @@ struct mlx5_ifc_create_xrc_srq_in_bits { u8 reserved_at_300[0x580]; - u8 pas[0][0x40]; + u8 pas[][0x40]; }; struct mlx5_ifc_create_tis_out_bits { @@ -7557,7 +7557,7 @@ struct mlx5_ifc_create_srq_in_bits { u8 reserved_at_280[0x600]; - u8 pas[0][0x40]; + u8 pas[][0x40]; }; struct mlx5_ifc_create_sq_out_bits { @@ -7718,7 +7718,7 @@ struct mlx5_ifc_create_qp_in_bits { u8 wq_umem_valid[0x1]; u8 reserved_at_861[0x1f]; - u8 pas[0][0x40]; + u8 pas[][0x40]; }; struct mlx5_ifc_create_psv_out_bits { @@ -7789,7 +7789,7 @@ struct mlx5_ifc_create_mkey_in_bits { u8 reserved_at_320[0x560]; - u8 klm_pas_mtt[0][0x20]; + u8 klm_pas_mtt[][0x20]; }; enum { @@ -7922,7 +7922,7 @@ struct mlx5_ifc_create_eq_in_bits { u8 reserved_at_3c0[0x4c0]; - u8 pas[0][0x40]; + u8 pas[][0x40]; }; struct mlx5_ifc_create_dct_out_bits { @@ -7979,7 +7979,7 @@ struct mlx5_ifc_create_cq_in_bits { u8 cq_umem_valid[0x1]; u8 reserved_at_2e1[0x59f]; - u8 pas[0][0x40]; + u8 pas[][0x40]; }; struct mlx5_ifc_config_int_moderation_out_bits { @@ -8335,7 +8335,7 @@ struct mlx5_ifc_access_register_out_bits { u8 reserved_at_40[0x40]; - u8 register_data[0][0x20]; + u8 register_data[][0x20]; }; enum { @@ -8355,7 +8355,7 @@ struct mlx5_ifc_access_register_in_bits { u8 argument[0x20]; - u8 register_data[0][0x20]; + u8 register_data[][0x20]; }; struct mlx5_ifc_sltp_reg_bits { @@ -9372,7 +9372,7 @@ struct mlx5_ifc_cmd_in_bits { u8 reserved_at_20[0x10]; u8 op_mod[0x10]; - u8 command[0][0x20]; + u8 command[][0x20]; }; struct mlx5_ifc_cmd_if_box_bits { @@ -9666,7 +9666,7 @@ struct mlx5_ifc_mcqi_reg_bits { u8 reserved_at_a0[0x10]; u8 data_size[0x10]; - union mlx5_ifc_mcqi_reg_data_bits data[0]; + union mlx5_ifc_mcqi_reg_data_bits data[]; }; struct mlx5_ifc_mcc_reg_bits { @@ -10252,7 +10252,7 @@ struct mlx5_ifc_umem_bits { u8 num_of_mtt[0x40]; - struct mlx5_ifc_mtt_bits mtt[0]; + struct mlx5_ifc_mtt_bits mtt[]; }; struct mlx5_ifc_uctx_bits { @@ -10377,7 +10377,7 @@ struct mlx5_ifc_mtrc_stdb_bits { u8 reserved_at_4[0x4]; u8 read_size[0x18]; u8 start_offset[0x20]; - u8 string_db_data[0]; + u8 string_db_data[]; }; struct mlx5_ifc_mtrc_ctrl_bits { @@ -10431,7 +10431,7 @@ struct mlx5_ifc_query_esw_functions_out_bits { struct mlx5_ifc_host_params_context_bits host_params_context; u8 reserved_at_280[0x180]; - u8 host_sf_enable[0][0x40]; + u8 host_sf_enable[][0x40]; }; struct mlx5_ifc_sf_partition_bits { @@ -10451,7 +10451,7 @@ struct 
mlx5_ifc_query_sf_partitions_out_bits { u8 reserved_at_60[0x20]; - struct mlx5_ifc_sf_partition_bits sf_partition[0]; + struct mlx5_ifc_sf_partition_bits sf_partition[]; }; struct mlx5_ifc_query_sf_partitions_in_bits { diff --git a/include/linux/mlx5/qp.h b/include/linux/mlx5/qp.h index f23eb18526fe..1af5e460b5f6 100644 --- a/include/linux/mlx5/qp.h +++ b/include/linux/mlx5/qp.h @@ -408,7 +408,7 @@ struct mlx5_wqe_signature_seg { struct mlx5_wqe_inline_seg { __be32 byte_count; - __be32 data[0]; + __be32 data[]; }; enum mlx5_sig_type { -- cgit v1.2.3 From ac02a451a6148bb9c395b39783ce7299eddf4f31 Mon Sep 17 00:00:00 2001 From: Vladimir Oltean Date: Sun, 10 May 2020 19:37:43 +0300 Subject: net: dsa: sja1105: implement cross-chip bridging operations sja1105 uses dsa_8021q for DSA tagging, a format which is VLAN at heart and which is compatible with cascading. A complete description of this tagging format is in net/dsa/tag_8021q.c, but a quick summary is that each external-facing port tags incoming frames with a unique pvid, and this special VLAN is transmitted as tagged towards the inside of the system, and as untagged towards the exterior. The tag encodes the switch id and the source port index. This means that cross-chip bridging for dsa_8021q only entails adding the dsa_8021q pvids of one switch to the RX filter of the other switches. Everything else falls naturally into place, as long as the bottom-end of ports (the leaves in the tree) is comprised exclusively of dsa_8021q-compatible (i.e. sja1105 switches). Otherwise, there would be a chance that a front-panel switch transmits a packet tagged with a dsa_8021q header, header which it wouldn't be able to remove, and which would hence "leak" out. The only use case I tested (due to lack of board availability) was when the sja1105 switches are part of disjoint trees (however, this doesn't change the fact that multiple sja1105 switches still need unique switch identifiers in such a system). But in principle, even "true" single-tree setups (with DSA links) should work just as fine, except for a small change which I can't test: dsa_towards_port should be used instead of dsa_upstream_port (I made the assumption that the routing port that any sja1105 should use towards its neighbours is the CPU port. That might not hold true in other setups). 
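To make the tag format concrete: the 12-bit dsa_8021q VID packs a direction, the switch id and the source port (the authoritative bit layout lives in net/dsa/tag_8021q.c; the diagram below is only approximate). Cross-chip bridging then boils down to installing one switch's RX VIDs into the other switch's VLAN table, which is what each dsa_8021q_crosschip_link_apply() call amounts to:

/* Rough sketch of the dsa_8021q VID bit layout:
 *
 *   11 10 | 9 | 8 7 6 | 5 4 | 3 2 1 0
 *    DIR  |rsv| SW_ID | rsv |  PORT
 */
u16 rx_vid = dsa_8021q_rx_vid(ds, port);	/* RX VID of (ds, port) */

/* Make rx_vid be accepted, egress-untagged, on the other switch's
 * port; dsa_8021q_crosschip_link_apply() does exactly this internally.
 */
err = dsa_8021q_crosschip_link_apply(ds, port, other_ds, other_port, true);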
Signed-off-by: Vladimir Oltean Reviewed-by: Florian Fainelli Signed-off-by: Jakub Kicinski --- drivers/net/dsa/sja1105/sja1105.h | 2 + drivers/net/dsa/sja1105/sja1105_main.c | 90 ++++++++++++++++++++ include/linux/dsa/8021q.h | 45 ++++++++++ net/dsa/tag_8021q.c | 151 +++++++++++++++++++++++++++++++++ 4 files changed, 288 insertions(+) (limited to 'include/linux') diff --git a/drivers/net/dsa/sja1105/sja1105.h b/drivers/net/dsa/sja1105/sja1105.h index 8df2a5c53b02..a64ace07b89f 100644 --- a/drivers/net/dsa/sja1105/sja1105.h +++ b/drivers/net/dsa/sja1105/sja1105.h @@ -8,6 +8,7 @@ #include #include #include +#include #include #include #include "sja1105_static_config.h" @@ -185,6 +186,7 @@ struct sja1105_private { struct gpio_desc *reset_gpio; struct spi_device *spidev; struct dsa_switch *ds; + struct list_head crosschip_links; struct sja1105_flow_block flow_block; struct sja1105_port ports[SJA1105_NUM_PORTS]; /* Serializes transmission of management frames so that diff --git a/drivers/net/dsa/sja1105/sja1105_main.c b/drivers/net/dsa/sja1105/sja1105_main.c index 666e54565df0..d5de9305df25 100644 --- a/drivers/net/dsa/sja1105/sja1105_main.c +++ b/drivers/net/dsa/sja1105/sja1105_main.c @@ -25,6 +25,8 @@ #include "sja1105_sgmii.h" #include "sja1105_tas.h" +static const struct dsa_switch_ops sja1105_switch_ops; + static void sja1105_hw_reset(struct gpio_desc *gpio, unsigned int pulse_len, unsigned int startup_delay) { @@ -1791,6 +1793,84 @@ static int sja1105_vlan_apply(struct sja1105_private *priv, int port, u16 vid, return 0; } +static int sja1105_crosschip_bridge_join(struct dsa_switch *ds, + int tree_index, int sw_index, + int other_port, struct net_device *br) +{ + struct dsa_switch *other_ds = dsa_switch_find(tree_index, sw_index); + struct sja1105_private *other_priv = other_ds->priv; + struct sja1105_private *priv = ds->priv; + int port, rc; + + if (other_ds->ops != &sja1105_switch_ops) + return 0; + + for (port = 0; port < ds->num_ports; port++) { + if (!dsa_is_user_port(ds, port)) + continue; + if (dsa_to_port(ds, port)->bridge_dev != br) + continue; + + rc = dsa_8021q_crosschip_bridge_join(ds, port, other_ds, + other_port, br, + &priv->crosschip_links); + if (rc) + return rc; + + rc = dsa_8021q_crosschip_bridge_join(other_ds, other_port, ds, + port, br, + &other_priv->crosschip_links); + if (rc) + return rc; + } + + return 0; +} + +static void sja1105_crosschip_bridge_leave(struct dsa_switch *ds, + int tree_index, int sw_index, + int other_port, + struct net_device *br) +{ + struct dsa_switch *other_ds = dsa_switch_find(tree_index, sw_index); + struct sja1105_private *other_priv = other_ds->priv; + struct sja1105_private *priv = ds->priv; + int port; + + if (other_ds->ops != &sja1105_switch_ops) + return; + + for (port = 0; port < ds->num_ports; port++) { + if (!dsa_is_user_port(ds, port)) + continue; + if (dsa_to_port(ds, port)->bridge_dev != br) + continue; + + dsa_8021q_crosschip_bridge_leave(ds, port, other_ds, other_port, + br, &priv->crosschip_links); + + dsa_8021q_crosschip_bridge_leave(other_ds, other_port, ds, + port, br, + &other_priv->crosschip_links); + } +} + +static int sja1105_replay_crosschip_vlans(struct dsa_switch *ds, bool enabled) +{ + struct sja1105_private *priv = ds->priv; + struct dsa_8021q_crosschip_link *c; + int rc; + + list_for_each_entry(c, &priv->crosschip_links, list) { + rc = dsa_8021q_crosschip_link_apply(ds, c->port, c->other_ds, + c->other_port, enabled); + if (rc) + break; + } + + return rc; +} + static int sja1105_setup_8021q_tagging(struct 
dsa_switch *ds, bool enabled) { int rc, i; @@ -1803,6 +1883,12 @@ static int sja1105_setup_8021q_tagging(struct dsa_switch *ds, bool enabled) return rc; } } + rc = sja1105_replay_crosschip_vlans(ds, enabled); + if (rc) { + dev_err(ds->dev, "Failed to replay crosschip VLANs: %d\n", rc); + return rc; + } + dev_info(ds->dev, "%s switch tagging\n", enabled ? "Enabled" : "Disabled"); return 0; @@ -2370,6 +2456,8 @@ static const struct dsa_switch_ops sja1105_switch_ops = { .cls_flower_add = sja1105_cls_flower_add, .cls_flower_del = sja1105_cls_flower_del, .cls_flower_stats = sja1105_cls_flower_stats, + .crosschip_bridge_join = sja1105_crosschip_bridge_join, + .crosschip_bridge_leave = sja1105_crosschip_bridge_leave, }; static int sja1105_check_device_id(struct sja1105_private *priv) @@ -2472,6 +2560,8 @@ static int sja1105_probe(struct spi_device *spi) mutex_init(&priv->ptp_data.lock); mutex_init(&priv->mgmt_lock); + INIT_LIST_HEAD(&priv->crosschip_links); + sja1105_tas_setup(ds); sja1105_flower_setup(ds); diff --git a/include/linux/dsa/8021q.h b/include/linux/dsa/8021q.h index c620d9139c28..b8daaec0896e 100644 --- a/include/linux/dsa/8021q.h +++ b/include/linux/dsa/8021q.h @@ -12,11 +12,33 @@ struct sk_buff; struct net_device; struct packet_type; +struct dsa_8021q_crosschip_link { + struct list_head list; + int port; + struct dsa_switch *other_ds; + int other_port; + refcount_t refcount; +}; + #if IS_ENABLED(CONFIG_NET_DSA_TAG_8021Q) int dsa_port_setup_8021q_tagging(struct dsa_switch *ds, int index, bool enabled); +int dsa_8021q_crosschip_link_apply(struct dsa_switch *ds, int port, + struct dsa_switch *other_ds, + int other_port, bool enabled); + +int dsa_8021q_crosschip_bridge_join(struct dsa_switch *ds, int port, + struct dsa_switch *other_ds, + int other_port, struct net_device *br, + struct list_head *crosschip_links); + +int dsa_8021q_crosschip_bridge_leave(struct dsa_switch *ds, int port, + struct dsa_switch *other_ds, + int other_port, struct net_device *br, + struct list_head *crosschip_links); + struct sk_buff *dsa_8021q_xmit(struct sk_buff *skb, struct net_device *netdev, u16 tpid, u16 tci); @@ -36,6 +58,29 @@ int dsa_port_setup_8021q_tagging(struct dsa_switch *ds, int index, return 0; } +int dsa_8021q_crosschip_link_apply(struct dsa_switch *ds, int port, + struct dsa_switch *other_ds, + int other_port, bool enabled) +{ + return 0; +} + +int dsa_8021q_crosschip_bridge_join(struct dsa_switch *ds, int port, + struct dsa_switch *other_ds, + int other_port, struct net_device *br, + struct list_head *crosschip_links) +{ + return 0; +} + +int dsa_8021q_crosschip_bridge_leave(struct dsa_switch *ds, int port, + struct dsa_switch *other_ds, + int other_port, struct net_device *br, + struct list_head *crosschip_links) +{ + return 0; +} + struct sk_buff *dsa_8021q_xmit(struct sk_buff *skb, struct net_device *netdev, u16 tpid, u16 tci) { diff --git a/net/dsa/tag_8021q.c b/net/dsa/tag_8021q.c index b97ad93d1c1a..ff9c5bf64bda 100644 --- a/net/dsa/tag_8021q.c +++ b/net/dsa/tag_8021q.c @@ -8,6 +8,7 @@ */ #include #include +#include #include "dsa_priv.h" @@ -288,6 +289,156 @@ int dsa_port_setup_8021q_tagging(struct dsa_switch *ds, int port, bool enabled) } EXPORT_SYMBOL_GPL(dsa_port_setup_8021q_tagging); +int dsa_8021q_crosschip_link_apply(struct dsa_switch *ds, int port, + struct dsa_switch *other_ds, + int other_port, bool enabled) +{ + u16 rx_vid = dsa_8021q_rx_vid(ds, port); + + /* @rx_vid of local @ds port @port goes to @other_port of + * @other_ds + */ + return dsa_8021q_vid_apply(other_ds, 
other_port, rx_vid, + BRIDGE_VLAN_INFO_UNTAGGED, enabled); +} +EXPORT_SYMBOL_GPL(dsa_8021q_crosschip_link_apply); + +static int dsa_8021q_crosschip_link_add(struct dsa_switch *ds, int port, + struct dsa_switch *other_ds, + int other_port, + struct list_head *crosschip_links) +{ + struct dsa_8021q_crosschip_link *c; + + list_for_each_entry(c, crosschip_links, list) { + if (c->port == port && c->other_ds == other_ds && + c->other_port == other_port) { + refcount_inc(&c->refcount); + return 0; + } + } + + dev_dbg(ds->dev, "adding crosschip link from port %d to %s port %d\n", + port, dev_name(other_ds->dev), other_port); + + c = kzalloc(sizeof(*c), GFP_KERNEL); + if (!c) + return -ENOMEM; + + c->port = port; + c->other_ds = other_ds; + c->other_port = other_port; + refcount_set(&c->refcount, 1); + + list_add(&c->list, crosschip_links); + + return 0; +} + +static void dsa_8021q_crosschip_link_del(struct dsa_switch *ds, + struct dsa_8021q_crosschip_link *c, + struct list_head *crosschip_links, + bool *keep) +{ + *keep = !refcount_dec_and_test(&c->refcount); + + if (*keep) + return; + + dev_dbg(ds->dev, + "deleting crosschip link from port %d to %s port %d\n", + c->port, dev_name(c->other_ds->dev), c->other_port); + + list_del(&c->list); + kfree(c); +} + +/* Make traffic from local port @port be received by remote port @other_port. + * This means that our @rx_vid needs to be installed on @other_ds's upstream + * and user ports. The user ports should be egress-untagged so that they can + * pop the dsa_8021q VLAN. But the @other_upstream can be either egress-tagged + * or untagged: it doesn't matter, since it should never egress a frame having + * our @rx_vid. + */ +int dsa_8021q_crosschip_bridge_join(struct dsa_switch *ds, int port, + struct dsa_switch *other_ds, + int other_port, struct net_device *br, + struct list_head *crosschip_links) +{ + /* @other_upstream is how @other_ds reaches us. If we are part + * of disjoint trees, then we are probably connected through + * our CPU ports. If we're part of the same tree though, we should + * probably use dsa_towards_port. 
+ */ + int other_upstream = dsa_upstream_port(other_ds, other_port); + int rc; + + rc = dsa_8021q_crosschip_link_add(ds, port, other_ds, + other_port, crosschip_links); + if (rc) + return rc; + + if (!br_vlan_enabled(br)) { + rc = dsa_8021q_crosschip_link_apply(ds, port, other_ds, + other_port, true); + if (rc) + return rc; + } + + rc = dsa_8021q_crosschip_link_add(ds, port, other_ds, + other_upstream, + crosschip_links); + if (rc) + return rc; + + if (!br_vlan_enabled(br)) { + rc = dsa_8021q_crosschip_link_apply(ds, port, other_ds, + other_upstream, true); + if (rc) + return rc; + } + + return 0; +} +EXPORT_SYMBOL_GPL(dsa_8021q_crosschip_bridge_join); + +int dsa_8021q_crosschip_bridge_leave(struct dsa_switch *ds, int port, + struct dsa_switch *other_ds, + int other_port, struct net_device *br, + struct list_head *crosschip_links) +{ + int other_upstream = dsa_upstream_port(other_ds, other_port); + struct dsa_8021q_crosschip_link *c, *n; + + list_for_each_entry_safe(c, n, crosschip_links, list) { + if (c->port == port && c->other_ds == other_ds && + (c->other_port == other_port || + c->other_port == other_upstream)) { + struct dsa_switch *other_ds = c->other_ds; + int other_port = c->other_port; + bool keep; + int rc; + + dsa_8021q_crosschip_link_del(ds, c, crosschip_links, + &keep); + if (keep) + continue; + + if (!br_vlan_enabled(br)) { + rc = dsa_8021q_crosschip_link_apply(ds, port, + other_ds, + other_port, + false); + if (rc) + return rc; + } + } + } + + return 0; +} +EXPORT_SYMBOL_GPL(dsa_8021q_crosschip_bridge_leave); + struct sk_buff *dsa_8021q_xmit(struct sk_buff *skb, struct net_device *netdev, u16 tpid, u16 tci) { -- cgit v1.2.3 From 479da1f538a2f3547e15f9d5922c611b69ec2fbc Mon Sep 17 00:00:00 2001 From: Noralf Trønnes Date: Sat, 9 May 2020 16:16:10 +0200 Subject: backlight: Add backlight_device_get_by_name() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add a way to lookup a backlight device based on its name. Will be used by a USB display gadget getting the name from configfs. Signed-off-by: Noralf Trønnes Reviewed-by: Sam Ravnborg Reviewed-by: Daniel Thompson Acked-by: Jingoo Han Signed-off-by: Lee Jones --- drivers/video/backlight/backlight.c | 21 +++++++++++++++++++++ include/linux/backlight.h | 1 + 2 files changed, 22 insertions(+) (limited to 'include/linux') diff --git a/drivers/video/backlight/backlight.c b/drivers/video/backlight/backlight.c index cac3e35d7630..92d80aa0c0ef 100644 --- a/drivers/video/backlight/backlight.c +++ b/drivers/video/backlight/backlight.c @@ -432,6 +432,27 @@ struct backlight_device *backlight_device_get_by_type(enum backlight_type type) } EXPORT_SYMBOL(backlight_device_get_by_type); +/** + * backlight_device_get_by_name - Get backlight device by name + * @name: Device name + * + * This function looks up a backlight device by its name. It obtains a reference + * on the backlight device and it is the caller's responsibility to drop the + * reference by calling backlight_put(). + * + * Returns: + * A pointer to the backlight device if found, otherwise NULL. + */ +struct backlight_device *backlight_device_get_by_name(const char *name) +{ + struct device *dev; + + dev = class_find_device_by_name(backlight_class, name); + + return dev ? to_backlight_device(dev) : NULL; +} +EXPORT_SYMBOL(backlight_device_get_by_name); + /** * backlight_device_unregister - unregisters a backlight device object. * @bd: the backlight device object to be unregistered and freed. 
diff --git a/include/linux/backlight.h b/include/linux/backlight.h index c7d6b2e8c3b5..56e4580d4f55 100644 --- a/include/linux/backlight.h +++ b/include/linux/backlight.h @@ -190,6 +190,7 @@ extern void backlight_force_update(struct backlight_device *bd, extern int backlight_register_notifier(struct notifier_block *nb); extern int backlight_unregister_notifier(struct notifier_block *nb); extern struct backlight_device *backlight_device_get_by_type(enum backlight_type type); +struct backlight_device *backlight_device_get_by_name(const char *name); extern int backlight_device_set_brightness(struct backlight_device *bd, unsigned long brightness); #define to_backlight_device(obj) container_of(obj, struct backlight_device, dev) -- cgit v1.2.3 From 83c411c29b90b7de505a2fe1d655293c95f8ba90 Mon Sep 17 00:00:00 2001 From: Miquel Raynal Date: Tue, 28 Apr 2020 11:42:54 +0200 Subject: mtd: rawnand: timings: Add mode information to the timings structure Convert the timings union into a structure containing the mode and the actual values. The values are still a union in anticipation of the addition of the NVDDR modes. Signed-off-by: Miquel Raynal Reviewed-by: Boris Brezillon Link: https://lore.kernel.org/linux-mtd/20200428094302.14624-2-miquel.raynal@bootlin.com --- drivers/mtd/nand/raw/nand_timings.c | 6 ++++++ include/linux/mtd/rawnand.h | 10 +++++++--- 2 files changed, 13 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/drivers/mtd/nand/raw/nand_timings.c b/drivers/mtd/nand/raw/nand_timings.c index f64b06a71dfa..0061cbaf931d 100644 --- a/drivers/mtd/nand/raw/nand_timings.c +++ b/drivers/mtd/nand/raw/nand_timings.c @@ -16,6 +16,7 @@ static const struct nand_data_interface onfi_sdr_timings[] = { /* Mode 0 */ { .type = NAND_SDR_IFACE, + .timings.mode = 0, .timings.sdr = { .tCCS_min = 500000, .tR_max = 200000000, @@ -58,6 +59,7 @@ static const struct nand_data_interface onfi_sdr_timings[] = { /* Mode 1 */ { .type = NAND_SDR_IFACE, + .timings.mode = 1, .timings.sdr = { .tCCS_min = 500000, .tR_max = 200000000, @@ -100,6 +102,7 @@ static const struct nand_data_interface onfi_sdr_timings[] = { /* Mode 2 */ { .type = NAND_SDR_IFACE, + .timings.mode = 2, .timings.sdr = { .tCCS_min = 500000, .tR_max = 200000000, @@ -142,6 +145,7 @@ static const struct nand_data_interface onfi_sdr_timings[] = { /* Mode 3 */ { .type = NAND_SDR_IFACE, + .timings.mode = 3, .timings.sdr = { .tCCS_min = 500000, .tR_max = 200000000, @@ -184,6 +188,7 @@ static const struct nand_data_interface onfi_sdr_timings[] = { /* Mode 4 */ { .type = NAND_SDR_IFACE, + .timings.mode = 4, .timings.sdr = { .tCCS_min = 500000, .tR_max = 200000000, @@ -226,6 +231,7 @@ static const struct nand_data_interface onfi_sdr_timings[] = { /* Mode 5 */ { .type = NAND_SDR_IFACE, + .timings.mode = 5, .timings.sdr = { .tCCS_min = 500000, .tR_max = 200000000, diff --git a/include/linux/mtd/rawnand.h b/include/linux/mtd/rawnand.h index 1e76196f9829..21873168ba4d 100644 --- a/include/linux/mtd/rawnand.h +++ b/include/linux/mtd/rawnand.h @@ -491,13 +491,17 @@ enum nand_data_interface_type { /** * struct nand_data_interface - NAND interface timing * @type: type of the timing - * @timings: The timing, type according to @type + * @timings: The timing information + * @timings.mode: Timing mode as defined in the specification + * @timings.sdr: Use it when @type is %NAND_SDR_IFACE.
*/ struct nand_data_interface { enum nand_data_interface_type type; - union { - struct nand_sdr_timings sdr; + struct nand_timings { + unsigned int mode; + union { + struct nand_sdr_timings sdr; + }; } timings; }; -- cgit v1.2.3 From 9e3307a169537a6adc30b13bf9063e94990a5493 Mon Sep 17 00:00:00 2001 From: Boris Brezillon Date: Sun, 3 May 2020 17:53:37 +0200 Subject: mtd: Add support for emulated SLC mode on MLC NANDs MLC NANDs can be made a bit more reliable if we only program the lower page of each pair. At least, this solves the paired-pages corruption issue. Signed-off-by: Boris Brezillon Signed-off-by: Miquel Raynal Link: https://lore.kernel.org/linux-mtd/20200503155341.16712-5-miquel.raynal@bootlin.com --- drivers/mtd/mtdcore.c | 189 +++++++++++++++++++++++++++++++++++++---- drivers/mtd/mtdpart.c | 54 +++++++----- include/linux/mtd/mtd.h | 7 +- include/linux/mtd/partitions.h | 2 + include/uapi/mtd/mtd-abi.h | 1 + 5 files changed, 213 insertions(+), 40 deletions(-) (limited to 'include/linux') diff --git a/drivers/mtd/mtdcore.c b/drivers/mtd/mtdcore.c index 2916674208b3..50437b4ffe76 100644 --- a/drivers/mtd/mtdcore.c +++ b/drivers/mtd/mtdcore.c @@ -617,6 +617,19 @@ int add_mtd_device(struct mtd_info *mtd) !(mtd->flags & MTD_NO_ERASE))) return -EINVAL; + /* + * MTD_SLC_ON_MLC_EMULATION can only be set on partitions, when the + * master is an MLC NAND and has a proper pairing scheme defined. + * We also reject masters that implement ->_writev() for now, because + * NAND controller drivers don't implement this hook, and adding the + * SLC -> MLC address/length conversion to this path is useless if we + * don't have a user. + */ + if (mtd->flags & MTD_SLC_ON_MLC_EMULATION && + (!mtd_is_partition(mtd) || master->type != MTD_MLCNANDFLASH || + !master->pairing || master->_writev)) + return -EINVAL; + mutex_lock(&mtd_table_mutex); i = idr_alloc(&mtd_idr, mtd, 0, 0, GFP_KERNEL); @@ -632,6 +645,14 @@ int add_mtd_device(struct mtd_info *mtd) if (mtd->bitflip_threshold == 0) mtd->bitflip_threshold = mtd->ecc_strength; + if (mtd->flags & MTD_SLC_ON_MLC_EMULATION) { + int ngroups = mtd_pairing_groups(master); + + mtd->erasesize /= ngroups; + mtd->size = (u64)mtd_div_by_eb(mtd->size, master) * + mtd->erasesize; + } + if (is_power_of_2(mtd->erasesize)) mtd->erasesize_shift = ffs(mtd->erasesize) - 1; else @@ -1074,9 +1095,11 @@ int mtd_erase(struct mtd_info *mtd, struct erase_info *instr) { struct mtd_info *master = mtd_get_master(mtd); u64 mst_ofs = mtd_get_master_ofs(mtd, 0); + struct erase_info adjinstr; int ret; instr->fail_addr = MTD_FAIL_ADDR_UNKNOWN; + adjinstr = *instr; if (!mtd->erasesize || !master->_erase) return -ENOTSUPP; @@ -1091,12 +1114,27 @@ int mtd_erase(struct mtd_info *mtd, struct erase_info *instr) ledtrig_mtd_activity(); - instr->addr += mst_ofs; - ret = master->_erase(master, instr); - if (instr->fail_addr != MTD_FAIL_ADDR_UNKNOWN) - instr->fail_addr -= mst_ofs; + if (mtd->flags & MTD_SLC_ON_MLC_EMULATION) { + adjinstr.addr = (loff_t)mtd_div_by_eb(instr->addr, mtd) * + master->erasesize; + adjinstr.len = ((u64)mtd_div_by_eb(instr->addr + instr->len, mtd) * + master->erasesize) - + adjinstr.addr; + } + + adjinstr.addr += mst_ofs; + + ret = master->_erase(master, &adjinstr); + + if (adjinstr.fail_addr != MTD_FAIL_ADDR_UNKNOWN) { + instr->fail_addr = adjinstr.fail_addr - mst_ofs; + if (mtd->flags & MTD_SLC_ON_MLC_EMULATION) { + instr->fail_addr = mtd_div_by_eb(instr->fail_addr, + master); + instr->fail_addr *= mtd->erasesize; + } + } - instr->addr -= mst_ofs; return ret; } 
EXPORT_SYMBOL_GPL(mtd_erase); @@ -1276,6 +1314,101 @@ static int mtd_check_oob_ops(struct mtd_info *mtd, loff_t offs, return 0; } +static int mtd_read_oob_std(struct mtd_info *mtd, loff_t from, + struct mtd_oob_ops *ops) +{ + struct mtd_info *master = mtd_get_master(mtd); + int ret; + + from = mtd_get_master_ofs(mtd, from); + if (master->_read_oob) + ret = master->_read_oob(master, from, ops); + else + ret = master->_read(master, from, ops->len, &ops->retlen, + ops->datbuf); + + return ret; +} + +static int mtd_write_oob_std(struct mtd_info *mtd, loff_t to, + struct mtd_oob_ops *ops) +{ + struct mtd_info *master = mtd_get_master(mtd); + int ret; + + to = mtd_get_master_ofs(mtd, to); + if (master->_write_oob) + ret = master->_write_oob(master, to, ops); + else + ret = master->_write(master, to, ops->len, &ops->retlen, + ops->datbuf); + + return ret; +} + +static int mtd_io_emulated_slc(struct mtd_info *mtd, loff_t start, bool read, + struct mtd_oob_ops *ops) +{ + struct mtd_info *master = mtd_get_master(mtd); + int ngroups = mtd_pairing_groups(master); + int npairs = mtd_wunit_per_eb(master) / ngroups; + struct mtd_oob_ops adjops = *ops; + unsigned int wunit, oobavail; + struct mtd_pairing_info info; + int max_bitflips = 0; + u32 ebofs, pageofs; + loff_t base, pos; + + ebofs = mtd_mod_by_eb(start, mtd); + base = (loff_t)mtd_div_by_eb(start, mtd) * master->erasesize; + info.group = 0; + info.pair = mtd_div_by_ws(ebofs, mtd); + pageofs = mtd_mod_by_ws(ebofs, mtd); + oobavail = mtd_oobavail(mtd, ops); + + while (ops->retlen < ops->len || ops->oobretlen < ops->ooblen) { + int ret; + + if (info.pair >= npairs) { + info.pair = 0; + base += master->erasesize; + } + + wunit = mtd_pairing_info_to_wunit(master, &info); + pos = mtd_wunit_to_offset(mtd, base, wunit); + + adjops.len = ops->len - ops->retlen; + if (adjops.len > mtd->writesize - pageofs) + adjops.len = mtd->writesize - pageofs; + + adjops.ooblen = ops->ooblen - ops->oobretlen; + if (adjops.ooblen > oobavail - adjops.ooboffs) + adjops.ooblen = oobavail - adjops.ooboffs; + + if (read) { + ret = mtd_read_oob_std(mtd, pos + pageofs, &adjops); + if (ret > 0) + max_bitflips = max(max_bitflips, ret); + } else { + ret = mtd_write_oob_std(mtd, pos + pageofs, &adjops); + } + + if (ret < 0) + return ret; + + max_bitflips = max(max_bitflips, ret); + ops->retlen += adjops.retlen; + ops->oobretlen += adjops.oobretlen; + adjops.datbuf += adjops.retlen; + adjops.oobbuf += adjops.oobretlen; + adjops.ooboffs = 0; + pageofs = 0; + info.pair++; + } + + return max_bitflips; +} + int mtd_read_oob(struct mtd_info *mtd, loff_t from, struct mtd_oob_ops *ops) { struct mtd_info *master = mtd_get_master(mtd); @@ -1294,12 +1427,10 @@ int mtd_read_oob(struct mtd_info *mtd, loff_t from, struct mtd_oob_ops *ops) if (!master->_read_oob && (!master->_read || ops->oobbuf)) return -EOPNOTSUPP; - from = mtd_get_master_ofs(mtd, from); - if (master->_read_oob) - ret_code = master->_read_oob(master, from, ops); + if (mtd->flags & MTD_SLC_ON_MLC_EMULATION) + ret_code = mtd_io_emulated_slc(mtd, from, true, ops); else - ret_code = master->_read(master, from, ops->len, &ops->retlen, - ops->datbuf); + ret_code = mtd_read_oob_std(mtd, from, ops); mtd_update_ecc_stats(mtd, master, &old_stats); @@ -1338,13 +1469,10 @@ int mtd_write_oob(struct mtd_info *mtd, loff_t to, if (!master->_write_oob && (!master->_write || ops->oobbuf)) return -EOPNOTSUPP; - to = mtd_get_master_ofs(mtd, to); + if (mtd->flags & MTD_SLC_ON_MLC_EMULATION) + return mtd_io_emulated_slc(mtd, to, false, ops); - if 
(master->_write_oob) - return master->_write_oob(master, to, ops); - else - return master->_write(master, to, ops->len, &ops->retlen, - ops->datbuf); + return mtd_write_oob_std(mtd, to, ops); } EXPORT_SYMBOL_GPL(mtd_write_oob); @@ -1817,6 +1945,12 @@ int mtd_lock(struct mtd_info *mtd, loff_t ofs, uint64_t len) return -EINVAL; if (!len) return 0; + + if (mtd->flags & MTD_SLC_ON_MLC_EMULATION) { + ofs = (loff_t)mtd_div_by_eb(ofs, mtd) * master->erasesize; + len = (u64)mtd_div_by_eb(len, mtd) * master->erasesize; + } + return master->_lock(master, mtd_get_master_ofs(mtd, ofs), len); } EXPORT_SYMBOL_GPL(mtd_lock); @@ -1831,6 +1965,12 @@ int mtd_unlock(struct mtd_info *mtd, loff_t ofs, uint64_t len) return -EINVAL; if (!len) return 0; + + if (mtd->flags & MTD_SLC_ON_MLC_EMULATION) { + ofs = (loff_t)mtd_div_by_eb(ofs, mtd) * master->erasesize; + len = (u64)mtd_div_by_eb(len, mtd) * master->erasesize; + } + return master->_unlock(master, mtd_get_master_ofs(mtd, ofs), len); } EXPORT_SYMBOL_GPL(mtd_unlock); @@ -1845,6 +1985,12 @@ int mtd_is_locked(struct mtd_info *mtd, loff_t ofs, uint64_t len) return -EINVAL; if (!len) return 0; + + if (mtd->flags & MTD_SLC_ON_MLC_EMULATION) { + ofs = (loff_t)mtd_div_by_eb(ofs, mtd) * master->erasesize; + len = (u64)mtd_div_by_eb(len, mtd) * master->erasesize; + } + return master->_is_locked(master, mtd_get_master_ofs(mtd, ofs), len); } EXPORT_SYMBOL_GPL(mtd_is_locked); @@ -1857,6 +2003,10 @@ int mtd_block_isreserved(struct mtd_info *mtd, loff_t ofs) return -EINVAL; if (!master->_block_isreserved) return 0; + + if (mtd->flags & MTD_SLC_ON_MLC_EMULATION) + ofs = (loff_t)mtd_div_by_eb(ofs, mtd) * master->erasesize; + return master->_block_isreserved(master, mtd_get_master_ofs(mtd, ofs)); } EXPORT_SYMBOL_GPL(mtd_block_isreserved); @@ -1869,6 +2019,10 @@ int mtd_block_isbad(struct mtd_info *mtd, loff_t ofs) return -EINVAL; if (!master->_block_isbad) return 0; + + if (mtd->flags & MTD_SLC_ON_MLC_EMULATION) + ofs = (loff_t)mtd_div_by_eb(ofs, mtd) * master->erasesize; + return master->_block_isbad(master, mtd_get_master_ofs(mtd, ofs)); } EXPORT_SYMBOL_GPL(mtd_block_isbad); @@ -1885,6 +2039,9 @@ int mtd_block_markbad(struct mtd_info *mtd, loff_t ofs) if (!(mtd->flags & MTD_WRITEABLE)) return -EROFS; + if (mtd->flags & MTD_SLC_ON_MLC_EMULATION) + ofs = (loff_t)mtd_div_by_eb(ofs, mtd) * master->erasesize; + ret = master->_block_markbad(master, mtd_get_master_ofs(mtd, ofs)); if (ret) return ret; diff --git a/drivers/mtd/mtdpart.c b/drivers/mtd/mtdpart.c index 3f6025684f58..c3575b686f79 100644 --- a/drivers/mtd/mtdpart.c +++ b/drivers/mtd/mtdpart.c @@ -35,9 +35,12 @@ static struct mtd_info *allocate_partition(struct mtd_info *parent, const struct mtd_partition *part, int partno, uint64_t cur_offset) { - int wr_alignment = (parent->flags & MTD_NO_ERASE) ? parent->writesize : - parent->erasesize; - struct mtd_info *child, *master = mtd_get_master(parent); + struct mtd_info *master = mtd_get_master(parent); + int wr_alignment = (parent->flags & MTD_NO_ERASE) ? + master->writesize : master->erasesize; + u64 parent_size = mtd_is_partition(parent) ? 
+ parent->part.size : parent->size; + struct mtd_info *child; u32 remainder; char *name; u64 tmp; @@ -56,8 +59,9 @@ static struct mtd_info *allocate_partition(struct mtd_info *parent, /* set up the MTD object for this partition */ child->type = parent->type; child->part.flags = parent->flags & ~part->mask_flags; + child->part.flags |= part->add_flags; child->flags = child->part.flags; - child->size = part->size; + child->part.size = part->size; child->writesize = parent->writesize; child->writebufsize = parent->writebufsize; child->oobsize = parent->oobsize; @@ -98,29 +102,29 @@ static struct mtd_info *allocate_partition(struct mtd_info *parent, } if (child->part.offset == MTDPART_OFS_RETAIN) { child->part.offset = cur_offset; - if (parent->size - child->part.offset >= child->size) { - child->size = parent->size - child->part.offset - - child->size; + if (parent_size - child->part.offset >= child->part.size) { + child->part.size = parent_size - child->part.offset - + child->part.size; } else { printk(KERN_ERR "mtd partition \"%s\" doesn't have enough space: %#llx < %#llx, disabled\n", - part->name, parent->size - child->part.offset, - child->size); + part->name, parent_size - child->part.offset, + child->part.size); /* register to preserve ordering */ goto out_register; } } - if (child->size == MTDPART_SIZ_FULL) - child->size = parent->size - child->part.offset; + if (child->part.size == MTDPART_SIZ_FULL) + child->part.size = parent_size - child->part.offset; printk(KERN_NOTICE "0x%012llx-0x%012llx : \"%s\"\n", - child->part.offset, child->part.offset + child->size, + child->part.offset, child->part.offset + child->part.size, child->name); /* let's do some sanity checks */ - if (child->part.offset >= parent->size) { + if (child->part.offset >= parent_size) { /* let's register it anyway to preserve ordering */ child->part.offset = 0; - child->size = 0; + child->part.size = 0; /* Initialize ->erasesize to make add_mtd_device() happy. 
*/ child->erasesize = parent->erasesize; @@ -128,15 +132,16 @@ static struct mtd_info *allocate_partition(struct mtd_info *parent, part->name); goto out_register; } - if (child->part.offset + child->size > parent->size) { - child->size = parent->size - child->part.offset; + if (child->part.offset + child->part.size > parent->size) { + child->part.size = parent_size - child->part.offset; printk(KERN_WARNING"mtd: partition \"%s\" extends beyond the end of device \"%s\" -- size truncated to %#llx\n", - part->name, parent->name, child->size); + part->name, parent->name, child->part.size); } + if (parent->numeraseregions > 1) { /* Deal with variable erase size stuff */ int i, max = parent->numeraseregions; - u64 end = child->part.offset + child->size; + u64 end = child->part.offset + child->part.size; struct mtd_erase_region_info *regions = parent->eraseregions; /* Find the first erase regions which is part of this @@ -156,7 +161,7 @@ static struct mtd_info *allocate_partition(struct mtd_info *parent, BUG_ON(child->erasesize == 0); } else { /* Single erase size */ - child->erasesize = parent->erasesize; + child->erasesize = master->erasesize; } /* @@ -178,7 +183,7 @@ static struct mtd_info *allocate_partition(struct mtd_info *parent, part->name); } - tmp = mtd_get_master_ofs(child, 0) + child->size; + tmp = mtd_get_master_ofs(child, 0) + child->part.size; remainder = do_div(tmp, wr_alignment); if ((child->flags & MTD_WRITEABLE) && remainder) { child->flags &= ~MTD_WRITEABLE; @@ -186,6 +191,7 @@ static struct mtd_info *allocate_partition(struct mtd_info *parent, part->name); } + child->size = child->part.size; child->ecc_step_size = parent->ecc_step_size; child->ecc_strength = parent->ecc_strength; child->bitflip_threshold = parent->bitflip_threshold; @@ -193,7 +199,7 @@ static struct mtd_info *allocate_partition(struct mtd_info *parent, if (master->_block_isbad) { uint64_t offs = 0; - while (offs < child->size) { + while (offs < child->part.size) { if (mtd_block_isreserved(child, offs)) child->ecc_stats.bbtblocks++; else if (mtd_block_isbad(child, offs)) @@ -234,6 +240,8 @@ int mtd_add_partition(struct mtd_info *parent, const char *name, long long offset, long long length) { struct mtd_info *master = mtd_get_master(parent); + u64 parent_size = mtd_is_partition(parent) ? + parent->part.size : parent->size; struct mtd_partition part; struct mtd_info *child; int ret = 0; @@ -244,7 +252,7 @@ int mtd_add_partition(struct mtd_info *parent, const char *name, return -EINVAL; if (length == MTDPART_SIZ_FULL) - length = parent->size - offset; + length = parent_size - offset; if (length <= 0) return -EINVAL; @@ -419,7 +427,7 @@ int add_mtd_partitions(struct mtd_info *parent, /* Look for subpartitions */ parse_mtd_partitions(child, parts[i].types, NULL); - cur_offset = child->part.offset + child->size; + cur_offset = child->part.offset + child->part.size; } return 0; diff --git a/include/linux/mtd/mtd.h b/include/linux/mtd/mtd.h index 2d1f4a61f4ac..157357ec1441 100644 --- a/include/linux/mtd/mtd.h +++ b/include/linux/mtd/mtd.h @@ -200,6 +200,8 @@ struct mtd_debug_info { * * @node: list node used to add an MTD partition to the parent partition list * @offset: offset of the partition relatively to the parent offset + * @size: partition size. 
Should be equal to mtd->size unless + * MTD_SLC_ON_MLC_EMULATION is set + * @flags: original flags (before the mtdpart logic decided to tweak them based * on flash constraints, like eraseblock/pagesize alignment) * @@ -209,6 +211,7 @@ struct mtd_debug_info { struct mtd_part { struct list_head node; u64 offset; + u64 size; u32 flags; }; @@ -622,7 +625,9 @@ static inline uint32_t mtd_mod_by_ws(uint64_t sz, struct mtd_info *mtd) static inline int mtd_wunit_per_eb(struct mtd_info *mtd) { - return mtd->erasesize / mtd->writesize; + struct mtd_info *master = mtd_get_master(mtd); + + return master->erasesize / mtd->writesize; } static inline int mtd_offset_to_wunit(struct mtd_info *mtd, loff_t offs) diff --git a/include/linux/mtd/partitions.h b/include/linux/mtd/partitions.h index e545c050d3e8..b74a539ec581 100644 --- a/include/linux/mtd/partitions.h +++ b/include/linux/mtd/partitions.h @@ -37,6 +37,7 @@ * master MTD flag set for the corresponding MTD partition. * For example, to force a read-only partition, simply adding * MTD_WRITEABLE to the mask_flags will do the trick. + * add_flags: contains flags to add to the parent flags * * Note: writeable partitions require their size and offset be * erasesize aligned (e.g. use MTDPART_OFS_NEXTBLK). @@ -48,6 +49,7 @@ struct mtd_partition { uint64_t size; /* partition size */ uint64_t offset; /* offset within the master MTD space */ uint32_t mask_flags; /* master MTD flags to mask out for this partition */ + uint32_t add_flags; /* flags to add to the partition */ struct device_node *of_node; }; diff --git a/include/uapi/mtd/mtd-abi.h b/include/uapi/mtd/mtd-abi.h index 47ffe3208c27..4b48fbf7d343 100644 --- a/include/uapi/mtd/mtd-abi.h +++ b/include/uapi/mtd/mtd-abi.h @@ -104,6 +104,7 @@ struct mtd_write_req { #define MTD_BIT_WRITEABLE 0x800 /* Single bits can be flipped */ #define MTD_NO_ERASE 0x1000 /* No erase necessary */ #define MTD_POWERUP_LOCK 0x2000 /* Always locked after reset */ +#define MTD_SLC_ON_MLC_EMULATION 0x4000 /* Emulate SLC behavior on MLC NANDs */ /* Some common devices / combinations of capabilities */ #define MTD_CAP_ROM 0 -- cgit v1.2.3 From dd6ed5c9890b759ba1b56697b9f3f50e71909e43 Mon Sep 17 00:00:00 2001 From: Miquel Raynal Date: Thu, 7 May 2020 12:52:29 +0200 Subject: mtd: rawnand: Translate obscure bitfields into readable macros Use the BIT() macro instead of defining an 8-digit value. Signed-off-by: Miquel Raynal Reviewed-by: Boris Brezillon Link: https://lore.kernel.org/linux-mtd/20200507105241.14299-2-miquel.raynal@bootlin.com --- include/linux/mtd/rawnand.h | 38 +++++++++++++++++++------------------- 1 file changed, 19 insertions(+), 19 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mtd/rawnand.h b/include/linux/mtd/rawnand.h index 21873168ba4d..4b58de842340 100644 --- a/include/linux/mtd/rawnand.h +++ b/include/linux/mtd/rawnand.h @@ -129,36 +129,36 @@ enum nand_ecc_algo { * features. */ /* Buswidth is 16 bit */ -#define NAND_BUSWIDTH_16 0x00000002 +#define NAND_BUSWIDTH_16 BIT(1) /* Chip has cache program function */ -#define NAND_CACHEPRG 0x00000008 +#define NAND_CACHEPRG BIT(3) /* * Chip requires ready check on read (for auto-incremented sequential read). * True only for small page devices; large page devices do not support * autoincrement.
*/ -#define NAND_NEED_READRDY 0x00000100 +#define NAND_NEED_READRDY BIT(8) /* Chip does not allow subpage writes */ -#define NAND_NO_SUBPAGE_WRITE 0x00000200 +#define NAND_NO_SUBPAGE_WRITE BIT(9) /* Device is one of 'new' xD cards that expose fake nand command set */ -#define NAND_BROKEN_XD 0x00000400 +#define NAND_BROKEN_XD BIT(10) /* Device behaves just like nand, but is readonly */ -#define NAND_ROM 0x00000800 +#define NAND_ROM BIT(11) /* Device supports subpage reads */ -#define NAND_SUBPAGE_READ 0x00001000 +#define NAND_SUBPAGE_READ BIT(12) /* * Some MLC NANDs need data scrambling to limit bitflips caused by repeated * patterns. */ -#define NAND_NEED_SCRAMBLING 0x00002000 +#define NAND_NEED_SCRAMBLING BIT(13) /* Device needs 3rd row address cycle */ -#define NAND_ROW_ADDR_3 0x00004000 +#define NAND_ROW_ADDR_3 BIT(14) /* Options valid for Samsung large page devices */ #define NAND_SAMSUNG_LP_OPTIONS NAND_CACHEPRG @@ -173,9 +173,9 @@ enum nand_ecc_algo { * Position within the block: Each of these pages needs to be checked for a * bad block marking pattern. */ -#define NAND_BBM_FIRSTPAGE 0x01000000 -#define NAND_BBM_SECONDPAGE 0x02000000 -#define NAND_BBM_LASTPAGE 0x04000000 +#define NAND_BBM_FIRSTPAGE BIT(24) +#define NAND_BBM_SECONDPAGE BIT(25) +#define NAND_BBM_LASTPAGE BIT(26) /* Position within the OOB data of the page */ #define NAND_BBM_POS_SMALL 5 @@ -183,21 +183,21 @@ enum nand_ecc_algo { /* Non chip related options */ /* This option skips the bbt scan during initialization. */ -#define NAND_SKIP_BBTSCAN 0x00010000 +#define NAND_SKIP_BBTSCAN BIT(16) /* Chip may not exist, so silence any errors in scan */ -#define NAND_SCAN_SILENT_NODEV 0x00040000 +#define NAND_SCAN_SILENT_NODEV BIT(18) /* * Autodetect nand buswidth with readid/onfi. * This suppose the driver will configure the hardware in 8 bits mode * when calling nand_scan_ident, and update its configuration * before calling nand_scan_tail. */ -#define NAND_BUSWIDTH_AUTO 0x00080000 +#define NAND_BUSWIDTH_AUTO BIT(19) /* * This option could be defined by controller drivers to protect against * kmap'ed, vmalloc'ed highmem buffers being passed from upper layers */ -#define NAND_USE_BOUNCE_BUFFER 0x00100000 +#define NAND_USE_BOUNCE_BUFFER BIT(20) /* * In case your controller is implementing ->legacy.cmd_ctrl() and is relying @@ -207,20 +207,20 @@ enum nand_ecc_algo { * If your controller already takes care of this delay, you don't need to set * this flag. */ -#define NAND_WAIT_TCCS 0x00200000 +#define NAND_WAIT_TCCS BIT(21) /* * Whether the NAND chip is a boot medium. Drivers might use this information * to select ECC algorithms supported by the boot ROM or similar restrictions. */ -#define NAND_IS_BOOT_MEDIUM 0x00400000 +#define NAND_IS_BOOT_MEDIUM BIT(22) /* * Do not try to tweak the timings at runtime. This is needed when the * controller initializes the timings on itself or when it relies on * configuration done by the bootloader. */ -#define NAND_KEEP_TIMINGS 0x00800000 +#define NAND_KEEP_TIMINGS BIT(23) /* Cell info constants */ #define NAND_CI_CHIPNR_MSK 0x03 -- cgit v1.2.3 From 96d627bdf112d116a79c86435550cd18b9a9d82b Mon Sep 17 00:00:00 2001 From: Miquel Raynal Date: Thu, 7 May 2020 12:52:30 +0200 Subject: mtd: rawnand: Reorder the nand_chip->options flags These flags are in a strange order; reorder the list, add spacing where relevant, and pack related definitions together. There is no functional change.
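As a self-contained illustration of the BIT() conversion above (every name below is invented for the sketch), the macro makes the bit position, and any deliberate gaps in the flag space, visible at a glance:

#include <linux/bits.h>

/* Invented example flags: compare BIT(9) with the old 0x00000200 spelling. */
#define DEMO_OPT_CACHEPRG	BIT(3)
#define DEMO_OPT_NO_SUBPAGE	BIT(9)

static inline bool demo_can_cacheprg(unsigned int options)
{
	return options & DEMO_OPT_CACHEPRG;
}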
Signed-off-by: Miquel Raynal Reviewed-by: Boris Brezillon Link: https://lore.kernel.org/linux-mtd/20200507105241.14299-3-miquel.raynal@bootlin.com --- include/linux/mtd/rawnand.h | 57 ++++++++++++++++++++++++--------------------- 1 file changed, 30 insertions(+), 27 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mtd/rawnand.h b/include/linux/mtd/rawnand.h index 4b58de842340..e70fea67030b 100644 --- a/include/linux/mtd/rawnand.h +++ b/include/linux/mtd/rawnand.h @@ -118,20 +118,25 @@ enum nand_ecc_algo { #define NAND_ECC_GENERIC_ERASED_CHECK BIT(0) #define NAND_ECC_MAXIMIZE BIT(1) -/* - * When using software implementation of Hamming, we can specify which byte - * ordering should be used. - */ -#define NAND_ECC_SOFT_HAMMING_SM_ORDER BIT(2) - /* * Option constants for bizarre disfunctionality and real * features. */ + /* Buswidth is 16 bit */ #define NAND_BUSWIDTH_16 BIT(1) + +/* + * When using software implementation of Hamming, we can specify which byte + * ordering should be used. + */ +#define NAND_ECC_SOFT_HAMMING_SM_ORDER BIT(2) + /* Chip has cache program function */ #define NAND_CACHEPRG BIT(3) +/* Options valid for Samsung large page devices */ +#define NAND_SAMSUNG_LP_OPTIONS NAND_CACHEPRG + /* * Chip requires ready check on read (for auto-incremented sequential read). * True only for small page devices; large page devices do not support @@ -150,6 +155,8 @@ enum nand_ecc_algo { /* Device supports subpage reads */ #define NAND_SUBPAGE_READ BIT(12) +/* Macros to identify the above */ +#define NAND_HAS_SUBPAGE_READ(chip) ((chip->options & NAND_SUBPAGE_READ)) /* * Some MLC NANDs need data scrambling to limit bitflips caused by repeated @@ -160,32 +167,12 @@ enum nand_ecc_algo { /* Device needs 3rd row address cycle */ #define NAND_ROW_ADDR_3 BIT(14) -/* Options valid for Samsung large page devices */ -#define NAND_SAMSUNG_LP_OPTIONS NAND_CACHEPRG - -/* Macros to identify the above */ -#define NAND_HAS_SUBPAGE_READ(chip) ((chip->options & NAND_SUBPAGE_READ)) - -/* - * There are different places where the manufacturer stores the factory bad - * block markers. - * - * Position within the block: Each of these pages needs to be checked for a - * bad block marking pattern. - */ -#define NAND_BBM_FIRSTPAGE BIT(24) -#define NAND_BBM_SECONDPAGE BIT(25) -#define NAND_BBM_LASTPAGE BIT(26) - -/* Position within the OOB data of the page */ -#define NAND_BBM_POS_SMALL 5 -#define NAND_BBM_POS_LARGE 0 - /* Non chip related options */ /* This option skips the bbt scan during initialization. */ #define NAND_SKIP_BBTSCAN BIT(16) /* Chip may not exist, so silence any errors in scan */ #define NAND_SCAN_SILENT_NODEV BIT(18) + /* * Autodetect nand buswidth with readid/onfi. * This suppose the driver will configure the hardware in 8 bits mode @@ -193,6 +180,7 @@ enum nand_ecc_algo { * before calling nand_scan_tail. */ #define NAND_BUSWIDTH_AUTO BIT(19) + /* * This option could be defined by controller drivers to protect against * kmap'ed, vmalloc'ed highmem buffers being passed from upper layers @@ -222,11 +210,26 @@ enum nand_ecc_algo { */ #define NAND_KEEP_TIMINGS BIT(23) +/* + * There are different places where the manufacturer stores the factory bad + * block markers. + * + * Position within the block: Each of these pages needs to be checked for a + * bad block marking pattern. 
+ */ +#define NAND_BBM_FIRSTPAGE BIT(24) +#define NAND_BBM_SECONDPAGE BIT(25) +#define NAND_BBM_LASTPAGE BIT(26) + /* Cell info constants */ #define NAND_CI_CHIPNR_MSK 0x03 #define NAND_CI_CELLTYPE_MSK 0x0C #define NAND_CI_CELLTYPE_SHIFT 2 +/* Position within the OOB data of the page */ +#define NAND_BBM_POS_SMALL 5 +#define NAND_BBM_POS_LARGE 0 + /** * struct nand_parameters - NAND generic parameters from the parameter page * @model: Model name -- cgit v1.2.3 From ce8148d7b8f204a18188e3cd7386c8dddbb461a1 Mon Sep 17 00:00:00 2001 From: Miquel Raynal Date: Thu, 7 May 2020 12:52:31 +0200 Subject: mtd: rawnand: Rename a NAND chip option NAND controller drivers can set the NAND_USE_BOUNCE_BUFFER flag to a chip 'option' field. With this flag, the core is responsible for providing DMA-able buffers. The current behavior is not to force the use of a bounce buffer when the core thinks this is not needed. So in the end the name is a bit misleading, because in theory we will always have a DMA buffer but in practice it will not always be a bounce buffer. Rename this flag to NAND_USES_DMA to be more accurate. Signed-off-by: Miquel Raynal Reviewed-by: Boris Brezillon Link: https://lore.kernel.org/linux-mtd/20200507105241.14299-4-miquel.raynal@bootlin.com --- drivers/mtd/nand/raw/atmel/nand-controller.c | 2 +- drivers/mtd/nand/raw/brcmnand/brcmnand.c | 2 +- drivers/mtd/nand/raw/denali.c | 2 +- drivers/mtd/nand/raw/meson_nand.c | 2 +- drivers/mtd/nand/raw/mtk_nand.c | 2 +- drivers/mtd/nand/raw/nand_base.c | 4 ++-- drivers/mtd/nand/raw/qcom_nandc.c | 2 +- drivers/mtd/nand/raw/stm32_fmc2_nand.c | 2 +- drivers/mtd/nand/raw/sunxi_nand.c | 2 +- drivers/mtd/nand/raw/tango_nand.c | 2 +- drivers/mtd/nand/raw/tegra_nand.c | 2 +- include/linux/mtd/rawnand.h | 2 +- 12 files changed, 13 insertions(+), 13 deletions(-) (limited to 'include/linux') diff --git a/drivers/mtd/nand/raw/atmel/nand-controller.c b/drivers/mtd/nand/raw/atmel/nand-controller.c index 3ba17a98df4d..46a3724a788e 100644 --- a/drivers/mtd/nand/raw/atmel/nand-controller.c +++ b/drivers/mtd/nand/raw/atmel/nand-controller.c @@ -1494,7 +1494,7 @@ static void atmel_nand_init(struct atmel_nand_controller *nc, * suitable for DMA. */ if (nc->dmac) - chip->options |= NAND_USE_BOUNCE_BUFFER; + chip->options |= NAND_USES_DMA; /* Default to HW ECC if pmecc is available. */ if (nc->pmecc) diff --git a/drivers/mtd/nand/raw/brcmnand/brcmnand.c b/drivers/mtd/nand/raw/brcmnand/brcmnand.c index 57076c3d98dc..fe7cd3aa0cd6 100644 --- a/drivers/mtd/nand/raw/brcmnand/brcmnand.c +++ b/drivers/mtd/nand/raw/brcmnand/brcmnand.c @@ -2576,7 +2576,7 @@ static int brcmnand_attach_chip(struct nand_chip *chip) * to/from, and have nand_base pass us a bounce buffer instead, as * needed.
*/ - chip->options |= NAND_USE_BOUNCE_BUFFER; + chip->options |= NAND_USES_DMA; if (chip->bbt_options & NAND_BBT_USE_FLASH) chip->bbt_options |= NAND_BBT_NO_OOB; diff --git a/drivers/mtd/nand/raw/denali.c b/drivers/mtd/nand/raw/denali.c index 2fcd2baf6e35..7a76b761dd0b 100644 --- a/drivers/mtd/nand/raw/denali.c +++ b/drivers/mtd/nand/raw/denali.c @@ -1226,7 +1226,7 @@ int denali_chip_init(struct denali_controller *denali, mtd->name = "denali-nand"; if (denali->dma_avail) { - chip->options |= NAND_USE_BOUNCE_BUFFER; + chip->options |= NAND_USES_DMA; chip->buf_align = 16; } diff --git a/drivers/mtd/nand/raw/meson_nand.c b/drivers/mtd/nand/raw/meson_nand.c index e961f7bebf0a..3f376471f3f7 100644 --- a/drivers/mtd/nand/raw/meson_nand.c +++ b/drivers/mtd/nand/raw/meson_nand.c @@ -1269,7 +1269,7 @@ meson_nfc_nand_chip_init(struct device *dev, nand_set_flash_node(nand, np); nand_set_controller_data(nand, nfc); - nand->options |= NAND_USE_BOUNCE_BUFFER; + nand->options |= NAND_USES_DMA; mtd = nand_to_mtd(nand); mtd->owner = THIS_MODULE; mtd->dev.parent = dev; diff --git a/drivers/mtd/nand/raw/mtk_nand.c b/drivers/mtd/nand/raw/mtk_nand.c index ef149e8b26d0..e7ec30e784fd 100644 --- a/drivers/mtd/nand/raw/mtk_nand.c +++ b/drivers/mtd/nand/raw/mtk_nand.c @@ -1380,7 +1380,7 @@ static int mtk_nfc_nand_chip_init(struct device *dev, struct mtk_nfc *nfc, nand_set_flash_node(nand, np); nand_set_controller_data(nand, nfc); - nand->options |= NAND_USE_BOUNCE_BUFFER | NAND_SUBPAGE_READ; + nand->options |= NAND_USES_DMA | NAND_SUBPAGE_READ; nand->legacy.dev_ready = mtk_nfc_dev_ready; nand->legacy.select_chip = mtk_nfc_select_chip; nand->legacy.write_byte = mtk_nfc_write_byte; diff --git a/drivers/mtd/nand/raw/nand_base.c b/drivers/mtd/nand/raw/nand_base.c index 106edd514c00..906b8cd94bc4 100644 --- a/drivers/mtd/nand/raw/nand_base.c +++ b/drivers/mtd/nand/raw/nand_base.c @@ -3241,7 +3241,7 @@ static int nand_do_read_ops(struct nand_chip *chip, loff_t from, if (!aligned) use_bufpoi = 1; - else if (chip->options & NAND_USE_BOUNCE_BUFFER) + else if (chip->options & NAND_USES_DMA) use_bufpoi = !virt_addr_valid(buf) || !IS_ALIGNED((unsigned long)buf, chip->buf_align); @@ -4067,7 +4067,7 @@ static int nand_do_write_ops(struct nand_chip *chip, loff_t to, if (part_pagewr) use_bufpoi = 1; - else if (chip->options & NAND_USE_BOUNCE_BUFFER) + else if (chip->options & NAND_USES_DMA) use_bufpoi = !virt_addr_valid(buf) || !IS_ALIGNED((unsigned long)buf, chip->buf_align); diff --git a/drivers/mtd/nand/raw/qcom_nandc.c b/drivers/mtd/nand/raw/qcom_nandc.c index 5b11c7061497..9ab22c5d4166 100644 --- a/drivers/mtd/nand/raw/qcom_nandc.c +++ b/drivers/mtd/nand/raw/qcom_nandc.c @@ -2836,7 +2836,7 @@ static int qcom_nand_host_init_and_register(struct qcom_nand_controller *nandc, chip->legacy.block_markbad = qcom_nandc_block_markbad; chip->controller = &nandc->controller; - chip->options |= NAND_NO_SUBPAGE_WRITE | NAND_USE_BOUNCE_BUFFER | + chip->options |= NAND_NO_SUBPAGE_WRITE | NAND_USES_DMA | NAND_SKIP_BBTSCAN; /* set up initial status value */ diff --git a/drivers/mtd/nand/raw/stm32_fmc2_nand.c b/drivers/mtd/nand/raw/stm32_fmc2_nand.c index 46b7d04e2c87..c5fde09e0175 100644 --- a/drivers/mtd/nand/raw/stm32_fmc2_nand.c +++ b/drivers/mtd/nand/raw/stm32_fmc2_nand.c @@ -1987,7 +1987,7 @@ static int stm32_fmc2_probe(struct platform_device *pdev) chip->controller = &fmc2->base; chip->options |= NAND_BUSWIDTH_AUTO | NAND_NO_SUBPAGE_WRITE | - NAND_USE_BOUNCE_BUFFER; + NAND_USES_DMA; /* Default ECC settings */ chip->ecc.mode = 
NAND_ECC_HW; diff --git a/drivers/mtd/nand/raw/sunxi_nand.c b/drivers/mtd/nand/raw/sunxi_nand.c index 18ac0b36abfa..26d862213cac 100644 --- a/drivers/mtd/nand/raw/sunxi_nand.c +++ b/drivers/mtd/nand/raw/sunxi_nand.c @@ -1698,7 +1698,7 @@ static int sunxi_nand_hw_ecc_ctrl_init(struct nand_chip *nand, ecc->read_page = sunxi_nfc_hw_ecc_read_page_dma; ecc->read_subpage = sunxi_nfc_hw_ecc_read_subpage_dma; ecc->write_page = sunxi_nfc_hw_ecc_write_page_dma; - nand->options |= NAND_USE_BOUNCE_BUFFER; + nand->options |= NAND_USES_DMA; } else { ecc->read_page = sunxi_nfc_hw_ecc_read_page; ecc->read_subpage = sunxi_nfc_hw_ecc_read_subpage; diff --git a/drivers/mtd/nand/raw/tango_nand.c b/drivers/mtd/nand/raw/tango_nand.c index 9acf2de37ee0..b92de603e6db 100644 --- a/drivers/mtd/nand/raw/tango_nand.c +++ b/drivers/mtd/nand/raw/tango_nand.c @@ -568,7 +568,7 @@ static int chip_init(struct device *dev, struct device_node *np) chip->legacy.select_chip = tango_select_chip; chip->legacy.cmd_ctrl = tango_cmd_ctrl; chip->legacy.dev_ready = tango_dev_ready; - chip->options = NAND_USE_BOUNCE_BUFFER | + chip->options = NAND_USES_DMA | NAND_NO_SUBPAGE_WRITE | NAND_WAIT_TCCS; chip->controller = &nfc->hw; diff --git a/drivers/mtd/nand/raw/tegra_nand.c b/drivers/mtd/nand/raw/tegra_nand.c index 6a255ba0f288..f9d046b2cd3b 100644 --- a/drivers/mtd/nand/raw/tegra_nand.c +++ b/drivers/mtd/nand/raw/tegra_nand.c @@ -1115,7 +1115,7 @@ static int tegra_nand_chips_init(struct device *dev, if (!mtd->name) mtd->name = "tegra_nand"; - chip->options = NAND_NO_SUBPAGE_WRITE | NAND_USE_BOUNCE_BUFFER; + chip->options = NAND_NO_SUBPAGE_WRITE | NAND_USES_DMA; ret = nand_scan(chip, 1); if (ret) diff --git a/include/linux/mtd/rawnand.h b/include/linux/mtd/rawnand.h index e70fea67030b..d1f5c5258e35 100644 --- a/include/linux/mtd/rawnand.h +++ b/include/linux/mtd/rawnand.h @@ -185,7 +185,7 @@ enum nand_ecc_algo { * This option could be defined by controller drivers to protect against * kmap'ed, vmalloc'ed highmem buffers being passed from upper layers */ -#define NAND_USE_BOUNCE_BUFFER BIT(20) +#define NAND_USES_DMA BIT(20) /* * In case your controller is implementing ->legacy.cmd_ctrl() and is relying -- cgit v1.2.3 From b451f5beece3f5556920992e7498d23f6da6ef6e Mon Sep 17 00:00:00 2001 From: Miquel Raynal Date: Thu, 7 May 2020 12:52:36 +0200 Subject: mtd: rawnand: Give the possibility to verify a read operation is supported This can be used to discriminate between two paths in the parameter page detection: use data_in cycles (as before) if supported, or use the CHANGE READ COLUMN command otherwise.
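A minimal sketch of how a detection path might use the new check_only argument (the probe helper is invented for illustration; the extended nand_read_data_op() prototype is the one introduced below):

/*
 * Ask the controller whether raw data-in cycles are supported without
 * actually running them: with check_only=true the operation is only
 * validated (via nand_check_op() on the ->exec_op() path), never executed.
 */
static bool nand_can_do_data_in(struct nand_chip *chip)
{
	u8 dummy;

	return !nand_read_data_op(chip, &dummy, sizeof(dummy), true, true);
}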
Signed-off-by: Miquel Raynal Reviewed-by: Boris Brezillon Link: https://lore.kernel.org/linux-mtd/20200507105241.14299-9-miquel.raynal@bootlin.com --- drivers/mtd/nand/raw/fsmc_nand.c | 2 +- drivers/mtd/nand/raw/marvell_nand.c | 4 ++-- drivers/mtd/nand/raw/nand_base.c | 48 ++++++++++++++++++++++--------------- drivers/mtd/nand/raw/nand_jedec.c | 2 +- drivers/mtd/nand/raw/nand_legacy.c | 8 ++++--- drivers/mtd/nand/raw/nand_micron.c | 6 ++--- drivers/mtd/nand/raw/nand_onfi.c | 3 ++- include/linux/mtd/rawnand.h | 2 +- 8 files changed, 44 insertions(+), 31 deletions(-) (limited to 'include/linux') diff --git a/drivers/mtd/nand/raw/fsmc_nand.c b/drivers/mtd/nand/raw/fsmc_nand.c index 31dc9fd0a94d..2a9222e99bcc 100644 --- a/drivers/mtd/nand/raw/fsmc_nand.c +++ b/drivers/mtd/nand/raw/fsmc_nand.c @@ -694,7 +694,7 @@ static int fsmc_read_page_hwecc(struct nand_chip *chip, u8 *buf, for (i = 0, s = 0; s < eccsteps; s++, i += eccbytes, p += eccsize) { nand_read_page_op(chip, page, s * eccsize, NULL, 0); chip->ecc.hwctl(chip, NAND_ECC_READ); - ret = nand_read_data_op(chip, p, eccsize, false); + ret = nand_read_data_op(chip, p, eccsize, false, false); if (ret) return ret; diff --git a/drivers/mtd/nand/raw/marvell_nand.c b/drivers/mtd/nand/raw/marvell_nand.c index 88269c44b2b4..a79ce4bdd31c 100644 --- a/drivers/mtd/nand/raw/marvell_nand.c +++ b/drivers/mtd/nand/raw/marvell_nand.c @@ -1224,12 +1224,12 @@ static int marvell_nfc_hw_ecc_bch_read_page_raw(struct nand_chip *chip, u8 *buf, /* Read spare bytes */ nand_read_data_op(chip, oob + (lt->spare_bytes * chunk), - spare_len, false); + spare_len, false, false); /* Read ECC bytes */ nand_read_data_op(chip, oob + ecc_offset + (ALIGN(lt->ecc_bytes, 32) * chunk), - ecc_len, false); + ecc_len, false, false); } return 0; diff --git a/drivers/mtd/nand/raw/nand_base.c b/drivers/mtd/nand/raw/nand_base.c index c440f11a34e9..8df864881fff 100644 --- a/drivers/mtd/nand/raw/nand_base.c +++ b/drivers/mtd/nand/raw/nand_base.c @@ -740,7 +740,8 @@ int nand_soft_waitrdy(struct nand_chip *chip, unsigned long timeout_ms) */ timeout_ms = jiffies + msecs_to_jiffies(timeout_ms) + 1; do { - ret = nand_read_data_op(chip, &status, sizeof(status), true); + ret = nand_read_data_op(chip, &status, sizeof(status), true, + false); if (ret) break; @@ -820,7 +821,7 @@ void panic_nand_wait(struct nand_chip *chip, unsigned long timeo) u8 status; ret = nand_read_data_op(chip, &status, sizeof(status), - true); + true, false); if (ret) return; @@ -1918,6 +1919,8 @@ EXPORT_SYMBOL_GPL(nand_reset_op); * @buf: buffer used to store the data * @len: length of the buffer * @force_8bit: force 8-bit bus access + * @check_only: do not actually run the command, only checks if the + * controller driver supports it * * This function does a raw data read on the bus. Usually used after launching * another NAND operation like nand_read_page_op(). @@ -1926,7 +1929,7 @@ EXPORT_SYMBOL_GPL(nand_reset_op); * Returns 0 on success, a negative error code otherwise. 
*/ int nand_read_data_op(struct nand_chip *chip, void *buf, unsigned int len, - bool force_8bit) + bool force_8bit, bool check_only) { if (!len || !buf) return -EINVAL; @@ -1939,9 +1942,15 @@ int nand_read_data_op(struct nand_chip *chip, void *buf, unsigned int len, instrs[0].ctx.data.force_8bit = force_8bit; + if (check_only) + return nand_check_op(chip, &op); + return nand_exec_op(chip, &op); } + if (check_only) + return 0; + if (force_8bit) { u8 *p = buf; unsigned int i; @@ -2670,7 +2679,7 @@ int nand_read_page_raw(struct nand_chip *chip, uint8_t *buf, int oob_required, if (oob_required) { ret = nand_read_data_op(chip, chip->oob_poi, mtd->oobsize, - false); + false, false); if (ret) return ret; } @@ -2702,7 +2711,7 @@ static int nand_read_page_raw_syndrome(struct nand_chip *chip, uint8_t *buf, return ret; for (steps = chip->ecc.steps; steps > 0; steps--) { - ret = nand_read_data_op(chip, buf, eccsize, false); + ret = nand_read_data_op(chip, buf, eccsize, false, false); if (ret) return ret; @@ -2710,14 +2719,14 @@ static int nand_read_page_raw_syndrome(struct nand_chip *chip, uint8_t *buf, if (chip->ecc.prepad) { ret = nand_read_data_op(chip, oob, chip->ecc.prepad, - false); + false, false); if (ret) return ret; oob += chip->ecc.prepad; } - ret = nand_read_data_op(chip, oob, eccbytes, false); + ret = nand_read_data_op(chip, oob, eccbytes, false, false); if (ret) return ret; @@ -2725,7 +2734,7 @@ static int nand_read_page_raw_syndrome(struct nand_chip *chip, uint8_t *buf, if (chip->ecc.postpad) { ret = nand_read_data_op(chip, oob, chip->ecc.postpad, - false); + false, false); if (ret) return ret; @@ -2735,7 +2744,7 @@ static int nand_read_page_raw_syndrome(struct nand_chip *chip, uint8_t *buf, size = mtd->oobsize - (oob - chip->oob_poi); if (size) { - ret = nand_read_data_op(chip, oob, size, false); + ret = nand_read_data_op(chip, oob, size, false, false); if (ret) return ret; } @@ -2928,14 +2937,15 @@ static int nand_read_page_hwecc(struct nand_chip *chip, uint8_t *buf, for (i = 0; eccsteps; eccsteps--, i += eccbytes, p += eccsize) { chip->ecc.hwctl(chip, NAND_ECC_READ); - ret = nand_read_data_op(chip, p, eccsize, false); + ret = nand_read_data_op(chip, p, eccsize, false, false); if (ret) return ret; chip->ecc.calculate(chip, p, &ecc_calc[i]); } - ret = nand_read_data_op(chip, chip->oob_poi, mtd->oobsize, false); + ret = nand_read_data_op(chip, chip->oob_poi, mtd->oobsize, false, + false); if (ret) return ret; @@ -3014,7 +3024,7 @@ static int nand_read_page_hwecc_oob_first(struct nand_chip *chip, uint8_t *buf, chip->ecc.hwctl(chip, NAND_ECC_READ); - ret = nand_read_data_op(chip, p, eccsize, false); + ret = nand_read_data_op(chip, p, eccsize, false, false); if (ret) return ret; @@ -3071,13 +3081,13 @@ static int nand_read_page_syndrome(struct nand_chip *chip, uint8_t *buf, chip->ecc.hwctl(chip, NAND_ECC_READ); - ret = nand_read_data_op(chip, p, eccsize, false); + ret = nand_read_data_op(chip, p, eccsize, false, false); if (ret) return ret; if (chip->ecc.prepad) { ret = nand_read_data_op(chip, oob, chip->ecc.prepad, - false); + false, false); if (ret) return ret; @@ -3086,7 +3096,7 @@ static int nand_read_page_syndrome(struct nand_chip *chip, uint8_t *buf, chip->ecc.hwctl(chip, NAND_ECC_READSYN); - ret = nand_read_data_op(chip, oob, eccbytes, false); + ret = nand_read_data_op(chip, oob, eccbytes, false, false); if (ret) return ret; @@ -3096,7 +3106,7 @@ static int nand_read_page_syndrome(struct nand_chip *chip, uint8_t *buf, if (chip->ecc.postpad) { ret = nand_read_data_op(chip, oob, 
chip->ecc.postpad, - false); + false, false); if (ret) return ret; @@ -3124,7 +3134,7 @@ static int nand_read_page_syndrome(struct nand_chip *chip, uint8_t *buf, /* Calculate remaining oob bytes */ i = mtd->oobsize - (oob - chip->oob_poi); if (i) { - ret = nand_read_data_op(chip, oob, i, false); + ret = nand_read_data_op(chip, oob, i, false, false); if (ret) return ret; } @@ -3426,7 +3436,7 @@ static int nand_read_oob_syndrome(struct nand_chip *chip, int page) sndrnd = 1; toread = min_t(int, length, chunk); - ret = nand_read_data_op(chip, bufpoi, toread, false); + ret = nand_read_data_op(chip, bufpoi, toread, false, false); if (ret) return ret; @@ -3434,7 +3444,7 @@ static int nand_read_oob_syndrome(struct nand_chip *chip, int page) length -= toread; } if (length > 0) { - ret = nand_read_data_op(chip, bufpoi, length, false); + ret = nand_read_data_op(chip, bufpoi, length, false, false); if (ret) return ret; } diff --git a/drivers/mtd/nand/raw/nand_jedec.c b/drivers/mtd/nand/raw/nand_jedec.c index 15937e02c64f..63069f1948a8 100644 --- a/drivers/mtd/nand/raw/nand_jedec.c +++ b/drivers/mtd/nand/raw/nand_jedec.c @@ -51,7 +51,7 @@ int nand_jedec_detect(struct nand_chip *chip) } for (i = 0; i < JEDEC_PARAM_PAGES; i++) { - ret = nand_read_data_op(chip, p, sizeof(*p), true); + ret = nand_read_data_op(chip, p, sizeof(*p), true, false); if (ret) { ret = 0; goto free_jedec_param_page; diff --git a/drivers/mtd/nand/raw/nand_legacy.c b/drivers/mtd/nand/raw/nand_legacy.c index f91e92e1b972..d64791c06a97 100644 --- a/drivers/mtd/nand/raw/nand_legacy.c +++ b/drivers/mtd/nand/raw/nand_legacy.c @@ -225,7 +225,8 @@ static void nand_wait_status_ready(struct nand_chip *chip, unsigned long timeo) do { u8 status; - ret = nand_read_data_op(chip, &status, sizeof(status), true); + ret = nand_read_data_op(chip, &status, sizeof(status), true, + false); if (ret) return; @@ -552,7 +553,8 @@ static int nand_wait(struct nand_chip *chip) break; } else { ret = nand_read_data_op(chip, &status, - sizeof(status), true); + sizeof(status), true, + false); if (ret) return ret; @@ -563,7 +565,7 @@ static int nand_wait(struct nand_chip *chip) } while (time_before(jiffies, timeo)); } - ret = nand_read_data_op(chip, &status, sizeof(status), true); + ret = nand_read_data_op(chip, &status, sizeof(status), true, false); if (ret) return ret; diff --git a/drivers/mtd/nand/raw/nand_micron.c b/drivers/mtd/nand/raw/nand_micron.c index 56654030ec7f..3a37d48c9472 100644 --- a/drivers/mtd/nand/raw/nand_micron.c +++ b/drivers/mtd/nand/raw/nand_micron.c @@ -212,7 +212,7 @@ static int micron_nand_on_die_ecc_status_4(struct nand_chip *chip, u8 status, */ if (!oob_required) { ret = nand_read_data_op(chip, chip->oob_poi, mtd->oobsize, - false); + false, false); if (ret) return ret; } @@ -304,10 +304,10 @@ micron_nand_read_page_on_die_ecc(struct nand_chip *chip, uint8_t *buf, if (ret) goto out; - ret = nand_read_data_op(chip, buf, mtd->writesize, false); + ret = nand_read_data_op(chip, buf, mtd->writesize, false, false); if (!ret && oob_required) ret = nand_read_data_op(chip, chip->oob_poi, mtd->oobsize, - false); + false, false); if (chip->ecc.strength == 4) max_bitflips = micron_nand_on_die_ecc_status_4(chip, status, diff --git a/drivers/mtd/nand/raw/nand_onfi.c b/drivers/mtd/nand/raw/nand_onfi.c index ee0f2c2549c1..e6ffbe8c9a0c 100644 --- a/drivers/mtd/nand/raw/nand_onfi.c +++ b/drivers/mtd/nand/raw/nand_onfi.c @@ -167,7 +167,8 @@ int nand_onfi_detect(struct nand_chip *chip) } for (i = 0; i < ONFI_PARAM_PAGES; i++) { - ret = 
nand_read_data_op(chip, &pbuf[i], sizeof(*pbuf), true); + ret = nand_read_data_op(chip, &pbuf[i], sizeof(*pbuf), true, + false); if (ret) { ret = 0; goto free_onfi_param_page; diff --git a/include/linux/mtd/rawnand.h b/include/linux/mtd/rawnand.h index d1f5c5258e35..70380c91731c 100644 --- a/include/linux/mtd/rawnand.h +++ b/include/linux/mtd/rawnand.h @@ -1363,7 +1363,7 @@ int nand_change_write_column_op(struct nand_chip *chip, unsigned int offset_in_page, const void *buf, unsigned int len, bool force_8bit); int nand_read_data_op(struct nand_chip *chip, void *buf, unsigned int len, - bool force_8bit); + bool force_8bit, bool check_only); int nand_write_data_op(struct nand_chip *chip, const void *buf, unsigned int len, bool force_8bit); -- cgit v1.2.3 From 658beb6639606268bd48eb7de7cf308d6b10600f Mon Sep 17 00:00:00 2001 From: Miquel Raynal Date: Thu, 7 May 2020 12:52:39 +0200 Subject: mtd: rawnand: Expose monolithic read/write_page_raw() helpers The current nand_read/write_page_raw() helpers are already widely used but do not fit the purpose of "constrained" controllers which cannot, for instance, separate command/address cycles with data cycles. Workaround this issue by proposing alternative helpers that can be used by these controller drivers instead. Signed-off-by: Miquel Raynal Reviewed-by: Boris Brezillon Link: https://lore.kernel.org/linux-mtd/20200507105241.14299-12-miquel.raynal@bootlin.com --- drivers/mtd/nand/raw/nand_base.c | 77 ++++++++++++++++++++++++++++++++++++++++ include/linux/mtd/rawnand.h | 8 +++-- 2 files changed, 83 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/drivers/mtd/nand/raw/nand_base.c b/drivers/mtd/nand/raw/nand_base.c index 8df864881fff..43c7ebc7e859 100644 --- a/drivers/mtd/nand/raw/nand_base.c +++ b/drivers/mtd/nand/raw/nand_base.c @@ -2688,6 +2688,47 @@ int nand_read_page_raw(struct nand_chip *chip, uint8_t *buf, int oob_required, } EXPORT_SYMBOL(nand_read_page_raw); +/** + * nand_monolithic_read_page_raw - Monolithic page read in raw mode + * @chip: NAND chip info structure + * @buf: buffer to store read data + * @oob_required: caller requires OOB data read to chip->oob_poi + * @page: page number to read + * + * This is a raw page read, ie. without any error detection/correction. + * Monolithic means we are requesting all the relevant data (main plus + * eventually OOB) to be loaded in the NAND cache and sent over the + * bus (from the NAND chip to the NAND controller) in a single + * operation. This is an alternative to nand_read_page_raw(), which + * first reads the main data, and if the OOB data is requested too, + * then reads more data on the bus. 
+ */ +int nand_monolithic_read_page_raw(struct nand_chip *chip, u8 *buf, + int oob_required, int page) +{ + struct mtd_info *mtd = nand_to_mtd(chip); + unsigned int size = mtd->writesize; + u8 *read_buf = buf; + int ret; + + if (oob_required) { + size += mtd->oobsize; + + if (buf != chip->data_buf) + read_buf = nand_get_data_buf(chip); + } + + ret = nand_read_page_op(chip, page, 0, read_buf, size); + if (ret) + return ret; + + if (buf != chip->data_buf) + memcpy(buf, read_buf, mtd->writesize); + + return 0; +} +EXPORT_SYMBOL(nand_monolithic_read_page_raw); + /** * nand_read_page_raw_syndrome - [INTERN] read raw page data without ecc * @chip: nand chip info structure @@ -3696,6 +3737,42 @@ int nand_write_page_raw(struct nand_chip *chip, const uint8_t *buf, } EXPORT_SYMBOL(nand_write_page_raw); +/** + * nand_monolithic_write_page_raw - Monolithic page write in raw mode + * @chip: NAND chip info structure + * @buf: data buffer to write + * @oob_required: must write chip->oob_poi to OOB + * @page: page number to write + * + * This is a raw page write, ie. without any error detection/correction. + * Monolithic means we are requesting all the relevant data (main plus + * eventually OOB) to be sent over the bus and effectively programmed + * into the NAND chip arrays in a single operation. This is an + * alternative to nand_write_page_raw(), which first sends the main + * data, then eventually send the OOB data by latching more data + * cycles on the NAND bus, and finally sends the program command to + * synchronyze the NAND chip cache. + */ +int nand_monolithic_write_page_raw(struct nand_chip *chip, const u8 *buf, + int oob_required, int page) +{ + struct mtd_info *mtd = nand_to_mtd(chip); + unsigned int size = mtd->writesize; + u8 *write_buf = (u8 *)buf; + + if (oob_required) { + size += mtd->oobsize; + + if (buf != chip->data_buf) { + write_buf = nand_get_data_buf(chip); + memcpy(write_buf, buf, mtd->writesize); + } + } + + return nand_prog_page_op(chip, page, 0, write_buf, size); +} +EXPORT_SYMBOL(nand_monolithic_write_page_raw); + /** * nand_write_page_raw_syndrome - [INTERN] raw page write function * @chip: nand chip info structure diff --git a/include/linux/mtd/rawnand.h b/include/linux/mtd/rawnand.h index 70380c91731c..406e9ff0f45c 100644 --- a/include/linux/mtd/rawnand.h +++ b/include/linux/mtd/rawnand.h @@ -1328,13 +1328,17 @@ int nand_read_oob_std(struct nand_chip *chip, int page); int nand_get_set_features_notsupp(struct nand_chip *chip, int addr, u8 *subfeature_param); -/* Default read_page_raw implementation */ +/* read_page_raw implementations */ int nand_read_page_raw(struct nand_chip *chip, uint8_t *buf, int oob_required, int page); +int nand_monolithic_read_page_raw(struct nand_chip *chip, uint8_t *buf, + int oob_required, int page); -/* Default write_page_raw implementation */ +/* write_page_raw implementations */ int nand_write_page_raw(struct nand_chip *chip, const uint8_t *buf, int oob_required, int page); +int nand_monolithic_write_page_raw(struct nand_chip *chip, const uint8_t *buf, + int oob_required, int page); /* Reset and initialize a NAND device */ int nand_reset(struct nand_chip *chip, int chipnr); -- cgit v1.2.3 From ec7cfc3d763c761ff1ad5a4e66c4f94098d7e161 Mon Sep 17 00:00:00 2001 From: Boris Brezillon Date: Mon, 11 May 2020 08:49:15 +0200 Subject: mtd: rawnand: Add a NAND_NO_BBM_QUIRK flag Some controllers with embedded ECC engines override the BBM marker with data or ECC bytes, thus making bad block detection through bad block marker impossible. 
Let's flag those chips so the core knows it shouldn't check the BBM and consider all blocks good. This should allow us to get rid of two implementers of the legacy.block_bad() hook. Signed-off-by: Boris Brezillon Signed-off-by: Miquel Raynal Link: https://lore.kernel.org/linux-mtd/20200511064917.6255-1-boris.brezillon@collabora.com --- drivers/mtd/nand/raw/nand_base.c | 3 +++ include/linux/mtd/rawnand.h | 8 ++++++++ 2 files changed, 11 insertions(+) (limited to 'include/linux') diff --git a/drivers/mtd/nand/raw/nand_base.c b/drivers/mtd/nand/raw/nand_base.c index ba37ca9c1260..2d2a216af120 100644 --- a/drivers/mtd/nand/raw/nand_base.c +++ b/drivers/mtd/nand/raw/nand_base.c @@ -395,6 +395,9 @@ static int nand_block_bad(struct nand_chip *chip, loff_t ofs) static int nand_isbad_bbm(struct nand_chip *chip, loff_t ofs) { + if (chip->options & NAND_NO_BBM_QUIRK) + return 0; + if (chip->legacy.block_bad) return chip->legacy.block_bad(chip, ofs); diff --git a/include/linux/mtd/rawnand.h b/include/linux/mtd/rawnand.h index 406e9ff0f45c..0f45b6984ad1 100644 --- a/include/linux/mtd/rawnand.h +++ b/include/linux/mtd/rawnand.h @@ -221,6 +221,14 @@ enum nand_ecc_algo { #define NAND_BBM_SECONDPAGE BIT(25) #define NAND_BBM_LASTPAGE BIT(26) +/* + * Some controllers with pipelined ECC engines override the BBM marker with + * data or ECC bytes, thus making bad block detection through bad block marker + * impossible. Let's flag those chips so the core knows it shouldn't check the + * BBM and consider all blocks good. + */ +#define NAND_NO_BBM_QUIRK BIT(27) + /* Cell info constants */ #define NAND_CI_CHIPNR_MSK 0x03 #define NAND_CI_CELLTYPE_MSK 0x0C -- cgit v1.2.3 From 913b99f70feb1cd9fb4bbcb302a029b4b9bee454 Mon Sep 17 00:00:00 2001 From: "Gustavo A. R. Silva" Date: Thu, 7 May 2020 14:25:24 -0500 Subject: thunderbolt: Replace zero-length array with flexible-array The current codebase makes use of the zero-length array language extension to the C90 standard, but the preferred mechanism to declare variable-length types such as these ones is a flexible array member[1][2], introduced in C99: struct foo { int stuff; struct boo array[]; }; By making use of the mechanism above, we will get a compiler warning in case the flexible array does not occur last in the structure, which will help us prevent some kind of undefined behavior bugs from being inadvertently introduced[3] to the codebase from now on. Also, notice that, dynamic memory allocations won't be affected by this change: "Flexible array members have incomplete type, and so the sizeof operator may not be applied. As a quirk of the original implementation of zero-length arrays, sizeof evaluates to zero."[1] sizeof(flexible-array-member) triggers a warning because flexible array members have incomplete type[1]. There are some instances of code in which the sizeof operator is being incorrectly/erroneously applied to zero-length arrays and the result is zero. Such instances may be hiding some bugs. So, this work (flexible-array member conversions) will also help to get completely rid of those sorts of issues. This issue was found with the help of Coccinelle. [1] https://gcc.gnu.org/onlinedocs/gcc/Zero-Length.html [2] https://github.com/KSPP/linux/issues/21 [3] commit 76497732932f ("cxgb3/l2t: Fix undefined behaviour") Signed-off-by: Gustavo A. R. 
Silva Signed-off-by: Mika Westerberg --- include/linux/thunderbolt.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/thunderbolt.h b/include/linux/thunderbolt.h index ece782ef5466..ff397c0d5c07 100644 --- a/include/linux/thunderbolt.h +++ b/include/linux/thunderbolt.h @@ -80,7 +80,7 @@ struct tb { int index; enum tb_security_level security_level; size_t nboot_acl; - unsigned long privdata[0]; + unsigned long privdata[]; }; extern struct bus_type tb_bus_type; -- cgit v1.2.3 From 975cf23e3aa89588cbfc9ad6f2b23bd32af4edc7 Mon Sep 17 00:00:00 2001 From: Lad Prabhakar Date: Thu, 7 May 2020 13:33:15 +0100 Subject: PCI: endpoint: Pass page size as argument to pci_epc_mem_init() pci_epc_mem_init() internally used a page size equal to *PAGE_SIZE* to manage the address space, so instead just pass the page size as an argument to pci_epc_mem_init(). Also make pci_epc_mem_init() a C function instead of a macro, in preparation for adding support for multiple windows to the pci-epc-mem core. Link: https://lore.kernel.org/r/1588854799-13710-5-git-send-email-prabhakar.mahadev-lad.rj@bp.renesas.com Signed-off-by: Lad Prabhakar Signed-off-by: Lorenzo Pieralisi Reviewed-by: Yoshihiro Shimoda Acked-by: Kishon Vijay Abraham I --- drivers/pci/controller/cadence/pcie-cadence-ep.c | 2 +- drivers/pci/controller/pcie-rockchip-ep.c | 2 +- drivers/pci/endpoint/pci-epc-mem.c | 7 +++++++ include/linux/pci-epc.h | 5 ++--- 4 files changed, 11 insertions(+), 5 deletions(-) (limited to 'include/linux') diff --git a/drivers/pci/controller/cadence/pcie-cadence-ep.c b/drivers/pci/controller/cadence/pcie-cadence-ep.c index 1c173dad67d1..1c15c8352125 100644 --- a/drivers/pci/controller/cadence/pcie-cadence-ep.c +++ b/drivers/pci/controller/cadence/pcie-cadence-ep.c @@ -450,7 +450,7 @@ int cdns_pcie_ep_setup(struct cdns_pcie_ep *ep) epc->max_functions = 1; ret = pci_epc_mem_init(epc, pcie->mem_res->start, - resource_size(pcie->mem_res)); + resource_size(pcie->mem_res), PAGE_SIZE); if (ret < 0) { dev_err(dev, "failed to initialize the memory space\n"); goto err_init; diff --git a/drivers/pci/controller/pcie-rockchip-ep.c b/drivers/pci/controller/pcie-rockchip-ep.c index d743b0a48988..5eaf36629a75 100644 --- a/drivers/pci/controller/pcie-rockchip-ep.c +++ b/drivers/pci/controller/pcie-rockchip-ep.c @@ -615,7 +615,7 @@ static int rockchip_pcie_ep_probe(struct platform_device *pdev) rockchip_pcie_write(rockchip, BIT(0), PCIE_CORE_PHY_FUNC_CFG); err = pci_epc_mem_init(epc, rockchip->mem_res->start, - resource_size(rockchip->mem_res)); + resource_size(rockchip->mem_res), PAGE_SIZE); if (err < 0) { dev_err(dev, "failed to initialize the memory space\n"); goto err_uninit_port; diff --git a/drivers/pci/endpoint/pci-epc-mem.c b/drivers/pci/endpoint/pci-epc-mem.c index abfac1109a13..cdd1d3821249 100644 --- a/drivers/pci/endpoint/pci-epc-mem.c +++ b/drivers/pci/endpoint/pci-epc-mem.c @@ -93,6 +93,13 @@ return ret; } EXPORT_SYMBOL_GPL(__pci_epc_mem_init); +int pci_epc_mem_init(struct pci_epc *epc, phys_addr_t base, + size_t size, size_t page_size) +{ + return __pci_epc_mem_init(epc, base, size, page_size); +} +EXPORT_SYMBOL_GPL(pci_epc_mem_init); + /** * pci_epc_mem_exit() - cleanup the pci_epc_mem structure * @epc: the EPC device that invoked pci_epc_mem_exit diff --git a/include/linux/pci-epc.h b/include/linux/pci-epc.h index e0ed9d01f6e5..5bc1de65849e 100644 --- a/include/linux/pci-epc.h +++ b/include/linux/pci-epc.h @@ -137,9 +137,6 @@ struct pci_epc_features { #define
devm_pci_epc_create(dev, ops) \ __devm_pci_epc_create((dev), (ops), THIS_MODULE) -#define pci_epc_mem_init(epc, phys_addr, size) \ - __pci_epc_mem_init((epc), (phys_addr), (size), PAGE_SIZE) - static inline void epc_set_drvdata(struct pci_epc *epc, void *data) { dev_set_drvdata(&epc->dev, data); @@ -195,6 +192,8 @@ unsigned int pci_epc_get_first_free_bar(const struct pci_epc_features struct pci_epc *pci_epc_get(const char *epc_name); void pci_epc_put(struct pci_epc *epc); +int pci_epc_mem_init(struct pci_epc *epc, phys_addr_t base, + size_t size, size_t page_size); int __pci_epc_mem_init(struct pci_epc *epc, phys_addr_t phys_addr, size_t size, size_t page_size); void pci_epc_mem_exit(struct pci_epc *epc); -- cgit v1.2.3 From e33bcbab16d1c0dd85d72bec275308369ad901f5 Mon Sep 17 00:00:00 2001 From: Vesa Jääskeläinen Date: Thu, 30 Apr 2020 15:37:09 +0300 Subject: tee: add support for session's client UUID generation MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The TEE Client API defines that the only information passed from user space for the specified login operations is the group identifier for group-based logins. The REE kernel is expected to formulate a trustworthy client UUID and pass it to the TEE environment. The REE kernel is required to verify that the group identifier provided for group-based logins matches the calling process's group memberships. The TEE specification only defines that the information passed from the REE environment to the TEE environment is encoded into one UUID. In order to guarantee the trustworthiness of the client UUID, user space is not allowed to pass a client UUID freely. The UUIDv5 form is used to encode the variable amount of information needed for the different login types. Signed-off-by: Vesa Jääskeläinen [jw: remove unused variable application_id] Signed-off-by: Jens Wiklander --- drivers/tee/Kconfig | 1 + drivers/tee/tee_core.c | 152 ++++++++++++++++++++++++++++++++++++++++++++++++ include/linux/tee_drv.h | 16 +++++ 3 files changed, 169 insertions(+) (limited to 'include/linux') diff --git a/drivers/tee/Kconfig b/drivers/tee/Kconfig index 8da63f38e6bd..806eb87d4da0 100644 --- a/drivers/tee/Kconfig +++ b/drivers/tee/Kconfig @@ -3,6 +3,7 @@ config TEE tristate "Trusted Execution Environment support" depends on HAVE_ARM_SMCCC || COMPILE_TEST || CPU_SUP_AMD + select CRYPTO_SHA1 select DMA_SHARED_BUFFER select GENERIC_ALLOCATOR help diff --git a/drivers/tee/tee_core.c b/drivers/tee/tee_core.c index 6aec502c495c..9fa837224fed 100644 --- a/drivers/tee/tee_core.c +++ b/drivers/tee/tee_core.c @@ -6,18 +6,33 @@ #define pr_fmt(fmt) "%s: " fmt, __func__ #include +#include #include #include #include #include #include #include +#include +#include #include "tee_private.h" #define TEE_NUM_DEVICES 32 #define TEE_IOCTL_PARAM_SIZE(x) (sizeof(struct tee_param) * (x)) +#define TEE_UUID_NS_NAME_SIZE 128 + +/* + * TEE Client UUID name space identifier (UUIDv4) + * + * Value here is random UUID that is allocated as name space identifier for + * forming Client UUID's for TEE environment using UUIDv5 scheme. + */ +static const uuid_t tee_client_uuid_ns = UUID_INIT(0x58ac9ca0, 0x2086, 0x4683, + 0xa1, 0xb8, 0xec, 0x4b, + 0xc0, 0x8e, 0x01, 0xb6); + /* * Unprivileged devices in the lower half range and privileged devices in * the upper half range.
@@ -110,6 +125,143 @@ static int tee_release(struct inode *inode, struct file *filp) return 0; } +/** + * uuid_v5() - Calculate UUIDv5 + * @uuid: Resulting UUID + * @ns: Name space ID for UUIDv5 function + * @name: Name for UUIDv5 function + * @size: Size of name + * + * UUIDv5 is specified in RFC 4122. + * + * This implements section (for SHA-1): + * 4.3. Algorithm for Creating a Name-Based UUID + */ +static int uuid_v5(uuid_t *uuid, const uuid_t *ns, const void *name, + size_t size) +{ + unsigned char hash[SHA1_DIGEST_SIZE]; + struct crypto_shash *shash = NULL; + struct shash_desc *desc = NULL; + int rc; + + shash = crypto_alloc_shash("sha1", 0, 0); + if (IS_ERR(shash)) { + rc = PTR_ERR(shash); + pr_err("shash(sha1) allocation failed\n"); + return rc; + } + + desc = kzalloc(sizeof(*desc) + crypto_shash_descsize(shash), + GFP_KERNEL); + if (!desc) { + rc = -ENOMEM; + goto out_free_shash; + } + + desc->tfm = shash; + + rc = crypto_shash_init(desc); + if (rc < 0) + goto out_free_desc; + + rc = crypto_shash_update(desc, (const u8 *)ns, sizeof(*ns)); + if (rc < 0) + goto out_free_desc; + + rc = crypto_shash_update(desc, (const u8 *)name, size); + if (rc < 0) + goto out_free_desc; + + rc = crypto_shash_final(desc, hash); + if (rc < 0) + goto out_free_desc; + + memcpy(uuid->b, hash, UUID_SIZE); + + /* Tag for version 5 */ + uuid->b[6] = (hash[6] & 0x0F) | 0x50; + uuid->b[8] = (hash[8] & 0x3F) | 0x80; + +out_free_desc: + kfree(desc); + +out_free_shash: + crypto_free_shash(shash); + return rc; +} + +int tee_session_calc_client_uuid(uuid_t *uuid, u32 connection_method, + const u8 connection_data[TEE_IOCTL_UUID_LEN]) +{ + gid_t ns_grp = (gid_t)-1; + kgid_t grp = INVALID_GID; + char *name = NULL; + int name_len; + int rc; + + if (connection_method == TEE_IOCTL_LOGIN_PUBLIC) { + /* Nil UUID to be passed to TEE environment */ + uuid_copy(uuid, &uuid_null); + return 0; + } + + /* + * In Linux environment client UUID is based on UUIDv5.
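+ * (That is, uuid_v5(tee_client_uuid_ns, name); as a purely illustrative
+ * example with hypothetical values, an effective uid of 1000 produces
+ * the name "uid=3e8", which is then hashed into the client UUID.)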
+ * + * Determine client UUID with following semantics for 'name': + * + * For TEEC_LOGIN_USER: + * uid= + * + * For TEEC_LOGIN_GROUP: + * gid= + * + */ + + name = kzalloc(TEE_UUID_NS_NAME_SIZE, GFP_KERNEL); + if (!name) + return -ENOMEM; + + switch (connection_method) { + case TEE_IOCTL_LOGIN_USER: + name_len = snprintf(name, TEE_UUID_NS_NAME_SIZE, "uid=%x", + current_euid().val); + if (name_len >= TEE_UUID_NS_NAME_SIZE) { + rc = -E2BIG; + goto out_free_name; + } + break; + + case TEE_IOCTL_LOGIN_GROUP: + memcpy(&ns_grp, connection_data, sizeof(gid_t)); + grp = make_kgid(current_user_ns(), ns_grp); + if (!gid_valid(grp) || !in_egroup_p(grp)) { + rc = -EPERM; + goto out_free_name; + } + + name_len = snprintf(name, TEE_UUID_NS_NAME_SIZE, "gid=%x", + grp.val); + if (name_len >= TEE_UUID_NS_NAME_SIZE) { + rc = -E2BIG; + goto out_free_name; + } + break; + + default: + rc = -EINVAL; + goto out_free_name; + } + + rc = uuid_v5(uuid, &tee_client_uuid_ns, name, name_len); +out_free_name: + kfree(name); + + return rc; +} +EXPORT_SYMBOL_GPL(tee_session_calc_client_uuid); + static int tee_ioctl_version(struct tee_context *ctx, struct tee_ioctl_version_data __user *uvers) { diff --git a/include/linux/tee_drv.h b/include/linux/tee_drv.h index 1412e9cc79ce..8471b790e858 100644 --- a/include/linux/tee_drv.h +++ b/include/linux/tee_drv.h @@ -165,6 +165,22 @@ int tee_device_register(struct tee_device *teedev); */ void tee_device_unregister(struct tee_device *teedev); +/** + * tee_session_calc_client_uuid() - Calculates client UUID for session + * @uuid: Resulting UUID + * @connection_method: Connection method for session (TEE_IOCTL_LOGIN_*) + * @connection_data: Connection data for opening session + * + * Based on the connection method, calculates a UUIDv5 based client UUID. + * + * For group based logins, verifies that the calling process has the + * specified credentials. + * + * @return < 0 on failure + */ +int tee_session_calc_client_uuid(uuid_t *uuid, u32 connection_method, + const u8 connection_data[TEE_IOCTL_UUID_LEN]); + /** * struct tee_shm - shared memory object * @ctx: context using the object -- cgit v1.2.3 From be957c886d92aa9caf0f63aee2c77d1497217d93 Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Fri, 1 May 2020 15:20:45 -0300 Subject: mm/hmm: make hmm_range_fault return 0 or -1 hmm_vma_walk->last is supposed to be updated after every write to the pfns, so that it can be returned by hmm_range_fault(). However, this is not done consistently. Fortunately nothing checks the return code of hmm_range_fault() for anything other than error. More importantly, last must be set before returning -EBUSY as it is used to prevent reading an output pfn as input flags when the loop restarts. For clarity and simplicity make hmm_range_fault() return 0 or -ERRNO. Only set last when returning -EBUSY. Link: https://lore.kernel.org/r/2-v2-b4e84f444c7d+24f57-hmm_no_flags_jgg@mellanox.com Acked-by: Felix Kuehling Tested-by: Ralph Campbell Reviewed-by: Christoph Hellwig Signed-off-by: Jason Gunthorpe --- Documentation/vm/hmm.rst | 2 +- drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c | 4 ++-- drivers/gpu/drm/nouveau/nouveau_svm.c | 6 +++--- include/linux/hmm.h | 2 +- mm/hmm.c | 25 +++++++++---------------- 5 files changed, 16 insertions(+), 23 deletions(-) (limited to 'include/linux') diff --git a/Documentation/vm/hmm.rst b/Documentation/vm/hmm.rst index 4e3e9362afeb..9924f2caa018 100644 --- a/Documentation/vm/hmm.rst +++ b/Documentation/vm/hmm.rst @@ -161,7 +161,7 @@ device must complete the update before the driver callback returns.
When the device driver wants to populate a range of virtual addresses, it can use:: - long hmm_range_fault(struct hmm_range *range); + int hmm_range_fault(struct hmm_range *range); It will trigger a page fault on missing or read-only entries if write access is requested (see below). Page faults use the generic mm page fault code path just diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c index 6309ff72bd78..7eb745b8acce 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c @@ -852,12 +852,12 @@ retry: down_read(&mm->mmap_sem); r = hmm_range_fault(range); up_read(&mm->mmap_sem); - if (unlikely(r <= 0)) { + if (unlikely(r)) { /* * FIXME: This timeout should encompass the retry from * mmu_interval_read_retry() as well. */ - if ((r == 0 || r == -EBUSY) && !time_after(jiffies, timeout)) + if (r == -EBUSY && !time_after(jiffies, timeout)) goto retry; goto out_free_pfns; } diff --git a/drivers/gpu/drm/nouveau/nouveau_svm.c b/drivers/gpu/drm/nouveau/nouveau_svm.c index 645fedd77e21..c68e9317cf07 100644 --- a/drivers/gpu/drm/nouveau/nouveau_svm.c +++ b/drivers/gpu/drm/nouveau/nouveau_svm.c @@ -536,7 +536,7 @@ static int nouveau_range_fault(struct nouveau_svmm *svmm, .pfn_shift = NVIF_VMM_PFNMAP_V0_ADDR_SHIFT, }; struct mm_struct *mm = notifier->notifier.mm; - long ret; + int ret; while (true) { if (time_after(jiffies, timeout)) @@ -548,8 +548,8 @@ static int nouveau_range_fault(struct nouveau_svmm *svmm, down_read(&mm->mmap_sem); ret = hmm_range_fault(&range); up_read(&mm->mmap_sem); - if (ret <= 0) { - if (ret == 0 || ret == -EBUSY) + if (ret) { + if (ret == -EBUSY) continue; return ret; } diff --git a/include/linux/hmm.h b/include/linux/hmm.h index 7475051100c7..0df27dd03d53 100644 --- a/include/linux/hmm.h +++ b/include/linux/hmm.h @@ -120,7 +120,7 @@ static inline struct page *hmm_device_entry_to_page(const struct hmm_range *rang /* * Please see Documentation/vm/hmm.rst for how to use the range API. 
*/ -long hmm_range_fault(struct hmm_range *range); +int hmm_range_fault(struct hmm_range *range); /* * HMM_RANGE_DEFAULT_TIMEOUT - default timeout (ms) when waiting for a range diff --git a/mm/hmm.c b/mm/hmm.c index 280585833adf..f06bcac948a7 100644 --- a/mm/hmm.c +++ b/mm/hmm.c @@ -174,7 +174,6 @@ static int hmm_vma_walk_hole(unsigned long addr, unsigned long end, } if (required_fault) return hmm_vma_fault(addr, end, required_fault, walk); - hmm_vma_walk->last = addr; return hmm_pfns_fill(addr, end, range, HMM_PFN_NONE); } @@ -207,7 +206,6 @@ static int hmm_vma_handle_pmd(struct mm_walk *walk, unsigned long addr, pfn = pmd_pfn(pmd) + ((addr & ~PMD_MASK) >> PAGE_SHIFT); for (i = 0; addr < end; addr += PAGE_SIZE, i++, pfn++) pfns[i] = hmm_device_entry_from_pfn(range, pfn) | cpu_flags; - hmm_vma_walk->last = end; return 0; } #else /* CONFIG_TRANSPARENT_HUGEPAGE */ @@ -386,13 +384,10 @@ again: r = hmm_vma_handle_pte(walk, addr, end, pmdp, ptep, pfns); if (r) { /* hmm_vma_handle_pte() did pte_unmap() */ - hmm_vma_walk->last = addr; return r; } } pte_unmap(ptep - 1); - - hmm_vma_walk->last = addr; return 0; } @@ -455,7 +450,6 @@ static int hmm_vma_walk_pud(pud_t *pudp, unsigned long start, unsigned long end, for (i = 0; i < npages; ++i, ++pfn) pfns[i] = hmm_device_entry_from_pfn(range, pfn) | cpu_flags; - hmm_vma_walk->last = end; goto out_unlock; } @@ -500,7 +494,6 @@ static int hmm_vma_walk_hugetlb_entry(pte_t *pte, unsigned long hmask, for (; addr < end; addr += PAGE_SIZE, i++, pfn++) range->pfns[i] = hmm_device_entry_from_pfn(range, pfn) | cpu_flags; - hmm_vma_walk->last = end; spin_unlock(ptl); return 0; } @@ -537,7 +530,6 @@ static int hmm_vma_walk_test(unsigned long start, unsigned long end, return -EFAULT; hmm_pfns_fill(start, end, range, HMM_PFN_ERROR); - hmm_vma_walk->last = end; /* Skip this vma and continue processing the next vma. */ return 1; @@ -555,9 +547,7 @@ static const struct mm_walk_ops hmm_walk_ops = { * hmm_range_fault - try to fault some address in a virtual address range * @range: argument structure * - * Return: the number of valid pages in range->pfns[] (from range start - * address), which may be zero. On error one of the following status codes - * can be returned: + * Returns 0 on success or one of the following error codes: * * -EINVAL: Invalid arguments or mm or virtual address is in an invalid vma * (e.g., device file vma). @@ -572,7 +562,7 @@ static const struct mm_walk_ops hmm_walk_ops = { * This is similar to get_user_pages(), except that it can read the page tables * without mutating them (ie causing faults). */ -long hmm_range_fault(struct hmm_range *range) +int hmm_range_fault(struct hmm_range *range) { struct hmm_vma_walk hmm_vma_walk = { .range = range, @@ -590,10 +580,13 @@ long hmm_range_fault(struct hmm_range *range) return -EBUSY; ret = walk_page_range(mm, hmm_vma_walk.last, range->end, &hmm_walk_ops, &hmm_vma_walk); + /* + * When -EBUSY is returned the loop restarts with + * hmm_vma_walk.last set to an address that has not been stored + * in pfns. All entries < last in the pfn array are set to their + * output, and all >= are still at their input values. 
+ */ } while (ret == -EBUSY); - - if (ret) - return ret; - return (hmm_vma_walk.last - range->start) >> PAGE_SHIFT; + return ret; } EXPORT_SYMBOL(hmm_range_fault); -- cgit v1.2.3 From 5c8f3c4cf18ad007242bc370da54d45d4d4293dc Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Fri, 1 May 2020 15:20:47 -0300 Subject: mm/hmm: remove HMM_PFN_SPECIAL This is just an alias for HMM_PFN_ERROR, nothing cares that the error was because of a special page vs any other error case. Link: https://lore.kernel.org/r/4-v2-b4e84f444c7d+24f57-hmm_no_flags_jgg@mellanox.com Acked-by: Felix Kuehling Reviewed-by: Christoph Hellwig Reviewed-by: John Hubbard Signed-off-by: Jason Gunthorpe --- drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c | 1 - drivers/gpu/drm/nouveau/nouveau_svm.c | 1 - include/linux/hmm.h | 8 -------- mm/hmm.c | 2 +- 4 files changed, 1 insertion(+), 11 deletions(-) (limited to 'include/linux') diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c index 41ae7f96f481..76b4a4fa39ed 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c @@ -775,7 +775,6 @@ static const uint64_t hmm_range_flags[HMM_PFN_FLAG_MAX] = { static const uint64_t hmm_range_values[HMM_PFN_VALUE_MAX] = { 0xfffffffffffffffeUL, /* HMM_PFN_ERROR */ 0, /* HMM_PFN_NONE */ - 0xfffffffffffffffcUL /* HMM_PFN_SPECIAL */ }; /** diff --git a/drivers/gpu/drm/nouveau/nouveau_svm.c b/drivers/gpu/drm/nouveau/nouveau_svm.c index c68e9317cf07..cf0d9bd61beb 100644 --- a/drivers/gpu/drm/nouveau/nouveau_svm.c +++ b/drivers/gpu/drm/nouveau/nouveau_svm.c @@ -379,7 +379,6 @@ static const u64 nouveau_svm_pfn_values[HMM_PFN_VALUE_MAX] = { [HMM_PFN_ERROR ] = ~NVIF_VMM_PFNMAP_V0_V, [HMM_PFN_NONE ] = NVIF_VMM_PFNMAP_V0_NONE, - [HMM_PFN_SPECIAL] = ~NVIF_VMM_PFNMAP_V0_V, }; /* Issue fault replay for GPU to retry accesses that faulted previously. */ diff --git a/include/linux/hmm.h b/include/linux/hmm.h index 0df27dd03d53..81c302c884c0 100644 --- a/include/linux/hmm.h +++ b/include/linux/hmm.h @@ -44,10 +44,6 @@ enum hmm_pfn_flag_e { * Flags: * HMM_PFN_ERROR: corresponding CPU page table entry points to poisoned memory * HMM_PFN_NONE: corresponding CPU page table entry is pte_none() - * HMM_PFN_SPECIAL: corresponding CPU page table entry is special; i.e., the - * result of vmf_insert_pfn() or vm_insert_page(). Therefore, it should not - * be mirrored by a device, because the entry will never have HMM_PFN_VALID - * set and the pfn value is undefined. * * Driver provides values for none entry, error entry, and special entry. 
* Driver can alias (i.e., use same value) error and special, but @@ -56,12 +52,10 @@ enum hmm_pfn_flag_e { * HMM pfn value returned by hmm_vma_get_pfns() or hmm_vma_fault() will be: * hmm_range.values[HMM_PFN_ERROR] if CPU page table entry is poisonous, * hmm_range.values[HMM_PFN_NONE] if there is no CPU page table entry, - * hmm_range.values[HMM_PFN_SPECIAL] if CPU page table entry is a special one */ enum hmm_pfn_value_e { HMM_PFN_ERROR, HMM_PFN_NONE, - HMM_PFN_SPECIAL, HMM_PFN_VALUE_MAX }; @@ -110,8 +104,6 @@ static inline struct page *hmm_device_entry_to_page(const struct hmm_range *rang return NULL; if (entry == range->values[HMM_PFN_ERROR]) return NULL; - if (entry == range->values[HMM_PFN_SPECIAL]) - return NULL; if (!(entry & range->flags[HMM_PFN_VALID])) return NULL; return pfn_to_page(entry >> range->pfn_shift); diff --git a/mm/hmm.c b/mm/hmm.c index f06bcac948a7..2e975eedb14f 100644 --- a/mm/hmm.c +++ b/mm/hmm.c @@ -301,7 +301,7 @@ static int hmm_vma_handle_pte(struct mm_walk *walk, unsigned long addr, pte_unmap(ptep); return -EFAULT; } - *pfn = range->values[HMM_PFN_SPECIAL]; + *pfn = range->values[HMM_PFN_ERROR]; return 0; } -- cgit v1.2.3 From 2733ea144dcce789de20988c1056e228a07b1bff Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Fri, 1 May 2020 15:20:48 -0300 Subject: mm/hmm: remove the customizable pfn format from hmm_range_fault Presumably the intent here was that hmm_range_fault() could put the data into some HW specific format and thus avoid some work. However, nothing actually does that, and it isn't clear how anything actually could do that as hmm_range_fault() provides CPU addresses which must be DMA mapped. Perhaps there is some special HW that does not need DMA mapping, but we don't have any examples of this, and the theoretical performance win of avoiding an extra scan over the pfns array doesn't seem worth the complexity. Plus pfns needs to be scanned anyhow to sort out any DEVICE_PRIVATE pages. This version replaces the uint64_t with an unsigned long containing a pfn and fixed flags. On input, flags is filled with the HMM_PFN_REQ_* values; on successful output it is filled with HMM_PFN_* values, describing the state of the pages. amdgpu is simple to convert, it doesn't use snapshot and doesn't use per-page flags. nouveau uses only 16 hmm_pte entries at most (ie fits in a few cache lines), and it sweeps over its pfns array a couple of times anyhow. It also has a nasty call chain before it reaches the dma map and hardware, suggesting performance isn't important: nouveau_svm_fault(): args.i.m.method = NVIF_VMM_V0_PFNMAP nouveau_range_fault() nvif_object_ioctl() client->driver->ioctl() struct nvif_driver nvif_driver_nvkm: .ioctl = nvkm_client_ioctl nvkm_ioctl() nvkm_ioctl_path() nvkm_ioctl_v0[type].func(..)
nvkm_ioctl_mthd() nvkm_object_mthd() struct nvkm_object_func nvkm_uvmm: .mthd = nvkm_uvmm_mthd nvkm_uvmm_mthd() nvkm_uvmm_mthd_pfnmap() nvkm_vmm_pfn_map() nvkm_vmm_ptes_get_map() func == gp100_vmm_pgt_pfn struct nvkm_vmm_desc_func gp100_vmm_desc_spt: .pfn = gp100_vmm_pgt_pfn nvkm_vmm_iter() REF_PTES == func == gp100_vmm_pgt_pfn() dma_map_page() Link: https://lore.kernel.org/r/5-v2-b4e84f444c7d+24f57-hmm_no_flags_jgg@mellanox.com Acked-by: Felix Kuehling Tested-by: Ralph Campbell Signed-off-by: Christoph Hellwig Reviewed-by: Christoph Hellwig Signed-off-by: Jason Gunthorpe --- Documentation/vm/hmm.rst | 28 ++---- drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c | 35 ++----- drivers/gpu/drm/nouveau/nouveau_dmem.c | 27 +----- drivers/gpu/drm/nouveau/nouveau_dmem.h | 3 +- drivers/gpu/drm/nouveau/nouveau_svm.c | 87 +++++++++++------ include/linux/hmm.h | 101 ++++++++------------ mm/hmm.c | 160 +++++++++++++++----------------- 7 files changed, 195 insertions(+), 246 deletions(-) (limited to 'include/linux') diff --git a/Documentation/vm/hmm.rst b/Documentation/vm/hmm.rst index 9924f2caa018..561969754bc0 100644 --- a/Documentation/vm/hmm.rst +++ b/Documentation/vm/hmm.rst @@ -184,10 +184,7 @@ The usage pattern is:: range.notifier = &interval_sub; range.start = ...; range.end = ...; - range.pfns = ...; - range.flags = ...; - range.values = ...; - range.pfn_shift = ...; + range.hmm_pfns = ...; if (!mmget_not_zero(interval_sub->notifier.mm)) return -EFAULT; @@ -229,15 +226,10 @@ The hmm_range struct has 2 fields, default_flags and pfn_flags_mask, that specif fault or snapshot policy for the whole range instead of having to set them for each entry in the pfns array. -For instance, if the device flags for range.flags are:: +For instance if the device driver wants pages for a range with at least read +permission, it sets:: - range.flags[HMM_PFN_VALID] = (1 << 63); - range.flags[HMM_PFN_WRITE] = (1 << 62); - -and the device driver wants pages for a range with at least read permission, -it sets:: - - range->default_flags = (1 << 63); + range->default_flags = HMM_PFN_REQ_FAULT; range->pfn_flags_mask = 0; and calls hmm_range_fault() as described above. This will fill fault all pages @@ -246,18 +238,18 @@ in the range with at least read permission. Now let's say the driver wants to do the same except for one page in the range for which it wants to have write permission. Now driver set:: - range->default_flags = (1 << 63); - range->pfn_flags_mask = (1 << 62); - range->pfns[index_of_write] = (1 << 62); + range->default_flags = HMM_PFN_REQ_FAULT; + range->pfn_flags_mask = HMM_PFN_REQ_WRITE; + range->pfns[index_of_write] = HMM_PFN_REQ_WRITE; With this, HMM will fault in all pages with at least read (i.e., valid) and for the address == range->start + (index_of_write << PAGE_SHIFT) it will fault with write permission i.e., if the CPU pte does not have write permission set then HMM will call handle_mm_fault(). -Note that HMM will populate the pfns array with write permission for any page -that is mapped with CPU write permission no matter what values are set -in default_flags or pfn_flags_mask. +After hmm_range_fault completes the flag bits are set to the current state of +the page tables, ie HMM_PFN_VALID | HMM_PFN_WRITE will be set if the page is +writable. 
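+
+A minimal sketch of consuming the output (illustrative only, assuming a
+successful call whose notifier sequence is still valid)::
+
+    for (i = 0; i < npages; i++) {
+        if (!(range.hmm_pfns[i] & HMM_PFN_VALID))
+            continue;
+        page = hmm_pfn_to_page(range.hmm_pfns[i]);
+        /* HMM_PFN_WRITE set here means the CPU PTE is writable */
+        writable = !!(range.hmm_pfns[i] & HMM_PFN_WRITE);
+    }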
Represent and manage device memory from core kernel point of view diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c index 76b4a4fa39ed..b1c62da527c5 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c @@ -766,17 +766,6 @@ struct amdgpu_ttm_tt { }; #ifdef CONFIG_DRM_AMDGPU_USERPTR -/* flags used by HMM internal, not related to CPU/GPU PTE flags */ -static const uint64_t hmm_range_flags[HMM_PFN_FLAG_MAX] = { - (1 << 0), /* HMM_PFN_VALID */ - (1 << 1), /* HMM_PFN_WRITE */ -}; - -static const uint64_t hmm_range_values[HMM_PFN_VALUE_MAX] = { - 0xfffffffffffffffeUL, /* HMM_PFN_ERROR */ - 0, /* HMM_PFN_NONE */ -}; - /** * amdgpu_ttm_tt_get_user_pages - get device accessible pages that back user * memory and start HMM tracking CPU page table update @@ -815,18 +804,15 @@ int amdgpu_ttm_tt_get_user_pages(struct amdgpu_bo *bo, struct page **pages) goto out; } range->notifier = &bo->notifier; - range->flags = hmm_range_flags; - range->values = hmm_range_values; - range->pfn_shift = PAGE_SHIFT; range->start = bo->notifier.interval_tree.start; range->end = bo->notifier.interval_tree.last + 1; - range->default_flags = hmm_range_flags[HMM_PFN_VALID]; + range->default_flags = HMM_PFN_REQ_FAULT; if (!amdgpu_ttm_tt_is_readonly(ttm)) - range->default_flags |= range->flags[HMM_PFN_WRITE]; + range->default_flags |= HMM_PFN_REQ_WRITE; - range->pfns = kvmalloc_array(ttm->num_pages, sizeof(*range->pfns), - GFP_KERNEL); - if (unlikely(!range->pfns)) { + range->hmm_pfns = kvmalloc_array(ttm->num_pages, + sizeof(*range->hmm_pfns), GFP_KERNEL); + if (unlikely(!range->hmm_pfns)) { r = -ENOMEM; goto out_free_ranges; } @@ -867,7 +853,7 @@ retry: * the notifier_lock, and mmu_interval_read_retry() must be done first. 
*/ for (i = 0; i < ttm->num_pages; i++) - pages[i] = hmm_device_entry_to_page(range, range->pfns[i]); + pages[i] = hmm_pfn_to_page(range->hmm_pfns[i]); gtt->range = range; mmput(mm); @@ -877,7 +863,7 @@ retry: out_unlock: up_read(&mm->mmap_sem); out_free_pfns: - kvfree(range->pfns); + kvfree(range->hmm_pfns); out_free_ranges: kfree(range); out: @@ -902,7 +888,7 @@ bool amdgpu_ttm_tt_get_user_pages_done(struct ttm_tt *ttm) DRM_DEBUG_DRIVER("user_pages_done 0x%llx pages 0x%lx\n", gtt->userptr, ttm->num_pages); - WARN_ONCE(!gtt->range || !gtt->range->pfns, + WARN_ONCE(!gtt->range || !gtt->range->hmm_pfns, "No user pages to check\n"); if (gtt->range) { @@ -912,7 +898,7 @@ bool amdgpu_ttm_tt_get_user_pages_done(struct ttm_tt *ttm) */ r = mmu_interval_read_retry(gtt->range->notifier, gtt->range->notifier_seq); - kvfree(gtt->range->pfns); + kvfree(gtt->range->hmm_pfns); kfree(gtt->range); gtt->range = NULL; } @@ -1003,8 +989,7 @@ static void amdgpu_ttm_tt_unpin_userptr(struct ttm_tt *ttm) for (i = 0; i < ttm->num_pages; i++) { if (ttm->pages[i] != - hmm_device_entry_to_page(gtt->range, - gtt->range->pfns[i])) + hmm_pfn_to_page(gtt->range->hmm_pfns[i])) break; } diff --git a/drivers/gpu/drm/nouveau/nouveau_dmem.c b/drivers/gpu/drm/nouveau/nouveau_dmem.c index ad89e09a0be3..3364904eccff 100644 --- a/drivers/gpu/drm/nouveau/nouveau_dmem.c +++ b/drivers/gpu/drm/nouveau/nouveau_dmem.c @@ -85,7 +85,7 @@ static inline struct nouveau_dmem *page_to_dmem(struct page *page) return container_of(page->pgmap, struct nouveau_dmem, pagemap); } -static unsigned long nouveau_dmem_page_addr(struct page *page) +unsigned long nouveau_dmem_page_addr(struct page *page) { struct nouveau_dmem_chunk *chunk = page->zone_device_data; unsigned long idx = page_to_pfn(page) - chunk->pfn_first; @@ -671,28 +671,3 @@ out_free_src: out: return ret; } - -void -nouveau_dmem_convert_pfn(struct nouveau_drm *drm, - struct hmm_range *range) -{ - unsigned long i, npages; - - npages = (range->end - range->start) >> PAGE_SHIFT; - for (i = 0; i < npages; ++i) { - struct page *page; - uint64_t addr; - - page = hmm_device_entry_to_page(range, range->pfns[i]); - if (page == NULL) - continue; - - if (!is_device_private_page(page)) - continue; - - addr = nouveau_dmem_page_addr(page); - range->pfns[i] &= ((1UL << range->pfn_shift) - 1); - range->pfns[i] |= (addr >> PAGE_SHIFT) << range->pfn_shift; - range->pfns[i] |= NVIF_VMM_PFNMAP_V0_VRAM; - } -} diff --git a/drivers/gpu/drm/nouveau/nouveau_dmem.h b/drivers/gpu/drm/nouveau/nouveau_dmem.h index 92394be5d649..db3b59b210af 100644 --- a/drivers/gpu/drm/nouveau/nouveau_dmem.h +++ b/drivers/gpu/drm/nouveau/nouveau_dmem.h @@ -37,9 +37,8 @@ int nouveau_dmem_migrate_vma(struct nouveau_drm *drm, struct vm_area_struct *vma, unsigned long start, unsigned long end); +unsigned long nouveau_dmem_page_addr(struct page *page); -void nouveau_dmem_convert_pfn(struct nouveau_drm *drm, - struct hmm_range *range); #else /* IS_ENABLED(CONFIG_DRM_NOUVEAU_SVM) */ static inline void nouveau_dmem_init(struct nouveau_drm *drm) {} static inline void nouveau_dmem_fini(struct nouveau_drm *drm) {} diff --git a/drivers/gpu/drm/nouveau/nouveau_svm.c b/drivers/gpu/drm/nouveau/nouveau_svm.c index cf0d9bd61beb..407e34a5c0ab 100644 --- a/drivers/gpu/drm/nouveau/nouveau_svm.c +++ b/drivers/gpu/drm/nouveau/nouveau_svm.c @@ -369,18 +369,6 @@ out_free: return ret; } -static const u64 -nouveau_svm_pfn_flags[HMM_PFN_FLAG_MAX] = { - [HMM_PFN_VALID ] = NVIF_VMM_PFNMAP_V0_V, - [HMM_PFN_WRITE ] = NVIF_VMM_PFNMAP_V0_W, -}; - -static const u64 
-nouveau_svm_pfn_values[HMM_PFN_VALUE_MAX] = { - [HMM_PFN_ERROR ] = ~NVIF_VMM_PFNMAP_V0_V, - [HMM_PFN_NONE ] = NVIF_VMM_PFNMAP_V0_NONE, -}; - /* Issue fault replay for GPU to retry accesses that faulted previously. */ static void nouveau_svm_fault_replay(struct nouveau_svm *svm) @@ -518,9 +506,45 @@ static const struct mmu_interval_notifier_ops nouveau_svm_mni_ops = { .invalidate = nouveau_svm_range_invalidate, }; +static void nouveau_hmm_convert_pfn(struct nouveau_drm *drm, + struct hmm_range *range, u64 *ioctl_addr) +{ + unsigned long i, npages; + + /* + * The ioctl_addr prepared here is passed through nvif_object_ioctl() + * to an eventual DMA map in something like gp100_vmm_pgt_pfn() + * + * This is all just encoding the internal hmm representation into a + * different nouveau internal representation. + */ + npages = (range->end - range->start) >> PAGE_SHIFT; + for (i = 0; i < npages; ++i) { + struct page *page; + + if (!(range->hmm_pfns[i] & HMM_PFN_VALID)) { + ioctl_addr[i] = 0; + continue; + } + + page = hmm_pfn_to_page(range->hmm_pfns[i]); + if (is_device_private_page(page)) + ioctl_addr[i] = nouveau_dmem_page_addr(page) | + NVIF_VMM_PFNMAP_V0_V | + NVIF_VMM_PFNMAP_V0_VRAM; + else + ioctl_addr[i] = page_to_phys(page) | + NVIF_VMM_PFNMAP_V0_V | + NVIF_VMM_PFNMAP_V0_HOST; + if (range->hmm_pfns[i] & HMM_PFN_WRITE) + ioctl_addr[i] |= NVIF_VMM_PFNMAP_V0_W; + } +} + static int nouveau_range_fault(struct nouveau_svmm *svmm, struct nouveau_drm *drm, void *data, u32 size, - u64 *pfns, struct svm_notifier *notifier) + unsigned long hmm_pfns[], u64 *ioctl_addr, + struct svm_notifier *notifier) { unsigned long timeout = jiffies + msecs_to_jiffies(HMM_RANGE_DEFAULT_TIMEOUT); @@ -529,10 +553,8 @@ static int nouveau_range_fault(struct nouveau_svmm *svmm, .notifier = ¬ifier->notifier, .start = notifier->notifier.interval_tree.start, .end = notifier->notifier.interval_tree.last + 1, - .pfns = pfns, - .flags = nouveau_svm_pfn_flags, - .values = nouveau_svm_pfn_values, - .pfn_shift = NVIF_VMM_PFNMAP_V0_ADDR_SHIFT, + .pfn_flags_mask = HMM_PFN_REQ_FAULT | HMM_PFN_REQ_WRITE, + .hmm_pfns = hmm_pfns, }; struct mm_struct *mm = notifier->notifier.mm; int ret; @@ -542,12 +564,15 @@ static int nouveau_range_fault(struct nouveau_svmm *svmm, return -EBUSY; range.notifier_seq = mmu_interval_read_begin(range.notifier); - range.default_flags = 0; - range.pfn_flags_mask = -1UL; down_read(&mm->mmap_sem); ret = hmm_range_fault(&range); up_read(&mm->mmap_sem); if (ret) { + /* + * FIXME: the input PFN_REQ flags are destroyed on + * -EBUSY, we need to regenerate them, also for the + * other continue below + */ if (ret == -EBUSY) continue; return ret; @@ -562,7 +587,7 @@ static int nouveau_range_fault(struct nouveau_svmm *svmm, break; } - nouveau_dmem_convert_pfn(drm, &range); + nouveau_hmm_convert_pfn(drm, &range, ioctl_addr); svmm->vmm->vmm.object.client->super = true; ret = nvif_object_ioctl(&svmm->vmm->vmm.object, data, size, NULL); @@ -589,6 +614,7 @@ nouveau_svm_fault(struct nvif_notify *notify) } i; u64 phys[16]; } args; + unsigned long hmm_pfns[ARRAY_SIZE(args.phys)]; struct vm_area_struct *vma; u64 inst, start, limit; int fi, fn, pi, fill; @@ -704,12 +730,17 @@ nouveau_svm_fault(struct nvif_notify *notify) * access flags. *XXX: atomic? */ - if (buffer->fault[fn]->access != 0 /* READ. */ && - buffer->fault[fn]->access != 3 /* PREFETCH. 
*/) { - args.phys[pi++] = NVIF_VMM_PFNMAP_V0_V | - NVIF_VMM_PFNMAP_V0_W; - } else { - args.phys[pi++] = NVIF_VMM_PFNMAP_V0_V; + switch (buffer->fault[fn]->access) { + case 0: /* READ. */ + hmm_pfns[pi++] = HMM_PFN_REQ_FAULT; + break; + case 3: /* PREFETCH. */ + hmm_pfns[pi++] = 0; + break; + default: + hmm_pfns[pi++] = HMM_PFN_REQ_FAULT | + HMM_PFN_REQ_WRITE; + break; } args.i.p.size = pi << PAGE_SHIFT; @@ -737,7 +768,7 @@ nouveau_svm_fault(struct nvif_notify *notify) fill = (buffer->fault[fn ]->addr - buffer->fault[fn - 1]->addr) >> PAGE_SHIFT; while (--fill) - args.phys[pi++] = NVIF_VMM_PFNMAP_V0_NONE; + hmm_pfns[pi++] = 0; } SVMM_DBG(svmm, "wndw %016llx-%016llx covering %d fault(s)", @@ -753,7 +784,7 @@ nouveau_svm_fault(struct nvif_notify *notify) ret = nouveau_range_fault( svmm, svm->drm, &args, sizeof(args.i) + pi * sizeof(args.phys[0]), - args.phys, ¬ifier); + hmm_pfns, args.phys, ¬ifier); mmu_interval_notifier_remove(¬ifier.notifier); } mmput(mm); diff --git a/include/linux/hmm.h b/include/linux/hmm.h index 81c302c884c0..e912b9dc4633 100644 --- a/include/linux/hmm.h +++ b/include/linux/hmm.h @@ -19,45 +19,47 @@ #include /* - * hmm_pfn_flag_e - HMM flag enums + * On output: + * 0 - The page is faultable and a future call with + * HMM_PFN_REQ_FAULT could succeed. + * HMM_PFN_VALID - the pfn field points to a valid PFN. This PFN is at + * least readable. If dev_private_owner is !NULL then this could + * point at a DEVICE_PRIVATE page. + * HMM_PFN_WRITE - if the page memory can be written to (requires HMM_PFN_VALID) + * HMM_PFN_ERROR - accessing the pfn is impossible and the device should + * fail. ie poisoned memory, special pages, no vma, etc * - * Flags: - * HMM_PFN_VALID: pfn is valid. It has, at least, read permission. - * HMM_PFN_WRITE: CPU page table has write permission set - * - * The driver provides a flags array for mapping page protections to device - * PTE bits. If the driver valid bit for an entry is bit 3, - * i.e., (entry & (1 << 3)), then the driver must provide - * an array in hmm_range.flags with hmm_range.flags[HMM_PFN_VALID] == 1 << 3. - * Same logic apply to all flags. This is the same idea as vm_page_prot in vma - * except that this is per device driver rather than per architecture. + * On input: + * 0 - Return the current state of the page, do not fault it. + * HMM_PFN_REQ_FAULT - The output must have HMM_PFN_VALID or hmm_range_fault() + * will fail + * HMM_PFN_REQ_WRITE - The output must have HMM_PFN_WRITE or hmm_range_fault() + * will fail. Must be combined with HMM_PFN_REQ_FAULT. */ -enum hmm_pfn_flag_e { - HMM_PFN_VALID = 0, - HMM_PFN_WRITE, - HMM_PFN_FLAG_MAX +enum hmm_pfn_flags { + /* Output flags */ + HMM_PFN_VALID = 1UL << (BITS_PER_LONG - 1), + HMM_PFN_WRITE = 1UL << (BITS_PER_LONG - 2), + HMM_PFN_ERROR = 1UL << (BITS_PER_LONG - 3), + + /* Input flags */ + HMM_PFN_REQ_FAULT = HMM_PFN_VALID, + HMM_PFN_REQ_WRITE = HMM_PFN_WRITE, + + HMM_PFN_FLAGS = HMM_PFN_VALID | HMM_PFN_WRITE | HMM_PFN_ERROR, }; /* - * hmm_pfn_value_e - HMM pfn special value - * - * Flags: - * HMM_PFN_ERROR: corresponding CPU page table entry points to poisoned memory - * HMM_PFN_NONE: corresponding CPU page table entry is pte_none() + * hmm_pfn_to_page() - return struct page pointed to by a device entry * - * Driver provides values for none entry, error entry, and special entry. - * Driver can alias (i.e., use same value) error and special, but - * it should not alias none with error or special. 
- * - * HMM pfn value returned by hmm_vma_get_pfns() or hmm_vma_fault() will be: - * hmm_range.values[HMM_PFN_ERROR] if CPU page table entry is poisonous, - * hmm_range.values[HMM_PFN_NONE] if there is no CPU page table entry, + * This must be called under the caller 'user_lock' after a successful + * mmu_interval_read_begin(). The caller must have tested for HMM_PFN_VALID + * already. */ -enum hmm_pfn_value_e { - HMM_PFN_ERROR, - HMM_PFN_NONE, - HMM_PFN_VALUE_MAX -}; +static inline struct page *hmm_pfn_to_page(unsigned long hmm_pfn) +{ + return pfn_to_page(hmm_pfn & ~HMM_PFN_FLAGS); +} /* * struct hmm_range - track invalidation lock on virtual address range @@ -66,12 +68,9 @@ enum hmm_pfn_value_e { * @notifier_seq: result of mmu_interval_read_begin() * @start: range virtual start address (inclusive) * @end: range virtual end address (exclusive) - * @pfns: array of pfns (big enough for the range) - * @flags: pfn flags to match device driver page table - * @values: pfn value for some special case (none, special, error, ...) + * @hmm_pfns: array of pfns (big enough for the range) * @default_flags: default flags for the range (write, read, ... see hmm doc) * @pfn_flags_mask: allows to mask pfn flags so that only default_flags matter - * @pfn_shift: pfn shift value (should be <= PAGE_SHIFT) * @dev_private_owner: owner of device private pages */ struct hmm_range { @@ -79,36 +78,12 @@ struct hmm_range { unsigned long notifier_seq; unsigned long start; unsigned long end; - uint64_t *pfns; - const uint64_t *flags; - const uint64_t *values; - uint64_t default_flags; - uint64_t pfn_flags_mask; - uint8_t pfn_shift; + unsigned long *hmm_pfns; + unsigned long default_flags; + unsigned long pfn_flags_mask; void *dev_private_owner; }; -/* - * hmm_device_entry_to_page() - return struct page pointed to by a device entry - * @range: range use to decode device entry value - * @entry: device entry value to get corresponding struct page from - * Return: struct page pointer if entry is a valid, NULL otherwise - * - * If the device entry is valid (ie valid flag set) then return the struct page - * matching the entry value. Otherwise return NULL. - */ -static inline struct page *hmm_device_entry_to_page(const struct hmm_range *range, - uint64_t entry) -{ - if (entry == range->values[HMM_PFN_NONE]) - return NULL; - if (entry == range->values[HMM_PFN_ERROR]) - return NULL; - if (!(entry & range->flags[HMM_PFN_VALID])) - return NULL; - return pfn_to_page(entry >> range->pfn_shift); -} - /* * Please see Documentation/vm/hmm.rst for how to use the range API. 
*/ diff --git a/mm/hmm.c b/mm/hmm.c index 2e975eedb14f..41673a6d8d46 100644 --- a/mm/hmm.c +++ b/mm/hmm.c @@ -37,28 +37,13 @@ enum { HMM_NEED_ALL_BITS = HMM_NEED_FAULT | HMM_NEED_WRITE_FAULT, }; -/* - * hmm_device_entry_from_pfn() - create a valid device entry value from pfn - * @range: range use to encode HMM pfn value - * @pfn: pfn value for which to create the device entry - * Return: valid device entry for the pfn - */ -static uint64_t hmm_device_entry_from_pfn(const struct hmm_range *range, - unsigned long pfn) -{ - return (pfn << range->pfn_shift) | range->flags[HMM_PFN_VALID]; -} - static int hmm_pfns_fill(unsigned long addr, unsigned long end, - struct hmm_range *range, enum hmm_pfn_value_e value) + struct hmm_range *range, unsigned long cpu_flags) { - uint64_t *pfns = range->pfns; - unsigned long i; + unsigned long i = (addr - range->start) >> PAGE_SHIFT; - i = (addr - range->start) >> PAGE_SHIFT; for (; addr < end; addr += PAGE_SIZE, i++) - pfns[i] = range->values[value]; - + range->hmm_pfns[i] = cpu_flags; return 0; } @@ -96,7 +81,8 @@ static int hmm_vma_fault(unsigned long addr, unsigned long end, } static unsigned int hmm_pte_need_fault(const struct hmm_vma_walk *hmm_vma_walk, - uint64_t pfns, uint64_t cpu_flags) + unsigned long pfn_req_flags, + unsigned long cpu_flags) { struct hmm_range *range = hmm_vma_walk->range; @@ -110,27 +96,28 @@ static unsigned int hmm_pte_need_fault(const struct hmm_vma_walk *hmm_vma_walk, * waste to have the user pre-fill the pfn arrays with a default * flags value. */ - pfns = (pfns & range->pfn_flags_mask) | range->default_flags; + pfn_req_flags &= range->pfn_flags_mask; + pfn_req_flags |= range->default_flags; /* We aren't ask to do anything ... */ - if (!(pfns & range->flags[HMM_PFN_VALID])) + if (!(pfn_req_flags & HMM_PFN_REQ_FAULT)) return 0; /* Need to write fault ? */ - if ((pfns & range->flags[HMM_PFN_WRITE]) && - !(cpu_flags & range->flags[HMM_PFN_WRITE])) + if ((pfn_req_flags & HMM_PFN_REQ_WRITE) && + !(cpu_flags & HMM_PFN_WRITE)) return HMM_NEED_FAULT | HMM_NEED_WRITE_FAULT; /* If CPU page table is not valid then we need to fault */ - if (!(cpu_flags & range->flags[HMM_PFN_VALID])) + if (!(cpu_flags & HMM_PFN_VALID)) return HMM_NEED_FAULT; return 0; } static unsigned int hmm_range_need_fault(const struct hmm_vma_walk *hmm_vma_walk, - const uint64_t *pfns, unsigned long npages, - uint64_t cpu_flags) + const unsigned long hmm_pfns[], unsigned long npages, + unsigned long cpu_flags) { struct hmm_range *range = hmm_vma_walk->range; unsigned int required_fault = 0; @@ -142,12 +129,12 @@ hmm_range_need_fault(const struct hmm_vma_walk *hmm_vma_walk, * hmm_pte_need_fault() will always return 0. 
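 * (For example, a pure snapshot with default_flags == 0 and
 * pfn_flags_mask == 0 never faults and only reports the current
 * page table state.)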
*/ if (!((range->default_flags | range->pfn_flags_mask) & - range->flags[HMM_PFN_VALID])) + HMM_PFN_REQ_FAULT)) return 0; for (i = 0; i < npages; ++i) { - required_fault |= - hmm_pte_need_fault(hmm_vma_walk, pfns[i], cpu_flags); + required_fault |= hmm_pte_need_fault(hmm_vma_walk, hmm_pfns[i], + cpu_flags); if (required_fault == HMM_NEED_ALL_BITS) return required_fault; } @@ -161,12 +148,13 @@ static int hmm_vma_walk_hole(unsigned long addr, unsigned long end, struct hmm_range *range = hmm_vma_walk->range; unsigned int required_fault; unsigned long i, npages; - uint64_t *pfns; + unsigned long *hmm_pfns; i = (addr - range->start) >> PAGE_SHIFT; npages = (end - addr) >> PAGE_SHIFT; - pfns = &range->pfns[i]; - required_fault = hmm_range_need_fault(hmm_vma_walk, pfns, npages, 0); + hmm_pfns = &range->hmm_pfns[i]; + required_fault = + hmm_range_need_fault(hmm_vma_walk, hmm_pfns, npages, 0); if (!walk->vma) { if (required_fault) return -EFAULT; @@ -174,44 +162,44 @@ static int hmm_vma_walk_hole(unsigned long addr, unsigned long end, } if (required_fault) return hmm_vma_fault(addr, end, required_fault, walk); - return hmm_pfns_fill(addr, end, range, HMM_PFN_NONE); + return hmm_pfns_fill(addr, end, range, 0); } -static inline uint64_t pmd_to_hmm_pfn_flags(struct hmm_range *range, pmd_t pmd) +static inline unsigned long pmd_to_hmm_pfn_flags(struct hmm_range *range, + pmd_t pmd) { if (pmd_protnone(pmd)) return 0; - return pmd_write(pmd) ? range->flags[HMM_PFN_VALID] | - range->flags[HMM_PFN_WRITE] : - range->flags[HMM_PFN_VALID]; + return pmd_write(pmd) ? (HMM_PFN_VALID | HMM_PFN_WRITE) : HMM_PFN_VALID; } #ifdef CONFIG_TRANSPARENT_HUGEPAGE static int hmm_vma_handle_pmd(struct mm_walk *walk, unsigned long addr, - unsigned long end, uint64_t *pfns, pmd_t pmd) + unsigned long end, unsigned long hmm_pfns[], + pmd_t pmd) { struct hmm_vma_walk *hmm_vma_walk = walk->private; struct hmm_range *range = hmm_vma_walk->range; unsigned long pfn, npages, i; unsigned int required_fault; - uint64_t cpu_flags; + unsigned long cpu_flags; npages = (end - addr) >> PAGE_SHIFT; cpu_flags = pmd_to_hmm_pfn_flags(range, pmd); required_fault = - hmm_range_need_fault(hmm_vma_walk, pfns, npages, cpu_flags); + hmm_range_need_fault(hmm_vma_walk, hmm_pfns, npages, cpu_flags); if (required_fault) return hmm_vma_fault(addr, end, required_fault, walk); pfn = pmd_pfn(pmd) + ((addr & ~PMD_MASK) >> PAGE_SHIFT); for (i = 0; addr < end; addr += PAGE_SIZE, i++, pfn++) - pfns[i] = hmm_device_entry_from_pfn(range, pfn) | cpu_flags; + hmm_pfns[i] = pfn | cpu_flags; return 0; } #else /* CONFIG_TRANSPARENT_HUGEPAGE */ /* stub to allow the code below to compile */ int hmm_vma_handle_pmd(struct mm_walk *walk, unsigned long addr, - unsigned long end, uint64_t *pfns, pmd_t pmd); + unsigned long end, unsigned long hmm_pfns[], pmd_t pmd); #endif /* CONFIG_TRANSPARENT_HUGEPAGE */ static inline bool hmm_is_device_private_entry(struct hmm_range *range, @@ -222,31 +210,31 @@ static inline bool hmm_is_device_private_entry(struct hmm_range *range, range->dev_private_owner; } -static inline uint64_t pte_to_hmm_pfn_flags(struct hmm_range *range, pte_t pte) +static inline unsigned long pte_to_hmm_pfn_flags(struct hmm_range *range, + pte_t pte) { if (pte_none(pte) || !pte_present(pte) || pte_protnone(pte)) return 0; - return pte_write(pte) ? range->flags[HMM_PFN_VALID] | - range->flags[HMM_PFN_WRITE] : - range->flags[HMM_PFN_VALID]; + return pte_write(pte) ? 
(HMM_PFN_VALID | HMM_PFN_WRITE) : HMM_PFN_VALID; } static int hmm_vma_handle_pte(struct mm_walk *walk, unsigned long addr, unsigned long end, pmd_t *pmdp, pte_t *ptep, - uint64_t *pfn) + unsigned long *hmm_pfn) { struct hmm_vma_walk *hmm_vma_walk = walk->private; struct hmm_range *range = hmm_vma_walk->range; unsigned int required_fault; - uint64_t cpu_flags; + unsigned long cpu_flags; pte_t pte = *ptep; - uint64_t orig_pfn = *pfn; + uint64_t pfn_req_flags = *hmm_pfn; if (pte_none(pte)) { - required_fault = hmm_pte_need_fault(hmm_vma_walk, orig_pfn, 0); + required_fault = + hmm_pte_need_fault(hmm_vma_walk, pfn_req_flags, 0); if (required_fault) goto fault; - *pfn = range->values[HMM_PFN_NONE]; + *hmm_pfn = 0; return 0; } @@ -258,17 +246,18 @@ static int hmm_vma_handle_pte(struct mm_walk *walk, unsigned long addr, * the PFN even if not present. */ if (hmm_is_device_private_entry(range, entry)) { - *pfn = hmm_device_entry_from_pfn(range, - device_private_entry_to_pfn(entry)); - *pfn |= range->flags[HMM_PFN_VALID]; + cpu_flags = HMM_PFN_VALID; if (is_write_device_private_entry(entry)) - *pfn |= range->flags[HMM_PFN_WRITE]; + cpu_flags |= HMM_PFN_WRITE; + *hmm_pfn = device_private_entry_to_pfn(entry) | + cpu_flags; return 0; } - required_fault = hmm_pte_need_fault(hmm_vma_walk, orig_pfn, 0); + required_fault = + hmm_pte_need_fault(hmm_vma_walk, pfn_req_flags, 0); if (!required_fault) { - *pfn = range->values[HMM_PFN_NONE]; + *hmm_pfn = 0; return 0; } @@ -288,7 +277,8 @@ static int hmm_vma_handle_pte(struct mm_walk *walk, unsigned long addr, } cpu_flags = pte_to_hmm_pfn_flags(range, pte); - required_fault = hmm_pte_need_fault(hmm_vma_walk, orig_pfn, cpu_flags); + required_fault = + hmm_pte_need_fault(hmm_vma_walk, pfn_req_flags, cpu_flags); if (required_fault) goto fault; @@ -297,15 +287,15 @@ static int hmm_vma_handle_pte(struct mm_walk *walk, unsigned long addr, * fall through and treat it like a normal page. 
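 * Other special PTEs, e.g. those created by vmf_insert_pfn(), have no
 * usable struct page and are reported as HMM_PFN_ERROR below.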
*/ if (pte_special(pte) && !is_zero_pfn(pte_pfn(pte))) { - if (hmm_pte_need_fault(hmm_vma_walk, orig_pfn, 0)) { + if (hmm_pte_need_fault(hmm_vma_walk, pfn_req_flags, 0)) { pte_unmap(ptep); return -EFAULT; } - *pfn = range->values[HMM_PFN_ERROR]; + *hmm_pfn = HMM_PFN_ERROR; return 0; } - *pfn = hmm_device_entry_from_pfn(range, pte_pfn(pte)) | cpu_flags; + *hmm_pfn = pte_pfn(pte) | cpu_flags; return 0; fault: @@ -321,7 +311,8 @@ static int hmm_vma_walk_pmd(pmd_t *pmdp, { struct hmm_vma_walk *hmm_vma_walk = walk->private; struct hmm_range *range = hmm_vma_walk->range; - uint64_t *pfns = &range->pfns[(start - range->start) >> PAGE_SHIFT]; + unsigned long *hmm_pfns = + &range->hmm_pfns[(start - range->start) >> PAGE_SHIFT]; unsigned long npages = (end - start) >> PAGE_SHIFT; unsigned long addr = start; pte_t *ptep; @@ -333,16 +324,16 @@ again: return hmm_vma_walk_hole(start, end, -1, walk); if (thp_migration_supported() && is_pmd_migration_entry(pmd)) { - if (hmm_range_need_fault(hmm_vma_walk, pfns, npages, 0)) { + if (hmm_range_need_fault(hmm_vma_walk, hmm_pfns, npages, 0)) { hmm_vma_walk->last = addr; pmd_migration_entry_wait(walk->mm, pmdp); return -EBUSY; } - return hmm_pfns_fill(start, end, range, HMM_PFN_NONE); + return hmm_pfns_fill(start, end, range, 0); } if (!pmd_present(pmd)) { - if (hmm_range_need_fault(hmm_vma_walk, pfns, npages, 0)) + if (hmm_range_need_fault(hmm_vma_walk, hmm_pfns, npages, 0)) return -EFAULT; return hmm_pfns_fill(start, end, range, HMM_PFN_ERROR); } @@ -362,7 +353,7 @@ again: if (!pmd_devmap(pmd) && !pmd_trans_huge(pmd)) goto again; - return hmm_vma_handle_pmd(walk, addr, end, pfns, pmd); + return hmm_vma_handle_pmd(walk, addr, end, hmm_pfns, pmd); } /* @@ -372,16 +363,16 @@ again: * recover. */ if (pmd_bad(pmd)) { - if (hmm_range_need_fault(hmm_vma_walk, pfns, npages, 0)) + if (hmm_range_need_fault(hmm_vma_walk, hmm_pfns, npages, 0)) return -EFAULT; return hmm_pfns_fill(start, end, range, HMM_PFN_ERROR); } ptep = pte_offset_map(pmdp, addr); - for (; addr < end; addr += PAGE_SIZE, ptep++, pfns++) { + for (; addr < end; addr += PAGE_SIZE, ptep++, hmm_pfns++) { int r; - r = hmm_vma_handle_pte(walk, addr, end, pmdp, ptep, pfns); + r = hmm_vma_handle_pte(walk, addr, end, pmdp, ptep, hmm_pfns); if (r) { /* hmm_vma_handle_pte() did pte_unmap() */ return r; @@ -393,13 +384,12 @@ again: #if defined(CONFIG_ARCH_HAS_PTE_DEVMAP) && \ defined(CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD) -static inline uint64_t pud_to_hmm_pfn_flags(struct hmm_range *range, pud_t pud) +static inline unsigned long pud_to_hmm_pfn_flags(struct hmm_range *range, + pud_t pud) { if (!pud_present(pud)) return 0; - return pud_write(pud) ? range->flags[HMM_PFN_VALID] | - range->flags[HMM_PFN_WRITE] : - range->flags[HMM_PFN_VALID]; + return pud_write(pud) ? 
(HMM_PFN_VALID | HMM_PFN_WRITE) : HMM_PFN_VALID; } static int hmm_vma_walk_pud(pud_t *pudp, unsigned long start, unsigned long end, @@ -427,7 +417,8 @@ static int hmm_vma_walk_pud(pud_t *pudp, unsigned long start, unsigned long end, if (pud_huge(pud) && pud_devmap(pud)) { unsigned long i, npages, pfn; unsigned int required_fault; - uint64_t *pfns, cpu_flags; + unsigned long *hmm_pfns; + unsigned long cpu_flags; if (!pud_present(pud)) { spin_unlock(ptl); @@ -436,10 +427,10 @@ static int hmm_vma_walk_pud(pud_t *pudp, unsigned long start, unsigned long end, i = (addr - range->start) >> PAGE_SHIFT; npages = (end - addr) >> PAGE_SHIFT; - pfns = &range->pfns[i]; + hmm_pfns = &range->hmm_pfns[i]; cpu_flags = pud_to_hmm_pfn_flags(range, pud); - required_fault = hmm_range_need_fault(hmm_vma_walk, pfns, + required_fault = hmm_range_need_fault(hmm_vma_walk, hmm_pfns, npages, cpu_flags); if (required_fault) { spin_unlock(ptl); @@ -448,8 +439,7 @@ static int hmm_vma_walk_pud(pud_t *pudp, unsigned long start, unsigned long end, pfn = pud_pfn(pud) + ((addr & ~PUD_MASK) >> PAGE_SHIFT); for (i = 0; i < npages; ++i, ++pfn) - pfns[i] = hmm_device_entry_from_pfn(range, pfn) | - cpu_flags; + hmm_pfns[i] = pfn | cpu_flags; goto out_unlock; } @@ -473,8 +463,9 @@ static int hmm_vma_walk_hugetlb_entry(pte_t *pte, unsigned long hmask, struct hmm_vma_walk *hmm_vma_walk = walk->private; struct hmm_range *range = hmm_vma_walk->range; struct vm_area_struct *vma = walk->vma; - uint64_t orig_pfn, cpu_flags; unsigned int required_fault; + unsigned long pfn_req_flags; + unsigned long cpu_flags; spinlock_t *ptl; pte_t entry; @@ -482,9 +473,10 @@ static int hmm_vma_walk_hugetlb_entry(pte_t *pte, unsigned long hmask, entry = huge_ptep_get(pte); i = (start - range->start) >> PAGE_SHIFT; - orig_pfn = range->pfns[i]; + pfn_req_flags = range->hmm_pfns[i]; cpu_flags = pte_to_hmm_pfn_flags(range, entry); - required_fault = hmm_pte_need_fault(hmm_vma_walk, orig_pfn, cpu_flags); + required_fault = + hmm_pte_need_fault(hmm_vma_walk, pfn_req_flags, cpu_flags); if (required_fault) { spin_unlock(ptl); return hmm_vma_fault(addr, end, required_fault, walk); @@ -492,8 +484,8 @@ static int hmm_vma_walk_hugetlb_entry(pte_t *pte, unsigned long hmask, pfn = pte_pfn(entry) + ((start & ~hmask) >> PAGE_SHIFT); for (; addr < end; addr += PAGE_SIZE, i++, pfn++) - range->pfns[i] = hmm_device_entry_from_pfn(range, pfn) | - cpu_flags; + range->hmm_pfns[i] = pfn | cpu_flags; + spin_unlock(ptl); return 0; } @@ -524,7 +516,7 @@ static int hmm_vma_walk_test(unsigned long start, unsigned long end, * failure. */ if (hmm_range_need_fault(hmm_vma_walk, - range->pfns + + range->hmm_pfns + ((start - range->start) >> PAGE_SHIFT), (end - start) >> PAGE_SHIFT, 0)) return -EFAULT; -- cgit v1.2.3 From 9c8255c888bac9221739c822132b405d4196bdd8 Mon Sep 17 00:00:00 2001 From: "Gustavo A. R. Silva" Date: Thu, 7 May 2020 14:25:07 -0500 Subject: team: Replace zero-length array with flexible-array The current codebase makes use of the zero-length array language extension to the C90 standard, but the preferred mechanism to declare variable-length types such as these ones is a flexible array member[1][2], introduced in C99: struct foo { int stuff; struct boo array[]; }; By making use of the mechanism above, we will get a compiler warning in case the flexible array does not occur last in the structure, which will help us prevent some kind of undefined behavior bugs from being inadvertently introduced[3] to the codebase from now on. 
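As an illustration, a hypothetical declaration (not one from this driver) that misplaces the flexible member:

    struct bad {
            long mode_priv[];   /* flexible member not last: now a hard error */
            u16 queue_id;
    };

no longer compiles, while the equivalent zero-length form mode_priv[0] in the middle of a struct was accepted silently.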
Also, notice that, dynamic memory allocations won't be affected by this change: "Flexible array members have incomplete type, and so the sizeof operator may not be applied. As a quirk of the original implementation of zero-length arrays, sizeof evaluates to zero."[1] sizeof(flexible-array-member) triggers a warning because flexible array members have incomplete type[1]. There are some instances of code in which the sizeof operator is being incorrectly/erroneously applied to zero-length arrays and the result is zero. Such instances may be hiding some bugs. So, this work (flexible-array member conversions) will also help to get completely rid of those sorts of issues. This issue was found with the help of Coccinelle. [1] https://gcc.gnu.org/onlinedocs/gcc/Zero-Length.html [2] https://github.com/KSPP/linux/issues/21 [3] commit 76497732932f ("cxgb3/l2t: Fix undefined behaviour") Signed-off-by: Gustavo A. R. Silva Signed-off-by: David S. Miller --- include/linux/if_team.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/if_team.h b/include/linux/if_team.h index 537dc2b8c879..add607943c95 100644 --- a/include/linux/if_team.h +++ b/include/linux/if_team.h @@ -67,7 +67,7 @@ struct team_port { u16 queue_id; struct list_head qom_list; /* node in queue override mapping list */ struct rcu_head rcu; - long mode_priv[0]; + long mode_priv[]; }; static inline struct team_port *team_port_get_rcu(const struct net_device *dev) -- cgit v1.2.3 From 0462b6bdb6445b887b8896f28be92e0d94c92e7b Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 11 May 2020 13:59:11 +0200 Subject: net: add a CMSG_USER_DATA macro Add a variant of CMSG_DATA that operates on user pointer to avoid sparse warnings about casting to/from user pointers. Also fix up CMSG_DATA to rely on the gcc extension that allows void pointer arithmetics to cut down on the amount of casts. Signed-off-by: Christoph Hellwig Signed-off-by: David S. 
Miller --- include/linux/socket.h | 5 ++++- net/core/scm.c | 4 ++-- 2 files changed, 6 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/socket.h b/include/linux/socket.h index 54338fac45cb..4cc64d611cf4 100644 --- a/include/linux/socket.h +++ b/include/linux/socket.h @@ -94,7 +94,10 @@ struct cmsghdr { #define CMSG_ALIGN(len) ( ((len)+sizeof(long)-1) & ~(sizeof(long)-1) ) -#define CMSG_DATA(cmsg) ((void *)((char *)(cmsg) + sizeof(struct cmsghdr))) +#define CMSG_DATA(cmsg) \ + ((void *)(cmsg) + sizeof(struct cmsghdr)) +#define CMSG_USER_DATA(cmsg) \ + ((void __user *)(cmsg) + sizeof(struct cmsghdr)) #define CMSG_SPACE(len) (sizeof(struct cmsghdr) + CMSG_ALIGN(len)) #define CMSG_LEN(len) (sizeof(struct cmsghdr) + (len)) diff --git a/net/core/scm.c b/net/core/scm.c index dc6fed1f221c..abfdc85a64c1 100644 --- a/net/core/scm.c +++ b/net/core/scm.c @@ -236,7 +236,7 @@ int put_cmsg(struct msghdr * msg, int level, int type, int len, void *data) err = -EFAULT; if (copy_to_user(cm, &cmhdr, sizeof cmhdr)) goto out; - if (copy_to_user(CMSG_DATA(cm), data, cmlen - sizeof(struct cmsghdr))) + if (copy_to_user(CMSG_USER_DATA(cm), data, cmlen - sizeof(*cm))) goto out; cmlen = CMSG_SPACE(len); if (msg->msg_controllen < cmlen) @@ -300,7 +300,7 @@ void scm_detach_fds(struct msghdr *msg, struct scm_cookie *scm) if (fdnum < fdmax) fdmax = fdnum; - for (i=0, cmfptr=(__force int __user *)CMSG_DATA(cm); i Date: Mon, 11 May 2020 13:59:13 +0200 Subject: net: cleanly handle kernel vs user buffers for ->msg_control The msg_control field in struct msghdr can either contain a user pointer when used with the recvmsg system call, or a kernel pointer when used with sendmsg. To complicate things further kernel_recvmsg can stuff a kernel pointer in and then use set_fs to make the uaccess helpers accept it. Replace it with a union of a kernel pointer msg_control field, and a user pointer msg_control_user one, and allow kernel_recvmsg operate on a proper kernel pointer using a bitfield to override the normal choice of a user pointer for recvmsg. Signed-off-by: Christoph Hellwig Signed-off-by: David S. Miller --- include/linux/socket.h | 12 +++++++++++- net/compat.c | 5 +++-- net/core/scm.c | 49 ++++++++++++++++++++++++++++--------------------- net/ipv4/ip_sockglue.c | 3 ++- net/socket.c | 22 ++++++---------------- 5 files changed, 50 insertions(+), 41 deletions(-) (limited to 'include/linux') diff --git a/include/linux/socket.h b/include/linux/socket.h index 4cc64d611cf4..04d2bc97f497 100644 --- a/include/linux/socket.h +++ b/include/linux/socket.h @@ -50,7 +50,17 @@ struct msghdr { void *msg_name; /* ptr to socket address structure */ int msg_namelen; /* size of socket address structure */ struct iov_iter msg_iter; /* data */ - void *msg_control; /* ancillary data */ + + /* + * Ancillary data. msg_control_user is the user buffer used for the + * recv* side when msg_control_is_user is set, msg_control is the kernel + * buffer used for all other cases. 
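+ *
+ * A sketch of the intended use (not tied to a specific caller): recvmsg()
+ * style paths set msg_control_is_user and use msg_control_user, while
+ * in-kernel callers such as kernel_recvmsg() clear it and use msg_control.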
+ */ + union { + void *msg_control; + void __user *msg_control_user; + }; + bool msg_control_is_user : 1; __kernel_size_t msg_controllen; /* ancillary data buffer length */ unsigned int msg_flags; /* flags on received message */ struct kiocb *msg_iocb; /* ptr to iocb for async requests */ diff --git a/net/compat.c b/net/compat.c index 4bed96e84d9a..69fc6d1e4e6e 100644 --- a/net/compat.c +++ b/net/compat.c @@ -56,7 +56,8 @@ int __get_compat_msghdr(struct msghdr *kmsg, if (kmsg->msg_namelen > sizeof(struct sockaddr_storage)) kmsg->msg_namelen = sizeof(struct sockaddr_storage); - kmsg->msg_control = compat_ptr(msg.msg_control); + kmsg->msg_control_is_user = true; + kmsg->msg_control_user = compat_ptr(msg.msg_control); kmsg->msg_controllen = msg.msg_controllen; if (save_addr) @@ -121,7 +122,7 @@ int get_compat_msghdr(struct msghdr *kmsg, ((ucmlen) >= sizeof(struct compat_cmsghdr) && \ (ucmlen) <= (unsigned long) \ ((mhdr)->msg_controllen - \ - ((char *)(ucmsg) - (char *)(mhdr)->msg_control))) + ((char __user *)(ucmsg) - (char __user *)(mhdr)->msg_control_user))) static inline struct compat_cmsghdr __user *cmsg_compat_nxthdr(struct msghdr *msg, struct compat_cmsghdr __user *cmsg, int cmsg_len) diff --git a/net/core/scm.c b/net/core/scm.c index 168b006a52ff..a75cd637a71f 100644 --- a/net/core/scm.c +++ b/net/core/scm.c @@ -212,16 +212,12 @@ EXPORT_SYMBOL(__scm_send); int put_cmsg(struct msghdr * msg, int level, int type, int len, void *data) { - struct cmsghdr __user *cm - = (__force struct cmsghdr __user *)msg->msg_control; - struct cmsghdr cmhdr; int cmlen = CMSG_LEN(len); - int err; - if (MSG_CMSG_COMPAT & msg->msg_flags) + if (msg->msg_flags & MSG_CMSG_COMPAT) return put_cmsg_compat(msg, level, type, len, data); - if (cm==NULL || msg->msg_controllen < sizeof(*cm)) { + if (!msg->msg_control || msg->msg_controllen < sizeof(struct cmsghdr)) { msg->msg_flags |= MSG_CTRUNC; return 0; /* XXX: return error? check spec. 
*/ } @@ -229,23 +225,30 @@ int put_cmsg(struct msghdr * msg, int level, int type, int len, void *data) msg->msg_flags |= MSG_CTRUNC; cmlen = msg->msg_controllen; } - cmhdr.cmsg_level = level; - cmhdr.cmsg_type = type; - cmhdr.cmsg_len = cmlen; - - err = -EFAULT; - if (copy_to_user(cm, &cmhdr, sizeof cmhdr)) - goto out; - if (copy_to_user(CMSG_USER_DATA(cm), data, cmlen - sizeof(*cm))) - goto out; - cmlen = CMSG_SPACE(len); - if (msg->msg_controllen < cmlen) - cmlen = msg->msg_controllen; + + if (msg->msg_control_is_user) { + struct cmsghdr __user *cm = msg->msg_control_user; + struct cmsghdr cmhdr; + + cmhdr.cmsg_level = level; + cmhdr.cmsg_type = type; + cmhdr.cmsg_len = cmlen; + if (copy_to_user(cm, &cmhdr, sizeof cmhdr) || + copy_to_user(CMSG_USER_DATA(cm), data, cmlen - sizeof(*cm))) + return -EFAULT; + } else { + struct cmsghdr *cm = msg->msg_control; + + cm->cmsg_level = level; + cm->cmsg_type = type; + cm->cmsg_len = cmlen; + memcpy(CMSG_DATA(cm), data, cmlen - sizeof(*cm)); + } + + cmlen = min(CMSG_SPACE(len), msg->msg_controllen); msg->msg_control += cmlen; msg->msg_controllen -= cmlen; - err = 0; -out: - return err; + return 0; } EXPORT_SYMBOL(put_cmsg); @@ -328,6 +331,10 @@ void scm_detach_fds(struct msghdr *msg, struct scm_cookie *scm) return; } + /* no use for FD passing from kernel space callers */ + if (WARN_ON_ONCE(!msg->msg_control_is_user)) + return; + for (i = 0; i < fdmax; i++) { err = __scm_install_fd(scm->fp->fp[i], cmsg_data + i, o_flags); if (err) diff --git a/net/ipv4/ip_sockglue.c b/net/ipv4/ip_sockglue.c index aa3fd61818c4..8206047d70b6 100644 --- a/net/ipv4/ip_sockglue.c +++ b/net/ipv4/ip_sockglue.c @@ -1492,7 +1492,8 @@ static int do_ip_getsockopt(struct sock *sk, int level, int optname, if (sk->sk_type != SOCK_STREAM) return -ENOPROTOOPT; - msg.msg_control = (__force void *) optval; + msg.msg_control_is_user = true; + msg.msg_control_user = optval; msg.msg_controllen = len; msg.msg_flags = flags; diff --git a/net/socket.c b/net/socket.c index 2dd739fba866..1c9a7260a41d 100644 --- a/net/socket.c +++ b/net/socket.c @@ -924,14 +924,9 @@ EXPORT_SYMBOL(sock_recvmsg); int kernel_recvmsg(struct socket *sock, struct msghdr *msg, struct kvec *vec, size_t num, size_t size, int flags) { - mm_segment_t oldfs = get_fs(); - int result; - + msg->msg_control_is_user = false; iov_iter_kvec(&msg->msg_iter, READ, vec, num, size); - set_fs(KERNEL_DS); - result = sock_recvmsg(sock, msg, flags); - set_fs(oldfs); - return result; + return sock_recvmsg(sock, msg, flags); } EXPORT_SYMBOL(kernel_recvmsg); @@ -2239,7 +2234,8 @@ int __copy_msghdr_from_user(struct msghdr *kmsg, if (copy_from_user(&msg, umsg, sizeof(*umsg))) return -EFAULT; - kmsg->msg_control = (void __force *)msg.msg_control; + kmsg->msg_control_is_user = true; + kmsg->msg_control_user = msg.msg_control; kmsg->msg_controllen = msg.msg_controllen; kmsg->msg_flags = msg.msg_flags; @@ -2331,16 +2327,10 @@ static int ____sys_sendmsg(struct socket *sock, struct msghdr *msg_sys, goto out; } err = -EFAULT; - /* - * Careful! Before this, msg_sys->msg_control contains a user pointer. - * Afterwards, it will be a kernel pointer. Thus the compiler-assisted - * checking falls down on this. 
- */ - if (copy_from_user(ctl_buf, - (void __user __force *)msg_sys->msg_control, - ctl_len)) + if (copy_from_user(ctl_buf, msg_sys->msg_control_user, ctl_len)) goto out_freectl; msg_sys->msg_control = ctl_buf; + msg_sys->msg_control_is_user = false; } msg_sys->msg_flags = flags; -- cgit v1.2.3 From 8b59cd81dc5e724eaea283fa6006985891c7bff4 Mon Sep 17 00:00:00 2001 From: Masahiro Yamada Date: Thu, 23 Apr 2020 23:23:52 +0900 Subject: kbuild: ensure full rebuild when the compiler is updated Commit 21c54b774744 ("kconfig: show compiler version text in the top comment") added the environment variable CC_VERSION_TEXT in the comment of the top Kconfig file. It can detect a compiler update and invoke syncconfig, because all environment variables referenced in Kconfig files are recorded in include/config/auto.conf.cmd. This commit makes it a CONFIG option in order to ensure a full rebuild when the compiler is updated. This works as follows: include/linux/kconfig.h contains "CONFIG_CC_VERSION_TEXT" in a comment block. The top Makefile specifies "-include $(srctree)/include/linux/kconfig.h" to guarantee it is included from all kernel source files. fixdep parses every source file and all headers included from it, searching for words prefixed with "CONFIG_". Then, fixdep finds CONFIG_CC_VERSION_TEXT in include/linux/kconfig.h and adds include/config/cc/version/text.h into every .*.cmd file. When the compiler is updated, syncconfig is invoked because init/Kconfig contains the reference to the environment variable CC_VERSION_TEXT. CONFIG_CC_VERSION_TEXT is updated to the new version string, and include/config/cc/version/text.h is touched. In the next rebuild, Make will rebuild every file since the timestamp of include/config/cc/version/text.h is newer than that of the targets. Signed-off-by: Masahiro Yamada --- Kconfig | 2 -- include/linux/kconfig.h | 2 ++ init/Kconfig | 17 +++++++++++++++++ 3 files changed, 19 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/Kconfig b/Kconfig index e10b3ee084d4..745bc773f567 100644 --- a/Kconfig +++ b/Kconfig @@ -5,8 +5,6 @@ # mainmenu "Linux/$(ARCH) $(KERNELVERSION) Kernel Configuration" -comment "Compiler: $(CC_VERSION_TEXT)" - source "scripts/Kconfig.include" source "init/Kconfig" diff --git a/include/linux/kconfig.h b/include/linux/kconfig.h index cc8fa109cfa3..9d12c970f18f 100644 --- a/include/linux/kconfig.h +++ b/include/linux/kconfig.h @@ -2,6 +2,8 @@ #ifndef __LINUX_KCONFIG_H #define __LINUX_KCONFIG_H +/* CONFIG_CC_VERSION_TEXT (Do not delete this comment. See help in Kconfig) */ + #include <generated/autoconf.h> #ifdef CONFIG_CPU_BIG_ENDIAN diff --git a/init/Kconfig b/init/Kconfig index 2bc63a361033..ed1d82c9f1df 100644 --- a/init/Kconfig +++ b/init/Kconfig @@ -8,6 +8,23 @@ config DEFCONFIG_LIST default "/boot/config-$(shell,uname -r)" default "arch/$(SRCARCH)/configs/$(KBUILD_DEFCONFIG)" +config CC_VERSION_TEXT + string + default "$(CC_VERSION_TEXT)" + help + This is used in two ways: + + - Re-run Kconfig when the compiler is updated + The 'default' property references the environment variable + CC_VERSION_TEXT, so it is recorded in include/config/auto.conf.cmd. + When the compiler is updated, Kconfig will be invoked. + + - Ensure full rebuild when the compiler is updated + include/linux/kconfig.h contains this option in the comment line so + fixdep adds include/config/cc/version/text.h into the auto-generated + dependency. When the compiler is updated, syncconfig will touch it + and then every file will be rebuilt. 
+ config CC_IS_GCC def_bool $(success,echo "$(CC_VERSION_TEXT)" | grep -q gcc) -- cgit v1.2.3 From 914a1951d88968371c7d43400c9d936382cd7d69 Mon Sep 17 00:00:00 2001 From: "Gustavo A. R. Silva" Date: Thu, 7 May 2020 14:05:44 -0500 Subject: PCI: Replace zero-length array with flexible-array The current codebase makes use of the zero-length array language extension to the C90 standard, but the preferred mechanism to declare variable-length types such as these is a flexible array member [1][2], introduced in C99: struct foo { int stuff; struct boo array[]; }; By making use of the mechanism above, we will get a compiler warning in case the flexible array does not occur last in the structure, which will help us prevent some kinds of undefined behavior bugs from being inadvertently introduced[3] to the codebase from now on. Also, notice that dynamic memory allocations won't be affected by this change: "Flexible array members have incomplete type, and so the sizeof operator may not be applied. As a quirk of the original implementation of zero-length arrays, sizeof evaluates to zero." [1] sizeof(flexible-array-member) triggers a warning because flexible array members have incomplete type [1]. There are some instances of code in which the sizeof() operator is being incorrectly/erroneously applied to zero-length arrays, and the result is zero. Such instances may be hiding some bugs. So, this work (flexible-array member conversions) will also help to get completely rid of those sorts of issues. This issue was found with the help of Coccinelle. [1] https://gcc.gnu.org/onlinedocs/gcc/Zero-Length.html [2] https://github.com/KSPP/linux/issues/21 [3] commit 76497732932f ("cxgb3/l2t: Fix undefined behaviour") Link: https://lore.kernel.org/r/20200507190544.GA15633@embeddedor Signed-off-by: Gustavo A. R. Silva Signed-off-by: Bjorn Helgaas --- drivers/pci/pci.c | 2 +- include/linux/pci.h | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/drivers/pci/pci.c b/drivers/pci/pci.c index 595fcf59843f..bb78f580814e 100644 --- a/drivers/pci/pci.c +++ b/drivers/pci/pci.c @@ -1578,7 +1578,7 @@ EXPORT_SYMBOL(pci_restore_state); struct pci_saved_state { u32 config_space[16]; - struct pci_cap_saved_data cap[0]; + struct pci_cap_saved_data cap[]; }; /** diff --git a/include/linux/pci.h b/include/linux/pci.h index 83ce1cdf5676..0453ee458ab1 100644 --- a/include/linux/pci.h +++ b/include/linux/pci.h @@ -279,7 +279,7 @@ struct pci_cap_saved_data { u16 cap_nr; bool cap_extended; unsigned int size; - u32 data[0]; + u32 data[]; }; struct pci_cap_saved_state { @@ -532,7 +532,7 @@ struct pci_host_bridge { resource_size_t start, resource_size_t size, resource_size_t align); - unsigned long private[0] ____cacheline_aligned; + unsigned long private[] ____cacheline_aligned; }; #define to_pci_host_bridge(n) container_of(n, struct pci_host_bridge, dev) -- cgit v1.2.3 From b0dbd97de1f1fd6b3c9a7bb8f7c795bba7e169d8 Mon Sep 17 00:00:00 2001 From: Hans de Goede Date: Sun, 10 May 2020 14:24:31 +0200 Subject: platform/x86: asus-wmi: Add support for SW_TABLET_MODE On Asus 2-in-1s with a detachable keyboard, the Asus WMI interface reports whether the tablet is attached to the keyboard or not. Report whether the 2-in-1 is in tablet or clamshell mode to userspace via SW_TABLET_MODE events. This has been tested on a T100TA, T100CHI, T100HA and T200TA. 
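For readers who don't live in the input subsystem, the reporting pattern the patch below relies on reduces to the following minimal sketch. It is illustrative only, not the driver's literal code; example_setup(), example_report() and the "docked" parameter are invented here (the real driver derives the state from the ASUS_WMI_DEVID_KBD_DOCK query):

  #include <linux/input.h>

  /* At probe time: declare that this input device can emit the switch. */
  static void example_setup(struct input_dev *idev)
  {
          input_set_capability(idev, EV_SW, SW_TABLET_MODE);
  }

  /* On each dock/undock notification: SW_TABLET_MODE is active (1) in
   * tablet mode, so an attached keyboard dock means reporting 0.
   */
  static void example_report(struct input_dev *idev, bool docked)
  {
          input_report_switch(idev, SW_TABLET_MODE, !docked);
          input_sync(idev);
  }
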
Signed-off-by: Hans de Goede Signed-off-by: Andy Shevchenko --- drivers/platform/x86/asus-wmi.c | 24 ++++++++++++++++++++++-- include/linux/platform_data/x86/asus-wmi.h | 3 +++ 2 files changed, 25 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/drivers/platform/x86/asus-wmi.c b/drivers/platform/x86/asus-wmi.c index 91d0c8be63a5..a97aba2ba467 100644 --- a/drivers/platform/x86/asus-wmi.c +++ b/drivers/platform/x86/asus-wmi.c @@ -57,6 +57,7 @@ MODULE_LICENSE("GPL"); #define NOTIFY_BRNDOWN_MIN 0x20 #define NOTIFY_BRNDOWN_MAX 0x2e #define NOTIFY_FNLOCK_TOGGLE 0x4e +#define NOTIFY_KBD_DOCK_CHANGE 0x75 #define NOTIFY_KBD_BRTUP 0xc4 #define NOTIFY_KBD_BRTDWN 0xc5 #define NOTIFY_KBD_BRTTOGGLE 0xc7 @@ -346,7 +347,7 @@ static bool asus_wmi_dev_is_present(struct asus_wmi *asus, u32 dev_id) static int asus_wmi_input_init(struct asus_wmi *asus) { - int err; + int err, result; asus->inputdev = input_allocate_device(); if (!asus->inputdev) @@ -362,6 +363,14 @@ static int asus_wmi_input_init(struct asus_wmi *asus) if (err) goto err_free_dev; + result = asus_wmi_get_devstate_simple(asus, ASUS_WMI_DEVID_KBD_DOCK); + if (result >= 0) { + input_set_capability(asus->inputdev, EV_SW, SW_TABLET_MODE); + input_report_switch(asus->inputdev, SW_TABLET_MODE, !result); + } else if (result != -ENODEV) { + pr_err("Error checking for keyboard-dock: %d\n", result); + } + err = input_register_device(asus->inputdev); if (err) goto err_free_dev; @@ -2055,9 +2064,9 @@ static int asus_wmi_get_event_code(u32 value) static void asus_wmi_handle_event_code(int code, struct asus_wmi *asus) { - int orig_code; unsigned int key_value = 1; bool autorelease = 1; + int result, orig_code; orig_code = code; @@ -2102,6 +2111,17 @@ static void asus_wmi_handle_event_code(int code, struct asus_wmi *asus) return; } + if (code == NOTIFY_KBD_DOCK_CHANGE) { + result = asus_wmi_get_devstate_simple(asus, + ASUS_WMI_DEVID_KBD_DOCK); + if (result >= 0) { + input_report_switch(asus->inputdev, SW_TABLET_MODE, + !result); + input_sync(asus->inputdev); + } + return; + } + if (asus->fan_boost_mode_available && code == NOTIFY_KBD_FBM) { fan_boost_mode_switch_next(asus); return; diff --git a/include/linux/platform_data/x86/asus-wmi.h b/include/linux/platform_data/x86/asus-wmi.h index d39fc658c320..897b8332a39f 100644 --- a/include/linux/platform_data/x86/asus-wmi.h +++ b/include/linux/platform_data/x86/asus-wmi.h @@ -85,6 +85,9 @@ /* Maximum charging percentage */ #define ASUS_WMI_DEVID_RSOC 0x00120057 +/* Keyboard dock */ +#define ASUS_WMI_DEVID_KBD_DOCK 0x00120063 + /* DSTS masks */ #define ASUS_WMI_DSTS_STATUS_BIT 0x00000001 #define ASUS_WMI_DSTS_UNKNOWN_BIT 0x00000002 -- cgit v1.2.3 From 2a0a24ebb499c9d499eea948d3fc108f936e36d4 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Fri, 27 Mar 2020 12:42:00 +0100 Subject: sched: Make scheduler_ipi inline Now that the scheduler IPI is trivial and simple again there is no point to have the little function out of line. This simplifies the effort of constraining the instrumentation nicely. 
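For context on why this collapses so well: preempt_fold_need_resched() is itself a tiny always-inline helper, so the whole IPI handler now folds into its caller and leaves nothing out of line to instrument. The sketch below paraphrases that helper's definition from <linux/preempt.h> of this era, and is a reading aid only, not part of this patch:

  /* If a remote CPU set TIF_NEED_RESCHED on us, mirror that into the
   * preempt count so the next preempt_enable() notices it cheaply.
   */
  static __always_inline void preempt_fold_need_resched(void)
  {
          if (tif_need_resched())
                  set_preempt_need_resched();
  }
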
Signed-off-by: Thomas Gleixner Reviewed-by: Alexandre Chartre Acked-by: Peter Zijlstra Link: https://lkml.kernel.org/r/20200505134058.453581595@linutronix.de --- include/linux/sched.h | 10 +++++++++- kernel/sched/core.c | 10 ---------- 2 files changed, 9 insertions(+), 11 deletions(-) (limited to 'include/linux') diff --git a/include/linux/sched.h b/include/linux/sched.h index 4418f5cb8324..d4ea4407cd6d 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1715,7 +1715,15 @@ extern char *__get_task_comm(char *to, size_t len, struct task_struct *tsk); }) #ifdef CONFIG_SMP -void scheduler_ipi(void); +static __always_inline void scheduler_ipi(void) +{ + /* + * Fold TIF_NEED_RESCHED into the preempt_count; anybody setting + * TIF_NEED_RESCHED remotely (for the first time) will also send + * this IPI. + */ + preempt_fold_need_resched(); +} extern unsigned long wait_task_inactive(struct task_struct *, long match_state); #else static inline void scheduler_ipi(void) { } diff --git a/kernel/sched/core.c b/kernel/sched/core.c index cd2070d6f1e4..74fb89b5ce3e 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -2312,16 +2312,6 @@ static void wake_csd_func(void *info) sched_ttwu_pending(); } -void scheduler_ipi(void) -{ - /* - * Fold TIF_NEED_RESCHED into the preempt_count; anybody setting - * TIF_NEED_RESCHED remotely (for the first time) will also send - * this IPI. - */ - preempt_fold_need_resched(); -} - static void ttwu_queue_remote(struct task_struct *p, int cpu, int wake_flags) { struct rq *rq = cpu_rq(cpu); -- cgit v1.2.3 From 1e6769b0aece51ea7a3dc3117c37d4a5669e4a21 Mon Sep 17 00:00:00 2001 From: Masami Hiramatsu Date: Thu, 26 Mar 2020 23:49:48 +0900 Subject: kprobes: Support __kprobes blacklist in modules Support the __kprobes attribute for blacklisting functions in modules. Functions marked with the __kprobes attribute are stored in the .kprobes.text section. Signed-off-by: Masami Hiramatsu Signed-off-by: Thomas Gleixner Reviewed-by: Alexandre Chartre Acked-by: Peter Zijlstra Link: https://lkml.kernel.org/r/20200505134059.678201813@linutronix.de --- include/linux/module.h | 4 ++++ kernel/kprobes.c | 42 ++++++++++++++++++++++++++++++++++++++++++ kernel/module.c | 4 ++++ 3 files changed, 50 insertions(+) (limited to 'include/linux') diff --git a/include/linux/module.h b/include/linux/module.h index 1ad393e62bef..369c354f9207 100644 --- a/include/linux/module.h +++ b/include/linux/module.h @@ -489,6 +489,10 @@ struct module { unsigned int num_ftrace_callsites; unsigned long *ftrace_callsites; #endif +#ifdef CONFIG_KPROBES + void *kprobes_text_start; + unsigned int kprobes_text_size; +#endif #ifdef CONFIG_LIVEPATCH bool klp; /* Is this a livepatch module? */ diff --git a/kernel/kprobes.c b/kernel/kprobes.c index 570d60827656..b7549992b9bd 100644 --- a/kernel/kprobes.c +++ b/kernel/kprobes.c @@ -2179,6 +2179,19 @@ int kprobe_add_area_blacklist(unsigned long start, unsigned long end) return 0; } +/* Remove all symbols in given area from kprobe blacklist */ +static void kprobe_remove_area_blacklist(unsigned long start, unsigned long end) +{ + struct kprobe_blacklist_entry *ent, *n; + + list_for_each_entry_safe(ent, n, &kprobe_blacklist, list) { + if (ent->start_addr < start || ent->start_addr >= end) + continue; + list_del(&ent->list); + kfree(ent); + } +} + int __init __weak arch_populate_kprobe_blacklist(void) { return 0; @@ -2215,6 +2228,28 @@ static int __init populate_kprobe_blacklist(unsigned long *start, return ret ? 
: arch_populate_kprobe_blacklist(); } +static void add_module_kprobe_blacklist(struct module *mod) +{ + unsigned long start, end; + + start = (unsigned long)mod->kprobes_text_start; + if (start) { + end = start + mod->kprobes_text_size; + kprobe_add_area_blacklist(start, end); + } +} + +static void remove_module_kprobe_blacklist(struct module *mod) +{ + unsigned long start, end; + + start = (unsigned long)mod->kprobes_text_start; + if (start) { + end = start + mod->kprobes_text_size; + kprobe_remove_area_blacklist(start, end); + } +} + /* Module notifier call back, checking kprobes on the module */ static int kprobes_module_callback(struct notifier_block *nb, unsigned long val, void *data) @@ -2225,6 +2260,11 @@ static int kprobes_module_callback(struct notifier_block *nb, unsigned int i; int checkcore = (val == MODULE_STATE_GOING); + if (val == MODULE_STATE_COMING) { + mutex_lock(&kprobe_mutex); + add_module_kprobe_blacklist(mod); + mutex_unlock(&kprobe_mutex); + } if (val != MODULE_STATE_GOING && val != MODULE_STATE_LIVE) return NOTIFY_DONE; @@ -2255,6 +2295,8 @@ static int kprobes_module_callback(struct notifier_block *nb, kill_kprobe(p); } } + if (val == MODULE_STATE_GOING) + remove_module_kprobe_blacklist(mod); mutex_unlock(&kprobe_mutex); return NOTIFY_DONE; } diff --git a/kernel/module.c b/kernel/module.c index 646f1e2330d2..978f3fa40850 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -3193,6 +3193,10 @@ static int find_module_sections(struct module *mod, struct load_info *info) mod->ei_funcs = section_objs(info, "_error_injection_whitelist", sizeof(*mod->ei_funcs), &mod->num_ei_funcs); +#endif +#ifdef CONFIG_KPROBES + mod->kprobes_text_start = section_objs(info, ".kprobes.text", 1, + &mod->kprobes_text_size); #endif mod->extable = section_objs(info, "__ex_table", sizeof(*mod->extable), &mod->num_exentries); -- cgit v1.2.3 From 16db6264c93d2d7df9eb8be5d9eb717ab30105fe Mon Sep 17 00:00:00 2001 From: Masami Hiramatsu Date: Thu, 26 Mar 2020 23:50:00 +0900 Subject: kprobes: Support NOKPROBE_SYMBOL() in modules Support NOKPROBE_SYMBOL() in modules. NOKPROBE_SYMBOL() records only the symbol address in the "_kprobe_blacklist" section of the module. 
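Taken together with the previous patch, module code now has both annotations available. A minimal usage sketch (hypothetical module code, not taken from this series; the function names and bodies are placeholders):

  #include <linux/kprobes.h>
  #include <linux/module.h>

  /* Placed into the module's .kprobes.text section by the attribute,
   * so the previous patch blacklists it by address range.
   */
  static int __kprobes my_unprobeable_fast_path(void *data)
  {
          return 0;
  }

  /* Stays in normal .text; only its address is recorded in the
   * module's "_kprobe_blacklist" section, which this patch parses.
   */
  static int my_blacklisted_helper(void *data)
  {
          return 0;
  }
  NOKPROBE_SYMBOL(my_blacklisted_helper);
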
Signed-off-by: Masami Hiramatsu Signed-off-by: Thomas Gleixner Reviewed-by: Alexandre Chartre Acked-by: Peter Zijlstra Link: https://lkml.kernel.org/r/20200505134059.771170126@linutronix.de --- include/linux/module.h | 2 ++ kernel/kprobes.c | 17 +++++++++++++++++ kernel/module.c | 3 +++ 3 files changed, 22 insertions(+) (limited to 'include/linux') diff --git a/include/linux/module.h b/include/linux/module.h index 369c354f9207..1192097c9a69 100644 --- a/include/linux/module.h +++ b/include/linux/module.h @@ -492,6 +492,8 @@ struct module { #ifdef CONFIG_KPROBES void *kprobes_text_start; unsigned int kprobes_text_size; + unsigned long *kprobe_blacklist; + unsigned int num_kprobe_blacklist; #endif #ifdef CONFIG_LIVEPATCH diff --git a/kernel/kprobes.c b/kernel/kprobes.c index b7549992b9bd..9eb5acf0a9f3 100644 --- a/kernel/kprobes.c +++ b/kernel/kprobes.c @@ -2192,6 +2192,11 @@ static void kprobe_remove_area_blacklist(unsigned long start, unsigned long end) } } +static void kprobe_remove_ksym_blacklist(unsigned long entry) +{ + kprobe_remove_area_blacklist(entry, entry + 1); +} + int __init __weak arch_populate_kprobe_blacklist(void) { return 0; @@ -2231,6 +2236,12 @@ static int __init populate_kprobe_blacklist(unsigned long *start, static void add_module_kprobe_blacklist(struct module *mod) { unsigned long start, end; + int i; + + if (mod->kprobe_blacklist) { + for (i = 0; i < mod->num_kprobe_blacklist; i++) + kprobe_add_ksym_blacklist(mod->kprobe_blacklist[i]); + } start = (unsigned long)mod->kprobes_text_start; if (start) { @@ -2242,6 +2253,12 @@ static void add_module_kprobe_blacklist(struct module *mod) static void remove_module_kprobe_blacklist(struct module *mod) { unsigned long start, end; + int i; + + if (mod->kprobe_blacklist) { + for (i = 0; i < mod->num_kprobe_blacklist; i++) + kprobe_remove_ksym_blacklist(mod->kprobe_blacklist[i]); + } start = (unsigned long)mod->kprobes_text_start; if (start) { diff --git a/kernel/module.c b/kernel/module.c index 978f3fa40850..faf733789560 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -3197,6 +3197,9 @@ static int find_module_sections(struct module *mod, struct load_info *info) #ifdef CONFIG_KPROBES mod->kprobes_text_start = section_objs(info, ".kprobes.text", 1, &mod->kprobes_text_size); + mod->kprobe_blacklist = section_objs(info, "_kprobe_blacklist", + sizeof(unsigned long), + &mod->num_kprobe_blacklist); #endif mod->extable = section_objs(info, "__ex_table", sizeof(*mod->extable), &mod->num_exentries); -- cgit v1.2.3 From 1f66b0f0aec671f8fbc86d75b2efdf7c7e0f7880 Mon Sep 17 00:00:00 2001 From: Vladimir Oltean Date: Tue, 12 May 2020 20:20:26 +0300 Subject: net: dsa: tag_8021q: introduce a vid_is_dsa_8021q helper This function returns a boolean denoting whether the VLAN passed as argument is part of the 1024-3071 range that the dsa_8021q tagging scheme uses. Signed-off-by: Vladimir Oltean Reviewed-by: Florian Fainelli Signed-off-by: David S. 
Miller --- include/linux/dsa/8021q.h | 7 +++++++ net/dsa/tag_8021q.c | 7 +++++++ 2 files changed, 14 insertions(+) (limited to 'include/linux') diff --git a/include/linux/dsa/8021q.h b/include/linux/dsa/8021q.h index b8daaec0896e..ebc245ff838a 100644 --- a/include/linux/dsa/8021q.h +++ b/include/linux/dsa/8021q.h @@ -50,6 +50,8 @@ int dsa_8021q_rx_switch_id(u16 vid); int dsa_8021q_rx_source_port(u16 vid); +bool vid_is_dsa_8021q(u16 vid); + #else int dsa_port_setup_8021q_tagging(struct dsa_switch *ds, int index, @@ -107,6 +109,11 @@ int dsa_8021q_rx_source_port(u16 vid) return 0; } +bool vid_is_dsa_8021q(u16 vid) +{ + return false; +} + #endif /* IS_ENABLED(CONFIG_NET_DSA_TAG_8021Q) */ #endif /* _NET_DSA_8021Q_H */ diff --git a/net/dsa/tag_8021q.c b/net/dsa/tag_8021q.c index ff9c5bf64bda..4774ecd1f8fc 100644 --- a/net/dsa/tag_8021q.c +++ b/net/dsa/tag_8021q.c @@ -93,6 +93,13 @@ int dsa_8021q_rx_source_port(u16 vid) } EXPORT_SYMBOL_GPL(dsa_8021q_rx_source_port); +bool vid_is_dsa_8021q(u16 vid) +{ + return ((vid & DSA_8021Q_DIR_MASK) == DSA_8021Q_DIR_RX || + (vid & DSA_8021Q_DIR_MASK) == DSA_8021Q_DIR_TX); +} +EXPORT_SYMBOL_GPL(vid_is_dsa_8021q); + static int dsa_8021q_restore_pvid(struct dsa_switch *ds, int port) { struct bridge_vlan_info vinfo; -- cgit v1.2.3 From ec5ae61076d07be986df19773662506220757c9f Mon Sep 17 00:00:00 2001 From: Vladimir Oltean Date: Tue, 12 May 2020 20:20:29 +0300 Subject: net: dsa: sja1105: save/restore VLANs using a delta commit method Managing the VLAN table that is present in hardware will become very difficult once we add a third operating state (best_effort_vlan_filtering). That is because correct cleanup (not too little, not too much) becomes virtually impossible, when VLANs can be added from the bridge layer, from dsa_8021q for basic tagging, for cross-chip bridging, as well as retagging rules for sub-VLANs and cross-chip sub-VLANs. So we need to rethink VLAN interaction with the switch in a more scalable way. In preparation for that, use the priv->expect_dsa_8021q boolean to classify any VLAN request received through .port_vlan_add or .port_vlan_del towards either one of 2 internal lists: bridge VLANs and dsa_8021q VLANs. Then, implement a central sja1105_build_vlan_table method that creates a VLAN configuration from scratch based on the 2 lists of VLANs kept by the driver, and based on the VLAN awareness state. Currently, if we are VLAN-unaware, install the dsa_8021q VLANs, otherwise the bridge VLANs. Then, implement a delta commit procedure that identifies which VLANs from this new configuration are actually different from the config previously committed to hardware. We apply the delta through the dynamic configuration interface (we don't reset the switch). The result is that the hardware should see the exact sequence of operations as before this patch. This also helps remove the "br" argument passed to dsa_8021q_crosschip_bridge_join, which it was only using to figure out whether it should commit the configuration back to us or not, based on the VLAN awareness state of the bridge. We can simplify that, by always allowing those VLANs inside of our dsa_8021q_vlans list, and committing those to hardware when necessary. Signed-off-by: Vladimir Oltean Reviewed-by: Florian Fainelli Signed-off-by: David S. 
Miller --- drivers/net/dsa/sja1105/sja1105.h | 10 + drivers/net/dsa/sja1105/sja1105_main.c | 493 ++++++++++++++++++++++++--------- include/linux/dsa/8021q.h | 19 +- net/dsa/tag_8021q.c | 45 ++- 4 files changed, 393 insertions(+), 174 deletions(-) (limited to 'include/linux') diff --git a/drivers/net/dsa/sja1105/sja1105.h b/drivers/net/dsa/sja1105/sja1105.h index 667056d0c819..c80f1999c694 100644 --- a/drivers/net/dsa/sja1105/sja1105.h +++ b/drivers/net/dsa/sja1105/sja1105.h @@ -178,6 +178,14 @@ struct sja1105_flow_block { int num_virtual_links; }; +struct sja1105_bridge_vlan { + struct list_head list; + int port; + u16 vid; + bool pvid; + bool untagged; +}; + enum sja1105_vlan_state { SJA1105_VLAN_UNAWARE, SJA1105_VLAN_FILTERING_FULL, @@ -191,6 +199,8 @@ struct sja1105_private { struct gpio_desc *reset_gpio; struct spi_device *spidev; struct dsa_switch *ds; + struct list_head dsa_8021q_vlans; + struct list_head bridge_vlans; struct list_head crosschip_links; struct sja1105_flow_block flow_block; struct sja1105_port ports[SJA1105_NUM_PORTS]; diff --git a/drivers/net/dsa/sja1105/sja1105_main.c b/drivers/net/dsa/sja1105/sja1105_main.c index 8e68adba9144..fb95130299b1 100644 --- a/drivers/net/dsa/sja1105/sja1105_main.c +++ b/drivers/net/dsa/sja1105/sja1105_main.c @@ -303,7 +303,8 @@ static int sja1105_init_static_vlan(struct sja1105_private *priv) .tag_port = 0, .vlanid = 1, }; - int i; + struct dsa_switch *ds = priv->ds; + int port; table = &priv->static_config.tables[BLK_IDX_VLAN_LOOKUP]; @@ -324,12 +325,31 @@ static int sja1105_init_static_vlan(struct sja1105_private *priv) table->entry_count = 1; /* VLAN 1: all DT-defined ports are members; no restrictions on - * forwarding; always transmit priority-tagged frames as untagged. + * forwarding; always transmit as untagged. */ - for (i = 0; i < SJA1105_NUM_PORTS; i++) { - pvid.vmemb_port |= BIT(i); - pvid.vlan_bc |= BIT(i); - pvid.tag_port &= ~BIT(i); + for (port = 0; port < ds->num_ports; port++) { + struct sja1105_bridge_vlan *v; + + if (dsa_is_unused_port(ds, port)) + continue; + + pvid.vmemb_port |= BIT(port); + pvid.vlan_bc |= BIT(port); + pvid.tag_port &= ~BIT(port); + + /* Let traffic that don't need dsa_8021q (e.g. STP, PTP) be + * transmitted as untagged. + */ + v = kzalloc(sizeof(*v), GFP_KERNEL); + if (!v) + return -ENOMEM; + + v->port = port; + v->vid = 1; + v->untagged = true; + if (dsa_is_cpu_port(ds, port)) + v->pvid = true; + list_add(&v->list, &priv->dsa_8021q_vlans); } ((struct sja1105_vlan_lookup_entry *)table->entries)[0] = pvid; @@ -1717,82 +1737,6 @@ static int sja1105_pvid_apply(struct sja1105_private *priv, int port, u16 pvid) &mac[port], true); } -static int sja1105_is_vlan_configured(struct sja1105_private *priv, u16 vid) -{ - struct sja1105_vlan_lookup_entry *vlan; - int count, i; - - vlan = priv->static_config.tables[BLK_IDX_VLAN_LOOKUP].entries; - count = priv->static_config.tables[BLK_IDX_VLAN_LOOKUP].entry_count; - - for (i = 0; i < count; i++) - if (vlan[i].vlanid == vid) - return i; - - /* Return an invalid entry index if not found */ - return -1; -} - -static int sja1105_vlan_apply(struct sja1105_private *priv, int port, u16 vid, - bool enabled, bool untagged) -{ - struct sja1105_vlan_lookup_entry *vlan; - struct sja1105_table *table; - bool keep = true; - int match, rc; - - table = &priv->static_config.tables[BLK_IDX_VLAN_LOOKUP]; - - match = sja1105_is_vlan_configured(priv, vid); - if (match < 0) { - /* Can't delete a missing entry. 
*/ - if (!enabled) - return 0; - rc = sja1105_table_resize(table, table->entry_count + 1); - if (rc) - return rc; - match = table->entry_count - 1; - } - /* Assign pointer after the resize (it's new memory) */ - vlan = table->entries; - vlan[match].vlanid = vid; - if (enabled) { - vlan[match].vlan_bc |= BIT(port); - vlan[match].vmemb_port |= BIT(port); - } else { - vlan[match].vlan_bc &= ~BIT(port); - vlan[match].vmemb_port &= ~BIT(port); - } - /* Also unset tag_port if removing this VLAN was requested, - * just so we don't have a confusing bitmap (no practical purpose). - */ - if (untagged || !enabled) - vlan[match].tag_port &= ~BIT(port); - else - vlan[match].tag_port |= BIT(port); - /* If there's no port left as member of this VLAN, - * it's time for it to go. - */ - if (!vlan[match].vmemb_port) - keep = false; - - dev_dbg(priv->ds->dev, - "%s: port %d, vid %llu, broadcast domain 0x%llx, " - "port members 0x%llx, tagged ports 0x%llx, keep %d\n", - __func__, port, vlan[match].vlanid, vlan[match].vlan_bc, - vlan[match].vmemb_port, vlan[match].tag_port, keep); - - rc = sja1105_dynamic_config_write(priv, BLK_IDX_VLAN_LOOKUP, vid, - &vlan[match], keep); - if (rc < 0) - return rc; - - if (!keep) - return sja1105_table_delete_entry(table, match); - - return 0; -} - static int sja1105_crosschip_bridge_join(struct dsa_switch *ds, int tree_index, int sw_index, int other_port, struct net_device *br) @@ -1813,7 +1757,7 @@ static int sja1105_crosschip_bridge_join(struct dsa_switch *ds, other_priv->expect_dsa_8021q = true; rc = dsa_8021q_crosschip_bridge_join(ds, port, other_ds, - other_port, br, + other_port, &priv->crosschip_links); other_priv->expect_dsa_8021q = false; if (rc) @@ -1821,7 +1765,7 @@ static int sja1105_crosschip_bridge_join(struct dsa_switch *ds, priv->expect_dsa_8021q = true; rc = dsa_8021q_crosschip_bridge_join(other_ds, other_port, ds, - port, br, + port, &other_priv->crosschip_links); priv->expect_dsa_8021q = false; if (rc) @@ -1852,35 +1796,16 @@ static void sja1105_crosschip_bridge_leave(struct dsa_switch *ds, other_priv->expect_dsa_8021q = true; dsa_8021q_crosschip_bridge_leave(ds, port, other_ds, other_port, - br, &priv->crosschip_links); + &priv->crosschip_links); other_priv->expect_dsa_8021q = false; priv->expect_dsa_8021q = true; - dsa_8021q_crosschip_bridge_leave(other_ds, other_port, ds, - port, br, + dsa_8021q_crosschip_bridge_leave(other_ds, other_port, ds, port, &other_priv->crosschip_links); priv->expect_dsa_8021q = false; } } -static int sja1105_replay_crosschip_vlans(struct dsa_switch *ds, bool enabled) -{ - struct sja1105_private *priv = ds->priv; - struct dsa_8021q_crosschip_link *c; - int rc; - - list_for_each_entry(c, &priv->crosschip_links, list) { - priv->expect_dsa_8021q = true; - rc = dsa_8021q_crosschip_link_apply(ds, c->port, c->other_ds, - c->other_port, enabled); - priv->expect_dsa_8021q = false; - if (rc) - break; - } - - return rc; -} - static int sja1105_setup_8021q_tagging(struct dsa_switch *ds, bool enabled) { struct sja1105_private *priv = ds->priv; @@ -1896,11 +1821,6 @@ static int sja1105_setup_8021q_tagging(struct dsa_switch *ds, bool enabled) return rc; } } - rc = sja1105_replay_crosschip_vlans(ds, enabled); - if (rc) { - dev_err(ds->dev, "Failed to replay crosschip VLANs: %d\n", rc); - return rc; - } dev_info(ds->dev, "%s switch tagging\n", enabled ? 
"Enabled" : "Disabled"); @@ -1914,6 +1834,269 @@ sja1105_get_tag_protocol(struct dsa_switch *ds, int port, return DSA_TAG_PROTO_SJA1105; } +static int sja1105_is_vlan_configured(struct sja1105_private *priv, u16 vid) +{ + struct sja1105_vlan_lookup_entry *vlan; + int count, i; + + vlan = priv->static_config.tables[BLK_IDX_VLAN_LOOKUP].entries; + count = priv->static_config.tables[BLK_IDX_VLAN_LOOKUP].entry_count; + + for (i = 0; i < count; i++) + if (vlan[i].vlanid == vid) + return i; + + /* Return an invalid entry index if not found */ + return -1; +} + +static int sja1105_commit_vlans(struct sja1105_private *priv, + struct sja1105_vlan_lookup_entry *new_vlan) +{ + struct sja1105_vlan_lookup_entry *vlan; + struct sja1105_table *table; + int num_vlans = 0; + int rc, i, k = 0; + + /* VLAN table */ + table = &priv->static_config.tables[BLK_IDX_VLAN_LOOKUP]; + vlan = table->entries; + + for (i = 0; i < VLAN_N_VID; i++) { + int match = sja1105_is_vlan_configured(priv, i); + + if (new_vlan[i].vlanid != VLAN_N_VID) + num_vlans++; + + if (new_vlan[i].vlanid == VLAN_N_VID && match >= 0) { + /* Was there before, no longer is. Delete */ + dev_dbg(priv->ds->dev, "Deleting VLAN %d\n", i); + rc = sja1105_dynamic_config_write(priv, + BLK_IDX_VLAN_LOOKUP, + i, &vlan[match], false); + if (rc < 0) + return rc; + } else if (new_vlan[i].vlanid != VLAN_N_VID) { + /* Nothing changed, don't do anything */ + if (match >= 0 && + vlan[match].vlanid == new_vlan[i].vlanid && + vlan[match].tag_port == new_vlan[i].tag_port && + vlan[match].vlan_bc == new_vlan[i].vlan_bc && + vlan[match].vmemb_port == new_vlan[i].vmemb_port) + continue; + /* Update entry */ + dev_dbg(priv->ds->dev, "Updating VLAN %d\n", i); + rc = sja1105_dynamic_config_write(priv, + BLK_IDX_VLAN_LOOKUP, + i, &new_vlan[i], + true); + if (rc < 0) + return rc; + } + } + + if (table->entry_count) + kfree(table->entries); + + table->entries = kcalloc(num_vlans, table->ops->unpacked_entry_size, + GFP_KERNEL); + if (!table->entries) + return -ENOMEM; + + table->entry_count = num_vlans; + vlan = table->entries; + + for (i = 0; i < VLAN_N_VID; i++) { + if (new_vlan[i].vlanid == VLAN_N_VID) + continue; + vlan[k++] = new_vlan[i]; + } + + return 0; +} + +struct sja1105_crosschip_switch { + struct list_head list; + struct dsa_switch *other_ds; +}; + +static int sja1105_commit_pvid(struct sja1105_private *priv) +{ + struct sja1105_bridge_vlan *v; + struct list_head *vlan_list; + int rc = 0; + + if (priv->vlan_state == SJA1105_VLAN_FILTERING_FULL) + vlan_list = &priv->bridge_vlans; + else + vlan_list = &priv->dsa_8021q_vlans; + + list_for_each_entry(v, vlan_list, list) { + if (v->pvid) { + rc = sja1105_pvid_apply(priv, v->port, v->vid); + if (rc) + break; + } + } + + return rc; +} + +static int +sja1105_build_bridge_vlans(struct sja1105_private *priv, + struct sja1105_vlan_lookup_entry *new_vlan) +{ + struct sja1105_bridge_vlan *v; + + if (priv->vlan_state == SJA1105_VLAN_UNAWARE) + return 0; + + list_for_each_entry(v, &priv->bridge_vlans, list) { + int match = v->vid; + + new_vlan[match].vlanid = v->vid; + new_vlan[match].vmemb_port |= BIT(v->port); + new_vlan[match].vlan_bc |= BIT(v->port); + if (!v->untagged) + new_vlan[match].tag_port |= BIT(v->port); + } + + return 0; +} + +static int +sja1105_build_dsa_8021q_vlans(struct sja1105_private *priv, + struct sja1105_vlan_lookup_entry *new_vlan) +{ + struct sja1105_bridge_vlan *v; + + if (priv->vlan_state == SJA1105_VLAN_FILTERING_FULL) + return 0; + + list_for_each_entry(v, &priv->dsa_8021q_vlans, list) { + int 
match = v->vid; + + new_vlan[match].vlanid = v->vid; + new_vlan[match].vmemb_port |= BIT(v->port); + new_vlan[match].vlan_bc |= BIT(v->port); + if (!v->untagged) + new_vlan[match].tag_port |= BIT(v->port); + } + + return 0; +} + +static int sja1105_build_vlan_table(struct sja1105_private *priv, bool notify); + +static int sja1105_notify_crosschip_switches(struct sja1105_private *priv) +{ + struct sja1105_crosschip_switch *s, *pos; + struct list_head crosschip_switches; + struct dsa_8021q_crosschip_link *c; + int rc = 0; + + INIT_LIST_HEAD(&crosschip_switches); + + list_for_each_entry(c, &priv->crosschip_links, list) { + bool already_added = false; + + list_for_each_entry(s, &crosschip_switches, list) { + if (s->other_ds == c->other_ds) { + already_added = true; + break; + } + } + + if (already_added) + continue; + + s = kzalloc(sizeof(*s), GFP_KERNEL); + if (!s) { + dev_err(priv->ds->dev, "Failed to allocate memory\n"); + rc = -ENOMEM; + goto out; + } + s->other_ds = c->other_ds; + list_add(&s->list, &crosschip_switches); + } + + list_for_each_entry(s, &crosschip_switches, list) { + struct sja1105_private *other_priv = s->other_ds->priv; + + rc = sja1105_build_vlan_table(other_priv, false); + if (rc) + goto out; + } + +out: + list_for_each_entry_safe(s, pos, &crosschip_switches, list) { + list_del(&s->list); + kfree(s); + } + + return rc; +} + +static int sja1105_build_vlan_table(struct sja1105_private *priv, bool notify) +{ + struct sja1105_vlan_lookup_entry *new_vlan; + struct sja1105_table *table; + int rc; + int i; + + table = &priv->static_config.tables[BLK_IDX_VLAN_LOOKUP]; + new_vlan = kcalloc(VLAN_N_VID, + table->ops->unpacked_entry_size, GFP_KERNEL); + if (!new_vlan) + return -ENOMEM; + + for (i = 0; i < VLAN_N_VID; i++) + new_vlan[i].vlanid = VLAN_N_VID; + + /* Bridge VLANs */ + rc = sja1105_build_bridge_vlans(priv, new_vlan); + if (rc) + goto out; + + /* VLANs necessary for dsa_8021q operation, given to us by tag_8021q.c: + * - RX VLANs + * - TX VLANs + * - Crosschip links + */ + rc = sja1105_build_dsa_8021q_vlans(priv, new_vlan); + if (rc) + goto out; + + rc = sja1105_commit_vlans(priv, new_vlan); + if (rc) + goto out; + + rc = sja1105_commit_pvid(priv); + if (rc) + goto out; + + if (notify) { + rc = sja1105_notify_crosschip_switches(priv); + if (rc) + goto out; + } + +out: + kfree(new_vlan); + + return rc; +} + +/* Select the list to which we should add this VLAN. 
*/ +static struct list_head *sja1105_classify_vlan(struct sja1105_private *priv, + u16 vid) +{ + if (priv->expect_dsa_8021q) + return &priv->dsa_8021q_vlans; + + return &priv->bridge_vlans; +} + static int sja1105_vlan_prepare(struct dsa_switch *ds, int port, const struct switchdev_obj_port_vlan *vlan) { @@ -2026,45 +2209,80 @@ static void sja1105_vlan_add(struct dsa_switch *ds, int port, const struct switchdev_obj_port_vlan *vlan) { struct sja1105_private *priv = ds->priv; + bool vlan_table_changed = false; u16 vid; int rc; for (vid = vlan->vid_begin; vid <= vlan->vid_end; vid++) { - rc = sja1105_vlan_apply(priv, port, vid, true, vlan->flags & - BRIDGE_VLAN_INFO_UNTAGGED); - if (rc < 0) { - dev_err(ds->dev, "Failed to add VLAN %d to port %d: %d\n", - vid, port, rc); - return; - } - if (vlan->flags & BRIDGE_VLAN_INFO_PVID) { - rc = sja1105_pvid_apply(ds->priv, port, vid); - if (rc < 0) { - dev_err(ds->dev, "Failed to set pvid %d on port %d: %d\n", - vid, port, rc); - return; + bool untagged = vlan->flags & BRIDGE_VLAN_INFO_UNTAGGED; + bool pvid = vlan->flags & BRIDGE_VLAN_INFO_PVID; + struct sja1105_bridge_vlan *v; + struct list_head *vlan_list; + bool already_added = false; + + vlan_list = sja1105_classify_vlan(priv, vid); + + list_for_each_entry(v, vlan_list, list) { + if (v->port == port && v->vid == vid && + v->untagged == untagged && v->pvid == pvid) { + already_added = true; + break; } } + + if (already_added) + continue; + + v = kzalloc(sizeof(*v), GFP_KERNEL); + if (!v) { + dev_err(ds->dev, "Out of memory while storing VLAN\n"); + return; + } + + v->port = port; + v->vid = vid; + v->untagged = untagged; + v->pvid = pvid; + list_add(&v->list, vlan_list); + + vlan_table_changed = true; } + + if (!vlan_table_changed) + return; + + rc = sja1105_build_vlan_table(priv, true); + if (rc) + dev_err(ds->dev, "Failed to build VLAN table: %d\n", rc); } static int sja1105_vlan_del(struct dsa_switch *ds, int port, const struct switchdev_obj_port_vlan *vlan) { struct sja1105_private *priv = ds->priv; + bool vlan_table_changed = false; u16 vid; - int rc; for (vid = vlan->vid_begin; vid <= vlan->vid_end; vid++) { - rc = sja1105_vlan_apply(priv, port, vid, false, vlan->flags & - BRIDGE_VLAN_INFO_UNTAGGED); - if (rc < 0) { - dev_err(ds->dev, "Failed to remove VLAN %d from port %d: %d\n", - vid, port, rc); - return rc; + struct sja1105_bridge_vlan *v, *n; + struct list_head *vlan_list; + + vlan_list = sja1105_classify_vlan(priv, vid); + + list_for_each_entry_safe(v, n, vlan_list, list) { + if (v->port == port && v->vid == vid) { + list_del(&v->list); + kfree(v); + vlan_table_changed = true; + break; + } } } - return 0; + + if (!vlan_table_changed) + return 0; + + return sja1105_build_vlan_table(priv, true); } /* The programming model for the SJA1105 switch is "all-at-once" via static @@ -2142,6 +2360,7 @@ static int sja1105_setup(struct dsa_switch *ds) static void sja1105_teardown(struct dsa_switch *ds) { struct sja1105_private *priv = ds->priv; + struct sja1105_bridge_vlan *v, *n; int port; for (port = 0; port < SJA1105_NUM_PORTS; port++) { @@ -2158,6 +2377,16 @@ static void sja1105_teardown(struct dsa_switch *ds) sja1105_tas_teardown(ds); sja1105_ptp_clock_unregister(ds); sja1105_static_config_free(&priv->static_config); + + list_for_each_entry_safe(v, n, &priv->dsa_8021q_vlans, list) { + list_del(&v->list); + kfree(v); + } + + list_for_each_entry_safe(v, n, &priv->bridge_vlans, list) { + list_del(&v->list); + kfree(v); + } } static int sja1105_port_enable(struct dsa_switch *ds, int port, @@ -2598,6 
+2827,8 @@ static int sja1105_probe(struct spi_device *spi) mutex_init(&priv->mgmt_lock); INIT_LIST_HEAD(&priv->crosschip_links); + INIT_LIST_HEAD(&priv->bridge_vlans); + INIT_LIST_HEAD(&priv->dsa_8021q_vlans); sja1105_tas_setup(ds); sja1105_flower_setup(ds); diff --git a/include/linux/dsa/8021q.h b/include/linux/dsa/8021q.h index ebc245ff838a..404bd2cce642 100644 --- a/include/linux/dsa/8021q.h +++ b/include/linux/dsa/8021q.h @@ -25,18 +25,14 @@ struct dsa_8021q_crosschip_link { int dsa_port_setup_8021q_tagging(struct dsa_switch *ds, int index, bool enabled); -int dsa_8021q_crosschip_link_apply(struct dsa_switch *ds, int port, - struct dsa_switch *other_ds, - int other_port, bool enabled); - int dsa_8021q_crosschip_bridge_join(struct dsa_switch *ds, int port, struct dsa_switch *other_ds, - int other_port, struct net_device *br, + int other_port, struct list_head *crosschip_links); int dsa_8021q_crosschip_bridge_leave(struct dsa_switch *ds, int port, struct dsa_switch *other_ds, - int other_port, struct net_device *br, + int other_port, struct list_head *crosschip_links); struct sk_buff *dsa_8021q_xmit(struct sk_buff *skb, struct net_device *netdev, @@ -60,16 +56,9 @@ int dsa_port_setup_8021q_tagging(struct dsa_switch *ds, int index, return 0; } -int dsa_8021q_crosschip_link_apply(struct dsa_switch *ds, int port, - struct dsa_switch *other_ds, - int other_port, bool enabled) -{ - return 0; -} - int dsa_8021q_crosschip_bridge_join(struct dsa_switch *ds, int port, struct dsa_switch *other_ds, - int other_port, struct net_device *br, + int other_port, struct list_head *crosschip_links) { return 0; @@ -77,7 +66,7 @@ int dsa_8021q_crosschip_bridge_join(struct dsa_switch *ds, int port, int dsa_8021q_crosschip_bridge_leave(struct dsa_switch *ds, int port, struct dsa_switch *other_ds, - int other_port, struct net_device *br, + int other_port, struct list_head *crosschip_links) { return 0; diff --git a/net/dsa/tag_8021q.c b/net/dsa/tag_8021q.c index 4774ecd1f8fc..3236fbbf85b9 100644 --- a/net/dsa/tag_8021q.c +++ b/net/dsa/tag_8021q.c @@ -296,9 +296,9 @@ int dsa_port_setup_8021q_tagging(struct dsa_switch *ds, int port, bool enabled) } EXPORT_SYMBOL_GPL(dsa_port_setup_8021q_tagging); -int dsa_8021q_crosschip_link_apply(struct dsa_switch *ds, int port, - struct dsa_switch *other_ds, - int other_port, bool enabled) +static int dsa_8021q_crosschip_link_apply(struct dsa_switch *ds, int port, + struct dsa_switch *other_ds, + int other_port, bool enabled) { u16 rx_vid = dsa_8021q_rx_vid(ds, port); @@ -308,7 +308,6 @@ int dsa_8021q_crosschip_link_apply(struct dsa_switch *ds, int port, return dsa_8021q_vid_apply(other_ds, other_port, rx_vid, BRIDGE_VLAN_INFO_UNTAGGED, enabled); } -EXPORT_SYMBOL_GPL(dsa_8021q_crosschip_link_apply); static int dsa_8021q_crosschip_link_add(struct dsa_switch *ds, int port, struct dsa_switch *other_ds, @@ -369,7 +368,7 @@ static void dsa_8021q_crosschip_link_del(struct dsa_switch *ds, */ int dsa_8021q_crosschip_bridge_join(struct dsa_switch *ds, int port, struct dsa_switch *other_ds, - int other_port, struct net_device *br, + int other_port, struct list_head *crosschip_links) { /* @other_upstream is how @other_ds reaches us. 
If we are part @@ -385,12 +384,10 @@ int dsa_8021q_crosschip_bridge_join(struct dsa_switch *ds, int port, if (rc) return rc; - if (!br_vlan_enabled(br)) { - rc = dsa_8021q_crosschip_link_apply(ds, port, other_ds, - other_port, true); - if (rc) - return rc; - } + rc = dsa_8021q_crosschip_link_apply(ds, port, other_ds, + other_port, true); + if (rc) + return rc; rc = dsa_8021q_crosschip_link_add(ds, port, other_ds, other_upstream, @@ -398,20 +395,14 @@ int dsa_8021q_crosschip_bridge_join(struct dsa_switch *ds, int port, if (rc) return rc; - if (!br_vlan_enabled(br)) { - rc = dsa_8021q_crosschip_link_apply(ds, port, other_ds, - other_upstream, true); - if (rc) - return rc; - } - - return 0; + return dsa_8021q_crosschip_link_apply(ds, port, other_ds, + other_upstream, true); } EXPORT_SYMBOL_GPL(dsa_8021q_crosschip_bridge_join); int dsa_8021q_crosschip_bridge_leave(struct dsa_switch *ds, int port, struct dsa_switch *other_ds, - int other_port, struct net_device *br, + int other_port, struct list_head *crosschip_links) { int other_upstream = dsa_upstream_port(other_ds, other_port); @@ -431,14 +422,12 @@ int dsa_8021q_crosschip_bridge_leave(struct dsa_switch *ds, int port, if (keep) continue; - if (!br_vlan_enabled(br)) { - rc = dsa_8021q_crosschip_link_apply(ds, port, - other_ds, - other_port, - false); - if (rc) - return rc; - } + rc = dsa_8021q_crosschip_link_apply(ds, port, + other_ds, + other_port, + false); + if (rc) + return rc; } } -- cgit v1.2.3 From 38b5beeae7a4cde87edabb0196fac1f55ae668ee Mon Sep 17 00:00:00 2001 From: Vladimir Oltean Date: Tue, 12 May 2020 20:20:32 +0300 Subject: net: dsa: sja1105: prepare tagger for handling DSA tags and VLAN simultaneously In VLAN-unaware mode, sja1105 uses VLAN tags with a custom TPID of 0xdadb. While in the yet-to-be introduced best_effort_vlan_filtering mode, it needs to work with normal VLAN TPID values. A complication arises when we must transmit a VLAN-tagged packet to the switch when it's in VLAN-aware mode. We need to construct a packet with 2 VLAN tags, and the switch will use the outer header for routing and pop it on egress. But sadly, here the 2 hardware generations don't behave the same: - E/T switches won't pop an ETH_P_8021AD tag on egress, it seems (packets will remain double-tagged). - P/Q/R/S switches will drop a packet with 2 ETH_P_8021Q tags (it looks like it tries to prevent VLAN hopping). But looks like the reverse is also true: - E/T switches have no problem popping the outer tag from packets with 2 ETH_P_8021Q tags. - P/Q/R/S will have no problem popping a single tag even if that is ETH_P_8021AD. So it is clear that if we want the hardware to work with dsa_8021q tagging in VLAN-aware mode, we need to send different TPIDs depending on revision. Keep that information in priv->info->qinq_tpid. The per-port tagger structure will hold an xmit_tpid value that depends not only upon the qinq_tpid, but also upon the VLAN awareness state itself (in case we must transmit using 0xdadb). Signed-off-by: Vladimir Oltean Reviewed-by: Florian Fainelli Signed-off-by: David S. 
Miller --- drivers/net/dsa/sja1105/sja1105.h | 6 ++++++ drivers/net/dsa/sja1105/sja1105_main.c | 10 ++++++++++ drivers/net/dsa/sja1105/sja1105_spi.c | 6 ++++++ include/linux/dsa/sja1105.h | 1 + net/dsa/tag_sja1105.c | 32 +++++++++++++++++++++----------- 5 files changed, 44 insertions(+), 11 deletions(-) (limited to 'include/linux') diff --git a/drivers/net/dsa/sja1105/sja1105.h b/drivers/net/dsa/sja1105/sja1105.h index c80f1999c694..a019ffae38f1 100644 --- a/drivers/net/dsa/sja1105/sja1105.h +++ b/drivers/net/dsa/sja1105/sja1105.h @@ -87,6 +87,12 @@ struct sja1105_info { const struct sja1105_dynamic_table_ops *dyn_ops; const struct sja1105_table_ops *static_ops; const struct sja1105_regs *regs; + /* Both E/T and P/Q/R/S have quirks when it comes to popping the S-Tag + * from double-tagged frames. E/T will pop it only when it's equal to + * TPID from the General Parameters Table, while P/Q/R/S will only + * pop it when it's equal to TPID2. + */ + u16 qinq_tpid; int (*reset_cmd)(struct dsa_switch *ds); int (*setup_rgmii_delay)(const void *ctx, int port); /* Prototypes from include/net/dsa.h */ diff --git a/drivers/net/dsa/sja1105/sja1105_main.c b/drivers/net/dsa/sja1105/sja1105_main.c index 7b9c3db98e1d..b7e4a85caade 100644 --- a/drivers/net/dsa/sja1105/sja1105_main.c +++ b/drivers/net/dsa/sja1105/sja1105_main.c @@ -2153,6 +2153,15 @@ static int sja1105_vlan_filtering(struct dsa_switch *ds, int port, bool enabled) tpid2 = ETH_P_SJA1105; } + for (port = 0; port < ds->num_ports; port++) { + struct sja1105_port *sp = &priv->ports[port]; + + if (enabled) + sp->xmit_tpid = priv->info->qinq_tpid; + else + sp->xmit_tpid = ETH_P_SJA1105; + } + if (!enabled) state = SJA1105_VLAN_UNAWARE; else @@ -2866,6 +2875,7 @@ static int sja1105_probe(struct spi_device *spi) goto out; } skb_queue_head_init(&sp->xmit_queue); + sp->xmit_tpid = ETH_P_SJA1105; } return 0; diff --git a/drivers/net/dsa/sja1105/sja1105_spi.c b/drivers/net/dsa/sja1105/sja1105_spi.c index 0be75c49e6c3..a0dacae803cc 100644 --- a/drivers/net/dsa/sja1105/sja1105_spi.c +++ b/drivers/net/dsa/sja1105/sja1105_spi.c @@ -512,6 +512,7 @@ struct sja1105_info sja1105e_info = { .part_no = SJA1105ET_PART_NO, .static_ops = sja1105e_table_ops, .dyn_ops = sja1105et_dyn_ops, + .qinq_tpid = ETH_P_8021Q, .ptp_ts_bits = 24, .ptpegr_ts_bytes = 4, .reset_cmd = sja1105et_reset_cmd, @@ -526,6 +527,7 @@ struct sja1105_info sja1105t_info = { .part_no = SJA1105ET_PART_NO, .static_ops = sja1105t_table_ops, .dyn_ops = sja1105et_dyn_ops, + .qinq_tpid = ETH_P_8021Q, .ptp_ts_bits = 24, .ptpegr_ts_bytes = 4, .reset_cmd = sja1105et_reset_cmd, @@ -540,6 +542,7 @@ struct sja1105_info sja1105p_info = { .part_no = SJA1105P_PART_NO, .static_ops = sja1105p_table_ops, .dyn_ops = sja1105pqrs_dyn_ops, + .qinq_tpid = ETH_P_8021AD, .ptp_ts_bits = 32, .ptpegr_ts_bytes = 8, .setup_rgmii_delay = sja1105pqrs_setup_rgmii_delay, @@ -555,6 +558,7 @@ struct sja1105_info sja1105q_info = { .part_no = SJA1105Q_PART_NO, .static_ops = sja1105q_table_ops, .dyn_ops = sja1105pqrs_dyn_ops, + .qinq_tpid = ETH_P_8021AD, .ptp_ts_bits = 32, .ptpegr_ts_bytes = 8, .setup_rgmii_delay = sja1105pqrs_setup_rgmii_delay, @@ -570,6 +574,7 @@ struct sja1105_info sja1105r_info = { .part_no = SJA1105R_PART_NO, .static_ops = sja1105r_table_ops, .dyn_ops = sja1105pqrs_dyn_ops, + .qinq_tpid = ETH_P_8021AD, .ptp_ts_bits = 32, .ptpegr_ts_bytes = 8, .setup_rgmii_delay = sja1105pqrs_setup_rgmii_delay, @@ -586,6 +591,7 @@ struct sja1105_info sja1105s_info = { .static_ops = sja1105s_table_ops, .dyn_ops = sja1105pqrs_dyn_ops, 
.regs = &sja1105pqrs_regs, + .qinq_tpid = ETH_P_8021AD, .ptp_ts_bits = 32, .ptpegr_ts_bytes = 8, .setup_rgmii_delay = sja1105pqrs_setup_rgmii_delay, diff --git a/include/linux/dsa/sja1105.h b/include/linux/dsa/sja1105.h index fa5735c353cd..f821d08b1b5f 100644 --- a/include/linux/dsa/sja1105.h +++ b/include/linux/dsa/sja1105.h @@ -59,6 +59,7 @@ struct sja1105_port { struct sja1105_tagger_data *data; struct dsa_port *dp; bool hwts_tx_en; + u16 xmit_tpid; }; #endif /* _NET_DSA_SJA1105_H */ diff --git a/net/dsa/tag_sja1105.c b/net/dsa/tag_sja1105.c index 5ecac5921a7d..398e2b9a1b96 100644 --- a/net/dsa/tag_sja1105.c +++ b/net/dsa/tag_sja1105.c @@ -69,12 +69,25 @@ static inline bool sja1105_is_meta_frame(const struct sk_buff *skb) return true; } +static bool sja1105_can_use_vlan_as_tags(const struct sk_buff *skb) +{ + struct vlan_ethhdr *hdr = vlan_eth_hdr(skb); + + if (hdr->h_vlan_proto == ntohs(ETH_P_SJA1105)) + return true; + + if (hdr->h_vlan_proto != ntohs(ETH_P_8021Q)) + return false; + + return vid_is_dsa_8021q(ntohs(hdr->h_vlan_TCI) & VLAN_VID_MASK); +} + /* This is the first time the tagger sees the frame on RX. * Figure out if we can decode it. */ static bool sja1105_filter(const struct sk_buff *skb, struct net_device *dev) { - if (!dsa_port_is_vlan_filtering(dev->dsa_ptr)) + if (sja1105_can_use_vlan_as_tags(skb)) return true; if (sja1105_is_link_local(skb)) return true; @@ -96,6 +109,11 @@ static struct sk_buff *sja1105_defer_xmit(struct sja1105_port *sp, return NULL; } +static u16 sja1105_xmit_tpid(struct sja1105_port *sp) +{ + return sp->xmit_tpid; +} + static struct sk_buff *sja1105_xmit(struct sk_buff *skb, struct net_device *netdev) { @@ -111,15 +129,7 @@ static struct sk_buff *sja1105_xmit(struct sk_buff *skb, if (unlikely(sja1105_is_link_local(skb))) return sja1105_defer_xmit(dp->priv, skb); - /* If we are under a vlan_filtering bridge, IP termination on - * switch ports based on 802.1Q tags is simply too brittle to - * be passable. So just defer to the dsa_slave_notag_xmit - * implementation. - */ - if (dsa_port_is_vlan_filtering(dp)) - return skb; - - return dsa_8021q_xmit(skb, netdev, ETH_P_SJA1105, + return dsa_8021q_xmit(skb, netdev, sja1105_xmit_tpid(dp->priv), ((pcp << VLAN_PRIO_SHIFT) | tx_vid)); } @@ -258,7 +268,7 @@ static struct sk_buff *sja1105_rcv(struct sk_buff *skb, hdr = eth_hdr(skb); tpid = ntohs(hdr->h_proto); - is_tagged = (tpid == ETH_P_SJA1105); + is_tagged = (tpid == ETH_P_SJA1105 || tpid == ETH_P_8021Q); is_link_local = sja1105_is_link_local(skb); is_meta = sja1105_is_meta_frame(skb); -- cgit v1.2.3 From 3eaae1d05f2b5be1be834bfad64f8fc2ad39a56d Mon Sep 17 00:00:00 2001 From: Vladimir Oltean Date: Tue, 12 May 2020 20:20:33 +0300 Subject: net: dsa: tag_8021q: support up to 8 VLANs per port using sub-VLANs For switches that support VLAN retagging, such as sja1105, we extend dsa_8021q by encoding a "sub-VLAN" into the remaining 3 free bits in the dsa_8021q tag. A sub-VLAN is nothing more than a number in the range 0-7, which serves as an index into a per-port driver lookup table. The sub-VLAN value of zero means that traffic is untagged (this is also backwards-compatible with dsa_8021q without retagging). The switch should be configured to retag VLAN-tagged traffic that gets transmitted towards the CPU port (and towards the CPU only). Example: bridge vlan add dev sw1p0 vid 100 The switch retags frames received on port 0, going to the CPU, and having VID 100, to the VID of 1104 (0x0450). 
In dsa_8021q language:

 | 11  | 10  |  9  |  8  |  7  |  6  |  5  |  4  |  3  |  2  |  1  |  0  |
 +-----------+-----+-----------------+-----------+-----------------------+
 |    DIR    | SVL |    SWITCH_ID    |  SUBVLAN  |          PORT         |
 +-----------+-----+-----------------+-----------+-----------------------+

0x0450 means:
 - DIR = 0b01: this is an RX VLAN
 - SUBVLAN = 0b001: this is subvlan #1
 - SWITCH_ID = 0b001: this is switch 1 (see the name "sw1p0")
 - PORT = 0b0000: this is port 0 (see the name "sw1p0")

The driver also remembers the "1 -> 100" mapping. In the hotpath, if the sub-VLAN from the tag encodes a non-untagged frame, this mapping is used to create a VLAN hwaccel tag, with the value of 100. Signed-off-by: Vladimir Oltean Reviewed-by: Florian Fainelli Signed-off-by: David S. Miller --- include/linux/dsa/8021q.h | 16 ++++++++++++ net/dsa/tag_8021q.c | 56 ++++++++++++++++++++++++++++++++++++++++------- 2 files changed, 64 insertions(+), 8 deletions(-) (limited to 'include/linux') diff --git a/include/linux/dsa/8021q.h b/include/linux/dsa/8021q.h index 404bd2cce642..311aa04e7520 100644 --- a/include/linux/dsa/8021q.h +++ b/include/linux/dsa/8021q.h @@ -20,6 +20,8 @@ struct dsa_8021q_crosschip_link { refcount_t refcount; }; +#define DSA_8021Q_N_SUBVLAN 8 + #if IS_ENABLED(CONFIG_NET_DSA_TAG_8021Q) int dsa_port_setup_8021q_tagging(struct dsa_switch *ds, int index, @@ -42,10 +44,14 @@ u16 dsa_8021q_tx_vid(struct dsa_switch *ds, int port); u16 dsa_8021q_rx_vid(struct dsa_switch *ds, int port); +u16 dsa_8021q_rx_vid_subvlan(struct dsa_switch *ds, int port, u16 subvlan); + int dsa_8021q_rx_switch_id(u16 vid); int dsa_8021q_rx_source_port(u16 vid); +u16 dsa_8021q_rx_subvlan(u16 vid); + bool vid_is_dsa_8021q(u16 vid); #else @@ -88,6 +94,11 @@ u16 dsa_8021q_rx_vid(struct dsa_switch *ds, int port) return 0; } +u16 dsa_8021q_rx_vid_subvlan(struct dsa_switch *ds, int port, u16 subvlan) +{ + return 0; +} + int dsa_8021q_rx_switch_id(u16 vid) { return 0; @@ -98,6 +109,11 @@ int dsa_8021q_rx_source_port(u16 vid) return 0; } +u16 dsa_8021q_rx_subvlan(u16 vid) +{ + return 0; +} + bool vid_is_dsa_8021q(u16 vid) { return false; diff --git a/net/dsa/tag_8021q.c b/net/dsa/tag_8021q.c index 3236fbbf85b9..3052da668156 100644 --- a/net/dsa/tag_8021q.c +++ b/net/dsa/tag_8021q.c @@ -17,7 +17,7 @@ * * | 11 | 10 | 9 | 8 | 7 | 6 | 5 | 4 | 3 | 2 | 1 | 0 | * +-----------+-----+-----------------+-----------+-----------------------+ - * | DIR | RSV | SWITCH_ID | RSV | PORT | + * | DIR | SVL | SWITCH_ID | SUBVLAN | PORT | * +-----------+-----+-----------------+-----------+-----------------------+ * * DIR - VID[11:10]: @@ -27,17 +27,24 @@ * These values make the special VIDs of 0, 1 and 4095 to be left * unused by this coding scheme. * - * RSV - VID[9]: - * To be used for further expansion of SWITCH_ID or for other purposes. - * Must be transmitted as zero and ignored on receive. + * SVL/SUBVLAN - { VID[9], VID[5:4] }: + * Sub-VLAN encoding. Valid only when DIR indicates an RX VLAN. + * * 0 (0b000): Field does not encode a sub-VLAN, either because + * received traffic is untagged, PVID-tagged or because a second + * VLAN tag is present after this tag and not inside of it. + * * 1 (0b001): Received traffic is tagged with a VID value private + * to the host. This field encodes the index in the host's lookup + * table through which the value of the ingress VLAN ID can be + * recovered. + * * 2 (0b010): Field encodes a sub-VLAN. + * ... + * * 7 (0b111): Field encodes a sub-VLAN. 
+ * When DIR indicates a TX VLAN, SUBVLAN must be transmitted as zero + * (by the host) and ignored on receive (by the switch). * * SWITCH_ID - VID[8:6]: * Index of switch within DSA tree. Must be between 0 and 7. * - * RSV - VID[5:4]: - * To be used for further expansion of PORT or for other purposes. - * Must be transmitted as zero and ignored on receive. - * * PORT - VID[3:0]: * Index of switch port. Must be between 0 and 15. */ @@ -54,6 +61,18 @@ #define DSA_8021Q_SWITCH_ID(x) (((x) << DSA_8021Q_SWITCH_ID_SHIFT) & \ DSA_8021Q_SWITCH_ID_MASK) +#define DSA_8021Q_SUBVLAN_HI_SHIFT 9 +#define DSA_8021Q_SUBVLAN_HI_MASK GENMASK(9, 9) +#define DSA_8021Q_SUBVLAN_LO_SHIFT 4 +#define DSA_8021Q_SUBVLAN_LO_MASK GENMASK(4, 3) +#define DSA_8021Q_SUBVLAN_HI(x) (((x) & GENMASK(2, 2)) >> 2) +#define DSA_8021Q_SUBVLAN_LO(x) ((x) & GENMASK(1, 0)) +#define DSA_8021Q_SUBVLAN(x) \ + (((DSA_8021Q_SUBVLAN_LO(x) << DSA_8021Q_SUBVLAN_LO_SHIFT) & \ + DSA_8021Q_SUBVLAN_LO_MASK) | \ + ((DSA_8021Q_SUBVLAN_HI(x) << DSA_8021Q_SUBVLAN_HI_SHIFT) & \ + DSA_8021Q_SUBVLAN_HI_MASK)) + #define DSA_8021Q_PORT_SHIFT 0 #define DSA_8021Q_PORT_MASK GENMASK(3, 0) #define DSA_8021Q_PORT(x) (((x) << DSA_8021Q_PORT_SHIFT) & \ @@ -79,6 +98,13 @@ u16 dsa_8021q_rx_vid(struct dsa_switch *ds, int port) } EXPORT_SYMBOL_GPL(dsa_8021q_rx_vid); +u16 dsa_8021q_rx_vid_subvlan(struct dsa_switch *ds, int port, u16 subvlan) +{ + return DSA_8021Q_DIR_RX | DSA_8021Q_SWITCH_ID(ds->index) | + DSA_8021Q_PORT(port) | DSA_8021Q_SUBVLAN(subvlan); +} +EXPORT_SYMBOL_GPL(dsa_8021q_rx_vid_subvlan); + /* Returns the decoded switch ID from the RX VID. */ int dsa_8021q_rx_switch_id(u16 vid) { @@ -93,6 +119,20 @@ int dsa_8021q_rx_source_port(u16 vid) } EXPORT_SYMBOL_GPL(dsa_8021q_rx_source_port); +/* Returns the decoded subvlan from the RX VID. */ +u16 dsa_8021q_rx_subvlan(u16 vid) +{ + u16 svl_hi, svl_lo; + + svl_hi = (vid & DSA_8021Q_SUBVLAN_HI_MASK) >> + DSA_8021Q_SUBVLAN_HI_SHIFT; + svl_lo = (vid & DSA_8021Q_SUBVLAN_LO_MASK) >> + DSA_8021Q_SUBVLAN_LO_SHIFT; + + return (svl_hi << 2) | svl_lo; +} +EXPORT_SYMBOL_GPL(dsa_8021q_rx_subvlan); + bool vid_is_dsa_8021q(u16 vid) { return ((vid & DSA_8021Q_DIR_MASK) == DSA_8021Q_DIR_RX || -- cgit v1.2.3 From 84eeb5d460e399795e9a92a0cd44999254886150 Mon Sep 17 00:00:00 2001 From: Vladimir Oltean Date: Tue, 12 May 2020 20:20:34 +0300 Subject: net: dsa: tag_sja1105: implement sub-VLAN decoding Create a subvlan_map as part of each port's tagger private structure. This keeps reverse mappings of bridge-to-dsa_8021q VLAN retagging rules. Note that as of this patch, this piece of code is never engaged, due to the fact that the driver hasn't installed any retagging rule, so we'll always see packets with a subvlan code of 0 (untagged). Signed-off-by: Vladimir Oltean Reviewed-by: Florian Fainelli Signed-off-by: David S. 
--- drivers/net/dsa/sja1105/sja1105_main.c | 4 ++++ include/linux/dsa/sja1105.h | 2 ++ net/dsa/tag_sja1105.c | 19 +++++++++++++++++++ 3 files changed, 25 insertions(+) (limited to 'include/linux') diff --git a/drivers/net/dsa/sja1105/sja1105_main.c b/drivers/net/dsa/sja1105/sja1105_main.c index b7e4a85caade..fd15a18596ea 100644 --- a/drivers/net/dsa/sja1105/sja1105_main.c +++ b/drivers/net/dsa/sja1105/sja1105_main.c @@ -2856,6 +2856,7 @@ static int sja1105_probe(struct spi_device *spi) struct sja1105_port *sp = &priv->ports[port]; struct dsa_port *dp = dsa_to_port(ds, port); struct net_device *slave; + int subvlan; if (!dsa_is_user_port(ds, port)) continue; @@ -2876,6 +2877,9 @@ static int sja1105_probe(struct spi_device *spi) } skb_queue_head_init(&sp->xmit_queue); sp->xmit_tpid = ETH_P_SJA1105; + + for (subvlan = 0; subvlan < DSA_8021Q_N_SUBVLAN; subvlan++) + sp->subvlan_map[subvlan] = VLAN_N_VID; } return 0; diff --git a/include/linux/dsa/sja1105.h b/include/linux/dsa/sja1105.h index f821d08b1b5f..dd93735ae228 100644 --- a/include/linux/dsa/sja1105.h +++ b/include/linux/dsa/sja1105.h @@ -9,6 +9,7 @@ #include <linux/skbuff.h> #include <linux/etherdevice.h> +#include <linux/dsa/8021q.h> #include <net/dsa.h> #define ETH_P_SJA1105 ETH_P_DSA_8021Q @@ -53,6 +54,7 @@ struct sja1105_skb_cb { ((struct sja1105_skb_cb *)DSA_SKB_CB_PRIV(skb)) struct sja1105_port { + u16 subvlan_map[DSA_8021Q_N_SUBVLAN]; struct kthread_worker *xmit_worker; struct kthread_work xmit_work; struct sk_buff_head xmit_queue; diff --git a/net/dsa/tag_sja1105.c b/net/dsa/tag_sja1105.c index 398e2b9a1b96..ad105550b145 100644 --- a/net/dsa/tag_sja1105.c +++ b/net/dsa/tag_sja1105.c @@ -254,6 +254,20 @@ static struct sk_buff return skb; } +static void sja1105_decode_subvlan(struct sk_buff *skb, u16 subvlan) +{ + struct dsa_port *dp = dsa_slave_to_port(skb->dev); + struct sja1105_port *sp = dp->priv; + u16 vid = sp->subvlan_map[subvlan]; + u16 vlan_tci; + + if (vid == VLAN_N_VID) + return; + + vlan_tci = (skb->priority << VLAN_PRIO_SHIFT) | vid; + __vlan_hwaccel_put_tag(skb, htons(ETH_P_8021Q), vlan_tci); +} + static struct sk_buff *sja1105_rcv(struct sk_buff *skb, struct net_device *netdev, struct packet_type *pt) @@ -263,6 +277,7 @@ static struct sk_buff *sja1105_rcv(struct sk_buff *skb, struct ethhdr *hdr; u16 tpid, vid, tci; bool is_link_local; + u16 subvlan = 0; bool is_tagged; bool is_meta; @@ -286,6 +301,7 @@ static struct sk_buff *sja1105_rcv(struct sk_buff *skb, source_port = dsa_8021q_rx_source_port(vid); switch_id = dsa_8021q_rx_switch_id(vid); skb->priority = (tci & VLAN_PRIO_MASK) >> VLAN_PRIO_SHIFT; + subvlan = dsa_8021q_rx_subvlan(vid); } else if (is_link_local) { /* Management traffic path. Switch embeds the switch ID and * port ID into bytes of the destination MAC, courtesy of @@ -310,6 +326,9 @@ static struct sk_buff *sja1105_rcv(struct sk_buff *skb, return NULL; } + if (subvlan) + sja1105_decode_subvlan(skb, subvlan); + return sja1105_rcv_meta_state_machine(skb, &meta, is_link_local, is_meta); } -- cgit v1.2.3 From 4a470f00e10e3336350ab60ec6c3206177093019 Mon Sep 17 00:00:00 2001 From: Geert Uytterhoeven Date: Wed, 6 May 2020 14:32:35 +0200 Subject: of: Make <linux/of_reserved_mem.h> self-contained <linux/of_reserved_mem.h> is not self-contained, as it uses _OF_DECLARE() to define RESERVEDMEM_OF_DECLARE(), but does not include <linux/of.h>. Fix this by adding the missing include.
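Self-containment is easy to check mechanically: a translation unit containing nothing but the #include of the header in question must compile on its own. A minimal sketch of such a check (illustrative only, not part of the patch):

/* header-test.c: builds only if the header pulls in everything it uses. */
#include <linux/of_reserved_mem.h>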
Signed-off-by: Geert Uytterhoeven Reviewed-by: Thierry Reding Acked-by: Rob Herring Signed-off-by: Thierry Reding --- include/linux/of_reserved_mem.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/of_reserved_mem.h b/include/linux/of_reserved_mem.h index a1b427ac291b..8216a4156263 100644 --- a/include/linux/of_reserved_mem.h +++ b/include/linux/of_reserved_mem.h @@ -3,6 +3,7 @@ #define __OF_RESERVED_MEM_H #include <linux/device.h> +#include <linux/of.h> struct of_phandle_args; struct reserved_mem_ops; -- cgit v1.2.3 From a3cba697a2a09e6769996d5265991a3228004d92 Mon Sep 17 00:00:00 2001 From: Joseph Lo Date: Wed, 29 May 2019 16:21:34 +0800 Subject: clk: tegra: Export functions for EMC clock scaling Export functions that allow accessing the CAR registers required for EMC clock scaling. These functions will be used as part of the scaling sequence. Signed-off-by: Joseph Lo Signed-off-by: Thierry Reding --- drivers/clk/tegra/clk-tegra210.c | 26 ++++++++++++++++++++++++++ include/linux/clk/tegra.h | 3 +++ 2 files changed, 29 insertions(+) (limited to 'include/linux') diff --git a/drivers/clk/tegra/clk-tegra210.c b/drivers/clk/tegra/clk-tegra210.c index 57d97e87d870..798920ec50e9 100644 --- a/drivers/clk/tegra/clk-tegra210.c +++ b/drivers/clk/tegra/clk-tegra210.c @@ -37,6 +37,7 @@ #define CLK_SOURCE_LA 0x1f8 #define CLK_SOURCE_SDMMC2 0x154 #define CLK_SOURCE_SDMMC4 0x164 +#define CLK_SOURCE_EMC_DLL 0x664 #define PLLC_BASE 0x80 #define PLLC_OUT 0x84 @@ -227,6 +228,10 @@ #define RST_DFLL_DVCO 0x2f4 #define DVFS_DFLL_RESET_SHIFT 0 +#define CLK_RST_CONTROLLER_CLK_OUT_ENB_X_SET 0x284 +#define CLK_RST_CONTROLLER_CLK_OUT_ENB_X_CLR 0x288 +#define CLK_OUT_ENB_X_CLK_ENB_EMC_DLL BIT(14) + #define CLK_RST_CONTROLLER_RST_DEV_Y_SET 0x2a8 #define CLK_RST_CONTROLLER_RST_DEV_Y_CLR 0x2ac #define CPU_SOFTRST_CTRL 0x380 @@ -555,6 +560,27 @@ void tegra210_set_sata_pll_seq_sw(bool state) } EXPORT_SYMBOL_GPL(tegra210_set_sata_pll_seq_sw); +void tegra210_clk_emc_dll_enable(bool flag) +{ + u32 offset = flag ?
CLK_RST_CONTROLLER_CLK_OUT_ENB_X_SET : + CLK_RST_CONTROLLER_CLK_OUT_ENB_X_CLR; + + writel_relaxed(CLK_OUT_ENB_X_CLK_ENB_EMC_DLL, clk_base + offset); +} +EXPORT_SYMBOL_GPL(tegra210_clk_emc_dll_enable); + +void tegra210_clk_emc_dll_update_setting(u32 emc_dll_src_value) +{ + writel_relaxed(emc_dll_src_value, clk_base + CLK_SOURCE_EMC_DLL); +} +EXPORT_SYMBOL_GPL(tegra210_clk_emc_dll_update_setting); + +void tegra210_clk_emc_update_setting(u32 emc_src_value) +{ + writel_relaxed(emc_src_value, clk_base + CLK_SOURCE_EMC); +} +EXPORT_SYMBOL_GPL(tegra210_clk_emc_update_setting); + static void tegra210_generic_mbist_war(struct tegra210_domain_mbist_war *mbist) { u32 val; diff --git a/include/linux/clk/tegra.h b/include/linux/clk/tegra.h index 2b1b35240074..5b0bdb413460 100644 --- a/include/linux/clk/tegra.h +++ b/include/linux/clk/tegra.h @@ -131,6 +131,9 @@ extern void tegra210_set_sata_pll_seq_sw(bool state); extern void tegra210_put_utmipll_in_iddq(void); extern void tegra210_put_utmipll_out_iddq(void); extern int tegra210_clk_handle_mbist_war(unsigned int id); +extern void tegra210_clk_emc_dll_enable(bool flag); +extern void tegra210_clk_emc_dll_update_setting(u32 emc_dll_src_value); +extern void tegra210_clk_emc_update_setting(u32 emc_src_value); struct clk; -- cgit v1.2.3 From 0ac65fc946d3a15ff30cea28b38a00b9ba98217b Mon Sep 17 00:00:00 2001 From: Joseph Lo Date: Wed, 29 May 2019 16:21:35 +0800 Subject: clk: tegra: Implement Tegra210 EMC clock The EMC clock needs to carefully coordinate with the EMC controller programming to make sure external memory can be properly clocked. Do so by hooking up the EMC clock with an EMC provider that will specify which rates are supported by the EMC and provide a callback to use for setting the clock rate at the EMC. Based on work by Peter De Schrijver . Signed-off-by: Joseph Lo Signed-off-by: Thierry Reding --- drivers/clk/tegra/Makefile | 1 + drivers/clk/tegra/clk-tegra210-emc.c | 369 +++++++++++++++++++++++++++++++++++ drivers/clk/tegra/clk.h | 3 + include/linux/clk/tegra.h | 24 +++ 4 files changed, 397 insertions(+) create mode 100644 drivers/clk/tegra/clk-tegra210-emc.c (limited to 'include/linux') diff --git a/drivers/clk/tegra/Makefile b/drivers/clk/tegra/Makefile index 7f5f5ec33739..6e1200b3bb68 100644 --- a/drivers/clk/tegra/Makefile +++ b/drivers/clk/tegra/Makefile @@ -25,5 +25,6 @@ obj-$(CONFIG_TEGRA124_EMC) += clk-tegra124-emc.o obj-$(CONFIG_ARCH_TEGRA_132_SOC) += clk-tegra124.o obj-y += cvb.o obj-$(CONFIG_ARCH_TEGRA_210_SOC) += clk-tegra210.o +obj-$(CONFIG_ARCH_TEGRA_210_SOC) += clk-tegra210-emc.o obj-$(CONFIG_CLK_TEGRA_BPMP) += clk-bpmp.o obj-y += clk-utils.o diff --git a/drivers/clk/tegra/clk-tegra210-emc.c b/drivers/clk/tegra/clk-tegra210-emc.c new file mode 100644 index 000000000000..352a2c3fc374 --- /dev/null +++ b/drivers/clk/tegra/clk-tegra210-emc.c @@ -0,0 +1,369 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright (c) 2015-2020, NVIDIA CORPORATION. All rights reserved. 
+ */ + +#include +#include +#include +#include +#include +#include +#include +#include + +#define CLK_SOURCE_EMC 0x19c +#define CLK_SOURCE_EMC_2X_CLK_SRC GENMASK(31, 29) +#define CLK_SOURCE_EMC_MC_EMC_SAME_FREQ BIT(16) +#define CLK_SOURCE_EMC_2X_CLK_DIVISOR GENMASK(7, 0) + +#define CLK_SRC_PLLM 0 +#define CLK_SRC_PLLC 1 +#define CLK_SRC_PLLP 2 +#define CLK_SRC_CLK_M 3 +#define CLK_SRC_PLLM_UD 4 +#define CLK_SRC_PLLMB_UD 5 +#define CLK_SRC_PLLMB 6 +#define CLK_SRC_PLLP_UD 7 + +struct tegra210_clk_emc { + struct clk_hw hw; + void __iomem *regs; + + struct tegra210_clk_emc_provider *provider; + + struct clk *parents[8]; +}; + +static inline struct tegra210_clk_emc * +to_tegra210_clk_emc(struct clk_hw *hw) +{ + return container_of(hw, struct tegra210_clk_emc, hw); +} + +static const char *tegra210_clk_emc_parents[] = { + "pll_m", "pll_c", "pll_p", "clk_m", "pll_m_ud", "pll_mb_ud", + "pll_mb", "pll_p_ud", +}; + +static u8 tegra210_clk_emc_get_parent(struct clk_hw *hw) +{ + struct tegra210_clk_emc *emc = to_tegra210_clk_emc(hw); + u32 value; + u8 src; + + value = readl_relaxed(emc->regs + CLK_SOURCE_EMC); + src = FIELD_GET(CLK_SOURCE_EMC_2X_CLK_SRC, value); + + return src; +} + +static unsigned long tegra210_clk_emc_recalc_rate(struct clk_hw *hw, + unsigned long parent_rate) +{ + struct tegra210_clk_emc *emc = to_tegra210_clk_emc(hw); + u32 value, div; + + /* + * CCF assumes that neither the parent nor its rate will change during + * ->set_rate(), so the parent rate passed in here was cached from the + * parent before the ->set_rate() call. + * + * This can lead to wrong results being reported for the EMC clock if + * the parent and/or parent rate have changed as part of the EMC rate + * change sequence. Fix this by overriding the parent clock with what + * we know to be the correct value after the rate change. + */ + parent_rate = clk_hw_get_rate(clk_hw_get_parent(hw)); + + value = readl_relaxed(emc->regs + CLK_SOURCE_EMC); + + div = FIELD_GET(CLK_SOURCE_EMC_2X_CLK_DIVISOR, value); + div += 2; + + return DIV_ROUND_UP(parent_rate * 2, div); +} + +static long tegra210_clk_emc_round_rate(struct clk_hw *hw, unsigned long rate, + unsigned long *prate) +{ + struct tegra210_clk_emc *emc = to_tegra210_clk_emc(hw); + struct tegra210_clk_emc_provider *provider = emc->provider; + unsigned int i; + + if (!provider || !provider->configs || provider->num_configs == 0) + return clk_hw_get_rate(hw); + + for (i = 0; i < provider->num_configs; i++) { + if (provider->configs[i].rate >= rate) + return provider->configs[i].rate; + } + + return provider->configs[i - 1].rate; +} + +static struct clk *tegra210_clk_emc_find_parent(struct tegra210_clk_emc *emc, + u8 index) +{ + struct clk_hw *parent = clk_hw_get_parent_by_index(&emc->hw, index); + const char *name = clk_hw_get_name(parent); + + /* XXX implement cache? 
*/ + + return __clk_lookup(name); +} + +static int tegra210_clk_emc_set_rate(struct clk_hw *hw, unsigned long rate, + unsigned long parent_rate) +{ + struct tegra210_clk_emc *emc = to_tegra210_clk_emc(hw); + struct tegra210_clk_emc_provider *provider = emc->provider; + struct tegra210_clk_emc_config *config; + struct device *dev = provider->dev; + struct clk_hw *old, *new, *parent; + u8 old_idx, new_idx, index; + struct clk *clk; + unsigned int i; + int err; + + if (!provider || !provider->configs || provider->num_configs == 0) + return -EINVAL; + + for (i = 0; i < provider->num_configs; i++) { + if (provider->configs[i].rate >= rate) { + config = &provider->configs[i]; + break; + } + } + + if (i == provider->num_configs) + config = &provider->configs[i - 1]; + + old_idx = tegra210_clk_emc_get_parent(hw); + new_idx = FIELD_GET(CLK_SOURCE_EMC_2X_CLK_SRC, config->value); + + old = clk_hw_get_parent_by_index(hw, old_idx); + new = clk_hw_get_parent_by_index(hw, new_idx); + + /* if the rate has changed... */ + if (config->parent_rate != clk_hw_get_rate(old)) { + /* ... but the clock source remains the same ... */ + if (new_idx == old_idx) { + /* ... switch to the alternative clock source. */ + switch (new_idx) { + case CLK_SRC_PLLM: + new_idx = CLK_SRC_PLLMB; + break; + + case CLK_SRC_PLLM_UD: + new_idx = CLK_SRC_PLLMB_UD; + break; + + case CLK_SRC_PLLMB_UD: + new_idx = CLK_SRC_PLLM_UD; + break; + + case CLK_SRC_PLLMB: + new_idx = CLK_SRC_PLLM; + break; + } + + /* + * This should never happen because we can't deal with + * it. + */ + if (WARN_ON(new_idx == old_idx)) + return -EINVAL; + + new = clk_hw_get_parent_by_index(hw, new_idx); + } + + index = new_idx; + parent = new; + } else { + index = old_idx; + parent = old; + } + + clk = tegra210_clk_emc_find_parent(emc, index); + if (IS_ERR(clk)) { + err = PTR_ERR(clk); + dev_err(dev, "failed to get parent clock for index %u: %d\n", + index, err); + return err; + } + + /* set the new parent clock to the required rate */ + if (clk_get_rate(clk) != config->parent_rate) { + err = clk_set_rate(clk, config->parent_rate); + if (err < 0) { + dev_err(dev, "failed to set rate %lu Hz for %pC: %d\n", + config->parent_rate, clk, err); + return err; + } + } + + /* enable the new parent clock */ + if (parent != old) { + err = clk_prepare_enable(clk); + if (err < 0) { + dev_err(dev, "failed to enable parent clock %pC: %d\n", + clk, err); + return err; + } + } + + /* update the EMC source configuration to reflect the new parent */ + config->value &= ~CLK_SOURCE_EMC_2X_CLK_SRC; + config->value |= FIELD_PREP(CLK_SOURCE_EMC_2X_CLK_SRC, index); + + /* + * Finally, switch the EMC programming with both old and new parent + * clocks enabled. + */ + err = provider->set_rate(dev, config); + if (err < 0) { + dev_err(dev, "failed to set EMC rate to %lu Hz: %d\n", rate, + err); + + /* + * If we're unable to switch to the new EMC frequency, we no + * longer need the new parent to be enabled. 
+ */ + if (parent != old) + clk_disable_unprepare(clk); + + return err; + } + + /* reparent to new parent clock and disable the old parent clock */ + if (parent != old) { + clk = tegra210_clk_emc_find_parent(emc, old_idx); + if (IS_ERR(clk)) { + err = PTR_ERR(clk); + dev_err(dev, + "failed to get parent clock for index %u: %d\n", + old_idx, err); + return err; + } + + clk_hw_reparent(hw, parent); + clk_disable_unprepare(clk); + } + + return err; +} + +static const struct clk_ops tegra210_clk_emc_ops = { + .get_parent = tegra210_clk_emc_get_parent, + .recalc_rate = tegra210_clk_emc_recalc_rate, + .round_rate = tegra210_clk_emc_round_rate, + .set_rate = tegra210_clk_emc_set_rate, +}; + +struct clk *tegra210_clk_register_emc(struct device_node *np, + void __iomem *regs) +{ + struct tegra210_clk_emc *emc; + struct clk_init_data init; + struct clk *clk; + + emc = kzalloc(sizeof(*emc), GFP_KERNEL); + if (!emc) + return ERR_PTR(-ENOMEM); + + emc->regs = regs; + + init.name = "emc"; + init.ops = &tegra210_clk_emc_ops; + init.flags = CLK_IS_CRITICAL | CLK_GET_RATE_NOCACHE; + init.parent_names = tegra210_clk_emc_parents; + init.num_parents = ARRAY_SIZE(tegra210_clk_emc_parents); + emc->hw.init = &init; + + clk = clk_register(NULL, &emc->hw); + if (IS_ERR(clk)) { + kfree(emc); + return clk; + } + + return clk; +} + +int tegra210_clk_emc_attach(struct clk *clk, + struct tegra210_clk_emc_provider *provider) +{ + struct clk_hw *hw = __clk_get_hw(clk); + struct tegra210_clk_emc *emc = to_tegra210_clk_emc(hw); + struct device *dev = provider->dev; + unsigned int i; + int err; + + if (!try_module_get(provider->owner)) + return -ENODEV; + + for (i = 0; i < provider->num_configs; i++) { + struct tegra210_clk_emc_config *config = &provider->configs[i]; + struct clk_hw *parent; + bool same_freq; + u8 div, src; + + div = FIELD_GET(CLK_SOURCE_EMC_2X_CLK_DIVISOR, config->value); + src = FIELD_GET(CLK_SOURCE_EMC_2X_CLK_SRC, config->value); + + /* do basic sanity checking on the EMC timings */ + if (div & 0x1) { + dev_err(dev, "invalid odd divider %u for rate %lu Hz\n", + div, config->rate); + err = -EINVAL; + goto put; + } + + same_freq = config->value & CLK_SOURCE_EMC_MC_EMC_SAME_FREQ; + + if (same_freq != config->same_freq) { + dev_err(dev, + "ambiguous EMC to MC ratio for rate %lu Hz\n", + config->rate); + err = -EINVAL; + goto put; + } + + parent = clk_hw_get_parent_by_index(hw, src); + config->parent = src; + + if (src == CLK_SRC_PLLM || src == CLK_SRC_PLLM_UD) { + config->parent_rate = config->rate * (1 + div / 2); + } else { + unsigned long rate = config->rate * (1 + div / 2); + + config->parent_rate = clk_hw_get_rate(parent); + + if (config->parent_rate != rate) { + dev_err(dev, + "rate %lu Hz does not match input\n", + config->rate); + err = -EINVAL; + goto put; + } + } + } + + emc->provider = provider; + + return 0; + +put: + module_put(provider->owner); + return err; +} +EXPORT_SYMBOL_GPL(tegra210_clk_emc_attach); + +void tegra210_clk_emc_detach(struct clk *clk) +{ + struct tegra210_clk_emc *emc = to_tegra210_clk_emc(__clk_get_hw(clk)); + + module_put(emc->provider->owner); + emc->provider = NULL; +} +EXPORT_SYMBOL_GPL(tegra210_clk_emc_detach); diff --git a/drivers/clk/tegra/clk.h b/drivers/clk/tegra/clk.h index 1d9f1bc18334..09cc700bab41 100644 --- a/drivers/clk/tegra/clk.h +++ b/drivers/clk/tegra/clk.h @@ -907,4 +907,7 @@ void tegra_clk_periph_resume(void); bool tegra20_clk_emc_driver_available(struct clk_hw *emc_hw); struct clk *tegra20_clk_register_emc(void __iomem *ioaddr, bool low_jitter); +struct 
clk *tegra210_clk_register_emc(struct device_node *np, + void __iomem *regs); + #endif /* TEGRA_CLK_H */ diff --git a/include/linux/clk/tegra.h b/include/linux/clk/tegra.h index 5b0bdb413460..3f01d43f0598 100644 --- a/include/linux/clk/tegra.h +++ b/include/linux/clk/tegra.h @@ -146,4 +146,28 @@ void tegra20_clk_set_emc_round_callback(tegra20_clk_emc_round_cb *round_cb, void *cb_arg); int tegra20_clk_prepare_emc_mc_same_freq(struct clk *emc_clk, bool same); +struct tegra210_clk_emc_config { + unsigned long rate; + bool same_freq; + u32 value; + + unsigned long parent_rate; + u8 parent; +}; + +struct tegra210_clk_emc_provider { + struct module *owner; + struct device *dev; + + struct tegra210_clk_emc_config *configs; + unsigned int num_configs; + + int (*set_rate)(struct device *dev, + const struct tegra210_clk_emc_config *config); +}; + +int tegra210_clk_emc_attach(struct clk *clk, + struct tegra210_clk_emc_provider *provider); +void tegra210_clk_emc_detach(struct clk *clk); + #endif /* __LINUX_CLK_TEGRA_H_ */ -- cgit v1.2.3 From 529798bae7c155d38eec211436df736349dca2ee Mon Sep 17 00:00:00 2001 From: "Gustavo A. R. Silva" Date: Thu, 7 May 2020 14:19:43 -0500 Subject: remoteproc: Replace zero-length array with flexible-array The current codebase makes use of the zero-length array language extension to the C90 standard, but the preferred mechanism to declare variable-length types such as these is a flexible array member[1][2], introduced in C99: struct foo { int stuff; struct boo array[]; }; By making use of the mechanism above, we will get a compiler warning in case the flexible array does not occur last in the structure, which helps prevent undefined-behavior bugs of this kind from being inadvertently introduced[3] into the codebase from now on. Also, notice that dynamic memory allocations won't be affected by this change: "Flexible array members have incomplete type, and so the sizeof operator may not be applied. As a quirk of the original implementation of zero-length arrays, sizeof evaluates to zero."[1] sizeof(flexible-array-member) triggers a warning because flexible array members have incomplete type[1]. There are some instances of code in which the sizeof operator is being incorrectly applied to zero-length arrays, and the result is zero. Such instances may be hiding bugs, so this work (flexible-array member conversions) will also help to eliminate those issues completely. This issue was found with the help of Coccinelle. [1] https://gcc.gnu.org/onlinedocs/gcc/Zero-Length.html [2] https://github.com/KSPP/linux/issues/21 [3] commit 76497732932f ("cxgb3/l2t: Fix undefined behaviour") Signed-off-by: Gustavo A. R.
Silva Link: https://lore.kernel.org/r/20200507191943.GA16033@embeddedor Signed-off-by: Bjorn Andersson --- include/linux/remoteproc.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/remoteproc.h b/include/linux/remoteproc.h index 0468be4d02f4..e7b7bab8b235 100644 --- a/include/linux/remoteproc.h +++ b/include/linux/remoteproc.h @@ -73,7 +73,7 @@ struct resource_table { u32 ver; u32 num; u32 reserved[2]; - u32 offset[0]; + u32 offset[]; } __packed; /** @@ -87,7 +87,7 @@ struct resource_table { */ struct fw_rsc_hdr { u32 type; - u8 data[0]; + u8 data[]; } __packed; /** @@ -306,7 +306,7 @@ struct fw_rsc_vdev { u8 status; u8 num_of_vrings; u8 reserved[2]; - struct fw_rsc_vdev_vring vring[0]; + struct fw_rsc_vdev_vring vring[]; } __packed; struct rproc; -- cgit v1.2.3 From d2fe97545a1e2d01c0ca0105bdc59002a0d0b130 Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Mon, 11 May 2020 12:13:56 -0700 Subject: fscrypt: fix all kerneldoc warnings Fix all kerneldoc warnings in fs/crypto/ and include/linux/fscrypt.h. Most of these were due to missing documentation for function parameters. Detected with: scripts/kernel-doc -v -none fs/crypto/*.{c,h} include/linux/fscrypt.h This cleanup makes it possible to check new patches for kerneldoc warnings without having to filter out all the existing ones. For consistency, also adjust some function "brief descriptions" to include the parentheses and to wrap at 80 characters. (The latter matches the checkpatch expectation.) Link: https://lore.kernel.org/r/20200511191358.53096-2-ebiggers@kernel.org Signed-off-by: Eric Biggers --- fs/crypto/crypto.c | 9 ++++++-- fs/crypto/fname.c | 52 +++++++++++++++++++++++++++++++++------------ fs/crypto/fscrypt_private.h | 4 ++-- fs/crypto/hooks.c | 4 ++-- fs/crypto/keysetup.c | 9 +++++--- fs/crypto/policy.c | 19 +++++++++++++---- include/linux/fscrypt.h | 19 +++++++++++------ 7 files changed, 82 insertions(+), 34 deletions(-) (limited to 'include/linux') diff --git a/fs/crypto/crypto.c b/fs/crypto/crypto.c index 1ecaac7ee3cb..40c2821a341e 100644 --- a/fs/crypto/crypto.c +++ b/fs/crypto/crypto.c @@ -54,6 +54,7 @@ struct page *fscrypt_alloc_bounce_page(gfp_t gfp_flags) /** * fscrypt_free_bounce_page() - free a ciphertext bounce page + * @bounce_page: the bounce page to free, or NULL * * Free a bounce page that was allocated by fscrypt_encrypt_pagecache_blocks(), * or by fscrypt_alloc_bounce_page() directly. @@ -132,7 +133,8 @@ int fscrypt_crypt_block(const struct inode *inode, fscrypt_direction_t rw, } /** - * fscrypt_encrypt_pagecache_blocks() - Encrypt filesystem blocks from a pagecache page + * fscrypt_encrypt_pagecache_blocks() - Encrypt filesystem blocks from a + * pagecache page * @page: The locked pagecache page containing the block(s) to encrypt * @len: Total size of the block(s) to encrypt. Must be a nonzero * multiple of the filesystem's block size. @@ -222,7 +224,8 @@ int fscrypt_encrypt_block_inplace(const struct inode *inode, struct page *page, EXPORT_SYMBOL(fscrypt_encrypt_block_inplace); /** - * fscrypt_decrypt_pagecache_blocks() - Decrypt filesystem blocks in a pagecache page + * fscrypt_decrypt_pagecache_blocks() - Decrypt filesystem blocks in a + * pagecache page * @page: The locked pagecache page containing the block(s) to decrypt * @len: Total size of the block(s) to decrypt. Must be a nonzero * multiple of the filesystem's block size. 
@@ -346,6 +349,8 @@ void fscrypt_msg(const struct inode *inode, const char *level, /** * fscrypt_init() - Set up for fs encryption. + * + * Return: 0 on success; -errno on failure */ static int __init fscrypt_init(void) { diff --git a/fs/crypto/fname.c b/fs/crypto/fname.c index 4c212442a8f7..fce3fa29f058 100644 --- a/fs/crypto/fname.c +++ b/fs/crypto/fname.c @@ -18,7 +18,7 @@ #include #include "fscrypt_private.h" -/** +/* * struct fscrypt_nokey_name - identifier for directory entry when key is absent * * When userspace lists an encrypted directory without access to the key, the @@ -105,9 +105,12 @@ static inline bool fscrypt_is_dot_dotdot(const struct qstr *str) /** * fscrypt_fname_encrypt() - encrypt a filename - * - * The output buffer must be at least as large as the input buffer. - * Any extra space is filled with NUL padding before encryption. + * @inode: inode of the parent directory (for regular filenames) + * or of the symlink (for symlink targets) + * @iname: the filename to encrypt + * @out: (output) the encrypted filename + * @olen: size of the encrypted filename. It must be at least @iname->len. + * Any extra space is filled with NUL padding before encryption. * * Return: 0 on success, -errno on failure */ @@ -157,8 +160,11 @@ int fscrypt_fname_encrypt(const struct inode *inode, const struct qstr *iname, /** * fname_decrypt() - decrypt a filename - * - * The caller must have allocated sufficient memory for the @oname string. + * @inode: inode of the parent directory (for regular filenames) + * or of the symlink (for symlink targets) + * @iname: the encrypted filename to decrypt + * @oname: (output) the decrypted filename. The caller must have allocated + * enough space for this, e.g. using fscrypt_fname_alloc_buffer(). * * Return: 0 on success, -errno on failure */ @@ -206,7 +212,10 @@ static const char lookup_table[65] = #define BASE64_CHARS(nbytes) DIV_ROUND_UP((nbytes) * 4, 3) /** - * base64_encode() - + * base64_encode() - base64-encode some bytes + * @src: the bytes to encode + * @len: number of bytes to encode + * @dst: (output) the base64-encoded string. Not NUL-terminated. * * Encodes the input string using characters from the set [A-Za-z0-9+,]. * The encoded string is roughly 4/3 times the size of the input string. @@ -272,7 +281,12 @@ bool fscrypt_fname_encrypted_size(const struct inode *inode, u32 orig_len, } /** - * fscrypt_fname_alloc_buffer - allocate a buffer for presented filenames + * fscrypt_fname_alloc_buffer() - allocate a buffer for presented filenames + * @inode: inode of the parent directory (for regular filenames) + * or of the symlink (for symlink targets) + * @max_encrypted_len: maximum length of encrypted filenames the buffer will be + * used to present + * @crypto_str: (output) buffer to allocate * * Allocate a buffer that is large enough to hold any decrypted or encoded * filename (null-terminated), for the given maximum encrypted filename length. @@ -297,9 +311,10 @@ int fscrypt_fname_alloc_buffer(const struct inode *inode, EXPORT_SYMBOL(fscrypt_fname_alloc_buffer); /** - * fscrypt_fname_free_buffer - free the buffer for presented filenames + * fscrypt_fname_free_buffer() - free a buffer for presented filenames + * @crypto_str: the buffer to free * - * Free the buffer allocated by fscrypt_fname_alloc_buffer(). + * Free a buffer that was allocated by fscrypt_fname_alloc_buffer(). 
*/ void fscrypt_fname_free_buffer(struct fscrypt_str *crypto_str) { @@ -311,10 +326,19 @@ void fscrypt_fname_free_buffer(struct fscrypt_str *crypto_str) EXPORT_SYMBOL(fscrypt_fname_free_buffer); /** - * fscrypt_fname_disk_to_usr() - converts a filename from disk space to user - * space - * - * The caller must have allocated sufficient memory for the @oname string. + * fscrypt_fname_disk_to_usr() - convert an encrypted filename to + * user-presentable form + * @inode: inode of the parent directory (for regular filenames) + * or of the symlink (for symlink targets) + * @hash: first part of the name's dirhash, if applicable. This only needs to + * be provided if the filename is located in an indexed directory whose + * encryption key may be unavailable. Not needed for symlink targets. + * @minor_hash: second part of the name's dirhash, if applicable + * @iname: encrypted filename to convert. May also be "." or "..", which + * aren't actually encrypted. + * @oname: output buffer for the user-presentable filename. The caller must + * have allocated enough space for this, e.g. using + * fscrypt_fname_alloc_buffer(). * * If the key is available, we'll decrypt the disk name. Otherwise, we'll * encode it for presentation in fscrypt_nokey_name format. diff --git a/fs/crypto/fscrypt_private.h b/fs/crypto/fscrypt_private.h index dbced2937ec8..f547094100be 100644 --- a/fs/crypto/fscrypt_private.h +++ b/fs/crypto/fscrypt_private.h @@ -43,7 +43,7 @@ struct fscrypt_context_v2 { u8 nonce[FS_KEY_DERIVATION_NONCE_SIZE]; }; -/** +/* * fscrypt_context - the encryption context of an inode * * This is the on-disk equivalent of an fscrypt_policy, stored alongside each @@ -157,7 +157,7 @@ fscrypt_policy_flags(const union fscrypt_policy *policy) BUG(); } -/** +/* * For encrypted symlinks, the ciphertext length is stored at the beginning * of the string in little-endian format. */ diff --git a/fs/crypto/hooks.c b/fs/crypto/hooks.c index 5ef861742921..09fb8aa0f2e9 100644 --- a/fs/crypto/hooks.c +++ b/fs/crypto/hooks.c @@ -10,7 +10,7 @@ #include "fscrypt_private.h" /** - * fscrypt_file_open - prepare to open a possibly-encrypted regular file + * fscrypt_file_open() - prepare to open a possibly-encrypted regular file * @inode: the inode being opened * @filp: the struct file being set up * @@ -262,7 +262,7 @@ err_free_sd: EXPORT_SYMBOL_GPL(__fscrypt_encrypt_symlink); /** - * fscrypt_get_symlink - get the target of an encrypted symlink + * fscrypt_get_symlink() - get the target of an encrypted symlink * @inode: the symlink inode * @caddr: the on-disk contents of the symlink * @max_size: size of @caddr buffer diff --git a/fs/crypto/keysetup.c b/fs/crypto/keysetup.c index 302375e9f719..edc4590c69f0 100644 --- a/fs/crypto/keysetup.c +++ b/fs/crypto/keysetup.c @@ -475,7 +475,8 @@ out: EXPORT_SYMBOL(fscrypt_get_encryption_info); /** - * fscrypt_put_encryption_info - free most of an inode's fscrypt data + * fscrypt_put_encryption_info() - free most of an inode's fscrypt data + * @inode: an inode being evicted * * Free the inode's fscrypt_info. Filesystems must call this when the inode is * being evicted. An RCU grace period need not have elapsed yet. @@ -488,7 +489,8 @@ void fscrypt_put_encryption_info(struct inode *inode) EXPORT_SYMBOL(fscrypt_put_encryption_info); /** - * fscrypt_free_inode - free an inode's fscrypt data requiring RCU delay + * fscrypt_free_inode() - free an inode's fscrypt data requiring RCU delay + * @inode: an inode being freed * * Free the inode's cached decrypted symlink target, if any. 
Filesystems must * call this after an RCU grace period, just before they free the inode. @@ -503,7 +505,8 @@ void fscrypt_free_inode(struct inode *inode) EXPORT_SYMBOL(fscrypt_free_inode); /** - * fscrypt_drop_inode - check whether the inode's master key has been removed + * fscrypt_drop_inode() - check whether the inode's master key has been removed + * @inode: an inode being considered for eviction * * Filesystems supporting fscrypt must call this from their ->drop_inode() * method so that encrypted inodes are evicted as soon as they're no longer in diff --git a/fs/crypto/policy.c b/fs/crypto/policy.c index 10ccf945020c..0a95537fcef9 100644 --- a/fs/crypto/policy.c +++ b/fs/crypto/policy.c @@ -16,7 +16,9 @@ #include "fscrypt_private.h" /** - * fscrypt_policies_equal - check whether two encryption policies are the same + * fscrypt_policies_equal() - check whether two encryption policies are the same + * @policy1: the first policy + * @policy2: the second policy * * Return: %true if equal, else %false */ @@ -170,7 +172,9 @@ static bool fscrypt_supported_v2_policy(const struct fscrypt_policy_v2 *policy, } /** - * fscrypt_supported_policy - check whether an encryption policy is supported + * fscrypt_supported_policy() - check whether an encryption policy is supported + * @policy_u: the encryption policy + * @inode: the inode on which the policy will be used * * Given an encryption policy, check whether all its encryption modes and other * settings are supported by this kernel on the given inode. (But we don't @@ -192,7 +196,10 @@ bool fscrypt_supported_policy(const union fscrypt_policy *policy_u, } /** - * fscrypt_new_context_from_policy - create a new fscrypt_context from a policy + * fscrypt_new_context_from_policy() - create a new fscrypt_context from + * an fscrypt_policy + * @ctx_u: output context + * @policy_u: input policy * * Create an fscrypt_context for an inode that is being assigned the given * encryption policy. A new nonce is randomly generated. @@ -242,7 +249,11 @@ static int fscrypt_new_context_from_policy(union fscrypt_context *ctx_u, } /** - * fscrypt_policy_from_context - convert an fscrypt_context to an fscrypt_policy + * fscrypt_policy_from_context() - convert an fscrypt_context to + * an fscrypt_policy + * @policy_u: output policy + * @ctx_u: input context + * @ctx_size: size of input context in bytes * * Given an fscrypt_context, build the corresponding fscrypt_policy. * diff --git a/include/linux/fscrypt.h b/include/linux/fscrypt.h index e3c2d2a15525..cb2c41f8dfde 100644 --- a/include/linux/fscrypt.h +++ b/include/linux/fscrypt.h @@ -75,6 +75,7 @@ static inline bool fscrypt_has_encryption_key(const struct inode *inode) /** * fscrypt_needs_contents_encryption() - check whether an inode needs * contents encryption + * @inode: the inode to check * * Return: %true iff the inode is an encrypted regular file and the kernel was * built with fscrypt support. @@ -504,7 +505,7 @@ static inline void fscrypt_set_ops(struct super_block *sb, #endif /* !CONFIG_FS_ENCRYPTION */ /** - * fscrypt_require_key - require an inode's encryption key + * fscrypt_require_key() - require an inode's encryption key * @inode: the inode we need the key for * * If the inode is encrypted, set up its encryption key if not already done. 
@@ -530,7 +531,8 @@ static inline int fscrypt_require_key(struct inode *inode) } /** - * fscrypt_prepare_link - prepare to link an inode into a possibly-encrypted directory + * fscrypt_prepare_link() - prepare to link an inode into a possibly-encrypted + * directory * @old_dentry: an existing dentry for the inode being linked * @dir: the target directory * @dentry: negative dentry for the target filename @@ -557,7 +559,8 @@ static inline int fscrypt_prepare_link(struct dentry *old_dentry, } /** - * fscrypt_prepare_rename - prepare for a rename between possibly-encrypted directories + * fscrypt_prepare_rename() - prepare for a rename between possibly-encrypted + * directories * @old_dir: source directory * @old_dentry: dentry for source file * @new_dir: target directory @@ -590,7 +593,8 @@ static inline int fscrypt_prepare_rename(struct inode *old_dir, } /** - * fscrypt_prepare_lookup - prepare to lookup a name in a possibly-encrypted directory + * fscrypt_prepare_lookup() - prepare to lookup a name in a possibly-encrypted + * directory * @dir: directory being searched * @dentry: filename being looked up * @fname: (output) the name to use to search the on-disk directory @@ -623,7 +627,8 @@ static inline int fscrypt_prepare_lookup(struct inode *dir, } /** - * fscrypt_prepare_setattr - prepare to change a possibly-encrypted inode's attributes + * fscrypt_prepare_setattr() - prepare to change a possibly-encrypted inode's + * attributes * @dentry: dentry through which the inode is being changed * @attr: attributes to change * @@ -648,7 +653,7 @@ static inline int fscrypt_prepare_setattr(struct dentry *dentry, } /** - * fscrypt_prepare_symlink - prepare to create a possibly-encrypted symlink + * fscrypt_prepare_symlink() - prepare to create a possibly-encrypted symlink * @dir: directory in which the symlink is being created * @target: plaintext symlink target * @len: length of @target excluding null terminator @@ -687,7 +692,7 @@ static inline int fscrypt_prepare_symlink(struct inode *dir, } /** - * fscrypt_encrypt_symlink - encrypt the symlink target if needed + * fscrypt_encrypt_symlink() - encrypt the symlink target if needed * @inode: symlink inode * @target: plaintext symlink target * @len: length of @target excluding null terminator -- cgit v1.2.3 From fe015a78e5d0139cb126e8dbfc46a80be2bd27ad Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Mon, 11 May 2020 12:13:57 -0700 Subject: fscrypt: name all function parameters Name all the function parameters. This makes it so that we don't have a mix of both styles, so it won't be ambiguous what to use in new fscrypt patches. This also matches the checkpatch expectation. 
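The pattern, shown on one declaration taken from the diff below:

/* before: parameter types only */
extern int fscrypt_ioctl_get_policy(struct file *, void __user *);

/* after: every parameter named */
extern int fscrypt_ioctl_get_policy(struct file *filp, void __user *arg);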
Link: https://lore.kernel.org/r/20200511191358.53096-3-ebiggers@kernel.org Signed-off-by: Eric Biggers --- include/linux/fscrypt.h | 46 +++++++++++++++++++++++++--------------------- 1 file changed, 25 insertions(+), 21 deletions(-) (limited to 'include/linux') diff --git a/include/linux/fscrypt.h b/include/linux/fscrypt.h index cb2c41f8dfde..210a05dd9ecd 100644 --- a/include/linux/fscrypt.h +++ b/include/linux/fscrypt.h @@ -56,10 +56,11 @@ struct fscrypt_name { struct fscrypt_operations { unsigned int flags; const char *key_prefix; - int (*get_context)(struct inode *, void *, size_t); - int (*set_context)(struct inode *, const void *, size_t, void *); - bool (*dummy_context)(struct inode *); - bool (*empty_dir)(struct inode *); + int (*get_context)(struct inode *inode, void *ctx, size_t len); + int (*set_context)(struct inode *inode, const void *ctx, size_t len, + void *fs_data); + bool (*dummy_context)(struct inode *inode); + bool (*empty_dir)(struct inode *inode); unsigned int max_namelen; bool (*has_stable_inodes)(struct super_block *sb); void (*get_ino_and_lblk_bits)(struct super_block *sb, @@ -137,13 +138,15 @@ static inline struct page *fscrypt_pagecache_page(struct page *bounce_page) extern void fscrypt_free_bounce_page(struct page *bounce_page); /* policy.c */ -extern int fscrypt_ioctl_set_policy(struct file *, const void __user *); -extern int fscrypt_ioctl_get_policy(struct file *, void __user *); -extern int fscrypt_ioctl_get_policy_ex(struct file *, void __user *); +extern int fscrypt_ioctl_set_policy(struct file *filp, const void __user *arg); +extern int fscrypt_ioctl_get_policy(struct file *filp, void __user *arg); +extern int fscrypt_ioctl_get_policy_ex(struct file *filp, void __user *arg); extern int fscrypt_ioctl_get_nonce(struct file *filp, void __user *arg); -extern int fscrypt_has_permitted_context(struct inode *, struct inode *); -extern int fscrypt_inherit_context(struct inode *, struct inode *, - void *, bool); +extern int fscrypt_has_permitted_context(struct inode *parent, + struct inode *child); +extern int fscrypt_inherit_context(struct inode *parent, struct inode *child, + void *fs_data, bool preload); + /* keyring.c */ extern void fscrypt_sb_free(struct super_block *sb); extern int fscrypt_ioctl_add_key(struct file *filp, void __user *arg); @@ -153,23 +156,24 @@ extern int fscrypt_ioctl_remove_key_all_users(struct file *filp, extern int fscrypt_ioctl_get_key_status(struct file *filp, void __user *arg); /* keysetup.c */ -extern int fscrypt_get_encryption_info(struct inode *); -extern void fscrypt_put_encryption_info(struct inode *); -extern void fscrypt_free_inode(struct inode *); +extern int fscrypt_get_encryption_info(struct inode *inode); +extern void fscrypt_put_encryption_info(struct inode *inode); +extern void fscrypt_free_inode(struct inode *inode); extern int fscrypt_drop_inode(struct inode *inode); /* fname.c */ -extern int fscrypt_setup_filename(struct inode *, const struct qstr *, - int lookup, struct fscrypt_name *); +extern int fscrypt_setup_filename(struct inode *inode, const struct qstr *iname, + int lookup, struct fscrypt_name *fname); static inline void fscrypt_free_filename(struct fscrypt_name *fname) { kfree(fname->crypto_buf.name); } -extern int fscrypt_fname_alloc_buffer(const struct inode *, u32, - struct fscrypt_str *); -extern void fscrypt_fname_free_buffer(struct fscrypt_str *); +extern int fscrypt_fname_alloc_buffer(const struct inode *inode, + u32 max_encrypted_len, + struct fscrypt_str *crypto_str); +extern void 
fscrypt_fname_free_buffer(struct fscrypt_str *crypto_str); extern int fscrypt_fname_disk_to_usr(const struct inode *inode, u32 hash, u32 minor_hash, const struct fscrypt_str *iname, @@ -180,9 +184,9 @@ extern u64 fscrypt_fname_siphash(const struct inode *dir, const struct qstr *name); /* bio.c */ -extern void fscrypt_decrypt_bio(struct bio *); -extern int fscrypt_zeroout_range(const struct inode *, pgoff_t, sector_t, - unsigned int); +extern void fscrypt_decrypt_bio(struct bio *bio); +extern int fscrypt_zeroout_range(const struct inode *inode, pgoff_t lblk, + sector_t pblk, unsigned int len); /* hooks.c */ extern int fscrypt_file_open(struct inode *inode, struct file *filp); -- cgit v1.2.3 From 607009020a5e7fd9353fb2dd4cdcc73e26f3350f Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Mon, 11 May 2020 12:13:58 -0700 Subject: fscrypt: remove unnecessary extern keywords Remove the unnecessary 'extern' keywords from function declarations. This makes it so that we don't have a mix of both styles, so it won't be ambiguous what to use in new fscrypt patches. This also makes the code shorter and matches the 'checkpatch --strict' expectation. Link: https://lore.kernel.org/r/20200511191358.53096-4-ebiggers@kernel.org Signed-off-by: Eric Biggers --- fs/crypto/fscrypt_private.h | 84 +++++++++++++-------------- include/linux/fscrypt.h | 138 +++++++++++++++++++++----------------------- 2 files changed, 105 insertions(+), 117 deletions(-) (limited to 'include/linux') diff --git a/fs/crypto/fscrypt_private.h b/fs/crypto/fscrypt_private.h index f547094100be..20cbd9a4b28b 100644 --- a/fs/crypto/fscrypt_private.h +++ b/fs/crypto/fscrypt_private.h @@ -231,15 +231,14 @@ typedef enum { /* crypto.c */ extern struct kmem_cache *fscrypt_info_cachep; -extern int fscrypt_initialize(unsigned int cop_flags); -extern int fscrypt_crypt_block(const struct inode *inode, - fscrypt_direction_t rw, u64 lblk_num, - struct page *src_page, struct page *dest_page, - unsigned int len, unsigned int offs, - gfp_t gfp_flags); -extern struct page *fscrypt_alloc_bounce_page(gfp_t gfp_flags); - -extern void __printf(3, 4) __cold +int fscrypt_initialize(unsigned int cop_flags); +int fscrypt_crypt_block(const struct inode *inode, fscrypt_direction_t rw, + u64 lblk_num, struct page *src_page, + struct page *dest_page, unsigned int len, + unsigned int offs, gfp_t gfp_flags); +struct page *fscrypt_alloc_bounce_page(gfp_t gfp_flags); + +void __printf(3, 4) __cold fscrypt_msg(const struct inode *inode, const char *level, const char *fmt, ...); #define fscrypt_warn(inode, fmt, ...) 
\ @@ -264,12 +263,10 @@ void fscrypt_generate_iv(union fscrypt_iv *iv, u64 lblk_num, const struct fscrypt_info *ci); /* fname.c */ -extern int fscrypt_fname_encrypt(const struct inode *inode, - const struct qstr *iname, - u8 *out, unsigned int olen); -extern bool fscrypt_fname_encrypted_size(const struct inode *inode, - u32 orig_len, u32 max_len, - u32 *encrypted_len_ret); +int fscrypt_fname_encrypt(const struct inode *inode, const struct qstr *iname, + u8 *out, unsigned int olen); +bool fscrypt_fname_encrypted_size(const struct inode *inode, u32 orig_len, + u32 max_len, u32 *encrypted_len_ret); extern const struct dentry_operations fscrypt_d_ops; /* hkdf.c */ @@ -278,8 +275,8 @@ struct fscrypt_hkdf { struct crypto_shash *hmac_tfm; }; -extern int fscrypt_init_hkdf(struct fscrypt_hkdf *hkdf, const u8 *master_key, - unsigned int master_key_size); +int fscrypt_init_hkdf(struct fscrypt_hkdf *hkdf, const u8 *master_key, + unsigned int master_key_size); /* * The list of contexts in which fscrypt uses HKDF. These values are used as @@ -294,11 +291,11 @@ extern int fscrypt_init_hkdf(struct fscrypt_hkdf *hkdf, const u8 *master_key, #define HKDF_CONTEXT_IV_INO_LBLK_64_KEY 4 #define HKDF_CONTEXT_DIRHASH_KEY 5 -extern int fscrypt_hkdf_expand(const struct fscrypt_hkdf *hkdf, u8 context, - const u8 *info, unsigned int infolen, - u8 *okm, unsigned int okmlen); +int fscrypt_hkdf_expand(const struct fscrypt_hkdf *hkdf, u8 context, + const u8 *info, unsigned int infolen, + u8 *okm, unsigned int okmlen); -extern void fscrypt_destroy_hkdf(struct fscrypt_hkdf *hkdf); +void fscrypt_destroy_hkdf(struct fscrypt_hkdf *hkdf); /* keyring.c */ @@ -436,14 +433,14 @@ static inline int master_key_spec_len(const struct fscrypt_key_specifier *spec) return 0; } -extern struct key * +struct key * fscrypt_find_master_key(struct super_block *sb, const struct fscrypt_key_specifier *mk_spec); -extern int fscrypt_verify_key_added(struct super_block *sb, - const u8 identifier[FSCRYPT_KEY_IDENTIFIER_SIZE]); +int fscrypt_verify_key_added(struct super_block *sb, + const u8 identifier[FSCRYPT_KEY_IDENTIFIER_SIZE]); -extern int __init fscrypt_init_keyring(void); +int __init fscrypt_init_keyring(void); /* keysetup.c */ @@ -457,33 +454,32 @@ struct fscrypt_mode { extern struct fscrypt_mode fscrypt_modes[]; -extern struct crypto_skcipher * -fscrypt_allocate_skcipher(struct fscrypt_mode *mode, const u8 *raw_key, - const struct inode *inode); +struct crypto_skcipher *fscrypt_allocate_skcipher(struct fscrypt_mode *mode, + const u8 *raw_key, + const struct inode *inode); -extern int fscrypt_set_per_file_enc_key(struct fscrypt_info *ci, - const u8 *raw_key); +int fscrypt_set_per_file_enc_key(struct fscrypt_info *ci, const u8 *raw_key); -extern int fscrypt_derive_dirhash_key(struct fscrypt_info *ci, - const struct fscrypt_master_key *mk); +int fscrypt_derive_dirhash_key(struct fscrypt_info *ci, + const struct fscrypt_master_key *mk); /* keysetup_v1.c */ -extern void fscrypt_put_direct_key(struct fscrypt_direct_key *dk); +void fscrypt_put_direct_key(struct fscrypt_direct_key *dk); -extern int fscrypt_setup_v1_file_key(struct fscrypt_info *ci, - const u8 *raw_master_key); +int fscrypt_setup_v1_file_key(struct fscrypt_info *ci, + const u8 *raw_master_key); + +int fscrypt_setup_v1_file_key_via_subscribed_keyrings(struct fscrypt_info *ci); -extern int fscrypt_setup_v1_file_key_via_subscribed_keyrings( - struct fscrypt_info *ci); /* policy.c */ -extern bool fscrypt_policies_equal(const union fscrypt_policy *policy1, - const union fscrypt_policy 
*policy2); -extern bool fscrypt_supported_policy(const union fscrypt_policy *policy_u, - const struct inode *inode); -extern int fscrypt_policy_from_context(union fscrypt_policy *policy_u, - const union fscrypt_context *ctx_u, - int ctx_size); +bool fscrypt_policies_equal(const union fscrypt_policy *policy1, + const union fscrypt_policy *policy2); +bool fscrypt_supported_policy(const union fscrypt_policy *policy_u, + const struct inode *inode); +int fscrypt_policy_from_context(union fscrypt_policy *policy_u, + const union fscrypt_context *ctx_u, + int ctx_size); #endif /* _FSCRYPT_PRIVATE_H */ diff --git a/include/linux/fscrypt.h b/include/linux/fscrypt.h index 210a05dd9ecd..0e0c7ad19383 100644 --- a/include/linux/fscrypt.h +++ b/include/linux/fscrypt.h @@ -108,22 +108,21 @@ static inline void fscrypt_handle_d_move(struct dentry *dentry) } /* crypto.c */ -extern void fscrypt_enqueue_decrypt_work(struct work_struct *); - -extern struct page *fscrypt_encrypt_pagecache_blocks(struct page *page, - unsigned int len, - unsigned int offs, - gfp_t gfp_flags); -extern int fscrypt_encrypt_block_inplace(const struct inode *inode, - struct page *page, unsigned int len, - unsigned int offs, u64 lblk_num, - gfp_t gfp_flags); - -extern int fscrypt_decrypt_pagecache_blocks(struct page *page, unsigned int len, - unsigned int offs); -extern int fscrypt_decrypt_block_inplace(const struct inode *inode, - struct page *page, unsigned int len, - unsigned int offs, u64 lblk_num); +void fscrypt_enqueue_decrypt_work(struct work_struct *); + +struct page *fscrypt_encrypt_pagecache_blocks(struct page *page, + unsigned int len, + unsigned int offs, + gfp_t gfp_flags); +int fscrypt_encrypt_block_inplace(const struct inode *inode, struct page *page, + unsigned int len, unsigned int offs, + u64 lblk_num, gfp_t gfp_flags); + +int fscrypt_decrypt_pagecache_blocks(struct page *page, unsigned int len, + unsigned int offs); +int fscrypt_decrypt_block_inplace(const struct inode *inode, struct page *page, + unsigned int len, unsigned int offs, + u64 lblk_num); static inline bool fscrypt_is_bounce_page(struct page *page) { @@ -135,81 +134,74 @@ static inline struct page *fscrypt_pagecache_page(struct page *bounce_page) return (struct page *)page_private(bounce_page); } -extern void fscrypt_free_bounce_page(struct page *bounce_page); +void fscrypt_free_bounce_page(struct page *bounce_page); /* policy.c */ -extern int fscrypt_ioctl_set_policy(struct file *filp, const void __user *arg); -extern int fscrypt_ioctl_get_policy(struct file *filp, void __user *arg); -extern int fscrypt_ioctl_get_policy_ex(struct file *filp, void __user *arg); -extern int fscrypt_ioctl_get_nonce(struct file *filp, void __user *arg); -extern int fscrypt_has_permitted_context(struct inode *parent, - struct inode *child); -extern int fscrypt_inherit_context(struct inode *parent, struct inode *child, - void *fs_data, bool preload); +int fscrypt_ioctl_set_policy(struct file *filp, const void __user *arg); +int fscrypt_ioctl_get_policy(struct file *filp, void __user *arg); +int fscrypt_ioctl_get_policy_ex(struct file *filp, void __user *arg); +int fscrypt_ioctl_get_nonce(struct file *filp, void __user *arg); +int fscrypt_has_permitted_context(struct inode *parent, struct inode *child); +int fscrypt_inherit_context(struct inode *parent, struct inode *child, + void *fs_data, bool preload); /* keyring.c */ -extern void fscrypt_sb_free(struct super_block *sb); -extern int fscrypt_ioctl_add_key(struct file *filp, void __user *arg); -extern int 
fscrypt_ioctl_remove_key(struct file *filp, void __user *arg); -extern int fscrypt_ioctl_remove_key_all_users(struct file *filp, - void __user *arg); -extern int fscrypt_ioctl_get_key_status(struct file *filp, void __user *arg); +void fscrypt_sb_free(struct super_block *sb); +int fscrypt_ioctl_add_key(struct file *filp, void __user *arg); +int fscrypt_ioctl_remove_key(struct file *filp, void __user *arg); +int fscrypt_ioctl_remove_key_all_users(struct file *filp, void __user *arg); +int fscrypt_ioctl_get_key_status(struct file *filp, void __user *arg); /* keysetup.c */ -extern int fscrypt_get_encryption_info(struct inode *inode); -extern void fscrypt_put_encryption_info(struct inode *inode); -extern void fscrypt_free_inode(struct inode *inode); -extern int fscrypt_drop_inode(struct inode *inode); +int fscrypt_get_encryption_info(struct inode *inode); +void fscrypt_put_encryption_info(struct inode *inode); +void fscrypt_free_inode(struct inode *inode); +int fscrypt_drop_inode(struct inode *inode); /* fname.c */ -extern int fscrypt_setup_filename(struct inode *inode, const struct qstr *iname, - int lookup, struct fscrypt_name *fname); +int fscrypt_setup_filename(struct inode *inode, const struct qstr *iname, + int lookup, struct fscrypt_name *fname); static inline void fscrypt_free_filename(struct fscrypt_name *fname) { kfree(fname->crypto_buf.name); } -extern int fscrypt_fname_alloc_buffer(const struct inode *inode, - u32 max_encrypted_len, - struct fscrypt_str *crypto_str); -extern void fscrypt_fname_free_buffer(struct fscrypt_str *crypto_str); -extern int fscrypt_fname_disk_to_usr(const struct inode *inode, - u32 hash, u32 minor_hash, - const struct fscrypt_str *iname, - struct fscrypt_str *oname); -extern bool fscrypt_match_name(const struct fscrypt_name *fname, - const u8 *de_name, u32 de_name_len); -extern u64 fscrypt_fname_siphash(const struct inode *dir, - const struct qstr *name); +int fscrypt_fname_alloc_buffer(const struct inode *inode, u32 max_encrypted_len, + struct fscrypt_str *crypto_str); +void fscrypt_fname_free_buffer(struct fscrypt_str *crypto_str); +int fscrypt_fname_disk_to_usr(const struct inode *inode, + u32 hash, u32 minor_hash, + const struct fscrypt_str *iname, + struct fscrypt_str *oname); +bool fscrypt_match_name(const struct fscrypt_name *fname, + const u8 *de_name, u32 de_name_len); +u64 fscrypt_fname_siphash(const struct inode *dir, const struct qstr *name); /* bio.c */ -extern void fscrypt_decrypt_bio(struct bio *bio); -extern int fscrypt_zeroout_range(const struct inode *inode, pgoff_t lblk, - sector_t pblk, unsigned int len); +void fscrypt_decrypt_bio(struct bio *bio); +int fscrypt_zeroout_range(const struct inode *inode, pgoff_t lblk, + sector_t pblk, unsigned int len); /* hooks.c */ -extern int fscrypt_file_open(struct inode *inode, struct file *filp); -extern int __fscrypt_prepare_link(struct inode *inode, struct inode *dir, - struct dentry *dentry); -extern int __fscrypt_prepare_rename(struct inode *old_dir, - struct dentry *old_dentry, - struct inode *new_dir, - struct dentry *new_dentry, - unsigned int flags); -extern int __fscrypt_prepare_lookup(struct inode *dir, struct dentry *dentry, - struct fscrypt_name *fname); -extern int fscrypt_prepare_setflags(struct inode *inode, - unsigned int oldflags, unsigned int flags); -extern int __fscrypt_prepare_symlink(struct inode *dir, unsigned int len, - unsigned int max_len, - struct fscrypt_str *disk_link); -extern int __fscrypt_encrypt_symlink(struct inode *inode, const char *target, - unsigned int len, - 
struct fscrypt_str *disk_link); -extern const char *fscrypt_get_symlink(struct inode *inode, const void *caddr, - unsigned int max_size, - struct delayed_call *done); +int fscrypt_file_open(struct inode *inode, struct file *filp); +int __fscrypt_prepare_link(struct inode *inode, struct inode *dir, + struct dentry *dentry); +int __fscrypt_prepare_rename(struct inode *old_dir, struct dentry *old_dentry, + struct inode *new_dir, struct dentry *new_dentry, + unsigned int flags); +int __fscrypt_prepare_lookup(struct inode *dir, struct dentry *dentry, + struct fscrypt_name *fname); +int fscrypt_prepare_setflags(struct inode *inode, + unsigned int oldflags, unsigned int flags); +int __fscrypt_prepare_symlink(struct inode *dir, unsigned int len, + unsigned int max_len, + struct fscrypt_str *disk_link); +int __fscrypt_encrypt_symlink(struct inode *inode, const char *target, + unsigned int len, struct fscrypt_str *disk_link); +const char *fscrypt_get_symlink(struct inode *inode, const void *caddr, + unsigned int max_size, + struct delayed_call *done); static inline void fscrypt_set_ops(struct super_block *sb, const struct fscrypt_operations *s_cop) { -- cgit v1.2.3 From 6377a38bd345b7f3b664c43ebb647ae55c1166e4 Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Mon, 11 May 2020 12:21:17 -0700 Subject: fs-verity: fix all kerneldoc warnings Fix all kerneldoc warnings in fs/verity/ and include/linux/fsverity.h. Most of these were due to missing documentation for function parameters. Detected with: scripts/kernel-doc -v -none fs/verity/*.{c,h} include/linux/fsverity.h This cleanup makes it possible to check new patches for kerneldoc warnings without having to filter out all the existing ones. Link: https://lore.kernel.org/r/20200511192118.71427-2-ebiggers@kernel.org Signed-off-by: Eric Biggers --- fs/verity/enable.c | 2 ++ fs/verity/fsverity_private.h | 2 +- fs/verity/measure.c | 2 ++ fs/verity/open.c | 1 + fs/verity/signature.c | 3 +++ fs/verity/verify.c | 3 +++ include/linux/fsverity.h | 3 +++ 7 files changed, 15 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/fs/verity/enable.c b/fs/verity/enable.c index d98bea308fd7..5ab3bbec8108 100644 --- a/fs/verity/enable.c +++ b/fs/verity/enable.c @@ -329,6 +329,8 @@ rollback: /** * fsverity_ioctl_enable() - enable verity on a file + * @filp: file to enable verity on + * @uarg: user pointer to fsverity_enable_arg * * Enable fs-verity on a file. See the "FS_IOC_ENABLE_VERITY" section of * Documentation/filesystems/fsverity.rst for the documentation. diff --git a/fs/verity/fsverity_private.h b/fs/verity/fsverity_private.h index 74768cf539da..8788f3a5035c 100644 --- a/fs/verity/fsverity_private.h +++ b/fs/verity/fsverity_private.h @@ -61,7 +61,7 @@ struct merkle_tree_params { u64 level_start[FS_VERITY_MAX_LEVELS]; }; -/** +/* * fsverity_info - cached verity metadata for an inode * * When a verity file is first opened, an instance of this struct is allocated diff --git a/fs/verity/measure.c b/fs/verity/measure.c index 05049b68c745..df409a5682ed 100644 --- a/fs/verity/measure.c +++ b/fs/verity/measure.c @@ -11,6 +11,8 @@ /** * fsverity_ioctl_measure() - get a verity file's measurement + * @filp: file to get measurement of + * @_uarg: user pointer to fsverity_digest * * Retrieve the file measurement that the kernel is enforcing for reads from a * verity file. 
See the "FS_IOC_MEASURE_VERITY" section of diff --git a/fs/verity/open.c b/fs/verity/open.c index c5fe6948e262..d007db0c9304 100644 --- a/fs/verity/open.c +++ b/fs/verity/open.c @@ -330,6 +330,7 @@ EXPORT_SYMBOL_GPL(fsverity_prepare_setattr); /** * fsverity_cleanup_inode() - free the inode's verity info, if present + * @inode: an inode being evicted * * Filesystems must call this on inode eviction to free ->i_verity_info. */ diff --git a/fs/verity/signature.c b/fs/verity/signature.c index c8b255232de5..b14ed96387ec 100644 --- a/fs/verity/signature.c +++ b/fs/verity/signature.c @@ -28,6 +28,9 @@ static struct key *fsverity_keyring; /** * fsverity_verify_signature() - check a verity file's signature + * @vi: the file's fsverity_info + * @desc: the file's fsverity_descriptor + * @desc_size: size of @desc * * If the file's fs-verity descriptor includes a signature of the file * measurement, verify it against the certificates in the fs-verity keyring. diff --git a/fs/verity/verify.c b/fs/verity/verify.c index e0cb62da3864..a8b68c6f663d 100644 --- a/fs/verity/verify.c +++ b/fs/verity/verify.c @@ -179,6 +179,7 @@ out: /** * fsverity_verify_page() - verify a data page + * @page: the page to verity * * Verify a page that has just been read from a verity file. The page must be a * pagecache page that is still locked and not yet uptodate. @@ -206,6 +207,7 @@ EXPORT_SYMBOL_GPL(fsverity_verify_page); #ifdef CONFIG_BLOCK /** * fsverity_verify_bio() - verify a 'read' bio that has just completed + * @bio: the bio to verify * * Verify a set of pages that have just been read from a verity file. The pages * must be pagecache pages that are still locked and not yet uptodate. Pages @@ -264,6 +266,7 @@ EXPORT_SYMBOL_GPL(fsverity_verify_bio); /** * fsverity_enqueue_verify_work() - enqueue work on the fs-verity workqueue + * @work: the work to enqueue * * Enqueue verification work for asynchronous processing. */ diff --git a/include/linux/fsverity.h b/include/linux/fsverity.h index ecc604e61d61..5ac30198e032 100644 --- a/include/linux/fsverity.h +++ b/include/linux/fsverity.h @@ -200,6 +200,7 @@ static inline void fsverity_enqueue_verify_work(struct work_struct *work) /** * fsverity_active() - do reads from the inode need to go through fs-verity? + * @inode: inode to check * * This checks whether ->i_verity_info has been set. * @@ -207,6 +208,8 @@ static inline void fsverity_enqueue_verify_work(struct work_struct *work) * be verified or not. Don't use IS_VERITY() for this purpose; it's subject to * a race condition where the file is being read concurrently with * FS_IOC_ENABLE_VERITY completing. (S_VERITY is set before ->i_verity_info.) + * + * Return: true if reads need to go through fs-verity, otherwise false */ static inline bool fsverity_active(const struct inode *inode) { -- cgit v1.2.3 From 9cd6b593cfc9eaa476c9a3fa768b08bca73213d0 Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Mon, 11 May 2020 12:21:18 -0700 Subject: fs-verity: remove unnecessary extern keywords Remove the unnecessary 'extern' keywords from function declarations. This makes it so that we don't have a mix of both styles, so it won't be ambiguous what to use in new fs-verity patches. This also makes the code shorter and matches the 'checkpatch --strict' expectation. 
Link: https://lore.kernel.org/r/20200511192118.71427-3-ebiggers@kernel.org Signed-off-by: Eric Biggers --- fs/verity/fsverity_private.h | 2 +- include/linux/fsverity.h | 16 ++++++++-------- 2 files changed, 9 insertions(+), 9 deletions(-) (limited to 'include/linux') diff --git a/fs/verity/fsverity_private.h b/fs/verity/fsverity_private.h index 8788f3a5035c..e96d99d5145e 100644 --- a/fs/verity/fsverity_private.h +++ b/fs/verity/fsverity_private.h @@ -134,7 +134,7 @@ void __init fsverity_check_hash_algs(void); /* init.c */ -extern void __printf(3, 4) __cold +void __printf(3, 4) __cold fsverity_msg(const struct inode *inode, const char *level, const char *fmt, ...); diff --git a/include/linux/fsverity.h b/include/linux/fsverity.h index 5ac30198e032..78201a6d35f6 100644 --- a/include/linux/fsverity.h +++ b/include/linux/fsverity.h @@ -121,23 +121,23 @@ static inline struct fsverity_info *fsverity_get_info(const struct inode *inode) /* enable.c */ -extern int fsverity_ioctl_enable(struct file *filp, const void __user *arg); +int fsverity_ioctl_enable(struct file *filp, const void __user *arg); /* measure.c */ -extern int fsverity_ioctl_measure(struct file *filp, void __user *arg); +int fsverity_ioctl_measure(struct file *filp, void __user *arg); /* open.c */ -extern int fsverity_file_open(struct inode *inode, struct file *filp); -extern int fsverity_prepare_setattr(struct dentry *dentry, struct iattr *attr); -extern void fsverity_cleanup_inode(struct inode *inode); +int fsverity_file_open(struct inode *inode, struct file *filp); +int fsverity_prepare_setattr(struct dentry *dentry, struct iattr *attr); +void fsverity_cleanup_inode(struct inode *inode); /* verify.c */ -extern bool fsverity_verify_page(struct page *page); -extern void fsverity_verify_bio(struct bio *bio); -extern void fsverity_enqueue_verify_work(struct work_struct *work); +bool fsverity_verify_page(struct page *page); +void fsverity_verify_bio(struct bio *bio); +void fsverity_enqueue_verify_work(struct work_struct *work); #else /* !CONFIG_FS_VERITY */ -- cgit v1.2.3 From 07c4e1e834f8e7c991aa6dcb5ee0b7f2842e495d Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Fri, 8 May 2020 16:17:56 +0800 Subject: block: only define 'nr_sects_seq' in hd_part for 32bit SMP The seqcount of 'nr_sects_seq' is only needed in case of 32bit SMP, so define it just for 32bit SMP. Signed-off-by: Ming Lei Reviewed-by: Christoph Hellwig Cc: Yufen Yu Cc: Christoph Hellwig Cc: Hou Tao Signed-off-by: Jens Axboe --- block/genhd.c | 2 +- block/partitions/core.c | 2 +- include/linux/genhd.h | 9 +++++++++ 3 files changed, 11 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/block/genhd.c b/block/genhd.c index af6cba2a9332..1a51a9ad1035 100644 --- a/block/genhd.c +++ b/block/genhd.c @@ -1731,7 +1731,7 @@ struct gendisk *__alloc_disk_node(int minors, int node_id) * TODO: Ideally set_capacity() and get_capacity() should be * converted to make use of bd_mutex and sequence counters. 
*/ - seqcount_init(&disk->part0.nr_sects_seq); + hd_sects_seq_init(&disk->part0); if (hd_ref_init(&disk->part0)) { hd_free_part(&disk->part0); kfree(disk); diff --git a/block/partitions/core.c b/block/partitions/core.c index 8d8c87207fc2..297004fd2264 100644 --- a/block/partitions/core.c +++ b/block/partitions/core.c @@ -392,7 +392,7 @@ static struct hd_struct *add_partition(struct gendisk *disk, int partno, goto out_free; } - seqcount_init(&p->nr_sects_seq); + hd_sects_seq_init(p); pdev = part_to_dev(p); p->start_sect = start; diff --git a/include/linux/genhd.h b/include/linux/genhd.h index f9c226f9546a..b4744035ae58 100644 --- a/include/linux/genhd.h +++ b/include/linux/genhd.h @@ -68,7 +68,9 @@ struct hd_struct { * can be non-atomic on 32bit machines with 64bit sector_t. */ sector_t nr_sects; +#if BITS_PER_LONG==32 && defined(CONFIG_SMP) seqcount_t nr_sects_seq; +#endif sector_t alignment_offset; unsigned int discard_alignment; struct device __dev; @@ -274,6 +276,13 @@ static inline void disk_put_part(struct hd_struct *part) put_device(part_to_dev(part)); } +static inline void hd_sects_seq_init(struct hd_struct *p) +{ +#if BITS_PER_LONG==32 && defined(CONFIG_SMP) + seqcount_init(&p->nr_sects_seq); +#endif +} + /* * Smarter partition iterator without context limits. */ -- cgit v1.2.3 From 520138c3b9425c615d1417687947d86821003319 Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Fri, 8 May 2020 16:17:57 +0800 Subject: block: re-organize fields of 'struct hd_part' Put all fields accessed in IO path together at the beginning of the struct, so that all can be fetched in single cacheline. Signed-off-by: Ming Lei Reviewed-by: Christoph Hellwig Cc: Yufen Yu Cc: Christoph Hellwig Cc: Hou Tao Signed-off-by: Jens Axboe --- include/linux/genhd.h | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) (limited to 'include/linux') diff --git a/include/linux/genhd.h b/include/linux/genhd.h index b4744035ae58..a9384449465a 100644 --- a/include/linux/genhd.h +++ b/include/linux/genhd.h @@ -71,6 +71,14 @@ struct hd_struct { #if BITS_PER_LONG==32 && defined(CONFIG_SMP) seqcount_t nr_sects_seq; #endif + unsigned long stamp; +#ifdef CONFIG_SMP + struct disk_stats __percpu *dkstats; +#else + struct disk_stats dkstats; +#endif + struct percpu_ref ref; + sector_t alignment_offset; unsigned int discard_alignment; struct device __dev; @@ -80,13 +88,6 @@ struct hd_struct { #ifdef CONFIG_FAIL_MAKE_REQUEST int make_it_fail; #endif - unsigned long stamp; -#ifdef CONFIG_SMP - struct disk_stats __percpu *dkstats; -#else - struct disk_stats dkstats; -#endif - struct percpu_ref ref; struct rcu_work rcu_work; }; -- cgit v1.2.3 From e6249cdd46e43a7d3bdb8cce5fe24565d6c11e94 Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Sun, 3 May 2020 09:54:22 +0800 Subject: block: add blk_io_schedule() for avoiding task hung in sync dio Sync dio could be big, or may take long time in discard or in case of IO failure. We have prevented task hung in submit_bio_wait() and blk_execute_rq(), so apply the same trick for prevent task hung from happening in sync dio. Add helper of blk_io_schedule() and use io_schedule_timeout() to prevent task hung warning. 
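For illustration, a hedged sketch of the wait-loop pattern this helper targets (not part of the patch; struct my_dio and its done flag are hypothetical stand-ins for the dio completion state used by the callers converted below):

#include <linux/blkdev.h>
#include <linux/sched.h>

struct my_dio {
	bool done;	/* hypothetical: set by the bio end_io callback */
};

static void my_wait_for_dio(struct my_dio *dio)
{
	for (;;) {
		set_current_state(TASK_UNINTERRUPTIBLE);
		if (READ_ONCE(dio->done))
			break;
		/*
		 * Was io_schedule(): sleeping in chunks bounded by
		 * sysctl_hung_task_timeout_secs means the task wakes up
		 * before the hung-task detector would flag it, however
		 * long the I/O takes.
		 */
		blk_io_schedule();
	}
	__set_current_state(TASK_RUNNING);
}
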
Signed-off-by: Ming Lei Reviewed-by: Bart Van Assche Cc: Salman Qazi Cc: Jesse Barnes Cc: Christoph Hellwig Cc: Bart Van Assche Cc: Hannes Reinecke Signed-off-by: Jens Axboe --- fs/block_dev.c | 4 ++-- fs/direct-io.c | 2 +- fs/iomap/direct-io.c | 2 +- include/linux/blkdev.h | 12 ++++++++++++ 4 files changed, 16 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/fs/block_dev.c b/fs/block_dev.c index 7cbb7b79935e..ebd1507789d2 100644 --- a/fs/block_dev.c +++ b/fs/block_dev.c @@ -255,7 +255,7 @@ __blkdev_direct_IO_simple(struct kiocb *iocb, struct iov_iter *iter, break; if (!(iocb->ki_flags & IOCB_HIPRI) || !blk_poll(bdev_get_queue(bdev), qc, true)) - io_schedule(); + blk_io_schedule(); } __set_current_state(TASK_RUNNING); @@ -449,7 +449,7 @@ __blkdev_direct_IO(struct kiocb *iocb, struct iov_iter *iter, int nr_pages) if (!(iocb->ki_flags & IOCB_HIPRI) || !blk_poll(bdev_get_queue(bdev), qc, true)) - io_schedule(); + blk_io_schedule(); } __set_current_state(TASK_RUNNING); diff --git a/fs/direct-io.c b/fs/direct-io.c index 00b4d15bb811..6d5370eac2a8 100644 --- a/fs/direct-io.c +++ b/fs/direct-io.c @@ -500,7 +500,7 @@ static struct bio *dio_await_one(struct dio *dio) spin_unlock_irqrestore(&dio->bio_lock, flags); if (!(dio->iocb->ki_flags & IOCB_HIPRI) || !blk_poll(dio->bio_disk->queue, dio->bio_cookie, true)) - io_schedule(); + blk_io_schedule(); /* wake up sets us TASK_RUNNING */ spin_lock_irqsave(&dio->bio_lock, flags); dio->waiter = NULL; diff --git a/fs/iomap/direct-io.c b/fs/iomap/direct-io.c index 20dde5aadcdd..fd3bd06fabb6 100644 --- a/fs/iomap/direct-io.c +++ b/fs/iomap/direct-io.c @@ -561,7 +561,7 @@ iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter, !dio->submit.last_queue || !blk_poll(dio->submit.last_queue, dio->submit.cookie, true)) - io_schedule(); + blk_io_schedule(); } __set_current_state(TASK_RUNNING); } diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index f00bd4042295..222eb5f32279 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -27,6 +27,7 @@ #include #include #include +#include struct module; struct scsi_ioctl_command; @@ -1827,4 +1828,15 @@ static inline void blk_wake_io_task(struct task_struct *waiter) wake_up_process(waiter); } +static inline void blk_io_schedule(void) +{ + /* Prevent hang_check timer from firing at us during very long I/O */ + unsigned long timeout = sysctl_hung_task_timeout_secs * HZ / 2; + + if (timeout) + io_schedule_timeout(timeout); + else + io_schedule(); +} + #endif -- cgit v1.2.3 From 02992df822e7e36685593aad10721a5a9f8d3402 Mon Sep 17 00:00:00 2001 From: Johannes Thumshirn Date: Tue, 12 May 2020 17:55:45 +0900 Subject: block: provide fallbacks for blk_queue_zone_is_seq and blk_queue_zone_no blk_queue_zone_is_seq() and blk_queue_zone_no() have not been called with CONFIG_BLK_DEV_ZONED disabled until now. The introduction of REQ_OP_ZONE_APPEND will change this, so we need to provide noop fallbacks for the !CONFIG_BLK_DEV_ZONED case. Signed-off-by: Johannes Thumshirn Reviewed-by: Christoph Hellwig Reviewed-by: Bart Van Assche Reviewed-by: Hannes Reinecke Reviewed-by: Martin K. 
Petersen Signed-off-by: Jens Axboe --- include/linux/blkdev.h | 10 ++++++++++ 1 file changed, 10 insertions(+) (limited to 'include/linux') diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 222eb5f32279..d736acf7f564 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -722,6 +722,16 @@ static inline unsigned int blk_queue_nr_zones(struct request_queue *q) { return 0; } +static inline bool blk_queue_zone_is_seq(struct request_queue *q, + sector_t sector) +{ + return false; +} +static inline unsigned int blk_queue_zone_no(struct request_queue *q, + sector_t sector) +{ + return 0; +} #endif /* CONFIG_BLK_DEV_ZONED */ static inline bool rq_is_sync(struct request *rq) -- cgit v1.2.3 From 0512a75b98f847c2e9a4b664013424e603e202f7 Mon Sep 17 00:00:00 2001 From: Keith Busch Date: Tue, 12 May 2020 17:55:47 +0900 Subject: block: Introduce REQ_OP_ZONE_APPEND Define REQ_OP_ZONE_APPEND to append-write sectors to a zone of a zoned block device. This is a no-merge write operation. A zone append write BIO must: * Target a zoned block device * Have a sector position indicating the start sector of the target zone * The target zone must be a sequential write zone * The BIO must not cross a zone boundary * The BIO must not be split, to ensure that a single range of LBAs is written with a single command. Implement these checks in generic_make_request_checks() using the helper function blk_check_zone_append(). To avoid write append BIO splitting, introduce the new max_zone_append_sectors queue limit attribute and ensure that a BIO size is always lower than this limit. Export this new limit through sysfs and check these limits in bio_full(). Also, when a LLDD can't dispatch a request to a specific zone, it will return BLK_STS_ZONE_RESOURCE, indicating this request needs to be delayed, e.g. because the zone it will be dispatched to is still write-locked. If this happens, set the request aside in a local list to continue trying to dispatch requests such as READ requests or WRITE/ZONE_APPEND requests targeting other zones. This way we can still keep a high queue depth without starving other requests, even if one request can't be served due to zone write-locking. Finally, make sure that the bio sector position indicates the actual write position as indicated by the device on completion. Signed-off-by: Keith Busch [ jth: added zone-append specific add_page and merge_page helpers ] Signed-off-by: Johannes Thumshirn Reviewed-by: Christoph Hellwig Reviewed-by: Hannes Reinecke Reviewed-by: Martin K.
Petersen Signed-off-by: Jens Axboe --- block/bio.c | 62 ++++++++++++++++++++++++++++++++++++++++++++--- block/blk-core.c | 52 +++++++++++++++++++++++++++++++++++++++ block/blk-mq.c | 27 +++++++++++++++++++++ block/blk-settings.c | 31 ++++++++++++++++++++++++ block/blk-sysfs.c | 13 ++++++++++ drivers/scsi/scsi_lib.c | 1 + include/linux/blk_types.h | 14 +++++++++++ include/linux/blkdev.h | 11 +++++++++ 8 files changed, 207 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/block/bio.c b/block/bio.c index aad0a6dad4f9..3aa3c4ce2e5e 100644 --- a/block/bio.c +++ b/block/bio.c @@ -1025,6 +1025,50 @@ static int __bio_iov_iter_get_pages(struct bio *bio, struct iov_iter *iter) return 0; } +static int __bio_iov_append_get_pages(struct bio *bio, struct iov_iter *iter) +{ + unsigned short nr_pages = bio->bi_max_vecs - bio->bi_vcnt; + unsigned short entries_left = bio->bi_max_vecs - bio->bi_vcnt; + struct request_queue *q = bio->bi_disk->queue; + unsigned int max_append_sectors = queue_max_zone_append_sectors(q); + struct bio_vec *bv = bio->bi_io_vec + bio->bi_vcnt; + struct page **pages = (struct page **)bv; + ssize_t size, left; + unsigned len, i; + size_t offset; + + if (WARN_ON_ONCE(!max_append_sectors)) + return 0; + + /* + * Move page array up in the allocated memory for the bio vecs as far as + * possible so that we can start filling biovecs from the beginning + * without overwriting the temporary page array. + */ + BUILD_BUG_ON(PAGE_PTRS_PER_BVEC < 2); + pages += entries_left * (PAGE_PTRS_PER_BVEC - 1); + + size = iov_iter_get_pages(iter, pages, LONG_MAX, nr_pages, &offset); + if (unlikely(size <= 0)) + return size ? size : -EFAULT; + + for (left = size, i = 0; left > 0; left -= len, i++) { + struct page *page = pages[i]; + bool same_page = false; + + len = min_t(size_t, PAGE_SIZE - offset, left); + if (bio_add_hw_page(q, bio, page, len, offset, + max_append_sectors, &same_page) != len) + return -EINVAL; + if (same_page) + put_page(page); + offset = 0; + } + + iov_iter_advance(iter, size); + return 0; +} + /** * bio_iov_iter_get_pages - add user or kernel pages to a bio * @bio: bio to add pages to @@ -1054,10 +1098,16 @@ int bio_iov_iter_get_pages(struct bio *bio, struct iov_iter *iter) return -EINVAL; do { - if (is_bvec) - ret = __bio_iov_bvec_add_pages(bio, iter); - else - ret = __bio_iov_iter_get_pages(bio, iter); + if (bio_op(bio) == REQ_OP_ZONE_APPEND) { + if (WARN_ON_ONCE(is_bvec)) + return -EINVAL; + ret = __bio_iov_append_get_pages(bio, iter); + } else { + if (is_bvec) + ret = __bio_iov_bvec_add_pages(bio, iter); + else + ret = __bio_iov_iter_get_pages(bio, iter); + } } while (!ret && iov_iter_count(iter) && !bio_full(bio, 0)); if (is_bvec) @@ -1460,6 +1510,10 @@ struct bio *bio_split(struct bio *bio, int sectors, BUG_ON(sectors <= 0); BUG_ON(sectors >= bio_sectors(bio)); + /* Zone append commands cannot be split */ + if (WARN_ON_ONCE(bio_op(bio) == REQ_OP_ZONE_APPEND)) + return NULL; + split = bio_clone_fast(bio, gfp, bs); if (!split) return NULL; diff --git a/block/blk-core.c b/block/blk-core.c index 409e1a6b73b0..cf5b2163edfe 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -135,6 +135,7 @@ static const char *const blk_op_name[] = { REQ_OP_NAME(ZONE_OPEN), REQ_OP_NAME(ZONE_CLOSE), REQ_OP_NAME(ZONE_FINISH), + REQ_OP_NAME(ZONE_APPEND), REQ_OP_NAME(WRITE_SAME), REQ_OP_NAME(WRITE_ZEROES), REQ_OP_NAME(SCSI_IN), @@ -240,6 +241,17 @@ static void req_bio_endio(struct request *rq, struct bio *bio, bio_advance(bio, nbytes); + if (req_op(rq) == REQ_OP_ZONE_APPEND && 
error == BLK_STS_OK) { + /* + * Partial zone append completions cannot be supported as the + * BIO fragments may end up not being written sequentially. + */ + if (bio->bi_iter.bi_size) + bio->bi_status = BLK_STS_IOERR; + else + bio->bi_iter.bi_sector = rq->__sector; + } + /* don't actually finish bio if it's part of flush sequence */ if (bio->bi_iter.bi_size == 0 && !(rq->rq_flags & RQF_FLUSH_SEQ)) bio_endio(bio); @@ -887,6 +899,41 @@ out: return ret; } +/* + * Check write append to a zoned block device. + */ +static inline blk_status_t blk_check_zone_append(struct request_queue *q, + struct bio *bio) +{ + sector_t pos = bio->bi_iter.bi_sector; + int nr_sectors = bio_sectors(bio); + + /* Only applicable to zoned block devices */ + if (!blk_queue_is_zoned(q)) + return BLK_STS_NOTSUPP; + + /* The bio sector must point to the start of a sequential zone */ + if (pos & (blk_queue_zone_sectors(q) - 1) || + !blk_queue_zone_is_seq(q, pos)) + return BLK_STS_IOERR; + + /* + * Not allowed to cross zone boundaries. Otherwise, the BIO will be + * split and could result in non-contiguous sectors being written in + * different zones. + */ + if (nr_sectors > q->limits.chunk_sectors) + return BLK_STS_IOERR; + + /* Make sure the BIO is small enough and will not get split */ + if (nr_sectors > q->limits.max_zone_append_sectors) + return BLK_STS_IOERR; + + bio->bi_opf |= REQ_NOMERGE; + + return BLK_STS_OK; +} + static noinline_for_stack bool generic_make_request_checks(struct bio *bio) { @@ -959,6 +1006,11 @@ generic_make_request_checks(struct bio *bio) if (!q->limits.max_write_same_sectors) goto not_supported; break; + case REQ_OP_ZONE_APPEND: + status = blk_check_zone_append(q, bio); + if (status != BLK_STS_OK) + goto end_io; + break; case REQ_OP_ZONE_RESET: case REQ_OP_ZONE_OPEN: case REQ_OP_ZONE_CLOSE: diff --git a/block/blk-mq.c b/block/blk-mq.c index d82cefb0474f..9ee695bdf873 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -1183,6 +1183,19 @@ static void blk_mq_handle_dev_resource(struct request *rq, __blk_mq_requeue_request(rq); } +static void blk_mq_handle_zone_resource(struct request *rq, + struct list_head *zone_list) +{ + /* + * If we end up here it is because we cannot dispatch a request to a + * specific zone due to LLD level zone-write locking or other zone + * related resource not being available. In this case, set the request + * aside in zone_list for retrying it later. + */ + list_add(&rq->queuelist, zone_list); + __blk_mq_requeue_request(rq); +} + /* * Returns true if we did some work AND can potentially do more. */ @@ -1195,6 +1208,7 @@ bool blk_mq_dispatch_rq_list(struct request_queue *q, struct list_head *list, int errors, queued; blk_status_t ret = BLK_STS_OK; bool no_budget_avail = false; + LIST_HEAD(zone_list); if (list_empty(list)) return false; @@ -1256,6 +1270,16 @@ bool blk_mq_dispatch_rq_list(struct request_queue *q, struct list_head *list, if (ret == BLK_STS_RESOURCE || ret == BLK_STS_DEV_RESOURCE) { blk_mq_handle_dev_resource(rq, list); break; + } else if (ret == BLK_STS_ZONE_RESOURCE) { + /* + * Move the request to zone_list and keep going through + * the dispatch list to find more requests the drive can + * accept. 
+ */ + blk_mq_handle_zone_resource(rq, &zone_list); + if (list_empty(list)) + break; + continue; } if (unlikely(ret != BLK_STS_OK)) { @@ -1267,6 +1291,9 @@ bool blk_mq_dispatch_rq_list(struct request_queue *q, struct list_head *list, queued++; } while (!list_empty(list)); + if (!list_empty(&zone_list)) + list_splice_tail_init(&zone_list, list); + hctx->dispatched[queued_to_index(queued)]++; /* diff --git a/block/blk-settings.c b/block/blk-settings.c index 2ab1967b9716..9a2c23cd9700 100644 --- a/block/blk-settings.c +++ b/block/blk-settings.c @@ -48,6 +48,7 @@ void blk_set_default_limits(struct queue_limits *lim) lim->chunk_sectors = 0; lim->max_write_same_sectors = 0; lim->max_write_zeroes_sectors = 0; + lim->max_zone_append_sectors = 0; lim->max_discard_sectors = 0; lim->max_hw_discard_sectors = 0; lim->discard_granularity = 0; @@ -83,6 +84,7 @@ void blk_set_stacking_limits(struct queue_limits *lim) lim->max_dev_sectors = UINT_MAX; lim->max_write_same_sectors = UINT_MAX; lim->max_write_zeroes_sectors = UINT_MAX; + lim->max_zone_append_sectors = UINT_MAX; } EXPORT_SYMBOL(blk_set_stacking_limits); @@ -221,6 +223,33 @@ void blk_queue_max_write_zeroes_sectors(struct request_queue *q, } EXPORT_SYMBOL(blk_queue_max_write_zeroes_sectors); +/** + * blk_queue_max_zone_append_sectors - set max sectors for a single zone append + * @q: the request queue for the device + * @max_zone_append_sectors: maximum number of sectors to write per command + **/ +void blk_queue_max_zone_append_sectors(struct request_queue *q, + unsigned int max_zone_append_sectors) +{ + unsigned int max_sectors; + + if (WARN_ON(!blk_queue_is_zoned(q))) + return; + + max_sectors = min(q->limits.max_hw_sectors, max_zone_append_sectors); + max_sectors = min(q->limits.chunk_sectors, max_sectors); + + /* + * Signal eventual driver bugs resulting in the max_zone_append sectors limit + * being 0 due to a 0 argument, the chunk_sectors limit (zone size) not set, + * or the max_hw_sectors limit not set. 
+ */ + WARN_ON(!max_sectors); + + q->limits.max_zone_append_sectors = max_sectors; +} +EXPORT_SYMBOL_GPL(blk_queue_max_zone_append_sectors); + /** * blk_queue_max_segments - set max hw segments for a request for this queue * @q: the request queue for the device @@ -470,6 +499,8 @@ int blk_stack_limits(struct queue_limits *t, struct queue_limits *b, b->max_write_same_sectors); t->max_write_zeroes_sectors = min(t->max_write_zeroes_sectors, b->max_write_zeroes_sectors); + t->max_zone_append_sectors = min(t->max_zone_append_sectors, + b->max_zone_append_sectors); t->bounce_pfn = min_not_zero(t->bounce_pfn, b->bounce_pfn); t->seg_boundary_mask = min_not_zero(t->seg_boundary_mask, diff --git a/block/blk-sysfs.c b/block/blk-sysfs.c index fca9b158f4a0..02643e149d5e 100644 --- a/block/blk-sysfs.c +++ b/block/blk-sysfs.c @@ -218,6 +218,13 @@ static ssize_t queue_write_zeroes_max_show(struct request_queue *q, char *page) (unsigned long long)q->limits.max_write_zeroes_sectors << 9); } +static ssize_t queue_zone_append_max_show(struct request_queue *q, char *page) +{ + unsigned long long max_sectors = q->limits.max_zone_append_sectors; + + return sprintf(page, "%llu\n", max_sectors << SECTOR_SHIFT); +} + static ssize_t queue_max_sectors_store(struct request_queue *q, const char *page, size_t count) { @@ -639,6 +646,11 @@ static struct queue_sysfs_entry queue_write_zeroes_max_entry = { .show = queue_write_zeroes_max_show, }; +static struct queue_sysfs_entry queue_zone_append_max_entry = { + .attr = {.name = "zone_append_max_bytes", .mode = 0444 }, + .show = queue_zone_append_max_show, +}; + static struct queue_sysfs_entry queue_nonrot_entry = { .attr = {.name = "rotational", .mode = 0644 }, .show = queue_show_nonrot, @@ -749,6 +761,7 @@ static struct attribute *queue_attrs[] = { &queue_discard_zeroes_data_entry.attr, &queue_write_same_max_entry.attr, &queue_write_zeroes_max_entry.attr, + &queue_zone_append_max_entry.attr, &queue_nonrot_entry.attr, &queue_zoned_entry.attr, &queue_nr_zones_entry.attr, diff --git a/drivers/scsi/scsi_lib.c b/drivers/scsi/scsi_lib.c index 0a73230a8f16..82ad0244b3d0 100644 --- a/drivers/scsi/scsi_lib.c +++ b/drivers/scsi/scsi_lib.c @@ -1706,6 +1706,7 @@ out_put_budget: case BLK_STS_OK: break; case BLK_STS_RESOURCE: + case BLK_STS_ZONE_RESOURCE: if (atomic_read(&sdev->device_busy) || scsi_device_blocked(sdev)) ret = BLK_STS_DEV_RESOURCE; diff --git a/include/linux/blk_types.h b/include/linux/blk_types.h index 90895d594e64..b90dca1fa430 100644 --- a/include/linux/blk_types.h +++ b/include/linux/blk_types.h @@ -63,6 +63,18 @@ typedef u8 __bitwise blk_status_t; */ #define BLK_STS_DEV_RESOURCE ((__force blk_status_t)13) +/* + * BLK_STS_ZONE_RESOURCE is returned from the driver to the block layer if zone + * related resources are unavailable, but the driver can guarantee the queue + * will be rerun in the future once the resources become available again. + * + * This is different from BLK_STS_DEV_RESOURCE in that it explicitly references + * a zone specific resource and IO to a different zone on the same device could + * still be served. Examples of that are zones that are write-locked, but a read + * to the same zone could be served. 
+ */ +#define BLK_STS_ZONE_RESOURCE ((__force blk_status_t)14) + /** * blk_path_error - returns true if error may be path related * @error: status the request was completed with @@ -296,6 +308,8 @@ enum req_opf { REQ_OP_ZONE_CLOSE = 11, /* Transition a zone to full */ REQ_OP_ZONE_FINISH = 12, + /* write data at the current zone write pointer */ + REQ_OP_ZONE_APPEND = 13, /* SCSI passthrough using struct scsi_request */ REQ_OP_SCSI_IN = 32, diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index d736acf7f564..5647c78bb876 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -332,6 +332,7 @@ struct queue_limits { unsigned int max_hw_discard_sectors; unsigned int max_write_same_sectors; unsigned int max_write_zeroes_sectors; + unsigned int max_zone_append_sectors; unsigned int discard_granularity; unsigned int discard_alignment; @@ -750,6 +751,9 @@ static inline bool rq_mergeable(struct request *rq) if (req_op(rq) == REQ_OP_WRITE_ZEROES) return false; + if (req_op(rq) == REQ_OP_ZONE_APPEND) + return false; + if (rq->cmd_flags & REQ_NOMERGE_FLAGS) return false; if (rq->rq_flags & RQF_NOMERGE_FLAGS) @@ -1084,6 +1088,8 @@ extern void blk_queue_max_write_same_sectors(struct request_queue *q, extern void blk_queue_max_write_zeroes_sectors(struct request_queue *q, unsigned int max_write_same_sectors); extern void blk_queue_logical_block_size(struct request_queue *, unsigned int); +extern void blk_queue_max_zone_append_sectors(struct request_queue *q, + unsigned int max_zone_append_sectors); extern void blk_queue_physical_block_size(struct request_queue *, unsigned int); extern void blk_queue_alignment_offset(struct request_queue *q, unsigned int alignment); @@ -1301,6 +1307,11 @@ static inline unsigned int queue_max_segment_size(const struct request_queue *q) return q->limits.max_segment_size; } +static inline unsigned int queue_max_zone_append_sectors(const struct request_queue *q) +{ + return q->limits.max_zone_append_sectors; +} + static inline unsigned queue_logical_block_size(const struct request_queue *q) { int retval = 512; -- cgit v1.2.3 From 1392d37018d4f68c5bb2c98dae9a018b73926865 Mon Sep 17 00:00:00 2001 From: Johannes Thumshirn Date: Tue, 12 May 2020 17:55:48 +0900 Subject: block: introduce blk_req_zone_write_trylock Introduce blk_req_zone_write_trylock(), which either grabs the write-lock for a sequential zone or returns false, if the zone is already locked. Signed-off-by: Johannes Thumshirn Reviewed-by: Christoph Hellwig Reviewed-by: Hannes Reinecke Reviewed-by: Martin K. 
Petersen Signed-off-by: Jens Axboe --- block/blk-zoned.c | 14 ++++++++++++++ include/linux/blkdev.h | 1 + 2 files changed, 15 insertions(+) (limited to 'include/linux') diff --git a/block/blk-zoned.c b/block/blk-zoned.c index f87956e0dcaf..c822cfa7a102 100644 --- a/block/blk-zoned.c +++ b/block/blk-zoned.c @@ -82,6 +82,20 @@ bool blk_req_needs_zone_write_lock(struct request *rq) } EXPORT_SYMBOL_GPL(blk_req_needs_zone_write_lock); +bool blk_req_zone_write_trylock(struct request *rq) +{ + unsigned int zno = blk_rq_zone_no(rq); + + if (test_and_set_bit(zno, rq->q->seq_zones_wlock)) + return false; + + WARN_ON_ONCE(rq->rq_flags & RQF_ZONE_WRITE_LOCKED); + rq->rq_flags |= RQF_ZONE_WRITE_LOCKED; + + return true; +} +EXPORT_SYMBOL_GPL(blk_req_zone_write_trylock); + void __blk_req_zone_write_lock(struct request *rq) { if (WARN_ON_ONCE(test_and_set_bit(blk_rq_zone_no(rq), diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 5647c78bb876..73f4f4f1df92 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -1738,6 +1738,7 @@ extern int bdev_write_page(struct block_device *, sector_t, struct page *, #ifdef CONFIG_BLK_DEV_ZONED bool blk_req_needs_zone_write_lock(struct request *rq); +bool blk_req_zone_write_trylock(struct request *rq); void __blk_req_zone_write_lock(struct request *rq); void __blk_req_zone_write_unlock(struct request *rq); -- cgit v1.2.3 From e732671aa5f67232cf760666a15242dead003362 Mon Sep 17 00:00:00 2001 From: Damien Le Moal Date: Tue, 12 May 2020 17:55:49 +0900 Subject: block: Modify revalidate zones Modify the interface of blk_revalidate_disk_zones() to add an optional driver callback function that a driver can use to extend processing done during zone revalidation. The callback, if defined, is executed with the device request queue frozen, after all zones have been inspected. Signed-off-by: Damien Le Moal Signed-off-by: Johannes Thumshirn Reviewed-by: Christoph Hellwig Reviewed-by: Hannes Reinecke Reviewed-by: Martin K. Petersen Signed-off-by: Jens Axboe --- block/blk-zoned.c | 9 ++++++++- drivers/block/null_blk_zoned.c | 2 +- include/linux/blkdev.h | 3 ++- 3 files changed, 11 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/block/blk-zoned.c b/block/blk-zoned.c index c822cfa7a102..23831fa8701d 100644 --- a/block/blk-zoned.c +++ b/block/blk-zoned.c @@ -471,14 +471,19 @@ static int blk_revalidate_zone_cb(struct blk_zone *zone, unsigned int idx, /** * blk_revalidate_disk_zones - (re)allocate and initialize zone bitmaps * @disk: Target disk + * @update_driver_data: Callback to update driver data on the frozen disk * * Helper function for low-level device drivers to (re) allocate and initialize * a disk request queue zone bitmaps. This functions should normally be called * within the disk ->revalidate method for blk-mq based drivers. For BIO based * drivers only q->nr_zones needs to be updated so that the sysfs exposed value * is correct. + * If the @update_driver_data callback function is not NULL, the callback is + * executed with the device request queue frozen after all zones have been + * checked. 
*/ -int blk_revalidate_disk_zones(struct gendisk *disk) +int blk_revalidate_disk_zones(struct gendisk *disk, + void (*update_driver_data)(struct gendisk *disk)) { struct request_queue *q = disk->queue; struct blk_revalidate_zone_args args = { @@ -512,6 +517,8 @@ int blk_revalidate_disk_zones(struct gendisk *disk) q->nr_zones = args.nr_zones; swap(q->seq_zones_wlock, args.seq_zones_wlock); swap(q->conv_zones_bitmap, args.conv_zones_bitmap); + if (update_driver_data) + update_driver_data(disk); ret = 0; } else { pr_warn("%s: failed to revalidate zones\n", disk->disk_name); diff --git a/drivers/block/null_blk_zoned.c b/drivers/block/null_blk_zoned.c index 9e4bcdad1a80..46641df2e58e 100644 --- a/drivers/block/null_blk_zoned.c +++ b/drivers/block/null_blk_zoned.c @@ -73,7 +73,7 @@ int null_register_zoned_dev(struct nullb *nullb) struct request_queue *q = nullb->q; if (queue_is_mq(q)) - return blk_revalidate_disk_zones(nullb->disk); + return blk_revalidate_disk_zones(nullb->disk, NULL); blk_queue_chunk_sectors(q, nullb->dev->zone_size_sects); q->nr_zones = blkdev_nr_zones(nullb->disk); diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 73f4f4f1df92..5360696d85ff 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -358,7 +358,8 @@ unsigned int blkdev_nr_zones(struct gendisk *disk); extern int blkdev_zone_mgmt(struct block_device *bdev, enum req_opf op, sector_t sectors, sector_t nr_sectors, gfp_t gfp_mask); -extern int blk_revalidate_disk_zones(struct gendisk *disk); +int blk_revalidate_disk_zones(struct gendisk *disk, + void (*update_driver_data)(struct gendisk *disk)); extern int blkdev_report_zones_ioctl(struct block_device *bdev, fmode_t mode, unsigned int cmd, unsigned long arg); -- cgit v1.2.3 From 755a7397947e21123d8162eaf7477b614732ff22 Mon Sep 17 00:00:00 2001 From: Dong Aisheng Date: Sun, 26 Apr 2020 16:11:43 +0800 Subject: dt-bindings: firmware: imx: Move system control into dt-binding headfile i.MX8 SoCs DTS file needs system control macro definitions, so move them into dt-binding headfile, then include/linux/firmware/imx/types.h can be removed and those drivers using it should be changed accordingly. Signed-off-by: Dong Aisheng Signed-off-by: Jacky Bai Signed-off-by: Anson Huang Signed-off-by: Shawn Guo --- drivers/firmware/imx/imx-scu.c | 1 - drivers/thermal/imx_sc_thermal.c | 2 +- include/dt-bindings/firmware/imx/rsrc.h | 51 ++++++++++++++++++++++++++ include/linux/firmware/imx/sci.h | 1 - include/linux/firmware/imx/types.h | 65 --------------------------------- 5 files changed, 52 insertions(+), 68 deletions(-) delete mode 100644 include/linux/firmware/imx/types.h (limited to 'include/linux') diff --git a/drivers/firmware/imx/imx-scu.c b/drivers/firmware/imx/imx-scu.c index b3da2e193ad2..d02c00d0d5b5 100644 --- a/drivers/firmware/imx/imx-scu.c +++ b/drivers/firmware/imx/imx-scu.c @@ -8,7 +8,6 @@ */ #include -#include #include #include #include diff --git a/drivers/thermal/imx_sc_thermal.c b/drivers/thermal/imx_sc_thermal.c index a8723b1eb8b0..8938ea81a525 100644 --- a/drivers/thermal/imx_sc_thermal.c +++ b/drivers/thermal/imx_sc_thermal.c @@ -3,9 +3,9 @@ * Copyright 2018-2020 NXP. 
*/ +#include #include #include -#include #include #include #include diff --git a/include/dt-bindings/firmware/imx/rsrc.h b/include/dt-bindings/firmware/imx/rsrc.h index 4e61f6485097..cdcda009efbd 100644 --- a/include/dt-bindings/firmware/imx/rsrc.h +++ b/include/dt-bindings/firmware/imx/rsrc.h @@ -547,4 +547,55 @@ #define IMX_SC_R_ATTESTATION 545 #define IMX_SC_R_LAST 546 +/* + * Defines for SC CONTROL + */ +#define IMX_SC_C_TEMP 0 +#define IMX_SC_C_TEMP_HI 1 +#define IMX_SC_C_TEMP_LOW 2 +#define IMX_SC_C_PXL_LINK_MST1_ADDR 3 +#define IMX_SC_C_PXL_LINK_MST2_ADDR 4 +#define IMX_SC_C_PXL_LINK_MST_ENB 5 +#define IMX_SC_C_PXL_LINK_MST1_ENB 6 +#define IMX_SC_C_PXL_LINK_MST2_ENB 7 +#define IMX_SC_C_PXL_LINK_SLV1_ADDR 8 +#define IMX_SC_C_PXL_LINK_SLV2_ADDR 9 +#define IMX_SC_C_PXL_LINK_MST_VLD 10 +#define IMX_SC_C_PXL_LINK_MST1_VLD 11 +#define IMX_SC_C_PXL_LINK_MST2_VLD 12 +#define IMX_SC_C_SINGLE_MODE 13 +#define IMX_SC_C_ID 14 +#define IMX_SC_C_PXL_CLK_POLARITY 15 +#define IMX_SC_C_LINESTATE 16 +#define IMX_SC_C_PCIE_G_RST 17 +#define IMX_SC_C_PCIE_BUTTON_RST 18 +#define IMX_SC_C_PCIE_PERST 19 +#define IMX_SC_C_PHY_RESET 20 +#define IMX_SC_C_PXL_LINK_RATE_CORRECTION 21 +#define IMX_SC_C_PANIC 22 +#define IMX_SC_C_PRIORITY_GROUP 23 +#define IMX_SC_C_TXCLK 24 +#define IMX_SC_C_CLKDIV 25 +#define IMX_SC_C_DISABLE_50 26 +#define IMX_SC_C_DISABLE_125 27 +#define IMX_SC_C_SEL_125 28 +#define IMX_SC_C_MODE 29 +#define IMX_SC_C_SYNC_CTRL0 30 +#define IMX_SC_C_KACHUNK_CNT 31 +#define IMX_SC_C_KACHUNK_SEL 32 +#define IMX_SC_C_SYNC_CTRL1 33 +#define IMX_SC_C_DPI_RESET 34 +#define IMX_SC_C_MIPI_RESET 35 +#define IMX_SC_C_DUAL_MODE 36 +#define IMX_SC_C_VOLTAGE 37 +#define IMX_SC_C_PXL_LINK_SEL 38 +#define IMX_SC_C_OFS_SEL 39 +#define IMX_SC_C_OFS_AUDIO 40 +#define IMX_SC_C_OFS_PERIPH 41 +#define IMX_SC_C_OFS_IRQ 42 +#define IMX_SC_C_RST0 43 +#define IMX_SC_C_RST1 44 +#define IMX_SC_C_SEL0 45 +#define IMX_SC_C_LAST 46 + #endif /* __DT_BINDINGS_RSCRC_IMX_H */ diff --git a/include/linux/firmware/imx/sci.h b/include/linux/firmware/imx/sci.h index 17ba4e405129..3fa418a4ca67 100644 --- a/include/linux/firmware/imx/sci.h +++ b/include/linux/firmware/imx/sci.h @@ -11,7 +11,6 @@ #define _SC_SCI_H #include -#include #include #include diff --git a/include/linux/firmware/imx/types.h b/include/linux/firmware/imx/types.h deleted file mode 100644 index 80821100e85f..000000000000 --- a/include/linux/firmware/imx/types.h +++ /dev/null @@ -1,65 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0+ */ -/* - * Copyright (C) 2016 Freescale Semiconductor, Inc. - * Copyright 2017~2018 NXP - * - * Header file containing types used across multiple service APIs. - */ - -#ifndef _SC_TYPES_H -#define _SC_TYPES_H - -/* - * This type is used to indicate a control. 
- */ -enum imx_sc_ctrl { - IMX_SC_C_TEMP = 0, - IMX_SC_C_TEMP_HI = 1, - IMX_SC_C_TEMP_LOW = 2, - IMX_SC_C_PXL_LINK_MST1_ADDR = 3, - IMX_SC_C_PXL_LINK_MST2_ADDR = 4, - IMX_SC_C_PXL_LINK_MST_ENB = 5, - IMX_SC_C_PXL_LINK_MST1_ENB = 6, - IMX_SC_C_PXL_LINK_MST2_ENB = 7, - IMX_SC_C_PXL_LINK_SLV1_ADDR = 8, - IMX_SC_C_PXL_LINK_SLV2_ADDR = 9, - IMX_SC_C_PXL_LINK_MST_VLD = 10, - IMX_SC_C_PXL_LINK_MST1_VLD = 11, - IMX_SC_C_PXL_LINK_MST2_VLD = 12, - IMX_SC_C_SINGLE_MODE = 13, - IMX_SC_C_ID = 14, - IMX_SC_C_PXL_CLK_POLARITY = 15, - IMX_SC_C_LINESTATE = 16, - IMX_SC_C_PCIE_G_RST = 17, - IMX_SC_C_PCIE_BUTTON_RST = 18, - IMX_SC_C_PCIE_PERST = 19, - IMX_SC_C_PHY_RESET = 20, - IMX_SC_C_PXL_LINK_RATE_CORRECTION = 21, - IMX_SC_C_PANIC = 22, - IMX_SC_C_PRIORITY_GROUP = 23, - IMX_SC_C_TXCLK = 24, - IMX_SC_C_CLKDIV = 25, - IMX_SC_C_DISABLE_50 = 26, - IMX_SC_C_DISABLE_125 = 27, - IMX_SC_C_SEL_125 = 28, - IMX_SC_C_MODE = 29, - IMX_SC_C_SYNC_CTRL0 = 30, - IMX_SC_C_KACHUNK_CNT = 31, - IMX_SC_C_KACHUNK_SEL = 32, - IMX_SC_C_SYNC_CTRL1 = 33, - IMX_SC_C_DPI_RESET = 34, - IMX_SC_C_MIPI_RESET = 35, - IMX_SC_C_DUAL_MODE = 36, - IMX_SC_C_VOLTAGE = 37, - IMX_SC_C_PXL_LINK_SEL = 38, - IMX_SC_C_OFS_SEL = 39, - IMX_SC_C_OFS_AUDIO = 40, - IMX_SC_C_OFS_PERIPH = 41, - IMX_SC_C_OFS_IRQ = 42, - IMX_SC_C_RST0 = 43, - IMX_SC_C_RST1 = 44, - IMX_SC_C_SEL0 = 45, - IMX_SC_C_LAST -}; - -#endif /* _SC_TYPES_H */ -- cgit v1.2.3 From 32b1924b210a70dcacdf65abd687c5ef86a67541 Mon Sep 17 00:00:00 2001 From: Konstantin Khlebnikov Date: Thu, 9 Apr 2020 11:29:47 +0300 Subject: ovl: skip overlayfs superblocks at global sync Stacked filesystems like overlayfs has no own writeback, but they have to forward syncfs() requests to backend for keeping data integrity. During global sync() each overlayfs instance calls method ->sync_fs() for backend although it itself is in global list of superblocks too. As a result one syscall sync() could write one superblock several times and send multiple disk barriers. This patch adds flag SB_I_SKIP_SYNC into sb->sb_iflags to avoid that. Reported-by: Dmitry Monakhov Signed-off-by: Konstantin Khlebnikov Reviewed-by: Amir Goldstein Signed-off-by: Miklos Szeredi --- fs/overlayfs/super.c | 5 +++-- fs/sync.c | 3 ++- include/linux/fs.h | 2 ++ 3 files changed, 7 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/fs/overlayfs/super.c b/fs/overlayfs/super.c index 6e3fa96613f5..f57aa348dcd6 100644 --- a/fs/overlayfs/super.c +++ b/fs/overlayfs/super.c @@ -261,8 +261,8 @@ static int ovl_sync_fs(struct super_block *sb, int wait) return 0; /* - * If this is a sync(2) call or an emergency sync, all the super blocks - * will be iterated, including upper_sb, so no need to do anything. + * Not called for sync(2) call or an emergency sync (SB_I_SKIP_SYNC). + * All the super blocks will be iterated, including upper_sb. 
* * If this is a syncfs(2) call, then we do need to call * sync_filesystem() on upper_sb, but enough if we do it when being @@ -1870,6 +1870,7 @@ static int ovl_fill_super(struct super_block *sb, void *data, int silent) sb->s_xattr = ovl_xattr_handlers; sb->s_fs_info = ofs; sb->s_flags |= SB_POSIXACL; + sb->s_iflags |= SB_I_SKIP_SYNC; err = -ENOMEM; root_dentry = ovl_get_root(sb, upperpath.dentry, oe); diff --git a/fs/sync.c b/fs/sync.c index 4d1ff010bc5a..16c2630ee4bf 100644 --- a/fs/sync.c +++ b/fs/sync.c @@ -76,7 +76,8 @@ static void sync_inodes_one_sb(struct super_block *sb, void *arg) static void sync_fs_one_sb(struct super_block *sb, void *arg) { - if (!sb_rdonly(sb) && sb->s_op->sync_fs) + if (!sb_rdonly(sb) && !(sb->s_iflags & SB_I_SKIP_SYNC) && + sb->s_op->sync_fs) sb->s_op->sync_fs(sb, *(int *)arg); } diff --git a/include/linux/fs.h b/include/linux/fs.h index 4f6f59b4f22a..f186a966a36c 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -1409,6 +1409,8 @@ extern int send_sigurg(struct fown_struct *fown); #define SB_I_IMA_UNVERIFIABLE_SIGNATURE 0x00000020 #define SB_I_UNTRUSTED_MOUNTER 0x00000040 +#define SB_I_SKIP_SYNC 0x00000100 /* Skip superblock at global sync */ + /* Possible states of 'frozen' field */ enum { SB_UNFROZEN = 0, /* FS is unfrozen */ -- cgit v1.2.3 From 303cc571d107b3641d6487061b748e70ffe15ce4 Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Tue, 5 May 2020 16:04:31 +0200 Subject: nsproxy: attach to namespaces via pidfds For quite a while we have been thinking about using pidfds to attach to namespaces. This patchset has existed for about a year already but we've wanted to wait to see how the general api would be received and adopted. Now that more and more programs in userspace have started using pidfds for process management it's time to send this one out. This patch makes it possible to use pidfds to attach to the namespaces of another process, i.e. they can be passed as the first argument to the setns() syscall. When only a single namespace type is specified the semantics are equivalent to passing an nsfd. That means setns(nsfd, CLONE_NEWNET) equals setns(pidfd, CLONE_NEWNET). However, when a pidfd is passed, multiple namespace flags can be specified in the second setns() argument and setns() will attach the caller to all the specified namespaces all at once or to none of them. Specifying 0 is not valid together with a pidfd. Here are just two obvious examples: setns(pidfd, CLONE_NEWPID | CLONE_NEWNS | CLONE_NEWNET); setns(pidfd, CLONE_NEWUSER); Allowing to also attach subsets of namespaces supports various use-cases where callers setns to a subset of namespaces to retain privilege, perform an action and then re-attach another subset of namespaces. If the need arises, as Eric suggested, we can extend this patchset to assume even more context than just attaching all namespaces. His suggestion specifically was about assuming the process' root directory when setns(pidfd, 0) or setns(pidfd, SETNS_PIDFD) is specified. For now, just keep it flexible in terms of supporting subsets of namespaces but let's wait until we have users asking for even more context to be assumed. At that point we can add an extension. The obvious example where this is useful is a standard container manager interacting with a running container: pushing and pulling files or directories, injecting mounts, attaching/execing any kind of process, managing network devices all these operations require attaching to all or at least multiple namespaces at the same time. 
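For illustration, a userspace sketch of what this looks like with the change (not part of the patch; error handling is minimal, and pidfd_open() is invoked via syscall(2) since libc wrappers for it were not yet common):

#define _GNU_SOURCE
#include <sched.h>
#include <stdio.h>
#include <stdlib.h>
#include <sys/syscall.h>
#include <unistd.h>

int main(int argc, char *argv[])
{
	pid_t pid;
	int pidfd;

	if (argc < 2)
		return 1;
	pid = atoi(argv[1]);

	/* One stable reference to the target process... */
	pidfd = syscall(SYS_pidfd_open, pid, 0);
	if (pidfd < 0) {
		perror("pidfd_open");
		return 1;
	}

	/*
	 * ...and one setns() that attaches to three of its namespaces
	 * atomically: either all of the switches succeed or none of them
	 * happen. (The pid namespace switch only applies to children
	 * forked after this call.)
	 */
	if (setns(pidfd, CLONE_NEWPID | CLONE_NEWNS | CLONE_NEWNET) < 0) {
		perror("setns");
		return 1;
	}

	execlp("sh", "sh", (char *)NULL);
	perror("execlp");
	return 1;
}
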
Given that nowadays most containers are spawned with all namespaces enabled, we're currently looking at at least 14 syscalls: 7 to open the /proc/<pid>/ns/<ns> nsfds, another 7 to actually perform the namespace switch. With time namespaces we're looking at about 16 syscalls. (We could amortize the first 7 or 8 syscalls for opening the nsfds by stashing them in each container's monitor process, but that would mean we need to send around those file descriptors through unix sockets every time we want to interact with the container, or keep on-disk state. Even in scenarios where a caller wants to join a particular namespace in a particular order, callers still profit from batching other namespaces. That mostly applies to the user namespace, but all container runtimes I found join the user namespace first, no matter if it privileges or deprivileges the container, similar to how unshare behaves.) With pidfds this becomes a single syscall no matter how many namespaces are supposed to be attached to. A decently designed, large-scale container manager usually isn't the parent of any of the containers it spawns, so the containers don't die when it crashes or needs to update or reinitialize. This means that for the manager to interact with containers through pids is inherently racy, especially on systems where the maximum pid number is not significantly bumped. This is even more problematic since we often spawn and manage thousands or tens of thousands of containers. Interacting with a container through a pid thus can become risky quite quickly, especially since we allow an administrator to enable advanced features such as syscall interception where we're performing syscalls in lieu of the container. In all of those cases we use pidfds if they are available, and we pass them around as stable references. Using them to setns() to the target process' namespaces is as reliable as using nsfds. Either the target process is already dead and we get ESRCH, or we manage to attach to its namespaces, but we can't accidentally attach to another process' namespaces. So pidfds lend themselves to be used with this API. The other main advantage is that with this change the pidfd becomes the only relevant token for most container interactions, and it's the only token we need to create and send around. Apart from significantly reducing the number of syscalls from double digits to single digits, which is a decent reason post-spectre/meltdown, this also allows switching to a set of namespaces atomically, i.e. either attaching to all the specified namespaces succeeds or we fail. If we fail we haven't changed a single namespace. There are currently three namespaces that can fail for non-trivial reasons (other than ENOMEM, which really is not very interesting since we then have other problems anyway): user, mount, and pid namespaces. We can fail to attach to a pid namespace if it is not our current active pid namespace or a descendant of it. We can fail to attach to a user namespace because we are multi-threaded, or because our current mount namespace shares filesystem state with other tasks, or because we're trying to setns() to the same user namespace, i.e. the target task has the same user namespace as we do. We can fail to attach to a mount namespace because it shares filesystem state with other tasks or because we fail to look up the new root for the new mount namespace. In most non-pathological scenarios these issues can be somewhat mitigated.
But there are cases where we're half-attached to some namespace and failing to attach to another one. I've talked about some of these problem during the hallway track (something only the pre-COVID-19 generation will remember) of Plumbers in Los Angeles in 2018(?). Even if all these issues could be avoided with super careful userspace coding it would be nicer to have this done in-kernel. Pidfds seem to lend themselves nicely for this. The other neat thing about this is that setns() becomes an actual counterpart to the namespace bits of unshare(). Signed-off-by: Christian Brauner Reviewed-by: Serge Hallyn Cc: Eric W. Biederman Cc: Serge Hallyn Cc: Jann Horn Cc: Michael Kerrisk Cc: Aleksa Sarai Link: https://lore.kernel.org/r/20200505140432.181565-3-christian.brauner@ubuntu.com --- fs/namespace.c | 5 + fs/nsfs.c | 5 + include/linux/mnt_namespace.h | 1 + include/linux/proc_fs.h | 2 + kernel/nsproxy.c | 229 +++++++++++++++++++++++++++++++++++++++--- 5 files changed, 226 insertions(+), 16 deletions(-) (limited to 'include/linux') diff --git a/fs/namespace.c b/fs/namespace.c index 62899fad4a04..be99e80e3c7c 100644 --- a/fs/namespace.c +++ b/fs/namespace.c @@ -1733,6 +1733,11 @@ static struct mnt_namespace *to_mnt_ns(struct ns_common *ns) return container_of(ns, struct mnt_namespace, ns); } +struct ns_common *from_mnt_ns(struct mnt_namespace *mnt) +{ + return &mnt->ns; +} + static bool mnt_ns_loop(struct dentry *dentry) { /* Could bind mounting the mount namespace inode cause a diff --git a/fs/nsfs.c b/fs/nsfs.c index 4f1205725cfe..800c1d0eb0d0 100644 --- a/fs/nsfs.c +++ b/fs/nsfs.c @@ -229,6 +229,11 @@ int ns_get_name(char *buf, size_t size, struct task_struct *task, return res; } +bool proc_ns_file(const struct file *file) +{ + return file->f_op == &ns_file_operations; +} + struct file *proc_ns_fget(int fd) { struct file *file; diff --git a/include/linux/mnt_namespace.h b/include/linux/mnt_namespace.h index 007cfa52efb2..8f882f5881e8 100644 --- a/include/linux/mnt_namespace.h +++ b/include/linux/mnt_namespace.h @@ -11,6 +11,7 @@ struct ns_common; extern struct mnt_namespace *copy_mnt_ns(unsigned long, struct mnt_namespace *, struct user_namespace *, struct fs_struct *); extern void put_mnt_ns(struct mnt_namespace *ns); +extern struct ns_common *from_mnt_ns(struct mnt_namespace *); extern const struct file_operations proc_mounts_operations; extern const struct file_operations proc_mountinfo_operations; diff --git a/include/linux/proc_fs.h b/include/linux/proc_fs.h index 45c05fd9c99d..0cfc44d7ac74 100644 --- a/include/linux/proc_fs.h +++ b/include/linux/proc_fs.h @@ -179,4 +179,6 @@ static inline struct pid_namespace *proc_pid_ns(const struct inode *inode) return inode->i_sb->s_fs_info; } +bool proc_ns_file(const struct file *file); + #endif /* _LINUX_PROC_FS_H */ diff --git a/kernel/nsproxy.c b/kernel/nsproxy.c index b7954fd60475..b03df67621d0 100644 --- a/kernel/nsproxy.c +++ b/kernel/nsproxy.c @@ -20,6 +20,7 @@ #include #include #include +#include #include #include #include @@ -258,17 +259,58 @@ void exit_task_namespaces(struct task_struct *p) switch_task_namespaces(p, NULL); } +static int check_setns_flags(unsigned long flags) +{ + if (!flags || (flags & ~(CLONE_NEWNS | CLONE_NEWUTS | CLONE_NEWIPC | + CLONE_NEWNET | CLONE_NEWUSER | CLONE_NEWPID | + CLONE_NEWCGROUP))) + return -EINVAL; + +#ifndef CONFIG_USER_NS + if (flags & CLONE_NEWUSER) + return -EINVAL; +#endif +#ifndef CONFIG_PID_NS + if (flags & CLONE_NEWPID) + return -EINVAL; +#endif +#ifndef CONFIG_UTS_NS + if (flags & CLONE_NEWUTS) + 
return -EINVAL; +#endif +#ifndef CONFIG_IPC_NS + if (flags & CLONE_NEWIPC) + return -EINVAL; +#endif +#ifndef CONFIG_CGROUPS + if (flags & CLONE_NEWCGROUP) + return -EINVAL; +#endif +#ifndef CONFIG_NET_NS + if (flags & CLONE_NEWNET) + return -EINVAL; +#endif + + return 0; +} + static void put_nsset(struct nsset *nsset) { unsigned flags = nsset->flags; if (flags & CLONE_NEWUSER) put_cred(nsset_cred(nsset)); + /* + * We only created a temporary copy if we attached to more than just + * the mount namespace. + */ + if (nsset->fs && (flags & CLONE_NEWNS) && (flags & ~CLONE_NEWNS)) + free_fs_struct(nsset->fs); if (nsset->nsproxy) free_nsproxy(nsset->nsproxy); } -static int prepare_nsset(int nstype, struct nsset *nsset) +static int prepare_nsset(unsigned flags, struct nsset *nsset) { struct task_struct *me = current; @@ -276,17 +318,23 @@ static int prepare_nsset(int nstype, struct nsset *nsset) if (IS_ERR(nsset->nsproxy)) return PTR_ERR(nsset->nsproxy); - if (nstype == CLONE_NEWUSER) + if (flags & CLONE_NEWUSER) nsset->cred = prepare_creds(); else nsset->cred = current_cred(); if (!nsset->cred) goto out; - if (nstype == CLONE_NEWNS) + /* Only create a temporary copy of fs_struct if we really need to. */ + if (flags == CLONE_NEWNS) { nsset->fs = me->fs; + } else if (flags & CLONE_NEWNS) { + nsset->fs = copy_fs_struct(me->fs); + if (!nsset->fs) + goto out; + } - nsset->flags = nstype; + nsset->flags = flags; return 0; out: @@ -294,6 +342,138 @@ out: return -ENOMEM; } +static inline int validate_ns(struct nsset *nsset, struct ns_common *ns) +{ + return ns->ops->install(nsset, ns); +} + +/* + * This is the inverse operation to unshare(). + * Ordering is equivalent to the standard ordering used everywhere else + * during unshare and process creation. The switch to the new set of + * namespaces occurs at the point of no return after installation of + * all requested namespaces was successful in commit_nsset(). + */ +static int validate_nsset(struct nsset *nsset, struct pid *pid) +{ + int ret = 0; + unsigned flags = nsset->flags; + struct user_namespace *user_ns = NULL; + struct pid_namespace *pid_ns = NULL; + struct nsproxy *nsp; + struct task_struct *tsk; + + /* Take a "snapshot" of the target task's namespaces. */ + rcu_read_lock(); + tsk = pid_task(pid, PIDTYPE_PID); + if (!tsk) { + rcu_read_unlock(); + return -ESRCH; + } + + if (!ptrace_may_access(tsk, PTRACE_MODE_READ_REALCREDS)) { + rcu_read_unlock(); + return -EPERM; + } + + task_lock(tsk); + nsp = tsk->nsproxy; + if (nsp) + get_nsproxy(nsp); + task_unlock(tsk); + if (!nsp) { + rcu_read_unlock(); + return -ESRCH; + } + +#ifdef CONFIG_PID_NS + if (flags & CLONE_NEWPID) { + pid_ns = task_active_pid_ns(tsk); + if (unlikely(!pid_ns)) { + rcu_read_unlock(); + ret = -ESRCH; + goto out; + } + get_pid_ns(pid_ns); + } +#endif + +#ifdef CONFIG_USER_NS + if (flags & CLONE_NEWUSER) + user_ns = get_user_ns(__task_cred(tsk)->user_ns); +#endif + rcu_read_unlock(); + + /* + * Install requested namespaces. The caller will have + * verified earlier that the requested namespaces are + * supported on this kernel. We don't report errors here + * if a namespace is requested that isn't supported. 
+ */ +#ifdef CONFIG_USER_NS + if (flags & CLONE_NEWUSER) { + ret = validate_ns(nsset, &user_ns->ns); + if (ret) + goto out; + } +#endif + + if (flags & CLONE_NEWNS) { + ret = validate_ns(nsset, from_mnt_ns(nsp->mnt_ns)); + if (ret) + goto out; + } + +#ifdef CONFIG_UTS_NS + if (flags & CLONE_NEWUTS) { + ret = validate_ns(nsset, &nsp->uts_ns->ns); + if (ret) + goto out; + } +#endif + +#ifdef CONFIG_IPC_NS + if (flags & CLONE_NEWIPC) { + ret = validate_ns(nsset, &nsp->ipc_ns->ns); + if (ret) + goto out; + } +#endif + +#ifdef CONFIG_PID_NS + if (flags & CLONE_NEWPID) { + ret = validate_ns(nsset, &pid_ns->ns); + if (ret) + goto out; + } +#endif + +#ifdef CONFIG_CGROUPS + if (flags & CLONE_NEWCGROUP) { + ret = validate_ns(nsset, &nsp->cgroup_ns->ns); + if (ret) + goto out; + } +#endif + +#ifdef CONFIG_NET_NS + if (flags & CLONE_NEWNET) { + ret = validate_ns(nsset, &nsp->net_ns->ns); + if (ret) + goto out; + } +#endif + +out: + if (pid_ns) + put_pid_ns(pid_ns); + if (nsp) + put_nsproxy(nsp); + put_user_ns(user_ns); + + return ret; +} + /* * This is the point of no return. There are just a few namespaces * that do some actual work here and it's sufficiently minimal that @@ -316,6 +496,12 @@ static void commit_nsset(struct nsset *nsset) } #endif + /* We only need to commit if we have used a temporary fs_struct. */ + if ((flags & CLONE_NEWNS) && (flags & ~CLONE_NEWNS)) { + set_fs_root(me->fs, &nsset->fs->root); + set_fs_pwd(me->fs, &nsset->fs->pwd); + } + #ifdef CONFIG_IPC_NS if (flags & CLONE_NEWIPC) exit_sem(me); @@ -326,27 +512,38 @@ static void commit_nsset(struct nsset *nsset) nsset->nsproxy = NULL; } -SYSCALL_DEFINE2(setns, int, fd, int, nstype) +SYSCALL_DEFINE2(setns, int, fd, int, flags) { struct file *file; - struct ns_common *ns; + struct ns_common *ns = NULL; struct nsset nsset = {}; - int err; - - file = proc_ns_fget(fd); - if (IS_ERR(file)) - return PTR_ERR(file); + int err = 0; - err = -EINVAL; - ns = get_proc_ns(file_inode(file)); - if (nstype && (ns->ops->type != nstype)) + file = fget(fd); + if (!file) + return -EBADF; + + if (proc_ns_file(file)) { + ns = get_proc_ns(file_inode(file)); + if (flags && (ns->ops->type != flags)) + err = -EINVAL; + flags = ns->ops->type; + } else if (!IS_ERR(pidfd_pid(file))) { + err = check_setns_flags(flags); + } else { + err = -EBADF; + } + if (err) goto out; - err = prepare_nsset(ns->ops->type, &nsset); + err = prepare_nsset(flags, &nsset); if (err) goto out; - err = ns->ops->install(&nsset, ns); + if (proc_ns_file(file)) + err = validate_ns(&nsset, ns); + else + err = validate_nsset(&nsset, file->private_data); if (!err) { commit_nsset(&nsset); perf_event_namespaces(current); -- cgit v1.2.3 From 1597d453289b385237628cd96d57d147632ab105 Mon Sep 17 00:00:00 2001 From: Georgi Djakov Date: Tue, 12 May 2020 15:53:20 +0300 Subject: interconnect: Add of_icc_get_by_index() helper function This is the same as the traditional of_icc_get() function, but the difference is that it takes index as an argument, instead of name. 
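For illustration (a hedged consumer sketch; the DT properties, device, and bandwidth values are made up), the index-based lookup resolves to the same path as the name-based one when the name is the first "interconnect-names" entry:

#include <linux/device.h>
#include <linux/err.h>
#include <linux/interconnect.h>

/*
 * Hypothetical consumer probe, assuming a DT node along the lines of:
 *   interconnects = <&noc MASTER_CPU &noc SLAVE_DDR>;
 *   interconnect-names = "cpu-mem";
 */
static int my_probe(struct device *dev)
{
	struct icc_path *path;

	path = of_icc_get_by_index(dev, 0);	/* same as of_icc_get(dev, "cpu-mem") */
	if (IS_ERR(path))
		return PTR_ERR(path);
	if (!path)	/* API disabled or no "interconnects" property */
		return 0;

	return icc_set_bw(path, 100000, 200000);	/* made-up avg/peak bandwidth */
}
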
Reviewed-by: Matthias Kaehlcke Reviewed-by: Sibi Sankar Link: https://lore.kernel.org/r/20200512125327.1868-4-georgi.djakov@linaro.org Signed-off-by: Georgi Djakov --- drivers/interconnect/core.c | 72 +++++++++++++++++++++++++++++++++----------- include/linux/interconnect.h | 6 ++++ 2 files changed, 60 insertions(+), 18 deletions(-) (limited to 'include/linux') diff --git a/drivers/interconnect/core.c b/drivers/interconnect/core.c index 2c6515e3ecf1..aba5d38ea9d1 100644 --- a/drivers/interconnect/core.c +++ b/drivers/interconnect/core.c @@ -351,9 +351,9 @@ static struct icc_node *of_icc_get_from_provider(struct of_phandle_args *spec) } /** - * of_icc_get() - get a path handle from a DT node based on name + * of_icc_get_by_index() - get a path handle from a DT node based on index * @dev: device pointer for the consumer device - * @name: interconnect path name + * @idx: interconnect path index * * This function will search for a path between two endpoints and return an * icc_path handle on success. Use icc_put() to release constraints when they @@ -365,13 +365,12 @@ static struct icc_node *of_icc_get_from_provider(struct of_phandle_args *spec) * Return: icc_path pointer on success or ERR_PTR() on error. NULL is returned * when the API is disabled or the "interconnects" DT property is missing. */ -struct icc_path *of_icc_get(struct device *dev, const char *name) +struct icc_path *of_icc_get_by_index(struct device *dev, int idx) { - struct icc_path *path = ERR_PTR(-EPROBE_DEFER); + struct icc_path *path; struct icc_node *src_node, *dst_node; - struct device_node *np = NULL; + struct device_node *np; struct of_phandle_args src_args, dst_args; - int idx = 0; int ret; if (!dev || !dev->of_node) @@ -391,12 +390,6 @@ struct icc_path *of_icc_get(struct device *dev, const char *name) * lets support only global ids and extend this in the future if needed * without breaking DT compatibility. */ - if (name) { - idx = of_property_match_string(np, "interconnect-names", name); - if (idx < 0) - return ERR_PTR(idx); - } - ret = of_parse_phandle_with_args(np, "interconnects", "#interconnect-cells", idx * 2, &src_args); @@ -439,12 +432,8 @@ struct icc_path *of_icc_get(struct device *dev, const char *name) return path; } - if (name) - path->name = kstrdup_const(name, GFP_KERNEL); - else - path->name = kasprintf(GFP_KERNEL, "%s-%s", - src_node->name, dst_node->name); - + path->name = kasprintf(GFP_KERNEL, "%s-%s", + src_node->name, dst_node->name); if (!path->name) { kfree(path); return ERR_PTR(-ENOMEM); @@ -452,6 +441,53 @@ struct icc_path *of_icc_get(struct device *dev, const char *name) return path; } +EXPORT_SYMBOL_GPL(of_icc_get_by_index); + +/** + * of_icc_get() - get a path handle from a DT node based on name + * @dev: device pointer for the consumer device + * @name: interconnect path name + * + * This function will search for a path between two endpoints and return an + * icc_path handle on success. Use icc_put() to release constraints when they + * are not needed anymore. + * If the interconnect API is disabled, NULL is returned and the consumer + * drivers will still build. Drivers are free to handle this specifically, + * but they don't have to. + * + * Return: icc_path pointer on success or ERR_PTR() on error. NULL is returned + * when the API is disabled or the "interconnects" DT property is missing. 
+ */ +struct icc_path *of_icc_get(struct device *dev, const char *name) +{ + struct device_node *np; + int idx = 0; + + if (!dev || !dev->of_node) + return ERR_PTR(-ENODEV); + + np = dev->of_node; + + /* + * When the consumer DT node do not have "interconnects" property + * return a NULL path to skip setting constraints. + */ + if (!of_find_property(np, "interconnects", NULL)) + return NULL; + + /* + * We use a combination of phandle and specifier for endpoint. For now + * lets support only global ids and extend this in the future if needed + * without breaking DT compatibility. + */ + if (name) { + idx = of_property_match_string(np, "interconnect-names", name); + if (idx < 0) + return ERR_PTR(idx); + } + + return of_icc_get_by_index(dev, idx); +} EXPORT_SYMBOL_GPL(of_icc_get); /** diff --git a/include/linux/interconnect.h b/include/linux/interconnect.h index d70a914cba11..34e97231a6ab 100644 --- a/include/linux/interconnect.h +++ b/include/linux/interconnect.h @@ -28,6 +28,7 @@ struct device; struct icc_path *icc_get(struct device *dev, const int src_id, const int dst_id); struct icc_path *of_icc_get(struct device *dev, const char *name); +struct icc_path *of_icc_get_by_index(struct device *dev, int idx); void icc_put(struct icc_path *path); int icc_set_bw(struct icc_path *path, u32 avg_bw, u32 peak_bw); void icc_set_tag(struct icc_path *path, u32 tag); @@ -46,6 +47,11 @@ static inline struct icc_path *of_icc_get(struct device *dev, return NULL; } +static inline struct icc_path *of_icc_get_by_index(struct device *dev, int idx) +{ + return NULL; +} + static inline void icc_put(struct icc_path *path) { } -- cgit v1.2.3 From 84c1e51d7df85332705ffebc40ef135205492036 Mon Sep 17 00:00:00 2001 From: "Gustavo A. R. Silva" Date: Thu, 7 May 2020 13:53:18 -0500 Subject: greybus: Replace zero-length array with flexible-array The current codebase makes use of the zero-length array language extension to the C90 standard, but the preferred mechanism to declare variable-length types such as these ones is a flexible array member[1][2], introduced in C99: struct foo { int stuff; struct boo array[]; }; By making use of the mechanism above, we will get a compiler warning in case the flexible array does not occur last in the structure, which will help us prevent some kind of undefined behavior bugs from being inadvertently introduced[3] to the codebase from now on. Also, notice that, dynamic memory allocations won't be affected by this change: "Flexible array members have incomplete type, and so the sizeof operator may not be applied. As a quirk of the original implementation of zero-length arrays, sizeof evaluates to zero."[1] sizeof(flexible-array-member) triggers a warning because flexible array members have incomplete type[1]. There are some instances of code in which the sizeof operator is being incorrectly/erroneously applied to zero-length arrays and the result is zero. Such instances may be hiding some bugs. So, this work (flexible-array member conversions) will also help to get completely rid of those sorts of issues. This issue was found with the help of Coccinelle. [1] https://gcc.gnu.org/onlinedocs/gcc/Zero-Length.html [2] https://github.com/KSPP/linux/issues/21 [3] commit 76497732932f ("cxgb3/l2t: Fix undefined behaviour") Signed-off-by: Gustavo A. R. 
Silva Link: https://lore.kernel.org/r/20200507185318.GA14393@embeddedor Signed-off-by: Greg Kroah-Hartman --- drivers/greybus/arpc.h | 2 +- include/linux/greybus/greybus_protocols.h | 44 +++++++++++++++---------------- 2 files changed, 23 insertions(+), 23 deletions(-) (limited to 'include/linux') diff --git a/drivers/greybus/arpc.h b/drivers/greybus/arpc.h index c8b83c5cfa79..b9ea81b55b29 100644 --- a/drivers/greybus/arpc.h +++ b/drivers/greybus/arpc.h @@ -21,7 +21,7 @@ struct arpc_request_message { __le16 id; /* RPC unique id */ __le16 size; /* Size in bytes of header + payload */ __u8 type; /* RPC type */ - __u8 data[0]; /* ARPC data */ + __u8 data[]; /* ARPC data */ } __packed; struct arpc_response_message { diff --git a/include/linux/greybus/greybus_protocols.h b/include/linux/greybus/greybus_protocols.h index dfbc6c39a74b..aeb8f9243545 100644 --- a/include/linux/greybus/greybus_protocols.h +++ b/include/linux/greybus/greybus_protocols.h @@ -345,7 +345,7 @@ struct gb_cap_get_ims_certificate_request { struct gb_cap_get_ims_certificate_response { __u8 result_code; - __u8 certificate[0]; + __u8 certificate[]; } __packed; /* CAP authenticate request/response */ @@ -358,7 +358,7 @@ struct gb_cap_authenticate_request { struct gb_cap_authenticate_response { __u8 result_code; __u8 response[64]; - __u8 signature[0]; + __u8 signature[]; } __packed; @@ -642,7 +642,7 @@ struct gb_hid_get_report_request { struct gb_hid_set_report_request { __u8 report_type; __u8 report_id; - __u8 report[0]; + __u8 report[]; } __packed; /* HID input report request, via interrupt pipe */ @@ -680,7 +680,7 @@ struct gb_i2c_transfer_op { struct gb_i2c_transfer_request { __le16 op_count; - struct gb_i2c_transfer_op ops[0]; /* op_count of these */ + struct gb_i2c_transfer_op ops[]; /* op_count of these */ } __packed; struct gb_i2c_transfer_response { __u8 data[0]; /* inbound data */ @@ -908,7 +908,7 @@ struct gb_spi_transfer_request { __u8 chip_select; /* of the spi device */ __u8 mode; /* of the spi device */ __le16 count; - struct gb_spi_transfer transfers[0]; /* count of these */ + struct gb_spi_transfer transfers[]; /* count of these */ } __packed; struct gb_spi_transfer_response { @@ -1188,7 +1188,7 @@ struct gb_svc_pwrmon_rail_count_get_response { struct gb_svc_pwrmon_rail_names_get_response { __u8 status; - __u8 name[0][GB_SVC_PWRMON_RAIL_NAME_BUFSIZE]; + __u8 name[][GB_SVC_PWRMON_RAIL_NAME_BUFSIZE]; } __packed; #define GB_SVC_PWRMON_TYPE_CURR 0x01 @@ -1281,7 +1281,7 @@ struct gb_svc_intf_oops_request { struct gb_raw_send_request { __le32 len; - __u8 data[0]; + __u8 data[]; } __packed; @@ -1300,7 +1300,7 @@ struct gb_raw_send_request { /* Represents data from AP -> Module */ struct gb_uart_send_data_request { __le16 size; - __u8 data[0]; + __u8 data[]; } __packed; /* recv-data-request flags */ @@ -1313,7 +1313,7 @@ struct gb_uart_send_data_request { struct gb_uart_recv_data_request { __le16 size; __u8 flags; - __u8 data[0]; + __u8 data[]; } __packed; struct gb_uart_receive_credits_request { @@ -1382,14 +1382,14 @@ struct gb_loopback_transfer_request { __le32 len; __le32 reserved0; __le32 reserved1; - __u8 data[0]; + __u8 data[]; } __packed; struct gb_loopback_transfer_response { __le32 len; __le32 reserved0; __le32 reserved1; - __u8 data[0]; + __u8 data[]; } __packed; /* SDIO */ @@ -1530,13 +1530,13 @@ struct gb_sdio_transfer_request { __le16 data_blocks; __le16 data_blksz; - __u8 data[0]; + __u8 data[]; } __packed; struct gb_sdio_transfer_response { __le16 data_blocks; __le16 data_blksz; - __u8 data[0]; + __u8 
data[]; } __packed; /* event request: generated by module and is defined as unidirectional */ @@ -1572,7 +1572,7 @@ struct gb_camera_configure_streams_request { __u8 flags; #define GB_CAMERA_CONFIGURE_STREAMS_TEST_ONLY 0x01 __le16 padding; - struct gb_camera_stream_config_request config[0]; + struct gb_camera_stream_config_request config[]; } __packed; /* Greybus Camera Configure Streams response payload */ @@ -1593,7 +1593,7 @@ struct gb_camera_configure_streams_response { __u8 flags; __u8 padding[2]; __le32 data_rate; - struct gb_camera_stream_config_response config[0]; + struct gb_camera_stream_config_response config[]; }; /* Greybus Camera Capture request payload - response has no payload */ @@ -1602,7 +1602,7 @@ struct gb_camera_capture_request { __u8 streams; __u8 padding; __le16 num_frames; - __u8 settings[0]; + __u8 settings[]; } __packed; /* Greybus Camera Flush response payload - request has no payload */ @@ -1616,7 +1616,7 @@ struct gb_camera_metadata_request { __le16 frame_number; __u8 stream; __u8 padding; - __u8 metadata[0]; + __u8 metadata[]; } __packed; /* Lights */ @@ -1993,7 +1993,7 @@ struct gb_audio_integer64 { struct gb_audio_enumerated { __le32 items; __le16 names_length; - __u8 names[0]; + __u8 names[]; } __packed; struct gb_audio_ctl_elem_info { /* See snd_ctl_elem_info in Linux source */ @@ -2033,7 +2033,7 @@ struct gb_audio_widget { __u8 type; /* GB_AUDIO_WIDGET_TYPE_* */ __u8 state; /* GB_AUDIO_WIDGET_STATE_* */ __u8 ncontrols; - struct gb_audio_control ctl[0]; /* 'ncontrols' entries */ + struct gb_audio_control ctl[]; /* 'ncontrols' entries */ } __packed; struct gb_audio_route { @@ -2059,7 +2059,7 @@ struct gb_audio_topology { * struct gb_audio_widget widgets[num_widgets]; * struct gb_audio_route routes[num_routes]; */ - __u8 data[0]; + __u8 data[]; } __packed; struct gb_audio_get_topology_size_response { @@ -2157,7 +2157,7 @@ struct gb_audio_streaming_event_request { struct gb_audio_send_data_request { __le64 timestamp; - __u8 data[0]; + __u8 data[]; } __packed; @@ -2171,7 +2171,7 @@ struct gb_audio_send_data_request { struct gb_log_send_log_request { __le16 len; - __u8 msg[0]; + __u8 msg[]; } __packed; #endif /* __GREYBUS_PROTOCOLS_H */ -- cgit v1.2.3 From 8c49c9ee4a91c158d28c583862718b348881cb16 Mon Sep 17 00:00:00 2001 From: Heikki Krogerus Date: Thu, 7 May 2020 18:08:57 +0300 Subject: usb: typec: Add typec_find_orientation() Function that converts orientation string into orientation value. 
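For example, a hypothetical caller parsing a firmware property could do:

	int ret;

	ret = typec_find_orientation("reverse");
	if (ret < 0)
		return ret;	/* no match: negative error code */
	/* ret is now TYPEC_ORIENTATION_REVERSE */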
Signed-off-by: Heikki Krogerus Link: https://lore.kernel.org/r/20200507150900.12102-2-heikki.krogerus@linux.intel.com Signed-off-by: Greg Kroah-Hartman --- drivers/usb/typec/class.c | 36 ++++++++++++++++++++++++------------ include/linux/usb/typec.h | 1 + 2 files changed, 25 insertions(+), 12 deletions(-) (limited to 'include/linux') diff --git a/drivers/usb/typec/class.c b/drivers/usb/typec/class.c index 8d894bdff77d..c9234748537a 100644 --- a/drivers/usb/typec/class.c +++ b/drivers/usb/typec/class.c @@ -917,6 +917,12 @@ EXPORT_SYMBOL_GPL(typec_unregister_cable); /* ------------------------------------------------------------------------- */ /* USB Type-C ports */ +static const char * const typec_orientations[] = { + [TYPEC_ORIENTATION_NONE] = "unknown", + [TYPEC_ORIENTATION_NORMAL] = "normal", + [TYPEC_ORIENTATION_REVERSE] = "reverse", +}; + static const char * const typec_roles[] = { [TYPEC_SINK] = "sink", [TYPEC_SOURCE] = "source", @@ -1248,18 +1254,9 @@ static ssize_t orientation_show(struct device *dev, struct device_attribute *attr, char *buf) { - struct typec_port *p = to_typec_port(dev); - enum typec_orientation orientation = typec_get_orientation(p); - - switch (orientation) { - case TYPEC_ORIENTATION_NORMAL: - return sprintf(buf, "%s\n", "normal"); - case TYPEC_ORIENTATION_REVERSE: - return sprintf(buf, "%s\n", "reverse"); - case TYPEC_ORIENTATION_NONE: - default: - return sprintf(buf, "%s\n", "unknown"); - } + struct typec_port *port = to_typec_port(dev); + + return sprintf(buf, "%s\n", typec_orientations[port->orientation]); } static DEVICE_ATTR_RO(orientation); @@ -1451,6 +1448,21 @@ void typec_set_pwr_opmode(struct typec_port *port, } EXPORT_SYMBOL_GPL(typec_set_pwr_opmode); +/** + * typec_find_orientation - Convert orientation string to enum typec_orientation + * @name: Orientation string + * + * This routine is used to find the typec_orientation by its string name @name. + * + * Returns the orientation value on success, otherwise negative error code. + */ +int typec_find_orientation(const char *name) +{ + return match_string(typec_orientations, ARRAY_SIZE(typec_orientations), + name); +} +EXPORT_SYMBOL_GPL(typec_find_orientation); + /** * typec_find_port_power_role - Get the typec port power capability * @name: port power capability string diff --git a/include/linux/usb/typec.h b/include/linux/usb/typec.h index b00a2642a9cd..5daa1c49761c 100644 --- a/include/linux/usb/typec.h +++ b/include/linux/usb/typec.h @@ -254,6 +254,7 @@ int typec_set_mode(struct typec_port *port, int mode); void *typec_get_drvdata(struct typec_port *port); +int typec_find_orientation(const char *name); int typec_find_port_power_role(const char *name); int typec_find_power_role(const char *name); int typec_find_port_data_role(const char *name); -- cgit v1.2.3 From d9d200bcebc1f6e56f0178cbb8db9953e8cc9a11 Mon Sep 17 00:00:00 2001 From: Marek Szyprowski Date: Wed, 13 May 2020 15:32:08 +0200 Subject: dma-mapping: add generic helpers for mapping sgtable objects struct sg_table is a common structure used for describing a memory buffer. It consists of a scatterlist with memory pages and DMA addresses (sgl entry), as well as the number of scatterlist entries: CPU pages (orig_nents entry) and DMA mapped pages (nents entry). It turned out that it was a common mistake to misuse nents and orig_nents entries, calling DMA-mapping functions with a wrong number of entries or ignoring the number of mapped entries returned by the dma_map_sg function. 
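A typical instance of the mistake looks like this (illustrative sketch, not code from the tree):

	/* Buggy: dma_map_sg() expects the number of CPU entries
	 * (orig_nents), and its return value (the number of DMA mapped
	 * entries) must be stored in nents, not dropped.
	 */
	dma_map_sg(dev, sgt->sgl, sgt->nents, DMA_TO_DEVICE);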
To avoid such issues, let's introduce common wrappers operating directly on the struct sg_table objects, which take care of the proper use of the nents and orig_nents entries. Signed-off-by: Marek Szyprowski Reviewed-by: Robin Murphy Signed-off-by: Christoph Hellwig --- include/linux/dma-mapping.h | 80 +++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 80 insertions(+) (limited to 'include/linux') diff --git a/include/linux/dma-mapping.h b/include/linux/dma-mapping.h index 330ad58fbf4d..936e30b86cd3 100644 --- a/include/linux/dma-mapping.h +++ b/include/linux/dma-mapping.h @@ -609,6 +609,86 @@ static inline void dma_sync_single_range_for_device(struct device *dev, return dma_sync_single_for_device(dev, addr + offset, size, dir); } +/** + * dma_map_sgtable - Map the given buffer for DMA + * @dev: The device for which to perform the DMA operation + * @sgt: The sg_table object describing the buffer + * @dir: DMA direction + * @attrs: Optional DMA attributes for the map operation + * + * Maps a buffer described by a scatterlist stored in the given sg_table + * object for the @dir DMA operation by the @dev device. After success the + * ownership for the buffer is transferred to the DMA domain. One has to + * call dma_sync_sgtable_for_cpu() or dma_unmap_sgtable() to move the + * ownership of the buffer back to the CPU domain before touching the + * buffer by the CPU. + * + * Returns 0 on success or -EINVAL on error during mapping the buffer. + */ +static inline int dma_map_sgtable(struct device *dev, struct sg_table *sgt, + enum dma_data_direction dir, unsigned long attrs) +{ + int nents; + + nents = dma_map_sg_attrs(dev, sgt->sgl, sgt->orig_nents, dir, attrs); + if (nents <= 0) + return -EINVAL; + sgt->nents = nents; + return 0; +} + +/** + * dma_unmap_sgtable - Unmap the given buffer for DMA + * @dev: The device for which to perform the DMA operation + * @sgt: The sg_table object describing the buffer + * @dir: DMA direction + * @attrs: Optional DMA attributes for the unmap operation + * + * Unmaps a buffer described by a scatterlist stored in the given sg_table + * object for the @dir DMA operation by the @dev device. After this function + * the ownership of the buffer is transferred back to the CPU domain. + */ +static inline void dma_unmap_sgtable(struct device *dev, struct sg_table *sgt, + enum dma_data_direction dir, unsigned long attrs) +{ + dma_unmap_sg_attrs(dev, sgt->sgl, sgt->orig_nents, dir, attrs); +} + +/** + * dma_sync_sgtable_for_cpu - Synchronize the given buffer for CPU access + * @dev: The device for which to perform the DMA operation + * @sgt: The sg_table object describing the buffer + * @dir: DMA direction + * + * Performs the needed cache synchronization and moves the ownership of the + * buffer back to the CPU domain, so it is safe to perform any access to it + * by the CPU. Before doing any further DMA operations, one has to transfer + * the ownership of the buffer back to the DMA domain by calling the + * dma_sync_sgtable_for_device().
+ */ +static inline void dma_sync_sgtable_for_cpu(struct device *dev, + struct sg_table *sgt, enum dma_data_direction dir) +{ + dma_sync_sg_for_cpu(dev, sgt->sgl, sgt->orig_nents, dir); +} + +/** + * dma_sync_sgtable_for_device - Synchronize the given buffer for DMA + * @dev: The device for which to perform the DMA operation + * @sgt: The sg_table object describing the buffer + * @dir: DMA direction + * + * Performs the needed cache synchronization and moves the ownership of the + * buffer back to the DMA domain, so it is safe to perform the DMA operation. + * Once finished, one has to call dma_sync_sgtable_for_cpu() or + * dma_unmap_sgtable(). + */ +static inline void dma_sync_sgtable_for_device(struct device *dev, + struct sg_table *sgt, enum dma_data_direction dir) +{ + dma_sync_sg_for_device(dev, sgt->sgl, sgt->orig_nents, dir); +} + #define dma_map_single(d, a, s, r) dma_map_single_attrs(d, a, s, r, 0) #define dma_unmap_single(d, a, s, r) dma_unmap_single_attrs(d, a, s, r, 0) #define dma_map_sg(d, s, n, r) dma_map_sg_attrs(d, s, n, r, 0) -- cgit v1.2.3 From 709d6d73c756107fb8a292a9f957d630097425fa Mon Sep 17 00:00:00 2001 From: Marek Szyprowski Date: Wed, 13 May 2020 15:32:09 +0200 Subject: scatterlist: add generic wrappers for iterating over sgtable objects struct sg_table is a common structure used for describing a memory buffer. It consists of a scatterlist with memory pages and DMA addresses (sgl entry), as well as the number of scatterlist entries: CPU pages (orig_nents entry) and DMA mapped pages (nents entry). It turned out that it was a common mistake to misuse nents and orig_nents entries, calling the scatterlist iterating functions with a wrong number of entries. To avoid such issues, let's introduce common wrappers operating directly on the struct sg_table objects, which take care of the proper use of the nents and orig_nents entries. While touching this, let's clarify some ambiguities in the comments for the existing for_each helpers. Signed-off-by: Marek Szyprowski Reviewed-by: Robin Murphy Signed-off-by: Christoph Hellwig --- include/linux/scatterlist.h | 50 ++++++++++++++++++++++++++++++++++++++++++--- 1 file changed, 47 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/scatterlist.h b/include/linux/scatterlist.h index 6eec50fb36c8..4f922afb607a 100644 --- a/include/linux/scatterlist.h +++ b/include/linux/scatterlist.h @@ -151,6 +151,20 @@ static inline void sg_set_buf(struct scatterlist *sg, const void *buf, #define for_each_sg(sglist, sg, nr, __i) \ for (__i = 0, sg = (sglist); __i < (nr); __i++, sg = sg_next(sg)) +/* + * Loop over each sg element in the given sg_table object. + */ +#define for_each_sgtable_sg(sgt, sg, i) \ + for_each_sg(sgt->sgl, sg, sgt->orig_nents, i) + +/* + * Loop over each sg element in the given *DMA mapped* sg_table object. + * Please use sg_dma_address(sg) and sg_dma_len(sg) to extract DMA addresses + * of the each element. + */ +#define for_each_sgtable_dma_sg(sgt, sg, i) \ + for_each_sg(sgt->sgl, sg, sgt->nents, i) + /** * sg_chain - Chain two sglists together * @prv: First scatterlist @@ -401,9 +415,10 @@ sg_page_iter_dma_address(struct sg_dma_page_iter *dma_iter) * @sglist: sglist to iterate over * @piter: page iterator to hold current page, sg, sg_pgoffset * @nents: maximum number of sg entries to iterate over - * @pgoffset: starting page offset + * @pgoffset: starting page offset (in pages) * * Callers may use sg_page_iter_page() to get each page pointer.
+ * In each loop it operates on PAGE_SIZE unit. */ #define for_each_sg_page(sglist, piter, nents, pgoffset) \ for (__sg_page_iter_start((piter), (sglist), (nents), (pgoffset)); \ @@ -412,18 +427,47 @@ sg_page_iter_dma_address(struct sg_dma_page_iter *dma_iter) /** * for_each_sg_dma_page - iterate over the pages of the given sg list * @sglist: sglist to iterate over - * @dma_iter: page iterator to hold current page + * @dma_iter: DMA page iterator to hold current page * @dma_nents: maximum number of sg entries to iterate over, this is the value * returned from dma_map_sg - * @pgoffset: starting page offset + * @pgoffset: starting page offset (in pages) * * Callers may use sg_page_iter_dma_address() to get each page's DMA address. + * In each loop it operates on PAGE_SIZE unit. */ #define for_each_sg_dma_page(sglist, dma_iter, dma_nents, pgoffset) \ for (__sg_page_iter_start(&(dma_iter)->base, sglist, dma_nents, \ pgoffset); \ __sg_page_iter_dma_next(dma_iter);) +/** + * for_each_sgtable_page - iterate over all pages in the sg_table object + * @sgt: sg_table object to iterate over + * @piter: page iterator to hold current page + * @pgoffset: starting page offset (in pages) + * + * Iterates over the all memory pages in the buffer described by + * a scatterlist stored in the given sg_table object. + * See also for_each_sg_page(). In each loop it operates on PAGE_SIZE unit. + */ +#define for_each_sgtable_page(sgt, piter, pgoffset) \ + for_each_sg_page(sgt->sgl, piter, sgt->orig_nents, pgoffset) + +/** + * for_each_sgtable_dma_page - iterate over the DMA mapped sg_table object + * @sgt: sg_table object to iterate over + * @dma_iter: DMA page iterator to hold current page + * @pgoffset: starting page offset (in pages) + * + * Iterates over the all DMA mapped pages in the buffer described by + * a scatterlist stored in the given sg_table object. + * See also for_each_sg_dma_page(). In each loop it operates on PAGE_SIZE + * unit. + */ +#define for_each_sgtable_dma_page(sgt, dma_iter, pgoffset) \ + for_each_sg_dma_page(sgt->sgl, dma_iter, sgt->nents, pgoffset) + + /* * Mapping sg iterator * -- cgit v1.2.3 From 48530d9fab0d3bf08827f9167be54acf66d4d457 Mon Sep 17 00:00:00 2001 From: Marek Szyprowski Date: Wed, 13 May 2020 15:32:10 +0200 Subject: iommu: add generic helper for mapping sgtable objects struct sg_table is a common structure used for describing a memory buffer. It consists of a scatterlist with memory pages and DMA addresses (sgl entry), as well as the number of scatterlist entries: CPU pages (orig_nents entry) and DMA mapped pages (nents entry). It turned out that it was a common mistake to misuse nents and orig_nents entries, calling mapping functions with a wrong number of entries. To avoid such issues, let's introduce a common wrapper operating directly on the struct sg_table objects, which takes care of the proper use of the nents and orig_nents entries.
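A sketch of the intended use, assuming an already attached domain, a populated sg_table and an illustrative iova:

	size_t mapped;

	mapped = iommu_map_sgtable(domain, iova, sgt,
				   IOMMU_READ | IOMMU_WRITE);
	if (!mapped)
		return -EINVAL;	/* nothing was mapped */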
Signed-off-by: Marek Szyprowski Acked-by: Joerg Roedel Reviewed-by: Robin Murphy Signed-off-by: Christoph Hellwig --- include/linux/iommu.h | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) (limited to 'include/linux') diff --git a/include/linux/iommu.h b/include/linux/iommu.h index 7ef8b0bda695..b1bfbe6dff42 100644 --- a/include/linux/iommu.h +++ b/include/linux/iommu.h @@ -466,6 +466,22 @@ extern phys_addr_t iommu_iova_to_phys(struct iommu_domain *domain, dma_addr_t io extern void iommu_set_fault_handler(struct iommu_domain *domain, iommu_fault_handler_t handler, void *token); +/** + * iommu_map_sgtable - Map the given buffer to the IOMMU domain + * @domain: The IOMMU domain to perform the mapping + * @iova: The start address to map the buffer + * @sgt: The sg_table object describing the buffer + * @prot: IOMMU protection bits + * + * Creates a mapping at @iova for the buffer described by a scatterlist + * stored in the given sg_table object in the provided IOMMU domain. + */ +static inline size_t iommu_map_sgtable(struct iommu_domain *domain, + unsigned long iova, struct sg_table *sgt, int prot) +{ + return iommu_map_sg(domain, iova, sgt->sgl, sgt->orig_nents, prot); +} + extern void iommu_get_resv_regions(struct device *dev, struct list_head *list); extern void iommu_put_resv_regions(struct device *dev, struct list_head *list); extern void generic_iommu_put_resv_regions(struct device *dev, -- cgit v1.2.3 From dae2f8ed7992e88c8d62c54e8295ffc8475b4a80 Mon Sep 17 00:00:00 2001 From: Ira Weiny Date: Thu, 30 Apr 2020 07:41:37 -0700 Subject: fs: Lift XFS_IDONTCACHE to the VFS layer DAX effective mode (S_DAX) changes require inode eviction. XFS has an advisory flag (XFS_IDONTCACHE) to prevent caching of the inode if no other additional references are taken. We lift this flag to the VFS layer and change the behavior slightly by allowing the flag to remain even if multiple references are taken. This will expedite the eviction of inodes to change S_DAX. Cc: Al Viro Reviewed-by: Dave Chinner Reviewed-by: Jan Kara Signed-off-by: Ira Weiny Reviewed-by: Darrick J. Wong Signed-off-by: Darrick J.
Wong --- fs/xfs/xfs_icache.c | 4 ++-- fs/xfs/xfs_inode.h | 3 +-- fs/xfs/xfs_super.c | 2 +- include/linux/fs.h | 6 +++++- 4 files changed, 9 insertions(+), 6 deletions(-) (limited to 'include/linux') diff --git a/fs/xfs/xfs_icache.c b/fs/xfs/xfs_icache.c index 8bf1d15be3f6..fa8554b3308d 100644 --- a/fs/xfs/xfs_icache.c +++ b/fs/xfs/xfs_icache.c @@ -479,7 +479,7 @@ xfs_iget_cache_hit( xfs_ilock(ip, lock_flags); if (!(flags & XFS_IGET_INCORE)) - xfs_iflags_clear(ip, XFS_ISTALE | XFS_IDONTCACHE); + xfs_iflags_clear(ip, XFS_ISTALE); XFS_STATS_INC(mp, xs_ig_found); return 0; @@ -561,7 +561,7 @@ xfs_iget_cache_miss( */ iflags = XFS_INEW; if (flags & XFS_IGET_DONTCACHE) - iflags |= XFS_IDONTCACHE; + VFS_I(ip)->i_state |= I_DONTCACHE; ip->i_udquot = NULL; ip->i_gdquot = NULL; ip->i_pdquot = NULL; diff --git a/fs/xfs/xfs_inode.h b/fs/xfs/xfs_inode.h index c6a63f6764a6..95209e3256a2 100644 --- a/fs/xfs/xfs_inode.h +++ b/fs/xfs/xfs_inode.h @@ -218,8 +218,7 @@ static inline bool xfs_inode_has_cow_data(struct xfs_inode *ip) #define XFS_IFLOCK (1 << __XFS_IFLOCK_BIT) #define __XFS_IPINNED_BIT 8 /* wakeup key for zero pin count */ #define XFS_IPINNED (1 << __XFS_IPINNED_BIT) -#define XFS_IDONTCACHE (1 << 9) /* don't cache the inode long term */ -#define XFS_IEOFBLOCKS (1 << 10)/* has the preallocblocks tag set */ +#define XFS_IEOFBLOCKS (1 << 9) /* has the preallocblocks tag set */ /* * If this unlinked inode is in the middle of recovery, don't let drop_inode * truncate and free the inode. This can happen if we iget the inode during diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c index 424bb9a2d532..a7f8e387ebfe 100644 --- a/fs/xfs/xfs_super.c +++ b/fs/xfs/xfs_super.c @@ -702,7 +702,7 @@ xfs_fs_drop_inode( return 0; } - return generic_drop_inode(inode) || (ip->i_flags & XFS_IDONTCACHE); + return generic_drop_inode(inode); } static void diff --git a/include/linux/fs.h b/include/linux/fs.h index a87cc5845a02..44bd45af760f 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -2156,6 +2156,8 @@ static inline void kiocb_clone(struct kiocb *kiocb, struct kiocb *kiocb_src, * * I_CREATING New object's inode in the middle of setting up. * + * I_DONTCACHE Evict inode as soon as it is not used anymore. + * * Q: What is the difference between I_WILL_FREE and I_FREEING? */ #define I_DIRTY_SYNC (1 << 0) @@ -2178,6 +2180,7 @@ static inline void kiocb_clone(struct kiocb *kiocb, struct kiocb *kiocb_src, #define I_WB_SWITCH (1 << 13) #define I_OVL_INUSE (1 << 14) #define I_CREATING (1 << 15) +#define I_DONTCACHE (1 << 16) #define I_DIRTY_INODE (I_DIRTY_SYNC | I_DIRTY_DATASYNC) #define I_DIRTY (I_DIRTY_INODE | I_DIRTY_PAGES) @@ -3049,7 +3052,8 @@ extern int inode_needs_sync(struct inode *inode); extern int generic_delete_inode(struct inode *inode); static inline int generic_drop_inode(struct inode *inode) { - return !inode->i_nlink || inode_unhashed(inode); + return !inode->i_nlink || inode_unhashed(inode) || + (inode->i_state & I_DONTCACHE); } extern struct inode *ilookup5_nowait(struct super_block *sb, -- cgit v1.2.3 From 2c567af418e3f9380c2051aada58b4e5a4b5c2ad Mon Sep 17 00:00:00 2001 From: Ira Weiny Date: Thu, 30 Apr 2020 07:41:37 -0700 Subject: fs: Introduce DCACHE_DONTCACHE DCACHE_DONTCACHE indicates a dentry should not be cached on final dput(). Also add a helper function to mark DCACHE_DONTCACHE on all dentries pointing to a specific inode when that inode is being set I_DONTCACHE. This facilitates dropping dentry references to inodes sooner which require eviction to swap S_DAX mode. 
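As a sketch of the intended call site, the xfs conversion in this patch replaces the direct i_state update with:

	if (flags & XFS_IGET_DONTCACHE)
		d_mark_dontcache(VFS_I(ip));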
Cc: Al Viro Signed-off-by: Ira Weiny Reviewed-by: Jan Kara Reviewed-by: Darrick J. Wong Signed-off-by: Darrick J. Wong --- fs/dcache.c | 19 +++++++++++++++++++ fs/xfs/xfs_icache.c | 2 +- include/linux/dcache.h | 2 ++ include/linux/fs.h | 1 + 4 files changed, 23 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/fs/dcache.c b/fs/dcache.c index b280e07e162b..0d07fb335b78 100644 --- a/fs/dcache.c +++ b/fs/dcache.c @@ -647,6 +647,10 @@ static inline bool retain_dentry(struct dentry *dentry) if (dentry->d_op->d_delete(dentry)) return false; } + + if (unlikely(dentry->d_flags & DCACHE_DONTCACHE)) + return false; + /* retain; LRU fodder */ dentry->d_lockref.count--; if (unlikely(!(dentry->d_flags & DCACHE_LRU_LIST))) @@ -656,6 +660,21 @@ static inline bool retain_dentry(struct dentry *dentry) return true; } +void d_mark_dontcache(struct inode *inode) +{ + struct dentry *de; + + spin_lock(&inode->i_lock); + hlist_for_each_entry(de, &inode->i_dentry, d_u.d_alias) { + spin_lock(&de->d_lock); + de->d_flags |= DCACHE_DONTCACHE; + spin_unlock(&de->d_lock); + } + inode->i_state |= I_DONTCACHE; + spin_unlock(&inode->i_lock); +} +EXPORT_SYMBOL(d_mark_dontcache); + /* * Finish off a dentry we've decided to kill. * dentry->d_lock must be held, returns with it unlocked. diff --git a/fs/xfs/xfs_icache.c b/fs/xfs/xfs_icache.c index fa8554b3308d..d7deab608e2c 100644 --- a/fs/xfs/xfs_icache.c +++ b/fs/xfs/xfs_icache.c @@ -561,7 +561,7 @@ xfs_iget_cache_miss( */ iflags = XFS_INEW; if (flags & XFS_IGET_DONTCACHE) - VFS_I(ip)->i_state |= I_DONTCACHE; + d_mark_dontcache(VFS_I(ip)); ip->i_udquot = NULL; ip->i_gdquot = NULL; ip->i_pdquot = NULL; diff --git a/include/linux/dcache.h b/include/linux/dcache.h index c1488cc84fd9..a81f0c3cf352 100644 --- a/include/linux/dcache.h +++ b/include/linux/dcache.h @@ -177,6 +177,8 @@ struct dentry_operations { #define DCACHE_REFERENCED 0x00000040 /* Recently used, don't discard. */ +#define DCACHE_DONTCACHE 0x00000080 /* Purge from memory on final dput() */ + #define DCACHE_CANT_MOUNT 0x00000100 #define DCACHE_GENOCIDE 0x00000200 #define DCACHE_SHRINK_LIST 0x00000400 diff --git a/include/linux/fs.h b/include/linux/fs.h index 44bd45af760f..7c3e8c0306e0 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -3055,6 +3055,7 @@ static inline int generic_drop_inode(struct inode *inode) return !inode->i_nlink || inode_unhashed(inode) || (inode->i_state & I_DONTCACHE); } +extern void d_mark_dontcache(struct inode *inode); extern struct inode *ilookup5_nowait(struct super_block *sb, unsigned long hashval, int (*test)(struct inode *, void *), -- cgit v1.2.3 From 9254f8ed15b6dcc9b04b9ad32863a7518cc5a5b1 Mon Sep 17 00:00:00 2001 From: Maor Gottlieb Date: Mon, 4 May 2020 08:30:10 +0300 Subject: net/mlx5: Add support in forward to namespace Currently, fs_core supports rules that forward traffic to continue matching in the next priority. Now we add support for forwarding traffic to continue matching in the next namespace.
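A consumer requests this through the flow action bits, e.g. (sketch; table, spec and error handling omitted):

	struct mlx5_flow_act flow_act = {};
	struct mlx5_flow_handle *handle;

	flow_act.action = MLX5_FLOW_CONTEXT_ACTION_FWD_NEXT_NS;
	handle = mlx5_add_flow_rules(ft, spec, &flow_act, NULL, 0);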
Signed-off-by: Maor Gottlieb Reviewed-by: Mark Bloch Reviewed-by: Mark Zhang Signed-off-by: Leon Romanovsky --- drivers/net/ethernet/mellanox/mlx5/core/fs_core.c | 57 +++++++++++++++++++---- drivers/net/ethernet/mellanox/mlx5/core/fs_core.h | 2 + include/linux/mlx5/fs.h | 1 + 3 files changed, 51 insertions(+), 9 deletions(-) (limited to 'include/linux') diff --git a/drivers/net/ethernet/mellanox/mlx5/core/fs_core.c b/drivers/net/ethernet/mellanox/mlx5/core/fs_core.c index 705f433e2590..41aa1fa0c69e 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/fs_core.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/fs_core.c @@ -384,6 +384,12 @@ static struct fs_prio *find_prio(struct mlx5_flow_namespace *ns, return NULL; } +static bool is_fwd_next_action(u32 action) +{ + return action & (MLX5_FLOW_CONTEXT_ACTION_FWD_NEXT_PRIO | + MLX5_FLOW_CONTEXT_ACTION_FWD_NEXT_NS); +} + static bool check_valid_spec(const struct mlx5_flow_spec *spec) { int i; @@ -502,7 +508,7 @@ static void del_sw_hw_rule(struct fs_node *node) fs_get_obj(rule, node); fs_get_obj(fte, rule->node.parent); trace_mlx5_fs_del_rule(rule); - if (rule->sw_action == MLX5_FLOW_CONTEXT_ACTION_FWD_NEXT_PRIO) { + if (is_fwd_next_action(rule->sw_action)) { mutex_lock(&rule->dest_attr.ft->lock); list_del(&rule->next_ft); mutex_unlock(&rule->dest_attr.ft->lock); @@ -826,6 +832,36 @@ static struct mlx5_flow_table *find_prev_chained_ft(struct fs_prio *prio) return find_closest_ft(prio, true); } +static struct fs_prio *find_fwd_ns_prio(struct mlx5_flow_root_namespace *root, + struct mlx5_flow_namespace *ns) +{ + struct mlx5_flow_namespace *root_ns = &root->ns; + struct fs_prio *iter_prio; + struct fs_prio *prio; + + fs_get_obj(prio, ns->node.parent); + list_for_each_entry(iter_prio, &root_ns->node.children, node.list) { + if (iter_prio == prio && + !list_is_last(&prio->node.children, &iter_prio->node.list)) + return list_next_entry(iter_prio, node.list); + } + return NULL; +} + +static struct mlx5_flow_table *find_next_fwd_ft(struct mlx5_flow_table *ft, + struct mlx5_flow_act *flow_act) +{ + struct mlx5_flow_root_namespace *root = find_root(&ft->node); + struct fs_prio *prio; + + if (flow_act->action & MLX5_FLOW_CONTEXT_ACTION_FWD_NEXT_NS) + prio = find_fwd_ns_prio(root, ft->ns); + else + fs_get_obj(prio, ft->node.parent); + + return (prio) ? find_next_chained_ft(prio) : NULL; +} + static int connect_fts_in_prio(struct mlx5_core_dev *dev, struct fs_prio *prio, struct mlx5_flow_table *ft) @@ -976,6 +1012,10 @@ static int connect_fwd_rules(struct mlx5_core_dev *dev, list_splice_init(&old_next_ft->fwd_rules, &new_next_ft->fwd_rules); mutex_unlock(&old_next_ft->lock); list_for_each_entry(iter, &new_next_ft->fwd_rules, next_ft) { + if ((iter->sw_action & MLX5_FLOW_CONTEXT_ACTION_FWD_NEXT_NS) && + iter->ft->ns == new_next_ft->ns) + continue; + err = _mlx5_modify_rule_destination(iter, &dest); if (err) pr_err("mlx5_core: failed to modify rule to point on flow table %d\n", @@ -1077,6 +1117,7 @@ static struct mlx5_flow_table *__mlx5_create_flow_table(struct mlx5_flow_namespa next_ft = unmanaged ? 
ft_attr->next_ft : find_next_chained_ft(fs_prio); ft->def_miss_action = ns->def_miss_action; + ft->ns = ns; err = root->cmds->create_flow_table(root, ft, log_table_sz, next_ft); if (err) goto free_ft; @@ -1903,21 +1944,19 @@ mlx5_add_flow_rules(struct mlx5_flow_table *ft, struct mlx5_flow_table *next_ft = NULL; struct mlx5_flow_handle *handle = NULL; u32 sw_action = flow_act->action; - struct fs_prio *prio; int i; if (!spec) spec = &zero_spec; - if (!(sw_action & MLX5_FLOW_CONTEXT_ACTION_FWD_NEXT_PRIO)) + if (!is_fwd_next_action(sw_action)) return _mlx5_add_flow_rules(ft, spec, flow_act, dest, num_dest); if (!fwd_next_prio_supported(ft)) return ERR_PTR(-EOPNOTSUPP); mutex_lock(&root->chain_lock); - fs_get_obj(prio, ft->node.parent); - next_ft = find_next_chained_ft(prio); + next_ft = find_next_fwd_ft(ft, flow_act); if (!next_ft) { handle = ERR_PTR(-EOPNOTSUPP); goto unlock; @@ -1936,8 +1975,8 @@ mlx5_add_flow_rules(struct mlx5_flow_table *ft, gen_dest[i].ft = next_ft; dest = gen_dest; num_dest++; - flow_act->action &= - ~MLX5_FLOW_CONTEXT_ACTION_FWD_NEXT_PRIO; + flow_act->action &= ~(MLX5_FLOW_CONTEXT_ACTION_FWD_NEXT_PRIO | + MLX5_FLOW_CONTEXT_ACTION_FWD_NEXT_NS); flow_act->action |= MLX5_FLOW_CONTEXT_ACTION_FWD_DEST; handle = _mlx5_add_flow_rules(ft, spec, flow_act, dest, num_dest); if (IS_ERR(handle)) @@ -1948,8 +1987,8 @@ mlx5_add_flow_rules(struct mlx5_flow_table *ft, list_add(&handle->rule[num_dest - 1]->next_ft, &next_ft->fwd_rules); mutex_unlock(&next_ft->lock); - handle->rule[num_dest - 1]->sw_action = - MLX5_FLOW_CONTEXT_ACTION_FWD_NEXT_PRIO; + handle->rule[num_dest - 1]->sw_action = sw_action; + handle->rule[num_dest - 1]->ft = ft; } unlock: mutex_unlock(&root->chain_lock); diff --git a/drivers/net/ethernet/mellanox/mlx5/core/fs_core.h b/drivers/net/ethernet/mellanox/mlx5/core/fs_core.h index 508108c58dae..825b662f809b 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/fs_core.h +++ b/drivers/net/ethernet/mellanox/mlx5/core/fs_core.h @@ -138,6 +138,7 @@ struct fs_node { struct mlx5_flow_rule { struct fs_node node; + struct mlx5_flow_table *ft; struct mlx5_flow_destination dest_attr; /* next_ft should be accessed under chain_lock and only of * destination type is FWD_NEXT_fT. @@ -175,6 +176,7 @@ struct mlx5_flow_table { u32 flags; struct rhltable fgs_hash; enum mlx5_flow_table_miss_action def_miss_action; + struct mlx5_flow_namespace *ns; }; struct mlx5_ft_underlay_qp { diff --git a/include/linux/mlx5/fs.h b/include/linux/mlx5/fs.h index e2d13e074067..6c5aa0a21425 100644 --- a/include/linux/mlx5/fs.h +++ b/include/linux/mlx5/fs.h @@ -42,6 +42,7 @@ enum { MLX5_FLOW_CONTEXT_ACTION_FWD_NEXT_PRIO = 1 << 16, MLX5_FLOW_CONTEXT_ACTION_ENCRYPT = 1 << 17, MLX5_FLOW_CONTEXT_ACTION_DECRYPT = 1 << 18, + MLX5_FLOW_CONTEXT_ACTION_FWD_NEXT_NS = 1 << 19, }; enum { -- cgit v1.2.3 From 9d9a6ebfea329cadeda87544dceae01e9c27aaef Mon Sep 17 00:00:00 2001 From: Davidlohr Bueso Date: Thu, 23 Apr 2020 22:48:34 -0700 Subject: rcuwait: Let rcuwait_wake_up() return whether or not a task was awoken Propagating the return value of wake_up_process() back to the caller can come in handy for future users, such as for statistics or accounting purposes. 
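For instance, a caller can now count successful wakeups; the kvm conversion later in this series does exactly this:

	if (rcuwait_wake_up(waitp))
		++vcpu->stat.halt_wakeup;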
Acked-by: Peter Zijlstra (Intel) Signed-off-by: Davidlohr Bueso Message-Id: <20200424054837.5138-3-dave@stgolabs.net> Signed-off-by: Paolo Bonzini --- include/linux/rcuwait.h | 2 +- kernel/exit.c | 7 +++++-- 2 files changed, 6 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/rcuwait.h b/include/linux/rcuwait.h index 2ffe1ee6d482..6ebb23258a27 100644 --- a/include/linux/rcuwait.h +++ b/include/linux/rcuwait.h @@ -25,7 +25,7 @@ static inline void rcuwait_init(struct rcuwait *w) w->task = NULL; } -extern void rcuwait_wake_up(struct rcuwait *w); +extern int rcuwait_wake_up(struct rcuwait *w); /* * The caller is responsible for locking around rcuwait_wait_event(), diff --git a/kernel/exit.c b/kernel/exit.c index 9f9015f3f6b0..f3beb637acf7 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -227,8 +227,9 @@ repeat: goto repeat; } -void rcuwait_wake_up(struct rcuwait *w) +int rcuwait_wake_up(struct rcuwait *w) { + int ret = 0; struct task_struct *task; rcu_read_lock(); @@ -248,8 +249,10 @@ void rcuwait_wake_up(struct rcuwait *w) task = rcu_dereference(w->task); if (task) - wake_up_process(task); + ret = wake_up_process(task); rcu_read_unlock(); + + return ret; } EXPORT_SYMBOL_GPL(rcuwait_wake_up); -- cgit v1.2.3 From 5c21f7b322cb9fcb97f2e22f5976a0ae8dd71354 Mon Sep 17 00:00:00 2001 From: Davidlohr Bueso Date: Thu, 23 Apr 2020 22:48:35 -0700 Subject: rcuwait: Introduce prepare_to and finish_rcuwait This allows further flexibility for some callers to implement ad-hoc versions of the generic rcuwait_wait_event(). For example, kvm will need this to maintain tracing semantics. The naming is of course similar to what waitqueue apis offer. Also go ahead and make use of rcu_assign_pointer() for both task writes as it will make the __rcu sparse people happy - this will be the special nil case, thus no added serialization. Acked-by: Peter Zijlstra (Intel) Signed-off-by: Davidlohr Bueso Message-Id: <20200424054837.5138-4-dave@stgolabs.net> Signed-off-by: Paolo Bonzini --- include/linux/rcuwait.h | 21 ++++++++++++++++----- 1 file changed, 16 insertions(+), 5 deletions(-) (limited to 'include/linux') diff --git a/include/linux/rcuwait.h b/include/linux/rcuwait.h index 6ebb23258a27..45bc6604e9b1 100644 --- a/include/linux/rcuwait.h +++ b/include/linux/rcuwait.h @@ -29,12 +29,25 @@ extern int rcuwait_wake_up(struct rcuwait *w); /* * The caller is responsible for locking around rcuwait_wait_event(), - * such that writes to @task are properly serialized. + * and [prepare_to/finish]_rcuwait() such that writes to @task are + * properly serialized. */ + +static inline void prepare_to_rcuwait(struct rcuwait *w) +{ + rcu_assign_pointer(w->task, current); +} + +static inline void finish_rcuwait(struct rcuwait *w) +{ + rcu_assign_pointer(w->task, NULL); + __set_current_state(TASK_RUNNING); +} + #define rcuwait_wait_event(w, condition, state) \ ({ \ int __ret = 0; \ - rcu_assign_pointer((w)->task, current); \ + prepare_to_rcuwait(w); \ for (;;) { \ /* \ * Implicit barrier (A) pairs with (B) in \ @@ -51,9 +64,7 @@ extern int rcuwait_wake_up(struct rcuwait *w); \ schedule(); \ } \ - \ - WRITE_ONCE((w)->task, NULL); \ - __set_current_state(TASK_RUNNING); \ + finish_rcuwait(w); \ __ret; \ }) -- cgit v1.2.3 From 191a43be61d6791fd4a9098a35c1a09e73f55228 Mon Sep 17 00:00:00 2001 From: Davidlohr Bueso Date: Thu, 23 Apr 2020 22:48:36 -0700 Subject: rcuwait: Introduce rcuwait_active() This call is lockless and thus should not be trusted blindly. 
For example, the return value of rcuwait_wake_up() should be used to track whether a process was woken up. Signed-off-by: Davidlohr Bueso Message-Id: <20200424054837.5138-5-dave@stgolabs.net> Acked-by: Peter Zijlstra (Intel) Signed-off-by: Paolo Bonzini --- include/linux/rcuwait.h | 9 +++++++++ 1 file changed, 9 insertions(+) (limited to 'include/linux') diff --git a/include/linux/rcuwait.h b/include/linux/rcuwait.h index 45bc6604e9b1..c1414ce44abc 100644 --- a/include/linux/rcuwait.h +++ b/include/linux/rcuwait.h @@ -25,6 +25,15 @@ static inline void rcuwait_init(struct rcuwait *w) w->task = NULL; } +/* + * Note: this provides no serialization and, just as with waitqueues, + * requires care to estimate as to whether or not the wait is active. + */ +static inline int rcuwait_active(struct rcuwait *w) +{ + return !!rcu_dereference(w->task); +} + extern int rcuwait_wake_up(struct rcuwait *w); /* -- cgit v1.2.3 From da4ad88cab5867ee240dfd0585e9d115a8cc47db Mon Sep 17 00:00:00 2001 From: Davidlohr Bueso Date: Thu, 23 Apr 2020 22:48:37 -0700 Subject: kvm: Replace vcpu->swait with rcuwait The use of any sort of waitqueue (simple or regular) for waiting/waking vcpus has always been overkill and semantically wrong. Because this is per-vcpu (which is blocked) there is only ever a single waiting vcpu, thus no need for any sort of queue. As such, make use of the rcuwait primitive, with the following considerations: - rcuwait already provides the proper barriers that serialize concurrent waiter and waker. - Task wakeup is done in rcu read critical region, with a stable task pointer. - Because there is no concurrency among waiters, we need not worry about rcuwait_wait_event() calls corrupting the wait->task. As a consequence, this saves the locking done in swait when modifying the queue. This also applies to per-vcore wait for powerpc kvm-hv. The x86 tscdeadline_latency test mentioned in 8577370fb0cb ("KVM: Use simple waitqueue for vcpu->wq") shows that, on avg, latency is reduced by around 15-20% with this change. Cc: Paul Mackerras Cc: kvmarm@lists.cs.columbia.edu Cc: linux-mips@vger.kernel.org Reviewed-by: Marc Zyngier Signed-off-by: Davidlohr Bueso Message-Id: <20200424054837.5138-6-dave@stgolabs.net> [Avoid extra logic changes.
- Paolo] Signed-off-by: Paolo Bonzini --- arch/mips/kvm/mips.c | 6 ++---- arch/powerpc/include/asm/kvm_book3s.h | 2 +- arch/powerpc/include/asm/kvm_host.h | 2 +- arch/powerpc/kvm/book3s_hv.c | 23 ++++++++++------------- arch/powerpc/kvm/powerpc.c | 2 +- arch/x86/kvm/lapic.c | 2 +- include/linux/kvm_host.h | 10 +++++----- virt/kvm/arm/arch_timer.c | 3 ++- virt/kvm/arm/arm.c | 9 +++++---- virt/kvm/async_pf.c | 3 +-- virt/kvm/kvm_main.c | 19 +++++++++---------- 11 files changed, 38 insertions(+), 43 deletions(-) (limited to 'include/linux') diff --git a/arch/mips/kvm/mips.c b/arch/mips/kvm/mips.c index 9f50ceef9978..9787cdec33e6 100644 --- a/arch/mips/kvm/mips.c +++ b/arch/mips/kvm/mips.c @@ -283,8 +283,7 @@ static enum hrtimer_restart kvm_mips_comparecount_wakeup(struct hrtimer *timer) kvm_mips_callbacks->queue_timer_int(vcpu); vcpu->arch.wait = 0; - if (swq_has_sleeper(&vcpu->wq)) - swake_up_one(&vcpu->wq); + rcuwait_wake_up(&vcpu->wait); return kvm_mips_count_timeout(vcpu); } @@ -511,8 +510,7 @@ int kvm_vcpu_ioctl_interrupt(struct kvm_vcpu *vcpu, dvcpu->arch.wait = 0; - if (swq_has_sleeper(&dvcpu->wq)) - swake_up_one(&dvcpu->wq); + rcuwait_wake_up(&dvcpu->wait); return 0; } diff --git a/arch/powerpc/include/asm/kvm_book3s.h b/arch/powerpc/include/asm/kvm_book3s.h index 506e4df2d730..6e5d85ba588d 100644 --- a/arch/powerpc/include/asm/kvm_book3s.h +++ b/arch/powerpc/include/asm/kvm_book3s.h @@ -78,7 +78,7 @@ struct kvmppc_vcore { struct kvm_vcpu *runnable_threads[MAX_SMT_THREADS]; struct list_head preempt_list; spinlock_t lock; - struct swait_queue_head wq; + struct rcuwait wait; spinlock_t stoltb_lock; /* protects stolen_tb and preempt_tb */ u64 stolen_tb; u64 preempt_tb; diff --git a/arch/powerpc/include/asm/kvm_host.h b/arch/powerpc/include/asm/kvm_host.h index 1dc63101ffe1..337047ba4a56 100644 --- a/arch/powerpc/include/asm/kvm_host.h +++ b/arch/powerpc/include/asm/kvm_host.h @@ -751,7 +751,7 @@ struct kvm_vcpu_arch { u8 irq_pending; /* Used by XIVE to signal pending guest irqs */ u32 last_inst; - struct swait_queue_head *wqp; + struct rcuwait *waitp; struct kvmppc_vcore *vcore; int ret; int trap; diff --git a/arch/powerpc/kvm/book3s_hv.c b/arch/powerpc/kvm/book3s_hv.c index 93493f0cbfe8..7f59c47a5b9d 100644 --- a/arch/powerpc/kvm/book3s_hv.c +++ b/arch/powerpc/kvm/book3s_hv.c @@ -230,13 +230,11 @@ static bool kvmppc_ipi_thread(int cpu) static void kvmppc_fast_vcpu_kick_hv(struct kvm_vcpu *vcpu) { int cpu; - struct swait_queue_head *wqp; + struct rcuwait *waitp; - wqp = kvm_arch_vcpu_wq(vcpu); - if (swq_has_sleeper(wqp)) { - swake_up_one(wqp); + waitp = kvm_arch_vcpu_get_wait(vcpu); + if (rcuwait_wake_up(waitp)) ++vcpu->stat.halt_wakeup; - } cpu = READ_ONCE(vcpu->arch.thread_cpu); if (cpu >= 0 && kvmppc_ipi_thread(cpu)) @@ -2125,7 +2123,7 @@ static struct kvmppc_vcore *kvmppc_vcore_create(struct kvm *kvm, int id) spin_lock_init(&vcore->lock); spin_lock_init(&vcore->stoltb_lock); - init_swait_queue_head(&vcore->wq); + rcuwait_init(&vcore->wait); vcore->preempt_tb = TB_NIL; vcore->lpcr = kvm->arch.lpcr; vcore->first_vcpuid = id; @@ -3784,7 +3782,6 @@ static void kvmppc_vcore_blocked(struct kvmppc_vcore *vc) ktime_t cur, start_poll, start_wait; int do_sleep = 1; u64 block_ns; - DECLARE_SWAITQUEUE(wait); /* Poll for pending exceptions and ceded state */ cur = start_poll = ktime_get(); @@ -3812,10 +3809,10 @@ static void kvmppc_vcore_blocked(struct kvmppc_vcore *vc) } } - prepare_to_swait_exclusive(&vc->wq, &wait, TASK_INTERRUPTIBLE); - + prepare_to_rcuwait(&vc->wait); + 
set_current_state(TASK_INTERRUPTIBLE); if (kvmppc_vcore_check_block(vc)) { - finish_swait(&vc->wq, &wait); + finish_rcuwait(&vc->wait); do_sleep = 0; /* If we polled, count this as a successful poll */ if (vc->halt_poll_ns) @@ -3829,7 +3826,7 @@ static void kvmppc_vcore_blocked(struct kvmppc_vcore *vc) trace_kvmppc_vcore_blocked(vc, 0); spin_unlock(&vc->lock); schedule(); - finish_swait(&vc->wq, &wait); + finish_rcuwait(&vc->wait); spin_lock(&vc->lock); vc->vcore_state = VCORE_INACTIVE; trace_kvmppc_vcore_blocked(vc, 1); @@ -3940,7 +3937,7 @@ static int kvmppc_run_vcpu(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu) kvmppc_start_thread(vcpu, vc); trace_kvm_guest_enter(vcpu); } else if (vc->vcore_state == VCORE_SLEEPING) { - swake_up_one(&vc->wq); + rcuwait_wake_up(&vc->wait); } } @@ -4279,7 +4276,7 @@ static int kvmppc_vcpu_run_hv(struct kvm_run *run, struct kvm_vcpu *vcpu) } user_vrsave = mfspr(SPRN_VRSAVE); - vcpu->arch.wqp = &vcpu->arch.vcore->wq; + vcpu->arch.waitp = &vcpu->arch.vcore->wait; vcpu->arch.pgdir = kvm->mm->pgd; vcpu->arch.state = KVMPPC_VCPU_BUSY_IN_HOST; diff --git a/arch/powerpc/kvm/powerpc.c b/arch/powerpc/kvm/powerpc.c index 052614e9d468..27ccff612903 100644 --- a/arch/powerpc/kvm/powerpc.c +++ b/arch/powerpc/kvm/powerpc.c @@ -752,7 +752,7 @@ int kvm_arch_vcpu_create(struct kvm_vcpu *vcpu) if (err) goto out_vcpu_uninit; - vcpu->arch.wqp = &vcpu->wq; + vcpu->arch.waitp = &vcpu->wait; kvmppc_create_vcpu_debugfs(vcpu, vcpu->vcpu_id); return 0; diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c index 38f7dc9c16ee..42cd2e3ec6fd 100644 --- a/arch/x86/kvm/lapic.c +++ b/arch/x86/kvm/lapic.c @@ -1861,7 +1861,7 @@ void kvm_lapic_expired_hv_timer(struct kvm_vcpu *vcpu) /* If the preempt notifier has already run, it also called apic_timer_expired */ if (!apic->lapic_timer.hv_timer_in_use) goto out; - WARN_ON(swait_active(&vcpu->wq)); + WARN_ON(rcuwait_active(&vcpu->wait)); cancel_hv_timer(apic); apic_timer_expired(apic); diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index abfa71cb5d2d..161684696610 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -23,7 +23,7 @@ #include #include #include -#include +#include #include #include #include @@ -277,7 +277,7 @@ struct kvm_vcpu { struct mutex mutex; struct kvm_run *run; - struct swait_queue_head wq; + struct rcuwait wait; struct pid __rcu *pid; int sigset_active; sigset_t sigset; @@ -960,12 +960,12 @@ static inline bool kvm_arch_has_assigned_device(struct kvm *kvm) } #endif -static inline struct swait_queue_head *kvm_arch_vcpu_wq(struct kvm_vcpu *vcpu) +static inline struct rcuwait *kvm_arch_vcpu_get_wait(struct kvm_vcpu *vcpu) { #ifdef __KVM_HAVE_ARCH_WQP - return vcpu->arch.wqp; + return vcpu->arch.waitp; #else - return &vcpu->wq; + return &vcpu->wait; #endif } diff --git a/virt/kvm/arm/arch_timer.c b/virt/kvm/arm/arch_timer.c index 93bd59b46848..d5024416e722 100644 --- a/virt/kvm/arm/arch_timer.c +++ b/virt/kvm/arm/arch_timer.c @@ -571,6 +571,7 @@ void kvm_timer_vcpu_put(struct kvm_vcpu *vcpu) { struct arch_timer_cpu *timer = vcpu_timer(vcpu); struct timer_map map; + struct rcuwait *wait = kvm_arch_vcpu_get_wait(vcpu); if (unlikely(!timer->enabled)) return; @@ -593,7 +594,7 @@ void kvm_timer_vcpu_put(struct kvm_vcpu *vcpu) if (map.emul_ptimer) soft_timer_cancel(&map.emul_ptimer->hrtimer); - if (swait_active(kvm_arch_vcpu_wq(vcpu))) + if (rcuwait_active(wait)) kvm_timer_blocking(vcpu); /* diff --git a/virt/kvm/arm/arm.c b/virt/kvm/arm/arm.c index f5390ac2165b..d5db0d6141ff 100644 --- 
a/virt/kvm/arm/arm.c +++ b/virt/kvm/arm/arm.c @@ -579,16 +579,17 @@ void kvm_arm_resume_guest(struct kvm *kvm) kvm_for_each_vcpu(i, vcpu, kvm) { vcpu->arch.pause = false; - swake_up_one(kvm_arch_vcpu_wq(vcpu)); + rcuwait_wake_up(kvm_arch_vcpu_get_wait(vcpu)); } } static void vcpu_req_sleep(struct kvm_vcpu *vcpu) { - struct swait_queue_head *wq = kvm_arch_vcpu_wq(vcpu); + struct rcuwait *wait = kvm_arch_vcpu_get_wait(vcpu); - swait_event_interruptible_exclusive(*wq, ((!vcpu->arch.power_off) && - (!vcpu->arch.pause))); + rcuwait_wait_event(wait, + (!vcpu->arch.power_off) &&(!vcpu->arch.pause), + TASK_INTERRUPTIBLE); if (vcpu->arch.power_off || vcpu->arch.pause) { /* Awaken to handle a signal, request we sleep again later. */ diff --git a/virt/kvm/async_pf.c b/virt/kvm/async_pf.c index 15e5b037f92d..10b533f641a6 100644 --- a/virt/kvm/async_pf.c +++ b/virt/kvm/async_pf.c @@ -80,8 +80,7 @@ static void async_pf_execute(struct work_struct *work) trace_kvm_async_pf_completed(addr, cr2_or_gpa); - if (swq_has_sleeper(&vcpu->wq)) - swake_up_one(&vcpu->wq); + rcuwait_wake_up(&vcpu->wait); mmput(mm); kvm_put_kvm(vcpu->kvm); diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index 7525f3838160..e12317f32c5e 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -349,7 +349,7 @@ static void kvm_vcpu_init(struct kvm_vcpu *vcpu, struct kvm *kvm, unsigned id) vcpu->kvm = kvm; vcpu->vcpu_id = id; vcpu->pid = NULL; - init_swait_queue_head(&vcpu->wq); + rcuwait_init(&vcpu->wait); kvm_async_pf_vcpu_init(vcpu); vcpu->pre_pcpu = -1; @@ -2678,7 +2678,6 @@ out: void kvm_vcpu_block(struct kvm_vcpu *vcpu) { ktime_t start, cur; - DECLARE_SWAITQUEUE(wait); bool waited = false; u64 block_ns; @@ -2704,8 +2703,9 @@ void kvm_vcpu_block(struct kvm_vcpu *vcpu) } while (single_task_running() && ktime_before(cur, stop)); } + prepare_to_rcuwait(&vcpu->wait); for (;;) { - prepare_to_swait_exclusive(&vcpu->wq, &wait, TASK_INTERRUPTIBLE); + set_current_state(TASK_INTERRUPTIBLE); if (kvm_vcpu_check_block(vcpu) < 0) break; @@ -2713,8 +2713,7 @@ void kvm_vcpu_block(struct kvm_vcpu *vcpu) waited = true; schedule(); } - - finish_swait(&vcpu->wq, &wait); + finish_rcuwait(&vcpu->wait); cur = ktime_get(); out: kvm_arch_vcpu_unblocking(vcpu); @@ -2746,11 +2745,10 @@ EXPORT_SYMBOL_GPL(kvm_vcpu_block); bool kvm_vcpu_wake_up(struct kvm_vcpu *vcpu) { - struct swait_queue_head *wqp; + struct rcuwait *waitp; - wqp = kvm_arch_vcpu_wq(vcpu); - if (swq_has_sleeper(wqp)) { - swake_up_one(wqp); + waitp = kvm_arch_vcpu_get_wait(vcpu); + if (rcuwait_wake_up(waitp)) { WRITE_ONCE(vcpu->ready, true); ++vcpu->stat.halt_wakeup; return true; @@ -2892,7 +2890,8 @@ void kvm_vcpu_on_spin(struct kvm_vcpu *me, bool yield_to_kernel_mode) continue; if (vcpu == me) continue; - if (swait_active(&vcpu->wq) && !vcpu_dy_runnable(vcpu)) + if (rcuwait_active(&vcpu->wait) && + !vcpu_dy_runnable(vcpu)) continue; if (READ_ONCE(vcpu->preempted) && yield_to_kernel_mode && !kvm_arch_vcpu_in_kernel(vcpu)) -- cgit v1.2.3 From d06cfe3f123c50449a0c3ece21bc16668289c50f Mon Sep 17 00:00:00 2001 From: Rob Herring Date: Wed, 29 Apr 2020 15:58:21 -0500 Subject: bus: vexpress-config: Merge vexpress-syscfg into vexpress-config The only thing that vexpress-syscfg does is provide a regmap to vexpress-config bus child devices. There's little reason to have 2 components for this. The current structure with initcall ordering requirements makes turning these components into modules more difficult. So let's start to simplify things and merge vexpress-syscfg into vexpress-config. 
There's no functional change in this commit and it's still separate components until subsequent commits. Cc: Lorenzo Pieralisi Cc: Linus Walleij Cc: Arnd Bergmann Reviewed-by: Sudeep Holla Acked-by: Greg Kroah-Hartman Acked-by: Liviu Dudau Signed-off-by: Rob Herring --- arch/arm/mach-vexpress/Kconfig | 1 - drivers/bus/vexpress-config.c | 283 +++++++++++++++++++++++++++++++++++++++-- drivers/misc/Kconfig | 9 -- drivers/misc/Makefile | 1 - drivers/misc/vexpress-syscfg.c | 280 ---------------------------------------- include/linux/vexpress.h | 17 --- 6 files changed, 274 insertions(+), 317 deletions(-) delete mode 100644 drivers/misc/vexpress-syscfg.c (limited to 'include/linux') diff --git a/arch/arm/mach-vexpress/Kconfig b/arch/arm/mach-vexpress/Kconfig index 2d1fdec4c230..065e12991663 100644 --- a/arch/arm/mach-vexpress/Kconfig +++ b/arch/arm/mach-vexpress/Kconfig @@ -20,7 +20,6 @@ menuconfig ARCH_VEXPRESS select REGULATOR if MMC_ARMMMCI select REGULATOR_FIXED_VOLTAGE if REGULATOR select VEXPRESS_CONFIG - select VEXPRESS_SYSCFG help This option enables support for systems using Cortex processor based ARM core and logic (FPGA) tiles on the Versatile Express motherboard, diff --git a/drivers/bus/vexpress-config.c b/drivers/bus/vexpress-config.c index ff70575b2db6..43f5beac9811 100644 --- a/drivers/bus/vexpress-config.c +++ b/drivers/bus/vexpress-config.c @@ -6,10 +6,48 @@ #include #include +#include #include +#include #include +#include +#include #include +#define SYS_CFGDATA 0x0 + +#define SYS_CFGCTRL 0x4 +#define SYS_CFGCTRL_START (1 << 31) +#define SYS_CFGCTRL_WRITE (1 << 30) +#define SYS_CFGCTRL_DCC(n) (((n) & 0xf) << 26) +#define SYS_CFGCTRL_FUNC(n) (((n) & 0x3f) << 20) +#define SYS_CFGCTRL_SITE(n) (((n) & 0x3) << 16) +#define SYS_CFGCTRL_POSITION(n) (((n) & 0xf) << 12) +#define SYS_CFGCTRL_DEVICE(n) (((n) & 0xfff) << 0) + +#define SYS_CFGSTAT 0x8 +#define SYS_CFGSTAT_ERR (1 << 1) +#define SYS_CFGSTAT_COMPLETE (1 << 0) + + +struct vexpress_syscfg { + struct device *dev; + void __iomem *base; + struct list_head funcs; +}; + +struct vexpress_syscfg_func { + struct list_head list; + struct vexpress_syscfg *syscfg; + struct regmap *regmap; + int num_templates; + u32 template[]; /* Keep it last! 
*/ +}; + +struct vexpress_config_bridge_ops { + struct regmap * (*regmap_init)(struct device *dev, void *context); + void (*regmap_exit)(struct regmap *regmap, void *context); +}; struct vexpress_config_bridge { struct vexpress_config_bridge_ops *ops; @@ -27,17 +65,12 @@ void vexpress_config_set_master(u32 site) vexpress_config_site_master = site; } -u32 vexpress_config_get_master(void) -{ - return vexpress_config_site_master; -} - -void vexpress_config_lock(void *arg) +static void vexpress_config_lock(void *arg) { mutex_lock(&vexpress_config_mutex); } -void vexpress_config_unlock(void *arg) +static void vexpress_config_unlock(void *arg) { mutex_unlock(&vexpress_config_mutex); } @@ -59,7 +92,7 @@ static void vexpress_config_find_prop(struct device_node *node, } } -int vexpress_config_get_topo(struct device_node *node, u32 *site, +static int vexpress_config_get_topo(struct device_node *node, u32 *site, u32 *position, u32 *dcc) { vexpress_config_find_prop(node, "arm,vexpress,site", site); @@ -113,7 +146,7 @@ struct regmap *devm_regmap_init_vexpress_config(struct device *dev) } EXPORT_SYMBOL_GPL(devm_regmap_init_vexpress_config); -struct device *vexpress_config_bridge_register(struct device *parent, +static struct device *vexpress_config_bridge_register(struct device *parent, struct vexpress_config_bridge_ops *ops, void *context) { struct device *dev; @@ -201,3 +234,235 @@ static int __init vexpress_config_init(void) } postcore_initcall(vexpress_config_init); +static int vexpress_syscfg_exec(struct vexpress_syscfg_func *func, + int index, bool write, u32 *data) +{ + struct vexpress_syscfg *syscfg = func->syscfg; + u32 command, status; + int tries; + long timeout; + + if (WARN_ON(index >= func->num_templates)) + return -EINVAL; + + command = readl(syscfg->base + SYS_CFGCTRL); + if (WARN_ON(command & SYS_CFGCTRL_START)) + return -EBUSY; + + command = func->template[index]; + command |= SYS_CFGCTRL_START; + command |= write ? SYS_CFGCTRL_WRITE : 0; + + /* Use a canary for reads */ + if (!write) + *data = 0xdeadbeef; + + dev_dbg(syscfg->dev, "func %p, command %x, data %x\n", + func, command, *data); + writel(*data, syscfg->base + SYS_CFGDATA); + writel(0, syscfg->base + SYS_CFGSTAT); + writel(command, syscfg->base + SYS_CFGCTRL); + mb(); + + /* The operation can take ages... 
Go to sleep, 100us initially */ + tries = 100; + timeout = 100; + do { + if (!irqs_disabled()) { + set_current_state(TASK_INTERRUPTIBLE); + schedule_timeout(usecs_to_jiffies(timeout)); + if (signal_pending(current)) + return -EINTR; + } else { + udelay(timeout); + } + + status = readl(syscfg->base + SYS_CFGSTAT); + if (status & SYS_CFGSTAT_ERR) + return -EFAULT; + + if (timeout > 20) + timeout -= 20; + } while (--tries && !(status & SYS_CFGSTAT_COMPLETE)); + if (WARN_ON_ONCE(!tries)) + return -ETIMEDOUT; + + if (!write) { + *data = readl(syscfg->base + SYS_CFGDATA); + dev_dbg(syscfg->dev, "func %p, read data %x\n", func, *data); + } + + return 0; +} + +static int vexpress_syscfg_read(void *context, unsigned int index, + unsigned int *val) +{ + struct vexpress_syscfg_func *func = context; + + return vexpress_syscfg_exec(func, index, false, val); +} + +static int vexpress_syscfg_write(void *context, unsigned int index, + unsigned int val) +{ + struct vexpress_syscfg_func *func = context; + + return vexpress_syscfg_exec(func, index, true, &val); +} + +static struct regmap_config vexpress_syscfg_regmap_config = { + .lock = vexpress_config_lock, + .unlock = vexpress_config_unlock, + .reg_bits = 32, + .val_bits = 32, + .reg_read = vexpress_syscfg_read, + .reg_write = vexpress_syscfg_write, + .reg_format_endian = REGMAP_ENDIAN_LITTLE, + .val_format_endian = REGMAP_ENDIAN_LITTLE, +}; + + +static struct regmap *vexpress_syscfg_regmap_init(struct device *dev, + void *context) +{ + int err; + struct vexpress_syscfg *syscfg = context; + struct vexpress_syscfg_func *func; + struct property *prop; + const __be32 *val = NULL; + __be32 energy_quirk[4]; + int num; + u32 site, position, dcc; + int i; + + err = vexpress_config_get_topo(dev->of_node, &site, + &position, &dcc); + if (err) + return ERR_PTR(err); + + prop = of_find_property(dev->of_node, + "arm,vexpress-sysreg,func", NULL); + if (!prop) + return ERR_PTR(-EINVAL); + + num = prop->length / sizeof(u32) / 2; + val = prop->value; + + /* + * "arm,vexpress-energy" function used to be described + * by its first device only, now it requires both + */ + if (num == 1 && of_device_is_compatible(dev->of_node, + "arm,vexpress-energy")) { + num = 2; + energy_quirk[0] = *val; + energy_quirk[2] = *val++; + energy_quirk[1] = *val; + energy_quirk[3] = cpu_to_be32(be32_to_cpup(val) + 1); + val = energy_quirk; + } + + func = kzalloc(struct_size(func, template, num), GFP_KERNEL); + if (!func) + return ERR_PTR(-ENOMEM); + + func->syscfg = syscfg; + func->num_templates = num; + + for (i = 0; i < num; i++) { + u32 function, device; + + function = be32_to_cpup(val++); + device = be32_to_cpup(val++); + + dev_dbg(dev, "func %p: %u/%u/%u/%u/%u\n", + func, site, position, dcc, + function, device); + + func->template[i] = SYS_CFGCTRL_DCC(dcc); + func->template[i] |= SYS_CFGCTRL_SITE(site); + func->template[i] |= SYS_CFGCTRL_POSITION(position); + func->template[i] |= SYS_CFGCTRL_FUNC(function); + func->template[i] |= SYS_CFGCTRL_DEVICE(device); + } + + vexpress_syscfg_regmap_config.max_register = num - 1; + + func->regmap = regmap_init(dev, NULL, func, + &vexpress_syscfg_regmap_config); + + if (IS_ERR(func->regmap)) { + void *err = func->regmap; + + kfree(func); + return err; + } + + list_add(&func->list, &syscfg->funcs); + + return func->regmap; +} + +static void vexpress_syscfg_regmap_exit(struct regmap *regmap, void *context) +{ + struct vexpress_syscfg *syscfg = context; + struct vexpress_syscfg_func *func, *tmp; + + regmap_exit(regmap); + + list_for_each_entry_safe(func, 
tmp, &syscfg->funcs, list) { + if (func->regmap == regmap) { + list_del(&syscfg->funcs); + kfree(func); + break; + } + } +} + +static struct vexpress_config_bridge_ops vexpress_syscfg_bridge_ops = { + .regmap_init = vexpress_syscfg_regmap_init, + .regmap_exit = vexpress_syscfg_regmap_exit, +}; + + +static int vexpress_syscfg_probe(struct platform_device *pdev) +{ + struct vexpress_syscfg *syscfg; + struct resource *res; + struct device *bridge; + + syscfg = devm_kzalloc(&pdev->dev, sizeof(*syscfg), GFP_KERNEL); + if (!syscfg) + return -ENOMEM; + syscfg->dev = &pdev->dev; + INIT_LIST_HEAD(&syscfg->funcs); + + res = platform_get_resource(pdev, IORESOURCE_MEM, 0); + syscfg->base = devm_ioremap_resource(&pdev->dev, res); + if (IS_ERR(syscfg->base)) + return PTR_ERR(syscfg->base); + + /* Must use dev.parent (MFD), as that's where DT phandle points at... */ + bridge = vexpress_config_bridge_register(pdev->dev.parent, + &vexpress_syscfg_bridge_ops, syscfg); + + return PTR_ERR_OR_ZERO(bridge); +} + +static const struct platform_device_id vexpress_syscfg_id_table[] = { + { "vexpress-syscfg", }, + {}, +}; + +static struct platform_driver vexpress_syscfg_driver = { + .driver.name = "vexpress-syscfg", + .id_table = vexpress_syscfg_id_table, + .probe = vexpress_syscfg_probe, +}; + +static int __init vexpress_syscfg_init(void) +{ + return platform_driver_register(&vexpress_syscfg_driver); +} +core_initcall(vexpress_syscfg_init); diff --git a/drivers/misc/Kconfig b/drivers/misc/Kconfig index 99e151475d8f..edd5dd5ebfdc 100644 --- a/drivers/misc/Kconfig +++ b/drivers/misc/Kconfig @@ -423,15 +423,6 @@ config SRAM config SRAM_EXEC bool -config VEXPRESS_SYSCFG - bool "Versatile Express System Configuration driver" - depends on VEXPRESS_CONFIG - default y - help - ARM Ltd. Versatile Express uses specialised platform configuration - bus. System Configuration interface is one of the possible means - of generating transactions on this bus. 
- config PCI_ENDPOINT_TEST depends on PCI select CRC32 diff --git a/drivers/misc/Makefile b/drivers/misc/Makefile index 9abf2923d831..c7bd01ac6291 100644 --- a/drivers/misc/Makefile +++ b/drivers/misc/Makefile @@ -49,7 +49,6 @@ obj-$(CONFIG_SRAM_EXEC) += sram-exec.o obj-y += mic/ obj-$(CONFIG_GENWQE) += genwqe/ obj-$(CONFIG_ECHO) += echo/ -obj-$(CONFIG_VEXPRESS_SYSCFG) += vexpress-syscfg.o obj-$(CONFIG_CXL_BASE) += cxl/ obj-$(CONFIG_PCI_ENDPOINT_TEST) += pci_endpoint_test.o obj-$(CONFIG_OCXL) += ocxl/ diff --git a/drivers/misc/vexpress-syscfg.c b/drivers/misc/vexpress-syscfg.c deleted file mode 100644 index a431787c0898..000000000000 --- a/drivers/misc/vexpress-syscfg.c +++ /dev/null @@ -1,280 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-only -/* - * - * Copyright (C) 2014 ARM Limited - */ - -#include -#include -#include -#include -#include -#include -#include -#include -#include - - -#define SYS_CFGDATA 0x0 - -#define SYS_CFGCTRL 0x4 -#define SYS_CFGCTRL_START (1 << 31) -#define SYS_CFGCTRL_WRITE (1 << 30) -#define SYS_CFGCTRL_DCC(n) (((n) & 0xf) << 26) -#define SYS_CFGCTRL_FUNC(n) (((n) & 0x3f) << 20) -#define SYS_CFGCTRL_SITE(n) (((n) & 0x3) << 16) -#define SYS_CFGCTRL_POSITION(n) (((n) & 0xf) << 12) -#define SYS_CFGCTRL_DEVICE(n) (((n) & 0xfff) << 0) - -#define SYS_CFGSTAT 0x8 -#define SYS_CFGSTAT_ERR (1 << 1) -#define SYS_CFGSTAT_COMPLETE (1 << 0) - - -struct vexpress_syscfg { - struct device *dev; - void __iomem *base; - struct list_head funcs; -}; - -struct vexpress_syscfg_func { - struct list_head list; - struct vexpress_syscfg *syscfg; - struct regmap *regmap; - int num_templates; - u32 template[]; /* Keep it last! */ -}; - - -static int vexpress_syscfg_exec(struct vexpress_syscfg_func *func, - int index, bool write, u32 *data) -{ - struct vexpress_syscfg *syscfg = func->syscfg; - u32 command, status; - int tries; - long timeout; - - if (WARN_ON(index >= func->num_templates)) - return -EINVAL; - - command = readl(syscfg->base + SYS_CFGCTRL); - if (WARN_ON(command & SYS_CFGCTRL_START)) - return -EBUSY; - - command = func->template[index]; - command |= SYS_CFGCTRL_START; - command |= write ? SYS_CFGCTRL_WRITE : 0; - - /* Use a canary for reads */ - if (!write) - *data = 0xdeadbeef; - - dev_dbg(syscfg->dev, "func %p, command %x, data %x\n", - func, command, *data); - writel(*data, syscfg->base + SYS_CFGDATA); - writel(0, syscfg->base + SYS_CFGSTAT); - writel(command, syscfg->base + SYS_CFGCTRL); - mb(); - - /* The operation can take ages... 
Go to sleep, 100us initially */ - tries = 100; - timeout = 100; - do { - if (!irqs_disabled()) { - set_current_state(TASK_INTERRUPTIBLE); - schedule_timeout(usecs_to_jiffies(timeout)); - if (signal_pending(current)) - return -EINTR; - } else { - udelay(timeout); - } - - status = readl(syscfg->base + SYS_CFGSTAT); - if (status & SYS_CFGSTAT_ERR) - return -EFAULT; - - if (timeout > 20) - timeout -= 20; - } while (--tries && !(status & SYS_CFGSTAT_COMPLETE)); - if (WARN_ON_ONCE(!tries)) - return -ETIMEDOUT; - - if (!write) { - *data = readl(syscfg->base + SYS_CFGDATA); - dev_dbg(syscfg->dev, "func %p, read data %x\n", func, *data); - } - - return 0; -} - -static int vexpress_syscfg_read(void *context, unsigned int index, - unsigned int *val) -{ - struct vexpress_syscfg_func *func = context; - - return vexpress_syscfg_exec(func, index, false, val); -} - -static int vexpress_syscfg_write(void *context, unsigned int index, - unsigned int val) -{ - struct vexpress_syscfg_func *func = context; - - return vexpress_syscfg_exec(func, index, true, &val); -} - -static struct regmap_config vexpress_syscfg_regmap_config = { - .lock = vexpress_config_lock, - .unlock = vexpress_config_unlock, - .reg_bits = 32, - .val_bits = 32, - .reg_read = vexpress_syscfg_read, - .reg_write = vexpress_syscfg_write, - .reg_format_endian = REGMAP_ENDIAN_LITTLE, - .val_format_endian = REGMAP_ENDIAN_LITTLE, -}; - - -static struct regmap *vexpress_syscfg_regmap_init(struct device *dev, - void *context) -{ - int err; - struct vexpress_syscfg *syscfg = context; - struct vexpress_syscfg_func *func; - struct property *prop; - const __be32 *val = NULL; - __be32 energy_quirk[4]; - int num; - u32 site, position, dcc; - int i; - - err = vexpress_config_get_topo(dev->of_node, &site, - &position, &dcc); - if (err) - return ERR_PTR(err); - - prop = of_find_property(dev->of_node, - "arm,vexpress-sysreg,func", NULL); - if (!prop) - return ERR_PTR(-EINVAL); - - num = prop->length / sizeof(u32) / 2; - val = prop->value; - - /* - * "arm,vexpress-energy" function used to be described - * by its first device only, now it requires both - */ - if (num == 1 && of_device_is_compatible(dev->of_node, - "arm,vexpress-energy")) { - num = 2; - energy_quirk[0] = *val; - energy_quirk[2] = *val++; - energy_quirk[1] = *val; - energy_quirk[3] = cpu_to_be32(be32_to_cpup(val) + 1); - val = energy_quirk; - } - - func = kzalloc(struct_size(func, template, num), GFP_KERNEL); - if (!func) - return ERR_PTR(-ENOMEM); - - func->syscfg = syscfg; - func->num_templates = num; - - for (i = 0; i < num; i++) { - u32 function, device; - - function = be32_to_cpup(val++); - device = be32_to_cpup(val++); - - dev_dbg(dev, "func %p: %u/%u/%u/%u/%u\n", - func, site, position, dcc, - function, device); - - func->template[i] = SYS_CFGCTRL_DCC(dcc); - func->template[i] |= SYS_CFGCTRL_SITE(site); - func->template[i] |= SYS_CFGCTRL_POSITION(position); - func->template[i] |= SYS_CFGCTRL_FUNC(function); - func->template[i] |= SYS_CFGCTRL_DEVICE(device); - } - - vexpress_syscfg_regmap_config.max_register = num - 1; - - func->regmap = regmap_init(dev, NULL, func, - &vexpress_syscfg_regmap_config); - - if (IS_ERR(func->regmap)) { - void *err = func->regmap; - - kfree(func); - return err; - } - - list_add(&func->list, &syscfg->funcs); - - return func->regmap; -} - -static void vexpress_syscfg_regmap_exit(struct regmap *regmap, void *context) -{ - struct vexpress_syscfg *syscfg = context; - struct vexpress_syscfg_func *func, *tmp; - - regmap_exit(regmap); - - list_for_each_entry_safe(func, 
tmp, &syscfg->funcs, list) { - if (func->regmap == regmap) { - list_del(&syscfg->funcs); - kfree(func); - break; - } - } -} - -static struct vexpress_config_bridge_ops vexpress_syscfg_bridge_ops = { - .regmap_init = vexpress_syscfg_regmap_init, - .regmap_exit = vexpress_syscfg_regmap_exit, -}; - - -static int vexpress_syscfg_probe(struct platform_device *pdev) -{ - struct vexpress_syscfg *syscfg; - struct resource *res; - struct device *bridge; - - syscfg = devm_kzalloc(&pdev->dev, sizeof(*syscfg), GFP_KERNEL); - if (!syscfg) - return -ENOMEM; - syscfg->dev = &pdev->dev; - INIT_LIST_HEAD(&syscfg->funcs); - - res = platform_get_resource(pdev, IORESOURCE_MEM, 0); - syscfg->base = devm_ioremap_resource(&pdev->dev, res); - if (IS_ERR(syscfg->base)) - return PTR_ERR(syscfg->base); - - /* Must use dev.parent (MFD), as that's where DT phandle points at... */ - bridge = vexpress_config_bridge_register(pdev->dev.parent, - &vexpress_syscfg_bridge_ops, syscfg); - - return PTR_ERR_OR_ZERO(bridge); -} - -static const struct platform_device_id vexpress_syscfg_id_table[] = { - { "vexpress-syscfg", }, - {}, -}; - -static struct platform_driver vexpress_syscfg_driver = { - .driver.name = "vexpress-syscfg", - .id_table = vexpress_syscfg_id_table, - .probe = vexpress_syscfg_probe, -}; - -static int __init vexpress_syscfg_init(void) -{ - return platform_driver_register(&vexpress_syscfg_driver); -} -core_initcall(vexpress_syscfg_init); diff --git a/include/linux/vexpress.h b/include/linux/vexpress.h index 2ec7992b054c..65096c792d57 100644 --- a/include/linux/vexpress.h +++ b/include/linux/vexpress.h @@ -18,23 +18,6 @@ /* Config infrastructure */ void vexpress_config_set_master(u32 site); -u32 vexpress_config_get_master(void); - -void vexpress_config_lock(void *arg); -void vexpress_config_unlock(void *arg); - -int vexpress_config_get_topo(struct device_node *node, u32 *site, - u32 *position, u32 *dcc); - -/* Config bridge API */ - -struct vexpress_config_bridge_ops { - struct regmap * (*regmap_init)(struct device *dev, void *context); - void (*regmap_exit)(struct regmap *regmap, void *context); -}; - -struct device *vexpress_config_bridge_register(struct device *parent, - struct vexpress_config_bridge_ops *ops, void *context); /* Config regmap API */ -- cgit v1.2.3 From 310f80d61717425fbf799ef0ff0926e64cd57d9c Mon Sep 17 00:00:00 2001 From: Rob Herring Date: Wed, 29 Apr 2020 15:58:23 -0500 Subject: vexpress: Move setting master site to vexpress-config bus There's only a single caller of vexpress_config_set_master() from vexpress-sysreg.c. Let's just make the registers needed available to vexpress-config and move all the code there. The registers needed aren't used anywhere else either. With this, we can get rid of the private API between these 2 drivers. 
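For reference, the master-site detection being moved boils down to a single register read at probe time; a condensed sketch of the logic added below (the helper name is illustrative, the patch open-codes this in vexpress_syscfg_probe()):

    /* Sketch: SYS_MISC bit 14 selects which daughterboard is master */
    static void vexpress_detect_master(struct vexpress_syscfg *syscfg)
    {
            u32 misc = readl(syscfg->base + SYS_MISC);

            vexpress_config_set_master(misc & SYS_MISC_MASTERSITE ?
                                       VEXPRESS_SITE_DB2 : VEXPRESS_SITE_DB1);
    }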
Cc: Lorenzo Pieralisi Cc: Linus Walleij Cc: Greg Kroah-Hartman Acked-by: Liviu Dudau Acked-by: Sudeep Holla Acked-by: Lee Jones Signed-off-by: Rob Herring --- drivers/bus/vexpress-config.c | 37 +++++++++++++++++++++++++++++++++---- drivers/mfd/vexpress-sysreg.c | 25 +------------------------ include/linux/vexpress.h | 9 --------- 3 files changed, 34 insertions(+), 37 deletions(-) (limited to 'include/linux') diff --git a/drivers/bus/vexpress-config.c b/drivers/bus/vexpress-config.c index 43deb4df140b..caa35a4cb34d 100644 --- a/drivers/bus/vexpress-config.c +++ b/drivers/bus/vexpress-config.c @@ -14,9 +14,17 @@ #include #include -#define SYS_CFGDATA 0x0 +#define SYS_MISC 0x0 +#define SYS_MISC_MASTERSITE (1 << 14) -#define SYS_CFGCTRL 0x4 +#define SYS_PROCID0 0x24 +#define SYS_PROCID1 0x28 +#define SYS_HBI_MASK 0xfff +#define SYS_PROCIDx_HBI_SHIFT 0 + +#define SYS_CFGDATA 0x40 + +#define SYS_CFGCTRL 0x44 #define SYS_CFGCTRL_START (1 << 31) #define SYS_CFGCTRL_WRITE (1 << 30) #define SYS_CFGCTRL_DCC(n) (((n) & 0xf) << 26) @@ -25,10 +33,14 @@ #define SYS_CFGCTRL_POSITION(n) (((n) & 0xf) << 12) #define SYS_CFGCTRL_DEVICE(n) (((n) & 0xfff) << 0) -#define SYS_CFGSTAT 0x8 +#define SYS_CFGSTAT 0x48 #define SYS_CFGSTAT_ERR (1 << 1) #define SYS_CFGSTAT_COMPLETE (1 << 0) +#define VEXPRESS_SITE_MB 0 +#define VEXPRESS_SITE_DB1 1 +#define VEXPRESS_SITE_DB2 2 +#define VEXPRESS_SITE_MASTER 0xf struct vexpress_syscfg { struct device *dev; @@ -59,7 +71,7 @@ static DEFINE_MUTEX(vexpress_config_mutex); static u32 vexpress_config_site_master = VEXPRESS_SITE_MASTER; -void vexpress_config_set_master(u32 site) +static void vexpress_config_set_master(u32 site) { vexpress_config_site_master = site; } @@ -340,6 +352,8 @@ static int vexpress_syscfg_probe(struct platform_device *pdev) struct resource *res; struct vexpress_config_bridge *bridge; struct device_node *node; + int master; + u32 dt_hbi; syscfg = devm_kzalloc(&pdev->dev, sizeof(*syscfg), GFP_KERNEL); if (!syscfg) @@ -361,6 +375,21 @@ static int vexpress_syscfg_probe(struct platform_device *pdev) dev_set_drvdata(&pdev->dev, bridge); + master = readl(syscfg->base + SYS_MISC) & SYS_MISC_MASTERSITE ? + VEXPRESS_SITE_DB2 : VEXPRESS_SITE_DB1; + vexpress_config_set_master(master); + + /* Confirm board type against DT property, if available */ + if (of_property_read_u32(of_root, "arm,hbi", &dt_hbi) == 0) { + u32 id = readl(syscfg->base + (master == VEXPRESS_SITE_DB1 ? + SYS_PROCID0 : SYS_PROCID1)); + u32 hbi = (id >> SYS_PROCIDx_HBI_SHIFT) & SYS_HBI_MASK; + + if (WARN_ON(dt_hbi != hbi)) + dev_warn(&pdev->dev, "DT HBI (%x) is not matching hardware (%x)!\n", + dt_hbi, hbi); + } + for_each_compatible_node(node, NULL, "arm,vexpress,config-bus") { struct device_node *bridge_np; diff --git a/drivers/mfd/vexpress-sysreg.c b/drivers/mfd/vexpress-sysreg.c index eeeeb1d26d5d..aaf24af287dd 100644 --- a/drivers/mfd/vexpress-sysreg.c +++ b/drivers/mfd/vexpress-sysreg.c @@ -14,7 +14,6 @@ #include #include #include -#include #define SYS_ID 0x000 #define SYS_SW 0x004 @@ -37,11 +36,6 @@ #define SYS_CFGCTRL 0x0a4 #define SYS_CFGSTAT 0x0a8 -#define SYS_HBI_MASK 0xfff -#define SYS_PROCIDx_HBI_SHIFT 0 - -#define SYS_MISC_MASTERSITE (1 << 14) - /* The sysreg block is just a random collection of various functions... 
*/ static struct bgpio_pdata vexpress_sysreg_sys_led_pdata = { @@ -94,7 +88,7 @@ static struct mfd_cell vexpress_sysreg_cells[] = { .name = "vexpress-syscfg", .num_resources = 1, .resources = (struct resource []) { - DEFINE_RES_MEM(SYS_CFGDATA, 0xc), + DEFINE_RES_MEM(SYS_MISC, 0x4c), }, } }; @@ -104,8 +98,6 @@ static int vexpress_sysreg_probe(struct platform_device *pdev) struct resource *mem; void __iomem *base; struct gpio_chip *mmc_gpio_chip; - int master; - u32 dt_hbi; mem = platform_get_resource(pdev, IORESOURCE_MEM, 0); if (!mem) @@ -115,21 +107,6 @@ static int vexpress_sysreg_probe(struct platform_device *pdev) if (!base) return -ENOMEM; - master = readl(base + SYS_MISC) & SYS_MISC_MASTERSITE ? - VEXPRESS_SITE_DB2 : VEXPRESS_SITE_DB1; - vexpress_config_set_master(master); - - /* Confirm board type against DT property, if available */ - if (of_property_read_u32(of_root, "arm,hbi", &dt_hbi) == 0) { - u32 id = readl(base + (master == VEXPRESS_SITE_DB1 ? - SYS_PROCID0 : SYS_PROCID1)); - u32 hbi = (id >> SYS_PROCIDx_HBI_SHIFT) & SYS_HBI_MASK; - - if (WARN_ON(dt_hbi != hbi)) - dev_warn(&pdev->dev, "DT HBI (%x) is not matching hardware (%x)!\n", - dt_hbi, hbi); - } - /* * Duplicated SYS_MCI pseudo-GPIO controller for compatibility with * older trees using sysreg node for MMC control lines. diff --git a/include/linux/vexpress.h b/include/linux/vexpress.h index 65096c792d57..2f9dd072f11f 100644 --- a/include/linux/vexpress.h +++ b/include/linux/vexpress.h @@ -10,15 +10,6 @@ #include #include -#define VEXPRESS_SITE_MB 0 -#define VEXPRESS_SITE_DB1 1 -#define VEXPRESS_SITE_DB2 2 -#define VEXPRESS_SITE_MASTER 0xf - -/* Config infrastructure */ - -void vexpress_config_set_master(u32 site); - /* Config regmap API */ struct regmap *devm_regmap_init_vexpress_config(struct device *dev); -- cgit v1.2.3 From 21aef70eade22a656297c28d5da93301915d2ac2 Mon Sep 17 00:00:00 2001 From: Yonghong Song Date: Wed, 13 May 2020 11:02:16 -0700 Subject: bpf: Change btf_iter func proto prefix to "bpf_iter_" This is to be consistent with tracing and lsm programs which have prefix "bpf_trace_" and "bpf_lsm_" respectively. Suggested-by: Alexei Starovoitov Signed-off-by: Yonghong Song Signed-off-by: Alexei Starovoitov Acked-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20200513180216.2949387-1-yhs@fb.com --- include/linux/bpf.h | 6 +++--- tools/lib/bpf/libbpf.c | 2 +- 2 files changed, 4 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index cf4b6e44f2bc..ab94dfd8826f 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -1131,10 +1131,10 @@ struct bpf_link *bpf_link_get_from_fd(u32 ufd); int bpf_obj_pin_user(u32 ufd, const char __user *pathname); int bpf_obj_get_user(const char __user *pathname, int flags); -#define BPF_ITER_FUNC_PREFIX "__bpf_iter__" +#define BPF_ITER_FUNC_PREFIX "bpf_iter_" #define DEFINE_BPF_ITER_FUNC(target, args...) 
\ - extern int __bpf_iter__ ## target(args); \ - int __init __bpf_iter__ ## target(args) { return 0; } + extern int bpf_iter_ ## target(args); \ + int __init bpf_iter_ ## target(args) { return 0; } typedef int (*bpf_iter_init_seq_priv_t)(void *private_data); typedef void (*bpf_iter_fini_seq_priv_t)(void *private_data); diff --git a/tools/lib/bpf/libbpf.c b/tools/lib/bpf/libbpf.c index fd882616ab52..292257995487 100644 --- a/tools/lib/bpf/libbpf.c +++ b/tools/lib/bpf/libbpf.c @@ -6919,7 +6919,7 @@ invalid_prog: #define BTF_TRACE_PREFIX "btf_trace_" #define BTF_LSM_PREFIX "bpf_lsm_" -#define BTF_ITER_PREFIX "__bpf_iter__" +#define BTF_ITER_PREFIX "bpf_iter_" #define BTF_MAX_NAME_SIZE 128 static int find_btf_by_prefix_kind(const struct btf *btf, const char *prefix, -- cgit v1.2.3 From 15172a46fa2796c1a1358a36babd31274716ed41 Mon Sep 17 00:00:00 2001 From: Yonghong Song Date: Wed, 13 May 2020 11:02:19 -0700 Subject: bpf: net: Refactor bpf_iter target registration Currently bpf_iter_reg_target takes parameters from target and allocates memory to save them. This is really not necessary, esp. in the future we may grow information passed from targets to bpf_iter manager. The patch refactors the code so target reg_info becomes static and bpf_iter manager can just take a reference to it. Signed-off-by: Yonghong Song Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/20200513180219.2949605-1-yhs@fb.com --- include/linux/bpf.h | 2 +- kernel/bpf/bpf_iter.c | 36 +++++++++++++++++------------------- kernel/bpf/map_iter.c | 18 +++++++++--------- kernel/bpf/task_iter.c | 30 ++++++++++++++++-------------- net/ipv6/route.c | 18 +++++++++--------- net/netlink/af_netlink.c | 18 +++++++++--------- 6 files changed, 61 insertions(+), 61 deletions(-) (limited to 'include/linux') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index ab94dfd8826f..6fa773e2d1bf 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -1153,7 +1153,7 @@ struct bpf_iter_meta { u64 seq_num; }; -int bpf_iter_reg_target(struct bpf_iter_reg *reg_info); +int bpf_iter_reg_target(const struct bpf_iter_reg *reg_info); void bpf_iter_unreg_target(const char *target); bool bpf_iter_prog_supported(struct bpf_prog *prog); int bpf_iter_link_attach(const union bpf_attr *attr, struct bpf_prog *prog); diff --git a/kernel/bpf/bpf_iter.c b/kernel/bpf/bpf_iter.c index 0a45a6cdfabd..051fb8cab62a 100644 --- a/kernel/bpf/bpf_iter.c +++ b/kernel/bpf/bpf_iter.c @@ -8,11 +8,7 @@ struct bpf_iter_target_info { struct list_head list; - const char *target; - const struct seq_operations *seq_ops; - bpf_iter_init_seq_priv_t init_seq_private; - bpf_iter_fini_seq_priv_t fini_seq_private; - u32 seq_priv_size; + const struct bpf_iter_reg *reg_info; u32 btf_id; /* cached value */ }; @@ -222,8 +218,8 @@ static int iter_release(struct inode *inode, struct file *file) iter_priv = container_of(seq->private, struct bpf_iter_priv_data, target_private); - if (iter_priv->tinfo->fini_seq_private) - iter_priv->tinfo->fini_seq_private(seq->private); + if (iter_priv->tinfo->reg_info->fini_seq_private) + iter_priv->tinfo->reg_info->fini_seq_private(seq->private); bpf_prog_put(iter_priv->prog); seq->private = iter_priv; @@ -238,7 +234,12 @@ const struct file_operations bpf_iter_fops = { .release = iter_release, }; -int bpf_iter_reg_target(struct bpf_iter_reg *reg_info) +/* The argument reg_info will be cached in bpf_iter_target_info. 
+ * The common practice is to declare target reg_info as + * a const static variable and passed as an argument to + * bpf_iter_reg_target(). + */ +int bpf_iter_reg_target(const struct bpf_iter_reg *reg_info) { struct bpf_iter_target_info *tinfo; @@ -246,11 +247,7 @@ int bpf_iter_reg_target(struct bpf_iter_reg *reg_info) if (!tinfo) return -ENOMEM; - tinfo->target = reg_info->target; - tinfo->seq_ops = reg_info->seq_ops; - tinfo->init_seq_private = reg_info->init_seq_private; - tinfo->fini_seq_private = reg_info->fini_seq_private; - tinfo->seq_priv_size = reg_info->seq_priv_size; + tinfo->reg_info = reg_info; INIT_LIST_HEAD(&tinfo->list); mutex_lock(&targets_mutex); @@ -267,7 +264,7 @@ void bpf_iter_unreg_target(const char *target) mutex_lock(&targets_mutex); list_for_each_entry(tinfo, &targets, list) { - if (!strcmp(target, tinfo->target)) { + if (!strcmp(target, tinfo->reg_info->target)) { list_del(&tinfo->list); kfree(tinfo); found = true; @@ -303,7 +300,7 @@ bool bpf_iter_prog_supported(struct bpf_prog *prog) supported = true; break; } - if (!strcmp(attach_fname + prefix_len, tinfo->target)) { + if (!strcmp(attach_fname + prefix_len, tinfo->reg_info->target)) { cache_btf_id(tinfo, prog); supported = true; break; @@ -431,15 +428,16 @@ static int prepare_seq_file(struct file *file, struct bpf_iter_link *link) tinfo = link->tinfo; total_priv_dsize = offsetof(struct bpf_iter_priv_data, target_private) + - tinfo->seq_priv_size; - priv_data = __seq_open_private(file, tinfo->seq_ops, total_priv_dsize); + tinfo->reg_info->seq_priv_size; + priv_data = __seq_open_private(file, tinfo->reg_info->seq_ops, + total_priv_dsize); if (!priv_data) { err = -ENOMEM; goto release_prog; } - if (tinfo->init_seq_private) { - err = tinfo->init_seq_private(priv_data->target_private); + if (tinfo->reg_info->init_seq_private) { + err = tinfo->reg_info->init_seq_private(priv_data->target_private); if (err) goto release_seq_file; } diff --git a/kernel/bpf/map_iter.c b/kernel/bpf/map_iter.c index 8162e0c00b9f..c6216a5fe56e 100644 --- a/kernel/bpf/map_iter.c +++ b/kernel/bpf/map_iter.c @@ -81,17 +81,17 @@ static const struct seq_operations bpf_map_seq_ops = { .show = bpf_map_seq_show, }; +static const struct bpf_iter_reg bpf_map_reg_info = { + .target = "bpf_map", + .seq_ops = &bpf_map_seq_ops, + .init_seq_private = NULL, + .fini_seq_private = NULL, + .seq_priv_size = sizeof(struct bpf_iter_seq_map_info), +}; + static int __init bpf_map_iter_init(void) { - struct bpf_iter_reg reg_info = { - .target = "bpf_map", - .seq_ops = &bpf_map_seq_ops, - .init_seq_private = NULL, - .fini_seq_private = NULL, - .seq_priv_size = sizeof(struct bpf_iter_seq_map_info), - }; - - return bpf_iter_reg_target(®_info); + return bpf_iter_reg_target(&bpf_map_reg_info); } late_initcall(bpf_map_iter_init); diff --git a/kernel/bpf/task_iter.c b/kernel/bpf/task_iter.c index aeed662d8451..bd7bfd83d9e0 100644 --- a/kernel/bpf/task_iter.c +++ b/kernel/bpf/task_iter.c @@ -306,22 +306,24 @@ static const struct seq_operations task_file_seq_ops = { .show = task_file_seq_show, }; +static const struct bpf_iter_reg task_reg_info = { + .target = "task", + .seq_ops = &task_seq_ops, + .init_seq_private = init_seq_pidns, + .fini_seq_private = fini_seq_pidns, + .seq_priv_size = sizeof(struct bpf_iter_seq_task_info), +}; + +static const struct bpf_iter_reg task_file_reg_info = { + .target = "task_file", + .seq_ops = &task_file_seq_ops, + .init_seq_private = init_seq_pidns, + .fini_seq_private = fini_seq_pidns, + .seq_priv_size = sizeof(struct 
bpf_iter_seq_task_file_info), +}; + static int __init task_iter_init(void) { - struct bpf_iter_reg task_file_reg_info = { - .target = "task_file", - .seq_ops = &task_file_seq_ops, - .init_seq_private = init_seq_pidns, - .fini_seq_private = fini_seq_pidns, - .seq_priv_size = sizeof(struct bpf_iter_seq_task_file_info), - }; - struct bpf_iter_reg task_reg_info = { - .target = "task", - .seq_ops = &task_seq_ops, - .init_seq_private = init_seq_pidns, - .fini_seq_private = fini_seq_pidns, - .seq_priv_size = sizeof(struct bpf_iter_seq_task_info), - }; int ret; ret = bpf_iter_reg_target(&task_reg_info); diff --git a/net/ipv6/route.c b/net/ipv6/route.c index 25f6d3e619d0..6ad2fa51a23a 100644 --- a/net/ipv6/route.c +++ b/net/ipv6/route.c @@ -6397,17 +6397,17 @@ void __init ip6_route_init_special_entries(void) #if defined(CONFIG_BPF_SYSCALL) && defined(CONFIG_PROC_FS) DEFINE_BPF_ITER_FUNC(ipv6_route, struct bpf_iter_meta *meta, struct fib6_info *rt) +static const struct bpf_iter_reg ipv6_route_reg_info = { + .target = "ipv6_route", + .seq_ops = &ipv6_route_seq_ops, + .init_seq_private = bpf_iter_init_seq_net, + .fini_seq_private = bpf_iter_fini_seq_net, + .seq_priv_size = sizeof(struct ipv6_route_iter), +}; + static int __init bpf_iter_register(void) { - struct bpf_iter_reg reg_info = { - .target = "ipv6_route", - .seq_ops = &ipv6_route_seq_ops, - .init_seq_private = bpf_iter_init_seq_net, - .fini_seq_private = bpf_iter_fini_seq_net, - .seq_priv_size = sizeof(struct ipv6_route_iter), - }; - - return bpf_iter_reg_target(®_info); + return bpf_iter_reg_target(&ipv6_route_reg_info); } static void bpf_iter_unregister(void) diff --git a/net/netlink/af_netlink.c b/net/netlink/af_netlink.c index 33cda9baa979..839827227e98 100644 --- a/net/netlink/af_netlink.c +++ b/net/netlink/af_netlink.c @@ -2803,17 +2803,17 @@ static const struct rhashtable_params netlink_rhashtable_params = { }; #if defined(CONFIG_BPF_SYSCALL) && defined(CONFIG_PROC_FS) +static const struct bpf_iter_reg netlink_reg_info = { + .target = "netlink", + .seq_ops = &netlink_seq_ops, + .init_seq_private = bpf_iter_init_seq_net, + .fini_seq_private = bpf_iter_fini_seq_net, + .seq_priv_size = sizeof(struct nl_seq_iter), +}; + static int __init bpf_iter_register(void) { - struct bpf_iter_reg reg_info = { - .target = "netlink", - .seq_ops = &netlink_seq_ops, - .init_seq_private = bpf_iter_init_seq_net, - .fini_seq_private = bpf_iter_fini_seq_net, - .seq_priv_size = sizeof(struct nl_seq_iter), - }; - - return bpf_iter_reg_target(®_info); + return bpf_iter_reg_target(&netlink_reg_info); } #endif -- cgit v1.2.3 From ab2ee4fcb9d61fd57db70db694adbcf54662bd80 Mon Sep 17 00:00:00 2001 From: Yonghong Song Date: Wed, 13 May 2020 11:02:20 -0700 Subject: bpf: Change func bpf_iter_unreg_target() signature Change func bpf_iter_unreg_target() parameter from target name to target reg_info, similar to bpf_iter_reg_target(). 
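With the new signature a target keeps one static reg_info and hands the same pointer to both calls; a sketch using a hypothetical "foo" target (its seq_ops and private struct are assumed to be defined elsewhere):

    static const struct bpf_iter_reg foo_reg_info = {
            .target = "foo",
            .seq_ops = &foo_seq_ops,
            .seq_priv_size = sizeof(struct foo_iter_priv),
    };

    static int __init foo_iter_init(void)
    {
            return bpf_iter_reg_target(&foo_reg_info);
    }

    static void foo_iter_unregister(void)
    {
            /* matched by pointer identity now, not by target name */
            bpf_iter_unreg_target(&foo_reg_info);
    }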
Signed-off-by: Yonghong Song Signed-off-by: Alexei Starovoitov Acked-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20200513180220.2949737-1-yhs@fb.com --- include/linux/bpf.h | 2 +- kernel/bpf/bpf_iter.c | 4 ++-- net/ipv6/route.c | 2 +- 3 files changed, 4 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 6fa773e2d1bf..534174eca86b 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -1154,7 +1154,7 @@ struct bpf_iter_meta { }; int bpf_iter_reg_target(const struct bpf_iter_reg *reg_info); -void bpf_iter_unreg_target(const char *target); +void bpf_iter_unreg_target(const struct bpf_iter_reg *reg_info); bool bpf_iter_prog_supported(struct bpf_prog *prog); int bpf_iter_link_attach(const union bpf_attr *attr, struct bpf_prog *prog); int bpf_iter_new_fd(struct bpf_link *link); diff --git a/kernel/bpf/bpf_iter.c b/kernel/bpf/bpf_iter.c index 051fb8cab62a..644f8626b2c0 100644 --- a/kernel/bpf/bpf_iter.c +++ b/kernel/bpf/bpf_iter.c @@ -257,14 +257,14 @@ int bpf_iter_reg_target(const struct bpf_iter_reg *reg_info) return 0; } -void bpf_iter_unreg_target(const char *target) +void bpf_iter_unreg_target(const struct bpf_iter_reg *reg_info) { struct bpf_iter_target_info *tinfo; bool found = false; mutex_lock(&targets_mutex); list_for_each_entry(tinfo, &targets, list) { - if (!strcmp(target, tinfo->reg_info->target)) { + if (reg_info == tinfo->reg_info) { list_del(&tinfo->list); kfree(tinfo); found = true; diff --git a/net/ipv6/route.c b/net/ipv6/route.c index 6ad2fa51a23a..22bf4e36c093 100644 --- a/net/ipv6/route.c +++ b/net/ipv6/route.c @@ -6412,7 +6412,7 @@ static int __init bpf_iter_register(void) static void bpf_iter_unregister(void) { - bpf_iter_unreg_target("ipv6_route"); + bpf_iter_unreg_target(&ipv6_route_reg_info); } #endif #endif -- cgit v1.2.3 From 3c32cc1bceba8a1755dc35cd97516f6c67856844 Mon Sep 17 00:00:00 2001 From: Yonghong Song Date: Wed, 13 May 2020 11:02:21 -0700 Subject: bpf: Enable bpf_iter targets registering ctx argument types Commit b121b341e598 ("bpf: Add PTR_TO_BTF_ID_OR_NULL support") adds a field btf_id_or_null_non0_off to bpf_prog->aux structure to indicate that the first ctx argument is PTR_TO_BTF_ID reg_type and all others are PTR_TO_BTF_ID_OR_NULL. This approach does not really scale if we have other different reg types in the future, e.g., a pointer to a buffer. This patch enables bpf_iter targets registering ctx argument reg types which may be different from the default one. For example, for pointers to structures, the default reg_type is PTR_TO_BTF_ID for tracing program. The target can register a particular pointer type as PTR_TO_BTF_ID_OR_NULL which can be used by the verifier to enforce accesses. 
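A sketch of such a registration, again with a hypothetical "foo" target whose ctx struct bpf_iter__foo carries one object pointer:

    static const struct bpf_iter_reg foo_reg_info = {
            .target = "foo",
            .seq_ops = &foo_seq_ops,
            .seq_priv_size = sizeof(struct foo_iter_priv),
            .ctx_arg_info_size = 1,
            .ctx_arg_info = {
                    /* verifier will force a NULL check on this ctx arg */
                    { offsetof(struct bpf_iter__foo, obj),
                      PTR_TO_BTF_ID_OR_NULL },
            },
    };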
Signed-off-by: Yonghong Song Signed-off-by: Alexei Starovoitov Acked-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20200513180221.2949882-1-yhs@fb.com --- include/linux/bpf.h | 12 +++++++++++- include/net/ip6_fib.h | 7 +++++++ kernel/bpf/bpf_iter.c | 5 +++++ kernel/bpf/btf.c | 15 ++++++++++----- kernel/bpf/map_iter.c | 5 +++++ kernel/bpf/task_iter.c | 12 ++++++++++++ kernel/bpf/verifier.c | 1 - net/ipv6/ip6_fib.c | 5 ----- net/ipv6/route.c | 5 +++++ net/netlink/af_netlink.c | 5 +++++ 10 files changed, 60 insertions(+), 12 deletions(-) (limited to 'include/linux') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 534174eca86b..c45d198ac38c 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -643,6 +643,12 @@ struct bpf_jit_poke_descriptor { u16 reason; }; +/* reg_type info for ctx arguments */ +struct bpf_ctx_arg_aux { + u32 offset; + enum bpf_reg_type reg_type; +}; + struct bpf_prog_aux { atomic64_t refcnt; u32 used_map_cnt; @@ -654,12 +660,13 @@ struct bpf_prog_aux { u32 func_cnt; /* used by non-func prog as the number of func progs */ u32 func_idx; /* 0 for non-func prog, the index in func array for func prog */ u32 attach_btf_id; /* in-kernel BTF type id to attach to */ + u32 ctx_arg_info_size; + const struct bpf_ctx_arg_aux *ctx_arg_info; struct bpf_prog *linked_prog; bool verifier_zext; /* Zero extensions has been inserted by verifier. */ bool offload_requested; bool attach_btf_trace; /* true if attaching to BTF-enabled raw tp */ bool func_proto_unreliable; - bool btf_id_or_null_non0_off; enum bpf_tramp_prog_type trampoline_prog_type; struct bpf_trampoline *trampoline; struct hlist_node tramp_hlist; @@ -1139,12 +1146,15 @@ int bpf_obj_get_user(const char __user *pathname, int flags); typedef int (*bpf_iter_init_seq_priv_t)(void *private_data); typedef void (*bpf_iter_fini_seq_priv_t)(void *private_data); +#define BPF_ITER_CTX_ARG_MAX 2 struct bpf_iter_reg { const char *target; const struct seq_operations *seq_ops; bpf_iter_init_seq_priv_t init_seq_private; bpf_iter_fini_seq_priv_t fini_seq_private; u32 seq_priv_size; + u32 ctx_arg_info_size; + struct bpf_ctx_arg_aux ctx_arg_info[BPF_ITER_CTX_ARG_MAX]; }; struct bpf_iter_meta { diff --git a/include/net/ip6_fib.h b/include/net/ip6_fib.h index 80262d2980f5..870b646c5797 100644 --- a/include/net/ip6_fib.h +++ b/include/net/ip6_fib.h @@ -540,6 +540,13 @@ static inline bool fib6_metric_locked(struct fib6_info *f6i, int metric) return !!(f6i->fib6_metrics->metrics[RTAX_LOCK - 1] & (1 << metric)); } +#if IS_BUILTIN(CONFIG_IPV6) && defined(CONFIG_BPF_SYSCALL) +struct bpf_iter__ipv6_route { + __bpf_md_ptr(struct bpf_iter_meta *, meta); + __bpf_md_ptr(struct fib6_info *, rt); +}; +#endif + #ifdef CONFIG_IPV6_MULTIPLE_TABLES static inline bool fib6_has_custom_rules(const struct net *net) { diff --git a/kernel/bpf/bpf_iter.c b/kernel/bpf/bpf_iter.c index 644f8626b2c0..dd612b80b9fe 100644 --- a/kernel/bpf/bpf_iter.c +++ b/kernel/bpf/bpf_iter.c @@ -308,6 +308,11 @@ bool bpf_iter_prog_supported(struct bpf_prog *prog) } mutex_unlock(&targets_mutex); + if (supported) { + prog->aux->ctx_arg_info_size = tinfo->reg_info->ctx_arg_info_size; + prog->aux->ctx_arg_info = tinfo->reg_info->ctx_arg_info; + } + return supported; } diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c index dcd233139294..58c9af1d4808 100644 --- a/kernel/bpf/btf.c +++ b/kernel/bpf/btf.c @@ -3694,7 +3694,7 @@ bool btf_ctx_access(int off, int size, enum bpf_access_type type, struct bpf_verifier_log *log = info->log; const struct btf_param *args; u32 nr_args, 
arg; - int ret; + int i, ret; if (off % 8) { bpf_log(log, "func '%s' offset %d is not multiple of 8\n", @@ -3790,10 +3790,15 @@ bool btf_ctx_access(int off, int size, enum bpf_access_type type, return true; /* this is a pointer to another type */ - if (off != 0 && prog->aux->btf_id_or_null_non0_off) - info->reg_type = PTR_TO_BTF_ID_OR_NULL; - else - info->reg_type = PTR_TO_BTF_ID; + info->reg_type = PTR_TO_BTF_ID; + for (i = 0; i < prog->aux->ctx_arg_info_size; i++) { + const struct bpf_ctx_arg_aux *ctx_arg_info = &prog->aux->ctx_arg_info[i]; + + if (ctx_arg_info->offset == off) { + info->reg_type = ctx_arg_info->reg_type; + break; + } + } if (tgt_prog) { ret = btf_translate_to_vmlinux(log, btf, t, tgt_prog->type, arg); diff --git a/kernel/bpf/map_iter.c b/kernel/bpf/map_iter.c index c6216a5fe56e..c69071e334bf 100644 --- a/kernel/bpf/map_iter.c +++ b/kernel/bpf/map_iter.c @@ -87,6 +87,11 @@ static const struct bpf_iter_reg bpf_map_reg_info = { .init_seq_private = NULL, .fini_seq_private = NULL, .seq_priv_size = sizeof(struct bpf_iter_seq_map_info), + .ctx_arg_info_size = 1, + .ctx_arg_info = { + { offsetof(struct bpf_iter__bpf_map, map), + PTR_TO_BTF_ID_OR_NULL }, + }, }; static int __init bpf_map_iter_init(void) diff --git a/kernel/bpf/task_iter.c b/kernel/bpf/task_iter.c index bd7bfd83d9e0..a9b7264dda08 100644 --- a/kernel/bpf/task_iter.c +++ b/kernel/bpf/task_iter.c @@ -312,6 +312,11 @@ static const struct bpf_iter_reg task_reg_info = { .init_seq_private = init_seq_pidns, .fini_seq_private = fini_seq_pidns, .seq_priv_size = sizeof(struct bpf_iter_seq_task_info), + .ctx_arg_info_size = 1, + .ctx_arg_info = { + { offsetof(struct bpf_iter__task, task), + PTR_TO_BTF_ID_OR_NULL }, + }, }; static const struct bpf_iter_reg task_file_reg_info = { @@ -320,6 +325,13 @@ static const struct bpf_iter_reg task_file_reg_info = { .init_seq_private = init_seq_pidns, .fini_seq_private = fini_seq_pidns, .seq_priv_size = sizeof(struct bpf_iter_seq_task_file_info), + .ctx_arg_info_size = 2, + .ctx_arg_info = { + { offsetof(struct bpf_iter__task_file, task), + PTR_TO_BTF_ID_OR_NULL }, + { offsetof(struct bpf_iter__task_file, file), + PTR_TO_BTF_ID_OR_NULL }, + }, }; static int __init task_iter_init(void) diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 2a1826c76bb6..a3f2af756fd6 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -10652,7 +10652,6 @@ static int check_attach_btf_id(struct bpf_verifier_env *env) prog->aux->attach_func_proto = t; if (!bpf_iter_prog_supported(prog)) return -EINVAL; - prog->aux->btf_id_or_null_non0_off = true; ret = btf_distill_func_proto(&env->log, btf, t, tname, &fmodel); return ret; diff --git a/net/ipv6/ip6_fib.c b/net/ipv6/ip6_fib.c index a1fcc0ca21af..250ff52c674e 100644 --- a/net/ipv6/ip6_fib.c +++ b/net/ipv6/ip6_fib.c @@ -2638,11 +2638,6 @@ static void ipv6_route_native_seq_stop(struct seq_file *seq, void *v) } #if IS_BUILTIN(CONFIG_IPV6) && defined(CONFIG_BPF_SYSCALL) -struct bpf_iter__ipv6_route { - __bpf_md_ptr(struct bpf_iter_meta *, meta); - __bpf_md_ptr(struct fib6_info *, rt); -}; - static int ipv6_route_prog_seq_show(struct bpf_prog *prog, struct bpf_iter_meta *meta, void *v) diff --git a/net/ipv6/route.c b/net/ipv6/route.c index 22bf4e36c093..22e56465f14d 100644 --- a/net/ipv6/route.c +++ b/net/ipv6/route.c @@ -6403,6 +6403,11 @@ static const struct bpf_iter_reg ipv6_route_reg_info = { .init_seq_private = bpf_iter_init_seq_net, .fini_seq_private = bpf_iter_fini_seq_net, .seq_priv_size = sizeof(struct ipv6_route_iter), + 
.ctx_arg_info_size = 1, + .ctx_arg_info = { + { offsetof(struct bpf_iter__ipv6_route, rt), + PTR_TO_BTF_ID_OR_NULL }, + }, }; static int __init bpf_iter_register(void) diff --git a/net/netlink/af_netlink.c b/net/netlink/af_netlink.c index 839827227e98..4f2c3b14ddbf 100644 --- a/net/netlink/af_netlink.c +++ b/net/netlink/af_netlink.c @@ -2809,6 +2809,11 @@ static const struct bpf_iter_reg netlink_reg_info = { .init_seq_private = bpf_iter_init_seq_net, .fini_seq_private = bpf_iter_fini_seq_net, .seq_priv_size = sizeof(struct nl_seq_iter), + .ctx_arg_info_size = 1, + .ctx_arg_info = { + { offsetof(struct bpf_iter__netlink, sk), + PTR_TO_BTF_ID_OR_NULL }, + }, }; static int __init bpf_iter_register(void) -- cgit v1.2.3 From 11ecf8c55b91806e4dc6a1b9fe7cbf68cdc9b006 Mon Sep 17 00:00:00 2001 From: Michael Walle Date: Wed, 13 May 2020 18:35:23 +0200 Subject: net: phy: broadcom: add cable test support Most modern broadcom PHYs support ECD (enhanced cable diagnostics). Add support for it in the bcm-phy-lib so they can easily be used in the PHY driver. There are two access methods for ECD: legacy by expansion registers and via the new RDB registers which are exclusive. Provide functions in two variants where the PHY driver can choose from. To keep things simple for now, we just switch the register access to expansion registers in the RDB variant for now. On the flipside, we have to keep a bus lock to prevent any other non-legacy access on the PHY. The results of the intra-pair tests are inconclusive (at least for the BCM54140). Most of the times half the length is reported but sometimes the length is correct. Signed-off-by: Michael Walle Reviewed-by: Florian Fainelli Reviewed-by: Andrew Lunn Signed-off-by: David S. Miller --- drivers/net/phy/bcm-phy-lib.c | 189 ++++++++++++++++++++++++++++++++++++++++++ drivers/net/phy/bcm-phy-lib.h | 6 ++ include/linux/brcmphy.h | 52 ++++++++++++ 3 files changed, 247 insertions(+) (limited to 'include/linux') diff --git a/drivers/net/phy/bcm-phy-lib.c b/drivers/net/phy/bcm-phy-lib.c index 41c728fbcfb2..cb92786e3ded 100644 --- a/drivers/net/phy/bcm-phy-lib.c +++ b/drivers/net/phy/bcm-phy-lib.c @@ -4,12 +4,14 @@ */ #include "bcm-phy-lib.h" +#include #include #include #include #include #include #include +#include #define MII_BCM_CHANNEL_WIDTH 0x2000 #define BCM_CL45VEN_EEE_ADV 0x3c @@ -581,6 +583,193 @@ int bcm_phy_enable_jumbo(struct phy_device *phydev) } EXPORT_SYMBOL_GPL(bcm_phy_enable_jumbo); +int __bcm_phy_enable_rdb_access(struct phy_device *phydev) +{ + return __bcm_phy_write_exp(phydev, BCM54XX_EXP_REG7E, 0); +} +EXPORT_SYMBOL_GPL(__bcm_phy_enable_rdb_access); + +int __bcm_phy_enable_legacy_access(struct phy_device *phydev) +{ + return __bcm_phy_write_rdb(phydev, BCM54XX_RDB_REG0087, + BCM54XX_ACCESS_MODE_LEGACY_EN); +} +EXPORT_SYMBOL_GPL(__bcm_phy_enable_legacy_access); + +static int _bcm_phy_cable_test_start(struct phy_device *phydev, bool is_rdb) +{ + u16 mask, set; + int ret; + + /* Auto-negotiation must be enabled for cable diagnostics to work, but + * don't advertise any capabilities. 
+ */ + phy_write(phydev, MII_BMCR, BMCR_ANENABLE); + phy_write(phydev, MII_ADVERTISE, ADVERTISE_CSMA); + phy_write(phydev, MII_CTRL1000, 0); + + phy_lock_mdio_bus(phydev); + if (is_rdb) { + ret = __bcm_phy_enable_legacy_access(phydev); + if (ret) + goto out; + } + + mask = BCM54XX_ECD_CTRL_CROSS_SHORT_DIS | BCM54XX_ECD_CTRL_UNIT_MASK; + set = BCM54XX_ECD_CTRL_RUN | BCM54XX_ECD_CTRL_BREAK_LINK | + FIELD_PREP(BCM54XX_ECD_CTRL_UNIT_MASK, + BCM54XX_ECD_CTRL_UNIT_CM); + + ret = __bcm_phy_modify_exp(phydev, BCM54XX_EXP_ECD_CTRL, mask, set); + +out: + /* re-enable the RDB access even if there was an error */ + if (is_rdb) + ret = __bcm_phy_enable_rdb_access(phydev) ? : ret; + + phy_unlock_mdio_bus(phydev); + + return ret; +} + +static int bcm_phy_cable_test_report_trans(int result) +{ + switch (result) { + case BCM54XX_ECD_FAULT_TYPE_OK: + return ETHTOOL_A_CABLE_RESULT_CODE_OK; + case BCM54XX_ECD_FAULT_TYPE_OPEN: + return ETHTOOL_A_CABLE_RESULT_CODE_OPEN; + case BCM54XX_ECD_FAULT_TYPE_SAME_SHORT: + return ETHTOOL_A_CABLE_RESULT_CODE_SAME_SHORT; + case BCM54XX_ECD_FAULT_TYPE_CROSS_SHORT: + return ETHTOOL_A_CABLE_RESULT_CODE_CROSS_SHORT; + case BCM54XX_ECD_FAULT_TYPE_INVALID: + case BCM54XX_ECD_FAULT_TYPE_BUSY: + default: + return ETHTOOL_A_CABLE_RESULT_CODE_UNSPEC; + } +} + +static bool bcm_phy_distance_valid(int result) +{ + switch (result) { + case BCM54XX_ECD_FAULT_TYPE_OPEN: + case BCM54XX_ECD_FAULT_TYPE_SAME_SHORT: + case BCM54XX_ECD_FAULT_TYPE_CROSS_SHORT: + return true; + } + return false; +} + +static int bcm_phy_report_length(struct phy_device *phydev, int pair) +{ + int val; + + val = __bcm_phy_read_exp(phydev, + BCM54XX_EXP_ECD_PAIR_A_LENGTH_RESULTS + pair); + if (val < 0) + return val; + + if (val == BCM54XX_ECD_LENGTH_RESULTS_INVALID) + return 0; + + ethnl_cable_test_fault_length(phydev, pair, val); + + return 0; +} + +static int _bcm_phy_cable_test_get_status(struct phy_device *phydev, + bool *finished, bool is_rdb) +{ + int pair_a, pair_b, pair_c, pair_d, ret; + + *finished = false; + + phy_lock_mdio_bus(phydev); + + if (is_rdb) { + ret = __bcm_phy_enable_legacy_access(phydev); + if (ret) + goto out; + } + + ret = __bcm_phy_read_exp(phydev, BCM54XX_EXP_ECD_CTRL); + if (ret < 0) + goto out; + + if (ret & BCM54XX_ECD_CTRL_IN_PROGRESS) { + ret = 0; + goto out; + } + + ret = __bcm_phy_read_exp(phydev, BCM54XX_EXP_ECD_FAULT_TYPE); + if (ret < 0) + goto out; + + pair_a = FIELD_GET(BCM54XX_ECD_FAULT_TYPE_PAIR_A_MASK, ret); + pair_b = FIELD_GET(BCM54XX_ECD_FAULT_TYPE_PAIR_B_MASK, ret); + pair_c = FIELD_GET(BCM54XX_ECD_FAULT_TYPE_PAIR_C_MASK, ret); + pair_d = FIELD_GET(BCM54XX_ECD_FAULT_TYPE_PAIR_D_MASK, ret); + + ethnl_cable_test_result(phydev, ETHTOOL_A_CABLE_PAIR_A, + bcm_phy_cable_test_report_trans(pair_a)); + ethnl_cable_test_result(phydev, ETHTOOL_A_CABLE_PAIR_B, + bcm_phy_cable_test_report_trans(pair_b)); + ethnl_cable_test_result(phydev, ETHTOOL_A_CABLE_PAIR_C, + bcm_phy_cable_test_report_trans(pair_c)); + ethnl_cable_test_result(phydev, ETHTOOL_A_CABLE_PAIR_D, + bcm_phy_cable_test_report_trans(pair_d)); + + if (bcm_phy_distance_valid(pair_a)) + bcm_phy_report_length(phydev, 0); + if (bcm_phy_distance_valid(pair_b)) + bcm_phy_report_length(phydev, 1); + if (bcm_phy_distance_valid(pair_c)) + bcm_phy_report_length(phydev, 2); + if (bcm_phy_distance_valid(pair_d)) + bcm_phy_report_length(phydev, 3); + + ret = 0; + *finished = true; +out: + /* re-enable the RDB access even if there was an error */ + if (is_rdb) + ret = __bcm_phy_enable_rdb_access(phydev) ? 
: ret; + + phy_unlock_mdio_bus(phydev); + + return ret; +} + +int bcm_phy_cable_test_start(struct phy_device *phydev) +{ + return _bcm_phy_cable_test_start(phydev, false); +} +EXPORT_SYMBOL_GPL(bcm_phy_cable_test_start); + +int bcm_phy_cable_test_get_status(struct phy_device *phydev, bool *finished) +{ + return _bcm_phy_cable_test_get_status(phydev, finished, false); +} +EXPORT_SYMBOL_GPL(bcm_phy_cable_test_get_status); + +/* We assume that all PHYs which support RDB access can be switched to legacy + * mode. If, in the future, this is not true anymore, we have to re-implement + * this with RDB access. + */ +int bcm_phy_cable_test_start_rdb(struct phy_device *phydev) +{ + return _bcm_phy_cable_test_start(phydev, true); +} +EXPORT_SYMBOL_GPL(bcm_phy_cable_test_start_rdb); + +int bcm_phy_cable_test_get_status_rdb(struct phy_device *phydev, + bool *finished) +{ + return _bcm_phy_cable_test_get_status(phydev, finished, true); +} +EXPORT_SYMBOL_GPL(bcm_phy_cable_test_get_status_rdb); + MODULE_DESCRIPTION("Broadcom PHY Library"); MODULE_LICENSE("GPL v2"); MODULE_AUTHOR("Broadcom Corporation"); diff --git a/drivers/net/phy/bcm-phy-lib.h b/drivers/net/phy/bcm-phy-lib.h index b35d880220b9..237a8503c9b4 100644 --- a/drivers/net/phy/bcm-phy-lib.h +++ b/drivers/net/phy/bcm-phy-lib.h @@ -80,4 +80,10 @@ void bcm_phy_r_rc_cal_reset(struct phy_device *phydev); int bcm_phy_28nm_a0b0_afe_config_init(struct phy_device *phydev); int bcm_phy_enable_jumbo(struct phy_device *phydev); +int bcm_phy_cable_test_get_status_rdb(struct phy_device *phydev, + bool *finished); +int bcm_phy_cable_test_start_rdb(struct phy_device *phydev); +int bcm_phy_cable_test_start(struct phy_device *phydev); +int bcm_phy_cable_test_get_status(struct phy_device *phydev, bool *finished); + #endif /* _LINUX_BCM_PHY_LIB_H */ diff --git a/include/linux/brcmphy.h b/include/linux/brcmphy.h index 58d0150acc3e..d41624db6de2 100644 --- a/include/linux/brcmphy.h +++ b/include/linux/brcmphy.h @@ -119,6 +119,11 @@ #define MII_BCM54XX_RDB_ADDR 0x1e #define MII_BCM54XX_RDB_DATA 0x1f +/* legacy access control via rdb/expansion register */ +#define BCM54XX_RDB_REG0087 0x0087 +#define BCM54XX_EXP_REG7E (MII_BCM54XX_EXP_SEL_ER + 0x7E) +#define BCM54XX_ACCESS_MODE_LEGACY_EN BIT(15) + /* * AUXILIARY CONTROL SHADOW ACCESS REGISTERS. 
(PHY REG 0x18) */ @@ -294,4 +299,51 @@ #define MII_BRCM_CORE_EXPB0 0xB0 #define MII_BRCM_CORE_EXPB1 0xB1 +/* Enhanced Cable Diagnostics */ +#define BCM54XX_RDB_ECD_CTRL 0x2a0 +#define BCM54XX_EXP_ECD_CTRL (MII_BCM54XX_EXP_SEL_ER + 0xc0) + +#define BCM54XX_ECD_CTRL_CABLE_TYPE_CAT3 1 /* CAT3 or worse */ +#define BCM54XX_ECD_CTRL_CABLE_TYPE_CAT5 0 /* CAT5 or better */ +#define BCM54XX_ECD_CTRL_CABLE_TYPE_MASK BIT(0) /* cable type */ +#define BCM54XX_ECD_CTRL_INVALID BIT(3) /* invalid result */ +#define BCM54XX_ECD_CTRL_UNIT_CM 0 /* centimeters */ +#define BCM54XX_ECD_CTRL_UNIT_M 1 /* meters */ +#define BCM54XX_ECD_CTRL_UNIT_MASK BIT(10) /* cable length unit */ +#define BCM54XX_ECD_CTRL_IN_PROGRESS BIT(11) /* test in progress */ +#define BCM54XX_ECD_CTRL_BREAK_LINK BIT(12) /* unconnect link + * during test + */ +#define BCM54XX_ECD_CTRL_CROSS_SHORT_DIS BIT(13) /* disable inter-pair + * short check + */ +#define BCM54XX_ECD_CTRL_RUN BIT(15) /* run immediate */ + +#define BCM54XX_RDB_ECD_FAULT_TYPE 0x2a1 +#define BCM54XX_EXP_ECD_FAULT_TYPE (MII_BCM54XX_EXP_SEL_ER + 0xc1) +#define BCM54XX_ECD_FAULT_TYPE_INVALID 0x0 +#define BCM54XX_ECD_FAULT_TYPE_OK 0x1 +#define BCM54XX_ECD_FAULT_TYPE_OPEN 0x2 +#define BCM54XX_ECD_FAULT_TYPE_SAME_SHORT 0x3 /* short same pair */ +#define BCM54XX_ECD_FAULT_TYPE_CROSS_SHORT 0x4 /* short different pairs */ +#define BCM54XX_ECD_FAULT_TYPE_BUSY 0x9 +#define BCM54XX_ECD_FAULT_TYPE_PAIR_D_MASK GENMASK(3, 0) +#define BCM54XX_ECD_FAULT_TYPE_PAIR_C_MASK GENMASK(7, 4) +#define BCM54XX_ECD_FAULT_TYPE_PAIR_B_MASK GENMASK(11, 8) +#define BCM54XX_ECD_FAULT_TYPE_PAIR_A_MASK GENMASK(15, 12) +#define BCM54XX_ECD_PAIR_A_LENGTH_RESULTS 0x2a2 +#define BCM54XX_ECD_PAIR_B_LENGTH_RESULTS 0x2a3 +#define BCM54XX_ECD_PAIR_C_LENGTH_RESULTS 0x2a4 +#define BCM54XX_ECD_PAIR_D_LENGTH_RESULTS 0x2a5 + +#define BCM54XX_RDB_ECD_PAIR_A_LENGTH_RESULTS 0x2a2 +#define BCM54XX_EXP_ECD_PAIR_A_LENGTH_RESULTS (MII_BCM54XX_EXP_SEL_ER + 0xc2) +#define BCM54XX_RDB_ECD_PAIR_B_LENGTH_RESULTS 0x2a3 +#define BCM54XX_EXP_ECD_PAIR_B_LENGTH_RESULTS (MII_BCM54XX_EXP_SEL_ER + 0xc3) +#define BCM54XX_RDB_ECD_PAIR_C_LENGTH_RESULTS 0x2a4 +#define BCM54XX_EXP_ECD_PAIR_C_LENGTH_RESULTS (MII_BCM54XX_EXP_SEL_ER + 0xc4) +#define BCM54XX_RDB_ECD_PAIR_D_LENGTH_RESULTS 0x2a5 +#define BCM54XX_EXP_ECD_PAIR_D_LENGTH_RESULTS (MII_BCM54XX_EXP_SEL_ER + 0xc5) +#define BCM54XX_ECD_LENGTH_RESULTS_INVALID 0xffff + #endif /* _LINUX_BRCMPHY_H */ -- cgit v1.2.3 From f45ce9336ff0640e491c642a84ea02f21daac3a4 Mon Sep 17 00:00:00 2001 From: Gwan-gyeong Mun Date: Thu, 14 May 2020 09:07:19 +0300 Subject: video/hdmi: Add Unpack only function for DRM infoframe It adds an unpack only function for DRM infoframe for dynamic range and mastering infoframe readout. It unpacks the information data block contained in the binary buffer into a structured frame of the HDMI Dynamic Range and Mastering (DRM) information frame. In contrast to hdmi_drm_infoframe_unpack() function, it does not verify a checksum. It can be used for unpacking a DP HDR Metadata Infoframe SDP case. DP HDR Metadata Infoframe SDP uses the same Dynamic Range and Mastering (DRM) information (CTA-861-G spec.) such as HDMI DRM infoframe. But DP SDP header and payload structure are different from HDMI DRM Infoframe. Therefore unpacking DRM infoframe for DP requires skipping of a verifying checksum. 
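A DP caller is then expected to parse and validate the DP-specific SDP header itself and pass only the CTA-861-G data bytes to the new helper; a sketch (buffer bounds and header checks remain the caller's job):

    /* sdp_db points past the DP SDP header at the DRM infoframe data
     * bytes; no infoframe checksum exists or is verified here.
     */
    static int dp_parse_hdr_metadata(struct hdmi_drm_infoframe *frame,
                                     const u8 *sdp_db, size_t len)
    {
            return hdmi_drm_infoframe_unpack_only(frame, sdp_db, len);
    }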
v9: Add clear comments to hdmi_drm_infoframe_unpack_only() and hdmi_drm_infoframe_unpack() (Laurent Pinchart) Signed-off-by: Gwan-gyeong Mun Reviewed-by: Uma Shankar Cc: Laurent Pinchart Cc: Ville Syrjala Acked-by: Daniel Vetter Signed-off-by: Jani Nikula Link: https://patchwork.freedesktop.org/patch/msgid/20200514060732.3378396-2-gwan-gyeong.mun@intel.com --- drivers/video/hdmi.c | 65 +++++++++++++++++++++++++++++++++++++--------------- include/linux/hdmi.h | 2 ++ 2 files changed, 48 insertions(+), 19 deletions(-) (limited to 'include/linux') diff --git a/drivers/video/hdmi.c b/drivers/video/hdmi.c index 856a8c4e84a2..e70792b3e367 100644 --- a/drivers/video/hdmi.c +++ b/drivers/video/hdmi.c @@ -1768,20 +1768,21 @@ hdmi_vendor_any_infoframe_unpack(union hdmi_vendor_any_infoframe *frame, } /** - * hdmi_drm_infoframe_unpack() - unpack binary buffer to a HDMI DRM infoframe + * hdmi_drm_infoframe_unpack_only() - unpack binary buffer of CTA-861-G DRM + * infoframe DataBytes to a HDMI DRM + * infoframe * @frame: HDMI DRM infoframe * @buffer: source buffer * @size: size of buffer * - * Unpacks the information contained in binary @buffer into a structured - * @frame of the HDMI Dynamic Range and Mastering (DRM) information frame. - * Also verifies the checksum as required by section 5.3.5 of the HDMI 1.4 - * specification. + * Unpacks CTA-861-G DRM infoframe DataBytes contained in the binary @buffer + * into a structured @frame of the HDMI Dynamic Range and Mastering (DRM) + * infoframe. * * Returns 0 on success or a negative error code on failure. */ -static int hdmi_drm_infoframe_unpack(struct hdmi_drm_infoframe *frame, - const void *buffer, size_t size) +int hdmi_drm_infoframe_unpack_only(struct hdmi_drm_infoframe *frame, + const void *buffer, size_t size) { const u8 *ptr = buffer; const u8 *temp; @@ -1790,23 +1791,13 @@ static int hdmi_drm_infoframe_unpack(struct hdmi_drm_infoframe *frame, int ret; int i; - if (size < HDMI_INFOFRAME_SIZE(DRM)) - return -EINVAL; - - if (ptr[0] != HDMI_INFOFRAME_TYPE_DRM || - ptr[1] != 1 || - ptr[2] != HDMI_DRM_INFOFRAME_SIZE) - return -EINVAL; - - if (hdmi_infoframe_checksum(buffer, HDMI_INFOFRAME_SIZE(DRM)) != 0) + if (size < HDMI_DRM_INFOFRAME_SIZE) return -EINVAL; ret = hdmi_drm_infoframe_init(frame); if (ret) return ret; - ptr += HDMI_INFOFRAME_HEADER_SIZE; - frame->eotf = ptr[0] & 0x7; frame->metadata_type = ptr[1] & 0x7; @@ -1814,7 +1805,7 @@ static int hdmi_drm_infoframe_unpack(struct hdmi_drm_infoframe *frame, for (i = 0; i < 3; i++) { x_lsb = *temp++; x_msb = *temp++; - frame->display_primaries[i].x = (x_msb << 8) | x_lsb; + frame->display_primaries[i].x = (x_msb << 8) | x_lsb; y_lsb = *temp++; y_msb = *temp++; frame->display_primaries[i].y = (y_msb << 8) | y_lsb; @@ -1830,6 +1821,42 @@ static int hdmi_drm_infoframe_unpack(struct hdmi_drm_infoframe *frame, return 0; } +EXPORT_SYMBOL(hdmi_drm_infoframe_unpack_only); + +/** + * hdmi_drm_infoframe_unpack() - unpack binary buffer to a HDMI DRM infoframe + * @frame: HDMI DRM infoframe + * @buffer: source buffer + * @size: size of buffer + * + * Unpacks the CTA-861-G DRM infoframe contained in the binary @buffer into + * a structured @frame of the HDMI Dynamic Range and Mastering (DRM) + * infoframe. It also verifies the checksum as required by section 5.3.5 of + * the HDMI 1.4 specification. + * + * Returns 0 on success or a negative error code on failure. 
+ */ +static int hdmi_drm_infoframe_unpack(struct hdmi_drm_infoframe *frame, + const void *buffer, size_t size) +{ + const u8 *ptr = buffer; + int ret; + + if (size < HDMI_INFOFRAME_SIZE(DRM)) + return -EINVAL; + + if (ptr[0] != HDMI_INFOFRAME_TYPE_DRM || + ptr[1] != 1 || + ptr[2] != HDMI_DRM_INFOFRAME_SIZE) + return -EINVAL; + + if (hdmi_infoframe_checksum(buffer, HDMI_INFOFRAME_SIZE(DRM)) != 0) + return -EINVAL; + + ret = hdmi_drm_infoframe_unpack_only(frame, ptr + HDMI_INFOFRAME_HEADER_SIZE, + size - HDMI_INFOFRAME_HEADER_SIZE); + return ret; +} /** * hdmi_infoframe_unpack() - unpack binary buffer to a HDMI infoframe diff --git a/include/linux/hdmi.h b/include/linux/hdmi.h index 9613d796cfb1..50c31f1a0a2d 100644 --- a/include/linux/hdmi.h +++ b/include/linux/hdmi.h @@ -219,6 +219,8 @@ ssize_t hdmi_drm_infoframe_pack(struct hdmi_drm_infoframe *frame, void *buffer, ssize_t hdmi_drm_infoframe_pack_only(const struct hdmi_drm_infoframe *frame, void *buffer, size_t size); int hdmi_drm_infoframe_check(struct hdmi_drm_infoframe *frame); +int hdmi_drm_infoframe_unpack_only(struct hdmi_drm_infoframe *frame, + const void *buffer, size_t size); enum hdmi_spd_sdi { HDMI_SPD_SDI_UNKNOWN, -- cgit v1.2.3 From 2909438d4d62681f392c57df4cd6b7183d19dde0 Mon Sep 17 00:00:00 2001 From: Wang Wenhu Date: Wed, 13 May 2020 07:18:54 -0700 Subject: cpufreq: fix minor typo in struct cpufreq_driver doc comment Delete the duplicate "to", possibly double-typed. Signed-off-by: Wang Wenhu Signed-off-by: Rafael J. Wysocki --- include/linux/cpufreq.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/cpufreq.h b/include/linux/cpufreq.h index f7240251a949..67d5950bd878 100644 --- a/include/linux/cpufreq.h +++ b/include/linux/cpufreq.h @@ -330,7 +330,7 @@ struct cpufreq_driver { * * get_intermediate should return a stable intermediate frequency * platform wants to switch to and target_intermediate() should set CPU - * to to that frequency, before jumping to the frequency corresponding + * to that frequency, before jumping to the frequency corresponding * to 'index'. Core will take care of sending notifications and driver * doesn't have to handle them in target_intermediate() or * target_index(). -- cgit v1.2.3 From 71ac860af824ce9ebbbe8de20044e777c0fc33b9 Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Thu, 14 May 2020 16:45:09 +0800 Subject: block: move blk_io_schedule() out of header file blk_io_schedule() isn't called from performance sensitive code path, and it is easier to maintain by exporting it as symbol. Also blk_io_schedule() is only called by CONFIG_BLOCK code, so it is safe to do this way. Meantime fixes build failure when CONFIG_BLOCK is off. 
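Call sites are unchanged; a sketch of the sync dio wait pattern the helper serves (the dio->done flag is illustrative, real callers test their own completion state):

    for (;;) {
            set_current_state(TASK_UNINTERRUPTIBLE);
            if (READ_ONCE(dio->done))
                    break;
            /* sleeps at most half the hung-task timeout per iteration */
            blk_io_schedule();
    }
    __set_current_state(TASK_RUNNING);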
Cc: Christoph Hellwig
Fixes: e6249cdd46e4 ("block: add blk_io_schedule() for avoiding task hung in sync dio")
Reported-by: Satya Tangirala
Tested-by: Satya Tangirala
Signed-off-by: Ming Lei
Signed-off-by: Jens Axboe
---
 block/blk-core.c       | 13 +++++++++++++
 include/linux/blkdev.h | 14 ++------------
 2 files changed, 15 insertions(+), 12 deletions(-)

(limited to 'include/linux')

diff --git a/block/blk-core.c b/block/blk-core.c
index cf5b2163edfe..7c1587b45427 100644
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -38,6 +38,7 @@
 #include
 #include
 #include
+#include
 
 #define CREATE_TRACE_POINTS
 #include
@@ -1812,6 +1813,18 @@ void blk_finish_plug(struct blk_plug *plug)
 }
 EXPORT_SYMBOL(blk_finish_plug);
 
+void blk_io_schedule(void)
+{
+	/* Prevent hang_check timer from firing at us during very long I/O */
+	unsigned long timeout = sysctl_hung_task_timeout_secs * HZ / 2;
+
+	if (timeout)
+		io_schedule_timeout(timeout);
+	else
+		io_schedule();
+}
+EXPORT_SYMBOL_GPL(blk_io_schedule);
+
 int __init blk_dev_init(void)
 {
 	BUILD_BUG_ON(REQ_OP_LAST >= (1 << REQ_OP_BITS));
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index 5360696d85ff..f9e4b21b051b 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -27,7 +27,6 @@
 #include
 #include
 #include
-#include
 
 struct module;
 struct scsi_ioctl_command;
@@ -1221,6 +1220,8 @@ static inline bool blk_needs_flush_plug(struct task_struct *tsk)
 				 !list_empty(&plug->cb_list));
 }
 
+extern void blk_io_schedule(void);
+
 extern int blkdev_issue_flush(struct block_device *, gfp_t, sector_t *);
 extern int blkdev_issue_write_same(struct block_device *bdev, sector_t sector,
 		sector_t nr_sects, gfp_t gfp_mask, struct page *page);
@@ -1851,15 +1852,4 @@ static inline void blk_wake_io_task(struct task_struct *waiter)
 		wake_up_process(waiter);
 }
 
-static inline void blk_io_schedule(void)
-{
-	/* Prevent hang_check timer from firing at us during very long I/O */
-	unsigned long timeout = sysctl_hung_task_timeout_secs * HZ / 2;
-
-	if (timeout)
-		io_schedule_timeout(timeout);
-	else
-		io_schedule();
-}
-
 #endif
--
cgit v1.2.3


From a3c751a50fe6bbe50eb7622a14b18b361804ee0c Mon Sep 17 00:00:00 2001
From: Miklos Szeredi
Date: Thu, 14 May 2020 16:44:23 +0200
Subject: vfs: allow unprivileged whiteout creation

Whiteouts, unlike real device nodes, should not require privileges to
create. The general concern with device nodes is that opening them can
have side effects. The kernel already avoids the zero major number (see
Documentation/admin-guide/devices.txt). To be on the safe side, this
patch explicitly forbids registering a char device with the 0/0 number
(see cdev_add()). This guarantees that a non-O_PATH open on a whiteout
will fail with ENODEV; i.e. it won't have any side effect.
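As a rough userspace sketch of what this enables (the file name is
arbitrary, and the filesystem must implement mknod): an unprivileged
process can now create a whiteout directly, because a 0/0 character
device is exempted from the CAP_MKNOD check:

    #include <sys/stat.h>
    #include <sys/sysmacros.h>
    #include <stdio.h>

    int main(void)
    {
            /*
             * A whiteout is a char device with device number 0/0.
             * After this patch its creation needs no CAP_MKNOD, and
             * opening the result without O_PATH fails with ENODEV.
             */
            if (mknod("wh", S_IFCHR | 0600, makedev(0, 0)) != 0) {
                    perror("mknod");
                    return 1;
            }
            return 0;
    }

The same relaxation covers renameat2() with RENAME_WHITEOUT: the
explicit CAP_MKNOD check is removed from do_renameat2() in the diff
below.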
Signed-off-by: Miklos Szeredi --- fs/char_dev.c | 3 +++ fs/namei.c | 21 +++------------------ include/linux/device_cgroup.h | 3 +++ include/linux/fs.h | 6 +++++- 4 files changed, 14 insertions(+), 19 deletions(-) (limited to 'include/linux') diff --git a/fs/char_dev.c b/fs/char_dev.c index c5e6eff5a381..ba0ded7842a7 100644 --- a/fs/char_dev.c +++ b/fs/char_dev.c @@ -483,6 +483,9 @@ int cdev_add(struct cdev *p, dev_t dev, unsigned count) p->dev = dev; p->count = count; + if (WARN_ON(dev == WHITEOUT_DEV)) + return -EBUSY; + error = kobj_map(cdev_map, dev, count, NULL, exact_match, exact_lock, p); if (error) diff --git a/fs/namei.c b/fs/namei.c index a320371899cf..d81f73ff1a8b 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -3505,12 +3505,14 @@ EXPORT_SYMBOL(user_path_create); int vfs_mknod(struct inode *dir, struct dentry *dentry, umode_t mode, dev_t dev) { + bool is_whiteout = S_ISCHR(mode) && dev == WHITEOUT_DEV; int error = may_create(dir, dentry); if (error) return error; - if ((S_ISCHR(mode) || S_ISBLK(mode)) && !capable(CAP_MKNOD)) + if ((S_ISCHR(mode) || S_ISBLK(mode)) && !is_whiteout && + !capable(CAP_MKNOD)) return -EPERM; if (!dir->i_op->mknod) @@ -4345,9 +4347,6 @@ static int do_renameat2(int olddfd, const char __user *oldname, int newdfd, (flags & RENAME_EXCHANGE)) return -EINVAL; - if ((flags & RENAME_WHITEOUT) && !capable(CAP_MKNOD)) - return -EPERM; - if (flags & RENAME_EXCHANGE) target_flags = 0; @@ -4483,20 +4482,6 @@ SYSCALL_DEFINE2(rename, const char __user *, oldname, const char __user *, newna return do_renameat2(AT_FDCWD, oldname, AT_FDCWD, newname, 0); } -int vfs_whiteout(struct inode *dir, struct dentry *dentry) -{ - int error = may_create(dir, dentry); - if (error) - return error; - - if (!dir->i_op->mknod) - return -EPERM; - - return dir->i_op->mknod(dir, dentry, - S_IFCHR | WHITEOUT_MODE, WHITEOUT_DEV); -} -EXPORT_SYMBOL(vfs_whiteout); - int readlink_copy(char __user *buffer, int buflen, const char *link) { int len = PTR_ERR(link); diff --git a/include/linux/device_cgroup.h b/include/linux/device_cgroup.h index fa35b52e0002..57e63bd63370 100644 --- a/include/linux/device_cgroup.h +++ b/include/linux/device_cgroup.h @@ -51,6 +51,9 @@ static inline int devcgroup_inode_mknod(int mode, dev_t dev) if (!S_ISBLK(mode) && !S_ISCHR(mode)) return 0; + if (S_ISCHR(mode) && dev == WHITEOUT_DEV) + return 0; + if (S_ISBLK(mode)) type = DEVCG_DEV_BLOCK; else diff --git a/include/linux/fs.h b/include/linux/fs.h index 4f6f59b4f22a..15665ef1ef90 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -1721,7 +1721,11 @@ extern int vfs_link(struct dentry *, struct inode *, struct dentry *, struct ino extern int vfs_rmdir(struct inode *, struct dentry *); extern int vfs_unlink(struct inode *, struct dentry *, struct inode **); extern int vfs_rename(struct inode *, struct dentry *, struct inode *, struct dentry *, struct inode **, unsigned int); -extern int vfs_whiteout(struct inode *, struct dentry *); + +static inline int vfs_whiteout(struct inode *dir, struct dentry *dentry) +{ + return vfs_mknod(dir, dentry, S_IFCHR | WHITEOUT_MODE, WHITEOUT_DEV); +} extern struct dentry *vfs_tmpfile(struct dentry *dentry, umode_t mode, int open_flag); -- cgit v1.2.3 From 9f6c61f96f2d97cbb5f7fa85607bc398f843ff0f Mon Sep 17 00:00:00 2001 From: Miklos Szeredi Date: Thu, 14 May 2020 16:44:24 +0200 Subject: proc/mounts: add cursor If mounts are deleted after a read(2) call on /proc/self/mounts (or its kin), the subsequent read(2) could miss a mount that comes after the deleted one in the list. 
This is because the file position is interpreted as the number of mount
entries from the start of the list. E.g. the first read gets entries #0
to #9; the seq file index will be 10. Then entry #5 is deleted,
resulting in #10 becoming #9 and #11 becoming #10, etc... The next read
will continue from entry #10, and #9 is missed.

Solve this by adding a cursor entry for each open instance. Taking the
global namespace_sem for write seems excessive, since we are only
dealing with a per-namespace list. Instead add a per-namespace spinlock
and use that together with namespace_sem taken for read to protect
against concurrent modification of the mount list. This may reduce
parallelism of is_local_mountpoint(), but it's hardly a big contention
point. We could also use RCU freeing of cursors to make traversal not
need additional locks, if that turns out to be necessary.

Only move the cursor once for each read (the cursor is not added on
open) to minimize cacheline invalidation. When EOF is reached, the
cursor is taken off the list, in order to prevent an excessive number
of cursors due to inactive open file descriptors.

Reported-by: Karel Zak
Signed-off-by: Miklos Szeredi
---
 fs/mount.h            | 12 +++++--
 fs/namespace.c        | 91 ++++++++++++++++++++++++++++++++++++++++++---------
 fs/proc_namespace.c   |  4 ++-
 include/linux/mount.h |  4 ++-
 4 files changed, 90 insertions(+), 21 deletions(-)

(limited to 'include/linux')

diff --git a/fs/mount.h b/fs/mount.h
index 711a4093e475..c7abb7b394d8 100644
--- a/fs/mount.h
+++ b/fs/mount.h
@@ -9,7 +9,13 @@ struct mnt_namespace {
 	atomic_t		count;
 	struct ns_common	ns;
 	struct mount *	root;
+	/*
+	 * Traversal and modification of .list is protected by either
+	 *   - taking namespace_sem for write, OR
+	 *   - taking namespace_sem for read AND taking .ns_lock.
+	 */
 	struct list_head	list;
+	spinlock_t		ns_lock;
 	struct user_namespace	*user_ns;
 	struct ucounts		*ucounts;
 	u64			seq;	/* Sequence number to prevent loops */
@@ -133,9 +139,7 @@ struct proc_mounts {
 	struct mnt_namespace *ns;
 	struct path root;
 	int (*show)(struct seq_file *, struct vfsmount *);
-	void *cached_mount;
-	u64 cached_event;
-	loff_t cached_index;
+	struct mount cursor;
 };
 
 extern const struct seq_operations mounts_op;
@@ -153,3 +157,5 @@ static inline bool is_anon_ns(struct mnt_namespace *ns)
 {
 	return ns->seq == 0;
 }
+
+extern void mnt_cursor_del(struct mnt_namespace *ns, struct mount *cursor);
diff --git a/fs/namespace.c b/fs/namespace.c
index a28e4db075ed..3ee236f228a2 100644
--- a/fs/namespace.c
+++ b/fs/namespace.c
@@ -648,6 +648,21 @@ struct vfsmount *lookup_mnt(const struct path *path)
 	return m;
 }
 
+static inline void lock_ns_list(struct mnt_namespace *ns)
+{
+	spin_lock(&ns->ns_lock);
+}
+
+static inline void unlock_ns_list(struct mnt_namespace *ns)
+{
+	spin_unlock(&ns->ns_lock);
+}
+
+static inline bool mnt_is_cursor(struct mount *mnt)
+{
+	return mnt->mnt.mnt_flags & MNT_CURSOR;
+}
+
 /*
  * __is_local_mountpoint - Test to see if dentry is a mountpoint in the
  * current mount namespace.
@@ -673,11 +688,15 @@ bool __is_local_mountpoint(struct dentry *dentry) goto out; down_read(&namespace_sem); + lock_ns_list(ns); list_for_each_entry(mnt, &ns->list, mnt_list) { + if (mnt_is_cursor(mnt)) + continue; is_covered = (mnt->mnt_mountpoint == dentry); if (is_covered) break; } + unlock_ns_list(ns); up_read(&namespace_sem); out: return is_covered; @@ -1245,46 +1264,71 @@ struct vfsmount *mnt_clone_internal(const struct path *path) } #ifdef CONFIG_PROC_FS +static struct mount *mnt_list_next(struct mnt_namespace *ns, + struct list_head *p) +{ + struct mount *mnt, *ret = NULL; + + lock_ns_list(ns); + list_for_each_continue(p, &ns->list) { + mnt = list_entry(p, typeof(*mnt), mnt_list); + if (!mnt_is_cursor(mnt)) { + ret = mnt; + break; + } + } + unlock_ns_list(ns); + + return ret; +} + /* iterator; we want it to have access to namespace_sem, thus here... */ static void *m_start(struct seq_file *m, loff_t *pos) { struct proc_mounts *p = m->private; + struct list_head *prev; down_read(&namespace_sem); - if (p->cached_event == p->ns->event) { - void *v = p->cached_mount; - if (*pos == p->cached_index) - return v; - if (*pos == p->cached_index + 1) { - v = seq_list_next(v, &p->ns->list, &p->cached_index); - return p->cached_mount = v; - } + if (!*pos) { + prev = &p->ns->list; + } else { + prev = &p->cursor.mnt_list; + + /* Read after we'd reached the end? */ + if (list_empty(prev)) + return NULL; } - p->cached_event = p->ns->event; - p->cached_mount = seq_list_start(&p->ns->list, *pos); - p->cached_index = *pos; - return p->cached_mount; + return mnt_list_next(p->ns, prev); } static void *m_next(struct seq_file *m, void *v, loff_t *pos) { struct proc_mounts *p = m->private; + struct mount *mnt = v; - p->cached_mount = seq_list_next(v, &p->ns->list, pos); - p->cached_index = *pos; - return p->cached_mount; + ++*pos; + return mnt_list_next(p->ns, &mnt->mnt_list); } static void m_stop(struct seq_file *m, void *v) { + struct proc_mounts *p = m->private; + struct mount *mnt = v; + + lock_ns_list(p->ns); + if (mnt) + list_move_tail(&p->cursor.mnt_list, &mnt->mnt_list); + else + list_del_init(&p->cursor.mnt_list); + unlock_ns_list(p->ns); up_read(&namespace_sem); } static int m_show(struct seq_file *m, void *v) { struct proc_mounts *p = m->private; - struct mount *r = list_entry(v, struct mount, mnt_list); + struct mount *r = v; return p->show(m, &r->mnt); } @@ -1294,6 +1338,15 @@ const struct seq_operations mounts_op = { .stop = m_stop, .show = m_show, }; + +void mnt_cursor_del(struct mnt_namespace *ns, struct mount *cursor) +{ + down_read(&namespace_sem); + lock_ns_list(ns); + list_del(&cursor->mnt_list); + unlock_ns_list(ns); + up_read(&namespace_sem); +} #endif /* CONFIG_PROC_FS */ /** @@ -3202,6 +3255,7 @@ static struct mnt_namespace *alloc_mnt_ns(struct user_namespace *user_ns, bool a atomic_set(&new_ns->count, 1); INIT_LIST_HEAD(&new_ns->list); init_waitqueue_head(&new_ns->poll); + spin_lock_init(&new_ns->ns_lock); new_ns->user_ns = get_user_ns(user_ns); new_ns->ucounts = ucounts; return new_ns; @@ -3842,10 +3896,14 @@ static bool mnt_already_visible(struct mnt_namespace *ns, bool visible = false; down_read(&namespace_sem); + lock_ns_list(ns); list_for_each_entry(mnt, &ns->list, mnt_list) { struct mount *child; int mnt_flags; + if (mnt_is_cursor(mnt)) + continue; + if (mnt->mnt.mnt_sb->s_type != sb->s_type) continue; @@ -3893,6 +3951,7 @@ static bool mnt_already_visible(struct mnt_namespace *ns, next: ; } found: + unlock_ns_list(ns); up_read(&namespace_sem); return visible; } diff --git 
a/fs/proc_namespace.c b/fs/proc_namespace.c index 273ee82d8aa9..e4d70c0dffe9 100644 --- a/fs/proc_namespace.c +++ b/fs/proc_namespace.c @@ -279,7 +279,8 @@ static int mounts_open_common(struct inode *inode, struct file *file, p->ns = ns; p->root = root; p->show = show; - p->cached_event = ~0ULL; + INIT_LIST_HEAD(&p->cursor.mnt_list); + p->cursor.mnt.mnt_flags = MNT_CURSOR; return 0; @@ -296,6 +297,7 @@ static int mounts_release(struct inode *inode, struct file *file) struct seq_file *m = file->private_data; struct proc_mounts *p = m->private; path_put(&p->root); + mnt_cursor_del(p->ns, &p->cursor); put_mnt_ns(p->ns); return seq_release_private(inode, file); } diff --git a/include/linux/mount.h b/include/linux/mount.h index bf8cc4108b8f..7edac8c7a9c1 100644 --- a/include/linux/mount.h +++ b/include/linux/mount.h @@ -50,7 +50,8 @@ struct fs_context; #define MNT_ATIME_MASK (MNT_NOATIME | MNT_NODIRATIME | MNT_RELATIME ) #define MNT_INTERNAL_FLAGS (MNT_SHARED | MNT_WRITE_HOLD | MNT_INTERNAL | \ - MNT_DOOMED | MNT_SYNC_UMOUNT | MNT_MARKED) + MNT_DOOMED | MNT_SYNC_UMOUNT | MNT_MARKED | \ + MNT_CURSOR) #define MNT_INTERNAL 0x4000 @@ -64,6 +65,7 @@ struct fs_context; #define MNT_SYNC_UMOUNT 0x2000000 #define MNT_MARKED 0x4000000 #define MNT_UMOUNT 0x8000000 +#define MNT_CURSOR 0x10000000 struct vfsmount { struct dentry *mnt_root; /* root of the mounted tree */ -- cgit v1.2.3 From fa2fcf4f1df1559a0a4ee0f46915b496cc2ebf60 Mon Sep 17 00:00:00 2001 From: Miklos Szeredi Date: Thu, 14 May 2020 16:44:24 +0200 Subject: statx: add mount ID Systemd is hacking around to get it and it's trivial to add to statx, so... Cc: linux-api@vger.kernel.org Cc: linux-man@vger.kernel.org Signed-off-by: Miklos Szeredi Reviewed-by: Christoph Hellwig --- fs/stat.c | 4 ++++ include/linux/stat.h | 1 + include/uapi/linux/stat.h | 6 +++++- 3 files changed, 10 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/fs/stat.c b/fs/stat.c index f7f07d1b73cb..3d88c99f7743 100644 --- a/fs/stat.c +++ b/fs/stat.c @@ -22,6 +22,7 @@ #include #include "internal.h" +#include "mount.h" /** * generic_fillattr - Fill in the basic attributes from the inode struct @@ -199,6 +200,8 @@ retry: goto out; error = vfs_getattr(&path, stat, request_mask, flags); + stat->mnt_id = real_mount(path.mnt)->mnt_id; + stat->result_mask |= STATX_MNT_ID; path_put(&path); if (retry_estale(error, lookup_flags)) { lookup_flags |= LOOKUP_REVAL; @@ -563,6 +566,7 @@ cp_statx(const struct kstat *stat, struct statx __user *buffer) tmp.stx_rdev_minor = MINOR(stat->rdev); tmp.stx_dev_major = MAJOR(stat->dev); tmp.stx_dev_minor = MINOR(stat->dev); + tmp.stx_mnt_id = stat->mnt_id; return copy_to_user(buffer, &tmp, sizeof(tmp)) ? 
-EFAULT : 0;
 }
diff --git a/include/linux/stat.h b/include/linux/stat.h
index 528c4baad091..56614af83d4a 100644
--- a/include/linux/stat.h
+++ b/include/linux/stat.h
@@ -47,6 +47,7 @@ struct kstat {
 	struct timespec64 ctime;
 	struct timespec64 btime;	/* File creation time */
 	u64		blocks;
+	u64		mnt_id;
 };
 
 #endif
diff --git a/include/uapi/linux/stat.h b/include/uapi/linux/stat.h
index d1192783139a..d81456247f10 100644
--- a/include/uapi/linux/stat.h
+++ b/include/uapi/linux/stat.h
@@ -123,7 +123,10 @@ struct statx {
 	__u32	stx_dev_major;	/* ID of device containing file [uncond] */
 	__u32	stx_dev_minor;
 	/* 0x90 */
-	__u64	__spare2[14];	/* Spare space for future expansion */
+	__u64	stx_mnt_id;
+	__u64	__spare2;
+	/* 0xa0 */
+	__u64	__spare3[12];	/* Spare space for future expansion */
 	/* 0x100 */
 };
 
@@ -148,6 +151,7 @@ struct statx {
 #define STATX_BLOCKS		0x00000400U	/* Want/got stx_blocks */
 #define STATX_BASIC_STATS	0x000007ffU	/* The stuff in the normal stat struct */
 #define STATX_BTIME		0x00000800U	/* Want/got stx_btime */
+#define STATX_MNT_ID		0x00001000U	/* Got stx_mnt_id */
 
 #define STATX__RESERVED		0x80000000U	/* Reserved for future struct statx expansion */
--
cgit v1.2.3


From c8ffd8bcdd28296a198f237cc595148a8d4adfbe Mon Sep 17 00:00:00 2001
From: Miklos Szeredi
Date: Thu, 14 May 2020 16:44:25 +0200
Subject: vfs: add faccessat2 syscall

POSIX defines faccessat() as having a fourth "flags" argument, while
the Linux syscall doesn't have it. Glibc tries to emulate AT_EACCESS
and AT_SYMLINK_NOFOLLOW, but the AT_EACCESS emulation is broken.

Add a new faccessat(2) syscall with the added flags argument and
implement both flags.

The value of AT_EACCESS is defined in glibc headers to be the same as
AT_REMOVEDIR. Use this value for the kernel interface as well, together
with an explanatory comment.

Also add AT_EMPTY_PATH support, which is not documented by POSIX, but
can be useful and is trivial to implement.
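For illustration, a userspace sketch of the new call; no glibc wrapper
exists at the time of this patch, so it is invoked via syscall(2) using
the number wired up in the tables below (439 on x86-64; other
architectures may differ):

    #define _GNU_SOURCE
    #include <fcntl.h>              /* AT_FDCWD, AT_EACCESS */
    #include <unistd.h>             /* syscall(), W_OK */
    #include <sys/syscall.h>
    #include <stdio.h>

    #ifndef __NR_faccessat2
    #define __NR_faccessat2 439     /* per the x86-64 table below */
    #endif

    int main(void)
    {
            /* Check write permission using the effective IDs. */
            long ret = syscall(__NR_faccessat2, AT_FDCWD, "/etc/shadow",
                               W_OK, AT_EACCESS);

            printf("faccessat2: %s\n", ret == 0 ? "writable" : "not writable");
            return 0;
    }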
Signed-off-by: Miklos Szeredi --- arch/alpha/kernel/syscalls/syscall.tbl | 1 + arch/arm/tools/syscall.tbl | 1 + arch/arm64/include/asm/unistd.h | 2 +- arch/arm64/include/asm/unistd32.h | 2 ++ arch/ia64/kernel/syscalls/syscall.tbl | 1 + arch/m68k/kernel/syscalls/syscall.tbl | 1 + arch/microblaze/kernel/syscalls/syscall.tbl | 1 + arch/mips/kernel/syscalls/syscall_n32.tbl | 1 + arch/mips/kernel/syscalls/syscall_n64.tbl | 1 + arch/mips/kernel/syscalls/syscall_o32.tbl | 1 + arch/parisc/kernel/syscalls/syscall.tbl | 1 + arch/powerpc/kernel/syscalls/syscall.tbl | 1 + arch/s390/kernel/syscalls/syscall.tbl | 1 + arch/sh/kernel/syscalls/syscall.tbl | 1 + arch/sparc/kernel/syscalls/syscall.tbl | 1 + arch/x86/entry/syscalls/syscall_32.tbl | 1 + arch/x86/entry/syscalls/syscall_64.tbl | 1 + arch/xtensa/kernel/syscalls/syscall.tbl | 1 + fs/internal.h | 1 - fs/open.c | 34 ++++++++++++++++++++++------- include/linux/syscalls.h | 6 +++-- include/uapi/asm-generic/unistd.h | 4 +++- include/uapi/linux/fcntl.h | 10 +++++++++ 23 files changed, 62 insertions(+), 13 deletions(-) (limited to 'include/linux') diff --git a/arch/alpha/kernel/syscalls/syscall.tbl b/arch/alpha/kernel/syscalls/syscall.tbl index 36d42da7466a..5ddd128d4b7a 100644 --- a/arch/alpha/kernel/syscalls/syscall.tbl +++ b/arch/alpha/kernel/syscalls/syscall.tbl @@ -477,3 +477,4 @@ # 545 reserved for clone3 547 common openat2 sys_openat2 548 common pidfd_getfd sys_pidfd_getfd +549 common faccessat2 sys_faccessat2 diff --git a/arch/arm/tools/syscall.tbl b/arch/arm/tools/syscall.tbl index 4d1cf74a2caa..d5cae5ffede0 100644 --- a/arch/arm/tools/syscall.tbl +++ b/arch/arm/tools/syscall.tbl @@ -451,3 +451,4 @@ 435 common clone3 sys_clone3 437 common openat2 sys_openat2 438 common pidfd_getfd sys_pidfd_getfd +439 common faccessat2 sys_faccessat2 diff --git a/arch/arm64/include/asm/unistd.h b/arch/arm64/include/asm/unistd.h index 803039d504de..3b859596840d 100644 --- a/arch/arm64/include/asm/unistd.h +++ b/arch/arm64/include/asm/unistd.h @@ -38,7 +38,7 @@ #define __ARM_NR_compat_set_tls (__ARM_NR_COMPAT_BASE + 5) #define __ARM_NR_COMPAT_END (__ARM_NR_COMPAT_BASE + 0x800) -#define __NR_compat_syscalls 439 +#define __NR_compat_syscalls 440 #endif #define __ARCH_WANT_SYS_CLONE diff --git a/arch/arm64/include/asm/unistd32.h b/arch/arm64/include/asm/unistd32.h index c1c61635f89c..6d95d0c8bf2f 100644 --- a/arch/arm64/include/asm/unistd32.h +++ b/arch/arm64/include/asm/unistd32.h @@ -883,6 +883,8 @@ __SYSCALL(__NR_clone3, sys_clone3) __SYSCALL(__NR_openat2, sys_openat2) #define __NR_pidfd_getfd 438 __SYSCALL(__NR_pidfd_getfd, sys_pidfd_getfd) +#define __NR_faccessat2 439 +__SYSCALL(__NR_faccessat2, sys_faccessat2) /* * Please add new compat syscalls above this comment and update diff --git a/arch/ia64/kernel/syscalls/syscall.tbl b/arch/ia64/kernel/syscalls/syscall.tbl index 042911e670b8..49e325b604b3 100644 --- a/arch/ia64/kernel/syscalls/syscall.tbl +++ b/arch/ia64/kernel/syscalls/syscall.tbl @@ -358,3 +358,4 @@ # 435 reserved for clone3 437 common openat2 sys_openat2 438 common pidfd_getfd sys_pidfd_getfd +439 common faccessat2 sys_faccessat2 diff --git a/arch/m68k/kernel/syscalls/syscall.tbl b/arch/m68k/kernel/syscalls/syscall.tbl index f4f49fcb76d0..f71b1bbcc198 100644 --- a/arch/m68k/kernel/syscalls/syscall.tbl +++ b/arch/m68k/kernel/syscalls/syscall.tbl @@ -437,3 +437,4 @@ 435 common clone3 __sys_clone3 437 common openat2 sys_openat2 438 common pidfd_getfd sys_pidfd_getfd +439 common faccessat2 sys_faccessat2 diff --git 
a/arch/microblaze/kernel/syscalls/syscall.tbl b/arch/microblaze/kernel/syscalls/syscall.tbl index 4c67b11f9c9e..edacc4561f2b 100644 --- a/arch/microblaze/kernel/syscalls/syscall.tbl +++ b/arch/microblaze/kernel/syscalls/syscall.tbl @@ -443,3 +443,4 @@ 435 common clone3 sys_clone3 437 common openat2 sys_openat2 438 common pidfd_getfd sys_pidfd_getfd +439 common faccessat2 sys_faccessat2 diff --git a/arch/mips/kernel/syscalls/syscall_n32.tbl b/arch/mips/kernel/syscalls/syscall_n32.tbl index 1f9e8ad636cc..f777141f5256 100644 --- a/arch/mips/kernel/syscalls/syscall_n32.tbl +++ b/arch/mips/kernel/syscalls/syscall_n32.tbl @@ -376,3 +376,4 @@ 435 n32 clone3 __sys_clone3 437 n32 openat2 sys_openat2 438 n32 pidfd_getfd sys_pidfd_getfd +439 n32 faccessat2 sys_faccessat2 diff --git a/arch/mips/kernel/syscalls/syscall_n64.tbl b/arch/mips/kernel/syscalls/syscall_n64.tbl index c0b9d802dbf6..da8c76394e17 100644 --- a/arch/mips/kernel/syscalls/syscall_n64.tbl +++ b/arch/mips/kernel/syscalls/syscall_n64.tbl @@ -352,3 +352,4 @@ 435 n64 clone3 __sys_clone3 437 n64 openat2 sys_openat2 438 n64 pidfd_getfd sys_pidfd_getfd +439 n64 faccessat2 sys_faccessat2 diff --git a/arch/mips/kernel/syscalls/syscall_o32.tbl b/arch/mips/kernel/syscalls/syscall_o32.tbl index ac586774c980..13280625d312 100644 --- a/arch/mips/kernel/syscalls/syscall_o32.tbl +++ b/arch/mips/kernel/syscalls/syscall_o32.tbl @@ -425,3 +425,4 @@ 435 o32 clone3 __sys_clone3 437 o32 openat2 sys_openat2 438 o32 pidfd_getfd sys_pidfd_getfd +439 o32 faccessat2 sys_faccessat2 diff --git a/arch/parisc/kernel/syscalls/syscall.tbl b/arch/parisc/kernel/syscalls/syscall.tbl index 52a15f5cd130..5a758fa6ec52 100644 --- a/arch/parisc/kernel/syscalls/syscall.tbl +++ b/arch/parisc/kernel/syscalls/syscall.tbl @@ -435,3 +435,4 @@ 435 common clone3 sys_clone3_wrapper 437 common openat2 sys_openat2 438 common pidfd_getfd sys_pidfd_getfd +439 common faccessat2 sys_faccessat2 diff --git a/arch/powerpc/kernel/syscalls/syscall.tbl b/arch/powerpc/kernel/syscalls/syscall.tbl index 220ae11555f2..f833a3190822 100644 --- a/arch/powerpc/kernel/syscalls/syscall.tbl +++ b/arch/powerpc/kernel/syscalls/syscall.tbl @@ -527,3 +527,4 @@ 435 spu clone3 sys_ni_syscall 437 common openat2 sys_openat2 438 common pidfd_getfd sys_pidfd_getfd +439 common faccessat2 sys_faccessat2 diff --git a/arch/s390/kernel/syscalls/syscall.tbl b/arch/s390/kernel/syscalls/syscall.tbl index bd7bd3581a0f..bfdcb7633957 100644 --- a/arch/s390/kernel/syscalls/syscall.tbl +++ b/arch/s390/kernel/syscalls/syscall.tbl @@ -440,3 +440,4 @@ 435 common clone3 sys_clone3 sys_clone3 437 common openat2 sys_openat2 sys_openat2 438 common pidfd_getfd sys_pidfd_getfd sys_pidfd_getfd +439 common faccessat2 sys_faccessat2 sys_faccessat2 diff --git a/arch/sh/kernel/syscalls/syscall.tbl b/arch/sh/kernel/syscalls/syscall.tbl index c7a30fcd135f..acc35daa1b79 100644 --- a/arch/sh/kernel/syscalls/syscall.tbl +++ b/arch/sh/kernel/syscalls/syscall.tbl @@ -440,3 +440,4 @@ # 435 reserved for clone3 437 common openat2 sys_openat2 438 common pidfd_getfd sys_pidfd_getfd +439 common faccessat2 sys_faccessat2 diff --git a/arch/sparc/kernel/syscalls/syscall.tbl b/arch/sparc/kernel/syscalls/syscall.tbl index f13615ecdecc..8004a276cb74 100644 --- a/arch/sparc/kernel/syscalls/syscall.tbl +++ b/arch/sparc/kernel/syscalls/syscall.tbl @@ -483,3 +483,4 @@ # 435 reserved for clone3 437 common openat2 sys_openat2 438 common pidfd_getfd sys_pidfd_getfd +439 common faccessat2 sys_faccessat2 diff --git a/arch/x86/entry/syscalls/syscall_32.tbl 
b/arch/x86/entry/syscalls/syscall_32.tbl index 54581ac671b4..d8f8a1a69ed1 100644 --- a/arch/x86/entry/syscalls/syscall_32.tbl +++ b/arch/x86/entry/syscalls/syscall_32.tbl @@ -442,3 +442,4 @@ 435 i386 clone3 sys_clone3 437 i386 openat2 sys_openat2 438 i386 pidfd_getfd sys_pidfd_getfd +439 i386 faccessat2 sys_faccessat2 diff --git a/arch/x86/entry/syscalls/syscall_64.tbl b/arch/x86/entry/syscalls/syscall_64.tbl index 37b844f839bc..78847b32e137 100644 --- a/arch/x86/entry/syscalls/syscall_64.tbl +++ b/arch/x86/entry/syscalls/syscall_64.tbl @@ -359,6 +359,7 @@ 435 common clone3 sys_clone3 437 common openat2 sys_openat2 438 common pidfd_getfd sys_pidfd_getfd +439 common faccessat2 sys_faccessat2 # # x32-specific system call numbers start at 512 to avoid cache impact diff --git a/arch/xtensa/kernel/syscalls/syscall.tbl b/arch/xtensa/kernel/syscalls/syscall.tbl index 85a9ab1bc04d..69d0d73876b3 100644 --- a/arch/xtensa/kernel/syscalls/syscall.tbl +++ b/arch/xtensa/kernel/syscalls/syscall.tbl @@ -408,3 +408,4 @@ 435 common clone3 sys_clone3 437 common openat2 sys_openat2 438 common pidfd_getfd sys_pidfd_getfd +439 common faccessat2 sys_faccessat2 diff --git a/fs/internal.h b/fs/internal.h index aa5d45524e87..0d467e32dd7e 100644 --- a/fs/internal.h +++ b/fs/internal.h @@ -126,7 +126,6 @@ extern struct open_how build_open_how(int flags, umode_t mode); extern int build_open_flags(const struct open_how *how, struct open_flags *op); long do_sys_ftruncate(unsigned int fd, loff_t length, int small); -long do_faccessat(int dfd, const char __user *filename, int mode); int do_fchmodat(int dfd, const char __user *filename, umode_t mode); int do_fchownat(int dfd, const char __user *filename, uid_t user, gid_t group, int flag); diff --git a/fs/open.c b/fs/open.c index 0ea3cd1a1250..e62b1db06638 100644 --- a/fs/open.c +++ b/fs/open.c @@ -394,20 +394,30 @@ static const struct cred *access_override_creds(void) return old_cred; } -long do_faccessat(int dfd, const char __user *filename, int mode) +long do_faccessat(int dfd, const char __user *filename, int mode, int flags) { struct path path; struct inode *inode; int res; unsigned int lookup_flags = LOOKUP_FOLLOW; - const struct cred *old_cred; + const struct cred *old_cred = NULL; if (mode & ~S_IRWXO) /* where's F_OK, X_OK, W_OK, R_OK? 
*/ return -EINVAL; - old_cred = access_override_creds(); - if (!old_cred) - return -ENOMEM; + if (flags & ~(AT_EACCESS | AT_SYMLINK_NOFOLLOW | AT_EMPTY_PATH)) + return -EINVAL; + + if (flags & AT_SYMLINK_NOFOLLOW) + lookup_flags &= ~LOOKUP_FOLLOW; + if (flags & AT_EMPTY_PATH) + lookup_flags |= LOOKUP_EMPTY; + + if (!(flags & AT_EACCESS)) { + old_cred = access_override_creds(); + if (!old_cred) + return -ENOMEM; + } retry: res = user_path_at(dfd, filename, lookup_flags, &path); @@ -450,18 +460,26 @@ out_path_release: goto retry; } out: - revert_creds(old_cred); + if (old_cred) + revert_creds(old_cred); + return res; } SYSCALL_DEFINE3(faccessat, int, dfd, const char __user *, filename, int, mode) { - return do_faccessat(dfd, filename, mode); + return do_faccessat(dfd, filename, mode, 0); +} + +SYSCALL_DEFINE4(faccessat2, int, dfd, const char __user *, filename, int, mode, + int, flags) +{ + return do_faccessat(dfd, filename, mode, flags); } SYSCALL_DEFINE2(access, const char __user *, filename, int, mode) { - return do_faccessat(AT_FDCWD, filename, mode); + return do_faccessat(AT_FDCWD, filename, mode, 0); } int ksys_chdir(const char __user *filename) diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h index 1815065d52f3..7c354c2955f5 100644 --- a/include/linux/syscalls.h +++ b/include/linux/syscalls.h @@ -428,6 +428,8 @@ asmlinkage long sys_ftruncate64(unsigned int fd, loff_t length); #endif asmlinkage long sys_fallocate(int fd, int mode, loff_t offset, loff_t len); asmlinkage long sys_faccessat(int dfd, const char __user *filename, int mode); +asmlinkage long sys_faccessat2(int dfd, const char __user *filename, int mode, + int flags); asmlinkage long sys_chdir(const char __user *filename); asmlinkage long sys_fchdir(unsigned int fd); asmlinkage long sys_chroot(const char __user *filename); @@ -1333,11 +1335,11 @@ static inline int ksys_chmod(const char __user *filename, umode_t mode) return do_fchmodat(AT_FDCWD, filename, mode); } -extern long do_faccessat(int dfd, const char __user *filename, int mode); +long do_faccessat(int dfd, const char __user *filename, int mode, int flags); static inline long ksys_access(const char __user *filename, int mode) { - return do_faccessat(AT_FDCWD, filename, mode); + return do_faccessat(AT_FDCWD, filename, mode, 0); } extern int do_fchownat(int dfd, const char __user *filename, uid_t user, diff --git a/include/uapi/asm-generic/unistd.h b/include/uapi/asm-generic/unistd.h index 3a3201e4618e..f4a01305d9a6 100644 --- a/include/uapi/asm-generic/unistd.h +++ b/include/uapi/asm-generic/unistd.h @@ -855,9 +855,11 @@ __SYSCALL(__NR_clone3, sys_clone3) __SYSCALL(__NR_openat2, sys_openat2) #define __NR_pidfd_getfd 438 __SYSCALL(__NR_pidfd_getfd, sys_pidfd_getfd) +#define __NR_faccessat2 439 +__SYSCALL(__NR_faccessat2, sys_faccessat2) #undef __NR_syscalls -#define __NR_syscalls 439 +#define __NR_syscalls 440 /* * 32 bit systems traditionally used different diff --git a/include/uapi/linux/fcntl.h b/include/uapi/linux/fcntl.h index ca88b7bce553..2f86b2ad6d7e 100644 --- a/include/uapi/linux/fcntl.h +++ b/include/uapi/linux/fcntl.h @@ -84,10 +84,20 @@ #define DN_ATTRIB 0x00000020 /* File changed attibutes */ #define DN_MULTISHOT 0x80000000 /* Don't remove notifier */ +/* + * The constants AT_REMOVEDIR and AT_EACCESS have the same value. AT_EACCESS is + * meaningful only to faccessat, while AT_REMOVEDIR is meaningful only to + * unlinkat. The two functions do completely different things and therefore, + * the flags can be allowed to overlap. 
For example, passing AT_REMOVEDIR to + * faccessat would be undefined behavior and thus treating it equivalent to + * AT_EACCESS is valid undefined behavior. + */ #define AT_FDCWD -100 /* Special value used to indicate openat should use the current working directory. */ #define AT_SYMLINK_NOFOLLOW 0x100 /* Do not follow symbolic links. */ +#define AT_EACCESS 0x200 /* Test access permitted for + effective IDs, not real IDs. */ #define AT_REMOVEDIR 0x200 /* Remove directory instead of unlinking file. */ #define AT_SYMLINK_FOLLOW 0x400 /* Follow symbolic links. */ -- cgit v1.2.3 From 1b2628397058ebce7277480960b29c788138de90 Mon Sep 17 00:00:00 2001 From: Satya Tangirala Date: Thu, 14 May 2020 00:37:17 +0000 Subject: block: Keyslot Manager for Inline Encryption Inline Encryption hardware allows software to specify an encryption context (an encryption key, crypto algorithm, data unit num, data unit size) along with a data transfer request to a storage device, and the inline encryption hardware will use that context to en/decrypt the data. The inline encryption hardware is part of the storage device, and it conceptually sits on the data path between system memory and the storage device. Inline Encryption hardware implementations often function around the concept of "keyslots". These implementations often have a limited number of "keyslots", each of which can hold a key (we say that a key can be "programmed" into a keyslot). Requests made to the storage device may have a keyslot and a data unit number associated with them, and the inline encryption hardware will en/decrypt the data in the requests using the key programmed into that associated keyslot and the data unit number specified with the request. As keyslots are limited, and programming keys may be expensive in many implementations, and multiple requests may use exactly the same encryption contexts, we introduce a Keyslot Manager to efficiently manage keyslots. We also introduce a blk_crypto_key, which will represent the key that's programmed into keyslots managed by keyslot managers. The keyslot manager also functions as the interface that upper layers will use to program keys into inline encryption hardware. For more information on the Keyslot Manager, refer to documentation found in block/keyslot-manager.c and linux/keyslot-manager.h. Co-developed-by: Eric Biggers Signed-off-by: Eric Biggers Signed-off-by: Satya Tangirala Reviewed-by: Eric Biggers Reviewed-by: Christoph Hellwig Signed-off-by: Jens Axboe --- block/Kconfig | 7 + block/Makefile | 1 + block/keyslot-manager.c | 378 ++++++++++++++++++++++++++++++++++++++++ include/linux/blk-crypto.h | 52 ++++++ include/linux/blkdev.h | 6 + include/linux/keyslot-manager.h | 106 +++++++++++ 6 files changed, 550 insertions(+) create mode 100644 block/keyslot-manager.c create mode 100644 include/linux/blk-crypto.h create mode 100644 include/linux/keyslot-manager.h (limited to 'include/linux') diff --git a/block/Kconfig b/block/Kconfig index 41cb34b0fcd1..f8870c316a03 100644 --- a/block/Kconfig +++ b/block/Kconfig @@ -186,6 +186,13 @@ config BLK_SED_OPAL Enabling this option enables users to setup/unlock/lock Locking ranges for SED devices using the Opal protocol. +config BLK_INLINE_ENCRYPTION + bool "Enable inline encryption support in block layer" + help + Build the blk-crypto subsystem. Enabling this lets the + block layer handle encryption, so users can take + advantage of inline encryption hardware if present. 
+ menu "Partition Types" source "block/partitions/Kconfig" diff --git a/block/Makefile b/block/Makefile index 206b96e9387f..fc963e4676b0 100644 --- a/block/Makefile +++ b/block/Makefile @@ -36,3 +36,4 @@ obj-$(CONFIG_BLK_DEBUG_FS) += blk-mq-debugfs.o obj-$(CONFIG_BLK_DEBUG_FS_ZONED)+= blk-mq-debugfs-zoned.o obj-$(CONFIG_BLK_SED_OPAL) += sed-opal.o obj-$(CONFIG_BLK_PM) += blk-pm.o +obj-$(CONFIG_BLK_INLINE_ENCRYPTION) += keyslot-manager.o diff --git a/block/keyslot-manager.c b/block/keyslot-manager.c new file mode 100644 index 000000000000..fcd3fd469d7c --- /dev/null +++ b/block/keyslot-manager.c @@ -0,0 +1,378 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright 2019 Google LLC + */ + +/** + * DOC: The Keyslot Manager + * + * Many devices with inline encryption support have a limited number of "slots" + * into which encryption contexts may be programmed, and requests can be tagged + * with a slot number to specify the key to use for en/decryption. + * + * As the number of slots is limited, and programming keys is expensive on + * many inline encryption hardware, we don't want to program the same key into + * multiple slots - if multiple requests are using the same key, we want to + * program just one slot with that key and use that slot for all requests. + * + * The keyslot manager manages these keyslots appropriately, and also acts as + * an abstraction between the inline encryption hardware and the upper layers. + * + * Lower layer devices will set up a keyslot manager in their request queue + * and tell it how to perform device specific operations like programming/ + * evicting keys from keyslots. + * + * Upper layers will call blk_ksm_get_slot_for_key() to program a + * key into some slot in the inline encryption hardware. + */ +#include +#include +#include +#include +#include +#include + +struct blk_ksm_keyslot { + atomic_t slot_refs; + struct list_head idle_slot_node; + struct hlist_node hash_node; + const struct blk_crypto_key *key; + struct blk_keyslot_manager *ksm; +}; + +static inline void blk_ksm_hw_enter(struct blk_keyslot_manager *ksm) +{ + /* + * Calling into the driver requires ksm->lock held and the device + * resumed. But we must resume the device first, since that can acquire + * and release ksm->lock via blk_ksm_reprogram_all_keys(). + */ + if (ksm->dev) + pm_runtime_get_sync(ksm->dev); + down_write(&ksm->lock); +} + +static inline void blk_ksm_hw_exit(struct blk_keyslot_manager *ksm) +{ + up_write(&ksm->lock); + if (ksm->dev) + pm_runtime_put_sync(ksm->dev); +} + +/** + * blk_ksm_init() - Initialize a keyslot manager + * @ksm: The keyslot_manager to initialize. + * @num_slots: The number of key slots to manage. + * + * Allocate memory for keyslots and initialize a keyslot manager. Called by + * e.g. storage drivers to set up a keyslot manager in their request_queue. + * + * Return: 0 on success, or else a negative error code. 
+ */ +int blk_ksm_init(struct blk_keyslot_manager *ksm, unsigned int num_slots) +{ + unsigned int slot; + unsigned int i; + unsigned int slot_hashtable_size; + + memset(ksm, 0, sizeof(*ksm)); + + if (num_slots == 0) + return -EINVAL; + + ksm->slots = kvcalloc(num_slots, sizeof(ksm->slots[0]), GFP_KERNEL); + if (!ksm->slots) + return -ENOMEM; + + ksm->num_slots = num_slots; + + init_rwsem(&ksm->lock); + + init_waitqueue_head(&ksm->idle_slots_wait_queue); + INIT_LIST_HEAD(&ksm->idle_slots); + + for (slot = 0; slot < num_slots; slot++) { + ksm->slots[slot].ksm = ksm; + list_add_tail(&ksm->slots[slot].idle_slot_node, + &ksm->idle_slots); + } + + spin_lock_init(&ksm->idle_slots_lock); + + slot_hashtable_size = roundup_pow_of_two(num_slots); + ksm->log_slot_ht_size = ilog2(slot_hashtable_size); + ksm->slot_hashtable = kvmalloc_array(slot_hashtable_size, + sizeof(ksm->slot_hashtable[0]), + GFP_KERNEL); + if (!ksm->slot_hashtable) + goto err_destroy_ksm; + for (i = 0; i < slot_hashtable_size; i++) + INIT_HLIST_HEAD(&ksm->slot_hashtable[i]); + + return 0; + +err_destroy_ksm: + blk_ksm_destroy(ksm); + return -ENOMEM; +} +EXPORT_SYMBOL_GPL(blk_ksm_init); + +static inline struct hlist_head * +blk_ksm_hash_bucket_for_key(struct blk_keyslot_manager *ksm, + const struct blk_crypto_key *key) +{ + return &ksm->slot_hashtable[hash_ptr(key, ksm->log_slot_ht_size)]; +} + +static void blk_ksm_remove_slot_from_lru_list(struct blk_ksm_keyslot *slot) +{ + struct blk_keyslot_manager *ksm = slot->ksm; + unsigned long flags; + + spin_lock_irqsave(&ksm->idle_slots_lock, flags); + list_del(&slot->idle_slot_node); + spin_unlock_irqrestore(&ksm->idle_slots_lock, flags); +} + +static struct blk_ksm_keyslot *blk_ksm_find_keyslot( + struct blk_keyslot_manager *ksm, + const struct blk_crypto_key *key) +{ + const struct hlist_head *head = blk_ksm_hash_bucket_for_key(ksm, key); + struct blk_ksm_keyslot *slotp; + + hlist_for_each_entry(slotp, head, hash_node) { + if (slotp->key == key) + return slotp; + } + return NULL; +} + +static struct blk_ksm_keyslot *blk_ksm_find_and_grab_keyslot( + struct blk_keyslot_manager *ksm, + const struct blk_crypto_key *key) +{ + struct blk_ksm_keyslot *slot; + + slot = blk_ksm_find_keyslot(ksm, key); + if (!slot) + return NULL; + if (atomic_inc_return(&slot->slot_refs) == 1) { + /* Took first reference to this slot; remove it from LRU list */ + blk_ksm_remove_slot_from_lru_list(slot); + } + return slot; +} + +unsigned int blk_ksm_get_slot_idx(struct blk_ksm_keyslot *slot) +{ + return slot - slot->ksm->slots; +} +EXPORT_SYMBOL_GPL(blk_ksm_get_slot_idx); + +/** + * blk_ksm_get_slot_for_key() - Program a key into a keyslot. + * @ksm: The keyslot manager to program the key into. + * @key: Pointer to the key object to program, including the raw key, crypto + * mode, and data unit size. + * @slot_ptr: A pointer to return the pointer of the allocated keyslot. + * + * Get a keyslot that's been programmed with the specified key. If one already + * exists, return it with incremented refcount. Otherwise, wait for a keyslot + * to become idle and program it. + * + * Context: Process context. Takes and releases ksm->lock. + * Return: BLK_STS_OK on success (and keyslot is set to the pointer of the + * allocated keyslot), or some other blk_status_t otherwise (and + * keyslot is set to NULL). 
+ */ +blk_status_t blk_ksm_get_slot_for_key(struct blk_keyslot_manager *ksm, + const struct blk_crypto_key *key, + struct blk_ksm_keyslot **slot_ptr) +{ + struct blk_ksm_keyslot *slot; + int slot_idx; + int err; + + *slot_ptr = NULL; + down_read(&ksm->lock); + slot = blk_ksm_find_and_grab_keyslot(ksm, key); + up_read(&ksm->lock); + if (slot) + goto success; + + for (;;) { + blk_ksm_hw_enter(ksm); + slot = blk_ksm_find_and_grab_keyslot(ksm, key); + if (slot) { + blk_ksm_hw_exit(ksm); + goto success; + } + + /* + * If we're here, that means there wasn't a slot that was + * already programmed with the key. So try to program it. + */ + if (!list_empty(&ksm->idle_slots)) + break; + + blk_ksm_hw_exit(ksm); + wait_event(ksm->idle_slots_wait_queue, + !list_empty(&ksm->idle_slots)); + } + + slot = list_first_entry(&ksm->idle_slots, struct blk_ksm_keyslot, + idle_slot_node); + slot_idx = blk_ksm_get_slot_idx(slot); + + err = ksm->ksm_ll_ops.keyslot_program(ksm, key, slot_idx); + if (err) { + wake_up(&ksm->idle_slots_wait_queue); + blk_ksm_hw_exit(ksm); + return errno_to_blk_status(err); + } + + /* Move this slot to the hash list for the new key. */ + if (slot->key) + hlist_del(&slot->hash_node); + slot->key = key; + hlist_add_head(&slot->hash_node, blk_ksm_hash_bucket_for_key(ksm, key)); + + atomic_set(&slot->slot_refs, 1); + + blk_ksm_remove_slot_from_lru_list(slot); + + blk_ksm_hw_exit(ksm); +success: + *slot_ptr = slot; + return BLK_STS_OK; +} + +/** + * blk_ksm_put_slot() - Release a reference to a slot + * @slot: The keyslot to release the reference of. + * + * Context: Any context. + */ +void blk_ksm_put_slot(struct blk_ksm_keyslot *slot) +{ + struct blk_keyslot_manager *ksm; + unsigned long flags; + + if (!slot) + return; + + ksm = slot->ksm; + + if (atomic_dec_and_lock_irqsave(&slot->slot_refs, + &ksm->idle_slots_lock, flags)) { + list_add_tail(&slot->idle_slot_node, &ksm->idle_slots); + spin_unlock_irqrestore(&ksm->idle_slots_lock, flags); + wake_up(&ksm->idle_slots_wait_queue); + } +} + +/** + * blk_ksm_crypto_cfg_supported() - Find out if a crypto configuration is + * supported by a ksm. + * @ksm: The keyslot manager to check + * @cfg: The crypto configuration to check for. + * + * Checks for crypto_mode/data unit size/dun bytes support. + * + * Return: Whether or not this ksm supports the specified crypto config. + */ +bool blk_ksm_crypto_cfg_supported(struct blk_keyslot_manager *ksm, + const struct blk_crypto_config *cfg) +{ + if (!ksm) + return false; + if (!(ksm->crypto_modes_supported[cfg->crypto_mode] & + cfg->data_unit_size)) + return false; + if (ksm->max_dun_bytes_supported < cfg->dun_bytes) + return false; + return true; +} + +/** + * blk_ksm_evict_key() - Evict a key from the lower layer device. + * @ksm: The keyslot manager to evict from + * @key: The key to evict + * + * Find the keyslot that the specified key was programmed into, and evict that + * slot from the lower layer device. The slot must not be in use by any + * in-flight IO when this function is called. + * + * Context: Process context. Takes and releases ksm->lock. + * Return: 0 on success or if there's no keyslot with the specified key, -EBUSY + * if the keyslot is still in use, or another -errno value on other + * error. 
+ */ +int blk_ksm_evict_key(struct blk_keyslot_manager *ksm, + const struct blk_crypto_key *key) +{ + struct blk_ksm_keyslot *slot; + int err = 0; + + blk_ksm_hw_enter(ksm); + slot = blk_ksm_find_keyslot(ksm, key); + if (!slot) + goto out_unlock; + + if (WARN_ON_ONCE(atomic_read(&slot->slot_refs) != 0)) { + err = -EBUSY; + goto out_unlock; + } + err = ksm->ksm_ll_ops.keyslot_evict(ksm, key, + blk_ksm_get_slot_idx(slot)); + if (err) + goto out_unlock; + + hlist_del(&slot->hash_node); + slot->key = NULL; + err = 0; +out_unlock: + blk_ksm_hw_exit(ksm); + return err; +} + +/** + * blk_ksm_reprogram_all_keys() - Re-program all keyslots. + * @ksm: The keyslot manager + * + * Re-program all keyslots that are supposed to have a key programmed. This is + * intended only for use by drivers for hardware that loses its keys on reset. + * + * Context: Process context. Takes and releases ksm->lock. + */ +void blk_ksm_reprogram_all_keys(struct blk_keyslot_manager *ksm) +{ + unsigned int slot; + + /* This is for device initialization, so don't resume the device */ + down_write(&ksm->lock); + for (slot = 0; slot < ksm->num_slots; slot++) { + const struct blk_crypto_key *key = ksm->slots[slot].key; + int err; + + if (!key) + continue; + + err = ksm->ksm_ll_ops.keyslot_program(ksm, key, slot); + WARN_ON(err); + } + up_write(&ksm->lock); +} +EXPORT_SYMBOL_GPL(blk_ksm_reprogram_all_keys); + +void blk_ksm_destroy(struct blk_keyslot_manager *ksm) +{ + if (!ksm) + return; + kvfree(ksm->slot_hashtable); + memzero_explicit(ksm->slots, sizeof(ksm->slots[0]) * ksm->num_slots); + kvfree(ksm->slots); + memzero_explicit(ksm, sizeof(*ksm)); +} +EXPORT_SYMBOL_GPL(blk_ksm_destroy); diff --git a/include/linux/blk-crypto.h b/include/linux/blk-crypto.h new file mode 100644 index 000000000000..4e77938c3d0e --- /dev/null +++ b/include/linux/blk-crypto.h @@ -0,0 +1,52 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Copyright 2019 Google LLC + */ + +#ifndef __LINUX_BLK_CRYPTO_H +#define __LINUX_BLK_CRYPTO_H + +enum blk_crypto_mode_num { + BLK_ENCRYPTION_MODE_INVALID, + BLK_ENCRYPTION_MODE_AES_256_XTS, + BLK_ENCRYPTION_MODE_AES_128_CBC_ESSIV, + BLK_ENCRYPTION_MODE_ADIANTUM, + BLK_ENCRYPTION_MODE_MAX, +}; + +#define BLK_CRYPTO_MAX_KEY_SIZE 64 +/** + * struct blk_crypto_config - an inline encryption key's crypto configuration + * @crypto_mode: encryption algorithm this key is for + * @data_unit_size: the data unit size for all encryption/decryptions with this + * key. This is the size in bytes of each individual plaintext and + * ciphertext. This is always a power of 2. It might be e.g. the + * filesystem block size or the disk sector size. + * @dun_bytes: the maximum number of bytes of DUN used when using this key + */ +struct blk_crypto_config { + enum blk_crypto_mode_num crypto_mode; + unsigned int data_unit_size; + unsigned int dun_bytes; +}; + +/** + * struct blk_crypto_key - an inline encryption key + * @crypto_cfg: the crypto configuration (like crypto_mode, key size) for this + * key + * @data_unit_size_bits: log2 of data_unit_size + * @size: size of this key in bytes (determined by @crypto_cfg.crypto_mode) + * @raw: the raw bytes of this key. Only the first @size bytes are used. + * + * A blk_crypto_key is immutable once created, and many bios can reference it at + * the same time. It must not be freed until all bios using it have completed + * and it has been evicted from all devices on which it may have been used. 
+ */ +struct blk_crypto_key { + struct blk_crypto_config crypto_cfg; + unsigned int data_unit_size_bits; + unsigned int size; + u8 raw[BLK_CRYPTO_MAX_KEY_SIZE]; +}; + +#endif /* __LINUX_BLK_CRYPTO_H */ diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index f9e4b21b051b..354e44eebef9 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -43,6 +43,7 @@ struct pr_ops; struct rq_qos; struct blk_queue_stats; struct blk_stat_callback; +struct blk_keyslot_manager; #define BLKDEV_MIN_RQ 4 #define BLKDEV_MAX_RQ 128 /* Default maximum */ @@ -468,6 +469,11 @@ struct request_queue { unsigned int dma_pad_mask; unsigned int dma_alignment; +#ifdef CONFIG_BLK_INLINE_ENCRYPTION + /* Inline crypto capabilities */ + struct blk_keyslot_manager *ksm; +#endif + unsigned int rq_timeout; int poll_nsec; diff --git a/include/linux/keyslot-manager.h b/include/linux/keyslot-manager.h new file mode 100644 index 000000000000..18f3f5346843 --- /dev/null +++ b/include/linux/keyslot-manager.h @@ -0,0 +1,106 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Copyright 2019 Google LLC + */ + +#ifndef __LINUX_KEYSLOT_MANAGER_H +#define __LINUX_KEYSLOT_MANAGER_H + +#include +#include + +struct blk_keyslot_manager; + +/** + * struct blk_ksm_ll_ops - functions to manage keyslots in hardware + * @keyslot_program: Program the specified key into the specified slot in the + * inline encryption hardware. + * @keyslot_evict: Evict key from the specified keyslot in the hardware. + * The key is provided so that e.g. dm layers can evict + * keys from the devices that they map over. + * Returns 0 on success, -errno otherwise. + * + * This structure should be provided by storage device drivers when they set up + * a keyslot manager - this structure holds the function ptrs that the keyslot + * manager will use to manipulate keyslots in the hardware. + */ +struct blk_ksm_ll_ops { + int (*keyslot_program)(struct blk_keyslot_manager *ksm, + const struct blk_crypto_key *key, + unsigned int slot); + int (*keyslot_evict)(struct blk_keyslot_manager *ksm, + const struct blk_crypto_key *key, + unsigned int slot); +}; + +struct blk_keyslot_manager { + /* + * The struct blk_ksm_ll_ops that this keyslot manager will use + * to perform operations like programming and evicting keys on the + * device + */ + struct blk_ksm_ll_ops ksm_ll_ops; + + /* + * The maximum number of bytes supported for specifying the data unit + * number. + */ + unsigned int max_dun_bytes_supported; + + /* + * Array of size BLK_ENCRYPTION_MODE_MAX of bitmasks that represents + * whether a crypto mode and data unit size are supported. The i'th + * bit of crypto_mode_supported[crypto_mode] is set iff a data unit + * size of (1 << i) is supported. We only support data unit sizes + * that are powers of 2. + */ + unsigned int crypto_modes_supported[BLK_ENCRYPTION_MODE_MAX]; + + /* Device for runtime power management (NULL if none) */ + struct device *dev; + + /* Here onwards are *private* fields for internal keyslot manager use */ + + unsigned int num_slots; + + /* Protects programming and evicting keys from the device */ + struct rw_semaphore lock; + + /* List of idle slots, with least recently used slot at front */ + wait_queue_head_t idle_slots_wait_queue; + struct list_head idle_slots; + spinlock_t idle_slots_lock; + + /* + * Hash table which maps struct *blk_crypto_key to keyslots, so that we + * can find a key's keyslot in O(1) time rather than O(num_slots). + * Protected by 'lock'. 
+ */ + struct hlist_head *slot_hashtable; + unsigned int log_slot_ht_size; + + /* Per-keyslot data */ + struct blk_ksm_keyslot *slots; +}; + +int blk_ksm_init(struct blk_keyslot_manager *ksm, unsigned int num_slots); + +blk_status_t blk_ksm_get_slot_for_key(struct blk_keyslot_manager *ksm, + const struct blk_crypto_key *key, + struct blk_ksm_keyslot **slot_ptr); + +unsigned int blk_ksm_get_slot_idx(struct blk_ksm_keyslot *slot); + +void blk_ksm_put_slot(struct blk_ksm_keyslot *slot); + +bool blk_ksm_crypto_cfg_supported(struct blk_keyslot_manager *ksm, + const struct blk_crypto_config *cfg); + +int blk_ksm_evict_key(struct blk_keyslot_manager *ksm, + const struct blk_crypto_key *key); + +void blk_ksm_reprogram_all_keys(struct blk_keyslot_manager *ksm); + +void blk_ksm_destroy(struct blk_keyslot_manager *ksm); + +#endif /* __LINUX_KEYSLOT_MANAGER_H */ -- cgit v1.2.3 From a892c8d52c02284076fbbacae6692aa5c5807d11 Mon Sep 17 00:00:00 2001 From: Satya Tangirala Date: Thu, 14 May 2020 00:37:18 +0000 Subject: block: Inline encryption support for blk-mq We must have some way of letting a storage device driver know what encryption context it should use for en/decrypting a request. However, it's the upper layers (like the filesystem/fscrypt) that know about and manages encryption contexts. As such, when the upper layer submits a bio to the block layer, and this bio eventually reaches a device driver with support for inline encryption, the device driver will need to have been told the encryption context for that bio. We want to communicate the encryption context from the upper layer to the storage device along with the bio, when the bio is submitted to the block layer. To do this, we add a struct bio_crypt_ctx to struct bio, which can represent an encryption context (note that we can't use the bi_private field in struct bio to do this because that field does not function to pass information across layers in the storage stack). We also introduce various functions to manipulate the bio_crypt_ctx and make the bio/request merging logic aware of the bio_crypt_ctx. We also make changes to blk-mq to make it handle bios with encryption contexts. blk-mq can merge many bios into the same request. These bios need to have contiguous data unit numbers (the necessary changes to blk-merge are also made to ensure this) - as such, it suffices to keep the data unit number of just the first bio, since that's all a storage driver needs to infer the data unit number to use for each data block in each bio in a request. blk-mq keeps track of the encryption context to be used for all the bios in a request with the request's rq_crypt_ctx. When the first bio is added to an empty request, blk-mq will program the encryption context of that bio into the request_queue's keyslot manager, and store the returned keyslot in the request's rq_crypt_ctx. All the functions to operate on encryption contexts are in blk-crypto.c. Upper layers only need to call bio_crypt_set_ctx with the encryption key, algorithm and data_unit_num; they don't have to worry about getting a keyslot for each encryption context, as blk-mq/blk-crypto handles that. Blk-crypto also makes it possible for request-based layered devices like dm-rq to make use of inline encryption hardware by cloning the rq_crypt_ctx and programming a keyslot in the new request_queue when necessary. Note that any user of the block layer can submit bios with an encryption context, such as filesystems, device-mapper targets, etc. 
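To make the upper-layer flow concrete, here is a rough sketch of a
filesystem tagging a bio with an encryption context. It assumes the
blk_crypto_init_key() and bio_crypt_set_ctx() helpers that this series
adds in the (truncated) remainder of blk-crypto.c and blk-crypto.h; the
my_fs_* names and structure are hypothetical:

    #include <linux/bio.h>
    #include <linux/blk-crypto.h>

    struct my_fs_info {                             /* hypothetical */
            struct blk_crypto_key crypto_key;       /* must outlive all bios using it */
    };

    /* One-time setup: AES-256-XTS, 8-byte DUNs, 4096-byte data units. */
    static int my_fs_setup_key(struct my_fs_info *fi, const u8 *raw_key)
    {
            return blk_crypto_init_key(&fi->crypto_key, raw_key,
                                       BLK_ENCRYPTION_MODE_AES_256_XTS,
                                       sizeof(u64), 4096);
    }

    /* Per-bio: attach the context; blk-mq handles keyslot programming. */
    static void my_fs_submit_encrypted_bio(struct my_fs_info *fi,
                                           struct bio *bio, u64 first_lblk)
    {
            u64 dun[BLK_CRYPTO_DUN_ARRAY_SIZE] = { first_lblk };

            bio_crypt_set_ctx(bio, &fi->crypto_key, dun, GFP_NOIO);
            submit_bio(bio);
    }

The filesystem never touches keyslots itself: when the first such bio
is added to an empty request, blk-mq programs the key into the queue's
keyslot manager and keeps the returned slot for the request, as
described above.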
Signed-off-by: Satya Tangirala Reviewed-by: Eric Biggers Reviewed-by: Christoph Hellwig Signed-off-by: Jens Axboe --- block/Makefile | 2 +- block/bio.c | 6 + block/blk-core.c | 27 +++- block/blk-crypto-internal.h | 166 +++++++++++++++++++ block/blk-crypto.c | 376 ++++++++++++++++++++++++++++++++++++++++++++ block/blk-map.c | 1 + block/blk-merge.c | 11 ++ block/blk-mq.c | 13 ++ block/blk.h | 2 + block/bounce.c | 2 + drivers/md/dm.c | 3 + include/linux/blk-crypto.h | 71 +++++++++ include/linux/blk_types.h | 6 + include/linux/blkdev.h | 5 + 14 files changed, 684 insertions(+), 7 deletions(-) create mode 100644 block/blk-crypto-internal.h create mode 100644 block/blk-crypto.c (limited to 'include/linux') diff --git a/block/Makefile b/block/Makefile index fc963e4676b0..3ee88d3e807d 100644 --- a/block/Makefile +++ b/block/Makefile @@ -36,4 +36,4 @@ obj-$(CONFIG_BLK_DEBUG_FS) += blk-mq-debugfs.o obj-$(CONFIG_BLK_DEBUG_FS_ZONED)+= blk-mq-debugfs-zoned.o obj-$(CONFIG_BLK_SED_OPAL) += sed-opal.o obj-$(CONFIG_BLK_PM) += blk-pm.o -obj-$(CONFIG_BLK_INLINE_ENCRYPTION) += keyslot-manager.o +obj-$(CONFIG_BLK_INLINE_ENCRYPTION) += keyslot-manager.o blk-crypto.o diff --git a/block/bio.c b/block/bio.c index e4c46e2bd5ba..1594804fe8bc 100644 --- a/block/bio.c +++ b/block/bio.c @@ -18,6 +18,7 @@ #include #include #include +#include #include #include "blk.h" @@ -237,6 +238,8 @@ void bio_uninit(struct bio *bio) if (bio_integrity(bio)) bio_integrity_free(bio); + + bio_crypt_free_ctx(bio); } EXPORT_SYMBOL(bio_uninit); @@ -708,6 +711,8 @@ struct bio *bio_clone_fast(struct bio *bio, gfp_t gfp_mask, struct bio_set *bs) __bio_clone_fast(b, bio); + bio_crypt_clone(b, bio, gfp_mask); + if (bio_integrity(bio)) { int ret; @@ -1172,6 +1177,7 @@ void bio_advance(struct bio *bio, unsigned bytes) if (bio_integrity(bio)) bio_integrity_advance(bio, bytes); + bio_crypt_advance(bio, bytes); bio_advance_iter(bio, &bio->bi_iter, bytes); } EXPORT_SYMBOL(bio_advance); diff --git a/block/blk-core.c b/block/blk-core.c index 7c1587b45427..1e97f9973523 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -39,6 +39,7 @@ #include #include #include +#include #define CREATE_TRACE_POINTS #include @@ -121,6 +122,7 @@ void blk_rq_init(struct request_queue *q, struct request *rq) rq->start_time_ns = ktime_get_ns(); rq->part = NULL; refcount_set(&rq->ref, 1); + blk_crypto_rq_set_defaults(rq); } EXPORT_SYMBOL(blk_rq_init); @@ -652,6 +654,8 @@ bool bio_attempt_back_merge(struct request *req, struct bio *bio, req->biotail = bio; req->__data_len += bio->bi_iter.bi_size; + bio_crypt_free_ctx(bio); + blk_account_io_start(req, false); return true; } @@ -676,6 +680,8 @@ bool bio_attempt_front_merge(struct request *req, struct bio *bio, req->__sector = bio->bi_iter.bi_sector; req->__data_len += bio->bi_iter.bi_size; + bio_crypt_do_front_merge(req, bio); + blk_account_io_start(req, false); return true; } @@ -1125,10 +1131,12 @@ blk_qc_t generic_make_request(struct bio *bio) /* Create a fresh bio_list for all subordinate requests */ bio_list_on_stack[1] = bio_list_on_stack[0]; bio_list_init(&bio_list_on_stack[0]); - if (q->make_request_fn) - ret = q->make_request_fn(q, bio); - else - ret = blk_mq_make_request(q, bio); + if (blk_crypto_bio_prep(&bio)) { + if (q->make_request_fn) + ret = q->make_request_fn(q, bio); + else + ret = blk_mq_make_request(q, bio); + } blk_queue_exit(q); @@ -1167,7 +1175,7 @@ EXPORT_SYMBOL(generic_make_request); blk_qc_t direct_make_request(struct bio *bio) { struct request_queue *q = bio->bi_disk->queue; - blk_qc_t ret; + 
blk_qc_t ret = BLK_QC_T_NONE; if (WARN_ON_ONCE(q->make_request_fn)) { bio_io_error(bio); @@ -1177,7 +1185,8 @@ blk_qc_t direct_make_request(struct bio *bio) return BLK_QC_T_NONE; if (unlikely(bio_queue_enter(bio))) return BLK_QC_T_NONE; - ret = blk_mq_make_request(q, bio); + if (blk_crypto_bio_prep(&bio)) + ret = blk_mq_make_request(q, bio); blk_queue_exit(q); return ret; } @@ -1309,6 +1318,9 @@ blk_status_t blk_insert_cloned_request(struct request_queue *q, struct request * should_fail_request(&rq->rq_disk->part0, blk_rq_bytes(rq))) return BLK_STS_IOERR; + if (blk_crypto_insert_cloned_request(rq)) + return BLK_STS_IOERR; + if (blk_queue_io_stat(q)) blk_account_io_start(rq, true); @@ -1673,6 +1685,9 @@ int blk_rq_prep_clone(struct request *rq, struct request *rq_src, rq->nr_phys_segments = rq_src->nr_phys_segments; rq->ioprio = rq_src->ioprio; + if (rq->bio) + blk_crypto_rq_bio_prep(rq, rq->bio, gfp_mask); + return 0; free_and_out: diff --git a/block/blk-crypto-internal.h b/block/blk-crypto-internal.h new file mode 100644 index 000000000000..796f757fe8e9 --- /dev/null +++ b/block/blk-crypto-internal.h @@ -0,0 +1,166 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Copyright 2019 Google LLC + */ + +#ifndef __LINUX_BLK_CRYPTO_INTERNAL_H +#define __LINUX_BLK_CRYPTO_INTERNAL_H + +#include +#include + +/* Represents a crypto mode supported by blk-crypto */ +struct blk_crypto_mode { + unsigned int keysize; /* key size in bytes */ + unsigned int ivsize; /* iv size in bytes */ +}; + +#ifdef CONFIG_BLK_INLINE_ENCRYPTION + +void bio_crypt_dun_increment(u64 dun[BLK_CRYPTO_DUN_ARRAY_SIZE], + unsigned int inc); + +bool bio_crypt_rq_ctx_compatible(struct request *rq, struct bio *bio); + +bool bio_crypt_ctx_mergeable(struct bio_crypt_ctx *bc1, unsigned int bc1_bytes, + struct bio_crypt_ctx *bc2); + +static inline bool bio_crypt_ctx_back_mergeable(struct request *req, + struct bio *bio) +{ + return bio_crypt_ctx_mergeable(req->crypt_ctx, blk_rq_bytes(req), + bio->bi_crypt_context); +} + +static inline bool bio_crypt_ctx_front_mergeable(struct request *req, + struct bio *bio) +{ + return bio_crypt_ctx_mergeable(bio->bi_crypt_context, + bio->bi_iter.bi_size, req->crypt_ctx); +} + +static inline bool bio_crypt_ctx_merge_rq(struct request *req, + struct request *next) +{ + return bio_crypt_ctx_mergeable(req->crypt_ctx, blk_rq_bytes(req), + next->crypt_ctx); +} + +static inline void blk_crypto_rq_set_defaults(struct request *rq) +{ + rq->crypt_ctx = NULL; + rq->crypt_keyslot = NULL; +} + +static inline bool blk_crypto_rq_is_encrypted(struct request *rq) +{ + return rq->crypt_ctx; +} + +#else /* CONFIG_BLK_INLINE_ENCRYPTION */ + +static inline bool bio_crypt_rq_ctx_compatible(struct request *rq, + struct bio *bio) +{ + return true; +} + +static inline bool bio_crypt_ctx_front_mergeable(struct request *req, + struct bio *bio) +{ + return true; +} + +static inline bool bio_crypt_ctx_back_mergeable(struct request *req, + struct bio *bio) +{ + return true; +} + +static inline bool bio_crypt_ctx_merge_rq(struct request *req, + struct request *next) +{ + return true; +} + +static inline void blk_crypto_rq_set_defaults(struct request *rq) { } + +static inline bool blk_crypto_rq_is_encrypted(struct request *rq) +{ + return false; +} + +#endif /* CONFIG_BLK_INLINE_ENCRYPTION */ + +void __bio_crypt_advance(struct bio *bio, unsigned int bytes); +static inline void bio_crypt_advance(struct bio *bio, unsigned int bytes) +{ + if (bio_has_crypt_ctx(bio)) + __bio_crypt_advance(bio, bytes); +} + +void 
__bio_crypt_free_ctx(struct bio *bio); +static inline void bio_crypt_free_ctx(struct bio *bio) +{ + if (bio_has_crypt_ctx(bio)) + __bio_crypt_free_ctx(bio); +} + +static inline void bio_crypt_do_front_merge(struct request *rq, + struct bio *bio) +{ +#ifdef CONFIG_BLK_INLINE_ENCRYPTION + if (bio_has_crypt_ctx(bio)) + memcpy(rq->crypt_ctx->bc_dun, bio->bi_crypt_context->bc_dun, + sizeof(rq->crypt_ctx->bc_dun)); +#endif +} + +bool __blk_crypto_bio_prep(struct bio **bio_ptr); +static inline bool blk_crypto_bio_prep(struct bio **bio_ptr) +{ + if (bio_has_crypt_ctx(*bio_ptr)) + return __blk_crypto_bio_prep(bio_ptr); + return true; +} + +blk_status_t __blk_crypto_init_request(struct request *rq); +static inline blk_status_t blk_crypto_init_request(struct request *rq) +{ + if (blk_crypto_rq_is_encrypted(rq)) + return __blk_crypto_init_request(rq); + return BLK_STS_OK; +} + +void __blk_crypto_free_request(struct request *rq); +static inline void blk_crypto_free_request(struct request *rq) +{ + if (blk_crypto_rq_is_encrypted(rq)) + __blk_crypto_free_request(rq); +} + +void __blk_crypto_rq_bio_prep(struct request *rq, struct bio *bio, + gfp_t gfp_mask); +static inline void blk_crypto_rq_bio_prep(struct request *rq, struct bio *bio, + gfp_t gfp_mask) +{ + if (bio_has_crypt_ctx(bio)) + __blk_crypto_rq_bio_prep(rq, bio, gfp_mask); +} + +/** + * blk_crypto_insert_cloned_request - Prepare a cloned request to be inserted + * into a request queue. + * @rq: the request being queued + * + * Return: BLK_STS_OK on success, nonzero on error. + */ +static inline blk_status_t blk_crypto_insert_cloned_request(struct request *rq) +{ + + if (blk_crypto_rq_is_encrypted(rq)) + return blk_crypto_init_request(rq); + return BLK_STS_OK; +} + +#endif /* __LINUX_BLK_CRYPTO_INTERNAL_H */ diff --git a/block/blk-crypto.c b/block/blk-crypto.c new file mode 100644 index 000000000000..25b981257f5f --- /dev/null +++ b/block/blk-crypto.c @@ -0,0 +1,376 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright 2019 Google LLC + */ + +/* + * Refer to Documentation/block/inline-encryption.rst for detailed explanation. + */ + +#define pr_fmt(fmt) "blk-crypto: " fmt + +#include +#include +#include +#include +#include + +#include "blk-crypto-internal.h" + +const struct blk_crypto_mode blk_crypto_modes[] = { + [BLK_ENCRYPTION_MODE_AES_256_XTS] = { + .keysize = 64, + .ivsize = 16, + }, + [BLK_ENCRYPTION_MODE_AES_128_CBC_ESSIV] = { + .keysize = 16, + .ivsize = 16, + }, + [BLK_ENCRYPTION_MODE_ADIANTUM] = { + .keysize = 32, + .ivsize = 32, + }, +}; + +/* + * This number needs to be at least (the number of threads doing IO + * concurrently) * (maximum recursive depth of a bio), so that we don't + * deadlock on crypt_ctx allocations. The default is chosen to be the same + * as the default number of post read contexts in both EXT4 and F2FS. + */ +static int num_prealloc_crypt_ctxs = 128; + +module_param(num_prealloc_crypt_ctxs, int, 0444); +MODULE_PARM_DESC(num_prealloc_crypt_ctxs, + "Number of bio crypto contexts to preallocate"); + +static struct kmem_cache *bio_crypt_ctx_cache; +static mempool_t *bio_crypt_ctx_pool; + +static int __init bio_crypt_ctx_init(void) +{ + size_t i; + + bio_crypt_ctx_cache = KMEM_CACHE(bio_crypt_ctx, 0); + if (!bio_crypt_ctx_cache) + goto out_no_mem; + + bio_crypt_ctx_pool = mempool_create_slab_pool(num_prealloc_crypt_ctxs, + bio_crypt_ctx_cache); + if (!bio_crypt_ctx_pool) + goto out_no_mem; + + /* This is assumed in various places. 
*/ + BUILD_BUG_ON(BLK_ENCRYPTION_MODE_INVALID != 0); + + /* Sanity check that no algorithm exceeds the defined limits. */ + for (i = 0; i < BLK_ENCRYPTION_MODE_MAX; i++) { + BUG_ON(blk_crypto_modes[i].keysize > BLK_CRYPTO_MAX_KEY_SIZE); + BUG_ON(blk_crypto_modes[i].ivsize > BLK_CRYPTO_MAX_IV_SIZE); + } + + return 0; +out_no_mem: + panic("Failed to allocate mem for bio crypt ctxs\n"); +} +subsys_initcall(bio_crypt_ctx_init); + +void bio_crypt_set_ctx(struct bio *bio, const struct blk_crypto_key *key, + const u64 dun[BLK_CRYPTO_DUN_ARRAY_SIZE], gfp_t gfp_mask) +{ + struct bio_crypt_ctx *bc = mempool_alloc(bio_crypt_ctx_pool, gfp_mask); + + bc->bc_key = key; + memcpy(bc->bc_dun, dun, sizeof(bc->bc_dun)); + + bio->bi_crypt_context = bc; +} + +void __bio_crypt_free_ctx(struct bio *bio) +{ + mempool_free(bio->bi_crypt_context, bio_crypt_ctx_pool); + bio->bi_crypt_context = NULL; +} + +void __bio_crypt_clone(struct bio *dst, struct bio *src, gfp_t gfp_mask) +{ + dst->bi_crypt_context = mempool_alloc(bio_crypt_ctx_pool, gfp_mask); + *dst->bi_crypt_context = *src->bi_crypt_context; +} +EXPORT_SYMBOL_GPL(__bio_crypt_clone); + +/* Increments @dun by @inc, treating @dun as a multi-limb integer. */ +void bio_crypt_dun_increment(u64 dun[BLK_CRYPTO_DUN_ARRAY_SIZE], + unsigned int inc) +{ + int i; + + for (i = 0; inc && i < BLK_CRYPTO_DUN_ARRAY_SIZE; i++) { + dun[i] += inc; + /* + * If the addition in this limb overflowed, then we need to + * carry 1 into the next limb. Else the carry is 0. + */ + if (dun[i] < inc) + inc = 1; + else + inc = 0; + } +} + +void __bio_crypt_advance(struct bio *bio, unsigned int bytes) +{ + struct bio_crypt_ctx *bc = bio->bi_crypt_context; + + bio_crypt_dun_increment(bc->bc_dun, + bytes >> bc->bc_key->data_unit_size_bits); +} + +/* + * Returns true if @bc->bc_dun plus @bytes converted to data units is equal to + * @next_dun, treating the DUNs as multi-limb integers. + */ +bool bio_crypt_dun_is_contiguous(const struct bio_crypt_ctx *bc, + unsigned int bytes, + const u64 next_dun[BLK_CRYPTO_DUN_ARRAY_SIZE]) +{ + int i; + unsigned int carry = bytes >> bc->bc_key->data_unit_size_bits; + + for (i = 0; i < BLK_CRYPTO_DUN_ARRAY_SIZE; i++) { + if (bc->bc_dun[i] + carry != next_dun[i]) + return false; + /* + * If the addition in this limb overflowed, then we need to + * carry 1 into the next limb. Else the carry is 0. + */ + if ((bc->bc_dun[i] + carry) < carry) + carry = 1; + else + carry = 0; + } + + /* If the DUN wrapped through 0, don't treat it as contiguous. */ + return carry == 0; +} + +/* + * Checks that two bio crypt contexts are compatible - i.e. that + * they are mergeable except for data_unit_num continuity. + */ +static bool bio_crypt_ctx_compatible(struct bio_crypt_ctx *bc1, + struct bio_crypt_ctx *bc2) +{ + if (!bc1) + return !bc2; + + return bc2 && bc1->bc_key == bc2->bc_key; +} + +bool bio_crypt_rq_ctx_compatible(struct request *rq, struct bio *bio) +{ + return bio_crypt_ctx_compatible(rq->crypt_ctx, bio->bi_crypt_context); +} + +/* + * Checks that two bio crypt contexts are compatible, and also + * that their data_unit_nums are continuous (and can hence be merged) + * in the order @bc1 followed by @bc2. + */ +bool bio_crypt_ctx_mergeable(struct bio_crypt_ctx *bc1, unsigned int bc1_bytes, + struct bio_crypt_ctx *bc2) +{ + if (!bio_crypt_ctx_compatible(bc1, bc2)) + return false; + + return !bc1 || bio_crypt_dun_is_contiguous(bc1, bc1_bytes, bc2->bc_dun); +} + +/* Check that all I/O segments are data unit aligned. 
*/ +static bool bio_crypt_check_alignment(struct bio *bio) +{ + const unsigned int data_unit_size = + bio->bi_crypt_context->bc_key->crypto_cfg.data_unit_size; + struct bvec_iter iter; + struct bio_vec bv; + + bio_for_each_segment(bv, bio, iter) { + if (!IS_ALIGNED(bv.bv_len | bv.bv_offset, data_unit_size)) + return false; + } + + return true; +} + +blk_status_t __blk_crypto_init_request(struct request *rq) +{ + return blk_ksm_get_slot_for_key(rq->q->ksm, rq->crypt_ctx->bc_key, + &rq->crypt_keyslot); +} + +/** + * __blk_crypto_free_request - Uninitialize the crypto fields of a request. + * + * @rq: The request whose crypto fields to uninitialize. + * + * Completely uninitializes the crypto fields of a request. If a keyslot has + * been programmed into some inline encryption hardware, that keyslot is + * released. The rq->crypt_ctx is also freed. + */ +void __blk_crypto_free_request(struct request *rq) +{ + blk_ksm_put_slot(rq->crypt_keyslot); + mempool_free(rq->crypt_ctx, bio_crypt_ctx_pool); + blk_crypto_rq_set_defaults(rq); +} + +/** + * __blk_crypto_bio_prep - Prepare bio for inline encryption + * + * @bio_ptr: pointer to original bio pointer + * + * Succeeds if the bio doesn't have inline encryption enabled or if the bio + * crypt context provided for the bio is supported by the underlying device's + * inline encryption hardware. Ends the bio with error otherwise. + * + * Caller must ensure bio has bio_crypt_ctx. + * + * Return: true on success; false on error (and bio->bi_status will be set + * appropriately, and bio_endio() will have been called so bio + * submission should abort). + */ +bool __blk_crypto_bio_prep(struct bio **bio_ptr) +{ + struct bio *bio = *bio_ptr; + const struct blk_crypto_key *bc_key = bio->bi_crypt_context->bc_key; + blk_status_t blk_st = BLK_STS_IOERR; + + /* Error if bio has no data. */ + if (WARN_ON_ONCE(!bio_has_data(bio))) + goto fail; + + if (!bio_crypt_check_alignment(bio)) + goto fail; + + /* + * Success if device supports the encryption context. + */ + if (!blk_ksm_crypto_cfg_supported(bio->bi_disk->queue->ksm, + &bc_key->crypto_cfg)) { + blk_st = BLK_STS_NOTSUPP; + goto fail; + } + + return true; +fail: + (*bio_ptr)->bi_status = blk_st; + bio_endio(*bio_ptr); + return false; +} + +/** + * __blk_crypto_rq_bio_prep - Prepare a request's crypt_ctx when its first bio + * is inserted + * + * @rq: The request to prepare + * @bio: The first bio being inserted into the request + * @gfp_mask: gfp mask + */ +void __blk_crypto_rq_bio_prep(struct request *rq, struct bio *bio, + gfp_t gfp_mask) +{ + if (!rq->crypt_ctx) + rq->crypt_ctx = mempool_alloc(bio_crypt_ctx_pool, gfp_mask); + *rq->crypt_ctx = *bio->bi_crypt_context; +} + +/** + * blk_crypto_init_key() - Prepare a key for use with blk-crypto + * @blk_key: Pointer to the blk_crypto_key to initialize. + * @raw_key: Pointer to the raw key. Must be the correct length for the chosen + * @crypto_mode; see blk_crypto_modes[]. + * @crypto_mode: identifier for the encryption algorithm to use + * @dun_bytes: number of bytes that will be used to specify the DUN when this + * key is used + * @data_unit_size: the data unit size to use for en/decryption + * + * Return: 0 on success, -errno on failure. The caller is responsible for + * zeroizing both blk_key and raw_key when done with them. 
+ */
+int blk_crypto_init_key(struct blk_crypto_key *blk_key, const u8 *raw_key,
+			enum blk_crypto_mode_num crypto_mode,
+			unsigned int dun_bytes,
+			unsigned int data_unit_size)
+{
+	const struct blk_crypto_mode *mode;
+
+	memset(blk_key, 0, sizeof(*blk_key));
+
+	if (crypto_mode >= ARRAY_SIZE(blk_crypto_modes))
+		return -EINVAL;
+
+	mode = &blk_crypto_modes[crypto_mode];
+	if (mode->keysize == 0)
+		return -EINVAL;
+
+	if (dun_bytes == 0 || dun_bytes > BLK_CRYPTO_MAX_IV_SIZE)
+		return -EINVAL;
+
+	if (!is_power_of_2(data_unit_size))
+		return -EINVAL;
+
+	blk_key->crypto_cfg.crypto_mode = crypto_mode;
+	blk_key->crypto_cfg.dun_bytes = dun_bytes;
+	blk_key->crypto_cfg.data_unit_size = data_unit_size;
+	blk_key->data_unit_size_bits = ilog2(data_unit_size);
+	blk_key->size = mode->keysize;
+	memcpy(blk_key->raw, raw_key, mode->keysize);
+
+	return 0;
+}
+
+bool blk_crypto_config_supported(struct request_queue *q,
+				 const struct blk_crypto_config *cfg)
+{
+	return blk_ksm_crypto_cfg_supported(q->ksm, cfg);
+}
+
+/**
+ * blk_crypto_start_using_key() - Start using a blk_crypto_key on a device
+ * @key: A key to use on the device
+ * @q: the request queue for the device
+ *
+ * Upper layers must call this function to ensure that the hardware supports
+ * the key's crypto settings.
+ *
+ * Return: 0 on success; -ENOPKG if the hardware doesn't support the key
+ */
+int blk_crypto_start_using_key(const struct blk_crypto_key *key,
+			       struct request_queue *q)
+{
+	if (blk_ksm_crypto_cfg_supported(q->ksm, &key->crypto_cfg))
+		return 0;
+	return -ENOPKG;
+}
+
+/**
+ * blk_crypto_evict_key() - Evict a key from any inline encryption hardware
+ *			    it may have been programmed into
+ * @q: The request queue whose associated inline encryption hardware this key
+ *     might have been programmed into
+ * @key: The key to evict
+ *
+ * Upper layers (filesystems) must call this function to ensure that a key is
+ * evicted from any hardware that it might have been programmed into. The key
+ * must not be in use by any in-flight IO when this function is called.
+ *
+ * Return: 0 on success or if key is not present in the q's ksm, -err on error.
+ */ +int blk_crypto_evict_key(struct request_queue *q, + const struct blk_crypto_key *key) +{ + if (blk_ksm_crypto_cfg_supported(q->ksm, &key->crypto_cfg)) + return blk_ksm_evict_key(q->ksm, key); + + return 0; +} diff --git a/block/blk-map.c b/block/blk-map.c index e3e4ac48db45..6e804892d5ec 100644 --- a/block/blk-map.c +++ b/block/blk-map.c @@ -550,6 +550,7 @@ int blk_rq_append_bio(struct request *rq, struct bio **bio) rq->biotail->bi_next = *bio; rq->biotail = *bio; rq->__data_len += (*bio)->bi_iter.bi_size; + bio_crypt_free_ctx(*bio); } return 0; diff --git a/block/blk-merge.c b/block/blk-merge.c index a04e991b5ded..1a9036398214 100644 --- a/block/blk-merge.c +++ b/block/blk-merge.c @@ -562,6 +562,8 @@ int ll_back_merge_fn(struct request *req, struct bio *bio, unsigned int nr_segs) if (blk_integrity_rq(req) && integrity_req_gap_back_merge(req, bio)) return 0; + if (!bio_crypt_ctx_back_mergeable(req, bio)) + return 0; if (blk_rq_sectors(req) + bio_sectors(bio) > blk_rq_get_max_sectors(req, blk_rq_pos(req))) { req_set_nomerge(req->q, req); @@ -578,6 +580,8 @@ int ll_front_merge_fn(struct request *req, struct bio *bio, unsigned int nr_segs if (blk_integrity_rq(req) && integrity_req_gap_front_merge(req, bio)) return 0; + if (!bio_crypt_ctx_front_mergeable(req, bio)) + return 0; if (blk_rq_sectors(req) + bio_sectors(bio) > blk_rq_get_max_sectors(req, bio->bi_iter.bi_sector)) { req_set_nomerge(req->q, req); @@ -627,6 +631,9 @@ static int ll_merge_requests_fn(struct request_queue *q, struct request *req, if (blk_integrity_merge_rq(q, req, next) == false) return 0; + if (!bio_crypt_ctx_merge_rq(req, next)) + return 0; + /* Merge is OK... */ req->nr_phys_segments = total_phys_segments; return 1; @@ -851,6 +858,10 @@ bool blk_rq_merge_ok(struct request *rq, struct bio *bio) if (blk_integrity_merge_bio(rq->q, rq, bio) == false) return false; + /* Only merge if the crypt contexts are compatible */ + if (!bio_crypt_rq_ctx_compatible(rq, bio)) + return false; + /* must be using the same buffer */ if (req_op(rq) == REQ_OP_WRITE_SAME && !blk_write_same_mergeable(rq->bio, bio)) diff --git a/block/blk-mq.c b/block/blk-mq.c index 9ee695bdf873..4f8adef7fd0d 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -26,6 +26,7 @@ #include #include #include +#include #include @@ -317,6 +318,7 @@ static struct request *blk_mq_rq_ctx_init(struct blk_mq_alloc_data *data, #if defined(CONFIG_BLK_DEV_INTEGRITY) rq->nr_integrity_segments = 0; #endif + blk_crypto_rq_set_defaults(rq); /* tag was already set */ WRITE_ONCE(rq->deadline, 0); @@ -473,6 +475,7 @@ static void __blk_mq_free_request(struct request *rq) struct blk_mq_hw_ctx *hctx = rq->mq_hctx; const int sched_tag = rq->internal_tag; + blk_crypto_free_request(rq); blk_pm_mark_last_busy(rq); rq->mq_hctx = NULL; if (rq->tag != -1) @@ -1820,6 +1823,7 @@ static void blk_mq_bio_to_request(struct request *rq, struct bio *bio, rq->__sector = bio->bi_iter.bi_sector; rq->write_hint = bio->bi_write_hint; blk_rq_bio_prep(rq, bio, nr_segs); + blk_crypto_rq_bio_prep(rq, bio, GFP_NOIO); blk_account_io_start(rq, true); } @@ -2021,6 +2025,7 @@ blk_qc_t blk_mq_make_request(struct request_queue *q, struct bio *bio) struct request *same_queue_rq = NULL; unsigned int nr_segs; blk_qc_t cookie; + blk_status_t ret; blk_queue_bounce(q, &bio); __blk_queue_split(q, &bio, &nr_segs); @@ -2054,6 +2059,14 @@ blk_qc_t blk_mq_make_request(struct request_queue *q, struct bio *bio) blk_mq_bio_to_request(rq, bio, nr_segs); + ret = blk_crypto_init_request(rq); + if (ret != BLK_STS_OK) { + 
bio->bi_status = ret; + bio_endio(bio); + blk_mq_free_request(rq); + return BLK_QC_T_NONE; + } + plug = blk_mq_plug(q, bio); if (unlikely(is_flush_fua)) { /* Bypass scheduler for flush requests */ diff --git a/block/blk.h b/block/blk.h index e5cd350ca379..fc00537026a0 100644 --- a/block/blk.h +++ b/block/blk.h @@ -5,7 +5,9 @@ #include #include #include +#include #include +#include "blk-crypto-internal.h" #include "blk-mq.h" #include "blk-mq-sched.h" diff --git a/block/bounce.c b/block/bounce.c index f8ed677a1bf7..c3aaed070124 100644 --- a/block/bounce.c +++ b/block/bounce.c @@ -267,6 +267,8 @@ static struct bio *bounce_clone_bio(struct bio *bio_src, gfp_t gfp_mask, break; } + bio_crypt_clone(bio, bio_src, gfp_mask); + if (bio_integrity(bio_src)) { int ret; diff --git a/drivers/md/dm.c b/drivers/md/dm.c index 0eb93da44ea2..8921cd79422c 100644 --- a/drivers/md/dm.c +++ b/drivers/md/dm.c @@ -26,6 +26,7 @@ #include #include #include +#include #define DM_MSG_PREFIX "core" @@ -1334,6 +1335,8 @@ static int clone_bio(struct dm_target_io *tio, struct bio *bio, __bio_clone_fast(clone, bio); + bio_crypt_clone(clone, bio, GFP_NOIO); + if (bio_integrity(bio)) { int r; diff --git a/include/linux/blk-crypto.h b/include/linux/blk-crypto.h index 4e77938c3d0e..76095b07dd90 100644 --- a/include/linux/blk-crypto.h +++ b/include/linux/blk-crypto.h @@ -6,6 +6,8 @@ #ifndef __LINUX_BLK_CRYPTO_H #define __LINUX_BLK_CRYPTO_H +#include + enum blk_crypto_mode_num { BLK_ENCRYPTION_MODE_INVALID, BLK_ENCRYPTION_MODE_AES_256_XTS, @@ -49,4 +51,73 @@ struct blk_crypto_key { u8 raw[BLK_CRYPTO_MAX_KEY_SIZE]; }; +#define BLK_CRYPTO_MAX_IV_SIZE 32 +#define BLK_CRYPTO_DUN_ARRAY_SIZE (BLK_CRYPTO_MAX_IV_SIZE / sizeof(u64)) + +/** + * struct bio_crypt_ctx - an inline encryption context + * @bc_key: the key, algorithm, and data unit size to use + * @bc_dun: the data unit number (starting IV) to use + * + * A bio_crypt_ctx specifies that the contents of the bio will be encrypted (for + * write requests) or decrypted (for read requests) inline by the storage device + * or controller. 
+ */ +struct bio_crypt_ctx { + const struct blk_crypto_key *bc_key; + u64 bc_dun[BLK_CRYPTO_DUN_ARRAY_SIZE]; +}; + +#include +#include + +struct request; +struct request_queue; + +#ifdef CONFIG_BLK_INLINE_ENCRYPTION + +static inline bool bio_has_crypt_ctx(struct bio *bio) +{ + return bio->bi_crypt_context; +} + +void bio_crypt_set_ctx(struct bio *bio, const struct blk_crypto_key *key, + const u64 dun[BLK_CRYPTO_DUN_ARRAY_SIZE], + gfp_t gfp_mask); + +bool bio_crypt_dun_is_contiguous(const struct bio_crypt_ctx *bc, + unsigned int bytes, + const u64 next_dun[BLK_CRYPTO_DUN_ARRAY_SIZE]); + +int blk_crypto_init_key(struct blk_crypto_key *blk_key, const u8 *raw_key, + enum blk_crypto_mode_num crypto_mode, + unsigned int dun_bytes, + unsigned int data_unit_size); + +int blk_crypto_start_using_key(const struct blk_crypto_key *key, + struct request_queue *q); + +int blk_crypto_evict_key(struct request_queue *q, + const struct blk_crypto_key *key); + +bool blk_crypto_config_supported(struct request_queue *q, + const struct blk_crypto_config *cfg); + +#else /* CONFIG_BLK_INLINE_ENCRYPTION */ + +static inline bool bio_has_crypt_ctx(struct bio *bio) +{ + return false; +} + +#endif /* CONFIG_BLK_INLINE_ENCRYPTION */ + +void __bio_crypt_clone(struct bio *dst, struct bio *src, gfp_t gfp_mask); +static inline void bio_crypt_clone(struct bio *dst, struct bio *src, + gfp_t gfp_mask) +{ + if (bio_has_crypt_ctx(src)) + __bio_crypt_clone(dst, src, gfp_mask); +} + #endif /* __LINUX_BLK_CRYPTO_H */ diff --git a/include/linux/blk_types.h b/include/linux/blk_types.h index b90dca1fa430..9322261fb316 100644 --- a/include/linux/blk_types.h +++ b/include/linux/blk_types.h @@ -18,6 +18,7 @@ struct block_device; struct io_context; struct cgroup_subsys_state; typedef void (bio_end_io_t) (struct bio *); +struct bio_crypt_ctx; /* * Block error status values. See block/blk-core:blk_errors for the details. @@ -185,6 +186,11 @@ struct bio { u64 bi_iocost_cost; #endif #endif + +#ifdef CONFIG_BLK_INLINE_ENCRYPTION + struct bio_crypt_ctx *bi_crypt_context; +#endif + union { #if defined(CONFIG_BLK_DEV_INTEGRITY) struct bio_integrity_payload *bi_integrity; /* data integrity */ diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 354e44eebef9..52a9f456cadf 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -222,6 +222,11 @@ struct request { unsigned short nr_integrity_segments; #endif +#ifdef CONFIG_BLK_INLINE_ENCRYPTION + struct bio_crypt_ctx *crypt_ctx; + struct blk_ksm_keyslot *crypt_keyslot; +#endif + unsigned short write_hint; unsigned short ioprio; -- cgit v1.2.3 From d145dc23030bbf2de3a8ca5e0c29c2e568f69737 Mon Sep 17 00:00:00 2001 From: Satya Tangirala Date: Thu, 14 May 2020 00:37:19 +0000 Subject: block: Make blk-integrity preclude hardware inline encryption Whenever a device supports blk-integrity, make the kernel pretend that the device doesn't support inline encryption (essentially by setting the keyslot manager in the request queue to NULL). There's no hardware currently that supports both integrity and inline encryption. However, it seems possible that there will be such hardware in the near future (like the NVMe key per I/O support that might support both inline encryption and PI). But properly integrating both features is not trivial, and without real hardware that implements both, it is difficult to tell if it will be done correctly by the majority of hardware that support both. So it seems best not to support both features together right now, and to decide what to do at probe time. 
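Concretely, the probe-time policy this implies for a driver might look like
the short sketch below (illustrative only, with hypothetical driver fields;
not part of the patch): a driver that set up a keyslot manager simply keeps
going without inline encryption if registration is refused.

	/* Hypothetical driver probe snippet, not from this patch. */
	if (!blk_ksm_register(&dev->ksm, q)) {
		/* The queue advertises blk-integrity; run without inline crypto. */
		dev_warn(dev->dev,
			 "inline encryption disabled: conflicts with blk-integrity\n");
	}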
Signed-off-by: Satya Tangirala Reviewed-by: Eric Biggers Reviewed-by: Christoph Hellwig Signed-off-by: Jens Axboe --- block/bio-integrity.c | 3 +++ block/blk-integrity.c | 7 +++++++ block/keyslot-manager.c | 19 +++++++++++++++++++ include/linux/blkdev.h | 30 ++++++++++++++++++++++++++++++ 4 files changed, 59 insertions(+) (limited to 'include/linux') diff --git a/block/bio-integrity.c b/block/bio-integrity.c index bf62c25cde8f..3579ac0f6ec1 100644 --- a/block/bio-integrity.c +++ b/block/bio-integrity.c @@ -42,6 +42,9 @@ struct bio_integrity_payload *bio_integrity_alloc(struct bio *bio, struct bio_set *bs = bio->bi_pool; unsigned inline_vecs; + if (WARN_ON_ONCE(bio_has_crypt_ctx(bio))) + return ERR_PTR(-EOPNOTSUPP); + if (!bs || !mempool_initialized(&bs->bio_integrity_pool)) { bip = kmalloc(struct_size(bip, bip_inline_vecs, nr_vecs), gfp_mask); inline_vecs = nr_vecs; diff --git a/block/blk-integrity.c b/block/blk-integrity.c index ff1070edbb40..c03705cbb9c9 100644 --- a/block/blk-integrity.c +++ b/block/blk-integrity.c @@ -409,6 +409,13 @@ void blk_integrity_register(struct gendisk *disk, struct blk_integrity *template bi->tag_size = template->tag_size; disk->queue->backing_dev_info->capabilities |= BDI_CAP_STABLE_WRITES; + +#ifdef CONFIG_BLK_INLINE_ENCRYPTION + if (disk->queue->ksm) { + pr_warn("blk-integrity: Integrity and hardware inline encryption are not supported together. Disabling hardware inline encryption.\n"); + blk_ksm_unregister(disk->queue); + } +#endif } EXPORT_SYMBOL(blk_integrity_register); diff --git a/block/keyslot-manager.c b/block/keyslot-manager.c index fcd3fd469d7c..c2ef41b3147b 100644 --- a/block/keyslot-manager.c +++ b/block/keyslot-manager.c @@ -25,6 +25,9 @@ * Upper layers will call blk_ksm_get_slot_for_key() to program a * key into some slot in the inline encryption hardware. */ + +#define pr_fmt(fmt) "blk-crypto: " fmt + #include #include #include @@ -376,3 +379,19 @@ void blk_ksm_destroy(struct blk_keyslot_manager *ksm) memzero_explicit(ksm, sizeof(*ksm)); } EXPORT_SYMBOL_GPL(blk_ksm_destroy); + +bool blk_ksm_register(struct blk_keyslot_manager *ksm, struct request_queue *q) +{ + if (blk_integrity_queue_supports_integrity(q)) { + pr_warn("Integrity and hardware inline encryption are not supported together. 
Disabling hardware inline encryption.\n"); + return false; + } + q->ksm = ksm; + return true; +} +EXPORT_SYMBOL_GPL(blk_ksm_register); + +void blk_ksm_unregister(struct request_queue *q) +{ + q->ksm = NULL; +} diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 52a9f456cadf..2b33166b9daf 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -1583,6 +1583,12 @@ struct blk_integrity *bdev_get_integrity(struct block_device *bdev) return blk_get_integrity(bdev->bd_disk); } +static inline bool +blk_integrity_queue_supports_integrity(struct request_queue *q) +{ + return q->integrity.profile; +} + static inline bool blk_integrity_rq(struct request *rq) { return rq->cmd_flags & REQ_INTEGRITY; @@ -1663,6 +1669,11 @@ static inline struct blk_integrity *blk_get_integrity(struct gendisk *disk) { return NULL; } +static inline bool +blk_integrity_queue_supports_integrity(struct request_queue *q) +{ + return false; +} static inline int blk_integrity_compare(struct gendisk *a, struct gendisk *b) { return 0; @@ -1714,6 +1725,25 @@ static inline struct bio_vec *rq_integrity_vec(struct request *rq) #endif /* CONFIG_BLK_DEV_INTEGRITY */ +#ifdef CONFIG_BLK_INLINE_ENCRYPTION + +bool blk_ksm_register(struct blk_keyslot_manager *ksm, struct request_queue *q); + +void blk_ksm_unregister(struct request_queue *q); + +#else /* CONFIG_BLK_INLINE_ENCRYPTION */ + +static inline bool blk_ksm_register(struct blk_keyslot_manager *ksm, + struct request_queue *q) +{ + return true; +} + +static inline void blk_ksm_unregister(struct request_queue *q) { } + +#endif /* CONFIG_BLK_INLINE_ENCRYPTION */ + + struct block_device_operations { int (*open) (struct block_device *, fmode_t); void (*release) (struct gendisk *, fmode_t); -- cgit v1.2.3 From 488f6682c832e9549d28b30075f00c76328eb1be Mon Sep 17 00:00:00 2001 From: Satya Tangirala Date: Thu, 14 May 2020 00:37:20 +0000 Subject: block: blk-crypto-fallback for Inline Encryption Blk-crypto delegates crypto operations to inline encryption hardware when available. The separately configurable blk-crypto-fallback contains a software fallback to the kernel crypto API - when enabled, blk-crypto will use this fallback for en/decryption when inline encryption hardware is not available. This lets upper layers not have to worry about whether or not the underlying device has support for inline encryption before deciding to specify an encryption context for a bio. It also allows for testing without actual inline encryption hardware - in particular, it makes it possible to test the inline encryption code in ext4 and f2fs simply by running xfstests with the inlinecrypt mount option, which in turn allows for things like the regular upstream regression testing of ext4 to cover the inline encryption code paths. For more details, refer to Documentation/block/inline-encryption.rst. 
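In practice this means a caller-side support check like the hedged sketch
below (illustrative only; not part of the patch) succeeds even on devices
without inline encryption hardware, as long as
CONFIG_BLK_INLINE_ENCRYPTION_FALLBACK is enabled, so one submission path can
serve both cases.

	/* Hypothetical filesystem-side check, not from this patch. */
	struct blk_crypto_config cfg = {
		.crypto_mode	= BLK_ENCRYPTION_MODE_AES_256_XTS,
		.data_unit_size	= 4096,
		.dun_bytes	= 8,
	};

	if (!blk_crypto_config_supported(q, &cfg))
		return -EOPNOTSUPP; /* neither hardware nor fallback can handle it */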
Signed-off-by: Satya Tangirala
Reviewed-by: Eric Biggers
Signed-off-by: Jens Axboe
---
 block/Kconfig               |  10 +
 block/Makefile              |   1 +
 block/blk-crypto-fallback.c | 657 ++++++++++++++++++++++++++++++++++++++++++++
 block/blk-crypto-internal.h |  35 +++
 block/blk-crypto.c          |  68 +++--
 include/linux/blk-crypto.h  |   2 +-
 6 files changed, 752 insertions(+), 21 deletions(-)
 create mode 100644 block/blk-crypto-fallback.c
(limited to 'include/linux')

diff --git a/block/Kconfig b/block/Kconfig
index f8870c316a03..9382a4acefc3 100644
--- a/block/Kconfig
+++ b/block/Kconfig
@@ -193,6 +193,16 @@ config BLK_INLINE_ENCRYPTION
 	  block layer handle encryption, so users can take advantage of
 	  inline encryption hardware if present.

+config BLK_INLINE_ENCRYPTION_FALLBACK
+	bool "Enable crypto API fallback for blk-crypto"
+	depends on BLK_INLINE_ENCRYPTION
+	select CRYPTO
+	select CRYPTO_SKCIPHER
+	help
+	  Enabling this lets the block layer handle inline encryption
+	  by falling back to the kernel crypto API when inline
+	  encryption hardware is not present.
+
 menu "Partition Types"

 source "block/partitions/Kconfig"
diff --git a/block/Makefile b/block/Makefile
index 3ee88d3e807d..78719169fb2a 100644
--- a/block/Makefile
+++ b/block/Makefile
@@ -37,3 +37,4 @@ obj-$(CONFIG_BLK_DEBUG_FS_ZONED)+= blk-mq-debugfs-zoned.o
 obj-$(CONFIG_BLK_SED_OPAL)	+= sed-opal.o
 obj-$(CONFIG_BLK_PM)	+= blk-pm.o
 obj-$(CONFIG_BLK_INLINE_ENCRYPTION)	+= keyslot-manager.o blk-crypto.o
+obj-$(CONFIG_BLK_INLINE_ENCRYPTION_FALLBACK)	+= blk-crypto-fallback.o
diff --git a/block/blk-crypto-fallback.c b/block/blk-crypto-fallback.c
new file mode 100644
index 000000000000..74ab137ae3ba
--- /dev/null
+++ b/block/blk-crypto-fallback.c
@@ -0,0 +1,657 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright 2019 Google LLC
+ */
+
+/*
+ * Refer to Documentation/block/inline-encryption.rst for detailed explanation.
+ */
+
+#define pr_fmt(fmt) "blk-crypto-fallback: " fmt
+
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+
+#include "blk-crypto-internal.h"
+
+static unsigned int num_prealloc_bounce_pg = 32;
+module_param(num_prealloc_bounce_pg, uint, 0);
+MODULE_PARM_DESC(num_prealloc_bounce_pg,
+		 "Number of preallocated bounce pages for the blk-crypto crypto API fallback");
+
+static unsigned int blk_crypto_num_keyslots = 100;
+module_param_named(num_keyslots, blk_crypto_num_keyslots, uint, 0);
+MODULE_PARM_DESC(num_keyslots,
+		 "Number of keyslots for the blk-crypto crypto API fallback");
+
+static unsigned int num_prealloc_fallback_crypt_ctxs = 128;
+module_param(num_prealloc_fallback_crypt_ctxs, uint, 0);
+MODULE_PARM_DESC(num_prealloc_fallback_crypt_ctxs,
+		 "Number of preallocated bio fallback crypto contexts for blk-crypto to use during crypto API fallback");
+
+struct bio_fallback_crypt_ctx {
+	struct bio_crypt_ctx crypt_ctx;
+	/*
+	 * Copy of the bvec_iter when this bio was submitted.
+	 * We only want to en/decrypt the part of the bio as described by the
+	 * bvec_iter upon submission because the bio might be split before
+	 * being resubmitted.
+	 */
+	struct bvec_iter crypt_iter;
+	union {
+		struct {
+			struct work_struct work;
+			struct bio *bio;
+		};
+		struct {
+			void *bi_private_orig;
+			bio_end_io_t *bi_end_io_orig;
+		};
+	};
+};
+
+static struct kmem_cache *bio_fallback_crypt_ctx_cache;
+static mempool_t *bio_fallback_crypt_ctx_pool;
+
+/*
+ * Allocating a crypto tfm during I/O can deadlock, so we have to preallocate
+ * all of a mode's tfms when that mode starts being used.
Since each mode may + * need all the keyslots at some point, each mode needs its own tfm for each + * keyslot; thus, a keyslot may contain tfms for multiple modes. However, to + * match the behavior of real inline encryption hardware (which only supports a + * single encryption context per keyslot), we only allow one tfm per keyslot to + * be used at a time - the rest of the unused tfms have their keys cleared. + */ +static DEFINE_MUTEX(tfms_init_lock); +static bool tfms_inited[BLK_ENCRYPTION_MODE_MAX]; + +static struct blk_crypto_keyslot { + enum blk_crypto_mode_num crypto_mode; + struct crypto_skcipher *tfms[BLK_ENCRYPTION_MODE_MAX]; +} *blk_crypto_keyslots; + +static struct blk_keyslot_manager blk_crypto_ksm; +static struct workqueue_struct *blk_crypto_wq; +static mempool_t *blk_crypto_bounce_page_pool; + +/* + * This is the key we set when evicting a keyslot. This *should* be the all 0's + * key, but AES-XTS rejects that key, so we use some random bytes instead. + */ +static u8 blank_key[BLK_CRYPTO_MAX_KEY_SIZE]; + +static void blk_crypto_evict_keyslot(unsigned int slot) +{ + struct blk_crypto_keyslot *slotp = &blk_crypto_keyslots[slot]; + enum blk_crypto_mode_num crypto_mode = slotp->crypto_mode; + int err; + + WARN_ON(slotp->crypto_mode == BLK_ENCRYPTION_MODE_INVALID); + + /* Clear the key in the skcipher */ + err = crypto_skcipher_setkey(slotp->tfms[crypto_mode], blank_key, + blk_crypto_modes[crypto_mode].keysize); + WARN_ON(err); + slotp->crypto_mode = BLK_ENCRYPTION_MODE_INVALID; +} + +static int blk_crypto_keyslot_program(struct blk_keyslot_manager *ksm, + const struct blk_crypto_key *key, + unsigned int slot) +{ + struct blk_crypto_keyslot *slotp = &blk_crypto_keyslots[slot]; + const enum blk_crypto_mode_num crypto_mode = + key->crypto_cfg.crypto_mode; + int err; + + if (crypto_mode != slotp->crypto_mode && + slotp->crypto_mode != BLK_ENCRYPTION_MODE_INVALID) + blk_crypto_evict_keyslot(slot); + + slotp->crypto_mode = crypto_mode; + err = crypto_skcipher_setkey(slotp->tfms[crypto_mode], key->raw, + key->size); + if (err) { + blk_crypto_evict_keyslot(slot); + return err; + } + return 0; +} + +static int blk_crypto_keyslot_evict(struct blk_keyslot_manager *ksm, + const struct blk_crypto_key *key, + unsigned int slot) +{ + blk_crypto_evict_keyslot(slot); + return 0; +} + +/* + * The crypto API fallback KSM ops - only used for a bio when it specifies a + * blk_crypto_key that was not supported by the device's inline encryption + * hardware. 
+ */ +static const struct blk_ksm_ll_ops blk_crypto_ksm_ll_ops = { + .keyslot_program = blk_crypto_keyslot_program, + .keyslot_evict = blk_crypto_keyslot_evict, +}; + +static void blk_crypto_fallback_encrypt_endio(struct bio *enc_bio) +{ + struct bio *src_bio = enc_bio->bi_private; + int i; + + for (i = 0; i < enc_bio->bi_vcnt; i++) + mempool_free(enc_bio->bi_io_vec[i].bv_page, + blk_crypto_bounce_page_pool); + + src_bio->bi_status = enc_bio->bi_status; + + bio_put(enc_bio); + bio_endio(src_bio); +} + +static struct bio *blk_crypto_clone_bio(struct bio *bio_src) +{ + struct bvec_iter iter; + struct bio_vec bv; + struct bio *bio; + + bio = bio_alloc_bioset(GFP_NOIO, bio_segments(bio_src), NULL); + if (!bio) + return NULL; + bio->bi_disk = bio_src->bi_disk; + bio->bi_opf = bio_src->bi_opf; + bio->bi_ioprio = bio_src->bi_ioprio; + bio->bi_write_hint = bio_src->bi_write_hint; + bio->bi_iter.bi_sector = bio_src->bi_iter.bi_sector; + bio->bi_iter.bi_size = bio_src->bi_iter.bi_size; + + bio_for_each_segment(bv, bio_src, iter) + bio->bi_io_vec[bio->bi_vcnt++] = bv; + + bio_clone_blkg_association(bio, bio_src); + blkcg_bio_issue_init(bio); + + return bio; +} + +static bool blk_crypto_alloc_cipher_req(struct blk_ksm_keyslot *slot, + struct skcipher_request **ciph_req_ret, + struct crypto_wait *wait) +{ + struct skcipher_request *ciph_req; + const struct blk_crypto_keyslot *slotp; + int keyslot_idx = blk_ksm_get_slot_idx(slot); + + slotp = &blk_crypto_keyslots[keyslot_idx]; + ciph_req = skcipher_request_alloc(slotp->tfms[slotp->crypto_mode], + GFP_NOIO); + if (!ciph_req) + return false; + + skcipher_request_set_callback(ciph_req, + CRYPTO_TFM_REQ_MAY_BACKLOG | + CRYPTO_TFM_REQ_MAY_SLEEP, + crypto_req_done, wait); + *ciph_req_ret = ciph_req; + + return true; +} + +static bool blk_crypto_split_bio_if_needed(struct bio **bio_ptr) +{ + struct bio *bio = *bio_ptr; + unsigned int i = 0; + unsigned int num_sectors = 0; + struct bio_vec bv; + struct bvec_iter iter; + + bio_for_each_segment(bv, bio, iter) { + num_sectors += bv.bv_len >> SECTOR_SHIFT; + if (++i == BIO_MAX_PAGES) + break; + } + if (num_sectors < bio_sectors(bio)) { + struct bio *split_bio; + + split_bio = bio_split(bio, num_sectors, GFP_NOIO, NULL); + if (!split_bio) { + bio->bi_status = BLK_STS_RESOURCE; + return false; + } + bio_chain(split_bio, bio); + generic_make_request(bio); + *bio_ptr = split_bio; + } + + return true; +} + +union blk_crypto_iv { + __le64 dun[BLK_CRYPTO_DUN_ARRAY_SIZE]; + u8 bytes[BLK_CRYPTO_MAX_IV_SIZE]; +}; + +static void blk_crypto_dun_to_iv(const u64 dun[BLK_CRYPTO_DUN_ARRAY_SIZE], + union blk_crypto_iv *iv) +{ + int i; + + for (i = 0; i < BLK_CRYPTO_DUN_ARRAY_SIZE; i++) + iv->dun[i] = cpu_to_le64(dun[i]); +} + +/* + * The crypto API fallback's encryption routine. + * Allocate a bounce bio for encryption, encrypt the input bio using crypto API, + * and replace *bio_ptr with the bounce bio. May split input bio if it's too + * large. Returns true on success. Returns false and sets bio->bi_status on + * error. 
+ */ +static bool blk_crypto_fallback_encrypt_bio(struct bio **bio_ptr) +{ + struct bio *src_bio, *enc_bio; + struct bio_crypt_ctx *bc; + struct blk_ksm_keyslot *slot; + int data_unit_size; + struct skcipher_request *ciph_req = NULL; + DECLARE_CRYPTO_WAIT(wait); + u64 curr_dun[BLK_CRYPTO_DUN_ARRAY_SIZE]; + struct scatterlist src, dst; + union blk_crypto_iv iv; + unsigned int i, j; + bool ret = false; + blk_status_t blk_st; + + /* Split the bio if it's too big for single page bvec */ + if (!blk_crypto_split_bio_if_needed(bio_ptr)) + return false; + + src_bio = *bio_ptr; + bc = src_bio->bi_crypt_context; + data_unit_size = bc->bc_key->crypto_cfg.data_unit_size; + + /* Allocate bounce bio for encryption */ + enc_bio = blk_crypto_clone_bio(src_bio); + if (!enc_bio) { + src_bio->bi_status = BLK_STS_RESOURCE; + return false; + } + + /* + * Use the crypto API fallback keyslot manager to get a crypto_skcipher + * for the algorithm and key specified for this bio. + */ + blk_st = blk_ksm_get_slot_for_key(&blk_crypto_ksm, bc->bc_key, &slot); + if (blk_st != BLK_STS_OK) { + src_bio->bi_status = blk_st; + goto out_put_enc_bio; + } + + /* and then allocate an skcipher_request for it */ + if (!blk_crypto_alloc_cipher_req(slot, &ciph_req, &wait)) { + src_bio->bi_status = BLK_STS_RESOURCE; + goto out_release_keyslot; + } + + memcpy(curr_dun, bc->bc_dun, sizeof(curr_dun)); + sg_init_table(&src, 1); + sg_init_table(&dst, 1); + + skcipher_request_set_crypt(ciph_req, &src, &dst, data_unit_size, + iv.bytes); + + /* Encrypt each page in the bounce bio */ + for (i = 0; i < enc_bio->bi_vcnt; i++) { + struct bio_vec *enc_bvec = &enc_bio->bi_io_vec[i]; + struct page *plaintext_page = enc_bvec->bv_page; + struct page *ciphertext_page = + mempool_alloc(blk_crypto_bounce_page_pool, GFP_NOIO); + + enc_bvec->bv_page = ciphertext_page; + + if (!ciphertext_page) { + src_bio->bi_status = BLK_STS_RESOURCE; + goto out_free_bounce_pages; + } + + sg_set_page(&src, plaintext_page, data_unit_size, + enc_bvec->bv_offset); + sg_set_page(&dst, ciphertext_page, data_unit_size, + enc_bvec->bv_offset); + + /* Encrypt each data unit in this page */ + for (j = 0; j < enc_bvec->bv_len; j += data_unit_size) { + blk_crypto_dun_to_iv(curr_dun, &iv); + if (crypto_wait_req(crypto_skcipher_encrypt(ciph_req), + &wait)) { + i++; + src_bio->bi_status = BLK_STS_IOERR; + goto out_free_bounce_pages; + } + bio_crypt_dun_increment(curr_dun, 1); + src.offset += data_unit_size; + dst.offset += data_unit_size; + } + } + + enc_bio->bi_private = src_bio; + enc_bio->bi_end_io = blk_crypto_fallback_encrypt_endio; + *bio_ptr = enc_bio; + ret = true; + + enc_bio = NULL; + goto out_free_ciph_req; + +out_free_bounce_pages: + while (i > 0) + mempool_free(enc_bio->bi_io_vec[--i].bv_page, + blk_crypto_bounce_page_pool); +out_free_ciph_req: + skcipher_request_free(ciph_req); +out_release_keyslot: + blk_ksm_put_slot(slot); +out_put_enc_bio: + if (enc_bio) + bio_put(enc_bio); + + return ret; +} + +/* + * The crypto API fallback's main decryption routine. + * Decrypts input bio in place, and calls bio_endio on the bio. 
+ */
+static void blk_crypto_fallback_decrypt_bio(struct work_struct *work)
+{
+	struct bio_fallback_crypt_ctx *f_ctx =
+		container_of(work, struct bio_fallback_crypt_ctx, work);
+	struct bio *bio = f_ctx->bio;
+	struct bio_crypt_ctx *bc = &f_ctx->crypt_ctx;
+	struct blk_ksm_keyslot *slot;
+	struct skcipher_request *ciph_req = NULL;
+	DECLARE_CRYPTO_WAIT(wait);
+	u64 curr_dun[BLK_CRYPTO_DUN_ARRAY_SIZE];
+	union blk_crypto_iv iv;
+	struct scatterlist sg;
+	struct bio_vec bv;
+	struct bvec_iter iter;
+	const int data_unit_size = bc->bc_key->crypto_cfg.data_unit_size;
+	unsigned int i;
+	blk_status_t blk_st;
+
+	/*
+	 * Use the crypto API fallback keyslot manager to get a crypto_skcipher
+	 * for the algorithm and key specified for this bio.
+	 */
+	blk_st = blk_ksm_get_slot_for_key(&blk_crypto_ksm, bc->bc_key, &slot);
+	if (blk_st != BLK_STS_OK) {
+		bio->bi_status = blk_st;
+		goto out_no_keyslot;
+	}
+
+	/* and then allocate an skcipher_request for it */
+	if (!blk_crypto_alloc_cipher_req(slot, &ciph_req, &wait)) {
+		bio->bi_status = BLK_STS_RESOURCE;
+		goto out;
+	}
+
+	memcpy(curr_dun, bc->bc_dun, sizeof(curr_dun));
+	sg_init_table(&sg, 1);
+	skcipher_request_set_crypt(ciph_req, &sg, &sg, data_unit_size,
+				   iv.bytes);
+
+	/* Decrypt each segment in the bio */
+	__bio_for_each_segment(bv, bio, iter, f_ctx->crypt_iter) {
+		struct page *page = bv.bv_page;
+
+		sg_set_page(&sg, page, data_unit_size, bv.bv_offset);
+
+		/* Decrypt each data unit in the segment */
+		for (i = 0; i < bv.bv_len; i += data_unit_size) {
+			blk_crypto_dun_to_iv(curr_dun, &iv);
+			if (crypto_wait_req(crypto_skcipher_decrypt(ciph_req),
+					    &wait)) {
+				bio->bi_status = BLK_STS_IOERR;
+				goto out;
+			}
+			bio_crypt_dun_increment(curr_dun, 1);
+			sg.offset += data_unit_size;
+		}
+	}
+
+out:
+	skcipher_request_free(ciph_req);
+	blk_ksm_put_slot(slot);
+out_no_keyslot:
+	mempool_free(f_ctx, bio_fallback_crypt_ctx_pool);
+	bio_endio(bio);
+}
+
+/**
+ * blk_crypto_fallback_decrypt_endio - queue bio for fallback decryption
+ *
+ * @bio: the bio to queue
+ *
+ * Restore bi_private and bi_end_io, and queue the bio for decryption into a
+ * workqueue, since this function will be called from an atomic context.
+ */
+static void blk_crypto_fallback_decrypt_endio(struct bio *bio)
+{
+	struct bio_fallback_crypt_ctx *f_ctx = bio->bi_private;
+
+	bio->bi_private = f_ctx->bi_private_orig;
+	bio->bi_end_io = f_ctx->bi_end_io_orig;
+
+	/* If there was an IO error, don't queue for decrypt. */
+	if (bio->bi_status) {
+		mempool_free(f_ctx, bio_fallback_crypt_ctx_pool);
+		bio_endio(bio);
+		return;
+	}
+
+	INIT_WORK(&f_ctx->work, blk_crypto_fallback_decrypt_bio);
+	f_ctx->bio = bio;
+	queue_work(blk_crypto_wq, &f_ctx->work);
+}
+
+/**
+ * blk_crypto_fallback_bio_prep - Prepare a bio to use fallback en/decryption
+ *
+ * @bio_ptr: pointer to the bio to prepare
+ *
+ * If the bio is doing a WRITE operation, this splits the bio into two parts if
+ * it's too big (see blk_crypto_split_bio_if_needed). It then allocates a
+ * bounce bio for the first part, encrypts it, and updates *bio_ptr to point to
+ * the bounce bio.
+ *
+ * For a READ operation, we mark the bio for decryption by using bi_private and
+ * bi_end_io.
+ *
+ * In either case, this function will make the bio look like a regular bio (i.e.
+ * as if no encryption context was ever specified) for the purposes of the rest
+ * of the stack except for blk-integrity (blk-integrity and blk-crypto are not
+ * currently supported together).
+ *
+ * Return: true on success.
Sets bio->bi_status and returns false on error. + */ +bool blk_crypto_fallback_bio_prep(struct bio **bio_ptr) +{ + struct bio *bio = *bio_ptr; + struct bio_crypt_ctx *bc = bio->bi_crypt_context; + struct bio_fallback_crypt_ctx *f_ctx; + + if (WARN_ON_ONCE(!tfms_inited[bc->bc_key->crypto_cfg.crypto_mode])) { + /* User didn't call blk_crypto_start_using_key() first */ + bio->bi_status = BLK_STS_IOERR; + return false; + } + + if (!blk_ksm_crypto_cfg_supported(&blk_crypto_ksm, + &bc->bc_key->crypto_cfg)) { + bio->bi_status = BLK_STS_NOTSUPP; + return false; + } + + if (bio_data_dir(bio) == WRITE) + return blk_crypto_fallback_encrypt_bio(bio_ptr); + + /* + * bio READ case: Set up a f_ctx in the bio's bi_private and set the + * bi_end_io appropriately to trigger decryption when the bio is ended. + */ + f_ctx = mempool_alloc(bio_fallback_crypt_ctx_pool, GFP_NOIO); + f_ctx->crypt_ctx = *bc; + f_ctx->crypt_iter = bio->bi_iter; + f_ctx->bi_private_orig = bio->bi_private; + f_ctx->bi_end_io_orig = bio->bi_end_io; + bio->bi_private = (void *)f_ctx; + bio->bi_end_io = blk_crypto_fallback_decrypt_endio; + bio_crypt_free_ctx(bio); + + return true; +} + +int blk_crypto_fallback_evict_key(const struct blk_crypto_key *key) +{ + return blk_ksm_evict_key(&blk_crypto_ksm, key); +} + +static bool blk_crypto_fallback_inited; +static int blk_crypto_fallback_init(void) +{ + int i; + int err = -ENOMEM; + + if (blk_crypto_fallback_inited) + return 0; + + prandom_bytes(blank_key, BLK_CRYPTO_MAX_KEY_SIZE); + + err = blk_ksm_init(&blk_crypto_ksm, blk_crypto_num_keyslots); + if (err) + goto out; + err = -ENOMEM; + + blk_crypto_ksm.ksm_ll_ops = blk_crypto_ksm_ll_ops; + blk_crypto_ksm.max_dun_bytes_supported = BLK_CRYPTO_MAX_IV_SIZE; + + /* All blk-crypto modes have a crypto API fallback. */ + for (i = 0; i < BLK_ENCRYPTION_MODE_MAX; i++) + blk_crypto_ksm.crypto_modes_supported[i] = 0xFFFFFFFF; + blk_crypto_ksm.crypto_modes_supported[BLK_ENCRYPTION_MODE_INVALID] = 0; + + blk_crypto_wq = alloc_workqueue("blk_crypto_wq", + WQ_UNBOUND | WQ_HIGHPRI | + WQ_MEM_RECLAIM, num_online_cpus()); + if (!blk_crypto_wq) + goto fail_free_ksm; + + blk_crypto_keyslots = kcalloc(blk_crypto_num_keyslots, + sizeof(blk_crypto_keyslots[0]), + GFP_KERNEL); + if (!blk_crypto_keyslots) + goto fail_free_wq; + + blk_crypto_bounce_page_pool = + mempool_create_page_pool(num_prealloc_bounce_pg, 0); + if (!blk_crypto_bounce_page_pool) + goto fail_free_keyslots; + + bio_fallback_crypt_ctx_cache = KMEM_CACHE(bio_fallback_crypt_ctx, 0); + if (!bio_fallback_crypt_ctx_cache) + goto fail_free_bounce_page_pool; + + bio_fallback_crypt_ctx_pool = + mempool_create_slab_pool(num_prealloc_fallback_crypt_ctxs, + bio_fallback_crypt_ctx_cache); + if (!bio_fallback_crypt_ctx_pool) + goto fail_free_crypt_ctx_cache; + + blk_crypto_fallback_inited = true; + + return 0; +fail_free_crypt_ctx_cache: + kmem_cache_destroy(bio_fallback_crypt_ctx_cache); +fail_free_bounce_page_pool: + mempool_destroy(blk_crypto_bounce_page_pool); +fail_free_keyslots: + kfree(blk_crypto_keyslots); +fail_free_wq: + destroy_workqueue(blk_crypto_wq); +fail_free_ksm: + blk_ksm_destroy(&blk_crypto_ksm); +out: + return err; +} + +/* + * Prepare blk-crypto-fallback for the specified crypto mode. + * Returns -ENOPKG if the needed crypto API support is missing. 
+ */ +int blk_crypto_fallback_start_using_mode(enum blk_crypto_mode_num mode_num) +{ + const char *cipher_str = blk_crypto_modes[mode_num].cipher_str; + struct blk_crypto_keyslot *slotp; + unsigned int i; + int err = 0; + + /* + * Fast path + * Ensure that updates to blk_crypto_keyslots[i].tfms[mode_num] + * for each i are visible before we try to access them. + */ + if (likely(smp_load_acquire(&tfms_inited[mode_num]))) + return 0; + + mutex_lock(&tfms_init_lock); + if (tfms_inited[mode_num]) + goto out; + + err = blk_crypto_fallback_init(); + if (err) + goto out; + + for (i = 0; i < blk_crypto_num_keyslots; i++) { + slotp = &blk_crypto_keyslots[i]; + slotp->tfms[mode_num] = crypto_alloc_skcipher(cipher_str, 0, 0); + if (IS_ERR(slotp->tfms[mode_num])) { + err = PTR_ERR(slotp->tfms[mode_num]); + if (err == -ENOENT) { + pr_warn_once("Missing crypto API support for \"%s\"\n", + cipher_str); + err = -ENOPKG; + } + slotp->tfms[mode_num] = NULL; + goto out_free_tfms; + } + + crypto_skcipher_set_flags(slotp->tfms[mode_num], + CRYPTO_TFM_REQ_FORBID_WEAK_KEYS); + } + + /* + * Ensure that updates to blk_crypto_keyslots[i].tfms[mode_num] + * for each i are visible before we set tfms_inited[mode_num]. + */ + smp_store_release(&tfms_inited[mode_num], true); + goto out; + +out_free_tfms: + for (i = 0; i < blk_crypto_num_keyslots; i++) { + slotp = &blk_crypto_keyslots[i]; + crypto_free_skcipher(slotp->tfms[mode_num]); + slotp->tfms[mode_num] = NULL; + } +out: + mutex_unlock(&tfms_init_lock); + return err; +} diff --git a/block/blk-crypto-internal.h b/block/blk-crypto-internal.h index 796f757fe8e9..d2b0f565d83c 100644 --- a/block/blk-crypto-internal.h +++ b/block/blk-crypto-internal.h @@ -11,10 +11,13 @@ /* Represents a crypto mode supported by blk-crypto */ struct blk_crypto_mode { + const char *cipher_str; /* crypto API name (for fallback case) */ unsigned int keysize; /* key size in bytes */ unsigned int ivsize; /* iv size in bytes */ }; +extern const struct blk_crypto_mode blk_crypto_modes[]; + #ifdef CONFIG_BLK_INLINE_ENCRYPTION void bio_crypt_dun_increment(u64 dun[BLK_CRYPTO_DUN_ARRAY_SIZE], @@ -163,4 +166,36 @@ static inline blk_status_t blk_crypto_insert_cloned_request(struct request *rq) return BLK_STS_OK; } +#ifdef CONFIG_BLK_INLINE_ENCRYPTION_FALLBACK + +int blk_crypto_fallback_start_using_mode(enum blk_crypto_mode_num mode_num); + +bool blk_crypto_fallback_bio_prep(struct bio **bio_ptr); + +int blk_crypto_fallback_evict_key(const struct blk_crypto_key *key); + +#else /* CONFIG_BLK_INLINE_ENCRYPTION_FALLBACK */ + +static inline int +blk_crypto_fallback_start_using_mode(enum blk_crypto_mode_num mode_num) +{ + pr_warn_once("crypto API fallback is disabled\n"); + return -ENOPKG; +} + +static inline bool blk_crypto_fallback_bio_prep(struct bio **bio_ptr) +{ + pr_warn_once("crypto API fallback disabled; failing request.\n"); + (*bio_ptr)->bi_status = BLK_STS_NOTSUPP; + return false; +} + +static inline int +blk_crypto_fallback_evict_key(const struct blk_crypto_key *key) +{ + return 0; +} + +#endif /* CONFIG_BLK_INLINE_ENCRYPTION_FALLBACK */ + #endif /* __LINUX_BLK_CRYPTO_INTERNAL_H */ diff --git a/block/blk-crypto.c b/block/blk-crypto.c index 25b981257f5f..6533c9b36ab8 100644 --- a/block/blk-crypto.c +++ b/block/blk-crypto.c @@ -19,14 +19,17 @@ const struct blk_crypto_mode blk_crypto_modes[] = { [BLK_ENCRYPTION_MODE_AES_256_XTS] = { + .cipher_str = "xts(aes)", .keysize = 64, .ivsize = 16, }, [BLK_ENCRYPTION_MODE_AES_128_CBC_ESSIV] = { + .cipher_str = "essiv(cbc(aes),sha256)", .keysize = 16, .ivsize 
= 16, }, [BLK_ENCRYPTION_MODE_ADIANTUM] = { + .cipher_str = "adiantum(xchacha12,aes)", .keysize = 32, .ivsize = 32, }, @@ -229,9 +232,16 @@ void __blk_crypto_free_request(struct request *rq) * * @bio_ptr: pointer to original bio pointer * - * Succeeds if the bio doesn't have inline encryption enabled or if the bio - * crypt context provided for the bio is supported by the underlying device's - * inline encryption hardware. Ends the bio with error otherwise. + * If the bio crypt context provided for the bio is supported by the underlying + * device's inline encryption hardware, do nothing. + * + * Otherwise, try to perform en/decryption for this bio by falling back to the + * kernel crypto API. When the crypto API fallback is used for encryption, + * blk-crypto may choose to split the bio into 2 - the first one that will + * continue to be processed and the second one that will be resubmitted via + * generic_make_request. A bounce bio will be allocated to encrypt the contents + * of the aforementioned "first one", and *bio_ptr will be updated to this + * bounce bio. * * Caller must ensure bio has bio_crypt_ctx. * @@ -243,27 +253,29 @@ bool __blk_crypto_bio_prep(struct bio **bio_ptr) { struct bio *bio = *bio_ptr; const struct blk_crypto_key *bc_key = bio->bi_crypt_context->bc_key; - blk_status_t blk_st = BLK_STS_IOERR; /* Error if bio has no data. */ - if (WARN_ON_ONCE(!bio_has_data(bio))) + if (WARN_ON_ONCE(!bio_has_data(bio))) { + bio->bi_status = BLK_STS_IOERR; goto fail; + } - if (!bio_crypt_check_alignment(bio)) + if (!bio_crypt_check_alignment(bio)) { + bio->bi_status = BLK_STS_IOERR; goto fail; + } /* - * Success if device supports the encryption context. + * Success if device supports the encryption context, or if we succeeded + * in falling back to the crypto API. */ - if (!blk_ksm_crypto_cfg_supported(bio->bi_disk->queue->ksm, - &bc_key->crypto_cfg)) { - blk_st = BLK_STS_NOTSUPP; - goto fail; - } + if (blk_ksm_crypto_cfg_supported(bio->bi_disk->queue->ksm, + &bc_key->crypto_cfg)) + return true; - return true; + if (blk_crypto_fallback_bio_prep(bio_ptr)) + return true; fail: - (*bio_ptr)->bi_status = blk_st; bio_endio(*bio_ptr); return false; } @@ -329,10 +341,16 @@ int blk_crypto_init_key(struct blk_crypto_key *blk_key, const u8 *raw_key, return 0; } +/* + * Check if bios with @cfg can be en/decrypted by blk-crypto (i.e. either the + * request queue it's submitted to supports inline crypto, or the + * blk-crypto-fallback is enabled and supports the cfg). + */ bool blk_crypto_config_supported(struct request_queue *q, const struct blk_crypto_config *cfg) { - return blk_ksm_crypto_cfg_supported(q->ksm, cfg); + return IS_ENABLED(CONFIG_BLK_INLINE_ENCRYPTION_FALLBACK) || + blk_ksm_crypto_cfg_supported(q->ksm, cfg); } /** @@ -340,17 +358,22 @@ bool blk_crypto_config_supported(struct request_queue *q, * @key: A key to use on the device * @q: the request queue for the device * - * Upper layers must call this function to ensure that the hardware supports - * the key's crypto settings. + * Upper layers must call this function to ensure that either the hardware + * supports the key's crypto settings, or the crypto API fallback has transforms + * for the needed mode allocated and ready to go. 
This function may allocate
+ * an skcipher, and *should not* be called from the data path, since that might
+ * cause a deadlock.
  *
- * Return: 0 on success; -ENOPKG if the hardware doesn't support the key
+ * Return: 0 on success; -ENOPKG if the hardware doesn't support the key and
+ *	   blk-crypto-fallback is either disabled or the needed algorithm
+ *	   is disabled in the crypto API; or another -errno code.
  */
 int blk_crypto_start_using_key(const struct blk_crypto_key *key,
 			       struct request_queue *q)
 {
 	if (blk_ksm_crypto_cfg_supported(q->ksm, &key->crypto_cfg))
 		return 0;
-	return -ENOPKG;
+	return blk_crypto_fallback_start_using_mode(key->crypto_cfg.crypto_mode);
 }

 /**
@@ -372,5 +395,10 @@ int blk_crypto_evict_key(struct request_queue *q,
 	if (blk_ksm_crypto_cfg_supported(q->ksm, &key->crypto_cfg))
 		return blk_ksm_evict_key(q->ksm, key);

-	return 0;
+	/*
+	 * If the request queue's associated inline encryption hardware didn't
+	 * have support for the key, then the key might have been programmed
+	 * into the fallback keyslot manager, so try to evict from there.
+	 */
+	return blk_crypto_fallback_evict_key(key);
 }
diff --git a/include/linux/blk-crypto.h b/include/linux/blk-crypto.h
index 76095b07dd90..e82342907f2b 100644
--- a/include/linux/blk-crypto.h
+++ b/include/linux/blk-crypto.h
@@ -61,7 +61,7 @@ struct blk_crypto_key {
  *
  * A bio_crypt_ctx specifies that the contents of the bio will be encrypted (for
  * write requests) or decrypted (for read requests) inline by the storage device
- * or controller.
+ * or controller, or by the crypto API fallback.
  */
 struct bio_crypt_ctx {
 	const struct blk_crypto_key *bc_key;
--
cgit v1.2.3

From d639836ab3363f935a9a4336cb4ea3828d0437dd Mon Sep 17 00:00:00 2001
From: Igor Russkikh
Date: Thu, 14 May 2020 12:57:17 +0300
Subject: net: qed: adding hw_err states and handling

Here we introduce qed device error tracking flags and error types.

qed_hw_err_notify is an entrance point to report errors. It'll notify
higher level drivers (qede/qedr/etc) to handle and recover from the error.

List of possible errors comes from hardware interfaces, but could be
extended in the future.

Signed-off-by: Ariel Elior
Signed-off-by: Michal Kalderon
Signed-off-by: Igor Russkikh
Signed-off-by: David S.
Miller --- drivers/net/ethernet/qlogic/qed/qed.h | 2 ++ drivers/net/ethernet/qlogic/qed/qed_hw.c | 32 ++++++++++++++++++++++++++++++ drivers/net/ethernet/qlogic/qed/qed_hw.h | 15 ++++++++++++++ drivers/net/ethernet/qlogic/qed/qed_main.c | 29 +++++++++++++++++++++++++++ include/linux/qed/qed_if.h | 12 +++++++++++ 5 files changed, 90 insertions(+) (limited to 'include/linux') diff --git a/drivers/net/ethernet/qlogic/qed/qed.h b/drivers/net/ethernet/qlogic/qed/qed.h index fa41bf08a589..12c40ce3d876 100644 --- a/drivers/net/ethernet/qlogic/qed/qed.h +++ b/drivers/net/ethernet/qlogic/qed/qed.h @@ -1020,6 +1020,8 @@ u32 qed_unzip_data(struct qed_hwfn *p_hwfn, u32 input_len, u8 *input_buf, u32 max_size, u8 *unzip_buf); void qed_schedule_recovery_handler(struct qed_hwfn *p_hwfn); +void qed_hw_error_occurred(struct qed_hwfn *p_hwfn, + enum qed_hw_err_type err_type); void qed_get_protocol_stats(struct qed_dev *cdev, enum qed_mcp_protocol_type type, union qed_mcp_protocol_stats *stats); diff --git a/drivers/net/ethernet/qlogic/qed/qed_hw.c b/drivers/net/ethernet/qlogic/qed/qed_hw.c index 4ab8cfaf63d1..90b777019cf5 100644 --- a/drivers/net/ethernet/qlogic/qed/qed_hw.c +++ b/drivers/net/ethernet/qlogic/qed/qed_hw.c @@ -837,6 +837,38 @@ int qed_dmae_host2host(struct qed_hwfn *p_hwfn, return rc; } +void qed_hw_err_notify(struct qed_hwfn *p_hwfn, + struct qed_ptt *p_ptt, + enum qed_hw_err_type err_type, char *fmt, ...) +{ + char buf[QED_HW_ERR_MAX_STR_SIZE]; + va_list vl; + int len; + + if (fmt) { + va_start(vl, fmt); + len = vsnprintf(buf, QED_HW_ERR_MAX_STR_SIZE, fmt, vl); + va_end(vl); + + if (len > QED_HW_ERR_MAX_STR_SIZE - 1) + len = QED_HW_ERR_MAX_STR_SIZE - 1; + + DP_NOTICE(p_hwfn, "%s", buf); + } + + /* Fan failure cannot be masked by handling of another HW error */ + if (p_hwfn->cdev->recov_in_prog && + err_type != QED_HW_ERR_FAN_FAIL) { + DP_VERBOSE(p_hwfn, + NETIF_MSG_DRV, + "Recovery is in progress. Avoid notifying about HW error %d.\n", + err_type); + return; + } + + qed_hw_error_occurred(p_hwfn, err_type); +} + int qed_dmae_sanity(struct qed_hwfn *p_hwfn, struct qed_ptt *p_ptt, const char *phase) { diff --git a/drivers/net/ethernet/qlogic/qed/qed_hw.h b/drivers/net/ethernet/qlogic/qed/qed_hw.h index 505e94db939d..f5b109b04b66 100644 --- a/drivers/net/ethernet/qlogic/qed/qed_hw.h +++ b/drivers/net/ethernet/qlogic/qed/qed_hw.h @@ -315,4 +315,19 @@ int qed_init_fw_data(struct qed_dev *cdev, int qed_dmae_sanity(struct qed_hwfn *p_hwfn, struct qed_ptt *p_ptt, const char *phase); +#define QED_HW_ERR_MAX_STR_SIZE 256 + +/** + * @brief qed_hw_err_notify - Notify upper layer driver and management FW + * about a HW error. + * + * @param p_hwfn + * @param p_ptt + * @param err_type + * @param fmt - debug data buffer to send to the MFW + * @param ... 
- buffer format args + */ +void qed_hw_err_notify(struct qed_hwfn *p_hwfn, + struct qed_ptt *p_ptt, + enum qed_hw_err_type err_type, char *fmt, ...); #endif diff --git a/drivers/net/ethernet/qlogic/qed/qed_main.c b/drivers/net/ethernet/qlogic/qed/qed_main.c index 38a1d26ca9db..d7c9d94e4c59 100644 --- a/drivers/net/ethernet/qlogic/qed/qed_main.c +++ b/drivers/net/ethernet/qlogic/qed/qed_main.c @@ -2468,6 +2468,35 @@ void qed_schedule_recovery_handler(struct qed_hwfn *p_hwfn) ops->schedule_recovery_handler(cookie); } +char *qed_hw_err_type_descr[] = { + [QED_HW_ERR_FAN_FAIL] = "Fan Failure", + [QED_HW_ERR_MFW_RESP_FAIL] = "MFW Response Failure", + [QED_HW_ERR_HW_ATTN] = "HW Attention", + [QED_HW_ERR_DMAE_FAIL] = "DMAE Failure", + [QED_HW_ERR_RAMROD_FAIL] = "Ramrod Failure", + [QED_HW_ERR_FW_ASSERT] = "FW Assertion", + [QED_HW_ERR_LAST] = "Unknown", +}; + +void qed_hw_error_occurred(struct qed_hwfn *p_hwfn, + enum qed_hw_err_type err_type) +{ + struct qed_common_cb_ops *ops = p_hwfn->cdev->protocol_ops.common; + void *cookie = p_hwfn->cdev->ops_cookie; + char *err_str; + + if (err_type > QED_HW_ERR_LAST) + err_type = QED_HW_ERR_LAST; + err_str = qed_hw_err_type_descr[err_type]; + + DP_NOTICE(p_hwfn, "HW error occurred [%s]\n", err_str); + + /* Call the HW error handler of the protocol driver + */ + if (ops && ops->schedule_hw_err_handler) + ops->schedule_hw_err_handler(cookie, err_type); +} + static int qed_set_coalesce(struct qed_dev *cdev, u16 rx_coal, u16 tx_coal, void *handle) { diff --git a/include/linux/qed/qed_if.h b/include/linux/qed/qed_if.h index 8f29e0d8a7b3..1b7d9548ee43 100644 --- a/include/linux/qed/qed_if.h +++ b/include/linux/qed/qed_if.h @@ -607,6 +607,16 @@ struct qed_sb_info { struct qed_dev *cdev; }; +enum qed_hw_err_type { + QED_HW_ERR_FAN_FAIL, + QED_HW_ERR_MFW_RESP_FAIL, + QED_HW_ERR_HW_ATTN, + QED_HW_ERR_DMAE_FAIL, + QED_HW_ERR_RAMROD_FAIL, + QED_HW_ERR_FW_ASSERT, + QED_HW_ERR_LAST, +}; + enum qed_dev_type { QED_DEV_TYPE_BB, QED_DEV_TYPE_AH, @@ -814,6 +824,8 @@ struct qed_common_cb_ops { void (*link_update)(void *dev, struct qed_link_output *link); void (*schedule_recovery_handler)(void *dev); + void (*schedule_hw_err_handler)(void *dev, + enum qed_hw_err_type err_type); void (*dcbx_aen)(void *dev, struct qed_dcbx_get *get, u32 mib_type); void (*get_generic_tlv_data)(void *dev, struct qed_generic_tlvs *data); void (*get_protocol_tlv_data)(void *dev, void *data); }; -- cgit v1.2.3 From 936c7ba4dd5e94a3fc784f2296de5d577a9b5e43 Mon Sep 17 00:00:00 2001 From: Igor Russkikh Date: Thu, 14 May 2020 12:57:22 +0300 Subject: net: qed: attention clearing properties We have to respond differently to different hardware events; on some hardware indications, the HW attention (error condition) should be cleared by the driver to continue normal functioning. Here we introduce attention clear flags, and put them on some important events (in aeu_descs). Signed-off-by: Ariel Elior Signed-off-by: Michal Kalderon Signed-off-by: Igor Russkikh Signed-off-by: David S. 
Miller --- drivers/net/ethernet/qlogic/qed/qed.h | 3 +++ drivers/net/ethernet/qlogic/qed/qed_int.c | 22 ++++++++++++++++++---- drivers/net/ethernet/qlogic/qed/qed_int.h | 11 +++++++++++ drivers/net/ethernet/qlogic/qed/qed_main.c | 7 ++++++- drivers/net/ethernet/qlogic/qede/qede_main.c | 6 ++++++ include/linux/qed/qed_if.h | 9 +++++++++ 6 files changed, 53 insertions(+), 5 deletions(-) (limited to 'include/linux') diff --git a/drivers/net/ethernet/qlogic/qed/qed.h b/drivers/net/ethernet/qlogic/qed/qed.h index 07f6ef930b52..66ed39d6f357 100644 --- a/drivers/net/ethernet/qlogic/qed/qed.h +++ b/drivers/net/ethernet/qlogic/qed/qed.h @@ -838,6 +838,9 @@ struct qed_dev { /* Recovery */ bool recov_in_prog; + /* Indicates whether should prevent attentions from being reasserted */ + bool attn_clr_en; + /* LLH info */ u8 ppfid_bitmap; struct qed_llh_info *p_llh_info; diff --git a/drivers/net/ethernet/qlogic/qed/qed_int.c b/drivers/net/ethernet/qlogic/qed/qed_int.c index 1b1447b2f059..b7b974f0ef21 100644 --- a/drivers/net/ethernet/qlogic/qed/qed_int.c +++ b/drivers/net/ethernet/qlogic/qed/qed_int.c @@ -96,6 +96,7 @@ struct aeu_invert_reg_bit { #define ATTENTION_BB(value) (value << ATTENTION_BB_SHIFT) #define ATTENTION_BB_DIFFERENT BIT(23) +#define ATTENTION_CLEAR_ENABLE BIT(28) unsigned int flags; /* Callback to call if attention will be triggered */ @@ -371,6 +372,13 @@ static int qed_fw_assertion(struct qed_hwfn *p_hwfn) return -EINVAL; } +static int qed_general_attention_35(struct qed_hwfn *p_hwfn) +{ + DP_INFO(p_hwfn, "General attention 35!\n"); + + return 0; +} + #define QED_DORQ_ATTENTION_REASON_MASK (0xfffff) #define QED_DORQ_ATTENTION_OPAQUE_MASK (0xffff) #define QED_DORQ_ATTENTION_OPAQUE_SHIFT (0x0) @@ -613,14 +621,15 @@ static struct aeu_invert_reg aeu_descs[NUM_ATTN_REGS] = { { { /* After Invert 4 */ - {"General Attention 32", ATTENTION_SINGLE, - qed_fw_assertion, + {"General Attention 32", ATTENTION_SINGLE | + ATTENTION_CLEAR_ENABLE, qed_fw_assertion, MAX_BLOCK_ID}, {"General Attention %d", (2 << ATTENTION_LENGTH_SHIFT) | (33 << ATTENTION_OFFSET_SHIFT), NULL, MAX_BLOCK_ID}, - {"General Attention 35", ATTENTION_SINGLE, - NULL, MAX_BLOCK_ID}, + {"General Attention 35", ATTENTION_SINGLE | + ATTENTION_CLEAR_ENABLE, qed_general_attention_35, + MAX_BLOCK_ID}, {"NWS Parity", ATTENTION_PAR | ATTENTION_BB_DIFFERENT | ATTENTION_BB(AEU_INVERT_REG_SPECIAL_CNIG_0), @@ -2361,6 +2370,11 @@ void qed_int_disable_post_isr_release(struct qed_dev *cdev) cdev->hwfns[i].b_int_requested = false; } +void qed_int_attn_clr_enable(struct qed_dev *cdev, bool clr_enable) +{ + cdev->attn_clr_en = clr_enable; +} + int qed_int_set_timer_res(struct qed_hwfn *p_hwfn, struct qed_ptt *p_ptt, u8 timer_res, u16 sb_id, bool tx) { diff --git a/drivers/net/ethernet/qlogic/qed/qed_int.h b/drivers/net/ethernet/qlogic/qed/qed_int.h index 9ad568d93ae6..e09db3386367 100644 --- a/drivers/net/ethernet/qlogic/qed/qed_int.h +++ b/drivers/net/ethernet/qlogic/qed/qed_int.h @@ -190,6 +190,17 @@ void qed_int_get_num_sbs(struct qed_hwfn *p_hwfn, */ void qed_int_disable_post_isr_release(struct qed_dev *cdev); +/** + * @brief qed_int_attn_clr_enable - sets whether the general behavior is + * preventing attentions from being reasserted, or following the + * attributes of the specific attention. + * + * @param cdev + * @param clr_enable + * + */ +void qed_int_attn_clr_enable(struct qed_dev *cdev, bool clr_enable); + /** * @brief - Doorbell Recovery handler. 
* Run doorbell recovery in case of PF overflow (and flush DORQ if diff --git a/drivers/net/ethernet/qlogic/qed/qed_main.c b/drivers/net/ethernet/qlogic/qed/qed_main.c index d7c9d94e4c59..83e798d4eebb 100644 --- a/drivers/net/ethernet/qlogic/qed/qed_main.c +++ b/drivers/net/ethernet/qlogic/qed/qed_main.c @@ -2491,10 +2491,14 @@ void qed_hw_error_occurred(struct qed_hwfn *p_hwfn, DP_NOTICE(p_hwfn, "HW error occurred [%s]\n", err_str); - /* Call the HW error handler of the protocol driver + /* Call the HW error handler of the protocol driver. + * If it is not available - perform a minimal handling of preventing + * HW attentions from being reasserted. */ if (ops && ops->schedule_hw_err_handler) ops->schedule_hw_err_handler(cookie, err_type); + else + qed_int_attn_clr_enable(p_hwfn->cdev, true); } static int qed_set_coalesce(struct qed_dev *cdev, u16 rx_coal, u16 tx_coal, @@ -2718,6 +2722,7 @@ const struct qed_common_ops qed_common_ops_pass = { .set_led = &qed_set_led, .recovery_process = &qed_recovery_process, .recovery_prolog = &qed_recovery_prolog, + .attn_clr_enable = &qed_int_attn_clr_enable, .update_drv_state = &qed_update_drv_state, .update_mac = &qed_update_mac, .update_mtu = &qed_update_mtu, diff --git a/drivers/net/ethernet/qlogic/qede/qede_main.c b/drivers/net/ethernet/qlogic/qede/qede_main.c index e67d5da23792..ee7662da6413 100644 --- a/drivers/net/ethernet/qlogic/qede/qede_main.c +++ b/drivers/net/ethernet/qlogic/qede/qede_main.c @@ -2516,6 +2516,8 @@ err: static void qede_atomic_hw_err_handler(struct qede_dev *edev) { + struct qed_dev *cdev = edev->cdev; + DP_NOTICE(edev, "Generic non-sleepable HW error handling started - err_flags 0x%lx\n", edev->err_flags); @@ -2523,6 +2525,10 @@ static void qede_atomic_hw_err_handler(struct qede_dev *edev) /* Get a call trace of the flow that led to the error */ WARN_ON(test_bit(QEDE_ERR_WARN, &edev->err_flags)); + /* Prevent HW attentions from being reasserted */ + if (test_bit(QEDE_ERR_ATTN_CLR_EN, &edev->err_flags)) + edev->ops->common->attn_clr_enable(cdev, true); + DP_NOTICE(edev, "Generic non-sleepable HW error handling is done\n"); } diff --git a/include/linux/qed/qed_if.h b/include/linux/qed/qed_if.h index 1b7d9548ee43..978e91e9ab65 100644 --- a/include/linux/qed/qed_if.h +++ b/include/linux/qed/qed_if.h @@ -1046,6 +1046,15 @@ struct qed_common_ops { */ int (*set_led)(struct qed_dev *cdev, enum qed_led_mode mode); + +/** + * @brief attn_clr_enable - Prevent attentions from being reasserted + * + * @param cdev + * @param clr_enable + */ + void (*attn_clr_enable)(struct qed_dev *cdev, bool clr_enable); + /** * @brief db_recovery_add - add doorbell information to the doorbell * recovery mechanism. -- cgit v1.2.3 From 8f76812e1cc4d561c3efc3b2586c686b5428d31f Mon Sep 17 00:00:00 2001 From: Igor Russkikh Date: Thu, 14 May 2020 12:57:27 +0300 Subject: net: qed: fix bad formatting On some adjacent code, fix bad code formatting Signed-off-by: Ariel Elior Signed-off-by: Michal Kalderon Signed-off-by: Igor Russkikh Signed-off-by: David S. 
Miller --- include/linux/qed/qed_if.h | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/qed/qed_if.h b/include/linux/qed/qed_if.h index 978e91e9ab65..48325d7790f8 100644 --- a/include/linux/qed/qed_if.h +++ b/include/linux/qed/qed_if.h @@ -821,12 +821,11 @@ enum qed_nvm_flash_cmd { struct qed_common_cb_ops { void (*arfs_filter_op)(void *dev, void *fltr, u8 fw_rc); - void (*link_update)(void *dev, - struct qed_link_output *link); + void (*link_update)(void *dev, struct qed_link_output *link); void (*schedule_recovery_handler)(void *dev); void (*schedule_hw_err_handler)(void *dev, enum qed_hw_err_type err_type); - void (*dcbx_aen)(void *dev, struct qed_dcbx_get *get, u32 mib_type); + void (*dcbx_aen)(void *dev, struct qed_dcbx_get *get, u32 mib_type); void (*get_generic_tlv_data)(void *dev, struct qed_generic_tlvs *data); void (*get_protocol_tlv_data)(void *dev, void *data); }; -- cgit v1.2.3 From 6ae72bfa656ea04806f98ef85cb44b0789064362 Mon Sep 17 00:00:00 2001 From: Yicong Yang Date: Sat, 9 May 2020 18:19:28 +0800 Subject: PCI: Unify pcie_find_root_port() and pci_find_pcie_root_port() Previously we used pcie_find_root_port() to find a Root Port from a PCIe device and pci_find_pcie_root_port() to find a Root Port from a Conventional PCI device. Unify the two functions and use pcie_find_root_port() to find a Root Port from either a Conventional PCI device or a PCIe device. Then there is no need to distinguish the type of the device. Link: https://lore.kernel.org/r/1589019568-5216-1-git-send-email-yangyicong@hisilicon.com Signed-off-by: Yicong Yang Signed-off-by: Bjorn Helgaas Acked-by: Kalle Valo # wireless Acked-by: Mika Westerberg # thunderbolt --- drivers/pci/pci-acpi.c | 2 +- drivers/pci/pci.c | 24 ------------------------ drivers/pci/probe.c | 2 +- drivers/pci/quirks.c | 2 +- drivers/thunderbolt/switch.c | 4 ++-- include/linux/pci.h | 23 ++++++++++++++--------- 6 files changed, 19 insertions(+), 38 deletions(-) (limited to 'include/linux') diff --git a/drivers/pci/pci-acpi.c b/drivers/pci/pci-acpi.c index d21969fba6ab..d820a55ae71c 100644 --- a/drivers/pci/pci-acpi.c +++ b/drivers/pci/pci-acpi.c @@ -948,7 +948,7 @@ static bool acpi_pci_bridge_d3(struct pci_dev *dev) * Look for a special _DSD property for the root port and if it * is set we know the hierarchy behind it supports D3 just fine. */ - root = pci_find_pcie_root_port(dev); + root = pcie_find_root_port(dev); if (!root) return false; diff --git a/drivers/pci/pci.c b/drivers/pci/pci.c index bb78f580814e..227a3a979ec4 100644 --- a/drivers/pci/pci.c +++ b/drivers/pci/pci.c @@ -751,30 +751,6 @@ struct resource *pci_find_resource(struct pci_dev *dev, struct resource *res) } EXPORT_SYMBOL(pci_find_resource); -/** - * pci_find_pcie_root_port - return PCIe Root Port - * @dev: PCI device to query - * - * Traverse up the parent chain and return the PCIe Root Port PCI Device - * for a given PCI Device. 
- */ -struct pci_dev *pci_find_pcie_root_port(struct pci_dev *dev) -{ - struct pci_dev *bridge, *highest_pcie_bridge = dev; - - bridge = pci_upstream_bridge(dev); - while (bridge && pci_is_pcie(bridge)) { - highest_pcie_bridge = bridge; - bridge = pci_upstream_bridge(bridge); - } - - if (pci_pcie_type(highest_pcie_bridge) != PCI_EXP_TYPE_ROOT_PORT) - return NULL; - - return highest_pcie_bridge; -} -EXPORT_SYMBOL(pci_find_pcie_root_port); - /** * pci_wait_for_pending - wait for @mask bit(s) to clear in status word @pos * @dev: the PCI device to operate on diff --git a/drivers/pci/probe.c b/drivers/pci/probe.c index 77b8a145c39b..cdff469ba070 100644 --- a/drivers/pci/probe.c +++ b/drivers/pci/probe.c @@ -2056,7 +2056,7 @@ static void pci_configure_relaxed_ordering(struct pci_dev *dev) * For now, we only deal with Relaxed Ordering issues with Root * Ports. Peer-to-Peer DMA is another can of worms. */ - root = pci_find_pcie_root_port(dev); + root = pcie_find_root_port(dev); if (!root) return; diff --git a/drivers/pci/quirks.c b/drivers/pci/quirks.c index 28c9a2409c50..885044d050a6 100644 --- a/drivers/pci/quirks.c +++ b/drivers/pci/quirks.c @@ -4319,7 +4319,7 @@ DECLARE_PCI_FIXUP_CLASS_EARLY(PCI_VENDOR_ID_AMD, 0x1a02, PCI_CLASS_NOT_DEFINED, */ static void quirk_disable_root_port_attributes(struct pci_dev *pdev) { - struct pci_dev *root_port = pci_find_pcie_root_port(pdev); + struct pci_dev *root_port = pcie_find_root_port(pdev); if (!root_port) { pci_warn(pdev, "PCIe Completion erratum may cause device errors\n"); diff --git a/drivers/thunderbolt/switch.c b/drivers/thunderbolt/switch.c index a2ce99051c51..d92c7554520b 100644 --- a/drivers/thunderbolt/switch.c +++ b/drivers/thunderbolt/switch.c @@ -263,7 +263,7 @@ static void nvm_authenticate_start_dma_port(struct tb_switch *sw) * itself. To be on the safe side keep the root port in D0 during * the whole upgrade process. */ - root_port = pci_find_pcie_root_port(sw->tb->nhi->pdev); + root_port = pcie_find_root_port(sw->tb->nhi->pdev); if (root_port) pm_runtime_get_noresume(&root_port->dev); } @@ -272,7 +272,7 @@ static void nvm_authenticate_complete_dma_port(struct tb_switch *sw) { struct pci_dev *root_port; - root_port = pci_find_pcie_root_port(sw->tb->nhi->pdev); + root_port = pcie_find_root_port(sw->tb->nhi->pdev); if (root_port) pm_runtime_put(&root_port->dev); } diff --git a/include/linux/pci.h b/include/linux/pci.h index 0453ee458ab1..bbd6510065a7 100644 --- a/include/linux/pci.h +++ b/include/linux/pci.h @@ -1025,7 +1025,6 @@ void pci_bus_add_device(struct pci_dev *dev); void pci_read_bridge_bases(struct pci_bus *child); struct resource *pci_find_parent_resource(const struct pci_dev *dev, struct resource *res); -struct pci_dev *pci_find_pcie_root_port(struct pci_dev *dev); u8 pci_swizzle_interrupt_pin(const struct pci_dev *dev, u8 pin); int pci_get_interrupt_pin(struct pci_dev *dev, struct pci_dev **bridge); u8 pci_common_swizzle(struct pci_dev *dev, u8 *pinp); @@ -2143,17 +2142,23 @@ static inline int pci_pcie_type(const struct pci_dev *dev) return (pcie_caps_reg(dev) & PCI_EXP_FLAGS_TYPE) >> 4; } +/** + * pcie_find_root_port - Get the PCIe root port device + * @dev: PCI device + * + * Traverse up the parent chain and return the PCIe Root Port PCI Device + * for a given PCI/PCIe Device. 
+ */ static inline struct pci_dev *pcie_find_root_port(struct pci_dev *dev) { - while (1) { - if (!pci_is_pcie(dev)) - break; - if (pci_pcie_type(dev) == PCI_EXP_TYPE_ROOT_PORT) - return dev; - if (!dev->bus->self) - break; - dev = dev->bus->self; + struct pci_dev *bridge = pci_upstream_bridge(dev); + + while (bridge) { + if (pci_pcie_type(bridge) == PCI_EXP_TYPE_ROOT_PORT) + return bridge; + bridge = pci_upstream_bridge(bridge); } + return NULL; } -- cgit v1.2.3 From 69cf449166987d9a041020be6422ee7bf94a7228 Mon Sep 17 00:00:00 2001 From: Sai Praneeth Prakhya Date: Wed, 13 May 2020 15:47:21 -0700 Subject: iommu: Remove functions that support private domain After moving iommu_group setup to iommu core code [1][2] and removing private domain support in vt-d [3], there are no users for functions such as iommu_request_dm_for_dev(), iommu_request_dma_domain_for_dev() and request_default_domain_for_dev(). So, remove these functions. [1] commit dce8d6964ebd ("iommu/amd: Convert to probe/release_device() call-backs") [2] commit e5d1841f18b2 ("iommu/vt-d: Convert to probe/release_device() call-backs") [3] commit 327d5b2fee91 ("iommu/vt-d: Allow 32bit devices to uses DMA domain") Signed-off-by: Sai Praneeth Prakhya Cc: Joerg Roedel Cc: Lu Baolu Link: https://lore.kernel.org/r/20200513224721.20504-1-sai.praneeth.prakhya@intel.com Signed-off-by: Joerg Roedel --- drivers/iommu/iommu.c | 65 --------------------------------------------------- include/linux/iommu.h | 12 ---------- 2 files changed, 77 deletions(-) (limited to 'include/linux') diff --git a/drivers/iommu/iommu.c b/drivers/iommu/iommu.c index 4050569188be..374b34fd6fac 100644 --- a/drivers/iommu/iommu.c +++ b/drivers/iommu/iommu.c @@ -2536,71 +2536,6 @@ struct iommu_resv_region *iommu_alloc_resv_region(phys_addr_t start, } EXPORT_SYMBOL_GPL(iommu_alloc_resv_region); -static int -request_default_domain_for_dev(struct device *dev, unsigned long type) -{ - struct iommu_domain *domain; - struct iommu_group *group; - int ret; - - /* Device must already be in a group before calling this function */ - group = iommu_group_get(dev); - if (!group) - return -EINVAL; - - mutex_lock(&group->mutex); - - ret = 0; - if (group->default_domain && group->default_domain->type == type) - goto out; - - /* Don't change mappings of existing devices */ - ret = -EBUSY; - if (iommu_group_device_count(group) != 1) - goto out; - - ret = -ENOMEM; - domain = __iommu_domain_alloc(dev->bus, type); - if (!domain) - goto out; - - /* Attach the device to the domain */ - ret = __iommu_attach_group(domain, group); - if (ret) { - iommu_domain_free(domain); - goto out; - } - - /* Make the domain the default for this group */ - if (group->default_domain) - iommu_domain_free(group->default_domain); - group->default_domain = domain; - - iommu_create_device_direct_mappings(group, dev); - - dev_info(dev, "Using iommu %s mapping\n", - type == IOMMU_DOMAIN_DMA ? 
"dma" : "direct"); - - ret = 0; -out: - mutex_unlock(&group->mutex); - iommu_group_put(group); - - return ret; -} - -/* Request that a device is direct mapped by the IOMMU */ -int iommu_request_dm_for_dev(struct device *dev) -{ - return request_default_domain_for_dev(dev, IOMMU_DOMAIN_IDENTITY); -} - -/* Request that a device can't be direct mapped by the IOMMU */ -int iommu_request_dma_domain_for_dev(struct device *dev) -{ - return request_default_domain_for_dev(dev, IOMMU_DOMAIN_DMA); -} - void iommu_set_default_passthrough(bool cmd_line) { if (cmd_line) diff --git a/include/linux/iommu.h b/include/linux/iommu.h index 7cfd2dddb49d..78a26ae5c2b6 100644 --- a/include/linux/iommu.h +++ b/include/linux/iommu.h @@ -482,8 +482,6 @@ extern void iommu_get_resv_regions(struct device *dev, struct list_head *list); extern void iommu_put_resv_regions(struct device *dev, struct list_head *list); extern void generic_iommu_put_resv_regions(struct device *dev, struct list_head *list); -extern int iommu_request_dm_for_dev(struct device *dev); -extern int iommu_request_dma_domain_for_dev(struct device *dev); extern void iommu_set_default_passthrough(bool cmd_line); extern void iommu_set_default_translated(bool cmd_line); extern bool iommu_default_passthrough(void); @@ -804,16 +802,6 @@ static inline int iommu_get_group_resv_regions(struct iommu_group *group, return -ENODEV; } -static inline int iommu_request_dm_for_dev(struct device *dev) -{ - return -ENODEV; -} - -static inline int iommu_request_dma_domain_for_dev(struct device *dev) -{ - return -ENODEV; -} - static inline void iommu_set_default_passthrough(bool cmd_line) { } -- cgit v1.2.3 From c150c0f362c1e51c0e3216c9912b85b71d00e70d Mon Sep 17 00:00:00 2001 From: Lukas Wunner Date: Tue, 12 May 2020 14:40:02 +0200 Subject: serial: Allow uart_get_rs485_mode() to return errno We're about to amend uart_get_rs485_mode() to support a GPIO pin for rs485 bus termination. Retrieving the GPIO descriptor may fail, so allow uart_get_rs485_mode() to return an errno and change all callers to check for failure. The GPIO descriptor is going to be stored in struct uart_port. Pass that struct to uart_get_rs485_mode() in lieu of a struct device and struct serial_rs485, both of which are directly accessible from struct uart_port. A few drivers call uart_get_rs485_mode() before setting the struct device pointer in struct uart_port. Shuffle those calls around where necessary. [Heiko Stuebner did the ar933x_uart.c portion, hence his Signed-off-by.] 
Signed-off-by: Heiko Stuebner Signed-off-by: Lukas Wunner Link: https://lore.kernel.org/r/271e814af4b0db3bffbbb74abf2b46b75add4516.1589285873.git.lukas@wunner.de Signed-off-by: Greg Kroah-Hartman --- drivers/tty/serial/8250/8250_core.c | 4 +++- drivers/tty/serial/ar933x_uart.c | 6 ++++-- drivers/tty/serial/atmel_serial.c | 6 ++++-- drivers/tty/serial/fsl_lpuart.c | 5 ++++- drivers/tty/serial/imx.c | 6 +++++- drivers/tty/serial/omap-serial.c | 4 +++- drivers/tty/serial/serial_core.c | 6 +++++- drivers/tty/serial/stm32-usart.c | 8 ++++---- include/linux/serial_core.h | 2 +- 9 files changed, 33 insertions(+), 14 deletions(-) (limited to 'include/linux') diff --git a/drivers/tty/serial/8250/8250_core.c b/drivers/tty/serial/8250/8250_core.c index 9548d3f8fc8e..fc118f649887 100644 --- a/drivers/tty/serial/8250/8250_core.c +++ b/drivers/tty/serial/8250/8250_core.c @@ -1026,7 +1026,9 @@ int serial8250_register_8250_port(struct uart_8250_port *up) if (up->port.dev) { uart->port.dev = up->port.dev; - uart_get_rs485_mode(uart->port.dev, &uart->port.rs485); + ret = uart_get_rs485_mode(&uart->port); + if (ret) + goto err; } if (up->port.flags & UPF_FIXED_TYPE) diff --git a/drivers/tty/serial/ar933x_uart.c b/drivers/tty/serial/ar933x_uart.c index 7e7f1398019f..0c80a79d7442 100644 --- a/drivers/tty/serial/ar933x_uart.c +++ b/drivers/tty/serial/ar933x_uart.c @@ -766,8 +766,6 @@ static int ar933x_uart_probe(struct platform_device *pdev) goto err_disable_clk; } - uart_get_rs485_mode(&pdev->dev, &port->rs485); - port->mapbase = mem_res->start; port->line = id; port->irq = irq_res->start; @@ -786,6 +784,10 @@ static int ar933x_uart_probe(struct platform_device *pdev) baud = ar933x_uart_get_baud(port->uartclk, 0, AR933X_UART_MAX_STEP); up->max_baud = min_t(unsigned int, baud, AR933X_UART_MAX_BAUD); + ret = uart_get_rs485_mode(port); + if (ret) + goto err_disable_clk; + up->gpios = mctrl_gpio_init(port, 0); if (IS_ERR(up->gpios) && PTR_ERR(up->gpios) != -ENOSYS) return PTR_ERR(up->gpios); diff --git a/drivers/tty/serial/atmel_serial.c b/drivers/tty/serial/atmel_serial.c index 8d7080efad9b..e43471b33710 100644 --- a/drivers/tty/serial/atmel_serial.c +++ b/drivers/tty/serial/atmel_serial.c @@ -2491,8 +2491,6 @@ static int atmel_init_port(struct atmel_uart_port *atmel_port, atmel_init_property(atmel_port, pdev); atmel_set_ops(port); - uart_get_rs485_mode(&mpdev->dev, &port->rs485); - port->iotype = UPIO_MEM; port->flags = UPF_BOOT_AUTOCONF | UPF_IOREMAP; port->ops = &atmel_pops; @@ -2506,6 +2504,10 @@ static int atmel_init_port(struct atmel_uart_port *atmel_port, memset(&atmel_port->rx_ring, 0, sizeof(atmel_port->rx_ring)); + ret = uart_get_rs485_mode(port); + if (ret) + return ret; + /* for console, the clock could already be configured */ if (!atmel_port->clk) { atmel_port->clk = clk_get(&mpdev->dev, "usart"); diff --git a/drivers/tty/serial/fsl_lpuart.c b/drivers/tty/serial/fsl_lpuart.c index 6a9909e56449..029324c77cd7 100644 --- a/drivers/tty/serial/fsl_lpuart.c +++ b/drivers/tty/serial/fsl_lpuart.c @@ -2619,7 +2619,9 @@ static int lpuart_probe(struct platform_device *pdev) if (ret) goto failed_attach_port; - uart_get_rs485_mode(&pdev->dev, &sport->port.rs485); + ret = uart_get_rs485_mode(&sport->port); + if (ret) + goto failed_get_rs485; if (sport->port.rs485.flags & SER_RS485_RX_DURING_TX) dev_err(&pdev->dev, "driver doesn't support RX during TX\n"); @@ -2632,6 +2634,7 @@ static int lpuart_probe(struct platform_device *pdev) return 0; +failed_get_rs485: failed_attach_port: failed_irq_request: 
lpuart_disable_clks(sport); diff --git a/drivers/tty/serial/imx.c b/drivers/tty/serial/imx.c index f4023d9d8e7a..986d902fb7fe 100644 --- a/drivers/tty/serial/imx.c +++ b/drivers/tty/serial/imx.c @@ -2304,7 +2304,11 @@ static int imx_uart_probe(struct platform_device *pdev) sport->ucr4 = readl(sport->port.membase + UCR4); sport->ufcr = readl(sport->port.membase + UFCR); - uart_get_rs485_mode(&pdev->dev, &sport->port.rs485); + ret = uart_get_rs485_mode(&sport->port); + if (ret) { + clk_disable_unprepare(sport->clk_ipg); + return ret; + } if (sport->port.rs485.flags & SER_RS485_ENABLED && (!sport->have_rtscts && !sport->have_rtsgpio)) diff --git a/drivers/tty/serial/omap-serial.c b/drivers/tty/serial/omap-serial.c index c71c1a2266dc..8573fc9cb0cd 100644 --- a/drivers/tty/serial/omap-serial.c +++ b/drivers/tty/serial/omap-serial.c @@ -1608,7 +1608,9 @@ static int serial_omap_probe_rs485(struct uart_omap_port *up, if (!np) return 0; - uart_get_rs485_mode(up->dev, rs485conf); + ret = uart_get_rs485_mode(&up->port); + if (ret) + return ret; if (of_property_read_bool(np, "rs485-rts-active-high")) { rs485conf->flags |= SER_RS485_RTS_ON_SEND; diff --git a/drivers/tty/serial/serial_core.c b/drivers/tty/serial/serial_core.c index 66a5e2faf57e..43b6682877d5 100644 --- a/drivers/tty/serial/serial_core.c +++ b/drivers/tty/serial/serial_core.c @@ -3295,8 +3295,10 @@ EXPORT_SYMBOL(uart_remove_one_port); * This function implements the device tree binding described in * Documentation/devicetree/bindings/serial/rs485.txt. */ -void uart_get_rs485_mode(struct device *dev, struct serial_rs485 *rs485conf) +int uart_get_rs485_mode(struct uart_port *port) { + struct serial_rs485 *rs485conf = &port->rs485; + struct device *dev = port->dev; u32 rs485_delay[2]; int ret; @@ -3328,6 +3330,8 @@ void uart_get_rs485_mode(struct device *dev, struct serial_rs485 *rs485conf) rs485conf->flags &= ~SER_RS485_RTS_ON_SEND; rs485conf->flags |= SER_RS485_RTS_AFTER_SEND; } + + return 0; } EXPORT_SYMBOL_GPL(uart_get_rs485_mode); diff --git a/drivers/tty/serial/stm32-usart.c b/drivers/tty/serial/stm32-usart.c index 17c2f3276888..7d1acb3786b8 100644 --- a/drivers/tty/serial/stm32-usart.c +++ b/drivers/tty/serial/stm32-usart.c @@ -159,9 +159,7 @@ static int stm32_init_rs485(struct uart_port *port, if (!pdev->dev.of_node) return -ENODEV; - uart_get_rs485_mode(&pdev->dev, rs485conf); - - return 0; + return uart_get_rs485_mode(port); } static int stm32_pending_rx(struct uart_port *port, u32 *sr, int *last_res, @@ -959,7 +957,9 @@ static int stm32_init_port(struct stm32_port *stm32port, port->rs485_config = stm32_config_rs485; - stm32_init_rs485(port, pdev); + ret = stm32_init_rs485(port, pdev); + if (ret) + return ret; if (stm32port->info->cfg.has_wakeup) { stm32port->wakeirq = platform_get_irq(pdev, 1); diff --git a/include/linux/serial_core.h b/include/linux/serial_core.h index 92f5eba86052..b649a2b894e7 100644 --- a/include/linux/serial_core.h +++ b/include/linux/serial_core.h @@ -472,5 +472,5 @@ extern int uart_handle_break(struct uart_port *port); (cflag) & CRTSCTS || \ !((cflag) & CLOCAL)) -void uart_get_rs485_mode(struct device *dev, struct serial_rs485 *rs485conf); +int uart_get_rs485_mode(struct uart_port *port); #endif /* LINUX_SERIAL_CORE_H */ -- cgit v1.2.3 From 0f1c9688a194d22bb81953bd85bd18b0115fd17f Mon Sep 17 00:00:00 2001 From: Emil Velikov Date: Wed, 13 May 2020 22:43:41 +0100 Subject: tty/sysrq: alpha: export and use __sysrq_get_key_op() Export a pointer to the sysrq_get_key_op(). 
This way we can cleanly unregister it, instead of the current solution of modifying it in place. Since __sysrq_get_key_op() is no longer used externally, let's make it a static function. This patch will allow us to limit access to each and every sysrq op and constify the sysrq handling. Cc: Greg Kroah-Hartman Cc: Jiri Slaby Cc: linux-kernel@vger.kernel.org Cc: Richard Henderson Cc: Ivan Kokshaysky Cc: Matt Turner Cc: linux-alpha@vger.kernel.org Signed-off-by: Emil Velikov Link: https://lore.kernel.org/r/20200513214351.2138580-1-emil.l.velikov@gmail.com Signed-off-by: Greg Kroah-Hartman --- arch/alpha/kernel/setup.c | 13 +++++++++++-- drivers/tty/sysrq.c | 4 +++- include/linux/sysrq.h | 2 +- 3 files changed, 15 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/arch/alpha/kernel/setup.c b/arch/alpha/kernel/setup.c index f19aa577354b..dd7f770f23cf 100644 --- a/arch/alpha/kernel/setup.c +++ b/arch/alpha/kernel/setup.c @@ -430,6 +430,15 @@ register_cpus(void) arch_initcall(register_cpus); +#ifdef CONFIG_MAGIC_SYSRQ +static struct sysrq_key_op srm_sysrq_reboot_op = { + .handler = machine_halt, + .help_msg = "reboot(b)", + .action_msg = "Resetting", + .enable_mask = SYSRQ_ENABLE_BOOT, +}; +#endif + void __init setup_arch(char **cmdline_p) { @@ -550,8 +559,8 @@ setup_arch(char **cmdline_p) /* If we're using SRM, make sysrq-b halt back to the prom, not auto-reboot. */ if (alpha_using_srm) { - struct sysrq_key_op *op = __sysrq_get_key_op('b'); - op->handler = (void *) machine_halt; + unregister_sysrq_key('b', __sysrq_reboot_op); + register_sysrq_key('b', &srm_sysrq_reboot_op); } #endif diff --git a/drivers/tty/sysrq.c b/drivers/tty/sysrq.c index 0dc3878794fd..1741134cabca 100644 --- a/drivers/tty/sysrq.c +++ b/drivers/tty/sysrq.c @@ -172,6 +172,8 @@ static struct sysrq_key_op sysrq_reboot_op = { .enable_mask = SYSRQ_ENABLE_BOOT, }; +struct sysrq_key_op *__sysrq_reboot_op = &sysrq_reboot_op; + static void sysrq_handle_sync(int key) { emergency_sync(); @@ -516,7 +518,7 @@ static int sysrq_key_table_key2index(int key) /* * get and put functions for the table, exposed to modules. */ -struct sysrq_key_op *__sysrq_get_key_op(int key) +static struct sysrq_key_op *__sysrq_get_key_op(int key) { struct sysrq_key_op *op_p = NULL; int i; diff --git a/include/linux/sysrq.h b/include/linux/sysrq.h index 8e159e16850f..9b51f98e5f60 100644 --- a/include/linux/sysrq.h +++ b/include/linux/sysrq.h @@ -47,7 +47,7 @@ void handle_sysrq(int key); void __handle_sysrq(int key, bool check_mask); int register_sysrq_key(int key, struct sysrq_key_op *op); int unregister_sysrq_key(int key, struct sysrq_key_op *op); -struct sysrq_key_op *__sysrq_get_key_op(int key); +extern struct sysrq_key_op *__sysrq_reboot_op; int sysrq_toggle_support(int enable_mask); int sysrq_mask(void); -- cgit v1.2.3 From 23cbedf812ff7c7751582928e32d953d84c1c821 Mon Sep 17 00:00:00 2001 From: Emil Velikov Date: Wed, 13 May 2020 22:43:42 +0100 Subject: tty/sysrq: constify the sysrq API The user is not supposed to tinker with the underlying sysrq_key_op. Make that explicit by adding a handful of const notations. 
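As a concrete illustration, a hypothetical out-of-tree module could now keep its op in rodata; the foo_* names below are invented, only register_sysrq_key()/unregister_sysrq_key() are real:

	#include <linux/module.h>
	#include <linux/printk.h>
	#include <linux/sysrq.h>

	static void foo_sysrq_handler(int key)
	{
		pr_info("sysrq: foo handler invoked by key '%c'\n", key);
	}

	static const struct sysrq_key_op foo_sysrq_op = {
		.handler	= foo_sysrq_handler,
		.help_msg	= "foo(x)",
		.action_msg	= "Running foo handler",
		.enable_mask	= SYSRQ_ENABLE_KEYBOARD,
	};

	static int __init foo_init(void)
	{
		/* the op can live in rodata now that the API takes const pointers */
		return register_sysrq_key('x', &foo_sysrq_op);
	}

	static void __exit foo_exit(void)
	{
		unregister_sysrq_key('x', &foo_sysrq_op);
	}

	module_init(foo_init);
	module_exit(foo_exit);
	MODULE_LICENSE("GPL");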
Cc: Greg Kroah-Hartman Cc: Jiri Slaby Cc: linux-kernel@vger.kernel.org Signed-off-by: Emil Velikov Link: https://lore.kernel.org/r/20200513214351.2138580-2-emil.l.velikov@gmail.com Signed-off-by: Greg Kroah-Hartman --- Documentation/admin-guide/sysrq.rst | 10 +++++----- drivers/tty/sysrq.c | 16 ++++++++-------- include/linux/sysrq.h | 16 ++++++++-------- 3 files changed, 21 insertions(+), 21 deletions(-) (limited to 'include/linux') diff --git a/Documentation/admin-guide/sysrq.rst b/Documentation/admin-guide/sysrq.rst index a46209f4636c..e6424d8c5846 100644 --- a/Documentation/admin-guide/sysrq.rst +++ b/Documentation/admin-guide/sysrq.rst @@ -231,13 +231,13 @@ prints help, and C) an action_msg string, that will print right before your handler is called. Your handler must conform to the prototype in 'sysrq.h'. After the ``sysrq_key_op`` is created, you can call the kernel function -``register_sysrq_key(int key, struct sysrq_key_op *op_p);`` this will +``register_sysrq_key(int key, const struct sysrq_key_op *op_p);`` this will register the operation pointed to by ``op_p`` at table key 'key', if that slot in the table is blank. At module unload time, you must call -the function ``unregister_sysrq_key(int key, struct sysrq_key_op *op_p)``, which -will remove the key op pointed to by 'op_p' from the key 'key', if and only if -it is currently registered in that slot. This is in case the slot has been -overwritten since you registered it. +the function ``unregister_sysrq_key(int key, const struct sysrq_key_op *op_p)``, +which will remove the key op pointed to by 'op_p' from the key 'key', if and +only if it is currently registered in that slot. This is in case the slot has +been overwritten since you registered it. The Magic SysRQ system works by registering key operations against a key op lookup table, which is defined in 'drivers/tty/sysrq.c'. This key table has diff --git a/drivers/tty/sysrq.c b/drivers/tty/sysrq.c index 1741134cabca..091c64a3cef0 100644 --- a/drivers/tty/sysrq.c +++ b/drivers/tty/sysrq.c @@ -518,9 +518,9 @@ static int sysrq_key_table_key2index(int key) /* * get and put functions for the table, exposed to modules. 
*/ -static struct sysrq_key_op *__sysrq_get_key_op(int key) +static const struct sysrq_key_op *__sysrq_get_key_op(int key) { - struct sysrq_key_op *op_p = NULL; + const struct sysrq_key_op *op_p = NULL; int i; i = sysrq_key_table_key2index(key); @@ -530,7 +530,7 @@ static struct sysrq_key_op *__sysrq_get_key_op(int key) return op_p; } -static void __sysrq_put_key_op(int key, struct sysrq_key_op *op_p) +static void __sysrq_put_key_op(int key, const struct sysrq_key_op *op_p) { int i = sysrq_key_table_key2index(key); @@ -540,7 +540,7 @@ static void __sysrq_put_key_op(int key, struct sysrq_key_op *op_p) void __handle_sysrq(int key, bool check_mask) { - struct sysrq_key_op *op_p; + const struct sysrq_key_op *op_p; int orig_log_level; int orig_suppress_printk; int i; @@ -1063,8 +1063,8 @@ int sysrq_toggle_support(int enable_mask) } EXPORT_SYMBOL_GPL(sysrq_toggle_support); -static int __sysrq_swap_key_ops(int key, struct sysrq_key_op *insert_op_p, - struct sysrq_key_op *remove_op_p) +static int __sysrq_swap_key_ops(int key, const struct sysrq_key_op *insert_op_p, + const struct sysrq_key_op *remove_op_p) { int retval; @@ -1087,13 +1087,13 @@ static int __sysrq_swap_key_ops(int key, struct sysrq_key_op *insert_op_p, return retval; } -int register_sysrq_key(int key, struct sysrq_key_op *op_p) +int register_sysrq_key(int key, const struct sysrq_key_op *op_p) { return __sysrq_swap_key_ops(key, op_p, NULL); } EXPORT_SYMBOL(register_sysrq_key); -int unregister_sysrq_key(int key, struct sysrq_key_op *op_p) +int unregister_sysrq_key(int key, const struct sysrq_key_op *op_p) { return __sysrq_swap_key_ops(key, NULL, op_p); } diff --git a/include/linux/sysrq.h b/include/linux/sysrq.h index 9b51f98e5f60..479028391c08 100644 --- a/include/linux/sysrq.h +++ b/include/linux/sysrq.h @@ -30,10 +30,10 @@ #define SYSRQ_ENABLE_RTNICE 0x0100 struct sysrq_key_op { - void (*handler)(int); - char *help_msg; - char *action_msg; - int enable_mask; + void (* const handler)(int); + const char * const help_msg; + const char * const action_msg; + const int enable_mask; }; #ifdef CONFIG_MAGIC_SYSRQ @@ -45,8 +45,8 @@ struct sysrq_key_op { void handle_sysrq(int key); void __handle_sysrq(int key, bool check_mask); -int register_sysrq_key(int key, struct sysrq_key_op *op); -int unregister_sysrq_key(int key, struct sysrq_key_op *op); +int register_sysrq_key(int key, const struct sysrq_key_op *op); +int unregister_sysrq_key(int key, const struct sysrq_key_op *op); extern struct sysrq_key_op *__sysrq_reboot_op; int sysrq_toggle_support(int enable_mask); @@ -62,12 +62,12 @@ static inline void __handle_sysrq(int key, bool check_mask) { } -static inline int register_sysrq_key(int key, struct sysrq_key_op *op) +static inline int register_sysrq_key(int key, const struct sysrq_key_op *op) { return -EINVAL; } -static inline int unregister_sysrq_key(int key, struct sysrq_key_op *op) +static inline int unregister_sysrq_key(int key, const struct sysrq_key_op *op) { return -EINVAL; } -- cgit v1.2.3 From 7fffe31d3eaa2f08bdfde2403adcaa8029f9bea4 Mon Sep 17 00:00:00 2001 From: Emil Velikov Date: Wed, 13 May 2020 22:43:43 +0100 Subject: tty/sysrq: constify the sysrq_key_op(s) All the users treat them as immutable - annotate them as such. 
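To sketch what the member-level const buys, using a hypothetical demo_* op (assuming the constified struct above; only struct sysrq_key_op itself is real):

	static void demo_handler(int key)
	{
		pr_info("sysrq: demo handler for key '%c'\n", key);
	}

	static struct sysrq_key_op demo_op = {
		.handler	= demo_handler,
		.help_msg	= "demo(y)",
		.action_msg	= "Demo",
		.enable_mask	= SYSRQ_ENABLE_KEYBOARD,
	};

	static void demo(void)
	{
		/* reading and calling through the op still works as before */
		demo_op.handler('y');

		/*
		 * but in-place patching, as the old alpha SRM code did, no
		 * longer compiles, even on a non-const instance:
		 *
		 *	demo_op.handler = some_other_handler;
		 *	    error: assignment of read-only member 'handler'
		 */
	}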
Cc: Greg Kroah-Hartman Cc: Jiri Slaby Cc: linux-kernel@vger.kernel.org Signed-off-by: Emil Velikov Link: https://lore.kernel.org/r/20200513214351.2138580-3-emil.l.velikov@gmail.com Signed-off-by: Greg Kroah-Hartman --- drivers/tty/sysrq.c | 52 +++++++++++++++++++++++++-------------------------- include/linux/sysrq.h | 2 +- 2 files changed, 27 insertions(+), 27 deletions(-) (limited to 'include/linux') diff --git a/drivers/tty/sysrq.c b/drivers/tty/sysrq.c index 091c64a3cef0..477cdc1e9cf3 100644 --- a/drivers/tty/sysrq.c +++ b/drivers/tty/sysrq.c @@ -106,7 +106,7 @@ static void sysrq_handle_loglevel(int key) pr_info("Loglevel set to %d\n", i); console_loglevel = i; } -static struct sysrq_key_op sysrq_loglevel_op = { +static const struct sysrq_key_op sysrq_loglevel_op = { .handler = sysrq_handle_loglevel, .help_msg = "loglevel(0-9)", .action_msg = "Changing Loglevel", @@ -119,14 +119,14 @@ static void sysrq_handle_SAK(int key) struct work_struct *SAK_work = &vc_cons[fg_console].SAK_work; schedule_work(SAK_work); } -static struct sysrq_key_op sysrq_SAK_op = { +static const struct sysrq_key_op sysrq_SAK_op = { .handler = sysrq_handle_SAK, .help_msg = "sak(k)", .action_msg = "SAK", .enable_mask = SYSRQ_ENABLE_KEYBOARD, }; #else -#define sysrq_SAK_op (*(struct sysrq_key_op *)NULL) +#define sysrq_SAK_op (*(const struct sysrq_key_op *)NULL) #endif #ifdef CONFIG_VT @@ -135,14 +135,14 @@ static void sysrq_handle_unraw(int key) vt_reset_unicode(fg_console); } -static struct sysrq_key_op sysrq_unraw_op = { +static const struct sysrq_key_op sysrq_unraw_op = { .handler = sysrq_handle_unraw, .help_msg = "unraw(r)", .action_msg = "Keyboard mode set to system default", .enable_mask = SYSRQ_ENABLE_KEYBOARD, }; #else -#define sysrq_unraw_op (*(struct sysrq_key_op *)NULL) +#define sysrq_unraw_op (*(const struct sysrq_key_op *)NULL) #endif /* CONFIG_VT */ static void sysrq_handle_crash(int key) @@ -152,7 +152,7 @@ static void sysrq_handle_crash(int key) panic("sysrq triggered crash\n"); } -static struct sysrq_key_op sysrq_crash_op = { +static const struct sysrq_key_op sysrq_crash_op = { .handler = sysrq_handle_crash, .help_msg = "crash(c)", .action_msg = "Trigger a crash", @@ -165,20 +165,20 @@ static void sysrq_handle_reboot(int key) local_irq_enable(); emergency_restart(); } -static struct sysrq_key_op sysrq_reboot_op = { +static const struct sysrq_key_op sysrq_reboot_op = { .handler = sysrq_handle_reboot, .help_msg = "reboot(b)", .action_msg = "Resetting", .enable_mask = SYSRQ_ENABLE_BOOT, }; -struct sysrq_key_op *__sysrq_reboot_op = &sysrq_reboot_op; +const struct sysrq_key_op *__sysrq_reboot_op = &sysrq_reboot_op; static void sysrq_handle_sync(int key) { emergency_sync(); } -static struct sysrq_key_op sysrq_sync_op = { +static const struct sysrq_key_op sysrq_sync_op = { .handler = sysrq_handle_sync, .help_msg = "sync(s)", .action_msg = "Emergency Sync", @@ -190,7 +190,7 @@ static void sysrq_handle_show_timers(int key) sysrq_timer_list_show(); } -static struct sysrq_key_op sysrq_show_timers_op = { +static const struct sysrq_key_op sysrq_show_timers_op = { .handler = sysrq_handle_show_timers, .help_msg = "show-all-timers(q)", .action_msg = "Show clockevent devices & pending hrtimers (no others)", @@ -200,7 +200,7 @@ static void sysrq_handle_mountro(int key) { emergency_remount(); } -static struct sysrq_key_op sysrq_mountro_op = { +static const struct sysrq_key_op sysrq_mountro_op = { .handler = sysrq_handle_mountro, .help_msg = "unmount(u)", .action_msg = "Emergency Remount R/O", @@ -213,13 +213,13 @@ 
static void sysrq_handle_showlocks(int key) debug_show_all_locks(); } -static struct sysrq_key_op sysrq_showlocks_op = { +static const struct sysrq_key_op sysrq_showlocks_op = { .handler = sysrq_handle_showlocks, .help_msg = "show-all-locks(d)", .action_msg = "Show Locks Held", }; #else -#define sysrq_showlocks_op (*(struct sysrq_key_op *)NULL) +#define sysrq_showlocks_op (*(const struct sysrq_key_op *)NULL) #endif #ifdef CONFIG_SMP @@ -266,7 +266,7 @@ static void sysrq_handle_showallcpus(int key) } } -static struct sysrq_key_op sysrq_showallcpus_op = { +static const struct sysrq_key_op sysrq_showallcpus_op = { .handler = sysrq_handle_showallcpus, .help_msg = "show-backtrace-all-active-cpus(l)", .action_msg = "Show backtrace of all active CPUs", @@ -284,7 +284,7 @@ static void sysrq_handle_showregs(int key) show_regs(regs); perf_event_print_debug(); } -static struct sysrq_key_op sysrq_showregs_op = { +static const struct sysrq_key_op sysrq_showregs_op = { .handler = sysrq_handle_showregs, .help_msg = "show-registers(p)", .action_msg = "Show Regs", @@ -296,7 +296,7 @@ static void sysrq_handle_showstate(int key) show_state(); show_workqueue_state(); } -static struct sysrq_key_op sysrq_showstate_op = { +static const struct sysrq_key_op sysrq_showstate_op = { .handler = sysrq_handle_showstate, .help_msg = "show-task-states(t)", .action_msg = "Show State", @@ -307,7 +307,7 @@ static void sysrq_handle_showstate_blocked(int key) { show_state_filter(TASK_UNINTERRUPTIBLE); } -static struct sysrq_key_op sysrq_showstate_blocked_op = { +static const struct sysrq_key_op sysrq_showstate_blocked_op = { .handler = sysrq_handle_showstate_blocked, .help_msg = "show-blocked-tasks(w)", .action_msg = "Show Blocked State", @@ -321,21 +321,21 @@ static void sysrq_ftrace_dump(int key) { ftrace_dump(DUMP_ALL); } -static struct sysrq_key_op sysrq_ftrace_dump_op = { +static const struct sysrq_key_op sysrq_ftrace_dump_op = { .handler = sysrq_ftrace_dump, .help_msg = "dump-ftrace-buffer(z)", .action_msg = "Dump ftrace buffer", .enable_mask = SYSRQ_ENABLE_DUMP, }; #else -#define sysrq_ftrace_dump_op (*(struct sysrq_key_op *)NULL) +#define sysrq_ftrace_dump_op (*(const struct sysrq_key_op *)NULL) #endif static void sysrq_handle_showmem(int key) { show_mem(0, NULL); } -static struct sysrq_key_op sysrq_showmem_op = { +static const struct sysrq_key_op sysrq_showmem_op = { .handler = sysrq_handle_showmem, .help_msg = "show-memory-usage(m)", .action_msg = "Show Memory", @@ -366,7 +366,7 @@ static void sysrq_handle_term(int key) send_sig_all(SIGTERM); console_loglevel = CONSOLE_LOGLEVEL_DEBUG; } -static struct sysrq_key_op sysrq_term_op = { +static const struct sysrq_key_op sysrq_term_op = { .handler = sysrq_handle_term, .help_msg = "terminate-all-tasks(e)", .action_msg = "Terminate All Tasks", @@ -396,7 +396,7 @@ static void sysrq_handle_moom(int key) { schedule_work(&moom_work); } -static struct sysrq_key_op sysrq_moom_op = { +static const struct sysrq_key_op sysrq_moom_op = { .handler = sysrq_handle_moom, .help_msg = "memory-full-oom-kill(f)", .action_msg = "Manual OOM execution", @@ -408,7 +408,7 @@ static void sysrq_handle_thaw(int key) { emergency_thaw_all(); } -static struct sysrq_key_op sysrq_thaw_op = { +static const struct sysrq_key_op sysrq_thaw_op = { .handler = sysrq_handle_thaw, .help_msg = "thaw-filesystems(j)", .action_msg = "Emergency Thaw of all frozen filesystems", @@ -421,7 +421,7 @@ static void sysrq_handle_kill(int key) send_sig_all(SIGKILL); console_loglevel = CONSOLE_LOGLEVEL_DEBUG; } -static struct 
sysrq_key_op sysrq_kill_op = { +static const struct sysrq_key_op sysrq_kill_op = { .handler = sysrq_handle_kill, .help_msg = "kill-all-tasks(i)", .action_msg = "Kill All Tasks", @@ -432,7 +432,7 @@ static void sysrq_handle_unrt(int key) { normalize_rt_tasks(); } -static struct sysrq_key_op sysrq_unrt_op = { +static const struct sysrq_key_op sysrq_unrt_op = { .handler = sysrq_handle_unrt, .help_msg = "nice-all-RT-tasks(n)", .action_msg = "Nice All RT Tasks", @@ -442,7 +442,7 @@ static struct sysrq_key_op sysrq_unrt_op = { /* Key Operations table and lock */ static DEFINE_SPINLOCK(sysrq_key_table_lock); -static struct sysrq_key_op *sysrq_key_table[36] = { +static const struct sysrq_key_op *sysrq_key_table[36] = { &sysrq_loglevel_op, /* 0 */ &sysrq_loglevel_op, /* 1 */ &sysrq_loglevel_op, /* 2 */ diff --git a/include/linux/sysrq.h b/include/linux/sysrq.h index 479028391c08..3a582ec7a2f1 100644 --- a/include/linux/sysrq.h +++ b/include/linux/sysrq.h @@ -47,7 +47,7 @@ void handle_sysrq(int key); void __handle_sysrq(int key, bool check_mask); int register_sysrq_key(int key, const struct sysrq_key_op *op); int unregister_sysrq_key(int key, const struct sysrq_key_op *op); -extern struct sysrq_key_op *__sysrq_reboot_op; +extern const struct sysrq_key_op *__sysrq_reboot_op; int sysrq_toggle_support(int enable_mask); int sysrq_mask(void); -- cgit v1.2.3 From ff4c65ca48f08f4781accfb1d224acd7c897070e Mon Sep 17 00:00:00 2001 From: Vinod Koul Date: Thu, 14 May 2020 17:50:36 +0530 Subject: usb: hci: add hc_driver as argument for usb_hcd_pci_probe usb_hcd_pci_probe expects users to call this with driver_data set as hc_driver, that limits the possibility of using the driver_data for driver data. Add hc_driver as argument to usb_hcd_pci_probe and modify the callers ehci/ohci/xhci/uhci to pass hc_driver as argument and freeup the driver_data used Tested xhci driver on Dragon-board RB3, compile tested ehci, ohci and uhci. [For all but the xHCI parts] [For the xhci part] Suggested-by: Mathias Nyman Acked-by: Alan Stern Acked-by: Mathias Nyman Signed-off-by: Vinod Koul Link: https://lore.kernel.org/r/20200514122039.300417-2-vkoul@kernel.org Signed-off-by: Greg Kroah-Hartman --- drivers/usb/core/hcd-pci.c | 7 ++++--- drivers/usb/host/ehci-pci.c | 6 ++---- drivers/usb/host/ohci-pci.c | 9 ++++++--- drivers/usb/host/uhci-pci.c | 8 ++++++-- drivers/usb/host/xhci-pci.c | 14 +++++--------- include/linux/usb/hcd.h | 3 ++- 6 files changed, 25 insertions(+), 22 deletions(-) (limited to 'include/linux') diff --git a/drivers/usb/core/hcd-pci.c b/drivers/usb/core/hcd-pci.c index f0a259937da8..1547aa6e5314 100644 --- a/drivers/usb/core/hcd-pci.c +++ b/drivers/usb/core/hcd-pci.c @@ -159,6 +159,7 @@ static void ehci_wait_for_companions(struct pci_dev *pdev, struct usb_hcd *hcd, * usb_hcd_pci_probe - initialize PCI-based HCDs * @dev: USB Host Controller being probed * @id: pci hotplug id connecting controller to HCD framework + * @driver: USB HC driver handle * Context: !in_interrupt() * * Allocates basic PCI resources for this USB host controller, and @@ -169,9 +170,9 @@ static void ehci_wait_for_companions(struct pci_dev *pdev, struct usb_hcd *hcd, * * Return: 0 if successful. 
*/ -int usb_hcd_pci_probe(struct pci_dev *dev, const struct pci_device_id *id) +int usb_hcd_pci_probe(struct pci_dev *dev, const struct pci_device_id *id, + const struct hc_driver *driver) { - struct hc_driver *driver; struct usb_hcd *hcd; int retval; int hcd_irq = 0; @@ -181,7 +182,7 @@ int usb_hcd_pci_probe(struct pci_dev *dev, const struct pci_device_id *id) if (!id) return -EINVAL; - driver = (struct hc_driver *)id->driver_data; + if (!driver) return -EINVAL; diff --git a/drivers/usb/host/ehci-pci.c b/drivers/usb/host/ehci-pci.c index 1a48ab1bd3b2..3c3820ad9092 100644 --- a/drivers/usb/host/ehci-pci.c +++ b/drivers/usb/host/ehci-pci.c @@ -360,23 +360,21 @@ static int ehci_pci_probe(struct pci_dev *pdev, const struct pci_device_id *id) { if (is_bypassed_id(pdev)) return -ENODEV; - return usb_hcd_pci_probe(pdev, id); + return usb_hcd_pci_probe(pdev, id, &ehci_pci_hc_driver); } static void ehci_pci_remove(struct pci_dev *pdev) { pci_clear_mwi(pdev); - usb_hcd_pci_remove(pdev); + usb_hcd_pci_remove(pdev); } /* PCI driver selection metadata; PCI hotplugging uses this */ static const struct pci_device_id pci_ids [] = { { /* handle any USB 2.0 EHCI controller */ PCI_DEVICE_CLASS(PCI_CLASS_SERIAL_USB_EHCI, ~0), - .driver_data = (unsigned long) &ehci_pci_hc_driver, }, { PCI_VDEVICE(STMICRO, PCI_DEVICE_ID_STMICRO_USB_HOST), - .driver_data = (unsigned long) &ehci_pci_hc_driver, }, { /* end: all zeroes */ } }; diff --git a/drivers/usb/host/ohci-pci.c b/drivers/usb/host/ohci-pci.c index 22117a6aeb4a..585222af24ff 100644 --- a/drivers/usb/host/ohci-pci.c +++ b/drivers/usb/host/ohci-pci.c @@ -277,21 +277,24 @@ static const struct ohci_driver_overrides pci_overrides __initconst = { static const struct pci_device_id pci_ids[] = { { /* handle any USB OHCI controller */ PCI_DEVICE_CLASS(PCI_CLASS_SERIAL_USB_OHCI, ~0), - .driver_data = (unsigned long) &ohci_pci_hc_driver, }, { /* The device in the ConneXT I/O hub has no class reg */ PCI_VDEVICE(STMICRO, PCI_DEVICE_ID_STMICRO_USB_OHCI), - .driver_data = (unsigned long) &ohci_pci_hc_driver, }, { /* end: all zeroes */ } }; MODULE_DEVICE_TABLE (pci, pci_ids); +static int ohci_pci_probe(struct pci_dev *dev, const struct pci_device_id *id) +{ + return usb_hcd_pci_probe(dev, id, &ohci_pci_hc_driver); +} + /* pci driver glue; this is a "new style" PCI driver module */ static struct pci_driver ohci_pci_driver = { .name = hcd_name, .id_table = pci_ids, - .probe = usb_hcd_pci_probe, + .probe = ohci_pci_probe, .remove = usb_hcd_pci_remove, .shutdown = usb_hcd_pci_shutdown, diff --git a/drivers/usb/host/uhci-pci.c b/drivers/usb/host/uhci-pci.c index 957c87efc746..9b88745d247f 100644 --- a/drivers/usb/host/uhci-pci.c +++ b/drivers/usb/host/uhci-pci.c @@ -287,17 +287,21 @@ static const struct hc_driver uhci_driver = { static const struct pci_device_id uhci_pci_ids[] = { { /* handle any USB UHCI controller */ PCI_DEVICE_CLASS(PCI_CLASS_SERIAL_USB_UHCI, ~0), - .driver_data = (unsigned long) &uhci_driver, }, { /* end: all zeroes */ } }; MODULE_DEVICE_TABLE(pci, uhci_pci_ids); +static int uhci_pci_probe(struct pci_dev *dev, const struct pci_device_id *id) +{ + return usb_hcd_pci_probe(dev, id, &uhci_driver); +} + static struct pci_driver uhci_pci_driver = { .name = hcd_name, .id_table = uhci_pci_ids, - .probe = usb_hcd_pci_probe, + .probe = uhci_pci_probe, .remove = usb_hcd_pci_remove, .shutdown = uhci_shutdown, diff --git a/drivers/usb/host/xhci-pci.c b/drivers/usb/host/xhci-pci.c index 766b74723e64..b6c2f5c530e3 100644 --- a/drivers/usb/host/xhci-pci.c +++ 
b/drivers/usb/host/xhci-pci.c @@ -327,11 +327,8 @@ static int xhci_pci_probe(struct pci_dev *dev, const struct pci_device_id *id) { int retval; struct xhci_hcd *xhci; - struct hc_driver *driver; struct usb_hcd *hcd; - driver = (struct hc_driver *)id->driver_data; - /* Prevent runtime suspending between USB-2 and USB-3 initialization */ pm_runtime_get_noresume(&dev->dev); @@ -341,7 +338,7 @@ static int xhci_pci_probe(struct pci_dev *dev, const struct pci_device_id *id) * to say USB 2.0, but I'm not sure what the implications would be in * the other parts of the HCD code. */ - retval = usb_hcd_pci_probe(dev, id); + retval = usb_hcd_pci_probe(dev, id, &xhci_pci_hc_driver); if (retval) goto put_runtime_pm; @@ -349,8 +346,8 @@ static int xhci_pci_probe(struct pci_dev *dev, const struct pci_device_id *id) /* USB 2.0 roothub is stored in the PCI device now. */ hcd = dev_get_drvdata(&dev->dev); xhci = hcd_to_xhci(hcd); - xhci->shared_hcd = usb_create_shared_hcd(driver, &dev->dev, - pci_name(dev), hcd); + xhci->shared_hcd = usb_create_shared_hcd(&xhci_pci_hc_driver, &dev->dev, + pci_name(dev), hcd); if (!xhci->shared_hcd) { retval = -ENOMEM; goto dealloc_usb2_hcd; @@ -544,10 +541,9 @@ static void xhci_pci_shutdown(struct usb_hcd *hcd) /*-------------------------------------------------------------------------*/ /* PCI driver selection metadata; PCI hotplugging uses this */ -static const struct pci_device_id pci_ids[] = { { +static const struct pci_device_id pci_ids[] = { /* handle any USB 3.0 xHCI controller */ - PCI_DEVICE_CLASS(PCI_CLASS_SERIAL_USB_XHCI, ~0), - .driver_data = (unsigned long) &xhci_pci_hc_driver, + { PCI_DEVICE_CLASS(PCI_CLASS_SERIAL_USB_XHCI, ~0), }, { /* end: all zeroes */ } }; diff --git a/include/linux/usb/hcd.h b/include/linux/usb/hcd.h index e12105ed3834..3dbb42c637c1 100644 --- a/include/linux/usb/hcd.h +++ b/include/linux/usb/hcd.h @@ -479,7 +479,8 @@ extern void usb_hcd_platform_shutdown(struct platform_device *dev); struct pci_dev; struct pci_device_id; extern int usb_hcd_pci_probe(struct pci_dev *dev, - const struct pci_device_id *id); + const struct pci_device_id *id, + const struct hc_driver *driver); extern void usb_hcd_pci_remove(struct pci_dev *dev); extern void usb_hcd_pci_shutdown(struct pci_dev *dev); -- cgit v1.2.3 From 6fbeb0048e6b93f7b7f195864f3ddc876ac4d42e Mon Sep 17 00:00:00 2001 From: Mikulas Patocka Date: Fri, 7 Feb 2020 15:59:25 -0500 Subject: dm bufio: implement discard Add functions dm_bufio_issue_discard and dm_bufio_discard_buffers. dm_bufio_issue_discard sends discard request to the underlying device. dm_bufio_discard_buffers frees buffers in the range and then calls dm_bufio_issue_discard. Also, factor out block_to_sector for reuse in dm_bufio_issue_discard. 
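As a usage sketch, assuming a dm target that already owns a dm_bufio_client (the two helpers below are hypothetical; only the new dm_bufio_* calls are from this patch):

	#include <linux/dm-bufio.h>

	/*
	 * Drop any cached buffers for blocks [first, first + count) and then
	 * discard the corresponding region on the underlying device.
	 */
	static int discard_region(struct dm_bufio_client *c, sector_t first,
				  sector_t count)
	{
		return dm_bufio_discard_buffers(c, first, count);
	}

	/* or discard on disk without touching the buffer cache: */
	static int discard_on_disk(struct dm_bufio_client *c, sector_t first,
				   sector_t count)
	{
		return dm_bufio_issue_discard(c, first, count);
	}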
Signed-off-by: Mikulas Patocka Signed-off-by: Mike Snitzer --- drivers/md/dm-bufio.c | 69 ++++++++++++++++++++++++++++++++++++++++++++---- include/linux/dm-bufio.h | 12 +++++++++ 2 files changed, 76 insertions(+), 5 deletions(-) (limited to 'include/linux') diff --git a/drivers/md/dm-bufio.c b/drivers/md/dm-bufio.c index 2d519c223562..bf289be1ee3a 100644 --- a/drivers/md/dm-bufio.c +++ b/drivers/md/dm-bufio.c @@ -631,6 +631,19 @@ dmio: submit_bio(bio); } +static inline sector_t block_to_sector(struct dm_bufio_client *c, sector_t block) +{ + sector_t sector; + + if (likely(c->sectors_per_block_bits >= 0)) + sector = block << c->sectors_per_block_bits; + else + sector = block * (c->block_size >> SECTOR_SHIFT); + sector += c->start; + + return sector; +} + static void submit_io(struct dm_buffer *b, int rw, void (*end_io)(struct dm_buffer *, blk_status_t)) { unsigned n_sectors; @@ -639,11 +652,7 @@ static void submit_io(struct dm_buffer *b, int rw, void (*end_io)(struct dm_buff b->end_io = end_io; - if (likely(b->c->sectors_per_block_bits >= 0)) - sector = b->block << b->c->sectors_per_block_bits; - else - sector = b->block * (b->c->block_size >> SECTOR_SHIFT); - sector += b->c->start; + sector = block_to_sector(b->c, b->block); if (rw != REQ_OP_WRITE) { n_sectors = b->c->block_size >> SECTOR_SHIFT; @@ -1325,6 +1334,56 @@ int dm_bufio_issue_flush(struct dm_bufio_client *c) } EXPORT_SYMBOL_GPL(dm_bufio_issue_flush); +/* + * Use dm-io to send a discard request to flush the device. + */ +int dm_bufio_issue_discard(struct dm_bufio_client *c, sector_t block, sector_t count) +{ + struct dm_io_request io_req = { + .bi_op = REQ_OP_DISCARD, + .bi_op_flags = REQ_SYNC, + .mem.type = DM_IO_KMEM, + .mem.ptr.addr = NULL, + .client = c->dm_io, + }; + struct dm_io_region io_reg = { + .bdev = c->bdev, + .sector = block_to_sector(c, block), + .count = block_to_sector(c, count), + }; + + BUG_ON(dm_bufio_in_request()); + + return dm_io(&io_req, 1, &io_reg, NULL); +} +EXPORT_SYMBOL_GPL(dm_bufio_issue_discard); + +/* + * Free the specified range of buffers. If a buffer is held by other process, it + * is not freed. If a buffer is dirty, it is discarded without writeback. + * Finally, send the discard request to the device. + */ +int dm_bufio_discard_buffers(struct dm_bufio_client *c, sector_t block, sector_t count) +{ + sector_t i; + + for (i = block; i < block + count; i++) { + struct dm_buffer *b; + dm_bufio_lock(c); + b = __find(c, i); + if (b && likely(!b->hold_count)) { + wait_on_bit_io(&b->state, B_READING, TASK_UNINTERRUPTIBLE); + wait_on_bit_io(&b->state, B_WRITING, TASK_UNINTERRUPTIBLE); + __unlink_buffer(b); + __free_buffer_wake(b); + } + dm_bufio_unlock(c); + } + + return dm_bufio_issue_discard(c, block, count); +} +EXPORT_SYMBOL_GPL(dm_bufio_discard_buffers); + /* * We first delete any other buffer that may be at that new location. * diff --git a/include/linux/dm-bufio.h b/include/linux/dm-bufio.h index 3c8b7d274bd9..07e1f163e299 100644 --- a/include/linux/dm-bufio.h +++ b/include/linux/dm-bufio.h @@ -118,6 +118,18 @@ int dm_bufio_write_dirty_buffers(struct dm_bufio_client *c); */ int dm_bufio_issue_flush(struct dm_bufio_client *c); +/* + * Send a discard request to the underlying device. + */ +int dm_bufio_issue_discard(struct dm_bufio_client *c, sector_t block, sector_t count); + +/* + * Free the specified range of buffers. If a buffer is held by other process, it + * is not freed. If a buffer is dirty, it is discarded without writeback. + * Finally, send the discard request to the device. 
+ */ +int dm_bufio_discard_buffers(struct dm_bufio_client *c, sector_t block, sector_t count); + /* * Like dm_bufio_release but also move the buffer to the new * block. dm_bufio_write_dirty_buffers is needed to commit the new block. -- cgit v1.2.3 From 087615bf3acdafd0ba7c7c9ed5286e7b7c80fe1b Mon Sep 17 00:00:00 2001 From: Gabriel Krisman Bertazi Date: Thu, 30 Apr 2020 16:48:29 -0400 Subject: dm mpath: pass IO start time to path selector The HST path selector needs this information to perform path prediction. For request-based mpath, struct request's io_start_time_ns is used, while for bio-based, use the start_time stored in dm_io. Signed-off-by: Gabriel Krisman Bertazi Signed-off-by: Mike Snitzer --- drivers/md/dm-mpath.c | 9 ++++++--- drivers/md/dm-path-selector.h | 2 +- drivers/md/dm-queue-length.c | 2 +- drivers/md/dm-service-time.c | 2 +- drivers/md/dm.c | 9 +++++++++ include/linux/device-mapper.h | 2 ++ 6 files changed, 20 insertions(+), 6 deletions(-) (limited to 'include/linux') diff --git a/drivers/md/dm-mpath.c b/drivers/md/dm-mpath.c index e0c800cf87a9..74246d7c7d68 100644 --- a/drivers/md/dm-mpath.c +++ b/drivers/md/dm-mpath.c @@ -567,7 +567,8 @@ static void multipath_release_clone(struct request *clone, if (pgpath && pgpath->pg->ps.type->end_io) pgpath->pg->ps.type->end_io(&pgpath->pg->ps, &pgpath->path, - mpio->nr_bytes); + mpio->nr_bytes, + clone->io_start_time_ns); } blk_put_request(clone); @@ -1617,7 +1618,8 @@ static int multipath_end_io(struct dm_target *ti, struct request *clone, struct path_selector *ps = &pgpath->pg->ps; if (ps->type->end_io) - ps->type->end_io(ps, &pgpath->path, mpio->nr_bytes); + ps->type->end_io(ps, &pgpath->path, mpio->nr_bytes, + clone->io_start_time_ns); } return r; @@ -1661,7 +1663,8 @@ done: struct path_selector *ps = &pgpath->pg->ps; if (ps->type->end_io) - ps->type->end_io(ps, &pgpath->path, mpio->nr_bytes); + ps->type->end_io(ps, &pgpath->path, mpio->nr_bytes, + dm_start_time_ns_from_clone(clone)); } return r; diff --git a/drivers/md/dm-path-selector.h b/drivers/md/dm-path-selector.h index b6eb5365b1a4..c47bc0e20275 100644 --- a/drivers/md/dm-path-selector.h +++ b/drivers/md/dm-path-selector.h @@ -74,7 +74,7 @@ struct path_selector_type { int (*start_io) (struct path_selector *ps, struct dm_path *path, size_t nr_bytes); int (*end_io) (struct path_selector *ps, struct dm_path *path, - size_t nr_bytes); + size_t nr_bytes, u64 start_time); }; /* Register a path selector */ diff --git a/drivers/md/dm-queue-length.c b/drivers/md/dm-queue-length.c index 969c4f1a3633..5fd018d18418 100644 --- a/drivers/md/dm-queue-length.c +++ b/drivers/md/dm-queue-length.c @@ -227,7 +227,7 @@ static int ql_start_io(struct path_selector *ps, struct dm_path *path, } static int ql_end_io(struct path_selector *ps, struct dm_path *path, - size_t nr_bytes) + size_t nr_bytes, u64 start_time) { struct path_info *pi = path->pscontext; diff --git a/drivers/md/dm-service-time.c b/drivers/md/dm-service-time.c index f006a9005593..9cfda665e9eb 100644 --- a/drivers/md/dm-service-time.c +++ b/drivers/md/dm-service-time.c @@ -309,7 +309,7 @@ static int st_start_io(struct path_selector *ps, struct dm_path *path, } static int st_end_io(struct path_selector *ps, struct dm_path *path, - size_t nr_bytes) + size_t nr_bytes, u64 start_time) { struct path_info *pi = path->pscontext; diff --git a/drivers/md/dm.c b/drivers/md/dm.c index db9e46114653..2fcb932eb4bd 100644 --- a/drivers/md/dm.c +++ b/drivers/md/dm.c @@ -675,6 +675,15 @@ static bool md_in_flight(struct mapped_device *md) return 
md_in_flight_bios(md); } +u64 dm_start_time_ns_from_clone(struct bio *bio) +{ + struct dm_target_io *tio = container_of(bio, struct dm_target_io, clone); + struct dm_io *io = tio->io; + + return jiffies_to_nsecs(io->start_time); +} +EXPORT_SYMBOL_GPL(dm_start_time_ns_from_clone); + static void start_io_acct(struct dm_io *io) { struct mapped_device *md = io->md; diff --git a/include/linux/device-mapper.h b/include/linux/device-mapper.h index af48d9da3916..934037d938b9 100644 --- a/include/linux/device-mapper.h +++ b/include/linux/device-mapper.h @@ -332,6 +332,8 @@ void *dm_per_bio_data(struct bio *bio, size_t data_size); struct bio *dm_bio_from_per_bio_data(void *data, size_t data_size); unsigned dm_bio_get_target_bio_nr(const struct bio *bio); +u64 dm_start_time_ns_from_clone(struct bio *bio); + int dm_register_target(struct target_type *t); void dm_unregister_target(struct target_type *t); -- cgit v1.2.3 From 716a7a25969003d82ab738179c3f1068a120ed11 Mon Sep 17 00:00:00 2001 From: Saravana Kannan Date: Thu, 14 May 2020 22:34:59 -0700 Subject: driver core: fw_devlink: Add support for batching fwnode parsing The amount of time spent parsing fwnodes of devices can become really high if the devices are added in an non-ideal order. Worst case can be O(N^2) when N devices are added. But this can be optimized to O(N) by adding all the devices and then parsing all their fwnodes in one batch. This commit adds fw_devlink_pause() and fw_devlink_resume() to allow doing this. Signed-off-by: Saravana Kannan Link: https://lore.kernel.org/r/20200515053500.215929-4-saravanak@google.com Signed-off-by: Greg Kroah-Hartman --- drivers/base/base.h | 1 + drivers/base/core.c | 116 ++++++++++++++++++++++++++++++++++++++++++++++--- drivers/base/dd.c | 8 ++++ include/linux/fwnode.h | 2 + 4 files changed, 120 insertions(+), 7 deletions(-) (limited to 'include/linux') diff --git a/drivers/base/base.h b/drivers/base/base.h index 40fb069a8a7e..95c22c0f9036 100644 --- a/drivers/base/base.h +++ b/drivers/base/base.h @@ -153,6 +153,7 @@ extern char *make_class_name(const char *name, struct kobject *kobj); extern int devres_release_all(struct device *dev); extern void device_block_probing(void); extern void device_unblock_probing(void); +extern void driver_deferred_probe_force_trigger(void); /* /sys/devices directory */ extern struct kset *devices_kset; diff --git a/drivers/base/core.c b/drivers/base/core.c index f585d92e09d0..84c569726d75 100644 --- a/drivers/base/core.c +++ b/drivers/base/core.c @@ -49,6 +49,9 @@ static LIST_HEAD(wait_for_suppliers); static DEFINE_MUTEX(wfs_lock); static LIST_HEAD(deferred_sync); static unsigned int defer_sync_state_count = 1; +static unsigned int defer_fw_devlink_count; +static DEFINE_MUTEX(defer_fw_devlink_lock); +static bool fw_devlink_is_permissive(void); #ifdef CONFIG_SRCU static DEFINE_MUTEX(device_links_lock); @@ -527,7 +530,7 @@ static void device_link_add_missing_supplier_links(void) int ret = fwnode_call_int_op(dev->fwnode, add_links, dev); if (!ret) list_del_init(&dev->links.needs_suppliers); - else if (ret != -ENODEV) + else if (ret != -ENODEV || fw_devlink_is_permissive()) dev->links.need_for_probe = false; } mutex_unlock(&wfs_lock); @@ -1177,17 +1180,116 @@ static void fw_devlink_link_device(struct device *dev) { int fw_ret; - device_link_add_missing_supplier_links(); + if (!fw_devlink_flags) + return; + + mutex_lock(&defer_fw_devlink_lock); + if (!defer_fw_devlink_count) + device_link_add_missing_supplier_links(); + + /* + * The device's fwnode not having add_links() 
doesn't affect whether other + * consumers can find this device as a supplier. So, this check is + * intentionally placed after device_link_add_missing_supplier_links(). + */ + if (!fwnode_has_op(dev->fwnode, add_links)) + goto out; - if (fw_devlink_flags && fwnode_has_op(dev->fwnode, add_links)) { + /* + * If fw_devlink is being deferred, assume all devices have mandatory + * suppliers they need to link to later. Then, when fw_devlink is + * resumed, all these devices will get a chance to try and link to any + * suppliers they have. + */ + if (!defer_fw_devlink_count) { fw_ret = fwnode_call_int_op(dev->fwnode, add_links, dev); - if (fw_ret == -ENODEV && !fw_devlink_is_permissive()) - device_link_wait_for_mandatory_supplier(dev); - else if (fw_ret) - device_link_wait_for_optional_supplier(dev); + if (fw_ret == -ENODEV && fw_devlink_is_permissive()) + fw_ret = -EAGAIN; + } else { + fw_ret = -ENODEV; } + + if (fw_ret == -ENODEV) + device_link_wait_for_mandatory_supplier(dev); + else if (fw_ret) + device_link_wait_for_optional_supplier(dev); + +out: + mutex_unlock(&defer_fw_devlink_lock); } +/** + * fw_devlink_pause - Pause parsing of fwnode to create device links + * + * Calling this function defers any fwnode parsing to create device links until + * fw_devlink_resume() is called. Both these functions are ref counted and the + * caller needs to match the calls. + * + * While fw_devlink is paused: + * - Any device that is added won't have its fwnode parsed to create device + * links. + * - The probe of the device will also be deferred during this period. + * - Any devices that were already added, but are waiting for suppliers, won't + * be able to link to newly added devices. + * + * Once fw_devlink_resume() is called: + * - All the fwnodes that were not parsed will be parsed. + * - All the devices whose probing was deferred will be reattempted if they + * aren't waiting for any more suppliers. + * + * This pair of functions is mainly meant to optimize the parsing of fwnodes + * when a lot of devices that need to link to each other are added in a short + * interval of time. For example, adding all the top level devices in a system. + * + * For example, if N devices are added and: + * - All the consumers are added before their suppliers + * - All the suppliers of the N devices are part of the N devices + * + * Then: + * + * - With the use of fw_devlink_pause() and fw_devlink_resume(), each device + * will only need one parsing of its fwnode because it is guaranteed to find + * all the supplier devices already registered and ready to link to. It won't + * have to do another pass later to find one or more suppliers it couldn't + * find in the first parse of the fwnode. So, we'll only need O(N) fwnode + * parses. + * + * - Without the use of fw_devlink_pause() and fw_devlink_resume(), we would + * end up doing O(N^2) parses of fwnodes because every device that's added is + * guaranteed to trigger a parse of the fwnode of every device added before + * it. This O(N^2) parse is made worse by the fact that when the fwnode of a + * device is parsed, all its descendant devices might need to have their + * fwnodes parsed too (even if the devices themselves aren't added). + */ +void fw_devlink_pause(void) +{ + mutex_lock(&defer_fw_devlink_lock); + defer_fw_devlink_count++; + mutex_unlock(&defer_fw_devlink_lock); +} + +/** fw_devlink_resume - Resume parsing of fwnode to create device links + * + * This function is used in conjunction with fw_devlink_pause() and is ref + * counted.
See documentation for fw_devlink_pause() for more details. + */ +void fw_devlink_resume(void) +{ + mutex_lock(&defer_fw_devlink_lock); + if (!defer_fw_devlink_count) { + WARN(true, "Unmatched fw_devlink pause/resume!"); + goto out; + } + + defer_fw_devlink_count--; + if (defer_fw_devlink_count) + goto out; + + device_link_add_missing_supplier_links(); + driver_deferred_probe_force_trigger(); +out: + mutex_unlock(&defer_fw_devlink_lock); +} /* Device links support end. */ int (*platform_notify)(struct device *dev) = NULL; diff --git a/drivers/base/dd.c b/drivers/base/dd.c index 48ca81cb8ebc..63991d97adcc 100644 --- a/drivers/base/dd.c +++ b/drivers/base/dd.c @@ -160,6 +160,14 @@ static bool driver_deferred_probe_enable = false; * again. */ static void driver_deferred_probe_trigger(void) +{ + if (!driver_deferred_probe_enable) + return; + + driver_deferred_probe_force_trigger(); +} + +void driver_deferred_probe_force_trigger(void) { if (!driver_deferred_probe_enable) return; diff --git a/include/linux/fwnode.h b/include/linux/fwnode.h index e0abafbb17f8..9506f8ec0974 100644 --- a/include/linux/fwnode.h +++ b/include/linux/fwnode.h @@ -171,5 +171,7 @@ struct fwnode_operations { #define get_dev_from_fwnode(fwnode) get_device((fwnode)->dev) extern u32 fw_devlink_get_flags(void); +void fw_devlink_pause(void); +void fw_devlink_resume(void); #endif -- cgit v1.2.3 From a17b53c4a4b55ec322c132b6670743612229ee9c Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Wed, 13 May 2020 16:03:53 -0700 Subject: bpf, capability: Introduce CAP_BPF Split BPF operations that are allowed under CAP_SYS_ADMIN into a combination of CAP_BPF, CAP_PERFMON, CAP_NET_ADMIN. For backward compatibility include them in CAP_SYS_ADMIN as well. The end result provides a simple safety model for applications that use BPF: - to load tracing program types BPF_PROG_TYPE_{KPROBE, TRACEPOINT, PERF_EVENT, RAW_TRACEPOINT, etc} use CAP_BPF and CAP_PERFMON - to load networking program types BPF_PROG_TYPE_{SCHED_CLS, XDP, SK_SKB, etc} use CAP_BPF and CAP_NET_ADMIN There are a few exceptions to this rule: - bpf_trace_printk() is allowed in networking programs, but it uses the tracing mechanism, hence this helper additionally needs CAP_PERFMON when a networking program uses it. - The BPF_F_ZERO_SEED flag for hash/lru maps is allowed under CAP_SYS_ADMIN only, to discourage production use. - BPF HW offload is allowed under CAP_SYS_ADMIN. - bpf_probe_write_user() is allowed under CAP_SYS_ADMIN only. CAPs are not checked at attach/detach time, with two exceptions: - loading BPF_PROG_TYPE_CGROUP_SKB is allowed for unprivileged users, hence CAP_NET_ADMIN is required at attach time. - flow_dissector detach doesn't check the prog FD at detach, hence CAP_NET_ADMIN is required at detach time. CAP_SYS_ADMIN is required to iterate BPF objects (progs, maps, links) via the get_next_id command and convert them to file descriptors via the GET_FD_BY_ID command. This restriction guarantees that multiple tasks with CAP_BPF are not able to affect each other. That leads to clean isolation of tasks. For example: task A with CAP_BPF and CAP_NET_ADMIN loads and attaches a firewall via bpf_link. task B with the same capabilities cannot detach that firewall unless task A explicitly passed the link FD to task B via scm_rights or bpffs. CAP_SYS_ADMIN can still detach/unload everything. Two networking user apps with CAP_SYS_ADMIN and CAP_NET_ADMIN can accidentally mess with each other's programs and maps. Two networking user apps with CAP_NET_ADMIN and CAP_BPF cannot affect each other.
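To make the scm_rights hand-off above concrete, here is a userspace sketch (illustrative only, not part of this patch): task A can pass its bpf_link FD to task B over a connected AF_UNIX socket with an SCM_RIGHTS control message:

	#include <string.h>
	#include <sys/socket.h>
	#include <sys/uio.h>

	/* Send an open FD (e.g. a bpf_link FD) to the peer of an AF_UNIX socket. */
	static int send_fd(int unix_sock, int fd)
	{
		char dummy = 'x';
		struct iovec iov = { .iov_base = &dummy, .iov_len = 1 };
		union {
			char buf[CMSG_SPACE(sizeof(int))];
			struct cmsghdr align;
		} u = {0};
		struct msghdr msg = {
			.msg_iov = &iov,
			.msg_iovlen = 1,
			.msg_control = u.buf,
			.msg_controllen = sizeof(u.buf),
		};
		struct cmsghdr *cmsg = CMSG_FIRSTHDR(&msg);

		cmsg->cmsg_level = SOL_SOCKET;
		cmsg->cmsg_type = SCM_RIGHTS;
		cmsg->cmsg_len = CMSG_LEN(sizeof(int));
		memcpy(CMSG_DATA(cmsg), &fd, sizeof(int));

		return sendmsg(unix_sock, &msg, 0) < 0 ? -1 : 0;
	}

The receiver ends up with its own reference to the link and can then detach it; without such an explicit hand-off (or a bpffs pin), only CAP_SYS_ADMIN can take the link away from task A.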
CAP_NET_ADMIN + CAP_BPF allows networking programs to access only packet data. Such networking progs cannot access arbitrary kernel memory or leak pointers. bpftool, bpftrace, bcc tools binaries should NOT be installed with CAP_BPF and CAP_PERFMON, since unpriv users will be able to read kernel secrets. But users with these two permissions will be able to use these tracing tools. CAP_PERFMON is the least secure, since it allows kprobes and kernel memory access. CAP_NET_ADMIN can stop network traffic via iproute2. CAP_BPF is the safest from a security point of view and harmless on its own. Having CAP_BPF and/or CAP_NET_ADMIN is not enough to write into an arbitrary map if that map is used by a firewall-like bpf prog. CAP_BPF allows many bpf prog_load commands in parallel. The verifier may consume a large amount of memory and significantly slow down the system. Existing unprivileged BPF operations are not affected. In particular, unprivileged users are allowed to load socket_filter and cg_skb program types and to create array, hash, prog_array, map-in-map map types. Signed-off-by: Alexei Starovoitov Signed-off-by: Daniel Borkmann Link: https://lore.kernel.org/bpf/20200513230355.7858-2-alexei.starovoitov@gmail.com --- include/linux/capability.h | 5 +++++ include/uapi/linux/capability.h | 34 +++++++++++++++++++++++++++++++++- security/selinux/include/classmap.h | 4 ++-- 3 files changed, 40 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/capability.h b/include/linux/capability.h index 027d7e4a853b..b4345b38a6be 100644 --- a/include/linux/capability.h +++ b/include/linux/capability.h @@ -256,6 +256,11 @@ static inline bool perfmon_capable(void) return capable(CAP_PERFMON) || capable(CAP_SYS_ADMIN); } +static inline bool bpf_capable(void) +{ + return capable(CAP_BPF) || capable(CAP_SYS_ADMIN); +} + /* audit system wants to get cap info from files as well */ extern int get_vfs_caps_from_disk(const struct dentry *dentry, struct cpu_vfs_cap_data *cpu_caps); diff --git a/include/uapi/linux/capability.h b/include/uapi/linux/capability.h index e58c9636741b..c7372180a0a9 100644 --- a/include/uapi/linux/capability.h +++ b/include/uapi/linux/capability.h @@ -274,6 +274,7 @@ struct vfs_ns_cap_data { arbitrary SCSI commands */ /* Allow setting encryption key on loopback filesystem */ /* Allow setting zone reclaim policy */ +/* Allow everything under CAP_BPF and CAP_PERFMON for backward compatibility */ #define CAP_SYS_ADMIN 21 @@ -374,7 +375,38 @@ struct vfs_ns_cap_data { #define CAP_PERFMON 38 -#define CAP_LAST_CAP CAP_PERFMON +/* + * CAP_BPF allows the following BPF operations: + * - Creating all types of BPF maps + * - Advanced verifier features + * - Indirect variable access + * - Bounded loops + * - BPF to BPF function calls + * - Scalar precision tracking + * - Larger complexity limits + * - Dead code elimination + * - And potentially other features + * - Loading BPF Type Format (BTF) data + * - Retrieving xlated and JITed code of BPF programs + * - Using the bpf_spin_lock() helper + * + * CAP_PERFMON relaxes the verifier checks further: + * - BPF progs can make use of pointer-to-integer conversions + * - speculation attack hardening measures are bypassed + * - bpf_probe_read to read arbitrary kernel memory is allowed + * - bpf_trace_printk to print kernel memory is allowed + * + * CAP_SYS_ADMIN is required to use bpf_probe_write_user. + * + * CAP_SYS_ADMIN is required to iterate system-wide loaded + * programs, maps, links, BTFs and convert their IDs to file descriptors.
+ * + * CAP_PERFMON and CAP_BPF are required to load tracing programs. + * CAP_NET_ADMIN and CAP_BPF are required to load networking programs. + */ +#define CAP_BPF 39 + +#define CAP_LAST_CAP CAP_BPF #define cap_valid(x) ((x) >= 0 && (x) <= CAP_LAST_CAP) diff --git a/security/selinux/include/classmap.h b/security/selinux/include/classmap.h index d233ab3f1533..98e1513b608a 100644 --- a/security/selinux/include/classmap.h +++ b/security/selinux/include/classmap.h @@ -27,9 +27,9 @@ "audit_control", "setfcap" #define COMMON_CAP2_PERMS "mac_override", "mac_admin", "syslog", \ - "wake_alarm", "block_suspend", "audit_read", "perfmon" + "wake_alarm", "block_suspend", "audit_read", "perfmon", "bpf" -#if CAP_LAST_CAP > CAP_PERFMON +#if CAP_LAST_CAP > CAP_BPF #error New capability defined, please update COMMON_CAP2_PERMS. #endif -- cgit v1.2.3 From 2c78ee898d8f10ae6fb2fa23a3fbaec96b1b7366 Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Wed, 13 May 2020 16:03:54 -0700 Subject: bpf: Implement CAP_BPF Implement permissions as stated in uapi/linux/capability.h. In order to do that the verifier allow_ptr_leaks flag is split into four flags, set as: env->allow_ptr_leaks = bpf_allow_ptr_leaks(); env->bypass_spec_v1 = bpf_bypass_spec_v1(); env->bypass_spec_v4 = bpf_bypass_spec_v4(); env->bpf_capable = bpf_capable(); The first three are currently equivalent to perfmon_capable(), since leaking kernel pointers and reading kernel memory via side channel attacks is roughly equivalent to reading kernel memory with cap_perfmon. 'bpf_capable' enables bounded loops, precision tracking, bpf to bpf calls and other verifier features. 'allow_ptr_leaks' enables ptr leaks, ptr conversions, subtraction of pointers. 'bypass_spec_v1' disables speculative analysis in the verifier and run-time mitigations in the bpf array map, and enables indirect variable access in bpf programs. 'bypass_spec_v4' disables emission of sanitization code by the verifier. That means that a networking BPF program loaded with CAP_BPF + CAP_NET_ADMIN will have speculative checks done by the verifier and other Spectre mitigations applied. Such a networking BPF program will not be able to leak kernel pointers and will not be able to access arbitrary kernel memory.
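As an illustration of what 'bpf_capable' changes at load time (a hypothetical program, not part of this patch; assumes libbpf's bpf_helpers.h for SEC()), the bounded loop below verifies when the loading task has CAP_BPF or CAP_SYS_ADMIN, while a fully unprivileged socket_filter load is still rejected at the loop's back-edge:

	#include <linux/bpf.h>
	#include <bpf/bpf_helpers.h>

	char LICENSE[] SEC("license") = "GPL";

	SEC("socket")
	int bounded_loop_example(struct __sk_buff *skb)
	{
		/* Loop bound depends on skb->len, so the verifier must
		 * reason about a genuine bounded loop here. */
		__u32 n = skb->len < 16 ? skb->len : 16;
		__u32 sum = 0, i;

		/* Bounded loop: accepted only when env->bpf_capable is set. */
		for (i = 0; i < n; i++)
			sum += i;

		return sum & 1;
	}

The same split shows up in the diffs below: checks that gate verifier power move from capable(CAP_SYS_ADMIN) to bpf_capable(), while the Spectre-related paths key off bypass_spec_v1/bypass_spec_v4 instead.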
Signed-off-by: Alexei Starovoitov Signed-off-by: Daniel Borkmann Link: https://lore.kernel.org/bpf/20200513230355.7858-3-alexei.starovoitov@gmail.com --- drivers/media/rc/bpf-lirc.c | 2 +- include/linux/bpf.h | 18 ++++++++- include/linux/bpf_verifier.h | 3 ++ kernel/bpf/arraymap.c | 10 ++--- kernel/bpf/bpf_struct_ops.c | 2 +- kernel/bpf/core.c | 2 +- kernel/bpf/cpumap.c | 2 +- kernel/bpf/hashtab.c | 4 +- kernel/bpf/helpers.c | 4 +- kernel/bpf/lpm_trie.c | 2 +- kernel/bpf/map_in_map.c | 2 +- kernel/bpf/queue_stack_maps.c | 2 +- kernel/bpf/reuseport_array.c | 2 +- kernel/bpf/stackmap.c | 2 +- kernel/bpf/syscall.c | 89 +++++++++++++++++++++++++++++++++---------- kernel/bpf/verifier.c | 37 +++++++++--------- kernel/trace/bpf_trace.c | 3 ++ net/core/bpf_sk_storage.c | 4 +- net/core/filter.c | 4 +- 19 files changed, 134 insertions(+), 60 deletions(-) (limited to 'include/linux') diff --git a/drivers/media/rc/bpf-lirc.c b/drivers/media/rc/bpf-lirc.c index 069c42f22a8c..5bb144435c16 100644 --- a/drivers/media/rc/bpf-lirc.c +++ b/drivers/media/rc/bpf-lirc.c @@ -110,7 +110,7 @@ lirc_mode2_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) case BPF_FUNC_get_prandom_u32: return &bpf_get_prandom_u32_proto; case BPF_FUNC_trace_printk: - if (capable(CAP_SYS_ADMIN)) + if (perfmon_capable()) return bpf_get_trace_printk_proto(); /* fall through */ default: diff --git a/include/linux/bpf.h b/include/linux/bpf.h index c45d198ac38c..efe8836b5c48 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -19,6 +19,7 @@ #include #include #include +#include struct bpf_verifier_env; struct bpf_verifier_log; @@ -119,7 +120,7 @@ struct bpf_map { struct bpf_map_memory memory; char name[BPF_OBJ_NAME_LEN]; u32 btf_vmlinux_value_type_id; - bool unpriv_array; + bool bypass_spec_v1; bool frozen; /* write-once; write-protected by freeze_mutex */ /* 22 bytes hole */ @@ -1095,6 +1096,21 @@ struct bpf_map *bpf_map_get_curr_or_next(u32 *id); extern int sysctl_unprivileged_bpf_disabled; +static inline bool bpf_allow_ptr_leaks(void) +{ + return perfmon_capable(); +} + +static inline bool bpf_bypass_spec_v1(void) +{ + return perfmon_capable(); +} + +static inline bool bpf_bypass_spec_v4(void) +{ + return perfmon_capable(); +} + int bpf_map_new_fd(struct bpf_map *map, int flags); int bpf_prog_new_fd(struct bpf_prog *prog); diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h index 6abd5a778fcd..ea833087e853 100644 --- a/include/linux/bpf_verifier.h +++ b/include/linux/bpf_verifier.h @@ -375,6 +375,9 @@ struct bpf_verifier_env { u32 used_map_cnt; /* number of used maps */ u32 id_gen; /* used to generate unique reg IDs */ bool allow_ptr_leaks; + bool bpf_capable; + bool bypass_spec_v1; + bool bypass_spec_v4; bool seen_direct_write; struct bpf_insn_aux_data *insn_aux_data; /* array of per-insn state */ const struct bpf_line_info *prev_linfo; diff --git a/kernel/bpf/arraymap.c b/kernel/bpf/arraymap.c index 95d77770353c..1d5bb0d983b2 100644 --- a/kernel/bpf/arraymap.c +++ b/kernel/bpf/arraymap.c @@ -77,7 +77,7 @@ static struct bpf_map *array_map_alloc(union bpf_attr *attr) bool percpu = attr->map_type == BPF_MAP_TYPE_PERCPU_ARRAY; int ret, numa_node = bpf_map_attr_numa_node(attr); u32 elem_size, index_mask, max_entries; - bool unpriv = !capable(CAP_SYS_ADMIN); + bool bypass_spec_v1 = bpf_bypass_spec_v1(); u64 cost, array_size, mask64; struct bpf_map_memory mem; struct bpf_array *array; @@ -95,7 +95,7 @@ static struct bpf_map *array_map_alloc(union bpf_attr *attr) mask64 -= 1; index_mask = mask64; - 
if (unpriv) { + if (!bypass_spec_v1) { /* round up array size to nearest power of 2, * since cpu will speculate within index_mask limits */ @@ -149,7 +149,7 @@ static struct bpf_map *array_map_alloc(union bpf_attr *attr) return ERR_PTR(-ENOMEM); } array->index_mask = index_mask; - array->map.unpriv_array = unpriv; + array->map.bypass_spec_v1 = bypass_spec_v1; /* copy mandatory map attributes */ bpf_map_init_from_attr(&array->map, attr); @@ -219,7 +219,7 @@ static u32 array_map_gen_lookup(struct bpf_map *map, struct bpf_insn *insn_buf) *insn++ = BPF_ALU64_IMM(BPF_ADD, map_ptr, offsetof(struct bpf_array, value)); *insn++ = BPF_LDX_MEM(BPF_W, ret, index, 0); - if (map->unpriv_array) { + if (!map->bypass_spec_v1) { *insn++ = BPF_JMP_IMM(BPF_JGE, ret, map->max_entries, 4); *insn++ = BPF_ALU32_IMM(BPF_AND, ret, array->index_mask); } else { @@ -1053,7 +1053,7 @@ static u32 array_of_map_gen_lookup(struct bpf_map *map, *insn++ = BPF_ALU64_IMM(BPF_ADD, map_ptr, offsetof(struct bpf_array, value)); *insn++ = BPF_LDX_MEM(BPF_W, ret, index, 0); - if (map->unpriv_array) { + if (!map->bypass_spec_v1) { *insn++ = BPF_JMP_IMM(BPF_JGE, ret, map->max_entries, 6); *insn++ = BPF_ALU32_IMM(BPF_AND, ret, array->index_mask); } else { diff --git a/kernel/bpf/bpf_struct_ops.c b/kernel/bpf/bpf_struct_ops.c index 26cb51f2db72..c6b0decaa46a 100644 --- a/kernel/bpf/bpf_struct_ops.c +++ b/kernel/bpf/bpf_struct_ops.c @@ -557,7 +557,7 @@ static struct bpf_map *bpf_struct_ops_map_alloc(union bpf_attr *attr) struct bpf_map *map; int err; - if (!capable(CAP_SYS_ADMIN)) + if (!bpf_capable()) return ERR_PTR(-EPERM); st_ops = bpf_struct_ops_find_value(attr->btf_vmlinux_value_type_id); diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c index 6aa11de67315..c40ff4cf9880 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c @@ -646,7 +646,7 @@ static bool bpf_prog_kallsyms_verify_off(const struct bpf_prog *fp) void bpf_prog_kallsyms_add(struct bpf_prog *fp) { if (!bpf_prog_kallsyms_candidate(fp) || - !capable(CAP_SYS_ADMIN)) + !bpf_capable()) return; bpf_prog_ksym_set_addr(fp); diff --git a/kernel/bpf/cpumap.c b/kernel/bpf/cpumap.c index a71790dab12d..8b85bfddfac7 100644 --- a/kernel/bpf/cpumap.c +++ b/kernel/bpf/cpumap.c @@ -85,7 +85,7 @@ static struct bpf_map *cpu_map_alloc(union bpf_attr *attr) u64 cost; int ret; - if (!capable(CAP_SYS_ADMIN)) + if (!bpf_capable()) return ERR_PTR(-EPERM); /* check sanity of attributes */ diff --git a/kernel/bpf/hashtab.c b/kernel/bpf/hashtab.c index d541c8486c95..b4b288a3c3c9 100644 --- a/kernel/bpf/hashtab.c +++ b/kernel/bpf/hashtab.c @@ -359,9 +359,9 @@ static int htab_map_alloc_check(union bpf_attr *attr) BUILD_BUG_ON(offsetof(struct htab_elem, fnode.next) != offsetof(struct htab_elem, hash_node.pprev)); - if (lru && !capable(CAP_SYS_ADMIN)) + if (lru && !bpf_capable()) /* LRU implementation is much complicated than other - * maps. Hence, limit to CAP_SYS_ADMIN for now. + * maps. Hence, limit to CAP_BPF. 
*/ return -EPERM; diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c index 5c0290e0696e..886949fdcece 100644 --- a/kernel/bpf/helpers.c +++ b/kernel/bpf/helpers.c @@ -633,7 +633,7 @@ bpf_base_func_proto(enum bpf_func_id func_id) break; } - if (!capable(CAP_SYS_ADMIN)) + if (!bpf_capable()) return NULL; switch (func_id) { @@ -642,6 +642,8 @@ bpf_base_func_proto(enum bpf_func_id func_id) case BPF_FUNC_spin_unlock: return &bpf_spin_unlock_proto; case BPF_FUNC_trace_printk: + if (!perfmon_capable()) + return NULL; return bpf_get_trace_printk_proto(); case BPF_FUNC_jiffies64: return &bpf_jiffies64_proto; diff --git a/kernel/bpf/lpm_trie.c b/kernel/bpf/lpm_trie.c index 65c236cf341e..c8cc4e4cf98d 100644 --- a/kernel/bpf/lpm_trie.c +++ b/kernel/bpf/lpm_trie.c @@ -543,7 +543,7 @@ static struct bpf_map *trie_alloc(union bpf_attr *attr) u64 cost = sizeof(*trie), cost_per_node; int ret; - if (!capable(CAP_SYS_ADMIN)) + if (!bpf_capable()) return ERR_PTR(-EPERM); /* check sanity of attributes */ diff --git a/kernel/bpf/map_in_map.c b/kernel/bpf/map_in_map.c index b3c48d1533cb..17738c93bec8 100644 --- a/kernel/bpf/map_in_map.c +++ b/kernel/bpf/map_in_map.c @@ -60,7 +60,7 @@ struct bpf_map *bpf_map_meta_alloc(int inner_map_ufd) /* Misc members not needed in bpf_map_meta_equal() check. */ inner_map_meta->ops = inner_map->ops; if (inner_map->ops == &array_map_ops) { - inner_map_meta->unpriv_array = inner_map->unpriv_array; + inner_map_meta->bypass_spec_v1 = inner_map->bypass_spec_v1; container_of(inner_map_meta, struct bpf_array, map)->index_mask = container_of(inner_map, struct bpf_array, map)->index_mask; } diff --git a/kernel/bpf/queue_stack_maps.c b/kernel/bpf/queue_stack_maps.c index 30e1373fd437..05c8e043b9d2 100644 --- a/kernel/bpf/queue_stack_maps.c +++ b/kernel/bpf/queue_stack_maps.c @@ -45,7 +45,7 @@ static bool queue_stack_map_is_full(struct bpf_queue_stack *qs) /* Called from syscall */ static int queue_stack_map_alloc_check(union bpf_attr *attr) { - if (!capable(CAP_SYS_ADMIN)) + if (!bpf_capable()) return -EPERM; /* check sanity of attributes */ diff --git a/kernel/bpf/reuseport_array.c b/kernel/bpf/reuseport_array.c index 01badd3eda7a..21cde24386db 100644 --- a/kernel/bpf/reuseport_array.c +++ b/kernel/bpf/reuseport_array.c @@ -154,7 +154,7 @@ static struct bpf_map *reuseport_array_alloc(union bpf_attr *attr) struct bpf_map_memory mem; u64 array_size; - if (!capable(CAP_SYS_ADMIN)) + if (!bpf_capable()) return ERR_PTR(-EPERM); array_size = sizeof(*array); diff --git a/kernel/bpf/stackmap.c b/kernel/bpf/stackmap.c index db76339fe358..7b8381ce40a0 100644 --- a/kernel/bpf/stackmap.c +++ b/kernel/bpf/stackmap.c @@ -93,7 +93,7 @@ static struct bpf_map *stack_map_alloc(union bpf_attr *attr) u64 cost, n_buckets; int err; - if (!capable(CAP_SYS_ADMIN)) + if (!bpf_capable()) return ERR_PTR(-EPERM); if (attr->map_flags & ~STACK_CREATE_FLAG_MASK) diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index de2a75500233..79bcd8d056d2 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -1534,7 +1534,7 @@ static int map_freeze(const union bpf_attr *attr) err = -EBUSY; goto err_put; } - if (!capable(CAP_SYS_ADMIN)) { + if (!bpf_capable()) { err = -EPERM; goto err_put; } @@ -2009,6 +2009,55 @@ bpf_prog_load_check_attach(enum bpf_prog_type prog_type, } } +static bool is_net_admin_prog_type(enum bpf_prog_type prog_type) +{ + switch (prog_type) { + case BPF_PROG_TYPE_SCHED_CLS: + case BPF_PROG_TYPE_SCHED_ACT: + case BPF_PROG_TYPE_XDP: + case BPF_PROG_TYPE_LWT_IN: + case 
BPF_PROG_TYPE_LWT_OUT: + case BPF_PROG_TYPE_LWT_XMIT: + case BPF_PROG_TYPE_LWT_SEG6LOCAL: + case BPF_PROG_TYPE_SK_SKB: + case BPF_PROG_TYPE_SK_MSG: + case BPF_PROG_TYPE_LIRC_MODE2: + case BPF_PROG_TYPE_FLOW_DISSECTOR: + case BPF_PROG_TYPE_CGROUP_DEVICE: + case BPF_PROG_TYPE_CGROUP_SOCK: + case BPF_PROG_TYPE_CGROUP_SOCK_ADDR: + case BPF_PROG_TYPE_CGROUP_SOCKOPT: + case BPF_PROG_TYPE_CGROUP_SYSCTL: + case BPF_PROG_TYPE_SOCK_OPS: + case BPF_PROG_TYPE_EXT: /* extends any prog */ + return true; + case BPF_PROG_TYPE_CGROUP_SKB: + /* always unpriv */ + case BPF_PROG_TYPE_SK_REUSEPORT: + /* equivalent to SOCKET_FILTER. need CAP_BPF only */ + default: + return false; + } +} + +static bool is_perfmon_prog_type(enum bpf_prog_type prog_type) +{ + switch (prog_type) { + case BPF_PROG_TYPE_KPROBE: + case BPF_PROG_TYPE_TRACEPOINT: + case BPF_PROG_TYPE_PERF_EVENT: + case BPF_PROG_TYPE_RAW_TRACEPOINT: + case BPF_PROG_TYPE_RAW_TRACEPOINT_WRITABLE: + case BPF_PROG_TYPE_TRACING: + case BPF_PROG_TYPE_LSM: + case BPF_PROG_TYPE_STRUCT_OPS: /* has access to struct sock */ + case BPF_PROG_TYPE_EXT: /* extends any prog */ + return true; + default: + return false; + } +} + /* last field in 'union bpf_attr' used by this command */ #define BPF_PROG_LOAD_LAST_FIELD attach_prog_fd @@ -2031,7 +2080,7 @@ static int bpf_prog_load(union bpf_attr *attr, union bpf_attr __user *uattr) if (!IS_ENABLED(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS) && (attr->prog_flags & BPF_F_ANY_ALIGNMENT) && - !capable(CAP_SYS_ADMIN)) + !bpf_capable()) return -EPERM; /* copy eBPF program license from user space */ @@ -2044,11 +2093,16 @@ static int bpf_prog_load(union bpf_attr *attr, union bpf_attr __user *uattr) is_gpl = license_is_gpl_compatible(license); if (attr->insn_cnt == 0 || - attr->insn_cnt > (capable(CAP_SYS_ADMIN) ? BPF_COMPLEXITY_LIMIT_INSNS : BPF_MAXINSNS)) + attr->insn_cnt > (bpf_capable() ? BPF_COMPLEXITY_LIMIT_INSNS : BPF_MAXINSNS)) return -E2BIG; if (type != BPF_PROG_TYPE_SOCKET_FILTER && type != BPF_PROG_TYPE_CGROUP_SKB && - !capable(CAP_SYS_ADMIN)) + !bpf_capable()) + return -EPERM; + + if (is_net_admin_prog_type(type) && !capable(CAP_NET_ADMIN)) + return -EPERM; + if (is_perfmon_prog_type(type) && !perfmon_capable()) return -EPERM; bpf_prog_load_fixup_attach_type(attr); @@ -2682,6 +2736,11 @@ static int bpf_prog_attach_check_attach_type(const struct bpf_prog *prog, case BPF_PROG_TYPE_CGROUP_SOCKOPT: return attach_type == prog->expected_attach_type ? 0 : -EINVAL; case BPF_PROG_TYPE_CGROUP_SKB: + if (!capable(CAP_NET_ADMIN)) + /* cg-skb progs can be loaded by unpriv user. + * check permissions at attach time. + */ + return -EPERM; return prog->enforce_expected_attach_type && prog->expected_attach_type != attach_type ? 
-EINVAL : 0; @@ -2747,9 +2806,6 @@ static int bpf_prog_attach(const union bpf_attr *attr) struct bpf_prog *prog; int ret; - if (!capable(CAP_NET_ADMIN)) - return -EPERM; - if (CHECK_ATTR(BPF_PROG_ATTACH)) return -EINVAL; @@ -2804,9 +2860,6 @@ static int bpf_prog_detach(const union bpf_attr *attr) { enum bpf_prog_type ptype; - if (!capable(CAP_NET_ADMIN)) - return -EPERM; - if (CHECK_ATTR(BPF_PROG_DETACH)) return -EINVAL; @@ -2819,6 +2872,8 @@ static int bpf_prog_detach(const union bpf_attr *attr) case BPF_PROG_TYPE_LIRC_MODE2: return lirc_prog_detach(attr); case BPF_PROG_TYPE_FLOW_DISSECTOR: + if (!capable(CAP_NET_ADMIN)) + return -EPERM; return skb_flow_dissector_bpf_prog_detach(attr); case BPF_PROG_TYPE_CGROUP_DEVICE: case BPF_PROG_TYPE_CGROUP_SKB: @@ -2882,8 +2937,6 @@ static int bpf_prog_test_run(const union bpf_attr *attr, struct bpf_prog *prog; int ret = -ENOTSUPP; - if (!capable(CAP_SYS_ADMIN)) - return -EPERM; if (CHECK_ATTR(BPF_PROG_TEST_RUN)) return -EINVAL; @@ -3184,7 +3237,7 @@ static int bpf_prog_get_info_by_fd(struct bpf_prog *prog, info.run_time_ns = stats.nsecs; info.run_cnt = stats.cnt; - if (!capable(CAP_SYS_ADMIN)) { + if (!bpf_capable()) { info.jited_prog_len = 0; info.xlated_prog_len = 0; info.nr_jited_ksyms = 0; @@ -3543,7 +3596,7 @@ static int bpf_btf_load(const union bpf_attr *attr) if (CHECK_ATTR(BPF_BTF_LOAD)) return -EINVAL; - if (!capable(CAP_SYS_ADMIN)) + if (!bpf_capable()) return -EPERM; return btf_new_fd(attr); @@ -3766,9 +3819,6 @@ static int link_create(union bpf_attr *attr) struct bpf_prog *prog; int ret; - if (!capable(CAP_NET_ADMIN)) - return -EPERM; - if (CHECK_ATTR(BPF_LINK_CREATE)) return -EINVAL; @@ -3817,9 +3867,6 @@ static int link_update(union bpf_attr *attr) u32 flags; int ret; - if (!capable(CAP_NET_ADMIN)) - return -EPERM; - if (CHECK_ATTR(BPF_LINK_UPDATE)) return -EINVAL; @@ -3988,7 +4035,7 @@ SYSCALL_DEFINE3(bpf, int, cmd, union bpf_attr __user *, uattr, unsigned int, siz union bpf_attr attr; int err; - if (sysctl_unprivileged_bpf_disabled && !capable(CAP_SYS_ADMIN)) + if (sysctl_unprivileged_bpf_disabled && !bpf_capable()) return -EPERM; err = bpf_check_uarg_tail_zero(uattr, sizeof(attr), size); diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index a3f2af756fd6..180933f6fba9 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -1295,7 +1295,7 @@ static void __mark_reg_unknown(const struct bpf_verifier_env *env, reg->type = SCALAR_VALUE; reg->var_off = tnum_unknown; reg->frameno = 0; - reg->precise = env->subprog_cnt > 1 || !env->allow_ptr_leaks; + reg->precise = env->subprog_cnt > 1 || !env->bpf_capable; __mark_reg_unbounded(reg); } @@ -1427,8 +1427,9 @@ static int check_subprogs(struct bpf_verifier_env *env) continue; if (insn[i].src_reg != BPF_PSEUDO_CALL) continue; - if (!env->allow_ptr_leaks) { - verbose(env, "function calls to other bpf functions are allowed for root only\n"); + if (!env->bpf_capable) { + verbose(env, + "function calls to other bpf functions are allowed for CAP_BPF and CAP_SYS_ADMIN\n"); return -EPERM; } ret = add_subprog(env, i + insn[i].imm + 1); @@ -1962,8 +1963,7 @@ static int __mark_chain_precision(struct bpf_verifier_env *env, int regno, bool new_marks = false; int i, err; - if (!env->allow_ptr_leaks) - /* backtracking is root only for now */ + if (!env->bpf_capable) return 0; func = st->frame[st->curframe]; @@ -2211,7 +2211,7 @@ static int check_stack_write(struct bpf_verifier_env *env, reg = &cur->regs[value_regno]; if (reg && size == BPF_REG_SIZE && register_is_const(reg) && - 
!register_is_null(reg) && env->allow_ptr_leaks) { + !register_is_null(reg) && env->bpf_capable) { if (dst_reg != BPF_REG_FP) { /* The backtracking logic can only recognize explicit * stack slot address like [fp - 8]. Other spill of @@ -2237,7 +2237,7 @@ static int check_stack_write(struct bpf_verifier_env *env, return -EINVAL; } - if (!env->allow_ptr_leaks) { + if (!env->bypass_spec_v4) { bool sanitize = false; if (state->stack[spi].slot_type[0] == STACK_SPILL && @@ -3432,7 +3432,7 @@ static int check_stack_boundary(struct bpf_verifier_env *env, int regno, * Spectre masking for stack ALU. * See also retrieve_ptr_limit(). */ - if (!env->allow_ptr_leaks) { + if (!env->bypass_spec_v1) { char tn_buf[48]; tnum_strn(tn_buf, sizeof(tn_buf), reg->var_off); @@ -4435,10 +4435,10 @@ record_func_map(struct bpf_verifier_env *env, struct bpf_call_arg_meta *meta, if (!BPF_MAP_PTR(aux->map_ptr_state)) bpf_map_ptr_store(aux, meta->map_ptr, - meta->map_ptr->unpriv_array); + !meta->map_ptr->bypass_spec_v1); else if (BPF_MAP_PTR(aux->map_ptr_state) != meta->map_ptr) bpf_map_ptr_store(aux, BPF_MAP_PTR_POISON, - meta->map_ptr->unpriv_array); + !meta->map_ptr->bypass_spec_v1); return 0; } @@ -4807,7 +4807,7 @@ static int retrieve_ptr_limit(const struct bpf_reg_state *ptr_reg, static bool can_skip_alu_sanitation(const struct bpf_verifier_env *env, const struct bpf_insn *insn) { - return env->allow_ptr_leaks || BPF_SRC(insn->code) == BPF_K; + return env->bypass_spec_v1 || BPF_SRC(insn->code) == BPF_K; } static int update_alu_sanitation_state(struct bpf_insn_aux_data *aux, @@ -5117,7 +5117,7 @@ static int adjust_ptr_min_max_vals(struct bpf_verifier_env *env, /* For unprivileged we require that resulting offset must be in bounds * in order to be able to sanitize access later on. 
*/ - if (!env->allow_ptr_leaks) { + if (!env->bypass_spec_v1) { if (dst_reg->type == PTR_TO_MAP_VALUE && check_map_access(env, dst, dst_reg->off, 1, false)) { verbose(env, "R%d pointer arithmetic of map value goes out of range, " @@ -7244,7 +7244,7 @@ static int push_insn(int t, int w, int e, struct bpf_verifier_env *env, insn_stack[env->cfg.cur_stack++] = w; return 1; } else if ((insn_state[w] & 0xF0) == DISCOVERED) { - if (loop_ok && env->allow_ptr_leaks) + if (loop_ok && env->bpf_capable) return 0; verbose_linfo(env, t, "%d: ", t); verbose_linfo(env, w, "%d: ", w); @@ -8353,7 +8353,7 @@ next: if (env->max_states_per_insn < states_cnt) env->max_states_per_insn = states_cnt; - if (!env->allow_ptr_leaks && states_cnt > BPF_COMPLEXITY_LIMIT_STATES) + if (!env->bpf_capable && states_cnt > BPF_COMPLEXITY_LIMIT_STATES) return push_jmp_history(env, cur); if (!add_new_state) @@ -10014,7 +10014,7 @@ static int fixup_bpf_calls(struct bpf_verifier_env *env) insn->code = BPF_JMP | BPF_TAIL_CALL; aux = &env->insn_aux_data[i + delta]; - if (env->allow_ptr_leaks && !expect_blinding && + if (env->bpf_capable && !expect_blinding && prog->jit_requested && !bpf_map_key_poisoned(aux) && !bpf_map_ptr_poisoned(aux) && @@ -10758,7 +10758,7 @@ int bpf_check(struct bpf_prog **prog, union bpf_attr *attr, env->insn_aux_data[i].orig_idx = i; env->prog = *prog; env->ops = bpf_verifier_ops[env->prog->type]; - is_priv = capable(CAP_SYS_ADMIN); + is_priv = bpf_capable(); if (!btf_vmlinux && IS_ENABLED(CONFIG_DEBUG_INFO_BTF)) { mutex_lock(&bpf_verifier_lock); @@ -10799,7 +10799,10 @@ int bpf_check(struct bpf_prog **prog, union bpf_attr *attr, if (attr->prog_flags & BPF_F_ANY_ALIGNMENT) env->strict_alignment = false; - env->allow_ptr_leaks = is_priv; + env->allow_ptr_leaks = bpf_allow_ptr_leaks(); + env->bypass_spec_v1 = bpf_bypass_spec_v1(); + env->bypass_spec_v4 = bpf_bypass_spec_v4(); + env->bpf_capable = bpf_capable(); if (is_priv) env->test_state_freq = attr->prog_flags & BPF_F_TEST_STATE_FREQ; diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c index d961428fb5b6..9a84d7fb4869 100644 --- a/kernel/trace/bpf_trace.c +++ b/kernel/trace/bpf_trace.c @@ -315,6 +315,9 @@ static const struct bpf_func_proto bpf_probe_write_user_proto = { static const struct bpf_func_proto *bpf_get_probe_write_proto(void) { + if (!capable(CAP_SYS_ADMIN)) + return NULL; + pr_warn_ratelimited("%s[%d] is installing a program with bpf_probe_write_user helper that may corrupt user memory!", current->comm, task_pid_nr(current)); diff --git a/net/core/bpf_sk_storage.c b/net/core/bpf_sk_storage.c index 756b63b6f7b3..d2c4d16dadba 100644 --- a/net/core/bpf_sk_storage.c +++ b/net/core/bpf_sk_storage.c @@ -625,7 +625,7 @@ static int bpf_sk_storage_map_alloc_check(union bpf_attr *attr) !attr->btf_key_type_id || !attr->btf_value_type_id) return -EINVAL; - if (!capable(CAP_SYS_ADMIN)) + if (!bpf_capable()) return -EPERM; if (attr->value_size > MAX_VALUE_SIZE) @@ -978,7 +978,7 @@ bpf_sk_storage_diag_alloc(const struct nlattr *nla_stgs) /* bpf_sk_storage_map is currently limited to CAP_SYS_ADMIN as * the map_alloc_check() side also does. 
*/ - if (!capable(CAP_SYS_ADMIN)) + if (!bpf_capable()) return ERR_PTR(-EPERM); nla_for_each_nested(nla, nla_stgs, rem) { diff --git a/net/core/filter.c b/net/core/filter.c index a85eb538d4d6..f8a3c7e9d027 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -6687,7 +6687,7 @@ static bool cg_skb_is_valid_access(int off, int size, return false; case bpf_ctx_range(struct __sk_buff, data): case bpf_ctx_range(struct __sk_buff, data_end): - if (!capable(CAP_SYS_ADMIN)) + if (!bpf_capable()) return false; break; } @@ -6699,7 +6699,7 @@ static bool cg_skb_is_valid_access(int off, int size, case bpf_ctx_range_till(struct __sk_buff, cb[0], cb[4]): break; case bpf_ctx_range(struct __sk_buff, tstamp): - if (!capable(CAP_SYS_ADMIN)) + if (!bpf_capable()) return false; break; default: -- cgit v1.2.3 From d08b9f0ca6605e13dcb48f04e55a30545b3c71eb Mon Sep 17 00:00:00 2001 From: Sami Tolvanen Date: Mon, 27 Apr 2020 09:00:07 -0700 Subject: scs: Add support for Clang's Shadow Call Stack (SCS) This change adds generic support for Clang's Shadow Call Stack, which uses a shadow stack to protect return addresses from being overwritten by an attacker. Details are available here: https://clang.llvm.org/docs/ShadowCallStack.html Note that security guarantees in the kernel differ from the ones documented for user space. The kernel must store addresses of shadow stacks in memory, which means an attacker capable reading and writing arbitrary memory may be able to locate them and hijack control flow by modifying the stacks. Signed-off-by: Sami Tolvanen Reviewed-by: Kees Cook Reviewed-by: Miguel Ojeda [will: Numerous cosmetic changes] Signed-off-by: Will Deacon --- Makefile | 6 ++++ arch/Kconfig | 24 +++++++++++++++ include/linux/compiler-clang.h | 4 +++ include/linux/compiler_types.h | 4 +++ include/linux/scs.h | 68 ++++++++++++++++++++++++++++++++++++++++++ init/init_task.c | 8 +++++ kernel/Makefile | 1 + kernel/fork.c | 9 ++++++ kernel/sched/core.c | 2 ++ kernel/scs.c | 65 ++++++++++++++++++++++++++++++++++++++++ 10 files changed, 191 insertions(+) create mode 100644 include/linux/scs.h create mode 100644 kernel/scs.c (limited to 'include/linux') diff --git a/Makefile b/Makefile index 679f302a8b8b..33dc0d0cdd08 100644 --- a/Makefile +++ b/Makefile @@ -866,6 +866,12 @@ ifdef CONFIG_LIVEPATCH KBUILD_CFLAGS += $(call cc-option, -flive-patching=inline-clone) endif +ifdef CONFIG_SHADOW_CALL_STACK +CC_FLAGS_SCS := -fsanitize=shadow-call-stack +KBUILD_CFLAGS += $(CC_FLAGS_SCS) +export CC_FLAGS_SCS +endif + # arch Makefile may override CC so keep this after arch Makefile is included NOSTDINC_FLAGS += -nostdinc -isystem $(shell $(CC) -print-file-name=include) diff --git a/arch/Kconfig b/arch/Kconfig index 786a85d4ad40..334a3d9b19df 100644 --- a/arch/Kconfig +++ b/arch/Kconfig @@ -533,6 +533,30 @@ config STACKPROTECTOR_STRONG about 20% of all kernel functions, which increases the kernel code size by about 2%. +config ARCH_SUPPORTS_SHADOW_CALL_STACK + bool + help + An architecture should select this if it supports Clang's Shadow + Call Stack, has asm/scs.h, and implements runtime support for shadow + stack switching. + +config SHADOW_CALL_STACK + bool "Clang Shadow Call Stack" + depends on CC_IS_CLANG && ARCH_SUPPORTS_SHADOW_CALL_STACK + help + This option enables Clang's Shadow Call Stack, which uses a + shadow stack to protect function return addresses from being + overwritten by an attacker. 
More information can be found in + Clang's documentation: + + https://clang.llvm.org/docs/ShadowCallStack.html + + Note that security guarantees in the kernel differ from the + ones documented for user space. The kernel must store addresses + of shadow stacks in memory, which means an attacker capable of + reading and writing arbitrary memory may be able to locate them + and hijack control flow by modifying the stacks. + config HAVE_ARCH_WITHIN_STACK_FRAMES bool help diff --git a/include/linux/compiler-clang.h b/include/linux/compiler-clang.h index 333a6695a918..790c0c6b8552 100644 --- a/include/linux/compiler-clang.h +++ b/include/linux/compiler-clang.h @@ -42,3 +42,7 @@ * compilers, like ICC. */ #define barrier() __asm__ __volatile__("" : : : "memory") + +#if __has_feature(shadow_call_stack) +# define __noscs __attribute__((__no_sanitize__("shadow-call-stack"))) +#endif diff --git a/include/linux/compiler_types.h b/include/linux/compiler_types.h index e970f97a7fcb..97b62f47a80d 100644 --- a/include/linux/compiler_types.h +++ b/include/linux/compiler_types.h @@ -193,6 +193,10 @@ struct ftrace_likely_data { # define randomized_struct_fields_end #endif +#ifndef __noscs +# define __noscs +#endif + #ifndef asm_volatile_goto #define asm_volatile_goto(x...) asm goto(x) #endif diff --git a/include/linux/scs.h b/include/linux/scs.h new file mode 100644 index 000000000000..3f3662621a27 --- /dev/null +++ b/include/linux/scs.h @@ -0,0 +1,68 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Shadow Call Stack support. + * + * Copyright (C) 2019 Google LLC + */ + +#ifndef _LINUX_SCS_H +#define _LINUX_SCS_H + +#include +#include +#include +#include + +#ifdef CONFIG_SHADOW_CALL_STACK + +/* + * In testing, 1 KiB shadow stack size (i.e. 128 stack frames on a 64-bit + * architecture) provided ~40% safety margin on stack usage while keeping + * memory allocation overhead reasonable. + */ +#define SCS_SIZE SZ_1K +#define GFP_SCS (GFP_KERNEL | __GFP_ZERO) + +/* An illegal pointer value to mark the end of the shadow stack. */ +#define SCS_END_MAGIC (0x5f6UL + POISON_POINTER_DELTA) + +#define task_scs(tsk) (task_thread_info(tsk)->scs_base) +#define task_scs_offset(tsk) (task_thread_info(tsk)->scs_offset) + +void scs_init(void); +int scs_prepare(struct task_struct *tsk, int node); +void scs_release(struct task_struct *tsk); + +static inline void scs_task_reset(struct task_struct *tsk) +{ + /* + * Reset the shadow stack to the base address in case the task + * is reused. 
+ */ + task_scs_offset(tsk) = 0; +} + +static inline unsigned long *__scs_magic(void *s) +{ + return (unsigned long *)(s + SCS_SIZE) - 1; +} + +static inline bool scs_corrupted(struct task_struct *tsk) +{ + unsigned long *magic = __scs_magic(task_scs(tsk)); + + return (task_scs_offset(tsk) >= SCS_SIZE - 1 || + READ_ONCE_NOCHECK(*magic) != SCS_END_MAGIC); +} + +#else /* CONFIG_SHADOW_CALL_STACK */ + +static inline void scs_init(void) {} +static inline void scs_task_reset(struct task_struct *tsk) {} +static inline int scs_prepare(struct task_struct *tsk, int node) { return 0; } +static inline bool scs_corrupted(struct task_struct *tsk) { return false; } +static inline void scs_release(struct task_struct *tsk) {} + +#endif /* CONFIG_SHADOW_CALL_STACK */ + +#endif /* _LINUX_SCS_H */ diff --git a/init/init_task.c b/init/init_task.c index bd403ed3e418..169e34066d35 100644 --- a/init/init_task.c +++ b/init/init_task.c @@ -11,6 +11,7 @@ #include #include #include +#include #include #include @@ -50,6 +51,13 @@ static struct sighand_struct init_sighand = { .signalfd_wqh = __WAIT_QUEUE_HEAD_INITIALIZER(init_sighand.signalfd_wqh), }; +#ifdef CONFIG_SHADOW_CALL_STACK +unsigned long init_shadow_call_stack[SCS_SIZE / sizeof(long)] + __init_task_data = { + [(SCS_SIZE / sizeof(long)) - 1] = SCS_END_MAGIC +}; +#endif + /* * Set up the first task table, touch at your own risk!. Base=0, * limit=0x1fffff (=2MB) diff --git a/kernel/Makefile b/kernel/Makefile index 4cb4130ced32..c332eb9d4841 100644 --- a/kernel/Makefile +++ b/kernel/Makefile @@ -103,6 +103,7 @@ obj-$(CONFIG_TRACEPOINTS) += trace/ obj-$(CONFIG_IRQ_WORK) += irq_work.o obj-$(CONFIG_CPU_PM) += cpu_pm.o obj-$(CONFIG_BPF) += bpf/ +obj-$(CONFIG_SHADOW_CALL_STACK) += scs.o obj-$(CONFIG_PERF_EVENTS) += events/ diff --git a/kernel/fork.c b/kernel/fork.c index 8c700f881d92..f6339f9d232d 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -94,6 +94,7 @@ #include #include #include +#include #include #include @@ -456,6 +457,8 @@ void put_task_stack(struct task_struct *tsk) void free_task(struct task_struct *tsk) { + scs_release(tsk); + #ifndef CONFIG_THREAD_INFO_IN_TASK /* * The task is finally done with both the stack and thread_info, @@ -840,6 +843,8 @@ void __init fork_init(void) NULL, free_vm_stack_cache); #endif + scs_init(); + lockdep_init_task(&init_task); uprobes_init(); } @@ -899,6 +904,10 @@ static struct task_struct *dup_task_struct(struct task_struct *orig, int node) if (err) goto free_stack; + err = scs_prepare(tsk, node); + if (err) + goto free_stack; + #ifdef CONFIG_SECCOMP /* * We must handle setting up seccomp filters once we're under diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 9a2fbf98fd6f..934e03cfaec7 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -11,6 +11,7 @@ #include #include +#include #include #include @@ -6040,6 +6041,7 @@ void init_idle(struct task_struct *idle, int cpu) idle->se.exec_start = sched_clock(); idle->flags |= PF_IDLE; + scs_task_reset(idle); kasan_unpoison_task_stack(idle); #ifdef CONFIG_SMP diff --git a/kernel/scs.c b/kernel/scs.c new file mode 100644 index 000000000000..38f8f31c9451 --- /dev/null +++ b/kernel/scs.c @@ -0,0 +1,65 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Shadow Call Stack support. 
+ * + * Copyright (C) 2019 Google LLC + */ + +#include +#include +#include +#include + +static struct kmem_cache *scs_cache; + +static void *scs_alloc(int node) +{ + void *s; + + s = kmem_cache_alloc_node(scs_cache, GFP_SCS, node); + if (s) { + *__scs_magic(s) = SCS_END_MAGIC; + /* + * Poison the allocation to catch unintentional accesses to + * the shadow stack when KASAN is enabled. + */ + kasan_poison_object_data(scs_cache, s); + } + + return s; +} + +static void scs_free(void *s) +{ + kasan_unpoison_object_data(scs_cache, s); + kmem_cache_free(scs_cache, s); +} + +void __init scs_init(void) +{ + scs_cache = kmem_cache_create("scs_cache", SCS_SIZE, 0, 0, NULL); +} + +int scs_prepare(struct task_struct *tsk, int node) +{ + void *s = scs_alloc(node); + + if (!s) + return -ENOMEM; + + task_scs(tsk) = s; + task_scs_offset(tsk) = 0; + + return 0; +} + +void scs_release(struct task_struct *tsk) +{ + void *s = task_scs(tsk); + + if (!s) + return; + + WARN(scs_corrupted(tsk), "corrupted shadow stack detected when freeing task\n"); + scs_free(s); +} -- cgit v1.2.3 From 628d06a48f57c36abdc2a024930212e654a501b7 Mon Sep 17 00:00:00 2001 From: Sami Tolvanen Date: Mon, 27 Apr 2020 09:00:08 -0700 Subject: scs: Add page accounting for shadow call stack allocations This change adds accounting for the memory allocated for shadow stacks. Signed-off-by: Sami Tolvanen Reviewed-by: Kees Cook Acked-by: Will Deacon Signed-off-by: Will Deacon --- drivers/base/node.c | 6 ++++++ fs/proc/meminfo.c | 4 ++++ include/linux/mmzone.h | 3 +++ kernel/scs.c | 15 +++++++++++++++ mm/page_alloc.c | 6 ++++++ mm/vmstat.c | 3 +++ 6 files changed, 37 insertions(+) (limited to 'include/linux') diff --git a/drivers/base/node.c b/drivers/base/node.c index 10d7e818e118..50b8c0d43859 100644 --- a/drivers/base/node.c +++ b/drivers/base/node.c @@ -415,6 +415,9 @@ static ssize_t node_read_meminfo(struct device *dev, "Node %d AnonPages: %8lu kB\n" "Node %d Shmem: %8lu kB\n" "Node %d KernelStack: %8lu kB\n" +#ifdef CONFIG_SHADOW_CALL_STACK + "Node %d ShadowCallStack:%8lu kB\n" +#endif "Node %d PageTables: %8lu kB\n" "Node %d NFS_Unstable: %8lu kB\n" "Node %d Bounce: %8lu kB\n" @@ -438,6 +441,9 @@ static ssize_t node_read_meminfo(struct device *dev, nid, K(node_page_state(pgdat, NR_ANON_MAPPED)), nid, K(i.sharedram), nid, sum_zone_node_page_state(nid, NR_KERNEL_STACK_KB), +#ifdef CONFIG_SHADOW_CALL_STACK + nid, sum_zone_node_page_state(nid, NR_KERNEL_SCS_KB), +#endif nid, K(sum_zone_node_page_state(nid, NR_PAGETABLE)), nid, K(node_page_state(pgdat, NR_UNSTABLE_NFS)), nid, K(sum_zone_node_page_state(nid, NR_BOUNCE)), diff --git a/fs/proc/meminfo.c b/fs/proc/meminfo.c index 8c1f1bb1a5ce..09cd51c8d23d 100644 --- a/fs/proc/meminfo.c +++ b/fs/proc/meminfo.c @@ -103,6 +103,10 @@ static int meminfo_proc_show(struct seq_file *m, void *v) show_val_kb(m, "SUnreclaim: ", sunreclaim); seq_printf(m, "KernelStack: %8lu kB\n", global_zone_page_state(NR_KERNEL_STACK_KB)); +#ifdef CONFIG_SHADOW_CALL_STACK + seq_printf(m, "ShadowCallStack:%8lu kB\n", + global_zone_page_state(NR_KERNEL_SCS_KB)); +#endif show_val_kb(m, "PageTables: ", global_zone_page_state(NR_PAGETABLE)); diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index 1b9de7d220fb..acffc3bc6178 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -156,6 +156,9 @@ enum zone_stat_item { NR_MLOCK, /* mlock()ed pages found and moved off LRU */ NR_PAGETABLE, /* used for pagetables */ NR_KERNEL_STACK_KB, /* measured in KiB */ +#if IS_ENABLED(CONFIG_SHADOW_CALL_STACK) + 
NR_KERNEL_SCS_KB, /* measured in KiB */ +#endif /* Second 128 byte cacheline */ NR_BOUNCE, #if IS_ENABLED(CONFIG_ZSMALLOC) diff --git a/kernel/scs.c b/kernel/scs.c index 38f8f31c9451..6d2f983ac54e 100644 --- a/kernel/scs.c +++ b/kernel/scs.c @@ -6,8 +6,10 @@ */ #include +#include #include #include +#include #include static struct kmem_cache *scs_cache; @@ -40,6 +42,17 @@ void __init scs_init(void) scs_cache = kmem_cache_create("scs_cache", SCS_SIZE, 0, 0, NULL); } +static struct page *__scs_page(struct task_struct *tsk) +{ + return virt_to_page(task_scs(tsk)); +} + +static void scs_account(struct task_struct *tsk, int account) +{ + mod_zone_page_state(page_zone(__scs_page(tsk)), NR_KERNEL_SCS_KB, + account * (SCS_SIZE / 1024)); +} + int scs_prepare(struct task_struct *tsk, int node) { void *s = scs_alloc(node); @@ -49,6 +62,7 @@ int scs_prepare(struct task_struct *tsk, int node) task_scs(tsk) = s; task_scs_offset(tsk) = 0; + scs_account(tsk, 1); return 0; } @@ -61,5 +75,6 @@ void scs_release(struct task_struct *tsk) return; WARN(scs_corrupted(tsk), "corrupted shadow stack detected when freeing task\n"); + scs_account(tsk, -1); scs_free(s); } diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 69827d4fa052..83743d7a6177 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -5411,6 +5411,9 @@ void show_free_areas(unsigned int filter, nodemask_t *nodemask) " managed:%lukB" " mlocked:%lukB" " kernel_stack:%lukB" +#ifdef CONFIG_SHADOW_CALL_STACK + " shadow_call_stack:%lukB" +#endif " pagetables:%lukB" " bounce:%lukB" " free_pcp:%lukB" @@ -5433,6 +5436,9 @@ void show_free_areas(unsigned int filter, nodemask_t *nodemask) K(zone_managed_pages(zone)), K(zone_page_state(zone, NR_MLOCK)), zone_page_state(zone, NR_KERNEL_STACK_KB), +#ifdef CONFIG_SHADOW_CALL_STACK + zone_page_state(zone, NR_KERNEL_SCS_KB), +#endif K(zone_page_state(zone, NR_PAGETABLE)), K(zone_page_state(zone, NR_BOUNCE)), K(free_pcp), diff --git a/mm/vmstat.c b/mm/vmstat.c index 96d21a792b57..2435d2c24657 100644 --- a/mm/vmstat.c +++ b/mm/vmstat.c @@ -1119,6 +1119,9 @@ const char * const vmstat_text[] = { "nr_mlock", "nr_page_table_pages", "nr_kernel_stack", +#if IS_ENABLED(CONFIG_SHADOW_CALL_STACK) + "nr_shadow_call_stack", +#endif "nr_bounce", #if IS_ENABLED(CONFIG_ZSMALLOC) "nr_zspages", -- cgit v1.2.3 From b0ed0bbfb3046ed127f6004b5893ccb6cdd9ba90 Mon Sep 17 00:00:00 2001 From: Kevin Lo Date: Sat, 16 May 2020 01:24:47 +0800 Subject: net: phy: broadcom: add support for BCM54811 PHY The BCM54811 PHY shares many similarities with the already supported BCM54810 PHY but additionally requires some semi-unique configuration. Signed-off-by: Kevin Lo Reviewed-by: Florian Fainelli Signed-off-by: David S. 
Miller --- drivers/net/phy/broadcom.c | 56 +++++++++++++++++++++++++++++++++++++++++----- include/linux/brcmphy.h | 2 ++ 2 files changed, 53 insertions(+), 5 deletions(-) (limited to 'include/linux') diff --git a/drivers/net/phy/broadcom.c b/drivers/net/phy/broadcom.c index 97201d5cf007..8cd8d188542a 100644 --- a/drivers/net/phy/broadcom.c +++ b/drivers/net/phy/broadcom.c @@ -195,7 +195,8 @@ static void bcm54xx_adjust_rxrefclk(struct phy_device *phydev) if (BRCM_PHY_MODEL(phydev) != PHY_ID_BCM57780 && BRCM_PHY_MODEL(phydev) != PHY_ID_BCM50610 && BRCM_PHY_MODEL(phydev) != PHY_ID_BCM50610M && - BRCM_PHY_MODEL(phydev) != PHY_ID_BCM54810) + BRCM_PHY_MODEL(phydev) != PHY_ID_BCM54810 && + BRCM_PHY_MODEL(phydev) != PHY_ID_BCM54811) return; val = bcm_phy_read_shadow(phydev, BCM54XX_SHD_SCR3); @@ -214,8 +215,10 @@ static void bcm54xx_adjust_rxrefclk(struct phy_device *phydev) clk125en = false; } else { if (phydev->dev_flags & PHY_BRCM_RX_REFCLK_UNUSED) { - /* Here, bit 0 _enables_ CLK125 when set */ - val &= ~BCM54XX_SHD_SCR3_DEF_CLK125; + if (BRCM_PHY_MODEL(phydev) != PHY_ID_BCM54811) { + /* Here, bit 0 _enables_ CLK125 when set */ + val &= ~BCM54XX_SHD_SCR3_DEF_CLK125; + } clk125en = false; } } @@ -225,8 +228,13 @@ static void bcm54xx_adjust_rxrefclk(struct phy_device *phydev) else val |= BCM54XX_SHD_SCR3_DLLAPD_DIS; - if (phydev->dev_flags & PHY_BRCM_DIS_TXCRXC_NOENRGY) - val |= BCM54XX_SHD_SCR3_TRDDAPD; + if (phydev->dev_flags & PHY_BRCM_DIS_TXCRXC_NOENRGY) { + if (BRCM_PHY_MODEL(phydev) == PHY_ID_BCM54810 || + BRCM_PHY_MODEL(phydev) == PHY_ID_BCM54811) + val |= BCM54810_SHD_SCR3_TRDDAPD; + else + val |= BCM54XX_SHD_SCR3_TRDDAPD; + } if (orig != val) bcm_phy_write_shadow(phydev, BCM54XX_SHD_SCR3, val); @@ -327,6 +335,32 @@ static int bcm54xx_resume(struct phy_device *phydev) return bcm54xx_config_init(phydev); } +static int bcm54811_config_init(struct phy_device *phydev) +{ + int err, reg; + + /* Disable BroadR-Reach function. */ + reg = bcm_phy_read_exp(phydev, BCM54810_EXP_BROADREACH_LRE_MISC_CTL); + reg &= ~BCM54810_EXP_BROADREACH_LRE_MISC_CTL_EN; + err = bcm_phy_write_exp(phydev, BCM54810_EXP_BROADREACH_LRE_MISC_CTL, + reg); + if (err < 0) + return err; + + err = bcm54xx_config_init(phydev); + + /* Enable CLK125 MUX on LED4 if ref clock is enabled. 
*/ + if (!(phydev->dev_flags & PHY_BRCM_RX_REFCLK_UNUSED)) { + reg = bcm_phy_read_exp(phydev, BCM54612E_EXP_SPARE0); + err = bcm_phy_write_exp(phydev, BCM54612E_EXP_SPARE0, + BCM54612E_LED4_CLK125OUT_EN | reg); + if (err < 0) + return err; + } + + return err; +} + static int bcm5482_config_init(struct phy_device *phydev) { int err, reg; @@ -722,6 +756,17 @@ static struct phy_driver broadcom_drivers[] = { .config_intr = bcm_phy_config_intr, .suspend = genphy_suspend, .resume = bcm54xx_resume, +}, { + .phy_id = PHY_ID_BCM54811, + .phy_id_mask = 0xfffffff0, + .name = "Broadcom BCM54811", + /* PHY_GBIT_FEATURES */ + .config_init = bcm54811_config_init, + .config_aneg = bcm5481_config_aneg, + .ack_interrupt = bcm_phy_ack_intr, + .config_intr = bcm_phy_config_intr, + .suspend = genphy_suspend, + .resume = bcm54xx_resume, }, { .phy_id = PHY_ID_BCM5482, .phy_id_mask = 0xfffffff0, @@ -816,6 +861,7 @@ static struct mdio_device_id __maybe_unused broadcom_tbl[] = { { PHY_ID_BCM5464, 0xfffffff0 }, { PHY_ID_BCM5481, 0xfffffff0 }, { PHY_ID_BCM54810, 0xfffffff0 }, + { PHY_ID_BCM54811, 0xfffffff0 }, { PHY_ID_BCM5482, 0xfffffff0 }, { PHY_ID_BCM50610, 0xfffffff0 }, { PHY_ID_BCM50610M, 0xfffffff0 }, diff --git a/include/linux/brcmphy.h b/include/linux/brcmphy.h index d41624db6de2..6ad4c000661a 100644 --- a/include/linux/brcmphy.h +++ b/include/linux/brcmphy.h @@ -17,6 +17,7 @@ #define PHY_ID_BCM5395 0x0143bcf0 #define PHY_ID_BCM53125 0x03625f20 #define PHY_ID_BCM54810 0x03625d00 +#define PHY_ID_BCM54811 0x03625cc0 #define PHY_ID_BCM5482 0x0143bcb0 #define PHY_ID_BCM5411 0x00206070 #define PHY_ID_BCM5421 0x002060e0 @@ -255,6 +256,7 @@ #define BCM54810_EXP_BROADREACH_LRE_MISC_CTL_EN (1 << 0) #define BCM54810_SHD_CLK_CTL 0x3 #define BCM54810_SHD_CLK_CTL_GTXCLK_EN (1 << 9) +#define BCM54810_SHD_SCR3_TRDDAPD 0x0100 /* BCM54612E Registers */ #define BCM54612E_EXP_SPARE0 (MII_BCM54XX_EXP_SEL_ETC + 0x34) -- cgit v1.2.3 From 90bf45134d55d626ae2713cac50cda10c6c8b0c2 Mon Sep 17 00:00:00 2001 From: Paolo Abeni Date: Fri, 15 May 2020 19:22:15 +0200 Subject: mptcp: add new sock flag to deal with join subflows MP_JOIN subflows must not land in the accept queue. Currently tcp_check_req() calls an MPTCP-specific helper to detect such a scenario. This helper leverages the subflow context to check for MP_JOIN subflows. We also need to deal with MP_JOIN failures, even when the subflow context is not available due to an allocation failure. A possible solution would be changing the syn_recv_sock() signature to allow returning a more descriptive action/error code and deal with that in tcp_check_req(). Since the above need is MPTCP-specific, this patch instead uses a hole in the TCP request socket to add an MPTCP-specific flag. This flag is used by the MPTCP syn_recv_sock() to tell tcp_check_req() how to deal with the request socket. This change is a no-op for !MPTCP builds, and makes the MPTCP code simpler. It also allows the next patch to deal correctly with MP_JOIN failures. v1 -> v2: - be more conservative on drop_req initialization (Mat) RFC -> v1: - move the drop_req bit inside tcp_request_sock (Eric) Signed-off-by: Paolo Abeni Reviewed-by: Mat Martineau Reviewed-by: Christoph Paasch Signed-off-by: David S.
Miller --- include/linux/tcp.h | 3 +++ include/net/mptcp.h | 17 ++++++++++------- net/ipv4/tcp_minisocks.c | 2 +- net/mptcp/protocol.c | 7 ------- net/mptcp/subflow.c | 3 +++ 5 files changed, 17 insertions(+), 15 deletions(-) (limited to 'include/linux') diff --git a/include/linux/tcp.h b/include/linux/tcp.h index e60db06ec28d..bf44e85d709d 100644 --- a/include/linux/tcp.h +++ b/include/linux/tcp.h @@ -120,6 +120,9 @@ struct tcp_request_sock { u64 snt_synack; /* first SYNACK sent time */ bool tfo_listener; bool is_mptcp; +#if IS_ENABLED(CONFIG_MPTCP) + bool drop_req; +#endif u32 txhash; u32 rcv_isn; u32 snt_isn; diff --git a/include/net/mptcp.h b/include/net/mptcp.h index e60275659de6..c4a6ef4ba35b 100644 --- a/include/net/mptcp.h +++ b/include/net/mptcp.h @@ -68,6 +68,11 @@ static inline bool rsk_is_mptcp(const struct request_sock *req) return tcp_rsk(req)->is_mptcp; } +static inline bool rsk_drop_req(const struct request_sock *req) +{ + return tcp_rsk(req)->is_mptcp && tcp_rsk(req)->drop_req; +} + void mptcp_space(const struct sock *ssk, int *space, int *full_space); bool mptcp_syn_options(struct sock *sk, const struct sk_buff *skb, unsigned int *size, struct mptcp_out_options *opts); @@ -121,8 +126,6 @@ static inline bool mptcp_skb_can_collapse(const struct sk_buff *to, skb_ext_find(from, SKB_EXT_MPTCP)); } -bool mptcp_sk_is_subflow(const struct sock *sk); - void mptcp_seq_show(struct seq_file *seq); #else @@ -140,6 +143,11 @@ static inline bool rsk_is_mptcp(const struct request_sock *req) return false; } +static inline bool rsk_drop_req(const struct request_sock *req) +{ + return false; +} + static inline void mptcp_parse_option(const struct sk_buff *skb, const unsigned char *ptr, int opsize, struct tcp_options_received *opt_rx) @@ -190,11 +198,6 @@ static inline bool mptcp_skb_can_collapse(const struct sk_buff *to, return true; } -static inline bool mptcp_sk_is_subflow(const struct sock *sk) -{ - return false; -} - static inline void mptcp_space(const struct sock *ssk, int *s, int *fs) { } static inline void mptcp_seq_show(struct seq_file *seq) { } #endif /* CONFIG_MPTCP */ diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c index 7e40322cc5ec..495dda2449fe 100644 --- a/net/ipv4/tcp_minisocks.c +++ b/net/ipv4/tcp_minisocks.c @@ -774,7 +774,7 @@ struct sock *tcp_check_req(struct sock *sk, struct sk_buff *skb, if (!child) goto listen_overflow; - if (own_req && sk_is_mptcp(child) && mptcp_sk_is_subflow(child)) { + if (own_req && rsk_drop_req(req)) { reqsk_queue_removed(&inet_csk(sk)->icsk_accept_queue, req); inet_csk_reqsk_queue_drop_and_put(sk, req); return child; diff --git a/net/mptcp/protocol.c b/net/mptcp/protocol.c index e1f23016ed3f..a61e60e94137 100644 --- a/net/mptcp/protocol.c +++ b/net/mptcp/protocol.c @@ -1638,13 +1638,6 @@ bool mptcp_finish_join(struct sock *sk) return ret; } -bool mptcp_sk_is_subflow(const struct sock *sk) -{ - struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk); - - return subflow->mp_join == 1; -} - static bool mptcp_memory_free(const struct sock *sk, int wake) { struct mptcp_sock *msk = mptcp_sk(sk); diff --git a/net/mptcp/subflow.c b/net/mptcp/subflow.c index 009d5c478062..5e03ed8ae899 100644 --- a/net/mptcp/subflow.c +++ b/net/mptcp/subflow.c @@ -470,6 +470,8 @@ create_child: if (child && *own_req) { struct mptcp_subflow_context *ctx = mptcp_subflow_ctx(child); + tcp_rsk(req)->drop_req = false; + /* we need to fallback on ctx allocation failure and on pre-reqs * checking above. 
In the latter scenario we additionally need * to reset the context to non MPTCP status. @@ -512,6 +514,7 @@ create_child: goto close_child; SUBFLOW_REQ_INC_STATS(req, MPTCP_MIB_JOINACKRX); + tcp_rsk(req)->drop_req = true; } } -- cgit v1.2.3 From 8b85996095049a2216c0a6b582330ec75913243c Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Tue, 12 May 2020 16:32:48 -0700 Subject: linux/parser.h: add include guards <linux/parser.h> is missing include guards. Add them. This is needed to allow declaring a function in another header that takes a substring_t parameter. Link: https://lore.kernel.org/r/20200512233251.118314-2-ebiggers@kernel.org Reviewed-by: Theodore Ts'o Reviewed-by: Jaegeuk Kim Signed-off-by: Eric Biggers --- include/linux/parser.h | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/parser.h b/include/linux/parser.h index 12fc3482f5fc..89e2b23fb888 100644 --- a/include/linux/parser.h +++ b/include/linux/parser.h @@ -7,7 +7,8 @@ * but could potentially be used anywhere else that simple option=arg * parsing is required. */ - +#ifndef _LINUX_PARSER_H +#define _LINUX_PARSER_H /* associates an integer enumerator with a pattern string. */ struct match_token { @@ -34,3 +35,5 @@ int match_hex(substring_t *, int *result); bool match_wildcard(const char *pattern, const char *str); size_t match_strlcpy(char *, const substring_t *, size_t); char *match_strdup(const substring_t *); + +#endif /* _LINUX_PARSER_H */ -- cgit v1.2.3 From e5006671acc714d9dbfc6f8a618124c36f5cc6f8 Mon Sep 17 00:00:00 2001 From: Rob Herring Date: Tue, 28 Apr 2020 15:49:45 -0500 Subject: clk: versatile: Drop the legacy IM-PD1 clock code Now that the non-DT IM-PD1 support code has been removed, drop the clock-related code from clk-impd1.c. Link: https://lore.kernel.org/r/20200428204945.21067-1-robh@kernel.org Cc: Linus Walleij Cc: Stephen Boyd Cc: linux-arm-kernel@lists.infradead.org Cc: linux-clk@vger.kernel.org Signed-off-by: Rob Herring Reviewed-by: Linus Walleij Reviewed-by: Stephen Boyd Signed-off-by: Arnd Bergmann --- drivers/clk/versatile/clk-impd1.c | 121 --------------------- include/linux/platform_data/clk-integrator.h | 2 - 2 files changed, 123 deletions(-) delete mode 100644 include/linux/platform_data/clk-integrator.h (limited to 'include/linux') diff --git a/drivers/clk/versatile/clk-impd1.c b/drivers/clk/versatile/clk-impd1.c index b05da8516d4c..95129d39a44b 100644 --- a/drivers/clk/versatile/clk-impd1.c +++ b/drivers/clk/versatile/clk-impd1.c @@ -8,7 +8,6 @@ #include #include #include -#include #include #include #include @@ -20,26 +19,6 @@ #define IMPD1_OSC2 0x04 #define IMPD1_LOCK 0x08 -struct impd1_clk { - char *pclkname; - struct clk *pclk; - char *vco1name; - struct clk *vco1clk; - char *vco2name; - struct clk *vco2clk; - struct clk *mmciclk; - char *uartname; - struct clk *uartclk; - char *spiname; - struct clk *spiclk; - char *scname; - struct clk *scclk; - struct clk_lookup *clks[15]; -}; - -/* One entry for each connected IM-PD1 LM */ -static struct impd1_clk impd1_clks[4]; - /* * There are two VCO's on the IM-PD1 */ @@ -80,106 +59,6 @@ static const struct clk_icst_desc impd1_icst2_desc = { .lock_offset = IMPD1_LOCK, }; -/** - * integrator_impd1_clk_init() - set up the integrator clock tree - * @base: base address of the logic module (LM) - * @id: the ID of this LM - */ -void integrator_impd1_clk_init(void __iomem *base, unsigned int id) -{ - struct impd1_clk *imc; - struct clk *clk; - struct clk *pclk; - int i; - - if (id > 3) { - pr_crit("no more than 4 LMs can
be attached\n"); - return; - } - imc = &impd1_clks[id]; - - /* Register the fixed rate PCLK */ - imc->pclkname = kasprintf(GFP_KERNEL, "lm%x-pclk", id); - pclk = clk_register_fixed_rate(NULL, imc->pclkname, NULL, 0, 0); - imc->pclk = pclk; - - imc->vco1name = kasprintf(GFP_KERNEL, "lm%x-vco1", id); - clk = icst_clk_register(NULL, &impd1_icst1_desc, imc->vco1name, NULL, - base); - imc->vco1clk = clk; - imc->clks[0] = clkdev_alloc(pclk, "apb_pclk", "lm%x:01000", id); - imc->clks[1] = clkdev_alloc(clk, NULL, "lm%x:01000", id); - - /* VCO2 is also called "CLK2" */ - imc->vco2name = kasprintf(GFP_KERNEL, "lm%x-vco2", id); - clk = icst_clk_register(NULL, &impd1_icst2_desc, imc->vco2name, NULL, - base); - imc->vco2clk = clk; - - /* MMCI uses CLK2 right off */ - imc->clks[2] = clkdev_alloc(pclk, "apb_pclk", "lm%x:00700", id); - imc->clks[3] = clkdev_alloc(clk, NULL, "lm%x:00700", id); - - /* UART reference clock divides CLK2 by a fixed factor 4 */ - imc->uartname = kasprintf(GFP_KERNEL, "lm%x-uartclk", id); - clk = clk_register_fixed_factor(NULL, imc->uartname, imc->vco2name, - CLK_IGNORE_UNUSED, 1, 4); - imc->uartclk = clk; - imc->clks[4] = clkdev_alloc(pclk, "apb_pclk", "lm%x:00100", id); - imc->clks[5] = clkdev_alloc(clk, NULL, "lm%x:00100", id); - imc->clks[6] = clkdev_alloc(pclk, "apb_pclk", "lm%x:00200", id); - imc->clks[7] = clkdev_alloc(clk, NULL, "lm%x:00200", id); - - /* SPI PL022 clock divides CLK2 by a fixed factor 64 */ - imc->spiname = kasprintf(GFP_KERNEL, "lm%x-spiclk", id); - clk = clk_register_fixed_factor(NULL, imc->spiname, imc->vco2name, - CLK_IGNORE_UNUSED, 1, 64); - imc->clks[8] = clkdev_alloc(pclk, "apb_pclk", "lm%x:00300", id); - imc->clks[9] = clkdev_alloc(clk, NULL, "lm%x:00300", id); - - /* The GPIO blocks and AACI have only PCLK */ - imc->clks[10] = clkdev_alloc(pclk, "apb_pclk", "lm%x:00400", id); - imc->clks[11] = clkdev_alloc(pclk, "apb_pclk", "lm%x:00500", id); - imc->clks[12] = clkdev_alloc(pclk, "apb_pclk", "lm%x:00800", id); - - /* Smart Card clock divides CLK2 by a fixed factor 4 */ - imc->scname = kasprintf(GFP_KERNEL, "lm%x-scclk", id); - clk = clk_register_fixed_factor(NULL, imc->scname, imc->vco2name, - CLK_IGNORE_UNUSED, 1, 4); - imc->scclk = clk; - imc->clks[13] = clkdev_alloc(pclk, "apb_pclk", "lm%x:00600", id); - imc->clks[14] = clkdev_alloc(clk, NULL, "lm%x:00600", id); - - for (i = 0; i < ARRAY_SIZE(imc->clks); i++) - clkdev_add(imc->clks[i]); -} -EXPORT_SYMBOL_GPL(integrator_impd1_clk_init); - -void integrator_impd1_clk_exit(unsigned int id) -{ - int i; - struct impd1_clk *imc; - - if (id > 3) - return; - imc = &impd1_clks[id]; - - for (i = 0; i < ARRAY_SIZE(imc->clks); i++) - clkdev_drop(imc->clks[i]); - clk_unregister(imc->spiclk); - clk_unregister(imc->uartclk); - clk_unregister(imc->vco2clk); - clk_unregister(imc->vco1clk); - clk_unregister(imc->pclk); - kfree(imc->scname); - kfree(imc->spiname); - kfree(imc->uartname); - kfree(imc->vco2name); - kfree(imc->vco1name); - kfree(imc->pclkname); -} -EXPORT_SYMBOL_GPL(integrator_impd1_clk_exit); - static int integrator_impd1_clk_spawn(struct device *dev, struct device_node *parent, struct device_node *np) diff --git a/include/linux/platform_data/clk-integrator.h b/include/linux/platform_data/clk-integrator.h deleted file mode 100644 index addd48cac625..000000000000 --- a/include/linux/platform_data/clk-integrator.h +++ /dev/null @@ -1,2 +0,0 @@ -void integrator_impd1_clk_init(void __iomem *base, unsigned int id); -void integrator_impd1_clk_exit(unsigned int id); -- cgit v1.2.3 From 
2771cefeac499d68ca2fca3861ba9e46d15bd4ae Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 4 May 2020 18:10:05 +0200 Subject: block: remove the REQ_NOWAIT_INLINE flag Signed-off-by: Christoph Hellwig Reviewed-by: Chaitanya Kulkarni Signed-off-by: Jens Axboe --- include/linux/blk_types.h | 2 -- 1 file changed, 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/blk_types.h b/include/linux/blk_types.h index 9322261fb316..ccb895f911b1 100644 --- a/include/linux/blk_types.h +++ b/include/linux/blk_types.h @@ -343,7 +343,6 @@ enum req_flag_bits { __REQ_RAHEAD, /* read ahead, can fail anytime */ __REQ_BACKGROUND, /* background IO */ __REQ_NOWAIT, /* Don't wait if request will block */ - __REQ_NOWAIT_INLINE, /* Return would-block error inline */ /* * When a shared kthread needs to issue a bio for a cgroup, doing * so synchronously can lead to priority inversions as the kthread @@ -378,7 +377,6 @@ enum req_flag_bits { #define REQ_RAHEAD (1ULL << __REQ_RAHEAD) #define REQ_BACKGROUND (1ULL << __REQ_BACKGROUND) #define REQ_NOWAIT (1ULL << __REQ_NOWAIT) -#define REQ_NOWAIT_INLINE (1ULL << __REQ_NOWAIT_INLINE) #define REQ_CGROUP_PUNT (1ULL << __REQ_CGROUP_PUNT) #define REQ_NOUNMAP (1ULL << __REQ_NOUNMAP) -- cgit v1.2.3 From 4930f4831b1547b52c5968e9307fe3d840d7fba0 Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Sat, 16 May 2020 10:46:23 +0200 Subject: net: allow __skb_ext_alloc to sleep mptcp calls this from the transmit side, from process context. Allow a sleeping allocation instead of unconditional GFP_ATOMIC. Acked-by: Paolo Abeni Signed-off-by: Florian Westphal Signed-off-by: David S. Miller --- include/linux/skbuff.h | 2 +- net/core/skbuff.c | 8 +++++--- net/mptcp/protocol.c | 4 +++- 3 files changed, 9 insertions(+), 5 deletions(-) (limited to 'include/linux') diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index 3000c526f552..531843952809 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -4165,7 +4165,7 @@ struct skb_ext { char data[] __aligned(8); }; -struct skb_ext *__skb_ext_alloc(void); +struct skb_ext *__skb_ext_alloc(gfp_t flags); void *__skb_ext_set(struct sk_buff *skb, enum skb_ext_id id, struct skb_ext *ext); void *skb_ext_add(struct sk_buff *skb, enum skb_ext_id id); diff --git a/net/core/skbuff.c b/net/core/skbuff.c index 1bf0c3d278e7..35a133c6d13b 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -6087,13 +6087,15 @@ static void *skb_ext_get_ptr(struct skb_ext *ext, enum skb_ext_id id) /** * __skb_ext_alloc - allocate a new skb extensions storage * + * @flags: See kmalloc(). + * * Returns the newly allocated pointer. The pointer can later attached to a * skb via __skb_ext_set(). * Note: caller must handle the skb_ext as an opaque data. 
*/ -struct skb_ext *__skb_ext_alloc(void) +struct skb_ext *__skb_ext_alloc(gfp_t flags) { - struct skb_ext *new = kmem_cache_alloc(skbuff_ext_cache, GFP_ATOMIC); + struct skb_ext *new = kmem_cache_alloc(skbuff_ext_cache, flags); if (new) { memset(new->offset, 0, sizeof(new->offset)); @@ -6188,7 +6190,7 @@ void *skb_ext_add(struct sk_buff *skb, enum skb_ext_id id) } else { newoff = SKB_EXT_CHUNKSIZEOF(*new); - new = __skb_ext_alloc(); + new = __skb_ext_alloc(GFP_ATOMIC); if (!new) return NULL; } diff --git a/net/mptcp/protocol.c b/net/mptcp/protocol.c index bc950cf818f7..e3a628bea2b8 100644 --- a/net/mptcp/protocol.c +++ b/net/mptcp/protocol.c @@ -367,8 +367,10 @@ static void mptcp_stop_timer(struct sock *sk) static bool mptcp_ext_cache_refill(struct mptcp_sock *msk) { + const struct sock *sk = (const struct sock *)msk; + if (!msk->cached_ext) - msk->cached_ext = __skb_ext_alloc(); + msk->cached_ext = __skb_ext_alloc(sk->sk_allocation); return !!msk->cached_ext; } -- cgit v1.2.3 From 9dafdfc2f0a3ae551711098de3d7b621a469f11a Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Sun, 17 May 2020 14:18:05 +0300 Subject: splice: export do_tee() export do_tee() for use in io_uring Signed-off-by: Pavel Begunkov Signed-off-by: Jens Axboe --- fs/splice.c | 3 +-- include/linux/splice.h | 3 +++ 2 files changed, 4 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/fs/splice.c b/fs/splice.c index fd0a1e7e5959..a1dd54de24d8 100644 --- a/fs/splice.c +++ b/fs/splice.c @@ -1754,8 +1754,7 @@ static int link_pipe(struct pipe_inode_info *ipipe, * The 'flags' used are the SPLICE_F_* variants, currently the only * applicable one is SPLICE_F_NONBLOCK. */ -static long do_tee(struct file *in, struct file *out, size_t len, - unsigned int flags) +long do_tee(struct file *in, struct file *out, size_t len, unsigned int flags) { struct pipe_inode_info *ipipe = get_pipe_info(in); struct pipe_inode_info *opipe = get_pipe_info(out); diff --git a/include/linux/splice.h b/include/linux/splice.h index ebbbfea48aa0..5c47013f708e 100644 --- a/include/linux/splice.h +++ b/include/linux/splice.h @@ -82,6 +82,9 @@ extern long do_splice(struct file *in, loff_t __user *off_in, struct file *out, loff_t __user *off_out, size_t len, unsigned int flags); +extern long do_tee(struct file *in, struct file *out, size_t len, + unsigned int flags); + /* * for dynamic pipe sizing */ -- cgit v1.2.3 From 4c033b549912bc301c5e2adfb7b6ca007c11bf31 Mon Sep 17 00:00:00 2001 From: Geert Uytterhoeven Date: Mon, 11 May 2020 16:52:54 +0200 Subject: gpiolib: Add support for GPIO lookup by line name Currently a GPIO lookup table can only refer to a specific GPIO by a tuple, consisting of a GPIO controller label and a GPIO offset inside the controller. However, a GPIO may also carry a line name, defined by DT or ACPI. If present, the line name is the most use-centric way to refer to a GPIO. Hence add support for looking up GPIOs by line name. Note that there is no guarantee that GPIO line names are globally unique, so this will use the first match found. Implement this by reusing the existing gpiod_lookup infrastructure. Rename gpiod_lookup.chip_label to gpiod_lookup.key, to make it clear that this field can have two meanings, and update the kerneldoc and GPIO_LOOKUP*() macros. 
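For illustration, a minimal board-file sketch of a lookup table under the new scheme (not part of this patch; the device ID, chip label, line name, and consumer IDs below are made up). The first entry references a GPIO by chip label and offset as before; the second passes U16_MAX as chip_hwnum so that the key is interpreted as a line name:

	#include <linux/gpio/machine.h>

	static struct gpiod_lookup_table foo_gpios = {
		.dev_id = "foo.0",
		.table = {
			/* key = gpiochip label, GPIO at offset 15 of that chip */
			GPIO_LOOKUP("gpiochip0", 15, "reset", GPIO_ACTIVE_LOW),
			/* key = GPIO line name; the first match found is used */
			GPIO_LOOKUP("nand-ready", U16_MAX, "ready", GPIO_ACTIVE_HIGH),
			{ },
		},
	};

	static int __init foo_board_init(void)
	{
		gpiod_add_lookup_table(&foo_gpios);
		return 0;
	}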
Signed-off-by: Geert Uytterhoeven Tested-by: Eugeniu Rosca Reviewed-by: Ulrich Hecht Reviewed-by: Eugeniu Rosca Link: https://lore.kernel.org/r/20200511145257.22970-4-geert+renesas@glider.be Signed-off-by: Linus Walleij --- Documentation/driver-api/gpio/board.rst | 15 ++++++++++----- drivers/gpio/gpiolib.c | 22 +++++++++++++++++----- include/linux/gpio/machine.h | 17 ++++++++++------- 3 files changed, 37 insertions(+), 17 deletions(-) (limited to 'include/linux') diff --git a/Documentation/driver-api/gpio/board.rst b/Documentation/driver-api/gpio/board.rst index ce91518bf9f4..191fa867826a 100644 --- a/Documentation/driver-api/gpio/board.rst +++ b/Documentation/driver-api/gpio/board.rst @@ -113,13 +113,15 @@ files that desire to do so need to include the following header:: GPIOs are mapped by the means of tables of lookups, containing instances of the gpiod_lookup structure. Two macros are defined to help declaring such mappings:: - GPIO_LOOKUP(chip_label, chip_hwnum, con_id, flags) - GPIO_LOOKUP_IDX(chip_label, chip_hwnum, con_id, idx, flags) + GPIO_LOOKUP(key, chip_hwnum, con_id, flags) + GPIO_LOOKUP_IDX(key, chip_hwnum, con_id, idx, flags) where - - chip_label is the label of the gpiod_chip instance providing the GPIO - - chip_hwnum is the hardware number of the GPIO within the chip + - key is either the label of the gpiod_chip instance providing the GPIO, or + the GPIO line name + - chip_hwnum is the hardware number of the GPIO within the chip, or U16_MAX + to indicate that key is a GPIO line name - con_id is the name of the GPIO function from the device point of view. It can be NULL, in which case it will match any function. - idx is the index of the GPIO within the function. @@ -135,7 +137,10 @@ where In the future, these flags might be extended to support more properties. -Note that GPIO_LOOKUP() is just a shortcut to GPIO_LOOKUP_IDX() where idx = 0. +Note that: + 1. GPIO line names are not guaranteed to be globally unique, so the first + match found will be used. + 2. GPIO_LOOKUP() is just a shortcut to GPIO_LOOKUP_IDX() where idx = 0. A lookup table can then be defined as follows, with an empty entry defining its end. The 'dev_id' field of the table is the identifier of the device that will diff --git a/drivers/gpio/gpiolib.c b/drivers/gpio/gpiolib.c index 40f2d7f69be2..3c1dcdfe8572 100644 --- a/drivers/gpio/gpiolib.c +++ b/drivers/gpio/gpiolib.c @@ -4618,7 +4618,7 @@ static struct gpio_desc *gpiod_find(struct device *dev, const char *con_id, if (!table) return desc; - for (p = &table->table[0]; p->chip_label; p++) { + for (p = &table->table[0]; p->key; p++) { struct gpio_chip *gc; /* idx must always match exactly */ @@ -4629,18 +4629,30 @@ static struct gpio_desc *gpiod_find(struct device *dev, const char *con_id, if (p->con_id && (!con_id || strcmp(p->con_id, con_id))) continue; - gc = find_chip_by_name(p->chip_label); + if (p->chip_hwnum == U16_MAX) { + desc = gpio_name_to_desc(p->key); + if (desc) { + *flags = p->flags; + return desc; + } + + dev_warn(dev, "cannot find GPIO line %s, deferring\n", + p->key); + return ERR_PTR(-EPROBE_DEFER); + } + + gc = find_chip_by_name(p->key); if (!gc) { /* * As the lookup table indicates a chip with - * p->chip_label should exist, assume it may + * p->key should exist, assume it may * still appear later and let the interested * consumer be probed again or let the Deferred * Probe infrastructure handle the error. 
*/ dev_warn(dev, "cannot find GPIO chip %s, deferring\n", - p->chip_label); + p->key); return ERR_PTR(-EPROBE_DEFER); } @@ -4671,7 +4683,7 @@ static int platform_gpio_count(struct device *dev, const char *con_id) if (!table) return -ENOENT; - for (p = &table->table[0]; p->chip_label; p++) { + for (p = &table->table[0]; p->key; p++) { if ((con_id && p->con_id && !strcmp(con_id, p->con_id)) || (!con_id && !p->con_id)) count++; diff --git a/include/linux/gpio/machine.h b/include/linux/gpio/machine.h index 1ebe5be05d5f..781a053abbb9 100644 --- a/include/linux/gpio/machine.h +++ b/include/linux/gpio/machine.h @@ -20,8 +20,11 @@ enum gpio_lookup_flags { /** * struct gpiod_lookup - lookup table - * @chip_label: name of the chip the GPIO belongs to - * @chip_hwnum: hardware number (i.e. relative to the chip) of the GPIO + * @key: either the name of the chip the GPIO belongs to, or the GPIO line name + * Note that GPIO line names are not guaranteed to be globally unique, + * so this will use the first match found! + * @chip_hwnum: hardware number (i.e. relative to the chip) of the GPIO, or + * U16_MAX to indicate that @key is a GPIO line name * @con_id: name of the GPIO from the device's point of view * @idx: index of the GPIO in case several GPIOs share the same name * @flags: bitmask of gpio_lookup_flags GPIO_* values @@ -30,7 +33,7 @@ enum gpio_lookup_flags { * functions using platform data. */ struct gpiod_lookup { - const char *chip_label; + const char *key; u16 chip_hwnum; const char *con_id; unsigned int idx; @@ -63,17 +66,17 @@ struct gpiod_hog { /* * Simple definition of a single GPIO under a con_id */ -#define GPIO_LOOKUP(_chip_label, _chip_hwnum, _con_id, _flags) \ - GPIO_LOOKUP_IDX(_chip_label, _chip_hwnum, _con_id, 0, _flags) +#define GPIO_LOOKUP(_key, _chip_hwnum, _con_id, _flags) \ + GPIO_LOOKUP_IDX(_key, _chip_hwnum, _con_id, 0, _flags) /* * Use this macro if you need to have several GPIOs under the same con_id. * Each GPIO needs to use a different index and can be accessed using * gpiod_get_index() */ -#define GPIO_LOOKUP_IDX(_chip_label, _chip_hwnum, _con_id, _idx, _flags) \ +#define GPIO_LOOKUP_IDX(_key, _chip_hwnum, _con_id, _idx, _flags) \ { \ - .chip_label = _chip_label, \ + .key = _key, \ .chip_hwnum = _chip_hwnum, \ .con_id = _con_id, \ .idx = _idx, \ -- cgit v1.2.3 From 5c8f77a278737a6af44a892f0700d9aadb2b0de0 Mon Sep 17 00:00:00 2001 From: Bartosz Golaszewski Date: Thu, 14 May 2020 10:39:00 +0200 Subject: irqdomain: Make irq_domain_reset_irq_data() available to non-hierarchical users irq_domain_reset_irq_data() doesn't modify the parent data, so it can be made available even if irq domain hierarchy is not being built. We'll subsequently use it in irq_sim code. 
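For illustration, a non-hierarchical irq domain's .unmap() callback could now be written as below (a minimal sketch with hypothetical foo_* names; the irq_sim conversion in the next patch does essentially this):

	static void foo_domain_unmap(struct irq_domain *domain, unsigned int virq)
	{
		struct irq_data *irqd = irq_domain_get_irq_data(domain, virq);

		/* free the per-interrupt state installed by .map() */
		kfree(irq_data_get_irq_chip_data(irqd));
		irq_set_handler(virq, NULL);
		/* clear hwirq, chip and chip_data without touching any parent */
		irq_domain_reset_irq_data(irqd);
	}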
Signed-off-by: Bartosz Golaszewski Signed-off-by: Marc Zyngier Reviewed-by: Linus Walleij Link: https://lore.kernel.org/r/20200514083901.23445-2-brgl@bgdev.pl --- include/linux/irqdomain.h | 2 +- kernel/irq/irqdomain.c | 24 ++++++++++++------------ 2 files changed, 13 insertions(+), 13 deletions(-) (limited to 'include/linux') diff --git a/include/linux/irqdomain.h b/include/linux/irqdomain.h index 8d062e86d954..b37350c4fe37 100644 --- a/include/linux/irqdomain.h +++ b/include/linux/irqdomain.h @@ -450,6 +450,7 @@ extern void irq_domain_set_info(struct irq_domain *domain, unsigned int virq, irq_hw_number_t hwirq, struct irq_chip *chip, void *chip_data, irq_flow_handler_t handler, void *handler_data, const char *handler_name); +extern void irq_domain_reset_irq_data(struct irq_data *irq_data); #ifdef CONFIG_IRQ_DOMAIN_HIERARCHY extern struct irq_domain *irq_domain_create_hierarchy(struct irq_domain *parent, unsigned int flags, unsigned int size, @@ -491,7 +492,6 @@ extern int irq_domain_set_hwirq_and_chip(struct irq_domain *domain, irq_hw_number_t hwirq, struct irq_chip *chip, void *chip_data); -extern void irq_domain_reset_irq_data(struct irq_data *irq_data); extern void irq_domain_free_irqs_common(struct irq_domain *domain, unsigned int virq, unsigned int nr_irqs); diff --git a/kernel/irq/irqdomain.c b/kernel/irq/irqdomain.c index 35b8d97c3a1d..e2aa128ea3ee 100644 --- a/kernel/irq/irqdomain.c +++ b/kernel/irq/irqdomain.c @@ -1047,6 +1047,18 @@ int irq_domain_alloc_descs(int virq, unsigned int cnt, irq_hw_number_t hwirq, return virq; } +/** + * irq_domain_reset_irq_data - Clear hwirq, chip and chip_data in @irq_data + * @irq_data: The pointer to irq_data + */ +void irq_domain_reset_irq_data(struct irq_data *irq_data) +{ + irq_data->hwirq = 0; + irq_data->chip = &no_irq_chip; + irq_data->chip_data = NULL; +} +EXPORT_SYMBOL_GPL(irq_domain_reset_irq_data); + #ifdef CONFIG_IRQ_DOMAIN_HIERARCHY /** * irq_domain_create_hierarchy - Add a irqdomain into the hierarchy @@ -1247,18 +1259,6 @@ void irq_domain_set_info(struct irq_domain *domain, unsigned int virq, } EXPORT_SYMBOL(irq_domain_set_info); -/** - * irq_domain_reset_irq_data - Clear hwirq, chip and chip_data in @irq_data - * @irq_data: The pointer to irq_data - */ -void irq_domain_reset_irq_data(struct irq_data *irq_data) -{ - irq_data->hwirq = 0; - irq_data->chip = &no_irq_chip; - irq_data->chip_data = NULL; -} -EXPORT_SYMBOL_GPL(irq_domain_reset_irq_data); - /** * irq_domain_free_irqs_common - Clear irq_data and free the parent * @domain: Interrupt domain to match -- cgit v1.2.3 From 337cbeb2c13eb4cab84f576fd402d7ae4ed31ae1 Mon Sep 17 00:00:00 2001 From: Bartosz Golaszewski Date: Thu, 14 May 2020 10:39:01 +0200 Subject: genirq/irq_sim: Simplify the API The interrupt simulator API exposes a lot of custom data structures and functions and doesn't reuse the interfaces already exposed by the irq subsystem. This patch tries to address it. We hide all the simulator-related data structures from users and instead rely on the well-known irq domain. When creating the interrupt simulator the user receives a pointer to a newly created irq_domain and can use it to create mappings for simulated interrupts. It is also possible to pass a handle to fwnode when creating the simulator domain and retrieve it using irq_find_matching_fwnode(). The irq_sim_fire() function is dropped as well. Instead we implement the irq_get/set_irqchip_state interface. 
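As a minimal illustration of the reworked API (a sketch, not part of this patch; the foo_* names are hypothetical and error unwinding is trimmed), a consumer now creates a simulator domain, maps a line, and fires it through the generic irqchip state interface:

	#include <linux/interrupt.h>
	#include <linux/irqdomain.h>
	#include <linux/irq_sim.h>

	static irqreturn_t foo_handler(int irq, void *data)
	{
		return IRQ_HANDLED;
	}

	static int foo_setup(void)
	{
		struct irq_domain *domain;
		int irq, ret;

		domain = irq_domain_create_sim(NULL, 8);	/* 8 dummy lines */
		if (IS_ERR(domain))
			return PTR_ERR(domain);

		irq = irq_create_mapping(domain, 0);	/* hwirq 0 -> Linux irq */
		ret = request_irq(irq, foo_handler, 0, "foo-sim", NULL);
		if (ret)
			return ret;

		/* what used to be irq_sim_fire(): mark the line pending */
		return irq_set_irqchip_state(irq, IRQCHIP_STATE_PENDING, true);
	}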
We modify the two modules that use the simulator at the same time as adding these changes in order to reduce the intermediate bloat that would result when trying to migrate the drivers in separate patches. Signed-off-by: Bartosz Golaszewski Signed-off-by: Marc Zyngier Reviewed-by: Linus Walleij Acked-by: Jonathan Cameron #for IIO Link: https://lore.kernel.org/r/20200514083901.23445-3-brgl@bgdev.pl --- drivers/gpio/gpio-mockup.c | 53 +++++-- drivers/iio/dummy/iio_dummy_evgen.c | 34 +++-- include/linux/irq_sim.h | 33 ++--- kernel/irq/Kconfig | 1 + kernel/irq/irq_sim.c | 267 ++++++++++++++++++++++-------------- 5 files changed, 237 insertions(+), 151 deletions(-) (limited to 'include/linux') diff --git a/drivers/gpio/gpio-mockup.c b/drivers/gpio/gpio-mockup.c index 3eb94f3740d1..bc345185db26 100644 --- a/drivers/gpio/gpio-mockup.c +++ b/drivers/gpio/gpio-mockup.c @@ -14,6 +14,7 @@ #include #include #include +#include #include #include #include @@ -48,7 +49,7 @@ struct gpio_mockup_line_status { struct gpio_mockup_chip { struct gpio_chip gc; struct gpio_mockup_line_status *lines; - struct irq_sim irqsim; + struct irq_domain *irq_sim_domain; struct dentry *dbg_dir; struct mutex lock; }; @@ -144,14 +145,12 @@ static void gpio_mockup_set_multiple(struct gpio_chip *gc, static int gpio_mockup_apply_pull(struct gpio_mockup_chip *chip, unsigned int offset, int value) { + int curr, irq, irq_type, ret = 0; struct gpio_desc *desc; struct gpio_chip *gc; - struct irq_sim *sim; - int curr, irq, irq_type; gc = &chip->gc; desc = &gc->gpiodev->descs[offset]; - sim = &chip->irqsim; mutex_lock(&chip->lock); @@ -161,14 +160,28 @@ static int gpio_mockup_apply_pull(struct gpio_mockup_chip *chip, if (curr == value) goto out; - irq = irq_sim_irqnum(sim, offset); + irq = irq_find_mapping(chip->irq_sim_domain, offset); + if (!irq) + /* + * This is fine - it just means, nobody is listening + * for interrupts on this line, otherwise + * irq_create_mapping() would have been called from + * the to_irq() callback. + */ + goto set_value; + irq_type = irq_get_trigger_type(irq); if ((value == 1 && (irq_type & IRQ_TYPE_EDGE_RISING)) || - (value == 0 && (irq_type & IRQ_TYPE_EDGE_FALLING))) - irq_sim_fire(sim, offset); + (value == 0 && (irq_type & IRQ_TYPE_EDGE_FALLING))) { + ret = irq_set_irqchip_state(irq, IRQCHIP_STATE_PENDING, + true); + if (ret) + goto out; + } } +set_value: /* Change the value unless we're actively driving the line. 
*/ if (!test_bit(FLAG_REQUESTED, &desc->flags) || !test_bit(FLAG_IS_OUT, &desc->flags)) @@ -177,7 +190,7 @@ static int gpio_mockup_apply_pull(struct gpio_mockup_chip *chip, out: chip->lines[offset].pull = value; mutex_unlock(&chip->lock); - return 0; + return ret; } static int gpio_mockup_set_config(struct gpio_chip *gc, @@ -236,7 +249,7 @@ static int gpio_mockup_to_irq(struct gpio_chip *gc, unsigned int offset) { struct gpio_mockup_chip *chip = gpiochip_get_data(gc); - return irq_sim_irqnum(&chip->irqsim, offset); + return irq_create_mapping(chip->irq_sim_domain, offset); } static void gpio_mockup_free(struct gpio_chip *gc, unsigned int offset) @@ -389,6 +402,19 @@ static int gpio_mockup_name_lines(struct device *dev, return 0; } +static void gpio_mockup_dispose_mappings(void *data) +{ + struct gpio_mockup_chip *chip = data; + struct gpio_chip *gc = &chip->gc; + int i, irq; + + for (i = 0; i < gc->ngpio; i++) { + irq = irq_find_mapping(chip->irq_sim_domain, i); + if (irq) + irq_dispose_mapping(irq); + } +} + static int gpio_mockup_probe(struct platform_device *pdev) { struct gpio_mockup_chip *chip; @@ -456,8 +482,13 @@ static int gpio_mockup_probe(struct platform_device *pdev) return rv; } - rv = devm_irq_sim_init(dev, &chip->irqsim, gc->ngpio); - if (rv < 0) + chip->irq_sim_domain = devm_irq_domain_create_sim(dev, NULL, + gc->ngpio); + if (IS_ERR(chip->irq_sim_domain)) + return PTR_ERR(chip->irq_sim_domain); + + rv = devm_add_action_or_reset(dev, gpio_mockup_dispose_mappings, chip); + if (rv) return rv; rv = devm_gpiochip_add_data(dev, &chip->gc, chip); diff --git a/drivers/iio/dummy/iio_dummy_evgen.c b/drivers/iio/dummy/iio_dummy_evgen.c index a6edf30567aa..409fe0f7df1c 100644 --- a/drivers/iio/dummy/iio_dummy_evgen.c +++ b/drivers/iio/dummy/iio_dummy_evgen.c @@ -37,8 +37,7 @@ struct iio_dummy_eventgen { struct iio_dummy_regs regs[IIO_EVENTGEN_NO]; struct mutex lock; bool inuse[IIO_EVENTGEN_NO]; - struct irq_sim irq_sim; - int base; + struct irq_domain *irq_sim_domain; }; /* We can only ever have one instance of this 'device' */ @@ -46,19 +45,17 @@ static struct iio_dummy_eventgen *iio_evgen; static int iio_dummy_evgen_create(void) { - int ret; - iio_evgen = kzalloc(sizeof(*iio_evgen), GFP_KERNEL); if (!iio_evgen) return -ENOMEM; - ret = irq_sim_init(&iio_evgen->irq_sim, IIO_EVENTGEN_NO); - if (ret < 0) { + iio_evgen->irq_sim_domain = irq_domain_create_sim(NULL, + IIO_EVENTGEN_NO); + if (IS_ERR(iio_evgen->irq_sim_domain)) { kfree(iio_evgen); - return ret; + return PTR_ERR(iio_evgen->irq_sim_domain); } - iio_evgen->base = irq_sim_irqnum(&iio_evgen->irq_sim, 0); mutex_init(&iio_evgen->lock); return 0; @@ -80,7 +77,7 @@ int iio_dummy_evgen_get_irq(void) mutex_lock(&iio_evgen->lock); for (i = 0; i < IIO_EVENTGEN_NO; i++) { if (!iio_evgen->inuse[i]) { - ret = irq_sim_irqnum(&iio_evgen->irq_sim, i); + ret = irq_create_mapping(iio_evgen->irq_sim_domain, i); iio_evgen->inuse[i] = true; break; } @@ -101,21 +98,27 @@ EXPORT_SYMBOL_GPL(iio_dummy_evgen_get_irq); */ void iio_dummy_evgen_release_irq(int irq) { + struct irq_data *irqd = irq_get_irq_data(irq); + mutex_lock(&iio_evgen->lock); - iio_evgen->inuse[irq - iio_evgen->base] = false; + iio_evgen->inuse[irqd_to_hwirq(irqd)] = false; + irq_dispose_mapping(irq); mutex_unlock(&iio_evgen->lock); } EXPORT_SYMBOL_GPL(iio_dummy_evgen_release_irq); struct iio_dummy_regs *iio_dummy_evgen_get_regs(int irq) { - return &iio_evgen->regs[irq - iio_evgen->base]; + struct irq_data *irqd = irq_get_irq_data(irq); + + return &iio_evgen->regs[irqd_to_hwirq(irqd)]; 
+ } EXPORT_SYMBOL_GPL(iio_dummy_evgen_get_regs); static void iio_dummy_evgen_free(void) { - irq_sim_fini(&iio_evgen->irq_sim); + irq_domain_remove_sim(iio_evgen->irq_sim_domain); kfree(iio_evgen); } @@ -131,7 +134,7 @@ static ssize_t iio_evgen_poke(struct device *dev, { struct iio_dev_attr *this_attr = to_iio_dev_attr(attr); unsigned long event; - int ret; + int ret, irq; ret = kstrtoul(buf, 10, &event); if (ret) @@ -140,7 +143,10 @@ static ssize_t iio_evgen_poke(struct device *dev, iio_evgen->regs[this_attr->address].reg_id = this_attr->address; iio_evgen->regs[this_attr->address].reg_data = event; - irq_sim_fire(&iio_evgen->irq_sim, this_attr->address); + irq = irq_find_mapping(iio_evgen->irq_sim_domain, this_attr->address); + ret = irq_set_irqchip_state(irq, IRQCHIP_STATE_PENDING, true); + if (ret) + return ret; return len; } diff --git a/include/linux/irq_sim.h b/include/linux/irq_sim.h index 4500d453a63e..ab831e5ae748 100644 --- a/include/linux/irq_sim.h +++ b/include/linux/irq_sim.h @@ -1,41 +1,26 @@ /* SPDX-License-Identifier: GPL-2.0+ */ /* * Copyright (C) 2017-2018 Bartosz Golaszewski + * Copyright (C) 2020 Bartosz Golaszewski */ #ifndef _LINUX_IRQ_SIM_H #define _LINUX_IRQ_SIM_H -#include #include +#include +#include /* * Provides a framework for allocating simulated interrupts which can be * requested like normal irqs and enqueued from process context. */ -struct irq_sim_work_ctx { - struct irq_work work; - unsigned long *pending; -}; - -struct irq_sim_irq_ctx { - int irqnum; - bool enabled; -}; - -struct irq_sim { - struct irq_sim_work_ctx work_ctx; - int irq_base; - unsigned int irq_count; - struct irq_sim_irq_ctx *irqs; -}; - -int irq_sim_init(struct irq_sim *sim, unsigned int num_irqs); -int devm_irq_sim_init(struct device *dev, struct irq_sim *sim, - unsigned int num_irqs); -void irq_sim_fini(struct irq_sim *sim); -void irq_sim_fire(struct irq_sim *sim, unsigned int offset); -int irq_sim_irqnum(struct irq_sim *sim, unsigned int offset); +struct irq_domain *irq_domain_create_sim(struct fwnode_handle *fwnode, + unsigned int num_irqs); +struct irq_domain *devm_irq_domain_create_sim(struct device *dev, + struct fwnode_handle *fwnode, + unsigned int num_irqs); +void irq_domain_remove_sim(struct irq_domain *domain); #endif /* _LINUX_IRQ_SIM_H */ diff --git a/kernel/irq/Kconfig b/kernel/irq/Kconfig index 20d501af4f2e..d63c324895ea 100644 --- a/kernel/irq/Kconfig +++ b/kernel/irq/Kconfig @@ -72,6 +72,7 @@ config IRQ_DOMAIN config IRQ_SIM bool select IRQ_WORK + select IRQ_DOMAIN # Support for hierarchical irq domains config IRQ_DOMAIN_HIERARCHY diff --git a/kernel/irq/irq_sim.c b/kernel/irq/irq_sim.c index b992f88c5613..48006608baf0 100644 --- a/kernel/irq/irq_sim.c +++ b/kernel/irq/irq_sim.c @@ -1,14 +1,31 @@ // SPDX-License-Identifier: GPL-2.0+ /* * Copyright (C) 2017-2018 Bartosz Golaszewski + * Copyright (C) 2020 Bartosz Golaszewski */ -#include -#include #include +#include +#include +#include +#include + +struct irq_sim_work_ctx { + struct irq_work work; + int irq_base; + unsigned int irq_count; + unsigned long *pending; + struct irq_domain *domain; +}; + +struct irq_sim_irq_ctx { + int irqnum; + bool enabled; + struct irq_sim_work_ctx *work_ctx; +}; struct irq_sim_devres { - struct irq_sim *sim; + struct irq_domain *domain; }; static void irq_sim_irqmask(struct irq_data *data) @@ -36,159 +53,205 @@ static int irq_sim_set_type(struct irq_data *data, unsigned int type) return 0; } +static int irq_sim_get_irqchip_state(struct irq_data *data, + enum irqchip_irq_state which, bool 
*state) +{ + struct irq_sim_irq_ctx *irq_ctx = irq_data_get_irq_chip_data(data); + irq_hw_number_t hwirq = irqd_to_hwirq(data); + + switch (which) { + case IRQCHIP_STATE_PENDING: + if (irq_ctx->enabled) + *state = test_bit(hwirq, irq_ctx->work_ctx->pending); + break; + default: + return -EINVAL; + } + + return 0; +} + +static int irq_sim_set_irqchip_state(struct irq_data *data, + enum irqchip_irq_state which, bool state) +{ + struct irq_sim_irq_ctx *irq_ctx = irq_data_get_irq_chip_data(data); + irq_hw_number_t hwirq = irqd_to_hwirq(data); + + switch (which) { + case IRQCHIP_STATE_PENDING: + if (irq_ctx->enabled) { + assign_bit(hwirq, irq_ctx->work_ctx->pending, state); + if (state) + irq_work_queue(&irq_ctx->work_ctx->work); + } + break; + default: + return -EINVAL; + } + + return 0; +} + static struct irq_chip irq_sim_irqchip = { - .name = "irq_sim", - .irq_mask = irq_sim_irqmask, - .irq_unmask = irq_sim_irqunmask, - .irq_set_type = irq_sim_set_type, + .name = "irq_sim", + .irq_mask = irq_sim_irqmask, + .irq_unmask = irq_sim_irqunmask, + .irq_set_type = irq_sim_set_type, + .irq_get_irqchip_state = irq_sim_get_irqchip_state, + .irq_set_irqchip_state = irq_sim_set_irqchip_state, }; static void irq_sim_handle_irq(struct irq_work *work) { struct irq_sim_work_ctx *work_ctx; unsigned int offset = 0; - struct irq_sim *sim; int irqnum; work_ctx = container_of(work, struct irq_sim_work_ctx, work); - sim = container_of(work_ctx, struct irq_sim, work_ctx); - while (!bitmap_empty(work_ctx->pending, sim->irq_count)) { + while (!bitmap_empty(work_ctx->pending, work_ctx->irq_count)) { offset = find_next_bit(work_ctx->pending, - sim->irq_count, offset); + work_ctx->irq_count, offset); clear_bit(offset, work_ctx->pending); - irqnum = irq_sim_irqnum(sim, offset); + irqnum = irq_find_mapping(work_ctx->domain, offset); handle_simple_irq(irq_to_desc(irqnum)); } } +static int irq_sim_domain_map(struct irq_domain *domain, + unsigned int virq, irq_hw_number_t hw) +{ + struct irq_sim_work_ctx *work_ctx = domain->host_data; + struct irq_sim_irq_ctx *irq_ctx; + + irq_ctx = kzalloc(sizeof(*irq_ctx), GFP_KERNEL); + if (!irq_ctx) + return -ENOMEM; + + irq_set_chip(virq, &irq_sim_irqchip); + irq_set_chip_data(virq, irq_ctx); + irq_set_handler(virq, handle_simple_irq); + irq_modify_status(virq, IRQ_NOREQUEST | IRQ_NOAUTOEN, IRQ_NOPROBE); + irq_ctx->work_ctx = work_ctx; + + return 0; +} + +static void irq_sim_domain_unmap(struct irq_domain *domain, unsigned int virq) +{ + struct irq_sim_irq_ctx *irq_ctx; + struct irq_data *irqd; + + irqd = irq_domain_get_irq_data(domain, virq); + irq_ctx = irq_data_get_irq_chip_data(irqd); + + irq_set_handler(virq, NULL); + irq_domain_reset_irq_data(irqd); + kfree(irq_ctx); +} + +static const struct irq_domain_ops irq_sim_domain_ops = { + .map = irq_sim_domain_map, + .unmap = irq_sim_domain_unmap, +}; + /** - * irq_sim_init - Initialize the interrupt simulator: allocate a range of - * dummy interrupts. + * irq_domain_create_sim - Create a new interrupt simulator irq_domain and + * allocate a range of dummy interrupts. * - * @sim: The interrupt simulator object to initialize. - * @num_irqs: Number of interrupts to allocate + * @fnode: struct fwnode_handle to be associated with this domain. + * @num_irqs: Number of interrupts to allocate. * - * On success: return the base of the allocated interrupt range. - * On failure: a negative errno. + * On success: return a new irq_domain object. + * On failure: a negative errno wrapped with ERR_PTR(). 
*/ -int irq_sim_init(struct irq_sim *sim, unsigned int num_irqs) +struct irq_domain *irq_domain_create_sim(struct fwnode_handle *fwnode, + unsigned int num_irqs) { - int i; + struct irq_sim_work_ctx *work_ctx; - sim->irqs = kmalloc_array(num_irqs, sizeof(*sim->irqs), GFP_KERNEL); - if (!sim->irqs) - return -ENOMEM; + work_ctx = kmalloc(sizeof(*work_ctx), GFP_KERNEL); + if (!work_ctx) + goto err_out; - sim->irq_base = irq_alloc_descs(-1, 0, num_irqs, 0); - if (sim->irq_base < 0) { - kfree(sim->irqs); - return sim->irq_base; - } + work_ctx->pending = bitmap_zalloc(num_irqs, GFP_KERNEL); + if (!work_ctx->pending) + goto err_free_work_ctx; - sim->work_ctx.pending = bitmap_zalloc(num_irqs, GFP_KERNEL); - if (!sim->work_ctx.pending) { - kfree(sim->irqs); - irq_free_descs(sim->irq_base, num_irqs); - return -ENOMEM; - } + work_ctx->domain = irq_domain_create_linear(fwnode, num_irqs, + &irq_sim_domain_ops, + work_ctx); + if (!work_ctx->domain) + goto err_free_bitmap; - for (i = 0; i < num_irqs; i++) { - sim->irqs[i].irqnum = sim->irq_base + i; - sim->irqs[i].enabled = false; - irq_set_chip(sim->irq_base + i, &irq_sim_irqchip); - irq_set_chip_data(sim->irq_base + i, &sim->irqs[i]); - irq_set_handler(sim->irq_base + i, &handle_simple_irq); - irq_modify_status(sim->irq_base + i, - IRQ_NOREQUEST | IRQ_NOAUTOEN, IRQ_NOPROBE); - } + work_ctx->irq_count = num_irqs; + init_irq_work(&work_ctx->work, irq_sim_handle_irq); - init_irq_work(&sim->work_ctx.work, irq_sim_handle_irq); - sim->irq_count = num_irqs; + return work_ctx->domain; - return sim->irq_base; +err_free_bitmap: + bitmap_free(work_ctx->pending); +err_free_work_ctx: + kfree(work_ctx); +err_out: + return ERR_PTR(-ENOMEM); } -EXPORT_SYMBOL_GPL(irq_sim_init); +EXPORT_SYMBOL_GPL(irq_domain_create_sim); /** - * irq_sim_fini - Deinitialize the interrupt simulator: free the interrupt - * descriptors and allocated memory. + * irq_domain_remove_sim - Deinitialize the interrupt simulator domain: free + * the interrupt descriptors and allocated memory. * - * @sim: The interrupt simulator to tear down. + * @domain: The interrupt simulator domain to tear down. */ -void irq_sim_fini(struct irq_sim *sim) +void irq_domain_remove_sim(struct irq_domain *domain) { - irq_work_sync(&sim->work_ctx.work); - bitmap_free(sim->work_ctx.pending); - irq_free_descs(sim->irq_base, sim->irq_count); - kfree(sim->irqs); + struct irq_sim_work_ctx *work_ctx = domain->host_data; + + irq_work_sync(&work_ctx->work); + bitmap_free(work_ctx->pending); + kfree(work_ctx); + + irq_domain_remove(domain); } -EXPORT_SYMBOL_GPL(irq_sim_fini); +EXPORT_SYMBOL_GPL(irq_domain_remove_sim); -static void devm_irq_sim_release(struct device *dev, void *res) +static void devm_irq_domain_release_sim(struct device *dev, void *res) { struct irq_sim_devres *this = res; - irq_sim_fini(this->sim); + irq_domain_remove_sim(this->domain); } /** - * irq_sim_init - Initialize the interrupt simulator for a managed device. + * devm_irq_domain_create_sim - Create a new interrupt simulator for + * a managed device. * * @dev: Device to initialize the simulator object for. - * @sim: The interrupt simulator object to initialize. + * @fnode: struct fwnode_handle to be associated with this domain. * @num_irqs: Number of interrupts to allocate * - * On success: return the base of the allocated interrupt range. - * On failure: a negative errno. + * On success: return a new irq_domain object. + * On failure: a negative errno wrapped with ERR_PTR(). 
*/ -int devm_irq_sim_init(struct device *dev, struct irq_sim *sim, - unsigned int num_irqs) +struct irq_domain *devm_irq_domain_create_sim(struct device *dev, + struct fwnode_handle *fwnode, + unsigned int num_irqs) { struct irq_sim_devres *dr; - int rv; - dr = devres_alloc(devm_irq_sim_release, sizeof(*dr), GFP_KERNEL); + dr = devres_alloc(devm_irq_domain_release_sim, + sizeof(*dr), GFP_KERNEL); if (!dr) - return -ENOMEM; + return ERR_PTR(-ENOMEM); - rv = irq_sim_init(sim, num_irqs); - if (rv < 0) { + dr->domain = irq_domain_create_sim(fwnode, num_irqs); + if (IS_ERR(dr->domain)) { devres_free(dr); - return rv; + return dr->domain; } - dr->sim = sim; devres_add(dev, dr); - - return rv; -} -EXPORT_SYMBOL_GPL(devm_irq_sim_init); - -/** - * irq_sim_fire - Enqueue an interrupt. - * - * @sim: The interrupt simulator object. - * @offset: Offset of the simulated interrupt which should be fired. - */ -void irq_sim_fire(struct irq_sim *sim, unsigned int offset) -{ - if (sim->irqs[offset].enabled) { - set_bit(offset, sim->work_ctx.pending); - irq_work_queue(&sim->work_ctx.work); - } -} -EXPORT_SYMBOL_GPL(irq_sim_fire); - -/** - * irq_sim_irqnum - Get the allocated number of a dummy interrupt. - * - * @sim: The interrupt simulator object. - * @offset: Offset of the simulated interrupt for which to retrieve - * the number. - */ -int irq_sim_irqnum(struct irq_sim *sim, unsigned int offset) -{ - return sim->irqs[offset].irqnum; + return dr->domain; } -EXPORT_SYMBOL_GPL(irq_sim_irqnum); +EXPORT_SYMBOL_GPL(devm_irq_domain_create_sim); -- cgit v1.2.3 From 3db9983e4327f773c490de2a8c66d6000561d88a Mon Sep 17 00:00:00 2001 From: Jacob Pan Date: Sat, 16 May 2020 14:20:44 +0800 Subject: iommu/vt-d: Move domain helper to header Move domain helper to header to be used by SVA code. Signed-off-by: Jacob Pan Signed-off-by: Lu Baolu Reviewed-by: Eric Auger Reviewed-by: Kevin Tian Link: https://lore.kernel.org/r/20200516062101.29541-2-baolu.lu@linux.intel.com Signed-off-by: Joerg Roedel --- drivers/iommu/intel-iommu.c | 6 ------ include/linux/intel-iommu.h | 6 ++++++ 2 files changed, 6 insertions(+), 6 deletions(-) (limited to 'include/linux') diff --git a/drivers/iommu/intel-iommu.c b/drivers/iommu/intel-iommu.c index 2ff8d69ce4f8..8027f21073eb 100644 --- a/drivers/iommu/intel-iommu.c +++ b/drivers/iommu/intel-iommu.c @@ -441,12 +441,6 @@ static void init_translation_status(struct intel_iommu *iommu) iommu->flags |= VTD_FLAG_TRANS_PRE_ENABLED; } -/* Convert generic 'struct iommu_domain to private struct dmar_domain */ -static struct dmar_domain *to_dmar_domain(struct iommu_domain *dom) -{ - return container_of(dom, struct dmar_domain, domain); -} - static int __init intel_iommu_setup(char *str) { if (!str) diff --git a/include/linux/intel-iommu.h b/include/linux/intel-iommu.h index 980234ae0312..ed7171d2ae1f 100644 --- a/include/linux/intel-iommu.h +++ b/include/linux/intel-iommu.h @@ -595,6 +595,12 @@ static inline void __iommu_flush_cache( clflush_cache_range(addr, size); } +/* Convert generic struct iommu_domain to private struct dmar_domain */ +static inline struct dmar_domain *to_dmar_domain(struct iommu_domain *dom) +{ + return container_of(dom, struct dmar_domain, domain); +} + /* * 0: readable * 1: writable -- cgit v1.2.3 From b0d1f8741b812352fe0e5f3b2381427085f23e19 Mon Sep 17 00:00:00 2001 From: Jacob Pan Date: Sat, 16 May 2020 14:20:46 +0800 Subject: iommu/vt-d: Add nested translation helper function Nested translation mode is supported in VT-d 3.0 Spec.CH 3.8. 
With PASID granular translation type set to 0x11b, translation result from the first level(FL) also subject to a second level(SL) page table translation. This mode is used for SVA virtualization, where FL performs guest virtual to guest physical translation and SL performs guest physical to host physical translation. This patch adds a helper function for setting up nested translation where second level comes from a domain and first level comes from a guest PGD. Signed-off-by: Jacob Pan Signed-off-by: Liu Yi L Signed-off-by: Lu Baolu Reviewed-by: Eric Auger Link: https://lore.kernel.org/r/20200516062101.29541-4-baolu.lu@linux.intel.com Signed-off-by: Joerg Roedel --- drivers/iommu/intel-iommu.c | 25 ------- drivers/iommu/intel-pasid.c | 174 +++++++++++++++++++++++++++++++++++++++++++- drivers/iommu/intel-pasid.h | 10 +++ include/linux/intel-iommu.h | 20 +++++ include/uapi/linux/iommu.h | 5 ++ 5 files changed, 206 insertions(+), 28 deletions(-) (limited to 'include/linux') diff --git a/drivers/iommu/intel-iommu.c b/drivers/iommu/intel-iommu.c index 8027f21073eb..7e85c09eec71 100644 --- a/drivers/iommu/intel-iommu.c +++ b/drivers/iommu/intel-iommu.c @@ -296,31 +296,6 @@ static inline void context_clear_entry(struct context_entry *context) static struct dmar_domain *si_domain; static int hw_pass_through = 1; -/* si_domain contains mulitple devices */ -#define DOMAIN_FLAG_STATIC_IDENTITY BIT(0) - -/* - * This is a DMA domain allocated through the iommu domain allocation - * interface. But one or more devices belonging to this domain have - * been chosen to use a private domain. We should avoid to use the - * map/unmap/iova_to_phys APIs on it. - */ -#define DOMAIN_FLAG_LOSE_CHILDREN BIT(1) - -/* - * When VT-d works in the scalable mode, it allows DMA translation to - * happen through either first level or second level page table. This - * bit marks that the DMA translation for the domain goes through the - * first level page table, otherwise, it goes through the second level. - */ -#define DOMAIN_FLAG_USE_FIRST_LEVEL BIT(2) - -/* - * Domain represents a virtual machine which demands iommu nested - * translation mode support. - */ -#define DOMAIN_FLAG_NESTING_MODE BIT(3) - #define for_each_domain_iommu(idx, domain) \ for (idx = 0; idx < g_num_of_iommus; idx++) \ if (domain->iommu_refcnt[idx]) diff --git a/drivers/iommu/intel-pasid.c b/drivers/iommu/intel-pasid.c index d9cea3011b58..c7fa1b79eaf7 100644 --- a/drivers/iommu/intel-pasid.c +++ b/drivers/iommu/intel-pasid.c @@ -359,6 +359,16 @@ pasid_set_flpm(struct pasid_entry *pe, u64 value) pasid_set_bits(&pe->val[2], GENMASK_ULL(3, 2), value << 2); } +/* + * Setup the Extended Access Flag Enable (EAFE) field (Bit 135) + * of a scalable mode PASID entry. 
+ */ +static inline void +pasid_set_eafe(struct pasid_entry *pe) +{ + pasid_set_bits(&pe->val[2], 1 << 7, 1 << 7); +} + static void pasid_cache_invalidation_with_pasid(struct intel_iommu *iommu, u16 did, int pasid) @@ -492,7 +502,7 @@ int intel_pasid_setup_first_level(struct intel_iommu *iommu, pasid_set_page_snoop(pte, !!ecap_smpwc(iommu->ecap)); /* Setup Present and PASID Granular Transfer Type: */ - pasid_set_translation_type(pte, 1); + pasid_set_translation_type(pte, PASID_ENTRY_PGTT_FL_ONLY); pasid_set_present(pte); pasid_flush_caches(iommu, pte, pasid, did); @@ -561,7 +571,7 @@ int intel_pasid_setup_second_level(struct intel_iommu *iommu, pasid_set_domain_id(pte, did); pasid_set_slptr(pte, pgd_val); pasid_set_address_width(pte, agaw); - pasid_set_translation_type(pte, 2); + pasid_set_translation_type(pte, PASID_ENTRY_PGTT_SL_ONLY); pasid_set_fault_enable(pte); pasid_set_page_snoop(pte, !!ecap_smpwc(iommu->ecap)); @@ -595,7 +605,7 @@ int intel_pasid_setup_pass_through(struct intel_iommu *iommu, pasid_clear_entry(pte); pasid_set_domain_id(pte, did); pasid_set_address_width(pte, iommu->agaw); - pasid_set_translation_type(pte, 4); + pasid_set_translation_type(pte, PASID_ENTRY_PGTT_PT); pasid_set_fault_enable(pte); pasid_set_page_snoop(pte, !!ecap_smpwc(iommu->ecap)); @@ -609,3 +619,161 @@ int intel_pasid_setup_pass_through(struct intel_iommu *iommu, return 0; } + +static int +intel_pasid_setup_bind_data(struct intel_iommu *iommu, struct pasid_entry *pte, + struct iommu_gpasid_bind_data_vtd *pasid_data) +{ + /* + * Not all guest PASID table entry fields are passed down during bind, + * here we only set up the ones that are dependent on guest settings. + * Execution related bits such as NXE, SMEP are not supported. + * Other fields, such as snoop related, are set based on host needs + * regardless of guest settings. + */ + if (pasid_data->flags & IOMMU_SVA_VTD_GPASID_SRE) { + if (!ecap_srs(iommu->ecap)) { + pr_err_ratelimited("No supervisor request support on %s\n", + iommu->name); + return -EINVAL; + } + pasid_set_sre(pte); + } + + if (pasid_data->flags & IOMMU_SVA_VTD_GPASID_EAFE) { + if (!ecap_eafs(iommu->ecap)) { + pr_err_ratelimited("No extended access flag support on %s\n", + iommu->name); + return -EINVAL; + } + pasid_set_eafe(pte); + } + + /* + * Memory type is only applicable to devices inside processor coherent + * domain. Will add MTS support once coherent devices are available. + */ + if (pasid_data->flags & IOMMU_SVA_VTD_GPASID_MTS_MASK) { + pr_warn_ratelimited("No memory type support %s\n", + iommu->name); + return -EINVAL; + } + + return 0; +} + +/** + * intel_pasid_setup_nested() - Set up PASID entry for nested translation. + * This could be used for guest shared virtual address. In this case, the + * first level page tables are used for GVA-GPA translation in the guest, + * second level page tables are used for GPA-HPA translation. 
+ * + * @iommu: IOMMU which the device belong to + * @dev: Device to be set up for translation + * @gpgd: FLPTPTR: First Level Page translation pointer in GPA + * @pasid: PASID to be programmed in the device PASID table + * @pasid_data: Additional PASID info from the guest bind request + * @domain: Domain info for setting up second level page tables + * @addr_width: Address width of the first level (guest) + */ +int intel_pasid_setup_nested(struct intel_iommu *iommu, struct device *dev, + pgd_t *gpgd, int pasid, + struct iommu_gpasid_bind_data_vtd *pasid_data, + struct dmar_domain *domain, int addr_width) +{ + struct pasid_entry *pte; + struct dma_pte *pgd; + int ret = 0; + u64 pgd_val; + int agaw; + u16 did; + + if (!ecap_nest(iommu->ecap)) { + pr_err_ratelimited("IOMMU: %s: No nested translation support\n", + iommu->name); + return -EINVAL; + } + + if (!(domain->flags & DOMAIN_FLAG_NESTING_MODE)) { + pr_err_ratelimited("Domain is not in nesting mode, %x\n", + domain->flags); + return -EINVAL; + } + + pte = intel_pasid_get_entry(dev, pasid); + if (WARN_ON(!pte)) + return -EINVAL; + + /* + * Caller must ensure PASID entry is not in use, i.e. not bind the + * same PASID to the same device twice. + */ + if (pasid_pte_is_present(pte)) + return -EBUSY; + + pasid_clear_entry(pte); + + /* Sanity checking performed by caller to make sure address + * width matching in two dimensions: + * 1. CPU vs. IOMMU + * 2. Guest vs. Host. + */ + switch (addr_width) { +#ifdef CONFIG_X86 + case ADDR_WIDTH_5LEVEL: + if (!cpu_feature_enabled(X86_FEATURE_LA57) || + !cap_5lp_support(iommu->cap)) { + dev_err_ratelimited(dev, + "5-level paging not supported\n"); + return -EINVAL; + } + + pasid_set_flpm(pte, 1); + break; +#endif + case ADDR_WIDTH_4LEVEL: + pasid_set_flpm(pte, 0); + break; + default: + dev_err_ratelimited(dev, "Invalid guest address width %d\n", + addr_width); + return -EINVAL; + } + + /* First level PGD is in GPA, must be supported by the second level */ + if ((unsigned long long)gpgd > domain->max_addr) { + dev_err_ratelimited(dev, + "Guest PGD %llx not supported, max %llx\n", + (unsigned long long)gpgd, domain->max_addr); + return -EINVAL; + } + pasid_set_flptr(pte, (u64)gpgd); + + ret = intel_pasid_setup_bind_data(iommu, pte, pasid_data); + if (ret) + return ret; + + /* Setup the second level based on the given domain */ + pgd = domain->pgd; + + agaw = iommu_skip_agaw(domain, iommu, &pgd); + if (agaw < 0) { + dev_err_ratelimited(dev, "Invalid domain page table\n"); + return -EINVAL; + } + pgd_val = virt_to_phys(pgd); + pasid_set_slptr(pte, pgd_val); + pasid_set_fault_enable(pte); + + did = domain->iommu_did[iommu->seq_id]; + pasid_set_domain_id(pte, did); + + pasid_set_address_width(pte, agaw); + pasid_set_page_snoop(pte, !!ecap_smpwc(iommu->ecap)); + + pasid_set_translation_type(pte, PASID_ENTRY_PGTT_NESTED); + pasid_set_present(pte); + pasid_flush_caches(iommu, pte, pasid, did); + + return ret; +} diff --git a/drivers/iommu/intel-pasid.h b/drivers/iommu/intel-pasid.h index 92de6df24ccb..ccd50c2ae75c 100644 --- a/drivers/iommu/intel-pasid.h +++ b/drivers/iommu/intel-pasid.h @@ -36,6 +36,7 @@ * to vmalloc or even module mappings. 
*/ #define PASID_FLAG_SUPERVISOR_MODE BIT(0) +#define PASID_FLAG_NESTED BIT(1) /* * The PASID_FLAG_FL5LP flag indicates using 5-level paging for first- @@ -51,6 +52,11 @@ struct pasid_entry { u64 val[8]; }; +#define PASID_ENTRY_PGTT_FL_ONLY (1) +#define PASID_ENTRY_PGTT_SL_ONLY (2) +#define PASID_ENTRY_PGTT_NESTED (3) +#define PASID_ENTRY_PGTT_PT (4) + /* The representative of a PASID table */ struct pasid_table { void *table; /* pasid table pointer */ @@ -99,6 +105,10 @@ int intel_pasid_setup_second_level(struct intel_iommu *iommu, int intel_pasid_setup_pass_through(struct intel_iommu *iommu, struct dmar_domain *domain, struct device *dev, int pasid); +int intel_pasid_setup_nested(struct intel_iommu *iommu, + struct device *dev, pgd_t *pgd, int pasid, + struct iommu_gpasid_bind_data_vtd *pasid_data, + struct dmar_domain *domain, int addr_width); void intel_pasid_tear_down_entry(struct intel_iommu *iommu, struct device *dev, int pasid); diff --git a/include/linux/intel-iommu.h b/include/linux/intel-iommu.h index ed7171d2ae1f..e0d1fed7cbe4 100644 --- a/include/linux/intel-iommu.h +++ b/include/linux/intel-iommu.h @@ -42,6 +42,9 @@ #define DMA_FL_PTE_PRESENT BIT_ULL(0) #define DMA_FL_PTE_XD BIT_ULL(63) +#define ADDR_WIDTH_5LEVEL (57) +#define ADDR_WIDTH_4LEVEL (48) + #define CONTEXT_TT_MULTI_LEVEL 0 #define CONTEXT_TT_DEV_IOTLB 1 #define CONTEXT_TT_PASS_THROUGH 2 @@ -480,6 +483,23 @@ struct context_entry { u64 hi; }; +/* si_domain contains multiple devices */ +#define DOMAIN_FLAG_STATIC_IDENTITY BIT(0) + +/* + * When VT-d works in the scalable mode, it allows DMA translation to + * happen through either first level or second level page table. This + * bit marks that the DMA translation for the domain goes through the + * first level page table, otherwise, it goes through the second level. + */ +#define DOMAIN_FLAG_USE_FIRST_LEVEL BIT(1) + +/* + * Domain represents a virtual machine which demands iommu nested + * translation mode support. + */ +#define DOMAIN_FLAG_NESTING_MODE BIT(2) + struct dmar_domain { int nid; /* node id */ diff --git a/include/uapi/linux/iommu.h b/include/uapi/linux/iommu.h index 4ad3496e5c43..e907b7091a46 100644 --- a/include/uapi/linux/iommu.h +++ b/include/uapi/linux/iommu.h @@ -285,6 +285,11 @@ struct iommu_gpasid_bind_data_vtd { __u32 emt; }; +#define IOMMU_SVA_VTD_GPASID_MTS_MASK (IOMMU_SVA_VTD_GPASID_CD | \ + IOMMU_SVA_VTD_GPASID_EMTE | \ + IOMMU_SVA_VTD_GPASID_PCD | \ + IOMMU_SVA_VTD_GPASID_PWT) + /** * struct iommu_gpasid_bind_data - Information about device and guest PASID binding * @version: Version of this data structure -- cgit v1.2.3 From 56722a4398a306585ca3ed39ff54fc907af98618 Mon Sep 17 00:00:00 2001 From: Jacob Pan Date: Sat, 16 May 2020 14:20:47 +0800 Subject: iommu/vt-d: Add bind guest PASID support When supporting guest SVA with an emulated IOMMU, the guest PASID table is shadowed in the VMM. Updates to the guest vIOMMU PASID table will result in a PASID cache flush, which will be passed down to the host as bind guest PASID calls. The SL page tables will be harvested from the device's default domain (request w/o PASID), or from the aux domain in the case of a mediated device. .-------------. .---------------------------. | vIOMMU | | Guest process CR3, FL only| | | '---------------------------' .----------------/ | PASID Entry |--- PASID cache flush - '-------------' | | | V | | CR3 in GPA '-------------' Guest ------| Shadow |--------------------------|-------- v v v Host .-------------. .----------------------.
| pIOMMU | | Bind FL for GVA-GPA | | | '----------------------' .----------------/ | | PASID Entry | V (Nested xlate) '----------------\.------------------------------. | | |SL for GPA-HPA, default domain| | | '------------------------------' '-------------' Where: - FL = First level/stage one page tables - SL = Second level/stage two page tables Signed-off-by: Jacob Pan Signed-off-by: Liu Yi L Signed-off-by: Lu Baolu Link: https://lore.kernel.org/r/20200516062101.29541-5-baolu.lu@linux.intel.com Signed-off-by: Joerg Roedel --- drivers/iommu/intel-iommu.c | 4 + drivers/iommu/intel-svm.c | 200 ++++++++++++++++++++++++++++++++++++++++++++ include/linux/intel-iommu.h | 6 +- include/linux/intel-svm.h | 12 +++ 4 files changed, 221 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/drivers/iommu/intel-iommu.c b/drivers/iommu/intel-iommu.c index 7e85c09eec71..f42c548f8421 100644 --- a/drivers/iommu/intel-iommu.c +++ b/drivers/iommu/intel-iommu.c @@ -5780,6 +5780,10 @@ const struct iommu_ops intel_iommu_ops = { .is_attach_deferred = intel_iommu_is_attach_deferred, .def_domain_type = device_def_domain_type, .pgsize_bitmap = INTEL_IOMMU_PGSIZES, +#ifdef CONFIG_INTEL_IOMMU_SVM + .sva_bind_gpasid = intel_svm_bind_gpasid, + .sva_unbind_gpasid = intel_svm_unbind_gpasid, +#endif }; static void quirk_iommu_igfx(struct pci_dev *dev) diff --git a/drivers/iommu/intel-svm.c b/drivers/iommu/intel-svm.c index 2998418f0a38..7d3405c5a198 100644 --- a/drivers/iommu/intel-svm.c +++ b/drivers/iommu/intel-svm.c @@ -226,6 +226,206 @@ static LIST_HEAD(global_svm_list); list_for_each_entry((sdev), &(svm)->devs, list) \ if ((d) != (sdev)->dev) {} else +int intel_svm_bind_gpasid(struct iommu_domain *domain, struct device *dev, + struct iommu_gpasid_bind_data *data) +{ + struct intel_iommu *iommu = intel_svm_device_to_iommu(dev); + struct dmar_domain *dmar_domain; + struct intel_svm_dev *sdev; + struct intel_svm *svm; + int ret = 0; + + if (WARN_ON(!iommu) || !data) + return -EINVAL; + + if (data->version != IOMMU_GPASID_BIND_VERSION_1 || + data->format != IOMMU_PASID_FORMAT_INTEL_VTD) + return -EINVAL; + + if (!dev_is_pci(dev)) + return -ENOTSUPP; + + /* VT-d supports devices with full 20 bit PASIDs only */ + if (pci_max_pasids(to_pci_dev(dev)) != PASID_MAX) + return -EINVAL; + + /* + * We only check host PASID range, we have no knowledge to check + * guest PASID range. + */ + if (data->hpasid <= 0 || data->hpasid >= PASID_MAX) + return -EINVAL; + + dmar_domain = to_dmar_domain(domain); + + mutex_lock(&pasid_mutex); + svm = ioasid_find(NULL, data->hpasid, NULL); + if (IS_ERR(svm)) { + ret = PTR_ERR(svm); + goto out; + } + + if (svm) { + /* + * If we found svm for the PASID, there must be at + * least one device bond, otherwise svm should be freed. + */ + if (WARN_ON(list_empty(&svm->devs))) { + ret = -EINVAL; + goto out; + } + + for_each_svm_dev(sdev, svm, dev) { + /* + * For devices with aux domains, we should allow + * multiple bind calls with the same PASID and pdev. + */ + if (iommu_dev_feature_enabled(dev, + IOMMU_DEV_FEAT_AUX)) { + sdev->users++; + } else { + dev_warn_ratelimited(dev, + "Already bound with PASID %u\n", + svm->pasid); + ret = -EBUSY; + } + goto out; + } + } else { + /* We come here when PASID has never been bond to a device. */ + svm = kzalloc(sizeof(*svm), GFP_KERNEL); + if (!svm) { + ret = -ENOMEM; + goto out; + } + /* REVISIT: upper layer/VFIO can track host process that bind + * the PASID. ioasid_set = mm might be sufficient for vfio to + * check pasid VMM ownership. 
We can drop the following line + * once VFIO and IOASID set check is in place. + */ + svm->mm = get_task_mm(current); + svm->pasid = data->hpasid; + if (data->flags & IOMMU_SVA_GPASID_VAL) { + svm->gpasid = data->gpasid; + svm->flags |= SVM_FLAG_GUEST_PASID; + } + ioasid_set_data(data->hpasid, svm); + INIT_LIST_HEAD_RCU(&svm->devs); + mmput(svm->mm); + } + sdev = kzalloc(sizeof(*sdev), GFP_KERNEL); + if (!sdev) { + ret = -ENOMEM; + goto out; + } + sdev->dev = dev; + + /* Only count users if device has aux domains */ + if (iommu_dev_feature_enabled(dev, IOMMU_DEV_FEAT_AUX)) + sdev->users = 1; + + /* Set up device context entry for PASID if not enabled already */ + ret = intel_iommu_enable_pasid(iommu, sdev->dev); + if (ret) { + dev_err_ratelimited(dev, "Failed to enable PASID capability\n"); + kfree(sdev); + goto out; + } + + /* + * PASID table is per device for better security. Therefore, for + * each bind of a new device even with an existing PASID, we need to + * call the nested mode setup function here. + */ + spin_lock(&iommu->lock); + ret = intel_pasid_setup_nested(iommu, dev, (pgd_t *)data->gpgd, + data->hpasid, &data->vtd, dmar_domain, + data->addr_width); + spin_unlock(&iommu->lock); + if (ret) { + dev_err_ratelimited(dev, "Failed to set up PASID %llu in nested mode, Err %d\n", + data->hpasid, ret); + /* + * PASID entry should be in cleared state if nested mode + * set up failed. So we only need to clear IOASID tracking + * data such that free call will succeed. + */ + kfree(sdev); + goto out; + } + + svm->flags |= SVM_FLAG_GUEST_MODE; + + init_rcu_head(&sdev->rcu); + list_add_rcu(&sdev->list, &svm->devs); + out: + if (!IS_ERR_OR_NULL(svm) && list_empty(&svm->devs)) { + ioasid_set_data(data->hpasid, NULL); + kfree(svm); + } + + mutex_unlock(&pasid_mutex); + return ret; +} + +int intel_svm_unbind_gpasid(struct device *dev, int pasid) +{ + struct intel_iommu *iommu = intel_svm_device_to_iommu(dev); + struct intel_svm_dev *sdev; + struct intel_svm *svm; + int ret = -EINVAL; + + if (WARN_ON(!iommu)) + return -EINVAL; + + mutex_lock(&pasid_mutex); + svm = ioasid_find(NULL, pasid, NULL); + if (!svm) { + ret = -EINVAL; + goto out; + } + + if (IS_ERR(svm)) { + ret = PTR_ERR(svm); + goto out; + } + + for_each_svm_dev(sdev, svm, dev) { + ret = 0; + if (iommu_dev_feature_enabled(dev, IOMMU_DEV_FEAT_AUX)) + sdev->users--; + if (!sdev->users) { + list_del_rcu(&sdev->list); + intel_pasid_tear_down_entry(iommu, dev, svm->pasid); + intel_flush_svm_range_dev(svm, sdev, 0, -1, 0); + /* TODO: Drain in flight PRQ for the PASID since it + * may get reused soon, we don't want to + * confuse with its previous life. + * intel_svm_drain_prq(dev, pasid); + */ + kfree_rcu(sdev, rcu); + + if (list_empty(&svm->devs)) { + /* + * We do not free the IOASID here in that + * IOMMU driver did not allocate it. + * Unlike native SVM, IOASID for guest use was + * allocated prior to the bind call. + * In any case, if the free call comes before + * the unbind, IOMMU driver will get notified + * and perform cleanup. 
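+ * Put differently: whoever allocated the IOASID owns its
+ * lifetime; this unbind path only detaches the device and clears
+ * the per-IOASID private data so that a later free from that
+ * owner can succeed.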
+ */ + ioasid_set_data(pasid, NULL); + kfree(svm); + } + } + break; + } +out: + mutex_unlock(&pasid_mutex); + return ret; +} + int intel_svm_bind_mm(struct device *dev, int *pasid, int flags, struct svm_dev_ops *ops) { struct intel_iommu *iommu = intel_svm_device_to_iommu(dev); diff --git a/include/linux/intel-iommu.h b/include/linux/intel-iommu.h index e0d1fed7cbe4..3dfd426dfb03 100644 --- a/include/linux/intel-iommu.h +++ b/include/linux/intel-iommu.h @@ -698,7 +698,9 @@ struct dmar_domain *find_domain(struct device *dev); extern void intel_svm_check(struct intel_iommu *iommu); extern int intel_svm_enable_prq(struct intel_iommu *iommu); extern int intel_svm_finish_prq(struct intel_iommu *iommu); - +int intel_svm_bind_gpasid(struct iommu_domain *domain, struct device *dev, + struct iommu_gpasid_bind_data *data); +int intel_svm_unbind_gpasid(struct device *dev, int pasid); struct svm_dev_ops; struct intel_svm_dev { @@ -715,9 +717,11 @@ struct intel_svm_dev { struct intel_svm { struct mmu_notifier notifier; struct mm_struct *mm; + struct intel_iommu *iommu; int flags; int pasid; + int gpasid; /* In case that guest PASID is different from host PASID */ struct list_head devs; struct list_head list; }; diff --git a/include/linux/intel-svm.h b/include/linux/intel-svm.h index d7c403d0dd27..1b47ca46373e 100644 --- a/include/linux/intel-svm.h +++ b/include/linux/intel-svm.h @@ -44,6 +44,18 @@ struct svm_dev_ops { * do such IOTLB flushes automatically. */ #define SVM_FLAG_SUPERVISOR_MODE (1<<1) +/* + * The SVM_FLAG_GUEST_MODE flag is used when a PASID bind is for guest + * processes. Compared to the host bind, the primary differences are: + * 1. mm life cycle management + * 2. fault reporting + */ +#define SVM_FLAG_GUEST_MODE (1<<2) +/* + * The SVM_FLAG_GUEST_PASID flag is used when a guest has its own PASID space, + * which requires guest and host PASID translation at both directions. + */ +#define SVM_FLAG_GUEST_PASID (1<<3) #ifdef CONFIG_INTEL_IOMMU_SVM -- cgit v1.2.3 From 61a06a16e36d830f7811fbf931668d87197d95b7 Mon Sep 17 00:00:00 2001 From: Jacob Pan Date: Sat, 16 May 2020 14:20:48 +0800 Subject: iommu/vt-d: Support flushing more translation cache types When Shared Virtual Memory is exposed to a guest via vIOMMU, scalable IOTLB invalidation may be passed down from outside IOMMU subsystems. This patch adds invalidation functions that can be used for additional translation cache types. 
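As a sketch (not part of the patch itself; sid, pfsid, qdep, did, pasid and addr are assumed to come from the device and domain state), a caller propagating a guest-issued invalidation could use the two new helpers like this:

	/* Flush 2^4 pages (64KB) tagged with the PASID from the device TLB. */
	qi_flush_dev_iotlb_pasid(iommu, sid, pfsid, pasid, qdep,
				 addr, 4, QI_DEV_IOTLB_GRAN_PASID_SEL);
	/* Then drop the cached PASID table entry for that PASID only. */
	qi_flush_pasid_cache(iommu, did, QI_PC_PASID_SEL, pasid);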
Signed-off-by: Jacob Pan Signed-off-by: Lu Baolu Reviewed-by: Eric Auger Link: https://lore.kernel.org/r/20200516062101.29541-6-baolu.lu@linux.intel.com Signed-off-by: Joerg Roedel --- drivers/iommu/dmar.c | 39 +++++++++++++++++++++++++++++++++++++++ drivers/iommu/intel-pasid.c | 3 ++- include/linux/intel-iommu.h | 21 +++++++++++++++++---- 3 files changed, 58 insertions(+), 5 deletions(-) (limited to 'include/linux') diff --git a/drivers/iommu/dmar.c b/drivers/iommu/dmar.c index f77dae7ba7d4..34ee8f28555f 100644 --- a/drivers/iommu/dmar.c +++ b/drivers/iommu/dmar.c @@ -1421,6 +1421,45 @@ void qi_flush_piotlb(struct intel_iommu *iommu, u16 did, u32 pasid, u64 addr, qi_submit_sync(&desc, iommu); } +/* PASID-based device IOTLB Invalidate */ +void qi_flush_dev_iotlb_pasid(struct intel_iommu *iommu, u16 sid, u16 pfsid, + u32 pasid, u16 qdep, u64 addr, + unsigned int size_order, u64 granu) +{ + unsigned long mask = 1UL << (VTD_PAGE_SHIFT + size_order - 1); + struct qi_desc desc = {.qw1 = 0, .qw2 = 0, .qw3 = 0}; + + desc.qw0 = QI_DEV_EIOTLB_PASID(pasid) | QI_DEV_EIOTLB_SID(sid) | + QI_DEV_EIOTLB_QDEP(qdep) | QI_DEIOTLB_TYPE | + QI_DEV_IOTLB_PFSID(pfsid); + desc.qw1 = QI_DEV_EIOTLB_GLOB(granu); + + /* + * If the S bit is 0, we only flush a single page. If the S bit is set, + * the least significant zero bit of the address indicates the + * invalidation range; see VT-d spec 6.5.2.6. + * e.g. a zero at address bit 12 indicates 8KB, at bit 13 indicates 16KB. + * size order = 0 means a single PAGE_SIZE (4KB) page. + * Max Invs Pending (MIP) is set to 0 for now until we have DIT in + * ECAP. + */ + desc.qw1 |= addr & ~mask; + if (size_order) + desc.qw1 |= QI_DEV_EIOTLB_SIZE; + + qi_submit_sync(&desc, iommu); +} + +void qi_flush_pasid_cache(struct intel_iommu *iommu, u16 did, + u64 granu, int pasid) +{ + struct qi_desc desc = {.qw1 = 0, .qw2 = 0, .qw3 = 0}; + + desc.qw0 = QI_PC_PASID(pasid) | QI_PC_DID(did) | + QI_PC_GRAN(granu) | QI_PC_TYPE; + qi_submit_sync(&desc, iommu); +} + /* * Disable Queued Invalidation interface.
*/ diff --git a/drivers/iommu/intel-pasid.c b/drivers/iommu/intel-pasid.c index c7fa1b79eaf7..5d9d9ff49334 100644 --- a/drivers/iommu/intel-pasid.c +++ b/drivers/iommu/intel-pasid.c @@ -375,7 +375,8 @@ pasid_cache_invalidation_with_pasid(struct intel_iommu *iommu, { struct qi_desc desc; - desc.qw0 = QI_PC_DID(did) | QI_PC_PASID_SEL | QI_PC_PASID(pasid); + desc.qw0 = QI_PC_DID(did) | QI_PC_GRAN(QI_PC_PASID_SEL) | + QI_PC_PASID(pasid) | QI_PC_TYPE; desc.qw1 = 0; desc.qw2 = 0; desc.qw3 = 0; diff --git a/include/linux/intel-iommu.h b/include/linux/intel-iommu.h index 3dfd426dfb03..a9c984b29a72 100644 --- a/include/linux/intel-iommu.h +++ b/include/linux/intel-iommu.h @@ -334,7 +334,7 @@ enum { #define QI_IOTLB_GRAN(gran) (((u64)gran) >> (DMA_TLB_FLUSH_GRANU_OFFSET-4)) #define QI_IOTLB_ADDR(addr) (((u64)addr) & VTD_PAGE_MASK) #define QI_IOTLB_IH(ih) (((u64)ih) << 6) -#define QI_IOTLB_AM(am) (((u8)am)) +#define QI_IOTLB_AM(am) (((u8)am) & 0x3f) #define QI_CC_FM(fm) (((u64)fm) << 48) #define QI_CC_SID(sid) (((u64)sid) << 32) @@ -353,16 +353,21 @@ enum { #define QI_PC_DID(did) (((u64)did) << 16) #define QI_PC_GRAN(gran) (((u64)gran) << 4) -#define QI_PC_ALL_PASIDS (QI_PC_TYPE | QI_PC_GRAN(0)) -#define QI_PC_PASID_SEL (QI_PC_TYPE | QI_PC_GRAN(1)) +/* PASID cache invalidation granu */ +#define QI_PC_ALL_PASIDS 0 +#define QI_PC_PASID_SEL 1 #define QI_EIOTLB_ADDR(addr) ((u64)(addr) & VTD_PAGE_MASK) #define QI_EIOTLB_IH(ih) (((u64)ih) << 6) -#define QI_EIOTLB_AM(am) (((u64)am)) +#define QI_EIOTLB_AM(am) (((u64)am) & 0x3f) #define QI_EIOTLB_PASID(pasid) (((u64)pasid) << 32) #define QI_EIOTLB_DID(did) (((u64)did) << 16) #define QI_EIOTLB_GRAN(gran) (((u64)gran) << 4) +/* QI Dev-IOTLB inv granu */ +#define QI_DEV_IOTLB_GRAN_ALL 1 +#define QI_DEV_IOTLB_GRAN_PASID_SEL 0 + #define QI_DEV_EIOTLB_ADDR(a) ((u64)(a) & VTD_PAGE_MASK) #define QI_DEV_EIOTLB_SIZE (((u64)1) << 11) #define QI_DEV_EIOTLB_GLOB(g) ((u64)g) @@ -679,8 +684,16 @@ extern void qi_flush_iotlb(struct intel_iommu *iommu, u16 did, u64 addr, unsigned int size_order, u64 type); extern void qi_flush_dev_iotlb(struct intel_iommu *iommu, u16 sid, u16 pfsid, u16 qdep, u64 addr, unsigned mask); + void qi_flush_piotlb(struct intel_iommu *iommu, u16 did, u32 pasid, u64 addr, unsigned long npages, bool ih); + +void qi_flush_dev_iotlb_pasid(struct intel_iommu *iommu, u16 sid, u16 pfsid, + u32 pasid, u16 qdep, u64 addr, + unsigned int size_order, u64 granu); +void qi_flush_pasid_cache(struct intel_iommu *iommu, u16 did, u64 granu, + int pasid); + extern int qi_submit_sync(struct qi_desc *desc, struct intel_iommu *iommu); extern int dmar_ir_support(void); -- cgit v1.2.3 From 24f27d32ab6b71dedcbbeeab8f9bdc143b539ac0 Mon Sep 17 00:00:00 2001 From: Lu Baolu Date: Sat, 16 May 2020 14:20:50 +0800 Subject: iommu/vt-d: Enlightened PASID allocation Enabling IOMMU in a guest requires communication with the host driver for certain aspects. Use of a PASID to enable Shared Virtual Addressing (SVA) requires managing PASIDs in the host. The VT-d 3.0 spec provides a Virtual Command Register (VCMD) to facilitate this. Writes to this register in the guest are trapped by the vIOMMU, which proxies the call to the host driver. This virtual command interface consists of a capability register, a virtual command register, and a virtual response register. Refer to sections 10.4.42, 10.4.43 and 10.4.44 for more information. This patch adds the enlightened PASID allocation/free interfaces via the virtual command interface.
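In sketch form, the guest-side allocation handshake implemented below is: write the command, spin until the "In Progress" bit in the response register clears, then decode the status code and result (error handling elided; res and pasid are locals as in vcmd_alloc_pasid()):

	dmar_writeq(iommu->reg + DMAR_VCMD_REG, VCMD_CMD_ALLOC);
	IOMMU_WAIT_OP(iommu, DMAR_VCRSP_REG, dmar_readq,
		      !(res & VCMD_VRSP_IP), res);
	if (VCMD_VRSP_SC(res) == VCMD_VRSP_SC_SUCCESS)
		*pasid = VCMD_VRSP_RESULT_PASID(res);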
Signed-off-by: Liu Yi L Signed-off-by: Lu Baolu Signed-off-by: Jacob Pan Reviewed-by: Eric Auger Reviewed-by: Kevin Tian Link: https://lore.kernel.org/r/20200516062101.29541-8-baolu.lu@linux.intel.com Signed-off-by: Joerg Roedel --- drivers/iommu/intel-pasid.c | 57 +++++++++++++++++++++++++++++++++++++++++++++ drivers/iommu/intel-pasid.h | 13 ++++++++++- include/linux/intel-iommu.h | 1 + 3 files changed, 70 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/drivers/iommu/intel-pasid.c b/drivers/iommu/intel-pasid.c index 5d9d9ff49334..ea8f4ef4e295 100644 --- a/drivers/iommu/intel-pasid.c +++ b/drivers/iommu/intel-pasid.c @@ -27,6 +27,63 @@ static DEFINE_SPINLOCK(pasid_lock); u32 intel_pasid_max_id = PASID_MAX; +int vcmd_alloc_pasid(struct intel_iommu *iommu, unsigned int *pasid) +{ + unsigned long flags; + u8 status_code; + int ret = 0; + u64 res; + + raw_spin_lock_irqsave(&iommu->register_lock, flags); + dmar_writeq(iommu->reg + DMAR_VCMD_REG, VCMD_CMD_ALLOC); + IOMMU_WAIT_OP(iommu, DMAR_VCRSP_REG, dmar_readq, + !(res & VCMD_VRSP_IP), res); + raw_spin_unlock_irqrestore(&iommu->register_lock, flags); + + status_code = VCMD_VRSP_SC(res); + switch (status_code) { + case VCMD_VRSP_SC_SUCCESS: + *pasid = VCMD_VRSP_RESULT_PASID(res); + break; + case VCMD_VRSP_SC_NO_PASID_AVAIL: + pr_info("IOMMU: %s: No PASID available\n", iommu->name); + ret = -ENOSPC; + break; + default: + ret = -ENODEV; + pr_warn("IOMMU: %s: Unexpected error code %d\n", + iommu->name, status_code); + } + + return ret; +} + +void vcmd_free_pasid(struct intel_iommu *iommu, unsigned int pasid) +{ + unsigned long flags; + u8 status_code; + u64 res; + + raw_spin_lock_irqsave(&iommu->register_lock, flags); + dmar_writeq(iommu->reg + DMAR_VCMD_REG, + VCMD_CMD_OPERAND(pasid) | VCMD_CMD_FREE); + IOMMU_WAIT_OP(iommu, DMAR_VCRSP_REG, dmar_readq, + !(res & VCMD_VRSP_IP), res); + raw_spin_unlock_irqrestore(&iommu->register_lock, flags); + + status_code = VCMD_VRSP_SC(res); + switch (status_code) { + case VCMD_VRSP_SC_SUCCESS: + break; + case VCMD_VRSP_SC_INVALID_PASID: + pr_info("IOMMU: %s: Invalid PASID\n", iommu->name); + break; + default: + pr_warn("IOMMU: %s: Unexpected error code %d\n", + iommu->name, status_code); + } +} + /* * Per device pasid table management: */ diff --git a/drivers/iommu/intel-pasid.h b/drivers/iommu/intel-pasid.h index ccd50c2ae75c..a41b09b3ffde 100644 --- a/drivers/iommu/intel-pasid.h +++ b/drivers/iommu/intel-pasid.h @@ -23,6 +23,16 @@ #define is_pasid_enabled(entry) (((entry)->lo >> 3) & 0x1) #define get_pasid_dir_size(entry) (1 << ((((entry)->lo >> 9) & 0x7) + 7)) +/* Virtual command interface for enlightened pasid management. */ +#define VCMD_CMD_ALLOC 0x1 +#define VCMD_CMD_FREE 0x2 +#define VCMD_VRSP_IP 0x1 +#define VCMD_VRSP_SC(e) (((e) >> 1) & 0x3) +#define VCMD_VRSP_SC_SUCCESS 0 +#define VCMD_VRSP_SC_NO_PASID_AVAIL 1 +#define VCMD_VRSP_SC_INVALID_PASID 1 +#define VCMD_VRSP_RESULT_PASID(e) (((e) >> 8) & 0xfffff) +#define VCMD_CMD_OPERAND(e) ((e) << 8) /* * Domain ID reserved for pasid entries programmed for first-level * only and pass-through transfer modes. 
@@ -111,5 +121,6 @@ int intel_pasid_setup_nested(struct intel_iommu *iommu, struct dmar_domain *domain, int addr_width); void intel_pasid_tear_down_entry(struct intel_iommu *iommu, struct device *dev, int pasid); - +int vcmd_alloc_pasid(struct intel_iommu *iommu, unsigned int *pasid); +void vcmd_free_pasid(struct intel_iommu *iommu, unsigned int pasid); #endif /* __INTEL_PASID_H */ diff --git a/include/linux/intel-iommu.h b/include/linux/intel-iommu.h index a9c984b29a72..addb310b4ded 100644 --- a/include/linux/intel-iommu.h +++ b/include/linux/intel-iommu.h @@ -169,6 +169,7 @@ #define ecap_smpwc(e) (((e) >> 48) & 0x1) #define ecap_flts(e) (((e) >> 47) & 0x1) #define ecap_slts(e) (((e) >> 46) & 0x1) +#define ecap_vcs(e) (((e) >> 44) & 0x1) #define ecap_smts(e) (((e) >> 43) & 0x1) #define ecap_dit(e) ((e >> 41) & 0x1) #define ecap_pasid(e) ((e >> 40) & 0x1) -- cgit v1.2.3 From 3375303e82877552f3b2b42309e8233fe715fd9f Mon Sep 17 00:00:00 2001 From: Jacob Pan Date: Sat, 16 May 2020 14:20:51 +0800 Subject: iommu/vt-d: Add custom allocator for IOASID When VT-d driver runs in the guest, PASID allocation must be performed via virtual command interface. This patch registers a custom IOASID allocator which takes precedence over the default XArray based allocator. The resulting IOASID allocation will always come from the host. This ensures that PASID namespace is system- wide. Virtual command registers are used in the guest only, to prevent vmexit cost, we cache the capability and store it during initialization. Signed-off-by: Liu, Yi L Signed-off-by: Jacob Pan Signed-off-by: Lu Baolu Reviewed-by: Eric Auger Link: https://lore.kernel.org/r/20200516062101.29541-9-baolu.lu@linux.intel.com Signed-off-by: Joerg Roedel --- drivers/iommu/dmar.c | 1 + drivers/iommu/intel-iommu.c | 85 +++++++++++++++++++++++++++++++++++++++++++++ include/linux/intel-iommu.h | 7 ++++ 3 files changed, 93 insertions(+) (limited to 'include/linux') diff --git a/drivers/iommu/dmar.c b/drivers/iommu/dmar.c index 34ee8f28555f..66af08ad10fb 100644 --- a/drivers/iommu/dmar.c +++ b/drivers/iommu/dmar.c @@ -963,6 +963,7 @@ static int map_iommu(struct intel_iommu *iommu, u64 phys_addr) warn_invalid_dmar(phys_addr, " returns all ones"); goto unmap; } + iommu->vccap = dmar_readq(iommu->reg + DMAR_VCCAP_REG); /* the registers might be more than one page */ map_size = max_t(int, ecap_max_iotlb_offset(iommu->ecap), diff --git a/drivers/iommu/intel-iommu.c b/drivers/iommu/intel-iommu.c index 627bb5093317..80d0bd561bdd 100644 --- a/drivers/iommu/intel-iommu.c +++ b/drivers/iommu/intel-iommu.c @@ -1726,6 +1726,9 @@ static void free_dmar_iommu(struct intel_iommu *iommu) if (ecap_prs(iommu->ecap)) intel_svm_finish_prq(iommu); } + if (ecap_vcs(iommu->ecap) && vccap_pasid(iommu->vccap)) + ioasid_unregister_allocator(&iommu->pasid_allocator); + #endif } @@ -3038,6 +3041,85 @@ out_unmap: return ret; } +#ifdef CONFIG_INTEL_IOMMU_SVM +static ioasid_t intel_vcmd_ioasid_alloc(ioasid_t min, ioasid_t max, void *data) +{ + struct intel_iommu *iommu = data; + ioasid_t ioasid; + + if (!iommu) + return INVALID_IOASID; + /* + * VT-d virtual command interface always uses the full 20 bit + * PASID range. Host can partition guest PASID range based on + * policies but it is out of guest's control. 
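+ * The range check below therefore only enforces the host-wide
+ * bounds (PASID_MIN to intel_pasid_max_id); any narrower
+ * partitioning policy is applied by the host-side allocator.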
+ */ + if (min < PASID_MIN || max > intel_pasid_max_id) + return INVALID_IOASID; + + if (vcmd_alloc_pasid(iommu, &ioasid)) + return INVALID_IOASID; + + return ioasid; +} + +static void intel_vcmd_ioasid_free(ioasid_t ioasid, void *data) +{ + struct intel_iommu *iommu = data; + + if (!iommu) + return; + /* + * Sanity check the ioasid owner is done at upper layer, e.g. VFIO + * We can only free the PASID when all the devices are unbound. + */ + if (ioasid_find(NULL, ioasid, NULL)) { + pr_alert("Cannot free active IOASID %d\n", ioasid); + return; + } + vcmd_free_pasid(iommu, ioasid); +} + +static void register_pasid_allocator(struct intel_iommu *iommu) +{ + /* + * If we are running in the host, no need for custom allocator + * in that PASIDs are allocated from the host system-wide. + */ + if (!cap_caching_mode(iommu->cap)) + return; + + if (!sm_supported(iommu)) { + pr_warn("VT-d Scalable Mode not enabled, no PASID allocation\n"); + return; + } + + /* + * Register a custom PASID allocator if we are running in a guest, + * guest PASID must be obtained via virtual command interface. + * There can be multiple vIOMMUs in each guest but only one allocator + * is active. All vIOMMU allocators will eventually be calling the same + * host allocator. + */ + if (!ecap_vcs(iommu->ecap) || !vccap_pasid(iommu->vccap)) + return; + + pr_info("Register custom PASID allocator\n"); + iommu->pasid_allocator.alloc = intel_vcmd_ioasid_alloc; + iommu->pasid_allocator.free = intel_vcmd_ioasid_free; + iommu->pasid_allocator.pdata = (void *)iommu; + if (ioasid_register_allocator(&iommu->pasid_allocator)) { + pr_warn("Custom PASID allocator failed, scalable mode disabled\n"); + /* + * Disable scalable mode on this IOMMU if there + * is no custom allocator. Mixing SM capable vIOMMU + * and non-SM vIOMMU are not supported. + */ + intel_iommu_sm = 0; + } +} +#endif + static int __init init_dmars(void) { struct dmar_drhd_unit *drhd; @@ -3155,6 +3237,9 @@ static int __init init_dmars(void) */ for_each_active_iommu(iommu, drhd) { iommu_flush_write_buffer(iommu); +#ifdef CONFIG_INTEL_IOMMU_SVM + register_pasid_allocator(iommu); +#endif iommu_set_root_entry(iommu); iommu->flush.flush_context(iommu, 0, 0, 0, DMA_CCMD_GLOBAL_INVL); iommu->flush.flush_iotlb(iommu, 0, 0, 0, DMA_TLB_GLOBAL_FLUSH); diff --git a/include/linux/intel-iommu.h b/include/linux/intel-iommu.h index addb310b4ded..e14124f74b3a 100644 --- a/include/linux/intel-iommu.h +++ b/include/linux/intel-iommu.h @@ -19,6 +19,7 @@ #include #include #include +#include #include #include @@ -195,6 +196,9 @@ #define ecap_max_handle_mask(e) ((e >> 20) & 0xf) #define ecap_sc_support(e) ((e >> 7) & 0x1) /* Snooping Control */ +/* Virtual command interface capability */ +#define vccap_pasid(v) (((v) & DMA_VCS_PAS)) /* PASID allocation */ + /* IOTLB_REG */ #define DMA_TLB_FLUSH_GRANU_OFFSET 60 #define DMA_TLB_GLOBAL_FLUSH (((u64)1) << 60) @@ -288,6 +292,7 @@ /* PRS_REG */ #define DMA_PRS_PPR ((u32)1) +#define DMA_VCS_PAS ((u64)1) #define IOMMU_WAIT_OP(iommu, offset, op, cond, sts) \ do { \ @@ -555,6 +560,7 @@ struct intel_iommu { u64 reg_size; /* size of hw register set */ u64 cap; u64 ecap; + u64 vccap; u32 gcmd; /* Holds TE, EAFL. 
Don't need SRTP, SFL, WBF */ raw_spinlock_t register_lock; /* protect register handling */ int seq_id; /* sequence id of the iommu */ @@ -575,6 +581,7 @@ struct intel_iommu { #ifdef CONFIG_INTEL_IOMMU_SVM struct page_req_dsc *prq; unsigned char prq_name[16]; /* Name for PRQ interrupt */ + struct ioasid_allocator_ops pasid_allocator; /* Custom allocator for PASIDs */ #endif struct q_inval *qi; /* Queued invalidation info */ u32 *iommu_state; /* Store iommu states between suspend and resume.*/ -- cgit v1.2.3 From e85bb99b79ca5ad2681612a7bb22f94cc2c71866 Mon Sep 17 00:00:00 2001 From: Lu Baolu Date: Sat, 16 May 2020 14:20:52 +0800 Subject: iommu/vt-d: Add get_domain_info() helper Add a get_domain_info() helper to retrieve the valid per-device iommu private data. Signed-off-by: Lu Baolu Link: https://lore.kernel.org/r/20200516062101.29541-10-baolu.lu@linux.intel.com Signed-off-by: Joerg Roedel --- drivers/iommu/intel-iommu.c | 40 +++++++++++++++++++++++++++------------- drivers/iommu/intel-pasid.c | 12 ++++++------ drivers/iommu/intel-svm.c | 2 +- include/linux/intel-iommu.h | 1 + 4 files changed, 35 insertions(+), 20 deletions(-) (limited to 'include/linux') diff --git a/drivers/iommu/intel-iommu.c b/drivers/iommu/intel-iommu.c index 80d0bd561bdd..a13b723ca38d 100644 --- a/drivers/iommu/intel-iommu.c +++ b/drivers/iommu/intel-iommu.c @@ -365,6 +365,21 @@ EXPORT_SYMBOL_GPL(intel_iommu_gfx_mapped); #define DUMMY_DEVICE_DOMAIN_INFO ((struct device_domain_info *)(-1)) #define DEFER_DEVICE_DOMAIN_INFO ((struct device_domain_info *)(-2)) +struct device_domain_info *get_domain_info(struct device *dev) +{ + struct device_domain_info *info; + + if (!dev) + return NULL; + + info = dev->archdata.iommu; + if (unlikely(info == DUMMY_DEVICE_DOMAIN_INFO || + info == DEFER_DEVICE_DOMAIN_INFO)) + return NULL; + + return info; +} + DEFINE_SPINLOCK(device_domain_lock); static LIST_HEAD(device_domain_list); @@ -2429,7 +2444,7 @@ struct dmar_domain *find_domain(struct device *dev) dev = &pci_real_dma_dev(to_pci_dev(dev))->dev; /* No lock here, assumes no domain exit in normal case */ - info = dev->archdata.iommu; + info = get_domain_info(dev); if (likely(info)) return info->domain; @@ -5012,9 +5027,8 @@ static void dmar_remove_one_dev_info(struct device *dev) unsigned long flags; spin_lock_irqsave(&device_domain_lock, flags); - info = dev->archdata.iommu; - if (info && info != DEFER_DEVICE_DOMAIN_INFO - && info != DUMMY_DEVICE_DOMAIN_INFO) + info = get_domain_info(dev); + if (info) __dmar_remove_one_dev_info(info); spin_unlock_irqrestore(&device_domain_lock, flags); } @@ -5104,7 +5118,7 @@ static void intel_iommu_domain_free(struct iommu_domain *domain) static inline bool is_aux_domain(struct device *dev, struct iommu_domain *domain) { - struct device_domain_info *info = dev->archdata.iommu; + struct device_domain_info *info = get_domain_info(dev); return info && info->auxd_enabled && domain->type == IOMMU_DOMAIN_UNMANAGED; @@ -5113,7 +5127,7 @@ is_aux_domain(struct device *dev, struct iommu_domain *domain) static void auxiliary_link_device(struct dmar_domain *domain, struct device *dev) { - struct device_domain_info *info = dev->archdata.iommu; + struct device_domain_info *info = get_domain_info(dev); assert_spin_locked(&device_domain_lock); if (WARN_ON(!info)) @@ -5126,7 +5140,7 @@ static void auxiliary_link_device(struct dmar_domain *domain, static void auxiliary_unlink_device(struct dmar_domain *domain, struct device *dev) { - struct device_domain_info *info = dev->archdata.iommu; + struct device_domain_info 
*info = get_domain_info(dev); assert_spin_locked(&device_domain_lock); if (WARN_ON(!info)) @@ -5214,7 +5228,7 @@ static void aux_domain_remove_dev(struct dmar_domain *domain, return; spin_lock_irqsave(&device_domain_lock, flags); - info = dev->archdata.iommu; + info = get_domain_info(dev); iommu = info->iommu; auxiliary_unlink_device(domain, dev); @@ -5404,7 +5418,7 @@ intel_iommu_sva_invalidate(struct iommu_domain *domain, struct device *dev, spin_lock_irqsave(&device_domain_lock, flags); spin_lock(&iommu->lock); - info = dev->archdata.iommu; + info = get_domain_info(dev); if (!info) { ret = -EINVAL; goto out_unlock; @@ -5768,7 +5782,7 @@ int intel_iommu_enable_pasid(struct intel_iommu *iommu, struct device *dev) spin_lock(&iommu->lock); ret = -EINVAL; - info = dev->archdata.iommu; + info = get_domain_info(dev); if (!info || !info->pasid_supported) goto out; @@ -5864,7 +5878,7 @@ static int intel_iommu_enable_auxd(struct device *dev) return -ENODEV; spin_lock_irqsave(&device_domain_lock, flags); - info = dev->archdata.iommu; + info = get_domain_info(dev); info->auxd_enabled = 1; spin_unlock_irqrestore(&device_domain_lock, flags); @@ -5877,7 +5891,7 @@ static int intel_iommu_disable_auxd(struct device *dev) unsigned long flags; spin_lock_irqsave(&device_domain_lock, flags); - info = dev->archdata.iommu; + info = get_domain_info(dev); if (!WARN_ON(!info)) info->auxd_enabled = 0; spin_unlock_irqrestore(&device_domain_lock, flags); @@ -5954,7 +5968,7 @@ intel_iommu_dev_disable_feat(struct device *dev, enum iommu_dev_features feat) static bool intel_iommu_dev_feat_enabled(struct device *dev, enum iommu_dev_features feat) { - struct device_domain_info *info = dev->archdata.iommu; + struct device_domain_info *info = get_domain_info(dev); if (feat == IOMMU_DEV_FEAT_AUX) return scalable_mode_support() && info && info->auxd_enabled; diff --git a/drivers/iommu/intel-pasid.c b/drivers/iommu/intel-pasid.c index ea8f4ef4e295..c46a068142b9 100644 --- a/drivers/iommu/intel-pasid.c +++ b/drivers/iommu/intel-pasid.c @@ -151,7 +151,7 @@ int intel_pasid_alloc_table(struct device *dev) int size; might_sleep(); - info = dev->archdata.iommu; + info = get_domain_info(dev); if (WARN_ON(!info || !dev_is_pci(dev) || info->pasid_table)) return -EINVAL; @@ -198,7 +198,7 @@ void intel_pasid_free_table(struct device *dev) struct pasid_entry *table; int i, max_pde; - info = dev->archdata.iommu; + info = get_domain_info(dev); if (!info || !dev_is_pci(dev) || !info->pasid_table) return; @@ -224,7 +224,7 @@ struct pasid_table *intel_pasid_get_table(struct device *dev) { struct device_domain_info *info; - info = dev->archdata.iommu; + info = get_domain_info(dev); if (!info) return NULL; @@ -235,7 +235,7 @@ int intel_pasid_get_dev_max_id(struct device *dev) { struct device_domain_info *info; - info = dev->archdata.iommu; + info = get_domain_info(dev); if (!info || !info->pasid_table) return 0; @@ -256,7 +256,7 @@ struct pasid_entry *intel_pasid_get_entry(struct device *dev, int pasid) return NULL; dir = pasid_table->table; - info = dev->archdata.iommu; + info = get_domain_info(dev); dir_index = pasid >> PASID_PDE_SHIFT; index = pasid & PASID_PTE_MASK; @@ -462,7 +462,7 @@ devtlb_invalidation_with_pasid(struct intel_iommu *iommu, struct device_domain_info *info; u16 sid, qdep, pfsid; - info = dev->archdata.iommu; + info = get_domain_info(dev); if (!info || !info->ats_enabled) return; diff --git a/drivers/iommu/intel-svm.c b/drivers/iommu/intel-svm.c index 7d3405c5a198..75a1ba4439f7 100644 --- a/drivers/iommu/intel-svm.c +++ 
b/drivers/iommu/intel-svm.c @@ -503,7 +503,7 @@ int intel_svm_bind_mm(struct device *dev, int *pasid, int flags, struct svm_dev_ goto out; } - info = dev->archdata.iommu; + info = get_domain_info(dev); if (!info || !info->pasid_supported) { kfree(sdev); goto out; diff --git a/include/linux/intel-iommu.h b/include/linux/intel-iommu.h index e14124f74b3a..caa179e806fc 100644 --- a/include/linux/intel-iommu.h +++ b/include/linux/intel-iommu.h @@ -714,6 +714,7 @@ int for_each_device_domain(int (*fn)(struct device_domain_info *info, void iommu_flush_write_buffer(struct intel_iommu *iommu); int intel_iommu_enable_pasid(struct intel_iommu *iommu, struct device *dev); struct dmar_domain *find_domain(struct device *dev); +struct device_domain_info *get_domain_info(struct device *dev); #ifdef CONFIG_INTEL_IOMMU_SVM extern void intel_svm_check(struct intel_iommu *iommu); -- cgit v1.2.3 From 064a57d7ddfc46ada02b477b91c478001b03bfa3 Mon Sep 17 00:00:00 2001 From: Jacob Pan Date: Sat, 16 May 2020 14:20:54 +0800 Subject: iommu/vt-d: Replace intel SVM APIs with generic SVA APIs This patch is an initial step to replace Intel SVM code with the following IOMMU SVA ops: intel_svm_bind_mm() => iommu_sva_bind_device() intel_svm_unbind_mm() => iommu_sva_unbind_device() intel_svm_is_pasid_valid() => iommu_sva_get_pasid() The features below will continue to work but are not included in this patch in that they are handled mostly within the IOMMU subsystem. - IO page fault - mmu notifier Consolidation of the above will come after merging generic IOMMU sva code[1]. There should not be any changes needed for SVA users such as accelerator device drivers during this time. [1] http://jpbrucker.net/sva/ Signed-off-by: Jacob Pan Signed-off-by: Lu Baolu Link: https://lore.kernel.org/r/20200516062101.29541-12-baolu.lu@linux.intel.com Signed-off-by: Joerg Roedel --- drivers/iommu/intel-iommu.c | 3 ++ drivers/iommu/intel-svm.c | 124 ++++++++++++++++++++++++-------------------- include/linux/intel-iommu.h | 6 +++ include/linux/intel-svm.h | 86 ------------------------------ 4 files changed, 78 insertions(+), 141 deletions(-) (limited to 'include/linux') diff --git a/drivers/iommu/intel-iommu.c b/drivers/iommu/intel-iommu.c index ed7de7420b3c..7d28ef2e6fe2 100644 --- a/drivers/iommu/intel-iommu.c +++ b/drivers/iommu/intel-iommu.c @@ -6071,6 +6071,9 @@ const struct iommu_ops intel_iommu_ops = { .cache_invalidate = intel_iommu_sva_invalidate, .sva_bind_gpasid = intel_svm_bind_gpasid, .sva_unbind_gpasid = intel_svm_unbind_gpasid, + .sva_bind = intel_svm_bind, + .sva_unbind = intel_svm_unbind, + .sva_get_pasid = intel_svm_get_pasid, #endif }; diff --git a/drivers/iommu/intel-svm.c b/drivers/iommu/intel-svm.c index 75a1ba4439f7..8b66bf45477e 100644 --- a/drivers/iommu/intel-svm.c +++ b/drivers/iommu/intel-svm.c @@ -426,13 +426,15 @@ out: return ret; } -int intel_svm_bind_mm(struct device *dev, int *pasid, int flags, struct svm_dev_ops *ops) +/* Caller must hold pasid_mutex, mm reference */ +static int +intel_svm_bind_mm(struct device *dev, int flags, struct svm_dev_ops *ops, + struct mm_struct *mm, struct intel_svm_dev **sd) { struct intel_iommu *iommu = intel_svm_device_to_iommu(dev); struct device_domain_info *info; struct intel_svm_dev *sdev; struct intel_svm *svm = NULL; - struct mm_struct *mm = NULL; int pasid_max; int ret; @@ -449,16 +451,15 @@ int intel_svm_bind_mm(struct device *dev, int *pasid, int flags, struct svm_dev_ } else pasid_max = 1 << 20; + /* Bind supervisor PASID shuld have mm = NULL */ if (flags & 
SVM_FLAG_SUPERVISOR_MODE) { - if (!ecap_srs(iommu->ecap)) + if (!ecap_srs(iommu->ecap) || mm) { + pr_err("Supervisor PASID with user provided mm.\n"); return -EINVAL; - } else if (pasid) { - mm = get_task_mm(current); - BUG_ON(!mm); + } } - mutex_lock(&pasid_mutex); - if (pasid && !(flags & SVM_FLAG_PRIVATE_PASID)) { + if (!(flags & SVM_FLAG_PRIVATE_PASID)) { struct intel_svm *t; list_for_each_entry(t, &global_svm_list, list) { @@ -496,9 +497,7 @@ int intel_svm_bind_mm(struct device *dev, int *pasid, int flags, struct svm_dev_ sdev->dev = dev; ret = intel_iommu_enable_pasid(iommu, dev); - if (ret || !pasid) { - /* If they don't actually want to assign a PASID, this is - * just an enabling check/preparation. */ + if (ret) { kfree(sdev); goto out; } @@ -597,18 +596,17 @@ int intel_svm_bind_mm(struct device *dev, int *pasid, int flags, struct svm_dev_ } } list_add_rcu(&sdev->list, &svm->devs); - - success: - *pasid = svm->pasid; +success: + sdev->pasid = svm->pasid; + sdev->sva.dev = dev; + if (sd) + *sd = sdev; ret = 0; out: - mutex_unlock(&pasid_mutex); - if (mm) - mmput(mm); return ret; } -EXPORT_SYMBOL_GPL(intel_svm_bind_mm); +/* Caller must hold pasid_mutex */ int intel_svm_unbind_mm(struct device *dev, int pasid) { struct intel_svm_dev *sdev; @@ -616,7 +614,6 @@ int intel_svm_unbind_mm(struct device *dev, int pasid) struct intel_svm *svm; int ret = -EINVAL; - mutex_lock(&pasid_mutex); iommu = intel_svm_device_to_iommu(dev); if (!iommu) goto out; @@ -662,45 +659,9 @@ int intel_svm_unbind_mm(struct device *dev, int pasid) break; } out: - mutex_unlock(&pasid_mutex); return ret; } -EXPORT_SYMBOL_GPL(intel_svm_unbind_mm); - -int intel_svm_is_pasid_valid(struct device *dev, int pasid) -{ - struct intel_iommu *iommu; - struct intel_svm *svm; - int ret = -EINVAL; - - mutex_lock(&pasid_mutex); - iommu = intel_svm_device_to_iommu(dev); - if (!iommu) - goto out; - - svm = ioasid_find(NULL, pasid, NULL); - if (!svm) - goto out; - - if (IS_ERR(svm)) { - ret = PTR_ERR(svm); - goto out; - } - /* init_mm is used in this case */ - if (!svm->mm) - ret = 1; - else if (atomic_read(&svm->mm->mm_users) > 0) - ret = 1; - else - ret = 0; - - out: - mutex_unlock(&pasid_mutex); - - return ret; -} -EXPORT_SYMBOL_GPL(intel_svm_is_pasid_valid); /* Page request queue descriptor */ struct page_req_dsc { @@ -894,3 +855,56 @@ static irqreturn_t prq_event_thread(int irq, void *d) return IRQ_RETVAL(handled); } + +#define to_intel_svm_dev(handle) container_of(handle, struct intel_svm_dev, sva) +struct iommu_sva * +intel_svm_bind(struct device *dev, struct mm_struct *mm, void *drvdata) +{ + struct iommu_sva *sva = ERR_PTR(-EINVAL); + struct intel_svm_dev *sdev = NULL; + int flags = 0; + int ret; + + /* + * TODO: Consolidate with generic iommu-sva bind after it is merged. + * It will require shared SVM data structures, i.e. combine io_mm + * and intel_svm etc. 
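+ *
+ * A rough sketch of the driver-facing flow through the generic
+ * API this hooks into:
+ *
+ *	handle = iommu_sva_bind_device(dev, current->mm, &flags);
+ *	pasid = iommu_sva_get_pasid(handle);
+ *	... program the device to tag its DMA with pasid ...
+ *	iommu_sva_unbind_device(handle);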
+ */ + if (drvdata) + flags = *(int *)drvdata; + mutex_lock(&pasid_mutex); + ret = intel_svm_bind_mm(dev, flags, NULL, mm, &sdev); + if (ret) + sva = ERR_PTR(ret); + else if (sdev) + sva = &sdev->sva; + else + WARN(!sdev, "SVM bind succeeded with no sdev!\n"); + + mutex_unlock(&pasid_mutex); + + return sva; +} + +void intel_svm_unbind(struct iommu_sva *sva) +{ + struct intel_svm_dev *sdev; + + mutex_lock(&pasid_mutex); + sdev = to_intel_svm_dev(sva); + intel_svm_unbind_mm(sdev->dev, sdev->pasid); + mutex_unlock(&pasid_mutex); +} + +int intel_svm_get_pasid(struct iommu_sva *sva) +{ + struct intel_svm_dev *sdev; + int pasid; + + mutex_lock(&pasid_mutex); + sdev = to_intel_svm_dev(sva); + pasid = sdev->pasid; + mutex_unlock(&pasid_mutex); + + return pasid; +} diff --git a/include/linux/intel-iommu.h b/include/linux/intel-iommu.h index caa179e806fc..42245e1e1b48 100644 --- a/include/linux/intel-iommu.h +++ b/include/linux/intel-iommu.h @@ -723,6 +723,10 @@ extern int intel_svm_finish_prq(struct intel_iommu *iommu); int intel_svm_bind_gpasid(struct iommu_domain *domain, struct device *dev, struct iommu_gpasid_bind_data *data); int intel_svm_unbind_gpasid(struct device *dev, int pasid); +struct iommu_sva *intel_svm_bind(struct device *dev, struct mm_struct *mm, + void *drvdata); +void intel_svm_unbind(struct iommu_sva *handle); +int intel_svm_get_pasid(struct iommu_sva *handle); struct svm_dev_ops; struct intel_svm_dev { @@ -730,6 +734,8 @@ struct intel_svm_dev { struct rcu_head rcu; struct device *dev; struct svm_dev_ops *ops; + struct iommu_sva sva; + int pasid; int users; u16 did; u16 dev_iotlb:1; diff --git a/include/linux/intel-svm.h b/include/linux/intel-svm.h index 1b47ca46373e..c9e7e601950d 100644 --- a/include/linux/intel-svm.h +++ b/include/linux/intel-svm.h @@ -21,7 +21,6 @@ struct svm_dev_ops { #define SVM_REQ_EXEC (1<<1) #define SVM_REQ_PRIV (1<<0) - /* * The SVM_FLAG_PRIVATE_PASID flag requests a PASID which is *not* the "main" * PASID for the current process. Even if a PASID already exists, a new one @@ -57,89 +56,4 @@ struct svm_dev_ops { */ #define SVM_FLAG_GUEST_PASID (1<<3) -#ifdef CONFIG_INTEL_IOMMU_SVM - -/** - * intel_svm_bind_mm() - Bind the current process to a PASID - * @dev: Device to be granted access - * @pasid: Address for allocated PASID - * @flags: Flags. Later for requesting supervisor mode, etc. - * @ops: Callbacks to device driver - * - * This function attempts to enable PASID support for the given device. - * If the @pasid argument is non-%NULL, a PASID is allocated for access - * to the MM of the current process. - * - * By using a %NULL value for the @pasid argument, this function can - * be used to simply validate that PASID support is available for the - * given device — i.e. that it is behind an IOMMU which has the - * requisite support, and is enabled. - * - * Page faults are handled transparently by the IOMMU code, and there - * should be no need for the device driver to be involved. If a page - * fault cannot be handled (i.e. is an invalid address rather than - * just needs paging in), then the page request will be completed by - * the core IOMMU code with appropriate status, and the device itself - * can then report the resulting fault to its driver via whatever - * mechanism is appropriate. - * - * Multiple calls from the same process may result in the same PASID - * being re-used. A reference count is kept. 
- */ -extern int intel_svm_bind_mm(struct device *dev, int *pasid, int flags, - struct svm_dev_ops *ops); - -/** - * intel_svm_unbind_mm() - Unbind a specified PASID - * @dev: Device for which PASID was allocated - * @pasid: PASID value to be unbound - * - * This function allows a PASID to be retired when the device no - * longer requires access to the address space of a given process. - * - * If the use count for the PASID in question reaches zero, the - * PASID is revoked and may no longer be used by hardware. - * - * Device drivers are required to ensure that no access (including - * page requests) is currently outstanding for the PASID in question, - * before calling this function. - */ -extern int intel_svm_unbind_mm(struct device *dev, int pasid); - -/** - * intel_svm_is_pasid_valid() - check if pasid is valid - * @dev: Device for which PASID was allocated - * @pasid: PASID value to be checked - * - * This function checks if the specified pasid is still valid. A - * valid pasid means the backing mm is still having a valid user. - * For kernel callers init_mm is always valid. for other mm, if mm->mm_users - * is non-zero, it is valid. - * - * returns -EINVAL if invalid pasid, 0 if pasid ref count is invalid - * 1 if pasid is valid. - */ -extern int intel_svm_is_pasid_valid(struct device *dev, int pasid); - -#else /* CONFIG_INTEL_IOMMU_SVM */ - -static inline int intel_svm_bind_mm(struct device *dev, int *pasid, - int flags, struct svm_dev_ops *ops) -{ - return -ENOSYS; -} - -static inline int intel_svm_unbind_mm(struct device *dev, int pasid) -{ - BUG(); -} - -static inline int intel_svm_is_pasid_valid(struct device *dev, int pasid) -{ - return -EINVAL; -} -#endif /* CONFIG_INTEL_IOMMU_SVM */ - -#define intel_svm_available(dev) (!intel_svm_bind_mm((dev), NULL, 0, NULL)) - #endif /* __INTEL_SVM_H__ */ -- cgit v1.2.3 From 8a1d824625402b3ef3c3e5965663354ff0394d86 Mon Sep 17 00:00:00 2001 From: Lu Baolu Date: Sat, 16 May 2020 14:20:55 +0800 Subject: iommu/vt-d: Multiple descriptors per qi_submit_sync() Current qi_submit_sync() only supports single invalidation descriptor per submission and appends wait descriptor after each submission to poll the hardware completion. This extends the qi_submit_sync() helper to support multiple descriptors, and add an option so that the caller could specify the Page-request Drain (PD) bit in the wait descriptor. 
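For example (a sketch only; did, sid, pasid and qdep are assumed from context), two related invalidations can now be queued under a single wait descriptor, optionally fenced with the new drain option:

	struct qi_desc desc[2];

	memset(desc, 0, sizeof(desc));
	desc[0].qw0 = QI_EIOTLB_PASID(pasid) | QI_EIOTLB_DID(did) |
		      QI_EIOTLB_GRAN(QI_GRAN_NONG_PASID) | QI_EIOTLB_TYPE;
	desc[1].qw0 = QI_DEV_EIOTLB_PASID(pasid) | QI_DEV_EIOTLB_SID(sid) |
		      QI_DEV_EIOTLB_QDEP(qdep) | QI_DEIOTLB_TYPE;
	qi_submit_sync(iommu, desc, 2, QI_OPT_WAIT_DRAIN);

The page request draining patch later in this series uses exactly this pattern with three descriptors.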
Signed-off-by: Jacob Pan Signed-off-by: Lu Baolu Reviewed-by: Kevin Tian Link: https://lore.kernel.org/r/20200516062101.29541-13-baolu.lu@linux.intel.com Signed-off-by: Joerg Roedel --- drivers/iommu/dmar.c | 63 ++++++++++++++++++++++--------------- drivers/iommu/intel-pasid.c | 4 +-- drivers/iommu/intel-svm.c | 6 ++-- drivers/iommu/intel_irq_remapping.c | 2 +- include/linux/intel-iommu.h | 9 +++++- 5 files changed, 52 insertions(+), 32 deletions(-) (limited to 'include/linux') diff --git a/drivers/iommu/dmar.c b/drivers/iommu/dmar.c index 66af08ad10fb..60a2970c37ff 100644 --- a/drivers/iommu/dmar.c +++ b/drivers/iommu/dmar.c @@ -1157,12 +1157,11 @@ static inline void reclaim_free_desc(struct q_inval *qi) } } -static int qi_check_fault(struct intel_iommu *iommu, int index) +static int qi_check_fault(struct intel_iommu *iommu, int index, int wait_index) { u32 fault; int head, tail; struct q_inval *qi = iommu->qi; - int wait_index = (index + 1) % QI_LENGTH; int shift = qi_shift(iommu); if (qi->desc_status[wait_index] == QI_ABORT) @@ -1225,17 +1224,21 @@ static int qi_check_fault(struct intel_iommu *iommu, int index) } /* - * Submit the queued invalidation descriptor to the remapping - * hardware unit and wait for its completion. + * Function to submit invalidation descriptors of all types to the queued + * invalidation interface(QI). Multiple descriptors can be submitted at a + * time, a wait descriptor will be appended to each submission to ensure + * hardware has completed the invalidation before return. Wait descriptors + * can be part of the submission but it will not be polled for completion. */ -int qi_submit_sync(struct qi_desc *desc, struct intel_iommu *iommu) +int qi_submit_sync(struct intel_iommu *iommu, struct qi_desc *desc, + unsigned int count, unsigned long options) { - int rc; struct q_inval *qi = iommu->qi; - int offset, shift, length; struct qi_desc wait_desc; int wait_index, index; unsigned long flags; + int offset, shift; + int rc, i; if (!qi) return 0; @@ -1244,32 +1247,41 @@ restart: rc = 0; raw_spin_lock_irqsave(&qi->q_lock, flags); - while (qi->free_cnt < 3) { + /* + * Check if we have enough empty slots in the queue to submit, + * the calculation is based on: + * # of desc + 1 wait desc + 1 space between head and tail + */ + while (qi->free_cnt < count + 2) { raw_spin_unlock_irqrestore(&qi->q_lock, flags); cpu_relax(); raw_spin_lock_irqsave(&qi->q_lock, flags); } index = qi->free_head; - wait_index = (index + 1) % QI_LENGTH; + wait_index = (index + count) % QI_LENGTH; shift = qi_shift(iommu); - length = 1 << shift; - qi->desc_status[index] = qi->desc_status[wait_index] = QI_IN_USE; + for (i = 0; i < count; i++) { + offset = ((index + i) % QI_LENGTH) << shift; + memcpy(qi->desc + offset, &desc[i], 1 << shift); + qi->desc_status[(index + i) % QI_LENGTH] = QI_IN_USE; + } + qi->desc_status[wait_index] = QI_IN_USE; - offset = index << shift; - memcpy(qi->desc + offset, desc, length); wait_desc.qw0 = QI_IWD_STATUS_DATA(QI_DONE) | QI_IWD_STATUS_WRITE | QI_IWD_TYPE; + if (options & QI_OPT_WAIT_DRAIN) + wait_desc.qw0 |= QI_IWD_PRQ_DRAIN; wait_desc.qw1 = virt_to_phys(&qi->desc_status[wait_index]); wait_desc.qw2 = 0; wait_desc.qw3 = 0; offset = wait_index << shift; - memcpy(qi->desc + offset, &wait_desc, length); + memcpy(qi->desc + offset, &wait_desc, 1 << shift); - qi->free_head = (qi->free_head + 2) % QI_LENGTH; - qi->free_cnt -= 2; + qi->free_head = (qi->free_head + count + 1) % QI_LENGTH; + qi->free_cnt -= count + 1; /* * update the HW tail register indicating the presence 
of @@ -1285,7 +1297,7 @@ restart: * a deadlock where the interrupt context can wait indefinitely * for free slots in the queue. */ - rc = qi_check_fault(iommu, index); + rc = qi_check_fault(iommu, index, wait_index); if (rc) break; @@ -1294,7 +1306,8 @@ restart: raw_spin_lock(&qi->q_lock); } - qi->desc_status[index] = QI_DONE; + for (i = 0; i < count; i++) + qi->desc_status[(index + i) % QI_LENGTH] = QI_DONE; reclaim_free_desc(qi); raw_spin_unlock_irqrestore(&qi->q_lock, flags); @@ -1318,7 +1331,7 @@ void qi_global_iec(struct intel_iommu *iommu) desc.qw3 = 0; /* should never fail */ - qi_submit_sync(&desc, iommu); + qi_submit_sync(iommu, &desc, 1, 0); } void qi_flush_context(struct intel_iommu *iommu, u16 did, u16 sid, u8 fm, @@ -1332,7 +1345,7 @@ void qi_flush_context(struct intel_iommu *iommu, u16 did, u16 sid, u8 fm, desc.qw2 = 0; desc.qw3 = 0; - qi_submit_sync(&desc, iommu); + qi_submit_sync(iommu, &desc, 1, 0); } void qi_flush_iotlb(struct intel_iommu *iommu, u16 did, u64 addr, @@ -1356,7 +1369,7 @@ void qi_flush_iotlb(struct intel_iommu *iommu, u16 did, u64 addr, desc.qw2 = 0; desc.qw3 = 0; - qi_submit_sync(&desc, iommu); + qi_submit_sync(iommu, &desc, 1, 0); } void qi_flush_dev_iotlb(struct intel_iommu *iommu, u16 sid, u16 pfsid, @@ -1378,7 +1391,7 @@ void qi_flush_dev_iotlb(struct intel_iommu *iommu, u16 sid, u16 pfsid, desc.qw2 = 0; desc.qw3 = 0; - qi_submit_sync(&desc, iommu); + qi_submit_sync(iommu, &desc, 1, 0); } /* PASID-based IOTLB invalidation */ @@ -1419,7 +1432,7 @@ void qi_flush_piotlb(struct intel_iommu *iommu, u16 did, u32 pasid, u64 addr, QI_EIOTLB_AM(mask); } - qi_submit_sync(&desc, iommu); + qi_submit_sync(iommu, &desc, 1, 0); } /* PASID-based device IOTLB Invalidate */ @@ -1448,7 +1461,7 @@ void qi_flush_dev_iotlb_pasid(struct intel_iommu *iommu, u16 sid, u16 pfsid, if (size_order) desc.qw1 |= QI_DEV_EIOTLB_SIZE; - qi_submit_sync(&desc, iommu); + qi_submit_sync(iommu, &desc, 1, 0); } void qi_flush_pasid_cache(struct intel_iommu *iommu, u16 did, @@ -1458,7 +1471,7 @@ void qi_flush_pasid_cache(struct intel_iommu *iommu, u16 did, desc.qw0 = QI_PC_PASID(pasid) | QI_PC_DID(did) | QI_PC_GRAN(granu) | QI_PC_TYPE; - qi_submit_sync(&desc, iommu); + qi_submit_sync(iommu, &desc, 1, 0); } /* diff --git a/drivers/iommu/intel-pasid.c b/drivers/iommu/intel-pasid.c index c46a068142b9..45e9b5b291bc 100644 --- a/drivers/iommu/intel-pasid.c +++ b/drivers/iommu/intel-pasid.c @@ -438,7 +438,7 @@ pasid_cache_invalidation_with_pasid(struct intel_iommu *iommu, desc.qw2 = 0; desc.qw3 = 0; - qi_submit_sync(&desc, iommu); + qi_submit_sync(iommu, &desc, 1, 0); } static void @@ -452,7 +452,7 @@ iotlb_invalidation_with_pasid(struct intel_iommu *iommu, u16 did, u32 pasid) desc.qw2 = 0; desc.qw3 = 0; - qi_submit_sync(&desc, iommu); + qi_submit_sync(iommu, &desc, 1, 0); } static void diff --git a/drivers/iommu/intel-svm.c b/drivers/iommu/intel-svm.c index 8b66bf45477e..5133b2d4428f 100644 --- a/drivers/iommu/intel-svm.c +++ b/drivers/iommu/intel-svm.c @@ -138,7 +138,7 @@ static void intel_flush_svm_range_dev (struct intel_svm *svm, struct intel_svm_d } desc.qw2 = 0; desc.qw3 = 0; - qi_submit_sync(&desc, svm->iommu); + qi_submit_sync(svm->iommu, &desc, 1, 0); if (sdev->dev_iotlb) { desc.qw0 = QI_DEV_EIOTLB_PASID(svm->pasid) | @@ -162,7 +162,7 @@ static void intel_flush_svm_range_dev (struct intel_svm *svm, struct intel_svm_d } desc.qw2 = 0; desc.qw3 = 0; - qi_submit_sync(&desc, svm->iommu); + qi_submit_sync(svm->iommu, &desc, 1, 0); } } @@ -846,7 +846,7 @@ static irqreturn_t prq_event_thread(int 
irq, void *d) sizeof(req->priv_data)); resp.qw2 = 0; resp.qw3 = 0; - qi_submit_sync(&resp, iommu); + qi_submit_sync(iommu, &resp, 1, 0); } head = (head + sizeof(*req)) & PRQ_RING_MASK; } diff --git a/drivers/iommu/intel_irq_remapping.c b/drivers/iommu/intel_irq_remapping.c index 81e43c1df7ec..a042f123b091 100644 --- a/drivers/iommu/intel_irq_remapping.c +++ b/drivers/iommu/intel_irq_remapping.c @@ -151,7 +151,7 @@ static int qi_flush_iec(struct intel_iommu *iommu, int index, int mask) desc.qw2 = 0; desc.qw3 = 0; - return qi_submit_sync(&desc, iommu); + return qi_submit_sync(iommu, &desc, 1, 0); } static int modify_irte(struct irq_2_iommu *irq_iommu, diff --git a/include/linux/intel-iommu.h b/include/linux/intel-iommu.h index 42245e1e1b48..677dee59e3c0 100644 --- a/include/linux/intel-iommu.h +++ b/include/linux/intel-iommu.h @@ -333,6 +333,7 @@ enum { #define QI_IWD_STATUS_DATA(d) (((u64)d) << 32) #define QI_IWD_STATUS_WRITE (((u64)1) << 5) +#define QI_IWD_PRQ_DRAIN (((u64)1) << 7) #define QI_IOTLB_DID(did) (((u64)did) << 16) #define QI_IOTLB_DR(dr) (((u64)dr) << 7) @@ -702,7 +703,13 @@ void qi_flush_dev_iotlb_pasid(struct intel_iommu *iommu, u16 sid, u16 pfsid, void qi_flush_pasid_cache(struct intel_iommu *iommu, u16 did, u64 granu, int pasid); -extern int qi_submit_sync(struct qi_desc *desc, struct intel_iommu *iommu); +int qi_submit_sync(struct intel_iommu *iommu, struct qi_desc *desc, + unsigned int count, unsigned long options); +/* + * Options used in qi_submit_sync: + * QI_OPT_WAIT_DRAIN - Wait for PRQ drain completion, spec 6.5.2.8. + */ +#define QI_OPT_WAIT_DRAIN BIT(0) extern int dmar_ir_support(void); -- cgit v1.2.3 From 66ac4db36f4c12a3b02db864ccb0801cd938b6de Mon Sep 17 00:00:00 2001 From: Lu Baolu Date: Sat, 16 May 2020 14:20:58 +0800 Subject: iommu/vt-d: Add page request draining support When a PASID is stopped or terminated, there can be pending PRQs (requests that haven't received responses) in remapping hardware. This adds the interface to drain page requests and call it when a PASID is terminated. Signed-off-by: Jacob Pan Signed-off-by: Liu Yi L Signed-off-by: Lu Baolu Link: https://lore.kernel.org/r/20200516062101.29541-16-baolu.lu@linux.intel.com Signed-off-by: Joerg Roedel --- drivers/iommu/intel-svm.c | 107 +++++++++++++++++++++++++++++++++++++++++--- include/linux/intel-iommu.h | 4 ++ 2 files changed, 106 insertions(+), 5 deletions(-) (limited to 'include/linux') diff --git a/drivers/iommu/intel-svm.c b/drivers/iommu/intel-svm.c index 960a3610e852..5ab71107afd5 100644 --- a/drivers/iommu/intel-svm.c +++ b/drivers/iommu/intel-svm.c @@ -23,6 +23,7 @@ #include "intel-pasid.h" static irqreturn_t prq_event_thread(int irq, void *d); +static void intel_svm_drain_prq(struct device *dev, int pasid); #define PRQ_ORDER 0 @@ -66,6 +67,8 @@ int intel_svm_enable_prq(struct intel_iommu *iommu) dmar_writeq(iommu->reg + DMAR_PQT_REG, 0ULL); dmar_writeq(iommu->reg + DMAR_PQA_REG, virt_to_phys(iommu->prq) | PRQ_ORDER); + init_completion(&iommu->prq_complete); + return 0; } @@ -399,12 +402,8 @@ int intel_svm_unbind_gpasid(struct device *dev, int pasid) list_del_rcu(&sdev->list); intel_pasid_tear_down_entry(iommu, dev, svm->pasid, false); + intel_svm_drain_prq(dev, svm->pasid); intel_flush_svm_range_dev(svm, sdev, 0, -1, 0); - /* TODO: Drain in flight PRQ for the PASID since it - * may get reused soon, we don't want to - * confuse with its previous life. 
- * intel_svm_drain_prq(dev, pasid); - */ kfree_rcu(sdev, rcu); if (list_empty(&svm->devs)) { @@ -643,6 +642,7 @@ int intel_svm_unbind_mm(struct device *dev, int pasid) * hard to be as defensive as we might like. */ intel_pasid_tear_down_entry(iommu, dev, svm->pasid, false); + intel_svm_drain_prq(dev, svm->pasid); intel_flush_svm_range_dev(svm, sdev, 0, -1, 0); kfree_rcu(sdev, rcu); @@ -721,6 +721,93 @@ static bool is_canonical_address(u64 addr) return (((saddr << shift) >> shift) == saddr); } +/** + * intel_svm_drain_prq - Drain page requests and responses for a pasid + * @dev: target device + * @pasid: pasid for draining + * + * Drain all pending page requests and responses related to @pasid in both + * software and hardware. This is supposed to be called after the device + * driver has stopped DMA, the pasid entry has been cleared, and both IOTLB + * and DevTLB have been invalidated. + * + * It waits until all pending page requests for @pasid in the page fault + * queue are completed by the prq handling thread. Then follow the steps + * described in VT-d spec CH7.10 to drain all page requests and page + * responses pending in the hardware. + */ +static void intel_svm_drain_prq(struct device *dev, int pasid) +{ + struct device_domain_info *info; + struct dmar_domain *domain; + struct intel_iommu *iommu; + struct qi_desc desc[3]; + struct pci_dev *pdev; + int head, tail; + u16 sid, did; + int qdep; + + info = get_domain_info(dev); + if (WARN_ON(!info || !dev_is_pci(dev))) + return; + + if (!info->pri_enabled) + return; + + iommu = info->iommu; + domain = info->domain; + pdev = to_pci_dev(dev); + sid = PCI_DEVID(info->bus, info->devfn); + did = domain->iommu_did[iommu->seq_id]; + qdep = pci_ats_queue_depth(pdev); + + /* + * Check and wait until all pending page requests in the queue are + * handled by the prq handling thread. + */ +prq_retry: + reinit_completion(&iommu->prq_complete); + tail = dmar_readq(iommu->reg + DMAR_PQT_REG) & PRQ_RING_MASK; + head = dmar_readq(iommu->reg + DMAR_PQH_REG) & PRQ_RING_MASK; + while (head != tail) { + struct page_req_dsc *req; + + req = &iommu->prq[head / sizeof(*req)]; + if (!req->pasid_present || req->pasid != pasid) { + head = (head + sizeof(*req)) & PRQ_RING_MASK; + continue; + } + + wait_for_completion(&iommu->prq_complete); + goto prq_retry; + } + + /* + * Perform steps described in VT-d spec CH7.10 to drain page + * requests and responses in hardware. + */ + memset(desc, 0, sizeof(desc)); + desc[0].qw0 = QI_IWD_STATUS_DATA(QI_DONE) | + QI_IWD_FENCE | + QI_IWD_TYPE; + desc[1].qw0 = QI_EIOTLB_PASID(pasid) | + QI_EIOTLB_DID(did) | + QI_EIOTLB_GRAN(QI_GRAN_NONG_PASID) | + QI_EIOTLB_TYPE; + desc[2].qw0 = QI_DEV_EIOTLB_PASID(pasid) | + QI_DEV_EIOTLB_SID(sid) | + QI_DEV_EIOTLB_QDEP(qdep) | + QI_DEIOTLB_TYPE | + QI_DEV_IOTLB_PFSID(info->pfsid); +qi_retry: + reinit_completion(&iommu->prq_complete); + qi_submit_sync(iommu, desc, 3, QI_OPT_WAIT_DRAIN); + if (readl(iommu->reg + DMAR_PRS_REG) & DMA_PRS_PRO) { + wait_for_completion(&iommu->prq_complete); + goto qi_retry; + } +} + static irqreturn_t prq_event_thread(int irq, void *d) { struct intel_iommu *iommu = d; @@ -856,6 +943,16 @@ static irqreturn_t prq_event_thread(int irq, void *d) dmar_writeq(iommu->reg + DMAR_PQH_REG, tail); + /* + * Clear the page request overflow bit and wake up all threads that + * are waiting for the completion of this handling. 
+ */ + if (readl(iommu->reg + DMAR_PRS_REG) & DMA_PRS_PRO) + writel(DMA_PRS_PRO, iommu->reg + DMAR_PRS_REG); + + if (!completion_done(&iommu->prq_complete)) + complete(&iommu->prq_complete); + return IRQ_RETVAL(handled); } diff --git a/include/linux/intel-iommu.h b/include/linux/intel-iommu.h index 677dee59e3c0..21633cee6331 100644 --- a/include/linux/intel-iommu.h +++ b/include/linux/intel-iommu.h @@ -292,6 +292,8 @@ /* PRS_REG */ #define DMA_PRS_PPR ((u32)1) +#define DMA_PRS_PRO ((u32)2) + #define DMA_VCS_PAS ((u64)1) #define IOMMU_WAIT_OP(iommu, offset, op, cond, sts) \ @@ -333,6 +335,7 @@ enum { #define QI_IWD_STATUS_DATA(d) (((u64)d) << 32) #define QI_IWD_STATUS_WRITE (((u64)1) << 5) +#define QI_IWD_FENCE (((u64)1) << 6) #define QI_IWD_PRQ_DRAIN (((u64)1) << 7) #define QI_IOTLB_DID(did) (((u64)did) << 16) @@ -582,6 +585,7 @@ struct intel_iommu { #ifdef CONFIG_INTEL_IOMMU_SVM struct page_req_dsc *prq; unsigned char prq_name[16]; /* Name for PRQ interrupt */ + struct completion prq_complete; struct ioasid_allocator_ops pasid_allocator; /* Custom allocator for PASIDs */ #endif struct q_inval *qi; /* Queued invalidation info */ -- cgit v1.2.3 From 81530a38a36d411e01ea99116503901f75aa758b Mon Sep 17 00:00:00 2001 From: Roger Quadros Date: Fri, 15 May 2020 11:05:15 +0300 Subject: phy: omap-usb2: Clean up exported header Move private definitions from header to phy-omap-usb2.c file. Get rid of unused data structures usb_dpll_params and omap_usb_phy_type. Signed-off-by: Roger Quadros Link: https://lore.kernel.org/r/20200515080518.26870-2-rogerq@ti.com Signed-off-by: Kishon Vijay Abraham I --- drivers/phy/ti/phy-omap-usb2.c | 60 +++++++++++++++++++++++++++++++++--- include/linux/phy/omap_usb.h | 69 ++---------------------------------------- 2 files changed, 58 insertions(+), 71 deletions(-) (limited to 'include/linux') diff --git a/drivers/phy/ti/phy-omap-usb2.c b/drivers/phy/ti/phy-omap-usb2.c index 3d74629d7423..cb2dd3230fa7 100644 --- a/drivers/phy/ti/phy-omap-usb2.c +++ b/drivers/phy/ti/phy-omap-usb2.c @@ -1,8 +1,8 @@ // SPDX-License-Identifier: GPL-2.0-or-later /* - * omap-usb2.c - USB PHY, talking to musb controller in OMAP. + * omap-usb2.c - USB PHY, talking to USB controller on TI SoCs. * - * Copyright (C) 2012 Texas Instruments Incorporated - http://www.ti.com + * Copyright (C) 2012-2020 Texas Instruments Incorporated - http://www.ti.com * Author: Kishon Vijay Abraham I */ @@ -23,13 +23,65 @@ #include #include -#define USB2PHY_DISCON_BYP_LATCH (1 << 31) -#define USB2PHY_ANA_CONFIG1 0x4c +#define USB2PHY_ANA_CONFIG1 0x4c +#define USB2PHY_DISCON_BYP_LATCH BIT(31) +/* SoC Specific USB2_OTG register definitions */ #define AM654_USB2_OTG_PD BIT(8) #define AM654_USB2_VBUS_DET_EN BIT(5) #define AM654_USB2_VBUSVALID_DET_EN BIT(4) +#define OMAP_DEV_PHY_PD BIT(0) +#define OMAP_USB2_PHY_PD BIT(28) + +#define AM437X_USB2_PHY_PD BIT(0) +#define AM437X_USB2_OTG_PD BIT(1) +#define AM437X_USB2_OTGVDET_EN BIT(19) +#define AM437X_USB2_OTGSESSEND_EN BIT(20) + +/* Driver Flags */ +#define OMAP_USB2_HAS_START_SRP BIT(0) +#define OMAP_USB2_HAS_SET_VBUS BIT(1) +#define OMAP_USB2_CALIBRATE_FALSE_DISCONNECT BIT(2) + +struct omap_usb { + struct usb_phy phy; + struct phy_companion *comparator; + void __iomem *pll_ctrl_base; + void __iomem *phy_base; + struct device *dev; + struct device *control_dev; + struct clk *wkupclk; + struct clk *optclk; + u8 flags; + struct regmap *syscon_phy_power; /* ctrl. reg. acces */ + unsigned int power_reg; /* power reg. 
index within syscon */ + u32 mask; + u32 power_on; + u32 power_off; +}; + +#define phy_to_omapusb(x) container_of((x), struct omap_usb, phy) + +struct usb_phy_data { + const char *label; + u8 flags; + u32 mask; + u32 power_on; + u32 power_off; +}; + +static inline u32 omap_usb_readl(void __iomem *addr, unsigned int offset) +{ + return __raw_readl(addr + offset); +} + +static inline void omap_usb_writel(void __iomem *addr, unsigned int offset, + u32 data) +{ + __raw_writel(data, addr + offset); +} + /** * omap_usb2_set_comparator - links the comparator present in the sytem with * this phy diff --git a/include/linux/phy/omap_usb.h b/include/linux/phy/omap_usb.h index 5973a6313529..e23b52df93ec 100644 --- a/include/linux/phy/omap_usb.h +++ b/include/linux/phy/omap_usb.h @@ -2,68 +2,14 @@ /* * omap_usb.h -- omap usb2 phy header file * - * Copyright (C) 2012 Texas Instruments Incorporated - http://www.ti.com + * Copyright (C) 2012-2020 Texas Instruments Incorporated - http://www.ti.com * Author: Kishon Vijay Abraham I */ #ifndef __DRIVERS_OMAP_USB2_H #define __DRIVERS_OMAP_USB2_H -#include -#include - -struct usb_dpll_params { - u16 m; - u8 n; - u8 freq:3; - u8 sd; - u32 mf; -}; - -enum omap_usb_phy_type { - TYPE_USB2, /* USB2_PHY, power down in CONTROL_DEV_CONF */ - TYPE_DRA7USB2, /* USB2 PHY, power and power_aux e.g. DRA7 */ - TYPE_AM437USB2, /* USB2 PHY, power e.g. AM437x */ -}; - -struct omap_usb { - struct usb_phy phy; - struct phy_companion *comparator; - void __iomem *pll_ctrl_base; - void __iomem *phy_base; - struct device *dev; - struct device *control_dev; - struct clk *wkupclk; - struct clk *optclk; - u8 flags; - enum omap_usb_phy_type type; - struct regmap *syscon_phy_power; /* ctrl. reg. acces */ - unsigned int power_reg; /* power reg. index within syscon */ - u32 mask; - u32 power_on; - u32 power_off; -}; - -struct usb_phy_data { - const char *label; - u8 flags; - u32 mask; - u32 power_on; - u32 power_off; -}; - -/* Driver Flags */ -#define OMAP_USB2_HAS_START_SRP (1 << 0) -#define OMAP_USB2_HAS_SET_VBUS (1 << 1) -#define OMAP_USB2_CALIBRATE_FALSE_DISCONNECT (1 << 2) - -#define OMAP_DEV_PHY_PD BIT(0) -#define OMAP_USB2_PHY_PD BIT(28) - -#define AM437X_USB2_PHY_PD BIT(0) -#define AM437X_USB2_OTG_PD BIT(1) -#define AM437X_USB2_OTGVDET_EN BIT(19) -#define AM437X_USB2_OTGSESSEND_EN BIT(20) +#include #define phy_to_omapusb(x) container_of((x), struct omap_usb, phy) @@ -76,15 +22,4 @@ static inline int omap_usb2_set_comparator(struct phy_companion *comparator) } #endif -static inline u32 omap_usb_readl(void __iomem *addr, unsigned offset) -{ - return __raw_readl(addr + offset); -} - -static inline void omap_usb_writel(void __iomem *addr, unsigned offset, - u32 data) -{ - __raw_writel(data, addr + offset); -} - #endif /* __DRIVERS_OMAP_USB_H */ -- cgit v1.2.3 From ca4faf543a33373bed3650812d5f0cd0bd295b1a Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Sat, 2 May 2020 10:37:44 -0400 Subject: SUNRPC: Move xpt_mutex into socket xpo_sendto methods It appears that the RPC/RDMA transport does not need serialization of calls to its xpo_sendto method. Move the mutex into the socket methods that still need that serialization. Tail latencies are unambiguously better with this patch applied. 
fio randrw 8KB 70/30 on NFSv3, smaller numbers are better: clat percentiles (usec): With xpt_mutex: r | 99.99th=[ 8848] w | 99.99th=[ 9634] Without xpt_mutex: r | 99.99th=[ 8586] w | 99.99th=[ 8979] Serializing the construction of RPC/RDMA transport headers is not really necessary at this point, because the Linux NFS server implementation never changes its credit grant on a connection. If that should change, then svc_rdma_sendto will need to serialize access to the transport's credit grant fields. Reported-by: kbuild test robot [ cel: fix uninitialized variable warning ] Signed-off-by: Chuck Lever --- include/linux/sunrpc/svc_xprt.h | 6 +++++ net/sunrpc/svc_xprt.c | 12 +++------- net/sunrpc/svcsock.c | 25 +++++++++++++++++++++ net/sunrpc/xprtrdma/svc_rdma_backchannel.c | 35 ++++++++++++++---------------- net/sunrpc/xprtrdma/svc_rdma_sendto.c | 10 ++++----- net/sunrpc/xprtsock.c | 12 ++++++++-- 6 files changed, 64 insertions(+), 36 deletions(-) (limited to 'include/linux') diff --git a/include/linux/sunrpc/svc_xprt.h b/include/linux/sunrpc/svc_xprt.h index 9e1e046de176..aca35ab5cff2 100644 --- a/include/linux/sunrpc/svc_xprt.h +++ b/include/linux/sunrpc/svc_xprt.h @@ -117,6 +117,12 @@ static inline int register_xpt_user(struct svc_xprt *xpt, struct svc_xpt_user *u return 0; } +static inline bool svc_xprt_is_dead(const struct svc_xprt *xprt) +{ + return (test_bit(XPT_DEAD, &xprt->xpt_flags) != 0) || + (test_bit(XPT_CLOSE, &xprt->xpt_flags) != 0); +} + int svc_reg_xprt_class(struct svc_xprt_class *); void svc_unreg_xprt_class(struct svc_xprt_class *); void svc_xprt_init(struct net *, struct svc_xprt_class *, struct svc_xprt *, diff --git a/net/sunrpc/svc_xprt.c b/net/sunrpc/svc_xprt.c index 2284ff038dad..07cdbf7d5764 100644 --- a/net/sunrpc/svc_xprt.c +++ b/net/sunrpc/svc_xprt.c @@ -914,16 +914,10 @@ int svc_send(struct svc_rqst *rqstp) xb->page_len + xb->tail[0].iov_len; trace_svc_sendto(xb); - - /* Grab mutex to serialize outgoing data. */ - mutex_lock(&xprt->xpt_mutex); trace_svc_stats_latency(rqstp); - if (test_bit(XPT_DEAD, &xprt->xpt_flags) - || test_bit(XPT_CLOSE, &xprt->xpt_flags)) - len = -ENOTCONN; - else - len = xprt->xpt_ops->xpo_sendto(rqstp); - mutex_unlock(&xprt->xpt_mutex); + + len = xprt->xpt_ops->xpo_sendto(rqstp); + trace_svc_send(rqstp, len); svc_xprt_release(rqstp); diff --git a/net/sunrpc/svcsock.c b/net/sunrpc/svcsock.c index 023514e392b3..3e7b6445e317 100644 --- a/net/sunrpc/svcsock.c +++ b/net/sunrpc/svcsock.c @@ -506,6 +506,9 @@ out_free: * svc_udp_sendto - Send out a reply on a UDP socket * @rqstp: completed svc_rqst * + * xpt_mutex ensures @rqstp's whole message is written to the socket + * without interruption. + * * Returns the number of bytes sent, or a negative errno. 
*/ static int svc_udp_sendto(struct svc_rqst *rqstp) @@ -531,6 +534,11 @@ static int svc_udp_sendto(struct svc_rqst *rqstp) svc_set_cmsg_data(rqstp, cmh); + mutex_lock(&xprt->xpt_mutex); + + if (svc_xprt_is_dead(xprt)) + goto out_notconn; + err = xprt_sock_sendmsg(svsk->sk_sock, &msg, xdr, 0, 0, &sent); xdr_free_bvec(xdr); if (err == -ECONNREFUSED) { @@ -538,9 +546,15 @@ static int svc_udp_sendto(struct svc_rqst *rqstp) err = xprt_sock_sendmsg(svsk->sk_sock, &msg, xdr, 0, 0, &sent); xdr_free_bvec(xdr); } + + mutex_unlock(&xprt->xpt_mutex); if (err < 0) return err; return sent; + +out_notconn: + mutex_unlock(&xprt->xpt_mutex); + return -ENOTCONN; } static int svc_udp_has_wspace(struct svc_xprt *xprt) @@ -1063,6 +1077,9 @@ err_noclose: * svc_tcp_sendto - Send out a reply on a TCP socket * @rqstp: completed svc_rqst * + * xpt_mutex ensures @rqstp's whole message is written to the socket + * without interruption. + * * Returns the number of bytes sent, or a negative errno. */ static int svc_tcp_sendto(struct svc_rqst *rqstp) @@ -1080,12 +1097,19 @@ static int svc_tcp_sendto(struct svc_rqst *rqstp) svc_release_skb(rqstp); + mutex_lock(&xprt->xpt_mutex); + if (svc_xprt_is_dead(xprt)) + goto out_notconn; err = xprt_sock_sendmsg(svsk->sk_sock, &msg, xdr, 0, marker, &sent); xdr_free_bvec(xdr); if (err < 0 || sent != (xdr->len + sizeof(marker))) goto out_close; + mutex_unlock(&xprt->xpt_mutex); return sent; +out_notconn: + mutex_unlock(&xprt->xpt_mutex); + return -ENOTCONN; out_close: pr_notice("rpc-srv/tcp: %s: %s %d when sending %d bytes - shutting down socket\n", xprt->xpt_server->sv_name, @@ -1093,6 +1117,7 @@ out_close: (err < 0) ? err : sent, xdr->len); set_bit(XPT_CLOSE, &xprt->xpt_flags); svc_xprt_enqueue(xprt); + mutex_unlock(&xprt->xpt_mutex); return -EAGAIN; } diff --git a/net/sunrpc/xprtrdma/svc_rdma_backchannel.c b/net/sunrpc/xprtrdma/svc_rdma_backchannel.c index af7eb8d202ae..d9aab4504f2c 100644 --- a/net/sunrpc/xprtrdma/svc_rdma_backchannel.c +++ b/net/sunrpc/xprtrdma/svc_rdma_backchannel.c @@ -210,34 +210,31 @@ drop_connection: return -ENOTCONN; } -/* Send an RPC call on the passive end of a transport - * connection. 
+/** + * xprt_rdma_bc_send_request - Send a reverse-direction Call + * @rqst: rpc_rqst containing Call message to be sent + * + * Return values: + * %0 if the message was sent successfully + * %ENOTCONN if the message was not sent */ -static int -xprt_rdma_bc_send_request(struct rpc_rqst *rqst) +static int xprt_rdma_bc_send_request(struct rpc_rqst *rqst) { struct svc_xprt *sxprt = rqst->rq_xprt->bc_xprt; - struct svcxprt_rdma *rdma; + struct svcxprt_rdma *rdma = + container_of(sxprt, struct svcxprt_rdma, sc_xprt); int ret; dprintk("svcrdma: sending bc call with xid: %08x\n", be32_to_cpu(rqst->rq_xid)); - mutex_lock(&sxprt->xpt_mutex); - - ret = -ENOTCONN; - rdma = container_of(sxprt, struct svcxprt_rdma, sc_xprt); - if (!test_bit(XPT_DEAD, &sxprt->xpt_flags)) { - ret = rpcrdma_bc_send_request(rdma, rqst); - if (ret == -ENOTCONN) - svc_close_xprt(sxprt); - } + if (test_bit(XPT_DEAD, &sxprt->xpt_flags)) + return -ENOTCONN; - mutex_unlock(&sxprt->xpt_mutex); - - if (ret < 0) - return ret; - return 0; + ret = rpcrdma_bc_send_request(rdma, rqst); + if (ret == -ENOTCONN) + svc_close_xprt(sxprt); + return ret; } static void diff --git a/net/sunrpc/xprtrdma/svc_rdma_sendto.c b/net/sunrpc/xprtrdma/svc_rdma_sendto.c index b6c8643867f2..38e7c3c8c4a9 100644 --- a/net/sunrpc/xprtrdma/svc_rdma_sendto.c +++ b/net/sunrpc/xprtrdma/svc_rdma_sendto.c @@ -868,12 +868,10 @@ int svc_rdma_sendto(struct svc_rqst *rqstp) __be32 *p; int ret; - /* Create the RDMA response header. xprt->xpt_mutex, - * acquired in svc_send(), serializes RPC replies. The - * code path below that inserts the credit grant value - * into each transport header runs only inside this - * critical section. - */ + ret = -ENOTCONN; + if (svc_xprt_is_dead(xprt)) + goto err0; + ret = -ENOMEM; sctxt = svc_rdma_send_ctxt_get(rdma); if (!sctxt) diff --git a/net/sunrpc/xprtsock.c b/net/sunrpc/xprtsock.c index 845d0be805ec..839c49330785 100644 --- a/net/sunrpc/xprtsock.c +++ b/net/sunrpc/xprtsock.c @@ -2548,8 +2548,16 @@ static int bc_sendto(struct rpc_rqst *req) return sent; } -/* - * The send routine. Borrows from svc_send +/** + * bc_send_request - Send a backchannel Call on a TCP socket + * @req: rpc_rqst containing Call message to be sent + * + * xpt_mutex ensures @rqstp's whole message is written to the socket + * without interruption. + * + * Return values: + * %0 if the message was sent successfully + * %ENOTCONN if the message was not sent */ static int bc_send_request(struct rpc_rqst *req) { -- cgit v1.2.3 From ea740bd5f58e2912e74f401fd01a9d6aa985ca05 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Fri, 20 Mar 2020 17:32:41 -0400 Subject: svcrdma: Fix backchannel return code Way back when I was writing the RPC/RDMA server-side backchannel code, I misread the TCP backchannel reply handler logic. When svc_tcp_recvfrom() successfully receives a backchannel reply, it does not return -EAGAIN. It sets XPT_DATA and returns zero. Update svc_rdma_recvfrom() to return zero. Here, XPT_DATA doesn't need to be set again: it is set whenever a new message is received, behind a spin lock in a single threaded context. Also, if handling the cb reply is not successful, the message is simply dropped. There's no special message framing to deal with as there is in the TCP case. Now that the handle_bc_reply() return value is ignored, I've removed the dprintk call sites in the error exit of handle_bc_reply() in favor of trace points in other areas that already report the error cases. 
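For orientation, the return-value convention being matched here can be sketched as follows; this is a condensed illustration of how the svc receive path treats xpo_recvfrom()'s result, not the literal sunrpc dispatch code, and the error handling is simplified:

	struct svc_xprt *xprt = rqstp->rq_xprt;
	int len;

	len = xprt->xpt_ops->xpo_recvfrom(rqstp);
	if (len > 0) {
		svc_process(rqstp);	/* a forward-direction Call arrived: dispatch it */
	} else if (len == 0) {
		/* Nothing to dispatch: the message was consumed internally
		 * (e.g. a backchannel Reply) or dropped; XPT_DATA decides
		 * whether the transport gets polled again. */
	} else {
		/* -EAGAIN: no complete message was available yet. */
	}

With this patch, svc_rdma_recvfrom() lands in the middle case for backchannel Replies, matching the TCP behavior described above.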
Signed-off-by: Chuck Lever --- include/linux/sunrpc/svc_rdma.h | 5 ++-- net/sunrpc/xprtrdma/svc_rdma_backchannel.c | 38 ++++++++---------------------- net/sunrpc/xprtrdma/svc_rdma_recvfrom.c | 11 ++++----- 3 files changed, 17 insertions(+), 37 deletions(-) (limited to 'include/linux') diff --git a/include/linux/sunrpc/svc_rdma.h b/include/linux/sunrpc/svc_rdma.h index cbcfbd0521e3..8518c3f37e56 100644 --- a/include/linux/sunrpc/svc_rdma.h +++ b/include/linux/sunrpc/svc_rdma.h @@ -160,9 +160,8 @@ struct svc_rdma_send_ctxt { }; /* svc_rdma_backchannel.c */ -extern int svc_rdma_handle_bc_reply(struct rpc_xprt *xprt, - __be32 *rdma_resp, - struct xdr_buf *rcvbuf); +extern void svc_rdma_handle_bc_reply(struct svc_rqst *rqstp, + struct svc_rdma_recv_ctxt *rctxt); /* svc_rdma_recvfrom.c */ extern void svc_rdma_recv_ctxts_destroy(struct svcxprt_rdma *rdma); diff --git a/net/sunrpc/xprtrdma/svc_rdma_backchannel.c b/net/sunrpc/xprtrdma/svc_rdma_backchannel.c index d9aab4504f2c..b174f3b109a5 100644 --- a/net/sunrpc/xprtrdma/svc_rdma_backchannel.c +++ b/net/sunrpc/xprtrdma/svc_rdma_backchannel.c @@ -15,26 +15,25 @@ #undef SVCRDMA_BACKCHANNEL_DEBUG /** - * svc_rdma_handle_bc_reply - Process incoming backchannel reply - * @xprt: controlling backchannel transport - * @rdma_resp: pointer to incoming transport header - * @rcvbuf: XDR buffer into which to decode the reply + * svc_rdma_handle_bc_reply - Process incoming backchannel Reply + * @rqstp: resources for handling the Reply + * @rctxt: Received message * - * Returns: - * %0 if @rcvbuf is filled in, xprt_complete_rqst called, - * %-EAGAIN if server should call ->recvfrom again. */ -int svc_rdma_handle_bc_reply(struct rpc_xprt *xprt, __be32 *rdma_resp, - struct xdr_buf *rcvbuf) +void svc_rdma_handle_bc_reply(struct svc_rqst *rqstp, + struct svc_rdma_recv_ctxt *rctxt) { + struct svc_xprt *sxprt = rqstp->rq_xprt; + struct rpc_xprt *xprt = sxprt->xpt_bc_xprt; struct rpcrdma_xprt *r_xprt = rpcx_to_rdmax(xprt); + struct xdr_buf *rcvbuf = &rqstp->rq_arg; struct kvec *dst, *src = &rcvbuf->head[0]; + __be32 *rdma_resp = rctxt->rc_recv_buf; struct rpc_rqst *req; u32 credits; size_t len; __be32 xid; __be32 *p; - int ret; p = (__be32 *)src->iov_base; len = src->iov_len; @@ -49,14 +48,10 @@ int svc_rdma_handle_bc_reply(struct rpc_xprt *xprt, __be32 *rdma_resp, __func__, (int)len, p); #endif - ret = -EAGAIN; - if (src->iov_len < 24) - goto out_shortreply; - spin_lock(&xprt->queue_lock); req = xprt_lookup_rqst(xprt, xid); if (!req) - goto out_notfound; + goto out_unlock; dst = &req->rq_private_buf.head[0]; memcpy(&req->rq_private_buf, &req->rq_rcv_buf, sizeof(struct xdr_buf)); @@ -77,25 +72,12 @@ int svc_rdma_handle_bc_reply(struct rpc_xprt *xprt, __be32 *rdma_resp, spin_unlock(&xprt->transport_lock); spin_lock(&xprt->queue_lock); - ret = 0; xprt_complete_rqst(req->rq_task, rcvbuf->len); xprt_unpin_rqst(req); rcvbuf->len = 0; out_unlock: spin_unlock(&xprt->queue_lock); -out: - return ret; - -out_shortreply: - dprintk("svcrdma: short bc reply: xprt=%p, len=%zu\n", - xprt, src->iov_len); - goto out; - -out_notfound: - dprintk("svcrdma: unrecognized bc reply: xprt=%p, xid=%08x\n", - xprt, be32_to_cpu(xid)); - goto out_unlock; } /* Send a backwards direction RPC call. 
diff --git a/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c b/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c index efa5fcb5793f..eee7c6478b30 100644 --- a/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c +++ b/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c @@ -878,12 +878,9 @@ int svc_rdma_recvfrom(struct svc_rqst *rqstp) goto out_drop; rqstp->rq_xprt_hlen = ret; - if (svc_rdma_is_backchannel_reply(xprt, p)) { - ret = svc_rdma_handle_bc_reply(xprt->xpt_bc_xprt, p, - &rqstp->rq_arg); - svc_rdma_recv_ctxt_put(rdma_xprt, ctxt); - return ret; - } + if (svc_rdma_is_backchannel_reply(xprt, p)) + goto out_backchannel; + svc_rdma_get_inv_rkey(rdma_xprt, ctxt); p += rpcrdma_fixed_maxsz; @@ -913,6 +910,8 @@ out_postfail: svc_rdma_recv_ctxt_put(rdma_xprt, ctxt); return ret; +out_backchannel: + svc_rdma_handle_bc_reply(rqstp, ctxt); out_drop: svc_rdma_recv_ctxt_put(rdma_xprt, ctxt); return 0; -- cgit v1.2.3 From 08e3c9f181bfb78d842c4f432888116d66e07c2f Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Wed, 29 Apr 2020 16:00:12 -0400 Subject: svcrdma: Remove the SVCRDMA_DEBUG macro Clean up: Commit d21b05f101ae ("rdma: SVCRMDA Header File") introduced the SVCRDMA_DEBUG macro, but it doesn't seem to have been used. Signed-off-by: Chuck Lever --- include/linux/sunrpc/svc_rdma.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/sunrpc/svc_rdma.h b/include/linux/sunrpc/svc_rdma.h index 8518c3f37e56..7ed82625dc0b 100644 --- a/include/linux/sunrpc/svc_rdma.h +++ b/include/linux/sunrpc/svc_rdma.h @@ -48,7 +48,6 @@ #include #include #include -#define SVCRDMA_DEBUG /* Default and maximum inline threshold sizes */ enum { -- cgit v1.2.3 From 02648908d19a99532b0839959e38ae53d95d2798 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Tue, 17 Mar 2020 14:12:15 -0400 Subject: SUNRPC: Rename svc_sock::sk_reclen Clean up. I find the name of the svc_sock::sk_reclen field confusing, so I've changed it to better reflect its function. This field is not read directly to get the record length. Rather, it is a buffer containing a record marker that needs to be decoded. 
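For reference, the record marker kept in this buffer is the standard RPC-over-TCP fragment header (RFC 5531 record marking): a big-endian 32-bit word whose top bit flags the last fragment of a record and whose low 31 bits carry the fragment length. A minimal decoding sketch follows; the helper names here are illustrative, while the kernel's equivalents, svc_sock_reclen() and svc_sock_final_rec(), appear in the diff below:

	static inline u32 frag_len(__be32 marker)
	{
		/* Low 31 bits: number of bytes in this fragment. */
		return be32_to_cpu(marker) & 0x7fffffff;
	}

	static inline bool frag_is_last(__be32 marker)
	{
		/* Top bit: set on the final fragment of a record. */
		return be32_to_cpu(marker) & 0x80000000;
	}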
Signed-off-by: Chuck Lever --- include/linux/sunrpc/svcsock.h | 6 +++--- net/sunrpc/svcsock.c | 6 +++--- 2 files changed, 6 insertions(+), 6 deletions(-) (limited to 'include/linux') diff --git a/include/linux/sunrpc/svcsock.h b/include/linux/sunrpc/svcsock.h index 771baadaee9d..b7ac7fe68306 100644 --- a/include/linux/sunrpc/svcsock.h +++ b/include/linux/sunrpc/svcsock.h @@ -28,7 +28,7 @@ struct svc_sock { /* private TCP part */ /* On-the-wire fragment header: */ - __be32 sk_reclen; + __be32 sk_marker; /* As we receive a record, this includes the length received so * far (including the fragment header): */ u32 sk_tcplen; @@ -41,12 +41,12 @@ struct svc_sock { static inline u32 svc_sock_reclen(struct svc_sock *svsk) { - return ntohl(svsk->sk_reclen) & RPC_FRAGMENT_SIZE_MASK; + return be32_to_cpu(svsk->sk_marker) & RPC_FRAGMENT_SIZE_MASK; } static inline u32 svc_sock_final_rec(struct svc_sock *svsk) { - return ntohl(svsk->sk_reclen) & RPC_LAST_STREAM_FRAGMENT; + return be32_to_cpu(svsk->sk_marker) & RPC_LAST_STREAM_FRAGMENT; } /* diff --git a/net/sunrpc/svcsock.c b/net/sunrpc/svcsock.c index 4ac1180c6306..d63b21f3f207 100644 --- a/net/sunrpc/svcsock.c +++ b/net/sunrpc/svcsock.c @@ -841,7 +841,7 @@ static int svc_tcp_recv_record(struct svc_sock *svsk, struct svc_rqst *rqstp) struct kvec iov; want = sizeof(rpc_fraghdr) - svsk->sk_tcplen; - iov.iov_base = ((char *) &svsk->sk_reclen) + svsk->sk_tcplen; + iov.iov_base = ((char *)&svsk->sk_marker) + svsk->sk_tcplen; iov.iov_len = want; len = svc_recvfrom(rqstp, &iov, 1, want, 0); if (len < 0) @@ -938,7 +938,7 @@ static void svc_tcp_fragment_received(struct svc_sock *svsk) svc_sock_final_rec(svsk) ? "final" : "nonfinal", svc_sock_reclen(svsk)); svsk->sk_tcplen = 0; - svsk->sk_reclen = 0; + svsk->sk_marker = xdr_zero; } /* @@ -1154,7 +1154,7 @@ static void svc_tcp_init(struct svc_sock *svsk, struct svc_serv *serv) sk->sk_data_ready = svc_data_ready; sk->sk_write_space = svc_write_space; - svsk->sk_reclen = 0; + svsk->sk_marker = xdr_zero; svsk->sk_tcplen = 0; svsk->sk_datalen = 0; memset(&svsk->sk_pages[0], 0, sizeof(svsk->sk_pages)); -- cgit v1.2.3 From 356d411c26735bcc62718c4c9181014255dc302d Mon Sep 17 00:00:00 2001 From: Raed Salem Date: Fri, 15 May 2020 15:16:52 -0700 Subject: net/mlx5: Cleanup mlx5_ifc_fte_match_set_misc2_bits Remove the "metadata_reg_b" field and all uses of this field in code to match the device specification. As this field is not in use in SW steering it is safe to remove it. 
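The layout arithmetic in the diff below is worth spelling out: in mlx5_ifc_fte_match_set_misc2_bits, the removed metadata_reg_b[0x20] sat immediately before reserved_at_1c0[0x40], so merging them yields a single reserved_at_1a0[0x60] gap (0x20 + 0x40 = 0x60 bits) and the structure's overall size is unchanged. The software mirror in mlx5dr_match_misc2 gets the same accounting: the 4-byte u32 plus reserved_auto2[8] become reserved_auto2[12].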
Signed-off-by: Raed Salem Reviewed-by: Alex Vesker Signed-off-by: Saeed Mahameed --- drivers/net/ethernet/mellanox/mlx5/core/steering/dr_ste.c | 1 - drivers/net/ethernet/mellanox/mlx5/core/steering/dr_types.h | 3 +-- include/linux/mlx5/mlx5_ifc.h | 4 +--- 3 files changed, 2 insertions(+), 6 deletions(-) (limited to 'include/linux') diff --git a/drivers/net/ethernet/mellanox/mlx5/core/steering/dr_ste.c b/drivers/net/ethernet/mellanox/mlx5/core/steering/dr_ste.c index c0e3a1e7389d..78c884911ceb 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/steering/dr_ste.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/steering/dr_ste.c @@ -961,7 +961,6 @@ static void dr_ste_copy_mask_misc2(char *mask, struct mlx5dr_match_misc2 *spec) spec->metadata_reg_c_1 = MLX5_GET(fte_match_set_misc2, mask, metadata_reg_c_1); spec->metadata_reg_c_0 = MLX5_GET(fte_match_set_misc2, mask, metadata_reg_c_0); spec->metadata_reg_a = MLX5_GET(fte_match_set_misc2, mask, metadata_reg_a); - spec->metadata_reg_b = MLX5_GET(fte_match_set_misc2, mask, metadata_reg_b); } static void dr_ste_copy_mask_misc3(char *mask, struct mlx5dr_match_misc3 *spec) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/steering/dr_types.h b/drivers/net/ethernet/mellanox/mlx5/core/steering/dr_types.h index 984783238baa..71fa01ce348a 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/steering/dr_types.h +++ b/drivers/net/ethernet/mellanox/mlx5/core/steering/dr_types.h @@ -554,8 +554,7 @@ struct mlx5dr_match_misc2 { u32 metadata_reg_c_1; /* metadata_reg_c_1 */ u32 metadata_reg_c_0; /* metadata_reg_c_0 */ u32 metadata_reg_a; /* metadata_reg_a */ - u32 metadata_reg_b; /* metadata_reg_b */ - u8 reserved_auto2[8]; + u8 reserved_auto2[12]; }; struct mlx5dr_match_misc3 { diff --git a/include/linux/mlx5/mlx5_ifc.h b/include/linux/mlx5/mlx5_ifc.h index c9dd6e99ad56..fd8da4875ea0 100644 --- a/include/linux/mlx5/mlx5_ifc.h +++ b/include/linux/mlx5/mlx5_ifc.h @@ -584,9 +584,7 @@ struct mlx5_ifc_fte_match_set_misc2_bits { u8 metadata_reg_a[0x20]; - u8 metadata_reg_b[0x20]; - - u8 reserved_at_1c0[0x40]; + u8 reserved_at_1a0[0x60]; }; struct mlx5_ifc_fte_match_set_misc3_bits { -- cgit v1.2.3 From 555af0c3fa0b632be73c241cc932129af4b70d27 Mon Sep 17 00:00:00 2001 From: Parav Pandit Date: Fri, 15 May 2020 15:16:53 -0700 Subject: net/mlx5: Move iseg access helper routines close to mlx5_core driver Only the mlx5_core driver handles the fw initialization check and the command interface revision check. Hence move them inside the mlx5_core driver, where they are used. This avoids exposing these helpers to all mlx5 drivers.
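As a rough sketch of how the relocated fw_initializing() helper is consumed on the probe path (condensed from wait_fw_init() in main.c; the poll interval and bookkeeping here are simplified, not the exact driver code):

	static int wait_fw_init_sketch(struct mlx5_core_dev *dev, u32 max_wait_ms)
	{
		unsigned long end = jiffies + msecs_to_jiffies(max_wait_ms);

		/* Bit 31 of iseg->initializing stays set until FW is ready. */
		while (fw_initializing(dev)) {
			if (time_after(jiffies, end))
				return -EBUSY;
			msleep(20);	/* illustrative poll interval */
		}
		return 0;
	}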
Signed-off-by: Parav Pandit Signed-off-by: Saeed Mahameed --- drivers/net/ethernet/mellanox/mlx5/core/cmd.c | 5 +++++ drivers/net/ethernet/mellanox/mlx5/core/main.c | 5 +++++ include/linux/mlx5/driver.h | 10 ---------- 3 files changed, 10 insertions(+), 10 deletions(-) (limited to 'include/linux') diff --git a/drivers/net/ethernet/mellanox/mlx5/core/cmd.c b/drivers/net/ethernet/mellanox/mlx5/core/cmd.c index 34cba97f7bf4..e6567d5570ba 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/cmd.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/cmd.c @@ -1890,6 +1890,11 @@ static void free_cmd_page(struct mlx5_core_dev *dev, struct mlx5_cmd *cmd) cmd->alloc_dma); } +static u16 cmdif_rev(struct mlx5_core_dev *dev) +{ + return ioread32be(&dev->iseg->cmdif_rev_fw_sub) >> 16; +} + int mlx5_cmd_init(struct mlx5_core_dev *dev) { int size = sizeof(struct mlx5_cmd_prot_block); diff --git a/drivers/net/ethernet/mellanox/mlx5/core/main.c b/drivers/net/ethernet/mellanox/mlx5/core/main.c index 061b69ea9cc4..8a375e3ed5c1 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/main.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/main.c @@ -177,6 +177,11 @@ static struct mlx5_profile profile[] = { #define FW_PRE_INIT_TIMEOUT_MILI 120000 #define FW_INIT_WARN_MESSAGE_INTERVAL 20000 +static int fw_initializing(struct mlx5_core_dev *dev) +{ + return ioread32be(&dev->iseg->initializing) >> 31; +} + static int wait_fw_init(struct mlx5_core_dev *dev, u32 max_wait_mili, u32 warn_time_mili) { diff --git a/include/linux/mlx5/driver.h b/include/linux/mlx5/driver.h index 24e04901f92e..a988eb405aa6 100644 --- a/include/linux/mlx5/driver.h +++ b/include/linux/mlx5/driver.h @@ -823,11 +823,6 @@ static inline u16 fw_rev_sub(struct mlx5_core_dev *dev) return ioread32be(&dev->iseg->cmdif_rev_fw_sub) & 0xffff; } -static inline u16 cmdif_rev(struct mlx5_core_dev *dev) -{ - return ioread32be(&dev->iseg->cmdif_rev_fw_sub) >> 16; -} - static inline u32 mlx5_base_mkey(const u32 key) { return key & 0xffffff00u; @@ -1012,11 +1007,6 @@ int mlx5_core_roce_gid_set(struct mlx5_core_dev *dev, unsigned int index, u8 roce_version, u8 roce_l3_type, const u8 *gid, const u8 *mac, bool vlan, u16 vlan_id, u8 port_num); -static inline int fw_initializing(struct mlx5_core_dev *dev) -{ - return ioread32be(&dev->iseg->initializing) >> 31; -} - static inline u32 mlx5_mkey_to_idx(u32 mkey) { return mkey >> 8; -- cgit v1.2.3 From 51189c7a7ed1b4ed4493e27275d466ff60406d3a Mon Sep 17 00:00:00 2001 From: Will Deacon Date: Fri, 15 May 2020 14:11:05 +0100 Subject: arm64: scs: Store absolute SCS stack pointer value in thread_info Storing the SCS information in thread_info as a {base,offset} pair introduces an additional load instruction on the ret-to-user path, since the SCS stack pointer in x18 has to be converted back to an offset by subtracting the base. Replace the offset with the absolute SCS stack pointer value instead and avoid the redundant load. 
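The instruction the commit message counts can be read straight out of the scs_load macro in the diff below; schematically, the reload of x18 on the return path changes from:

	ldp	x18, tmp, [tsk, #TSK_TI_SCS_BASE]	// load the {base, offset} pair
	add	x18, x18, tmp				// rebuild the SCS pointer

to the single:

	ldr	x18, [tsk, #TSK_TI_SCS_SP]		// load the saved pointer directly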
Tested-by: Sami Tolvanen Reviewed-by: Mark Rutland Signed-off-by: Will Deacon --- arch/arm64/include/asm/scs.h | 9 ++++----- arch/arm64/include/asm/thread_info.h | 4 ++-- arch/arm64/kernel/asm-offsets.c | 2 +- include/linux/scs.h | 8 ++++---- kernel/scs.c | 3 +-- 5 files changed, 12 insertions(+), 14 deletions(-) (limited to 'include/linux') diff --git a/arch/arm64/include/asm/scs.h b/arch/arm64/include/asm/scs.h index 96549353b0cb..6b8cf4352fe3 100644 --- a/arch/arm64/include/asm/scs.h +++ b/arch/arm64/include/asm/scs.h @@ -4,16 +4,15 @@ #ifdef __ASSEMBLY__ +#include + #ifdef CONFIG_SHADOW_CALL_STACK .macro scs_load tsk, tmp - ldp x18, \tmp, [\tsk, #TSK_TI_SCS_BASE] - add x18, x18, \tmp + ldr x18, [\tsk, #TSK_TI_SCS_SP] .endm .macro scs_save tsk, tmp - ldr \tmp, [\tsk, #TSK_TI_SCS_BASE] - sub \tmp, x18, \tmp - str \tmp, [\tsk, #TSK_TI_SCS_OFFSET] + str x18, [\tsk, #TSK_TI_SCS_SP] .endm #else .macro scs_load tsk, tmp diff --git a/arch/arm64/include/asm/thread_info.h b/arch/arm64/include/asm/thread_info.h index 9df79c0a4c43..6ea8b6a26ae9 100644 --- a/arch/arm64/include/asm/thread_info.h +++ b/arch/arm64/include/asm/thread_info.h @@ -43,7 +43,7 @@ struct thread_info { }; #ifdef CONFIG_SHADOW_CALL_STACK void *scs_base; - unsigned long scs_offset; + void *scs_sp; #endif }; @@ -107,7 +107,7 @@ void arch_release_task_struct(struct task_struct *tsk); #ifdef CONFIG_SHADOW_CALL_STACK #define INIT_SCS \ .scs_base = init_shadow_call_stack, \ - .scs_offset = 0, + .scs_sp = init_shadow_call_stack, #else #define INIT_SCS #endif diff --git a/arch/arm64/kernel/asm-offsets.c b/arch/arm64/kernel/asm-offsets.c index d7934250b68c..a098a45f63d8 100644 --- a/arch/arm64/kernel/asm-offsets.c +++ b/arch/arm64/kernel/asm-offsets.c @@ -36,7 +36,7 @@ int main(void) #endif #ifdef CONFIG_SHADOW_CALL_STACK DEFINE(TSK_TI_SCS_BASE, offsetof(struct task_struct, thread_info.scs_base)); - DEFINE(TSK_TI_SCS_OFFSET, offsetof(struct task_struct, thread_info.scs_offset)); + DEFINE(TSK_TI_SCS_SP, offsetof(struct task_struct, thread_info.scs_sp)); #endif DEFINE(TSK_STACK, offsetof(struct task_struct, stack)); #ifdef CONFIG_STACKPROTECTOR diff --git a/include/linux/scs.h b/include/linux/scs.h index 3f3662621a27..0eb2485ef832 100644 --- a/include/linux/scs.h +++ b/include/linux/scs.h @@ -27,7 +27,7 @@ #define SCS_END_MAGIC (0x5f6UL + POISON_POINTER_DELTA) #define task_scs(tsk) (task_thread_info(tsk)->scs_base) -#define task_scs_offset(tsk) (task_thread_info(tsk)->scs_offset) +#define task_scs_sp(tsk) (task_thread_info(tsk)->scs_sp) void scs_init(void); int scs_prepare(struct task_struct *tsk, int node); @@ -39,7 +39,7 @@ static inline void scs_task_reset(struct task_struct *tsk) * Reset the shadow stack to the base address in case the task * is reused. 
*/ - task_scs_offset(tsk) = 0; + task_scs_sp(tsk) = task_scs(tsk); } static inline unsigned long *__scs_magic(void *s) @@ -50,9 +50,9 @@ static inline unsigned long *__scs_magic(void *s) static inline bool scs_corrupted(struct task_struct *tsk) { unsigned long *magic = __scs_magic(task_scs(tsk)); + unsigned long sz = task_scs_sp(tsk) - task_scs(tsk); - return (task_scs_offset(tsk) >= SCS_SIZE - 1 || - READ_ONCE_NOCHECK(*magic) != SCS_END_MAGIC); + return sz >= SCS_SIZE - 1 || READ_ONCE_NOCHECK(*magic) != SCS_END_MAGIC; } #else /* CONFIG_SHADOW_CALL_STACK */ diff --git a/kernel/scs.c b/kernel/scs.c index 9389c28f0853..5ff8663e4a67 100644 --- a/kernel/scs.c +++ b/kernel/scs.c @@ -60,8 +60,7 @@ int scs_prepare(struct task_struct *tsk, int node) if (!s) return -ENOMEM; - task_scs(tsk) = s; - task_scs_offset(tsk) = 0; + task_scs(tsk) = task_scs_sp(tsk) = s; scs_account(tsk, 1); return 0; } -- cgit v1.2.3 From 88485be531f4aee841ddc53b56e2f6e6a338854d Mon Sep 17 00:00:00 2001 From: Will Deacon Date: Fri, 15 May 2020 14:56:05 +0100 Subject: scs: Move scs_overflow_check() out of architecture code There is nothing architecture-specific about scs_overflow_check() as it's just a trivial wrapper around scs_corrupted(). For parity with task_stack_end_corrupted(), rename scs_corrupted() to task_scs_end_corrupted() and call it from schedule_debug() when CONFIG_SCHED_STACK_END_CHECK is enabled, which better reflects its purpose as a debug feature to catch inadvertent overflow of the SCS. Finally, remove the unused scs_overflow_check() function entirely. This has absolutely no impact on architectures that do not support SCS (currently arm64 only). Tested-by: Sami Tolvanen Reviewed-by: Mark Rutland Signed-off-by: Will Deacon --- arch/arm64/include/asm/scs.h | 18 ------------------ arch/arm64/kernel/process.c | 2 -- arch/arm64/kernel/scs.c | 2 +- include/linux/scs.h | 4 ++-- kernel/sched/core.c | 3 +++ kernel/scs.c | 3 ++- 6 files changed, 8 insertions(+), 24 deletions(-) (limited to 'include/linux') diff --git a/arch/arm64/include/asm/scs.h b/arch/arm64/include/asm/scs.h index d46efdd2060a..eaa2cd92e4c1 100644 --- a/arch/arm64/include/asm/scs.h +++ b/arch/arm64/include/asm/scs.h @@ -24,24 +24,6 @@ .endm #endif /* CONFIG_SHADOW_CALL_STACK */ -#else /* __ASSEMBLY__ */ - -#include - -#ifdef CONFIG_SHADOW_CALL_STACK - -static inline void scs_overflow_check(struct task_struct *tsk) -{ - if (unlikely(scs_corrupted(tsk))) - panic("corrupted shadow stack detected inside scheduler\n"); -} - -#else /* CONFIG_SHADOW_CALL_STACK */ - -static inline void scs_overflow_check(struct task_struct *tsk) {} - -#endif /* CONFIG_SHADOW_CALL_STACK */ - #endif /* __ASSEMBLY __ */ #endif /* _ASM_SCS_H */ diff --git a/arch/arm64/kernel/process.c b/arch/arm64/kernel/process.c index a35d3318492c..56be4cbf771f 100644 --- a/arch/arm64/kernel/process.c +++ b/arch/arm64/kernel/process.c @@ -52,7 +52,6 @@ #include #include #include -#include #include #if defined(CONFIG_STACKPROTECTOR) && !defined(CONFIG_STACKPROTECTOR_PER_TASK) @@ -516,7 +515,6 @@ __notrace_funcgraph struct task_struct *__switch_to(struct task_struct *prev, entry_task_switch(next); uao_thread_switch(next); ssbs_thread_switch(next); - scs_overflow_check(next); /* * Complete any pending TLB or cache maintenance on this CPU in case diff --git a/arch/arm64/kernel/scs.c b/arch/arm64/kernel/scs.c index adc97f826fab..955875dff9e1 100644 --- a/arch/arm64/kernel/scs.c +++ b/arch/arm64/kernel/scs.c @@ -6,7 +6,7 @@ */ #include -#include +#include /* Allocate a static per-CPU shadow
stack */ #define DEFINE_SCS(name) \ diff --git a/include/linux/scs.h b/include/linux/scs.h index 0eb2485ef832..2fd3df50e93e 100644 --- a/include/linux/scs.h +++ b/include/linux/scs.h @@ -47,7 +47,7 @@ static inline unsigned long *__scs_magic(void *s) return (unsigned long *)(s + SCS_SIZE) - 1; } -static inline bool scs_corrupted(struct task_struct *tsk) +static inline bool task_scs_end_corrupted(struct task_struct *tsk) { unsigned long *magic = __scs_magic(task_scs(tsk)); unsigned long sz = task_scs_sp(tsk) - task_scs(tsk); @@ -60,8 +60,8 @@ static inline bool scs_corrupted(struct task_struct *tsk) static inline void scs_init(void) {} static inline void scs_task_reset(struct task_struct *tsk) {} static inline int scs_prepare(struct task_struct *tsk, int node) { return 0; } -static inline bool scs_corrupted(struct task_struct *tsk) { return false; } static inline void scs_release(struct task_struct *tsk) {} +static inline bool task_scs_end_corrupted(struct task_struct *tsk) { return false; } #endif /* CONFIG_SHADOW_CALL_STACK */ diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 934e03cfaec7..a1d815a11b90 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -3878,6 +3878,9 @@ static inline void schedule_debug(struct task_struct *prev, bool preempt) #ifdef CONFIG_SCHED_STACK_END_CHECK if (task_stack_end_corrupted(prev)) panic("corrupted stack end detected inside scheduler\n"); + + if (task_scs_end_corrupted(prev)) + panic("corrupted shadow stack detected inside scheduler\n"); #endif #ifdef CONFIG_DEBUG_ATOMIC_SLEEP diff --git a/kernel/scs.c b/kernel/scs.c index aea841cd7586..faf0ecd7b893 100644 --- a/kernel/scs.c +++ b/kernel/scs.c @@ -98,7 +98,8 @@ void scs_release(struct task_struct *tsk) if (!s) return; - WARN(scs_corrupted(tsk), "corrupted shadow stack detected when freeing task\n"); + WARN(task_scs_end_corrupted(tsk), + "corrupted shadow stack detected when freeing task\n"); scs_check_usage(tsk); scs_free(s); } -- cgit v1.2.3 From 871e100e432c651c9c46fb9c3184b4577e0de3ae Mon Sep 17 00:00:00 2001 From: Will Deacon Date: Fri, 15 May 2020 16:17:12 +0100 Subject: scs: Move DEFINE_SCS macro into core code Defining static shadow call stacks is not architecture-specific, so move the DEFINE_SCS() macro into the core header file. Tested-by: Sami Tolvanen Reviewed-by: Mark Rutland Signed-off-by: Will Deacon --- arch/arm64/kernel/scs.c | 4 ---- include/linux/scs.h | 4 ++++ 2 files changed, 4 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/arch/arm64/kernel/scs.c b/arch/arm64/kernel/scs.c index 955875dff9e1..e8f7ff45dd8f 100644 --- a/arch/arm64/kernel/scs.c +++ b/arch/arm64/kernel/scs.c @@ -8,10 +8,6 @@ #include #include -/* Allocate a static per-CPU shadow stack */ -#define DEFINE_SCS(name) \ - DEFINE_PER_CPU(unsigned long [SCS_SIZE/sizeof(long)], name) \ - DEFINE_SCS(irq_shadow_call_stack); #ifdef CONFIG_ARM_SDE_INTERFACE diff --git a/include/linux/scs.h b/include/linux/scs.h index 2fd3df50e93e..6dec390cf154 100644 --- a/include/linux/scs.h +++ b/include/linux/scs.h @@ -26,6 +26,10 @@ /* An illegal pointer value to mark the end of the shadow stack. 
*/ #define SCS_END_MAGIC (0x5f6UL + POISON_POINTER_DELTA) +/* Allocate a static per-CPU shadow stack */ +#define DEFINE_SCS(name) \ + DEFINE_PER_CPU(unsigned long [SCS_SIZE/sizeof(long)], name) \ + #define task_scs(tsk) (task_thread_info(tsk)->scs_base) #define task_scs_sp(tsk) (task_thread_info(tsk)->scs_sp) -- cgit v1.2.3 From 220995622da5317714b5fe659165735f7b44b87e Mon Sep 17 00:00:00 2001 From: Douglas Anderson Date: Thu, 7 May 2020 13:08:46 -0700 Subject: kgdboc: Add kgdboc_earlycon to support early kgdb using boot consoles We want to enable kgdb to debug the early parts of the kernel. Unfortunately kgdb normally is a client of the tty API in the kernel and serial drivers don't register to the tty layer until fairly late in the boot process. Serial drivers do, however, commonly register a boot console. Let's enable the kgdboc driver to work with boot consoles to provide early debugging. This change co-opts the existing read() function pointer that's part of "struct console". It's assumed that if a boot console (with the flag CON_BOOT) has implemented read() that both the read() and write() functions are polling functions. That means they work without interrupts and read() will return immediately (with 0 bytes read) if there's nothing to read. This should be a safe assumption since it appears that no current boot consoles implement read() right now and there seems no reason to do so unless they wanted to support "kgdboc_earlycon". The normal/expected way to make all this work is to use "kgdboc_earlycon" and "kgdboc" together. You should point them both to the same physical serial connection. At boot time, as the system transitions from the boot console to the normal console (and registers a tty), kgdb will switch over. One awkward part of all this, though, is that there can be a window where the boot console goes away and we can't quite transition over to the main kgdboc that uses the tty layer. There are two main problems: 1. The act of registering the tty doesn't cause any call into kgdboc so there is a window of time when the tty is there but kgdboc's init code hasn't been called so we can't transition to it. 2. On some serial drivers the normal console inits (and replaces the boot console) quite early in the system. Presumably these drivers were coded up before earlycon worked as well as it does today and probably they don't need to do this anymore, but it causes us problems nonetheless. Problem #1 is not too big of a deal, somewhat due to the luck of probe ordering. kgdboc is last in the tty/serial/Makefile so its probe runs right after all other tty devices. It's not fun to rely on this, but it does work for the most part. Problem #2 is a big deal, but only for some serial drivers. Other serial drivers end up registering the console (which gets rid of the boot console) and tty at nearly the same time. The way we'll deal with the window between when the system has stopped using the boot console and when we're set up using the tty is to keep using the boot console. This may sound surprising, but it has been found to work well in practice. If it doesn't work, it shouldn't be too hard for a given serial driver to make it keep working. Specifically, it's expected that the read()/write() functions provided in the boot console should be the same (or nearly the same) as the normal kgdb polling functions. That means continuing to use them should work just fine.
To make things even more likely to work, we'll also trap the recently added exit() function in the boot console we're using and delay any calls to it until we're all done with the boot console. NOTE: there could be ways to use all this in weird / unexpected ways. If you do something like this, it's a bit of a buyer beware situation. Specifically: - If you specify only "kgdboc_earlycon" but not "kgdboc" then (depending on your serial driver) things will probably work OK, but you'll get a warning printed the first time you use kgdb after the boot console is gone. You'd only be able to do this, of course, if the serial driver you're running atop provided an early boot console. - If your "kgdboc_earlycon" and "kgdboc" devices are not the same device things should work OK, but it'll be your job to switch over which device you're monitoring (including figuring out how to switch over gdb in-flight if you're using it). When trying to enable "kgdboc_earlycon" it should be noted that the names that are registered through the boot console layer and the tty layer are not the same for the same port. For example when debugging on one board I'd need to pass "kgdboc_earlycon=qcom_geni kgdboc=ttyMSM0" to enable things properly. Since digging up the boot console name is a pain and there will rarely be more than one boot console enabled, you can provide the "kgdboc_earlycon" parameter without specifying the name of the boot console. In this case we'll just pick the first boot console we find that implements read(). This new "kgdboc_earlycon" parameter should be contrasted to the existing "ekgdboc" parameter. While both provide a way to debug very early, the usage and mechanisms are quite different. Specifically "kgdboc_earlycon" is meant to be used in tandem with "kgdboc" and there is a transition from one to the other. The "ekgdboc" parameter, on the other hand, replaces the "kgdboc" parameter. It runs the same logic as the "kgdboc" parameter but just relies on your TTY driver being present super early. The only known usage of the old "ekgdboc" parameter is documented as "ekgdboc=kbd earlyprintk=vga". It should be noted that "kbd" has special treatment allowing it to init early as a tty device.
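To make the contract above concrete, here is a sketch of the polling read() a boot console would need to supply. Only the shape of the console ->read()/->write() hooks comes from this patch; the MY_UART_* register names and the register-level logic are hypothetical:

	static int my_earlycon_read(struct console *con, char *s, unsigned int n)
	{
		struct earlycon_device *dev = con->data;
		struct uart_port *port = &dev->port;
		unsigned int num = 0;

		while (num < n) {
			/* MY_UART_STAT / MY_UART_RX are hypothetical registers. */
			if (!(readl(port->membase + MY_UART_STAT) & MY_UART_RX_READY))
				break;	/* nothing pending: return immediately */
			s[num++] = readl(port->membase + MY_UART_RX);
		}
		return num;	/* may be 0, which kgdboc maps to NO_POLL_CHAR */
	}

A driver's earlycon setup function would then hook this up alongside its write() implementation, e.g. device->con->read = my_earlycon_read;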
Signed-off-by: Douglas Anderson Reviewed-by: Greg Kroah-Hartman Tested-by: Sumit Garg Link: https://lore.kernel.org/r/20200507130644.v4.8.I8fba5961bf452ab92350654aa61957f23ecf0100@changeid Signed-off-by: Daniel Thompson --- drivers/tty/serial/kgdboc.c | 136 ++++++++++++++++++++++++++++++++++++++++++++ include/linux/kgdb.h | 4 ++ kernel/debug/debug_core.c | 22 +++++-- 3 files changed, 158 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/drivers/tty/serial/kgdboc.c b/drivers/tty/serial/kgdboc.c index a260ecd13e8f..34b5e91dd245 100644 --- a/drivers/tty/serial/kgdboc.c +++ b/drivers/tty/serial/kgdboc.c @@ -21,6 +21,7 @@ #include #include #include +#include #define MAX_CONFIG_LEN 40 @@ -42,6 +43,10 @@ static int kgdb_tty_line; static struct platform_device *kgdboc_pdev; +static struct kgdb_io kgdboc_earlycon_io_ops; +static struct console *earlycon; +static int (*earlycon_orig_exit)(struct console *con); + #ifdef CONFIG_KDB_KEYBOARD static int kgdboc_reset_connect(struct input_handler *handler, struct input_dev *dev, @@ -137,6 +142,9 @@ static void kgdboc_unregister_kbd(void) static void cleanup_kgdboc(void) { + if (earlycon) + kgdb_unregister_io_module(&kgdboc_earlycon_io_ops); + if (configured != 1) return; @@ -409,6 +417,134 @@ static int __init kgdboc_early_init(char *opt) early_param("ekgdboc", kgdboc_early_init); +static int kgdboc_earlycon_get_char(void) +{ + char c; + + if (!earlycon->read(earlycon, &c, 1)) + return NO_POLL_CHAR; + + return c; +} + +static void kgdboc_earlycon_put_char(u8 chr) +{ + earlycon->write(earlycon, &chr, 1); +} + +static void kgdboc_earlycon_pre_exp_handler(void) +{ + struct console *con; + static bool already_warned; + + if (already_warned) + return; + + /* + * When the first normal console comes up the kernel will take all + * the boot consoles out of the list. Really, we should stop using + * the boot console when it does that but until a TTY is registered + * we have no other choice so we keep using it. Since not all + * serial drivers might be OK with this, print a warning once per + * boot if we detect this case. + */ + for_each_console(con) + if (con == earlycon) + return; + + already_warned = true; + pr_warn("kgdboc_earlycon is still using bootconsole\n"); +} + +static int kgdboc_earlycon_deferred_exit(struct console *con) +{ + /* + * If we get here it means the boot console is going away but we + * don't yet have a suitable replacement. Don't pass through to + * the original exit routine. We'll call it later in our deinit() + * function. For now, restore the original exit() function pointer + * as a sentinal that we've hit this point. + */ + con->exit = earlycon_orig_exit; + + return 0; +} + +static void kgdboc_earlycon_deinit(void) +{ + if (!earlycon) + return; + + if (earlycon->exit == kgdboc_earlycon_deferred_exit) + /* + * kgdboc_earlycon is exiting but original boot console exit + * was never called (AKA kgdboc_earlycon_deferred_exit() + * didn't ever run). Undo our trap. + */ + earlycon->exit = earlycon_orig_exit; + else if (earlycon->exit) + /* + * We skipped calling the exit() routine so we could try to + * keep using the boot console even after it went away. We're + * finally done so call the function now. 
+ */ + earlycon->exit(earlycon); + + earlycon = NULL; +} + +static struct kgdb_io kgdboc_earlycon_io_ops = { + .name = "kgdboc_earlycon", + .read_char = kgdboc_earlycon_get_char, + .write_char = kgdboc_earlycon_put_char, + .pre_exception = kgdboc_earlycon_pre_exp_handler, + .deinit = kgdboc_earlycon_deinit, + .is_console = true, +}; + +static int __init kgdboc_earlycon_init(char *opt) +{ + struct console *con; + + kdb_init(KDB_INIT_EARLY); + + /* + * Look for a matching console, or if the name was left blank just + * pick the first one we find. + */ + console_lock(); + for_each_console(con) { + if (con->write && con->read && + (con->flags & (CON_BOOT | CON_ENABLED)) && + (!opt || !opt[0] || strcmp(con->name, opt) == 0)) + break; + } + + if (!con) { + pr_info("Couldn't find kgdb earlycon\n"); + goto unlock; + } + + earlycon = con; + pr_info("Going to register kgdb with earlycon '%s'\n", con->name); + if (kgdb_register_io_module(&kgdboc_earlycon_io_ops) != 0) { + earlycon = NULL; + pr_info("Failed to register kgdb with earlycon\n"); + } else { + /* Trap exit so we can keep earlycon longer if needed. */ + earlycon_orig_exit = con->exit; + con->exit = kgdboc_earlycon_deferred_exit; + } + +unlock: + console_unlock(); + + /* Non-zero means malformed option so we always return zero */ + return 0; +} + +early_param("kgdboc_earlycon", kgdboc_earlycon_init); + module_init(init_kgdboc); module_exit(exit_kgdboc); module_param_call(kgdboc, param_set_kgdboc_var, param_get_string, &kps, 0644); diff --git a/include/linux/kgdb.h b/include/linux/kgdb.h index c2caee08e418..c62d76478adc 100644 --- a/include/linux/kgdb.h +++ b/include/linux/kgdb.h @@ -269,6 +269,9 @@ struct kgdb_arch { * @write_char: Pointer to a function that will write one char. * @flush: Pointer to a function that will flush any pending writes. * @init: Pointer to a function that will initialize the device. + * @deinit: Pointer to a function that will deinit the device. Implies that + * this I/O driver is temporary and expects to be replaced. Called when + * an I/O driver is replaced or explicitly unregistered. * @pre_exception: Pointer to a function that will do any prep work for * the I/O driver. 
* @post_exception: Pointer to a function that will do any cleanup work @@ -282,6 +285,7 @@ struct kgdb_io { void (*write_char) (u8); void (*flush) (void); int (*init) (void); + void (*deinit) (void); void (*pre_exception) (void); void (*post_exception) (void); int is_console; diff --git a/kernel/debug/debug_core.c b/kernel/debug/debug_core.c index e2d67b163fb6..4d59aa907fdc 100644 --- a/kernel/debug/debug_core.c +++ b/kernel/debug/debug_core.c @@ -1073,15 +1073,23 @@ EXPORT_SYMBOL_GPL(kgdb_schedule_breakpoint); */ int kgdb_register_io_module(struct kgdb_io *new_dbg_io_ops) { + struct kgdb_io *old_dbg_io_ops; int err; spin_lock(&kgdb_registration_lock); - if (dbg_io_ops) { - spin_unlock(&kgdb_registration_lock); + old_dbg_io_ops = dbg_io_ops; + if (old_dbg_io_ops) { + if (!old_dbg_io_ops->deinit) { + spin_unlock(&kgdb_registration_lock); - pr_err("Another I/O driver is already registered with KGDB\n"); - return -EBUSY; + pr_err("KGDB I/O driver %s can't replace %s.\n", + new_dbg_io_ops->name, old_dbg_io_ops->name); + return -EBUSY; + } + pr_info("Replacing I/O driver %s with %s\n", + old_dbg_io_ops->name, new_dbg_io_ops->name); + old_dbg_io_ops->deinit(); } if (new_dbg_io_ops->init) { @@ -1096,6 +1104,9 @@ int kgdb_register_io_module(struct kgdb_io *new_dbg_io_ops) spin_unlock(&kgdb_registration_lock); + if (old_dbg_io_ops) + return 0; + pr_info("Registered I/O driver %s\n", new_dbg_io_ops->name); /* Arm KGDB now. */ @@ -1132,6 +1143,9 @@ void kgdb_unregister_io_module(struct kgdb_io *old_dbg_io_ops) spin_unlock(&kgdb_registration_lock); + if (old_dbg_io_ops->deinit) + old_dbg_io_ops->deinit(); + pr_info("Unregistered I/O driver %s, debugger disabled\n", old_dbg_io_ops->name); } -- cgit v1.2.3 From a307593a644443db12888f45eed0dafb5869e2cc Mon Sep 17 00:00:00 2001 From: Doug Berger Date: Mon, 18 May 2020 15:23:59 -0700 Subject: net: phy: simplify phy_link_change arguments This function was introduced to allow for different handling of link up and link down events particularly with regard to the netif_carrier. The third argument do_carrier allowed the flag to be left unchanged. Since then the phylink has introduced an implementation that completely ignores the third parameter since it never wants to change the flag and the phylib always sets the third parameter to true so the flag is always changed. Therefore the third argument (i.e. do_carrier) is no longer necessary and can be removed. This also means that the phylib phy_link_down() function no longer needs its second argument. Signed-off-by: Doug Berger Signed-off-by: David S. 
Miller --- drivers/net/phy/phy.c | 12 ++++++------ drivers/net/phy/phy_device.c | 12 +++++------- drivers/net/phy/phylink.c | 3 +-- include/linux/phy.h | 2 +- 4 files changed, 13 insertions(+), 16 deletions(-) (limited to 'include/linux') diff --git a/drivers/net/phy/phy.c b/drivers/net/phy/phy.c index d4bbf79dab6c..d584701187db 100644 --- a/drivers/net/phy/phy.c +++ b/drivers/net/phy/phy.c @@ -58,13 +58,13 @@ static const char *phy_state_to_str(enum phy_state st) static void phy_link_up(struct phy_device *phydev) { - phydev->phy_link_change(phydev, true, true); + phydev->phy_link_change(phydev, true); phy_led_trigger_change_speed(phydev); } -static void phy_link_down(struct phy_device *phydev, bool do_carrier) +static void phy_link_down(struct phy_device *phydev) { - phydev->phy_link_change(phydev, false, do_carrier); + phydev->phy_link_change(phydev, false); phy_led_trigger_change_speed(phydev); } @@ -524,7 +524,7 @@ int phy_start_cable_test(struct phy_device *phydev, goto out; /* Mark the carrier down until the test is complete */ - phy_link_down(phydev, true); + phy_link_down(phydev); netif_testing_on(dev); err = phydev->drv->cable_test_start(phydev); @@ -595,7 +595,7 @@ static int phy_check_link_status(struct phy_device *phydev) phy_link_up(phydev); } else if (!phydev->link && phydev->state != PHY_NOLINK) { phydev->state = PHY_NOLINK; - phy_link_down(phydev, true); + phy_link_down(phydev); } return 0; @@ -999,7 +999,7 @@ void phy_state_machine(struct work_struct *work) case PHY_HALTED: if (phydev->link) { phydev->link = 0; - phy_link_down(phydev, true); + phy_link_down(phydev); } do_suspend = true; break; diff --git a/drivers/net/phy/phy_device.c b/drivers/net/phy/phy_device.c index c3a107cf578e..7481135d27ab 100644 --- a/drivers/net/phy/phy_device.c +++ b/drivers/net/phy/phy_device.c @@ -916,16 +916,14 @@ struct phy_device *phy_find_first(struct mii_bus *bus) } EXPORT_SYMBOL(phy_find_first); -static void phy_link_change(struct phy_device *phydev, bool up, bool do_carrier) +static void phy_link_change(struct phy_device *phydev, bool up) { struct net_device *netdev = phydev->attached_dev; - if (do_carrier) { - if (up) - netif_carrier_on(netdev); - else - netif_carrier_off(netdev); - } + if (up) + netif_carrier_on(netdev); + else + netif_carrier_off(netdev); phydev->adjust_link(netdev); if (phydev->mii_ts && phydev->mii_ts->link_state) phydev->mii_ts->link_state(phydev->mii_ts, phydev); diff --git a/drivers/net/phy/phylink.c b/drivers/net/phy/phylink.c index 0f23bec431c1..b6b1f77bba58 100644 --- a/drivers/net/phy/phylink.c +++ b/drivers/net/phy/phylink.c @@ -803,8 +803,7 @@ void phylink_destroy(struct phylink *pl) } EXPORT_SYMBOL_GPL(phylink_destroy); -static void phylink_phy_change(struct phy_device *phydev, bool up, - bool do_carrier) +static void phylink_phy_change(struct phy_device *phydev, bool up) { struct phylink *pl = phydev->phylink; bool tx_pause, rx_pause; diff --git a/include/linux/phy.h b/include/linux/phy.h index 5d8ff5428010..467aa8bf9f64 100644 --- a/include/linux/phy.h +++ b/include/linux/phy.h @@ -543,7 +543,7 @@ struct phy_device { u8 mdix; u8 mdix_ctrl; - void (*phy_link_change)(struct phy_device *, bool up, bool do_carrier); + void (*phy_link_change)(struct phy_device *phydev, bool up); void (*adjust_link)(struct net_device *dev); #if IS_ENABLED(CONFIG_MACSEC) -- cgit v1.2.3 From ed318a6cc0b620440e65f48eb527dc3df7269ce4 Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Tue, 12 May 2020 16:32:50 -0700 Subject: fscrypt: support test_dummy_encryption=v2 v1 encryption 
policies are deprecated in favor of v2, and some new features (e.g. encryption+casefolding) are only being added for v2. Therefore, the "test_dummy_encryption" mount option (which is used for encryption I/O testing with xfstests) needs to support v2 policies. To do this, extend its syntax to be "test_dummy_encryption=v1" or "test_dummy_encryption=v2". The existing "test_dummy_encryption" (no argument) also continues to be accepted, to specify the default setting -- currently v1, but the next patch changes it to v2. To cleanly support both v1 and v2 while also making it easy to support specifying other encryption settings in the future (say, accepting "$contents_mode:$filenames_mode:v2"), make ext4 and f2fs maintain a pointer to the dummy fscrypt_context rather than using mount flags. To avoid concurrency issues, don't allow test_dummy_encryption to be set or changed during a remount. (The former restriction is new, but xfstests doesn't run into it, so no one should notice.) Tested with 'gce-xfstests -c {ext4,f2fs}/encrypt -g auto'. On ext4, there are two regressions, both of which are test bugs: ext4/023 and ext4/028 fail because they set an xattr and expect it to be stored inline, but the increase in size of the fscrypt_context from 24 to 40 bytes causes this xattr to be spilled into an external block. Link: https://lore.kernel.org/r/20200512233251.118314-4-ebiggers@kernel.org Acked-by: Jaegeuk Kim Reviewed-by: Theodore Ts'o Signed-off-by: Eric Biggers --- Documentation/filesystems/f2fs.rst | 6 +- fs/crypto/keysetup.c | 15 ++--- fs/crypto/policy.c | 125 +++++++++++++++++++++++++++++++++++++ fs/ext4/ext4.h | 7 ++- fs/ext4/super.c | 68 +++++++++++++++----- fs/ext4/sysfs.c | 2 + fs/f2fs/f2fs.h | 4 +- fs/f2fs/super.c | 85 ++++++++++++++++++------- fs/f2fs/sysfs.c | 4 ++ include/linux/fscrypt.h | 51 ++++++++++++--- 10 files changed, 307 insertions(+), 60 deletions(-) (limited to 'include/linux') diff --git a/Documentation/filesystems/f2fs.rst b/Documentation/filesystems/f2fs.rst index 87d794bc75a4..4218ac658629 100644 --- a/Documentation/filesystems/f2fs.rst +++ b/Documentation/filesystems/f2fs.rst @@ -225,8 +225,12 @@ fsync_mode=%s Control the policy of fsync. Currently supports "posix", pass, but the performance will regress. "nobarrier" is based on "posix", but doesn't issue flush command for non-atomic files likewise "nobarrier" mount option. -test_dummy_encryption Enable dummy encryption, which provides a fake fscrypt +test_dummy_encryption +test_dummy_encryption=%s + Enable dummy encryption, which provides a fake fscrypt context. The fake fscrypt context is used by xfstests. + The argument may be either "v1" or "v2", in order to + select the corresponding fscrypt policy version. checkpoint=%s[:%u[%]] Set to "disable" to turn off checkpointing. Set to "enable" to reenable checkpointing. Is enabled by default. 
While disabled, any unmounting or unexpected shutdowns will cause diff --git a/fs/crypto/keysetup.c b/fs/crypto/keysetup.c index edc4590c69f0..675479f8e6f3 100644 --- a/fs/crypto/keysetup.c +++ b/fs/crypto/keysetup.c @@ -395,21 +395,18 @@ int fscrypt_get_encryption_info(struct inode *inode) res = inode->i_sb->s_cop->get_context(inode, &ctx, sizeof(ctx)); if (res < 0) { - if (!fscrypt_dummy_context_enabled(inode) || - IS_ENCRYPTED(inode)) { + const union fscrypt_context *dummy_ctx = + fscrypt_get_dummy_context(inode->i_sb); + + if (IS_ENCRYPTED(inode) || !dummy_ctx) { fscrypt_warn(inode, "Error %d getting encryption context", res); return res; } /* Fake up a context for an unencrypted directory */ - memset(&ctx, 0, sizeof(ctx)); - ctx.version = FSCRYPT_CONTEXT_V1; - ctx.v1.contents_encryption_mode = FSCRYPT_MODE_AES_256_XTS; - ctx.v1.filenames_encryption_mode = FSCRYPT_MODE_AES_256_CTS; - memset(ctx.v1.master_key_descriptor, 0x42, - FSCRYPT_KEY_DESCRIPTOR_SIZE); - res = sizeof(ctx.v1); + res = fscrypt_context_size(dummy_ctx); + memcpy(&ctx, dummy_ctx, res); } crypt_info = kmem_cache_zalloc(fscrypt_info_cachep, GFP_NOFS); diff --git a/fs/crypto/policy.c b/fs/crypto/policy.c index 0a95537fcef9..bcab94b29d10 100644 --- a/fs/crypto/policy.c +++ b/fs/crypto/policy.c @@ -11,6 +11,7 @@ */ #include +#include #include #include #include "fscrypt_private.h" @@ -616,3 +617,127 @@ int fscrypt_inherit_context(struct inode *parent, struct inode *child, return preload ? fscrypt_get_encryption_info(child): 0; } EXPORT_SYMBOL(fscrypt_inherit_context); + +/** + * fscrypt_set_test_dummy_encryption() - handle '-o test_dummy_encryption' + * @sb: the filesystem on which test_dummy_encryption is being specified + * @arg: the argument to the test_dummy_encryption option. + * If no argument was specified, then @arg->from == NULL. + * @dummy_ctx: the filesystem's current dummy context (input/output, see below) + * + * Handle the test_dummy_encryption mount option by creating a dummy encryption + * context, saving it in @dummy_ctx, and adding the corresponding dummy + * encryption key to the filesystem. If the @dummy_ctx is already set, then + * instead validate that it matches @arg. Don't support changing it via + * remount, as that is difficult to do safely. + * + * The reason we use an fscrypt_context rather than an fscrypt_policy is because + * we mustn't generate a new nonce each time we access a dummy-encrypted + * directory, as that would change the way filenames are encrypted. + * + * Return: 0 on success (dummy context set, or the same context is already set); + * -EEXIST if a different dummy context is already set; + * or another -errno value. 
+ */ +int fscrypt_set_test_dummy_encryption(struct super_block *sb, + const substring_t *arg, + struct fscrypt_dummy_context *dummy_ctx) +{ + const char *argstr = "v1"; + const char *argstr_to_free = NULL; + struct fscrypt_key_specifier key_spec = { 0 }; + int version; + union fscrypt_context *ctx = NULL; + int err; + + if (arg->from) { + argstr = argstr_to_free = match_strdup(arg); + if (!argstr) + return -ENOMEM; + } + + if (!strcmp(argstr, "v1")) { + version = FSCRYPT_CONTEXT_V1; + key_spec.type = FSCRYPT_KEY_SPEC_TYPE_DESCRIPTOR; + memset(key_spec.u.descriptor, 0x42, + FSCRYPT_KEY_DESCRIPTOR_SIZE); + } else if (!strcmp(argstr, "v2")) { + version = FSCRYPT_CONTEXT_V2; + key_spec.type = FSCRYPT_KEY_SPEC_TYPE_IDENTIFIER; + /* key_spec.u.identifier gets filled in when adding the key */ + } else { + err = -EINVAL; + goto out; + } + + if (dummy_ctx->ctx) { + /* + * Note: if we ever make test_dummy_encryption support + * specifying other encryption settings, such as the encryption + * modes, we'll need to compare those settings here. + */ + if (dummy_ctx->ctx->version == version) + err = 0; + else + err = -EEXIST; + goto out; + } + + ctx = kzalloc(sizeof(*ctx), GFP_KERNEL); + if (!ctx) { + err = -ENOMEM; + goto out; + } + + err = fscrypt_add_test_dummy_key(sb, &key_spec); + if (err) + goto out; + + ctx->version = version; + switch (ctx->version) { + case FSCRYPT_CONTEXT_V1: + ctx->v1.contents_encryption_mode = FSCRYPT_MODE_AES_256_XTS; + ctx->v1.filenames_encryption_mode = FSCRYPT_MODE_AES_256_CTS; + memcpy(ctx->v1.master_key_descriptor, key_spec.u.descriptor, + FSCRYPT_KEY_DESCRIPTOR_SIZE); + break; + case FSCRYPT_CONTEXT_V2: + ctx->v2.contents_encryption_mode = FSCRYPT_MODE_AES_256_XTS; + ctx->v2.filenames_encryption_mode = FSCRYPT_MODE_AES_256_CTS; + memcpy(ctx->v2.master_key_identifier, key_spec.u.identifier, + FSCRYPT_KEY_IDENTIFIER_SIZE); + break; + default: + WARN_ON(1); + err = -EINVAL; + goto out; + } + dummy_ctx->ctx = ctx; + ctx = NULL; + err = 0; +out: + kfree(ctx); + kfree(argstr_to_free); + return err; +} +EXPORT_SYMBOL_GPL(fscrypt_set_test_dummy_encryption); + +/** + * fscrypt_show_test_dummy_encryption() - show '-o test_dummy_encryption' + * @seq: the seq_file to print the option to + * @sep: the separator character to use + * @sb: the filesystem whose options are being shown + * + * Show the test_dummy_encryption mount option, if it was specified. + * This is mainly used for /proc/mounts. 
+ */ +void fscrypt_show_test_dummy_encryption(struct seq_file *seq, char sep, + struct super_block *sb) +{ + const union fscrypt_context *ctx = fscrypt_get_dummy_context(sb); + + if (!ctx) + return; + seq_printf(seq, "%ctest_dummy_encryption=v%d", sep, ctx->version); +} +EXPORT_SYMBOL_GPL(fscrypt_show_test_dummy_encryption); diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index 91eb4381cae5..546504cba842 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -1357,11 +1357,9 @@ struct ext4_super_block { */ #define EXT4_MF_MNTDIR_SAMPLED 0x0001 #define EXT4_MF_FS_ABORTED 0x0002 /* Fatal error detected */ -#define EXT4_MF_TEST_DUMMY_ENCRYPTION 0x0004 #ifdef CONFIG_FS_ENCRYPTION -#define DUMMY_ENCRYPTION_ENABLED(sbi) (unlikely((sbi)->s_mount_flags & \ - EXT4_MF_TEST_DUMMY_ENCRYPTION)) +#define DUMMY_ENCRYPTION_ENABLED(sbi) ((sbi)->s_dummy_enc_ctx.ctx != NULL) #else #define DUMMY_ENCRYPTION_ENABLED(sbi) (0) #endif @@ -1551,6 +1549,9 @@ struct ext4_sb_info { struct ratelimit_state s_warning_ratelimit_state; struct ratelimit_state s_msg_ratelimit_state; + /* Encryption context for '-o test_dummy_encryption' */ + struct fscrypt_dummy_context s_dummy_enc_ctx; + /* * Barrier between writepages ops and changing any inode's JOURNAL_DATA * or EXTENTS flag. diff --git a/fs/ext4/super.c b/fs/ext4/super.c index bf5fcb477f66..4a3d21972011 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -1106,6 +1106,7 @@ static void ext4_put_super(struct super_block *sb) crypto_free_shash(sbi->s_chksum_driver); kfree(sbi->s_blockgroup_lock); fs_put_dax(sbi->s_daxdev); + fscrypt_free_dummy_context(&sbi->s_dummy_enc_ctx); #ifdef CONFIG_UNICODE utf8_unload(sbi->s_encoding); #endif @@ -1389,9 +1390,10 @@ retry: return res; } -static bool ext4_dummy_context(struct inode *inode) +static const union fscrypt_context * +ext4_get_dummy_context(struct super_block *sb) { - return DUMMY_ENCRYPTION_ENABLED(EXT4_SB(inode->i_sb)); + return EXT4_SB(sb)->s_dummy_enc_ctx.ctx; } static bool ext4_has_stable_inodes(struct super_block *sb) @@ -1410,7 +1412,7 @@ static const struct fscrypt_operations ext4_cryptops = { .key_prefix = "ext4:", .get_context = ext4_get_context, .set_context = ext4_set_context, - .dummy_context = ext4_dummy_context, + .get_dummy_context = ext4_get_dummy_context, .empty_dir = ext4_empty_dir, .max_namelen = EXT4_NAME_LEN, .has_stable_inodes = ext4_has_stable_inodes, @@ -1605,6 +1607,7 @@ static const match_table_t tokens = { {Opt_init_itable, "init_itable"}, {Opt_noinit_itable, "noinit_itable"}, {Opt_max_dir_size_kb, "max_dir_size_kb=%u"}, + {Opt_test_dummy_encryption, "test_dummy_encryption=%s"}, {Opt_test_dummy_encryption, "test_dummy_encryption"}, {Opt_nombcache, "nombcache"}, {Opt_nombcache, "no_mbcache"}, /* for backward compatibility */ @@ -1816,7 +1819,7 @@ static const struct mount_opts { {Opt_jqfmt_vfsv0, QFMT_VFS_V0, MOPT_QFMT}, {Opt_jqfmt_vfsv1, QFMT_VFS_V1, MOPT_QFMT}, {Opt_max_dir_size_kb, 0, MOPT_GTE0}, - {Opt_test_dummy_encryption, 0, MOPT_GTE0}, + {Opt_test_dummy_encryption, 0, MOPT_STRING}, {Opt_nombcache, EXT4_MOUNT_NO_MBCACHE, MOPT_SET}, {Opt_err, 0, 0} }; @@ -1851,6 +1854,48 @@ static int ext4_sb_read_encoding(const struct ext4_super_block *es, } #endif +static int ext4_set_test_dummy_encryption(struct super_block *sb, + const char *opt, + const substring_t *arg, + bool is_remount) +{ +#ifdef CONFIG_FS_ENCRYPTION + struct ext4_sb_info *sbi = EXT4_SB(sb); + int err; + + /* + * This mount option is just for testing, and it's not worthwhile to + * implement the extra complexity (e.g. 
RCU protection) that would be + * needed to allow it to be set or changed during remount. We do allow + * it to be specified during remount, but only if there is no change. + */ + if (is_remount && !sbi->s_dummy_enc_ctx.ctx) { + ext4_msg(sb, KERN_WARNING, + "Can't set test_dummy_encryption on remount"); + return -1; + } + err = fscrypt_set_test_dummy_encryption(sb, arg, &sbi->s_dummy_enc_ctx); + if (err) { + if (err == -EEXIST) + ext4_msg(sb, KERN_WARNING, + "Can't change test_dummy_encryption on remount"); + else if (err == -EINVAL) + ext4_msg(sb, KERN_WARNING, + "Value of option \"%s\" is unrecognized", opt); + else + ext4_msg(sb, KERN_WARNING, + "Error processing option \"%s\" [%d]", + opt, err); + return -1; + } + ext4_msg(sb, KERN_WARNING, "Test dummy encryption mode enabled"); +#else + ext4_msg(sb, KERN_WARNING, + "Test dummy encryption mount option ignored"); +#endif + return 1; +} + static int handle_mount_opt(struct super_block *sb, char *opt, int token, substring_t *args, unsigned long *journal_devnum, unsigned int *journal_ioprio, int is_remount) @@ -2047,14 +2092,8 @@ static int handle_mount_opt(struct super_block *sb, char *opt, int token, *journal_ioprio = IOPRIO_PRIO_VALUE(IOPRIO_CLASS_BE, arg); } else if (token == Opt_test_dummy_encryption) { -#ifdef CONFIG_FS_ENCRYPTION - sbi->s_mount_flags |= EXT4_MF_TEST_DUMMY_ENCRYPTION; - ext4_msg(sb, KERN_WARNING, - "Test dummy encryption mode enabled"); -#else - ext4_msg(sb, KERN_WARNING, - "Test dummy encryption mount option ignored"); -#endif + return ext4_set_test_dummy_encryption(sb, opt, &args[0], + is_remount); } else if (m->flags & MOPT_DATAJ) { if (is_remount) { if (!sbi->s_journal) @@ -2311,8 +2350,8 @@ static int _ext4_show_options(struct seq_file *seq, struct super_block *sb, SEQ_OPTS_PRINT("max_dir_size_kb=%u", sbi->s_max_dir_size_kb); if (test_opt(sb, DATA_ERR_ABORT)) SEQ_OPTS_PUTS("data_err=abort"); - if (DUMMY_ENCRYPTION_ENABLED(sbi)) - SEQ_OPTS_PUTS("test_dummy_encryption"); + + fscrypt_show_test_dummy_encryption(seq, sep, sb); ext4_show_quota_options(seq, sb); return 0; @@ -4780,6 +4819,7 @@ failed_mount: for (i = 0; i < EXT4_MAXQUOTAS; i++) kfree(get_qf_name(sb, sbi, i)); #endif + fscrypt_free_dummy_context(&sbi->s_dummy_enc_ctx); ext4_blkdev_remove(sbi); brelse(bh); out_fail: diff --git a/fs/ext4/sysfs.c b/fs/ext4/sysfs.c index 04bfaf63752c..6c9fc9e21c13 100644 --- a/fs/ext4/sysfs.c +++ b/fs/ext4/sysfs.c @@ -293,6 +293,7 @@ EXT4_ATTR_FEATURE(batched_discard); EXT4_ATTR_FEATURE(meta_bg_resize); #ifdef CONFIG_FS_ENCRYPTION EXT4_ATTR_FEATURE(encryption); +EXT4_ATTR_FEATURE(test_dummy_encryption_v2); #endif #ifdef CONFIG_UNICODE EXT4_ATTR_FEATURE(casefold); @@ -308,6 +309,7 @@ static struct attribute *ext4_feat_attrs[] = { ATTR_LIST(meta_bg_resize), #ifdef CONFIG_FS_ENCRYPTION ATTR_LIST(encryption), + ATTR_LIST(test_dummy_encryption_v2), #endif #ifdef CONFIG_UNICODE ATTR_LIST(casefold), diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index ba470d5687fe..157eec348970 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -138,7 +138,7 @@ struct f2fs_mount_info { int fsync_mode; /* fsync policy */ int fs_mode; /* fs mode: LFS or ADAPTIVE */ int bggc_mode; /* bggc mode: off, on or sync */ - bool test_dummy_encryption; /* test dummy encryption */ + struct fscrypt_dummy_context dummy_enc_ctx; /* test dummy encryption */ block_t unusable_cap; /* Amount of space allowed to be * unusable when disabling checkpoint */ @@ -1259,7 +1259,7 @@ enum fsync_mode { #ifdef CONFIG_FS_ENCRYPTION #define DUMMY_ENCRYPTION_ENABLED(sbi) \ - 
(unlikely(F2FS_OPTION(sbi).test_dummy_encryption)) + (unlikely(F2FS_OPTION(sbi).dummy_enc_ctx.ctx != NULL)) #else #define DUMMY_ENCRYPTION_ENABLED(sbi) (0) #endif diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index f2dfc21c6abb..8a9955902d84 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -202,6 +202,7 @@ static match_table_t f2fs_tokens = { {Opt_whint, "whint_mode=%s"}, {Opt_alloc, "alloc_mode=%s"}, {Opt_fsync, "fsync_mode=%s"}, + {Opt_test_dummy_encryption, "test_dummy_encryption=%s"}, {Opt_test_dummy_encryption, "test_dummy_encryption"}, {Opt_checkpoint_disable, "checkpoint=disable"}, {Opt_checkpoint_disable_cap, "checkpoint=disable:%u"}, @@ -394,7 +395,52 @@ static int f2fs_check_quota_options(struct f2fs_sb_info *sbi) } #endif -static int parse_options(struct super_block *sb, char *options) +static int f2fs_set_test_dummy_encryption(struct super_block *sb, + const char *opt, + const substring_t *arg, + bool is_remount) +{ + struct f2fs_sb_info *sbi = F2FS_SB(sb); +#ifdef CONFIG_FS_ENCRYPTION + int err; + + if (!f2fs_sb_has_encrypt(sbi)) { + f2fs_err(sbi, "Encrypt feature is off"); + return -EINVAL; + } + + /* + * This mount option is just for testing, and it's not worthwhile to + * implement the extra complexity (e.g. RCU protection) that would be + * needed to allow it to be set or changed during remount. We do allow + * it to be specified during remount, but only if there is no change. + */ + if (is_remount && !F2FS_OPTION(sbi).dummy_enc_ctx.ctx) { + f2fs_warn(sbi, "Can't set test_dummy_encryption on remount"); + return -EINVAL; + } + err = fscrypt_set_test_dummy_encryption( + sb, arg, &F2FS_OPTION(sbi).dummy_enc_ctx); + if (err) { + if (err == -EEXIST) + f2fs_warn(sbi, + "Can't change test_dummy_encryption on remount"); + else if (err == -EINVAL) + f2fs_warn(sbi, "Value of option \"%s\" is unrecognized", + opt); + else + f2fs_warn(sbi, "Error processing option \"%s\" [%d]", + opt, err); + return -EINVAL; + } + f2fs_warn(sbi, "Test dummy encryption mode enabled"); +#else + f2fs_warn(sbi, "Test dummy encryption mount option ignored"); +#endif + return 0; +} + +static int parse_options(struct super_block *sb, char *options, bool is_remount) { struct f2fs_sb_info *sbi = F2FS_SB(sb); substring_t args[MAX_OPT_ARGS]; @@ -403,9 +449,7 @@ static int parse_options(struct super_block *sb, char *options) int arg = 0, ext_cnt; kuid_t uid; kgid_t gid; -#ifdef CONFIG_QUOTA int ret; -#endif if (!options) return 0; @@ -778,17 +822,10 @@ static int parse_options(struct super_block *sb, char *options) kvfree(name); break; case Opt_test_dummy_encryption: -#ifdef CONFIG_FS_ENCRYPTION - if (!f2fs_sb_has_encrypt(sbi)) { - f2fs_err(sbi, "Encrypt feature is off"); - return -EINVAL; - } - - F2FS_OPTION(sbi).test_dummy_encryption = true; - f2fs_info(sbi, "Test dummy encryption mode enabled"); -#else - f2fs_info(sbi, "Test dummy encryption mount option ignored"); -#endif + ret = f2fs_set_test_dummy_encryption(sb, p, &args[0], + is_remount); + if (ret) + return ret; break; case Opt_checkpoint_disable_cap_perc: if (args->from && match_int(args, &arg)) @@ -1213,6 +1250,7 @@ static void f2fs_put_super(struct super_block *sb) for (i = 0; i < MAXQUOTAS; i++) kvfree(F2FS_OPTION(sbi).s_qf_names[i]); #endif + fscrypt_free_dummy_context(&F2FS_OPTION(sbi).dummy_enc_ctx); destroy_percpu_info(sbi); for (i = 0; i < NR_PAGE_TYPE; i++) kvfree(sbi->write_io[i]); @@ -1543,10 +1581,8 @@ static int f2fs_show_options(struct seq_file *seq, struct dentry *root) seq_printf(seq, ",whint_mode=%s", "user-based"); else if 
(F2FS_OPTION(sbi).whint_mode == WHINT_MODE_FS) seq_printf(seq, ",whint_mode=%s", "fs-based"); -#ifdef CONFIG_FS_ENCRYPTION - if (F2FS_OPTION(sbi).test_dummy_encryption) - seq_puts(seq, ",test_dummy_encryption"); -#endif + + fscrypt_show_test_dummy_encryption(seq, ',', sbi->sb); if (F2FS_OPTION(sbi).alloc_mode == ALLOC_MODE_DEFAULT) seq_printf(seq, ",alloc_mode=%s", "default"); @@ -1575,7 +1611,6 @@ static void default_options(struct f2fs_sb_info *sbi) F2FS_OPTION(sbi).whint_mode = WHINT_MODE_OFF; F2FS_OPTION(sbi).alloc_mode = ALLOC_MODE_DEFAULT; F2FS_OPTION(sbi).fsync_mode = FSYNC_MODE_POSIX; - F2FS_OPTION(sbi).test_dummy_encryption = false; F2FS_OPTION(sbi).s_resuid = make_kuid(&init_user_ns, F2FS_DEF_RESUID); F2FS_OPTION(sbi).s_resgid = make_kgid(&init_user_ns, F2FS_DEF_RESGID); F2FS_OPTION(sbi).compress_algorithm = COMPRESS_LZ4; @@ -1734,7 +1769,7 @@ static int f2fs_remount(struct super_block *sb, int *flags, char *data) default_options(sbi); /* parse mount options */ - err = parse_options(sb, data); + err = parse_options(sb, data, true); if (err) goto restore_opts; checkpoint_changed = @@ -2410,9 +2445,10 @@ static int f2fs_set_context(struct inode *inode, const void *ctx, size_t len, ctx, len, fs_data, XATTR_CREATE); } -static bool f2fs_dummy_context(struct inode *inode) +static const union fscrypt_context * +f2fs_get_dummy_context(struct super_block *sb) { - return DUMMY_ENCRYPTION_ENABLED(F2FS_I_SB(inode)); + return F2FS_OPTION(F2FS_SB(sb)).dummy_enc_ctx.ctx; } static bool f2fs_has_stable_inodes(struct super_block *sb) @@ -2431,7 +2467,7 @@ static const struct fscrypt_operations f2fs_cryptops = { .key_prefix = "f2fs:", .get_context = f2fs_get_context, .set_context = f2fs_set_context, - .dummy_context = f2fs_dummy_context, + .get_dummy_context = f2fs_get_dummy_context, .empty_dir = f2fs_empty_dir, .max_namelen = F2FS_NAME_LEN, .has_stable_inodes = f2fs_has_stable_inodes, @@ -3366,7 +3402,7 @@ try_onemore: goto free_sb_buf; } - err = parse_options(sb, options); + err = parse_options(sb, options, false); if (err) goto free_options; @@ -3769,6 +3805,7 @@ free_options: for (i = 0; i < MAXQUOTAS; i++) kvfree(F2FS_OPTION(sbi).s_qf_names[i]); #endif + fscrypt_free_dummy_context(&F2FS_OPTION(sbi).dummy_enc_ctx); kvfree(options); free_sb_buf: kvfree(raw_super); diff --git a/fs/f2fs/sysfs.c b/fs/f2fs/sysfs.c index e3bbbef9b4f0..3162f46b3c9b 100644 --- a/fs/f2fs/sysfs.c +++ b/fs/f2fs/sysfs.c @@ -446,6 +446,7 @@ enum feat_id { FEAT_SB_CHECKSUM, FEAT_CASEFOLD, FEAT_COMPRESSION, + FEAT_TEST_DUMMY_ENCRYPTION_V2, }; static ssize_t f2fs_feature_show(struct f2fs_attr *a, @@ -466,6 +467,7 @@ static ssize_t f2fs_feature_show(struct f2fs_attr *a, case FEAT_SB_CHECKSUM: case FEAT_CASEFOLD: case FEAT_COMPRESSION: + case FEAT_TEST_DUMMY_ENCRYPTION_V2: return sprintf(buf, "supported\n"); } return 0; @@ -563,6 +565,7 @@ F2FS_GENERAL_RO_ATTR(avg_vblocks); #ifdef CONFIG_FS_ENCRYPTION F2FS_FEATURE_RO_ATTR(encryption, FEAT_CRYPTO); +F2FS_FEATURE_RO_ATTR(test_dummy_encryption_v2, FEAT_TEST_DUMMY_ENCRYPTION_V2); #endif #ifdef CONFIG_BLK_DEV_ZONED F2FS_FEATURE_RO_ATTR(block_zoned, FEAT_BLKZONED); @@ -647,6 +650,7 @@ ATTRIBUTE_GROUPS(f2fs); static struct attribute *f2fs_feat_attrs[] = { #ifdef CONFIG_FS_ENCRYPTION ATTR_LIST(encryption), + ATTR_LIST(test_dummy_encryption_v2), #endif #ifdef CONFIG_BLK_DEV_ZONED ATTR_LIST(block_zoned), diff --git a/include/linux/fscrypt.h b/include/linux/fscrypt.h index 0e0c7ad19383..2862ca5fea33 100644 --- a/include/linux/fscrypt.h +++ b/include/linux/fscrypt.h @@ -15,12 +15,15 @@ 
#include #include +#include #include #include #define FS_CRYPTO_BLOCK_SIZE 16 +union fscrypt_context; struct fscrypt_info; +struct seq_file; struct fscrypt_str { unsigned char *name; @@ -59,7 +62,8 @@ struct fscrypt_operations { int (*get_context)(struct inode *inode, void *ctx, size_t len); int (*set_context)(struct inode *inode, const void *ctx, size_t len, void *fs_data); - bool (*dummy_context)(struct inode *inode); + const union fscrypt_context *(*get_dummy_context)( + struct super_block *sb); bool (*empty_dir)(struct inode *inode); unsigned int max_namelen; bool (*has_stable_inodes)(struct super_block *sb); @@ -89,10 +93,12 @@ static inline bool fscrypt_needs_contents_encryption(const struct inode *inode) return IS_ENCRYPTED(inode) && S_ISREG(inode->i_mode); } -static inline bool fscrypt_dummy_context_enabled(struct inode *inode) +static inline const union fscrypt_context * +fscrypt_get_dummy_context(struct super_block *sb) { - return inode->i_sb->s_cop->dummy_context && - inode->i_sb->s_cop->dummy_context(inode); + if (!sb->s_cop->get_dummy_context) + return NULL; + return sb->s_cop->get_dummy_context(sb); } /* @@ -145,6 +151,22 @@ int fscrypt_has_permitted_context(struct inode *parent, struct inode *child); int fscrypt_inherit_context(struct inode *parent, struct inode *child, void *fs_data, bool preload); +struct fscrypt_dummy_context { + const union fscrypt_context *ctx; +}; + +int fscrypt_set_test_dummy_encryption(struct super_block *sb, + const substring_t *arg, + struct fscrypt_dummy_context *dummy_ctx); +void fscrypt_show_test_dummy_encryption(struct seq_file *seq, char sep, + struct super_block *sb); +static inline void +fscrypt_free_dummy_context(struct fscrypt_dummy_context *dummy_ctx) +{ + kfree(dummy_ctx->ctx); + dummy_ctx->ctx = NULL; +} + /* keyring.c */ void fscrypt_sb_free(struct super_block *sb); int fscrypt_ioctl_add_key(struct file *filp, void __user *arg); @@ -219,9 +241,10 @@ static inline bool fscrypt_needs_contents_encryption(const struct inode *inode) return false; } -static inline bool fscrypt_dummy_context_enabled(struct inode *inode) +static inline const union fscrypt_context * +fscrypt_get_dummy_context(struct super_block *sb) { - return false; + return NULL; } static inline void fscrypt_handle_d_move(struct dentry *dentry) @@ -316,6 +339,20 @@ static inline int fscrypt_inherit_context(struct inode *parent, return -EOPNOTSUPP; } +struct fscrypt_dummy_context { +}; + +static inline void fscrypt_show_test_dummy_encryption(struct seq_file *seq, + char sep, + struct super_block *sb) +{ +} + +static inline void +fscrypt_free_dummy_context(struct fscrypt_dummy_context *dummy_ctx) +{ +} + /* keyring.c */ static inline void fscrypt_sb_free(struct super_block *sb) { @@ -677,7 +714,7 @@ static inline int fscrypt_prepare_symlink(struct inode *dir, unsigned int max_len, struct fscrypt_str *disk_link) { - if (IS_ENCRYPTED(dir) || fscrypt_dummy_context_enabled(dir)) + if (IS_ENCRYPTED(dir) || fscrypt_get_dummy_context(dir->i_sb) != NULL) return __fscrypt_prepare_symlink(dir, len, max_len, disk_link); disk_link->name = (unsigned char *)target; -- cgit v1.2.3 From 5cab3ff2489ede5abffa6ad730708e087dc45a4d Mon Sep 17 00:00:00 2001 From: Pierre-Louis Bossart Date: Tue, 19 May 2020 01:43:18 +0800 Subject: soundwire: bus: rename sdw_bus_master_add/delete, add arguments In preparation for future extensions, rename functions to use sdw_bus_master prefix and add a parent and fwnode argument to sdw_bus_master_add to help with device registration in follow-up patches. 
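For illustration, a controller probe using the renamed API would look roughly like the following. This is a minimal sketch, not part of this patch: the my_ctrl structure and my_probe function are illustrative names, and a real driver would still have to populate bus->ops and the bus properties before registration.

#include <linux/platform_device.h>
#include <linux/soundwire/sdw.h>

/* Illustrative controller wrapper; only the embedded bus matters here. */
struct my_ctrl {
        struct sdw_bus bus;
};

static int my_probe(struct platform_device *pdev)
{
        struct my_ctrl *ctrl;
        int ret;

        ctrl = devm_kzalloc(&pdev->dev, sizeof(*ctrl), GFP_KERNEL);
        if (!ctrl)
                return -ENOMEM;

        /* bus->ops and properties would be initialized here */

        /* the parent device and fwnode now accompany the bus instance */
        ret = sdw_bus_master_add(&ctrl->bus, &pdev->dev, pdev->dev.fwnode);
        if (ret)
                dev_err(&pdev->dev, "sdw_bus_master_add failed: %d\n", ret);

        return ret;
}

Teardown stays symmetric: the remove path calls sdw_bus_master_delete(&ctrl->bus).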
No functionality change, just renames and additional arguments. The Intel code is currently unused, the two additional arguments are only needed for compilation. Signed-off-by: Pierre-Louis Bossart Signed-off-by: Bard Liao Acked-by: Jaroslav Kysela Link: https://lore.kernel.org/r/20200518174322.31561-2-yung-chuan.liao@linux.intel.com Signed-off-by: Vinod Koul --- Documentation/driver-api/soundwire/summary.rst | 7 ++++--- drivers/soundwire/bus.c | 15 +++++++++------ drivers/soundwire/intel.c | 8 ++++---- drivers/soundwire/qcom.c | 6 +++--- include/linux/soundwire/sdw.h | 5 +++-- 5 files changed, 23 insertions(+), 18 deletions(-) (limited to 'include/linux') diff --git a/Documentation/driver-api/soundwire/summary.rst b/Documentation/driver-api/soundwire/summary.rst index 8193125a2bfb..01dcb954f6d7 100644 --- a/Documentation/driver-api/soundwire/summary.rst +++ b/Documentation/driver-api/soundwire/summary.rst @@ -101,10 +101,11 @@ Following is the Bus API to register the SoundWire Bus: .. code-block:: c - int sdw_add_bus_master(struct sdw_bus *bus) + int sdw_bus_master_add(struct sdw_bus *bus, + struct device *parent, + struct fwnode_handle) { - if (!bus->dev) - return -ENODEV; + sdw_master_device_add(bus, parent, fwnode); mutex_init(&bus->lock); INIT_LIST_HEAD(&bus->slaves); diff --git a/drivers/soundwire/bus.c b/drivers/soundwire/bus.c index 4d8183cc323d..a8072791d6b8 100644 --- a/drivers/soundwire/bus.c +++ b/drivers/soundwire/bus.c @@ -10,13 +10,16 @@ #include "bus.h" /** - * sdw_add_bus_master() - add a bus Master instance + * sdw_bus_master_add() - add a bus Master instance * @bus: bus instance + * @parent: parent device + * @fwnode: firmware node handle * * Initializes the bus instance, read properties and create child * devices. */ -int sdw_add_bus_master(struct sdw_bus *bus) +int sdw_bus_master_add(struct sdw_bus *bus, struct device *parent, + struct fwnode_handle *fwnode) { struct sdw_master_prop *prop = NULL; int ret; @@ -107,7 +110,7 @@ int sdw_add_bus_master(struct sdw_bus *bus) return 0; } -EXPORT_SYMBOL(sdw_add_bus_master); +EXPORT_SYMBOL(sdw_bus_master_add); static int sdw_delete_slave(struct device *dev, void *data) { @@ -131,18 +134,18 @@ static int sdw_delete_slave(struct device *dev, void *data) } /** - * sdw_delete_bus_master() - delete the bus master instance + * sdw_bus_master_delete() - delete the bus master instance * @bus: bus to be deleted * * Remove the instance, delete the child devices. 
*/ -void sdw_delete_bus_master(struct sdw_bus *bus) +void sdw_bus_master_delete(struct sdw_bus *bus) { device_for_each_child(bus->dev, NULL, sdw_delete_slave); sdw_bus_debugfs_exit(bus); } -EXPORT_SYMBOL(sdw_delete_bus_master); +EXPORT_SYMBOL(sdw_bus_master_delete); /* * SDW IO Calls diff --git a/drivers/soundwire/intel.c b/drivers/soundwire/intel.c index ed8d576bf5dc..430f8addef83 100644 --- a/drivers/soundwire/intel.c +++ b/drivers/soundwire/intel.c @@ -1110,9 +1110,9 @@ static int intel_probe(struct platform_device *pdev) platform_set_drvdata(pdev, sdw); - ret = sdw_add_bus_master(&sdw->cdns.bus); + ret = sdw_bus_master_add(&sdw->cdns.bus, &pdev->dev, pdev->dev.fwnode); if (ret) { - dev_err(&pdev->dev, "sdw_add_bus_master fail: %d\n", ret); + dev_err(&pdev->dev, "sdw_bus_master_add fail: %d\n", ret); return ret; } @@ -1173,7 +1173,7 @@ err_interrupt: sdw_cdns_enable_interrupt(&sdw->cdns, false); free_irq(sdw->link_res->irq, sdw); err_init: - sdw_delete_bus_master(&sdw->cdns.bus); + sdw_bus_master_delete(&sdw->cdns.bus); return ret; } @@ -1189,7 +1189,7 @@ static int intel_remove(struct platform_device *pdev) free_irq(sdw->link_res->irq, sdw); snd_soc_unregister_component(sdw->cdns.dev); } - sdw_delete_bus_master(&sdw->cdns.bus); + sdw_bus_master_delete(&sdw->cdns.bus); return 0; } diff --git a/drivers/soundwire/qcom.c b/drivers/soundwire/qcom.c index a2a35f780fc3..ea03bb988f81 100644 --- a/drivers/soundwire/qcom.c +++ b/drivers/soundwire/qcom.c @@ -822,7 +822,7 @@ static int qcom_swrm_probe(struct platform_device *pdev) goto err_clk; } - ret = sdw_add_bus_master(&ctrl->bus); + ret = sdw_bus_master_add(&ctrl->bus, dev, dev->fwnode); if (ret) { dev_err(dev, "Failed to register Soundwire controller (%d)\n", ret); @@ -841,7 +841,7 @@ static int qcom_swrm_probe(struct platform_device *pdev) return 0; err_master_add: - sdw_delete_bus_master(&ctrl->bus); + sdw_bus_master_delete(&ctrl->bus); err_clk: clk_disable_unprepare(ctrl->hclk); err_init: @@ -852,7 +852,7 @@ static int qcom_swrm_remove(struct platform_device *pdev) { struct qcom_swrm_ctrl *ctrl = dev_get_drvdata(&pdev->dev); - sdw_delete_bus_master(&ctrl->bus); + sdw_bus_master_delete(&ctrl->bus); clk_disable_unprepare(ctrl->hclk); return 0; diff --git a/include/linux/soundwire/sdw.h b/include/linux/soundwire/sdw.h index 00f5826092e3..2003e8c55538 100644 --- a/include/linux/soundwire/sdw.h +++ b/include/linux/soundwire/sdw.h @@ -832,8 +832,9 @@ struct sdw_bus { bool multi_link; }; -int sdw_add_bus_master(struct sdw_bus *bus); -void sdw_delete_bus_master(struct sdw_bus *bus); +int sdw_bus_master_add(struct sdw_bus *bus, struct device *parent, + struct fwnode_handle *fwnode); +void sdw_bus_master_delete(struct sdw_bus *bus); /** * sdw_port_config: Master or Slave Port configuration -- cgit v1.2.3 From 90acca1d54ad566b4af5f1030b4a4a2420ce2ef0 Mon Sep 17 00:00:00 2001 From: Pierre-Louis Bossart Date: Tue, 19 May 2020 01:43:19 +0800 Subject: soundwire: bus_type: introduce sdw_slave_type and sdw_master_type This is a preparatory patch before the introduction of the sdw_master_type. The SoundWire slave support is slightly modified with the use of a sdw_slave_type, and the uevent handling moves to slave.c (since it's not necessary for the master). No functionality change other than moving code around.
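A hedged sketch of why the explicit device_type matters: once more than one device type can sit on sdw_bus_type, any code that casts with dev_to_sdw_dev() must first check the type, exactly as sdw_bus_match() now does. The counting helper below is illustrative, not part of this patch:

#include <linux/device.h>
#include <linux/soundwire/sdw_type.h>

/* Illustrative walker: count only Slave devices on the SoundWire bus,
 * skipping any other device type that may share sdw_bus_type.
 */
static int my_count_slaves(struct device *dev, void *data)
{
        int *count = data;

        if (is_sdw_slave(dev))
                (*count)++;
        return 0;
}

/* usage: bus_for_each_dev(&sdw_bus_type, NULL, &count, my_count_slaves); */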
Signed-off-by: Pierre-Louis Bossart Signed-off-by: Bard Liao Acked-by: Jaroslav Kysela Link: https://lore.kernel.org/r/20200518174322.31561-3-yung-chuan.liao@linux.intel.com Signed-off-by: Vinod Koul --- drivers/soundwire/bus_type.c | 19 +++++++++++++------ drivers/soundwire/slave.c | 8 +++++++- include/linux/soundwire/sdw_type.h | 9 ++++++++- 3 files changed, 28 insertions(+), 8 deletions(-) (limited to 'include/linux') diff --git a/drivers/soundwire/bus_type.c b/drivers/soundwire/bus_type.c index 17f096dd6806..2c1a19caba51 100644 --- a/drivers/soundwire/bus_type.c +++ b/drivers/soundwire/bus_type.c @@ -33,13 +33,21 @@ sdw_get_device_id(struct sdw_slave *slave, struct sdw_driver *drv) static int sdw_bus_match(struct device *dev, struct device_driver *ddrv) { - struct sdw_slave *slave = dev_to_sdw_dev(dev); - struct sdw_driver *drv = drv_to_sdw_driver(ddrv); + struct sdw_slave *slave; + struct sdw_driver *drv; + int ret = 0; + + if (is_sdw_slave(dev)) { + slave = dev_to_sdw_dev(dev); + drv = drv_to_sdw_driver(ddrv); - return !!sdw_get_device_id(slave, drv); + ret = !!sdw_get_device_id(slave, drv); + } + return ret; } -int sdw_slave_modalias(const struct sdw_slave *slave, char *buf, size_t size) +static int sdw_slave_modalias(const struct sdw_slave *slave, char *buf, + size_t size) { /* modalias is sdw:mp */ @@ -47,7 +55,7 @@ int sdw_slave_modalias(const struct sdw_slave *slave, char *buf, size_t size) slave->id.mfg_id, slave->id.part_id); } -static int sdw_uevent(struct device *dev, struct kobj_uevent_env *env) +int sdw_slave_uevent(struct device *dev, struct kobj_uevent_env *env) { struct sdw_slave *slave = dev_to_sdw_dev(dev); char modalias[32]; @@ -63,7 +71,6 @@ static int sdw_uevent(struct device *dev, struct kobj_uevent_env *env) struct bus_type sdw_bus_type = { .name = "soundwire", .match = sdw_bus_match, - .uevent = sdw_uevent, }; EXPORT_SYMBOL_GPL(sdw_bus_type); diff --git a/drivers/soundwire/slave.c b/drivers/soundwire/slave.c index 4bacdb187eab..0839445ee07b 100644 --- a/drivers/soundwire/slave.c +++ b/drivers/soundwire/slave.c @@ -14,6 +14,12 @@ static void sdw_slave_release(struct device *dev) kfree(slave); } +struct device_type sdw_slave_type = { + .name = "sdw_slave", + .release = sdw_slave_release, + .uevent = sdw_slave_uevent, +}; + static int sdw_slave_add(struct sdw_bus *bus, struct sdw_slave_id *id, struct fwnode_handle *fwnode) { @@ -41,9 +47,9 @@ static int sdw_slave_add(struct sdw_bus *bus, id->class_id, id->unique_id); } - slave->dev.release = sdw_slave_release; slave->dev.bus = &sdw_bus_type; slave->dev.of_node = of_node_get(to_of_node(fwnode)); + slave->dev.type = &sdw_slave_type; slave->bus = bus; slave->status = SDW_SLAVE_UNATTACHED; init_completion(&slave->enumeration_complete); diff --git a/include/linux/soundwire/sdw_type.h b/include/linux/soundwire/sdw_type.h index aaa7f4267c14..52eb66cd11bc 100644 --- a/include/linux/soundwire/sdw_type.h +++ b/include/linux/soundwire/sdw_type.h @@ -5,6 +5,13 @@ #define __SOUNDWIRE_TYPES_H extern struct bus_type sdw_bus_type; +extern struct device_type sdw_slave_type; +extern struct device_type sdw_master_type; + +static inline int is_sdw_slave(const struct device *dev) +{ + return dev->type == &sdw_slave_type; +} #define drv_to_sdw_driver(_drv) container_of(_drv, struct sdw_driver, driver) @@ -14,7 +21,7 @@ extern struct bus_type sdw_bus_type; int __sdw_register_driver(struct sdw_driver *drv, struct module *owner); void sdw_unregister_driver(struct sdw_driver *drv); -int sdw_slave_modalias(const struct sdw_slave *slave, char 
*buf, size_t size); +int sdw_slave_uevent(struct device *dev, struct kobj_uevent_env *env); /** * module_sdw_driver() - Helper macro for registering a Soundwire driver -- cgit v1.2.3 From dbb50c7a9949506f750d59d9ba4d58f0ce8ccd42 Mon Sep 17 00:00:00 2001 From: Bard Liao Date: Tue, 19 May 2020 01:43:20 +0800 Subject: soundwire: bus: add unique bus id Add a unique id for each bus. Suggested-by: Vinod Koul Signed-off-by: Bard Liao Acked-by: Jaroslav Kysela Link: https://lore.kernel.org/r/20200518174322.31561-4-yung-chuan.liao@linux.intel.com Signed-off-by: Vinod Koul --- drivers/soundwire/bus.c | 20 ++++++++++++++++++++ include/linux/soundwire/sdw.h | 2 ++ 2 files changed, 22 insertions(+) (limited to 'include/linux') diff --git a/drivers/soundwire/bus.c b/drivers/soundwire/bus.c index a8072791d6b8..03d72fa3c190 100644 --- a/drivers/soundwire/bus.c +++ b/drivers/soundwire/bus.c @@ -9,6 +9,19 @@ #include #include "bus.h" +static DEFINE_IDA(sdw_ida); + +static int sdw_get_id(struct sdw_bus *bus) +{ + int rc = ida_alloc(&sdw_ida, GFP_KERNEL); + + if (rc < 0) + return rc; + + bus->id = rc; + return 0; +} + /** * sdw_bus_master_add() - add a bus Master instance * @bus: bus instance @@ -29,6 +42,12 @@ int sdw_bus_master_add(struct sdw_bus *bus, struct device *parent, return -ENODEV; } + ret = sdw_get_id(bus); + if (ret) { + dev_err(bus->dev, "Failed to get bus id\n"); + return ret; + } + if (!bus->ops) { dev_err(bus->dev, "SoundWire Bus ops are not set\n"); return -EINVAL; @@ -144,6 +163,7 @@ void sdw_bus_master_delete(struct sdw_bus *bus) device_for_each_child(bus->dev, NULL, sdw_delete_slave); sdw_bus_debugfs_exit(bus); + ida_free(&sdw_ida, bus->id); } EXPORT_SYMBOL(sdw_bus_master_delete); diff --git a/include/linux/soundwire/sdw.h b/include/linux/soundwire/sdw.h index 2003e8c55538..a32cb26f1815 100644 --- a/include/linux/soundwire/sdw.h +++ b/include/linux/soundwire/sdw.h @@ -789,6 +789,7 @@ struct sdw_master_ops { * struct sdw_bus - SoundWire bus * @dev: Master linux device * @link_id: Link id number, can be 0 to N, unique for each Master + * @id: bus system-wide unique id * @slaves: list of Slaves on this bus * @assigned: Bitmap for Slave device numbers. * Bit set implies used number, bit clear implies unused number. @@ -813,6 +814,7 @@ struct sdw_master_ops { struct sdw_bus { struct device *dev; unsigned int link_id; + int id; struct list_head slaves; DECLARE_BITMAP(assigned, SDW_MAX_DEVICES); struct mutex bus_lock; -- cgit v1.2.3 From 7ceaa40b930e462ba0477ca6af34ec04d08181dc Mon Sep 17 00:00:00 2001 From: Pierre-Louis Bossart Date: Tue, 19 May 2020 01:43:21 +0800 Subject: soundwire: bus_type: add sdw_master_device support In the existing SoundWire code, Master Devices are not explicitly represented - only SoundWire Slave Devices are exposed (the use of capital letters follows the SoundWire specification conventions). With the existing code, the bus is handled without using a proper device, and bus->dev typically points to a platform device. The right thing to do as discussed in multiple reviews is to use a device for each bus. The sdw_master_device addition is done with minimal internal plumbing and not exposed externally. The existing API based on sdw_bus_master_add() and sdw_bus_master_delete() will deal with the sdw_master_device life cycle, which minimizes changes to existing drivers. Note that the Intel code will be modified in follow-up patches (no impact on any platform since the connection with ASoC is not supported upstream so far).
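As a rough sketch of how the new representation is consumed, a callback that knows it holds a Master device (dev->type == &sdw_master_type) can recover the bus through the shortcut fields introduced here; the logging function below is hypothetical, not part of this patch:

#include <linux/soundwire/sdw.h>

/* Hypothetical consumer: valid only for a device of sdw_master_type. */
static void my_log_master(struct device *dev)
{
        struct sdw_master_device *md = dev_to_sdw_master_device(dev);
        struct sdw_bus *bus = md->bus;

        dev_info(dev, "SoundWire link %u, bus id %d\n",
                 bus->link_id, bus->id);
}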
Signed-off-by: Pierre-Louis Bossart Signed-off-by: Bard Liao Acked-by: Jaroslav Kysela Link: https://lore.kernel.org/r/20200518174322.31561-5-yung-chuan.liao@linux.intel.com Signed-off-by: Vinod Koul --- drivers/soundwire/Makefile | 2 +- drivers/soundwire/bus.c | 14 ++++++-- drivers/soundwire/bus.h | 3 ++ drivers/soundwire/intel.c | 1 - drivers/soundwire/master.c | 81 +++++++++++++++++++++++++++++++++++++++++++ drivers/soundwire/qcom.c | 1 - include/linux/soundwire/sdw.h | 17 ++++++++- 7 files changed, 112 insertions(+), 7 deletions(-) create mode 100644 drivers/soundwire/master.c (limited to 'include/linux') diff --git a/drivers/soundwire/Makefile b/drivers/soundwire/Makefile index e2cdff990e9f..7319918e0aec 100644 --- a/drivers/soundwire/Makefile +++ b/drivers/soundwire/Makefile @@ -4,7 +4,7 @@ # #Bus Objs -soundwire-bus-objs := bus_type.o bus.o slave.o mipi_disco.o stream.o +soundwire-bus-objs := bus_type.o bus.o master.o slave.o mipi_disco.o stream.o obj-$(CONFIG_SOUNDWIRE) += soundwire-bus.o ifdef CONFIG_DEBUG_FS diff --git a/drivers/soundwire/bus.c b/drivers/soundwire/bus.c index 03d72fa3c190..11cfe435e4a8 100644 --- a/drivers/soundwire/bus.c +++ b/drivers/soundwire/bus.c @@ -37,14 +37,21 @@ int sdw_bus_master_add(struct sdw_bus *bus, struct device *parent, struct sdw_master_prop *prop = NULL; int ret; - if (!bus->dev) { - pr_err("SoundWire bus has no device\n"); + if (!parent) { + pr_err("SoundWire parent device is not set\n"); return -ENODEV; } ret = sdw_get_id(bus); if (ret) { - dev_err(bus->dev, "Failed to get bus id\n"); + dev_err(parent, "Failed to get bus id\n"); + return ret; + } + + ret = sdw_master_device_add(bus, parent, fwnode); + if (ret) { + dev_err(parent, "Failed to add master device at link %d\n", + bus->link_id); return ret; } @@ -161,6 +168,7 @@ static int sdw_delete_slave(struct device *dev, void *data) void sdw_bus_master_delete(struct sdw_bus *bus) { device_for_each_child(bus->dev, NULL, sdw_delete_slave); + sdw_master_device_del(bus); sdw_bus_debugfs_exit(bus); ida_free(&sdw_ida, bus->id); diff --git a/drivers/soundwire/bus.h b/drivers/soundwire/bus.h index 204204a26db8..93ab0234a491 100644 --- a/drivers/soundwire/bus.h +++ b/drivers/soundwire/bus.h @@ -19,6 +19,9 @@ static inline int sdw_acpi_find_slaves(struct sdw_bus *bus) int sdw_of_find_slaves(struct sdw_bus *bus); void sdw_extract_slave_id(struct sdw_bus *bus, u64 addr, struct sdw_slave_id *id); +int sdw_master_device_add(struct sdw_bus *bus, struct device *parent, + struct fwnode_handle *fwnode); +int sdw_master_device_del(struct sdw_bus *bus); #ifdef CONFIG_DEBUG_FS void sdw_bus_debugfs_init(struct sdw_bus *bus); diff --git a/drivers/soundwire/intel.c b/drivers/soundwire/intel.c index 430f8addef83..4cfdd074e310 100644 --- a/drivers/soundwire/intel.c +++ b/drivers/soundwire/intel.c @@ -1099,7 +1099,6 @@ static int intel_probe(struct platform_device *pdev) sdw->cdns.registers = sdw->link_res->registers; sdw->cdns.instance = sdw->instance; sdw->cdns.msg_count = 0; - sdw->cdns.bus.dev = &pdev->dev; sdw->cdns.bus.link_id = pdev->id; sdw_cdns_probe(&sdw->cdns); diff --git a/drivers/soundwire/master.c b/drivers/soundwire/master.c new file mode 100644 index 000000000000..6be0a027def7 --- /dev/null +++ b/drivers/soundwire/master.c @@ -0,0 +1,81 @@ +// SPDX-License-Identifier: GPL-2.0-only +// Copyright(c) 2019-2020 Intel Corporation. 
+ +#include +#include +#include +#include +#include "bus.h" + +static void sdw_master_device_release(struct device *dev) +{ + struct sdw_master_device *md = dev_to_sdw_master_device(dev); + + kfree(md); +} + +struct device_type sdw_master_type = { + .name = "soundwire_master", + .release = sdw_master_device_release, +}; + +/** + * sdw_master_device_add() - create a Linux Master Device representation. + * @bus: SDW bus instance + * @parent: parent device + * @fwnode: firmware node handle + */ +int sdw_master_device_add(struct sdw_bus *bus, struct device *parent, + struct fwnode_handle *fwnode) +{ + struct sdw_master_device *md; + int ret; + + if (!parent) + return -EINVAL; + + md = kzalloc(sizeof(*md), GFP_KERNEL); + if (!md) + return -ENOMEM; + + md->dev.bus = &sdw_bus_type; + md->dev.type = &sdw_master_type; + md->dev.parent = parent; + md->dev.of_node = parent->of_node; + md->dev.fwnode = fwnode; + md->dev.dma_mask = parent->dma_mask; + + dev_set_name(&md->dev, "sdw-master-%d", bus->id); + + ret = device_register(&md->dev); + if (ret) { + dev_err(parent, "Failed to add master: ret %d\n", ret); + /* + * On err, don't free but drop ref as this will be freed + * when release method is invoked. + */ + put_device(&md->dev); + goto device_register_err; + } + + /* add shortcuts to improve code readability/compactness */ + md->bus = bus; + bus->dev = &md->dev; + bus->md = md; + +device_register_err: + return ret; +} + +/** + * sdw_master_device_del() - delete a Linux Master Device representation. + * @bus: bus handle + * + * This function is the dual of sdw_master_device_add() + */ +int sdw_master_device_del(struct sdw_bus *bus) +{ + device_unregister(bus->dev); + + return 0; +} diff --git a/drivers/soundwire/qcom.c b/drivers/soundwire/qcom.c index ea03bb988f81..a1c2a44a3b4d 100644 --- a/drivers/soundwire/qcom.c +++ b/drivers/soundwire/qcom.c @@ -784,7 +784,6 @@ static int qcom_swrm_probe(struct platform_device *pdev) mutex_init(&ctrl->port_lock); INIT_WORK(&ctrl->slave_work, qcom_swrm_slave_wq); - ctrl->bus.dev = dev; ctrl->bus.ops = &qcom_swrm_ops; ctrl->bus.port_ops = &qcom_swrm_port_ops; ctrl->bus.compute_params = &qcom_swrm_compute_params; diff --git a/include/linux/soundwire/sdw.h b/include/linux/soundwire/sdw.h index a32cb26f1815..7658d9698dd5 100644 --- a/include/linux/soundwire/sdw.h +++ b/include/linux/soundwire/sdw.h @@ -632,6 +632,19 @@ struct sdw_slave { #define dev_to_sdw_dev(_dev) container_of(_dev, struct sdw_slave, dev) +/** + * struct sdw_master_device - SoundWire 'Master Device' representation + * @dev: Linux device for this Master + * @bus: Bus handle shortcut + */ +struct sdw_master_device { + struct device dev; + struct sdw_bus *bus; +}; + +#define dev_to_sdw_master_device(d) \ + container_of(d, struct sdw_master_device, dev) + struct sdw_driver { const char *name; @@ -787,7 +800,8 @@ struct sdw_master_ops { /** * struct sdw_bus - SoundWire bus - * @dev: Master linux device + * @dev: Shortcut to &bus->md->dev to avoid changing the entire code. 
+ * @md: Master device * @link_id: Link id number, can be 0 to N, unique for each Master * @id: bus system-wide unique id * @slaves: list of Slaves on this bus @@ -813,6 +827,7 @@ struct sdw_master_ops { */ struct sdw_bus { struct device *dev; + struct sdw_master_device *md; unsigned int link_id; int id; struct list_head slaves; -- cgit v1.2.3 From 2318976619daf0e868de5b8aff19c1fd8d585867 Mon Sep 17 00:00:00 2001 From: Vincent Whitchurch Date: Thu, 14 May 2020 11:36:41 +0100 Subject: ARM: 8976/1: module: allow arch overrides for .init section names ARM stores unwind information for .init.text in sections named .ARM.extab.init.text and .ARM.exidx.init.text. Since those aren't currently recognized as init sections, they're allocated along with the core section, and relocation fails if the core and the init section are allocated from different regions and can't reach each other. final section addresses: ... 0x7f800000 .init.text .. 0xcbb54078 .ARM.exidx.init.text .. section 16 reloc 0 sym '': relocation 42 out of range (0xcbb54078 -> 0x7f800000) Allow architectures to override the section name so that ARM can fix this. Acked-by: Jessica Yu Signed-off-by: Vincent Whitchurch Signed-off-by: Russell King --- include/linux/moduleloader.h | 5 +++++ kernel/module.c | 9 +++++++-- 2 files changed, 12 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/moduleloader.h b/include/linux/moduleloader.h index ca92aea8a6bd..4fa67a8b2265 100644 --- a/include/linux/moduleloader.h +++ b/include/linux/moduleloader.h @@ -29,6 +29,11 @@ void *module_alloc(unsigned long size); /* Free memory returned from module_alloc. */ void module_memfree(void *module_region); +/* Determines if the section name is an init section (that is only used during + * module loading). + */ +bool module_init_section(const char *name); + /* Determines if the section name is an exit section (that is only used during * module unloading) */ diff --git a/kernel/module.c b/kernel/module.c index 646f1e2330d2..d29c23d07aff 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -2400,7 +2400,7 @@ static void layout_sections(struct module *mod, struct load_info *info) if ((s->sh_flags & masks[m][0]) != masks[m][0] || (s->sh_flags & masks[m][1]) || s->sh_entsize != ~0UL - || strstarts(sname, ".init")) + || module_init_section(sname)) continue; s->sh_entsize = get_offset(mod, &mod->core_layout.size, s, i); pr_debug("\t%s\n", sname); @@ -2433,7 +2433,7 @@ static void layout_sections(struct module *mod, struct load_info *info) if ((s->sh_flags & masks[m][0]) != masks[m][0] || (s->sh_flags & masks[m][1]) || s->sh_entsize != ~0UL - || !strstarts(sname, ".init")) + || !module_init_section(sname)) continue; s->sh_entsize = (get_offset(mod, &mod->init_layout.size, s, i) | INIT_OFFSET_MASK); @@ -2768,6 +2768,11 @@ void * __weak module_alloc(unsigned long size) return vmalloc_exec(size); } +bool __weak module_init_section(const char *name) +{ + return strstarts(name, ".init"); +} + bool __weak module_exit_section(const char *name) { return strstarts(name, ".exit"); -- cgit v1.2.3
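A hedged sketch of the per-architecture override this hook enables; the ARM implementation proper lands in a follow-up patch, so the function below is only an illustration built from the section names quoted in the commit message above:

#include <linux/moduleloader.h>
#include <linux/string.h>

/* Sketch only: also treat the ARM unwind tables for .init.text as init
 * sections, so they are placed in the init region together with the
 * code they describe.
 */
bool module_init_section(const char *name)
{
        return strstarts(name, ".init") ||
               strstarts(name, ".ARM.extab.init") ||
               strstarts(name, ".ARM.exidx.init");
}

Because the generic module_init_section() is declared __weak, an architecture definition like this one simply overrides it.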
From 333cff6c963fbc8b9820ca2b6a8b2e22a572cd43 Mon Sep 17 00:00:00 2001 From: Daniel Lezcano Date: Wed, 29 Apr 2020 12:36:39 +0200 Subject: powercap/drivers/idle_inject: Specify idle state max latency Currently the idle injection framework uses the play_idle() function which puts the current CPU in an idle state. The idle state is the deepest one, as play_idle() calls the underlying play_idle_precise() function with a latency constraint of INT_MAX. Idle injection is used by the cpuidle_cooling device, which computes the idle / run durations to mitigate the temperature by injecting idle cycles. The cooling device has no control over the depth of the idle state. Allow finer control of the idle injection mechanism by letting callers specify the latency for the idle state. Thus the cooling device gets a guarantee on the exit latency of the idle states it injects. Acked-by: Rafael J. Wysocki Signed-off-by: Daniel Lezcano Reviewed-by: Amit Kucheria Link: https://lore.kernel.org/r/20200429103644.5492-1-daniel.lezcano@linaro.org --- drivers/powercap/idle_inject.c | 16 +++++++++++++++- include/linux/idle_inject.h | 4 ++++ 2 files changed, 19 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/drivers/powercap/idle_inject.c b/drivers/powercap/idle_inject.c index e9bbd3c42eef..c90f0990968b 100644 --- a/drivers/powercap/idle_inject.c +++ b/drivers/powercap/idle_inject.c @@ -61,12 +61,14 @@ struct idle_inject_thread { * @timer: idle injection period timer * @idle_duration_us: duration of CPU idle time to inject * @run_duration_us: duration of CPU run time to allow + * @latency_us: max allowed latency * @cpumask: mask of CPUs affected by idle injection */ struct idle_inject_device { struct hrtimer timer; unsigned int idle_duration_us; unsigned int run_duration_us; + unsigned int latency_us; unsigned long cpumask[]; }; @@ -138,7 +140,8 @@ static void idle_inject_fn(unsigned int cpu) */ iit->should_run = 0; - play_idle(READ_ONCE(ii_dev->idle_duration_us)); + play_idle_precise(READ_ONCE(ii_dev->idle_duration_us) * NSEC_PER_USEC, + READ_ONCE(ii_dev->latency_us) * NSEC_PER_USEC); } /** @@ -169,6 +172,16 @@ void idle_inject_get_duration(struct idle_inject_device *ii_dev, *idle_duration_us = READ_ONCE(ii_dev->idle_duration_us); } +/** + * idle_inject_set_latency - set the maximum latency allowed + * @latency_us: set the latency requirement for the idle state + */ +void idle_inject_set_latency(struct idle_inject_device *ii_dev, + unsigned int latency_us) +{ + WRITE_ONCE(ii_dev->latency_us, latency_us); +} + /** * idle_inject_start - start idle injections * @ii_dev: idle injection control device structure @@ -297,6 +310,7 @@ struct idle_inject_device *idle_inject_register(struct cpumask *cpumask) cpumask_copy(to_cpumask(ii_dev->cpumask), cpumask); hrtimer_init(&ii_dev->timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); ii_dev->timer.function = idle_inject_timer_fn; + ii_dev->latency_us = UINT_MAX; for_each_cpu(cpu, to_cpumask(ii_dev->cpumask)) { diff --git a/include/linux/idle_inject.h b/include/linux/idle_inject.h index a445cd1a36c5..91a8612b8bf9 100644 --- a/include/linux/idle_inject.h +++ b/include/linux/idle_inject.h @@ -26,4 +26,8 @@ void idle_inject_set_duration(struct idle_inject_device *ii_dev, void idle_inject_get_duration(struct idle_inject_device *ii_dev, unsigned int *run_duration_us, unsigned int *idle_duration_us); + +void idle_inject_set_latency(struct idle_inject_device *ii_dev, + unsigned int latency_ns); + #endif /* __IDLE_INJECT_H__ */ -- cgit v1.2.3
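Putting the extended API together, a consumer can now bound the exit latency of the injected idle state before starting injection. A minimal sketch follows; the durations and the helper name are illustrative, while the idle_inject_* calls are the ones added or already exported by this framework:

#include <linux/cpumask.h>
#include <linux/idle_inject.h>

static int my_start_injection(struct cpumask *cpumask)
{
        struct idle_inject_device *ii_dev;

        ii_dev = idle_inject_register(cpumask);
        if (!ii_dev)
                return -ENODEV;

        /* 10 ms run / 5 ms idle, idle-state exit latency capped at 500 us */
        idle_inject_set_duration(ii_dev, 10000, 5000);
        idle_inject_set_latency(ii_dev, 500);

        return idle_inject_start(ii_dev);
}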
From dfd0bda3703cdaf1fccd5da72cb7101a4fedfe68 Mon Sep 17 00:00:00 2001 From: Daniel Lezcano Date: Wed, 29 Apr 2020 12:36:41 +0200 Subject: thermal/drivers/cpuidle_cooling: Change the registration function Today, there is no user for the cpuidle cooling device. The targeted platforms are ARM and ARM64. The cpuidle and the cpufreq cooling devices are based on the device tree. As the cpuidle cooling device can have its own configuration depending on the platform and the available idle states, the DT node description provides the optional properties to set the cooling device up. No longer rely on the CPU node, which is error-prone and would lead to confusion in the DT because the cpufreq cooling device also uses it. Instead, initialize the cpuidle cooling device from the DT binding. This was tested on: - hikey960 - hikey6220 - rock960 - db845c Acked-by: Viresh Kumar Signed-off-by: Daniel Lezcano Reviewed-by: Lukasz Luba Reviewed-by: Amit Kucheria Tested-by: Amit Kucheria Link: https://lore.kernel.org/r/20200429103644.5492-3-daniel.lezcano@linaro.org --- drivers/thermal/cpuidle_cooling.c | 63 +++++++++++++++++++++++++++++++-------- include/linux/cpu_cooling.h | 12 ++------ 2 files changed, 53 insertions(+), 22 deletions(-) (limited to 'include/linux') diff --git a/drivers/thermal/cpuidle_cooling.c b/drivers/thermal/cpuidle_cooling.c index 0bb843246f59..78e3e8238116 100644 --- a/drivers/thermal/cpuidle_cooling.c +++ b/drivers/thermal/cpuidle_cooling.c @@ -5,11 +5,14 @@ * Author: Daniel Lezcano * */ +#define pr_fmt(fmt) "cpuidle cooling: " fmt + #include #include #include #include #include +#include #include #include @@ -154,22 +157,25 @@ static struct thermal_cooling_device_ops cpuidle_cooling_ops = { }; /** - * cpuidle_of_cooling_register - Idle cooling device initialization function + * __cpuidle_cooling_register: register the cooling device * @drv: a cpuidle driver structure pointer - * @np: a node pointer to a device tree cooling device node + * @np: a device node structure pointer used for the thermal binding * - * This function is in charge of creating a cooling device per cpuidle - * driver and register it to thermal framework. + * This function is in charge of allocating the cpuidle cooling device + * structure, the idle injection, initialize them and register the + * cooling device to the thermal framework. * - * Return: zero on success, or negative value corresponding to the - * error detected in the underlying subsystems.
+ * Return: zero on success, a negative value returned by one of the + * underlying subsystem in case of error */ -int cpuidle_of_cooling_register(struct device_node *np, - struct cpuidle_driver *drv) +static int __cpuidle_cooling_register(struct device_node *np, + struct cpuidle_driver *drv) { struct idle_inject_device *ii_dev; struct cpuidle_cooling_device *idle_cdev; struct thermal_cooling_device *cdev; + unsigned int idle_duration_us = TICK_USEC; + unsigned int latency_us = UINT_MAX; char dev_name[THERMAL_NAME_LENGTH]; int id, ret; @@ -191,7 +197,11 @@ int cpuidle_of_cooling_register(struct device_node *np, goto out_id; } - idle_inject_set_duration(ii_dev, TICK_USEC, TICK_USEC); + of_property_read_u32(np, "duration-us", &idle_duration_us); + of_property_read_u32(np, "exit-latency-us", &latency_us); + + idle_inject_set_duration(ii_dev, TICK_USEC, idle_duration_us); + idle_inject_set_latency(ii_dev, latency_us); idle_cdev->ii_dev = ii_dev; @@ -204,6 +214,9 @@ int cpuidle_of_cooling_register(struct device_node *np, goto out_unregister; } + pr_debug("%s: Idle injection set with idle duration=%u, latency=%u\n", + dev_name, idle_duration_us, latency_us); + return 0; out_unregister: @@ -221,12 +234,38 @@ out: * @drv: a cpuidle driver structure pointer * * This function is in charge of creating a cooling device per cpuidle - * driver and register it to thermal framework. + * driver and register it to the thermal framework. * * Return: zero on success, or negative value corresponding to the * error detected in the underlying subsystems. */ -int cpuidle_cooling_register(struct cpuidle_driver *drv) +void cpuidle_cooling_register(struct cpuidle_driver *drv) { - return cpuidle_of_cooling_register(NULL, drv); + struct device_node *cooling_node; + struct device_node *cpu_node; + int cpu, ret; + + for_each_cpu(cpu, drv->cpumask) { + + cpu_node = of_cpu_device_node_get(cpu); + + cooling_node = of_get_child_by_name(cpu_node, "thermal-idle"); + + of_node_put(cpu_node); + + if (!cooling_node) { + pr_debug("'thermal-idle' node not found for cpu%d\n", cpu); + continue; + } + + ret = __cpuidle_cooling_register(cooling_node, drv); + + of_node_put(cooling_node); + + if (ret) { + pr_err("Failed to register the cpuidle cooling device" \ + "for cpu%d: %d\n", cpu, ret); + break; + } + } } diff --git a/include/linux/cpu_cooling.h b/include/linux/cpu_cooling.h index 65501d8f9778..a3bdc8a98f2c 100644 --- a/include/linux/cpu_cooling.h +++ b/include/linux/cpu_cooling.h @@ -63,18 +63,10 @@ of_cpufreq_cooling_register(struct cpufreq_policy *policy) struct cpuidle_driver; #ifdef CONFIG_CPU_IDLE_THERMAL -int cpuidle_cooling_register(struct cpuidle_driver *drv); -int cpuidle_of_cooling_register(struct device_node *np, - struct cpuidle_driver *drv); +void cpuidle_cooling_register(struct cpuidle_driver *drv); #else /* CONFIG_CPU_IDLE_THERMAL */ -static inline int cpuidle_cooling_register(struct cpuidle_driver *drv) +static inline void cpuidle_cooling_register(struct cpuidle_driver *drv) { - return 0; -} -static inline int cpuidle_of_cooling_register(struct device_node *np, - struct cpuidle_driver *drv) -{ - return 0; } #endif /* CONFIG_CPU_IDLE_THERMAL */ -- cgit v1.2.3 From 9d78edeaec759f997c303f286ecd39daee166f2a Mon Sep 17 00:00:00 2001 From: Alexey Gladkov Date: Mon, 18 May 2020 20:07:38 +0200 Subject: proc: proc_pid_ns takes super_block as an argument syzbot found that touch /proc/testfile causes NULL pointer dereference at tomoyo_get_local_path() because inode of the dentry is NULL. 
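Roughly, the failure mode follows from the old inode-based helper, reconstructed here from the lines this patch removes:

/* Old helper (removed below): it dereferences the inode, so a caller
 * that only has a dentry with a NULL inode, as in the Tomoyo path
 * above, crashes on inode->i_sb.
 */
static inline struct pid_namespace *proc_pid_ns(const struct inode *inode)
{
        return proc_sb_info(inode->i_sb)->pid_ns;
}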
Before c59f415a7cb6, Tomoyo received pid_ns from proc's s_fs_info directly. Since proc_pid_ns() can only work with an inode, using it in tomoyo_get_local_path() was wrong. To avoid creating more functions for getting proc_ns, change the argument type of the proc_pid_ns() function. Then, Tomoyo can use the existing super_block to get pid_ns. Link: https://lkml.kernel.org/r/0000000000002f0c7505a5b0e04c@google.com Link: https://lkml.kernel.org/r/20200518180738.2939611-1-gladkov.alexey@gmail.com Reported-by: syzbot+c1af344512918c61362c@syzkaller.appspotmail.com Fixes: c59f415a7cb6 ("Use proc_pid_ns() to get pid_namespace from the proc superblock") Signed-off-by: Alexey Gladkov Signed-off-by: Eric W. Biederman --- fs/locks.c | 4 ++-- fs/proc/array.c | 2 +- fs/proc/base.c | 10 +++++----- fs/proc/self.c | 2 +- fs/proc/thread_self.c | 2 +- include/linux/proc_fs.h | 4 ++-- kernel/fork.c | 2 +- net/ipv6/ip6_flowlabel.c | 2 +- security/tomoyo/realpath.c | 2 +- 9 files changed, 15 insertions(+), 15 deletions(-) (limited to 'include/linux') diff --git a/fs/locks.c b/fs/locks.c index 399c5dbb72c4..ab702d6efb55 100644 --- a/fs/locks.c +++ b/fs/locks.c @@ -2823,7 +2823,7 @@ static void lock_get_status(struct seq_file *f, struct file_lock *fl, { struct inode *inode = NULL; unsigned int fl_pid; - struct pid_namespace *proc_pidns = proc_pid_ns(file_inode(f->file)); + struct pid_namespace *proc_pidns = proc_pid_ns(file_inode(f->file)->i_sb); fl_pid = locks_translate_pid(fl, proc_pidns); /* @@ -2901,7 +2901,7 @@ static int locks_show(struct seq_file *f, void *v) { struct locks_iterator *iter = f->private; struct file_lock *fl, *bfl; - struct pid_namespace *proc_pidns = proc_pid_ns(file_inode(f->file)); + struct pid_namespace *proc_pidns = proc_pid_ns(file_inode(f->file)->i_sb); fl = hlist_entry(v, struct file_lock, fl_link); diff --git a/fs/proc/array.c b/fs/proc/array.c index 8e16f14bb05a..043311014db2 100644 --- a/fs/proc/array.c +++ b/fs/proc/array.c @@ -728,7 +728,7 @@ static int children_seq_show(struct seq_file *seq, void *v) { struct inode *inode = file_inode(seq->file); - seq_printf(seq, "%d ", pid_nr_ns(v, proc_pid_ns(inode))); + seq_printf(seq, "%d ", pid_nr_ns(v, proc_pid_ns(inode->i_sb))); return 0; } diff --git a/fs/proc/base.c b/fs/proc/base.c index 5a307b3bb2d1..30c9fceca0b7 100644 --- a/fs/proc/base.c +++ b/fs/proc/base.c @@ -754,7 +754,7 @@ static const struct inode_operations proc_def_inode_operations = { static int proc_single_show(struct seq_file *m, void *v) { struct inode *inode = m->private; - struct pid_namespace *ns = proc_pid_ns(inode); + struct pid_namespace *ns = proc_pid_ns(inode->i_sb); struct pid *pid = proc_pid(inode); struct task_struct *task; int ret; @@ -1423,7 +1423,7 @@ static const struct file_operations proc_fail_nth_operations = { static int sched_show(struct seq_file *m, void *v) { struct inode *inode = m->private; - struct pid_namespace *ns = proc_pid_ns(inode); + struct pid_namespace *ns = proc_pid_ns(inode->i_sb); struct task_struct *p; p = get_proc_task(inode); @@ -2466,7 +2466,7 @@ static int proc_timers_open(struct inode *inode, struct file *file) return -ENOMEM; tp->pid = proc_pid(inode); - tp->ns = proc_pid_ns(inode); + tp->ns = proc_pid_ns(inode->i_sb); return 0; } @@ -3377,7 +3377,7 @@ int proc_pid_readdir(struct file *file, struct dir_context *ctx) { struct tgid_iter iter; struct proc_fs_info *fs_info = proc_sb_info(file_inode(file)->i_sb); - struct pid_namespace *ns = proc_pid_ns(file_inode(file)); + struct pid_namespace *ns =
proc_pid_ns(file_inode(file)->i_sb); loff_t pos = ctx->pos; if (pos >= PID_MAX_LIMIT + TGID_OFFSET) @@ -3730,7 +3730,7 @@ static int proc_task_readdir(struct file *file, struct dir_context *ctx) /* f_version caches the tgid value that the last readdir call couldn't * return. lseek aka telldir automagically resets f_version to 0. */ - ns = proc_pid_ns(inode); + ns = proc_pid_ns(inode->i_sb); tid = (int)file->f_version; file->f_version = 0; for (task = first_tid(proc_pid(inode), tid, ctx->pos - 2, ns); diff --git a/fs/proc/self.c b/fs/proc/self.c index 309301ac0136..ca5158fa561c 100644 --- a/fs/proc/self.c +++ b/fs/proc/self.c @@ -12,7 +12,7 @@ static const char *proc_self_get_link(struct dentry *dentry, struct inode *inode, struct delayed_call *done) { - struct pid_namespace *ns = proc_pid_ns(inode); + struct pid_namespace *ns = proc_pid_ns(inode->i_sb); pid_t tgid = task_tgid_nr_ns(current, ns); char *name; diff --git a/fs/proc/thread_self.c b/fs/proc/thread_self.c index 2493cbbdfa6f..ac284f409568 100644 --- a/fs/proc/thread_self.c +++ b/fs/proc/thread_self.c @@ -12,7 +12,7 @@ static const char *proc_thread_self_get_link(struct dentry *dentry, struct inode *inode, struct delayed_call *done) { - struct pid_namespace *ns = proc_pid_ns(inode); + struct pid_namespace *ns = proc_pid_ns(inode->i_sb); pid_t tgid = task_tgid_nr_ns(current, ns); pid_t pid = task_pid_nr_ns(current, ns); char *name; diff --git a/include/linux/proc_fs.h b/include/linux/proc_fs.h index 2cb424e6f36a..6ec524d8842c 100644 --- a/include/linux/proc_fs.h +++ b/include/linux/proc_fs.h @@ -202,9 +202,9 @@ int open_related_ns(struct ns_common *ns, struct ns_common *(*get_ns)(struct ns_common *ns)); /* get the associated pid namespace for a file in procfs */ -static inline struct pid_namespace *proc_pid_ns(const struct inode *inode) +static inline struct pid_namespace *proc_pid_ns(struct super_block *sb) { - return proc_sb_info(inode->i_sb)->pid_ns; + return proc_sb_info(sb)->pid_ns; } #endif /* _LINUX_PROC_FS_H */ diff --git a/kernel/fork.c b/kernel/fork.c index 4385f3d639f2..e7bdaccad942 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -1745,7 +1745,7 @@ static void pidfd_show_fdinfo(struct seq_file *m, struct file *f) pid_t nr = -1; if (likely(pid_has_task(pid, PIDTYPE_PID))) { - ns = proc_pid_ns(file_inode(m->file)); + ns = proc_pid_ns(file_inode(m->file)->i_sb); nr = pid_nr_ns(pid, ns); } diff --git a/net/ipv6/ip6_flowlabel.c b/net/ipv6/ip6_flowlabel.c index d64b83e85642..ce4fbba4acce 100644 --- a/net/ipv6/ip6_flowlabel.c +++ b/net/ipv6/ip6_flowlabel.c @@ -779,7 +779,7 @@ static void *ip6fl_seq_start(struct seq_file *seq, loff_t *pos) { struct ip6fl_iter_state *state = ip6fl_seq_private(seq); - state->pid_ns = proc_pid_ns(file_inode(seq->file)); + state->pid_ns = proc_pid_ns(file_inode(seq->file)->i_sb); rcu_read_lock_bh(); return *pos ? 
ip6fl_get_idx(seq, *pos - 1) : SEQ_START_TOKEN; diff --git a/security/tomoyo/realpath.c b/security/tomoyo/realpath.c index 08b096e2f7e3..df4798980416 100644 --- a/security/tomoyo/realpath.c +++ b/security/tomoyo/realpath.c @@ -162,7 +162,7 @@ static char *tomoyo_get_local_path(struct dentry *dentry, char * const buffer, if (sb->s_magic == PROC_SUPER_MAGIC && *pos == '/') { char *ep; const pid_t pid = (pid_t) simple_strtoul(pos + 1, &ep, 10); - struct pid_namespace *proc_pidns = proc_pid_ns(d_inode(dentry)); + struct pid_namespace *proc_pidns = proc_pid_ns(sb); if (*ep == '/' && pid && pid == task_tgid_nr_ns(current, proc_pidns)) { -- cgit v1.2.3 From 6553896666433e7efec589838b400a2a652b3ffa Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Mon, 9 Mar 2020 22:47:17 +0100 Subject: vmlinux.lds.h: Create section for protection against instrumentation Some code paths, especially the low level entry code, must be protected against instrumentation for various reasons: - Low level entry code can be a fragile beast, especially on x86. - With NO_HZ_FULL, RCU state needs to be established before using it. Having a dedicated section for such code allows tooling to validate that no unsafe functions are invoked. Add the .noinstr.text section and the noinstr attribute to mark functions. noinstr implies notrace. Kprobes will gain a section check later. Also provide a set of markers: instrumentation_begin()/end(). These are used to mark code inside a noinstr function which calls into the regular instrumentable text section as safe. The instrumentation markers are only active when CONFIG_DEBUG_ENTRY is enabled, as the end marker emits a NOP to prevent the compiler from merging the annotation points. This means the objtool verification requires a kernel compiled with this option. Signed-off-by: Thomas Gleixner Reviewed-by: Alexandre Chartre Acked-by: Peter Zijlstra Link: https://lkml.kernel.org/r/20200505134100.075416272@linutronix.de --- arch/powerpc/kernel/vmlinux.lds.S | 1 + include/asm-generic/sections.h | 3 +++ include/asm-generic/vmlinux.lds.h | 10 ++++++++ include/linux/compiler.h | 53 +++++++++++++++++++++++++++++++++++++++ include/linux/compiler_types.h | 4 +++ scripts/mod/modpost.c | 2 +- 6 files changed, 72 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/arch/powerpc/kernel/vmlinux.lds.S b/arch/powerpc/kernel/vmlinux.lds.S index 31a0f201fb6f..a1706b63b82d 100644 --- a/arch/powerpc/kernel/vmlinux.lds.S +++ b/arch/powerpc/kernel/vmlinux.lds.S @@ -90,6 +90,7 @@ SECTIONS #ifdef CONFIG_PPC64 *(.tramp.ftrace.text); #endif + NOINSTR_TEXT SCHED_TEXT CPUIDLE_TEXT LOCK_TEXT diff --git a/include/asm-generic/sections.h b/include/asm-generic/sections.h index d1779d442aa5..66397ed10acb 100644 --- a/include/asm-generic/sections.h +++ b/include/asm-generic/sections.h @@ -53,6 +53,9 @@ extern char __ctors_start[], __ctors_end[]; /* Start and end of .opd section - used for function descriptors. */ extern char __start_opd[], __end_opd[]; +/* Start and end of instrumentation protected text section */ +extern char __noinstr_text_start[], __noinstr_text_end[]; + extern __visible const void __nosave_begin, __nosave_end; /* Function descriptor handling (if any). Override in asm/sections.h */ diff --git a/include/asm-generic/vmlinux.lds.h b/include/asm-generic/vmlinux.lds.h index 71e387a5fe90..db600ef218d7 100644 --- a/include/asm-generic/vmlinux.lds.h +++ b/include/asm-generic/vmlinux.lds.h @@ -540,6 +540,15 @@ .
= ALIGN((align)); \ __end_rodata = .; +/* + * Non-instrumentable text section + */ +#define NOINSTR_TEXT \ + ALIGN_FUNCTION(); \ + __noinstr_text_start = .; \ + *(.noinstr.text) \ + __noinstr_text_end = .; + /* * .text section. Map to function alignment to avoid address changes * during second ld run in second ld pass when generating System.map @@ -551,6 +560,7 @@ #define TEXT_TEXT \ ALIGN_FUNCTION(); \ *(.text.hot TEXT_MAIN .text.fixup .text.unlikely) \ + NOINSTR_TEXT \ *(.text..refcount) \ *(.ref.text) \ MEM_KEEP(init.text*) \ diff --git a/include/linux/compiler.h b/include/linux/compiler.h index 034b0a644efc..e9ead0505671 100644 --- a/include/linux/compiler.h +++ b/include/linux/compiler.h @@ -120,12 +120,65 @@ void ftrace_likely_update(struct ftrace_likely_data *f, int val, /* Annotate a C jump table to allow objtool to follow the code flow */ #define __annotate_jump_table __section(.rodata..c_jump_table) +#ifdef CONFIG_DEBUG_ENTRY +/* Begin/end of an instrumentation safe region */ +#define instrumentation_begin() ({ \ + asm volatile("%c0:\n\t" \ + ".pushsection .discard.instr_begin\n\t" \ + ".long %c0b - .\n\t" \ + ".popsection\n\t" : : "i" (__COUNTER__)); \ +}) + +/* + * Because instrumentation_{begin,end}() can nest, objtool validation considers + * _begin() a +1 and _end() a -1 and computes a sum over the instructions. + * When the value is greater than 0, we consider instrumentation allowed. + * + * There is a problem with code like: + * + * noinstr void foo() + * { + * instrumentation_begin(); + * ... + * if (cond) { + * instrumentation_begin(); + * ... + * instrumentation_end(); + * } + * bar(); + * instrumentation_end(); + * } + * + * If instrumentation_end() would be an empty label, like all the other + * annotations, the inner _end(), which is at the end of a conditional block, + * would land on the instruction after the block. + * + * If we then consider the sum of the !cond path, we'll see that the call to + * bar() is with a 0-value, even though, we meant it to happen with a positive + * value. + * + * To avoid this, have _end() be a NOP instruction, this ensures it will be + * part of the condition block and does not escape. 
+ */ +#define instrumentation_end() ({ \ + asm volatile("%c0: nop\n\t" \ + ".pushsection .discard.instr_end\n\t" \ + ".long %c0b - .\n\t" \ + ".popsection\n\t" : : "i" (__COUNTER__)); \ +}) +#endif /* CONFIG_DEBUG_ENTRY */ + #else #define annotate_reachable() #define annotate_unreachable() #define __annotate_jump_table #endif +#ifndef instrumentation_begin +#define instrumentation_begin() do { } while(0) +#define instrumentation_end() do { } while(0) +#endif + #ifndef ASM_UNREACHABLE # define ASM_UNREACHABLE #endif diff --git a/include/linux/compiler_types.h b/include/linux/compiler_types.h index e970f97a7fcb..5da257cbebf1 100644 --- a/include/linux/compiler_types.h +++ b/include/linux/compiler_types.h @@ -118,6 +118,10 @@ struct ftrace_likely_data { #define notrace __attribute__((__no_instrument_function__)) #endif +/* Section for code which can't be instrumented at all */ +#define noinstr \ + noinline notrace __attribute((__section__(".noinstr.text"))) + /* * it doesn't make sense on ARM (currently the only user of __naked) * to trace naked functions because then mcount is called without diff --git a/scripts/mod/modpost.c b/scripts/mod/modpost.c index 5c3c50c5ec52..0053d4fea847 100644 --- a/scripts/mod/modpost.c +++ b/scripts/mod/modpost.c @@ -948,7 +948,7 @@ static void check_section(const char *modname, struct elf_info *elf, #define DATA_SECTIONS ".data", ".data.rel" #define TEXT_SECTIONS ".text", ".text.unlikely", ".sched.text", \ - ".kprobes.text", ".cpuidle.text" + ".kprobes.text", ".cpuidle.text", ".noinstr.text" #define OTHER_TEXT_SECTIONS ".ref.text", ".head.text", ".spinlock.text", \ ".fixup", ".entry.text", ".exception.text", ".text.*", \ ".coldtext" -- cgit v1.2.3 From 0995a5dfbe49badff78e78761fb66f46579f2f9a Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 4 Mar 2020 13:09:50 +0100 Subject: tracing: Provide lockdep less trace_hardirqs_on/off() variants trace_hardirqs_on/off() is only partially safe vs. RCU idle. The tracer core itself is safe, but the resulting tracepoints can be utilized by e.g. BPF which is unsafe. Provide variants which do not contain the lockdep invocation so the lockdep and tracer invocations can be split at the call site and placed properly. This is required because lockdep needs to be aware of the state before switching away from RCU idle and after switching to RCU idle because these transitions can take locks. As these code paths are going to be non-instrumentable, the tracer can be invoked after RCU is turned on and before the switch to RCU idle. So for these new variants there is no need to invoke the rcuidle-aware tracer functions. Name them so they match the lockdep counterparts.
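As a sketch of the intended call-site split (the function below is hypothetical and not part of this patch; trace_hardirqs_on_prepare() and the pre-existing lockdep_hardirqs_on() are real):

	/* Hypothetical exit-to-idle path, illustrative only. */
	static void example_exit_to_idle(void)
	{
		trace_hardirqs_on_prepare();		/* tracer part, RCU still watching */
		lockdep_hardirqs_on(CALLER_ADDR0);	/* lockdep part, placed separately */
		/* ... architecture code switches to RCU idle ... */
	}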
Signed-off-by: Thomas Gleixner Reviewed-by: Alexandre Chartre Acked-by: Peter Zijlstra Link: https://lkml.kernel.org/r/20200505134100.270771162@linutronix.de --- include/linux/irqflags.h | 4 ++++ kernel/trace/trace_preemptirq.c | 37 +++++++++++++++++++++++++++++++++++++ 2 files changed, 41 insertions(+) (limited to 'include/linux') diff --git a/include/linux/irqflags.h b/include/linux/irqflags.h index 61a9ced3aa50..f150e69ab81d 100644 --- a/include/linux/irqflags.h +++ b/include/linux/irqflags.h @@ -29,6 +29,8 @@ #endif #ifdef CONFIG_TRACE_IRQFLAGS + extern void trace_hardirqs_on_prepare(void); + extern void trace_hardirqs_off_prepare(void); extern void trace_hardirqs_on(void); extern void trace_hardirqs_off(void); # define lockdep_hardirq_context(p) ((p)->hardirq_context) @@ -96,6 +98,8 @@ do { \ } while (0) #else +# define trace_hardirqs_on_prepare() do { } while (0) +# define trace_hardirqs_off_prepare() do { } while (0) # define trace_hardirqs_on() do { } while (0) # define trace_hardirqs_off() do { } while (0) # define lockdep_hardirq_context(p) 0 diff --git a/kernel/trace/trace_preemptirq.c b/kernel/trace/trace_preemptirq.c index 4d8e99fdbbbe..c00880162b06 100644 --- a/kernel/trace/trace_preemptirq.c +++ b/kernel/trace/trace_preemptirq.c @@ -19,6 +19,24 @@ /* Per-cpu variable to prevent redundant calls when IRQs already off */ static DEFINE_PER_CPU(int, tracing_irq_cpu); +/* + * Like trace_hardirqs_on() but without the lockdep invocation. This is + * used in the low level entry code where the ordering vs. RCU is important + * and lockdep uses a staged approach which splits the lockdep hardirq + * tracking into a RCU on and a RCU off section. + */ +void trace_hardirqs_on_prepare(void) +{ + if (this_cpu_read(tracing_irq_cpu)) { + if (!in_nmi()) + trace_irq_enable(CALLER_ADDR0, CALLER_ADDR1); + tracer_hardirqs_on(CALLER_ADDR0, CALLER_ADDR1); + this_cpu_write(tracing_irq_cpu, 0); + } +} +EXPORT_SYMBOL(trace_hardirqs_on_prepare); +NOKPROBE_SYMBOL(trace_hardirqs_on_prepare); + void trace_hardirqs_on(void) { if (this_cpu_read(tracing_irq_cpu)) { @@ -33,6 +51,25 @@ void trace_hardirqs_on(void) EXPORT_SYMBOL(trace_hardirqs_on); NOKPROBE_SYMBOL(trace_hardirqs_on); +/* + * Like trace_hardirqs_off() but without the lockdep invocation. This is + * used in the low level entry code where the ordering vs. RCU is important + * and lockdep uses a staged approach which splits the lockdep hardirq + * tracking into a RCU on and a RCU off section. + */ +void trace_hardirqs_off_prepare(void) +{ + if (!this_cpu_read(tracing_irq_cpu)) { + this_cpu_write(tracing_irq_cpu, 1); + tracer_hardirqs_off(CALLER_ADDR0, CALLER_ADDR1); + if (!in_nmi()) + trace_irq_disable(CALLER_ADDR0, CALLER_ADDR1); + } + +} +EXPORT_SYMBOL(trace_hardirqs_off_prepare); +NOKPROBE_SYMBOL(trace_hardirqs_off_prepare); + void trace_hardirqs_off(void) { if (!this_cpu_read(tracing_irq_cpu)) { -- cgit v1.2.3 From c86e9b987cea3dd0209203e714553a47f5d7c6dd Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Wed, 18 Mar 2020 14:22:03 +0100 Subject: lockdep: Prepare for noinstr sections Force inlining and prevent instrumentation of all sorts by marking the functions which are invoked from low level entry code with 'noinstr'. Split the irqflags tracking into two parts. One which does the heavy lifting while RCU is watching and the final one which can be invoked after RCU is turned off. 
Signed-off-by: Peter Zijlstra Signed-off-by: Thomas Gleixner Reviewed-by: Alexandre Chartre Link: https://lkml.kernel.org/r/20200505134100.484532537@linutronix.de --- include/linux/irqflags.h | 2 + include/linux/sched.h | 1 + kernel/locking/lockdep.c | 86 +++++++++++++++++++++++++++++++---------- kernel/trace/trace_preemptirq.c | 2 + lib/debug_locks.c | 2 +- 5 files changed, 71 insertions(+), 22 deletions(-) (limited to 'include/linux') diff --git a/include/linux/irqflags.h b/include/linux/irqflags.h index f150e69ab81d..d7f7e436c3af 100644 --- a/include/linux/irqflags.h +++ b/include/linux/irqflags.h @@ -19,11 +19,13 @@ #ifdef CONFIG_PROVE_LOCKING extern void lockdep_softirqs_on(unsigned long ip); extern void lockdep_softirqs_off(unsigned long ip); + extern void lockdep_hardirqs_on_prepare(unsigned long ip); extern void lockdep_hardirqs_on(unsigned long ip); extern void lockdep_hardirqs_off(unsigned long ip); #else static inline void lockdep_softirqs_on(unsigned long ip) { } static inline void lockdep_softirqs_off(unsigned long ip) { } + static inline void lockdep_hardirqs_on_prepare(unsigned long ip) { } static inline void lockdep_hardirqs_on(unsigned long ip) { } static inline void lockdep_hardirqs_off(unsigned long ip) { } #endif diff --git a/include/linux/sched.h b/include/linux/sched.h index 4418f5cb8324..658de6164853 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -983,6 +983,7 @@ struct task_struct { unsigned int hardirq_disable_event; int hardirqs_enabled; int hardirq_context; + u64 hardirq_chain_key; unsigned long softirq_disable_ip; unsigned long softirq_enable_ip; unsigned int softirq_disable_event; diff --git a/kernel/locking/lockdep.c b/kernel/locking/lockdep.c index ac10db66cc63..9ccd675a8b5a 100644 --- a/kernel/locking/lockdep.c +++ b/kernel/locking/lockdep.c @@ -3635,13 +3635,10 @@ mark_held_locks(struct task_struct *curr, enum lock_usage_bit base_bit) /* * Hardirqs will be enabled: */ -static void __trace_hardirqs_on_caller(unsigned long ip) +static void __trace_hardirqs_on_caller(void) { struct task_struct *curr = current; - /* we'll do an OFF -> ON transition: */ - curr->hardirqs_enabled = 1; - /* * We are going to turn hardirqs on, so set the * usage bit for all held locks: @@ -3654,15 +3651,19 @@ static void __trace_hardirqs_on_caller(unsigned long ip) * this bit from being set before) */ if (curr->softirqs_enabled) - if (!mark_held_locks(curr, LOCK_ENABLED_SOFTIRQ)) - return; - - curr->hardirq_enable_ip = ip; - curr->hardirq_enable_event = ++curr->irq_events; - debug_atomic_inc(hardirqs_on_events); + mark_held_locks(curr, LOCK_ENABLED_SOFTIRQ); } -void lockdep_hardirqs_on(unsigned long ip) +/** + * lockdep_hardirqs_on_prepare - Prepare for enabling interrupts + * @ip: Caller address + * + * Invoked before a possible transition to RCU idle from exit to user or + * guest mode. This ensures that all RCU operations are done before RCU + * stops watching. After the RCU transition lockdep_hardirqs_on() has to be + * invoked to set the final state. 
+ */ +void lockdep_hardirqs_on_prepare(unsigned long ip) { if (unlikely(!debug_locks || current->lockdep_recursion)) return; @@ -3698,20 +3699,62 @@ void lockdep_hardirqs_on(unsigned long ip) if (DEBUG_LOCKS_WARN_ON(current->hardirq_context)) return; + current->hardirq_chain_key = current->curr_chain_key; + current->lockdep_recursion++; - __trace_hardirqs_on_caller(ip); + __trace_hardirqs_on_caller(); lockdep_recursion_finish(); } -NOKPROBE_SYMBOL(lockdep_hardirqs_on); +EXPORT_SYMBOL_GPL(lockdep_hardirqs_on_prepare); + +void noinstr lockdep_hardirqs_on(unsigned long ip) +{ + struct task_struct *curr = current; + + if (unlikely(!debug_locks || curr->lockdep_recursion)) + return; + + if (curr->hardirqs_enabled) { + /* + * Neither irq nor preemption are disabled here + * so this is racy by nature but losing one hit + * in a stat is not a big deal. + */ + __debug_atomic_inc(redundant_hardirqs_on); + return; + } + + /* + * We're enabling irqs and according to our state above irqs weren't + * already enabled, yet we find the hardware thinks they are in fact + * enabled.. someone messed up their IRQ state tracing. + */ + if (DEBUG_LOCKS_WARN_ON(!irqs_disabled())) + return; + + /* + * Ensure the lock stack remained unchanged between + * lockdep_hardirqs_on_prepare() and lockdep_hardirqs_on(). + */ + DEBUG_LOCKS_WARN_ON(current->hardirq_chain_key != + current->curr_chain_key); + + /* we'll do an OFF -> ON transition: */ + curr->hardirqs_enabled = 1; + curr->hardirq_enable_ip = ip; + curr->hardirq_enable_event = ++curr->irq_events; + debug_atomic_inc(hardirqs_on_events); +} +EXPORT_SYMBOL_GPL(lockdep_hardirqs_on); /* * Hardirqs were disabled: */ -void lockdep_hardirqs_off(unsigned long ip) +void noinstr lockdep_hardirqs_off(unsigned long ip) { struct task_struct *curr = current; - if (unlikely(!debug_locks || current->lockdep_recursion)) + if (unlikely(!debug_locks || curr->lockdep_recursion)) return; /* @@ -3729,10 +3772,11 @@ void lockdep_hardirqs_off(unsigned long ip) curr->hardirq_disable_ip = ip; curr->hardirq_disable_event = ++curr->irq_events; debug_atomic_inc(hardirqs_off_events); - } else + } else { debug_atomic_inc(redundant_hardirqs_off); + } } -NOKPROBE_SYMBOL(lockdep_hardirqs_off); +EXPORT_SYMBOL_GPL(lockdep_hardirqs_off); /* * Softirqs will be enabled: @@ -4408,8 +4452,8 @@ static void print_unlock_imbalance_bug(struct task_struct *curr, dump_stack(); } -static int match_held_lock(const struct held_lock *hlock, - const struct lockdep_map *lock) +static noinstr int match_held_lock(const struct held_lock *hlock, + const struct lockdep_map *lock) { if (hlock->instance == lock) return 1; @@ -4696,7 +4740,7 @@ __lock_release(struct lockdep_map *lock, unsigned long ip) return 0; } -static nokprobe_inline +static __always_inline int __lock_is_held(const struct lockdep_map *lock, int read) { struct task_struct *curr = current; @@ -4956,7 +5000,7 @@ void lock_release(struct lockdep_map *lock, unsigned long ip) } EXPORT_SYMBOL_GPL(lock_release); -int lock_is_held_type(const struct lockdep_map *lock, int read) +noinstr int lock_is_held_type(const struct lockdep_map *lock, int read) { unsigned long flags; int ret = 0; diff --git a/kernel/trace/trace_preemptirq.c b/kernel/trace/trace_preemptirq.c index c00880162b06..fb0691b8a88d 100644 --- a/kernel/trace/trace_preemptirq.c +++ b/kernel/trace/trace_preemptirq.c @@ -46,6 +46,7 @@ void trace_hardirqs_on(void) this_cpu_write(tracing_irq_cpu, 0); } + lockdep_hardirqs_on_prepare(CALLER_ADDR0); lockdep_hardirqs_on(CALLER_ADDR0); } 
EXPORT_SYMBOL(trace_hardirqs_on); @@ -93,6 +94,7 @@ __visible void trace_hardirqs_on_caller(unsigned long caller_addr) this_cpu_write(tracing_irq_cpu, 0); } + lockdep_hardirqs_on_prepare(CALLER_ADDR0); lockdep_hardirqs_on(CALLER_ADDR0); } EXPORT_SYMBOL(trace_hardirqs_on_caller); diff --git a/lib/debug_locks.c b/lib/debug_locks.c index a75ee30b77cb..06d3135bd184 100644 --- a/lib/debug_locks.c +++ b/lib/debug_locks.c @@ -36,7 +36,7 @@ EXPORT_SYMBOL_GPL(debug_locks_silent); /* * Generic 'turn off all lock debugging' function: */ -int debug_locks_off(void) +noinstr int debug_locks_off(void) { if (debug_locks && __debug_locks_off()) { if (!debug_locks_silent) { -- cgit v1.2.3 From af1e56b78534c38bb0e0c712ca70e59f816b74e9 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Thu, 19 Mar 2020 14:53:56 +0100 Subject: context_tracking: Make guest_enter/exit() .noinstr ready Force inlining of the helpers and mark the instrumentable parts accordingly. Signed-off-by: Thomas Gleixner Reviewed-by: Alexandre Chartre Acked-by: Peter Zijlstra Link: https://lkml.kernel.org/r/20200505134341.672545766@linutronix.de --- include/linux/context_tracking.h | 21 ++++++++++++++++----- 1 file changed, 16 insertions(+), 5 deletions(-) (limited to 'include/linux') diff --git a/include/linux/context_tracking.h b/include/linux/context_tracking.h index 8150f5ac176c..8cac62ee6add 100644 --- a/include/linux/context_tracking.h +++ b/include/linux/context_tracking.h @@ -101,12 +101,14 @@ static inline void context_tracking_init(void) { } #ifdef CONFIG_VIRT_CPU_ACCOUNTING_GEN /* must be called with irqs disabled */ -static inline void guest_enter_irqoff(void) +static __always_inline void guest_enter_irqoff(void) { + instrumentation_begin(); if (vtime_accounting_enabled_this_cpu()) vtime_guest_enter(current); else current->flags |= PF_VCPU; + instrumentation_end(); if (context_tracking_enabled()) __context_tracking_enter(CONTEXT_GUEST); @@ -118,39 +120,48 @@ static inline void guest_enter_irqoff(void) * one time slice). Lets treat guest mode as quiescent state, just like * we do with user-mode execution. */ - if (!context_tracking_enabled_this_cpu()) + if (!context_tracking_enabled_this_cpu()) { + instrumentation_begin(); rcu_virt_note_context_switch(smp_processor_id()); + instrumentation_end(); + } } -static inline void guest_exit_irqoff(void) +static __always_inline void guest_exit_irqoff(void) { if (context_tracking_enabled()) __context_tracking_exit(CONTEXT_GUEST); + instrumentation_begin(); if (vtime_accounting_enabled_this_cpu()) vtime_guest_exit(current); else current->flags &= ~PF_VCPU; + instrumentation_end(); } #else -static inline void guest_enter_irqoff(void) +static __always_inline void guest_enter_irqoff(void) { /* * This is running in ioctl context so its safe * to assume that it's the stime pending cputime * to flush. 
*/ + instrumentation_begin(); vtime_account_kernel(current); current->flags |= PF_VCPU; rcu_virt_note_context_switch(smp_processor_id()); + instrumentation_end(); } -static inline void guest_exit_irqoff(void) +static __always_inline void guest_exit_irqoff(void) { + instrumentation_begin(); /* Flush the guest cputime we spent on the guest */ vtime_account_kernel(current); current->flags &= ~PF_VCPU; + instrumentation_end(); } #endif /* CONFIG_VIRT_CPU_ACCOUNTING_GEN */ -- cgit v1.2.3 From 69ea03b56ed2c7189ccd0b5910ad39f3cad1df21 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Wed, 19 Feb 2020 09:46:47 +0100 Subject: hardirq/nmi: Allow nested nmi_enter() Since there are already a number of sites (ARM64, PowerPC) that effectively nest nmi_enter(), make the primitive support this before adding even more. Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Thomas Gleixner Acked-by: Marc Zyngier Acked-by: Will Deacon Cc: Michael Ellerman Link: https://lkml.kernel.org/r/20200505134100.864179229@linutronix.de --- arch/arm64/kernel/sdei.c | 14 ++------------ arch/arm64/kernel/traps.c | 8 ++------ arch/powerpc/kernel/traps.c | 22 ++++++---------------- include/linux/hardirq.h | 5 ++++- include/linux/preempt.h | 4 ++-- 5 files changed, 16 insertions(+), 37 deletions(-) (limited to 'include/linux') diff --git a/arch/arm64/kernel/sdei.c b/arch/arm64/kernel/sdei.c index d6259dac62b6..e396e69e33a1 100644 --- a/arch/arm64/kernel/sdei.c +++ b/arch/arm64/kernel/sdei.c @@ -251,22 +251,12 @@ asmlinkage __kprobes notrace unsigned long __sdei_handler(struct pt_regs *regs, struct sdei_registered_event *arg) { unsigned long ret; - bool do_nmi_exit = false; - /* - * nmi_enter() deals with printk() re-entrance and use of RCU when - * RCU believed this CPU was idle. Because critical events can - * interrupt normal events, we may already be in_nmi(). - */ - if (!in_nmi()) { - nmi_enter(); - do_nmi_exit = true; - } + nmi_enter(); ret = _sdei_handler(regs, arg); - if (do_nmi_exit) - nmi_exit(); + nmi_exit(); return ret; } diff --git a/arch/arm64/kernel/traps.c b/arch/arm64/kernel/traps.c index cf402be5c573..c728f163f329 100644 --- a/arch/arm64/kernel/traps.c +++ b/arch/arm64/kernel/traps.c @@ -906,17 +906,13 @@ bool arm64_is_fatal_ras_serror(struct pt_regs *regs, unsigned int esr) asmlinkage void do_serror(struct pt_regs *regs, unsigned int esr) { - const bool was_in_nmi = in_nmi(); - - if (!was_in_nmi) - nmi_enter(); + nmi_enter(); /* non-RAS errors are not containable */ if (!arm64_is_ras_serror(esr) || arm64_is_fatal_ras_serror(regs, esr)) arm64_serror_panic(regs, esr); - if (!was_in_nmi) - nmi_exit(); + nmi_exit(); } asmlinkage void enter_from_user_mode(void) diff --git a/arch/powerpc/kernel/traps.c b/arch/powerpc/kernel/traps.c index 3fca22276bb1..b44dd75de517 100644 --- a/arch/powerpc/kernel/traps.c +++ b/arch/powerpc/kernel/traps.c @@ -441,15 +441,9 @@ nonrecoverable: void system_reset_exception(struct pt_regs *regs) { unsigned long hsrr0, hsrr1; - bool nested = in_nmi(); bool saved_hsrrs = false; - /* - * Avoid crashes in case of nested NMI exceptions. Recoverability - * is determined by RI and in_nmi - */ - if (!nested) - nmi_enter(); + nmi_enter(); /* * System reset can interrupt code where HSRRs are live and MSR[RI]=1. @@ -521,8 +515,7 @@ out: mtspr(SPRN_HSRR1, hsrr1); } - if (!nested) - nmi_exit(); + nmi_exit(); /* What should we do here? We could issue a shutdown or hard reset. 
*/ } @@ -823,9 +816,8 @@ int machine_check_generic(struct pt_regs *regs) void machine_check_exception(struct pt_regs *regs) { int recover = 0; - bool nested = in_nmi(); - if (!nested) - nmi_enter(); + + nmi_enter(); __this_cpu_inc(irq_stat.mce_exceptions); @@ -851,8 +843,7 @@ void machine_check_exception(struct pt_regs *regs) if (check_io_access(regs)) goto bail; - if (!nested) - nmi_exit(); + nmi_exit(); die("Machine check", regs, SIGBUS); @@ -863,8 +854,7 @@ void machine_check_exception(struct pt_regs *regs) return; bail: - if (!nested) - nmi_exit(); + nmi_exit(); } void SMIException(struct pt_regs *regs) diff --git a/include/linux/hardirq.h b/include/linux/hardirq.h index 7c8b82f69288..a043ad826c67 100644 --- a/include/linux/hardirq.h +++ b/include/linux/hardirq.h @@ -65,13 +65,16 @@ extern void irq_exit(void); #define arch_nmi_exit() do { } while (0) #endif +/* + * nmi_enter() can nest up to 15 times; see NMI_BITS. + */ #define nmi_enter() \ do { \ arch_nmi_enter(); \ printk_nmi_enter(); \ lockdep_off(); \ ftrace_nmi_enter(); \ - BUG_ON(in_nmi()); \ + BUG_ON(in_nmi() == NMI_MASK); \ preempt_count_add(NMI_OFFSET + HARDIRQ_OFFSET); \ rcu_nmi_enter(); \ lockdep_hardirq_enter(); \ } while (0) diff --git a/include/linux/preempt.h b/include/linux/preempt.h index bc3f1aecaa19..7d9c1c0e149c 100644 --- a/include/linux/preempt.h +++ b/include/linux/preempt.h @@ -26,13 +26,13 @@ * PREEMPT_MASK: 0x000000ff * SOFTIRQ_MASK: 0x0000ff00 * HARDIRQ_MASK: 0x000f0000 - * NMI_MASK: 0x00100000 + * NMI_MASK: 0x00f00000 * PREEMPT_NEED_RESCHED: 0x80000000 */ #define PREEMPT_BITS 8 #define SOFTIRQ_BITS 8 #define HARDIRQ_BITS 4 -#define NMI_BITS 1 +#define NMI_BITS 4 #define PREEMPT_SHIFT 0 #define SOFTIRQ_SHIFT (PREEMPT_SHIFT + PREEMPT_BITS) -- cgit v1.2.3 From e616cb8daadf637175af4fe53138a94c190c4816 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Mon, 24 Feb 2020 22:14:51 +0100 Subject: lockdep: Always inline lockdep_{off,on}() These functions are called {early,late} in nmi_{enter,exit} and should not be traced or probed. They are also puny, so 'inline' them. Reported-by: Steven Rostedt Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Thomas Gleixner Reviewed-by: Alexandre Chartre Link: https://lkml.kernel.org/r/20200505134101.048523500@linutronix.de --- include/linux/lockdep.h | 23 +++++++++++++++++++++-- kernel/locking/lockdep.c | 19 ------------------- 2 files changed, 21 insertions(+), 21 deletions(-) (limited to 'include/linux') diff --git a/include/linux/lockdep.h b/include/linux/lockdep.h index 206774ac6946..8fce5c98a4b0 100644 --- a/include/linux/lockdep.h +++ b/include/linux/lockdep.h @@ -308,8 +308,27 @@ extern void lockdep_set_selftest_task(struct task_struct *task); extern void lockdep_init_task(struct task_struct *task); -extern void lockdep_off(void); -extern void lockdep_on(void); +/* + * Split the recursion counter in two to readily detect 'off' vs recursion. + */ +#define LOCKDEP_RECURSION_BITS 16 +#define LOCKDEP_OFF (1U << LOCKDEP_RECURSION_BITS) +#define LOCKDEP_RECURSION_MASK (LOCKDEP_OFF - 1) + +/* + * lockdep_{off,on}() are macros to avoid tracing and kprobes; not inlines due + * to header dependencies.
+ */ + +#define lockdep_off() \ +do { \ + current->lockdep_recursion += LOCKDEP_OFF; \ +} while (0) + +#define lockdep_on() \ +do { \ + current->lockdep_recursion -= LOCKDEP_OFF; \ +} while (0) extern void lockdep_register_key(struct lock_class_key *key); extern void lockdep_unregister_key(struct lock_class_key *key); diff --git a/kernel/locking/lockdep.c b/kernel/locking/lockdep.c index ac10db66cc63..6f1c8cba09c6 100644 --- a/kernel/locking/lockdep.c +++ b/kernel/locking/lockdep.c @@ -393,25 +393,6 @@ void lockdep_init_task(struct task_struct *task) task->lockdep_recursion = 0; } -/* - * Split the recrursion counter in two to readily detect 'off' vs recursion. - */ -#define LOCKDEP_RECURSION_BITS 16 -#define LOCKDEP_OFF (1U << LOCKDEP_RECURSION_BITS) -#define LOCKDEP_RECURSION_MASK (LOCKDEP_OFF - 1) - -void lockdep_off(void) -{ - current->lockdep_recursion += LOCKDEP_OFF; -} -EXPORT_SYMBOL(lockdep_off); - -void lockdep_on(void) -{ - current->lockdep_recursion -= LOCKDEP_OFF; -} -EXPORT_SYMBOL(lockdep_on); - static inline void lockdep_recursion_finish(void) { if (WARN_ON_ONCE(--current->lockdep_recursion)) -- cgit v1.2.3 From 178ba00c354eb15cec6806a812771e60a5ae3ea1 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Mon, 24 Feb 2020 22:26:21 +0100 Subject: sh/ftrace: Move arch_ftrace_nmi_{enter,exit} into nmi exception SuperH is the last remaining user of arch_ftrace_nmi_{enter,exit}(), so remove it from the generic code and move it into the SuperH code. Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Thomas Gleixner Reviewed-by: Alexandre Chartre Acked-by: Steven Rostedt (VMware) Cc: Rich Felker Cc: Yoshinori Sato Link: https://lkml.kernel.org/r/20200505134101.248881738@linutronix.de --- Documentation/trace/ftrace-design.rst | 8 -------- arch/sh/Kconfig | 1 - arch/sh/kernel/traps.c | 12 ++++++++++++ include/linux/ftrace_irq.h | 11 ----------- kernel/trace/Kconfig | 10 ---------- 5 files changed, 12 insertions(+), 30 deletions(-) (limited to 'include/linux') diff --git a/Documentation/trace/ftrace-design.rst b/Documentation/trace/ftrace-design.rst index a8e22e0db63c..6893399157f0 100644 --- a/Documentation/trace/ftrace-design.rst +++ b/Documentation/trace/ftrace-design.rst @@ -229,14 +229,6 @@ Adding support for it is easy: just define the macro in asm/ftrace.h and pass the return address pointer as the 'retp' argument to ftrace_push_return_trace(). -HAVE_FTRACE_NMI_ENTER ---------------------- - -If you can't trace NMI functions, then skip this option. -
- - HAVE_SYSCALL_TRACEPOINTS ------------------------ diff --git a/arch/sh/Kconfig b/arch/sh/Kconfig index b4f0e37b83eb..97656d20b9ea 100644 --- a/arch/sh/Kconfig +++ b/arch/sh/Kconfig @@ -71,7 +71,6 @@ config SUPERH32 select HAVE_FUNCTION_TRACER select HAVE_FTRACE_MCOUNT_RECORD select HAVE_DYNAMIC_FTRACE - select HAVE_FTRACE_NMI_ENTER if DYNAMIC_FTRACE select ARCH_WANT_IPC_PARSE_VERSION select HAVE_FUNCTION_GRAPH_TRACER select HAVE_ARCH_KGDB diff --git a/arch/sh/kernel/traps.c b/arch/sh/kernel/traps.c index 63cf17bc760d..2130381c9d57 100644 --- a/arch/sh/kernel/traps.c +++ b/arch/sh/kernel/traps.c @@ -170,11 +170,21 @@ BUILD_TRAP_HANDLER(bug) force_sig(SIGTRAP); } +#ifdef CONFIG_DYNAMIC_FTRACE +extern void arch_ftrace_nmi_enter(void); +extern void arch_ftrace_nmi_exit(void); +#else +static inline void arch_ftrace_nmi_enter(void) { } +static inline void arch_ftrace_nmi_exit(void) { } +#endif + BUILD_TRAP_HANDLER(nmi) { unsigned int cpu = smp_processor_id(); TRAP_HANDLER_DECL; + arch_ftrace_nmi_enter(); + nmi_enter(); nmi_count(cpu)++; @@ -190,4 +200,6 @@ BUILD_TRAP_HANDLER(nmi) } nmi_exit(); + + arch_ftrace_nmi_exit(); } diff --git a/include/linux/ftrace_irq.h b/include/linux/ftrace_irq.h index ccda97dc7f8b..0abd9a1d2852 100644 --- a/include/linux/ftrace_irq.h +++ b/include/linux/ftrace_irq.h @@ -2,15 +2,6 @@ #ifndef _LINUX_FTRACE_IRQ_H #define _LINUX_FTRACE_IRQ_H - -#ifdef CONFIG_FTRACE_NMI_ENTER -extern void arch_ftrace_nmi_enter(void); -extern void arch_ftrace_nmi_exit(void); -#else -static inline void arch_ftrace_nmi_enter(void) { } -static inline void arch_ftrace_nmi_exit(void) { } -#endif - #ifdef CONFIG_HWLAT_TRACER extern bool trace_hwlat_callback_enabled; extern void trace_hwlat_callback(bool enter); @@ -22,12 +13,10 @@ static inline void ftrace_nmi_enter(void) if (trace_hwlat_callback_enabled) trace_hwlat_callback(true); #endif - arch_ftrace_nmi_enter(); } static inline void ftrace_nmi_exit(void) { - arch_ftrace_nmi_exit(); #ifdef CONFIG_HWLAT_TRACER if (trace_hwlat_callback_enabled) trace_hwlat_callback(false); diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig index 1e9a8f9a3459..24876faac753 100644 --- a/kernel/trace/Kconfig +++ b/kernel/trace/Kconfig @@ -10,11 +10,6 @@ config USER_STACKTRACE_SUPPORT config NOP_TRACER bool -config HAVE_FTRACE_NMI_ENTER - bool - help - See Documentation/trace/ftrace-design.rst - config HAVE_FUNCTION_TRACER bool help @@ -72,11 +67,6 @@ config RING_BUFFER select TRACE_CLOCK select IRQ_WORK -config FTRACE_NMI_ENTER - bool - depends on HAVE_FTRACE_NMI_ENTER - default y - config EVENT_TRACING select CONTEXT_SWITCH_TRACER select GLOB -- cgit v1.2.3 From f93524eb9c54f49be150167918f6546b0a2e09b1 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Wed, 12 Feb 2020 21:01:16 +0100 Subject: sched,rcu,tracing: Avoid tracing before in_nmi() is correct If a tracer is invoked before in_nmi() becomes true, the tracer cannot detect that it is called from NMI context and therefore cannot behave correctly. Therefore change nmi_{enter,exit}() to use __preempt_count_{add,sub}() as the normal preempt_count_{add,sub}() have a (desired) function trace entry. This fixes a potential issue with the current code; when the function-tracer has stack-tracing enabled, __trace_stack() will malfunction when it hits the preempt_count_add() function entry from NMI context.
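A minimal sketch of what this implies for an NMI C entry point (the handler below is made up for illustration):

	/* in_nmi() must become true before any traceable function can run. */
	notrace void example_nmi_handler(struct pt_regs *regs)
	{
		nmi_enter();	/* now uses __preempt_count_add(): no trace entry */
		/* ... handle the NMI; tracers see in_nmi() == true from here ... */
		nmi_exit();
	}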
Suggested-by: Steven Rostedt (VMware) Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Thomas Gleixner Reviewed-by: Steven Rostedt (VMware) Reviewed-by: Alexandre Chartre Link: https://lkml.kernel.org/r/20200505134101.434193525@linutronix.de --- include/linux/hardirq.h | 13 +++++++++++-- 1 file changed, 11 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/hardirq.h b/include/linux/hardirq.h index a043ad826c67..621556efe45f 100644 --- a/include/linux/hardirq.h +++ b/include/linux/hardirq.h @@ -65,6 +65,15 @@ extern void irq_exit(void); #define arch_nmi_exit() do { } while (0) #endif +/* + * NMI vs Tracing + * -------------- + * + * We must not land in a tracer until (or after) we've changed preempt_count + * such that in_nmi() becomes true. To that effect all NMI C entry points must + * be marked 'notrace' and call nmi_enter() as soon as possible. + */ + /* * nmi_enter() can nest up to 15 times; see NMI_BITS. */ @@ -75,7 +84,7 @@ extern void irq_exit(void); lockdep_off(); \ ftrace_nmi_enter(); \ BUG_ON(in_nmi() == NMI_MASK); \ - preempt_count_add(NMI_OFFSET + HARDIRQ_OFFSET); \ + __preempt_count_add(NMI_OFFSET + HARDIRQ_OFFSET); \ rcu_nmi_enter(); \ lockdep_hardirq_enter(); \ } while (0) @@ -85,7 +94,7 @@ extern void irq_exit(void); lockdep_hardirq_exit(); \ rcu_nmi_exit(); \ BUG_ON(!in_nmi()); \ - preempt_count_sub(NMI_OFFSET + HARDIRQ_OFFSET); \ + __preempt_count_sub(NMI_OFFSET + HARDIRQ_OFFSET); \ ftrace_nmi_exit(); \ lockdep_on(); \ printk_nmi_exit(); \ -- cgit v1.2.3 From 5567d11c21a1d508a91a8cb64a819783a0835d9f Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Wed, 19 Feb 2020 10:22:06 +0100 Subject: x86/mce: Send #MC signal from task work Convert #MC over to using task_work_add(); it will run the same code slightly later, on the return to user path of the same exception. Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Thomas Gleixner Reviewed-by: Frederic Weisbecker Reviewed-by: Alexandre Chartre Link: https://lkml.kernel.org/r/20200505134100.957390899@linutronix.de --- arch/x86/kernel/cpu/mce/core.c | 56 +++++++++++++++++++++++------------------- include/linux/sched.h | 6 +++++ 2 files changed, 37 insertions(+), 25 deletions(-) (limited to 'include/linux') diff --git a/arch/x86/kernel/cpu/mce/core.c b/arch/x86/kernel/cpu/mce/core.c index 98bf91cd7d5d..2f0ef95795f3 100644 --- a/arch/x86/kernel/cpu/mce/core.c +++ b/arch/x86/kernel/cpu/mce/core.c @@ -42,6 +42,7 @@ #include #include #include +#include #include #include @@ -1086,23 +1087,6 @@ static void mce_clear_state(unsigned long *toclear) } } -static int do_memory_failure(struct mce *m) -{ - int flags = MF_ACTION_REQUIRED; - int ret; - - pr_err("Uncorrected hardware memory error in user-access at %llx", m->addr); - if (!(m->mcgstatus & MCG_STATUS_RIPV)) - flags |= MF_MUST_KILL; - ret = memory_failure(m->addr >> PAGE_SHIFT, flags); - if (ret) - pr_err("Memory error not recovered"); - else - set_mce_nospec(m->addr >> PAGE_SHIFT); - return ret; -} - - /* * Cases where we avoid rendezvous handler timeout: * 1) If this CPU is offline.
@@ -1204,6 +1188,29 @@ static void __mc_scan_banks(struct mce *m, struct mce *final, *m = *final; } +static void kill_me_now(struct callback_head *ch) +{ + force_sig(SIGBUS); +} + +static void kill_me_maybe(struct callback_head *cb) +{ + struct task_struct *p = container_of(cb, struct task_struct, mce_kill_me); + int flags = MF_ACTION_REQUIRED; + + pr_err("Uncorrected hardware memory error in user-access at %llx", p->mce_addr); + if (!(p->mce_status & MCG_STATUS_RIPV)) + flags |= MF_MUST_KILL; + + if (!memory_failure(p->mce_addr >> PAGE_SHIFT, flags)) { + set_mce_nospec(p->mce_addr >> PAGE_SHIFT); + return; + } + + pr_err("Memory error not recovered"); + kill_me_now(cb); +} + /* * The actual machine check handler. This only handles real * exceptions when something got corrupted coming in through int 18. @@ -1222,7 +1229,7 @@ static void __mc_scan_banks(struct mce *m, struct mce *final, * backing the user stack, tracing that reads the user stack will cause * potentially infinite recursion. */ -void notrace do_machine_check(struct pt_regs *regs, long error_code) +void noinstr do_machine_check(struct pt_regs *regs, long error_code) { DECLARE_BITMAP(valid_banks, MAX_NR_BANKS); DECLARE_BITMAP(toclear, MAX_NR_BANKS); @@ -1354,13 +1361,13 @@ void notrace do_machine_check(struct pt_regs *regs, long error_code) if ((m.cs & 3) == 3) { /* If this triggers there is no way to recover. Die hard. */ BUG_ON(!on_thread_stack() || !user_mode(regs)); - local_irq_enable(); - preempt_enable(); - if (kill_it || do_memory_failure(&m)) - force_sig(SIGBUS); - preempt_disable(); - local_irq_disable(); + current->mce_addr = m.addr; + current->mce_status = m.mcgstatus; + current->mce_kill_me.func = kill_me_maybe; + if (kill_it) + current->mce_kill_me.func = kill_me_now; + task_work_add(current, ¤t->mce_kill_me, true); } else { if (!fixup_exception(regs, X86_TRAP_MC, error_code, 0)) mce_panic("Failed kernel mode recovery", &m, msg); @@ -1370,7 +1377,6 @@ out_ist: ist_exit(regs); } EXPORT_SYMBOL_GPL(do_machine_check); -NOKPROBE_SYMBOL(do_machine_check); #ifndef CONFIG_MEMORY_FAILURE int memory_failure(unsigned long pfn, int flags) diff --git a/include/linux/sched.h b/include/linux/sched.h index 9437b53cc603..57d0ed061ae4 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1297,6 +1297,12 @@ struct task_struct { unsigned long prev_lowest_stack; #endif +#ifdef CONFIG_X86_MCE + u64 mce_addr; + u64 mce_status; + struct callback_head mce_kill_me; +#endif + /* * New fields for task_struct should be added above here, so that * they are included in the randomized portion of task_struct. -- cgit v1.2.3 From 8ae0ae6737ad449c8ae21e2bb01d9736f360a933 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Sun, 3 May 2020 15:08:52 +0200 Subject: rcu: Provide rcu_irq_exit_preempt() Interrupts and exceptions invoke rcu_irq_enter() on entry and need to invoke rcu_irq_exit() before they either return to the interrupted code or invoke the scheduler due to preemption. The general assumption is that RCU idle code has to have preemption disabled so that a return from interrupt cannot schedule. So the return from interrupt code invokes rcu_irq_exit() and preempt_schedule_irq(). If there is any imbalance in the rcu_irq/nmi* invocations or RCU idle code had preemption enabled then this goes unnoticed until the CPU goes idle or some other RCU check is executed. Provide rcu_irq_exit_preempt() which can be invoked from the interrupt/exception return code in case that preemption is enabled. 
It invokes rcu_irq_exit() and contains a few sanity checks in case that CONFIG_PROVE_RCU is enabled to catch such issues directly. Signed-off-by: Thomas Gleixner Reviewed-by: Paul E. McKenney Reviewed-by: Alexandre Chartre Acked-by: Peter Zijlstra Link: https://lkml.kernel.org/r/20200505134904.364456424@linutronix.de --- include/linux/rcutiny.h | 1 + include/linux/rcutree.h | 1 + kernel/rcu/tree.c | 22 ++++++++++++++++++++++ 3 files changed, 24 insertions(+) (limited to 'include/linux') diff --git a/include/linux/rcutiny.h b/include/linux/rcutiny.h index 3465ba704a11..980eb78751d9 100644 --- a/include/linux/rcutiny.h +++ b/include/linux/rcutiny.h @@ -71,6 +71,7 @@ static inline void rcu_irq_enter(void) { } static inline void rcu_irq_exit_irqson(void) { } static inline void rcu_irq_enter_irqson(void) { } static inline void rcu_irq_exit(void) { } +static inline void rcu_irq_exit_preempt(void) { } static inline void exit_rcu(void) { } static inline bool rcu_preempt_need_deferred_qs(struct task_struct *t) { diff --git a/include/linux/rcutree.h b/include/linux/rcutree.h index fbc26274af4d..02016e0aa8eb 100644 --- a/include/linux/rcutree.h +++ b/include/linux/rcutree.h @@ -47,6 +47,7 @@ void rcu_idle_enter(void); void rcu_idle_exit(void); void rcu_irq_enter(void); void rcu_irq_exit(void); +void rcu_irq_exit_preempt(void); void rcu_irq_enter_irqson(void); void rcu_irq_exit_irqson(void); diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 945401674b9d..62ee01299386 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -743,6 +743,28 @@ void noinstr rcu_irq_exit(void) rcu_nmi_exit(); } +/** + * rcu_irq_exit_preempt - Inform RCU that current CPU is exiting irq + * towards in-kernel preemption + * + * Same as rcu_irq_exit() but has a sanity check that scheduling is safe + * from RCU point of view. Invoked from return from interrupt before kernel + * preemption. + */ +void rcu_irq_exit_preempt(void) +{ + lockdep_assert_irqs_disabled(); + rcu_nmi_exit(); + + RCU_LOCKDEP_WARN(__this_cpu_read(rcu_data.dynticks_nesting) <= 0, + "RCU dynticks_nesting counter underflow/zero!"); + RCU_LOCKDEP_WARN(__this_cpu_read(rcu_data.dynticks_nmi_nesting) != + DYNTICK_IRQ_NONIDLE, + "Bad RCU dynticks_nmi_nesting counter\n"); + RCU_LOCKDEP_WARN(rcu_dynticks_curr_cpu_in_eqs(), + "RCU in extended quiescent state!"); +} + /* * Wrapper for rcu_irq_exit() where interrupts are enabled. * -- cgit v1.2.3 From b1fcf9b83c4149c63d1e0c699e85f93cbe28e211 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 12 May 2020 09:44:43 +0200 Subject: rcu: Provide __rcu_is_watching() Same as rcu_is_watching() but without the preempt_disable/enable() pair inside the function. It is marked noinstr so it ends up in the non-instrumentable text section. This is useful for non-preemptible code, especially in the low level entry section. Using rcu_is_watching() there results in a call to the preempt_schedule_notrace() thunk which triggers noinstr section warnings in objtool.
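A sketch of the intended use (the surrounding function is hypothetical; __rcu_is_watching() and noinstr are as introduced in this series):

	/* Safe in the noinstr section: no preempt_disable()/enable() pair, so
	 * no call to the preempt_schedule_notrace() thunk is emitted. */
	noinstr bool example_entry_rcu_check(void)
	{
		return __rcu_is_watching();	/* false in an extended quiescent state */
	}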
Signed-off-by: Thomas Gleixner Acked-by: Peter Zijlstra Link: https://lkml.kernel.org/r/20200512213810.518709291@linutronix.de --- include/linux/rcutiny.h | 1 + include/linux/rcutree.h | 1 + kernel/rcu/tree.c | 5 +++++ 3 files changed, 7 insertions(+) (limited to 'include/linux') diff --git a/include/linux/rcutiny.h b/include/linux/rcutiny.h index 980eb78751d9..c869fb20cc51 100644 --- a/include/linux/rcutiny.h +++ b/include/linux/rcutiny.h @@ -86,6 +86,7 @@ static inline void rcu_scheduler_starting(void) { } static inline void rcu_end_inkernel_boot(void) { } static inline bool rcu_inkernel_boot_has_ended(void) { return true; } static inline bool rcu_is_watching(void) { return true; } +static inline bool __rcu_is_watching(void) { return true; } static inline void rcu_momentary_dyntick_idle(void) { } static inline void kfree_rcu_scheduler_running(void) { } static inline bool rcu_gp_might_be_stalled(void) { return false; } diff --git a/include/linux/rcutree.h b/include/linux/rcutree.h index 02016e0aa8eb..9366fa4d0717 100644 --- a/include/linux/rcutree.h +++ b/include/linux/rcutree.h @@ -58,6 +58,7 @@ extern int rcu_scheduler_active __read_mostly; void rcu_end_inkernel_boot(void); bool rcu_inkernel_boot_has_ended(void); bool rcu_is_watching(void); +bool __rcu_is_watching(void); #ifndef CONFIG_PREEMPTION void rcu_all_qs(void); #endif diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 62ee01299386..90c8be22d57a 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -985,6 +985,11 @@ static void rcu_disable_urgency_upon_qs(struct rcu_data *rdp) } } +noinstr bool __rcu_is_watching(void) +{ + return !rcu_dynticks_curr_cpu_in_eqs(); +} + /** * rcu_is_watching - see if RCU thinks that the current CPU is not idle * -- cgit v1.2.3 From 66e9b0717102507e64f638790eaece88765cc9e5 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 10 Mar 2020 14:04:34 +0100 Subject: kprobes: Prevent probes in .noinstr.text section Instrumentation is forbidden in the .noinstr.text section. Make kprobes respect this. Signed-off-by: Thomas Gleixner Reviewed-by: Alexandre Chartre Acked-by: Peter Zijlstra Acked-by: Masami Hiramatsu Link: https://lkml.kernel.org/r/20200505134100.179862032@linutronix.de --- include/linux/module.h | 2 ++ kernel/kprobes.c | 18 ++++++++++++++++++ kernel/module.c | 3 +++ 3 files changed, 23 insertions(+) (limited to 'include/linux') diff --git a/include/linux/module.h b/include/linux/module.h index 1192097c9a69..d849d06e4d44 100644 --- a/include/linux/module.h +++ b/include/linux/module.h @@ -458,6 +458,8 @@ struct module { void __percpu *percpu; unsigned int percpu_size; #endif + void *noinstr_text_start; + unsigned int noinstr_text_size; #ifdef CONFIG_TRACEPOINTS unsigned int num_tracepoints; diff --git a/kernel/kprobes.c b/kernel/kprobes.c index 9eb5acf0a9f3..3f310df4a693 100644 --- a/kernel/kprobes.c +++ b/kernel/kprobes.c @@ -2229,6 +2229,12 @@ static int __init populate_kprobe_blacklist(unsigned long *start, /* Symbols in __kprobes_text are blacklisted */ ret = kprobe_add_area_blacklist((unsigned long)__kprobes_text_start, (unsigned long)__kprobes_text_end); + if (ret) + return ret; + + /* Symbols in noinstr section are blacklisted */ + ret = kprobe_add_area_blacklist((unsigned long)__noinstr_text_start, + (unsigned long)__noinstr_text_end); return ret ? 
: arch_populate_kprobe_blacklist(); } @@ -2248,6 +2254,12 @@ static void add_module_kprobe_blacklist(struct module *mod) end = start + mod->kprobes_text_size; kprobe_add_area_blacklist(start, end); } + + start = (unsigned long)mod->noinstr_text_start; + if (start) { + end = start + mod->noinstr_text_size; + kprobe_add_area_blacklist(start, end); + } } static void remove_module_kprobe_blacklist(struct module *mod) @@ -2265,6 +2277,12 @@ static void remove_module_kprobe_blacklist(struct module *mod) end = start + mod->kprobes_text_size; kprobe_remove_area_blacklist(start, end); } + + start = (unsigned long)mod->noinstr_text_start; + if (start) { + end = start + mod->noinstr_text_size; + kprobe_remove_area_blacklist(start, end); + } } /* Module notifier call back, checking kprobes on the module */ diff --git a/kernel/module.c b/kernel/module.c index faf733789560..72ed2b3a6ee2 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -3150,6 +3150,9 @@ static int find_module_sections(struct module *mod, struct load_info *info) } #endif + mod->noinstr_text_start = section_objs(info, ".noinstr.text", 1, + &mod->noinstr_text_size); + #ifdef CONFIG_TRACEPOINTS mod->tracepoints_ptrs = section_objs(info, "__tracepoints_ptrs", sizeof(*mod->tracepoints_ptrs), -- cgit v1.2.3 From 344fa64ef8f6740e99b32ab788b6e3742d7284b3 Mon Sep 17 00:00:00 2001 From: David Howells Date: Wed, 12 Feb 2020 13:58:35 +0000 Subject: security: Add a hook for the point of notification insertion Add a security hook that allows an LSM to rule on whether a notification message is allowed to be inserted into a particular watch queue. The hook is given the following information: (1) The credentials of the triggerer (which may be init_cred for a system notification, e.g. a hardware error). (2) The credentials of whoever set the watch. (3) The notification message. Signed-off-by: David Howells Acked-by: James Morris cc: Casey Schaufler cc: Stephen Smalley cc: linux-security-module@vger.kernel.org --- include/linux/lsm_hook_defs.h | 5 +++++ include/linux/lsm_hooks.h | 9 +++++++++ include/linux/security.h | 15 +++++++++++++++ security/security.c | 9 +++++++++ 4 files changed, 38 insertions(+) (limited to 'include/linux') diff --git a/include/linux/lsm_hook_defs.h b/include/linux/lsm_hook_defs.h index 5616b2567aa7..e0def45b5cc5 100644 --- a/include/linux/lsm_hook_defs.h +++ b/include/linux/lsm_hook_defs.h @@ -253,6 +253,11 @@ LSM_HOOK(int, 0, inode_setsecctx, struct dentry *dentry, void *ctx, u32 ctxlen) LSM_HOOK(int, 0, inode_getsecctx, struct inode *inode, void **ctx, u32 *ctxlen) +#if defined(CONFIG_SECURITY) && defined(CONFIG_WATCH_QUEUE) +LSM_HOOK(int, 0, post_notification, const struct cred *w_cred, + const struct cred *cred, struct watch_notification *n) +#endif /* CONFIG_SECURITY && CONFIG_WATCH_QUEUE */ + #ifdef CONFIG_SECURITY_NETWORK LSM_HOOK(int, 0, unix_stream_connect, struct sock *sock, struct sock *other, struct sock *newsk) diff --git a/include/linux/lsm_hooks.h b/include/linux/lsm_hooks.h index 988ca0df7824..0b5e5769b836 100644 --- a/include/linux/lsm_hooks.h +++ b/include/linux/lsm_hooks.h @@ -1437,6 +1437,15 @@ * @ctx is a pointer in which to place the allocated security context. * @ctxlen points to the place to put the length of @ctx. * + * Security hooks for the general notification queue: + * + * @post_notification: + * Check to see if a watch notification can be posted to a particular + * queue. + * @w_cred: The credentials of whoever set the watch.
+ * @cred: The event-triggerer's credentials + * @n: The notification being posted + * * Security hooks for using the eBPF maps and programs functionalities through * eBPF syscalls. * diff --git a/include/linux/security.h b/include/linux/security.h index a8d9310472df..9a5d12ab491b 100644 --- a/include/linux/security.h +++ b/include/linux/security.h @@ -56,6 +56,8 @@ struct mm_struct; struct fs_context; struct fs_parameter; enum fs_value_type; +struct watch; +struct watch_notification; /* Default (no) options for the capable function */ #define CAP_OPT_NONE 0x0 @@ -1275,6 +1277,19 @@ static inline int security_locked_down(enum lockdown_reason what) } #endif /* CONFIG_SECURITY */ +#if defined(CONFIG_SECURITY) && defined(CONFIG_WATCH_QUEUE) +int security_post_notification(const struct cred *w_cred, + const struct cred *cred, + struct watch_notification *n); +#else +static inline int security_post_notification(const struct cred *w_cred, + const struct cred *cred, + struct watch_notification *n) +{ + return 0; +} +#endif + #ifdef CONFIG_SECURITY_NETWORK int security_unix_stream_connect(struct sock *sock, struct sock *other, struct sock *newsk); diff --git a/security/security.c b/security/security.c index 7fed24b9d57e..7d55607120b4 100644 --- a/security/security.c +++ b/security/security.c @@ -2007,6 +2007,15 @@ int security_inode_getsecctx(struct inode *inode, void **ctx, u32 *ctxlen) } EXPORT_SYMBOL(security_inode_getsecctx); +#ifdef CONFIG_WATCH_QUEUE +int security_post_notification(const struct cred *w_cred, + const struct cred *cred, + struct watch_notification *n) +{ + return call_int_hook(post_notification, 0, w_cred, cred, n); +} +#endif /* CONFIG_WATCH_QUEUE */ + #ifdef CONFIG_SECURITY_NETWORK int security_unix_stream_connect(struct sock *sock, struct sock *other, struct sock *newsk) -- cgit v1.2.3 From c73be61cede5882f9605a852414db559c0ebedfd Mon Sep 17 00:00:00 2001 From: David Howells Date: Tue, 14 Jan 2020 17:07:11 +0000 Subject: pipe: Add general notification queue support Make it possible to have a general notification queue built on top of a standard pipe. Notifications are 'spliced' into the pipe and then read out. splice(), vmsplice() and sendfile() are forbidden on pipes used for notifications as post_one_notification() cannot take pipe->mutex. This means that notifications could be posted in between individual pipe buffers, making iov_iter_revert() difficult to effect. The way the notification queue is used is: (1) An application opens a pipe with a special flag and indicates the number of messages it wishes to be able to queue at once (this can only be set once): pipe2(fds, O_NOTIFICATION_PIPE); ioctl(fds[0], IOC_WATCH_QUEUE_SET_SIZE, queue_depth); (2) The application then uses poll() and read() as normal to extract data from the pipe. read() will return multiple notifications if the buffer is big enough, but it will not split a notification across buffers - rather it will return a short read or EMSGSIZE. Notification messages include a length in the header so that the caller can split them up. Each message has a header that describes it: struct watch_notification { __u32 type:24; __u32 subtype:8; __u32 info; }; The type indicates the source (eg. mount tree changes, superblock events, keyring changes, block layer events) and the subtype indicates the event type (eg. mount, unmount; EIO, EDQUOT; link, unlink). The info field indicates a number of things, including the entry length, an ID assigned to a watchpoint contributing to this buffer and type-specific flags. 
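As an illustration, a consumer could decode that header roughly as follows (a sketch only; the helpers are hypothetical, while the WATCH_INFO_* masks and shifts are the UAPI constants described in the documentation below):

	/* Hypothetical helpers: pull the record length and watch ID out of info. */
	static inline size_t watch_note_len(const struct watch_notification *n)
	{
		return (n->info & WATCH_INFO_LENGTH) >> WATCH_INFO_LENGTH__SHIFT;
	}

	static inline unsigned int watch_note_id(const struct watch_notification *n)
	{
		return (n->info & WATCH_INFO_ID) >> WATCH_INFO_ID__SHIFT;
	}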
Supplementary data, such as the key ID that generated an event, can be attached in additional slots. The maximum message size is 127 bytes. Messages may not be padded or aligned, so there is no guarantee, for example, that the notification type will be on a 4-byte boundary. Signed-off-by: David Howells --- Documentation/userspace-api/ioctl/ioctl-number.rst | 1 + Documentation/watch_queue.rst | 339 +++++++++++ fs/pipe.c | 206 ++++--- fs/splice.c | 12 +- include/linux/pipe_fs_i.h | 19 +- include/linux/watch_queue.h | 127 ++++ include/uapi/linux/watch_queue.h | 20 + init/Kconfig | 12 + kernel/Makefile | 1 + kernel/watch_queue.c | 657 +++++++++++++++++++++ 10 files changed, 1318 insertions(+), 76 deletions(-) create mode 100644 Documentation/watch_queue.rst create mode 100644 include/linux/watch_queue.h create mode 100644 kernel/watch_queue.c (limited to 'include/linux') diff --git a/Documentation/userspace-api/ioctl/ioctl-number.rst b/Documentation/userspace-api/ioctl/ioctl-number.rst index f759edafd938..9425377615ce 100644 --- a/Documentation/userspace-api/ioctl/ioctl-number.rst +++ b/Documentation/userspace-api/ioctl/ioctl-number.rst @@ -201,6 +201,7 @@ Code Seq# Include File Comments 'W' 00-1F linux/wanrouter.h conflict! (pre 3.9) 'W' 00-3F sound/asound.h conflict! 'W' 40-5F drivers/pci/switch/switchtec.c +'W' 60-61 linux/watch_queue.h 'X' all fs/xfs/xfs_fs.h, conflict! fs/xfs/linux-2.6/xfs_ioctl32.h, include/linux/falloc.h, diff --git a/Documentation/watch_queue.rst b/Documentation/watch_queue.rst new file mode 100644 index 000000000000..849fad6893ef --- /dev/null +++ b/Documentation/watch_queue.rst @@ -0,0 +1,339 @@ +============================== +General notification mechanism +============================== + +The general notification mechanism is built on top of the standard pipe driver +whereby it effectively splices notification messages from the kernel into pipes +opened by userspace. This can be used in conjunction with:: + + * Key/keyring notifications + + +The notification buffers can be enabled by: + + "General setup"/"General notification queue" + (CONFIG_WATCH_QUEUE) + +This document has the following sections: + +.. contents:: :local: + + +Overview +======== + +This facility appears as a pipe that is opened in a special mode. The pipe's +internal ring buffer is used to hold messages that are generated by the kernel. +These messages are then read out by read(). Splice and similar are disabled on +such pipes because, under some circumstances, they may want to revert their +additions to the ring - which might end up interleaved with notification +messages. + +The owner of the pipe has to tell the kernel which sources it would like to +watch through that pipe. Only sources that have been connected to a pipe will +insert messages into it. Note that a source may be bound to multiple pipes and +insert messages into all of them simultaneously. + +Filters may also be emplaced on a pipe so that certain source types and +subevents can be ignored if they're not of interest. + +A message will be discarded if there isn't a slot available in the ring or if +no preallocated message buffer is available. In both of these cases, read() +will insert a WATCH_META_LOSS_NOTIFICATION message into the output buffer after +the last message currently in the buffer has been read. + +Note that when producing a notification, the kernel does not wait for the +consumers to collect it, but rather just continues on.
This means that +notifications can be generated whilst spinlocks are held and also protects the +kernel from being held up indefinitely by a userspace malfunction. + + +Message Structure +================= + +Notification messages begin with a short header:: + + struct watch_notification { + __u32 type:24; + __u32 subtype:8; + __u32 info; + }; + +"type" indicates the source of the notification record and "subtype" indicates +the type of record from that source (see the Watch Sources section below). The +type may also be "WATCH_TYPE_META". This is a special record type generated +internally by the watch queue itself. There are two subtypes: + + * WATCH_META_REMOVAL_NOTIFICATION + * WATCH_META_LOSS_NOTIFICATION + +The first indicates that an object on which a watch was installed was removed +or destroyed and the second indicates that some messages have been lost. + +"info" indicates a bunch of things, including: + + * The length of the message in bytes, including the header (mask with + WATCH_INFO_LENGTH and shift by WATCH_INFO_LENGTH__SHIFT). This indicates + the size of the record, which may be between 8 and 127 bytes. + + * The watch ID (mask with WATCH_INFO_ID and shift by WATCH_INFO_ID__SHIFT). + This indicates the caller's ID of the watch, which may be between 0 + and 255. Multiple watches may share a queue, and this provides a means to + distinguish them. + + * A type-specific field (WATCH_INFO_TYPE_INFO). This is set by the + notification producer to indicate some meaning specific to the type and + subtype. + +Everything in info apart from the length can be used for filtering. + +The header can be followed by supplementary information. The format of this is +at the discretion of the type and subtype. + + +Watch List (Notification Source) API +==================================== + +A "watch list" is a list of watchers that are subscribed to a source of +notifications. A list may be attached to an object (say a key or a superblock) +or may be global (say for device events). From a userspace perspective, a +non-global watch list is typically referred to by reference to the object it +belongs to (such as using KEYCTL_NOTIFY and giving it a key serial number to +watch that specific key). + +To manage a watch list, the following functions are provided: + + * ``void init_watch_list(struct watch_list *wlist, + void (*release_watch)(struct watch *watch));`` + + Initialise a watch list. If ``release_watch`` is not NULL, then this + indicates a function that should be called when the watch_list object is + destroyed to discard any references the watch list holds on the watched + object. + + * ``void remove_watch_list(struct watch_list *wlist);`` + + This removes all of the watches subscribed to a watch_list and frees them + and then destroys the watch_list object itself. + + +Watch Queue (Notification Output) API +===================================== + +A "watch queue" is the buffer allocated by an application that notification +records will be written into. The workings of this are hidden entirely inside +of the pipe device driver, but it is necessary to gain a reference to it to set +a watch. These can be managed with: + + * ``struct watch_queue *get_watch_queue(int fd);`` + + Since watch queues are indicated to the kernel by the fd of the pipe that + implements the buffer, userspace must hand that fd through a system call. + This can be used to look up an opaque pointer to the watch queue from the + system call.
+ + * ``void put_watch_queue(struct watch_queue *wqueue);`` + + This discards the reference obtained from ``get_watch_queue()``. + + +Watch Subscription API +====================== + +A "watch" is a subscription on a watch list, indicating the watch queue, and +thus the buffer, into which notification records should be written. The watch +queue object may also carry filtering rules for that object, as set by +userspace. Some parts of the watch struct can be set by the driver:: + + struct watch { + union { + u32 info_id; /* ID to be OR'd in to info field */ + ... + }; + void *private; /* Private data for the watched object */ + u64 id; /* Internal identifier */ + ... + }; + +The ``info_id`` value should be an 8-bit number obtained from userspace and +shifted by WATCH_INFO_ID__SHIFT. This is OR'd into the WATCH_INFO_ID field of +struct watch_notification::info when and if the notification is written into +the associated watch queue buffer. + +The ``private`` field is the driver's data associated with the watch_list and +is cleaned up by the ``watch_list::release_watch()`` method. + +The ``id`` field is the source's ID. Notifications that are posted with a +different ID are ignored. + +The following functions are provided to manage watches: + + * ``void init_watch(struct watch *watch, struct watch_queue *wqueue);`` + + Initialise a watch object, setting its pointer to the watch queue, using + appropriate barriering to avoid lockdep complaints. + + * ``int add_watch_to_object(struct watch *watch, struct watch_list *wlist);`` + + Subscribe a watch to a watch list (notification source). The + driver-settable fields in the watch struct must have been set before this + is called. + + * ``int remove_watch_from_object(struct watch_list *wlist, + struct watch_queue *wqueue, + u64 id, false);`` + + Remove a watch from a watch list, where the watch must match the specified + watch queue (``wqueue``) and object identifier (``id``). A notification + (``WATCH_META_REMOVAL_NOTIFICATION``) is sent to the watch queue to + indicate that the watch got removed. + + * ``int remove_watch_from_object(struct watch_list *wlist, NULL, 0, true);`` + + Remove all the watches from a watch list. It is expected that this will be + called preparatory to destruction and that the watch list will be + inaccessible to new watches by this point. A notification + (``WATCH_META_REMOVAL_NOTIFICATION``) is sent to the watch queue of each + subscribed watch to indicate that the watch got removed. + + +Notification Posting API +======================== + +To post a notification to a watch list so that the subscribed watches can see +it, the following function should be used:: + + void post_watch_notification(struct watch_list *wlist, + struct watch_notification *n, + const struct cred *cred, + u64 id); + +The notification should be preformatted and a pointer to the header (``n``) +should be passed in. The notification may be larger than this and the size in +bytes is noted in ``n->info & WATCH_INFO_LENGTH``. + +The ``cred`` struct indicates the credentials of the source (subject) and is +passed to the LSMs, such as SELinux, to allow or suppress the recording of the +note in each individual queue according to the credentials of that queue +(object). + +The ``id`` is the ID of the source object (such as the serial number on a key). +Only watches that have the same ID set in them will see this notification. + + +Watch Sources +============= + +Any particular buffer can be fed from multiple sources.
Sources include: + + * WATCH_TYPE_KEY_NOTIFY + + Notifications of this type indicate changes to keys and keyrings, including + the changes of keyring contents or the attributes of keys. + + See Documentation/security/keys/core.rst for more information. + + +Event Filtering +=============== + +Once a watch queue has been created, a set of filters can be applied to limit +the events that are received using:: + + struct watch_notification_filter filter = { + ... + }; + ioctl(fd, IOC_WATCH_QUEUE_SET_FILTER, &filter) + +The filter description is a variable of type:: + + struct watch_notification_filter { + __u32 nr_filters; + __u32 __reserved; + struct watch_notification_type_filter filters[]; + }; + +Where "nr_filters" is the number of filters in filters[] and "__reserved" +should be 0. The "filters" array has elements of the following type:: + + struct watch_notification_type_filter { + __u32 type; + __u32 info_filter; + __u32 info_mask; + __u32 subtype_filter[8]; + }; + +Where: + + * ``type`` is the event type to filter for and should be something like + "WATCH_TYPE_KEY_NOTIFY" + + * ``info_filter`` and ``info_mask`` act as a filter on the info field of the + notification record. The notification is only written into the buffer if:: + + (watch.info & info_mask) == info_filter + + This could be used, for example, to ignore events that are not exactly on + the watched point in a mount tree. + + * ``subtype_filter`` is a bitmask indicating the subtypes that are of + interest. Bit 0 of subtype_filter[0] corresponds to subtype 0, bit 1 to + subtype 1, and so on. + +If the argument to the ioctl() is NULL, then the filters will be removed and +all events from the watched sources will come through. + + +Userspace Code Example +====================== + +A buffer is created with something like the following:: + + pipe2(fds, O_NOTIFICATION_PIPE); + ioctl(fds[1], IOC_WATCH_QUEUE_SET_SIZE, 256); + +It can then be set to receive keyring change notifications:: + + keyctl(KEYCTL_WATCH_KEY, KEY_SPEC_SESSION_KEYRING, fds[1], 0x01); + +The notifications can then be consumed by something like the following:: + + static void consumer(int rfd, struct watch_queue_buffer *buf) + { + unsigned char buffer[128]; + ssize_t buf_len; + + while (buf_len = read(rfd, buffer, sizeof(buffer)), + buf_len > 0 + ) { + void *p = buffer; + void *end = buffer + buf_len; + while (p < end) { + union { + struct watch_notification n; + unsigned char buf1[128]; + } n; + size_t largest, len; + + largest = end - p; + if (largest > 128) + largest = 128; + memcpy(&n, p, largest); + + len = (n.n.info & WATCH_INFO_LENGTH) >> + WATCH_INFO_LENGTH__SHIFT; + if (len == 0 || len > largest) + return; + + switch (n.n.type) { + case WATCH_TYPE_META: + got_meta(&n.n); + break; + case WATCH_TYPE_KEY_NOTIFY: + saw_key_change(&n.n); + break; + } + + p += len; + } + } + } diff --git a/fs/pipe.c b/fs/pipe.c index 16fb72e9abf7..da9bc1f21fd1 100644 --- a/fs/pipe.c +++ b/fs/pipe.c @@ -24,6 +24,7 @@ #include #include #include +#include #include #include @@ -459,6 +460,13 @@ pipe_write(struct kiocb *iocb, struct iov_iter *from) goto out; } +#ifdef CONFIG_WATCH_QUEUE + if (pipe->watch_queue) { + ret = -EXDEV; + goto out; + } +#endif + /* * Only wake up if the pipe started out empty, since * otherwise there should be no readers waiting.
@@ -628,22 +636,37 @@ static long pipe_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) int count, head, tail, mask; switch (cmd) { - case FIONREAD: - __pipe_lock(pipe); - count = 0; - head = pipe->head; - tail = pipe->tail; - mask = pipe->ring_size - 1; + case FIONREAD: + __pipe_lock(pipe); + count = 0; + head = pipe->head; + tail = pipe->tail; + mask = pipe->ring_size - 1; - while (tail != head) { - count += pipe->bufs[tail & mask].len; - tail++; - } - __pipe_unlock(pipe); + while (tail != head) { + count += pipe->bufs[tail & mask].len; + tail++; + } + __pipe_unlock(pipe); - return put_user(count, (int __user *)arg); - default: - return -ENOIOCTLCMD; + return put_user(count, (int __user *)arg); + +#ifdef CONFIG_WATCH_QUEUE + case IOC_WATCH_QUEUE_SET_SIZE: { + int ret; + __pipe_lock(pipe); + ret = watch_queue_set_size(pipe, arg); + __pipe_unlock(pipe); + return ret; + } + + case IOC_WATCH_QUEUE_SET_FILTER: + return watch_queue_set_filter( + pipe, (struct watch_notification_filter __user *)arg); +#endif + + default: + return -ENOIOCTLCMD; } } @@ -754,27 +777,27 @@ pipe_fasync(int fd, struct file *filp, int on) return retval; } -static unsigned long account_pipe_buffers(struct user_struct *user, - unsigned long old, unsigned long new) +unsigned long account_pipe_buffers(struct user_struct *user, + unsigned long old, unsigned long new) { return atomic_long_add_return(new - old, &user->pipe_bufs); } -static bool too_many_pipe_buffers_soft(unsigned long user_bufs) +bool too_many_pipe_buffers_soft(unsigned long user_bufs) { unsigned long soft_limit = READ_ONCE(pipe_user_pages_soft); return soft_limit && user_bufs > soft_limit; } -static bool too_many_pipe_buffers_hard(unsigned long user_bufs) +bool too_many_pipe_buffers_hard(unsigned long user_bufs) { unsigned long hard_limit = READ_ONCE(pipe_user_pages_hard); return hard_limit && user_bufs > hard_limit; } -static bool is_unprivileged_user(void) +bool pipe_is_unprivileged_user(void) { return !capable(CAP_SYS_RESOURCE) && !capable(CAP_SYS_ADMIN); } @@ -796,12 +819,12 @@ struct pipe_inode_info *alloc_pipe_info(void) user_bufs = account_pipe_buffers(user, 0, pipe_bufs); - if (too_many_pipe_buffers_soft(user_bufs) && is_unprivileged_user()) { + if (too_many_pipe_buffers_soft(user_bufs) && pipe_is_unprivileged_user()) { user_bufs = account_pipe_buffers(user, pipe_bufs, 1); pipe_bufs = 1; } - if (too_many_pipe_buffers_hard(user_bufs) && is_unprivileged_user()) + if (too_many_pipe_buffers_hard(user_bufs) && pipe_is_unprivileged_user()) goto out_revert_acct; pipe->bufs = kcalloc(pipe_bufs, sizeof(struct pipe_buffer), @@ -813,6 +836,7 @@ struct pipe_inode_info *alloc_pipe_info(void) pipe->r_counter = pipe->w_counter = 1; pipe->max_usage = pipe_bufs; pipe->ring_size = pipe_bufs; + pipe->nr_accounted = pipe_bufs; pipe->user = user; mutex_init(&pipe->mutex); return pipe; @@ -830,7 +854,14 @@ void free_pipe_info(struct pipe_inode_info *pipe) { int i; - (void) account_pipe_buffers(pipe->user, pipe->ring_size, 0); +#ifdef CONFIG_WATCH_QUEUE + if (pipe->watch_queue) { + watch_queue_clear(pipe->watch_queue); + put_watch_queue(pipe->watch_queue); + } +#endif + + (void) account_pipe_buffers(pipe->user, pipe->nr_accounted, 0); free_uid(pipe->user); for (i = 0; i < pipe->ring_size; i++) { struct pipe_buffer *buf = pipe->bufs + i; @@ -906,6 +937,17 @@ int create_pipe_files(struct file **res, int flags) if (!inode) return -ENFILE; + if (flags & O_NOTIFICATION_PIPE) { +#ifdef CONFIG_WATCH_QUEUE + if (watch_queue_init(inode->i_pipe) < 0) { + iput(inode); + 
return -ENOMEM; + } +#else + iput(inode); + return -ENOPKG; +#endif + } + f = alloc_file_pseudo(inode, pipe_mnt, "", O_WRONLY | (flags & (O_NONBLOCK | O_DIRECT)), &pipefifo_fops); @@ -936,7 +978,7 @@ static int __do_pipe_flags(int *fd, struct file **files, int flags) int error; int fdw, fdr; - if (flags & ~(O_CLOEXEC | O_NONBLOCK | O_DIRECT)) + if (flags & ~(O_CLOEXEC | O_NONBLOCK | O_DIRECT | O_NOTIFICATION_PIPE)) return -EINVAL; error = create_pipe_files(files, flags); @@ -1184,42 +1226,12 @@ unsigned int round_pipe_size(unsigned long size) } /* - * Allocate a new array of pipe buffers and copy the info over. Returns the - * pipe size if successful, or return -ERROR on error. + * Resize the pipe ring to a number of slots. */ -static long pipe_set_size(struct pipe_inode_info *pipe, unsigned long arg) +int pipe_resize_ring(struct pipe_inode_info *pipe, unsigned int nr_slots) { struct pipe_buffer *bufs; - unsigned int size, nr_slots, head, tail, mask, n; - unsigned long user_bufs; - long ret = 0; - - size = round_pipe_size(arg); - nr_slots = size >> PAGE_SHIFT; - - if (!nr_slots) - return -EINVAL; - - /* - * If trying to increase the pipe capacity, check that an - * unprivileged user is not trying to exceed various limits - * (soft limit check here, hard limit check just below). - * Decreasing the pipe capacity is always permitted, even - * if the user is currently over a limit. - */ - if (nr_slots > pipe->ring_size && - size > pipe_max_size && !capable(CAP_SYS_RESOURCE)) - return -EPERM; - - user_bufs = account_pipe_buffers(pipe->user, pipe->ring_size, nr_slots); - - if (nr_slots > pipe->ring_size && - (too_many_pipe_buffers_hard(user_bufs) || - too_many_pipe_buffers_soft(user_bufs)) && - is_unprivileged_user()) { - ret = -EPERM; - goto out_revert_acct; - } + unsigned int head, tail, mask, n; /* * We can shrink the pipe, if arg is greater than the ring occupancy. @@ -1231,17 +1243,13 @@ static long pipe_set_size(struct pipe_inode_info *pipe, unsigned long arg) head = pipe->head; tail = pipe->tail; n = pipe_occupancy(pipe->head, pipe->tail); - if (nr_slots < n) { - ret = -EBUSY; - goto out_revert_acct; - } + if (nr_slots < n) + return -EBUSY; bufs = kcalloc(nr_slots, sizeof(*bufs), GFP_KERNEL_ACCOUNT | __GFP_NOWARN); - if (unlikely(!bufs)) { - ret = -ENOMEM; - goto out_revert_acct; - } + if (unlikely(!bufs)) + return -ENOMEM; /* * The pipe array wraps around, so just start the new one at zero @@ -1269,16 +1277,68 @@ static long pipe_set_size(struct pipe_inode_info *pipe, unsigned long arg) kfree(pipe->bufs); pipe->bufs = bufs; pipe->ring_size = nr_slots; - pipe->max_usage = nr_slots; + if (pipe->max_usage > nr_slots) + pipe->max_usage = nr_slots; pipe->tail = tail; pipe->head = head; /* This might have made more room for writers */ wake_up_interruptible(&pipe->wr_wait); + return 0; +} + +/* + * Allocate a new array of pipe buffers and copy the info over. Returns the + * pipe size if successful, or return -ERROR on error. + */ +static long pipe_set_size(struct pipe_inode_info *pipe, unsigned long arg) +{ + unsigned long user_bufs; + unsigned int nr_slots, size; + long ret = 0; + +#ifdef CONFIG_WATCH_QUEUE + if (pipe->watch_queue) + return -EBUSY; +#endif + + size = round_pipe_size(arg); + nr_slots = size >> PAGE_SHIFT; + + if (!nr_slots) + return -EINVAL; + + /* + * If trying to increase the pipe capacity, check that an + * unprivileged user is not trying to exceed various limits + * (soft limit check here, hard limit check just below).
+ * Decreasing the pipe capacity is always permitted, even + * if the user is currently over a limit. + */ + if (nr_slots > pipe->max_usage && + size > pipe_max_size && !capable(CAP_SYS_RESOURCE)) + return -EPERM; + + user_bufs = account_pipe_buffers(pipe->user, pipe->nr_accounted, nr_slots); + + if (nr_slots > pipe->max_usage && + (too_many_pipe_buffers_hard(user_bufs) || + too_many_pipe_buffers_soft(user_bufs)) && + pipe_is_unprivileged_user()) { + ret = -EPERM; + goto out_revert_acct; + } + + ret = pipe_resize_ring(pipe, nr_slots); + if (ret < 0) + goto out_revert_acct; + + pipe->max_usage = nr_slots; + pipe->nr_accounted = nr_slots; return pipe->max_usage * PAGE_SIZE; out_revert_acct: - (void) account_pipe_buffers(pipe->user, nr_slots, pipe->ring_size); + (void) account_pipe_buffers(pipe->user, nr_slots, pipe->nr_accounted); return ret; } @@ -1287,9 +1347,17 @@ out_revert_acct: * location, so checking ->i_pipe is not enough to verify that this is a * pipe. */ -struct pipe_inode_info *get_pipe_info(struct file *file) +struct pipe_inode_info *get_pipe_info(struct file *file, bool for_splice) { - return file->f_op == &pipefifo_fops ? file->private_data : NULL; + struct pipe_inode_info *pipe = file->private_data; + + if (file->f_op != &pipefifo_fops || !pipe) + return NULL; +#ifdef CONFIG_WATCH_QUEUE + if (for_splice && pipe->watch_queue) + return NULL; +#endif + return pipe; } long pipe_fcntl(struct file *file, unsigned int cmd, unsigned long arg) @@ -1297,7 +1365,7 @@ long pipe_fcntl(struct file *file, unsigned int cmd, unsigned long arg) struct pipe_inode_info *pipe; long ret; - pipe = get_pipe_info(file); + pipe = get_pipe_info(file, false); if (!pipe) return -EBADF; diff --git a/fs/splice.c b/fs/splice.c index fd0a1e7e5959..50f3c0260c00 100644 --- a/fs/splice.c +++ b/fs/splice.c @@ -1122,8 +1122,8 @@ long do_splice(struct file *in, loff_t __user *off_in, !(out->f_mode & FMODE_WRITE))) return -EBADF; - ipipe = get_pipe_info(in); - opipe = get_pipe_info(out); + ipipe = get_pipe_info(in, true); + opipe = get_pipe_info(out, true); if (ipipe && opipe) { if (off_in || off_out) @@ -1273,7 +1273,7 @@ static int pipe_to_user(struct pipe_inode_info *pipe, struct pipe_buffer *buf, static long vmsplice_to_user(struct file *file, struct iov_iter *iter, unsigned int flags) { - struct pipe_inode_info *pipe = get_pipe_info(file); + struct pipe_inode_info *pipe = get_pipe_info(file, true); struct splice_desc sd = { .total_len = iov_iter_count(iter), .flags = flags, @@ -1308,7 +1308,7 @@ static long vmsplice_to_pipe(struct file *file, struct iov_iter *iter, if (flags & SPLICE_F_GIFT) buf_flag = PIPE_BUF_FLAG_GIFT; - pipe = get_pipe_info(file); + pipe = get_pipe_info(file, true); if (!pipe) return -EBADF; @@ -1757,8 +1757,8 @@ static int link_pipe(struct pipe_inode_info *ipipe, static long do_tee(struct file *in, struct file *out, size_t len, unsigned int flags) { - struct pipe_inode_info *ipipe = get_pipe_info(in); - struct pipe_inode_info *opipe = get_pipe_info(out); + struct pipe_inode_info *ipipe = get_pipe_info(in, true); + struct pipe_inode_info *opipe = get_pipe_info(out, true); int ret = -EINVAL; if (unlikely(!(in->f_mode & FMODE_READ) || diff --git a/include/linux/pipe_fs_i.h b/include/linux/pipe_fs_i.h index ae58fad7f1e0..1d3eaa233f4a 100644 --- a/include/linux/pipe_fs_i.h +++ b/include/linux/pipe_fs_i.h @@ -35,6 +35,7 @@ struct pipe_buffer { * @tail: The point of buffer consumption * @max_usage: The maximum number of slots that may be used in the ring * @ring_size: total number of buffers 
(should be a power of 2) + * @nr_accounted: The amount this pipe accounts for in user->pipe_bufs * @tmp_page: cached released page * @readers: number of current readers of this pipe * @writers: number of current writers of this pipe @@ -45,6 +46,7 @@ struct pipe_buffer { * @fasync_writers: writer side fasync * @bufs: the circular array of pipe buffers * @user: the user who created this pipe + * @watch_queue: If this pipe is a watch_queue, this is the stuff for that **/ struct pipe_inode_info { struct mutex mutex; @@ -53,6 +55,7 @@ struct pipe_inode_info { unsigned int tail; unsigned int max_usage; unsigned int ring_size; + unsigned int nr_accounted; unsigned int readers; unsigned int writers; unsigned int files; @@ -63,6 +66,9 @@ struct pipe_inode_info { struct fasync_struct *fasync_writers; struct pipe_buffer *bufs; struct user_struct *user; +#ifdef CONFIG_WATCH_QUEUE + struct watch_queue *watch_queue; +#endif }; /* @@ -237,9 +243,20 @@ void pipe_buf_mark_unmergeable(struct pipe_buffer *buf); extern const struct pipe_buf_operations nosteal_pipe_buf_ops; +#ifdef CONFIG_WATCH_QUEUE +unsigned long account_pipe_buffers(struct user_struct *user, + unsigned long old, unsigned long new); +bool too_many_pipe_buffers_soft(unsigned long user_bufs); +bool too_many_pipe_buffers_hard(unsigned long user_bufs); +bool pipe_is_unprivileged_user(void); +#endif + /* for F_SETPIPE_SZ and F_GETPIPE_SZ */ +#ifdef CONFIG_WATCH_QUEUE +int pipe_resize_ring(struct pipe_inode_info *pipe, unsigned int nr_slots); +#endif long pipe_fcntl(struct file *, unsigned int, unsigned long arg); -struct pipe_inode_info *get_pipe_info(struct file *file); +struct pipe_inode_info *get_pipe_info(struct file *file, bool for_splice); int create_pipe_files(struct file **, int); unsigned int round_pipe_size(unsigned long size); diff --git a/include/linux/watch_queue.h b/include/linux/watch_queue.h new file mode 100644 index 000000000000..5e08db2adc31 --- /dev/null +++ b/include/linux/watch_queue.h @@ -0,0 +1,127 @@ +// SPDX-License-Identifier: GPL-2.0 +/* User-mappable watch queue + * + * Copyright (C) 2020 Red Hat, Inc. All Rights Reserved. + * Written by David Howells (dhowells@redhat.com) + * + * See Documentation/watch_queue.rst + */ + +#ifndef _LINUX_WATCH_QUEUE_H +#define _LINUX_WATCH_QUEUE_H + +#include +#include +#include + +#ifdef CONFIG_WATCH_QUEUE + +struct cred; + +struct watch_type_filter { + enum watch_notification_type type; + __u32 subtype_filter[1]; /* Bitmask of subtypes to filter on */ + __u32 info_filter; /* Filter on watch_notification::info */ + __u32 info_mask; /* Mask of relevant bits in info_filter */ +}; + +struct watch_filter { + union { + struct rcu_head rcu; + unsigned long type_filter[2]; /* Bitmask of accepted types */ + }; + u32 nr_filters; /* Number of filters */ + struct watch_type_filter filters[]; +}; + +struct watch_queue { + struct rcu_head rcu; + struct watch_filter __rcu *filter; + struct pipe_inode_info *pipe; /* The pipe we're using as a buffer */ + struct hlist_head watches; /* Contributory watches */ + struct page **notes; /* Preallocated notifications */ + unsigned long *notes_bitmap; /* Allocation bitmap for notes */ + struct kref usage; /* Object usage count */ + spinlock_t lock; + unsigned int nr_notes; /* Number of notes */ + unsigned int nr_pages; /* Number of pages in notes[] */ + bool defunct; /* T when queues closed */ +}; + +/* + * Representation of a watch on an object. 
+ */ +struct watch { + union { + struct rcu_head rcu; + u32 info_id; /* ID to be OR'd in to info field */ + }; + struct watch_queue __rcu *queue; /* Queue to post events to */ + struct hlist_node queue_node; /* Link in queue->watches */ + struct watch_list __rcu *watch_list; + struct hlist_node list_node; /* Link in watch_list->watchers */ + const struct cred *cred; /* Creds of the owner of the watch */ + void *private; /* Private data for the watched object */ + u64 id; /* Internal identifier */ + struct kref usage; /* Object usage count */ +}; + +/* + * List of watches on an object. + */ +struct watch_list { + struct rcu_head rcu; + struct hlist_head watchers; + void (*release_watch)(struct watch *); + spinlock_t lock; +}; + +extern void __post_watch_notification(struct watch_list *, + struct watch_notification *, + const struct cred *, + u64); +extern struct watch_queue *get_watch_queue(int); +extern void put_watch_queue(struct watch_queue *); +extern void init_watch(struct watch *, struct watch_queue *); +extern int add_watch_to_object(struct watch *, struct watch_list *); +extern int remove_watch_from_object(struct watch_list *, struct watch_queue *, u64, bool); +extern long watch_queue_set_size(struct pipe_inode_info *, unsigned int); +extern long watch_queue_set_filter(struct pipe_inode_info *, + struct watch_notification_filter __user *); +extern int watch_queue_init(struct pipe_inode_info *); +extern void watch_queue_clear(struct watch_queue *); + +static inline void init_watch_list(struct watch_list *wlist, + void (*release_watch)(struct watch *)) +{ + INIT_HLIST_HEAD(&wlist->watchers); + spin_lock_init(&wlist->lock); + wlist->release_watch = release_watch; +} + +static inline void post_watch_notification(struct watch_list *wlist, + struct watch_notification *n, + const struct cred *cred, + u64 id) +{ + if (unlikely(wlist)) + __post_watch_notification(wlist, n, cred, id); +} + +static inline void remove_watch_list(struct watch_list *wlist, u64 id) +{ + if (wlist) { + remove_watch_from_object(wlist, NULL, id, true); + kfree_rcu(wlist, rcu); + } +} + +/** + * watch_sizeof - Calculate the information part of the size of a watch record, + * given the structure size. + */ +#define watch_sizeof(STRUCT) (sizeof(STRUCT) << WATCH_INFO_LENGTH__SHIFT) + +#endif + +#endif /* _LINUX_WATCH_QUEUE_H */ diff --git a/include/uapi/linux/watch_queue.h b/include/uapi/linux/watch_queue.h index 9df72227f515..3a5790f1f05d 100644 --- a/include/uapi/linux/watch_queue.h +++ b/include/uapi/linux/watch_queue.h @@ -4,9 +4,13 @@ #include #include +#include #define O_NOTIFICATION_PIPE O_EXCL /* Parameter to pipe2() selecting notification pipe */ +#define IOC_WATCH_QUEUE_SET_SIZE _IO('W', 0x60) /* Set the size in pages */ +#define IOC_WATCH_QUEUE_SET_FILTER _IO('W', 0x61) /* Set the filter */ + enum watch_notification_type { WATCH_TYPE_META = 0, /* Special record */ WATCH_TYPE__NR = 1 @@ -41,6 +45,22 @@ struct watch_notification { #define WATCH_INFO_FLAG_7 0x00800000 }; +/* + * Notification filtering rules (IOC_WATCH_QUEUE_SET_FILTER). 
+ */ +struct watch_notification_type_filter { + __u32 type; /* Type to apply filter to */ + __u32 info_filter; /* Filter on watch_notification::info */ + __u32 info_mask; /* Mask of relevant bits in info_filter */ + __u32 subtype_filter[8]; /* Bitmask of subtypes to filter on */ +}; + +struct watch_notification_filter { + __u32 nr_filters; /* Number of filters */ + __u32 __reserved; /* Must be 0 */ + struct watch_notification_type_filter filters[]; +}; + /* * Extended watch removal notification. This is used optionally if the type diff --git a/init/Kconfig b/init/Kconfig index 74a5ac65644f..c95a2a5654a9 100644 --- a/init/Kconfig +++ b/init/Kconfig @@ -326,6 +326,18 @@ config POSIX_MQUEUE_SYSCTL depends on SYSCTL default y +config WATCH_QUEUE + bool "General notification queue" + default n + help + + This is a general notification queue for the kernel to pass events to + userspace by splicing them into pipes. It can be used in conjunction + with watches for key/keyring change notifications and device + notifications. + + See Documentation/watch_queue.rst + config CROSS_MEMORY_ATTACH bool "Enable process_vm_readv/writev syscalls" depends on MMU diff --git a/kernel/Makefile b/kernel/Makefile index 4cb4130ced32..41e7e3ae07ec 100644 --- a/kernel/Makefile +++ b/kernel/Makefile @@ -115,6 +115,7 @@ obj-$(CONFIG_TORTURE_TEST) += torture.o obj-$(CONFIG_HAS_IOMEM) += iomem.o obj-$(CONFIG_RSEQ) += rseq.o +obj-$(CONFIG_WATCH_QUEUE) += watch_queue.o obj-$(CONFIG_SYSCTL_KUNIT_TEST) += sysctl-test.o diff --git a/kernel/watch_queue.c b/kernel/watch_queue.c new file mode 100644 index 000000000000..c103e34f8705 --- /dev/null +++ b/kernel/watch_queue.c @@ -0,0 +1,657 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Watch queue and general notification mechanism, built on pipes + * + * Copyright (C) 2020 Red Hat, Inc. All Rights Reserved. + * Written by David Howells (dhowells@redhat.com) + * + * See Documentation/watch_queue.rst + */ + +#define pr_fmt(fmt) "watchq: " fmt +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +MODULE_DESCRIPTION("Watch queue"); +MODULE_AUTHOR("Red Hat, Inc."); +MODULE_LICENSE("GPL"); + +#define WATCH_QUEUE_NOTE_SIZE 128 +#define WATCH_QUEUE_NOTES_PER_PAGE (PAGE_SIZE / WATCH_QUEUE_NOTE_SIZE) + +static void watch_queue_pipe_buf_release(struct pipe_inode_info *pipe, + struct pipe_buffer *buf) +{ + struct watch_queue *wqueue = (struct watch_queue *)buf->private; + struct page *page; + unsigned int bit; + + /* We need to work out which note within the page this refers to, but + * the note might have been maximum size, so merely ANDing the offset + * off doesn't work. OTOH, the note must've been more than zero size. + */ + bit = buf->offset + buf->len; + if ((bit & (WATCH_QUEUE_NOTE_SIZE - 1)) == 0) + bit -= WATCH_QUEUE_NOTE_SIZE; + bit /= WATCH_QUEUE_NOTE_SIZE; + + page = buf->page; + bit += page->index; + + set_bit(bit, wqueue->notes_bitmap); +} + +static int watch_queue_pipe_buf_steal(struct pipe_inode_info *pipe, + struct pipe_buffer *buf) +{ + return -1; /* No. */ +} + +/* New data written to a pipe may be appended to a buffer with this type. */ +static const struct pipe_buf_operations watch_queue_pipe_buf_ops = { + .confirm = generic_pipe_buf_confirm, + .release = watch_queue_pipe_buf_release, + .steal = watch_queue_pipe_buf_steal, + .get = generic_pipe_buf_get, +}; + +/* + * Post a notification to a watch queue. 
+ */ +static bool post_one_notification(struct watch_queue *wqueue, + struct watch_notification *n) +{ + void *p; + struct pipe_inode_info *pipe = wqueue->pipe; + struct pipe_buffer *buf; + struct page *page; + unsigned int head, tail, mask, note, offset, len; + bool done = false; + + if (!pipe) + return false; + + spin_lock_irq(&pipe->rd_wait.lock); + + if (wqueue->defunct) + goto out; + + mask = pipe->ring_size - 1; + head = pipe->head; + tail = pipe->tail; + if (pipe_full(head, tail, pipe->ring_size)) + goto lost; + + note = find_first_bit(wqueue->notes_bitmap, wqueue->nr_notes); + if (note >= wqueue->nr_notes) + goto lost; + + page = wqueue->notes[note / WATCH_QUEUE_NOTES_PER_PAGE]; + offset = note % WATCH_QUEUE_NOTES_PER_PAGE * WATCH_QUEUE_NOTE_SIZE; + get_page(page); + len = n->info & WATCH_INFO_LENGTH; + p = kmap_atomic(page); + memcpy(p + offset, n, len); + kunmap_atomic(p); + + buf = &pipe->bufs[head & mask]; + buf->page = page; + buf->private = (unsigned long)wqueue; + buf->ops = &watch_queue_pipe_buf_ops; + buf->offset = offset; + buf->len = len; + buf->flags = 0; + pipe->head = head + 1; + + if (!test_and_clear_bit(note, wqueue->notes_bitmap)) { + spin_unlock_irq(&pipe->rd_wait.lock); + BUG(); + } + wake_up_interruptible_sync_poll_locked(&pipe->rd_wait, EPOLLIN | EPOLLRDNORM); + done = true; + +out: + spin_unlock_irq(&pipe->rd_wait.lock); + if (done) + kill_fasync(&pipe->fasync_readers, SIGIO, POLL_IN); + return done; + +lost: + goto out; +} + +/* + * Apply filter rules to a notification. + */ +static bool filter_watch_notification(const struct watch_filter *wf, + const struct watch_notification *n) +{ + const struct watch_type_filter *wt; + unsigned int st_bits = sizeof(wt->subtype_filter[0]) * 8; + unsigned int st_index = n->subtype / st_bits; + unsigned int st_bit = 1U << (n->subtype % st_bits); + int i; + + if (!test_bit(n->type, wf->type_filter)) + return false; + + for (i = 0; i < wf->nr_filters; i++) { + wt = &wf->filters[i]; + if (n->type == wt->type && + (wt->subtype_filter[st_index] & st_bit) && + (n->info & wt->info_mask) == wt->info_filter) + return true; + } + + return false; /* If there is a filter, the default is to reject. */ +} + +/** + * __post_watch_notification - Post an event notification + * @wlist: The watch list to post the event to. + * @n: The notification record to post. + * @cred: The creds of the process that triggered the notification. + * @id: The ID to match on the watch. + * + * Post a notification of an event into a set of watch queues and let the users + * know. + * + * The size of the notification should be set in n->info & WATCH_INFO_LENGTH and + * should be in bytes, including the header.
+ */ +void __post_watch_notification(struct watch_list *wlist, + struct watch_notification *n, + const struct cred *cred, + u64 id) +{ + const struct watch_filter *wf; + struct watch_queue *wqueue; + struct watch *watch; + + if (((n->info & WATCH_INFO_LENGTH) >> WATCH_INFO_LENGTH__SHIFT) == 0) { + WARN_ON(1); + return; + } + + rcu_read_lock(); + + hlist_for_each_entry_rcu(watch, &wlist->watchers, list_node) { + if (watch->id != id) + continue; + n->info &= ~WATCH_INFO_ID; + n->info |= watch->info_id; + + wqueue = rcu_dereference(watch->queue); + wf = rcu_dereference(wqueue->filter); + if (wf && !filter_watch_notification(wf, n)) + continue; + + if (security_post_notification(watch->cred, cred, n) < 0) + continue; + + post_one_notification(wqueue, n); + } + + rcu_read_unlock(); +} +EXPORT_SYMBOL(__post_watch_notification); + +/* + * Allocate sufficient pages to preallocate buffers for the requested number of + * notifications. + */ +long watch_queue_set_size(struct pipe_inode_info *pipe, unsigned int nr_notes) +{ + struct watch_queue *wqueue = pipe->watch_queue; + struct page **pages; + unsigned long *bitmap; + unsigned long user_bufs; + unsigned int bmsize; + int ret, i, nr_pages; + + if (!wqueue) + return -ENODEV; + if (wqueue->notes) + return -EBUSY; + + if (nr_notes < 1 || + nr_notes > 512) /* TODO: choose a better hard limit */ + return -EINVAL; + + nr_pages = (nr_notes + WATCH_QUEUE_NOTES_PER_PAGE - 1); + nr_pages /= WATCH_QUEUE_NOTES_PER_PAGE; + user_bufs = account_pipe_buffers(pipe->user, pipe->nr_accounted, nr_pages); + + if (nr_pages > pipe->max_usage && + (too_many_pipe_buffers_hard(user_bufs) || + too_many_pipe_buffers_soft(user_bufs)) && + pipe_is_unprivileged_user()) { + ret = -EPERM; + goto error; + } + + ret = pipe_resize_ring(pipe, nr_notes); + if (ret < 0) + goto error; + + ret = -ENOMEM; + pages = kcalloc(nr_pages, sizeof(struct page *), GFP_KERNEL); + if (!pages) + goto error; + + for (i = 0; i < nr_pages; i++) { + pages[i] = alloc_page(GFP_KERNEL); + if (!pages[i]) + goto error_p; + pages[i]->index = i * WATCH_QUEUE_NOTES_PER_PAGE; + } + + bmsize = (nr_notes + BITS_PER_LONG - 1) / BITS_PER_LONG; + bmsize *= sizeof(unsigned long); + bitmap = kmalloc(bmsize, GFP_KERNEL); + if (!bitmap) + goto error_p; + + memset(bitmap, 0xff, bmsize); + wqueue->notes = pages; + wqueue->notes_bitmap = bitmap; + wqueue->nr_pages = nr_pages; + wqueue->nr_notes = nr_pages * WATCH_QUEUE_NOTES_PER_PAGE; + return 0; + +error_p: + for (i = 0; i < nr_pages; i++) + __free_page(pages[i]); + kfree(pages); +error: + (void) account_pipe_buffers(pipe->user, nr_pages, pipe->nr_accounted); + return ret; +} + +/* + * Set the filter on a watch queue.
+ */ +long watch_queue_set_filter(struct pipe_inode_info *pipe, + struct watch_notification_filter __user *_filter) +{ + struct watch_notification_type_filter *tf; + struct watch_notification_filter filter; + struct watch_type_filter *q; + struct watch_filter *wfilter; + struct watch_queue *wqueue = pipe->watch_queue; + int ret, nr_filter = 0, i; + + if (!wqueue) + return -ENODEV; + + if (!_filter) { + /* Remove the old filter */ + wfilter = NULL; + goto set; + } + + /* Grab the user's filter specification */ + if (copy_from_user(&filter, _filter, sizeof(filter)) != 0) + return -EFAULT; + if (filter.nr_filters == 0 || + filter.nr_filters > 16 || + filter.__reserved != 0) + return -EINVAL; + + tf = memdup_user(_filter->filters, filter.nr_filters * sizeof(*tf)); + if (IS_ERR(tf)) + return PTR_ERR(tf); + + ret = -EINVAL; + for (i = 0; i < filter.nr_filters; i++) { + if ((tf[i].info_filter & ~tf[i].info_mask) || + tf[i].info_mask & WATCH_INFO_LENGTH) + goto err_filter; + /* Ignore any unknown types */ + if (tf[i].type >= sizeof(wfilter->type_filter) * 8) + continue; + nr_filter++; + } + + /* Now we need to build the internal filter from only the relevant + * user-specified filters. + */ + ret = -ENOMEM; + wfilter = kzalloc(struct_size(wfilter, filters, nr_filter), GFP_KERNEL); + if (!wfilter) + goto err_filter; + wfilter->nr_filters = nr_filter; + + q = wfilter->filters; + for (i = 0; i < filter.nr_filters; i++) { + if (tf[i].type >= sizeof(wfilter->type_filter) * 8) + continue; + + q->type = tf[i].type; + q->info_filter = tf[i].info_filter; + q->info_mask = tf[i].info_mask; + q->subtype_filter[0] = tf[i].subtype_filter[0]; + __set_bit(q->type, wfilter->type_filter); + q++; + } + + kfree(tf); +set: + pipe_lock(pipe); + wfilter = rcu_replace_pointer(wqueue->filter, wfilter, + lockdep_is_held(&pipe->mutex)); + pipe_unlock(pipe); + if (wfilter) + kfree_rcu(wfilter, rcu); + return 0; + +err_filter: + kfree(tf); + return ret; +} + +static void __put_watch_queue(struct kref *kref) +{ + struct watch_queue *wqueue = + container_of(kref, struct watch_queue, usage); + struct watch_filter *wfilter; + int i; + + for (i = 0; i < wqueue->nr_pages; i++) + __free_page(wqueue->notes[i]); + + wfilter = rcu_access_pointer(wqueue->filter); + if (wfilter) + kfree_rcu(wfilter, rcu); + kfree_rcu(wqueue, rcu); +} + +/** + * put_watch_queue - Dispose of a ref on a watchqueue. + * @wqueue: The watch queue to unref. + */ +void put_watch_queue(struct watch_queue *wqueue) +{ + kref_put(&wqueue->usage, __put_watch_queue); +} +EXPORT_SYMBOL(put_watch_queue); + +static void free_watch(struct rcu_head *rcu) +{ + struct watch *watch = container_of(rcu, struct watch, rcu); + + put_watch_queue(rcu_access_pointer(watch->queue)); + put_cred(watch->cred); +} + +static void __put_watch(struct kref *kref) +{ + struct watch *watch = container_of(kref, struct watch, usage); + + call_rcu(&watch->rcu, free_watch); +} + +/* + * Discard a watch. + */ +static void put_watch(struct watch *watch) +{ + kref_put(&watch->usage, __put_watch); +} + +/** + * init_watch - Initialise a watch + * @watch: The watch to initialise. + * @wqueue: The queue to assign. + * + * Initialise a watch and set the watch queue.
+ */ +void init_watch(struct watch *watch, struct watch_queue *wqueue) +{ + kref_init(&watch->usage); + INIT_HLIST_NODE(&watch->list_node); + INIT_HLIST_NODE(&watch->queue_node); + rcu_assign_pointer(watch->queue, wqueue); +} + +/** + * add_watch_to_object - Add a watch on an object to a watch list + * @watch: The watch to add + * @wlist: The watch list to add to + * + * @watch->queue must have been set to point to the queue to post notifications + * to and the watch list of the object to be watched. @watch->cred must also + * have been set to the appropriate credentials and a ref taken on them. + * + * The caller must pin the queue and the list both and must hold the list + * locked against racing watch additions/removals. + */ +int add_watch_to_object(struct watch *watch, struct watch_list *wlist) +{ + struct watch_queue *wqueue = rcu_access_pointer(watch->queue); + struct watch *w; + + hlist_for_each_entry(w, &wlist->watchers, list_node) { + struct watch_queue *wq = rcu_access_pointer(w->queue); + if (wqueue == wq && watch->id == w->id) + return -EBUSY; + } + + watch->cred = get_current_cred(); + rcu_assign_pointer(watch->watch_list, wlist); + + spin_lock_bh(&wqueue->lock); + kref_get(&wqueue->usage); + kref_get(&watch->usage); + hlist_add_head(&watch->queue_node, &wqueue->watches); + spin_unlock_bh(&wqueue->lock); + + hlist_add_head(&watch->list_node, &wlist->watchers); + return 0; +} +EXPORT_SYMBOL(add_watch_to_object); + +/** + * remove_watch_from_object - Remove a watch or all watches from an object. + * @wlist: The watch list to remove from + * @wq: The watch queue of interest (ignored if @all is true) + * @id: The ID of the watch to remove (ignored if @all is true) + * @all: True to remove all objects + * + * Remove a specific watch or all watches from an object. A notification is + * sent to the watcher to tell them that this happened. + */ +int remove_watch_from_object(struct watch_list *wlist, struct watch_queue *wq, + u64 id, bool all) +{ + struct watch_notification_removal n; + struct watch_queue *wqueue; + struct watch *watch; + int ret = -EBADSLT; + + rcu_read_lock(); + +again: + spin_lock(&wlist->lock); + hlist_for_each_entry(watch, &wlist->watchers, list_node) { + if (all || + (watch->id == id && rcu_access_pointer(watch->queue) == wq)) + goto found; + } + spin_unlock(&wlist->lock); + goto out; + +found: + ret = 0; + hlist_del_init_rcu(&watch->list_node); + rcu_assign_pointer(watch->watch_list, NULL); + spin_unlock(&wlist->lock); + + /* We now own the reference on watch that used to belong to wlist. */ + + n.watch.type = WATCH_TYPE_META; + n.watch.subtype = WATCH_META_REMOVAL_NOTIFICATION; + n.watch.info = watch->info_id | watch_sizeof(n.watch); + n.id = id; + if (id != 0) + n.watch.info = watch->info_id | watch_sizeof(n); + + wqueue = rcu_dereference(watch->queue); + + /* We don't need the watch list lock for the next bit as RCU is + * protecting *wqueue from deallocation. 
+ */ + if (wqueue) { + post_one_notification(wqueue, &n.watch); + + spin_lock_bh(&wqueue->lock); + + if (!hlist_unhashed(&watch->queue_node)) { + hlist_del_init_rcu(&watch->queue_node); + put_watch(watch); + } + + spin_unlock_bh(&wqueue->lock); + } + + if (wlist->release_watch) { + void (*release_watch)(struct watch *); + + release_watch = wlist->release_watch; + rcu_read_unlock(); + (*release_watch)(watch); + rcu_read_lock(); + } + put_watch(watch); + + if (all && !hlist_empty(&wlist->watchers)) + goto again; +out: + rcu_read_unlock(); + return ret; +} +EXPORT_SYMBOL(remove_watch_from_object); + +/* + * Remove all the watches that are contributory to a queue. This has the + * potential to race with removal of the watches by the destruction of the + * objects being watched or with the distribution of notifications. + */ +void watch_queue_clear(struct watch_queue *wqueue) +{ + struct watch_list *wlist; + struct watch *watch; + bool release; + + rcu_read_lock(); + spin_lock_bh(&wqueue->lock); + + /* Prevent new additions and prevent notifications from happening */ + wqueue->defunct = true; + + while (!hlist_empty(&wqueue->watches)) { + watch = hlist_entry(wqueue->watches.first, struct watch, queue_node); + hlist_del_init_rcu(&watch->queue_node); + /* We now own a ref on the watch. */ + spin_unlock_bh(&wqueue->lock); + + /* We can't do the next bit under the queue lock as we need to + * get the list lock - which would cause a deadlock if someone + * was removing from the opposite direction at the same time or + * posting a notification. + */ + wlist = rcu_dereference(watch->watch_list); + if (wlist) { + void (*release_watch)(struct watch *); + + spin_lock(&wlist->lock); + + release = !hlist_unhashed(&watch->list_node); + if (release) { + hlist_del_init_rcu(&watch->list_node); + rcu_assign_pointer(watch->watch_list, NULL); + + /* We now own a second ref on the watch. */ + } + + release_watch = wlist->release_watch; + spin_unlock(&wlist->lock); + + if (release) { + if (release_watch) { + rcu_read_unlock(); + /* This might need to call dput(), so + * we have to drop all the locks. + */ + (*release_watch)(watch); + rcu_read_lock(); + } + put_watch(watch); + } + } + + put_watch(watch); + spin_lock_bh(&wqueue->lock); + } + + spin_unlock_bh(&wqueue->lock); + rcu_read_unlock(); +} + +/** + * get_watch_queue - Get a watch queue from its file descriptor. + * @fd: The fd to query. + */ +struct watch_queue *get_watch_queue(int fd) +{ + struct pipe_inode_info *pipe; + struct watch_queue *wqueue = ERR_PTR(-EINVAL); + struct fd f; + + f = fdget(fd); + if (f.file) { + pipe = get_pipe_info(f.file, false); + if (pipe && pipe->watch_queue) { + wqueue = pipe->watch_queue; + kref_get(&wqueue->usage); + } + fdput(f); + } + + return wqueue; +} +EXPORT_SYMBOL(get_watch_queue); + +/* + * Initialise a watch queue + */ +int watch_queue_init(struct pipe_inode_info *pipe) +{ + struct watch_queue *wqueue; + + wqueue = kzalloc(sizeof(*wqueue), GFP_KERNEL); + if (!wqueue) + return -ENOMEM; + + wqueue->pipe = pipe; + kref_init(&wqueue->usage); + spin_lock_init(&wqueue->lock); + INIT_HLIST_HEAD(&wqueue->watches); + + pipe->watch_queue = wqueue; + return 0; +} -- cgit v1.2.3 From 998f50407ffc9370565c7ed3fcd1366adccdfbbf Mon Sep 17 00:00:00 2001 From: David Howells Date: Wed, 12 Feb 2020 13:58:35 +0000 Subject: security: Add hooks to rule on setting a watch Add security hooks that will allow an LSM to rule on whether or not a watch may be set. More than one hook is required as the watches watch different types of object. 
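As a rough sketch of how an LSM might use the hooks added here (illustrative only; "example_watch_key" and its ownership policy are invented for this example, not part of the patch):

	/* Hypothetical policy: only permit watches on keys that the calling
	 * process owns; return 0 to permit, -EPERM to refuse.
	 */
	static int example_watch_key(struct key *key)
	{
		if (!uid_eq(key->uid, current_fsuid()))
			return -EPERM;
		return 0;
	}

	static struct security_hook_list example_hooks[] __lsm_ro_after_init = {
		LSM_HOOK_INIT(watch_key, example_watch_key),
	};

A real LSM would register these with security_add_hooks() and substitute its own policy.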
Signed-off-by: David Howells Acked-by: James Morris cc: Casey Schaufler cc: Stephen Smalley cc: linux-security-module@vger.kernel.org --- include/linux/lsm_hook_defs.h | 4 ++++ include/linux/lsm_hooks.h | 5 +++++ include/linux/security.h | 9 +++++++++ security/security.c | 7 +++++++ 4 files changed, 25 insertions(+) (limited to 'include/linux') diff --git a/include/linux/lsm_hook_defs.h b/include/linux/lsm_hook_defs.h index e0def45b5cc5..a54f49e95708 100644 --- a/include/linux/lsm_hook_defs.h +++ b/include/linux/lsm_hook_defs.h @@ -256,6 +256,10 @@ LSM_HOOK(int, 0, inode_getsecctx, struct inode *inode, void **ctx, #if defined(CONFIG_SECURITY) && defined(CONFIG_WATCH_QUEUE) LSM_HOOK(int, 0, post_notification, const struct cred *w_cred, const struct cred *cred, struct watch_notification *n) +#endif /* CONFIG_SECURITY && CONFIG_WATCH_QUEUE */ + +#if defined(CONFIG_SECURITY) && defined(CONFIG_KEY_NOTIFICATIONS) +LSM_HOOK(int, 0, watch_key, struct key *key) #endif /* CONFIG_SECURITY && CONFIG_KEY_NOTIFICATIONS */ #ifdef CONFIG_SECURITY_NETWORK diff --git a/include/linux/lsm_hooks.h b/include/linux/lsm_hooks.h index 0b5e5769b836..3f1374cffb76 100644 --- a/include/linux/lsm_hooks.h +++ b/include/linux/lsm_hooks.h @@ -1446,6 +1446,11 @@ * @cred: The event-triggerer's credentials * @n: The notification being posted * + * @watch_key: + * Check to see if a process is allowed to watch for event notifications + * from a key or keyring. + * @key: The key to watch. + * * Security hooks for using the eBPF maps and programs functionalities through * eBPF syscalls. * diff --git a/include/linux/security.h b/include/linux/security.h index 9a5d12ab491b..e7914e4e0b02 100644 --- a/include/linux/security.h +++ b/include/linux/security.h @@ -1290,6 +1290,15 @@ static inline int security_post_notification(const struct cred *w_cred, } #endif +#if defined(CONFIG_SECURITY) && defined(CONFIG_KEY_NOTIFICATIONS) +int security_watch_key(struct key *key); +#else +static inline int security_watch_key(struct key *key) +{ + return 0; +} +#endif + #ifdef CONFIG_SECURITY_NETWORK int security_unix_stream_connect(struct sock *sock, struct sock *other, struct sock *newsk); diff --git a/security/security.c b/security/security.c index 7d55607120b4..c73334ab2882 100644 --- a/security/security.c +++ b/security/security.c @@ -2016,6 +2016,13 @@ int security_post_notification(const struct cred *w_cred, } #endif /* CONFIG_WATCH_QUEUE */ +#ifdef CONFIG_KEY_NOTIFICATIONS +int security_watch_key(struct key *key) +{ + return call_int_hook(watch_key, 0, key); +} +#endif + #ifdef CONFIG_SECURITY_NETWORK int security_unix_stream_connect(struct sock *sock, struct sock *other, struct sock *newsk) -- cgit v1.2.3 From f7e47677e39a03057dcced2016c92a9c868693ec Mon Sep 17 00:00:00 2001 From: David Howells Date: Tue, 14 Jan 2020 17:07:11 +0000 Subject: watch_queue: Add a key/keyring notification facility Add a key/keyring change notification facility whereby notifications about changes in key and keyring content and attributes can be received. 
Firstly, an event queue needs to be created: pipe2(fds, O_NOTIFICATION_PIPE); ioctl(fds[1], IOC_WATCH_QUEUE_SET_SIZE, 256); then a notification can be set up to report notifications via that queue: struct watch_notification_filter filter = { .nr_filters = 1, .filters = { [0] = { .type = WATCH_TYPE_KEY_NOTIFY, .subtype_filter[0] = UINT_MAX, }, }, }; ioctl(fds[1], IOC_WATCH_QUEUE_SET_FILTER, &filter); keyctl_watch_key(KEY_SPEC_SESSION_KEYRING, fds[1], 0x01); After that, records will be placed into the queue when events occur in which keys are changed in some way. Records are of the following format: struct key_notification { struct watch_notification watch; __u32 key_id; __u32 aux; } *n; Where: n->watch.type will be WATCH_TYPE_KEY_NOTIFY. n->watch.subtype will indicate the type of event, such as NOTIFY_KEY_REVOKED. n->watch.info & WATCH_INFO_LENGTH will indicate the length of the record. n->watch.info & WATCH_INFO_ID will be the second argument to keyctl_watch_key(), shifted. n->key_id will be the ID of the affected key. n->aux will hold subtype-dependent information, such as the key being linked into the keyring specified by n->key_id in the case of NOTIFY_KEY_LINKED. Note that it is permissible for event records to be of variable length - or, at least, the length may be dependent on the subtype. Note also that the queue can be shared between multiple notifications of various types. Signed-off-by: David Howells Reviewed-by: James Morris --- Documentation/security/keys/core.rst | 57 +++++++++++++++++++++ include/linux/key.h | 3 ++ include/uapi/linux/keyctl.h | 2 + include/uapi/linux/watch_queue.h | 28 +++++++++- security/keys/Kconfig | 9 ++++ security/keys/compat.c | 3 ++ security/keys/gc.c | 5 ++ security/keys/internal.h | 30 ++++++++++- security/keys/key.c | 38 +++++++++----- security/keys/keyctl.c | 99 ++++++++++++++++++++++++++++++++++-- security/keys/keyring.c | 20 +++++--- security/keys/request_key.c | 4 +- 12 files changed, 270 insertions(+), 28 deletions(-) (limited to 'include/linux') diff --git a/Documentation/security/keys/core.rst b/Documentation/security/keys/core.rst index d9b0b859018b..6cff2b5f88ed 100644 --- a/Documentation/security/keys/core.rst +++ b/Documentation/security/keys/core.rst @@ -1026,6 +1026,63 @@ The keyctl syscall functions are: written into the output buffer. Verification returns 0 on success. + * Watch a key or keyring for changes:: + + long keyctl(KEYCTL_WATCH_KEY, key_serial_t key, int queue_fd, + const struct watch_notification_filter *filter); + + This will set or remove a watch for changes on the specified key or + keyring. + + "key" is the ID of the key to be watched. + + "queue_fd" is a file descriptor referring to an open notification pipe + which manages the buffer into which notifications will be delivered. + + "filter" is either NULL to remove a watch or a filter specification to + indicate what events are required from the key. + + See Documentation/watch_queue.rst for more information. + + Note that only one watch may be emplaced for any particular { key, + queue_fd } combination.
+ + Notification records look like:: + + struct key_notification { + struct watch_notification watch; + __u32 key_id; + __u32 aux; + }; + + In this, watch::type will be "WATCH_TYPE_KEY_NOTIFY" and subtype will be + one of:: + + NOTIFY_KEY_INSTANTIATED + NOTIFY_KEY_UPDATED + NOTIFY_KEY_LINKED + NOTIFY_KEY_UNLINKED + NOTIFY_KEY_CLEARED + NOTIFY_KEY_REVOKED + NOTIFY_KEY_INVALIDATED + NOTIFY_KEY_SETATTR + + Where these indicate a key being instantiated/rejected, updated, a link + being made in a keyring, a link being removed from a keyring, a keyring + being cleared, a key being revoked, a key being invalidated or a key + having one of its attributes changed (user, group, perm, timeout, + restriction). + + If a watched key is deleted, a basic watch_notification will be issued + with "type" set to WATCH_TYPE_META and "subtype" set to + WATCH_META_REMOVAL_NOTIFICATION. The watchpoint ID will be set in the + "info" field. + + This needs to be configured by enabling: + + "Provide key/keyring change notifications" (KEY_NOTIFICATIONS) + + Kernel Services =============== diff --git a/include/linux/key.h b/include/linux/key.h index 6cf8e71cf8b7..b99b40db08fc 100644 --- a/include/linux/key.h +++ b/include/linux/key.h @@ -176,6 +176,9 @@ struct key { struct list_head graveyard_link; struct rb_node serial_node; }; +#ifdef CONFIG_KEY_NOTIFICATIONS + struct watch_list *watchers; /* Entities watching this key for changes */ +#endif struct rw_semaphore sem; /* change vs change sem */ struct key_user *user; /* owner of this key */ void *security; /* security data for this key */ diff --git a/include/uapi/linux/keyctl.h b/include/uapi/linux/keyctl.h index ed3d5893830d..4c8884eea808 100644 --- a/include/uapi/linux/keyctl.h +++ b/include/uapi/linux/keyctl.h @@ -69,6 +69,7 @@ #define KEYCTL_RESTRICT_KEYRING 29 /* Restrict keys allowed to link to a keyring */ #define KEYCTL_MOVE 30 /* Move keys between keyrings */ #define KEYCTL_CAPABILITIES 31 /* Find capabilities of keyrings subsystem */ +#define KEYCTL_WATCH_KEY 32 /* Watch a key or ring of keys for changes */ /* keyctl structures */ struct keyctl_dh_params { @@ -130,5 +131,6 @@ struct keyctl_pkey_params { #define KEYCTL_CAPS0_MOVE 0x80 /* KEYCTL_MOVE supported */ #define KEYCTL_CAPS1_NS_KEYRING_NAME 0x01 /* Keyring names are per-user_namespace */ #define KEYCTL_CAPS1_NS_KEY_TAG 0x02 /* Key indexing can include a namespace tag */ +#define KEYCTL_CAPS1_NOTIFICATIONS 0x04 /* Keys generate watchable notifications */ #endif /* _LINUX_KEYCTL_H */ diff --git a/include/uapi/linux/watch_queue.h b/include/uapi/linux/watch_queue.h index 3a5790f1f05d..c3d8320b5d3a 100644 --- a/include/uapi/linux/watch_queue.h +++ b/include/uapi/linux/watch_queue.h @@ -13,7 +13,8 @@ enum watch_notification_type { WATCH_TYPE_META = 0, /* Special record */ - WATCH_TYPE__NR = 1 + WATCH_TYPE_KEY_NOTIFY = 1, /* Key change event notification */ + WATCH_TYPE__NR = 2 }; enum watch_meta_notification_subtype { @@ -75,4 +76,29 @@ struct watch_notification_removal { __u64 id; /* Type-dependent identifier */ }; + +/* + * Type of key/keyring change notification.
+ */ +enum key_notification_subtype { + NOTIFY_KEY_INSTANTIATED = 0, /* Key was instantiated (aux is error code) */ + NOTIFY_KEY_UPDATED = 1, /* Key was updated */ + NOTIFY_KEY_LINKED = 2, /* Key (aux) was added to watched keyring */ + NOTIFY_KEY_UNLINKED = 3, /* Key (aux) was removed from watched keyring */ + NOTIFY_KEY_CLEARED = 4, /* Keyring was cleared */ + NOTIFY_KEY_REVOKED = 5, /* Key was revoked */ + NOTIFY_KEY_INVALIDATED = 6, /* Key was invalidated */ + NOTIFY_KEY_SETATTR = 7, /* Key's attributes got changed */ +}; + +/* + * Key/keyring notification record. + * - watch.type = WATCH_TYPE_KEY_NOTIFY + * - watch.subtype = enum key_notification_type + */ +struct key_notification { + struct watch_notification watch; + __u32 key_id; /* The key/keyring affected */ + __u32 aux; /* Per-type auxiliary data */ +}; + #endif /* _UAPI_LINUX_WATCH_QUEUE_H */ diff --git a/security/keys/Kconfig b/security/keys/Kconfig index 47c041563d41..d4dc5ea208af 100644 --- a/security/keys/Kconfig +++ b/security/keys/Kconfig @@ -116,3 +116,12 @@ config KEY_DH_OPERATIONS in the kernel. If you are unsure as to whether this is required, answer N. + +config KEY_NOTIFICATIONS + bool "Provide key/keyring change notifications" + depends on KEYS && WATCH_QUEUE + help + This option provides support for getting change notifications on keys + and keyrings on which the caller has View permission. This makes use + of the /dev/watch_queue misc device to handle the notification + buffer and provides KEYCTL_WATCH_KEY to enable/disable watches. diff --git a/security/keys/compat.c b/security/keys/compat.c index b975f8f11124..6ee9d8f6a4a5 100644 --- a/security/keys/compat.c +++ b/security/keys/compat.c @@ -156,6 +156,9 @@ COMPAT_SYSCALL_DEFINE5(keyctl, u32, option, case KEYCTL_CAPABILITIES: return keyctl_capabilities(compat_ptr(arg2), arg3); + case KEYCTL_WATCH_KEY: + return keyctl_watch_key(arg2, arg3, arg4); + default: return -EOPNOTSUPP; } diff --git a/security/keys/gc.c b/security/keys/gc.c index 671dd730ecfc..3c90807476eb 100644 --- a/security/keys/gc.c +++ b/security/keys/gc.c @@ -131,6 +131,11 @@ static noinline void key_gc_unused_keys(struct list_head *keys) kdebug("- %u", key->serial); key_check(key); +#ifdef CONFIG_KEY_NOTIFICATIONS + remove_watch_list(key->watchers, key->serial); + key->watchers = NULL; +#endif + /* Throw away the key data if the key is instantiated */ if (state == KEY_IS_POSITIVE && key->type->destroy) key->type->destroy(key); diff --git a/security/keys/internal.h b/security/keys/internal.h index 6d0ca48ae9a5..28e17f4f3328 100644 --- a/security/keys/internal.h +++ b/security/keys/internal.h @@ -15,6 +15,7 @@ #include #include #include +#include #include #include #include @@ -99,7 +100,8 @@ extern int __key_link_begin(struct key *keyring, const struct keyring_index_key *index_key, struct assoc_array_edit **_edit); extern int __key_link_check_live_key(struct key *keyring, struct key *key); -extern void __key_link(struct key *key, struct assoc_array_edit **_edit); +extern void __key_link(struct key *keyring, struct key *key, + struct assoc_array_edit **_edit); extern void __key_link_end(struct key *keyring, const struct keyring_index_key *index_key, struct assoc_array_edit *edit); @@ -183,6 +185,23 @@ extern int key_task_permission(const key_ref_t key_ref, const struct cred *cred, key_perm_t perm); +static inline void notify_key(struct key *key, + enum key_notification_subtype subtype, u32 aux) +{ +#ifdef CONFIG_KEY_NOTIFICATIONS + struct key_notification n = { + .watch.type = WATCH_TYPE_KEY_NOTIFY, + 
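
On the consumer side, a dispatcher over these subtypes might look like the following sketch (needs <stdio.h> in addition to the headers from the first sketch; it also handles the removal meta record described earlier, whose constant name is taken from the UAPI watch_queue.h; printf() stands in for real handling)::

    static void handle_record(struct watch_notification *wn)
    {
            struct key_notification *kn;

            if (wn->type == WATCH_TYPE_META) {
                    if (wn->subtype == WATCH_META_REMOVAL_NOTIFICATION)
                            printf("watched key was removed\n");
                    return;
            }
            if (wn->type != WATCH_TYPE_KEY_NOTIFY)
                    return;

            kn = (struct key_notification *)wn;
            switch (wn->subtype) {
            case NOTIFY_KEY_LINKED:
                    printf("key %u linked into keyring %u\n",
                           kn->aux, kn->key_id);
                    break;
            case NOTIFY_KEY_UNLINKED:
                    printf("key %u unlinked from keyring %u\n",
                           kn->aux, kn->key_id);
                    break;
            case NOTIFY_KEY_REVOKED:
                    printf("key %u revoked\n", kn->key_id);
                    break;
            default:
                    printf("key %u: event %u, aux %u\n",
                           kn->key_id, (unsigned int)wn->subtype, kn->aux);
                    break;
            }
    }
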
.watch.subtype = subtype, + .watch.info = watch_sizeof(n), + .key_id = key_serial(key), + .aux = aux, + }; + + post_watch_notification(key->watchers, &n.watch, current_cred(), + n.key_id); +#endif +} + /* * Check to see whether permission is granted to use a key in the desired way. */ @@ -333,6 +352,15 @@ static inline long keyctl_pkey_e_d_s(int op, extern long keyctl_capabilities(unsigned char __user *_buffer, size_t buflen); +#ifdef CONFIG_KEY_NOTIFICATIONS +extern long keyctl_watch_key(key_serial_t, int, int); +#else +static inline long keyctl_watch_key(key_serial_t key_id, int watch_fd, int watch_id) +{ + return -EOPNOTSUPP; +} +#endif + /* * Debugging key validation */ diff --git a/security/keys/key.c b/security/keys/key.c index e959b3c96b48..e282c6179b21 100644 --- a/security/keys/key.c +++ b/security/keys/key.c @@ -444,6 +444,7 @@ static int __key_instantiate_and_link(struct key *key, /* mark the key as being instantiated */ atomic_inc(&key->user->nikeys); mark_key_instantiated(key, 0); + notify_key(key, NOTIFY_KEY_INSTANTIATED, 0); if (test_and_clear_bit(KEY_FLAG_USER_CONSTRUCT, &key->flags)) awaken = 1; @@ -453,7 +454,7 @@ static int __key_instantiate_and_link(struct key *key, if (test_bit(KEY_FLAG_KEEP, &keyring->flags)) set_bit(KEY_FLAG_KEEP, &key->flags); - __key_link(key, _edit); + __key_link(keyring, key, _edit); } /* disable the authorisation key */ @@ -601,6 +602,7 @@ int key_reject_and_link(struct key *key, /* mark the key as being negatively instantiated */ atomic_inc(&key->user->nikeys); mark_key_instantiated(key, -error); + notify_key(key, NOTIFY_KEY_INSTANTIATED, -error); key->expiry = ktime_get_real_seconds() + timeout; key_schedule_gc(key->expiry + key_gc_delay); @@ -611,7 +613,7 @@ int key_reject_and_link(struct key *key, /* and link it into the destination keyring */ if (keyring && link_ret == 0) - __key_link(key, &edit); + __key_link(keyring, key, &edit); /* disable the authorisation key */ if (authkey) @@ -764,9 +766,11 @@ static inline key_ref_t __key_update(key_ref_t key_ref, down_write(&key->sem); ret = key->type->update(key, prep); - if (ret == 0) + if (ret == 0) { /* Updating a negative key positively instantiates it */ mark_key_instantiated(key, 0); + notify_key(key, NOTIFY_KEY_UPDATED, 0); + } up_write(&key->sem); @@ -1023,9 +1027,11 @@ int key_update(key_ref_t key_ref, const void *payload, size_t plen) down_write(&key->sem); ret = key->type->update(key, &prep); - if (ret == 0) + if (ret == 0) { /* Updating a negative key positively instantiates it */ mark_key_instantiated(key, 0); + notify_key(key, NOTIFY_KEY_UPDATED, 0); + } up_write(&key->sem); @@ -1057,15 +1063,17 @@ void key_revoke(struct key *key) * instantiated */ down_write_nested(&key->sem, 1); - if (!test_and_set_bit(KEY_FLAG_REVOKED, &key->flags) && - key->type->revoke) - key->type->revoke(key); - - /* set the death time to no more than the expiry time */ - time = ktime_get_real_seconds(); - if (key->revoked_at == 0 || key->revoked_at > time) { - key->revoked_at = time; - key_schedule_gc(key->revoked_at + key_gc_delay); + if (!test_and_set_bit(KEY_FLAG_REVOKED, &key->flags)) { + notify_key(key, NOTIFY_KEY_REVOKED, 0); + if (key->type->revoke) + key->type->revoke(key); + + /* set the death time to no more than the expiry time */ + time = ktime_get_real_seconds(); + if (key->revoked_at == 0 || key->revoked_at > time) { + key->revoked_at = time; + key_schedule_gc(key->revoked_at + key_gc_delay); + } } up_write(&key->sem); @@ -1087,8 +1095,10 @@ void key_invalidate(struct key *key) if 
(!test_bit(KEY_FLAG_INVALIDATED, &key->flags)) { down_write_nested(&key->sem, 1); - if (!test_and_set_bit(KEY_FLAG_INVALIDATED, &key->flags)) + if (!test_and_set_bit(KEY_FLAG_INVALIDATED, &key->flags)) { + notify_key(key, NOTIFY_KEY_INVALIDATED, 0); key_schedule_gc_links(); + } up_write(&key->sem); } } diff --git a/security/keys/keyctl.c b/security/keys/keyctl.c index 5e01192e222a..7d8de1c9a478 100644 --- a/security/keys/keyctl.c +++ b/security/keys/keyctl.c @@ -37,7 +37,9 @@ static const unsigned char keyrings_capabilities[2] = { KEYCTL_CAPS0_MOVE ), [1] = (KEYCTL_CAPS1_NS_KEYRING_NAME | - KEYCTL_CAPS1_NS_KEY_TAG), + KEYCTL_CAPS1_NS_KEY_TAG | + (IS_ENABLED(CONFIG_KEY_NOTIFICATIONS) ? KEYCTL_CAPS1_NOTIFICATIONS : 0) + ), }; static int key_get_type_from_user(char *type, @@ -1039,6 +1041,7 @@ long keyctl_chown_key(key_serial_t id, uid_t user, gid_t group) if (group != (gid_t) -1) key->gid = gid; + notify_key(key, NOTIFY_KEY_SETATTR, 0); ret = 0; error_put: @@ -1089,6 +1092,7 @@ long keyctl_setperm_key(key_serial_t id, key_perm_t perm) /* if we're not the sysadmin, we can only change a key that we own */ if (capable(CAP_SYS_ADMIN) || uid_eq(key->uid, current_fsuid())) { key->perm = perm; + notify_key(key, NOTIFY_KEY_SETATTR, 0); ret = 0; } @@ -1480,10 +1484,12 @@ long keyctl_set_timeout(key_serial_t id, unsigned timeout) okay: key = key_ref_to_ptr(key_ref); ret = 0; - if (test_bit(KEY_FLAG_KEEP, &key->flags)) + if (test_bit(KEY_FLAG_KEEP, &key->flags)) { ret = -EPERM; - else + } else { key_set_timeout(key, timeout); + notify_key(key, NOTIFY_KEY_SETATTR, 0); + } key_put(key); error: @@ -1757,6 +1763,90 @@ error: return ret; } +#ifdef CONFIG_KEY_NOTIFICATIONS +/* + * Watch for changes to a key. + * + * The caller must have View permission to watch a key or keyring. + */ +long keyctl_watch_key(key_serial_t id, int watch_queue_fd, int watch_id) +{ + struct watch_queue *wqueue; + struct watch_list *wlist = NULL; + struct watch *watch = NULL; + struct key *key; + key_ref_t key_ref; + long ret; + + if (watch_id < -1 || watch_id > 0xff) + return -EINVAL; + + key_ref = lookup_user_key(id, KEY_LOOKUP_CREATE, KEY_NEED_VIEW); + if (IS_ERR(key_ref)) + return PTR_ERR(key_ref); + key = key_ref_to_ptr(key_ref); + + wqueue = get_watch_queue(watch_queue_fd); + if (IS_ERR(wqueue)) { + ret = PTR_ERR(wqueue); + goto err_key; + } + + if (watch_id >= 0) { + ret = -ENOMEM; + if (!key->watchers) { + wlist = kzalloc(sizeof(*wlist), GFP_KERNEL); + if (!wlist) + goto err_wqueue; + init_watch_list(wlist, NULL); + } + + watch = kzalloc(sizeof(*watch), GFP_KERNEL); + if (!watch) + goto err_wlist; + + init_watch(watch, wqueue); + watch->id = key->serial; + watch->info_id = (u32)watch_id << WATCH_INFO_ID__SHIFT; + + ret = security_watch_key(key); + if (ret < 0) + goto err_watch; + + down_write(&key->sem); + if (!key->watchers) { + key->watchers = wlist; + wlist = NULL; + } + + ret = add_watch_to_object(watch, key->watchers); + up_write(&key->sem); + + if (ret == 0) + watch = NULL; + } else { + ret = -EBADSLT; + if (key->watchers) { + down_write(&key->sem); + ret = remove_watch_from_object(key->watchers, + wqueue, key_serial(key), + false); + up_write(&key->sem); + } + } + +err_watch: + kfree(watch); +err_wlist: + kfree(wlist); +err_wqueue: + put_watch_queue(wqueue); +err_key: + key_put(key); + return ret; +} +#endif /* CONFIG_KEY_NOTIFICATIONS */ + /* * Get keyrings subsystem capabilities. 
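
From the bounds check in keyctl_watch_key() above, watch_id must lie in [-1, 0xff]: a non-negative value installs a watch with that ID, while -1 removes an existing watch (failing with EBADSLT if none is installed). Where the library wrapper is unavailable, the call can be made directly; a sketch, with KEYCTL_WATCH_KEY being 32 per the UAPI header above::

    #include <sys/syscall.h>
    #include <unistd.h>
    #include <linux/keyctl.h>
    #include <keyutils.h>           /* for key_serial_t */

    static long watch_key(key_serial_t key, int queue_fd, int watch_id)
    {
            return syscall(__NR_keyctl, KEYCTL_WATCH_KEY,
                           key, queue_fd, watch_id);
    }
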
*/ @@ -1926,6 +2016,9 @@ SYSCALL_DEFINE5(keyctl, int, option, unsigned long, arg2, unsigned long, arg3, case KEYCTL_CAPABILITIES: return keyctl_capabilities((unsigned char __user *)arg2, (size_t)arg3); + case KEYCTL_WATCH_KEY: + return keyctl_watch_key((key_serial_t)arg2, (int)arg3, (int)arg4); + default: return -EOPNOTSUPP; } diff --git a/security/keys/keyring.c b/security/keys/keyring.c index 5ca620d31cd3..14abfe765b7e 100644 --- a/security/keys/keyring.c +++ b/security/keys/keyring.c @@ -1056,12 +1056,14 @@ int keyring_restrict(key_ref_t keyring_ref, const char *type, down_write(&keyring->sem); down_write(&keyring_serialise_restrict_sem); - if (keyring->restrict_link) + if (keyring->restrict_link) { ret = -EEXIST; - else if (keyring_detect_restriction_cycle(keyring, restrict_link)) + } else if (keyring_detect_restriction_cycle(keyring, restrict_link)) { ret = -EDEADLK; - else + } else { keyring->restrict_link = restrict_link; + notify_key(keyring, NOTIFY_KEY_SETATTR, 0); + } up_write(&keyring_serialise_restrict_sem); up_write(&keyring->sem); @@ -1362,12 +1364,14 @@ int __key_link_check_live_key(struct key *keyring, struct key *key) * holds at most one link to any given key of a particular type+description * combination. */ -void __key_link(struct key *key, struct assoc_array_edit **_edit) +void __key_link(struct key *keyring, struct key *key, + struct assoc_array_edit **_edit) { __key_get(key); assoc_array_insert_set_object(*_edit, keyring_key_to_ptr(key)); assoc_array_apply_edit(*_edit); *_edit = NULL; + notify_key(keyring, NOTIFY_KEY_LINKED, key_serial(key)); } /* @@ -1451,7 +1455,7 @@ int key_link(struct key *keyring, struct key *key) if (ret == 0) ret = __key_link_check_live_key(keyring, key); if (ret == 0) - __key_link(key, &edit); + __key_link(keyring, key, &edit); error_end: __key_link_end(keyring, &key->index_key, edit); @@ -1483,7 +1487,7 @@ static int __key_unlink_begin(struct key *keyring, struct key *key, struct assoc_array_edit *edit; BUG_ON(*_edit != NULL); - + edit = assoc_array_delete(&keyring->keys, &keyring_assoc_array_ops, &key->index_key); if (IS_ERR(edit)) @@ -1503,6 +1507,7 @@ static void __key_unlink(struct key *keyring, struct key *key, struct assoc_array_edit **_edit) { assoc_array_apply_edit(*_edit); + notify_key(keyring, NOTIFY_KEY_UNLINKED, key_serial(key)); *_edit = NULL; key_payload_reserve(keyring, keyring->datalen - KEYQUOTA_LINK_BYTES); } @@ -1621,7 +1626,7 @@ int key_move(struct key *key, goto error; __key_unlink(from_keyring, key, &from_edit); - __key_link(key, &to_edit); + __key_link(to_keyring, key, &to_edit); error: __key_link_end(to_keyring, &key->index_key, to_edit); __key_unlink_end(from_keyring, key, from_edit); @@ -1655,6 +1660,7 @@ int keyring_clear(struct key *keyring) } else { if (edit) assoc_array_apply_edit(edit); + notify_key(keyring, NOTIFY_KEY_CLEARED, 0); key_payload_reserve(keyring, 0); ret = 0; } diff --git a/security/keys/request_key.c b/security/keys/request_key.c index 957b9e3e1492..e1b9f1a80676 100644 --- a/security/keys/request_key.c +++ b/security/keys/request_key.c @@ -418,7 +418,7 @@ static int construct_alloc_key(struct keyring_search_context *ctx, goto key_already_present; if (dest_keyring) - __key_link(key, &edit); + __key_link(dest_keyring, key, &edit); mutex_unlock(&key_construction_mutex); if (dest_keyring) @@ -437,7 +437,7 @@ key_already_present: if (dest_keyring) { ret = __key_link_check_live_key(dest_keyring, key); if (ret == 0) - __key_link(key, &edit); + __key_link(dest_keyring, key, &edit); 
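
A detail worth noting in the key_move() hunk above: the unlink edit is applied before the link edit, so a process watching both keyrings sees a single move as two records, in this order (illustrative notation, not literal output)::

    /*
     * KEYCTL_MOVE of key K from keyring A to keyring B:
     *   { .watch.subtype = NOTIFY_KEY_UNLINKED, .key_id = A, .aux = K }
     *   { .watch.subtype = NOTIFY_KEY_LINKED,   .key_id = B, .aux = K }
     */
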
__key_link_end(dest_keyring, &ctx->index_key, edit); if (ret < 0) goto link_check_failed; -- cgit v1.2.3 From 80961525880e58e0efcf536fd357d642c68f4176 Mon Sep 17 00:00:00 2001 From: Mike Leach Date: Mon, 18 May 2020 12:02:22 -0600 Subject: coresight: Add generic sysfs link creation functions To allow the connections between coresight components to be represented in sysfs, generic methods for creating sysfs links between two coresight devices are added. Signed-off-by: Mike Leach Reviewed-by: Suzuki K Poulose Signed-off-by: Mathieu Poirier Link: https://lore.kernel.org/r/20200518180242.7916-4-mathieu.poirier@linaro.org Signed-off-by: Greg Kroah-Hartman --- drivers/hwtracing/coresight/Makefile | 3 +- drivers/hwtracing/coresight/coresight-priv.h | 4 + drivers/hwtracing/coresight/coresight-sysfs.c | 124 ++++++++++++++++++++++++++ include/linux/coresight.h | 20 +++++ 4 files changed, 150 insertions(+), 1 deletion(-) create mode 100644 drivers/hwtracing/coresight/coresight-sysfs.c (limited to 'include/linux') diff --git a/drivers/hwtracing/coresight/Makefile b/drivers/hwtracing/coresight/Makefile index 0e3e72f0f510..19497d1d92bf 100644 --- a/drivers/hwtracing/coresight/Makefile +++ b/drivers/hwtracing/coresight/Makefile @@ -2,7 +2,8 @@ # # Makefile for CoreSight drivers. # -obj-$(CONFIG_CORESIGHT) += coresight.o coresight-etm-perf.o coresight-platform.o +obj-$(CONFIG_CORESIGHT) += coresight.o coresight-etm-perf.o \ + coresight-platform.o coresight-sysfs.o obj-$(CONFIG_CORESIGHT_LINK_AND_SINK_TMC) += coresight-tmc.o \ coresight-tmc-etf.o \ coresight-tmc-etr.o diff --git a/drivers/hwtracing/coresight/coresight-priv.h b/drivers/hwtracing/coresight/coresight-priv.h index 1cad642f27aa..a4a658d46045 100644 --- a/drivers/hwtracing/coresight/coresight-priv.h +++ b/drivers/hwtracing/coresight/coresight-priv.h @@ -153,6 +153,10 @@ struct coresight_device *coresight_get_sink_by_id(u32 id); struct list_head *coresight_build_path(struct coresight_device *csdev, struct coresight_device *sink); void coresight_release_path(struct list_head *path); +int coresight_add_sysfs_link(struct coresight_sysfs_link *info); +void coresight_remove_sysfs_link(struct coresight_sysfs_link *info); +int coresight_create_conns_sysfs_group(struct coresight_device *csdev); +void coresight_remove_conns_sysfs_group(struct coresight_device *csdev); #ifdef CONFIG_CORESIGHT_SOURCE_ETM3X extern int etm_readl_cp14(u32 off, unsigned int *val); diff --git a/drivers/hwtracing/coresight/coresight-sysfs.c b/drivers/hwtracing/coresight/coresight-sysfs.c new file mode 100644 index 000000000000..6759f78733ab --- /dev/null +++ b/drivers/hwtracing/coresight/coresight-sysfs.c @@ -0,0 +1,124 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright (c) 2019, Linaro Limited, All rights reserved. + * Author: Mike Leach + */ + +#include +#include + +#include "coresight-priv.h" + +/* + * Connections group - links attribute. + * Count of created links between coresight components in the group. + */ +static ssize_t nr_links_show(struct device *dev, + struct device_attribute *attr, + char *buf) +{ + struct coresight_device *csdev = to_coresight_device(dev); + + return sprintf(buf, "%d\n", csdev->nr_links); +} +static DEVICE_ATTR_RO(nr_links); + +static struct attribute *coresight_conns_attrs[] = { + &dev_attr_nr_links.attr, + NULL, +}; + +static struct attribute_group coresight_conns_group = { + .attrs = coresight_conns_attrs, + .name = "connections", +}; + +/* + * Create connections group for CoreSight devices. 
+ * This group will then be used to collate the sysfs links between + * devices. + */ +int coresight_create_conns_sysfs_group(struct coresight_device *csdev) +{ + int ret = 0; + + if (!csdev) + return -EINVAL; + + ret = sysfs_create_group(&csdev->dev.kobj, &coresight_conns_group); + if (ret) + return ret; + + csdev->has_conns_grp = true; + return ret; +} + +void coresight_remove_conns_sysfs_group(struct coresight_device *csdev) +{ + if (!csdev) + return; + + if (csdev->has_conns_grp) { + sysfs_remove_group(&csdev->dev.kobj, &coresight_conns_group); + csdev->has_conns_grp = false; + } +} + +int coresight_add_sysfs_link(struct coresight_sysfs_link *info) +{ + int ret = 0; + + if (!info) + return -EINVAL; + if (!info->orig || !info->target || + !info->orig_name || !info->target_name) + return -EINVAL; + if (!info->orig->has_conns_grp || !info->target->has_conns_grp) + return -EINVAL; + + /* first link orig->target */ + ret = sysfs_add_link_to_group(&info->orig->dev.kobj, + coresight_conns_group.name, + &info->target->dev.kobj, + info->orig_name); + if (ret) + return ret; + + /* second link target->orig */ + ret = sysfs_add_link_to_group(&info->target->dev.kobj, + coresight_conns_group.name, + &info->orig->dev.kobj, + info->target_name); + + /* error in second link - remove first - otherwise inc counts */ + if (ret) { + sysfs_remove_link_from_group(&info->orig->dev.kobj, + coresight_conns_group.name, + info->orig_name); + } else { + info->orig->nr_links++; + info->target->nr_links++; + } + + return ret; +} + +void coresight_remove_sysfs_link(struct coresight_sysfs_link *info) +{ + if (!info) + return; + if (!info->orig || !info->target || + !info->orig_name || !info->target_name) + return; + + sysfs_remove_link_from_group(&info->orig->dev.kobj, + coresight_conns_group.name, + info->orig_name); + + sysfs_remove_link_from_group(&info->target->dev.kobj, + coresight_conns_group.name, + info->target_name); + + info->orig->nr_links--; + info->target->nr_links--; +} diff --git a/include/linux/coresight.h b/include/linux/coresight.h index 193cc9dbf448..a2ec25e02ca9 100644 --- a/include/linux/coresight.h +++ b/include/linux/coresight.h @@ -148,6 +148,20 @@ struct coresight_connection { struct coresight_device *child_dev; }; +/** + * struct coresight_sysfs_link - representation of a connection in sysfs. + * @orig: Originating (master) coresight device for the link. + * @orig_name: Name to use for the link orig->target. + * @target: Target (slave) coresight device for the link. + * @target_name: Name to use for the link target->orig. + */ +struct coresight_sysfs_link { + struct coresight_device *orig; + const char *orig_name; + struct coresight_device *target; + const char *target_name; +}; + /** * struct coresight_device - representation of a device as used by the framework * @pdata: Platform data with device connections associated to this device. @@ -165,6 +179,9 @@ struct coresight_connection { * @ea: Device attribute for sink representation under PMU directory. * @ect_dev: Associated cross trigger device. Not part of the trace data * path or connections. + * @nr_links: number of sysfs links created to other components from this + * device. These will appear in the "connections" group. + * @has_conns_grp: Have added a "connections" group for sysfs links. 
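
Taken together, the expected calling pattern is: create the "connections" group at device registration time, then add links pairwise. A sketch (devA/devB are placeholder coresight devices whose groups have already been created; the info structure must be kept around, since removal takes the same structure)::

    static int example_link(struct coresight_device *devA,
                            struct coresight_device *devB,
                            struct coresight_sysfs_link *link)
    {
            link->orig        = devA;
            link->orig_name   = "out:0";    /* name appearing under devA */
            link->target      = devB;
            link->target_name = "in:1";     /* name appearing under devB */

            return coresight_add_sysfs_link(link);
            /* later: coresight_remove_sysfs_link(link); */
    }
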
*/ struct coresight_device { struct coresight_platform_data *pdata; @@ -180,6 +197,9 @@ struct coresight_device { struct dev_ext_attribute *ea; /* cross trigger handling */ struct coresight_device *ect_dev; + /* sysfs links between components */ + int nr_links; + bool has_conns_grp; }; /* -- cgit v1.2.3 From 8a7365c2d41898f2376efa929aadf2723dc281b0 Mon Sep 17 00:00:00 2001 From: Suzuki K Poulose Date: Mon, 18 May 2020 12:02:23 -0600 Subject: coresight: Expose device connections via sysfs Coresight device connections are a bit complicated and is not exposed currently to the user. One has to look at the platform descriptions (DT bindings or ACPI bindings) to make an understanding. Given the new naming scheme, it will be helpful to have this information to choose the appropriate devices for tracing. This patch exposes the device connections via links in the sysfs directories. e.g, for a connection devA[OutputPort_X] -> devB[InputPort_Y] is represented as two symlinks: /sys/bus/coresight/.../devA/out:X -> /sys/bus/coresight/.../devB /sys/bus/coresight/.../devB/in:Y -> /sys/bus/coresight/.../devA Signed-off-by: Suzuki K Poulose [Revised to use the generic sysfs links functions & link structures. Provides a connections sysfs group in each device to hold the links.] Co-developed-by: Mike Leach Signed-off-by: Mike Leach Signed-off-by: Mathieu Poirier Link: https://lore.kernel.org/r/20200518180242.7916-5-mathieu.poirier@linaro.org Signed-off-by: Greg Kroah-Hartman --- drivers/hwtracing/coresight/coresight-priv.h | 5 ++ drivers/hwtracing/coresight/coresight-sysfs.c | 80 +++++++++++++++++++++++++++ drivers/hwtracing/coresight/coresight.c | 46 +++++++++++---- include/linux/coresight.h | 2 + 4 files changed, 121 insertions(+), 12 deletions(-) (limited to 'include/linux') diff --git a/drivers/hwtracing/coresight/coresight-priv.h b/drivers/hwtracing/coresight/coresight-priv.h index a4a658d46045..5a36f0f50899 100644 --- a/drivers/hwtracing/coresight/coresight-priv.h +++ b/drivers/hwtracing/coresight/coresight-priv.h @@ -157,6 +157,11 @@ int coresight_add_sysfs_link(struct coresight_sysfs_link *info); void coresight_remove_sysfs_link(struct coresight_sysfs_link *info); int coresight_create_conns_sysfs_group(struct coresight_device *csdev); void coresight_remove_conns_sysfs_group(struct coresight_device *csdev); +int coresight_make_links(struct coresight_device *orig, + struct coresight_connection *conn, + struct coresight_device *target); +void coresight_remove_links(struct coresight_device *orig, + struct coresight_connection *conn); #ifdef CONFIG_CORESIGHT_SOURCE_ETM3X extern int etm_readl_cp14(u32 off, unsigned int *val); diff --git a/drivers/hwtracing/coresight/coresight-sysfs.c b/drivers/hwtracing/coresight/coresight-sysfs.c index 6759f78733ab..82afeaf2ccc4 100644 --- a/drivers/hwtracing/coresight/coresight-sysfs.c +++ b/drivers/hwtracing/coresight/coresight-sysfs.c @@ -122,3 +122,83 @@ void coresight_remove_sysfs_link(struct coresight_sysfs_link *info) info->orig->nr_links--; info->target->nr_links--; } + +/* + * coresight_make_links: Make a link for a connection from a @orig + * device to @target, represented by @conn. + * + * e.g, for devOrig[output_X] -> devTarget[input_Y] is represented + * as two symbolic links : + * + * /sys/.../devOrig/out:X -> /sys/.../devTarget/ + * /sys/.../devTarget/in:Y -> /sys/.../devOrig/ + * + * The link names are allocated for a device where it appears. i.e, the + * "out" link on the master and "in" link on the slave device. 
+ * The link info is stored in the connection record for avoiding + * the reconstruction of names for removal. + */ +int coresight_make_links(struct coresight_device *orig, + struct coresight_connection *conn, + struct coresight_device *target) +{ + int ret = -ENOMEM; + char *outs = NULL, *ins = NULL; + struct coresight_sysfs_link *link = NULL; + + do { + outs = devm_kasprintf(&orig->dev, GFP_KERNEL, + "out:%d", conn->outport); + if (!outs) + break; + ins = devm_kasprintf(&target->dev, GFP_KERNEL, + "in:%d", conn->child_port); + if (!ins) + break; + link = devm_kzalloc(&orig->dev, + sizeof(struct coresight_sysfs_link), + GFP_KERNEL); + if (!link) + break; + + link->orig = orig; + link->target = target; + link->orig_name = outs; + link->target_name = ins; + + ret = coresight_add_sysfs_link(link); + if (ret) + break; + + conn->link = link; + + /* + * Install the device connection. This also indicates that + * the links are operational on both ends. + */ + conn->child_dev = target; + return 0; + } while (0); + + return ret; +} + +/* + * coresight_remove_links: Remove the sysfs links for a given connection @conn, + * from @orig device to @target device. See coresight_make_links() for more + * details. + */ +void coresight_remove_links(struct coresight_device *orig, + struct coresight_connection *conn) +{ + if (!orig || !conn->link) + return; + + coresight_remove_sysfs_link(conn->link); + + devm_kfree(&conn->child_dev->dev, conn->link->target_name); + devm_kfree(&orig->dev, conn->link->orig_name); + devm_kfree(&orig->dev, conn->link); + conn->link = NULL; + conn->child_dev = NULL; +} diff --git a/drivers/hwtracing/coresight/coresight.c b/drivers/hwtracing/coresight/coresight.c index 07f66a3968f1..4f10cfa9dc18 100644 --- a/drivers/hwtracing/coresight/coresight.c +++ b/drivers/hwtracing/coresight/coresight.c @@ -1031,7 +1031,7 @@ static void coresight_device_release(struct device *dev) static int coresight_orphan_match(struct device *dev, void *data) { - int i; + int i, ret = 0; bool still_orphan = false; struct coresight_device *csdev, *i_csdev; struct coresight_connection *conn; @@ -1056,19 +1056,23 @@ static int coresight_orphan_match(struct device *dev, void *data) /* We have found at least one orphan connection */ if (conn->child_dev == NULL) { /* Does it match this newly added device? */ - if (conn->child_fwnode == csdev->dev.fwnode) - conn->child_dev = csdev; - else + if (conn->child_fwnode == csdev->dev.fwnode) { + ret = coresight_make_links(i_csdev, + conn, csdev); + if (ret) + return ret; + } else { /* This component still has an orphan */ still_orphan = true; + } } } i_csdev->orphan = still_orphan; /* - * Returning '0' ensures that all known component on the - * bus will be checked. + * Returning '0' in case we didn't encounter any error, + * ensures that all known component on the bus will be checked. 
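
With the links in place, userspace can walk the trace topology without consulting the firmware tables. A sketch (note that the links are created inside each device's "connections" group, so that directory appears in the path)::

    #include <limits.h>
    #include <stdio.h>
    #include <unistd.h>

    /* Print where output port 0 of the named device leads, e.g. "funnel0" */
    static void show_out0(const char *devname)
    {
            char path[PATH_MAX], dest[PATH_MAX];
            ssize_t n;

            snprintf(path, sizeof(path),
                     "/sys/bus/coresight/devices/%s/connections/out:0",
                     devname);
            n = readlink(path, dest, sizeof(dest) - 1);
            if (n >= 0) {
                    dest[n] = '\0';
                    printf("%s out:0 -> %s\n", devname, dest);
            }
    }
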
*/ return 0; } @@ -1082,15 +1086,21 @@ static int coresight_fixup_orphan_conns(struct coresight_device *csdev) static int coresight_fixup_device_conns(struct coresight_device *csdev) { - int i; + int i, ret = 0; for (i = 0; i < csdev->pdata->nr_outport; i++) { struct coresight_connection *conn = &csdev->pdata->conns[i]; conn->child_dev = coresight_find_csdev_by_fwnode(conn->child_fwnode); - if (!conn->child_dev) + if (conn->child_dev) { + ret = coresight_make_links(csdev, conn, + conn->child_dev); + if (ret) + break; + } else { csdev->orphan = true; + } } return 0; @@ -1121,7 +1131,7 @@ static int coresight_remove_match(struct device *dev, void *data) if (csdev->dev.fwnode == conn->child_fwnode) { iterator->orphan = true; - conn->child_dev = NULL; + coresight_remove_links(iterator, conn); /* * Drop the reference to the handle for the remote * device acquired in parsing the connections from @@ -1215,13 +1225,23 @@ void coresight_release_platform_data(struct coresight_device *csdev, struct coresight_platform_data *pdata) { int i; + struct coresight_connection *conns = pdata->conns; for (i = 0; i < pdata->nr_outport; i++) { - if (pdata->conns[i].child_fwnode) { - fwnode_handle_put(pdata->conns[i].child_fwnode); + /* If we have made the links, remove them now */ + if (csdev && conns[i].child_dev) + coresight_remove_links(csdev, &conns[i]); + /* + * Drop the refcount and clear the handle as this device + * is going away + */ + if (conns[i].child_fwnode) { + fwnode_handle_put(conns[i].child_fwnode); pdata->conns[i].child_fwnode = NULL; } } + if (csdev) + coresight_remove_conns_sysfs_group(csdev); } struct coresight_device *coresight_register(struct coresight_desc *desc) @@ -1303,7 +1323,9 @@ struct coresight_device *coresight_register(struct coresight_desc *desc) mutex_lock(&coresight_mutex); - ret = coresight_fixup_device_conns(csdev); + ret = coresight_create_conns_sysfs_group(csdev); + if (!ret) + ret = coresight_fixup_device_conns(csdev); if (!ret) ret = coresight_fixup_orphan_conns(csdev); if (!ret) diff --git a/include/linux/coresight.h b/include/linux/coresight.h index a2ec25e02ca9..ccd17304d7bd 100644 --- a/include/linux/coresight.h +++ b/include/linux/coresight.h @@ -140,12 +140,14 @@ struct coresight_desc { * @chid_fwnode: remote component's fwnode handle. * @child_dev: a @coresight_device representation of the component connected to @outport. + * @link: Representation of the connection as a sysfs link. */ struct coresight_connection { int outport; int child_port; struct fwnode_handle *child_fwnode; struct coresight_device *child_dev; + struct coresight_sysfs_link *link; }; /** -- cgit v1.2.3 From d375b356e687f2eefb51ddc3f1f2414cfa498f86 Mon Sep 17 00:00:00 2001 From: Suzuki K Poulose Date: Mon, 18 May 2020 12:02:31 -0600 Subject: coresight: Fix support for sparsely populated ports On some systems the firmware may not describe all the ports connected to a component (e.g, for security reasons). This could be especially problematic for "funnels" where we could end up in modifying memory beyond the allocated space for refcounts. e.g, for a funnel with input ports listed 0, 3, 5, nr_inport = 3. 
However, we could then access refcnts[5] while checking for references, like:

[ 526.110401] ==================================================================
[ 526.117988] BUG: KASAN: slab-out-of-bounds in funnel_enable+0x54/0x1b0
[ 526.124706] Read of size 4 at addr ffffff8135f9549c by task bash/1114
[ 526.131324]
[ 526.132886] CPU: 3 PID: 1114 Comm: bash Tainted: G S 5.4.25 #232
[ 526.140397] Hardware name: Qualcomm Technologies, Inc. SC7180 IDP (DT)
[ 526.147113] Call trace:
[ 526.149653] dump_backtrace+0x0/0x188
[ 526.153431] show_stack+0x20/0x2c
[ 526.156852] dump_stack+0xdc/0x144
[ 526.160370] print_address_description+0x3c/0x494
[ 526.165211] __kasan_report+0x144/0x168
[ 526.169170] kasan_report+0x10/0x18
[ 526.172769] check_memory_region+0x1a4/0x1b4
[ 526.177164] __kasan_check_read+0x18/0x24
[ 526.181292] funnel_enable+0x54/0x1b0
[ 526.185072] coresight_enable_path+0x104/0x198
[ 526.189649] coresight_enable+0x118/0x26c
...
[ 526.237782] Allocated by task 280:
[ 526.241298] __kasan_kmalloc+0xf0/0x1ac
[ 526.245249] kasan_kmalloc+0xc/0x14
[ 526.248849] __kmalloc+0x28c/0x3b4
[ 526.252361] coresight_register+0x88/0x250
[ 526.256587] funnel_probe+0x15c/0x228
[ 526.260365] dynamic_funnel_probe+0x20/0x2c
[ 526.264679] amba_probe+0xbc/0x158
[ 526.268193] really_probe+0x144/0x408
[ 526.271970] driver_probe_device+0x70/0x140
...
[ 526.316810]
[ 526.318364] Freed by task 0:
[ 526.321344] (stack is not available)
[ 526.325024]
[ 526.326580] The buggy address belongs to the object at ffffff8135f95480
[ 526.326580] which belongs to the cache kmalloc-128 of size 128
[ 526.339439] The buggy address is located 28 bytes inside of
[ 526.339439] 128-byte region [ffffff8135f95480, ffffff8135f95500)
[ 526.351399] The buggy address belongs to the page:
[ 526.356342] page:ffffffff04b7e500 refcount:1 mapcount:0 mapping:ffffff814b00c380 index:0x0 compound_mapcount: 0
[ 526.366711] flags: 0x4000000000010200(slab|head)
[ 526.371475] raw: 4000000000010200 ffffffff05034008 ffffffff0501eb08 ffffff814b00c380
[ 526.379435] raw: 0000000000000000 0000000000190019 00000001ffffffff 0000000000000000
[ 526.387393] page dumped because: kasan: bad access detected
[ 526.393128]
[ 526.394681] Memory state around the buggy address:
[ 526.399619] ffffff8135f95380: fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc
[ 526.407046] ffffff8135f95400: fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc
[ 526.414473] >ffffff8135f95480: 04 fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc
[ 526.421900] ^
[ 526.426029] ffffff8135f95500: fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc
[ 526.433456] ffffff8135f95580: fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc
[ 526.440883] ==================================================================

To keep the code simple, we now track the maximum number of possible input/output connections to/from this component in @nr_inport and @nr_outport in the platform_data, respectively. Thus the output connections may be sparse, and the code is adjusted to skip the unspecified connections.
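
The fix is to size the arrays by the highest port number seen rather than by the number of endpoints; with the funnel above::

    static int max_based_nr_inport(void)
    {
            /* Input ports 0, 3 and 5 described by firmware */
            static const int ports[] = { 0, 3, 5 };
            int i, nr_inport = 0;

            for (i = 0; i < 3; i++)
                    if (ports[i] + 1 > nr_inport)
                            nr_inport = ports[i] + 1;

            /*
             * Count-based sizing gave nr_inport = 3, so refcnts[5] was out
             * of bounds; max-based sizing returns 6, covering refcnts[0..5].
             */
            return nr_inport;
    }
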
Cc: Mathieu Poirier Cc: Mike Leach Reported-by: Sai Prakash Ranjan Tested-by: Sai Prakash Ranjan Tested-by: Stephen Boyd Signed-off-by: Suzuki K Poulose Signed-off-by: Mathieu Poirier Link: https://lore.kernel.org/r/20200518180242.7916-13-mathieu.poirier@linaro.org Signed-off-by: Greg Kroah-Hartman --- drivers/hwtracing/coresight/coresight-platform.c | 85 +++++++++++++++++------- drivers/hwtracing/coresight/coresight.c | 7 +- include/linux/coresight.h | 10 +-- 3 files changed, 72 insertions(+), 30 deletions(-) (limited to 'include/linux') diff --git a/drivers/hwtracing/coresight/coresight-platform.c b/drivers/hwtracing/coresight/coresight-platform.c index 4b78e1ac5285..d58dcd0ab514 100644 --- a/drivers/hwtracing/coresight/coresight-platform.c +++ b/drivers/hwtracing/coresight/coresight-platform.c @@ -87,6 +87,7 @@ static void of_coresight_get_ports_legacy(const struct device_node *node, int *nr_inport, int *nr_outport) { struct device_node *ep = NULL; + struct of_endpoint endpoint; int in = 0, out = 0; do { @@ -94,10 +95,16 @@ static void of_coresight_get_ports_legacy(const struct device_node *node, if (!ep) break; - if (of_coresight_legacy_ep_is_input(ep)) - in++; - else - out++; + if (of_graph_parse_endpoint(ep, &endpoint)) + continue; + + if (of_coresight_legacy_ep_is_input(ep)) { + in = (endpoint.port + 1 > in) ? + endpoint.port + 1 : in; + } else { + out = (endpoint.port + 1) > out ? + endpoint.port + 1 : out; + } } while (ep); @@ -137,9 +144,16 @@ of_coresight_count_ports(struct device_node *port_parent) { int i = 0; struct device_node *ep = NULL; + struct of_endpoint endpoint; + + while ((ep = of_graph_get_next_endpoint(port_parent, ep))) { + /* Defer error handling to parsing */ + if (of_graph_parse_endpoint(ep, &endpoint)) + continue; + if (endpoint.port + 1 > i) + i = endpoint.port + 1; + } - while ((ep = of_graph_get_next_endpoint(port_parent, ep))) - i++; return i; } @@ -191,14 +205,12 @@ static int of_coresight_get_cpu(struct device *dev) * Parses the local port, remote device name and the remote port. * * Returns : - * 1 - If the parsing is successful and a connection record - * was created for an output connection. * 0 - If the parsing completed without any fatal errors. * -Errno - Fatal error, abort the scanning. */ static int of_coresight_parse_endpoint(struct device *dev, struct device_node *ep, - struct coresight_connection *conn) + struct coresight_platform_data *pdata) { int ret = 0; struct of_endpoint endpoint, rendpoint; @@ -206,6 +218,7 @@ static int of_coresight_parse_endpoint(struct device *dev, struct device_node *rep = NULL; struct device *rdev = NULL; struct fwnode_handle *rdev_fwnode; + struct coresight_connection *conn; do { /* Parse the local port details */ @@ -232,6 +245,13 @@ static int of_coresight_parse_endpoint(struct device *dev, break; } + conn = &pdata->conns[endpoint.port]; + if (conn->child_fwnode) { + dev_warn(dev, "Duplicate output port %d\n", + endpoint.port); + ret = -EINVAL; + break; + } conn->outport = endpoint.port; /* * Hold the refcount to the target device. 
This could be @@ -244,7 +264,6 @@ static int of_coresight_parse_endpoint(struct device *dev, conn->child_fwnode = fwnode_handle_get(rdev_fwnode); conn->child_port = rendpoint.port; /* Connection record updated */ - ret = 1; } while (0); of_node_put(rparent); @@ -258,7 +277,6 @@ static int of_get_coresight_platform_data(struct device *dev, struct coresight_platform_data *pdata) { int ret = 0; - struct coresight_connection *conn; struct device_node *ep = NULL; const struct device_node *parent = NULL; bool legacy_binding = false; @@ -287,8 +305,6 @@ static int of_get_coresight_platform_data(struct device *dev, dev_warn_once(dev, "Uses obsolete Coresight DT bindings\n"); } - conn = pdata->conns; - /* Iterate through each output port to discover topology */ while ((ep = of_graph_get_next_endpoint(parent, ep))) { /* @@ -300,15 +316,9 @@ static int of_get_coresight_platform_data(struct device *dev, if (legacy_binding && of_coresight_legacy_ep_is_input(ep)) continue; - ret = of_coresight_parse_endpoint(dev, ep, conn); - switch (ret) { - case 1: - conn++; /* Fall through */ - case 0: - break; - default: + ret = of_coresight_parse_endpoint(dev, ep, pdata); + if (ret) return ret; - } } return 0; @@ -647,6 +657,16 @@ static int acpi_coresight_parse_link(struct acpi_device *adev, * coresight_remove_match(). */ conn->child_fwnode = fwnode_handle_get(&r_adev->fwnode); + } else if (dir == ACPI_CORESIGHT_LINK_SLAVE) { + /* + * We are only interested in the port number + * for the input ports at this component. + * Store the port number in child_port. + */ + conn->child_port = fields[0].integer.value; + } else { + /* Invalid direction */ + return -EINVAL; } return dir; @@ -692,10 +712,20 @@ static int acpi_coresight_parse_graph(struct acpi_device *adev, return dir; if (dir == ACPI_CORESIGHT_LINK_MASTER) { - pdata->nr_outport++; + if (ptr->outport > pdata->nr_outport) + pdata->nr_outport = ptr->outport; ptr++; } else { - pdata->nr_inport++; + WARN_ON(pdata->nr_inport == ptr->child_port); + /* + * We do not track input port connections for a device. + * However we need the highest port number described, + * which can be recorded now and reuse this connection + * record for an output connection. Hence, do not move + * the ptr for input connections + */ + if (ptr->child_port > pdata->nr_inport) + pdata->nr_inport = ptr->child_port; } } @@ -704,8 +734,13 @@ static int acpi_coresight_parse_graph(struct acpi_device *adev, return rc; /* Copy the connection information to the final location */ - for (i = 0; i < pdata->nr_outport; i++) - pdata->conns[i] = conns[i]; + for (i = 0; conns + i < ptr; i++) { + int port = conns[i].outport; + + /* Duplicate output port */ + WARN_ON(pdata->conns[port].child_fwnode); + pdata->conns[port] = conns[i]; + } devm_kfree(&adev->dev, conns); return 0; diff --git a/drivers/hwtracing/coresight/coresight.c b/drivers/hwtracing/coresight/coresight.c index 4f10cfa9dc18..f3efbb3b2b4d 100644 --- a/drivers/hwtracing/coresight/coresight.c +++ b/drivers/hwtracing/coresight/coresight.c @@ -1053,6 +1053,9 @@ static int coresight_orphan_match(struct device *dev, void *data) for (i = 0; i < i_csdev->pdata->nr_outport; i++) { conn = &i_csdev->pdata->conns[i]; + /* Skip the port if FW doesn't describe it */ + if (!conn->child_fwnode) + continue; /* We have found at least one orphan connection */ if (conn->child_dev == NULL) { /* Does it match this newly added device? 
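
Because the connection array may now contain holes, every walker of pdata->conns has to treat a NULL child_fwnode as "port not described by firmware"; that is the pattern the coresight.c hunks below add in each iteration::

    for (i = 0; i < csdev->pdata->nr_outport; i++) {
            struct coresight_connection *conn = &csdev->pdata->conns[i];

            if (!conn->child_fwnode)        /* hole in the sparse array */
                    continue;
            /* ... operate on the described connection ... */
    }
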
*/ @@ -1091,6 +1094,8 @@ static int coresight_fixup_device_conns(struct coresight_device *csdev) for (i = 0; i < csdev->pdata->nr_outport; i++) { struct coresight_connection *conn = &csdev->pdata->conns[i]; + if (!conn->child_fwnode) + continue; conn->child_dev = coresight_find_csdev_by_fwnode(conn->child_fwnode); if (conn->child_dev) { @@ -1126,7 +1131,7 @@ static int coresight_remove_match(struct device *dev, void *data) for (i = 0; i < iterator->pdata->nr_outport; i++) { conn = &iterator->pdata->conns[i]; - if (conn->child_dev == NULL) + if (conn->child_dev == NULL || conn->child_fwnode == NULL) continue; if (csdev->dev.fwnode == conn->child_fwnode) { diff --git a/include/linux/coresight.h b/include/linux/coresight.h index ccd17304d7bd..e3e9f0e3a878 100644 --- a/include/linux/coresight.h +++ b/include/linux/coresight.h @@ -100,10 +100,12 @@ union coresight_dev_subtype { }; /** - * struct coresight_platform_data - data harvested from the DT specification - * @nr_inport: number of input ports for this component. - * @nr_outport: number of output ports for this component. - * @conns: Array of nr_outport connections from this component + * struct coresight_platform_data - data harvested from the firmware + * specification. + * + * @nr_inport: Number of elements for the input connections. + * @nr_outport: Number of elements for the output connections. + * @conns: Sparse array of nr_outport connections from this component. */ struct coresight_platform_data { int nr_inport; -- cgit v1.2.3 From e9b880581d555c8f7b58c7d19cc3f8f9016a1b5f Mon Sep 17 00:00:00 2001 From: Mike Leach Date: Mon, 18 May 2020 12:02:41 -0600 Subject: coresight: cti: Add CPU Hotplug handling to CTI driver Adds registration of CPU start and stop functions to CPU hotplug mechanisms - for any CPU bound CTI. Sets CTI powered flag according to state. Will enable CTI on CPU start if there are existing enable requests. Signed-off-by: Mike Leach Signed-off-by: Mathieu Poirier Link: https://lore.kernel.org/r/20200518180242.7916-23-mathieu.poirier@linaro.org Signed-off-by: Greg Kroah-Hartman --- drivers/hwtracing/coresight/coresight-cti.c | 92 +++++++++++++++++++++++++++++ include/linux/cpuhotplug.h | 1 + 2 files changed, 93 insertions(+) (limited to 'include/linux') diff --git a/drivers/hwtracing/coresight/coresight-cti.c b/drivers/hwtracing/coresight/coresight-cti.c index be61c1705916..7e7ec6dd93c0 100644 --- a/drivers/hwtracing/coresight/coresight-cti.c +++ b/drivers/hwtracing/coresight/coresight-cti.c @@ -8,6 +8,7 @@ #include #include #include +#include #include #include #include @@ -40,6 +41,12 @@ static DEFINE_MUTEX(ect_mutex); #define csdev_to_cti_drvdata(csdev) \ dev_get_drvdata(csdev->dev.parent) +/* power management handling */ +static int nr_cti_cpu; + +/* quick lookup list for CPU bound CTIs when power handling */ +static struct cti_drvdata *cti_cpu_drvdata[NR_CPUS]; + /* * CTI naming. CTI bound to cores will have the name cti_cpu where * N is the CPU ID. 
System CTIs will have the name cti_sys where I @@ -129,6 +136,35 @@ cti_err_not_enabled: return rc; } +/* re-enable CTI on CPU when using CPU hotplug */ +static void cti_cpuhp_enable_hw(struct cti_drvdata *drvdata) +{ + struct cti_config *config = &drvdata->config; + struct device *dev = &drvdata->csdev->dev; + + pm_runtime_get_sync(dev->parent); + spin_lock(&drvdata->spinlock); + config->hw_powered = true; + + /* no need to do anything if no enable request */ + if (!atomic_read(&drvdata->config.enable_req_count)) + goto cti_hp_not_enabled; + + /* try to claim the device */ + if (coresight_claim_device(drvdata->base)) + goto cti_hp_not_enabled; + + cti_write_all_hw_regs(drvdata); + config->hw_enabled = true; + spin_unlock(&drvdata->spinlock); + return; + + /* did not re-enable due to no claim / no request */ +cti_hp_not_enabled: + spin_unlock(&drvdata->spinlock); + pm_runtime_put(dev->parent); +} + /* disable hardware */ static int cti_disable_hw(struct cti_drvdata *drvdata) { @@ -620,6 +656,44 @@ static void cti_remove_conn_xrefs(struct cti_drvdata *drvdata) } } +/* CPU HP handlers */ +static int cti_starting_cpu(unsigned int cpu) +{ + struct cti_drvdata *drvdata = cti_cpu_drvdata[cpu]; + + if (!drvdata) + return 0; + + cti_cpuhp_enable_hw(drvdata); + return 0; +} + +static int cti_dying_cpu(unsigned int cpu) +{ + struct cti_drvdata *drvdata = cti_cpu_drvdata[cpu]; + + if (!drvdata) + return 0; + + spin_lock(&drvdata->spinlock); + drvdata->config.hw_powered = false; + coresight_disclaim_device(drvdata->base); + spin_unlock(&drvdata->spinlock); + return 0; +} + +/* release PM registrations */ +static void cti_pm_release(struct cti_drvdata *drvdata) +{ + if (drvdata->ctidev.cpu >= 0) { + if (--nr_cti_cpu == 0) { + cpuhp_remove_state_nocalls( + CPUHP_AP_ARM_CORESIGHT_CTI_STARTING); + } + cti_cpu_drvdata[drvdata->ctidev.cpu] = NULL; + } +} + /** cti ect operations **/ int cti_enable(struct coresight_device *csdev) { @@ -655,6 +729,7 @@ static void cti_device_release(struct device *dev) mutex_lock(&ect_mutex); cti_remove_conn_xrefs(drvdata); + cti_pm_release(drvdata); /* remove from the list */ list_for_each_entry_safe(ect_item, ect_tmp, &ect_net, node) { @@ -730,6 +805,22 @@ static int cti_probe(struct amba_device *adev, const struct amba_id *id) goto err_out; } + /* setup CPU power management handling for CPU bound CTI devices. 
*/ + if (drvdata->ctidev.cpu >= 0) { + cti_cpu_drvdata[drvdata->ctidev.cpu] = drvdata; + if (!nr_cti_cpu++) { + cpus_read_lock(); + ret = cpuhp_setup_state_nocalls_cpuslocked( + CPUHP_AP_ARM_CORESIGHT_CTI_STARTING, + "arm/coresight_cti:starting", + cti_starting_cpu, cti_dying_cpu); + + cpus_read_unlock(); + if (ret) + goto err_out; + } + } + /* create dynamic attributes for connections */ ret = cti_create_cons_sysfs(dev, drvdata); if (ret) { @@ -768,6 +859,7 @@ static int cti_probe(struct amba_device *adev, const struct amba_id *id) return 0; err_out: + cti_pm_release(drvdata); return ret; } diff --git a/include/linux/cpuhotplug.h b/include/linux/cpuhotplug.h index 77d70b633531..6dc7332307ca 100644 --- a/include/linux/cpuhotplug.h +++ b/include/linux/cpuhotplug.h @@ -142,6 +142,7 @@ enum cpuhp_state { CPUHP_AP_ARM_XEN_STARTING, CPUHP_AP_ARM_KVMPV_STARTING, CPUHP_AP_ARM_CORESIGHT_STARTING, + CPUHP_AP_ARM_CORESIGHT_CTI_STARTING, CPUHP_AP_ARM64_ISNDEP_STARTING, CPUHP_AP_SMPCFD_DYING, CPUHP_AP_X86_TBOOT_DYING, -- cgit v1.2.3 From 8cfba76383e902acbed95092163052b1572f17a8 Mon Sep 17 00:00:00 2001 From: David Howells Date: Tue, 14 Jan 2020 17:07:11 +0000 Subject: pipe: Allow buffers to be marked read-whole-or-error for notifications Allow a buffer to be marked such that read() must return the entire buffer in one go or return ENOBUFS. Multiple buffers can be amalgamated into a single read, but a short read will occur if the next "whole" buffer won't fit. This is useful for watch queue notifications to make sure we don't split a notification across multiple reads, especially given that we need to fabricate an overrun record under some circumstances - and that isn't in the buffers. Signed-off-by: David Howells --- fs/pipe.c | 8 +++++++- include/linux/pipe_fs_i.h | 1 + kernel/watch_queue.c | 2 +- samples/watch_queue/watch_test.c | 2 +- 4 files changed, 10 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/fs/pipe.c b/fs/pipe.c index da9bc1f21fd1..0f9fd897ceb5 100644 --- a/fs/pipe.c +++ b/fs/pipe.c @@ -320,8 +320,14 @@ pipe_read(struct kiocb *iocb, struct iov_iter *to) size_t written; int error; - if (chars > total_len) + if (chars > total_len) { + if (buf->flags & PIPE_BUF_FLAG_WHOLE) { + if (ret == 0) + ret = -ENOBUFS; + break; + } chars = total_len; + } error = pipe_buf_confirm(pipe, buf); if (error) { diff --git a/include/linux/pipe_fs_i.h b/include/linux/pipe_fs_i.h index 1d3eaa233f4a..eaff59a2f074 100644 --- a/include/linux/pipe_fs_i.h +++ b/include/linux/pipe_fs_i.h @@ -8,6 +8,7 @@ #define PIPE_BUF_FLAG_ATOMIC 0x02 /* was atomically mapped */ #define PIPE_BUF_FLAG_GIFT 0x04 /* page is a gift */ #define PIPE_BUF_FLAG_PACKET 0x08 /* read() as a packet */ +#define PIPE_BUF_FLAG_WHOLE 0x10 /* read() must return entire buffer or error */ /** * struct pipe_buffer - a linux kernel pipe buffer diff --git a/kernel/watch_queue.c b/kernel/watch_queue.c index c103e34f8705..ad64ea300f6d 100644 --- a/kernel/watch_queue.c +++ b/kernel/watch_queue.c @@ -115,7 +115,7 @@ static bool post_one_notification(struct watch_queue *wqueue, buf->ops = &watch_queue_pipe_buf_ops; buf->offset = offset; buf->len = len; - buf->flags = 0; + buf->flags = PIPE_BUF_FLAG_WHOLE; pipe->head = head + 1; if (!test_and_clear_bit(note, wqueue->notes_bitmap)) { diff --git a/samples/watch_queue/watch_test.c b/samples/watch_queue/watch_test.c index 4a311790d4ca..8628b4c5d567 100644 --- a/samples/watch_queue/watch_test.c +++ b/samples/watch_queue/watch_test.c @@ -63,7 +63,7 @@ static void saw_key_change(struct 
watch_notification *n, size_t len) */ static void consumer(int fd) { - unsigned char buffer[4096], *p, *end; + unsigned char buffer[433], *p, *end; union { struct watch_notification n; unsigned char buf1[128]; -- cgit v1.2.3 From e7d553d69cf63aec7de0f38fed49ccbb30922e1e Mon Sep 17 00:00:00 2001 From: David Howells Date: Tue, 14 Jan 2020 17:07:12 +0000 Subject: pipe: Add notification lossage handling Add handling for loss of notifications by having read() insert a loss-notification message after it has read the pipe buffer that was last in the ring when the loss occurred. Lossage can come about either by running out of notification descriptors or by running out of space in the pipe ring. Signed-off-by: David Howells --- fs/pipe.c | 28 ++++++++++++++++++++++++++++ include/linux/pipe_fs_i.h | 7 +++++++ kernel/watch_queue.c | 2 ++ samples/watch_queue/watch_test.c | 3 +++ 4 files changed, 40 insertions(+) (limited to 'include/linux') diff --git a/fs/pipe.c b/fs/pipe.c index 0f9fd897ceb5..620a113f92eb 100644 --- a/fs/pipe.c +++ b/fs/pipe.c @@ -314,6 +314,30 @@ pipe_read(struct kiocb *iocb, struct iov_iter *to) unsigned int tail = pipe->tail; unsigned int mask = pipe->ring_size - 1; +#ifdef CONFIG_WATCH_QUEUE + if (pipe->note_loss) { + struct watch_notification n; + + if (total_len < 8) { + if (ret == 0) + ret = -ENOBUFS; + break; + } + + n.type = WATCH_TYPE_META; + n.subtype = WATCH_META_LOSS_NOTIFICATION; + n.info = watch_sizeof(n); + if (copy_to_iter(&n, sizeof(n), to) != sizeof(n)) { + if (ret == 0) + ret = -EFAULT; + break; + } + ret += sizeof(n); + total_len -= sizeof(n); + pipe->note_loss = false; + } +#endif + if (!pipe_empty(head, tail)) { struct pipe_buffer *buf = &pipe->bufs[tail & mask]; size_t chars = buf->len; @@ -355,6 +379,10 @@ pipe_read(struct kiocb *iocb, struct iov_iter *to) if (!buf->len) { pipe_buf_release(pipe, buf); spin_lock_irq(&pipe->rd_wait.lock); +#ifdef CONFIG_WATCH_QUEUE + if (buf->flags & PIPE_BUF_FLAG_LOSS) + pipe->note_loss = true; +#endif tail++; pipe->tail = tail; spin_unlock_irq(&pipe->rd_wait.lock); diff --git a/include/linux/pipe_fs_i.h b/include/linux/pipe_fs_i.h index eaff59a2f074..6626f511de6f 100644 --- a/include/linux/pipe_fs_i.h +++ b/include/linux/pipe_fs_i.h @@ -9,6 +9,9 @@ #define PIPE_BUF_FLAG_GIFT 0x04 /* page is a gift */ #define PIPE_BUF_FLAG_PACKET 0x08 /* read() as a packet */ #define PIPE_BUF_FLAG_WHOLE 0x10 /* read() must return entire buffer or error */ +#ifdef CONFIG_WATCH_QUEUE +#define PIPE_BUF_FLAG_LOSS 0x20 /* Message loss happened after this buffer */ +#endif /** * struct pipe_buffer - a linux kernel pipe buffer @@ -34,6 +37,7 @@ struct pipe_buffer { * @wr_wait: writer wait point in case of full pipe * @head: The point of buffer production * @tail: The point of buffer consumption + * @note_loss: The next read() should insert a data-lost message * @max_usage: The maximum number of slots that may be used in the ring * @ring_size: total number of buffers (should be a power of 2) * @nr_accounted: The amount this pipe accounts for in user->pipe_bufs @@ -56,6 +60,9 @@ struct pipe_inode_info { unsigned int tail; unsigned int max_usage; unsigned int ring_size; +#ifdef CONFIG_WATCH_QUEUE + bool note_loss; +#endif unsigned int nr_accounted; unsigned int readers; unsigned int writers; diff --git a/kernel/watch_queue.c b/kernel/watch_queue.c index ad64ea300f6d..9a9699c06709 100644 --- a/kernel/watch_queue.c +++ b/kernel/watch_queue.c @@ -132,6 +132,8 @@ out: return done; lost: + buf = &pipe->bufs[(head - 1) & mask]; + buf->flags |= 
PIPE_BUF_FLAG_LOSS; goto out; } diff --git a/samples/watch_queue/watch_test.c b/samples/watch_queue/watch_test.c index 8628b4c5d567..46e618a897fe 100644 --- a/samples/watch_queue/watch_test.c +++ b/samples/watch_queue/watch_test.c @@ -120,6 +120,9 @@ static void consumer(int fd) (n.n.info & WATCH_INFO_ID) >> WATCH_INFO_ID__SHIFT); break; + case WATCH_META_LOSS_NOTIFICATION: + printf("-- LOSS --\n"); + break; default: printf("other meta record\n"); break; -- cgit v1.2.3 From 8c0637e950d68933a67f7438f779d79b049b5e5c Mon Sep 17 00:00:00 2001 From: David Howells Date: Tue, 12 May 2020 15:16:29 +0100 Subject: keys: Make the KEY_NEED_* perms an enum rather than a mask Since the meaning of combining the KEY_NEED_* constants is undefined, make it so that you can't do that by turning them into an enum. The enum is also given some extra values to represent special circumstances, such as: (1) The '0' value is reserved and causes a warning to trap the parameter being unset. (2) The key is to be unlinked and we require no permissions on it, only the keyring, (this replaces the KEY_LOOKUP_FOR_UNLINK flag). (3) An override due to CAP_SYS_ADMIN. (4) An override due to an instantiation token being present. (5) The permissions check is being deferred to later key_permission() calls. The extra values give the opportunity for LSMs to audit these situations. [Note: This really needs overhauling so that lookup_user_key() tells key_task_permission() and the LSM what operation is being done and leaves it to those functions to decide how to map that onto the available permits. However, I don't really want to make these change in the middle of the notifications patchset.] Signed-off-by: David Howells cc: Jarkko Sakkinen cc: Paul Moore cc: Stephen Smalley cc: Casey Schaufler cc: keyrings@vger.kernel.org cc: selinux@vger.kernel.org --- include/linux/key.h | 30 +++++++++++++++++------------ include/linux/security.h | 6 +++--- security/keys/internal.h | 8 ++++---- security/keys/keyctl.c | 16 ++++++++------- security/keys/permission.c | 31 ++++++++++++++++++++++------- security/keys/process_keys.c | 46 +++++++++++++++++++++----------------------- security/security.c | 6 +++--- security/selinux/hooks.c | 37 ++++++++++++++++++++++++++++------- security/smack/smack_lsm.c | 29 +++++++++++++++++++++------- 9 files changed, 135 insertions(+), 74 deletions(-) (limited to 'include/linux') diff --git a/include/linux/key.h b/include/linux/key.h index b99b40db08fc..0f2e24f13c2b 100644 --- a/include/linux/key.h +++ b/include/linux/key.h @@ -71,6 +71,23 @@ struct net; #define KEY_PERM_UNDEF 0xffffffff +/* + * The permissions required on a key that we're looking up. 
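+ * Unlike the old KEY_NEED_* bit mask, these values are not combinable:
+ * each call states exactly one intent.  The values from
+ * KEY_SYSADMIN_OVERRIDE onwards are special cases that carry no
+ * permission bits of their own; they exist so that an LSM's
+ * key_permission hook can see, and audit, why the usual mask check was
+ * bypassed.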
+ */ +enum key_need_perm { + KEY_NEED_UNSPECIFIED, /* Needed permission unspecified */ + KEY_NEED_VIEW, /* Require permission to view attributes */ + KEY_NEED_READ, /* Require permission to read content */ + KEY_NEED_WRITE, /* Require permission to update / modify */ + KEY_NEED_SEARCH, /* Require permission to search (keyring) or find (key) */ + KEY_NEED_LINK, /* Require permission to link */ + KEY_NEED_SETATTR, /* Require permission to change attributes */ + KEY_NEED_UNLINK, /* Require permission to unlink key */ + KEY_SYSADMIN_OVERRIDE, /* Special: override by CAP_SYS_ADMIN */ + KEY_AUTHTOKEN_OVERRIDE, /* Special: override by possession of auth token */ + KEY_DEFER_PERM_CHECK, /* Special: permission check is deferred */ +}; + struct seq_file; struct user_struct; struct signal_struct; @@ -420,20 +437,9 @@ static inline key_serial_t key_serial(const struct key *key) extern void key_set_timeout(struct key *, unsigned); extern key_ref_t lookup_user_key(key_serial_t id, unsigned long flags, - key_perm_t perm); + enum key_need_perm need_perm); extern void key_free_user_ns(struct user_namespace *); -/* - * The permissions required on a key that we're looking up. - */ -#define KEY_NEED_VIEW 0x01 /* Require permission to view attributes */ -#define KEY_NEED_READ 0x02 /* Require permission to read content */ -#define KEY_NEED_WRITE 0x04 /* Require permission to update / modify */ -#define KEY_NEED_SEARCH 0x08 /* Require permission to search (keyring) or find (key) */ -#define KEY_NEED_LINK 0x10 /* Require permission to link */ -#define KEY_NEED_SETATTR 0x20 /* Require permission to change attributes */ -#define KEY_NEED_ALL 0x3f /* All the above permissions */ - static inline short key_read_state(const struct key *key) { /* Barrier versus mark_key_instantiated(). 
*/ diff --git a/include/linux/security.h b/include/linux/security.h index e7914e4e0b02..57aac14e3418 100644 --- a/include/linux/security.h +++ b/include/linux/security.h @@ -1767,8 +1767,8 @@ static inline int security_path_chroot(const struct path *path) int security_key_alloc(struct key *key, const struct cred *cred, unsigned long flags); void security_key_free(struct key *key); -int security_key_permission(key_ref_t key_ref, - const struct cred *cred, unsigned perm); +int security_key_permission(key_ref_t key_ref, const struct cred *cred, + enum key_need_perm need_perm); int security_key_getsecurity(struct key *key, char **_buffer); #else @@ -1786,7 +1786,7 @@ static inline void security_key_free(struct key *key) static inline int security_key_permission(key_ref_t key_ref, const struct cred *cred, - unsigned perm) + enum key_need_perm need_perm) { return 0; } diff --git a/security/keys/internal.h b/security/keys/internal.h index 28e17f4f3328..1fc17cb317a9 100644 --- a/security/keys/internal.h +++ b/security/keys/internal.h @@ -167,7 +167,6 @@ extern bool lookup_user_key_possessed(const struct key *key, const struct key_match_data *match_data); #define KEY_LOOKUP_CREATE 0x01 #define KEY_LOOKUP_PARTIAL 0x02 -#define KEY_LOOKUP_FOR_UNLINK 0x04 extern long join_session_keyring(const char *name); extern void key_change_session_keyring(struct callback_head *twork); @@ -183,7 +182,7 @@ extern void key_gc_keytype(struct key_type *ktype); extern int key_task_permission(const key_ref_t key_ref, const struct cred *cred, - key_perm_t perm); + enum key_need_perm need_perm); static inline void notify_key(struct key *key, enum key_notification_subtype subtype, u32 aux) @@ -205,9 +204,10 @@ static inline void notify_key(struct key *key, /* * Check to see whether permission is granted to use a key in the desired way. 
*/ -static inline int key_permission(const key_ref_t key_ref, unsigned perm) +static inline int key_permission(const key_ref_t key_ref, + enum key_need_perm need_perm) { - return key_task_permission(key_ref, current_cred(), perm); + return key_task_permission(key_ref, current_cred(), need_perm); } extern struct key_type key_type_request_key_auth; diff --git a/security/keys/keyctl.c b/security/keys/keyctl.c index 7d8de1c9a478..6763ee45e04d 100644 --- a/security/keys/keyctl.c +++ b/security/keys/keyctl.c @@ -434,7 +434,7 @@ long keyctl_invalidate_key(key_serial_t id) /* Root is permitted to invalidate certain special keys */ if (capable(CAP_SYS_ADMIN)) { - key_ref = lookup_user_key(id, 0, 0); + key_ref = lookup_user_key(id, 0, KEY_SYSADMIN_OVERRIDE); if (IS_ERR(key_ref)) goto error; if (test_bit(KEY_FLAG_ROOT_CAN_INVAL, @@ -479,7 +479,8 @@ long keyctl_keyring_clear(key_serial_t ringid) /* Root is permitted to invalidate certain special keyrings */ if (capable(CAP_SYS_ADMIN)) { - keyring_ref = lookup_user_key(ringid, 0, 0); + keyring_ref = lookup_user_key(ringid, 0, + KEY_SYSADMIN_OVERRIDE); if (IS_ERR(keyring_ref)) goto error; if (test_bit(KEY_FLAG_ROOT_CAN_CLEAR, @@ -563,7 +564,7 @@ long keyctl_keyring_unlink(key_serial_t id, key_serial_t ringid) goto error; } - key_ref = lookup_user_key(id, KEY_LOOKUP_FOR_UNLINK, 0); + key_ref = lookup_user_key(id, KEY_LOOKUP_PARTIAL, KEY_NEED_UNLINK); if (IS_ERR(key_ref)) { ret = PTR_ERR(key_ref); goto error2; @@ -663,7 +664,7 @@ long keyctl_describe_key(key_serial_t keyid, key_put(instkey); key_ref = lookup_user_key(keyid, KEY_LOOKUP_PARTIAL, - 0); + KEY_AUTHTOKEN_OVERRIDE); if (!IS_ERR(key_ref)) goto okay; } @@ -833,7 +834,7 @@ long keyctl_read_key(key_serial_t keyid, char __user *buffer, size_t buflen) size_t key_data_len; /* find the key first */ - key_ref = lookup_user_key(keyid, 0, 0); + key_ref = lookup_user_key(keyid, 0, KEY_DEFER_PERM_CHECK); if (IS_ERR(key_ref)) { ret = -ENOKEY; goto out; @@ -1471,7 +1472,7 @@ long keyctl_set_timeout(key_serial_t id, unsigned timeout) key_put(instkey); key_ref = lookup_user_key(id, KEY_LOOKUP_PARTIAL, - 0); + KEY_AUTHTOKEN_OVERRIDE); if (!IS_ERR(key_ref)) goto okay; } @@ -1579,7 +1580,8 @@ long keyctl_get_security(key_serial_t keyid, return PTR_ERR(instkey); key_put(instkey); - key_ref = lookup_user_key(keyid, KEY_LOOKUP_PARTIAL, 0); + key_ref = lookup_user_key(keyid, KEY_LOOKUP_PARTIAL, + KEY_AUTHTOKEN_OVERRIDE); if (IS_ERR(key_ref)) return PTR_ERR(key_ref); } diff --git a/security/keys/permission.c b/security/keys/permission.c index 085f907b64ac..4a61f804e80f 100644 --- a/security/keys/permission.c +++ b/security/keys/permission.c @@ -13,7 +13,7 @@ * key_task_permission - Check a key can be used * @key_ref: The key to check. * @cred: The credentials to use. - * @perm: The permissions to check for. + * @need_perm: The permission required. * * Check to see whether permission is granted to use a key in the desired way, * but permit the security modules to override. @@ -24,12 +24,30 @@ * permissions bits or the LSM check. 
*/ int key_task_permission(const key_ref_t key_ref, const struct cred *cred, - unsigned perm) + enum key_need_perm need_perm) { struct key *key; - key_perm_t kperm; + key_perm_t kperm, mask; int ret; + switch (need_perm) { + default: + WARN_ON(1); + return -EACCES; + case KEY_NEED_UNLINK: + case KEY_SYSADMIN_OVERRIDE: + case KEY_AUTHTOKEN_OVERRIDE: + case KEY_DEFER_PERM_CHECK: + goto lsm; + + case KEY_NEED_VIEW: mask = KEY_OTH_VIEW; break; + case KEY_NEED_READ: mask = KEY_OTH_READ; break; + case KEY_NEED_WRITE: mask = KEY_OTH_WRITE; break; + case KEY_NEED_SEARCH: mask = KEY_OTH_SEARCH; break; + case KEY_NEED_LINK: mask = KEY_OTH_LINK; break; + case KEY_NEED_SETATTR: mask = KEY_OTH_SETATTR; break; + } + key = key_ref_to_ptr(key_ref); /* use the second 8-bits of permissions for keys the caller owns */ @@ -64,13 +82,12 @@ use_these_perms: if (is_key_possessed(key_ref)) kperm |= key->perm >> 24; - kperm = kperm & perm & KEY_NEED_ALL; - - if (kperm != perm) + if ((kperm & mask) != mask) return -EACCES; /* let LSM be the final arbiter */ - return security_key_permission(key_ref, cred, perm); +lsm: + return security_key_permission(key_ref, cred, need_perm); } EXPORT_SYMBOL(key_task_permission); diff --git a/security/keys/process_keys.c b/security/keys/process_keys.c index 09541de31f2f..7e0232db1707 100644 --- a/security/keys/process_keys.c +++ b/security/keys/process_keys.c @@ -609,7 +609,7 @@ bool lookup_user_key_possessed(const struct key *key, * returned key reference. */ key_ref_t lookup_user_key(key_serial_t id, unsigned long lflags, - key_perm_t perm) + enum key_need_perm need_perm) { struct keyring_search_context ctx = { .match_data.cmp = lookup_user_key_possessed, @@ -773,35 +773,33 @@ try_again: /* unlink does not use the nominated key in any way, so can skip all * the permission checks as it is only concerned with the keyring */ - if (lflags & KEY_LOOKUP_FOR_UNLINK) { - ret = 0; - goto error; - } - - if (!(lflags & KEY_LOOKUP_PARTIAL)) { - ret = wait_for_key_construction(key, true); - switch (ret) { - case -ERESTARTSYS: - goto invalid_key; - default: - if (perm) + if (need_perm != KEY_NEED_UNLINK) { + if (!(lflags & KEY_LOOKUP_PARTIAL)) { + ret = wait_for_key_construction(key, true); + switch (ret) { + case -ERESTARTSYS: + goto invalid_key; + default: + if (need_perm != KEY_AUTHTOKEN_OVERRIDE && + need_perm != KEY_DEFER_PERM_CHECK) + goto invalid_key; + case 0: + break; + } + } else if (need_perm != KEY_DEFER_PERM_CHECK) { + ret = key_validate(key); + if (ret < 0) goto invalid_key; - case 0: - break; } - } else if (perm) { - ret = key_validate(key); - if (ret < 0) + + ret = -EIO; + if (!(lflags & KEY_LOOKUP_PARTIAL) && + key_read_state(key) == KEY_IS_UNINSTANTIATED) goto invalid_key; } - ret = -EIO; - if (!(lflags & KEY_LOOKUP_PARTIAL) && - key_read_state(key) == KEY_IS_UNINSTANTIATED) - goto invalid_key; - /* check the permissions */ - ret = key_task_permission(key_ref, ctx.cred, perm); + ret = key_task_permission(key_ref, ctx.cred, need_perm); if (ret < 0) goto invalid_key; diff --git a/security/security.c b/security/security.c index c73334ab2882..af32d4cd0462 100644 --- a/security/security.c +++ b/security/security.c @@ -2398,10 +2398,10 @@ void security_key_free(struct key *key) call_void_hook(key_free, key); } -int security_key_permission(key_ref_t key_ref, - const struct cred *cred, unsigned perm) +int security_key_permission(key_ref_t key_ref, const struct cred *cred, + enum key_need_perm need_perm) { - return call_int_hook(key_permission, 0, key_ref, cred, perm); + return 
call_int_hook(key_permission, 0, key_ref, cred, need_perm); } int security_key_getsecurity(struct key *key, char **_buffer) diff --git a/security/selinux/hooks.c b/security/selinux/hooks.c index 4c037c2545c1..196acaccbfdd 100644 --- a/security/selinux/hooks.c +++ b/security/selinux/hooks.c @@ -6561,20 +6561,43 @@ static void selinux_key_free(struct key *k) static int selinux_key_permission(key_ref_t key_ref, const struct cred *cred, - unsigned perm) + enum key_need_perm need_perm) { struct key *key; struct key_security_struct *ksec; - u32 sid; + u32 perm, sid; - /* if no specific permissions are requested, we skip the - permission check. No serious, additional covert channels - appear to be created. */ - if (perm == 0) + switch (need_perm) { + case KEY_NEED_VIEW: + perm = KEY__VIEW; + break; + case KEY_NEED_READ: + perm = KEY__READ; + break; + case KEY_NEED_WRITE: + perm = KEY__WRITE; + break; + case KEY_NEED_SEARCH: + perm = KEY__SEARCH; + break; + case KEY_NEED_LINK: + perm = KEY__LINK; + break; + case KEY_NEED_SETATTR: + perm = KEY__SETATTR; + break; + case KEY_NEED_UNLINK: + case KEY_SYSADMIN_OVERRIDE: + case KEY_AUTHTOKEN_OVERRIDE: + case KEY_DEFER_PERM_CHECK: return 0; + default: + WARN_ON(1); + return -EPERM; - sid = cred_sid(cred); + } + sid = cred_sid(cred); key = key_ref_to_ptr(key_ref); ksec = key->security; diff --git a/security/smack/smack_lsm.c b/security/smack/smack_lsm.c index 8c61d175e195..0d6bb53efe74 100644 --- a/security/smack/smack_lsm.c +++ b/security/smack/smack_lsm.c @@ -4230,13 +4230,14 @@ static void smack_key_free(struct key *key) * smack_key_permission - Smack access on a key * @key_ref: gets to the object * @cred: the credentials to use - * @perm: requested key permissions + * @need_perm: requested key permission * * Return 0 if the task has read and write to the object, * an error code otherwise */ static int smack_key_permission(key_ref_t key_ref, - const struct cred *cred, unsigned perm) + const struct cred *cred, + enum key_need_perm need_perm) { struct key *keyp; struct smk_audit_info ad; @@ -4247,8 +4248,26 @@ static int smack_key_permission(key_ref_t key_ref, /* * Validate requested permissions */ - if (perm & ~KEY_NEED_ALL) + switch (need_perm) { + case KEY_NEED_READ: + case KEY_NEED_SEARCH: + case KEY_NEED_VIEW: + request |= MAY_READ; + break; + case KEY_NEED_WRITE: + case KEY_NEED_LINK: + case KEY_NEED_SETATTR: + request |= MAY_WRITE; + break; + case KEY_NEED_UNSPECIFIED: + case KEY_NEED_UNLINK: + case KEY_SYSADMIN_OVERRIDE: + case KEY_AUTHTOKEN_OVERRIDE: + case KEY_DEFER_PERM_CHECK: + return 0; + default: return -EINVAL; + } keyp = key_ref_to_ptr(key_ref); if (keyp == NULL) @@ -4273,10 +4292,6 @@ static int smack_key_permission(key_ref_t key_ref, ad.a.u.key_struct.key = keyp->serial; ad.a.u.key_struct.key_desc = keyp->description; #endif - if (perm & (KEY_NEED_READ | KEY_NEED_SEARCH | KEY_NEED_VIEW)) - request |= MAY_READ; - if (perm & (KEY_NEED_WRITE | KEY_NEED_LINK | KEY_NEED_SETATTR)) - request |= MAY_WRITE; rc = smk_access(tkp, keyp->security, request, &ad); rc = smk_bu_note("key access", tkp, keyp->security, request, rc); return rc; -- cgit v1.2.3 From a8478a602913dc89a7cd2060e613edecd07e1dbd Mon Sep 17 00:00:00 2001 From: David Howells Date: Tue, 14 Jan 2020 17:07:13 +0000 Subject: smack: Implement the watch_key and post_notification hooks Implement the watch_key security hook in Smack to make sure that a key grants the caller Read permission in order to set a watch on a key. 
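Concretely, the operation this hook gates is the userspace watch setup sketched below. This is illustrative only and not part of the patch; it assumes the watch-queue/key-notification UAPI (O_NOTIFICATION_PIPE, IOC_WATCH_QUEUE_SET_SIZE, KEYCTL_WATCH_KEY) is available in the installed headers:

    #define _GNU_SOURCE
    #include <fcntl.h>
    #include <sys/ioctl.h>
    #include <unistd.h>
    #include <keyutils.h>
    #include <linux/watch_queue.h>

    /* Sketch: watch the session keyring; the LSM's watch_key hook is
     * consulted at the KEYCTL_WATCH_KEY step. Error handling trimmed. */
    static int watch_session_keyring(void)
    {
            int fds[2];

            if (pipe2(fds, O_NOTIFICATION_PIPE) < 0)
                    return -1;
            if (ioctl(fds[0], IOC_WATCH_QUEUE_SET_SIZE, 256) < 0)
                    return -1;
            /* 0x01 is the watch id echoed back in notifications */
            if (keyctl(KEYCTL_WATCH_KEY, KEY_SPEC_SESSION_KEYRING,
                       fds[0], 0x01) < 0)
                    return -1;
            return fds[0]; /* struct watch_notification records appear here */
    }

For Smack, the hook below requires that the current task have MAY_READ access to the key's label before this sequence succeeds.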
Also implement the post_notification security hook to make sure that the notification source is granted Write permission by the watch queue. For the moment, the watch_devices security hook is left unimplemented as it's not obvious what the object should be since the queue is global and didn't previously exist. Signed-off-by: David Howells Acked-by: Casey Schaufler --- include/linux/lsm_audit.h | 1 + security/smack/smack_lsm.c | 83 +++++++++++++++++++++++++++++++++++++++++++++- 2 files changed, 83 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/lsm_audit.h b/include/linux/lsm_audit.h index 99d629fd9944..28f23b341c1c 100644 --- a/include/linux/lsm_audit.h +++ b/include/linux/lsm_audit.h @@ -75,6 +75,7 @@ struct common_audit_data { #define LSM_AUDIT_DATA_IBPKEY 13 #define LSM_AUDIT_DATA_IBENDPORT 14 #define LSM_AUDIT_DATA_LOCKDOWN 15 +#define LSM_AUDIT_DATA_NOTIFICATION 16 union { struct path path; struct dentry *dentry; diff --git a/security/smack/smack_lsm.c b/security/smack/smack_lsm.c index 0d6bb53efe74..3c4d4836da4a 100644 --- a/security/smack/smack_lsm.c +++ b/security/smack/smack_lsm.c @@ -41,6 +41,7 @@ #include #include #include +#include #include "smack.h" #define TRANS_TRUE "TRUE" @@ -4284,7 +4285,7 @@ static int smack_key_permission(key_ref_t key_ref, if (tkp == NULL) return -EACCES; - if (smack_privileged_cred(CAP_MAC_OVERRIDE, cred)) + if (smack_privileged(CAP_MAC_OVERRIDE)) return 0; #ifdef CONFIG_AUDIT @@ -4326,8 +4327,81 @@ static int smack_key_getsecurity(struct key *key, char **_buffer) return length; } + +#ifdef CONFIG_KEY_NOTIFICATIONS +/** + * smack_watch_key - Smack access to watch a key for notifications. + * @key: The key to be watched + * + * Return 0 if the @watch->cred has permission to read from the key object and + * an error otherwise. + */ +static int smack_watch_key(struct key *key) +{ + struct smk_audit_info ad; + struct smack_known *tkp = smk_of_current(); + int rc; + + if (key == NULL) + return -EINVAL; + /* + * If the key hasn't been initialized give it access so that + * it may do so. + */ + if (key->security == NULL) + return 0; + /* + * This should not occur + */ + if (tkp == NULL) + return -EACCES; + + if (smack_privileged_cred(CAP_MAC_OVERRIDE, current_cred())) + return 0; + +#ifdef CONFIG_AUDIT + smk_ad_init(&ad, __func__, LSM_AUDIT_DATA_KEY); + ad.a.u.key_struct.key = key->serial; + ad.a.u.key_struct.key_desc = key->description; +#endif + rc = smk_access(tkp, key->security, MAY_READ, &ad); + rc = smk_bu_note("key watch", tkp, key->security, MAY_READ, rc); + return rc; +} +#endif /* CONFIG_KEY_NOTIFICATIONS */ #endif /* CONFIG_KEYS */ +#ifdef CONFIG_WATCH_QUEUE +/** + * smack_post_notification - Smack access to post a notification to a queue + * @w_cred: The credentials of the watcher. + * @cred: The credentials of the event source (may be NULL). + * @n: The notification message to be posted. + */ +static int smack_post_notification(const struct cred *w_cred, + const struct cred *cred, + struct watch_notification *n) +{ + struct smk_audit_info ad; + struct smack_known *subj, *obj; + int rc; + + /* Always let maintenance notifications through. 
*/ + if (n->type == WATCH_TYPE_META) + return 0; + + if (!cred) + return 0; + subj = smk_of_task(smack_cred(cred)); + obj = smk_of_task(smack_cred(w_cred)); + + smk_ad_init(&ad, __func__, LSM_AUDIT_DATA_NOTIFICATION); + rc = smk_access(subj, obj, MAY_WRITE, &ad); + rc = smk_bu_note("notification", subj, obj, MAY_WRITE, rc); + return rc; +} +#endif /* CONFIG_WATCH_QUEUE */ + /* * Smack Audit hooks * @@ -4716,8 +4790,15 @@ static struct security_hook_list smack_hooks[] __lsm_ro_after_init = { LSM_HOOK_INIT(key_free, smack_key_free), LSM_HOOK_INIT(key_permission, smack_key_permission), LSM_HOOK_INIT(key_getsecurity, smack_key_getsecurity), +#ifdef CONFIG_KEY_NOTIFICATIONS + LSM_HOOK_INIT(watch_key, smack_watch_key), +#endif #endif /* CONFIG_KEYS */ +#ifdef CONFIG_WATCH_QUEUE + LSM_HOOK_INIT(post_notification, smack_post_notification), +#endif + /* Audit hooks */ #ifdef CONFIG_AUDIT LSM_HOOK_INIT(audit_rule_init, smack_audit_rule_init), -- cgit v1.2.3 From c1527c0e12d461ff4c603996f77983f37b85c286 Mon Sep 17 00:00:00 2001 From: Bart Van Assche Date: Mon, 18 May 2020 21:07:35 -0700 Subject: bio.h: Declare the arguments of the bio iteration functions const This change makes it possible to pass 'const struct bio *' arguments to these functions. Signed-off-by: Bart Van Assche Reviewed-by: Christoph Hellwig Reviewed-by: Chaitanya Kulkarni Cc: Ming Lei Cc: Damien Le Moal Cc: Chaitanya Kulkarni Cc: Alexander Potapenko Signed-off-by: Jens Axboe --- include/linux/bio.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/bio.h b/include/linux/bio.h index a0ee494a6329..950c9dc44c4f 100644 --- a/include/linux/bio.h +++ b/include/linux/bio.h @@ -70,7 +70,7 @@ static inline bool bio_has_data(struct bio *bio) return false; } -static inline bool bio_no_advance_iter(struct bio *bio) +static inline bool bio_no_advance_iter(const struct bio *bio) { return bio_op(bio) == REQ_OP_DISCARD || bio_op(bio) == REQ_OP_SECURE_ERASE || @@ -138,8 +138,8 @@ static inline bool bio_next_segment(const struct bio *bio, #define bio_for_each_segment_all(bvl, bio, iter) \ for (bvl = bvec_init_iter_all(&iter); bio_next_segment((bio), &iter); ) -static inline void bio_advance_iter(struct bio *bio, struct bvec_iter *iter, - unsigned bytes) +static inline void bio_advance_iter(const struct bio *bio, + struct bvec_iter *iter, unsigned int bytes) { iter->bi_sector += bytes >> 9; -- cgit v1.2.3 From 854b5f01dc6a7b59f0dddd646a80b1fcd767a8db Mon Sep 17 00:00:00 2001 From: Bart Van Assche Date: Mon, 18 May 2020 21:07:36 -0700 Subject: block: Document the bio_vec properties Since it is nontrivial that nth_page() does not have to be used for a bio_vec, document this. Signed-off-by: Bart Van Assche Reviewed-by: Christoph Hellwig CC: Christoph Hellwig Cc: Ming Lei Signed-off-by: Jens Axboe --- include/linux/bvec.h | 13 +++++++++++-- 1 file changed, 11 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/bvec.h b/include/linux/bvec.h index a81c13ac1972..ac0c7299d5b8 100644 --- a/include/linux/bvec.h +++ b/include/linux/bvec.h @@ -12,8 +12,17 @@ #include #include -/* - * was unsigned short, but we might as well be ready for > 64kB I/O pages +/** + * struct bio_vec - a contiguous range of physical memory addresses + * @bv_page: First page associated with the address range. + * @bv_len: Number of bytes in the address range. + * @bv_offset: Start of the address range relative to the start of @bv_page. 
+ * + * The following holds for a bvec if n * PAGE_SIZE < bv_offset + bv_len: + * + * nth_page(@bv_page, n) == @bv_page + n + * + * This holds because page_is_mergeable() checks the above property. */ struct bio_vec { struct page *bv_page; -- cgit v1.2.3 From 062022315e8ad9e0628515dfc756ab54b5fdb26b Mon Sep 17 00:00:00 2001 From: James Morse Date: Fri, 1 May 2020 17:45:41 +0100 Subject: mm/memory-failure: Add memory_failure_queue_kick() The GHES code calls memory_failure_queue() from IRQ context to schedule work on the current CPU so that memory_failure() can sleep. For synchronous memory errors the arch code needs to know any signals that memory_failure() will trigger are pending before it returns to user-space, possibly when exiting from the IRQ. Add a helper to kick the memory failure queue, to ensure the scheduled work has happened. This has to be called from process context, so may have been migrated from the original cpu. Pass the cpu the work was queued on. Change memory_failure_work_func() to permit being called on the 'wrong' cpu. Signed-off-by: James Morse Tested-by: Tyler Baicar Acked-by: Naoya Horiguchi Signed-off-by: Rafael J. Wysocki --- include/linux/mm.h | 1 + mm/memory-failure.c | 15 ++++++++++++++- 2 files changed, 15 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/mm.h b/include/linux/mm.h index 5a323422d783..c606dbbfa5e1 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -3012,6 +3012,7 @@ enum mf_flags { }; extern int memory_failure(unsigned long pfn, int flags); extern void memory_failure_queue(unsigned long pfn, int flags); +extern void memory_failure_queue_kick(int cpu); extern int unpoison_memory(unsigned long pfn); extern int get_hwpoison_page(struct page *page); #define put_hwpoison_page(page) put_page(page) diff --git a/mm/memory-failure.c b/mm/memory-failure.c index a96364be8ab4..c4afb407bf0f 100644 --- a/mm/memory-failure.c +++ b/mm/memory-failure.c @@ -1493,7 +1493,7 @@ static void memory_failure_work_func(struct work_struct *work) unsigned long proc_flags; int gotten; - mf_cpu = this_cpu_ptr(&memory_failure_cpu); + mf_cpu = container_of(work, struct memory_failure_cpu, work); for (;;) { spin_lock_irqsave(&mf_cpu->lock, proc_flags); gotten = kfifo_get(&mf_cpu->fifo, &entry); @@ -1507,6 +1507,19 @@ static void memory_failure_work_func(struct work_struct *work) } } +/* + * Process memory_failure work queued on the specified CPU. + * Used to avoid return-to-userspace racing with the memory_failure workqueue. + */ +void memory_failure_queue_kick(int cpu) +{ + struct memory_failure_cpu *mf_cpu; + + mf_cpu = &per_cpu(memory_failure_cpu, cpu); + cancel_work_sync(&mf_cpu->work); + memory_failure_work_func(&mf_cpu->work); +} + static int __init memory_failure_init(void) { struct memory_failure_cpu *mf_cpu; -- cgit v1.2.3 From 1b66d253610c7f8f257103808a9460223a087469 Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Tue, 19 May 2020 00:45:45 +0200 Subject: bpf: Add get{peer, sock}name attach types for sock_addr As stated in 983695fa6765 ("bpf: fix unconnected udp hooks"), the objective for the existing cgroup connect/sendmsg/recvmsg/bind BPF hooks is to be transparent to applications. In Cilium we make use of these hooks [0] in order to enable E-W load balancing for existing Kubernetes service types for all Cilium managed nodes in the cluster. Those backends can be local or remote. 
The main advantage of this approach is that it operates as close as possible to the socket, and therefore allows us to avoid packet-based NAT, given that in connect/sendmsg/recvmsg hooks we only need to xlate sock addresses. This also allows us to expose NodePort services on loopback addresses in the host namespace, for example. As another advantage, this also efficiently blocks bind requests for applications in the host namespace for exposed ports. However, one missing item is that we also need to perform reverse xlation for inet{,6}_getname() hooks such that we can return the service IP/port tuple back to the application instead of the remote peer address. The vast majority of applications do not bother about getpeername(), but on a few occasions we've seen breakage when validating the peer's address, since it unexpectedly returns the backend tuple instead of the service one. Therefore, this trivial patch makes this customisable by adding a getpeername() as well as a getsockname() BPF cgroup hook for both IPv4 and IPv6 in order to address this situation. Simple example: # ./cilium/cilium service list ID Frontend Service Type Backend 1 1.2.3.4:80 ClusterIP 1 => 10.0.0.10:80 Before: curl's verbose output example, no getpeername() reverse xlation: # curl --verbose 1.2.3.4 * Rebuilt URL to: 1.2.3.4/ * Trying 1.2.3.4... * TCP_NODELAY set * Connected to 1.2.3.4 (10.0.0.10) port 80 (#0) > GET / HTTP/1.1 > Host: 1.2.3.4 > User-Agent: curl/7.58.0 > Accept: */* [...] After: with getpeername() reverse xlation: # curl --verbose 1.2.3.4 * Rebuilt URL to: 1.2.3.4/ * Trying 1.2.3.4... * TCP_NODELAY set * Connected to 1.2.3.4 (1.2.3.4) port 80 (#0) > GET / HTTP/1.1 > Host: 1.2.3.4 > User-Agent: curl/7.58.0 > Accept: */* [...] Originally, I had both under a BPF_CGROUP_INET{4,6}_GETNAME type and exposed the peer to the context in a similar fashion to inet{,6}_getname(), but API-wise this is suboptimal as it always forces programs to test for ctx->peer, which can easily be missed, hence the BPF_CGROUP_INET{4,6}_GET{PEER,SOCK}NAME split. Similarly, the checked return code is on tnum_range(1, 1), but if a use case comes up in the future, it can easily be changed to return an error code instead. Helper and ctx member access is the same as with connect/sendmsg/etc hooks.
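For illustration, a minimal program for the new IPv4 getpeername hook might look as follows. This is a sketch only: the 1.2.3.4/10.0.0.10 pair just mirrors the example above (a real implementation such as Cilium's bpf_sock.c consults its service maps instead of hard-coding addresses), and the "cgroup/getpeername4" section name assumes the libbpf support added alongside these attach types:

    #include <linux/bpf.h>
    #include <bpf/bpf_helpers.h>
    #include <bpf/bpf_endian.h>

    SEC("cgroup/getpeername4")
    int getpeername4_rev_xlate(struct bpf_sock_addr *ctx)
    {
            /* Report backend 10.0.0.10:80 as service 1.2.3.4:80. */
            if (ctx->user_ip4 == bpf_htonl(0x0a00000a) &&
                ctx->user_port == bpf_htons(80)) {
                    ctx->user_ip4 = bpf_htonl(0x01020304);
                    ctx->user_port = bpf_htons(80);
            }
            return 1; /* these hooks must return 1, cf. tnum_range(1, 1) */
    }

    char _license[] SEC("license") = "GPL";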
[0] https://github.com/cilium/cilium/blob/master/bpf/bpf_sock.c Signed-off-by: Daniel Borkmann Signed-off-by: Alexei Starovoitov Acked-by: Andrii Nakryiko Acked-by: Andrey Ignatov Link: https://lore.kernel.org/bpf/61a479d759b2482ae3efb45546490bacd796a220.1589841594.git.daniel@iogearbox.net --- include/linux/bpf-cgroup.h | 1 + include/uapi/linux/bpf.h | 4 ++++ kernel/bpf/syscall.c | 12 ++++++++++++ kernel/bpf/verifier.c | 6 +++++- net/core/filter.c | 4 ++++ net/ipv4/af_inet.c | 8 ++++++-- net/ipv6/af_inet6.c | 9 ++++++--- tools/include/uapi/linux/bpf.h | 4 ++++ 8 files changed, 42 insertions(+), 6 deletions(-) (limited to 'include/linux') diff --git a/include/linux/bpf-cgroup.h b/include/linux/bpf-cgroup.h index 272626cc3fc9..c66c545e161a 100644 --- a/include/linux/bpf-cgroup.h +++ b/include/linux/bpf-cgroup.h @@ -396,6 +396,7 @@ static inline int bpf_percpu_cgroup_storage_update(struct bpf_map *map, } #define cgroup_bpf_enabled (0) +#define BPF_CGROUP_RUN_SA_PROG_LOCK(sk, uaddr, type, t_ctx) ({ 0; }) #define BPF_CGROUP_PRE_CONNECT_ENABLED(sk) (0) #define BPF_CGROUP_RUN_PROG_INET_INGRESS(sk,skb) ({ 0; }) #define BPF_CGROUP_RUN_PROG_INET_EGRESS(sk,skb) ({ 0; }) diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index b9b8a0f63b91..97e1fd19ff58 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -220,6 +220,10 @@ enum bpf_attach_type { BPF_MODIFY_RETURN, BPF_LSM_MAC, BPF_TRACE_ITER, + BPF_CGROUP_INET4_GETPEERNAME, + BPF_CGROUP_INET6_GETPEERNAME, + BPF_CGROUP_INET4_GETSOCKNAME, + BPF_CGROUP_INET6_GETSOCKNAME, __MAX_BPF_ATTACH_TYPE }; diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 57dfc98289d5..431241c74614 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -1978,6 +1978,10 @@ bpf_prog_load_check_attach(enum bpf_prog_type prog_type, case BPF_CGROUP_INET6_BIND: case BPF_CGROUP_INET4_CONNECT: case BPF_CGROUP_INET6_CONNECT: + case BPF_CGROUP_INET4_GETPEERNAME: + case BPF_CGROUP_INET6_GETPEERNAME: + case BPF_CGROUP_INET4_GETSOCKNAME: + case BPF_CGROUP_INET6_GETSOCKNAME: case BPF_CGROUP_UDP4_SENDMSG: case BPF_CGROUP_UDP6_SENDMSG: case BPF_CGROUP_UDP4_RECVMSG: @@ -2767,6 +2771,10 @@ attach_type_to_prog_type(enum bpf_attach_type attach_type) case BPF_CGROUP_INET6_BIND: case BPF_CGROUP_INET4_CONNECT: case BPF_CGROUP_INET6_CONNECT: + case BPF_CGROUP_INET4_GETPEERNAME: + case BPF_CGROUP_INET6_GETPEERNAME: + case BPF_CGROUP_INET4_GETSOCKNAME: + case BPF_CGROUP_INET6_GETSOCKNAME: case BPF_CGROUP_UDP4_SENDMSG: case BPF_CGROUP_UDP6_SENDMSG: case BPF_CGROUP_UDP4_RECVMSG: @@ -2912,6 +2920,10 @@ static int bpf_prog_query(const union bpf_attr *attr, case BPF_CGROUP_INET6_POST_BIND: case BPF_CGROUP_INET4_CONNECT: case BPF_CGROUP_INET6_CONNECT: + case BPF_CGROUP_INET4_GETPEERNAME: + case BPF_CGROUP_INET6_GETPEERNAME: + case BPF_CGROUP_INET4_GETSOCKNAME: + case BPF_CGROUP_INET6_GETSOCKNAME: case BPF_CGROUP_UDP4_SENDMSG: case BPF_CGROUP_UDP6_SENDMSG: case BPF_CGROUP_UDP4_RECVMSG: diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 9c7d67d65d8c..2ed8351f47a4 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -7094,7 +7094,11 @@ static int check_return_code(struct bpf_verifier_env *env) switch (env->prog->type) { case BPF_PROG_TYPE_CGROUP_SOCK_ADDR: if (env->prog->expected_attach_type == BPF_CGROUP_UDP4_RECVMSG || - env->prog->expected_attach_type == BPF_CGROUP_UDP6_RECVMSG) + env->prog->expected_attach_type == BPF_CGROUP_UDP6_RECVMSG || + env->prog->expected_attach_type == BPF_CGROUP_INET4_GETPEERNAME || + 
env->prog->expected_attach_type == BPF_CGROUP_INET6_GETPEERNAME || + env->prog->expected_attach_type == BPF_CGROUP_INET4_GETSOCKNAME || + env->prog->expected_attach_type == BPF_CGROUP_INET6_GETSOCKNAME) range = tnum_range(1, 1); break; case BPF_PROG_TYPE_CGROUP_SKB: diff --git a/net/core/filter.c b/net/core/filter.c index 822d662f97ef..bd2853d23b50 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -7049,6 +7049,8 @@ static bool sock_addr_is_valid_access(int off, int size, switch (prog->expected_attach_type) { case BPF_CGROUP_INET4_BIND: case BPF_CGROUP_INET4_CONNECT: + case BPF_CGROUP_INET4_GETPEERNAME: + case BPF_CGROUP_INET4_GETSOCKNAME: case BPF_CGROUP_UDP4_SENDMSG: case BPF_CGROUP_UDP4_RECVMSG: break; @@ -7060,6 +7062,8 @@ static bool sock_addr_is_valid_access(int off, int size, switch (prog->expected_attach_type) { case BPF_CGROUP_INET6_BIND: case BPF_CGROUP_INET6_CONNECT: + case BPF_CGROUP_INET6_GETPEERNAME: + case BPF_CGROUP_INET6_GETSOCKNAME: case BPF_CGROUP_UDP6_SENDMSG: case BPF_CGROUP_UDP6_RECVMSG: break; diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c index fcf0d12a407a..8f5c8c9409d3 100644 --- a/net/ipv4/af_inet.c +++ b/net/ipv4/af_inet.c @@ -755,12 +755,11 @@ do_err: } EXPORT_SYMBOL(inet_accept); - /* * This does both peername and sockname. */ int inet_getname(struct socket *sock, struct sockaddr *uaddr, - int peer) + int peer) { struct sock *sk = sock->sk; struct inet_sock *inet = inet_sk(sk); @@ -781,6 +780,11 @@ int inet_getname(struct socket *sock, struct sockaddr *uaddr, sin->sin_port = inet->inet_sport; sin->sin_addr.s_addr = addr; } + if (cgroup_bpf_enabled) + BPF_CGROUP_RUN_SA_PROG_LOCK(sk, (struct sockaddr *)sin, + peer ? BPF_CGROUP_INET4_GETPEERNAME : + BPF_CGROUP_INET4_GETSOCKNAME, + NULL); memset(sin->sin_zero, 0, sizeof(sin->sin_zero)); return sizeof(*sin); } diff --git a/net/ipv6/af_inet6.c b/net/ipv6/af_inet6.c index 771a462a8322..3b6fcc0c321a 100644 --- a/net/ipv6/af_inet6.c +++ b/net/ipv6/af_inet6.c @@ -504,9 +504,8 @@ EXPORT_SYMBOL_GPL(inet6_destroy_sock); /* * This does both peername and sockname. */ - int inet6_getname(struct socket *sock, struct sockaddr *uaddr, - int peer) + int peer) { struct sockaddr_in6 *sin = (struct sockaddr_in6 *)uaddr; struct sock *sk = sock->sk; @@ -531,9 +530,13 @@ int inet6_getname(struct socket *sock, struct sockaddr *uaddr, sin->sin6_addr = np->saddr; else sin->sin6_addr = sk->sk_v6_rcv_saddr; - sin->sin6_port = inet->inet_sport; } + if (cgroup_bpf_enabled) + BPF_CGROUP_RUN_SA_PROG_LOCK(sk, (struct sockaddr *)sin, + peer ? BPF_CGROUP_INET6_GETPEERNAME : + BPF_CGROUP_INET6_GETSOCKNAME, + NULL); sin->sin6_scope_id = ipv6_iface_scope_id(&sin->sin6_addr, sk->sk_bound_dev_if); return sizeof(*sin); diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h index 146c742f1d49..1cddc398404a 100644 --- a/tools/include/uapi/linux/bpf.h +++ b/tools/include/uapi/linux/bpf.h @@ -220,6 +220,10 @@ enum bpf_attach_type { BPF_MODIFY_RETURN, BPF_LSM_MAC, BPF_TRACE_ITER, + BPF_CGROUP_INET4_GETPEERNAME, + BPF_CGROUP_INET6_GETPEERNAME, + BPF_CGROUP_INET4_GETSOCKNAME, + BPF_CGROUP_INET6_GETSOCKNAME, __MAX_BPF_ATTACH_TYPE }; -- cgit v1.2.3 From c50c75e9b87946499a62bffc021e95c87a1d57cd Mon Sep 17 00:00:00 2001 From: "Gustavo A. R. 
Silva" Date: Mon, 11 May 2020 15:12:27 -0500 Subject: perf/core: Replace zero-length array with flexible-array The current codebase makes use of the zero-length array language extension to the C90 standard, but the preferred mechanism to declare variable-length types such as these ones is a flexible array member[1][2], introduced in C99: struct foo { int stuff; struct boo array[]; }; By making use of the mechanism above, we will get a compiler warning in case the flexible array does not occur last in the structure, which will help us prevent some kind of undefined behavior bugs from being inadvertently introduced[3] to the codebase from now on. Also, notice that, dynamic memory allocations won't be affected by this change: "Flexible array members have incomplete type, and so the sizeof operator may not be applied. As a quirk of the original implementation of zero-length arrays, sizeof evaluates to zero."[1] sizeof(flexible-array-member) triggers a warning because flexible array members have incomplete type[1]. There are some instances of code in which the sizeof operator is being incorrectly/erroneously applied to zero-length arrays and the result is zero. Such instances may be hiding some bugs. So, this work (flexible-array member conversions) will also help to get completely rid of those sorts of issues. This issue was found with the help of Coccinelle. [1] https://gcc.gnu.org/onlinedocs/gcc/Zero-Length.html [2] https://github.com/KSPP/linux/issues/21 [3] commit 76497732932f ("cxgb3/l2t: Fix undefined behaviour") Signed-off-by: Gustavo A. R. Silva Signed-off-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/20200511201227.GA14041@embeddedor --- include/linux/perf_event.h | 4 ++-- kernel/events/callchain.c | 2 +- kernel/events/internal.h | 2 +- 3 files changed, 4 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h index 87e21681759c..d7b610c4eebd 100644 --- a/include/linux/perf_event.h +++ b/include/linux/perf_event.h @@ -61,7 +61,7 @@ struct perf_guest_info_callbacks { struct perf_callchain_entry { __u64 nr; - __u64 ip[0]; /* /proc/sys/kernel/perf_event_max_stack */ + __u64 ip[]; /* /proc/sys/kernel/perf_event_max_stack */ }; struct perf_callchain_entry_ctx { @@ -113,7 +113,7 @@ struct perf_raw_record { struct perf_branch_stack { __u64 nr; __u64 hw_idx; - struct perf_branch_entry entries[0]; + struct perf_branch_entry entries[]; }; struct task_struct; diff --git a/kernel/events/callchain.c b/kernel/events/callchain.c index c2b41a263166..b1991043b7d8 100644 --- a/kernel/events/callchain.c +++ b/kernel/events/callchain.c @@ -16,7 +16,7 @@ struct callchain_cpus_entries { struct rcu_head rcu_head; - struct perf_callchain_entry *cpu_entries[0]; + struct perf_callchain_entry *cpu_entries[]; }; int sysctl_perf_event_max_stack __read_mostly = PERF_MAX_STACK_DEPTH; diff --git a/kernel/events/internal.h b/kernel/events/internal.h index f16f66b6b655..fcbf5616a441 100644 --- a/kernel/events/internal.h +++ b/kernel/events/internal.h @@ -55,7 +55,7 @@ struct perf_buffer { void *aux_priv; struct perf_event_mmap_page *user_page; - void *data_pages[0]; + void *data_pages[]; }; extern void rb_free(struct perf_buffer *rb); -- cgit v1.2.3 From 607259a695312cdfac2b52fb9d5b5890c834d573 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 19 May 2020 15:03:13 +0200 Subject: net: add a new ndo_tunnel_ioctl method This method is used to properly allow kernel callers of the IPv4 route management ioctls. 
The existing ip_tunnel_ioctl helper is renamed to ip_tunnel_ctl to better reflect that it doesn't directly implement ioctls touching user memory, and is used for the guts of ndo_tunnel_ctl implementations. A new ip_tunnel_ioctl helper is added that can be wired up directly to the ndo_do_ioctl method and takes care of the copy to and from userspace. Signed-off-by: Christoph Hellwig Signed-off-by: David S. Miller --- include/linux/netdevice.h | 6 ++++++ include/net/ip_tunnels.h | 3 ++- net/ipv4/ip_gre.c | 35 ++++++++++++++--------------------- net/ipv4/ip_tunnel.c | 16 +++++++++++++++- net/ipv4/ip_vti.c | 32 +++++++++++++------------------- net/ipv4/ipip.c | 30 +++++++++--------------------- 6 files changed, 59 insertions(+), 63 deletions(-) (limited to 'include/linux') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 6a8f8daef09d..a18f8fdf4260 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -53,6 +53,7 @@ struct netpoll_info; struct device; struct phy_device; struct dsa_port; +struct ip_tunnel_parm; struct macsec_context; struct macsec_ops; @@ -1274,6 +1275,9 @@ struct netdev_net_notifier { * Get devlink port instance associated with a given netdev. * Called with a reference on the netdevice and devlink locks only, * rtnl_lock is not held. + * int (*ndo_tunnel_ctl)(struct net_device *dev, struct ip_tunnel_parm *p, + * int cmd); + * Add, change, delete or get information on an IPv4 tunnel. */ struct net_device_ops { int (*ndo_init)(struct net_device *dev); @@ -1479,6 +1483,8 @@ struct net_device_ops { int (*ndo_xsk_wakeup)(struct net_device *dev, u32 queue_id, u32 flags); struct devlink_port * (*ndo_get_devlink_port)(struct net_device *dev); + int (*ndo_tunnel_ctl)(struct net_device *dev, + struct ip_tunnel_parm *p, int cmd); }; /** diff --git a/include/net/ip_tunnels.h b/include/net/ip_tunnels.h index 236503a50759..076e5d7db7d3 100644 --- a/include/net/ip_tunnels.h +++ b/include/net/ip_tunnels.h @@ -269,7 +269,8 @@ void ip_tunnel_xmit(struct sk_buff *skb, struct net_device *dev, const struct iphdr *tnl_params, const u8 protocol); void ip_md_tunnel_xmit(struct sk_buff *skb, struct net_device *dev, const u8 proto, int tunnel_hlen); -int ip_tunnel_ioctl(struct net_device *dev, struct ip_tunnel_parm *p, int cmd); +int ip_tunnel_ctl(struct net_device *dev, struct ip_tunnel_parm *p, int cmd); +int ip_tunnel_ioctl(struct net_device *dev, struct ifreq *ifr, int cmd); int __ip_tunnel_change_mtu(struct net_device *dev, int new_mtu, bool strict); int ip_tunnel_change_mtu(struct net_device *dev, int new_mtu); diff --git a/net/ipv4/ip_gre.c b/net/ipv4/ip_gre.c index 0ce9b91ff55c..4e31f23e4117 100644 --- a/net/ipv4/ip_gre.c +++ b/net/ipv4/ip_gre.c @@ -768,45 +768,37 @@ static void ipgre_link_update(struct net_device *dev, bool set_mtu) } } -static int ipgre_tunnel_ioctl(struct net_device *dev, - struct ifreq *ifr, int cmd) +static int ipgre_tunnel_ctl(struct net_device *dev, struct ip_tunnel_parm *p, + int cmd) { - struct ip_tunnel_parm p; int err; - if (copy_from_user(&p, ifr->ifr_ifru.ifru_data, sizeof(p))) - return -EFAULT; - if (cmd == SIOCADDTUNNEL || cmd == SIOCCHGTUNNEL) { - if (p.iph.version != 4 || p.iph.protocol != IPPROTO_GRE || - p.iph.ihl != 5 || (p.iph.frag_off & htons(~IP_DF)) || - ((p.i_flags | p.o_flags) & (GRE_VERSION | GRE_ROUTING))) + if (p->iph.version != 4 || p->iph.protocol != IPPROTO_GRE || + p->iph.ihl != 5 || (p->iph.frag_off & htons(~IP_DF)) || + ((p->i_flags | p->o_flags) & (GRE_VERSION | GRE_ROUTING))) return -EINVAL; } -
p.i_flags = gre_flags_to_tnl_flags(p.i_flags); - p.o_flags = gre_flags_to_tnl_flags(p.o_flags); + p->i_flags = gre_flags_to_tnl_flags(p->i_flags); + p->o_flags = gre_flags_to_tnl_flags(p->o_flags); - err = ip_tunnel_ioctl(dev, &p, cmd); + err = ip_tunnel_ctl(dev, p, cmd); if (err) return err; if (cmd == SIOCCHGTUNNEL) { struct ip_tunnel *t = netdev_priv(dev); - t->parms.i_flags = p.i_flags; - t->parms.o_flags = p.o_flags; + t->parms.i_flags = p->i_flags; + t->parms.o_flags = p->o_flags; if (strcmp(dev->rtnl_link_ops->kind, "erspan")) ipgre_link_update(dev, true); } - p.i_flags = gre_tnl_flags_to_gre_flags(p.i_flags); - p.o_flags = gre_tnl_flags_to_gre_flags(p.o_flags); - - if (copy_to_user(ifr->ifr_ifru.ifru_data, &p, sizeof(p))) - return -EFAULT; - + p->i_flags = gre_tnl_flags_to_gre_flags(p->i_flags); + p->o_flags = gre_tnl_flags_to_gre_flags(p->o_flags); return 0; } @@ -924,10 +916,11 @@ static const struct net_device_ops ipgre_netdev_ops = { .ndo_stop = ipgre_close, #endif .ndo_start_xmit = ipgre_xmit, - .ndo_do_ioctl = ipgre_tunnel_ioctl, + .ndo_do_ioctl = ip_tunnel_ioctl, .ndo_change_mtu = ip_tunnel_change_mtu, .ndo_get_stats64 = ip_tunnel_get_stats64, .ndo_get_iflink = ip_tunnel_get_iflink, + .ndo_tunnel_ctl = ipgre_tunnel_ctl, }; #define GRE_FEATURES (NETIF_F_SG | \ diff --git a/net/ipv4/ip_tunnel.c b/net/ipv4/ip_tunnel.c index cd4b84310d92..f4f1d11eab50 100644 --- a/net/ipv4/ip_tunnel.c +++ b/net/ipv4/ip_tunnel.c @@ -860,7 +860,7 @@ static void ip_tunnel_update(struct ip_tunnel_net *itn, netdev_state_change(dev); } -int ip_tunnel_ioctl(struct net_device *dev, struct ip_tunnel_parm *p, int cmd) +int ip_tunnel_ctl(struct net_device *dev, struct ip_tunnel_parm *p, int cmd) { int err = 0; struct ip_tunnel *t = netdev_priv(dev); @@ -960,6 +960,20 @@ int ip_tunnel_ioctl(struct net_device *dev, struct ip_tunnel_parm *p, int cmd) done: return err; } +EXPORT_SYMBOL_GPL(ip_tunnel_ctl); + +int ip_tunnel_ioctl(struct net_device *dev, struct ifreq *ifr, int cmd) +{ + struct ip_tunnel_parm p; + int err; + + if (copy_from_user(&p, ifr->ifr_ifru.ifru_data, sizeof(p))) + return -EFAULT; + err = dev->netdev_ops->ndo_tunnel_ctl(dev, &p, cmd); + if (!err && copy_to_user(ifr->ifr_ifru.ifru_data, &p, sizeof(p))) + return -EFAULT; + return err; +} EXPORT_SYMBOL_GPL(ip_tunnel_ioctl); int __ip_tunnel_change_mtu(struct net_device *dev, int new_mtu, bool strict) diff --git a/net/ipv4/ip_vti.c b/net/ipv4/ip_vti.c index 1b4e6f298648..c8974360a99f 100644 --- a/net/ipv4/ip_vti.c +++ b/net/ipv4/ip_vti.c @@ -378,38 +378,31 @@ static int vti4_err(struct sk_buff *skb, u32 info) } static int -vti_tunnel_ioctl(struct net_device *dev, struct ifreq *ifr, int cmd) +vti_tunnel_ctl(struct net_device *dev, struct ip_tunnel_parm *p, int cmd) { int err = 0; - struct ip_tunnel_parm p; - - if (copy_from_user(&p, ifr->ifr_ifru.ifru_data, sizeof(p))) - return -EFAULT; if (cmd == SIOCADDTUNNEL || cmd == SIOCCHGTUNNEL) { - if (p.iph.version != 4 || p.iph.protocol != IPPROTO_IPIP || - p.iph.ihl != 5) + if (p->iph.version != 4 || p->iph.protocol != IPPROTO_IPIP || + p->iph.ihl != 5) return -EINVAL; } - if (!(p.i_flags & GRE_KEY)) - p.i_key = 0; - if (!(p.o_flags & GRE_KEY)) - p.o_key = 0; + if (!(p->i_flags & GRE_KEY)) + p->i_key = 0; + if (!(p->o_flags & GRE_KEY)) + p->o_key = 0; - p.i_flags = VTI_ISVTI; + p->i_flags = VTI_ISVTI; - err = ip_tunnel_ioctl(dev, &p, cmd); + err = ip_tunnel_ctl(dev, p, cmd); if (err) return err; if (cmd != SIOCDELTUNNEL) { - p.i_flags |= GRE_KEY; - p.o_flags |= GRE_KEY; + p->i_flags |= GRE_KEY; + 
p->o_flags |= GRE_KEY; } - - if (copy_to_user(ifr->ifr_ifru.ifru_data, &p, sizeof(p))) - return -EFAULT; return 0; } @@ -417,10 +410,11 @@ static const struct net_device_ops vti_netdev_ops = { .ndo_init = vti_tunnel_init, .ndo_uninit = ip_tunnel_uninit, .ndo_start_xmit = vti_tunnel_xmit, - .ndo_do_ioctl = vti_tunnel_ioctl, + .ndo_do_ioctl = ip_tunnel_ioctl, .ndo_change_mtu = ip_tunnel_change_mtu, .ndo_get_stats64 = ip_tunnel_get_stats64, .ndo_get_iflink = ip_tunnel_get_iflink, + .ndo_tunnel_ctl = vti_tunnel_ctl, }; static void vti_tunnel_setup(struct net_device *dev) diff --git a/net/ipv4/ipip.c b/net/ipv4/ipip.c index 2f01cf6fa0de..df663baf2516 100644 --- a/net/ipv4/ipip.c +++ b/net/ipv4/ipip.c @@ -327,41 +327,29 @@ static bool ipip_tunnel_ioctl_verify_protocol(u8 ipproto) } static int -ipip_tunnel_ioctl(struct net_device *dev, struct ifreq *ifr, int cmd) +ipip_tunnel_ctl(struct net_device *dev, struct ip_tunnel_parm *p, int cmd) { - int err = 0; - struct ip_tunnel_parm p; - - if (copy_from_user(&p, ifr->ifr_ifru.ifru_data, sizeof(p))) - return -EFAULT; - if (cmd == SIOCADDTUNNEL || cmd == SIOCCHGTUNNEL) { - if (p.iph.version != 4 || - !ipip_tunnel_ioctl_verify_protocol(p.iph.protocol) || - p.iph.ihl != 5 || (p.iph.frag_off&htons(~IP_DF))) + if (p->iph.version != 4 || + !ipip_tunnel_ioctl_verify_protocol(p->iph.protocol) || + p->iph.ihl != 5 || (p->iph.frag_off & htons(~IP_DF))) return -EINVAL; } - p.i_key = p.o_key = 0; - p.i_flags = p.o_flags = 0; - err = ip_tunnel_ioctl(dev, &p, cmd); - if (err) - return err; - - if (copy_to_user(ifr->ifr_ifru.ifru_data, &p, sizeof(p))) - return -EFAULT; - - return 0; + p->i_key = p->o_key = 0; + p->i_flags = p->o_flags = 0; + return ip_tunnel_ctl(dev, p, cmd); } static const struct net_device_ops ipip_netdev_ops = { .ndo_init = ipip_tunnel_init, .ndo_uninit = ip_tunnel_uninit, .ndo_start_xmit = ipip_tunnel_xmit, - .ndo_do_ioctl = ipip_tunnel_ioctl, + .ndo_do_ioctl = ip_tunnel_ioctl, .ndo_change_mtu = ip_tunnel_change_mtu, .ndo_get_stats64 = ip_tunnel_get_stats64, .ndo_get_iflink = ip_tunnel_get_iflink, + .ndo_tunnel_ctl = ipip_tunnel_ctl, }; #define IPIP_FEATURES (NETIF_F_SG | \ -- cgit v1.2.3 From febd668d375caf13a7fcd93b3498366854de854a Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Mon, 18 May 2020 06:30:09 -0400 Subject: rcuwait: avoid lockdep splats from rcuwait_active() rcuwait_active() only returns whether w->task is not NULL. This is exactly one of the use cases mentioned in the documentation for rcu_access_pointer() where it is correct to bypass lockdep checks. This avoids a splat from kvm_vcpu_on_spin().
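For readers unfamiliar with the two primitives, the distinction is roughly the following (a sketch, not from the patch):

    struct task_struct *t;

    /* rcu_dereference() is for dereferencing the protected pointer and
     * must run inside a read-side critical section (lockdep-checked): */
    rcu_read_lock();
    t = rcu_dereference(w->task);
    if (t)
            wake_up_process(t);
    rcu_read_unlock();

    /* rcu_access_pointer() only tests the pointer value, so an existence
     * check such as rcuwait_active() is valid without rcu_read_lock(): */
    bool busy = rcuwait_active(w);  /* no RCU read lock required */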
Reported-by: Wanpeng Li Tested-by: Wanpeng Li Acked-by: Davidlohr Bueso Cc: Peter Zijlstra Signed-off-by: Paolo Bonzini --- include/linux/rcuwait.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/rcuwait.h b/include/linux/rcuwait.h index c1414ce44abc..61c56cca95c4 100644 --- a/include/linux/rcuwait.h +++ b/include/linux/rcuwait.h @@ -31,7 +31,7 @@ static inline void rcuwait_init(struct rcuwait *w) */ static inline int rcuwait_active(struct rcuwait *w) { - return !!rcu_dereference(w->task); + return !!rcu_access_pointer(w->task); } extern int rcuwait_wake_up(struct rcuwait *w); -- cgit v1.2.3 From a1ceea67f2e5b73cebd456e7fb463b3052bc6344 Mon Sep 17 00:00:00 2001 From: Niklas Schnelle Date: Mon, 27 Apr 2020 16:25:27 +0200 Subject: PCI/IOV: Introduce pci_iov_sysfs_link() function Currently pci_iov_add_virtfn() scans the SR-IOV BARs, adds the VF to the bus and also creates the sysfs links between the newly added VF and its parent PF. With pdev->no_vf_scan fencing off the entire pci_iov_add_virtfn() call, s390, as the sole pdev->no_vf_scan user, thus ends up missing these sysfs links, which are required for example by QEMU/libvirt. Instead of duplicating the code, refactor pci_iov_add_virtfn() to make sysfs link creation callable separately. Signed-off-by: Niklas Schnelle Acked-by: Bjorn Helgaas Reviewed-by: Pierre Morel Link: https://lore.kernel.org/r/20200506154139.90609-1-schnelle@linux.ibm.com Signed-off-by: Vasily Gorbik --- drivers/pci/iov.c | 36 +++++++++++++++++++++++++----------- include/linux/pci.h | 8 ++++++++ 2 files changed, 33 insertions(+), 11 deletions(-) (limited to 'include/linux') diff --git a/drivers/pci/iov.c b/drivers/pci/iov.c index 4d1f392b05f9..ee6fbe688498 100644 --- a/drivers/pci/iov.c +++ b/drivers/pci/iov.c @@ -133,12 +133,35 @@ static void pci_read_vf_config_common(struct pci_dev *virtfn) &physfn->sriov->subsystem_device); } +int pci_iov_sysfs_link(struct pci_dev *dev, + struct pci_dev *virtfn, int id) +{ + char buf[VIRTFN_ID_LEN]; + int rc; + + sprintf(buf, "virtfn%u", id); + rc = sysfs_create_link(&dev->dev.kobj, &virtfn->dev.kobj, buf); + if (rc) + goto failed; + rc = sysfs_create_link(&virtfn->dev.kobj, &dev->dev.kobj, "physfn"); + if (rc) + goto failed1; + + kobject_uevent(&virtfn->dev.kobj, KOBJ_CHANGE); + + return 0; + +failed1: + sysfs_remove_link(&dev->dev.kobj, buf); +failed: + return rc; +} + int pci_iov_add_virtfn(struct pci_dev *dev, int id) { int i; int rc = -ENOMEM; u64 size; - char buf[VIRTFN_ID_LEN]; struct pci_dev *virtfn; struct resource *res; struct pci_sriov *iov = dev->sriov; @@ -182,23 +205,14 @@ int pci_iov_add_virtfn(struct pci_dev *dev, int id) } pci_device_add(virtfn, virtfn->bus); - - sprintf(buf, "virtfn%u", id); - rc = sysfs_create_link(&dev->dev.kobj, &virtfn->dev.kobj, buf); + rc = pci_iov_sysfs_link(dev, virtfn, id); if (rc) goto failed1; - rc = sysfs_create_link(&virtfn->dev.kobj, &dev->dev.kobj, "physfn"); - if (rc) - goto failed2; - - kobject_uevent(&virtfn->dev.kobj, KOBJ_CHANGE); pci_bus_add_device(virtfn); return 0; -failed2: - sysfs_remove_link(&dev->dev.kobj, buf); failed1: pci_stop_and_remove_bus_device(virtfn); pci_dev_put(dev); diff --git a/include/linux/pci.h b/include/linux/pci.h index 83ce1cdf5676..93a063a7d7f9 100644 --- a/include/linux/pci.h +++ b/include/linux/pci.h @@ -2048,6 +2048,8 @@ int pci_iov_virtfn_devfn(struct pci_dev *dev, int id); int pci_enable_sriov(struct pci_dev *dev, int nr_virtfn); void pci_disable_sriov(struct pci_dev *dev); + +int
pci_iov_sysfs_link(struct pci_dev *dev, struct pci_dev *virtfn, int id); int pci_iov_add_virtfn(struct pci_dev *dev, int id); void pci_iov_remove_virtfn(struct pci_dev *dev, int id); int pci_num_vf(struct pci_dev *dev); @@ -2073,6 +2075,12 @@ static inline int pci_iov_virtfn_devfn(struct pci_dev *dev, int id) { } static inline int pci_enable_sriov(struct pci_dev *dev, int nr_virtfn) { return -ENODEV; } + +static inline int pci_iov_sysfs_link(struct pci_dev *dev, + struct pci_dev *virtfn, int id) +{ + return -ENODEV; +} static inline int pci_iov_add_virtfn(struct pci_dev *dev, int id) { return -ENOSYS; -- cgit v1.2.3 From 7769e18c201aa88eade5556faf9da7f2bc15bb8a Mon Sep 17 00:00:00 2001 From: "Andrea Parri (Microsoft)" Date: Mon, 6 Apr 2020 02:15:14 +0200 Subject: scsi: storvsc: Re-init stor_chns when a channel interrupt is re-assigned For each storvsc_device, storvsc keeps track of the channel target CPUs associated with the device (alloced_cpus) and it uses this information to fill a "cache" (stor_chns) mapping CPU->channel according to a certain heuristic. Update the alloced_cpus mask and the stor_chns array when a channel of the storvsc device is re-assigned to a different CPU. Signed-off-by: Andrea Parri (Microsoft) Cc: "James E.J. Bottomley" Cc: "Martin K. Petersen" Cc: Link: https://lore.kernel.org/r/20200406001514.19876-12-parri.andrea@gmail.com Reviewed-by: Long Li Reviewed-by: Michael Kelley [ wei: fix a small issue reported by kbuild test robot ] Signed-off-by: Wei Liu --- drivers/hv/vmbus_drv.c | 4 ++ drivers/scsi/storvsc_drv.c | 96 ++++++++++++++++++++++++++++++++++++++++++---- include/linux/hyperv.h | 3 ++ 3 files changed, 95 insertions(+), 8 deletions(-) (limited to 'include/linux') diff --git a/drivers/hv/vmbus_drv.c b/drivers/hv/vmbus_drv.c index 5d24b25fb5aa..28c009c7a9cd 100644 --- a/drivers/hv/vmbus_drv.c +++ b/drivers/hv/vmbus_drv.c @@ -1777,6 +1777,10 @@ static ssize_t target_cpu_store(struct vmbus_channel *channel, * in on a CPU that is different from the channel target_cpu value. */ + if (channel->change_target_cpu_callback) + (*channel->change_target_cpu_callback)(channel, + channel->target_cpu, target_cpu); + channel->target_cpu = target_cpu; channel->target_vp = hv_cpu_number_to_vp_number(target_cpu); channel->numa_node = cpu_to_node(target_cpu); diff --git a/drivers/scsi/storvsc_drv.c b/drivers/scsi/storvsc_drv.c index fb41636519ee..072ed8728657 100644 --- a/drivers/scsi/storvsc_drv.c +++ b/drivers/scsi/storvsc_drv.c @@ -621,6 +621,64 @@ get_in_err: } +static void storvsc_change_target_cpu(struct vmbus_channel *channel, u32 old, + u32 new) +{ + struct storvsc_device *stor_device; + struct vmbus_channel *cur_chn; + bool old_is_alloced = false; + struct hv_device *device; + unsigned long flags; + int cpu; + + device = channel->primary_channel ? + channel->primary_channel->device_obj + : channel->device_obj; + stor_device = get_out_stor_device(device); + if (!stor_device) + return; + + /* See storvsc_do_io() -> get_og_chn(). */ + spin_lock_irqsave(&device->channel->lock, flags); + + /* + * Determines if the storvsc device has other channels assigned to + * the "old" CPU to update the alloced_cpus mask and the stor_chns + * array.
+ */ + if (device->channel != channel && device->channel->target_cpu == old) { + cur_chn = device->channel; + old_is_alloced = true; + goto old_is_alloced; + } + list_for_each_entry(cur_chn, &device->channel->sc_list, sc_list) { + if (cur_chn == channel) + continue; + if (cur_chn->target_cpu == old) { + old_is_alloced = true; + goto old_is_alloced; + } + } + +old_is_alloced: + if (old_is_alloced) + WRITE_ONCE(stor_device->stor_chns[old], cur_chn); + else + cpumask_clear_cpu(old, &stor_device->alloced_cpus); + + /* "Flush" the stor_chns array. */ + for_each_possible_cpu(cpu) { + if (stor_device->stor_chns[cpu] && !cpumask_test_cpu( + cpu, &stor_device->alloced_cpus)) + WRITE_ONCE(stor_device->stor_chns[cpu], NULL); + } + + WRITE_ONCE(stor_device->stor_chns[new], channel); + cpumask_set_cpu(new, &stor_device->alloced_cpus); + + spin_unlock_irqrestore(&device->channel->lock, flags); +} + static void handle_sc_creation(struct vmbus_channel *new_sc) { struct hv_device *device = new_sc->primary_channel->device_obj; @@ -648,6 +706,8 @@ static void handle_sc_creation(struct vmbus_channel *new_sc) return; } + new_sc->change_target_cpu_callback = storvsc_change_target_cpu; + /* Add the sub-channel to the array of available channels. */ stor_device->stor_chns[new_sc->target_cpu] = new_sc; cpumask_set_cpu(new_sc->target_cpu, &stor_device->alloced_cpus); @@ -876,6 +936,8 @@ static int storvsc_channel_init(struct hv_device *device, bool is_fc) if (stor_device->stor_chns == NULL) return -ENOMEM; + device->channel->change_target_cpu_callback = storvsc_change_target_cpu; + stor_device->stor_chns[device->channel->target_cpu] = device->channel; cpumask_set_cpu(device->channel->target_cpu, &stor_device->alloced_cpus); @@ -1248,8 +1310,10 @@ static struct vmbus_channel *get_og_chn(struct storvsc_device *stor_device, const struct cpumask *node_mask; int num_channels, tgt_cpu; - if (stor_device->num_sc == 0) + if (stor_device->num_sc == 0) { + stor_device->stor_chns[q_num] = stor_device->device->channel; return stor_device->device->channel; + } /* * Our channel array is sparsley populated and we @@ -1258,7 +1322,6 @@ static struct vmbus_channel *get_og_chn(struct storvsc_device *stor_device, * The strategy is simple: * I. Ensure NUMA locality * II. Distribute evenly (best effort) - * III. Mapping is persistent. */ node_mask = cpumask_of_node(cpu_to_node(q_num)); @@ -1268,8 +1331,10 @@ static struct vmbus_channel *get_og_chn(struct storvsc_device *stor_device, if (cpumask_test_cpu(tgt_cpu, node_mask)) num_channels++; } - if (num_channels == 0) + if (num_channels == 0) { + stor_device->stor_chns[q_num] = stor_device->device->channel; return stor_device->device->channel; + } hash_qnum = q_num; while (hash_qnum >= num_channels) @@ -1295,6 +1360,7 @@ static int storvsc_do_io(struct hv_device *device, struct storvsc_device *stor_device; struct vstor_packet *vstor_packet; struct vmbus_channel *outgoing_channel, *channel; + unsigned long flags; int ret = 0; const struct cpumask *node_mask; int tgt_cpu; @@ -1308,10 +1374,11 @@ static int storvsc_do_io(struct hv_device *device, request->device = device; /* - * Select an an appropriate channel to send the request out. + * Select an appropriate channel to send the request out. */ - if (stor_device->stor_chns[q_num] != NULL) { - outgoing_channel = stor_device->stor_chns[q_num]; + /* See storvsc_change_target_cpu(). 
*/ + outgoing_channel = READ_ONCE(stor_device->stor_chns[q_num]); + if (outgoing_channel != NULL) { if (outgoing_channel->target_cpu == q_num) { /* * Ideally, we want to pick a different channel if @@ -1324,7 +1391,10 @@ static int storvsc_do_io(struct hv_device *device, continue; if (tgt_cpu == q_num) continue; - channel = stor_device->stor_chns[tgt_cpu]; + channel = READ_ONCE( + stor_device->stor_chns[tgt_cpu]); + if (channel == NULL) + continue; if (hv_get_avail_to_write_percent( &channel->outbound) > ring_avail_percent_lowater) { @@ -1350,7 +1420,10 @@ static int storvsc_do_io(struct hv_device *device, for_each_cpu(tgt_cpu, &stor_device->alloced_cpus) { if (cpumask_test_cpu(tgt_cpu, node_mask)) continue; - channel = stor_device->stor_chns[tgt_cpu]; + channel = READ_ONCE( + stor_device->stor_chns[tgt_cpu]); + if (channel == NULL) + continue; if (hv_get_avail_to_write_percent( &channel->outbound) > ring_avail_percent_lowater) { @@ -1360,7 +1433,14 @@ static int storvsc_do_io(struct hv_device *device, } } } else { + spin_lock_irqsave(&device->channel->lock, flags); + outgoing_channel = stor_device->stor_chns[q_num]; + if (outgoing_channel != NULL) { + spin_unlock_irqrestore(&device->channel->lock, flags); + goto found_channel; + } outgoing_channel = get_og_chn(stor_device, q_num); + spin_unlock_irqrestore(&device->channel->lock, flags); } found_channel: diff --git a/include/linux/hyperv.h b/include/linux/hyperv.h index b85d7580f2c1..cd64ab7bb3b7 100644 --- a/include/linux/hyperv.h +++ b/include/linux/hyperv.h @@ -773,6 +773,9 @@ struct vmbus_channel { void (*onchannel_callback)(void *context); void *channel_callback_context; + void (*change_target_cpu_callback)(struct vmbus_channel *channel, + u32 old, u32 new); + /* * Synchronize channel scheduling and channel removal; see the inline * comments in vmbus_chan_sched() and vmbus_reset_channel_cb(). -- cgit v1.2.3 From b7d18c57c94a38a81c1dbcfee0d63153ffaf1856 Mon Sep 17 00:00:00 2001 From: Andy Shevchenko Date: Thu, 23 Apr 2020 16:45:05 +0300 Subject: hyper-v: Switch to use UUID types directly uuid_le is an alias for guid_t and is going to be removed in the future. Replace it with original type. Signed-off-by: Andy Shevchenko Link: https://lore.kernel.org/r/20200423134505.78221-4-andriy.shevchenko@linux.intel.com Signed-off-by: Wei Liu --- include/linux/mod_devicetable.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/mod_devicetable.h b/include/linux/mod_devicetable.h index 4c2ddd0941a7..f36b1cf5ffe3 100644 --- a/include/linux/mod_devicetable.h +++ b/include/linux/mod_devicetable.h @@ -434,7 +434,7 @@ struct virtio_device_id { * For Hyper-V devices we use the device guid as the id. */ struct hv_vmbus_device_id { - uuid_le guid; + guid_t guid; kernel_ulong_t driver_data; /* Data private to the driver */ }; -- cgit v1.2.3 From db5871e85533f06ffe1795c1cb9b9153860d1e4f Mon Sep 17 00:00:00 2001 From: "Gustavo A. R. 
Silva" Date: Thu, 7 May 2020 13:53:23 -0500 Subject: vmbus: Replace zero-length array with flexible-array The current codebase makes use of the zero-length array language extension to the C90 standard, but the preferred mechanism to declare variable-length types such as these ones is a flexible array member[1][2], introduced in C99: struct foo { int stuff; struct boo array[]; }; By making use of the mechanism above, we will get a compiler warning in case the flexible array does not occur last in the structure, which will help us prevent some kind of undefined behavior bugs from being inadvertently introduced[3] to the codebase from now on. Also, notice that, dynamic memory allocations won't be affected by this change: "Flexible array members have incomplete type, and so the sizeof operator may not be applied. As a quirk of the original implementation of zero-length arrays, sizeof evaluates to zero."[1] sizeof(flexible-array-member) triggers a warning because flexible array members have incomplete type[1]. There are some instances of code in which the sizeof operator is being incorrectly/erroneously applied to zero-length arrays and the result is zero. Such instances may be hiding some bugs. So, this work (flexible-array member conversions) will also help to get completely rid of those sorts of issues. This issue was found with the help of Coccinelle. [1] https://gcc.gnu.org/onlinedocs/gcc/Zero-Length.html [2] https://github.com/KSPP/linux/issues/21 [3] commit 76497732932f ("cxgb3/l2t: Fix undefined behaviour") Signed-off-by: Gustavo A. R. Silva Link: https://lore.kernel.org/r/20200507185323.GA14416@embeddedor Signed-off-by: Wei Liu --- include/linux/hyperv.h | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) (limited to 'include/linux') diff --git a/include/linux/hyperv.h b/include/linux/hyperv.h index cd64ab7bb3b7..d783847d8cb4 100644 --- a/include/linux/hyperv.h +++ b/include/linux/hyperv.h @@ -117,7 +117,7 @@ struct hv_ring_buffer { * Ring data starts here + RingDataStartOffset * !!! DO NOT place any fields below this !!! */ - u8 buffer[0]; + u8 buffer[]; } __packed; struct hv_ring_buffer_info { @@ -313,7 +313,7 @@ struct vmadd_remove_transfer_page_set { struct gpa_range { u32 byte_count; u32 byte_offset; - u64 pfn_array[0]; + u64 pfn_array[]; }; /* @@ -563,7 +563,7 @@ struct vmbus_channel_gpadl_header { u32 gpadl; u16 range_buflen; u16 rangecount; - struct gpa_range range[0]; + struct gpa_range range[]; } __packed; /* This is the followup packet that contains more PFNs. */ @@ -571,7 +571,7 @@ struct vmbus_channel_gpadl_body { struct vmbus_channel_message_header header; u32 msgnumber; u32 gpadl; - u64 pfn[0]; + u64 pfn[]; } __packed; struct vmbus_channel_gpadl_created { @@ -679,7 +679,7 @@ struct vmbus_channel_msginfo { * The channel message that goes out on the "wire". 
* It will contain at minimum the VMBUS_CHANNEL_MESSAGE_HEADER header */ - unsigned char msg[0]; + unsigned char msg[]; }; struct vmbus_close_msg { -- cgit v1.2.3 From d85234994b2fb2d88fadd3d9e60385b02b244dac Mon Sep 17 00:00:00 2001 From: Tim Harvey Date: Fri, 15 May 2020 10:57:07 -0700 Subject: mfd: Add Gateworks System Controller core driver The Gateworks System Controller (GSC) is an I2C slave controller implemented with an MSP430 micro-controller whose firmware embeds the following features: - I/O expander (16 GPIOs) using PCA955x protocol - Real Time Clock using DS1672 protocol - User EEPROM using AT24 protocol - HWMON using custom protocol - Interrupt controller with tamper detect, user pushbutton - Watchdog controller capable of full board power-cycle - Power Control capable of full board power-cycle See http://trac.gateworks.com/wiki/gsc for more details. Signed-off-by: Tim Harvey Signed-off-by: Lee Jones --- MAINTAINERS | 8 ++ drivers/mfd/Kconfig | 15 +++ drivers/mfd/Makefile | 1 + drivers/mfd/gateworks-gsc.c | 277 ++++++++++++++++++++++++++++++++++++++++++++ include/linux/mfd/gsc.h | 76 ++++++++++++ 5 files changed, 377 insertions(+) create mode 100644 drivers/mfd/gateworks-gsc.c create mode 100644 include/linux/mfd/gsc.h (limited to 'include/linux') diff --git a/MAINTAINERS b/MAINTAINERS index e64e5db31497..a5d5e5d963f1 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -7028,6 +7028,14 @@ F: kernel/futex.c F: tools/perf/bench/futex* F: tools/testing/selftests/futex/ +GATEWORKS SYSTEM CONTROLLER (GSC) DRIVER +M: Tim Harvey +M: Robert Jones +S: Maintained +F: Documentation/devicetree/bindings/mfd/gateworks-gsc.yaml +F: drivers/mfd/gateworks-gsc.c +F: include/linux/mfd/gsc.h + GASKET DRIVER FRAMEWORK M: Rob Springer M: Todd Poynor diff --git a/drivers/mfd/Kconfig b/drivers/mfd/Kconfig index 0a59249198d3..05448e78e5b8 100644 --- a/drivers/mfd/Kconfig +++ b/drivers/mfd/Kconfig @@ -407,6 +407,21 @@ config MFD_EXYNOS_LPASS Select this option to enable support for Samsung Exynos Low Power Audio Subsystem. +config MFD_GATEWORKS_GSC + tristate "Gateworks System Controller" + depends on (I2C && OF) + select MFD_CORE + select REGMAP_I2C + select REGMAP_IRQ + help + Enable support for the Gateworks System Controller (GSC) found + on Gateworks Single Board Computers supporting system functions + such as push-button monitoring, multiple ADCs for voltage and + temperature monitoring, fan controller and watchdog monitor. + This driver provides common support for accessing the device. + Additional drivers must be enabled in order to use the + functionality of the device. + config MFD_MC13XXX tristate depends on (SPI_MASTER || I2C) diff --git a/drivers/mfd/Makefile b/drivers/mfd/Makefile index f935d10cbf0f..ed433aed7010 100644 --- a/drivers/mfd/Makefile +++ b/drivers/mfd/Makefile @@ -15,6 +15,7 @@ obj-$(CONFIG_MFD_BCM590XX) += bcm590xx.o obj-$(CONFIG_MFD_BD9571MWV) += bd9571mwv.o obj-$(CONFIG_MFD_CROS_EC_DEV) += cros_ec_dev.o obj-$(CONFIG_MFD_EXYNOS_LPASS) += exynos-lpass.o +obj-$(CONFIG_MFD_GATEWORKS_GSC) += gateworks-gsc.o obj-$(CONFIG_HTC_PASIC3) += htc-pasic3.o obj-$(CONFIG_HTC_I2CPLD) += htc-i2cpld.o diff --git a/drivers/mfd/gateworks-gsc.c b/drivers/mfd/gateworks-gsc.c new file mode 100644 index 000000000000..576da62fbb0c --- /dev/null +++ b/drivers/mfd/gateworks-gsc.c @@ -0,0 +1,277 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * The Gateworks System Controller (GSC) is a multi-function + * device designed for use in Gateworks Single Board Computers.
+ * The control interface is I2C, with an interrupt. The device supports + * system functions such as push-button monitoring, multiple ADCs for + * voltage and temperature monitoring, fan controller and watchdog monitor. + * + * Copyright (C) 2020 Gateworks Corporation + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +/* + * The GSC suffers from an erratum where occasionally, during + * ADC cycles, the chip can NAK I2C transactions. To ensure reliable + * register access, we retry failed transfers a few times. + */ +#define I2C_RETRIES 3 + +int gsc_write(void *context, unsigned int reg, unsigned int val) +{ + struct i2c_client *client = context; + int retry, ret; + + for (retry = 0; retry < I2C_RETRIES; retry++) { + ret = i2c_smbus_write_byte_data(client, reg, val); + /* + * -EAGAIN returned when the i2c host controller is busy + * -EIO returned when i2c device is busy + */ + if (ret != -EAGAIN && ret != -EIO) + break; + } + + return 0; +} +EXPORT_SYMBOL_GPL(gsc_write); + +int gsc_read(void *context, unsigned int reg, unsigned int *val) +{ + struct i2c_client *client = context; + int retry, ret; + + for (retry = 0; retry < I2C_RETRIES; retry++) { + ret = i2c_smbus_read_byte_data(client, reg); + /* + * -EAGAIN returned when the i2c host controller is busy + * -EIO returned when i2c device is busy + */ + if (ret != -EAGAIN && ret != -EIO) + break; + } + *val = ret & 0xff; + + return 0; +} +EXPORT_SYMBOL_GPL(gsc_read); + +/* + * gsc_powerdown - API to use the GSC to power down the board for a specified time + * + * secs - number of seconds to remain powered off + */ +static int gsc_powerdown(struct gsc_dev *gsc, unsigned long secs) +{ + int ret; + unsigned char regs[4]; + + dev_info(&gsc->i2c->dev, "GSC powerdown for %ld seconds\n", + secs); + + put_unaligned_le32(secs, regs); + ret = regmap_bulk_write(gsc->regmap, GSC_TIME_ADD, regs, 4); + if (ret) + return ret; + + ret = regmap_update_bits(gsc->regmap, GSC_CTRL_1, + BIT(GSC_CTRL_1_SLEEP_ADD), + BIT(GSC_CTRL_1_SLEEP_ADD)); + if (ret) + return ret; + + ret = regmap_update_bits(gsc->regmap, GSC_CTRL_1, + BIT(GSC_CTRL_1_SLEEP_ACTIVATE) | + BIT(GSC_CTRL_1_SLEEP_ENABLE), + BIT(GSC_CTRL_1_SLEEP_ACTIVATE) | + BIT(GSC_CTRL_1_SLEEP_ENABLE)); + + + return ret; +} + +static ssize_t gsc_show(struct device *dev, struct device_attribute *attr, + char *buf) +{ + struct gsc_dev *gsc = dev_get_drvdata(dev); + const char *name = attr->attr.name; + int rz = 0; + + if (strcasecmp(name, "fw_version") == 0) + rz = sprintf(buf, "%d\n", gsc->fwver); + else if (strcasecmp(name, "fw_crc") == 0) + rz = sprintf(buf, "0x%04x\n", gsc->fwcrc); + else + dev_err(dev, "invalid command: '%s'\n", name); + + return rz; +} + +static ssize_t gsc_store(struct device *dev, struct device_attribute *attr, + const char *buf, size_t count) +{ + struct gsc_dev *gsc = dev_get_drvdata(dev); + const char *name = attr->attr.name; + long value; + + if (strcasecmp(name, "powerdown") == 0) { + if (kstrtol(buf, 0, &value) == 0) + gsc_powerdown(gsc, value); + } else { + dev_err(dev, "invalid command: '%s'\n", name); + } + + return count; +} + +static struct device_attribute attr_fwver = + __ATTR(fw_version, 0440, gsc_show, NULL); +static struct device_attribute attr_fwcrc = + __ATTR(fw_crc, 0440, gsc_show, NULL); +static struct device_attribute attr_pwrdown = + __ATTR(powerdown, 0220, NULL, gsc_store); + +static struct attribute *gsc_attrs[] = { + &attr_fwver.attr, + &attr_fwcrc.attr, + &attr_pwrdown.attr, + NULL, +}; + +static
struct attribute_group attr_group = { + .attrs = gsc_attrs, +}; + +static const struct of_device_id gsc_of_match[] = { + { .compatible = "gw,gsc", }, + { } +}; +MODULE_DEVICE_TABLE(of, gsc_of_match); + +static struct regmap_bus gsc_regmap_bus = { + .reg_read = gsc_read, + .reg_write = gsc_write, +}; + +static const struct regmap_config gsc_regmap_config = { + .reg_bits = 8, + .val_bits = 8, + .cache_type = REGCACHE_NONE, + .max_register = GSC_WP, +}; + +static const struct regmap_irq gsc_irqs[] = { + REGMAP_IRQ_REG(GSC_IRQ_PB, 0, BIT(GSC_IRQ_PB)), + REGMAP_IRQ_REG(GSC_IRQ_KEY_ERASED, 0, BIT(GSC_IRQ_KEY_ERASED)), + REGMAP_IRQ_REG(GSC_IRQ_EEPROM_WP, 0, BIT(GSC_IRQ_EEPROM_WP)), + REGMAP_IRQ_REG(GSC_IRQ_RESV, 0, BIT(GSC_IRQ_RESV)), + REGMAP_IRQ_REG(GSC_IRQ_GPIO, 0, BIT(GSC_IRQ_GPIO)), + REGMAP_IRQ_REG(GSC_IRQ_TAMPER, 0, BIT(GSC_IRQ_TAMPER)), + REGMAP_IRQ_REG(GSC_IRQ_WDT_TIMEOUT, 0, BIT(GSC_IRQ_WDT_TIMEOUT)), + REGMAP_IRQ_REG(GSC_IRQ_SWITCH_HOLD, 0, BIT(GSC_IRQ_SWITCH_HOLD)), +}; + +static const struct regmap_irq_chip gsc_irq_chip = { + .name = "gateworks-gsc", + .irqs = gsc_irqs, + .num_irqs = ARRAY_SIZE(gsc_irqs), + .num_regs = 1, + .status_base = GSC_IRQ_STATUS, + .mask_base = GSC_IRQ_ENABLE, + .mask_invert = true, + .ack_base = GSC_IRQ_STATUS, + .ack_invert = true, +}; + +static int gsc_probe(struct i2c_client *client) +{ + struct device *dev = &client->dev; + struct gsc_dev *gsc; + struct regmap_irq_chip_data *irq_data; + int ret; + unsigned int reg; + + gsc = devm_kzalloc(dev, sizeof(*gsc), GFP_KERNEL); + if (!gsc) + return -ENOMEM; + + gsc->dev = &client->dev; + gsc->i2c = client; + i2c_set_clientdata(client, gsc); + + gsc->regmap = devm_regmap_init(dev, &gsc_regmap_bus, client, + &gsc_regmap_config); + if (IS_ERR(gsc->regmap)) + return PTR_ERR(gsc->regmap); + + if (regmap_read(gsc->regmap, GSC_FW_VER, ®)) + return -EIO; + gsc->fwver = reg; + + regmap_read(gsc->regmap, GSC_FW_CRC, ®); + gsc->fwcrc = reg; + regmap_read(gsc->regmap, GSC_FW_CRC + 1, ®); + gsc->fwcrc |= reg << 8; + + gsc->i2c_hwmon = devm_i2c_new_dummy_device(dev, client->adapter, + GSC_HWMON); + if (IS_ERR(gsc->i2c_hwmon)) { + dev_err(dev, "Failed to allocate I2C device for HWMON\n"); + return PTR_ERR(gsc->i2c_hwmon); + } + + ret = devm_regmap_add_irq_chip(dev, gsc->regmap, client->irq, + IRQF_ONESHOT | IRQF_SHARED | + IRQF_TRIGGER_FALLING, 0, + &gsc_irq_chip, &irq_data); + if (ret) + return ret; + + dev_info(dev, "Gateworks System Controller v%d: fw 0x%04x\n", + gsc->fwver, gsc->fwcrc); + + ret = sysfs_create_group(&dev->kobj, &attr_group); + if (ret) + dev_err(dev, "failed to create sysfs attrs\n"); + + ret = devm_of_platform_populate(dev); + if (ret) { + sysfs_remove_group(&dev->kobj, &attr_group); + return ret; + } + + return 0; +} + +static int gsc_remove(struct i2c_client *client) +{ + sysfs_remove_group(&client->dev.kobj, &attr_group); + + return 0; +} + +static struct i2c_driver gsc_driver = { + .driver = { + .name = "gateworks-gsc", + .of_match_table = gsc_of_match, + }, + .probe_new = gsc_probe, + .remove = gsc_remove, +}; +module_i2c_driver(gsc_driver); + +MODULE_AUTHOR("Tim Harvey "); +MODULE_DESCRIPTION("I2C Core interface for GSC"); +MODULE_LICENSE("GPL v2"); diff --git a/include/linux/mfd/gsc.h b/include/linux/mfd/gsc.h new file mode 100644 index 000000000000..6bd639c285b4 --- /dev/null +++ b/include/linux/mfd/gsc.h @@ -0,0 +1,76 @@ +/* SPDX-License-Identifier: GPL-2.0 + * + * Copyright (C) 2020 Gateworks Corporation + */ +#ifndef __LINUX_MFD_GSC_H_ +#define __LINUX_MFD_GSC_H_ + +#include + +/* Device 
Addresses */ +#define GSC_MISC 0x20 +#define GSC_UPDATE 0x21 +#define GSC_GPIO 0x23 +#define GSC_HWMON 0x29 +#define GSC_EEPROM0 0x50 +#define GSC_EEPROM1 0x51 +#define GSC_EEPROM2 0x52 +#define GSC_EEPROM3 0x53 +#define GSC_RTC 0x68 + +/* Register offsets */ +enum { + GSC_CTRL_0 = 0x00, + GSC_CTRL_1 = 0x01, + GSC_TIME = 0x02, + GSC_TIME_ADD = 0x06, + GSC_IRQ_STATUS = 0x0A, + GSC_IRQ_ENABLE = 0x0B, + GSC_FW_CRC = 0x0C, + GSC_FW_VER = 0x0E, + GSC_WP = 0x0F, +}; + +/* Bit definitions */ +#define GSC_CTRL_0_PB_HARD_RESET 0 +#define GSC_CTRL_0_PB_CLEAR_SECURE_KEY 1 +#define GSC_CTRL_0_PB_SOFT_POWER_DOWN 2 +#define GSC_CTRL_0_PB_BOOT_ALTERNATE 3 +#define GSC_CTRL_0_PERFORM_CRC 4 +#define GSC_CTRL_0_TAMPER_DETECT 5 +#define GSC_CTRL_0_SWITCH_HOLD 6 + +#define GSC_CTRL_1_SLEEP_ENABLE 0 +#define GSC_CTRL_1_SLEEP_ACTIVATE 1 +#define GSC_CTRL_1_SLEEP_ADD 2 +#define GSC_CTRL_1_SLEEP_NOWAKEPB 3 +#define GSC_CTRL_1_WDT_TIME 4 +#define GSC_CTRL_1_WDT_ENABLE 5 +#define GSC_CTRL_1_SWITCH_BOOT_ENABLE 6 +#define GSC_CTRL_1_SWITCH_BOOT_CLEAR 7 + +#define GSC_IRQ_PB 0 +#define GSC_IRQ_KEY_ERASED 1 +#define GSC_IRQ_EEPROM_WP 2 +#define GSC_IRQ_RESV 3 +#define GSC_IRQ_GPIO 4 +#define GSC_IRQ_TAMPER 5 +#define GSC_IRQ_WDT_TIMEOUT 6 +#define GSC_IRQ_SWITCH_HOLD 7 + +int gsc_read(void *context, unsigned int reg, unsigned int *val); +int gsc_write(void *context, unsigned int reg, unsigned int val); + +struct gsc_dev { + struct device *dev; + + struct i2c_client *i2c; /* 0x20: interrupt controller, WDT */ + struct i2c_client *i2c_hwmon; /* 0x29: hwmon, fan controller */ + + struct regmap *regmap; + + unsigned int fwver; + unsigned short fwcrc; +}; + +#endif /* __LINUX_MFD_GSC_H_ */ -- cgit v1.2.3 From 3bce5377ef66a8700dcf7a9cb89b7aeb99326cb7 Mon Sep 17 00:00:00 2001 From: Tim Harvey Date: Fri, 15 May 2020 10:57:08 -0700 Subject: hwmon: Add Gateworks System Controller support The Gateworks System Controller has a hwmon sub-component that exposes up to 16 ADCs, some of which are temperature sensors and others voltage inputs. The ADC configuration (register mapping and name) is described via device tree and varies from board to board. Signed-off-by: Tim Harvey Reviewed-by: Guenter Roeck Signed-off-by: Lee Jones --- Documentation/hwmon/gsc-hwmon.rst | 53 +++++ Documentation/hwmon/index.rst | 1 + MAINTAINERS | 3 + drivers/hwmon/Kconfig | 9 + drivers/hwmon/Makefile | 1 + drivers/hwmon/gsc-hwmon.c | 390 ++++++++++++++++++++++++++++++++ include/linux/platform_data/gsc_hwmon.h | 44 ++++ 7 files changed, 501 insertions(+) create mode 100644 Documentation/hwmon/gsc-hwmon.rst create mode 100644 drivers/hwmon/gsc-hwmon.c create mode 100644 include/linux/platform_data/gsc_hwmon.h (limited to 'include/linux') diff --git a/Documentation/hwmon/gsc-hwmon.rst b/Documentation/hwmon/gsc-hwmon.rst new file mode 100644 index 000000000000..ffac392a7129 --- /dev/null +++ b/Documentation/hwmon/gsc-hwmon.rst @@ -0,0 +1,53 @@ +.. SPDX-License-Identifier: GPL-2.0 + +Kernel driver gsc-hwmon +======================= + +Supported chips: Gateworks GSC +Datasheet: http://trac.gateworks.com/wiki/gsc +Author: Tim Harvey + +Description: +------------ + +This driver supports hardware monitoring for the temperature sensor, +various ADCs connected to the GSC, and the optional fan controller available +on some boards. + + +Voltage Monitoring +------------------ + +The voltage inputs are scaled either internally or by the driver depending +on the GSC version and firmware. The values returned by the driver do not need +further scaling.
The voltage input labels provide the voltage rail name: +inX_input Measured voltage (mV). +inX_label Name of voltage rail. + + +Temperature Monitoring +---------------------- + +Temperatures are measured with 12-bit or 10-bit resolution and are scaled +either internally or by the driver depending on the GSC version and firmware. +The values returned by the driver reflect millidegrees Celsius: + +tempX_input Measured temperature. +tempX_label Name of temperature input. + + +PWM Output Control +------------------ + +The GSC features one PWM output that operates in automatic mode, where the +PWM value is scaled according to six temperature boundaries. +The temperature boundaries are read-write, in millidegrees Celsius, and the +read-only PWM values range from 0 (off) to 255 (full speed). +Fan speed will be set to minimum (off) when the temperature sensor reads +less than pwm1_auto_point1_temp and maximum when the temperature sensor +equals or exceeds pwm1_auto_point6_temp. + +pwm1_auto_point[1-6]_pwm PWM value. +pwm1_auto_point[1-6]_temp Temperature boundary. + diff --git a/Documentation/hwmon/index.rst b/Documentation/hwmon/index.rst index 8ef62fd39787..1c02aa697a02 100644 --- a/Documentation/hwmon/index.rst +++ b/Documentation/hwmon/index.rst @@ -60,6 +60,7 @@ Hardware Monitoring Kernel Drivers ftsteutates g760a g762 + gsc-hwmon gl518sm hih6130 ibmaem diff --git a/MAINTAINERS b/MAINTAINERS index a5d5e5d963f1..d9cee5698bcc 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -7035,6 +7035,9 @@ S: Maintained F: Documentation/devicetree/bindings/mfd/gateworks-gsc.yaml F: drivers/mfd/gateworks-gsc.c F: include/linux/mfd/gsc.h +F: Documentation/hwmon/gsc-hwmon.rst +F: drivers/hwmon/gsc-hwmon.c +F: include/linux/platform_data/gsc_hwmon.h GASKET DRIVER FRAMEWORK M: Rob Springer diff --git a/drivers/hwmon/Kconfig b/drivers/hwmon/Kconfig index 05a30832c6ba..d008b445baba 100644 --- a/drivers/hwmon/Kconfig +++ b/drivers/hwmon/Kconfig @@ -523,6 +523,15 @@ config SENSORS_F75375S This driver can also be built as a module. If so, the module will be called f75375s. +config SENSORS_GSC + tristate "Gateworks System Controller ADC" + depends on MFD_GATEWORKS_GSC + help + Support for the Gateworks System Controller A/D converters. + + To compile this driver as a module, choose M here: + the module will be called gsc-hwmon.
+ config SENSORS_MC13783_ADC tristate "Freescale MC13783/MC13892 ADC" depends on MFD_MC13XXX diff --git a/drivers/hwmon/Makefile b/drivers/hwmon/Makefile index b0b9c8e57176..a6564d00d94c 100644 --- a/drivers/hwmon/Makefile +++ b/drivers/hwmon/Makefile @@ -74,6 +74,7 @@ obj-$(CONFIG_SENSORS_G760A) += g760a.o obj-$(CONFIG_SENSORS_G762) += g762.o obj-$(CONFIG_SENSORS_GL518SM) += gl518sm.o obj-$(CONFIG_SENSORS_GL520SM) += gl520sm.o +obj-$(CONFIG_SENSORS_GSC) += gsc-hwmon.o obj-$(CONFIG_SENSORS_GPIO_FAN) += gpio-fan.o obj-$(CONFIG_SENSORS_HIH6130) += hih6130.o obj-$(CONFIG_SENSORS_ULTRA45) += ultra45_env.o diff --git a/drivers/hwmon/gsc-hwmon.c b/drivers/hwmon/gsc-hwmon.c new file mode 100644 index 000000000000..2137bc65829d --- /dev/null +++ b/drivers/hwmon/gsc-hwmon.c @@ -0,0 +1,390 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Driver for Gateworks System Controller Hardware Monitor module + * + * Copyright (C) 2020 Gateworks Corporation + */ +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +#define GSC_HWMON_MAX_TEMP_CH 16 +#define GSC_HWMON_MAX_IN_CH 16 + +#define GSC_HWMON_RESOLUTION 12 +#define GSC_HWMON_VREF 2500 + +struct gsc_hwmon_data { + struct gsc_dev *gsc; + struct gsc_hwmon_platform_data *pdata; + struct regmap *regmap; + const struct gsc_hwmon_channel *temp_ch[GSC_HWMON_MAX_TEMP_CH]; + const struct gsc_hwmon_channel *in_ch[GSC_HWMON_MAX_IN_CH]; + u32 temp_config[GSC_HWMON_MAX_TEMP_CH + 1]; + u32 in_config[GSC_HWMON_MAX_IN_CH + 1]; + struct hwmon_channel_info temp_info; + struct hwmon_channel_info in_info; + const struct hwmon_channel_info *info[3]; + struct hwmon_chip_info chip; +}; + +static struct regmap_bus gsc_hwmon_regmap_bus = { + .reg_read = gsc_read, + .reg_write = gsc_write, +}; + +static const struct regmap_config gsc_hwmon_regmap_config = { + .reg_bits = 8, + .val_bits = 8, + .cache_type = REGCACHE_NONE, +}; + +static ssize_t pwm_auto_point_temp_show(struct device *dev, + struct device_attribute *devattr, + char *buf) +{ + struct gsc_hwmon_data *hwmon = dev_get_drvdata(dev); + struct sensor_device_attribute *attr = to_sensor_dev_attr(devattr); + u8 reg = hwmon->pdata->fan_base + (2 * attr->index); + u8 regs[2]; + int ret; + + ret = regmap_bulk_read(hwmon->regmap, reg, regs, 2); + if (ret) + return ret; + + ret = regs[0] | regs[1] << 8; + return sprintf(buf, "%d\n", ret * 10); +} + +static ssize_t pwm_auto_point_temp_store(struct device *dev, + struct device_attribute *devattr, + const char *buf, size_t count) +{ + struct gsc_hwmon_data *hwmon = dev_get_drvdata(dev); + struct sensor_device_attribute *attr = to_sensor_dev_attr(devattr); + u8 reg = hwmon->pdata->fan_base + (2 * attr->index); + u8 regs[2]; + long temp; + int err; + + if (kstrtol(buf, 10, &temp)) + return -EINVAL; + + temp = clamp_val(temp, 0, 10000); + temp = DIV_ROUND_CLOSEST(temp, 10); + + regs[0] = temp & 0xff; + regs[1] = (temp >> 8) & 0xff; + err = regmap_bulk_write(hwmon->regmap, reg, regs, 2); + if (err) + return err; + + return count; +} + +static ssize_t pwm_auto_point_pwm_show(struct device *dev, + struct device_attribute *devattr, + char *buf) +{ + struct sensor_device_attribute *attr = to_sensor_dev_attr(devattr); + + return sprintf(buf, "%d\n", 255 * (50 + (attr->index * 10)) / 100); +} + +static SENSOR_DEVICE_ATTR_RO(pwm1_auto_point1_pwm, pwm_auto_point_pwm, 0); +static SENSOR_DEVICE_ATTR_RW(pwm1_auto_point1_temp, pwm_auto_point_temp, 0); + +static SENSOR_DEVICE_ATTR_RO(pwm1_auto_point2_pwm, pwm_auto_point_pwm, 1); +static 
SENSOR_DEVICE_ATTR_RW(pwm1_auto_point2_temp, pwm_auto_point_temp, 1); + +static SENSOR_DEVICE_ATTR_RO(pwm1_auto_point3_pwm, pwm_auto_point_pwm, 2); +static SENSOR_DEVICE_ATTR_RW(pwm1_auto_point3_temp, pwm_auto_point_temp, 2); + +static SENSOR_DEVICE_ATTR_RO(pwm1_auto_point4_pwm, pwm_auto_point_pwm, 3); +static SENSOR_DEVICE_ATTR_RW(pwm1_auto_point4_temp, pwm_auto_point_temp, 3); + +static SENSOR_DEVICE_ATTR_RO(pwm1_auto_point5_pwm, pwm_auto_point_pwm, 4); +static SENSOR_DEVICE_ATTR_RW(pwm1_auto_point5_temp, pwm_auto_point_temp, 4); + +static SENSOR_DEVICE_ATTR_RO(pwm1_auto_point6_pwm, pwm_auto_point_pwm, 5); +static SENSOR_DEVICE_ATTR_RW(pwm1_auto_point6_temp, pwm_auto_point_temp, 5); + +static struct attribute *gsc_hwmon_attributes[] = { + &sensor_dev_attr_pwm1_auto_point1_pwm.dev_attr.attr, + &sensor_dev_attr_pwm1_auto_point1_temp.dev_attr.attr, + &sensor_dev_attr_pwm1_auto_point2_pwm.dev_attr.attr, + &sensor_dev_attr_pwm1_auto_point2_temp.dev_attr.attr, + &sensor_dev_attr_pwm1_auto_point3_pwm.dev_attr.attr, + &sensor_dev_attr_pwm1_auto_point3_temp.dev_attr.attr, + &sensor_dev_attr_pwm1_auto_point4_pwm.dev_attr.attr, + &sensor_dev_attr_pwm1_auto_point4_temp.dev_attr.attr, + &sensor_dev_attr_pwm1_auto_point5_pwm.dev_attr.attr, + &sensor_dev_attr_pwm1_auto_point5_temp.dev_attr.attr, + &sensor_dev_attr_pwm1_auto_point6_pwm.dev_attr.attr, + &sensor_dev_attr_pwm1_auto_point6_temp.dev_attr.attr, + NULL +}; + +static const struct attribute_group gsc_hwmon_group = { + .attrs = gsc_hwmon_attributes, +}; +__ATTRIBUTE_GROUPS(gsc_hwmon); + +static int +gsc_hwmon_read(struct device *dev, enum hwmon_sensor_types type, u32 attr, + int channel, long *val) +{ + struct gsc_hwmon_data *hwmon = dev_get_drvdata(dev); + const struct gsc_hwmon_channel *ch; + int sz, ret; + long tmp; + u8 buf[3]; + + switch (type) { + case hwmon_in: + ch = hwmon->in_ch[channel]; + break; + case hwmon_temp: + ch = hwmon->temp_ch[channel]; + break; + default: + return -EOPNOTSUPP; + } + + sz = (ch->mode == mode_voltage) ? 
3 : 2; + ret = regmap_bulk_read(hwmon->regmap, ch->reg, buf, sz); + if (ret) + return ret; + + tmp = 0; + while (sz-- > 0) + tmp |= (buf[sz] << (8 * sz)); + + switch (ch->mode) { + case mode_temperature: + if (tmp > 0x8000) + tmp -= 0xffff; + break; + case mode_voltage_raw: + tmp = clamp_val(tmp, 0, BIT(GSC_HWMON_RESOLUTION)); + /* scale based on ref voltage and ADC resolution */ + tmp *= GSC_HWMON_VREF; + tmp >>= GSC_HWMON_RESOLUTION; + /* scale based on optional voltage divider */ + if (ch->vdiv[0] && ch->vdiv[1]) { + tmp *= (ch->vdiv[0] + ch->vdiv[1]); + tmp /= ch->vdiv[1]; + } + /* adjust by uV offset */ + tmp += ch->mvoffset; + break; + case mode_voltage: + /* no adjustment needed */ + break; + } + + *val = tmp; + + return 0; +} + +static int +gsc_hwmon_read_string(struct device *dev, enum hwmon_sensor_types type, + u32 attr, int channel, const char **buf) +{ + struct gsc_hwmon_data *hwmon = dev_get_drvdata(dev); + + switch (type) { + case hwmon_in: + *buf = hwmon->in_ch[channel]->name; + break; + case hwmon_temp: + *buf = hwmon->temp_ch[channel]->name; + break; + default: + return -ENOTSUPP; + } + + return 0; +} + +static umode_t +gsc_hwmon_is_visible(const void *_data, enum hwmon_sensor_types type, u32 attr, + int ch) +{ + return 0444; +} + +static const struct hwmon_ops gsc_hwmon_ops = { + .is_visible = gsc_hwmon_is_visible, + .read = gsc_hwmon_read, + .read_string = gsc_hwmon_read_string, +}; + +static struct gsc_hwmon_platform_data * +gsc_hwmon_get_devtree_pdata(struct device *dev) +{ + struct gsc_hwmon_platform_data *pdata; + struct gsc_hwmon_channel *ch; + struct fwnode_handle *child; + struct device_node *fan; + int nchannels; + + nchannels = device_get_child_node_count(dev); + if (nchannels == 0) + return ERR_PTR(-ENODEV); + + pdata = devm_kzalloc(dev, + sizeof(*pdata) + nchannels * sizeof(*ch), + GFP_KERNEL); + if (!pdata) + return ERR_PTR(-ENOMEM); + ch = (struct gsc_hwmon_channel *)(pdata + 1); + pdata->channels = ch; + pdata->nchannels = nchannels; + + /* fan controller base address */ + fan = of_find_compatible_node(dev->parent->of_node, NULL, "gw,gsc-fan"); + if (fan && of_property_read_u32(fan, "reg", &pdata->fan_base)) { + dev_err(dev, "fan node without base\n"); + return ERR_PTR(-EINVAL); + } + + /* allocate structures for channels and count instances of each type */ + device_for_each_child_node(dev, child) { + if (fwnode_property_read_string(child, "label", &ch->name)) { + dev_err(dev, "channel without label\n"); + fwnode_handle_put(child); + return ERR_PTR(-EINVAL); + } + if (fwnode_property_read_u32(child, "reg", &ch->reg)) { + dev_err(dev, "channel without reg\n"); + fwnode_handle_put(child); + return ERR_PTR(-EINVAL); + } + if (fwnode_property_read_u32(child, "gw,mode", &ch->mode)) { + dev_err(dev, "channel without mode\n"); + fwnode_handle_put(child); + return ERR_PTR(-EINVAL); + } + if (ch->mode > mode_max) { + dev_err(dev, "invalid channel mode\n"); + fwnode_handle_put(child); + return ERR_PTR(-EINVAL); + } + + if (!fwnode_property_read_u32(child, + "gw,voltage-offset-microvolt", + &ch->mvoffset)) + ch->mvoffset /= 1000; + fwnode_property_read_u32_array(child, + "gw,voltage-divider-ohms", + ch->vdiv, ARRAY_SIZE(ch->vdiv)); + ch++; + } + + return pdata; +} + +static int gsc_hwmon_probe(struct platform_device *pdev) +{ + struct gsc_dev *gsc = dev_get_drvdata(pdev->dev.parent); + struct device *dev = &pdev->dev; + struct device *hwmon_dev; + struct gsc_hwmon_platform_data *pdata = dev_get_platdata(dev); + struct gsc_hwmon_data *hwmon; + const struct 
attribute_group **groups; + int i, i_in, i_temp; + + if (!pdata) { + pdata = gsc_hwmon_get_devtree_pdata(dev); + if (IS_ERR(pdata)) + return PTR_ERR(pdata); + } + + hwmon = devm_kzalloc(dev, sizeof(*hwmon), GFP_KERNEL); + if (!hwmon) + return -ENOMEM; + hwmon->gsc = gsc; + hwmon->pdata = pdata; + + hwmon->regmap = devm_regmap_init(dev, &gsc_hwmon_regmap_bus, + gsc->i2c_hwmon, + &gsc_hwmon_regmap_config); + if (IS_ERR(hwmon->regmap)) + return PTR_ERR(hwmon->regmap); + + for (i = 0, i_in = 0, i_temp = 0; i < hwmon->pdata->nchannels; i++) { + const struct gsc_hwmon_channel *ch = &pdata->channels[i]; + + switch (ch->mode) { + case mode_temperature: + if (i_temp == GSC_HWMON_MAX_TEMP_CH) { + dev_err(gsc->dev, "too many temp channels\n"); + return -EINVAL; + } + hwmon->temp_ch[i_temp] = ch; + hwmon->temp_config[i_temp] = HWMON_T_INPUT | + HWMON_T_LABEL; + i_temp++; + break; + case mode_voltage: + case mode_voltage_raw: + if (i_in == GSC_HWMON_MAX_IN_CH) { + dev_err(gsc->dev, "too many input channels\n"); + return -EINVAL; + } + hwmon->in_ch[i_in] = ch; + hwmon->in_config[i_in] = + HWMON_I_INPUT | HWMON_I_LABEL; + i_in++; + break; + default: + dev_err(gsc->dev, "invalid mode: %d\n", ch->mode); + return -EINVAL; + } + } + + /* setup config structures */ + hwmon->chip.ops = &gsc_hwmon_ops; + hwmon->chip.info = hwmon->info; + hwmon->info[0] = &hwmon->temp_info; + hwmon->info[1] = &hwmon->in_info; + hwmon->temp_info.type = hwmon_temp; + hwmon->temp_info.config = hwmon->temp_config; + hwmon->in_info.type = hwmon_in; + hwmon->in_info.config = hwmon->in_config; + + groups = pdata->fan_base ? gsc_hwmon_groups : NULL; + hwmon_dev = devm_hwmon_device_register_with_info(dev, + KBUILD_MODNAME, hwmon, + &hwmon->chip, groups); + return PTR_ERR_OR_ZERO(hwmon_dev); +} + +static const struct of_device_id gsc_hwmon_of_match[] = { + { .compatible = "gw,gsc-adc", }, + {} +}; + +static struct platform_driver gsc_hwmon_driver = { + .driver = { + .name = "gsc-hwmon", + .of_match_table = gsc_hwmon_of_match, + }, + .probe = gsc_hwmon_probe, +}; + +module_platform_driver(gsc_hwmon_driver); + +MODULE_AUTHOR("Tim Harvey "); +MODULE_DESCRIPTION("GSC hardware monitor driver"); +MODULE_LICENSE("GPL v2"); diff --git a/include/linux/platform_data/gsc_hwmon.h b/include/linux/platform_data/gsc_hwmon.h new file mode 100644 index 000000000000..ec1611aff863 --- /dev/null +++ b/include/linux/platform_data/gsc_hwmon.h @@ -0,0 +1,44 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _GSC_HWMON_H +#define _GSC_HWMON_H + +enum gsc_hwmon_mode { + mode_temperature, + mode_voltage, + mode_voltage_raw, + mode_max, +}; + +/** + * struct gsc_hwmon_channel - configuration parameters + * @reg: I2C register offset + * @mode: channel mode + * @name: channel name + * @mvoffset: voltage offset + * @vdiv: voltage divider array (2 resistor values in milli-ohms) + */ +struct gsc_hwmon_channel { + unsigned int reg; + unsigned int mode; + const char *name; + unsigned int mvoffset; + unsigned int vdiv[2]; +}; + +/** + * struct gsc_hwmon_platform_data - platform data for gsc_hwmon driver + * @channels: pointer to array of gsc_hwmon_channel structures + * describing channels + * @nchannels: number of elements in @channels array + * @vreference: voltage reference (mV) + * @resolution: ADC bit resolution + * @fan_base: register base for FAN controller + */ +struct gsc_hwmon_platform_data { + const struct gsc_hwmon_channel *channels; + int nchannels; + unsigned int resolution; + unsigned int vreference; + unsigned int fan_base; +}; +#endif -- cgit v1.2.3 From 
6bf393c577c4a6e324ab103425fbf71126e5385b Mon Sep 17 00:00:00 2001 From: Pierre-Louis Bossart Date: Tue, 19 May 2020 04:35:49 +0800 Subject: soundwire: disco: s/ch/channels/ Use more meaningful member names in preparation for sysfs support. No functionality change. Signed-off-by: Pierre-Louis Bossart Signed-off-by: Bard Liao Link: https://lore.kernel.org/r/20200518203551.2053-2-yung-chuan.liao@linux.intel.com Signed-off-by: Vinod Koul --- drivers/soundwire/mipi_disco.c | 11 ++++++----- include/linux/soundwire/sdw.h | 8 ++++---- 2 files changed, 10 insertions(+), 9 deletions(-) (limited to 'include/linux') diff --git a/drivers/soundwire/mipi_disco.c b/drivers/soundwire/mipi_disco.c index 844e6b22974f..4ae62b452b8c 100644 --- a/drivers/soundwire/mipi_disco.c +++ b/drivers/soundwire/mipi_disco.c @@ -231,16 +231,17 @@ static int sdw_slave_read_dpn(struct sdw_slave *slave, nval = fwnode_property_count_u32(node, "mipi-sdw-channel-number-list"); if (nval > 0) { - dpn[i].num_ch = nval; - dpn[i].ch = devm_kcalloc(&slave->dev, dpn[i].num_ch, - sizeof(*dpn[i].ch), + dpn[i].num_channels = nval; + dpn[i].channels = devm_kcalloc(&slave->dev, + dpn[i].num_channels, + sizeof(*dpn[i].channels), GFP_KERNEL); - if (!dpn[i].ch) + if (!dpn[i].channels) return -ENOMEM; fwnode_property_read_u32_array(node, "mipi-sdw-channel-number-list", - dpn[i].ch, dpn[i].num_ch); + dpn[i].channels, dpn[i].num_channels); } nval = fwnode_property_count_u32(node, "mipi-sdw-channel-combination-list"); diff --git a/include/linux/soundwire/sdw.h b/include/linux/soundwire/sdw.h index 7658d9698dd5..9c27a32df9bb 100644 --- a/include/linux/soundwire/sdw.h +++ b/include/linux/soundwire/sdw.h @@ -291,8 +291,8 @@ struct sdw_dpn_audio_mode { * implementation-defined interrupts * @max_ch: Maximum channels supported * @min_ch: Minimum channels supported - * @num_ch: Number of discrete channels supported - * @ch: Discrete channels supported + * @num_channels: Number of discrete channels supported + * @channels: Discrete channels supported * @num_ch_combinations: Number of channel combinations supported * @ch_combinations: Channel combinations supported * @modes: SDW mode supported @@ -316,8 +316,8 @@ struct sdw_dpn_prop { u32 imp_def_interrupts; u32 max_ch; u32 min_ch; - u32 num_ch; - u32 *ch; + u32 num_channels; + u32 *channels; u32 num_ch_combinations; u32 *ch_combinations; u32 modes; -- cgit v1.2.3 From 6fe12cdbcfe35ad4726a619a9546822d34fc934c Mon Sep 17 00:00:00 2001 From: Bibby Hsieh Date: Tue, 19 May 2020 15:27:29 +0800 Subject: i2c: core: support bus regulator controlling in adapter Although on most platforms the I2C bus power is always on, some platforms disable the I2C bus power in order to meet low-power requirements. Get and enable the bus regulator in the I2C adapter device.
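The asymmetry in the PM callbacks below is deliberate: the runtime PM hooks toggle the supply unconditionally, while the late/early system-sleep hooks and the remove path first check pm_runtime_status_suspended(), because a runtime-suspended client has already had its supply dropped. A minimal sketch of that guard, factored into a hypothetical helper (i2c_bus_supply_set() is illustrative only and not part of this patch):

	static int i2c_bus_supply_set(struct i2c_client *client, bool on)
	{
		struct regulator *reg = client->adapter->bus_regulator;

		/* Runtime PM owns the supply while the client is runtime suspended. */
		if (pm_runtime_status_suspended(&client->dev))
			return 0;

		return on ? regulator_enable(reg) : regulator_disable(reg);
	}
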
Signed-off-by: Bibby Hsieh Reviewed-by: Tomasz Figa Signed-off-by: Wolfram Sang --- drivers/i2c/i2c-core-base.c | 84 +++++++++++++++++++++++++++++++++++++++++++++ include/linux/i2c.h | 2 ++ 2 files changed, 86 insertions(+) (limited to 'include/linux') diff --git a/drivers/i2c/i2c-core-base.c b/drivers/i2c/i2c-core-base.c index a7c21d4cbd90..5be24bf8a194 100644 --- a/drivers/i2c/i2c-core-base.c +++ b/drivers/i2c/i2c-core-base.c @@ -313,12 +313,14 @@ static int i2c_smbus_host_notify_to_irq(const struct i2c_client *client) static int i2c_device_probe(struct device *dev) { struct i2c_client *client = i2c_verify_client(dev); + struct i2c_adapter *adap; struct i2c_driver *driver; int status; if (!client) return 0; + adap = client->adapter; driver = to_i2c_driver(dev->driver); client->irq = client->init_irq; @@ -378,6 +380,12 @@ static int i2c_device_probe(struct device *dev) dev_dbg(dev, "probe\n"); + status = regulator_enable(adap->bus_regulator); + if (status < 0) { + dev_err(&adap->dev, "Failed to enable power regulator\n"); + goto err_clear_wakeup_irq; + } + status = of_clk_set_defaults(dev->of_node, false); if (status < 0) goto err_clear_wakeup_irq; @@ -414,12 +422,14 @@ err_clear_wakeup_irq: static int i2c_device_remove(struct device *dev) { struct i2c_client *client = i2c_verify_client(dev); + struct i2c_adapter *adap; struct i2c_driver *driver; int status = 0; if (!client || !dev->driver) return 0; + adap = client->adapter; driver = to_i2c_driver(dev->driver); if (driver->remove) { dev_dbg(dev, "remove\n"); @@ -427,6 +437,8 @@ static int i2c_device_remove(struct device *dev) } dev_pm_domain_detach(&client->dev, true); + if (!pm_runtime_status_suspended(&client->dev)) + regulator_disable(adap->bus_regulator); dev_pm_clear_wake_irq(&client->dev); device_init_wakeup(&client->dev, false); @@ -438,6 +450,72 @@ static int i2c_device_remove(struct device *dev) return status; } +#ifdef CONFIG_PM_SLEEP +static int i2c_resume_early(struct device *dev) +{ + struct i2c_client *client = i2c_verify_client(dev); + struct i2c_adapter *adap = client->adapter; + int err; + + if (!pm_runtime_status_suspended(&client->dev)) { + err = regulator_enable(adap->bus_regulator); + if (err) + return err; + } + + return pm_generic_resume_early(&client->dev); +} + +static int i2c_suspend_late(struct device *dev) +{ + struct i2c_client *client = i2c_verify_client(dev); + struct i2c_adapter *adap = client->adapter; + int err; + + err = pm_generic_suspend_late(&client->dev); + if (err) + return err; + + if (!pm_runtime_status_suspended(&client->dev)) + return regulator_disable(adap->bus_regulator); + + return 0; +} +#endif + +#ifdef CONFIG_PM +static int i2c_runtime_resume(struct device *dev) +{ + struct i2c_client *client = i2c_verify_client(dev); + struct i2c_adapter *adap = client->adapter; + int err; + + err = regulator_enable(adap->bus_regulator); + if (err) + return err; + + return pm_generic_runtime_resume(&client->dev); +} + +static int i2c_runtime_suspend(struct device *dev) +{ + struct i2c_client *client = i2c_verify_client(dev); + struct i2c_adapter *adap = client->adapter; + int err; + + err = pm_generic_runtime_suspend(&client->dev); + if (err) + return err; + + return regulator_disable(adap->bus_regulator); +} +#endif + +static const struct dev_pm_ops i2c_device_pm = { + SET_LATE_SYSTEM_SLEEP_PM_OPS(i2c_suspend_late, i2c_resume_early) + SET_RUNTIME_PM_OPS(i2c_runtime_suspend, i2c_runtime_resume, NULL) +}; + static void i2c_device_shutdown(struct device *dev) { struct i2c_client *client = 
i2c_verify_client(dev); @@ -495,6 +573,7 @@ struct bus_type i2c_bus_type = { .probe = i2c_device_probe, .remove = i2c_device_remove, .shutdown = i2c_device_shutdown, + .pm = &i2c_device_pm, }; EXPORT_SYMBOL_GPL(i2c_bus_type); @@ -1333,6 +1412,11 @@ static int i2c_register_adapter(struct i2c_adapter *adap) if (res) goto out_reg; + adap->bus_regulator = devm_regulator_get(&adap->dev, "bus"); + if (IS_ERR(adap->bus_regulator)) { + res = PTR_ERR(adap->bus_regulator); + goto out_reg; + } dev_dbg(&adap->dev, "adapter [%s] registered\n", adap->name); pm_runtime_no_callbacks(&adap->dev); diff --git a/include/linux/i2c.h b/include/linux/i2c.h index 456fc17ecb1c..bc83af0d38d1 100644 --- a/include/linux/i2c.h +++ b/include/linux/i2c.h @@ -15,6 +15,7 @@ #include /* for struct device */ #include /* for completion */ #include +#include #include #include /* for Host Notify IRQ */ #include /* for struct device_node */ @@ -721,6 +722,7 @@ struct i2c_adapter { const struct i2c_adapter_quirks *quirks; struct irq_domain *host_notify_domain; + struct regulator *bus_regulator; }; #define to_i2c_adapter(d) container_of(d, struct i2c_adapter, dev) -- cgit v1.2.3 From f6dd975583bd8ce088400648fd9819e4691c8958 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 20 May 2020 17:58:12 +0200 Subject: pipe: merge anon_pipe_buf*_ops All the op vectors are exactly the same, they are just used to encode packet or nomerge behavior. There already is a flag for the packet behavior, so just add a new one to allow for merging. Inverting it vs the previous nomerge special casing actually allows for much nicer code. Signed-off-by: Christoph Hellwig Signed-off-by: Al Viro --- fs/pipe.c | 45 +++++---------------------------------------- fs/splice.c | 12 +++++------- include/linux/pipe_fs_i.h | 2 +- 3 files changed, 11 insertions(+), 48 deletions(-) (limited to 'include/linux') diff --git a/fs/pipe.c b/fs/pipe.c index 16fb72e9abf7..8e52b78b4042 100644 --- a/fs/pipe.c +++ b/fs/pipe.c @@ -231,7 +231,6 @@ void generic_pipe_buf_release(struct pipe_inode_info *pipe, } EXPORT_SYMBOL(generic_pipe_buf_release); -/* New data written to a pipe may be appended to a buffer with this type. */ static const struct pipe_buf_operations anon_pipe_buf_ops = { .confirm = generic_pipe_buf_confirm, .release = anon_pipe_buf_release, @@ -239,40 +238,6 @@ static const struct pipe_buf_operations anon_pipe_buf_ops = { .get = generic_pipe_buf_get, }; -static const struct pipe_buf_operations anon_pipe_buf_nomerge_ops = { - .confirm = generic_pipe_buf_confirm, - .release = anon_pipe_buf_release, - .steal = anon_pipe_buf_steal, - .get = generic_pipe_buf_get, -}; - -static const struct pipe_buf_operations packet_pipe_buf_ops = { - .confirm = generic_pipe_buf_confirm, - .release = anon_pipe_buf_release, - .steal = anon_pipe_buf_steal, - .get = generic_pipe_buf_get, -}; - -/** - * pipe_buf_mark_unmergeable - mark a &struct pipe_buffer as unmergeable - * @buf: the buffer to mark - * - * Description: - * This function ensures that no future writes will be merged into the - * given &struct pipe_buffer. This is necessary when multiple pipe buffers - * share the same backing page. 
- */ -void pipe_buf_mark_unmergeable(struct pipe_buffer *buf) -{ - if (buf->ops == &anon_pipe_buf_ops) - buf->ops = &anon_pipe_buf_nomerge_ops; -} - -static bool pipe_buf_can_merge(struct pipe_buffer *buf) -{ - return buf->ops == &anon_pipe_buf_ops; -} - /* Done while waiting without holding the pipe lock - thus the READ_ONCE() */ static inline bool pipe_readable(const struct pipe_inode_info *pipe) { @@ -478,7 +443,8 @@ pipe_write(struct kiocb *iocb, struct iov_iter *from) struct pipe_buffer *buf = &pipe->bufs[(head - 1) & mask]; int offset = buf->offset + buf->len; - if (pipe_buf_can_merge(buf) && offset + chars <= PAGE_SIZE) { + if ((buf->flags & PIPE_BUF_FLAG_CAN_MERGE) && + offset + chars <= PAGE_SIZE) { ret = pipe_buf_confirm(pipe, buf); if (ret) goto out; @@ -541,11 +507,10 @@ pipe_write(struct kiocb *iocb, struct iov_iter *from) buf->ops = &anon_pipe_buf_ops; buf->offset = 0; buf->len = 0; - buf->flags = 0; - if (is_packetized(filp)) { - buf->ops = &packet_pipe_buf_ops; + if (is_packetized(filp)) buf->flags = PIPE_BUF_FLAG_PACKET; - } + else + buf->flags = PIPE_BUF_FLAG_CAN_MERGE; pipe->tmp_page = NULL; copied = copy_page_from_iter(page, 0, PAGE_SIZE, from); diff --git a/fs/splice.c b/fs/splice.c index 88942bf177d1..fb9670e7fc1f 100644 --- a/fs/splice.c +++ b/fs/splice.c @@ -1624,12 +1624,11 @@ retry: *obuf = *ibuf; /* - * Don't inherit the gift flag, we need to + * Don't inherit the gift and merge flags, we need to * prevent multiple steals of this page. */ obuf->flags &= ~PIPE_BUF_FLAG_GIFT; - - pipe_buf_mark_unmergeable(obuf); + obuf->flags &= ~PIPE_BUF_FLAG_CAN_MERGE; obuf->len = len; ibuf->offset += len; @@ -1717,12 +1716,11 @@ static int link_pipe(struct pipe_inode_info *ipipe, *obuf = *ibuf; /* - * Don't inherit the gift flag, we need to - * prevent multiple steals of this page. + * Don't inherit the gift and merge flag, we need to prevent + * multiple steals of this page. */ obuf->flags &= ~PIPE_BUF_FLAG_GIFT; - - pipe_buf_mark_unmergeable(obuf); + obuf->flags &= ~PIPE_BUF_FLAG_CAN_MERGE; if (obuf->len > len) obuf->len = len; diff --git a/include/linux/pipe_fs_i.h b/include/linux/pipe_fs_i.h index ae58fad7f1e0..3f7b07b38824 100644 --- a/include/linux/pipe_fs_i.h +++ b/include/linux/pipe_fs_i.h @@ -8,6 +8,7 @@ #define PIPE_BUF_FLAG_ATOMIC 0x02 /* was atomically mapped */ #define PIPE_BUF_FLAG_GIFT 0x04 /* page is a gift */ #define PIPE_BUF_FLAG_PACKET 0x08 /* read() as a packet */ +#define PIPE_BUF_FLAG_CAN_MERGE 0x10 /* can merge buffers */ /** * struct pipe_buffer - a linux kernel pipe buffer @@ -233,7 +234,6 @@ int generic_pipe_buf_confirm(struct pipe_inode_info *, struct pipe_buffer *); int generic_pipe_buf_steal(struct pipe_inode_info *, struct pipe_buffer *); int generic_pipe_buf_nosteal(struct pipe_inode_info *, struct pipe_buffer *); void generic_pipe_buf_release(struct pipe_inode_info *, struct pipe_buffer *); -void pipe_buf_mark_unmergeable(struct pipe_buffer *buf); extern const struct pipe_buf_operations nosteal_pipe_buf_ops; -- cgit v1.2.3 From 76887c256744740d6121af9bc4aa787712a1f694 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 20 May 2020 17:58:14 +0200 Subject: fs: make the pipe_buf_operations ->steal operation optional Just return 1 for failure if it is not present. 
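This is the usual kernel pattern for optional ops: the inline wrapper supplies the default, so op vectors simply leave the hook NULL. A generic sketch of the shape this patch (and the next one, for ->confirm) gives the pipe wrappers; struct foo, foo_ops and foo_frob() are made-up names for illustration, not kernel APIs:

	struct foo;

	struct foo_ops {
		int (*frob)(struct foo *f);	/* optional hook */
	};

	struct foo {
		const struct foo_ops *ops;
	};

	static inline int foo_frob(struct foo *f)
	{
		if (!f->ops->frob)
			return 1;	/* default result, like the absent ->steal */
		return f->ops->frob(f);
	}
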
Signed-off-by: Christoph Hellwig Signed-off-by: Al Viro --- fs/splice.c | 7 ------- include/linux/pipe_fs_i.h | 3 ++- kernel/trace/trace.c | 1 - net/smc/smc_rx.c | 7 ------- 4 files changed, 2 insertions(+), 16 deletions(-) (limited to 'include/linux') diff --git a/fs/splice.c b/fs/splice.c index fb9670e7fc1f..6c19bda274c8 100644 --- a/fs/splice.c +++ b/fs/splice.c @@ -337,17 +337,10 @@ const struct pipe_buf_operations default_pipe_buf_ops = { .get = generic_pipe_buf_get, }; -int generic_pipe_buf_nosteal(struct pipe_inode_info *pipe, - struct pipe_buffer *buf) -{ - return 1; -} - /* Pipe buffer operations for a socket and similar. */ const struct pipe_buf_operations nosteal_pipe_buf_ops = { .confirm = generic_pipe_buf_confirm, .release = generic_pipe_buf_release, - .steal = generic_pipe_buf_nosteal, .get = generic_pipe_buf_get, }; EXPORT_SYMBOL(nosteal_pipe_buf_ops); diff --git a/include/linux/pipe_fs_i.h b/include/linux/pipe_fs_i.h index 3f7b07b38824..e022b2459301 100644 --- a/include/linux/pipe_fs_i.h +++ b/include/linux/pipe_fs_i.h @@ -206,6 +206,8 @@ static inline int pipe_buf_confirm(struct pipe_inode_info *pipe, static inline int pipe_buf_steal(struct pipe_inode_info *pipe, struct pipe_buffer *buf) { + if (!buf->ops->steal) + return 1; return buf->ops->steal(pipe, buf); } @@ -232,7 +234,6 @@ void free_pipe_info(struct pipe_inode_info *); bool generic_pipe_buf_get(struct pipe_inode_info *, struct pipe_buffer *); int generic_pipe_buf_confirm(struct pipe_inode_info *, struct pipe_buffer *); int generic_pipe_buf_steal(struct pipe_inode_info *, struct pipe_buffer *); -int generic_pipe_buf_nosteal(struct pipe_inode_info *, struct pipe_buffer *); void generic_pipe_buf_release(struct pipe_inode_info *, struct pipe_buffer *); extern const struct pipe_buf_operations nosteal_pipe_buf_ops; diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index bc9783797d27..29fa25cfb6c2 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -7576,7 +7576,6 @@ static bool buffer_pipe_buf_get(struct pipe_inode_info *pipe, static const struct pipe_buf_operations buffer_pipe_buf_ops = { .confirm = generic_pipe_buf_confirm, .release = buffer_pipe_buf_release, - .steal = generic_pipe_buf_nosteal, .get = buffer_pipe_buf_get, }; diff --git a/net/smc/smc_rx.c b/net/smc/smc_rx.c index 39d7b34d06d2..5fe25279702d 100644 --- a/net/smc/smc_rx.c +++ b/net/smc/smc_rx.c @@ -129,16 +129,9 @@ out: sock_put(sk); } -static int smc_rx_pipe_buf_nosteal(struct pipe_inode_info *pipe, - struct pipe_buffer *buf) -{ - return 1; -} - static const struct pipe_buf_operations smc_pipe_ops = { .confirm = generic_pipe_buf_confirm, .release = smc_rx_pipe_buf_release, - .steal = smc_rx_pipe_buf_nosteal, .get = generic_pipe_buf_get }; -- cgit v1.2.3 From b8d9e7f2411b0744df2ec33e80d7698180fef21a Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 20 May 2020 17:58:15 +0200 Subject: fs: make the pipe_buf_operations ->confirm operation optional Just return 0 for success if it is not present. 
Signed-off-by: Christoph Hellwig Signed-off-by: Al Viro --- fs/pipe.c | 17 ----------------- fs/splice.c | 3 --- include/linux/pipe_fs_i.h | 5 +++-- kernel/relay.c | 1 - kernel/trace/trace.c | 1 - net/smc/smc_rx.c | 1 - 6 files changed, 3 insertions(+), 25 deletions(-) (limited to 'include/linux') diff --git a/fs/pipe.c b/fs/pipe.c index 8e52b78b4042..58890897402a 100644 --- a/fs/pipe.c +++ b/fs/pipe.c @@ -200,22 +200,6 @@ bool generic_pipe_buf_get(struct pipe_inode_info *pipe, struct pipe_buffer *buf) } EXPORT_SYMBOL(generic_pipe_buf_get); -/** - * generic_pipe_buf_confirm - verify contents of the pipe buffer - * @info: the pipe that the buffer belongs to - * @buf: the buffer to confirm - * - * Description: - * This function does nothing, because the generic pipe code uses - * pages that are always good when inserted into the pipe. - */ -int generic_pipe_buf_confirm(struct pipe_inode_info *info, - struct pipe_buffer *buf) -{ - return 0; -} -EXPORT_SYMBOL(generic_pipe_buf_confirm); - /** * generic_pipe_buf_release - put a reference to a &struct pipe_buffer * @pipe: the pipe that the buffer belongs to @@ -232,7 +216,6 @@ void generic_pipe_buf_release(struct pipe_inode_info *pipe, EXPORT_SYMBOL(generic_pipe_buf_release); static const struct pipe_buf_operations anon_pipe_buf_ops = { - .confirm = generic_pipe_buf_confirm, .release = anon_pipe_buf_release, .steal = anon_pipe_buf_steal, .get = generic_pipe_buf_get, diff --git a/fs/splice.c b/fs/splice.c index 6c19bda274c8..bc834073cf74 100644 --- a/fs/splice.c +++ b/fs/splice.c @@ -156,7 +156,6 @@ static int user_page_pipe_buf_steal(struct pipe_inode_info *pipe, } static const struct pipe_buf_operations user_page_pipe_buf_ops = { - .confirm = generic_pipe_buf_confirm, .release = page_cache_pipe_buf_release, .steal = user_page_pipe_buf_steal, .get = generic_pipe_buf_get, @@ -331,7 +330,6 @@ ssize_t generic_file_splice_read(struct file *in, loff_t *ppos, EXPORT_SYMBOL(generic_file_splice_read); const struct pipe_buf_operations default_pipe_buf_ops = { - .confirm = generic_pipe_buf_confirm, .release = generic_pipe_buf_release, .steal = generic_pipe_buf_steal, .get = generic_pipe_buf_get, @@ -339,7 +337,6 @@ const struct pipe_buf_operations default_pipe_buf_ops = { /* Pipe buffer operations for a socket and similar. */ const struct pipe_buf_operations nosteal_pipe_buf_ops = { - .confirm = generic_pipe_buf_confirm, .release = generic_pipe_buf_release, .get = generic_pipe_buf_get, }; diff --git a/include/linux/pipe_fs_i.h b/include/linux/pipe_fs_i.h index e022b2459301..7c057daa0931 100644 --- a/include/linux/pipe_fs_i.h +++ b/include/linux/pipe_fs_i.h @@ -82,7 +82,7 @@ struct pipe_buf_operations { * and that the contents are good. If the pages in the pipe belong * to a file system, we may need to wait for IO completion in this * hook. Returns 0 for good, or a negative error value in case of - * error. + * error. If not present all pages are considered good. 
*/ int (*confirm)(struct pipe_inode_info *, struct pipe_buffer *); @@ -195,6 +195,8 @@ static inline void pipe_buf_release(struct pipe_inode_info *pipe, static inline int pipe_buf_confirm(struct pipe_inode_info *pipe, struct pipe_buffer *buf) { + if (!buf->ops->confirm) + return 0; return buf->ops->confirm(pipe, buf); } @@ -232,7 +234,6 @@ void free_pipe_info(struct pipe_inode_info *); /* Generic pipe buffer ops functions */ bool generic_pipe_buf_get(struct pipe_inode_info *, struct pipe_buffer *); -int generic_pipe_buf_confirm(struct pipe_inode_info *, struct pipe_buffer *); int generic_pipe_buf_steal(struct pipe_inode_info *, struct pipe_buffer *); void generic_pipe_buf_release(struct pipe_inode_info *, struct pipe_buffer *); diff --git a/kernel/relay.c b/kernel/relay.c index ade14fb7ce2e..c5ece4c2b04d 100644 --- a/kernel/relay.c +++ b/kernel/relay.c @@ -1177,7 +1177,6 @@ static void relay_pipe_buf_release(struct pipe_inode_info *pipe, } static const struct pipe_buf_operations relay_pipe_buf_ops = { - .confirm = generic_pipe_buf_confirm, .release = relay_pipe_buf_release, .steal = generic_pipe_buf_steal, .get = generic_pipe_buf_get, diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 29fa25cfb6c2..63d1ab978435 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -7574,7 +7574,6 @@ static bool buffer_pipe_buf_get(struct pipe_inode_info *pipe, /* Pipe buffer operations for a buffer. */ static const struct pipe_buf_operations buffer_pipe_buf_ops = { - .confirm = generic_pipe_buf_confirm, .release = buffer_pipe_buf_release, .get = buffer_pipe_buf_get, }; diff --git a/net/smc/smc_rx.c b/net/smc/smc_rx.c index 5fe25279702d..fcfac59f8b72 100644 --- a/net/smc/smc_rx.c +++ b/net/smc/smc_rx.c @@ -130,7 +130,6 @@ out: } static const struct pipe_buf_operations smc_pipe_ops = { - .confirm = generic_pipe_buf_confirm, .release = smc_rx_pipe_buf_release, .get = generic_pipe_buf_get }; -- cgit v1.2.3 From c928f642c29a5ffb02e16f2430b42b876dde69de Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 20 May 2020 17:58:16 +0200 Subject: fs: rename pipe_buf ->steal to ->try_steal And replace the arcane return value convention with a simple bool where true means success and false means failure. 
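The gain shows up at the call sites, where the old 0-means-success convention inverted the natural reading. An illustrative before/after; reuse_page() is a stand-in for whatever a caller does with a stolen page, not a real function:

	/* old: 0 means success, so the test reads backwards */
	if (pipe_buf_steal(pipe, buf) == 0)
		reuse_page(buf->page);

	/* new: the call reads as the predicate it is */
	if (pipe_buf_try_steal(pipe, buf))
		reuse_page(buf->page);
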
[AV: braino fix folded in] Signed-off-by: Christoph Hellwig Signed-off-by: Al Viro --- drivers/char/virtio_console.c | 2 +- fs/fuse/dev.c | 2 +- fs/pipe.c | 34 ++++++++++++++++------------------ fs/splice.c | 40 ++++++++++++++++++++-------------------- include/linux/pipe_fs_i.h | 34 +++++++++++++++++----------------- kernel/relay.c | 6 +++--- 6 files changed, 58 insertions(+), 60 deletions(-) (limited to 'include/linux') diff --git a/drivers/char/virtio_console.c b/drivers/char/virtio_console.c index 3cbaec925606..00c5e3acee46 100644 --- a/drivers/char/virtio_console.c +++ b/drivers/char/virtio_console.c @@ -871,7 +871,7 @@ static int pipe_to_sg(struct pipe_inode_info *pipe, struct pipe_buffer *buf, return 0; /* Try lock this page */ - if (pipe_buf_steal(pipe, buf) == 0) { + if (pipe_buf_try_steal(pipe, buf)) { /* Get reference and unlock page for moving */ get_page(buf->page); unlock_page(buf->page); diff --git a/fs/fuse/dev.c b/fs/fuse/dev.c index 97eec7522bf2..7a2b2de87b1f 100644 --- a/fs/fuse/dev.c +++ b/fs/fuse/dev.c @@ -805,7 +805,7 @@ static int fuse_try_move_page(struct fuse_copy_state *cs, struct page **pagep) if (cs->len != PAGE_SIZE) goto out_fallback; - if (pipe_buf_steal(cs->pipe, buf) != 0) + if (!pipe_buf_try_steal(cs->pipe, buf)) goto out_fallback; newpage = buf->page; diff --git a/fs/pipe.c b/fs/pipe.c index 58890897402a..c7c4fb5f345f 100644 --- a/fs/pipe.c +++ b/fs/pipe.c @@ -140,21 +140,20 @@ static void anon_pipe_buf_release(struct pipe_inode_info *pipe, put_page(page); } -static int anon_pipe_buf_steal(struct pipe_inode_info *pipe, - struct pipe_buffer *buf) +static bool anon_pipe_buf_try_steal(struct pipe_inode_info *pipe, + struct pipe_buffer *buf) { struct page *page = buf->page; - if (page_count(page) == 1) { - memcg_kmem_uncharge_page(page, 0); - __SetPageLocked(page); - return 0; - } - return 1; + if (page_count(page) != 1) + return false; + memcg_kmem_uncharge_page(page, 0); + __SetPageLocked(page); + return true; } /** - * generic_pipe_buf_steal - attempt to take ownership of a &pipe_buffer + * generic_pipe_buf_try_steal - attempt to take ownership of a &pipe_buffer * @pipe: the pipe that the buffer belongs to * @buf: the buffer to attempt to steal * @@ -165,8 +164,8 @@ static int anon_pipe_buf_steal(struct pipe_inode_info *pipe, * he wishes; the typical use is insertion into a different file * page cache. 
*/ -int generic_pipe_buf_steal(struct pipe_inode_info *pipe, - struct pipe_buffer *buf) +bool generic_pipe_buf_try_steal(struct pipe_inode_info *pipe, + struct pipe_buffer *buf) { struct page *page = buf->page; @@ -177,12 +176,11 @@ int generic_pipe_buf_steal(struct pipe_inode_info *pipe, */ if (page_count(page) == 1) { lock_page(page); - return 0; + return true; } - - return 1; + return false; } -EXPORT_SYMBOL(generic_pipe_buf_steal); +EXPORT_SYMBOL(generic_pipe_buf_try_steal); /** * generic_pipe_buf_get - get a reference to a &struct pipe_buffer @@ -216,9 +214,9 @@ void generic_pipe_buf_release(struct pipe_inode_info *pipe, EXPORT_SYMBOL(generic_pipe_buf_release); static const struct pipe_buf_operations anon_pipe_buf_ops = { - .release = anon_pipe_buf_release, - .steal = anon_pipe_buf_steal, - .get = generic_pipe_buf_get, + .release = anon_pipe_buf_release, + .try_steal = anon_pipe_buf_try_steal, + .get = generic_pipe_buf_get, }; /* Done while waiting without holding the pipe lock - thus the READ_ONCE() */ diff --git a/fs/splice.c b/fs/splice.c index bc834073cf74..2c2f45a9afc0 100644 --- a/fs/splice.c +++ b/fs/splice.c @@ -44,8 +44,8 @@ * addition of remove_mapping(). If success is returned, the caller may * attempt to reuse this page for another destination. */ -static int page_cache_pipe_buf_steal(struct pipe_inode_info *pipe, - struct pipe_buffer *buf) +static bool page_cache_pipe_buf_try_steal(struct pipe_inode_info *pipe, + struct pipe_buffer *buf) { struct page *page = buf->page; struct address_space *mapping; @@ -76,7 +76,7 @@ static int page_cache_pipe_buf_steal(struct pipe_inode_info *pipe, */ if (remove_mapping(mapping, page)) { buf->flags |= PIPE_BUF_FLAG_LRU; - return 0; + return true; } } @@ -86,7 +86,7 @@ static int page_cache_pipe_buf_steal(struct pipe_inode_info *pipe, */ out_unlock: unlock_page(page); - return 1; + return false; } static void page_cache_pipe_buf_release(struct pipe_inode_info *pipe, @@ -139,26 +139,26 @@ error: } const struct pipe_buf_operations page_cache_pipe_buf_ops = { - .confirm = page_cache_pipe_buf_confirm, - .release = page_cache_pipe_buf_release, - .steal = page_cache_pipe_buf_steal, - .get = generic_pipe_buf_get, + .confirm = page_cache_pipe_buf_confirm, + .release = page_cache_pipe_buf_release, + .try_steal = page_cache_pipe_buf_try_steal, + .get = generic_pipe_buf_get, }; -static int user_page_pipe_buf_steal(struct pipe_inode_info *pipe, - struct pipe_buffer *buf) +static bool user_page_pipe_buf_try_steal(struct pipe_inode_info *pipe, + struct pipe_buffer *buf) { if (!(buf->flags & PIPE_BUF_FLAG_GIFT)) - return 1; + return false; buf->flags |= PIPE_BUF_FLAG_LRU; - return generic_pipe_buf_steal(pipe, buf); + return generic_pipe_buf_try_steal(pipe, buf); } static const struct pipe_buf_operations user_page_pipe_buf_ops = { - .release = page_cache_pipe_buf_release, - .steal = user_page_pipe_buf_steal, - .get = generic_pipe_buf_get, + .release = page_cache_pipe_buf_release, + .try_steal = user_page_pipe_buf_try_steal, + .get = generic_pipe_buf_get, }; static void wakeup_pipe_readers(struct pipe_inode_info *pipe) @@ -330,15 +330,15 @@ ssize_t generic_file_splice_read(struct file *in, loff_t *ppos, EXPORT_SYMBOL(generic_file_splice_read); const struct pipe_buf_operations default_pipe_buf_ops = { - .release = generic_pipe_buf_release, - .steal = generic_pipe_buf_steal, - .get = generic_pipe_buf_get, + .release = generic_pipe_buf_release, + .try_steal = generic_pipe_buf_try_steal, + .get = generic_pipe_buf_get, }; /* Pipe buffer operations for a 
socket and similar. */ const struct pipe_buf_operations nosteal_pipe_buf_ops = { - .release = generic_pipe_buf_release, - .get = generic_pipe_buf_get, + .release = generic_pipe_buf_release, + .get = generic_pipe_buf_get, }; EXPORT_SYMBOL(nosteal_pipe_buf_ops); diff --git a/include/linux/pipe_fs_i.h b/include/linux/pipe_fs_i.h index 7c057daa0931..0c31b9461262 100644 --- a/include/linux/pipe_fs_i.h +++ b/include/linux/pipe_fs_i.h @@ -70,11 +70,11 @@ struct pipe_inode_info { * Note on the nesting of these functions: * * ->confirm() - * ->steal() + * ->try_steal() * - * That is, ->steal() must be called on a confirmed buffer. - * See below for the meaning of each operation. Also see kerneldoc - * in fs/pipe.c for the pipe and generic variants of these hooks. + * That is, ->try_steal() must be called on a confirmed buffer. See below for + * the meaning of each operation. Also see the kerneldoc in fs/pipe.c for the + * pipe and generic variants of these hooks. */ struct pipe_buf_operations { /* @@ -94,13 +94,13 @@ struct pipe_buf_operations { /* * Attempt to take ownership of the pipe buffer and its contents. - * ->steal() returns 0 for success, in which case the contents - * of the pipe (the buf->page) is locked and now completely owned - * by the caller. The page may then be transferred to a different - * mapping, the most often used case is insertion into different - * file address space cache. + * ->try_steal() returns %true for success, in which case the contents + * of the pipe (the buf->page) is locked and now completely owned by the + * caller. The page may then be transferred to a different mapping, the + * most often used case is insertion into different file address space + * cache. */ - int (*steal)(struct pipe_inode_info *, struct pipe_buffer *); + bool (*try_steal)(struct pipe_inode_info *, struct pipe_buffer *); /* * Get a reference to the pipe buffer. 
@@ -201,16 +201,16 @@ static inline int pipe_buf_confirm(struct pipe_inode_info *pipe, } /** - * pipe_buf_steal - attempt to take ownership of a pipe_buffer + * pipe_buf_try_steal - attempt to take ownership of a pipe_buffer * @pipe: the pipe that the buffer belongs to * @buf: the buffer to attempt to steal */ -static inline int pipe_buf_steal(struct pipe_inode_info *pipe, - struct pipe_buffer *buf) +static inline bool pipe_buf_try_steal(struct pipe_inode_info *pipe, + struct pipe_buffer *buf) { - if (!buf->ops->steal) - return 1; - return buf->ops->steal(pipe, buf); + if (!buf->ops->try_steal) + return false; + return buf->ops->try_steal(pipe, buf); } /* Differs from PIPE_BUF in that PIPE_SIZE is the length of the actual @@ -234,7 +234,7 @@ void free_pipe_info(struct pipe_inode_info *); /* Generic pipe buffer ops functions */ bool generic_pipe_buf_get(struct pipe_inode_info *, struct pipe_buffer *); -int generic_pipe_buf_steal(struct pipe_inode_info *, struct pipe_buffer *); +bool generic_pipe_buf_try_steal(struct pipe_inode_info *, struct pipe_buffer *); void generic_pipe_buf_release(struct pipe_inode_info *, struct pipe_buffer *); extern const struct pipe_buf_operations nosteal_pipe_buf_ops; diff --git a/kernel/relay.c b/kernel/relay.c index c5ece4c2b04d..5c2263f3f857 100644 --- a/kernel/relay.c +++ b/kernel/relay.c @@ -1177,9 +1177,9 @@ static void relay_pipe_buf_release(struct pipe_inode_info *pipe, } static const struct pipe_buf_operations relay_pipe_buf_ops = { - .release = relay_pipe_buf_release, - .steal = generic_pipe_buf_steal, - .get = generic_pipe_buf_get, + .release = relay_pipe_buf_release, + .try_steal = generic_pipe_buf_try_steal, + .get = generic_pipe_buf_get, }; static void relay_page_release(struct splice_pipe_desc *spd, unsigned int i) -- cgit v1.2.3 From 9b47c5275614a16fd64359fab73fe6c736bf57a0 Mon Sep 17 00:00:00 2001 From: Arvind Sankar Date: Mon, 18 May 2020 15:07:10 -0400 Subject: efi/libstub: Add definitions for console input and events Add the required typedefs etc for using con_in's simple text input protocol, and for using the boottime event services. Also add the prototype for the "stall" boot service. 
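As a rough illustration (not part of this patch), reading one keystroke through the new definitions looks like this; it assumes the stub's existing efi_table_attr()/efi_call_proto() accessors and the efi_system_table pointer, and the function name is made up for the example:

	static efi_status_t example_read_key(efi_input_key_t *key)
	{
		efi_simple_text_input_protocol_t *con_in;

		con_in = efi_table_attr(efi_system_table, con_in);
		if (!con_in)
			return EFI_UNSUPPORTED;

		/* The firmware returns an error status while no key is pending. */
		return efi_call_proto(con_in, read_keystroke, key);
	}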
Signed-off-by: Arvind Sankar Link: https://lore.kernel.org/r/20200518190716.751506-19-nivedita@alum.mit.edu Signed-off-by: Ard Biesheuvel --- arch/x86/include/asm/efi.h | 10 +++++ arch/x86/xen/efi.c | 2 +- drivers/firmware/efi/libstub/efistub.h | 77 +++++++++++++++++++++++++++++++--- include/linux/efi.h | 3 +- 4 files changed, 85 insertions(+), 7 deletions(-) (limited to 'include/linux') diff --git a/arch/x86/include/asm/efi.h b/arch/x86/include/asm/efi.h index 6b9ab0d8b2a7..89dcc7aa7e2c 100644 --- a/arch/x86/include/asm/efi.h +++ b/arch/x86/include/asm/efi.h @@ -9,6 +9,7 @@ #include #include #include +#include extern unsigned long efi_fw_vendor, efi_config_table; @@ -293,6 +294,15 @@ static inline u32 efi64_convert_status(efi_status_t status) #define __efi64_argmap_allocate_pool(type, size, buffer) \ ((type), (size), efi64_zero_upper(buffer)) +#define __efi64_argmap_create_event(type, tpl, f, c, event) \ + ((type), (tpl), (f), (c), efi64_zero_upper(event)) + +#define __efi64_argmap_set_timer(event, type, time) \ + ((event), (type), lower_32_bits(time), upper_32_bits(time)) + +#define __efi64_argmap_wait_for_event(num, event, index) \ + ((num), (event), efi64_zero_upper(index)) + #define __efi64_argmap_handle_protocol(handle, protocol, interface) \ ((handle), (protocol), efi64_zero_upper(interface)) diff --git a/arch/x86/xen/efi.c b/arch/x86/xen/efi.c index 1abe455d926a..205a9bc981b0 100644 --- a/arch/x86/xen/efi.c +++ b/arch/x86/xen/efi.c @@ -29,7 +29,7 @@ static efi_system_table_t efi_systab_xen __initdata = { .fw_vendor = EFI_INVALID_TABLE_ADDR, /* Initialized later. */ .fw_revision = 0, /* Initialized later. */ .con_in_handle = EFI_INVALID_TABLE_ADDR, /* Not used under Xen. */ - .con_in = EFI_INVALID_TABLE_ADDR, /* Not used under Xen. */ + .con_in = NULL, /* Not used under Xen. */ .con_out_handle = EFI_INVALID_TABLE_ADDR, /* Not used under Xen. */ .con_out = NULL, /* Not used under Xen. */ .stderr_handle = EFI_INVALID_TABLE_ADDR, /* Not used under Xen. */ diff --git a/drivers/firmware/efi/libstub/efistub.h b/drivers/firmware/efi/libstub/efistub.h index 3a323a009836..c7c03099367f 100644 --- a/drivers/firmware/efi/libstub/efistub.h +++ b/drivers/firmware/efi/libstub/efistub.h @@ -111,6 +111,16 @@ void efi_set_u64_split(u64 data, u32 *lo, u32 *hi) #define EFI_LOCATE_BY_REGISTER_NOTIFY 1 #define EFI_LOCATE_BY_PROTOCOL 2 +/* + * boottime->stall takes the time period in microseconds + */ +#define EFI_USEC_PER_SEC 1000000 + +/* + * boottime->set_timer takes the time in 100ns units + */ +#define EFI_100NSEC_PER_USEC ((u64)10) + struct efi_boot_memmap { efi_memory_desc_t **map; unsigned long *map_size; @@ -122,6 +132,39 @@ struct efi_boot_memmap { typedef struct efi_generic_dev_path efi_device_path_protocol_t; +typedef void *efi_event_t; +/* Note that notifications won't work in mixed mode */ +typedef void (__efiapi *efi_event_notify_t)(efi_event_t, void *); + +#define EFI_EVT_TIMER 0x80000000U +#define EFI_EVT_RUNTIME 0x40000000U +#define EFI_EVT_NOTIFY_WAIT 0x00000100U +#define EFI_EVT_NOTIFY_SIGNAL 0x00000200U + +/* + * boottime->wait_for_event takes an array of events as input. + * Provide a helper to set it up correctly for mixed mode. 
+ */ +static inline +void efi_set_event_at(efi_event_t *events, size_t idx, efi_event_t event) +{ + if (efi_is_native()) + events[idx] = event; + else + ((u32 *)events)[idx] = (u32)(unsigned long)event; +} + +#define EFI_TPL_APPLICATION 4 +#define EFI_TPL_CALLBACK 8 +#define EFI_TPL_NOTIFY 16 +#define EFI_TPL_HIGH_LEVEL 31 + +typedef enum { + EfiTimerCancel, + EfiTimerPeriodic, + EfiTimerRelative +} EFI_TIMER_DELAY; + /* * EFI Boot Services table */ @@ -140,11 +183,16 @@ union efi_boot_services { efi_status_t (__efiapi *allocate_pool)(int, unsigned long, void **); efi_status_t (__efiapi *free_pool)(void *); - void *create_event; - void *set_timer; - void *wait_for_event; + efi_status_t (__efiapi *create_event)(u32, unsigned long, + efi_event_notify_t, void *, + efi_event_t *); + efi_status_t (__efiapi *set_timer)(efi_event_t, + EFI_TIMER_DELAY, u64); + efi_status_t (__efiapi *wait_for_event)(unsigned long, + efi_event_t *, + unsigned long *); void *signal_event; - void *close_event; + efi_status_t (__efiapi *close_event)(efi_event_t); void *check_event; void *install_protocol_interface; void *reinstall_protocol_interface; @@ -171,7 +219,7 @@ union efi_boot_services { efi_status_t (__efiapi *exit_boot_services)(efi_handle_t, unsigned long); void *get_next_monotonic_count; - void *stall; + efi_status_t (__efiapi *stall)(unsigned long); void *set_watchdog_timer; void *connect_controller; efi_status_t (__efiapi *disconnect_controller)(efi_handle_t, @@ -256,6 +304,25 @@ union efi_uga_draw_protocol { } mixed_mode; }; +typedef struct { + u16 scan_code; + efi_char16_t unicode_char; +} efi_input_key_t; + +union efi_simple_text_input_protocol { + struct { + void *reset; + efi_status_t (__efiapi *read_keystroke)(efi_simple_text_input_protocol_t *, + efi_input_key_t *); + efi_event_t wait_for_key; + }; + struct { + u32 reset; + u32 read_keystroke; + u32 wait_for_key; + } mixed_mode; +}; + union efi_simple_text_output_protocol { struct { void *reset; diff --git a/include/linux/efi.h b/include/linux/efi.h index 9b7c7ec319ac..974648db0c68 100644 --- a/include/linux/efi.h +++ b/include/linux/efi.h @@ -426,6 +426,7 @@ typedef struct { u32 tables; } efi_system_table_32_t; +typedef union efi_simple_text_input_protocol efi_simple_text_input_protocol_t; typedef union efi_simple_text_output_protocol efi_simple_text_output_protocol_t; typedef union { @@ -434,7 +435,7 @@ typedef union { unsigned long fw_vendor; /* physical addr of CHAR16 vendor string */ u32 fw_revision; unsigned long con_in_handle; - unsigned long con_in; + efi_simple_text_input_protocol_t *con_in; unsigned long con_out_handle; efi_simple_text_output_protocol_t *con_out; unsigned long stderr_handle; -- cgit v1.2.3 From 14c574f35cfbc9272fc67b41f074c847db139652 Mon Sep 17 00:00:00 2001 From: Arvind Sankar Date: Mon, 18 May 2020 15:07:11 -0400 Subject: efi/gop: Add an option to list out the available GOP modes Add video=efifb:list option to list the modes that are available. 
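As a hedged usage sketch, booting with

	video=efifb:list

on the kernel command line makes the stub print the mode table and wait briefly for a key; one of the pre-existing selectors (for example a mode number or an <xres>x<yres> resolution, as documented in the efifb.rst hunk below) can then be used on the next boot.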
Signed-off-by: Arvind Sankar Link: https://lore.kernel.org/r/20200518190716.751506-20-nivedita@alum.mit.edu Signed-off-by: Ard Biesheuvel --- Documentation/fb/efifb.rst | 5 ++ drivers/firmware/efi/libstub/efi-stub-helper.c | 35 ++++++++++ drivers/firmware/efi/libstub/efistub.h | 2 + drivers/firmware/efi/libstub/gop.c | 97 +++++++++++++++++++++++++- include/linux/efi.h | 1 + 5 files changed, 139 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/Documentation/fb/efifb.rst b/Documentation/fb/efifb.rst index 519550517fd4..6badff64756f 100644 --- a/Documentation/fb/efifb.rst +++ b/Documentation/fb/efifb.rst @@ -63,4 +63,9 @@ auto with the highest resolution, it will choose one with the highest color depth. +list + The EFI stub will list out all the display modes that are available. A + specific mode can then be chosen using one of the above options for the + next boot. + Edgar Hucek diff --git a/drivers/firmware/efi/libstub/efi-stub-helper.c b/drivers/firmware/efi/libstub/efi-stub-helper.c index 1f5a00b4f201..f338d149aaa5 100644 --- a/drivers/firmware/efi/libstub/efi-stub-helper.c +++ b/drivers/firmware/efi/libstub/efi-stub-helper.c @@ -463,3 +463,38 @@ efi_status_t efi_load_initrd(efi_loaded_image_t *image, return status; } + +efi_status_t efi_wait_for_key(unsigned long usec, efi_input_key_t *key) +{ + efi_event_t events[2], timer; + unsigned long index; + efi_simple_text_input_protocol_t *con_in; + efi_status_t status; + + con_in = efi_table_attr(efi_system_table, con_in); + if (!con_in) + return EFI_UNSUPPORTED; + efi_set_event_at(events, 0, efi_table_attr(con_in, wait_for_key)); + + status = efi_bs_call(create_event, EFI_EVT_TIMER, 0, NULL, NULL, &timer); + if (status != EFI_SUCCESS) + return status; + + status = efi_bs_call(set_timer, timer, EfiTimerRelative, + EFI_100NSEC_PER_USEC * usec); + if (status != EFI_SUCCESS) + return status; + efi_set_event_at(events, 1, timer); + + status = efi_bs_call(wait_for_event, 2, events, &index); + if (status == EFI_SUCCESS) { + if (index == 0) + status = efi_call_proto(con_in, read_keystroke, key); + else + status = EFI_TIMEOUT; + } + + efi_bs_call(close_event, timer); + + return status; +} diff --git a/drivers/firmware/efi/libstub/efistub.h b/drivers/firmware/efi/libstub/efistub.h index c7c03099367f..ad7e0406d0ba 100644 --- a/drivers/firmware/efi/libstub/efistub.h +++ b/drivers/firmware/efi/libstub/efistub.h @@ -323,6 +323,8 @@ union efi_simple_text_input_protocol { } mixed_mode; }; +efi_status_t efi_wait_for_key(unsigned long usec, efi_input_key_t *key); + union efi_simple_text_output_protocol { struct { void *reset; diff --git a/drivers/firmware/efi/libstub/gop.c b/drivers/firmware/efi/libstub/gop.c index 34c0cba2c8bf..ea5da307d542 100644 --- a/drivers/firmware/efi/libstub/gop.c +++ b/drivers/firmware/efi/libstub/gop.c @@ -19,7 +19,8 @@ enum efi_cmdline_option { EFI_CMDLINE_NONE, EFI_CMDLINE_MODE_NUM, EFI_CMDLINE_RES, - EFI_CMDLINE_AUTO + EFI_CMDLINE_AUTO, + EFI_CMDLINE_LIST }; static struct { @@ -100,6 +101,19 @@ static bool parse_auto(char *option, char **next) return true; } +static bool parse_list(char *option, char **next) +{ + if (!strstarts(option, "list")) + return false; + option += strlen("list"); + if (*option && *option++ != ',') + return false; + cmdline.option = EFI_CMDLINE_LIST; + + *next = option; + return true; +} + void efi_parse_option_graphics(char *option) { while (*option) { @@ -109,6 +123,8 @@ void efi_parse_option_graphics(char *option) continue; if (parse_auto(option, &option)) continue; + if 
(parse_list(option, &option)) + continue; while (*option && *option++ != ',') ; @@ -290,6 +306,82 @@ static u32 choose_mode_auto(efi_graphics_output_protocol_t *gop) return best_mode; } +static u32 choose_mode_list(efi_graphics_output_protocol_t *gop) +{ + efi_status_t status; + + efi_graphics_output_protocol_mode_t *mode; + efi_graphics_output_mode_info_t *info; + unsigned long info_size; + + u32 max_mode, cur_mode; + int pf; + efi_pixel_bitmask_t pi; + u32 m, w, h; + u8 d; + const char *dstr; + bool valid; + efi_input_key_t key; + + mode = efi_table_attr(gop, mode); + + cur_mode = efi_table_attr(mode, mode); + max_mode = efi_table_attr(mode, max_mode); + + efi_printk("Available graphics modes are 0-%u\n", max_mode-1); + efi_puts(" * = current mode\n" + " - = unusable mode\n"); + for (m = 0; m < max_mode; m++) { + status = efi_call_proto(gop, query_mode, m, + &info_size, &info); + if (status != EFI_SUCCESS) + continue; + + pf = info->pixel_format; + pi = info->pixel_information; + w = info->horizontal_resolution; + h = info->vertical_resolution; + + efi_bs_call(free_pool, info); + + valid = !(pf == PIXEL_BLT_ONLY || pf >= PIXEL_FORMAT_MAX); + d = 0; + switch (pf) { + case PIXEL_RGB_RESERVED_8BIT_PER_COLOR: + dstr = "rgb"; + break; + case PIXEL_BGR_RESERVED_8BIT_PER_COLOR: + dstr = "bgr"; + break; + case PIXEL_BIT_MASK: + dstr = ""; + d = pixel_bpp(pf, pi); + break; + case PIXEL_BLT_ONLY: + dstr = "blt"; + break; + default: + dstr = "xxx"; + break; + } + + efi_printk("Mode %3u %c%c: Resolution %ux%u-%s%.0hhu\n", + m, + m == cur_mode ? '*' : ' ', + !valid ? '-' : ' ', + w, h, dstr, d); + } + + efi_puts("\nPress any key to continue (or wait 10 seconds)\n"); + status = efi_wait_for_key(10 * EFI_USEC_PER_SEC, &key); + if (status != EFI_SUCCESS && status != EFI_TIMEOUT) { + efi_err("Unable to read key, continuing in 10 seconds\n"); + efi_bs_call(stall, 10 * EFI_USEC_PER_SEC); + } + + return cur_mode; +} + static void set_mode(efi_graphics_output_protocol_t *gop) { efi_graphics_output_protocol_mode_t *mode; @@ -305,6 +397,9 @@ static void set_mode(efi_graphics_output_protocol_t *gop) case EFI_CMDLINE_AUTO: new_mode = choose_mode_auto(gop); break; + case EFI_CMDLINE_LIST: + new_mode = choose_mode_list(gop); + break; default: return; } diff --git a/include/linux/efi.h b/include/linux/efi.h index 974648db0c68..609201bd4682 100644 --- a/include/linux/efi.h +++ b/include/linux/efi.h @@ -39,6 +39,7 @@ #define EFI_WRITE_PROTECTED ( 8 | (1UL << (BITS_PER_LONG-1))) #define EFI_OUT_OF_RESOURCES ( 9 | (1UL << (BITS_PER_LONG-1))) #define EFI_NOT_FOUND (14 | (1UL << (BITS_PER_LONG-1))) +#define EFI_TIMEOUT (18 | (1UL << (BITS_PER_LONG-1))) #define EFI_ABORTED (21 | (1UL << (BITS_PER_LONG-1))) #define EFI_SECURITY_VIOLATION (26 | (1UL << (BITS_PER_LONG-1))) -- cgit v1.2.3 From 15c704ab6244ac95be54b2c05411b70501d50e8f Mon Sep 17 00:00:00 2001 From: Sudeep Holla Date: Mon, 18 May 2020 10:12:17 +0100 Subject: firmware: smccc: Update link to latest SMCCC specification The current link gets redirected to the revision B published in November 2016 though it actually points to the original revision A published in June 2013. Let us update the link to point to the latest version, so that it doesn't get stale anytime soon. Currently it points to v1.2 published in March 2020(i.e. DEN0028C). 
Signed-off-by: Sudeep Holla Tested-by: Etienne Carriere Reviewed-by: Steven Price Reviewed-by: Etienne Carriere Acked-by: Mark Rutland Link: https://lore.kernel.org/r/20200518091222.27467-3-sudeep.holla@arm.com Signed-off-by: Will Deacon --- include/linux/arm-smccc.h | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/arm-smccc.h b/include/linux/arm-smccc.h index 59494df0f55b..31b15db9685d 100644 --- a/include/linux/arm-smccc.h +++ b/include/linux/arm-smccc.h @@ -10,7 +10,9 @@ /* * This file provides common defines for ARM SMC Calling Convention as * specified in - * http://infocenter.arm.com/help/topic/com.arm.doc.den0028a/index.html + * https://developer.arm.com/docs/den0028/latest + * + * This code is up-to-date with version DEN 0028 B */ #define ARM_SMCCC_STD_CALL _AC(0,U) -- cgit v1.2.3 From 0441bfe7f00acaae7c4937ba0ca48ee1de9b709f Mon Sep 17 00:00:00 2001 From: Sudeep Holla Date: Mon, 18 May 2020 10:12:18 +0100 Subject: firmware: smccc: Add the definition for SMCCCv1.2 version/error codes Add the definition for the SMCCC v1.2 version and the new error code. While at it, also add a note that ARM DEN 0070A is deprecated and is now merged into the main SMCCC specification (ARM DEN 0028C). Signed-off-by: Sudeep Holla Tested-by: Etienne Carriere Reviewed-by: Steven Price Reviewed-by: Etienne Carriere Link: https://lore.kernel.org/r/20200518091222.27467-4-sudeep.holla@arm.com Signed-off-by: Will Deacon --- include/linux/arm-smccc.h | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/arm-smccc.h b/include/linux/arm-smccc.h index 31b15db9685d..c3784ba8e2a4 100644 --- a/include/linux/arm-smccc.h +++ b/include/linux/arm-smccc.h @@ -12,7 +12,7 @@ * specified in * https://developer.arm.com/docs/den0028/latest * - * This code is up-to-date with version DEN 0028 B + * This code is up-to-date with version DEN 0028 C */ #define ARM_SMCCC_STD_CALL _AC(0,U) @@ -58,6 +58,7 @@ #define ARM_SMCCC_VERSION_1_0 0x10000 #define ARM_SMCCC_VERSION_1_1 0x10001 +#define ARM_SMCCC_VERSION_1_2 0x10002 #define ARM_SMCCC_VERSION_FUNC_ID \ ARM_SMCCC_CALL_VAL(ARM_SMCCC_FAST_CALL, \ @@ -316,10 +317,14 @@ asmlinkage void __arm_smccc_hvc(unsigned long a0, unsigned long a1, */ #define arm_smccc_1_1_hvc(...) __arm_smccc_1_1(SMCCC_HVC_INST, __VA_ARGS__) -/* Return codes defined in ARM DEN 0070A */ +/* + * Return codes defined in ARM DEN 0070A + * ARM DEN 0070A is now merged/consolidated into ARM DEN 0028 C + */ #define SMCCC_RET_SUCCESS 0 #define SMCCC_RET_NOT_SUPPORTED -1 #define SMCCC_RET_NOT_REQUIRED -2 +#define SMCCC_RET_INVALID_PARAMETER -3 /* * Like arm_smccc_1_1* but always returns SMCCC_RET_NOT_SUPPORTED. -- cgit v1.2.3 From ad5a57dfe434b02ab28852703d7ad5510998ccef Mon Sep 17 00:00:00 2001 From: Sudeep Holla Date: Mon, 18 May 2020 10:12:19 +0100 Subject: firmware: smccc: Drop smccc_version enum and use ARM_SMCCC_VERSION_1_x instead Instead of maintaining 2 sets of enums/macros for tracking SMCCC version, let us drop smccc_version enum and use ARM_SMCCC_VERSION_1_x directly instead. This is in preparation to drop smccc_version here and move it separately under drivers/firmware/smccc.
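The caller-side conversion is mechanical; a condensed sketch mirroring the paravirt.c hunk below:

	/* Before: private enum, only meaningful while psci_ops carried it */
	if (psci_ops.smccc_version < SMCCC_VERSION_1_1)
		return false;

	/* After: the conduit accessor already implies SMCCC v1.1+ */
	if (arm_smccc_1_1_get_conduit() == SMCCC_CONDUIT_NONE)
		return false;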
Signed-off-by: Sudeep Holla Tested-by: Etienne Carriere Reviewed-by: Steven Price Reviewed-by: Etienne Carriere Acked-by: Mark Rutland Link: https://lore.kernel.org/r/20200518091222.27467-5-sudeep.holla@arm.com Signed-off-by: Will Deacon --- arch/arm64/kernel/paravirt.c | 2 +- drivers/firmware/psci/psci.c | 8 ++++---- include/linux/psci.h | 7 +------ 3 files changed, 6 insertions(+), 11 deletions(-) (limited to 'include/linux') diff --git a/arch/arm64/kernel/paravirt.c b/arch/arm64/kernel/paravirt.c index 1ef702b0be2d..295d66490584 100644 --- a/arch/arm64/kernel/paravirt.c +++ b/arch/arm64/kernel/paravirt.c @@ -120,7 +120,7 @@ static bool has_pv_steal_clock(void) struct arm_smccc_res res; /* To detect the presence of PV time support we require SMCCC 1.1+ */ - if (psci_ops.smccc_version < SMCCC_VERSION_1_1) + if (arm_smccc_1_1_get_conduit() == SMCCC_CONDUIT_NONE) return false; arm_smccc_1_1_invoke(ARM_SMCCC_ARCH_FEATURES_FUNC_ID, diff --git a/drivers/firmware/psci/psci.c b/drivers/firmware/psci/psci.c index 2937d44b5df4..6a56d7196697 100644 --- a/drivers/firmware/psci/psci.c +++ b/drivers/firmware/psci/psci.c @@ -54,12 +54,12 @@ bool psci_tos_resident_on(int cpu) struct psci_operations psci_ops = { .conduit = SMCCC_CONDUIT_NONE, - .smccc_version = SMCCC_VERSION_1_0, + .smccc_version = ARM_SMCCC_VERSION_1_0, }; enum arm_smccc_conduit arm_smccc_1_1_get_conduit(void) { - if (psci_ops.smccc_version < SMCCC_VERSION_1_1) + if (psci_ops.smccc_version < ARM_SMCCC_VERSION_1_1) return SMCCC_CONDUIT_NONE; return psci_ops.conduit; @@ -411,8 +411,8 @@ static void __init psci_init_smccc(void) if (feature != PSCI_RET_NOT_SUPPORTED) { u32 ret; ret = invoke_psci_fn(ARM_SMCCC_VERSION_FUNC_ID, 0, 0, 0); - if (ret == ARM_SMCCC_VERSION_1_1) { - psci_ops.smccc_version = SMCCC_VERSION_1_1; + if (ret >= ARM_SMCCC_VERSION_1_1) { + psci_ops.smccc_version = ret; ver = ret; } } diff --git a/include/linux/psci.h b/include/linux/psci.h index a67712b73b6c..29bd0671e5bb 100644 --- a/include/linux/psci.h +++ b/include/linux/psci.h @@ -21,11 +21,6 @@ bool psci_power_state_is_valid(u32 state); int psci_set_osi_mode(void); bool psci_has_osi_support(void); -enum smccc_version { - SMCCC_VERSION_1_0, - SMCCC_VERSION_1_1, -}; - struct psci_operations { u32 (*get_version)(void); int (*cpu_suspend)(u32 state, unsigned long entry_point); @@ -36,7 +31,7 @@ struct psci_operations { unsigned long lowest_affinity_level); int (*migrate_info_type)(void); enum arm_smccc_conduit conduit; - enum smccc_version smccc_version; + u32 smccc_version; }; extern struct psci_operations psci_ops; -- cgit v1.2.3 From f2ae97062a48b114bcf8fb2e99574d9ed2c2cd1b Mon Sep 17 00:00:00 2001 From: Sudeep Holla Date: Mon, 18 May 2020 10:12:20 +0100 Subject: firmware: smccc: Refactor SMCCC specific bits into separate file In order to add newer SMCCC v1.1+ functionality and to avoid cluttering PSCI firmware driver with SMCCC bits, let us move the SMCCC specific details under drivers/firmware/smccc/smccc.c We can also drop conduit and smccc_version from psci_operations structure as SMCCC was the sole user and now it maintains those. No functionality change in this patch though. 
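For orientation, the resulting registration flow reduces to the following condensed sketch (taken from the hunks below, with error handling elided):

	/* PSCI probes the SMCCC version once and hands it to the new core ... */
	ret = invoke_psci_fn(ARM_SMCCC_VERSION_FUNC_ID, 0, 0, 0);
	if (ret >= ARM_SMCCC_VERSION_1_1)
		arm_smccc_version_init(ret, psci_conduit);

	/*
	 * ... and drivers/firmware/smccc/smccc.c, not psci_ops, answers
	 * arm_smccc_1_1_get_conduit() from then on.
	 */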
Signed-off-by: Sudeep Holla Tested-by: Etienne Carriere Reviewed-by: Etienne Carriere Acked-by: Mark Rutland Link: https://lore.kernel.org/r/20200518091222.27467-6-sudeep.holla@arm.com Signed-off-by: Will Deacon --- MAINTAINERS | 9 +++++++++ drivers/firmware/Makefile | 3 ++- drivers/firmware/psci/psci.c | 20 +++++--------------- drivers/firmware/smccc/Makefile | 3 +++ drivers/firmware/smccc/smccc.c | 26 ++++++++++++++++++++++++++ include/linux/psci.h | 2 -- 6 files changed, 45 insertions(+), 18 deletions(-) create mode 100644 drivers/firmware/smccc/Makefile create mode 100644 drivers/firmware/smccc/smccc.c (limited to 'include/linux') diff --git a/MAINTAINERS b/MAINTAINERS index 26f281d9f32a..cb993caf5cd2 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -15474,6 +15474,15 @@ M: Nicolas Pitre S: Odd Fixes F: drivers/net/ethernet/smsc/smc91x.* +SECURE MONITOR CALL(SMC) CALLING CONVENTION (SMCCC) +M: Mark Rutland +M: Lorenzo Pieralisi +M: Sudeep Holla +L: linux-arm-kernel@lists.infradead.org +S: Maintained +F: drivers/firmware/smccc/ +F: include/linux/arm-smccc.h + SMIA AND SMIA++ IMAGE SENSOR DRIVER M: Sakari Ailus L: linux-media@vger.kernel.org diff --git a/drivers/firmware/Makefile b/drivers/firmware/Makefile index e9fb838af4df..99510be9f5ed 100644 --- a/drivers/firmware/Makefile +++ b/drivers/firmware/Makefile @@ -23,12 +23,13 @@ obj-$(CONFIG_TRUSTED_FOUNDATIONS) += trusted_foundations.o obj-$(CONFIG_TURRIS_MOX_RWTM) += turris-mox-rwtm.o obj-$(CONFIG_ARM_SCMI_PROTOCOL) += arm_scmi/ -obj-y += psci/ obj-y += broadcom/ obj-y += meson/ obj-$(CONFIG_GOOGLE_FIRMWARE) += google/ obj-$(CONFIG_EFI) += efi/ obj-$(CONFIG_UEFI_CPER) += efi/ obj-y += imx/ +obj-y += psci/ +obj-y += smccc/ obj-y += tegra/ obj-y += xilinx/ diff --git a/drivers/firmware/psci/psci.c b/drivers/firmware/psci/psci.c index 6a56d7196697..1330a698a178 100644 --- a/drivers/firmware/psci/psci.c +++ b/drivers/firmware/psci/psci.c @@ -46,25 +46,14 @@ * require cooperation with a Trusted OS driver. 
*/ static int resident_cpu = -1; +struct psci_operations psci_ops; +static enum arm_smccc_conduit psci_conduit = SMCCC_CONDUIT_NONE; bool psci_tos_resident_on(int cpu) { return cpu == resident_cpu; } -struct psci_operations psci_ops = { - .conduit = SMCCC_CONDUIT_NONE, - .smccc_version = ARM_SMCCC_VERSION_1_0, -}; - -enum arm_smccc_conduit arm_smccc_1_1_get_conduit(void) -{ - if (psci_ops.smccc_version < ARM_SMCCC_VERSION_1_1) - return SMCCC_CONDUIT_NONE; - - return psci_ops.conduit; -} - typedef unsigned long (psci_fn)(unsigned long, unsigned long, unsigned long, unsigned long); static psci_fn *invoke_psci_fn; @@ -90,6 +79,7 @@ static u32 psci_function_id[PSCI_FN_MAX]; static u32 psci_cpu_suspend_feature; static bool psci_system_reset2_supported; +void __init arm_smccc_version_init(u32 version, enum arm_smccc_conduit conduit); static inline bool psci_has_ext_power_state(void) { @@ -242,7 +232,7 @@ static void set_conduit(enum arm_smccc_conduit conduit) WARN(1, "Unexpected PSCI conduit %d\n", conduit); } - psci_ops.conduit = conduit; + psci_conduit = conduit; } static int get_set_conduit_method(struct device_node *np) @@ -412,7 +402,7 @@ static void __init psci_init_smccc(void) u32 ret; ret = invoke_psci_fn(ARM_SMCCC_VERSION_FUNC_ID, 0, 0, 0); if (ret >= ARM_SMCCC_VERSION_1_1) { - psci_ops.smccc_version = ret; + arm_smccc_version_init(ret, psci_conduit); ver = ret; } } diff --git a/drivers/firmware/smccc/Makefile b/drivers/firmware/smccc/Makefile new file mode 100644 index 000000000000..6f369fe3f0b9 --- /dev/null +++ b/drivers/firmware/smccc/Makefile @@ -0,0 +1,3 @@ +# SPDX-License-Identifier: GPL-2.0 +# +obj-$(CONFIG_HAVE_ARM_SMCCC_DISCOVERY) += smccc.o diff --git a/drivers/firmware/smccc/smccc.c b/drivers/firmware/smccc/smccc.c new file mode 100644 index 000000000000..de92a4b9f8f6 --- /dev/null +++ b/drivers/firmware/smccc/smccc.c @@ -0,0 +1,26 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Copyright (C) 2020 Arm Limited + */ + +#define pr_fmt(fmt) "smccc: " fmt + +#include +#include + +static u32 smccc_version = ARM_SMCCC_VERSION_1_0; +static enum arm_smccc_conduit smccc_conduit = SMCCC_CONDUIT_NONE; + +void __init arm_smccc_version_init(u32 version, enum arm_smccc_conduit conduit) +{ + smccc_version = version; + smccc_conduit = conduit; +} + +enum arm_smccc_conduit arm_smccc_1_1_get_conduit(void) +{ + if (smccc_version < ARM_SMCCC_VERSION_1_1) + return SMCCC_CONDUIT_NONE; + + return smccc_conduit; +} diff --git a/include/linux/psci.h b/include/linux/psci.h index 29bd0671e5bb..14ad9b9ebcd6 100644 --- a/include/linux/psci.h +++ b/include/linux/psci.h @@ -30,8 +30,6 @@ struct psci_operations { int (*affinity_info)(unsigned long target_affinity, unsigned long lowest_affinity_level); int (*migrate_info_type)(void); - enum arm_smccc_conduit conduit; - u32 smccc_version; }; extern struct psci_operations psci_ops; -- cgit v1.2.3 From a4fb17465182c9fc13104e4df04d050892055205 Mon Sep 17 00:00:00 2001 From: Sudeep Holla Date: Mon, 18 May 2020 10:12:21 +0100 Subject: firmware: smccc: Add function to fetch SMCCC version For backward compatibility reasons, PSCI maintains SMCCC version as SMCCC didn't provide ARM_SMCCC_VERSION_FUNC_ID until v1.1. PSCI initialises both the SMCCC version and conduit. Similar to the conduit, let us provide accessors to fetch the SMCCC version also so that other SMCCC v1.1+ features can use it. 
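A hedged example of the v1.1+ feature gating this enables (the wrapper name is hypothetical):

	static bool example_has_smccc_v1_2(void)
	{
		/* SMCCC_CONDUIT_NONE also covers plain SMCCC v1.0 firmware */
		if (arm_smccc_1_1_get_conduit() == SMCCC_CONDUIT_NONE)
			return false;

		return arm_smccc_get_version() >= ARM_SMCCC_VERSION_1_2;
	}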
Signed-off-by: Sudeep Holla Tested-by: Etienne Carriere Reviewed-by: Steven Price Reviewed-by: Etienne Carriere Acked-by: Mark Rutland Link: https://lore.kernel.org/r/20200518091222.27467-7-sudeep.holla@arm.com Signed-off-by: Will Deacon --- drivers/firmware/smccc/smccc.c | 5 +++++ include/linux/arm-smccc.h | 11 +++++++++++ 2 files changed, 16 insertions(+) (limited to 'include/linux') diff --git a/drivers/firmware/smccc/smccc.c b/drivers/firmware/smccc/smccc.c index de92a4b9f8f6..4e80921ee212 100644 --- a/drivers/firmware/smccc/smccc.c +++ b/drivers/firmware/smccc/smccc.c @@ -24,3 +24,8 @@ enum arm_smccc_conduit arm_smccc_1_1_get_conduit(void) return smccc_conduit; } + +u32 arm_smccc_get_version(void) +{ + return smccc_version; +} diff --git a/include/linux/arm-smccc.h b/include/linux/arm-smccc.h index c3784ba8e2a4..c491d210e3c3 100644 --- a/include/linux/arm-smccc.h +++ b/include/linux/arm-smccc.h @@ -100,6 +100,17 @@ enum arm_smccc_conduit { */ enum arm_smccc_conduit arm_smccc_1_1_get_conduit(void); +/** + * arm_smccc_get_version() + * + * Returns the version to be used for SMCCCv1.1 or later. + * + * When SMCCCv1.1 or above is not present, returns SMCCCv1.0, but this + * does not imply the presence of firmware or a valid conduit. Caller + * handling SMCCCv1.0 must determine the conduit by other means. + */ +u32 arm_smccc_get_version(void); + /** * struct arm_smccc_res - Result from SMC/HVC call * @a0-a3 result values from registers 0 to 3 -- cgit v1.2.3 From b8bff599261c930630385ee21d3f98e7ce7d4843 Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Sun, 22 Mar 2020 15:46:24 -0500 Subject: exec: Factor security_bprm_creds_for_exec out of security_bprm_set_creds Today security_bprm_set_creds has several implementations: apparmor_bprm_set_creds, cap_bprm_set_creds, selinux_bprm_set_creds, smack_bprm_set_creds, and tomoyo_bprm_set_creds. Except for cap_bprm_set_creds they all test bprm->called_set_creds and return immediately if it is true. The function cap_bprm_set_creds ignores bprm->called_set_creds entirely. Create a new LSM hook security_bprm_creds_for_exec that is called just before prepare_binprm in __do_execve_file, resulting in an LSM hook that is called exactly once for the entirety of exec. Modify the bits of security_bprm_set_creds that only want to be called once per exec into security_bprm_creds_for_exec, leaving only cap_bprm_set_creds behind. Remove bprm->called_set_creds; all of its former users have been moved to security_bprm_creds_for_exec. Add or update comments as appropriate to bring them up to date and to reflect this change. Link: https://lkml.kernel.org/r/87v9kszrzh.fsf_-_@x220.int.ebiederm.org Acked-by: Linus Torvalds Acked-by: Casey Schaufler # For the LSM and Smack bits Reviewed-by: Kees Cook Signed-off-by: "Eric W. 
Biederman" --- fs/exec.c | 6 ++++- include/linux/binfmts.h | 18 ++++---------- include/linux/lsm_hook_defs.h | 1 + include/linux/lsm_hooks.h | 50 +++++++++++++++++++++----------------- include/linux/security.h | 6 +++++ security/apparmor/domain.c | 7 ++---- security/apparmor/include/domain.h | 2 +- security/apparmor/lsm.c | 2 +- security/security.c | 5 ++++ security/selinux/hooks.c | 8 +++--- security/smack/smack_lsm.c | 9 +++---- security/tomoyo/tomoyo.c | 12 +++------ 12 files changed, 63 insertions(+), 63 deletions(-) (limited to 'include/linux') diff --git a/fs/exec.c b/fs/exec.c index 14b786158aa9..9e70da47f8d9 100644 --- a/fs/exec.c +++ b/fs/exec.c @@ -1640,7 +1640,6 @@ int prepare_binprm(struct linux_binprm *bprm) retval = security_bprm_set_creds(bprm); if (retval) return retval; - bprm->called_set_creds = 1; memset(bprm->buf, 0, BINPRM_BUF_SIZE); return kernel_read(bprm->file, bprm->buf, BINPRM_BUF_SIZE, &pos); @@ -1855,6 +1854,11 @@ static int __do_execve_file(int fd, struct filename *filename, if (retval < 0) goto out; + /* Set the unchanging part of bprm->cred */ + retval = security_bprm_creds_for_exec(bprm); + if (retval) + goto out; + retval = prepare_binprm(bprm); if (retval < 0) goto out; diff --git a/include/linux/binfmts.h b/include/linux/binfmts.h index 1b48e2154766..d1217fcdedea 100644 --- a/include/linux/binfmts.h +++ b/include/linux/binfmts.h @@ -27,22 +27,14 @@ struct linux_binprm { unsigned long argmin; /* rlimit marker for copy_strings() */ unsigned int /* - * True after the bprm_set_creds hook has been called once - * (multiple calls can be made via prepare_binprm() for - * binfmt_script/misc). - */ - called_set_creds:1, - /* - * True if most recent call to the commoncaps bprm_set_creds - * hook (due to multiple prepare_binprm() calls from the - * binfmt_script/misc handlers) resulted in elevated - * privileges. + * True if most recent call to cap_bprm_set_creds + * resulted in elevated privileges. */ cap_elevated:1, /* - * Set by bprm_set_creds hook to indicate a privilege-gaining - * exec has happened. Used to sanitize execution environment - * and to set AT_SECURE auxv for glibc. + * Set by bprm_creds_for_exec hook to indicate a + * privilege-gaining exec has happened. Used to set + * AT_SECURE auxv for glibc. */ secureexec:1, /* diff --git a/include/linux/lsm_hook_defs.h b/include/linux/lsm_hook_defs.h index 9cd4455528e5..aab0695f41df 100644 --- a/include/linux/lsm_hook_defs.h +++ b/include/linux/lsm_hook_defs.h @@ -49,6 +49,7 @@ LSM_HOOK(int, 0, syslog, int type) LSM_HOOK(int, 0, settime, const struct timespec64 *ts, const struct timezone *tz) LSM_HOOK(int, 0, vm_enough_memory, struct mm_struct *mm, long pages) +LSM_HOOK(int, 0, bprm_creds_for_exec, struct linux_binprm *bprm) LSM_HOOK(int, 0, bprm_set_creds, struct linux_binprm *bprm) LSM_HOOK(int, 0, bprm_check_security, struct linux_binprm *bprm) LSM_HOOK(void, LSM_RET_VOID, bprm_committing_creds, struct linux_binprm *bprm) diff --git a/include/linux/lsm_hooks.h b/include/linux/lsm_hooks.h index 988ca0df7824..c719af37df20 100644 --- a/include/linux/lsm_hooks.h +++ b/include/linux/lsm_hooks.h @@ -34,40 +34,46 @@ * * Security hooks for program execution operations. * + * @bprm_creds_for_exec: + * If the setup in prepare_exec_creds did not setup @bprm->cred->security + * properly for executing @bprm->file, update the LSM's portion of + * @bprm->cred->security to be what commit_creds needs to install for the + * new program. This hook may also optionally check permissions + * (e.g. 
for transitions between security domains). + * The hook must set @bprm->secureexec to 1 if AT_SECURE should be set to + * request libc enable secure mode. + * @bprm contains the linux_binprm structure. + * Return 0 if the hook is successful and permission is granted. * @bprm_set_creds: - * Save security information in the bprm->security field, typically based - * on information about the bprm->file, for later use by the apply_creds - * hook. This hook may also optionally check permissions (e.g. for + * Assuming that the relevant bits of @bprm->cred->security have been + * previously set, examine @bprm->file and regenerate them. This is + * so that the credentials derived from the interpreter the code is + * actually going to run are used rather than credentials derived + * from a script. This done because the interpreter binary needs to + * reopen script, and may end up opening something completely different. + * This hook may also optionally check permissions (e.g. for * transitions between security domains). - * This hook may be called multiple times during a single execve, e.g. for - * interpreters. The hook can tell whether it has already been called by - * checking to see if @bprm->security is non-NULL. If so, then the hook - * may decide either to retain the security information saved earlier or - * to replace it. The hook must set @bprm->secureexec to 1 if a "secure - * exec" has happened as a result of this hook call. The flag is used to - * indicate the need for a sanitized execution environment, and is also - * passed in the ELF auxiliary table on the initial stack to indicate - * whether libc should enable secure mode. + * The hook must set @bprm->cap_elevated to 1 if AT_SECURE should be set to + * request libc enable secure mode. * @bprm contains the linux_binprm structure. * Return 0 if the hook is successful and permission is granted. * @bprm_check_security: * This hook mediates the point when a search for a binary handler will - * begin. It allows a check the @bprm->security value which is set in the - * preceding set_creds call. The primary difference from set_creds is - * that the argv list and envp list are reliably available in @bprm. This - * hook may be called multiple times during a single execve; and in each - * pass set_creds is called first. + * begin. It allows a check against the @bprm->cred->security value + * which was set in the preceding creds_for_exec call. The argv list and + * envp list are reliably available in @bprm. This hook may be called + * multiple times during a single execve. * @bprm contains the linux_binprm structure. * Return 0 if the hook is successful and permission is granted. * @bprm_committing_creds: * Prepare to install the new security attributes of a process being * transformed by an execve operation, based on the old credentials * pointed to by @current->cred and the information set in @bprm->cred by - * the bprm_set_creds hook. @bprm points to the linux_binprm structure. - * This hook is a good place to perform state changes on the process such - * as closing open file descriptors to which access will no longer be - * granted when the attributes are changed. This is called immediately - * before commit_creds(). + * the bprm_creds_for_exec hook. @bprm points to the linux_binprm + * structure. This hook is a good place to perform state changes on the + * process such as closing open file descriptors to which access will no + * longer be granted when the attributes are changed. This is called + * immediately before commit_creds(). 
* @bprm_committed_creds: * Tidy up after the installation of the new security attributes of a * process being transformed by an execve operation. The new credentials diff --git a/include/linux/security.h b/include/linux/security.h index a8d9310472df..1bd7a6582775 100644 --- a/include/linux/security.h +++ b/include/linux/security.h @@ -276,6 +276,7 @@ int security_quota_on(struct dentry *dentry); int security_syslog(int type); int security_settime64(const struct timespec64 *ts, const struct timezone *tz); int security_vm_enough_memory_mm(struct mm_struct *mm, long pages); +int security_bprm_creds_for_exec(struct linux_binprm *bprm); int security_bprm_set_creds(struct linux_binprm *bprm); int security_bprm_check(struct linux_binprm *bprm); void security_bprm_committing_creds(struct linux_binprm *bprm); @@ -569,6 +570,11 @@ static inline int security_vm_enough_memory_mm(struct mm_struct *mm, long pages) return __vm_enough_memory(mm, pages, cap_vm_enough_memory(mm, pages)); } +static inline int security_bprm_creds_for_exec(struct linux_binprm *bprm) +{ + return 0; +} + static inline int security_bprm_set_creds(struct linux_binprm *bprm) { return cap_bprm_set_creds(bprm); diff --git a/security/apparmor/domain.c b/security/apparmor/domain.c index 6ceb74e0f789..0b870a647488 100644 --- a/security/apparmor/domain.c +++ b/security/apparmor/domain.c @@ -854,14 +854,14 @@ static struct aa_label *handle_onexec(struct aa_label *label, } /** - * apparmor_bprm_set_creds - set the new creds on the bprm struct + * apparmor_bprm_creds_for_exec - Update the new creds on the bprm struct * @bprm: binprm for the exec (NOT NULL) * * Returns: %0 or error on failure * * TODO: once the other paths are done see if we can't refactor into a fn */ -int apparmor_bprm_set_creds(struct linux_binprm *bprm) +int apparmor_bprm_creds_for_exec(struct linux_binprm *bprm) { struct aa_task_ctx *ctx; struct aa_label *label, *new = NULL; @@ -875,9 +875,6 @@ int apparmor_bprm_set_creds(struct linux_binprm *bprm) file_inode(bprm->file)->i_mode }; - if (bprm->called_set_creds) - return 0; - ctx = task_ctx(current); AA_BUG(!cred_label(bprm->cred)); AA_BUG(!ctx); diff --git a/security/apparmor/include/domain.h b/security/apparmor/include/domain.h index 21b875fe2d37..d14928fe1c6f 100644 --- a/security/apparmor/include/domain.h +++ b/security/apparmor/include/domain.h @@ -30,7 +30,7 @@ struct aa_domain { struct aa_label *x_table_lookup(struct aa_profile *profile, u32 xindex, const char **name); -int apparmor_bprm_set_creds(struct linux_binprm *bprm); +int apparmor_bprm_creds_for_exec(struct linux_binprm *bprm); void aa_free_domain_entries(struct aa_domain *domain); int aa_change_hat(const char *hats[], int count, u64 token, int flags); diff --git a/security/apparmor/lsm.c b/security/apparmor/lsm.c index b621ad74f54a..3623ab08279d 100644 --- a/security/apparmor/lsm.c +++ b/security/apparmor/lsm.c @@ -1232,7 +1232,7 @@ static struct security_hook_list apparmor_hooks[] __lsm_ro_after_init = { LSM_HOOK_INIT(cred_prepare, apparmor_cred_prepare), LSM_HOOK_INIT(cred_transfer, apparmor_cred_transfer), - LSM_HOOK_INIT(bprm_set_creds, apparmor_bprm_set_creds), + LSM_HOOK_INIT(bprm_creds_for_exec, apparmor_bprm_creds_for_exec), LSM_HOOK_INIT(bprm_committing_creds, apparmor_bprm_committing_creds), LSM_HOOK_INIT(bprm_committed_creds, apparmor_bprm_committed_creds), diff --git a/security/security.c b/security/security.c index 7fed24b9d57e..4ee76a729f73 100644 --- a/security/security.c +++ b/security/security.c @@ -823,6 +823,11 @@ int 
security_vm_enough_memory_mm(struct mm_struct *mm, long pages) return __vm_enough_memory(mm, pages, cap_sys_admin); } +int security_bprm_creds_for_exec(struct linux_binprm *bprm) +{ + return call_int_hook(bprm_creds_for_exec, 0, bprm); +} + int security_bprm_set_creds(struct linux_binprm *bprm) { return call_int_hook(bprm_set_creds, 0, bprm); diff --git a/security/selinux/hooks.c b/security/selinux/hooks.c index 0b4e32161b77..718345dd76bb 100644 --- a/security/selinux/hooks.c +++ b/security/selinux/hooks.c @@ -2286,7 +2286,7 @@ static int check_nnp_nosuid(const struct linux_binprm *bprm, return -EACCES; } -static int selinux_bprm_set_creds(struct linux_binprm *bprm) +static int selinux_bprm_creds_for_exec(struct linux_binprm *bprm) { const struct task_security_struct *old_tsec; struct task_security_struct *new_tsec; @@ -2297,8 +2297,6 @@ static int selinux_bprm_set_creds(struct linux_binprm *bprm) /* SELinux context only depends on initial program or script and not * the script interpreter */ - if (bprm->called_set_creds) - return 0; old_tsec = selinux_cred(current_cred()); new_tsec = selinux_cred(bprm->cred); @@ -6385,7 +6383,7 @@ static int selinux_setprocattr(const char *name, void *value, size_t size) /* Permission checking based on the specified context is performed during the actual operation (execve, open/mkdir/...), when we know the full context of the - operation. See selinux_bprm_set_creds for the execve + operation. See selinux_bprm_creds_for_exec for the execve checks and may_create for the file creation checks. The operation will then fail if the context is not permitted. */ tsec = selinux_cred(new); @@ -6914,7 +6912,7 @@ static struct security_hook_list selinux_hooks[] __lsm_ro_after_init = { LSM_HOOK_INIT(netlink_send, selinux_netlink_send), - LSM_HOOK_INIT(bprm_set_creds, selinux_bprm_set_creds), + LSM_HOOK_INIT(bprm_creds_for_exec, selinux_bprm_creds_for_exec), LSM_HOOK_INIT(bprm_committing_creds, selinux_bprm_committing_creds), LSM_HOOK_INIT(bprm_committed_creds, selinux_bprm_committed_creds), diff --git a/security/smack/smack_lsm.c b/security/smack/smack_lsm.c index 8c61d175e195..0ac8f4518d07 100644 --- a/security/smack/smack_lsm.c +++ b/security/smack/smack_lsm.c @@ -891,12 +891,12 @@ static int smack_sb_statfs(struct dentry *dentry) */ /** - * smack_bprm_set_creds - set creds for exec + * smack_bprm_creds_for_exec - Update bprm->cred if needed for exec * @bprm: the exec information * * Returns 0 if it gets a blob, -EPERM if exec forbidden and -ENOMEM otherwise */ -static int smack_bprm_set_creds(struct linux_binprm *bprm) +static int smack_bprm_creds_for_exec(struct linux_binprm *bprm) { struct inode *inode = file_inode(bprm->file); struct task_smack *bsp = smack_cred(bprm->cred); @@ -904,9 +904,6 @@ static int smack_bprm_set_creds(struct linux_binprm *bprm) struct superblock_smack *sbsp; int rc; - if (bprm->called_set_creds) - return 0; - isp = smack_inode(inode); if (isp->smk_task == NULL || isp->smk_task == bsp->smk_task) return 0; @@ -4598,7 +4595,7 @@ static struct security_hook_list smack_hooks[] __lsm_ro_after_init = { LSM_HOOK_INIT(sb_statfs, smack_sb_statfs), LSM_HOOK_INIT(sb_set_mnt_opts, smack_set_mnt_opts), - LSM_HOOK_INIT(bprm_set_creds, smack_bprm_set_creds), + LSM_HOOK_INIT(bprm_creds_for_exec, smack_bprm_creds_for_exec), LSM_HOOK_INIT(inode_alloc_security, smack_inode_alloc_security), LSM_HOOK_INIT(inode_init_security, smack_inode_init_security), diff --git a/security/tomoyo/tomoyo.c b/security/tomoyo/tomoyo.c index 716c92ec941a..f9adddc42ac8 100644 --- 
a/security/tomoyo/tomoyo.c +++ b/security/tomoyo/tomoyo.c @@ -63,20 +63,14 @@ static void tomoyo_bprm_committed_creds(struct linux_binprm *bprm) #ifndef CONFIG_SECURITY_TOMOYO_OMIT_USERSPACE_LOADER /** - * tomoyo_bprm_set_creds - Target for security_bprm_set_creds(). + * tomoyo_bprm_creds_for_exec - Target for security_bprm_creds_for_exec(). * * @bprm: Pointer to "struct linux_binprm". * * Returns 0. */ -static int tomoyo_bprm_set_creds(struct linux_binprm *bprm) +static int tomoyo_bprm_creds_for_exec(struct linux_binprm *bprm) { - /* - * Do only if this function is called for the first time of an execve - * operation. - */ - if (bprm->called_set_creds) - return 0; /* * Load policy if /sbin/tomoyo-init exists and /sbin/init is requested * for the first time. @@ -539,7 +533,7 @@ static struct security_hook_list tomoyo_hooks[] __lsm_ro_after_init = { LSM_HOOK_INIT(task_alloc, tomoyo_task_alloc), LSM_HOOK_INIT(task_free, tomoyo_task_free), #ifndef CONFIG_SECURITY_TOMOYO_OMIT_USERSPACE_LOADER - LSM_HOOK_INIT(bprm_set_creds, tomoyo_bprm_set_creds), + LSM_HOOK_INIT(bprm_creds_for_exec, tomoyo_bprm_creds_for_exec), #endif LSM_HOOK_INIT(bprm_check_security, tomoyo_bprm_check_security), LSM_HOOK_INIT(file_fcntl, tomoyo_file_fcntl), -- cgit v1.2.3 From 74244b59a82358b9f51c80981a99c5951ea3028f Mon Sep 17 00:00:00 2001 From: Hannes Reinecke Date: Thu, 14 May 2020 08:09:28 +0200 Subject: dm: use dynamic debug instead of compile-time config option Switch to using dynamic debug to avoid having to recompile the kernel just to enable debugging messages. Signed-off-by: Hannes Reinecke Signed-off-by: Mike Snitzer --- include/linux/device-mapper.h | 7 +------ 1 file changed, 1 insertion(+), 6 deletions(-) (limited to 'include/linux') diff --git a/include/linux/device-mapper.h b/include/linux/device-mapper.h index 934037d938b9..8750f2dc5613 100644 --- a/include/linux/device-mapper.h +++ b/include/linux/device-mapper.h @@ -559,13 +559,8 @@ void *dm_vcalloc(unsigned long nmemb, unsigned long elem_size); #define DMINFO(fmt, ...) pr_info(DM_FMT(fmt), ##__VA_ARGS__) #define DMINFO_LIMIT(fmt, ...) pr_info_ratelimited(DM_FMT(fmt), ##__VA_ARGS__) -#ifdef CONFIG_DM_DEBUG -#define DMDEBUG(fmt, ...) printk(KERN_DEBUG DM_FMT(fmt), ##__VA_ARGS__) +#define DMDEBUG(fmt, ...) pr_debug(DM_FMT(fmt), ##__VA_ARGS__) #define DMDEBUG_LIMIT(fmt, ...) pr_debug_ratelimited(DM_FMT(fmt), ##__VA_ARGS__) -#else -#define DMDEBUG(fmt, ...) no_printk(fmt, ##__VA_ARGS__) -#define DMDEBUG_LIMIT(fmt, ...) no_printk(fmt, ##__VA_ARGS__) -#endif #define DMEMIT(x...) sz += ((sz >= maxlen) ? \ 0 : scnprintf(result + sz, maxlen - sz, x)) -- cgit v1.2.3 From ca07eda33e01eafa7a26ec06974f7eacee6a89c8 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Wed, 20 May 2020 17:30:12 -0400 Subject: SUNRPC: Refactor svc_recvfrom() This function is not currently "generic" so remove the documenting comment and rename it appropriately. Its internals are converted to use bio_vecs for reading from the transport socket. In existing typical sunrpc uses of bio_vecs, the bio_vec array is allocated dynamically. Here, instead, an array of bio_vecs is added to svc_rqst. The lifetime of this array can be greater than one call to xpo_recvfrom(): - Multiple calls to xpo_recvfrom() might be needed to read an RPC message completely. - At some later point, rq_arg.bvecs will point to this array and it will carry the received message into svc_process(). 
I also expect that a future optimization will remove either the rq_vec or rq_pages array in favor of rq_bvec, thus conserving the size of struct svc_rqst. Signed-off-by: Chuck Lever --- include/linux/sunrpc/svc.h | 1 + net/sunrpc/svcsock.c | 109 ++++++++++++++++++++++++++++----------------- 2 files changed, 69 insertions(+), 41 deletions(-) (limited to 'include/linux') diff --git a/include/linux/sunrpc/svc.h b/include/linux/sunrpc/svc.h index fd390894a584..05da19a0516d 100644 --- a/include/linux/sunrpc/svc.h +++ b/include/linux/sunrpc/svc.h @@ -254,6 +254,7 @@ struct svc_rqst { struct page * *rq_page_end; /* one past the last page */ struct kvec rq_vec[RPCSVC_MAXPAGES]; /* generally useful.. */ + struct bio_vec rq_bvec[RPCSVC_MAXPAGES]; __be32 rq_xid; /* transmission id */ u32 rq_prog; /* program number */ diff --git a/net/sunrpc/svcsock.c b/net/sunrpc/svcsock.c index 5b2981a0711c..3e25511b800e 100644 --- a/net/sunrpc/svcsock.c +++ b/net/sunrpc/svcsock.c @@ -223,26 +223,62 @@ static int svc_one_sock_name(struct svc_sock *svsk, char *buf, int remaining) return len; } +#if ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE +static void svc_flush_bvec(const struct bio_vec *bvec, size_t size, size_t seek) +{ + struct bvec_iter bi = { + .bi_size = size, + }; + struct bio_vec bv; + + bvec_iter_advance(bvec, &bi, seek & PAGE_MASK); + for_each_bvec(bv, bvec, bi, bi) + flush_dcache_page(bv.bv_page); +} +#else +static inline void svc_flush_bvec(const struct bio_vec *bvec, size_t size, + size_t seek) +{ +} +#endif + /* - * Generic recvfrom routine. + * Read from @rqstp's transport socket. The incoming message fills whole + * pages in @rqstp's rq_pages array until the last page of the message + * has been received into a partial page. */ -static ssize_t svc_recvfrom(struct svc_rqst *rqstp, struct kvec *iov, - unsigned int nr, size_t buflen, unsigned int base) +static ssize_t svc_tcp_read_msg(struct svc_rqst *rqstp, size_t buflen, + size_t seek) { struct svc_sock *svsk = container_of(rqstp->rq_xprt, struct svc_sock, sk_xprt); + struct bio_vec *bvec = rqstp->rq_bvec; struct msghdr msg = { NULL }; + unsigned int i; ssize_t len; + size_t t; rqstp->rq_xprt_hlen = 0; clear_bit(XPT_DATA, &svsk->sk_xprt.xpt_flags); - iov_iter_kvec(&msg.msg_iter, READ, iov, nr, buflen); - if (base != 0) { - iov_iter_advance(&msg.msg_iter, base); - buflen -= base; + + for (i = 0, t = 0; t < buflen; i++, t += PAGE_SIZE) { + bvec[i].bv_page = rqstp->rq_pages[i]; + bvec[i].bv_len = PAGE_SIZE; + bvec[i].bv_offset = 0; + } + rqstp->rq_respages = &rqstp->rq_pages[i]; + rqstp->rq_next_page = rqstp->rq_respages + 1; + + iov_iter_bvec(&msg.msg_iter, READ, bvec, i, buflen); + if (seek) { + iov_iter_advance(&msg.msg_iter, seek); + buflen -= seek; } len = sock_recvmsg(svsk->sk_sock, &msg, MSG_DONTWAIT); + if (len > 0) + svc_flush_bvec(bvec, len, seek); + /* If we read a full record, then assume there may be more * data to read (stream based sockets only!) 
*/ @@ -775,13 +811,14 @@ failed: return NULL; } -static unsigned int svc_tcp_restore_pages(struct svc_sock *svsk, struct svc_rqst *rqstp) +static size_t svc_tcp_restore_pages(struct svc_sock *svsk, + struct svc_rqst *rqstp) { - unsigned int i, len, npages; + size_t len = svsk->sk_datalen; + unsigned int i, npages; - if (svsk->sk_datalen == 0) + if (!len) return 0; - len = svsk->sk_datalen; npages = (len + PAGE_SIZE - 1) >> PAGE_SHIFT; for (i = 0; i < npages; i++) { if (rqstp->rq_pages[i] != NULL) @@ -917,20 +954,6 @@ unlock_eagain: return -EAGAIN; } -static int copy_pages_to_kvecs(struct kvec *vec, struct page **pages, int len) -{ - int i = 0; - int t = 0; - - while (t < len) { - vec[i].iov_base = page_address(pages[i]); - vec[i].iov_len = PAGE_SIZE; - i++; - t += PAGE_SIZE; - } - return i; -} - static void svc_tcp_fragment_received(struct svc_sock *svsk) { /* If we have more data, signal svc_xprt_enqueue() to try again */ @@ -938,20 +961,33 @@ static void svc_tcp_fragment_received(struct svc_sock *svsk) svsk->sk_marker = xdr_zero; } -/* - * Receive data from a TCP socket. +/** + * svc_tcp_recvfrom - Receive data from a TCP socket + * @rqstp: request structure into which to receive an RPC Call + * + * Called in a loop when XPT_DATA has been set. + * + * Read the 4-byte stream record marker, then use the record length + * in that marker to set up exactly the resources needed to receive + * the next RPC message into @rqstp. + * + * Returns: + * On success, the number of bytes in a received RPC Call, or + * %0 if a complete RPC Call message was not ready to return + * + * The zero return case handles partial receives and callback Replies. + * The state of a partial receive is preserved in the svc_sock for + * the next call to svc_tcp_recvfrom. */ static int svc_tcp_recvfrom(struct svc_rqst *rqstp) { struct svc_sock *svsk = container_of(rqstp->rq_xprt, struct svc_sock, sk_xprt); struct svc_serv *serv = svsk->sk_xprt.xpt_server; - int len; - struct kvec *vec; - unsigned int want, base; + size_t want, base; + ssize_t len; __be32 *p; __be32 calldir; - int pnum; clear_bit(XPT_DATA, &svsk->sk_xprt.xpt_flags); len = svc_tcp_read_marker(svsk, rqstp); @@ -960,16 +996,7 @@ static int svc_tcp_recvfrom(struct svc_rqst *rqstp) base = svc_tcp_restore_pages(svsk, rqstp); want = len - (svsk->sk_tcplen - sizeof(rpc_fraghdr)); - - vec = rqstp->rq_vec; - - pnum = copy_pages_to_kvecs(&vec[0], &rqstp->rq_pages[0], base + want); - - rqstp->rq_respages = &rqstp->rq_pages[pnum]; - rqstp->rq_next_page = rqstp->rq_respages + 1; - - /* Now receive data */ - len = svc_recvfrom(rqstp, vec, pnum, base + want, base); + len = svc_tcp_read_msg(rqstp, base + want, base); if (len >= 0) { trace_svcsock_tcp_recv(&svsk->sk_xprt, len); svsk->sk_tcplen += len; -- cgit v1.2.3 From 931ca7ab7fe804d77bc6952f1512950c0d870f26 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Sun, 29 Mar 2020 17:18:30 -0400 Subject: ip*_mc_gsfget(): lift copyout of struct group_filter into callers pass the userland pointer to the array in its tail, so that part gets copied out by our functions; copyout of everything else is done in the callers. Rationale: reuse for compat; the array is the same in native and compat, the layout of parts before it is different for compat. 
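For context, a hypothetical userspace sketch of the call this serves (assumes the RFC 3678 definitions from <netinet/in.h> plus <sys/socket.h>, <stdlib.h> and <string.h>; the helper name is made up). The callers now copy out the fixed head of struct group_filter, while these functions copy out only the trailing sockaddr_storage array:

	/* Query the source filter of a joined IPv4 multicast group. */
	static int example_dump_msfilter(int fd, const struct sockaddr_in *group,
					 unsigned int ifindex)
	{
		socklen_t len = GROUP_FILTER_SIZE(16);
		struct group_filter *gf = calloc(1, len);

		if (!gf)
			return -1;
		gf->gf_interface = ifindex;
		memcpy(&gf->gf_group, group, sizeof(*group));
		gf->gf_numsrc = 16;	/* capacity of gf_slist[] */
		if (getsockopt(fd, IPPROTO_IP, MCAST_MSFILTER, gf, &len) < 0) {
			free(gf);
			return -1;
		}
		/*
		 * gf->gf_numsrc now reports the total number of sources;
		 * at most 16 gf_slist[] entries were filled in.
		 */
		free(gf);
		return 0;
	}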
Signed-off-by: Al Viro --- include/linux/igmp.h | 2 +- include/net/ipv6.h | 2 +- net/ipv4/igmp.c | 18 +++++------------- net/ipv4/ip_sockglue.c | 19 ++++++++++++++----- net/ipv6/ipv6_sockglue.c | 18 ++++++++++++++---- net/ipv6/mcast.c | 10 +++------- 6 files changed, 38 insertions(+), 31 deletions(-) (limited to 'include/linux') diff --git a/include/linux/igmp.h b/include/linux/igmp.h index faa6586a5783..64ce8cd1cfaf 100644 --- a/include/linux/igmp.h +++ b/include/linux/igmp.h @@ -123,7 +123,7 @@ extern int ip_mc_msfilter(struct sock *sk, struct ip_msfilter *msf,int ifindex); extern int ip_mc_msfget(struct sock *sk, struct ip_msfilter *msf, struct ip_msfilter __user *optval, int __user *optlen); extern int ip_mc_gsfget(struct sock *sk, struct group_filter *gsf, - struct group_filter __user *optval, int __user *optlen); + struct sockaddr_storage __user *p); extern int ip_mc_sf_allow(struct sock *sk, __be32 local, __be32 rmt, int dif, int sdif); extern void ip_mc_init_dev(struct in_device *); diff --git a/include/net/ipv6.h b/include/net/ipv6.h index 5fc3a9d7b053..c45eb78d970f 100644 --- a/include/net/ipv6.h +++ b/include/net/ipv6.h @@ -1138,7 +1138,7 @@ int ip6_mc_source(int add, int omode, struct sock *sk, struct group_source_req *pgsr); int ip6_mc_msfilter(struct sock *sk, struct group_filter *gsf); int ip6_mc_msfget(struct sock *sk, struct group_filter *gsf, - struct group_filter __user *optval, int __user *optlen); + struct sockaddr_storage __user *p); #ifdef CONFIG_PROC_FS int ac6_proc_init(struct net *net); diff --git a/net/ipv4/igmp.c b/net/ipv4/igmp.c index 47f0502b2101..7b272bbed2b4 100644 --- a/net/ipv4/igmp.c +++ b/net/ipv4/igmp.c @@ -2565,9 +2565,9 @@ done: } int ip_mc_gsfget(struct sock *sk, struct group_filter *gsf, - struct group_filter __user *optval, int __user *optlen) + struct sockaddr_storage __user *p) { - int err, i, count, copycount; + int i, count, copycount; struct sockaddr_in *psin; __be32 addr; struct ip_mc_socklist *pmc; @@ -2583,37 +2583,29 @@ int ip_mc_gsfget(struct sock *sk, struct group_filter *gsf, if (!ipv4_is_multicast(addr)) return -EINVAL; - err = -EADDRNOTAVAIL; - for_each_pmc_rtnl(inet, pmc) { if (pmc->multi.imr_multiaddr.s_addr == addr && pmc->multi.imr_ifindex == gsf->gf_interface) break; } if (!pmc) /* must have a prior join */ - goto done; + return -EADDRNOTAVAIL; gsf->gf_fmode = pmc->sfmode; psl = rtnl_dereference(pmc->sflist); count = psl ? psl->sl_count : 0; copycount = count < gsf->gf_numsrc ? 
count : gsf->gf_numsrc; gsf->gf_numsrc = count; - if (put_user(GROUP_FILTER_SIZE(copycount), optlen) || - copy_to_user(optval, gsf, GROUP_FILTER_SIZE(0))) { - return -EFAULT; - } - for (i = 0; i < copycount; i++) { + for (i = 0; i < copycount; i++, p++) { struct sockaddr_storage ss; psin = (struct sockaddr_in *)&ss; memset(&ss, 0, sizeof(ss)); psin->sin_family = AF_INET; psin->sin_addr.s_addr = psl->sl_addr[i]; - if (copy_to_user(&optval->gf_slist[i], &ss, sizeof(ss))) + if (copy_to_user(p, &ss, sizeof(ss))) return -EFAULT; } return 0; -done: - return err; } /* diff --git a/net/ipv4/ip_sockglue.c b/net/ipv4/ip_sockglue.c index 3c2c6cd3933b..e3703a3e7ef4 100644 --- a/net/ipv4/ip_sockglue.c +++ b/net/ipv4/ip_sockglue.c @@ -1473,19 +1473,28 @@ static int do_ip_getsockopt(struct sock *sk, int level, int optname, } case MCAST_MSFILTER: { + struct group_filter __user *p = (void __user *)optval; struct group_filter gsf; + const int size0 = offsetof(struct group_filter, gf_slist); + int num; - if (len < GROUP_FILTER_SIZE(0)) { + if (len < size0) { err = -EINVAL; goto out; } - if (copy_from_user(&gsf, optval, GROUP_FILTER_SIZE(0))) { + if (copy_from_user(&gsf, p, size0)) { err = -EFAULT; goto out; } - err = ip_mc_gsfget(sk, &gsf, - (struct group_filter __user *)optval, - optlen); + num = gsf.gf_numsrc; + err = ip_mc_gsfget(sk, &gsf, p->gf_slist); + if (err) + goto out; + if (gsf.gf_numsrc < num) + num = gsf.gf_numsrc; + if (put_user(GROUP_FILTER_SIZE(num), optlen) || + copy_to_user(p, &gsf, size0)) + err = -EFAULT; goto out; } case IP_MULTICAST_ALL: diff --git a/net/ipv6/ipv6_sockglue.c b/net/ipv6/ipv6_sockglue.c index 96e3f603c8d8..e4a62ca1a3d0 100644 --- a/net/ipv6/ipv6_sockglue.c +++ b/net/ipv6/ipv6_sockglue.c @@ -1056,18 +1056,28 @@ static int do_ipv6_getsockopt(struct sock *sk, int level, int optname, break; case MCAST_MSFILTER: { + struct group_filter __user *p = (void __user *)optval; struct group_filter gsf; + const int size0 = offsetof(struct group_filter, gf_slist); + int num; int err; - if (len < GROUP_FILTER_SIZE(0)) + if (len < size0) return -EINVAL; - if (copy_from_user(&gsf, optval, GROUP_FILTER_SIZE(0))) + if (copy_from_user(&gsf, p, size0)) return -EFAULT; if (gsf.gf_group.ss_family != AF_INET6) return -EADDRNOTAVAIL; + num = gsf.gf_numsrc; lock_sock(sk); - err = ip6_mc_msfget(sk, &gsf, - (struct group_filter __user *)optval, optlen); + err = ip6_mc_msfget(sk, &gsf, p->gf_slist); + if (!err) { + if (num > gsf.gf_numsrc) + num = gsf.gf_numsrc; + if (put_user(GROUP_FILTER_SIZE(num), optlen) || + copy_to_user(p, &gsf, size0)) + err = -EFAULT; + } release_sock(sk); return err; } diff --git a/net/ipv6/mcast.c b/net/ipv6/mcast.c index eaa4c2cc2fbb..97d796c7d6c0 100644 --- a/net/ipv6/mcast.c +++ b/net/ipv6/mcast.c @@ -547,7 +547,7 @@ done: } int ip6_mc_msfget(struct sock *sk, struct group_filter *gsf, - struct group_filter __user *optval, int __user *optlen) + struct sockaddr_storage *p) { int err, i, count, copycount; const struct in6_addr *group; @@ -592,14 +592,10 @@ int ip6_mc_msfget(struct sock *sk, struct group_filter *gsf, copycount = count < gsf->gf_numsrc ? count : gsf->gf_numsrc; gsf->gf_numsrc = count; - if (put_user(GROUP_FILTER_SIZE(copycount), optlen) || - copy_to_user(optval, gsf, GROUP_FILTER_SIZE(0))) { - return -EFAULT; - } /* changes to psl require the socket lock, and a write lock * on pmc->sflock. We have the socket lock so reading here is safe. 
*/ - for (i = 0; i < copycount; i++) { + for (i = 0; i < copycount; i++, p++) { struct sockaddr_in6 *psin6; struct sockaddr_storage ss; @@ -607,7 +603,7 @@ int ip6_mc_msfget(struct sock *sk, struct group_filter *gsf, memset(&ss, 0, sizeof(ss)); psin6->sin6_family = AF_INET6; psin6->sin6_addr = psl->sl_addr[i]; - if (copy_to_user(&optval->gf_slist[i], &ss, sizeof(ss))) + if (copy_to_user(p, &ss, sizeof(ss))) return -EFAULT; } return 0; -- cgit v1.2.3
From 4e2e7cfec13afa41fcbed1d9b93d83432aa0154f Mon Sep 17 00:00:00 2001 From: Hsin-Hsiung Wang Date: Tue, 21 Apr 2020 11:00:07 +0800 Subject: mfd: mt6397: Modify suspend/resume behavior Some PMICs do not need to back up their interrupt settings across suspend, so switch from dev_pm_ops to a PM notifier and perform the save/restore only for the PMICs that need it. Signed-off-by: Hsin-Hsiung Wang Signed-off-by: Lee Jones --- drivers/mfd/mt6397-core.c | 30 ------------------------------ drivers/mfd/mt6397-irq.c | 35 ++++++++++++++++++++++++++++++++++- include/linux/mfd/mt6397/core.h | 2 ++ 3 files changed, 36 insertions(+), 31 deletions(-) (limited to 'include/linux')
diff --git a/drivers/mfd/mt6397-core.c b/drivers/mfd/mt6397-core.c index 0437c858d115..d2e70d834c40 100644 --- a/drivers/mfd/mt6397-core.c +++ b/drivers/mfd/mt6397-core.c @@ -100,35 +100,6 @@ static const struct mfd_cell mt6397_devs[] = { } }; -#ifdef CONFIG_PM_SLEEP -static int mt6397_irq_suspend(struct device *dev) -{ - struct mt6397_chip *chip = dev_get_drvdata(dev); - - regmap_write(chip->regmap, chip->int_con[0], chip->wake_mask[0]); - regmap_write(chip->regmap, chip->int_con[1], chip->wake_mask[1]); - - enable_irq_wake(chip->irq); - - return 0; -} - -static int mt6397_irq_resume(struct device *dev) -{ - struct mt6397_chip *chip = dev_get_drvdata(dev); - - regmap_write(chip->regmap, chip->int_con[0], chip->irq_masks_cur[0]); - regmap_write(chip->regmap, chip->int_con[1], chip->irq_masks_cur[1]); - - disable_irq_wake(chip->irq); - - return 0; -} -#endif - -static SIMPLE_DEV_PM_OPS(mt6397_pm_ops, mt6397_irq_suspend, - mt6397_irq_resume); - struct chip_data { u32 cid_addr; u32 cid_shift; @@ -238,7 +209,6 @@ static struct platform_driver mt6397_driver = { .driver = { .name = "mt6397", .of_match_table = of_match_ptr(mt6397_of_match), - .pm = &mt6397_pm_ops, }, .id_table = mt6397_id, };
diff --git a/drivers/mfd/mt6397-irq.c b/drivers/mfd/mt6397-irq.c index b2d3ce1f3115..2924919da991 100644 --- a/drivers/mfd/mt6397-irq.c +++ b/drivers/mfd/mt6397-irq.c @@ -9,6 +9,7 @@ #include #include #include +#include #include #include #include @@ -81,7 +82,7 @@ static struct irq_chip mt6397_irq_chip = { static void mt6397_irq_handle_reg(struct mt6397_chip *mt6397, int reg, int irqbase) { - unsigned int status; + unsigned int status = 0; int i, irq, ret; ret = regmap_read(mt6397->regmap, reg, &status); @@ -128,6 +129,36 @@ static const struct irq_domain_ops mt6397_irq_domain_ops = { .map = mt6397_irq_domain_map, }; +static int mt6397_irq_pm_notifier(struct notifier_block *notifier, + unsigned long pm_event, void *unused) +{ + struct mt6397_chip *chip = + container_of(notifier, struct mt6397_chip, pm_nb); + + switch (pm_event) { + case PM_SUSPEND_PREPARE: + regmap_write(chip->regmap, + chip->int_con[0], chip->wake_mask[0]); + regmap_write(chip->regmap, + chip->int_con[1], chip->wake_mask[1]); + enable_irq_wake(chip->irq); + break; + + case PM_POST_SUSPEND: + regmap_write(chip->regmap, + chip->int_con[0], chip->irq_masks_cur[0]); + regmap_write(chip->regmap, + chip->int_con[1], chip->irq_masks_cur[1]); + disable_irq_wake(chip->irq); + break; + +
default: + break; + } + + return NOTIFY_DONE; +} + int mt6397_irq_init(struct mt6397_chip *chip) { int ret; @@ -159,6 +190,7 @@ int mt6397_irq_init(struct mt6397_chip *chip) regmap_write(chip->regmap, chip->int_con[0], 0x0); regmap_write(chip->regmap, chip->int_con[1], 0x0); + chip->pm_nb.notifier_call = mt6397_irq_pm_notifier; chip->irq_domain = irq_domain_add_linear(chip->dev->of_node, MT6397_IRQ_NR, &mt6397_irq_domain_ops, @@ -177,5 +209,6 @@ int mt6397_irq_init(struct mt6397_chip *chip) return ret; } + register_pm_notifier(&chip->pm_nb); return 0; }
diff --git a/include/linux/mfd/mt6397/core.h b/include/linux/mfd/mt6397/core.h index fc88d315bdde..b81d33325ade 100644 --- a/include/linux/mfd/mt6397/core.h +++ b/include/linux/mfd/mt6397/core.h @@ -8,6 +8,7 @@ #define __MFD_MT6397_CORE_H__ #include +#include enum chip_id { MT6323_CHIP_ID = 0x23, @@ -54,6 +55,7 @@ enum mt6397_irq_numbers { struct mt6397_chip { struct device *dev; struct regmap *regmap; + struct notifier_block pm_nb; int irq; struct irq_domain *irq_domain; struct mutex irqlock; -- cgit v1.2.3
From 2b91c28f2abd9471aac4aa6cd8de7896ef469153 Mon Sep 17 00:00:00 2001 From: Hsin-Hsiung Wang Date: Tue, 21 Apr 2020 11:00:10 +0800 Subject: mfd: Add support for the MediaTek MT6358 PMIC This adds support for the MediaTek MT6358 PMIC. This is a multifunction device with the following sub-modules: - Regulator - RTC - Codec - Interrupt It is interfaced to the host controller over SPI by a proprietary hardware block called the PMIC wrapper, or pwrap. The MT6358 MFD is a child device of the pwrap. Signed-off-by: Hsin-Hsiung Wang Signed-off-by: Lee Jones --- drivers/mfd/Makefile | 2 +- drivers/mfd/mt6358-irq.c | 235 +++++++++++++++++++++++++++++ drivers/mfd/mt6397-core.c | 36 +++++ include/linux/mfd/mt6358/core.h | 158 ++++++++++++++++++++ include/linux/mfd/mt6358/registers.h | 282 +++++++++++++++++++++++++++++++++++ include/linux/mfd/mt6397/core.h | 3 + 6 files changed, 715 insertions(+), 1 deletion(-) create mode 100644 drivers/mfd/mt6358-irq.c create mode 100644 include/linux/mfd/mt6358/core.h create mode 100644 include/linux/mfd/mt6358/registers.h (limited to 'include/linux')
diff --git a/drivers/mfd/Makefile b/drivers/mfd/Makefile index f935d10cbf0f..947032b63c64 100644 --- a/drivers/mfd/Makefile +++ b/drivers/mfd/Makefile @@ -239,7 +239,7 @@ obj-$(CONFIG_INTEL_SOC_PMIC) += intel-soc-pmic.o obj-$(CONFIG_INTEL_SOC_PMIC_BXTWC) += intel_soc_pmic_bxtwc.o obj-$(CONFIG_INTEL_SOC_PMIC_CHTWC) += intel_soc_pmic_chtwc.o obj-$(CONFIG_INTEL_SOC_PMIC_CHTDC_TI) += intel_soc_pmic_chtdc_ti.o -mt6397-objs := mt6397-core.o mt6397-irq.o +mt6397-objs := mt6397-core.o mt6397-irq.o mt6358-irq.o obj-$(CONFIG_MFD_MT6397) += mt6397.o obj-$(CONFIG_INTEL_SOC_PMIC_MRFLD) += intel_soc_pmic_mrfld.o
diff --git a/drivers/mfd/mt6358-irq.c b/drivers/mfd/mt6358-irq.c new file mode 100644 index 000000000000..db734f2831ff --- /dev/null +++ b/drivers/mfd/mt6358-irq.c @@ -0,0 +1,235 @@ +// SPDX-License-Identifier: GPL-2.0 +// +// Copyright (c) 2020 MediaTek Inc.
+ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +static struct irq_top_t mt6358_ints[] = { + MT6358_TOP_GEN(BUCK), + MT6358_TOP_GEN(LDO), + MT6358_TOP_GEN(PSC), + MT6358_TOP_GEN(SCK), + MT6358_TOP_GEN(BM), + MT6358_TOP_GEN(HK), + MT6358_TOP_GEN(AUD), + MT6358_TOP_GEN(MISC), +}; + +static void pmic_irq_enable(struct irq_data *data) +{ + unsigned int hwirq = irqd_to_hwirq(data); + struct mt6397_chip *chip = irq_data_get_irq_chip_data(data); + struct pmic_irq_data *irqd = chip->irq_data; + + irqd->enable_hwirq[hwirq] = true; +} + +static void pmic_irq_disable(struct irq_data *data) +{ + unsigned int hwirq = irqd_to_hwirq(data); + struct mt6397_chip *chip = irq_data_get_irq_chip_data(data); + struct pmic_irq_data *irqd = chip->irq_data; + + irqd->enable_hwirq[hwirq] = false; +} + +static void pmic_irq_lock(struct irq_data *data) +{ + struct mt6397_chip *chip = irq_data_get_irq_chip_data(data); + + mutex_lock(&chip->irqlock); +} + +static void pmic_irq_sync_unlock(struct irq_data *data) +{ + unsigned int i, top_gp, gp_offset, en_reg, int_regs, shift; + struct mt6397_chip *chip = irq_data_get_irq_chip_data(data); + struct pmic_irq_data *irqd = chip->irq_data; + + for (i = 0; i < irqd->num_pmic_irqs; i++) { + if (irqd->enable_hwirq[i] == irqd->cache_hwirq[i]) + continue; + + /* Find out the IRQ group */ + top_gp = 0; + while ((top_gp + 1) < irqd->num_top && + i >= mt6358_ints[top_gp + 1].hwirq_base) + top_gp++; + + /* Find the IRQ registers */ + gp_offset = i - mt6358_ints[top_gp].hwirq_base; + int_regs = gp_offset / MT6358_REG_WIDTH; + shift = gp_offset % MT6358_REG_WIDTH; + en_reg = mt6358_ints[top_gp].en_reg + + (mt6358_ints[top_gp].en_reg_shift * int_regs); + + regmap_update_bits(chip->regmap, en_reg, BIT(shift), + irqd->enable_hwirq[i] << shift); + + irqd->cache_hwirq[i] = irqd->enable_hwirq[i]; + } + mutex_unlock(&chip->irqlock); +} + +static struct irq_chip mt6358_irq_chip = { + .name = "mt6358-irq", + .flags = IRQCHIP_SKIP_SET_WAKE, + .irq_enable = pmic_irq_enable, + .irq_disable = pmic_irq_disable, + .irq_bus_lock = pmic_irq_lock, + .irq_bus_sync_unlock = pmic_irq_sync_unlock, +}; + +static void mt6358_irq_sp_handler(struct mt6397_chip *chip, + unsigned int top_gp) +{ + unsigned int irq_status, sta_reg, status; + unsigned int hwirq, virq; + int i, j, ret; + + for (i = 0; i < mt6358_ints[top_gp].num_int_regs; i++) { + sta_reg = mt6358_ints[top_gp].sta_reg + + mt6358_ints[top_gp].sta_reg_shift * i; + + ret = regmap_read(chip->regmap, sta_reg, &irq_status); + if (ret) { + dev_err(chip->dev, + "Failed to read IRQ status, ret=%d\n", ret); + return; + } + + if (!irq_status) + continue; + + status = irq_status; + do { + j = __ffs(status); + + hwirq = mt6358_ints[top_gp].hwirq_base + + MT6358_REG_WIDTH * i + j; + + virq = irq_find_mapping(chip->irq_domain, hwirq); + if (virq) + handle_nested_irq(virq); + + status &= ~BIT(j); + } while (status); + + regmap_write(chip->regmap, sta_reg, irq_status); + } +} + +static irqreturn_t mt6358_irq_handler(int irq, void *data) +{ + struct mt6397_chip *chip = data; + struct pmic_irq_data *mt6358_irq_data = chip->irq_data; + unsigned int bit, i, top_irq_status = 0; + int ret; + + ret = regmap_read(chip->regmap, + mt6358_irq_data->top_int_status_reg, + &top_irq_status); + if (ret) { + dev_err(chip->dev, + "Failed to read status from the device, ret=%d\n", ret); + return IRQ_NONE; + } + + for (i = 0; i < mt6358_irq_data->num_top; i++) { + bit = BIT(mt6358_ints[i].top_offset); + if (top_irq_status & bit) { + 
mt6358_irq_sp_handler(chip, i); + top_irq_status &= ~bit; + if (!top_irq_status) + break; + } + } + + return IRQ_HANDLED; +} + +static int pmic_irq_domain_map(struct irq_domain *d, unsigned int irq, + irq_hw_number_t hw) +{ + struct mt6397_chip *mt6397 = d->host_data; + + irq_set_chip_data(irq, mt6397); + irq_set_chip_and_handler(irq, &mt6358_irq_chip, handle_level_irq); + irq_set_nested_thread(irq, 1); + irq_set_noprobe(irq); + + return 0; +} + +static const struct irq_domain_ops mt6358_irq_domain_ops = { + .map = pmic_irq_domain_map, + .xlate = irq_domain_xlate_twocell, +}; + +int mt6358_irq_init(struct mt6397_chip *chip) +{ + int i, j, ret; + struct pmic_irq_data *irqd; + + irqd = devm_kzalloc(chip->dev, sizeof(*irqd), GFP_KERNEL); + if (!irqd) + return -ENOMEM; + + chip->irq_data = irqd; + + mutex_init(&chip->irqlock); + irqd->top_int_status_reg = MT6358_TOP_INT_STATUS0; + irqd->num_pmic_irqs = MT6358_IRQ_NR; + irqd->num_top = ARRAY_SIZE(mt6358_ints); + + irqd->enable_hwirq = devm_kcalloc(chip->dev, + irqd->num_pmic_irqs, + sizeof(*irqd->enable_hwirq), + GFP_KERNEL); + if (!irqd->enable_hwirq) + return -ENOMEM; + + irqd->cache_hwirq = devm_kcalloc(chip->dev, + irqd->num_pmic_irqs, + sizeof(*irqd->cache_hwirq), + GFP_KERNEL); + if (!irqd->cache_hwirq) + return -ENOMEM; + + /* Disable all interrupts for initializing */ + for (i = 0; i < irqd->num_top; i++) { + for (j = 0; j < mt6358_ints[i].num_int_regs; j++) + regmap_write(chip->regmap, + mt6358_ints[i].en_reg + + mt6358_ints[i].en_reg_shift * j, 0); + } + + chip->irq_domain = irq_domain_add_linear(chip->dev->of_node, + irqd->num_pmic_irqs, + &mt6358_irq_domain_ops, chip); + if (!chip->irq_domain) { + dev_err(chip->dev, "Could not create IRQ domain\n"); + return -ENODEV; + } + + ret = devm_request_threaded_irq(chip->dev, chip->irq, NULL, + mt6358_irq_handler, IRQF_ONESHOT, + mt6358_irq_chip.name, chip); + if (ret) { + dev_err(chip->dev, "Failed to register IRQ=%d, ret=%d\n", + chip->irq, ret); + return ret; + } + + enable_irq_wake(chip->irq); + return ret; +} diff --git a/drivers/mfd/mt6397-core.c b/drivers/mfd/mt6397-core.c index a313a720bb8b..f6cd8a660602 100644 --- a/drivers/mfd/mt6397-core.c +++ b/drivers/mfd/mt6397-core.c @@ -12,13 +12,18 @@ #include #include #include +#include #include #include +#include #include #define MT6323_RTC_BASE 0x8000 #define MT6323_RTC_SIZE 0x40 +#define MT6358_RTC_BASE 0x0588 +#define MT6358_RTC_SIZE 0x3c + #define MT6397_RTC_BASE 0xe000 #define MT6397_RTC_SIZE 0x3e @@ -30,6 +35,11 @@ static const struct resource mt6323_rtc_resources[] = { DEFINE_RES_IRQ(MT6323_IRQ_STATUS_RTC), }; +static const struct resource mt6358_rtc_resources[] = { + DEFINE_RES_MEM(MT6358_RTC_BASE, MT6358_RTC_SIZE), + DEFINE_RES_IRQ(MT6358_IRQ_RTC), +}; + static const struct resource mt6397_rtc_resources[] = { DEFINE_RES_MEM(MT6397_RTC_BASE, MT6397_RTC_SIZE), DEFINE_RES_IRQ(MT6397_IRQ_RTC), @@ -74,6 +84,21 @@ static const struct mfd_cell mt6323_devs[] = { }, }; +static const struct mfd_cell mt6358_devs[] = { + { + .name = "mt6358-regulator", + .of_compatible = "mediatek,mt6358-regulator" + }, { + .name = "mt6358-rtc", + .num_resources = ARRAY_SIZE(mt6358_rtc_resources), + .resources = mt6358_rtc_resources, + .of_compatible = "mediatek,mt6358-rtc", + }, { + .name = "mt6358-sound", + .of_compatible = "mediatek,mt6358-sound" + }, +}; + static const struct mfd_cell mt6397_devs[] = { { .name = "mt6397-rtc", @@ -116,6 +141,14 @@ static const struct chip_data mt6323_core = { .irq_init = mt6397_irq_init, }; +static const struct chip_data 
mt6358_core = { + .cid_addr = MT6358_SWCID, + .cid_shift = 8, + .cells = mt6358_devs, + .cell_size = ARRAY_SIZE(mt6358_devs), + .irq_init = mt6358_irq_init, +}; + static const struct chip_data mt6397_core = { .cid_addr = MT6397_CID, .cid_shift = 0, @@ -182,6 +215,9 @@ static const struct of_device_id mt6397_of_match[] = { { .compatible = "mediatek,mt6323", .data = &mt6323_core, + }, { + .compatible = "mediatek,mt6358", + .data = &mt6358_core, }, { .compatible = "mediatek,mt6397", .data = &mt6397_core, diff --git a/include/linux/mfd/mt6358/core.h b/include/linux/mfd/mt6358/core.h new file mode 100644 index 000000000000..c5a11b7458d4 --- /dev/null +++ b/include/linux/mfd/mt6358/core.h @@ -0,0 +1,158 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Copyright (c) 2020 MediaTek Inc. + */ + +#ifndef __MFD_MT6358_CORE_H__ +#define __MFD_MT6358_CORE_H__ + +#define MT6358_REG_WIDTH 16 + +struct irq_top_t { + int hwirq_base; + unsigned int num_int_regs; + unsigned int num_int_bits; + unsigned int en_reg; + unsigned int en_reg_shift; + unsigned int sta_reg; + unsigned int sta_reg_shift; + unsigned int top_offset; +}; + +struct pmic_irq_data { + unsigned int num_top; + unsigned int num_pmic_irqs; + unsigned short top_int_status_reg; + bool *enable_hwirq; + bool *cache_hwirq; +}; + +enum mt6358_irq_top_status_shift { + MT6358_BUCK_TOP = 0, + MT6358_LDO_TOP, + MT6358_PSC_TOP, + MT6358_SCK_TOP, + MT6358_BM_TOP, + MT6358_HK_TOP, + MT6358_AUD_TOP, + MT6358_MISC_TOP, +}; + +enum mt6358_irq_numbers { + MT6358_IRQ_VPROC11_OC = 0, + MT6358_IRQ_VPROC12_OC, + MT6358_IRQ_VCORE_OC, + MT6358_IRQ_VGPU_OC, + MT6358_IRQ_VMODEM_OC, + MT6358_IRQ_VDRAM1_OC, + MT6358_IRQ_VS1_OC, + MT6358_IRQ_VS2_OC, + MT6358_IRQ_VPA_OC, + MT6358_IRQ_VCORE_PREOC, + MT6358_IRQ_VFE28_OC = 16, + MT6358_IRQ_VXO22_OC, + MT6358_IRQ_VRF18_OC, + MT6358_IRQ_VRF12_OC, + MT6358_IRQ_VEFUSE_OC, + MT6358_IRQ_VCN33_OC, + MT6358_IRQ_VCN28_OC, + MT6358_IRQ_VCN18_OC, + MT6358_IRQ_VCAMA1_OC, + MT6358_IRQ_VCAMA2_OC, + MT6358_IRQ_VCAMD_OC, + MT6358_IRQ_VCAMIO_OC, + MT6358_IRQ_VLDO28_OC, + MT6358_IRQ_VA12_OC, + MT6358_IRQ_VAUX18_OC, + MT6358_IRQ_VAUD28_OC, + MT6358_IRQ_VIO28_OC, + MT6358_IRQ_VIO18_OC, + MT6358_IRQ_VSRAM_PROC11_OC, + MT6358_IRQ_VSRAM_PROC12_OC, + MT6358_IRQ_VSRAM_OTHERS_OC, + MT6358_IRQ_VSRAM_GPU_OC, + MT6358_IRQ_VDRAM2_OC, + MT6358_IRQ_VMC_OC, + MT6358_IRQ_VMCH_OC, + MT6358_IRQ_VEMC_OC, + MT6358_IRQ_VSIM1_OC, + MT6358_IRQ_VSIM2_OC, + MT6358_IRQ_VIBR_OC, + MT6358_IRQ_VUSB_OC, + MT6358_IRQ_VBIF28_OC, + MT6358_IRQ_PWRKEY = 48, + MT6358_IRQ_HOMEKEY, + MT6358_IRQ_PWRKEY_R, + MT6358_IRQ_HOMEKEY_R, + MT6358_IRQ_NI_LBAT_INT, + MT6358_IRQ_CHRDET, + MT6358_IRQ_CHRDET_EDGE, + MT6358_IRQ_VCDT_HV_DET, + MT6358_IRQ_RTC = 64, + MT6358_IRQ_FG_BAT0_H = 80, + MT6358_IRQ_FG_BAT0_L, + MT6358_IRQ_FG_CUR_H, + MT6358_IRQ_FG_CUR_L, + MT6358_IRQ_FG_ZCV, + MT6358_IRQ_FG_BAT1_H, + MT6358_IRQ_FG_BAT1_L, + MT6358_IRQ_FG_N_CHARGE_L, + MT6358_IRQ_FG_IAVG_H, + MT6358_IRQ_FG_IAVG_L, + MT6358_IRQ_FG_TIME_H, + MT6358_IRQ_FG_DISCHARGE, + MT6358_IRQ_FG_CHARGE, + MT6358_IRQ_BATON_LV = 96, + MT6358_IRQ_BATON_HT, + MT6358_IRQ_BATON_BAT_IN, + MT6358_IRQ_BATON_BAT_OUT, + MT6358_IRQ_BIF, + MT6358_IRQ_BAT_H = 112, + MT6358_IRQ_BAT_L, + MT6358_IRQ_BAT2_H, + MT6358_IRQ_BAT2_L, + MT6358_IRQ_BAT_TEMP_H, + MT6358_IRQ_BAT_TEMP_L, + MT6358_IRQ_AUXADC_IMP, + MT6358_IRQ_NAG_C_DLTV, + MT6358_IRQ_AUDIO = 128, + MT6358_IRQ_ACCDET = 133, + MT6358_IRQ_ACCDET_EINT0, + MT6358_IRQ_ACCDET_EINT1, + MT6358_IRQ_SPI_CMD_ALERT = 144, + MT6358_IRQ_NR, +}; + +#define MT6358_IRQ_BUCK_BASE MT6358_IRQ_VPROC11_OC 
+#define MT6358_IRQ_LDO_BASE MT6358_IRQ_VFE28_OC +#define MT6358_IRQ_PSC_BASE MT6358_IRQ_PWRKEY +#define MT6358_IRQ_SCK_BASE MT6358_IRQ_RTC +#define MT6358_IRQ_BM_BASE MT6358_IRQ_FG_BAT0_H +#define MT6358_IRQ_HK_BASE MT6358_IRQ_BAT_H +#define MT6358_IRQ_AUD_BASE MT6358_IRQ_AUDIO +#define MT6358_IRQ_MISC_BASE MT6358_IRQ_SPI_CMD_ALERT + +#define MT6358_IRQ_BUCK_BITS (MT6358_IRQ_VCORE_PREOC - MT6358_IRQ_BUCK_BASE + 1) +#define MT6358_IRQ_LDO_BITS (MT6358_IRQ_VBIF28_OC - MT6358_IRQ_LDO_BASE + 1) +#define MT6358_IRQ_PSC_BITS (MT6358_IRQ_VCDT_HV_DET - MT6358_IRQ_PSC_BASE + 1) +#define MT6358_IRQ_SCK_BITS (MT6358_IRQ_RTC - MT6358_IRQ_SCK_BASE + 1) +#define MT6358_IRQ_BM_BITS (MT6358_IRQ_BIF - MT6358_IRQ_BM_BASE + 1) +#define MT6358_IRQ_HK_BITS (MT6358_IRQ_NAG_C_DLTV - MT6358_IRQ_HK_BASE + 1) +#define MT6358_IRQ_AUD_BITS (MT6358_IRQ_ACCDET_EINT1 - MT6358_IRQ_AUD_BASE + 1) +#define MT6358_IRQ_MISC_BITS \ + (MT6358_IRQ_SPI_CMD_ALERT - MT6358_IRQ_MISC_BASE + 1) + +#define MT6358_TOP_GEN(sp) \ +{ \ + .hwirq_base = MT6358_IRQ_##sp##_BASE, \ + .num_int_regs = \ + ((MT6358_IRQ_##sp##_BITS - 1) / MT6358_REG_WIDTH) + 1, \ + .num_int_bits = MT6358_IRQ_##sp##_BITS, \ + .en_reg = MT6358_##sp##_TOP_INT_CON0, \ + .en_reg_shift = 0x6, \ + .sta_reg = MT6358_##sp##_TOP_INT_STATUS0, \ + .sta_reg_shift = 0x2, \ + .top_offset = MT6358_##sp##_TOP, \ +} + +#endif /* __MFD_MT6358_CORE_H__ */ diff --git a/include/linux/mfd/mt6358/registers.h b/include/linux/mfd/mt6358/registers.h new file mode 100644 index 000000000000..2ad0b312aa28 --- /dev/null +++ b/include/linux/mfd/mt6358/registers.h @@ -0,0 +1,282 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Copyright (c) 2020 MediaTek Inc. + */ + +#ifndef __MFD_MT6358_REGISTERS_H__ +#define __MFD_MT6358_REGISTERS_H__ + +/* PMIC Registers */ +#define MT6358_SWCID 0xa +#define MT6358_MISC_TOP_INT_CON0 0x188 +#define MT6358_MISC_TOP_INT_STATUS0 0x194 +#define MT6358_TOP_INT_STATUS0 0x19e +#define MT6358_SCK_TOP_INT_CON0 0x52e +#define MT6358_SCK_TOP_INT_STATUS0 0x53a +#define MT6358_EOSC_CALI_CON0 0x540 +#define MT6358_EOSC_CALI_CON1 0x542 +#define MT6358_RTC_MIX_CON0 0x544 +#define MT6358_RTC_MIX_CON1 0x546 +#define MT6358_RTC_MIX_CON2 0x548 +#define MT6358_RTC_DSN_ID 0x580 +#define MT6358_RTC_DSN_REV0 0x582 +#define MT6358_RTC_DBI 0x584 +#define MT6358_RTC_DXI 0x586 +#define MT6358_RTC_BBPU 0x588 +#define MT6358_RTC_IRQ_STA 0x58a +#define MT6358_RTC_IRQ_EN 0x58c +#define MT6358_RTC_CII_EN 0x58e +#define MT6358_RTC_AL_MASK 0x590 +#define MT6358_RTC_TC_SEC 0x592 +#define MT6358_RTC_TC_MIN 0x594 +#define MT6358_RTC_TC_HOU 0x596 +#define MT6358_RTC_TC_DOM 0x598 +#define MT6358_RTC_TC_DOW 0x59a +#define MT6358_RTC_TC_MTH 0x59c +#define MT6358_RTC_TC_YEA 0x59e +#define MT6358_RTC_AL_SEC 0x5a0 +#define MT6358_RTC_AL_MIN 0x5a2 +#define MT6358_RTC_AL_HOU 0x5a4 +#define MT6358_RTC_AL_DOM 0x5a6 +#define MT6358_RTC_AL_DOW 0x5a8 +#define MT6358_RTC_AL_MTH 0x5aa +#define MT6358_RTC_AL_YEA 0x5ac +#define MT6358_RTC_OSC32CON 0x5ae +#define MT6358_RTC_POWERKEY1 0x5b0 +#define MT6358_RTC_POWERKEY2 0x5b2 +#define MT6358_RTC_PDN1 0x5b4 +#define MT6358_RTC_PDN2 0x5b6 +#define MT6358_RTC_SPAR0 0x5b8 +#define MT6358_RTC_SPAR1 0x5ba +#define MT6358_RTC_PROT 0x5bc +#define MT6358_RTC_DIFF 0x5be +#define MT6358_RTC_CALI 0x5c0 +#define MT6358_RTC_WRTGR 0x5c2 +#define MT6358_RTC_CON 0x5c4 +#define MT6358_RTC_SEC_CTRL 0x5c6 +#define MT6358_RTC_INT_CNT 0x5c8 +#define MT6358_RTC_SEC_DAT0 0x5ca +#define MT6358_RTC_SEC_DAT1 0x5cc +#define MT6358_RTC_SEC_DAT2 0x5ce +#define MT6358_RTC_SEC_DSN_ID 0x600 
+#define MT6358_RTC_SEC_DSN_REV0 0x602 +#define MT6358_RTC_SEC_DBI 0x604 +#define MT6358_RTC_SEC_DXI 0x606 +#define MT6358_RTC_TC_SEC_SEC 0x608 +#define MT6358_RTC_TC_MIN_SEC 0x60a +#define MT6358_RTC_TC_HOU_SEC 0x60c +#define MT6358_RTC_TC_DOM_SEC 0x60e +#define MT6358_RTC_TC_DOW_SEC 0x610 +#define MT6358_RTC_TC_MTH_SEC 0x612 +#define MT6358_RTC_TC_YEA_SEC 0x614 +#define MT6358_RTC_SEC_CK_PDN 0x616 +#define MT6358_RTC_SEC_WRTGR 0x618 +#define MT6358_PSC_TOP_INT_CON0 0x910 +#define MT6358_PSC_TOP_INT_STATUS0 0x91c +#define MT6358_BM_TOP_INT_CON0 0xc32 +#define MT6358_BM_TOP_INT_CON1 0xc38 +#define MT6358_BM_TOP_INT_STATUS0 0xc4a +#define MT6358_BM_TOP_INT_STATUS1 0xc4c +#define MT6358_HK_TOP_INT_CON0 0xf92 +#define MT6358_HK_TOP_INT_STATUS0 0xf9e +#define MT6358_BUCK_TOP_INT_CON0 0x1318 +#define MT6358_BUCK_TOP_INT_STATUS0 0x1324 +#define MT6358_BUCK_VPROC11_CON0 0x1388 +#define MT6358_BUCK_VPROC11_DBG0 0x139e +#define MT6358_BUCK_VPROC11_DBG1 0x13a0 +#define MT6358_BUCK_VPROC11_ELR0 0x13a6 +#define MT6358_BUCK_VPROC12_CON0 0x1408 +#define MT6358_BUCK_VPROC12_DBG0 0x141e +#define MT6358_BUCK_VPROC12_DBG1 0x1420 +#define MT6358_BUCK_VPROC12_ELR0 0x1426 +#define MT6358_BUCK_VCORE_CON0 0x1488 +#define MT6358_BUCK_VCORE_DBG0 0x149e +#define MT6358_BUCK_VCORE_DBG1 0x14a0 +#define MT6358_BUCK_VCORE_ELR0 0x14aa +#define MT6358_BUCK_VGPU_CON0 0x1508 +#define MT6358_BUCK_VGPU_DBG0 0x151e +#define MT6358_BUCK_VGPU_DBG1 0x1520 +#define MT6358_BUCK_VGPU_ELR0 0x1526 +#define MT6358_BUCK_VMODEM_CON0 0x1588 +#define MT6358_BUCK_VMODEM_DBG0 0x159e +#define MT6358_BUCK_VMODEM_DBG1 0x15a0 +#define MT6358_BUCK_VMODEM_ELR0 0x15a6 +#define MT6358_BUCK_VDRAM1_CON0 0x1608 +#define MT6358_BUCK_VDRAM1_DBG0 0x161e +#define MT6358_BUCK_VDRAM1_DBG1 0x1620 +#define MT6358_BUCK_VDRAM1_ELR0 0x1626 +#define MT6358_BUCK_VS1_CON0 0x1688 +#define MT6358_BUCK_VS1_DBG0 0x169e +#define MT6358_BUCK_VS1_DBG1 0x16a0 +#define MT6358_BUCK_VS1_ELR0 0x16ae +#define MT6358_BUCK_VS2_CON0 0x1708 +#define MT6358_BUCK_VS2_DBG0 0x171e +#define MT6358_BUCK_VS2_DBG1 0x1720 +#define MT6358_BUCK_VS2_ELR0 0x172e +#define MT6358_BUCK_VPA_CON0 0x1788 +#define MT6358_BUCK_VPA_CON1 0x178a +#define MT6358_BUCK_VPA_ELR0 MT6358_BUCK_VPA_CON1 +#define MT6358_BUCK_VPA_DBG0 0x1792 +#define MT6358_BUCK_VPA_DBG1 0x1794 +#define MT6358_VPROC_ANA_CON0 0x180c +#define MT6358_VCORE_VGPU_ANA_CON0 0x1828 +#define MT6358_VMODEM_ANA_CON0 0x1888 +#define MT6358_VDRAM1_ANA_CON0 0x1896 +#define MT6358_VS1_ANA_CON0 0x18a2 +#define MT6358_VS2_ANA_CON0 0x18ae +#define MT6358_VPA_ANA_CON0 0x18ba +#define MT6358_LDO_TOP_INT_CON0 0x1a50 +#define MT6358_LDO_TOP_INT_CON1 0x1a56 +#define MT6358_LDO_TOP_INT_STATUS0 0x1a68 +#define MT6358_LDO_TOP_INT_STATUS1 0x1a6a +#define MT6358_LDO_VXO22_CON0 0x1a88 +#define MT6358_LDO_VXO22_CON1 0x1a96 +#define MT6358_LDO_VA12_CON0 0x1a9c +#define MT6358_LDO_VA12_CON1 0x1aaa +#define MT6358_LDO_VAUX18_CON0 0x1ab0 +#define MT6358_LDO_VAUX18_CON1 0x1abe +#define MT6358_LDO_VAUD28_CON0 0x1ac4 +#define MT6358_LDO_VAUD28_CON1 0x1ad2 +#define MT6358_LDO_VIO28_CON0 0x1ad8 +#define MT6358_LDO_VIO28_CON1 0x1ae6 +#define MT6358_LDO_VIO18_CON0 0x1aec +#define MT6358_LDO_VIO18_CON1 0x1afa +#define MT6358_LDO_VDRAM2_CON0 0x1b08 +#define MT6358_LDO_VDRAM2_CON1 0x1b16 +#define MT6358_LDO_VEMC_CON0 0x1b1c +#define MT6358_LDO_VEMC_CON1 0x1b2a +#define MT6358_LDO_VUSB_CON0_0 0x1b30 +#define MT6358_LDO_VUSB_CON1 0x1b40 +#define MT6358_LDO_VSRAM_PROC11_CON0 0x1b46 +#define MT6358_LDO_VSRAM_PROC11_DBG0 0x1b60 +#define MT6358_LDO_VSRAM_PROC11_DBG1 0x1b62 
+#define MT6358_LDO_VSRAM_PROC11_TRACKING_CON0 0x1b64 +#define MT6358_LDO_VSRAM_PROC11_TRACKING_CON1 0x1b66 +#define MT6358_LDO_VSRAM_PROC11_TRACKING_CON2 0x1b68 +#define MT6358_LDO_VSRAM_PROC11_TRACKING_CON3 0x1b6a +#define MT6358_LDO_VSRAM_PROC12_TRACKING_CON0 0x1b6c +#define MT6358_LDO_VSRAM_PROC12_TRACKING_CON1 0x1b6e +#define MT6358_LDO_VSRAM_PROC12_TRACKING_CON2 0x1b70 +#define MT6358_LDO_VSRAM_PROC12_TRACKING_CON3 0x1b72 +#define MT6358_LDO_VSRAM_WAKEUP_CON0 0x1b74 +#define MT6358_LDO_GON1_ELR_NUM 0x1b76 +#define MT6358_LDO_VDRAM2_ELR0 0x1b78 +#define MT6358_LDO_VSRAM_PROC12_CON0 0x1b88 +#define MT6358_LDO_VSRAM_PROC12_DBG0 0x1ba2 +#define MT6358_LDO_VSRAM_PROC12_DBG1 0x1ba4 +#define MT6358_LDO_VSRAM_OTHERS_CON0 0x1ba6 +#define MT6358_LDO_VSRAM_OTHERS_DBG0 0x1bc0 +#define MT6358_LDO_VSRAM_OTHERS_DBG1 0x1bc2 +#define MT6358_LDO_VSRAM_GPU_CON0 0x1bc8 +#define MT6358_LDO_VSRAM_GPU_DBG0 0x1be2 +#define MT6358_LDO_VSRAM_GPU_DBG1 0x1be4 +#define MT6358_LDO_VSRAM_CON0 0x1bee +#define MT6358_LDO_VSRAM_CON1 0x1bf0 +#define MT6358_LDO_VSRAM_CON2 0x1bf2 +#define MT6358_LDO_VSRAM_CON3 0x1bf4 +#define MT6358_LDO_VFE28_CON0 0x1c08 +#define MT6358_LDO_VFE28_CON1 0x1c16 +#define MT6358_LDO_VFE28_CON2 0x1c18 +#define MT6358_LDO_VFE28_CON3 0x1c1a +#define MT6358_LDO_VRF18_CON0 0x1c1c +#define MT6358_LDO_VRF18_CON1 0x1c2a +#define MT6358_LDO_VRF18_CON2 0x1c2c +#define MT6358_LDO_VRF18_CON3 0x1c2e +#define MT6358_LDO_VRF12_CON0 0x1c30 +#define MT6358_LDO_VRF12_CON1 0x1c3e +#define MT6358_LDO_VRF12_CON2 0x1c40 +#define MT6358_LDO_VRF12_CON3 0x1c42 +#define MT6358_LDO_VEFUSE_CON0 0x1c44 +#define MT6358_LDO_VEFUSE_CON1 0x1c52 +#define MT6358_LDO_VEFUSE_CON2 0x1c54 +#define MT6358_LDO_VEFUSE_CON3 0x1c56 +#define MT6358_LDO_VCN18_CON0 0x1c58 +#define MT6358_LDO_VCN18_CON1 0x1c66 +#define MT6358_LDO_VCN18_CON2 0x1c68 +#define MT6358_LDO_VCN18_CON3 0x1c6a +#define MT6358_LDO_VCAMA1_CON0 0x1c6c +#define MT6358_LDO_VCAMA1_CON1 0x1c7a +#define MT6358_LDO_VCAMA1_CON2 0x1c7c +#define MT6358_LDO_VCAMA1_CON3 0x1c7e +#define MT6358_LDO_VCAMA2_CON0 0x1c88 +#define MT6358_LDO_VCAMA2_CON1 0x1c96 +#define MT6358_LDO_VCAMA2_CON2 0x1c98 +#define MT6358_LDO_VCAMA2_CON3 0x1c9a +#define MT6358_LDO_VCAMD_CON0 0x1c9c +#define MT6358_LDO_VCAMD_CON1 0x1caa +#define MT6358_LDO_VCAMD_CON2 0x1cac +#define MT6358_LDO_VCAMD_CON3 0x1cae +#define MT6358_LDO_VCAMIO_CON0 0x1cb0 +#define MT6358_LDO_VCAMIO_CON1 0x1cbe +#define MT6358_LDO_VCAMIO_CON2 0x1cc0 +#define MT6358_LDO_VCAMIO_CON3 0x1cc2 +#define MT6358_LDO_VMC_CON0 0x1cc4 +#define MT6358_LDO_VMC_CON1 0x1cd2 +#define MT6358_LDO_VMC_CON2 0x1cd4 +#define MT6358_LDO_VMC_CON3 0x1cd6 +#define MT6358_LDO_VMCH_CON0 0x1cd8 +#define MT6358_LDO_VMCH_CON1 0x1ce6 +#define MT6358_LDO_VMCH_CON2 0x1ce8 +#define MT6358_LDO_VMCH_CON3 0x1cea +#define MT6358_LDO_VIBR_CON0 0x1d08 +#define MT6358_LDO_VIBR_CON1 0x1d16 +#define MT6358_LDO_VIBR_CON2 0x1d18 +#define MT6358_LDO_VIBR_CON3 0x1d1a +#define MT6358_LDO_VCN33_CON0_0 0x1d1c +#define MT6358_LDO_VCN33_CON0_1 0x1d2a +#define MT6358_LDO_VCN33_CON1 0x1d2c +#define MT6358_LDO_VCN33_BT_CON1 MT6358_LDO_VCN33_CON1 +#define MT6358_LDO_VCN33_WIFI_CON1 MT6358_LDO_VCN33_CON1 +#define MT6358_LDO_VCN33_CON2 0x1d2e +#define MT6358_LDO_VCN33_CON3 0x1d30 +#define MT6358_LDO_VLDO28_CON0_0 0x1d32 +#define MT6358_LDO_VLDO28_CON0_1 0x1d40 +#define MT6358_LDO_VLDO28_CON1 0x1d42 +#define MT6358_LDO_VLDO28_CON2 0x1d44 +#define MT6358_LDO_VLDO28_CON3 0x1d46 +#define MT6358_LDO_VSIM1_CON0 0x1d48 +#define MT6358_LDO_VSIM1_CON1 0x1d56 +#define MT6358_LDO_VSIM1_CON2 0x1d58 
+#define MT6358_LDO_VSIM1_CON3 0x1d5a +#define MT6358_LDO_VSIM2_CON0 0x1d5c +#define MT6358_LDO_VSIM2_CON1 0x1d6a +#define MT6358_LDO_VSIM2_CON2 0x1d6c +#define MT6358_LDO_VSIM2_CON3 0x1d6e +#define MT6358_LDO_VCN28_CON0 0x1d88 +#define MT6358_LDO_VCN28_CON1 0x1d96 +#define MT6358_LDO_VCN28_CON2 0x1d98 +#define MT6358_LDO_VCN28_CON3 0x1d9a +#define MT6358_VRTC28_CON0 0x1d9c +#define MT6358_LDO_VBIF28_CON0 0x1d9e +#define MT6358_LDO_VBIF28_CON1 0x1dac +#define MT6358_LDO_VBIF28_CON2 0x1dae +#define MT6358_LDO_VBIF28_CON3 0x1db0 +#define MT6358_VCAMA1_ANA_CON0 0x1e08 +#define MT6358_VCAMA2_ANA_CON0 0x1e0c +#define MT6358_VCN33_ANA_CON0 0x1e28 +#define MT6358_VSIM1_ANA_CON0 0x1e2c +#define MT6358_VSIM2_ANA_CON0 0x1e30 +#define MT6358_VUSB_ANA_CON0 0x1e34 +#define MT6358_VEMC_ANA_CON0 0x1e38 +#define MT6358_VLDO28_ANA_CON0 0x1e3c +#define MT6358_VIO28_ANA_CON0 0x1e40 +#define MT6358_VIBR_ANA_CON0 0x1e44 +#define MT6358_VMCH_ANA_CON0 0x1e48 +#define MT6358_VMC_ANA_CON0 0x1e4c +#define MT6358_VRF18_ANA_CON0 0x1e88 +#define MT6358_VCN18_ANA_CON0 0x1e8c +#define MT6358_VCAMIO_ANA_CON0 0x1e90 +#define MT6358_VIO18_ANA_CON0 0x1e94 +#define MT6358_VEFUSE_ANA_CON0 0x1e98 +#define MT6358_VRF12_ANA_CON0 0x1e9c +#define MT6358_VSRAM_PROC11_ANA_CON0 0x1ea0 +#define MT6358_VSRAM_PROC12_ANA_CON0 0x1ea4 +#define MT6358_VSRAM_OTHERS_ANA_CON0 0x1ea6 +#define MT6358_VSRAM_GPU_ANA_CON0 0x1ea8 +#define MT6358_VDRAM2_ANA_CON0 0x1eaa +#define MT6358_VCAMD_ANA_CON0 0x1eae +#define MT6358_VA12_ANA_CON0 0x1eb2 +#define MT6358_AUD_TOP_INT_CON0 0x2228 +#define MT6358_AUD_TOP_INT_STATUS0 0x2234 + +#endif /* __MFD_MT6358_REGISTERS_H__ */
diff --git a/include/linux/mfd/mt6397/core.h b/include/linux/mfd/mt6397/core.h index b81d33325ade..949268581b36 100644 --- a/include/linux/mfd/mt6397/core.h +++ b/include/linux/mfd/mt6397/core.h @@ -12,6 +12,7 @@ enum chip_id { MT6323_CHIP_ID = 0x23, + MT6358_CHIP_ID = 0x58, MT6391_CHIP_ID = 0x91, MT6397_CHIP_ID = 0x97, }; @@ -65,8 +66,10 @@ struct mt6397_chip { u16 int_con[2]; u16 int_status[2]; u16 chip_id; + void *irq_data; }; +int mt6358_irq_init(struct mt6397_chip *chip); int mt6397_irq_init(struct mt6397_chip *chip); #endif /* __MFD_MT6397_CORE_H__ */ -- cgit v1.2.3
From 29ee40091e27615530c0ba7773a2879d8266381e Mon Sep 17 00:00:00 2001 From: Ran Bi Date: Tue, 21 Apr 2020 11:00:11 +0800 Subject: rtc: mt6397: Add support for the MediaTek MT6358 RTC This adds support for the MediaTek MT6358 RTC. The driver uses compatible match data to store the chip-specific RTC_WRTGR address offset. It also replaces RTC_WRTGR with RTC_WRTGR_MT6323 in the mt6323-poweroff driver, which is only needed by ARMv7 CPUs without ATF.
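For illustration only, the compatible-data pattern this commit describes reduces to the following sketch (every identifier is taken from the patch itself; the surrounding probe and write paths are elided):

	/* Per-chip RTC data, attached to the OF compatible string. */
	struct mtk_rtc_data {
		u32 wrtgr;	/* chip-specific write-trigger register offset */
	};

	/* At probe time, fetch the variant data for the matched chip. */
	rtc->data = of_device_get_match_data(&pdev->dev);

	/* Commit an RTC update through the per-chip offset. */
	ret = regmap_write(rtc->regmap, rtc->addr_base + rtc->data->wrtgr, 1);

One driver binary thus serves MT6323/MT6397 (offset 0x3c) and MT6358 (offset 0x3a) without run-time chip-ID checks.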
Signed-off-by: Ran Bi Signed-off-by: Hsin-Hsiung Wang Reviewed-by: Nicolas Boichat Acked-by: Alexandre Belloni Acked-by: Sebastian Reichel Reviewed-by: Yingjoe Chen Signed-off-by: Lee Jones --- drivers/power/reset/mt6323-poweroff.c | 2 +- drivers/rtc/rtc-mt6397.c | 18 +++++++++++++++--- include/linux/mfd/mt6397/rtc.h | 9 ++++++++- 3 files changed, 24 insertions(+), 5 deletions(-) (limited to 'include/linux') diff --git a/drivers/power/reset/mt6323-poweroff.c b/drivers/power/reset/mt6323-poweroff.c index 1caf43d9e46d..0532803e6cbc 100644 --- a/drivers/power/reset/mt6323-poweroff.c +++ b/drivers/power/reset/mt6323-poweroff.c @@ -30,7 +30,7 @@ static void mt6323_do_pwroff(void) int ret; regmap_write(pwrc->regmap, pwrc->base + RTC_BBPU, RTC_BBPU_KEY); - regmap_write(pwrc->regmap, pwrc->base + RTC_WRTGR, 1); + regmap_write(pwrc->regmap, pwrc->base + RTC_WRTGR_MT6323, 1); ret = regmap_read_poll_timeout(pwrc->regmap, pwrc->base + RTC_BBPU, val, diff --git a/drivers/rtc/rtc-mt6397.c b/drivers/rtc/rtc-mt6397.c index cda238dfe69b..f8b1353777ba 100644 --- a/drivers/rtc/rtc-mt6397.c +++ b/drivers/rtc/rtc-mt6397.c @@ -9,6 +9,7 @@ #include #include #include +#include #include #include #include @@ -20,7 +21,7 @@ static int mtk_rtc_write_trigger(struct mt6397_rtc *rtc) int ret; u32 data; - ret = regmap_write(rtc->regmap, rtc->addr_base + RTC_WRTGR, 1); + ret = regmap_write(rtc->regmap, rtc->addr_base + rtc->data->wrtgr, 1); if (ret < 0) return ret; @@ -269,6 +270,8 @@ static int mtk_rtc_probe(struct platform_device *pdev) res = platform_get_resource(pdev, IORESOURCE_MEM, 0); rtc->addr_base = res->start; + rtc->data = of_device_get_match_data(&pdev->dev); + rtc->irq = platform_get_irq(pdev, 0); if (rtc->irq < 0) return rtc->irq; @@ -325,9 +328,18 @@ static int mt6397_rtc_resume(struct device *dev) static SIMPLE_DEV_PM_OPS(mt6397_pm_ops, mt6397_rtc_suspend, mt6397_rtc_resume); +static const struct mtk_rtc_data mt6358_rtc_data = { + .wrtgr = RTC_WRTGR_MT6358, +}; + +static const struct mtk_rtc_data mt6397_rtc_data = { + .wrtgr = RTC_WRTGR_MT6397, +}; + static const struct of_device_id mt6397_rtc_of_match[] = { - { .compatible = "mediatek,mt6323-rtc", }, - { .compatible = "mediatek,mt6397-rtc", }, + { .compatible = "mediatek,mt6323-rtc", .data = &mt6397_rtc_data }, + { .compatible = "mediatek,mt6358-rtc", .data = &mt6358_rtc_data }, + { .compatible = "mediatek,mt6397-rtc", .data = &mt6397_rtc_data }, { } }; MODULE_DEVICE_TABLE(of, mt6397_rtc_of_match); diff --git a/include/linux/mfd/mt6397/rtc.h b/include/linux/mfd/mt6397/rtc.h index 7dfb63b81373..66989a16221a 100644 --- a/include/linux/mfd/mt6397/rtc.h +++ b/include/linux/mfd/mt6397/rtc.h @@ -18,7 +18,9 @@ #define RTC_BBPU_CBUSY BIT(6) #define RTC_BBPU_KEY (0x43 << 8) -#define RTC_WRTGR 0x003c +#define RTC_WRTGR_MT6358 0x003a +#define RTC_WRTGR_MT6397 0x003c +#define RTC_WRTGR_MT6323 RTC_WRTGR_MT6397 #define RTC_IRQ_STA 0x0002 #define RTC_IRQ_STA_AL BIT(0) @@ -65,6 +67,10 @@ #define MTK_RTC_POLL_DELAY_US 10 #define MTK_RTC_POLL_TIMEOUT (jiffies_to_usecs(HZ)) +struct mtk_rtc_data { + u32 wrtgr; +}; + struct mt6397_rtc { struct device *dev; struct rtc_device *rtc_dev; @@ -74,6 +80,7 @@ struct mt6397_rtc { struct regmap *regmap; int irq; u32 addr_base; + const struct mtk_rtc_data *data; }; #endif /* _LINUX_MFD_MT6397_RTC_H_ */ -- cgit v1.2.3 From 269fd61e15d785b9e20786672765400732dde8a0 Mon Sep 17 00:00:00 2001 From: Sudeep Holla Date: Thu, 21 May 2020 12:08:36 +0100 Subject: firmware: smccc: Fix missing prototype warning for arm_smccc_version_init Commit 
f2ae97062a48 ("firmware: smccc: Refactor SMCCC specific bits into separate file") introduced the following build warning: drivers/firmware/smccc/smccc.c:14:13: warning: no previous prototype for function 'arm_smccc_version_init' [-Wmissing-prototypes] void __init arm_smccc_version_init(u32 version, enum arm_smccc_conduit conduit) ^~~~~~~~~~~~~~~~~~~~~~ Fix the same by adding the missing prototype in arm-smccc.h Reported-by: kbuild test robot Signed-off-by: Sudeep Holla Link: https://lore.kernel.org/r/20200521110836.57252-1-sudeep.holla@arm.com Signed-off-by: Will Deacon --- drivers/firmware/psci/psci.c | 1 - include/linux/arm-smccc.h | 3 +++ 2 files changed, 3 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/drivers/firmware/psci/psci.c b/drivers/firmware/psci/psci.c index 1330a698a178..92013ecc2d9e 100644 --- a/drivers/firmware/psci/psci.c +++ b/drivers/firmware/psci/psci.c @@ -79,7 +79,6 @@ static u32 psci_function_id[PSCI_FN_MAX]; static u32 psci_cpu_suspend_feature; static bool psci_system_reset2_supported; -void __init arm_smccc_version_init(u32 version, enum arm_smccc_conduit conduit); static inline bool psci_has_ext_power_state(void) { diff --git a/include/linux/arm-smccc.h b/include/linux/arm-smccc.h index c491d210e3c3..56d6a5c6e353 100644 --- a/include/linux/arm-smccc.h +++ b/include/linux/arm-smccc.h @@ -5,6 +5,7 @@ #ifndef __LINUX_ARM_SMCCC_H #define __LINUX_ARM_SMCCC_H +#include #include /* @@ -111,6 +112,8 @@ enum arm_smccc_conduit arm_smccc_1_1_get_conduit(void); */ u32 arm_smccc_get_version(void); +void __init arm_smccc_version_init(u32 version, enum arm_smccc_conduit conduit); + /** * struct arm_smccc_res - Result from SMC/HVC call * @a0-a3 result values from registers 0 to 3 -- cgit v1.2.3 From 26d7e28e38206b1b3207af1409eee2269ab36f82 Mon Sep 17 00:00:00 2001 From: Stefan Haberland Date: Tue, 19 May 2020 16:22:59 +0200 Subject: s390/dasd: remove ioctl_by_bdev calls The IBM partition parser requires device type specific information only available to the DASD driver to correctly register partitions. The current approach of using ioctl_by_bdev with a fake user space pointer is discouraged. Fix this by replacing IOCTL calls with direct in-kernel function calls. 
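For illustration only, the resulting call pattern in the partition parser looks roughly like this (identifiers as in the patch below; error handling abbreviated):

	int (*fn)(struct gendisk *disk, dasd_information2_t *info);

	/* Resolve the DASD helper at run time: no link-time dependency. */
	fn = symbol_get(dasd_biodasdinfo);
	if (!fn)
		goto out_exit;	/* DASD driver absent, so not a DASD disk */

	/* Direct in-kernel call; no fake __user pointer involved. */
	if (fn(disk, info)) {
		kfree(info);
		info = NULL;
	}
	...
	symbol_put(dasd_biodasdinfo);	/* drop the module reference */

The symbol_get()/symbol_put() pair keeps block/partitions/ibm.c working even when the DASD driver is built as a module or is absent entirely.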
Suggested-by: Christoph Hellwig Signed-off-by: Stefan Haberland Reviewed-by: Jan Hoeppner Reviewed-by: Peter Oberparleiter Reviewed-by: Christoph Hellwig Signed-off-by: Jens Axboe --- MAINTAINERS | 1 + block/partitions/ibm.c | 24 ++++++++++++++++++------ drivers/s390/block/dasd_ioctl.c | 34 ++++++++++++++++++++++++++++++++++ include/linux/dasd_mod.h | 9 +++++++++ 4 files changed, 62 insertions(+), 6 deletions(-) create mode 100644 include/linux/dasd_mod.h (limited to 'include/linux') diff --git a/MAINTAINERS b/MAINTAINERS index b816a453b10e..927fd9845d2f 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -14628,6 +14628,7 @@ S: Supported W: http://www.ibm.com/developerworks/linux/linux390/ F: block/partitions/ibm.c F: drivers/s390/block/dasd* +F: include/linux/dasd_mod.h S390 IOMMU (PCI) M: Gerald Schaefer diff --git a/block/partitions/ibm.c b/block/partitions/ibm.c index 073faa6a69b8..d6e18df9c53c 100644 --- a/block/partitions/ibm.c +++ b/block/partitions/ibm.c @@ -13,10 +13,11 @@ #include #include #include +#include +#include #include "check.h" - union label_t { struct vtoc_volume_label_cdl vol; struct vtoc_volume_label_ldl lnx; @@ -288,7 +289,9 @@ static int find_cms1_partitions(struct parsed_partitions *state, */ int ibm_partition(struct parsed_partitions *state) { + int (*fn)(struct gendisk *disk, dasd_information2_t *info); struct block_device *bdev = state->bdev; + struct gendisk *disk = bdev->bd_disk; int blocksize, res; loff_t i_size, offset, size; dasd_information2_t *info; @@ -299,24 +302,31 @@ int ibm_partition(struct parsed_partitions *state) union label_t *label; res = 0; + if (!disk->fops->getgeo) + goto out_exit; + fn = symbol_get(dasd_biodasdinfo); + if (!fn) + goto out_exit; blocksize = bdev_logical_block_size(bdev); if (blocksize <= 0) - goto out_exit; + goto out_symbol; i_size = i_size_read(bdev->bd_inode); if (i_size == 0) - goto out_exit; + goto out_symbol; info = kmalloc(sizeof(dasd_information2_t), GFP_KERNEL); if (info == NULL) - goto out_exit; + goto out_symbol; geo = kmalloc(sizeof(struct hd_geometry), GFP_KERNEL); if (geo == NULL) goto out_nogeo; label = kmalloc(sizeof(union label_t), GFP_KERNEL); if (label == NULL) goto out_nolab; - if (ioctl_by_bdev(bdev, HDIO_GETGEO, (unsigned long)geo) != 0) + /* set start if not filled by getgeo function e.g. virtblk */ + geo->start = get_start_sect(bdev); + if (disk->fops->getgeo(bdev, geo)) goto out_freeall; - if (ioctl_by_bdev(bdev, BIODASDINFO2, (unsigned long)info) != 0) { + if (fn(disk, info)) { kfree(info); info = NULL; } @@ -359,6 +369,8 @@ out_nolab: kfree(geo); out_nogeo: kfree(info); +out_symbol: + symbol_put(dasd_biodasdinfo); out_exit: return res; } diff --git a/drivers/s390/block/dasd_ioctl.c b/drivers/s390/block/dasd_ioctl.c index 9b7782395c37..777734d1b4e5 100644 --- a/drivers/s390/block/dasd_ioctl.c +++ b/drivers/s390/block/dasd_ioctl.c @@ -22,6 +22,7 @@ #include #include #include +#include /* This is ugly... */ #define PRINTK_HEADER "dasd_ioctl:" @@ -664,3 +665,36 @@ int dasd_ioctl(struct block_device *bdev, fmode_t mode, dasd_put_device(base); return rc; } + + +/** + * dasd_biodasdinfo() - fill out the dasd information structure + * @disk [in]: pointer to gendisk structure that references a DASD + * @info [out]: pointer to the dasd_information2_t structure + * + * Provide access to DASD specific information. + * The gendisk structure is checked if it belongs to the DASD driver by + * comparing the gendisk->fops pointer. + * If it does not belong to the DASD driver -EINVAL is returned. 
+ * Otherwise the provided dasd_information2_t structure is filled out. + * + * Returns: + * %0 on success and a negative error value on failure. + */ +int dasd_biodasdinfo(struct gendisk *disk, struct dasd_information2_t *info) +{ + struct dasd_device *base; + int error; + + if (disk->fops != &dasd_device_operations) + return -EINVAL; + + base = dasd_device_from_gendisk(disk); + if (!base) + return -ENODEV; + error = __dasd_ioctl_information(base->block, info); + dasd_put_device(base); + return error; +} +/* export that symbol_get in partition detection is possible */ +EXPORT_SYMBOL_GPL(dasd_biodasdinfo);
diff --git a/include/linux/dasd_mod.h b/include/linux/dasd_mod.h new file mode 100644 index 000000000000..d39abad2ff6e --- /dev/null +++ b/include/linux/dasd_mod.h @@ -0,0 +1,9 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef DASD_MOD_H +#define DASD_MOD_H + +#include + +extern int dasd_biodasdinfo(struct gendisk *disk, dasd_information2_t *info); + +#endif -- cgit v1.2.3
From 3783daeb1d24696ff00125050353cfce4f5b6239 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 19 May 2020 16:33:21 +0200 Subject: block: remove ioctl_by_bdev No callers left. Signed-off-by: Christoph Hellwig Signed-off-by: Jens Axboe --- fs/block_dev.c | 12 ------------ include/linux/fs.h | 1 - 2 files changed, 13 deletions(-) (limited to 'include/linux')
diff --git a/fs/block_dev.c b/fs/block_dev.c index 7cbb7b79935e..3003831c771d 100644 --- a/fs/block_dev.c +++ b/fs/block_dev.c @@ -2166,18 +2166,6 @@ const struct file_operations def_blk_fops = { .fallocate = blkdev_fallocate, }; -int ioctl_by_bdev(struct block_device *bdev, unsigned cmd, unsigned long arg) -{ - int res; - mm_segment_t old_fs = get_fs(); - set_fs(KERNEL_DS); - res = blkdev_ioctl(bdev, 0, cmd, arg); - set_fs(old_fs); - return res; -} - -EXPORT_SYMBOL(ioctl_by_bdev); - /** * lookup_bdev - lookup a struct block_device by name * @pathname: special file representing the block device
diff --git a/include/linux/fs.h b/include/linux/fs.h index 1a95e5158811..861ca61d728b 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -2636,7 +2636,6 @@ extern int sync_filesystem(struct super_block *); extern const struct file_operations def_blk_fops; extern const struct file_operations def_chr_fops; #ifdef CONFIG_BLOCK -extern int ioctl_by_bdev(struct block_device *, unsigned, unsigned long); extern int blkdev_ioctl(struct block_device *, fmode_t, unsigned, unsigned long); extern long compat_blkdev_ioctl(struct file *, unsigned, unsigned long); extern int blkdev_get(struct block_device *bdev, fmode_t mode, void *holder); -- cgit v1.2.3
From 112b7147592e8f46bd1da4f961773e6d974f38a8 Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Thu, 14 May 2020 12:53:44 -0500 Subject: exec: Convert security_bprm_set_creds into security_bprm_repopulate_creds Rename bprm->cap_elevated to bprm->active_secureexec and initialize it in prepare_binprm instead of in cap_bprm_set_creds. Initializing bprm->active_secureexec in prepare_binprm allows multiple implementations of security_bprm_repopulate_creds to play nicely with each other. Rename security_bprm_set_creds to security_bprm_repopulate_creds to emphasize that this path recomputes part of bprm->cred. This recomputation avoids the time-of-check vs. time-of-use problems that are inherent in Unix #! interpreters. In short: two renames, plus a move of where bprm->active_secureexec is initialized.
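For illustration only, the recomputation path in prepare_binprm() after this change reads (lines lifted from the hunk below):

	/* Recompute parts of bprm->cred based on bprm->file */
	bprm->active_secureexec = 0;
	bprm_fill_uid(bprm);
	retval = security_bprm_repopulate_creds(bprm);
	if (retval)
		return retval;

Because generic code clears the flag before the hook runs, each bprm_repopulate_creds implementation only ever needs to set it, which is what lets multiple implementations coexist.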
Link: https://lkml.kernel.org/r/87o8qkzrxp.fsf_-_@x220.int.ebiederm.org Acked-by: Linus Torvalds Reviewed-by: Kees Cook Signed-off-by: "Eric W. Biederman" --- fs/exec.c | 8 ++++---- include/linux/binfmts.h | 4 ++-- include/linux/lsm_hook_defs.h | 2 +- include/linux/lsm_hooks.h | 4 ++-- include/linux/security.h | 8 ++++---- security/commoncap.c | 9 ++++----- security/security.c | 4 ++-- 7 files changed, 19 insertions(+), 20 deletions(-) (limited to 'include/linux') diff --git a/fs/exec.c b/fs/exec.c index 9e70da47f8d9..8e3b93d51d31 100644 --- a/fs/exec.c +++ b/fs/exec.c @@ -1366,7 +1366,7 @@ int begin_new_exec(struct linux_binprm * bprm) * the final state of setuid/setgid/fscaps can be merged into the * secureexec flag. */ - bprm->secureexec |= bprm->cap_elevated; + bprm->secureexec |= bprm->active_secureexec; if (bprm->secureexec) { /* Make sure parent cannot signal privileged process. */ @@ -1634,10 +1634,10 @@ int prepare_binprm(struct linux_binprm *bprm) int retval; loff_t pos = 0; + /* Recompute parts of bprm->cred based on bprm->file */ + bprm->active_secureexec = 0; bprm_fill_uid(bprm); - - /* fill in binprm security blob */ - retval = security_bprm_set_creds(bprm); + retval = security_bprm_repopulate_creds(bprm); if (retval) return retval; diff --git a/include/linux/binfmts.h b/include/linux/binfmts.h index d1217fcdedea..8605ab4a0f89 100644 --- a/include/linux/binfmts.h +++ b/include/linux/binfmts.h @@ -27,10 +27,10 @@ struct linux_binprm { unsigned long argmin; /* rlimit marker for copy_strings() */ unsigned int /* - * True if most recent call to cap_bprm_set_creds + * True if most recent call to security_bprm_set_creds * resulted in elevated privileges. */ - cap_elevated:1, + active_secureexec:1, /* * Set by bprm_creds_for_exec hook to indicate a * privilege-gaining exec has happened. Used to set diff --git a/include/linux/lsm_hook_defs.h b/include/linux/lsm_hook_defs.h index aab0695f41df..1e295ba12c0d 100644 --- a/include/linux/lsm_hook_defs.h +++ b/include/linux/lsm_hook_defs.h @@ -50,7 +50,7 @@ LSM_HOOK(int, 0, settime, const struct timespec64 *ts, const struct timezone *tz) LSM_HOOK(int, 0, vm_enough_memory, struct mm_struct *mm, long pages) LSM_HOOK(int, 0, bprm_creds_for_exec, struct linux_binprm *bprm) -LSM_HOOK(int, 0, bprm_set_creds, struct linux_binprm *bprm) +LSM_HOOK(int, 0, bprm_repopulate_creds, struct linux_binprm *bprm) LSM_HOOK(int, 0, bprm_check_security, struct linux_binprm *bprm) LSM_HOOK(void, LSM_RET_VOID, bprm_committing_creds, struct linux_binprm *bprm) LSM_HOOK(void, LSM_RET_VOID, bprm_committed_creds, struct linux_binprm *bprm) diff --git a/include/linux/lsm_hooks.h b/include/linux/lsm_hooks.h index c719af37df20..d618ecc4d660 100644 --- a/include/linux/lsm_hooks.h +++ b/include/linux/lsm_hooks.h @@ -44,7 +44,7 @@ * request libc enable secure mode. * @bprm contains the linux_binprm structure. * Return 0 if the hook is successful and permission is granted. - * @bprm_set_creds: + * @bprm_repopulate_creds: * Assuming that the relevant bits of @bprm->cred->security have been * previously set, examine @bprm->file and regenerate them. This is * so that the credentials derived from the interpreter the code is @@ -53,7 +53,7 @@ * reopen script, and may end up opening something completely different. * This hook may also optionally check permissions (e.g. for * transitions between security domains). 
- * The hook must set @bprm->cap_elevated to 1 if AT_SECURE should be set to + * The hook must set @bprm->active_secureexec to 1 if AT_SECURE should be set to * request libc enable secure mode. * @bprm contains the linux_binprm structure. * Return 0 if the hook is successful and permission is granted. diff --git a/include/linux/security.h b/include/linux/security.h index 1bd7a6582775..6dcec9375e8f 100644 --- a/include/linux/security.h +++ b/include/linux/security.h @@ -140,7 +140,7 @@ extern int cap_capset(struct cred *new, const struct cred *old, const kernel_cap_t *effective, const kernel_cap_t *inheritable, const kernel_cap_t *permitted); -extern int cap_bprm_set_creds(struct linux_binprm *bprm); +extern int cap_bprm_repopulate_creds(struct linux_binprm *bprm); extern int cap_inode_setxattr(struct dentry *dentry, const char *name, const void *value, size_t size, int flags); extern int cap_inode_removexattr(struct dentry *dentry, const char *name); @@ -277,7 +277,7 @@ int security_syslog(int type); int security_settime64(const struct timespec64 *ts, const struct timezone *tz); int security_vm_enough_memory_mm(struct mm_struct *mm, long pages); int security_bprm_creds_for_exec(struct linux_binprm *bprm); -int security_bprm_set_creds(struct linux_binprm *bprm); +int security_bprm_repopulate_creds(struct linux_binprm *bprm); int security_bprm_check(struct linux_binprm *bprm); void security_bprm_committing_creds(struct linux_binprm *bprm); void security_bprm_committed_creds(struct linux_binprm *bprm); @@ -575,9 +575,9 @@ static inline int security_bprm_creds_for_exec(struct linux_binprm *bprm) return 0; } -static inline int security_bprm_set_creds(struct linux_binprm *bprm) +static inline int security_bprm_repopulate_creds(struct linux_binprm *bprm) { - return cap_bprm_set_creds(bprm); + return cap_bprm_repopulate_creds(bprm); } static inline int security_bprm_check(struct linux_binprm *bprm) diff --git a/security/commoncap.c b/security/commoncap.c index f4ee0ae106b2..045b5b80ea40 100644 --- a/security/commoncap.c +++ b/security/commoncap.c @@ -797,14 +797,14 @@ static inline bool nonroot_raised_pE(struct cred *new, const struct cred *old, } /** - * cap_bprm_set_creds - Set up the proposed credentials for execve(). + * cap_bprm_repopulate_creds - Set up the proposed credentials for execve(). * @bprm: The execution parameters, including the proposed creds * * Set up the proposed credentials for a new execution context being * constructed by execve(). The proposed creds in @bprm->cred is altered, * which won't take effect immediately. Returns 0 if successful, -ve on error. */ -int cap_bprm_set_creds(struct linux_binprm *bprm) +int cap_bprm_repopulate_creds(struct linux_binprm *bprm) { const struct cred *old = current_cred(); struct cred *new = bprm->cred; @@ -884,12 +884,11 @@ int cap_bprm_set_creds(struct linux_binprm *bprm) return -EPERM; /* Check for privilege-elevated exec. 
*/ - bprm->cap_elevated = 0; if (is_setid || (!__is_real(root_uid, new) && (effective || __cap_grew(permitted, ambient, new)))) - bprm->cap_elevated = 1; + bprm->active_secureexec = 1; return 0; } @@ -1346,7 +1345,7 @@ static struct security_hook_list capability_hooks[] __lsm_ro_after_init = { LSM_HOOK_INIT(ptrace_traceme, cap_ptrace_traceme), LSM_HOOK_INIT(capget, cap_capget), LSM_HOOK_INIT(capset, cap_capset), - LSM_HOOK_INIT(bprm_set_creds, cap_bprm_set_creds), + LSM_HOOK_INIT(bprm_repopulate_creds, cap_bprm_repopulate_creds), LSM_HOOK_INIT(inode_need_killpriv, cap_inode_need_killpriv), LSM_HOOK_INIT(inode_killpriv, cap_inode_killpriv), LSM_HOOK_INIT(inode_getsecurity, cap_inode_getsecurity),
diff --git a/security/security.c b/security/security.c index 4ee76a729f73..b890b7e2a765 100644 --- a/security/security.c +++ b/security/security.c @@ -828,9 +828,9 @@ int security_bprm_creds_for_exec(struct linux_binprm *bprm) return call_int_hook(bprm_creds_for_exec, 0, bprm); } -int security_bprm_set_creds(struct linux_binprm *bprm) +int security_bprm_repopulate_creds(struct linux_binprm *bprm) { - return call_int_hook(bprm_set_creds, 0, bprm); + return call_int_hook(bprm_repopulate_creds, 0, bprm); } int security_bprm_check(struct linux_binprm *bprm) -- cgit v1.2.3
From a16b3357b2b8e910bb614254d8a7e84d2bd59b4c Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Sat, 16 May 2020 06:02:54 -0500 Subject: exec: Allow load_misc_binary to call prepare_binprm unconditionally Add a flag preserve_creds that binfmt_misc can set to prevent credentials from being updated. This allows binfmt_misc to always call prepare_binprm, allowing the credential computation logic to be consolidated. Not replacing the credentials with the interpreter's credentials is safe because an open file descriptor to the executable is passed to the interpreter. As the interpreter does not need to reopen the executable it is guaranteed to see the same file that exec sees. Ref: c407c033de84 ("[PATCH] binfmt_misc: improve calculation of interpreter's credentials") Link: https://lkml.kernel.org/r/87imgszrwo.fsf_-_@x220.int.ebiederm.org Acked-by: Linus Torvalds Reviewed-by: Kees Cook Signed-off-by: "Eric W. Biederman" --- fs/binfmt_misc.c | 15 +++------------ fs/exec.c | 19 ++++++++++++------- include/linux/binfmts.h | 2 ++ 3 files changed, 17 insertions(+), 19 deletions(-) (limited to 'include/linux')
diff --git a/fs/binfmt_misc.c b/fs/binfmt_misc.c index cdb45829354d..264829745d6f 100644 --- a/fs/binfmt_misc.c +++ b/fs/binfmt_misc.c @@ -218,19 +218,10 @@ static int load_misc_binary(struct linux_binprm *bprm) goto error; bprm->file = interp_file; - if (fmt->flags & MISC_FMT_CREDENTIALS) { - loff_t pos = 0; - - /* - * No need to call prepare_binprm(), it's already been - * done. bprm->buf is stale, update from interp_file.
- */ - memset(bprm->buf, 0, BINPRM_BUF_SIZE); - retval = kernel_read(bprm->file, bprm->buf, BINPRM_BUF_SIZE, - &pos); - } else - retval = prepare_binprm(bprm); + if (fmt->flags & MISC_FMT_CREDENTIALS) + bprm->preserve_creds = 1; + retval = prepare_binprm(bprm); if (retval < 0) goto error;
diff --git a/fs/exec.c b/fs/exec.c index 8e3b93d51d31..028e0e323af5 100644 --- a/fs/exec.c +++ b/fs/exec.c @@ -1631,15 +1631,20 @@ static void bprm_fill_uid(struct linux_binprm *bprm) */ int prepare_binprm(struct linux_binprm *bprm) { - int retval; loff_t pos = 0; + /* Can the interpreter get to the executable without races? */ + if (!bprm->preserve_creds) { + int retval; + + /* Recompute parts of bprm->cred based on bprm->file */ + bprm->active_secureexec = 0; + bprm_fill_uid(bprm); + retval = security_bprm_repopulate_creds(bprm); + if (retval) + return retval; + } + bprm->preserve_creds = 0; memset(bprm->buf, 0, BINPRM_BUF_SIZE); return kernel_read(bprm->file, bprm->buf, BINPRM_BUF_SIZE, &pos);
diff --git a/include/linux/binfmts.h b/include/linux/binfmts.h index 8605ab4a0f89..dbb5614d62a2 100644 --- a/include/linux/binfmts.h +++ b/include/linux/binfmts.h @@ -26,6 +26,8 @@ struct linux_binprm { unsigned long p; /* current top of mem */ unsigned long argmin; /* rlimit marker for copy_strings() */ unsigned int + /* It is safe to use the creds of a script (see binfmt_misc) */ + preserve_creds:1, /* * True if most recent call to security_bprm_set_creds * resulted in elevated privileges. -- cgit v1.2.3
From 8b72ca9004ed35104deb80b07990da5503bc5252 Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Wed, 13 May 2020 22:25:20 -0500 Subject: exec: Move the call of prepare_binprm into search_binary_handler The code in prepare_binprm needs to be run every time search_binary_handler is called so move the call into search_binary_handler itself to make the code simpler and easier to understand. Link: https://lkml.kernel.org/r/87d070zrvx.fsf_-_@x220.int.ebiederm.org Acked-by: Linus Torvalds Reviewed-by: Kees Cook Reviewed-by: James Morris Signed-off-by: "Eric W.
Biederman" --- arch/alpha/kernel/binfmt_loader.c | 3 --- fs/binfmt_em86.c | 4 ---- fs/binfmt_misc.c | 4 ---- fs/binfmt_script.c | 3 --- fs/exec.c | 12 +++++------- include/linux/binfmts.h | 1 - 6 files changed, 5 insertions(+), 22 deletions(-) (limited to 'include/linux') diff --git a/arch/alpha/kernel/binfmt_loader.c b/arch/alpha/kernel/binfmt_loader.c index a8d0d6e06526..d712ba51d15a 100644 --- a/arch/alpha/kernel/binfmt_loader.c +++ b/arch/alpha/kernel/binfmt_loader.c @@ -35,9 +35,6 @@ static int load_binary(struct linux_binprm *bprm) bprm->file = file; bprm->loader = loader; - retval = prepare_binprm(bprm); - if (retval < 0) - return retval; return search_binary_handler(bprm); } diff --git a/fs/binfmt_em86.c b/fs/binfmt_em86.c index 466497860c62..cedde2341ade 100644 --- a/fs/binfmt_em86.c +++ b/fs/binfmt_em86.c @@ -91,10 +91,6 @@ static int load_em86(struct linux_binprm *bprm) bprm->file = file; - retval = prepare_binprm(bprm); - if (retval < 0) - return retval; - return search_binary_handler(bprm); } diff --git a/fs/binfmt_misc.c b/fs/binfmt_misc.c index 264829745d6f..50a73afdf9b7 100644 --- a/fs/binfmt_misc.c +++ b/fs/binfmt_misc.c @@ -221,10 +221,6 @@ static int load_misc_binary(struct linux_binprm *bprm) if (fmt->flags & MISC_FMT_CREDENTIALS) bprm->preserve_creds = 1; - retval = prepare_binprm(bprm); - if (retval < 0) - goto error; - retval = search_binary_handler(bprm); if (retval < 0) goto error; diff --git a/fs/binfmt_script.c b/fs/binfmt_script.c index e9e6a6f4a35f..8d718d8fd0fe 100644 --- a/fs/binfmt_script.c +++ b/fs/binfmt_script.c @@ -143,9 +143,6 @@ static int load_script(struct linux_binprm *bprm) return PTR_ERR(file); bprm->file = file; - retval = prepare_binprm(bprm); - if (retval < 0) - return retval; return search_binary_handler(bprm); } diff --git a/fs/exec.c b/fs/exec.c index 028e0e323af5..5fc458460e44 100644 --- a/fs/exec.c +++ b/fs/exec.c @@ -1629,7 +1629,7 @@ static void bprm_fill_uid(struct linux_binprm *bprm) * * This may be called multiple times for binary chains (scripts for example). 
*/ -int prepare_binprm(struct linux_binprm *bprm) +static int prepare_binprm(struct linux_binprm *bprm) { loff_t pos = 0; @@ -1650,8 +1650,6 @@ static int prepare_binprm(struct linux_binprm *bprm) return kernel_read(bprm->file, bprm->buf, BINPRM_BUF_SIZE, &pos); } -EXPORT_SYMBOL(prepare_binprm); - /* * Arguments are '\0' separated strings found at the location bprm->p * points to; chop off the first by relocating brpm->p to right after @@ -1707,6 +1705,10 @@ int search_binary_handler(struct linux_binprm *bprm) if (bprm->recursion_depth > 5) return -ELOOP; + retval = prepare_binprm(bprm); + if (retval < 0) + return retval; + retval = security_bprm_check(bprm); if (retval) return retval; @@ -1864,10 +1866,6 @@ static int __do_execve_file(int fd, struct filename *filename, if (retval) goto out; - retval = prepare_binprm(bprm); - if (retval < 0) - goto out; - retval = copy_strings_kernel(1, &bprm->filename, bprm); if (retval < 0) goto out;
diff --git a/include/linux/binfmts.h b/include/linux/binfmts.h index dbb5614d62a2..8c7779d6bf19 100644 --- a/include/linux/binfmts.h +++ b/include/linux/binfmts.h @@ -116,7 +116,6 @@ static inline void insert_binfmt(struct linux_binfmt *fmt) extern void unregister_binfmt(struct linux_binfmt *); -extern int prepare_binprm(struct linux_binprm *); extern int __must_check remove_arg_zero(struct linux_binprm *); extern int search_binary_handler(struct linux_binprm *); extern int begin_new_exec(struct linux_binprm * bprm); -- cgit v1.2.3
From b8a61c9e7b4a0fec493d191429e9653d66a79ccc Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Thu, 14 May 2020 15:17:40 -0500 Subject: exec: Generic execfd support Most of the support for passing the file descriptor of an executable to an interpreter already lives in the generic code and in binfmt_elf. Rework the fields in binfmt_elf that deal with executable file descriptor passing to make executable file descriptor passing a first-class concept. Move the fd_install from binfmt_misc into begin_new_exec after the new creds have been installed. This means that accessing the file through /proc/<pid>/fd/N is able to see the creds for the new executable before allowing access to the new executable's files. Performing the install of the executable's file descriptor after the point of no return also means that nothing special needs to be done on error. The exiting of the process will close all of its open files. Move the would_dump from binfmt_misc into begin_new_exec right after would_dump is called on the bprm->file. This makes it obvious this case exists and that no nesting of bprm->file is currently supported. In binfmt_misc the movement of fd_install into generic code means that its special error exit path is no longer needed. Link: https://lkml.kernel.org/r/87y2poyd91.fsf_-_@x220.int.ebiederm.org Acked-by: Linus Torvalds Reviewed-by: Kees Cook Signed-off-by: "Eric W.
Biederman" --- fs/binfmt_elf.c | 4 ++-- fs/binfmt_elf_fdpic.c | 4 ++-- fs/binfmt_misc.c | 40 ++++++++-------------------------------- fs/exec.c | 15 +++++++++++++++ include/linux/binfmts.h | 10 +++++----- 5 files changed, 32 insertions(+), 41 deletions(-) (limited to 'include/linux') diff --git a/fs/binfmt_elf.c b/fs/binfmt_elf.c index 396d5c2e6b5e..441c85f04dfd 100644 --- a/fs/binfmt_elf.c +++ b/fs/binfmt_elf.c @@ -273,8 +273,8 @@ create_elf_tables(struct linux_binprm *bprm, const struct elfhdr *exec, NEW_AUX_ENT(AT_BASE_PLATFORM, (elf_addr_t)(unsigned long)u_base_platform); } - if (bprm->interp_flags & BINPRM_FLAGS_EXECFD) { - NEW_AUX_ENT(AT_EXECFD, bprm->interp_data); + if (bprm->have_execfd) { + NEW_AUX_ENT(AT_EXECFD, bprm->execfd); } #undef NEW_AUX_ENT /* AT_NULL is zero; clear the rest too */ diff --git a/fs/binfmt_elf_fdpic.c b/fs/binfmt_elf_fdpic.c index 896e3ca9bf85..2d5e9eb12075 100644 --- a/fs/binfmt_elf_fdpic.c +++ b/fs/binfmt_elf_fdpic.c @@ -628,10 +628,10 @@ static int create_elf_fdpic_tables(struct linux_binprm *bprm, (elf_addr_t) (unsigned long) u_base_platform); } - if (bprm->interp_flags & BINPRM_FLAGS_EXECFD) { + if (bprm->have_execfd) { nr = 0; csp -= 2 * sizeof(unsigned long); - NEW_AUX_ENT(AT_EXECFD, bprm->interp_data); + NEW_AUX_ENT(AT_EXECFD, bprm->execfd); } nr = 0; diff --git a/fs/binfmt_misc.c b/fs/binfmt_misc.c index 50a73afdf9b7..ad2866f28f0c 100644 --- a/fs/binfmt_misc.c +++ b/fs/binfmt_misc.c @@ -134,7 +134,6 @@ static int load_misc_binary(struct linux_binprm *bprm) Node *fmt; struct file *interp_file = NULL; int retval; - int fd_binary = -1; retval = -ENOEXEC; if (!enabled) @@ -161,29 +160,12 @@ static int load_misc_binary(struct linux_binprm *bprm) } if (fmt->flags & MISC_FMT_OPEN_BINARY) { - - /* if the binary should be opened on behalf of the - * interpreter than keep it open and assign descriptor - * to it - */ - fd_binary = get_unused_fd_flags(0); - if (fd_binary < 0) { - retval = fd_binary; - goto ret; - } - fd_install(fd_binary, bprm->file); - - /* if the binary is not readable than enforce mm->dumpable=0 - regardless of the interpreter's permissions */ - would_dump(bprm, bprm->file); + /* Pass the open binary to the interpreter */ + bprm->have_execfd = 1; + bprm->executable = bprm->file; allow_write_access(bprm->file); bprm->file = NULL; - - /* mark the bprm that fd should be passed to interp */ - bprm->interp_flags |= BINPRM_FLAGS_EXECFD; - bprm->interp_data = fd_binary; - } else { allow_write_access(bprm->file); fput(bprm->file); @@ -192,19 +174,19 @@ static int load_misc_binary(struct linux_binprm *bprm) /* make argv[1] be the path to the binary */ retval = copy_strings_kernel(1, &bprm->interp, bprm); if (retval < 0) - goto error; + goto ret; bprm->argc++; /* add the interp as argv[0] */ retval = copy_strings_kernel(1, &fmt->interpreter, bprm); if (retval < 0) - goto error; + goto ret; bprm->argc++; /* Update interp in case binfmt_script needs it. 
*/ retval = bprm_change_interp(fmt->interpreter, bprm); if (retval < 0) - goto error; + goto ret; if (fmt->flags & MISC_FMT_OPEN_FILE) { interp_file = file_clone_open(fmt->interp_file); @@ -215,7 +197,7 @@ static int load_misc_binary(struct linux_binprm *bprm) } retval = PTR_ERR(interp_file); if (IS_ERR(interp_file)) - goto error; + goto ret; bprm->file = interp_file; if (fmt->flags & MISC_FMT_CREDENTIALS) @@ -223,17 +205,11 @@ static int load_misc_binary(struct linux_binprm *bprm) retval = search_binary_handler(bprm); if (retval < 0) - goto error; + goto ret; ret: dput(fmt->dentry); return retval; -error: - if (fd_binary > 0) - ksys_close(fd_binary); - bprm->interp_flags = 0; - bprm->interp_data = 0; - goto ret; } /* Command parsers */ diff --git a/fs/exec.c b/fs/exec.c index 5fc458460e44..117ad8fc012b 100644 --- a/fs/exec.c +++ b/fs/exec.c @@ -1323,7 +1323,10 @@ int begin_new_exec(struct linux_binprm * bprm) */ set_mm_exe_file(bprm->mm, bprm->file); + /* If the binary is not readable then enforce mm->dumpable=0 */ would_dump(bprm, bprm->file); + if (bprm->have_execfd) + would_dump(bprm, bprm->executable); /* * Release all of the old mmap stuff @@ -1427,6 +1430,16 @@ int begin_new_exec(struct linux_binprm * bprm) * credentials; any time after this it may be unlocked. */ security_bprm_committed_creds(bprm); + + /* Pass the opened binary to the interpreter. */ + if (bprm->have_execfd) { + retval = get_unused_fd_flags(0); + if (retval < 0) + goto out_unlock; + fd_install(retval, bprm->executable); + bprm->executable = NULL; + bprm->execfd = retval; + } return 0; out_unlock: @@ -1516,6 +1529,8 @@ static void free_bprm(struct linux_binprm *bprm) allow_write_access(bprm->file); fput(bprm->file); } + if (bprm->executable) + fput(bprm->executable); /* If a binfmt changed the interp, free it. */ if (bprm->interp != bprm->filename) kfree(bprm->interp); diff --git a/include/linux/binfmts.h b/include/linux/binfmts.h index 8c7779d6bf19..653508b25815 100644 --- a/include/linux/binfmts.h +++ b/include/linux/binfmts.h @@ -26,6 +26,9 @@ struct linux_binprm { unsigned long p; /* current top of mem */ unsigned long argmin; /* rlimit marker for copy_strings() */ unsigned int + /* Should an execfd be passed to userspace? */ + have_execfd:1, + /* It is safe to use the creds of a script (see binfmt_misc) */ preserve_creds:1, /* @@ -48,6 +51,7 @@ struct linux_binprm { unsigned int taso:1; #endif unsigned int recursion_depth; /* only for search_binary_handler() */ + struct file * executable; /* Executable to pass to the interpreter */ struct file * file; struct cred *cred; /* new credentials */ int unsafe; /* how unsafe this exec is (mask of LSM_UNSAFE_*) */ @@ -58,7 +62,7 @@ struct linux_binprm { of the time same as filename, but could be different for binfmt_{misc,script} */ unsigned interp_flags; - unsigned interp_data; + int execfd; /* File descriptor of the executable */ unsigned long loader, exec; struct rlimit rlim_stack; /* Saved RLIMIT_STACK used during exec. 
*/ @@ -69,10 +73,6 @@ struct linux_binprm { #define BINPRM_FLAGS_ENFORCE_NONDUMP_BIT 0 #define BINPRM_FLAGS_ENFORCE_NONDUMP (1 << BINPRM_FLAGS_ENFORCE_NONDUMP_BIT) -/* fd of the binary should be passed to the interpreter */ -#define BINPRM_FLAGS_EXECFD_BIT 1 -#define BINPRM_FLAGS_EXECFD (1 << BINPRM_FLAGS_EXECFD_BIT) - /* filename of the binary will be inaccessible after exec */ #define BINPRM_FLAGS_PATH_INACCESSIBLE_BIT 2 #define BINPRM_FLAGS_PATH_INACCESSIBLE (1 << BINPRM_FLAGS_PATH_INACCESSIBLE_BIT) -- cgit v1.2.3 From bc2bf338d54b7aadaed49bb45b9e10d4592b2a46 Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Mon, 18 May 2020 18:43:20 -0500 Subject: exec: Remove recursion from search_binary_handler Recursion in kernel code is generally a bad idea as it can overflow the kernel stack. Recursion in exec also hides that the code is looping and that the loop changes bprm->file. Instead of recursing in search_binary_handler have the methods that would recurse set bprm->interpreter and return 0. Modify exec_binprm to loop when bprm->interpreter is set. Consolidate all of the reassignments of bprm->file in that loop to make it clear what is going on. The structure of the new loop in exec_binprm is that all errors return immediately, while successful completion (ret == 0 && !bprm->interpreter) just breaks out of the loop and runs what exec_binprm has always run upon successful completion. Fail if an interpreter is being called after execfd has been set. The code has never properly handled an interpreter being called with execfd set; with the reassignments of bprm->file and the assignment of bprm->executable in generic code, it has finally become possible to test and fail when this problematic condition happens. With the reassignments of bprm->file and the assignment of bprm->executable moved into the generic code, add a test to see if bprm->executable is being reassigned. In search_binary_handler remove the test for !bprm->file. With all reassignments of bprm->file moved to exec_binprm, bprm->file can never be NULL in search_binary_handler. Link: https://lkml.kernel.org/r/87sgfwyd84.fsf_-_@x220.int.ebiederm.org Acked-by: Linus Torvalds Reviewed-by: Kees Cook Signed-off-by: "Eric W. Biederman" --- arch/alpha/kernel/binfmt_loader.c | 8 ++---- fs/binfmt_em86.c | 9 ++----- fs/binfmt_misc.c | 18 +++----------- fs/binfmt_script.c | 9 ++----- fs/exec.c | 51 +++++++++++++++++++++++++-------------- include/linux/binfmts.h | 3 +-- 6 files changed, 43 insertions(+), 55 deletions(-) (limited to 'include/linux') diff --git a/arch/alpha/kernel/binfmt_loader.c b/arch/alpha/kernel/binfmt_loader.c index d712ba51d15a..e4be7a543ecf 100644 --- a/arch/alpha/kernel/binfmt_loader.c +++ b/arch/alpha/kernel/binfmt_loader.c @@ -19,10 +19,6 @@ static int load_binary(struct linux_binprm *bprm) if (bprm->loader) return -ENOEXEC; - allow_write_access(bprm->file); - fput(bprm->file); - bprm->file = NULL; - loader = bprm->vma->vm_end - sizeof(void *); file = open_exec("/sbin/loader"); @@ -33,9 +29,9 @@ static int load_binary(struct linux_binprm *bprm) /* Remember if the application is TASO.
*/ bprm->taso = eh->ah.entry < 0x100000000UL; - bprm->file = file; + bprm->interpreter = file; bprm->loader = loader; - return search_binary_handler(bprm); + return 0; } static struct linux_binfmt loader_format = { diff --git a/fs/binfmt_em86.c b/fs/binfmt_em86.c index cedde2341ade..995883693cb2 100644 --- a/fs/binfmt_em86.c +++ b/fs/binfmt_em86.c @@ -48,10 +48,6 @@ static int load_em86(struct linux_binprm *bprm) if (bprm->interp_flags & BINPRM_FLAGS_PATH_INACCESSIBLE) return -ENOENT; - allow_write_access(bprm->file); - fput(bprm->file); - bprm->file = NULL; - /* Unlike in the script case, we don't have to do any hairy * parsing to find our interpreter... it's hardcoded! */ @@ -89,9 +85,8 @@ static int load_em86(struct linux_binprm *bprm) if (IS_ERR(file)) return PTR_ERR(file); - bprm->file = file; - - return search_binary_handler(bprm); + bprm->interpreter = file; + return 0; } static struct linux_binfmt em86_format = { diff --git a/fs/binfmt_misc.c b/fs/binfmt_misc.c index ad2866f28f0c..53968ea07b57 100644 --- a/fs/binfmt_misc.c +++ b/fs/binfmt_misc.c @@ -159,18 +159,9 @@ static int load_misc_binary(struct linux_binprm *bprm) goto ret; } - if (fmt->flags & MISC_FMT_OPEN_BINARY) { - /* Pass the open binary to the interpreter */ + if (fmt->flags & MISC_FMT_OPEN_BINARY) bprm->have_execfd = 1; - bprm->executable = bprm->file; - allow_write_access(bprm->file); - bprm->file = NULL; - } else { - allow_write_access(bprm->file); - fput(bprm->file); - bprm->file = NULL; - } /* make argv[1] be the path to the binary */ retval = copy_strings_kernel(1, &bprm->interp, bprm); if (retval < 0) @@ -199,14 +190,11 @@ static int load_misc_binary(struct linux_binprm *bprm) if (IS_ERR(interp_file)) goto ret; - bprm->file = interp_file; + bprm->interpreter = interp_file; if (fmt->flags & MISC_FMT_CREDENTIALS) bprm->preserve_creds = 1; - retval = search_binary_handler(bprm); - if (retval < 0) - goto ret; - + retval = 0; ret: dput(fmt->dentry); return retval; diff --git a/fs/binfmt_script.c b/fs/binfmt_script.c index 85e0ef86eb11..0e8b953d12cf 100644 --- a/fs/binfmt_script.c +++ b/fs/binfmt_script.c @@ -93,11 +93,6 @@ static int load_script(struct linux_binprm *bprm) if (bprm->interp_flags & BINPRM_FLAGS_PATH_INACCESSIBLE) return -ENOENT; - /* Release since we are not mapping a binary into memory. */ - allow_write_access(bprm->file); - fput(bprm->file); - bprm->file = NULL; - /* * OK, we've parsed out the interpreter name and * (optional) argument. @@ -138,8 +133,8 @@ static int load_script(struct linux_binprm *bprm) if (IS_ERR(file)) return PTR_ERR(file); - bprm->file = file; - return search_binary_handler(bprm); + bprm->interpreter = file; + return 0; } static struct linux_binfmt script_format = { diff --git a/fs/exec.c b/fs/exec.c index 117ad8fc012b..c3c879a55d65 100644 --- a/fs/exec.c +++ b/fs/exec.c @@ -1710,16 +1710,12 @@ EXPORT_SYMBOL(remove_arg_zero); /* * cycle the list of binary formats handler, until one recognizes the image */ -int search_binary_handler(struct linux_binprm *bprm) +static int search_binary_handler(struct linux_binprm *bprm) { bool need_retry = IS_ENABLED(CONFIG_MODULES); struct linux_binfmt *fmt; int retval; - /* This allows 4 levels of binfmt rewrites before failing hard. 
*/ - if (bprm->recursion_depth > 5) - return -ELOOP; - retval = prepare_binprm(bprm); if (retval < 0) return retval; @@ -1736,14 +1732,11 @@ int search_binary_handler(struct linux_binprm *bprm) continue; read_unlock(&binfmt_lock); - bprm->recursion_depth++; retval = fmt->load_binary(bprm); - bprm->recursion_depth--; read_lock(&binfmt_lock); put_binfmt(fmt); - if (bprm->point_of_no_return || !bprm->file || - (retval != -ENOEXEC)) { + if (bprm->point_of_no_return || (retval != -ENOEXEC)) { read_unlock(&binfmt_lock); return retval; } @@ -1762,12 +1755,11 @@ int search_binary_handler(struct linux_binprm *bprm) return retval; } -EXPORT_SYMBOL(search_binary_handler); static int exec_binprm(struct linux_binprm *bprm) { pid_t old_pid, old_vpid; - int ret; + int ret, depth; /* Need to fetch pid before load_binary changes it */ old_pid = current->pid; @@ -1775,15 +1767,38 @@ static int exec_binprm(struct linux_binprm *bprm) old_vpid = task_pid_nr_ns(current, task_active_pid_ns(current->parent)); rcu_read_unlock(); - ret = search_binary_handler(bprm); - if (ret >= 0) { - audit_bprm(bprm); - trace_sched_process_exec(current, old_pid, bprm); - ptrace_event(PTRACE_EVENT_EXEC, old_vpid); - proc_exec_connector(current); + /* This allows 4 levels of binfmt rewrites before failing hard. */ + for (depth = 0;; depth++) { + struct file *exec; + if (depth > 5) + return -ELOOP; + + ret = search_binary_handler(bprm); + if (ret < 0) + return ret; + if (!bprm->interpreter) + break; + + exec = bprm->file; + bprm->file = bprm->interpreter; + bprm->interpreter = NULL; + + allow_write_access(exec); + if (unlikely(bprm->have_execfd)) { + if (bprm->executable) { + fput(exec); + return -ENOEXEC; + } + bprm->executable = exec; + } else + fput(exec); } - return ret; + audit_bprm(bprm); + trace_sched_process_exec(current, old_pid, bprm); + ptrace_event(PTRACE_EVENT_EXEC, old_vpid); + proc_exec_connector(current); + return 0; } /* diff --git a/include/linux/binfmts.h b/include/linux/binfmts.h index 653508b25815..7fc05929c967 100644 --- a/include/linux/binfmts.h +++ b/include/linux/binfmts.h @@ -50,8 +50,8 @@ struct linux_binprm { #ifdef __alpha__ unsigned int taso:1; #endif - unsigned int recursion_depth; /* only for search_binary_handler() */ struct file * executable; /* Executable to pass to the interpreter */ + struct file * interpreter; struct file * file; struct cred *cred; /* new credentials */ int unsafe; /* how unsafe this exec is (mask of LSM_UNSAFE_*) */ @@ -117,7 +117,6 @@ static inline void insert_binfmt(struct linux_binfmt *fmt) extern void unregister_binfmt(struct linux_binfmt *); extern int __must_check remove_arg_zero(struct linux_binprm *); -extern int search_binary_handler(struct linux_binprm *); extern int begin_new_exec(struct linux_binprm * bprm); extern void setup_new_exec(struct linux_binprm * bprm); extern void finalize_exec(struct linux_binprm *bprm); -- cgit v1.2.3 From 6e0688dbff625f1e49e3ddb028720ae9fd606f0b Mon Sep 17 00:00:00 2001 From: Krzysztof Wilczynski Date: Wed, 20 May 2020 18:34:10 +0000 Subject: PCI: Use bridge window names (PCI_BRIDGE_IO_WINDOW etc) Use bridge resource definitions instead of using the PCI_BRIDGE_RESOURCES constant with an integer offset.
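For illustration, a minimal sketch (not part of this patch; show_bridge_windows() is a hypothetical helper) of how the named window indices defined below read at a call site, compared with the raw offset arithmetic they replace:

#include <linux/pci.h>

static void show_bridge_windows(struct pci_dev *bridge)
{
	/* Old style: the bare offset does not say which window is meant. */
	struct resource *io = &bridge->resource[PCI_BRIDGE_RESOURCES + 0];

	/* New style: the named index documents intent at the call site. */
	struct resource *mem = &bridge->resource[PCI_BRIDGE_MEM_WINDOW];
	struct resource *pref = &bridge->resource[PCI_BRIDGE_PREF_MEM_WINDOW];

	pci_info(bridge, "io %pR mem %pR pref %pR\n", io, mem, pref);
}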
Link: https://lore.kernel.org/r/20200520183411.1534621-2-kw@linux.com Signed-off-by: Krzysztof Wilczynski Signed-off-by: Bjorn Helgaas --- drivers/pci/setup-bus.c | 114 ++++++++++++++++++++++-------------- drivers/pcmcia/yenta_socket.c | 16 +++--- include/linux/pci.h | 14 +++++- 3 files changed, 84 insertions(+), 60 deletions(-) (limited to 'include/linux') diff --git a/drivers/pci/setup-bus.c b/drivers/pci/setup-bus.c index bbcef1a053ab..07cd7a3817dc 100644 --- a/drivers/pci/setup-bus.c +++ b/drivers/pci/setup-bus.c @@ -583,7 +583,7 @@ static void pci_setup_bridge_io(struct pci_dev *bridge) io_mask = PCI_IO_1K_RANGE_MASK; /* Set up the top and bottom of the PCI I/O segment for this bus */ - res = &bridge->resource[PCI_BRIDGE_RESOURCES + 0]; + res = &bridge->resource[PCI_BRIDGE_IO_WINDOW]; pcibios_resource_to_bus(bridge->bus, &region, res); if (res->flags & IORESOURCE_IO) { pci_read_config_word(bridge, PCI_IO_BASE, &l); @@ -613,7 +613,7 @@ static void pci_setup_bridge_mmio(struct pci_dev *bridge) u32 l; /* Set up the top and bottom of the PCI Memory segment for this bus */ - res = &bridge->resource[PCI_BRIDGE_RESOURCES + 1]; + res = &bridge->resource[PCI_BRIDGE_MEM_WINDOW]; pcibios_resource_to_bus(bridge->bus, &region, res); if (res->flags & IORESOURCE_MEM) { l = (region.start >> 16) & 0xfff0; @@ -640,7 +640,7 @@ static void pci_setup_bridge_mmio_pref(struct pci_dev *bridge) /* Set up PREF base/limit */ bu = lu = 0; - res = &bridge->resource[PCI_BRIDGE_RESOURCES + 2]; + res = &bridge->resource[PCI_BRIDGE_PREF_MEM_WINDOW]; pcibios_resource_to_bus(bridge->bus, &region, res); if (res->flags & IORESOURCE_PREFETCH) { l = (region.start >> 16) & 0xfff0; @@ -707,14 +707,14 @@ int pci_claim_bridge_resource(struct pci_dev *bridge, int i) if (!pci_bus_clip_resource(bridge, i)) return -EINVAL; /* Clipping didn't change anything */ - switch (i - PCI_BRIDGE_RESOURCES) { - case 0: + switch (i) { + case PCI_BRIDGE_IO_WINDOW: pci_setup_bridge_io(bridge); break; - case 1: + case PCI_BRIDGE_MEM_WINDOW: pci_setup_bridge_mmio(bridge); break; - case 2: + case PCI_BRIDGE_PREF_MEM_WINDOW: pci_setup_bridge_mmio_pref(bridge); break; default: @@ -735,18 +735,22 @@ int pci_claim_bridge_resource(struct pci_dev *bridge, int i) static void pci_bridge_check_ranges(struct pci_bus *bus) { struct pci_dev *bridge = bus->self; - struct resource *b_res = &bridge->resource[PCI_BRIDGE_RESOURCES]; + struct resource *b_res; - b_res[1].flags |= IORESOURCE_MEM; + b_res = &bridge->resource[PCI_BRIDGE_MEM_WINDOW]; + b_res->flags |= IORESOURCE_MEM; - if (bridge->io_window) - b_res[0].flags |= IORESOURCE_IO; + if (bridge->io_window) { + b_res = &bridge->resource[PCI_BRIDGE_IO_WINDOW]; + b_res->flags |= IORESOURCE_IO; + } if (bridge->pref_window) { - b_res[2].flags |= IORESOURCE_MEM | IORESOURCE_PREFETCH; + b_res = &bridge->resource[PCI_BRIDGE_PREF_MEM_WINDOW]; + b_res->flags |= IORESOURCE_MEM | IORESOURCE_PREFETCH; if (bridge->pref_64_window) { - b_res[2].flags |= IORESOURCE_MEM_64; - b_res[2].flags |= PCI_PREF_RANGE_TYPE_64; + b_res->flags |= IORESOURCE_MEM_64 | + PCI_PREF_RANGE_TYPE_64; } } } @@ -1105,35 +1109,37 @@ static void pci_bus_size_cardbus(struct pci_bus *bus, struct list_head *realloc_head) { struct pci_dev *bridge = bus->self; - struct resource *b_res = &bridge->resource[PCI_BRIDGE_RESOURCES]; + struct resource *b_res; resource_size_t b_res_3_size = pci_cardbus_mem_size * 2; u16 ctrl; - if (b_res[0].parent) + b_res = &bridge->resource[PCI_CB_BRIDGE_IO_0_WINDOW]; + if (b_res->parent) goto handle_b_res_1; /* * Reserve some
resources for CardBus. We reserve a fixed amount * of bus space for CardBus bridges. */ - b_res[0].start = pci_cardbus_io_size; - b_res[0].end = b_res[0].start + pci_cardbus_io_size - 1; - b_res[0].flags |= IORESOURCE_IO | IORESOURCE_STARTALIGN; + b_res->start = pci_cardbus_io_size; + b_res->end = b_res->start + pci_cardbus_io_size - 1; + b_res->flags |= IORESOURCE_IO | IORESOURCE_STARTALIGN; if (realloc_head) { - b_res[0].end -= pci_cardbus_io_size; + b_res->end -= pci_cardbus_io_size; add_to_list(realloc_head, bridge, b_res, pci_cardbus_io_size, - pci_cardbus_io_size); + pci_cardbus_io_size); } handle_b_res_1: - if (b_res[1].parent) + b_res = &bridge->resource[PCI_CB_BRIDGE_IO_1_WINDOW]; + if (b_res->parent) goto handle_b_res_2; - b_res[1].start = pci_cardbus_io_size; - b_res[1].end = b_res[1].start + pci_cardbus_io_size - 1; - b_res[1].flags |= IORESOURCE_IO | IORESOURCE_STARTALIGN; + b_res->start = pci_cardbus_io_size; + b_res->end = b_res->start + pci_cardbus_io_size - 1; + b_res->flags |= IORESOURCE_IO | IORESOURCE_STARTALIGN; if (realloc_head) { - b_res[1].end -= pci_cardbus_io_size; - add_to_list(realloc_head, bridge, b_res+1, pci_cardbus_io_size, - pci_cardbus_io_size); + b_res->end -= pci_cardbus_io_size; + add_to_list(realloc_head, bridge, b_res, pci_cardbus_io_size, + pci_cardbus_io_size); } handle_b_res_2: @@ -1153,21 +1159,22 @@ handle_b_res_2: pci_read_config_word(bridge, PCI_CB_BRIDGE_CONTROL, &ctrl); } - if (b_res[2].parent) + b_res = &bridge->resource[PCI_CB_BRIDGE_MEM_0_WINDOW]; + if (b_res->parent) goto handle_b_res_3; /* * If we have prefetchable memory support, allocate two regions. * Otherwise, allocate one region of twice the size. */ if (ctrl & PCI_CB_BRIDGE_CTL_PREFETCH_MEM0) { - b_res[2].start = pci_cardbus_mem_size; - b_res[2].end = b_res[2].start + pci_cardbus_mem_size - 1; - b_res[2].flags |= IORESOURCE_MEM | IORESOURCE_PREFETCH | - IORESOURCE_STARTALIGN; + b_res->start = pci_cardbus_mem_size; + b_res->end = b_res->start + pci_cardbus_mem_size - 1; + b_res->flags |= IORESOURCE_MEM | IORESOURCE_PREFETCH | + IORESOURCE_STARTALIGN; if (realloc_head) { - b_res[2].end -= pci_cardbus_mem_size; - add_to_list(realloc_head, bridge, b_res+2, - pci_cardbus_mem_size, pci_cardbus_mem_size); + b_res->end -= pci_cardbus_mem_size; + add_to_list(realloc_head, bridge, b_res, + pci_cardbus_mem_size, pci_cardbus_mem_size); } /* Reduce that to half */ @@ -1175,15 +1182,16 @@ handle_b_res_2: } handle_b_res_3: - if (b_res[3].parent) + b_res = &bridge->resource[PCI_CB_BRIDGE_MEM_1_WINDOW]; + if (b_res->parent) goto handle_done; - b_res[3].start = pci_cardbus_mem_size; - b_res[3].end = b_res[3].start + b_res_3_size - 1; - b_res[3].flags |= IORESOURCE_MEM | IORESOURCE_STARTALIGN; + b_res->start = pci_cardbus_mem_size; + b_res->end = b_res->start + b_res_3_size - 1; + b_res->flags |= IORESOURCE_MEM | IORESOURCE_STARTALIGN; if (realloc_head) { - b_res[3].end -= b_res_3_size; - add_to_list(realloc_head, bridge, b_res+3, b_res_3_size, - pci_cardbus_mem_size); + b_res->end -= b_res_3_size; + add_to_list(realloc_head, bridge, b_res, b_res_3_size, + pci_cardbus_mem_size); } handle_done: @@ -1227,7 +1235,7 @@ void __pci_bus_size_bridges(struct pci_bus *bus, struct list_head *realloc_head) break; hdr_type = -1; /* Intentionally invalid - not a PCI device. 
*/ } else { - pref = &bus->self->resource[PCI_BRIDGE_RESOURCES + 2]; + pref = &bus->self->resource[PCI_BRIDGE_PREF_MEM_WINDOW]; hdr_type = bus->self->hdr_type; } @@ -1885,9 +1893,9 @@ static void pci_bus_distribute_available_resources(struct pci_bus *bus, struct pci_dev *dev, *bridge = bus->self; resource_size_t io_per_hp, mmio_per_hp, mmio_pref_per_hp, align; - io_res = &bridge->resource[PCI_BRIDGE_RESOURCES + 0]; - mmio_res = &bridge->resource[PCI_BRIDGE_RESOURCES + 1]; - mmio_pref_res = &bridge->resource[PCI_BRIDGE_RESOURCES + 2]; + io_res = &bridge->resource[PCI_BRIDGE_IO_WINDOW]; + mmio_res = &bridge->resource[PCI_BRIDGE_MEM_WINDOW]; + mmio_pref_res = &bridge->resource[PCI_BRIDGE_PREF_MEM_WINDOW]; /* * The alignment of this bridge is yet to be considered, hence it must @@ -1960,21 +1968,21 @@ static void pci_bus_distribute_available_resources(struct pci_bus *bus, * Reduce the available resource space by what the * bridge and devices below it occupy. */ - res = &dev->resource[PCI_BRIDGE_RESOURCES + 0]; + res = &dev->resource[PCI_BRIDGE_IO_WINDOW]; align = pci_resource_alignment(dev, res); align = align ? ALIGN(io.start, align) - io.start : 0; used_size = align + resource_size(res); if (!res->parent) io.start = min(io.start + used_size, io.end + 1); - res = &dev->resource[PCI_BRIDGE_RESOURCES + 1]; + res = &dev->resource[PCI_BRIDGE_MEM_WINDOW]; align = pci_resource_alignment(dev, res); align = align ? ALIGN(mmio.start, align) - mmio.start : 0; used_size = align + resource_size(res); if (!res->parent) mmio.start = min(mmio.start + used_size, mmio.end + 1); - res = &dev->resource[PCI_BRIDGE_RESOURCES + 2]; + res = &dev->resource[PCI_BRIDGE_PREF_MEM_WINDOW]; align = pci_resource_alignment(dev, res); align = align ? ALIGN(mmio_pref.start, align) - mmio_pref.start : 0; @@ -2027,9 +2035,9 @@ static void pci_bridge_distribute_available_resources(struct pci_dev *bridge, return; /* Take the initial extra resources from the hotplug port */ - available_io = bridge->resource[PCI_BRIDGE_RESOURCES + 0]; - available_mmio = bridge->resource[PCI_BRIDGE_RESOURCES + 1]; - available_mmio_pref = bridge->resource[PCI_BRIDGE_RESOURCES + 2]; + available_io = bridge->resource[PCI_BRIDGE_IO_WINDOW]; + available_mmio = bridge->resource[PCI_BRIDGE_MEM_WINDOW]; + available_mmio_pref = bridge->resource[PCI_BRIDGE_PREF_MEM_WINDOW]; pci_bus_distribute_available_resources(bridge->subordinate, add_list, available_io, diff --git a/drivers/pcmcia/yenta_socket.c b/drivers/pcmcia/yenta_socket.c index bf6529b0b5b0..5fe58dac0d1d 100644 --- a/drivers/pcmcia/yenta_socket.c +++ b/drivers/pcmcia/yenta_socket.c @@ -694,7 +694,7 @@ static int yenta_allocate_res(struct yenta_socket *socket, int nr, unsigned type struct pci_bus_region region; unsigned mask; - res = dev->resource + PCI_BRIDGE_RESOURCES + nr; + res = &dev->resource[nr]; /* Already allocated? 
*/ if (res->parent) return 0; @@ -711,7 +711,7 @@ static int yenta_allocate_res(struct yenta_socket *socket, int nr, unsigned type region.end = config_readl(socket, addr_end) | ~mask; if (region.start && region.end > region.start && !override_bios) { pcibios_bus_to_resource(dev->bus, res, &region); - if (pci_claim_resource(dev, PCI_BRIDGE_RESOURCES + nr) == 0) + if (pci_claim_resource(dev, nr) == 0) return 0; dev_info(&dev->dev, "Preassigned resource %d busy or not available, reconfiguring...\n", @@ -751,13 +751,17 @@ static int yenta_allocate_res(struct yenta_socket *socket, int nr, unsigned type static void yenta_allocate_resources(struct yenta_socket *socket) { int program = 0; - program += yenta_allocate_res(socket, 0, IORESOURCE_IO, + program += yenta_allocate_res(socket, PCI_CB_BRIDGE_IO_0_WINDOW, + IORESOURCE_IO, PCI_CB_IO_BASE_0, PCI_CB_IO_LIMIT_0); - program += yenta_allocate_res(socket, 1, IORESOURCE_IO, + program += yenta_allocate_res(socket, PCI_CB_BRIDGE_IO_1_WINDOW, + IORESOURCE_IO, PCI_CB_IO_BASE_1, PCI_CB_IO_LIMIT_1); - program += yenta_allocate_res(socket, 2, IORESOURCE_MEM|IORESOURCE_PREFETCH, + program += yenta_allocate_res(socket, PCI_CB_BRIDGE_MEM_0_WINDOW, + IORESOURCE_MEM | IORESOURCE_PREFETCH, PCI_CB_MEMORY_BASE_0, PCI_CB_MEMORY_LIMIT_0); - program += yenta_allocate_res(socket, 3, IORESOURCE_MEM, + program += yenta_allocate_res(socket, PCI_CB_BRIDGE_MEM_1_WINDOW, + IORESOURCE_MEM, PCI_CB_MEMORY_BASE_1, PCI_CB_MEMORY_LIMIT_1); if (program) pci_setup_cardbus(socket->dev->subordinate); diff --git a/include/linux/pci.h b/include/linux/pci.h index 83ce1cdf5676..cdfb07bfdf7d 100644 --- a/include/linux/pci.h +++ b/include/linux/pci.h @@ -100,9 +100,21 @@ enum { PCI_IOV_RESOURCE_END = PCI_IOV_RESOURCES + PCI_SRIOV_NUM_BARS - 1, #endif - /* Resources assigned to buses behind the bridge */ +/* PCI-to-PCI (P2P) bridge windows */ +#define PCI_BRIDGE_IO_WINDOW (PCI_BRIDGE_RESOURCES + 0) +#define PCI_BRIDGE_MEM_WINDOW (PCI_BRIDGE_RESOURCES + 1) +#define PCI_BRIDGE_PREF_MEM_WINDOW (PCI_BRIDGE_RESOURCES + 2) + +/* CardBus bridge windows */ +#define PCI_CB_BRIDGE_IO_0_WINDOW (PCI_BRIDGE_RESOURCES + 0) +#define PCI_CB_BRIDGE_IO_1_WINDOW (PCI_BRIDGE_RESOURCES + 1) +#define PCI_CB_BRIDGE_MEM_0_WINDOW (PCI_BRIDGE_RESOURCES + 2) +#define PCI_CB_BRIDGE_MEM_1_WINDOW (PCI_BRIDGE_RESOURCES + 3) + +/* Total number of bridge resources for P2P and CardBus */ #define PCI_BRIDGE_RESOURCE_NUM 4 + /* Resources assigned to buses behind the bridge */ PCI_BRIDGE_RESOURCES, PCI_BRIDGE_RESOURCE_END = PCI_BRIDGE_RESOURCES + PCI_BRIDGE_RESOURCE_NUM - 1, -- cgit v1.2.3 From 7bfb399eca460500f048098bf427c45b40e17cae Mon Sep 17 00:00:00 2001 From: Yuval Basson Date: Tue, 19 May 2020 23:51:26 +0300 Subject: qed: Add XRC to RoCE Add support for XRC-SRQs and XRC-QPs for the upper layer driver. We maintain separate bitmaps for srq and xrc-srq resource management; however, the range in FW is a single one: the xrc-srqs come first and the srqs follow. Therefore we maintain an srq-id offset. v2: perform cleanups if XRC bitmap allocation fails. Signed-off-by: Michal Kalderon Signed-off-by: Yuval Bason Signed-off-by: David S.
Miller --- drivers/net/ethernet/qlogic/qed/qed_rdma.c | 145 +++++++++++++++++++++++++---- drivers/net/ethernet/qlogic/qed/qed_rdma.h | 19 ++++ drivers/net/ethernet/qlogic/qed/qed_roce.c | 29 ++++++ include/linux/qed/qed_rdma_if.h | 19 ++++ 4 files changed, 194 insertions(+), 18 deletions(-) (limited to 'include/linux') diff --git a/drivers/net/ethernet/qlogic/qed/qed_rdma.c b/drivers/net/ethernet/qlogic/qed/qed_rdma.c index 415f3f3d2152..50985871cd3d 100644 --- a/drivers/net/ethernet/qlogic/qed/qed_rdma.c +++ b/drivers/net/ethernet/qlogic/qed/qed_rdma.c @@ -212,13 +212,22 @@ static int qed_rdma_alloc(struct qed_hwfn *p_hwfn) goto free_rdma_port; } + /* Allocate bit map for XRC Domains */ + rc = qed_rdma_bmap_alloc(p_hwfn, &p_rdma_info->xrcd_map, + QED_RDMA_MAX_XRCDS, "XRCD"); + if (rc) { + DP_VERBOSE(p_hwfn, QED_MSG_RDMA, + "Failed to allocate xrcd_map,rc = %d\n", rc); + goto free_pd_map; + } + /* Allocate DPI bitmap */ rc = qed_rdma_bmap_alloc(p_hwfn, &p_rdma_info->dpi_map, p_hwfn->dpi_count, "DPI"); if (rc) { DP_VERBOSE(p_hwfn, QED_MSG_RDMA, "Failed to allocate DPI bitmap, rc = %d\n", rc); - goto free_pd_map; + goto free_xrcd_map; } /* Allocate bitmap for cq's. The maximum number of CQs is bound to @@ -271,6 +280,19 @@ static int qed_rdma_alloc(struct qed_hwfn *p_hwfn) goto free_cid_map; } + /* The first SRQ follows the last XRC SRQ. This means that the + * SRQ IDs start from an offset equals to max_xrc_srqs. + */ + p_rdma_info->srq_id_offset = p_hwfn->p_cxt_mngr->xrc_srq_count; + rc = qed_rdma_bmap_alloc(p_hwfn, + &p_rdma_info->xrc_srq_map, + p_hwfn->p_cxt_mngr->xrc_srq_count, "XRC SRQ"); + if (rc) { + DP_VERBOSE(p_hwfn, QED_MSG_RDMA, + "Failed to allocate xrc srq bitmap, rc = %d\n", rc); + goto free_real_cid_map; + } + /* Allocate bitmap for srqs */ p_rdma_info->num_srqs = p_hwfn->p_cxt_mngr->srq_count; rc = qed_rdma_bmap_alloc(p_hwfn, &p_rdma_info->srq_map, @@ -278,7 +300,7 @@ static int qed_rdma_alloc(struct qed_hwfn *p_hwfn) if (rc) { DP_VERBOSE(p_hwfn, QED_MSG_RDMA, "Failed to allocate srq bitmap, rc = %d\n", rc); - goto free_real_cid_map; + goto free_xrc_srq_map; } if (QED_IS_IWARP_PERSONALITY(p_hwfn)) @@ -292,6 +314,8 @@ static int qed_rdma_alloc(struct qed_hwfn *p_hwfn) free_srq_map: kfree(p_rdma_info->srq_map.bitmap); +free_xrc_srq_map: + kfree(p_rdma_info->xrc_srq_map.bitmap); free_real_cid_map: kfree(p_rdma_info->real_cid_map.bitmap); free_cid_map: @@ -304,6 +328,8 @@ free_cq_map: kfree(p_rdma_info->cq_map.bitmap); free_dpi_map: kfree(p_rdma_info->dpi_map.bitmap); +free_xrcd_map: + kfree(p_rdma_info->xrcd_map.bitmap); free_pd_map: kfree(p_rdma_info->pd_map.bitmap); free_rdma_port: @@ -377,6 +403,7 @@ static void qed_rdma_resc_free(struct qed_hwfn *p_hwfn) qed_rdma_bmap_free(p_hwfn, &p_hwfn->p_rdma_info->tid_map, 1); qed_rdma_bmap_free(p_hwfn, &p_hwfn->p_rdma_info->srq_map, 1); qed_rdma_bmap_free(p_hwfn, &p_hwfn->p_rdma_info->real_cid_map, 1); + qed_rdma_bmap_free(p_hwfn, &p_hwfn->p_rdma_info->xrc_srq_map, 1); kfree(p_rdma_info->port); kfree(p_rdma_info->dev); @@ -612,7 +639,10 @@ static int qed_rdma_start_fw(struct qed_hwfn *p_hwfn, p_params_header->cnq_start_offset = (u8)RESC_START(p_hwfn, QED_RDMA_CNQ_RAM); p_params_header->num_cnqs = params->desired_cnq; - + p_params_header->first_reg_srq_id = + cpu_to_le16(p_hwfn->p_rdma_info->srq_id_offset); + p_params_header->reg_srq_base_addr = + cpu_to_le32(qed_cxt_get_ilt_page_size(p_hwfn, ILT_CLI_TSDM)); if (params->cq_mode == QED_RDMA_CQ_MODE_16_BITS) p_params_header->cq_ring_mode = 1; else @@ -983,6 +1013,41 @@ static void 
qed_rdma_free_pd(void *rdma_cxt, u16 pd) spin_unlock_bh(&p_hwfn->p_rdma_info->lock); } +static int qed_rdma_alloc_xrcd(void *rdma_cxt, u16 *xrcd_id) +{ + struct qed_hwfn *p_hwfn = (struct qed_hwfn *)rdma_cxt; + u32 returned_id; + int rc; + + DP_VERBOSE(p_hwfn, QED_MSG_RDMA, "Alloc XRCD\n"); + + spin_lock_bh(&p_hwfn->p_rdma_info->lock); + rc = qed_rdma_bmap_alloc_id(p_hwfn, + &p_hwfn->p_rdma_info->xrcd_map, + &returned_id); + spin_unlock_bh(&p_hwfn->p_rdma_info->lock); + if (rc) { + DP_NOTICE(p_hwfn, "Failed in allocating xrcd id\n"); + return rc; + } + + *xrcd_id = (u16)returned_id; + + DP_VERBOSE(p_hwfn, QED_MSG_RDMA, "Alloc XRCD - done, rc = %d\n", rc); + return rc; +} + +static void qed_rdma_free_xrcd(void *rdma_cxt, u16 xrcd_id) +{ + struct qed_hwfn *p_hwfn = (struct qed_hwfn *)rdma_cxt; + + DP_VERBOSE(p_hwfn, QED_MSG_RDMA, "xrcd_id = %08x\n", xrcd_id); + + spin_lock_bh(&p_hwfn->p_rdma_info->lock); + qed_bmap_release_id(p_hwfn, &p_hwfn->p_rdma_info->xrcd_map, xrcd_id); + spin_unlock_bh(&p_hwfn->p_rdma_info->lock); +} + static enum qed_rdma_toggle_bit qed_rdma_toggle_bit_create_resize_cq(struct qed_hwfn *p_hwfn, u16 icid) { @@ -1306,6 +1371,8 @@ qed_rdma_create_qp(void *rdma_cxt, qp->resp_offloaded = false; qp->e2e_flow_control_en = qp->use_srq ? false : true; qp->stats_queue = in_params->stats_queue; + qp->qp_type = in_params->qp_type; + qp->xrcd_id = in_params->xrcd_id; if (QED_IS_IWARP_PERSONALITY(p_hwfn)) { rc = qed_iwarp_create_qp(p_hwfn, qp, out_params); @@ -1418,6 +1485,18 @@ static int qed_rdma_modify_qp(void *rdma_cxt, qp->cur_state); } + switch (qp->qp_type) { + case QED_RDMA_QP_TYPE_XRC_INI: + qp->has_req = 1; + break; + case QED_RDMA_QP_TYPE_XRC_TGT: + qp->has_resp = 1; + break; + default: + qp->has_req = 1; + qp->has_resp = 1; + } + if (QED_IS_IWARP_PERSONALITY(p_hwfn)) { enum qed_iwarp_qp_state new_state = qed_roce2iwarp_state(qp->cur_state); @@ -1657,6 +1736,15 @@ static void *qed_rdma_get_rdma_ctx(struct qed_dev *cdev) return QED_AFFIN_HWFN(cdev); } +static struct qed_bmap *qed_rdma_get_srq_bmap(struct qed_hwfn *p_hwfn, + bool is_xrc) +{ + if (is_xrc) + return &p_hwfn->p_rdma_info->xrc_srq_map; + + return &p_hwfn->p_rdma_info->srq_map; +} + static int qed_rdma_modify_srq(void *rdma_cxt, struct qed_rdma_modify_srq_in_params *in_params) { @@ -1686,8 +1774,8 @@ static int qed_rdma_modify_srq(void *rdma_cxt, if (rc) return rc; - DP_VERBOSE(p_hwfn, QED_MSG_RDMA, "modified SRQ id = %x", - in_params->srq_id); + DP_VERBOSE(p_hwfn, QED_MSG_RDMA, "modified SRQ id = %x, is_xrc=%u\n", + in_params->srq_id, in_params->is_xrc); return rc; } @@ -1702,6 +1790,7 @@ qed_rdma_destroy_srq(void *rdma_cxt, struct qed_spq_entry *p_ent; struct qed_bmap *bmap; u16 opaque_fid; + u16 offset; int rc; opaque_fid = p_hwfn->hw_info.opaque_fid; @@ -1723,14 +1812,16 @@ qed_rdma_destroy_srq(void *rdma_cxt, if (rc) return rc; - bmap = &p_hwfn->p_rdma_info->srq_map; + bmap = qed_rdma_get_srq_bmap(p_hwfn, in_params->is_xrc); + offset = (in_params->is_xrc) ? 
0 : p_hwfn->p_rdma_info->srq_id_offset; spin_lock_bh(&p_hwfn->p_rdma_info->lock); - qed_bmap_release_id(p_hwfn, bmap, in_params->srq_id); + qed_bmap_release_id(p_hwfn, bmap, in_params->srq_id - offset); spin_unlock_bh(&p_hwfn->p_rdma_info->lock); - DP_VERBOSE(p_hwfn, QED_MSG_RDMA, "SRQ destroyed Id = %x", - in_params->srq_id); + DP_VERBOSE(p_hwfn, QED_MSG_RDMA, + "XRC/SRQ destroyed Id = %x, is_xrc=%u\n", + in_params->srq_id, in_params->is_xrc); return rc; } @@ -1748,24 +1839,26 @@ qed_rdma_create_srq(void *rdma_cxt, u16 opaque_fid, srq_id; struct qed_bmap *bmap; u32 returned_id; + u16 offset; int rc; - bmap = &p_hwfn->p_rdma_info->srq_map; + bmap = qed_rdma_get_srq_bmap(p_hwfn, in_params->is_xrc); spin_lock_bh(&p_hwfn->p_rdma_info->lock); rc = qed_rdma_bmap_alloc_id(p_hwfn, bmap, &returned_id); spin_unlock_bh(&p_hwfn->p_rdma_info->lock); if (rc) { - DP_NOTICE(p_hwfn, "failed to allocate srq id\n"); + DP_NOTICE(p_hwfn, + "failed to allocate xrc/srq id (is_xrc=%u)\n", + in_params->is_xrc); return rc; } - elem_type = QED_ELEM_SRQ; + elem_type = (in_params->is_xrc) ? (QED_ELEM_XRC_SRQ) : (QED_ELEM_SRQ); rc = qed_cxt_dynamic_ilt_alloc(p_hwfn, elem_type, returned_id); if (rc) goto err; - /* returned id is no greater than u16 */ - srq_id = (u16)returned_id; + opaque_fid = p_hwfn->hw_info.opaque_fid; opaque_fid = p_hwfn->hw_info.opaque_fid; @@ -1782,20 +1875,34 @@ qed_rdma_create_srq(void *rdma_cxt, DMA_REGPAIR_LE(p_ramrod->pbl_base_addr, in_params->pbl_base_addr); p_ramrod->pages_in_srq_pbl = cpu_to_le16(in_params->num_pages); p_ramrod->pd_id = cpu_to_le16(in_params->pd_id); - p_ramrod->srq_id.srq_idx = cpu_to_le16(srq_id); p_ramrod->srq_id.opaque_fid = cpu_to_le16(opaque_fid); p_ramrod->page_size = cpu_to_le16(in_params->page_size); DMA_REGPAIR_LE(p_ramrod->producers_addr, in_params->prod_pair_addr); + offset = (in_params->is_xrc) ? 
0 : p_hwfn->p_rdma_info->srq_id_offset; + srq_id = (u16)returned_id + offset; + p_ramrod->srq_id.srq_idx = cpu_to_le16(srq_id); + if (in_params->is_xrc) { + SET_FIELD(p_ramrod->flags, + RDMA_SRQ_CREATE_RAMROD_DATA_XRC_FLAG, 1); + SET_FIELD(p_ramrod->flags, + RDMA_SRQ_CREATE_RAMROD_DATA_RESERVED_KEY_EN, + in_params->reserved_key_en); + p_ramrod->xrc_srq_cq_cid = + cpu_to_le32((p_hwfn->hw_info.opaque_fid << 16) | + in_params->cq_cid); + p_ramrod->xrc_domain = cpu_to_le16(in_params->xrcd_id); + } rc = qed_spq_post(p_hwfn, p_ent, NULL); if (rc) goto err; out_params->srq_id = srq_id; - DP_VERBOSE(p_hwfn, QED_MSG_RDMA, - "SRQ created Id = %x\n", out_params->srq_id); - + DP_VERBOSE(p_hwfn, + QED_MSG_RDMA, + "XRC/SRQ created Id = %x (is_xrc=%u)\n", + out_params->srq_id, in_params->is_xrc); return rc; err: @@ -1961,6 +2068,8 @@ static const struct qed_rdma_ops qed_rdma_ops_pass = { .rdma_cnq_prod_update = &qed_rdma_cnq_prod_update, .rdma_alloc_pd = &qed_rdma_alloc_pd, .rdma_dealloc_pd = &qed_rdma_free_pd, + .rdma_alloc_xrcd = &qed_rdma_alloc_xrcd, + .rdma_dealloc_xrcd = &qed_rdma_free_xrcd, .rdma_create_cq = &qed_rdma_create_cq, .rdma_destroy_cq = &qed_rdma_destroy_cq, .rdma_create_qp = &qed_rdma_create_qp, diff --git a/drivers/net/ethernet/qlogic/qed/qed_rdma.h b/drivers/net/ethernet/qlogic/qed/qed_rdma.h index 3689fe3e5935..5a7ebc764bb6 100644 --- a/drivers/net/ethernet/qlogic/qed/qed_rdma.h +++ b/drivers/net/ethernet/qlogic/qed/qed_rdma.h @@ -63,6 +63,11 @@ #define QED_RDMA_MAX_CQE_32_BIT (0x7FFFFFFF - 1) #define QED_RDMA_MAX_CQE_16_BIT (0x7FFF - 1) +/* Up to 2^16 XRC Domains are supported, but the actual number of supported XRC + * SRQs is much smaller so there's no need to have that many domains. + */ +#define QED_RDMA_MAX_XRCDS (roundup_pow_of_two(RDMA_MAX_XRC_SRQS)) + enum qed_rdma_toggle_bit { QED_RDMA_TOGGLE_BIT_CLEAR = 0, QED_RDMA_TOGGLE_BIT_SET = 1 @@ -81,9 +86,11 @@ struct qed_rdma_info { struct qed_bmap cq_map; struct qed_bmap pd_map; + struct qed_bmap xrcd_map; struct qed_bmap tid_map; struct qed_bmap qp_map; struct qed_bmap srq_map; + struct qed_bmap xrc_srq_map; struct qed_bmap cid_map; struct qed_bmap tcp_cid_map; struct qed_bmap real_cid_map; @@ -111,6 +118,7 @@ struct qed_rdma_qp { u32 qpid; u16 icid; enum qed_roce_qp_state cur_state; + enum qed_rdma_qp_type qp_type; enum qed_iwarp_qp_state iwarp_state; bool use_srq; bool signal_all; @@ -153,18 +161,21 @@ struct qed_rdma_qp { dma_addr_t orq_phys_addr; u8 orq_num_pages; bool req_offloaded; + bool has_req; /* responder */ u8 max_rd_atomic_resp; u32 rq_psn; u16 rq_cq_id; u16 rq_num_pages; + u16 xrcd_id; dma_addr_t rq_pbl_ptr; void *irq; dma_addr_t irq_phys_addr; u8 irq_num_pages; bool resp_offloaded; u32 cq_prod; + bool has_resp; u8 remote_mac_addr[6]; u8 local_mac_addr[6]; @@ -174,6 +185,14 @@ struct qed_rdma_qp { struct qed_iwarp_ep *ep; }; +static inline bool qed_rdma_is_xrc_qp(struct qed_rdma_qp *qp) +{ + if (qp->qp_type == QED_RDMA_QP_TYPE_XRC_TGT || + qp->qp_type == QED_RDMA_QP_TYPE_XRC_INI) + return true; + + return false; +} #if IS_ENABLED(CONFIG_QED_RDMA) void qed_rdma_dpm_bar(struct qed_hwfn *p_hwfn, struct qed_ptt *p_ptt); void qed_rdma_dpm_conf(struct qed_hwfn *p_hwfn, struct qed_ptt *p_ptt); diff --git a/drivers/net/ethernet/qlogic/qed/qed_roce.c b/drivers/net/ethernet/qlogic/qed/qed_roce.c index 475b89903f46..46a4d09eacef 100644 --- a/drivers/net/ethernet/qlogic/qed/qed_roce.c +++ b/drivers/net/ethernet/qlogic/qed/qed_roce.c @@ -254,6 +254,9 @@ static int qed_roce_sp_create_responder(struct qed_hwfn *p_hwfn, int rc; u8 
tc; + if (!qp->has_resp) + return 0; + DP_VERBOSE(p_hwfn, QED_MSG_RDMA, "icid = %08x\n", qp->icid); /* Allocate DMA-able memory for IRQ */ @@ -315,6 +318,10 @@ static int qed_roce_sp_create_responder(struct qed_hwfn *p_hwfn, ROCE_CREATE_QP_RESP_RAMROD_DATA_MIN_RNR_NAK_TIMER, qp->min_rnr_nak_timer); + SET_FIELD(p_ramrod->flags, + ROCE_CREATE_QP_RESP_RAMROD_DATA_XRC_FLAG, + qed_rdma_is_xrc_qp(qp)); + p_ramrod->max_ird = qp->max_rd_atomic_resp; p_ramrod->traffic_class = qp->traffic_class_tos; p_ramrod->hop_limit = qp->hop_limit_ttl; @@ -335,6 +342,7 @@ static int qed_roce_sp_create_responder(struct qed_hwfn *p_hwfn, p_ramrod->qp_handle_for_cqe.lo = cpu_to_le32(qp->qp_handle.lo); p_ramrod->cq_cid = cpu_to_le32((p_hwfn->hw_info.opaque_fid << 16) | qp->rq_cq_id); + p_ramrod->xrc_domain = cpu_to_le16(qp->xrcd_id); tc = qed_roce_get_qp_tc(p_hwfn, qp); regular_latency_queue = qed_get_cm_pq_idx_ofld_mtc(p_hwfn, tc); @@ -395,6 +403,9 @@ static int qed_roce_sp_create_requester(struct qed_hwfn *p_hwfn, int rc; u8 tc; + if (!qp->has_req) + return 0; + DP_VERBOSE(p_hwfn, QED_MSG_RDMA, "icid = %08x\n", qp->icid); /* Allocate DMA-able memory for ORQ */ @@ -444,6 +455,10 @@ static int qed_roce_sp_create_requester(struct qed_hwfn *p_hwfn, ROCE_CREATE_QP_REQ_RAMROD_DATA_RNR_NAK_CNT, qp->rnr_retry_cnt); + SET_FIELD(p_ramrod->flags, + ROCE_CREATE_QP_REQ_RAMROD_DATA_XRC_FLAG, + qed_rdma_is_xrc_qp(qp)); + p_ramrod->max_ord = qp->max_rd_atomic_req; p_ramrod->traffic_class = qp->traffic_class_tos; p_ramrod->hop_limit = qp->hop_limit_ttl; @@ -517,6 +532,9 @@ static int qed_roce_sp_modify_responder(struct qed_hwfn *p_hwfn, struct qed_spq_entry *p_ent; int rc; + if (!qp->has_resp) + return 0; + DP_VERBOSE(p_hwfn, QED_MSG_RDMA, "icid = %08x\n", qp->icid); if (move_to_err && !qp->resp_offloaded) @@ -611,6 +629,9 @@ static int qed_roce_sp_modify_requester(struct qed_hwfn *p_hwfn, struct qed_spq_entry *p_ent; int rc; + if (!qp->has_req) + return 0; + DP_VERBOSE(p_hwfn, QED_MSG_RDMA, "icid = %08x\n", qp->icid); if (move_to_err && !(qp->req_offloaded)) @@ -705,6 +726,11 @@ static int qed_roce_sp_destroy_qp_responder(struct qed_hwfn *p_hwfn, dma_addr_t ramrod_res_phys; int rc; + if (!qp->has_resp) { + *cq_prod = 0; + return 0; + } + DP_VERBOSE(p_hwfn, QED_MSG_RDMA, "icid = %08x\n", qp->icid); *cq_prod = qp->cq_prod; @@ -785,6 +811,9 @@ static int qed_roce_sp_destroy_qp_requester(struct qed_hwfn *p_hwfn, dma_addr_t ramrod_res_phys; int rc = -ENOMEM; + if (!qp->has_req) + return 0; + DP_VERBOSE(p_hwfn, QED_MSG_RDMA, "icid = %08x\n", qp->icid); if (!qp->req_offloaded) diff --git a/include/linux/qed/qed_rdma_if.h b/include/linux/qed/qed_rdma_if.h index 74efca15fde7..f93edd5750a5 100644 --- a/include/linux/qed/qed_rdma_if.h +++ b/include/linux/qed/qed_rdma_if.h @@ -53,6 +53,13 @@ enum qed_roce_qp_state { QED_ROCE_QP_STATE_SQE }; +enum qed_rdma_qp_type { + QED_RDMA_QP_TYPE_RC, + QED_RDMA_QP_TYPE_XRC_INI, + QED_RDMA_QP_TYPE_XRC_TGT, + QED_RDMA_QP_TYPE_INVAL = 0xffff, +}; + enum qed_rdma_tid_type { QED_RDMA_TID_REGISTERED_MR, QED_RDMA_TID_FMR, @@ -291,6 +298,12 @@ struct qed_rdma_create_srq_in_params { u16 num_pages; u16 pd_id; u16 page_size; + + /* XRC related only */ + bool reserved_key_en; + bool is_xrc; + u32 cq_cid; + u16 xrcd_id; }; struct qed_rdma_destroy_cq_in_params { @@ -319,7 +332,9 @@ struct qed_rdma_create_qp_in_params { u16 rq_num_pages; u64 rq_pbl_ptr; u16 srq_id; + u16 xrcd_id; u8 stats_queue; + enum qed_rdma_qp_type qp_type; }; struct qed_rdma_create_qp_out_params { @@ -429,11 +444,13 @@ struct 
qed_rdma_create_srq_out_params { struct qed_rdma_destroy_srq_in_params { u16 srq_id; + bool is_xrc; }; struct qed_rdma_modify_srq_in_params { u32 wqe_limit; u16 srq_id; + bool is_xrc; }; struct qed_rdma_stats_out_params { @@ -611,6 +628,8 @@ struct qed_rdma_ops { int (*rdma_set_rdma_int)(struct qed_dev *cdev, u16 cnt); int (*rdma_alloc_pd)(void *rdma_cxt, u16 *pd); void (*rdma_dealloc_pd)(void *rdma_cxt, u16 pd); + int (*rdma_alloc_xrcd)(void *rdma_cxt, u16 *xrcd); + void (*rdma_dealloc_xrcd)(void *rdma_cxt, u16 xrcd); int (*rdma_create_cq)(void *rdma_cxt, struct qed_rdma_create_cq_in_params *params, u16 *icid); -- cgit v1.2.3 From 8066021915924f58ed338bf38208215f5a7355f6 Mon Sep 17 00:00:00 2001 From: Oleksij Rempel Date: Wed, 20 May 2020 08:29:14 +0200 Subject: ethtool: provide UAPI for PHY Signal Quality Index (SQI) Signal Quality Index is a mandatory value required by "OPEN Alliance SIG" for the 100Base-T1 PHYs [1]. This indicator can be used for cable integrity diagnostics and for investigating other noise sources, and is implemented by at least two vendors: NXP[2] and TI[3]. [1] http://www.opensig.org/download/document/218/Advanced_PHY_features_for_automotive_Ethernet_V1.0.pdf [2] https://www.nxp.com/docs/en/data-sheet/TJA1100.pdf [3] https://www.ti.com/product/DP83TC811R-Q1 Signed-off-by: Oleksij Rempel Reviewed-by: Andrew Lunn Reviewed-by: Michal Kubecek Signed-off-by: David S. Miller --- Documentation/networking/ethtool-netlink.rst | 6 ++- include/linux/phy.h | 2 + include/uapi/linux/ethtool_netlink.h | 2 + net/ethtool/linkstate.c | 75 +++++++++++++++++++++++++++- 4 files changed, 82 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/Documentation/networking/ethtool-netlink.rst b/Documentation/networking/ethtool-netlink.rst index eed46b6aa07d..7e651ea33eab 100644 --- a/Documentation/networking/ethtool-netlink.rst +++ b/Documentation/networking/ethtool-netlink.rst @@ -454,10 +454,12 @@ Request contents: Kernel response contents: - ==================================== ====== ========================== + ==================================== ====== ============================ ``ETHTOOL_A_LINKSTATE_HEADER`` nested reply header ``ETHTOOL_A_LINKSTATE_LINK`` bool link state (up/down) - ==================================== ====== ========================== + ``ETHTOOL_A_LINKSTATE_SQI`` u32 Current Signal Quality Index + ``ETHTOOL_A_LINKSTATE_SQI_MAX`` u32 Max support SQI value + ==================================== ====== ============================ For most NIC drivers, the value of ``ETHTOOL_A_LINKSTATE_LINK`` returns carrier flag provided by ``netif_carrier_ok()`` but there are drivers which diff --git a/include/linux/phy.h b/include/linux/phy.h index 467aa8bf9f64..2bcdf19ed3b4 100644 --- a/include/linux/phy.h +++ b/include/linux/phy.h @@ -723,6 +723,8 @@ struct phy_driver { struct ethtool_tunable *tuna, const void *data); int (*set_loopback)(struct phy_device *dev, bool enable); + int (*get_sqi)(struct phy_device *dev); + int (*get_sqi_max)(struct phy_device *dev); }; #define to_phy_driver(d) container_of(to_mdio_common_driver(d), \ struct phy_driver, mdiodrv) diff --git a/include/uapi/linux/ethtool_netlink.h b/include/uapi/linux/ethtool_netlink.h index 2881af411f76..e6f109b76c9a 100644 --- a/include/uapi/linux/ethtool_netlink.h +++ b/include/uapi/linux/ethtool_netlink.h @@ -232,6 +232,8 @@ enum { ETHTOOL_A_LINKSTATE_UNSPEC, ETHTOOL_A_LINKSTATE_HEADER, /* nest - _A_HEADER_* */ ETHTOOL_A_LINKSTATE_LINK, /* u8 */ + ETHTOOL_A_LINKSTATE_SQI, /* u32 */ +
ETHTOOL_A_LINKSTATE_SQI_MAX, /* u32 */ /* add new constants above here */ __ETHTOOL_A_LINKSTATE_CNT, diff --git a/net/ethtool/linkstate.c b/net/ethtool/linkstate.c index 2740cde0a182..7f47ba89054e 100644 --- a/net/ethtool/linkstate.c +++ b/net/ethtool/linkstate.c @@ -2,6 +2,7 @@ #include "netlink.h" #include "common.h" +#include struct linkstate_req_info { struct ethnl_req_info base; @@ -10,6 +11,8 @@ struct linkstate_req_info { struct linkstate_reply_data { struct ethnl_reply_data base; int link; + int sqi; + int sqi_max; }; #define LINKSTATE_REPDATA(__reply_base) \ @@ -20,8 +23,46 @@ linkstate_get_policy[ETHTOOL_A_LINKSTATE_MAX + 1] = { [ETHTOOL_A_LINKSTATE_UNSPEC] = { .type = NLA_REJECT }, [ETHTOOL_A_LINKSTATE_HEADER] = { .type = NLA_NESTED }, [ETHTOOL_A_LINKSTATE_LINK] = { .type = NLA_REJECT }, + [ETHTOOL_A_LINKSTATE_SQI] = { .type = NLA_REJECT }, + [ETHTOOL_A_LINKSTATE_SQI_MAX] = { .type = NLA_REJECT }, }; +static int linkstate_get_sqi(struct net_device *dev) +{ + struct phy_device *phydev = dev->phydev; + int ret; + + if (!phydev) + return -EOPNOTSUPP; + + mutex_lock(&phydev->lock); + if (!phydev->drv || !phydev->drv->get_sqi) + ret = -EOPNOTSUPP; + else + ret = phydev->drv->get_sqi(phydev); + mutex_unlock(&phydev->lock); + + return ret; +} + +static int linkstate_get_sqi_max(struct net_device *dev) +{ + struct phy_device *phydev = dev->phydev; + int ret; + + if (!phydev) + return -EOPNOTSUPP; + + mutex_lock(&phydev->lock); + if (!phydev->drv || !phydev->drv->get_sqi_max) + ret = -EOPNOTSUPP; + else + ret = phydev->drv->get_sqi_max(phydev); + mutex_unlock(&phydev->lock); + + return ret; +} + static int linkstate_prepare_data(const struct ethnl_req_info *req_base, struct ethnl_reply_data *reply_base, struct genl_info *info) @@ -34,6 +75,19 @@ static int linkstate_prepare_data(const struct ethnl_req_info *req_base, if (ret < 0) return ret; data->link = __ethtool_get_link(dev); + + ret = linkstate_get_sqi(dev); + if (ret < 0 && ret != -EOPNOTSUPP) + return ret; + + data->sqi = ret; + + ret = linkstate_get_sqi_max(dev); + if (ret < 0 && ret != -EOPNOTSUPP) + return ret; + + data->sqi_max = ret; + ethnl_ops_complete(dev); return 0; @@ -42,8 +96,19 @@ static int linkstate_prepare_data(const struct ethnl_req_info *req_base, static int linkstate_reply_size(const struct ethnl_req_info *req_base, const struct ethnl_reply_data *reply_base) { - return nla_total_size(sizeof(u8)) /* LINKSTATE_LINK */ + struct linkstate_reply_data *data = LINKSTATE_REPDATA(reply_base); + int len; + + len = nla_total_size(sizeof(u8)) /* LINKSTATE_LINK */ + 0; + + if (data->sqi != -EOPNOTSUPP) + len += nla_total_size(sizeof(u32)); + + if (data->sqi_max != -EOPNOTSUPP) + len += nla_total_size(sizeof(u32)); + + return len; } static int linkstate_fill_reply(struct sk_buff *skb, @@ -56,6 +121,14 @@ static int linkstate_fill_reply(struct sk_buff *skb, nla_put_u8(skb, ETHTOOL_A_LINKSTATE_LINK, !!data->link)) return -EMSGSIZE; + if (data->sqi != -EOPNOTSUPP && + nla_put_u32(skb, ETHTOOL_A_LINKSTATE_SQI, data->sqi)) + return -EMSGSIZE; + + if (data->sqi_max != -EOPNOTSUPP && + nla_put_u32(skb, ETHTOOL_A_LINKSTATE_SQI_MAX, data->sqi_max)) + return -EMSGSIZE; + return 0; } -- cgit v1.2.3 From 560e3a045961ed0c7184ef9f6a93b95bd38c1c48 Mon Sep 17 00:00:00 2001 From: Bhaumik Bhatt Date: Thu, 21 May 2020 22:32:40 +0530 Subject: bus: mhi: core: Handle firmware load using state worker Upon power up, driver queues firmware worker thread if the execution environment is PBL. 
Firmware worker is blocked with a timeout until state worker gets a chance to run and unblock firmware worker. An endpoint power up failure can be seen if state worker gets a chance to run after firmware worker has timed out. Remove this dependency and handle firmware load directly using state worker thread. Signed-off-by: Bhaumik Bhatt Reviewed-by: Jeffrey Hugo Reviewed-by: Manivannan Sadhasivam Signed-off-by: Manivannan Sadhasivam Link: https://lore.kernel.org/r/20200521170249.21795-6-manivannan.sadhasivam@linaro.org Signed-off-by: Greg Kroah-Hartman --- drivers/bus/mhi/core/boot.c | 18 +++--------------- drivers/bus/mhi/core/init.c | 1 - drivers/bus/mhi/core/internal.h | 1 + drivers/bus/mhi/core/pm.c | 6 +----- include/linux/mhi.h | 2 -- 5 files changed, 5 insertions(+), 23 deletions(-) (limited to 'include/linux') diff --git a/drivers/bus/mhi/core/boot.c b/drivers/bus/mhi/core/boot.c index ebad5eb48e5a..17c636b4bc6e 100644 --- a/drivers/bus/mhi/core/boot.c +++ b/drivers/bus/mhi/core/boot.c @@ -377,30 +377,18 @@ static void mhi_firmware_copy(struct mhi_controller *mhi_cntrl, } } -void mhi_fw_load_worker(struct work_struct *work) +void mhi_fw_load_handler(struct mhi_controller *mhi_cntrl) { - struct mhi_controller *mhi_cntrl; const struct firmware *firmware = NULL; struct image_info *image_info; - struct device *dev; + struct device *dev = &mhi_cntrl->mhi_dev->dev; const char *fw_name; void *buf; dma_addr_t dma_addr; size_t size; int ret; - mhi_cntrl = container_of(work, struct mhi_controller, fw_worker); - dev = &mhi_cntrl->mhi_dev->dev; - - dev_dbg(dev, "Waiting for device to enter PBL from: %s\n", - TO_MHI_EXEC_STR(mhi_cntrl->ee)); - - ret = wait_event_timeout(mhi_cntrl->state_event, - MHI_IN_PBL(mhi_cntrl->ee) || - MHI_PM_IN_ERROR_STATE(mhi_cntrl->pm_state), - msecs_to_jiffies(mhi_cntrl->timeout_ms)); - - if (!ret || MHI_PM_IN_ERROR_STATE(mhi_cntrl->pm_state)) { + if (MHI_PM_IN_ERROR_STATE(mhi_cntrl->pm_state)) { dev_err(dev, "Device MHI is not in valid state\n"); return; } diff --git a/drivers/bus/mhi/core/init.c b/drivers/bus/mhi/core/init.c index 1a93d24efffc..6882206ad80e 100644 --- a/drivers/bus/mhi/core/init.c +++ b/drivers/bus/mhi/core/init.c @@ -835,7 +835,6 @@ int mhi_register_controller(struct mhi_controller *mhi_cntrl, spin_lock_init(&mhi_cntrl->wlock); INIT_WORK(&mhi_cntrl->st_worker, mhi_pm_st_worker); INIT_WORK(&mhi_cntrl->syserr_worker, mhi_pm_sys_err_worker); - INIT_WORK(&mhi_cntrl->fw_worker, mhi_fw_load_worker); init_waitqueue_head(&mhi_cntrl->state_event); mhi_cmd = mhi_cntrl->mhi_cmd; diff --git a/drivers/bus/mhi/core/internal.h b/drivers/bus/mhi/core/internal.h index 40c47f918ca3..0965ca3c9632 100644 --- a/drivers/bus/mhi/core/internal.h +++ b/drivers/bus/mhi/core/internal.h @@ -627,6 +627,7 @@ int mhi_init_irq_setup(struct mhi_controller *mhi_cntrl); void mhi_deinit_free_irq(struct mhi_controller *mhi_cntrl); void mhi_rddm_prepare(struct mhi_controller *mhi_cntrl, struct image_info *img_info); +void mhi_fw_load_handler(struct mhi_controller *mhi_cntrl); int mhi_prepare_channel(struct mhi_controller *mhi_cntrl, struct mhi_chan *mhi_chan); int mhi_init_chan_ctxt(struct mhi_controller *mhi_cntrl, diff --git a/drivers/bus/mhi/core/pm.c b/drivers/bus/mhi/core/pm.c index e7c831867a23..3cc238ad693d 100644 --- a/drivers/bus/mhi/core/pm.c +++ b/drivers/bus/mhi/core/pm.c @@ -528,7 +528,6 @@ static void mhi_pm_disable_transition(struct mhi_controller *mhi_cntrl, dev_dbg(dev, "Waiting for all pending threads to complete\n"); wake_up_all(&mhi_cntrl->state_event); 
flush_work(&mhi_cntrl->st_worker); - flush_work(&mhi_cntrl->fw_worker); dev_dbg(dev, "Reset all active channels and remove MHI devices\n"); device_for_each_child(mhi_cntrl->cntrl_dev, NULL, mhi_destroy_device); @@ -643,7 +642,7 @@ void mhi_pm_st_worker(struct work_struct *work) mhi_cntrl->ee = mhi_get_exec_env(mhi_cntrl); write_unlock_irq(&mhi_cntrl->pm_lock); if (MHI_IN_PBL(mhi_cntrl->ee)) - wake_up_all(&mhi_cntrl->state_event); + mhi_fw_load_handler(mhi_cntrl); break; case DEV_ST_TRANSITION_SBL: write_lock_irq(&mhi_cntrl->pm_lock); @@ -976,9 +975,6 @@ int mhi_async_power_up(struct mhi_controller *mhi_cntrl) next_state = MHI_IN_PBL(current_ee) ? DEV_ST_TRANSITION_PBL : DEV_ST_TRANSITION_READY; - if (next_state == DEV_ST_TRANSITION_PBL) - schedule_work(&mhi_cntrl->fw_worker); - mhi_queue_state_transition(mhi_cntrl, next_state); mutex_unlock(&mhi_cntrl->pm_mutex); diff --git a/include/linux/mhi.h b/include/linux/mhi.h index b0739ad1bae4..8289202b8cbe 100644 --- a/include/linux/mhi.h +++ b/include/linux/mhi.h @@ -331,7 +331,6 @@ struct mhi_controller_config { * @wlock: Lock for protecting device wakeup * @mhi_link_info: Device bandwidth info * @st_worker: State transition worker - * @fw_worker: Firmware download worker * @syserr_worker: System error worker * @state_event: State change event * @status_cb: CB function to notify power states of the device (required) @@ -412,7 +411,6 @@ struct mhi_controller { spinlock_t wlock; struct mhi_link_info mhi_link_info; struct work_struct st_worker; - struct work_struct fw_worker; struct work_struct syserr_worker; wait_queue_head_t state_event; -- cgit v1.2.3 From bc7ccce5a5192cf277da0aef05e45cd92c81c79a Mon Sep 17 00:00:00 2001 From: Hemant Kumar Date: Thu, 21 May 2020 22:32:44 +0530 Subject: bus: mhi: core: Remove the system error worker thread Remove the system error worker thread and instead have the execution environment worker handle that transition to serialize processing and avoid any possible race conditions during shutdown. 
Signed-off-by: Hemant Kumar Reviewed-by: Jeffrey Hugo Reviewed-by: Manivannan Sadhasivam Signed-off-by: Manivannan Sadhasivam Link: https://lore.kernel.org/r/20200521170249.21795-10-manivannan.sadhasivam@linaro.org Signed-off-by: Greg Kroah-Hartman --- drivers/bus/mhi/core/init.c | 2 +- drivers/bus/mhi/core/internal.h | 3 ++- drivers/bus/mhi/core/main.c | 6 +++--- drivers/bus/mhi/core/pm.c | 32 ++++++++++++++------------------ include/linux/mhi.h | 2 -- 5 files changed, 20 insertions(+), 25 deletions(-) (limited to 'include/linux') diff --git a/drivers/bus/mhi/core/init.c b/drivers/bus/mhi/core/init.c index 6882206ad80e..3a853c5d2103 100644 --- a/drivers/bus/mhi/core/init.c +++ b/drivers/bus/mhi/core/init.c @@ -34,6 +34,7 @@ const char * const dev_state_tran_str[DEV_ST_TRANSITION_MAX] = { [DEV_ST_TRANSITION_READY] = "READY", [DEV_ST_TRANSITION_SBL] = "SBL", [DEV_ST_TRANSITION_MISSION_MODE] = "MISSION_MODE", + [DEV_ST_TRANSITION_SYS_ERR] = "SYS_ERR", }; const char * const mhi_state_str[MHI_STATE_MAX] = { @@ -834,7 +835,6 @@ int mhi_register_controller(struct mhi_controller *mhi_cntrl, spin_lock_init(&mhi_cntrl->transition_lock); spin_lock_init(&mhi_cntrl->wlock); INIT_WORK(&mhi_cntrl->st_worker, mhi_pm_st_worker); - INIT_WORK(&mhi_cntrl->syserr_worker, mhi_pm_sys_err_worker); init_waitqueue_head(&mhi_cntrl->state_event); mhi_cmd = mhi_cntrl->mhi_cmd; diff --git a/drivers/bus/mhi/core/internal.h b/drivers/bus/mhi/core/internal.h index 80b32c20149c..f01283b8a451 100644 --- a/drivers/bus/mhi/core/internal.h +++ b/drivers/bus/mhi/core/internal.h @@ -386,6 +386,7 @@ enum dev_st_transition { DEV_ST_TRANSITION_READY, DEV_ST_TRANSITION_SBL, DEV_ST_TRANSITION_MISSION_MODE, + DEV_ST_TRANSITION_SYS_ERR, DEV_ST_TRANSITION_MAX, }; @@ -587,7 +588,7 @@ enum mhi_ee_type mhi_get_exec_env(struct mhi_controller *mhi_cntrl); int mhi_queue_state_transition(struct mhi_controller *mhi_cntrl, enum dev_st_transition state); void mhi_pm_st_worker(struct work_struct *work); -void mhi_pm_sys_err_worker(struct work_struct *work); +void mhi_pm_sys_err_handler(struct mhi_controller *mhi_cntrl); void mhi_fw_load_worker(struct work_struct *work); int mhi_ready_state_transition(struct mhi_controller *mhi_cntrl); void mhi_ctrl_ev_task(unsigned long data); diff --git a/drivers/bus/mhi/core/main.c b/drivers/bus/mhi/core/main.c index 6a8066640e78..9ec9b3601592 100644 --- a/drivers/bus/mhi/core/main.c +++ b/drivers/bus/mhi/core/main.c @@ -406,7 +406,7 @@ irqreturn_t mhi_intvec_threaded_handler(int irq_number, void *dev) if (MHI_IN_PBL(ee)) mhi_cntrl->status_cb(mhi_cntrl, MHI_CB_FATAL_ERROR); else - schedule_work(&mhi_cntrl->syserr_worker); + mhi_pm_sys_err_handler(mhi_cntrl); } exit_intvec: @@ -734,7 +734,7 @@ int mhi_process_ctrl_ev_ring(struct mhi_controller *mhi_cntrl, MHI_PM_SYS_ERR_DETECT); write_unlock_irq(&mhi_cntrl->pm_lock); if (new_state == MHI_PM_SYS_ERR_DETECT) - schedule_work(&mhi_cntrl->syserr_worker); + mhi_pm_sys_err_handler(mhi_cntrl); break; } default: @@ -920,7 +920,7 @@ void mhi_ctrl_ev_task(unsigned long data) } write_unlock_irq(&mhi_cntrl->pm_lock); if (pm_state == MHI_PM_SYS_ERR_DETECT) - schedule_work(&mhi_cntrl->syserr_worker); + mhi_pm_sys_err_handler(mhi_cntrl); } } diff --git a/drivers/bus/mhi/core/pm.c b/drivers/bus/mhi/core/pm.c index 3cc238ad693d..d9964d4c4eed 100644 --- a/drivers/bus/mhi/core/pm.c +++ b/drivers/bus/mhi/core/pm.c @@ -449,19 +449,8 @@ static void mhi_pm_disable_transition(struct mhi_controller *mhi_cntrl, to_mhi_pm_state_str(transition_state)); /* We must notify MHI control driver 
so it can clean up first */ - if (transition_state == MHI_PM_SYS_ERR_PROCESS) { - /* - * If controller supports RDDM, we do not process - * SYS error state, instead we will jump directly - * to RDDM state - */ - if (mhi_cntrl->rddm_image) { - dev_dbg(dev, - "Controller supports RDDM, so skip SYS_ERR\n"); - return; - } + if (transition_state == MHI_PM_SYS_ERR_PROCESS) mhi_cntrl->status_cb(mhi_cntrl, MHI_CB_SYS_ERROR); - } mutex_lock(&mhi_cntrl->pm_mutex); write_lock_irq(&mhi_cntrl->pm_lock); @@ -527,7 +516,6 @@ static void mhi_pm_disable_transition(struct mhi_controller *mhi_cntrl, mutex_unlock(&mhi_cntrl->pm_mutex); dev_dbg(dev, "Waiting for all pending threads to complete\n"); wake_up_all(&mhi_cntrl->state_event); - flush_work(&mhi_cntrl->st_worker); dev_dbg(dev, "Reset all active channels and remove MHI devices\n"); device_for_each_child(mhi_cntrl->cntrl_dev, NULL, mhi_destroy_device); @@ -607,13 +595,17 @@ int mhi_queue_state_transition(struct mhi_controller *mhi_cntrl, } /* SYS_ERR worker */ -void mhi_pm_sys_err_worker(struct work_struct *work) +void mhi_pm_sys_err_handler(struct mhi_controller *mhi_cntrl) { - struct mhi_controller *mhi_cntrl = container_of(work, - struct mhi_controller, - syserr_worker); + struct device *dev = &mhi_cntrl->mhi_dev->dev; + + /* skip if controller supports RDDM */ + if (mhi_cntrl->rddm_image) { + dev_dbg(dev, "Controller supports RDDM, skip SYS_ERROR\n"); + return; + } - mhi_pm_disable_transition(mhi_cntrl, MHI_PM_SYS_ERR_PROCESS); + mhi_queue_state_transition(mhi_cntrl, DEV_ST_TRANSITION_SYS_ERR); } /* Device State Transition worker */ @@ -661,6 +653,10 @@ void mhi_pm_st_worker(struct work_struct *work) case DEV_ST_TRANSITION_READY: mhi_ready_state_transition(mhi_cntrl); break; + case DEV_ST_TRANSITION_SYS_ERR: + mhi_pm_disable_transition + (mhi_cntrl, MHI_PM_SYS_ERR_PROCESS); + break; default: break; } diff --git a/include/linux/mhi.h b/include/linux/mhi.h index 8289202b8cbe..c4a940d98912 100644 --- a/include/linux/mhi.h +++ b/include/linux/mhi.h @@ -331,7 +331,6 @@ struct mhi_controller_config { * @wlock: Lock for protecting device wakeup * @mhi_link_info: Device bandwidth info * @st_worker: State transition worker - * @syserr_worker: System error worker * @state_event: State change event * @status_cb: CB function to notify power states of the device (required) * @wake_get: CB function to assert device wake (optional) @@ -411,7 +410,6 @@ struct mhi_controller { spinlock_t wlock; struct mhi_link_info mhi_link_info; struct work_struct st_worker; - struct work_struct syserr_worker; wait_queue_head_t state_event; void (*status_cb)(struct mhi_controller *mhi_cntrl, -- cgit v1.2.3 From 8786eda9a97763c768e2e81cfd7b4f420281b6e1 Mon Sep 17 00:00:00 2001 From: Bjorn Helgaas Date: Thu, 21 May 2020 13:05:40 -0500 Subject: misc: rtsx: Remove unused pcr_ops Remove the following unused function pointers from struct pcr_ops: int (*set_ltr_latency)(struct rtsx_pcr *pcr, u32 latency); int (*set_l1off_sub)(struct rtsx_pcr *pcr, u8 val); void (*full_on)(struct rtsx_pcr *pcr); void (*power_saving)(struct rtsx_pcr *pcr); Signed-off-by: Bjorn Helgaas Link: https://lore.kernel.org/r/20200521180545.1159896-2-helgaas@kernel.org Signed-off-by: Greg Kroah-Hartman --- drivers/misc/cardreader/rtsx_pcr.c | 15 +++------------ include/linux/rtsx_pci.h | 4 ---- 2 files changed, 3 insertions(+), 16 deletions(-) (limited to 'include/linux') diff --git a/drivers/misc/cardreader/rtsx_pcr.c b/drivers/misc/cardreader/rtsx_pcr.c index 06038b325b02..7145de2504c2 100644 --- 
a/drivers/misc/cardreader/rtsx_pcr.c +++ b/drivers/misc/cardreader/rtsx_pcr.c @@ -85,10 +85,7 @@ static int rtsx_comm_set_ltr_latency(struct rtsx_pcr *pcr, u32 latency) int rtsx_set_ltr_latency(struct rtsx_pcr *pcr, u32 latency) { - if (pcr->ops->set_ltr_latency) - return pcr->ops->set_ltr_latency(pcr, latency); - else - return rtsx_comm_set_ltr_latency(pcr, latency); + return rtsx_comm_set_ltr_latency(pcr, latency); } static void rtsx_comm_set_aspm(struct rtsx_pcr *pcr, bool enable) @@ -151,10 +148,7 @@ static void rtsx_comm_pm_full_on(struct rtsx_pcr *pcr) static void rtsx_pm_full_on(struct rtsx_pcr *pcr) { - if (pcr->ops->full_on) - pcr->ops->full_on(pcr); - else - rtsx_comm_pm_full_on(pcr); + rtsx_comm_pm_full_on(pcr); } void rtsx_pci_start_run(struct rtsx_pcr *pcr) @@ -1108,10 +1102,7 @@ static void rtsx_comm_pm_power_saving(struct rtsx_pcr *pcr) static void rtsx_pm_power_saving(struct rtsx_pcr *pcr) { - if (pcr->ops->power_saving) - pcr->ops->power_saving(pcr); - else - rtsx_comm_pm_power_saving(pcr); + rtsx_comm_pm_power_saving(pcr); } static void rtsx_pci_idle_work(struct work_struct *work) diff --git a/include/linux/rtsx_pci.h b/include/linux/rtsx_pci.h index 65b8142a7fed..df86afb2b4fe 100644 --- a/include/linux/rtsx_pci.h +++ b/include/linux/rtsx_pci.h @@ -1080,11 +1080,7 @@ struct pcr_ops { void (*stop_cmd)(struct rtsx_pcr *pcr); void (*set_aspm)(struct rtsx_pcr *pcr, bool enable); - int (*set_ltr_latency)(struct rtsx_pcr *pcr, u32 latency); - int (*set_l1off_sub)(struct rtsx_pcr *pcr, u8 val); void (*set_l1off_cfg_sub_d0)(struct rtsx_pcr *pcr, int active); - void (*full_on)(struct rtsx_pcr *pcr); - void (*power_saving)(struct rtsx_pcr *pcr); void (*enable_ocp)(struct rtsx_pcr *pcr); void (*disable_ocp)(struct rtsx_pcr *pcr); void (*init_ocp)(struct rtsx_pcr *pcr); -- cgit v1.2.3 From 51876e22bf7f8f5d2b9ca7d2b3dcbfaaac2991a9 Mon Sep 17 00:00:00 2001 From: Bjorn Helgaas Date: Thu, 21 May 2020 13:05:41 -0500 Subject: misc: rtsx: Removed unused dev_aspm_mode The struct rtsx_cr_option.dev_aspm_mode member is never set to anything other than DEV_ASPM_DYNAMIC (0). Remove it and code that tests it. No functional change intended. 
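Since dev_aspm_mode could only ever hold DEV_ASPM_DYNAMIC, every branch that tested it collapses to the dynamic arm. A condensed sketch of the common helper after this patch, restating the rtsx_pcr.c hunk below rather than introducing new behavior:

/* Sketch of rtsx_comm_set_aspm() once dev_aspm_mode is removed: only
 * the dynamic enable/disable path survives.
 */
static void rtsx_comm_set_aspm_sketch(struct rtsx_pcr *pcr, bool enable)
{
	if (pcr->aspm_enabled == enable)
		return;			/* already in the requested state */

	if (enable)
		rtsx_pci_enable_aspm(pcr);
	else
		rtsx_pci_disable_aspm(pcr);

	pcr->aspm_enabled = enable;
}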
Signed-off-by: Bjorn Helgaas Link: https://lore.kernel.org/r/20200521180545.1159896-3-helgaas@kernel.org Signed-off-by: Greg Kroah-Hartman --- drivers/misc/cardreader/rts5249.c | 19 ++++--------------- drivers/misc/cardreader/rts5260.c | 18 ++++-------------- drivers/misc/cardreader/rts5261.c | 36 ++++++------------------------------ drivers/misc/cardreader/rtsx_pcr.c | 19 ++++--------------- include/linux/rtsx_pci.h | 9 --------- 5 files changed, 18 insertions(+), 83 deletions(-) (limited to 'include/linux') diff --git a/drivers/misc/cardreader/rts5249.c b/drivers/misc/cardreader/rts5249.c index 1a81cda948c1..5171fdd92f3c 100644 --- a/drivers/misc/cardreader/rts5249.c +++ b/drivers/misc/cardreader/rts5249.c @@ -349,25 +349,15 @@ static int rtsx_base_switch_output_voltage(struct rtsx_pcr *pcr, u8 voltage) static void rts5249_set_aspm(struct rtsx_pcr *pcr, bool enable) { - struct rtsx_cr_option *option = &pcr->option; u8 val = 0; if (pcr->aspm_enabled == enable) return; - if (option->dev_aspm_mode == DEV_ASPM_DYNAMIC) { - if (enable) - val = pcr->aspm_en; - rtsx_pci_update_cfg_byte(pcr, - pcr->pcie_cap + PCI_EXP_LNKCTL, - ASPM_MASK_NEG, val); - } else if (option->dev_aspm_mode == DEV_ASPM_BACKDOOR) { - u8 mask = FORCE_ASPM_VAL_MASK | FORCE_ASPM_CTL0; - - if (!enable) - val = FORCE_ASPM_CTL0; - rtsx_pci_write_register(pcr, ASPM_FORCE_CTL, mask, val); - } + if (enable) + val = pcr->aspm_en; + rtsx_pci_update_cfg_byte(pcr, pcr->pcie_cap + PCI_EXP_LNKCTL, + ASPM_MASK_NEG, val); pcr->aspm_enabled = enable; } @@ -471,7 +461,6 @@ void rts5249_init_params(struct rtsx_pcr *pcr) option->ltr_active_latency = LTR_ACTIVE_LATENCY_DEF; option->ltr_idle_latency = LTR_IDLE_LATENCY_DEF; option->ltr_l1off_latency = LTR_L1OFF_LATENCY_DEF; - option->dev_aspm_mode = DEV_ASPM_DYNAMIC; option->l1_snooze_delay = L1_SNOOZE_DELAY_DEF; option->ltr_l1off_sspwrgate = LTR_L1OFF_SSPWRGATE_5249_DEF; option->ltr_l1off_snooze_sspwrgate = diff --git a/drivers/misc/cardreader/rts5260.c b/drivers/misc/cardreader/rts5260.c index 711054ebad74..b4c1386382ea 100644 --- a/drivers/misc/cardreader/rts5260.c +++ b/drivers/misc/cardreader/rts5260.c @@ -572,24 +572,15 @@ static int rts5260_extra_init_hw(struct rtsx_pcr *pcr) static void rts5260_set_aspm(struct rtsx_pcr *pcr, bool enable) { - struct rtsx_cr_option *option = &pcr->option; u8 val = 0; if (pcr->aspm_enabled == enable) return; - if (option->dev_aspm_mode == DEV_ASPM_DYNAMIC) { - if (enable) - val = pcr->aspm_en; - rtsx_pci_update_cfg_byte(pcr, pcr->pcie_cap + PCI_EXP_LNKCTL, - ASPM_MASK_NEG, val); - } else if (option->dev_aspm_mode == DEV_ASPM_BACKDOOR) { - u8 mask = FORCE_ASPM_VAL_MASK | FORCE_ASPM_CTL0; - - if (!enable) - val = FORCE_ASPM_CTL0; - rtsx_pci_write_register(pcr, ASPM_FORCE_CTL, mask, val); - } + if (enable) + val = pcr->aspm_en; + rtsx_pci_update_cfg_byte(pcr, pcr->pcie_cap + PCI_EXP_LNKCTL, + ASPM_MASK_NEG, val); pcr->aspm_enabled = enable; } @@ -683,7 +674,6 @@ void rts5260_init_params(struct rtsx_pcr *pcr) option->ltr_active_latency = LTR_ACTIVE_LATENCY_DEF; option->ltr_idle_latency = LTR_IDLE_LATENCY_DEF; option->ltr_l1off_latency = LTR_L1OFF_LATENCY_DEF; - option->dev_aspm_mode = DEV_ASPM_DYNAMIC; option->l1_snooze_delay = L1_SNOOZE_DELAY_DEF; option->ltr_l1off_sspwrgate = LTR_L1OFF_SSPWRGATE_5250_DEF; option->ltr_l1off_snooze_sspwrgate = diff --git a/drivers/misc/cardreader/rts5261.c b/drivers/misc/cardreader/rts5261.c index 4ed86122eaec..a41e805e4c94 100644 --- a/drivers/misc/cardreader/rts5261.c +++ b/drivers/misc/cardreader/rts5261.c @@ -518,51 +518,28 
@@ static int rts5261_extra_init_hw(struct rtsx_pcr *pcr) static void rts5261_enable_aspm(struct rtsx_pcr *pcr, bool enable) { - struct rtsx_cr_option *option = &pcr->option; u8 val = 0; if (pcr->aspm_enabled == enable) return; - if (option->dev_aspm_mode == DEV_ASPM_DYNAMIC) { - val = pcr->aspm_en; - rtsx_pci_update_cfg_byte(pcr, pcr->pcie_cap + PCI_EXP_LNKCTL, - ASPM_MASK_NEG, val); - } else if (option->dev_aspm_mode == DEV_ASPM_BACKDOOR) { - u8 mask = FORCE_ASPM_VAL_MASK | FORCE_ASPM_CTL0; - - val = FORCE_ASPM_CTL0; - val |= (pcr->aspm_en & 0x02); - rtsx_pci_write_register(pcr, ASPM_FORCE_CTL, mask, val); - val = pcr->aspm_en; - rtsx_pci_update_cfg_byte(pcr, pcr->pcie_cap + PCI_EXP_LNKCTL, - ASPM_MASK_NEG, val); - } + val = pcr->aspm_en; + rtsx_pci_update_cfg_byte(pcr, pcr->pcie_cap + PCI_EXP_LNKCTL, + ASPM_MASK_NEG, val); pcr->aspm_enabled = enable; } static void rts5261_disable_aspm(struct rtsx_pcr *pcr, bool enable) { - struct rtsx_cr_option *option = &pcr->option; u8 val = 0; if (pcr->aspm_enabled == enable) return; - if (option->dev_aspm_mode == DEV_ASPM_DYNAMIC) { - val = 0; - rtsx_pci_update_cfg_byte(pcr, pcr->pcie_cap + PCI_EXP_LNKCTL, - ASPM_MASK_NEG, val); - } else if (option->dev_aspm_mode == DEV_ASPM_BACKDOOR) { - u8 mask = FORCE_ASPM_VAL_MASK | FORCE_ASPM_CTL0; - - val = 0; - rtsx_pci_update_cfg_byte(pcr, pcr->pcie_cap + PCI_EXP_LNKCTL, - ASPM_MASK_NEG, val); - val = FORCE_ASPM_CTL0; - rtsx_pci_write_register(pcr, ASPM_FORCE_CTL, mask, val); - } + val = 0; + rtsx_pci_update_cfg_byte(pcr, pcr->pcie_cap + PCI_EXP_LNKCTL, + ASPM_MASK_NEG, val); rtsx_pci_write_register(pcr, SD_CFG1, SD_ASYNC_FIFO_NOT_RST, 0); udelay(10); pcr->aspm_enabled = enable; @@ -789,7 +766,6 @@ void rts5261_init_params(struct rtsx_pcr *pcr) option->l1_snooze_delay = L1_SNOOZE_DELAY_DEF; option->ltr_l1off_sspwrgate = 0x7F; option->ltr_l1off_snooze_sspwrgate = 0x78; - option->dev_aspm_mode = DEV_ASPM_DYNAMIC; option->ocp_en = 1; hw_param->interrupt_en |= SD_OC_INT_EN; diff --git a/drivers/misc/cardreader/rtsx_pcr.c b/drivers/misc/cardreader/rtsx_pcr.c index 7145de2504c2..ce3919e59719 100644 --- a/drivers/misc/cardreader/rtsx_pcr.c +++ b/drivers/misc/cardreader/rtsx_pcr.c @@ -90,24 +90,13 @@ int rtsx_set_ltr_latency(struct rtsx_pcr *pcr, u32 latency) static void rtsx_comm_set_aspm(struct rtsx_pcr *pcr, bool enable) { - struct rtsx_cr_option *option = &pcr->option; - if (pcr->aspm_enabled == enable) return; - if (option->dev_aspm_mode == DEV_ASPM_DYNAMIC) { - if (enable) - rtsx_pci_enable_aspm(pcr); - else - rtsx_pci_disable_aspm(pcr); - } else if (option->dev_aspm_mode == DEV_ASPM_BACKDOOR) { - u8 mask = FORCE_ASPM_VAL_MASK; - u8 val = 0; - - if (enable) - val = pcr->aspm_en; - rtsx_pci_write_register(pcr, ASPM_FORCE_CTL, mask, val); - } + if (enable) + rtsx_pci_enable_aspm(pcr); + else + rtsx_pci_disable_aspm(pcr); pcr->aspm_enabled = enable; } diff --git a/include/linux/rtsx_pci.h b/include/linux/rtsx_pci.h index df86afb2b4fe..a75132a0cf8f 100644 --- a/include/linux/rtsx_pci.h +++ b/include/linux/rtsx_pci.h @@ -1104,13 +1104,6 @@ enum PDEV_STAT {PDEV_STAT_IDLE, PDEV_STAT_RUN}; #define L1_SNOOZE_TEST_EN BIT(5) #define LTR_L1SS_PWR_GATE_CHECK_CARD_EN BIT(6) -enum dev_aspm_mode { - DEV_ASPM_DYNAMIC, - DEV_ASPM_BACKDOOR, - DEV_ASPM_STATIC, - DEV_ASPM_DISABLE, -}; - /* * struct rtsx_cr_option - card reader option * @dev_flags: device flags @@ -1121,7 +1114,6 @@ enum dev_aspm_mode { * @ltr_active_latency: ltr mode active latency * @ltr_idle_latency: ltr mode idle latency * @ltr_l1off_latency: ltr mode l1off 
latency - * @dev_aspm_mode: device aspm mode * @l1_snooze_delay: l1 snooze delay * @ltr_l1off_sspwrgate: ltr l1off sspwrgate * @ltr_l1off_snooze_sspwrgate: ltr l1off snooze sspwrgate @@ -1138,7 +1130,6 @@ struct rtsx_cr_option { u32 ltr_active_latency; u32 ltr_idle_latency; u32 ltr_l1off_latency; - enum dev_aspm_mode dev_aspm_mode; u32 l1_snooze_delay; u8 ltr_l1off_sspwrgate; u8 ltr_l1off_snooze_sspwrgate; -- cgit v1.2.3 From 3d1e7aa80d1c0e7ce8313f21c1e9c14a12d3ba48 Mon Sep 17 00:00:00 2001 From: Bjorn Helgaas Date: Thu, 21 May 2020 13:05:43 -0500 Subject: misc: rtsx: Use pcie_capability_clear_and_set_word() for PCI_EXP_LNKCTL Instead of using the driver-specific rtsx_pci_update_cfg_byte() to update the PCIe Link Control Register, use pcie_capability_clear_and_set_word() like the rest of the kernel does. This makes it easier to maintain ASPM across the PCI core and drivers. Remove the now-unused rtsx_pci_update_cfg_byte() and ASPM_MASK_NEG definitions. No functional change intended. Signed-off-by: Bjorn Helgaas Link: https://lore.kernel.org/r/20200521180545.1159896-5-helgaas@kernel.org Signed-off-by: Greg Kroah-Hartman --- drivers/misc/cardreader/rts5249.c | 9 +++------ drivers/misc/cardreader/rts5260.c | 9 +++------ drivers/misc/cardreader/rts5261.c | 14 ++++---------- drivers/misc/cardreader/rtsx_pcr.c | 8 ++++---- drivers/misc/cardreader/rtsx_pcr.h | 1 - include/linux/rtsx_pci.h | 12 ------------ 6 files changed, 14 insertions(+), 39 deletions(-) (limited to 'include/linux') diff --git a/drivers/misc/cardreader/rts5249.c b/drivers/misc/cardreader/rts5249.c index 5171fdd92f3c..d122608e9f79 100644 --- a/drivers/misc/cardreader/rts5249.c +++ b/drivers/misc/cardreader/rts5249.c @@ -349,15 +349,12 @@ static int rtsx_base_switch_output_voltage(struct rtsx_pcr *pcr, u8 voltage) static void rts5249_set_aspm(struct rtsx_pcr *pcr, bool enable) { - u8 val = 0; - if (pcr->aspm_enabled == enable) return; - if (enable) - val = pcr->aspm_en; - rtsx_pci_update_cfg_byte(pcr, pcr->pcie_cap + PCI_EXP_LNKCTL, - ASPM_MASK_NEG, val); + pcie_capability_clear_and_set_word(pcr->pci, PCI_EXP_LNKCTL, + PCI_EXP_LNKCTL_ASPMC, + enable ? pcr->aspm_en : 0); pcr->aspm_enabled = enable; } diff --git a/drivers/misc/cardreader/rts5260.c b/drivers/misc/cardreader/rts5260.c index b4c1386382ea..75fc67d78d7f 100644 --- a/drivers/misc/cardreader/rts5260.c +++ b/drivers/misc/cardreader/rts5260.c @@ -572,15 +572,12 @@ static int rts5260_extra_init_hw(struct rtsx_pcr *pcr) static void rts5260_set_aspm(struct rtsx_pcr *pcr, bool enable) { - u8 val = 0; - if (pcr->aspm_enabled == enable) return; - if (enable) - val = pcr->aspm_en; - rtsx_pci_update_cfg_byte(pcr, pcr->pcie_cap + PCI_EXP_LNKCTL, - ASPM_MASK_NEG, val); + pcie_capability_clear_and_set_word(pcr->pci, PCI_EXP_LNKCTL, + PCI_EXP_LNKCTL_ASPMC, + enable ? 
pcr->aspm_en : 0); pcr->aspm_enabled = enable; } diff --git a/drivers/misc/cardreader/rts5261.c b/drivers/misc/cardreader/rts5261.c index a41e805e4c94..195822ec858e 100644 --- a/drivers/misc/cardreader/rts5261.c +++ b/drivers/misc/cardreader/rts5261.c @@ -518,28 +518,22 @@ static int rts5261_extra_init_hw(struct rtsx_pcr *pcr) static void rts5261_enable_aspm(struct rtsx_pcr *pcr, bool enable) { - u8 val = 0; - if (pcr->aspm_enabled == enable) return; - val = pcr->aspm_en; - rtsx_pci_update_cfg_byte(pcr, pcr->pcie_cap + PCI_EXP_LNKCTL, - ASPM_MASK_NEG, val); + pcie_capability_clear_and_set_word(pcr->pci, PCI_EXP_LNKCTL, + PCI_EXP_LNKCTL_ASPMC, pcr->aspm_en); pcr->aspm_enabled = enable; } static void rts5261_disable_aspm(struct rtsx_pcr *pcr, bool enable) { - u8 val = 0; - if (pcr->aspm_enabled == enable) return; - val = 0; - rtsx_pci_update_cfg_byte(pcr, pcr->pcie_cap + PCI_EXP_LNKCTL, - ASPM_MASK_NEG, val); + pcie_capability_clear_and_set_word(pcr->pci, PCI_EXP_LNKCTL, + PCI_EXP_LNKCTL_ASPMC, 0); rtsx_pci_write_register(pcr, SD_CFG1, SD_ASYNC_FIFO_NOT_RST, 0); udelay(10); pcr->aspm_enabled = enable; diff --git a/drivers/misc/cardreader/rtsx_pcr.c b/drivers/misc/cardreader/rtsx_pcr.c index afde5499bfb6..a8ce9c3744be 100644 --- a/drivers/misc/cardreader/rtsx_pcr.c +++ b/drivers/misc/cardreader/rtsx_pcr.c @@ -57,14 +57,14 @@ MODULE_DEVICE_TABLE(pci, rtsx_pci_ids); static inline void rtsx_pci_enable_aspm(struct rtsx_pcr *pcr) { - rtsx_pci_update_cfg_byte(pcr, pcr->pcie_cap + PCI_EXP_LNKCTL, - ASPM_MASK_NEG, pcr->aspm_en); + pcie_capability_clear_and_set_word(pcr->pci, PCI_EXP_LNKCTL, + PCI_EXP_LNKCTL_ASPMC, pcr->aspm_en); } static inline void rtsx_pci_disable_aspm(struct rtsx_pcr *pcr) { - rtsx_pci_update_cfg_byte(pcr, pcr->pcie_cap + PCI_EXP_LNKCTL, - ASPM_MASK_NEG, 0); + pcie_capability_clear_and_set_word(pcr->pci, PCI_EXP_LNKCTL, + PCI_EXP_LNKCTL_ASPMC, 0); } static int rtsx_comm_set_ltr_latency(struct rtsx_pcr *pcr, u32 latency) diff --git a/drivers/misc/cardreader/rtsx_pcr.h b/drivers/misc/cardreader/rtsx_pcr.h index ed391df52f4f..024cbd998b2a 100644 --- a/drivers/misc/cardreader/rtsx_pcr.h +++ b/drivers/misc/cardreader/rtsx_pcr.h @@ -29,7 +29,6 @@ #define LTR_L1OFF_SNOOZE_SSPWRGATE_5249_DEF 0xAC #define LTR_L1OFF_SNOOZE_SSPWRGATE_5250_DEF 0xF8 #define CMD_TIMEOUT_DEF 100 -#define ASPM_MASK_NEG 0xFC #define MASK_8_BIT_DEF 0xFF #define SSC_CLOCK_STABLE_WAIT 130 diff --git a/include/linux/rtsx_pci.h b/include/linux/rtsx_pci.h index a75132a0cf8f..e8780d4e4636 100644 --- a/include/linux/rtsx_pci.h +++ b/include/linux/rtsx_pci.h @@ -1307,18 +1307,6 @@ static inline u8 *rtsx_pci_get_cmd_data(struct rtsx_pcr *pcr) return (u8 *)(pcr->host_cmds_ptr); } -static inline int rtsx_pci_update_cfg_byte(struct rtsx_pcr *pcr, int addr, - u8 mask, u8 append) -{ - int err; - u8 val; - - err = pci_read_config_byte(pcr->pci, addr, &val); - if (err < 0) - return err; - return pci_write_config_byte(pcr->pci, addr, (val & mask) | append); -} - static inline void rtsx_pci_write_be32(struct rtsx_pcr *pcr, u16 reg, u32 val) { rtsx_pci_add_cmd(pcr, WRITE_REG_CMD, reg, 0xFF, val >> 24); -- cgit v1.2.3 From d45e3c1a5979efd40dbbac9a5c3586f4fa41f734 Mon Sep 17 00:00:00 2001 From: Lad Prabhakar Date: Thu, 7 May 2020 13:33:16 +0100 Subject: PCI: endpoint: Add support to handle multiple bases for mapping outbound memory The R-Car PCIe controller can map multiple memory regions for outbound memory in the local system, and it limits each region to a single allocation (that is, once a chunk is used from 
a region, that region cannot be used for a new allocation). This feature motivates adding support for handling multiple memory bases in the endpoint framework. With this patch, pci_epc_mem_init() initializes the address space for an endpoint controller that supports a single window, while pci_epc_multi_mem_init() initializes the multiple windows supported by a controller. Link: https://lore.kernel.org/r/1588854799-13710-6-git-send-email-prabhakar.mahadev-lad.rj@bp.renesas.com Signed-off-by: Lad Prabhakar Signed-off-by: Lorenzo Pieralisi Reviewed-by: Yoshihiro Shimoda Acked-by: Kishon Vijay Abraham I --- drivers/pci/controller/dwc/pcie-designware-ep.c | 16 +- drivers/pci/endpoint/pci-epc-mem.c | 199 ++++++++++++++++-------- include/linux/pci-epc.h | 33 ++-- 3 files changed, 170 insertions(+), 78 deletions(-) (limited to 'include/linux') diff --git a/drivers/pci/controller/dwc/pcie-designware-ep.c b/drivers/pci/controller/dwc/pcie-designware-ep.c index 1cdcbd102ce8..a78902cbf2f0 100644 --- a/drivers/pci/controller/dwc/pcie-designware-ep.c +++ b/drivers/pci/controller/dwc/pcie-designware-ep.c @@ -412,11 +412,11 @@ int dw_pcie_ep_raise_msi_irq(struct dw_pcie_ep *ep, u8 func_no, reg = ep->msi_cap + PCI_MSI_DATA_32; msg_data = dw_pcie_readw_dbi(pci, reg); } - aligned_offset = msg_addr_lower & (epc->mem->page_size - 1); + aligned_offset = msg_addr_lower & (epc->mem->window.page_size - 1); msg_addr = ((u64)msg_addr_upper) << 32 | (msg_addr_lower & ~aligned_offset); ret = dw_pcie_ep_map_addr(epc, func_no, ep->msi_mem_phys, msg_addr, - epc->mem->page_size); + epc->mem->window.page_size); if (ret) return ret; @@ -459,9 +459,9 @@ int dw_pcie_ep_raise_msix_irq(struct dw_pcie_ep *ep, u8 func_no, return -EPERM; } - aligned_offset = msg_addr & (epc->mem->page_size - 1); + aligned_offset = msg_addr & (epc->mem->window.page_size - 1); ret = dw_pcie_ep_map_addr(epc, func_no, ep->msi_mem_phys, msg_addr, - epc->mem->page_size); + epc->mem->window.page_size); if (ret) return ret; @@ -477,7 +477,7 @@ void dw_pcie_ep_exit(struct dw_pcie_ep *ep) struct pci_epc *epc = ep->epc; pci_epc_mem_free_addr(epc, ep->msi_mem_phys, ep->msi_mem, - epc->mem->page_size); + epc->mem->window.page_size); pci_epc_mem_exit(epc); } @@ -610,15 +610,15 @@ int dw_pcie_ep_init(struct dw_pcie_ep *ep) if (ret < 0) epc->max_functions = 1; - ret = __pci_epc_mem_init(epc, ep->phys_base, ep->addr_size, - ep->page_size); + ret = pci_epc_mem_init(epc, ep->phys_base, ep->addr_size, + ep->page_size); if (ret < 0) { dev_err(dev, "Failed to initialize address space\n"); return ret; } ep->msi_mem = pci_epc_mem_alloc_addr(epc, &ep->msi_mem_phys, - epc->mem->page_size); + epc->mem->window.page_size); if (!ep->msi_mem) { dev_err(dev, "Failed to reserve memory for MSI/MSI-X\n"); return -ENOMEM; diff --git a/drivers/pci/endpoint/pci-epc-mem.c b/drivers/pci/endpoint/pci-epc-mem.c index cdd1d3821249..80c46f3a4590 100644 --- a/drivers/pci/endpoint/pci-epc-mem.c +++ b/drivers/pci/endpoint/pci-epc-mem.c @@ -23,7 +23,7 @@ static int pci_epc_mem_get_order(struct pci_epc_mem *mem, size_t size) { int order; - unsigned int page_shift = ilog2(mem->page_size); + unsigned int page_shift = ilog2(mem->window.page_size); size--; size >>= page_shift; @@ -36,67 +36,95 @@ static int pci_epc_mem_get_order(struct pci_epc_mem *mem, size_t size) } /** - * __pci_epc_mem_init() - initialize the pci_epc_mem structure + * pci_epc_multi_mem_init() - initialize the pci_epc_mem structure * @epc: the EPC device that invoked pci_epc_mem_init - * @phys_base: the physical address of the base - * @size: the 
size of the address space - * @page_size: size of each page + * @windows: pointer to windows supported by the device + * @num_windows: number of windows device supports * * Invoke to initialize the pci_epc_mem structure used by the * endpoint functions to allocate mapped PCI address. */ -int __pci_epc_mem_init(struct pci_epc *epc, phys_addr_t phys_base, size_t size, - size_t page_size) +int pci_epc_multi_mem_init(struct pci_epc *epc, + struct pci_epc_mem_window *windows, + unsigned int num_windows) { - int ret; - struct pci_epc_mem *mem; - unsigned long *bitmap; + struct pci_epc_mem *mem = NULL; + unsigned long *bitmap = NULL; unsigned int page_shift; - int pages; + size_t page_size; int bitmap_size; + int pages; + int ret; + int i; - if (page_size < PAGE_SIZE) - page_size = PAGE_SIZE; + epc->num_windows = 0; - page_shift = ilog2(page_size); - pages = size >> page_shift; - bitmap_size = BITS_TO_LONGS(pages) * sizeof(long); + if (!windows || !num_windows) + return -EINVAL; - mem = kzalloc(sizeof(*mem), GFP_KERNEL); - if (!mem) { - ret = -ENOMEM; - goto err; - } + epc->windows = kcalloc(num_windows, sizeof(*epc->windows), GFP_KERNEL); + if (!epc->windows) + return -ENOMEM; - bitmap = kzalloc(bitmap_size, GFP_KERNEL); - if (!bitmap) { - ret = -ENOMEM; - goto err_mem; - } + for (i = 0; i < num_windows; i++) { + page_size = windows[i].page_size; + if (page_size < PAGE_SIZE) + page_size = PAGE_SIZE; + page_shift = ilog2(page_size); + pages = windows[i].size >> page_shift; + bitmap_size = BITS_TO_LONGS(pages) * sizeof(long); - mem->bitmap = bitmap; - mem->phys_base = phys_base; - mem->page_size = page_size; - mem->pages = pages; - mem->size = size; - mutex_init(&mem->lock); + mem = kzalloc(sizeof(*mem), GFP_KERNEL); + if (!mem) { + ret = -ENOMEM; + i--; + goto err_mem; + } - epc->mem = mem; + bitmap = kzalloc(bitmap_size, GFP_KERNEL); + if (!bitmap) { + ret = -ENOMEM; + kfree(mem); + i--; + goto err_mem; + } + + mem->window.phys_base = windows[i].phys_base; + mem->window.size = windows[i].size; + mem->window.page_size = page_size; + mem->bitmap = bitmap; + mem->pages = pages; + mutex_init(&mem->lock); + epc->windows[i] = mem; + } + + epc->mem = epc->windows[0]; + epc->num_windows = num_windows; return 0; err_mem: - kfree(mem); + for (; i >= 0; i--) { + mem = epc->windows[i]; + kfree(mem->bitmap); + kfree(mem); + } + kfree(epc->windows); -err: -return ret; + return ret; } -EXPORT_SYMBOL_GPL(__pci_epc_mem_init); +EXPORT_SYMBOL_GPL(pci_epc_multi_mem_init); int pci_epc_mem_init(struct pci_epc *epc, phys_addr_t base, size_t size, size_t page_size) { - return __pci_epc_mem_init(epc, base, size, page_size); + struct pci_epc_mem_window mem_window; + + mem_window.phys_base = base; + mem_window.size = size; + mem_window.page_size = page_size; + + return pci_epc_multi_mem_init(epc, &mem_window, 1); } EXPORT_SYMBOL_GPL(pci_epc_mem_init); @@ -109,11 +137,22 @@ EXPORT_SYMBOL_GPL(pci_epc_mem_init); */ void pci_epc_mem_exit(struct pci_epc *epc) { - struct pci_epc_mem *mem = epc->mem; + struct pci_epc_mem *mem; + int i; + if (!epc->num_windows) + return; + + for (i = 0; i < epc->num_windows; i++) { + mem = epc->windows[i]; + kfree(mem->bitmap); + kfree(mem); + } + kfree(epc->windows); + + epc->windows = NULL; epc->mem = NULL; - kfree(mem->bitmap); - kfree(mem); + epc->num_windows = 0; } EXPORT_SYMBOL_GPL(pci_epc_mem_exit); @@ -129,31 +168,60 @@ EXPORT_SYMBOL_GPL(pci_epc_mem_exit); void __iomem *pci_epc_mem_alloc_addr(struct pci_epc *epc, phys_addr_t *phys_addr, size_t size) { - int pageno; void __iomem *virt_addr 
= NULL; - struct pci_epc_mem *mem = epc->mem; - unsigned int page_shift = ilog2(mem->page_size); + struct pci_epc_mem *mem; + unsigned int page_shift; + size_t align_size; + int pageno; int order; + int i; - size = ALIGN(size, mem->page_size); - order = pci_epc_mem_get_order(mem, size); - - mutex_lock(&mem->lock); - pageno = bitmap_find_free_region(mem->bitmap, mem->pages, order); - if (pageno < 0) - goto ret; + for (i = 0; i < epc->num_windows; i++) { + mem = epc->windows[i]; + mutex_lock(&mem->lock); + align_size = ALIGN(size, mem->window.page_size); + order = pci_epc_mem_get_order(mem, align_size); - *phys_addr = mem->phys_base + ((phys_addr_t)pageno << page_shift); - virt_addr = ioremap(*phys_addr, size); - if (!virt_addr) - bitmap_release_region(mem->bitmap, pageno, order); + pageno = bitmap_find_free_region(mem->bitmap, mem->pages, + order); + if (pageno >= 0) { + page_shift = ilog2(mem->window.page_size); + *phys_addr = mem->window.phys_base + + ((phys_addr_t)pageno << page_shift); + virt_addr = ioremap(*phys_addr, align_size); + if (!virt_addr) { + bitmap_release_region(mem->bitmap, + pageno, order); + mutex_unlock(&mem->lock); + continue; + } + mutex_unlock(&mem->lock); + return virt_addr; + } + mutex_unlock(&mem->lock); + } -ret: - mutex_unlock(&mem->lock); return virt_addr; } EXPORT_SYMBOL_GPL(pci_epc_mem_alloc_addr); +static struct pci_epc_mem *pci_epc_get_matching_window(struct pci_epc *epc, + phys_addr_t phys_addr) +{ + struct pci_epc_mem *mem; + int i; + + for (i = 0; i < epc->num_windows; i++) { + mem = epc->windows[i]; + + if (phys_addr >= mem->window.phys_base && + phys_addr < (mem->window.phys_base + mem->window.size)) + return mem; + } + + return NULL; +} + /** * pci_epc_mem_free_addr() - free the allocated memory address * @epc: the EPC device on which memory was allocated @@ -166,14 +234,23 @@ EXPORT_SYMBOL_GPL(pci_epc_mem_alloc_addr); void pci_epc_mem_free_addr(struct pci_epc *epc, phys_addr_t phys_addr, void __iomem *virt_addr, size_t size) { + struct pci_epc_mem *mem; + unsigned int page_shift; + size_t page_size; int pageno; - struct pci_epc_mem *mem = epc->mem; - unsigned int page_shift = ilog2(mem->page_size); int order; + mem = pci_epc_get_matching_window(epc, phys_addr); + if (!mem) { + pr_err("failed to get matching window\n"); + return; + } + + page_size = mem->window.page_size; + page_shift = ilog2(page_size); iounmap(virt_addr); - pageno = (phys_addr - mem->phys_base) >> page_shift; - size = ALIGN(size, mem->page_size); + pageno = (phys_addr - mem->window.phys_base) >> page_shift; + size = ALIGN(size, page_size); order = pci_epc_mem_get_order(mem, size); mutex_lock(&mem->lock); bitmap_release_region(mem->bitmap, pageno, order); diff --git a/include/linux/pci-epc.h b/include/linux/pci-epc.h index 5bc1de65849e..cc66bec8be90 100644 --- a/include/linux/pci-epc.h +++ b/include/linux/pci-epc.h @@ -65,20 +65,28 @@ struct pci_epc_ops { struct module *owner; }; +/** + * struct pci_epc_mem_window - address window of the endpoint controller + * @phys_base: physical base address of the PCI address window + * @size: the size of the PCI address window + * @page_size: size of each page + */ +struct pci_epc_mem_window { + phys_addr_t phys_base; + size_t size; + size_t page_size; +}; + /** * struct pci_epc_mem - address space of the endpoint controller - * @phys_base: physical base address of the PCI address space - * @size: the size of the PCI address space + * @window: address window of the endpoint controller * @bitmap: bitmap to manage the PCI address space * @pages: 
number of bits representing the address region - * @page_size: size of each page * @lock: mutex to protect bitmap */ struct pci_epc_mem { - phys_addr_t phys_base; - size_t size; + struct pci_epc_mem_window window; unsigned long *bitmap; - size_t page_size; int pages; /* mutex to protect against concurrent access for memory allocation*/ struct mutex lock; @@ -89,7 +97,11 @@ struct pci_epc_mem { * @dev: PCI EPC device * @pci_epf: list of endpoint functions present in this EPC device * @ops: function pointers for performing endpoint operations - * @mem: address space of the endpoint controller + * @windows: array of address space of the endpoint controller + * @mem: first window of the endpoint controller, which corresponds to + * default address space of the endpoint controller supporting + * single window. + * @num_windows: number of windows supported by device * @max_functions: max number of functions that can be configured in this EPC * @group: configfs group representing the PCI EPC device * @lock: mutex to protect pci_epc ops @@ -100,7 +112,9 @@ struct pci_epc { struct device dev; struct list_head pci_epf; const struct pci_epc_ops *ops; + struct pci_epc_mem **windows; struct pci_epc_mem *mem; + unsigned int num_windows; u8 max_functions; struct config_group *group; /* mutex to protect against concurrent access of EP controller */ @@ -194,8 +208,9 @@ void pci_epc_put(struct pci_epc *epc); int pci_epc_mem_init(struct pci_epc *epc, phys_addr_t base, size_t size, size_t page_size); -int __pci_epc_mem_init(struct pci_epc *epc, phys_addr_t phys_addr, size_t size, - size_t page_size); +int pci_epc_multi_mem_init(struct pci_epc *epc, + struct pci_epc_mem_window *window, + unsigned int num_windows); void pci_epc_mem_exit(struct pci_epc *epc); void __iomem *pci_epc_mem_alloc_addr(struct pci_epc *epc, phys_addr_t *phys_addr, size_t size); -- cgit v1.2.3 From 4ffea5e083f8125fe273cf331ecb10d901eb64a2 Mon Sep 17 00:00:00 2001 From: Jonathan Bakker Date: Sat, 16 May 2020 12:47:59 -0700 Subject: regulator: max8998: Add charger regulator The max8998 has a current regulator for charging control. The charger driver in drivers/power/supply/max8998_charger.c has a comment in it stating that 'charger control is done by a current regulator "CHARGER"', but this regulator was never added until now. The current values have been extracted from a downstream driver for the SGH-T959V. 
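From the consumer side, the new CHARGER regulator is driven through the standard current-limit API. A minimal sketch, assuming a charger driver that has already obtained a regulator handle elsewhere with regulator_get(); only regulator_set_current_limit() and the table values come from this patch:

#include <linux/regulator/consumer.h>

/* Sketch: a charger driver programming charge current via the new
 * CHARGER regulator. 'reg' is an assumed handle.
 */
static int example_set_charge_current(struct regulator *reg)
{
	/*
	 * Ask for 380..550 mA; the closest supported step from
	 * charger_current_table (90000..800000 uA) is selected by the
	 * set_current_limit op added below.
	 */
	return regulator_set_current_limit(reg, 380000, 550000);
}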
Signed-off-by: Jonathan Bakker Link: https://lore.kernel.org/r/BN6PR04MB0660E1F4A3D5A348BE88311CA3BA0@BN6PR04MB0660.namprd04.prod.outlook.com Signed-off-by: Mark Brown --- drivers/regulator/max8998.c | 105 ++++++++++++++++++++++++++++++++++++++++++++ include/linux/mfd/max8998.h | 1 + 2 files changed, 106 insertions(+) (limited to 'include/linux') diff --git a/drivers/regulator/max8998.c b/drivers/regulator/max8998.c index 60599c3bb845..668ced006417 100644 --- a/drivers/regulator/max8998.c +++ b/drivers/regulator/max8998.c @@ -33,6 +33,10 @@ struct max8998_data { unsigned int buck2_idx; }; +static const unsigned int charger_current_table[] = { + 90000, 380000, 475000, 550000, 570000, 600000, 700000, 800000, +}; + static int max8998_get_enable_register(struct regulator_dev *rdev, int *reg, int *shift) { @@ -63,6 +67,10 @@ static int max8998_get_enable_register(struct regulator_dev *rdev, *reg = MAX8998_REG_CHGR2; *shift = 7 - (ldo - MAX8998_ESAFEOUT1); break; + case MAX8998_CHARGER: + *reg = MAX8998_REG_CHGR2; + *shift = 0; + break; default: return -EINVAL; } @@ -88,6 +96,11 @@ static int max8998_ldo_is_enabled(struct regulator_dev *rdev) return val & (1 << shift); } +static int max8998_ldo_is_enabled_inverted(struct regulator_dev *rdev) +{ + return (!max8998_ldo_is_enabled(rdev)); +} + static int max8998_ldo_enable(struct regulator_dev *rdev) { struct max8998_data *max8998 = rdev_get_drvdata(rdev); @@ -358,6 +371,74 @@ static int max8998_set_voltage_buck_time_sel(struct regulator_dev *rdev, return 0; } +int max8998_set_current_limit(struct regulator_dev *rdev, + int min_uA, int max_uA) +{ + struct max8998_data *max8998 = rdev_get_drvdata(rdev); + struct i2c_client *i2c = max8998->iodev->i2c; + unsigned int n_currents = rdev->desc->n_current_limits; + int i, sel = -1; + + if (n_currents == 0) + return -EINVAL; + + if (rdev->desc->curr_table) { + const unsigned int *curr_table = rdev->desc->curr_table; + bool ascend = curr_table[n_currents - 1] > curr_table[0]; + + /* search for closest to maximum */ + if (ascend) { + for (i = n_currents - 1; i >= 0; i--) { + if (min_uA <= curr_table[i] && + curr_table[i] <= max_uA) { + sel = i; + break; + } + } + } else { + for (i = 0; i < n_currents; i++) { + if (min_uA <= curr_table[i] && + curr_table[i] <= max_uA) { + sel = i; + break; + } + } + } + } + + if (sel < 0) + return -EINVAL; + + sel <<= ffs(rdev->desc->csel_mask) - 1; + + return max8998_update_reg(i2c, rdev->desc->csel_reg, + sel, rdev->desc->csel_mask); +} + +int max8998_get_current_limit(struct regulator_dev *rdev) +{ + struct max8998_data *max8998 = rdev_get_drvdata(rdev); + struct i2c_client *i2c = max8998->iodev->i2c; + u8 val; + int ret; + + ret = max8998_read_reg(i2c, rdev->desc->csel_reg, &val); + if (ret != 0) + return ret; + + val &= rdev->desc->csel_mask; + val >>= ffs(rdev->desc->csel_mask) - 1; + + if (rdev->desc->curr_table) { + if (val >= rdev->desc->n_current_limits) + return -EINVAL; + + return rdev->desc->curr_table[val]; + } + + return -EINVAL; +} + static const struct regulator_ops max8998_ldo_ops = { .list_voltage = regulator_list_voltage_linear, .map_voltage = regulator_map_voltage_linear, @@ -379,6 +460,15 @@ static const struct regulator_ops max8998_buck_ops = { .set_voltage_time_sel = max8998_set_voltage_buck_time_sel, }; +static const struct regulator_ops max8998_charger_ops = { + .set_current_limit = max8998_set_current_limit, + .get_current_limit = max8998_get_current_limit, + .is_enabled = max8998_ldo_is_enabled_inverted, + /* Swapped as register is inverted */ + 
.enable = max8998_ldo_disable, + .disable = max8998_ldo_enable, +}; + static const struct regulator_ops max8998_others_ops = { .is_enabled = max8998_ldo_is_enabled, .enable = max8998_ldo_enable, @@ -397,6 +487,19 @@ static const struct regulator_ops max8998_others_ops = { .owner = THIS_MODULE, \ } +#define MAX8998_CURRENT_REG(_name, _ops, _table, _reg, _mask) \ + { \ + .name = #_name, \ + .id = MAX8998_##_name, \ + .ops = _ops, \ + .curr_table = _table, \ + .n_current_limits = ARRAY_SIZE(_table), \ + .csel_reg = _reg, \ + .csel_mask = _mask, \ + .type = REGULATOR_CURRENT, \ + .owner = THIS_MODULE, \ + } + #define MAX8998_OTHERS_REG(_name, _id) \ { \ .name = #_name, \ @@ -432,6 +535,8 @@ static const struct regulator_desc regulators[] = { MAX8998_OTHERS_REG(ENVICHG, MAX8998_ENVICHG), MAX8998_OTHERS_REG(ESAFEOUT1, MAX8998_ESAFEOUT1), MAX8998_OTHERS_REG(ESAFEOUT2, MAX8998_ESAFEOUT2), + MAX8998_CURRENT_REG(CHARGER, &max8998_charger_ops, + charger_current_table, MAX8998_REG_CHGR1, 0x7), }; static int max8998_pmic_dt_parse_dvs_gpio(struct max8998_dev *iodev, diff --git a/include/linux/mfd/max8998.h b/include/linux/mfd/max8998.h index 061af220dcd3..79c020bd0c70 100644 --- a/include/linux/mfd/max8998.h +++ b/include/linux/mfd/max8998.h @@ -39,6 +39,7 @@ enum { MAX8998_ENVICHG, MAX8998_ESAFEOUT1, MAX8998_ESAFEOUT2, + MAX8998_CHARGER, }; /** -- cgit v1.2.3 From 9398554fb3979852512ff4f1405e759889b45c16 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 13 May 2020 14:36:00 +0200 Subject: block: remove the error_sector argument to blkdev_issue_flush The argument isn't used by any caller, and drivers don't fill out bi_sector for flush requests either. Signed-off-by: Christoph Hellwig Signed-off-by: Jens Axboe --- block/blk-flush.c | 17 ++--------------- drivers/md/dm-integrity.c | 2 +- drivers/md/dm-zoned-metadata.c | 6 +++--- drivers/md/raid5-ppl.c | 2 +- drivers/nvme/target/io-cmd-bdev.c | 2 +- fs/block_dev.c | 2 +- fs/ext4/fsync.c | 2 +- fs/ext4/ialloc.c | 2 +- fs/ext4/super.c | 2 +- fs/fat/file.c | 2 +- fs/hfsplus/inode.c | 2 +- fs/hfsplus/super.c | 2 +- fs/jbd2/checkpoint.c | 2 +- fs/jbd2/commit.c | 4 ++-- fs/jbd2/recovery.c | 2 +- fs/libfs.c | 2 +- fs/nilfs2/the_nilfs.h | 2 +- fs/ocfs2/file.c | 2 +- fs/reiserfs/file.c | 2 +- fs/xfs/xfs_super.c | 2 +- fs/zonefs/super.c | 2 +- include/linux/blkdev.h | 5 ++--- 22 files changed, 27 insertions(+), 41 deletions(-) (limited to 'include/linux') diff --git a/block/blk-flush.c b/block/blk-flush.c index b733f7ac75c7..d47a579964fb 100644 --- a/block/blk-flush.c +++ b/block/blk-flush.c @@ -432,15 +432,11 @@ void blk_insert_flush(struct request *rq) * blkdev_issue_flush - queue a flush * @bdev: blockdev to issue flush for * @gfp_mask: memory allocation flags (for bio_alloc) - * @error_sector: error sector * * Description: - * Issue a flush for the block device in question. Caller can supply - * room for storing the error offset in case of a flush error, if they - * wish to. + * Issue a flush for the block device in question. */ -int blkdev_issue_flush(struct block_device *bdev, gfp_t gfp_mask, - sector_t *error_sector) +int blkdev_issue_flush(struct block_device *bdev, gfp_t gfp_mask) { struct request_queue *q; struct bio *bio; @@ -458,15 +454,6 @@ int blkdev_issue_flush(struct block_device *bdev, gfp_t gfp_mask, bio->bi_opf = REQ_OP_WRITE | REQ_PREFLUSH; ret = submit_bio_wait(bio); - - /* - * The driver must store the error location in ->bi_sector, if - * it supports it. For non-stacked drivers, this should be - * copied from blk_rq_pos(rq). 
- */ - if (error_sector) - *error_sector = bio->bi_iter.bi_sector; - bio_put(bio); return ret; } diff --git a/drivers/md/dm-integrity.c b/drivers/md/dm-integrity.c index 4094c47eca7f..84cb04904fab 100644 --- a/drivers/md/dm-integrity.c +++ b/drivers/md/dm-integrity.c @@ -2657,7 +2657,7 @@ static void bitmap_flush_work(struct work_struct *work) dm_integrity_flush_buffers(ic); if (ic->meta_dev) - blkdev_issue_flush(ic->dev->bdev, GFP_NOIO, NULL); + blkdev_issue_flush(ic->dev->bdev, GFP_NOIO); limit = ic->provided_data_sectors; if (ic->sb->flags & cpu_to_le32(SB_FLAG_RECALCULATING)) { diff --git a/drivers/md/dm-zoned-metadata.c b/drivers/md/dm-zoned-metadata.c index 369de15c4e80..bf2245370305 100644 --- a/drivers/md/dm-zoned-metadata.c +++ b/drivers/md/dm-zoned-metadata.c @@ -661,7 +661,7 @@ static int dmz_write_sb(struct dmz_metadata *zmd, unsigned int set) ret = dmz_rdwr_block(zmd, REQ_OP_WRITE, block, mblk->page); if (ret == 0) - ret = blkdev_issue_flush(zmd->dev->bdev, GFP_NOIO, NULL); + ret = blkdev_issue_flush(zmd->dev->bdev, GFP_NOIO); return ret; } @@ -703,7 +703,7 @@ static int dmz_write_dirty_mblocks(struct dmz_metadata *zmd, /* Flush drive cache (this will also sync data) */ if (ret == 0) - ret = blkdev_issue_flush(zmd->dev->bdev, GFP_NOIO, NULL); + ret = blkdev_issue_flush(zmd->dev->bdev, GFP_NOIO); return ret; } @@ -772,7 +772,7 @@ int dmz_flush_metadata(struct dmz_metadata *zmd) /* If there are no dirty metadata blocks, just flush the device cache */ if (list_empty(&write_list)) { - ret = blkdev_issue_flush(zmd->dev->bdev, GFP_NOIO, NULL); + ret = blkdev_issue_flush(zmd->dev->bdev, GFP_NOIO); goto err; } diff --git a/drivers/md/raid5-ppl.c b/drivers/md/raid5-ppl.c index d50238d0a85d..a750f4bbb5d9 100644 --- a/drivers/md/raid5-ppl.c +++ b/drivers/md/raid5-ppl.c @@ -1037,7 +1037,7 @@ static int ppl_recover(struct ppl_log *log, struct ppl_header *pplhdr, } /* flush the disk cache after recovery if necessary */ - ret = blkdev_issue_flush(rdev->bdev, GFP_KERNEL, NULL); + ret = blkdev_issue_flush(rdev->bdev, GFP_KERNEL); out: __free_page(page); return ret; diff --git a/drivers/nvme/target/io-cmd-bdev.c b/drivers/nvme/target/io-cmd-bdev.c index ea0e596be15d..26f50c23b82e 100644 --- a/drivers/nvme/target/io-cmd-bdev.c +++ b/drivers/nvme/target/io-cmd-bdev.c @@ -226,7 +226,7 @@ static void nvmet_bdev_execute_flush(struct nvmet_req *req) u16 nvmet_bdev_flush(struct nvmet_req *req) { - if (blkdev_issue_flush(req->ns->bdev, GFP_KERNEL, NULL)) + if (blkdev_issue_flush(req->ns->bdev, GFP_KERNEL)) return NVME_SC_INTERNAL | NVME_SC_DNR; return 0; } diff --git a/fs/block_dev.c b/fs/block_dev.c index ebd1507789d2..d1e08bba925a 100644 --- a/fs/block_dev.c +++ b/fs/block_dev.c @@ -672,7 +672,7 @@ int blkdev_fsync(struct file *filp, loff_t start, loff_t end, int datasync) * i_mutex and doing so causes performance issues with concurrent * O_SYNC writers to a block device. 
*/ - error = blkdev_issue_flush(bdev, GFP_KERNEL, NULL); + error = blkdev_issue_flush(bdev, GFP_KERNEL); if (error == -EOPNOTSUPP) error = 0; diff --git a/fs/ext4/fsync.c b/fs/ext4/fsync.c index e10206e7f4bb..35ff9a56db67 100644 --- a/fs/ext4/fsync.c +++ b/fs/ext4/fsync.c @@ -176,7 +176,7 @@ int ext4_sync_file(struct file *file, loff_t start, loff_t end, int datasync) ret = ext4_fsync_journal(inode, datasync, &needs_barrier); if (needs_barrier) { - err = blkdev_issue_flush(inode->i_sb->s_bdev, GFP_KERNEL, NULL); + err = blkdev_issue_flush(inode->i_sb->s_bdev, GFP_KERNEL); if (!ret) ret = err; } diff --git a/fs/ext4/ialloc.c b/fs/ext4/ialloc.c index 4b8c9a9bdf0c..499f08d8522e 100644 --- a/fs/ext4/ialloc.c +++ b/fs/ext4/ialloc.c @@ -1440,7 +1440,7 @@ int ext4_init_inode_table(struct super_block *sb, ext4_group_t group, if (ret < 0) goto err_out; if (barrier) - blkdev_issue_flush(sb->s_bdev, GFP_NOFS, NULL); + blkdev_issue_flush(sb->s_bdev, GFP_NOFS); skip_zeroout: ext4_lock_group(sb, group); diff --git a/fs/ext4/super.c b/fs/ext4/super.c index bf5fcb477f66..629a56b5c859 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -5256,7 +5256,7 @@ static int ext4_sync_fs(struct super_block *sb, int wait) needs_barrier = true; if (needs_barrier) { int err; - err = blkdev_issue_flush(sb->s_bdev, GFP_KERNEL, NULL); + err = blkdev_issue_flush(sb->s_bdev, GFP_KERNEL); if (!ret) ret = err; } diff --git a/fs/fat/file.c b/fs/fat/file.c index bdc4503c00a3..42134c58c87e 100644 --- a/fs/fat/file.c +++ b/fs/fat/file.c @@ -195,7 +195,7 @@ int fat_file_fsync(struct file *filp, loff_t start, loff_t end, int datasync) if (err) return err; - return blkdev_issue_flush(inode->i_sb->s_bdev, GFP_KERNEL, NULL); + return blkdev_issue_flush(inode->i_sb->s_bdev, GFP_KERNEL); } diff --git a/fs/hfsplus/inode.c b/fs/hfsplus/inode.c index 94bd83b36644..e3da9e96b835 100644 --- a/fs/hfsplus/inode.c +++ b/fs/hfsplus/inode.c @@ -340,7 +340,7 @@ int hfsplus_file_fsync(struct file *file, loff_t start, loff_t end, } if (!test_bit(HFSPLUS_SB_NOBARRIER, &sbi->flags)) - blkdev_issue_flush(inode->i_sb->s_bdev, GFP_KERNEL, NULL); + blkdev_issue_flush(inode->i_sb->s_bdev, GFP_KERNEL); inode_unlock(inode); diff --git a/fs/hfsplus/super.c b/fs/hfsplus/super.c index 2b9e5743105e..129dca3f4b78 100644 --- a/fs/hfsplus/super.c +++ b/fs/hfsplus/super.c @@ -239,7 +239,7 @@ out: mutex_unlock(&sbi->vh_mutex); if (!test_bit(HFSPLUS_SB_NOBARRIER, &sbi->flags)) - blkdev_issue_flush(sb->s_bdev, GFP_KERNEL, NULL); + blkdev_issue_flush(sb->s_bdev, GFP_KERNEL); return error; } diff --git a/fs/jbd2/checkpoint.c b/fs/jbd2/checkpoint.c index 96bf33986d03..263f02ad8ebf 100644 --- a/fs/jbd2/checkpoint.c +++ b/fs/jbd2/checkpoint.c @@ -414,7 +414,7 @@ int jbd2_cleanup_journal_tail(journal_t *journal) * jbd2_cleanup_journal_tail() doesn't get called all that often. */ if (journal->j_flags & JBD2_BARRIER) - blkdev_issue_flush(journal->j_fs_dev, GFP_NOFS, NULL); + blkdev_issue_flush(journal->j_fs_dev, GFP_NOFS); return __jbd2_update_log_tail(journal, first_tid, blocknr); } diff --git a/fs/jbd2/commit.c b/fs/jbd2/commit.c index e855d8260433..6d2da8ad0e6f 100644 --- a/fs/jbd2/commit.c +++ b/fs/jbd2/commit.c @@ -775,7 +775,7 @@ start_journal_io: if (commit_transaction->t_need_data_flush && (journal->j_fs_dev != journal->j_dev) && (journal->j_flags & JBD2_BARRIER)) - blkdev_issue_flush(journal->j_fs_dev, GFP_NOFS, NULL); + blkdev_issue_flush(journal->j_fs_dev, GFP_NOFS); /* Done it all: now write the commit record asynchronously. 
*/ if (jbd2_has_feature_async_commit(journal)) { @@ -882,7 +882,7 @@ start_journal_io: stats.run.rs_blocks_logged++; if (jbd2_has_feature_async_commit(journal) && journal->j_flags & JBD2_BARRIER) { - blkdev_issue_flush(journal->j_dev, GFP_NOFS, NULL); + blkdev_issue_flush(journal->j_dev, GFP_NOFS); } if (err) diff --git a/fs/jbd2/recovery.c b/fs/jbd2/recovery.c index a4967b27ffb6..2ed278f0dced 100644 --- a/fs/jbd2/recovery.c +++ b/fs/jbd2/recovery.c @@ -286,7 +286,7 @@ int jbd2_journal_recover(journal_t *journal) err = err2; /* Make sure all replayed data is on permanent storage */ if (journal->j_flags & JBD2_BARRIER) { - err2 = blkdev_issue_flush(journal->j_fs_dev, GFP_KERNEL, NULL); + err2 = blkdev_issue_flush(journal->j_fs_dev, GFP_KERNEL); if (!err) err = err2; } diff --git a/fs/libfs.c b/fs/libfs.c index 3759fbacf522..4d08edf19c78 100644 --- a/fs/libfs.c +++ b/fs/libfs.c @@ -1113,7 +1113,7 @@ int generic_file_fsync(struct file *file, loff_t start, loff_t end, err = __generic_file_fsync(file, start, end, datasync); if (err) return err; - return blkdev_issue_flush(inode->i_sb->s_bdev, GFP_KERNEL, NULL); + return blkdev_issue_flush(inode->i_sb->s_bdev, GFP_KERNEL); } EXPORT_SYMBOL(generic_file_fsync); diff --git a/fs/nilfs2/the_nilfs.h b/fs/nilfs2/the_nilfs.h index 380a543c5b19..b55cdeb4d169 100644 --- a/fs/nilfs2/the_nilfs.h +++ b/fs/nilfs2/the_nilfs.h @@ -375,7 +375,7 @@ static inline int nilfs_flush_device(struct the_nilfs *nilfs) */ smp_wmb(); - err = blkdev_issue_flush(nilfs->ns_bdev, GFP_KERNEL, NULL); + err = blkdev_issue_flush(nilfs->ns_bdev, GFP_KERNEL); if (err != -EIO) err = 0; return err; diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c index 6cd5e4924e4d..85979e2214b3 100644 --- a/fs/ocfs2/file.c +++ b/fs/ocfs2/file.c @@ -194,7 +194,7 @@ static int ocfs2_sync_file(struct file *file, loff_t start, loff_t end, needs_barrier = true; err = jbd2_complete_transaction(journal, commit_tid); if (needs_barrier) { - ret = blkdev_issue_flush(inode->i_sb->s_bdev, GFP_KERNEL, NULL); + ret = blkdev_issue_flush(inode->i_sb->s_bdev, GFP_KERNEL); if (!err) err = ret; } diff --git a/fs/reiserfs/file.c b/fs/reiserfs/file.c index 84cf8bdbec9c..0b641ae694f1 100644 --- a/fs/reiserfs/file.c +++ b/fs/reiserfs/file.c @@ -159,7 +159,7 @@ static int reiserfs_sync_file(struct file *filp, loff_t start, loff_t end, barrier_done = reiserfs_commit_for_inode(inode); reiserfs_write_unlock(inode->i_sb); if (barrier_done != 1 && reiserfs_barrier_flush(inode->i_sb)) - blkdev_issue_flush(inode->i_sb->s_bdev, GFP_KERNEL, NULL); + blkdev_issue_flush(inode->i_sb->s_bdev, GFP_KERNEL); inode_unlock(inode); if (barrier_done < 0) return barrier_done; diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c index 424bb9a2d532..a123cd8267d9 100644 --- a/fs/xfs/xfs_super.c +++ b/fs/xfs/xfs_super.c @@ -305,7 +305,7 @@ void xfs_blkdev_issue_flush( xfs_buftarg_t *buftarg) { - blkdev_issue_flush(buftarg->bt_bdev, GFP_NOFS, NULL); + blkdev_issue_flush(buftarg->bt_bdev, GFP_NOFS); } STATIC void diff --git a/fs/zonefs/super.c b/fs/zonefs/super.c index 0bf7009f50a2..25afcf55aa41 100644 --- a/fs/zonefs/super.c +++ b/fs/zonefs/super.c @@ -479,7 +479,7 @@ static int zonefs_file_fsync(struct file *file, loff_t start, loff_t end, if (ZONEFS_I(inode)->i_ztype == ZONEFS_ZTYPE_CNV) ret = file_write_and_wait_range(file, start, end); if (!ret) - ret = blkdev_issue_flush(inode->i_sb->s_bdev, GFP_KERNEL, NULL); + ret = blkdev_issue_flush(inode->i_sb->s_bdev, GFP_KERNEL); if (ret) zonefs_io_error(inode, true); diff --git a/include/linux/blkdev.h 
b/include/linux/blkdev.h index 2b33166b9daf..7d10f4e63232 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -1233,7 +1233,7 @@ static inline bool blk_needs_flush_plug(struct task_struct *tsk) extern void blk_io_schedule(void); -extern int blkdev_issue_flush(struct block_device *, gfp_t, sector_t *); +int blkdev_issue_flush(struct block_device *, gfp_t); extern int blkdev_issue_write_same(struct block_device *bdev, sector_t sector, sector_t nr_sects, gfp_t gfp_mask, struct page *page); @@ -1872,8 +1872,7 @@ static inline bool blk_needs_flush_plug(struct task_struct *tsk) return false; } -static inline int blkdev_issue_flush(struct block_device *bdev, gfp_t gfp_mask, - sector_t *error_sector) +static inline int blkdev_issue_flush(struct block_device *bdev, gfp_t gfp_mask) { return 0; } -- cgit v1.2.3 From 7a4e63cb0905672fd52e7316a885f19d4aeed976 Mon Sep 17 00:00:00 2001 From: Wolfram Sang Date: Fri, 22 May 2020 16:56:58 +0200 Subject: Revert "i2c: core: support bus regulator controlling in adapter" This reverts commit 6fe12cdbcfe35ad4726a619a9546822d34fc934c. Testing in linux-next showed it needs some more time. Signed-off-by: Wolfram Sang --- drivers/i2c/i2c-core-base.c | 84 --------------------------------------------- include/linux/i2c.h | 2 -- 2 files changed, 86 deletions(-) (limited to 'include/linux') diff --git a/drivers/i2c/i2c-core-base.c b/drivers/i2c/i2c-core-base.c index ac7edcc1d5ee..9987e72752c4 100644 --- a/drivers/i2c/i2c-core-base.c +++ b/drivers/i2c/i2c-core-base.c @@ -313,14 +313,12 @@ static int i2c_smbus_host_notify_to_irq(const struct i2c_client *client) static int i2c_device_probe(struct device *dev) { struct i2c_client *client = i2c_verify_client(dev); - struct i2c_adapter *adap; struct i2c_driver *driver; int status; if (!client) return 0; - adap = client->adapter; driver = to_i2c_driver(dev->driver); client->irq = client->init_irq; @@ -386,12 +384,6 @@ static int i2c_device_probe(struct device *dev) dev_dbg(dev, "probe\n"); - status = regulator_enable(adap->bus_regulator); - if (status < 0) { - dev_err(&adap->dev, "Failed to enable power regulator\n"); - goto err_clear_wakeup_irq; - } - status = of_clk_set_defaults(dev->of_node, false); if (status < 0) goto err_clear_wakeup_irq; @@ -432,14 +424,12 @@ put_sync_adapter: static int i2c_device_remove(struct device *dev) { struct i2c_client *client = i2c_verify_client(dev); - struct i2c_adapter *adap; struct i2c_driver *driver; int status = 0; if (!client || !dev->driver) return 0; - adap = client->adapter; driver = to_i2c_driver(dev->driver); if (driver->remove) { dev_dbg(dev, "remove\n"); @@ -447,8 +437,6 @@ static int i2c_device_remove(struct device *dev) } dev_pm_domain_detach(&client->dev, true); - if (!pm_runtime_status_suspended(&client->dev)) - regulator_disable(adap->bus_regulator); dev_pm_clear_wake_irq(&client->dev); device_init_wakeup(&client->dev, false); @@ -460,72 +448,6 @@ static int i2c_device_remove(struct device *dev) return status; } -#ifdef CONFIG_PM_SLEEP -static int i2c_resume_early(struct device *dev) -{ - struct i2c_client *client = i2c_verify_client(dev); - struct i2c_adapter *adap = client->adapter; - int err; - - if (!pm_runtime_status_suspended(&client->dev)) { - err = regulator_enable(adap->bus_regulator); - if (err) - return err; - } - - return pm_generic_resume_early(&client->dev); -} - -static int i2c_suspend_late(struct device *dev) -{ - struct i2c_client *client = i2c_verify_client(dev); - struct i2c_adapter *adap = client->adapter; - int err; - - err = 
pm_generic_suspend_late(&client->dev); - if (err) - return err; - - if (!pm_runtime_status_suspended(&client->dev)) - return regulator_disable(adap->bus_regulator); - - return 0; -} -#endif - -#ifdef CONFIG_PM -static int i2c_runtime_resume(struct device *dev) -{ - struct i2c_client *client = i2c_verify_client(dev); - struct i2c_adapter *adap = client->adapter; - int err; - - err = regulator_enable(adap->bus_regulator); - if (err) - return err; - - return pm_generic_runtime_resume(&client->dev); -} - -static int i2c_runtime_suspend(struct device *dev) -{ - struct i2c_client *client = i2c_verify_client(dev); - struct i2c_adapter *adap = client->adapter; - int err; - - err = pm_generic_runtime_suspend(&client->dev); - if (err) - return err; - - return regulator_disable(adap->bus_regulator); -} -#endif - -static const struct dev_pm_ops i2c_device_pm = { - SET_LATE_SYSTEM_SLEEP_PM_OPS(i2c_suspend_late, i2c_resume_early) - SET_RUNTIME_PM_OPS(i2c_runtime_suspend, i2c_runtime_resume, NULL) -}; - static void i2c_device_shutdown(struct device *dev) { struct i2c_client *client = i2c_verify_client(dev); @@ -583,7 +505,6 @@ struct bus_type i2c_bus_type = { .probe = i2c_device_probe, .remove = i2c_device_remove, .shutdown = i2c_device_shutdown, - .pm = &i2c_device_pm, }; EXPORT_SYMBOL_GPL(i2c_bus_type); @@ -1422,11 +1343,6 @@ static int i2c_register_adapter(struct i2c_adapter *adap) if (res) goto out_reg; - adap->bus_regulator = devm_regulator_get(&adap->dev, "bus"); - if (IS_ERR(adap->bus_regulator)) { - res = PTR_ERR(adap->bus_regulator); - goto out_reg; - } dev_dbg(&adap->dev, "adapter [%s] registered\n", adap->name); pm_runtime_no_callbacks(&adap->dev); diff --git a/include/linux/i2c.h b/include/linux/i2c.h index 7433aba207bd..49d29054e657 100644 --- a/include/linux/i2c.h +++ b/include/linux/i2c.h @@ -15,7 +15,6 @@ #include /* for struct device */ #include /* for completion */ #include -#include #include #include /* for Host Notify IRQ */ #include /* for struct device_node */ @@ -716,7 +715,6 @@ struct i2c_adapter { const struct i2c_adapter_quirks *quirks; struct irq_domain *host_notify_domain; - struct regulator *bus_regulator; }; #define to_i2c_adapter(d) container_of(d, struct i2c_adapter, dev) -- cgit v1.2.3 From ab91c2a89f86be2898cee208d492816ec238b2cf Mon Sep 17 00:00:00 2001 From: "Gustavo A. R. Silva" Date: Fri, 8 May 2020 11:38:26 -0500 Subject: tpm: eventlog: Replace zero-length array with flexible-array member MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The current codebase makes use of the zero-length array language extension to the C90 standard, but the preferred mechanism to declare variable-length types such as these ones is a flexible array member[1][2], introduced in C99: struct foo { int stuff; struct boo array[]; }; By making use of the mechanism above, we will get a compiler warning in case the flexible array does not occur last in the structure, which will help us prevent some kind of undefined behavior bugs from being inadvertently introduced[3] to the codebase from now on. Also, notice that, dynamic memory allocations won't be affected by this change: "Flexible array members have incomplete type, and so the sizeof operator may not be applied. As a quirk of the original implementation of zero-length arrays, sizeof evaluates to zero."[1] sizeof(flexible-array-member) triggers a warning because flexible array members have incomplete type[1]. 
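As a concrete illustration of the idiom this patch adopts, here is a minimal allocation sketch; the allocation site and function name are assumed examples, not code from this patch:

#include <linux/overflow.h>
#include <linux/slab.h>
#include <linux/tpm_eventlog.h>

/* Sketch: sizing a tcg_pcr_event with an 'event_size'-byte trailing
 * flexible array. struct_size() computes sizeof(*hdr) plus the array
 * bytes with overflow checking, replacing the old
 * sizeof(struct tcg_pcr_event) - sizeof(hdr->event) + event_size
 * form, in which the sizeof(hdr->event) term was always zero anyway.
 */
static struct tcg_pcr_event *example_alloc_event(u32 event_size)
{
	struct tcg_pcr_event *hdr;

	hdr = kzalloc(struct_size(hdr, event, event_size), GFP_KERNEL);
	if (hdr)
		hdr->event_size = event_size;
	return hdr;
}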
There are some instances of code in which the sizeof operator is being incorrectly/erroneously applied to zero-length arrays and the result is zero. Such instances may be hiding some bugs. So, this work (flexible-array member conversions) will also help to get completely rid of those sorts of issues. Also, the following issue shows up due to the flexible-array member having incomplete type[4]: drivers/char/tpm/eventlog/tpm2.c: In function ‘tpm2_bios_measurements_start’: drivers/char/tpm/eventlog/tpm2.c:54:46: error: invalid application of ‘sizeof’ to incomplete type ‘u8[]’ {aka ‘unsigned char[]’} 54 | size = sizeof(struct tcg_pcr_event) - sizeof(event_header->event) | ^ drivers/char/tpm/eventlog/tpm2.c: In function ‘tpm2_bios_measurements_next’: drivers/char/tpm/eventlog/tpm2.c:102:10: error: invalid application of ‘sizeof’ to incomplete type ‘u8[]’ {aka ‘unsigned char[]’} 102 | sizeof(event_header->event) + event_header->event_size; | ^ drivers/char/tpm/eventlog/tpm2.c: In function ‘tpm2_binary_bios_measurements_show’: drivers/char/tpm/eventlog/tpm2.c:140:10: error: invalid application of ‘sizeof’ to incomplete type ‘u8[]’ {aka ‘unsigned char[]’} 140 | sizeof(event_header->event) + event_header->event_size; | ^ scripts/Makefile.build:266: recipe for target 'drivers/char/tpm/eventlog/tpm2.o' failed make[3]: *** [drivers/char/tpm/eventlog/tpm2.o] Error 1 As mentioned above: "Flexible array members have incomplete type, and so the sizeof operator may not be applied. As a quirk of the original implementation of zero-length arrays, sizeof evaluates to zero."[1] As in "sizeof(event_header->event) always evaluated to 0, so removing it has no effect". Lastly, make use of the struct_size() helper to deal with the flexible array member and its host structure. This issue was found with the help of Coccinelle. [1] https://gcc.gnu.org/onlinedocs/gcc/Zero-Length.html [2] https://github.com/KSPP/linux/issues/21 [3] commit 76497732932f ("cxgb3/l2t: Fix undefined behaviour") [4] https://github.com/KSPP/linux/issues/43 Signed-off-by: Gustavo A. R. 
Silva Reviewed-by: Kees Cook Reviewed-by: Jarkko Sakkinen Signed-off-by: Jarkko Sakkinen --- drivers/char/tpm/eventlog/tpm2.c | 12 +++++------- include/linux/tpm_eventlog.h | 2 +- 2 files changed, 6 insertions(+), 8 deletions(-) (limited to 'include/linux') diff --git a/drivers/char/tpm/eventlog/tpm2.c b/drivers/char/tpm/eventlog/tpm2.c index e741b1157525..37a05800980c 100644 --- a/drivers/char/tpm/eventlog/tpm2.c +++ b/drivers/char/tpm/eventlog/tpm2.c @@ -51,8 +51,7 @@ static void *tpm2_bios_measurements_start(struct seq_file *m, loff_t *pos) int i; event_header = addr; - size = sizeof(struct tcg_pcr_event) - sizeof(event_header->event) - + event_header->event_size; + size = struct_size(event_header, event, event_header->event_size); if (*pos == 0) { if (addr + size < limit) { @@ -98,8 +97,8 @@ static void *tpm2_bios_measurements_next(struct seq_file *m, void *v, event_header = log->bios_event_log; if (v == SEQ_START_TOKEN) { - event_size = sizeof(struct tcg_pcr_event) - - sizeof(event_header->event) + event_header->event_size; + event_size = struct_size(event_header, event, + event_header->event_size); marker = event_header; } else { event = v; @@ -136,9 +135,8 @@ static int tpm2_binary_bios_measurements_show(struct seq_file *m, void *v) size_t size; if (v == SEQ_START_TOKEN) { - size = sizeof(struct tcg_pcr_event) - - sizeof(event_header->event) + event_header->event_size; - + size = struct_size(event_header, event, + event_header->event_size); temp_ptr = event_header; if (size > 0) diff --git a/include/linux/tpm_eventlog.h b/include/linux/tpm_eventlog.h index c253461b1c4e..4f8c90c93c29 100644 --- a/include/linux/tpm_eventlog.h +++ b/include/linux/tpm_eventlog.h @@ -97,7 +97,7 @@ struct tcg_pcr_event { u32 event_type; u8 digest[20]; u32 event_size; - u8 event[0]; + u8 event[]; } __packed; struct tcg_event_field { -- cgit v1.2.3 From a4e91825d7e1252f7cba005f1451e5464b23c15d Mon Sep 17 00:00:00 2001 From: Alexander Monakov Date: Sun, 10 May 2020 20:48:40 +0000 Subject: x86/amd_nb: Add AMD family 17h model 60h PCI IDs Add PCI IDs for AMD Renoir (4000-series Ryzen CPUs). This is necessary to enable support for temperature sensors via the k10temp module. 
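For context, ID tables like amd_nb_misc_ids below are what the northbridge lookup code walks to decide whether a discovered PCI function is known; if an ID is missing, dependent drivers such as k10temp simply find nothing. A hedged userspace sketch of the matching pattern (simplified struct, hypothetical table contents):

#include <stdbool.h>
#include <stdio.h>

struct pci_id { unsigned short vendor, device; };

#define AMD_VENDOR		0x1022
#define AMD_17H_M60H_DF_F3	0x144b	/* the Renoir ID this patch adds */

static const struct pci_id nb_misc_ids[] = {
	{ AMD_VENDOR, 0x1463 },		/* 17h DF/F3 */
	{ AMD_VENDOR, AMD_17H_M60H_DF_F3 },
	{ 0, 0 },			/* terminator, like the kernel's {} */
};

static bool id_known(unsigned short vendor, unsigned short device)
{
	const struct pci_id *id;

	for (id = nb_misc_ids; id->vendor; id++)
		if (id->vendor == vendor && id->device == device)
			return true;
	/* without the new entry, Renoir would not be found at all */
	return false;
}

int main(void)
{
	printf("Renoir DF/F3 known: %d\n",
	       id_known(AMD_VENDOR, AMD_17H_M60H_DF_F3));
	return 0;
}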
Signed-off-by: Alexander Monakov Signed-off-by: Borislav Petkov Acked-by: Yazen Ghannam Acked-by: Guenter Roeck Link: https://lkml.kernel.org/r/20200510204842.2603-2-amonakov@ispras.ru --- arch/x86/kernel/amd_nb.c | 5 +++++ include/linux/pci_ids.h | 1 + 2 files changed, 6 insertions(+) (limited to 'include/linux') diff --git a/arch/x86/kernel/amd_nb.c b/arch/x86/kernel/amd_nb.c index b6b3297851f3..18f6b7c4bd79 100644 --- a/arch/x86/kernel/amd_nb.c +++ b/arch/x86/kernel/amd_nb.c @@ -18,9 +18,11 @@ #define PCI_DEVICE_ID_AMD_17H_ROOT 0x1450 #define PCI_DEVICE_ID_AMD_17H_M10H_ROOT 0x15d0 #define PCI_DEVICE_ID_AMD_17H_M30H_ROOT 0x1480 +#define PCI_DEVICE_ID_AMD_17H_M60H_ROOT 0x1630 #define PCI_DEVICE_ID_AMD_17H_DF_F4 0x1464 #define PCI_DEVICE_ID_AMD_17H_M10H_DF_F4 0x15ec #define PCI_DEVICE_ID_AMD_17H_M30H_DF_F4 0x1494 +#define PCI_DEVICE_ID_AMD_17H_M60H_DF_F4 0x144c #define PCI_DEVICE_ID_AMD_17H_M70H_DF_F4 0x1444 #define PCI_DEVICE_ID_AMD_19H_DF_F4 0x1654 @@ -33,6 +35,7 @@ static const struct pci_device_id amd_root_ids[] = { { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_17H_ROOT) }, { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_17H_M10H_ROOT) }, { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_17H_M30H_ROOT) }, + { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_17H_M60H_ROOT) }, {} }; @@ -50,6 +53,7 @@ static const struct pci_device_id amd_nb_misc_ids[] = { { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_17H_DF_F3) }, { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_17H_M10H_DF_F3) }, { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_17H_M30H_DF_F3) }, + { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_17H_M60H_DF_F3) }, { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_CNB17H_F3) }, { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_17H_M70H_DF_F3) }, { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_19H_DF_F3) }, @@ -65,6 +69,7 @@ static const struct pci_device_id amd_nb_link_ids[] = { { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_17H_DF_F4) }, { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_17H_M10H_DF_F4) }, { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_17H_M30H_DF_F4) }, + { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_17H_M60H_DF_F4) }, { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_17H_M70H_DF_F4) }, { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_19H_DF_F4) }, { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_CNB17H_F4) }, diff --git a/include/linux/pci_ids.h b/include/linux/pci_ids.h index 1dfc4e1dcb94..3155f5ada02e 100644 --- a/include/linux/pci_ids.h +++ b/include/linux/pci_ids.h @@ -550,6 +550,7 @@ #define PCI_DEVICE_ID_AMD_17H_DF_F3 0x1463 #define PCI_DEVICE_ID_AMD_17H_M10H_DF_F3 0x15eb #define PCI_DEVICE_ID_AMD_17H_M30H_DF_F3 0x1493 +#define PCI_DEVICE_ID_AMD_17H_M60H_DF_F3 0x144b #define PCI_DEVICE_ID_AMD_17H_M70H_DF_F3 0x1443 #define PCI_DEVICE_ID_AMD_19H_DF_F3 0x1653 #define PCI_DEVICE_ID_AMD_CNB17H_F3 0x1703 -- cgit v1.2.3 From 8eb613c0b8f19627ba1846dcf78bb2c85edbe8dd Mon Sep 17 00:00:00 2001 From: Mimi Zohar Date: Sun, 3 May 2020 01:00:02 -0400 Subject: ima: verify mprotect change is consistent with mmap policy Files can be mmap'ed read/write and later changed to execute to circumvent IMA's mmap appraise policy rules. Due to locking issues (mmap semaphore would be taken prior to i_mutex), files cannot be measured or appraised at this point. Eliminate this integrity gap by denying the mprotect PROT_EXEC change if an mmap appraise policy rule exists. On mprotect change success, return 0. On failure, return -EACCES.
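To make the gap concrete, here is a hedged userspace sketch of the pattern being closed (the file path is hypothetical; on a kernel with a matching appraise rule, the mprotect() call is now expected to fail):

#include <fcntl.h>
#include <stdio.h>
#include <sys/mman.h>
#include <unistd.h>

int main(void)
{
	int fd = open("/tmp/payload.bin", O_RDWR);	/* hypothetical file */
	if (fd < 0)
		return 1;

	/* Mapped without PROT_EXEC: mmap-time MMAP_CHECK rules don't fire. */
	void *p = mmap(NULL, 4096, PROT_READ | PROT_WRITE, MAP_PRIVATE, fd, 0);
	if (p == MAP_FAILED)
		return 1;

	/*
	 * Flipping the mapping to executable after the fact is what
	 * ima_file_mprotect() now denies when an mmap appraise policy rule
	 * matches, since the kernel cannot measure/appraise here due to
	 * lock ordering.
	 */
	if (mprotect(p, 4096, PROT_READ | PROT_EXEC) != 0)
		perror("mprotect denied");

	munmap(p, 4096);
	close(fd);
	return 0;
}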
Reviewed-by: Lakshmi Ramasubramanian Signed-off-by: Mimi Zohar --- include/linux/ima.h | 7 ++++++ security/integrity/ima/ima_main.c | 51 +++++++++++++++++++++++++++++++++++++++ security/security.c | 7 +++++- 3 files changed, 64 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/ima.h b/include/linux/ima.h index aefe758f4466..9164e1534ec9 100644 --- a/include/linux/ima.h +++ b/include/linux/ima.h @@ -18,6 +18,7 @@ extern int ima_file_check(struct file *file, int mask); extern void ima_post_create_tmpfile(struct inode *inode); extern void ima_file_free(struct file *file); extern int ima_file_mmap(struct file *file, unsigned long prot); +extern int ima_file_mprotect(struct vm_area_struct *vma, unsigned long prot); extern int ima_load_data(enum kernel_load_data_id id); extern int ima_read_file(struct file *file, enum kernel_read_file_id id); extern int ima_post_read_file(struct file *file, void *buf, loff_t size, @@ -70,6 +71,12 @@ static inline int ima_file_mmap(struct file *file, unsigned long prot) { return 0; } +static inline int ima_file_mprotect(struct vm_area_struct *vma, + unsigned long prot) +{ + return 0; +} + static inline int ima_load_data(enum kernel_load_data_id id) { return 0; diff --git a/security/integrity/ima/ima_main.c b/security/integrity/ima/ima_main.c index f96f151294e6..800fb3bba418 100644 --- a/security/integrity/ima/ima_main.c +++ b/security/integrity/ima/ima_main.c @@ -393,6 +393,57 @@ int ima_file_mmap(struct file *file, unsigned long prot) return 0; } +/** + * ima_file_mprotect - based on policy, limit mprotect change + * @prot: contains the protection that will be applied by the kernel. + * + * Files can be mmap'ed read/write and later changed to execute to circumvent + * IMA's mmap appraisal policy rules. Due to locking issues (mmap semaphore + * would be taken before i_mutex), files cannot be measured or appraised at + * this point. Eliminate this integrity gap by denying the mprotect + * PROT_EXEC change if an mmap appraise policy rule exists. + * + * On mprotect change success, return 0. On failure, return -EACCES. + */ +int ima_file_mprotect(struct vm_area_struct *vma, unsigned long prot) +{ + struct ima_template_desc *template; + struct file *file = vma->vm_file; + char filename[NAME_MAX]; + char *pathbuf = NULL; + const char *pathname = NULL; + struct inode *inode; + int result = 0; + int action; + u32 secid; + int pcr; + + /* Is mprotect making an mmap'ed file executable? */ + if (!vma->vm_file || !(prot & PROT_EXEC) || (vma->vm_flags & VM_EXEC)) + return 0; + + security_task_getsecid(current, &secid); + inode = file_inode(vma->vm_file); + action = ima_get_action(inode, current_cred(), secid, MAY_EXEC, + MMAP_CHECK, &pcr, &template, 0); + + /* Is the mmap'ed file in policy? */ + if (!(action & (IMA_MEASURE | IMA_APPRAISE_SUBMASK))) + return 0; + + if (action & IMA_APPRAISE_SUBMASK) + result = -EPERM; + + file = vma->vm_file; + pathname = ima_d_path(&file->f_path, &pathbuf, filename); + integrity_audit_msg(AUDIT_INTEGRITY_DATA, inode, pathname, + "collect_data", "failed-mprotect", result, 0); + if (pathbuf) + __putname(pathbuf); + + return result; +} + /** * ima_bprm_check - based on policy, collect/store measurement.
* @bprm: contains the linux_binprm structure diff --git a/security/security.c b/security/security.c index 7fed24b9d57e..dd0917c5bfe9 100644 --- a/security/security.c +++ b/security/security.c @@ -1512,7 +1512,12 @@ int security_mmap_addr(unsigned long addr) int security_file_mprotect(struct vm_area_struct *vma, unsigned long reqprot, unsigned long prot) { - return call_int_hook(file_mprotect, 0, vma, reqprot, prot); + int ret; + + ret = call_int_hook(file_mprotect, 0, vma, reqprot, prot); + if (ret) + return ret; + return ima_file_mprotect(vma, prot); } int security_file_lock(struct file *file, unsigned int cmd) -- cgit v1.2.3 From d3e81989c0f028aa80cb97fcba83df40585b640d Mon Sep 17 00:00:00 2001 From: "Gustavo A. R. Silva" Date: Thu, 7 May 2020 13:53:01 -0500 Subject: treewide: Replace zero-length array with flexible-array The current codebase makes use of the zero-length array language extension to the C90 standard, but the preferred mechanism to declare variable-length types such as these ones is a flexible array member[1][2], introduced in C99: struct foo { int stuff; struct boo array[]; }; By making use of the mechanism above, we will get a compiler warning in case the flexible array does not occur last in the structure, which will help us prevent some kind of undefined behavior bugs from being inadvertently introduced[3] to the codebase from now on. Also, notice that, dynamic memory allocations won't be affected by this change: "Flexible array members have incomplete type, and so the sizeof operator may not be applied. As a quirk of the original implementation of zero-length arrays, sizeof evaluates to zero."[1] sizeof(flexible-array-member) triggers a warning because flexible array members have incomplete type[1]. There are some instances of code in which the sizeof operator is being incorrectly/erroneously applied to zero-length arrays and the result is zero. Such instances may be hiding some bugs. So, this work (flexible-array member conversions) will also help to get completely rid of those sorts of issues. This issue was found with the help of Coccinelle. [1] https://gcc.gnu.org/onlinedocs/gcc/Zero-Length.html [2] https://github.com/KSPP/linux/issues/21 [3] commit 76497732932f ("cxgb3/l2t: Fix undefined behaviour") Signed-off-by: Gustavo A. R. Silva Signed-off-by: Li Yang --- include/linux/fsl/bestcomm/bestcomm.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/fsl/bestcomm/bestcomm.h b/include/linux/fsl/bestcomm/bestcomm.h index a0e2e6b19b57..154e541ce57e 100644 --- a/include/linux/fsl/bestcomm/bestcomm.h +++ b/include/linux/fsl/bestcomm/bestcomm.h @@ -27,7 +27,7 @@ */ struct bcom_bd { u32 status; - u32 data[0]; /* variable payload size */ + u32 data[]; /* variable payload size */ }; /* ======================================================================== */ -- cgit v1.2.3 From 65ece6de0114fc84fbc0487bf68cae91d535dd78 Mon Sep 17 00:00:00 2001 From: Geert Uytterhoeven Date: Thu, 16 Apr 2020 11:50:49 +0200 Subject: virtchnl: Add missing explicit padding to structures MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit On e.g. m68k, the alignment of 32-bit values is only 2 bytes, leading to the following: ./include/linux/avf/virtchnl.h:147:36: warning: division by zero [-Wdiv-by-zero] { virtchnl_static_assert_##X = (n)/((sizeof(struct X) == (n)) ? 
1 : 0) } ^ ./include/linux/avf/virtchnl.h:577:1: note: in expansion of macro ‘VIRTCHNL_CHECK_STRUCT_LEN’ VIRTCHNL_CHECK_STRUCT_LEN(272, virtchnl_filter); ^~~~~~~~~~~~~~~~~~~~~~~~~ ./include/linux/avf/virtchnl.h:577:32: error: enumerator value for ‘virtchnl_static_assert_virtchnl_filter’ is not an integer constant VIRTCHNL_CHECK_STRUCT_LEN(272, virtchnl_filter); ^~~~~~~~~~~~~~~ ./include/linux/avf/virtchnl.h:147:53: note: in definition of macro ‘VIRTCHNL_CHECK_STRUCT_LEN’ { virtchnl_static_assert_##X = (n)/((sizeof(struct X) == (n)) ? 1 : 0) } ^ ./include/linux/avf/virtchnl.h:147:36: warning: division by zero [-Wdiv-by-zero] { virtchnl_static_assert_##X = (n)/((sizeof(struct X) == (n)) ? 1 : 0) } ^ ./include/linux/avf/virtchnl.h:619:1: note: in expansion of macro ‘VIRTCHNL_CHECK_STRUCT_LEN’ VIRTCHNL_CHECK_STRUCT_LEN(16, virtchnl_pf_event); ^~~~~~~~~~~~~~~~~~~~~~~~~ ./include/linux/avf/virtchnl.h:619:31: error: enumerator value for ‘virtchnl_static_assert_virtchnl_pf_event’ is not an integer constant VIRTCHNL_CHECK_STRUCT_LEN(16, virtchnl_pf_event); ^~~~~~~~~~~~~~~~~ ./include/linux/avf/virtchnl.h:147:53: note: in definition of macro ‘VIRTCHNL_CHECK_STRUCT_LEN’ { virtchnl_static_assert_##X = (n)/((sizeof(struct X) == (n)) ? 1 : 0) } ^ ./include/linux/avf/virtchnl.h:147:36: warning: division by zero [-Wdiv-by-zero] { virtchnl_static_assert_##X = (n)/((sizeof(struct X) == (n)) ? 1 : 0) } ^ ./include/linux/avf/virtchnl.h:640:1: note: in expansion of macro ‘VIRTCHNL_CHECK_STRUCT_LEN’ VIRTCHNL_CHECK_STRUCT_LEN(12, virtchnl_iwarp_qv_info); ^~~~~~~~~~~~~~~~~~~~~~~~~ ./include/linux/avf/virtchnl.h:640:31: error: enumerator value for ‘virtchnl_static_assert_virtchnl_iwarp_qv_info’ is not an integer constant VIRTCHNL_CHECK_STRUCT_LEN(12, virtchnl_iwarp_qv_info); ^~~~~~~~~~~~~~~~~~~~~~ ./include/linux/avf/virtchnl.h:147:53: note: in definition of macro ‘VIRTCHNL_CHECK_STRUCT_LEN’ { virtchnl_static_assert_##X = (n)/((sizeof(struct X) == (n)) ? 1 : 0) } ^ ./include/linux/avf/virtchnl.h:147:36: warning: division by zero [-Wdiv-by-zero] { virtchnl_static_assert_##X = (n)/((sizeof(struct X) == (n)) ? 1 : 0) } ^ ./include/linux/avf/virtchnl.h:647:1: note: in expansion of macro ‘VIRTCHNL_CHECK_STRUCT_LEN’ VIRTCHNL_CHECK_STRUCT_LEN(16, virtchnl_iwarp_qvlist_info); ^~~~~~~~~~~~~~~~~~~~~~~~~ ./include/linux/avf/virtchnl.h:647:31: error: enumerator value for ‘virtchnl_static_assert_virtchnl_iwarp_qvlist_info’ is not an integer constant VIRTCHNL_CHECK_STRUCT_LEN(16, virtchnl_iwarp_qvlist_info); ^~~~~~~~~~~~~~~~~~~~~~~~~~ ./include/linux/avf/virtchnl.h:147:53: note: in definition of macro ‘VIRTCHNL_CHECK_STRUCT_LEN’ { virtchnl_static_assert_##X = (n)/((sizeof(struct X) == (n)) ? 1 : 0) } ^ Fix this by adding explicit padding to structures with holes. 
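A standalone illustration of the fix, under the assumption of an m68k-like ABI where a 32-bit value needs only 2-byte alignment (struct names here are simplified, and C11 static_assert stands in for VIRTCHNL_CHECK_STRUCT_LEN):

#include <assert.h>
#include <stdint.h>
#include <stdio.h>

/*
 * Without explicit padding the struct size depends on the ABI: 8 bytes
 * where uint32_t needs 4-byte alignment, but 6 bytes on an m68k-like
 * ABI where it needs only 2, so a fixed-size check divides by zero there.
 */
struct filter_implicit {
	uint32_t action_meta;
	uint8_t field_flags;
	/* invisible tail padding, 1 or 3 bytes depending on the ABI */
};

/* With the hole made explicit, the layout is 8 bytes everywhere. */
struct filter_explicit {
	uint32_t action_meta;
	uint8_t field_flags;
	uint8_t pad[3];
};

/* C11 stand-in for VIRTCHNL_CHECK_STRUCT_LEN(8, filter_explicit). */
static_assert(sizeof(struct filter_explicit) == 8,
	      "wire-format structs must have an ABI-independent size");

int main(void)
{
	printf("implicit: %zu bytes, explicit: %zu bytes\n",
	       sizeof(struct filter_implicit), sizeof(struct filter_explicit));
	return 0;
}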
Reported-by: Signed-off-by: Geert Uytterhoeven Tested-by: Andrew Bowers Signed-off-by: Jeff Kirsher --- include/linux/avf/virtchnl.h | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'include/linux') diff --git a/include/linux/avf/virtchnl.h b/include/linux/avf/virtchnl.h index ca956b672ac0..40bad71865ea 100644 --- a/include/linux/avf/virtchnl.h +++ b/include/linux/avf/virtchnl.h @@ -476,6 +476,7 @@ struct virtchnl_rss_key { u16 vsi_id; u16 key_len; u8 key[1]; /* RSS hash key, packed bytes */ + u8 pad[1]; }; VIRTCHNL_CHECK_STRUCT_LEN(6, virtchnl_rss_key); @@ -484,6 +485,7 @@ struct virtchnl_rss_lut { u16 vsi_id; u16 lut_entries; u8 lut[1]; /* RSS lookup table */ + u8 pad[1]; }; VIRTCHNL_CHECK_STRUCT_LEN(6, virtchnl_rss_lut); @@ -572,6 +574,7 @@ struct virtchnl_filter { enum virtchnl_action action; u32 action_meta; u8 field_flags; + u8 pad[3]; }; VIRTCHNL_CHECK_STRUCT_LEN(272, virtchnl_filter); @@ -610,6 +613,7 @@ struct virtchnl_pf_event { /* link_speed provided in Mbps */ u32 link_speed; u8 link_status; + u8 pad[3]; } link_event_adv; } event_data; @@ -635,6 +639,7 @@ struct virtchnl_iwarp_qv_info { u16 ceq_idx; u16 aeq_idx; u8 itr_idx; + u8 pad[3]; }; VIRTCHNL_CHECK_STRUCT_LEN(12, virtchnl_iwarp_qv_info); -- cgit v1.2.3 From afaa33da08abd10be8978781d7c99a9e67d2bbff Mon Sep 17 00:00:00 2001 From: "Andrea Parri (Microsoft)" Date: Fri, 22 May 2020 19:19:01 +0200 Subject: Drivers: hv: vmbus: Resolve more races involving init_vp_index() init_vp_index() uses the (per-node) hv_numa_map[] masks to record the CPUs allocated for channel interrupts at a given time, and distribute the performance-critical channels across the available CPUs: in particular, the mask of "candidate" target CPUs in a given NUMA node, for a newly offered channel, is determined by XOR-ing the node's CPU mask and the node's hv_numa_map. This operation/mechanism assumes that no offline CPUs are set in the hv_numa_map mask, an assumption that does not hold since the mask is currently not updated when a channel is removed or assigned to a different CPU. To address the issues described above, this adds hooks in the channel removal path (hv_process_channel_removal()) and in target_cpu_store() in order to clear, resp. to update, the hv_numa_map[] masks as needed. This also adds a (missed) update of the masks in init_vp_index() (cf., e.g., the memory-allocation failure path in this function). Like in the case of init_vp_index(), such hooks need to determine if the given channel is performance critical. init_vp_index() does this by parsing the channel's offer; it cannot rely on the device data structure (device_obj) to retrieve such information because the device data structure has not been allocated/linked with the channel by the time that init_vp_index() executes. A similar situation may hold in hv_is_alloced_cpu() (defined below); the adopted approach is to "cache" the device type of the channel, as computed by parsing the channel's offer, in the channel structure itself.
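For reference, the candidate-mask computation described above reduces to an XOR of two bitmasks, which is only correct while the allocated mask stays a subset of the node's CPU mask, the very invariant this patch restores. A hedged sketch with plain integers standing in for the kernel's cpumask API:

#include <stdio.h>

int main(void)
{
	/* Hypothetical 8-CPU node: CPUs 0-7 present. */
	unsigned int node_mask    = 0xff;
	/* CPUs already claimed by perf channels (must stay a subset). */
	unsigned int alloced_mask = 0x2c;	/* CPUs 2, 3 and 5 */

	/*
	 * XOR yields the free candidates only because alloced_mask is a
	 * subset of node_mask; a stale bit for a removed or migrated
	 * channel breaks this, hence the new clear/update hooks.
	 */
	unsigned int candidates = node_mask ^ alloced_mask;

	printf("candidates = 0x%02x\n", candidates);	/* 0xd3: CPUs 0,1,4,6,7 */
	return 0;
}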
Fixes: 7527810573436f ("Drivers: hv: vmbus: Introduce the CHANNELMSG_MODIFYCHANNEL message type") Signed-off-by: Andrea Parri (Microsoft) Reviewed-by: Michael Kelley Link: https://lore.kernel.org/r/20200522171901.204127-3-parri.andrea@gmail.com Signed-off-by: Wei Liu --- drivers/hv/channel_mgmt.c | 22 ++++++++++++++++------ drivers/hv/hyperv_vmbus.h | 48 +++++++++++++++++++++++++++++++++++++++++++++++ drivers/hv/vmbus_drv.c | 19 +++++++++++++------ include/linux/hyperv.h | 7 +++++++ 4 files changed, 84 insertions(+), 12 deletions(-) (limited to 'include/linux') diff --git a/drivers/hv/channel_mgmt.c b/drivers/hv/channel_mgmt.c index 89eaacf069a8..417a95e5094d 100644 --- a/drivers/hv/channel_mgmt.c +++ b/drivers/hv/channel_mgmt.c @@ -24,9 +24,9 @@ #include "hyperv_vmbus.h" -static void init_vp_index(struct vmbus_channel *channel, u16 dev_type); +static void init_vp_index(struct vmbus_channel *channel); -static const struct vmbus_device vmbus_devs[] = { +const struct vmbus_device vmbus_devs[] = { /* IDE */ { .dev_type = HV_IDE, HV_IDE_GUID, @@ -431,6 +431,13 @@ void hv_process_channel_removal(struct vmbus_channel *channel) spin_unlock_irqrestore(&primary_channel->lock, flags); } + /* + * If this is a "perf" channel, updates the hv_numa_map[] masks so that + * init_vp_index() can (re-)use the CPU. + */ + if (hv_is_perf_channel(channel)) + hv_clear_alloced_cpu(channel->target_cpu); + /* * Upon suspend, an in-use hv_sock channel is marked as "rescinded" and * the relid is invalidated; after hibernation, when the user-space app @@ -497,7 +504,7 @@ static void vmbus_add_channel_work(struct work_struct *work) if (!newchannel->device_obj) goto err_deq_chan; - newchannel->device_obj->device_id = hv_get_dev_type(newchannel); + newchannel->device_obj->device_id = newchannel->device_id; /* * Add the new device to the bus. This will kick off device-driver * binding which eventually invokes the device driver's AddDevice() @@ -580,7 +587,7 @@ static void vmbus_process_offer(struct vmbus_channel *newchannel) */ mutex_lock(&vmbus_connection.channel_mutex); - init_vp_index(newchannel, hv_get_dev_type(newchannel)); + init_vp_index(newchannel); /* Remember the channels that should be cleaned up upon suspend. */ if (is_hvsock_channel(newchannel) || is_sub_channel(newchannel)) @@ -676,9 +683,9 @@ static int next_numa_node_id; * evenly among all the available NUMA nodes. Once the node is assigned, * we will assign the CPU based on a simple round robin scheme. 
*/ -static void init_vp_index(struct vmbus_channel *channel, u16 dev_type) +static void init_vp_index(struct vmbus_channel *channel) { - bool perf_chn = vmbus_devs[dev_type].perf_device; + bool perf_chn = hv_is_perf_channel(channel); cpumask_var_t available_mask; struct cpumask *alloced_mask; u32 target_cpu; @@ -699,6 +706,8 @@ static void init_vp_index(struct vmbus_channel *channel, u16 dev_type) channel->target_cpu = VMBUS_CONNECT_CPU; channel->target_vp = hv_cpu_number_to_vp_number(VMBUS_CONNECT_CPU); + if (perf_chn) + hv_set_alloced_cpu(VMBUS_CONNECT_CPU); return; } @@ -862,6 +871,7 @@ static void vmbus_setup_channel_state(struct vmbus_channel *channel, sizeof(struct vmbus_channel_offer_channel)); channel->monitor_grp = (u8)offer->monitorid / 32; channel->monitor_bit = (u8)offer->monitorid % 32; + channel->device_id = hv_get_dev_type(channel); } /* diff --git a/drivers/hv/hyperv_vmbus.h b/drivers/hv/hyperv_vmbus.h index 5e5cebe5d048..40e2b9f91163 100644 --- a/drivers/hv/hyperv_vmbus.h +++ b/drivers/hv/hyperv_vmbus.h @@ -395,6 +395,54 @@ enum delay { MESSAGE_DELAY = 1, }; +extern const struct vmbus_device vmbus_devs[]; + +static inline bool hv_is_perf_channel(struct vmbus_channel *channel) +{ + return vmbus_devs[channel->device_id].perf_device; +} + +static inline bool hv_is_alloced_cpu(unsigned int cpu) +{ + struct vmbus_channel *channel, *sc; + + lockdep_assert_held(&vmbus_connection.channel_mutex); + /* + * List additions/deletions as well as updates of the target CPUs are + * protected by channel_mutex. + */ + list_for_each_entry(channel, &vmbus_connection.chn_list, listentry) { + if (!hv_is_perf_channel(channel)) + continue; + if (channel->target_cpu == cpu) + return true; + list_for_each_entry(sc, &channel->sc_list, sc_list) { + if (sc->target_cpu == cpu) + return true; + } + } + return false; +} + +static inline void hv_set_alloced_cpu(unsigned int cpu) +{ + cpumask_set_cpu(cpu, &hv_context.hv_numa_map[cpu_to_node(cpu)]); +} + +static inline void hv_clear_alloced_cpu(unsigned int cpu) +{ + if (hv_is_alloced_cpu(cpu)) + return; + cpumask_clear_cpu(cpu, &hv_context.hv_numa_map[cpu_to_node(cpu)]); +} + +static inline void hv_update_alloced_cpus(unsigned int old_cpu, + unsigned int new_cpu) +{ + hv_set_alloced_cpu(new_cpu); + hv_clear_alloced_cpu(old_cpu); +} + #ifdef CONFIG_HYPERV_TESTING int hv_debug_add_dev_dir(struct hv_device *dev); diff --git a/drivers/hv/vmbus_drv.c b/drivers/hv/vmbus_drv.c index c2a4a7c0b99a..47747755d2e1 100644 --- a/drivers/hv/vmbus_drv.c +++ b/drivers/hv/vmbus_drv.c @@ -1687,8 +1687,8 @@ static ssize_t target_cpu_show(struct vmbus_channel *channel, char *buf) static ssize_t target_cpu_store(struct vmbus_channel *channel, const char *buf, size_t count) { + u32 target_cpu, origin_cpu; ssize_t ret = count; - u32 target_cpu; if (vmbus_proto_version < VERSION_WIN10_V4_1) return -EIO; @@ -1741,7 +1741,8 @@ static ssize_t target_cpu_store(struct vmbus_channel *channel, goto cpu_store_unlock; } - if (channel->target_cpu == target_cpu) + origin_cpu = channel->target_cpu; + if (target_cpu == origin_cpu) goto cpu_store_unlock; if (vmbus_send_modifychannel(channel->offermsg.child_relid, @@ -1763,14 +1764,20 @@ static ssize_t target_cpu_store(struct vmbus_channel *channel, * in on a CPU that is different from the channel target_cpu value. 
*/ - if (channel->change_target_cpu_callback) - (*channel->change_target_cpu_callback)(channel, - channel->target_cpu, target_cpu); - channel->target_cpu = target_cpu; channel->target_vp = hv_cpu_number_to_vp_number(target_cpu); channel->numa_node = cpu_to_node(target_cpu); + /* See init_vp_index(). */ + if (hv_is_perf_channel(channel)) + hv_update_alloced_cpus(origin_cpu, target_cpu); + + /* Currently set only for storvsc channels. */ + if (channel->change_target_cpu_callback) { + (*channel->change_target_cpu_callback)(channel, + origin_cpu, target_cpu); + } + cpu_store_unlock: mutex_unlock(&vmbus_connection.channel_mutex); cpus_read_unlock(); diff --git a/include/linux/hyperv.h b/include/linux/hyperv.h index d783847d8cb4..40df3103e890 100644 --- a/include/linux/hyperv.h +++ b/include/linux/hyperv.h @@ -901,6 +901,13 @@ struct vmbus_channel { bool probe_done; + /* + * Cache the device ID here for easy access; this is useful, in + * particular, in situations where the channel's device_obj has + * not been allocated/initialized yet. + */ + u16 device_id; + /* * We must offload the handling of the primary/sub channels * from the single-threaded vmbus_connection.work_queue to -- cgit v1.2.3 From cd16627fc0468564fdd60f20ad52420b87195127 Mon Sep 17 00:00:00 2001 From: Bartosz Golaszewski Date: Sat, 23 May 2020 15:27:10 +0200 Subject: net: devres: provide devm_register_netdev() Provide devm_register_netdev() - a device resource managed variant of register_netdev(). This new helper will only work for net_device structs that are also already managed by devres. Signed-off-by: Bartosz Golaszewski Signed-off-by: David S. Miller --- Documentation/driver-api/driver-model/devres.rst | 1 + include/linux/netdevice.h | 2 + net/devres.c | 55 ++++++++++++++++++++++++ 3 files changed, 58 insertions(+) (limited to 'include/linux') diff --git a/Documentation/driver-api/driver-model/devres.rst b/Documentation/driver-api/driver-model/devres.rst index 50df28d20fa7..fc242ed4bde5 100644 --- a/Documentation/driver-api/driver-model/devres.rst +++ b/Documentation/driver-api/driver-model/devres.rst @@ -375,6 +375,7 @@ MUX NET devm_alloc_etherdev() devm_alloc_etherdev_mqs() + devm_register_netdev() PER-CPU MEM devm_alloc_percpu() diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index a18f8fdf4260..1a96e9c4ec36 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -4280,6 +4280,8 @@ struct net_device *alloc_netdev_mqs(int sizeof_priv, const char *name, int register_netdev(struct net_device *dev); void unregister_netdev(struct net_device *dev); +int devm_register_netdev(struct device *dev, struct net_device *ndev); + /* General hardware address lists handling functions */ int __hw_addr_sync(struct netdev_hw_addr_list *to_list, struct netdev_hw_addr_list *from_list, int addr_len); diff --git a/net/devres.c b/net/devres.c index b97b0c5a8216..57a6a88d11f6 100644 --- a/net/devres.c +++ b/net/devres.c @@ -38,3 +38,58 @@ struct net_device *devm_alloc_etherdev_mqs(struct device *dev, int sizeof_priv, return dr->ndev; } EXPORT_SYMBOL(devm_alloc_etherdev_mqs); + +static void devm_netdev_release(struct device *dev, void *this) +{ + struct net_device_devres *res = this; + + unregister_netdev(res->ndev); +} + +static int netdev_devres_match(struct device *dev, void *this, void *match_data) +{ + struct net_device_devres *res = this; + struct net_device *ndev = match_data; + + return ndev == res->ndev; +} + +/** + * devm_register_netdev - resource managed variant of register_netdev() + * @dev: 
managing device for this netdev - usually the parent device + * @ndev: device to register + * + * This is a devres variant of register_netdev() for which the unregister + * function will be called automatically when the managing device is + * detached. Note: the net_device used must also be resource managed by + * the same struct device. + */ +int devm_register_netdev(struct device *dev, struct net_device *ndev) +{ + struct net_device_devres *dr; + int ret; + + /* struct net_device must itself be managed. For now a managed netdev + * can only be allocated by devm_alloc_etherdev_mqs() so the check is + * straightforward. + */ + if (WARN_ON(!devres_find(dev, devm_free_netdev, + netdev_devres_match, ndev))) + return -EINVAL; + + dr = devres_alloc(devm_netdev_release, sizeof(*dr), GFP_KERNEL); + if (!dr) + return -ENOMEM; + + ret = register_netdev(ndev); + if (ret) { + devres_free(dr); + return ret; + } + + dr->ndev = ndev; + devres_add(ndev->dev.parent, dr); + + return 0; +} +EXPORT_SYMBOL(devm_register_netdev); -- cgit v1.2.3 From 1f1ec622623fe8e49a1036d4127b895a0f0e2c43 Mon Sep 17 00:00:00 2001 From: Boris Brezillon Date: Tue, 5 May 2020 12:13:35 +0200 Subject: mtd: rawnand: Propagate CS selection to sub operations Some controllers using the instruction parse infrastructure might need to know which CS a specific sub-operation is targeting. Let's propagate this information. Signed-off-by: Boris Brezillon Reviewed-by: Miquel Raynal Signed-off-by: Miquel Raynal Link: https://lore.kernel.org/linux-mtd/20200505101353.1776394-2-boris.brezillon@collabora.com --- drivers/mtd/nand/raw/nand_base.c | 3 ++- include/linux/mtd/rawnand.h | 2 ++ 2 files changed, 4 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/drivers/mtd/nand/raw/nand_base.c b/drivers/mtd/nand/raw/nand_base.c index 2d2a216af120..c4d6d5482b31 100644 --- a/drivers/mtd/nand/raw/nand_base.c +++ b/drivers/mtd/nand/raw/nand_base.c @@ -2174,7 +2174,7 @@ static void nand_op_parser_trace(const struct nand_op_parser_ctx *ctx) char *prefix = " "; unsigned int i; - pr_debug("executing subop:\n"); + pr_debug("executing subop (CS%d):\n", ctx->subop.cs); for (i = 0; i < ctx->ninstrs; i++) { instr = &ctx->instrs[i]; @@ -2238,6 +2238,7 @@ int nand_op_parser_exec_op(struct nand_chip *chip, const struct nand_operation *op, bool check_only) { struct nand_op_parser_ctx ctx = { + .subop.cs = op->cs, .subop.instrs = op->instrs, .instrs = op->instrs, .ninstrs = op->ninstrs, diff --git a/include/linux/mtd/rawnand.h b/include/linux/mtd/rawnand.h index 0f45b6984ad1..a3bac8824937 100644 --- a/include/linux/mtd/rawnand.h +++ b/include/linux/mtd/rawnand.h @@ -709,6 +709,7 @@ struct nand_op_instr { /** * struct nand_subop - a sub operation + * @cs: the CS line to select for this NAND sub-operation * @instrs: array of instructions * @ninstrs: length of the @instrs array * @first_instr_start_off: offset to start from for the first instruction @@ -724,6 +725,7 @@ struct nand_op_instr { * controller driver. */ struct nand_subop { + unsigned int cs; const struct nand_op_instr *instrs; unsigned int ninstrs; unsigned int first_instr_start_off; -- cgit v1.2.3 From c8ae3f744ddca0da164bcacee42d1d4b6fe7027d Mon Sep 17 00:00:00 2001 From: Miquel Raynal Date: Tue, 19 May 2020 09:45:42 +0200 Subject: lib/bch: Rework a little bit the exported function names There are four exported functions, all suffixed by _bch, which is clearly not the norm. Let's rename them by prefixing them with bch_ instead.
This is a mechanical change: init_bch -> bch_init free_bch -> bch_free encode_bch -> bch_encode decode_bch -> bch_decode Signed-off-by: Miquel Raynal Reviewed-by: Boris Brezillon Link: https://lore.kernel.org/linux-mtd/20200519074549.23673-2-miquel.raynal@bootlin.com --- drivers/mtd/devices/docg3.c | 10 +++---- drivers/mtd/nand/raw/nand_bch.c | 10 +++---- include/linux/bch.h | 8 +++--- lib/bch.c | 64 ++++++++++++++++++++--------------------- 4 files changed, 46 insertions(+), 46 deletions(-) (limited to 'include/linux') diff --git a/drivers/mtd/devices/docg3.c b/drivers/mtd/devices/docg3.c index eb0f4600efd1..799df8d03357 100644 --- a/drivers/mtd/devices/docg3.c +++ b/drivers/mtd/devices/docg3.c @@ -647,7 +647,7 @@ static int doc_ecc_bch_fix_data(struct docg3 *docg3, void *buf, u8 *hwecc) for (i = 0; i < DOC_ECC_BCH_SIZE; i++) ecc[i] = bitrev8(hwecc[i]); - numerrs = decode_bch(docg3->cascade->bch, NULL, + numerrs = bch_decode(docg3->cascade->bch, NULL, DOC_ECC_BCH_COVERED_BYTES, NULL, ecc, NULL, errorpos); BUG_ON(numerrs == -EINVAL); @@ -1984,8 +1984,8 @@ static int __init docg3_probe(struct platform_device *pdev) return ret; cascade->base = base; mutex_init(&cascade->lock); - cascade->bch = init_bch(DOC_ECC_BCH_M, DOC_ECC_BCH_T, - DOC_ECC_BCH_PRIMPOLY); + cascade->bch = bch_init(DOC_ECC_BCH_M, DOC_ECC_BCH_T, + DOC_ECC_BCH_PRIMPOLY); if (!cascade->bch) return ret; @@ -2021,7 +2021,7 @@ notfound: ret = -ENODEV; dev_info(dev, "No supported DiskOnChip found\n"); err_probe: - free_bch(cascade->bch); + bch_free(cascade->bch); for (floor = 0; floor < DOC_MAX_NBFLOORS; floor++) if (cascade->floors[floor]) doc_release_device(cascade->floors[floor]); @@ -2045,7 +2045,7 @@ static int docg3_release(struct platform_device *pdev) if (cascade->floors[floor]) doc_release_device(cascade->floors[floor]); - free_bch(docg3->cascade->bch); + bch_free(docg3->cascade->bch); return 0; } diff --git a/drivers/mtd/nand/raw/nand_bch.c b/drivers/mtd/nand/raw/nand_bch.c index 17527310c3a1..d95fcc7358e9 100644 --- a/drivers/mtd/nand/raw/nand_bch.c +++ b/drivers/mtd/nand/raw/nand_bch.c @@ -41,7 +41,7 @@ int nand_bch_calculate_ecc(struct nand_chip *chip, const unsigned char *buf, unsigned int i; memset(code, 0, chip->ecc.bytes); - encode_bch(nbc->bch, buf, chip->ecc.size, code); + bch_encode(nbc->bch, buf, chip->ecc.size, code); /* apply mask so that an erased page is a valid codeword */ for (i = 0; i < chip->ecc.bytes; i++) @@ -67,7 +67,7 @@ int nand_bch_correct_data(struct nand_chip *chip, unsigned char *buf, unsigned int *errloc = nbc->errloc; int i, count; - count = decode_bch(nbc->bch, NULL, chip->ecc.size, read_ecc, calc_ecc, + count = bch_decode(nbc->bch, NULL, chip->ecc.size, read_ecc, calc_ecc, NULL, errloc); if (count > 0) { for (i = 0; i < count; i++) { @@ -130,7 +130,7 @@ struct nand_bch_control *nand_bch_init(struct mtd_info *mtd) if (!nbc) goto fail; - nbc->bch = init_bch(m, t, 0); + nbc->bch = bch_init(m, t, 0); if (!nbc->bch) goto fail; @@ -182,7 +182,7 @@ struct nand_bch_control *nand_bch_init(struct mtd_info *mtd) goto fail; memset(erased_page, 0xff, eccsize); - encode_bch(nbc->bch, erased_page, eccsize, nbc->eccmask); + bch_encode(nbc->bch, erased_page, eccsize, nbc->eccmask); kfree(erased_page); for (i = 0; i < eccbytes; i++) @@ -205,7 +205,7 @@ EXPORT_SYMBOL(nand_bch_init); void nand_bch_free(struct nand_bch_control *nbc) { if (nbc) { - free_bch(nbc->bch); + bch_free(nbc->bch); kfree(nbc->errloc); kfree(nbc->eccmask); kfree(nbc); diff --git a/include/linux/bch.h b/include/linux/bch.h index 
aa765af85c38..9c35e7cd5890 100644 --- a/include/linux/bch.h +++ b/include/linux/bch.h @@ -53,14 +53,14 @@ struct bch_control { struct gf_poly *poly_2t[4]; }; -struct bch_control *init_bch(int m, int t, unsigned int prim_poly); +struct bch_control *bch_init(int m, int t, unsigned int prim_poly); -void free_bch(struct bch_control *bch); +void bch_free(struct bch_control *bch); -void encode_bch(struct bch_control *bch, const uint8_t *data, +void bch_encode(struct bch_control *bch, const uint8_t *data, unsigned int len, uint8_t *ecc); -int decode_bch(struct bch_control *bch, const uint8_t *data, unsigned int len, +int bch_decode(struct bch_control *bch, const uint8_t *data, unsigned int len, const uint8_t *recv_ecc, const uint8_t *calc_ecc, const unsigned int *syn, unsigned int *errloc); diff --git a/lib/bch.c b/lib/bch.c index 052d3fb753a0..1091841ac716 100644 --- a/lib/bch.c +++ b/lib/bch.c @@ -23,15 +23,15 @@ * This library provides runtime configurable encoding/decoding of binary * Bose-Chaudhuri-Hocquenghem (BCH) codes. * - * Call init_bch to get a pointer to a newly allocated bch_control structure for + * Call bch_init to get a pointer to a newly allocated bch_control structure for * the given m (Galois field order), t (error correction capability) and * (optional) primitive polynomial parameters. * - * Call encode_bch to compute and store ecc parity bytes to a given buffer. - * Call decode_bch to detect and locate errors in received data. + * Call bch_encode to compute and store ecc parity bytes to a given buffer. + * Call bch_decode to detect and locate errors in received data. * * On systems supporting hw BCH features, intermediate results may be provided - * to decode_bch in order to skip certain steps. See decode_bch() documentation + * to bch_decode in order to skip certain steps. See bch_decode() documentation * for details. * * Option CONFIG_BCH_CONST_PARAMS can be used to force fixed values of @@ -115,9 +115,9 @@ struct gf_poly_deg1 { }; /* - * same as encode_bch(), but process input data one byte at a time + * same as bch_encode(), but process input data one byte at a time */ -static void encode_bch_unaligned(struct bch_control *bch, +static void bch_encode_unaligned(struct bch_control *bch, const unsigned char *data, unsigned int len, uint32_t *ecc) { @@ -174,7 +174,7 @@ static void store_ecc8(struct bch_control *bch, uint8_t *dst, } /** - * encode_bch - calculate BCH ecc parity of data + * bch_encode - calculate BCH ecc parity of data * @bch: BCH control structure * @data: data to encode * @len: data length in bytes @@ -187,7 +187,7 @@ static void store_ecc8(struct bch_control *bch, uint8_t *dst, * The exact number of computed ecc parity bits is given by member @ecc_bits of * @bch; it may be less than m*t for large values of t. */ -void encode_bch(struct bch_control *bch, const uint8_t *data, +void bch_encode(struct bch_control *bch, const uint8_t *data, unsigned int len, uint8_t *ecc) { const unsigned int l = BCH_ECC_WORDS(bch)-1; @@ -215,7 +215,7 @@ void encode_bch(struct bch_control *bch, const uint8_t *data, m = ((unsigned long)data) & 3; if (m) { mlen = (len < (4-m)) ? 
len : 4-m; - encode_bch_unaligned(bch, data, mlen, bch->ecc_buf); + bch_encode_unaligned(bch, data, mlen, bch->ecc_buf); data += mlen; len -= mlen; } @@ -255,13 +255,13 @@ void encode_bch(struct bch_control *bch, const uint8_t *data, /* process last unaligned bytes */ if (len) - encode_bch_unaligned(bch, data, len, bch->ecc_buf); + bch_encode_unaligned(bch, data, len, bch->ecc_buf); /* store ecc parity bytes into original parity buffer */ if (ecc) store_ecc8(bch, ecc, bch->ecc_buf); } -EXPORT_SYMBOL_GPL(encode_bch); +EXPORT_SYMBOL_GPL(bch_encode); static inline int modulo(struct bch_control *bch, unsigned int v) { @@ -952,7 +952,7 @@ static int chien_search(struct bch_control *bch, unsigned int len, #endif /* USE_CHIEN_SEARCH */ /** - * decode_bch - decode received codeword and find bit error locations + * bch_decode - decode received codeword and find bit error locations * @bch: BCH control structure * @data: received data, ignored if @calc_ecc is provided * @len: data length in bytes, must always be provided @@ -966,22 +966,22 @@ static int chien_search(struct bch_control *bch, unsigned int len, * invalid parameters were provided * * Depending on the available hw BCH support and the need to compute @calc_ecc - * separately (using encode_bch()), this function should be called with one of + * separately (using bch_encode()), this function should be called with one of * the following parameter configurations - * * by providing @data and @recv_ecc only: - * decode_bch(@bch, @data, @len, @recv_ecc, NULL, NULL, @errloc) + * bch_decode(@bch, @data, @len, @recv_ecc, NULL, NULL, @errloc) * * by providing @recv_ecc and @calc_ecc: - * decode_bch(@bch, NULL, @len, @recv_ecc, @calc_ecc, NULL, @errloc) + * bch_decode(@bch, NULL, @len, @recv_ecc, @calc_ecc, NULL, @errloc) * * by providing ecc = recv_ecc XOR calc_ecc: - * decode_bch(@bch, NULL, @len, NULL, ecc, NULL, @errloc) + * bch_decode(@bch, NULL, @len, NULL, ecc, NULL, @errloc) * * by providing syndrome results @syn: - * decode_bch(@bch, NULL, @len, NULL, NULL, @syn, @errloc) + * bch_decode(@bch, NULL, @len, NULL, NULL, @syn, @errloc) * - * Once decode_bch() has successfully returned with a positive value, error + * Once bch_decode() has successfully returned with a positive value, error * locations returned in array @errloc should be interpreted as follows - * * if (errloc[n] >= 8*len), then n-th error is located in ecc (no need for @@ -993,7 +993,7 @@ static int chien_search(struct bch_control *bch, unsigned int len, * Note that this function does not perform any data correction by itself, it * merely indicates error locations. */ -int decode_bch(struct bch_control *bch, const uint8_t *data, unsigned int len, +int bch_decode(struct bch_control *bch, const uint8_t *data, unsigned int len, const uint8_t *recv_ecc, const uint8_t *calc_ecc, const unsigned int *syn, unsigned int *errloc) { @@ -1012,7 +1012,7 @@ int decode_bch(struct bch_control *bch, const uint8_t *data, unsigned int len, /* compute received data ecc into an internal buffer */ if (!data || !recv_ecc) return -EINVAL; - encode_bch(bch, data, len, NULL); + bch_encode(bch, data, len, NULL); } else { /* load provided calculated ecc */ load_ecc8(bch, bch->ecc_buf, calc_ecc); @@ -1053,7 +1053,7 @@ int decode_bch(struct bch_control *bch, const uint8_t *data, unsigned int len, } return (err >= 0) ? 
err : -EBADMSG; } -EXPORT_SYMBOL_GPL(decode_bch); +EXPORT_SYMBOL_GPL(bch_decode); /* * generate Galois field lookup tables @@ -1236,7 +1236,7 @@ finish: } /** - * init_bch - initialize a BCH encoder/decoder + * bch_init - initialize a BCH encoder/decoder * @m: Galois field order, should be in the range 5-15 * @t: maximum error correction capability, in bits * @prim_poly: user-provided primitive polynomial (or 0 to use default) * * Returns: * a newly allocated BCH control structure if successful, NULL otherwise * * This initialization can take some time, as lookup tables are built for fast * encoding/decoding; make sure not to call this function from a time critical - * path. Usually, init_bch() should be called on module/driver init and - * free_bch() should be called to release memory on exit. + * path. Usually, bch_init() should be called on module/driver init and + * bch_free() should be called to release memory on exit. * * You may provide your own primitive polynomial of degree @m in argument - * @prim_poly, or let init_bch() use its default polynomial. + * @prim_poly, or let bch_init() use its default polynomial. * - * Once init_bch() has successfully returned a pointer to a newly allocated + * Once bch_init() has successfully returned a pointer to a newly allocated * BCH control structure, ecc length in bytes is given by member @ecc_bytes of * the structure. */ -struct bch_control *init_bch(int m, int t, unsigned int prim_poly) +struct bch_control *bch_init(int m, int t, unsigned int prim_poly) { int err = 0; unsigned int i, words; @@ -1347,16 +1347,16 @@ struct bch_control *init_bch(int m, int t, unsigned int prim_poly) return bch; fail: - free_bch(bch); + bch_free(bch); return NULL; } -EXPORT_SYMBOL_GPL(init_bch); +EXPORT_SYMBOL_GPL(bch_init); /** - * free_bch - free the BCH control structure + * bch_free - free the BCH control structure * @bch: BCH control structure to release */ -void free_bch(struct bch_control *bch) +void bch_free(struct bch_control *bch) { unsigned int i; @@ -1377,7 +1377,7 @@ void free_bch(struct bch_control *bch) kfree(bch); } } -EXPORT_SYMBOL_GPL(free_bch); +EXPORT_SYMBOL_GPL(bch_free); MODULE_LICENSE("GPL"); MODULE_AUTHOR("Ivan Djelic "); -- cgit v1.2.3 From 1759279ad138cb0a903224a89f4bf40f69c417e8 Mon Sep 17 00:00:00 2001 From: Miquel Raynal Date: Tue, 19 May 2020 09:45:43 +0200 Subject: lib/bch: Allow easy bit swapping It seems that several hardware ECC engines use a swapped representation of bytes compared to software. This might have to do with how the ECC engine is wired to the NAND controller or the order the bits are passed to the hardware BCH logic. This means that when the software BCH engine is working in conjunction with data generated by hardware, sometimes we might need to swap the bits inside bytes, e.g.: 0x0A = b0000_1010 -> b0101_0000 = 0x50 Make it possible by adding a boolean to the BCH initialization routine. Regarding the implementation itself, this is a rather simple approach that can probably be enhanced in the future by preparing the ->a_{mod,pow}_tab tables with the swapping in mind.
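The transformation itself is a plain per-byte bit reversal. A standalone sketch that rebuilds a lookup table like the patch's swap_bits_table and checks the 0x0A -> 0x50 example from the changelog:

#include <stdio.h>
#include <stdint.h>

/* Reverse the bit order within one byte, e.g. 0x0A -> 0x50. */
static uint8_t reverse8(uint8_t in)
{
	uint8_t out = 0;
	int i;

	for (i = 0; i < 8; i++)
		if (in & (1u << i))
			out |= 1u << (7 - i);
	return out;
}

int main(void)
{
	/* The patch uses a precomputed 256-entry table; build ours here. */
	uint8_t table[256];
	int i;

	for (i = 0; i < 256; i++)
		table[i] = reverse8((uint8_t)i);

	printf("0x0a -> 0x%02x\n", table[0x0a]);	/* prints 0x50 */
	return 0;
}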
Suggested-by: Boris Brezillon Signed-off-by: Miquel Raynal Reviewed-by: Boris Brezillon Link: https://lore.kernel.org/linux-mtd/20200519074549.23673-3-miquel.raynal@bootlin.com --- drivers/mtd/devices/docg3.c | 2 +- drivers/mtd/nand/raw/nand_bch.c | 2 +- include/linux/bch.h | 5 ++- lib/bch.c | 90 ++++++++++++++++++++++++++++++++++------- 4 files changed, 82 insertions(+), 17 deletions(-) (limited to 'include/linux') diff --git a/drivers/mtd/devices/docg3.c b/drivers/mtd/devices/docg3.c index 799df8d03357..a030792115bc 100644 --- a/drivers/mtd/devices/docg3.c +++ b/drivers/mtd/devices/docg3.c @@ -1985,7 +1985,7 @@ static int __init docg3_probe(struct platform_device *pdev) cascade->base = base; mutex_init(&cascade->lock); cascade->bch = bch_init(DOC_ECC_BCH_M, DOC_ECC_BCH_T, - DOC_ECC_BCH_PRIMPOLY); + DOC_ECC_BCH_PRIMPOLY, false); if (!cascade->bch) return ret; diff --git a/drivers/mtd/nand/raw/nand_bch.c b/drivers/mtd/nand/raw/nand_bch.c index d95fcc7358e9..d5af8c5fd02f 100644 --- a/drivers/mtd/nand/raw/nand_bch.c +++ b/drivers/mtd/nand/raw/nand_bch.c @@ -130,7 +130,7 @@ struct nand_bch_control *nand_bch_init(struct mtd_info *mtd) if (!nbc) goto fail; - nbc->bch = bch_init(m, t, 0); + nbc->bch = bch_init(m, t, 0, false); if (!nbc->bch) goto fail; diff --git a/include/linux/bch.h b/include/linux/bch.h index 9c35e7cd5890..85fdce83d4e2 100644 --- a/include/linux/bch.h +++ b/include/linux/bch.h @@ -33,6 +33,7 @@ * @cache: log-based polynomial representation buffer * @elp: error locator polynomial * @poly_2t: temporary polynomials of degree 2t + * @swap_bits: swap bits within data and syndrome bytes */ struct bch_control { unsigned int m; @@ -51,9 +52,11 @@ struct bch_control { int *cache; struct gf_poly *elp; struct gf_poly *poly_2t[4]; + bool swap_bits; }; -struct bch_control *bch_init(int m, int t, unsigned int prim_poly); +struct bch_control *bch_init(int m, int t, unsigned int prim_poly, + bool swap_bits); void bch_free(struct bch_control *bch); diff --git a/lib/bch.c b/lib/bch.c index 1091841ac716..7c031ee8b93b 100644 --- a/lib/bch.c +++ b/lib/bch.c @@ -114,6 +114,49 @@ struct gf_poly_deg1 { unsigned int c[2]; }; +static u8 swap_bits_table[] = { + 0x00, 0x80, 0x40, 0xc0, 0x20, 0xa0, 0x60, 0xe0, + 0x10, 0x90, 0x50, 0xd0, 0x30, 0xb0, 0x70, 0xf0, + 0x08, 0x88, 0x48, 0xc8, 0x28, 0xa8, 0x68, 0xe8, + 0x18, 0x98, 0x58, 0xd8, 0x38, 0xb8, 0x78, 0xf8, + 0x04, 0x84, 0x44, 0xc4, 0x24, 0xa4, 0x64, 0xe4, + 0x14, 0x94, 0x54, 0xd4, 0x34, 0xb4, 0x74, 0xf4, + 0x0c, 0x8c, 0x4c, 0xcc, 0x2c, 0xac, 0x6c, 0xec, + 0x1c, 0x9c, 0x5c, 0xdc, 0x3c, 0xbc, 0x7c, 0xfc, + 0x02, 0x82, 0x42, 0xc2, 0x22, 0xa2, 0x62, 0xe2, + 0x12, 0x92, 0x52, 0xd2, 0x32, 0xb2, 0x72, 0xf2, + 0x0a, 0x8a, 0x4a, 0xca, 0x2a, 0xaa, 0x6a, 0xea, + 0x1a, 0x9a, 0x5a, 0xda, 0x3a, 0xba, 0x7a, 0xfa, + 0x06, 0x86, 0x46, 0xc6, 0x26, 0xa6, 0x66, 0xe6, + 0x16, 0x96, 0x56, 0xd6, 0x36, 0xb6, 0x76, 0xf6, + 0x0e, 0x8e, 0x4e, 0xce, 0x2e, 0xae, 0x6e, 0xee, + 0x1e, 0x9e, 0x5e, 0xde, 0x3e, 0xbe, 0x7e, 0xfe, + 0x01, 0x81, 0x41, 0xc1, 0x21, 0xa1, 0x61, 0xe1, + 0x11, 0x91, 0x51, 0xd1, 0x31, 0xb1, 0x71, 0xf1, + 0x09, 0x89, 0x49, 0xc9, 0x29, 0xa9, 0x69, 0xe9, + 0x19, 0x99, 0x59, 0xd9, 0x39, 0xb9, 0x79, 0xf9, + 0x05, 0x85, 0x45, 0xc5, 0x25, 0xa5, 0x65, 0xe5, + 0x15, 0x95, 0x55, 0xd5, 0x35, 0xb5, 0x75, 0xf5, + 0x0d, 0x8d, 0x4d, 0xcd, 0x2d, 0xad, 0x6d, 0xed, + 0x1d, 0x9d, 0x5d, 0xdd, 0x3d, 0xbd, 0x7d, 0xfd, + 0x03, 0x83, 0x43, 0xc3, 0x23, 0xa3, 0x63, 0xe3, + 0x13, 0x93, 0x53, 0xd3, 0x33, 0xb3, 0x73, 0xf3, + 0x0b, 0x8b, 0x4b, 0xcb, 0x2b, 0xab, 0x6b, 0xeb, + 0x1b, 0x9b, 0x5b, 
0xdb, 0x3b, 0xbb, 0x7b, 0xfb, + 0x07, 0x87, 0x47, 0xc7, 0x27, 0xa7, 0x67, 0xe7, + 0x17, 0x97, 0x57, 0xd7, 0x37, 0xb7, 0x77, 0xf7, + 0x0f, 0x8f, 0x4f, 0xcf, 0x2f, 0xaf, 0x6f, 0xef, + 0x1f, 0x9f, 0x5f, 0xdf, 0x3f, 0xbf, 0x7f, 0xff, +}; + +static u8 swap_bits(struct bch_control *bch, u8 in) +{ + if (!bch->swap_bits) + return in; + + return swap_bits_table[in]; +} + /* * same as bch_encode(), but process input data one byte at a time */ @@ -126,7 +169,9 @@ static void bch_encode_unaligned(struct bch_control *bch, const int l = BCH_ECC_WORDS(bch)-1; while (len--) { - p = bch->mod8_tab + (l+1)*(((ecc[0] >> 24)^(*data++)) & 0xff); + u8 tmp = swap_bits(bch, *data++); + + p = bch->mod8_tab + (l+1)*(((ecc[0] >> 24)^(tmp)) & 0xff); for (i = 0; i < l; i++) ecc[i] = ((ecc[i] << 8)|(ecc[i+1] >> 24))^(*p++); @@ -145,10 +190,16 @@ static void load_ecc8(struct bch_control *bch, uint32_t *dst, unsigned int i, nwords = BCH_ECC_WORDS(bch)-1; for (i = 0; i < nwords; i++, src += 4) - dst[i] = (src[0] << 24)|(src[1] << 16)|(src[2] << 8)|src[3]; + dst[i] = ((u32)swap_bits(bch, src[0]) << 24) | + ((u32)swap_bits(bch, src[1]) << 16) | + ((u32)swap_bits(bch, src[2]) << 8) | + swap_bits(bch, src[3]); memcpy(pad, src, BCH_ECC_BYTES(bch)-4*nwords); - dst[nwords] = (pad[0] << 24)|(pad[1] << 16)|(pad[2] << 8)|pad[3]; + dst[nwords] = ((u32)swap_bits(bch, pad[0]) << 24) | + ((u32)swap_bits(bch, pad[1]) << 16) | + ((u32)swap_bits(bch, pad[2]) << 8) | + swap_bits(bch, pad[3]); } /* @@ -161,15 +212,15 @@ static void store_ecc8(struct bch_control *bch, uint8_t *dst, unsigned int i, nwords = BCH_ECC_WORDS(bch)-1; for (i = 0; i < nwords; i++) { - *dst++ = (src[i] >> 24); - *dst++ = (src[i] >> 16) & 0xff; - *dst++ = (src[i] >> 8) & 0xff; - *dst++ = (src[i] >> 0) & 0xff; + *dst++ = swap_bits(bch, src[i] >> 24); + *dst++ = swap_bits(bch, src[i] >> 16); + *dst++ = swap_bits(bch, src[i] >> 8); + *dst++ = swap_bits(bch, src[i]); } - pad[0] = (src[nwords] >> 24); - pad[1] = (src[nwords] >> 16) & 0xff; - pad[2] = (src[nwords] >> 8) & 0xff; - pad[3] = (src[nwords] >> 0) & 0xff; + pad[0] = swap_bits(bch, src[nwords] >> 24); + pad[1] = swap_bits(bch, src[nwords] >> 16); + pad[2] = swap_bits(bch, src[nwords] >> 8); + pad[3] = swap_bits(bch, src[nwords]); memcpy(dst, pad, BCH_ECC_BYTES(bch)-4*nwords); } @@ -240,7 +291,13 @@ void bch_encode(struct bch_control *bch, const uint8_t *data, */ while (mlen--) { /* input data is read in big-endian format */ - w = r[0]^cpu_to_be32(*pdata++); + w = cpu_to_be32(*pdata++); + if (bch->swap_bits) + w = (u32)swap_bits(bch, w) | + ((u32)swap_bits(bch, w >> 8) << 8) | + ((u32)swap_bits(bch, w >> 16) << 16) | + ((u32)swap_bits(bch, w >> 24) << 24); + w ^= r[0]; p0 = tab0 + (l+1)*((w >> 0) & 0xff); p1 = tab1 + (l+1)*((w >> 8) & 0xff); p2 = tab2 + (l+1)*((w >> 16) & 0xff); @@ -1048,7 +1105,9 @@ int bch_decode(struct bch_control *bch, const uint8_t *data, unsigned int len, break; } errloc[i] = nbits-1-errloc[i]; - errloc[i] = (errloc[i] & ~7)|(7-(errloc[i] & 7)); + if (!bch->swap_bits) + errloc[i] = (errloc[i] & ~7) | + (7-(errloc[i] & 7)); } } return (err >= 0) ? 
err : -EBADMSG; @@ -1240,6 +1299,7 @@ finish: * @m: Galois field order, should be in the range 5-15 * @t: maximum error correction capability, in bits * @prim_poly: user-provided primitive polynomial (or 0 to use default) + * @swap_bits: swap bits within data and syndrome bytes * * Returns: * a newly allocated BCH control structure if successful, NULL otherwise @@ -1256,7 +1316,8 @@ finish: * BCH control structure, ecc length in bytes is given by member @ecc_bytes of * the structure. */ -struct bch_control *bch_init(int m, int t, unsigned int prim_poly) +struct bch_control *bch_init(int m, int t, unsigned int prim_poly, + bool swap_bits) { int err = 0; unsigned int i, words; @@ -1321,6 +1382,7 @@ struct bch_control *bch_init(int m, int t, unsigned int prim_poly) bch->syn = bch_alloc(2*t*sizeof(*bch->syn), &err); bch->cache = bch_alloc(2*t*sizeof(*bch->cache), &err); bch->elp = bch_alloc((t+1)*sizeof(struct gf_poly_deg1), &err); + bch->swap_bits = swap_bits; for (i = 0; i < ARRAY_SIZE(bch->poly_2t); i++) bch->poly_2t[i] = bch_alloc(GF_POLY_SZ(2*t), &err); -- cgit v1.2.3 From d7904619ea0636411f40fb1f34057288c0783ecf Mon Sep 17 00:00:00 2001 From: Miquel Raynal Date: Tue, 19 May 2020 09:45:45 +0200 Subject: mtd: rawnand: Add nand_extract_bits() There are cases where ECC bytes are not byte-aligned. Indeed, BCH implies using a number of ECC bits, which are not always a multiple of 8. We then need a helper like nand_extract_bits() to extract these syndromes from a buffer. Signed-off-by: Miquel Raynal Reviewed-by: Boris Brezillon Link: https://lore.kernel.org/linux-mtd/20200519074549.23673-5-miquel.raynal@bootlin.com --- drivers/mtd/nand/raw/nand_base.c | 44 ++++++++++++++++++++++++++++++++++++++++ include/linux/mtd/rawnand.h | 4 ++++ 2 files changed, 48 insertions(+) (limited to 'include/linux') diff --git a/drivers/mtd/nand/raw/nand_base.c b/drivers/mtd/nand/raw/nand_base.c index 475647ea6948..6a6a0a36b3fd 100644 --- a/drivers/mtd/nand/raw/nand_base.c +++ b/drivers/mtd/nand/raw/nand_base.c @@ -274,6 +274,50 @@ static int check_offs_len(struct nand_chip *chip, loff_t ofs, uint64_t len) return ret; } +/** + * nand_extract_bits - Copy unaligned bits from one buffer to another one + * @dst: destination buffer + * @dst_off: bit offset at which the writing starts + * @src: source buffer + * @src_off: bit offset at which the reading starts + * @nbits: number of bits to copy from @src to @dst + * + * Copy bits from one memory region to another (overlap authorized). + */ +void nand_extract_bits(u8 *dst, unsigned int dst_off, const u8 *src, + unsigned int src_off, unsigned int nbits) +{ + unsigned int tmp, n; + + dst += dst_off / 8; + dst_off %= 8; + src += src_off / 8; + src_off %= 8; + + while (nbits) { + n = min3(8 - dst_off, 8 - src_off, nbits); + + tmp = (*src >> src_off) & GENMASK(n - 1, 0); + *dst &= ~GENMASK(n - 1 + dst_off, dst_off); + *dst |= tmp << dst_off; + + dst_off += n; + if (dst_off >= 8) { + dst++; + dst_off -= 8; + } + + src_off += n; + if (src_off >= 8) { + src++; + src_off -= 8; + } + + nbits -= n; + } +} +EXPORT_SYMBOL_GPL(nand_extract_bits); + /** * nand_select_target() - Select a NAND target (A.K.A. 
die) * @chip: NAND chip object diff --git a/include/linux/mtd/rawnand.h b/include/linux/mtd/rawnand.h index a3bac8824937..2804c13e5662 100644 --- a/include/linux/mtd/rawnand.h +++ b/include/linux/mtd/rawnand.h @@ -1414,6 +1414,10 @@ int nand_gpio_waitrdy(struct nand_chip *chip, struct gpio_desc *gpiod, void nand_select_target(struct nand_chip *chip, unsigned int cs); void nand_deselect_target(struct nand_chip *chip); +/* Bitops */ +void nand_extract_bits(u8 *dst, unsigned int dst_off, const u8 *src, + unsigned int src_off, unsigned int nbits); + /** * nand_get_data_buf() - Get the internal page buffer * @chip: NAND chip object -- cgit v1.2.3 From 5469fd64efcfbc2b9b2edb92c4f906912736d5b0 Mon Sep 17 00:00:00 2001 From: Thinh Nguyen Date: Tue, 5 May 2020 19:46:13 -0700 Subject: usb: gadget: Introduce usb_request->is_last To take advantage of DWC3 internal TRB prefetch and cache for performance, inform the controller of the last request with a given stream_id before switching to a different stream transfer. This allows the controller to maintain its transfer burst within the stream ID. Introduce the usb_request->is_last field to help inform the DWC3 controller of this. Signed-off-by: Thinh Nguyen Signed-off-by: Felipe Balbi --- include/linux/usb/gadget.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include/linux') diff --git a/include/linux/usb/gadget.h b/include/linux/usb/gadget.h index e959c09a97c9..281eabae7f33 100644 --- a/include/linux/usb/gadget.h +++ b/include/linux/usb/gadget.h @@ -42,6 +42,8 @@ struct usb_ep; * @num_mapped_sgs: number of SG entries mapped to DMA (internal) * @length: Length of that data * @stream_id: The stream id, when USB3.0 bulk streams are being used + * @is_last: Indicates if this is the last request of a stream_id before + * switching to a different stream (required for DWC3 controllers). * @no_interrupt: If true, hints that no completion irq is needed. * Helpful sometimes with deep request queues that are handled * directly by DMA controllers. @@ -104,6 +106,7 @@ struct usb_request { unsigned num_mapped_sgs; unsigned stream_id:16; + unsigned is_last:1; unsigned no_interrupt:1; unsigned zero:1; unsigned short_not_ok:1; -- cgit v1.2.3 From 3c73bc52195def14165c3a7d91bdbb33b51725f5 Mon Sep 17 00:00:00 2001 From: Peter Chen Date: Sun, 10 May 2020 13:30:41 +0800 Subject: usb: gadget: core: sync interrupt before unbind the udc The threaded interrupt handler may still be called after usb_gadget_disconnect is called; this causes the structures used by the interrupt handler, e.g. the usb_request, to be freed before their final use. This issue usually occurs when we remove the udc function during a transfer. Below is an example from a stress test of the android switch function: the EP0 request is freed by .unbind (configfs_composite_unbind -> composite_dev_cleanup), but the threaded handler accesses this request while handling the setup packet request. In fact, there is no protection between unbinding the udc and udc interrupt handling, so we have to ensure the interrupt handler cannot run or be scheduled during the .unbind flow. init: Sending signal 9 to service 'adbd' (pid 18077) process group...
libprocessgroup: Successfully killed process cgroup uid 0 pid 18077 in 0ms init: processing action (init.svc.adbd=stopped) from (/init.usb.configfs.rc:14) init: Received control message 'start' for 'adbd' from pid: 399 (/vendor/bin/hw/android.hardware.usb@1. init: starting service 'adbd'... read descriptors read strings Unable to handle kernel read from unreadable memory at virtual address 000000000000002a android_work: sent uevent USB_STATE=CONNECTED Mem abort info: ESR = 0x96000004 EC = 0x25: DABT (current EL), IL = 32 bits SET = 0, FnV = 0 EA = 0, S1PTW = 0 Data abort info: ISV = 0, ISS = 0x00000004 CM = 0, WnR = 0 user pgtable: 4k pages, 48-bit VAs, pgdp=00000000e97f1000 using random self ethernet address [000000000000002a] pgd=0000000000000000 Internal error: Oops: 96000004 [#1] PREEMPT SMP Modules linked in: CPU: 0 PID: 232 Comm: irq/68-5b110000 Not tainted 5.4.24-06075-g94a6b52b5815 #92 Hardware name: Freescale i.MX8QXP MEK (DT) pstate: 00400085 (nzcv daIf +PAN -UAO) using random host ethernet address pc : composite_setup+0x5c/0x1730 lr : android_setup+0xc0/0x148 sp : ffff80001349bba0 x29: ffff80001349bba0 x28: ffff00083a50da00 x27: ffff8000124e6000 x26: ffff800010177950 x25: 0000000000000040 x24: ffff000834e18010 x23: 0000000000000000 x22: 0000000000000000 x21: ffff00083a50da00 x20: ffff00082e75ec40 x19: 0000000000000000 x18: 0000000000000000 x17: 0000000000000000 x16: 0000000000000000 x15: 0000000000000000 x14: 0000000000000000 x13: 0000000000000000 x12: 0000000000000001 x11: ffff80001180fb58 x10: 0000000000000040 x9 : ffff8000120fc980 x8 : 0000000000000000 x7 : ffff00083f98df50 x6 : 0000000000000100 x5 : 00000307e8978431 x4 : ffff800011386788 x3 : 0000000000000000 x2 : ffff800012342000 x1 : 0000000000000000 x0 : ffff800010c6d3a0 Call trace: composite_setup+0x5c/0x1730 android_setup+0xc0/0x148 cdns3_ep0_delegate_req+0x64/0x90 cdns3_check_ep0_interrupt_proceed+0x384/0x738 cdns3_device_thread_irq_handler+0x124/0x6e0 cdns3_thread_irq+0x94/0xa0 irq_thread_fn+0x30/0xa0 irq_thread+0x150/0x248 kthread+0xfc/0x128 ret_from_fork+0x10/0x18 Code: 910e8000 f9400693 12001ed7 79400f79 (3940aa61) ---[ end trace c685db37f8773fba ]--- Kernel panic - not syncing: Fatal exception SMP: stopping secondary CPUs Kernel Offset: disabled CPU features: 0x0002,20002008 Memory Limit: none Rebooting in 5 seconds.. Reviewed-by: Jun Li Signed-off-by: Peter Chen Signed-off-by: Felipe Balbi --- drivers/usb/gadget/udc/core.c | 2 ++ include/linux/usb/gadget.h | 2 ++ 2 files changed, 4 insertions(+) (limited to 'include/linux') diff --git a/drivers/usb/gadget/udc/core.c b/drivers/usb/gadget/udc/core.c index 9b11046480fe..2e28dde8376f 100644 --- a/drivers/usb/gadget/udc/core.c +++ b/drivers/usb/gadget/udc/core.c @@ -1297,6 +1297,8 @@ static void usb_gadget_remove_driver(struct usb_udc *udc) kobject_uevent(&udc->dev.kobj, KOBJ_CHANGE); usb_gadget_disconnect(udc->gadget); + if (udc->gadget->irq) + synchronize_irq(udc->gadget->irq); udc->driver->unbind(udc->gadget); usb_gadget_udc_stop(udc); diff --git a/include/linux/usb/gadget.h b/include/linux/usb/gadget.h index 281eabae7f33..6a178177e4c9 100644 --- a/include/linux/usb/gadget.h +++ b/include/linux/usb/gadget.h @@ -376,6 +376,7 @@ struct usb_gadget_ops { * @connected: True if gadget is connected. * @lpm_capable: If the gadget max_speed is FULL or HIGH, this flag * indicates that it supports LPM as per the LPM ECN & errata. + * @irq: the interrupt number for device controller. 
* * Gadgets have a mostly-portable "gadget driver" implementing device functions, handling all usb configurations and interfaces. Gadget @@ -430,6 +431,7 @@ struct usb_gadget { unsigned deactivated:1; unsigned connected:1; unsigned lpm_capable:1; + int irq; }; #define work_to_gadget(w) (container_of((w), struct usb_gadget, work)) -- cgit v1.2.3 From 5d363120aa548ba52d58907a295eee25f8207ed2 Mon Sep 17 00:00:00 2001 From: Pawel Laszczak Date: Mon, 18 May 2020 12:08:45 +0200 Subject: usb: gadget: Fix issue with config_ep_by_speed function This patch adds a new config_ep_by_speed_and_alt function, which extends config_ep_by_speed with an alt parameter. This additional parameter makes it possible to find the proper usb_ss_ep_comp_descriptor. The problem appeared while testing the f_tcm (BOT/UAS) function driver. For SS, the f_tcm function uses a single array of headers for both the BOT and UAS alternate settings: static struct usb_descriptor_header *uasp_ss_function_desc[] = { (struct usb_descriptor_header *) &bot_intf_desc, (struct usb_descriptor_header *) &uasp_ss_bi_desc, (struct usb_descriptor_header *) &bot_bi_ep_comp_desc, (struct usb_descriptor_header *) &uasp_ss_bo_desc, (struct usb_descriptor_header *) &bot_bo_ep_comp_desc, (struct usb_descriptor_header *) &uasp_intf_desc, (struct usb_descriptor_header *) &uasp_ss_bi_desc, (struct usb_descriptor_header *) &uasp_bi_ep_comp_desc, (struct usb_descriptor_header *) &uasp_bi_pipe_desc, (struct usb_descriptor_header *) &uasp_ss_bo_desc, (struct usb_descriptor_header *) &uasp_bo_ep_comp_desc, (struct usb_descriptor_header *) &uasp_bo_pipe_desc, (struct usb_descriptor_header *) &uasp_ss_status_desc, (struct usb_descriptor_header *) &uasp_status_in_ep_comp_desc, (struct usb_descriptor_header *) &uasp_status_pipe_desc, (struct usb_descriptor_header *) &uasp_ss_cmd_desc, (struct usb_descriptor_header *) &uasp_cmd_comp_desc, (struct usb_descriptor_header *) &uasp_cmd_pipe_desc, NULL, }; The first 5 descriptors are associated with the BOT alternate setting, and the others are associated with UAS. While handling the UAS alternate setting, the f_tcm driver invokes config_ep_by_speed, and this function sets an incorrect companion endpoint descriptor in the usb_ep object. Instead of setting ep->comp_desc to uasp_bi_ep_comp_desc, in this case the function sets ep->comp_desc to uasp_ss_bi_desc. This is because it searches for the endpoint based on the endpoint address alone: for_each_ep_desc(speed_desc, d_spd) { chosen_desc = (struct usb_endpoint_descriptor *)*d_spd; if (chosen_desc->bEndpointAddress == _ep->address) goto ep_found; } As a result, it uses the descriptor from the BOT alternate setting instead of UAS. This in turn causes the controller driver, while enabling endpoints, to detect that the just-enabled endpoint is for BOT.
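For illustration, a minimal sketch (not part of this patch) of how a function driver's .set_alt() callback could use the new helper so that the companion descriptor for the selected alternate setting is picked; my_func_set_alt and my_ep are hypothetical names, with my_ep being a struct usb_ep pointer the function saved at bind time:

static int my_func_set_alt(struct usb_function *f, unsigned intf, unsigned alt)
{
	struct usb_composite_dev *cdev = f->config->cdev;
	int ret;

	/* choose descriptors for the current speed *and* this alt setting */
	ret = config_ep_by_speed_and_alt(cdev->gadget, f, my_ep, alt);
	if (ret)
		return ret;

	return usb_ep_enable(my_ep);
}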
Signed-off-by: Jayshri Pawar Signed-off-by: Pawel Laszczak Signed-off-by: Felipe Balbi --- drivers/usb/gadget/composite.c | 78 +++++++++++++++++++++++++++++++++--------- include/linux/usb/composite.h | 3 ++ 2 files changed, 64 insertions(+), 17 deletions(-) (limited to 'include/linux') diff --git a/drivers/usb/gadget/composite.c b/drivers/usb/gadget/composite.c index cb4950cf1cdc..5c1eb96a5c57 100644 --- a/drivers/usb/gadget/composite.c +++ b/drivers/usb/gadget/composite.c @@ -96,40 +96,43 @@ function_descriptors(struct usb_function *f, } /** - * next_ep_desc() - advance to the next EP descriptor + * next_desc() - advance to the next desc_type descriptor * @t: current pointer within descriptor array + * @desc_type: descriptor type * - * Return: next EP descriptor or NULL + * Return: next desc_type descriptor or NULL * - * Iterate over @t until either EP descriptor found or + * Iterate over @t until either desc_type descriptor found or * NULL (that indicates end of list) encountered */ static struct usb_descriptor_header** -next_ep_desc(struct usb_descriptor_header **t) +next_desc(struct usb_descriptor_header **t, u8 desc_type) { for (; *t; t++) { - if ((*t)->bDescriptorType == USB_DT_ENDPOINT) + if ((*t)->bDescriptorType == desc_type) return t; } return NULL; } /* - * for_each_ep_desc()- iterate over endpoint descriptors in the - * descriptors list - * @start: pointer within descriptor array. - * @ep_desc: endpoint descriptor to use as the loop cursor + * for_each_desc() - iterate over desc_type descriptors in the + * descriptors list + * @start: pointer within descriptor array. + * @iter_desc: desc_type descriptor to use as the loop cursor + * @desc_type: wanted descriptor type */ -#define for_each_ep_desc(start, ep_desc) \ - for (ep_desc = next_ep_desc(start); \ - ep_desc; ep_desc = next_ep_desc(ep_desc+1)) +#define for_each_desc(start, iter_desc, desc_type) \ + for (iter_desc = next_desc(start, desc_type); \ + iter_desc; iter_desc = next_desc(iter_desc + 1, desc_type)) /** - * config_ep_by_speed() - configures the given endpoint + * config_ep_by_speed_and_alt() - configures the given endpoint * according to gadget speed.
* @g: pointer to the gadget * @f: usb function * @_ep: the endpoint to configure + * @alt: alternate setting number * * Return: error code, 0 on success * @@ -142,11 +145,13 @@ next_ep_desc(struct usb_descriptor_header **t) * Note: the supplied function should hold all the descriptors * for supported speeds */ -int config_ep_by_speed(struct usb_gadget *g, - struct usb_function *f, - struct usb_ep *_ep) +int config_ep_by_speed_and_alt(struct usb_gadget *g, + struct usb_function *f, + struct usb_ep *_ep, + u8 alt) { struct usb_endpoint_descriptor *chosen_desc = NULL; + struct usb_interface_descriptor *int_desc = NULL; struct usb_descriptor_header **speed_desc = NULL; struct usb_ss_ep_comp_descriptor *comp_desc = NULL; @@ -182,8 +187,21 @@ int config_ep_by_speed(struct usb_gadget *g, default: speed_desc = f->fs_descriptors; } + + /* find correct alternate setting descriptor */ + for_each_desc(speed_desc, d_spd, USB_DT_INTERFACE) { + int_desc = (struct usb_interface_descriptor *)*d_spd; + + if (int_desc->bAlternateSetting == alt) { + speed_desc = d_spd; + goto intf_found; + } + } + return -EIO; + +intf_found: /* find descriptors */ - for_each_ep_desc(speed_desc, d_spd) { + for_each_desc(speed_desc, d_spd, USB_DT_ENDPOINT) { chosen_desc = (struct usb_endpoint_descriptor *)*d_spd; if (chosen_desc->bEndpointAddress == _ep->address) goto ep_found; @@ -237,6 +255,32 @@ ep_found: } return 0; } +EXPORT_SYMBOL_GPL(config_ep_by_speed_and_alt); + +/** + * config_ep_by_speed() - configures the given endpoint + * according to gadget speed. + * @g: pointer to the gadget + * @f: usb function + * @_ep: the endpoint to configure + * + * Return: error code, 0 on success + * + * This function chooses the right descriptors for a given + * endpoint according to gadget speed and saves it in the + * endpoint desc field. If the endpoint already has a descriptor + * assigned to it - overwrites it with currently corresponding + * descriptor. The endpoint maxpacket field is updated according + * to the chosen descriptor. + * Note: the supplied function should hold all the descriptors + * for supported speeds + */ +int config_ep_by_speed(struct usb_gadget *g, + struct usb_function *f, + struct usb_ep *_ep) +{ + return config_ep_by_speed_and_alt(g, f, _ep, 0); +} EXPORT_SYMBOL_GPL(config_ep_by_speed); /** diff --git a/include/linux/usb/composite.h b/include/linux/usb/composite.h index 8675e145ea8b..2040696d75b6 100644 --- a/include/linux/usb/composite.h +++ b/include/linux/usb/composite.h @@ -249,6 +249,9 @@ int usb_function_activate(struct usb_function *); int usb_interface_id(struct usb_configuration *, struct usb_function *); +int config_ep_by_speed_and_alt(struct usb_gadget *g, struct usb_function *f, + struct usb_ep *_ep, u8 alt); + int config_ep_by_speed(struct usb_gadget *g, struct usb_function *f, struct usb_ep *_ep); -- cgit v1.2.3 From eb012d125a2419786f5bcaaf8a901babc7b6e3d7 Mon Sep 17 00:00:00 2001 From: Tetsuo Handa Date: Mon, 25 May 2020 00:32:43 +0900 Subject: printk: Remove pr_cont_once() pr_cont_once() does not make sense: at the very least, emitting the module name via pr_fmt() into the middle of a line (e.g. after pr_info_once()) is clearly wrong. Let's remove the unused pr_cont_once().
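For illustration only (not from this patch), a hedged sketch of the problem: with a module prefix in pr_fmt(), a pr_cont_once() user would have re-emitted the prefix in the middle of the line, since the removed macro expanded to printk_once(KERN_CONT pr_fmt(fmt), ...); the output in the comments is what a hypothetical caller would have gotten:

#define pr_fmt(fmt) "mymod: " fmt

pr_info_once("probing device");   /* prints "mymod: probing device" */
pr_cont_once(" ... ok\n");        /* would have appended "mymod:  ... ok",
                                   * repeating the prefix mid-line */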
Link: https://lore.kernel.org/r/20200524153243.11690-1-penguin-kernel@I-love.SAKURA.ne.jp Cc: Joe Perches Signed-off-by: Tetsuo Handa Acked-by: Sergey Senozhatsky Signed-off-by: Petr Mladek --- include/linux/printk.h | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/printk.h b/include/linux/printk.h index e061635e0409..07f2d08f79ff 100644 --- a/include/linux/printk.h +++ b/include/linux/printk.h @@ -384,8 +384,7 @@ extern int kptr_restrict; printk_once(KERN_NOTICE pr_fmt(fmt), ##__VA_ARGS__) #define pr_info_once(fmt, ...) \ printk_once(KERN_INFO pr_fmt(fmt), ##__VA_ARGS__) -#define pr_cont_once(fmt, ...) \ - printk_once(KERN_CONT pr_fmt(fmt), ##__VA_ARGS__) +/* no pr_cont_once, don't do that... */ #if defined(DEBUG) #define pr_devel_once(fmt, ...) \ -- cgit v1.2.3 From 551cb86cbb7d85a3d16d10f6fb4fb954f13a7556 Mon Sep 17 00:00:00 2001 From: Andy Shevchenko Date: Tue, 19 May 2020 16:12:33 +0300 Subject: gpio: dwapb: Remove unneeded has_irq member in struct dwapb_port_property The has_irq member of struct dwapb_port_property is used in only one place, so make it a local test instead and remove it from the structure. The local test uses memchr_inv(), which is quite efficient compared with the original loop, and the possible small overhead can be neglected. Signed-off-by: Andy Shevchenko Tested-by: Serge Semin Acked-by: Lee Jones Acked-by: Serge Semin Link: https://lore.kernel.org/r/20200519131233.59032-4-andriy.shevchenko@linux.intel.com Signed-off-by: Linus Walleij --- drivers/gpio/gpio-dwapb.c | 14 +++++++------- drivers/mfd/intel_quark_i2c_gpio.c | 1 - include/linux/platform_data/gpio-dwapb.h | 1 - 3 files changed, 7 insertions(+), 9 deletions(-) (limited to 'include/linux') diff --git a/drivers/gpio/gpio-dwapb.c b/drivers/gpio/gpio-dwapb.c index d3765672c42b..1d8d55bd63aa 100644 --- a/drivers/gpio/gpio-dwapb.c +++ b/drivers/gpio/gpio-dwapb.c @@ -366,6 +366,11 @@ static void dwapb_configure_irqs(struct dwapb_gpio *gpio, irq_hw_number_t hwirq; int err, i; + if (memchr_inv(pp->irq, 0, sizeof(pp->irq)) == NULL) { + dev_warn(gpio->dev, "no IRQ for port%d\n", pp->idx); + return; + } + gpio->domain = irq_domain_create_linear(fwnode, ngpio, &irq_generic_chip_ops, gpio); if (!gpio->domain) @@ -501,7 +506,8 @@ static int dwapb_gpio_add_port(struct dwapb_gpio *gpio, if (pp->idx == 0) port->gc.set_config = dwapb_gpio_set_config; - if (pp->has_irq) + /* Only port A can provide interrupts in all configurations of the IP */ + if (pp->idx == 0) dwapb_configure_irqs(gpio, port, pp); err = gpiochip_add_data(&port->gc, port); @@ -550,13 +556,7 @@ static void dwapb_get_irq(struct device *dev, struct fwnode_handle *fwnode, irq = platform_get_irq_optional(to_platform_device(dev), j); if (irq > 0) pp->irq[j] = irq; - - if (pp->irq[j]) - pp->has_irq = true; } - - if (!pp->has_irq) - dev_warn(dev, "no irq for port%d\n", pp->idx); } static struct dwapb_platform_data *dwapb_gpio_get_pdata(struct device *dev) diff --git a/drivers/mfd/intel_quark_i2c_gpio.c b/drivers/mfd/intel_quark_i2c_gpio.c index 41326b48da55..84ca7902e1df 100644 --- a/drivers/mfd/intel_quark_i2c_gpio.c +++ b/drivers/mfd/intel_quark_i2c_gpio.c @@ -216,7 +216,6 @@ static int intel_quark_gpio_setup(struct pci_dev *pdev, struct mfd_cell *cell) pdata->properties->ngpio = INTEL_QUARK_MFD_NGPIO; pdata->properties->gpio_base = INTEL_QUARK_MFD_GPIO_BASE; pdata->properties->irq[0] = pdev->irq; - pdata->properties->has_irq = true; pdata->properties->irq_shared = true; cell->platform_data = pdata; diff
--git a/include/linux/platform_data/gpio-dwapb.h b/include/linux/platform_data/gpio-dwapb.h index 3c606c450d05..ff1be737bad6 100644 --- a/include/linux/platform_data/gpio-dwapb.h +++ b/include/linux/platform_data/gpio-dwapb.h @@ -12,7 +12,6 @@ struct dwapb_port_property { unsigned int ngpio; unsigned int gpio_base; int irq[32]; - bool has_irq; bool irq_shared; }; -- cgit v1.2.3 From 1072c12d7d58b5512b6c05c2268f57d32f1ab76c Mon Sep 17 00:00:00 2001 From: Omar Sandoval Date: Thu, 16 Apr 2020 14:46:11 -0700 Subject: block: add bio_for_each_bvec_all() An upcoming Btrfs fix needs to know the original size of a non-cloned bios. Rather than accessing the bvec table directly, let's add a bio_for_each_bvec_all() accessor. Reviewed-by: Johannes Thumshirn Signed-off-by: Omar Sandoval Signed-off-by: David Sterba --- .clang-format | 1 + Documentation/block/biovecs.rst | 2 ++ include/linux/bio.h | 8 ++++++++ 3 files changed, 11 insertions(+) (limited to 'include/linux') diff --git a/.clang-format b/.clang-format index e92e6dd1780d..a0a96088c74f 100644 --- a/.clang-format +++ b/.clang-format @@ -80,6 +80,7 @@ ForEachMacros: - 'ax25_uid_for_each' - '__bio_for_each_bvec' - 'bio_for_each_bvec' + - 'bio_for_each_bvec_all' - 'bio_for_each_integrity_vec' - '__bio_for_each_segment' - 'bio_for_each_segment' diff --git a/Documentation/block/biovecs.rst b/Documentation/block/biovecs.rst index ad303a2569d3..36771a131b56 100644 --- a/Documentation/block/biovecs.rst +++ b/Documentation/block/biovecs.rst @@ -129,6 +129,7 @@ Usage of helpers: :: bio_for_each_segment_all() + bio_for_each_bvec_all() bio_first_bvec_all() bio_first_page_all() bio_last_bvec_all() @@ -143,4 +144,5 @@ Usage of helpers: bio_vec' will contain a multi-page IO vector during the iteration:: bio_for_each_bvec() + bio_for_each_bvec_all() rq_for_each_bvec() diff --git a/include/linux/bio.h b/include/linux/bio.h index a0ee494a6329..8e23f51ccfa4 100644 --- a/include/linux/bio.h +++ b/include/linux/bio.h @@ -169,6 +169,14 @@ static inline void bio_advance_iter(struct bio *bio, struct bvec_iter *iter, #define bio_for_each_bvec(bvl, bio, iter) \ __bio_for_each_bvec(bvl, bio, iter, (bio)->bi_iter) +/* + * Iterate over all multi-page bvecs. Drivers shouldn't use this version for the + * same reasons as bio_for_each_segment_all(). + */ +#define bio_for_each_bvec_all(bvl, bio, i) \ + for (i = 0, bvl = bio_first_bvec_all(bio); \ + i < (bio)->bi_vcnt; i++, bvl++) \ + #define bio_iter_last(bvec, iter) ((iter).bi_size == (bvec).bv_len) static inline unsigned bio_segments(struct bio *bio) -- cgit v1.2.3 From d85dc2e116fdce776280224ed2bee4c78e5e5af2 Mon Sep 17 00:00:00 2001 From: Goldwyn Rodrigues Date: Fri, 30 Aug 2019 12:09:24 -0500 Subject: fs: export generic_file_buffered_read() Export generic_file_buffered_read() to be used to supplement incomplete direct reads. 
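As a hedged sketch of the intended use (a Btrfs-style fallback; my_fs_direct_read() is a hypothetical helper), a filesystem's ->read_iter() could look roughly like this:

static ssize_t my_fs_file_read_iter(struct kiocb *iocb, struct iov_iter *to)
{
	ssize_t ret = 0;

	if (iocb->ki_flags & IOCB_DIRECT) {
		ret = my_fs_direct_read(iocb, to);	/* hypothetical */
		if (ret < 0 || !iov_iter_count(to))
			return ret;
	}

	/* let the buffered path read whatever direct I/O did not cover */
	return generic_file_buffered_read(iocb, to, ret);
}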
Reviewed-by: Johannes Thumshirn Reviewed-by: Christoph Hellwig Signed-off-by: Goldwyn Rodrigues Reviewed-by: David Sterba Signed-off-by: David Sterba --- include/linux/fs.h | 2 ++ mm/filemap.c | 3 ++- 2 files changed, 4 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/fs.h b/include/linux/fs.h index 45cc10cdf6dd..366c533d30cd 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -3124,6 +3124,8 @@ extern int generic_file_rw_checks(struct file *file_in, struct file *file_out); extern int generic_copy_file_checks(struct file *file_in, loff_t pos_in, struct file *file_out, loff_t pos_out, size_t *count, unsigned int flags); +extern ssize_t generic_file_buffered_read(struct kiocb *iocb, + struct iov_iter *to, ssize_t already_read); extern ssize_t generic_file_read_iter(struct kiocb *, struct iov_iter *); extern ssize_t __generic_file_write_iter(struct kiocb *, struct iov_iter *); extern ssize_t generic_file_write_iter(struct kiocb *, struct iov_iter *); diff --git a/mm/filemap.c b/mm/filemap.c index 23a051a7ef0f..ad82672a9941 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -1991,7 +1991,7 @@ static void shrink_readahead_size_eio(struct file_ra_state *ra) * * total number of bytes copied, including those the were already @written * * negative error code if nothing was copied */ -static ssize_t generic_file_buffered_read(struct kiocb *iocb, +ssize_t generic_file_buffered_read(struct kiocb *iocb, struct iov_iter *iter, ssize_t written) { struct file *filp = iocb->ki_filp; @@ -2243,6 +2243,7 @@ out: file_accessed(filp); return written ? written : error; } +EXPORT_SYMBOL_GPL(generic_file_buffered_read); /** * generic_file_read_iter - generic filesystem read routine -- cgit v1.2.3 From 8cecd0ba854799cda72d03a470e7de9eed3ed6c4 Mon Sep 17 00:00:00 2001 From: Goldwyn Rodrigues Date: Tue, 14 May 2019 18:54:27 -0500 Subject: iomap: add a filesystem hook for direct I/O bio submission This helps filesystems to perform tasks on the bio while submitting for I/O. This could be post-write operations such as data CRC or data replication for fs-handled RAID. Reviewed-by: Johannes Thumshirn Reviewed-by: Nikolay Borisov Reviewed-by: Christoph Hellwig Reviewed-by: Darrick J. 
Wong Signed-off-by: Goldwyn Rodrigues Signed-off-by: David Sterba --- fs/iomap/direct-io.c | 15 ++++++++++----- include/linux/iomap.h | 2 ++ 2 files changed, 12 insertions(+), 5 deletions(-) (limited to 'include/linux') diff --git a/fs/iomap/direct-io.c b/fs/iomap/direct-io.c index 20dde5aadcdd..f88ba6e7f6af 100644 --- a/fs/iomap/direct-io.c +++ b/fs/iomap/direct-io.c @@ -59,7 +59,7 @@ int iomap_dio_iopoll(struct kiocb *kiocb, bool spin) EXPORT_SYMBOL_GPL(iomap_dio_iopoll); static void iomap_dio_submit_bio(struct iomap_dio *dio, struct iomap *iomap, - struct bio *bio) + struct bio *bio, loff_t pos) { atomic_inc(&dio->ref); @@ -67,7 +67,12 @@ static void iomap_dio_submit_bio(struct iomap_dio *dio, struct iomap *iomap, bio_set_polled(bio, dio->iocb); dio->submit.last_queue = bdev_get_queue(iomap->bdev); - dio->submit.cookie = submit_bio(bio); + if (dio->dops && dio->dops->submit_io) + dio->submit.cookie = dio->dops->submit_io( + file_inode(dio->iocb->ki_filp), + iomap, bio, pos); + else + dio->submit.cookie = submit_bio(bio); } static ssize_t iomap_dio_complete(struct iomap_dio *dio) @@ -191,7 +196,7 @@ iomap_dio_zero(struct iomap_dio *dio, struct iomap *iomap, loff_t pos, get_page(page); __bio_add_page(bio, page, len, 0); bio_set_op_attrs(bio, REQ_OP_WRITE, flags); - iomap_dio_submit_bio(dio, iomap, bio); + iomap_dio_submit_bio(dio, iomap, bio, pos); } static loff_t @@ -299,11 +304,11 @@ iomap_dio_bio_actor(struct inode *inode, loff_t pos, loff_t length, } dio->size += n; - pos += n; copied += n; nr_pages = iov_iter_npages(dio->submit.iter, BIO_MAX_PAGES); - iomap_dio_submit_bio(dio, iomap, bio); + iomap_dio_submit_bio(dio, iomap, bio, pos); + pos += n; } while (nr_pages); /* diff --git a/include/linux/iomap.h b/include/linux/iomap.h index 8b09463dae0d..5b4875344874 100644 --- a/include/linux/iomap.h +++ b/include/linux/iomap.h @@ -252,6 +252,8 @@ int iomap_writepages(struct address_space *mapping, struct iomap_dio_ops { int (*end_io)(struct kiocb *iocb, ssize_t size, int error, unsigned flags); + blk_qc_t (*submit_io)(struct inode *inode, struct iomap *iomap, + struct bio *bio, loff_t file_offset); }; ssize_t iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter, -- cgit v1.2.3 From cea0fad0f8b41481c0052a1f30d296d6d6dd0b2e Mon Sep 17 00:00:00 2001 From: Ricardo Ribalda Delgado Date: Thu, 30 Apr 2020 15:52:24 +0200 Subject: mailmap: change email for Ricardo Ribalda Modify emails to ribalda@kernel.org and unify my surname in all the files. 
Signed-off-by: Ricardo Ribalda Link: https://lore.kernel.org/r/20200430135224.362700-1-ricardo@ribalda.com Signed-off-by: Jonathan Corbet --- .mailmap | 4 +++- CREDITS | 6 ++++-- MAINTAINERS | 4 ++-- drivers/iio/dac/ad5761.c | 4 ++-- drivers/iio/dac/ti-dac7612.c | 4 ++-- drivers/leds/leds-pca963x.c | 2 +- drivers/media/i2c/imx214.c | 4 ++-- include/linux/platform_data/ad5761.h | 2 +- 8 files changed, 17 insertions(+), 13 deletions(-) (limited to 'include/linux') diff --git a/.mailmap b/.mailmap index 4600dcfed0d3..09ae92771030 100644 --- a/.mailmap +++ b/.mailmap @@ -235,7 +235,9 @@ Ralf Baechle Ralf Wildenhues Randy Dunlap Rémi Denis-Courmont -Ricardo Ribalda Delgado +Ricardo Ribalda +Ricardo Ribalda +Ricardo Ribalda Ricardo Ribalda Delgado Ross Zwisler Rudolf Marek Rui Saraiva diff --git a/CREDITS b/CREDITS index 032b5994f476..0787b5872906 100644 --- a/CREDITS +++ b/CREDITS @@ -3104,14 +3104,16 @@ W: http://www.qsl.net/dl1bke/ D: Generic Z8530 driver, AX.25 DAMA slave implementation D: Several AX.25 hacks -N: Ricardo Ribalda Delgado -E: ricardo.ribalda@gmail.com +N: Ricardo Ribalda +E: ribalda@kernel.org W: http://ribalda.com D: PLX USB338x driver D: PCA9634 driver D: Option GTM671WFS D: Fintek F81216A D: AD5761 iio driver +D: TI DAC7612 driver +D: Sony IMX214 driver D: Various kernel hacks S: Qtechnology A/S S: Valby Langgade 142 diff --git a/MAINTAINERS b/MAINTAINERS index 70076af67296..f967ba816191 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -15642,7 +15642,7 @@ F: drivers/ssb/ F: include/linux/ssb/ SONY IMX214 SENSOR DRIVER -M: Ricardo Ribalda +M: Ricardo Ribalda L: linux-media@vger.kernel.org S: Maintained T: git git://linuxtv.org/media_tree.git @@ -16629,7 +16629,7 @@ S: Maintained F: sound/soc/ti/ TEXAS INSTRUMENTS' DAC7612 DAC DRIVER -M: Ricardo Ribalda +M: Ricardo Ribalda L: linux-iio@vger.kernel.org S: Supported F: Documentation/devicetree/bindings/iio/dac/ti,dac7612.txt diff --git a/drivers/iio/dac/ad5761.c b/drivers/iio/dac/ad5761.c index 4fb42b743f0f..7468fbd11684 100644 --- a/drivers/iio/dac/ad5761.c +++ b/drivers/iio/dac/ad5761.c @@ -3,7 +3,7 @@ * AD5721, AD5721R, AD5761, AD5761R, Voltage Output Digital to Analog Converter * * Copyright 2016 Qtechnology A/S - * 2016 Ricardo Ribalda + * 2016 Ricardo Ribalda */ #include #include @@ -423,6 +423,6 @@ static struct spi_driver ad5761_driver = { }; module_spi_driver(ad5761_driver); -MODULE_AUTHOR("Ricardo Ribalda "); +MODULE_AUTHOR("Ricardo Ribalda "); MODULE_DESCRIPTION("Analog Devices AD5721, AD5721R, AD5761, AD5761R driver"); MODULE_LICENSE("GPL v2"); diff --git a/drivers/iio/dac/ti-dac7612.c b/drivers/iio/dac/ti-dac7612.c index c46805144dd4..de0c6573cd97 100644 --- a/drivers/iio/dac/ti-dac7612.c +++ b/drivers/iio/dac/ti-dac7612.c @@ -3,7 +3,7 @@ * DAC7612 Dual, 12-Bit Serial input Digital-to-Analog Converter * * Copyright 2019 Qtechnology A/S - * 2019 Ricardo Ribalda + * 2019 Ricardo Ribalda * * Licensed under the GPL-2. 
*/ @@ -179,6 +179,6 @@ static struct spi_driver dac7612_driver = { }; module_spi_driver(dac7612_driver); -MODULE_AUTHOR("Ricardo Ribalda "); +MODULE_AUTHOR("Ricardo Ribalda "); MODULE_DESCRIPTION("Texas Instruments DAC7612 DAC driver"); MODULE_LICENSE("GPL v2"); diff --git a/drivers/leds/leds-pca963x.c b/drivers/leds/leds-pca963x.c index 66cdc003b8f4..d288acbc99c7 100644 --- a/drivers/leds/leds-pca963x.c +++ b/drivers/leds/leds-pca963x.c @@ -4,7 +4,7 @@ * Copyright 2013 Qtechnology/AS * * Author: Peter Meerwald - * Author: Ricardo Ribalda + * Author: Ricardo Ribalda * * Based on leds-pca955x.c * diff --git a/drivers/media/i2c/imx214.c b/drivers/media/i2c/imx214.c index 4175d06ffd47..1ef5af9a8c8b 100644 --- a/drivers/media/i2c/imx214.c +++ b/drivers/media/i2c/imx214.c @@ -4,7 +4,7 @@ * * Copyright 2018 Qtechnology A/S * - * Ricardo Ribalda + * Ricardo Ribalda */ #include #include @@ -1120,5 +1120,5 @@ static struct i2c_driver imx214_i2c_driver = { module_i2c_driver(imx214_i2c_driver); MODULE_DESCRIPTION("Sony IMX214 Camera driver"); -MODULE_AUTHOR("Ricardo Ribalda "); +MODULE_AUTHOR("Ricardo Ribalda "); MODULE_LICENSE("GPL v2"); diff --git a/include/linux/platform_data/ad5761.h b/include/linux/platform_data/ad5761.h index 02bef5177ff5..69e261e2ca14 100644 --- a/include/linux/platform_data/ad5761.h +++ b/include/linux/platform_data/ad5761.h @@ -3,7 +3,7 @@ * AD5721, AD5721R, AD5761, AD5761R, Voltage Output Digital to Analog Converter * * Copyright 2016 Qtechnology A/S - * 2016 Ricardo Ribalda + * 2016 Ricardo Ribalda */ #ifndef __LINUX_PLATFORM_DATA_AD5761_H__ #define __LINUX_PLATFORM_DATA_AD5761_H__ -- cgit v1.2.3 From 11399346ac39a26ade2a90303d38ad318163c665 Mon Sep 17 00:00:00 2001 From: "Gustavo A. R. Silva" Date: Thu, 7 May 2020 14:00:33 -0500 Subject: mtd: Replace zero-length array with flexible-array The current codebase makes use of the zero-length array language extension to the C90 standard, but the preferred mechanism to declare variable-length types such as these ones is a flexible array member[1][2], introduced in C99: struct foo { int stuff; struct boo array[]; }; By making use of the mechanism above, we will get a compiler warning in case the flexible array does not occur last in the structure, which will help us prevent some kind of undefined behavior bugs from being inadvertently introduced[3] to the codebase from now on. Also, notice that, dynamic memory allocations won't be affected by this change: "Flexible array members have incomplete type, and so the sizeof operator may not be applied. As a quirk of the original implementation of zero-length arrays, sizeof evaluates to zero."[1] sizeof(flexible-array-member) triggers a warning because flexible array members have incomplete type[1]. There are some instances of code in which the sizeof operator is being incorrectly/erroneously applied to zero-length arrays and the result is zero. Such instances may be hiding some bugs. So, this work (flexible-array member conversions) will also help to get completely rid of those sorts of issues. This issue was found with the help of Coccinelle. [1] https://gcc.gnu.org/onlinedocs/gcc/Zero-Length.html [2] https://github.com/KSPP/linux/issues/21 [3] commit 76497732932f ("cxgb3/l2t: Fix undefined behaviour") Signed-off-by: Gustavo A. R. 
Silva Acked-by: Miquel Raynal Signed-off-by: Vignesh Raghavendra Link: https://lore.kernel.org/r/20200507190033.GA15215@embeddedor --- include/linux/mtd/cfi.h | 6 +++--- include/linux/mtd/qinfo.h | 2 +- 2 files changed, 4 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mtd/cfi.h b/include/linux/mtd/cfi.h index c98a21108688..fd1ecb821106 100644 --- a/include/linux/mtd/cfi.h +++ b/include/linux/mtd/cfi.h @@ -138,7 +138,7 @@ struct cfi_ident { uint16_t InterfaceDesc; uint16_t MaxBufWriteSize; uint8_t NumEraseRegions; - uint32_t EraseRegionInfo[0]; /* Not host ordered */ + uint32_t EraseRegionInfo[]; /* Not host ordered */ } __packed; /* Extended Query Structure for both PRI and ALT */ @@ -165,7 +165,7 @@ struct cfi_pri_intelext { uint16_t ProtRegAddr; uint8_t FactProtRegSize; uint8_t UserProtRegSize; - uint8_t extra[0]; + uint8_t extra[]; } __packed; struct cfi_intelext_otpinfo { @@ -286,7 +286,7 @@ struct cfi_private { map_word sector_erase_cmd; unsigned long chipshift; /* Because they're of the same type */ const char *im_name; /* inter_module name for cmdset_setup */ - struct flchip chips[0]; /* per-chip data structure for each chip */ + struct flchip chips[]; /* per-chip data structure for each chip */ }; uint32_t cfi_build_cmd_addr(uint32_t cmd_ofs, diff --git a/include/linux/mtd/qinfo.h b/include/linux/mtd/qinfo.h index df5b9fddea16..2e3f43788d48 100644 --- a/include/linux/mtd/qinfo.h +++ b/include/linux/mtd/qinfo.h @@ -24,7 +24,7 @@ struct lpddr_private { struct qinfo_chip *qinfo; int numchips; unsigned long chipshift; - struct flchip chips[0]; + struct flchip chips[]; }; /* qinfo_query_info structure contains request information for -- cgit v1.2.3 From 06081646450e46efcc1ca58c3b227bd60083cd3e Mon Sep 17 00:00:00 2001 From: Saravanan Sekar Date: Tue, 26 May 2020 11:06:42 +0200 Subject: mfd: mp2629: Add support for mps battery charger mp2629 is a highly integrated switching-mode battery charge management device for single-cell Li-ion or Li-polymer batteries. Add the MFD core, which enables chip access for an ADC driver providing battery readings and for a power-supply battery-charger driver. Signed-off-by: Saravanan Sekar Reviewed-by: Andy Shevchenko Reviewed-by: Jonathan Cameron Signed-off-by: Lee Jones --- drivers/mfd/Kconfig | 9 ++++++ drivers/mfd/Makefile | 2 ++ drivers/mfd/mp2629.c | 79 ++++++++++++++++++++++++++++++++++++++++ include/linux/mfd/mp2629.h | 17 ++++++++++ 4 files changed, 107 insertions(+) create mode 100644 drivers/mfd/mp2629.c create mode 100644 include/linux/mfd/mp2629.h (limited to 'include/linux') diff --git a/drivers/mfd/Kconfig b/drivers/mfd/Kconfig index 0a59249198d3..3526a03855be 100644 --- a/drivers/mfd/Kconfig +++ b/drivers/mfd/Kconfig @@ -434,6 +434,15 @@ config MFD_MC13XXX_I2C help Select this if your MC13xxx is connected via an I2C bus. +config MFD_MP2629 + tristate "Monolithic Power Systems MP2629 ADC and Battery charger" + depends on I2C + select REGMAP_I2C + help + Select this option to enable support for Monolithic Power Systems + battery charger. This provides ADC, thermal and battery charger power + management functions.
+ config MFD_MXS_LRADC tristate "Freescale i.MX23/i.MX28 LRADC" depends on ARCH_MXS || COMPILE_TEST diff --git a/drivers/mfd/Makefile b/drivers/mfd/Makefile index f935d10cbf0f..d6c210f96d02 100644 --- a/drivers/mfd/Makefile +++ b/drivers/mfd/Makefile @@ -170,6 +170,8 @@ obj-$(CONFIG_MFD_MAX8925) += max8925.o obj-$(CONFIG_MFD_MAX8997) += max8997.o max8997-irq.o obj-$(CONFIG_MFD_MAX8998) += max8998.o max8998-irq.o +obj-$(CONFIG_MFD_MP2629) += mp2629.o + pcf50633-objs := pcf50633-core.o pcf50633-irq.o obj-$(CONFIG_MFD_PCF50633) += pcf50633.o obj-$(CONFIG_PCF50633_ADC) += pcf50633-adc.o diff --git a/drivers/mfd/mp2629.c b/drivers/mfd/mp2629.c new file mode 100644 index 000000000000..16840ec5fd1c --- /dev/null +++ b/drivers/mfd/mp2629.c @@ -0,0 +1,79 @@ +// SPDX-License-Identifier: GPL-2.0+ +/* + * MP2629 parent driver for ADC and battery charger + * + * Copyright 2020 Monolithic Power Systems, Inc + * + * Author: Saravanan Sekar + */ + +#include +#include +#include +#include +#include +#include +#include +#include + +static const struct mfd_cell mp2629_cell[] = { + { + .name = "mp2629_adc", + .of_compatible = "mps,mp2629_adc", + }, + { + .name = "mp2629_charger", + .of_compatible = "mps,mp2629_charger", + } +}; + +static const struct regmap_config mp2629_regmap_config = { + .reg_bits = 8, + .val_bits = 8, + .max_register = 0x17, +}; + +static int mp2629_probe(struct i2c_client *client) +{ + struct mp2629_data *ddata; + int ret; + + ddata = devm_kzalloc(&client->dev, sizeof(*ddata), GFP_KERNEL); + if (!ddata) + return -ENOMEM; + + ddata->dev = &client->dev; + i2c_set_clientdata(client, ddata); + + ddata->regmap = devm_regmap_init_i2c(client, &mp2629_regmap_config); + if (IS_ERR(ddata->regmap)) { + dev_err(ddata->dev, "Failed to allocate regmap\n"); + return PTR_ERR(ddata->regmap); + } + + ret = devm_mfd_add_devices(ddata->dev, PLATFORM_DEVID_AUTO, mp2629_cell, + ARRAY_SIZE(mp2629_cell), NULL, 0, NULL); + if (ret) + dev_err(ddata->dev, "Failed to register sub-devices %d\n", ret); + + return ret; +} + +static const struct of_device_id mp2629_of_match[] = { + { .compatible = "mps,mp2629"}, + { } +}; +MODULE_DEVICE_TABLE(of, mp2629_of_match); + +static struct i2c_driver mp2629_driver = { + .driver = { + .name = "mp2629", + .of_match_table = mp2629_of_match, + }, + .probe_new = mp2629_probe, +}; +module_i2c_driver(mp2629_driver); + +MODULE_AUTHOR("Saravanan Sekar "); +MODULE_DESCRIPTION("MP2629 Battery charger parent driver"); +MODULE_LICENSE("GPL"); diff --git a/include/linux/mfd/mp2629.h b/include/linux/mfd/mp2629.h new file mode 100644 index 000000000000..baaeeaf82949 --- /dev/null +++ b/include/linux/mfd/mp2629.h @@ -0,0 +1,17 @@ +/* SPDX-License-Identifier: GPL-2.0+ */ +/* + * Copyright 2020 Monolithic Power Systems, Inc + */ + +#ifndef __MP2629_H__ +#define __MP2629_H__ + +#include +#include + +struct mp2629_data { + struct device *dev; + struct regmap *regmap; +}; + +#endif -- cgit v1.2.3 From 7abd9fb6468225f5c7f83149ce279cc1a912a68a Mon Sep 17 00:00:00 2001 From: Saravanan Sekar Date: Tue, 26 May 2020 11:06:43 +0200 Subject: iio: adc: mp2629: Add support for mp2629 ADC driver Add support for 8-bit resolution ADC readings for input power supply and battery charging measurement. Provides voltage, current readings to mp2629 power supply driver. 
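A minimal sketch, assuming the charger driver consumes the readings through the in-kernel IIO consumer API and the consumer channel names registered by the iio_map table in the diff below (error handling trimmed; dev stands for the mp2629_charger platform device):

struct iio_channel *chan;
int val, ret;

chan = devm_iio_channel_get(dev, "mp2629-batt-volt");
if (IS_ERR(chan))
	return PTR_ERR(chan);

ret = iio_read_channel_processed(chan, &val);	/* applies the channel scale */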
Signed-off-by: Saravanan Sekar Reviewed-by: Andy Shevchenko Reviewed-by: Jonathan Cameron Signed-off-by: Lee Jones --- drivers/iio/adc/Kconfig | 10 +++ drivers/iio/adc/Makefile | 1 + drivers/iio/adc/mp2629_adc.c | 208 +++++++++++++++++++++++++++++++++++++++++++ include/linux/mfd/mp2629.h | 9 ++ 4 files changed, 228 insertions(+) create mode 100644 drivers/iio/adc/mp2629_adc.c (limited to 'include/linux') diff --git a/drivers/iio/adc/Kconfig b/drivers/iio/adc/Kconfig index 12bb8b7ca1ff..a864ede98114 100644 --- a/drivers/iio/adc/Kconfig +++ b/drivers/iio/adc/Kconfig @@ -692,6 +692,16 @@ config MESON_SARADC To compile this driver as a module, choose M here: the module will be called meson_saradc. +config MP2629_ADC + tristate "Monolithic MP2629 ADC driver" + depends on MFD_MP2629 + help + Say yes to have support for battery charger IC MP2629 ADC device + accessed over I2C. + + This driver provides ADC conversion of system, input power supply + and battery voltage & current information. + config NAU7802 tristate "Nuvoton NAU7802 ADC driver" depends on I2C diff --git a/drivers/iio/adc/Makefile b/drivers/iio/adc/Makefile index 637807861112..78a1963a14f3 100644 --- a/drivers/iio/adc/Makefile +++ b/drivers/iio/adc/Makefile @@ -65,6 +65,7 @@ obj-$(CONFIG_MCP3911) += mcp3911.o obj-$(CONFIG_MEDIATEK_MT6577_AUXADC) += mt6577_auxadc.o obj-$(CONFIG_MEN_Z188_ADC) += men_z188_adc.o obj-$(CONFIG_MESON_SARADC) += meson_saradc.o +obj-$(CONFIG_MP2629_ADC) += mp2629_adc.o obj-$(CONFIG_MXS_LRADC_ADC) += mxs-lradc-adc.o obj-$(CONFIG_NAU7802) += nau7802.o obj-$(CONFIG_NPCM_ADC) += npcm_adc.o diff --git a/drivers/iio/adc/mp2629_adc.c b/drivers/iio/adc/mp2629_adc.c new file mode 100644 index 000000000000..331a9a728217 --- /dev/null +++ b/drivers/iio/adc/mp2629_adc.c @@ -0,0 +1,208 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * MP2629 Driver for ADC + * + * Copyright 2020 Monolithic Power Systems, Inc + * + * Author: Saravanan Sekar + */ + +#include +#include +#include +#include +#include +#include +#include +#include + +#define MP2629_REG_ADC_CTRL 0x03 +#define MP2629_REG_BATT_VOLT 0x0e +#define MP2629_REG_SYSTEM_VOLT 0x0f +#define MP2629_REG_INPUT_VOLT 0x11 +#define MP2629_REG_BATT_CURRENT 0x12 +#define MP2629_REG_INPUT_CURRENT 0x13 + +#define MP2629_ADC_START BIT(7) +#define MP2629_ADC_CONTINUOUS BIT(6) + +#define MP2629_MAP(_mp, _mpc) IIO_MAP(#_mp, "mp2629_charger", "mp2629-"_mpc) + +#define MP2629_ADC_CHAN(_ch, _type) { \ + .type = _type, \ + .indexed = 1, \ + .datasheet_name = #_ch, \ + .channel = MP2629_ ## _ch, \ + .address = MP2629_REG_ ## _ch, \ + .info_mask_separate = BIT(IIO_CHAN_INFO_RAW), \ + .info_mask_shared_by_type = BIT(IIO_CHAN_INFO_SCALE), \ +} + +struct mp2629_adc { + struct regmap *regmap; + struct device *dev; +}; + +static struct iio_chan_spec mp2629_channels[] = { + MP2629_ADC_CHAN(BATT_VOLT, IIO_VOLTAGE), + MP2629_ADC_CHAN(SYSTEM_VOLT, IIO_VOLTAGE), + MP2629_ADC_CHAN(INPUT_VOLT, IIO_VOLTAGE), + MP2629_ADC_CHAN(BATT_CURRENT, IIO_CURRENT), + MP2629_ADC_CHAN(INPUT_CURRENT, IIO_CURRENT) +}; + +static struct iio_map mp2629_adc_maps[] = { + MP2629_MAP(BATT_VOLT, "batt-volt"), + MP2629_MAP(SYSTEM_VOLT, "system-volt"), + MP2629_MAP(INPUT_VOLT, "input-volt"), + MP2629_MAP(BATT_CURRENT, "batt-current"), + MP2629_MAP(INPUT_CURRENT, "input-current") +}; + +static int mp2629_read_raw(struct iio_dev *indio_dev, + struct iio_chan_spec const *chan, + int *val, int *val2, long mask) +{ + struct mp2629_adc *info = iio_priv(indio_dev); + unsigned int rval; + int ret; + + switch (mask) { + case 
IIO_CHAN_INFO_RAW: + ret = regmap_read(info->regmap, chan->address, &rval); + if (ret) + return ret; + + if (chan->address == MP2629_INPUT_VOLT) + rval &= GENMASK(6, 0); + *val = rval; + return IIO_VAL_INT; + + case IIO_CHAN_INFO_SCALE: + switch (chan->channel) { + case MP2629_BATT_VOLT: + case MP2629_SYSTEM_VOLT: + *val = 20; + return IIO_VAL_INT; + + case MP2629_INPUT_VOLT: + *val = 60; + return IIO_VAL_INT; + + case MP2629_BATT_CURRENT: + *val = 175; + *val2 = 10; + return IIO_VAL_FRACTIONAL; + + case MP2629_INPUT_CURRENT: + *val = 133; + *val2 = 10; + return IIO_VAL_FRACTIONAL; + + default: + return -EINVAL; + } + + default: + return -EINVAL; + } +} + +static const struct iio_info mp2629_adc_info = { + .read_raw = &mp2629_read_raw, +}; + +static int mp2629_adc_probe(struct platform_device *pdev) +{ + struct device *dev = &pdev->dev; + struct mp2629_data *ddata = dev_get_drvdata(dev->parent); + struct mp2629_adc *info; + struct iio_dev *indio_dev; + int ret; + + indio_dev = devm_iio_device_alloc(dev, sizeof(*info)); + if (!indio_dev) + return -ENOMEM; + + info = iio_priv(indio_dev); + info->regmap = ddata->regmap; + info->dev = dev; + platform_set_drvdata(pdev, indio_dev); + + ret = regmap_update_bits(info->regmap, MP2629_REG_ADC_CTRL, + MP2629_ADC_START | MP2629_ADC_CONTINUOUS, + MP2629_ADC_START | MP2629_ADC_CONTINUOUS); + if (ret) { + dev_err(dev, "adc enable fail: %d\n", ret); + return ret; + } + + ret = iio_map_array_register(indio_dev, mp2629_adc_maps); + if (ret) { + dev_err(dev, "IIO maps register fail: %d\n", ret); + goto fail_disable; + } + + indio_dev->name = "mp2629-adc"; + indio_dev->dev.parent = dev; + indio_dev->channels = mp2629_channels; + indio_dev->num_channels = ARRAY_SIZE(mp2629_channels); + indio_dev->modes = INDIO_DIRECT_MODE; + indio_dev->info = &mp2629_adc_info; + + ret = iio_device_register(indio_dev); + if (ret) { + dev_err(dev, "IIO device register fail: %d\n", ret); + goto fail_map_unregister; + } + + return 0; + +fail_map_unregister: + iio_map_array_unregister(indio_dev); + +fail_disable: + regmap_update_bits(info->regmap, MP2629_REG_ADC_CTRL, + MP2629_ADC_CONTINUOUS, 0); + regmap_update_bits(info->regmap, MP2629_REG_ADC_CTRL, + MP2629_ADC_START, 0); + + return ret; +} + +static int mp2629_adc_remove(struct platform_device *pdev) +{ + struct iio_dev *indio_dev = platform_get_drvdata(pdev); + struct mp2629_adc *info = iio_priv(indio_dev); + + iio_device_unregister(indio_dev); + + iio_map_array_unregister(indio_dev); + + regmap_update_bits(info->regmap, MP2629_REG_ADC_CTRL, + MP2629_ADC_CONTINUOUS, 0); + regmap_update_bits(info->regmap, MP2629_REG_ADC_CTRL, + MP2629_ADC_START, 0); + + return 0; +} + +static const struct of_device_id mp2629_adc_of_match[] = { + { .compatible = "mps,mp2629_adc"}, + {} +}; +MODULE_DEVICE_TABLE(of, mp2629_adc_of_match); + +static struct platform_driver mp2629_adc_driver = { + .driver = { + .name = "mp2629_adc", + .of_match_table = mp2629_adc_of_match, + }, + .probe = mp2629_adc_probe, + .remove = mp2629_adc_remove, +}; +module_platform_driver(mp2629_adc_driver); + +MODULE_AUTHOR("Saravanan Sekar "); +MODULE_DESCRIPTION("MP2629 ADC driver"); +MODULE_LICENSE("GPL"); diff --git a/include/linux/mfd/mp2629.h b/include/linux/mfd/mp2629.h index baaeeaf82949..89b706900b57 100644 --- a/include/linux/mfd/mp2629.h +++ b/include/linux/mfd/mp2629.h @@ -14,4 +14,13 @@ struct mp2629_data { struct regmap *regmap; }; +enum mp2629_adc_chan { + MP2629_BATT_VOLT, + MP2629_SYSTEM_VOLT, + MP2629_INPUT_VOLT, + MP2629_BATT_CURRENT, + 
MP2629_INPUT_CURRENT, + MP2629_ADC_CHAN_END +}; + #endif -- cgit v1.2.3 From 9aa8759960e36291e7663288d58b47ee4927b6f5 Mon Sep 17 00:00:00 2001 From: Tomas Winkler Date: Sat, 16 May 2020 14:06:08 +0300 Subject: mfd: Constify properties in mfd_cell Constify 'struct property_entry *properties' in mfd_cell. It is always passed around as a pointer to a const struct. Signed-off-by: Tomas Winkler Signed-off-by: Lee Jones --- include/linux/mfd/core.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/mfd/core.h b/include/linux/mfd/core.h index d01d1299e49d..7e5ac3c00891 100644 --- a/include/linux/mfd/core.h +++ b/include/linux/mfd/core.h @@ -70,7 +70,7 @@ struct mfd_cell { size_t pdata_size; /* device properties passed to the sub devices drivers */ - struct property_entry *properties; + const struct property_entry *properties; /* * Device Tree compatible string -- cgit v1.2.3 From 97eda5dcc2cde5dcc778bef7a9344db3b6bf8ef5 Mon Sep 17 00:00:00 2001 From: Amelie Delaunay Date: Wed, 22 Apr 2020 11:08:33 +0200 Subject: mfd: stmfx: Disable IRQ in suspend to avoid spurious interrupt When the STMFX supply is stopped, a spurious interrupt can occur. To avoid that, disable the interrupt in suspend before disabling the regulator and re-enable it at the end of resume. Fixes: 06252ade9156 ("mfd: Add ST Multi-Function eXpander (STMFX) core driver") Signed-off-by: Amelie Delaunay Signed-off-by: Lee Jones --- drivers/mfd/stmfx.c | 6 ++++++ include/linux/mfd/stmfx.h | 1 + 2 files changed, 7 insertions(+) (limited to 'include/linux') diff --git a/drivers/mfd/stmfx.c b/drivers/mfd/stmfx.c index 1977fe95f876..711979afd90a 100644 --- a/drivers/mfd/stmfx.c +++ b/drivers/mfd/stmfx.c @@ -296,6 +296,8 @@ static int stmfx_irq_init(struct i2c_client *client) if (ret) goto irq_exit; + stmfx->irq = client->irq; + return 0; irq_exit: @@ -486,6 +488,8 @@ static int stmfx_suspend(struct device *dev) if (ret) return ret; + disable_irq(stmfx->irq); + if (stmfx->vdd) return regulator_disable(stmfx->vdd); @@ -529,6 +533,8 @@ static int stmfx_resume(struct device *dev) if (ret) return ret; + enable_irq(stmfx->irq); + return 0; } #endif diff --git a/include/linux/mfd/stmfx.h b/include/linux/mfd/stmfx.h index 3c67983678ec..744dce63946e 100644 --- a/include/linux/mfd/stmfx.h +++ b/include/linux/mfd/stmfx.h @@ -109,6 +109,7 @@ struct stmfx { struct device *dev; struct regmap *map; struct regulator *vdd; + int irq; struct irq_domain *irq_domain; struct mutex lock; /* IRQ bus lock */ u8 irq_src; -- cgit v1.2.3 From aaf2bc50df1f4bfc6857fc601fc7b21d5a18c6a1 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Thu, 21 May 2020 22:05:15 +0200 Subject: rcu: Abstract out rcu_irq_enter_check_tick() from rcu_nmi_enter() There will likely be exception handlers that can sleep, which rules out the usual approach of invoking rcu_nmi_enter() on entry and also rcu_nmi_exit() on all exit paths. However, the alternative approach of just not calling anything can prevent RCU from coaxing quiescent states from nohz_full CPUs that are looping in the kernel: RCU must instead IPI them explicitly. It would be better to enable the scheduler tick on such CPUs to interact with RCU in a lighter-weight manner, and this enabling is one of the things that rcu_nmi_enter() currently does. What is needed is something that helps RCU coax quiescent states while not preventing subsequent sleeps.
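For concreteness, a hedged sketch (not part of this patch) of the kind of sleepable exception-entry path that the new helper, introduced below, is meant to serve; the handler name is illustrative only:

noinstr void my_arch_handle_exception(struct pt_regs *regs)
{
	if (!user_mode(regs))
		rcu_irq_enter_check_tick();  /* nudge RCU without blocking later sleeps */

	/* ... handler body may sleep, unlike an NMI-style handler ... */
}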
This commit therefore splits out the nohz_full scheduler-tick enabling from the rest of the rcu_nmi_enter() logic into a new function named rcu_irq_enter_check_tick(). [ tglx: Renamed the function and made it a nop when context tracking is off ] [ mingo: Fixed a CONFIG_NO_HZ_FULL assumption, harmonized and fixed all the comment blocks and cleaned up rcu_nmi_enter()/exit() definitions. ] Suggested-by: Andy Lutomirski Signed-off-by: Paul E. McKenney Signed-off-by: Thomas Gleixner Signed-off-by: Ingo Molnar Link: https://lore.kernel.org/r/20200521202116.996113173@linutronix.de --- include/linux/hardirq.h | 29 +++++++++-------- kernel/rcu/tree.c | 82 +++++++++++++++++++++++++++++++++++++------------ 2 files changed, 79 insertions(+), 32 deletions(-) (limited to 'include/linux') diff --git a/include/linux/hardirq.h b/include/linux/hardirq.h index 621556efe45f..e07cf853aa16 100644 --- a/include/linux/hardirq.h +++ b/include/linux/hardirq.h @@ -2,31 +2,28 @@ #ifndef LINUX_HARDIRQ_H #define LINUX_HARDIRQ_H +#include #include #include #include #include #include - extern void synchronize_irq(unsigned int irq); extern bool synchronize_hardirq(unsigned int irq); -#if defined(CONFIG_TINY_RCU) - -static inline void rcu_nmi_enter(void) -{ -} +#ifdef CONFIG_NO_HZ_FULL +void __rcu_irq_enter_check_tick(void); +#else +static inline void __rcu_irq_enter_check_tick(void) { } +#endif -static inline void rcu_nmi_exit(void) +static __always_inline void rcu_irq_enter_check_tick(void) { + if (context_tracking_enabled()) + __rcu_irq_enter_check_tick(); } -#else -extern void rcu_nmi_enter(void); -extern void rcu_nmi_exit(void); -#endif - /* * It is safe to do non-atomic ops on ->hardirq_context, * because NMI handlers may not preempt and the ops are @@ -65,6 +62,14 @@ extern void irq_exit(void); #define arch_nmi_exit() do { } while (0) #endif +#ifdef CONFIG_TINY_RCU +static inline void rcu_nmi_enter(void) { } +static inline void rcu_nmi_exit(void) { } +#else +extern void rcu_nmi_enter(void); +extern void rcu_nmi_exit(void); +#endif + /* * NMI vs Tracing * -------------- diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 90c8be22d57a..b7f8c494d1d1 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -848,6 +848,67 @@ void noinstr rcu_user_exit(void) { rcu_eqs_exit(1); } + +/** + * __rcu_irq_enter_check_tick - Enable scheduler tick on CPU if RCU needs it. + * + * The scheduler tick is not normally enabled when CPUs enter the kernel + * from nohz_full userspace execution. After all, nohz_full userspace + * execution is an RCU quiescent state and the time executing in the kernel + * is quite short. Except of course when it isn't. And it is not hard to + * cause a large system to spend tens of seconds or even minutes looping + * in the kernel, which can cause a number of problems, include RCU CPU + * stall warnings. + * + * Therefore, if a nohz_full CPU fails to report a quiescent state + * in a timely manner, the RCU grace-period kthread sets that CPU's + * ->rcu_urgent_qs flag with the expectation that the next interrupt or + * exception will invoke this function, which will turn on the scheduler + * tick, which will enable RCU to detect that CPU's quiescent states, + * for example, due to cond_resched() calls in CONFIG_PREEMPT=n kernels. + * The tick will be disabled once a quiescent state is reported for + * this CPU. + * + * Of course, in carefully tuned systems, there might never be an + * interrupt or exception. In that case, the RCU grace-period kthread + * will eventually cause one to happen. 
However, in less carefully + * controlled environments, this function allows RCU to get what it + * needs without creating otherwise useless interruptions. + */ +void __rcu_irq_enter_check_tick(void) +{ + struct rcu_data *rdp = this_cpu_ptr(&rcu_data); + + // Enabling the tick is unsafe in NMI handlers. + if (WARN_ON_ONCE(in_nmi())) + return; + + RCU_LOCKDEP_WARN(rcu_dynticks_curr_cpu_in_eqs(), + "Illegal rcu_irq_enter_check_tick() from extended quiescent state"); + + if (!tick_nohz_full_cpu(rdp->cpu) || + !READ_ONCE(rdp->rcu_urgent_qs) || + READ_ONCE(rdp->rcu_forced_tick)) { + // RCU doesn't need nohz_full help from this CPU, or it is + // already getting that help. + return; + } + + // We get here only when not in an extended quiescent state and + // from interrupts (as opposed to NMIs). Therefore, (1) RCU is + // already watching and (2) The fact that we are in an interrupt + // handler and that the rcu_node lock is an irq-disabled lock + // prevents self-deadlock. So we can safely recheck under the lock. + // Note that the nohz_full state currently cannot change. + raw_spin_lock_rcu_node(rdp->mynode); + if (rdp->rcu_urgent_qs && !rdp->rcu_forced_tick) { + // A nohz_full CPU is in the kernel and RCU needs a + // quiescent state. Turn on the tick! + WRITE_ONCE(rdp->rcu_forced_tick, true); + tick_dep_set_cpu(rdp->cpu, TICK_DEP_BIT_RCU); + } + raw_spin_unlock_rcu_node(rdp->mynode); +} #endif /* CONFIG_NO_HZ_FULL */ /** @@ -894,26 +955,7 @@ noinstr void rcu_nmi_enter(void) incby = 1; } else if (!in_nmi()) { instrumentation_begin(); - if (tick_nohz_full_cpu(rdp->cpu) && - rdp->dynticks_nmi_nesting == DYNTICK_IRQ_NONIDLE && - READ_ONCE(rdp->rcu_urgent_qs) && - !READ_ONCE(rdp->rcu_forced_tick)) { - // We get here only if we had already exited the - // extended quiescent state and this was an - // interrupt (not an NMI). Therefore, (1) RCU is - // already watching and (2) The fact that we are in - // an interrupt handler and that the rcu_node lock - // is an irq-disabled lock prevents self-deadlock. - // So we can safely recheck under the lock. - raw_spin_lock_rcu_node(rdp->mynode); - if (rdp->rcu_urgent_qs && !rdp->rcu_forced_tick) { - // A nohz_full CPU is in the kernel and RCU - // needs a quiescent state. Turn on the tick! - WRITE_ONCE(rdp->rcu_forced_tick, true); - tick_dep_set_cpu(rdp->cpu, TICK_DEP_BIT_RCU); - } - raw_spin_unlock_rcu_node(rdp->mynode); - } + rcu_irq_enter_check_tick(); instrumentation_end(); } instrumentation_begin(); -- cgit v1.2.3 From 07325d4a90d2d84de45cc07b134fd0f023dbb971 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Thu, 21 May 2020 22:05:16 +0200 Subject: rcu: Provide rcu_irq_exit_check_preempt() Provide a debug check which can be invoked from exception return to kernel mode before an attempt is made to schedule. Warn if RCU is not ready for this. Signed-off-by: Thomas Gleixner Signed-off-by: Ingo Molnar Cc: Paul E. 
McKenney Link: https://lore.kernel.org/r/20200521202117.089709607@linutronix.de --- include/linux/rcutiny.h | 1 + include/linux/rcutree.h | 6 ++++++ kernel/rcu/tree.c | 18 ++++++++++++++++++ 3 files changed, 25 insertions(+) (limited to 'include/linux') diff --git a/include/linux/rcutiny.h b/include/linux/rcutiny.h index c869fb20cc51..8512caeb7682 100644 --- a/include/linux/rcutiny.h +++ b/include/linux/rcutiny.h @@ -72,6 +72,7 @@ static inline void rcu_irq_exit_irqson(void) { } static inline void rcu_irq_enter_irqson(void) { } static inline void rcu_irq_exit(void) { } static inline void rcu_irq_exit_preempt(void) { } +static inline void rcu_irq_exit_check_preempt(void) { } static inline void exit_rcu(void) { } static inline bool rcu_preempt_need_deferred_qs(struct task_struct *t) { diff --git a/include/linux/rcutree.h b/include/linux/rcutree.h index 9366fa4d0717..d5cc9d675987 100644 --- a/include/linux/rcutree.h +++ b/include/linux/rcutree.h @@ -51,6 +51,12 @@ void rcu_irq_exit_preempt(void); void rcu_irq_enter_irqson(void); void rcu_irq_exit_irqson(void); +#ifdef CONFIG_PROVE_RCU +void rcu_irq_exit_check_preempt(void); +#else +static inline void rcu_irq_exit_check_preempt(void) { } +#endif + void exit_rcu(void); void rcu_scheduler_starting(void); diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index b7f8c494d1d1..d8e9dbbefcfa 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -765,6 +765,24 @@ void rcu_irq_exit_preempt(void) "RCU in extended quiescent state!"); } +#ifdef CONFIG_PROVE_RCU +/** + * rcu_irq_exit_check_preempt - Validate that scheduling is possible + */ +void rcu_irq_exit_check_preempt(void) +{ + lockdep_assert_irqs_disabled(); + + RCU_LOCKDEP_WARN(__this_cpu_read(rcu_data.dynticks_nesting) <= 0, + "RCU dynticks_nesting counter underflow/zero!"); + RCU_LOCKDEP_WARN(__this_cpu_read(rcu_data.dynticks_nmi_nesting) != + DYNTICK_IRQ_NONIDLE, + "Bad RCU dynticks_nmi_nesting counter\n"); + RCU_LOCKDEP_WARN(rcu_dynticks_curr_cpu_in_eqs(), + "RCU in extended quiescent state!"); +} +#endif /* #ifdef CONFIG_PROVE_RCU */ + /* * Wrapper for rcu_irq_exit() where interrupts are enabled. * -- cgit v1.2.3 From 46d26819a5056f4831649c5887ad5c71a16d86f7 Mon Sep 17 00:00:00 2001 From: Greg Kroah-Hartman Date: Sun, 24 May 2020 17:30:40 +0200 Subject: software node: implement software_node_unregister() Sometimes it is better to unregister individual nodes instead of trying to do them all at once with software_node_unregister_nodes(), so create software_node_unregister() so that you can unregister them one at a time. This is especially important when creating nodes in a hierarchy, with parent -> children representations. Children always need to be removed before a parent is, as the swnode logic assumes this is going to be the case. Fix up the lib/test_printf.c fwnode_pointer() test to use this new function, as it had the problem of tearing things down in the backwards order. Fixes: f1ce39df508d ("lib/test_printf: Add tests for %pfw printk modifier") Cc: stable Cc: Andy Shevchenko Cc: Brendan Higgins Cc: Dmitry Torokhov Cc: Petr Mladek Cc: Rafael J.
Wysocki Cc: Rasmus Villemoes Cc: Sakari Ailus Cc: Sergey Senozhatsky Cc: Steven Rostedt Reported-by: Naresh Kamboju Reported-by: kernel test robot Reported-by: Randy Dunlap Tested-by: Petr Mladek Tested-by: Randy Dunlap Tested-by: Guenter Roeck Reviewed-by: Heikki Krogerus Acked-by: Randy Dunlap Link: https://lore.kernel.org/r/20200524153041.2361-1-gregkh@linuxfoundation.org Signed-off-by: Greg Kroah-Hartman --- drivers/base/swnode.c | 27 +++++++++++++++++++++------ include/linux/property.h | 1 + lib/test_printf.c | 4 +++- 3 files changed, 25 insertions(+), 7 deletions(-) (limited to 'include/linux') diff --git a/drivers/base/swnode.c b/drivers/base/swnode.c index de8d3543e8fe..770b1f47a625 100644 --- a/drivers/base/swnode.c +++ b/drivers/base/swnode.c @@ -712,17 +712,18 @@ EXPORT_SYMBOL_GPL(software_node_register_nodes); * @nodes: Zero terminated array of software nodes to be unregistered * * Unregister multiple software nodes at once. + * + * NOTE: Be careful using this call if the nodes had parent pointers set up in + * them before registering. If so, it is wiser to remove the nodes + * individually, in the correct order (child before parent) instead of relying + * on the sequential order of the list of nodes in the array. */ void software_node_unregister_nodes(const struct software_node *nodes) { - struct swnode *swnode; int i; - for (i = 0; nodes[i].name; i++) { - swnode = software_node_to_swnode(&nodes[i]); - if (swnode) - fwnode_remove_software_node(&swnode->fwnode); - } + for (i = 0; nodes[i].name; i++) + software_node_unregister(&nodes[i]); } EXPORT_SYMBOL_GPL(software_node_unregister_nodes); @@ -741,6 +742,20 @@ int software_node_register(const struct software_node *node) } EXPORT_SYMBOL_GPL(software_node_register); +/** + * software_node_unregister - Unregister static software node + * @node: The software node to be unregistered + */ +void software_node_unregister(const struct software_node *node) +{ + struct swnode *swnode; + + swnode = software_node_to_swnode(node); + if (swnode) + fwnode_remove_software_node(&swnode->fwnode); +} +EXPORT_SYMBOL_GPL(software_node_unregister); + struct fwnode_handle * fwnode_create_software_node(const struct property_entry *properties, const struct fwnode_handle *parent) diff --git a/include/linux/property.h b/include/linux/property.h index d86de017c689..0d4099b4ce1f 100644 --- a/include/linux/property.h +++ b/include/linux/property.h @@ -441,6 +441,7 @@ int software_node_register_nodes(const struct software_node *nodes); void software_node_unregister_nodes(const struct software_node *nodes); int software_node_register(const struct software_node *node); +void software_node_unregister(const struct software_node *node); int software_node_notify(struct device *dev, unsigned long action); diff --git a/lib/test_printf.c b/lib/test_printf.c index 6b1622f4d7c2..fc63b8959d42 100644 --- a/lib/test_printf.c +++ b/lib/test_printf.c @@ -637,7 +637,9 @@ static void __init fwnode_pointer(void) test(second_name, "%pfwP", software_node_fwnode(&softnodes[1])); test(third_name, "%pfwP", software_node_fwnode(&softnodes[2])); - software_node_unregister_nodes(softnodes); + software_node_unregister(&softnodes[2]); + software_node_unregister(&softnodes[1]); + software_node_unregister(&softnodes[0]); } static void __init -- cgit v1.2.3 From ff937b916eb6316fe4644564a572ed3b5867bc1f Mon Sep 17 00:00:00 2001 From: Yuval Basson Date: Tue, 26 May 2020 09:41:20 +0300 Subject: qed: Add EDPM mode type for user-fw compatibility In older FW versions the completion flag was 
treated as the ack flag in edpm messages. Expose the FW option of setting which mode the QP is in by adding a flag to the qedr <-> qed API. Flag is added for backward compatibility with libqedr. This flag will be set by qedr after determining whether the libqedr is using the updated version. Fixes: f10939403352 ("qed: Add support for QP verbs") Signed-off-by: Yuval Basson Signed-off-by: Michal Kalderon Signed-off-by: David S. Miller --- drivers/net/ethernet/qlogic/qed/qed_rdma.c | 1 + drivers/net/ethernet/qlogic/qed/qed_rdma.h | 1 + drivers/net/ethernet/qlogic/qed/qed_roce.c | 3 +++ include/linux/qed/qed_rdma_if.h | 3 +++ 4 files changed, 8 insertions(+) (limited to 'include/linux') diff --git a/drivers/net/ethernet/qlogic/qed/qed_rdma.c b/drivers/net/ethernet/qlogic/qed/qed_rdma.c index 50985871cd3d..98455f698f53 100644 --- a/drivers/net/ethernet/qlogic/qed/qed_rdma.c +++ b/drivers/net/ethernet/qlogic/qed/qed_rdma.c @@ -1378,6 +1378,7 @@ qed_rdma_create_qp(void *rdma_cxt, rc = qed_iwarp_create_qp(p_hwfn, qp, out_params); qp->qpid = qp->icid; } else { + qp->edpm_mode = GET_FIELD(in_params->flags, QED_ROCE_EDPM_MODE); rc = qed_roce_alloc_cid(p_hwfn, &qp->icid); qp->qpid = ((0xFF << 16) | qp->icid); } diff --git a/drivers/net/ethernet/qlogic/qed/qed_rdma.h b/drivers/net/ethernet/qlogic/qed/qed_rdma.h index 5a7ebc764bb6..3898cae61e7a 100644 --- a/drivers/net/ethernet/qlogic/qed/qed_rdma.h +++ b/drivers/net/ethernet/qlogic/qed/qed_rdma.h @@ -183,6 +183,7 @@ struct qed_rdma_qp { void *shared_queue; dma_addr_t shared_queue_phys_addr; struct qed_iwarp_ep *ep; + u8 edpm_mode; }; static inline bool qed_rdma_is_xrc_qp(struct qed_rdma_qp *qp) diff --git a/drivers/net/ethernet/qlogic/qed/qed_roce.c b/drivers/net/ethernet/qlogic/qed/qed_roce.c index 46a4d09eacef..4566815f7b87 100644 --- a/drivers/net/ethernet/qlogic/qed/qed_roce.c +++ b/drivers/net/ethernet/qlogic/qed/qed_roce.c @@ -459,6 +459,9 @@ static int qed_roce_sp_create_requester(struct qed_hwfn *p_hwfn, ROCE_CREATE_QP_REQ_RAMROD_DATA_XRC_FLAG, qed_rdma_is_xrc_qp(qp)); + SET_FIELD(p_ramrod->flags2, + ROCE_CREATE_QP_REQ_RAMROD_DATA_EDPM_MODE, qp->edpm_mode); + p_ramrod->max_ord = qp->max_rd_atomic_req; p_ramrod->traffic_class = qp->traffic_class_tos; p_ramrod->hop_limit = qp->hop_limit_ttl; diff --git a/include/linux/qed/qed_rdma_if.h b/include/linux/qed/qed_rdma_if.h index f93edd5750a5..584077565f12 100644 --- a/include/linux/qed/qed_rdma_if.h +++ b/include/linux/qed/qed_rdma_if.h @@ -335,6 +335,9 @@ struct qed_rdma_create_qp_in_params { u16 xrcd_id; u8 stats_queue; enum qed_rdma_qp_type qp_type; + u8 flags; +#define QED_ROCE_EDPM_MODE_MASK 0x1 +#define QED_ROCE_EDPM_MODE_SHIFT 0 }; struct qed_rdma_create_qp_out_params { -- cgit v1.2.3 From 90ce665c6a40dc1be771bf5f86e624c0acf3a76f Mon Sep 17 00:00:00 2001 From: Russell King Date: Tue, 26 May 2020 16:29:36 +0100 Subject: net: mdiobus: add clause 45 mdiobus accessors There is a recurring pattern throughout some of the PHY code converting a devad and regnum to our packed clause 45 representation. Rather than having this scattered around the code, let's put a common translation function in mdio.h, and provide some register accessors. Convert the phylib core, phylink, bcm87xx and cortina to use these. Signed-off-by: Russell King Reviewed-by: Andrew Lunn Reviewed-by: Florian Fainelli Signed-off-by: David S. 
Miller --- drivers/net/phy/bcm87xx.c | 2 +- drivers/net/phy/cortina.c | 3 +-- drivers/net/phy/phy-core.c | 11 ++++------- drivers/net/phy/phy.c | 4 ++-- drivers/net/phy/phy_device.c | 20 ++++++++------------ drivers/net/phy/phylink.c | 11 +++++------ include/linux/mdio.h | 31 +++++++++++++++++++++++++++++++ include/linux/phy.h | 6 ------ 8 files changed, 52 insertions(+), 36 deletions(-) (limited to 'include/linux') diff --git a/drivers/net/phy/bcm87xx.c b/drivers/net/phy/bcm87xx.c index f6dce6850850..df360e1c5069 100644 --- a/drivers/net/phy/bcm87xx.c +++ b/drivers/net/phy/bcm87xx.c @@ -55,7 +55,7 @@ static int bcm87xx_of_reg_init(struct phy_device *phydev) u16 mask = be32_to_cpup(paddr++); u16 val_bits = be32_to_cpup(paddr++); int val; - u32 regnum = MII_ADDR_C45 | (devid << 16) | reg; + u32 regnum = mdiobus_c45_addr(devid, reg); val = 0; if (mask) { val = phy_read(phydev, regnum); diff --git a/drivers/net/phy/cortina.c b/drivers/net/phy/cortina.c index aac51362c0fe..40514a94e6ff 100644 --- a/drivers/net/phy/cortina.c +++ b/drivers/net/phy/cortina.c @@ -17,8 +17,7 @@ static int cortina_read_reg(struct phy_device *phydev, u16 regnum) { - return mdiobus_read(phydev->mdio.bus, phydev->mdio.addr, - MII_ADDR_C45 | regnum); + return mdiobus_c45_read(phydev->mdio.bus, phydev->mdio.addr, 0, regnum); } static int cortina_read_status(struct phy_device *phydev) diff --git a/drivers/net/phy/phy-core.c b/drivers/net/phy/phy-core.c index 66b8c61ca74c..46bd68e9ecfa 100644 --- a/drivers/net/phy/phy-core.c +++ b/drivers/net/phy/phy-core.c @@ -428,9 +428,8 @@ int __phy_read_mmd(struct phy_device *phydev, int devad, u32 regnum) if (phydev->drv && phydev->drv->read_mmd) { val = phydev->drv->read_mmd(phydev, devad, regnum); } else if (phydev->is_c45) { - u32 addr = MII_ADDR_C45 | (devad << 16) | (regnum & 0xffff); - - val = __mdiobus_read(phydev->mdio.bus, phydev->mdio.addr, addr); + val = __mdiobus_c45_read(phydev->mdio.bus, phydev->mdio.addr, + devad, regnum); } else { struct mii_bus *bus = phydev->mdio.bus; int phy_addr = phydev->mdio.addr; @@ -485,10 +484,8 @@ int __phy_write_mmd(struct phy_device *phydev, int devad, u32 regnum, u16 val) if (phydev->drv && phydev->drv->write_mmd) { ret = phydev->drv->write_mmd(phydev, devad, regnum, val); } else if (phydev->is_c45) { - u32 addr = MII_ADDR_C45 | (devad << 16) | (regnum & 0xffff); - - ret = __mdiobus_write(phydev->mdio.bus, phydev->mdio.addr, - addr, val); + ret = __mdiobus_c45_write(phydev->mdio.bus, phydev->mdio.addr, + devad, regnum, val); } else { struct mii_bus *bus = phydev->mdio.bus; int phy_addr = phydev->mdio.addr; diff --git a/drivers/net/phy/phy.c b/drivers/net/phy/phy.c index d584701187db..27da0c94818f 100644 --- a/drivers/net/phy/phy.c +++ b/drivers/net/phy/phy.c @@ -361,7 +361,7 @@ int phy_mii_ioctl(struct phy_device *phydev, struct ifreq *ifr, int cmd) if (mdio_phy_id_is_c45(mii_data->phy_id)) { prtad = mdio_phy_id_prtad(mii_data->phy_id); devad = mdio_phy_id_devad(mii_data->phy_id); - devad = MII_ADDR_C45 | devad << 16 | mii_data->reg_num; + devad = mdiobus_c45_addr(devad, mii_data->reg_num); } else { prtad = mii_data->phy_id; devad = mii_data->reg_num; @@ -374,7 +374,7 @@ int phy_mii_ioctl(struct phy_device *phydev, struct ifreq *ifr, int cmd) if (mdio_phy_id_is_c45(mii_data->phy_id)) { prtad = mdio_phy_id_prtad(mii_data->phy_id); devad = mdio_phy_id_devad(mii_data->phy_id); - devad = MII_ADDR_C45 | devad << 16 | mii_data->reg_num; + devad = mdiobus_c45_addr(devad, mii_data->reg_num); } else { prtad = mii_data->phy_id; devad = 
mii_data->reg_num; diff --git a/drivers/net/phy/phy_device.c b/drivers/net/phy/phy_device.c index 6b30d205642f..04946de74fa0 100644 --- a/drivers/net/phy/phy_device.c +++ b/drivers/net/phy/phy_device.c @@ -675,16 +675,14 @@ EXPORT_SYMBOL(phy_device_create); static int get_phy_c45_devs_in_pkg(struct mii_bus *bus, int addr, int dev_addr, u32 *devices_in_package) { - int phy_reg, reg_addr; + int phy_reg; - reg_addr = MII_ADDR_C45 | dev_addr << 16 | MDIO_DEVS2; - phy_reg = mdiobus_read(bus, addr, reg_addr); + phy_reg = mdiobus_c45_read(bus, addr, dev_addr, MDIO_DEVS2); if (phy_reg < 0) return -EIO; *devices_in_package = phy_reg << 16; - reg_addr = MII_ADDR_C45 | dev_addr << 16 | MDIO_DEVS1; - phy_reg = mdiobus_read(bus, addr, reg_addr); + phy_reg = mdiobus_c45_read(bus, addr, dev_addr, MDIO_DEVS1); if (phy_reg < 0) return -EIO; *devices_in_package |= phy_reg; @@ -709,11 +707,11 @@ static int get_phy_c45_devs_in_pkg(struct mii_bus *bus, int addr, int dev_addr, * */ static int get_phy_c45_ids(struct mii_bus *bus, int addr, u32 *phy_id, - struct phy_c45_device_ids *c45_ids) { - int phy_reg; - int i, reg_addr; + struct phy_c45_device_ids *c45_ids) +{ const int num_ids = ARRAY_SIZE(c45_ids->device_ids); u32 *devs = &c45_ids->devices_in_package; + int i, phy_reg; /* Find first non-zero Devices In package. Device zero is reserved * for 802.3 c45 complied PHYs, so don't probe it at first. @@ -747,14 +745,12 @@ static int get_phy_c45_ids(struct mii_bus *bus, int addr, u32 *phy_id, if (!(c45_ids->devices_in_package & (1 << i))) continue; - reg_addr = MII_ADDR_C45 | i << 16 | MII_PHYSID1; - phy_reg = mdiobus_read(bus, addr, reg_addr); + phy_reg = mdiobus_c45_read(bus, addr, i, MII_PHYSID1); if (phy_reg < 0) return -EIO; c45_ids->device_ids[i] = phy_reg << 16; - reg_addr = MII_ADDR_C45 | i << 16 | MII_PHYSID2; - phy_reg = mdiobus_read(bus, addr, reg_addr); + phy_reg = mdiobus_c45_read(bus, addr, i, MII_PHYSID2); if (phy_reg < 0) return -EIO; c45_ids->device_ids[i] |= phy_reg; diff --git a/drivers/net/phy/phylink.c b/drivers/net/phy/phylink.c index b6b1f77bba58..0ab65fb75258 100644 --- a/drivers/net/phy/phylink.c +++ b/drivers/net/phy/phylink.c @@ -1631,7 +1631,7 @@ static int phylink_phy_read(struct phylink *pl, unsigned int phy_id, if (mdio_phy_id_is_c45(phy_id)) { prtad = mdio_phy_id_prtad(phy_id); devad = mdio_phy_id_devad(phy_id); - devad = MII_ADDR_C45 | devad << 16 | reg; + devad = mdiobus_c45_addr(devad, reg); } else if (phydev->is_c45) { switch (reg) { case MII_BMCR: @@ -1654,7 +1654,7 @@ static int phylink_phy_read(struct phylink *pl, unsigned int phy_id, return -EINVAL; } prtad = phy_id; - devad = MII_ADDR_C45 | devad << 16 | reg; + devad = mdiobus_c45_addr(devad, reg); } else { prtad = phy_id; devad = reg; @@ -1671,7 +1671,7 @@ static int phylink_phy_write(struct phylink *pl, unsigned int phy_id, if (mdio_phy_id_is_c45(phy_id)) { prtad = mdio_phy_id_prtad(phy_id); devad = mdio_phy_id_devad(phy_id); - devad = MII_ADDR_C45 | devad << 16 | reg; + devad = mdiobus_c45_addr(devad, reg); } else if (phydev->is_c45) { switch (reg) { case MII_BMCR: @@ -1694,7 +1694,7 @@ static int phylink_phy_write(struct phylink *pl, unsigned int phy_id, return -EINVAL; } prtad = phy_id; - devad = MII_ADDR_C45 | devad << 16 | reg; + devad = mdiobus_c45_addr(devad, reg); } else { prtad = phy_id; devad = reg; @@ -2292,7 +2292,6 @@ void phylink_mii_c22_pcs_an_restart(struct mdio_device *pcs) } EXPORT_SYMBOL_GPL(phylink_mii_c22_pcs_an_restart); -#define C45_ADDR(d,a) (MII_ADDR_C45 | (d) << 16 | (a)) void 
phylink_mii_c45_pcs_get_state(struct mdio_device *pcs, struct phylink_link_state *state) { @@ -2300,7 +2299,7 @@ void phylink_mii_c45_pcs_get_state(struct mdio_device *pcs, int addr = pcs->addr; int stat; - stat = mdiobus_read(bus, addr, C45_ADDR(MDIO_MMD_PCS, MDIO_STAT1)); + stat = mdiobus_c45_read(bus, addr, MDIO_MMD_PCS, MDIO_STAT1); if (stat < 0) { state->link = false; return; diff --git a/include/linux/mdio.h b/include/linux/mdio.h index 917e4bb2ed71..36d2e0673d03 100644 --- a/include/linux/mdio.h +++ b/include/linux/mdio.h @@ -9,6 +9,13 @@ #include #include +/* Or MII_ADDR_C45 into regnum for read/write on mii_bus to enable the 21 bit + * IEEE 802.3ae clause 45 addressing mode used by 10GIGE phy chips. + */ +#define MII_ADDR_C45 (1<<30) +#define MII_DEVADDR_C45_SHIFT 16 +#define MII_REGADDR_C45_MASK GENMASK(15, 0) + struct gpio_desc; struct mii_bus; @@ -326,6 +333,30 @@ int mdiobus_write_nested(struct mii_bus *bus, int addr, u32 regnum, u16 val); int mdiobus_modify(struct mii_bus *bus, int addr, u32 regnum, u16 mask, u16 set); +static inline u32 mdiobus_c45_addr(int devad, u16 regnum) +{ + return MII_ADDR_C45 | devad << MII_DEVADDR_C45_SHIFT | regnum; +} + +static inline int __mdiobus_c45_read(struct mii_bus *bus, int prtad, int devad, + u16 regnum) +{ + return __mdiobus_read(bus, prtad, mdiobus_c45_addr(devad, regnum)); +} + +static inline int __mdiobus_c45_write(struct mii_bus *bus, int prtad, int devad, + u16 regnum, u16 val) +{ + return __mdiobus_write(bus, prtad, mdiobus_c45_addr(devad, regnum), + val); +} + +static inline int mdiobus_c45_read(struct mii_bus *bus, int prtad, int devad, + u16 regnum) +{ + return mdiobus_read(bus, prtad, mdiobus_c45_addr(devad, regnum)); +} + int mdiobus_register_device(struct mdio_device *mdiodev); int mdiobus_unregister_device(struct mdio_device *mdiodev); bool mdiobus_is_registered_device(struct mii_bus *bus, int addr); diff --git a/include/linux/phy.h b/include/linux/phy.h index 2bcdf19ed3b4..6d256e720a66 100644 --- a/include/linux/phy.h +++ b/include/linux/phy.h @@ -209,12 +209,6 @@ static inline const char *phy_modes(phy_interface_t interface) #define MII_BUS_ID_SIZE 61 -/* Or MII_ADDR_C45 into regnum for read/write on mii_bus to enable the 21 bit - IEEE 802.3ae clause 45 addressing mode used by 10GIGE phy chips. */ -#define MII_ADDR_C45 (1<<30) -#define MII_DEVADDR_C45_SHIFT 16 -#define MII_REGADDR_C45_MASK GENMASK(15, 0) - struct device; struct phylink; struct sfp_bus; -- cgit v1.2.3 From f1e71d75f04792721d411170168e68019ccb7de3 Mon Sep 17 00:00:00 2001 From: "Gustavo A. R. Silva" Date: Thu, 7 May 2020 14:04:52 -0500 Subject: nvme: replace zero-length array with flexible-array The current codebase makes use of the zero-length array language extension to the C90 standard, but the preferred mechanism to declare variable-length types such as these is a flexible array member[1][2], introduced in C99: struct foo { int stuff; struct boo array[]; }; By making use of the mechanism above, we will get a compiler warning in case the flexible array does not occur last in the structure, which will help us prevent some kinds of undefined behavior bugs from being inadvertently introduced[3] to the codebase from now on. Also, notice that dynamic memory allocations won't be affected by this change: "Flexible array members have incomplete type, and so the sizeof operator may not be applied.
As a quirk of the original implementation of zero-length arrays, sizeof evaluates to zero."[1] sizeof(flexible-array-member) triggers a warning because flexible array members have incomplete type[1]. There are some instances of code in which the sizeof operator is being incorrectly/erroneously applied to zero-length arrays and the result is zero. Such instances may be hiding some bugs. So, this work (flexible-array member conversions) will also help to get completely rid of those sorts of issues. This issue was found with the help of Coccinelle. [1] https://gcc.gnu.org/onlinedocs/gcc/Zero-Length.html [2] https://github.com/KSPP/linux/issues/21 [3] commit 76497732932f ("cxgb3/l2t: Fix undefined behaviour") Signed-off-by: Gustavo A. R. Silva Signed-off-by: Christoph Hellwig --- drivers/nvme/host/fc.c | 2 +- drivers/nvme/host/lightnvm.c | 2 +- include/linux/nvme.h | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/drivers/nvme/host/fc.c b/drivers/nvme/host/fc.c index 8aabc056c754..cb0007592c12 100644 --- a/drivers/nvme/host/fc.c +++ b/drivers/nvme/host/fc.c @@ -108,7 +108,7 @@ struct nvme_fc_fcp_op { struct nvme_fcp_op_w_sgl { struct nvme_fc_fcp_op op; struct scatterlist sgl[NVME_INLINE_SG_CNT]; - uint8_t priv[0]; + uint8_t priv[]; }; struct nvme_fc_lport { diff --git a/drivers/nvme/host/lightnvm.c b/drivers/nvme/host/lightnvm.c index ec46693f6b64..3002bf972c6b 100644 --- a/drivers/nvme/host/lightnvm.c +++ b/drivers/nvme/host/lightnvm.c @@ -171,7 +171,7 @@ struct nvme_nvm_bb_tbl { __le32 tdresv; __le32 thresv; __le32 rsvd2[8]; - __u8 blk[0]; + __u8 blk[]; }; struct nvme_nvm_id20_addrf { diff --git a/include/linux/nvme.h b/include/linux/nvme.h index b235a48eac8c..e2993e6a9d7c 100644 --- a/include/linux/nvme.h +++ b/include/linux/nvme.h @@ -1185,7 +1185,7 @@ struct nvmf_disc_rsp_page_hdr { __le64 numrec; __le16 recfmt; __u8 resv14[1006]; - struct nvmf_disc_rsp_page_entry entries[0]; + struct nvmf_disc_rsp_page_entry entries[]; }; enum { -- cgit v1.2.3 From c295ee4742fda49de598a304ef9a95cf8da6b1f5 Mon Sep 17 00:00:00 2001 From: Max Gurtovoy Date: Tue, 19 May 2020 17:05:48 +0300 Subject: block: always define struct blk_integrity in genhd.h This will reduce the amount of ifdefs inside the source code for various drivers and also will reduce the amount of stub functions that were created for the !CONFIG_BLK_DEV_INTEGRITY case. Suggested-by: Christoph Hellwig Signed-off-by: Max Gurtovoy Reviewed-by: Israel Rukshin Reviewed-by: Martin K. Petersen Signed-off-by: Christoph Hellwig --- include/linux/genhd.h | 4 ---- 1 file changed, 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/genhd.h b/include/linux/genhd.h index f9c226f9546a..2590bed6e6b3 100644 --- a/include/linux/genhd.h +++ b/include/linux/genhd.h @@ -169,8 +169,6 @@ struct disk_part_tbl { struct disk_events; struct badblocks; -#if defined(CONFIG_BLK_DEV_INTEGRITY) - struct blk_integrity { const struct blk_integrity_profile *profile; unsigned char flags; @@ -179,8 +177,6 @@ struct blk_integrity { unsigned char tag_size; }; -#endif /* CONFIG_BLK_DEV_INTEGRITY */ - struct gendisk { /* major, first_minor and minors are input parameters only, * don't use directly. Use disk_devt() and disk_max_parts(). -- cgit v1.2.3 From 39481fbd14ee272edd419d73a98bc637e2a3fd35 Mon Sep 17 00:00:00 2001 From: Israel Rukshin Date: Tue, 19 May 2020 17:06:00 +0300 Subject: nvme: add Metadata Capabilities enumerations The enumerations will be used to expose the namespace metadata format by the target. 
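[Editor's aside, not part of the patch: a minimal sketch of how a consumer might test the two new MC bits when parsing Identify Namespace data. struct nvme_id_ns and its "mc" byte are existing definitions in include/linux/nvme.h; the helper names below are hypothetical, for illustration only.]

#include <linux/nvme.h>

/* MC bit 0: metadata may be transferred inline, as part of an extended LBA */
static bool example_ns_has_extended_lba(const struct nvme_id_ns *id)
{
	return id->mc & NVME_MC_EXTENDED_LBA;
}

/* MC bit 1: metadata may be transferred in a separate buffer (metadata pointer) */
static bool example_ns_has_metadata_ptr(const struct nvme_id_ns *id)
{
	return id->mc & NVME_MC_METADATA_PTR;
}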
Suggested-by: Christoph Hellwig Signed-off-by: Israel Rukshin Signed-off-by: Max Gurtovoy Reviewed-by: James Smart Reviewed-by: Martin K. Petersen Signed-off-by: Christoph Hellwig --- include/linux/nvme.h | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'include/linux') diff --git a/include/linux/nvme.h b/include/linux/nvme.h index e2993e6a9d7c..5ce51ab4c50e 100644 --- a/include/linux/nvme.h +++ b/include/linux/nvme.h @@ -420,6 +420,12 @@ enum { NVME_NS_DPS_PI_TYPE3 = 3, }; +/* Identify Namespace Metadata Capabilities (MC): */ +enum { + NVME_MC_EXTENDED_LBA = (1 << 0), + NVME_MC_METADATA_PTR = (1 << 1), +}; + struct nvme_ns_id_desc { __u8 nidt; __u8 nidl; -- cgit v1.2.3 From 1a644de29f712771c2ec00e52caa391544eb6141 Mon Sep 17 00:00:00 2001 From: Andrew Lunn Date: Wed, 27 May 2020 00:21:38 +0200 Subject: net: ethtool: Add generic parts of cable test TDR Add the generic parts of the code used to trigger a cable test and return raw TDR data. Any PHY driver that supports this must implement the new driver op. Signed-off-by: Andrew Lunn v2: Update nxp-tja11xx for API change. Signed-off-by: David S. Miller --- drivers/net/phy/nxp-tja11xx.c | 2 +- drivers/net/phy/phy.c | 65 ++++++++++++++++++++++++++++++++++++++++- include/linux/ethtool_netlink.h | 4 +-- include/linux/phy.h | 13 +++++++++ net/ethtool/cabletest.c | 64 ++++++++++++++++++++++++++++++++++++---- net/ethtool/netlink.c | 5 ++++ net/ethtool/netlink.h | 1 + 7 files changed, 144 insertions(+), 10 deletions(-) (limited to 'include/linux') diff --git a/drivers/net/phy/nxp-tja11xx.c b/drivers/net/phy/nxp-tja11xx.c index 1e79c30ca81a..a72fa0d2e7c7 100644 --- a/drivers/net/phy/nxp-tja11xx.c +++ b/drivers/net/phy/nxp-tja11xx.c @@ -194,7 +194,7 @@ static int tja11xx_config_aneg_cable_test(struct phy_device *phydev) !phydev->drv->cable_test_get_status) return 0; - ret = ethnl_cable_test_alloc(phydev); + ret = ethnl_cable_test_alloc(phydev, ETHTOOL_MSG_CABLE_TEST_NTF); if (ret) return ret; diff --git a/drivers/net/phy/phy.c b/drivers/net/phy/phy.c index 27da0c94818f..495d9ba3d5bf 100644 --- a/drivers/net/phy/phy.c +++ b/drivers/net/phy/phy.c @@ -519,7 +519,7 @@ int phy_start_cable_test(struct phy_device *phydev, goto out; } - err = ethnl_cable_test_alloc(phydev); + err = ethnl_cable_test_alloc(phydev, ETHTOOL_MSG_CABLE_TEST_NTF); if (err) goto out; @@ -552,6 +552,69 @@ out: } EXPORT_SYMBOL(phy_start_cable_test); +int phy_start_cable_test_tdr(struct phy_device *phydev, + struct netlink_ext_ack *extack) +{ + struct net_device *dev = phydev->attached_dev; + int err = -ENOMEM; + + if (!(phydev->drv && + phydev->drv->cable_test_tdr_start && + phydev->drv->cable_test_get_status)) { + NL_SET_ERR_MSG(extack, + "PHY driver does not support cable test TDR"); + return -EOPNOTSUPP; + } + + mutex_lock(&phydev->lock); + if (phydev->state == PHY_CABLETEST) { + NL_SET_ERR_MSG(extack, + "PHY already performing a test"); + err = -EBUSY; + goto out; + } + + if (phydev->state < PHY_UP || + phydev->state > PHY_CABLETEST) { + NL_SET_ERR_MSG(extack, + "PHY not configured.
Try setting interface up"); + err = -EBUSY; + goto out; + } + + err = ethnl_cable_test_alloc(phydev, ETHTOOL_MSG_CABLE_TEST_TDR_NTF); + if (err) + goto out; + + /* Mark the carrier down until the test is complete */ + phy_link_down(phydev); + + netif_testing_on(dev); + err = phydev->drv->cable_test_tdr_start(phydev); + if (err) { + netif_testing_off(dev); + phy_link_up(phydev); + goto out_free; + } + + phydev->state = PHY_CABLETEST; + + if (phy_polling_mode(phydev)) + phy_trigger_machine(phydev); + + mutex_unlock(&phydev->lock); + + return 0; + +out_free: + ethnl_cable_test_free(phydev); +out: + mutex_unlock(&phydev->lock); + + return err; +} +EXPORT_SYMBOL(phy_start_cable_test_tdr); + static int phy_config_aneg(struct phy_device *phydev) { if (phydev->drv->config_aneg) diff --git a/include/linux/ethtool_netlink.h b/include/linux/ethtool_netlink.h index e317fc99565e..24817ba252a0 100644 --- a/include/linux/ethtool_netlink.h +++ b/include/linux/ethtool_netlink.h @@ -17,13 +17,13 @@ enum ethtool_multicast_groups { struct phy_device; #if IS_ENABLED(CONFIG_ETHTOOL_NETLINK) -int ethnl_cable_test_alloc(struct phy_device *phydev); +int ethnl_cable_test_alloc(struct phy_device *phydev, u8 cmd); void ethnl_cable_test_free(struct phy_device *phydev); void ethnl_cable_test_finished(struct phy_device *phydev); int ethnl_cable_test_result(struct phy_device *phydev, u8 pair, u8 result); int ethnl_cable_test_fault_length(struct phy_device *phydev, u8 pair, u32 cm); #else -static inline int ethnl_cable_test_alloc(struct phy_device *phydev) +static inline int ethnl_cable_test_alloc(struct phy_device *phydev, u8 cmd) { return -EOPNOTSUPP; } diff --git a/include/linux/phy.h b/include/linux/phy.h index 6d256e720a66..d3c384f353ca 100644 --- a/include/linux/phy.h +++ b/include/linux/phy.h @@ -699,6 +699,10 @@ struct phy_driver { /* Start a cable test */ int (*cable_test_start)(struct phy_device *dev); + + /* Start a raw TDR cable test */ + int (*cable_test_tdr_start)(struct phy_device *dev); + /* Once per second, or on interrupt, request the status of the * test. 
*/ @@ -1251,6 +1255,8 @@ int phy_reset_after_clk_enable(struct phy_device *phydev); #if IS_ENABLED(CONFIG_PHYLIB) int phy_start_cable_test(struct phy_device *phydev, struct netlink_ext_ack *extack); +int phy_start_cable_test_tdr(struct phy_device *phydev, + struct netlink_ext_ack *extack); #else static inline int phy_start_cable_test(struct phy_device *phydev, @@ -1259,6 +1265,13 @@ int phy_start_cable_test(struct phy_device *phydev, NL_SET_ERR_MSG(extack, "Kernel not compiled with PHYLIB support"); return -EOPNOTSUPP; } +static inline +int phy_start_cable_test_tdr(struct phy_device *phydev, + struct netlink_ext_ack *extack) +{ + NL_SET_ERR_MSG(extack, "Kernel not compiled with PHYLIB support"); + return -EOPNOTSUPP; +} #endif int phy_cable_test_result(struct phy_device *phydev, u8 pair, u16 result); diff --git a/net/ethtool/cabletest.c b/net/ethtool/cabletest.c index 5ba06eabe8c2..94e9d5f04353 100644 --- a/net/ethtool/cabletest.c +++ b/net/ethtool/cabletest.c @@ -13,7 +13,7 @@ cable_test_act_policy[ETHTOOL_A_CABLE_TEST_MAX + 1] = { [ETHTOOL_A_CABLE_TEST_HEADER] = { .type = NLA_NESTED }, }; -static int ethnl_cable_test_started(struct phy_device *phydev) +static int ethnl_cable_test_started(struct phy_device *phydev, u8 cmd) { struct sk_buff *skb; int err = -ENOMEM; @@ -23,7 +23,7 @@ static int ethnl_cable_test_started(struct phy_device *phydev) if (!skb) goto out; - ehdr = ethnl_bcastmsg_put(skb, ETHTOOL_MSG_CABLE_TEST_NTF); + ehdr = ethnl_bcastmsg_put(skb, cmd); if (!ehdr) { err = -EMSGSIZE; goto out; @@ -86,7 +86,8 @@ int ethnl_act_cable_test(struct sk_buff *skb, struct genl_info *info) ethnl_ops_complete(dev); if (!ret) - ethnl_cable_test_started(dev->phydev); + ethnl_cable_test_started(dev->phydev, + ETHTOOL_MSG_CABLE_TEST_NTF); out_rtnl: rtnl_unlock(); @@ -95,7 +96,7 @@ out_dev_put: return ret; } -int ethnl_cable_test_alloc(struct phy_device *phydev) +int ethnl_cable_test_alloc(struct phy_device *phydev, u8 cmd) { int err = -ENOMEM; @@ -103,8 +104,7 @@ int ethnl_cable_test_alloc(struct phy_device *phydev) if (!phydev->skb) goto out; - phydev->ehdr = ethnl_bcastmsg_put(phydev->skb, - ETHTOOL_MSG_CABLE_TEST_NTF); + phydev->ehdr = ethnl_bcastmsg_put(phydev->skb, cmd); if (!phydev->ehdr) { err = -EMSGSIZE; goto out; @@ -199,3 +199,55 @@ err: return ret; } EXPORT_SYMBOL_GPL(ethnl_cable_test_fault_length); + +static const struct nla_policy +cable_test_tdr_act_policy[ETHTOOL_A_CABLE_TEST_TDR_MAX + 1] = { + [ETHTOOL_A_CABLE_TEST_TDR_UNSPEC] = { .type = NLA_REJECT }, + [ETHTOOL_A_CABLE_TEST_TDR_HEADER] = { .type = NLA_NESTED }, +}; + +int ethnl_act_cable_test_tdr(struct sk_buff *skb, struct genl_info *info) +{ + struct nlattr *tb[ETHTOOL_A_CABLE_TEST_TDR_MAX + 1]; + struct ethnl_req_info req_info = {}; + struct net_device *dev; + int ret; + + ret = nlmsg_parse(info->nlhdr, GENL_HDRLEN, tb, + ETHTOOL_A_CABLE_TEST_TDR_MAX, + cable_test_tdr_act_policy, info->extack); + if (ret < 0) + return ret; + + ret = ethnl_parse_header_dev_get(&req_info, + tb[ETHTOOL_A_CABLE_TEST_TDR_HEADER], + genl_info_net(info), info->extack, + true); + if (ret < 0) + return ret; + + dev = req_info.dev; + if (!dev->phydev) { + ret = -EOPNOTSUPP; + goto out_dev_put; + } + + rtnl_lock(); + ret = ethnl_ops_begin(dev); + if (ret < 0) + goto out_rtnl; + + ret = phy_start_cable_test_tdr(dev->phydev, info->extack); + + ethnl_ops_complete(dev); + + if (!ret) + ethnl_cable_test_started(dev->phydev, + ETHTOOL_MSG_CABLE_TEST_TDR_NTF); + +out_rtnl: + rtnl_unlock(); +out_dev_put: + dev_put(dev); + return ret; +} diff --git 
a/net/ethtool/netlink.c b/net/ethtool/netlink.c index 0f2f4754dcf9..88fd07f47040 100644 --- a/net/ethtool/netlink.c +++ b/net/ethtool/netlink.c @@ -844,6 +844,11 @@ static const struct genl_ops ethtool_genl_ops[] = { .flags = GENL_UNS_ADMIN_PERM, .doit = ethnl_act_cable_test, }, + { + .cmd = ETHTOOL_MSG_CABLE_TEST_TDR_ACT, + .flags = GENL_UNS_ADMIN_PERM, + .doit = ethnl_act_cable_test_tdr, + }, }; static const struct genl_multicast_group ethtool_nl_mcgrps[] = { diff --git a/net/ethtool/netlink.h b/net/ethtool/netlink.h index b0eb5d920099..9a96b6e90dc2 100644 --- a/net/ethtool/netlink.h +++ b/net/ethtool/netlink.h @@ -360,5 +360,6 @@ int ethnl_set_coalesce(struct sk_buff *skb, struct genl_info *info); int ethnl_set_pause(struct sk_buff *skb, struct genl_info *info); int ethnl_set_eee(struct sk_buff *skb, struct genl_info *info); int ethnl_act_cable_test(struct sk_buff *skb, struct genl_info *info); +int ethnl_act_cable_test_tdr(struct sk_buff *skb, struct genl_info *info); #endif /* _NET_ETHTOOL_NETLINK_H */ -- cgit v1.2.3 From 6b4a0fc106521e480c00b55a7ef38c89f02dc4e8 Mon Sep 17 00:00:00 2001 From: Andrew Lunn Date: Wed, 27 May 2020 00:21:39 +0200 Subject: net: ethtool: Add helpers for cable test TDR data Add helpers for returning raw TDR data in netlink messages. Signed-off-by: Andrew Lunn Signed-off-by: David S. Miller --- include/linux/ethtool_netlink.h | 21 +++++++++++ net/ethtool/cabletest.c | 80 ++++++++++++++++++++++++++++++++++++++++- 2 files changed, 100 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/ethtool_netlink.h b/include/linux/ethtool_netlink.h index 24817ba252a0..8fbe4f97ffad 100644 --- a/include/linux/ethtool_netlink.h +++ b/include/linux/ethtool_netlink.h @@ -22,6 +22,10 @@ void ethnl_cable_test_free(struct phy_device *phydev); void ethnl_cable_test_finished(struct phy_device *phydev); int ethnl_cable_test_result(struct phy_device *phydev, u8 pair, u8 result); int ethnl_cable_test_fault_length(struct phy_device *phydev, u8 pair, u32 cm); +int ethnl_cable_test_amplitude(struct phy_device *phydev, u8 pair, s16 mV); +int ethnl_cable_test_pulse(struct phy_device *phydev, u16 mV); +int ethnl_cable_test_step(struct phy_device *phydev, u32 first, u32 last, + u32 step); #else static inline int ethnl_cable_test_alloc(struct phy_device *phydev, u8 cmd) { @@ -46,5 +50,22 @@ static inline int ethnl_cable_test_fault_length(struct phy_device *phydev, { return -EOPNOTSUPP; } + +static inline int ethnl_cable_test_amplitude(struct phy_device *phydev, + u8 pair, s16 mV) +{ + return -EOPNOTSUPP; +} + +static inline int ethnl_cable_test_pulse(struct phy_device *phydev, u16 mV) +{ + return -EOPNOTSUPP; +} + +static inline int ethnl_cable_test_step(struct phy_device *phydev, u32 first, + u32 last, u32 step) +{ + return -EOPNOTSUPP; +} #endif /* IS_ENABLED(ETHTOOL_NETLINK) */ #endif /* _LINUX_ETHTOOL_NETLINK_H_ */ diff --git a/net/ethtool/cabletest.c b/net/ethtool/cabletest.c index 94e9d5f04353..390d0673ff01 100644 --- a/net/ethtool/cabletest.c +++ b/net/ethtool/cabletest.c @@ -100,7 +100,10 @@ int ethnl_cable_test_alloc(struct phy_device *phydev, u8 cmd) { int err = -ENOMEM; - phydev->skb = genlmsg_new(NLMSG_GOODSIZE, GFP_KERNEL); + /* One TDR sample occupies 20 bytes. For a 150 meter cable, + * with four pairs, around 12K is needed.
+ */ + phydev->skb = genlmsg_new(SZ_16K, GFP_KERNEL); if (!phydev->skb) goto out; @@ -251,3 +254,78 @@ out_dev_put: dev_put(dev); return ret; } + +int ethnl_cable_test_amplitude(struct phy_device *phydev, + u8 pair, s16 mV) +{ + struct nlattr *nest; + int ret = -EMSGSIZE; + + nest = nla_nest_start(phydev->skb, + ETHTOOL_A_CABLE_TDR_NEST_AMPLITUDE); + if (!nest) + return -EMSGSIZE; + + if (nla_put_u8(phydev->skb, ETHTOOL_A_CABLE_AMPLITUDE_PAIR, pair)) + goto err; + if (nla_put_u16(phydev->skb, ETHTOOL_A_CABLE_AMPLITUDE_mV, mV)) + goto err; + + nla_nest_end(phydev->skb, nest); + return 0; + +err: + nla_nest_cancel(phydev->skb, nest); + return ret; +} +EXPORT_SYMBOL_GPL(ethnl_cable_test_amplitude); + +int ethnl_cable_test_pulse(struct phy_device *phydev, u16 mV) +{ + struct nlattr *nest; + int ret = -EMSGSIZE; + + nest = nla_nest_start(phydev->skb, ETHTOOL_A_CABLE_TDR_NEST_PULSE); + if (!nest) + return -EMSGSIZE; + + if (nla_put_u16(phydev->skb, ETHTOOL_A_CABLE_PULSE_mV, mV)) + goto err; + + nla_nest_end(phydev->skb, nest); + return 0; + +err: + nla_nest_cancel(phydev->skb, nest); + return ret; +} +EXPORT_SYMBOL_GPL(ethnl_cable_test_pulse); + +int ethnl_cable_test_step(struct phy_device *phydev, u32 first, u32 last, + u32 step) +{ + struct nlattr *nest; + int ret = -EMSGSIZE; + + nest = nla_nest_start(phydev->skb, ETHTOOL_A_CABLE_TDR_NEST_STEP); + if (!nest) + return -EMSGSIZE; + + if (nla_put_u32(phydev->skb, ETHTOOL_A_CABLE_STEP_FIRST_DISTANCE, + first)) + goto err; + + if (nla_put_u32(phydev->skb, ETHTOOL_A_CABLE_STEP_LAST_DISTANCE, last)) + goto err; + + if (nla_put_u32(phydev->skb, ETHTOOL_A_CABLE_STEP_STEP_DISTANCE, step)) + goto err; + + nla_nest_end(phydev->skb, nest); + return 0; + +err: + nla_nest_cancel(phydev->skb, nest); + return ret; +} +EXPORT_SYMBOL_GPL(ethnl_cable_test_step); -- cgit v1.2.3 From f2bc8ad31a7f814237bc6301d59296d76505a688 Mon Sep 17 00:00:00 2001 From: Andrew Lunn Date: Wed, 27 May 2020 00:21:41 +0200 Subject: net: ethtool: Allow PHY cable test TDR data to be configured Allow the user to configure where on the cable the TDR data should be retrieved, in terms of first and last sample, and the step between samples. Also add the ability to ask for TDR data for just one pair. If this configuration is not provided, it defaults to 1-150m at 1m intervals for all pairs. Signed-off-by: Andrew Lunn v3: Move the TDR configuration into a structure Add a range check on step Use NL_SET_ERR_MSG_ATTR() when appropriate Move TDR configuration into a nest Document attributes in the request Signed-off-by: David S.
Miller --- Documentation/networking/ethtool-netlink.rst | 22 +++++- drivers/net/phy/marvell.c | 59 ++++++++++----- drivers/net/phy/phy.c | 5 +- include/linux/phy.h | 21 +++++- include/uapi/linux/ethtool_netlink.h | 13 ++++ net/ethtool/cabletest.c | 104 ++++++++++++++++++++++++++- 6 files changed, 197 insertions(+), 27 deletions(-) (limited to 'include/linux') diff --git a/Documentation/networking/ethtool-netlink.rst b/Documentation/networking/ethtool-netlink.rst index dae36227d590..d42661b91128 100644 --- a/Documentation/networking/ethtool-netlink.rst +++ b/Documentation/networking/ethtool-netlink.rst @@ -1023,9 +1023,25 @@ Start a cable test and report raw TDR data Request contents: - ==================================== ====== ========================== - ``ETHTOOL_A_CABLE_TEST_TDR_HEADER`` nested request header - ==================================== ====== ========================== + +--------------------------------------------+--------+-----------------------+ + | ``ETHTOOL_A_CABLE_TEST_TDR_HEADER`` | nested | reply header | + +--------------------------------------------+--------+-----------------------+ + | ``ETHTOOL_A_CABLE_TEST_TDR_CFG`` | nested | test configuration | + +-+------------------------------------------+--------+-----------------------+ + | | ``ETHTOOL_A_CABLE_STEP_FIRST_DISTANCE `` | u32 | first data distance | + +-+-+----------------------------------------+--------+-----------------------+ + | | ``ETHTOOL_A_CABLE_STEP_LAST_DISTANCE `` | u32 | last data distance | + +-+-+----------------------------------------+--------+-----------------------+ + | | ``ETHTOOL_A_CABLE_STEP_STEP_DISTANCE `` | u32 | distance of each step | + +-+-+----------------------------------------+--------+-----------------------+ + | | ``ETHTOOL_A_CABLE_TEST_TDR_CFG_PAIR`` | u8 | pair to test | + +-+-+----------------------------------------+--------+-----------------------+ + +The ETHTOOL_A_CABLE_TEST_TDR_CFG is optional, as well as all members +of the nest. All distances are expressed in centimeters. The PHY takes +the distances as a guide, and rounds to the nearest distance it +actually supports. If a pair is passed, only that one pair will be +tested. Otherwise all pairs are tested. 
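[Editor's aside, not part of the patch: a worked example of the configuration described above. A request for TDR data on pair A only, from 1m to 30m with a sample every meter, is parsed by the kernel into the struct phy_tdr_config added by this patch, roughly as below; all distances are in centimeters, and the "example_cfg" name is illustrative.]

	static const struct phy_tdr_config example_cfg = {
		.first = 100,			/* first sample at 1m */
		.last = 3000,			/* last sample at 30m */
		.step = 100,			/* one sample per meter */
		.pair = ETHTOOL_A_CABLE_PAIR_A,	/* or PHY_PAIR_ALL for all pairs */
	};

If the nest or any of its members is omitted, the code falls back to the defaults stated in the commit message: 1-150m at 1m intervals for all pairs.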
Notification contents: diff --git a/drivers/net/phy/marvell.c b/drivers/net/phy/marvell.c index e597bee2e966..335e51d6f138 100644 --- a/drivers/net/phy/marvell.c +++ b/drivers/net/phy/marvell.c @@ -198,6 +198,7 @@ #define MII_VCT5_CTRL_PEEK_HYST_DEFAULT 3 #define MII_VCT5_SAMPLE_POINT_DISTANCE 0x18 +#define MII_VCT5_SAMPLE_POINT_DISTANCE_MAX 511 #define MII_VCT5_TX_PULSE_CTRL 0x1c #define MII_VCT5_TX_PULSE_CTRL_DONT_WAIT_LINK_DOWN BIT(12) #define MII_VCT5_TX_PULSE_CTRL_PULSE_WIDTH_128nS (0x0 << 10) @@ -270,6 +271,10 @@ struct marvell_priv { char *hwmon_name; struct device *hwmon_dev; bool cable_test_tdr; + u32 first; + u32 last; + u32 step; + s8 pair; }; static int marvell_read_page(struct phy_device *phydev) @@ -1787,12 +1792,18 @@ static u32 marvell_vct5_distance2cm(int distance) return distance * 805 / 10; } +static u32 marvell_vct5_cm2distance(int cm) +{ + return cm * 10 / 805; +} + static int marvell_vct5_amplitude_distance(struct phy_device *phydev, - int distance) + int distance, int pair) { - int mV_pair0, mV_pair1, mV_pair2, mV_pair3; u16 reg; int err; + int mV; + int i; err = phy_write_paged(phydev, MII_MARVELL_VCT5_PAGE, MII_VCT5_SAMPLE_POINT_DISTANCE, @@ -1814,21 +1825,20 @@ static int marvell_vct5_amplitude_distance(struct phy_device *phydev, if (err) return err; - mV_pair0 = marvell_vct5_amplitude(phydev, 0); - mV_pair1 = marvell_vct5_amplitude(phydev, 1); - mV_pair2 = marvell_vct5_amplitude(phydev, 2); - mV_pair3 = marvell_vct5_amplitude(phydev, 3); + for (i = 0; i < 4; i++) { + if (pair != PHY_PAIR_ALL && i != pair) + continue; - ethnl_cable_test_amplitude(phydev, ETHTOOL_A_CABLE_PAIR_A, mV_pair0); - ethnl_cable_test_amplitude(phydev, ETHTOOL_A_CABLE_PAIR_B, mV_pair1); - ethnl_cable_test_amplitude(phydev, ETHTOOL_A_CABLE_PAIR_C, mV_pair2); - ethnl_cable_test_amplitude(phydev, ETHTOOL_A_CABLE_PAIR_D, mV_pair3); + mV = marvell_vct5_amplitude(phydev, i); + ethnl_cable_test_amplitude(phydev, i, mV); + } return 0; } static int marvell_vct5_amplitude_graph(struct phy_device *phydev) { + struct marvell_priv *priv = phydev->priv; int distance; int err; u16 reg; @@ -1843,8 +1853,11 @@ static int marvell_vct5_amplitude_graph(struct phy_device *phydev) if (err) return err; - for (distance = 0; distance <= 100; distance++) { - err = marvell_vct5_amplitude_distance(phydev, distance); + for (distance = priv->first; + distance <= priv->last; + distance += priv->step) { + err = marvell_vct5_amplitude_distance(phydev, distance, + priv->pair); if (err) return err; } @@ -1918,11 +1931,24 @@ static int marvell_vct7_cable_test_start(struct phy_device *phydev) MII_VCT7_CTRL_CENTIMETERS); } -static int marvell_vct5_cable_test_tdr_start(struct phy_device *phydev) +static int marvell_vct5_cable_test_tdr_start(struct phy_device *phydev, + const struct phy_tdr_config *cfg) { struct marvell_priv *priv = phydev->priv; int ret; + priv->cable_test_tdr = true; + priv->first = marvell_vct5_cm2distance(cfg->first); + priv->last = marvell_vct5_cm2distance(cfg->last); + priv->step = marvell_vct5_cm2distance(cfg->step); + priv->pair = cfg->pair; + + if (priv->first > MII_VCT5_SAMPLE_POINT_DISTANCE_MAX) + return -EINVAL; + + if (priv->last > MII_VCT5_SAMPLE_POINT_DISTANCE_MAX) + return -EINVAL; + /* Disable VCT7 */ ret = phy_write_paged(phydev, MII_MARVELL_VCT7_PAGE, MII_VCT7_CTRL, 0); @@ -1933,15 +1959,14 @@ static int marvell_vct5_cable_test_tdr_start(struct phy_device *phydev) if (ret) return ret; - priv->cable_test_tdr = true; ret = ethnl_cable_test_pulse(phydev, 1000); if (ret) return ret; return 
ethnl_cable_test_step(phydev, - marvell_vct5_distance2cm(0), - marvell_vct5_distance2cm(100), - marvell_vct5_distance2cm(1)); + marvell_vct5_distance2cm(priv->first), + marvell_vct5_distance2cm(priv->last), + marvell_vct5_distance2cm(priv->step)); } static int marvell_vct7_distance_to_length(int distance, bool meter) diff --git a/drivers/net/phy/phy.c b/drivers/net/phy/phy.c index 495d9ba3d5bf..1de3938628f4 100644 --- a/drivers/net/phy/phy.c +++ b/drivers/net/phy/phy.c @@ -553,7 +553,8 @@ out: EXPORT_SYMBOL(phy_start_cable_test); int phy_start_cable_test_tdr(struct phy_device *phydev, - struct netlink_ext_ack *extack) + struct netlink_ext_ack *extack, + const struct phy_tdr_config *config) { struct net_device *dev = phydev->attached_dev; int err = -ENOMEM; @@ -590,7 +591,7 @@ int phy_start_cable_test_tdr(struct phy_device *phydev, phy_link_down(phydev); netif_testing_on(dev); - err = phydev->drv->cable_test_tdr_start(phydev); + err = phydev->drv->cable_test_tdr_start(phydev, config); if (err) { netif_testing_off(dev); phy_link_up(phydev); diff --git a/include/linux/phy.h b/include/linux/phy.h index d3c384f353ca..8c05d0fb5c00 100644 --- a/include/linux/phy.h +++ b/include/linux/phy.h @@ -548,6 +548,18 @@ struct phy_device { #define to_phy_device(d) container_of(to_mdio_device(d), \ struct phy_device, mdio) +/* A structure containing possible configuration parameters + * for a TDR cable test. The driver does not need to implement + * all the parameters, but should report what is actually used. + */ +struct phy_tdr_config { + u32 first; + u32 last; + u32 step; + s8 pair; +}; +#define PHY_PAIR_ALL -1 + /* struct phy_driver: Driver structure for a particular PHY type * * driver_data: static driver data @@ -701,7 +713,8 @@ struct phy_driver { int (*cable_test_start)(struct phy_device *dev); /* Start a raw TDR cable test */ - int (*cable_test_tdr_start)(struct phy_device *dev); + int (*cable_test_tdr_start)(struct phy_device *dev, + const struct phy_tdr_config *config); /* Once per second, or on interrupt, request the status of the * test. 
@@ -1256,7 +1269,8 @@ int phy_reset_after_clk_enable(struct phy_device *phydev); int phy_start_cable_test(struct phy_device *phydev, struct netlink_ext_ack *extack); int phy_start_cable_test_tdr(struct phy_device *phydev, - struct netlink_ext_ack *extack); + struct netlink_ext_ack *extack, + const struct phy_tdr_config *config); #else static inline int phy_start_cable_test(struct phy_device *phydev, @@ -1267,7 +1281,8 @@ int phy_start_cable_test(struct phy_device *phydev, } static inline int phy_start_cable_test_tdr(struct phy_device *phydev, - struct netlink_ext_ack *extack) + struct netlink_ext_ack *extack, + const struct phy_tdr_config *config) { NL_SET_ERR_MSG(extack, "Kernel not compiled with PHYLIB support"); return -EOPNOTSUPP; diff --git a/include/uapi/linux/ethtool_netlink.h b/include/uapi/linux/ethtool_netlink.h index 739faa7070c6..fc9051f2eeac 100644 --- a/include/uapi/linux/ethtool_netlink.h +++ b/include/uapi/linux/ethtool_netlink.h @@ -482,9 +482,22 @@ enum { /* CABLE TEST TDR */ +enum { + ETHTOOL_A_CABLE_TEST_TDR_CFG_UNSPEC, + ETHTOOL_A_CABLE_TEST_TDR_CFG_FIRST, /* u32 */ + ETHTOOL_A_CABLE_TEST_TDR_CFG_LAST, /* u32 */ + ETHTOOL_A_CABLE_TEST_TDR_CFG_STEP, /* u32 */ + ETHTOOL_A_CABLE_TEST_TDR_CFG_PAIR, /* u8 */ + + /* add new constants above here */ + __ETHTOOL_A_CABLE_TEST_TDR_CFG_CNT, + ETHTOOL_A_CABLE_TEST_TDR_CFG_MAX = __ETHTOOL_A_CABLE_TEST_TDR_CFG_CNT - 1 +}; + enum { ETHTOOL_A_CABLE_TEST_TDR_UNSPEC, ETHTOOL_A_CABLE_TEST_TDR_HEADER, /* nest - _A_HEADER_* */ + ETHTOOL_A_CABLE_TEST_TDR_CFG, /* nest - *_TDR_CFG_* */ /* add new constants above here */ __ETHTOOL_A_CABLE_TEST_TDR_CNT, diff --git a/net/ethtool/cabletest.c b/net/ethtool/cabletest.c index 390d0673ff01..9991688d7d1d 100644 --- a/net/ethtool/cabletest.c +++ b/net/ethtool/cabletest.c @@ -5,7 +5,11 @@ #include "netlink.h" #include "common.h" -/* CABLE_TEST_ACT */ +/* 802.3 standard allows 100 meters for BaseT cables. However longer + * cables might work, depending on the quality of the cables and the + * PHY. So allow testing for up to 150 meters. 
+ */ +#define MAX_CABLE_LENGTH_CM (150 * 100) static const struct nla_policy cable_test_act_policy[ETHTOOL_A_CABLE_TEST_MAX + 1] = { @@ -203,16 +207,107 @@ err: } EXPORT_SYMBOL_GPL(ethnl_cable_test_fault_length); +struct cable_test_tdr_req_info { + struct ethnl_req_info base; +}; + +static const struct nla_policy +cable_test_tdr_act_cfg_policy[ETHTOOL_A_CABLE_TEST_TDR_CFG_MAX + 1] = { + [ETHTOOL_A_CABLE_TEST_TDR_CFG_FIRST] = { .type = NLA_U32 }, + [ETHTOOL_A_CABLE_TEST_TDR_CFG_LAST] = { .type = NLA_U32 }, + [ETHTOOL_A_CABLE_TEST_TDR_CFG_STEP] = { .type = NLA_U32 }, + [ETHTOOL_A_CABLE_TEST_TDR_CFG_PAIR] = { .type = NLA_U8 }, +}; + static const struct nla_policy cable_test_tdr_act_policy[ETHTOOL_A_CABLE_TEST_TDR_MAX + 1] = { [ETHTOOL_A_CABLE_TEST_TDR_UNSPEC] = { .type = NLA_REJECT }, [ETHTOOL_A_CABLE_TEST_TDR_HEADER] = { .type = NLA_NESTED }, + [ETHTOOL_A_CABLE_TEST_TDR_CFG] = { .type = NLA_NESTED }, }; +/* CABLE_TEST_TDR_ACT */ +int ethnl_act_cable_test_tdr_cfg(const struct nlattr *nest, + struct genl_info *info, + struct phy_tdr_config *cfg) +{ + struct nlattr *tb[ETHTOOL_A_CABLE_TEST_TDR_CFG_MAX + 1]; + int ret; + + ret = nla_parse_nested(tb, ETHTOOL_A_CABLE_TEST_TDR_CFG_MAX, nest, + cable_test_tdr_act_cfg_policy, info->extack); + if (ret < 0) + return ret; + + if (tb[ETHTOOL_A_CABLE_TEST_TDR_CFG_FIRST]) + cfg->first = nla_get_u32( + tb[ETHTOOL_A_CABLE_TEST_TDR_CFG_FIRST]); + else + cfg->first = 100; + if (tb[ETHTOOL_A_CABLE_TEST_TDR_CFG_LAST]) + cfg->last = nla_get_u32(tb[ETHTOOL_A_CABLE_TEST_TDR_CFG_LAST]); + else + cfg->last = MAX_CABLE_LENGTH_CM; + + if (tb[ETHTOOL_A_CABLE_TEST_TDR_CFG_STEP]) + cfg->step = nla_get_u32(tb[ETHTOOL_A_CABLE_TEST_TDR_CFG_STEP]); + else + cfg->step = 100; + + if (tb[ETHTOOL_A_CABLE_TEST_TDR_CFG_PAIR]) { + cfg->pair = nla_get_u8(tb[ETHTOOL_A_CABLE_TEST_TDR_CFG_PAIR]); + if (cfg->pair > ETHTOOL_A_CABLE_PAIR_D) { + NL_SET_ERR_MSG_ATTR( + info->extack, + tb[ETHTOOL_A_CABLE_TEST_TDR_CFG_PAIR], + "invalid pair parameter"); + return -EINVAL; + } + } else { + cfg->pair = PHY_PAIR_ALL; + } + + if (cfg->first > MAX_CABLE_LENGTH_CM) { + NL_SET_ERR_MSG_ATTR(info->extack, + tb[ETHTOOL_A_CABLE_TEST_TDR_CFG_FIRST], + "invalid first parameter"); + return -EINVAL; + } + + if (cfg->last > MAX_CABLE_LENGTH_CM) { + NL_SET_ERR_MSG_ATTR(info->extack, + tb[ETHTOOL_A_CABLE_TEST_TDR_CFG_LAST], + "invalid last parameter"); + return -EINVAL; + } + + if (cfg->first > cfg->last) { + NL_SET_ERR_MSG(info->extack, "invalid first/last parameter"); + return -EINVAL; + } + + if (!cfg->step) { + NL_SET_ERR_MSG_ATTR(info->extack, + tb[ETHTOOL_A_CABLE_TEST_TDR_CFG_STEP], + "invalid step parameter"); + return -EINVAL; + } + + if (cfg->step > (cfg->last - cfg->first)) { + NL_SET_ERR_MSG_ATTR(info->extack, + tb[ETHTOOL_A_CABLE_TEST_TDR_CFG_STEP], + "step parameter too big"); + return -EINVAL; + } + + return 0; +} + int ethnl_act_cable_test_tdr(struct sk_buff *skb, struct genl_info *info) { struct nlattr *tb[ETHTOOL_A_CABLE_TEST_TDR_MAX + 1]; struct ethnl_req_info req_info = {}; + struct phy_tdr_config cfg; struct net_device *dev; int ret; @@ -235,12 +330,17 @@ int ethnl_act_cable_test_tdr(struct sk_buff *skb, struct genl_info *info) goto out_dev_put; } + ret = ethnl_act_cable_test_tdr_cfg(tb[ETHTOOL_A_CABLE_TEST_TDR_CFG], + info, &cfg); + if (ret) + goto out_dev_put; + rtnl_lock(); ret = ethnl_ops_begin(dev); if (ret < 0) goto out_rtnl; - ret = phy_start_cable_test_tdr(dev->phydev, info->extack); + ret = phy_start_cable_test_tdr(dev->phydev, info->extack, &cfg); ethnl_ops_complete(dev); -- cgit 
v1.2.3 From 7edd363421dab1d4806802ac65613d1c0ec85824 Mon Sep 17 00:00:00 2001 From: Gene Chen Date: Thu, 23 Apr 2020 19:24:52 +0800 Subject: mfd: Add support for PMIC MT6360 Add MFD driver for the MT6360 PMIC chip, including Battery Charger/USB_PD, Flash LED, RGB LED, and LDO/Buck blocks Signed-off-by: Gene Chen Signed-off-by: Lee Jones --- drivers/mfd/Kconfig | 12 ++ drivers/mfd/Makefile | 1 + drivers/mfd/mt6360-core.c | 425 +++++++++++++++++++++++++++++++++++++++++++++ include/linux/mfd/mt6360.h | 240 +++++++++++++++++++++++++ 4 files changed, 678 insertions(+) create mode 100644 drivers/mfd/mt6360-core.c create mode 100644 include/linux/mfd/mt6360.h (limited to 'include/linux') diff --git a/drivers/mfd/Kconfig b/drivers/mfd/Kconfig index 37fa52b7b5b8..2df22de1e8d4 100644 --- a/drivers/mfd/Kconfig +++ b/drivers/mfd/Kconfig @@ -908,6 +908,18 @@ config MFD_MAX8998 additional drivers must be enabled in order to use the functionality of the device. +config MFD_MT6360 + tristate "Mediatek MT6360 SubPMIC" + select MFD_CORE + select REGMAP_I2C + select REGMAP_IRQ + depends on I2C + help + Say Y here to enable MT6360 PMU/PMIC/LDO functional support. + PMU part includes Charger, Flashlight, RGB LED + PMIC part includes 2-channel BUCKs and 2-channel LDOs + LDO part includes 4-channel LDOs + config MFD_MT6397 tristate "MediaTek MT6397 PMIC Support" select MFD_CORE diff --git a/drivers/mfd/Makefile b/drivers/mfd/Makefile index aebe43b76e58..9367a92f795a 100644 --- a/drivers/mfd/Makefile +++ b/drivers/mfd/Makefile @@ -243,6 +243,7 @@ obj-$(CONFIG_INTEL_SOC_PMIC) += intel-soc-pmic.o obj-$(CONFIG_INTEL_SOC_PMIC_BXTWC) += intel_soc_pmic_bxtwc.o obj-$(CONFIG_INTEL_SOC_PMIC_CHTWC) += intel_soc_pmic_chtwc.o obj-$(CONFIG_INTEL_SOC_PMIC_CHTDC_TI) += intel_soc_pmic_chtdc_ti.o +obj-$(CONFIG_MFD_MT6360) += mt6360-core.o mt6397-objs := mt6397-core.o mt6397-irq.o mt6358-irq.o obj-$(CONFIG_MFD_MT6397) += mt6397.o obj-$(CONFIG_INTEL_SOC_PMIC_MRFLD) += intel_soc_pmic_mrfld.o diff --git a/drivers/mfd/mt6360-core.c b/drivers/mfd/mt6360-core.c new file mode 100644 index 000000000000..9bb63e0b69e6 --- /dev/null +++ b/drivers/mfd/mt6360-core.c @@ -0,0 +1,425 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright (c) 2020 MediaTek Inc.
+ * + * Author: Gene Chen + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +/* reg 0 -> 0 ~ 7 */ +#define MT6360_CHG_TREG_EVT (4) +#define MT6360_CHG_AICR_EVT (5) +#define MT6360_CHG_MIVR_EVT (6) +#define MT6360_PWR_RDY_EVT (7) +/* REG 1 -> 8 ~ 15 */ +#define MT6360_CHG_BATSYSUV_EVT (9) +#define MT6360_FLED_CHG_VINOVP_EVT (11) +#define MT6360_CHG_VSYSUV_EVT (12) +#define MT6360_CHG_VSYSOV_EVT (13) +#define MT6360_CHG_VBATOV_EVT (14) +#define MT6360_CHG_VBUSOV_EVT (15) +/* REG 2 -> 16 ~ 23 */ +/* REG 3 -> 24 ~ 31 */ +#define MT6360_WD_PMU_DET (25) +#define MT6360_WD_PMU_DONE (26) +#define MT6360_CHG_TMRI (27) +#define MT6360_CHG_ADPBADI (29) +#define MT6360_CHG_RVPI (30) +#define MT6360_OTPI (31) +/* REG 4 -> 32 ~ 39 */ +#define MT6360_CHG_AICCMEASL (32) +#define MT6360_CHGDET_DONEI (34) +#define MT6360_WDTMRI (35) +#define MT6360_SSFINISHI (36) +#define MT6360_CHG_RECHGI (37) +#define MT6360_CHG_TERMI (38) +#define MT6360_CHG_IEOCI (39) +/* REG 5 -> 40 ~ 47 */ +#define MT6360_PUMPX_DONEI (40) +#define MT6360_BAT_OVP_ADC_EVT (41) +#define MT6360_TYPEC_OTP_EVT (42) +#define MT6360_ADC_WAKEUP_EVT (43) +#define MT6360_ADC_DONEI (44) +#define MT6360_BST_BATUVI (45) +#define MT6360_BST_VBUSOVI (46) +#define MT6360_BST_OLPI (47) +/* REG 6 -> 48 ~ 55 */ +#define MT6360_ATTACH_I (48) +#define MT6360_DETACH_I (49) +#define MT6360_QC30_STPDONE (51) +#define MT6360_QC_VBUSDET_DONE (52) +#define MT6360_HVDCP_DET (53) +#define MT6360_CHGDETI (54) +#define MT6360_DCDTI (55) +/* REG 7 -> 56 ~ 63 */ +#define MT6360_FOD_DONE_EVT (56) +#define MT6360_FOD_OV_EVT (57) +#define MT6360_CHRDET_UVP_EVT (58) +#define MT6360_CHRDET_OVP_EVT (59) +#define MT6360_CHRDET_EXT_EVT (60) +#define MT6360_FOD_LR_EVT (61) +#define MT6360_FOD_HR_EVT (62) +#define MT6360_FOD_DISCHG_FAIL_EVT (63) +/* REG 8 -> 64 ~ 71 */ +#define MT6360_USBID_EVT (64) +#define MT6360_APWDTRST_EVT (65) +#define MT6360_EN_EVT (66) +#define MT6360_QONB_RST_EVT (67) +#define MT6360_MRSTB_EVT (68) +#define MT6360_OTP_EVT (69) +#define MT6360_VDDAOV_EVT (70) +#define MT6360_SYSUV_EVT (71) +/* REG 9 -> 72 ~ 79 */ +#define MT6360_FLED_STRBPIN_EVT (72) +#define MT6360_FLED_TORPIN_EVT (73) +#define MT6360_FLED_TX_EVT (74) +#define MT6360_FLED_LVF_EVT (75) +#define MT6360_FLED2_SHORT_EVT (78) +#define MT6360_FLED1_SHORT_EVT (79) +/* REG 10 -> 80 ~ 87 */ +#define MT6360_FLED2_STRB_EVT (80) +#define MT6360_FLED1_STRB_EVT (81) +#define MT6360_FLED2_STRB_TO_EVT (82) +#define MT6360_FLED1_STRB_TO_EVT (83) +#define MT6360_FLED2_TOR_EVT (84) +#define MT6360_FLED1_TOR_EVT (85) +/* REG 11 -> 88 ~ 95 */ +/* REG 12 -> 96 ~ 103 */ +#define MT6360_BUCK1_PGB_EVT (96) +#define MT6360_BUCK1_OC_EVT (100) +#define MT6360_BUCK1_OV_EVT (101) +#define MT6360_BUCK1_UV_EVT (102) +/* REG 13 -> 104 ~ 111 */ +#define MT6360_BUCK2_PGB_EVT (104) +#define MT6360_BUCK2_OC_EVT (108) +#define MT6360_BUCK2_OV_EVT (109) +#define MT6360_BUCK2_UV_EVT (110) +/* REG 14 -> 112 ~ 119 */ +#define MT6360_LDO1_OC_EVT (113) +#define MT6360_LDO2_OC_EVT (114) +#define MT6360_LDO3_OC_EVT (115) +#define MT6360_LDO5_OC_EVT (117) +#define MT6360_LDO6_OC_EVT (118) +#define MT6360_LDO7_OC_EVT (119) +/* REG 15 -> 120 ~ 127 */ +#define MT6360_LDO1_PGB_EVT (121) +#define MT6360_LDO2_PGB_EVT (122) +#define MT6360_LDO3_PGB_EVT (123) +#define MT6360_LDO5_PGB_EVT (125) +#define MT6360_LDO6_PGB_EVT (126) +#define MT6360_LDO7_PGB_EVT (127) + +static const struct regmap_irq mt6360_pmu_irqs[] = { + REGMAP_IRQ_REG_LINE(MT6360_CHG_TREG_EVT, 8), + 
REGMAP_IRQ_REG_LINE(MT6360_CHG_AICR_EVT, 8), + REGMAP_IRQ_REG_LINE(MT6360_CHG_MIVR_EVT, 8), + REGMAP_IRQ_REG_LINE(MT6360_PWR_RDY_EVT, 8), + REGMAP_IRQ_REG_LINE(MT6360_CHG_BATSYSUV_EVT, 8), + REGMAP_IRQ_REG_LINE(MT6360_FLED_CHG_VINOVP_EVT, 8), + REGMAP_IRQ_REG_LINE(MT6360_CHG_VSYSUV_EVT, 8), + REGMAP_IRQ_REG_LINE(MT6360_CHG_VSYSOV_EVT, 8), + REGMAP_IRQ_REG_LINE(MT6360_CHG_VBATOV_EVT, 8), + REGMAP_IRQ_REG_LINE(MT6360_CHG_VBUSOV_EVT, 8), + REGMAP_IRQ_REG_LINE(MT6360_WD_PMU_DET, 8), + REGMAP_IRQ_REG_LINE(MT6360_WD_PMU_DONE, 8), + REGMAP_IRQ_REG_LINE(MT6360_CHG_TMRI, 8), + REGMAP_IRQ_REG_LINE(MT6360_CHG_ADPBADI, 8), + REGMAP_IRQ_REG_LINE(MT6360_CHG_RVPI, 8), + REGMAP_IRQ_REG_LINE(MT6360_OTPI, 8), + REGMAP_IRQ_REG_LINE(MT6360_CHG_AICCMEASL, 8), + REGMAP_IRQ_REG_LINE(MT6360_CHGDET_DONEI, 8), + REGMAP_IRQ_REG_LINE(MT6360_WDTMRI, 8), + REGMAP_IRQ_REG_LINE(MT6360_SSFINISHI, 8), + REGMAP_IRQ_REG_LINE(MT6360_CHG_RECHGI, 8), + REGMAP_IRQ_REG_LINE(MT6360_CHG_TERMI, 8), + REGMAP_IRQ_REG_LINE(MT6360_CHG_IEOCI, 8), + REGMAP_IRQ_REG_LINE(MT6360_PUMPX_DONEI, 8), + REGMAP_IRQ_REG_LINE(MT6360_CHG_TREG_EVT, 8), + REGMAP_IRQ_REG_LINE(MT6360_BAT_OVP_ADC_EVT, 8), + REGMAP_IRQ_REG_LINE(MT6360_TYPEC_OTP_EVT, 8), + REGMAP_IRQ_REG_LINE(MT6360_ADC_WAKEUP_EVT, 8), + REGMAP_IRQ_REG_LINE(MT6360_ADC_DONEI, 8), + REGMAP_IRQ_REG_LINE(MT6360_BST_BATUVI, 8), + REGMAP_IRQ_REG_LINE(MT6360_BST_VBUSOVI, 8), + REGMAP_IRQ_REG_LINE(MT6360_BST_OLPI, 8), + REGMAP_IRQ_REG_LINE(MT6360_ATTACH_I, 8), + REGMAP_IRQ_REG_LINE(MT6360_DETACH_I, 8), + REGMAP_IRQ_REG_LINE(MT6360_QC30_STPDONE, 8), + REGMAP_IRQ_REG_LINE(MT6360_QC_VBUSDET_DONE, 8), + REGMAP_IRQ_REG_LINE(MT6360_HVDCP_DET, 8), + REGMAP_IRQ_REG_LINE(MT6360_CHGDETI, 8), + REGMAP_IRQ_REG_LINE(MT6360_DCDTI, 8), + REGMAP_IRQ_REG_LINE(MT6360_FOD_DONE_EVT, 8), + REGMAP_IRQ_REG_LINE(MT6360_FOD_OV_EVT, 8), + REGMAP_IRQ_REG_LINE(MT6360_CHRDET_UVP_EVT, 8), + REGMAP_IRQ_REG_LINE(MT6360_CHRDET_OVP_EVT, 8), + REGMAP_IRQ_REG_LINE(MT6360_CHRDET_EXT_EVT, 8), + REGMAP_IRQ_REG_LINE(MT6360_FOD_LR_EVT, 8), + REGMAP_IRQ_REG_LINE(MT6360_FOD_HR_EVT, 8), + REGMAP_IRQ_REG_LINE(MT6360_FOD_DISCHG_FAIL_EVT, 8), + REGMAP_IRQ_REG_LINE(MT6360_USBID_EVT, 8), + REGMAP_IRQ_REG_LINE(MT6360_APWDTRST_EVT, 8), + REGMAP_IRQ_REG_LINE(MT6360_EN_EVT, 8), + REGMAP_IRQ_REG_LINE(MT6360_QONB_RST_EVT, 8), + REGMAP_IRQ_REG_LINE(MT6360_MRSTB_EVT, 8), + REGMAP_IRQ_REG_LINE(MT6360_OTP_EVT, 8), + REGMAP_IRQ_REG_LINE(MT6360_VDDAOV_EVT, 8), + REGMAP_IRQ_REG_LINE(MT6360_SYSUV_EVT, 8), + REGMAP_IRQ_REG_LINE(MT6360_FLED_STRBPIN_EVT, 8), + REGMAP_IRQ_REG_LINE(MT6360_FLED_TORPIN_EVT, 8), + REGMAP_IRQ_REG_LINE(MT6360_FLED_TX_EVT, 8), + REGMAP_IRQ_REG_LINE(MT6360_FLED_LVF_EVT, 8), + REGMAP_IRQ_REG_LINE(MT6360_FLED2_SHORT_EVT, 8), + REGMAP_IRQ_REG_LINE(MT6360_FLED1_SHORT_EVT, 8), + REGMAP_IRQ_REG_LINE(MT6360_FLED2_STRB_EVT, 8), + REGMAP_IRQ_REG_LINE(MT6360_FLED1_STRB_EVT, 8), + REGMAP_IRQ_REG_LINE(MT6360_FLED2_STRB_TO_EVT, 8), + REGMAP_IRQ_REG_LINE(MT6360_FLED1_STRB_TO_EVT, 8), + REGMAP_IRQ_REG_LINE(MT6360_FLED2_TOR_EVT, 8), + REGMAP_IRQ_REG_LINE(MT6360_FLED1_TOR_EVT, 8), + REGMAP_IRQ_REG_LINE(MT6360_BUCK1_PGB_EVT, 8), + REGMAP_IRQ_REG_LINE(MT6360_BUCK1_OC_EVT, 8), + REGMAP_IRQ_REG_LINE(MT6360_BUCK1_OV_EVT, 8), + REGMAP_IRQ_REG_LINE(MT6360_BUCK1_UV_EVT, 8), + REGMAP_IRQ_REG_LINE(MT6360_BUCK2_PGB_EVT, 8), + REGMAP_IRQ_REG_LINE(MT6360_BUCK2_OC_EVT, 8), + REGMAP_IRQ_REG_LINE(MT6360_BUCK2_OV_EVT, 8), + REGMAP_IRQ_REG_LINE(MT6360_BUCK2_UV_EVT, 8), + REGMAP_IRQ_REG_LINE(MT6360_LDO1_OC_EVT, 8), + REGMAP_IRQ_REG_LINE(MT6360_LDO2_OC_EVT, 8), + 
REGMAP_IRQ_REG_LINE(MT6360_LDO3_OC_EVT, 8), + REGMAP_IRQ_REG_LINE(MT6360_LDO5_OC_EVT, 8), + REGMAP_IRQ_REG_LINE(MT6360_LDO6_OC_EVT, 8), + REGMAP_IRQ_REG_LINE(MT6360_LDO7_OC_EVT, 8), + REGMAP_IRQ_REG_LINE(MT6360_LDO1_PGB_EVT, 8), + REGMAP_IRQ_REG_LINE(MT6360_LDO2_PGB_EVT, 8), + REGMAP_IRQ_REG_LINE(MT6360_LDO3_PGB_EVT, 8), + REGMAP_IRQ_REG_LINE(MT6360_LDO5_PGB_EVT, 8), + REGMAP_IRQ_REG_LINE(MT6360_LDO6_PGB_EVT, 8), + REGMAP_IRQ_REG_LINE(MT6360_LDO7_PGB_EVT, 8), +}; + +static int mt6360_pmu_handle_post_irq(void *irq_drv_data) +{ + struct mt6360_pmu_data *mpd = irq_drv_data; + + return regmap_update_bits(mpd->regmap, + MT6360_PMU_IRQ_SET, MT6360_IRQ_RETRIG, MT6360_IRQ_RETRIG); +} + +static struct regmap_irq_chip mt6360_pmu_irq_chip = { + .irqs = mt6360_pmu_irqs, + .num_irqs = ARRAY_SIZE(mt6360_pmu_irqs), + .num_regs = MT6360_PMU_IRQ_REGNUM, + .mask_base = MT6360_PMU_CHG_MASK1, + .status_base = MT6360_PMU_CHG_IRQ1, + .ack_base = MT6360_PMU_CHG_IRQ1, + .init_ack_masked = true, + .use_ack = true, + .handle_post_irq = mt6360_pmu_handle_post_irq, +}; + +static const struct regmap_config mt6360_pmu_regmap_config = { + .reg_bits = 8, + .val_bits = 8, + .max_register = MT6360_PMU_MAXREG, +}; + +static const struct resource mt6360_adc_resources[] = { + DEFINE_RES_IRQ_NAMED(MT6360_ADC_DONEI, "adc_donei"), +}; + +static const struct resource mt6360_chg_resources[] = { + DEFINE_RES_IRQ_NAMED(MT6360_CHG_TREG_EVT, "chg_treg_evt"), + DEFINE_RES_IRQ_NAMED(MT6360_PWR_RDY_EVT, "pwr_rdy_evt"), + DEFINE_RES_IRQ_NAMED(MT6360_CHG_BATSYSUV_EVT, "chg_batsysuv_evt"), + DEFINE_RES_IRQ_NAMED(MT6360_CHG_VSYSUV_EVT, "chg_vsysuv_evt"), + DEFINE_RES_IRQ_NAMED(MT6360_CHG_VSYSOV_EVT, "chg_vsysov_evt"), + DEFINE_RES_IRQ_NAMED(MT6360_CHG_VBATOV_EVT, "chg_vbatov_evt"), + DEFINE_RES_IRQ_NAMED(MT6360_CHG_VBUSOV_EVT, "chg_vbusov_evt"), + DEFINE_RES_IRQ_NAMED(MT6360_CHG_AICCMEASL, "chg_aiccmeasl"), + DEFINE_RES_IRQ_NAMED(MT6360_WDTMRI, "wdtmri"), + DEFINE_RES_IRQ_NAMED(MT6360_CHG_RECHGI, "chg_rechgi"), + DEFINE_RES_IRQ_NAMED(MT6360_CHG_TERMI, "chg_termi"), + DEFINE_RES_IRQ_NAMED(MT6360_CHG_IEOCI, "chg_ieoci"), + DEFINE_RES_IRQ_NAMED(MT6360_PUMPX_DONEI, "pumpx_donei"), + DEFINE_RES_IRQ_NAMED(MT6360_ATTACH_I, "attach_i"), + DEFINE_RES_IRQ_NAMED(MT6360_CHRDET_EXT_EVT, "chrdet_ext_evt"), +}; + +static const struct resource mt6360_led_resources[] = { + DEFINE_RES_IRQ_NAMED(MT6360_FLED_CHG_VINOVP_EVT, "fled_chg_vinovp_evt"), + DEFINE_RES_IRQ_NAMED(MT6360_FLED_LVF_EVT, "fled_lvf_evt"), + DEFINE_RES_IRQ_NAMED(MT6360_FLED2_SHORT_EVT, "fled2_short_evt"), + DEFINE_RES_IRQ_NAMED(MT6360_FLED1_SHORT_EVT, "fled1_short_evt"), + DEFINE_RES_IRQ_NAMED(MT6360_FLED2_STRB_TO_EVT, "fled2_strb_to_evt"), + DEFINE_RES_IRQ_NAMED(MT6360_FLED1_STRB_TO_EVT, "fled1_strb_to_evt"), +}; + +static const struct resource mt6360_pmic_resources[] = { + DEFINE_RES_IRQ_NAMED(MT6360_BUCK1_PGB_EVT, "buck1_pgb_evt"), + DEFINE_RES_IRQ_NAMED(MT6360_BUCK1_OC_EVT, "buck1_oc_evt"), + DEFINE_RES_IRQ_NAMED(MT6360_BUCK1_OV_EVT, "buck1_ov_evt"), + DEFINE_RES_IRQ_NAMED(MT6360_BUCK1_UV_EVT, "buck1_uv_evt"), + DEFINE_RES_IRQ_NAMED(MT6360_BUCK2_PGB_EVT, "buck2_pgb_evt"), + DEFINE_RES_IRQ_NAMED(MT6360_BUCK2_OC_EVT, "buck2_oc_evt"), + DEFINE_RES_IRQ_NAMED(MT6360_BUCK2_OV_EVT, "buck2_ov_evt"), + DEFINE_RES_IRQ_NAMED(MT6360_BUCK2_UV_EVT, "buck2_uv_evt"), + DEFINE_RES_IRQ_NAMED(MT6360_LDO6_OC_EVT, "ldo6_oc_evt"), + DEFINE_RES_IRQ_NAMED(MT6360_LDO7_OC_EVT, "ldo7_oc_evt"), + DEFINE_RES_IRQ_NAMED(MT6360_LDO6_PGB_EVT, "ldo6_pgb_evt"), + DEFINE_RES_IRQ_NAMED(MT6360_LDO7_PGB_EVT, "ldo7_pgb_evt"), 
+}; + +static const struct resource mt6360_ldo_resources[] = { + DEFINE_RES_IRQ_NAMED(MT6360_LDO1_OC_EVT, "ldo1_oc_evt"), + DEFINE_RES_IRQ_NAMED(MT6360_LDO2_OC_EVT, "ldo2_oc_evt"), + DEFINE_RES_IRQ_NAMED(MT6360_LDO3_OC_EVT, "ldo3_oc_evt"), + DEFINE_RES_IRQ_NAMED(MT6360_LDO5_OC_EVT, "ldo5_oc_evt"), + DEFINE_RES_IRQ_NAMED(MT6360_LDO1_PGB_EVT, "ldo1_pgb_evt"), + DEFINE_RES_IRQ_NAMED(MT6360_LDO2_PGB_EVT, "ldo2_pgb_evt"), + DEFINE_RES_IRQ_NAMED(MT6360_LDO3_PGB_EVT, "ldo3_pgb_evt"), + DEFINE_RES_IRQ_NAMED(MT6360_LDO5_PGB_EVT, "ldo5_pgb_evt"), +}; + +static const struct mfd_cell mt6360_devs[] = { + OF_MFD_CELL("mt6360_adc", mt6360_adc_resources, + NULL, 0, 0, "mediatek,mt6360_adc"), + OF_MFD_CELL("mt6360_chg", mt6360_chg_resources, + NULL, 0, 0, "mediatek,mt6360_chg"), + OF_MFD_CELL("mt6360_led", mt6360_led_resources, + NULL, 0, 0, "mediatek,mt6360_led"), + OF_MFD_CELL("mt6360_pmic", mt6360_pmic_resources, + NULL, 0, 0, "mediatek,mt6360_pmic"), + OF_MFD_CELL("mt6360_ldo", mt6360_ldo_resources, + NULL, 0, 0, "mediatek,mt6360_ldo"), + OF_MFD_CELL("mt6360_tcpc", NULL, + NULL, 0, 0, "mediatek,mt6360_tcpc"), +}; + +static const unsigned short mt6360_slave_addr[MT6360_SLAVE_MAX] = { + MT6360_PMU_SLAVEID, + MT6360_PMIC_SLAVEID, + MT6360_LDO_SLAVEID, + MT6360_TCPC_SLAVEID, +}; + +static int mt6360_pmu_probe(struct i2c_client *client) +{ + struct mt6360_pmu_data *mpd; + unsigned int reg_data; + int i, ret; + + mpd = devm_kzalloc(&client->dev, sizeof(*mpd), GFP_KERNEL); + if (!mpd) + return -ENOMEM; + + mpd->dev = &client->dev; + i2c_set_clientdata(client, mpd); + + mpd->regmap = devm_regmap_init_i2c(client, &mt6360_pmu_regmap_config); + if (IS_ERR(mpd->regmap)) { + dev_err(&client->dev, "Failed to register regmap\n"); + return PTR_ERR(mpd->regmap); + } + + ret = regmap_read(mpd->regmap, MT6360_PMU_DEV_INFO, ®_data); + if (ret) { + dev_err(&client->dev, "Device not found\n"); + return ret; + } + + mpd->chip_rev = reg_data & CHIP_REV_MASK; + if (mpd->chip_rev != CHIP_VEN_MT6360) { + dev_err(&client->dev, "Device not supported\n"); + return -ENODEV; + } + + mt6360_pmu_irq_chip.irq_drv_data = mpd; + ret = devm_regmap_add_irq_chip(&client->dev, mpd->regmap, client->irq, + IRQF_TRIGGER_FALLING, 0, + &mt6360_pmu_irq_chip, &mpd->irq_data); + if (ret) { + dev_err(&client->dev, "Failed to add Regmap IRQ Chip\n"); + return ret; + } + + mpd->i2c[0] = client; + for (i = 1; i < MT6360_SLAVE_MAX; i++) { + mpd->i2c[i] = devm_i2c_new_dummy_device(&client->dev, + client->adapter, + mt6360_slave_addr[i]); + if (IS_ERR(mpd->i2c[i])) { + dev_err(&client->dev, + "Failed to get new dummy I2C device for address 0x%x", + mt6360_slave_addr[i]); + return PTR_ERR(mpd->i2c[i]); + } + i2c_set_clientdata(mpd->i2c[i], mpd); + } + + ret = devm_mfd_add_devices(&client->dev, PLATFORM_DEVID_AUTO, + mt6360_devs, ARRAY_SIZE(mt6360_devs), NULL, + 0, regmap_irq_get_domain(mpd->irq_data)); + if (ret) { + dev_err(&client->dev, + "Failed to register subordinate devices\n"); + return ret; + } + + return 0; +} + +static int __maybe_unused mt6360_pmu_suspend(struct device *dev) +{ + struct i2c_client *i2c = to_i2c_client(dev); + + if (device_may_wakeup(dev)) + enable_irq_wake(i2c->irq); + + return 0; +} + +static int __maybe_unused mt6360_pmu_resume(struct device *dev) +{ + + struct i2c_client *i2c = to_i2c_client(dev); + + if (device_may_wakeup(dev)) + disable_irq_wake(i2c->irq); + + return 0; +} + +static SIMPLE_DEV_PM_OPS(mt6360_pmu_pm_ops, + mt6360_pmu_suspend, mt6360_pmu_resume); + +static const struct of_device_id __maybe_unused 
mt6360_pmu_of_id[] = { + { .compatible = "mediatek,mt6360_pmu", }, + {}, +}; +MODULE_DEVICE_TABLE(of, mt6360_pmu_of_id); + +static struct i2c_driver mt6360_pmu_driver = { + .driver = { + .pm = &mt6360_pmu_pm_ops, + .of_match_table = of_match_ptr(mt6360_pmu_of_id), + }, + .probe_new = mt6360_pmu_probe, +}; +module_i2c_driver(mt6360_pmu_driver); + +MODULE_AUTHOR("Gene Chen "); +MODULE_DESCRIPTION("MT6360 PMU I2C Driver"); +MODULE_LICENSE("GPL v2"); diff --git a/include/linux/mfd/mt6360.h b/include/linux/mfd/mt6360.h new file mode 100644 index 000000000000..ea1304035d4d --- /dev/null +++ b/include/linux/mfd/mt6360.h @@ -0,0 +1,240 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Copyright (c) 2020 MediaTek Inc. + */ + +#ifndef __MT6360_H__ +#define __MT6360_H__ + +#include + +enum { + MT6360_SLAVE_PMU = 0, + MT6360_SLAVE_PMIC, + MT6360_SLAVE_LDO, + MT6360_SLAVE_TCPC, + MT6360_SLAVE_MAX, +}; + +#define MT6360_PMU_SLAVEID (0x34) +#define MT6360_PMIC_SLAVEID (0x1A) +#define MT6360_LDO_SLAVEID (0x64) +#define MT6360_TCPC_SLAVEID (0x4E) + +struct mt6360_pmu_data { + struct i2c_client *i2c[MT6360_SLAVE_MAX]; + struct device *dev; + struct regmap *regmap; + struct regmap_irq_chip_data *irq_data; + unsigned int chip_rev; +}; + +/* PMU register definition */ +#define MT6360_PMU_DEV_INFO (0x00) +#define MT6360_PMU_CORE_CTRL1 (0x01) +#define MT6360_PMU_RST1 (0x02) +#define MT6360_PMU_CRCEN (0x03) +#define MT6360_PMU_RST_PAS_CODE1 (0x04) +#define MT6360_PMU_RST_PAS_CODE2 (0x05) +#define MT6360_PMU_CORE_CTRL2 (0x06) +#define MT6360_PMU_TM_PAS_CODE1 (0x07) +#define MT6360_PMU_TM_PAS_CODE2 (0x08) +#define MT6360_PMU_TM_PAS_CODE3 (0x09) +#define MT6360_PMU_TM_PAS_CODE4 (0x0A) +#define MT6360_PMU_IRQ_IND (0x0B) +#define MT6360_PMU_IRQ_MASK (0x0C) +#define MT6360_PMU_IRQ_SET (0x0D) +#define MT6360_PMU_SHDN_CTRL (0x0E) +#define MT6360_PMU_TM_INF (0x0F) +#define MT6360_PMU_I2C_CTRL (0x10) +#define MT6360_PMU_CHG_CTRL1 (0x11) +#define MT6360_PMU_CHG_CTRL2 (0x12) +#define MT6360_PMU_CHG_CTRL3 (0x13) +#define MT6360_PMU_CHG_CTRL4 (0x14) +#define MT6360_PMU_CHG_CTRL5 (0x15) +#define MT6360_PMU_CHG_CTRL6 (0x16) +#define MT6360_PMU_CHG_CTRL7 (0x17) +#define MT6360_PMU_CHG_CTRL8 (0x18) +#define MT6360_PMU_CHG_CTRL9 (0x19) +#define MT6360_PMU_CHG_CTRL10 (0x1A) +#define MT6360_PMU_CHG_CTRL11 (0x1B) +#define MT6360_PMU_CHG_CTRL12 (0x1C) +#define MT6360_PMU_CHG_CTRL13 (0x1D) +#define MT6360_PMU_CHG_CTRL14 (0x1E) +#define MT6360_PMU_CHG_CTRL15 (0x1F) +#define MT6360_PMU_CHG_CTRL16 (0x20) +#define MT6360_PMU_CHG_AICC_RESULT (0x21) +#define MT6360_PMU_DEVICE_TYPE (0x22) +#define MT6360_PMU_QC_CONTROL1 (0x23) +#define MT6360_PMU_QC_CONTROL2 (0x24) +#define MT6360_PMU_QC30_CONTROL1 (0x25) +#define MT6360_PMU_QC30_CONTROL2 (0x26) +#define MT6360_PMU_USB_STATUS1 (0x27) +#define MT6360_PMU_QC_STATUS1 (0x28) +#define MT6360_PMU_QC_STATUS2 (0x29) +#define MT6360_PMU_CHG_PUMP (0x2A) +#define MT6360_PMU_CHG_CTRL17 (0x2B) +#define MT6360_PMU_CHG_CTRL18 (0x2C) +#define MT6360_PMU_CHRDET_CTRL1 (0x2D) +#define MT6360_PMU_CHRDET_CTRL2 (0x2E) +#define MT6360_PMU_DPDN_CTRL (0x2F) +#define MT6360_PMU_CHG_HIDDEN_CTRL1 (0x30) +#define MT6360_PMU_CHG_HIDDEN_CTRL2 (0x31) +#define MT6360_PMU_CHG_HIDDEN_CTRL3 (0x32) +#define MT6360_PMU_CHG_HIDDEN_CTRL4 (0x33) +#define MT6360_PMU_CHG_HIDDEN_CTRL5 (0x34) +#define MT6360_PMU_CHG_HIDDEN_CTRL6 (0x35) +#define MT6360_PMU_CHG_HIDDEN_CTRL7 (0x36) +#define MT6360_PMU_CHG_HIDDEN_CTRL8 (0x37) +#define MT6360_PMU_CHG_HIDDEN_CTRL9 (0x38) +#define MT6360_PMU_CHG_HIDDEN_CTRL10 (0x39) +#define
MT6360_PMU_CHG_HIDDEN_CTRL11 (0x3A) +#define MT6360_PMU_CHG_HIDDEN_CTRL12 (0x3B) +#define MT6360_PMU_CHG_HIDDEN_CTRL13 (0x3C) +#define MT6360_PMU_CHG_HIDDEN_CTRL14 (0x3D) +#define MT6360_PMU_CHG_HIDDEN_CTRL15 (0x3E) +#define MT6360_PMU_CHG_HIDDEN_CTRL16 (0x3F) +#define MT6360_PMU_CHG_HIDDEN_CTRL17 (0x40) +#define MT6360_PMU_CHG_HIDDEN_CTRL18 (0x41) +#define MT6360_PMU_CHG_HIDDEN_CTRL19 (0x42) +#define MT6360_PMU_CHG_HIDDEN_CTRL20 (0x43) +#define MT6360_PMU_CHG_HIDDEN_CTRL21 (0x44) +#define MT6360_PMU_CHG_HIDDEN_CTRL22 (0x45) +#define MT6360_PMU_CHG_HIDDEN_CTRL23 (0x46) +#define MT6360_PMU_CHG_HIDDEN_CTRL24 (0x47) +#define MT6360_PMU_CHG_HIDDEN_CTRL25 (0x48) +#define MT6360_PMU_BC12_CTRL (0x49) +#define MT6360_PMU_CHG_STAT (0x4A) +#define MT6360_PMU_RESV1 (0x4B) +#define MT6360_PMU_TYPEC_OTP_TH_SEL_CODEH (0x4E) +#define MT6360_PMU_TYPEC_OTP_TH_SEL_CODEL (0x4F) +#define MT6360_PMU_TYPEC_OTP_HYST_TH (0x50) +#define MT6360_PMU_TYPEC_OTP_CTRL (0x51) +#define MT6360_PMU_ADC_BAT_DATA_H (0x52) +#define MT6360_PMU_ADC_BAT_DATA_L (0x53) +#define MT6360_PMU_IMID_BACKBST_ON (0x54) +#define MT6360_PMU_IMID_BACKBST_OFF (0x55) +#define MT6360_PMU_ADC_CONFIG (0x56) +#define MT6360_PMU_ADC_EN2 (0x57) +#define MT6360_PMU_ADC_IDLE_T (0x58) +#define MT6360_PMU_ADC_RPT_1 (0x5A) +#define MT6360_PMU_ADC_RPT_2 (0x5B) +#define MT6360_PMU_ADC_RPT_3 (0x5C) +#define MT6360_PMU_ADC_RPT_ORG1 (0x5D) +#define MT6360_PMU_ADC_RPT_ORG2 (0x5E) +#define MT6360_PMU_BAT_OVP_TH_SEL_CODEH (0x5F) +#define MT6360_PMU_BAT_OVP_TH_SEL_CODEL (0x60) +#define MT6360_PMU_CHG_CTRL19 (0x61) +#define MT6360_PMU_VDDASUPPLY (0x62) +#define MT6360_PMU_BC12_MANUAL (0x63) +#define MT6360_PMU_CHGDET_FUNC (0x64) +#define MT6360_PMU_FOD_CTRL (0x65) +#define MT6360_PMU_CHG_CTRL20 (0x66) +#define MT6360_PMU_CHG_HIDDEN_CTRL26 (0x67) +#define MT6360_PMU_CHG_HIDDEN_CTRL27 (0x68) +#define MT6360_PMU_RESV2 (0x69) +#define MT6360_PMU_USBID_CTRL1 (0x6D) +#define MT6360_PMU_USBID_CTRL2 (0x6E) +#define MT6360_PMU_USBID_CTRL3 (0x6F) +#define MT6360_PMU_FLED_CFG (0x70) +#define MT6360_PMU_RESV3 (0x71) +#define MT6360_PMU_FLED1_CTRL (0x72) +#define MT6360_PMU_FLED_STRB_CTRL (0x73) +#define MT6360_PMU_FLED1_STRB_CTRL2 (0x74) +#define MT6360_PMU_FLED1_TOR_CTRL (0x75) +#define MT6360_PMU_FLED2_CTRL (0x76) +#define MT6360_PMU_RESV4 (0x77) +#define MT6360_PMU_FLED2_STRB_CTRL2 (0x78) +#define MT6360_PMU_FLED2_TOR_CTRL (0x79) +#define MT6360_PMU_FLED_VMIDTRK_CTRL1 (0x7A) +#define MT6360_PMU_FLED_VMID_RTM (0x7B) +#define MT6360_PMU_FLED_VMIDTRK_CTRL2 (0x7C) +#define MT6360_PMU_FLED_PWSEL (0x7D) +#define MT6360_PMU_FLED_EN (0x7E) +#define MT6360_PMU_FLED_Hidden1 (0x7F) +#define MT6360_PMU_RGB_EN (0x80) +#define MT6360_PMU_RGB1_ISNK (0x81) +#define MT6360_PMU_RGB2_ISNK (0x82) +#define MT6360_PMU_RGB3_ISNK (0x83) +#define MT6360_PMU_RGB_ML_ISNK (0x84) +#define MT6360_PMU_RGB1_DIM (0x85) +#define MT6360_PMU_RGB2_DIM (0x86) +#define MT6360_PMU_RGB3_DIM (0x87) +#define MT6360_PMU_RESV5 (0x88) +#define MT6360_PMU_RGB12_Freq (0x89) +#define MT6360_PMU_RGB34_Freq (0x8A) +#define MT6360_PMU_RGB1_Tr (0x8B) +#define MT6360_PMU_RGB1_Tf (0x8C) +#define MT6360_PMU_RGB1_TON_TOFF (0x8D) +#define MT6360_PMU_RGB2_Tr (0x8E) +#define MT6360_PMU_RGB2_Tf (0x8F) +#define MT6360_PMU_RGB2_TON_TOFF (0x90) +#define MT6360_PMU_RGB3_Tr (0x91) +#define MT6360_PMU_RGB3_Tf (0x92) +#define MT6360_PMU_RGB3_TON_TOFF (0x93) +#define MT6360_PMU_RGB_Hidden_CTRL1 (0x94) +#define MT6360_PMU_RGB_Hidden_CTRL2 (0x95) +#define MT6360_PMU_RESV6 (0x97) +#define MT6360_PMU_SPARE1 (0x9A) +#define MT6360_PMU_SPARE2 
(0xA0) +#define MT6360_PMU_SPARE3 (0xB0) +#define MT6360_PMU_SPARE4 (0xC0) +#define MT6360_PMU_CHG_IRQ1 (0xD0) +#define MT6360_PMU_CHG_IRQ2 (0xD1) +#define MT6360_PMU_CHG_IRQ3 (0xD2) +#define MT6360_PMU_CHG_IRQ4 (0xD3) +#define MT6360_PMU_CHG_IRQ5 (0xD4) +#define MT6360_PMU_CHG_IRQ6 (0xD5) +#define MT6360_PMU_QC_IRQ (0xD6) +#define MT6360_PMU_FOD_IRQ (0xD7) +#define MT6360_PMU_BASE_IRQ (0xD8) +#define MT6360_PMU_FLED_IRQ1 (0xD9) +#define MT6360_PMU_FLED_IRQ2 (0xDA) +#define MT6360_PMU_RGB_IRQ (0xDB) +#define MT6360_PMU_BUCK1_IRQ (0xDC) +#define MT6360_PMU_BUCK2_IRQ (0xDD) +#define MT6360_PMU_LDO_IRQ1 (0xDE) +#define MT6360_PMU_LDO_IRQ2 (0xDF) +#define MT6360_PMU_CHG_STAT1 (0xE0) +#define MT6360_PMU_CHG_STAT2 (0xE1) +#define MT6360_PMU_CHG_STAT3 (0xE2) +#define MT6360_PMU_CHG_STAT4 (0xE3) +#define MT6360_PMU_CHG_STAT5 (0xE4) +#define MT6360_PMU_CHG_STAT6 (0xE5) +#define MT6360_PMU_QC_STAT (0xE6) +#define MT6360_PMU_FOD_STAT (0xE7) +#define MT6360_PMU_BASE_STAT (0xE8) +#define MT6360_PMU_FLED_STAT1 (0xE9) +#define MT6360_PMU_FLED_STAT2 (0xEA) +#define MT6360_PMU_RGB_STAT (0xEB) +#define MT6360_PMU_BUCK1_STAT (0xEC) +#define MT6360_PMU_BUCK2_STAT (0xED) +#define MT6360_PMU_LDO_STAT1 (0xEE) +#define MT6360_PMU_LDO_STAT2 (0xEF) +#define MT6360_PMU_CHG_MASK1 (0xF0) +#define MT6360_PMU_CHG_MASK2 (0xF1) +#define MT6360_PMU_CHG_MASK3 (0xF2) +#define MT6360_PMU_CHG_MASK4 (0xF3) +#define MT6360_PMU_CHG_MASK5 (0xF4) +#define MT6360_PMU_CHG_MASK6 (0xF5) +#define MT6360_PMU_QC_MASK (0xF6) +#define MT6360_PMU_FOD_MASK (0xF7) +#define MT6360_PMU_BASE_MASK (0xF8) +#define MT6360_PMU_FLED_MASK1 (0xF9) +#define MT6360_PMU_FLED_MASK2 (0xFA) +#define MT6360_PMU_FAULTB_MASK (0xFB) +#define MT6360_PMU_BUCK1_MASK (0xFC) +#define MT6360_PMU_BUCK2_MASK (0xFD) +#define MT6360_PMU_LDO_MASK1 (0xFE) +#define MT6360_PMU_LDO_MASK2 (0xFF) +#define MT6360_PMU_MAXREG (MT6360_PMU_LDO_MASK2) + +/* MT6360_PMU_IRQ_SET */ +#define MT6360_PMU_IRQ_REGNUM (MT6360_PMU_LDO_IRQ2 - MT6360_PMU_CHG_IRQ1 + 1) +#define MT6360_IRQ_RETRIG BIT(2) + +#define CHIP_VEN_MASK (0xF0) +#define CHIP_VEN_MT6360 (0x50) +#define CHIP_REV_MASK (0x0F) + +#endif /* __MT6360_H__ */ -- cgit v1.2.3 From e76fede8bf7c90d92c799d9ceb092dec48346e2c Mon Sep 17 00:00:00 2001 From: Thomas Pedersen Date: Thu, 30 Apr 2020 10:25:50 -0700 Subject: cfg80211: add KHz variants of frame RX API Drivers may wish to report the RX frequency in units of KHz. Provide cfg80211_rx_mgmt_khz() and wrap it with cfg80211_rx_mgmt() so existing drivers which can't report KHz anyway don't need to change. Add a similar wrapper for cfg80211_report_obss_beacon() so the frequency units stay somewhat consistent. This doesn't actually change the nl80211 API yet.
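As a rough illustration (hypothetical driver code, not part of this patch; chan and rx_khz_offset are made-up names), a driver whose hardware tunes with sub-MHz resolution could forward the extra precision like this:

	/*
	 * Sketch of a driver RX path: chan->center_freq is in MHz and
	 * rx_khz_offset is a hypothetical sub-MHz remainder reported by
	 * the hardware. MHZ_TO_KHZ() and cfg80211_rx_mgmt_khz() are the
	 * helpers used/added by this patch.
	 */
	int freq_khz = MHZ_TO_KHZ(chan->center_freq) + rx_khz_offset;

	cfg80211_rx_mgmt_khz(wdev, freq_khz, sig_dbm, skb->data, skb->len, 0);

Legacy drivers keep calling cfg80211_rx_mgmt() with an MHz value and get identical behavior through the new static inline wrapper.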
Signed-off-by: Thomas Pedersen Link: https://lore.kernel.org/r/20200430172554.18383-2-thomas@adapt-ip.com [fix mac80211 calling the non-khz version of obss beacon report, drop trace point name changes] Signed-off-by: Johannes Berg --- include/linux/ieee80211.h | 2 ++ include/net/cfg80211.h | 54 ++++++++++++++++++++++++++++++++++++++++++----- net/mac80211/rx.c | 12 ++++++----- net/wireless/mlme.c | 6 +++--- net/wireless/nl80211.c | 12 +++++------ net/wireless/trace.h | 8 +++---- 6 files changed, 71 insertions(+), 23 deletions(-) (limited to 'include/linux') diff --git a/include/linux/ieee80211.h b/include/linux/ieee80211.h index a561db435a4b..41d5f000c0d9 100644 --- a/include/linux/ieee80211.h +++ b/include/linux/ieee80211.h @@ -3333,6 +3333,8 @@ static inline int ieee80211_get_tdls_action(struct sk_buff *skb, u32 hdr_size) /* convert frequencies */ #define MHZ_TO_KHZ(freq) ((freq) * 1000) #define KHZ_TO_MHZ(freq) ((freq) / 1000) +#define PR_KHZ(f) KHZ_TO_MHZ(f), f % 1000 +#define KHZ_F "%d.%03d" /* convert powers */ #define DBI_TO_MBI(gain) ((gain) * 100) diff --git a/include/net/cfg80211.h b/include/net/cfg80211.h index 5cacf24cc9f0..7415f77d99ca 100644 --- a/include/net/cfg80211.h +++ b/include/net/cfg80211.h @@ -6988,6 +6988,26 @@ void cfg80211_conn_failed(struct net_device *dev, const u8 *mac_addr, enum nl80211_connect_failed_reason reason, gfp_t gfp); +/** + * cfg80211_rx_mgmt_khz - notification of received, unprocessed management frame + * @wdev: wireless device receiving the frame + * @freq: Frequency on which the frame was received in KHz + * @sig_dbm: signal strength in dBm, or 0 if unknown + * @buf: Management frame (header + body) + * @len: length of the frame data + * @flags: flags, as defined in enum nl80211_rxmgmt_flags + * + * This function is called whenever an Action frame is received for a station + * mode interface, but is not processed in kernel. + * + * Return: %true if a user space application has registered for this frame. + * For action frames, that makes it responsible for rejecting unrecognized + * action frames; %false otherwise, in which case for action frames the + * driver is responsible for rejecting the frame. + */ +bool cfg80211_rx_mgmt_khz(struct wireless_dev *wdev, int freq, int sig_dbm, + const u8 *buf, size_t len, u32 flags); + /** * cfg80211_rx_mgmt - notification of received, unprocessed management frame * @wdev: wireless device receiving the frame @@ -7005,8 +7025,13 @@ void cfg80211_conn_failed(struct net_device *dev, const u8 *mac_addr, * action frames; %false otherwise, in which case for action frames the * driver is responsible for rejecting the frame. 
*/ -bool cfg80211_rx_mgmt(struct wireless_dev *wdev, int freq, int sig_dbm, - const u8 *buf, size_t len, u32 flags); +static inline bool cfg80211_rx_mgmt(struct wireless_dev *wdev, int freq, + int sig_dbm, const u8 *buf, size_t len, + u32 flags) +{ + return cfg80211_rx_mgmt_khz(wdev, MHZ_TO_KHZ(freq), sig_dbm, buf, len, + flags); +} /** * cfg80211_mgmt_tx_status - notification of TX status for management frame @@ -7204,6 +7229,21 @@ void cfg80211_probe_status(struct net_device *dev, const u8 *addr, u64 cookie, bool acked, s32 ack_signal, bool is_valid_ack_signal, gfp_t gfp); +/** + * cfg80211_report_obss_beacon_khz - report beacon from other APs + * @wiphy: The wiphy that received the beacon + * @frame: the frame + * @len: length of the frame + * @freq: frequency the frame was received on in KHz + * @sig_dbm: signal strength in dBm, or 0 if unknown + * + * Use this function to report to userspace when a beacon was + * received. It is not useful to call this when there is no + * netdev that is in AP/GO mode. + */ +void cfg80211_report_obss_beacon_khz(struct wiphy *wiphy, const u8 *frame, + size_t len, int freq, int sig_dbm); + /** * cfg80211_report_obss_beacon - report beacon from other APs * @wiphy: The wiphy that received the beacon @@ -7216,9 +7256,13 @@ void cfg80211_probe_status(struct net_device *dev, const u8 *addr, * received. It is not useful to call this when there is no * netdev that is in AP/GO mode. */ -void cfg80211_report_obss_beacon(struct wiphy *wiphy, - const u8 *frame, size_t len, - int freq, int sig_dbm); +static inline void cfg80211_report_obss_beacon(struct wiphy *wiphy, + const u8 *frame, size_t len, + int freq, int sig_dbm) +{ + cfg80211_report_obss_beacon_khz(wiphy, frame, len, MHZ_TO_KHZ(freq), + sig_dbm); +} /** * cfg80211_reg_can_beacon - check if beaconing is allowed diff --git a/net/mac80211/rx.c b/net/mac80211/rx.c index eaf8931e4627..8e47b0d31051 100644 --- a/net/mac80211/rx.c +++ b/net/mac80211/rx.c @@ -3095,9 +3095,10 @@ ieee80211_rx_h_mgmt_check(struct ieee80211_rx_data *rx) !(status->flag & RX_FLAG_NO_SIGNAL_VAL)) sig = status->signal; - cfg80211_report_obss_beacon(rx->local->hw.wiphy, - rx->skb->data, rx->skb->len, - status->freq, sig); + cfg80211_report_obss_beacon_khz(rx->local->hw.wiphy, + rx->skb->data, rx->skb->len, + ieee80211_rx_status_to_khz(status), + sig); rx->flags |= IEEE80211_RX_BEACON_REPORTED; } @@ -3443,8 +3444,9 @@ ieee80211_rx_h_userspace_mgmt(struct ieee80211_rx_data *rx) !(status->flag & RX_FLAG_NO_SIGNAL_VAL)) sig = status->signal; - if (cfg80211_rx_mgmt(&rx->sdata->wdev, status->freq, sig, - rx->skb->data, rx->skb->len, 0)) { + if (cfg80211_rx_mgmt_khz(&rx->sdata->wdev, + ieee80211_rx_status_to_khz(status), sig, + rx->skb->data, rx->skb->len, 0)) { if (rx->sta) rx->sta->rx_stats.packets++; dev_kfree_skb(rx->skb); diff --git a/net/wireless/mlme.c b/net/wireless/mlme.c index 409497a3527d..189334314cba 100644 --- a/net/wireless/mlme.c +++ b/net/wireless/mlme.c @@ -729,8 +729,8 @@ int cfg80211_mlme_mgmt_tx(struct cfg80211_registered_device *rdev, return rdev_mgmt_tx(rdev, wdev, params, cookie); } -bool cfg80211_rx_mgmt(struct wireless_dev *wdev, int freq, int sig_dbm, - const u8 *buf, size_t len, u32 flags) +bool cfg80211_rx_mgmt_khz(struct wireless_dev *wdev, int freq, int sig_dbm, + const u8 *buf, size_t len, u32 flags) { struct wiphy *wiphy = wdev->wiphy; struct cfg80211_registered_device *rdev = wiphy_to_rdev(wiphy); @@ -785,7 +785,7 @@ bool cfg80211_rx_mgmt(struct wireless_dev *wdev, int freq, int sig_dbm, 
trace_cfg80211_return_bool(result); return result; } -EXPORT_SYMBOL(cfg80211_rx_mgmt); +EXPORT_SYMBOL(cfg80211_rx_mgmt_khz); void cfg80211_sched_dfs_chan_update(struct cfg80211_registered_device *rdev) { diff --git a/net/wireless/nl80211.c b/net/wireless/nl80211.c index 258c621f651c..f6523f1485a3 100644 --- a/net/wireless/nl80211.c +++ b/net/wireless/nl80211.c @@ -16214,7 +16214,7 @@ int nl80211_send_mgmt(struct cfg80211_registered_device *rdev, netdev->ifindex)) || nla_put_u64_64bit(msg, NL80211_ATTR_WDEV, wdev_id(wdev), NL80211_ATTR_PAD) || - nla_put_u32(msg, NL80211_ATTR_WIPHY_FREQ, freq) || + nla_put_u32(msg, NL80211_ATTR_WIPHY_FREQ, KHZ_TO_MHZ(freq)) || (sig_dbm && nla_put_u32(msg, NL80211_ATTR_RX_SIGNAL_DBM, sig_dbm)) || nla_put(msg, NL80211_ATTR_FRAME, len, buf) || @@ -16840,9 +16840,8 @@ void cfg80211_probe_status(struct net_device *dev, const u8 *addr, } EXPORT_SYMBOL(cfg80211_probe_status); -void cfg80211_report_obss_beacon(struct wiphy *wiphy, - const u8 *frame, size_t len, - int freq, int sig_dbm) +void cfg80211_report_obss_beacon_khz(struct wiphy *wiphy, const u8 *frame, + size_t len, int freq, int sig_dbm) { struct cfg80211_registered_device *rdev = wiphy_to_rdev(wiphy); struct sk_buff *msg; @@ -16865,7 +16864,8 @@ void cfg80211_report_obss_beacon(struct wiphy *wiphy, if (nla_put_u32(msg, NL80211_ATTR_WIPHY, rdev->wiphy_idx) || (freq && - nla_put_u32(msg, NL80211_ATTR_WIPHY_FREQ, freq)) || + nla_put_u32(msg, NL80211_ATTR_WIPHY_FREQ, + KHZ_TO_MHZ(freq))) || (sig_dbm && nla_put_u32(msg, NL80211_ATTR_RX_SIGNAL_DBM, sig_dbm)) || nla_put(msg, NL80211_ATTR_FRAME, len, frame)) @@ -16882,7 +16882,7 @@ void cfg80211_report_obss_beacon(struct wiphy *wiphy, spin_unlock_bh(&rdev->beacon_registrations_lock); nlmsg_free(msg); } -EXPORT_SYMBOL(cfg80211_report_obss_beacon); +EXPORT_SYMBOL(cfg80211_report_obss_beacon_khz); #ifdef CONFIG_PM static int cfg80211_net_detect_results(struct sk_buff *msg, diff --git a/net/wireless/trace.h b/net/wireless/trace.h index 53c887ea67c7..f2ab44a2a3e4 100644 --- a/net/wireless/trace.h +++ b/net/wireless/trace.h @@ -2840,8 +2840,8 @@ TRACE_EVENT(cfg80211_rx_mgmt, __entry->freq = freq; __entry->sig_dbm = sig_dbm; ), - TP_printk(WDEV_PR_FMT ", freq: %d, sig dbm: %d", - WDEV_PR_ARG, __entry->freq, __entry->sig_dbm) + TP_printk(WDEV_PR_FMT ", freq: "KHZ_F", sig dbm: %d", + WDEV_PR_ARG, PR_KHZ(__entry->freq), __entry->sig_dbm) ); TRACE_EVENT(cfg80211_mgmt_tx_status, @@ -3121,8 +3121,8 @@ TRACE_EVENT(cfg80211_report_obss_beacon, __entry->freq = freq; __entry->sig_dbm = sig_dbm; ), - TP_printk(WIPHY_PR_FMT ", freq: %d, sig_dbm: %d", - WIPHY_PR_ARG, __entry->freq, __entry->sig_dbm) + TP_printk(WIPHY_PR_FMT ", freq: "KHZ_F", sig_dbm: %d", + WIPHY_PR_ARG, PR_KHZ(__entry->freq), __entry->sig_dbm) ); TRACE_EVENT(cfg80211_tdls_oper_request, -- cgit v1.2.3 From d6fb67ff86bb991d5ac18471e5f739bc32e5090e Mon Sep 17 00:00:00 2001 From: Thomas Pedersen Date: Thu, 30 Apr 2020 10:25:53 -0700 Subject: ieee80211: S1G defines These are found in IEEE-802.11ah-2016. 
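As a sketch of how these definitions end up being consumed (hypothetical receive-path snippet; handle_s1g_beacon() is a placeholder, and the helpers referenced are the ones introduced in the diff below):

	struct ieee80211_ext *ext = (struct ieee80211_ext *)skb->data;

	/* S1G beacons live under the new IEEE80211_FTYPE_EXT frame type */
	if (ieee80211_is_s1g_beacon(ext->frame_control))
		handle_s1g_beacon(ext); /* e.g. parse ext->u.s1g_beacon */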
Signed-off-by: Thomas Pedersen Link: https://lore.kernel.org/r/20200430172554.18383-5-thomas@adapt-ip.com Signed-off-by: Johannes Berg --- include/linux/ieee80211.h | 221 ++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 221 insertions(+) (limited to 'include/linux') diff --git a/include/linux/ieee80211.h b/include/linux/ieee80211.h index 41d5f000c0d9..f630b8978a43 100644 --- a/include/linux/ieee80211.h +++ b/include/linux/ieee80211.h @@ -105,6 +105,51 @@ /* extension, added by 802.11ad */ #define IEEE80211_STYPE_DMG_BEACON 0x0000 +#define IEEE80211_STYPE_S1G_BEACON 0x0010 + +/* bits unique to S1G beacon */ +#define IEEE80211_S1G_BCN_NEXT_TBTT 0x100 + +/* see 802.11ah-2016 9.9 NDP CMAC frames */ +#define IEEE80211_S1G_1MHZ_NDP_BITS 25 +#define IEEE80211_S1G_1MHZ_NDP_BYTES 4 +#define IEEE80211_S1G_2MHZ_NDP_BITS 37 +#define IEEE80211_S1G_2MHZ_NDP_BYTES 5 + +#define IEEE80211_NDP_FTYPE_CTS 0 +#define IEEE80211_NDP_FTYPE_CF_END 0 +#define IEEE80211_NDP_FTYPE_PS_POLL 1 +#define IEEE80211_NDP_FTYPE_ACK 2 +#define IEEE80211_NDP_FTYPE_PS_POLL_ACK 3 +#define IEEE80211_NDP_FTYPE_BA 4 +#define IEEE80211_NDP_FTYPE_BF_REPORT_POLL 5 +#define IEEE80211_NDP_FTYPE_PAGING 6 +#define IEEE80211_NDP_FTYPE_PREQ 7 + +#define SM64(f, v) ((((u64)v) << f##_S) & f) + +/* NDP CMAC frame fields */ +#define IEEE80211_NDP_FTYPE 0x0000000000000007 +#define IEEE80211_NDP_FTYPE_S 0x0000000000000000 + +/* 1M Probe Request 11ah 9.9.3.1.1 */ +#define IEEE80211_NDP_1M_PREQ_ANO 0x0000000000000008 +#define IEEE80211_NDP_1M_PREQ_ANO_S 3 +#define IEEE80211_NDP_1M_PREQ_CSSID 0x00000000000FFFF0 +#define IEEE80211_NDP_1M_PREQ_CSSID_S 4 +#define IEEE80211_NDP_1M_PREQ_RTYPE 0x0000000000100000 +#define IEEE80211_NDP_1M_PREQ_RTYPE_S 20 +#define IEEE80211_NDP_1M_PREQ_RSV 0x0000000001E00000 +#define IEEE80211_NDP_1M_PREQ_RSV 0x0000000001E00000 +/* 2M Probe Request 11ah 9.9.3.1.2 */ +#define IEEE80211_NDP_2M_PREQ_ANO 0x0000000000000008 +#define IEEE80211_NDP_2M_PREQ_ANO_S 3 +#define IEEE80211_NDP_2M_PREQ_CSSID 0x0000000FFFFFFFF0 +#define IEEE80211_NDP_2M_PREQ_CSSID_S 4 +#define IEEE80211_NDP_2M_PREQ_RTYPE 0x0000001000000000 +#define IEEE80211_NDP_2M_PREQ_RTYPE_S 36 + +#define IEEE80211_ANO_NETTYPE_WILD 15 /* control extension - for IEEE80211_FTYPE_CTL | IEEE80211_STYPE_CTL_EXT */ #define IEEE80211_CTL_EXT_POLL 0x2000 @@ -121,6 +166,21 @@ #define IEEE80211_MAX_SN IEEE80211_SN_MASK #define IEEE80211_SN_MODULO (IEEE80211_MAX_SN + 1) + +/* PV1 Layout 11ah 9.8.3.1 */ +#define IEEE80211_PV1_FCTL_VERS 0x0003 +#define IEEE80211_PV1_FCTL_FTYPE 0x001c +#define IEEE80211_PV1_FCTL_STYPE 0x00e0 +#define IEEE80211_PV1_FCTL_TODS 0x0100 +#define IEEE80211_PV1_FCTL_MOREFRAGS 0x0200 +#define IEEE80211_PV1_FCTL_PM 0x0400 +#define IEEE80211_PV1_FCTL_MOREDATA 0x0800 +#define IEEE80211_PV1_FCTL_PROTECTED 0x1000 +#define IEEE80211_PV1_FCTL_END_SP 0x2000 +#define IEEE80211_PV1_FCTL_RELAYED 0x4000 +#define IEEE80211_PV1_FCTL_ACK_POLICY 0x8000 +#define IEEE80211_PV1_FCTL_CTL_EXT 0x0f00 + static inline bool ieee80211_sn_less(u16 sn1, u16 sn2) { return ((sn1 - sn2) & IEEE80211_SN_MASK) > (IEEE80211_SN_MODULO >> 1); @@ -148,6 +208,7 @@ static inline u16 ieee80211_sn_sub(u16 sn1, u16 sn2) #define IEEE80211_MAX_FRAG_THRESHOLD 2352 #define IEEE80211_MAX_RTS_THRESHOLD 2353 #define IEEE80211_MAX_AID 2007 +#define IEEE80211_MAX_AID_S1G 8191 #define IEEE80211_MAX_TIM_LEN 251 #define IEEE80211_MAX_MESH_PEERINGS 63 /* Maximum size for the MA-UNITDATA primitive, 802.11 standard section @@ -371,6 +432,17 @@ static inline bool ieee80211_is_data(__le16 fc) 
cpu_to_le16(IEEE80211_FTYPE_DATA); } +/** + * ieee80211_is_ext - check if type is IEEE80211_FTYPE_EXT + * @fc: frame control bytes in little-endian byteorder + */ +static inline bool ieee80211_is_ext(__le16 fc) +{ + return (fc & cpu_to_le16(IEEE80211_FCTL_FTYPE)) == + cpu_to_le16(IEEE80211_FTYPE_EXT); +} + + /** * ieee80211_is_data_qos - check if type is IEEE80211_FTYPE_DATA and IEEE80211_STYPE_QOS_DATA is set * @fc: frame control bytes in little-endian byteorder @@ -469,6 +541,18 @@ static inline bool ieee80211_is_beacon(__le16 fc) cpu_to_le16(IEEE80211_FTYPE_MGMT | IEEE80211_STYPE_BEACON); } +/** + * ieee80211_is_s1g_beacon - check if IEEE80211_FTYPE_EXT && + * IEEE80211_STYPE_S1G_BEACON + * @fc: frame control bytes in little-endian byteorder + */ +static inline bool ieee80211_is_s1g_beacon(__le16 fc) +{ + return (fc & cpu_to_le16(IEEE80211_FCTL_FTYPE | + IEEE80211_FCTL_STYPE)) == + cpu_to_le16(IEEE80211_FTYPE_EXT | IEEE80211_STYPE_S1G_BEACON); +} + /** * ieee80211_is_atim - check if IEEE80211_FTYPE_MGMT && IEEE80211_STYPE_ATIM * @fc: frame control bytes in little-endian byteorder @@ -900,6 +984,59 @@ struct ieee80211_addba_ext_ie { u8 data; } __packed; +/** + * struct ieee80211_s1g_bcn_compat_ie + * + * S1G Beacon Compatibility element + */ +struct ieee80211_s1g_bcn_compat_ie { + __le16 compat_info; + __le16 beacon_int; + __le32 tsf_completion; +} __packed; + +/** + * struct ieee80211_s1g_oper_ie + * + * S1G Operation element + */ +struct ieee80211_s1g_oper_ie { + u8 ch_width; + u8 oper_class; + u8 primary_ch; + u8 oper_ch; + __le16 basic_mcs_nss; +} __packed; + +/** + * struct ieee80211_aid_response_ie + * + * AID Response element + */ +struct ieee80211_aid_response_ie { + __le16 aid; + u8 switch_count; + __le16 response_int; +} __packed; + +struct ieee80211_s1g_cap { + u8 capab_info[10]; + u8 supp_mcs_nss[5]; +} __packed; + +struct ieee80211_ext { + __le16 frame_control; + __le16 duration; + union { + struct { + u8 sa[ETH_ALEN]; + __le32 timestamp; + u8 change_seq; + u8 variable[0]; + } __packed s1g_beacon; + } u; +} __packed __aligned(2); + struct ieee80211_mgmt { __le16 frame_control; __le16 duration; @@ -2137,6 +2274,86 @@ ieee80211_he_spr_size(const u8 *he_spr_ie) return spr_len; } +/* S1G Capabilities Information field */ +#define S1G_CAPAB_B0_S1G_LONG BIT(0) +#define S1G_CAPAB_B0_SGI_1MHZ BIT(1) +#define S1G_CAPAB_B0_SGI_2MHZ BIT(2) +#define S1G_CAPAB_B0_SGI_4MHZ BIT(3) +#define S1G_CAPAB_B0_SGI_8MHZ BIT(4) +#define S1G_CAPAB_B0_SGI_16MHZ BIT(5) +#define S1G_CAPAB_B0_SUPP_CH_WIDTH_MASK (BIT(6) | BIT(7)) +#define S1G_CAPAB_B0_SUPP_CH_WIDTH_SHIFT 6 + +#define S1G_CAPAB_B1_RX_LDPC BIT(0) +#define S1G_CAPAB_B1_TX_STBC BIT(1) +#define S1G_CAPAB_B1_RX_STBC BIT(2) +#define S1G_CAPAB_B1_SU_BFER BIT(3) +#define S1G_CAPAB_B1_SU_BFEE BIT(4) +#define S1G_CAPAB_B1_BFEE_STS_MASK (BIT(5) | BIT(6) | BIT(7)) +#define S1G_CAPAB_B1_BFEE_STS_SHIFT 5 + +#define S1G_CAPAB_B2_SOUNDING_DIMENSIONS_MASK (BIT(0) | BIT(1) | BIT(2)) +#define S1G_CAPAB_B2_SOUNDING_DIMENSIONS_SHIFT 0 +#define S1G_CAPAB_B2_MU_BFER BIT(3) +#define S1G_CAPAB_B2_MU_BFEE BIT(4) +#define S1G_CAPAB_B2_PLUS_HTC_VHT BIT(5) +#define S1G_CAPAB_B2_TRAVELING_PILOT_MASK (BIT(6) | BIT(7)) +#define S1G_CAPAB_B2_TRAVELING_PILOT_SHIFT 6 + +#define S1G_CAPAB_B3_RD_RESPONDER BIT(0) +#define S1G_CAPAB_B3_HT_DELAYED_BA BIT(1) +#define S1G_CAPAB_B3_MAX_MPDU_LEN BIT(2) +#define S1G_CAPAB_B3_MAX_AMPDU_LEN_EXP_MASK (BIT(3) | BIT(4)) +#define S1G_CAPAB_B3_MAX_AMPDU_LEN_EXP_SHIFT 3 +#define S1G_CAPAB_B3_MIN_MPDU_START_MASK (BIT(5) | BIT(6) | BIT(7)) 
+#define S1G_CAPAB_B3_MIN_MPDU_START_SHIFT 5 + +#define S1G_CAPAB_B4_UPLINK_SYNC BIT(0) +#define S1G_CAPAB_B4_DYNAMIC_AID BIT(1) +#define S1G_CAPAB_B4_BAT BIT(2) +#define S1G_CAPAB_B4_TIME_ADE BIT(3) +#define S1G_CAPAB_B4_NON_TIM BIT(4) +#define S1G_CAPAB_B4_GROUP_AID BIT(5) +#define S1G_CAPAB_B4_STA_TYPE_MASK (BIT(6) | BIT(7)) +#define S1G_CAPAB_B4_STA_TYPE_SHIFT 6 + +#define S1G_CAPAB_B5_CENT_AUTH_CONTROL BIT(0) +#define S1G_CAPAB_B5_DIST_AUTH_CONTROL BIT(1) +#define S1G_CAPAB_B5_AMSDU BIT(2) +#define S1G_CAPAB_B5_AMPDU BIT(3) +#define S1G_CAPAB_B5_ASYMMETRIC_BA BIT(4) +#define S1G_CAPAB_B5_FLOW_CONTROL BIT(5) +#define S1G_CAPAB_B5_SECTORIZED_BEAM_MASK (BIT(6) | BIT(7)) +#define S1G_CAPAB_B5_SECTORIZED_BEAM_SHIFT 6 + +#define S1G_CAPAB_B6_OBSS_MITIGATION BIT(0) +#define S1G_CAPAB_B6_FRAGMENT_BA BIT(1) +#define S1G_CAPAB_B6_NDP_PS_POLL BIT(2) +#define S1G_CAPAB_B6_RAW_OPERATION BIT(3) +#define S1G_CAPAB_B6_PAGE_SLICING BIT(4) +#define S1G_CAPAB_B6_TXOP_SHARING_IMP_ACK BIT(5) +#define S1G_CAPAB_B6_VHT_LINK_ADAPT_MASK (BIT(6) | BIT(7)) +#define S1G_CAPAB_B6_VHT_LINK_ADAPT_SHIFT 6 + +#define S1G_CAPAB_B7_TACK_AS_PS_POLL BIT(0) +#define S1G_CAPAB_B7_DUP_1MHZ BIT(1) +#define S1G_CAPAB_B7_MCS_NEGOTIATION BIT(2) +#define S1G_CAPAB_B7_1MHZ_CTL_RESPONSE_PREAMBLE BIT(3) +#define S1G_CAPAB_B7_NDP_BFING_REPORT_POLL BIT(4) +#define S1G_CAPAB_B7_UNSOLICITED_DYN_AID BIT(5) +#define S1G_CAPAB_B7_SECTOR_TRAINING_OPERATION BIT(6) +#define S1G_CAPAB_B7_TEMP_PS_MODE_SWITCH BIT(7) + +#define S1G_CAPAB_B8_TWT_GROUPING BIT(0) +#define S1G_CAPAB_B8_BDT BIT(1) +#define S1G_CAPAB_B8_COLOR_MASK (BIT(2) | BIT(3) | BIT(4)) +#define S1G_CAPAB_B8_COLOR_SHIFT 2 +#define S1G_CAPAB_B8_TWT_REQUEST BIT(5) +#define S1G_CAPAB_B8_TWT_RESPOND BIT(6) +#define S1G_CAPAB_B8_PV1_FRAME BIT(7) + +#define S1G_CAPAB_B9_LINK_ADAPT_PER_CONTROL_RESPONSE BIT(0) + /* Authentication algorithms */ #define WLAN_AUTH_OPEN 0 #define WLAN_AUTH_SHARED_KEY 1 @@ -2532,8 +2749,12 @@ enum ieee80211_eid { WLAN_EID_QUIET_CHANNEL = 198, WLAN_EID_OPMODE_NOTIF = 199, + WLAN_EID_S1G_BCN_COMPAT = 213, + WLAN_EID_S1G_SHORT_BCN_INTERVAL = 214, + WLAN_EID_S1G_CAPABILITIES = 217, WLAN_EID_VENDOR_SPECIFIC = 221, WLAN_EID_QOS_PARAMETER = 222, + WLAN_EID_S1G_OPERATION = 232, WLAN_EID_CAG_NUMBER = 237, WLAN_EID_AP_CSN = 239, WLAN_EID_FILS_INDICATION = 240, -- cgit v1.2.3 From fedd0fe4e89b009f31eb53ec36dbdf1e457616c0 Mon Sep 17 00:00:00 2001 From: Tamizh Chelvam Date: Mon, 4 May 2020 22:34:59 +0530 Subject: mac80211: Add new AMPDU factor macro for HE peer caps Add IEEE80211_HE_VHT_MAX_AMPDU_FACTOR and IEEE80211_HE_HT_MAX_AMPDU_FACTOR as per spec to use for peer max ampdu factor. 
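For illustration only (not part of this patch; the exact capability parsing varies by driver, and the 0x3 mask below is an assumption about the 2-bit MAX_AMPDU_LEN_EXP field), a driver could size a peer's maximum A-MPDU length for an HE STA that also supports VHT roughly like this:

	/*
	 * ampdu_exp is the MAX_AMPDU_LEN_EXP field from HE MAC
	 * capabilities byte 3; he_cap is assumed to be the peer's HE
	 * capability element. The spec-defined base factor is the new
	 * IEEE80211_HE_VHT_MAX_AMPDU_FACTOR.
	 */
	u8 ampdu_exp = (he_cap->mac_cap_info[3] >>
			IEEE80211_HE_MAC_CAP3_MAX_AMPDU_LEN_EXP_SHIFT) & 0x3;
	u32 max_ampdu_len =
		(1U << (IEEE80211_HE_VHT_MAX_AMPDU_FACTOR + ampdu_exp)) - 1;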
Signed-off-by: Tamizh Chelvam Link: https://lore.kernel.org/r/1588611900-21185-1-git-send-email-tamizhr@codeaurora.org Signed-off-by: Johannes Berg --- include/linux/ieee80211.h | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'include/linux') diff --git a/include/linux/ieee80211.h b/include/linux/ieee80211.h index f630b8978a43..2153d465d752 100644 --- a/include/linux/ieee80211.h +++ b/include/linux/ieee80211.h @@ -1958,6 +1958,8 @@ int ieee80211_get_vht_max_nss(struct ieee80211_vht_cap *cap, #define IEEE80211_HE_MAC_CAP3_FLEX_TWT_SCHED 0x40 #define IEEE80211_HE_MAC_CAP3_RX_CTRL_FRAME_TO_MULTIBSS 0x80 +#define IEEE80211_HE_MAC_CAP3_MAX_AMPDU_LEN_EXP_SHIFT 3 + #define IEEE80211_HE_MAC_CAP4_BSRP_BQRP_A_MPDU_AGG 0x01 #define IEEE80211_HE_MAC_CAP4_QTP 0x02 #define IEEE80211_HE_MAC_CAP4_BQR 0x04 @@ -1979,6 +1981,9 @@ int ieee80211_get_vht_max_nss(struct ieee80211_vht_cap *cap, #define IEEE80211_HE_MAC_CAP5_PUNCTURED_SOUNDING 0x40 #define IEEE80211_HE_MAC_CAP5_HT_VHT_TRIG_FRAME_RX 0x80 +#define IEEE80211_HE_VHT_MAX_AMPDU_FACTOR 20 +#define IEEE80211_HE_HT_MAX_AMPDU_FACTOR 16 + /* 802.11ax HE PHY capabilities */ #define IEEE80211_HE_PHY_CAP0_CHANNEL_WIDTH_SET_40MHZ_IN_2G 0x02 #define IEEE80211_HE_PHY_CAP0_CHANNEL_WIDTH_SET_40MHZ_80MHZ_IN_5G 0x04 -- cgit v1.2.3 From 396fba0a59f3c94d6fd6443fbeabd8bd9e3956eb Mon Sep 17 00:00:00 2001 From: "Gustavo A. R. Silva" Date: Thu, 7 May 2020 13:39:09 -0500 Subject: cfg80211: Replace zero-length array with flexible-array The current codebase makes use of the zero-length array language extension to the C90 standard, but the preferred mechanism to declare variable-length types such as these ones is a flexible array member[1][2], introduced in C99: struct foo { int stuff; struct boo array[]; }; By making use of the mechanism above, we will get a compiler warning in case the flexible array does not occur last in the structure, which will help us prevent some kind of undefined behavior bugs from being inadvertently introduced[3] to the codebase from now on. Also, notice that, dynamic memory allocations won't be affected by this change: "Flexible array members have incomplete type, and so the sizeof operator may not be applied. As a quirk of the original implementation of zero-length arrays, sizeof evaluates to zero."[1] sizeof(flexible-array-member) triggers a warning because flexible array members have incomplete type[1]. There are some instances of code in which the sizeof operator is being incorrectly/erroneously applied to zero-length arrays and the result is zero. Such instances may be hiding some bugs. So, this work (flexible-array member conversions) will also help to get completely rid of those sorts of issues. This issue was found with the help of Coccinelle. [1] https://gcc.gnu.org/onlinedocs/gcc/Zero-Length.html [2] https://github.com/KSPP/linux/issues/21 [3] commit 76497732932f ("cxgb3/l2t: Fix undefined behaviour") Signed-off-by: Gustavo A. R. 
Silva Link: https://lore.kernel.org/r/20200507183909.GA12993@embeddedor Signed-off-by: Johannes Berg --- include/linux/ieee80211.h | 6 +++--- include/net/cfg80211.h | 8 ++++---- net/wireless/core.h | 2 +- 3 files changed, 8 insertions(+), 8 deletions(-) (limited to 'include/linux') diff --git a/include/linux/ieee80211.h b/include/linux/ieee80211.h index 2153d465d752..0320ca4c7d28 100644 --- a/include/linux/ieee80211.h +++ b/include/linux/ieee80211.h @@ -800,7 +800,7 @@ struct ieee80211_msrment_ie { u8 token; u8 mode; u8 type; - u8 request[0]; + u8 request[]; } __packed; /** @@ -1781,7 +1781,7 @@ struct ieee80211_he_operation { __le32 he_oper_params; __le16 he_mcs_nss_set; /* Optional 0,1,3,4,5,7 or 8 bytes: depends on @he_oper_params */ - u8 optional[0]; + u8 optional[]; } __packed; /** @@ -1793,7 +1793,7 @@ struct ieee80211_he_operation { struct ieee80211_he_spr { u8 he_sr_control; /* Optional 0 to 19 bytes: depends on @he_sr_control */ - u8 optional[0]; + u8 optional[]; } __packed; /** diff --git a/include/net/cfg80211.h b/include/net/cfg80211.h index 7415f77d99ca..021366cfb2b0 100644 --- a/include/net/cfg80211.h +++ b/include/net/cfg80211.h @@ -2037,7 +2037,7 @@ struct cfg80211_scan_request { bool no_cck; /* keep last */ - struct ieee80211_channel *channels[0]; + struct ieee80211_channel *channels[]; }; static inline void get_random_mask_addr(u8 *buf, const u8 *addr, const u8 *mask) @@ -2183,7 +2183,7 @@ struct cfg80211_sched_scan_request { struct list_head list; /* keep last */ - struct ieee80211_channel *channels[0]; + struct ieee80211_channel *channels[]; }; /** @@ -2305,7 +2305,7 @@ struct cfg80211_bss { u8 bssid_index; u8 max_bssid_indicator; - u8 priv[0] __aligned(sizeof(void *)); + u8 priv[] __aligned(sizeof(void *)); }; /** @@ -4852,7 +4852,7 @@ struct wiphy { u8 max_data_retry_count; - char priv[0] __aligned(NETDEV_ALIGN); + char priv[] __aligned(NETDEV_ALIGN); }; static inline struct net *wiphy_net(struct wiphy *wiphy) diff --git a/net/wireless/core.h b/net/wireless/core.h index 639d41896573..e0e5b3ee9699 100644 --- a/net/wireless/core.h +++ b/net/wireless/core.h @@ -286,7 +286,7 @@ struct cfg80211_cqm_config { u32 rssi_hyst; s32 last_rssi_event_value; int n_rssi_thresholds; - s32 rssi_thresholds[0]; + s32 rssi_thresholds[]; }; void cfg80211_destroy_ifaces(struct cfg80211_registered_device *rdev); -- cgit v1.2.3 From 3234ac664a870e6ea69ae3a57d824cd7edbeacc5 Mon Sep 17 00:00:00 2001 From: Dan Williams Date: Thu, 21 May 2020 14:06:17 -0700 Subject: /dev/mem: Revoke mappings when a driver claims the region Close the hole of holding a mapping over kernel driver takeover event of a given address range. Commit 90a545e98126 ("restrict /dev/mem to idle io memory ranges") introduced CONFIG_IO_STRICT_DEVMEM with the goal of protecting the kernel against scenarios where a /dev/mem user tramples memory that a kernel driver owns. However, this protection only prevents *new* read(), write() and mmap() requests. Established mappings prior to the driver calling request_mem_region() are left alone. Especially with persistent memory, and the core kernel metadata that is stored there, there are plentiful scenarios for a /dev/mem user to violate the expectations of the driver and cause amplified damage. Teach request_mem_region() to find and shoot down active /dev/mem mappings that it believes it has successfully claimed for the exclusive use of the driver. Effectively a driver call to request_mem_region() becomes a hole-punch on the /dev/mem device. 
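The core of that hole-punch is small; a simplified sketch (condensed from the revoke_devmem() added below, with the initialization race handling omitted):

	/*
	 * If the range is no longer open to /dev/mem, tear down any live
	 * mappings of it. /dev/mem establishes mappings at absolute
	 * physical offsets, so the resource range maps directly onto the
	 * shared address_space.
	 */
	if (!devmem_is_allowed(PHYS_PFN(res->start)) ||
	    !devmem_is_allowed(PHYS_PFN(res->end)))
		unmap_mapping_range(inode->i_mapping, res->start,
				    resource_size(res), 1);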
The typical usage of unmap_mapping_range() is part of truncate_pagecache() to punch a hole in a file, but in this case the implementation is only doing the "first half" of a hole punch. Namely it is just evacuating current established mappings of the "hole", and it relies on the fact that /dev/mem establishes mappings in terms of absolute physical address offsets. Once existing mmap users are invalidated they can attempt to re-establish the mapping, or attempt to continue issuing read(2) / write(2) to the invalidated extent, but they will then be subject to the CONFIG_IO_STRICT_DEVMEM checking that can block those subsequent accesses. Cc: Arnd Bergmann Cc: Ingo Molnar Cc: Kees Cook Cc: Matthew Wilcox Cc: Russell King Cc: Andrew Morton Cc: Greg Kroah-Hartman Fixes: 90a545e98126 ("restrict /dev/mem to idle io memory ranges") Signed-off-by: Dan Williams Reviewed-by: Kees Cook Link: https://lore.kernel.org/r/159009507306.847224.8502634072429766747.stgit@dwillia2-desk3.amr.corp.intel.com Signed-off-by: Greg Kroah-Hartman --- drivers/char/mem.c | 101 ++++++++++++++++++++++++++++++++++++++++++++- include/linux/ioport.h | 6 +++ include/uapi/linux/magic.h | 1 + kernel/resource.c | 5 +++ 4 files changed, 111 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/drivers/char/mem.c b/drivers/char/mem.c index 43dd0891ca1e..31cae88a730b 100644 --- a/drivers/char/mem.c +++ b/drivers/char/mem.c @@ -31,11 +31,15 @@ #include #include #include +#include +#include +#include #ifdef CONFIG_IA64 # include #endif +#define DEVMEM_MINOR 1 #define DEVPORT_MINOR 4 static inline unsigned long size_inside_page(unsigned long start, @@ -805,12 +809,64 @@ static loff_t memory_lseek(struct file *file, loff_t offset, int orig) return ret; } +static struct inode *devmem_inode; + +#ifdef CONFIG_IO_STRICT_DEVMEM +void revoke_devmem(struct resource *res) +{ + struct inode *inode = READ_ONCE(devmem_inode); + + /* + * Check that the initialization has completed. Losing the race + * is ok because it means drivers are claiming resources before + * the fs_initcall level of init and prevent /dev/mem from + * establishing mappings. + */ + if (!inode) + return; + + /* + * The expectation is that the driver has successfully marked + * the resource busy by this point, so devmem_is_allowed() + * should start returning false, however for performance this + * does not iterate the entire resource range. + */ + if (devmem_is_allowed(PHYS_PFN(res->start)) && + devmem_is_allowed(PHYS_PFN(res->end))) { + /* + * *cringe* iomem=relaxed says "go ahead, what's the + * worst that can happen?" + */ + return; + } + + unmap_mapping_range(inode->i_mapping, res->start, resource_size(res), 1); +} +#endif + static int open_port(struct inode *inode, struct file *filp) { + int rc; + if (!capable(CAP_SYS_RAWIO)) return -EPERM; - return security_locked_down(LOCKDOWN_DEV_MEM); + rc = security_locked_down(LOCKDOWN_DEV_MEM); + if (rc) + return rc; + + if (iminor(inode) != DEVMEM_MINOR) + return 0; + + /* + * Use a unified address space to have a single point to manage + * revocations when drivers want to take over a /dev/mem mapped + * range. 
+ */ + inode->i_mapping = devmem_inode->i_mapping; + filp->f_mapping = inode->i_mapping; + + return 0; } #define zero_lseek null_lseek @@ -885,7 +941,7 @@ static const struct memdev { fmode_t fmode; } devlist[] = { #ifdef CONFIG_DEVMEM - [1] = { "mem", 0, &mem_fops, FMODE_UNSIGNED_OFFSET }, + [DEVMEM_MINOR] = { "mem", 0, &mem_fops, FMODE_UNSIGNED_OFFSET }, #endif #ifdef CONFIG_DEVKMEM [2] = { "kmem", 0, &kmem_fops, FMODE_UNSIGNED_OFFSET }, @@ -939,6 +995,45 @@ static char *mem_devnode(struct device *dev, umode_t *mode) static struct class *mem_class; +static int devmem_fs_init_fs_context(struct fs_context *fc) +{ + return init_pseudo(fc, DEVMEM_MAGIC) ? 0 : -ENOMEM; +} + +static struct file_system_type devmem_fs_type = { + .name = "devmem", + .owner = THIS_MODULE, + .init_fs_context = devmem_fs_init_fs_context, + .kill_sb = kill_anon_super, +}; + +static int devmem_init_inode(void) +{ + static struct vfsmount *devmem_vfs_mount; + static int devmem_fs_cnt; + struct inode *inode; + int rc; + + rc = simple_pin_fs(&devmem_fs_type, &devmem_vfs_mount, &devmem_fs_cnt); + if (rc < 0) { + pr_err("Cannot mount /dev/mem pseudo filesystem: %d\n", rc); + return rc; + } + + inode = alloc_anon_inode(devmem_vfs_mount->mnt_sb); + if (IS_ERR(inode)) { + rc = PTR_ERR(inode); + pr_err("Cannot allocate inode for /dev/mem: %d\n", rc); + simple_release_fs(&devmem_vfs_mount, &devmem_fs_cnt); + return rc; + } + + /* publish /dev/mem initialized */ + WRITE_ONCE(devmem_inode, inode); + + return 0; +} + static int __init chr_dev_init(void) { int minor; @@ -960,6 +1055,8 @@ static int __init chr_dev_init(void) */ if ((minor == DEVPORT_MINOR) && !arch_has_dev_port()) continue; + if ((minor == DEVMEM_MINOR) && devmem_init_inode() != 0) + continue; device_create(mem_class, NULL, MKDEV(MEM_MAJOR, minor), NULL, devlist[minor].name); diff --git a/include/linux/ioport.h b/include/linux/ioport.h index a9b9170b5dd2..6c3eca90cbc4 100644 --- a/include/linux/ioport.h +++ b/include/linux/ioport.h @@ -301,5 +301,11 @@ struct resource *devm_request_free_mem_region(struct device *dev, struct resource *request_free_mem_region(struct resource *base, unsigned long size, const char *name); +#ifdef CONFIG_IO_STRICT_DEVMEM +void revoke_devmem(struct resource *res); +#else +static inline void revoke_devmem(struct resource *res) { }; +#endif + #endif /* __ASSEMBLY__ */ #endif /* _LINUX_IOPORT_H */ diff --git a/include/uapi/linux/magic.h b/include/uapi/linux/magic.h index d78064007b17..f3956fc11de6 100644 --- a/include/uapi/linux/magic.h +++ b/include/uapi/linux/magic.h @@ -94,6 +94,7 @@ #define BALLOON_KVM_MAGIC 0x13661366 #define ZSMALLOC_MAGIC 0x58295829 #define DMA_BUF_MAGIC 0x444d4142 /* "DMAB" */ +#define DEVMEM_MAGIC 0x454d444d /* "DMEM" */ #define Z3FOLD_MAGIC 0x33 #define PPC_CMM_MAGIC 0xc7571590 diff --git a/kernel/resource.c b/kernel/resource.c index 76036a41143b..841737bbda9e 100644 --- a/kernel/resource.c +++ b/kernel/resource.c @@ -1126,6 +1126,7 @@ struct resource * __request_region(struct resource *parent, { DECLARE_WAITQUEUE(wait, current); struct resource *res = alloc_resource(GFP_KERNEL); + struct resource *orig_parent = parent; if (!res) return NULL; @@ -1176,6 +1177,10 @@ struct resource * __request_region(struct resource *parent, break; } write_unlock(&resource_lock); + + if (res && orig_parent == &iomem_resource) + revoke_devmem(res); + return res; } EXPORT_SYMBOL(__request_region); -- cgit v1.2.3 From 6b646a7e4af69814dd1a3340fca0f02d4977420d Mon Sep 17 00:00:00 2001 From: Leon Romanovsky Date: Mon, 9 Mar 2020 16:44:25 
+0200 Subject: net/mlx5: Add ability to read and write ECE options The end result of the RDMA-CM ECE handshake is a set of ECE options, which need to be used when configuring data QPs. Such options can come in any QP state, so add in/out fields to set and query ECE options. OUT fields: * create_qp() - default ECE options for that type of QP. * modify_qp() - enabled ECE options after QP state transition. IN fields: * create_qp() - create QP with this ECE option. * modify_qp() - requested options. For unconnected QPs, the FW will return an error if ECE is already configured with any options that are not equal to those previously set. Reviewed-by: Mark Zhang Reviewed-by: Maor Gottlieb Signed-off-by: Leon Romanovsky --- include/linux/mlx5/mlx5_ifc.h | 26 ++++++++++++++++---------- 1 file changed, 16 insertions(+), 10 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mlx5/mlx5_ifc.h b/include/linux/mlx5/mlx5_ifc.h index fd8da4875ea0..1a56dc079c32 100644 --- a/include/linux/mlx5/mlx5_ifc.h +++ b/include/linux/mlx5/mlx5_ifc.h @@ -1208,7 +1208,9 @@ struct mlx5_ifc_cmd_hca_cap_bits { u8 reserved_at_99[0x2]; u8 log_max_qp[0x5]; - u8 reserved_at_a0[0xb]; + u8 reserved_at_a0[0x3]; + u8 ece_support[0x1]; + u8 reserved_at_a4[0x7]; u8 log_max_srq[0x5]; u8 reserved_at_b0[0x10]; @@ -4216,7 +4218,8 @@ struct mlx5_ifc_rts2rts_qp_out_bits { u8 syndrome[0x20]; - u8 reserved_at_40[0x40]; + u8 reserved_at_40[0x20]; + u8 ece[0x20]; }; struct mlx5_ifc_rts2rts_qp_in_bits { @@ -4233,7 +4236,7 @@ struct mlx5_ifc_rts2rts_qp_in_bits { u8 opt_param_mask[0x20]; - u8 reserved_at_a0[0x20]; + u8 ece[0x20]; struct mlx5_ifc_qpc_bits qpc; @@ -4246,7 +4249,8 @@ struct mlx5_ifc_rtr2rts_qp_out_bits { u8 syndrome[0x20]; - u8 reserved_at_40[0x40]; + u8 reserved_at_40[0x20]; + u8 ece[0x20]; }; struct mlx5_ifc_rtr2rts_qp_in_bits { @@ -4263,7 +4267,7 @@ struct mlx5_ifc_rtr2rts_qp_in_bits { u8 opt_param_mask[0x20]; - u8 reserved_at_a0[0x20]; + u8 ece[0x20]; struct mlx5_ifc_qpc_bits qpc; @@ -4815,7 +4819,8 @@ struct mlx5_ifc_query_qp_out_bits { u8 syndrome[0x20]; - u8 reserved_at_40[0x40]; + u8 reserved_at_40[0x20]; + u8 ece[0x20]; u8 opt_param_mask[0x20]; @@ -6580,7 +6585,8 @@ struct mlx5_ifc_init2rtr_qp_out_bits { u8 syndrome[0x20]; - u8 reserved_at_40[0x40]; + u8 reserved_at_40[0x20]; + u8 ece[0x20]; }; struct mlx5_ifc_init2rtr_qp_in_bits { @@ -6597,7 +6603,7 @@ struct mlx5_ifc_init2rtr_qp_in_bits { u8 opt_param_mask[0x20]; - u8 reserved_at_a0[0x20]; + u8 ece[0x20]; struct mlx5_ifc_qpc_bits qpc; @@ -7693,7 +7699,7 @@ struct mlx5_ifc_create_qp_out_bits { u8 reserved_at_40[0x8]; u8 qpn[0x18]; - u8 reserved_at_60[0x20]; + u8 ece[0x20]; }; struct mlx5_ifc_create_qp_in_bits { @@ -7707,7 +7713,7 @@ struct mlx5_ifc_create_qp_in_bits { u8 opt_param_mask[0x20]; - u8 reserved_at_a0[0x20]; + u8 ece[0x20]; struct mlx5_ifc_qpc_bits qpc; -- cgit v1.2.3 From 956d510ee78caebc83c0eaeb892db5b239a36a06 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 27 May 2020 07:24:04 +0200 Subject: block: add disk/bio-based accounting helpers Add two new helpers to simplify I/O accounting for bio based drivers. Currently these drivers use the generic_start_io_acct and generic_end_io_acct helpers which have very cumbersome calling conventions, don't actually return the time they started accounting, and try to deal with accounting for partitions, which can't happen for bio based drivers. The new helpers will be used to subsequently replace uses of the old helpers.
The main API is the bio based wrappers in blkdev.h, but lower level routines are provided as well for zram, which wants to account rw_page based I/O. Signed-off-by: Christoph Hellwig Reviewed-by: Konstantin Khlebnikov Signed-off-by: Jens Axboe --- block/blk-core.c | 34 ++++++++++++++++++++++++++++++++++ include/linux/blkdev.h | 28 ++++++++++++++++++++++++++++ 2 files changed, 62 insertions(+) (limited to 'include/linux') diff --git a/block/blk-core.c b/block/blk-core.c index 77e57c2e8d60..8973104f88d9 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -1432,6 +1432,40 @@ void blk_account_io_start(struct request *rq, bool new_io) part_stat_unlock(); } +unsigned long disk_start_io_acct(struct gendisk *disk, unsigned int sectors, + unsigned int op) +{ + struct hd_struct *part = &disk->part0; + const int sgrp = op_stat_group(op); + unsigned long now = READ_ONCE(jiffies); + + part_stat_lock(); + update_io_ticks(part, now, false); + part_stat_inc(part, ios[sgrp]); + part_stat_add(part, sectors[sgrp], sectors); + part_stat_local_inc(part, in_flight[op_is_write(op)]); + part_stat_unlock(); + + return now; +} +EXPORT_SYMBOL(disk_start_io_acct); + +void disk_end_io_acct(struct gendisk *disk, unsigned int op, + unsigned long start_time) +{ + struct hd_struct *part = &disk->part0; + const int sgrp = op_stat_group(op); + unsigned long now = READ_ONCE(jiffies); + unsigned long duration = now - start_time; + + part_stat_lock(); + update_io_ticks(part, now, true); + part_stat_add(part, nsecs[sgrp], jiffies_to_nsecs(duration)); + part_stat_local_dec(part, in_flight[op_is_write(op)]); + part_stat_unlock(); +} +EXPORT_SYMBOL(disk_end_io_acct); + /* * Steal bios from a request and add them to a bio list. * The request must not have been partially completed before. diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 7d10f4e63232..6f7ff0fa8fcf 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -1892,4 +1892,32 @@ static inline void blk_wake_io_task(struct task_struct *waiter) wake_up_process(waiter); } +unsigned long disk_start_io_acct(struct gendisk *disk, unsigned int sectors, + unsigned int op); +void disk_end_io_acct(struct gendisk *disk, unsigned int op, + unsigned long start_time); + +#ifdef CONFIG_BLOCK +/** + * bio_start_io_acct - start I/O accounting for bio based drivers + * @bio: bio to start account for + * + * Returns the start time that should be passed back to bio_end_io_acct(). + */ +static inline unsigned long bio_start_io_acct(struct bio *bio) +{ + return disk_start_io_acct(bio->bi_disk, bio_sectors(bio), bio_op(bio)); +} + +/** + * bio_end_io_acct - end I/O accounting for bio based drivers + * @bio: bio to end account for + * @start: start time returned by bio_start_io_acct() + */ +static inline void bio_end_io_acct(struct bio *bio, unsigned long start_time) +{ + return disk_end_io_acct(bio->bi_disk, bio_op(bio), start_time); +} +#endif /* CONFIG_BLOCK */ + #endif -- cgit v1.2.3 From e722fff238bbfe6308d7778a8c2163c181bf998a Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 27 May 2020 07:24:12 +0200 Subject: block: remove generic_{start,end}_io_acct Remove these now unused functions.
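For context, a minimal before/after sketch (hypothetical bio-based driver code; the two halves are alternatives, not one function) of the migration away from these helpers to the pair added in the previous patch:

	/* before: the removed helpers, with their cumbersome arguments */
	generic_start_io_acct(q, bio_op(bio), bio_sectors(bio), &disk->part0);

	/* after: the new wrapper returns the start time directly */
	unsigned long start_time = bio_start_io_acct(bio);
	/* ... perform the I/O ... */
	bio_end_io_acct(bio, start_time);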
Signed-off-by: Christoph Hellwig Reviewed-by: Konstantin Khlebnikov Signed-off-by: Jens Axboe --- block/bio.c | 39 --------------------------------------- include/linux/bio.h | 6 ------ 2 files changed, 45 deletions(-) (limited to 'include/linux') diff --git a/block/bio.c b/block/bio.c index 9c101a0572ca..3e89c7b37855 100644 --- a/block/bio.c +++ b/block/bio.c @@ -1392,45 +1392,6 @@ again: } } -void generic_start_io_acct(struct request_queue *q, int op, - unsigned long sectors, struct hd_struct *part) -{ - const int sgrp = op_stat_group(op); - int rw = op_is_write(op); - - part_stat_lock(); - - update_io_ticks(part, jiffies, false); - part_stat_inc(part, ios[sgrp]); - part_stat_add(part, sectors[sgrp], sectors); - part_stat_local_inc(part, in_flight[rw]); - if (part->partno) - part_stat_local_inc(&part_to_disk(part)->part0, in_flight[rw]); - - part_stat_unlock(); -} -EXPORT_SYMBOL(generic_start_io_acct); - -void generic_end_io_acct(struct request_queue *q, int req_op, - struct hd_struct *part, unsigned long start_time) -{ - unsigned long now = jiffies; - unsigned long duration = now - start_time; - const int sgrp = op_stat_group(req_op); - int rw = op_is_write(req_op); - - part_stat_lock(); - - update_io_ticks(part, now, true); - part_stat_add(part, nsecs[sgrp], jiffies_to_nsecs(duration)); - part_stat_local_dec(part, in_flight[rw]); - if (part->partno) - part_stat_local_dec(&part_to_disk(part)->part0, in_flight[rw]); - - part_stat_unlock(); -} -EXPORT_SYMBOL(generic_end_io_acct); - static inline bool bio_remaining_done(struct bio *bio) { /* diff --git a/include/linux/bio.h b/include/linux/bio.h index 950c9dc44c4f..941378ec5b39 100644 --- a/include/linux/bio.h +++ b/include/linux/bio.h @@ -444,12 +444,6 @@ void bio_release_pages(struct bio *bio, bool mark_dirty); extern void bio_set_pages_dirty(struct bio *bio); extern void bio_check_pages_dirty(struct bio *bio); -void generic_start_io_acct(struct request_queue *q, int op, - unsigned long sectors, struct hd_struct *part); -void generic_end_io_acct(struct request_queue *q, int op, - struct hd_struct *part, - unsigned long start_time); - extern void bio_copy_data_iter(struct bio *dst, struct bvec_iter *dst_iter, struct bio *src, struct bvec_iter *src_iter); extern void bio_copy_data(struct bio *dst, struct bio *src); -- cgit v1.2.3 From 58d4f14fc30ac26288cfed74d7e566921c22cf59 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 27 May 2020 07:24:14 +0200 Subject: block: always use a percpu variable for disk stats percpu variables have a perfectly fine working stub implementation for UP kernels, so use that. 
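Schematically, code like the following now compiles and behaves equivalently on SMP and UP, since the UP implementation of the percpu API degenerates to a single instance (sketch only, error handling trimmed):

	struct disk_stats __percpu *dkstats;

	dkstats = alloc_percpu(struct disk_stats);
	if (!dkstats)
		return -ENOMEM;
	/* on UP this resolves to the one and only copy */
	this_cpu_inc(dkstats->ios[STAT_READ]);
	free_percpu(dkstats);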
Signed-off-by: Christoph Hellwig Reviewed-by: Konstantin Khlebnikov Signed-off-by: Jens Axboe --- block/blk.h | 2 +- block/genhd.c | 12 +++-------- block/partitions/core.c | 5 +++-- include/linux/genhd.h | 13 ----------- include/linux/part_stat.h | 55 ++++++++++------------------------------------- 5 files changed, 18 insertions(+), 69 deletions(-) (limited to 'include/linux') diff --git a/block/blk.h b/block/blk.h index bdf5e94467aa..0ecba2ab383d 100644 --- a/block/blk.h +++ b/block/blk.h @@ -378,7 +378,7 @@ static inline void hd_struct_put(struct hd_struct *part) static inline void hd_free_part(struct hd_struct *part) { - free_part_stats(part); + free_percpu(part->dkstats); kfree(part->info); percpu_ref_exit(&part->ref); } diff --git a/block/genhd.c b/block/genhd.c index 094ed9096496..3e7df0a3e6bb 100644 --- a/block/genhd.c +++ b/block/genhd.c @@ -92,7 +92,6 @@ const char *bdevname(struct block_device *bdev, char *buf) } EXPORT_SYMBOL(bdevname); -#ifdef CONFIG_SMP static void part_stat_read_all(struct hd_struct *part, struct disk_stats *stat) { int cpu; @@ -112,12 +111,6 @@ static void part_stat_read_all(struct hd_struct *part, struct disk_stats *stat) stat->io_ticks += ptr->io_ticks; } } -#else /* CONFIG_SMP */ -static void part_stat_read_all(struct hd_struct *part, struct disk_stats *stat) -{ - memcpy(stat, &part->dkstats, sizeof(struct disk_stats)); -} -#endif /* CONFIG_SMP */ static unsigned int part_in_flight(struct request_queue *q, struct hd_struct *part) @@ -1688,14 +1681,15 @@ struct gendisk *__alloc_disk_node(int minors, int node_id) disk = kzalloc_node(sizeof(struct gendisk), GFP_KERNEL, node_id); if (disk) { - if (!init_part_stats(&disk->part0)) { + disk->part0.dkstats = alloc_percpu(struct disk_stats); + if (!disk->part0.dkstats) { kfree(disk); return NULL; } init_rwsem(&disk->lookup_sem); disk->node_id = node_id; if (disk_expand_part_tbl(disk, 0)) { - free_part_stats(&disk->part0); + free_percpu(disk->part0.dkstats); kfree(disk); return NULL; } diff --git a/block/partitions/core.c b/block/partitions/core.c index 297004fd2264..78951e33b2d7 100644 --- a/block/partitions/core.c +++ b/block/partitions/core.c @@ -387,7 +387,8 @@ static struct hd_struct *add_partition(struct gendisk *disk, int partno, if (!p) return ERR_PTR(-EBUSY); - if (!init_part_stats(p)) { + p->dkstats = alloc_percpu(struct disk_stats); + if (!p->dkstats) { err = -ENOMEM; goto out_free; } @@ -468,7 +469,7 @@ static struct hd_struct *add_partition(struct gendisk *disk, int partno, out_free_info: kfree(p->info); out_free_stats: - free_part_stats(p); + free_percpu(p->dkstats); out_free: kfree(p); return ERR_PTR(err); diff --git a/include/linux/genhd.h b/include/linux/genhd.h index a9384449465a..f0d6d77309a5 100644 --- a/include/linux/genhd.h +++ b/include/linux/genhd.h @@ -39,15 +39,6 @@ extern struct class block_class; #include #include -struct disk_stats { - u64 nsecs[NR_STAT_GROUPS]; - unsigned long sectors[NR_STAT_GROUPS]; - unsigned long ios[NR_STAT_GROUPS]; - unsigned long merges[NR_STAT_GROUPS]; - unsigned long io_ticks; - local_t in_flight[2]; -}; - #define PARTITION_META_INFO_VOLNAMELTH 64 /* * Enough for the string representation of any kind of UUID plus NULL. 
@@ -72,11 +63,7 @@ struct hd_struct { seqcount_t nr_sects_seq; #endif unsigned long stamp; -#ifdef CONFIG_SMP struct disk_stats __percpu *dkstats; -#else - struct disk_stats dkstats; -#endif struct percpu_ref ref; sector_t alignment_offset; diff --git a/include/linux/part_stat.h b/include/linux/part_stat.h index ece607607a86..6644197980b9 100644 --- a/include/linux/part_stat.h +++ b/include/linux/part_stat.h @@ -4,19 +4,23 @@ #include +struct disk_stats { + u64 nsecs[NR_STAT_GROUPS]; + unsigned long sectors[NR_STAT_GROUPS]; + unsigned long ios[NR_STAT_GROUPS]; + unsigned long merges[NR_STAT_GROUPS]; + unsigned long io_ticks; + local_t in_flight[2]; +}; + /* * Macros to operate on percpu disk statistics: * - * {disk|part|all}_stat_{add|sub|inc|dec}() modify the stat counters - * and should be called between disk_stat_lock() and - * disk_stat_unlock(). + * {disk|part|all}_stat_{add|sub|inc|dec}() modify the stat counters and should + * be called between disk_stat_lock() and disk_stat_unlock(). * * part_stat_read() can be called at any time. - * - * part_stat_{add|set_all}() and {init|free}_part_stats are for - * internal use only. */ -#ifdef CONFIG_SMP #define part_stat_lock() ({ rcu_read_lock(); get_cpu(); }) #define part_stat_unlock() do { put_cpu(); rcu_read_unlock(); } while (0) @@ -44,43 +48,6 @@ static inline void part_stat_set_all(struct hd_struct *part, int value) sizeof(struct disk_stats)); } -static inline int init_part_stats(struct hd_struct *part) -{ - part->dkstats = alloc_percpu(struct disk_stats); - if (!part->dkstats) - return 0; - return 1; -} - -static inline void free_part_stats(struct hd_struct *part) -{ - free_percpu(part->dkstats); -} - -#else /* !CONFIG_SMP */ -#define part_stat_lock() ({ rcu_read_lock(); 0; }) -#define part_stat_unlock() rcu_read_unlock() - -#define part_stat_get(part, field) ((part)->dkstats.field) -#define part_stat_get_cpu(part, field, cpu) part_stat_get(part, field) -#define part_stat_read(part, field) part_stat_get(part, field) - -static inline void part_stat_set_all(struct hd_struct *part, int value) -{ - memset(&part->dkstats, value, sizeof(struct disk_stats)); -} - -static inline int init_part_stats(struct hd_struct *part) -{ - return 1; -} - -static inline void free_part_stats(struct hd_struct *part) -{ -} - -#endif /* CONFIG_SMP */ - #define part_stat_read_accum(part, field) \ (part_stat_read(part, field[STAT_READ]) + \ part_stat_read(part, field[STAT_WRITE]) + \ -- cgit v1.2.3 From 8ab1d40a646e753adb6814642432a093d93dbf47 Mon Sep 17 00:00:00 2001 From: Konstantin Khlebnikov Date: Wed, 27 May 2020 07:24:17 +0200 Subject: block: remove rcu_read_lock() from part_stat_lock() The RCU lock is required only in disk_map_sector_rcu() to look up the partition. After that, the request holds a reference to the related hd_struct. Replace get_cpu() with preempt_disable(), since the returned CPU index is unused.
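For illustration, the resulting caller pattern looks like this (a sketch based on the accounting helpers elsewhere in block/, not new code in this patch):

	part_stat_lock();		/* after this patch: just preempt_disable() */
	update_io_ticks(part, jiffies, false);
	part_stat_inc(part, ios[sgrp]);
	part_stat_unlock();		/* preempt_enable() */

The hd_struct reference taken under RCU in disk_map_sector_rcu() keeps the partition alive across the statistics update, so no RCU read-side critical section is needed here.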
[hch: rebased] Signed-off-by: Konstantin Khlebnikov Signed-off-by: Christoph Hellwig Signed-off-by: Jens Axboe --- block/genhd.c | 11 ++++++++--- include/linux/part_stat.h | 4 ++-- 2 files changed, 10 insertions(+), 5 deletions(-) (limited to 'include/linux') diff --git a/block/genhd.c b/block/genhd.c index 3e7df0a3e6bb..1a7659327664 100644 --- a/block/genhd.c +++ b/block/genhd.c @@ -321,11 +321,12 @@ struct hd_struct *disk_map_sector_rcu(struct gendisk *disk, sector_t sector) struct hd_struct *part; int i; + rcu_read_lock(); ptbl = rcu_dereference(disk->part_tbl); part = rcu_dereference(ptbl->last_lookup); if (part && sector_in_part(part, sector) && hd_struct_try_get(part)) - return part; + goto out_unlock; for (i = 1; i < ptbl->len; i++) { part = rcu_dereference(ptbl->part[i]); @@ -339,10 +340,14 @@ struct hd_struct *disk_map_sector_rcu(struct gendisk *disk, sector_t sector) if (!hd_struct_try_get(part)) break; rcu_assign_pointer(ptbl->last_lookup, part); - return part; + goto out_unlock; } } - return &disk->part0; + + part = &disk->part0; +out_unlock: + rcu_read_unlock(); + return part; } /** diff --git a/include/linux/part_stat.h b/include/linux/part_stat.h index 6644197980b9..a6b0938ce82e 100644 --- a/include/linux/part_stat.h +++ b/include/linux/part_stat.h @@ -21,8 +21,8 @@ struct disk_stats { * * part_stat_read() can be called at any time. */ -#define part_stat_lock() ({ rcu_read_lock(); get_cpu(); }) -#define part_stat_unlock() do { put_cpu(); rcu_read_unlock(); } while (0) +#define part_stat_lock() preempt_disable() +#define part_stat_unlock() preempt_enable() #define part_stat_get_cpu(part, field, cpu) \ (per_cpu_ptr((part)->dkstats, (cpu))->field) -- cgit v1.2.3 From b2d76adbc0828e0f108567973bcc500ed1abc139 Mon Sep 17 00:00:00 2001 From: Konstantin Khlebnikov Date: Wed, 27 May 2020 07:24:18 +0200 Subject: block: use __this_cpu_add() instead of access by smp_processor_id() Most architectures have a fast path for percpu access on the current CPU. The required preempt_disable() is provided by part_stat_lock(). [hch: rebased] Signed-off-by: Konstantin Khlebnikov Signed-off-by: Christoph Hellwig Signed-off-by: Jens Axboe --- include/linux/part_stat.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/part_stat.h b/include/linux/part_stat.h index a6b0938ce82e..24125778ef3e 100644 --- a/include/linux/part_stat.h +++ b/include/linux/part_stat.h @@ -54,7 +54,7 @@ static inline void part_stat_set_all(struct hd_struct *part, int value) part_stat_read(part, field[STAT_DISCARD])) #define __part_stat_add(part, field, addnd) \ - (part_stat_get(part, field) += (addnd)) + __this_cpu_add((part)->dkstats->field, addnd) #define part_stat_add(part, field, addnd) do { \ __part_stat_add((part), field, addnd); \ -- cgit v1.2.3 From 521376741b2c26fe53a1ec24d02da24d477eb739 Mon Sep 17 00:00:00 2001 From: Jean-Philippe Brucker Date: Wed, 20 May 2020 17:22:00 +0200 Subject: PCI/ATS: Only enable ATS for trusted devices Add pci_ats_supported(), which checks whether a device has an ATS capability, and whether it is trusted. A device is untrusted if it is plugged into an external-facing port such as Thunderbolt and could be spoofing an existing device to exploit weaknesses in the IOMMU configuration. PCIe ATS is one such weakness, since it allows endpoints to cache IOMMU translations and emit transactions with 'Translated' Address Type (10b) that partially bypass the IOMMU translation.
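As a sketch of the intended use (the caller below is hypothetical; pci_ats_supported() and pci_enable_ats() are the real interfaces touched by this patch):

	static void example_iommu_enable_ats(struct pci_dev *pdev, int ps)
	{
		/* false if no ATS capability, pci=noats, or untrusted device */
		if (!pci_ats_supported(pdev))
			return;

		if (pci_enable_ats(pdev, ps))
			pci_info(pdev, "enabling ATS failed\n");
	}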
The SMMUv3 and VT-d IOMMU drivers already disallow ATS and transactions with 'Translated' Address Type for untrusted devices. Add the check to pci_enable_ats() to let other drivers (AMD IOMMU for now) benefit from it. By checking ats_cap, the pci_ats_supported() helper also returns whether ATS was globally disabled with pci=noats, and could later include more things, for example whether the whole PCIe hierarchy down to the endpoint supports ATS. Signed-off-by: Jean-Philippe Brucker Reviewed-by: Joerg Roedel Acked-by: Bjorn Helgaas Link: https://lore.kernel.org/r/20200520152201.3309416-2-jean-philippe@linaro.org Signed-off-by: Joerg Roedel --- drivers/pci/ats.c | 18 +++++++++++++++++- include/linux/pci-ats.h | 3 +++ 2 files changed, 20 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/drivers/pci/ats.c b/drivers/pci/ats.c index 390e92f2d8d1..b761c1f72f67 100644 --- a/drivers/pci/ats.c +++ b/drivers/pci/ats.c @@ -30,6 +30,22 @@ void pci_ats_init(struct pci_dev *dev) dev->ats_cap = pos; } +/** + * pci_ats_supported - check if the device can use ATS + * @dev: the PCI device + * + * Returns true if the device supports ATS and is allowed to use it, false + * otherwise. + */ +bool pci_ats_supported(struct pci_dev *dev) +{ + if (!dev->ats_cap) + return false; + + return (dev->untrusted == 0); +} +EXPORT_SYMBOL_GPL(pci_ats_supported); + /** * pci_enable_ats - enable the ATS capability * @dev: the PCI device @@ -42,7 +58,7 @@ int pci_enable_ats(struct pci_dev *dev, int ps) u16 ctrl; struct pci_dev *pdev; - if (!dev->ats_cap) + if (!pci_ats_supported(dev)) return -EINVAL; if (WARN_ON(dev->ats_enabled)) diff --git a/include/linux/pci-ats.h b/include/linux/pci-ats.h index d08f0869f121..f75c307f346d 100644 --- a/include/linux/pci-ats.h +++ b/include/linux/pci-ats.h @@ -6,11 +6,14 @@ #ifdef CONFIG_PCI_ATS /* Address Translation Service */ +bool pci_ats_supported(struct pci_dev *dev); int pci_enable_ats(struct pci_dev *dev, int ps); void pci_disable_ats(struct pci_dev *dev); int pci_ats_queue_depth(struct pci_dev *dev); int pci_ats_page_aligned(struct pci_dev *dev); #else /* CONFIG_PCI_ATS */ +static inline bool pci_ats_supported(struct pci_dev *d) +{ return false; } static inline int pci_enable_ats(struct pci_dev *d, int ps) { return -ENODEV; } static inline void pci_disable_ats(struct pci_dev *d) { } -- cgit v1.2.3 From d04659db7b8089531ccbee364c52909fe4670408 Mon Sep 17 00:00:00 2001 From: Zheng Bin Date: Fri, 24 Apr 2020 16:17:23 +0800 Subject: nfs4: Remove unneeded semicolon Fixes coccicheck warning: include/linux/nfs4.h:298:2-3: Unneeded semicolon Reported-by: Hulk Robot Signed-off-by: Zheng Bin Signed-off-by: Anna Schumaker --- include/linux/nfs4.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/nfs4.h b/include/linux/nfs4.h index 82d8fb422092..4a51db7363a5 100644 --- a/include/linux/nfs4.h +++ b/include/linux/nfs4.h @@ -295,7 +295,7 @@ static inline bool seqid_mutating_err(u32 err) case NFS4ERR_NOFILEHANDLE: case NFS4ERR_MOVED: return false; - }; + } return true; } -- cgit v1.2.3 From 1ad8dd939a9826ff6f8c69ac13e4e1dbba076703 Mon Sep 17 00:00:00 2001 From: "Gustavo A. R. 
Silva" Date: Thu, 7 May 2020 14:02:23 -0500 Subject: NFS: Replace zero-length array with flexible-array The current codebase makes use of the zero-length array language extension to the C90 standard, but the preferred mechanism to declare variable-length types such as these ones is a flexible array member[1][2], introduced in C99: struct foo { int stuff; struct boo array[]; }; By making use of the mechanism above, we will get a compiler warning in case the flexible array does not occur last in the structure, which will help us prevent some kind of undefined behavior bugs from being inadvertently introduced[3] to the codebase from now on. Also, notice that, dynamic memory allocations won't be affected by this change: "Flexible array members have incomplete type, and so the sizeof operator may not be applied. As a quirk of the original implementation of zero-length arrays, sizeof evaluates to zero."[1] sizeof(flexible-array-member) triggers a warning because flexible array members have incomplete type[1]. There are some instances of code in which the sizeof operator is being incorrectly/erroneously applied to zero-length arrays and the result is zero. Such instances may be hiding some bugs. So, this work (flexible-array member conversions) will also help to get completely rid of those sorts of issues. This issue was found with the help of Coccinelle. [1] https://gcc.gnu.org/onlinedocs/gcc/Zero-Length.html [2] https://github.com/KSPP/linux/issues/21 [3] commit 76497732932f ("cxgb3/l2t: Fix undefined behaviour") Signed-off-by: Gustavo A. R. Silva Signed-off-by: Anna Schumaker --- include/linux/nfs4.h | 2 +- include/linux/nfs_xdr.h | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/nfs4.h b/include/linux/nfs4.h index 4a51db7363a5..4dba3c948932 100644 --- a/include/linux/nfs4.h +++ b/include/linux/nfs4.h @@ -38,7 +38,7 @@ struct nfs4_ace { struct nfs4_acl { uint32_t naces; - struct nfs4_ace aces[0]; + struct nfs4_ace aces[]; }; #define NFS4_MAXLABELLEN 2048 diff --git a/include/linux/nfs_xdr.h b/include/linux/nfs_xdr.h index e5f3e7d8d3d5..5fd0a9ef425f 100644 --- a/include/linux/nfs_xdr.h +++ b/include/linux/nfs_xdr.h @@ -1227,7 +1227,7 @@ struct nfs4_secinfo4 { struct nfs4_secinfo_flavors { unsigned int num_flavors; - struct nfs4_secinfo4 flavors[0]; + struct nfs4_secinfo4 flavors[]; }; struct nfs4_secinfo_arg { -- cgit v1.2.3 From ad1e4f74c072eaa2c6d77dd710db31aafecd614f Mon Sep 17 00:00:00 2001 From: Domenico Andreoli Date: Tue, 19 May 2020 20:14:10 +0200 Subject: PM: hibernate: Restrict writes to the resume device Hibernation via snapshot device requires write permission to the swap block device, the one that more often (but not necessarily) is used to store the hibernation image. With this patch, such permissions are granted iff: 1) snapshot device config option is enabled 2) swap partition is used as resume device In other circumstances the swap device is not writable from userspace. In order to achieve this, every write attempt to a swap device is checked against the device configured as part of the uswsusp API [0] using a pointer to the inode struct in memory. If the swap device being written was not configured for resuming, the write request is denied. NOTE: this implementation works only for swap block devices, where the inode configured by swapon (which sets S_SWAPFILE) is the same used by SNAPSHOT_SET_SWAP_AREA. 
In the case of a swap file, SNAPSHOT_SET_SWAP_AREA instead receives the inode of the block device containing the filesystem where the swap file is located (plus an offset into it), which is never passed to swapon and therefore does not have S_SWAPFILE set. As a result, the swap file itself (as a file) can never be written from userspace. Instead, it remains writable if accessed directly through the containing block device, which is always writable by root. [0] Documentation/power/userland-swsusp.rst v2: - rename is_hibernate_snapshot_dev() to is_hibernate_resume_dev() - fix description to correctly refer to the resume device Signed-off-by: Domenico Andreoli Acked-by: Darrick J. Wong Signed-off-by: Rafael J. Wysocki --- fs/block_dev.c | 3 +-- include/linux/suspend.h | 6 ++++++ kernel/power/user.c | 14 +++++++++++++- 3 files changed, 20 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/fs/block_dev.c b/fs/block_dev.c index 93672c3f1c78..608a7b9173f4 100644 --- a/fs/block_dev.c +++ b/fs/block_dev.c @@ -2023,8 +2023,7 @@ ssize_t blkdev_write_iter(struct kiocb *iocb, struct iov_iter *from) if (bdev_read_only(I_BDEV(bd_inode))) return -EPERM; - /* uswsusp needs write permission to the swap */ - if (IS_SWAPFILE(bd_inode) && !hibernation_available()) + if (IS_SWAPFILE(bd_inode) && !is_hibernate_resume_dev(bd_inode)) return -ETXTBSY; if (!iov_iter_count(from)) diff --git a/include/linux/suspend.h b/include/linux/suspend.h index 4fcc6fd0cbd6..b960098acfb0 100644 --- a/include/linux/suspend.h +++ b/include/linux/suspend.h @@ -466,6 +466,12 @@ static inline bool system_entering_hibernation(void) { return false; } static inline bool hibernation_available(void) { return false; } #endif /* CONFIG_HIBERNATION */ +#ifdef CONFIG_HIBERNATION_SNAPSHOT_DEV +int is_hibernate_resume_dev(const struct inode *); +#else +static inline int is_hibernate_resume_dev(const struct inode *i) { return 0; } +#endif + /* Hibernation and suspend events */ #define PM_HIBERNATION_PREPARE 0x0001 /* Going to hibernate */ #define PM_POST_HIBERNATION 0x0002 /* Hibernation finished */ diff --git a/kernel/power/user.c b/kernel/power/user.c index 98548d1cf8a6..d5eedc2baa2a 100644 --- a/kernel/power/user.c +++ b/kernel/power/user.c @@ -35,8 +35,14 @@ static struct snapshot_data { bool ready; bool platform_support; bool free_bitmaps; + struct inode *bd_inode; } snapshot_state; +int is_hibernate_resume_dev(const struct inode *bd_inode) +{ + return hibernation_available() && snapshot_state.bd_inode == bd_inode; +} + static int snapshot_open(struct inode *inode, struct file *filp) { struct snapshot_data *data; @@ -95,6 +101,7 @@ static int snapshot_open(struct inode *inode, struct file *filp) data->frozen = false; data->ready = false; data->platform_support = false; + data->bd_inode = NULL; Unlock: unlock_system_sleep(); @@ -110,6 +117,7 @@ static int snapshot_release(struct inode *inode, struct file *filp) swsusp_free(); data = filp->private_data; + data->bd_inode = NULL; free_all_swap_pages(data->swap); if (data->frozen) { pm_restore_gfp_mask(); @@ -202,6 +210,7 @@ struct compat_resume_swap_area { static int snapshot_set_swap_area(struct snapshot_data *data, void __user *argp) { + struct block_device *bdev; sector_t offset; dev_t swdev; @@ -232,9 +241,12 @@ static int snapshot_set_swap_area(struct snapshot_data *data, data->swap = -1; return -EINVAL; } - data->swap = swap_type_of(swdev, offset, NULL); + data->swap = swap_type_of(swdev, offset, &bdev); if (data->swap < 0) return -ENODEV; + + data->bd_inode = bdev->bd_inode; +
bdput(bdev); return 0; } -- cgit v1.2.3 From f18e26af6aba778b888044859d9c69bb9bbc7bc1 Mon Sep 17 00:00:00 2001 From: Leon Romanovsky Date: Tue, 26 May 2020 14:54:38 +0300 Subject: RDMA/mlx5: Convert modify QP to use MLX5_SET macros Instead of hand crafted mlx5_qp_context and mlx5_qp_path use common MLX5_SET() macros. Link: https://lore.kernel.org/r/20200526115440.205922-7-leon@kernel.org Reviewed-by: Maor Gottlieb Reviewed-by: Mark Zhang Signed-off-by: Leon Romanovsky Signed-off-by: Jason Gunthorpe --- drivers/infiniband/hw/mlx5/qp.c | 203 +++++++++++++++++++--------------------- include/linux/mlx5/qp.h | 66 ------------- 2 files changed, 97 insertions(+), 172 deletions(-) (limited to 'include/linux') diff --git a/drivers/infiniband/hw/mlx5/qp.c b/drivers/infiniband/hw/mlx5/qp.c index 5099866533dd..a24176a8ec83 100644 --- a/drivers/infiniband/hw/mlx5/qp.c +++ b/drivers/infiniband/hw/mlx5/qp.c @@ -3028,14 +3028,13 @@ int mlx5_ib_destroy_qp(struct ib_qp *qp, struct ib_udata *udata) return 0; } -static int to_mlx5_access_flags(struct mlx5_ib_qp *qp, - const struct ib_qp_attr *attr, - int attr_mask, __be32 *hw_access_flags_be) +static int set_qpc_atomic_flags(struct mlx5_ib_qp *qp, + const struct ib_qp_attr *attr, int attr_mask, + void *qpc) { - u8 dest_rd_atomic; - u32 access_flags, hw_access_flags = 0; - struct mlx5_ib_dev *dev = to_mdev(qp->ibqp.device); + u8 dest_rd_atomic; + u32 access_flags; if (attr_mask & IB_QP_MAX_DEST_RD_ATOMIC) dest_rd_atomic = attr->max_dest_rd_atomic; @@ -3050,8 +3049,8 @@ static int to_mlx5_access_flags(struct mlx5_ib_qp *qp, if (!dest_rd_atomic) access_flags &= IB_ACCESS_REMOTE_WRITE; - if (access_flags & IB_ACCESS_REMOTE_READ) - hw_access_flags |= MLX5_QP_BIT_RRE; + MLX5_SET(qpc, qpc, rre, !!(access_flags & IB_ACCESS_REMOTE_READ)); + if (access_flags & IB_ACCESS_REMOTE_ATOMIC) { int atomic_mode; @@ -3059,15 +3058,11 @@ static int to_mlx5_access_flags(struct mlx5_ib_qp *qp, if (atomic_mode < 0) return -EOPNOTSUPP; - hw_access_flags |= MLX5_QP_BIT_RAE; - hw_access_flags |= atomic_mode << MLX5_ATOMIC_MODE_OFFSET; + MLX5_SET(qpc, qpc, rae, 1); + MLX5_SET(qpc, qpc, atomic_mode, atomic_mode); } - if (access_flags & IB_ACCESS_REMOTE_WRITE) - hw_access_flags |= MLX5_QP_BIT_RWE; - - *hw_access_flags_be = cpu_to_be32(hw_access_flags); - + MLX5_SET(qpc, qpc, rwe, !!(access_flags & IB_ACCESS_REMOTE_WRITE)); return 0; } @@ -3147,26 +3142,22 @@ static int modify_raw_packet_tx_affinity(struct mlx5_core_dev *dev, return err; } -static void mlx5_set_path_udp_sport(struct mlx5_qp_path *path, - const struct rdma_ah_attr *ah, +static void mlx5_set_path_udp_sport(void *path, const struct rdma_ah_attr *ah, u32 lqpn, u32 rqpn) { u32 fl = ah->grh.flow_label; - u16 sport; if (!fl) fl = rdma_calc_flow_label(lqpn, rqpn); - sport = rdma_flow_label_to_udp_sport(fl); - path->udp_sport = cpu_to_be16(sport); + MLX5_SET(ads, path, udp_sport, rdma_flow_label_to_udp_sport(fl)); } static int mlx5_set_path(struct mlx5_ib_dev *dev, struct mlx5_ib_qp *qp, - const struct rdma_ah_attr *ah, - struct mlx5_qp_path *path, u8 port, int attr_mask, - u32 path_flags, const struct ib_qp_attr *attr, - bool alt) + const struct rdma_ah_attr *ah, void *path, u8 port, + int attr_mask, u32 path_flags, + const struct ib_qp_attr *attr, bool alt) { const struct ib_global_route *grh = rdma_ah_read_grh(ah); int err; @@ -3175,8 +3166,8 @@ static int mlx5_set_path(struct mlx5_ib_dev *dev, struct mlx5_ib_qp *qp, u8 sl = rdma_ah_get_sl(ah); if (attr_mask & IB_QP_PKEY_INDEX) - path->pkey_index = cpu_to_be16(alt ? 
attr->alt_pkey_index : - attr->pkey_index); + MLX5_SET(ads, path, pkey_index, + alt ? attr->alt_pkey_index : attr->pkey_index); if (ah_flags & IB_AH_GRH) { if (grh->sgid_index >= @@ -3192,7 +3183,8 @@ static int mlx5_set_path(struct mlx5_ib_dev *dev, struct mlx5_ib_qp *qp, if (!(ah_flags & IB_AH_GRH)) return -EINVAL; - memcpy(path->rmac, ah->roce.dmac, sizeof(ah->roce.dmac)); + ether_addr_copy(MLX5_ADDR_OF(ads, path, rmac_47_32), + ah->roce.dmac); if ((qp->ibqp.qp_type == IB_QPT_RC || qp->ibqp.qp_type == IB_QPT_UC || qp->ibqp.qp_type == IB_QPT_XRC_INI || @@ -3202,38 +3194,38 @@ static int mlx5_set_path(struct mlx5_ib_dev *dev, struct mlx5_ib_qp *qp, mlx5_set_path_udp_sport(path, ah, qp->ibqp.qp_num, attr->dest_qp_num); - path->dci_cfi_prio_sl = (sl & 0x7) << 4; + MLX5_SET(ads, path, eth_prio, sl & 0x7); gid_type = ah->grh.sgid_attr->gid_type; if (gid_type == IB_GID_TYPE_ROCE_UDP_ENCAP) - path->ecn_dscp = (grh->traffic_class >> 2) & 0x3f; + MLX5_SET(ads, path, dscp, grh->traffic_class >> 2); } else { - path->fl_free_ar = (path_flags & MLX5_PATH_FLAG_FL) ? 0x80 : 0; - path->fl_free_ar |= - (path_flags & MLX5_PATH_FLAG_FREE_AR) ? 0x40 : 0; - path->rlid = cpu_to_be16(rdma_ah_get_dlid(ah)); - path->grh_mlid = rdma_ah_get_path_bits(ah) & 0x7f; - if (ah_flags & IB_AH_GRH) - path->grh_mlid |= 1 << 7; - path->dci_cfi_prio_sl = sl & 0xf; + MLX5_SET(ads, path, fl, !!(path_flags & MLX5_PATH_FLAG_FL)); + MLX5_SET(ads, path, free_ar, + !!(path_flags & MLX5_PATH_FLAG_FREE_AR)); + MLX5_SET(ads, path, rlid, rdma_ah_get_dlid(ah)); + MLX5_SET(ads, path, mlid, rdma_ah_get_path_bits(ah)); + MLX5_SET(ads, path, grh, !!(ah_flags & IB_AH_GRH)); + MLX5_SET(ads, path, sl, sl); } if (ah_flags & IB_AH_GRH) { - path->mgid_index = grh->sgid_index; - path->hop_limit = grh->hop_limit; - path->tclass_flowlabel = - cpu_to_be32((grh->traffic_class << 20) | - (grh->flow_label)); - memcpy(path->rgid, grh->dgid.raw, 16); + MLX5_SET(ads, path, src_addr_index, grh->sgid_index); + MLX5_SET(ads, path, hop_limit, grh->hop_limit); + MLX5_SET(ads, path, tclass, grh->traffic_class); + MLX5_SET(ads, path, flow_label, grh->flow_label); + memcpy(MLX5_ADDR_OF(ads, path, rgid_rip), grh->dgid.raw, + sizeof(grh->dgid.raw)); } err = ib_rate_to_mlx5(dev, rdma_ah_get_static_rate(ah)); if (err < 0) return err; - path->static_rate = err; - path->port = port; + MLX5_SET(ads, path, stat_rate, err); + MLX5_SET(ads, path, vhca_port_num, port); if (attr_mask & IB_QP_TIMEOUT) - path->ackto_lt = (alt ? attr->alt_timeout : attr->timeout) << 3; + MLX5_SET(ads, path, ack_timeout, + alt ? 
attr->alt_timeout : attr->timeout); if ((qp->ibqp.qp_type == IB_QPT_RAW_PACKET) && qp->sq.wqe_cnt) return modify_raw_packet_eth_prio(dev->mdev, @@ -3759,9 +3751,9 @@ static int __mlx5_ib_modify_qp(struct ib_qp *ibqp, struct mlx5_ib_qp *qp = to_mqp(ibqp); struct mlx5_ib_qp_base *base = &qp->trans_qp.base; struct mlx5_ib_cq *send_cq, *recv_cq; - struct mlx5_qp_context *context; struct mlx5_ib_pd *pd; enum mlx5_qp_state mlx5_cur, mlx5_new; + void *qpc, *pri_path, *alt_path; enum mlx5_qp_optpar optpar = 0; u32 set_id = 0; int mlx5_st; @@ -3773,25 +3765,25 @@ static int __mlx5_ib_modify_qp(struct ib_qp *ibqp, if (mlx5_st < 0) return -EINVAL; - context = kzalloc(sizeof(*context), GFP_KERNEL); - if (!context) + qpc = kzalloc(MLX5_ST_SZ_BYTES(qpc), GFP_KERNEL); + if (!qpc) return -ENOMEM; pd = to_mpd(qp->ibqp.pd); - context->flags = cpu_to_be32(mlx5_st << 16); + MLX5_SET(qpc, qpc, st, mlx5_st); if (!(attr_mask & IB_QP_PATH_MIG_STATE)) { - context->flags |= cpu_to_be32(MLX5_QP_PM_MIGRATED << 11); + MLX5_SET(qpc, qpc, pm_state, MLX5_QP_PM_MIGRATED); } else { switch (attr->path_mig_state) { case IB_MIG_MIGRATED: - context->flags |= cpu_to_be32(MLX5_QP_PM_MIGRATED << 11); + MLX5_SET(qpc, qpc, pm_state, MLX5_QP_PM_MIGRATED); break; case IB_MIG_REARM: - context->flags |= cpu_to_be32(MLX5_QP_PM_REARM << 11); + MLX5_SET(qpc, qpc, pm_state, MLX5_QP_PM_REARM); break; case IB_MIG_ARMED: - context->flags |= cpu_to_be32(MLX5_QP_PM_ARMED << 11); + MLX5_SET(qpc, qpc, pm_state, MLX5_QP_PM_ARMED); break; } } @@ -3799,19 +3791,20 @@ static int __mlx5_ib_modify_qp(struct ib_qp *ibqp, tx_affinity = get_tx_affinity(ibqp, attr, attr_mask, cur_state == IB_QPS_RESET && new_state == IB_QPS_INIT, udata); - if (tx_affinity) { - context->flags |= cpu_to_be32(tx_affinity << 24); - if (new_state == IB_QPS_RTR && - MLX5_CAP_GEN(dev->mdev, init2_lag_tx_port_affinity)) - optpar |= MLX5_QP_OPTPAR_LAG_TX_AFF; - } + + MLX5_SET(qpc, qpc, lag_tx_port_affinity, tx_affinity); + if (tx_affinity && new_state == IB_QPS_RTR && + MLX5_CAP_GEN(dev->mdev, init2_lag_tx_port_affinity)) + optpar |= MLX5_QP_OPTPAR_LAG_TX_AFF; if (is_sqp(ibqp->qp_type)) { - context->mtu_msgmax = (IB_MTU_256 << 5) | 8; + MLX5_SET(qpc, qpc, mtu, IB_MTU_256); + MLX5_SET(qpc, qpc, log_msg_max, 8); } else if ((ibqp->qp_type == IB_QPT_UD && !(qp->flags & IB_QP_CREATE_SOURCE_QPN)) || ibqp->qp_type == MLX5_IB_QPT_REG_UMR) { - context->mtu_msgmax = (IB_MTU_4096 << 5) | 12; + MLX5_SET(qpc, qpc, mtu, IB_MTU_4096); + MLX5_SET(qpc, qpc, log_msg_max, 12); } else if (attr_mask & IB_QP_PATH_MTU) { if (attr->path_mtu < IB_MTU_256 || attr->path_mtu > IB_MTU_4096) { @@ -3819,40 +3812,45 @@ static int __mlx5_ib_modify_qp(struct ib_qp *ibqp, err = -EINVAL; goto out; } - context->mtu_msgmax = (attr->path_mtu << 5) | - (u8)MLX5_CAP_GEN(dev->mdev, log_max_msg); + MLX5_SET(qpc, qpc, mtu, attr->path_mtu); + MLX5_SET(qpc, qpc, log_msg_max, + MLX5_CAP_GEN(dev->mdev, log_max_msg)); } if (attr_mask & IB_QP_DEST_QPN) - context->log_pg_sz_remote_qpn = cpu_to_be32(attr->dest_qp_num); + MLX5_SET(qpc, qpc, remote_qpn, attr->dest_qp_num); + + pri_path = MLX5_ADDR_OF(qpc, qpc, primary_address_path); + alt_path = MLX5_ADDR_OF(qpc, qpc, secondary_address_path); if (attr_mask & IB_QP_PKEY_INDEX) - context->pri_path.pkey_index = cpu_to_be16(attr->pkey_index); + MLX5_SET(ads, pri_path, pkey_index, attr->pkey_index); /* todo implement counter_index functionality */ if (is_sqp(ibqp->qp_type)) - context->pri_path.port = qp->port; + MLX5_SET(ads, pri_path, vhca_port_num, qp->port); if (attr_mask & IB_QP_PORT) 
- context->pri_path.port = attr->port_num; + MLX5_SET(ads, pri_path, vhca_port_num, attr->port_num); if (attr_mask & IB_QP_AV) { - err = mlx5_set_path(dev, qp, &attr->ah_attr, &context->pri_path, - attr_mask & IB_QP_PORT ? attr->port_num : qp->port, + err = mlx5_set_path(dev, qp, &attr->ah_attr, pri_path, + attr_mask & IB_QP_PORT ? attr->port_num : + qp->port, attr_mask, 0, attr, false); if (err) goto out; } if (attr_mask & IB_QP_TIMEOUT) - context->pri_path.ackto_lt |= attr->timeout << 3; + MLX5_SET(ads, pri_path, ack_timeout, attr->timeout); if (attr_mask & IB_QP_ALT_PATH) { - err = mlx5_set_path(dev, qp, &attr->alt_ah_attr, - &context->alt_path, + err = mlx5_set_path(dev, qp, &attr->alt_ah_attr, alt_path, attr->alt_port_num, - attr_mask | IB_QP_PKEY_INDEX | IB_QP_TIMEOUT, + attr_mask | IB_QP_PKEY_INDEX | + IB_QP_TIMEOUT, 0, attr, true); if (err) goto out; @@ -3861,53 +3859,47 @@ static int __mlx5_ib_modify_qp(struct ib_qp *ibqp, get_cqs(qp->ibqp.qp_type, qp->ibqp.send_cq, qp->ibqp.recv_cq, &send_cq, &recv_cq); - context->flags_pd = cpu_to_be32(pd ? pd->pdn : to_mpd(dev->devr.p0)->pdn); - context->cqn_send = send_cq ? cpu_to_be32(send_cq->mcq.cqn) : 0; - context->cqn_recv = recv_cq ? cpu_to_be32(recv_cq->mcq.cqn) : 0; - context->params1 = cpu_to_be32(MLX5_IB_ACK_REQ_FREQ << 28); + MLX5_SET(qpc, qpc, pd, pd ? pd->pdn : to_mpd(dev->devr.p0)->pdn); + if (send_cq) + MLX5_SET(qpc, qpc, cqn_snd, send_cq->mcq.cqn); + if (recv_cq) + MLX5_SET(qpc, qpc, cqn_rcv, recv_cq->mcq.cqn); + + MLX5_SET(qpc, qpc, log_ack_req_freq, MLX5_IB_ACK_REQ_FREQ); if (attr_mask & IB_QP_RNR_RETRY) - context->params1 |= cpu_to_be32(attr->rnr_retry << 13); + MLX5_SET(qpc, qpc, rnr_retry, attr->rnr_retry); if (attr_mask & IB_QP_RETRY_CNT) - context->params1 |= cpu_to_be32(attr->retry_cnt << 16); + MLX5_SET(qpc, qpc, retry_count, attr->retry_cnt); - if (attr_mask & IB_QP_MAX_QP_RD_ATOMIC) { - if (attr->max_rd_atomic) - context->params1 |= - cpu_to_be32(fls(attr->max_rd_atomic - 1) << 21); - } + if (attr_mask & IB_QP_MAX_QP_RD_ATOMIC && attr->max_rd_atomic) + MLX5_SET(qpc, qpc, log_sra_max, ilog2(attr->max_rd_atomic)); if (attr_mask & IB_QP_SQ_PSN) - context->next_send_psn = cpu_to_be32(attr->sq_psn); + MLX5_SET(qpc, qpc, next_send_psn, attr->sq_psn); - if (attr_mask & IB_QP_MAX_DEST_RD_ATOMIC) { - if (attr->max_dest_rd_atomic) - context->params2 |= - cpu_to_be32(fls(attr->max_dest_rd_atomic - 1) << 21); - } + if (attr_mask & IB_QP_MAX_DEST_RD_ATOMIC && attr->max_dest_rd_atomic) + MLX5_SET(qpc, qpc, log_rra_max, + ilog2(attr->max_dest_rd_atomic)); if (attr_mask & (IB_QP_ACCESS_FLAGS | IB_QP_MAX_DEST_RD_ATOMIC)) { - __be32 access_flags; - - err = to_mlx5_access_flags(qp, attr, attr_mask, &access_flags); + err = set_qpc_atomic_flags(qp, attr, attr_mask, qpc); if (err) goto out; - - context->params2 |= access_flags; } if (attr_mask & IB_QP_MIN_RNR_TIMER) - context->rnr_nextrecvpsn |= cpu_to_be32(attr->min_rnr_timer << 24); + MLX5_SET(qpc, qpc, min_rnr_nak, attr->min_rnr_timer); if (attr_mask & IB_QP_RQ_PSN) - context->rnr_nextrecvpsn |= cpu_to_be32(attr->rq_psn); + MLX5_SET(qpc, qpc, next_rcv_psn, attr->rq_psn); if (attr_mask & IB_QP_QKEY) - context->qkey = cpu_to_be32(attr->qkey); + MLX5_SET(qpc, qpc, q_key, attr->qkey); if (qp->rq.wqe_cnt && cur_state == IB_QPS_RESET && new_state == IB_QPS_INIT) - context->db_rec_addr = cpu_to_be64(qp->db.dma); + MLX5_SET64(qpc, qpc, dbr_addr, qp->db.dma); if (cur_state == IB_QPS_RESET && new_state == IB_QPS_INIT) { u8 port_num = (attr_mask & IB_QP_PORT ? 
attr->port_num : @@ -3921,15 +3913,14 @@ static int __mlx5_ib_modify_qp(struct ib_qp *ibqp, set_id = ibqp->counter->id; else set_id = mlx5_ib_get_counters_id(dev, port_num); - context->qp_counter_set_usr_page |= - cpu_to_be32(set_id << 24); + MLX5_SET(qpc, qpc, counter_set_id, set_id); } if (!ibqp->uobject && cur_state == IB_QPS_RESET && new_state == IB_QPS_INIT) - context->sq_crq_size |= cpu_to_be16(1 << 4); + MLX5_SET(qpc, qpc, rlky, 1); if (qp->flags & MLX5_IB_QP_CREATE_SQPN_QP1) - context->deth_sqpn = cpu_to_be32(1); + MLX5_SET(qpc, qpc, deth_sqpn, 1); mlx5_cur = to_mlx5_state(cur_state); mlx5_new = to_mlx5_state(new_state); @@ -3987,7 +3978,7 @@ static int __mlx5_ib_modify_qp(struct ib_qp *ibqp, err = modify_raw_packet_qp(dev, qp, &raw_qp_param, tx_affinity); } else { - err = mlx5_core_qp_modify(dev, op, optpar, context, &base->mqp); + err = mlx5_core_qp_modify(dev, op, optpar, qpc, &base->mqp); } if (err) @@ -4034,7 +4025,7 @@ static int __mlx5_ib_modify_qp(struct ib_qp *ibqp, } out: - kfree(context); + kfree(qpc); return err; } diff --git a/include/linux/mlx5/qp.h b/include/linux/mlx5/qp.h index def601199a1a..b8992b861ae6 100644 --- a/include/linux/mlx5/qp.h +++ b/include/linux/mlx5/qp.h @@ -495,72 +495,6 @@ struct mlx5_core_dct { struct completion drained; }; -struct mlx5_qp_path { - u8 fl_free_ar; - u8 rsvd3; - __be16 pkey_index; - u8 rsvd0; - u8 grh_mlid; - __be16 rlid; - u8 ackto_lt; - u8 mgid_index; - u8 static_rate; - u8 hop_limit; - __be32 tclass_flowlabel; - union { - u8 rgid[16]; - u8 rip[16]; - }; - u8 f_dscp_ecn_prio; - u8 ecn_dscp; - __be16 udp_sport; - u8 dci_cfi_prio_sl; - u8 port; - u8 rmac[6]; -}; - -/* FIXME: use mlx5_ifc.h qpc */ -struct mlx5_qp_context { - __be32 flags; - __be32 flags_pd; - u8 mtu_msgmax; - u8 rq_size_stride; - __be16 sq_crq_size; - __be32 qp_counter_set_usr_page; - __be32 wire_qpn; - __be32 log_pg_sz_remote_qpn; - struct mlx5_qp_path pri_path; - struct mlx5_qp_path alt_path; - __be32 params1; - u8 reserved2[4]; - __be32 next_send_psn; - __be32 cqn_send; - __be32 deth_sqpn; - u8 reserved3[4]; - __be32 last_acked_psn; - __be32 ssn; - __be32 params2; - __be32 rnr_nextrecvpsn; - __be32 xrcd; - __be32 cqn_recv; - __be64 db_rec_addr; - __be32 qkey; - __be32 rq_type_srqn; - __be32 rmsn; - __be16 hw_sq_wqe_counter; - __be16 sw_sq_wqe_counter; - __be16 hw_rcyclic_byte_counter; - __be16 hw_rq_counter; - __be16 sw_rcyclic_byte_counter; - __be16 sw_rq_counter; - u8 rsvd0[5]; - u8 cgs; - u8 cs_req; - u8 cs_res; - __be64 dc_access_key; - u8 rsvd1[24]; -}; - int mlx5_debug_qp_add(struct mlx5_core_dev *dev, struct mlx5_core_qp *qp); void mlx5_debug_qp_remove(struct mlx5_core_dev *dev, struct mlx5_core_qp *qp); -- cgit v1.2.3 From 3910ebaca8eae0cb9d41a20efe1bcb375ec64dfb Mon Sep 17 00:00:00 2001 From: Krzysztof Wilczyński Date: Tue, 26 May 2020 21:39:05 +0000 Subject: PCI: Rename _DSM constants to align with spec MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Rename PCI-related _DSM constants to align them with the PCI Firmware Spec, r3.2, sec 4.6. No functional change intended. 
Link: https://lore.kernel.org/r/20200526213905.2479381-1-kw@linux.com Signed-off-by: Krzysztof Wilczyński Signed-off-by: Bjorn Helgaas --- drivers/acpi/pci_root.c | 2 +- drivers/pci/pci-acpi.c | 4 ++-- drivers/pci/pci-label.c | 4 ++-- include/linux/pci-acpi.h | 10 ++++++---- 4 files changed, 11 insertions(+), 9 deletions(-) (limited to 'include/linux') diff --git a/drivers/acpi/pci_root.c b/drivers/acpi/pci_root.c index ac8ad6cb82aa..191204a4abe9 100644 --- a/drivers/acpi/pci_root.c +++ b/drivers/acpi/pci_root.c @@ -938,7 +938,7 @@ struct pci_bus *acpi_pci_root_create(struct acpi_pci_root *root, * assignments made by firmware for this host bridge. */ obj = acpi_evaluate_dsm(ACPI_HANDLE(bus->bridge), &pci_acpi_dsm_guid, 1, - IGNORE_PCI_BOOT_CONFIG_DSM, NULL); + DSM_PCI_PRESERVE_BOOT_CONFIG, NULL); if (obj && obj->type == ACPI_TYPE_INTEGER && obj->integer.value == 0) host_bridge->preserve_config = 1; ACPI_FREE(obj); diff --git a/drivers/pci/pci-acpi.c b/drivers/pci/pci-acpi.c index d820a55ae71c..7224b1e5f2a8 100644 --- a/drivers/pci/pci-acpi.c +++ b/drivers/pci/pci-acpi.c @@ -1128,7 +1128,7 @@ void acpi_pci_add_bus(struct pci_bus *bus) return; obj = acpi_evaluate_dsm(ACPI_HANDLE(bus->bridge), &pci_acpi_dsm_guid, 3, - RESET_DELAY_DSM, NULL); + DSM_PCI_POWER_ON_RESET_DELAY, NULL); if (!obj) return; @@ -1193,7 +1193,7 @@ static void pci_acpi_optimize_delay(struct pci_dev *pdev, pdev->d3cold_delay = 0; obj = acpi_evaluate_dsm(handle, &pci_acpi_dsm_guid, 3, - FUNCTION_DELAY_DSM, NULL); + DSM_PCI_DEVICE_READINESS_DURATIONS, NULL); if (!obj) return; diff --git a/drivers/pci/pci-label.c b/drivers/pci/pci-label.c index a5910f942857..707dd9808676 100644 --- a/drivers/pci/pci-label.c +++ b/drivers/pci/pci-label.c @@ -178,7 +178,7 @@ static int dsm_get_label(struct device *dev, char *buf, return -1; obj = acpi_evaluate_dsm(handle, &pci_acpi_dsm_guid, 0x2, - DEVICE_LABEL_DSM, NULL); + DSM_PCI_DEVICE_NAME, NULL); if (!obj) return -1; @@ -218,7 +218,7 @@ static bool device_has_dsm(struct device *dev) return false; return !!acpi_check_dsm(handle, &pci_acpi_dsm_guid, 0x2, - 1 << DEVICE_LABEL_DSM); + 1 << DSM_PCI_DEVICE_NAME); } static umode_t acpi_index_string_exist(struct kobject *kobj, diff --git a/include/linux/pci-acpi.h b/include/linux/pci-acpi.h index 2d155bfb8fbf..a3bb8e768778 100644 --- a/include/linux/pci-acpi.h +++ b/include/linux/pci-acpi.h @@ -107,10 +107,12 @@ static inline void acpiphp_check_host_bridge(struct acpi_device *adev) { } #endif extern const guid_t pci_acpi_dsm_guid; -#define IGNORE_PCI_BOOT_CONFIG_DSM 0x05 -#define DEVICE_LABEL_DSM 0x07 -#define RESET_DELAY_DSM 0x08 -#define FUNCTION_DELAY_DSM 0x09 + +/* _DSM Definitions for PCI */ +#define DSM_PCI_PRESERVE_BOOT_CONFIG 0x05 +#define DSM_PCI_DEVICE_NAME 0x07 +#define DSM_PCI_POWER_ON_RESET_DELAY 0x08 +#define DSM_PCI_DEVICE_READINESS_DURATIONS 0x09 #ifdef CONFIG_PCIE_EDR void pci_acpi_add_edr_notifier(struct pci_dev *pdev); -- cgit v1.2.3 From 7a15b2e013f535a125ad7351ffc808c79bc6de35 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 27 May 2020 20:22:29 +0200 Subject: net: remove kernel_getsockopt No users left. Signed-off-by: Christoph Hellwig Signed-off-by: David S. 
Miller --- include/linux/net.h | 2 -- net/socket.c | 34 ---------------------------------- 2 files changed, 36 deletions(-) (limited to 'include/linux') diff --git a/include/linux/net.h b/include/linux/net.h index 6451425e828f..74ef5d7315f7 100644 --- a/include/linux/net.h +++ b/include/linux/net.h @@ -303,8 +303,6 @@ int kernel_connect(struct socket *sock, struct sockaddr *addr, int addrlen, int flags); int kernel_getsockname(struct socket *sock, struct sockaddr *addr); int kernel_getpeername(struct socket *sock, struct sockaddr *addr); -int kernel_getsockopt(struct socket *sock, int level, int optname, char *optval, - int *optlen); int kernel_setsockopt(struct socket *sock, int level, int optname, char *optval, unsigned int optlen); int kernel_sendpage(struct socket *sock, struct page *page, int offset, diff --git a/net/socket.c b/net/socket.c index 80422fc3c836..81a98b6cbd08 100644 --- a/net/socket.c +++ b/net/socket.c @@ -3624,40 +3624,6 @@ int kernel_getpeername(struct socket *sock, struct sockaddr *addr) } EXPORT_SYMBOL(kernel_getpeername); -/** - * kernel_getsockopt - get a socket option (kernel space) - * @sock: socket - * @level: API level (SOL_SOCKET, ...) - * @optname: option tag - * @optval: option value - * @optlen: option length - * - * Assigns the option length to @optlen. - * Returns 0 or an error. - */ - -int kernel_getsockopt(struct socket *sock, int level, int optname, - char *optval, int *optlen) -{ - mm_segment_t oldfs = get_fs(); - char __user *uoptval; - int __user *uoptlen; - int err; - - uoptval = (char __user __force *) optval; - uoptlen = (int __user __force *) optlen; - - set_fs(KERNEL_DS); - if (level == SOL_SOCKET) - err = sock_getsockopt(sock, level, optname, uoptval, uoptlen); - else - err = sock->ops->getsockopt(sock, level, optname, uoptval, - uoptlen); - set_fs(oldfs); - return err; -} -EXPORT_SYMBOL(kernel_getsockopt); - /** * kernel_setsockopt - set a socket option (kernel space) * @sock: socket -- cgit v1.2.3 From 1ac71ec0130cce5bed3ec11ffc88651097a24173 Mon Sep 17 00:00:00 2001 From: Tudor Ambarus Date: Tue, 28 Apr 2020 08:47:43 +0000 Subject: mtd: spi-nor: Fix SPI NOR acronym The correct terminology is serial NOR flash or SPI NOR. s/SPI-NOR/SPI NOR and s/spi-nor/SPI NOR across the subsystem. Signed-off-by: Tudor Ambarus Reviewed-by: Sergei Shtylyov --- drivers/mtd/spi-nor/Kconfig | 4 ++-- drivers/mtd/spi-nor/controllers/Kconfig | 4 ++-- drivers/mtd/spi-nor/controllers/aspeed-smc.c | 2 +- drivers/mtd/spi-nor/controllers/hisi-sfc.c | 2 +- drivers/mtd/spi-nor/controllers/nxp-spifi.c | 2 +- drivers/mtd/spi-nor/core.c | 4 ++-- include/linux/mtd/spi-nor.h | 8 ++++---- 7 files changed, 13 insertions(+), 13 deletions(-) (limited to 'include/linux') diff --git a/drivers/mtd/spi-nor/Kconfig b/drivers/mtd/spi-nor/Kconfig index 6e816eafb312..ffc4b380f2b1 100644 --- a/drivers/mtd/spi-nor/Kconfig +++ b/drivers/mtd/spi-nor/Kconfig @@ -1,12 +1,12 @@ # SPDX-License-Identifier: GPL-2.0-only menuconfig MTD_SPI_NOR - tristate "SPI-NOR device support" + tristate "SPI NOR device support" depends on MTD depends on MTD && SPI_MASTER select SPI_MEM help This is the framework for the SPI NOR which can be used by the SPI - device drivers and the SPI-NOR device driver. + device drivers and the SPI NOR device driver. 
if MTD_SPI_NOR diff --git a/drivers/mtd/spi-nor/controllers/Kconfig b/drivers/mtd/spi-nor/controllers/Kconfig index 10b86660b821..d89a5ea9446a 100644 --- a/drivers/mtd/spi-nor/controllers/Kconfig +++ b/drivers/mtd/spi-nor/controllers/Kconfig @@ -21,11 +21,11 @@ config SPI_CADENCE_QUADSPI Flash as an MTD device. config SPI_HISI_SFC - tristate "Hisilicon FMC SPI-NOR Flash Controller(SFC)" + tristate "Hisilicon FMC SPI NOR Flash Controller(SFC)" depends on ARCH_HISI || COMPILE_TEST depends on HAS_IOMEM help - This enables support for HiSilicon FMC SPI-NOR flash controller. + This enables support for HiSilicon FMC SPI NOR flash controller. config SPI_NXP_SPIFI tristate "NXP SPI Flash Interface (SPIFI)" diff --git a/drivers/mtd/spi-nor/controllers/aspeed-smc.c b/drivers/mtd/spi-nor/controllers/aspeed-smc.c index ae85e4c0e114..7225870e8b18 100644 --- a/drivers/mtd/spi-nor/controllers/aspeed-smc.c +++ b/drivers/mtd/spi-nor/controllers/aspeed-smc.c @@ -727,7 +727,7 @@ static int aspeed_smc_chip_setup_finish(struct aspeed_smc_chip *chip) /* * TODO: Adjust clocks if fast read is supported and interpret - * SPI-NOR flags to adjust controller settings. + * SPI NOR flags to adjust controller settings. */ if (chip->nor.read_proto == SNOR_PROTO_1_1_1) { if (chip->nor.read_dummy == 0) diff --git a/drivers/mtd/spi-nor/controllers/hisi-sfc.c b/drivers/mtd/spi-nor/controllers/hisi-sfc.c index 6c7a4118752e..95c502173cbd 100644 --- a/drivers/mtd/spi-nor/controllers/hisi-sfc.c +++ b/drivers/mtd/spi-nor/controllers/hisi-sfc.c @@ -1,6 +1,6 @@ // SPDX-License-Identifier: GPL-2.0-or-later /* - * HiSilicon FMC SPI-NOR flash controller driver + * HiSilicon FMC SPI NOR flash controller driver * * Copyright (c) 2015-2016 HiSilicon Technologies Co., Ltd. */ diff --git a/drivers/mtd/spi-nor/controllers/nxp-spifi.c b/drivers/mtd/spi-nor/controllers/nxp-spifi.c index 9a5b1a7c636a..5703e8313980 100644 --- a/drivers/mtd/spi-nor/controllers/nxp-spifi.c +++ b/drivers/mtd/spi-nor/controllers/nxp-spifi.c @@ -1,6 +1,6 @@ // SPDX-License-Identifier: GPL-2.0-only /* - * SPI-NOR driver for NXP SPI Flash Interface (SPIFI) + * SPI NOR driver for NXP SPI Flash Interface (SPIFI) * * Copyright (C) 2015 Joachim Eastwood * diff --git a/drivers/mtd/spi-nor/core.c b/drivers/mtd/spi-nor/core.c index 1ab4386a099a..0369d98b2d12 100644 --- a/drivers/mtd/spi-nor/core.c +++ b/drivers/mtd/spi-nor/core.c @@ -2469,7 +2469,7 @@ static int spi_nor_select_read(struct spi_nor *nor, nor->read_proto = read->proto; /* - * In the spi-nor framework, we don't need to make the difference + * In the SPI NOR framework, we don't need to make the difference * between mode clock cycles and wait state clock cycles. * Indeed, the value of the mode clock cycles is used by a QSPI * flash memory to know whether it should enter or leave its 0-4-4 @@ -3126,7 +3126,7 @@ int spi_nor_scan(struct spi_nor *nor, const char *name, /* * Make sure the XSR_RDY flag is set before calling * spi_nor_wait_till_ready(). Xilinx S3AN share MFR - * with Atmel spi-nor + * with Atmel SPI NOR. */ if (info->flags & SPI_NOR_XSR_RDY) nor->flags |= SNOR_F_READY_XSR_RDY; diff --git a/include/linux/mtd/spi-nor.h b/include/linux/mtd/spi-nor.h index bebff2729c18..60bac2c0ec45 100644 --- a/include/linux/mtd/spi-nor.h +++ b/include/linux/mtd/spi-nor.h @@ -302,7 +302,7 @@ struct spi_nor; * @read: read data from the SPI NOR. * @write: write data to the SPI NOR. 
* @erase: erase a sector of the SPI NOR at the offset @offs; if - * not provided by the driver, spi-nor will send the erase + * not provided by the driver, SPI NOR will send the erase * opcode via write_reg(). */ struct spi_nor_controller_ops { @@ -336,7 +336,7 @@ struct spi_nor_flash_parameter; * layer is not DMA-able * @bouncebuf_size: size of the bounce buffer * @info: SPI NOR part JEDEC MFR ID and other info - * @manufacturer: spi-nor manufacturer + * @manufacturer: SPI NOR manufacturer * @page_size: the page size of the SPI NOR * @addr_width: number of address bytes * @erase_opcode: the opcode for erasing a sector @@ -344,12 +344,12 @@ struct spi_nor_flash_parameter; * @read_dummy: the dummy needed by the read operation * @program_opcode: the program opcode * @sst_write_second: used by the SST write operation - * @flags: flag options for the current SPI-NOR (SNOR_F_*) + * @flags: flag options for the current SPI NOR (SNOR_F_*) * @read_proto: the SPI protocol for read operations * @write_proto: the SPI protocol for write operations * @reg_proto: the SPI protocol for read_reg/write_reg/erase operations * @controller_ops: SPI NOR controller driver specific operations. - * @params: [FLASH-SPECIFIC] SPI-NOR flash parameters and settings. + * @params: [FLASH-SPECIFIC] SPI NOR flash parameters and settings. * The structure includes legacy flash parameters and * settings that can be overwritten by the spi_nor_fixups * hooks, or dynamically when parsing the SFDP tables. -- cgit v1.2.3 From 91710728d1725de51d06b40674abf6e860d592c7 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 27 May 2020 22:11:13 +0200 Subject: locking: Introduce local_lock() preempt_disable() and local_irq_disable/save() are in principle per-CPU big kernel locks. This has several downsides: - The protection scope is unknown - Violation of protection rules is hard to detect by instrumentation - For PREEMPT_RT such sections, unless in low level critical code, can violate the preemptability constraints. To address this, PREEMPT_RT introduced the concept of local_locks, which are strictly per CPU. The lock operations map to preempt_disable(), local_irq_disable/save() and the enabling counterparts on non-RT enabled kernels. If lockdep is enabled, local locks gain a lock map which tracks the usage context. This will catch cases where an area is protected by preempt_disable() but the access also happens from interrupt context. local locks have identified quite a few such issues over the years, the most recent example is: b7d5dc21072cd ("random: add a spinlock_t to struct batched_entropy") Aside from the lockdep coverage, this also improves code readability as it precisely annotates the protection scope. PREEMPT_RT substitutes these local locks with 'sleeping' spinlocks to protect such sections while maintaining preemptability and CPU locality. local locks can replace: - preempt_enable()/disable() pairs - local_irq_disable/enable() pairs - local_irq_save/restore() pairs They are also used to replace code which implicitly disables preemption, like: - get_cpu()/put_cpu() - get_cpu_var()/put_cpu_var() with PREEMPT_RT friendly constructs.
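A minimal usage sketch, based on the API introduced below (the per-CPU structure and function names are illustrative):

	#include <linux/local_lock.h>
	#include <linux/percpu.h>

	struct foo_pcpu {
		local_lock_t lock;	/* names the protection scope */
		unsigned int count;
	};

	static DEFINE_PER_CPU(struct foo_pcpu, foo_pcpu) = {
		.lock = INIT_LOCAL_LOCK(lock),
	};

	static void foo_count_inc(void)
	{
		/* preempt_disable() on !PREEMPT_RT, per-CPU spinlock on RT */
		local_lock(&foo_pcpu.lock);
		this_cpu_inc(foo_pcpu.count);
		local_unlock(&foo_pcpu.lock);
	}

With lockdep enabled, calling foo_count_inc() from interrupt context now produces a warning; the same mistake under a bare preempt_disable() would go unnoticed.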
Signed-off-by: Thomas Gleixner Signed-off-by: Sebastian Andrzej Siewior Signed-off-by: Ingo Molnar Acked-by: Peter Zijlstra Link: https://lore.kernel.org/r/20200527201119.1692513-2-bigeasy@linutronix.de --- Documentation/locking/locktypes.rst | 215 ++++++++++++++++++++++++++++++++++-- include/linux/local_lock.h | 54 +++++++++ include/linux/local_lock_internal.h | 90 +++++++++++++++ 3 files changed, 348 insertions(+), 11 deletions(-) create mode 100644 include/linux/local_lock.h create mode 100644 include/linux/local_lock_internal.h (limited to 'include/linux') diff --git a/Documentation/locking/locktypes.rst b/Documentation/locking/locktypes.rst index 09f45ce38d26..1b577a8bf982 100644 --- a/Documentation/locking/locktypes.rst +++ b/Documentation/locking/locktypes.rst @@ -13,6 +13,7 @@ The kernel provides a variety of locking primitives which can be divided into two categories: - Sleeping locks + - CPU local locks - Spinning locks This document conceptually describes these lock types and provides rules @@ -44,9 +45,23 @@ Sleeping lock types: On PREEMPT_RT kernels, these lock types are converted to sleeping locks: + - local_lock - spinlock_t - rwlock_t + +CPU local locks +--------------- + + - local_lock + +On non-PREEMPT_RT kernels, local_lock functions are wrappers around +preemption and interrupt disabling primitives. Contrary to other locking +mechanisms, disabling preemption or interrupts are pure CPU local +concurrency control mechanisms and not suited for inter-CPU concurrency +control. + + Spinning locks -------------- @@ -67,6 +82,7 @@ can have suffixes which apply further protections: _irqsave/restore() Save and disable / restore interrupt disabled state =================== ==================================================== + Owner semantics =============== @@ -139,6 +155,56 @@ implementation, thus changing the fairness: writer from starving readers. +local_lock +========== + +local_lock provides a named scope to critical sections which are protected +by disabling preemption or interrupts. + +On non-PREEMPT_RT kernels local_lock operations map to the preemption and +interrupt disabling and enabling primitives: + + =========================== ====================== + local_lock(&llock) preempt_disable() + local_unlock(&llock) preempt_enable() + local_lock_irq(&llock) local_irq_disable() + local_unlock_irq(&llock) local_irq_enable() + local_lock_save(&llock) local_irq_save() + local_lock_restore(&llock) local_irq_save() + =========================== ====================== + +The named scope of local_lock has two advantages over the regular +primitives: + + - The lock name allows static analysis and is also a clear documentation + of the protection scope while the regular primitives are scopeless and + opaque. + + - If lockdep is enabled the local_lock gains a lockmap which allows to + validate the correctness of the protection. This can detect cases where + e.g. a function using preempt_disable() as protection mechanism is + invoked from interrupt or soft-interrupt context. Aside of that + lockdep_assert_held(&llock) works as with any other locking primitive. + +local_lock and PREEMPT_RT +------------------------- + +PREEMPT_RT kernels map local_lock to a per-CPU spinlock_t, thus changing +semantics: + + - All spinlock_t changes also apply to local_lock. 
+ +local_lock usage +---------------- + +local_lock should be used in situations where disabling preemption or +interrupts is the appropriate form of concurrency control to protect +per-CPU data structures on a non PREEMPT_RT kernel. + +local_lock is not suitable to protect against preemption or interrupts on a +PREEMPT_RT kernel due to the PREEMPT_RT specific spinlock_t semantics. + + raw_spinlock_t and spinlock_t ============================= @@ -258,10 +324,82 @@ implementation, thus changing semantics: PREEMPT_RT caveats ================== +local_lock on RT +---------------- + +The mapping of local_lock to spinlock_t on PREEMPT_RT kernels has a few +implications. For example, on a non-PREEMPT_RT kernel the following code +sequence works as expected:: + + local_lock_irq(&local_lock); + raw_spin_lock(&lock); + +and is fully equivalent to:: + + raw_spin_lock_irq(&lock); + +On a PREEMPT_RT kernel this code sequence breaks because local_lock_irq() +is mapped to a per-CPU spinlock_t which neither disables interrupts nor +preemption. The following code sequence works perfectly correct on both +PREEMPT_RT and non-PREEMPT_RT kernels:: + + local_lock_irq(&local_lock); + spin_lock(&lock); + +Another caveat with local locks is that each local_lock has a specific +protection scope. So the following substitution is wrong:: + + func1() + { + local_irq_save(flags); -> local_lock_irqsave(&local_lock_1, flags); + func3(); + local_irq_restore(flags); -> local_lock_irqrestore(&local_lock_1, flags); + } + + func2() + { + local_irq_save(flags); -> local_lock_irqsave(&local_lock_2, flags); + func3(); + local_irq_restore(flags); -> local_lock_irqrestore(&local_lock_2, flags); + } + + func3() + { + lockdep_assert_irqs_disabled(); + access_protected_data(); + } + +On a non-PREEMPT_RT kernel this works correctly, but on a PREEMPT_RT kernel +local_lock_1 and local_lock_2 are distinct and cannot serialize the callers +of func3(). Also the lockdep assert will trigger on a PREEMPT_RT kernel +because local_lock_irqsave() does not disable interrupts due to the +PREEMPT_RT-specific semantics of spinlock_t. The correct substitution is:: + + func1() + { + local_irq_save(flags); -> local_lock_irqsave(&local_lock, flags); + func3(); + local_irq_restore(flags); -> local_lock_irqrestore(&local_lock, flags); + } + + func2() + { + local_irq_save(flags); -> local_lock_irqsave(&local_lock, flags); + func3(); + local_irq_restore(flags); -> local_lock_irqrestore(&local_lock, flags); + } + + func3() + { + lockdep_assert_held(&local_lock); + access_protected_data(); + } + + spinlock_t and rwlock_t ----------------------- -These changes in spinlock_t and rwlock_t semantics on PREEMPT_RT kernels +The changes in spinlock_t and rwlock_t semantics on PREEMPT_RT kernels have a few implications. For example, on a non-PREEMPT_RT kernel the following code sequence works as expected:: @@ -282,9 +420,61 @@ local_lock mechanism. Acquiring the local_lock pins the task to a CPU, allowing things like per-CPU interrupt disabled locks to be acquired. However, this approach should be used only where absolutely necessary. +A typical scenario is protection of per-CPU variables in thread context:: -raw_spinlock_t --------------- + struct foo *p = get_cpu_ptr(&var1); + + spin_lock(&p->lock); + p->count += this_cpu_read(var2); + +This is correct code on a non-PREEMPT_RT kernel, but on a PREEMPT_RT kernel +this breaks. 
The PREEMPT_RT-specific change of spinlock_t semantics does +not allow to acquire p->lock because get_cpu_ptr() implicitly disables +preemption. The following substitution works on both kernels:: + + struct foo *p; + + migrate_disable(); + p = this_cpu_ptr(&var1); + spin_lock(&p->lock); + p->count += this_cpu_read(var2); + +On a non-PREEMPT_RT kernel migrate_disable() maps to preempt_disable() +which makes the above code fully equivalent. On a PREEMPT_RT kernel +migrate_disable() ensures that the task is pinned on the current CPU which +in turn guarantees that the per-CPU access to var1 and var2 are staying on +the same CPU. + +The migrate_disable() substitution is not valid for the following +scenario:: + + func() + { + struct foo *p; + + migrate_disable(); + p = this_cpu_ptr(&var1); + p->val = func2(); + +While correct on a non-PREEMPT_RT kernel, this breaks on PREEMPT_RT because +here migrate_disable() does not protect against reentrancy from a +preempting task. A correct substitution for this case is:: + + func() + { + struct foo *p; + + local_lock(&foo_lock); + p = this_cpu_ptr(&var1); + p->val = func2(); + +On a non-PREEMPT_RT kernel this protects against reentrancy by disabling +preemption. On a PREEMPT_RT kernel this is achieved by acquiring the +underlying per-CPU spinlock. + + +raw_spinlock_t on RT +-------------------- Acquiring a raw_spinlock_t disables preemption and possibly also interrupts, so the critical section must avoid acquiring a regular @@ -325,22 +515,25 @@ Lock type nesting rules The most basic rules are: - - Lock types of the same lock category (sleeping, spinning) can nest - arbitrarily as long as they respect the general lock ordering rules to - prevent deadlocks. + - Lock types of the same lock category (sleeping, CPU local, spinning) + can nest arbitrarily as long as they respect the general lock ordering + rules to prevent deadlocks. + + - Sleeping lock types cannot nest inside CPU local and spinning lock types. - - Sleeping lock types cannot nest inside spinning lock types. + - CPU local and spinning lock types can nest inside sleeping lock types. - - Spinning lock types can nest inside sleeping lock types. + - Spinning lock types can nest inside all lock types These constraints apply both in PREEMPT_RT and otherwise. The fact that PREEMPT_RT changes the lock category of spinlock_t and -rwlock_t from spinning to sleeping means that they cannot be acquired while -holding a raw spinlock. This results in the following nesting ordering: +rwlock_t from spinning to sleeping and substitutes local_lock with a +per-CPU spinlock_t means that they cannot be acquired while holding a raw +spinlock. 
This results in the following nesting ordering: 1) Sleeping locks - 2) spinlock_t and rwlock_t + 2) spinlock_t, rwlock_t, local_lock 3) raw_spinlock_t and bit spinlocks Lockdep will complain if these constraints are violated, both in diff --git a/include/linux/local_lock.h b/include/linux/local_lock.h new file mode 100644 index 000000000000..e55010fa7329 --- /dev/null +++ b/include/linux/local_lock.h @@ -0,0 +1,54 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _LINUX_LOCAL_LOCK_H +#define _LINUX_LOCAL_LOCK_H + +#include + +/** + * local_lock_init - Runtime initialize a lock instance + */ +#define local_lock_init(lock) __local_lock_init(lock) + +/** + * local_lock - Acquire a per CPU local lock + * @lock: The lock variable + */ +#define local_lock(lock) __local_lock(lock) + +/** + * local_lock_irq - Acquire a per CPU local lock and disable interrupts + * @lock: The lock variable + */ +#define local_lock_irq(lock) __local_lock_irq(lock) + +/** + * local_lock_irqsave - Acquire a per CPU local lock, save and disable + * interrupts + * @lock: The lock variable + * @flags: Storage for interrupt flags + */ +#define local_lock_irqsave(lock, flags) \ + __local_lock_irqsave(lock, flags) + +/** + * local_unlock - Release a per CPU local lock + * @lock: The lock variable + */ +#define local_unlock(lock) __local_unlock(lock) + +/** + * local_unlock_irq - Release a per CPU local lock and enable interrupts + * @lock: The lock variable + */ +#define local_unlock_irq(lock) __local_unlock_irq(lock) + +/** + * local_unlock_irqrestore - Release a per CPU local lock and restore + * interrupt flags + * @lock: The lock variable + * @flags: Interrupt flags to restore + */ +#define local_unlock_irqrestore(lock, flags) \ + __local_unlock_irqrestore(lock, flags) + +#endif diff --git a/include/linux/local_lock_internal.h b/include/linux/local_lock_internal.h new file mode 100644 index 000000000000..4a8795b21d77 --- /dev/null +++ b/include/linux/local_lock_internal.h @@ -0,0 +1,90 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _LINUX_LOCAL_LOCK_H +# error "Do not include directly, include linux/local_lock.h" +#endif + +#include +#include + +typedef struct { +#ifdef CONFIG_DEBUG_LOCK_ALLOC + struct lockdep_map dep_map; + struct task_struct *owner; +#endif +} local_lock_t; + +#ifdef CONFIG_DEBUG_LOCK_ALLOC +# define LL_DEP_MAP_INIT(lockname) \ + .dep_map = { \ + .name = #lockname, \ + .wait_type_inner = LD_WAIT_CONFIG, \ + } +#else +# define LL_DEP_MAP_INIT(lockname) +#endif + +#define INIT_LOCAL_LOCK(lockname) { LL_DEP_MAP_INIT(lockname) } + +#define __local_lock_init(lock) \ +do { \ + static struct lock_class_key __key; \ + \ + debug_check_no_locks_freed((void *)lock, sizeof(*lock));\ + lockdep_init_map_wait(&(lock)->dep_map, #lock, &__key, 0, LD_WAIT_CONFIG);\ +} while (0) + +#ifdef CONFIG_DEBUG_LOCK_ALLOC +static inline void local_lock_acquire(local_lock_t *l) +{ + lock_map_acquire(&l->dep_map); + DEBUG_LOCKS_WARN_ON(l->owner); + l->owner = current; +} + +static inline void local_lock_release(local_lock_t *l) +{ + DEBUG_LOCKS_WARN_ON(l->owner != current); + l->owner = NULL; + lock_map_release(&l->dep_map); +} + +#else /* CONFIG_DEBUG_LOCK_ALLOC */ +static inline void local_lock_acquire(local_lock_t *l) { } +static inline void local_lock_release(local_lock_t *l) { } +#endif /* !CONFIG_DEBUG_LOCK_ALLOC */ + +#define __local_lock(lock) \ + do { \ + preempt_disable(); \ + local_lock_acquire(this_cpu_ptr(lock)); \ + } while (0) + +#define __local_lock_irq(lock) \ + do { \ + local_irq_disable(); \ + 
local_lock_acquire(this_cpu_ptr(lock)); \ + } while (0) + +#define __local_lock_irqsave(lock, flags) \ + do { \ + local_irq_save(flags); \ + local_lock_acquire(this_cpu_ptr(lock)); \ + } while (0) + +#define __local_unlock(lock) \ + do { \ + local_lock_release(this_cpu_ptr(lock)); \ + preempt_enable(); \ + } while (0) + +#define __local_unlock_irq(lock) \ + do { \ + local_lock_release(this_cpu_ptr(lock)); \ + local_irq_enable(); \ + } while (0) + +#define __local_unlock_irqrestore(lock, flags) \ + do { \ + local_lock_release(this_cpu_ptr(lock)); \ + local_irq_restore(flags); \ + } while (0) -- cgit v1.2.3 From cfa6705d89b6562f79c40c249f8d94073c4276e4 Mon Sep 17 00:00:00 2001 From: Sebastian Andrzej Siewior Date: Wed, 27 May 2020 22:11:14 +0200 Subject: radix-tree: Use local_lock for protection The radix-tree and idr preload mechanisms use preempt_disable() to protect the complete operation between xxx_preload() and xxx_preload_end(). As the code inside the preempt disabled section acquires regular spinlocks, which are converted to 'sleeping' spinlocks on a PREEMPT_RT kernel and eventually calls into a memory allocator, this conflicts with the RT semantics. Convert it to a local_lock which allows RT kernels to substitute them with a real per CPU lock. On non RT kernels this maps to preempt_disable() as before, but provides also lockdep coverage of the critical region. No functional change. Signed-off-by: Sebastian Andrzej Siewior Signed-off-by: Ingo Molnar Acked-by: Peter Zijlstra Link: https://lore.kernel.org/r/20200527201119.1692513-3-bigeasy@linutronix.de --- include/linux/idr.h | 2 +- include/linux/radix-tree.h | 11 ++++++++++- lib/radix-tree.c | 20 +++++++++----------- 3 files changed, 20 insertions(+), 13 deletions(-) (limited to 'include/linux') diff --git a/include/linux/idr.h b/include/linux/idr.h index ac6e946b6767..3ade03e5c7af 100644 --- a/include/linux/idr.h +++ b/include/linux/idr.h @@ -171,7 +171,7 @@ static inline bool idr_is_empty(const struct idr *idr) */ static inline void idr_preload_end(void) { - preempt_enable(); + local_unlock(&radix_tree_preloads.lock); } /** diff --git a/include/linux/radix-tree.h b/include/linux/radix-tree.h index 63e62372443a..c2a9f7c90727 100644 --- a/include/linux/radix-tree.h +++ b/include/linux/radix-tree.h @@ -16,11 +16,20 @@ #include #include #include +#include /* Keep unconverted code working */ #define radix_tree_root xarray #define radix_tree_node xa_node +struct radix_tree_preload { + local_lock_t lock; + unsigned nr; + /* nodes->parent points to next preallocated node */ + struct radix_tree_node *nodes; +}; +DECLARE_PER_CPU(struct radix_tree_preload, radix_tree_preloads); + /* * The bottom two bits of the slot determine how the remaining bits in the * slot are interpreted: @@ -245,7 +254,7 @@ int radix_tree_tagged(const struct radix_tree_root *, unsigned int tag); static inline void radix_tree_preload_end(void) { - preempt_enable(); + local_unlock(&radix_tree_preloads.lock); } void __rcu **idr_get_free(struct radix_tree_root *root, diff --git a/lib/radix-tree.c b/lib/radix-tree.c index 2ee6ae3b0ade..34e406fe561f 100644 --- a/lib/radix-tree.c +++ b/lib/radix-tree.c @@ -20,6 +20,7 @@ #include #include #include +#include #include /* in_interrupt() */ #include #include @@ -27,7 +28,6 @@ #include #include - /* * Radix tree node cache. 
*/ @@ -58,12 +58,10 @@ struct kmem_cache *radix_tree_node_cachep; /* * Per-cpu pool of preloaded nodes */ -struct radix_tree_preload { - unsigned nr; - /* nodes->parent points to next preallocated node */ - struct radix_tree_node *nodes; +DEFINE_PER_CPU(struct radix_tree_preload, radix_tree_preloads) = { + .lock = INIT_LOCAL_LOCK(lock), }; -static DEFINE_PER_CPU(struct radix_tree_preload, radix_tree_preloads) = { 0, }; +EXPORT_PER_CPU_SYMBOL_GPL(radix_tree_preloads); static inline struct radix_tree_node *entry_to_node(void *ptr) { @@ -332,14 +330,14 @@ static __must_check int __radix_tree_preload(gfp_t gfp_mask, unsigned nr) */ gfp_mask &= ~__GFP_ACCOUNT; - preempt_disable(); + local_lock(&radix_tree_preloads.lock); rtp = this_cpu_ptr(&radix_tree_preloads); while (rtp->nr < nr) { - preempt_enable(); + local_unlock(&radix_tree_preloads.lock); node = kmem_cache_alloc(radix_tree_node_cachep, gfp_mask); if (node == NULL) goto out; - preempt_disable(); + local_lock(&radix_tree_preloads.lock); rtp = this_cpu_ptr(&radix_tree_preloads); if (rtp->nr < nr) { node->parent = rtp->nodes; @@ -381,7 +379,7 @@ int radix_tree_maybe_preload(gfp_t gfp_mask) if (gfpflags_allow_blocking(gfp_mask)) return __radix_tree_preload(gfp_mask, RADIX_TREE_PRELOAD_SIZE); /* Preloading doesn't help anything with this gfp mask, skip it */ - preempt_disable(); + local_lock(&radix_tree_preloads.lock); return 0; } EXPORT_SYMBOL(radix_tree_maybe_preload); @@ -1470,7 +1468,7 @@ EXPORT_SYMBOL(radix_tree_tagged); void idr_preload(gfp_t gfp_mask) { if (__radix_tree_preload(gfp_mask, IDR_PRELOAD_SIZE)) - preempt_disable(); + local_lock(&radix_tree_preloads.lock); } EXPORT_SYMBOL(idr_preload); -- cgit v1.2.3 From b01b2141999936ac3e4746b7f76c0f204ae4b445 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Wed, 27 May 2020 22:11:15 +0200 Subject: mm/swap: Use local_lock for protection The various struct pagevec per CPU variables are protected by disabling either preemption or interrupts across the critical sections. Inside these sections spinlocks have to be acquired. These spinlocks are regular spinlock_t types which are converted to "sleeping" spinlocks on PREEMPT_RT enabled kernels. Obviously sleeping locks cannot be acquired in preemption or interrupt disabled sections. local locks provide a trivial way to substitute preempt and interrupt disable instances. On a non PREEMPT_RT enabled kernel local_lock() maps to preempt_disable() and local_lock_irq() to local_irq_disable(). Create lru_rotate_pvecs containing the pagevec and the locallock. Create lru_pvecs containing the remaining pagevecs and the locallock. Add lru_add_drain_cpu_zone() which is used from compact_zone() to avoid exporting the pvec structure. Change the relevant call sites to acquire these locks instead of using preempt_disable() / get_cpu() / get_cpu_var() and local_irq_disable() / local_irq_save(). There is neither a functional change nor a change in the generated binary code for non PREEMPT_RT enabled non-debug kernels. When lockdep is enabled local locks have lockdep maps embedded. These allow lockdep to validate the protections, i.e. inappropriate usage of a preemption only protected sections would result in a lockdep warning while the same problem would not be noticed with a plain preempt_disable() based protection. local locks also improve readability as they provide a named scope for the protections while preempt/interrupt disable are opaque scopeless. 
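To make the conversion pattern concrete, here is a minimal sketch in the spirit of this change. The names my_pvecs and my_cache_add are invented for illustration and page refcounting is elided; only the local_lock API itself is taken from the patch:

    struct my_pvecs {
            local_lock_t lock;
            struct pagevec lru_add;
    };
    static DEFINE_PER_CPU(struct my_pvecs, my_pvecs) = {
            .lock = INIT_LOCAL_LOCK(lock),
    };

    static void my_cache_add(struct page *page)
    {
            struct pagevec *pvec;

            /* was: pvec = &get_cpu_var(...); an opaque preempt_disable() */
            local_lock(&my_pvecs.lock);
            pvec = this_cpu_ptr(&my_pvecs.lru_add);
            /* pagevec_add() returns the slots left; drain when full */
            if (!pagevec_add(pvec, page))
                    __pagevec_lru_add(pvec);
            /* was: put_cpu_var(...); */
            local_unlock(&my_pvecs.lock);
    }

On PREEMPT_RT the same code acquires the per-CPU lock backing the local_lock instead of disabling preemption, which is what keeps the spinlock_t acquisitions inside the section legal.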
Finally local locks allow PREEMPT_RT to substitute them with real locking primitives to ensure the correctness of operation in a fully preemptible kernel. [ bigeasy: Adopted to use local_lock ] Signed-off-by: Ingo Molnar Signed-off-by: Sebastian Andrzej Siewior Signed-off-by: Ingo Molnar Acked-by: Peter Zijlstra Link: https://lore.kernel.org/r/20200527201119.1692513-4-bigeasy@linutronix.de --- include/linux/swap.h | 1 + mm/compaction.c | 6 +-- mm/swap.c | 118 ++++++++++++++++++++++++++++++++++----------------- 3 files changed, 82 insertions(+), 43 deletions(-) (limited to 'include/linux') diff --git a/include/linux/swap.h b/include/linux/swap.h index e1bbf7a16b27..25181d2dd0b9 100644 --- a/include/linux/swap.h +++ b/include/linux/swap.h @@ -337,6 +337,7 @@ extern void activate_page(struct page *); extern void mark_page_accessed(struct page *); extern void lru_add_drain(void); extern void lru_add_drain_cpu(int cpu); +extern void lru_add_drain_cpu_zone(struct zone *zone); extern void lru_add_drain_all(void); extern void rotate_reclaimable_page(struct page *page); extern void deactivate_file_page(struct page *page); diff --git a/mm/compaction.c b/mm/compaction.c index 46f0fcc93081..c9d659e6a02c 100644 --- a/mm/compaction.c +++ b/mm/compaction.c @@ -2243,15 +2243,11 @@ check_drain: * would succeed. */ if (cc->order > 0 && last_migrated_pfn) { - int cpu; unsigned long current_block_start = block_start_pfn(cc->migrate_pfn, cc->order); if (last_migrated_pfn < current_block_start) { - cpu = get_cpu(); - lru_add_drain_cpu(cpu); - drain_local_pages(cc->zone); - put_cpu(); + lru_add_drain_cpu_zone(cc->zone); /* No more flushing until we migrate again */ last_migrated_pfn = 0; } diff --git a/mm/swap.c b/mm/swap.c index bf9a79fed62d..0ac463d44cff 100644 --- a/mm/swap.c +++ b/mm/swap.c @@ -35,6 +35,7 @@ #include #include #include +#include #include "internal.h" @@ -44,14 +45,32 @@ /* How many pages do we try to swap or page in/out together? */ int page_cluster; -static DEFINE_PER_CPU(struct pagevec, lru_add_pvec); -static DEFINE_PER_CPU(struct pagevec, lru_rotate_pvecs); -static DEFINE_PER_CPU(struct pagevec, lru_deactivate_file_pvecs); -static DEFINE_PER_CPU(struct pagevec, lru_deactivate_pvecs); -static DEFINE_PER_CPU(struct pagevec, lru_lazyfree_pvecs); +/* Protecting only lru_rotate.pvec which requires disabling interrupts */ +struct lru_rotate { + local_lock_t lock; + struct pagevec pvec; +}; +static DEFINE_PER_CPU(struct lru_rotate, lru_rotate) = { + .lock = INIT_LOCAL_LOCK(lock), +}; + +/* + * The following struct pagevec are grouped together because they are protected + * by disabling preemption (and interrupts remain enabled). 
+ */ +struct lru_pvecs { + local_lock_t lock; + struct pagevec lru_add; + struct pagevec lru_deactivate_file; + struct pagevec lru_deactivate; + struct pagevec lru_lazyfree; #ifdef CONFIG_SMP -static DEFINE_PER_CPU(struct pagevec, activate_page_pvecs); + struct pagevec activate_page; #endif +}; +static DEFINE_PER_CPU(struct lru_pvecs, lru_pvecs) = { + .lock = INIT_LOCAL_LOCK(lock), +}; /* * This path almost never happens for VM activity - pages are normally @@ -254,11 +273,11 @@ void rotate_reclaimable_page(struct page *page) unsigned long flags; get_page(page); - local_irq_save(flags); - pvec = this_cpu_ptr(&lru_rotate_pvecs); + local_lock_irqsave(&lru_rotate.lock, flags); + pvec = this_cpu_ptr(&lru_rotate.pvec); if (!pagevec_add(pvec, page) || PageCompound(page)) pagevec_move_tail(pvec); - local_irq_restore(flags); + local_unlock_irqrestore(&lru_rotate.lock, flags); } } @@ -293,7 +312,7 @@ static void __activate_page(struct page *page, struct lruvec *lruvec, #ifdef CONFIG_SMP static void activate_page_drain(int cpu) { - struct pagevec *pvec = &per_cpu(activate_page_pvecs, cpu); + struct pagevec *pvec = &per_cpu(lru_pvecs.activate_page, cpu); if (pagevec_count(pvec)) pagevec_lru_move_fn(pvec, __activate_page, NULL); @@ -301,19 +320,21 @@ static void activate_page_drain(int cpu) static bool need_activate_page_drain(int cpu) { - return pagevec_count(&per_cpu(activate_page_pvecs, cpu)) != 0; + return pagevec_count(&per_cpu(lru_pvecs.activate_page, cpu)) != 0; } void activate_page(struct page *page) { page = compound_head(page); if (PageLRU(page) && !PageActive(page) && !PageUnevictable(page)) { - struct pagevec *pvec = &get_cpu_var(activate_page_pvecs); + struct pagevec *pvec; + local_lock(&lru_pvecs.lock); + pvec = this_cpu_ptr(&lru_pvecs.activate_page); get_page(page); if (!pagevec_add(pvec, page) || PageCompound(page)) pagevec_lru_move_fn(pvec, __activate_page, NULL); - put_cpu_var(activate_page_pvecs); + local_unlock(&lru_pvecs.lock); } } @@ -335,9 +356,12 @@ void activate_page(struct page *page) static void __lru_cache_activate_page(struct page *page) { - struct pagevec *pvec = &get_cpu_var(lru_add_pvec); + struct pagevec *pvec; int i; + local_lock(&lru_pvecs.lock); + pvec = this_cpu_ptr(&lru_pvecs.lru_add); + /* * Search backwards on the optimistic assumption that the page being * activated has just been added to this pagevec. Note that only @@ -357,7 +381,7 @@ static void __lru_cache_activate_page(struct page *page) } } - put_cpu_var(lru_add_pvec); + local_unlock(&lru_pvecs.lock); } /* @@ -385,7 +409,7 @@ void mark_page_accessed(struct page *page) } else if (!PageActive(page)) { /* * If the page is on the LRU, queue it for activation via - * activate_page_pvecs. Otherwise, assume the page is on a + * lru_pvecs.activate_page. Otherwise, assume the page is on a * pagevec, mark it active and it'll be moved to the active * LRU on the next drain. 
*/ @@ -404,12 +428,14 @@ EXPORT_SYMBOL(mark_page_accessed); static void __lru_cache_add(struct page *page) { - struct pagevec *pvec = &get_cpu_var(lru_add_pvec); + struct pagevec *pvec; + local_lock(&lru_pvecs.lock); + pvec = this_cpu_ptr(&lru_pvecs.lru_add); get_page(page); if (!pagevec_add(pvec, page) || PageCompound(page)) __pagevec_lru_add(pvec); - put_cpu_var(lru_add_pvec); + local_unlock(&lru_pvecs.lock); } /** @@ -593,30 +619,30 @@ static void lru_lazyfree_fn(struct page *page, struct lruvec *lruvec, */ void lru_add_drain_cpu(int cpu) { - struct pagevec *pvec = &per_cpu(lru_add_pvec, cpu); + struct pagevec *pvec = &per_cpu(lru_pvecs.lru_add, cpu); if (pagevec_count(pvec)) __pagevec_lru_add(pvec); - pvec = &per_cpu(lru_rotate_pvecs, cpu); + pvec = &per_cpu(lru_rotate.pvec, cpu); if (pagevec_count(pvec)) { unsigned long flags; /* No harm done if a racing interrupt already did this */ - local_irq_save(flags); + local_lock_irqsave(&lru_rotate.lock, flags); pagevec_move_tail(pvec); - local_irq_restore(flags); + local_unlock_irqrestore(&lru_rotate.lock, flags); } - pvec = &per_cpu(lru_deactivate_file_pvecs, cpu); + pvec = &per_cpu(lru_pvecs.lru_deactivate_file, cpu); if (pagevec_count(pvec)) pagevec_lru_move_fn(pvec, lru_deactivate_file_fn, NULL); - pvec = &per_cpu(lru_deactivate_pvecs, cpu); + pvec = &per_cpu(lru_pvecs.lru_deactivate, cpu); if (pagevec_count(pvec)) pagevec_lru_move_fn(pvec, lru_deactivate_fn, NULL); - pvec = &per_cpu(lru_lazyfree_pvecs, cpu); + pvec = &per_cpu(lru_pvecs.lru_lazyfree, cpu); if (pagevec_count(pvec)) pagevec_lru_move_fn(pvec, lru_lazyfree_fn, NULL); @@ -641,11 +667,14 @@ void deactivate_file_page(struct page *page) return; if (likely(get_page_unless_zero(page))) { - struct pagevec *pvec = &get_cpu_var(lru_deactivate_file_pvecs); + struct pagevec *pvec; + + local_lock(&lru_pvecs.lock); + pvec = this_cpu_ptr(&lru_pvecs.lru_deactivate_file); if (!pagevec_add(pvec, page) || PageCompound(page)) pagevec_lru_move_fn(pvec, lru_deactivate_file_fn, NULL); - put_cpu_var(lru_deactivate_file_pvecs); + local_unlock(&lru_pvecs.lock); } } @@ -660,12 +689,14 @@ void deactivate_file_page(struct page *page) void deactivate_page(struct page *page) { if (PageLRU(page) && PageActive(page) && !PageUnevictable(page)) { - struct pagevec *pvec = &get_cpu_var(lru_deactivate_pvecs); + struct pagevec *pvec; + local_lock(&lru_pvecs.lock); + pvec = this_cpu_ptr(&lru_pvecs.lru_deactivate); get_page(page); if (!pagevec_add(pvec, page) || PageCompound(page)) pagevec_lru_move_fn(pvec, lru_deactivate_fn, NULL); - put_cpu_var(lru_deactivate_pvecs); + local_unlock(&lru_pvecs.lock); } } @@ -680,19 +711,30 @@ void mark_page_lazyfree(struct page *page) { if (PageLRU(page) && PageAnon(page) && PageSwapBacked(page) && !PageSwapCache(page) && !PageUnevictable(page)) { - struct pagevec *pvec = &get_cpu_var(lru_lazyfree_pvecs); + struct pagevec *pvec; + local_lock(&lru_pvecs.lock); + pvec = this_cpu_ptr(&lru_pvecs.lru_lazyfree); get_page(page); if (!pagevec_add(pvec, page) || PageCompound(page)) pagevec_lru_move_fn(pvec, lru_lazyfree_fn, NULL); - put_cpu_var(lru_lazyfree_pvecs); + local_unlock(&lru_pvecs.lock); } } void lru_add_drain(void) { - lru_add_drain_cpu(get_cpu()); - put_cpu(); + local_lock(&lru_pvecs.lock); + lru_add_drain_cpu(smp_processor_id()); + local_unlock(&lru_pvecs.lock); +} + +void lru_add_drain_cpu_zone(struct zone *zone) +{ + local_lock(&lru_pvecs.lock); + lru_add_drain_cpu(smp_processor_id()); + drain_local_pages(zone); + local_unlock(&lru_pvecs.lock); } #ifdef CONFIG_SMP @@ 
-743,11 +785,11 @@ void lru_add_drain_all(void) for_each_online_cpu(cpu) { struct work_struct *work = &per_cpu(lru_add_drain_work, cpu); - if (pagevec_count(&per_cpu(lru_add_pvec, cpu)) || - pagevec_count(&per_cpu(lru_rotate_pvecs, cpu)) || - pagevec_count(&per_cpu(lru_deactivate_file_pvecs, cpu)) || - pagevec_count(&per_cpu(lru_deactivate_pvecs, cpu)) || - pagevec_count(&per_cpu(lru_lazyfree_pvecs, cpu)) || + if (pagevec_count(&per_cpu(lru_pvecs.lru_add, cpu)) || + pagevec_count(&per_cpu(lru_rotate.pvec, cpu)) || + pagevec_count(&per_cpu(lru_pvecs.lru_deactivate_file, cpu)) || + pagevec_count(&per_cpu(lru_pvecs.lru_deactivate, cpu)) || + pagevec_count(&per_cpu(lru_pvecs.lru_lazyfree, cpu)) || need_activate_page_drain(cpu)) { INIT_WORK(work, lru_add_drain_per_cpu); queue_work_on(cpu, mm_percpu_wq, work); -- cgit v1.2.3 From 4b44a21dd640b692d4e9b12d3e37c24825f90baa Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 26 May 2020 18:11:02 +0200 Subject: irq_work, smp: Allow irq_work on call_single_queue Currently irq_work_queue_on() will issue an unconditional arch_send_call_function_single_ipi() and has the handler do irq_work_run(). This is unfortunate in that it makes the IPI handler look at a second cacheline and it misses the opportunity to avoid the IPI. Instead note that struct irq_work and struct __call_single_data are very similar in layout, so use a few bits in the flags word to encode a type and stick the irq_work on the call_single_queue list. Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Ingo Molnar Link: https://lore.kernel.org/r/20200526161908.011635912@infradead.org --- include/linux/irq_work.h | 7 ++- include/linux/smp.h | 23 ++++++++- kernel/irq_work.c | 53 +++++++++++---------- kernel/smp.c | 119 +++++++++++++++++++++++++++++------------------ 4 files changed, 130 insertions(+), 72 deletions(-) (limited to 'include/linux') diff --git a/include/linux/irq_work.h b/include/linux/irq_work.h index 3b752e80c017..f23a359c8f46 100644 --- a/include/linux/irq_work.h +++ b/include/linux/irq_work.h @@ -13,6 +13,8 @@ * busy NULL, 2 -> {free, claimed} : callback in progress, can be claimed */ +/* flags share CSD_FLAG_ space */ + #define IRQ_WORK_PENDING BIT(0) #define IRQ_WORK_BUSY BIT(1) @@ -23,9 +25,12 @@ #define IRQ_WORK_CLAIMED (IRQ_WORK_PENDING | IRQ_WORK_BUSY) +/* + * structure shares layout with single_call_data_t. + */ struct irq_work { - atomic_t flags; struct llist_node llnode; + atomic_t flags; void (*func)(struct irq_work *); }; diff --git a/include/linux/smp.h b/include/linux/smp.h index cbc9162689d0..45ad6e30f398 100644 --- a/include/linux/smp.h +++ b/include/linux/smp.h @@ -16,17 +16,38 @@ typedef void (*smp_call_func_t)(void *info); typedef bool (*smp_cond_func_t)(int cpu, void *info); + +enum { + CSD_FLAG_LOCK = 0x01, + + /* IRQ_WORK_flags */ + + CSD_TYPE_ASYNC = 0x00, + CSD_TYPE_SYNC = 0x10, + CSD_TYPE_IRQ_WORK = 0x20, + CSD_FLAG_TYPE_MASK = 0xF0, +}; + +/* + * structure shares (partial) layout with struct irq_work + */ struct __call_single_data { struct llist_node llist; + unsigned int flags; smp_call_func_t func; void *info; - unsigned int flags; }; /* Use __aligned() to avoid to use 2 cache lines for 1 csd */ typedef struct __call_single_data call_single_data_t __aligned(sizeof(struct __call_single_data)); +/* + * Enqueue a llist_node on the call_single_queue; be very careful, read + * flush_smp_call_function_queue() in detail. 
+ */ +extern void __smp_call_single_queue(int cpu, struct llist_node *node); + /* total number of cpus in this system (may exceed NR_CPUS) */ extern unsigned int total_cpus; diff --git a/kernel/irq_work.c b/kernel/irq_work.c index 48b5d1b6af4d..eca83965b631 100644 --- a/kernel/irq_work.c +++ b/kernel/irq_work.c @@ -31,7 +31,7 @@ static bool irq_work_claim(struct irq_work *work) { int oflags; - oflags = atomic_fetch_or(IRQ_WORK_CLAIMED, &work->flags); + oflags = atomic_fetch_or(IRQ_WORK_CLAIMED | CSD_TYPE_IRQ_WORK, &work->flags); /* * If the work is already pending, no need to raise the IPI. * The pairing atomic_fetch_andnot() in irq_work_run() makes sure @@ -102,8 +102,7 @@ bool irq_work_queue_on(struct irq_work *work, int cpu) if (cpu != smp_processor_id()) { /* Arch remote IPI send/receive backend aren't NMI safe */ WARN_ON_ONCE(in_nmi()); - if (llist_add(&work->llnode, &per_cpu(raised_list, cpu))) - arch_send_call_function_single_ipi(cpu); + __smp_call_single_queue(cpu, &work->llnode); } else { __irq_work_queue_local(work); } @@ -131,6 +130,31 @@ bool irq_work_needs_cpu(void) return true; } +void irq_work_single(void *arg) +{ + struct irq_work *work = arg; + int flags; + + /* + * Clear the PENDING bit, after this point the @work + * can be re-used. + * Make it immediately visible so that other CPUs trying + * to claim that work don't rely on us to handle their data + * while we are in the middle of the func. + */ + flags = atomic_fetch_andnot(IRQ_WORK_PENDING, &work->flags); + + lockdep_irq_work_enter(work); + work->func(work); + lockdep_irq_work_exit(work); + /* + * Clear the BUSY bit and return to the free state if + * no-one else claimed it meanwhile. + */ + flags &= ~IRQ_WORK_PENDING; + (void)atomic_cmpxchg(&work->flags, flags, flags & ~IRQ_WORK_BUSY); +} + static void irq_work_run_list(struct llist_head *list) { struct irq_work *work, *tmp; @@ -142,27 +166,8 @@ static void irq_work_run_list(struct llist_head *list) return; llnode = llist_del_all(list); - llist_for_each_entry_safe(work, tmp, llnode, llnode) { - int flags; - /* - * Clear the PENDING bit, after this point the @work - * can be re-used. - * Make it immediately visible so that other CPUs trying - * to claim that work don't rely on us to handle their data - * while we are in the middle of the func. - */ - flags = atomic_fetch_andnot(IRQ_WORK_PENDING, &work->flags); - - lockdep_irq_work_enter(work); - work->func(work); - lockdep_irq_work_exit(work); - /* - * Clear the BUSY bit and return to the free state if - * no-one else claimed it meanwhile. - */ - flags &= ~IRQ_WORK_PENDING; - (void)atomic_cmpxchg(&work->flags, flags, flags & ~IRQ_WORK_BUSY); - } + llist_for_each_entry_safe(work, tmp, llnode, llnode) + irq_work_single(work); } /* diff --git a/kernel/smp.c b/kernel/smp.c index 9f1181375141..856562b80794 100644 --- a/kernel/smp.c +++ b/kernel/smp.c @@ -23,10 +23,8 @@ #include "smpboot.h" -enum { - CSD_FLAG_LOCK = 0x01, - CSD_FLAG_SYNCHRONOUS = 0x02, -}; + +#define CSD_TYPE(_csd) ((_csd)->flags & CSD_FLAG_TYPE_MASK) struct call_function_data { call_single_data_t __percpu *csd; @@ -137,15 +135,33 @@ static DEFINE_PER_CPU_SHARED_ALIGNED(call_single_data_t, csd_data); extern void send_call_function_single_ipi(int cpu); +void __smp_call_single_queue(int cpu, struct llist_node *node) +{ + /* + * The list addition should be visible before sending the IPI + * handler locks the list to pull the entry off it because of + * normal cache coherency rules implied by spinlocks. 
+ * + * If IPIs can go out of order to the cache coherency protocol + * in an architecture, sufficient synchronisation should be added + * to arch code to make it appear to obey cache coherency WRT + * locking and barrier primitives. Generic code isn't really + * equipped to do the right thing... + */ + if (llist_add(node, &per_cpu(call_single_queue, cpu))) + send_call_function_single_ipi(cpu); +} + /* * Insert a previously allocated call_single_data_t element * for execution on the given CPU. data must already have * ->func, ->info, and ->flags set. */ -static int generic_exec_single(int cpu, call_single_data_t *csd, - smp_call_func_t func, void *info) +static int generic_exec_single(int cpu, call_single_data_t *csd) { if (cpu == smp_processor_id()) { + smp_call_func_t func = csd->func; + void *info = csd->info; unsigned long flags; /* @@ -159,28 +175,12 @@ static int generic_exec_single(int cpu, call_single_data_t *csd, return 0; } - if ((unsigned)cpu >= nr_cpu_ids || !cpu_online(cpu)) { csd_unlock(csd); return -ENXIO; } - csd->func = func; - csd->info = info; - - /* - * The list addition should be visible before sending the IPI - * handler locks the list to pull the entry off it because of - * normal cache coherency rules implied by spinlocks. - * - * If IPIs can go out of order to the cache coherency protocol - * in an architecture, sufficient synchronisation should be added - * to arch code to make it appear to obey cache coherency WRT - * locking and barrier primitives. Generic code isn't really - * equipped to do the right thing... - */ - if (llist_add(&csd->llist, &per_cpu(call_single_queue, cpu))) - send_call_function_single_ipi(cpu); + __smp_call_single_queue(cpu, &csd->llist); return 0; } @@ -194,16 +194,10 @@ static int generic_exec_single(int cpu, call_single_data_t *csd, void generic_smp_call_function_single_interrupt(void) { flush_smp_call_function_queue(true); - - /* - * Handle irq works queued remotely by irq_work_queue_on(). - * Smp functions above are typically synchronous so they - * better run first since some other CPUs may be busy waiting - * for them. - */ - irq_work_run(); } +extern void irq_work_single(void *); + /** * flush_smp_call_function_queue - Flush pending smp-call-function callbacks * @@ -241,9 +235,21 @@ static void flush_smp_call_function_queue(bool warn_cpu_offline) * We don't have to use the _safe() variant here * because we are not invoking the IPI handlers yet. */ - llist_for_each_entry(csd, entry, llist) - pr_warn("IPI callback %pS sent to offline CPU\n", - csd->func); + llist_for_each_entry(csd, entry, llist) { + switch (CSD_TYPE(csd)) { + case CSD_TYPE_ASYNC: + case CSD_TYPE_SYNC: + case CSD_TYPE_IRQ_WORK: + pr_warn("IPI callback %pS sent to offline CPU\n", + csd->func); + break; + + default: + pr_warn("IPI callback, unknown type %d, sent to offline CPU\n", + CSD_TYPE(csd)); + break; + } + } } /* @@ -251,16 +257,17 @@ static void flush_smp_call_function_queue(bool warn_cpu_offline) */ prev = NULL; llist_for_each_entry_safe(csd, csd_next, entry, llist) { - smp_call_func_t func = csd->func; - void *info = csd->info; - /* Do we wait until *after* callback? */ - if (csd->flags & CSD_FLAG_SYNCHRONOUS) { + if (CSD_TYPE(csd) == CSD_TYPE_SYNC) { + smp_call_func_t func = csd->func; + void *info = csd->info; + if (prev) { prev->next = &csd_next->llist; } else { entry = &csd_next->llist; } + func(info); csd_unlock(csd); } else { @@ -272,11 +279,17 @@ static void flush_smp_call_function_queue(bool warn_cpu_offline) * Second; run all !SYNC callbacks. 
*/ llist_for_each_entry_safe(csd, csd_next, entry, llist) { - smp_call_func_t func = csd->func; - void *info = csd->info; + int type = CSD_TYPE(csd); - csd_unlock(csd); - func(info); + if (type == CSD_TYPE_ASYNC) { + smp_call_func_t func = csd->func; + void *info = csd->info; + + csd_unlock(csd); + func(info); + } else if (type == CSD_TYPE_IRQ_WORK) { + irq_work_single(csd); + } } } @@ -305,7 +318,7 @@ int smp_call_function_single(int cpu, smp_call_func_t func, void *info, { call_single_data_t *csd; call_single_data_t csd_stack = { - .flags = CSD_FLAG_LOCK | CSD_FLAG_SYNCHRONOUS, + .flags = CSD_FLAG_LOCK | CSD_TYPE_SYNC, }; int this_cpu; int err; @@ -339,7 +352,10 @@ int smp_call_function_single(int cpu, smp_call_func_t func, void *info, csd_lock(csd); } - err = generic_exec_single(cpu, csd, func, info); + csd->func = func; + csd->info = info; + + err = generic_exec_single(cpu, csd); if (wait) csd_lock_wait(csd); @@ -385,7 +401,7 @@ int smp_call_function_single_async(int cpu, call_single_data_t *csd) csd->flags = CSD_FLAG_LOCK; smp_wmb(); - err = generic_exec_single(cpu, csd, csd->func, csd->info); + err = generic_exec_single(cpu, csd); out: preempt_enable(); @@ -500,7 +516,7 @@ static void smp_call_function_many_cond(const struct cpumask *mask, csd_lock(csd); if (wait) - csd->flags |= CSD_FLAG_SYNCHRONOUS; + csd->flags |= CSD_TYPE_SYNC; csd->func = func; csd->info = info; if (llist_add(&csd->llist, &per_cpu(call_single_queue, cpu))) @@ -632,6 +648,17 @@ void __init smp_init(void) { int num_nodes, num_cpus; + /* + * Ensure struct irq_work layout matches so that + * flush_smp_call_function_queue() can do horrible things. + */ + BUILD_BUG_ON(offsetof(struct irq_work, llnode) != + offsetof(struct __call_single_data, llist)); + BUILD_BUG_ON(offsetof(struct irq_work, func) != + offsetof(struct __call_single_data, func)); + BUILD_BUG_ON(offsetof(struct irq_work, flags) != + offsetof(struct __call_single_data, flags)); + idle_threads_init(); cpuhp_threads_init(); -- cgit v1.2.3 From a148866489fbe243c936fe43e4525d8dbfa0318f Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 26 May 2020 18:11:04 +0200 Subject: sched: Replace rq::wake_list The recent commit: 90b5363acd47 ("sched: Clean up scheduler_ipi()") got smp_call_function_single_async() subtly wrong. Even though it will return -EBUSY when trying to re-use a csd, that condition is not atomic and still requires external serialization. The change in ttwu_queue_remote() got this wrong. While on first reading ttwu_queue_remote() has an atomic test-and-set that appears to serialize the use, the matching 'release' is not in the right place to actually guarantee this serialization. The actual race is vs the sched_ttwu_pending() call in the idle loop; that can run the wakeup-list without consuming the CSD. Instead of trying to chain the lists, merge them. 
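As a hedged sketch of the serialization rule stated above (a csd may only be re-submitted once its previous callback has finished; the -EBUSY return is not a substitute for that), with my_csd, my_csd_busy, my_func and kick_remote invented for illustration, and only call_single_data_t and smp_call_function_single_async() taken from the kernel:

    static atomic_t my_csd_busy;            /* external serialization */

    static void my_func(void *info)
    {
            /* ... the remote work ... */
            atomic_set_release(&my_csd_busy, 0);    /* csd re-usable now */
    }

    static call_single_data_t my_csd = { .func = my_func };

    static void kick_remote(int cpu)
    {
            /*
             * The -EBUSY test inside smp_call_function_single_async() is
             * not atomic with the enqueue, so ownership of the csd has to
             * be taken explicitly before submitting it.
             */
            if (atomic_xchg(&my_csd_busy, 1))
                    return;                 /* previous call still in flight */
            smp_call_function_single_async(cpu, &my_csd);
    }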
Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Ingo Molnar Link: https://lore.kernel.org/r/20200526161908.129371594@infradead.org --- include/linux/sched.h | 1 + include/linux/smp.h | 1 + kernel/sched/core.c | 25 +++++++------------------ kernel/sched/idle.c | 1 - kernel/sched/sched.h | 8 -------- kernel/smp.c | 47 ++++++++++++++++++++++++++++++++++++++++------- 6 files changed, 49 insertions(+), 34 deletions(-) (limited to 'include/linux') diff --git a/include/linux/sched.h b/include/linux/sched.h index ebc68706152a..e0f5f41cf2ee 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -654,6 +654,7 @@ struct task_struct { #ifdef CONFIG_SMP struct llist_node wake_entry; + unsigned int wake_entry_type; int on_cpu; #ifdef CONFIG_THREAD_INFO_IN_TASK /* Current CPU: */ diff --git a/include/linux/smp.h b/include/linux/smp.h index 45ad6e30f398..84f90e24ed6f 100644 --- a/include/linux/smp.h +++ b/include/linux/smp.h @@ -25,6 +25,7 @@ enum { CSD_TYPE_ASYNC = 0x00, CSD_TYPE_SYNC = 0x10, CSD_TYPE_IRQ_WORK = 0x20, + CSD_TYPE_TTWU = 0x30, CSD_FLAG_TYPE_MASK = 0xF0, }; diff --git a/kernel/sched/core.c b/kernel/sched/core.c index b71ed5ee97fd..b3c64c6b4d2e 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -1538,7 +1538,7 @@ static int migration_cpu_stop(void *data) * __migrate_task() such that we will not miss enforcing cpus_ptr * during wakeups, see set_cpus_allowed_ptr()'s TASK_WAKING test. */ - sched_ttwu_pending(); + flush_smp_call_function_from_idle(); raw_spin_lock(&p->pi_lock); rq_lock(rq, &rf); @@ -2272,14 +2272,13 @@ static int ttwu_remote(struct task_struct *p, int wake_flags) } #ifdef CONFIG_SMP -void sched_ttwu_pending(void) +void sched_ttwu_pending(void *arg) { + struct llist_node *llist = arg; struct rq *rq = this_rq(); - struct llist_node *llist; struct task_struct *p, *t; struct rq_flags rf; - llist = llist_del_all(&rq->wake_list); if (!llist) return; @@ -2299,11 +2298,6 @@ void sched_ttwu_pending(void) rq_unlock_irqrestore(rq, &rf); } -static void wake_csd_func(void *info) -{ - sched_ttwu_pending(); -} - void send_call_function_single_ipi(int cpu) { struct rq *rq = cpu_rq(cpu); @@ -2327,12 +2321,7 @@ static void __ttwu_queue_wakelist(struct task_struct *p, int cpu, int wake_flags p->sched_remote_wakeup = !!(wake_flags & WF_MIGRATED); WRITE_ONCE(rq->ttwu_pending, 1); - if (llist_add(&p->wake_entry, &rq->wake_list)) { - if (!set_nr_if_polling(rq->idle)) - smp_call_function_single_async(cpu, &rq->wake_csd); - else - trace_sched_wake_idle_without_ipi(cpu); - } + __smp_call_single_queue(cpu, &p->wake_entry); } void wake_up_if_idle(int cpu) @@ -2772,6 +2761,9 @@ static void __sched_fork(unsigned long clone_flags, struct task_struct *p) p->capture_control = NULL; #endif init_numa_balancing(clone_flags, p); +#ifdef CONFIG_SMP + p->wake_entry_type = CSD_TYPE_TTWU; +#endif } DEFINE_STATIC_KEY_FALSE(sched_numa_balancing); @@ -6564,7 +6556,6 @@ int sched_cpu_dying(unsigned int cpu) struct rq_flags rf; /* Handle pending wakeups and then migrate everything off */ - sched_ttwu_pending(); sched_tick_stop(cpu); rq_lock_irqsave(rq, &rf); @@ -6763,8 +6754,6 @@ void __init sched_init(void) rq->avg_idle = 2*sysctl_sched_migration_cost; rq->max_idle_balance_cost = sysctl_sched_migration_cost; - rq_csd_init(rq, &rq->wake_csd, wake_csd_func); - INIT_LIST_HEAD(&rq->cfs_tasks); rq_attach_root(rq, &def_root_domain); diff --git a/kernel/sched/idle.c b/kernel/sched/idle.c index 387fd75ee6f7..05deb81bb3e3 100644 --- a/kernel/sched/idle.c +++ b/kernel/sched/idle.c @@ -294,7 +294,6 @@ 
static void do_idle(void) * critical section. */ flush_smp_call_function_from_idle(); - sched_ttwu_pending(); schedule_idle(); if (unlikely(klp_patch_pending(current))) diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index c86fc94c54e5..1d4e94c1e5fe 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -1023,11 +1023,6 @@ struct rq { unsigned int ttwu_local; #endif -#ifdef CONFIG_SMP - call_single_data_t wake_csd; - struct llist_head wake_list; -#endif - #ifdef CONFIG_CPU_IDLE /* Must be inspected within a rcu lock section */ struct cpuidle_state *idle_state; @@ -1371,8 +1366,6 @@ queue_balance_callback(struct rq *rq, rq->balance_callback = head; } -extern void sched_ttwu_pending(void); - #define rcu_dereference_check_sched_domain(p) \ rcu_dereference_check((p), \ lockdep_is_held(&sched_domains_mutex)) @@ -1512,7 +1505,6 @@ extern void flush_smp_call_function_from_idle(void); #else /* !CONFIG_SMP: */ static inline void flush_smp_call_function_from_idle(void) { } -static inline void sched_ttwu_pending(void) { } #endif #include "stats.h" diff --git a/kernel/smp.c b/kernel/smp.c index 856562b80794..0d61dc060b01 100644 --- a/kernel/smp.c +++ b/kernel/smp.c @@ -196,6 +196,7 @@ void generic_smp_call_function_single_interrupt(void) flush_smp_call_function_queue(true); } +extern void sched_ttwu_pending(void *); extern void irq_work_single(void *); /** @@ -244,6 +245,10 @@ static void flush_smp_call_function_queue(bool warn_cpu_offline) csd->func); break; + case CSD_TYPE_TTWU: + pr_warn("IPI task-wakeup sent to offline CPU\n"); + break; + default: pr_warn("IPI callback, unknown type %d, sent to offline CPU\n", CSD_TYPE(csd)); @@ -275,22 +280,43 @@ static void flush_smp_call_function_queue(bool warn_cpu_offline) } } + if (!entry) + return; + /* * Second; run all !SYNC callbacks. */ + prev = NULL; llist_for_each_entry_safe(csd, csd_next, entry, llist) { int type = CSD_TYPE(csd); - if (type == CSD_TYPE_ASYNC) { - smp_call_func_t func = csd->func; - void *info = csd->info; + if (type != CSD_TYPE_TTWU) { + if (prev) { + prev->next = &csd_next->llist; + } else { + entry = &csd_next->llist; + } - csd_unlock(csd); - func(info); - } else if (type == CSD_TYPE_IRQ_WORK) { - irq_work_single(csd); + if (type == CSD_TYPE_ASYNC) { + smp_call_func_t func = csd->func; + void *info = csd->info; + + csd_unlock(csd); + func(info); + } else if (type == CSD_TYPE_IRQ_WORK) { + irq_work_single(csd); + } + + } else { + prev = &csd->llist; } } + + /* + * Third; only CSD_TYPE_TTWU is left, issue those. + */ + if (entry) + sched_ttwu_pending(entry); } void flush_smp_call_function_from_idle(void) @@ -659,6 +685,13 @@ void __init smp_init(void) BUILD_BUG_ON(offsetof(struct irq_work, flags) != offsetof(struct __call_single_data, flags)); + /* + * Assert the CSD_TYPE_TTWU layout is similar enough + * for task_struct to be on the @call_single_queue. 
+ */ + BUILD_BUG_ON(offsetof(struct task_struct, wake_entry_type) - offsetof(struct task_struct, wake_entry) != + offsetof(struct __call_single_data, flags) - offsetof(struct __call_single_data, llist)); + idle_threads_init(); cpuhp_threads_init(); -- cgit v1.2.3 From 6db96e5810e0a6a345b7d78549de7676ae5b2662 Mon Sep 17 00:00:00 2001 From: Baolin Wang Date: Mon, 13 Apr 2020 10:46:03 +0800 Subject: mmc: host: Introduce the request_atomic() for the host The SD host controller can process one request in atomic context if the card is nonremovable, which means we can submit the next request from the hard irq handler when using the MMC host software queue, to reduce latency. Thus this patch adds a new API, request_atomic(), for the host controller, and adds support for the host software queue to submit a request via the new request_atomic() API. Moreover, there is an unusual case where the card is busy when trying to send a command, and we cannot poll the card status in interrupt context when using request_atomic() to dispatch requests. Thus we should queue a work item to try again in non-atomic context, in case the host releases the busy signal later. Suggested-by: Adrian Hunter Acked-by: Adrian Hunter Signed-off-by: Baolin Wang Link: https://lore.kernel.org/r/a344e27e506cb2329073cbd5cf65e15cc3cbeba9.1586744073.git.baolin.wang7@gmail.com Signed-off-by: Ulf Hansson --- drivers/mmc/host/mmc_hsq.c | 29 ++++++++++++++++++++++++++++- drivers/mmc/host/mmc_hsq.h | 1 + include/linux/mmc/host.h | 3 +++ 3 files changed, 32 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/drivers/mmc/host/mmc_hsq.c b/drivers/mmc/host/mmc_hsq.c index b90b2c97b6cf..a5e05ed0fda3 100644 --- a/drivers/mmc/host/mmc_hsq.c +++ b/drivers/mmc/host/mmc_hsq.c @@ -16,11 +16,20 @@ #define HSQ_NUM_SLOTS 64 #define HSQ_INVALID_TAG HSQ_NUM_SLOTS +static void mmc_hsq_retry_handler(struct work_struct *work) +{ + struct mmc_hsq *hsq = container_of(work, struct mmc_hsq, retry_work); + struct mmc_host *mmc = hsq->mmc; + + mmc->ops->request(mmc, hsq->mrq); +} + static void mmc_hsq_pump_requests(struct mmc_hsq *hsq) { struct mmc_host *mmc = hsq->mmc; struct hsq_slot *slot; unsigned long flags; + int ret = 0; spin_lock_irqsave(&hsq->lock, flags); @@ -42,7 +51,24 @@ static void mmc_hsq_pump_requests(struct mmc_hsq *hsq) spin_unlock_irqrestore(&hsq->lock, flags); - mmc->ops->request(mmc, hsq->mrq); + if (mmc->ops->request_atomic) + ret = mmc->ops->request_atomic(mmc, hsq->mrq); + else + mmc->ops->request(mmc, hsq->mrq); + + /* + * If returning BUSY from request_atomic(), which means the card + * may be busy now, and we should change to non-atomic context to + * try again for this unusual case, to avoid time-consuming operations + * in the atomic context. + * + * Note: we just give a warning for other error cases, since the host + * driver will handle them. 
+ */ + if (ret == -EBUSY) + schedule_work(&hsq->retry_work); + else + WARN_ON_ONCE(ret); } static void mmc_hsq_update_next_tag(struct mmc_hsq *hsq, int remains) @@ -325,6 +351,7 @@ int mmc_hsq_init(struct mmc_hsq *hsq, struct mmc_host *mmc) hsq->mmc->cqe_private = hsq; mmc->cqe_ops = &mmc_hsq_ops; + INIT_WORK(&hsq->retry_work, mmc_hsq_retry_handler); spin_lock_init(&hsq->lock); init_waitqueue_head(&hsq->wait_queue); diff --git a/drivers/mmc/host/mmc_hsq.h b/drivers/mmc/host/mmc_hsq.h index 18b9cf55925f..ffdd9cd172c3 100644 --- a/drivers/mmc/host/mmc_hsq.h +++ b/drivers/mmc/host/mmc_hsq.h @@ -12,6 +12,7 @@ struct mmc_hsq { wait_queue_head_t wait_queue; struct hsq_slot *slot; spinlock_t lock; + struct work_struct retry_work; int next_tag; int num_slots; diff --git a/include/linux/mmc/host.h b/include/linux/mmc/host.h index c318fb5b6a94..d4a50e5dc111 100644 --- a/include/linux/mmc/host.h +++ b/include/linux/mmc/host.h @@ -92,6 +92,9 @@ struct mmc_host_ops { int err); void (*pre_req)(struct mmc_host *host, struct mmc_request *req); void (*request)(struct mmc_host *host, struct mmc_request *req); + /* Submit one request to host in atomic context. */ + int (*request_atomic)(struct mmc_host *host, + struct mmc_request *req); /* * Avoid calling the next three functions too often or in a "fast -- cgit v1.2.3 From 064f7e58ee42372be05ec87fdf57864c7475ba93 Mon Sep 17 00:00:00 2001 From: Krishna Konda Date: Fri, 1 May 2020 19:23:01 +0530 Subject: mmc: core: expose info about enhanced rpmb support Following the eMMC JEDEC JESD84-B51 standard, an enhanced form of rpmb is supported. In addition to writing one or two rpmb frames at a time, this enhanced mode allows writing 32 frames at a time. Expose this information, present in the ext csd field, so that a user space application that wants to make use of it can do so. 
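As a usage note, the new attribute can be read from user space; a hedged sketch, where the card address mmc0:0001 in the sysfs path is a made-up example and not specified by the patch:

    #include <stdio.h>

    int main(void)
    {
            /* attribute added by this patch, attached to the mmc card device */
            FILE *f = fopen("/sys/bus/mmc/devices/mmc0:0001/enhanced_rpmb_supported", "r");
            unsigned int v = 0;

            if (!f)
                    return 1;
            if (fscanf(f, "%x", &v) != 1)   /* the kernel prints it as %#x */
                    v = 0;
            fclose(f);
            printf("enhanced rpmb %ssupported\n", v ? "" : "not ");
            return 0;
    }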
Signed-off-by: Krishna Konda Signed-off-by: Veerabhadrarao Badiganti Link: https://lore.kernel.org/r/1588341189-4371-1-git-send-email-vbadigan@codeaurora.org Signed-off-by: Ulf Hansson --- drivers/mmc/core/mmc.c | 6 ++++++ include/linux/mmc/card.h | 1 + include/linux/mmc/mmc.h | 1 + 3 files changed, 8 insertions(+) (limited to 'include/linux') diff --git a/drivers/mmc/core/mmc.c b/drivers/mmc/core/mmc.c index de94fbe629bd..4203303f946a 100644 --- a/drivers/mmc/core/mmc.c +++ b/drivers/mmc/core/mmc.c @@ -647,6 +647,9 @@ static int mmc_decode_ext_csd(struct mmc_card *card, u8 *ext_csd) mmc_hostname(card->host), card->ext_csd.cmdq_depth); } + card->ext_csd.enhanced_rpmb_supported = + (card->ext_csd.rel_param & + EXT_CSD_WR_REL_PARAM_EN_RPMB_REL_WR); } out: return err; @@ -786,6 +789,8 @@ MMC_DEV_ATTR(enhanced_area_offset, "%llu\n", card->ext_csd.enhanced_area_offset); MMC_DEV_ATTR(enhanced_area_size, "%u\n", card->ext_csd.enhanced_area_size); MMC_DEV_ATTR(raw_rpmb_size_mult, "%#x\n", card->ext_csd.raw_rpmb_size_mult); +MMC_DEV_ATTR(enhanced_rpmb_supported, "%#x\n", + card->ext_csd.enhanced_rpmb_supported); MMC_DEV_ATTR(rel_sectors, "%#x\n", card->ext_csd.rel_sectors); MMC_DEV_ATTR(ocr, "0x%08x\n", card->ocr); MMC_DEV_ATTR(rca, "0x%04x\n", card->rca); @@ -843,6 +848,7 @@ static struct attribute *mmc_std_attrs[] = { &dev_attr_enhanced_area_offset.attr, &dev_attr_enhanced_area_size.attr, &dev_attr_raw_rpmb_size_mult.attr, + &dev_attr_enhanced_rpmb_supported.attr, &dev_attr_rel_sectors.attr, &dev_attr_ocr.attr, &dev_attr_rca.attr, diff --git a/include/linux/mmc/card.h b/include/linux/mmc/card.h index cf3780a6ccc4..7d46411ffaa2 100644 --- a/include/linux/mmc/card.h +++ b/include/linux/mmc/card.h @@ -48,6 +48,7 @@ struct mmc_ext_csd { u8 sec_feature_support; u8 rel_sectors; u8 rel_param; + bool enhanced_rpmb_supported; u8 part_config; u8 cache_ctrl; u8 rst_n_function; diff --git a/include/linux/mmc/mmc.h b/include/linux/mmc/mmc.h index 4b85ef05a906..d9a65c6a8816 100644 --- a/include/linux/mmc/mmc.h +++ b/include/linux/mmc/mmc.h @@ -325,6 +325,7 @@ static inline bool mmc_ready_for_data(u32 status) */ #define EXT_CSD_WR_REL_PARAM_EN (1<<2) +#define EXT_CSD_WR_REL_PARAM_EN_RPMB_REL_WR (1<<4) #define EXT_CSD_BOOT_WP_B_PWR_WP_DIS (0x40) #define EXT_CSD_BOOT_WP_B_PERM_WP_DIS (0x10) -- cgit v1.2.3 From 30e1028dcef9ac981c20df83983e0e77a4df2dc4 Mon Sep 17 00:00:00 2001 From: Wolfram Sang Date: Sat, 2 May 2020 16:28:25 +0200 Subject: mmc: sdhci-esdhc: update contact email The 'pengutronix' address is defunct for years. Use the proper contact address. Signed-off-by: Wolfram Sang Acked-by: Adrian Hunter Link: https://lore.kernel.org/r/20200502142840.19418-1-wsa@kernel.org Signed-off-by: Ulf Hansson --- drivers/mmc/host/sdhci-esdhc.h | 2 +- include/linux/platform_data/mmc-esdhc-imx.h | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/drivers/mmc/host/sdhci-esdhc.h b/drivers/mmc/host/sdhci-esdhc.h index 947212f16bc6..a30796e79b1c 100644 --- a/drivers/mmc/host/sdhci-esdhc.h +++ b/drivers/mmc/host/sdhci-esdhc.h @@ -5,7 +5,7 @@ * Copyright (c) 2007 Freescale Semiconductor, Inc. * Copyright (c) 2009 MontaVista Software, Inc. * Copyright (c) 2010 Pengutronix e.K. 
- * Author: Wolfram Sang + * Author: Wolfram Sang */ #ifndef _DRIVERS_MMC_SDHCI_ESDHC_H diff --git a/include/linux/platform_data/mmc-esdhc-imx.h b/include/linux/platform_data/mmc-esdhc-imx.h index 0434f68eda86..cba1184b364c 100644 --- a/include/linux/platform_data/mmc-esdhc-imx.h +++ b/include/linux/platform_data/mmc-esdhc-imx.h @@ -1,6 +1,6 @@ /* SPDX-License-Identifier: GPL-2.0-only */ /* - * Copyright 2010 Wolfram Sang + * Copyright 2010 Wolfram Sang */ #ifndef __ASM_ARCH_IMX_ESDHC_H -- cgit v1.2.3 From f4f20d6897b1c047c5d38b2cfc50928e83d8fd83 Mon Sep 17 00:00:00 2001 From: "Gustavo A. R. Silva" Date: Thu, 7 May 2020 14:22:18 -0500 Subject: memstick: Replace zero-length array with flexible-array The current codebase makes use of the zero-length array language extension to the C90 standard, but the preferred mechanism to declare variable-length types such as these ones is a flexible array member[1][2], introduced in C99: struct foo { int stuff; struct boo array[]; }; By making use of the mechanism above, we will get a compiler warning in case the flexible array does not occur last in the structure, which will help us prevent some kind of undefined behavior bugs from being inadvertently introduced[3] to the codebase from now on. Also, notice that, dynamic memory allocations won't be affected by this change: "Flexible array members have incomplete type, and so the sizeof operator may not be applied. As a quirk of the original implementation of zero-length arrays, sizeof evaluates to zero."[1] sizeof(flexible-array-member) triggers a warning because flexible array members have incomplete type[1]. There are some instances of code in which the sizeof operator is being incorrectly/erroneously applied to zero-length arrays and the result is zero. Such instances may be hiding some bugs. So, this work (flexible-array member conversions) will also help to get completely rid of those sorts of issues. This issue was found with the help of Coccinelle. [1] https://gcc.gnu.org/onlinedocs/gcc/Zero-Length.html [2] https://github.com/KSPP/linux/issues/21 [3] commit 76497732932f ("cxgb3/l2t: Fix undefined behaviour") Signed-off-by: Gustavo A. R. Silva Link: https://lore.kernel.org/r/20200507192218.GA16315@embeddedor Signed-off-by: Ulf Hansson --- include/linux/memstick.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/memstick.h b/include/linux/memstick.h index 216a713bef7f..da4c65f9435f 100644 --- a/include/linux/memstick.h +++ b/include/linux/memstick.h @@ -288,7 +288,7 @@ struct memstick_host { int (*set_param)(struct memstick_host *host, enum memstick_param param, int value); - unsigned long private[0] ____cacheline_aligned; + unsigned long private[] ____cacheline_aligned; }; struct memstick_driver { -- cgit v1.2.3 From 1be64c7963f89afbd6f96f27effea20900650dfe Mon Sep 17 00:00:00 2001 From: Ulf Hansson Date: Fri, 8 May 2020 13:29:02 +0200 Subject: mmc: host: Drop redundant MMC_CAP_ERASE The MMC_CAP_ERASE bit is no longer used by the mmc core as erase, discard and trim operations are now always supported. Therefore, drop the bit and move all mmc hosts away from using it. 
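For an out-of-tree host driver tracking this change the update is mechanical; a hedged before/after sketch around a hypothetical caps assignment:

    /* before: erase support had to be claimed explicitly */
    mmc->caps |= MMC_CAP_4_BIT_DATA | MMC_CAP_ERASE;

    /* after: erase/discard/trim are always supported by the core */
    mmc->caps |= MMC_CAP_4_BIT_DATA;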
Signed-off-by: Ulf Hansson Reviewed-by: Rui Miguel Silva Link: https://lore.kernel.org/r/20200508112902.23575-1-ulf.hansson@linaro.org Reviewed-by: Linus Walleij --- drivers/mmc/host/bcm2835.c | 3 +-- drivers/mmc/host/cavium.c | 3 +-- drivers/mmc/host/dw_mmc.c | 6 ------ drivers/mmc/host/meson-mx-sdio.c | 2 +- drivers/mmc/host/mtk-sd.c | 2 +- drivers/mmc/host/mvsdio.c | 2 -- drivers/mmc/host/mxs-mmc.c | 3 +-- drivers/mmc/host/omap.c | 2 +- drivers/mmc/host/omap_hsmmc.c | 2 +- drivers/mmc/host/rtsx_pci_sdmmc.c | 2 +- drivers/mmc/host/rtsx_usb_sdmmc.c | 2 +- drivers/mmc/host/sdhci.c | 2 +- drivers/mmc/host/sunxi-mmc.c | 2 +- drivers/mmc/host/tmio_mmc_core.c | 2 +- drivers/staging/greybus/sdio.c | 1 - include/linux/mmc/host.h | 1 - 16 files changed, 12 insertions(+), 25 deletions(-) (limited to 'include/linux') diff --git a/drivers/mmc/host/bcm2835.c b/drivers/mmc/host/bcm2835.c index c3d949847cbd..a0767790a826 100644 --- a/drivers/mmc/host/bcm2835.c +++ b/drivers/mmc/host/bcm2835.c @@ -1280,8 +1280,7 @@ static int bcm2835_add_host(struct bcm2835_host *host) /* host controller capabilities */ mmc->caps |= MMC_CAP_SD_HIGHSPEED | MMC_CAP_MMC_HIGHSPEED | - MMC_CAP_NEEDS_POLL | MMC_CAP_HW_RESET | MMC_CAP_ERASE | - MMC_CAP_CMD23; + MMC_CAP_NEEDS_POLL | MMC_CAP_HW_RESET | MMC_CAP_CMD23; spin_lock_init(&host->lock); mutex_init(&host->mutex); diff --git a/drivers/mmc/host/cavium.c b/drivers/mmc/host/cavium.c index 89deb451e0ac..c5da3aaee334 100644 --- a/drivers/mmc/host/cavium.c +++ b/drivers/mmc/host/cavium.c @@ -1038,8 +1038,7 @@ int cvm_mmc_of_slot_probe(struct device *dev, struct cvm_mmc_host *host) * Disable bounce buffers for max_segs = 1 */ mmc->caps |= MMC_CAP_MMC_HIGHSPEED | MMC_CAP_SD_HIGHSPEED | - MMC_CAP_ERASE | MMC_CAP_CMD23 | MMC_CAP_POWER_OFF_CARD | - MMC_CAP_3_3V_DDR; + MMC_CAP_CMD23 | MMC_CAP_POWER_OFF_CARD | MMC_CAP_3_3V_DDR; if (host->use_sg) mmc->max_segs = 16; diff --git a/drivers/mmc/host/dw_mmc.c b/drivers/mmc/host/dw_mmc.c index 5d1f8a3ec3a5..35ae5737c622 100644 --- a/drivers/mmc/host/dw_mmc.c +++ b/drivers/mmc/host/dw_mmc.c @@ -2751,12 +2751,6 @@ static int dw_mci_init_slot_caps(struct dw_mci_slot *slot) if (host->pdata->caps) mmc->caps = host->pdata->caps; - /* - * Support MMC_CAP_ERASE by default. - * It needs to use trim/discard/erase commands. 
- */ - mmc->caps |= MMC_CAP_ERASE; - if (host->pdata->pm_caps) mmc->pm_caps = host->pdata->pm_caps; diff --git a/drivers/mmc/host/meson-mx-sdio.c b/drivers/mmc/host/meson-mx-sdio.c index 3813b544f571..9b2cf7afc246 100644 --- a/drivers/mmc/host/meson-mx-sdio.c +++ b/drivers/mmc/host/meson-mx-sdio.c @@ -564,7 +564,7 @@ static int meson_mx_mmc_add_host(struct meson_mx_mmc_host *host) mmc->f_max = clk_round_rate(host->cfg_div_clk, clk_get_rate(host->parent_clk)); - mmc->caps |= MMC_CAP_ERASE | MMC_CAP_CMD23 | MMC_CAP_WAIT_WHILE_BUSY; + mmc->caps |= MMC_CAP_CMD23 | MMC_CAP_WAIT_WHILE_BUSY; mmc->ops = &meson_mx_mmc_ops; ret = mmc_of_parse(mmc); diff --git a/drivers/mmc/host/mtk-sd.c b/drivers/mmc/host/mtk-sd.c index 53819ae9f271..39e7fc54c438 100644 --- a/drivers/mmc/host/mtk-sd.c +++ b/drivers/mmc/host/mtk-sd.c @@ -2326,7 +2326,7 @@ static int msdc_drv_probe(struct platform_device *pdev) if (mmc->caps & MMC_CAP_SDIO_IRQ) mmc->caps2 |= MMC_CAP2_SDIO_IRQ_NOTHREAD; - mmc->caps |= MMC_CAP_ERASE | MMC_CAP_CMD23; + mmc->caps |= MMC_CAP_CMD23; /* MMC core transfer sizes tunable parameters */ mmc->max_segs = MAX_BD_NUM; if (host->dev_comp->support_64g) diff --git a/drivers/mmc/host/mvsdio.c b/drivers/mmc/host/mvsdio.c index 203b61712601..cc0752a9df6d 100644 --- a/drivers/mmc/host/mvsdio.c +++ b/drivers/mmc/host/mvsdio.c @@ -752,8 +752,6 @@ static int mvsd_probe(struct platform_device *pdev) if (maxfreq) mmc->f_max = maxfreq; - mmc->caps |= MMC_CAP_ERASE; - spin_lock_init(&host->lock); host->base = devm_platform_ioremap_resource(pdev, 0); diff --git a/drivers/mmc/host/mxs-mmc.c b/drivers/mmc/host/mxs-mmc.c index d82674aed447..b1820def36c0 100644 --- a/drivers/mmc/host/mxs-mmc.c +++ b/drivers/mmc/host/mxs-mmc.c @@ -634,8 +634,7 @@ static int mxs_mmc_probe(struct platform_device *pdev) /* set mmc core parameters */ mmc->ops = &mxs_mmc_ops; mmc->caps = MMC_CAP_SD_HIGHSPEED | MMC_CAP_MMC_HIGHSPEED | - MMC_CAP_SDIO_IRQ | MMC_CAP_NEEDS_POLL | MMC_CAP_CMD23 | - MMC_CAP_ERASE; + MMC_CAP_SDIO_IRQ | MMC_CAP_NEEDS_POLL | MMC_CAP_CMD23; host->broken_cd = of_property_read_bool(np, "broken-cd"); diff --git a/drivers/mmc/host/omap.c b/drivers/mmc/host/omap.c index d74e73c95fdf..33d7af7c7762 100644 --- a/drivers/mmc/host/omap.c +++ b/drivers/mmc/host/omap.c @@ -1244,7 +1244,7 @@ static int mmc_omap_new_slot(struct mmc_omap_host *host, int id) mmc->caps = 0; if (host->pdata->slots[id].wires >= 4) - mmc->caps |= MMC_CAP_4_BIT_DATA | MMC_CAP_ERASE; + mmc->caps |= MMC_CAP_4_BIT_DATA; mmc->ops = &mmc_omap_ops; mmc->f_min = 400000; diff --git a/drivers/mmc/host/omap_hsmmc.c b/drivers/mmc/host/omap_hsmmc.c index a379c45b985c..37b8740513f5 100644 --- a/drivers/mmc/host/omap_hsmmc.c +++ b/drivers/mmc/host/omap_hsmmc.c @@ -1922,7 +1922,7 @@ static int omap_hsmmc_probe(struct platform_device *pdev) mmc->max_req_size = mmc->max_blk_size * mmc->max_blk_count; mmc->caps |= MMC_CAP_MMC_HIGHSPEED | MMC_CAP_SD_HIGHSPEED | - MMC_CAP_WAIT_WHILE_BUSY | MMC_CAP_ERASE | MMC_CAP_CMD23; + MMC_CAP_WAIT_WHILE_BUSY | MMC_CAP_CMD23; mmc->caps |= mmc_pdata(host)->caps; if (mmc->caps & MMC_CAP_8_BIT_DATA) diff --git a/drivers/mmc/host/rtsx_pci_sdmmc.c b/drivers/mmc/host/rtsx_pci_sdmmc.c index 11087976ab19..5a71f6678fd3 100644 --- a/drivers/mmc/host/rtsx_pci_sdmmc.c +++ b/drivers/mmc/host/rtsx_pci_sdmmc.c @@ -1347,7 +1347,7 @@ static void realtek_init_host(struct realtek_pci_sdmmc *host) mmc->ocr_avail = MMC_VDD_32_33 | MMC_VDD_33_34 | MMC_VDD_165_195; mmc->caps = MMC_CAP_4_BIT_DATA | MMC_CAP_SD_HIGHSPEED | MMC_CAP_MMC_HIGHSPEED | 
MMC_CAP_BUS_WIDTH_TEST | - MMC_CAP_UHS_SDR12 | MMC_CAP_UHS_SDR25 | MMC_CAP_ERASE; + MMC_CAP_UHS_SDR12 | MMC_CAP_UHS_SDR25; mmc->caps2 = MMC_CAP2_NO_PRESCAN_POWERUP | MMC_CAP2_FULL_PWR_CYCLE; mmc->max_current_330 = 400; mmc->max_current_180 = 800; diff --git a/drivers/mmc/host/rtsx_usb_sdmmc.c b/drivers/mmc/host/rtsx_usb_sdmmc.c index 81d0dfe553a8..a7084c50ad65 100644 --- a/drivers/mmc/host/rtsx_usb_sdmmc.c +++ b/drivers/mmc/host/rtsx_usb_sdmmc.c @@ -1314,7 +1314,7 @@ static void rtsx_usb_init_host(struct rtsx_usb_sdmmc *host) mmc->caps = MMC_CAP_4_BIT_DATA | MMC_CAP_SD_HIGHSPEED | MMC_CAP_MMC_HIGHSPEED | MMC_CAP_BUS_WIDTH_TEST | MMC_CAP_UHS_SDR12 | MMC_CAP_UHS_SDR25 | MMC_CAP_UHS_SDR50 | - MMC_CAP_ERASE | MMC_CAP_SYNC_RUNTIME_PM; + MMC_CAP_SYNC_RUNTIME_PM; mmc->caps2 = MMC_CAP2_NO_PRESCAN_POWERUP | MMC_CAP2_FULL_PWR_CYCLE | MMC_CAP2_NO_SDIO; diff --git a/drivers/mmc/host/sdhci.c b/drivers/mmc/host/sdhci.c index 1bb6b6796318..95cc08c1fed9 100644 --- a/drivers/mmc/host/sdhci.c +++ b/drivers/mmc/host/sdhci.c @@ -4331,7 +4331,7 @@ int sdhci_setup_host(struct sdhci_host *host) !host->ops->get_max_timeout_count) mmc->max_busy_timeout = 0; - mmc->caps |= MMC_CAP_SDIO_IRQ | MMC_CAP_ERASE | MMC_CAP_CMD23; + mmc->caps |= MMC_CAP_SDIO_IRQ | MMC_CAP_CMD23; mmc->caps2 |= MMC_CAP2_SDIO_IRQ_NOTHREAD; if (host->quirks & SDHCI_QUIRK_MULTIBLOCK_READ_ACMD12) diff --git a/drivers/mmc/host/sunxi-mmc.c b/drivers/mmc/host/sunxi-mmc.c index 3bfbd89bd4ab..5e95bbc51644 100644 --- a/drivers/mmc/host/sunxi-mmc.c +++ b/drivers/mmc/host/sunxi-mmc.c @@ -1394,7 +1394,7 @@ static int sunxi_mmc_probe(struct platform_device *pdev) mmc->f_min = 400000; mmc->f_max = 52000000; mmc->caps |= MMC_CAP_MMC_HIGHSPEED | MMC_CAP_SD_HIGHSPEED | - MMC_CAP_ERASE | MMC_CAP_SDIO_IRQ; + MMC_CAP_SDIO_IRQ; /* * Some H5 devices do not have signal traces precise enough to diff --git a/drivers/mmc/host/tmio_mmc_core.c b/drivers/mmc/host/tmio_mmc_core.c index 9520bd94cf43..f31afd1c2671 100644 --- a/drivers/mmc/host/tmio_mmc_core.c +++ b/drivers/mmc/host/tmio_mmc_core.c @@ -1128,7 +1128,7 @@ int tmio_mmc_host_probe(struct tmio_mmc_host *_host) if (ret == -EPROBE_DEFER) return ret; - mmc->caps |= MMC_CAP_ERASE | MMC_CAP_4_BIT_DATA | pdata->capabilities; + mmc->caps |= MMC_CAP_4_BIT_DATA | pdata->capabilities; mmc->caps2 |= pdata->capabilities2; mmc->max_segs = pdata->max_segs ? : 32; mmc->max_blk_size = TMIO_MAX_BLK_SIZE; diff --git a/drivers/staging/greybus/sdio.c b/drivers/staging/greybus/sdio.c index c4b16bb5c1a4..0939f4a4c963 100644 --- a/drivers/staging/greybus/sdio.c +++ b/drivers/staging/greybus/sdio.c @@ -67,7 +67,6 @@ static void _gb_sdio_set_host_caps(struct gb_sdio_host *host, u32 r) ((r & GB_SDIO_CAP_8_BIT_DATA) ? MMC_CAP_8_BIT_DATA : 0) | ((r & GB_SDIO_CAP_MMC_HS) ? MMC_CAP_MMC_HIGHSPEED : 0) | ((r & GB_SDIO_CAP_SD_HS) ? MMC_CAP_SD_HIGHSPEED : 0) | - ((r & GB_SDIO_CAP_ERASE) ? MMC_CAP_ERASE : 0) | ((r & GB_SDIO_CAP_1_2V_DDR) ? MMC_CAP_1_2V_DDR : 0) | ((r & GB_SDIO_CAP_1_8V_DDR) ? MMC_CAP_1_8V_DDR : 0) | ((r & GB_SDIO_CAP_POWER_OFF_CARD) ? MMC_CAP_POWER_OFF_CARD : 0) | diff --git a/include/linux/mmc/host.h b/include/linux/mmc/host.h index d4a50e5dc111..7149bab555d7 100644 --- a/include/linux/mmc/host.h +++ b/include/linux/mmc/host.h @@ -321,7 +321,6 @@ struct mmc_host { #define MMC_CAP_AGGRESSIVE_PM (1 << 7) /* Suspend (e)MMC/SD at idle */ #define MMC_CAP_NONREMOVABLE (1 << 8) /* Nonremovable e.g. 
eMMC */ #define MMC_CAP_WAIT_WHILE_BUSY (1 << 9) /* Waits while card is busy */ -#define MMC_CAP_ERASE (1 << 10) /* Allow erase/trim commands */ #define MMC_CAP_3_3V_DDR (1 << 11) /* Host supports eMMC DDR 3.3V */ #define MMC_CAP_1_8V_DDR (1 << 12) /* Host supports eMMC DDR 1.8V */ #define MMC_CAP_1_2V_DDR (1 << 13) /* Host supports eMMC DDR 1.2V */ -- cgit v1.2.3 From 991f5c4dd2422881c933ec3c7c19f3a2a1858cc4 Mon Sep 17 00:00:00 2001 From: Angelo Dureghello Date: Mon, 18 May 2020 21:17:39 +0200 Subject: m68k: mcf5441x: add support for esdhc mmc controller Add support for sdhci-edshc mmc controller. Signed-off-by: Angelo Dureghello Acked-by: Greg Ungerer Link: https://lore.kernel.org/r/20200518191742.1251440-1-angelo.dureghello@timesys.com Signed-off-by: Ulf Hansson --- arch/m68k/coldfire/clk.c | 15 +++++++++++++ arch/m68k/coldfire/device.c | 33 +++++++++++++++++++++++++++-- arch/m68k/coldfire/m5441x.c | 12 ++++++++++- arch/m68k/include/asm/m5441xsim.h | 15 +++++++++++++ arch/m68k/include/asm/mcfclk.h | 2 ++ include/linux/platform_data/mmc-esdhc-mcf.h | 17 +++++++++++++++ 6 files changed, 91 insertions(+), 3 deletions(-) create mode 100644 include/linux/platform_data/mmc-esdhc-mcf.h (limited to 'include/linux') diff --git a/arch/m68k/coldfire/clk.c b/arch/m68k/coldfire/clk.c index 7bc666e482eb..75a057445472 100644 --- a/arch/m68k/coldfire/clk.c +++ b/arch/m68k/coldfire/clk.c @@ -73,6 +73,21 @@ struct clk_ops clk_ops1 = { #endif /* MCFPM_PPMCR1 */ #endif /* MCFPM_PPMCR0 */ +static void __clk_enable2(struct clk *clk) +{ + __raw_writel(__raw_readl(MCFSDHC_CLK) | (1 << clk->slot), MCFSDHC_CLK); +} + +static void __clk_disable2(struct clk *clk) +{ + __raw_writel(__raw_readl(MCFSDHC_CLK) & ~(1 << clk->slot), MCFSDHC_CLK); +} + +struct clk_ops clk_ops2 = { + .enable = __clk_enable2, + .disable = __clk_disable2, +}; + struct clk *clk_get(struct device *dev, const char *id) { const char *clk_name = dev ? dev_name(dev) : id ? id : NULL; diff --git a/arch/m68k/coldfire/device.c b/arch/m68k/coldfire/device.c index b4103b6bfdeb..9ef4ec0aea00 100644 --- a/arch/m68k/coldfire/device.c +++ b/arch/m68k/coldfire/device.c @@ -22,6 +22,7 @@ #include #include #include +#include /* * All current ColdFire parts contain from 2, 3, 4 or 10 UARTS. 
@@ -551,9 +552,35 @@ static struct platform_device mcf_edma = { .platform_data = &mcf_edma_data, } }; - #endif /* IS_ENABLED(CONFIG_MCF_EDMA) */ +#if IS_ENABLED(CONFIG_MMC) +static struct mcf_esdhc_platform_data mcf_esdhc_data = { + .max_bus_width = 4, + .cd_type = ESDHC_CD_NONE, +}; + +static struct resource mcf_esdhc_resources[] = { + { + .start = MCFSDHC_BASE, + .end = MCFSDHC_BASE + MCFSDHC_SIZE - 1, + .flags = IORESOURCE_MEM, + }, { + .start = MCF_IRQ_SDHC, + .end = MCF_IRQ_SDHC, + .flags = IORESOURCE_IRQ, + }, +}; + +static struct platform_device mcf_esdhc = { + .name = "sdhci-esdhc-mcf", + .id = 0, + .num_resources = ARRAY_SIZE(mcf_esdhc_resources), + .resource = mcf_esdhc_resources, + .dev.platform_data = &mcf_esdhc_data, +}; +#endif /* IS_ENABLED(CONFIG_MMC) */ + static struct platform_device *mcf_devices[] __initdata = { &mcf_uart, #if IS_ENABLED(CONFIG_FEC) @@ -586,6 +613,9 @@ static struct platform_device *mcf_devices[] __initdata = { #if IS_ENABLED(CONFIG_MCF_EDMA) &mcf_edma, #endif +#if IS_ENABLED(CONFIG_MMC) + &mcf_esdhc, +#endif }; /* @@ -614,4 +644,3 @@ static int __init mcf_init_devices(void) } arch_initcall(mcf_init_devices); - diff --git a/arch/m68k/coldfire/m5441x.c b/arch/m68k/coldfire/m5441x.c index 5bd24c9b865d..ffa02de1a3fb 100644 --- a/arch/m68k/coldfire/m5441x.c +++ b/arch/m68k/coldfire/m5441x.c @@ -52,7 +52,7 @@ DEFINE_CLK(0, "mcfssi.0", 47, MCF_CLK); DEFINE_CLK(0, "pll.0", 48, MCF_CLK); DEFINE_CLK(0, "mcfrng.0", 49, MCF_CLK); DEFINE_CLK(0, "mcfssi.1", 50, MCF_CLK); -DEFINE_CLK(0, "mcfsdhc.0", 51, MCF_CLK); +DEFINE_CLK(0, "sdhci-esdhc-mcf.0", 51, MCF_CLK); DEFINE_CLK(0, "enet-fec.0", 53, MCF_CLK); DEFINE_CLK(0, "enet-fec.1", 54, MCF_CLK); DEFINE_CLK(0, "switch.0", 55, MCF_CLK); @@ -74,6 +74,10 @@ DEFINE_CLK(1, "mcfpwm.0", 34, MCF_BUSCLK); DEFINE_CLK(1, "sys.0", 36, MCF_BUSCLK); DEFINE_CLK(1, "gpio.0", 37, MCF_BUSCLK); +DEFINE_CLK(2, "ipg.0", 0, MCF_CLK); +DEFINE_CLK(2, "ahb.0", 1, MCF_CLK); +DEFINE_CLK(2, "per.0", 2, MCF_CLK); + struct clk *mcf_clks[] = { &__clk_0_2, &__clk_0_8, @@ -131,6 +135,11 @@ struct clk *mcf_clks[] = { &__clk_1_34, &__clk_1_36, &__clk_1_37, + + &__clk_2_0, + &__clk_2_1, + &__clk_2_2, + NULL, }; @@ -151,6 +160,7 @@ static struct clk * const enable_clks[] __initconst = { &__clk_0_33, /* pit.1 */ &__clk_0_37, /* eport */ &__clk_0_48, /* pll */ + &__clk_0_51, /* esdhc */ &__clk_1_36, /* CCM/reset module/Power management */ &__clk_1_37, /* gpio */ diff --git a/arch/m68k/include/asm/m5441xsim.h b/arch/m68k/include/asm/m5441xsim.h index 4892f314ff38..e091e36d3464 100644 --- a/arch/m68k/include/asm/m5441xsim.h +++ b/arch/m68k/include/asm/m5441xsim.h @@ -278,6 +278,13 @@ #define MCFGPIO_IRQ_VECBASE (MCFINT_VECBASE - MCFGPIO_IRQ_MIN) #define MCFGPIO_PIN_MAX 87 +/* + * Phase Locked Loop (PLL) + */ +#define MCF_PLL_CR 0xFC0C0000 +#define MCF_PLL_DR 0xFC0C0004 +#define MCF_PLL_SR 0xFC0C0008 + /* * DSPI module. */ @@ -298,5 +305,13 @@ #define MCFEDMA_IRQ_INTR16 (MCFINT1_VECBASE + MCFEDMA_EDMA_INTR16) #define MCFEDMA_IRQ_INTR56 (MCFINT2_VECBASE + MCFEDMA_EDMA_INTR56) #define MCFEDMA_IRQ_ERR (MCFINT0_VECBASE + MCFINT0_EDMA_ERR) +/* + * esdhc module. 
+ */ +#define MCFSDHC_BASE 0xfc0cc000 +#define MCFSDHC_SIZE 256 +#define MCFINT2_SDHC 31 +#define MCF_IRQ_SDHC (MCFINT2_VECBASE + MCFINT2_SDHC) +#define MCFSDHC_CLK (MCFSDHC_BASE + 0x2c) #endif /* m5441xsim_h */ diff --git a/arch/m68k/include/asm/mcfclk.h b/arch/m68k/include/asm/mcfclk.h index 0aca504fae31..722627e06d66 100644 --- a/arch/m68k/include/asm/mcfclk.h +++ b/arch/m68k/include/asm/mcfclk.h @@ -30,6 +30,8 @@ extern struct clk_ops clk_ops0; extern struct clk_ops clk_ops1; #endif /* MCFPM_PPMCR1 */ +extern struct clk_ops clk_ops2; + #define DEFINE_CLK(clk_bank, clk_name, clk_slot, clk_rate) \ static struct clk __clk_##clk_bank##_##clk_slot = { \ .name = clk_name, \ diff --git a/include/linux/platform_data/mmc-esdhc-mcf.h b/include/linux/platform_data/mmc-esdhc-mcf.h new file mode 100644 index 000000000000..85cb786a62fe --- /dev/null +++ b/include/linux/platform_data/mmc-esdhc-mcf.h @@ -0,0 +1,17 @@ +/* SPDX-License-Identifier: GPL-2.0 */ + +#ifndef __LINUX_PLATFORM_DATA_MCF_ESDHC_H__ +#define __LINUX_PLATFORM_DATA_MCF_ESDHC_H__ + +enum cd_types { + ESDHC_CD_NONE, /* no CD, neither controller nor gpio */ + ESDHC_CD_CONTROLLER, /* mmc controller internal CD */ + ESDHC_CD_PERMANENT, /* no CD, card permanently wired to host */ +}; + +struct mcf_esdhc_platform_data { + int max_bus_width; + int cd_type; +}; + +#endif /* __LINUX_PLATFORM_DATA_MCF_ESDHC_H__ */ -- cgit v1.2.3 From 4bc90f492230af6661bc2021dddd501f7c842334 Mon Sep 17 00:00:00 2001 From: Pali Rohár Date: Fri, 22 May 2020 16:44:02 +0200 Subject: mmc: sdio: Fix macro name for Marvell device with ID 0x9134 MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Marvell SDIO device ID 0x9134 is used in the SDIO Common CIS (Card Information Structure), not in the SDIO wlan function (function ID 1). The SDIO Common CIS is accessed via function ID 0. So rename this misleading macro to SDIO_DEVICE_ID_MARVELL_8887_F0: it does not refer to the wlan function, it refers to function 0. The wlan module on this SDIO card sits at function ID 1 and is identified by a different SDIO device ID, 0x9135. Kernel quirks for SDIO devices are matched against the device ID from the SDIO Common CIS; the device ID used in the quirk is therefore correct, it just had a misleading name. 
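To make the two IDs concrete, here is a minimal, hypothetical sketch of where each one lives from a driver's point of view; example_probe() is illustrative only and not part of this patch, but the struct sdio_func and struct mmc_card fields it reads are the standard ones:

#include <linux/mmc/card.h>
#include <linux/mmc/sdio_func.h>

static int example_probe(struct sdio_func *func,
			 const struct sdio_device_id *id)
{
	/* ID of this particular function: 0x9135 for the 8887 wlan function */
	unsigned short fn_id = func->device;

	/* ID from the Common CIS (function 0): 0x9134 on the 8887 card */
	unsigned short cis_id = func->card->cis.device;

	dev_info(&func->dev, "function ID %#x, Common CIS ID %#x\n",
		 fn_id, cis_id);
	return 0;
}

A quirk keyed on the Common CIS ID, as in the quirks.h hunk below, thus applies to the card as a whole rather than to any single function.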
Signed-off-by: Pali Rohár Link: https://lore.kernel.org/r/20200522144412.19712-2-pali@kernel.org Signed-off-by: Ulf Hansson --- drivers/mmc/core/quirks.h | 2 +- include/linux/mmc/sdio_ids.h | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/drivers/mmc/core/quirks.h b/drivers/mmc/core/quirks.h index 3dba15bccce2..472fa2fdcf13 100644 --- a/drivers/mmc/core/quirks.h +++ b/drivers/mmc/core/quirks.h @@ -139,7 +139,7 @@ static const struct mmc_fixup sdio_fixup_methods[] = { SDIO_FIXUP(SDIO_VENDOR_ID_MARVELL, SDIO_DEVICE_ID_MARVELL_8797_F0, add_quirk, MMC_QUIRK_BROKEN_IRQ_POLLING), - SDIO_FIXUP(SDIO_VENDOR_ID_MARVELL, SDIO_DEVICE_ID_MARVELL_8887WLAN, + SDIO_FIXUP(SDIO_VENDOR_ID_MARVELL, SDIO_DEVICE_ID_MARVELL_8887_F0, add_limit_rate_quirk, 150000000), END_FIXUP diff --git a/include/linux/mmc/sdio_ids.h b/include/linux/mmc/sdio_ids.h index 2e9a6e4634eb..96f43e0dc78f 100644 --- a/include/linux/mmc/sdio_ids.h +++ b/include/linux/mmc/sdio_ids.h @@ -59,7 +59,7 @@ #define SDIO_DEVICE_ID_MARVELL_8688WLAN 0x9104 #define SDIO_DEVICE_ID_MARVELL_8688BT 0x9105 #define SDIO_DEVICE_ID_MARVELL_8797_F0 0x9128 -#define SDIO_DEVICE_ID_MARVELL_8887WLAN 0x9134 +#define SDIO_DEVICE_ID_MARVELL_8887_F0 0x9134 #define SDIO_VENDOR_ID_MEDIATEK 0x037a -- cgit v1.2.3 From b75b7ca7c27dfd61dba368f390b7d4dc20b3a8cb Mon Sep 17 00:00:00 2001 From: Goldwyn Rodrigues Date: Mon, 25 Nov 2019 13:24:03 -0600 Subject: fs: remove dio_end_io() Since we removed the last user of dio_end_io(), remove the helper function dio_end_io(). Reviewed-by: Nikolay Borisov Reviewed-by: Johannes Thumshirn Reviewed-by: Christoph Hellwig Signed-off-by: Goldwyn Rodrigues Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/direct-io.c | 19 ------------------- include/linux/fs.h | 2 -- 2 files changed, 21 deletions(-) (limited to 'include/linux') diff --git a/fs/direct-io.c b/fs/direct-io.c index 00b4d15bb811..c44d60f375bc 100644 --- a/fs/direct-io.c +++ b/fs/direct-io.c @@ -386,25 +386,6 @@ static void dio_bio_end_io(struct bio *bio) spin_unlock_irqrestore(&dio->bio_lock, flags); } -/** - * dio_end_io - handle the end io action for the given bio - * @bio: The direct io bio thats being completed - * - * This is meant to be called by any filesystem that uses their own dio_submit_t - * so that the DIO specific endio actions are dealt with after the filesystem - * has done it's completion work. - */ -void dio_end_io(struct bio *bio) -{ - struct dio *dio = bio->bi_private; - - if (dio->is_async) - dio_bio_end_aio(bio); - else - dio_bio_end_io(bio); -} -EXPORT_SYMBOL_GPL(dio_end_io); - static inline void dio_bio_alloc(struct dio *dio, struct dio_submit *sdio, struct block_device *bdev, diff --git a/include/linux/fs.h b/include/linux/fs.h index 366c533d30cd..e84623d5e173 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -3187,8 +3187,6 @@ enum { DIO_SKIP_HOLES = 0x02, }; -void dio_end_io(struct bio *bio); - ssize_t __blockdev_direct_IO(struct kiocb *iocb, struct inode *inode, struct block_device *bdev, struct iov_iter *iter, get_block_t get_block, -- cgit v1.2.3 From dc35ada4251f183137ee3a524543c9329d7a4fa2 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 28 May 2020 15:41:23 +0200 Subject: block: fix a warning when blkdev.h is included for !CONFIG_BLOCK builds disk_start_io_acct and disk_end_io_acct need at least a struct gendisk forward declaration, but for weird historic reasons much of blkdev.h is stubbed out for CONFIG_BLOCK=n. 
Fix this by stubbing more out for now, but eventually this header will need a massive cleanup. Fixes: 956d510ee78 ("block: add disk/bio-based accounting helpers") Reported-by: Stephen Rothwell Signed-off-by: Christoph Hellwig Signed-off-by: Jens Axboe --- include/linux/blkdev.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 6f7ff0fa8fcf..8fd900998b4e 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -1892,12 +1892,12 @@ static inline void blk_wake_io_task(struct task_struct *waiter) wake_up_process(waiter); } +#ifdef CONFIG_BLOCK unsigned long disk_start_io_acct(struct gendisk *disk, unsigned int sectors, unsigned int op); void disk_end_io_acct(struct gendisk *disk, unsigned int op, unsigned long start_time); -#ifdef CONFIG_BLOCK /** * bio_start_io_acct - start I/O accounting for bio based drivers * @bio: bio to start account for -- cgit v1.2.3 From 1597b374af22266266e1e20612208c4b11359ad4 Mon Sep 17 00:00:00 2001 From: Guenter Roeck Date: Thu, 28 May 2020 17:28:04 +0300 Subject: hwmon: Add notification support For hwmon drivers using the hwmon_device_register_with_info() API, it is desirable to have a generic notification mechanism available. This mechanism can be used to notify userspace as well as the thermal subsystem if the driver experiences any events, such as warning or critical alarms. Implement hwmon_notify_event() to provide this mechanism. The function generates a sysfs event and a udev event. If the device is registered with the thermal subsystem and the event is associated with a temperature sensor, also notify the thermal subsystem that a thermal event occurred. Signed-off-by: Guenter Roeck Signed-off-by: Serge Semin Cc: Maxim Kaurkin Cc: Alexey Malahov Cc: Thomas Bogendoerfer Cc: Arnd Bergmann Cc: Rob Herring Cc: linux-mips@vger.kernel.org Cc: devicetree@vger.kernel.org Signed-off-by: Guenter Roeck --- drivers/hwmon/hwmon.c | 68 ++++++++++++++++++++++++++++++++++++++++++++++++--- include/linux/hwmon.h | 3 +++ 2 files changed, 68 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/drivers/hwmon/hwmon.c b/drivers/hwmon/hwmon.c index dcd4445d4570..3f596a5328da 100644 --- a/drivers/hwmon/hwmon.c +++ b/drivers/hwmon/hwmon.c @@ -15,6 +15,7 @@ #include #include #include +#include #include #include #include @@ -31,7 +32,7 @@ struct hwmon_device { const char *name; struct device dev; const struct hwmon_chip_info *chip; - + struct list_head tzdata; struct attribute_group group; const struct attribute_group **groups; }; @@ -55,12 +56,12 @@ struct hwmon_device_attribute { /* * Thermal zone information - * In addition to the reference to the hwmon device, - * also provides the sensor index. 
*/ struct hwmon_thermal_data { + struct list_head node; /* hwmon tzdata list entry */ struct device *dev; /* Reference to hwmon device */ int index; /* sensor index */ + struct thermal_zone_device *tzd;/* thermal zone device */ }; static ssize_t @@ -156,10 +157,17 @@ static const struct thermal_zone_of_device_ops hwmon_thermal_ops = { .get_temp = hwmon_thermal_get_temp, }; +static void hwmon_thermal_remove_sensor(void *data) +{ + list_del(data); +} + static int hwmon_thermal_add_sensor(struct device *dev, int index) { + struct hwmon_device *hwdev = to_hwmon_device(dev); struct hwmon_thermal_data *tdata; struct thermal_zone_device *tzd; + int err; tdata = devm_kzalloc(dev, sizeof(*tdata), GFP_KERNEL); if (!tdata) @@ -177,6 +185,13 @@ static int hwmon_thermal_add_sensor(struct device *dev, int index) if (IS_ERR(tzd) && (PTR_ERR(tzd) != -ENODEV)) return PTR_ERR(tzd); + err = devm_add_action(dev, hwmon_thermal_remove_sensor, &tdata->node); + if (err) + return err; + + tdata->tzd = tzd; + list_add(&tdata->node, &hwdev->tzdata); + return 0; } @@ -211,11 +226,27 @@ static int hwmon_thermal_register_sensors(struct device *dev) return 0; } +static void hwmon_thermal_notify(struct device *dev, int index) +{ + struct hwmon_device *hwdev = to_hwmon_device(dev); + struct hwmon_thermal_data *tzdata; + + list_for_each_entry(tzdata, &hwdev->tzdata, node) { + if (tzdata->index == index) { + thermal_zone_device_update(tzdata->tzd, + THERMAL_EVENT_UNSPECIFIED); + } + } +} + #else static int hwmon_thermal_register_sensors(struct device *dev) { return 0; } + +static void hwmon_thermal_notify(struct device *dev, int index) { } + #endif /* IS_REACHABLE(CONFIG_THERMAL) && ... */ static int hwmon_attr_base(enum hwmon_sensor_types type) @@ -543,6 +574,35 @@ static const int __templates_size[] = { [hwmon_intrusion] = ARRAY_SIZE(hwmon_intrusion_attr_templates), }; +int hwmon_notify_event(struct device *dev, enum hwmon_sensor_types type, + u32 attr, int channel) +{ + char sattr[MAX_SYSFS_ATTR_NAME_LENGTH]; + const char * const *templates; + const char *template; + int base; + + if (type >= ARRAY_SIZE(__templates)) + return -EINVAL; + if (attr >= __templates_size[type]) + return -EINVAL; + + templates = __templates[type]; + template = templates[attr]; + + base = hwmon_attr_base(type); + + scnprintf(sattr, MAX_SYSFS_ATTR_NAME_LENGTH, template, base + channel); + sysfs_notify(&dev->kobj, NULL, sattr); + kobject_uevent(&dev->kobj, KOBJ_CHANGE); + + if (type == hwmon_temp) + hwmon_thermal_notify(dev, channel); + + return 0; +} +EXPORT_SYMBOL_GPL(hwmon_notify_event); + static int hwmon_num_channel_attrs(const struct hwmon_channel_info *info) { int i, n; @@ -693,6 +753,8 @@ __hwmon_device_register(struct device *dev, const char *name, void *drvdata, if (err) goto free_hwmon; + INIT_LIST_HEAD(&hwdev->tzdata); + if (dev && dev->of_node && chip && chip->ops->read && chip->info[0]->type == hwmon_chip && (chip->info[0]->config[0] & HWMON_C_REGISTER_TZ)) { diff --git a/include/linux/hwmon.h b/include/linux/hwmon.h index 5e609f25878c..363d4a814aa1 100644 --- a/include/linux/hwmon.h +++ b/include/linux/hwmon.h @@ -436,6 +436,9 @@ devm_hwmon_device_register_with_info(struct device *dev, void hwmon_device_unregister(struct device *dev); void devm_hwmon_device_unregister(struct device *dev); +int hwmon_notify_event(struct device *dev, enum hwmon_sensor_types type, + u32 attr, int channel); + /** * hwmon_is_bad_char - Is the char invalid in a hwmon name * @ch: the char to be considered -- cgit v1.2.3 From 
7df915e540ec51ce9ac5f2d2d9801d0356b617da Mon Sep 17 00:00:00 2001 From: Wolfram Sang Date: Mon, 25 May 2020 14:05:04 +0200 Subject: i2c: avoid confusing naming in header i2c_client pointers are usually named 'client'. Use it here to get rid of the ambiguity of 'dev->dev'. Signed-off-by: Wolfram Sang --- include/linux/i2c.h | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/i2c.h b/include/linux/i2c.h index 49d29054e657..c10617bb980a 100644 --- a/include/linux/i2c.h +++ b/include/linux/i2c.h @@ -351,14 +351,14 @@ static inline struct i2c_client *kobj_to_i2c_client(struct kobject *kobj) return to_i2c_client(dev); } -static inline void *i2c_get_clientdata(const struct i2c_client *dev) +static inline void *i2c_get_clientdata(const struct i2c_client *client) { - return dev_get_drvdata(&dev->dev); + return dev_get_drvdata(&client->dev); } -static inline void i2c_set_clientdata(struct i2c_client *dev, void *data) +static inline void i2c_set_clientdata(struct i2c_client *client, void *data) { - dev_set_drvdata(&dev->dev, data); + dev_set_drvdata(&client->dev, data); } /* I2C slave support */ -- cgit v1.2.3 From 655078f5f5286fe0ab38e90c4695001a0e15e9dd Mon Sep 17 00:00:00 2001 From: Sebastian Reichel Date: Wed, 13 May 2020 20:55:57 +0200 Subject: kobject: increase allowed number of uevent variables SBS battery driver exposes 32 power supply properties now, which will result in uevent failure on (un)plugging the battery. Other drivers (e.g. bq27xxx) are also coming close to this limit, so increase it. Acked-by: Greg Kroah-Hartman Reviewed-by: Emil Velikov Signed-off-by: Sebastian Reichel --- include/linux/kobject.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/kobject.h b/include/linux/kobject.h index e2ca0a292e21..75e822569e39 100644 --- a/include/linux/kobject.h +++ b/include/linux/kobject.h @@ -29,7 +29,7 @@ #include #define UEVENT_HELPER_PATH_LEN 256 -#define UEVENT_NUM_ENVP 32 /* number of env pointers */ +#define UEVENT_NUM_ENVP 64 /* number of env pointers */ #define UEVENT_BUFFER_SIZE 2048 /* buffer for the variables */ #ifdef CONFIG_UEVENT_HELPER -- cgit v1.2.3 From bac705abcf345c28e419157cfcd1c44032cc9db2 Mon Sep 17 00:00:00 2001 From: Sebastian Reichel Date: Wed, 13 May 2020 20:55:58 +0200 Subject: power: supply: core: add capacity error margin property Add a property for reporting the error margin expected by fuel gauge chips. Signed-off-by: Sebastian Reichel --- Documentation/ABI/testing/sysfs-class-power | 15 +++++++++++++++ drivers/power/supply/power_supply_sysfs.c | 1 + include/linux/power_supply.h | 1 + 3 files changed, 17 insertions(+) (limited to 'include/linux') diff --git a/Documentation/ABI/testing/sysfs-class-power b/Documentation/ABI/testing/sysfs-class-power index bf3b48f022dc..2f896555ae23 100644 --- a/Documentation/ABI/testing/sysfs-class-power +++ b/Documentation/ABI/testing/sysfs-class-power @@ -74,6 +74,21 @@ Description: Access: Read, Write Valid values: 0 - 100 (percent) +What: /sys/class/power_supply//capacity_error_margin +Date: April 2019 +Contact: linux-pm@vger.kernel.org +Description: + Battery capacity measurement becomes unreliable without + recalibration. This values provides the maximum error + margin expected to exist by the fuel gauge in percent. + Values close to 0% will be returned after (re-)calibration + has happened. Over time the error margin will increase. 
+ 100% means, that the capacity related values are basically + completely useless. + + Access: Read + Valid values: 0 - 100 (percent) + What: /sys/class/power_supply//capacity_level Date: June 2009 Contact: linux-pm@vger.kernel.org diff --git a/drivers/power/supply/power_supply_sysfs.c b/drivers/power/supply/power_supply_sysfs.c index d21b4e0edf38..e664774a2d1e 100644 --- a/drivers/power/supply/power_supply_sysfs.c +++ b/drivers/power/supply/power_supply_sysfs.c @@ -178,6 +178,7 @@ static struct power_supply_attr power_supply_attrs[] = { POWER_SUPPLY_ATTR(CAPACITY), POWER_SUPPLY_ATTR(CAPACITY_ALERT_MIN), POWER_SUPPLY_ATTR(CAPACITY_ALERT_MAX), + POWER_SUPPLY_ATTR(CAPACITY_ERROR_MARGIN), POWER_SUPPLY_ENUM_ATTR(CAPACITY_LEVEL), POWER_SUPPLY_ATTR(TEMP), POWER_SUPPLY_ATTR(TEMP_MAX), diff --git a/include/linux/power_supply.h b/include/linux/power_supply.h index 1f60731ec7fe..453a85f25635 100644 --- a/include/linux/power_supply.h +++ b/include/linux/power_supply.h @@ -139,6 +139,7 @@ enum power_supply_property { POWER_SUPPLY_PROP_CAPACITY, /* in percents! */ POWER_SUPPLY_PROP_CAPACITY_ALERT_MIN, /* in percents! */ POWER_SUPPLY_PROP_CAPACITY_ALERT_MAX, /* in percents! */ + POWER_SUPPLY_PROP_CAPACITY_ERROR_MARGIN, /* in percents! */ POWER_SUPPLY_PROP_CAPACITY_LEVEL, POWER_SUPPLY_PROP_TEMP, POWER_SUPPLY_PROP_TEMP_MAX, -- cgit v1.2.3 From feabe49e46bb556b8d43e28d4a0d459940f7a5cb Mon Sep 17 00:00:00 2001 From: Sebastian Reichel Date: Wed, 13 May 2020 20:55:59 +0200 Subject: power: supply: core: add manufacture date properties Some smart batteries store their manufacture date, which is useful to identify the battery and/or to know about the cell quality. Signed-off-by: Sebastian Reichel --- Documentation/ABI/testing/sysfs-class-power | 28 ++++++++++++++++++++++++++++ drivers/power/supply/power_supply_sysfs.c | 3 +++ include/linux/power_supply.h | 3 +++ 3 files changed, 34 insertions(+) (limited to 'include/linux') diff --git a/Documentation/ABI/testing/sysfs-class-power b/Documentation/ABI/testing/sysfs-class-power index 2f896555ae23..e6d7348766b2 100644 --- a/Documentation/ABI/testing/sysfs-class-power +++ b/Documentation/ABI/testing/sysfs-class-power @@ -680,3 +680,31 @@ Description: Valid values: - 1: enabled - 0: disabled + +What: /sys/class/power_supply//manufacture_year +Date: January 2020 +Contact: linux-pm@vger.kernel.org +Description: + Reports the year (following Gregorian calendar) when the device has been + manufactured. + + Access: Read + Valid values: Reported as integer + +What: /sys/class/power_supply//manufacture_month +Date: January 2020 +Contact: linux-pm@vger.kernel.org +Description: + Reports the month when the device has been manufactured. + + Access: Read + Valid values: 1-12 + +What: /sys/class/power_supply//manufacture_day +Date: January 2020 +Contact: linux-pm@vger.kernel.org +Description: + Reports the day of month when the device has been manufactured. 
+ + Access: Read + Valid values: 1-31 diff --git a/drivers/power/supply/power_supply_sysfs.c b/drivers/power/supply/power_supply_sysfs.c index e664774a2d1e..78d5382e69f1 100644 --- a/drivers/power/supply/power_supply_sysfs.c +++ b/drivers/power/supply/power_supply_sysfs.c @@ -198,6 +198,9 @@ static struct power_supply_attr power_supply_attrs[] = { POWER_SUPPLY_ATTR(PRECHARGE_CURRENT), POWER_SUPPLY_ATTR(CHARGE_TERM_CURRENT), POWER_SUPPLY_ATTR(CALIBRATE), + POWER_SUPPLY_ATTR(MANUFACTURE_YEAR), + POWER_SUPPLY_ATTR(MANUFACTURE_MONTH), + POWER_SUPPLY_ATTR(MANUFACTURE_DAY), /* Properties of type `const char *' */ POWER_SUPPLY_ATTR(MODEL_NAME), POWER_SUPPLY_ATTR(MANUFACTURER), diff --git a/include/linux/power_supply.h b/include/linux/power_supply.h index 453a85f25635..63ffe2a0a87b 100644 --- a/include/linux/power_supply.h +++ b/include/linux/power_supply.h @@ -159,6 +159,9 @@ enum power_supply_property { POWER_SUPPLY_PROP_PRECHARGE_CURRENT, POWER_SUPPLY_PROP_CHARGE_TERM_CURRENT, POWER_SUPPLY_PROP_CALIBRATE, + POWER_SUPPLY_PROP_MANUFACTURE_YEAR, + POWER_SUPPLY_PROP_MANUFACTURE_MONTH, + POWER_SUPPLY_PROP_MANUFACTURE_DAY, /* Properties of type `const char *' */ POWER_SUPPLY_PROP_MODEL_NAME, POWER_SUPPLY_PROP_MANUFACTURER, -- cgit v1.2.3 From 601c2a543f02da484362b3ff9074b2cfe08750de Mon Sep 17 00:00:00 2001 From: Sebastian Reichel Date: Wed, 13 May 2020 20:56:00 +0200 Subject: power: supply: core: add POWER_SUPPLY_HEALTH_CALIBRATION_REQUIRED Some battery fuel gauges know when the battery needs to be recalibrated before providing usable values. This should be reported via the health property. Signed-off-by: Sebastian Reichel --- Documentation/ABI/testing/sysfs-class-power | 2 +- drivers/power/supply/power_supply_sysfs.c | 1 + include/linux/power_supply.h | 1 + 3 files changed, 3 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/Documentation/ABI/testing/sysfs-class-power b/Documentation/ABI/testing/sysfs-class-power index e6d7348766b2..216d61a22f1e 100644 --- a/Documentation/ABI/testing/sysfs-class-power +++ b/Documentation/ABI/testing/sysfs-class-power @@ -205,7 +205,7 @@ Description: Valid values: "Unknown", "Good", "Overheat", "Dead", "Over voltage", "Unspecified failure", "Cold", "Watchdog timer expire", "Safety timer expire", - "Over current" + "Over current", "Calibration required" What: /sys/class/power_supply//precharge_current Date: June 2017 diff --git a/drivers/power/supply/power_supply_sysfs.c b/drivers/power/supply/power_supply_sysfs.c index 78d5382e69f1..bc79560229b5 100644 --- a/drivers/power/supply/power_supply_sysfs.c +++ b/drivers/power/supply/power_supply_sysfs.c @@ -100,6 +100,7 @@ static const char * const POWER_SUPPLY_HEALTH_TEXT[] = { [POWER_SUPPLY_HEALTH_WATCHDOG_TIMER_EXPIRE] = "Watchdog timer expire", [POWER_SUPPLY_HEALTH_SAFETY_TIMER_EXPIRE] = "Safety timer expire", [POWER_SUPPLY_HEALTH_OVERCURRENT] = "Over current", + [POWER_SUPPLY_HEALTH_CALIBRATION_REQUIRED] = "Calibration required", }; static const char * const POWER_SUPPLY_TECHNOLOGY_TEXT[] = { diff --git a/include/linux/power_supply.h b/include/linux/power_supply.h index 63ffe2a0a87b..ac1345a48ad0 100644 --- a/include/linux/power_supply.h +++ b/include/linux/power_supply.h @@ -61,6 +61,7 @@ enum { POWER_SUPPLY_HEALTH_WATCHDOG_TIMER_EXPIRE, POWER_SUPPLY_HEALTH_SAFETY_TIMER_EXPIRE, POWER_SUPPLY_HEALTH_OVERCURRENT, + POWER_SUPPLY_HEALTH_CALIBRATION_REQUIRED, }; enum { -- cgit v1.2.3 From db10538a4b997a77a1fd561adaaa58afc7dcfa2f Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 28 May 2020 
07:12:18 +0200 Subject: tcp: add tcp_sock_set_cork Add a helper to directly set the TCP_CORK sockopt from kernel space without going through a fake uaccess. Cleanup the callers to avoid pointless wrappers now that this is a simple function call. Signed-off-by: Christoph Hellwig Signed-off-by: David S. Miller --- drivers/block/drbd/drbd_int.h | 14 ----------- drivers/block/drbd/drbd_receiver.c | 4 +-- drivers/block/drbd/drbd_worker.c | 6 ++--- fs/cifs/transport.c | 8 ++---- include/linux/tcp.h | 2 ++ net/ipv4/tcp.c | 51 ++++++++++++++++++++++++-------------- net/rds/tcp_send.c | 9 ++----- 7 files changed, 43 insertions(+), 51 deletions(-) (limited to 'include/linux') diff --git a/drivers/block/drbd/drbd_int.h b/drivers/block/drbd/drbd_int.h index aae99a2d7bd4..3550adc93c68 100644 --- a/drivers/block/drbd/drbd_int.h +++ b/drivers/block/drbd/drbd_int.h @@ -1570,20 +1570,6 @@ extern void drbd_set_recv_tcq(struct drbd_device *device, int tcq_enabled); extern void _drbd_clear_done_ee(struct drbd_device *device, struct list_head *to_be_freed); extern int drbd_connected(struct drbd_peer_device *); -static inline void drbd_tcp_cork(struct socket *sock) -{ - int val = 1; - (void) kernel_setsockopt(sock, SOL_TCP, TCP_CORK, - (char*)&val, sizeof(val)); -} - -static inline void drbd_tcp_uncork(struct socket *sock) -{ - int val = 0; - (void) kernel_setsockopt(sock, SOL_TCP, TCP_CORK, - (char*)&val, sizeof(val)); -} - static inline void drbd_tcp_nodelay(struct socket *sock) { int val = 1; diff --git a/drivers/block/drbd/drbd_receiver.c b/drivers/block/drbd/drbd_receiver.c index c15e7083b13a..55ea907ad33c 100644 --- a/drivers/block/drbd/drbd_receiver.c +++ b/drivers/block/drbd/drbd_receiver.c @@ -6162,7 +6162,7 @@ void drbd_send_acks_wf(struct work_struct *ws) rcu_read_unlock(); if (tcp_cork) - drbd_tcp_cork(connection->meta.socket); + tcp_sock_set_cork(connection->meta.socket->sk, true); err = drbd_finish_peer_reqs(device); kref_put(&device->kref, drbd_destroy_device); @@ -6175,7 +6175,7 @@ void drbd_send_acks_wf(struct work_struct *ws) } if (tcp_cork) - drbd_tcp_uncork(connection->meta.socket); + tcp_sock_set_cork(connection->meta.socket->sk, false); return; } diff --git a/drivers/block/drbd/drbd_worker.c b/drivers/block/drbd/drbd_worker.c index 0dc019da1f8d..2b89c9f2ca70 100644 --- a/drivers/block/drbd/drbd_worker.c +++ b/drivers/block/drbd/drbd_worker.c @@ -2098,7 +2098,7 @@ static void wait_for_work(struct drbd_connection *connection, struct list_head * if (uncork) { mutex_lock(&connection->data.mutex); if (connection->data.socket) - drbd_tcp_uncork(connection->data.socket); + tcp_sock_set_cork(connection->data.socket->sk, false); mutex_unlock(&connection->data.mutex); } @@ -2153,9 +2153,9 @@ static void wait_for_work(struct drbd_connection *connection, struct list_head * mutex_lock(&connection->data.mutex); if (connection->data.socket) { if (cork) - drbd_tcp_cork(connection->data.socket); + tcp_sock_set_cork(connection->data.socket->sk, true); else if (!uncork) - drbd_tcp_uncork(connection->data.socket); + tcp_sock_set_cork(connection->data.socket->sk, false); } mutex_unlock(&connection->data.mutex); } diff --git a/fs/cifs/transport.c b/fs/cifs/transport.c index c97570eb2c18..99760063e000 100644 --- a/fs/cifs/transport.c +++ b/fs/cifs/transport.c @@ -325,7 +325,6 @@ __smb_send_rqst(struct TCP_Server_Info *server, int num_rqst, size_t total_len = 0, sent, size; struct socket *ssocket = server->ssocket; struct msghdr smb_msg; - int val = 1; __be32 rfc1002_marker; if (cifs_rdma_enabled(server)) { @@ 
-345,8 +344,7 @@ __smb_send_rqst(struct TCP_Server_Info *server, int num_rqst, } /* cork the socket */ - kernel_setsockopt(ssocket, SOL_TCP, TCP_CORK, - (char *)&val, sizeof(val)); + tcp_sock_set_cork(ssocket->sk, true); for (j = 0; j < num_rqst; j++) send_length += smb_rqst_len(server, &rqst[j]); @@ -435,9 +433,7 @@ unmask: } /* uncork it */ - val = 0; - kernel_setsockopt(ssocket, SOL_TCP, TCP_CORK, - (char *)&val, sizeof(val)); + tcp_sock_set_cork(ssocket->sk, false); if ((total_len > 0) && (total_len != send_length)) { cifs_dbg(FYI, "partial send (wanted=%u sent=%zu): terminating session\n", diff --git a/include/linux/tcp.h b/include/linux/tcp.h index bf44e85d709d..889eeb2256c2 100644 --- a/include/linux/tcp.h +++ b/include/linux/tcp.h @@ -497,4 +497,6 @@ static inline u16 tcp_mss_clamp(const struct tcp_sock *tp, u16 mss) int tcp_skb_shift(struct sk_buff *to, struct sk_buff *from, int pcount, int shiftlen); +void tcp_sock_set_cork(struct sock *sk, bool on); + #endif /* _LINUX_TCP_H */ diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index 970064996377..e6cf702e16d6 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -2801,6 +2801,37 @@ static void tcp_enable_tx_delay(void) } } +/* When set indicates to always queue non-full frames. Later the user clears + * this option and we transmit any pending partial frames in the queue. This is + * meant to be used alongside sendfile() to get properly filled frames when the + * user (for example) must write out headers with a write() call first and then + * use sendfile to send out the data parts. + * + * TCP_CORK can be set together with TCP_NODELAY and it is stronger than + * TCP_NODELAY. + */ +static void __tcp_sock_set_cork(struct sock *sk, bool on) +{ + struct tcp_sock *tp = tcp_sk(sk); + + if (on) { + tp->nonagle |= TCP_NAGLE_CORK; + } else { + tp->nonagle &= ~TCP_NAGLE_CORK; + if (tp->nonagle & TCP_NAGLE_OFF) + tp->nonagle |= TCP_NAGLE_PUSH; + tcp_push_pending_frames(sk); + } +} + +void tcp_sock_set_cork(struct sock *sk, bool on) +{ + lock_sock(sk); + __tcp_sock_set_cork(sk, on); + release_sock(sk); +} +EXPORT_SYMBOL(tcp_sock_set_cork); + /* * Socket option code for TCP. */ @@ -2979,25 +3010,7 @@ static int do_tcp_setsockopt(struct sock *sk, int level, break; case TCP_CORK: - /* When set indicates to always queue non-full frames. - * Later the user clears this option and we transmit - * any pending partial frames in the queue. This is - * meant to be used alongside sendfile() to get properly - * filled frames when the user (for example) must write - * out headers with a write() call first and then use - * sendfile to send out the data parts. - * - * TCP_CORK can be set together with TCP_NODELAY and it is - * stronger than TCP_NODELAY. 
- */ - if (val) { - tp->nonagle |= TCP_NAGLE_CORK; - } else { - tp->nonagle &= ~TCP_NAGLE_CORK; - if (tp->nonagle&TCP_NAGLE_OFF) - tp->nonagle |= TCP_NAGLE_PUSH; - tcp_push_pending_frames(sk); - } + __tcp_sock_set_cork(sk, val); break; case TCP_KEEPIDLE: diff --git a/net/rds/tcp_send.c b/net/rds/tcp_send.c index 78a2554a4497..8c4d1d6e9249 100644 --- a/net/rds/tcp_send.c +++ b/net/rds/tcp_send.c @@ -38,23 +38,18 @@ #include "rds.h" #include "tcp.h" -static void rds_tcp_cork(struct socket *sock, int val) -{ - kernel_setsockopt(sock, SOL_TCP, TCP_CORK, (void *)&val, sizeof(val)); -} - void rds_tcp_xmit_path_prepare(struct rds_conn_path *cp) { struct rds_tcp_connection *tc = cp->cp_transport_data; - rds_tcp_cork(tc->t_sock, 1); + tcp_sock_set_cork(tc->t_sock->sk, true); } void rds_tcp_xmit_path_complete(struct rds_conn_path *cp) { struct rds_tcp_connection *tc = cp->cp_transport_data; - rds_tcp_cork(tc->t_sock, 0); + tcp_sock_set_cork(tc->t_sock->sk, false); } /* the core send_sem serializes this with other xmit and shutdown */ -- cgit v1.2.3 From 12abc5ee7873a085cc280240822b8ac53c86fecd Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 28 May 2020 07:12:19 +0200 Subject: tcp: add tcp_sock_set_nodelay Add a helper to directly set the TCP_NODELAY sockopt from kernel space without going through a fake uaccess. Cleanup the callers to avoid pointless wrappers now that this is a simple function call. Signed-off-by: Christoph Hellwig Acked-by: Sagi Grimberg Acked-by: Jason Gunthorpe Signed-off-by: David S. Miller --- drivers/block/drbd/drbd_int.h | 7 ------ drivers/block/drbd/drbd_main.c | 2 +- drivers/block/drbd/drbd_receiver.c | 4 ++-- drivers/infiniband/sw/siw/siw_cm.c | 24 ++++--------------- drivers/nvme/host/tcp.c | 9 +------ drivers/nvme/target/tcp.c | 12 ++-------- drivers/target/iscsi/iscsi_target_login.c | 15 +++--------- fs/cifs/connect.c | 10 ++------ fs/dlm/lowcomms.c | 8 ++----- fs/ocfs2/cluster/tcp.c | 20 ++-------------- include/linux/tcp.h | 1 + net/ceph/messenger.c | 11 ++------- net/ipv4/tcp.c | 39 ++++++++++++++++++++----------- net/rds/tcp.c | 11 +-------- net/rds/tcp.h | 1 - net/rds/tcp_listen.c | 2 +- 16 files changed, 49 insertions(+), 127 deletions(-) (limited to 'include/linux') diff --git a/drivers/block/drbd/drbd_int.h b/drivers/block/drbd/drbd_int.h index 3550adc93c68..e24bba87c8e0 100644 --- a/drivers/block/drbd/drbd_int.h +++ b/drivers/block/drbd/drbd_int.h @@ -1570,13 +1570,6 @@ extern void drbd_set_recv_tcq(struct drbd_device *device, int tcq_enabled); extern void _drbd_clear_done_ee(struct drbd_device *device, struct list_head *to_be_freed); extern int drbd_connected(struct drbd_peer_device *); -static inline void drbd_tcp_nodelay(struct socket *sock) -{ - int val = 1; - (void) kernel_setsockopt(sock, SOL_TCP, TCP_NODELAY, - (char*)&val, sizeof(val)); -} - static inline void drbd_tcp_quickack(struct socket *sock) { int val = 2; diff --git a/drivers/block/drbd/drbd_main.c b/drivers/block/drbd/drbd_main.c index c094c3c2c5d4..45fbd526c453 100644 --- a/drivers/block/drbd/drbd_main.c +++ b/drivers/block/drbd/drbd_main.c @@ -660,7 +660,7 @@ static int __send_command(struct drbd_connection *connection, int vnr, /* DRBD protocol "pings" are latency critical. 
* This is supposed to trigger tcp_push_pending_frames() */ if (!err && (cmd == P_PING || cmd == P_PING_ACK)) - drbd_tcp_nodelay(sock->socket); + tcp_sock_set_nodelay(sock->socket->sk); return err; } diff --git a/drivers/block/drbd/drbd_receiver.c b/drivers/block/drbd/drbd_receiver.c index 55ea907ad33c..20a5e94494ac 100644 --- a/drivers/block/drbd/drbd_receiver.c +++ b/drivers/block/drbd/drbd_receiver.c @@ -1051,8 +1051,8 @@ randomize: /* we don't want delays. * we use TCP_CORK where appropriate, though */ - drbd_tcp_nodelay(sock.socket); - drbd_tcp_nodelay(msock.socket); + tcp_sock_set_nodelay(sock.socket->sk); + tcp_sock_set_nodelay(msock.socket->sk); connection->data.socket = sock.socket; connection->meta.socket = msock.socket; diff --git a/drivers/infiniband/sw/siw/siw_cm.c b/drivers/infiniband/sw/siw/siw_cm.c index d1860f3e8740..1662216be66d 100644 --- a/drivers/infiniband/sw/siw/siw_cm.c +++ b/drivers/infiniband/sw/siw/siw_cm.c @@ -947,16 +947,8 @@ static void siw_accept_newconn(struct siw_cep *cep) siw_cep_get(new_cep); new_s->sk->sk_user_data = new_cep; - if (siw_tcp_nagle == false) { - int val = 1; - - rv = kernel_setsockopt(new_s, SOL_TCP, TCP_NODELAY, - (char *)&val, sizeof(val)); - if (rv) { - siw_dbg_cep(cep, "setsockopt NODELAY error: %d\n", rv); - goto error; - } - } + if (siw_tcp_nagle == false) + tcp_sock_set_nodelay(new_s->sk); new_cep->state = SIW_EPSTATE_AWAIT_MPAREQ; rv = siw_cm_queue_work(new_cep, SIW_CM_WORK_MPATIMEOUT); @@ -1386,16 +1378,8 @@ int siw_connect(struct iw_cm_id *id, struct iw_cm_conn_param *params) siw_dbg_qp(qp, "kernel_bindconnect: error %d\n", rv); goto error; } - if (siw_tcp_nagle == false) { - int val = 1; - - rv = kernel_setsockopt(s, SOL_TCP, TCP_NODELAY, (char *)&val, - sizeof(val)); - if (rv) { - siw_dbg_qp(qp, "setsockopt NODELAY error: %d\n", rv); - goto error; - } - } + if (siw_tcp_nagle == false) + tcp_sock_set_nodelay(s->sk); cep = siw_cep_alloc(sdev); if (!cep) { rv = -ENOMEM; diff --git a/drivers/nvme/host/tcp.c b/drivers/nvme/host/tcp.c index a307972d33a0..4e4a750ecdb9 100644 --- a/drivers/nvme/host/tcp.c +++ b/drivers/nvme/host/tcp.c @@ -1346,14 +1346,7 @@ static int nvme_tcp_alloc_queue(struct nvme_ctrl *nctrl, } /* Set TCP no delay */ - opt = 1; - ret = kernel_setsockopt(queue->sock, IPPROTO_TCP, - TCP_NODELAY, (char *)&opt, sizeof(opt)); - if (ret) { - dev_err(nctrl->device, - "failed to set TCP_NODELAY sock opt %d\n", ret); - goto err_sock; - } + tcp_sock_set_nodelay(queue->sock->sk); /* * Cleanup whatever is sitting in the TCP transmit queue on socket diff --git a/drivers/nvme/target/tcp.c b/drivers/nvme/target/tcp.c index f3088156d01d..55bc4c3c0a74 100644 --- a/drivers/nvme/target/tcp.c +++ b/drivers/nvme/target/tcp.c @@ -1580,7 +1580,7 @@ static int nvmet_tcp_add_port(struct nvmet_port *nport) { struct nvmet_tcp_port *port; __kernel_sa_family_t af; - int opt, ret; + int ret; port = kzalloc(sizeof(*port), GFP_KERNEL); if (!port) @@ -1625,15 +1625,7 @@ static int nvmet_tcp_add_port(struct nvmet_port *nport) port->data_ready = port->sock->sk->sk_data_ready; port->sock->sk->sk_data_ready = nvmet_tcp_listen_data_ready; sock_set_reuseaddr(port->sock->sk); - - opt = 1; - ret = kernel_setsockopt(port->sock, IPPROTO_TCP, - TCP_NODELAY, (char *)&opt, sizeof(opt)); - if (ret) { - pr_err("failed to set TCP_NODELAY sock opt %d\n", ret); - goto err_sock; - } - + tcp_sock_set_nodelay(port->sock->sk); if (so_priority > 0) sock_set_priority(port->sock->sk, so_priority); diff --git a/drivers/target/iscsi/iscsi_target_login.c 
b/drivers/target/iscsi/iscsi_target_login.c index 91acb3f07b4c..b561b07a869a 100644 --- a/drivers/target/iscsi/iscsi_target_login.c +++ b/drivers/target/iscsi/iscsi_target_login.c @@ -897,20 +897,11 @@ int iscsit_setup_np( /* * Set SO_REUSEADDR, and disable Nagel Algorithm with TCP_NODELAY. */ - /* FIXME: Someone please explain why this is endian-safe */ - opt = 1; - if (np->np_network_transport == ISCSI_TCP) { - ret = kernel_setsockopt(sock, IPPROTO_TCP, TCP_NODELAY, - (char *)&opt, sizeof(opt)); - if (ret < 0) { - pr_err("kernel_setsockopt() for TCP_NODELAY" - " failed: %d\n", ret); - goto fail; - } - } - + if (np->np_network_transport == ISCSI_TCP) + tcp_sock_set_nodelay(sock->sk); sock_set_reuseaddr(sock->sk); + opt = 1; ret = kernel_setsockopt(sock, IPPROTO_IP, IP_FREEBIND, (char *)&opt, sizeof(opt)); if (ret < 0) { diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c index 28268ed461b8..ad8fb53b3682 100644 --- a/fs/cifs/connect.c +++ b/fs/cifs/connect.c @@ -3929,14 +3929,8 @@ generic_ip_connect(struct TCP_Server_Info *server) socket->sk->sk_rcvbuf = 140 * 1024; } - if (server->tcp_nodelay) { - int val = 1; - rc = kernel_setsockopt(socket, SOL_TCP, TCP_NODELAY, - (char *)&val, sizeof(val)); - if (rc) - cifs_dbg(FYI, "set TCP_NODELAY socket option error %d\n", - rc); - } + if (server->tcp_nodelay) + tcp_sock_set_nodelay(socket->sk); cifs_dbg(FYI, "sndbuf %d rcvbuf %d rcvtimeo 0x%lx\n", socket->sk->sk_sndbuf, diff --git a/fs/dlm/lowcomms.c b/fs/dlm/lowcomms.c index 2822a430a2b4..69333728d871 100644 --- a/fs/dlm/lowcomms.c +++ b/fs/dlm/lowcomms.c @@ -1011,7 +1011,6 @@ static void tcp_connect_to_sock(struct connection *con) struct sockaddr_storage saddr, src_addr; int addr_len; struct socket *sock = NULL; - int one = 1; int result; if (con->nodeid == 0) { @@ -1060,8 +1059,7 @@ static void tcp_connect_to_sock(struct connection *con) log_print("connecting to %d", con->nodeid); /* Turn off Nagle's algorithm */ - kernel_setsockopt(sock, SOL_TCP, TCP_NODELAY, (char *)&one, - sizeof(one)); + tcp_sock_set_nodelay(sock->sk); result = sock->ops->connect(sock, (struct sockaddr *)&saddr, addr_len, O_NONBLOCK); @@ -1103,7 +1101,6 @@ static struct socket *tcp_create_listen_sock(struct connection *con, { struct socket *sock = NULL; int result = 0; - int one = 1; int addr_len; if (dlm_local_addr[0]->ss_family == AF_INET) @@ -1120,8 +1117,7 @@ static struct socket *tcp_create_listen_sock(struct connection *con, } /* Turn off Nagle's algorithm */ - kernel_setsockopt(sock, SOL_TCP, TCP_NODELAY, (char *)&one, - sizeof(one)); + tcp_sock_set_nodelay(sock->sk); sock_set_reuseaddr(sock->sk); diff --git a/fs/ocfs2/cluster/tcp.c b/fs/ocfs2/cluster/tcp.c index 2c512b40a940..4c70fe9d19ab 100644 --- a/fs/ocfs2/cluster/tcp.c +++ b/fs/ocfs2/cluster/tcp.c @@ -1441,14 +1441,6 @@ static void o2net_rx_until_empty(struct work_struct *work) sc_put(sc); } -static int o2net_set_nodelay(struct socket *sock) -{ - int val = 1; - - return kernel_setsockopt(sock, SOL_TCP, TCP_NODELAY, - (void *)&val, sizeof(val)); -} - static int o2net_set_usertimeout(struct socket *sock) { int user_timeout = O2NET_TCP_USER_TIMEOUT; @@ -1636,11 +1628,7 @@ static void o2net_start_connect(struct work_struct *work) goto out; } - ret = o2net_set_nodelay(sc->sc_sock); - if (ret) { - mlog(ML_ERROR, "setting TCP_NODELAY failed with %d\n", ret); - goto out; - } + tcp_sock_set_nodelay(sc->sc_sock->sk); ret = o2net_set_usertimeout(sock); if (ret) { @@ -1832,11 +1820,7 @@ static int o2net_accept_one(struct socket *sock, int *more) *more = 1; 
new_sock->sk->sk_allocation = GFP_ATOMIC; - ret = o2net_set_nodelay(new_sock); - if (ret) { - mlog(ML_ERROR, "setting TCP_NODELAY failed with %d\n", ret); - goto out; - } + tcp_sock_set_nodelay(new_sock->sk); ret = o2net_set_usertimeout(new_sock); if (ret) { diff --git a/include/linux/tcp.h b/include/linux/tcp.h index 889eeb2256c2..9e42c7fe50a8 100644 --- a/include/linux/tcp.h +++ b/include/linux/tcp.h @@ -498,5 +498,6 @@ int tcp_skb_shift(struct sk_buff *to, struct sk_buff *from, int pcount, int shiftlen); void tcp_sock_set_cork(struct sock *sk, bool on); +void tcp_sock_set_nodelay(struct sock *sk); #endif /* _LINUX_TCP_H */ diff --git a/net/ceph/messenger.c b/net/ceph/messenger.c index f8ca5edc5f2c..27d6ab11f9ee 100644 --- a/net/ceph/messenger.c +++ b/net/ceph/messenger.c @@ -490,15 +490,8 @@ static int ceph_tcp_connect(struct ceph_connection *con) return ret; } - if (ceph_test_opt(from_msgr(con->msgr), TCP_NODELAY)) { - int optval = 1; - - ret = kernel_setsockopt(sock, SOL_TCP, TCP_NODELAY, - (char *)&optval, sizeof(optval)); - if (ret) - pr_err("kernel_setsockopt(TCP_NODELAY) failed: %d", - ret); - } + if (ceph_test_opt(from_msgr(con->msgr), TCP_NODELAY)) + tcp_sock_set_nodelay(sock->sk); con->sock = sock; return 0; diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index e6cf702e16d6..a65f293a19fa 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -2832,6 +2832,30 @@ void tcp_sock_set_cork(struct sock *sk, bool on) } EXPORT_SYMBOL(tcp_sock_set_cork); +/* TCP_NODELAY is weaker than TCP_CORK, so that this option on corked socket is + * remembered, but it is not activated until cork is cleared. + * + * However, when TCP_NODELAY is set we make an explicit push, which overrides + * even TCP_CORK for currently queued segments. + */ +static void __tcp_sock_set_nodelay(struct sock *sk, bool on) +{ + if (on) { + tcp_sk(sk)->nonagle |= TCP_NAGLE_OFF|TCP_NAGLE_PUSH; + tcp_push_pending_frames(sk); + } else { + tcp_sk(sk)->nonagle &= ~TCP_NAGLE_OFF; + } +} + +void tcp_sock_set_nodelay(struct sock *sk) +{ + lock_sock(sk); + __tcp_sock_set_nodelay(sk, true); + release_sock(sk); +} +EXPORT_SYMBOL(tcp_sock_set_nodelay); + /* * Socket option code for TCP. */ @@ -2929,20 +2953,7 @@ static int do_tcp_setsockopt(struct sock *sk, int level, break; case TCP_NODELAY: - if (val) { - /* TCP_NODELAY is weaker than TCP_CORK, so that - * this option on corked socket is remembered, but - * it is not activated until cork is cleared. - * - * However, when TCP_NODELAY is set we make - * an explicit push, which overrides even TCP_CORK - * for currently queued segments. 
- */ - tp->nonagle |= TCP_NAGLE_OFF|TCP_NAGLE_PUSH; - tcp_push_pending_frames(sk); - } else { - tp->nonagle &= ~TCP_NAGLE_OFF; - } + __tcp_sock_set_nodelay(sk, val); break; case TCP_THIN_LINEAR_TIMEOUTS: diff --git a/net/rds/tcp.c b/net/rds/tcp.c index 46782fac4c16..43db0eca911f 100644 --- a/net/rds/tcp.c +++ b/net/rds/tcp.c @@ -89,15 +89,6 @@ static struct ctl_table rds_tcp_sysctl_table[] = { { } }; -/* doing it this way avoids calling tcp_sk() */ -void rds_tcp_nonagle(struct socket *sock) -{ - int val = 1; - - kernel_setsockopt(sock, SOL_TCP, TCP_NODELAY, (void *)&val, - sizeof(val)); -} - u32 rds_tcp_write_seq(struct rds_tcp_connection *tc) { /* seq# of the last byte of data in tcp send buffer */ @@ -502,7 +493,7 @@ void rds_tcp_tune(struct socket *sock) struct net *net = sock_net(sk); struct rds_tcp_net *rtn = net_generic(net, rds_tcp_netid); - rds_tcp_nonagle(sock); + tcp_sock_set_nodelay(sock->sk); lock_sock(sk); if (rtn->sndbuf_size > 0) { sk->sk_sndbuf = rtn->sndbuf_size; diff --git a/net/rds/tcp.h b/net/rds/tcp.h index d640e210b97b..f6d75d8cb167 100644 --- a/net/rds/tcp.h +++ b/net/rds/tcp.h @@ -50,7 +50,6 @@ struct rds_tcp_statistics { /* tcp.c */ void rds_tcp_tune(struct socket *sock); -void rds_tcp_nonagle(struct socket *sock); void rds_tcp_set_callbacks(struct socket *sock, struct rds_conn_path *cp); void rds_tcp_reset_callbacks(struct socket *sock, struct rds_conn_path *cp); void rds_tcp_restore_callbacks(struct socket *sock, diff --git a/net/rds/tcp_listen.c b/net/rds/tcp_listen.c index d8bd13276959..6f90ea077adc 100644 --- a/net/rds/tcp_listen.c +++ b/net/rds/tcp_listen.c @@ -288,7 +288,7 @@ struct socket *rds_tcp_listen_init(struct net *net, bool isv6) } sock->sk->sk_reuse = SK_CAN_REUSE; - rds_tcp_nonagle(sock); + tcp_sock_set_nodelay(sock->sk); write_lock_bh(&sock->sk->sk_callback_lock); sock->sk->sk_user_data = sock->sk->sk_data_ready; -- cgit v1.2.3 From ddd061b8daed3ce0c01109a69c9a2a9f9669f01a Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 28 May 2020 07:12:20 +0200 Subject: tcp: add tcp_sock_set_quickack Add a helper to directly set the TCP_QUICKACK sockopt from kernel space without going through a fake uaccess. Cleanup the callers to avoid pointless wrappers now that this is a simple function call. Signed-off-by: Christoph Hellwig Signed-off-by: David S. 
Miller --- drivers/block/drbd/drbd_int.h | 7 ------- drivers/block/drbd/drbd_receiver.c | 5 ++--- include/linux/tcp.h | 1 + net/ipv4/tcp.c | 39 +++++++++++++++++++++++++------------- 4 files changed, 29 insertions(+), 23 deletions(-) (limited to 'include/linux') diff --git a/drivers/block/drbd/drbd_int.h b/drivers/block/drbd/drbd_int.h index e24bba87c8e0..14345a87c7cc 100644 --- a/drivers/block/drbd/drbd_int.h +++ b/drivers/block/drbd/drbd_int.h @@ -1570,13 +1570,6 @@ extern void drbd_set_recv_tcq(struct drbd_device *device, int tcq_enabled); extern void _drbd_clear_done_ee(struct drbd_device *device, struct list_head *to_be_freed); extern int drbd_connected(struct drbd_peer_device *); -static inline void drbd_tcp_quickack(struct socket *sock) -{ - int val = 2; - (void) kernel_setsockopt(sock, SOL_TCP, TCP_QUICKACK, - (char*)&val, sizeof(val)); -} - /* sets the number of 512 byte sectors of our virtual device */ void drbd_set_my_capacity(struct drbd_device *device, sector_t size); diff --git a/drivers/block/drbd/drbd_receiver.c b/drivers/block/drbd/drbd_receiver.c index 20a5e94494ac..3a3f2b6a821f 100644 --- a/drivers/block/drbd/drbd_receiver.c +++ b/drivers/block/drbd/drbd_receiver.c @@ -1223,7 +1223,7 @@ static int drbd_recv_header_maybe_unplug(struct drbd_connection *connection, str * quickly as possible, and let remote TCP know what we have * received so far. */ if (err == -EAGAIN) { - drbd_tcp_quickack(connection->data.socket); + tcp_sock_set_quickack(connection->data.socket->sk, 2); drbd_unplug_all_devices(connection); } if (err > 0) { @@ -4959,8 +4959,7 @@ static int receive_UnplugRemote(struct drbd_connection *connection, struct packe { /* Make sure we've acked all the TCP data associated * with the data requests being unplugged */ - drbd_tcp_quickack(connection->data.socket); - + tcp_sock_set_quickack(connection->data.socket->sk, 2); return 0; } diff --git a/include/linux/tcp.h b/include/linux/tcp.h index 9e42c7fe50a8..2eaf8320b9db 100644 --- a/include/linux/tcp.h +++ b/include/linux/tcp.h @@ -499,5 +499,6 @@ int tcp_skb_shift(struct sk_buff *to, struct sk_buff *from, int pcount, void tcp_sock_set_cork(struct sock *sk, bool on); void tcp_sock_set_nodelay(struct sock *sk); +void tcp_sock_set_quickack(struct sock *sk, int val); #endif /* _LINUX_TCP_H */ diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index a65f293a19fa..27b5e7a4e2ef 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -2856,6 +2856,31 @@ void tcp_sock_set_nodelay(struct sock *sk) } EXPORT_SYMBOL(tcp_sock_set_nodelay); +static void __tcp_sock_set_quickack(struct sock *sk, int val) +{ + if (!val) { + inet_csk_enter_pingpong_mode(sk); + return; + } + + inet_csk_exit_pingpong_mode(sk); + if ((1 << sk->sk_state) & (TCPF_ESTABLISHED | TCPF_CLOSE_WAIT) && + inet_csk_ack_scheduled(sk)) { + inet_csk(sk)->icsk_ack.pending |= ICSK_ACK_PUSHED; + tcp_cleanup_rbuf(sk, 1); + if (!(val & 1)) + inet_csk_enter_pingpong_mode(sk); + } +} + +void tcp_sock_set_quickack(struct sock *sk, int val) +{ + lock_sock(sk); + __tcp_sock_set_quickack(sk, val); + release_sock(sk); +} +EXPORT_SYMBOL(tcp_sock_set_quickack); + /* * Socket option code for TCP. 
*/ @@ -3096,19 +3121,7 @@ static int do_tcp_setsockopt(struct sock *sk, int level, break; case TCP_QUICKACK: - if (!val) { - inet_csk_enter_pingpong_mode(sk); - } else { - inet_csk_exit_pingpong_mode(sk); - if ((1 << sk->sk_state) & - (TCPF_ESTABLISHED | TCPF_CLOSE_WAIT) && - inet_csk_ack_scheduled(sk)) { - icsk->icsk_ack.pending |= ICSK_ACK_PUSHED; - tcp_cleanup_rbuf(sk, 1); - if (!(val & 1)) - inet_csk_enter_pingpong_mode(sk); - } - } + __tcp_sock_set_quickack(sk, val); break; #ifdef CONFIG_TCP_MD5SIG -- cgit v1.2.3 From 557eadfcc5ee8f8fa98a795e05ed21db58a65db5 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 28 May 2020 07:12:21 +0200 Subject: tcp: add tcp_sock_set_syncnt Add a helper to directly set the TCP_SYNCNT sockopt from kernel space without going through a fake uaccess. Signed-off-by: Christoph Hellwig Acked-by: Sagi Grimberg Signed-off-by: David S. Miller --- drivers/nvme/host/tcp.c | 9 +-------- include/linux/tcp.h | 1 + net/ipv4/tcp.c | 12 ++++++++++++ 3 files changed, 14 insertions(+), 8 deletions(-) (limited to 'include/linux') diff --git a/drivers/nvme/host/tcp.c b/drivers/nvme/host/tcp.c index 4e4a750ecdb9..2872584f52f6 100644 --- a/drivers/nvme/host/tcp.c +++ b/drivers/nvme/host/tcp.c @@ -1336,14 +1336,7 @@ static int nvme_tcp_alloc_queue(struct nvme_ctrl *nctrl, } /* Single syn retry */ - opt = 1; - ret = kernel_setsockopt(queue->sock, IPPROTO_TCP, TCP_SYNCNT, - (char *)&opt, sizeof(opt)); - if (ret) { - dev_err(nctrl->device, - "failed to set TCP_SYNCNT sock opt %d\n", ret); - goto err_sock; - } + tcp_sock_set_syncnt(queue->sock->sk, 1); /* Set TCP no delay */ tcp_sock_set_nodelay(queue->sock->sk); diff --git a/include/linux/tcp.h b/include/linux/tcp.h index 2eaf8320b9db..6aa4ae5ebf3d 100644 --- a/include/linux/tcp.h +++ b/include/linux/tcp.h @@ -500,5 +500,6 @@ int tcp_skb_shift(struct sk_buff *to, struct sk_buff *from, int pcount, void tcp_sock_set_cork(struct sock *sk, bool on); void tcp_sock_set_nodelay(struct sock *sk); void tcp_sock_set_quickack(struct sock *sk, int val); +int tcp_sock_set_syncnt(struct sock *sk, int val); #endif /* _LINUX_TCP_H */ diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index 27b5e7a4e2ef..d2c67ae1da07 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -2881,6 +2881,18 @@ void tcp_sock_set_quickack(struct sock *sk, int val) } EXPORT_SYMBOL(tcp_sock_set_quickack); +int tcp_sock_set_syncnt(struct sock *sk, int val) +{ + if (val < 1 || val > MAX_TCP_SYNCNT) + return -EINVAL; + + lock_sock(sk); + inet_csk(sk)->icsk_syn_retries = val; + release_sock(sk); + return 0; +} +EXPORT_SYMBOL(tcp_sock_set_syncnt); + /* * Socket option code for TCP. */ -- cgit v1.2.3 From c488aeadcbd002a992593e6090d54e8ac27c4310 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 28 May 2020 07:12:22 +0200 Subject: tcp: add tcp_sock_set_user_timeout Add a helper to directly set the TCP_USER_TIMEOUT sockopt from kernel space without going through a fake uaccess. Signed-off-by: Christoph Hellwig Signed-off-by: David S. 
Miller --- fs/ocfs2/cluster/tcp.c | 22 ++-------------------- include/linux/tcp.h | 1 + net/ipv4/tcp.c | 8 ++++++++ net/sunrpc/xprtsock.c | 3 +-- 4 files changed, 12 insertions(+), 22 deletions(-) (limited to 'include/linux') diff --git a/fs/ocfs2/cluster/tcp.c b/fs/ocfs2/cluster/tcp.c index 4c70fe9d19ab..79a231719460 100644 --- a/fs/ocfs2/cluster/tcp.c +++ b/fs/ocfs2/cluster/tcp.c @@ -1441,14 +1441,6 @@ static void o2net_rx_until_empty(struct work_struct *work) sc_put(sc); } -static int o2net_set_usertimeout(struct socket *sock) -{ - int user_timeout = O2NET_TCP_USER_TIMEOUT; - - return kernel_setsockopt(sock, SOL_TCP, TCP_USER_TIMEOUT, - (void *)&user_timeout, sizeof(user_timeout)); -} - static void o2net_initialize_handshake(void) { o2net_hand->o2hb_heartbeat_timeout_ms = cpu_to_be32( @@ -1629,12 +1621,7 @@ static void o2net_start_connect(struct work_struct *work) } tcp_sock_set_nodelay(sc->sc_sock->sk); - - ret = o2net_set_usertimeout(sock); - if (ret) { - mlog(ML_ERROR, "set TCP_USER_TIMEOUT failed with %d\n", ret); - goto out; - } + tcp_sock_set_user_timeout(sock->sk, O2NET_TCP_USER_TIMEOUT); o2net_register_callbacks(sc->sc_sock->sk, sc); @@ -1821,12 +1808,7 @@ static int o2net_accept_one(struct socket *sock, int *more) new_sock->sk->sk_allocation = GFP_ATOMIC; tcp_sock_set_nodelay(new_sock->sk); - - ret = o2net_set_usertimeout(new_sock); - if (ret) { - mlog(ML_ERROR, "set TCP_USER_TIMEOUT failed with %d\n", ret); - goto out; - } + tcp_sock_set_user_timeout(new_sock->sk, O2NET_TCP_USER_TIMEOUT); ret = new_sock->ops->getname(new_sock, (struct sockaddr *) &sin, 1); if (ret < 0) diff --git a/include/linux/tcp.h b/include/linux/tcp.h index 6aa4ae5ebf3d..de682143efe4 100644 --- a/include/linux/tcp.h +++ b/include/linux/tcp.h @@ -501,5 +501,6 @@ void tcp_sock_set_cork(struct sock *sk, bool on); void tcp_sock_set_nodelay(struct sock *sk); void tcp_sock_set_quickack(struct sock *sk, int val); int tcp_sock_set_syncnt(struct sock *sk, int val); +void tcp_sock_set_user_timeout(struct sock *sk, u32 val); #endif /* _LINUX_TCP_H */ diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index d2c67ae1da07..0004bd9ae7b0 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -2893,6 +2893,14 @@ int tcp_sock_set_syncnt(struct sock *sk, int val) } EXPORT_SYMBOL(tcp_sock_set_syncnt); +void tcp_sock_set_user_timeout(struct sock *sk, u32 val) +{ + lock_sock(sk); + inet_csk(sk)->icsk_user_timeout = val; + release_sock(sk); +} +EXPORT_SYMBOL(tcp_sock_set_user_timeout); + /* * Socket option code for TCP. */ diff --git a/net/sunrpc/xprtsock.c b/net/sunrpc/xprtsock.c index 399848c2bcb2..231fd6162f68 100644 --- a/net/sunrpc/xprtsock.c +++ b/net/sunrpc/xprtsock.c @@ -2115,8 +2115,7 @@ static void xs_tcp_set_socket_timeouts(struct rpc_xprt *xprt, (char *)&keepcnt, sizeof(keepcnt)); /* TCP user timeout (see RFC5482) */ - kernel_setsockopt(sock, SOL_TCP, TCP_USER_TIMEOUT, - (char *)&timeo, sizeof(timeo)); + tcp_sock_set_user_timeout(sock->sk, timeo); } static void xs_tcp_set_connect_timeout(struct rpc_xprt *xprt, -- cgit v1.2.3 From 71c48eb81c9ecb6fed49dc33e7c9b621fdcb7bf8 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 28 May 2020 07:12:23 +0200 Subject: tcp: add tcp_sock_set_keepidle Add a helper to directly set the TCP_KEEP_IDLE sockopt from kernel space without going through a fake uaccess. Signed-off-by: Christoph Hellwig Signed-off-by: David S. 
Miller --- include/linux/tcp.h | 1 + net/ipv4/tcp.c | 49 ++++++++++++++++++++++++++++++++++--------------- net/rds/tcp_listen.c | 5 +---- net/sunrpc/xprtsock.c | 3 +-- 4 files changed, 37 insertions(+), 21 deletions(-) (limited to 'include/linux') diff --git a/include/linux/tcp.h b/include/linux/tcp.h index de682143efe4..5724dd84a85e 100644 --- a/include/linux/tcp.h +++ b/include/linux/tcp.h @@ -498,6 +498,7 @@ int tcp_skb_shift(struct sk_buff *to, struct sk_buff *from, int pcount, int shiftlen); void tcp_sock_set_cork(struct sock *sk, bool on); +int tcp_sock_set_keepidle(struct sock *sk, int val); void tcp_sock_set_nodelay(struct sock *sk); void tcp_sock_set_quickack(struct sock *sk, int val); int tcp_sock_set_syncnt(struct sock *sk, int val); diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index 0004bd9ae7b0..bdf0ff933351 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -2901,6 +2901,39 @@ void tcp_sock_set_user_timeout(struct sock *sk, u32 val) } EXPORT_SYMBOL(tcp_sock_set_user_timeout); +static int __tcp_sock_set_keepidle(struct sock *sk, int val) +{ + struct tcp_sock *tp = tcp_sk(sk); + + if (val < 1 || val > MAX_TCP_KEEPIDLE) + return -EINVAL; + + tp->keepalive_time = val * HZ; + if (sock_flag(sk, SOCK_KEEPOPEN) && + !((1 << sk->sk_state) & (TCPF_CLOSE | TCPF_LISTEN))) { + u32 elapsed = keepalive_time_elapsed(tp); + + if (tp->keepalive_time > elapsed) + elapsed = tp->keepalive_time - elapsed; + else + elapsed = 0; + inet_csk_reset_keepalive_timer(sk, elapsed); + } + + return 0; +} + +int tcp_sock_set_keepidle(struct sock *sk, int val) +{ + int err; + + lock_sock(sk); + err = __tcp_sock_set_keepidle(sk, val); + release_sock(sk); + return err; +} +EXPORT_SYMBOL(tcp_sock_set_keepidle); + /* * Socket option code for TCP. */ @@ -3070,21 +3103,7 @@ static int do_tcp_setsockopt(struct sock *sk, int level, break; case TCP_KEEPIDLE: - if (val < 1 || val > MAX_TCP_KEEPIDLE) - err = -EINVAL; - else { - tp->keepalive_time = val * HZ; - if (sock_flag(sk, SOCK_KEEPOPEN) && - !((1 << sk->sk_state) & - (TCPF_CLOSE | TCPF_LISTEN))) { - u32 elapsed = keepalive_time_elapsed(tp); - if (tp->keepalive_time > elapsed) - elapsed = tp->keepalive_time - elapsed; - else - elapsed = 0; - inet_csk_reset_keepalive_timer(sk, elapsed); - } - } + err = __tcp_sock_set_keepidle(sk, val); break; case TCP_KEEPINTVL: if (val < 1 || val > MAX_TCP_KEEPINTVL) diff --git a/net/rds/tcp_listen.c b/net/rds/tcp_listen.c index 6f90ea077adc..79f9adc00811 100644 --- a/net/rds/tcp_listen.c +++ b/net/rds/tcp_listen.c @@ -52,10 +52,7 @@ int rds_tcp_keepalive(struct socket *sock) if (ret < 0) goto bail; - ret = kernel_setsockopt(sock, IPPROTO_TCP, TCP_KEEPIDLE, - (char *)&keepidle, sizeof(keepidle)); - if (ret < 0) - goto bail; + tcp_sock_set_keepidle(sock->sk, keepidle); /* KEEPINTVL is the interval between successive probes. We follow * the model in xs_tcp_finish_connecting() and re-use keepidle. 
diff --git a/net/sunrpc/xprtsock.c b/net/sunrpc/xprtsock.c index 231fd6162f68..473290f7c5c0 100644 --- a/net/sunrpc/xprtsock.c +++ b/net/sunrpc/xprtsock.c @@ -2107,8 +2107,7 @@ static void xs_tcp_set_socket_timeouts(struct rpc_xprt *xprt, /* TCP Keepalive options */ sock_set_keepalive(sock->sk); - kernel_setsockopt(sock, SOL_TCP, TCP_KEEPIDLE, - (char *)&keepidle, sizeof(keepidle)); + tcp_sock_set_keepidle(sock->sk, keepidle); kernel_setsockopt(sock, SOL_TCP, TCP_KEEPINTVL, (char *)&keepidle, sizeof(keepidle)); kernel_setsockopt(sock, SOL_TCP, TCP_KEEPCNT, -- cgit v1.2.3 From d41ecaac903c9f4658a71d4e7a708673cfb5abba Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 28 May 2020 07:12:24 +0200 Subject: tcp: add tcp_sock_set_keepintvl Add a helper to directly set the TCP_KEEPINTVL sockopt from kernel space without going through a fake uaccess. Signed-off-by: Christoph Hellwig Signed-off-by: David S. Miller --- include/linux/tcp.h | 1 + net/ipv4/tcp.c | 12 ++++++++++++ net/rds/tcp_listen.c | 4 +--- net/sunrpc/xprtsock.c | 3 +-- 4 files changed, 15 insertions(+), 5 deletions(-) (limited to 'include/linux') diff --git a/include/linux/tcp.h b/include/linux/tcp.h index 5724dd84a85e..1f9bada00faa 100644 --- a/include/linux/tcp.h +++ b/include/linux/tcp.h @@ -499,6 +499,7 @@ int tcp_skb_shift(struct sk_buff *to, struct sk_buff *from, int pcount, void tcp_sock_set_cork(struct sock *sk, bool on); int tcp_sock_set_keepidle(struct sock *sk, int val); +int tcp_sock_set_keepintvl(struct sock *sk, int val); void tcp_sock_set_nodelay(struct sock *sk); void tcp_sock_set_quickack(struct sock *sk, int val); int tcp_sock_set_syncnt(struct sock *sk, int val); diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index bdf0ff933351..7eb083e09786 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -2934,6 +2934,18 @@ int tcp_sock_set_keepidle(struct sock *sk, int val) } EXPORT_SYMBOL(tcp_sock_set_keepidle); +int tcp_sock_set_keepintvl(struct sock *sk, int val) +{ + if (val < 1 || val > MAX_TCP_KEEPINTVL) + return -EINVAL; + + lock_sock(sk); + tcp_sk(sk)->keepalive_intvl = val * HZ; + release_sock(sk); + return 0; +} +EXPORT_SYMBOL(tcp_sock_set_keepintvl); + /* * Socket option code for TCP. */ diff --git a/net/rds/tcp_listen.c b/net/rds/tcp_listen.c index 79f9adc00811..9ad555c48d15 100644 --- a/net/rds/tcp_listen.c +++ b/net/rds/tcp_listen.c @@ -53,12 +53,10 @@ int rds_tcp_keepalive(struct socket *sock) goto bail; tcp_sock_set_keepidle(sock->sk, keepidle); - /* KEEPINTVL is the interval between successive probes. We follow * the model in xs_tcp_finish_connecting() and re-use keepidle. 
*/ - ret = kernel_setsockopt(sock, IPPROTO_TCP, TCP_KEEPINTVL, - (char *)&keepidle, sizeof(keepidle)); + tcp_sock_set_keepintvl(sock->sk, keepidle); bail: return ret; } diff --git a/net/sunrpc/xprtsock.c b/net/sunrpc/xprtsock.c index 473290f7c5c0..5ca64e12af0c 100644 --- a/net/sunrpc/xprtsock.c +++ b/net/sunrpc/xprtsock.c @@ -2108,8 +2108,7 @@ static void xs_tcp_set_socket_timeouts(struct rpc_xprt *xprt, /* TCP Keepalive options */ sock_set_keepalive(sock->sk); tcp_sock_set_keepidle(sock->sk, keepidle); - kernel_setsockopt(sock, SOL_TCP, TCP_KEEPINTVL, - (char *)&keepidle, sizeof(keepidle)); + tcp_sock_set_keepintvl(sock->sk, keepidle); kernel_setsockopt(sock, SOL_TCP, TCP_KEEPCNT, (char *)&keepcnt, sizeof(keepcnt)); -- cgit v1.2.3 From 480aeb9639d6a077c611b303a22f9b1e5937d081 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 28 May 2020 07:12:25 +0200 Subject: tcp: add tcp_sock_set_keepcnt Add a helper to directly set the TCP_KEEPCNT sockopt from kernel space without going through a fake uaccess. Signed-off-by: Christoph Hellwig Signed-off-by: David S. Miller --- include/linux/tcp.h | 1 + net/ipv4/tcp.c | 12 ++++++++++++ net/rds/tcp.h | 2 +- net/rds/tcp_listen.c | 17 +++-------------- net/sunrpc/xprtsock.c | 3 +-- 5 files changed, 18 insertions(+), 17 deletions(-) (limited to 'include/linux') diff --git a/include/linux/tcp.h b/include/linux/tcp.h index 1f9bada00faa..9aac824c523c 100644 --- a/include/linux/tcp.h +++ b/include/linux/tcp.h @@ -498,6 +498,7 @@ int tcp_skb_shift(struct sk_buff *to, struct sk_buff *from, int pcount, int shiftlen); void tcp_sock_set_cork(struct sock *sk, bool on); +int tcp_sock_set_keepcnt(struct sock *sk, int val); int tcp_sock_set_keepidle(struct sock *sk, int val); int tcp_sock_set_keepintvl(struct sock *sk, int val); void tcp_sock_set_nodelay(struct sock *sk); diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index 7eb083e09786..15d47d5e7951 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -2946,6 +2946,18 @@ int tcp_sock_set_keepintvl(struct sock *sk, int val) } EXPORT_SYMBOL(tcp_sock_set_keepintvl); +int tcp_sock_set_keepcnt(struct sock *sk, int val) +{ + if (val < 1 || val > MAX_TCP_KEEPCNT) + return -EINVAL; + + lock_sock(sk); + tcp_sk(sk)->keepalive_probes = val; + release_sock(sk); + return 0; +} +EXPORT_SYMBOL(tcp_sock_set_keepcnt); + /* * Socket option code for TCP. 
*/ diff --git a/net/rds/tcp.h b/net/rds/tcp.h index f6d75d8cb167..bad9cf49d565 100644 --- a/net/rds/tcp.h +++ b/net/rds/tcp.h @@ -70,7 +70,7 @@ struct socket *rds_tcp_listen_init(struct net *net, bool isv6); void rds_tcp_listen_stop(struct socket *sock, struct work_struct *acceptor); void rds_tcp_listen_data_ready(struct sock *sk); int rds_tcp_accept_one(struct socket *sock); -int rds_tcp_keepalive(struct socket *sock); +void rds_tcp_keepalive(struct socket *sock); void *rds_tcp_listen_sock_def_readable(struct net *net); /* tcp_recv.c */ diff --git a/net/rds/tcp_listen.c b/net/rds/tcp_listen.c index 9ad555c48d15..101cf14215a0 100644 --- a/net/rds/tcp_listen.c +++ b/net/rds/tcp_listen.c @@ -38,27 +38,19 @@ #include "rds.h" #include "tcp.h" -int rds_tcp_keepalive(struct socket *sock) +void rds_tcp_keepalive(struct socket *sock) { /* values below based on xs_udp_default_timeout */ int keepidle = 5; /* send a probe 'keepidle' secs after last data */ int keepcnt = 5; /* number of unack'ed probes before declaring dead */ - int ret = 0; sock_set_keepalive(sock->sk); - - ret = kernel_setsockopt(sock, IPPROTO_TCP, TCP_KEEPCNT, - (char *)&keepcnt, sizeof(keepcnt)); - if (ret < 0) - goto bail; - + tcp_sock_set_keepcnt(sock->sk, keepcnt); tcp_sock_set_keepidle(sock->sk, keepidle); /* KEEPINTVL is the interval between successive probes. We follow * the model in xs_tcp_finish_connecting() and re-use keepidle. */ tcp_sock_set_keepintvl(sock->sk, keepidle); -bail: - return ret; } /* rds_tcp_accept_one_path(): if accepting on cp_index > 0, make sure the @@ -140,10 +132,7 @@ int rds_tcp_accept_one(struct socket *sock) new_sock->ops = sock->ops; __module_get(new_sock->ops->owner); - ret = rds_tcp_keepalive(new_sock); - if (ret < 0) - goto out; - + rds_tcp_keepalive(new_sock); rds_tcp_tune(new_sock); inet = inet_sk(new_sock->sk); diff --git a/net/sunrpc/xprtsock.c b/net/sunrpc/xprtsock.c index 5ca64e12af0c..0d3ec055bc12 100644 --- a/net/sunrpc/xprtsock.c +++ b/net/sunrpc/xprtsock.c @@ -2109,8 +2109,7 @@ static void xs_tcp_set_socket_timeouts(struct rpc_xprt *xprt, sock_set_keepalive(sock->sk); tcp_sock_set_keepidle(sock->sk, keepidle); tcp_sock_set_keepintvl(sock->sk, keepidle); - kernel_setsockopt(sock, SOL_TCP, TCP_KEEPCNT, - (char *)&keepcnt, sizeof(keepcnt)); + tcp_sock_set_keepcnt(sock->sk, keepcnt); /* TCP user timeout (see RFC5482) */ tcp_sock_set_user_timeout(sock->sk, timeo); -- cgit v1.2.3 From a8173820f441ab3e2a45c4bb66b70da9a57a349e Mon Sep 17 00:00:00 2001 From: Maulik Shah Date: Sat, 23 May 2020 22:41:10 +0530 Subject: gpio: gpiolib: Allow GPIO IRQs to lazy disable With commit 461c1a7d4733 ("gpiolib: override irq_enable/disable"), gpiolib overrides the irqchip's irq_enable and irq_disable callbacks. If an irq_disable callback is implemented, genirq takes the unlazy path to disable the IRQ. An underlying irqchip may deliberately leave irq_disable unimplemented so that the IRQ is lazily disabled when client drivers invoke disable_irq(); by unconditionally overriding irq_disable, gpiolib ends up always unlazily disabling the IRQ. Allow gpiolib to lazily disable IRQs by overriding the irq_disable callback only if the irqchip implements irq_disable; where it does not, irq_mask is overridden instead. Similarly, override the irq_enable callback only if the irqchip implements irq_enable, otherwise override irq_unmask. 
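To make the mask/unmask case concrete, here is a minimal irqchip sketch, for illustration only: the foo_* names (including the foo_write_mask_bit() hardware accessor and the struct foo_gpio private type) are invented, while irq_data_get_irq_chip_data() and irqd_to_hwirq() are existing kernel APIs. A chip shaped like this keeps genirq's lazy-disable behaviour after this patch, because gpiolib now wraps its irq_mask/irq_unmask rather than forcing an irq_disable:

	/* Hypothetical driver code, not part of the patch. */
	static void foo_gpio_irq_mask(struct irq_data *d)
	{
		struct foo_gpio *priv = irq_data_get_irq_chip_data(d);

		/* mask the line in hardware */
		foo_write_mask_bit(priv, irqd_to_hwirq(d), true);
	}

	static void foo_gpio_irq_unmask(struct irq_data *d)
	{
		struct foo_gpio *priv = irq_data_get_irq_chip_data(d);

		foo_write_mask_bit(priv, irqd_to_hwirq(d), false);
	}

	static struct irq_chip foo_gpio_irq_chip = {
		.name       = "foo-gpio",
		.irq_mask   = foo_gpio_irq_mask,
		.irq_unmask = foo_gpio_irq_unmask,
		/*
		 * No .irq_enable/.irq_disable: gpiolib now overrides
		 * .irq_mask/.irq_unmask instead, so disable_irq() stays
		 * lazy and the line is only masked in hardware if a
		 * stray interrupt actually fires.
		 */
	};
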
Fixes: 461c1a7d4733 ("gpiolib: override irq_enable/disable") Signed-off-by: Maulik Shah Tested-by: Hans Verkuil Link: https://lore.kernel.org/r/1590253873-11556-2-git-send-email-mkshah@codeaurora.org Signed-off-by: Linus Walleij --- drivers/gpio/gpiolib.c | 55 +++++++++++++++++++++++++++++---------------- include/linux/gpio/driver.h | 13 +++++++++++ 2 files changed, 49 insertions(+), 19 deletions(-) (limited to 'include/linux') diff --git a/drivers/gpio/gpiolib.c b/drivers/gpio/gpiolib.c index 34259c1e78ca..167860684be6 100644 --- a/drivers/gpio/gpiolib.c +++ b/drivers/gpio/gpiolib.c @@ -2471,32 +2471,37 @@ static void gpiochip_irq_relres(struct irq_data *d) gpiochip_relres_irq(gc, d->hwirq); } +static void gpiochip_irq_mask(struct irq_data *d) +{ + struct gpio_chip *gc = irq_data_get_irq_chip_data(d); + + if (gc->irq.irq_mask) + gc->irq.irq_mask(d); + gpiochip_disable_irq(gc, d->hwirq); +} + +static void gpiochip_irq_unmask(struct irq_data *d) +{ + struct gpio_chip *gc = irq_data_get_irq_chip_data(d); + + gpiochip_enable_irq(gc, d->hwirq); + if (gc->irq.irq_unmask) + gc->irq.irq_unmask(d); +} + static void gpiochip_irq_enable(struct irq_data *d) { struct gpio_chip *gc = irq_data_get_irq_chip_data(d); gpiochip_enable_irq(gc, d->hwirq); - if (gc->irq.irq_enable) - gc->irq.irq_enable(d); - else - gc->irq.chip->irq_unmask(d); + gc->irq.irq_enable(d); } static void gpiochip_irq_disable(struct irq_data *d) { struct gpio_chip *gc = irq_data_get_irq_chip_data(d); - /* - * Since we override .irq_disable() we need to mimic the - * behaviour of __irq_disable() in irq/chip.c. - * First call .irq_disable() if it exists, else mimic the - * behaviour of mask_irq() which calls .irq_mask() if - * it exists. - */ - if (gc->irq.irq_disable) - gc->irq.irq_disable(d); - else if (gc->irq.chip->irq_mask) - gc->irq.chip->irq_mask(d); + gc->irq.irq_disable(d); gpiochip_disable_irq(gc, d->hwirq); } @@ -2521,10 +2526,22 @@ static void gpiochip_set_irq_hooks(struct gpio_chip *gc) "detected irqchip that is shared with multiple gpiochips: please fix the driver.\n"); return; } - gc->irq.irq_enable = irqchip->irq_enable; - gc->irq.irq_disable = irqchip->irq_disable; - irqchip->irq_enable = gpiochip_irq_enable; - irqchip->irq_disable = gpiochip_irq_disable; + + if (irqchip->irq_disable) { + gc->irq.irq_disable = irqchip->irq_disable; + irqchip->irq_disable = gpiochip_irq_disable; + } else { + gc->irq.irq_mask = irqchip->irq_mask; + irqchip->irq_mask = gpiochip_irq_mask; + } + + if (irqchip->irq_enable) { + gc->irq.irq_enable = irqchip->irq_enable; + irqchip->irq_enable = gpiochip_irq_enable; + } else { + gc->irq.irq_unmask = irqchip->irq_unmask; + irqchip->irq_unmask = gpiochip_irq_unmask; + } } /** diff --git a/include/linux/gpio/driver.h b/include/linux/gpio/driver.h index 8c41ae41b6bb..c8bcaf315d03 100644 --- a/include/linux/gpio/driver.h +++ b/include/linux/gpio/driver.h @@ -253,6 +253,19 @@ struct gpio_irq_chip { * Store old irq_chip irq_disable callback */ void (*irq_disable)(struct irq_data *data); + /** + * @irq_unmask: + * + * Store old irq_chip irq_unmask callback + */ + void (*irq_unmask)(struct irq_data *data); + + /** + * @irq_mask: + * + * Store old irq_chip irq_mask callback + */ + void (*irq_mask)(struct irq_data *data); }; /** -- cgit v1.2.3 From 95fc87b44104d9a524ff3e975bbfbd7c1f1a2dd5 Mon Sep 17 00:00:00 2001 From: Kirti Wankhede Date: Fri, 29 May 2020 02:00:54 +0530 Subject: vfio: Selective dirty page tracking if IOMMU backed device pins pages Added a check such that only singleton IOMMU groups can 
pin pages. From the point when the vendor driver pins any pages, consider the IOMMU group's dirty page scope to be limited to pinned pages. To avoid walking the list too often, add a flag, pinned_page_dirty_scope, to indicate whether the dirty page scope of all vfio_groups, for each vfio_domain in the domain_list, is limited to pinned pages. This flag is updated on the first pinned-pages request for that IOMMU group and on attaching/detaching a group. Signed-off-by: Kirti Wankhede Reviewed-by: Neo Jia Reviewed-by: Yan Zhao Signed-off-by: Alex Williamson --- drivers/vfio/vfio.c | 13 +++-- drivers/vfio/vfio_iommu_type1.c | 103 +++++++++++++++++++++++++++++++++++++--- include/linux/vfio.h | 4 +- 3 files changed, 109 insertions(+), 11 deletions(-) (limited to 'include/linux') diff --git a/drivers/vfio/vfio.c b/drivers/vfio/vfio.c index 765e0e5d83ed..580099afeaff 100644 --- a/drivers/vfio/vfio.c +++ b/drivers/vfio/vfio.c @@ -85,6 +85,7 @@ struct vfio_group { atomic_t opened; wait_queue_head_t container_q; bool noiommu; + unsigned int dev_counter; struct kvm *kvm; struct blocking_notifier_head notifier; }; @@ -555,6 +556,7 @@ struct vfio_device *vfio_group_create_device(struct vfio_group *group, mutex_lock(&group->device_lock); list_add(&device->group_next, &group->device_list); + group->dev_counter++; mutex_unlock(&group->device_lock); return device; @@ -567,6 +569,7 @@ static void vfio_device_release(struct kref *kref) struct vfio_group *group = device->group; list_del(&device->group_next); + group->dev_counter--; mutex_unlock(&group->device_lock); dev_set_drvdata(device->dev, NULL); @@ -1945,6 +1948,9 @@ int vfio_pin_pages(struct device *dev, unsigned long *user_pfn, int npage, if (!group) return -ENODEV; + if (group->dev_counter > 1) + return -EINVAL; + ret = vfio_group_add_container_user(group); if (ret) goto err_pin_pages; @@ -1952,7 +1958,8 @@ int vfio_pin_pages(struct device *dev, unsigned long *user_pfn, int npage, container = group->container; driver = container->iommu_driver; if (likely(driver && driver->ops->pin_pages)) - ret = driver->ops->pin_pages(container->iommu_data, user_pfn, + ret = driver->ops->pin_pages(container->iommu_data, + group->iommu_group, user_pfn, npage, prot, phys_pfn); else ret = -ENOTTY; @@ -2050,8 +2057,8 @@ int vfio_group_pin_pages(struct vfio_group *group, driver = container->iommu_driver; if (likely(driver && driver->ops->pin_pages)) ret = driver->ops->pin_pages(container->iommu_data, - user_iova_pfn, npage, - prot, phys_pfn); + group->iommu_group, user_iova_pfn, + npage, prot, phys_pfn); else ret = -ENOTTY; diff --git a/drivers/vfio/vfio_iommu_type1.c b/drivers/vfio/vfio_iommu_type1.c index f5c6ca46f165..a6d632d44c82 100644 --- a/drivers/vfio/vfio_iommu_type1.c +++ b/drivers/vfio/vfio_iommu_type1.c @@ -73,6 +73,7 @@ struct vfio_iommu { bool v2; bool nesting; bool dirty_page_tracking; + bool pinned_page_dirty_scope; }; struct vfio_domain { @@ -100,6 +101,7 @@ struct vfio_group { struct iommu_group *iommu_group; struct list_head next; bool mdev_group; /* An mdev group */ + bool pinned_page_dirty_scope; }; struct vfio_iova { @@ -143,6 +145,10 @@ struct vfio_regions { static int put_pfn(unsigned long pfn, int prot); +static struct vfio_group *vfio_iommu_find_iommu_group(struct vfio_iommu *iommu, + struct iommu_group *iommu_group); + +static void update_pinned_page_dirty_scope(struct vfio_iommu *iommu); /* * This code handles mapping and unmapping of user data buffers * into DMA'ble space using the IOMMU @@ -622,11 +628,13 @@ static int vfio_unpin_page_external(struct vfio_dma 
*dma, dma_addr_t iova, } static int vfio_iommu_type1_pin_pages(void *iommu_data, + struct iommu_group *iommu_group, unsigned long *user_pfn, int npage, int prot, unsigned long *phys_pfn) { struct vfio_iommu *iommu = iommu_data; + struct vfio_group *group; int i, j, ret; unsigned long remote_vaddr; struct vfio_dma *dma; @@ -699,8 +707,14 @@ static int vfio_iommu_type1_pin_pages(void *iommu_data, (iova - dma->iova) >> pgshift, 1); } } - ret = i; + + group = vfio_iommu_find_iommu_group(iommu, iommu_group); + if (!group->pinned_page_dirty_scope) { + group->pinned_page_dirty_scope = true; + update_pinned_page_dirty_scope(iommu); + } + goto pin_done; pin_unwind: @@ -960,8 +974,9 @@ static void vfio_update_pgsize_bitmap(struct vfio_iommu *iommu) } } -static int update_user_bitmap(u64 __user *bitmap, struct vfio_dma *dma, - dma_addr_t base_iova, size_t pgsize) +static int update_user_bitmap(u64 __user *bitmap, struct vfio_iommu *iommu, + struct vfio_dma *dma, dma_addr_t base_iova, + size_t pgsize) { unsigned long pgshift = __ffs(pgsize); unsigned long nbits = dma->size >> pgshift; @@ -970,8 +985,11 @@ static int update_user_bitmap(u64 __user *bitmap, struct vfio_dma *dma, unsigned long shift = bit_offset % BITS_PER_LONG; unsigned long leftover; - /* mark all pages dirty if all pages are pinned and mapped. */ - if (dma->iommu_mapped) + /* + * mark all pages dirty if any IOMMU capable device is not able + * to report dirty pages and all pages are pinned and mapped. + */ + if (!iommu->pinned_page_dirty_scope && dma->iommu_mapped) bitmap_set(dma->bitmap, 0, nbits); if (shift) { @@ -1024,7 +1042,7 @@ static int vfio_iova_dirty_bitmap(u64 __user *bitmap, struct vfio_iommu *iommu, if (dma->iova > iova + size - 1) break; - ret = update_user_bitmap(bitmap, dma, iova, pgsize); + ret = update_user_bitmap(bitmap, iommu, dma, iova, pgsize); if (ret) return ret; @@ -1169,7 +1187,7 @@ again: } if (unmap->flags & VFIO_DMA_UNMAP_FLAG_GET_DIRTY_BITMAP) { - ret = update_user_bitmap(bitmap->data, dma, + ret = update_user_bitmap(bitmap->data, iommu, dma, unmap->iova, pgsize); if (ret) break; @@ -1521,6 +1539,51 @@ static struct vfio_group *find_iommu_group(struct vfio_domain *domain, return NULL; } +static struct vfio_group *vfio_iommu_find_iommu_group(struct vfio_iommu *iommu, + struct iommu_group *iommu_group) +{ + struct vfio_domain *domain; + struct vfio_group *group = NULL; + + list_for_each_entry(domain, &iommu->domain_list, next) { + group = find_iommu_group(domain, iommu_group); + if (group) + return group; + } + + if (iommu->external_domain) + group = find_iommu_group(iommu->external_domain, iommu_group); + + return group; +} + +static void update_pinned_page_dirty_scope(struct vfio_iommu *iommu) +{ + struct vfio_domain *domain; + struct vfio_group *group; + + list_for_each_entry(domain, &iommu->domain_list, next) { + list_for_each_entry(group, &domain->group_list, next) { + if (!group->pinned_page_dirty_scope) { + iommu->pinned_page_dirty_scope = false; + return; + } + } + } + + if (iommu->external_domain) { + domain = iommu->external_domain; + list_for_each_entry(group, &domain->group_list, next) { + if (!group->pinned_page_dirty_scope) { + iommu->pinned_page_dirty_scope = false; + return; + } + } + } + + iommu->pinned_page_dirty_scope = true; +} + static bool vfio_iommu_has_sw_msi(struct list_head *group_resv_regions, phys_addr_t *base) { @@ -1928,6 +1991,16 @@ static int vfio_iommu_type1_attach_group(void *iommu_data, list_add(&group->next, &iommu->external_domain->group_list); + /* + * Non-iommu backed 
group cannot dirty memory directly, + * it can only use interfaces that provide dirty + * tracking. + * The iommu scope can only be promoted with the + * addition of a dirty tracking group. + */ + group->pinned_page_dirty_scope = true; + if (!iommu->pinned_page_dirty_scope) + update_pinned_page_dirty_scope(iommu); mutex_unlock(&iommu->lock); return 0; @@ -2051,6 +2124,13 @@ static int vfio_iommu_type1_attach_group(void *iommu_data, done: /* Delete the old one and insert new iova list */ vfio_iommu_iova_insert_copy(iommu, &iova_copy); + + /* + * An iommu backed group can dirty memory directly and therefore + * demotes the iommu scope until it declares itself dirty tracking + * capable via the page pinning interface. + */ + iommu->pinned_page_dirty_scope = false; mutex_unlock(&iommu->lock); vfio_iommu_resv_free(&group_resv_regions); @@ -2203,6 +2283,7 @@ static void vfio_iommu_type1_detach_group(void *iommu_data, struct vfio_iommu *iommu = iommu_data; struct vfio_domain *domain; struct vfio_group *group; + bool update_dirty_scope = false; LIST_HEAD(iova_copy); mutex_lock(&iommu->lock); @@ -2210,6 +2291,7 @@ static void vfio_iommu_type1_detach_group(void *iommu_data, if (iommu->external_domain) { group = find_iommu_group(iommu->external_domain, iommu_group); if (group) { + update_dirty_scope = !group->pinned_page_dirty_scope; list_del(&group->next); kfree(group); @@ -2239,6 +2321,7 @@ static void vfio_iommu_type1_detach_group(void *iommu_data, continue; vfio_iommu_detach_group(domain, group); + update_dirty_scope = !group->pinned_page_dirty_scope; list_del(&group->next); kfree(group); /* @@ -2270,6 +2353,12 @@ static void vfio_iommu_type1_detach_group(void *iommu_data, vfio_iommu_iova_free(&iova_copy); detach_group_done: + /* + * Removal of a group without dirty tracking may allow the iommu scope + * to be promoted. + */ + if (update_dirty_scope) + update_pinned_page_dirty_scope(iommu); mutex_unlock(&iommu->lock); } diff --git a/include/linux/vfio.h b/include/linux/vfio.h index 5d92ee15d098..38d3c6a8dc7e 100644 --- a/include/linux/vfio.h +++ b/include/linux/vfio.h @@ -76,7 +76,9 @@ struct vfio_iommu_driver_ops { struct iommu_group *group); void (*detach_group)(void *iommu_data, struct iommu_group *group); - int (*pin_pages)(void *iommu_data, unsigned long *user_pfn, + int (*pin_pages)(void *iommu_data, + struct iommu_group *group, + unsigned long *user_pfn, int npage, int prot, unsigned long *phys_pfn); int (*unpin_pages)(void *iommu_data, -- cgit v1.2.3 From 24c5efe41c29ee3e55bcf5a1c9f61ca8709622e8 Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Fri, 22 May 2020 12:01:33 +1000 Subject: sunrpc: clean up properly in gss_mech_unregister() gss_mech_register() calls svcauth_gss_register_pseudoflavor() for each flavour, but gss_mech_unregister() does not call auth_domain_put(). This is unbalanced and makes it impossible to reload the module. Change svcauth_gss_register_pseudoflavor() to return the registered auth_domain, and save it for later release. Cc: stable@vger.kernel.org (v2.6.12+) Link: https://bugzilla.kernel.org/show_bug.cgi?id=206651 Signed-off-by: NeilBrown Signed-off-by: J. 
Bruce Fields --- include/linux/sunrpc/gss_api.h | 1 + include/linux/sunrpc/svcauth_gss.h | 3 ++- net/sunrpc/auth_gss/gss_mech_switch.c | 12 +++++++++--- net/sunrpc/auth_gss/svcauth_gss.c | 12 ++++++------ 4 files changed, 18 insertions(+), 10 deletions(-) (limited to 'include/linux') diff --git a/include/linux/sunrpc/gss_api.h b/include/linux/sunrpc/gss_api.h index bc07e51f20d1..bf4ac8a0268c 100644 --- a/include/linux/sunrpc/gss_api.h +++ b/include/linux/sunrpc/gss_api.h @@ -84,6 +84,7 @@ struct pf_desc { u32 service; char *name; char *auth_domain_name; + struct auth_domain *domain; bool datatouch; }; diff --git a/include/linux/sunrpc/svcauth_gss.h b/include/linux/sunrpc/svcauth_gss.h index ca39a388dc22..f09c82b0a7ae 100644 --- a/include/linux/sunrpc/svcauth_gss.h +++ b/include/linux/sunrpc/svcauth_gss.h @@ -20,7 +20,8 @@ int gss_svc_init(void); void gss_svc_shutdown(void); int gss_svc_init_net(struct net *net); void gss_svc_shutdown_net(struct net *net); -int svcauth_gss_register_pseudoflavor(u32 pseudoflavor, char * name); +struct auth_domain *svcauth_gss_register_pseudoflavor(u32 pseudoflavor, + char *name); u32 svcauth_gss_flavor(struct auth_domain *dom); #endif /* _LINUX_SUNRPC_SVCAUTH_GSS_H */ diff --git a/net/sunrpc/auth_gss/gss_mech_switch.c b/net/sunrpc/auth_gss/gss_mech_switch.c index 69316ab1b9fa..fae632da1058 100644 --- a/net/sunrpc/auth_gss/gss_mech_switch.c +++ b/net/sunrpc/auth_gss/gss_mech_switch.c @@ -37,6 +37,8 @@ gss_mech_free(struct gss_api_mech *gm) for (i = 0; i < gm->gm_pf_num; i++) { pf = &gm->gm_pfs[i]; + if (pf->domain) + auth_domain_put(pf->domain); kfree(pf->auth_domain_name); pf->auth_domain_name = NULL; } @@ -59,6 +61,7 @@ make_auth_domain_name(char *name) static int gss_mech_svc_setup(struct gss_api_mech *gm) { + struct auth_domain *dom; struct pf_desc *pf; int i, status; @@ -68,10 +71,13 @@ gss_mech_svc_setup(struct gss_api_mech *gm) status = -ENOMEM; if (pf->auth_domain_name == NULL) goto out; - status = svcauth_gss_register_pseudoflavor(pf->pseudoflavor, - pf->auth_domain_name); - if (status) + dom = svcauth_gss_register_pseudoflavor( + pf->pseudoflavor, pf->auth_domain_name); + if (IS_ERR(dom)) { + status = PTR_ERR(dom); goto out; + } + pf->domain = dom; } return 0; out: diff --git a/net/sunrpc/auth_gss/svcauth_gss.c b/net/sunrpc/auth_gss/svcauth_gss.c index 49bb346a6215..46027d0c903f 100644 --- a/net/sunrpc/auth_gss/svcauth_gss.c +++ b/net/sunrpc/auth_gss/svcauth_gss.c @@ -809,7 +809,7 @@ u32 svcauth_gss_flavor(struct auth_domain *dom) EXPORT_SYMBOL_GPL(svcauth_gss_flavor); -int +struct auth_domain * svcauth_gss_register_pseudoflavor(u32 pseudoflavor, char * name) { struct gss_domain *new; @@ -832,17 +832,17 @@ svcauth_gss_register_pseudoflavor(u32 pseudoflavor, char * name) name); stat = -EADDRINUSE; auth_domain_put(test); - kfree(new->h.name); - goto out_free_dom; + goto out_free_name; } - return 0; + return test; +out_free_name: + kfree(new->h.name); out_free_dom: kfree(new); out: - return stat; + return ERR_PTR(stat); } - EXPORT_SYMBOL_GPL(svcauth_gss_register_pseudoflavor); static inline int -- cgit v1.2.3 From 6d3f922c46f2e91f63c92f8dd28381f097082912 Mon Sep 17 00:00:00 2001 From: Georgi Djakov Date: Tue, 12 May 2020 15:53:21 +0300 Subject: opp: Add support for parsing interconnect bandwidth The OPP bindings now support bandwidth values, so add support to parse it from device tree and store it into the new dev_pm_opp_icc_bw struct, which is part of the dev_pm_opp. 
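As a rough usage sketch (a hypothetical consumer, not part of this patch): per the helper's code below, a driver may call dev_pm_opp_of_find_icc_paths() with a NULL opp_table purely to check that every interconnect path described by its device node can be acquired, which propagates -EPROBE_DEFER from the interconnect framework; in that NULL case the paths acquired for the check are released again before the helper returns.

	/* Hypothetical probe function, for illustration only. */
	static int foo_probe(struct platform_device *pdev)
	{
		int ret;

		ret = dev_pm_opp_of_find_icc_paths(&pdev->dev, NULL);
		if (ret)
			return ret;	/* typically -EPROBE_DEFER or -EINVAL */

		/*
		 * Rest of probe; the OPP core finds and stores the paths
		 * itself when the OPP table is allocated for this device.
		 */
		return 0;
	}
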
Signed-off-by: Georgi Djakov Reviewed-by: Matthias Kaehlcke [ Viresh: Create _read_bw() and use it, renamed _of_find_icc_paths() to dev_pm_opp_of_find_icc_paths(), exported it and made opp_table argument optional. Also drop the depends on from Kconfig. ] Signed-off-by: Viresh Kumar --- drivers/opp/core.c | 28 ++++++++++-- drivers/opp/of.c | 114 ++++++++++++++++++++++++++++++++++++++++++++++++- drivers/opp/opp.h | 7 +++ include/linux/pm_opp.h | 18 ++++++++ 4 files changed, 162 insertions(+), 5 deletions(-) (limited to 'include/linux') diff --git a/drivers/opp/core.c b/drivers/opp/core.c index ce7e4103ec09..d19cc7970643 100644 --- a/drivers/opp/core.c +++ b/drivers/opp/core.c @@ -999,6 +999,12 @@ static struct opp_table *_allocate_opp_table(struct device *dev, int index) ret); } + /* Find interconnect path(s) for the device */ + ret = dev_pm_opp_of_find_icc_paths(dev, opp_table); + if (ret) + dev_warn(dev, "%s: Error finding interconnect paths: %d\n", + __func__, ret); + BLOCKING_INIT_NOTIFIER_HEAD(&opp_table->head); INIT_LIST_HEAD(&opp_table->opp_list); kref_init(&opp_table->kref); @@ -1057,6 +1063,7 @@ static void _opp_table_kref_release(struct kref *kref) { struct opp_table *opp_table = container_of(kref, struct opp_table, kref); struct opp_device *opp_dev, *temp; + int i; _of_clear_opp_table(opp_table); @@ -1064,6 +1071,12 @@ static void _opp_table_kref_release(struct kref *kref) if (!IS_ERR(opp_table->clk)) clk_put(opp_table->clk); + if (opp_table->paths) { + for (i = 0; i < opp_table->path_count; i++) + icc_put(opp_table->paths[i]); + kfree(opp_table->paths); + } + WARN_ON(!list_empty(&opp_table->opp_list)); list_for_each_entry_safe(opp_dev, temp, &opp_table->dev_list, node) { @@ -1243,19 +1256,23 @@ EXPORT_SYMBOL_GPL(dev_pm_opp_remove_all_dynamic); struct dev_pm_opp *_opp_allocate(struct opp_table *table) { struct dev_pm_opp *opp; - int count, supply_size; + int supply_count, supply_size, icc_size; /* Allocate space for at least one supply */ - count = table->regulator_count > 0 ? table->regulator_count : 1; - supply_size = sizeof(*opp->supplies) * count; + supply_count = table->regulator_count > 0 ? table->regulator_count : 1; + supply_size = sizeof(*opp->supplies) * supply_count; + icc_size = sizeof(*opp->bandwidth) * table->path_count; /* allocate new OPP node and supplies structures */ - opp = kzalloc(sizeof(*opp) + supply_size, GFP_KERNEL); + opp = kzalloc(sizeof(*opp) + supply_size + icc_size, GFP_KERNEL); + if (!opp) return NULL; /* Put the supplies at the end of the OPP structure as an empty array */ opp->supplies = (struct dev_pm_opp_supply *)(opp + 1); + if (icc_size) + opp->bandwidth = (struct dev_pm_opp_icc_bw *)(opp->supplies + supply_count); INIT_LIST_HEAD(&opp->node); return opp; @@ -1290,6 +1307,9 @@ int _opp_compare_key(struct dev_pm_opp *opp1, struct dev_pm_opp *opp2) { if (opp1->rate != opp2->rate) return opp1->rate < opp2->rate ? -1 : 1; + if (opp1->bandwidth && opp2->bandwidth && + opp1->bandwidth[0].peak != opp2->bandwidth[0].peak) + return opp1->bandwidth[0].peak < opp2->bandwidth[0].peak ? -1 : 1; if (opp1->level != opp2->level) return opp1->level < opp2->level ? 
-1 : 1; return 0; diff --git a/drivers/opp/of.c b/drivers/opp/of.c index 303d2207e0ff..0c55862f5624 100644 --- a/drivers/opp/of.c +++ b/drivers/opp/of.c @@ -332,6 +332,62 @@ free_required_opps: return ret; } +int dev_pm_opp_of_find_icc_paths(struct device *dev, + struct opp_table *opp_table) +{ + struct device_node *np; + int ret = 0, i, count, num_paths; + struct icc_path **paths; + + np = of_node_get(dev->of_node); + if (!np) + return 0; + + count = of_count_phandle_with_args(np, "interconnects", + "#interconnect-cells"); + of_node_put(np); + if (count < 0) + return 0; + + /* two phandles when #interconnect-cells = <1> */ + if (count % 2) { + dev_err(dev, "%s: Invalid interconnects values\n", __func__); + return -EINVAL; + } + + num_paths = count / 2; + paths = kcalloc(num_paths, sizeof(*paths), GFP_KERNEL); + if (!paths) + return -ENOMEM; + + for (i = 0; i < num_paths; i++) { + paths[i] = of_icc_get_by_index(dev, i); + if (IS_ERR(paths[i])) { + ret = PTR_ERR(paths[i]); + if (ret != -EPROBE_DEFER) { + dev_err(dev, "%s: Unable to get path%d: %d\n", + __func__, i, ret); + } + goto err; + } + } + + if (opp_table) { + opp_table->paths = paths; + opp_table->path_count = num_paths; + return 0; + } + +err: + while (i--) + icc_put(paths[i]); + + kfree(paths); + + return ret; +} +EXPORT_SYMBOL_GPL(dev_pm_opp_of_find_icc_paths); + static bool _opp_is_supported(struct device *dev, struct opp_table *opp_table, struct device_node *np) { @@ -521,9 +577,45 @@ void dev_pm_opp_of_remove_table(struct device *dev) } EXPORT_SYMBOL_GPL(dev_pm_opp_of_remove_table); +static int _read_bw(struct dev_pm_opp *new_opp, struct device_node *np, + bool peak) +{ + const char *name = peak ? "opp-peak-kBps" : "opp-avg-kBps"; + struct property *prop; + int i, count, ret; + u32 *bw; + + prop = of_find_property(np, name, NULL); + if (!prop) + return -ENODEV; + + count = prop->length / sizeof(u32); + bw = kmalloc_array(count, sizeof(*bw), GFP_KERNEL); + if (!bw) + return -ENOMEM; + + ret = of_property_read_u32_array(np, name, bw, count); + if (ret) { + pr_err("%s: Error parsing %s: %d\n", __func__, name, ret); + goto out; + } + + for (i = 0; i < count; i++) { + if (peak) + new_opp->bandwidth[i].peak = kBps_to_icc(bw[i]); + else + new_opp->bandwidth[i].avg = kBps_to_icc(bw[i]); + } + +out: + kfree(bw); + return ret; +} + static int _read_opp_key(struct dev_pm_opp *new_opp, struct device_node *np, bool *rate_not_available) { + bool found = false; u64 rate; int ret; @@ -535,10 +627,30 @@ static int _read_opp_key(struct dev_pm_opp *new_opp, struct device_node *np, * bit guaranteed in clk API. 
*/ new_opp->rate = (unsigned long)rate; + found = true; } *rate_not_available = !!ret; - of_property_read_u32(np, "opp-level", &new_opp->level); + /* + * Bandwidth consists of peak and average (optional) values: + * opp-peak-kBps = ; + * opp-avg-kBps = ; + */ + ret = _read_bw(new_opp, np, true); + if (!ret) { + found = true; + ret = _read_bw(new_opp, np, false); + } + + /* The properties were found but we failed to parse them */ + if (ret && ret != -ENODEV) + return ret; + + if (!of_property_read_u32(np, "opp-level", &new_opp->level)) + found = true; + + if (found) + return 0; return ret; } diff --git a/drivers/opp/opp.h b/drivers/opp/opp.h index bcadb1e328a4..2b81ffef1ba4 100644 --- a/drivers/opp/opp.h +++ b/drivers/opp/opp.h @@ -12,6 +12,7 @@ #define __DRIVER_OPP_H__ #include +#include #include #include #include @@ -59,6 +60,7 @@ extern struct list_head opp_tables; * @rate: Frequency in hertz * @level: Performance level * @supplies: Power supplies voltage/current values + * @bandwidth: Interconnect bandwidth values * @clock_latency_ns: Latency (in nanoseconds) of switching to this OPP's * frequency from any other OPP's frequency. * @required_opps: List of OPPs that are required by this OPP. @@ -81,6 +83,7 @@ struct dev_pm_opp { unsigned int level; struct dev_pm_opp_supply *supplies; + struct dev_pm_opp_icc_bw *bandwidth; unsigned long clock_latency_ns; @@ -146,6 +149,8 @@ enum opp_table_access { * @regulator_count: Number of power supply regulators. Its value can be -1 * (uninitialized), 0 (no opp-microvolt property) or > 0 (has opp-microvolt * property). + * @paths: Interconnect path handles + * @path_count: Number of interconnect paths * @genpd_performance_state: Device's power domain support performance state. * @is_genpd: Marks if the OPP table belongs to a genpd. * @set_opp: Platform specific set_opp callback @@ -189,6 +194,8 @@ struct opp_table { struct clk *clk; struct regulator **regulators; int regulator_count; + struct icc_path **paths; + unsigned int path_count; bool genpd_performance_state; bool is_genpd; diff --git a/include/linux/pm_opp.h b/include/linux/pm_opp.h index 747861816f4f..d5c4a329321d 100644 --- a/include/linux/pm_opp.h +++ b/include/linux/pm_opp.h @@ -41,6 +41,18 @@ struct dev_pm_opp_supply { unsigned long u_amp; }; +/** + * struct dev_pm_opp_icc_bw - Interconnect bandwidth values + * @avg: Average bandwidth corresponding to this OPP (in icc units) + * @peak: Peak bandwidth corresponding to this OPP (in icc units) + * + * This structure stores the bandwidth values for a single interconnect path. 
+ */ +struct dev_pm_opp_icc_bw { + u32 avg; + u32 peak; +}; + /** * struct dev_pm_opp_info - OPP freq/voltage/current values * @rate: Target clk rate in hz @@ -360,6 +372,7 @@ int dev_pm_opp_of_get_sharing_cpus(struct device *cpu_dev, struct cpumask *cpuma struct device_node *dev_pm_opp_of_get_opp_desc_node(struct device *dev); struct device_node *dev_pm_opp_get_of_node(struct dev_pm_opp *opp); int of_get_required_opp_performance_state(struct device_node *np, int index); +int dev_pm_opp_of_find_icc_paths(struct device *dev, struct opp_table *opp_table); void dev_pm_opp_of_register_em(struct cpumask *cpus); #else static inline int dev_pm_opp_of_add_table(struct device *dev) @@ -408,6 +421,11 @@ static inline int of_get_required_opp_performance_state(struct device_node *np, { return -ENOTSUPP; } + +static inline int dev_pm_opp_of_find_icc_paths(struct device *dev, struct opp_table *opp_table) +{ + return -ENOTSUPP; +} #endif #endif /* __LINUX_OPP_H__ */ -- cgit v1.2.3 From 0430b1d5704b0f0f1d237236dde9c143f8669e49 Mon Sep 17 00:00:00 2001 From: Viresh Kumar Date: Mon, 18 May 2020 14:25:32 +0300 Subject: opp: Expose bandwidth information via debugfs Expose the bandwidth information as well via debugfs. Signed-off-by: Viresh Kumar Signed-off-by: Georgi Djakov --- drivers/interconnect/core.c | 18 ++++++++++++++++++ drivers/opp/debugfs.c | 42 ++++++++++++++++++++++++++++++++++++++++++ include/linux/interconnect.h | 6 ++++++ 3 files changed, 66 insertions(+) (limited to 'include/linux') diff --git a/drivers/interconnect/core.c b/drivers/interconnect/core.c index 2d2e49780511..a56349c14985 100644 --- a/drivers/interconnect/core.c +++ b/drivers/interconnect/core.c @@ -514,6 +514,24 @@ void icc_set_tag(struct icc_path *path, u32 tag) } EXPORT_SYMBOL_GPL(icc_set_tag); +/** + * icc_get_name() - Get name of the icc path + * @path: reference to the path returned by icc_get() + * + * This function is used by an interconnect consumer to get the name of the icc + * path. + * + * Returns a valid pointer on success, or NULL otherwise. 
+ */ +const char *icc_get_name(struct icc_path *path) +{ + if (!path) + return NULL; + + return path->name; +} +EXPORT_SYMBOL_GPL(icc_get_name); + /** * icc_set_bw() - set bandwidth constraints on an interconnect path * @path: reference to the path returned by icc_get() diff --git a/drivers/opp/debugfs.c b/drivers/opp/debugfs.c index 609665e339b6..596c185b5dda 100644 --- a/drivers/opp/debugfs.c +++ b/drivers/opp/debugfs.c @@ -32,6 +32,47 @@ void opp_debug_remove_one(struct dev_pm_opp *opp) debugfs_remove_recursive(opp->dentry); } +static ssize_t bw_name_read(struct file *fp, char __user *userbuf, + size_t count, loff_t *ppos) +{ + struct icc_path *path = fp->private_data; + char buf[64]; + int i; + + i = scnprintf(buf, sizeof(buf), "%.62s\n", icc_get_name(path)); + + return simple_read_from_buffer(userbuf, count, ppos, buf, i); +} + +static const struct file_operations bw_name_fops = { + .open = simple_open, + .read = bw_name_read, + .llseek = default_llseek, +}; + +static void opp_debug_create_bw(struct dev_pm_opp *opp, + struct opp_table *opp_table, + struct dentry *pdentry) +{ + struct dentry *d; + char name[11]; + int i; + + for (i = 0; i < opp_table->path_count; i++) { + snprintf(name, sizeof(name), "icc-path-%.1d", i); + + /* Create per-path directory */ + d = debugfs_create_dir(name, pdentry); + + debugfs_create_file("name", S_IRUGO, d, opp_table->paths[i], + &bw_name_fops); + debugfs_create_u32("peak_bw", S_IRUGO, d, + &opp->bandwidth[i].peak); + debugfs_create_u32("avg_bw", S_IRUGO, d, + &opp->bandwidth[i].avg); + } +} + static void opp_debug_create_supplies(struct dev_pm_opp *opp, struct opp_table *opp_table, struct dentry *pdentry) @@ -94,6 +135,7 @@ void opp_debug_create_one(struct dev_pm_opp *opp, struct opp_table *opp_table) &opp->clock_latency_ns); opp_debug_create_supplies(opp, opp_table, d); + opp_debug_create_bw(opp, opp_table, d); opp->dentry = d; } diff --git a/include/linux/interconnect.h b/include/linux/interconnect.h index 34e97231a6ab..1ad09efd296e 100644 --- a/include/linux/interconnect.h +++ b/include/linux/interconnect.h @@ -32,6 +32,7 @@ struct icc_path *of_icc_get_by_index(struct device *dev, int idx); void icc_put(struct icc_path *path); int icc_set_bw(struct icc_path *path, u32 avg_bw, u32 peak_bw); void icc_set_tag(struct icc_path *path, u32 tag); +const char *icc_get_name(struct icc_path *path); #else @@ -65,6 +66,11 @@ static inline void icc_set_tag(struct icc_path *path, u32 tag) { } +static inline const char *icc_get_name(struct icc_path *path) +{ + return NULL; +} + #endif /* CONFIG_INTERCONNECT */ #endif /* __LINUX_INTERCONNECT_H */ -- cgit v1.2.3 From 2849beec3343343b293f33267affbdd7c3f42814 Mon Sep 17 00:00:00 2001 From: Pali Rohár Date: Fri, 22 May 2020 16:44:03 +0200 Subject: mmc: sdio: Change macro names for Marvell 8688 modules MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add underscore as separator in Marvell 8688 macro names for better readability and consistency. 
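For illustration (not part of the patch), a minimal match table written against the renamed constants; the underscore keeps the chip number and the function name visually distinct. The foo_* names are invented:

	static const struct sdio_device_id foo_sdio_ids[] = {
		/* 8688 chip, WLAN function (0x9104) */
		{ SDIO_DEVICE(SDIO_VENDOR_ID_MARVELL, SDIO_DEVICE_ID_MARVELL_8688_WLAN) },
		{ /* end: all zeroes */ },
	};
	MODULE_DEVICE_TABLE(sdio, foo_sdio_ids);
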
Signed-off-by: Pali Rohár Link: https://lore.kernel.org/r/20200522144412.19712-3-pali@kernel.org Signed-off-by: Ulf Hansson Acked-by: Ganapathi Bhat --- drivers/net/wireless/marvell/libertas/if_sdio.c | 2 +- include/linux/mmc/sdio_ids.h | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/drivers/net/wireless/marvell/libertas/if_sdio.c b/drivers/net/wireless/marvell/libertas/if_sdio.c index acf61b93b782..44fbd0acb87a 100644 --- a/drivers/net/wireless/marvell/libertas/if_sdio.c +++ b/drivers/net/wireless/marvell/libertas/if_sdio.c @@ -65,7 +65,7 @@ static const struct sdio_device_id if_sdio_ids[] = { { SDIO_DEVICE(SDIO_VENDOR_ID_MARVELL, SDIO_DEVICE_ID_MARVELL_LIBERTAS) }, { SDIO_DEVICE(SDIO_VENDOR_ID_MARVELL, - SDIO_DEVICE_ID_MARVELL_8688WLAN) }, + SDIO_DEVICE_ID_MARVELL_8688_WLAN) }, { /* end: all zeroes */ }, }; diff --git a/include/linux/mmc/sdio_ids.h b/include/linux/mmc/sdio_ids.h index 96f43e0dc78f..7e992a0e6cc0 100644 --- a/include/linux/mmc/sdio_ids.h +++ b/include/linux/mmc/sdio_ids.h @@ -56,8 +56,8 @@ #define SDIO_VENDOR_ID_MARVELL 0x02df #define SDIO_DEVICE_ID_MARVELL_LIBERTAS 0x9103 -#define SDIO_DEVICE_ID_MARVELL_8688WLAN 0x9104 -#define SDIO_DEVICE_ID_MARVELL_8688BT 0x9105 +#define SDIO_DEVICE_ID_MARVELL_8688_WLAN 0x9104 +#define SDIO_DEVICE_ID_MARVELL_8688_BT 0x9105 #define SDIO_DEVICE_ID_MARVELL_8797_F0 0x9128 #define SDIO_DEVICE_ID_MARVELL_8887_F0 0x9134 -- cgit v1.2.3 From 7d14c687376e6bd33cf42028300838466b95e765 Mon Sep 17 00:00:00 2001 From: Pali Rohár Date: Fri, 22 May 2020 16:44:04 +0200 Subject: mmc: sdio: Move SDIO IDs from mwifiex driver to common include file MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add a _WLAN suffix to macro names for consistency with other Marvell macros. These IDs represent the WLAN function of combo BT/WLAN cards. Other functions of these cards have different IDs. 
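For background, a simplified sketch of the pattern mwifiex uses below, with invented foo_* names (foo_setup() and struct foo_sdio_device are hypothetical): each match-table entry's .driver_data carries a per-chip descriptor, which the probe callback recovers from the matched ID.

	/* Hypothetical driver code, for illustration only. */
	static int foo_sdio_probe(struct sdio_func *func,
				  const struct sdio_device_id *id)
	{
		/* per-chip descriptor stashed in the match table entry */
		const struct foo_sdio_device *data =
			(const struct foo_sdio_device *)id->driver_data;

		dev_info(&func->dev, "bound SDIO function 0x%04x:0x%04x\n",
			 func->vendor, func->device);
		return foo_setup(func, data);
	}
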
Signed-off-by: Pali Rohár Acked-by: Kalle Valo Link: https://lore.kernel.org/r/20200522144412.19712-4-pali@kernel.org Signed-off-by: Ulf Hansson Acked-by: Ganapathi Bhat --- drivers/net/wireless/marvell/mwifiex/sdio.c | 38 +++++++---------------------- include/linux/mmc/sdio_ids.h | 10 ++++++++ 2 files changed, 19 insertions(+), 29 deletions(-) (limited to 'include/linux') diff --git a/drivers/net/wireless/marvell/mwifiex/sdio.c b/drivers/net/wireless/marvell/mwifiex/sdio.c index 6a2dcb01caf4..a042965962a2 100644 --- a/drivers/net/wireless/marvell/mwifiex/sdio.c +++ b/drivers/net/wireless/marvell/mwifiex/sdio.c @@ -480,45 +480,25 @@ static void mwifiex_sdio_coredump(struct device *dev) schedule_work(&card->work); } -/* Device ID for SD8786 */ -#define SDIO_DEVICE_ID_MARVELL_8786 (0x9116) -/* Device ID for SD8787 */ -#define SDIO_DEVICE_ID_MARVELL_8787 (0x9119) -/* Device ID for SD8797 */ -#define SDIO_DEVICE_ID_MARVELL_8797 (0x9129) -/* Device ID for SD8897 */ -#define SDIO_DEVICE_ID_MARVELL_8897 (0x912d) -/* Device ID for SD8887 */ -#define SDIO_DEVICE_ID_MARVELL_8887 (0x9135) -/* Device ID for SD8801 */ -#define SDIO_DEVICE_ID_MARVELL_8801 (0x9139) -/* Device ID for SD8977 */ -#define SDIO_DEVICE_ID_MARVELL_8977 (0x9145) -/* Device ID for SD8987 */ -#define SDIO_DEVICE_ID_MARVELL_8987 (0x9149) -/* Device ID for SD8997 */ -#define SDIO_DEVICE_ID_MARVELL_8997 (0x9141) - - /* WLAN IDs */ static const struct sdio_device_id mwifiex_ids[] = { - {SDIO_DEVICE(SDIO_VENDOR_ID_MARVELL, SDIO_DEVICE_ID_MARVELL_8786), + {SDIO_DEVICE(SDIO_VENDOR_ID_MARVELL, SDIO_DEVICE_ID_MARVELL_8786_WLAN), .driver_data = (unsigned long) &mwifiex_sdio_sd8786}, - {SDIO_DEVICE(SDIO_VENDOR_ID_MARVELL, SDIO_DEVICE_ID_MARVELL_8787), + {SDIO_DEVICE(SDIO_VENDOR_ID_MARVELL, SDIO_DEVICE_ID_MARVELL_8787_WLAN), .driver_data = (unsigned long) &mwifiex_sdio_sd8787}, - {SDIO_DEVICE(SDIO_VENDOR_ID_MARVELL, SDIO_DEVICE_ID_MARVELL_8797), + {SDIO_DEVICE(SDIO_VENDOR_ID_MARVELL, SDIO_DEVICE_ID_MARVELL_8797_WLAN), .driver_data = (unsigned long) &mwifiex_sdio_sd8797}, - {SDIO_DEVICE(SDIO_VENDOR_ID_MARVELL, SDIO_DEVICE_ID_MARVELL_8897), + {SDIO_DEVICE(SDIO_VENDOR_ID_MARVELL, SDIO_DEVICE_ID_MARVELL_8897_WLAN), .driver_data = (unsigned long) &mwifiex_sdio_sd8897}, - {SDIO_DEVICE(SDIO_VENDOR_ID_MARVELL, SDIO_DEVICE_ID_MARVELL_8887), + {SDIO_DEVICE(SDIO_VENDOR_ID_MARVELL, SDIO_DEVICE_ID_MARVELL_8887_WLAN), .driver_data = (unsigned long)&mwifiex_sdio_sd8887}, - {SDIO_DEVICE(SDIO_VENDOR_ID_MARVELL, SDIO_DEVICE_ID_MARVELL_8801), + {SDIO_DEVICE(SDIO_VENDOR_ID_MARVELL, SDIO_DEVICE_ID_MARVELL_8801_WLAN), .driver_data = (unsigned long)&mwifiex_sdio_sd8801}, - {SDIO_DEVICE(SDIO_VENDOR_ID_MARVELL, SDIO_DEVICE_ID_MARVELL_8977), + {SDIO_DEVICE(SDIO_VENDOR_ID_MARVELL, SDIO_DEVICE_ID_MARVELL_8977_WLAN), .driver_data = (unsigned long)&mwifiex_sdio_sd8977}, - {SDIO_DEVICE(SDIO_VENDOR_ID_MARVELL, SDIO_DEVICE_ID_MARVELL_8987), + {SDIO_DEVICE(SDIO_VENDOR_ID_MARVELL, SDIO_DEVICE_ID_MARVELL_8987_WLAN), .driver_data = (unsigned long)&mwifiex_sdio_sd8987}, - {SDIO_DEVICE(SDIO_VENDOR_ID_MARVELL, SDIO_DEVICE_ID_MARVELL_8997), + {SDIO_DEVICE(SDIO_VENDOR_ID_MARVELL, SDIO_DEVICE_ID_MARVELL_8997_WLAN), .driver_data = (unsigned long)&mwifiex_sdio_sd8997}, {}, }; diff --git a/include/linux/mmc/sdio_ids.h b/include/linux/mmc/sdio_ids.h index 7e992a0e6cc0..90361ea7f5ed 100644 --- a/include/linux/mmc/sdio_ids.h +++ b/include/linux/mmc/sdio_ids.h @@ -58,8 +58,18 @@ #define SDIO_DEVICE_ID_MARVELL_LIBERTAS 0x9103 #define SDIO_DEVICE_ID_MARVELL_8688_WLAN 0x9104 #define 
SDIO_DEVICE_ID_MARVELL_8688_BT 0x9105 +#define SDIO_DEVICE_ID_MARVELL_8786_WLAN 0x9116 +#define SDIO_DEVICE_ID_MARVELL_8787_WLAN 0x9119 #define SDIO_DEVICE_ID_MARVELL_8797_F0 0x9128 +#define SDIO_DEVICE_ID_MARVELL_8797_WLAN 0x9129 +#define SDIO_DEVICE_ID_MARVELL_8897_WLAN 0x912d #define SDIO_DEVICE_ID_MARVELL_8887_F0 0x9134 +#define SDIO_DEVICE_ID_MARVELL_8887_WLAN 0x9135 +#define SDIO_DEVICE_ID_MARVELL_8801_WLAN 0x9139 +#define SDIO_DEVICE_ID_MARVELL_8997_F0 0x9140 +#define SDIO_DEVICE_ID_MARVELL_8997_WLAN 0x9141 +#define SDIO_DEVICE_ID_MARVELL_8977_WLAN 0x9145 +#define SDIO_DEVICE_ID_MARVELL_8987_WLAN 0x9149 #define SDIO_VENDOR_ID_MEDIATEK 0x037a -- cgit v1.2.3 From 649c7d76d87ca2285a82a65d86d99ddb60571e20 Mon Sep 17 00:00:00 2001 From: Pali Rohár Date: Fri, 22 May 2020 16:44:05 +0200 Subject: mmc: sdio: Move SDIO IDs from btmrvl driver to common include file MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Define appropriate macro names for consistency with other Marvell macros. Signed-off-by: Pali Rohár Link: https://lore.kernel.org/r/20200522144412.19712-5-pali@kernel.org Signed-off-by: Ulf Hansson Acked-by: Ganapathi Bhat --- drivers/bluetooth/btmrvl_sdio.c | 18 +++++++++--------- include/linux/mmc/sdio_ids.h | 8 ++++++++ 2 files changed, 17 insertions(+), 9 deletions(-) (limited to 'include/linux') diff --git a/drivers/bluetooth/btmrvl_sdio.c b/drivers/bluetooth/btmrvl_sdio.c index 0f3a020703ab..a296f8526433 100644 --- a/drivers/bluetooth/btmrvl_sdio.c +++ b/drivers/bluetooth/btmrvl_sdio.c @@ -355,31 +355,31 @@ static const struct btmrvl_sdio_device btmrvl_sdio_sd8997 = { static const struct sdio_device_id btmrvl_sdio_ids[] = { /* Marvell SD8688 Bluetooth device */ - { SDIO_DEVICE(SDIO_VENDOR_ID_MARVELL, 0x9105), + { SDIO_DEVICE(SDIO_VENDOR_ID_MARVELL, SDIO_DEVICE_ID_MARVELL_8688_BT), .driver_data = (unsigned long)&btmrvl_sdio_sd8688 }, /* Marvell SD8787 Bluetooth device */ - { SDIO_DEVICE(SDIO_VENDOR_ID_MARVELL, 0x911A), + { SDIO_DEVICE(SDIO_VENDOR_ID_MARVELL, SDIO_DEVICE_ID_MARVELL_8787_BT), .driver_data = (unsigned long)&btmrvl_sdio_sd8787 }, /* Marvell SD8787 Bluetooth AMP device */ - { SDIO_DEVICE(SDIO_VENDOR_ID_MARVELL, 0x911B), + { SDIO_DEVICE(SDIO_VENDOR_ID_MARVELL, SDIO_DEVICE_ID_MARVELL_8787_BT_AMP), .driver_data = (unsigned long)&btmrvl_sdio_sd8787 }, /* Marvell SD8797 Bluetooth device */ - { SDIO_DEVICE(SDIO_VENDOR_ID_MARVELL, 0x912A), + { SDIO_DEVICE(SDIO_VENDOR_ID_MARVELL, SDIO_DEVICE_ID_MARVELL_8797_BT), .driver_data = (unsigned long)&btmrvl_sdio_sd8797 }, /* Marvell SD8887 Bluetooth device */ - { SDIO_DEVICE(SDIO_VENDOR_ID_MARVELL, 0x9136), + { SDIO_DEVICE(SDIO_VENDOR_ID_MARVELL, SDIO_DEVICE_ID_MARVELL_8887_BT), .driver_data = (unsigned long)&btmrvl_sdio_sd8887 }, /* Marvell SD8897 Bluetooth device */ - { SDIO_DEVICE(SDIO_VENDOR_ID_MARVELL, 0x912E), + { SDIO_DEVICE(SDIO_VENDOR_ID_MARVELL, SDIO_DEVICE_ID_MARVELL_8897_BT), .driver_data = (unsigned long)&btmrvl_sdio_sd8897 }, /* Marvell SD8977 Bluetooth device */ - { SDIO_DEVICE(SDIO_VENDOR_ID_MARVELL, 0x9146), + { SDIO_DEVICE(SDIO_VENDOR_ID_MARVELL, SDIO_DEVICE_ID_MARVELL_8977_BT), .driver_data = (unsigned long)&btmrvl_sdio_sd8977 }, /* Marvell SD8987 Bluetooth device */ - { SDIO_DEVICE(SDIO_VENDOR_ID_MARVELL, 0x914A), + { SDIO_DEVICE(SDIO_VENDOR_ID_MARVELL, SDIO_DEVICE_ID_MARVELL_8987_BT), .driver_data = (unsigned long)&btmrvl_sdio_sd8987 }, /* Marvell SD8997 Bluetooth device */ - { SDIO_DEVICE(SDIO_VENDOR_ID_MARVELL, 0x9142), + { SDIO_DEVICE(SDIO_VENDOR_ID_MARVELL, 
SDIO_DEVICE_ID_MARVELL_8997_BT), .driver_data = (unsigned long)&btmrvl_sdio_sd8997 }, { } /* Terminating entry */ diff --git a/include/linux/mmc/sdio_ids.h b/include/linux/mmc/sdio_ids.h index 90361ea7f5ed..1237e1048d06 100644 --- a/include/linux/mmc/sdio_ids.h +++ b/include/linux/mmc/sdio_ids.h @@ -60,16 +60,24 @@ #define SDIO_DEVICE_ID_MARVELL_8688_BT 0x9105 #define SDIO_DEVICE_ID_MARVELL_8786_WLAN 0x9116 #define SDIO_DEVICE_ID_MARVELL_8787_WLAN 0x9119 +#define SDIO_DEVICE_ID_MARVELL_8787_BT 0x911a +#define SDIO_DEVICE_ID_MARVELL_8787_BT_AMP 0x911b #define SDIO_DEVICE_ID_MARVELL_8797_F0 0x9128 #define SDIO_DEVICE_ID_MARVELL_8797_WLAN 0x9129 +#define SDIO_DEVICE_ID_MARVELL_8797_BT 0x912a #define SDIO_DEVICE_ID_MARVELL_8897_WLAN 0x912d +#define SDIO_DEVICE_ID_MARVELL_8897_BT 0x912e #define SDIO_DEVICE_ID_MARVELL_8887_F0 0x9134 #define SDIO_DEVICE_ID_MARVELL_8887_WLAN 0x9135 +#define SDIO_DEVICE_ID_MARVELL_8887_BT 0x9136 #define SDIO_DEVICE_ID_MARVELL_8801_WLAN 0x9139 #define SDIO_DEVICE_ID_MARVELL_8997_F0 0x9140 #define SDIO_DEVICE_ID_MARVELL_8997_WLAN 0x9141 +#define SDIO_DEVICE_ID_MARVELL_8997_BT 0x9142 #define SDIO_DEVICE_ID_MARVELL_8977_WLAN 0x9145 +#define SDIO_DEVICE_ID_MARVELL_8977_BT 0x9146 #define SDIO_DEVICE_ID_MARVELL_8987_WLAN 0x9149 +#define SDIO_DEVICE_ID_MARVELL_8987_BT 0x914a #define SDIO_VENDOR_ID_MEDIATEK 0x037a -- cgit v1.2.3 From baaa110dcacfd704d182a275c4307baecb81423b Mon Sep 17 00:00:00 2001 From: Pali Rohár Date: Fri, 22 May 2020 16:44:06 +0200 Subject: mmc: sdio: Move SDIO IDs from btmtksdio driver to common include file MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Define appropriate macro names for consistency with other macros. Signed-off-by: Pali Rohár Reviewed-by: Matthias Brugger Link: https://lore.kernel.org/r/20200522144412.19712-6-pali@kernel.org Signed-off-by: Ulf Hansson Acked-by: Ganapathi Bhat --- drivers/bluetooth/btmtksdio.c | 4 ++-- include/linux/mmc/sdio_ids.h | 2 ++ 2 files changed, 4 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/drivers/bluetooth/btmtksdio.c b/drivers/bluetooth/btmtksdio.c index 519788c442ca..bff095be2f97 100644 --- a/drivers/bluetooth/btmtksdio.c +++ b/drivers/bluetooth/btmtksdio.c @@ -51,9 +51,9 @@ static const struct btmtksdio_data mt7668_data = { }; static const struct sdio_device_id btmtksdio_table[] = { - {SDIO_DEVICE(SDIO_VENDOR_ID_MEDIATEK, 0x7663), + {SDIO_DEVICE(SDIO_VENDOR_ID_MEDIATEK, SDIO_DEVICE_ID_MEDIATEK_MT7663), .driver_data = (kernel_ulong_t)&mt7663_data }, - {SDIO_DEVICE(SDIO_VENDOR_ID_MEDIATEK, 0x7668), + {SDIO_DEVICE(SDIO_VENDOR_ID_MEDIATEK, SDIO_DEVICE_ID_MEDIATEK_MT7668), .driver_data = (kernel_ulong_t)&mt7668_data }, { } /* Terminating entry */ }; diff --git a/include/linux/mmc/sdio_ids.h b/include/linux/mmc/sdio_ids.h index 1237e1048d06..c9aca57d4dea 100644 --- a/include/linux/mmc/sdio_ids.h +++ b/include/linux/mmc/sdio_ids.h @@ -80,6 +80,8 @@ #define SDIO_DEVICE_ID_MARVELL_8987_BT 0x914a #define SDIO_VENDOR_ID_MEDIATEK 0x037a +#define SDIO_DEVICE_ID_MEDIATEK_MT7663 0x7663 +#define SDIO_DEVICE_ID_MEDIATEK_MT7668 0x7668 #define SDIO_VENDOR_ID_SIANO 0x039a #define SDIO_DEVICE_ID_SIANO_NOVA_B0 0x0201 -- cgit v1.2.3 From b8c26a9663e1d888bbf4e14d3a62d033f286623e Mon Sep 17 00:00:00 2001 From: Pali Rohár Date: Fri, 22 May 2020 16:44:07 +0200 Subject: mmc: sdio: Move SDIO IDs from smssdio driver to common include file MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Define appropriate macro names for 
consistency with other Siano macros. Signed-off-by: Pali Rohár Link: https://lore.kernel.org/r/20200522144412.19712-7-pali@kernel.org Signed-off-by: Ulf Hansson --- drivers/media/mmc/siano/smssdio.c | 10 +++++----- include/linux/mmc/sdio_ids.h | 5 +++++ 2 files changed, 10 insertions(+), 5 deletions(-) (limited to 'include/linux') diff --git a/drivers/media/mmc/siano/smssdio.c b/drivers/media/mmc/siano/smssdio.c index def5e93849d2..065b572e0272 100644 --- a/drivers/media/mmc/siano/smssdio.c +++ b/drivers/media/mmc/siano/smssdio.c @@ -58,15 +58,15 @@ static const struct sdio_device_id smssdio_ids[] = { .driver_data = SMS1XXX_BOARD_SIANO_VEGA}, {SDIO_DEVICE(SDIO_VENDOR_ID_SIANO, SDIO_DEVICE_ID_SIANO_VENICE), .driver_data = SMS1XXX_BOARD_SIANO_VEGA}, - {SDIO_DEVICE(SDIO_VENDOR_ID_SIANO, 0x302), + {SDIO_DEVICE(SDIO_VENDOR_ID_SIANO, SDIO_DEVICE_ID_SIANO_MING), .driver_data = SMS1XXX_BOARD_SIANO_MING}, - {SDIO_DEVICE(SDIO_VENDOR_ID_SIANO, 0x500), + {SDIO_DEVICE(SDIO_VENDOR_ID_SIANO, SDIO_DEVICE_ID_SIANO_PELE), .driver_data = SMS1XXX_BOARD_SIANO_PELE}, - {SDIO_DEVICE(SDIO_VENDOR_ID_SIANO, 0x600), + {SDIO_DEVICE(SDIO_VENDOR_ID_SIANO, SDIO_DEVICE_ID_SIANO_RIO), .driver_data = SMS1XXX_BOARD_SIANO_RIO}, - {SDIO_DEVICE(SDIO_VENDOR_ID_SIANO, 0x700), + {SDIO_DEVICE(SDIO_VENDOR_ID_SIANO, SDIO_DEVICE_ID_SIANO_DENVER_2160), .driver_data = SMS1XXX_BOARD_SIANO_DENVER_2160}, - {SDIO_DEVICE(SDIO_VENDOR_ID_SIANO, 0x800), + {SDIO_DEVICE(SDIO_VENDOR_ID_SIANO, SDIO_DEVICE_ID_SIANO_DENVER_1530), .driver_data = SMS1XXX_BOARD_SIANO_DENVER_1530}, { /* end: all zeroes */ }, }; diff --git a/include/linux/mmc/sdio_ids.h b/include/linux/mmc/sdio_ids.h index c9aca57d4dea..9ec675a7ac37 100644 --- a/include/linux/mmc/sdio_ids.h +++ b/include/linux/mmc/sdio_ids.h @@ -88,6 +88,11 @@ #define SDIO_DEVICE_ID_SIANO_NICE 0x0202 #define SDIO_DEVICE_ID_SIANO_VEGA_A0 0x0300 #define SDIO_DEVICE_ID_SIANO_VENICE 0x0301 +#define SDIO_DEVICE_ID_SIANO_MING 0x0302 +#define SDIO_DEVICE_ID_SIANO_PELE 0x0500 +#define SDIO_DEVICE_ID_SIANO_RIO 0x0600 +#define SDIO_DEVICE_ID_SIANO_DENVER_2160 0x0700 +#define SDIO_DEVICE_ID_SIANO_DENVER_1530 0x0800 #define SDIO_DEVICE_ID_SIANO_NOVA_A0 0x1100 #define SDIO_DEVICE_ID_SIANO_STELLAR 0x5347 -- cgit v1.2.3 From ecc2f3962587aed327ccd3c32e99ccfc9e744fa0 Mon Sep 17 00:00:00 2001 From: Pali Rohár Date: Fri, 22 May 2020 16:44:08 +0200 Subject: mmc: sdio: Move SDIO IDs from ath6kl driver to common include file MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Also replace generic MANUFACTURER macros by proper SDIO IDs macros. Check for "AR6003 or later" is slightly modified to use SDIO device IDs. This allows removal of all custom MANUFACTURER macros from ath6kl. 
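A short note on why the simplified comparison is safe (my annotation, not text from the patch): because the 0x0300 bound is a multiple of 0x100, masking off the low revision byte can never move a device ID across the bound, so the plain comparison is exactly equivalent to the old mask-and-compare.

	/* Illustrative sketch only; the foo_* name is invented. */
	static bool foo_ar6003_or_later(u16 device)
	{
		/*
		 * Old: (device & 0xFF00) >= 0x0300  (mask away the revision)
		 * New:  device           >= 0x0300
		 * Since (device & 0xFF00) <= device < (device & 0xFF00) + 0x100
		 * and the bound has a zero low byte, both tests agree for
		 * every possible 16-bit device ID.
		 */
		return device >= SDIO_DEVICE_ID_ATHEROS_AR6003_00;
	}
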
Signed-off-by: Pali Rohár Acked-by: Kalle Valo Link: https://lore.kernel.org/r/20200522144412.19712-8-pali@kernel.org Signed-off-by: Ulf Hansson --- drivers/net/wireless/ath/ath6kl/hif.h | 6 ------ drivers/net/wireless/ath/ath6kl/sdio.c | 17 ++++++++--------- include/linux/mmc/sdio_ids.h | 10 ++++++++++ 3 files changed, 18 insertions(+), 15 deletions(-) (limited to 'include/linux') diff --git a/drivers/net/wireless/ath/ath6kl/hif.h b/drivers/net/wireless/ath/ath6kl/hif.h index dc6bd8cd9b83..c6dafc38936a 100644 --- a/drivers/net/wireless/ath/ath6kl/hif.h +++ b/drivers/net/wireless/ath/ath6kl/hif.h @@ -35,12 +35,6 @@ #define MAX_SCATTER_ENTRIES_PER_REQ 16 #define MAX_SCATTER_REQ_TRANSFER_SIZE (32 * 1024) -#define MANUFACTURER_ID_AR6003_BASE 0x300 -#define MANUFACTURER_ID_AR6004_BASE 0x400 - /* SDIO manufacturer ID and Codes */ -#define MANUFACTURER_ID_ATH6KL_BASE_MASK 0xFF00 -#define MANUFACTURER_CODE 0x271 /* Atheros */ - /* Mailbox address in SDIO address space */ #define HIF_MBOX_BASE_ADDR 0x800 #define HIF_MBOX_WIDTH 0x800 diff --git a/drivers/net/wireless/ath/ath6kl/sdio.c b/drivers/net/wireless/ath/ath6kl/sdio.c index bb50680580f3..6b51a2dceadc 100644 --- a/drivers/net/wireless/ath/ath6kl/sdio.c +++ b/drivers/net/wireless/ath/ath6kl/sdio.c @@ -799,8 +799,7 @@ static int ath6kl_sdio_config(struct ath6kl *ar) sdio_claim_host(func); - if ((ar_sdio->id->device & MANUFACTURER_ID_ATH6KL_BASE_MASK) >= - MANUFACTURER_ID_AR6003_BASE) { + if (ar_sdio->id->device >= SDIO_DEVICE_ID_ATHEROS_AR6003_00) { /* enable 4-bit ASYNC interrupt on AR6003 or later */ ret = ath6kl_sdio_func0_cmd52_wr_byte(func->card, CCCR_SDIO_IRQ_MODE_REG, @@ -1409,13 +1408,13 @@ static void ath6kl_sdio_remove(struct sdio_func *func) } static const struct sdio_device_id ath6kl_sdio_devices[] = { - {SDIO_DEVICE(MANUFACTURER_CODE, (MANUFACTURER_ID_AR6003_BASE | 0x0))}, - {SDIO_DEVICE(MANUFACTURER_CODE, (MANUFACTURER_ID_AR6003_BASE | 0x1))}, - {SDIO_DEVICE(MANUFACTURER_CODE, (MANUFACTURER_ID_AR6004_BASE | 0x0))}, - {SDIO_DEVICE(MANUFACTURER_CODE, (MANUFACTURER_ID_AR6004_BASE | 0x1))}, - {SDIO_DEVICE(MANUFACTURER_CODE, (MANUFACTURER_ID_AR6004_BASE | 0x2))}, - {SDIO_DEVICE(MANUFACTURER_CODE, (MANUFACTURER_ID_AR6004_BASE | 0x18))}, - {SDIO_DEVICE(MANUFACTURER_CODE, (MANUFACTURER_ID_AR6004_BASE | 0x19))}, + {SDIO_DEVICE(SDIO_VENDOR_ID_ATHEROS, SDIO_DEVICE_ID_ATHEROS_AR6003_00)}, + {SDIO_DEVICE(SDIO_VENDOR_ID_ATHEROS, SDIO_DEVICE_ID_ATHEROS_AR6003_01)}, + {SDIO_DEVICE(SDIO_VENDOR_ID_ATHEROS, SDIO_DEVICE_ID_ATHEROS_AR6004_00)}, + {SDIO_DEVICE(SDIO_VENDOR_ID_ATHEROS, SDIO_DEVICE_ID_ATHEROS_AR6004_01)}, + {SDIO_DEVICE(SDIO_VENDOR_ID_ATHEROS, SDIO_DEVICE_ID_ATHEROS_AR6004_02)}, + {SDIO_DEVICE(SDIO_VENDOR_ID_ATHEROS, SDIO_DEVICE_ID_ATHEROS_AR6004_18)}, + {SDIO_DEVICE(SDIO_VENDOR_ID_ATHEROS, SDIO_DEVICE_ID_ATHEROS_AR6004_19)}, {}, }; diff --git a/include/linux/mmc/sdio_ids.h b/include/linux/mmc/sdio_ids.h index 9ec675a7ac37..95b67ab7d06a 100644 --- a/include/linux/mmc/sdio_ids.h +++ b/include/linux/mmc/sdio_ids.h @@ -24,6 +24,16 @@ /* * Vendors and devices. Sort key: vendor first, device next. 
*/ + +#define SDIO_VENDOR_ID_ATHEROS 0x0271 +#define SDIO_DEVICE_ID_ATHEROS_AR6003_00 0x0300 +#define SDIO_DEVICE_ID_ATHEROS_AR6003_01 0x0301 +#define SDIO_DEVICE_ID_ATHEROS_AR6004_00 0x0400 +#define SDIO_DEVICE_ID_ATHEROS_AR6004_01 0x0401 +#define SDIO_DEVICE_ID_ATHEROS_AR6004_02 0x0402 +#define SDIO_DEVICE_ID_ATHEROS_AR6004_18 0x0418 +#define SDIO_DEVICE_ID_ATHEROS_AR6004_19 0x0419 + #define SDIO_VENDOR_ID_BROADCOM 0x02d0 #define SDIO_DEVICE_ID_BROADCOM_43143 0xa887 #define SDIO_DEVICE_ID_BROADCOM_43241 0x4324 -- cgit v1.2.3 From 4dc28c948f480c5004613aab153c7160bd4f29ce Mon Sep 17 00:00:00 2001 From: Pali Rohár Date: Fri, 22 May 2020 16:44:09 +0200 Subject: mmc: sdio: Move SDIO IDs from ath10k driver to common include file MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Also replace generic MANUFACTURER macros by proper SDIO IDs macros. Checks for device IDs are slightly modified to use SDIO device IDs. This allows removal of all custom MANUFACTURER macros from ath10k. Signed-off-by: Pali Rohár Link: https://lore.kernel.org/r/20200522144412.19712-9-pali@kernel.org Signed-off-by: Ulf Hansson --- drivers/net/wireless/ath/ath10k/sdio.c | 25 ++++++++++--------------- drivers/net/wireless/ath/ath10k/sdio.h | 8 -------- include/linux/mmc/sdio_ids.h | 2 ++ 3 files changed, 12 insertions(+), 23 deletions(-) (limited to 'include/linux') diff --git a/drivers/net/wireless/ath/ath10k/sdio.c b/drivers/net/wireless/ath/ath10k/sdio.c index 1f709b65c29b..59e725515041 100644 --- a/drivers/net/wireless/ath/ath10k/sdio.c +++ b/drivers/net/wireless/ath/ath10k/sdio.c @@ -1083,10 +1083,10 @@ static void ath10k_sdio_set_mbox_info(struct ath10k *ar) mbox_info->ext_info[0].htc_ext_addr = ATH10K_HIF_MBOX0_EXT_BASE_ADDR; - dev_id_base = FIELD_GET(QCA_MANUFACTURER_ID_BASE, device); - dev_id_chiprev = FIELD_GET(QCA_MANUFACTURER_ID_REV_MASK, device); + dev_id_base = (device & 0x0F00); + dev_id_chiprev = (device & 0x00FF); switch (dev_id_base) { - case QCA_MANUFACTURER_ID_AR6005_BASE: + case (SDIO_DEVICE_ID_ATHEROS_AR6005 & 0x0F00): if (dev_id_chiprev < 4) mbox_info->ext_info[0].htc_ext_sz = ATH10K_HIF_MBOX0_EXT_WIDTH; @@ -1097,7 +1097,7 @@ static void ath10k_sdio_set_mbox_info(struct ath10k *ar) mbox_info->ext_info[0].htc_ext_sz = ATH10K_HIF_MBOX0_EXT_WIDTH_ROME_2_0; break; - case QCA_MANUFACTURER_ID_QCA9377_BASE: + case (SDIO_DEVICE_ID_ATHEROS_QCA9377 & 0x0F00): mbox_info->ext_info[0].htc_ext_sz = ATH10K_HIF_MBOX0_EXT_WIDTH_ROME_2_0; break; @@ -2185,19 +2185,16 @@ static int ath10k_sdio_probe(struct sdio_func *func, skb_queue_head_init(&ar_sdio->rx_head); INIT_WORK(&ar_sdio->async_work_rx, ath10k_rx_indication_async_work); - dev_id_base = FIELD_GET(QCA_MANUFACTURER_ID_BASE, id->device); - switch (dev_id_base) { - case QCA_MANUFACTURER_ID_AR6005_BASE: - case QCA_MANUFACTURER_ID_QCA9377_BASE: - ar->dev_id = QCA9377_1_0_DEVICE_ID; - break; - default: + dev_id_base = (id->device & 0x0F00); + if (dev_id_base != (SDIO_DEVICE_ID_ATHEROS_AR6005 & 0x0F00) && + dev_id_base != (SDIO_DEVICE_ID_ATHEROS_QCA9377 & 0x0F00)) { ret = -ENODEV; ath10k_err(ar, "unsupported device id %u (0x%x)\n", dev_id_base, id->device); goto err_free_wq; } + ar->dev_id = QCA9377_1_0_DEVICE_ID; ar->id.vendor = id->vendor; ar->id.device = id->device; @@ -2246,10 +2243,8 @@ static void ath10k_sdio_remove(struct sdio_func *func) } static const struct sdio_device_id ath10k_sdio_devices[] = { - {SDIO_DEVICE(QCA_MANUFACTURER_CODE, - (QCA_SDIO_ID_AR6005_BASE | 0xA))}, - {SDIO_DEVICE(QCA_MANUFACTURER_CODE, - 
(QCA_SDIO_ID_QCA9377_BASE | 0x1))}, + {SDIO_DEVICE(SDIO_VENDOR_ID_ATHEROS, SDIO_DEVICE_ID_ATHEROS_AR6005)}, + {SDIO_DEVICE(SDIO_VENDOR_ID_ATHEROS, SDIO_DEVICE_ID_ATHEROS_QCA9377)}, {}, }; diff --git a/drivers/net/wireless/ath/ath10k/sdio.h b/drivers/net/wireless/ath/ath10k/sdio.h index 33195f49acab..e8951f9cdb5f 100644 --- a/drivers/net/wireless/ath/ath10k/sdio.h +++ b/drivers/net/wireless/ath/ath10k/sdio.h @@ -10,14 +10,6 @@ #define ATH10K_HIF_MBOX_BLOCK_SIZE 256 -#define QCA_MANUFACTURER_ID_BASE GENMASK(11, 8) -#define QCA_MANUFACTURER_ID_AR6005_BASE 0x5 -#define QCA_MANUFACTURER_ID_QCA9377_BASE 0x7 -#define QCA_SDIO_ID_AR6005_BASE 0x500 -#define QCA_SDIO_ID_QCA9377_BASE 0x700 -#define QCA_MANUFACTURER_ID_REV_MASK 0x00FF -#define QCA_MANUFACTURER_CODE 0x271 /* Qualcomm/Atheros */ - #define ATH10K_SDIO_MAX_BUFFER_SIZE 4096 /*Unsure of this constant*/ /* Mailbox address in SDIO address space */ diff --git a/include/linux/mmc/sdio_ids.h b/include/linux/mmc/sdio_ids.h index 95b67ab7d06a..2894f7739acc 100644 --- a/include/linux/mmc/sdio_ids.h +++ b/include/linux/mmc/sdio_ids.h @@ -33,6 +33,8 @@ #define SDIO_DEVICE_ID_ATHEROS_AR6004_02 0x0402 #define SDIO_DEVICE_ID_ATHEROS_AR6004_18 0x0418 #define SDIO_DEVICE_ID_ATHEROS_AR6004_19 0x0419 +#define SDIO_DEVICE_ID_ATHEROS_AR6005 0x050A +#define SDIO_DEVICE_ID_ATHEROS_QCA9377 0x0701 #define SDIO_VENDOR_ID_BROADCOM 0x02d0 #define SDIO_DEVICE_ID_BROADCOM_43143 0xa887 -- cgit v1.2.3 From 8baa6d1bce05cf26eec6b43bcd428735f33cb60e Mon Sep 17 00:00:00 2001 From: Pali Rohár Date: Fri, 22 May 2020 16:44:10 +0200 Subject: mmc: sdio: Move SDIO IDs from b43-sdio driver to common include file MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Define appropriate macro names for consistency with other macros. Signed-off-by: Pali Rohár Link: https://lore.kernel.org/r/20200522144412.19712-10-pali@kernel.org Signed-off-by: Ulf Hansson --- drivers/net/wireless/broadcom/b43/sdio.c | 4 ++-- include/linux/mmc/sdio_ids.h | 4 ++++ 2 files changed, 6 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/drivers/net/wireless/broadcom/b43/sdio.c b/drivers/net/wireless/broadcom/b43/sdio.c index 881a7938c494..02b0cfd535ab 100644 --- a/drivers/net/wireless/broadcom/b43/sdio.c +++ b/drivers/net/wireless/broadcom/b43/sdio.c @@ -180,8 +180,8 @@ static void b43_sdio_remove(struct sdio_func *func) } static const struct sdio_device_id b43_sdio_ids[] = { - { SDIO_DEVICE(0x02d0, 0x044b) }, /* Nintendo Wii WLAN daughter card */ - { SDIO_DEVICE(0x0092, 0x0004) }, /* C-guys, Inc. EW-CG1102GC */ + { SDIO_DEVICE(SDIO_VENDOR_ID_BROADCOM, SDIO_DEVICE_ID_BROADCOM_NINTENDO_WII) }, + { SDIO_DEVICE(SDIO_VENDOR_ID_CGUYS, SDIO_DEVICE_ID_CGUYS_EW_CG1102GC) }, { }, }; diff --git a/include/linux/mmc/sdio_ids.h b/include/linux/mmc/sdio_ids.h index 2894f7739acc..8e7a0683b927 100644 --- a/include/linux/mmc/sdio_ids.h +++ b/include/linux/mmc/sdio_ids.h @@ -25,6 +25,9 @@ * Vendors and devices. Sort key: vendor first, device next. 
*/ +#define SDIO_VENDOR_ID_CGUYS 0x0092 +#define SDIO_DEVICE_ID_CGUYS_EW_CG1102GC 0x0004 + #define SDIO_VENDOR_ID_ATHEROS 0x0271 #define SDIO_DEVICE_ID_ATHEROS_AR6003_00 0x0300 #define SDIO_DEVICE_ID_ATHEROS_AR6003_01 0x0301 @@ -37,6 +40,7 @@ #define SDIO_DEVICE_ID_ATHEROS_QCA9377 0x0701 #define SDIO_VENDOR_ID_BROADCOM 0x02d0 +#define SDIO_DEVICE_ID_BROADCOM_NINTENDO_WII 0x044b #define SDIO_DEVICE_ID_BROADCOM_43143 0xa887 #define SDIO_DEVICE_ID_BROADCOM_43241 0x4324 #define SDIO_DEVICE_ID_BROADCOM_4329 0x4329 -- cgit v1.2.3 From 1eb911258805695c8f85795d3d4cdbd1e84fc2d9 Mon Sep 17 00:00:00 2001 From: Pali Rohár Date: Fri, 22 May 2020 16:44:11 +0200 Subject: mmc: sdio: Fix Cypress SDIO IDs macros in common include file MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit All macro names for SDIO device IDs are prefixed by the name of the vendor to which the device ID belongs. So for consistency, add the Broadcom vendor prefix to all Cypress macro names, as they belong to the Broadcom SDIO vendor ID. Also change the Cypress 43012 value from decimal to hexadecimal notation to be consistent with all other values. Signed-off-by: Pali Rohár Link: https://lore.kernel.org/r/20200522144412.19712-11-pali@kernel.org Signed-off-by: Ulf Hansson --- drivers/net/wireless/broadcom/brcm80211/brcmfmac/bcmsdh.c | 6 +++--- drivers/net/wireless/broadcom/brcm80211/brcmfmac/sdio.c | 4 ++-- include/linux/mmc/sdio_ids.h | 6 +++--- 3 files changed, 8 insertions(+), 8 deletions(-) (limited to 'include/linux') diff --git a/drivers/net/wireless/broadcom/brcm80211/brcmfmac/bcmsdh.c b/drivers/net/wireless/broadcom/brcm80211/brcmfmac/bcmsdh.c index b684a5b6d904..a1fdb618cf14 100644 --- a/drivers/net/wireless/broadcom/brcm80211/brcmfmac/bcmsdh.c +++ b/drivers/net/wireless/broadcom/brcm80211/brcmfmac/bcmsdh.c @@ -970,9 +970,9 @@ static const struct sdio_device_id brcmf_sdmmc_ids[] = { BRCMF_SDIO_DEVICE(SDIO_DEVICE_ID_BROADCOM_4354), BRCMF_SDIO_DEVICE(SDIO_DEVICE_ID_BROADCOM_4356), BRCMF_SDIO_DEVICE(SDIO_DEVICE_ID_BROADCOM_4359), - BRCMF_SDIO_DEVICE(SDIO_DEVICE_ID_CYPRESS_4373), - BRCMF_SDIO_DEVICE(SDIO_DEVICE_ID_CYPRESS_43012), - BRCMF_SDIO_DEVICE(SDIO_DEVICE_ID_CYPRESS_89359), + BRCMF_SDIO_DEVICE(SDIO_DEVICE_ID_BROADCOM_CYPRESS_4373), + BRCMF_SDIO_DEVICE(SDIO_DEVICE_ID_BROADCOM_CYPRESS_43012), + BRCMF_SDIO_DEVICE(SDIO_DEVICE_ID_BROADCOM_CYPRESS_89359), { /* end: all zeroes */ } }; MODULE_DEVICE_TABLE(sdio, brcmf_sdmmc_ids); diff --git a/drivers/net/wireless/broadcom/brcm80211/brcmfmac/sdio.c b/drivers/net/wireless/broadcom/brcm80211/brcmfmac/sdio.c index 3a08252f1a53..1c9561665a67 100644 --- a/drivers/net/wireless/broadcom/brcm80211/brcmfmac/sdio.c +++ b/drivers/net/wireless/broadcom/brcm80211/brcmfmac/sdio.c @@ -4187,7 +4187,7 @@ static void brcmf_sdio_firmware_callback(struct device *dev, int err, bus->hostintmask, NULL); switch (sdiod->func1->device) { - case SDIO_DEVICE_ID_CYPRESS_4373: + case SDIO_DEVICE_ID_BROADCOM_CYPRESS_4373: brcmf_dbg(INFO, "set F2 watermark to 0x%x*4 bytes\n", CY_4373_F2_WATERMARK); brcmf_sdiod_writeb(sdiod, SBSDIO_WATERMARK, @@ -4201,7 +4201,7 @@ static void brcmf_sdio_firmware_callback(struct device *dev, int err, CY_4373_F2_WATERMARK | SBSDIO_MESBUSYCTRL_ENAB, &err); break; - case SDIO_DEVICE_ID_CYPRESS_43012: + case SDIO_DEVICE_ID_BROADCOM_CYPRESS_43012: brcmf_dbg(INFO, "set F2 watermark to 0x%x*4 bytes\n", CY_43012_F2_WATERMARK); brcmf_sdiod_writeb(sdiod, SBSDIO_WATERMARK, diff --git a/include/linux/mmc/sdio_ids.h b/include/linux/mmc/sdio_ids.h index
8e7a0683b927..b19200aea56a 100644 --- a/include/linux/mmc/sdio_ids.h +++ b/include/linux/mmc/sdio_ids.h @@ -58,9 +58,9 @@ #define SDIO_DEVICE_ID_BROADCOM_4354 0x4354 #define SDIO_DEVICE_ID_BROADCOM_4356 0x4356 #define SDIO_DEVICE_ID_BROADCOM_4359 0x4359 -#define SDIO_DEVICE_ID_CYPRESS_4373 0x4373 -#define SDIO_DEVICE_ID_CYPRESS_43012 43012 -#define SDIO_DEVICE_ID_CYPRESS_89359 0x4355 +#define SDIO_DEVICE_ID_BROADCOM_CYPRESS_4373 0x4373 +#define SDIO_DEVICE_ID_BROADCOM_CYPRESS_43012 0xa804 +#define SDIO_DEVICE_ID_BROADCOM_CYPRESS_89359 0x4355 #define SDIO_VENDOR_ID_INTEL 0x0089 #define SDIO_DEVICE_ID_INTEL_IWMC3200WIMAX 0x1402 -- cgit v1.2.3 From 798dd3c311f6c862719a3fc71c3c14eba192c64e Mon Sep 17 00:00:00 2001 From: Pali Rohár Date: Fri, 22 May 2020 16:44:12 +0200 Subject: mmc: sdio: Sort all SDIO IDs in common include file MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Fix ordering of SDIO IDs to conform to the comment above, which says vendor first, device next. Signed-off-by: Pali Rohár Link: https://lore.kernel.org/r/20200522144412.19712-12-pali@kernel.org Signed-off-by: Ulf Hansson --- include/linux/mmc/sdio_ids.h | 43 ++++++++++++++++++++++--------------------- 1 file changed, 22 insertions(+), 21 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mmc/sdio_ids.h b/include/linux/mmc/sdio_ids.h index b19200aea56a..15ed8ce9d394 100644 --- a/include/linux/mmc/sdio_ids.h +++ b/include/linux/mmc/sdio_ids.h @@ -25,9 +25,23 @@ * Vendors and devices. Sort key: vendor first, device next. */ +#define SDIO_VENDOR_ID_STE 0x0020 +#define SDIO_DEVICE_ID_STE_CW1200 0x2280 + +#define SDIO_VENDOR_ID_INTEL 0x0089 +#define SDIO_DEVICE_ID_INTEL_IWMC3200WIMAX 0x1402 +#define SDIO_DEVICE_ID_INTEL_IWMC3200WIFI 0x1403 +#define SDIO_DEVICE_ID_INTEL_IWMC3200TOP 0x1404 +#define SDIO_DEVICE_ID_INTEL_IWMC3200GPS 0x1405 +#define SDIO_DEVICE_ID_INTEL_IWMC3200BT 0x1406 +#define SDIO_DEVICE_ID_INTEL_IWMC3200WIMAX_2G5 0x1407 + #define SDIO_VENDOR_ID_CGUYS 0x0092 #define SDIO_DEVICE_ID_CGUYS_EW_CG1102GC 0x0004 +#define SDIO_VENDOR_ID_TI 0x0097 +#define SDIO_DEVICE_ID_TI_WL1271 0x4076 + #define SDIO_VENDOR_ID_ATHEROS 0x0271 #define SDIO_DEVICE_ID_ATHEROS_AR6003_00 0x0300 #define SDIO_DEVICE_ID_ATHEROS_AR6003_01 0x0301 @@ -41,34 +55,26 @@ #define SDIO_VENDOR_ID_BROADCOM 0x02d0 #define SDIO_DEVICE_ID_BROADCOM_NINTENDO_WII 0x044b -#define SDIO_DEVICE_ID_BROADCOM_43143 0xa887 #define SDIO_DEVICE_ID_BROADCOM_43241 0x4324 #define SDIO_DEVICE_ID_BROADCOM_4329 0x4329 #define SDIO_DEVICE_ID_BROADCOM_4330 0x4330 #define SDIO_DEVICE_ID_BROADCOM_4334 0x4334 -#define SDIO_DEVICE_ID_BROADCOM_43340 0xa94c -#define SDIO_DEVICE_ID_BROADCOM_43341 0xa94d #define SDIO_DEVICE_ID_BROADCOM_4335_4339 0x4335 #define SDIO_DEVICE_ID_BROADCOM_4339 0x4339 -#define SDIO_DEVICE_ID_BROADCOM_43362 0xa962 -#define SDIO_DEVICE_ID_BROADCOM_43364 0xa9a4 -#define SDIO_DEVICE_ID_BROADCOM_43430 0xa9a6 #define SDIO_DEVICE_ID_BROADCOM_4345 0x4345 -#define SDIO_DEVICE_ID_BROADCOM_43455 0xa9bf #define SDIO_DEVICE_ID_BROADCOM_4354 0x4354 +#define SDIO_DEVICE_ID_BROADCOM_CYPRESS_89359 0x4355 #define SDIO_DEVICE_ID_BROADCOM_4356 0x4356 #define SDIO_DEVICE_ID_BROADCOM_4359 0x4359 #define SDIO_DEVICE_ID_BROADCOM_CYPRESS_4373 0x4373 #define SDIO_DEVICE_ID_BROADCOM_CYPRESS_43012 0xa804 -#define SDIO_DEVICE_ID_BROADCOM_CYPRESS_89359 0x4355 - -#define SDIO_VENDOR_ID_INTEL 0x0089 -#define SDIO_DEVICE_ID_INTEL_IWMC3200WIMAX 0x1402 -#define SDIO_DEVICE_ID_INTEL_IWMC3200WIFI 0x1403 -#define 
SDIO_DEVICE_ID_INTEL_IWMC3200TOP 0x1404 -#define SDIO_DEVICE_ID_INTEL_IWMC3200GPS 0x1405 -#define SDIO_DEVICE_ID_INTEL_IWMC3200BT 0x1406 -#define SDIO_DEVICE_ID_INTEL_IWMC3200WIMAX_2G5 0x1407 +#define SDIO_DEVICE_ID_BROADCOM_43143 0xa887 +#define SDIO_DEVICE_ID_BROADCOM_43340 0xa94c +#define SDIO_DEVICE_ID_BROADCOM_43341 0xa94d +#define SDIO_DEVICE_ID_BROADCOM_43362 0xa962 +#define SDIO_DEVICE_ID_BROADCOM_43364 0xa9a4 +#define SDIO_DEVICE_ID_BROADCOM_43430 0xa9a6 +#define SDIO_DEVICE_ID_BROADCOM_43455 0xa9bf #define SDIO_VENDOR_ID_MARVELL 0x02df #define SDIO_DEVICE_ID_MARVELL_LIBERTAS 0x9103 @@ -112,12 +118,7 @@ #define SDIO_DEVICE_ID_SIANO_NOVA_A0 0x1100 #define SDIO_DEVICE_ID_SIANO_STELLAR 0x5347 -#define SDIO_VENDOR_ID_TI 0x0097 -#define SDIO_DEVICE_ID_TI_WL1271 0x4076 #define SDIO_VENDOR_ID_TI_WL1251 0x104c #define SDIO_DEVICE_ID_TI_WL1251 0x9066 -#define SDIO_VENDOR_ID_STE 0x0020 -#define SDIO_DEVICE_ID_STE_CW1200 0x2280 - #endif /* LINUX_MMC_SDIO_IDS_H */ -- cgit v1.2.3 From d58a2df3d8877b91ecbfb936a15da364251a228f Mon Sep 17 00:00:00 2001 From: Lukas Wunner Date: Mon, 18 May 2020 16:45:02 +0200 Subject: serial: 8250: Support rs485 bus termination GPIO Commit e8759ad17d41 ("serial: uapi: Add support for bus termination") introduced the ability to enable rs485 bus termination from user space. So far the feature is only used by a single driver, 8250_exar.c, using a hardcoded GPIO pin specific to Siemens IOT2040 products. Provide for a more generic solution by allowing specification of an rs485 bus termination GPIO pin in the device tree: Amend the serial core to retrieve the GPIO from the device tree (or ACPI table) and amend the default ->rs485_config() callback for 8250 drivers to change the GPIO on request from user space. Perhaps 8250_exar.c can be converted to the generic approach in a follow-up patch. Signed-off-by: Lukas Wunner Reviewed-by: Andy Shevchenko Cc: Jan Kiszka Link: https://lore.kernel.org/r/94c6c800d1ca9fa04766dd1d43a8272c5ad4bedd.1589811297.git.lukas@wunner.de Signed-off-by: Greg Kroah-Hartman --- drivers/tty/serial/8250/8250_port.c | 3 +++ drivers/tty/serial/serial_core.c | 16 ++++++++++++++++ include/linux/serial_core.h | 2 ++ 3 files changed, 21 insertions(+) (limited to 'include/linux') diff --git a/drivers/tty/serial/8250/8250_port.c b/drivers/tty/serial/8250/8250_port.c index bf2722e5e3aa..1632f7d25acc 100644 --- a/drivers/tty/serial/8250/8250_port.c +++ b/drivers/tty/serial/8250/8250_port.c @@ -681,6 +681,9 @@ int serial8250_em485_config(struct uart_port *port, struct serial_rs485 *rs485) memset(rs485->padding, 0, sizeof(rs485->padding)); port->rs485 = *rs485; + gpiod_set_value(port->rs485_term_gpio, + rs485->flags & SER_RS485_TERMINATE_BUS); + /* * Both serial8250_em485_init() and serial8250_em485_destroy() * are idempotent. 
diff --git a/drivers/tty/serial/serial_core.c b/drivers/tty/serial/serial_core.c index 43b6682877d5..57840cf90388 100644 --- a/drivers/tty/serial/serial_core.c +++ b/drivers/tty/serial/serial_core.c @@ -3317,6 +3317,7 @@ int uart_get_rs485_mode(struct uart_port *port) * to get to a defined state with the following properties: */ rs485conf->flags &= ~(SER_RS485_RX_DURING_TX | SER_RS485_ENABLED | + SER_RS485_TERMINATE_BUS | SER_RS485_RTS_AFTER_SEND); rs485conf->flags |= SER_RS485_RTS_ON_SEND; @@ -3331,6 +3332,21 @@ int uart_get_rs485_mode(struct uart_port *port) rs485conf->flags |= SER_RS485_RTS_AFTER_SEND; } + /* + * Disabling termination by default is the safe choice: Else if many + * bus participants enable it, no communication is possible at all. + * Works fine for short cables and users may enable for longer cables. + */ + port->rs485_term_gpio = devm_gpiod_get_optional(dev, "rs485-term", + GPIOD_OUT_LOW); + if (IS_ERR(port->rs485_term_gpio)) { + ret = PTR_ERR(port->rs485_term_gpio); + port->rs485_term_gpio = NULL; + if (ret != -EPROBE_DEFER) + dev_err(dev, "Cannot get rs485-term-gpios\n"); + return ret; + } + return 0; } EXPORT_SYMBOL_GPL(uart_get_rs485_mode); diff --git a/include/linux/serial_core.h b/include/linux/serial_core.h index b649a2b894e7..9fd550e7946a 100644 --- a/include/linux/serial_core.h +++ b/include/linux/serial_core.h @@ -10,6 +10,7 @@ #include #include #include +#include #include #include #include @@ -251,6 +252,7 @@ struct uart_port { struct attribute_group *attr_group; /* port specific attributes */ const struct attribute_group **tty_groups; /* all attributes (serial core use only) */ struct serial_rs485 rs485; + struct gpio_desc *rs485_term_gpio; /* enable RS485 bus termination */ struct serial_iso7816 iso7816; void *private_data; /* generic platform data pointer */ }; -- cgit v1.2.3 From 5ace60859e84113b7a185c117fbf2c429d381b59 Mon Sep 17 00:00:00 2001 From: Jean Delvare Date: Mon, 16 Mar 2020 11:22:24 +0100 Subject: i2c: smbus: Add a way to instantiate SPD EEPROMs automatically In simple cases we can instantiate SPD EEPROMs on the SMBus automatically. Start with just DDR2, DDR3 and DDR4 on x86 for now, and only for systems with no more than 4 memory slots. These limitations may be lifted later. Signed-off-by: Jean Delvare [wsa: minor change for new API] Signed-off-by: Wolfram Sang --- drivers/i2c/i2c-smbus.c | 104 +++++++++++++++++++++++++++++++++++++++++++++- include/linux/i2c-smbus.h | 8 +++- 2 files changed, 110 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/drivers/i2c/i2c-smbus.c b/drivers/i2c/i2c-smbus.c index 809bcf8387d0..dc0108287ccf 100644 --- a/drivers/i2c/i2c-smbus.c +++ b/drivers/i2c/i2c-smbus.c @@ -3,10 +3,11 @@ * i2c-smbus.c - SMBus extensions to the I2C protocol * * Copyright (C) 2008 David Brownell - * Copyright (C) 2010 Jean Delvare + * Copyright (C) 2010-2019 Jean Delvare */ #include +#include #include #include #include @@ -196,6 +197,107 @@ EXPORT_SYMBOL_GPL(i2c_handle_smbus_alert); module_i2c_driver(smbalert_driver); +/* + * SPD is not part of SMBus but we include it here for convenience as the + * target systems are the same. 
+ * Restrictions to automatic SPD instantiation: + * - Only works if all filled slots have the same memory type + * - Only works for DDR2, DDR3 and DDR4 for now + * - Only works on systems with 1 to 4 memory slots + */ +#if IS_ENABLED(CONFIG_DMI) +void i2c_register_spd(struct i2c_adapter *adap) +{ + int n, slot_count = 0, dimm_count = 0; + u16 handle; + u8 common_mem_type = 0x0, mem_type; + u64 mem_size; + const char *name; + + while ((handle = dmi_memdev_handle(slot_count)) != 0xffff) { + slot_count++; + + /* Skip empty slots */ + mem_size = dmi_memdev_size(handle); + if (!mem_size) + continue; + + /* Skip undefined memory type */ + mem_type = dmi_memdev_type(handle); + if (mem_type <= 0x02) /* Invalid, Other, Unknown */ + continue; + + if (!common_mem_type) { + /* First filled slot */ + common_mem_type = mem_type; + } else { + /* Check that all filled slots have the same type */ + if (mem_type != common_mem_type) { + dev_warn(&adap->dev, + "Different memory types mixed, not instantiating SPD\n"); + return; + } + } + dimm_count++; + } + + /* No useful DMI data, bail out */ + if (!dimm_count) + return; + + dev_info(&adap->dev, "%d/%d memory slots populated (from DMI)\n", + dimm_count, slot_count); + + if (slot_count > 4) { + dev_warn(&adap->dev, + "Systems with more than 4 memory slots not supported yet, not instantiating SPD\n"); + return; + } + + switch (common_mem_type) { + case 0x13: /* DDR2 */ + case 0x18: /* DDR3 */ + case 0x1C: /* LPDDR2 */ + case 0x1D: /* LPDDR3 */ + name = "spd"; + break; + case 0x1A: /* DDR4 */ + case 0x1E: /* LPDDR4 */ + name = "ee1004"; + break; + default: + dev_info(&adap->dev, + "Memory type 0x%02x not supported yet, not instantiating SPD\n", + common_mem_type); + return; + } + + /* + * We don't know in which slots the memory modules are. We could + * try to guess from the slot names, but that would be rather complex + * and unreliable, so better probe all possible addresses until we + * have found all memory modules. 
+ */ + for (n = 0; n < slot_count && dimm_count; n++) { + struct i2c_board_info info; + unsigned short addr_list[2]; + + memset(&info, 0, sizeof(struct i2c_board_info)); + strlcpy(info.type, name, I2C_NAME_SIZE); + addr_list[0] = 0x50 + n; + addr_list[1] = I2C_CLIENT_END; + + if (!IS_ERR(i2c_new_scanned_device(adap, &info, addr_list, NULL))) { + dev_info(&adap->dev, + "Successfully instantiated SPD at 0x%hx\n", + addr_list[0]); + dimm_count--; + } + } +} +EXPORT_SYMBOL_GPL(i2c_register_spd); +#endif + MODULE_AUTHOR("Jean Delvare "); MODULE_DESCRIPTION("SMBus protocol extensions support"); MODULE_LICENSE("GPL"); diff --git a/include/linux/i2c-smbus.h b/include/linux/i2c-smbus.h index 8c5459034f92..1e4e0de4ef8b 100644 --- a/include/linux/i2c-smbus.h +++ b/include/linux/i2c-smbus.h @@ -2,7 +2,7 @@ /* * i2c-smbus.h - SMBus extensions to the I2C protocol * - * Copyright (C) 2010 Jean Delvare + * Copyright (C) 2010-2019 Jean Delvare */ #ifndef _LINUX_I2C_SMBUS_H @@ -39,4 +39,10 @@ static inline int of_i2c_setup_smbus_alert(struct i2c_adapter *adap) } #endif +#if IS_ENABLED(CONFIG_I2C_SMBUS) && IS_ENABLED(CONFIG_DMI) +void i2c_register_spd(struct i2c_adapter *adap); +#else +static inline void i2c_register_spd(struct i2c_adapter *adap) { } +#endif + #endif /* _LINUX_I2C_SMBUS_H */ -- cgit v1.2.3 From 8baebfc2aca26e3fa67ab28343671b82be42b22c Mon Sep 17 00:00:00 2001 From: Vladimir Oltean Date: Thu, 28 May 2020 02:41:03 +0300 Subject: regmap: add helper for per-port regfield initialization Similar to the standalone regfields, add an initializer for the users who need to set .id_size and .id_offset in order to use the regmap_fields_update_bits_base API. Signed-off-by: Vladimir Oltean Link: https://lore.kernel.org/r/20200527234113.2491988-2-olteanv@gmail.com Signed-off-by: Mark Brown --- include/linux/regmap.h | 8 ++++++++ 1 file changed, 8 insertions(+) (limited to 'include/linux') diff --git a/include/linux/regmap.h b/include/linux/regmap.h index 40b07168fd8e..87703d105191 100644 --- a/include/linux/regmap.h +++ b/include/linux/regmap.h @@ -1134,6 +1134,14 @@ struct reg_field { .msb = _msb, \ } +#define REG_FIELD_ID(_reg, _lsb, _msb, _size, _offset) { \ + .reg = _reg, \ + .lsb = _lsb, \ + .msb = _msb, \ + .id_size = _size, \ + .id_offset = _offset, \ + } + struct regmap_field *regmap_field_alloc(struct regmap *regmap, struct reg_field reg_field); void regmap_field_free(struct regmap_field *field); -- cgit v1.2.3 From fb01562e5a8a731bb1807eba0a9fadb355ca2277 Mon Sep 17 00:00:00 2001 From: Jean-Philippe Brucker Date: Thu, 23 Apr 2020 14:53:28 +0200 Subject: uacce: Remove mm_exit() op The mm_exit() op will be removed from the SVA API. When a process dies and its mm goes away, the IOMMU driver won't notify device drivers anymore. Drivers should expect to handle a lot more aborted DMA. On the upside, it does greatly simplify the queue management. The uacce_mm struct, that tracks all queues bound to an mm, was only used by the mm_exit() callback. Remove it. 
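For clarity, the per-queue flow that replaces the shared uacce_mm tracking boils down to the following (condensed from the patch below with error paths trimmed; the example_ name is a placeholder):

    static int example_bind_queue(struct uacce_device *uacce, struct uacce_queue *q)
    {
    	struct iommu_sva *handle;

    	handle = iommu_sva_bind_device(uacce->parent, current->mm, NULL);
    	if (IS_ERR(handle))
    		return PTR_ERR(handle);

    	q->pasid = iommu_sva_get_pasid(handle);	/* PASID the device will use for DMA */
    	q->handle = handle;			/* dropped again when the queue is released */
    	return 0;
    }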
Signed-off-by: Jean-Philippe Brucker Acked-by: Jacob Pan Acked-by: Lu Baolu Acked-by: Zhangfei Gao Link: https://lore.kernel.org/r/20200423125329.782066-2-jean-philippe@linaro.org Signed-off-by: Joerg Roedel --- drivers/misc/uacce/uacce.c | 172 +++++++++++---------------------------------- include/linux/uacce.h | 34 +++------ 2 files changed, 51 insertions(+), 155 deletions(-) (limited to 'include/linux') diff --git a/drivers/misc/uacce/uacce.c b/drivers/misc/uacce/uacce.c index d39307f060bd..107028e77ca3 100644 --- a/drivers/misc/uacce/uacce.c +++ b/drivers/misc/uacce/uacce.c @@ -90,109 +90,39 @@ static long uacce_fops_compat_ioctl(struct file *filep, } #endif -static int uacce_sva_exit(struct device *dev, struct iommu_sva *handle, - void *data) +static int uacce_bind_queue(struct uacce_device *uacce, struct uacce_queue *q) { - struct uacce_mm *uacce_mm = data; - struct uacce_queue *q; - - /* - * No new queue can be added concurrently because no caller can have a - * reference to this mm. But there may be concurrent calls to - * uacce_mm_put(), so we need the lock. - */ - mutex_lock(&uacce_mm->lock); - list_for_each_entry(q, &uacce_mm->queues, list) - uacce_put_queue(q); - uacce_mm->mm = NULL; - mutex_unlock(&uacce_mm->lock); + int pasid; + struct iommu_sva *handle; - return 0; -} - -static struct iommu_sva_ops uacce_sva_ops = { - .mm_exit = uacce_sva_exit, -}; - -static struct uacce_mm *uacce_mm_get(struct uacce_device *uacce, - struct uacce_queue *q, - struct mm_struct *mm) -{ - struct uacce_mm *uacce_mm = NULL; - struct iommu_sva *handle = NULL; - int ret; - - lockdep_assert_held(&uacce->mm_lock); - - list_for_each_entry(uacce_mm, &uacce->mm_list, list) { - if (uacce_mm->mm == mm) { - mutex_lock(&uacce_mm->lock); - list_add(&q->list, &uacce_mm->queues); - mutex_unlock(&uacce_mm->lock); - return uacce_mm; - } - } - - uacce_mm = kzalloc(sizeof(*uacce_mm), GFP_KERNEL); - if (!uacce_mm) - return NULL; + if (!(uacce->flags & UACCE_DEV_SVA)) + return 0; - if (uacce->flags & UACCE_DEV_SVA) { - /* - * Safe to pass an incomplete uacce_mm, since mm_exit cannot - * fire while we hold a reference to the mm. 
- */ - handle = iommu_sva_bind_device(uacce->parent, mm, uacce_mm); - if (IS_ERR(handle)) - goto err_free; + handle = iommu_sva_bind_device(uacce->parent, current->mm, NULL); + if (IS_ERR(handle)) + return PTR_ERR(handle); - ret = iommu_sva_set_ops(handle, &uacce_sva_ops); - if (ret) - goto err_unbind; - - uacce_mm->pasid = iommu_sva_get_pasid(handle); - if (uacce_mm->pasid == IOMMU_PASID_INVALID) - goto err_unbind; + pasid = iommu_sva_get_pasid(handle); + if (pasid == IOMMU_PASID_INVALID) { + iommu_sva_unbind_device(handle); + return -ENODEV; } - uacce_mm->mm = mm; - uacce_mm->handle = handle; - INIT_LIST_HEAD(&uacce_mm->queues); - mutex_init(&uacce_mm->lock); - list_add(&q->list, &uacce_mm->queues); - list_add(&uacce_mm->list, &uacce->mm_list); - - return uacce_mm; - -err_unbind: - if (handle) - iommu_sva_unbind_device(handle); -err_free: - kfree(uacce_mm); - return NULL; + q->handle = handle; + q->pasid = pasid; + return 0; } -static void uacce_mm_put(struct uacce_queue *q) +static void uacce_unbind_queue(struct uacce_queue *q) { - struct uacce_mm *uacce_mm = q->uacce_mm; - - lockdep_assert_held(&q->uacce->mm_lock); - - mutex_lock(&uacce_mm->lock); - list_del(&q->list); - mutex_unlock(&uacce_mm->lock); - - if (list_empty(&uacce_mm->queues)) { - if (uacce_mm->handle) - iommu_sva_unbind_device(uacce_mm->handle); - list_del(&uacce_mm->list); - kfree(uacce_mm); - } + if (!q->handle) + return; + iommu_sva_unbind_device(q->handle); + q->handle = NULL; } static int uacce_fops_open(struct inode *inode, struct file *filep) { - struct uacce_mm *uacce_mm = NULL; struct uacce_device *uacce; struct uacce_queue *q; int ret = 0; @@ -205,21 +135,16 @@ static int uacce_fops_open(struct inode *inode, struct file *filep) if (!q) return -ENOMEM; - mutex_lock(&uacce->mm_lock); - uacce_mm = uacce_mm_get(uacce, q, current->mm); - mutex_unlock(&uacce->mm_lock); - if (!uacce_mm) { - ret = -ENOMEM; + ret = uacce_bind_queue(uacce, q); + if (ret) goto out_with_mem; - } q->uacce = uacce; - q->uacce_mm = uacce_mm; if (uacce->ops->get_queue) { - ret = uacce->ops->get_queue(uacce, uacce_mm->pasid, q); + ret = uacce->ops->get_queue(uacce, q->pasid, q); if (ret < 0) - goto out_with_mm; + goto out_with_bond; } init_waitqueue_head(&q->wait); @@ -227,12 +152,14 @@ static int uacce_fops_open(struct inode *inode, struct file *filep) uacce->inode = inode; q->state = UACCE_Q_INIT; + mutex_lock(&uacce->queues_lock); + list_add(&q->list, &uacce->queues); + mutex_unlock(&uacce->queues_lock); + return 0; -out_with_mm: - mutex_lock(&uacce->mm_lock); - uacce_mm_put(q); - mutex_unlock(&uacce->mm_lock); +out_with_bond: + uacce_unbind_queue(q); out_with_mem: kfree(q); return ret; @@ -241,14 +168,12 @@ out_with_mem: static int uacce_fops_release(struct inode *inode, struct file *filep) { struct uacce_queue *q = filep->private_data; - struct uacce_device *uacce = q->uacce; + mutex_lock(&q->uacce->queues_lock); + list_del(&q->list); + mutex_unlock(&q->uacce->queues_lock); uacce_put_queue(q); - - mutex_lock(&uacce->mm_lock); - uacce_mm_put(q); - mutex_unlock(&uacce->mm_lock); - + uacce_unbind_queue(q); kfree(q); return 0; @@ -513,8 +438,8 @@ struct uacce_device *uacce_alloc(struct device *parent, if (ret < 0) goto err_with_uacce; - INIT_LIST_HEAD(&uacce->mm_list); - mutex_init(&uacce->mm_lock); + INIT_LIST_HEAD(&uacce->queues); + mutex_init(&uacce->queues_lock); device_initialize(&uacce->dev); uacce->dev.devt = MKDEV(MAJOR(uacce_devt), uacce->dev_id); uacce->dev.class = uacce_class; @@ -561,8 +486,7 @@ EXPORT_SYMBOL_GPL(uacce_register); */ 
void uacce_remove(struct uacce_device *uacce) { - struct uacce_mm *uacce_mm; - struct uacce_queue *q; + struct uacce_queue *q, *next_q; if (!uacce) return; @@ -574,24 +498,12 @@ void uacce_remove(struct uacce_device *uacce) unmap_mapping_range(uacce->inode->i_mapping, 0, 0, 1); /* ensure no open queue remains */ - mutex_lock(&uacce->mm_lock); - list_for_each_entry(uacce_mm, &uacce->mm_list, list) { - /* - * We don't take the uacce_mm->lock here. Since we hold the - * device's mm_lock, no queue can be added to or removed from - * this uacce_mm. We may run concurrently with mm_exit, but - * uacce_put_queue() is serialized and iommu_sva_unbind_device() - * waits for the lock that mm_exit is holding. - */ - list_for_each_entry(q, &uacce_mm->queues, list) - uacce_put_queue(q); - - if (uacce->flags & UACCE_DEV_SVA) { - iommu_sva_unbind_device(uacce_mm->handle); - uacce_mm->handle = NULL; - } + mutex_lock(&uacce->queues_lock); + list_for_each_entry_safe(q, next_q, &uacce->queues, list) { + uacce_put_queue(q); + uacce_unbind_queue(q); } - mutex_unlock(&uacce->mm_lock); + mutex_unlock(&uacce->queues_lock); /* disable sva now since no opened queues */ if (uacce->flags & UACCE_DEV_SVA) diff --git a/include/linux/uacce.h b/include/linux/uacce.h index 0e215e6d0534..454c2f6672d7 100644 --- a/include/linux/uacce.h +++ b/include/linux/uacce.h @@ -68,19 +68,21 @@ enum uacce_q_state { * @uacce: pointer to uacce * @priv: private pointer * @wait: wait queue head - * @list: index into uacce_mm - * @uacce_mm: the corresponding mm + * @list: index into uacce queues list * @qfrs: pointer of qfr regions * @state: queue state machine + * @pasid: pasid associated to the mm + * @handle: iommu_sva handle returned by iommu_sva_bind_device() */ struct uacce_queue { struct uacce_device *uacce; void *priv; wait_queue_head_t wait; struct list_head list; - struct uacce_mm *uacce_mm; struct uacce_qfile_region *qfrs[UACCE_MAX_REGION]; enum uacce_q_state state; + int pasid; + struct iommu_sva *handle; }; /** @@ -96,8 +98,8 @@ struct uacce_queue { * @cdev: cdev of the uacce * @dev: dev of the uacce * @priv: private pointer of the uacce - * @mm_list: list head of uacce_mm->list - * @mm_lock: lock for mm_list + * @queues: list of queues + * @queues_lock: lock for queues list * @inode: core vfs */ struct uacce_device { @@ -112,27 +114,9 @@ struct uacce_device { struct cdev *cdev; struct device dev; void *priv; - struct list_head mm_list; - struct mutex mm_lock; - struct inode *inode; -}; - -/** - * struct uacce_mm - keep track of queues bound to a process - * @list: index into uacce_device - * @queues: list of queues - * @mm: the mm struct - * @lock: protects the list of queues - * @pasid: pasid of the uacce_mm - * @handle: iommu_sva handle return from iommu_sva_bind_device - */ -struct uacce_mm { - struct list_head list; struct list_head queues; - struct mm_struct *mm; - struct mutex lock; - int pasid; - struct iommu_sva *handle; + struct mutex queues_lock; + struct inode *inode; }; #if IS_ENABLED(CONFIG_UACCE) -- cgit v1.2.3 From edcc40d2ab5f47f205c2dd2a9aeedd8c77de050a Mon Sep 17 00:00:00 2001 From: Jean-Philippe Brucker Date: Thu, 23 Apr 2020 14:53:30 +0200 Subject: iommu: Remove iommu_sva_ops::mm_exit() After binding a device to an mm, device drivers currently need to register a mm_exit handler. This function is called when the mm exits, to gracefully stop DMA targeting the address space and flush page faults to the IOMMU. 
This is deemed too complex for the MMU release() notifier, which may be triggered by any mmput() invocation, from about 120 callsites [1]. The upcoming SVA module has an example of such complexity: the I/O Page Fault handler would need to call mmput_async() instead of mmput() after handling an IOPF, to avoid triggering the release() notifier which would in turn drain the IOPF queue and lock up. Another concern is the DMA stop function taking too long, up to several minutes [2]. For some mmput() callers this may disturb other users. For example, if the OOM killer picks the mm bound to a device as the victim and that mm's memory is locked, if the release() takes too long, it might choose additional innocent victims to kill. To simplify the MMU release notifier, don't forward the notification to device drivers. Since they don't stop DMA on mm exit anymore, the PASID lifetime is extended: (1) The device driver calls bind(). A PASID is allocated. Here any DMA fault is handled by mm, and on error we don't print anything to dmesg. Userspace can easily trigger errors by issuing DMA on unmapped buffers. (2) exit_mmap(), for example the process took a SIGKILL. This step doesn't happen during normal operations. Remove the pgd from the PASID table, since the page tables are about to be freed. Invalidate the IOTLBs. Here the device may still perform DMA on the address space. Incoming transactions are aborted but faults aren't printed out. ATS Translation Requests return Successful Translation Completions with R=W=0. PRI Page Requests return with Invalid Request. (3) The device driver stops DMA, possibly following release of a fd, and calls unbind(). PASID table is cleared, IOTLB invalidated if necessary. The page fault queues are drained, and the PASID is freed. If DMA for that PASID is still running here, something went seriously wrong and errors should be reported. For now remove iommu_sva_ops entirely. We might need to re-introduce them at some point, for example to notify device drivers of unhandled IOPF. 
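Put as code, the driver-side contract is now just bind/unbind around the DMA window; a minimal sketch of steps (1)-(3) above (the example_ device helpers are hypothetical):

    struct iommu_sva *handle;
    int pasid;

    handle = iommu_sva_bind_device(dev, current->mm, NULL);	/* (1) */
    if (IS_ERR(handle))
    	return PTR_ERR(handle);

    pasid = iommu_sva_get_pasid(handle);
    example_start_dma(dev, pasid);		/* hypothetical driver helper */

    /* If the mm exits meanwhile (2), outstanding DMA is aborted quietly and
     * no callback fires. The driver simply stops DMA and unbinds (3). */
    example_stop_dma(dev, pasid);		/* hypothetical driver helper */
    iommu_sva_unbind_device(handle);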
[1] https://lore.kernel.org/linux-iommu/20200306174239.GM31668@ziepe.ca/ [2] https://lore.kernel.org/linux-iommu/4d68da96-0ad5-b412-5987-2f7a6aa796c3@amd.com/ Signed-off-by: Jean-Philippe Brucker Acked-by: Jacob Pan Acked-by: Lu Baolu Link: https://lore.kernel.org/r/20200423125329.782066-3-jean-philippe@linaro.org Signed-off-by: Joerg Roedel --- drivers/iommu/iommu.c | 11 ----------- include/linux/iommu.h | 30 ------------------------------ 2 files changed, 41 deletions(-) (limited to 'include/linux') diff --git a/drivers/iommu/iommu.c b/drivers/iommu/iommu.c index 298397721144..abcd19118169 100644 --- a/drivers/iommu/iommu.c +++ b/drivers/iommu/iommu.c @@ -2883,17 +2883,6 @@ void iommu_sva_unbind_device(struct iommu_sva *handle) } EXPORT_SYMBOL_GPL(iommu_sva_unbind_device); -int iommu_sva_set_ops(struct iommu_sva *handle, - const struct iommu_sva_ops *sva_ops) -{ - if (handle->ops && handle->ops != sva_ops) - return -EEXIST; - - handle->ops = sva_ops; - return 0; -} -EXPORT_SYMBOL_GPL(iommu_sva_set_ops); - int iommu_sva_get_pasid(struct iommu_sva *handle) { const struct iommu_ops *ops = handle->dev->bus->iommu_ops; diff --git a/include/linux/iommu.h b/include/linux/iommu.h index 7cfd2dddb49d..08d3506172a1 100644 --- a/include/linux/iommu.h +++ b/include/linux/iommu.h @@ -53,8 +53,6 @@ struct iommu_fault_event; typedef int (*iommu_fault_handler_t)(struct iommu_domain *, struct device *, unsigned long, int, void *); -typedef int (*iommu_mm_exit_handler_t)(struct device *dev, struct iommu_sva *, - void *); typedef int (*iommu_dev_fault_handler_t)(struct iommu_fault *, void *); struct iommu_domain_geometry { @@ -171,25 +169,6 @@ enum iommu_dev_features { #define IOMMU_PASID_INVALID (-1U) -/** - * struct iommu_sva_ops - device driver callbacks for an SVA context - * - * @mm_exit: called when the mm is about to be torn down by exit_mmap. After - * @mm_exit returns, the device must not issue any more transaction - * with the PASID given as argument. - * - * The @mm_exit handler is allowed to sleep. Be careful about the - * locks taken in @mm_exit, because they might lead to deadlocks if - * they are also held when dropping references to the mm. Consider the - * following call chain: - * mutex_lock(A); mmput(mm) -> exit_mm() -> @mm_exit() -> mutex_lock(A) - * Using mmput_async() prevents this scenario. 
- * - */ -struct iommu_sva_ops { - iommu_mm_exit_handler_t mm_exit; -}; - #ifdef CONFIG_IOMMU_API /** @@ -616,7 +595,6 @@ struct iommu_fwspec { */ struct iommu_sva { struct device *dev; - const struct iommu_sva_ops *ops; }; int iommu_fwspec_init(struct device *dev, struct fwnode_handle *iommu_fwnode, @@ -664,8 +642,6 @@ struct iommu_sva *iommu_sva_bind_device(struct device *dev, struct mm_struct *mm, void *drvdata); void iommu_sva_unbind_device(struct iommu_sva *handle); -int iommu_sva_set_ops(struct iommu_sva *handle, - const struct iommu_sva_ops *ops); int iommu_sva_get_pasid(struct iommu_sva *handle); #else /* CONFIG_IOMMU_API */ @@ -1069,12 +1045,6 @@ static inline void iommu_sva_unbind_device(struct iommu_sva *handle) { } -static inline int iommu_sva_set_ops(struct iommu_sva *handle, - const struct iommu_sva_ops *ops) -{ - return -EINVAL; -} - static inline int iommu_sva_get_pasid(struct iommu_sva *handle) { return IOMMU_PASID_INVALID; -- cgit v1.2.3 From aa2ff9dbaeddabb5ad166db5f9f1a0580a8bbba8 Mon Sep 17 00:00:00 2001 From: Bartosz Golaszewski Date: Thu, 28 May 2020 17:45:02 +0200 Subject: regmap: provide helpers for simple bit operations In many instances regmap_update_bits() is used for simple bit setting and clearing. In these cases the last argument is redundant and we can hide it with a static inline function. This adds three new helpers for simple bit operations: set_bits, clear_bits and test_bits (the last one defined as a regular function). Signed-off-by: Bartosz Golaszewski Link: https://lore.kernel.org/r/20200528154503.26304-2-brgl@bgdev.pl Signed-off-by: Mark Brown --- drivers/base/regmap/regmap.c | 22 ++++++++++++++++++++++ include/linux/regmap.h | 36 ++++++++++++++++++++++++++++++++++++ 2 files changed, 58 insertions(+) (limited to 'include/linux') diff --git a/drivers/base/regmap/regmap.c b/drivers/base/regmap/regmap.c index 59f911e57719..4ad5c5adc0a3 100644 --- a/drivers/base/regmap/regmap.c +++ b/drivers/base/regmap/regmap.c @@ -2936,6 +2936,28 @@ int regmap_update_bits_base(struct regmap *map, unsigned int reg, } EXPORT_SYMBOL_GPL(regmap_update_bits_base); +/** + * regmap_test_bits() - Check if all specified bits are set in a register. + * + * @map: Register map to operate on + * @reg: Register to read from + * @bits: Bits to test + * + * Returns -1 if the underlying regmap_read() fails, 0 if at least one of the + * tested bits is not set and 1 if all tested bits are set. 
+ */ +int regmap_test_bits(struct regmap *map, unsigned int reg, unsigned int bits) +{ + unsigned int val, ret; + + ret = regmap_read(map, reg, &val); + if (ret) + return ret; + + return (val & bits) == bits; +} +EXPORT_SYMBOL_GPL(regmap_test_bits); + void regmap_async_complete_cb(struct regmap_async *async, int ret) { struct regmap *map = async->map; diff --git a/include/linux/regmap.h b/include/linux/regmap.h index 40b07168fd8e..ddf0baff195d 100644 --- a/include/linux/regmap.h +++ b/include/linux/regmap.h @@ -1111,6 +1111,21 @@ bool regmap_reg_in_ranges(unsigned int reg, const struct regmap_range *ranges, unsigned int nranges); +static inline int regmap_set_bits(struct regmap *map, + unsigned int reg, unsigned int bits) +{ + return regmap_update_bits_base(map, reg, bits, bits, + NULL, false, false); +} + +static inline int regmap_clear_bits(struct regmap *map, + unsigned int reg, unsigned int bits) +{ + return regmap_update_bits_base(map, reg, bits, 0, NULL, false, false); +} + +int regmap_test_bits(struct regmap *map, unsigned int reg, unsigned int bits); + /** * struct reg_field - Description of an register field * @@ -1410,6 +1425,27 @@ static inline int regmap_update_bits_base(struct regmap *map, unsigned int reg, return -EINVAL; } +static inline int regmap_set_bits(struct regmap *map, + unsigned int reg, unsigned int bits) +{ + WARN_ONCE(1, "regmap API is disabled"); + return -EINVAL; +} + +static inline int regmap_clear_bits(struct regmap *map, + unsigned int reg, unsigned int bits) +{ + WARN_ONCE(1, "regmap API is disabled"); + return -EINVAL; +} + +static inline int regmap_test_bits(struct regmap *map, + unsigned int reg, unsigned int bits) +{ + WARN_ONCE(1, "regmap API is disabled"); + return -EINVAL; +} + static inline int regmap_field_update_bits_base(struct regmap_field *field, unsigned int mask, unsigned int val, bool *change, bool async, bool force) -- cgit v1.2.3 From 4fda230ecddc2573ed88632e98b69b0b9b68c0ad Mon Sep 17 00:00:00 2001 From: Jon Derrick Date: Wed, 27 May 2020 10:56:16 -0600 Subject: iommu/vt-d: Allocate domain info for real DMA sub-devices Sub-devices of a real DMA device might exist on a separate segment than the real DMA device and its IOMMU. These devices should still have a valid device_domain_info, but the current dma alias model won't allocate info for the subdevice. This patch adds a segment member to struct device_domain_info and uses the sub-device's BDF so that these sub-devices won't alias to other devices. 
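Condensed from the hunk below, the new lookup key keeps a sub-device's own segment and BDF instead of inheriting the real DMA device's, so sub-devices no longer collide (the segment value in the comment is only an illustration):

    if (!dev_is_real_dma_subdevice(dev)) {
    	info->bus = bus;
    	info->devfn = devfn;
    	info->segment = iommu->segment;
    } else {
    	struct pci_dev *pdev = to_pci_dev(dev);

    	info->bus = pdev->bus->number;
    	info->devfn = pdev->devfn;
    	info->segment = pci_domain_nr(pdev->bus);	/* e.g. a separate domain behind VMD */
    }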
Fixes: 2b0140c69637e ("iommu/vt-d: Use pci_real_dma_dev() for mapping") Cc: stable@vger.kernel.org # v5.6+ Signed-off-by: Jon Derrick Acked-by: Lu Baolu Link: https://lore.kernel.org/r/20200527165617.297470-3-jonathan.derrick@intel.com Signed-off-by: Joerg Roedel --- drivers/iommu/intel-iommu.c | 19 +++++++++++++++---- include/linux/intel-iommu.h | 1 + 2 files changed, 16 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/drivers/iommu/intel-iommu.c b/drivers/iommu/intel-iommu.c index 1ff45b2d03ab..6d39b9bd89a6 100644 --- a/drivers/iommu/intel-iommu.c +++ b/drivers/iommu/intel-iommu.c @@ -2463,7 +2463,7 @@ dmar_search_domain_by_dev_info(int segment, int bus, int devfn) struct device_domain_info *info; list_for_each_entry(info, &device_domain_list, global) - if (info->iommu->segment == segment && info->bus == bus && + if (info->segment == segment && info->bus == bus && info->devfn == devfn) return info; @@ -2520,8 +2520,18 @@ static struct dmar_domain *dmar_insert_one_dev_info(struct intel_iommu *iommu, if (!info) return NULL; - info->bus = bus; - info->devfn = devfn; + if (!dev_is_real_dma_subdevice(dev)) { + info->bus = bus; + info->devfn = devfn; + info->segment = iommu->segment; + } else { + struct pci_dev *pdev = to_pci_dev(dev); + + info->bus = pdev->bus->number; + info->devfn = pdev->devfn; + info->segment = pci_domain_nr(pdev->bus); + } + info->ats_supported = info->pasid_supported = info->pri_supported = 0; info->ats_enabled = info->pasid_enabled = info->pri_enabled = 0; info->ats_qdep = 0; @@ -2561,7 +2571,8 @@ static struct dmar_domain *dmar_insert_one_dev_info(struct intel_iommu *iommu, if (!found) { struct device_domain_info *info2; - info2 = dmar_search_domain_by_dev_info(iommu->segment, bus, devfn); + info2 = dmar_search_domain_by_dev_info(info->segment, info->bus, + info->devfn); if (info2) { found = info2->domain; info2->dev = dev; diff --git a/include/linux/intel-iommu.h b/include/linux/intel-iommu.h index 21633cee6331..4100bd224f5c 100644 --- a/include/linux/intel-iommu.h +++ b/include/linux/intel-iommu.h @@ -609,6 +609,7 @@ struct device_domain_info { struct list_head auxiliary_domains; /* auxiliary domains * attached to this device */ + u32 segment; /* PCI segment number */ u8 bus; /* PCI bus number */ u8 devfn; /* PCI devfn number */ u16 pfsid; /* SRIOV physical function source ID */ -- cgit v1.2.3 From 752db83a5dfd4fd3a0624b9ab440ed947fa003ca Mon Sep 17 00:00:00 2001 From: Marek Szyprowski Date: Fri, 29 May 2020 14:49:39 +0200 Subject: regulator: extract voltage balancing code to the separate function Move the coupled regulators voltage balancing code to the separate function and allow to call it from the custom regulator couplers. 
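A custom coupler can then apply its own policy and still reuse the core balancing loop; a hedged sketch (the example_ callback and its policy are hypothetical, the helper's prototype is taken from the patch below):

    static int example_balance_voltage(struct regulator_coupler *coupler,
    				   struct regulator_dev *rdev,
    				   suspend_state_t state)
    {
    	/* e.g. only balance the whole coupled set while the system is running */
    	bool skip_coupled = (state != PM_SUSPEND_ON);

    	return regulator_do_balance_voltage(rdev, state, skip_coupled);
    }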
Signed-off-by: Marek Szyprowski Link: https://lore.kernel.org/r/20200529124940.10675-2-m.szyprowski@samsung.com Signed-off-by: Mark Brown --- drivers/regulator/core.c | 49 +++++++++++++++++++++++---------------- include/linux/regulator/coupler.h | 8 +++++++ 2 files changed, 37 insertions(+), 20 deletions(-) (limited to 'include/linux') diff --git a/drivers/regulator/core.c b/drivers/regulator/core.c index c340505150b6..1eba6c202bf5 100644 --- a/drivers/regulator/core.c +++ b/drivers/regulator/core.c @@ -3642,36 +3642,19 @@ finish: return done; } -static int regulator_balance_voltage(struct regulator_dev *rdev, - suspend_state_t state) +int regulator_do_balance_voltage(struct regulator_dev *rdev, + suspend_state_t state, bool skip_coupled) { struct regulator_dev **c_rdevs; struct regulator_dev *best_rdev; struct coupling_desc *c_desc = &rdev->coupling_desc; - struct regulator_coupler *coupler = c_desc->coupler; int i, ret, n_coupled, best_min_uV, best_max_uV, best_c_rdev; unsigned int delta, best_delta; unsigned long c_rdev_done = 0; bool best_c_rdev_done; c_rdevs = c_desc->coupled_rdevs; - n_coupled = c_desc->n_coupled; - - /* - * If system is in a state other than PM_SUSPEND_ON, don't check - * other coupled regulators. - */ - if (state != PM_SUSPEND_ON) - n_coupled = 1; - - if (c_desc->n_resolved < n_coupled) { - rdev_err(rdev, "Not all coupled regulators registered\n"); - return -EPERM; - } - - /* Invoke custom balancer for customized couplers */ - if (coupler && coupler->balance_voltage) - return coupler->balance_voltage(coupler, rdev, state); + n_coupled = skip_coupled ? 1 : c_desc->n_coupled; /* * Find the best possible voltage change on each loop. Leave the loop @@ -3742,6 +3725,32 @@ out: return ret; } +static int regulator_balance_voltage(struct regulator_dev *rdev, + suspend_state_t state) +{ + struct coupling_desc *c_desc = &rdev->coupling_desc; + struct regulator_coupler *coupler = c_desc->coupler; + bool skip_coupled = false; + + /* + * If system is in a state other than PM_SUSPEND_ON, don't check + * other coupled regulators. + */ + if (state != PM_SUSPEND_ON) + skip_coupled = true; + + if (c_desc->n_resolved < c_desc->n_coupled) { + rdev_err(rdev, "Not all coupled regulators registered\n"); + return -EPERM; + } + + /* Invoke custom balancer for customized couplers */ + if (coupler && coupler->balance_voltage) + return coupler->balance_voltage(coupler, rdev, state); + + return regulator_do_balance_voltage(rdev, state, skip_coupled); +} + /** * regulator_set_voltage - set regulator output voltage * @regulator: regulator source diff --git a/include/linux/regulator/coupler.h b/include/linux/regulator/coupler.h index 0212d6255e4e..5f86824bd117 100644 --- a/include/linux/regulator/coupler.h +++ b/include/linux/regulator/coupler.h @@ -62,6 +62,8 @@ int regulator_get_voltage_rdev(struct regulator_dev *rdev); int regulator_set_voltage_rdev(struct regulator_dev *rdev, int min_uV, int max_uV, suspend_state_t state); +int regulator_do_balance_voltage(struct regulator_dev *rdev, + suspend_state_t state, bool skip_coupled); #else static inline int regulator_coupler_register(struct regulator_coupler *coupler) { @@ -92,6 +94,12 @@ static inline int regulator_set_voltage_rdev(struct regulator_dev *rdev, { return -EINVAL; } +static inline int regulator_do_balance_voltage(struct regulator_dev *rdev, + suspend_state_t state, + bool skip_coupled) +{ + return -EINVAL; +} #endif #endif -- cgit v1.2.3 From 9a7875461fd0427dc86e3a87e93bd5723679b8b1 Mon Sep 17 00:00:00 2001 From: "Rafael J. 
Wysocki" Date: Thu, 28 May 2020 16:45:14 +0200 Subject: PM: runtime: Replace pm_runtime_callbacks_present() The name of pm_runtime_callbacks_present() is confusing, because it suggests that the device has PM-runtime callbacks if 'true' is returned by that function, but in fact that may not be the case, so replace it with pm_runtime_has_no_callbacks() which is not ambiguous. No functional impact. Signed-off-by: Rafael J. Wysocki Reviewed-by: Ulf Hansson --- drivers/base/power/sysfs.c | 4 ++-- include/linux/pm_runtime.h | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/drivers/base/power/sysfs.c b/drivers/base/power/sysfs.c index 2b99fe1eb207..24d25cf8ab14 100644 --- a/drivers/base/power/sysfs.c +++ b/drivers/base/power/sysfs.c @@ -666,7 +666,7 @@ int dpm_sysfs_add(struct device *dev) if (rc) return rc; - if (pm_runtime_callbacks_present(dev)) { + if (!pm_runtime_has_no_callbacks(dev)) { rc = sysfs_merge_group(&dev->kobj, &pm_runtime_attr_group); if (rc) goto err_out; @@ -709,7 +709,7 @@ int dpm_sysfs_change_owner(struct device *dev, kuid_t kuid, kgid_t kgid) if (rc) return rc; - if (pm_runtime_callbacks_present(dev)) { + if (!pm_runtime_has_no_callbacks(dev)) { rc = sysfs_group_change_owner( &dev->kobj, &pm_runtime_attr_group, kuid, kgid); if (rc) diff --git a/include/linux/pm_runtime.h b/include/linux/pm_runtime.h index 3bdcbce8141a..3dbc207bff53 100644 --- a/include/linux/pm_runtime.h +++ b/include/linux/pm_runtime.h @@ -102,9 +102,9 @@ static inline bool pm_runtime_enabled(struct device *dev) return !dev->power.disable_depth; } -static inline bool pm_runtime_callbacks_present(struct device *dev) +static inline bool pm_runtime_has_no_callbacks(struct device *dev) { - return !dev->power.no_callbacks; + return dev->power.no_callbacks; } static inline void pm_runtime_mark_last_busy(struct device *dev) -- cgit v1.2.3 From 7b11eab041dacfeaaa6d27d9183b247a995bc16d Mon Sep 17 00:00:00 2001 From: Keith Busch Date: Fri, 29 May 2020 07:51:59 -0700 Subject: blk-mq: blk-mq: provide forced completion method Drivers may need to bypass error injection for error recovery. Rename __blk_mq_complete_request() to blk_mq_force_complete_rq() and export that function so drivers may skip potential fake timeouts after they've reclaimed lost requests. Signed-off-by: Keith Busch Reviewed-by: Daniel Wagner Signed-off-by: Jens Axboe --- block/blk-mq.c | 15 +++++++++++++-- include/linux/blk-mq.h | 1 + 2 files changed, 14 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/block/blk-mq.c b/block/blk-mq.c index c606c74463cc..c6dcd390be2d 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -556,7 +556,17 @@ static void __blk_mq_complete_request_remote(void *data) q->mq_ops->complete(rq); } -static void __blk_mq_complete_request(struct request *rq) +/** + * blk_mq_force_complete_rq() - Force complete the request, bypassing any error + * injection that could drop the completion. + * @rq: Request to be force completed + * + * Drivers should use blk_mq_complete_request() to complete requests in their + * normal IO path. For timeout error recovery, drivers may call this forced + * completion routine after they've reclaimed timed out requests to bypass + * potentially subsequent fake timeouts. 
+ */ +void blk_mq_force_complete_rq(struct request *rq) { struct blk_mq_ctx *ctx = rq->mq_ctx; struct request_queue *q = rq->q; @@ -602,6 +612,7 @@ static void __blk_mq_complete_request(struct request *rq) } put_cpu(); } +EXPORT_SYMBOL_GPL(blk_mq_force_complete_rq); static void hctx_unlock(struct blk_mq_hw_ctx *hctx, int srcu_idx) __releases(hctx->srcu) @@ -635,7 +646,7 @@ bool blk_mq_complete_request(struct request *rq) { if (unlikely(blk_should_fake_timeout(rq->q))) return false; - __blk_mq_complete_request(rq); + blk_mq_force_complete_rq(rq); return true; } EXPORT_SYMBOL(blk_mq_complete_request); diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h index d7307795439a..856bb10993cf 100644 --- a/include/linux/blk-mq.h +++ b/include/linux/blk-mq.h @@ -494,6 +494,7 @@ void blk_mq_requeue_request(struct request *rq, bool kick_requeue_list); void blk_mq_kick_requeue_list(struct request_queue *q); void blk_mq_delay_kick_requeue_list(struct request_queue *q, unsigned long msecs); bool blk_mq_complete_request(struct request *rq); +void blk_mq_force_complete_rq(struct request *rq); bool blk_mq_bio_list_merge(struct request_queue *q, struct list_head *list, struct bio *bio, unsigned int nr_segs); bool blk_mq_queue_stopped(struct request_queue *q); -- cgit v1.2.3 From 5d9c305b8ea3fbc95bedfde01f7dd91e68082098 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Fri, 29 May 2020 15:53:08 +0200 Subject: blk-mq: remove the bio argument to ->prepare_request None of the I/O schedulers actually needs it. Signed-off-by: Christoph Hellwig Reviewed-by: Ming Lei Reviewed-by: Hannes Reinecke Reviewed-by: Johannes Thumshirn Reviewed-by: Bart Van Assche Reviewed-by: Daniel Wagner Signed-off-by: Jens Axboe --- block/bfq-iosched.c | 2 +- block/blk-mq.c | 2 +- block/kyber-iosched.c | 2 +- block/mq-deadline.c | 2 +- include/linux/elevator.h | 2 +- 5 files changed, 5 insertions(+), 5 deletions(-) (limited to 'include/linux') diff --git a/block/bfq-iosched.c b/block/bfq-iosched.c index 3d411716d7ee..50c8f034c01c 100644 --- a/block/bfq-iosched.c +++ b/block/bfq-iosched.c @@ -6073,7 +6073,7 @@ static struct bfq_queue *bfq_get_bfqq_handle_split(struct bfq_data *bfqd, * comments on bfq_init_rq for the reason behind this delayed * preparation. */ -static void bfq_prepare_request(struct request *rq, struct bio *bio) +static void bfq_prepare_request(struct request *rq) { /* * Regardless of whether we have an icq attached, we have to diff --git a/block/blk-mq.c b/block/blk-mq.c index c6dcd390be2d..f15722cbfac6 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -387,7 +387,7 @@ static struct request *blk_mq_get_request(struct request_queue *q, if (e->type->icq_cache) blk_mq_sched_assign_ioc(rq); - e->type->ops.prepare_request(rq, bio); + e->type->ops.prepare_request(rq); rq->rq_flags |= RQF_ELVPRIV; } } diff --git a/block/kyber-iosched.c b/block/kyber-iosched.c index 34dcea0ef637..a38c5ab103d1 100644 --- a/block/kyber-iosched.c +++ b/block/kyber-iosched.c @@ -579,7 +579,7 @@ static bool kyber_bio_merge(struct blk_mq_hw_ctx *hctx, struct bio *bio, return merged; } -static void kyber_prepare_request(struct request *rq, struct bio *bio) +static void kyber_prepare_request(struct request *rq) { rq_set_domain_token(rq, -1); } diff --git a/block/mq-deadline.c b/block/mq-deadline.c index b490f47fd553..b57470e154c8 100644 --- a/block/mq-deadline.c +++ b/block/mq-deadline.c @@ -541,7 +541,7 @@ static void dd_insert_requests(struct blk_mq_hw_ctx *hctx, * Nothing to do here. 
This is defined only to ensure that .finish_request * method is called upon request completion. */ -static void dd_prepare_request(struct request *rq, struct bio *bio) +static void dd_prepare_request(struct request *rq) { } diff --git a/include/linux/elevator.h b/include/linux/elevator.h index 901bda352dcb..bacc40a0bdf3 100644 --- a/include/linux/elevator.h +++ b/include/linux/elevator.h @@ -39,7 +39,7 @@ struct elevator_mq_ops { void (*request_merged)(struct request_queue *, struct request *, enum elv_merge); void (*requests_merged)(struct request_queue *, struct request *, struct request *); void (*limit_depth)(unsigned int, struct blk_mq_alloc_data *); - void (*prepare_request)(struct request *, struct bio *bio); + void (*prepare_request)(struct request *); void (*finish_request)(struct request *); void (*insert_requests)(struct blk_mq_hw_ctx *, struct list_head *, bool); struct request *(*dispatch_request)(struct blk_mq_hw_ctx *); -- cgit v1.2.3 From bf0beec0607db3c6f6fb7bd2c6d503792b05cf3f Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Fri, 29 May 2020 15:53:15 +0200 Subject: blk-mq: drain I/O when all CPUs in a hctx are offline Most of blk-mq drivers depend on managed IRQ's auto-affinity to setup up queue mapping. Thomas mentioned the following point[1]: "That was the constraint of managed interrupts from the very beginning: The driver/subsystem has to quiesce the interrupt line and the associated queue _before_ it gets shutdown in CPU unplug and not fiddle with it until it's restarted by the core when the CPU is plugged in again." However, current blk-mq implementation doesn't quiesce hw queue before the last CPU in the hctx is shutdown. Even worse, CPUHP_BLK_MQ_DEAD is a cpuhp state handled after the CPU is down, so there isn't any chance to quiesce the hctx before shutting down the CPU. Add new CPUHP_AP_BLK_MQ_ONLINE state to stop allocating from blk-mq hctxs where the last CPU goes away, and wait for completion of in-flight requests. This guarantees that there is no inflight I/O before shutting down the managed IRQ. Add a BLK_MQ_F_STACKING and set it for dm-rq and loop, so we don't need to wait for completion of in-flight requests from these drivers to avoid a potential dead-lock. It is safe to do this for stacking drivers as those do not use interrupts at all and their I/O completions are triggered by underlying devices I/O completion. 
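The heart of the change is a small handshake between the hotplug notifier and the tag allocator; condensed from the hunks below to show how the two sides pair up:

    /* Offline path: mark the hctx inactive, then drain it. */
    set_bit(BLK_MQ_S_INACTIVE, &hctx->state);
    smp_mb__after_atomic();	/* pairs with sbitmap's test_and_set_bit_lock() */
    while (blk_mq_hctx_has_requests(hctx))
    	msleep(5);

    /* Allocation path: a tag won after the flag is set is handed back,
     * and the caller retries on a hctx that is still online. */
    if (unlikely(test_bit(BLK_MQ_S_INACTIVE, &data->hctx->state))) {
    	blk_mq_put_tag(tags, data->ctx, tag + tag_offset);
    	return BLK_MQ_NO_TAG;
    }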
[1] https://lore.kernel.org/linux-block/alpine.DEB.2.21.1904051331270.1802@nanos.tec.linutronix.de/ [hch: different retry mechanism, merged two patches, minor cleanups] Signed-off-by: Ming Lei Signed-off-by: Christoph Hellwig Reviewed-by: Hannes Reinecke Reviewed-by: Daniel Wagner Signed-off-by: Jens Axboe --- block/blk-mq-debugfs.c | 2 + block/blk-mq-tag.c | 8 ++++ block/blk-mq.c | 112 ++++++++++++++++++++++++++++++++++++++++++++- drivers/block/loop.c | 2 +- drivers/md/dm-rq.c | 2 +- include/linux/blk-mq.h | 10 ++++ include/linux/cpuhotplug.h | 1 + 7 files changed, 133 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/block/blk-mq-debugfs.c b/block/blk-mq-debugfs.c index 96b7a35c898a..15df3a36e9fa 100644 --- a/block/blk-mq-debugfs.c +++ b/block/blk-mq-debugfs.c @@ -213,6 +213,7 @@ static const char *const hctx_state_name[] = { HCTX_STATE_NAME(STOPPED), HCTX_STATE_NAME(TAG_ACTIVE), HCTX_STATE_NAME(SCHED_RESTART), + HCTX_STATE_NAME(INACTIVE), }; #undef HCTX_STATE_NAME @@ -239,6 +240,7 @@ static const char *const hctx_flag_name[] = { HCTX_FLAG_NAME(TAG_SHARED), HCTX_FLAG_NAME(BLOCKING), HCTX_FLAG_NAME(NO_SCHED), + HCTX_FLAG_NAME(STACKING), }; #undef HCTX_FLAG_NAME diff --git a/block/blk-mq-tag.c b/block/blk-mq-tag.c index 762198b62088..96a39d0724a2 100644 --- a/block/blk-mq-tag.c +++ b/block/blk-mq-tag.c @@ -180,6 +180,14 @@ unsigned int blk_mq_get_tag(struct blk_mq_alloc_data *data) sbitmap_finish_wait(bt, ws, &wait); found_tag: + /* + * Give up this allocation if the hctx is inactive. The caller will + * retry on an active hctx. + */ + if (unlikely(test_bit(BLK_MQ_S_INACTIVE, &data->hctx->state))) { + blk_mq_put_tag(tags, data->ctx, tag + tag_offset); + return BLK_MQ_NO_TAG; + } return tag + tag_offset; } diff --git a/block/blk-mq.c b/block/blk-mq.c index 560ef5df8993..9a36ac1c1fa1 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -375,14 +375,30 @@ static struct request *__blk_mq_alloc_request(struct blk_mq_alloc_data *data) e->type->ops.limit_depth(data->cmd_flags, data); } +retry: data->ctx = blk_mq_get_ctx(q); data->hctx = blk_mq_map_queue(q, data->cmd_flags, data->ctx); if (!(data->flags & BLK_MQ_REQ_INTERNAL)) blk_mq_tag_busy(data->hctx); + /* + * Waiting allocations only fail because of an inactive hctx. In that + * case just retry the hctx assignment and tag allocation as CPU hotplug + * should have migrated us to an online CPU by now. + */ tag = blk_mq_get_tag(data); - if (tag == BLK_MQ_NO_TAG) - return NULL; + if (tag == BLK_MQ_NO_TAG) { + if (data->flags & BLK_MQ_REQ_NOWAIT) + return NULL; + + /* + * Give up the CPU and sleep for a short time to ensure that + * threads using a realtime scheduling class are migrated + * off the CPU, and thus off the hctx that is going away. + */ + msleep(3); + goto retry; + } return blk_mq_rq_ctx_init(data, tag, alloc_time_ns); } @@ -2335,6 +2351,86 @@ fail: return -ENOMEM; } +struct rq_iter_data { + struct blk_mq_hw_ctx *hctx; + bool has_rq; +}; + +static bool blk_mq_has_request(struct request *rq, void *data, bool reserved) +{ + struct rq_iter_data *iter_data = data; + + if (rq->mq_hctx != iter_data->hctx) + return true; + iter_data->has_rq = true; + return false; +} + +static bool blk_mq_hctx_has_requests(struct blk_mq_hw_ctx *hctx) +{ + struct blk_mq_tags *tags = hctx->sched_tags ?
+ hctx->sched_tags : hctx->tags; + struct rq_iter_data data = { + .hctx = hctx, + }; + + blk_mq_all_tag_iter(tags, blk_mq_has_request, &data); + return data.has_rq; +} + +static inline bool blk_mq_last_cpu_in_hctx(unsigned int cpu, + struct blk_mq_hw_ctx *hctx) +{ + if (cpumask_next_and(-1, hctx->cpumask, cpu_online_mask) != cpu) + return false; + if (cpumask_next_and(cpu, hctx->cpumask, cpu_online_mask) < nr_cpu_ids) + return false; + return true; +} + +static int blk_mq_hctx_notify_offline(unsigned int cpu, struct hlist_node *node) +{ + struct blk_mq_hw_ctx *hctx = hlist_entry_safe(node, + struct blk_mq_hw_ctx, cpuhp_online); + + if (!cpumask_test_cpu(cpu, hctx->cpumask) || + !blk_mq_last_cpu_in_hctx(cpu, hctx)) + return 0; + + /* + * Prevent new requests from being allocated on the current hctx. + * + * The smp_mb__after_atomic() pairs with the implied barrier in + * test_and_set_bit_lock() in sbitmap_get(). It ensures the inactive flag is + * seen once we return from the tag allocator. + */ + set_bit(BLK_MQ_S_INACTIVE, &hctx->state); + smp_mb__after_atomic(); + + /* + * Try to grab a reference to the queue and wait for any outstanding + * requests. If we could not grab a reference the queue has been + * frozen and there are no requests. + */ + if (percpu_ref_tryget(&hctx->queue->q_usage_counter)) { + while (blk_mq_hctx_has_requests(hctx)) + msleep(5); + percpu_ref_put(&hctx->queue->q_usage_counter); + } + + return 0; +} + +static int blk_mq_hctx_notify_online(unsigned int cpu, struct hlist_node *node) +{ + struct blk_mq_hw_ctx *hctx = hlist_entry_safe(node, + struct blk_mq_hw_ctx, cpuhp_online); + + if (cpumask_test_cpu(cpu, hctx->cpumask)) + clear_bit(BLK_MQ_S_INACTIVE, &hctx->state); + return 0; +} + /* * 'cpu' is going away. splice any existing rq_list entries from this * software queue to the hw queue dispatch list, and ensure that it @@ -2348,6 +2444,9 @@ static int blk_mq_hctx_notify_dead(unsigned int cpu, struct hlist_node *node) enum hctx_type type; hctx = hlist_entry_safe(node, struct blk_mq_hw_ctx, cpuhp_dead); + if (!cpumask_test_cpu(cpu, hctx->cpumask)) + return 0; + ctx = __blk_mq_get_ctx(hctx->queue, cpu); type = hctx->type; @@ -2371,6 +2470,9 @@ static int blk_mq_hctx_notify_dead(unsigned int cpu, struct hlist_node *node) static void blk_mq_remove_cpuhp(struct blk_mq_hw_ctx *hctx) { + if (!(hctx->flags & BLK_MQ_F_STACKING)) + cpuhp_state_remove_instance_nocalls(CPUHP_AP_BLK_MQ_ONLINE, + &hctx->cpuhp_online); cpuhp_state_remove_instance_nocalls(CPUHP_BLK_MQ_DEAD, &hctx->cpuhp_dead); } @@ -2430,6 +2532,9 @@ static int blk_mq_init_hctx(struct request_queue *q, { hctx->queue_num = hctx_idx; + if (!(hctx->flags & BLK_MQ_F_STACKING)) + cpuhp_state_add_instance_nocalls(CPUHP_AP_BLK_MQ_ONLINE, + &hctx->cpuhp_online); cpuhp_state_add_instance_nocalls(CPUHP_BLK_MQ_DEAD, &hctx->cpuhp_dead); hctx->tags = set->tags[hctx_idx]; @@ -3684,6 +3789,9 @@ static int __init blk_mq_init(void) { cpuhp_setup_state_multi(CPUHP_BLK_MQ_DEAD, "block/mq:dead", NULL, blk_mq_hctx_notify_dead); + cpuhp_setup_state_multi(CPUHP_AP_BLK_MQ_ONLINE, "block/mq:online", + blk_mq_hctx_notify_online, + blk_mq_hctx_notify_offline); return 0; } subsys_initcall(blk_mq_init); diff --git a/drivers/block/loop.c b/drivers/block/loop.c index da693e6a834e..d7904b4d8d12 100644 --- a/drivers/block/loop.c +++ b/drivers/block/loop.c @@ -2037,7 +2037,7 @@ static int loop_add(struct loop_device **l, int i) lo->tag_set.queue_depth = 128; lo->tag_set.numa_node = NUMA_NO_NODE; lo->tag_set.cmd_size = sizeof(struct loop_cmd); -
lo->tag_set.flags = BLK_MQ_F_SHOULD_MERGE; + lo->tag_set.flags = BLK_MQ_F_SHOULD_MERGE | BLK_MQ_F_STACKING; lo->tag_set.driver_data = lo; err = blk_mq_alloc_tag_set(&lo->tag_set); diff --git a/drivers/md/dm-rq.c b/drivers/md/dm-rq.c index 3f8577e2c13b..f60c02512121 100644 --- a/drivers/md/dm-rq.c +++ b/drivers/md/dm-rq.c @@ -547,7 +547,7 @@ int dm_mq_init_request_queue(struct mapped_device *md, struct dm_table *t) md->tag_set->ops = &dm_mq_ops; md->tag_set->queue_depth = dm_get_blk_mq_queue_depth(); md->tag_set->numa_node = md->numa_node_id; - md->tag_set->flags = BLK_MQ_F_SHOULD_MERGE; + md->tag_set->flags = BLK_MQ_F_SHOULD_MERGE | BLK_MQ_F_STACKING; md->tag_set->nr_hw_queues = dm_get_blk_mq_nr_hw_queues(); md->tag_set->driver_data = md; diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h index 856bb10993cf..d6fcae17da5a 100644 --- a/include/linux/blk-mq.h +++ b/include/linux/blk-mq.h @@ -140,6 +140,8 @@ struct blk_mq_hw_ctx { */ atomic_t nr_active; + /** @cpuhp_online: hlist node used by the CPU-online hotplug callbacks */ + struct hlist_node cpuhp_online; /** @cpuhp_dead: List to store request if some CPU die. */ struct hlist_node cpuhp_dead; /** @kobj: Kernel object for sysfs. */ @@ -391,6 +393,11 @@ struct blk_mq_ops { enum { BLK_MQ_F_SHOULD_MERGE = 1 << 0, BLK_MQ_F_TAG_SHARED = 1 << 1, + /* + * Set when this device requires an underlying blk-mq device for + * completing I/O: + */ + BLK_MQ_F_STACKING = 1 << 2, BLK_MQ_F_BLOCKING = 1 << 5, BLK_MQ_F_NO_SCHED = 1 << 6, BLK_MQ_F_ALLOC_POLICY_START_BIT = 8, @@ -400,6 +407,9 @@ enum { BLK_MQ_S_TAG_ACTIVE = 1, BLK_MQ_S_SCHED_RESTART = 2, + /* hw queue is inactive after all its CPUs become offline */ + BLK_MQ_S_INACTIVE = 3, + BLK_MQ_MAX_DEPTH = 10240, BLK_MQ_CPU_WORK_BATCH = 8, diff --git a/include/linux/cpuhotplug.h b/include/linux/cpuhotplug.h index 77d70b633531..24b3a77810b6 100644 --- a/include/linux/cpuhotplug.h +++ b/include/linux/cpuhotplug.h @@ -152,6 +152,7 @@ enum cpuhp_state { CPUHP_AP_SMPBOOT_THREADS, CPUHP_AP_X86_VDSO_VMA_ONLINE, CPUHP_AP_IRQ_AFFINITY_ONLINE, + CPUHP_AP_BLK_MQ_ONLINE, CPUHP_AP_ARM_MVEBU_SYNC_CLOCKS, CPUHP_AP_X86_INTEL_EPB_ONLINE, CPUHP_AP_PERF_ONLINE, -- cgit v1.2.3 From 5a892ff2facb4548c17c05931ed899038a0da63e Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Fri, 29 May 2020 14:09:43 +0200 Subject: net: remove kernel_setsockopt No users left. Signed-off-by: Christoph Hellwig Reviewed-by: Marcelo Ricardo Leitner Signed-off-by: David S.
Miller --- include/linux/net.h | 2 -- net/socket.c | 31 ------------------------------- 2 files changed, 33 deletions(-) (limited to 'include/linux') diff --git a/include/linux/net.h b/include/linux/net.h index 74ef5d7315f7..e10f378194a5 100644 --- a/include/linux/net.h +++ b/include/linux/net.h @@ -303,8 +303,6 @@ int kernel_connect(struct socket *sock, struct sockaddr *addr, int addrlen, int flags); int kernel_getsockname(struct socket *sock, struct sockaddr *addr); int kernel_getpeername(struct socket *sock, struct sockaddr *addr); -int kernel_setsockopt(struct socket *sock, int level, int optname, char *optval, - unsigned int optlen); int kernel_sendpage(struct socket *sock, struct page *page, int offset, size_t size, int flags); int kernel_sendpage_locked(struct sock *sk, struct page *page, int offset, diff --git a/net/socket.c b/net/socket.c index 81a98b6cbd08..976426d03f09 100644 --- a/net/socket.c +++ b/net/socket.c @@ -3624,37 +3624,6 @@ int kernel_getpeername(struct socket *sock, struct sockaddr *addr) } EXPORT_SYMBOL(kernel_getpeername); -/** - * kernel_setsockopt - set a socket option (kernel space) - * @sock: socket - * @level: API level (SOL_SOCKET, ...) - * @optname: option tag - * @optval: option value - * @optlen: option length - * - * Returns 0 or an error. - */ - -int kernel_setsockopt(struct socket *sock, int level, int optname, - char *optval, unsigned int optlen) -{ - mm_segment_t oldfs = get_fs(); - char __user *uoptval; - int err; - - uoptval = (char __user __force *) optval; - - set_fs(KERNEL_DS); - if (level == SOL_SOCKET) - err = sock_setsockopt(sock, level, optname, uoptval, optlen); - else - err = sock->ops->setsockopt(sock, level, optname, uoptval, - optlen); - set_fs(oldfs); - return err; -} -EXPORT_SYMBOL(kernel_setsockopt); - /** * kernel_sendpage - send a &page through a socket (kernel space) * @sock: socket -- cgit v1.2.3 From a7868323c2638a7c6c5b30b37831b73cbdf0dc15 Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Fri, 29 May 2020 08:24:10 -0500 Subject: exec: Add a per bprm->file version of per_clear There is a small bug in the code that recomputes parts of bprm->cred for every bprm->file. The code never recomputes the part of clear_dangerous_personality_flags it is responsible for. This means that in practice, if someone creates an sgid script, the interpreter will not be able to use any of: READ_IMPLIES_EXEC ADDR_NO_RANDOMIZE ADDR_COMPAT_LAYOUT MMAP_PAGE_ZERO. This accidental clearing of personality flags probably does not matter in practice because no one has complained, but it does make the code more difficult to understand. Further, remaining bug-compatible prevents the recomputation from being removed and replaced by simply computing bprm->cred once from the final bprm->file. Making this change removes the last behavior difference between computing bprm->creds from the final file and recomputing bprm->cred several times. This allows the behavior change to be justified for its own reasons, and lets any bug hunts looking into why the behavior changed wind up here instead of in the code that will follow, which computes bprm->cred from the final bprm->file. This small logic bug appears to have existed since the code started clearing dangerous personality bits. History Tree: git://git.kernel.org/pub/scm/linux/kernel/git/tglx/history.git Fixes: 1bb0fa189c6a ("[PATCH] NX: clean up legacy binary support") Reviewed-by: Kees Cook Signed-off-by: "Eric W.
Biederman" --- fs/exec.c | 6 ++++-- include/linux/binfmts.h | 5 +++++ include/linux/lsm_hooks.h | 2 ++ security/commoncap.c | 2 +- 4 files changed, 12 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/fs/exec.c b/fs/exec.c index c3c879a55d65..0f793536e393 100644 --- a/fs/exec.c +++ b/fs/exec.c @@ -1354,6 +1354,7 @@ int begin_new_exec(struct linux_binprm * bprm) me->flags &= ~(PF_RANDOMIZE | PF_FORKNOEXEC | PF_KTHREAD | PF_NOFREEZE | PF_NO_SETAFFINITY); flush_thread(); + bprm->per_clear |= bprm->pf_per_clear; me->personality &= ~bprm->per_clear; /* @@ -1628,12 +1629,12 @@ static void bprm_fill_uid(struct linux_binprm *bprm) return; if (mode & S_ISUID) { - bprm->per_clear |= PER_CLEAR_ON_SETID; + bprm->pf_per_clear |= PER_CLEAR_ON_SETID; bprm->cred->euid = uid; } if ((mode & (S_ISGID | S_IXGRP)) == (S_ISGID | S_IXGRP)) { - bprm->per_clear |= PER_CLEAR_ON_SETID; + bprm->pf_per_clear |= PER_CLEAR_ON_SETID; bprm->cred->egid = gid; } } @@ -1654,6 +1655,7 @@ static int prepare_binprm(struct linux_binprm *bprm) /* Recompute parts of bprm->cred based on bprm->file */ bprm->active_secureexec = 0; + bprm->pf_per_clear = 0; bprm_fill_uid(bprm); retval = security_bprm_repopulate_creds(bprm); if (retval) diff --git a/include/linux/binfmts.h b/include/linux/binfmts.h index 7fc05929c967..50025ead0b72 100644 --- a/include/linux/binfmts.h +++ b/include/linux/binfmts.h @@ -55,6 +55,11 @@ struct linux_binprm { struct file * file; struct cred *cred; /* new credentials */ int unsafe; /* how unsafe this exec is (mask of LSM_UNSAFE_*) */ + /* + * bits to clear in current->personality + * recalculated for each bprm->file. + */ + unsigned int pf_per_clear; unsigned int per_clear; /* bits to clear in current->personality */ int argc, envc; const char * filename; /* Name of binary as seen by procps */ diff --git a/include/linux/lsm_hooks.h b/include/linux/lsm_hooks.h index d618ecc4d660..f68076d440f3 100644 --- a/include/linux/lsm_hooks.h +++ b/include/linux/lsm_hooks.h @@ -55,6 +55,8 @@ * transitions between security domains). * The hook must set @bprm->active_secureexec to 1 if AT_SECURE should be set to * request libc enable secure mode. + * The hook must add to @bprm->pf_per_clear any personality flags that + * should be cleared from current->personality. * @bprm contains the linux_binprm structure. * Return 0 if the hook is successful and permission is granted. * @bprm_check_security: diff --git a/security/commoncap.c b/security/commoncap.c index 77b04cb6feac..6de72d22dc6c 100644 --- a/security/commoncap.c +++ b/security/commoncap.c @@ -826,7 +826,7 @@ int cap_bprm_repopulate_creds(struct linux_binprm *bprm) /* if we have fs caps, clear dangerous personality flags */ if (__cap_gained(permitted, new, old)) - bprm->per_clear |= PER_CLEAR_ON_SETID; + bprm->pf_per_clear |= PER_CLEAR_ON_SETID; /* Don't let someone trace a set[ug]id/setpcap binary with the revised * credentials unless they have the appropriate permit. -- cgit v1.2.3 From 56305aa9b6fab91a5555a45796b79c1b0a6353d1 Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Fri, 29 May 2020 22:00:54 -0500 Subject: exec: Compute file based creds only once Move the computation of creds from prepare_binfmt into begin_new_exec so that the creds need only be computed once. This is just code reorganization no semantic changes of any kind are made. Moving the computation is safe. I have looked through the kernel and verified none of the binfmts look at bprm->cred directly, and that there are no helpers that look at bprm->cred indirectly. 
This means that it is not a problem to compute bprm->cred later in the execution flow, as it is not used until it becomes current->cred. A new function bprm_creds_from_file is added to contain the work that needs to be done. bprm_creds_from_file first computes which file (bprm->executable or, most likely, bprm->file) the bprm->creds will be computed from. The function bprm_fill_uid is updated to receive the file instead of accessing bprm->file. The now unnecessary work needed to reset bprm->cred->euid and bprm->cred->egid is removed from bprm_fill_uid. A small comment has been added to document that bprm_fill_uid now only deals with handling suid and sgid files; the default case is already handled by prepare_exec_creds. The function security_bprm_repopulate_creds is renamed security_bprm_creds_from_file and is now explicitly passed the file from which to compute the creds. The documentation of the bprm_creds_from_file security hook is updated to explain when the hook is called and what it needs to do. The file is passed from cap_bprm_creds_from_file into get_file_caps so that the caps are computed for the appropriate file. The now unnecessary work in cap_bprm_creds_from_file to reset the ambient capabilities has been removed. A small comment has been added to document that the work of cap_bprm_creds_from_file is to read capabilities from the file's security attribute and derive capabilities from the fact that the user had uid 0. Reviewed-by: Kees Cook Signed-off-by: "Eric W. Biederman" --- fs/binfmt_misc.c | 2 +- fs/exec.c | 63 ++++++++++++++++++------------------- include/linux/binfmts.h | 14 ++-------- include/linux/lsm_hook_defs.h | 2 +- include/linux/lsm_hooks.h | 22 +++++++-------- include/linux/security.h | 9 ++++--- security/commoncap.c | 24 +++++++++-------- security/security.c | 4 +-- 8 files changed, 61 insertions(+), 79 deletions(-) (limited to 'include/linux') diff --git a/fs/binfmt_misc.c b/fs/binfmt_misc.c index 53968ea07b57..bc5506619b7e 100644 --- a/fs/binfmt_misc.c +++ b/fs/binfmt_misc.c @@ -192,7 +192,7 @@ static int load_misc_binary(struct linux_binprm *bprm) bprm->interpreter = interp_file; if (fmt->flags & MISC_FMT_CREDENTIALS) - bprm->preserve_creds = 1; + bprm->execfd_creds = 1; retval = 0; ret: diff --git a/fs/exec.c b/fs/exec.c index 0f793536e393..e8599236290d 100644 --- a/fs/exec.c +++ b/fs/exec.c @@ -72,6 +72,8 @@ #include +static int bprm_creds_from_file(struct linux_binprm *bprm); + int suid_dumpable = 0; static LIST_HEAD(formats); @@ -1304,6 +1306,11 @@ int begin_new_exec(struct linux_binprm * bprm) struct task_struct *me = current; int retval; + /* Once we are committed compute the creds */ + retval = bprm_creds_from_file(bprm); + if (retval) + return retval; + /* * Ensure all future errors are fatal. */ @@ -1354,7 +1361,6 @@ int begin_new_exec(struct linux_binprm * bprm) me->flags &= ~(PF_RANDOMIZE | PF_FORKNOEXEC | PF_KTHREAD | PF_NOFREEZE | PF_NO_SETAFFINITY); flush_thread(); - bprm->per_clear |= bprm->pf_per_clear; me->personality &= ~bprm->per_clear; /* @@ -1365,13 +1371,6 @@ int begin_new_exec(struct linux_binprm * bprm) */ do_close_on_exec(me->files); - /* - * Once here, prepare_binrpm() will not be called any more, so - * the final state of setuid/setgid/fscaps can be merged into the - * secureexec flag. - */ - bprm->secureexec |= bprm->active_secureexec; - if (bprm->secureexec) { /* Make sure parent cannot signal privileged process.
*/ me->pdeath_signal = 0; @@ -1587,29 +1586,21 @@ static void check_unsafe_exec(struct linux_binprm *bprm) spin_unlock(&p->fs->lock); } -static void bprm_fill_uid(struct linux_binprm *bprm) +static void bprm_fill_uid(struct linux_binprm *bprm, struct file *file) { + /* Handle suid and sgid on files */ struct inode *inode; unsigned int mode; kuid_t uid; kgid_t gid; - /* - * Since this can be called multiple times (via prepare_binprm), - * we must clear any previous work done when setting set[ug]id - * bits from any earlier bprm->file uses (for example when run - * first for a setuid script then again for its interpreter). - */ - bprm->cred->euid = current_euid(); - bprm->cred->egid = current_egid(); - - if (!mnt_may_suid(bprm->file->f_path.mnt)) + if (!mnt_may_suid(file->f_path.mnt)) return; if (task_no_new_privs(current)) return; - inode = bprm->file->f_path.dentry->d_inode; + inode = file->f_path.dentry->d_inode; mode = READ_ONCE(inode->i_mode); if (!(mode & (S_ISUID|S_ISGID))) return; @@ -1629,19 +1620,31 @@ static void bprm_fill_uid(struct linux_binprm *bprm) return; if (mode & S_ISUID) { - bprm->pf_per_clear |= PER_CLEAR_ON_SETID; + bprm->per_clear |= PER_CLEAR_ON_SETID; bprm->cred->euid = uid; } if ((mode & (S_ISGID | S_IXGRP)) == (S_ISGID | S_IXGRP)) { - bprm->pf_per_clear |= PER_CLEAR_ON_SETID; + bprm->per_clear |= PER_CLEAR_ON_SETID; bprm->cred->egid = gid; } } +/* + * Compute bprm->cred based upon the final binary. + */ +static int bprm_creds_from_file(struct linux_binprm *bprm) +{ + /* Compute creds based on which file? */ + struct file *file = bprm->execfd_creds ? bprm->executable : bprm->file; + + bprm_fill_uid(bprm, file); + return security_bprm_creds_from_file(bprm, file); +} + /* * Fill the binprm structure from the inode. - * Check permissions, then read the first BINPRM_BUF_SIZE bytes + * Read the first BINPRM_BUF_SIZE bytes * * This may be called multiple times for binary chains (scripts for example). */ @@ -1649,20 +1652,6 @@ static int prepare_binprm(struct linux_binprm *bprm) { loff_t pos = 0; - /* Can the interpreter get to the executable without races? */ - if (!bprm->preserve_creds) { - int retval; - - /* Recompute parts of bprm->cred based on bprm->file */ - bprm->active_secureexec = 0; - bprm->pf_per_clear = 0; - bprm_fill_uid(bprm); - retval = security_bprm_repopulate_creds(bprm); - if (retval) - return retval; - } - bprm->preserve_creds = 0; - memset(bprm->buf, 0, BINPRM_BUF_SIZE); return kernel_read(bprm->file, bprm->buf, BINPRM_BUF_SIZE, &pos); } diff --git a/include/linux/binfmts.h b/include/linux/binfmts.h index 50025ead0b72..aece1b340e7d 100644 --- a/include/linux/binfmts.h +++ b/include/linux/binfmts.h @@ -29,13 +29,8 @@ struct linux_binprm { /* Should an execfd be passed to userspace? */ have_execfd:1, - /* It is safe to use the creds of a script (see binfmt_misc) */ - preserve_creds:1, - /* - * True if most recent call to security_bprm_set_creds - * resulted in elevated privileges. - */ - active_secureexec:1, + /* Use the creds of a script (see binfmt_misc) */ + execfd_creds:1, /* * Set by bprm_creds_for_exec hook to indicate a * privilege-gaining exec has happened. Used to set @@ -55,11 +50,6 @@ struct linux_binprm { struct file * file; struct cred *cred; /* new credentials */ int unsafe; /* how unsafe this exec is (mask of LSM_UNSAFE_*) */ - /* - * bits to clear in current->personality - * recalculated for each bprm->file.
- */ - unsigned int pf_per_clear; unsigned int per_clear; /* bits to clear in current->personality */ int argc, envc; const char * filename; /* Name of binary as seen by procps */ diff --git a/include/linux/lsm_hook_defs.h b/include/linux/lsm_hook_defs.h index 1e295ba12c0d..adbc6603abba 100644 --- a/include/linux/lsm_hook_defs.h +++ b/include/linux/lsm_hook_defs.h @@ -50,7 +50,7 @@ LSM_HOOK(int, 0, settime, const struct timespec64 *ts, const struct timezone *tz) LSM_HOOK(int, 0, vm_enough_memory, struct mm_struct *mm, long pages) LSM_HOOK(int, 0, bprm_creds_for_exec, struct linux_binprm *bprm) -LSM_HOOK(int, 0, bprm_repopulate_creds, struct linux_binprm *bprm) +LSM_HOOK(int, 0, bprm_creds_from_file, struct linux_binprm *bprm, struct file *file) LSM_HOOK(int, 0, bprm_check_security, struct linux_binprm *bprm) LSM_HOOK(void, LSM_RET_VOID, bprm_committing_creds, struct linux_binprm *bprm) LSM_HOOK(void, LSM_RET_VOID, bprm_committed_creds, struct linux_binprm *bprm) diff --git a/include/linux/lsm_hooks.h b/include/linux/lsm_hooks.h index f68076d440f3..c523c18efa0e 100644 --- a/include/linux/lsm_hooks.h +++ b/include/linux/lsm_hooks.h @@ -44,18 +44,18 @@ * request libc enable secure mode. * @bprm contains the linux_binprm structure. * Return 0 if the hook is successful and permission is granted. - * @bprm_repopulate_creds: - * Assuming that the relevant bits of @bprm->cred->security have been - * previously set, examine @bprm->file and regenerate them. This is - * so that the credentials derived from the interpreter the code is - * actually going to run are used rather than credentials derived - * from a script. This done because the interpreter binary needs to - * reopen script, and may end up opening something completely different. - * This hook may also optionally check permissions (e.g. for - * transitions between security domains). - * The hook must set @bprm->active_secureexec to 1 if AT_SECURE should be set to + * @bprm_creds_from_file: + * If @file is setpcap, suid, sgid or otherwise marked to change + * privilege upon exec, update @bprm->cred to reflect that change. + * This is called after finding the binary that will be executed + * without an interpreter. This ensures that the credentials will not + * be derived from a script that the binary will need to reopen, which + * when reopened may end up being a completely different file. This + * hook may also optionally check permissions (e.g. for transitions + * between security domains). + * The hook must set @bprm->secureexec to 1 if AT_SECURE should be set to * request libc enable secure mode. - * The hook must add to @bprm->pf_per_clear any personality flags that + * The hook must add to @bprm->per_clear any personality flags that * should be cleared from current->personality. * @bprm contains the linux_binprm structure. * Return 0 if the hook is successful and permission is granted.
diff --git a/include/linux/security.h b/include/linux/security.h index 6dcec9375e8f..8444fae7c5b9 100644 --- a/include/linux/security.h +++ b/include/linux/security.h @@ -140,7 +140,7 @@ extern int cap_capset(struct cred *new, const struct cred *old, const kernel_cap_t *effective, const kernel_cap_t *inheritable, const kernel_cap_t *permitted); -extern int cap_bprm_repopulate_creds(struct linux_binprm *bprm); +extern int cap_bprm_creds_from_file(struct linux_binprm *bprm, struct file *file); extern int cap_inode_setxattr(struct dentry *dentry, const char *name, const void *value, size_t size, int flags); extern int cap_inode_removexattr(struct dentry *dentry, const char *name); @@ -277,7 +277,7 @@ int security_syslog(int type); int security_settime64(const struct timespec64 *ts, const struct timezone *tz); int security_vm_enough_memory_mm(struct mm_struct *mm, long pages); int security_bprm_creds_for_exec(struct linux_binprm *bprm); -int security_bprm_repopulate_creds(struct linux_binprm *bprm); +int security_bprm_creds_from_file(struct linux_binprm *bprm, struct file *file); int security_bprm_check(struct linux_binprm *bprm); void security_bprm_committing_creds(struct linux_binprm *bprm); void security_bprm_committed_creds(struct linux_binprm *bprm); @@ -575,9 +575,10 @@ static inline int security_bprm_creds_for_exec(struct linux_binprm *bprm) return 0; } -static inline int security_bprm_repopulate_creds(struct linux_binprm *bprm) +static inline int security_bprm_creds_from_file(struct linux_binprm *bprm, + struct file *file) { - return cap_bprm_repopulate_creds(bprm); + return cap_bprm_creds_from_file(bprm, file); } static inline int security_bprm_check(struct linux_binprm *bprm) diff --git a/security/commoncap.c b/security/commoncap.c index 6de72d22dc6c..59bf3c1674c8 100644 --- a/security/commoncap.c +++ b/security/commoncap.c @@ -647,7 +647,8 @@ int get_vfs_caps_from_disk(const struct dentry *dentry, struct cpu_vfs_cap_data * its xattrs and, if present, apply them to the proposed credentials being * constructed by execve(). */ -static int get_file_caps(struct linux_binprm *bprm, bool *effective, bool *has_fcap) +static int get_file_caps(struct linux_binprm *bprm, struct file *file, + bool *effective, bool *has_fcap) { int rc = 0; struct cpu_vfs_cap_data vcaps; @@ -657,7 +658,7 @@ static int get_file_caps(struct linux_binprm *bprm, bool *effective, bool *has_f if (!file_caps_enabled) return 0; - if (!mnt_may_suid(bprm->file->f_path.mnt)) + if (!mnt_may_suid(file->f_path.mnt)) return 0; /* @@ -665,10 +666,10 @@ static int get_file_caps(struct linux_binprm *bprm, bool *effective, bool *has_f * explicit that capability bits are limited to s_user_ns and its * descendants. */ - if (!current_in_userns(bprm->file->f_path.mnt->mnt_sb->s_user_ns)) + if (!current_in_userns(file->f_path.mnt->mnt_sb->s_user_ns)) return 0; - rc = get_vfs_caps_from_disk(bprm->file->f_path.dentry, &vcaps); + rc = get_vfs_caps_from_disk(file->f_path.dentry, &vcaps); if (rc < 0) { if (rc == -EINVAL) printk(KERN_NOTICE "Invalid argument reading file caps for %s\n", @@ -797,26 +798,27 @@ static inline bool nonroot_raised_pE(struct cred *new, const struct cred *old, } /** - * cap_bprm_repopulate_creds - Set up the proposed credentials for execve(). + * cap_bprm_creds_from_file - Set up the proposed credentials for execve(). 
* @bprm: The execution parameters, including the proposed creds + * @file: The file to pull the credentials from * * Set up the proposed credentials for a new execution context being * constructed by execve(). The proposed creds in @bprm->cred is altered, * which won't take effect immediately. Returns 0 if successful, -ve on error. */ -int cap_bprm_repopulate_creds(struct linux_binprm *bprm) +int cap_bprm_creds_from_file(struct linux_binprm *bprm, struct file *file) { + /* Process setpcap binaries and capabilities for uid 0 */ const struct cred *old = current_cred(); struct cred *new = bprm->cred; bool effective = false, has_fcap = false, is_setid; int ret; kuid_t root_uid; - new->cap_ambient = old->cap_ambient; if (WARN_ON(!cap_ambient_invariant_ok(old))) return -EPERM; - ret = get_file_caps(bprm, &effective, &has_fcap); + ret = get_file_caps(bprm, file, &effective, &has_fcap); if (ret < 0) return ret; @@ -826,7 +828,7 @@ int cap_bprm_repopulate_creds(struct linux_binprm *bprm) /* if we have fs caps, clear dangerous personality flags */ if (__cap_gained(permitted, new, old)) - bprm->pf_per_clear |= PER_CLEAR_ON_SETID; + bprm->per_clear |= PER_CLEAR_ON_SETID; /* Don't let someone trace a set[ug]id/setpcap binary with the revised * credentials unless they have the appropriate permit. @@ -889,7 +891,7 @@ int cap_bprm_repopulate_creds(struct linux_binprm *bprm) (!__is_real(root_uid, new) && (effective || __cap_grew(permitted, ambient, new)))) - bprm->active_secureexec = 1; + bprm->secureexec = 1; return 0; } @@ -1346,7 +1348,7 @@ static struct security_hook_list capability_hooks[] __lsm_ro_after_init = { LSM_HOOK_INIT(ptrace_traceme, cap_ptrace_traceme), LSM_HOOK_INIT(capget, cap_capget), LSM_HOOK_INIT(capset, cap_capset), - LSM_HOOK_INIT(bprm_repopulate_creds, cap_bprm_repopulate_creds), + LSM_HOOK_INIT(bprm_creds_from_file, cap_bprm_creds_from_file), LSM_HOOK_INIT(inode_need_killpriv, cap_inode_need_killpriv), LSM_HOOK_INIT(inode_killpriv, cap_inode_killpriv), LSM_HOOK_INIT(inode_getsecurity, cap_inode_getsecurity), diff --git a/security/security.c b/security/security.c index b890b7e2a765..259b8e750aa2 100644 --- a/security/security.c +++ b/security/security.c @@ -828,9 +828,9 @@ int security_bprm_creds_for_exec(struct linux_binprm *bprm) return call_int_hook(bprm_creds_for_exec, 0, bprm); } -int security_bprm_repopulate_creds(struct linux_binprm *bprm) +int security_bprm_creds_from_file(struct linux_binprm *bprm, struct file *file) { - return call_int_hook(bprm_repopulate_creds, 0, bprm); + return call_int_hook(bprm_creds_from_file, 0, bprm, file); } int security_bprm_check(struct linux_binprm *bprm) -- cgit v1.2.3 From 2553f421f44f4db7579f202b79b69046b579c7b5 Mon Sep 17 00:00:00 2001 From: Saeed Mahameed Date: Wed, 27 May 2020 23:16:02 -0700 Subject: net/mlx5: cmd: Fix memset with byte count warning Fix sparse warning: drivers/net/ethernet/mellanox/mlx5/core/cmd.c:1949:15: warning: memset with byte count of 271720 mlx5_cmd_stats array is too big to be held inline in mlx5_cmd. Allocate it separately. 
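As a side note, the change follows the common pattern of converting an oversized inline array into a separately allocated one; a condensed sketch of the pattern (illustrative, not the literal patch):

	/* Before: the huge array inflates every containing struct. */
	struct mlx5_cmd_stats stats[MLX5_CMD_OP_MAX];

	/* After: a pointer, allocated once during init ... */
	struct mlx5_cmd_stats *stats;

	cmd->stats = kvzalloc(MLX5_CMD_OP_MAX * sizeof(*cmd->stats),
			      GFP_KERNEL);
	if (!cmd->stats)
		return -ENOMEM;

	/* ... and freed on cleanup and on error paths. */
	kvfree(cmd->stats);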
Signed-off-by: Saeed Mahameed --- drivers/net/ethernet/mellanox/mlx5/core/cmd.c | 20 ++++++++++++++------ drivers/net/ethernet/mellanox/mlx5/core/debugfs.c | 2 +- include/linux/mlx5/driver.h | 2 +- 3 files changed, 16 insertions(+), 8 deletions(-) (limited to 'include/linux') diff --git a/drivers/net/ethernet/mellanox/mlx5/core/cmd.c b/drivers/net/ethernet/mellanox/mlx5/core/cmd.c index eca159e8e123..1d91a0d0ab1d 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/cmd.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/cmd.c @@ -1072,7 +1072,7 @@ static int mlx5_cmd_invoke(struct mlx5_core_dev *dev, struct mlx5_cmd_msg *in, ds = ent->ts2 - ent->ts1; op = MLX5_GET(mbox_in, in->first.data, opcode); - if (op < ARRAY_SIZE(cmd->stats)) { + if (op < MLX5_CMD_OP_MAX) { stats = &cmd->stats[op]; spin_lock_irq(&stats->lock); stats->sum += ds; @@ -1551,7 +1551,7 @@ static void mlx5_cmd_comp_handler(struct mlx5_core_dev *dev, u64 vec, bool force if (ent->callback) { ds = ent->ts2 - ent->ts1; - if (ent->op < ARRAY_SIZE(cmd->stats)) { + if (ent->op < MLX5_CMD_OP_MAX) { stats = &cmd->stats[ent->op]; spin_lock_irqsave(&stats->lock, flags); stats->sum += ds; @@ -1960,10 +1960,16 @@ int mlx5_cmd_init(struct mlx5_core_dev *dev) return -EINVAL; } - cmd->pool = dma_pool_create("mlx5_cmd", dev->device, size, align, 0); - if (!cmd->pool) + cmd->stats = kvzalloc(MLX5_CMD_OP_MAX * sizeof(*cmd->stats), GFP_KERNEL); + if (!cmd->stats) return -ENOMEM; + cmd->pool = dma_pool_create("mlx5_cmd", dev->device, size, align, 0); + if (!cmd->pool) { + err = -ENOMEM; + goto dma_pool_err; + } + err = alloc_cmd_page(dev, cmd); if (err) goto err_free_pool; @@ -1999,7 +2005,7 @@ int mlx5_cmd_init(struct mlx5_core_dev *dev) spin_lock_init(&cmd->alloc_lock); spin_lock_init(&cmd->token_lock); - for (i = 0; i < ARRAY_SIZE(cmd->stats); i++) + for (i = 0; i < MLX5_CMD_OP_MAX; i++) spin_lock_init(&cmd->stats[i].lock); sema_init(&cmd->sem, cmd->max_reg_cmds); @@ -2046,7 +2052,8 @@ err_free_page: err_free_pool: dma_pool_destroy(cmd->pool); - +dma_pool_err: + kvfree(cmd->stats); return err; } EXPORT_SYMBOL(mlx5_cmd_init); @@ -2060,6 +2067,7 @@ void mlx5_cmd_cleanup(struct mlx5_core_dev *dev) destroy_msg_cache(dev); free_cmd_page(dev, cmd); dma_pool_destroy(cmd->pool); + kvfree(cmd->stats); } EXPORT_SYMBOL(mlx5_cmd_cleanup); diff --git a/drivers/net/ethernet/mellanox/mlx5/core/debugfs.c b/drivers/net/ethernet/mellanox/mlx5/core/debugfs.c index d2d57213511b..07c8d9811bc8 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/debugfs.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/debugfs.c @@ -171,7 +171,7 @@ void mlx5_cmdif_debugfs_init(struct mlx5_core_dev *dev) cmd = &dev->priv.cmdif_debugfs; *cmd = debugfs_create_dir("commands", dev->priv.dbg_root); - for (i = 0; i < ARRAY_SIZE(dev->cmd.stats); i++) { + for (i = 0; i < MLX5_CMD_OP_MAX; i++) { stats = &dev->cmd.stats[i]; namep = mlx5_command_str(i); if (strcmp(namep, "unknown command opcode")) { diff --git a/include/linux/mlx5/driver.h b/include/linux/mlx5/driver.h index 6aa6bbd60559..13c0e4556eda 100644 --- a/include/linux/mlx5/driver.h +++ b/include/linux/mlx5/driver.h @@ -298,7 +298,7 @@ struct mlx5_cmd { struct mlx5_cmd_debug dbg; struct cmd_msg_cache cache[MLX5_NUM_COMMAND_CACHES]; int checksum_disabled; - struct mlx5_cmd_stats stats[MLX5_CMD_OP_MAX]; + struct mlx5_cmd_stats *stats; }; struct mlx5_port_caps { -- cgit v1.2.3 From 44345c4c130ee3df9b9fbc366d59ab3ac707d7f8 Mon Sep 17 00:00:00 2001 From: Saeed Mahameed Date: Fri, 29 May 2020 00:47:12 -0700 Subject: net/mlx5: IPSec: Fix 
incorrect type for spi spi is __be32, fix that. Fixes sparse warning: drivers/net/ethernet/mellanox/mlx5/core/accel/ipsec.c:74:64 warning: incorrect type Signed-off-by: Saeed Mahameed --- include/linux/mlx5/accel.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/mlx5/accel.h b/include/linux/mlx5/accel.h index b919d143a9a6..96ebaa94a92e 100644 --- a/include/linux/mlx5/accel.h +++ b/include/linux/mlx5/accel.h @@ -76,7 +76,7 @@ struct aes_gcm_keymat { struct mlx5_accel_esp_xfrm_attrs { enum mlx5_accel_esp_action action; u32 esn; - u32 spi; + __be32 spi; u32 seq; u32 tfc_pad; u32 flags; -- cgit v1.2.3 From 563ca40ddf400dbf8c6254077f9b6887101d0f08 Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Fri, 8 May 2020 09:16:02 -0700 Subject: pstore/platform: Switch pstore_info::name to const In order to more cleanly pass around backend names, make the "name" member const. This means the module param needs to be dynamic (technically, it was before, so this actually cleans up a minor memory leak if a backend was specified and then gets unloaded.) Link: https://lore.kernel.org/lkml/20200510202436.63222-3-keescook@chromium.org/ Signed-off-by: Kees Cook --- fs/pstore/platform.c | 3 ++- include/linux/pstore.h | 2 +- 2 files changed, 3 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/fs/pstore/platform.c b/fs/pstore/platform.c index 8beaeff72386..715396bef0ea 100644 --- a/fs/pstore/platform.c +++ b/fs/pstore/platform.c @@ -624,7 +624,7 @@ int pstore_register(struct pstore_info *psi) * Update the module parameter backend, so it is visible * through /sys/module/pstore/parameters/backend */ - backend = psi->name; + backend = kstrdup(psi->name, GFP_KERNEL); pr_info("Registered %s as persistent store backend\n", psi->name); @@ -667,6 +667,7 @@ void pstore_unregister(struct pstore_info *psi) free_buf_for_compression(); psinfo = NULL; + kfree(backend); backend = NULL; mutex_unlock(&psinfo_lock); } diff --git a/include/linux/pstore.h b/include/linux/pstore.h index e779441e6d26..f6f22b13e04f 100644 --- a/include/linux/pstore.h +++ b/include/linux/pstore.h @@ -170,7 +170,7 @@ struct pstore_record { */ struct pstore_info { struct module *owner; - char *name; + const char *name; struct semaphore buf_lock; char *buf; -- cgit v1.2.3 From 6d3cf962dd1a95df868c547b090bfc4c7977f4be Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Fri, 15 May 2020 11:05:43 -0700 Subject: printk: Collapse shutdown types into a single dump reason To turn the KMSG_DUMP_* reasons into a more ordered list, collapse the redundant KMSG_DUMP_(RESTART|HALT|POWEROFF) reasons into KMSG_DUMP_SHUTDOWN. 
The current users already don't meaningfully distinguish between them, so there's no need to, as discussed here: https://lore.kernel.org/lkml/CA+CK2bAPv5u1ih5y9t5FUnTyximtFCtDYXJCpuyjOyHNOkRdqw@mail.gmail.com/ Link: https://lore.kernel.org/lkml/20200515184434.8470-2-keescook@chromium.org/ Reviewed-by: Pavel Tatashin Reviewed-by: Petr Mladek Acked-by: Michael Ellerman (powerpc) Signed-off-by: Kees Cook --- arch/powerpc/kernel/nvram_64.c | 4 +--- fs/pstore/platform.c | 8 ++------ include/linux/kmsg_dump.h | 4 +--- kernel/reboot.c | 6 +++--- 4 files changed, 7 insertions(+), 15 deletions(-) (limited to 'include/linux') diff --git a/arch/powerpc/kernel/nvram_64.c b/arch/powerpc/kernel/nvram_64.c index fb4f61096613..0cd1c88bfc8b 100644 --- a/arch/powerpc/kernel/nvram_64.c +++ b/arch/powerpc/kernel/nvram_64.c @@ -655,9 +655,7 @@ static void oops_to_nvram(struct kmsg_dumper *dumper, int rc = -1; switch (reason) { - case KMSG_DUMP_RESTART: - case KMSG_DUMP_HALT: - case KMSG_DUMP_POWEROFF: + case KMSG_DUMP_SHUTDOWN: /* These are almost always orderly shutdowns. */ return; case KMSG_DUMP_OOPS: diff --git a/fs/pstore/platform.c b/fs/pstore/platform.c index 072440457c08..90d74ebaa70a 100644 --- a/fs/pstore/platform.c +++ b/fs/pstore/platform.c @@ -144,12 +144,8 @@ static const char *get_reason_str(enum kmsg_dump_reason reason) return "Oops"; case KMSG_DUMP_EMERG: return "Emergency"; - case KMSG_DUMP_RESTART: - return "Restart"; - case KMSG_DUMP_HALT: - return "Halt"; - case KMSG_DUMP_POWEROFF: - return "Poweroff"; + case KMSG_DUMP_SHUTDOWN: + return "Shutdown"; default: return "Unknown"; } diff --git a/include/linux/kmsg_dump.h b/include/linux/kmsg_dump.h index 2e7a1e032c71..3f82b5cb2d82 100644 --- a/include/linux/kmsg_dump.h +++ b/include/linux/kmsg_dump.h @@ -25,9 +25,7 @@ enum kmsg_dump_reason { KMSG_DUMP_PANIC, KMSG_DUMP_OOPS, KMSG_DUMP_EMERG, - KMSG_DUMP_RESTART, - KMSG_DUMP_HALT, - KMSG_DUMP_POWEROFF, + KMSG_DUMP_SHUTDOWN, }; /** diff --git a/kernel/reboot.c b/kernel/reboot.c index c4d472b7f1b4..491f1347bf43 100644 --- a/kernel/reboot.c +++ b/kernel/reboot.c @@ -250,7 +250,7 @@ void kernel_restart(char *cmd) pr_emerg("Restarting system\n"); else pr_emerg("Restarting system with command '%s'\n", cmd); - kmsg_dump(KMSG_DUMP_RESTART); + kmsg_dump(KMSG_DUMP_SHUTDOWN); machine_restart(cmd); } EXPORT_SYMBOL_GPL(kernel_restart); @@ -274,7 +274,7 @@ void kernel_halt(void) migrate_to_reboot_cpu(); syscore_shutdown(); pr_emerg("System halted\n"); - kmsg_dump(KMSG_DUMP_HALT); + kmsg_dump(KMSG_DUMP_SHUTDOWN); machine_halt(); } EXPORT_SYMBOL_GPL(kernel_halt); @@ -292,7 +292,7 @@ void kernel_power_off(void) migrate_to_reboot_cpu(); syscore_shutdown(); pr_emerg("Power down\n"); - kmsg_dump(KMSG_DUMP_POWEROFF); + kmsg_dump(KMSG_DUMP_SHUTDOWN); machine_power_off(); } EXPORT_SYMBOL_GPL(kernel_power_off); -- cgit v1.2.3 From b1f6f161b236d0e5a9222fb8b482e65aaff13689 Mon Sep 17 00:00:00 2001 From: Pavel Tatashin Date: Tue, 5 May 2020 11:45:06 -0400 Subject: printk: honor the max_reason field in kmsg_dumper kmsg_dump() allows dumping the kmsg buffer for various system events: oops, panic, reboot, etc. It provides an interface for clients to register a callback, and that callback interface has a field "max_reason", but it was getting ignored when set to any "reason" higher than KMSG_DUMP_OOPS unless "always_kmsg_dump" was passed as a kernel parameter. Allow clients to actually control their "max_reason", and keep the current behavior when "max_reason" is not set.
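As a sketch of what a client can now do (the my_* names are illustrative, not from this patch):

	/*
	 * A dumper that is also invoked for clean shutdowns, not just
	 * oops/panic, now that max_reason is honored.
	 */
	static void my_dump(struct kmsg_dumper *dumper,
			    enum kmsg_dump_reason reason)
	{
		/* pull the log tail here and persist it somewhere */
	}

	static struct kmsg_dumper my_dumper = {
		.dump       = my_dump,
		.max_reason = KMSG_DUMP_SHUTDOWN,
	};

	kmsg_dump_register(&my_dumper);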
Signed-off-by: Pavel Tatashin Link: https://lore.kernel.org/lkml/20200515184434.8470-3-keescook@chromium.org/ Reviewed-by: Petr Mladek Signed-off-by: Kees Cook --- include/linux/kmsg_dump.h | 1 + kernel/printk/printk.c | 15 +++++++++++---- 2 files changed, 12 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/kmsg_dump.h b/include/linux/kmsg_dump.h index 3f82b5cb2d82..9826014771ab 100644 --- a/include/linux/kmsg_dump.h +++ b/include/linux/kmsg_dump.h @@ -26,6 +26,7 @@ enum kmsg_dump_reason { KMSG_DUMP_OOPS, KMSG_DUMP_EMERG, KMSG_DUMP_SHUTDOWN, + KMSG_DUMP_MAX }; /** diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c index 9a9b6156270b..a121c2255737 100644 --- a/kernel/printk/printk.c +++ b/kernel/printk/printk.c @@ -3157,12 +3157,19 @@ void kmsg_dump(enum kmsg_dump_reason reason) struct kmsg_dumper *dumper; unsigned long flags; - if ((reason > KMSG_DUMP_OOPS) && !always_kmsg_dump) - return; - rcu_read_lock(); list_for_each_entry_rcu(dumper, &dump_list, list) { - if (dumper->max_reason && reason > dumper->max_reason) + enum kmsg_dump_reason max_reason = dumper->max_reason; + + /* + * If client has not provided a specific max_reason, default + * to KMSG_DUMP_OOPS, unless always_kmsg_dump was set. + */ + if (max_reason == KMSG_DUMP_UNDEF) { + max_reason = always_kmsg_dump ? KMSG_DUMP_MAX : + KMSG_DUMP_OOPS; + } + if (reason > max_reason) continue; /* initialize iterator with data about the stored records */ -- cgit v1.2.3 From fb13cb8a0482105a415e24042209d02a684255e2 Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Thu, 7 May 2020 19:36:22 -0700 Subject: printk: Introduce kmsg_dump_reason_str() The pstore subsystem already had a private version of this function. With the coming addition of the pstore/zone driver, this needs to be shared. As it really should live with printk, move it there instead. Link: https://lore.kernel.org/lkml/20200515184434.8470-4-keescook@chromium.org/ Acked-by: Petr Mladek Acked-by: Sergey Senozhatsky Reviewed-by: Pavel Tatashin Signed-off-by: Kees Cook --- fs/pstore/platform.c | 18 +----------------- include/linux/kmsg_dump.h | 7 +++++++ kernel/printk/printk.c | 17 +++++++++++++++++ 3 files changed, 25 insertions(+), 17 deletions(-) (limited to 'include/linux') diff --git a/fs/pstore/platform.c b/fs/pstore/platform.c index 90d74ebaa70a..5e6c6022deb9 100644 --- a/fs/pstore/platform.c +++ b/fs/pstore/platform.c @@ -135,22 +135,6 @@ enum pstore_type_id pstore_name_to_type(const char *name) } EXPORT_SYMBOL_GPL(pstore_name_to_type); -static const char *get_reason_str(enum kmsg_dump_reason reason) -{ - switch (reason) { - case KMSG_DUMP_PANIC: - return "Panic"; - case KMSG_DUMP_OOPS: - return "Oops"; - case KMSG_DUMP_EMERG: - return "Emergency"; - case KMSG_DUMP_SHUTDOWN: - return "Shutdown"; - default: - return "Unknown"; - } -} - static void pstore_timer_kick(void) { if (pstore_update_ms < 0) @@ -403,7 +387,7 @@ static void pstore_dump(struct kmsg_dumper *dumper, unsigned int part = 1; int ret; - why = get_reason_str(reason); + why = kmsg_dump_reason_str(reason); if (down_trylock(&psinfo->buf_lock)) { /* Failed to acquire lock: give up if we cannot wait. 
*/ diff --git a/include/linux/kmsg_dump.h b/include/linux/kmsg_dump.h index 9826014771ab..3378bcbe585e 100644 --- a/include/linux/kmsg_dump.h +++ b/include/linux/kmsg_dump.h @@ -70,6 +70,8 @@ void kmsg_dump_rewind(struct kmsg_dumper *dumper); int kmsg_dump_register(struct kmsg_dumper *dumper); int kmsg_dump_unregister(struct kmsg_dumper *dumper); + +const char *kmsg_dump_reason_str(enum kmsg_dump_reason reason); #else static inline void kmsg_dump(enum kmsg_dump_reason reason) { @@ -111,6 +113,11 @@ static inline int kmsg_dump_unregister(struct kmsg_dumper *dumper) { return -EINVAL; } + +static inline const char *kmsg_dump_reason_str(enum kmsg_dump_reason reason) +{ + return "Disabled"; +} #endif #endif /* _LINUX_KMSG_DUMP_H */ diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c index a121c2255737..14ca4d05d902 100644 --- a/kernel/printk/printk.c +++ b/kernel/printk/printk.c @@ -3144,6 +3144,23 @@ EXPORT_SYMBOL_GPL(kmsg_dump_unregister); static bool always_kmsg_dump; module_param_named(always_kmsg_dump, always_kmsg_dump, bool, S_IRUGO | S_IWUSR); +const char *kmsg_dump_reason_str(enum kmsg_dump_reason reason) +{ + switch (reason) { + case KMSG_DUMP_PANIC: + return "Panic"; + case KMSG_DUMP_OOPS: + return "Oops"; + case KMSG_DUMP_EMERG: + return "Emergency"; + case KMSG_DUMP_SHUTDOWN: + return "Shutdown"; + default: + return "Unknown"; + } +} +EXPORT_SYMBOL_GPL(kmsg_dump_reason_str); + /** * kmsg_dump - dump kernel log to kernel message dumpers. * @reason: the reason (oops, panic etc) for dumping -- cgit v1.2.3 From 3524e688b8ee50b0edc76f0e020727eb6c684dbc Mon Sep 17 00:00:00 2001 From: Pavel Tatashin Date: Tue, 5 May 2020 11:45:07 -0400 Subject: pstore/platform: Pass max_reason to kmsg dump Add a new member to struct pstore_info for passing information about the kmsg dump maximum reason. This allows finer control of which kmsg dumps are sent to pstore storage backends. Those backends that do not explicitly set this field (keeping it equal to 0) get the default behavior: store only Oopses and Panics, or everything if the printk.always_kmsg_dump boot param is set. Signed-off-by: Pavel Tatashin Link: https://lore.kernel.org/lkml/20200515184434.8470-5-keescook@chromium.org/ Co-developed-by: Kees Cook Signed-off-by: Kees Cook --- fs/pstore/platform.c | 4 +++- include/linux/pstore.h | 7 +++++++ 2 files changed, 10 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/fs/pstore/platform.c b/fs/pstore/platform.c index 5e6c6022deb9..a9e297eefdff 100644 --- a/fs/pstore/platform.c +++ b/fs/pstore/platform.c @@ -595,8 +595,10 @@ int pstore_register(struct pstore_info *psi) pstore_get_records(0); - if (psi->flags & PSTORE_FLAGS_DMESG) + if (psi->flags & PSTORE_FLAGS_DMESG) { + pstore_dumper.max_reason = psinfo->max_reason; pstore_register_kmsg(); + } if (psi->flags & PSTORE_FLAGS_CONSOLE) pstore_register_console(); if (psi->flags & PSTORE_FLAGS_FTRACE) diff --git a/include/linux/pstore.h b/include/linux/pstore.h index f6f22b13e04f..eb93a54cff31 100644 --- a/include/linux/pstore.h +++ b/include/linux/pstore.h @@ -96,6 +96,12 @@ struct pstore_record { * * @read_mutex: serializes @open, @read, @close, and @erase callbacks * @flags: bitfield of frontends the backend can accept writes for + * @max_reason: Used when PSTORE_FLAGS_DMESG is set. Contains the + * kmsg_dump_reason enum value.
KMSG_DUMP_UNDEF means + "use existing kmsg_dump() filtering, based on the + * printk.always_kmsg_dump boot param" (which is either + * KMSG_DUMP_OOPS when false, or KMSG_DUMP_MAX when + * true); see printk.always_kmsg_dump for more details. * @data: backend-private pointer passed back during callbacks * * Callbacks: @@ -179,6 +185,7 @@ struct pstore_info { struct mutex read_mutex; int flags; + int max_reason; void *data; int (*open)(struct pstore_info *psi); -- cgit v1.2.3 From 791205e3ec6081a8da6f00621e3453d622dc41e7 Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Wed, 13 May 2020 14:35:03 -0700 Subject: pstore/ram: Introduce max_reason and convert dump_oops Now that pstore_register() can correctly pass max_reason to the kmsg dump facility, introduce a new "max_reason" module parameter and "max-reason" Device Tree field. The "dump_oops" module parameter and "dump-oops" Device Tree field are now considered deprecated, but are automatically converted to their corresponding max_reason values when present, though the new max_reason setting has precedence. For struct ramoops_platform_data, the "dump_oops" member is entirely replaced by a new "max_reason" member, with the only existing user updated in place. Additionally remove the "reason" filter logic from ramoops_pstore_write(), as that is not specifically needed anymore, though technically this is a change in behavior for any ramoops users also setting the printk.always_kmsg_dump boot param, which will cause ramoops to behave as if max_reason was set to KMSG_DUMP_MAX. Co-developed-by: Pavel Tatashin Signed-off-by: Pavel Tatashin Link: https://lore.kernel.org/lkml/20200515184434.8470-6-keescook@chromium.org/ Signed-off-by: Kees Cook --- Documentation/admin-guide/ramoops.rst | 14 +++++--- drivers/platform/chrome/chromeos_pstore.c | 2 +- fs/pstore/ram.c | 58 +++++++++++++++++++++---------- include/linux/pstore_ram.h | 2 +- 4 files changed, 51 insertions(+), 25 deletions(-) (limited to 'include/linux') diff --git a/Documentation/admin-guide/ramoops.rst b/Documentation/admin-guide/ramoops.rst index 6dbcc5481000..a60a96218ba9 100644 --- a/Documentation/admin-guide/ramoops.rst +++ b/Documentation/admin-guide/ramoops.rst @@ -32,11 +32,17 @@ memory to be mapped strongly ordered, and atomic operations on strongly ordered memory are implementation defined, and won't work on many ARMs such as omaps. The memory area is divided into ``record_size`` chunks (also rounded down to -power of two) and each oops/panic writes a ``record_size`` chunk of +power of two) and each kmsg dump writes a ``record_size`` chunk of information. -Dumping both oopses and panics can be done by setting 1 in the ``dump_oops`` -variable while setting 0 in that variable dumps only the panics. +Which kinds of kmsg dumps are stored can be controlled via +the ``max_reason`` value, as defined in include/linux/kmsg_dump.h's +``enum kmsg_dump_reason``. For example, to store both Oopses and Panics, +``max_reason`` should be set to 2 (KMSG_DUMP_OOPS); to store only Panics, +``max_reason`` should be set to 1 (KMSG_DUMP_PANIC). Setting this to 0 +(KMSG_DUMP_UNDEF) means the reason filtering will be controlled by the +``printk.always_kmsg_dump`` boot param: if unset, it'll be KMSG_DUMP_OOPS, +otherwise KMSG_DUMP_MAX. The module uses a counter to record multiple dumps but the counter gets reset on restart (i.e. new dumps after the restart will overwrite old ones).
@@ -90,7 +96,7 @@ Setting the ramoops parameters can be done in several different manners: .mem_address = <...>, .mem_type = <...>, .record_size = <...>, - .dump_oops = <...>, + .max_reason = <...>, .ecc = <...>, }; diff --git a/drivers/platform/chrome/chromeos_pstore.c b/drivers/platform/chrome/chromeos_pstore.c index d13770785fb5..fa51153688b4 100644 --- a/drivers/platform/chrome/chromeos_pstore.c +++ b/drivers/platform/chrome/chromeos_pstore.c @@ -57,7 +57,7 @@ static struct ramoops_platform_data chromeos_ramoops_data = { .record_size = 0x40000, .console_size = 0x20000, .ftrace_size = 0x20000, - .dump_oops = 1, + .max_reason = KMSG_DUMP_OOPS, }; static struct platform_device chromeos_ramoops = { diff --git a/fs/pstore/ram.c b/fs/pstore/ram.c index 26a1f502a3ea..ca6d8a867285 100644 --- a/fs/pstore/ram.c +++ b/fs/pstore/ram.c @@ -58,10 +58,10 @@ module_param(mem_type, uint, 0400); MODULE_PARM_DESC(mem_type, "set to 1 to try to use unbuffered memory (default 0)"); -static int dump_oops = 1; -module_param(dump_oops, int, 0400); -MODULE_PARM_DESC(dump_oops, - "set to 1 to dump oopses, 0 to only dump panics (default 1)"); +static int ramoops_max_reason = -1; +module_param_named(max_reason, ramoops_max_reason, int, 0400); +MODULE_PARM_DESC(max_reason, + "maximum reason for kmsg dump (default 2: Oops and Panic) "); static int ramoops_ecc; module_param_named(ecc, ramoops_ecc, int, 0400); @@ -70,6 +70,11 @@ MODULE_PARM_DESC(ramoops_ecc, "ECC buffer size in bytes (1 is a special value, means 16 " "bytes ECC)"); +static int ramoops_dump_oops = -1; +module_param_named(dump_oops, ramoops_dump_oops, int, 0400); +MODULE_PARM_DESC(dump_oops, + "(deprecated: use max_reason instead) set to 1 to dump oopses & panics, 0 to only dump panics"); + struct ramoops_context { struct persistent_ram_zone **dprzs; /* Oops dump zones */ struct persistent_ram_zone *cprz; /* Console zone */ @@ -82,7 +87,6 @@ struct ramoops_context { size_t console_size; size_t ftrace_size; size_t pmsg_size; - int dump_oops; u32 flags; struct persistent_ram_ecc_info ecc_info; unsigned int max_dump_cnt; @@ -336,16 +340,14 @@ static int notrace ramoops_pstore_write(struct pstore_record *record) return -EINVAL; /* - * Out of the various dmesg dump types, ramoops is currently designed - * to only store crash logs, rather than storing general kernel logs. + * We could filter on record->reason here if we wanted to (which + * would duplicate what happened before the "max_reason" setting + * was added), but that would defeat the purpose of a system + * changing printk.always_kmsg_dump, so instead log everything that + * the kmsg dumper sends us, since it should be doing the filtering + * based on the combination of printk.always_kmsg_dump and our + * requested "max_reason". */ - if (record->reason != KMSG_DUMP_OOPS && - record->reason != KMSG_DUMP_PANIC) - return -EINVAL; - - /* Skip Oopes when configured to do so. */ - if (record->reason == KMSG_DUMP_OOPS && !cxt->dump_oops) - return -EINVAL; /* * Explicitly only take the first part of any new crash. @@ -647,7 +649,14 @@ static int ramoops_parse_dt(struct platform_device *pdev, pdata->mem_size = resource_size(res); pdata->mem_address = res->start; pdata->mem_type = of_property_read_bool(of_node, "unbuffered"); - pdata->dump_oops = !of_property_read_bool(of_node, "no-dump-oops"); + /* + * Setting "no-dump-oops" is deprecated and will be ignored if + * "max_reason" is also specified. 
+ */ + if (of_property_read_bool(of_node, "no-dump-oops")) + pdata->max_reason = KMSG_DUMP_PANIC; + else + pdata->max_reason = KMSG_DUMP_OOPS; #define parse_u32(name, field, default_value) { \ ret = ramoops_parse_dt_u32(pdev, name, default_value, \ @@ -663,6 +672,7 @@ static int ramoops_parse_dt(struct platform_device *pdev, parse_u32("pmsg-size", pdata->pmsg_size, 0); parse_u32("ecc-size", pdata->ecc_info.ecc_size, 0); parse_u32("flags", pdata->flags, 0); + parse_u32("max-reason", pdata->max_reason, pdata->max_reason); #undef parse_u32 @@ -746,7 +756,6 @@ static int ramoops_probe(struct platform_device *pdev) cxt->console_size = pdata->console_size; cxt->ftrace_size = pdata->ftrace_size; cxt->pmsg_size = pdata->pmsg_size; - cxt->dump_oops = pdata->dump_oops; cxt->flags = pdata->flags; cxt->ecc_info = pdata->ecc_info; @@ -789,8 +798,10 @@ static int ramoops_probe(struct platform_device *pdev) * the single region size is how to check. */ cxt->pstore.flags = 0; - if (cxt->max_dump_cnt) + if (cxt->max_dump_cnt) { cxt->pstore.flags |= PSTORE_FLAGS_DMESG; + cxt->pstore.max_reason = pdata->max_reason; + } if (cxt->console_size) cxt->pstore.flags |= PSTORE_FLAGS_CONSOLE; if (cxt->max_ftrace_cnt) @@ -826,7 +837,7 @@ static int ramoops_probe(struct platform_device *pdev) mem_size = pdata->mem_size; mem_address = pdata->mem_address; record_size = pdata->record_size; - dump_oops = pdata->dump_oops; + ramoops_max_reason = pdata->max_reason; ramoops_console_size = pdata->console_size; ramoops_pmsg_size = pdata->pmsg_size; ramoops_ftrace_size = pdata->ftrace_size; @@ -909,7 +920,16 @@ static void __init ramoops_register_dummy(void) pdata.console_size = ramoops_console_size; pdata.ftrace_size = ramoops_ftrace_size; pdata.pmsg_size = ramoops_pmsg_size; - pdata.dump_oops = dump_oops; + /* If "max_reason" is set, its value has priority over "dump_oops". */ + if (ramoops_max_reason >= 0) + pdata.max_reason = ramoops_max_reason; + /* Otherwise, if "dump_oops" is set, parse it into "max_reason". */ + else if (ramoops_dump_oops != -1) + pdata.max_reason = ramoops_dump_oops ? KMSG_DUMP_OOPS + : KMSG_DUMP_PANIC; + /* And if neither are explicitly set, use the default. */ + else + pdata.max_reason = KMSG_DUMP_OOPS; pdata.flags = RAMOOPS_FLAG_FTRACE_PER_CPU; /* diff --git a/include/linux/pstore_ram.h b/include/linux/pstore_ram.h index 9cb9b9067298..9f16afec7290 100644 --- a/include/linux/pstore_ram.h +++ b/include/linux/pstore_ram.h @@ -133,7 +133,7 @@ struct ramoops_platform_data { unsigned long console_size; unsigned long ftrace_size; unsigned long pmsg_size; - int dump_oops; + int max_reason; u32 flags; struct persistent_ram_ecc_info ecc_info; }; -- cgit v1.2.3 From d26c3321fe18dc74517dc1f518d584aa33b0a851 Mon Sep 17 00:00:00 2001 From: WeiXiong Liao Date: Wed, 25 Mar 2020 16:54:56 +0800 Subject: pstore/zone: Introduce common layer to manage storage zones Implement a common set of APIs needed to support pstore storage zones, based on how ramoops is designed. This will be used by pstore/blk with the intention of migrating pstore/ram in the future. 
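For orientation, a storage driver feeds this layer an info structure plus raw read/write hooks, roughly like the sketch below. The write-hook signature matches the writeop pointer visible in the zone code that follows; the read-hook signature and the exact pstore_zone_info field names are assumptions, and all my_* names are invented:

	static char my_store[1024 * 1024];	/* pretend backing device */

	static ssize_t my_read(char *buf, size_t bytes, loff_t pos)
	{
		memcpy(buf, my_store + pos, bytes);
		return bytes;
	}

	static ssize_t my_write(const char *buf, size_t bytes, loff_t pos)
	{
		memcpy(my_store + pos, buf, bytes);
		return bytes;
	}

	static struct pstore_zone_info my_info = {
		.owner      = THIS_MODULE,
		.name       = "my_backend",
		.total_size = sizeof(my_store),
		.kmsg_size  = 64 * 1024,
		.read       = my_read,
		.write      = my_write,
	};

	/* register_pstore_zone(&my_info) during module init,
	 * unregister_pstore_zone(&my_info) on module exit. */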
Signed-off-by: WeiXiong Liao Link: https://lore.kernel.org/lkml/20200511233229.27745-2-keescook@chromium.org/ Co-developed-by: Kees Cook Signed-off-by: Kees Cook --- fs/pstore/Kconfig | 7 + fs/pstore/Makefile | 3 + fs/pstore/zone.c | 985 ++++++++++++++++++++++++++++++++++++++++++++ include/linux/pstore_zone.h | 44 ++ 4 files changed, 1039 insertions(+) create mode 100644 fs/pstore/zone.c create mode 100644 include/linux/pstore_zone.h (limited to 'include/linux') diff --git a/fs/pstore/Kconfig b/fs/pstore/Kconfig index 8f0369aad22a..98d2457bdd9f 100644 --- a/fs/pstore/Kconfig +++ b/fs/pstore/Kconfig @@ -153,3 +153,10 @@ config PSTORE_RAM "ramoops.ko". For more information, see Documentation/admin-guide/ramoops.rst. + +config PSTORE_ZONE + tristate + depends on PSTORE + help + The common layer for pstore/blk (and pstore/ram in the future) + to manage storage in zones. diff --git a/fs/pstore/Makefile b/fs/pstore/Makefile index 967b5891f325..58a967cbe4af 100644 --- a/fs/pstore/Makefile +++ b/fs/pstore/Makefile @@ -12,3 +12,6 @@ pstore-$(CONFIG_PSTORE_PMSG) += pmsg.o ramoops-objs += ram.o ram_core.o obj-$(CONFIG_PSTORE_RAM) += ramoops.o + +pstore_zone-objs += zone.o +obj-$(CONFIG_PSTORE_ZONE) += pstore_zone.o diff --git a/fs/pstore/zone.c b/fs/pstore/zone.c new file mode 100644 index 000000000000..362d72a24012 --- /dev/null +++ b/fs/pstore/zone.c @@ -0,0 +1,985 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Provide a pstore intermediate backend, organized into kernel memory + * allocated zones that are then mapped and flushed into a single + * contiguous region on a storage backend of some kind (block, mtd, etc). + */ + +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "internal.h" + +/** + * struct psz_head - header of zone to flush to storage + * + * @sig: signature to indicate header (PSZ_SIG xor PSZONE-type value) + * @datalen: length of data in @data + * @data: zone data. + */ +struct psz_buffer { +#define PSZ_SIG (0x43474244) /* DBGC */ + uint32_t sig; + atomic_t datalen; + uint8_t data[]; +}; + +/** + * struct psz_kmsg_header - kmsg dump-specific header to flush to storage + * + * @magic: magic num for kmsg dump header + * @time: kmsg dump trigger time + * @compressed: whether conpressed + * @counter: kmsg dump counter + * @reason: the kmsg dump reason (e.g. oops, panic, etc) + * @data: pointer to log data + * + * This is a sub-header for a kmsg dump, trailing after &psz_buffer. + */ +struct psz_kmsg_header { +#define PSTORE_KMSG_HEADER_MAGIC 0x4dfc3ae5 /* Just a random number */ + uint32_t magic; + struct timespec64 time; + bool compressed; + uint32_t counter; + enum kmsg_dump_reason reason; + uint8_t data[]; +}; + +/** + * struct pstore_zone - single stored buffer + * + * @off: zone offset of storage + * @type: front-end type for this zone + * @name: front-end name for this zone + * @buffer: pointer to data buffer managed by this zone + * @oldbuf: pointer to old data buffer + * @buffer_size: bytes in @buffer->data + * @should_recover: whether this zone should recover from storage + * @dirty: whether the data in @buffer dirty + * + * zone structure in memory. 
+ */ +struct pstore_zone { + loff_t off; + const char *name; + enum pstore_type_id type; + + struct psz_buffer *buffer; + struct psz_buffer *oldbuf; + size_t buffer_size; + bool should_recover; + atomic_t dirty; +}; + +/** + * struct psz_context - all about running state of pstore/zone + * + * @kpszs: kmsg dump storage zones + * @kmsg_max_cnt: max count of @kpszs + * @kmsg_read_cnt: counter of total read kmsg dumps + * @kmsg_write_cnt: counter of total kmsg dump writes + * @oops_counter: counter of oops dumps + * @panic_counter: counter of panic dumps + * @recovered: whether finished recovering data from storage + * @on_panic: whether panic is happening + * @pstore_zone_info_lock: lock to @pstore_zone_info + * @pstore_zone_info: information from backend + * @pstore: structure for pstore + */ +struct psz_context { + struct pstore_zone **kpszs; + unsigned int kmsg_max_cnt; + unsigned int kmsg_read_cnt; + unsigned int kmsg_write_cnt; + /* + * These counters should be calculated during recovery. + * It records the oops/panic times after crashes rather than boots. + */ + unsigned int oops_counter; + unsigned int panic_counter; + atomic_t recovered; + atomic_t on_panic; + + /* + * pstore_zone_info_lock protects this entire structure during calls + * to register_pstore_zone()/unregister_pstore_zone(). + */ + struct mutex pstore_zone_info_lock; + struct pstore_zone_info *pstore_zone_info; + struct pstore_info pstore; +}; +static struct psz_context pstore_zone_cxt; + +/** + * enum psz_flush_mode - flush mode for psz_zone_write() + * + * @FLUSH_NONE: do not flush to storage but update data on memory + * @FLUSH_PART: just flush part of data including meta data to storage + * @FLUSH_META: just flush meta data of zone to storage + * @FLUSH_ALL: flush all of zone + */ +enum psz_flush_mode { + FLUSH_NONE = 0, + FLUSH_PART, + FLUSH_META, + FLUSH_ALL, +}; + +static inline int buffer_datalen(struct pstore_zone *zone) +{ + return atomic_read(&zone->buffer->datalen); +} + +static inline bool is_on_panic(void) +{ + return atomic_read(&pstore_zone_cxt.on_panic); +} + +static ssize_t psz_zone_read(struct pstore_zone *zone, char *buf, + size_t len, unsigned long off) +{ + if (!buf || !zone->buffer) + return -EINVAL; + if (off > zone->buffer_size) + return -EINVAL; + len = min_t(size_t, len, zone->buffer_size - off); + memcpy(buf, zone->buffer->data + off, len); + return len; +} + +static int psz_zone_write(struct pstore_zone *zone, + enum psz_flush_mode flush_mode, const char *buf, + size_t len, unsigned long off) +{ + struct pstore_zone_info *info = pstore_zone_cxt.pstore_zone_info; + ssize_t wcnt = 0; + ssize_t (*writeop)(const char *buf, size_t bytes, loff_t pos); + size_t wlen; + + if (off > zone->buffer_size) + return -EINVAL; + + wlen = min_t(size_t, len, zone->buffer_size - off); + if (buf && wlen) { + memcpy(zone->buffer->data + off, buf, wlen); + atomic_set(&zone->buffer->datalen, wlen + off); + } + + /* avoid to damage old records */ + if (!is_on_panic() && !atomic_read(&pstore_zone_cxt.recovered)) + goto dirty; + + writeop = is_on_panic() ? 
info->panic_write : info->write; + if (!writeop) + goto dirty; + + switch (flush_mode) { + case FLUSH_NONE: + if (unlikely(buf && wlen)) + goto dirty; + return 0; + case FLUSH_PART: + wcnt = writeop((const char *)zone->buffer->data + off, wlen, + zone->off + sizeof(*zone->buffer) + off); + if (wcnt != wlen) + goto dirty; + fallthrough; + case FLUSH_META: + wlen = sizeof(struct psz_buffer); + wcnt = writeop((const char *)zone->buffer, wlen, zone->off); + if (wcnt != wlen) + goto dirty; + break; + case FLUSH_ALL: + wlen = zone->buffer_size + sizeof(*zone->buffer); + wcnt = writeop((const char *)zone->buffer, wlen, zone->off); + if (wcnt != wlen) + goto dirty; + break; + } + + return 0; +dirty: + atomic_set(&zone->dirty, true); + return -EBUSY; +} + +static int psz_flush_dirty_zone(struct pstore_zone *zone) +{ + int ret; + + if (unlikely(!zone)) + return -EINVAL; + + if (unlikely(!atomic_read(&pstore_zone_cxt.recovered))) + return -EBUSY; + + if (!atomic_xchg(&zone->dirty, false)) + return 0; + + ret = psz_zone_write(zone, FLUSH_ALL, NULL, 0, 0); + if (ret) + atomic_set(&zone->dirty, true); + return ret; +} + +static int psz_flush_dirty_zones(struct pstore_zone **zones, unsigned int cnt) +{ + int i, ret; + struct pstore_zone *zone; + + if (!zones) + return -EINVAL; + + for (i = 0; i < cnt; i++) { + zone = zones[i]; + if (!zone) + return -EINVAL; + ret = psz_flush_dirty_zone(zone); + if (ret) + return ret; + } + return 0; +} + +static int psz_move_zone(struct pstore_zone *old, struct pstore_zone *new) +{ + const char *data = (const char *)old->buffer->data; + int ret; + + ret = psz_zone_write(new, FLUSH_ALL, data, buffer_datalen(old), 0); + if (ret) { + atomic_set(&new->buffer->datalen, 0); + atomic_set(&new->dirty, false); + return ret; + } + atomic_set(&old->buffer->datalen, 0); + return 0; +} + +static int psz_kmsg_recover_data(struct psz_context *cxt) +{ + struct pstore_zone_info *info = cxt->pstore_zone_info; + struct pstore_zone *zone = NULL; + struct psz_buffer *buf; + unsigned long i; + ssize_t rcnt; + + if (!info->read) + return -EINVAL; + + for (i = 0; i < cxt->kmsg_max_cnt; i++) { + zone = cxt->kpszs[i]; + if (unlikely(!zone)) + return -EINVAL; + if (atomic_read(&zone->dirty)) { + unsigned int wcnt = cxt->kmsg_write_cnt; + struct pstore_zone *new = cxt->kpszs[wcnt]; + int ret; + + ret = psz_move_zone(zone, new); + if (ret) { + pr_err("move zone from %lu to %d failed\n", + i, wcnt); + return ret; + } + cxt->kmsg_write_cnt = (wcnt + 1) % cxt->kmsg_max_cnt; + } + if (!zone->should_recover) + continue; + buf = zone->buffer; + rcnt = info->read((char *)buf, zone->buffer_size + sizeof(*buf), + zone->off); + if (rcnt != zone->buffer_size + sizeof(*buf)) + return (int)rcnt < 0 ? (int)rcnt : -EIO; + } + return 0; +} + +static int psz_kmsg_recover_meta(struct psz_context *cxt) +{ + struct pstore_zone_info *info = cxt->pstore_zone_info; + struct pstore_zone *zone; + size_t rcnt, len; + struct psz_buffer *buf; + struct psz_kmsg_header *hdr; + struct timespec64 time = { }; + unsigned long i; + /* + * Recover may on panic, we can't allocate any memory by kmalloc. + * So, we use local array instead. 
+ */ + char buffer_header[sizeof(*buf) + sizeof(*hdr)] = {0}; + + if (!info->read) + return -EINVAL; + + len = sizeof(*buf) + sizeof(*hdr); + buf = (struct psz_buffer *)buffer_header; + for (i = 0; i < cxt->kmsg_max_cnt; i++) { + zone = cxt->kpszs[i]; + if (unlikely(!zone)) + return -EINVAL; + + rcnt = info->read((char *)buf, len, zone->off); + if (rcnt != len) { + pr_err("read %s with id %lu failed\n", zone->name, i); + return (int)rcnt < 0 ? (int)rcnt : -EIO; + } + + if (buf->sig != zone->buffer->sig) { + pr_debug("no valid data in kmsg dump zone %lu\n", i); + continue; + } + + if (zone->buffer_size < atomic_read(&buf->datalen)) { + pr_info("found overtop zone: %s: id %lu, off %lld, size %zu\n", + zone->name, i, zone->off, + zone->buffer_size); + continue; + } + + hdr = (struct psz_kmsg_header *)buf->data; + if (hdr->magic != PSTORE_KMSG_HEADER_MAGIC) { + pr_info("found invalid zone: %s: id %lu, off %lld, size %zu\n", + zone->name, i, zone->off, + zone->buffer_size); + continue; + } + + /* + * we get the newest zone, and the next one must be the oldest + * or unused zone, because we do write one by one like a circle. + */ + if (hdr->time.tv_sec >= time.tv_sec) { + time.tv_sec = hdr->time.tv_sec; + cxt->kmsg_write_cnt = (i + 1) % cxt->kmsg_max_cnt; + } + + if (hdr->reason == KMSG_DUMP_OOPS) + cxt->oops_counter = + max(cxt->oops_counter, hdr->counter); + else if (hdr->reason == KMSG_DUMP_PANIC) + cxt->panic_counter = + max(cxt->panic_counter, hdr->counter); + + if (!atomic_read(&buf->datalen)) { + pr_debug("found erased zone: %s: id %lu, off %lld, size %zu, datalen %d\n", + zone->name, i, zone->off, + zone->buffer_size, + atomic_read(&buf->datalen)); + continue; + } + + if (!is_on_panic()) + zone->should_recover = true; + pr_debug("found nice zone: %s: id %lu, off %lld, size %zu, datalen %d\n", + zone->name, i, zone->off, + zone->buffer_size, atomic_read(&buf->datalen)); + } + + return 0; +} + +static int psz_kmsg_recover(struct psz_context *cxt) +{ + int ret; + + if (!cxt->kpszs) + return 0; + + ret = psz_kmsg_recover_meta(cxt); + if (ret) + goto recover_fail; + + ret = psz_kmsg_recover_data(cxt); + if (ret) + goto recover_fail; + + return 0; +recover_fail: + pr_debug("psz_recover_kmsg failed\n"); + return ret; +} + +/** + * psz_recovery() - recover data from storage + * @cxt: the context of pstore/zone + * + * recovery means reading data back from storage after rebooting + * + * Return: 0 on success, others on failure. 
+ */ +static inline int psz_recovery(struct psz_context *cxt) +{ + int ret; + + if (atomic_read(&cxt->recovered)) + return 0; + + ret = psz_kmsg_recover(cxt); + + if (unlikely(ret)) + pr_err("recover failed\n"); + else { + pr_debug("recover end!\n"); + atomic_set(&cxt->recovered, 1); + } + return ret; +} + +static int psz_pstore_open(struct pstore_info *psi) +{ + struct psz_context *cxt = psi->data; + + cxt->kmsg_read_cnt = 0; + return 0; +} + +static inline bool psz_ok(struct pstore_zone *zone) +{ + if (zone && zone->buffer && buffer_datalen(zone)) + return true; + return false; +} + +static inline int psz_kmsg_erase(struct psz_context *cxt, + struct pstore_zone *zone, struct pstore_record *record) +{ + struct psz_buffer *buffer = zone->buffer; + struct psz_kmsg_header *hdr = + (struct psz_kmsg_header *)buffer->data; + + if (unlikely(!psz_ok(zone))) + return 0; + /* this zone is already updated, no need to erase */ + if (record->count != hdr->counter) + return 0; + + atomic_set(&zone->buffer->datalen, 0); + return psz_zone_write(zone, FLUSH_META, NULL, 0, 0); +} + +static int psz_pstore_erase(struct pstore_record *record) +{ + struct psz_context *cxt = record->psi->data; + + switch (record->type) { + case PSTORE_TYPE_DMESG: + if (record->id >= cxt->kmsg_max_cnt) + return -EINVAL; + return psz_kmsg_erase(cxt, cxt->kpszs[record->id], record); + default: + return -EINVAL; + } +} + +static void psz_write_kmsg_hdr(struct pstore_zone *zone, + struct pstore_record *record) +{ + struct psz_context *cxt = record->psi->data; + struct psz_buffer *buffer = zone->buffer; + struct psz_kmsg_header *hdr = + (struct psz_kmsg_header *)buffer->data; + + hdr->magic = PSTORE_KMSG_HEADER_MAGIC; + hdr->compressed = record->compressed; + hdr->time.tv_sec = record->time.tv_sec; + hdr->time.tv_nsec = record->time.tv_nsec; + hdr->reason = record->reason; + if (hdr->reason == KMSG_DUMP_OOPS) + hdr->counter = ++cxt->oops_counter; + else if (hdr->reason == KMSG_DUMP_PANIC) + hdr->counter = ++cxt->panic_counter; + else + hdr->counter = 0; +} + +static inline int notrace psz_kmsg_write_record(struct psz_context *cxt, + struct pstore_record *record) +{ + size_t size, hlen; + struct pstore_zone *zone; + unsigned int zonenum; + + zonenum = cxt->kmsg_write_cnt; + zone = cxt->kpszs[zonenum]; + if (unlikely(!zone)) + return -ENOSPC; + cxt->kmsg_write_cnt = (zonenum + 1) % cxt->kmsg_max_cnt; + + pr_debug("write %s to zone id %d\n", zone->name, zonenum); + psz_write_kmsg_hdr(zone, record); + hlen = sizeof(struct psz_kmsg_header); + size = min_t(size_t, record->size, zone->buffer_size - hlen); + return psz_zone_write(zone, FLUSH_ALL, record->buf, size, hlen); +} + +static int notrace psz_kmsg_write(struct psz_context *cxt, + struct pstore_record *record) +{ + int ret; + + /* + * Explicitly only take the first part of any new crash. + * If our buffer is larger than kmsg_bytes, this can never happen, + * and if our buffer is smaller than kmsg_bytes, we don't want the + * report split across multiple records. 
+ */ + if (record->part != 1) + return -ENOSPC; + + if (!cxt->kpszs) + return -ENOSPC; + + ret = psz_kmsg_write_record(cxt, record); + if (!ret) { + pr_debug("try to flush other dirty zones\n"); + psz_flush_dirty_zones(cxt->kpszs, cxt->kmsg_max_cnt); + } + + /* always return 0 as we had handled it on buffer */ + return 0; +} + +static int notrace psz_pstore_write(struct pstore_record *record) +{ + struct psz_context *cxt = record->psi->data; + + if (record->type == PSTORE_TYPE_DMESG && + record->reason == KMSG_DUMP_PANIC) + atomic_set(&cxt->on_panic, 1); + + switch (record->type) { + case PSTORE_TYPE_DMESG: + return psz_kmsg_write(cxt, record); + default: + return -EINVAL; + } +} + +static struct pstore_zone *psz_read_next_zone(struct psz_context *cxt) +{ + struct pstore_zone *zone = NULL; + + while (cxt->kmsg_read_cnt < cxt->kmsg_max_cnt) { + zone = cxt->kpszs[cxt->kmsg_read_cnt++]; + if (psz_ok(zone)) + return zone; + } + + return NULL; +} + +static int psz_kmsg_read_hdr(struct pstore_zone *zone, + struct pstore_record *record) +{ + struct psz_buffer *buffer = zone->buffer; + struct psz_kmsg_header *hdr = + (struct psz_kmsg_header *)buffer->data; + + if (hdr->magic != PSTORE_KMSG_HEADER_MAGIC) + return -EINVAL; + record->compressed = hdr->compressed; + record->time.tv_sec = hdr->time.tv_sec; + record->time.tv_nsec = hdr->time.tv_nsec; + record->reason = hdr->reason; + record->count = hdr->counter; + return 0; +} + +static ssize_t psz_kmsg_read(struct pstore_zone *zone, + struct pstore_record *record) +{ + ssize_t size, hlen = 0; + + size = buffer_datalen(zone); + /* Clear and skip this kmsg dump record if it has no valid header */ + if (psz_kmsg_read_hdr(zone, record)) { + atomic_set(&zone->buffer->datalen, 0); + atomic_set(&zone->dirty, 0); + return -ENOMSG; + } + size -= sizeof(struct psz_kmsg_header); + + if (!record->compressed) { + char *buf = kasprintf(GFP_KERNEL, "%s: Total %d times\n", + kmsg_dump_reason_str(record->reason), + record->count); + hlen = strlen(buf); + record->buf = krealloc(buf, hlen + size, GFP_KERNEL); + if (!record->buf) { + kfree(buf); + return -ENOMEM; + } + } else { + record->buf = kmalloc(size, GFP_KERNEL); + if (!record->buf) + return -ENOMEM; + } + + size = psz_zone_read(zone, record->buf + hlen, size, + sizeof(struct psz_kmsg_header)); + if (unlikely(size < 0)) { + kfree(record->buf); + return -ENOMSG; + } + + return size + hlen; +} + +static ssize_t psz_pstore_read(struct pstore_record *record) +{ + struct psz_context *cxt = record->psi->data; + ssize_t (*readop)(struct pstore_zone *zone, + struct pstore_record *record); + struct pstore_zone *zone; + ssize_t ret; + + /* before read, we must recover from storage */ + ret = psz_recovery(cxt); + if (ret) + return ret; + +next_zone: + zone = psz_read_next_zone(cxt); + if (!zone) + return 0; + + record->type = zone->type; + switch (record->type) { + case PSTORE_TYPE_DMESG: + readop = psz_kmsg_read; + record->id = cxt->kmsg_read_cnt - 1; + break; + default: + goto next_zone; + } + + ret = readop(zone, record); + if (ret == -ENOMSG) + goto next_zone; + return ret; +} + +static struct psz_context pstore_zone_cxt = { + .pstore_zone_info_lock = + __MUTEX_INITIALIZER(pstore_zone_cxt.pstore_zone_info_lock), + .recovered = ATOMIC_INIT(0), + .on_panic = ATOMIC_INIT(0), + .pstore = { + .owner = THIS_MODULE, + .open = psz_pstore_open, + .read = psz_pstore_read, + .write = psz_pstore_write, + .erase = psz_pstore_erase, + }, +}; + +static void psz_free_zone(struct pstore_zone **pszone) +{ + struct pstore_zone *zone = 
*pszone; + + if (!zone) + return; + + kfree(zone->buffer); + kfree(zone); + *pszone = NULL; +} + +static void psz_free_zones(struct pstore_zone ***pszones, unsigned int *cnt) +{ + struct pstore_zone **zones = *pszones; + + if (!zones) + return; + + while (*cnt > 0) { + (*cnt)--; + psz_free_zone(&(zones[*cnt])); + } + kfree(zones); + *pszones = NULL; +} + +static void psz_free_all_zones(struct psz_context *cxt) +{ + if (cxt->kpszs) + psz_free_zones(&cxt->kpszs, &cxt->kmsg_max_cnt); +} + +static struct pstore_zone *psz_init_zone(enum pstore_type_id type, + loff_t *off, size_t size) +{ + struct pstore_zone_info *info = pstore_zone_cxt.pstore_zone_info; + struct pstore_zone *zone; + const char *name = pstore_type_to_name(type); + + if (!size) + return NULL; + + if (*off + size > info->total_size) { + pr_err("no room for %s (0x%zx@0x%llx over 0x%lx)\n", + name, size, *off, info->total_size); + return ERR_PTR(-ENOMEM); + } + + zone = kzalloc(sizeof(struct pstore_zone), GFP_KERNEL); + if (!zone) + return ERR_PTR(-ENOMEM); + + zone->buffer = kmalloc(size, GFP_KERNEL); + if (!zone->buffer) { + kfree(zone); + return ERR_PTR(-ENOMEM); + } + memset(zone->buffer, 0xFF, size); + zone->off = *off; + zone->name = name; + zone->type = type; + zone->buffer_size = size - sizeof(struct psz_buffer); + zone->buffer->sig = type ^ PSZ_SIG; + atomic_set(&zone->dirty, 0); + atomic_set(&zone->buffer->datalen, 0); + + *off += size; + + pr_debug("pszone %s: off 0x%llx, %zu header, %zu data\n", zone->name, + zone->off, sizeof(*zone->buffer), zone->buffer_size); + return zone; +} + +static struct pstore_zone **psz_init_zones(enum pstore_type_id type, + loff_t *off, size_t total_size, ssize_t record_size, + unsigned int *cnt) +{ + struct pstore_zone_info *info = pstore_zone_cxt.pstore_zone_info; + struct pstore_zone **zones, *zone; + const char *name = pstore_type_to_name(type); + int c, i; + + *cnt = 0; + if (!total_size || !record_size) + return NULL; + + if (*off + total_size > info->total_size) { + pr_err("no room for zones %s (0x%zx@0x%llx over 0x%lx)\n", + name, total_size, *off, info->total_size); + return ERR_PTR(-ENOMEM); + } + + c = total_size / record_size; + zones = kcalloc(c, sizeof(*zones), GFP_KERNEL); + if (!zones) { + pr_err("allocate for zones %s failed\n", name); + return ERR_PTR(-ENOMEM); + } + memset(zones, 0, c * sizeof(*zones)); + + for (i = 0; i < c; i++) { + zone = psz_init_zone(type, off, record_size); + if (!zone || IS_ERR(zone)) { + pr_err("initialize zones %s failed\n", name); + psz_free_zones(&zones, &i); + return (void *)zone; + } + zones[i] = zone; + } + + *cnt = c; + return zones; +} + +static int psz_alloc_zones(struct psz_context *cxt) +{ + struct pstore_zone_info *info = cxt->pstore_zone_info; + loff_t off = 0; + int err; + size_t size; + + size = info->total_size; + cxt->kpszs = psz_init_zones(PSTORE_TYPE_DMESG, &off, size, + info->kmsg_size, &cxt->kmsg_max_cnt); + if (IS_ERR(cxt->kpszs)) { + err = PTR_ERR(cxt->kpszs); + cxt->kpszs = NULL; + goto fail_out; + } + + return 0; +fail_out: + return err; +} + +/** + * register_pstore_zone() - register to pstore/zone + * + * @info: back-end driver information. See &struct pstore_zone_info. + * + * Only one back-end at one time. + * + * Return: 0 on success, others on failure. 
+ */ +int register_pstore_zone(struct pstore_zone_info *info) +{ + int err = -EINVAL; + struct psz_context *cxt = &pstore_zone_cxt; + + if (info->total_size < 4096) { + pr_warn("total_size must be >= 4096\n"); + return -EINVAL; + } + + if (!info->kmsg_size) { + pr_warn("at least one record size must be non-zero\n"); + return -EINVAL; + } + + if (!info->name || !info->name[0]) + return -EINVAL; + +#define check_size(name, size) { \ + if (info->name > 0 && info->name < (size)) { \ + pr_err(#name " must be over %d\n", (size)); \ + return -EINVAL; \ + } \ + if (info->name & (size - 1)) { \ + pr_err(#name " must be a multiple of %d\n", \ + (size)); \ + return -EINVAL; \ + } \ + } + + check_size(total_size, 4096); + check_size(kmsg_size, SECTOR_SIZE); + +#undef check_size + + /* + * the @read and @write must be applied. + * if no @read, pstore may mount failed. + * if no @write, pstore do not support to remove record file. + */ + if (!info->read || !info->write) { + pr_err("no valid general read/write interface\n"); + return -EINVAL; + } + + mutex_lock(&cxt->pstore_zone_info_lock); + if (cxt->pstore_zone_info) { + pr_warn("'%s' already loaded: ignoring '%s'\n", + cxt->pstore_zone_info->name, info->name); + mutex_unlock(&cxt->pstore_zone_info_lock); + return -EBUSY; + } + cxt->pstore_zone_info = info; + + pr_debug("register %s with properties:\n", info->name); + pr_debug("\ttotal size : %ld Bytes\n", info->total_size); + pr_debug("\tkmsg size : %ld Bytes\n", info->kmsg_size); + + err = psz_alloc_zones(cxt); + if (err) { + pr_err("alloc zones failed\n"); + goto fail_out; + } + + if (info->kmsg_size) { + cxt->pstore.bufsize = cxt->kpszs[0]->buffer_size - + sizeof(struct psz_kmsg_header); + cxt->pstore.buf = kzalloc(cxt->pstore.bufsize, GFP_KERNEL); + if (!cxt->pstore.buf) { + err = -ENOMEM; + goto fail_free; + } + } + cxt->pstore.data = cxt; + + pr_info("registered %s as backend for", info->name); + cxt->pstore.max_reason = info->max_reason; + cxt->pstore.name = info->name; + if (info->kmsg_size) { + cxt->pstore.flags |= PSTORE_FLAGS_DMESG; + pr_cont(" kmsg(%s", + kmsg_dump_reason_str(cxt->pstore.max_reason)); + if (cxt->pstore_zone_info->panic_write) + pr_cont(",panic_write"); + pr_cont(")"); + } + pr_cont("\n"); + + err = pstore_register(&cxt->pstore); + if (err) { + pr_err("registering with pstore failed\n"); + goto fail_free; + } + mutex_unlock(&pstore_zone_cxt.pstore_zone_info_lock); + + return 0; + +fail_free: + kfree(cxt->pstore.buf); + cxt->pstore.buf = NULL; + cxt->pstore.bufsize = 0; + psz_free_all_zones(cxt); +fail_out: + pstore_zone_cxt.pstore_zone_info = NULL; + mutex_unlock(&pstore_zone_cxt.pstore_zone_info_lock); + return err; +} +EXPORT_SYMBOL_GPL(register_pstore_zone); + +/** + * unregister_pstore_zone() - unregister to pstore/zone + * + * @info: back-end driver information. See struct pstore_zone_info. + */ +void unregister_pstore_zone(struct pstore_zone_info *info) +{ + struct psz_context *cxt = &pstore_zone_cxt; + + mutex_lock(&cxt->pstore_zone_info_lock); + if (!cxt->pstore_zone_info) { + mutex_unlock(&cxt->pstore_zone_info_lock); + return; + } + + /* Stop incoming writes from pstore. */ + pstore_unregister(&cxt->pstore); + + /* Clean up allocations. */ + kfree(cxt->pstore.buf); + cxt->pstore.buf = NULL; + cxt->pstore.bufsize = 0; + cxt->pstore_zone_info = NULL; + + psz_free_all_zones(cxt); + + /* Clear counters and zone state. 
*/ + cxt->oops_counter = 0; + cxt->panic_counter = 0; + atomic_set(&cxt->recovered, 0); + atomic_set(&cxt->on_panic, 0); + + mutex_unlock(&cxt->pstore_zone_info_lock); +} +EXPORT_SYMBOL_GPL(unregister_pstore_zone); + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("WeiXiong Liao "); +MODULE_AUTHOR("Kees Cook "); +MODULE_DESCRIPTION("Storage Manager for pstore/blk"); diff --git a/include/linux/pstore_zone.h b/include/linux/pstore_zone.h new file mode 100644 index 000000000000..eb005d9ae40c --- /dev/null +++ b/include/linux/pstore_zone.h @@ -0,0 +1,44 @@ +/* SPDX-License-Identifier: GPL-2.0 */ + +#ifndef __PSTORE_ZONE_H_ +#define __PSTORE_ZONE_H_ + +#include + +typedef ssize_t (*pstore_zone_read_op)(char *, size_t, loff_t); +typedef ssize_t (*pstore_zone_write_op)(const char *, size_t, loff_t); +/** + * struct pstore_zone_info - pstore/zone back-end driver structure + * + * @owner: Module which is responsible for this back-end driver. + * @name: Name of the back-end driver. + * @total_size: The total size in bytes pstore/zone can use. It must be greater + * than 4096 and be multiple of 4096. + * @kmsg_size: The size of oops/panic zone. Zero means disabled, otherwise, + * it must be multiple of SECTOR_SIZE(512 Bytes). + * @max_reason: Maximum kmsg dump reason to store. + * @read: The general read operation. Both of the function parameters + * @size and @offset are relative value to storage. + * On success, the number of bytes should be returned, others + * means error. + * @write: The same as @read. + * @panic_write:The write operation only used for panic case. It's optional + * if you do not care panic log. The parameters and return value + * are the same as @read. + */ +struct pstore_zone_info { + struct module *owner; + const char *name; + + unsigned long total_size; + unsigned long kmsg_size; + int max_reason; + pstore_zone_read_op read; + pstore_zone_write_op write; + pstore_zone_write_op panic_write; +}; + +extern int register_pstore_zone(struct pstore_zone_info *info); +extern void unregister_pstore_zone(struct pstore_zone_info *info); + +#endif -- cgit v1.2.3 From 17639f67c1d61aba3c05e7703f75cd468f9d484f Mon Sep 17 00:00:00 2001 From: WeiXiong Liao Date: Wed, 25 Mar 2020 16:54:57 +0800 Subject: pstore/blk: Introduce backend for block devices pstore/blk is similar to pstore/ram, but uses a block device as the storage rather than persistent ram. The pstore/blk backend solves two common use-cases that used to preclude using pstore/ram: - not all devices have a battery that could be used to persist regular RAM across power failures. - most embedded intelligent equipment have no persistent ram, which increases costs, instead preferring cheaper solutions, like block devices. pstore/blk provides separate configurations for the end user and for the block drivers. User configuration determines how pstore/blk operates, such as record sizes, max kmsg dump reasons, etc. These can be set by Kconfig and/or module parameters, but module parameter have priority over Kconfig. Driver configuration covers all the details about the target block device, such as total size of the device and how to perform read/write operations. These are provided by block drivers, calling pstore_register_blkdev(), including an optional panic_write callback used to bypass regular IO APIs in an effort to avoid potentially destabilized kernel code during a panic. 
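As a sketch of the driver-facing half of this design (the major number
and the hardware write helper below are hypothetical placeholders, not
part of this patch):

	/* Must write without the normal IO path; used only during panic. */
	static int example_panic_write(const char *buf, sector_t start_sect,
				       sector_t sects)
	{
		return example_hw_write_sectors(buf, start_sect, sects);
	}

	static struct pstore_blk_info example_info = {
		.major       = EXAMPLE_MAJOR,		/* hypothetical */
		.flags       = PSTORE_FLAGS_DMESG,
		.panic_write = example_panic_write,	/* optional but recommended */
	};

	int err = register_pstore_blk(&example_info);
	/* On success, example_info.devt, .nr_sects and .start_sect have
	 * been filled in with details of the attached block device. */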
Signed-off-by: WeiXiong Liao Link: https://lore.kernel.org/lkml/20200511233229.27745-3-keescook@chromium.org/ Co-developed-by: Kees Cook Signed-off-by: Kees Cook --- fs/pstore/Kconfig | 64 +++++++ fs/pstore/Makefile | 3 + fs/pstore/blk.c | 434 +++++++++++++++++++++++++++++++++++++++++++++ include/linux/pstore_blk.h | 51 ++++++ 4 files changed, 552 insertions(+) create mode 100644 fs/pstore/blk.c create mode 100644 include/linux/pstore_blk.h (limited to 'include/linux') diff --git a/fs/pstore/Kconfig b/fs/pstore/Kconfig index 98d2457bdd9f..958bec75f907 100644 --- a/fs/pstore/Kconfig +++ b/fs/pstore/Kconfig @@ -160,3 +160,67 @@ config PSTORE_ZONE help The common layer for pstore/blk (and pstore/ram in the future) to manage storage in zones. + +config PSTORE_BLK + tristate "Log panic/oops to a block device" + depends on PSTORE + depends on BLOCK + select PSTORE_ZONE + default n + help + This enables panic and oops message to be logged to a block dev + where it can be read back at some later point. + + If unsure, say N. + +config PSTORE_BLK_BLKDEV + string "block device identifier" + depends on PSTORE_BLK + default "" + help + Which block device should be used for pstore/blk. + + It accept the following variants: + 1) device number in hexadecimal representation, + with no leading 0x, for example b302. + 2) /dev/ represents the device number of disk + 3) /dev/ represents the device number + of partition - device number of disk plus the partition number + 4) /dev/p - same as the above, this form is + used when disk name of partitioned disk ends with a digit. + 5) PARTUUID=00112233-4455-6677-8899-AABBCCDDEEFF representing the + unique id of a partition if the partition table provides it. + The UUID may be either an EFI/GPT UUID, or refer to an MSDOS + partition using the format SSSSSSSS-PP, where SSSSSSSS is a zero- + filled hex representation of the 32-bit "NT disk signature", and PP + is a zero-filled hex representation of the 1-based partition number. + 6) PARTUUID=/PARTNROFF= to select a partition in relation + to a partition with a known unique id. + 7) : major and minor number of the device separated by + a colon. + + NOTE that, both Kconfig and module parameters can configure + pstore/blk, but module parameters have priority over Kconfig. + +config PSTORE_BLK_KMSG_SIZE + int "Size in Kbytes of kmsg dump log to store" + depends on PSTORE_BLK + default 64 + help + This just sets size of kmsg dump (oops, panic, etc) log for + pstore/blk. The size is in KB and must be a multiple of 4. + + NOTE that, both Kconfig and module parameters can configure + pstore/blk, but module parameters have priority over Kconfig. + +config PSTORE_BLK_MAX_REASON + int "Maximum kmsg dump reason to store" + depends on PSTORE_BLK + default 2 + help + The maximum reason for kmsg dumps to store. The default is + 2 (KMSG_DUMP_OOPS), see include/linux/kmsg_dump.h's + enum kmsg_dump_reason for more details. + + NOTE that, both Kconfig and module parameters can configure + pstore/blk, but module parameters have priority over Kconfig. 
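Since module parameters take priority over the Kconfig defaults above,
a boot-time configuration might look like the following (the partition
path is only an example; kmsg_size is in KB, and max_reason uses the
kmsg_dump_reason values, so 2 means Oops):

	pstore_blk.blkdev=/dev/mmcblk0p7 pstore_blk.kmsg_size=64 pstore_blk.max_reason=2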
diff --git a/fs/pstore/Makefile b/fs/pstore/Makefile index 58a967cbe4af..c270467aeece 100644 --- a/fs/pstore/Makefile +++ b/fs/pstore/Makefile @@ -15,3 +15,6 @@ obj-$(CONFIG_PSTORE_RAM) += ramoops.o pstore_zone-objs += zone.o obj-$(CONFIG_PSTORE_ZONE) += pstore_zone.o + +pstore_blk-objs += blk.o +obj-$(CONFIG_PSTORE_BLK) += pstore_blk.o diff --git a/fs/pstore/blk.c b/fs/pstore/blk.c new file mode 100644 index 000000000000..8f131c6e412e --- /dev/null +++ b/fs/pstore/blk.c @@ -0,0 +1,434 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Implements pstore backend driver that write to block (or non-block) storage + * devices, using the pstore/zone API. + */ + +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + +#include +#include +#include "../../block/blk.h" +#include +#include +#include +#include +#include +#include +#include +#include + +static long kmsg_size = CONFIG_PSTORE_BLK_KMSG_SIZE; +module_param(kmsg_size, long, 0400); +MODULE_PARM_DESC(kmsg_size, "kmsg dump record size in kbytes"); + +static int max_reason = CONFIG_PSTORE_BLK_MAX_REASON; +module_param(max_reason, int, 0400); +MODULE_PARM_DESC(max_reason, + "maximum reason for kmsg dump (default 2: Oops and Panic)"); + +/* + * blkdev - the block device to use for pstore storage + * + * Usually, this will be a partition of a block device. + * + * blkdev accepts the following variants: + * 1) device number in hexadecimal representation, + * with no leading 0x, for example b302. + * 2) /dev/ represents the device number of disk + * 3) /dev/ represents the device number + * of partition - device number of disk plus the partition number + * 4) /dev/p - same as the above, that form is + * used when disk name of partitioned disk ends on a digit. + * 5) PARTUUID=00112233-4455-6677-8899-AABBCCDDEEFF representing the + * unique id of a partition if the partition table provides it. + * The UUID may be either an EFI/GPT UUID, or refer to an MSDOS + * partition using the format SSSSSSSS-PP, where SSSSSSSS is a zero- + * filled hex representation of the 32-bit "NT disk signature", and PP + * is a zero-filled hex representation of the 1-based partition number. + * 6) PARTUUID=/PARTNROFF= to select a partition in relation to + * a partition with a known unique id. + * 7) : major and minor number of the device separated by + * a colon. + */ +static char blkdev[80] = CONFIG_PSTORE_BLK_BLKDEV; +module_param_string(blkdev, blkdev, 80, 0400); +MODULE_PARM_DESC(blkdev, "block device for pstore storage"); + +/* + * All globals must only be accessed under the pstore_blk_lock + * during the register/unregister functions. + */ +static DEFINE_MUTEX(pstore_blk_lock); +static struct block_device *psblk_bdev; +static struct pstore_zone_info *pstore_zone_info; +static pstore_blk_panic_write_op blkdev_panic_write; + +struct bdev_info { + dev_t devt; + sector_t nr_sects; + sector_t start_sect; +}; + +/** + * struct pstore_device_info - back-end pstore/blk driver structure. + * + * @total_size: The total size in bytes pstore/blk can use. It must be greater + * than 4096 and be multiple of 4096. + * @flags: Refer to macro starting with PSTORE_FLAGS defined in + * linux/pstore.h. It means what front-ends this device support. + * Zero means all backends for compatible. + * @read: The general read operation. Both of the function parameters + * @size and @offset are relative value to bock device (not the + * whole disk). + * On success, the number of bytes should be returned, others + * means error. + * @write: The same as @read. 
+ * @panic_write:The write operation only used for panic case. It's optional + * if you do not care panic log. The parameters and return value + * are the same as @read. + */ +struct pstore_device_info { + unsigned long total_size; + unsigned int flags; + pstore_zone_read_op read; + pstore_zone_write_op write; + pstore_zone_write_op panic_write; +}; + +static int psblk_register_do(struct pstore_device_info *dev) +{ + int ret; + + if (!dev || !dev->total_size || !dev->read || !dev->write) + return -EINVAL; + + mutex_lock(&pstore_blk_lock); + + /* someone already registered before */ + if (pstore_zone_info) { + mutex_unlock(&pstore_blk_lock); + return -EBUSY; + } + pstore_zone_info = kzalloc(sizeof(struct pstore_zone_info), GFP_KERNEL); + if (!pstore_zone_info) { + mutex_unlock(&pstore_blk_lock); + return -ENOMEM; + } + + /* zero means not limit on which backends to attempt to store. */ + if (!dev->flags) + dev->flags = UINT_MAX; + +#define verify_size(name, alignsize, enabled) { \ + long _##name_ = (enabled) ? (name) : 0; \ + _##name_ = _##name_ <= 0 ? 0 : (_##name_ * 1024); \ + if (_##name_ & ((alignsize) - 1)) { \ + pr_info(#name " must align to %d\n", \ + (alignsize)); \ + _##name_ = ALIGN(name, (alignsize)); \ + } \ + name = _##name_ / 1024; \ + pstore_zone_info->name = _##name_; \ + } + + verify_size(kmsg_size, 4096, dev->flags & PSTORE_FLAGS_DMESG); +#undef verify_size + + pstore_zone_info->total_size = dev->total_size; + pstore_zone_info->max_reason = max_reason; + pstore_zone_info->read = dev->read; + pstore_zone_info->write = dev->write; + pstore_zone_info->panic_write = dev->panic_write; + pstore_zone_info->name = KBUILD_MODNAME; + pstore_zone_info->owner = THIS_MODULE; + + ret = register_pstore_zone(pstore_zone_info); + if (ret) { + kfree(pstore_zone_info); + pstore_zone_info = NULL; + } + mutex_unlock(&pstore_blk_lock); + return ret; +} + +static void psblk_unregister_do(struct pstore_device_info *dev) +{ + mutex_lock(&pstore_blk_lock); + if (pstore_zone_info && pstore_zone_info->read == dev->read) { + unregister_pstore_zone(pstore_zone_info); + kfree(pstore_zone_info); + pstore_zone_info = NULL; + } + mutex_unlock(&pstore_blk_lock); +} + +/** + * psblk_get_bdev() - open block device + * + * @holder: Exclusive holder identifier + * @info: Information about bdev to fill in + * + * Return: pointer to block device on success and others on error. + * + * On success, the returned block_device has reference count of one. 
+ */ +static struct block_device *psblk_get_bdev(void *holder, + struct bdev_info *info) +{ + struct block_device *bdev = ERR_PTR(-ENODEV); + fmode_t mode = FMODE_READ | FMODE_WRITE; + sector_t nr_sects; + + lockdep_assert_held(&pstore_blk_lock); + + if (pstore_zone_info) + return ERR_PTR(-EBUSY); + + if (!blkdev[0]) + return ERR_PTR(-ENODEV); + + if (holder) + mode |= FMODE_EXCL; + bdev = blkdev_get_by_path(blkdev, mode, holder); + if (IS_ERR(bdev)) { + dev_t devt; + + devt = name_to_dev_t(blkdev); + if (devt == 0) + return ERR_PTR(-ENODEV); + bdev = blkdev_get_by_dev(devt, mode, holder); + if (IS_ERR(bdev)) + return bdev; + } + + nr_sects = part_nr_sects_read(bdev->bd_part); + if (!nr_sects) { + pr_err("not enough space for '%s'\n", blkdev); + blkdev_put(bdev, mode); + return ERR_PTR(-ENOSPC); + } + + if (info) { + info->devt = bdev->bd_dev; + info->nr_sects = nr_sects; + info->start_sect = get_start_sect(bdev); + } + + return bdev; +} + +static void psblk_put_bdev(struct block_device *bdev, void *holder) +{ + fmode_t mode = FMODE_READ | FMODE_WRITE; + + lockdep_assert_held(&pstore_blk_lock); + + if (!bdev) + return; + + if (holder) + mode |= FMODE_EXCL; + blkdev_put(bdev, mode); +} + +static ssize_t psblk_generic_blk_read(char *buf, size_t bytes, loff_t pos) +{ + struct block_device *bdev = psblk_bdev; + struct file file; + struct kiocb kiocb; + struct iov_iter iter; + struct kvec iov = {.iov_base = buf, .iov_len = bytes}; + + if (!bdev) + return -ENODEV; + + memset(&file, 0, sizeof(struct file)); + file.f_mapping = bdev->bd_inode->i_mapping; + file.f_flags = O_DSYNC | __O_SYNC | O_NOATIME; + file.f_inode = bdev->bd_inode; + file_ra_state_init(&file.f_ra, file.f_mapping); + + init_sync_kiocb(&kiocb, &file); + kiocb.ki_pos = pos; + iov_iter_kvec(&iter, READ, &iov, 1, bytes); + + return generic_file_read_iter(&kiocb, &iter); +} + +static ssize_t psblk_generic_blk_write(const char *buf, size_t bytes, + loff_t pos) +{ + struct block_device *bdev = psblk_bdev; + struct iov_iter iter; + struct kiocb kiocb; + struct file file; + ssize_t ret; + struct kvec iov = {.iov_base = (void *)buf, .iov_len = bytes}; + + if (!bdev) + return -ENODEV; + + /* Console/Ftrace backend may handle buffer until flush dirty zones */ + if (in_interrupt() || irqs_disabled()) + return -EBUSY; + + memset(&file, 0, sizeof(struct file)); + file.f_mapping = bdev->bd_inode->i_mapping; + file.f_flags = O_DSYNC | __O_SYNC | O_NOATIME; + file.f_inode = bdev->bd_inode; + + init_sync_kiocb(&kiocb, &file); + kiocb.ki_pos = pos; + iov_iter_kvec(&iter, WRITE, &iov, 1, bytes); + + inode_lock(bdev->bd_inode); + ret = generic_write_checks(&kiocb, &iter); + if (ret > 0) + ret = generic_perform_write(&file, &iter, pos); + inode_unlock(bdev->bd_inode); + + if (likely(ret > 0)) { + const struct file_operations f_op = {.fsync = blkdev_fsync}; + + file.f_op = &f_op; + kiocb.ki_pos += ret; + ret = generic_write_sync(&kiocb, ret); + } + return ret; +} + +static ssize_t psblk_blk_panic_write(const char *buf, size_t size, + loff_t off) +{ + int ret; + + if (!blkdev_panic_write) + return -EOPNOTSUPP; + + /* size and off must align to SECTOR_SIZE for block device */ + ret = blkdev_panic_write(buf, off >> SECTOR_SHIFT, + size >> SECTOR_SHIFT); + return ret ? 
-EIO : size; +} + +static int __register_pstore_blk(struct pstore_blk_info *info) +{ + char bdev_name[BDEVNAME_SIZE]; + struct block_device *bdev; + struct pstore_device_info dev; + struct bdev_info binfo; + void *holder = blkdev; + int ret = -ENODEV; + + lockdep_assert_held(&pstore_blk_lock); + + /* hold bdev exclusively */ + memset(&binfo, 0, sizeof(binfo)); + bdev = psblk_get_bdev(holder, &binfo); + if (IS_ERR(bdev)) { + pr_err("failed to open '%s'!\n", blkdev); + return PTR_ERR(bdev); + } + + /* only allow driver matching the @blkdev */ + if (!binfo.devt || MAJOR(binfo.devt) != info->major) { + pr_debug("invalid major %u (expect %u)\n", + info->major, MAJOR(binfo.devt)); + ret = -ENODEV; + goto err_put_bdev; + } + + /* psblk_bdev must be assigned before register to pstore/blk */ + psblk_bdev = bdev; + blkdev_panic_write = info->panic_write; + + /* Copy back block device details. */ + info->devt = binfo.devt; + info->nr_sects = binfo.nr_sects; + info->start_sect = binfo.start_sect; + + memset(&dev, 0, sizeof(dev)); + dev.total_size = info->nr_sects << SECTOR_SHIFT; + dev.flags = info->flags; + dev.read = psblk_generic_blk_read; + dev.write = psblk_generic_blk_write; + dev.panic_write = info->panic_write ? psblk_blk_panic_write : NULL; + + ret = psblk_register_do(&dev); + if (ret) + goto err_put_bdev; + + bdevname(bdev, bdev_name); + pr_info("attached %s%s\n", bdev_name, + info->panic_write ? "" : " (no dedicated panic_write!)"); + return 0; + +err_put_bdev: + psblk_bdev = NULL; + blkdev_panic_write = NULL; + psblk_put_bdev(bdev, holder); + return ret; +} + +/** + * register_pstore_blk() - register block device to pstore/blk + * + * @info: details on the desired block device interface + * + * Return: + * * 0 - OK + * * Others - something error. + */ +int register_pstore_blk(struct pstore_blk_info *info) +{ + int ret; + + mutex_lock(&pstore_blk_lock); + ret = __register_pstore_blk(info); + mutex_unlock(&pstore_blk_lock); + + return ret; +} +EXPORT_SYMBOL_GPL(register_pstore_blk); + +static void __unregister_pstore_blk(unsigned int major) +{ + struct pstore_device_info dev = { .read = psblk_generic_blk_read }; + void *holder = blkdev; + + lockdep_assert_held(&pstore_blk_lock); + if (psblk_bdev && MAJOR(psblk_bdev->bd_dev) == major) { + psblk_unregister_do(&dev); + psblk_put_bdev(psblk_bdev, holder); + blkdev_panic_write = NULL; + psblk_bdev = NULL; + } +} + +/** + * unregister_pstore_blk() - unregister block device from pstore/blk + * + * @major: the major device number of device + */ +void unregister_pstore_blk(unsigned int major) +{ + mutex_lock(&pstore_blk_lock); + __unregister_pstore_blk(major); + mutex_unlock(&pstore_blk_lock); +} +EXPORT_SYMBOL_GPL(unregister_pstore_blk); + +static void __exit pstore_blk_exit(void) +{ + mutex_lock(&pstore_blk_lock); + if (psblk_bdev) + __unregister_pstore_blk(MAJOR(psblk_bdev->bd_dev)); + mutex_unlock(&pstore_blk_lock); +} +module_exit(pstore_blk_exit); + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("WeiXiong Liao "); +MODULE_AUTHOR("Kees Cook "); +MODULE_DESCRIPTION("pstore backend for block devices"); diff --git a/include/linux/pstore_blk.h b/include/linux/pstore_blk.h new file mode 100644 index 000000000000..4501977b1336 --- /dev/null +++ b/include/linux/pstore_blk.h @@ -0,0 +1,51 @@ +/* SPDX-License-Identifier: GPL-2.0 */ + +#ifndef __PSTORE_BLK_H_ +#define __PSTORE_BLK_H_ + +#include +#include +#include + +/** + * typedef pstore_blk_panic_write_op - panic write operation to block device + * + * @buf: the data to write + * @start_sect: start sector to 
block device + * @sects: sectors count on buf + * + * Return: On success, zero should be returned. Others mean error. + * + * Panic write to block device must be aligned to SECTOR_SIZE. + */ +typedef int (*pstore_blk_panic_write_op)(const char *buf, sector_t start_sect, + sector_t sects); + +/** + * struct pstore_blk_info - pstore/blk registration details + * + * @major: Which major device number to support with pstore/blk + * @flags: The supported PSTORE_FLAGS_* from linux/pstore.h. + * @panic_write:The write operation only used for the panic case. + * This can be NULL, but is recommended to avoid losing + * crash data if the kernel's IO path or work queues are + * broken during a panic. + * @devt: The dev_t that pstore/blk has attached to. + * @nr_sects: Number of sectors on @devt. + * @start_sect: Starting sector on @devt. + */ +struct pstore_blk_info { + unsigned int major; + unsigned int flags; + pstore_blk_panic_write_op panic_write; + + /* Filled in by pstore/blk after registration. */ + dev_t devt; + sector_t nr_sects; + sector_t start_sect; +}; + +int register_pstore_blk(struct pstore_blk_info *info); +void unregister_pstore_blk(unsigned int major); + +#endif -- cgit v1.2.3 From 0dc068265a1c5923ffebf40388fbe93050a77ad1 Mon Sep 17 00:00:00 2001 From: WeiXiong Liao Date: Wed, 25 Mar 2020 16:54:59 +0800 Subject: pstore/zone,blk: Add support for pmsg frontend Add pmsg support to pstore/blk (through pstore/zone). To enable, pmsg_size must be greater than 0 and a multiple of 4096. Signed-off-by: WeiXiong Liao Link: https://lore.kernel.org/lkml/20200511233229.27745-4-keescook@chromium.org/ Co-developed-by: Colin Ian King Signed-off-by: Colin Ian King Link: https://lore.kernel.org/lkml/20200512171932.222102-1-colin.king@canonical.com Co-developed-by: Kees Cook Signed-off-by: Kees Cook --- fs/pstore/Kconfig | 12 ++ fs/pstore/blk.c | 9 ++ fs/pstore/zone.c | 268 ++++++++++++++++++++++++++++++++++++++++++-- include/linux/pstore_zone.h | 2 + 4 files changed, 282 insertions(+), 9 deletions(-) (limited to 'include/linux') diff --git a/fs/pstore/Kconfig b/fs/pstore/Kconfig index 958bec75f907..ef01c48f0ff7 100644 --- a/fs/pstore/Kconfig +++ b/fs/pstore/Kconfig @@ -224,3 +224,15 @@ config PSTORE_BLK_MAX_REASON NOTE that, both Kconfig and module parameters can configure pstore/blk, but module parameters have priority over Kconfig. + +config PSTORE_BLK_PMSG_SIZE + int "Size in Kbytes of pmsg to store" + depends on PSTORE_BLK + depends on PSTORE_PMSG + default 64 + help + This just sets size of pmsg (pmsg_size) for pstore/blk. The size is + in KB and must be a multiple of 4. + + NOTE that, both Kconfig and module parameters can configure + pstore/blk, but module parameters have priority over Kconfig. 
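For context, records written to the pmsg zone added here arrive through
the existing pstore pmsg frontend, so typical userspace usage would be
something like this sketch (the device node and mount point come from
the pmsg frontend and the pstore filesystem, not from this patch):

	/* Append a record via the pmsg device node (sketch). */
	int fd = open("/dev/pmsg0", O_WRONLY);
	if (fd >= 0) {
		write(fd, "boot-id=42\n", 11);
		close(fd);
	}

After a reboot, the record is expected to appear under the mounted
pstore filesystem, e.g. as /sys/fs/pstore/pmsg-pstore_blk-0.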
diff --git a/fs/pstore/blk.c b/fs/pstore/blk.c index 8f131c6e412e..a9e7078f08d6 100644 --- a/fs/pstore/blk.c +++ b/fs/pstore/blk.c @@ -27,6 +27,14 @@ module_param(max_reason, int, 0400); MODULE_PARM_DESC(max_reason, "maximum reason for kmsg dump (default 2: Oops and Panic)"); +#if IS_ENABLED(CONFIG_PSTORE_PMSG) +static long pmsg_size = CONFIG_PSTORE_BLK_PMSG_SIZE; +#else +static long pmsg_size = -1; +#endif +module_param(pmsg_size, long, 0400); +MODULE_PARM_DESC(pmsg_size, "pmsg size in kbytes"); + /* * blkdev - the block device to use for pstore storage * @@ -133,6 +141,7 @@ static int psblk_register_do(struct pstore_device_info *dev) } verify_size(kmsg_size, 4096, dev->flags & PSTORE_FLAGS_DMESG); + verify_size(pmsg_size, 4096, dev->flags & PSTORE_FLAGS_PMSG); #undef verify_size pstore_zone_info->total_size = dev->total_size; diff --git a/fs/pstore/zone.c b/fs/pstore/zone.c index 362d72a24012..2f8e3b349edb 100644 --- a/fs/pstore/zone.c +++ b/fs/pstore/zone.c @@ -27,12 +27,14 @@ * * @sig: signature to indicate header (PSZ_SIG xor PSZONE-type value) * @datalen: length of data in @data + * @start: offset into @data where the beginning of the stored bytes begin * @data: zone data. */ struct psz_buffer { #define PSZ_SIG (0x43474244) /* DBGC */ uint32_t sig; atomic_t datalen; + atomic_t start; uint8_t data[]; }; @@ -88,9 +90,11 @@ struct pstore_zone { * struct psz_context - all about running state of pstore/zone * * @kpszs: kmsg dump storage zones + * @ppsz: pmsg storage zone * @kmsg_max_cnt: max count of @kpszs * @kmsg_read_cnt: counter of total read kmsg dumps * @kmsg_write_cnt: counter of total kmsg dump writes + * @pmsg_read_cnt: counter of total read pmsg zone * @oops_counter: counter of oops dumps * @panic_counter: counter of panic dumps * @recovered: whether finished recovering data from storage @@ -101,9 +105,11 @@ struct pstore_zone { */ struct psz_context { struct pstore_zone **kpszs; + struct pstore_zone *ppsz; unsigned int kmsg_max_cnt; unsigned int kmsg_read_cnt; unsigned int kmsg_write_cnt; + unsigned int pmsg_read_cnt; /* * These counters should be calculated during recovery. * It records the oops/panic times after crashes rather than boots. 
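The @start field added above is what turns a pmsg zone into a ring
buffer: once the zone has wrapped, the oldest bytes sit at @start and
the newest bytes continue from the front of @data. A minimal sketch of
the linearization the recovery path performs, with names local to this
example only:

	/* Sketch: rebuild a wrapped zone into a linear buffer of size bytes. */
	static void example_unwrap(const uint8_t *data, size_t size,
				   size_t start, uint8_t *out)
	{
		memcpy(out, data + start, size - start);	/* oldest bytes */
		memcpy(out + size - start, data, start);	/* newest bytes */
	}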
@@ -143,15 +149,20 @@ static inline int buffer_datalen(struct pstore_zone *zone) return atomic_read(&zone->buffer->datalen); } +static inline int buffer_start(struct pstore_zone *zone) +{ + return atomic_read(&zone->buffer->start); +} + static inline bool is_on_panic(void) { return atomic_read(&pstore_zone_cxt.on_panic); } -static ssize_t psz_zone_read(struct pstore_zone *zone, char *buf, +static ssize_t psz_zone_read_buffer(struct pstore_zone *zone, char *buf, size_t len, unsigned long off) { - if (!buf || !zone->buffer) + if (!buf || !zone || !zone->buffer) return -EINVAL; if (off > zone->buffer_size) return -EINVAL; @@ -160,6 +171,18 @@ static ssize_t psz_zone_read(struct pstore_zone *zone, char *buf, return len; } +static int psz_zone_read_oldbuf(struct pstore_zone *zone, char *buf, + size_t len, unsigned long off) +{ + if (!buf || !zone || !zone->oldbuf) + return -EINVAL; + if (off > zone->buffer_size) + return -EINVAL; + len = min_t(size_t, len, zone->buffer_size - off); + memcpy(buf, zone->oldbuf->data + off, len); + return 0; +} + static int psz_zone_write(struct pstore_zone *zone, enum psz_flush_mode flush_mode, const char *buf, size_t len, unsigned long off) @@ -415,6 +438,93 @@ recover_fail: return ret; } +static int psz_recover_zone(struct psz_context *cxt, struct pstore_zone *zone) +{ + struct pstore_zone_info *info = cxt->pstore_zone_info; + struct psz_buffer *oldbuf, tmpbuf; + int ret = 0; + char *buf; + ssize_t rcnt, len, start, off; + + if (!zone || zone->oldbuf) + return 0; + + if (is_on_panic()) { + /* save data as much as possible */ + psz_flush_dirty_zone(zone); + return 0; + } + + if (unlikely(!info->read)) + return -EINVAL; + + len = sizeof(struct psz_buffer); + rcnt = info->read((char *)&tmpbuf, len, zone->off); + if (rcnt != len) { + pr_debug("read zone %s failed\n", zone->name); + return (int)rcnt < 0 ? (int)rcnt : -EIO; + } + + if (tmpbuf.sig != zone->buffer->sig) { + pr_debug("no valid data in zone %s\n", zone->name); + return 0; + } + + if (zone->buffer_size < atomic_read(&tmpbuf.datalen) || + zone->buffer_size < atomic_read(&tmpbuf.start)) { + pr_info("found overtop zone: %s: off %lld, size %zu\n", + zone->name, zone->off, zone->buffer_size); + /* just keep going */ + return 0; + } + + if (!atomic_read(&tmpbuf.datalen)) { + pr_debug("found erased zone: %s: off %lld, size %zu, datalen %d\n", + zone->name, zone->off, zone->buffer_size, + atomic_read(&tmpbuf.datalen)); + return 0; + } + + pr_debug("found nice zone: %s: off %lld, size %zu, datalen %d\n", + zone->name, zone->off, zone->buffer_size, + atomic_read(&tmpbuf.datalen)); + + len = atomic_read(&tmpbuf.datalen) + sizeof(*oldbuf); + oldbuf = kzalloc(len, GFP_KERNEL); + if (!oldbuf) + return -ENOMEM; + + memcpy(oldbuf, &tmpbuf, sizeof(*oldbuf)); + buf = (char *)oldbuf + sizeof(*oldbuf); + len = atomic_read(&oldbuf->datalen); + start = atomic_read(&oldbuf->start); + off = zone->off + sizeof(*oldbuf); + + /* get part of data */ + rcnt = info->read(buf, len - start, off + start); + if (rcnt != len - start) { + pr_err("read zone %s failed\n", zone->name); + ret = (int)rcnt < 0 ? (int)rcnt : -EIO; + goto free_oldbuf; + } + + /* get the rest of data */ + rcnt = info->read(buf + len - start, start, off); + if (rcnt != start) { + pr_err("read zone %s failed\n", zone->name); + ret = (int)rcnt < 0 ? 
(int)rcnt : -EIO; + goto free_oldbuf; + } + + zone->oldbuf = oldbuf; + psz_flush_dirty_zone(zone); + return 0; + +free_oldbuf: + kfree(oldbuf); + return ret; +} + /** * psz_recovery() - recover data from storage * @cxt: the context of pstore/zone @@ -431,7 +541,12 @@ static inline int psz_recovery(struct psz_context *cxt) return 0; ret = psz_kmsg_recover(cxt); + if (ret) + goto out; + ret = psz_recover_zone(cxt, cxt->ppsz); + +out: if (unlikely(ret)) pr_err("recover failed\n"); else { @@ -446,9 +561,17 @@ static int psz_pstore_open(struct pstore_info *psi) struct psz_context *cxt = psi->data; cxt->kmsg_read_cnt = 0; + cxt->pmsg_read_cnt = 0; return 0; } +static inline bool psz_old_ok(struct pstore_zone *zone) +{ + if (zone && zone->oldbuf && atomic_read(&zone->oldbuf->datalen)) + return true; + return false; +} + static inline bool psz_ok(struct pstore_zone *zone) { if (zone && zone->buffer && buffer_datalen(zone)) @@ -473,6 +596,25 @@ static inline int psz_kmsg_erase(struct psz_context *cxt, return psz_zone_write(zone, FLUSH_META, NULL, 0, 0); } +static inline int psz_record_erase(struct psz_context *cxt, + struct pstore_zone *zone) +{ + if (unlikely(!psz_old_ok(zone))) + return 0; + + kfree(zone->oldbuf); + zone->oldbuf = NULL; + /* + * if there are new data in zone buffer, that means the old data + * are already invalid. It is no need to flush 0 (erase) to + * block device. + */ + if (!buffer_datalen(zone)) + return psz_zone_write(zone, FLUSH_META, NULL, 0, 0); + psz_flush_dirty_zone(zone); + return 0; +} + static int psz_pstore_erase(struct pstore_record *record) { struct psz_context *cxt = record->psi->data; @@ -482,6 +624,8 @@ static int psz_pstore_erase(struct pstore_record *record) if (record->id >= cxt->kmsg_max_cnt) return -EINVAL; return psz_kmsg_erase(cxt, cxt->kpszs[record->id], record); + case PSTORE_TYPE_PMSG: + return psz_record_erase(cxt, cxt->ppsz); default: return -EINVAL; } @@ -555,6 +699,55 @@ static int notrace psz_kmsg_write(struct psz_context *cxt, return 0; } +static int notrace psz_record_write(struct pstore_zone *zone, + struct pstore_record *record) +{ + size_t start, rem; + bool is_full_data = false; + char *buf; + int cnt; + + if (!zone || !record) + return -ENOSPC; + + if (atomic_read(&zone->buffer->datalen) >= zone->buffer_size) + is_full_data = true; + + cnt = record->size; + buf = record->buf; + if (unlikely(cnt > zone->buffer_size)) { + buf += cnt - zone->buffer_size; + cnt = zone->buffer_size; + } + + start = buffer_start(zone); + rem = zone->buffer_size - start; + if (unlikely(rem < cnt)) { + psz_zone_write(zone, FLUSH_PART, buf, rem, start); + buf += rem; + cnt -= rem; + start = 0; + is_full_data = true; + } + + atomic_set(&zone->buffer->start, cnt + start); + psz_zone_write(zone, FLUSH_PART, buf, cnt, start); + + /** + * psz_zone_write will set datalen as start + cnt. + * It work if actual data length lesser than buffer size. + * If data length greater than buffer size, pmsg will rewrite to + * beginning of zone, which make buffer->datalen wrongly. + * So we should reset datalen as buffer size once actual data length + * greater than buffer size. 
+ */ + if (is_full_data) { + atomic_set(&zone->buffer->datalen, zone->buffer_size); + psz_zone_write(zone, FLUSH_META, NULL, 0, 0); + } + return 0; +} + static int notrace psz_pstore_write(struct pstore_record *record) { struct psz_context *cxt = record->psi->data; @@ -566,6 +759,8 @@ static int notrace psz_pstore_write(struct pstore_record *record) switch (record->type) { case PSTORE_TYPE_DMESG: return psz_kmsg_write(cxt, record); + case PSTORE_TYPE_PMSG: + return psz_record_write(cxt->ppsz, record); default: return -EINVAL; } @@ -581,6 +776,13 @@ static struct pstore_zone *psz_read_next_zone(struct psz_context *cxt) return zone; } + if (cxt->pmsg_read_cnt == 0) { + cxt->pmsg_read_cnt++; + zone = cxt->ppsz; + if (psz_old_ok(zone)) + return zone; + } + return NULL; } @@ -631,7 +833,7 @@ static ssize_t psz_kmsg_read(struct pstore_zone *zone, return -ENOMEM; } - size = psz_zone_read(zone, record->buf + hlen, size, + size = psz_zone_read_buffer(zone, record->buf + hlen, size, sizeof(struct psz_kmsg_header)); if (unlikely(size < 0)) { kfree(record->buf); @@ -641,6 +843,32 @@ static ssize_t psz_kmsg_read(struct pstore_zone *zone, return size + hlen; } +static ssize_t psz_record_read(struct pstore_zone *zone, + struct pstore_record *record) +{ + size_t len; + struct psz_buffer *buf; + + if (!zone || !record) + return -ENOSPC; + + buf = (struct psz_buffer *)zone->oldbuf; + if (!buf) + return -ENOMSG; + + len = atomic_read(&buf->datalen); + record->buf = kmalloc(len, GFP_KERNEL); + if (!record->buf) + return -ENOMEM; + + if (unlikely(psz_zone_read_oldbuf(zone, record->buf, len, 0))) { + kfree(record->buf); + return -ENOMSG; + } + + return len; +} + static ssize_t psz_pstore_read(struct pstore_record *record) { struct psz_context *cxt = record->psi->data; @@ -665,6 +893,9 @@ next_zone: readop = psz_kmsg_read; record->id = cxt->kmsg_read_cnt - 1; break; + case PSTORE_TYPE_PMSG: + readop = psz_record_read; + break; default: goto next_zone; } @@ -720,6 +951,8 @@ static void psz_free_all_zones(struct psz_context *cxt) { if (cxt->kpszs) psz_free_zones(&cxt->kpszs, &cxt->kmsg_max_cnt); + if (cxt->ppsz) + psz_free_zone(&cxt->ppsz); } static struct pstore_zone *psz_init_zone(enum pstore_type_id type, @@ -753,8 +986,10 @@ static struct pstore_zone *psz_init_zone(enum pstore_type_id type, zone->type = type; zone->buffer_size = size - sizeof(struct psz_buffer); zone->buffer->sig = type ^ PSZ_SIG; + zone->oldbuf = NULL; atomic_set(&zone->dirty, 0); atomic_set(&zone->buffer->datalen, 0); + atomic_set(&zone->buffer->start, 0); *off += size; @@ -809,19 +1044,28 @@ static int psz_alloc_zones(struct psz_context *cxt) struct pstore_zone_info *info = cxt->pstore_zone_info; loff_t off = 0; int err; - size_t size; + size_t off_size = 0; + + off_size += info->pmsg_size; + cxt->ppsz = psz_init_zone(PSTORE_TYPE_PMSG, &off, info->pmsg_size); + if (IS_ERR(cxt->ppsz)) { + err = PTR_ERR(cxt->ppsz); + cxt->ppsz = NULL; + goto free_out; + } - size = info->total_size; - cxt->kpszs = psz_init_zones(PSTORE_TYPE_DMESG, &off, size, + cxt->kpszs = psz_init_zones(PSTORE_TYPE_DMESG, &off, + info->total_size - off_size, info->kmsg_size, &cxt->kmsg_max_cnt); if (IS_ERR(cxt->kpszs)) { err = PTR_ERR(cxt->kpszs); cxt->kpszs = NULL; - goto fail_out; + goto free_out; } return 0; -fail_out: +free_out: + psz_free_all_zones(cxt); return err; } @@ -844,7 +1088,7 @@ int register_pstore_zone(struct pstore_zone_info *info) return -EINVAL; } - if (!info->kmsg_size) { + if (!info->kmsg_size && !info->pmsg_size) { pr_warn("at least one record size must 
be non-zero\n"); return -EINVAL; } @@ -866,6 +1110,7 @@ int register_pstore_zone(struct pstore_zone_info *info) check_size(total_size, 4096); check_size(kmsg_size, SECTOR_SIZE); + check_size(pmsg_size, SECTOR_SIZE); #undef check_size @@ -891,6 +1136,7 @@ int register_pstore_zone(struct pstore_zone_info *info) pr_debug("register %s with properties:\n", info->name); pr_debug("\ttotal size : %ld Bytes\n", info->total_size); pr_debug("\tkmsg size : %ld Bytes\n", info->kmsg_size); + pr_debug("\tpmsg size : %ld Bytes\n", info->pmsg_size); err = psz_alloc_zones(cxt); if (err) { @@ -920,6 +1166,10 @@ int register_pstore_zone(struct pstore_zone_info *info) pr_cont(",panic_write"); pr_cont(")"); } + if (info->pmsg_size) { + cxt->pstore.flags |= PSTORE_FLAGS_PMSG; + pr_cont(" pmsg"); + } pr_cont("\n"); err = pstore_register(&cxt->pstore); diff --git a/include/linux/pstore_zone.h b/include/linux/pstore_zone.h index eb005d9ae40c..29c367a3bd80 100644 --- a/include/linux/pstore_zone.h +++ b/include/linux/pstore_zone.h @@ -17,6 +17,7 @@ typedef ssize_t (*pstore_zone_write_op)(const char *, size_t, loff_t); * @kmsg_size: The size of oops/panic zone. Zero means disabled, otherwise, * it must be multiple of SECTOR_SIZE(512 Bytes). * @max_reason: Maximum kmsg dump reason to store. + * @pmsg_size: The size of pmsg zone which is the same as @kmsg_size. * @read: The general read operation. Both of the function parameters * @size and @offset are relative value to storage. * On success, the number of bytes should be returned, others @@ -33,6 +34,7 @@ struct pstore_zone_info { unsigned long total_size; unsigned long kmsg_size; int max_reason; + unsigned long pmsg_size; pstore_zone_read_op read; pstore_zone_write_op write; pstore_zone_write_op panic_write; -- cgit v1.2.3 From cc9c4d1b5597167f8e8c92f6b61e1cda6d01884d Mon Sep 17 00:00:00 2001 From: WeiXiong Liao Date: Wed, 25 Mar 2020 16:55:00 +0800 Subject: pstore/zone,blk: Add console frontend support Support backend for console. To enable console backend, just make console_size be greater than 0 and a multiple of 4096. Signed-off-by: WeiXiong Liao Link: https://lore.kernel.org/lkml/20200511233229.27745-5-keescook@chromium.org/ Signed-off-by: Kees Cook --- fs/pstore/Kconfig | 18 ++++++++-- fs/pstore/blk.c | 12 ++++++- fs/pstore/zone.c | 81 ++++++++++++++++++++++++++++++++++++++++++--- include/linux/pstore_zone.h | 4 ++- 4 files changed, 105 insertions(+), 10 deletions(-) (limited to 'include/linux') diff --git a/fs/pstore/Kconfig b/fs/pstore/Kconfig index ef01c48f0ff7..126aa6c3ecf2 100644 --- a/fs/pstore/Kconfig +++ b/fs/pstore/Kconfig @@ -180,11 +180,11 @@ config PSTORE_BLK_BLKDEV help Which block device should be used for pstore/blk. - It accept the following variants: + It accepts the following variants: 1) device number in hexadecimal representation, with no leading 0x, for example b302. - 2) /dev/ represents the device number of disk - 3) /dev/ represents the device number + 2) /dev/ represents the device name of disk + 3) /dev/ represents the device name and number of partition - device number of disk plus the partition number 4) /dev/p - same as the above, this form is used when disk name of partitioned disk ends with a digit. @@ -236,3 +236,15 @@ config PSTORE_BLK_PMSG_SIZE NOTE that, both Kconfig and module parameters can configure pstore/blk, but module parameters have priority over Kconfig. 
+ +config PSTORE_BLK_CONSOLE_SIZE + int "Size in Kbytes of console log to store" + depends on PSTORE_BLK + depends on PSTORE_CONSOLE + default 64 + help + This just sets size of console log (console_size) to store via + pstore/blk. The size is in KB and must be a multiple of 4. + + NOTE that, both Kconfig and module parameters can configure + pstore/blk, but module parameters have priority over Kconfig. diff --git a/fs/pstore/blk.c b/fs/pstore/blk.c index a9e7078f08d6..082f6cdaf4bd 100644 --- a/fs/pstore/blk.c +++ b/fs/pstore/blk.c @@ -35,6 +35,14 @@ static long pmsg_size = -1; module_param(pmsg_size, long, 0400); MODULE_PARM_DESC(pmsg_size, "pmsg size in kbytes"); +#if IS_ENABLED(CONFIG_PSTORE_CONSOLE) +static long console_size = CONFIG_PSTORE_BLK_CONSOLE_SIZE; +#else +static long console_size = -1; +#endif +module_param(console_size, long, 0400); +MODULE_PARM_DESC(console_size, "console size in kbytes"); + /* * blkdev - the block device to use for pstore storage * @@ -91,7 +99,8 @@ struct bdev_info { * whole disk). * On success, the number of bytes should be returned, others * means error. - * @write: The same as @read. + * @write: The same as @read, but the following error number: + * -EBUSY means try to write again later. * @panic_write:The write operation only used for panic case. It's optional * if you do not care panic log. The parameters and return value * are the same as @read. @@ -142,6 +151,7 @@ static int psblk_register_do(struct pstore_device_info *dev) verify_size(kmsg_size, 4096, dev->flags & PSTORE_FLAGS_DMESG); verify_size(pmsg_size, 4096, dev->flags & PSTORE_FLAGS_PMSG); + verify_size(console_size, 4096, dev->flags & PSTORE_FLAGS_CONSOLE); #undef verify_size pstore_zone_info->total_size = dev->total_size; diff --git a/fs/pstore/zone.c b/fs/pstore/zone.c index 2f8e3b349edb..ee0b64da410d 100644 --- a/fs/pstore/zone.c +++ b/fs/pstore/zone.c @@ -91,10 +91,12 @@ struct pstore_zone { * * @kpszs: kmsg dump storage zones * @ppsz: pmsg storage zone + * @cpsz: console storage zone * @kmsg_max_cnt: max count of @kpszs * @kmsg_read_cnt: counter of total read kmsg dumps * @kmsg_write_cnt: counter of total kmsg dump writes * @pmsg_read_cnt: counter of total read pmsg zone + * @console_read_cnt: counter of total read console zone * @oops_counter: counter of oops dumps * @panic_counter: counter of panic dumps * @recovered: whether finished recovering data from storage @@ -106,10 +108,12 @@ struct pstore_zone { struct psz_context { struct pstore_zone **kpszs; struct pstore_zone *ppsz; + struct pstore_zone *cpsz; unsigned int kmsg_max_cnt; unsigned int kmsg_read_cnt; unsigned int kmsg_write_cnt; unsigned int pmsg_read_cnt; + unsigned int console_read_cnt; /* * These counters should be calculated during recovery. * It records the oops/panic times after crashes rather than boots. 
@@ -129,6 +133,9 @@ struct psz_context { }; static struct psz_context pstore_zone_cxt; +static void psz_flush_all_dirty_zones(struct work_struct *); +static DECLARE_DELAYED_WORK(psz_cleaner, psz_flush_all_dirty_zones); + /** * enum psz_flush_mode - flush mode for psz_zone_write() * @@ -237,6 +244,9 @@ static int psz_zone_write(struct pstore_zone *zone, return 0; dirty: atomic_set(&zone->dirty, true); + /* flush dirty zones nicely */ + if (wcnt == -EBUSY && !is_on_panic()) + schedule_delayed_work(&psz_cleaner, msecs_to_jiffies(500)); return -EBUSY; } @@ -293,6 +303,21 @@ static int psz_move_zone(struct pstore_zone *old, struct pstore_zone *new) return 0; } +static void psz_flush_all_dirty_zones(struct work_struct *work) +{ + struct psz_context *cxt = &pstore_zone_cxt; + int ret = 0; + + if (cxt->ppsz) + ret |= psz_flush_dirty_zone(cxt->ppsz); + if (cxt->cpsz) + ret |= psz_flush_dirty_zone(cxt->cpsz); + if (cxt->kpszs) + ret |= psz_flush_dirty_zones(cxt->kpszs, cxt->kmsg_max_cnt); + if (ret && cxt->pstore_zone_info) + schedule_delayed_work(&psz_cleaner, msecs_to_jiffies(1000)); +} + static int psz_kmsg_recover_data(struct psz_context *cxt) { struct pstore_zone_info *info = cxt->pstore_zone_info; @@ -545,6 +570,10 @@ static inline int psz_recovery(struct psz_context *cxt) goto out; ret = psz_recover_zone(cxt, cxt->ppsz); + if (ret) + goto out; + + ret = psz_recover_zone(cxt, cxt->cpsz); out: if (unlikely(ret)) @@ -562,6 +591,7 @@ static int psz_pstore_open(struct pstore_info *psi) cxt->kmsg_read_cnt = 0; cxt->pmsg_read_cnt = 0; + cxt->console_read_cnt = 0; return 0; } @@ -626,8 +656,9 @@ static int psz_pstore_erase(struct pstore_record *record) return psz_kmsg_erase(cxt, cxt->kpszs[record->id], record); case PSTORE_TYPE_PMSG: return psz_record_erase(cxt, cxt->ppsz); - default: - return -EINVAL; + case PSTORE_TYPE_CONSOLE: + return psz_record_erase(cxt, cxt->cpsz); + default: return -EINVAL; } } @@ -690,9 +721,10 @@ static int notrace psz_kmsg_write(struct psz_context *cxt, return -ENOSPC; ret = psz_kmsg_write_record(cxt, record); - if (!ret) { + if (!ret && is_on_panic()) { + /* ensure all data are flushed to storage when panic */ pr_debug("try to flush other dirty zones\n"); - psz_flush_dirty_zones(cxt->kpszs, cxt->kmsg_max_cnt); + psz_flush_all_dirty_zones(NULL); } /* always return 0 as we had handled it on buffer */ @@ -756,9 +788,18 @@ static int notrace psz_pstore_write(struct pstore_record *record) record->reason == KMSG_DUMP_PANIC) atomic_set(&cxt->on_panic, 1); + /* + * if on panic, do not write except panic records + * Fix case that panic_write prints log which wakes up console backend. 
+ */ + if (is_on_panic() && record->type != PSTORE_TYPE_DMESG) + return -EBUSY; + switch (record->type) { case PSTORE_TYPE_DMESG: return psz_kmsg_write(cxt, record); + case PSTORE_TYPE_CONSOLE: + return psz_record_write(cxt->cpsz, record); case PSTORE_TYPE_PMSG: return psz_record_write(cxt->ppsz, record); default: @@ -783,6 +824,13 @@ static struct pstore_zone *psz_read_next_zone(struct psz_context *cxt) return zone; } + if (cxt->console_read_cnt == 0) { + cxt->console_read_cnt++; + zone = cxt->cpsz; + if (psz_old_ok(zone)) + return zone; + } + return NULL; } @@ -893,6 +941,8 @@ next_zone: readop = psz_kmsg_read; record->id = cxt->kmsg_read_cnt - 1; break; + case PSTORE_TYPE_CONSOLE: + fallthrough; case PSTORE_TYPE_PMSG: readop = psz_record_read; break; @@ -953,6 +1003,8 @@ static void psz_free_all_zones(struct psz_context *cxt) psz_free_zones(&cxt->kpszs, &cxt->kmsg_max_cnt); if (cxt->ppsz) psz_free_zone(&cxt->ppsz); + if (cxt->cpsz) + psz_free_zone(&cxt->cpsz); } static struct pstore_zone *psz_init_zone(enum pstore_type_id type, @@ -1054,6 +1106,15 @@ static int psz_alloc_zones(struct psz_context *cxt) goto free_out; } + off_size += info->console_size; + cxt->cpsz = psz_init_zone(PSTORE_TYPE_CONSOLE, &off, + info->console_size); + if (IS_ERR(cxt->cpsz)) { + err = PTR_ERR(cxt->cpsz); + cxt->cpsz = NULL; + goto free_out; + } + cxt->kpszs = psz_init_zones(PSTORE_TYPE_DMESG, &off, info->total_size - off_size, info->kmsg_size, &cxt->kmsg_max_cnt); @@ -1088,7 +1149,7 @@ int register_pstore_zone(struct pstore_zone_info *info) return -EINVAL; } - if (!info->kmsg_size && !info->pmsg_size) { + if (!info->kmsg_size && !info->pmsg_size && !info->console_size) { pr_warn("at least one record size must be non-zero\n"); return -EINVAL; } @@ -1111,6 +1172,7 @@ int register_pstore_zone(struct pstore_zone_info *info) check_size(total_size, 4096); check_size(kmsg_size, SECTOR_SIZE); check_size(pmsg_size, SECTOR_SIZE); + check_size(console_size, SECTOR_SIZE); #undef check_size @@ -1137,6 +1199,7 @@ int register_pstore_zone(struct pstore_zone_info *info) pr_debug("\ttotal size : %ld Bytes\n", info->total_size); pr_debug("\tkmsg size : %ld Bytes\n", info->kmsg_size); pr_debug("\tpmsg size : %ld Bytes\n", info->pmsg_size); + pr_debug("\tconsole size : %ld Bytes\n", info->console_size); err = psz_alloc_zones(cxt); if (err) { @@ -1170,6 +1233,10 @@ int register_pstore_zone(struct pstore_zone_info *info) cxt->pstore.flags |= PSTORE_FLAGS_PMSG; pr_cont(" pmsg"); } + if (info->console_size) { + cxt->pstore.flags |= PSTORE_FLAGS_CONSOLE; + pr_cont(" console"); + } pr_cont("\n"); err = pstore_register(&cxt->pstore); @@ -1211,6 +1278,10 @@ void unregister_pstore_zone(struct pstore_zone_info *info) /* Stop incoming writes from pstore. */ pstore_unregister(&cxt->pstore); + /* Flush any pending writes. */ + psz_flush_all_dirty_zones(NULL); + flush_delayed_work(&psz_cleaner); + /* Clean up allocations. */ kfree(cxt->pstore.buf); cxt->pstore.buf = NULL; diff --git a/include/linux/pstore_zone.h b/include/linux/pstore_zone.h index 29c367a3bd80..904ee67f4ba2 100644 --- a/include/linux/pstore_zone.h +++ b/include/linux/pstore_zone.h @@ -18,11 +18,12 @@ typedef ssize_t (*pstore_zone_write_op)(const char *, size_t, loff_t); * it must be multiple of SECTOR_SIZE(512 Bytes). * @max_reason: Maximum kmsg dump reason to store. * @pmsg_size: The size of pmsg zone which is the same as @kmsg_size. + * @console_size:The size of console zone which is the same as @kmsg_size. * @read: The general read operation. 
Both of the function parameters * @size and @offset are relative value to storage. * On success, the number of bytes should be returned, others * means error. - * @write: The same as @read. + * @write: The same as @read, but -EBUSY means try to write again later. * @panic_write:The write operation only used for panic case. It's optional * if you do not care panic log. The parameters and return value * are the same as @read. @@ -35,6 +36,7 @@ struct pstore_zone_info { unsigned long kmsg_size; int max_reason; unsigned long pmsg_size; + unsigned long console_size; pstore_zone_read_op read; pstore_zone_write_op write; pstore_zone_write_op panic_write; -- cgit v1.2.3 From 34327e9fd213414b35eb70aa512c4e39b2095907 Mon Sep 17 00:00:00 2001 From: WeiXiong Liao Date: Wed, 25 Mar 2020 16:55:01 +0800 Subject: pstore/zone,blk: Add ftrace frontend support Support backend for ftrace. To enable ftrace backend, just make ftrace_size be greater than 0 and a multiple of 4096. Signed-off-by: WeiXiong Liao Link: https://lore.kernel.org/lkml/20200511233229.27745-6-keescook@chromium.org/ Co-developed-by: Colin Ian King Signed-off-by: Colin Ian King Link: https://lore.kernel.org/lkml/20200512170719.221514-1-colin.king@canonical.com Signed-off-by: Kees Cook --- fs/pstore/Kconfig | 12 +++++ fs/pstore/blk.c | 9 ++++ fs/pstore/zone.c | 114 +++++++++++++++++++++++++++++++++++++++++++- include/linux/pstore_zone.h | 2 + 4 files changed, 136 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/fs/pstore/Kconfig b/fs/pstore/Kconfig index 126aa6c3ecf2..c2237984b407 100644 --- a/fs/pstore/Kconfig +++ b/fs/pstore/Kconfig @@ -248,3 +248,15 @@ config PSTORE_BLK_CONSOLE_SIZE NOTE that, both Kconfig and module parameters can configure pstore/blk, but module parameters have priority over Kconfig. + +config PSTORE_BLK_FTRACE_SIZE + int "Size in Kbytes of ftrace log to store" + depends on PSTORE_BLK + depends on PSTORE_FTRACE + default 64 + help + This just sets size of ftrace log (ftrace_size) for pstore/blk. The + size is in KB and must be a multiple of 4. + + NOTE that, both Kconfig and module parameters can configure + pstore/blk, but module parameters have priority over Kconfig. 
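Every pstore/blk frontend size follows the same wiring, visible in the diff below: default to the Kconfig value when the matching frontend is compiled in, otherwise disable it with -1, and let a module parameter override either. A minimal sketch of that pattern for a hypothetical "foo" frontend (the CONFIG symbols and the frontend name are placeholders, not anything this series adds):

	/* default to the Kconfig value when the frontend exists, else disable */
	#if IS_ENABLED(CONFIG_PSTORE_FOO)
	static long foo_size = CONFIG_PSTORE_BLK_FOO_SIZE;
	#else
	static long foo_size = -1;
	#endif
	/* 0400: readable by root only, set once at module load time */
	module_param(foo_size, long, 0400);
	MODULE_PARM_DESC(foo_size, "foo size in kbytes");

Since the parameter is read-only after load, overriding a Kconfig default means passing the option at module load time, which is why module parameters take priority over Kconfig.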
diff --git a/fs/pstore/blk.c b/fs/pstore/blk.c index 082f6cdaf4bd..e0d95fb48428 100644 --- a/fs/pstore/blk.c +++ b/fs/pstore/blk.c @@ -43,6 +43,14 @@ static long console_size = -1; module_param(console_size, long, 0400); MODULE_PARM_DESC(console_size, "console size in kbytes"); +#if IS_ENABLED(CONFIG_PSTORE_FTRACE) +static long ftrace_size = CONFIG_PSTORE_BLK_FTRACE_SIZE; +#else +static long ftrace_size = -1; +#endif +module_param(ftrace_size, long, 0400); +MODULE_PARM_DESC(ftrace_size, "ftrace size in kbytes"); + /* * blkdev - the block device to use for pstore storage * @@ -152,6 +160,7 @@ static int psblk_register_do(struct pstore_device_info *dev) verify_size(kmsg_size, 4096, dev->flags & PSTORE_FLAGS_DMESG); verify_size(pmsg_size, 4096, dev->flags & PSTORE_FLAGS_PMSG); verify_size(console_size, 4096, dev->flags & PSTORE_FLAGS_CONSOLE); + verify_size(ftrace_size, 4096, dev->flags & PSTORE_FLAGS_FTRACE); #undef verify_size pstore_zone_info->total_size = dev->total_size; diff --git a/fs/pstore/zone.c b/fs/pstore/zone.c index ee0b64da410d..2d9f452360bb 100644 --- a/fs/pstore/zone.c +++ b/fs/pstore/zone.c @@ -92,11 +92,14 @@ struct pstore_zone { * @kpszs: kmsg dump storage zones * @ppsz: pmsg storage zone * @cpsz: console storage zone + * @fpszs: ftrace storage zones * @kmsg_max_cnt: max count of @kpszs * @kmsg_read_cnt: counter of total read kmsg dumps * @kmsg_write_cnt: counter of total kmsg dump writes * @pmsg_read_cnt: counter of total read pmsg zone * @console_read_cnt: counter of total read console zone + * @ftrace_max_cnt: max count of @fpszs + * @ftrace_read_cnt: counter of max read ftrace zone * @oops_counter: counter of oops dumps * @panic_counter: counter of panic dumps * @recovered: whether finished recovering data from storage @@ -109,11 +112,14 @@ struct psz_context { struct pstore_zone **kpszs; struct pstore_zone *ppsz; struct pstore_zone *cpsz; + struct pstore_zone **fpszs; unsigned int kmsg_max_cnt; unsigned int kmsg_read_cnt; unsigned int kmsg_write_cnt; unsigned int pmsg_read_cnt; unsigned int console_read_cnt; + unsigned int ftrace_max_cnt; + unsigned int ftrace_read_cnt; /* * These counters should be calculated during recovery. * It records the oops/panic times after crashes rather than boots. 
@@ -314,6 +320,8 @@ static void psz_flush_all_dirty_zones(struct work_struct *work) ret |= psz_flush_dirty_zone(cxt->cpsz); if (cxt->kpszs) ret |= psz_flush_dirty_zones(cxt->kpszs, cxt->kmsg_max_cnt); + if (cxt->fpszs) + ret |= psz_flush_dirty_zones(cxt->fpszs, cxt->ftrace_max_cnt); if (ret && cxt->pstore_zone_info) schedule_delayed_work(&psz_cleaner, msecs_to_jiffies(1000)); } @@ -550,6 +558,31 @@ free_oldbuf: return ret; } +static int psz_recover_zones(struct psz_context *cxt, + struct pstore_zone **zones, unsigned int cnt) +{ + int ret; + unsigned int i; + struct pstore_zone *zone; + + if (!zones) + return 0; + + for (i = 0; i < cnt; i++) { + zone = zones[i]; + if (unlikely(!zone)) + continue; + ret = psz_recover_zone(cxt, zone); + if (ret) + goto recover_fail; + } + + return 0; +recover_fail: + pr_debug("recover %s[%u] failed\n", zone->name, i); + return ret; +} + /** * psz_recovery() - recover data from storage * @cxt: the context of pstore/zone @@ -574,6 +607,10 @@ static inline int psz_recovery(struct psz_context *cxt) goto out; ret = psz_recover_zone(cxt, cxt->cpsz); + if (ret) + goto out; + + ret = psz_recover_zones(cxt, cxt->fpszs, cxt->ftrace_max_cnt); out: if (unlikely(ret)) @@ -592,6 +629,7 @@ static int psz_pstore_open(struct pstore_info *psi) cxt->kmsg_read_cnt = 0; cxt->pmsg_read_cnt = 0; cxt->console_read_cnt = 0; + cxt->ftrace_read_cnt = 0; return 0; } @@ -658,6 +696,10 @@ static int psz_pstore_erase(struct pstore_record *record) return psz_record_erase(cxt, cxt->ppsz); case PSTORE_TYPE_CONSOLE: return psz_record_erase(cxt, cxt->cpsz); + case PSTORE_TYPE_FTRACE: + if (record->id >= cxt->ftrace_max_cnt) + return -EINVAL; + return psz_record_erase(cxt, cxt->fpszs[record->id]); default: return -EINVAL; } } @@ -802,6 +844,13 @@ static int notrace psz_pstore_write(struct pstore_record *record) return psz_record_write(cxt->cpsz, record); case PSTORE_TYPE_PMSG: return psz_record_write(cxt->ppsz, record); + case PSTORE_TYPE_FTRACE: { + int zonenum = smp_processor_id(); + + if (!cxt->fpszs) + return -ENOSPC; + return psz_record_write(cxt->fpszs[zonenum], record); + } default: return -EINVAL; } @@ -817,6 +866,14 @@ static struct pstore_zone *psz_read_next_zone(struct psz_context *cxt) return zone; } + if (cxt->ftrace_read_cnt < cxt->ftrace_max_cnt) + /* + * No need psz_old_ok(). Let psz_ftrace_read() do so for + * combination. psz_ftrace_read() should traverse over + * all zones in case of some zone without data. + */ + return cxt->fpszs[cxt->ftrace_read_cnt++]; + if (cxt->pmsg_read_cnt == 0) { cxt->pmsg_read_cnt++; zone = cxt->ppsz; @@ -891,6 +948,38 @@ static ssize_t psz_kmsg_read(struct pstore_zone *zone, return size + hlen; } +/* try to combine all ftrace zones */ +static ssize_t psz_ftrace_read(struct pstore_zone *zone, + struct pstore_record *record) +{ + struct psz_context *cxt; + struct psz_buffer *buf; + int ret; + + if (!zone || !record) + return -ENOSPC; + + if (!psz_old_ok(zone)) + goto out; + + buf = (struct psz_buffer *)zone->oldbuf; + if (!buf) + return -ENOMSG; + + ret = pstore_ftrace_combine_log(&record->buf, &record->size, + (char *)buf->data, atomic_read(&buf->datalen)); + if (unlikely(ret)) + return ret; + +out: + cxt = record->psi->data; + if (cxt->ftrace_read_cnt < cxt->ftrace_max_cnt) + /* then, read next ftrace zone */ + return -ENOMSG; + record->id = 0; + return record->size ? 
record->size : -ENOMSG; +} + static ssize_t psz_record_read(struct pstore_zone *zone, struct pstore_record *record) { @@ -941,6 +1030,9 @@ next_zone: readop = psz_kmsg_read; record->id = cxt->kmsg_read_cnt - 1; break; + case PSTORE_TYPE_FTRACE: + readop = psz_ftrace_read; + break; case PSTORE_TYPE_CONSOLE: fallthrough; case PSTORE_TYPE_PMSG: @@ -1005,6 +1097,8 @@ static void psz_free_all_zones(struct psz_context *cxt) psz_free_zone(&cxt->ppsz); if (cxt->cpsz) psz_free_zone(&cxt->cpsz); + if (cxt->fpszs) + psz_free_zones(&cxt->fpszs, &cxt->ftrace_max_cnt); } static struct pstore_zone *psz_init_zone(enum pstore_type_id type, @@ -1115,6 +1209,17 @@ static int psz_alloc_zones(struct psz_context *cxt) goto free_out; } + off_size += info->ftrace_size; + cxt->fpszs = psz_init_zones(PSTORE_TYPE_FTRACE, &off, + info->ftrace_size, + info->ftrace_size / nr_cpu_ids, + &cxt->ftrace_max_cnt); + if (IS_ERR(cxt->fpszs)) { + err = PTR_ERR(cxt->fpszs); + cxt->fpszs = NULL; + goto free_out; + } + cxt->kpszs = psz_init_zones(PSTORE_TYPE_DMESG, &off, info->total_size - off_size, info->kmsg_size, &cxt->kmsg_max_cnt); @@ -1149,7 +1254,8 @@ int register_pstore_zone(struct pstore_zone_info *info) return -EINVAL; } - if (!info->kmsg_size && !info->pmsg_size && !info->console_size) { + if (!info->kmsg_size && !info->pmsg_size && !info->console_size && + !info->ftrace_size) { pr_warn("at least one record size must be non-zero\n"); return -EINVAL; } @@ -1173,6 +1279,7 @@ int register_pstore_zone(struct pstore_zone_info *info) check_size(kmsg_size, SECTOR_SIZE); check_size(pmsg_size, SECTOR_SIZE); check_size(console_size, SECTOR_SIZE); + check_size(ftrace_size, SECTOR_SIZE); #undef check_size @@ -1200,6 +1307,7 @@ int register_pstore_zone(struct pstore_zone_info *info) pr_debug("\tkmsg size : %ld Bytes\n", info->kmsg_size); pr_debug("\tpmsg size : %ld Bytes\n", info->pmsg_size); pr_debug("\tconsole size : %ld Bytes\n", info->console_size); + pr_debug("\tftrace size : %ld Bytes\n", info->ftrace_size); err = psz_alloc_zones(cxt); if (err) { @@ -1237,6 +1345,10 @@ int register_pstore_zone(struct pstore_zone_info *info) cxt->pstore.flags |= PSTORE_FLAGS_CONSOLE; pr_cont(" console"); } + if (info->ftrace_size) { + cxt->pstore.flags |= PSTORE_FLAGS_FTRACE; + pr_cont(" ftrace"); + } pr_cont("\n"); err = pstore_register(&cxt->pstore); diff --git a/include/linux/pstore_zone.h b/include/linux/pstore_zone.h index 904ee67f4ba2..6f16b0dd834a 100644 --- a/include/linux/pstore_zone.h +++ b/include/linux/pstore_zone.h @@ -19,6 +19,7 @@ typedef ssize_t (*pstore_zone_write_op)(const char *, size_t, loff_t); * @max_reason: Maximum kmsg dump reason to store. * @pmsg_size: The size of pmsg zone which is the same as @kmsg_size. * @console_size:The size of console zone which is the same as @kmsg_size. + * @ftrace_size:The size of ftrace zone which is the same as @kmsg_size. * @read: The general read operation. Both of the function parameters * @size and @offset are relative value to storage. 
* On success, the number of bytes should be returned, others @@ -37,6 +38,7 @@ struct pstore_zone_info { int max_reason; unsigned long pmsg_size; unsigned long console_size; + unsigned long ftrace_size; pstore_zone_read_op read; pstore_zone_write_op write; pstore_zone_write_op panic_write; -- cgit v1.2.3 From 9630a055256de420d528ecde9f35902a9dcd8750 Mon Sep 17 00:00:00 2001 From: Miquel Raynal Date: Tue, 19 May 2020 15:00:35 +0200 Subject: mtd: rawnand: Stop using nand_release() This helper is not very useful and very often people get confused: they use nand_release() instead of nand_cleanup(). Now that all drivers have been converted to no longer use nand_release(), let's remove this helper. Signed-off-by: Miquel Raynal Cc: Jonathan Corbet Link: https://lore.kernel.org/linux-mtd/20200519130035.1883-63-miquel.raynal@bootlin.com --- Documentation/driver-api/mtdnand.rst | 6 ++++-- drivers/mtd/nand/raw/nand_base.c | 12 ------------ include/linux/mtd/bbm.h | 2 +- include/linux/mtd/rawnand.h | 2 -- 4 files changed, 5 insertions(+), 17 deletions(-) (limited to 'include/linux') diff --git a/Documentation/driver-api/mtdnand.rst b/Documentation/driver-api/mtdnand.rst index 55447659b81f..0bf8d6ec3f54 100644 --- a/Documentation/driver-api/mtdnand.rst +++ b/Documentation/driver-api/mtdnand.rst @@ -276,8 +276,10 @@ unregisters the partitions in the MTD layer. #ifdef MODULE static void __exit board_cleanup (void) { - /* Release resources, unregister device */ - nand_release (mtd_to_nand(board_mtd)); + /* Unregister device */ + WARN_ON(mtd_device_unregister(board_mtd)); + /* Release resources */ + nand_cleanup(mtd_to_nand(board_mtd)); /* unmap physical address */ iounmap(baseaddr); diff --git a/drivers/mtd/nand/raw/nand_base.c b/drivers/mtd/nand/raw/nand_base.c index 6a6a0a36b3fd..8ce31490b9d0 100644 --- a/drivers/mtd/nand/raw/nand_base.c +++ b/drivers/mtd/nand/raw/nand_base.c @@ -6178,18 +6178,6 @@ void nand_cleanup(struct nand_chip *chip) EXPORT_SYMBOL_GPL(nand_cleanup); -/** - * nand_release - [NAND Interface] Unregister the MTD device and free resources - * held by the NAND device - * @chip: NAND chip object - */ -void nand_release(struct nand_chip *chip) -{ - mtd_device_unregister(nand_to_mtd(chip)); - nand_cleanup(chip); -} -EXPORT_SYMBOL_GPL(nand_release); - MODULE_LICENSE("GPL"); MODULE_AUTHOR("Steven J. Hill "); MODULE_AUTHOR("Thomas Gleixner "); diff --git a/include/linux/mtd/bbm.h b/include/linux/mtd/bbm.h index 886e30441c90..d890805f5494 100644 --- a/include/linux/mtd/bbm.h +++ b/include/linux/mtd/bbm.h @@ -98,7 +98,7 @@ struct nand_bbt_descr { /* * Flag set by nand_create_default_bbt_descr(), marking that the nand_bbt_descr - * was allocated dynamicaly and must be freed in nand_release(). Has no meaning + * was allocated dynamicaly and must be freed in nand_cleanup(). Has no meaning * in nand_chip.bbt_options. */ #define NAND_BBT_DYNAMICSTRUCT 0x80000000 diff --git a/include/linux/mtd/rawnand.h b/include/linux/mtd/rawnand.h index 2804c13e5662..b0b358908a47 100644 --- a/include/linux/mtd/rawnand.h +++ b/include/linux/mtd/rawnand.h @@ -1398,8 +1398,6 @@ void nand_wait_ready(struct nand_chip *chip); * sucessful nand_scan().
*/ void nand_cleanup(struct nand_chip *chip); -/* Unregister the MTD device and calls nand_cleanup() */ -void nand_release(struct nand_chip *chip); /* * External helper for controller drivers that have to implement the WAITRDY -- cgit v1.2.3 From f66a6fd0dc7cc49c891a167937e161114f48a62e Mon Sep 17 00:00:00 2001 From: Miquel Raynal Date: Tue, 26 May 2020 21:56:14 +0200 Subject: mtd: rawnand: Avoid a typedef In new code, the use of typedef is discouraged. Turn this one in the raw NAND core into a regular enumeration. Signed-off-by: Miquel Raynal Reviewed-by: Boris Brezillon Link: https://lore.kernel.org/linux-mtd/20200526195633.11543-3-miquel.raynal@bootlin.com --- drivers/mtd/nand/raw/nand_base.c | 4 ++-- include/linux/mtd/rawnand.h | 6 +++--- include/linux/platform_data/mtd-davinci.h | 2 +- include/linux/platform_data/mtd-nand-s3c2410.h | 2 +- 4 files changed, 7 insertions(+), 7 deletions(-) (limited to 'include/linux') diff --git a/drivers/mtd/nand/raw/nand_base.c b/drivers/mtd/nand/raw/nand_base.c index 8ce31490b9d0..4bf614d4b7ee 100644 --- a/drivers/mtd/nand/raw/nand_base.c +++ b/drivers/mtd/nand/raw/nand_base.c @@ -5099,8 +5099,8 @@ static int of_get_nand_ecc_mode(struct device_node *np) /* * For backward compatibility we support few obsoleted values that don't - * have their mappings into nand_ecc_modes_t anymore (they were merged - * with other enums). + * have their mappings into the nand_ecc_mode enum anymore (they were + * merged with other enums). */ if (!strcasecmp(pm, "soft_bch")) return NAND_ECC_SOFT; diff --git a/include/linux/mtd/rawnand.h b/include/linux/mtd/rawnand.h index b0b358908a47..8f662df02fdd 100644 --- a/include/linux/mtd/rawnand.h +++ b/include/linux/mtd/rawnand.h @@ -83,14 +83,14 @@ struct nand_chip; /* * Constants for ECC_MODES */ -typedef enum { +enum nand_ecc_mode { NAND_ECC_NONE, NAND_ECC_SOFT, NAND_ECC_HW, NAND_ECC_HW_SYNDROME, NAND_ECC_HW_OOB_FIRST, NAND_ECC_ON_DIE, -} nand_ecc_modes_t; +}; enum nand_ecc_algo { NAND_ECC_UNKNOWN, @@ -362,7 +362,7 @@ static const struct nand_ecc_caps __name = { \ * @write_oob: function to write chip OOB data */ struct nand_ecc_ctrl { - nand_ecc_modes_t mode; + enum nand_ecc_mode mode; enum nand_ecc_algo algo; int steps; int size; diff --git a/include/linux/platform_data/mtd-davinci.h b/include/linux/platform_data/mtd-davinci.h index 08e639e047e5..03e92c71b3fa 100644 --- a/include/linux/platform_data/mtd-davinci.h +++ b/include/linux/platform_data/mtd-davinci.h @@ -68,7 +68,7 @@ struct davinci_nand_pdata { /* platform_data */ * Newer ones also support 4-bit ECC, but are awkward * using it with large page chips. */ - nand_ecc_modes_t ecc_mode; + enum nand_ecc_mode ecc_mode; u8 ecc_bits; /* e.g. NAND_BUSWIDTH_16 */ diff --git a/include/linux/platform_data/mtd-nand-s3c2410.h b/include/linux/platform_data/mtd-nand-s3c2410.h index deb849bcf0ec..08675b16f9e1 100644 --- a/include/linux/platform_data/mtd-nand-s3c2410.h +++ b/include/linux/platform_data/mtd-nand-s3c2410.h @@ -49,7 +49,7 @@ struct s3c2410_platform_nand { unsigned int ignore_unset_ecc:1; - nand_ecc_modes_t ecc_mode; + enum nand_ecc_mode ecc_mode; int nr_sets; struct s3c2410_nand_set *sets; -- cgit v1.2.3 From 74e24cd2376d9cc4cfc6edad8610780b79fd5def Mon Sep 17 00:00:00 2001 From: Miquel Raynal Date: Tue, 26 May 2020 21:56:15 +0200 Subject: mtd: rawnand: Drop OOB_FIRST placement scheme This scheme has been introduced for the Davinci controller and means that the OOB area must be read *before* the rest of the data. 
This has nothing to do with the ECC in OOB placement as it could be understood and most importantly, there is no point in having this function out of the Davinci NAND controller driver. A DT property for this scheme has been added but never used, even by the Davinci driver which only uses this scheme to change the default nand_read_page(). Move the main read_page() helper into the Davinci driver and remove the remaining boilerplate. Signed-off-by: Miquel Raynal Reviewed-by: Boris Brezillon Link: https://lore.kernel.org/linux-mtd/20200526195633.11543-4-miquel.raynal@bootlin.com --- drivers/mtd/nand/raw/davinci_nand.c | 126 ++++++++++++++++++++++++++++-------- drivers/mtd/nand/raw/nand_base.c | 81 ----------------------- include/linux/mtd/rawnand.h | 1 - 3 files changed, 98 insertions(+), 110 deletions(-) (limited to 'include/linux') diff --git a/drivers/mtd/nand/raw/davinci_nand.c b/drivers/mtd/nand/raw/davinci_nand.c index 52b87304954b..d975a62caaa5 100644 --- a/drivers/mtd/nand/raw/davinci_nand.c +++ b/drivers/mtd/nand/raw/davinci_nand.c @@ -371,6 +371,77 @@ correct: return corrected; } +/** + * nand_read_page_hwecc_oob_first - hw ecc, read oob first + * @chip: nand chip info structure + * @buf: buffer to store read data + * @oob_required: caller requires OOB data read to chip->oob_poi + * @page: page number to read + * + * Hardware ECC for large page chips, require OOB to be read first. For this + * ECC mode, the write_page method is re-used from ECC_HW. These methods + * read/write ECC from the OOB area, unlike the ECC_HW_SYNDROME support with + * multiple ECC steps, follows the "infix ECC" scheme and reads/writes ECC from + * the data area, by overwriting the NAND manufacturer bad block markings. + */ +static int nand_davinci_read_page_hwecc_oob_first(struct nand_chip *chip, + uint8_t *buf, + int oob_required, int page) +{ + struct mtd_info *mtd = nand_to_mtd(chip); + int i, eccsize = chip->ecc.size, ret; + int eccbytes = chip->ecc.bytes; + int eccsteps = chip->ecc.steps; + uint8_t *p = buf; + uint8_t *ecc_code = chip->ecc.code_buf; + uint8_t *ecc_calc = chip->ecc.calc_buf; + unsigned int max_bitflips = 0; + + /* Read the OOB area first */ + ret = nand_read_oob_op(chip, page, 0, chip->oob_poi, mtd->oobsize); + if (ret) + return ret; + + ret = nand_read_page_op(chip, page, 0, NULL, 0); + if (ret) + return ret; + + ret = mtd_ooblayout_get_eccbytes(mtd, ecc_code, chip->oob_poi, 0, + chip->ecc.total); + if (ret) + return ret; + + for (i = 0; eccsteps; eccsteps--, i += eccbytes, p += eccsize) { + int stat; + + chip->ecc.hwctl(chip, NAND_ECC_READ); + + ret = nand_read_data_op(chip, p, eccsize, false, false); + if (ret) + return ret; + + chip->ecc.calculate(chip, p, &ecc_calc[i]); + + stat = chip->ecc.correct(chip, p, &ecc_code[i], NULL); + if (stat == -EBADMSG && + (chip->ecc.options & NAND_ECC_GENERIC_ERASED_CHECK)) { + /* check for empty pages with bitflips */ + stat = nand_check_erased_ecc_chunk(p, eccsize, + &ecc_code[i], + eccbytes, NULL, 0, + chip->ecc.strength); + } + + if (stat < 0) { + mtd->ecc_stats.failed++; + } else { + mtd->ecc_stats.corrected += stat; + max_bitflips = max_t(unsigned int, max_bitflips, stat); + } + } + return max_bitflips; +} + /*----------------------------------------------------------------------*/ /* An ECC layout for using 4-bit ECC with small-page flash, storing @@ -530,6 +601,13 @@ static int davinci_nand_attach_chip(struct nand_chip *chip) break; case NAND_ECC_HW: if (pdata->ecc_bits == 4) { + int chunks = mtd->writesize / 512; + + if (!chunks || 
mtd->oobsize < 16) { + dev_dbg(&info->pdev->dev, "too small\n"); + return -EINVAL; + } + /* * No sanity checks: CPUs must support this, * and the chips may not use NAND_BUSWIDTH_16. @@ -552,6 +630,26 @@ static int davinci_nand_attach_chip(struct nand_chip *chip) info->chip.ecc.bytes = 10; info->chip.ecc.options = NAND_ECC_GENERIC_ERASED_CHECK; info->chip.ecc.algo = NAND_ECC_BCH; + + /* + * Update ECC layout if needed ... for 1-bit HW ECC, the + * default is OK, but it allocates 6 bytes when only 3 + * are needed (for each 512 bytes). For 4-bit HW ECC, + * the default is not usable: 10 bytes needed, not 6. + * + * For small page chips, preserve the manufacturer's + * badblock marking data ... and make sure a flash BBT + * table marker fits in the free bytes. + */ + if (chunks == 1) { + mtd_set_ooblayout(mtd, + &hwecc4_small_ooblayout_ops); + } else if (chunks == 4 || chunks == 8) { + mtd_set_ooblayout(mtd, &nand_ooblayout_lp_ops); + info->chip.ecc.read_page = nand_davinci_read_page_hwecc_oob_first; + } else { + return -EIO; + } } else { /* 1bit ecc hamming */ info->chip.ecc.calculate = nand_davinci_calculate_1bit; @@ -567,34 +665,6 @@ static int davinci_nand_attach_chip(struct nand_chip *chip) return -EINVAL; } - /* - * Update ECC layout if needed ... for 1-bit HW ECC, the default - * is OK, but it allocates 6 bytes when only 3 are needed (for - * each 512 bytes). For the 4-bit HW ECC, that default is not - * usable: 10 bytes are needed, not 6. - */ - if (pdata->ecc_bits == 4) { - int chunks = mtd->writesize / 512; - - if (!chunks || mtd->oobsize < 16) { - dev_dbg(&info->pdev->dev, "too small\n"); - return -EINVAL; - } - - /* For small page chips, preserve the manufacturer's - * badblock marking data ... and make sure a flash BBT - * table marker fits in the free bytes. - */ - if (chunks == 1) { - mtd_set_ooblayout(mtd, &hwecc4_small_ooblayout_ops); - } else if (chunks == 4 || chunks == 8) { - mtd_set_ooblayout(mtd, &nand_ooblayout_lp_ops); - info->chip.ecc.mode = NAND_ECC_HW_OOB_FIRST; - } else { - return -EIO; - } - } - return ret; } diff --git a/drivers/mtd/nand/raw/nand_base.c b/drivers/mtd/nand/raw/nand_base.c index 4bf614d4b7ee..d7c791de3e88 100644 --- a/drivers/mtd/nand/raw/nand_base.c +++ b/drivers/mtd/nand/raw/nand_base.c @@ -3075,76 +3075,6 @@ static int nand_read_page_hwecc(struct nand_chip *chip, uint8_t *buf, return max_bitflips; } -/** - * nand_read_page_hwecc_oob_first - [REPLACEABLE] hw ecc, read oob first - * @chip: nand chip info structure - * @buf: buffer to store read data - * @oob_required: caller requires OOB data read to chip->oob_poi - * @page: page number to read - * - * Hardware ECC for large page chips, require OOB to be read first. For this - * ECC mode, the write_page method is re-used from ECC_HW. These methods - * read/write ECC from the OOB area, unlike the ECC_HW_SYNDROME support with - * multiple ECC steps, follows the "infix ECC" scheme and reads/writes ECC from - * the data area, by overwriting the NAND manufacturer bad block markings. 
- */ -static int nand_read_page_hwecc_oob_first(struct nand_chip *chip, uint8_t *buf, - int oob_required, int page) -{ - struct mtd_info *mtd = nand_to_mtd(chip); - int i, eccsize = chip->ecc.size, ret; - int eccbytes = chip->ecc.bytes; - int eccsteps = chip->ecc.steps; - uint8_t *p = buf; - uint8_t *ecc_code = chip->ecc.code_buf; - uint8_t *ecc_calc = chip->ecc.calc_buf; - unsigned int max_bitflips = 0; - - /* Read the OOB area first */ - ret = nand_read_oob_op(chip, page, 0, chip->oob_poi, mtd->oobsize); - if (ret) - return ret; - - ret = nand_read_page_op(chip, page, 0, NULL, 0); - if (ret) - return ret; - - ret = mtd_ooblayout_get_eccbytes(mtd, ecc_code, chip->oob_poi, 0, - chip->ecc.total); - if (ret) - return ret; - - for (i = 0; eccsteps; eccsteps--, i += eccbytes, p += eccsize) { - int stat; - - chip->ecc.hwctl(chip, NAND_ECC_READ); - - ret = nand_read_data_op(chip, p, eccsize, false, false); - if (ret) - return ret; - - chip->ecc.calculate(chip, p, &ecc_calc[i]); - - stat = chip->ecc.correct(chip, p, &ecc_code[i], NULL); - if (stat == -EBADMSG && - (chip->ecc.options & NAND_ECC_GENERIC_ERASED_CHECK)) { - /* check for empty pages with bitflips */ - stat = nand_check_erased_ecc_chunk(p, eccsize, - &ecc_code[i], eccbytes, - NULL, 0, - chip->ecc.strength); - } - - if (stat < 0) { - mtd->ecc_stats.failed++; - } else { - mtd->ecc_stats.corrected += stat; - max_bitflips = max_t(unsigned int, max_bitflips, stat); - } - } - return max_bitflips; -} - /** * nand_read_page_syndrome - [REPLACEABLE] hardware ECC syndrome based page read * @chip: nand chip info structure @@ -5080,7 +5010,6 @@ static const char * const nand_ecc_modes[] = { [NAND_ECC_SOFT] = "soft", [NAND_ECC_HW] = "hw", [NAND_ECC_HW_SYNDROME] = "hw_syndrome", - [NAND_ECC_HW_OOB_FIRST] = "hw_oob_first", [NAND_ECC_ON_DIE] = "on-die", }; @@ -5829,16 +5758,6 @@ static int nand_scan_tail(struct nand_chip *chip) */ switch (ecc->mode) { - case NAND_ECC_HW_OOB_FIRST: - /* Similar to NAND_ECC_HW, but a separate read_page handle */ - if (!ecc->calculate || !ecc->correct || !ecc->hwctl) { - WARN(1, "No ECC functions supplied; hardware ECC not possible\n"); - ret = -EINVAL; - goto err_nand_manuf_cleanup; - } - if (!ecc->read_page) - ecc->read_page = nand_read_page_hwecc_oob_first; - fallthrough; case NAND_ECC_HW: /* Use standard hwecc read page function? */ if (!ecc->read_page) diff --git a/include/linux/mtd/rawnand.h b/include/linux/mtd/rawnand.h index 8f662df02fdd..605d64ead3a8 100644 --- a/include/linux/mtd/rawnand.h +++ b/include/linux/mtd/rawnand.h @@ -88,7 +88,6 @@ enum nand_ecc_mode { NAND_ECC_SOFT, NAND_ECC_HW, NAND_ECC_HW_SYNDROME, - NAND_ECC_HW_OOB_FIRST, NAND_ECC_ON_DIE, }; -- cgit v1.2.3 From 86f2b225adf4ecd2edfdc8541a7342645556ac3b Mon Sep 17 00:00:00 2001 From: Miquel Raynal Date: Tue, 26 May 2020 21:56:18 +0200 Subject: mtd: rawnand: Add an invalid ECC mode to discriminate with valid ones NAND ECC modes (or providers) have their own enumeration but, unlike their algorithms counterpart, there is no invalid or uninitialized value to discriminate between an error and having chosen a no-ECC situation. Add an "invalid" entry for this purpose. 
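Because the new entry is added first, it takes the enumerator value 0, so a zero-initialized mode field no longer reads as a deliberate NAND_ECC_NONE choice. A small illustrative helper (not part of the patch) shows the discrimination this enables:

	/* distinguish "never configured" from an explicit no-ECC request */
	static int ecc_mode_is_configured(enum nand_ecc_mode mode)
	{
		if (mode == NAND_ECC_INVALID)
			return -EINVAL;	/* nothing was chosen */
		return mode != NAND_ECC_NONE; /* 0: no ECC, 1: real provider */
	}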
Signed-off-by: Miquel Raynal Reviewed-by: Boris Brezillon Link: https://lore.kernel.org/linux-mtd/20200526195633.11543-7-miquel.raynal@bootlin.com --- drivers/mtd/nand/raw/nand_base.c | 2 +- include/linux/mtd/rawnand.h | 1 + 2 files changed, 2 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/drivers/mtd/nand/raw/nand_base.c b/drivers/mtd/nand/raw/nand_base.c index d46d34d0d8be..45124dbb1835 100644 --- a/drivers/mtd/nand/raw/nand_base.c +++ b/drivers/mtd/nand/raw/nand_base.c @@ -5022,7 +5022,7 @@ static int of_get_nand_ecc_mode(struct device_node *np) if (err < 0) return err; - for (i = 0; i < ARRAY_SIZE(nand_ecc_modes); i++) + for (i = NAND_ECC_NONE; i < ARRAY_SIZE(nand_ecc_modes); i++) if (!strcasecmp(pm, nand_ecc_modes[i])) return i; diff --git a/include/linux/mtd/rawnand.h b/include/linux/mtd/rawnand.h index 605d64ead3a8..65b1c1c18b41 100644 --- a/include/linux/mtd/rawnand.h +++ b/include/linux/mtd/rawnand.h @@ -84,6 +84,7 @@ struct nand_chip; * Constants for ECC_MODES */ enum nand_ecc_mode { + NAND_ECC_INVALID, NAND_ECC_NONE, NAND_ECC_SOFT, NAND_ECC_HW, -- cgit v1.2.3 From 372b38ea5911fc2500f0291b00140e80a26c0e36 Mon Sep 17 00:00:00 2001 From: Tova Mussai Date: Thu, 28 May 2020 21:34:26 +0200 Subject: ieee80211: definitions for reduced neighbor reports Add the necessary definitions to parse reduced neighbor report elements. Signed-off-by: Tova Mussai [change struct name, remove IEEE80211_MIN_AP_NEIGHBOR_INFO_SIZE] Link: https://lore.kernel.org/r/20200528213443.4f9154461c06.I518d9898ad982f838112ea9ca14a20d6bbb16394@changeid Signed-off-by: Johannes Berg --- include/linux/ieee80211.h | 28 ++++++++++++++++++++++++++++ 1 file changed, 28 insertions(+) (limited to 'include/linux') diff --git a/include/linux/ieee80211.h b/include/linux/ieee80211.h index 0320ca4c7d28..c29184bf9416 100644 --- a/include/linux/ieee80211.h +++ b/include/linux/ieee80211.h @@ -2754,6 +2754,8 @@ enum ieee80211_eid { WLAN_EID_QUIET_CHANNEL = 198, WLAN_EID_OPMODE_NOTIF = 199, + WLAN_EID_REDUCED_NEIGHBOR_REPORT = 201, + WLAN_EID_S1G_BCN_COMPAT = 213, WLAN_EID_S1G_SHORT_BCN_INTERVAL = 214, WLAN_EID_S1G_CAPABILITIES = 217, @@ -3675,4 +3677,30 @@ static inline bool for_each_element_completed(const struct element *element, #define WLAN_RSNX_CAPA_PROTECTED_TWT BIT(4) #define WLAN_RSNX_CAPA_SAE_H2E BIT(5) +/* + * reduced neighbor report, based on Draft P802.11ax_D5.0, + * section 9.4.2.170 + */ +#define IEEE80211_AP_INFO_TBTT_HDR_TYPE 0x03 +#define IEEE80211_AP_INFO_TBTT_HDR_FILTERED 0x04 +#define IEEE80211_AP_INFO_TBTT_HDR_COLOC 0x08 +#define IEEE80211_AP_INFO_TBTT_HDR_COUNT 0xF0 +#define IEEE80211_TBTT_INFO_OFFSET_BSSID_BSS_PARAM 8 +#define IEEE80211_TBTT_INFO_OFFSET_BSSID_SSSID_BSS_PARAM 12 + +#define IEEE80211_RNR_TBTT_PARAMS_OCT_RECOMMENDED 0x01 +#define IEEE80211_RNR_TBTT_PARAMS_SAME_SSID 0x02 +#define IEEE80211_RNR_TBTT_PARAMS_MULTI_BSSID 0x04 +#define IEEE80211_RNR_TBTT_PARAMS_TRANSMITTED_BSSID 0x08 +#define IEEE80211_RNR_TBTT_PARAMS_COLOC_ESS 0x10 +#define IEEE80211_RNR_TBTT_PARAMS_PROBE_ACTIVE 0x20 +#define IEEE80211_RNR_TBTT_PARAMS_COLOC_AP 0x40 + +struct ieee80211_neighbor_ap_info { + u8 tbtt_info_hdr; + u8 tbtt_info_len; + u8 op_class; + u8 channel; +} __packed; + #endif /* LINUX_IEEE80211_H */ -- cgit v1.2.3 From 821273a5a502eebaae005557907d122d1e9b7b98 Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Thu, 28 May 2020 21:34:27 +0200 Subject: ieee80211: add code to obtain and parse 6 GHz operation field Add some code to obtain and parse the 6 GHz operation field inside the HE operation 
element. While at it, fix the required length using sizeof() the new struct, which is 5 instead of 4 now. Link: https://lore.kernel.org/r/20200528213443.42ca72c45ca9.Id74bc1b03da9ea6574f9bc70deeb60dfc1634359@changeid Signed-off-by: Johannes Berg --- include/linux/ieee80211.h | 52 ++++++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 51 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/ieee80211.h b/include/linux/ieee80211.h index c29184bf9416..2bd9e757167d 100644 --- a/include/linux/ieee80211.h +++ b/include/linux/ieee80211.h @@ -2209,6 +2209,28 @@ ieee80211_he_ppe_size(u8 ppe_thres_hdr, const u8 *phy_cap_info) #define IEEE80211_HE_OPERATION_PARTIAL_BSS_COLOR 0x40000000 #define IEEE80211_HE_OPERATION_BSS_COLOR_DISABLED 0x80000000 +/** + * ieee80211_he_6ghz_oper - HE 6 GHz operation Information field + * @primary: primary channel + * @control: control flags + * @ccfs0: channel center frequency segment 0 + * @ccfs1: channel center frequency segment 1 + * @minrate: minimum rate (in 1 Mbps units) + */ +struct ieee80211_he_6ghz_oper { + u8 primary; +#define IEEE80211_HE_6GHZ_OPER_CTRL_CHANWIDTH 0x3 +#define IEEE80211_HE_6GHZ_OPER_CTRL_CHANWIDTH_20MHZ 0 +#define IEEE80211_HE_6GHZ_OPER_CTRL_CHANWIDTH_40MHZ 1 +#define IEEE80211_HE_6GHZ_OPER_CTRL_CHANWIDTH_80MHZ 2 +#define IEEE80211_HE_6GHZ_OPER_CTRL_CHANWIDTH_160MHZ 3 +#define IEEE80211_HE_6GHZ_OPER_CTRL_DUP_BEACON 0x4 + u8 control; + u8 ccfs0; + u8 ccfs1; + u8 minrate; +} __packed; + /* * ieee80211_he_oper_size - calculate 802.11ax HE Operations IE size * @he_oper_ie: byte data of the He Operations IE, stating from the byte @@ -2235,7 +2257,7 @@ ieee80211_he_oper_size(const u8 *he_oper_ie) if (he_oper_params & IEEE80211_HE_OPERATION_CO_HOSTED_BSS) oper_len++; if (he_oper_params & IEEE80211_HE_OPERATION_6GHZ_OP_INFO) - oper_len += 4; + oper_len += sizeof(struct ieee80211_he_6ghz_oper); /* Add the first byte (extension ID) to the total length */ oper_len++; @@ -2243,6 +2265,34 @@ ieee80211_he_oper_size(const u8 *he_oper_ie) return oper_len; } +/** + * ieee80211_he_6ghz_oper - obtain 6 GHz operation field + * @he_oper: HE operation element (must be pre-validated for size) + * but may be %NULL + * + * Return: a pointer to the 6 GHz operation field, or %NULL + */ +static inline const struct ieee80211_he_6ghz_oper * +ieee80211_he_6ghz_oper(const struct ieee80211_he_operation *he_oper) +{ + const u8 *ret = (void *)&he_oper->optional; + u32 he_oper_params; + + if (!he_oper) + return NULL; + + he_oper_params = le32_to_cpu(he_oper->he_oper_params); + + if (!(he_oper_params & IEEE80211_HE_OPERATION_6GHZ_OP_INFO)) + return NULL; + if (he_oper_params & IEEE80211_HE_OPERATION_VHT_OPER_INFO) + ret += 3; + if (he_oper_params & IEEE80211_HE_OPERATION_CO_HOSTED_BSS) + ret++; + + return (void *)ret; +} + /* HE Spatial Reuse defines */ #define IEEE80211_HE_SPR_NON_SRG_OFFSET_PRESENT 0x4 #define IEEE80211_HE_SPR_SRG_INFORMATION_PRESENT 0x8 -- cgit v1.2.3 From 8b30808d9be4183fab17f0b0e68eea88c94ff15a Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Thu, 28 May 2020 21:34:28 +0200 Subject: ieee80211: add HE ext EIDs and 6 GHz capability defines Add the HE extended element IDs and the definitions for the HE 6 GHz band capabilities element, from Draft 5.0. 
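Since the capability word added below is a little-endian 16-bit field carved into masked subfields, a consumer converts it to CPU order and masks out the subfield it needs. A minimal sketch, assuming a pre-validated element body (the helper name is illustrative):

	#include <linux/bitfield.h>

	/* extract the SM power save subfield from the HE 6 GHz
	 * band capabilities field */
	static u8 he_6ghz_capa_sm_ps(const struct ieee80211_he_6ghz_capa *capa)
	{
		u16 val = le16_to_cpu(capa->capa);

		return FIELD_GET(IEEE80211_HE_6GHZ_CAP_SM_PS, val);
	}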
Link: https://lore.kernel.org/r/20200528213443.1a6689fe093f.Ifdc5400fb01779351354daf38663ebeea03c9ad9@changeid Signed-off-by: Johannes Berg --- include/linux/ieee80211.h | 28 ++++++++++++++++++++++++++++ 1 file changed, 28 insertions(+) (limited to 'include/linux') diff --git a/include/linux/ieee80211.h b/include/linux/ieee80211.h index 2bd9e757167d..9580dfd9e2d1 100644 --- a/include/linux/ieee80211.h +++ b/include/linux/ieee80211.h @@ -2839,9 +2839,19 @@ enum ieee80211_eid_ext { WLAN_EID_EXT_UORA = 37, WLAN_EID_EXT_HE_MU_EDCA = 38, WLAN_EID_EXT_HE_SPR = 39, + WLAN_EID_EXT_NDP_FEEDBACK_REPORT_PARAMSET = 41, + WLAN_EID_EXT_BSS_COLOR_CHG_ANN = 42, + WLAN_EID_EXT_QUIET_TIME_PERIOD_SETUP = 43, + WLAN_EID_EXT_ESS_REPORT = 45, + WLAN_EID_EXT_OPS = 46, + WLAN_EID_EXT_HE_BSS_LOAD = 47, WLAN_EID_EXT_MAX_CHANNEL_SWITCH_TIME = 52, WLAN_EID_EXT_MULTIPLE_BSSID_CONFIGURATION = 55, WLAN_EID_EXT_NON_INHERITANCE = 56, + WLAN_EID_EXT_KNOWN_BSSID = 57, + WLAN_EID_EXT_SHORT_SSID_LIST = 58, + WLAN_EID_EXT_HE_6GHZ_CAPA = 59, + WLAN_EID_EXT_UL_MU_POWER_CAPA = 60, }; /* Action category code */ @@ -3384,6 +3394,24 @@ struct ieee80211_tspec_ie { __le16 medium_time; } __packed; +struct ieee80211_he_6ghz_capa { + /* uses IEEE80211_HE_6GHZ_CAP_* below */ + __le16 capa; +} __packed; + +/* HE 6 GHz band capabilities */ +/* uses enum ieee80211_min_mpdu_spacing values */ +#define IEEE80211_HE_6GHZ_CAP_MIN_MPDU_START 0x0007 +/* uses enum ieee80211_vht_max_ampdu_length_exp values */ +#define IEEE80211_HE_6GHZ_CAP_MAX_AMPDU_LEN_EXP 0x0038 +/* uses IEEE80211_VHT_CAP_MAX_MPDU_LENGTH_* values */ +#define IEEE80211_HE_6GHZ_CAP_MAX_MPDU_LEN 0x00c0 +/* WLAN_HT_CAP_SM_PS_* values */ +#define IEEE80211_HE_6GHZ_CAP_SM_PS 0x0600 +#define IEEE80211_HE_6GHZ_CAP_RD_RESPONDER 0x0800 +#define IEEE80211_HE_6GHZ_CAP_RX_ANTPAT_CONS 0x1000 +#define IEEE80211_HE_6GHZ_CAP_TX_ANTPAT_CONS 0x2000 + /** * ieee80211_get_qos_ctl - get pointer to qos control bytes * @hdr: the frame -- cgit v1.2.3 From 3b3ec3d52e8f72ec8c40477b96f23440a89000be Mon Sep 17 00:00:00 2001 From: Shaul Triebitz Date: Thu, 28 May 2020 21:34:37 +0200 Subject: mac80211: check the correct bit for EMA AP An AP supporting EMA (Enhanced Multi-BSSID advertisement) should set bit 83 in the extended capabilities IE (9.4.2.26 in the 802.11ax D5 spec). So the *3rd* bit of the 10th byte should be checked. Also, in one place, the wrong byte was checked. (cfg80211_find_ie returns a pointer to the beginning of the IE, so the data really starts at ie[2], so the 10th byte should be ie[12]. To avoid this confusion, use cfg80211_find_elem instead). 
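Spelled out: bit 83 sits in 0-based byte 83 / 8 = 10 at bit position 83 % 8 = 3, hence BIT(3) tested against the 10th payload byte. A short sketch mirroring the corrected check in the diff below (the helper name is illustrative):

	/* elem->data points at the element payload, so the byte holding
	 * bit 83 is data[10]; with a raw IE pointer the same byte is
	 * ie[12] because ie[0]/ie[1] are the element ID and length */
	static bool ext_capa_has_ema(const struct element *elem)
	{
		return elem && elem->datalen >= 11 &&
		       (elem->data[10] & WLAN_EXT_CAPA11_EMA_SUPPORT);
	}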
Signed-off-by: Shaul Triebitz Link: https://lore.kernel.org/r/20200528213443.4316121fa2a3.I9745582f8d41ad8e689dac0fefcd70b276d7c1ea@changeid Signed-off-by: Johannes Berg --- include/linux/ieee80211.h | 2 +- net/mac80211/mlme.c | 18 +++++++++--------- 2 files changed, 10 insertions(+), 10 deletions(-) (limited to 'include/linux') diff --git a/include/linux/ieee80211.h b/include/linux/ieee80211.h index 9580dfd9e2d1..1ecfd19f836d 100644 --- a/include/linux/ieee80211.h +++ b/include/linux/ieee80211.h @@ -3082,7 +3082,7 @@ enum ieee80211_tdls_actioncode { #define WLAN_EXT_CAPA10_OBSS_NARROW_BW_RU_TOLERANCE_SUPPORT BIT(7) /* Defines support for enhanced multi-bssid advertisement*/ -#define WLAN_EXT_CAPA11_EMA_SUPPORT BIT(1) +#define WLAN_EXT_CAPA11_EMA_SUPPORT BIT(3) /* TDLS specific payload type in the LLC/SNAP header */ #define WLAN_TDLS_SNAP_RFTYPE 0x2 diff --git a/net/mac80211/mlme.c b/net/mac80211/mlme.c index bc558d1d20fc..c534cd1bb9cd 100644 --- a/net/mac80211/mlme.c +++ b/net/mac80211/mlme.c @@ -5596,7 +5596,7 @@ int ieee80211_mgd_assoc(struct ieee80211_sub_if_data *sdata, assoc_data->timeout_started = true; assoc_data->need_beacon = true; } else if (beacon_ies) { - const u8 *ie; + const struct element *elem; u8 dtim_count = 0; ieee80211_get_dtim(beacon_ies, &dtim_count, @@ -5613,15 +5613,15 @@ int ieee80211_mgd_assoc(struct ieee80211_sub_if_data *sdata, sdata->vif.bss_conf.sync_dtim_count = dtim_count; } - ie = cfg80211_find_ext_ie(WLAN_EID_EXT_MULTIPLE_BSSID_CONFIGURATION, - beacon_ies->data, beacon_ies->len); - if (ie && ie[1] >= 3) - sdata->vif.bss_conf.profile_periodicity = ie[4]; + elem = cfg80211_find_ext_elem(WLAN_EID_EXT_MULTIPLE_BSSID_CONFIGURATION, + beacon_ies->data, beacon_ies->len); + if (elem && elem->datalen >= 3) + sdata->vif.bss_conf.profile_periodicity = elem->data[2]; - ie = cfg80211_find_ie(WLAN_EID_EXT_CAPABILITY, - beacon_ies->data, beacon_ies->len); - if (ie && ie[1] >= 11 && - (ie[10] & WLAN_EXT_CAPA11_EMA_SUPPORT)) + elem = cfg80211_find_elem(WLAN_EID_EXT_CAPABILITY, + beacon_ies->data, beacon_ies->len); + if (elem && elem->datalen >= 11 && + (elem->data[10] & WLAN_EXT_CAPA11_EMA_SUPPORT)) sdata->vif.bss_conf.ema_ap = true; } else { assoc_data->timeout = jiffies; -- cgit v1.2.3 From 3f19b2ab97a97b413c24b66c67ae16daa4f56c35 Mon Sep 17 00:00:00 2001 From: David Howells Date: Fri, 1 Dec 2017 11:40:16 +0000 Subject: vfs, afs, ext4: Make the inode hash table RCU searchable Make the inode hash table RCU searchable so that searches that want to access or modify an inode without taking a ref on that inode can do so without taking the inode hash table lock. The main thing this requires is some RCU annotation on the list manipulation operations. Inodes are already freed by RCU in most cases. Users of this interface must take care as the inode may be still under construction or may be being torn down around them. There are at least three instances where this can be of use: (1) Testing whether the inode number iunique() is going to return is currently unique (the iunique_lock is still held). (2) Ext4 date stamp updating. (3) AFS callback breaking. 
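A minimal sketch of the usage pattern these call sites adopt (the superblock and inode number are whatever the caller already holds; the lookup helper itself is added by this patch):

	/* check cache membership without pinning the inode; the result
	 * may be I_NEW or about to be freed, so nothing here takes a
	 * reference or sleeps */
	static bool sb_has_inode(struct super_block *sb, unsigned long ino)
	{
		bool present;

		rcu_read_lock();
		present = find_inode_by_ino_rcu(sb, ino) != NULL;
		rcu_read_unlock();

		return present;
	}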
Signed-off-by: David Howells Acked-by: Konstantin Khlebnikov cc: linux-ext4@vger.kernel.org cc: linux-afs@lists.infradead.org --- fs/afs/callback.c | 12 ++++-- fs/ext4/inode.c | 44 ++++++++++----------- fs/inode.c | 112 +++++++++++++++++++++++++++++++++++++++++++++-------- include/linux/fs.h | 3 ++ 4 files changed, 130 insertions(+), 41 deletions(-) (limited to 'include/linux') diff --git a/fs/afs/callback.c b/fs/afs/callback.c index 2dca8df1a18d..0dcbd40732d1 100644 --- a/fs/afs/callback.c +++ b/fs/afs/callback.c @@ -252,6 +252,7 @@ static void afs_break_one_callback(struct afs_server *server, struct afs_vnode *vnode; struct inode *inode; + rcu_read_lock(); read_lock(&server->cb_break_lock); hlist_for_each_entry(vi, &server->cb_volumes, srv_link) { if (vi->vid < fid->vid) @@ -287,12 +288,16 @@ static void afs_break_one_callback(struct afs_server *server, } else { data.volume = NULL; data.fid = *fid; - inode = ilookup5_nowait(cbi->sb, fid->vnode, - afs_iget5_test, &data); + + /* See if we can find a matching inode - even an I_NEW + * inode needs to be marked as it can have its callback + * broken before we finish setting up the local inode. + */ + inode = find_inode_rcu(cbi->sb, fid->vnode, + afs_iget5_test, &data); if (inode) { vnode = AFS_FS_I(inode); afs_break_callback(vnode, afs_cb_break_for_callback); - iput(inode); } else { trace_afs_cb_miss(fid, afs_cb_break_for_callback); } @@ -301,6 +306,7 @@ static void afs_break_one_callback(struct afs_server *server, out: read_unlock(&server->cb_break_lock); + rcu_read_unlock(); } /* diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 2a4aae6acdcb..2bbb55d05bb7 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -4860,21 +4860,22 @@ static int ext4_inode_blocks_set(handle_t *handle, return 0; } -struct other_inode { - unsigned long orig_ino; - struct ext4_inode *raw_inode; -}; - -static int other_inode_match(struct inode * inode, unsigned long ino, - void *data) +static void __ext4_update_other_inode_time(struct super_block *sb, + unsigned long orig_ino, + unsigned long ino, + struct ext4_inode *raw_inode) { - struct other_inode *oi = (struct other_inode *) data; + struct inode *inode; + + inode = find_inode_by_ino_rcu(sb, ino); + if (!inode) + return; - if ((inode->i_ino != ino) || - (inode->i_state & (I_FREEING | I_WILL_FREE | I_NEW | + if ((inode->i_state & (I_FREEING | I_WILL_FREE | I_NEW | I_DIRTY_INODE)) || ((inode->i_state & I_DIRTY_TIME) == 0)) - return 0; + return; + spin_lock(&inode->i_lock); if (((inode->i_state & (I_FREEING | I_WILL_FREE | I_NEW | I_DIRTY_INODE)) == 0) && @@ -4885,16 +4886,15 @@ static int other_inode_match(struct inode * inode, unsigned long ino, spin_unlock(&inode->i_lock); spin_lock(&ei->i_raw_lock); - EXT4_INODE_SET_XTIME(i_ctime, inode, oi->raw_inode); - EXT4_INODE_SET_XTIME(i_mtime, inode, oi->raw_inode); - EXT4_INODE_SET_XTIME(i_atime, inode, oi->raw_inode); - ext4_inode_csum_set(inode, oi->raw_inode, ei); + EXT4_INODE_SET_XTIME(i_ctime, inode, raw_inode); + EXT4_INODE_SET_XTIME(i_mtime, inode, raw_inode); + EXT4_INODE_SET_XTIME(i_atime, inode, raw_inode); + ext4_inode_csum_set(inode, raw_inode, ei); spin_unlock(&ei->i_raw_lock); - trace_ext4_other_inode_update_time(inode, oi->orig_ino); - return -1; + trace_ext4_other_inode_update_time(inode, orig_ino); + return; } spin_unlock(&inode->i_lock); - return -1; } /* @@ -4904,24 +4904,24 @@ static int other_inode_match(struct inode * inode, unsigned long ino, static void ext4_update_other_inodes_time(struct super_block *sb, unsigned long orig_ino, char *buf) { 
- struct other_inode oi; unsigned long ino; int i, inodes_per_block = EXT4_SB(sb)->s_inodes_per_block; int inode_size = EXT4_INODE_SIZE(sb); - oi.orig_ino = orig_ino; /* * Calculate the first inode in the inode table block. Inode * numbers are one-based. That is, the first inode in a block * (assuming 4k blocks and 256 byte inodes) is (n*16 + 1). */ ino = ((orig_ino - 1) & ~(inodes_per_block - 1)) + 1; + rcu_read_lock(); for (i = 0; i < inodes_per_block; i++, ino++, buf += inode_size) { if (ino == orig_ino) continue; - oi.raw_inode = (struct ext4_inode *) buf; - (void) find_inode_nowait(sb, ino, other_inode_match, &oi); + __ext4_update_other_inode_time(sb, orig_ino, ino, + (struct ext4_inode *)buf); } + rcu_read_unlock(); } /* diff --git a/fs/inode.c b/fs/inode.c index 93d9252a00ab..b7bd7162c902 100644 --- a/fs/inode.c +++ b/fs/inode.c @@ -497,7 +497,7 @@ void __insert_inode_hash(struct inode *inode, unsigned long hashval) spin_lock(&inode_hash_lock); spin_lock(&inode->i_lock); - hlist_add_head(&inode->i_hash, b); + hlist_add_head_rcu(&inode->i_hash, b); spin_unlock(&inode->i_lock); spin_unlock(&inode_hash_lock); } @@ -513,7 +513,7 @@ void __remove_inode_hash(struct inode *inode) { spin_lock(&inode_hash_lock); spin_lock(&inode->i_lock); - hlist_del_init(&inode->i_hash); + hlist_del_init_rcu(&inode->i_hash); spin_unlock(&inode->i_lock); spin_unlock(&inode_hash_lock); } @@ -1107,7 +1107,7 @@ again: */ spin_lock(&inode->i_lock); inode->i_state |= I_NEW; - hlist_add_head(&inode->i_hash, head); + hlist_add_head_rcu(&inode->i_hash, head); spin_unlock(&inode->i_lock); if (!creating) inode_sb_list_add(inode); @@ -1201,7 +1201,7 @@ again: inode->i_ino = ino; spin_lock(&inode->i_lock); inode->i_state = I_NEW; - hlist_add_head(&inode->i_hash, head); + hlist_add_head_rcu(&inode->i_hash, head); spin_unlock(&inode->i_lock); inode_sb_list_add(inode); spin_unlock(&inode_hash_lock); @@ -1244,15 +1244,10 @@ static int test_inode_iunique(struct super_block *sb, unsigned long ino) struct hlist_head *b = inode_hashtable + hash(sb, ino); struct inode *inode; - spin_lock(&inode_hash_lock); - hlist_for_each_entry(inode, b, i_hash) { - if (inode->i_ino == ino && inode->i_sb == sb) { - spin_unlock(&inode_hash_lock); + hlist_for_each_entry_rcu(inode, b, i_hash) { + if (inode->i_ino == ino && inode->i_sb == sb) return 0; - } } - spin_unlock(&inode_hash_lock); - return 1; } @@ -1281,6 +1276,7 @@ ino_t iunique(struct super_block *sb, ino_t max_reserved) static unsigned int counter; ino_t res; + rcu_read_lock(); spin_lock(&iunique_lock); do { if (counter <= max_reserved) @@ -1288,6 +1284,7 @@ ino_t iunique(struct super_block *sb, ino_t max_reserved) res = counter++; } while (!test_inode_iunique(sb, res)); spin_unlock(&iunique_lock); + rcu_read_unlock(); return res; } @@ -1456,6 +1453,84 @@ out: } EXPORT_SYMBOL(find_inode_nowait); +/** + * find_inode_rcu - find an inode in the inode cache + * @sb: Super block of file system to search + * @hashval: Key to hash + * @test: Function to test match on an inode + * @data: Data for test function + * + * Search for the inode specified by @hashval and @data in the inode cache, + * where the helper function @test will return 0 if the inode does not match + * and 1 if it does. The @test function must be responsible for taking the + * i_lock spin_lock and checking i_state for an inode being freed or being + * initialized. + * + * If successful, this will return the inode for which the @test function + * returned 1 and NULL otherwise. 
+ * + * The @test function is not permitted to take a ref on any inode presented. + * It is also not permitted to sleep. + * + * The caller must hold the RCU read lock. + */ +struct inode *find_inode_rcu(struct super_block *sb, unsigned long hashval, + int (*test)(struct inode *, void *), void *data) +{ + struct hlist_head *head = inode_hashtable + hash(sb, hashval); + struct inode *inode; + + RCU_LOCKDEP_WARN(!rcu_read_lock_held(), + "suspicious find_inode_rcu() usage"); + + hlist_for_each_entry_rcu(inode, head, i_hash) { + if (inode->i_sb == sb && + !(READ_ONCE(inode->i_state) & (I_FREEING | I_WILL_FREE)) && + test(inode, data)) + return inode; + } + return NULL; +} +EXPORT_SYMBOL(find_inode_rcu); + +/** + * find_inode_by_ino_rcu - Find an inode in the inode cache + * @sb: Super block of file system to search + * @ino: The inode number to match + * + * Search for the inode with inode number @ino in the inode cache. Inodes + * that are being freed are skipped, but an I_NEW inode may be returned. + * + * If successful, this will return the matching inode and NULL otherwise. + * + * The caller is not permitted to take a ref on the returned inode, is not + * permitted to sleep, and must hold the RCU read lock. + */ +struct inode *find_inode_by_ino_rcu(struct super_block *sb, + unsigned long ino) +{ + struct hlist_head *head = inode_hashtable + hash(sb, ino); + struct inode *inode; + + RCU_LOCKDEP_WARN(!rcu_read_lock_held(), + "suspicious find_inode_by_ino_rcu() usage"); + + hlist_for_each_entry_rcu(inode, head, i_hash) { + if (inode->i_ino == ino && + inode->i_sb == sb && + !(READ_ONCE(inode->i_state) & (I_FREEING | I_WILL_FREE))) + return inode; + } + return NULL; +} +EXPORT_SYMBOL(find_inode_by_ino_rcu); + int insert_inode_locked(struct inode *inode) { struct super_block *sb = inode->i_sb; @@ -1480,7 +1555,7 @@ int insert_inode_locked(struct inode *inode) if (likely(!old)) { spin_lock(&inode->i_lock); inode->i_state |= I_NEW | I_CREATING; - hlist_add_head(&inode->i_hash, head); + hlist_add_head_rcu(&inode->i_hash, head); spin_unlock(&inode->i_lock); spin_unlock(&inode_hash_lock); return 0; @@ -1540,6 +1615,7 @@ static void iput_final(struct inode *inode) { struct super_block *sb = inode->i_sb; const struct super_operations *op = inode->i_sb->s_op; + unsigned long state; int drop; WARN_ON(inode->i_state & I_NEW); @@ -1555,16 +1631,20 @@ static void iput_final(struct inode *inode) return; } + state = inode->i_state; if (!drop) { - inode->i_state |= I_WILL_FREE; + WRITE_ONCE(inode->i_state, state | I_WILL_FREE); spin_unlock(&inode->i_lock); + write_inode_now(inode, 1); + spin_lock(&inode->i_lock); - WARN_ON(inode->i_state & I_NEW); - inode->i_state &= ~I_WILL_FREE; + state = inode->i_state; + WARN_ON(state & I_NEW); + state &= ~I_WILL_FREE; } - inode->i_state |= I_FREEING; + WRITE_ONCE(inode->i_state, state | I_FREEING); if (!list_empty(&inode->i_lru)) inode_lru_list_del(inode); spin_unlock(&inode->i_lock); diff --git a/include/linux/fs.h b/include/linux/fs.h index 45cc10cdf6dd..5f9b2bb4b44f 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -3070,6 +3070,9 @@ extern struct inode *find_inode_nowait(struct super_block *, int (*match)(struct inode *, unsigned long, void *), void *data); +extern struct inode *find_inode_rcu(struct super_block *,
unsigned long, + int (*)(struct inode *, void *), void *); +extern struct inode *find_inode_by_ino_rcu(struct super_block *, unsigned long); extern int insert_inode_locked4(struct inode *, unsigned long, int (*test)(struct inode *, void *), void *); extern int insert_inode_locked(struct inode *); #ifdef CONFIG_DEBUG_LOCK_ALLOC -- cgit v1.2.3 From 335426c6dcdd338d6b7c939c2da15fc9c5dd4959 Mon Sep 17 00:00:00 2001 From: WeiXiong Liao Date: Wed, 25 Mar 2020 16:55:03 +0800 Subject: pstore/zone: Provide way to skip "broken" zone for MTD devices One requirement to support MTD devices in pstore/zone is having a way to declare certain regions as broken. Add this support to pstore/zone. The MTD driver should return -ENOMSG when encountering a bad region, which tells pstore/zone to skip and try the next one. Signed-off-by: WeiXiong Liao Link: https://lore.kernel.org/lkml/20200511233229.27745-8-keescook@chromium.org/ Co-developed-by: Colin Ian King Signed-off-by: Colin Ian King Link: //lore.kernel.org/lkml/20200512173801.222666-1-colin.king@canonical.com Signed-off-by: Kees Cook --- fs/pstore/blk.c | 10 +++++-- fs/pstore/zone.c | 65 ++++++++++++++++++++++++++++++++++++--------- include/linux/pstore_blk.h | 3 ++- include/linux/pstore_zone.h | 12 ++++++--- 4 files changed, 71 insertions(+), 19 deletions(-) (limited to 'include/linux') diff --git a/fs/pstore/blk.c b/fs/pstore/blk.c index e0d95fb48428..67343d6aa27a 100644 --- a/fs/pstore/blk.c +++ b/fs/pstore/blk.c @@ -109,9 +109,12 @@ struct bdev_info { * means error. * @write: The same as @read, but the following error number: * -EBUSY means try to write again later. + * -ENOMSG means to try next zone. * @panic_write:The write operation only used for panic case. It's optional - * if you do not care panic log. The parameters and return value - * are the same as @read. + * if you do not care panic log. The parameters are relative + * value to storage. + * On success, the number of bytes should be returned, others + * excluding -ENOMSG mean error. -ENOMSG means to try next zone. */ struct pstore_device_info { unsigned long total_size; @@ -337,6 +340,9 @@ static ssize_t psblk_blk_panic_write(const char *buf, size_t size, /* size and off must align to SECTOR_SIZE for block device */ ret = blkdev_panic_write(buf, off >> SECTOR_SHIFT, size >> SECTOR_SHIFT); + /* try next zone */ + if (ret == -ENOMSG) + return ret; return ret ? -EIO : size; } diff --git a/fs/pstore/zone.c b/fs/pstore/zone.c index 2d9f452360bb..add26b125984 100644 --- a/fs/pstore/zone.c +++ b/fs/pstore/zone.c @@ -249,6 +249,9 @@ static int psz_zone_write(struct pstore_zone *zone, return 0; dirty: + /* no need to mark dirty if going to try next zone */ + if (wcnt == -ENOMSG) + return -ENOMSG; atomic_set(&zone->dirty, true); /* flush dirty zones nicely */ if (wcnt == -EBUSY && !is_on_panic()) @@ -391,7 +394,11 @@ static int psz_kmsg_recover_meta(struct psz_context *cxt) return -EINVAL; rcnt = info->read((char *)buf, len, zone->off); - if (rcnt != len) { + if (rcnt == -ENOMSG) { + pr_debug("%s with id %lu may be broken, skip\n", + zone->name, i); + continue; + } else if (rcnt != len) { pr_err("read %s with id %lu failed\n", zone->name, i); return (int)rcnt < 0 ? (int)rcnt : -EIO; } @@ -725,24 +732,58 @@ static void psz_write_kmsg_hdr(struct pstore_zone *zone, hdr->counter = 0; } +/* + * In case zone is broken, which may occur to MTD device, we try each zones, + * start at cxt->kmsg_write_cnt. 
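+ *
+ * To keep the old record intact if a write attempt fails, each try below
+ * swaps a freshly allocated scratch buffer in for zone->buffer and only
+ * frees the saved zone->oldbuf once the write returns anything other than
+ * -ENOMSG; on -ENOMSG the old buffer is restored and the next zone is
+ * tried.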
+ */ static inline int notrace psz_kmsg_write_record(struct psz_context *cxt, struct pstore_record *record) { size_t size, hlen; struct pstore_zone *zone; - unsigned int zonenum; + unsigned int i; - zonenum = cxt->kmsg_write_cnt; - zone = cxt->kpszs[zonenum]; - if (unlikely(!zone)) - return -ENOSPC; - cxt->kmsg_write_cnt = (zonenum + 1) % cxt->kmsg_max_cnt; + for (i = 0; i < cxt->kmsg_max_cnt; i++) { + unsigned int zonenum, len; + int ret; - pr_debug("write %s to zone id %d\n", zone->name, zonenum); - psz_write_kmsg_hdr(zone, record); - hlen = sizeof(struct psz_kmsg_header); - size = min_t(size_t, record->size, zone->buffer_size - hlen); - return psz_zone_write(zone, FLUSH_ALL, record->buf, size, hlen); + zonenum = (cxt->kmsg_write_cnt + i) % cxt->kmsg_max_cnt; + zone = cxt->kpszs[zonenum]; + if (unlikely(!zone)) + return -ENOSPC; + + /* avoid destroying old data, allocate a new one */ + len = zone->buffer_size + sizeof(*zone->buffer); + zone->oldbuf = zone->buffer; + zone->buffer = kzalloc(len, GFP_KERNEL); + if (!zone->buffer) { + zone->buffer = zone->oldbuf; + return -ENOMEM; + } + zone->buffer->sig = zone->oldbuf->sig; + + pr_debug("write %s to zone id %d\n", zone->name, zonenum); + psz_write_kmsg_hdr(zone, record); + hlen = sizeof(struct psz_kmsg_header); + size = min_t(size_t, record->size, zone->buffer_size - hlen); + ret = psz_zone_write(zone, FLUSH_ALL, record->buf, size, hlen); + if (likely(!ret || ret != -ENOMSG)) { + cxt->kmsg_write_cnt = zonenum + 1; + cxt->kmsg_write_cnt %= cxt->kmsg_max_cnt; + /* no need to try next zone, free last zone buffer */ + kfree(zone->oldbuf); + zone->oldbuf = NULL; + return ret; + } + + pr_debug("zone %u may be broken, try next dmesg zone\n", + zonenum); + kfree(zone->buffer); + zone->buffer = zone->oldbuf; + zone->oldbuf = NULL; + } + + return -EBUSY; } static int notrace psz_kmsg_write(struct psz_context *cxt, diff --git a/include/linux/pstore_blk.h b/include/linux/pstore_blk.h index 4501977b1336..ccba8c068752 100644 --- a/include/linux/pstore_blk.h +++ b/include/linux/pstore_blk.h @@ -14,7 +14,8 @@ * @start_sect: start sector to block device * @sects: sectors count on buf * - * Return: On success, zero should be returned. Others mean error. + * Return: On success, zero should be returned. Others excluding -ENOMSG + * mean error. -ENOMSG means to try next zone. * * Panic write to block device must be aligned to SECTOR_SIZE. */ diff --git a/include/linux/pstore_zone.h b/include/linux/pstore_zone.h index 6f16b0dd834a..e79a18e41064 100644 --- a/include/linux/pstore_zone.h +++ b/include/linux/pstore_zone.h @@ -23,11 +23,15 @@ typedef ssize_t (*pstore_zone_write_op)(const char *, size_t, loff_t); * @read: The general read operation. Both of the function parameters * @size and @offset are relative value to storage. * On success, the number of bytes should be returned, others - * means error. - * @write: The same as @read, but -EBUSY means try to write again later. + * mean error. + * @write: The same as @read, but the following error number: + * -EBUSY means try to write again later. + * -ENOMSG means to try next zone. * @panic_write:The write operation only used for panic case. It's optional - * if you do not care panic log. The parameters and return value - * are the same as @read. + * if you do not care panic log. The parameters are relative + * value to storage. + * On success, the number of bytes should be returned, others + * excluding -ENOMSG mean error. -ENOMSG means to try next zone. 
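+ *
+ * As an illustration only, an MTD-backed @read might look like this
+ * (my_mtd and my_part_start are hypothetical):
+ *
+ *	static ssize_t my_mtd_read(char *buf, size_t size, loff_t off)
+ *	{
+ *		size_t retlen;
+ *
+ *		if (mtd_block_isbad(my_mtd, my_part_start + off))
+ *			return -ENOMSG;	/* broken zone: try the next one */
+ *		if (mtd_read(my_mtd, my_part_start + off, size, &retlen,
+ *			     (u_char *)buf))
+ *			return -EIO;
+ *		return retlen;
+ *	}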
*/ struct pstore_zone_info { struct module *owner; -- cgit v1.2.3 From 1525fb3bb6d69028b3941d34363397c28345ffab Mon Sep 17 00:00:00 2001 From: WeiXiong Liao Date: Wed, 25 Mar 2020 16:55:04 +0800 Subject: pstore/blk: Provide way to query pstore configuration In order to configure itself, the MTD backend needs to be able to query the current pstore configuration. Introduce pstore_blk_get_config() for this purpose. Signed-off-by: WeiXiong Liao Link: https://lore.kernel.org/lkml/20200511233229.27745-9-keescook@chromium.org/ Co-developed-by: Kees Cook Signed-off-by: Kees Cook --- fs/pstore/blk.c | 37 ++++++++++++++++++++++++++++++------- include/linux/pstore_blk.h | 28 ++++++++++++++++++++++++++++ 2 files changed, 58 insertions(+), 7 deletions(-) (limited to 'include/linux') diff --git a/fs/pstore/blk.c b/fs/pstore/blk.c index 67343d6aa27a..631bc27c8661 100644 --- a/fs/pstore/blk.c +++ b/fs/pstore/blk.c @@ -94,6 +94,17 @@ struct bdev_info { sector_t start_sect; }; +#define check_size(name, alignsize) ({ \ + long _##name_ = (name); \ + _##name_ = _##name_ <= 0 ? 0 : (_##name_ * 1024); \ + if (_##name_ & ((alignsize) - 1)) { \ + pr_info(#name " must align to %d\n", \ + (alignsize)); \ + _##name_ = ALIGN(name, (alignsize)); \ + } \ + _##name_; \ +}) + /** * struct pstore_device_info - back-end pstore/blk driver structure. * @@ -149,13 +160,11 @@ static int psblk_register_do(struct pstore_device_info *dev) dev->flags = UINT_MAX; #define verify_size(name, alignsize, enabled) { \ - long _##name_ = (enabled) ? (name) : 0; \ - _##name_ = _##name_ <= 0 ? 0 : (_##name_ * 1024); \ - if (_##name_ & ((alignsize) - 1)) { \ - pr_info(#name " must align to %d\n", \ - (alignsize)); \ - _##name_ = ALIGN(name, (alignsize)); \ - } \ + long _##name_; \ + if (enabled) \ + _##name_ = check_size(name, alignsize); \ + else \ + _##name_ = 0; \ name = _##name_ / 1024; \ pstore_zone_info->name = _##name_; \ } @@ -453,6 +462,20 @@ void unregister_pstore_blk(unsigned int major) } EXPORT_SYMBOL_GPL(unregister_pstore_blk); +/* get information of pstore/blk */ +int pstore_blk_get_config(struct pstore_blk_config *info) +{ + strncpy(info->device, blkdev, 80); + info->max_reason = max_reason; + info->kmsg_size = check_size(kmsg_size, 4096); + info->pmsg_size = check_size(pmsg_size, 4096); + info->ftrace_size = check_size(ftrace_size, 4096); + info->console_size = check_size(console_size, 4096); + + return 0; +} +EXPORT_SYMBOL_GPL(pstore_blk_get_config); + static void __exit pstore_blk_exit(void) { mutex_lock(&pstore_blk_lock); diff --git a/include/linux/pstore_blk.h b/include/linux/pstore_blk.h index ccba8c068752..0c40774e71e0 100644 --- a/include/linux/pstore_blk.h +++ b/include/linux/pstore_blk.h @@ -49,4 +49,32 @@ struct pstore_blk_info { int register_pstore_blk(struct pstore_blk_info *info); void unregister_pstore_blk(unsigned int major); +/** + * struct pstore_blk_config - the pstore_blk backend configuration + * + * @device: Name of the desired block device + * @max_reason: Maximum kmsg dump reason to store to block device + * @kmsg_size: Total size of for kmsg dumps + * @pmsg_size: Total size of the pmsg storage area + * @console_size: Total size of the console storage area + * @ftrace_size: Total size for ftrace logging data (for all CPUs) + */ +struct pstore_blk_config { + char device[80]; + enum kmsg_dump_reason max_reason; + unsigned long kmsg_size; + unsigned long pmsg_size; + unsigned long console_size; + unsigned long ftrace_size; +}; + +/** + * pstore_blk_get_config - get a copy of the pstore_blk backend 
configuration + * + * @info: The struct pstore_blk_config to be filled in + * + * Failure returns negative error code, and success returns 0. + */ +int pstore_blk_get_config(struct pstore_blk_config *info); + #endif -- cgit v1.2.3 From 7dcb7848ba110ff192efc917d1a6de66b4c9ca4f Mon Sep 17 00:00:00 2001 From: WeiXiong Liao Date: Wed, 25 Mar 2020 16:55:05 +0800 Subject: pstore/blk: Support non-block storage devices Add support for non-block devices (e.g. MTD). A non-block driver calls register_pstore_device() to register itself. In addition, pstore/zone is updated to handle non-block devices, where an erase must be done before a write. Without this, there is no way to remove records stored to an MTD. Signed-off-by: WeiXiong Liao Link: https://lore.kernel.org/lkml/20200511233229.27745-10-keescook@chromium.org/ Co-developed-by: Kees Cook Signed-off-by: Kees Cook --- Documentation/admin-guide/pstore-blk.rst | 17 ++++-- fs/pstore/blk.c | 93 +++++++++++++++++--------------- fs/pstore/zone.c | 8 ++- include/linux/pstore_blk.h | 38 +++++++++++++ include/linux/pstore_zone.h | 6 +++ 5 files changed, 114 insertions(+), 48 deletions(-) (limited to 'include/linux') diff --git a/Documentation/admin-guide/pstore-blk.rst b/Documentation/admin-guide/pstore-blk.rst index bef8c7436721..d45341e55e82 100644 --- a/Documentation/admin-guide/pstore-blk.rst +++ b/Documentation/admin-guide/pstore-blk.rst @@ -7,8 +7,8 @@ Introduction ------------ pstore block (pstore/blk) is an oops/panic logger that writes its logs to a -block device before the system crashes. You can get these log files by -mounting pstore filesystem like:: +block device and non-block device before the system crashes. You can get +these log files by mounting pstore filesystem like:: mount -t pstore pstore /sys/fs/pstore @@ -24,8 +24,8 @@ Configurations for user determine how pstore/blk works, such as pmsg_size, kmsg_size and so on. All of them support both Kconfig and module parameters, but module parameters have priority over Kconfig. -Configurations for driver are all about block device, such as total_size -of block device and read/write operations. +Configurations for driver are all about block device and non-block device, +such as total_size of block device and read/write operations. Configurations for user ----------------------- @@ -152,6 +152,15 @@ driver uses ``register_pstore_blk`` to register to pstore/blk. .. kernel-doc:: fs/pstore/blk.c :identifiers: register_pstore_blk +A non-block device driver uses ``register_pstore_device`` with +``struct pstore_device_info`` to register to pstore/blk. + +.. kernel-doc:: fs/pstore/blk.c + :identifiers: register_pstore_device + +.. kernel-doc:: include/linux/pstore_blk.h + :identifiers: pstore_device_info + Compression and header ---------------------- diff --git a/fs/pstore/blk.c b/fs/pstore/blk.c index 631bc27c8661..881b40ed8142 100644 --- a/fs/pstore/blk.c +++ b/fs/pstore/blk.c @@ -105,55 +105,22 @@ struct bdev_info { _##name_; \ }) -/** - * struct pstore_device_info - back-end pstore/blk driver structure. - * - * @total_size: The total size in bytes pstore/blk can use. It must be greater - * than 4096 and be multiple of 4096. - * @flags: Refer to macro starting with PSTORE_FLAGS defined in - * linux/pstore.h. It means what front-ends this device support. - * Zero means all backends for compatible. - * @read: The general read operation. Both of the function parameters - * @size and @offset are relative value to bock device (not the - * whole disk).
- * On success, the number of bytes should be returned, others - * means error. - * @write: The same as @read, but the following error number: - * -EBUSY means try to write again later. - * -ENOMSG means to try next zone. - * @panic_write:The write operation only used for panic case. It's optional - * if you do not care panic log. The parameters are relative - * value to storage. - * On success, the number of bytes should be returned, others - * excluding -ENOMSG mean error. -ENOMSG means to try next zone. - */ -struct pstore_device_info { - unsigned long total_size; - unsigned int flags; - pstore_zone_read_op read; - pstore_zone_write_op write; - pstore_zone_write_op panic_write; -}; - -static int psblk_register_do(struct pstore_device_info *dev) +static int __register_pstore_device(struct pstore_device_info *dev) { int ret; + lockdep_assert_held(&pstore_blk_lock); + if (!dev || !dev->total_size || !dev->read || !dev->write) return -EINVAL; - mutex_lock(&pstore_blk_lock); - /* someone already registered before */ - if (pstore_zone_info) { - mutex_unlock(&pstore_blk_lock); + if (pstore_zone_info) return -EBUSY; - } + pstore_zone_info = kzalloc(sizeof(struct pstore_zone_info), GFP_KERNEL); - if (!pstore_zone_info) { - mutex_unlock(&pstore_blk_lock); + if (!pstore_zone_info) return -ENOMEM; - } /* zero means not limit on which backends to attempt to store. */ if (!dev->flags) @@ -179,6 +146,7 @@ static int psblk_register_do(struct pstore_device_info *dev) pstore_zone_info->max_reason = max_reason; pstore_zone_info->read = dev->read; pstore_zone_info->write = dev->write; + pstore_zone_info->erase = dev->erase; pstore_zone_info->panic_write = dev->panic_write; pstore_zone_info->name = KBUILD_MODNAME; pstore_zone_info->owner = THIS_MODULE; @@ -188,20 +156,51 @@ static int psblk_register_do(struct pstore_device_info *dev) kfree(pstore_zone_info); pstore_zone_info = NULL; } + return ret; +} +/** + * register_pstore_device() - register non-block device to pstore/blk + * + * @dev: non-block device information + * + * Return: + * * 0 - OK + * * Others - something error. + */ +int register_pstore_device(struct pstore_device_info *dev) +{ + int ret; + + mutex_lock(&pstore_blk_lock); + ret = __register_pstore_device(dev); mutex_unlock(&pstore_blk_lock); + return ret; } +EXPORT_SYMBOL_GPL(register_pstore_device); -static void psblk_unregister_do(struct pstore_device_info *dev) +static void __unregister_pstore_device(struct pstore_device_info *dev) { - mutex_lock(&pstore_blk_lock); + lockdep_assert_held(&pstore_blk_lock); if (pstore_zone_info && pstore_zone_info->read == dev->read) { unregister_pstore_zone(pstore_zone_info); kfree(pstore_zone_info); pstore_zone_info = NULL; } +} + +/** + * unregister_pstore_device() - unregister non-block device from pstore/blk + * + * @dev: non-block device information + */ +void unregister_pstore_device(struct pstore_device_info *dev) +{ + mutex_lock(&pstore_blk_lock); + __unregister_pstore_device(dev); mutex_unlock(&pstore_blk_lock); } +EXPORT_SYMBOL_GPL(unregister_pstore_device); /** * psblk_get_bdev() - open block device @@ -396,9 +395,10 @@ static int __register_pstore_blk(struct pstore_blk_info *info) dev.flags = info->flags; dev.read = psblk_generic_blk_read; dev.write = psblk_generic_blk_write; + dev.erase = NULL; dev.panic_write = info->panic_write ? 
psblk_blk_panic_write : NULL; - ret = psblk_register_do(&dev); + ret = __register_pstore_device(&dev); if (ret) goto err_put_bdev; @@ -442,7 +442,7 @@ static void __unregister_pstore_blk(unsigned int major) lockdep_assert_held(&pstore_blk_lock); if (psblk_bdev && MAJOR(psblk_bdev->bd_dev) == major) { - psblk_unregister_do(&dev); + __unregister_pstore_device(&dev); psblk_put_bdev(psblk_bdev, holder); blkdev_panic_write = NULL; psblk_bdev = NULL; @@ -481,6 +481,13 @@ static void __exit pstore_blk_exit(void) mutex_lock(&pstore_blk_lock); if (psblk_bdev) __unregister_pstore_blk(MAJOR(psblk_bdev->bd_dev)); + else { + struct pstore_device_info dev = { }; + + if (pstore_zone_info) + dev.read = pstore_zone_info->read; + __unregister_pstore_device(&dev); + } mutex_unlock(&pstore_blk_lock); } module_exit(pstore_blk_exit); diff --git a/fs/pstore/zone.c b/fs/pstore/zone.c index add26b125984..819428dfa32f 100644 --- a/fs/pstore/zone.c +++ b/fs/pstore/zone.c @@ -660,15 +660,21 @@ static inline int psz_kmsg_erase(struct psz_context *cxt, struct psz_buffer *buffer = zone->buffer; struct psz_kmsg_header *hdr = (struct psz_kmsg_header *)buffer->data; + size_t size; if (unlikely(!psz_ok(zone))) return 0; + /* this zone is already updated, no need to erase */ if (record->count != hdr->counter) return 0; + size = buffer_datalen(zone) + sizeof(*zone->buffer); atomic_set(&zone->buffer->datalen, 0); - return psz_zone_write(zone, FLUSH_META, NULL, 0, 0); + if (cxt->pstore_zone_info->erase) + return cxt->pstore_zone_info->erase(size, zone->off); + else + return psz_zone_write(zone, FLUSH_META, NULL, 0, 0); } static inline int psz_record_erase(struct psz_context *cxt, diff --git a/include/linux/pstore_blk.h b/include/linux/pstore_blk.h index 0c40774e71e0..61e914522b01 100644 --- a/include/linux/pstore_blk.h +++ b/include/linux/pstore_blk.h @@ -49,6 +49,44 @@ struct pstore_blk_info { int register_pstore_blk(struct pstore_blk_info *info); void unregister_pstore_blk(unsigned int major); +/** + * struct pstore_device_info - back-end pstore/blk driver structure. + * + * @total_size: The total size in bytes pstore/blk can use. It must be greater + * than 4096 and be multiple of 4096. + * @flags: Refer to macro starting with PSTORE_FLAGS defined in + * linux/pstore.h. It means what front-ends this device support. + * Zero means all backends for compatible. + * @read: The general read operation. Both of the function parameters + * @size and @offset are relative value to bock device (not the + * whole disk). + * On success, the number of bytes should be returned, others + * means error. + * @write: The same as @read, but the following error number: + * -EBUSY means try to write again later. + * -ENOMSG means to try next zone. + * @erase: The general erase operation for device with special removing + * job. Both of the function parameters @size and @offset are + * relative value to storage. + * Return 0 on success and others on failure. + * @panic_write:The write operation only used for panic case. It's optional + * if you do not care panic log. The parameters are relative + * value to storage. + * On success, the number of bytes should be returned, others + * excluding -ENOMSG mean error. -ENOMSG means to try next zone. 
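+ *
+ * A minimal registration sketch, assuming my_mtd_* callbacks with the
+ * semantics above (all names hypothetical):
+ *
+ *	static struct pstore_device_info my_dev = {
+ *		.total_size	= 64 * 1024,	/* multiple of 4096 */
+ *		.flags		= PSTORE_FLAGS_DMESG,
+ *		.read		= my_mtd_read,
+ *		.write		= my_mtd_write,
+ *		.erase		= my_mtd_erase,	/* may be NULL */
+ *		.panic_write	= my_mtd_panic_write,
+ *	};
+ *
+ *	ret = register_pstore_device(&my_dev);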
+ */ +struct pstore_device_info { + unsigned long total_size; + unsigned int flags; + pstore_zone_read_op read; + pstore_zone_write_op write; + pstore_zone_erase_op erase; + pstore_zone_write_op panic_write; +}; + +int register_pstore_device(struct pstore_device_info *dev); +void unregister_pstore_device(struct pstore_device_info *dev); + /** * struct pstore_blk_config - the pstore_blk backend configuration * diff --git a/include/linux/pstore_zone.h b/include/linux/pstore_zone.h index e79a18e41064..1e35eaa33e5e 100644 --- a/include/linux/pstore_zone.h +++ b/include/linux/pstore_zone.h @@ -7,6 +7,7 @@ typedef ssize_t (*pstore_zone_read_op)(char *, size_t, loff_t); typedef ssize_t (*pstore_zone_write_op)(const char *, size_t, loff_t); +typedef ssize_t (*pstore_zone_erase_op)(size_t, loff_t); /** * struct pstore_zone_info - pstore/zone back-end driver structure * @@ -27,6 +28,10 @@ typedef ssize_t (*pstore_zone_write_op)(const char *, size_t, loff_t); * @write: The same as @read, but the following error number: * -EBUSY means try to write again later. * -ENOMSG means to try next zone. + * @erase: The general erase operation for device with special removing + * job. Both of the function parameters @size and @offset are + * relative value to storage. + * Return 0 on success and others on failure. * @panic_write:The write operation only used for panic case. It's optional * if you do not care panic log. The parameters are relative * value to storage. @@ -45,6 +50,7 @@ struct pstore_zone_info { unsigned long ftrace_size; pstore_zone_read_op read; pstore_zone_write_op write; + pstore_zone_erase_op erase; pstore_zone_write_op panic_write; }; -- cgit v1.2.3 From 0958f0cefede403037653e44de0e3332d10b0e1a Mon Sep 17 00:00:00 2001 From: Vitaly Kuznetsov Date: Mon, 25 May 2020 16:41:19 +0200 Subject: KVM: introduce kvm_read_guest_offset_cached() We already have kvm_write_guest_offset_cached(), introduce read analogue. 
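For illustration, a caller that has already initialized a gfn_to_hva_cache over a larger guest structure can now pull out a single field; the structure and the cache setup here are assumptions, not part of this patch:

	struct my_shared {
		u32 flags;
		u32 seq;
	};
	u32 seq;

	/* ghc was initialized earlier with kvm_gfn_to_hva_cache_init() */
	if (kvm_read_guest_offset_cached(kvm, &ghc, &seq,
					 offsetof(struct my_shared, seq),
					 sizeof(seq)))
		return -EFAULT;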
Signed-off-by: Vitaly Kuznetsov Message-Id: <20200525144125.143875-5-vkuznets@redhat.com> Signed-off-by: Paolo Bonzini --- include/linux/kvm_host.h | 3 +++ virt/kvm/kvm_main.c | 19 ++++++++++++++----- 2 files changed, 17 insertions(+), 5 deletions(-) (limited to 'include/linux') diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index 161684696610..f43b59b1294c 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -734,6 +734,9 @@ int kvm_read_guest_page(struct kvm *kvm, gfn_t gfn, void *data, int offset, int kvm_read_guest(struct kvm *kvm, gpa_t gpa, void *data, unsigned long len); int kvm_read_guest_cached(struct kvm *kvm, struct gfn_to_hva_cache *ghc, void *data, unsigned long len); +int kvm_read_guest_offset_cached(struct kvm *kvm, struct gfn_to_hva_cache *ghc, + void *data, unsigned int offset, + unsigned long len); int kvm_write_guest_page(struct kvm *kvm, gfn_t gfn, const void *data, int offset, int len); int kvm_write_guest(struct kvm *kvm, gpa_t gpa, const void *data, diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index c9c6db5f77c2..e2af6ffce9c9 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -2510,13 +2510,15 @@ int kvm_write_guest_cached(struct kvm *kvm, struct gfn_to_hva_cache *ghc, } EXPORT_SYMBOL_GPL(kvm_write_guest_cached); -int kvm_read_guest_cached(struct kvm *kvm, struct gfn_to_hva_cache *ghc, - void *data, unsigned long len) +int kvm_read_guest_offset_cached(struct kvm *kvm, struct gfn_to_hva_cache *ghc, + void *data, unsigned int offset, + unsigned long len) { struct kvm_memslots *slots = kvm_memslots(kvm); int r; + gpa_t gpa = ghc->gpa + offset; - BUG_ON(len > ghc->len); + BUG_ON(len + offset > ghc->len); if (slots->generation != ghc->generation) { if (__kvm_gfn_to_hva_cache_init(slots, ghc, ghc->gpa, ghc->len)) @@ -2527,14 +2529,21 @@ int kvm_read_guest_cached(struct kvm *kvm, struct gfn_to_hva_cache *ghc, return -EFAULT; if (unlikely(!ghc->memslot)) - return kvm_read_guest(kvm, ghc->gpa, data, len); + return kvm_read_guest(kvm, gpa, data, len); - r = __copy_from_user(data, (void __user *)ghc->hva, len); + r = __copy_from_user(data, (void __user *)ghc->hva + offset, len); if (r) return -EFAULT; return 0; } +EXPORT_SYMBOL_GPL(kvm_read_guest_offset_cached); + +int kvm_read_guest_cached(struct kvm *kvm, struct gfn_to_hva_cache *ghc, + void *data, unsigned long len) +{ + return kvm_read_guest_offset_cached(kvm, ghc, data, 0, len); +} EXPORT_SYMBOL_GPL(kvm_read_guest_cached); int kvm_clear_guest_page(struct kvm *kvm, gfn_t gfn, int offset, int len) -- cgit v1.2.3 From 97e27aaa9a2cbd6238c66b3251d397e0eacc9968 Mon Sep 17 00:00:00 2001 From: Xiubo Li Date: Thu, 19 Mar 2020 23:45:01 -0400 Subject: ceph: add read/write latency metric support Calculate the latency for OSD read requests. Add a new r_end_latency field to struct ceph_osd_request that will hold the time at which the reply was received. Use that to calculate the RTT for each call, and divide the sum of those by the number of calls to get the average RTT. Keep a tally of RTT for OSD writes and the number of calls to track the average latency of OSD writes.
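The stdev reported in debugfs relies on the classic running-variance recurrence rather than storing samples; a plain userspace C sketch of the identity (the kernel code below does the same thing with ktime_t and the div64 helpers):

	#include <math.h>

	static double n, mean, sum_sq;

	static void update(double x)
	{
		double old_mean = mean;

		n += 1;
		mean += (x - old_mean) / n;
		/* same (x - old_mean) * (x - new_mean) step as __update_latency() */
		sum_sq += (x - old_mean) * (x - mean);
	}

	/* stdev = n > 1 ? sqrt(sum_sq / (n - 1)) : 0 */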
URL: https://tracker.ceph.com/issues/43215 Signed-off-by: Xiubo Li Reviewed-by: Jeff Layton Signed-off-by: Ilya Dryomov --- fs/ceph/addr.c | 20 ++++++++++++ fs/ceph/debugfs.c | 43 +++++++++++++++++++++++- fs/ceph/file.c | 30 +++++++++++++++++ fs/ceph/metric.c | 72 +++++++++++++++++++++++++++++++++++++++++ fs/ceph/metric.h | 22 +++++++++++++ include/linux/ceph/osd_client.h | 3 ++ net/ceph/osd_client.c | 3 ++ 7 files changed, 192 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/fs/ceph/addr.c b/fs/ceph/addr.c index 6f4678d98df7..01ad09733ac7 100644 --- a/fs/ceph/addr.c +++ b/fs/ceph/addr.c @@ -11,10 +11,12 @@ #include #include #include +#include #include "super.h" #include "mds_client.h" #include "cache.h" +#include "metric.h" #include #include @@ -216,6 +218,9 @@ static int ceph_sync_readpages(struct ceph_fs_client *fsc, if (!rc) rc = ceph_osdc_wait_request(osdc, req); + ceph_update_read_latency(&fsc->mdsc->metric, req->r_start_latency, + req->r_end_latency, rc); + ceph_osdc_put_request(req); dout("readpages result %d\n", rc); return rc; @@ -299,6 +304,7 @@ static int ceph_readpage(struct file *filp, struct page *page) static void finish_read(struct ceph_osd_request *req) { struct inode *inode = req->r_inode; + struct ceph_fs_client *fsc = ceph_inode_to_client(inode); struct ceph_osd_data *osd_data; int rc = req->r_result <= 0 ? req->r_result : 0; int bytes = req->r_result >= 0 ? req->r_result : 0; @@ -336,6 +342,10 @@ unlock: put_page(page); bytes -= PAGE_SIZE; } + + ceph_update_read_latency(&fsc->mdsc->metric, req->r_start_latency, + req->r_end_latency, rc); + kfree(osd_data->pages); } @@ -643,6 +653,9 @@ static int ceph_sync_writepages(struct ceph_fs_client *fsc, if (!rc) rc = ceph_osdc_wait_request(osdc, req); + ceph_update_write_latency(&fsc->mdsc->metric, req->r_start_latency, + req->r_end_latency, rc); + ceph_osdc_put_request(req); if (rc == 0) rc = len; @@ -794,6 +807,9 @@ static void writepages_finish(struct ceph_osd_request *req) ceph_clear_error_write(ci); } + ceph_update_write_latency(&fsc->mdsc->metric, req->r_start_latency, + req->r_end_latency, rc); + /* * We lost the cache cap, need to truncate the page before * it is unlocked, otherwise we'd truncate it later in the @@ -1852,6 +1868,10 @@ int ceph_uninline_data(struct file *filp, struct page *locked_page) err = ceph_osdc_start_request(&fsc->client->osdc, req, false); if (!err) err = ceph_osdc_wait_request(&fsc->client->osdc, req); + + ceph_update_write_latency(&fsc->mdsc->metric, req->r_start_latency, + req->r_end_latency, err); + out_put: ceph_osdc_put_request(req); if (err == -ECANCELED) diff --git a/fs/ceph/debugfs.c b/fs/ceph/debugfs.c index 37b2775949ad..30acbc7f9acd 100644 --- a/fs/ceph/debugfs.c +++ b/fs/ceph/debugfs.c @@ -7,6 +7,8 @@ #include #include #include +#include +#include #include #include @@ -18,6 +20,7 @@ #ifdef CONFIG_DEBUG_FS #include "mds_client.h" +#include "metric.h" static int mdsmap_show(struct seq_file *s, void *p) { @@ -124,13 +127,51 @@ static int mdsc_show(struct seq_file *s, void *p) return 0; } +#define CEPH_METRIC_SHOW(name, total, avg, min, max, sq) { \ + s64 _total, _avg, _min, _max, _sq, _st; \ + _avg = ktime_to_us(avg); \ + _min = ktime_to_us(min == KTIME_MAX ? 0 : min); \ + _max = ktime_to_us(max); \ + _total = total - 1; \ + _sq = _total > 0 ? 
DIV64_U64_ROUND_CLOSEST(sq, _total) : 0; \ + _st = int_sqrt64(_sq); \ + _st = ktime_to_us(_st); \ + seq_printf(s, "%-14s%-12lld%-16lld%-16lld%-16lld%lld\n", \ + name, total, _avg, _min, _max, _st); \ +} + static int metric_show(struct seq_file *s, void *p) { struct ceph_fs_client *fsc = s->private; struct ceph_mds_client *mdsc = fsc->mdsc; struct ceph_client_metric *m = &mdsc->metric; int i, nr_caps = 0; - + s64 total, sum, avg, min, max, sq; + + seq_printf(s, "item total avg_lat(us) min_lat(us) max_lat(us) stdev(us)\n"); + seq_printf(s, "-----------------------------------------------------------------------------------\n"); + + spin_lock(&m->read_latency_lock); + total = m->total_reads; + sum = m->read_latency_sum; + avg = total > 0 ? DIV64_U64_ROUND_CLOSEST(sum, total) : 0; + min = m->read_latency_min; + max = m->read_latency_max; + sq = m->read_latency_sq_sum; + spin_unlock(&m->read_latency_lock); + CEPH_METRIC_SHOW("read", total, avg, min, max, sq); + + spin_lock(&m->write_latency_lock); + total = m->total_writes; + sum = m->write_latency_sum; + avg = total > 0 ? DIV64_U64_ROUND_CLOSEST(sum, total) : 0; + min = m->write_latency_min; + max = m->write_latency_max; + sq = m->write_latency_sq_sum; + spin_unlock(&m->write_latency_lock); + CEPH_METRIC_SHOW("write", total, avg, min, max, sq); + + seq_printf(s, "\n"); seq_printf(s, "item total miss hit\n"); seq_printf(s, "-------------------------------------------------\n"); diff --git a/fs/ceph/file.c b/fs/ceph/file.c index afdfca965a7f..160644ddaeed 100644 --- a/fs/ceph/file.c +++ b/fs/ceph/file.c @@ -11,11 +11,13 @@ #include #include #include +#include #include "super.h" #include "mds_client.h" #include "cache.h" #include "io.h" +#include "metric.h" static __le32 ceph_flags_sys2wire(u32 flags) { @@ -906,6 +908,12 @@ static ssize_t ceph_sync_read(struct kiocb *iocb, struct iov_iter *to, ret = ceph_osdc_start_request(osdc, req, false); if (!ret) ret = ceph_osdc_wait_request(osdc, req); + + ceph_update_read_latency(&fsc->mdsc->metric, + req->r_start_latency, + req->r_end_latency, + ret); + ceph_osdc_put_request(req); i_size = i_size_read(inode); @@ -1044,6 +1052,8 @@ static void ceph_aio_complete_req(struct ceph_osd_request *req) struct inode *inode = req->r_inode; struct ceph_aio_request *aio_req = req->r_priv; struct ceph_osd_data *osd_data = osd_req_op_extent_osd_data(req, 0); + struct ceph_fs_client *fsc = ceph_inode_to_client(inode); + struct ceph_client_metric *metric = &fsc->mdsc->metric; BUG_ON(osd_data->type != CEPH_OSD_DATA_TYPE_BVECS); BUG_ON(!osd_data->num_bvecs); @@ -1051,6 +1061,16 @@ static void ceph_aio_complete_req(struct ceph_osd_request *req) dout("ceph_aio_complete_req %p rc %d bytes %u\n", inode, rc, osd_data->bvec_pos.iter.bi_size); + /* r_start_latency == 0 means the request was not submitted */ + if (req->r_start_latency) { + if (aio_req->write) + ceph_update_write_latency(metric, req->r_start_latency, + req->r_end_latency, rc); + else + ceph_update_read_latency(metric, req->r_start_latency, + req->r_end_latency, rc); + } + if (rc == -EOLDSNAPC) { struct ceph_aio_work *aio_work; BUG_ON(!aio_req->write); @@ -1179,6 +1199,7 @@ ceph_direct_read_write(struct kiocb *iocb, struct iov_iter *iter, struct inode *inode = file_inode(file); struct ceph_inode_info *ci = ceph_inode(inode); struct ceph_fs_client *fsc = ceph_inode_to_client(inode); + struct ceph_client_metric *metric = &fsc->mdsc->metric; struct ceph_vino vino; struct ceph_osd_request *req; struct bio_vec *bvecs; @@ -1295,6 +1316,13 @@ ceph_direct_read_write(struct 
kiocb *iocb, struct iov_iter *iter, if (!ret) ret = ceph_osdc_wait_request(&fsc->client->osdc, req); + if (write) + ceph_update_write_latency(metric, req->r_start_latency, + req->r_end_latency, ret); + else + ceph_update_read_latency(metric, req->r_start_latency, + req->r_end_latency, ret); + size = i_size_read(inode); if (!write) { if (ret == -ENOENT) @@ -1466,6 +1494,8 @@ ceph_sync_write(struct kiocb *iocb, struct iov_iter *from, loff_t pos, if (!ret) ret = ceph_osdc_wait_request(&fsc->client->osdc, req); + ceph_update_write_latency(&fsc->mdsc->metric, req->r_start_latency, + req->r_end_latency, ret); out: ceph_osdc_put_request(req); if (ret != 0) { diff --git a/fs/ceph/metric.c b/fs/ceph/metric.c index 2a4b7394eed4..f5cf32e7789f 100644 --- a/fs/ceph/metric.c +++ b/fs/ceph/metric.c @@ -2,6 +2,7 @@ #include #include +#include #include "metric.h" @@ -29,6 +30,20 @@ int ceph_metric_init(struct ceph_client_metric *m) if (ret) goto err_i_caps_mis; + spin_lock_init(&m->read_latency_lock); + m->read_latency_sq_sum = 0; + m->read_latency_min = KTIME_MAX; + m->read_latency_max = 0; + m->total_reads = 0; + m->read_latency_sum = 0; + + spin_lock_init(&m->write_latency_lock); + m->write_latency_sq_sum = 0; + m->write_latency_min = KTIME_MAX; + m->write_latency_max = 0; + m->total_writes = 0; + m->write_latency_sum = 0; + return 0; err_i_caps_mis: @@ -51,3 +66,60 @@ void ceph_metric_destroy(struct ceph_client_metric *m) percpu_counter_destroy(&m->d_lease_mis); percpu_counter_destroy(&m->d_lease_hit); } + +static inline void __update_latency(ktime_t *totalp, ktime_t *lsump, + ktime_t *min, ktime_t *max, + ktime_t *sq_sump, ktime_t lat) +{ + ktime_t total, avg, sq, lsum; + + total = ++(*totalp); + lsum = (*lsump += lat); + + if (unlikely(lat < *min)) + *min = lat; + if (unlikely(lat > *max)) + *max = lat; + + if (unlikely(total == 1)) + return; + + /* the sq is (lat - old_avg) * (lat - new_avg) */ + avg = DIV64_U64_ROUND_CLOSEST((lsum - lat), (total - 1)); + sq = lat - avg; + avg = DIV64_U64_ROUND_CLOSEST(lsum, total); + sq = sq * (lat - avg); + *sq_sump += sq; +} + +void ceph_update_read_latency(struct ceph_client_metric *m, + ktime_t r_start, ktime_t r_end, + int rc) +{ + ktime_t lat = ktime_sub(r_end, r_start); + + if (unlikely(rc < 0 && rc != -ENOENT && rc != -ETIMEDOUT)) + return; + + spin_lock(&m->read_latency_lock); + __update_latency(&m->total_reads, &m->read_latency_sum, + &m->read_latency_min, &m->read_latency_max, + &m->read_latency_sq_sum, lat); + spin_unlock(&m->read_latency_lock); +} + +void ceph_update_write_latency(struct ceph_client_metric *m, + ktime_t r_start, ktime_t r_end, + int rc) +{ + ktime_t lat = ktime_sub(r_end, r_start); + + if (unlikely(rc && rc != -ETIMEDOUT)) + return; + + spin_lock(&m->write_latency_lock); + __update_latency(&m->total_writes, &m->write_latency_sum, + &m->write_latency_min, &m->write_latency_max, + &m->write_latency_sq_sum, lat); + spin_unlock(&m->write_latency_lock); +} diff --git a/fs/ceph/metric.h b/fs/ceph/metric.h index 098ee8a9adaf..ab1543c70a75 100644 --- a/fs/ceph/metric.h +++ b/fs/ceph/metric.h @@ -4,6 +4,7 @@ #include #include +#include /* This is the global metrics */ struct ceph_client_metric { @@ -13,6 +14,20 @@ struct ceph_client_metric { struct percpu_counter i_caps_hit; struct percpu_counter i_caps_mis; + + spinlock_t read_latency_lock; + u64 total_reads; + ktime_t read_latency_sum; + ktime_t read_latency_sq_sum; + ktime_t read_latency_min; + ktime_t read_latency_max; + + spinlock_t write_latency_lock; + u64 total_writes; + ktime_t 
write_latency_sum; + ktime_t write_latency_sq_sum; + ktime_t write_latency_min; + ktime_t write_latency_max; }; extern int ceph_metric_init(struct ceph_client_metric *m); @@ -27,4 +42,11 @@ static inline void ceph_update_cap_mis(struct ceph_client_metric *m) { percpu_counter_inc(&m->i_caps_mis); } + +extern void ceph_update_read_latency(struct ceph_client_metric *m, + ktime_t r_start, ktime_t r_end, + int rc); +extern void ceph_update_write_latency(struct ceph_client_metric *m, + ktime_t r_start, ktime_t r_end, + int rc); #endif /* _FS_CEPH_MDS_METRIC_H */ diff --git a/include/linux/ceph/osd_client.h b/include/linux/ceph/osd_client.h index 9d9f745b98a1..734f7c6a9f56 100644 --- a/include/linux/ceph/osd_client.h +++ b/include/linux/ceph/osd_client.h @@ -8,6 +8,7 @@ #include #include #include +#include #include #include @@ -213,6 +214,8 @@ struct ceph_osd_request { /* internal */ unsigned long r_stamp; /* jiffies, send or check time */ unsigned long r_start_stamp; /* jiffies */ + ktime_t r_start_latency; /* ktime_t */ + ktime_t r_end_latency; /* ktime_t */ int r_attempts; u32 r_map_dne_bound; diff --git a/net/ceph/osd_client.c b/net/ceph/osd_client.c index 1d4973f8cd7a..ece124a5138e 100644 --- a/net/ceph/osd_client.c +++ b/net/ceph/osd_client.c @@ -2373,6 +2373,7 @@ static void account_request(struct ceph_osd_request *req) atomic_inc(&req->r_osdc->num_requests); req->r_start_stamp = jiffies; + req->r_start_latency = ktime_get(); } static void submit_request(struct ceph_osd_request *req, bool wrlocked) @@ -2389,6 +2390,8 @@ static void finish_request(struct ceph_osd_request *req) WARN_ON(lookup_request_mc(&osdc->map_checks, req->r_tid)); dout("%s req %p tid %llu\n", __func__, req, req->r_tid); + req->r_end_latency = ktime_get(); + if (req->r_osd) unlink_request(req->r_osd, req); atomic_dec(&osdc->num_requests); -- cgit v1.2.3 From 53ab8e7cd2d47594e68951994ac083d30f82fce4 Mon Sep 17 00:00:00 2001 From: "Gustavo A. R. Silva" Date: Thu, 7 May 2020 13:51:38 -0500 Subject: libceph, rbd: replace zero-length array with flexible-array The current codebase makes use of the zero-length array language extension to the C90 standard, but the preferred mechanism to declare variable-length types such as these ones is a flexible array member[1][2], introduced in C99: struct foo { int stuff; struct boo array[]; }; By making use of the mechanism above, we will get a compiler warning in case the flexible array does not occur last in the structure, which will help us prevent some kind of undefined behavior bugs from being inadvertently introduced[3] to the codebase from now on. Also, notice that, dynamic memory allocations won't be affected by this change: "Flexible array members have incomplete type, and so the sizeof operator may not be applied. As a quirk of the original implementation of zero-length arrays, sizeof evaluates to zero."[1] sizeof(flexible-array-member) triggers a warning because flexible array members have incomplete type[1]. There are some instances of code in which the sizeof operator is being incorrectly/erroneously applied to zero-length arrays and the result is zero. Such instances may be hiding some bugs. So, this work (flexible-array member conversions) will also help to get completely rid of those sorts of issues. This issue was found with the help of Coccinelle. [1] https://gcc.gnu.org/onlinedocs/gcc/Zero-Length.html [2] https://github.com/KSPP/linux/issues/21 [3] commit 76497732932f ("cxgb3/l2t: Fix undefined behaviour") Signed-off-by: Gustavo A. R. 
Silva Reviewed-by: Ilya Dryomov Signed-off-by: Ilya Dryomov --- drivers/block/rbd_types.h | 2 +- include/linux/ceph/mon_client.h | 2 +- include/linux/crush/crush.h | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/drivers/block/rbd_types.h b/drivers/block/rbd_types.h index ac98ab6ccd3b..a600e0eb6b6f 100644 --- a/drivers/block/rbd_types.h +++ b/drivers/block/rbd_types.h @@ -93,7 +93,7 @@ struct rbd_image_header_ondisk { __le32 snap_count; __le32 reserved; __le64 snap_names_len; - struct rbd_image_snap_ondisk snaps[0]; + struct rbd_image_snap_ondisk snaps[]; } __attribute__((packed)); diff --git a/include/linux/ceph/mon_client.h b/include/linux/ceph/mon_client.h index dbb8a6959a73..ce4ffeb384d7 100644 --- a/include/linux/ceph/mon_client.h +++ b/include/linux/ceph/mon_client.h @@ -19,7 +19,7 @@ struct ceph_monmap { struct ceph_fsid fsid; u32 epoch; u32 num_mon; - struct ceph_entity_inst mon_inst[0]; + struct ceph_entity_inst mon_inst[]; }; struct ceph_mon_client; diff --git a/include/linux/crush/crush.h b/include/linux/crush/crush.h index 54741295c70b..38b0e4d50ed9 100644 --- a/include/linux/crush/crush.h +++ b/include/linux/crush/crush.h @@ -87,7 +87,7 @@ struct crush_rule_mask { struct crush_rule { __u32 len; struct crush_rule_mask mask; - struct crush_rule_step steps[0]; + struct crush_rule_step steps[]; }; #define crush_rule_size(len) (sizeof(struct crush_rule) + \ -- cgit v1.2.3 From 8a4b863c876d9f135fa00cfe65774c3740970303 Mon Sep 17 00:00:00 2001 From: Ilya Dryomov Date: Tue, 19 May 2020 16:46:47 +0200 Subject: libceph: add non-asserting rbtree insertion helper Needed for the next commit and useful for ceph_pg_pool_info tree as well. I'm leaving the asserting helper in for now, but we should look at getting rid of it in the future. Signed-off-by: Ilya Dryomov Reviewed-by: Jeff Layton --- include/linux/ceph/libceph.h | 10 ++++++-- net/ceph/osdmap.c | 60 ++++++++------------------------------------ 2 files changed, 19 insertions(+), 51 deletions(-) (limited to 'include/linux') diff --git a/include/linux/ceph/libceph.h b/include/linux/ceph/libceph.h index 525b7c3f1c81..4b5a47bcaba4 100644 --- a/include/linux/ceph/libceph.h +++ b/include/linux/ceph/libceph.h @@ -188,7 +188,7 @@ static inline int calc_pages_for(u64 off, u64 len) #define RB_CMP3WAY(a, b) ((a) < (b) ? 
-1 : (a) > (b)) #define DEFINE_RB_INSDEL_FUNCS2(name, type, keyfld, cmpexp, keyexp, nodefld) \ -static void insert_##name(struct rb_root *root, type *t) \ +static bool __insert_##name(struct rb_root *root, type *t) \ { \ struct rb_node **n = &root->rb_node; \ struct rb_node *parent = NULL; \ @@ -206,11 +206,17 @@ static void insert_##name(struct rb_root *root, type *t) \ else if (cmp > 0) \ n = &(*n)->rb_right; \ else \ - BUG(); \ + return false; \ } \ \ rb_link_node(&t->nodefld, parent, n); \ rb_insert_color(&t->nodefld, root); \ + return true; \ +} \ +static void __maybe_unused insert_##name(struct rb_root *root, type *t) \ +{ \ + if (!__insert_##name(root, t)) \ + BUG(); \ } \ static void erase_##name(struct rb_root *root, type *t) \ { \ diff --git a/net/ceph/osdmap.c b/net/ceph/osdmap.c index 2a6e63a8edbe..5d00ce2b5339 100644 --- a/net/ceph/osdmap.c +++ b/net/ceph/osdmap.c @@ -636,48 +636,11 @@ DEFINE_RB_FUNCS2(pg_mapping, struct ceph_pg_mapping, pgid, ceph_pg_compare, /* * rbtree of pg pool info */ -static int __insert_pg_pool(struct rb_root *root, struct ceph_pg_pool_info *new) -{ - struct rb_node **p = &root->rb_node; - struct rb_node *parent = NULL; - struct ceph_pg_pool_info *pi = NULL; - - while (*p) { - parent = *p; - pi = rb_entry(parent, struct ceph_pg_pool_info, node); - if (new->id < pi->id) - p = &(*p)->rb_left; - else if (new->id > pi->id) - p = &(*p)->rb_right; - else - return -EEXIST; - } - - rb_link_node(&new->node, parent, p); - rb_insert_color(&new->node, root); - return 0; -} - -static struct ceph_pg_pool_info *__lookup_pg_pool(struct rb_root *root, u64 id) -{ - struct ceph_pg_pool_info *pi; - struct rb_node *n = root->rb_node; - - while (n) { - pi = rb_entry(n, struct ceph_pg_pool_info, node); - if (id < pi->id) - n = n->rb_left; - else if (id > pi->id) - n = n->rb_right; - else - return pi; - } - return NULL; -} +DEFINE_RB_FUNCS(pg_pool, struct ceph_pg_pool_info, id, node) struct ceph_pg_pool_info *ceph_pg_pool_by_id(struct ceph_osdmap *map, u64 id) { - return __lookup_pg_pool(&map->pg_pools, id); + return lookup_pg_pool(&map->pg_pools, id); } const char *ceph_pg_pool_name_by_id(struct ceph_osdmap *map, u64 id) @@ -690,8 +653,7 @@ const char *ceph_pg_pool_name_by_id(struct ceph_osdmap *map, u64 id) if (WARN_ON_ONCE(id > (u64) INT_MAX)) return NULL; - pi = __lookup_pg_pool(&map->pg_pools, (int) id); - + pi = lookup_pg_pool(&map->pg_pools, id); return pi ? pi->name : NULL; } EXPORT_SYMBOL(ceph_pg_pool_name_by_id); @@ -714,14 +676,14 @@ u64 ceph_pg_pool_flags(struct ceph_osdmap *map, u64 id) { struct ceph_pg_pool_info *pi; - pi = __lookup_pg_pool(&map->pg_pools, id); + pi = lookup_pg_pool(&map->pg_pools, id); return pi ? 
pi->flags : 0; } EXPORT_SYMBOL(ceph_pg_pool_flags); static void __remove_pg_pool(struct rb_root *root, struct ceph_pg_pool_info *pi) { - rb_erase(&pi->node, root); + erase_pg_pool(root, pi); kfree(pi->name); kfree(pi); } @@ -903,7 +865,7 @@ static int decode_pool_names(void **p, void *end, struct ceph_osdmap *map) ceph_decode_32_safe(p, end, len, bad); dout(" pool %llu len %d\n", pool, len); ceph_decode_need(p, end, len, bad); - pi = __lookup_pg_pool(&map->pg_pools, pool); + pi = lookup_pg_pool(&map->pg_pools, pool); if (pi) { char *name = kstrndup(*p, len, GFP_NOFS); @@ -1154,18 +1116,18 @@ static int __decode_pools(void **p, void *end, struct ceph_osdmap *map, ceph_decode_64_safe(p, end, pool, e_inval); - pi = __lookup_pg_pool(&map->pg_pools, pool); + pi = lookup_pg_pool(&map->pg_pools, pool); if (!incremental || !pi) { pi = kzalloc(sizeof(*pi), GFP_NOFS); if (!pi) return -ENOMEM; + RB_CLEAR_NODE(&pi->node); pi->id = pool; - ret = __insert_pg_pool(&map->pg_pools, pi); - if (ret) { + if (!__insert_pg_pool(&map->pg_pools, pi)) { kfree(pi); - return ret; + return -EEXIST; } } @@ -1829,7 +1791,7 @@ struct ceph_osdmap *osdmap_apply_incremental(void **p, void *end, struct ceph_pg_pool_info *pi; ceph_decode_64_safe(p, end, pool, e_inval); - pi = __lookup_pg_pool(&map->pg_pools, pool); + pi = lookup_pg_pool(&map->pg_pools, pool); if (pi) __remove_pg_pool(&map->pg_pools, pi); } -- cgit v1.2.3 From 86403a92c3c5c6c395983fdbfc5e2f29dc39279b Mon Sep 17 00:00:00 2001 From: Ilya Dryomov Date: Tue, 19 May 2020 17:09:52 +0200 Subject: libceph: decode CRUSH device/bucket types and names These would be matched with the provided client location to calculate the locality value. Signed-off-by: Ilya Dryomov Reviewed-by: Jeff Layton --- include/linux/crush/crush.h | 12 +++++++ net/ceph/crush/crush.c | 3 +- net/ceph/osdmap.c | 85 +++++++++++++++++++++++++++++++++++++++++++-- 3 files changed, 97 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/crush/crush.h b/include/linux/crush/crush.h index 38b0e4d50ed9..33c16f2de7f6 100644 --- a/include/linux/crush/crush.h +++ b/include/linux/crush/crush.h @@ -301,6 +301,12 @@ struct crush_map { __u32 *choose_tries; #else + /* device/bucket type id -> type name (CrushWrapper::type_map) */ + struct rb_root type_names; + + /* device/bucket id -> name (CrushWrapper::name_map) */ + struct rb_root names; + /* CrushWrapper::choose_args */ struct rb_root choose_args; #endif @@ -342,4 +348,10 @@ struct crush_work { struct crush_work_bucket **work; /* Per-bucket working store */ }; +#ifdef __KERNEL__ +/* osdmap.c */ +void clear_crush_names(struct rb_root *root); +void clear_choose_args(struct crush_map *c); +#endif + #endif diff --git a/net/ceph/crush/crush.c b/net/ceph/crush/crush.c index 3d70244bc1b6..254ded0b05f6 100644 --- a/net/ceph/crush/crush.c +++ b/net/ceph/crush/crush.c @@ -2,7 +2,6 @@ #ifdef __KERNEL__ # include # include -void clear_choose_args(struct crush_map *c); #else # include "crush_compat.h" # include "crush.h" @@ -130,6 +129,8 @@ void crush_destroy(struct crush_map *map) #ifndef __KERNEL__ kfree(map->choose_tries); #else + clear_crush_names(&map->type_names); + clear_crush_names(&map->names); clear_choose_args(map); #endif kfree(map); diff --git a/net/ceph/osdmap.c b/net/ceph/osdmap.c index 5d00ce2b5339..e74130876d3a 100644 --- a/net/ceph/osdmap.c +++ b/net/ceph/osdmap.c @@ -138,6 +138,79 @@ bad: return -EINVAL; } +struct crush_name_node { + struct rb_node cn_node; + int cn_id; + char cn_name[]; +}; + +static struct 
crush_name_node *alloc_crush_name(size_t name_len) +{ + struct crush_name_node *cn; + + cn = kmalloc(sizeof(*cn) + name_len + 1, GFP_NOIO); + if (!cn) + return NULL; + + RB_CLEAR_NODE(&cn->cn_node); + return cn; +} + +static void free_crush_name(struct crush_name_node *cn) +{ + WARN_ON(!RB_EMPTY_NODE(&cn->cn_node)); + + kfree(cn); +} + +DEFINE_RB_FUNCS(crush_name, struct crush_name_node, cn_id, cn_node) + +static int decode_crush_names(void **p, void *end, struct rb_root *root) +{ + u32 n; + + ceph_decode_32_safe(p, end, n, e_inval); + while (n--) { + struct crush_name_node *cn; + int id; + u32 name_len; + + ceph_decode_32_safe(p, end, id, e_inval); + ceph_decode_32_safe(p, end, name_len, e_inval); + ceph_decode_need(p, end, name_len, e_inval); + + cn = alloc_crush_name(name_len); + if (!cn) + return -ENOMEM; + + cn->cn_id = id; + memcpy(cn->cn_name, *p, name_len); + cn->cn_name[name_len] = '\0'; + *p += name_len; + + if (!__insert_crush_name(root, cn)) { + free_crush_name(cn); + return -EEXIST; + } + } + + return 0; + +e_inval: + return -EINVAL; +} + +void clear_crush_names(struct rb_root *root) +{ + while (!RB_EMPTY_ROOT(root)) { + struct crush_name_node *cn = + rb_entry(rb_first(root), struct crush_name_node, cn_node); + + erase_crush_name(root, cn); + free_crush_name(cn); + } +} + static struct crush_choose_arg_map *alloc_choose_arg_map(void) { struct crush_choose_arg_map *arg_map; @@ -354,6 +427,8 @@ static struct crush_map *crush_decode(void *pbyval, void *end) if (c == NULL) return ERR_PTR(-ENOMEM); + c->type_names = RB_ROOT; + c->names = RB_ROOT; c->choose_args = RB_ROOT; /* set tunables to default values */ @@ -510,8 +585,14 @@ static struct crush_map *crush_decode(void *pbyval, void *end) } } - ceph_decode_skip_map(p, end, 32, string, bad); /* type_map */ - ceph_decode_skip_map(p, end, 32, string, bad); /* name_map */ + err = decode_crush_names(p, end, &c->type_names); + if (err) + goto fail; + + err = decode_crush_names(p, end, &c->names); + if (err) + goto fail; + ceph_decode_skip_map(p, end, 32, string, bad); /* rule_name_map */ /* tunables */ -- cgit v1.2.3 From 45e6aa9f5592cd127367074f4822039cd8a825c3 Mon Sep 17 00:00:00 2001 From: Ilya Dryomov Date: Fri, 22 May 2020 15:24:53 +0200 Subject: libceph: crush_location infrastructure Allow expressing client's location in terms of CRUSH hierarchy as a set of (bucket type name, bucket name) pairs. The userspace syntax "crush_location = key1=value1 key2=value2" is incompatible with mount options and needed adaptation. Key-value pairs are separated by '|' and we use ':' instead of '=' to separate keys from values. 
So for: crush_location = host=foo rack=bar one would write: crush_location=host:foo|rack:bar As in userspace, "multipath" locations are supported, so indicating locality for parallel hierarchies is possible: crush_location=rack:foo1|rack:foo2|datacenter:bar Signed-off-by: Ilya Dryomov Reviewed-by: Jeff Layton --- include/linux/ceph/libceph.h | 1 + include/linux/ceph/osdmap.h | 16 +++++- net/ceph/ceph_common.c | 36 ++++++++++++++ net/ceph/osdmap.c | 116 +++++++++++++++++++++++++++++++++++++++++++ 4 files changed, 168 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/ceph/libceph.h b/include/linux/ceph/libceph.h index 4b5a47bcaba4..4733959f1ec7 100644 --- a/include/linux/ceph/libceph.h +++ b/include/linux/ceph/libceph.h @@ -64,6 +64,7 @@ struct ceph_options { int num_mon; char *name; struct ceph_crypto_key *key; + struct rb_root crush_locs; }; /* diff --git a/include/linux/ceph/osdmap.h b/include/linux/ceph/osdmap.h index 5e601975745f..8c9d18cc9f45 100644 --- a/include/linux/ceph/osdmap.h +++ b/include/linux/ceph/osdmap.h @@ -302,9 +302,23 @@ bool ceph_pg_to_primary_shard(struct ceph_osdmap *osdmap, int ceph_pg_to_acting_primary(struct ceph_osdmap *osdmap, const struct ceph_pg *raw_pgid); +struct crush_loc { + char *cl_type_name; + char *cl_name; +}; + +struct crush_loc_node { + struct rb_node cl_node; + struct crush_loc cl_loc; /* pointers into cl_data */ + char cl_data[]; +}; + +int ceph_parse_crush_location(char *crush_location, struct rb_root *locs); +int ceph_compare_crush_locs(struct rb_root *locs1, struct rb_root *locs2); +void ceph_clear_crush_locs(struct rb_root *locs); + extern struct ceph_pg_pool_info *ceph_pg_pool_by_id(struct ceph_osdmap *map, u64 id); - extern const char *ceph_pg_pool_name_by_id(struct ceph_osdmap *map, u64 id); extern int ceph_pg_poolid_by_name(struct ceph_osdmap *map, const char *name); u64 ceph_pg_pool_flags(struct ceph_osdmap *map, u64 id); diff --git a/net/ceph/ceph_common.c b/net/ceph/ceph_common.c index a0e97f6c1072..44770b60bc38 100644 --- a/net/ceph/ceph_common.c +++ b/net/ceph/ceph_common.c @@ -176,6 +176,10 @@ int ceph_compare_options(struct ceph_options *new_opt, } } + ret = ceph_compare_crush_locs(&opt1->crush_locs, &opt2->crush_locs); + if (ret) + return ret; + /* any matching mon ip implies a match */ for (i = 0; i < opt1->num_mon; i++) { if (ceph_monmap_contains(client->monc.monmap, @@ -260,6 +264,7 @@ enum { Opt_secret, Opt_key, Opt_ip, + Opt_crush_location, /* string args above */ Opt_share, Opt_crc, @@ -274,6 +279,7 @@ static const struct fs_parameter_spec ceph_parameters[] = { fsparam_flag_no ("cephx_require_signatures", Opt_cephx_require_signatures), fsparam_flag_no ("cephx_sign_messages", Opt_cephx_sign_messages), fsparam_flag_no ("crc", Opt_crc), + fsparam_string ("crush_location", Opt_crush_location), fsparam_string ("fsid", Opt_fsid), fsparam_string ("ip", Opt_ip), fsparam_string ("key", Opt_key), @@ -298,6 +304,7 @@ struct ceph_options *ceph_alloc_options(void) if (!opt) return NULL; + opt->crush_locs = RB_ROOT; opt->mon_addr = kcalloc(CEPH_MAX_MON, sizeof(*opt->mon_addr), GFP_KERNEL); if (!opt->mon_addr) { @@ -320,6 +327,7 @@ void ceph_destroy_options(struct ceph_options *opt) if (!opt) return; + ceph_clear_crush_locs(&opt->crush_locs); kfree(opt->name); if (opt->key) { ceph_crypto_key_destroy(opt->key); @@ -454,6 +462,16 @@ int ceph_parse_param(struct fs_parameter *param, struct ceph_options *opt, if (!opt->key) return -ENOMEM; return get_secret(opt->key, param->string, &log); + case 
Opt_crush_location: + ceph_clear_crush_locs(&opt->crush_locs); + err = ceph_parse_crush_location(param->string, + &opt->crush_locs); + if (err) { + error_plog(&log, "Failed to parse CRUSH location: %d", + err); + return err; + } + break; case Opt_osdtimeout: warn_plog(&log, "Ignoring osdtimeout"); @@ -536,6 +554,7 @@ int ceph_print_client_options(struct seq_file *m, struct ceph_client *client, { struct ceph_options *opt = client->options; size_t pos = m->count; + struct rb_node *n; if (opt->name) { seq_puts(m, "name="); @@ -545,6 +564,23 @@ int ceph_print_client_options(struct seq_file *m, struct ceph_client *client, if (opt->key) seq_puts(m, "secret=,"); + if (!RB_EMPTY_ROOT(&opt->crush_locs)) { + seq_puts(m, "crush_location="); + for (n = rb_first(&opt->crush_locs); ; ) { + struct crush_loc_node *loc = + rb_entry(n, struct crush_loc_node, cl_node); + + seq_printf(m, "%s:%s", loc->cl_loc.cl_type_name, + loc->cl_loc.cl_name); + n = rb_next(n); + if (!n) + break; + + seq_putc(m, '|'); + } + seq_putc(m, ','); + } + if (opt->flags & CEPH_OPT_FSID) seq_printf(m, "fsid=%pU,", &opt->fsid); if (opt->flags & CEPH_OPT_NOSHARE) diff --git a/net/ceph/osdmap.c b/net/ceph/osdmap.c index e74130876d3a..4b81334e9e5b 100644 --- a/net/ceph/osdmap.c +++ b/net/ceph/osdmap.c @@ -2715,3 +2715,119 @@ int ceph_pg_to_acting_primary(struct ceph_osdmap *osdmap, return acting.primary; } EXPORT_SYMBOL(ceph_pg_to_acting_primary); + +static struct crush_loc_node *alloc_crush_loc(size_t type_name_len, + size_t name_len) +{ + struct crush_loc_node *loc; + + loc = kmalloc(sizeof(*loc) + type_name_len + name_len + 2, GFP_NOIO); + if (!loc) + return NULL; + + RB_CLEAR_NODE(&loc->cl_node); + return loc; +} + +static void free_crush_loc(struct crush_loc_node *loc) +{ + WARN_ON(!RB_EMPTY_NODE(&loc->cl_node)); + + kfree(loc); +} + +static int crush_loc_compare(const struct crush_loc *loc1, + const struct crush_loc *loc2) +{ + return strcmp(loc1->cl_type_name, loc2->cl_type_name) ?: + strcmp(loc1->cl_name, loc2->cl_name); +} + +DEFINE_RB_FUNCS2(crush_loc, struct crush_loc_node, cl_loc, crush_loc_compare, + RB_BYPTR, const struct crush_loc *, cl_node) + +/* + * Parses a set of ':' pairs separated + * by '|', e.g. "rack:foo1|rack:foo2|datacenter:bar". + * + * Note that @crush_location is modified by strsep(). 
+ */ +int ceph_parse_crush_location(char *crush_location, struct rb_root *locs) +{ + struct crush_loc_node *loc; + const char *type_name, *name, *colon; + size_t type_name_len, name_len; + + dout("%s '%s'\n", __func__, crush_location); + while ((type_name = strsep(&crush_location, "|"))) { + colon = strchr(type_name, ':'); + if (!colon) + return -EINVAL; + + type_name_len = colon - type_name; + if (type_name_len == 0) + return -EINVAL; + + name = colon + 1; + name_len = strlen(name); + if (name_len == 0) + return -EINVAL; + + loc = alloc_crush_loc(type_name_len, name_len); + if (!loc) + return -ENOMEM; + + loc->cl_loc.cl_type_name = loc->cl_data; + memcpy(loc->cl_loc.cl_type_name, type_name, type_name_len); + loc->cl_loc.cl_type_name[type_name_len] = '\0'; + + loc->cl_loc.cl_name = loc->cl_data + type_name_len + 1; + memcpy(loc->cl_loc.cl_name, name, name_len); + loc->cl_loc.cl_name[name_len] = '\0'; + + if (!__insert_crush_loc(locs, loc)) { + free_crush_loc(loc); + return -EEXIST; + } + + dout("%s type_name '%s' name '%s'\n", __func__, + loc->cl_loc.cl_type_name, loc->cl_loc.cl_name); + } + + return 0; +} + +int ceph_compare_crush_locs(struct rb_root *locs1, struct rb_root *locs2) +{ + struct rb_node *n1 = rb_first(locs1); + struct rb_node *n2 = rb_first(locs2); + int ret; + + for ( ; n1 && n2; n1 = rb_next(n1), n2 = rb_next(n2)) { + struct crush_loc_node *loc1 = + rb_entry(n1, struct crush_loc_node, cl_node); + struct crush_loc_node *loc2 = + rb_entry(n2, struct crush_loc_node, cl_node); + + ret = crush_loc_compare(&loc1->cl_loc, &loc2->cl_loc); + if (ret) + return ret; + } + + if (!n1 && n2) + return -1; + if (n1 && !n2) + return 1; + return 0; +} + +void ceph_clear_crush_locs(struct rb_root *locs) +{ + while (!RB_EMPTY_ROOT(locs)) { + struct crush_loc_node *loc = + rb_entry(rb_first(locs), struct crush_loc_node, cl_node); + + erase_crush_loc(locs, loc); + free_crush_loc(loc); + } +} -- cgit v1.2.3 From 117d96a04f007ce8fc2e292369056c3bd09f6f63 Mon Sep 17 00:00:00 2001 From: Ilya Dryomov Date: Sat, 23 May 2020 11:45:48 +0200 Subject: libceph: support for balanced and localized reads OSD-side issues with reads from replica have been resolved in Octopus. Reading from replica should be safe wrt. unstable or uncommitted state now, so add support for balanced and localized reads. 
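For orientation (editorial sketch, not part of this patch): a read request opts into this behavior by carrying one of the pre-existing rados flags, which calc_target() below then acts on:

    /* Sketch: make this read eligible for replica reads. */
    req->r_flags = CEPH_OSD_FLAG_READ |
                   CEPH_OSD_FLAG_BALANCE_READS; /* or CEPH_OSD_FLAG_LOCALIZE_READS */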
There are two cases when a read from replica can't be served: - OSD may silently drop the request, expecting the client to notice that the acting set has changed and resend via the usual means (handled with t->used_replica) - OSD may return EAGAIN, expecting the client to resend to the primary, ignoring replica read flags (see handle_reply()) Signed-off-by: Ilya Dryomov Reviewed-by: Jeff Layton --- include/linux/ceph/osd_client.h | 1 + include/linux/ceph/osdmap.h | 3 ++ net/ceph/debugfs.c | 6 ++- net/ceph/osd_client.c | 87 ++++++++++++++++++++++++++++++++-- net/ceph/osdmap.c | 102 ++++++++++++++++++++++++++++++++++++++++ 5 files changed, 193 insertions(+), 6 deletions(-) (limited to 'include/linux') diff --git a/include/linux/ceph/osd_client.h b/include/linux/ceph/osd_client.h index 734f7c6a9f56..671fb93e8c60 100644 --- a/include/linux/ceph/osd_client.h +++ b/include/linux/ceph/osd_client.h @@ -165,6 +165,7 @@ struct ceph_osd_request_target { bool recovery_deletes; unsigned int flags; /* CEPH_OSD_FLAG_* */ + bool used_replica; bool paused; u32 epoch; diff --git a/include/linux/ceph/osdmap.h b/include/linux/ceph/osdmap.h index 8c9d18cc9f45..3f4498fef6ad 100644 --- a/include/linux/ceph/osdmap.h +++ b/include/linux/ceph/osdmap.h @@ -317,6 +317,9 @@ int ceph_parse_crush_location(char *crush_location, struct rb_root *locs); int ceph_compare_crush_locs(struct rb_root *locs1, struct rb_root *locs2); void ceph_clear_crush_locs(struct rb_root *locs); +int ceph_get_crush_locality(struct ceph_osdmap *osdmap, int id, + struct rb_root *locs); + extern struct ceph_pg_pool_info *ceph_pg_pool_by_id(struct ceph_osdmap *map, u64 id); extern const char *ceph_pg_pool_name_by_id(struct ceph_osdmap *map, u64 id); diff --git a/net/ceph/debugfs.c b/net/ceph/debugfs.c index 1344f232ecc5..409d505ff320 100644 --- a/net/ceph/debugfs.c +++ b/net/ceph/debugfs.c @@ -81,11 +81,13 @@ static int osdmap_show(struct seq_file *s, void *p) u32 state = map->osd_state[i]; char sb[64]; - seq_printf(s, "osd%d\t%s\t%3d%%\t(%s)\t%3d%%\n", + seq_printf(s, "osd%d\t%s\t%3d%%\t(%s)\t%3d%%\t%2d\n", i, ceph_pr_addr(addr), ((map->osd_weight[i]*100) >> 16), ceph_osdmap_state_str(sb, sizeof(sb), state), - ((ceph_get_primary_affinity(map, i)*100) >> 16)); + ((ceph_get_primary_affinity(map, i)*100) >> 16), + ceph_get_crush_locality(map, i, + &client->options->crush_locs)); } for (n = rb_first(&map->pg_temp); n; n = rb_next(n)) { struct ceph_pg_mapping *pg = diff --git a/net/ceph/osd_client.c b/net/ceph/osd_client.c index ece124a5138e..4ce6cdc744e4 100644 --- a/net/ceph/osd_client.c +++ b/net/ceph/osd_client.c @@ -1497,6 +1497,45 @@ static bool target_should_be_paused(struct ceph_osd_client *osdc, (osdc->osdmap->epoch < osdc->epoch_barrier); } +static int pick_random_replica(const struct ceph_osds *acting) +{ + int i = prandom_u32() % acting->size; + + dout("%s picked osd%d, primary osd%d\n", __func__, + acting->osds[i], acting->primary); + return i; +} + +/* + * Picks the closest replica based on client's location given by + * crush_location option. Prefers the primary if the locality is + * the same. 
+ */ +static int pick_closest_replica(struct ceph_osd_client *osdc, + const struct ceph_osds *acting) +{ + struct ceph_options *opt = osdc->client->options; + int best_i, best_locality; + int i = 0, locality; + + do { + locality = ceph_get_crush_locality(osdc->osdmap, + acting->osds[i], + &opt->crush_locs); + if (i == 0 || + (locality >= 0 && best_locality < 0) || + (locality >= 0 && best_locality >= 0 && + locality < best_locality)) { + best_i = i; + best_locality = locality; + } + } while (++i < acting->size); + + dout("%s picked osd%d with locality %d, primary osd%d\n", __func__, + acting->osds[best_i], best_locality, acting->primary); + return best_i; +} + enum calc_target_result { CALC_TARGET_NO_ACTION = 0, CALC_TARGET_NEED_RESEND, @@ -1510,6 +1549,8 @@ static enum calc_target_result calc_target(struct ceph_osd_client *osdc, struct ceph_pg_pool_info *pi; struct ceph_pg pgid, last_pgid; struct ceph_osds up, acting; + bool is_read = t->flags & CEPH_OSD_FLAG_READ; + bool is_write = t->flags & CEPH_OSD_FLAG_WRITE; bool force_resend = false; bool unpaused = false; bool legacy_change = false; @@ -1540,9 +1581,9 @@ static enum calc_target_result calc_target(struct ceph_osd_client *osdc, ceph_oid_copy(&t->target_oid, &t->base_oid); ceph_oloc_copy(&t->target_oloc, &t->base_oloc); if ((t->flags & CEPH_OSD_FLAG_IGNORE_OVERLAY) == 0) { - if (t->flags & CEPH_OSD_FLAG_READ && pi->read_tier >= 0) + if (is_read && pi->read_tier >= 0) t->target_oloc.pool = pi->read_tier; - if (t->flags & CEPH_OSD_FLAG_WRITE && pi->write_tier >= 0) + if (is_write && pi->write_tier >= 0) t->target_oloc.pool = pi->write_tier; pi = ceph_pg_pool_by_id(osdc->osdmap, t->target_oloc.pool); @@ -1581,7 +1622,8 @@ static enum calc_target_result calc_target(struct ceph_osd_client *osdc, unpaused = true; } legacy_change = ceph_pg_compare(&t->pgid, &pgid) || - ceph_osds_changed(&t->acting, &acting, any_change); + ceph_osds_changed(&t->acting, &acting, + t->used_replica || any_change); if (t->pg_num) split = ceph_pg_is_split(&last_pgid, t->pg_num, pi->pg_num); @@ -1597,7 +1639,24 @@ static enum calc_target_result calc_target(struct ceph_osd_client *osdc, t->sort_bitwise = sort_bitwise; t->recovery_deletes = recovery_deletes; - t->osd = acting.primary; + if ((t->flags & (CEPH_OSD_FLAG_BALANCE_READS | + CEPH_OSD_FLAG_LOCALIZE_READS)) && + !is_write && pi->type == CEPH_POOL_TYPE_REP && + acting.size > 1) { + int pos; + + WARN_ON(!is_read || acting.osds[0] != acting.primary); + if (t->flags & CEPH_OSD_FLAG_BALANCE_READS) { + pos = pick_random_replica(&acting); + } else { + pos = pick_closest_replica(osdc, &acting); + } + t->osd = acting.osds[pos]; + t->used_replica = pos > 0; + } else { + t->osd = acting.primary; + t->used_replica = false; + } } if (unpaused || legacy_change || force_resend || split) @@ -3660,6 +3719,26 @@ static void handle_reply(struct ceph_osd *osd, struct ceph_msg *msg) goto out_unlock_osdc; } + if (m.result == -EAGAIN) { + dout("req %p tid %llu EAGAIN\n", req, req->r_tid); + unlink_request(osd, req); + mutex_unlock(&osd->lock); + + /* + * The object is missing on the replica or not (yet) + * readable. Clear pgid to force a resend to the primary + * via legacy_change. 
+ */ + req->r_t.pgid.pool = 0; + req->r_t.pgid.seed = 0; + WARN_ON(!req->r_t.used_replica); + req->r_flags &= ~(CEPH_OSD_FLAG_BALANCE_READS | + CEPH_OSD_FLAG_LOCALIZE_READS); + req->r_tid = 0; + __submit_request(req, false); + goto out_unlock_osdc; + } + if (m.num_ops != req->r_num_ops) { pr_err("num_ops %d != %d for tid %llu\n", m.num_ops, req->r_num_ops, req->r_tid); diff --git a/net/ceph/osdmap.c b/net/ceph/osdmap.c index 4b81334e9e5b..96c25f5e064a 100644 --- a/net/ceph/osdmap.c +++ b/net/ceph/osdmap.c @@ -2831,3 +2831,105 @@ void ceph_clear_crush_locs(struct rb_root *locs) free_crush_loc(loc); } } + +/* + * [a-zA-Z0-9-_.]+ + */ +static bool is_valid_crush_name(const char *name) +{ + do { + if (!('a' <= *name && *name <= 'z') && + !('A' <= *name && *name <= 'Z') && + !('0' <= *name && *name <= '9') && + *name != '-' && *name != '_' && *name != '.') + return false; + } while (*++name != '\0'); + + return true; +} + +/* + * Gets the parent of an item. Returns its id (<0 because the + * parent is always a bucket), type id (>0 for the same reason, + * via @parent_type_id) and location (via @parent_loc). If no + * parent, returns 0. + * + * Does a linear search, as there are no parent pointers of any + * kind. Note that the result is ambigous for items that occur + * multiple times in the map. + */ +static int get_immediate_parent(struct crush_map *c, int id, + u16 *parent_type_id, + struct crush_loc *parent_loc) +{ + struct crush_bucket *b; + struct crush_name_node *type_cn, *cn; + int i, j; + + for (i = 0; i < c->max_buckets; i++) { + b = c->buckets[i]; + if (!b) + continue; + + /* ignore per-class shadow hierarchy */ + cn = lookup_crush_name(&c->names, b->id); + if (!cn || !is_valid_crush_name(cn->cn_name)) + continue; + + for (j = 0; j < b->size; j++) { + if (b->items[j] != id) + continue; + + *parent_type_id = b->type; + type_cn = lookup_crush_name(&c->type_names, b->type); + parent_loc->cl_type_name = type_cn->cn_name; + parent_loc->cl_name = cn->cn_name; + return b->id; + } + } + + return 0; /* no parent */ +} + +/* + * Calculates the locality/distance from an item to a client + * location expressed in terms of CRUSH hierarchy as a set of + * (bucket type name, bucket name) pairs. Specifically, looks + * for the lowest-valued bucket type for which the location of + * @id matches one of the locations in @locs, so for standard + * bucket types (host = 1, rack = 3, datacenter = 8, zone = 9) + * a matching host is closer than a matching rack and a matching + * data center is closer than a matching zone. + * + * Specifying multiple locations (a "multipath" location) such + * as "rack=foo1 rack=foo2 datacenter=bar" is allowed -- @locs + * is a multimap. The locality will be: + * + * - 3 for OSDs in racks foo1 and foo2 + * - 8 for OSDs in data center bar + * - -1 for all other OSDs + * + * The lowest possible bucket type is 1, so the best locality + * for an OSD is 1 (i.e. a matching host). Locality 0 would be + * the OSD itself. + */ +int ceph_get_crush_locality(struct ceph_osdmap *osdmap, int id, + struct rb_root *locs) +{ + struct crush_loc loc; + u16 type_id; + + /* + * Instead of repeated get_immediate_parent() calls, + * the location of @id could be obtained with a single + * depth-first traversal. 
+ */ + for (;;) { + id = get_immediate_parent(osdmap->crush, id, &type_id, &loc); + if (id >= 0) + return -1; /* not local */ + + if (lookup_crush_loc(locs, &loc)) + return type_id; + } +} -- cgit v1.2.3 From 8ad44d5e0d1eda7e4a0ed382174888476dc81789 Mon Sep 17 00:00:00 2001 From: Ilya Dryomov Date: Sat, 23 May 2020 11:47:33 +0200 Subject: libceph: read_from_replica option Expose replica reads through read_from_replica=balance and read_from_replica=localize. The default is to read from primary (read_from_replica=no). Signed-off-by: Ilya Dryomov Reviewed-by: Jeff Layton --- include/linux/ceph/libceph.h | 2 ++ net/ceph/ceph_common.c | 39 +++++++++++++++++++++++++++++++++++++++ net/ceph/osd_client.c | 5 ++++- 3 files changed, 45 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/ceph/libceph.h b/include/linux/ceph/libceph.h index 4733959f1ec7..2247e71beb83 100644 --- a/include/linux/ceph/libceph.h +++ b/include/linux/ceph/libceph.h @@ -53,6 +53,8 @@ struct ceph_options { unsigned long osd_keepalive_timeout; /* jiffies */ unsigned long osd_request_timeout; /* jiffies */ + u32 osd_req_flags; /* CEPH_OSD_FLAG_*, applied to each OSD request */ + /* * any type that can't be simply compared or doesn't need * to be compared should go beyond this point, diff --git a/net/ceph/ceph_common.c b/net/ceph/ceph_common.c index 44770b60bc38..9bab3e9a039b 100644 --- a/net/ceph/ceph_common.c +++ b/net/ceph/ceph_common.c @@ -265,6 +265,7 @@ enum { Opt_key, Opt_ip, Opt_crush_location, + Opt_read_from_replica, /* string args above */ Opt_share, Opt_crc, @@ -274,6 +275,19 @@ enum { Opt_abort_on_full, }; +enum { + Opt_read_from_replica_no, + Opt_read_from_replica_balance, + Opt_read_from_replica_localize, +}; + +static const struct constant_table ceph_param_read_from_replica[] = { + {"no", Opt_read_from_replica_no}, + {"balance", Opt_read_from_replica_balance}, + {"localize", Opt_read_from_replica_localize}, + {} +}; + static const struct fs_parameter_spec ceph_parameters[] = { fsparam_flag ("abort_on_full", Opt_abort_on_full), fsparam_flag_no ("cephx_require_signatures", Opt_cephx_require_signatures), @@ -290,6 +304,8 @@ static const struct fs_parameter_spec ceph_parameters[] = { fsparam_u32 ("osdkeepalive", Opt_osdkeepalivetimeout), __fsparam (fs_param_is_s32, "osdtimeout", Opt_osdtimeout, fs_param_deprecated, NULL), + fsparam_enum ("read_from_replica", Opt_read_from_replica, + ceph_param_read_from_replica), fsparam_string ("secret", Opt_secret), fsparam_flag_no ("share", Opt_share), fsparam_flag_no ("tcp_nodelay", Opt_tcp_nodelay), @@ -472,6 +488,24 @@ int ceph_parse_param(struct fs_parameter *param, struct ceph_options *opt, return err; } break; + case Opt_read_from_replica: + switch (result.uint_32) { + case Opt_read_from_replica_no: + opt->osd_req_flags &= ~(CEPH_OSD_FLAG_BALANCE_READS | + CEPH_OSD_FLAG_LOCALIZE_READS); + break; + case Opt_read_from_replica_balance: + opt->osd_req_flags |= CEPH_OSD_FLAG_BALANCE_READS; + opt->osd_req_flags &= ~CEPH_OSD_FLAG_LOCALIZE_READS; + break; + case Opt_read_from_replica_localize: + opt->osd_req_flags |= CEPH_OSD_FLAG_LOCALIZE_READS; + opt->osd_req_flags &= ~CEPH_OSD_FLAG_BALANCE_READS; + break; + default: + BUG(); + } + break; case Opt_osdtimeout: warn_plog(&log, "Ignoring osdtimeout"); @@ -580,6 +614,11 @@ int ceph_print_client_options(struct seq_file *m, struct ceph_client *client, } seq_putc(m, ','); } + if (opt->osd_req_flags & CEPH_OSD_FLAG_BALANCE_READS) { + seq_puts(m, "read_from_replica=balance,"); + } else if (opt->osd_req_flags 
& CEPH_OSD_FLAG_LOCALIZE_READS) { + seq_puts(m, "read_from_replica=localize,"); + } if (opt->flags & CEPH_OPT_FSID) seq_printf(m, "fsid=%pU,", &opt->fsid); diff --git a/net/ceph/osd_client.c b/net/ceph/osd_client.c index 4ce6cdc744e4..22733e844be1 100644 --- a/net/ceph/osd_client.c +++ b/net/ceph/osd_client.c @@ -2425,11 +2425,14 @@ promote: static void account_request(struct ceph_osd_request *req) { + struct ceph_osd_client *osdc = req->r_osdc; + WARN_ON(req->r_flags & (CEPH_OSD_FLAG_ACK | CEPH_OSD_FLAG_ONDISK)); WARN_ON(!(req->r_flags & (CEPH_OSD_FLAG_READ | CEPH_OSD_FLAG_WRITE))); req->r_flags |= CEPH_OSD_FLAG_ONDISK; - atomic_inc(&req->r_osdc->num_requests); + req->r_flags |= osdc->client->options->osd_req_flags; + atomic_inc(&osdc->num_requests); req->r_start_stamp = jiffies; req->r_start_latency = ktime_get(); -- cgit v1.2.3 From d43be2554b58621a21cb5f54b32db2263b3008b6 Mon Sep 17 00:00:00 2001 From: Bernard Zhao Date: Mon, 27 Apr 2020 01:05:23 -0700 Subject: drivers: video: hdmi: cleanup coding style in video a bit Eliminate the magic numbers, add vendor infoframe size macro like other hdmi modules. Signed-off-by: Bernard Zhao Cc: Uma Shankar Cc: Ville Syrjala Cc: Shashank Sharma Cc: Laurent Pinchart Cc: Daniel Vetter Cc: opensource.kernel@vivo.com [b.zolnierkie: add "hdmi" to the patch summary] [b.zolnierkie: fix "vender" -> vendor" typo in the patch description] Signed-off-by: Bartlomiej Zolnierkiewicz Link: https://patchwork.freedesktop.org/patch/msgid/20200427080530.3234-1-bernard@vivo.com --- drivers/video/hdmi.c | 2 +- include/linux/hdmi.h | 1 + 2 files changed, 2 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/drivers/video/hdmi.c b/drivers/video/hdmi.c index 856a8c4e84a2..f693076f2e5f 100644 --- a/drivers/video/hdmi.c +++ b/drivers/video/hdmi.c @@ -495,7 +495,7 @@ int hdmi_vendor_infoframe_init(struct hdmi_vendor_infoframe *frame) * value */ frame->s3d_struct = HDMI_3D_STRUCTURE_INVALID; - frame->length = 4; + frame->length = HDMI_VENDOR_INFOFRAME_SIZE; return 0; } diff --git a/include/linux/hdmi.h b/include/linux/hdmi.h index 9613d796cfb1..ff25aeb95ee4 100644 --- a/include/linux/hdmi.h +++ b/include/linux/hdmi.h @@ -57,6 +57,7 @@ enum hdmi_infoframe_type { #define HDMI_SPD_INFOFRAME_SIZE 25 #define HDMI_AUDIO_INFOFRAME_SIZE 10 #define HDMI_DRM_INFOFRAME_SIZE 26 +#define HDMI_VENDOR_INFOFRAME_SIZE 4 #define HDMI_INFOFRAME_SIZE(type) \ (HDMI_INFOFRAME_HEADER_SIZE + HDMI_ ## type ## _INFOFRAME_SIZE) -- cgit v1.2.3 From c39ba6b3a8d47be07c180f857564a25a0356d336 Mon Sep 17 00:00:00 2001 From: Lai Jiangshan Date: Mon, 1 Jun 2020 08:44:39 +0000 Subject: workqueue: fix a piece of comment about reserved bits for work flags 8a2e8e5dec7e("workqueue: fix cwq->nr_active underflow") allocated one more bit from the work flags, and it updated partial of the comments (128 bytes -> 256 bytes), but it failed to update the info about the number of reserved bits. Signed-off-by: Lai Jiangshan Signed-off-by: Tejun Heo --- include/linux/workqueue.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/workqueue.h b/include/linux/workqueue.h index 8b505d22fc0e..26de0cae2a0a 100644 --- a/include/linux/workqueue.h +++ b/include/linux/workqueue.h @@ -62,7 +62,7 @@ enum { WORK_CPU_UNBOUND = NR_CPUS, /* - * Reserve 7 bits off of pwq pointer w/ debugobjects turned off. + * Reserve 8 bits off of pwq pointer w/ debugobjects turned off. * This makes pwqs aligned to 256 bytes and allows 15 workqueue * flush colors. 
*/ -- cgit v1.2.3 From abb30460bda232f304f642510adc8c6576ea51ea Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Mon, 1 Jun 2020 10:02:01 -0600 Subject: block: mark bio_wouldblock_error() bio with BIO_QUIET We really don't care about triggering buffer errors for this condition. This avoids a spew of: Buffer I/O error on dev sdc, logical block 785929, async page read Buffer I/O error on dev sdc, logical block 759095, async page read Buffer I/O error on dev sdc, logical block 766922, async page read Buffer I/O error on dev sdc, logical block 17659, async page read Buffer I/O error on dev sdc, logical block 637571, async page read Buffer I/O error on dev sdc, logical block 39241, async page read Buffer I/O error on dev sdc, logical block 397241, async page read Buffer I/O error on dev sdc, logical block 763992, async page read from -EAGAIN conditions on request allocation for async reads. Signed-off-by: Jens Axboe --- include/linux/bio.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/bio.h b/include/linux/bio.h index 941378ec5b39..683ff5fd8871 100644 --- a/include/linux/bio.h +++ b/include/linux/bio.h @@ -417,6 +417,7 @@ static inline void bio_io_error(struct bio *bio) static inline void bio_wouldblock_error(struct bio *bio) { + bio_set_flag(bio, BIO_QUIET); bio->bi_status = BLK_STS_AGAIN; bio_endio(bio); } -- cgit v1.2.3 From 708b2000362476c9c7a3571c0cc774dffb91836a Mon Sep 17 00:00:00 2001 From: Kuppuswamy Sathyanarayanan Date: Tue, 26 May 2020 16:18:29 -0700 Subject: PCI/AER: Remove HEST/FIRMWARE_FIRST parsing for AER ownership Commit c100beb9ccfb ("PCI/AER: Use only _OSC to determine AER ownership") removed the use of HEST in determining AER ownership, but the AER driver still used HEST to verify AER ownership in some of its APIs. Per the ACPI spec v6.3, sec 18.3.2.4, some HEST table entries contain a FIRMWARE_FIRST bit, but that bit does not tell us anything about ownership of the AER capability. Remove parsing of HEST to look for FIRMWARE_FIRST. Add pcie_aer_is_native() for the places that need to know whether the OS owns the AER capability. 
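As an editorial sketch of the resulting pattern (hypothetical caller, not part of this patch), code that needs to touch AER registers now asks a single question instead of walking HEST:

    /* Only poke AER registers if the OS, not firmware, owns them. */
    if (!pcie_aer_is_native(pdev))
            return -EIO;

    pcie_capability_set_word(pdev, PCI_EXP_DEVCTL, PCI_EXP_AER_FLAGS);

which mirrors what pci_enable_pcie_error_reporting() below does internally.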
[bhelgaas: commit log, reorder patch, remove unused __aer_firmware_first] Link: https://lore.kernel.org/r/9a37f53a4e6ff4942ff8e18dbb20b00e16c47341.1590534843.git.sathyanarayanan.kuppuswamy@linux.intel.com Signed-off-by: Kuppuswamy Sathyanarayanan Signed-off-by: Bjorn Helgaas --- drivers/pci/pcie/aer.c | 118 +++++---------------------------------------- drivers/pci/pcie/dpc.c | 2 +- drivers/pci/pcie/portdrv.h | 13 +---- include/linux/pci.h | 2 - 4 files changed, 14 insertions(+), 121 deletions(-) (limited to 'include/linux') diff --git a/drivers/pci/pcie/aer.c b/drivers/pci/pcie/aer.c index efc26773cc6d..803273ba30db 100644 --- a/drivers/pci/pcie/aer.c +++ b/drivers/pci/pcie/aer.c @@ -217,118 +217,22 @@ void pcie_ecrc_get_policy(char *str) } #endif /* CONFIG_PCIE_ECRC */ -#ifdef CONFIG_ACPI_APEI -static inline int hest_match_pci(struct acpi_hest_aer_common *p, - struct pci_dev *pci) -{ - return ACPI_HEST_SEGMENT(p->bus) == pci_domain_nr(pci->bus) && - ACPI_HEST_BUS(p->bus) == pci->bus->number && - p->device == PCI_SLOT(pci->devfn) && - p->function == PCI_FUNC(pci->devfn); -} - -static inline bool hest_match_type(struct acpi_hest_header *hest_hdr, - struct pci_dev *dev) -{ - u16 hest_type = hest_hdr->type; - u8 pcie_type = pci_pcie_type(dev); - - if ((hest_type == ACPI_HEST_TYPE_AER_ROOT_PORT && - pcie_type == PCI_EXP_TYPE_ROOT_PORT) || - (hest_type == ACPI_HEST_TYPE_AER_ENDPOINT && - pcie_type == PCI_EXP_TYPE_ENDPOINT) || - (hest_type == ACPI_HEST_TYPE_AER_BRIDGE && - (dev->class >> 16) == PCI_BASE_CLASS_BRIDGE)) - return true; - return false; -} - -struct aer_hest_parse_info { - struct pci_dev *pci_dev; - int firmware_first; -}; - -static int hest_source_is_pcie_aer(struct acpi_hest_header *hest_hdr) -{ - if (hest_hdr->type == ACPI_HEST_TYPE_AER_ROOT_PORT || - hest_hdr->type == ACPI_HEST_TYPE_AER_ENDPOINT || - hest_hdr->type == ACPI_HEST_TYPE_AER_BRIDGE) - return 1; - return 0; -} - -static int aer_hest_parse(struct acpi_hest_header *hest_hdr, void *data) -{ - struct aer_hest_parse_info *info = data; - struct acpi_hest_aer_common *p; - int ff; - - if (!hest_source_is_pcie_aer(hest_hdr)) - return 0; - - p = (struct acpi_hest_aer_common *)(hest_hdr + 1); - ff = !!(p->flags & ACPI_HEST_FIRMWARE_FIRST); - - /* - * If no specific device is supplied, determine whether - * FIRMWARE_FIRST is set for *any* PCIe device. 
- */ - if (!info->pci_dev) { - info->firmware_first |= ff; - return 0; - } - - /* Otherwise, check the specific device */ - if (p->flags & ACPI_HEST_GLOBAL) { - if (hest_match_type(hest_hdr, info->pci_dev)) - info->firmware_first = ff; - } else - if (hest_match_pci(p, info->pci_dev)) - info->firmware_first = ff; - - return 0; -} - -static void aer_set_firmware_first(struct pci_dev *pci_dev) -{ - int rc; - struct aer_hest_parse_info info = { - .pci_dev = pci_dev, - .firmware_first = 0, - }; - - rc = apei_hest_parse(aer_hest_parse, &info); - - if (rc) - pci_dev->__aer_firmware_first = 0; - else - pci_dev->__aer_firmware_first = info.firmware_first; - pci_dev->__aer_firmware_first_valid = 1; -} +#define PCI_EXP_AER_FLAGS (PCI_EXP_DEVCTL_CERE | PCI_EXP_DEVCTL_NFERE | \ + PCI_EXP_DEVCTL_FERE | PCI_EXP_DEVCTL_URRE) -int pcie_aer_get_firmware_first(struct pci_dev *dev) +int pcie_aer_is_native(struct pci_dev *dev) { - if (!pci_is_pcie(dev)) - return 0; + struct pci_host_bridge *host = pci_find_host_bridge(dev->bus); - if (pcie_ports_native) + if (!dev->aer_cap) return 0; - if (!dev->__aer_firmware_first_valid) - aer_set_firmware_first(dev); - return dev->__aer_firmware_first; + return pcie_ports_native || host->native_aer; } -#endif - -#define PCI_EXP_AER_FLAGS (PCI_EXP_DEVCTL_CERE | PCI_EXP_DEVCTL_NFERE | \ - PCI_EXP_DEVCTL_FERE | PCI_EXP_DEVCTL_URRE) int pci_enable_pcie_error_reporting(struct pci_dev *dev) { - if (pcie_aer_get_firmware_first(dev)) - return -EIO; - - if (!dev->aer_cap) + if (!pcie_aer_is_native(dev)) return -EIO; return pcie_capability_set_word(dev, PCI_EXP_DEVCTL, PCI_EXP_AER_FLAGS); @@ -337,7 +241,7 @@ EXPORT_SYMBOL_GPL(pci_enable_pcie_error_reporting); int pci_disable_pcie_error_reporting(struct pci_dev *dev) { - if (pcie_aer_get_firmware_first(dev)) + if (!pcie_aer_is_native(dev)) return -EIO; return pcie_capability_clear_word(dev, PCI_EXP_DEVCTL, @@ -362,7 +266,7 @@ int pci_aer_clear_nonfatal_status(struct pci_dev *dev) if (!pos) return -EIO; - if (pcie_aer_get_firmware_first(dev)) + if (!pcie_aer_is_native(dev)) return -EIO; /* Clear status bits for ERR_NONFATAL errors only */ @@ -385,7 +289,7 @@ void pci_aer_clear_fatal_status(struct pci_dev *dev) if (!pos) return; - if (pcie_aer_get_firmware_first(dev)) + if (!pcie_aer_is_native(dev)) return; /* Clear status bits for ERR_FATAL errors only */ @@ -435,7 +339,7 @@ int pci_aer_raw_clear_status(struct pci_dev *dev) int pci_aer_clear_status(struct pci_dev *dev) { - if (pcie_aer_get_firmware_first(dev)) + if (!pcie_aer_is_native(dev)) return -EIO; return pci_aer_raw_clear_status(dev); diff --git a/drivers/pci/pcie/dpc.c b/drivers/pci/pcie/dpc.c index 762170423fdd..0993d51abf03 100644 --- a/drivers/pci/pcie/dpc.c +++ b/drivers/pci/pcie/dpc.c @@ -284,7 +284,7 @@ static int dpc_probe(struct pcie_device *dev) int status; u16 ctl, cap; - if (pcie_aer_get_firmware_first(pdev) && !pcie_ports_dpc_native) + if (!pcie_aer_is_native(pdev) && !pcie_ports_dpc_native) return -ENOTSUPP; status = devm_request_threaded_irq(device, dev->irq, dpc_irq, diff --git a/drivers/pci/pcie/portdrv.h b/drivers/pci/pcie/portdrv.h index 64b5e081cdb2..af7cf237432a 100644 --- a/drivers/pci/pcie/portdrv.h +++ b/drivers/pci/pcie/portdrv.h @@ -29,8 +29,10 @@ extern bool pcie_ports_dpc_native; #ifdef CONFIG_PCIEAER int pcie_aer_init(void); +int pcie_aer_is_native(struct pci_dev *dev); #else static inline int pcie_aer_init(void) { return 0; } +static inline int pcie_aer_is_native(struct pci_dev *dev) { return 0; } #endif #ifdef CONFIG_HOTPLUG_PCI_PCIE @@ -147,16 
+149,5 @@ static inline bool pcie_pme_no_msi(void) { return false; } static inline void pcie_pme_interrupt_enable(struct pci_dev *dev, bool en) {} #endif /* !CONFIG_PCIE_PME */ -#ifdef CONFIG_ACPI_APEI -int pcie_aer_get_firmware_first(struct pci_dev *pci_dev); -#else -static inline int pcie_aer_get_firmware_first(struct pci_dev *pci_dev) -{ - if (pci_dev->__aer_firmware_first_valid) - return pci_dev->__aer_firmware_first; - return 0; -} -#endif - struct device *pcie_port_find_device(struct pci_dev *dev, u32 service); #endif /* _PORTDRV_H_ */ diff --git a/include/linux/pci.h b/include/linux/pci.h index 83ce1cdf5676..43f265830eca 100644 --- a/include/linux/pci.h +++ b/include/linux/pci.h @@ -420,8 +420,6 @@ struct pci_dev { * mappings to make sure they cannot access arbitrary memory. */ unsigned int untrusted:1; - unsigned int __aer_firmware_first_valid:1; - unsigned int __aer_firmware_first:1; unsigned int broken_intx_masking:1; /* INTx masking can't be used */ unsigned int io_window_1k:1; /* Intel bridge 1K I/O windows */ unsigned int irq_managed:1; -- cgit v1.2.3 From bfad978116c2aa3b693701059923de4561196f9b Mon Sep 17 00:00:00 2001 From: Bartosz Golaszewski Date: Thu, 28 May 2020 17:45:02 +0200 Subject: regmap: provide helpers for simple bit operations In many instances regmap_update_bits() is used for simple bit setting and clearing. In these cases the last argument is redundant and we can hide it with a static inline function. This adds three new helpers for simple bit operations: set_bits, clear_bits and test_bits (the last one defined as a regular function). Signed-off-by: Bartosz Golaszewski Signed-off-by: David S. Miller --- drivers/base/regmap/regmap.c | 22 ++++++++++++++++++++++ include/linux/regmap.h | 36 ++++++++++++++++++++++++++++++++++++ 2 files changed, 58 insertions(+) (limited to 'include/linux') diff --git a/drivers/base/regmap/regmap.c b/drivers/base/regmap/regmap.c index 59f911e57719..4ad5c5adc0a3 100644 --- a/drivers/base/regmap/regmap.c +++ b/drivers/base/regmap/regmap.c @@ -2936,6 +2936,28 @@ int regmap_update_bits_base(struct regmap *map, unsigned int reg, } EXPORT_SYMBOL_GPL(regmap_update_bits_base); +/** + * regmap_test_bits() - Check if all specified bits are set in a register. + * + * @map: Register map to operate on + * @reg: Register to read from + * @bits: Bits to test + * + * Returns -1 if the underlying regmap_read() fails, 0 if at least one of the + * tested bits is not set and 1 if all tested bits are set. 
+ */ +int regmap_test_bits(struct regmap *map, unsigned int reg, unsigned int bits) +{ + unsigned int val, ret; + + ret = regmap_read(map, reg, &val); + if (ret) + return ret; + + return (val & bits) == bits; +} +EXPORT_SYMBOL_GPL(regmap_test_bits); + void regmap_async_complete_cb(struct regmap_async *async, int ret) { struct regmap *map = async->map; diff --git a/include/linux/regmap.h b/include/linux/regmap.h index 40b07168fd8e..ddf0baff195d 100644 --- a/include/linux/regmap.h +++ b/include/linux/regmap.h @@ -1111,6 +1111,21 @@ bool regmap_reg_in_ranges(unsigned int reg, const struct regmap_range *ranges, unsigned int nranges); +static inline int regmap_set_bits(struct regmap *map, + unsigned int reg, unsigned int bits) +{ + return regmap_update_bits_base(map, reg, bits, bits, + NULL, false, false); +} + +static inline int regmap_clear_bits(struct regmap *map, + unsigned int reg, unsigned int bits) +{ + return regmap_update_bits_base(map, reg, bits, 0, NULL, false, false); +} + +int regmap_test_bits(struct regmap *map, unsigned int reg, unsigned int bits); + /** * struct reg_field - Description of an register field * @@ -1410,6 +1425,27 @@ static inline int regmap_update_bits_base(struct regmap *map, unsigned int reg, return -EINVAL; } +static inline int regmap_set_bits(struct regmap *map, + unsigned int reg, unsigned int bits) +{ + WARN_ONCE(1, "regmap API is disabled"); + return -EINVAL; +} + +static inline int regmap_clear_bits(struct regmap *map, + unsigned int reg, unsigned int bits) +{ + WARN_ONCE(1, "regmap API is disabled"); + return -EINVAL; +} + +static inline int regmap_test_bits(struct regmap *map, + unsigned int reg, unsigned int bits) +{ + WARN_ONCE(1, "regmap API is disabled"); + return -EINVAL; +} + static inline int regmap_field_update_bits_base(struct regmap_field *field, unsigned int mask, unsigned int val, bool *change, bool async, bool force) -- cgit v1.2.3 From d3798acc094c8ff2406e9acc7a9b2c09da994616 Mon Sep 17 00:00:00 2001 From: Ilya Dryomov Date: Fri, 29 May 2020 20:31:37 +0200 Subject: libceph: support for alloc hint flags Allow indicating future I/O pattern via flags. This is supported since Kraken (and bluestore persists flags together with expected_object_size and expected_write_size). 
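For illustration (editorial sketch; the rbd call site is adapted from the diff below), a caller advertising a sequential-write pattern would pass one of the new flags as the extra argument:

    osd_req_op_alloc_hint_init(osd_req, which++,
                               rbd_dev->layout.object_size,
                               rbd_dev->layout.object_size,
                               CEPH_OSD_ALLOC_HINT_FLAG_SEQUENTIAL_WRITE);

The rbd conversion in this patch passes 0 for now, keeping behavior unchanged.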
Signed-off-by: Ilya Dryomov Reviewed-by: Jason Dillaman --- drivers/block/rbd.c | 3 ++- include/linux/ceph/osd_client.h | 4 +++- include/linux/ceph/rados.h | 14 ++++++++++++++ net/ceph/osd_client.c | 8 +++++++- 4 files changed, 26 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c index 67d65ac785e9..97e102ea03e0 100644 --- a/drivers/block/rbd.c +++ b/drivers/block/rbd.c @@ -2253,7 +2253,8 @@ static void __rbd_osd_setup_write_ops(struct ceph_osd_request *osd_req, !(obj_req->flags & RBD_OBJ_FLAG_MAY_EXIST)) { osd_req_op_alloc_hint_init(osd_req, which++, rbd_dev->layout.object_size, - rbd_dev->layout.object_size); + rbd_dev->layout.object_size, + 0); } if (rbd_obj_is_entire(obj_req)) diff --git a/include/linux/ceph/osd_client.h b/include/linux/ceph/osd_client.h index 671fb93e8c60..c60b59e9291b 100644 --- a/include/linux/ceph/osd_client.h +++ b/include/linux/ceph/osd_client.h @@ -136,6 +136,7 @@ struct ceph_osd_req_op { struct { u64 expected_object_size; u64 expected_write_size; + u32 flags; /* CEPH_OSD_OP_ALLOC_HINT_FLAG_* */ } alloc_hint; struct { u64 snapid; @@ -472,7 +473,8 @@ extern int osd_req_op_xattr_init(struct ceph_osd_request *osd_req, unsigned int extern void osd_req_op_alloc_hint_init(struct ceph_osd_request *osd_req, unsigned int which, u64 expected_object_size, - u64 expected_write_size); + u64 expected_write_size, + u32 flags); extern struct ceph_osd_request *ceph_osdc_alloc_request(struct ceph_osd_client *osdc, struct ceph_snap_context *snapc, diff --git a/include/linux/ceph/rados.h b/include/linux/ceph/rados.h index 88ed3c5c04c5..3a518fd0eaad 100644 --- a/include/linux/ceph/rados.h +++ b/include/linux/ceph/rados.h @@ -464,6 +464,19 @@ enum { const char *ceph_osd_watch_op_name(int o); +enum { + CEPH_OSD_ALLOC_HINT_FLAG_SEQUENTIAL_WRITE = 1, + CEPH_OSD_ALLOC_HINT_FLAG_RANDOM_WRITE = 2, + CEPH_OSD_ALLOC_HINT_FLAG_SEQUENTIAL_READ = 4, + CEPH_OSD_ALLOC_HINT_FLAG_RANDOM_READ = 8, + CEPH_OSD_ALLOC_HINT_FLAG_APPEND_ONLY = 16, + CEPH_OSD_ALLOC_HINT_FLAG_IMMUTABLE = 32, + CEPH_OSD_ALLOC_HINT_FLAG_SHORTLIVED = 64, + CEPH_OSD_ALLOC_HINT_FLAG_LONGLIVED = 128, + CEPH_OSD_ALLOC_HINT_FLAG_COMPRESSIBLE = 256, + CEPH_OSD_ALLOC_HINT_FLAG_INCOMPRESSIBLE = 512, +}; + enum { CEPH_OSD_BACKOFF_OP_BLOCK = 1, CEPH_OSD_BACKOFF_OP_ACK_BLOCK = 2, @@ -517,6 +530,7 @@ struct ceph_osd_op { struct { __le64 expected_object_size; __le64 expected_write_size; + __le32 flags; /* CEPH_OSD_OP_ALLOC_HINT_FLAG_* */ } __attribute__ ((packed)) alloc_hint; struct { __le64 snapid; diff --git a/net/ceph/osd_client.c b/net/ceph/osd_client.c index 22733e844be1..4fea3c33af2a 100644 --- a/net/ceph/osd_client.c +++ b/net/ceph/osd_client.c @@ -932,10 +932,14 @@ static void osd_req_op_watch_init(struct ceph_osd_request *req, int which, op->watch.gen = 0; } +/* + * @flags: CEPH_OSD_OP_ALLOC_HINT_FLAG_* + */ void osd_req_op_alloc_hint_init(struct ceph_osd_request *osd_req, unsigned int which, u64 expected_object_size, - u64 expected_write_size) + u64 expected_write_size, + u32 flags) { struct ceph_osd_req_op *op = _osd_req_op_init(osd_req, which, CEPH_OSD_OP_SETALLOCHINT, @@ -943,6 +947,7 @@ void osd_req_op_alloc_hint_init(struct ceph_osd_request *osd_req, op->alloc_hint.expected_object_size = expected_object_size; op->alloc_hint.expected_write_size = expected_write_size; + op->alloc_hint.flags = flags; /* * CEPH_OSD_OP_SETALLOCHINT op is advisory and therefore deemed @@ -1018,6 +1023,7 @@ static u32 osd_req_encode_op(struct ceph_osd_op *dst, 
cpu_to_le64(src->alloc_hint.expected_object_size); dst->alloc_hint.expected_write_size = cpu_to_le64(src->alloc_hint.expected_write_size); + dst->alloc_hint.flags = cpu_to_le32(src->alloc_hint.flags); break; case CEPH_OSD_OP_SETXATTR: case CEPH_OSD_OP_CMPXATTR: -- cgit v1.2.3 From 457f44363a8894135c85b7a9afd2bd8196db24ab Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Fri, 29 May 2020 00:54:20 -0700 Subject: bpf: Implement BPF ring buffer and verifier support for it

This commit adds a new MPSC ring buffer implementation into the BPF ecosystem, which allows multiple CPUs to submit data to a single shared ring buffer. On the consumption side, only a single consumer is assumed.

Motivation ----------

There are two distinctive motivators for this work, neither of which is satisfied by the existing perf buffer, and which prompted the creation of a new ring buffer implementation:

- more efficient memory utilization by sharing a ring buffer across CPUs;
- preserving the ordering of events that happen sequentially in time, even across multiple CPUs (e.g., fork/exec/exit events for a task).

These two problems are independent, but perf buffer fails to satisfy both. Both are a result of the choice to have per-CPU perf ring buffers, and both can be solved by an MPSC ring buffer implementation. The ordering problem could technically be solved for perf buffer with some in-kernel counting, but given that the first problem requires an MPSC buffer anyway, the same solution solves the second problem automatically.

Semantics and APIs ------------------

A single ring buffer is presented to BPF programs as an instance of a BPF map of type BPF_MAP_TYPE_RINGBUF. Two other alternatives were considered but ultimately rejected.

One way would be to make BPF_MAP_TYPE_RINGBUF represent an array of ring buffers, similar to BPF_MAP_TYPE_PERF_EVENT_ARRAY, but without enforcing the "same CPU only" rule. This would be a more familiar interface, compatible with existing perf buffer use in BPF, but it would fail if an application needed more advanced logic to look up a ring buffer by an arbitrary key. HASH_OF_MAPS addresses this with the current approach. Additionally, given the performance of BPF ringbuf, many use cases would just opt into a simple single ring buffer shared among all CPUs, for which the array approach would be overkill.

Another approach could introduce a new concept, alongside BPF maps, to represent a generic "container" object, which doesn't necessarily have a key/value interface with lookup/update/delete operations. This approach would add a lot of extra infrastructure that would have to be built for observability and verifier support. It would also add another concept that BPF developers would have to familiarize themselves with, new syntax in libbpf, etc., while providing no real additional benefit over the approach of using a map. BPF_MAP_TYPE_RINGBUF doesn't support lookup/update/delete operations, but neither do a few other map types (e.g., queue and stack; array doesn't support delete, etc.).

The approach chosen has the advantage of re-using existing BPF map infrastructure (introspection APIs in the kernel, libbpf support, etc.), being a familiar concept (no need to teach users a new type of object in a BPF program), and utilizing existing tooling (bpftool). For the common scenario of using a single ring buffer for all CPUs, it's as simple and straightforward as it would be with a dedicated "container" object.
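As an editorial illustration (not part of this patch; libbpf's BTF-defined map syntax is assumed, and the map name is arbitrary), a single ring buffer shared by all CPUs would be declared in a BPF program roughly like this:

    #include <linux/bpf.h>
    #include <bpf/bpf_helpers.h>

    struct {
            __uint(type, BPF_MAP_TYPE_RINGBUF);
            __uint(max_entries, 256 * 1024); /* ring size in bytes, must be a power of 2 */
    } rb SEC(".maps");

Note that key_size and value_size are simply omitted, matching the requirement below that both are enforced to be zero.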
On the other hand, by being a map, it can be combined with ARRAY_OF_MAPS and HASH_OF_MAPS map-in-maps to implement a wide variety of topologies, from one ring buffer for each CPU (e.g., as a replacement for perf buffer use cases), to complicated application-level hashing/sharding of ring buffers (e.g., having a small pool of ring buffers with a hash of the task's tgid as the lookup key, to preserve order while reducing contention).

Key and value sizes are enforced to be zero. max_entries is used to specify the size of the ring buffer and has to be a power of 2.

There are a bunch of similarities between perf buffer (BPF_MAP_TYPE_PERF_EVENT_ARRAY) and the new BPF ring buffer semantics:

- variable-length records;
- if there is no more space left in the ring buffer, reservation fails, with no blocking;
- memory-mappable data area for user-space applications, for ease of consumption and high performance;
- epoll notifications for new incoming data;
- but still the ability to busy-poll for new data to achieve the lowest latency, if necessary.

BPF ringbuf provides two sets of APIs to BPF programs:

- bpf_ringbuf_output() allows *copying* data from one place to a ring buffer, similarly to bpf_perf_event_output();
- the bpf_ringbuf_reserve()/bpf_ringbuf_commit()/bpf_ringbuf_discard() APIs split the whole process into two steps. First, a fixed amount of space is reserved. If successful, a pointer to data inside the ring buffer data area is returned, which BPF programs can use similarly to data inside array/hash maps. Once ready, this piece of memory is either committed or discarded. Discard is similar to commit, but makes the consumer ignore the record.

bpf_ringbuf_output() has the disadvantage of incurring an extra memory copy, because the record has to be prepared somewhere else first. But it allows submitting records whose length is not known to the verifier beforehand. It also closely matches bpf_perf_event_output(), so it will simplify migration significantly.

bpf_ringbuf_reserve() avoids the extra copy by providing a pointer directly into ring buffer memory. In a lot of cases records are larger than BPF stack space allows, so many programs use an extra per-CPU array as a temporary heap for preparing a sample; bpf_ringbuf_reserve() avoids this need completely. In exchange, it only allows a known, constant size of memory to be reserved, so that the verifier can check that the BPF program can't access memory outside its reserved record space. bpf_ringbuf_output(), while slightly slower due to the extra memory copy, covers some use cases that are not suitable for bpf_ringbuf_reserve().

The difference between commit and discard is very small. Discard just marks a record as discarded, and such records are supposed to be ignored by consumer code. Discard is useful for some advanced use cases, such as ensuring all-or-nothing multi-record submission, or emulating temporary malloc()/free() within a single BPF program invocation.

Each reserved record is tracked by the verifier through existing reference-tracking logic, similar to socket ref-tracking. It is thus impossible to reserve a record but forget to submit (or discard) it.

The bpf_ringbuf_query() helper allows querying various properties of the ring buffer. Currently 4 are supported:

- BPF_RB_AVAIL_DATA returns the amount of unconsumed data in the ring buffer;
- BPF_RB_RING_SIZE returns the size of the ring buffer;
- BPF_RB_CONS_POS/BPF_RB_PROD_POS return the current logical position of the consumer/producer, respectively.
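A minimal reserve/submit sketch (editorial, not from this patch; the event layout and tracepoint are illustrative assumptions, and the map rb is the one declared in the earlier sketch):

    struct event {
            int pid;
            char comm[16];
    };

    SEC("tp/sched/sched_process_exec")
    int handle_exec(void *ctx)
    {
            struct event *e;

            /* Reserve a fixed-size record; NULL means the ring is full
             * (or the lock could not be taken in NMI context).
             */
            e = bpf_ringbuf_reserve(&rb, sizeof(*e), 0);
            if (!e)
                    return 0;

            e->pid = bpf_get_current_pid_tgid() >> 32;
            bpf_get_current_comm(e->comm, sizeof(e->comm));

            /* Make the record visible to the consumer. */
            bpf_ringbuf_submit(e, 0);
            return 0;
    }

The verifier's reference tracking mentioned above is what forces every successful reserve to be paired with exactly one submit or discard on all program paths.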
The values returned by bpf_ringbuf_query() are momentary snapshots of ring buffer state and could be stale by the time the helper returns, so this should be used only for debugging/reporting, or for implementing heuristics that take into account the highly changeable nature of some of those characteristics. One such heuristic might involve more fine-grained control over poll/epoll notifications about new data availability in the ring buffer. Together with the BPF_RB_NO_WAKEUP/BPF_RB_FORCE_WAKEUP flags for the output/commit/discard helpers, it allows a BPF program a high degree of control and, e.g., more efficient batched notifications. The default self-balancing strategy, though, should be adequate for most applications and will already work reliably and efficiently.

Design and implementation -------------------------

This reserve/commit schema allows a natural way for multiple producers, either on different CPUs or even on the same CPU/in the same BPF program, to reserve independent records and work with them without blocking other producers. This means that if a BPF program was interrupted by another BPF program sharing the same ring buffer, both will get a record reserved (provided there is enough space left) and can work with it and submit it independently. This applies to NMI context as well, except that, due to a spinlock being taken during reservation, bpf_ringbuf_reserve() in NMI context might fail to get the lock, in which case the reservation fails even if the ring buffer is not full.

The ring buffer itself is internally implemented as a power-of-2 sized circular buffer, with two logical, ever-increasing counters (which might wrap around on 32-bit architectures; that's not a problem):

- the consumer counter shows up to which logical position the consumer has consumed the data;
- the producer counter denotes the amount of data reserved by all producers.

Each time a record is reserved, the producer that "owns" the record successfully advances the producer counter. At that point, the data is still not ready to be consumed, though. Each record has an 8-byte header, which contains the length of the reserved record, as well as two extra bits: a busy bit to denote that the record is still being worked on, and a discard bit, which might be set at commit time if the record is discarded. In the latter case, the consumer is supposed to skip the record and move on to the next one. The record header also encodes the record's relative offset from the beginning of the ring buffer data area (in pages). This allows bpf_ringbuf_commit()/bpf_ringbuf_discard() to accept only the pointer to the record itself, without also requiring a pointer to the ring buffer: the ring buffer memory location is restored from the record metadata header. This significantly simplifies the verifier, as well as improving API usability.

Producer counter increments are serialized under a spinlock, so there is a strict ordering between reservations. Commits, on the other hand, are completely lockless and independent. All records become available to the consumer in the order of reservations, but only after all previous records have been committed. It is thus possible for slow producers to temporarily hold off submitted records that were reserved later. The reservation/commit/consumer protocol is verified by litmus tests in Documentation/litmus-test/bpf-rb.

One interesting implementation bit, which significantly simplifies (and thus also speeds up) the implementation of both producers and consumers, is how the data area is mapped twice, contiguously back-to-back, in virtual memory.
This makes it possible to avoid any special measures for samples that have to wrap around at the end of the circular buffer data area, because the next page after the last data page is the first data page again, so the sample still appears completely contiguous in virtual memory. See the comment and a simple ASCII diagram showing this visually in bpf_ringbuf_area_alloc().

Another feature that distinguishes BPF ringbuf from the perf ring buffer is self-pacing notifications of new data availability. The bpf_ringbuf_commit() implementation sends a notification of a new record being available after commit only if the consumer has already caught up right up to the record being committed. If not, the consumer still has to catch up and thus will see the new data anyway, without needing an extra poll notification. Benchmarks (see tools/testing/selftests/bpf/benchs/bench_ringbuf.c) show that this makes it possible to achieve very high throughput without having to resort to tricks like "notify only every Nth sample", which are necessary with perf buffer. For extreme cases, when a BPF program wants more manual control of notifications, the commit/discard/output helpers accept the BPF_RB_NO_WAKEUP and BPF_RB_FORCE_WAKEUP flags, which give full control over notifications of data availability, but require extra caution and diligence in using this API.

Comparison to alternatives --------------------------

Before considering implementing the BPF ring buffer from scratch, existing in-kernel alternatives were evaluated, but they didn't seem to meet the needs. They largely fell into a few categories:

- per-CPU buffers (perf, ftrace, etc.), which don't satisfy the two motivations outlined above (ordering and memory consumption);
- linked list-based implementations; while some were multi-producer designs, consuming these from user space would be very complicated and most probably not performant; memory-mapping a contiguous piece of memory is simpler and more performant for user-space consumers;
- io_uring is SPSC, and also requires fixed-sized elements. Naively turning an SPSC queue into MPSC with a lock would have subpar performance compared to locked reserve + lockless commit, as done in the BPF ring buffer. Fixed-sized elements would be too limiting for BPF programs, given that existing BPF programs already rely heavily on the variable-sized perf buffer;
- specialized implementations (like the new printk ring buffer, [0]) with lots of printk-specific limitations and implications, which didn't seem to fit well for the intended use with BPF programs.
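To make the record-header protocol described above concrete, here is a simplified single-threaded user-space consumer loop (editorial sketch only; len_ptr(), data_at() and process_sample() are hypothetical helpers, and real consumers should use the memory-mapped consumer_pos/producer_pos pages with proper acquire/release semantics, as libbpf's ring_buffer API does):

    while (cons_pos < prod_pos) {
            __u32 raw = smp_load_acquire(len_ptr(cons_pos));
            __u32 len;

            if (raw & BPF_RINGBUF_BUSY_BIT)
                    break; /* reserved but not yet committed */

            len = raw & ~(BPF_RINGBUF_BUSY_BIT | BPF_RINGBUF_DISCARD_BIT);

            if (!(raw & BPF_RINGBUF_DISCARD_BIT))
                    process_sample(data_at(cons_pos + BPF_RINGBUF_HDR_SZ), len);

            /* header + payload, rounded up to 8-byte alignment */
            cons_pos += (len + BPF_RINGBUF_HDR_SZ + 7) & ~7UL;
            smp_store_release(consumer_pos, cons_pos);
    }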
[0] https://lwn.net/Articles/779550/ Signed-off-by: Andrii Nakryiko Signed-off-by: Daniel Borkmann Link: https://lore.kernel.org/bpf/20200529075424.3139988-2-andriin@fb.com Signed-off-by: Alexei Starovoitov --- include/linux/bpf.h | 13 + include/linux/bpf_types.h | 1 + include/linux/bpf_verifier.h | 4 + include/uapi/linux/bpf.h | 84 +++- kernel/bpf/Makefile | 2 +- kernel/bpf/helpers.c | 10 + kernel/bpf/ringbuf.c | 501 +++++++++++++++++++++ kernel/bpf/syscall.c | 12 + kernel/bpf/verifier.c | 195 ++++++-- kernel/trace/bpf_trace.c | 10 + tools/include/uapi/linux/bpf.h | 84 +++- tools/testing/selftests/bpf/verifier/and.c | 4 +- .../testing/selftests/bpf/verifier/array_access.c | 4 +- tools/testing/selftests/bpf/verifier/bounds.c | 6 +- tools/testing/selftests/bpf/verifier/calls.c | 2 +- .../selftests/bpf/verifier/direct_value_access.c | 4 +- .../selftests/bpf/verifier/helper_access_var_len.c | 2 +- .../selftests/bpf/verifier/helper_value_access.c | 6 +- .../selftests/bpf/verifier/value_ptr_arith.c | 8 +- 19 files changed, 882 insertions(+), 70 deletions(-) create mode 100644 kernel/bpf/ringbuf.c (limited to 'include/linux') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index efe8836b5c48..e5884f7f801c 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -90,6 +90,8 @@ struct bpf_map_ops { int (*map_direct_value_meta)(const struct bpf_map *map, u64 imm, u32 *off); int (*map_mmap)(struct bpf_map *map, struct vm_area_struct *vma); + __poll_t (*map_poll)(struct bpf_map *map, struct file *filp, + struct poll_table_struct *pts); }; struct bpf_map_memory { @@ -244,6 +246,9 @@ enum bpf_arg_type { ARG_PTR_TO_LONG, /* pointer to long */ ARG_PTR_TO_SOCKET, /* pointer to bpf_sock (fullsock) */ ARG_PTR_TO_BTF_ID, /* pointer to in-kernel struct */ + ARG_PTR_TO_ALLOC_MEM, /* pointer to dynamically allocated memory */ + ARG_PTR_TO_ALLOC_MEM_OR_NULL, /* pointer to dynamically allocated memory or NULL */ + ARG_CONST_ALLOC_SIZE_OR_ZERO, /* number of allocated bytes requested */ }; /* type of values returned from helper functions */ @@ -255,6 +260,7 @@ enum bpf_return_type { RET_PTR_TO_SOCKET_OR_NULL, /* returns a pointer to a socket or NULL */ RET_PTR_TO_TCP_SOCK_OR_NULL, /* returns a pointer to a tcp_sock or NULL */ RET_PTR_TO_SOCK_COMMON_OR_NULL, /* returns a pointer to a sock_common or NULL */ + RET_PTR_TO_ALLOC_MEM_OR_NULL, /* returns a pointer to dynamically allocated memory or NULL */ }; /* eBPF function prototype used by verifier to allow BPF_CALLs from eBPF programs @@ -322,6 +328,8 @@ enum bpf_reg_type { PTR_TO_XDP_SOCK, /* reg points to struct xdp_sock */ PTR_TO_BTF_ID, /* reg points to kernel struct */ PTR_TO_BTF_ID_OR_NULL, /* reg points to kernel struct or NULL */ + PTR_TO_MEM, /* reg points to valid memory region */ + PTR_TO_MEM_OR_NULL, /* reg points to valid memory region or NULL */ }; /* The information passed from prog-specific *_is_valid_access @@ -1611,6 +1619,11 @@ extern const struct bpf_func_proto bpf_tcp_sock_proto; extern const struct bpf_func_proto bpf_jiffies64_proto; extern const struct bpf_func_proto bpf_get_ns_current_pid_tgid_proto; extern const struct bpf_func_proto bpf_event_output_data_proto; +extern const struct bpf_func_proto bpf_ringbuf_output_proto; +extern const struct bpf_func_proto bpf_ringbuf_reserve_proto; +extern const struct bpf_func_proto bpf_ringbuf_submit_proto; +extern const struct bpf_func_proto bpf_ringbuf_discard_proto; +extern const struct bpf_func_proto bpf_ringbuf_query_proto; const struct bpf_func_proto *bpf_tracing_func_proto( enum 
bpf_func_id func_id, const struct bpf_prog *prog); diff --git a/include/linux/bpf_types.h b/include/linux/bpf_types.h index 29d22752fc87..fa8e1b552acd 100644 --- a/include/linux/bpf_types.h +++ b/include/linux/bpf_types.h @@ -118,6 +118,7 @@ BPF_MAP_TYPE(BPF_MAP_TYPE_STACK, stack_map_ops) #if defined(CONFIG_BPF_JIT) BPF_MAP_TYPE(BPF_MAP_TYPE_STRUCT_OPS, bpf_struct_ops_map_ops) #endif +BPF_MAP_TYPE(BPF_MAP_TYPE_RINGBUF, ringbuf_map_ops) BPF_LINK_TYPE(BPF_LINK_TYPE_RAW_TRACEPOINT, raw_tracepoint) BPF_LINK_TYPE(BPF_LINK_TYPE_TRACING, tracing) diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h index ea833087e853..ca08db4ffb5f 100644 --- a/include/linux/bpf_verifier.h +++ b/include/linux/bpf_verifier.h @@ -54,6 +54,8 @@ struct bpf_reg_state { u32 btf_id; /* for PTR_TO_BTF_ID */ + u32 mem_size; /* for PTR_TO_MEM | PTR_TO_MEM_OR_NULL */ + /* Max size from any of the above. */ unsigned long raw; }; @@ -63,6 +65,8 @@ struct bpf_reg_state { * offset, so they can share range knowledge. * For PTR_TO_MAP_VALUE_OR_NULL this is used to share which map value we * came from, when one is tested for != NULL. + * For PTR_TO_MEM_OR_NULL this is used to identify memory allocation + * for the purpose of tracking that it's freed. * For PTR_TO_SOCKET this is used to share which pointers retain the * same reference to the socket, to determine proper reference freeing. */ diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 54b93f8b49b8..974ca6e948e3 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -147,6 +147,7 @@ enum bpf_map_type { BPF_MAP_TYPE_SK_STORAGE, BPF_MAP_TYPE_DEVMAP_HASH, BPF_MAP_TYPE_STRUCT_OPS, + BPF_MAP_TYPE_RINGBUF, }; /* Note that tracing related programs such as @@ -3157,6 +3158,59 @@ union bpf_attr { * **bpf_sk_cgroup_id**\ (). * Return * The id is returned or 0 in case the id could not be retrieved. + * + * void *bpf_ringbuf_output(void *ringbuf, void *data, u64 size, u64 flags) + * Description + * Copy *size* bytes from *data* into a ring buffer *ringbuf*. + * If BPF_RB_NO_WAKEUP is specified in *flags*, no notification of + * new data availability is sent. + * IF BPF_RB_FORCE_WAKEUP is specified in *flags*, notification of + * new data availability is sent unconditionally. + * Return + * 0, on success; + * < 0, on error. + * + * void *bpf_ringbuf_reserve(void *ringbuf, u64 size, u64 flags) + * Description + * Reserve *size* bytes of payload in a ring buffer *ringbuf*. + * Return + * Valid pointer with *size* bytes of memory available; NULL, + * otherwise. + * + * void bpf_ringbuf_submit(void *data, u64 flags) + * Description + * Submit reserved ring buffer sample, pointed to by *data*. + * If BPF_RB_NO_WAKEUP is specified in *flags*, no notification of + * new data availability is sent. + * IF BPF_RB_FORCE_WAKEUP is specified in *flags*, notification of + * new data availability is sent unconditionally. + * Return + * Nothing. Always succeeds. + * + * void bpf_ringbuf_discard(void *data, u64 flags) + * Description + * Discard reserved ring buffer sample, pointed to by *data*. + * If BPF_RB_NO_WAKEUP is specified in *flags*, no notification of + * new data availability is sent. + * IF BPF_RB_FORCE_WAKEUP is specified in *flags*, notification of + * new data availability is sent unconditionally. + * Return + * Nothing. Always succeeds. + * + * u64 bpf_ringbuf_query(void *ringbuf, u64 flags) + * Description + * Query various characteristics of provided ring buffer. 
What + * exactly is queries is determined by *flags*: + * - BPF_RB_AVAIL_DATA - amount of data not yet consumed; + * - BPF_RB_RING_SIZE - the size of ring buffer; + * - BPF_RB_CONS_POS - consumer position (can wrap around); + * - BPF_RB_PROD_POS - producer(s) position (can wrap around); + * Data returned is just a momentary snapshots of actual values + * and could be inaccurate, so this facility should be used to + * power heuristics and for reporting, not to make 100% correct + * calculation. + * Return + * Requested value, or 0, if flags are not recognized. */ #define __BPF_FUNC_MAPPER(FN) \ FN(unspec), \ @@ -3288,7 +3342,12 @@ union bpf_attr { FN(seq_printf), \ FN(seq_write), \ FN(sk_cgroup_id), \ - FN(sk_ancestor_cgroup_id), + FN(sk_ancestor_cgroup_id), \ + FN(ringbuf_output), \ + FN(ringbuf_reserve), \ + FN(ringbuf_submit), \ + FN(ringbuf_discard), \ + FN(ringbuf_query), /* integer value in 'imm' field of BPF_CALL instruction selects which helper * function eBPF program intends to call @@ -3398,6 +3457,29 @@ enum { BPF_F_GET_BRANCH_RECORDS_SIZE = (1ULL << 0), }; +/* BPF_FUNC_bpf_ringbuf_commit, BPF_FUNC_bpf_ringbuf_discard, and + * BPF_FUNC_bpf_ringbuf_output flags. + */ +enum { + BPF_RB_NO_WAKEUP = (1ULL << 0), + BPF_RB_FORCE_WAKEUP = (1ULL << 1), +}; + +/* BPF_FUNC_bpf_ringbuf_query flags */ +enum { + BPF_RB_AVAIL_DATA = 0, + BPF_RB_RING_SIZE = 1, + BPF_RB_CONS_POS = 2, + BPF_RB_PROD_POS = 3, +}; + +/* BPF ring buffer constants */ +enum { + BPF_RINGBUF_BUSY_BIT = (1U << 31), + BPF_RINGBUF_DISCARD_BIT = (1U << 30), + BPF_RINGBUF_HDR_SZ = 8, +}; + /* Mode for BPF_FUNC_skb_adjust_room helper. */ enum bpf_adj_room_mode { BPF_ADJ_ROOM_NET, diff --git a/kernel/bpf/Makefile b/kernel/bpf/Makefile index 375b933010dd..8fca02f64811 100644 --- a/kernel/bpf/Makefile +++ b/kernel/bpf/Makefile @@ -4,7 +4,7 @@ CFLAGS_core.o += $(call cc-disable-warning, override-init) obj-$(CONFIG_BPF_SYSCALL) += syscall.o verifier.o inode.o helpers.o tnum.o bpf_iter.o map_iter.o task_iter.o obj-$(CONFIG_BPF_SYSCALL) += hashtab.o arraymap.o percpu_freelist.o bpf_lru_list.o lpm_trie.o map_in_map.o -obj-$(CONFIG_BPF_SYSCALL) += local_storage.o queue_stack_maps.o +obj-$(CONFIG_BPF_SYSCALL) += local_storage.o queue_stack_maps.o ringbuf.o obj-$(CONFIG_BPF_SYSCALL) += disasm.o obj-$(CONFIG_BPF_JIT) += trampoline.o obj-$(CONFIG_BPF_SYSCALL) += btf.o diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c index bb4fb634275e..be43ab3e619f 100644 --- a/kernel/bpf/helpers.c +++ b/kernel/bpf/helpers.c @@ -635,6 +635,16 @@ bpf_base_func_proto(enum bpf_func_id func_id) return &bpf_ktime_get_ns_proto; case BPF_FUNC_ktime_get_boot_ns: return &bpf_ktime_get_boot_ns_proto; + case BPF_FUNC_ringbuf_output: + return &bpf_ringbuf_output_proto; + case BPF_FUNC_ringbuf_reserve: + return &bpf_ringbuf_reserve_proto; + case BPF_FUNC_ringbuf_submit: + return &bpf_ringbuf_submit_proto; + case BPF_FUNC_ringbuf_discard: + return &bpf_ringbuf_discard_proto; + case BPF_FUNC_ringbuf_query: + return &bpf_ringbuf_query_proto; default: break; } diff --git a/kernel/bpf/ringbuf.c b/kernel/bpf/ringbuf.c new file mode 100644 index 000000000000..180414bb0d3e --- /dev/null +++ b/kernel/bpf/ringbuf.c @@ -0,0 +1,501 @@ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#define RINGBUF_CREATE_FLAG_MASK (BPF_F_NUMA_NODE) + +/* non-mmap()'able part of bpf_ringbuf (everything up to consumer page) */ +#define RINGBUF_PGOFF \ + (offsetof(struct bpf_ringbuf, consumer_pos) >> PAGE_SHIFT) +/* consumer page 
and producer page */ +#define RINGBUF_POS_PAGES 2 + +#define RINGBUF_MAX_RECORD_SZ (UINT_MAX/4) + +/* Maximum size of ring buffer area is limited by 32-bit page offset within + * record header, counted in pages. Reserve 8 bits for extensibility, and take + * into account few extra pages for consumer/producer pages and + * non-mmap()'able parts. This gives 64GB limit, which seems plenty for single + * ring buffer. + */ +#define RINGBUF_MAX_DATA_SZ \ + (((1ULL << 24) - RINGBUF_POS_PAGES - RINGBUF_PGOFF) * PAGE_SIZE) + +struct bpf_ringbuf { + wait_queue_head_t waitq; + struct irq_work work; + u64 mask; + struct page **pages; + int nr_pages; + spinlock_t spinlock ____cacheline_aligned_in_smp; + /* Consumer and producer counters are put into separate pages to allow + * mapping consumer page as r/w, but restrict producer page to r/o. + * This protects producer position from being modified by user-space + * application and ruining in-kernel position tracking. + */ + unsigned long consumer_pos __aligned(PAGE_SIZE); + unsigned long producer_pos __aligned(PAGE_SIZE); + char data[] __aligned(PAGE_SIZE); +}; + +struct bpf_ringbuf_map { + struct bpf_map map; + struct bpf_map_memory memory; + struct bpf_ringbuf *rb; +}; + +/* 8-byte ring buffer record header structure */ +struct bpf_ringbuf_hdr { + u32 len; + u32 pg_off; +}; + +static struct bpf_ringbuf *bpf_ringbuf_area_alloc(size_t data_sz, int numa_node) +{ + const gfp_t flags = GFP_KERNEL | __GFP_RETRY_MAYFAIL | __GFP_NOWARN | + __GFP_ZERO; + int nr_meta_pages = RINGBUF_PGOFF + RINGBUF_POS_PAGES; + int nr_data_pages = data_sz >> PAGE_SHIFT; + int nr_pages = nr_meta_pages + nr_data_pages; + struct page **pages, *page; + struct bpf_ringbuf *rb; + size_t array_size; + int i; + + /* Each data page is mapped twice to allow "virtual" + * continuous read of samples wrapping around the end of ring + * buffer area: + * ------------------------------------------------------ + * | meta pages | real data pages | same data pages | + * ------------------------------------------------------ + * | | 1 2 3 4 5 6 7 8 9 | 1 2 3 4 5 6 7 8 9 | + * ------------------------------------------------------ + * | | TA DA | TA DA | + * ------------------------------------------------------ + * ^^^^^^^ + * | + * Here, no need to worry about special handling of wrapped-around + * data due to double-mapped data pages. This works both in kernel and + * when mmap()'ed in user-space, simplifying both kernel and + * user-space implementations significantly. 
+ */ + array_size = (nr_meta_pages + 2 * nr_data_pages) * sizeof(*pages); + if (array_size > PAGE_SIZE) + pages = vmalloc_node(array_size, numa_node); + else + pages = kmalloc_node(array_size, flags, numa_node); + if (!pages) + return NULL; + + for (i = 0; i < nr_pages; i++) { + page = alloc_pages_node(numa_node, flags, 0); + if (!page) { + nr_pages = i; + goto err_free_pages; + } + pages[i] = page; + if (i >= nr_meta_pages) + pages[nr_data_pages + i] = page; + } + + rb = vmap(pages, nr_meta_pages + 2 * nr_data_pages, + VM_ALLOC | VM_USERMAP, PAGE_KERNEL); + if (rb) { + rb->pages = pages; + rb->nr_pages = nr_pages; + return rb; + } + +err_free_pages: + for (i = 0; i < nr_pages; i++) + __free_page(pages[i]); + kvfree(pages); + return NULL; +} + +static void bpf_ringbuf_notify(struct irq_work *work) +{ + struct bpf_ringbuf *rb = container_of(work, struct bpf_ringbuf, work); + + wake_up_all(&rb->waitq); +} + +static struct bpf_ringbuf *bpf_ringbuf_alloc(size_t data_sz, int numa_node) +{ + struct bpf_ringbuf *rb; + + if (!data_sz || !PAGE_ALIGNED(data_sz)) + return ERR_PTR(-EINVAL); + +#ifdef CONFIG_64BIT + /* on 32-bit arch, it's impossible to overflow record's hdr->pgoff */ + if (data_sz > RINGBUF_MAX_DATA_SZ) + return ERR_PTR(-E2BIG); +#endif + + rb = bpf_ringbuf_area_alloc(data_sz, numa_node); + if (!rb) + return ERR_PTR(-ENOMEM); + + spin_lock_init(&rb->spinlock); + init_waitqueue_head(&rb->waitq); + init_irq_work(&rb->work, bpf_ringbuf_notify); + + rb->mask = data_sz - 1; + rb->consumer_pos = 0; + rb->producer_pos = 0; + + return rb; +} + +static struct bpf_map *ringbuf_map_alloc(union bpf_attr *attr) +{ + struct bpf_ringbuf_map *rb_map; + u64 cost; + int err; + + if (attr->map_flags & ~RINGBUF_CREATE_FLAG_MASK) + return ERR_PTR(-EINVAL); + + if (attr->key_size || attr->value_size || + attr->max_entries == 0 || !PAGE_ALIGNED(attr->max_entries)) + return ERR_PTR(-EINVAL); + + rb_map = kzalloc(sizeof(*rb_map), GFP_USER); + if (!rb_map) + return ERR_PTR(-ENOMEM); + + bpf_map_init_from_attr(&rb_map->map, attr); + + cost = sizeof(struct bpf_ringbuf_map) + + sizeof(struct bpf_ringbuf) + + attr->max_entries; + err = bpf_map_charge_init(&rb_map->map.memory, cost); + if (err) + goto err_free_map; + + rb_map->rb = bpf_ringbuf_alloc(attr->max_entries, rb_map->map.numa_node); + if (IS_ERR(rb_map->rb)) { + err = PTR_ERR(rb_map->rb); + goto err_uncharge; + } + + return &rb_map->map; + +err_uncharge: + bpf_map_charge_finish(&rb_map->map.memory); +err_free_map: + kfree(rb_map); + return ERR_PTR(err); +} + +static void bpf_ringbuf_free(struct bpf_ringbuf *rb) +{ + /* copy pages pointer and nr_pages to local variable, as we are going + * to unmap rb itself with vunmap() below + */ + struct page **pages = rb->pages; + int i, nr_pages = rb->nr_pages; + + vunmap(rb); + for (i = 0; i < nr_pages; i++) + __free_page(pages[i]); + kvfree(pages); +} + +static void ringbuf_map_free(struct bpf_map *map) +{ + struct bpf_ringbuf_map *rb_map; + + /* at this point bpf_prog->aux->refcnt == 0 and this map->refcnt == 0, + * so the programs (can be more than one that used this map) were + * disconnected from events. 
Wait for outstanding critical sections in + * these programs to complete + */ + synchronize_rcu(); + + rb_map = container_of(map, struct bpf_ringbuf_map, map); + bpf_ringbuf_free(rb_map->rb); + kfree(rb_map); +} + +static void *ringbuf_map_lookup_elem(struct bpf_map *map, void *key) +{ + return ERR_PTR(-ENOTSUPP); +} + +static int ringbuf_map_update_elem(struct bpf_map *map, void *key, void *value, + u64 flags) +{ + return -ENOTSUPP; +} + +static int ringbuf_map_delete_elem(struct bpf_map *map, void *key) +{ + return -ENOTSUPP; +} + +static int ringbuf_map_get_next_key(struct bpf_map *map, void *key, + void *next_key) +{ + return -ENOTSUPP; +} + +static size_t bpf_ringbuf_mmap_page_cnt(const struct bpf_ringbuf *rb) +{ + size_t data_pages = (rb->mask + 1) >> PAGE_SHIFT; + + /* consumer page + producer page + 2 x data pages */ + return RINGBUF_POS_PAGES + 2 * data_pages; +} + +static int ringbuf_map_mmap(struct bpf_map *map, struct vm_area_struct *vma) +{ + struct bpf_ringbuf_map *rb_map; + size_t mmap_sz; + + rb_map = container_of(map, struct bpf_ringbuf_map, map); + mmap_sz = bpf_ringbuf_mmap_page_cnt(rb_map->rb) << PAGE_SHIFT; + + if (vma->vm_pgoff * PAGE_SIZE + (vma->vm_end - vma->vm_start) > mmap_sz) + return -EINVAL; + + return remap_vmalloc_range(vma, rb_map->rb, + vma->vm_pgoff + RINGBUF_PGOFF); +} + +static unsigned long ringbuf_avail_data_sz(struct bpf_ringbuf *rb) +{ + unsigned long cons_pos, prod_pos; + + cons_pos = smp_load_acquire(&rb->consumer_pos); + prod_pos = smp_load_acquire(&rb->producer_pos); + return prod_pos - cons_pos; +} + +static __poll_t ringbuf_map_poll(struct bpf_map *map, struct file *filp, + struct poll_table_struct *pts) +{ + struct bpf_ringbuf_map *rb_map; + + rb_map = container_of(map, struct bpf_ringbuf_map, map); + poll_wait(filp, &rb_map->rb->waitq, pts); + + if (ringbuf_avail_data_sz(rb_map->rb)) + return EPOLLIN | EPOLLRDNORM; + return 0; +} + +const struct bpf_map_ops ringbuf_map_ops = { + .map_alloc = ringbuf_map_alloc, + .map_free = ringbuf_map_free, + .map_mmap = ringbuf_map_mmap, + .map_poll = ringbuf_map_poll, + .map_lookup_elem = ringbuf_map_lookup_elem, + .map_update_elem = ringbuf_map_update_elem, + .map_delete_elem = ringbuf_map_delete_elem, + .map_get_next_key = ringbuf_map_get_next_key, +}; + +/* Given pointer to ring buffer record metadata and struct bpf_ringbuf itself, + * calculate offset from record metadata to ring buffer in pages, rounded + * down. This page offset is stored as part of record metadata and allows to + * restore struct bpf_ringbuf * from record pointer. This page offset is + * stored at offset 4 of record metadata header. 
+ */ +static size_t bpf_ringbuf_rec_pg_off(struct bpf_ringbuf *rb, + struct bpf_ringbuf_hdr *hdr) +{ + return ((void *)hdr - (void *)rb) >> PAGE_SHIFT; +} + +/* Given pointer to ring buffer record header, restore pointer to struct + * bpf_ringbuf itself by using page offset stored at offset 4 + */ +static struct bpf_ringbuf * +bpf_ringbuf_restore_from_rec(struct bpf_ringbuf_hdr *hdr) +{ + unsigned long addr = (unsigned long)(void *)hdr; + unsigned long off = (unsigned long)hdr->pg_off << PAGE_SHIFT; + + return (void*)((addr & PAGE_MASK) - off); +} + +static void *__bpf_ringbuf_reserve(struct bpf_ringbuf *rb, u64 size) +{ + unsigned long cons_pos, prod_pos, new_prod_pos, flags; + u32 len, pg_off; + struct bpf_ringbuf_hdr *hdr; + + if (unlikely(size > RINGBUF_MAX_RECORD_SZ)) + return NULL; + + len = round_up(size + BPF_RINGBUF_HDR_SZ, 8); + cons_pos = smp_load_acquire(&rb->consumer_pos); + + if (in_nmi()) { + if (!spin_trylock_irqsave(&rb->spinlock, flags)) + return NULL; + } else { + spin_lock_irqsave(&rb->spinlock, flags); + } + + prod_pos = rb->producer_pos; + new_prod_pos = prod_pos + len; + + /* check for out of ringbuf space by ensuring producer position + * doesn't advance more than (ringbuf_size - 1) ahead + */ + if (new_prod_pos - cons_pos > rb->mask) { + spin_unlock_irqrestore(&rb->spinlock, flags); + return NULL; + } + + hdr = (void *)rb->data + (prod_pos & rb->mask); + pg_off = bpf_ringbuf_rec_pg_off(rb, hdr); + hdr->len = size | BPF_RINGBUF_BUSY_BIT; + hdr->pg_off = pg_off; + + /* pairs with consumer's smp_load_acquire() */ + smp_store_release(&rb->producer_pos, new_prod_pos); + + spin_unlock_irqrestore(&rb->spinlock, flags); + + return (void *)hdr + BPF_RINGBUF_HDR_SZ; +} + +BPF_CALL_3(bpf_ringbuf_reserve, struct bpf_map *, map, u64, size, u64, flags) +{ + struct bpf_ringbuf_map *rb_map; + + if (unlikely(flags)) + return 0; + + rb_map = container_of(map, struct bpf_ringbuf_map, map); + return (unsigned long)__bpf_ringbuf_reserve(rb_map->rb, size); +} + +const struct bpf_func_proto bpf_ringbuf_reserve_proto = { + .func = bpf_ringbuf_reserve, + .ret_type = RET_PTR_TO_ALLOC_MEM_OR_NULL, + .arg1_type = ARG_CONST_MAP_PTR, + .arg2_type = ARG_CONST_ALLOC_SIZE_OR_ZERO, + .arg3_type = ARG_ANYTHING, +}; + +static void bpf_ringbuf_commit(void *sample, u64 flags, bool discard) +{ + unsigned long rec_pos, cons_pos; + struct bpf_ringbuf_hdr *hdr; + struct bpf_ringbuf *rb; + u32 new_len; + + hdr = sample - BPF_RINGBUF_HDR_SZ; + rb = bpf_ringbuf_restore_from_rec(hdr); + new_len = hdr->len ^ BPF_RINGBUF_BUSY_BIT; + if (discard) + new_len |= BPF_RINGBUF_DISCARD_BIT; + + /* update record header with correct final size prefix */ + xchg(&hdr->len, new_len); + + /* if consumer caught up and is waiting for our record, notify about + * new data availability + */ + rec_pos = (void *)hdr - (void *)rb->data; + cons_pos = smp_load_acquire(&rb->consumer_pos) & rb->mask; + + if (flags & BPF_RB_FORCE_WAKEUP) + irq_work_queue(&rb->work); + else if (cons_pos == rec_pos && !(flags & BPF_RB_NO_WAKEUP)) + irq_work_queue(&rb->work); +} + +BPF_CALL_2(bpf_ringbuf_submit, void *, sample, u64, flags) +{ + bpf_ringbuf_commit(sample, flags, false /* discard */); + return 0; +} + +const struct bpf_func_proto bpf_ringbuf_submit_proto = { + .func = bpf_ringbuf_submit, + .ret_type = RET_VOID, + .arg1_type = ARG_PTR_TO_ALLOC_MEM, + .arg2_type = ARG_ANYTHING, +}; + +BPF_CALL_2(bpf_ringbuf_discard, void *, sample, u64, flags) +{ + bpf_ringbuf_commit(sample, flags, true /* discard */); + return 0; +} + +const struct 
bpf_func_proto bpf_ringbuf_discard_proto = { + .func = bpf_ringbuf_discard, + .ret_type = RET_VOID, + .arg1_type = ARG_PTR_TO_ALLOC_MEM, + .arg2_type = ARG_ANYTHING, +}; + +BPF_CALL_4(bpf_ringbuf_output, struct bpf_map *, map, void *, data, u64, size, + u64, flags) +{ + struct bpf_ringbuf_map *rb_map; + void *rec; + + if (unlikely(flags & ~(BPF_RB_NO_WAKEUP | BPF_RB_FORCE_WAKEUP))) + return -EINVAL; + + rb_map = container_of(map, struct bpf_ringbuf_map, map); + rec = __bpf_ringbuf_reserve(rb_map->rb, size); + if (!rec) + return -EAGAIN; + + memcpy(rec, data, size); + bpf_ringbuf_commit(rec, flags, false /* discard */); + return 0; +} + +const struct bpf_func_proto bpf_ringbuf_output_proto = { + .func = bpf_ringbuf_output, + .ret_type = RET_INTEGER, + .arg1_type = ARG_CONST_MAP_PTR, + .arg2_type = ARG_PTR_TO_MEM, + .arg3_type = ARG_CONST_SIZE_OR_ZERO, + .arg4_type = ARG_ANYTHING, +}; + +BPF_CALL_2(bpf_ringbuf_query, struct bpf_map *, map, u64, flags) +{ + struct bpf_ringbuf *rb; + + rb = container_of(map, struct bpf_ringbuf_map, map)->rb; + + switch (flags) { + case BPF_RB_AVAIL_DATA: + return ringbuf_avail_data_sz(rb); + case BPF_RB_RING_SIZE: + return rb->mask + 1; + case BPF_RB_CONS_POS: + return smp_load_acquire(&rb->consumer_pos); + case BPF_RB_PROD_POS: + return smp_load_acquire(&rb->producer_pos); + default: + return 0; + } +} + +const struct bpf_func_proto bpf_ringbuf_query_proto = { + .func = bpf_ringbuf_query, + .ret_type = RET_INTEGER, + .arg1_type = ARG_CONST_MAP_PTR, + .arg2_type = ARG_ANYTHING, +}; diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 2c969a9b90d3..9de3540fa90c 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -26,6 +26,7 @@ #include #include #include +#include #define IS_FD_ARRAY(map) ((map)->map_type == BPF_MAP_TYPE_PERF_EVENT_ARRAY || \ (map)->map_type == BPF_MAP_TYPE_CGROUP_ARRAY || \ @@ -662,6 +663,16 @@ out: return err; } +static __poll_t bpf_map_poll(struct file *filp, struct poll_table_struct *pts) +{ + struct bpf_map *map = filp->private_data; + + if (map->ops->map_poll) + return map->ops->map_poll(map, filp, pts); + + return EPOLLERR; +} + const struct file_operations bpf_map_fops = { #ifdef CONFIG_PROC_FS .show_fdinfo = bpf_map_show_fdinfo, @@ -670,6 +681,7 @@ const struct file_operations bpf_map_fops = { .read = bpf_dummy_read, .write = bpf_dummy_write, .mmap = bpf_map_mmap, + .poll = bpf_map_poll, }; int bpf_map_new_fd(struct bpf_map *map, int flags) diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 6d725a26f66e..5c7bbaac81ef 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -233,6 +233,7 @@ struct bpf_call_arg_meta { bool pkt_access; int regno; int access_size; + int mem_size; u64 msize_max_value; int ref_obj_id; int func_id; @@ -408,7 +409,8 @@ static bool reg_type_may_be_null(enum bpf_reg_type type) type == PTR_TO_SOCKET_OR_NULL || type == PTR_TO_SOCK_COMMON_OR_NULL || type == PTR_TO_TCP_SOCK_OR_NULL || - type == PTR_TO_BTF_ID_OR_NULL; + type == PTR_TO_BTF_ID_OR_NULL || + type == PTR_TO_MEM_OR_NULL; } static bool reg_may_point_to_spin_lock(const struct bpf_reg_state *reg) @@ -422,7 +424,9 @@ static bool reg_type_may_be_refcounted_or_null(enum bpf_reg_type type) return type == PTR_TO_SOCKET || type == PTR_TO_SOCKET_OR_NULL || type == PTR_TO_TCP_SOCK || - type == PTR_TO_TCP_SOCK_OR_NULL; + type == PTR_TO_TCP_SOCK_OR_NULL || + type == PTR_TO_MEM || + type == PTR_TO_MEM_OR_NULL; } static bool arg_type_may_be_refcounted(enum bpf_arg_type type) @@ -436,7 +440,9 @@ static bool 
arg_type_may_be_refcounted(enum bpf_arg_type type) */ static bool is_release_function(enum bpf_func_id func_id) { - return func_id == BPF_FUNC_sk_release; + return func_id == BPF_FUNC_sk_release || + func_id == BPF_FUNC_ringbuf_submit || + func_id == BPF_FUNC_ringbuf_discard; } static bool may_be_acquire_function(enum bpf_func_id func_id) @@ -444,7 +450,8 @@ static bool may_be_acquire_function(enum bpf_func_id func_id) return func_id == BPF_FUNC_sk_lookup_tcp || func_id == BPF_FUNC_sk_lookup_udp || func_id == BPF_FUNC_skc_lookup_tcp || - func_id == BPF_FUNC_map_lookup_elem; + func_id == BPF_FUNC_map_lookup_elem || + func_id == BPF_FUNC_ringbuf_reserve; } static bool is_acquire_function(enum bpf_func_id func_id, @@ -454,7 +461,8 @@ static bool is_acquire_function(enum bpf_func_id func_id, if (func_id == BPF_FUNC_sk_lookup_tcp || func_id == BPF_FUNC_sk_lookup_udp || - func_id == BPF_FUNC_skc_lookup_tcp) + func_id == BPF_FUNC_skc_lookup_tcp || + func_id == BPF_FUNC_ringbuf_reserve) return true; if (func_id == BPF_FUNC_map_lookup_elem && @@ -494,6 +502,8 @@ static const char * const reg_type_str[] = { [PTR_TO_XDP_SOCK] = "xdp_sock", [PTR_TO_BTF_ID] = "ptr_", [PTR_TO_BTF_ID_OR_NULL] = "ptr_or_null_", + [PTR_TO_MEM] = "mem", + [PTR_TO_MEM_OR_NULL] = "mem_or_null", }; static char slot_type_char[] = { @@ -2468,32 +2478,49 @@ static int check_map_access_type(struct bpf_verifier_env *env, u32 regno, return 0; } -/* check read/write into map element returned by bpf_map_lookup_elem() */ -static int __check_map_access(struct bpf_verifier_env *env, u32 regno, int off, - int size, bool zero_size_allowed) +/* check read/write into memory region (e.g., map value, ringbuf sample, etc) */ +static int __check_mem_access(struct bpf_verifier_env *env, int regno, + int off, int size, u32 mem_size, + bool zero_size_allowed) { - struct bpf_reg_state *regs = cur_regs(env); - struct bpf_map *map = regs[regno].map_ptr; + bool size_ok = size > 0 || (size == 0 && zero_size_allowed); + struct bpf_reg_state *reg; + + if (off >= 0 && size_ok && (u64)off + size <= mem_size) + return 0; - if (off < 0 || size < 0 || (size == 0 && !zero_size_allowed) || - off + size > map->value_size) { + reg = &cur_regs(env)[regno]; + switch (reg->type) { + case PTR_TO_MAP_VALUE: verbose(env, "invalid access to map value, value_size=%d off=%d size=%d\n", - map->value_size, off, size); - return -EACCES; + mem_size, off, size); + break; + case PTR_TO_PACKET: + case PTR_TO_PACKET_META: + case PTR_TO_PACKET_END: + verbose(env, "invalid access to packet, off=%d size=%d, R%d(id=%d,off=%d,r=%d)\n", + off, size, regno, reg->id, off, mem_size); + break; + case PTR_TO_MEM: + default: + verbose(env, "invalid access to memory, mem_size=%u off=%d size=%d\n", + mem_size, off, size); } - return 0; + + return -EACCES; } -/* check read/write into a map element with possible variable offset */ -static int check_map_access(struct bpf_verifier_env *env, u32 regno, - int off, int size, bool zero_size_allowed) +/* check read/write into a memory region with possible variable offset */ +static int check_mem_region_access(struct bpf_verifier_env *env, u32 regno, + int off, int size, u32 mem_size, + bool zero_size_allowed) { struct bpf_verifier_state *vstate = env->cur_state; struct bpf_func_state *state = vstate->frame[vstate->curframe]; struct bpf_reg_state *reg = &state->regs[regno]; int err; - /* We may have adjusted the register to this map value, so we + /* We may have adjusted the register pointing to memory region, so we * need to try adding each of min_value 
and max_value to off * to make sure our theoretical access will be safe. */ @@ -2514,10 +2541,10 @@ static int check_map_access(struct bpf_verifier_env *env, u32 regno, regno); return -EACCES; } - err = __check_map_access(env, regno, reg->smin_value + off, size, - zero_size_allowed); + err = __check_mem_access(env, regno, reg->smin_value + off, size, + mem_size, zero_size_allowed); if (err) { - verbose(env, "R%d min value is outside of the array range\n", + verbose(env, "R%d min value is outside of the allowed memory range\n", regno); return err; } @@ -2527,18 +2554,38 @@ static int check_map_access(struct bpf_verifier_env *env, u32 regno, * If reg->umax_value + off could overflow, treat that as unbounded too. */ if (reg->umax_value >= BPF_MAX_VAR_OFF) { - verbose(env, "R%d unbounded memory access, make sure to bounds check any array access into a map\n", + verbose(env, "R%d unbounded memory access, make sure to bounds check any such access\n", regno); return -EACCES; } - err = __check_map_access(env, regno, reg->umax_value + off, size, - zero_size_allowed); - if (err) - verbose(env, "R%d max value is outside of the array range\n", + err = __check_mem_access(env, regno, reg->umax_value + off, size, + mem_size, zero_size_allowed); + if (err) { + verbose(env, "R%d max value is outside of the allowed memory range\n", regno); + return err; + } + + return 0; +} - if (map_value_has_spin_lock(reg->map_ptr)) { - u32 lock = reg->map_ptr->spin_lock_off; +/* check read/write into a map element with possible variable offset */ +static int check_map_access(struct bpf_verifier_env *env, u32 regno, + int off, int size, bool zero_size_allowed) +{ + struct bpf_verifier_state *vstate = env->cur_state; + struct bpf_func_state *state = vstate->frame[vstate->curframe]; + struct bpf_reg_state *reg = &state->regs[regno]; + struct bpf_map *map = reg->map_ptr; + int err; + + err = check_mem_region_access(env, regno, off, size, map->value_size, + zero_size_allowed); + if (err) + return err; + + if (map_value_has_spin_lock(map)) { + u32 lock = map->spin_lock_off; /* if any part of struct bpf_spin_lock can be touched by * load/store reject this program. @@ -2596,21 +2643,6 @@ static bool may_access_direct_pkt_data(struct bpf_verifier_env *env, } } -static int __check_packet_access(struct bpf_verifier_env *env, u32 regno, - int off, int size, bool zero_size_allowed) -{ - struct bpf_reg_state *regs = cur_regs(env); - struct bpf_reg_state *reg = ®s[regno]; - - if (off < 0 || size < 0 || (size == 0 && !zero_size_allowed) || - (u64)off + size > reg->range) { - verbose(env, "invalid access to packet, off=%d size=%d, R%d(id=%d,off=%d,r=%d)\n", - off, size, regno, reg->id, reg->off, reg->range); - return -EACCES; - } - return 0; -} - static int check_packet_access(struct bpf_verifier_env *env, u32 regno, int off, int size, bool zero_size_allowed) { @@ -2631,16 +2663,17 @@ static int check_packet_access(struct bpf_verifier_env *env, u32 regno, int off, regno); return -EACCES; } - err = __check_packet_access(env, regno, off, size, zero_size_allowed); + err = __check_mem_access(env, regno, off, size, reg->range, + zero_size_allowed); if (err) { verbose(env, "R%d offset is outside of the packet\n", regno); return err; } - /* __check_packet_access has made sure "off + size - 1" is within u16. + /* __check_mem_access has made sure "off + size - 1" is within u16. 
* reg->umax_value can't be bigger than MAX_PACKET_OFF which is 0xffff, * otherwise find_good_pkt_pointers would have refused to set range info - * that __check_packet_access would have rejected this pkt access. + * that __check_mem_access would have rejected this pkt access. * Therefore, "off + reg->umax_value + size - 1" won't overflow u32. */ env->prog->aux->max_pkt_offset = @@ -3220,6 +3253,16 @@ static int check_mem_access(struct bpf_verifier_env *env, int insn_idx, u32 regn mark_reg_unknown(env, regs, value_regno); } } + } else if (reg->type == PTR_TO_MEM) { + if (t == BPF_WRITE && value_regno >= 0 && + is_pointer_value(env, value_regno)) { + verbose(env, "R%d leaks addr into mem\n", value_regno); + return -EACCES; + } + err = check_mem_region_access(env, regno, off, size, + reg->mem_size, false); + if (!err && t == BPF_READ && value_regno >= 0) + mark_reg_unknown(env, regs, value_regno); } else if (reg->type == PTR_TO_CTX) { enum bpf_reg_type reg_type = SCALAR_VALUE; u32 btf_id = 0; @@ -3557,6 +3600,10 @@ static int check_helper_mem_access(struct bpf_verifier_env *env, int regno, return -EACCES; return check_map_access(env, regno, reg->off, access_size, zero_size_allowed); + case PTR_TO_MEM: + return check_mem_region_access(env, regno, reg->off, + access_size, reg->mem_size, + zero_size_allowed); default: /* scalar_value|ptr_to_stack or invalid ptr */ return check_stack_boundary(env, regno, access_size, zero_size_allowed, meta); @@ -3661,6 +3708,17 @@ static bool arg_type_is_mem_size(enum bpf_arg_type type) type == ARG_CONST_SIZE_OR_ZERO; } +static bool arg_type_is_alloc_mem_ptr(enum bpf_arg_type type) +{ + return type == ARG_PTR_TO_ALLOC_MEM || + type == ARG_PTR_TO_ALLOC_MEM_OR_NULL; +} + +static bool arg_type_is_alloc_size(enum bpf_arg_type type) +{ + return type == ARG_CONST_ALLOC_SIZE_OR_ZERO; +} + static bool arg_type_is_int_ptr(enum bpf_arg_type type) { return type == ARG_PTR_TO_INT || @@ -3720,7 +3778,8 @@ static int check_func_arg(struct bpf_verifier_env *env, u32 regno, type != expected_type) goto err_type; } else if (arg_type == ARG_CONST_SIZE || - arg_type == ARG_CONST_SIZE_OR_ZERO) { + arg_type == ARG_CONST_SIZE_OR_ZERO || + arg_type == ARG_CONST_ALLOC_SIZE_OR_ZERO) { expected_type = SCALAR_VALUE; if (type != expected_type) goto err_type; @@ -3791,13 +3850,29 @@ static int check_func_arg(struct bpf_verifier_env *env, u32 regno, * happens during stack boundary checking. 
*/ if (register_is_null(reg) && - arg_type == ARG_PTR_TO_MEM_OR_NULL) + (arg_type == ARG_PTR_TO_MEM_OR_NULL || + arg_type == ARG_PTR_TO_ALLOC_MEM_OR_NULL)) /* final test in check_stack_boundary() */; else if (!type_is_pkt_pointer(type) && type != PTR_TO_MAP_VALUE && + type != PTR_TO_MEM && type != expected_type) goto err_type; meta->raw_mode = arg_type == ARG_PTR_TO_UNINIT_MEM; + } else if (arg_type_is_alloc_mem_ptr(arg_type)) { + expected_type = PTR_TO_MEM; + if (register_is_null(reg) && + arg_type == ARG_PTR_TO_ALLOC_MEM_OR_NULL) + /* final test in check_stack_boundary() */; + else if (type != expected_type) + goto err_type; + if (meta->ref_obj_id) { + verbose(env, "verifier internal error: more than one arg with ref_obj_id R%d %u %u\n", + regno, reg->ref_obj_id, + meta->ref_obj_id); + return -EFAULT; + } + meta->ref_obj_id = reg->ref_obj_id; } else if (arg_type_is_int_ptr(arg_type)) { expected_type = PTR_TO_STACK; if (!type_is_pkt_pointer(type) && @@ -3893,6 +3968,13 @@ static int check_func_arg(struct bpf_verifier_env *env, u32 regno, zero_size_allowed, meta); if (!err) err = mark_chain_precision(env, regno); + } else if (arg_type_is_alloc_size(arg_type)) { + if (!tnum_is_const(reg->var_off)) { + verbose(env, "R%d unbounded size, use 'var &= const' or 'if (var < const)'\n", + regno); + return -EACCES; + } + meta->mem_size = reg->var_off.value; } else if (arg_type_is_int_ptr(arg_type)) { int size = int_ptr_type_to_size(arg_type); @@ -3929,6 +4011,14 @@ static int check_map_func_compatibility(struct bpf_verifier_env *env, func_id != BPF_FUNC_xdp_output) goto error; break; + case BPF_MAP_TYPE_RINGBUF: + if (func_id != BPF_FUNC_ringbuf_output && + func_id != BPF_FUNC_ringbuf_reserve && + func_id != BPF_FUNC_ringbuf_submit && + func_id != BPF_FUNC_ringbuf_discard && + func_id != BPF_FUNC_ringbuf_query) + goto error; + break; case BPF_MAP_TYPE_STACK_TRACE: if (func_id != BPF_FUNC_get_stackid) goto error; @@ -4655,6 +4745,11 @@ static int check_helper_call(struct bpf_verifier_env *env, int func_id, int insn mark_reg_known_zero(env, regs, BPF_REG_0); regs[BPF_REG_0].type = PTR_TO_TCP_SOCK_OR_NULL; regs[BPF_REG_0].id = ++env->id_gen; + } else if (fn->ret_type == RET_PTR_TO_ALLOC_MEM_OR_NULL) { + mark_reg_known_zero(env, regs, BPF_REG_0); + regs[BPF_REG_0].type = PTR_TO_MEM_OR_NULL; + regs[BPF_REG_0].id = ++env->id_gen; + regs[BPF_REG_0].mem_size = meta.mem_size; } else { verbose(env, "unknown return type %d of func %s#%d\n", fn->ret_type, func_id_name(func_id), func_id); @@ -6611,6 +6706,8 @@ static void mark_ptr_or_null_reg(struct bpf_func_state *state, reg->type = PTR_TO_TCP_SOCK; } else if (reg->type == PTR_TO_BTF_ID_OR_NULL) { reg->type = PTR_TO_BTF_ID; + } else if (reg->type == PTR_TO_MEM_OR_NULL) { + reg->type = PTR_TO_MEM; } if (is_null) { /* We don't need id and ref_obj_id from this point diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c index 187cd6995bbb..3767d34114c0 100644 --- a/kernel/trace/bpf_trace.c +++ b/kernel/trace/bpf_trace.c @@ -1088,6 +1088,16 @@ bpf_tracing_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) return &bpf_perf_event_read_value_proto; case BPF_FUNC_get_ns_current_pid_tgid: return &bpf_get_ns_current_pid_tgid_proto; + case BPF_FUNC_ringbuf_output: + return &bpf_ringbuf_output_proto; + case BPF_FUNC_ringbuf_reserve: + return &bpf_ringbuf_reserve_proto; + case BPF_FUNC_ringbuf_submit: + return &bpf_ringbuf_submit_proto; + case BPF_FUNC_ringbuf_discard: + return &bpf_ringbuf_discard_proto; + case BPF_FUNC_ringbuf_query: + return 
&bpf_ringbuf_query_proto; default: return NULL; } diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h index 54b93f8b49b8..974ca6e948e3 100644 --- a/tools/include/uapi/linux/bpf.h +++ b/tools/include/uapi/linux/bpf.h @@ -147,6 +147,7 @@ enum bpf_map_type { BPF_MAP_TYPE_SK_STORAGE, BPF_MAP_TYPE_DEVMAP_HASH, BPF_MAP_TYPE_STRUCT_OPS, + BPF_MAP_TYPE_RINGBUF, }; /* Note that tracing related programs such as @@ -3157,6 +3158,59 @@ union bpf_attr { * **bpf_sk_cgroup_id**\ (). * Return * The id is returned or 0 in case the id could not be retrieved. + * + * void *bpf_ringbuf_output(void *ringbuf, void *data, u64 size, u64 flags) + * Description + * Copy *size* bytes from *data* into a ring buffer *ringbuf*. + * If BPF_RB_NO_WAKEUP is specified in *flags*, no notification of + * new data availability is sent. + * IF BPF_RB_FORCE_WAKEUP is specified in *flags*, notification of + * new data availability is sent unconditionally. + * Return + * 0, on success; + * < 0, on error. + * + * void *bpf_ringbuf_reserve(void *ringbuf, u64 size, u64 flags) + * Description + * Reserve *size* bytes of payload in a ring buffer *ringbuf*. + * Return + * Valid pointer with *size* bytes of memory available; NULL, + * otherwise. + * + * void bpf_ringbuf_submit(void *data, u64 flags) + * Description + * Submit reserved ring buffer sample, pointed to by *data*. + * If BPF_RB_NO_WAKEUP is specified in *flags*, no notification of + * new data availability is sent. + * IF BPF_RB_FORCE_WAKEUP is specified in *flags*, notification of + * new data availability is sent unconditionally. + * Return + * Nothing. Always succeeds. + * + * void bpf_ringbuf_discard(void *data, u64 flags) + * Description + * Discard reserved ring buffer sample, pointed to by *data*. + * If BPF_RB_NO_WAKEUP is specified in *flags*, no notification of + * new data availability is sent. + * IF BPF_RB_FORCE_WAKEUP is specified in *flags*, notification of + * new data availability is sent unconditionally. + * Return + * Nothing. Always succeeds. + * + * u64 bpf_ringbuf_query(void *ringbuf, u64 flags) + * Description + * Query various characteristics of provided ring buffer. What + * exactly is queries is determined by *flags*: + * - BPF_RB_AVAIL_DATA - amount of data not yet consumed; + * - BPF_RB_RING_SIZE - the size of ring buffer; + * - BPF_RB_CONS_POS - consumer position (can wrap around); + * - BPF_RB_PROD_POS - producer(s) position (can wrap around); + * Data returned is just a momentary snapshots of actual values + * and could be inaccurate, so this facility should be used to + * power heuristics and for reporting, not to make 100% correct + * calculation. + * Return + * Requested value, or 0, if flags are not recognized. */ #define __BPF_FUNC_MAPPER(FN) \ FN(unspec), \ @@ -3288,7 +3342,12 @@ union bpf_attr { FN(seq_printf), \ FN(seq_write), \ FN(sk_cgroup_id), \ - FN(sk_ancestor_cgroup_id), + FN(sk_ancestor_cgroup_id), \ + FN(ringbuf_output), \ + FN(ringbuf_reserve), \ + FN(ringbuf_submit), \ + FN(ringbuf_discard), \ + FN(ringbuf_query), /* integer value in 'imm' field of BPF_CALL instruction selects which helper * function eBPF program intends to call @@ -3398,6 +3457,29 @@ enum { BPF_F_GET_BRANCH_RECORDS_SIZE = (1ULL << 0), }; +/* BPF_FUNC_bpf_ringbuf_commit, BPF_FUNC_bpf_ringbuf_discard, and + * BPF_FUNC_bpf_ringbuf_output flags. 
+ */ +enum { + BPF_RB_NO_WAKEUP = (1ULL << 0), + BPF_RB_FORCE_WAKEUP = (1ULL << 1), +}; + +/* BPF_FUNC_bpf_ringbuf_query flags */ +enum { + BPF_RB_AVAIL_DATA = 0, + BPF_RB_RING_SIZE = 1, + BPF_RB_CONS_POS = 2, + BPF_RB_PROD_POS = 3, +}; + +/* BPF ring buffer constants */ +enum { + BPF_RINGBUF_BUSY_BIT = (1U << 31), + BPF_RINGBUF_DISCARD_BIT = (1U << 30), + BPF_RINGBUF_HDR_SZ = 8, +}; + /* Mode for BPF_FUNC_skb_adjust_room helper. */ enum bpf_adj_room_mode { BPF_ADJ_ROOM_NET, diff --git a/tools/testing/selftests/bpf/verifier/and.c b/tools/testing/selftests/bpf/verifier/and.c index e0fad1548737..d781bc86e100 100644 --- a/tools/testing/selftests/bpf/verifier/and.c +++ b/tools/testing/selftests/bpf/verifier/and.c @@ -15,7 +15,7 @@ BPF_EXIT_INSN(), }, .fixup_map_hash_48b = { 3 }, - .errstr = "R0 max value is outside of the array range", + .errstr = "R0 max value is outside of the allowed memory range", .result = REJECT, .flags = F_NEEDS_EFFICIENT_UNALIGNED_ACCESS, }, @@ -44,7 +44,7 @@ BPF_EXIT_INSN(), }, .fixup_map_hash_48b = { 3 }, - .errstr = "R0 max value is outside of the array range", + .errstr = "R0 max value is outside of the allowed memory range", .result = REJECT, .flags = F_NEEDS_EFFICIENT_UNALIGNED_ACCESS, }, diff --git a/tools/testing/selftests/bpf/verifier/array_access.c b/tools/testing/selftests/bpf/verifier/array_access.c index f3c33e128709..1c4b1939f5a8 100644 --- a/tools/testing/selftests/bpf/verifier/array_access.c +++ b/tools/testing/selftests/bpf/verifier/array_access.c @@ -117,7 +117,7 @@ BPF_EXIT_INSN(), }, .fixup_map_hash_48b = { 3 }, - .errstr = "R0 min value is outside of the array range", + .errstr = "R0 min value is outside of the allowed memory range", .result = REJECT, .flags = F_NEEDS_EFFICIENT_UNALIGNED_ACCESS, }, @@ -137,7 +137,7 @@ BPF_EXIT_INSN(), }, .fixup_map_hash_48b = { 3 }, - .errstr = "R0 unbounded memory access, make sure to bounds check any array access into a map", + .errstr = "R0 unbounded memory access, make sure to bounds check any such access", .result = REJECT, .flags = F_NEEDS_EFFICIENT_UNALIGNED_ACCESS, }, diff --git a/tools/testing/selftests/bpf/verifier/bounds.c b/tools/testing/selftests/bpf/verifier/bounds.c index 58f4aa593b1b..4d6645f2874c 100644 --- a/tools/testing/selftests/bpf/verifier/bounds.c +++ b/tools/testing/selftests/bpf/verifier/bounds.c @@ -20,7 +20,7 @@ BPF_EXIT_INSN(), }, .fixup_map_hash_8b = { 3 }, - .errstr = "R0 max value is outside of the array range", + .errstr = "R0 max value is outside of the allowed memory range", .result = REJECT, }, { @@ -146,7 +146,7 @@ BPF_EXIT_INSN(), }, .fixup_map_hash_8b = { 3 }, - .errstr = "R0 min value is outside of the array range", + .errstr = "R0 min value is outside of the allowed memory range", .result = REJECT }, { @@ -354,7 +354,7 @@ BPF_EXIT_INSN(), }, .fixup_map_hash_8b = { 3 }, - .errstr = "R0 max value is outside of the array range", + .errstr = "R0 max value is outside of the allowed memory range", .result = REJECT }, { diff --git a/tools/testing/selftests/bpf/verifier/calls.c b/tools/testing/selftests/bpf/verifier/calls.c index 7629a0cebb9b..94258c6b5235 100644 --- a/tools/testing/selftests/bpf/verifier/calls.c +++ b/tools/testing/selftests/bpf/verifier/calls.c @@ -105,7 +105,7 @@ .prog_type = BPF_PROG_TYPE_SCHED_CLS, .fixup_map_hash_8b = { 16 }, .result = REJECT, - .errstr = "R0 min value is outside of the array range", + .errstr = "R0 min value is outside of the allowed memory range", }, { "calls: overlapping caller/callee", diff --git 
a/tools/testing/selftests/bpf/verifier/direct_value_access.c b/tools/testing/selftests/bpf/verifier/direct_value_access.c index b9fb28e8e224..988f46a1a4c7 100644 --- a/tools/testing/selftests/bpf/verifier/direct_value_access.c +++ b/tools/testing/selftests/bpf/verifier/direct_value_access.c @@ -68,7 +68,7 @@ }, .fixup_map_array_48b = { 1 }, .result = REJECT, - .errstr = "R1 min value is outside of the array range", + .errstr = "R1 min value is outside of the allowed memory range", }, { "direct map access, write test 7", @@ -220,7 +220,7 @@ }, .fixup_map_array_small = { 1 }, .result = REJECT, - .errstr = "R1 min value is outside of the array range", + .errstr = "R1 min value is outside of the allowed memory range", }, { "direct map access, write test 19", diff --git a/tools/testing/selftests/bpf/verifier/helper_access_var_len.c b/tools/testing/selftests/bpf/verifier/helper_access_var_len.c index 67ab12410050..5a605ae131a9 100644 --- a/tools/testing/selftests/bpf/verifier/helper_access_var_len.c +++ b/tools/testing/selftests/bpf/verifier/helper_access_var_len.c @@ -318,7 +318,7 @@ BPF_EXIT_INSN(), }, .fixup_map_hash_48b = { 4 }, - .errstr = "R1 min value is outside of the array range", + .errstr = "R1 min value is outside of the allowed memory range", .result = REJECT, .prog_type = BPF_PROG_TYPE_TRACEPOINT, }, diff --git a/tools/testing/selftests/bpf/verifier/helper_value_access.c b/tools/testing/selftests/bpf/verifier/helper_value_access.c index 7572e403ddb9..961f28139b96 100644 --- a/tools/testing/selftests/bpf/verifier/helper_value_access.c +++ b/tools/testing/selftests/bpf/verifier/helper_value_access.c @@ -280,7 +280,7 @@ BPF_EXIT_INSN(), }, .fixup_map_hash_48b = { 3 }, - .errstr = "R1 min value is outside of the array range", + .errstr = "R1 min value is outside of the allowed memory range", .result = REJECT, .prog_type = BPF_PROG_TYPE_TRACEPOINT, }, @@ -415,7 +415,7 @@ BPF_EXIT_INSN(), }, .fixup_map_hash_48b = { 3 }, - .errstr = "R1 min value is outside of the array range", + .errstr = "R1 min value is outside of the allowed memory range", .result = REJECT, .prog_type = BPF_PROG_TYPE_TRACEPOINT, }, @@ -926,7 +926,7 @@ }, .fixup_map_hash_16b = { 3, 10 }, .result = REJECT, - .errstr = "R2 unbounded memory access, make sure to bounds check any array access into a map", + .errstr = "R2 unbounded memory access, make sure to bounds check any such access", .prog_type = BPF_PROG_TYPE_TRACEPOINT, }, { diff --git a/tools/testing/selftests/bpf/verifier/value_ptr_arith.c b/tools/testing/selftests/bpf/verifier/value_ptr_arith.c index a53d99cebd9f..97ee658e1242 100644 --- a/tools/testing/selftests/bpf/verifier/value_ptr_arith.c +++ b/tools/testing/selftests/bpf/verifier/value_ptr_arith.c @@ -50,7 +50,7 @@ .fixup_map_array_48b = { 8 }, .result = ACCEPT, .result_unpriv = REJECT, - .errstr_unpriv = "R0 min value is outside of the array range", + .errstr_unpriv = "R0 min value is outside of the allowed memory range", .retval = 1, }, { @@ -325,7 +325,7 @@ }, .fixup_map_array_48b = { 3 }, .result = REJECT, - .errstr = "R0 min value is outside of the array range", + .errstr = "R0 min value is outside of the allowed memory range", .result_unpriv = REJECT, .errstr_unpriv = "R0 pointer arithmetic of map value goes out of range", }, @@ -601,7 +601,7 @@ }, .fixup_map_array_48b = { 3 }, .result = REJECT, - .errstr = "R1 max value is outside of the array range", + .errstr = "R1 max value is outside of the allowed memory range", .errstr_unpriv = "R1 pointer arithmetic of map value goes out of range", .flags = F_NEEDS_EFFICIENT_UNALIGNED_ACCESS, }, @@ -726,7 +726,7 @@ }, .fixup_map_array_48b = { 3 }, .result = REJECT, - .errstr = "R0 min value is outside of the array range", + .errstr = "R0 min value is outside of the allowed memory range", }, { "map access: value_ptr -= known scalar, 2", -- cgit v1.2.3
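For context, a minimal sketch of how a BPF program might use the reserve/submit API introduced by the patch above. The event layout, map name, and section name are illustrative rather than part of the patch, and a libbpf-style build is assumed:

/* Illustrative only: emit one sample per exec via the new ring buffer. */
#include <linux/bpf.h>
#include <bpf/bpf_helpers.h>

struct event {
	int pid;
	char comm[16];
};

struct {
	__uint(type, BPF_MAP_TYPE_RINGBUF);
	__uint(max_entries, 256 * 1024); /* ring size: page-aligned power of 2 */
} events SEC(".maps");

SEC("tracepoint/sched/sched_process_exec")
int handle_exec(void *ctx)
{
	struct event *e;

	/* NULL means the ring is full or the size was over the limit */
	e = bpf_ringbuf_reserve(&events, sizeof(*e), 0);
	if (!e)
		return 0;

	e->pid = bpf_get_current_pid_tgid() >> 32;
	bpf_get_current_comm(e->comm, sizeof(e->comm));

	/* commit; consumers are woken unless BPF_RB_NO_WAKEUP is passed */
	bpf_ringbuf_submit(e, 0);
	return 0;
}

char LICENSE[] SEC("license") = "GPL";

Note how the verifier changes above fit this pattern: the reserved pointer is tracked as PTR_TO_MEM_OR_NULL with a reference, so every successful reserve must be paired with a submit or discard on all program paths.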
From fbee97feed9b3e4acdf9590e1f6b4a2eefecfffe Mon Sep 17 00:00:00 2001 From: David Ahern Date: Fri, 29 May 2020 16:07:13 -0600 Subject: bpf: Add support to attach bpf program to a devmap entry MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add BPF_XDP_DEVMAP attach type for use with programs associated with a DEVMAP entry. Allow DEVMAPs to associate a program with a device entry by adding a bpf_prog.fd to 'struct bpf_devmap_val'. Values read show the program id, so the fd and id are a union. bpf programs can get access to the struct via vmlinux.h. The program associated with the fd must have type XDP with expected attach type BPF_XDP_DEVMAP. When a program is associated with a device index, the program is run on XDP_REDIRECT, before the buffer is added to the per-cpu queue. At this point rxq data is still valid; the next patch adds tx device information allowing the program to see both ingress and egress device indices. XDP generic is skb based and XDP programs do not work with skbs. Block that use case by walking the maps used by a program that is to be attached via xdpgeneric and failing if any of them are DEVMAP / DEVMAP_HASH that can have a program attached to an entry. Also block attaching BPF_XDP_DEVMAP programs to a device directly. Signed-off-by: David Ahern Signed-off-by: Alexei Starovoitov Acked-by: Toke Høiland-Jørgensen Link: https://lore.kernel.org/bpf/20200529220716.75383-3-dsahern@kernel.org Signed-off-by: Alexei Starovoitov --- include/linux/bpf.h | 5 +++ include/uapi/linux/bpf.h | 1 + kernel/bpf/devmap.c | 88 ++++++++++++++++++++++++++++++++++++++++-- net/core/dev.c | 18 +++++++++ tools/include/uapi/linux/bpf.h | 1 + 5 files changed, 109 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index e5884f7f801c..e042311f991f 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -1250,6 +1250,7 @@ int dev_map_enqueue(struct bpf_dtab_netdev *dst, struct xdp_buff *xdp, struct net_device *dev_rx); int dev_map_generic_redirect(struct bpf_dtab_netdev *dst, struct sk_buff *skb, struct bpf_prog *xdp_prog); +bool dev_map_can_have_prog(struct bpf_map *map); struct bpf_cpu_map_entry *__cpu_map_lookup_elem(struct bpf_map *map, u32 key); void __cpu_map_flush(void); @@ -1363,6 +1364,10 @@ static inline struct net_device *__dev_map_hash_lookup_elem(struct bpf_map *map { return NULL; } +static inline bool dev_map_can_have_prog(struct bpf_map *map) +{ + return false; +} static inline void __dev_flush(void) { diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 630432c5c292..f1e364d69007 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -225,6 +225,7 @@ enum bpf_attach_type { BPF_CGROUP_INET6_GETPEERNAME, BPF_CGROUP_INET4_GETSOCKNAME, BPF_CGROUP_INET6_GETSOCKNAME, + BPF_XDP_DEVMAP, __MAX_BPF_ATTACH_TYPE }; diff --git a/kernel/bpf/devmap.c b/kernel/bpf/devmap.c index a1459de0914e..0089d56617ec 100644 --- a/kernel/bpf/devmap.c +++ b/kernel/bpf/devmap.c @@ -63,12 +63,17 @@ struct xdp_dev_bulk_queue { /* DEVMAP values */ struct bpf_devmap_val { u32 ifindex; /* device index */ + union { + int fd; /* prog fd on map write */ + u32 id; /* prog id on map read */ + } bpf_prog; }; struct bpf_dtab_netdev { struct
net_device *dev; /* must be first member, due to tracepoint */ struct hlist_node index_hlist; struct bpf_dtab *dtab; + struct bpf_prog *xdp_prog; struct rcu_head rcu; unsigned int idx; struct bpf_devmap_val val; @@ -111,12 +116,18 @@ static inline struct hlist_head *dev_map_index_hash(struct bpf_dtab *dtab, static int dev_map_init_map(struct bpf_dtab *dtab, union bpf_attr *attr) { + u32 valsize = attr->value_size; u64 cost = 0; int err; - /* check sanity of attributes */ + /* check sanity of attributes. 2 value sizes supported: + * 4 bytes: ifindex + * 8 bytes: ifindex + prog fd + */ if (attr->max_entries == 0 || attr->key_size != 4 || - attr->value_size != 4 || attr->map_flags & ~DEV_CREATE_FLAG_MASK) + (valsize != offsetofend(struct bpf_devmap_val, ifindex) && + valsize != offsetofend(struct bpf_devmap_val, bpf_prog.fd)) || + attr->map_flags & ~DEV_CREATE_FLAG_MASK) return -EINVAL; /* Lookup returns a pointer straight to dev->ifindex, so make sure the @@ -223,6 +234,8 @@ static void dev_map_free(struct bpf_map *map) hlist_for_each_entry_safe(dev, next, head, index_hlist) { hlist_del_rcu(&dev->index_hlist); + if (dev->xdp_prog) + bpf_prog_put(dev->xdp_prog); dev_put(dev->dev); kfree(dev); } @@ -237,6 +250,8 @@ static void dev_map_free(struct bpf_map *map) if (!dev) continue; + if (dev->xdp_prog) + bpf_prog_put(dev->xdp_prog); dev_put(dev->dev); kfree(dev); } @@ -323,6 +338,16 @@ static int dev_map_hash_get_next_key(struct bpf_map *map, void *key, return -ENOENT; } +bool dev_map_can_have_prog(struct bpf_map *map) +{ + if ((map->map_type == BPF_MAP_TYPE_DEVMAP || + map->map_type == BPF_MAP_TYPE_DEVMAP_HASH) && + map->value_size != offsetofend(struct bpf_devmap_val, ifindex)) + return true; + + return false; +} + static int bq_xmit_all(struct xdp_dev_bulk_queue *bq, u32 flags) { struct net_device *dev = bq->dev; @@ -447,6 +472,30 @@ static inline int __xdp_enqueue(struct net_device *dev, struct xdp_buff *xdp, return bq_enqueue(dev, xdpf, dev_rx); } +static struct xdp_buff *dev_map_run_prog(struct net_device *dev, + struct xdp_buff *xdp, + struct bpf_prog *xdp_prog) +{ + u32 act; + + act = bpf_prog_run_xdp(xdp_prog, xdp); + switch (act) { + case XDP_PASS: + return xdp; + case XDP_DROP: + break; + default: + bpf_warn_invalid_xdp_action(act); + fallthrough; + case XDP_ABORTED: + trace_xdp_exception(dev, xdp_prog, act); + break; + } + + xdp_return_buff(xdp); + return NULL; +} + int dev_xdp_enqueue(struct net_device *dev, struct xdp_buff *xdp, struct net_device *dev_rx) { @@ -458,6 +507,11 @@ int dev_map_enqueue(struct bpf_dtab_netdev *dst, struct xdp_buff *xdp, { struct net_device *dev = dst->dev; + if (dst->xdp_prog) { + xdp = dev_map_run_prog(dev, xdp, dst->xdp_prog); + if (!xdp) + return 0; + } return __xdp_enqueue(dev, xdp, dev_rx); } @@ -494,6 +548,8 @@ static void __dev_map_entry_free(struct rcu_head *rcu) struct bpf_dtab_netdev *dev; dev = container_of(rcu, struct bpf_dtab_netdev, rcu); + if (dev->xdp_prog) + bpf_prog_put(dev->xdp_prog); dev_put(dev->dev); kfree(dev); } @@ -547,6 +603,7 @@ static struct bpf_dtab_netdev *__dev_map_alloc_node(struct net *net, struct bpf_devmap_val *val, unsigned int idx) { + struct bpf_prog *prog = NULL; struct bpf_dtab_netdev *dev; dev = kmalloc_node(sizeof(*dev), GFP_ATOMIC | __GFP_NOWARN, @@ -558,11 +615,31 @@ static struct bpf_dtab_netdev *__dev_map_alloc_node(struct net *net, if (!dev->dev) goto err_out; + if (val->bpf_prog.fd >= 0) { + prog = bpf_prog_get_type_dev(val->bpf_prog.fd, + BPF_PROG_TYPE_XDP, false); + if (IS_ERR(prog)) + goto err_put_dev; + 
if (prog->expected_attach_type != BPF_XDP_DEVMAP) + goto err_put_prog; + } + dev->idx = idx; dev->dtab = dtab; + if (prog) { + dev->xdp_prog = prog; + dev->val.bpf_prog.id = prog->aux->id; + } else { + dev->xdp_prog = NULL; + dev->val.bpf_prog.id = 0; + } dev->val.ifindex = val->ifindex; return dev; +err_put_prog: + bpf_prog_put(prog); +err_put_dev: + dev_put(dev->dev); err_out: kfree(dev); return ERR_PTR(-EINVAL); @@ -572,8 +649,8 @@ static int __dev_map_update_elem(struct net *net, struct bpf_map *map, void *key, void *value, u64 map_flags) { struct bpf_dtab *dtab = container_of(map, struct bpf_dtab, map); + struct bpf_devmap_val val = { .bpf_prog.fd = -1 }; struct bpf_dtab_netdev *dev, *old_dev; - struct bpf_devmap_val val = { }; u32 i = *(u32 *)key; if (unlikely(map_flags > BPF_EXIST)) @@ -588,6 +665,9 @@ static int __dev_map_update_elem(struct net *net, struct bpf_map *map, if (!val.ifindex) { dev = NULL; + /* can not specify fd if ifindex is 0 */ + if (val.bpf_prog.fd != -1) + return -EINVAL; } else { dev = __dev_map_alloc_node(net, dtab, &val, i); if (IS_ERR(dev)) @@ -616,8 +696,8 @@ static int __dev_map_hash_update_elem(struct net *net, struct bpf_map *map, void *key, void *value, u64 map_flags) { struct bpf_dtab *dtab = container_of(map, struct bpf_dtab, map); + struct bpf_devmap_val val = { .bpf_prog.fd = -1 }; struct bpf_dtab_netdev *dev, *old_dev; - struct bpf_devmap_val val = { }; u32 idx = *(u32 *)key; unsigned long flags; int err = -EEXIST; diff --git a/net/core/dev.c b/net/core/dev.c index ae37586f6ee8..10684833f864 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -5420,6 +5420,18 @@ static int generic_xdp_install(struct net_device *dev, struct netdev_bpf *xdp) struct bpf_prog *new = xdp->prog; int ret = 0; + if (new) { + u32 i; + + /* generic XDP does not work with DEVMAPs that can + * have a bpf_prog installed on an entry + */ + for (i = 0; i < new->aux->used_map_cnt; i++) { + if (dev_map_can_have_prog(new->aux->used_maps[i])) + return -EINVAL; + } + } + switch (xdp->command) { case XDP_SETUP_PROG: rcu_assign_pointer(dev->xdp_prog, new); @@ -8835,6 +8847,12 @@ int dev_change_xdp_fd(struct net_device *dev, struct netlink_ext_ack *extack, return -EINVAL; } + if (prog->expected_attach_type == BPF_XDP_DEVMAP) { + NL_SET_ERR_MSG(extack, "BPF_XDP_DEVMAP programs can not be attached to a device"); + bpf_prog_put(prog); + return -EINVAL; + } + /* prog->aux->id may be 0 for orphaned device-bound progs */ if (prog->aux->id && prog->aux->id == prog_id) { bpf_prog_put(prog); diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h index 974ca6e948e3..65d7717bce2f 100644 --- a/tools/include/uapi/linux/bpf.h +++ b/tools/include/uapi/linux/bpf.h @@ -225,6 +225,7 @@ enum bpf_attach_type { BPF_CGROUP_INET6_GETPEERNAME, BPF_CGROUP_INET4_GETSOCKNAME, BPF_CGROUP_INET6_GETSOCKNAME, + BPF_XDP_DEVMAP, __MAX_BPF_ATTACH_TYPE }; -- cgit v1.2.3
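For context, a sketch of how user space might populate such a devmap entry once this lands. Since struct bpf_devmap_val is not exported to uapi by this patch (BPF programs are expected to pick it up from vmlinux.h), the local mirror below, and the map fd, key, and helper name, are illustrative:

#include <linux/types.h>
#include <bpf/bpf.h>

/* Local mirror of the kernel's struct bpf_devmap_val (not in uapi here). */
struct devmap_val {
	__u32 ifindex;
	union {
		int fd;   /* prog fd on map write */
		__u32 id; /* prog id on map read */
	} bpf_prog;
};

/* Hypothetical helper: bind prog_fd (an XDP program loaded with
 * expected_attach_type BPF_XDP_DEVMAP) to ifindex under slot `key`
 * in a DEVMAP created with value_size == 8. */
static int devmap_add_entry(int map_fd, __u32 key, __u32 ifindex, int prog_fd)
{
	struct devmap_val val = {
		.ifindex = ifindex,
		.bpf_prog.fd = prog_fd,
	};

	return bpf_map_update_elem(map_fd, &key, &val, BPF_ANY);
}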
From e91de6afa81c10e9f855c5695eb9a53168d96b73 Mon Sep 17 00:00:00 2001 From: John Fastabend Date: Fri, 29 May 2020 16:06:59 -0700 Subject: bpf: Fix running sk_skb program types with ktls KTLS uses a stream parser to collect TLS messages and send them to the upper layer tls receive handler. This ensures the tls receiver has a full TLS header to parse when it is run. However, when a socket has a BPF_SK_SKB_STREAM_VERDICT program attached before KTLS is enabled, we end up with two stream parsers running on the same socket, and both try to consume the same data. First the KTLS stream parser runs and calls read_sock(), which calls tcp_read_sock(), which in turn calls tcp_rcv_skb(). This dequeues the skb from the sk_receive_queue. When this is done, the KTLS code invokes the data_ready() callback which, because we stacked KTLS on top of the bpf stream verdict program, has been replaced with sk_psock_start_strp(). This in turn kicks the stream parser again, which eventually does the same thing KTLS did above, calling into tcp_rcv_skb() and dequeuing an skb from the sk_receive_queue. At this point the data stream is broken. Part of the stream was handled by the KTLS side, and some other bytes may have been handled by the BPF side. Generally this results in either missing data or, more likely, a "Bad Message" complaint from the kTLS receive handler, as the BPF program steals some bytes meant to be in a TLS header and/or the TLS header length is no longer correct. We've already broken the idealized model where we can stack ULPs in any order with generic callbacks on the TX side to handle this. So in this patch we do the same thing, but for the RX side. We add a sk_psock_strp_enabled() helper so TLS can learn that a BPF verdict program is running, and a tls_sw_has_ctx_rx() helper so the BPF side can learn there is a TLS ULP on the socket. Then on the BPF side we omit calling our stream parser, to avoid breaking the data stream for the KTLS receiver. Then on the KTLS side we call BPF_SK_SKB_STREAM_VERDICT once the KTLS receiver is done with the packet but before it posts the msg to userspace. This gives us symmetry between the TX and RX halves and IMO makes it usable again. On the TX side we process packets in the order BPF -> TLS -> TCP, and on the receive side in the reverse order TCP -> TLS -> BPF. Discovered while testing the OpenSSL 3.0 Alpha2.0 release.
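To make the scenario concrete, a rough sketch of the user-space sequence that stacks the two features on one socket (error paths and the SOL_TLS crypto setup are elided; the map and fd names are made up):

#include <sys/socket.h>
#include <netinet/tcp.h>
#include <bpf/bpf.h>

/* Illustrative: attach a stream verdict program via a sockmap, then
 * layer the kTLS ULP on the same established TCP socket. Before this
 * fix the two stream parsers raced on sk_receive_queue; after it the
 * verdict program sees decrypted records once kTLS is done with them. */
static int stack_bpf_and_ktls(int sock_map_fd, int verdict_prog_fd, int fd)
{
	int key = 0, err;

	err = bpf_prog_attach(verdict_prog_fd, sock_map_fd,
			      BPF_SK_SKB_STREAM_VERDICT, 0);
	if (err)
		return err;

	/* adding the socket creates the psock and its stream parser */
	err = bpf_map_update_elem(sock_map_fd, &key, &fd, BPF_ANY);
	if (err)
		return err;

	/* install the TLS ULP; TLS_RX key setup via setsockopt(SOL_TLS, ...)
	 * would follow here */
	return setsockopt(fd, SOL_TCP, TCP_ULP, "tls", sizeof("tls"));
}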
Fixes: d829e9c4112b5 ("tls: convert to generic sk_msg interface") Signed-off-by: John Fastabend Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/159079361946.5745.605854335665044485.stgit@john-Precision-5820-Tower Signed-off-by: Alexei Starovoitov --- include/linux/skmsg.h | 8 ++++++++ include/net/tls.h | 9 +++++++++ net/core/skmsg.c | 43 ++++++++++++++++++++++++++++++++++++++++--- net/tls/tls_sw.c | 20 ++++++++++++++++++-- 4 files changed, 75 insertions(+), 5 deletions(-) (limited to 'include/linux') diff --git a/include/linux/skmsg.h b/include/linux/skmsg.h index ad31c9fb7158..08674cd14d5a 100644 --- a/include/linux/skmsg.h +++ b/include/linux/skmsg.h @@ -437,4 +437,12 @@ static inline void psock_progs_drop(struct sk_psock_progs *progs) psock_set_prog(&progs->skb_verdict, NULL); } +int sk_psock_tls_strp_read(struct sk_psock *psock, struct sk_buff *skb); + +static inline bool sk_psock_strp_enabled(struct sk_psock *psock) +{ + if (!psock) + return false; + return psock->parser.enabled; +} #endif /* _LINUX_SKMSG_H */ diff --git a/include/net/tls.h b/include/net/tls.h index 3e7b44cae0d9..3212d3c214a9 100644 --- a/include/net/tls.h +++ b/include/net/tls.h @@ -571,6 +571,15 @@ static inline bool tls_sw_has_ctx_tx(const struct sock *sk) return !!tls_sw_ctx_tx(ctx); } +static inline bool tls_sw_has_ctx_rx(const struct sock *sk) +{ + struct tls_context *ctx = tls_get_ctx(sk); + + if (!ctx) + return false; + return !!tls_sw_ctx_rx(ctx); +} + void tls_sw_write_space(struct sock *sk, struct tls_context *ctx); void tls_device_write_space(struct sock *sk, struct tls_context *ctx); diff --git a/net/core/skmsg.c b/net/core/skmsg.c index 9d72f71e9b47..351afbf6bfba 100644 --- a/net/core/skmsg.c +++ b/net/core/skmsg.c @@ -7,6 +7,7 @@ #include #include +#include static bool sk_msg_try_coalesce_ok(struct sk_msg *msg, int elem_first_coalesce) { @@ -714,6 +715,38 @@ static void sk_psock_skb_redirect(struct sk_psock *psock, struct sk_buff *skb) } } +static void sk_psock_tls_verdict_apply(struct sk_psock *psock, + struct sk_buff *skb, int verdict) +{ + switch (verdict) { + case __SK_REDIRECT: + sk_psock_skb_redirect(psock, skb); + break; + case __SK_PASS: + case __SK_DROP: + default: + break; + } +} + +int sk_psock_tls_strp_read(struct sk_psock *psock, struct sk_buff *skb) +{ + struct bpf_prog *prog; + int ret = __SK_PASS; + + rcu_read_lock(); + prog = READ_ONCE(psock->progs.skb_verdict); + if (likely(prog)) { + tcp_skb_bpf_redirect_clear(skb); + ret = sk_psock_bpf_run(psock, prog, skb); + ret = sk_psock_map_verd(ret, tcp_skb_bpf_redirect_fetch(skb)); + } + rcu_read_unlock(); + sk_psock_tls_verdict_apply(psock, skb, ret); + return ret; +} +EXPORT_SYMBOL_GPL(sk_psock_tls_strp_read); + static void sk_psock_verdict_apply(struct sk_psock *psock, struct sk_buff *skb, int verdict) { @@ -792,9 +825,13 @@ static void sk_psock_strp_data_ready(struct sock *sk) rcu_read_lock(); psock = sk_psock(sk); if (likely(psock)) { - write_lock_bh(&sk->sk_callback_lock); - strp_data_ready(&psock->parser.strp); - write_unlock_bh(&sk->sk_callback_lock); + if (tls_sw_has_ctx_rx(sk)) { + psock->parser.saved_data_ready(sk); + } else { + write_lock_bh(&sk->sk_callback_lock); + strp_data_ready(&psock->parser.strp); + write_unlock_bh(&sk->sk_callback_lock); + } } rcu_read_unlock(); } diff --git a/net/tls/tls_sw.c b/net/tls/tls_sw.c index 8c2763eb6aae..24f64bc0de18 100644 --- a/net/tls/tls_sw.c +++ b/net/tls/tls_sw.c @@ -1742,6 +1742,7 @@ int tls_sw_recvmsg(struct sock *sk, long timeo; bool is_kvec = iov_iter_is_kvec(&msg->msg_iter); bool is_peek = flags & MSG_PEEK; + bool bpf_strp_enabled; int num_async = 0; int pending; @@ -1752,6 +1753,7 @@ int tls_sw_recvmsg(struct sock *sk, psock = sk_psock_get(sk); lock_sock(sk); + bpf_strp_enabled = sk_psock_strp_enabled(psock); /* Process pending decrypted records. It must be non-zero-copy */ err = process_rx_list(ctx, msg, &control, &cmsg, 0, len, false, @@ -1805,11 +1807,12 @@ int tls_sw_recvmsg(struct sock *sk, if (to_decrypt <= len && !is_kvec && !is_peek && ctx->control == TLS_RECORD_TYPE_DATA && - prot->version != TLS_1_3_VERSION) + prot->version != TLS_1_3_VERSION && + !bpf_strp_enabled) zc = true; /* Do not use async mode if record is non-data */ - if (ctx->control == TLS_RECORD_TYPE_DATA) + if (ctx->control == TLS_RECORD_TYPE_DATA && !bpf_strp_enabled) async_capable = ctx->async_capable; else async_capable = false; @@ -1859,6 +1862,19 @@ int tls_sw_recvmsg(struct sock *sk, goto pick_next_record; if (!zc) { + if (bpf_strp_enabled) { + err = sk_psock_tls_strp_read(psock, skb); + if (err != __SK_PASS) { + rxm->offset = rxm->offset + rxm->full_len; + rxm->full_len = 0; + if (err == __SK_DROP) + consume_skb(skb); + ctx->recv_pkt = NULL; + __strp_unpause(&ctx->strp); + continue; + } + } + if (rxm->full_len > len) { retain_skb = true; chunk = len; -- cgit v1.2.3 From 958a3f2d2aff896ae2a622878e456114f4a4cd15 Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Sun, 31 May 2020 17:42:55 +0200 Subject: bpf: Use tracing helpers for lsm programs Currently lsm uses the bpf_tracing_func_proto helpers, which do not include stack trace or perf event output. It's useful to have those for bpftrace lsm support [1]. Use the tracing_prog_func_proto helpers for lsm programs instead.
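As a rough illustration of what this enables (the hook choice, map shape, and names below are made up for the sketch), an LSM program can now take stack traces the way tracing programs do:

#include <linux/bpf.h>
#include <linux/types.h>
#include <bpf/bpf_helpers.h>
#include <bpf/bpf_tracing.h>

struct file;

struct {
	__uint(type, BPF_MAP_TYPE_STACK_TRACE);
	__uint(max_entries, 1024);
	__uint(key_size, sizeof(__u32));
	__uint(value_size, 127 * sizeof(__u64)); /* PERF_MAX_STACK_DEPTH slots */
} stacks SEC(".maps");

SEC("lsm/file_open")
int BPF_PROG(check_open, struct file *file)
{
	/* bpf_get_stackid() comes from the tracing helper set that lsm
	 * programs gain access to with this change */
	bpf_get_stackid(ctx, &stacks, 0);
	return 0; /* 0 == allow */
}

char LICENSE[] SEC("license") = "GPL";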
[1] https://github.com/iovisor/bpftrace/pull/1347 Signed-off-by: Jiri Olsa Signed-off-by: Alexei Starovoitov Cc: KP Singh Link: https://lore.kernel.org/bpf/20200531154255.896551-1-jolsa@kernel.org --- include/linux/bpf.h | 3 +++ kernel/bpf/bpf_lsm.c | 2 +- kernel/trace/bpf_trace.c | 2 +- 3 files changed, 5 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index e042311f991f..07052d44bca1 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -1633,6 +1633,9 @@ extern const struct bpf_func_proto bpf_ringbuf_query_proto; const struct bpf_func_proto *bpf_tracing_func_proto( enum bpf_func_id func_id, const struct bpf_prog *prog); +const struct bpf_func_proto *tracing_prog_func_proto( + enum bpf_func_id func_id, const struct bpf_prog *prog); + /* Shared helpers among cBPF and eBPF. */ void bpf_user_rnd_init_once(void); u64 bpf_user_rnd_u32(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5); diff --git a/kernel/bpf/bpf_lsm.c b/kernel/bpf/bpf_lsm.c index 19636703b24e..fb278144e9fd 100644 --- a/kernel/bpf/bpf_lsm.c +++ b/kernel/bpf/bpf_lsm.c @@ -49,6 +49,6 @@ const struct bpf_prog_ops lsm_prog_ops = { }; const struct bpf_verifier_ops lsm_verifier_ops = { - .get_func_proto = bpf_tracing_func_proto, + .get_func_proto = tracing_prog_func_proto, .is_valid_access = btf_ctx_access, }; diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c index b6c24be5ff53..c41186417d93 100644 --- a/kernel/trace/bpf_trace.c +++ b/kernel/trace/bpf_trace.c @@ -1467,7 +1467,7 @@ raw_tp_prog_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) } } -static const struct bpf_func_proto * +const struct bpf_func_proto * tracing_prog_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) { switch (func_id) { -- cgit v1.2.3 From a3fd7ceee05431d2c51ed86c6cae015d236a51f0 Mon Sep 17 00:00:00 2001 From: Jakub Sitnicki Date: Sun, 31 May 2020 10:28:36 +0200 Subject: net: Introduce netns_bpf for BPF programs attached to netns In order to: (1) attach more than one BPF program type to netns, or (2) support attaching BPF programs to netns with bpf_link, or (3) support multi-prog attach points for netns, we will need to keep more state per netns than the single pointer we have now for the BPF flow dissector program. Prepare for the above by extracting struct netns_bpf, embedded in struct net, to store all state related to BPF programs attached to netns. Turn the flow dissector callbacks for querying/attaching/detaching a program into generic ones that operate on netns_bpf. The next patch will move the generic callbacks into their own module. This is similar to how it is organized for cgroup with cgroup_bpf.
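To make the new layout concrete, here is a minimal sketch in kernel-style C, simplified from the query path in this patch; the helper name netns_bpf_attached_id is invented for illustration and is not part of the patch. It shows how a caller translates the UAPI attach type into a slot index and samples the attached program under RCU:

#include <linux/errno.h>
#include <linux/bpf.h>
#include <linux/bpf-netns.h>
#include <net/net_namespace.h>

/* Sketch only: report the id of the program attached to one netns slot.
 * Mirrors the shape of netns_bpf_prog_query(); not a drop-in function.
 */
static int netns_bpf_attached_id(struct net *net,
				 enum bpf_attach_type attach_type, u32 *id)
{
	enum netns_bpf_attach_type type;
	struct bpf_prog *prog;

	type = to_netns_bpf_attach_type(attach_type);
	if (type < 0)
		return -EINVAL;	/* not a netns attach type */

	rcu_read_lock();
	prog = rcu_dereference(net->bpf.progs[type]);
	if (prog)
		*id = prog->aux->id;
	rcu_read_unlock();

	return prog ? 0 : -ENOENT;
}

Adding another netns attach point then amounts to a new enum entry plus a new case in to_netns_bpf_attach_type(); struct net itself does not change.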
Signed-off-by: Jakub Sitnicki Signed-off-by: Alexei Starovoitov Cc: Stanislav Fomichev Link: https://lore.kernel.org/bpf/20200531082846.2117903-3-jakub@cloudflare.com --- include/linux/bpf-netns.h | 56 +++++++++++++++++++++++ include/linux/skbuff.h | 26 ----------- include/net/net_namespace.h | 4 +- include/net/netns/bpf.h | 17 +++++++ kernel/bpf/syscall.c | 7 +-- net/core/flow_dissector.c | 105 +++++++++++++++++++++++++++++--------------- 6 files changed, 149 insertions(+), 66 deletions(-) create mode 100644 include/linux/bpf-netns.h create mode 100644 include/net/netns/bpf.h (limited to 'include/linux') diff --git a/include/linux/bpf-netns.h b/include/linux/bpf-netns.h new file mode 100644 index 000000000000..f3aec3d79824 --- /dev/null +++ b/include/linux/bpf-netns.h @@ -0,0 +1,56 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _BPF_NETNS_H +#define _BPF_NETNS_H + +#include +#include + +enum netns_bpf_attach_type { + NETNS_BPF_INVALID = -1, + NETNS_BPF_FLOW_DISSECTOR = 0, + MAX_NETNS_BPF_ATTACH_TYPE +}; + +static inline enum netns_bpf_attach_type +to_netns_bpf_attach_type(enum bpf_attach_type attach_type) +{ + switch (attach_type) { + case BPF_FLOW_DISSECTOR: + return NETNS_BPF_FLOW_DISSECTOR; + default: + return NETNS_BPF_INVALID; + } +} + +/* Protects updates to netns_bpf */ +extern struct mutex netns_bpf_mutex; + +union bpf_attr; +struct bpf_prog; + +#ifdef CONFIG_NET +int netns_bpf_prog_query(const union bpf_attr *attr, + union bpf_attr __user *uattr); +int netns_bpf_prog_attach(const union bpf_attr *attr, + struct bpf_prog *prog); +int netns_bpf_prog_detach(const union bpf_attr *attr); +#else +static inline int netns_bpf_prog_query(const union bpf_attr *attr, + union bpf_attr __user *uattr) +{ + return -EOPNOTSUPP; +} + +static inline int netns_bpf_prog_attach(const union bpf_attr *attr, + struct bpf_prog *prog) +{ + return -EOPNOTSUPP; +} + +static inline int netns_bpf_prog_detach(const union bpf_attr *attr) +{ + return -EOPNOTSUPP; +} +#endif + +#endif /* _BPF_NETNS_H */ diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index 531843952809..a0d5c2760103 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -1283,32 +1283,6 @@ void skb_flow_dissector_init(struct flow_dissector *flow_dissector, const struct flow_dissector_key *key, unsigned int key_count); -#ifdef CONFIG_NET -int skb_flow_dissector_prog_query(const union bpf_attr *attr, - union bpf_attr __user *uattr); -int skb_flow_dissector_bpf_prog_attach(const union bpf_attr *attr, - struct bpf_prog *prog); - -int skb_flow_dissector_bpf_prog_detach(const union bpf_attr *attr); -#else -static inline int skb_flow_dissector_prog_query(const union bpf_attr *attr, - union bpf_attr __user *uattr) -{ - return -EOPNOTSUPP; -} - -static inline int skb_flow_dissector_bpf_prog_attach(const union bpf_attr *attr, - struct bpf_prog *prog) -{ - return -EOPNOTSUPP; -} - -static inline int skb_flow_dissector_bpf_prog_detach(const union bpf_attr *attr) -{ - return -EOPNOTSUPP; -} -#endif - struct bpf_flow_dissector; bool bpf_flow_dissect(struct bpf_prog *prog, struct bpf_flow_dissector *ctx, __be16 proto, int nhoff, int hlen, unsigned int flags); diff --git a/include/net/net_namespace.h b/include/net/net_namespace.h index 8e001e049497..2ee5901bec7a 100644 --- a/include/net/net_namespace.h +++ b/include/net/net_namespace.h @@ -33,6 +33,7 @@ #include #include #include +#include #include #include #include @@ -162,7 +163,8 @@ struct net { #endif struct net_generic __rcu *gen; - struct bpf_prog __rcu *flow_dissector_prog; + 
/* Used to store attached BPF programs */ + struct netns_bpf bpf; /* Note : following structs are cache line aligned */ #ifdef CONFIG_XFRM diff --git a/include/net/netns/bpf.h b/include/net/netns/bpf.h new file mode 100644 index 000000000000..a858d1c5b166 --- /dev/null +++ b/include/net/netns/bpf.h @@ -0,0 +1,17 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * BPF programs attached to network namespace + */ + +#ifndef __NETNS_BPF_H__ +#define __NETNS_BPF_H__ + +#include + +struct bpf_prog; + +struct netns_bpf { + struct bpf_prog __rcu *progs[MAX_NETNS_BPF_ATTACH_TYPE]; +}; + +#endif /* __NETNS_BPF_H__ */ diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index e83b0818b529..c77ab9c76f7b 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -27,6 +27,7 @@ #include #include #include +#include #define IS_FD_ARRAY(map) ((map)->map_type == BPF_MAP_TYPE_PERF_EVENT_ARRAY || \ (map)->map_type == BPF_MAP_TYPE_CGROUP_ARRAY || \ @@ -2868,7 +2869,7 @@ static int bpf_prog_attach(const union bpf_attr *attr) ret = lirc_prog_attach(attr, prog); break; case BPF_PROG_TYPE_FLOW_DISSECTOR: - ret = skb_flow_dissector_bpf_prog_attach(attr, prog); + ret = netns_bpf_prog_attach(attr, prog); break; case BPF_PROG_TYPE_CGROUP_DEVICE: case BPF_PROG_TYPE_CGROUP_SKB: @@ -2908,7 +2909,7 @@ static int bpf_prog_detach(const union bpf_attr *attr) case BPF_PROG_TYPE_FLOW_DISSECTOR: if (!capable(CAP_NET_ADMIN)) return -EPERM; - return skb_flow_dissector_bpf_prog_detach(attr); + return netns_bpf_prog_detach(attr); case BPF_PROG_TYPE_CGROUP_DEVICE: case BPF_PROG_TYPE_CGROUP_SKB: case BPF_PROG_TYPE_CGROUP_SOCK: @@ -2961,7 +2962,7 @@ static int bpf_prog_query(const union bpf_attr *attr, case BPF_LIRC_MODE2: return lirc_prog_query(attr, uattr); case BPF_FLOW_DISSECTOR: - return skb_flow_dissector_prog_query(attr, uattr); + return netns_bpf_prog_query(attr, uattr); default: return -EINVAL; } diff --git a/net/core/flow_dissector.c b/net/core/flow_dissector.c index b64a44a083fd..6c1b8e43d611 100644 --- a/net/core/flow_dissector.c +++ b/net/core/flow_dissector.c @@ -31,8 +31,10 @@ #include #include #endif +#include -static DEFINE_MUTEX(flow_dissector_mutex); +/* Protects updates to netns_bpf */ +DEFINE_MUTEX(netns_bpf_mutex); static void dissector_set_key(struct flow_dissector *flow_dissector, enum flow_dissector_key_id key_id) @@ -70,23 +72,28 @@ void skb_flow_dissector_init(struct flow_dissector *flow_dissector, } EXPORT_SYMBOL(skb_flow_dissector_init); -int skb_flow_dissector_prog_query(const union bpf_attr *attr, - union bpf_attr __user *uattr) +int netns_bpf_prog_query(const union bpf_attr *attr, + union bpf_attr __user *uattr) { __u32 __user *prog_ids = u64_to_user_ptr(attr->query.prog_ids); u32 prog_id, prog_cnt = 0, flags = 0; + enum netns_bpf_attach_type type; struct bpf_prog *attached; struct net *net; if (attr->query.query_flags) return -EINVAL; + type = to_netns_bpf_attach_type(attr->query.attach_type); + if (type < 0) + return -EINVAL; + net = get_net_ns_by_fd(attr->query.target_fd); if (IS_ERR(net)) return PTR_ERR(net); rcu_read_lock(); - attached = rcu_dereference(net->flow_dissector_prog); + attached = rcu_dereference(net->bpf.progs[type]); if (attached) { prog_cnt = 1; prog_id = attached->aux->id; @@ -112,6 +119,7 @@ int skb_flow_dissector_prog_query(const union bpf_attr *attr, static int flow_dissector_bpf_prog_attach(struct net *net, struct bpf_prog *prog) { + enum netns_bpf_attach_type type = NETNS_BPF_FLOW_DISSECTOR; struct bpf_prog *attached; if (net == &init_net) { @@ -125,74 +133,97 @@ static 
int flow_dissector_bpf_prog_attach(struct net *net, for_each_net(ns) { if (ns == &init_net) continue; - if (rcu_access_pointer(ns->flow_dissector_prog)) + if (rcu_access_pointer(ns->bpf.progs[type])) return -EEXIST; } } else { /* Make sure root flow dissector is not attached * when attaching to the non-root namespace. */ - if (rcu_access_pointer(init_net.flow_dissector_prog)) + if (rcu_access_pointer(init_net.bpf.progs[type])) return -EEXIST; } - attached = rcu_dereference_protected(net->flow_dissector_prog, - lockdep_is_held(&flow_dissector_mutex)); + attached = rcu_dereference_protected(net->bpf.progs[type], + lockdep_is_held(&netns_bpf_mutex)); if (attached == prog) /* The same program cannot be attached twice */ return -EINVAL; - rcu_assign_pointer(net->flow_dissector_prog, prog); + rcu_assign_pointer(net->bpf.progs[type], prog); if (attached) bpf_prog_put(attached); return 0; } -int skb_flow_dissector_bpf_prog_attach(const union bpf_attr *attr, - struct bpf_prog *prog) +int netns_bpf_prog_attach(const union bpf_attr *attr, struct bpf_prog *prog) { + enum netns_bpf_attach_type type; + struct net *net; int ret; - mutex_lock(&flow_dissector_mutex); - ret = flow_dissector_bpf_prog_attach(current->nsproxy->net_ns, prog); - mutex_unlock(&flow_dissector_mutex); + type = to_netns_bpf_attach_type(attr->attach_type); + if (type < 0) + return -EINVAL; + + net = current->nsproxy->net_ns; + mutex_lock(&netns_bpf_mutex); + switch (type) { + case NETNS_BPF_FLOW_DISSECTOR: + ret = flow_dissector_bpf_prog_attach(net, prog); + break; + default: + ret = -EINVAL; + break; + } + mutex_unlock(&netns_bpf_mutex); return ret; } -static int flow_dissector_bpf_prog_detach(struct net *net) +/* Must be called with netns_bpf_mutex held. */ +static int __netns_bpf_prog_detach(struct net *net, + enum netns_bpf_attach_type type) { struct bpf_prog *attached; - mutex_lock(&flow_dissector_mutex); - attached = rcu_dereference_protected(net->flow_dissector_prog, - lockdep_is_held(&flow_dissector_mutex)); - if (!attached) { - mutex_unlock(&flow_dissector_mutex); + attached = rcu_dereference_protected(net->bpf.progs[type], + lockdep_is_held(&netns_bpf_mutex)); + if (!attached) return -ENOENT; - } - RCU_INIT_POINTER(net->flow_dissector_prog, NULL); + RCU_INIT_POINTER(net->bpf.progs[type], NULL); bpf_prog_put(attached); - mutex_unlock(&flow_dissector_mutex); return 0; } -int skb_flow_dissector_bpf_prog_detach(const union bpf_attr *attr) +int netns_bpf_prog_detach(const union bpf_attr *attr) { - return flow_dissector_bpf_prog_detach(current->nsproxy->net_ns); + enum netns_bpf_attach_type type; + int ret; + + type = to_netns_bpf_attach_type(attr->attach_type); + if (type < 0) + return -EINVAL; + + mutex_lock(&netns_bpf_mutex); + ret = __netns_bpf_prog_detach(current->nsproxy->net_ns, type); + mutex_unlock(&netns_bpf_mutex); + + return ret; } -static void __net_exit flow_dissector_pernet_pre_exit(struct net *net) +static void __net_exit netns_bpf_pernet_pre_exit(struct net *net) { - /* We're not racing with attach/detach because there are no - * references to netns left when pre_exit gets called. 
- */ - if (rcu_access_pointer(net->flow_dissector_prog)) - flow_dissector_bpf_prog_detach(net); + enum netns_bpf_attach_type type; + + mutex_lock(&netns_bpf_mutex); + for (type = 0; type < MAX_NETNS_BPF_ATTACH_TYPE; type++) + __netns_bpf_prog_detach(net, type); + mutex_unlock(&netns_bpf_mutex); } -static struct pernet_operations flow_dissector_pernet_ops __net_initdata = { - .pre_exit = flow_dissector_pernet_pre_exit, +static struct pernet_operations netns_bpf_pernet_ops __net_initdata = { + .pre_exit = netns_bpf_pernet_pre_exit, }; /** @@ -1044,11 +1075,13 @@ bool __skb_flow_dissect(const struct net *net, WARN_ON_ONCE(!net); if (net) { + enum netns_bpf_attach_type type = NETNS_BPF_FLOW_DISSECTOR; + rcu_read_lock(); - attached = rcu_dereference(init_net.flow_dissector_prog); + attached = rcu_dereference(init_net.bpf.progs[type]); if (!attached) - attached = rcu_dereference(net->flow_dissector_prog); + attached = rcu_dereference(net->bpf.progs[type]); if (attached) { struct bpf_flow_keys flow_keys; @@ -1870,6 +1903,6 @@ static int __init init_default_flow_dissectors(void) flow_keys_basic_dissector_keys, ARRAY_SIZE(flow_keys_basic_dissector_keys)); - return register_pernet_subsys(&flow_dissector_pernet_ops); + return register_pernet_subsys(&netns_bpf_pernet_ops); } core_initcall(init_default_flow_dissectors); -- cgit v1.2.3 From 7f045a49fee04b5662cbdeaf0838f9322ae8c63a Mon Sep 17 00:00:00 2001 From: Jakub Sitnicki Date: Sun, 31 May 2020 10:28:38 +0200 Subject: bpf: Add link-based BPF program attachment to network namespace Extend bpf() syscall subcommands that operate on bpf_link, that is LINK_CREATE, LINK_UPDATE, OBJ_GET_INFO, to accept attach types tied to network namespaces (only flow dissector at the moment). Link-based and prog-based attachment can be used interchangeably, but only one can exist at a time. Attempts to attach a link when a prog is already attached directly, and the other way around, will be met with -EEXIST. Attempts to detach a program when a link exists result in -EINVAL. Attachment of multiple links of the same attach type to one netns is not supported, with the intention to lift the restriction when a use-case presents itself. Because of that, link create returns -E2BIG when trying to create another netns link while one already exists. Link-based attachments to netns don't keep a netns alive by holding a ref to it. Instead links get auto-detached from netns when the latter is being destroyed, using a pernet pre_exit callback. When auto-detached, the link lives in a defunct state as long as there are open FDs for it. -ENOLINK is returned if a user tries to update a defunct link. Because a bpf_link to netns doesn't hold a ref to struct net, special care is taken when releasing, updating, or filling link info. The netns might be getting torn down when any of these link operations are in progress. That is why auto-detach and update/release/fill_info are synchronized by the same mutex. Also, link ops have to always check if auto-detach has not happened yet and if the netns is still alive (refcnt > 0).
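For illustration, the intended userspace flow is roughly the following hedged sketch; it assumes libbpf's bpf_link_create() wrapper for the LINK_CREATE command, and the function name here is made up for the example (error handling trimmed):

#include <fcntl.h>
#include <unistd.h>
#include <bpf/bpf.h>

/* Attach a flow dissector program to the current netns via bpf_link. */
int attach_flow_dissector_link(int prog_fd)
{
	int netns_fd, link_fd;

	netns_fd = open("/proc/self/ns/net", O_RDONLY);
	if (netns_fd < 0)
		return -1;

	/* LINK_CREATE fails with -EEXIST if a prog is already attached
	 * directly, and with -E2BIG if a netns link of this type exists.
	 */
	link_fd = bpf_link_create(prog_fd, netns_fd, BPF_FLOW_DISSECTOR, NULL);
	close(netns_fd);

	/* Keep link_fd open; the program detaches when the last FD is
	 * closed, or automatically when the netns itself is destroyed.
	 */
	return link_fd;
}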
Signed-off-by: Jakub Sitnicki Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/20200531082846.2117903-5-jakub@cloudflare.com --- include/linux/bpf-netns.h | 8 ++ include/linux/bpf_types.h | 3 + include/net/netns/bpf.h | 1 + include/uapi/linux/bpf.h | 5 + kernel/bpf/net_namespace.c | 244 ++++++++++++++++++++++++++++++++++++++++- kernel/bpf/syscall.c | 3 + tools/include/uapi/linux/bpf.h | 5 + 7 files changed, 267 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/bpf-netns.h b/include/linux/bpf-netns.h index f3aec3d79824..4052d649f36d 100644 --- a/include/linux/bpf-netns.h +++ b/include/linux/bpf-netns.h @@ -34,6 +34,8 @@ int netns_bpf_prog_query(const union bpf_attr *attr, int netns_bpf_prog_attach(const union bpf_attr *attr, struct bpf_prog *prog); int netns_bpf_prog_detach(const union bpf_attr *attr); +int netns_bpf_link_create(const union bpf_attr *attr, + struct bpf_prog *prog); #else static inline int netns_bpf_prog_query(const union bpf_attr *attr, union bpf_attr __user *uattr) @@ -51,6 +53,12 @@ static inline int netns_bpf_prog_detach(const union bpf_attr *attr) { return -EOPNOTSUPP; } + +static inline int netns_bpf_link_create(const union bpf_attr *attr, + struct bpf_prog *prog) +{ + return -EOPNOTSUPP; +} #endif #endif /* _BPF_NETNS_H */ diff --git a/include/linux/bpf_types.h b/include/linux/bpf_types.h index fa8e1b552acd..a18ae82a298a 100644 --- a/include/linux/bpf_types.h +++ b/include/linux/bpf_types.h @@ -126,3 +126,6 @@ BPF_LINK_TYPE(BPF_LINK_TYPE_TRACING, tracing) BPF_LINK_TYPE(BPF_LINK_TYPE_CGROUP, cgroup) #endif BPF_LINK_TYPE(BPF_LINK_TYPE_ITER, iter) +#ifdef CONFIG_NET +BPF_LINK_TYPE(BPF_LINK_TYPE_NETNS, netns) +#endif diff --git a/include/net/netns/bpf.h b/include/net/netns/bpf.h index a858d1c5b166..a8dce2a380c8 100644 --- a/include/net/netns/bpf.h +++ b/include/net/netns/bpf.h @@ -12,6 +12,7 @@ struct bpf_prog; struct netns_bpf { struct bpf_prog __rcu *progs[MAX_NETNS_BPF_ATTACH_TYPE]; + struct bpf_link *links[MAX_NETNS_BPF_ATTACH_TYPE]; }; #endif /* __NETNS_BPF_H__ */ diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index f862a58fb567..b9ed9f14f2a2 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -237,6 +237,7 @@ enum bpf_link_type { BPF_LINK_TYPE_TRACING = 2, BPF_LINK_TYPE_CGROUP = 3, BPF_LINK_TYPE_ITER = 4, + BPF_LINK_TYPE_NETNS = 5, MAX_BPF_LINK_TYPE, }; @@ -3839,6 +3840,10 @@ struct bpf_link_info { __u64 cgroup_id; __u32 attach_type; } cgroup; + struct { + __u32 netns_ino; + __u32 attach_type; + } netns; }; } __attribute__((aligned(8))); diff --git a/kernel/bpf/net_namespace.c b/kernel/bpf/net_namespace.c index b37d81450c3a..78cf061f8179 100644 --- a/kernel/bpf/net_namespace.c +++ b/kernel/bpf/net_namespace.c @@ -8,9 +8,140 @@ * Functions to manage BPF programs attached to netns */ +struct bpf_netns_link { + struct bpf_link link; + enum bpf_attach_type type; + enum netns_bpf_attach_type netns_type; + + /* We don't hold a ref to net in order to auto-detach the link + * when netns is going away. Instead we rely on pernet + * pre_exit callback to clear this pointer. Must be accessed + * with netns_bpf_mutex held. + */ + struct net *net; +}; + /* Protects updates to netns_bpf */ DEFINE_MUTEX(netns_bpf_mutex); +/* Must be called with netns_bpf_mutex held. 
*/ +static void __net_exit bpf_netns_link_auto_detach(struct bpf_link *link) +{ + struct bpf_netns_link *net_link = + container_of(link, struct bpf_netns_link, link); + + net_link->net = NULL; +} + +static void bpf_netns_link_release(struct bpf_link *link) +{ + struct bpf_netns_link *net_link = + container_of(link, struct bpf_netns_link, link); + enum netns_bpf_attach_type type = net_link->netns_type; + struct net *net; + + /* Link auto-detached by dying netns. */ + if (!net_link->net) + return; + + mutex_lock(&netns_bpf_mutex); + + /* Recheck after potential sleep. We can race with cleanup_net + * here, but if we see a non-NULL struct net pointer pre_exit + * has not happened yet and will block on netns_bpf_mutex. + */ + net = net_link->net; + if (!net) + goto out_unlock; + + net->bpf.links[type] = NULL; + RCU_INIT_POINTER(net->bpf.progs[type], NULL); + +out_unlock: + mutex_unlock(&netns_bpf_mutex); +} + +static void bpf_netns_link_dealloc(struct bpf_link *link) +{ + struct bpf_netns_link *net_link = + container_of(link, struct bpf_netns_link, link); + + kfree(net_link); +} + +static int bpf_netns_link_update_prog(struct bpf_link *link, + struct bpf_prog *new_prog, + struct bpf_prog *old_prog) +{ + struct bpf_netns_link *net_link = + container_of(link, struct bpf_netns_link, link); + enum netns_bpf_attach_type type = net_link->netns_type; + struct net *net; + int ret = 0; + + if (old_prog && old_prog != link->prog) + return -EPERM; + if (new_prog->type != link->prog->type) + return -EINVAL; + + mutex_lock(&netns_bpf_mutex); + + net = net_link->net; + if (!net || !check_net(net)) { + /* Link auto-detached or netns dying */ + ret = -ENOLINK; + goto out_unlock; + } + + old_prog = xchg(&link->prog, new_prog); + rcu_assign_pointer(net->bpf.progs[type], new_prog); + bpf_prog_put(old_prog); + +out_unlock: + mutex_unlock(&netns_bpf_mutex); + return ret; +} + +static int bpf_netns_link_fill_info(const struct bpf_link *link, + struct bpf_link_info *info) +{ + const struct bpf_netns_link *net_link = + container_of(link, struct bpf_netns_link, link); + unsigned int inum = 0; + struct net *net; + + mutex_lock(&netns_bpf_mutex); + net = net_link->net; + if (net && check_net(net)) + inum = net->ns.inum; + mutex_unlock(&netns_bpf_mutex); + + info->netns.netns_ino = inum; + info->netns.attach_type = net_link->type; + return 0; +} + +static void bpf_netns_link_show_fdinfo(const struct bpf_link *link, + struct seq_file *seq) +{ + struct bpf_link_info info = {}; + + bpf_netns_link_fill_info(link, &info); + seq_printf(seq, + "netns_ino:\t%u\n" + "attach_type:\t%u\n", + info.netns.netns_ino, + info.netns.attach_type); +} + +static const struct bpf_link_ops bpf_netns_link_ops = { + .release = bpf_netns_link_release, + .dealloc = bpf_netns_link_dealloc, + .update_prog = bpf_netns_link_update_prog, + .fill_link_info = bpf_netns_link_fill_info, + .show_fdinfo = bpf_netns_link_show_fdinfo, +}; + int netns_bpf_prog_query(const union bpf_attr *attr, union bpf_attr __user *uattr) { @@ -67,6 +198,13 @@ int netns_bpf_prog_attach(const union bpf_attr *attr, struct bpf_prog *prog) net = current->nsproxy->net_ns; mutex_lock(&netns_bpf_mutex); + + /* Attaching prog directly is not compatible with links */ + if (net->bpf.links[type]) { + ret = -EEXIST; + goto out_unlock; + } + switch (type) { case NETNS_BPF_FLOW_DISSECTOR: ret = flow_dissector_bpf_prog_attach(net, prog); @@ -75,6 +213,7 @@ int netns_bpf_prog_attach(const union bpf_attr *attr, struct bpf_prog *prog) ret = -EINVAL; break; } +out_unlock: 
mutex_unlock(&netns_bpf_mutex); return ret; @@ -86,6 +225,10 @@ static int __netns_bpf_prog_detach(struct net *net, { struct bpf_prog *attached; + /* Progs attached via links cannot be detached */ + if (net->bpf.links[type]) + return -EINVAL; + attached = rcu_dereference_protected(net->bpf.progs[type], lockdep_is_held(&netns_bpf_mutex)); if (!attached) @@ -111,13 +254,110 @@ int netns_bpf_prog_detach(const union bpf_attr *attr) return ret; } +static int netns_bpf_link_attach(struct net *net, struct bpf_link *link, + enum netns_bpf_attach_type type) +{ + struct bpf_prog *prog; + int err; + + mutex_lock(&netns_bpf_mutex); + + /* Allow attaching only one prog or link for now */ + if (net->bpf.links[type]) { + err = -E2BIG; + goto out_unlock; + } + /* Links are not compatible with attaching prog directly */ + prog = rcu_dereference_protected(net->bpf.progs[type], + lockdep_is_held(&netns_bpf_mutex)); + if (prog) { + err = -EEXIST; + goto out_unlock; + } + + switch (type) { + case NETNS_BPF_FLOW_DISSECTOR: + err = flow_dissector_bpf_prog_attach(net, link->prog); + break; + default: + err = -EINVAL; + break; + } + if (err) + goto out_unlock; + + net->bpf.links[type] = link; + +out_unlock: + mutex_unlock(&netns_bpf_mutex); + return err; +} + +int netns_bpf_link_create(const union bpf_attr *attr, struct bpf_prog *prog) +{ + enum netns_bpf_attach_type netns_type; + struct bpf_link_primer link_primer; + struct bpf_netns_link *net_link; + enum bpf_attach_type type; + struct net *net; + int err; + + if (attr->link_create.flags) + return -EINVAL; + + type = attr->link_create.attach_type; + netns_type = to_netns_bpf_attach_type(type); + if (netns_type < 0) + return -EINVAL; + + net = get_net_ns_by_fd(attr->link_create.target_fd); + if (IS_ERR(net)) + return PTR_ERR(net); + + net_link = kzalloc(sizeof(*net_link), GFP_USER); + if (!net_link) { + err = -ENOMEM; + goto out_put_net; + } + bpf_link_init(&net_link->link, BPF_LINK_TYPE_NETNS, + &bpf_netns_link_ops, prog); + net_link->net = net; + net_link->type = type; + net_link->netns_type = netns_type; + + err = bpf_link_prime(&net_link->link, &link_primer); + if (err) { + kfree(net_link); + goto out_put_net; + } + + err = netns_bpf_link_attach(net, &net_link->link, netns_type); + if (err) { + bpf_link_cleanup(&link_primer); + goto out_put_net; + } + + put_net(net); + return bpf_link_settle(&link_primer); + +out_put_net: + put_net(net); + return err; +} + static void __net_exit netns_bpf_pernet_pre_exit(struct net *net) { enum netns_bpf_attach_type type; + struct bpf_link *link; mutex_lock(&netns_bpf_mutex); - for (type = 0; type < MAX_NETNS_BPF_ATTACH_TYPE; type++) - __netns_bpf_prog_detach(net, type); + for (type = 0; type < MAX_NETNS_BPF_ATTACH_TYPE; type++) { + link = net->bpf.links[type]; + if (link) + bpf_netns_link_auto_detach(link); + else + __netns_bpf_prog_detach(net, type); + } mutex_unlock(&netns_bpf_mutex); } diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index c77ab9c76f7b..e14a842d7e0d 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -3887,6 +3887,9 @@ static int link_create(union bpf_attr *attr) case BPF_PROG_TYPE_TRACING: ret = tracing_bpf_link_attach(attr, prog); break; + case BPF_PROG_TYPE_FLOW_DISSECTOR: + ret = netns_bpf_link_create(attr, prog); + break; default: ret = -EINVAL; } diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h index f862a58fb567..b9ed9f14f2a2 100644 --- a/tools/include/uapi/linux/bpf.h +++ b/tools/include/uapi/linux/bpf.h @@ -237,6 +237,7 @@ enum bpf_link_type { 
BPF_LINK_TYPE_TRACING = 2, BPF_LINK_TYPE_CGROUP = 3, BPF_LINK_TYPE_ITER = 4, + BPF_LINK_TYPE_NETNS = 5, MAX_BPF_LINK_TYPE, }; @@ -3839,6 +3840,10 @@ struct bpf_link_info { __u64 cgroup_id; __u32 attach_type; } cgroup; + struct { + __u32 netns_ino; + __u32 attach_type; + } netns; }; } __attribute__((aligned(8))); -- cgit v1.2.3 From a865e420b9561235851c3f5d483c82ef389d29bd Mon Sep 17 00:00:00 2001 From: "Michael S. Tsirkin" Date: Mon, 6 Apr 2020 08:42:55 -0400 Subject: virtio: force spec specified alignment on types The ring element addresses are passed between components with different alignment assumptions. Thus, if guest/userspace selects a pointer and the host then gets and dereferences it, we might need to decrease the compiler-selected alignment to prevent the compiler on the host from assuming the pointer is aligned. This actually triggers on ARM with -mabi=apcs-gnu - which is a deprecated configuration, but it seems safer to handle this generally. Note that userspace that allocates the memory is actually OK and does not need to be fixed, but userspace that gets it from the guest or another process does need to be fixed. The latter doesn't generally talk to the kernel, so while it might be buggy it's not talking to the kernel in the buggy way - it's just using the header in the buggy way - so fixing the header and asking userspace to recompile is the best we can do. I verified that the produced kernel binary on x86 is exactly identical before and after the change. Signed-off-by: Michael S. Tsirkin Acked-by: Jason Wang --- drivers/vhost/vhost.c | 8 +++---- drivers/vhost/vhost.h | 6 +++--- drivers/vhost/vringh.c | 6 +++--- include/linux/vringh.h | 6 +++--- include/uapi/linux/virtio_ring.h | 46 ++++++++++++++++++++++++++++++---------- 5 files changed, 48 insertions(+), 24 deletions(-) (limited to 'include/linux') diff --git a/drivers/vhost/vhost.c b/drivers/vhost/vhost.c index 21a59b598ed8..96d9871fa0cb 100644 --- a/drivers/vhost/vhost.c +++ b/drivers/vhost/vhost.c @@ -1244,9 +1244,9 @@ static int vhost_iotlb_miss(struct vhost_virtqueue *vq, u64 iova, int access) } static bool vq_access_ok(struct vhost_virtqueue *vq, unsigned int num, - struct vring_desc __user *desc, - struct vring_avail __user *avail, - struct vring_used __user *used) + vring_desc_t __user *desc, + vring_avail_t __user *avail, + vring_used_t __user *used) { return access_ok(desc, vhost_get_desc_size(vq, num)) && @@ -2301,7 +2301,7 @@ static int __vhost_add_used_n(struct vhost_virtqueue *vq, struct vring_used_elem *heads, unsigned count) { - struct vring_used_elem __user *used; + vring_used_elem_t __user *used; u16 old, new; int start; diff --git a/drivers/vhost/vhost.h b/drivers/vhost/vhost.h index f8403bd46b85..60cab4c78229 100644 --- a/drivers/vhost/vhost.h +++ b/drivers/vhost/vhost.h @@ -67,9 +67,9 @@ struct vhost_virtqueue { /* The actual ring of buffers.
*/ struct mutex mutex; unsigned int num; - struct vring_desc __user *desc; - struct vring_avail __user *avail; - struct vring_used __user *used; + vring_desc_t __user *desc; + vring_avail_t __user *avail; + vring_used_t __user *used; const struct vhost_iotlb_map *meta_iotlb[VHOST_NUM_ADDRS]; struct file *kick; struct eventfd_ctx *call_ctx; diff --git a/drivers/vhost/vringh.c b/drivers/vhost/vringh.c index ba8e0d6cfd97..e059a9a47cdf 100644 --- a/drivers/vhost/vringh.c +++ b/drivers/vhost/vringh.c @@ -620,9 +620,9 @@ static inline int xfer_to_user(const struct vringh *vrh, */ int vringh_init_user(struct vringh *vrh, u64 features, unsigned int num, bool weak_barriers, - struct vring_desc __user *desc, - struct vring_avail __user *avail, - struct vring_used __user *used) + vring_desc_t __user *desc, + vring_avail_t __user *avail, + vring_used_t __user *used) { /* Sane power of 2 please! */ if (!num || num > 0xffff || (num & (num - 1))) { diff --git a/include/linux/vringh.h b/include/linux/vringh.h index 9e2763d7c159..59bd50f99291 100644 --- a/include/linux/vringh.h +++ b/include/linux/vringh.h @@ -105,9 +105,9 @@ struct vringh_kiov { /* Helpers for userspace vrings. */ int vringh_init_user(struct vringh *vrh, u64 features, unsigned int num, bool weak_barriers, - struct vring_desc __user *desc, - struct vring_avail __user *avail, - struct vring_used __user *used); + vring_desc_t __user *desc, + vring_avail_t __user *avail, + vring_used_t __user *used); static inline void vringh_iov_init(struct vringh_iov *iov, struct iovec *iovec, unsigned num) diff --git a/include/uapi/linux/virtio_ring.h b/include/uapi/linux/virtio_ring.h index 9223c3a5c46a..476d3e5c0fe7 100644 --- a/include/uapi/linux/virtio_ring.h +++ b/include/uapi/linux/virtio_ring.h @@ -86,6 +86,13 @@ * at the end of the used ring. Guest should ignore the used->flags field. */ #define VIRTIO_RING_F_EVENT_IDX 29 +/* Alignment requirements for vring elements. + * When using pre-virtio 1.0 layout, these fall out naturally. + */ +#define VRING_AVAIL_ALIGN_SIZE 2 +#define VRING_USED_ALIGN_SIZE 4 +#define VRING_DESC_ALIGN_SIZE 16 + /* Virtio ring descriptors: 16 bytes. These can chain together via "next". */ struct vring_desc { /* Address (guest-physical). */ @@ -112,29 +119,46 @@ struct vring_used_elem { __virtio32 len; }; +typedef struct vring_used_elem __attribute__((aligned(VRING_USED_ALIGN_SIZE))) + vring_used_elem_t; + struct vring_used { __virtio16 flags; __virtio16 idx; - struct vring_used_elem ring[]; + vring_used_elem_t ring[]; }; +/* + * The ring element addresses are passed between components with different + * alignments assumptions. Thus, we might need to decrease the compiler-selected + * alignment, and so must use a typedef to make sure the aligned attribute + * actually takes hold: + * + * https://gcc.gnu.org/onlinedocs//gcc/Common-Type-Attributes.html#Common-Type-Attributes + * + * When used on a struct, or struct member, the aligned attribute can only + * increase the alignment; in order to decrease it, the packed attribute must + * be specified as well. When used as part of a typedef, the aligned attribute + * can both increase and decrease alignment, and specifying the packed + * attribute generates a warning. 
+ */ +typedef struct vring_desc __attribute__((aligned(VRING_DESC_ALIGN_SIZE))) + vring_desc_t; +typedef struct vring_avail __attribute__((aligned(VRING_AVAIL_ALIGN_SIZE))) + vring_avail_t; +typedef struct vring_used __attribute__((aligned(VRING_USED_ALIGN_SIZE))) + vring_used_t; + struct vring { unsigned int num; - struct vring_desc *desc; + vring_desc_t *desc; - struct vring_avail *avail; + vring_avail_t *avail; - struct vring_used *used; + vring_used_t *used; }; -/* Alignment requirements for vring elements. - * When using pre-virtio 1.0 layout, these fall out naturally. - */ -#define VRING_AVAIL_ALIGN_SIZE 2 -#define VRING_USED_ALIGN_SIZE 4 -#define VRING_DESC_ALIGN_SIZE 16 - #ifndef VIRTIO_RING_NO_LEGACY /* The standard layout for the ring is a continuous chunk of memory which looks -- cgit v1.2.3 From 25de110d148666752dc0e0da7a0b69de31cd7098 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Tue, 2 Jun 2020 12:08:39 +0200 Subject: irq_work: Define irq_work_single() on !CONFIG_IRQ_WORK too Some SMP platforms don't have CONFIG_IRQ_WORK defined, resulting in a link error at build time. Define a stub and clean up the prototype definitions. Reported-by: kbuild test robot Signed-off-by: Ingo Molnar Acked-by: Peter Zijlstra Cc: linux-kernel@vger.kernel.org Signed-off-by: Ingo Molnar --- include/linux/irq_work.h | 2 ++ kernel/smp.c | 2 -- 2 files changed, 2 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/irq_work.h b/include/linux/irq_work.h index f23a359c8f46..2735da5f839e 100644 --- a/include/linux/irq_work.h +++ b/include/linux/irq_work.h @@ -58,9 +58,11 @@ void irq_work_sync(struct irq_work *work); void irq_work_run(void); bool irq_work_needs_cpu(void); +void irq_work_single(void *arg); #else static inline bool irq_work_needs_cpu(void) { return false; } static inline void irq_work_run(void) { } +static inline void irq_work_single(void *arg) { } #endif #endif /* _LINUX_IRQ_WORK_H */ diff --git a/kernel/smp.c b/kernel/smp.c index 4dec04f7fdc5..c80486a7e3b8 100644 --- a/kernel/smp.c +++ b/kernel/smp.c @@ -194,8 +194,6 @@ void generic_smp_call_function_single_interrupt(void) flush_smp_call_function_queue(true); } -extern void irq_work_single(void *); - /** * flush_smp_call_function_queue - Flush pending smp-call-function callbacks * -- cgit v1.2.3 From ef3534a94fdbdeab4c89d18d0164be2ad5d6dbb7 Mon Sep 17 00:00:00 2001 From: Ravi Bangoria Date: Tue, 2 Jun 2020 09:42:08 +0530 Subject: hw-breakpoints: Fix build warnings with clang kbuild test robot reported some build warnings in the hw_breakpoint code when compiled with clang[1]. Some of them were introduced by the recent powerpc change to add arch_reserve_bp_slot() and arch_release_bp_slot(). Fix them all. 
kernel/events/hw_breakpoint.c:71:12: warning: no previous prototype for function 'hw_breakpoint_weight' kernel/events/hw_breakpoint.c:216:12: warning: no previous prototype for function 'arch_reserve_bp_slot' kernel/events/hw_breakpoint.c:221:13: warning: no previous prototype for function 'arch_release_bp_slot' kernel/events/hw_breakpoint.c:228:13: warning: no previous prototype for function 'arch_unregister_hw_breakpoint' [1]: https://lore.kernel.org/linuxppc-dev/202005192233.oi9CjRtA%25lkp@intel.com/ Fixes: 29da4f91c0c1 ("powerpc/watchpoint: Don't allow concurrent perf and ptrace events") Reported-by: kbuild test robot Signed-off-by: Ravi Bangoria [mpe: Drop extern, flesh out change log, add Fixes tag] Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/20200602041208.128913-1-ravi.bangoria@linux.ibm.com --- arch/powerpc/include/asm/hw_breakpoint.h | 3 --- include/linux/hw_breakpoint.h | 4 ++++ 2 files changed, 4 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/arch/powerpc/include/asm/hw_breakpoint.h b/arch/powerpc/include/asm/hw_breakpoint.h index f42a55eb77d2..cb424799da0d 100644 --- a/arch/powerpc/include/asm/hw_breakpoint.h +++ b/arch/powerpc/include/asm/hw_breakpoint.h @@ -70,9 +70,6 @@ extern int hw_breakpoint_exceptions_notify(struct notifier_block *unused, unsigned long val, void *data); int arch_install_hw_breakpoint(struct perf_event *bp); void arch_uninstall_hw_breakpoint(struct perf_event *bp); -int arch_reserve_bp_slot(struct perf_event *bp); -void arch_release_bp_slot(struct perf_event *bp); -void arch_unregister_hw_breakpoint(struct perf_event *bp); void hw_breakpoint_pmu_read(struct perf_event *bp); extern void flush_ptrace_hw_breakpoint(struct task_struct *tsk); diff --git a/include/linux/hw_breakpoint.h b/include/linux/hw_breakpoint.h index 6058c3844a76..d7d4250cd1e4 100644 --- a/include/linux/hw_breakpoint.h +++ b/include/linux/hw_breakpoint.h @@ -80,6 +80,10 @@ extern int dbg_reserve_bp_slot(struct perf_event *bp); extern int dbg_release_bp_slot(struct perf_event *bp); extern int reserve_bp_slot(struct perf_event *bp); extern void release_bp_slot(struct perf_event *bp); +int hw_breakpoint_weight(struct perf_event *bp); +int arch_reserve_bp_slot(struct perf_event *bp); +void arch_release_bp_slot(struct perf_event *bp); +void arch_unregister_hw_breakpoint(struct perf_event *bp); extern void flush_ptrace_hw_breakpoint(struct task_struct *tsk); -- cgit v1.2.3 From c893de12e1ef17b581eb2cf8fc9018ec0cbd07df Mon Sep 17 00:00:00 2001 From: Wei Li Date: Thu, 21 May 2020 15:21:25 +0800 Subject: kdb: Remove the misfeature 'KDBFLAGS' Currently, 'KDBFLAGS' is an internal variable of kdb; it combines 'KDBDEBUG' and state flags. It will be shown only when 'KDBDEBUG' is set, and the user can define an environment variable named 'KDBFLAGS' too. This is puzzling indeed. After communication with Daniel, it seems that 'KDBFLAGS' is a misfeature. So let's replace 'KDBFLAGS' with 'KDBDEBUG' to just show the value we wrote. After this modification, we can use `md4c1 kdb_flags` instead to observe the state flags.
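As a standalone model of the flag packing (the KDB_DEBUG_FLAG_* constants below are assumptions patterned after kdb_private.h, not taken from this patch), the round trip now looks like this; keeping kdb_flags unsigned stops the read-back shift from sign-extending:

#include <stdio.h>

#define KDB_DEBUG_FLAG_SHIFT 16
#define KDB_DEBUG_FLAG_MASK  0xffffu

static unsigned int kdb_flags;	/* unsigned, as in this patch */

/* kdb_set("KDBDEBUG", ...): replace only the debug bit-field */
static void kdb_set_debug(unsigned int debugflags)
{
	kdb_flags = (kdb_flags & ~(KDB_DEBUG_FLAG_MASK << KDB_DEBUG_FLAG_SHIFT))
		  | (debugflags << KDB_DEBUG_FLAG_SHIFT);
}

int main(void)
{
	kdb_set_debug(0x4);
	/* kdb_env(): print only the value we wrote, no state flags */
	printf("KDBDEBUG=0x%x\n",
	       (kdb_flags & (KDB_DEBUG_FLAG_MASK << KDB_DEBUG_FLAG_SHIFT))
			>> KDB_DEBUG_FLAG_SHIFT);
	return 0;
}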
Suggested-by: Daniel Thompson Signed-off-by: Wei Li Link: https://lore.kernel.org/r/20200521072125.21103-1-liwei391@huawei.com [daniel.thompson@linaro.org: Make kdb_flags unsigned to avoid arithmetic right shift] Signed-off-by: Daniel Thompson --- include/linux/kdb.h | 2 +- kernel/debug/kdb/kdb_main.c | 8 ++++---- 2 files changed, 5 insertions(+), 5 deletions(-) (limited to 'include/linux') diff --git a/include/linux/kdb.h b/include/linux/kdb.h index 24cd447659e0..0125a677b67f 100644 --- a/include/linux/kdb.h +++ b/include/linux/kdb.h @@ -125,7 +125,7 @@ extern const char *kdb_diemsg; #define KDB_FLAG_NO_I8042 (1 << 7) /* No i8042 chip is available, do * not use keyboard */ -extern int kdb_flags; /* Global flags, see kdb_state for per cpu state */ +extern unsigned int kdb_flags; /* Global flags, see kdb_state for per cpu state */ extern void kdb_save_flags(void); extern void kdb_restore_flags(void); diff --git a/kernel/debug/kdb/kdb_main.c b/kernel/debug/kdb/kdb_main.c index 6865a0f58d38..ec190569f690 100644 --- a/kernel/debug/kdb/kdb_main.c +++ b/kernel/debug/kdb/kdb_main.c @@ -62,7 +62,7 @@ int kdb_grep_trailing; /* * Kernel debugger state flags */ -int kdb_flags; +unsigned int kdb_flags; /* * kdb_lock protects updates to kdb_initial_cpu. Used to @@ -418,8 +418,7 @@ int kdb_set(int argc, const char **argv) argv[2]); return 0; } - kdb_flags = (kdb_flags & - ~(KDB_DEBUG_FLAG_MASK << KDB_DEBUG_FLAG_SHIFT)) + kdb_flags = (kdb_flags & ~KDB_DEBUG(MASK)) | (debugflags << KDB_DEBUG_FLAG_SHIFT); return 0; @@ -2082,7 +2081,8 @@ static int kdb_env(int argc, const char **argv) } if (KDB_DEBUG(MASK)) - kdb_printf("KDBFLAGS=0x%x\n", kdb_flags); + kdb_printf("KDBDEBUG=0x%x\n", + (kdb_flags & KDB_DEBUG(MASK)) >> KDB_DEBUG_FLAG_SHIFT); return 0; } -- cgit v1.2.3 From 735e4ae5ba28c886d249ad04d3c8cc097dad6336 Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Mon, 1 Jun 2020 21:45:36 -0700 Subject: vfs: track per-sb writeback errors and report them to syncfs Patch series "vfs: have syncfs() return error when there are writeback errors", v6. Currently, syncfs does not return errors when one of the inodes fails to be written back. It will return errors based on the legacy AS_EIO and AS_ENOSPC flags when syncing out the block device fails, but that's not particularly helpful for filesystems that aren't backed by a blockdev. It's also possible for a stray sync to lose those errors. The basic idea in this set is to track writeback errors at the superblock level, so that we can quickly and easily check whether something bad happened without having to fsync each file individually. syncfs is then changed to reliably report writeback errors after they occur, much in the same fashion as fsync does now. This patch (of 2): Usually we suggest that applications call fsync when they want to ensure that all data written to the file has made it to the backing store, but that can be inefficient when there are a lot of open files. Calling syncfs on the filesystem can be more efficient in some situations, but the error reporting doesn't currently work the way most people expect. If a single inode on a filesystem reports a writeback error, syncfs won't necessarily return an error. syncfs only returns an error if __sync_blockdev fails, and on some filesystems that's a no-op. It would be better if syncfs reported an error if there were any writeback failures. 
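Concretely, the reporting this series aims for looks roughly like the following hedged sketch from userspace (standard POSIX calls; the error outcome shown is the goal of the series, not current behavior):

#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

/* Desired behavior after this series: syncfs() returns -1 (e.g. EIO)
 * if any inode on the filesystem hit a writeback error since this fd
 * sampled the superblock's errseq_t at open time.
 */
int check_fs_errors(const char *path)
{
	int fd = open(path, O_RDONLY);
	if (fd < 0)
		return -1;
	int ret = syncfs(fd);
	if (ret < 0)
		perror("syncfs");	/* some file on this fs failed writeback */
	close(fd);
	return ret;
}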
Then applications could call syncfs to see if there are any errors on any open files, and could then call fsync on all of the other descriptors to figure out which one failed. This patch adds a new errseq_t to struct super_block, and has mapping_set_error also record writeback errors there. To report those errors, we also need to keep an errseq_t in struct file to act as a cursor. This patch adds a dedicated field for that purpose, which slots nicely into 4 bytes of padding at the end of struct file on x86_64. An earlier version of this patch used an O_PATH file descriptor to cue the kernel that the open file should track the superblock error and not the inode's writeback error. I think that API is just too weird though. This is simpler and should make syncfs error reporting "just work" even if someone is multiplexing fsync and syncfs on the same fds. Signed-off-by: Jeff Layton Signed-off-by: Andrew Morton Reviewed-by: Jan Kara Cc: Andres Freund Cc: Matthew Wilcox Cc: Al Viro Cc: Christoph Hellwig Cc: Dave Chinner Cc: David Howells Link: http://lkml.kernel.org/r/20200428135155.19223-1-jlayton@kernel.org Link: http://lkml.kernel.org/r/20200428135155.19223-2-jlayton@kernel.org Signed-off-by: Linus Torvalds --- drivers/dax/device.c | 1 + fs/file_table.c | 1 + fs/open.c | 3 +-- fs/sync.c | 6 ++++-- include/linux/fs.h | 16 ++++++++++++++++ include/linux/pagemap.h | 5 ++++- 6 files changed, 27 insertions(+), 5 deletions(-) (limited to 'include/linux') diff --git a/drivers/dax/device.c b/drivers/dax/device.c index 1af823b2fe6b..4c0af2eb7e19 100644 --- a/drivers/dax/device.c +++ b/drivers/dax/device.c @@ -377,6 +377,7 @@ static int dax_open(struct inode *inode, struct file *filp) inode->i_mapping->a_ops = &dev_dax_aops; filp->f_mapping = inode->i_mapping; filp->f_wb_err = filemap_sample_wb_err(filp->f_mapping); + filp->f_sb_err = file_sample_sb_err(filp); filp->private_data = dev_dax; inode->i_flags = S_DAX; diff --git a/fs/file_table.c b/fs/file_table.c index 30d55c9a1744..676e620948d2 100644 --- a/fs/file_table.c +++ b/fs/file_table.c @@ -198,6 +198,7 @@ static struct file *alloc_file(const struct path *path, int flags, file->f_inode = path->dentry->d_inode; file->f_mapping = path->dentry->d_inode->i_mapping; file->f_wb_err = filemap_sample_wb_err(file->f_mapping); + file->f_sb_err = file_sample_sb_err(file); if ((file->f_mode & FMODE_READ) && likely(fop->read || fop->read_iter)) file->f_mode |= FMODE_CAN_READ; diff --git a/fs/open.c b/fs/open.c index 719b320ede52..d9467a8a7f6a 100644 --- a/fs/open.c +++ b/fs/open.c @@ -743,9 +743,8 @@ static int do_dentry_open(struct file *f, path_get(&f->f_path); f->f_inode = inode; f->f_mapping = inode->i_mapping; - - /* Ensure that we skip any errors that predate opening of the file */ f->f_wb_err = filemap_sample_wb_err(f->f_mapping); + f->f_sb_err = file_sample_sb_err(f); if (unlikely(f->f_flags & O_PATH)) { f->f_mode = FMODE_PATH | FMODE_OPENED; diff --git a/fs/sync.c b/fs/sync.c index 4d1ff010bc5a..c6f6f5be5682 100644 --- a/fs/sync.c +++ b/fs/sync.c @@ -161,7 +161,7 @@ SYSCALL_DEFINE1(syncfs, int, fd) { struct fd f = fdget(fd); struct super_block *sb; - int ret; + int ret, ret2; if (!f.file) return -EBADF; @@ -171,8 +171,10 @@ SYSCALL_DEFINE1(syncfs, int, fd) ret = sync_filesystem(sb); up_read(&sb->s_umount); + ret2 = errseq_check_and_advance(&sb->s_wb_err, &f.file->f_sb_err); + fdput(f); - return ret; + return ret ? 
ret : ret2; } /** diff --git a/include/linux/fs.h b/include/linux/fs.h index 45cc10cdf6dd..f2fb5b7406b9 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -976,6 +976,7 @@ struct file { #endif /* #ifdef CONFIG_EPOLL */ struct address_space *f_mapping; errseq_t f_wb_err; + errseq_t f_sb_err; /* for syncfs */ } __randomize_layout __attribute__((aligned(4))); /* lest something weird decides that 2 is OK */ @@ -1520,6 +1521,9 @@ struct super_block { /* Being remounted read-only */ int s_readonly_remount; + /* per-sb errseq_t for reporting writeback errors via syncfs */ + errseq_t s_wb_err; + /* AIO completions deferred from interrupt context */ struct workqueue_struct *s_dio_done_wq; struct hlist_head s_pins; @@ -2827,6 +2831,18 @@ static inline errseq_t filemap_sample_wb_err(struct address_space *mapping) return errseq_sample(&mapping->wb_err); } +/** + * file_sample_sb_err - sample the current errseq_t to test for later errors + * @mapping: mapping to be sampled + * + * Grab the most current superblock-level errseq_t value for the given + * struct file. + */ +static inline errseq_t file_sample_sb_err(struct file *file) +{ + return errseq_sample(&file->f_path.dentry->d_sb->s_wb_err); +} + static inline int filemap_nr_thps(struct address_space *mapping) { #ifdef CONFIG_READ_ONLY_THP_FOR_FS diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h index a8f7bd8ea1c6..d4409b13747e 100644 --- a/include/linux/pagemap.h +++ b/include/linux/pagemap.h @@ -51,7 +51,10 @@ static inline void mapping_set_error(struct address_space *mapping, int error) return; /* Record in wb_err for checkers using errseq_t based tracking */ - filemap_set_wb_err(mapping, error); + __filemap_set_wb_err(mapping, error); + + /* Record it in superblock */ + errseq_set(&mapping->host->i_sb->s_wb_err, error); /* Record it in flags for now, for legacy callers */ if (error == -ENOSPC) -- cgit v1.2.3 From cee9a0c4e84db024d692d6b5c18f65465eb06905 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Mon, 1 Jun 2020 21:46:07 -0700 Subject: mm: move readahead prototypes from mm.h Patch series "Change readahead API", v11. This series adds a readahead address_space operation to replace the readpages operation. The key difference is that pages are added to the page cache as they are allocated (and then looked up by the filesystem) instead of passing them on a list to the readpages operation and having the filesystem add them to the page cache. It's a net reduction in code for each implementation, more efficient than walking a list, and solves the direct-write vs buffered-read problem reported by yu kuai at http://lkml.kernel.org/r/20200116063601.39201-1-yukuai3@huawei.com The only unconverted filesystems are those which use fscache. Their conversion is pending Dave Howells' rewrite which will make the conversion substantially easier. This should be completed by the end of the year. I want to thank the reviewers/testers; Dave Chinner, John Hubbard, Eric Biggers, Johannes Thumshirn, Dave Sterba, Zi Yan, Christoph Hellwig and Miklos Szeredi have done a marvellous job of providing constructive criticism. These patches pass an xfstests run on ext4, xfs & btrfs with no regressions that I can tell (some of the tests seem a little flaky before and remain flaky afterwards). This patch (of 25): The readahead code is part of the page cache so should be found in the pagemap.h file. force_page_cache_readahead is only used within mm, so move it to mm/internal.h instead. 
Remove the parameter names where they add no value, and rename the ones which were actively misleading. Signed-off-by: Matthew Wilcox (Oracle) Signed-off-by: Andrew Morton Reviewed-by: John Hubbard Reviewed-by: Christoph Hellwig Reviewed-by: William Kucharski Reviewed-by: Johannes Thumshirn Cc: Chao Yu Cc: Cong Wang Cc: Darrick J. Wong Cc: Dave Chinner Cc: Eric Biggers Cc: Gao Xiang Cc: Jaegeuk Kim Cc: Joseph Qi Cc: Junxiao Bi Cc: Michal Hocko Cc: Zi Yan Cc: Miklos Szeredi Link: http://lkml.kernel.org/r/20200414150233.24495-1-willy@infradead.org Link: http://lkml.kernel.org/r/20200414150233.24495-2-willy@infradead.org Signed-off-by: Linus Torvalds --- block/blk-core.c | 1 + include/linux/mm.h | 19 ------------------- include/linux/pagemap.h | 8 ++++++++ mm/fadvise.c | 2 ++ mm/internal.h | 2 ++ 5 files changed, 13 insertions(+), 19 deletions(-) (limited to 'include/linux') diff --git a/block/blk-core.c b/block/blk-core.c index 9bfaee050c82..38d7b1f16067 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -20,6 +20,7 @@ #include #include #include +#include #include #include #include diff --git a/include/linux/mm.h b/include/linux/mm.h index f3fe7371855c..92704fde6475 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -2612,25 +2612,6 @@ extern vm_fault_t filemap_page_mkwrite(struct vm_fault *vmf); int __must_check write_one_page(struct page *page); void task_dirty_inc(struct task_struct *tsk); -/* readahead.c */ -#define VM_READAHEAD_PAGES (SZ_128K / PAGE_SIZE) - -int force_page_cache_readahead(struct address_space *mapping, struct file *filp, - pgoff_t offset, unsigned long nr_to_read); - -void page_cache_sync_readahead(struct address_space *mapping, - struct file_ra_state *ra, - struct file *filp, - pgoff_t offset, - unsigned long size); - -void page_cache_async_readahead(struct address_space *mapping, - struct file_ra_state *ra, - struct file *filp, - struct page *pg, - pgoff_t offset, - unsigned long size); - extern unsigned long stack_guard_gap; /* Generic expand stack which grows the stack according to GROWS{UP,DOWN} */ extern int expand_stack(struct vm_area_struct *vma, unsigned long address); diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h index d4409b13747e..8c081cda4b35 100644 --- a/include/linux/pagemap.h +++ b/include/linux/pagemap.h @@ -618,6 +618,14 @@ int replace_page_cache_page(struct page *old, struct page *new, gfp_t gfp_mask); void delete_from_page_cache_batch(struct address_space *mapping, struct pagevec *pvec); +#define VM_READAHEAD_PAGES (SZ_128K / PAGE_SIZE) + +void page_cache_sync_readahead(struct address_space *, struct file_ra_state *, + struct file *, pgoff_t index, unsigned long req_count); +void page_cache_async_readahead(struct address_space *, struct file_ra_state *, + struct file *, struct page *, pgoff_t index, + unsigned long req_count); + /* * Like add_to_page_cache_locked, but used to add newly allocated pages: * the page is new, so we can just run __SetPageLocked() against it. diff --git a/mm/fadvise.c b/mm/fadvise.c index 4f17c83db575..3efebfb9952c 100644 --- a/mm/fadvise.c +++ b/mm/fadvise.c @@ -22,6 +22,8 @@ #include +#include "internal.h" + /* * POSIX_FADV_WILLNEED could set PG_Referenced, and POSIX_FADV_NOREUSE could * deactivate the pages and clear PG_Referenced. 
diff --git a/mm/internal.h b/mm/internal.h index b5634e78f01d..25fee17c7334 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -49,6 +49,8 @@ void unmap_page_range(struct mmu_gather *tlb, unsigned long addr, unsigned long end, struct zap_details *details); +int force_page_cache_readahead(struct address_space *, struct file *, + pgoff_t index, unsigned long nr_to_read); extern unsigned int __do_page_cache_readahead(struct address_space *mapping, struct file *filp, pgoff_t offset, unsigned long nr_to_read, unsigned long lookahead_size); -- cgit v1.2.3 From 042124cc64c33555deba0b11c6e0c612ae7a8653 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Mon, 1 Jun 2020 21:46:21 -0700 Subject: mm: add new readahead_control API Filesystems which implement the upcoming ->readahead method will get their pages by calling readahead_page() or readahead_page_batch(). These functions support large pages, even though none of the filesystems to be converted do yet. Signed-off-by: Matthew Wilcox (Oracle) Signed-off-by: Andrew Morton Reviewed-by: Christoph Hellwig Reviewed-by: William Kucharski Cc: Chao Yu Cc: Cong Wang Cc: Darrick J. Wong Cc: Dave Chinner Cc: Eric Biggers Cc: Gao Xiang Cc: Jaegeuk Kim Cc: John Hubbard Cc: Joseph Qi Cc: Junxiao Bi Cc: Michal Hocko Cc: Zi Yan Cc: Johannes Thumshirn Cc: Miklos Szeredi Link: http://lkml.kernel.org/r/20200414150233.24495-6-willy@infradead.org Signed-off-by: Linus Torvalds --- include/linux/pagemap.h | 140 ++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 140 insertions(+) (limited to 'include/linux') diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h index 8c081cda4b35..c3bf73263ec9 100644 --- a/include/linux/pagemap.h +++ b/include/linux/pagemap.h @@ -642,6 +642,146 @@ static inline int add_to_page_cache(struct page *page, return error; } +/** + * struct readahead_control - Describes a readahead request. + * + * A readahead request is for consecutive pages. Filesystems which + * implement the ->readahead method should call readahead_page() or + * readahead_page_batch() in a loop and attempt to start I/O against + * each page in the request. + * + * Most of the fields in this struct are private and should be accessed + * by the functions below. + * + * @file: The file, used primarily by network filesystems for authentication. + * May be NULL if invoked internally by the filesystem. + * @mapping: Readahead this filesystem object. + */ +struct readahead_control { + struct file *file; + struct address_space *mapping; +/* private: use the readahead_* accessors instead */ + pgoff_t _index; + unsigned int _nr_pages; + unsigned int _batch_count; +}; + +/** + * readahead_page - Get the next page to read. + * @rac: The current readahead request. + * + * Context: The page is locked and has an elevated refcount. The caller + * should decreases the refcount once the page has been submitted for I/O + * and unlock the page once all I/O to that page has completed. + * Return: A pointer to the next page, or %NULL if we are done. 
+ */ +static inline struct page *readahead_page(struct readahead_control *rac) +{ + struct page *page; + + BUG_ON(rac->_batch_count > rac->_nr_pages); + rac->_nr_pages -= rac->_batch_count; + rac->_index += rac->_batch_count; + + if (!rac->_nr_pages) { + rac->_batch_count = 0; + return NULL; + } + + page = xa_load(&rac->mapping->i_pages, rac->_index); + VM_BUG_ON_PAGE(!PageLocked(page), page); + rac->_batch_count = hpage_nr_pages(page); + + return page; +} + +static inline unsigned int __readahead_batch(struct readahead_control *rac, + struct page **array, unsigned int array_sz) +{ + unsigned int i = 0; + XA_STATE(xas, &rac->mapping->i_pages, 0); + struct page *page; + + BUG_ON(rac->_batch_count > rac->_nr_pages); + rac->_nr_pages -= rac->_batch_count; + rac->_index += rac->_batch_count; + rac->_batch_count = 0; + + xas_set(&xas, rac->_index); + rcu_read_lock(); + xas_for_each(&xas, page, rac->_index + rac->_nr_pages - 1) { + VM_BUG_ON_PAGE(!PageLocked(page), page); + VM_BUG_ON_PAGE(PageTail(page), page); + array[i++] = page; + rac->_batch_count += hpage_nr_pages(page); + + /* + * The page cache isn't using multi-index entries yet, + * so the xas cursor needs to be manually moved to the + * next index. This can be removed once the page cache + * is converted. + */ + if (PageHead(page)) + xas_set(&xas, rac->_index + rac->_batch_count); + + if (i == array_sz) + break; + } + rcu_read_unlock(); + + return i; +} + +/** + * readahead_page_batch - Get a batch of pages to read. + * @rac: The current readahead request. + * @array: An array of pointers to struct page. + * + * Context: The pages are locked and have an elevated refcount. The caller + * should decreases the refcount once the page has been submitted for I/O + * and unlock the page once all I/O to that page has completed. + * Return: The number of pages placed in the array. 0 indicates the request + * is complete. + */ +#define readahead_page_batch(rac, array) \ + __readahead_batch(rac, array, ARRAY_SIZE(array)) + +/** + * readahead_pos - The byte offset into the file of this readahead request. + * @rac: The readahead request. + */ +static inline loff_t readahead_pos(struct readahead_control *rac) +{ + return (loff_t)rac->_index * PAGE_SIZE; +} + +/** + * readahead_length - The number of bytes in this readahead request. + * @rac: The readahead request. + */ +static inline loff_t readahead_length(struct readahead_control *rac) +{ + return (loff_t)rac->_nr_pages * PAGE_SIZE; +} + +/** + * readahead_index - The index of the first page in this readahead request. + * @rac: The readahead request. + */ +static inline pgoff_t readahead_index(struct readahead_control *rac) +{ + return rac->_index; +} + +/** + * readahead_count - The number of pages in this readahead request. + * @rac: The readahead request. + */ +static inline unsigned int readahead_count(struct readahead_control *rac) +{ + return rac->_nr_pages; +} + static inline unsigned long dir_pages(struct inode *inode) { return (unsigned long)(inode->i_size + PAGE_SIZE - 1) >> -- cgit v1.2.3 From 8151b4c8bee43cea7a28cb0300123df90880e60c Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Mon, 1 Jun 2020 21:46:44 -0700 Subject: mm: add readahead address space operation This replaces ->readpages with a saner interface: - Return void instead of an ignored error code. - Page cache is already populated with locked pages when ->readahead is called. 
- New arguments can be passed to the implementation without changing all the filesystems that use a common helper function like mpage_readahead(). Signed-off-by: Matthew Wilcox (Oracle) Signed-off-by: Andrew Morton Reviewed-by: John Hubbard Reviewed-by: Christoph Hellwig Reviewed-by: William Kucharski Cc: Chao Yu Cc: Cong Wang Cc: Darrick J. Wong Cc: Dave Chinner Cc: Eric Biggers Cc: Gao Xiang Cc: Jaegeuk Kim Cc: Joseph Qi Cc: Junxiao Bi Cc: Michal Hocko Cc: Zi Yan Cc: Johannes Thumshirn Cc: Miklos Szeredi Link: http://lkml.kernel.org/r/20200414150233.24495-12-willy@infradead.org Signed-off-by: Linus Torvalds --- Documentation/filesystems/locking.rst | 6 +++++- Documentation/filesystems/vfs.rst | 15 +++++++++++++++ include/linux/fs.h | 2 ++ mm/readahead.c | 12 ++++++++++-- 4 files changed, 32 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/Documentation/filesystems/locking.rst b/Documentation/filesystems/locking.rst index 5057e4d9dcd1..0af2e0e11461 100644 --- a/Documentation/filesystems/locking.rst +++ b/Documentation/filesystems/locking.rst @@ -239,6 +239,7 @@ prototypes:: int (*readpage)(struct file *, struct page *); int (*writepages)(struct address_space *, struct writeback_control *); int (*set_page_dirty)(struct page *page); + void (*readahead)(struct readahead_control *); int (*readpages)(struct file *filp, struct address_space *mapping, struct list_head *pages, unsigned nr_pages); int (*write_begin)(struct file *, struct address_space *mapping, @@ -271,7 +272,8 @@ writepage: yes, unlocks (see below) readpage: yes, unlocks writepages: set_page_dirty no -readpages: +readahead: yes, unlocks +readpages: no write_begin: locks the page exclusive write_end: yes, unlocks exclusive bmap: @@ -295,6 +297,8 @@ the request handler (/dev/loop). ->readpage() unlocks the page, either synchronously or via I/O completion. +->readahead() unlocks the pages that I/O is attempted on like ->readpage(). + ->readpages() populates the pagecache with the passed pages and starts I/O against them. They come unlocked upon I/O completion. diff --git a/Documentation/filesystems/vfs.rst b/Documentation/filesystems/vfs.rst index 7d4d09dd5e6d..ed17771c212b 100644 --- a/Documentation/filesystems/vfs.rst +++ b/Documentation/filesystems/vfs.rst @@ -706,6 +706,7 @@ cache in your filesystem. The following members are defined: int (*readpage)(struct file *, struct page *); int (*writepages)(struct address_space *, struct writeback_control *); int (*set_page_dirty)(struct page *page); + void (*readahead)(struct readahead_control *); int (*readpages)(struct file *filp, struct address_space *mapping, struct list_head *pages, unsigned nr_pages); int (*write_begin)(struct file *, struct address_space *mapping, @@ -781,12 +782,26 @@ cache in your filesystem. The following members are defined: If defined, it should set the PageDirty flag, and the PAGECACHE_TAG_DIRTY tag in the radix tree. +``readahead`` + Called by the VM to read pages associated with the address_space + object. The pages are consecutive in the page cache and are + locked. The implementation should decrement the page refcount + after starting I/O on each page. Usually the page will be + unlocked by the I/O completion handler. If the filesystem decides + to stop attempting I/O before reaching the end of the readahead + window, it can simply return. The caller will decrement the page + refcount and unlock the remaining pages for you. Set PageUptodate + if the I/O completes successfully. 
Setting PageError on any page + will be ignored; simply unlock the page if an I/O error occurs. + ``readpages`` called by the VM to read pages associated with the address_space object. This is essentially just a vector version of readpage. Instead of just one page, several pages are requested. readpages is only used for read-ahead, so read errors are ignored. If anything goes wrong, feel free to give up. + This interface is deprecated and will be removed by the end of + 2020; implement readahead instead. ``write_begin`` Called by the generic buffered write code to ask the filesystem diff --git a/include/linux/fs.h b/include/linux/fs.h index f2fb5b7406b9..1434ed801b80 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -292,6 +292,7 @@ enum positive_aop_returns { struct page; struct address_space; struct writeback_control; +struct readahead_control; /* * Write life time hint values. @@ -375,6 +376,7 @@ struct address_space_operations { */ int (*readpages)(struct file *filp, struct address_space *mapping, struct list_head *pages, unsigned nr_pages); + void (*readahead)(struct readahead_control *); int (*write_begin)(struct file *, struct address_space *mapping, loff_t pos, unsigned len, unsigned flags, diff --git a/mm/readahead.c b/mm/readahead.c index e52b3a7b9da5..d01531ef9f3c 100644 --- a/mm/readahead.c +++ b/mm/readahead.c @@ -125,7 +125,14 @@ static void read_pages(struct readahead_control *rac, struct list_head *pages, blk_start_plug(&plug); - if (aops->readpages) { + if (aops->readahead) { + aops->readahead(rac); + /* Clean up the remaining pages */ + while ((page = readahead_page(rac))) { + unlock_page(page); + put_page(page); + } + } else if (aops->readpages) { aops->readpages(rac->file, rac->mapping, pages, readahead_count(rac)); /* Clean up the remaining pages */ @@ -233,7 +240,8 @@ void force_page_cache_readahead(struct address_space *mapping, struct file_ra_state *ra = &filp->f_ra; unsigned long max_pages; - if (unlikely(!mapping->a_ops->readpage && !mapping->a_ops->readpages)) + if (unlikely(!mapping->a_ops->readpage && !mapping->a_ops->readpages && + !mapping->a_ops->readahead)) return; /* -- cgit v1.2.3 From 2c684234d36f7e8c80414e4a772911d407e821fa Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Mon, 1 Jun 2020 21:46:51 -0700 Subject: mm: add page_cache_readahead_unbounded ext4 and f2fs have duplicated the guts of the readahead code so they can read past i_size. Instead, separate out the guts of the readahead code so they can call it directly. Signed-off-by: Matthew Wilcox (Oracle) Signed-off-by: Andrew Morton Tested-by: Eric Biggers Reviewed-by: Christoph Hellwig Reviewed-by: William Kucharski Reviewed-by: Eric Biggers Cc: Chao Yu Cc: Cong Wang Cc: Darrick J. 
Wong Cc: Dave Chinner Cc: Gao Xiang Cc: Jaegeuk Kim Cc: John Hubbard Cc: Joseph Qi Cc: Junxiao Bi Cc: Michal Hocko Cc: Zi Yan Cc: Johannes Thumshirn Cc: Miklos Szeredi Link: http://lkml.kernel.org/r/20200414150233.24495-14-willy@infradead.org Signed-off-by: Linus Torvalds --- fs/ext4/verity.c | 35 ++----------------------- fs/f2fs/data.c | 2 +- fs/f2fs/f2fs.h | 3 --- fs/f2fs/verity.c | 35 ++----------------------- include/linux/pagemap.h | 3 +++ mm/readahead.c | 68 ++++++++++++++++++++++++++++++++++--------------- 6 files changed, 55 insertions(+), 91 deletions(-) (limited to 'include/linux') diff --git a/fs/ext4/verity.c b/fs/ext4/verity.c index dc5ec724d889..dec1244dd062 100644 --- a/fs/ext4/verity.c +++ b/fs/ext4/verity.c @@ -342,37 +342,6 @@ static int ext4_get_verity_descriptor(struct inode *inode, void *buf, return desc_size; } -/* - * Prefetch some pages from the file's Merkle tree. - * - * This is basically a stripped-down version of __do_page_cache_readahead() - * which works on pages past i_size. - */ -static void ext4_merkle_tree_readahead(struct address_space *mapping, - pgoff_t start_index, unsigned long count) -{ - LIST_HEAD(pages); - unsigned int nr_pages = 0; - struct page *page; - pgoff_t index; - struct blk_plug plug; - - for (index = start_index; index < start_index + count; index++) { - page = xa_load(&mapping->i_pages, index); - if (!page || xa_is_value(page)) { - page = __page_cache_alloc(readahead_gfp_mask(mapping)); - if (!page) - break; - page->index = index; - list_add(&page->lru, &pages); - nr_pages++; - } - } - blk_start_plug(&plug); - ext4_mpage_readpages(mapping, &pages, NULL, nr_pages, true); - blk_finish_plug(&plug); -} - static struct page *ext4_read_merkle_tree_page(struct inode *inode, pgoff_t index, unsigned long num_ra_pages) @@ -386,8 +355,8 @@ static struct page *ext4_read_merkle_tree_page(struct inode *inode, if (page) put_page(page); else if (num_ra_pages > 1) - ext4_merkle_tree_readahead(inode->i_mapping, index, - num_ra_pages); + page_cache_readahead_unbounded(inode->i_mapping, NULL, + index, num_ra_pages, 0); page = read_mapping_page(inode->i_mapping, index, NULL); } return page; diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index cdf2f626bea7..ae14e952df4f 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -2177,7 +2177,7 @@ out: * use ->readpage() or do the necessary surgery to decouple ->readpages() * from read-ahead. 
*/ -int f2fs_mpage_readpages(struct address_space *mapping, +static int f2fs_mpage_readpages(struct address_space *mapping, struct list_head *pages, struct page *page, unsigned nr_pages, bool is_readahead) { diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 157eec348970..b5a5da74c013 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -3373,9 +3373,6 @@ int f2fs_reserve_new_block(struct dnode_of_data *dn); int f2fs_get_block(struct dnode_of_data *dn, pgoff_t index); int f2fs_preallocate_blocks(struct kiocb *iocb, struct iov_iter *from); int f2fs_reserve_block(struct dnode_of_data *dn, pgoff_t index); -int f2fs_mpage_readpages(struct address_space *mapping, - struct list_head *pages, struct page *page, - unsigned nr_pages, bool is_readahead); struct page *f2fs_get_read_data_page(struct inode *inode, pgoff_t index, int op_flags, bool for_write); struct page *f2fs_find_data_page(struct inode *inode, pgoff_t index); diff --git a/fs/f2fs/verity.c b/fs/f2fs/verity.c index d7d430a6f130..865c9fb774fb 100644 --- a/fs/f2fs/verity.c +++ b/fs/f2fs/verity.c @@ -222,37 +222,6 @@ static int f2fs_get_verity_descriptor(struct inode *inode, void *buf, return size; } -/* - * Prefetch some pages from the file's Merkle tree. - * - * This is basically a stripped-down version of __do_page_cache_readahead() - * which works on pages past i_size. - */ -static void f2fs_merkle_tree_readahead(struct address_space *mapping, - pgoff_t start_index, unsigned long count) -{ - LIST_HEAD(pages); - unsigned int nr_pages = 0; - struct page *page; - pgoff_t index; - struct blk_plug plug; - - for (index = start_index; index < start_index + count; index++) { - page = xa_load(&mapping->i_pages, index); - if (!page || xa_is_value(page)) { - page = __page_cache_alloc(readahead_gfp_mask(mapping)); - if (!page) - break; - page->index = index; - list_add(&page->lru, &pages); - nr_pages++; - } - } - blk_start_plug(&plug); - f2fs_mpage_readpages(mapping, &pages, NULL, nr_pages, true); - blk_finish_plug(&plug); -} - static struct page *f2fs_read_merkle_tree_page(struct inode *inode, pgoff_t index, unsigned long num_ra_pages) @@ -266,8 +235,8 @@ static struct page *f2fs_read_merkle_tree_page(struct inode *inode, if (page) put_page(page); else if (num_ra_pages > 1) - f2fs_merkle_tree_readahead(inode->i_mapping, index, - num_ra_pages); + page_cache_readahead_unbounded(inode->i_mapping, NULL, + index, num_ra_pages, 0); page = read_mapping_page(inode->i_mapping, index, NULL); } return page; diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h index c3bf73263ec9..c6348c50136f 100644 --- a/include/linux/pagemap.h +++ b/include/linux/pagemap.h @@ -625,6 +625,9 @@ void page_cache_sync_readahead(struct address_space *, struct file_ra_state *, void page_cache_async_readahead(struct address_space *, struct file_ra_state *, struct file *, struct page *, pgoff_t index, unsigned long req_count); +void page_cache_readahead_unbounded(struct address_space *, struct file *, + pgoff_t index, unsigned long nr_to_read, + unsigned long lookahead_count); /* * Like add_to_page_cache_locked, but used to add newly allocated pages: diff --git a/mm/readahead.c b/mm/readahead.c index 998fdd23c0b1..ae231a5312cb 100644 --- a/mm/readahead.c +++ b/mm/readahead.c @@ -156,37 +156,34 @@ out: rac->_index++; } -/* - * __do_page_cache_readahead() actually reads a chunk of disk. It allocates - * the pages first, then submits them for I/O. This avoids the very bad - * behaviour which would occur if page allocations are causing VM writeback. 
- * We really don't want to intermingle reads and writes like that. +/** + * page_cache_readahead_unbounded - Start unchecked readahead. + * @mapping: File address space. + * @file: This instance of the open file; used for authentication. + * @index: First page index to read. + * @nr_to_read: The number of pages to read. + * @lookahead_size: Where to start the next readahead. + * + * This function is for filesystems to call when they want to start + * readahead beyond a file's stated i_size. This is almost certainly + * not the function you want to call. Use page_cache_async_readahead() + * or page_cache_sync_readahead() instead. + * + * Context: File is referenced by caller. Mutexes may be held by caller. + * May sleep, but will not reenter filesystem to reclaim memory. */ -void __do_page_cache_readahead(struct address_space *mapping, - struct file *filp, pgoff_t index, unsigned long nr_to_read, +void page_cache_readahead_unbounded(struct address_space *mapping, + struct file *file, pgoff_t index, unsigned long nr_to_read, unsigned long lookahead_size) { - struct inode *inode = mapping->host; LIST_HEAD(page_pool); - loff_t isize = i_size_read(inode); gfp_t gfp_mask = readahead_gfp_mask(mapping); struct readahead_control rac = { .mapping = mapping, - .file = filp, + .file = file, ._index = index, }; unsigned long i; - pgoff_t end_index; /* The last page we want to read */ - - if (isize == 0) - return; - - end_index = (isize - 1) >> PAGE_SHIFT; - if (index > end_index) - return; - /* Don't read past the page containing the last byte of the file */ - if (nr_to_read > end_index - index) - nr_to_read = end_index - index + 1; /* * Preallocate as many pages as we will need. @@ -230,6 +227,35 @@ void __do_page_cache_readahead(struct address_space *mapping, */ read_pages(&rac, &page_pool, false); } +EXPORT_SYMBOL_GPL(page_cache_readahead_unbounded); + +/* + * __do_page_cache_readahead() actually reads a chunk of disk. It allocates + * the pages first, then submits them for I/O. This avoids the very bad + * behaviour which would occur if page allocations are causing VM writeback. + * We really don't want to intermingle reads and writes like that. + */ +void __do_page_cache_readahead(struct address_space *mapping, + struct file *file, pgoff_t index, unsigned long nr_to_read, + unsigned long lookahead_size) +{ + struct inode *inode = mapping->host; + loff_t isize = i_size_read(inode); + pgoff_t end_index; /* The last page we want to read */ + + if (isize == 0) + return; + + end_index = (isize - 1) >> PAGE_SHIFT; + if (index > end_index) + return; + /* Don't read past the page containing the last byte of the file */ + if (nr_to_read > end_index - index) + nr_to_read = end_index - index + 1; + + page_cache_readahead_unbounded(mapping, file, index, nr_to_read, + lookahead_size); +} /* * Chunk the readahead into 2 megabyte units, so that we don't pin too much -- cgit v1.2.3 From d4388340ae0bc8397ef5b24342279f7739982918 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Mon, 1 Jun 2020 21:47:02 -0700 Subject: fs: convert mpage_readpages to mpage_readahead Implement the new readahead aop and convert all callers (block_dev, exfat, ext2, fat, gfs2, hpfs, isofs, jfs, nilfs2, ocfs2, omfs, qnx6, reiserfs & udf). The callers are all trivial except for GFS2 & OCFS2. 
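A trivial conversion follows a fixed pattern. As a sketch only, for a hypothetical filesystem (the examplefs_* names are placeholders, assuming the filesystem already has a standard get_block_t helper):

	static void examplefs_readahead(struct readahead_control *rac)
	{
		/* mpage_readahead() consumes the locked pages in rac and submits read bios */
		mpage_readahead(rac, examplefs_get_block);
	}

with ".readpages = examplefs_readpages," replaced by ".readahead = examplefs_readahead," in the address_space_operations. The return value disappears because read-ahead errors are ignored and the core unlocks and drops any pages the implementation does not consume.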
Signed-off-by: Matthew Wilcox (Oracle) Signed-off-by: Andrew Morton Reviewed-by: Junxiao Bi # ocfs2 Reviewed-by: Joseph Qi # ocfs2 Reviewed-by: Dave Chinner Reviewed-by: John Hubbard Reviewed-by: Christoph Hellwig Reviewed-by: William Kucharski Cc: Chao Yu Cc: Cong Wang Cc: Darrick J. Wong Cc: Eric Biggers Cc: Gao Xiang Cc: Jaegeuk Kim Cc: Michal Hocko Cc: Zi Yan Cc: Johannes Thumshirn Cc: Miklos Szeredi Link: http://lkml.kernel.org/r/20200414150233.24495-17-willy@infradead.org Signed-off-by: Linus Torvalds --- fs/block_dev.c | 7 +++---- fs/exfat/inode.c | 7 +++---- fs/ext2/inode.c | 10 ++++------ fs/fat/inode.c | 7 +++---- fs/gfs2/aops.c | 23 ++++++++--------------- fs/hpfs/file.c | 7 +++---- fs/iomap/buffered-io.c | 2 +- fs/isofs/inode.c | 7 +++---- fs/jfs/inode.c | 7 +++---- fs/mpage.c | 38 +++++++++++--------------------------- fs/nilfs2/inode.c | 15 +++------------ fs/ocfs2/aops.c | 34 +++++++++++++--------------------- fs/omfs/file.c | 7 +++---- fs/qnx6/inode.c | 7 +++---- fs/reiserfs/inode.c | 8 +++----- fs/udf/inode.c | 7 +++---- include/linux/mpage.h | 4 ++-- mm/migrate.c | 2 +- 18 files changed, 73 insertions(+), 126 deletions(-) (limited to 'include/linux') diff --git a/fs/block_dev.c b/fs/block_dev.c index 93672c3f1c78..f05e2f2c898d 100644 --- a/fs/block_dev.c +++ b/fs/block_dev.c @@ -614,10 +614,9 @@ static int blkdev_readpage(struct file * file, struct page * page) return block_read_full_page(page, blkdev_get_block); } -static int blkdev_readpages(struct file *file, struct address_space *mapping, - struct list_head *pages, unsigned nr_pages) +static void blkdev_readahead(struct readahead_control *rac) { - return mpage_readpages(mapping, pages, nr_pages, blkdev_get_block); + mpage_readahead(rac, blkdev_get_block); } static int blkdev_write_begin(struct file *file, struct address_space *mapping, @@ -2085,7 +2084,7 @@ static int blkdev_writepages(struct address_space *mapping, static const struct address_space_operations def_blk_aops = { .readpage = blkdev_readpage, - .readpages = blkdev_readpages, + .readahead = blkdev_readahead, .writepage = blkdev_writepage, .write_begin = blkdev_write_begin, .write_end = blkdev_write_end, diff --git a/fs/exfat/inode.c b/fs/exfat/inode.c index 06887492f54b..785ead346543 100644 --- a/fs/exfat/inode.c +++ b/fs/exfat/inode.c @@ -372,10 +372,9 @@ static int exfat_readpage(struct file *file, struct page *page) return mpage_readpage(page, exfat_get_block); } -static int exfat_readpages(struct file *file, struct address_space *mapping, - struct list_head *pages, unsigned int nr_pages) +static void exfat_readahead(struct readahead_control *rac) { - return mpage_readpages(mapping, pages, nr_pages, exfat_get_block); + mpage_readahead(rac, exfat_get_block); } static int exfat_writepage(struct page *page, struct writeback_control *wbc) @@ -502,7 +501,7 @@ int exfat_block_truncate_page(struct inode *inode, loff_t from) static const struct address_space_operations exfat_aops = { .readpage = exfat_readpage, - .readpages = exfat_readpages, + .readahead = exfat_readahead, .writepage = exfat_writepage, .writepages = exfat_writepages, .write_begin = exfat_write_begin, diff --git a/fs/ext2/inode.c b/fs/ext2/inode.c index c885cf7d724b..2875c0a705b5 100644 --- a/fs/ext2/inode.c +++ b/fs/ext2/inode.c @@ -877,11 +877,9 @@ static int ext2_readpage(struct file *file, struct page *page) return mpage_readpage(page, ext2_get_block); } -static int -ext2_readpages(struct file *file, struct address_space *mapping, - struct list_head *pages, unsigned nr_pages) +static void 
ext2_readahead(struct readahead_control *rac) { - return mpage_readpages(mapping, pages, nr_pages, ext2_get_block); + mpage_readahead(rac, ext2_get_block); } static int @@ -967,7 +965,7 @@ ext2_dax_writepages(struct address_space *mapping, struct writeback_control *wbc const struct address_space_operations ext2_aops = { .readpage = ext2_readpage, - .readpages = ext2_readpages, + .readahead = ext2_readahead, .writepage = ext2_writepage, .write_begin = ext2_write_begin, .write_end = ext2_write_end, @@ -981,7 +979,7 @@ const struct address_space_operations ext2_aops = { const struct address_space_operations ext2_nobh_aops = { .readpage = ext2_readpage, - .readpages = ext2_readpages, + .readahead = ext2_readahead, .writepage = ext2_nobh_writepage, .write_begin = ext2_nobh_write_begin, .write_end = nobh_write_end, diff --git a/fs/fat/inode.c b/fs/fat/inode.c index 71946da84388..e6e68b2274a5 100644 --- a/fs/fat/inode.c +++ b/fs/fat/inode.c @@ -210,10 +210,9 @@ static int fat_readpage(struct file *file, struct page *page) return mpage_readpage(page, fat_get_block); } -static int fat_readpages(struct file *file, struct address_space *mapping, - struct list_head *pages, unsigned nr_pages) +static void fat_readahead(struct readahead_control *rac) { - return mpage_readpages(mapping, pages, nr_pages, fat_get_block); + mpage_readahead(rac, fat_get_block); } static void fat_write_failed(struct address_space *mapping, loff_t to) @@ -344,7 +343,7 @@ int fat_block_truncate_page(struct inode *inode, loff_t from) static const struct address_space_operations fat_aops = { .readpage = fat_readpage, - .readpages = fat_readpages, + .readahead = fat_readahead, .writepage = fat_writepage, .writepages = fat_writepages, .write_begin = fat_write_begin, diff --git a/fs/gfs2/aops.c b/fs/gfs2/aops.c index 786c1ce8f030..72c9560f4467 100644 --- a/fs/gfs2/aops.c +++ b/fs/gfs2/aops.c @@ -577,7 +577,7 @@ int gfs2_internal_read(struct gfs2_inode *ip, char *buf, loff_t *pos, } /** - * gfs2_readpages - Read a bunch of pages at once + * gfs2_readahead - Read a bunch of pages at once * @file: The file to read from * @mapping: Address space info * @pages: List of pages to read @@ -590,31 +590,24 @@ int gfs2_internal_read(struct gfs2_inode *ip, char *buf, loff_t *pos, * obviously not something we'd want to do on too regular a basis. * Any I/O we ignore at this time will be done via readpage later. * 2. We don't handle stuffed files here we let readpage do the honours. - * 3. mpage_readpages() does most of the heavy lifting in the common case. + * 3. mpage_readahead() does most of the heavy lifting in the common case. * 4. gfs2_block_map() is relied upon to set BH_Boundary in the right places. 
*/ -static int gfs2_readpages(struct file *file, struct address_space *mapping, - struct list_head *pages, unsigned nr_pages) +static void gfs2_readahead(struct readahead_control *rac) { - struct inode *inode = mapping->host; + struct inode *inode = rac->mapping->host; struct gfs2_inode *ip = GFS2_I(inode); - struct gfs2_sbd *sdp = GFS2_SB(inode); struct gfs2_holder gh; - int ret; gfs2_holder_init(ip->i_gl, LM_ST_SHARED, 0, &gh); - ret = gfs2_glock_nq(&gh); - if (unlikely(ret)) + if (gfs2_glock_nq(&gh)) goto out_uninit; if (!gfs2_is_stuffed(ip)) - ret = mpage_readpages(mapping, pages, nr_pages, gfs2_block_map); + mpage_readahead(rac, gfs2_block_map); gfs2_glock_dq(&gh); out_uninit: gfs2_holder_uninit(&gh); - if (unlikely(gfs2_withdrawn(sdp))) - ret = -EIO; - return ret; } /** @@ -833,7 +826,7 @@ static const struct address_space_operations gfs2_aops = { .writepage = gfs2_writepage, .writepages = gfs2_writepages, .readpage = gfs2_readpage, - .readpages = gfs2_readpages, + .readahead = gfs2_readahead, .bmap = gfs2_bmap, .invalidatepage = gfs2_invalidatepage, .releasepage = gfs2_releasepage, @@ -847,7 +840,7 @@ static const struct address_space_operations gfs2_jdata_aops = { .writepage = gfs2_jdata_writepage, .writepages = gfs2_jdata_writepages, .readpage = gfs2_readpage, - .readpages = gfs2_readpages, + .readahead = gfs2_readahead, .set_page_dirty = jdata_set_page_dirty, .bmap = gfs2_bmap, .invalidatepage = gfs2_invalidatepage, diff --git a/fs/hpfs/file.c b/fs/hpfs/file.c index b36abf9cb345..2de0d3492d15 100644 --- a/fs/hpfs/file.c +++ b/fs/hpfs/file.c @@ -125,10 +125,9 @@ static int hpfs_writepage(struct page *page, struct writeback_control *wbc) return block_write_full_page(page, hpfs_get_block, wbc); } -static int hpfs_readpages(struct file *file, struct address_space *mapping, - struct list_head *pages, unsigned nr_pages) +static void hpfs_readahead(struct readahead_control *rac) { - return mpage_readpages(mapping, pages, nr_pages, hpfs_get_block); + mpage_readahead(rac, hpfs_get_block); } static int hpfs_writepages(struct address_space *mapping, @@ -198,7 +197,7 @@ static int hpfs_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, const struct address_space_operations hpfs_aops = { .readpage = hpfs_readpage, .writepage = hpfs_writepage, - .readpages = hpfs_readpages, + .readahead = hpfs_readahead, .writepages = hpfs_writepages, .write_begin = hpfs_write_begin, .write_end = hpfs_write_end, diff --git a/fs/iomap/buffered-io.c b/fs/iomap/buffered-io.c index 89e21961d1ad..075db1e71b14 100644 --- a/fs/iomap/buffered-io.c +++ b/fs/iomap/buffered-io.c @@ -367,7 +367,7 @@ iomap_readpage(struct page *page, const struct iomap_ops *ops) } /* - * Just like mpage_readpages and block_read_full_page we always + * Just like mpage_readahead and block_read_full_page we always * return 0 and just mark the page as PageError on errors. This * should be cleaned up all through the stack eventually. 
*/ diff --git a/fs/isofs/inode.c b/fs/isofs/inode.c index 62c0462dc89f..95b1f377ad09 100644 --- a/fs/isofs/inode.c +++ b/fs/isofs/inode.c @@ -1185,10 +1185,9 @@ static int isofs_readpage(struct file *file, struct page *page) return mpage_readpage(page, isofs_get_block); } -static int isofs_readpages(struct file *file, struct address_space *mapping, - struct list_head *pages, unsigned nr_pages) +static void isofs_readahead(struct readahead_control *rac) { - return mpage_readpages(mapping, pages, nr_pages, isofs_get_block); + mpage_readahead(rac, isofs_get_block); } static sector_t _isofs_bmap(struct address_space *mapping, sector_t block) @@ -1198,7 +1197,7 @@ static sector_t _isofs_bmap(struct address_space *mapping, sector_t block) static const struct address_space_operations isofs_aops = { .readpage = isofs_readpage, - .readpages = isofs_readpages, + .readahead = isofs_readahead, .bmap = _isofs_bmap }; diff --git a/fs/jfs/inode.c b/fs/jfs/inode.c index 9486afcdac76..6f65bfa9f18d 100644 --- a/fs/jfs/inode.c +++ b/fs/jfs/inode.c @@ -296,10 +296,9 @@ static int jfs_readpage(struct file *file, struct page *page) return mpage_readpage(page, jfs_get_block); } -static int jfs_readpages(struct file *file, struct address_space *mapping, - struct list_head *pages, unsigned nr_pages) +static void jfs_readahead(struct readahead_control *rac) { - return mpage_readpages(mapping, pages, nr_pages, jfs_get_block); + mpage_readahead(rac, jfs_get_block); } static void jfs_write_failed(struct address_space *mapping, loff_t to) @@ -358,7 +357,7 @@ static ssize_t jfs_direct_IO(struct kiocb *iocb, struct iov_iter *iter) const struct address_space_operations jfs_aops = { .readpage = jfs_readpage, - .readpages = jfs_readpages, + .readahead = jfs_readahead, .writepage = jfs_writepage, .writepages = jfs_writepages, .write_begin = jfs_write_begin, diff --git a/fs/mpage.c b/fs/mpage.c index ccba3c4c4479..830e6cc2a9e7 100644 --- a/fs/mpage.c +++ b/fs/mpage.c @@ -91,7 +91,7 @@ mpage_alloc(struct block_device *bdev, } /* - * support function for mpage_readpages. The fs supplied get_block might + * support function for mpage_readahead. The fs supplied get_block might * return an up to date buffer. This is used to map that buffer into * the page, which allows readpage to avoid triggering a duplicate call * to get_block. @@ -338,13 +338,8 @@ confused: } /** - * mpage_readpages - populate an address space with some pages & start reads against them - * @mapping: the address_space - * @pages: The address of a list_head which contains the target pages. These - * pages have their ->index populated and are otherwise uninitialised. - * The page at @pages->prev has the lowest file offset, and reads should be - * issued in @pages->prev to @pages->next order. - * @nr_pages: The number of pages at *@pages + * mpage_readahead - start reads against pages + * @rac: Describes which pages to read. * @get_block: The filesystem's block mapper function. * * This function walks the pages and the blocks within each page, building and @@ -381,36 +376,25 @@ confused: * * This all causes the disk requests to be issued in the correct order. 
*/ -int -mpage_readpages(struct address_space *mapping, struct list_head *pages, - unsigned nr_pages, get_block_t get_block) +void mpage_readahead(struct readahead_control *rac, get_block_t get_block) { + struct page *page; struct mpage_readpage_args args = { .get_block = get_block, .is_readahead = true, }; - unsigned page_idx; - - for (page_idx = 0; page_idx < nr_pages; page_idx++) { - struct page *page = lru_to_page(pages); + while ((page = readahead_page(rac))) { prefetchw(&page->flags); - list_del(&page->lru); - if (!add_to_page_cache_lru(page, mapping, - page->index, - readahead_gfp_mask(mapping))) { - args.page = page; - args.nr_pages = nr_pages - page_idx; - args.bio = do_mpage_readpage(&args); - } + args.page = page; + args.nr_pages = readahead_count(rac); + args.bio = do_mpage_readpage(&args); put_page(page); } - BUG_ON(!list_empty(pages)); if (args.bio) mpage_bio_submit(REQ_OP_READ, REQ_RAHEAD, args.bio); - return 0; } -EXPORT_SYMBOL(mpage_readpages); +EXPORT_SYMBOL(mpage_readahead); /* * This isn't called much at all @@ -563,7 +547,7 @@ static int __mpage_writepage(struct page *page, struct writeback_control *wbc, * Page has buffers, but they are all unmapped. The page was * created by pagein or read over a hole which was handled by * block_read_full_page(). If this address_space is also - * using mpage_readpages then this can rarely happen. + * using mpage_readahead then this can rarely happen. */ goto confused; } diff --git a/fs/nilfs2/inode.c b/fs/nilfs2/inode.c index 671085512e0f..ceeb3b441844 100644 --- a/fs/nilfs2/inode.c +++ b/fs/nilfs2/inode.c @@ -145,18 +145,9 @@ static int nilfs_readpage(struct file *file, struct page *page) return mpage_readpage(page, nilfs_get_block); } -/** - * nilfs_readpages() - implement readpages() method of nilfs_aops {} - * address_space_operations. - * @file - file struct of the file to be read - * @mapping - address_space struct used for reading multiple pages - * @pages - the pages to be read - * @nr_pages - number of pages to be read - */ -static int nilfs_readpages(struct file *file, struct address_space *mapping, - struct list_head *pages, unsigned int nr_pages) +static void nilfs_readahead(struct readahead_control *rac) { - return mpage_readpages(mapping, pages, nr_pages, nilfs_get_block); + mpage_readahead(rac, nilfs_get_block); } static int nilfs_writepages(struct address_space *mapping, @@ -308,7 +299,7 @@ const struct address_space_operations nilfs_aops = { .readpage = nilfs_readpage, .writepages = nilfs_writepages, .set_page_dirty = nilfs_set_page_dirty, - .readpages = nilfs_readpages, + .readahead = nilfs_readahead, .write_begin = nilfs_write_begin, .write_end = nilfs_write_end, /* .releasepage = nilfs_releasepage, */ diff --git a/fs/ocfs2/aops.c b/fs/ocfs2/aops.c index 3a67a6518ddf..3bfb4147895a 100644 --- a/fs/ocfs2/aops.c +++ b/fs/ocfs2/aops.c @@ -350,14 +350,11 @@ out: * grow out to a tree. If need be, detecting boundary extents could * trivially be added in a future version of ocfs2_get_block(). 
*/ -static int ocfs2_readpages(struct file *filp, struct address_space *mapping, - struct list_head *pages, unsigned nr_pages) +static void ocfs2_readahead(struct readahead_control *rac) { - int ret, err = -EIO; - struct inode *inode = mapping->host; + int ret; + struct inode *inode = rac->mapping->host; struct ocfs2_inode_info *oi = OCFS2_I(inode); - loff_t start; - struct page *last; /* * Use the nonblocking flag for the dlm code to avoid page @@ -365,36 +362,31 @@ static int ocfs2_readpages(struct file *filp, struct address_space *mapping, */ ret = ocfs2_inode_lock_full(inode, NULL, 0, OCFS2_LOCK_NONBLOCK); if (ret) - return err; + return; - if (down_read_trylock(&oi->ip_alloc_sem) == 0) { - ocfs2_inode_unlock(inode, 0); - return err; - } + if (down_read_trylock(&oi->ip_alloc_sem) == 0) + goto out_unlock; /* * Don't bother with inline-data. There isn't anything * to read-ahead in that case anyway... */ if (oi->ip_dyn_features & OCFS2_INLINE_DATA_FL) - goto out_unlock; + goto out_up; /* * Check whether a remote node truncated this file - we just * drop out in that case as it's not worth handling here. */ - last = lru_to_page(pages); - start = (loff_t)last->index << PAGE_SHIFT; - if (start >= i_size_read(inode)) - goto out_unlock; + if (readahead_pos(rac) >= i_size_read(inode)) + goto out_up; - err = mpage_readpages(mapping, pages, nr_pages, ocfs2_get_block); + mpage_readahead(rac, ocfs2_get_block); -out_unlock: +out_up: up_read(&oi->ip_alloc_sem); +out_unlock: ocfs2_inode_unlock(inode, 0); - - return err; } /* Note: Because we don't support holes, our allocation has @@ -2474,7 +2466,7 @@ static ssize_t ocfs2_direct_IO(struct kiocb *iocb, struct iov_iter *iter) const struct address_space_operations ocfs2_aops = { .readpage = ocfs2_readpage, - .readpages = ocfs2_readpages, + .readahead = ocfs2_readahead, .writepage = ocfs2_writepage, .write_begin = ocfs2_write_begin, .write_end = ocfs2_write_end, diff --git a/fs/omfs/file.c b/fs/omfs/file.c index d640b9388238..d7b5f09d298c 100644 --- a/fs/omfs/file.c +++ b/fs/omfs/file.c @@ -289,10 +289,9 @@ static int omfs_readpage(struct file *file, struct page *page) return block_read_full_page(page, omfs_get_block); } -static int omfs_readpages(struct file *file, struct address_space *mapping, - struct list_head *pages, unsigned nr_pages) +static void omfs_readahead(struct readahead_control *rac) { - return mpage_readpages(mapping, pages, nr_pages, omfs_get_block); + mpage_readahead(rac, omfs_get_block); } static int omfs_writepage(struct page *page, struct writeback_control *wbc) @@ -373,7 +372,7 @@ const struct inode_operations omfs_file_inops = { const struct address_space_operations omfs_aops = { .readpage = omfs_readpage, - .readpages = omfs_readpages, + .readahead = omfs_readahead, .writepage = omfs_writepage, .writepages = omfs_writepages, .write_begin = omfs_write_begin, diff --git a/fs/qnx6/inode.c b/fs/qnx6/inode.c index 345db56c98fd..755293c8c71a 100644 --- a/fs/qnx6/inode.c +++ b/fs/qnx6/inode.c @@ -99,10 +99,9 @@ static int qnx6_readpage(struct file *file, struct page *page) return mpage_readpage(page, qnx6_get_block); } -static int qnx6_readpages(struct file *file, struct address_space *mapping, - struct list_head *pages, unsigned nr_pages) +static void qnx6_readahead(struct readahead_control *rac) { - return mpage_readpages(mapping, pages, nr_pages, qnx6_get_block); + mpage_readahead(rac, qnx6_get_block); } /* @@ -499,7 +498,7 @@ static sector_t qnx6_bmap(struct address_space *mapping, sector_t block) } static const struct 
address_space_operations qnx6_aops = { .readpage = qnx6_readpage, - .readpages = qnx6_readpages, + .readahead = qnx6_readahead, .bmap = qnx6_bmap }; diff --git a/fs/reiserfs/inode.c b/fs/reiserfs/inode.c index 6419e6dacc39..0031070b3692 100644 --- a/fs/reiserfs/inode.c +++ b/fs/reiserfs/inode.c @@ -1160,11 +1160,9 @@ failure: return retval; } -static int -reiserfs_readpages(struct file *file, struct address_space *mapping, - struct list_head *pages, unsigned nr_pages) +static void reiserfs_readahead(struct readahead_control *rac) { - return mpage_readpages(mapping, pages, nr_pages, reiserfs_get_block); + mpage_readahead(rac, reiserfs_get_block); } /* @@ -3434,7 +3432,7 @@ out: const struct address_space_operations reiserfs_address_space_operations = { .writepage = reiserfs_writepage, .readpage = reiserfs_readpage, - .readpages = reiserfs_readpages, + .readahead = reiserfs_readahead, .releasepage = reiserfs_releasepage, .invalidatepage = reiserfs_invalidatepage, .write_begin = reiserfs_write_begin, diff --git a/fs/udf/inode.c b/fs/udf/inode.c index e875bc5668ee..adaba8e8b326 100644 --- a/fs/udf/inode.c +++ b/fs/udf/inode.c @@ -195,10 +195,9 @@ static int udf_readpage(struct file *file, struct page *page) return mpage_readpage(page, udf_get_block); } -static int udf_readpages(struct file *file, struct address_space *mapping, - struct list_head *pages, unsigned nr_pages) +static void udf_readahead(struct readahead_control *rac) { - return mpage_readpages(mapping, pages, nr_pages, udf_get_block); + mpage_readahead(rac, udf_get_block); } static int udf_write_begin(struct file *file, struct address_space *mapping, @@ -234,7 +233,7 @@ static sector_t udf_bmap(struct address_space *mapping, sector_t block) const struct address_space_operations udf_aops = { .readpage = udf_readpage, - .readpages = udf_readpages, + .readahead = udf_readahead, .writepage = udf_writepage, .writepages = udf_writepages, .write_begin = udf_write_begin, diff --git a/include/linux/mpage.h b/include/linux/mpage.h index 001f1fcf9836..f4f5e90a6844 100644 --- a/include/linux/mpage.h +++ b/include/linux/mpage.h @@ -13,9 +13,9 @@ #ifdef CONFIG_BLOCK struct writeback_control; +struct readahead_control; -int mpage_readpages(struct address_space *mapping, struct list_head *pages, - unsigned nr_pages, get_block_t get_block); +void mpage_readahead(struct readahead_control *, get_block_t get_block); int mpage_readpage(struct page *page, get_block_t get_block); int mpage_writepages(struct address_space *mapping, struct writeback_control *wbc, get_block_t get_block); diff --git a/mm/migrate.c b/mm/migrate.c index 7160c1556f79..f66f93f9a5e2 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -1032,7 +1032,7 @@ static int __unmap_and_move(struct page *page, struct page *newpage, * to the LRU. Later, when the IO completes the pages are * marked uptodate and unlocked. However, the queueing * could be merging multiple pages for one bio (e.g. - * mpage_readpages). If an allocation happens for the + * mpage_readahead). If an allocation happens for the * second or third page, the process can end up locking * the same page twice and deadlocking. Rather than * trying to be clever about what pages can be locked, -- cgit v1.2.3 From 9d24a13a93d995e4c980fdaa389aa3e2f1ea0b12 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Mon, 1 Jun 2020 21:47:34 -0700 Subject: iomap: convert from readpages to readahead Use the new readahead operation in iomap. Convert XFS and ZoneFS to use it. 
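For an iomap-based filesystem the conversion is equally mechanical. A sketch for a hypothetical filesystem (the examplefs names are placeholders, assuming an existing iomap_ops for reads):

	static void examplefs_readahead(struct readahead_control *rac)
	{
		/* iomap_readahead() maps extents through the ops vector and submits read bios */
		iomap_readahead(rac, &examplefs_iomap_ops);
	}

as the XFS and ZoneFS hunks below illustrate.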
Signed-off-by: Matthew Wilcox (Oracle) Signed-off-by: Andrew Morton Reviewed-by: Christoph Hellwig Reviewed-by: Darrick J. Wong Reviewed-by: William Kucharski Cc: Chao Yu Cc: Cong Wang Cc: Dave Chinner Cc: Eric Biggers Cc: Gao Xiang Cc: Jaegeuk Kim Cc: John Hubbard Cc: Joseph Qi Cc: Junxiao Bi Cc: Michal Hocko Cc: Zi Yan Cc: Johannes Thumshirn Cc: Miklos Szeredi Link: http://lkml.kernel.org/r/20200414150233.24495-26-willy@infradead.org Signed-off-by: Linus Torvalds --- fs/iomap/buffered-io.c | 90 +++++++++++++++++--------------------------------- fs/iomap/trace.h | 2 +- fs/xfs/xfs_aops.c | 13 +++----- fs/zonefs/super.c | 7 ++-- include/linux/iomap.h | 3 +- 5 files changed, 41 insertions(+), 74 deletions(-) (limited to 'include/linux') diff --git a/fs/iomap/buffered-io.c b/fs/iomap/buffered-io.c index 075db1e71b14..890c8fcda4f3 100644 --- a/fs/iomap/buffered-io.c +++ b/fs/iomap/buffered-io.c @@ -214,9 +214,8 @@ iomap_read_end_io(struct bio *bio) struct iomap_readpage_ctx { struct page *cur_page; bool cur_page_in_bio; - bool is_readahead; struct bio *bio; - struct list_head *pages; + struct readahead_control *rac; }; static void @@ -308,7 +307,7 @@ iomap_readpage_actor(struct inode *inode, loff_t pos, loff_t length, void *data, if (ctx->bio) submit_bio(ctx->bio); - if (ctx->is_readahead) /* same as readahead_gfp_mask */ + if (ctx->rac) /* same as readahead_gfp_mask */ gfp |= __GFP_NORETRY | __GFP_NOWARN; ctx->bio = bio_alloc(gfp, min(BIO_MAX_PAGES, nr_vecs)); /* @@ -319,7 +318,7 @@ iomap_readpage_actor(struct inode *inode, loff_t pos, loff_t length, void *data, if (!ctx->bio) ctx->bio = bio_alloc(orig_gfp, 1); ctx->bio->bi_opf = REQ_OP_READ; - if (ctx->is_readahead) + if (ctx->rac) ctx->bio->bi_opf |= REQ_RAHEAD; ctx->bio->bi_iter.bi_sector = sector; bio_set_dev(ctx->bio, iomap->bdev); @@ -375,36 +374,8 @@ iomap_readpage(struct page *page, const struct iomap_ops *ops) } EXPORT_SYMBOL_GPL(iomap_readpage); -static struct page * -iomap_next_page(struct inode *inode, struct list_head *pages, loff_t pos, - loff_t length, loff_t *done) -{ - while (!list_empty(pages)) { - struct page *page = lru_to_page(pages); - - if (page_offset(page) >= (u64)pos + length) - break; - - list_del(&page->lru); - if (!add_to_page_cache_lru(page, inode->i_mapping, page->index, - GFP_NOFS)) - return page; - - /* - * If we already have a page in the page cache at index we are - * done. Upper layers don't care if it is uptodate after the - * readpages call itself as every page gets checked again once - * actually needed. - */ - *done += PAGE_SIZE; - put_page(page); - } - - return NULL; -} - static loff_t -iomap_readpages_actor(struct inode *inode, loff_t pos, loff_t length, +iomap_readahead_actor(struct inode *inode, loff_t pos, loff_t length, void *data, struct iomap *iomap, struct iomap *srcmap) { struct iomap_readpage_ctx *ctx = data; @@ -418,10 +389,7 @@ iomap_readpages_actor(struct inode *inode, loff_t pos, loff_t length, ctx->cur_page = NULL; } if (!ctx->cur_page) { - ctx->cur_page = iomap_next_page(inode, ctx->pages, - pos, length, &done); - if (!ctx->cur_page) - break; + ctx->cur_page = readahead_page(ctx->rac); ctx->cur_page_in_bio = false; } ret = iomap_readpage_actor(inode, pos + done, length - done, @@ -431,32 +399,43 @@ iomap_readpages_actor(struct inode *inode, loff_t pos, loff_t length, return done; } -int -iomap_readpages(struct address_space *mapping, struct list_head *pages, - unsigned nr_pages, const struct iomap_ops *ops) +/** + * iomap_readahead - Attempt to read pages from a file. 
+ * @rac: Describes the pages to be read. + * @ops: The operations vector for the filesystem. + * + * This function is for filesystems to call to implement their readahead + * address_space operation. + * + * Context: The @ops callbacks may submit I/O (eg to read the addresses of + * blocks from disc), and may wait for it. The caller may be trying to + * access a different page, and so sleeping excessively should be avoided. + * It may allocate memory, but should avoid costly allocations. This + * function is called with memalloc_nofs set, so allocations will not cause + * the filesystem to be reentered. + */ +void iomap_readahead(struct readahead_control *rac, const struct iomap_ops *ops) { + struct inode *inode = rac->mapping->host; + loff_t pos = readahead_pos(rac); + loff_t length = readahead_length(rac); struct iomap_readpage_ctx ctx = { - .pages = pages, - .is_readahead = true, + .rac = rac, }; - loff_t pos = page_offset(list_entry(pages->prev, struct page, lru)); - loff_t last = page_offset(list_entry(pages->next, struct page, lru)); - loff_t length = last - pos + PAGE_SIZE, ret = 0; - trace_iomap_readpages(mapping->host, nr_pages); + trace_iomap_readahead(inode, readahead_count(rac)); while (length > 0) { - ret = iomap_apply(mapping->host, pos, length, 0, ops, - &ctx, iomap_readpages_actor); + loff_t ret = iomap_apply(inode, pos, length, 0, ops, + &ctx, iomap_readahead_actor); if (ret <= 0) { WARN_ON_ONCE(ret == 0); - goto done; + break; } pos += ret; length -= ret; } - ret = 0; -done: + if (ctx.bio) submit_bio(ctx.bio); if (ctx.cur_page) { @@ -464,15 +443,8 @@ done: unlock_page(ctx.cur_page); put_page(ctx.cur_page); } - - /* - * Check that we didn't lose a page due to the arcance calling - * conventions.. - */ - WARN_ON_ONCE(!ret && !list_empty(ctx.pages)); - return ret; } -EXPORT_SYMBOL_GPL(iomap_readpages); +EXPORT_SYMBOL_GPL(iomap_readahead); /* * iomap_is_partially_uptodate checks whether blocks within a page are diff --git a/fs/iomap/trace.h b/fs/iomap/trace.h index 4df19c66f597..5693a39d52fb 100644 --- a/fs/iomap/trace.h +++ b/fs/iomap/trace.h @@ -39,7 +39,7 @@ DEFINE_EVENT(iomap_readpage_class, name, \ TP_PROTO(struct inode *inode, int nr_pages), \ TP_ARGS(inode, nr_pages)) DEFINE_READPAGE_EVENT(iomap_readpage); -DEFINE_READPAGE_EVENT(iomap_readpages); +DEFINE_READPAGE_EVENT(iomap_readahead); DECLARE_EVENT_CLASS(iomap_range_class, TP_PROTO(struct inode *inode, unsigned long off, unsigned int len), diff --git a/fs/xfs/xfs_aops.c b/fs/xfs/xfs_aops.c index 9d9cebf18726..1fd4fb7a607c 100644 --- a/fs/xfs/xfs_aops.c +++ b/fs/xfs/xfs_aops.c @@ -621,14 +621,11 @@ xfs_vm_readpage( return iomap_readpage(page, &xfs_read_iomap_ops); } -STATIC int -xfs_vm_readpages( - struct file *unused, - struct address_space *mapping, - struct list_head *pages, - unsigned nr_pages) +STATIC void +xfs_vm_readahead( + struct readahead_control *rac) { - return iomap_readpages(mapping, pages, nr_pages, &xfs_read_iomap_ops); + iomap_readahead(rac, &xfs_read_iomap_ops); } static int @@ -644,7 +641,7 @@ xfs_iomap_swapfile_activate( const struct address_space_operations xfs_address_space_operations = { .readpage = xfs_vm_readpage, - .readpages = xfs_vm_readpages, + .readahead = xfs_vm_readahead, .writepage = xfs_vm_writepage, .writepages = xfs_vm_writepages, .set_page_dirty = iomap_set_page_dirty, diff --git a/fs/zonefs/super.c b/fs/zonefs/super.c index 3ce9829a6936..dba874a61fc5 100644 --- a/fs/zonefs/super.c +++ b/fs/zonefs/super.c @@ -78,10 +78,9 @@ static int zonefs_readpage(struct file *unused, struct 
page *page) return iomap_readpage(page, &zonefs_iomap_ops); } -static int zonefs_readpages(struct file *unused, struct address_space *mapping, - struct list_head *pages, unsigned int nr_pages) +static void zonefs_readahead(struct readahead_control *rac) { - return iomap_readpages(mapping, pages, nr_pages, &zonefs_iomap_ops); + iomap_readahead(rac, &zonefs_iomap_ops); } /* @@ -128,7 +127,7 @@ static int zonefs_writepages(struct address_space *mapping, static const struct address_space_operations zonefs_file_aops = { .readpage = zonefs_readpage, - .readpages = zonefs_readpages, + .readahead = zonefs_readahead, .writepage = zonefs_writepage, .writepages = zonefs_writepages, .set_page_dirty = iomap_set_page_dirty, diff --git a/include/linux/iomap.h b/include/linux/iomap.h index 8b09463dae0d..bc20bd04c2a2 100644 --- a/include/linux/iomap.h +++ b/include/linux/iomap.h @@ -155,8 +155,7 @@ loff_t iomap_apply(struct inode *inode, loff_t pos, loff_t length, ssize_t iomap_file_buffered_write(struct kiocb *iocb, struct iov_iter *from, const struct iomap_ops *ops); int iomap_readpage(struct page *page, const struct iomap_ops *ops); -int iomap_readpages(struct address_space *mapping, struct list_head *pages, - unsigned nr_pages, const struct iomap_ops *ops); +void iomap_readahead(struct readahead_control *, const struct iomap_ops *ops); int iomap_set_page_dirty(struct page *page); int iomap_is_partially_uptodate(struct page *page, unsigned long from, unsigned long count); -- cgit v1.2.3 From b03143accd9274d9e024da42ed5857a71a6b6a27 Mon Sep 17 00:00:00 2001 From: Guoqing Jiang Date: Mon, 1 Jun 2020 21:47:38 -0700 Subject: include/linux/pagemap.h: introduce attach/detach_page_private MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Patch series "Introduce attach/detach_page_private to cleanup code". This patch (of 10): The logic in attach_page_buffers and __clear_page_buffers are quite paired, but 1. they are located in different files. 2. attach_page_buffers is implemented in buffer_head.h, so it could be used by other files. But __clear_page_buffers is static function in buffer.c and other potential users can't call the function, md-bitmap even copied the function. So, introduce the new attach/detach_page_private to replace them. With the new pair of function, we will remove the usage of attach_page_buffers and __clear_page_buffers in next patches. Thanks for suggestions about the function name from Alexander Viro, Andreas Grünbacher, Christoph Hellwig and Matthew Wilcox. Suggested-by: Matthew Wilcox Signed-off-by: Guoqing Jiang Signed-off-by: Andrew Morton Reviewed-by: Andrew Morton Cc: "Darrick J. Wong" Cc: William Kucharski Cc: "Kirill A. 
Shutemov" Cc: Andreas Gruenbacher Cc: Yang Shi Cc: Yafang Shao Cc: Song Liu Cc: Chris Mason Cc: Josef Bacik Cc: David Sterba Cc: Alexander Viro Cc: Jaegeuk Kim Cc: Chao Yu Cc: Christoph Hellwig Cc: Anton Altaparmakov Cc: Mike Marshall Cc: Martin Brandenburg Cc: Thomas Gleixner Cc: Sebastian Andrzej Siewior Cc: Roman Gushchin Cc: Andreas Dilger Cc: Chao Yu Cc: Dave Chinner Link: http://lkml.kernel.org/r/20200517214718.468-1-guoqing.jiang@cloud.ionos.com Link: http://lkml.kernel.org/r/20200517214718.468-2-guoqing.jiang@cloud.ionos.com Signed-off-by: Linus Torvalds --- include/linux/pagemap.h | 37 +++++++++++++++++++++++++++++++++++++ 1 file changed, 37 insertions(+) (limited to 'include/linux') diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h index c6348c50136f..8e085713150c 100644 --- a/include/linux/pagemap.h +++ b/include/linux/pagemap.h @@ -208,6 +208,43 @@ static inline int page_cache_add_speculative(struct page *page, int count) return __page_cache_add_speculative(page, count); } +/** + * attach_page_private - Attach private data to a page. + * @page: Page to attach data to. + * @data: Data to attach to page. + * + * Attaching private data to a page increments the page's reference count. + * The data must be detached before the page will be freed. + */ +static inline void attach_page_private(struct page *page, void *data) +{ + get_page(page); + set_page_private(page, (unsigned long)data); + SetPagePrivate(page); +} + +/** + * detach_page_private - Detach private data from a page. + * @page: Page to detach data from. + * + * Removes the data that was previously attached to the page and decrements + * the refcount on the page. + * + * Return: Data that was attached to the page. + */ +static inline void *detach_page_private(struct page *page) +{ + void *data = (void *)page_private(page); + + if (!PagePrivate(page)) + return NULL; + ClearPagePrivate(page); + set_page_private(page, 0); + put_page(page); + + return data; +} + #ifdef CONFIG_NUMA extern struct page *__page_cache_alloc(gfp_t gfp); #else -- cgit v1.2.3 From 7b59435a2afed12dc9b2ec1b930efa2e94f1c397 Mon Sep 17 00:00:00 2001 From: Guoqing Jiang Date: Mon, 1 Jun 2020 21:48:03 -0700 Subject: buffer_head.h: remove attach_page_buffers All the callers have replaced attach_page_buffers with the new function attach_page_private, so remove it. Signed-off-by: Guoqing Jiang Signed-off-by: Andrew Morton Reviewed-by: Andrew Morton Cc: Thomas Gleixner Cc: Sebastian Andrzej Siewior Cc: Roman Gushchin Cc: Andreas Dilger Link: http://lkml.kernel.org/r/20200517214718.468-10-guoqing.jiang@cloud.ionos.com Signed-off-by: Linus Torvalds --- include/linux/buffer_head.h | 8 -------- 1 file changed, 8 deletions(-) (limited to 'include/linux') diff --git a/include/linux/buffer_head.h b/include/linux/buffer_head.h index 15b765a181b8..22fb11e2d2e0 100644 --- a/include/linux/buffer_head.h +++ b/include/linux/buffer_head.h @@ -272,14 +272,6 @@ void buffer_init(void); * inline definitions */ -static inline void attach_page_buffers(struct page *page, - struct buffer_head *head) -{ - get_page(page); - SetPagePrivate(page); - set_page_private(page, (unsigned long)head); -} - static inline void get_bh(struct buffer_head *bh) { atomic_inc(&bh->b_count); -- cgit v1.2.3 From 60e65a6f42d0af33fa7361bd8723adc70539121b Mon Sep 17 00:00:00 2001 From: Guoqing Jiang Date: Mon, 1 Jun 2020 21:48:09 -0700 Subject: mm_types.h: change set_page_private to inline function Change it to inline function to make callers use the proper argument. 
And there is no need for it to be a macro, per Andrew's comment [1]. [1] https://lore.kernel.org/lkml/20200518221235.1fa32c38e5766113f78e3f0d@linux-foundation.org/ Signed-off-by: Guoqing Jiang Signed-off-by: Andrew Morton Reviewed-by: Andrew Morton Link: http://lkml.kernel.org/r/20200525203149.18802-1-guoqing.jiang@cloud.ionos.com Signed-off-by: Linus Torvalds --- include/linux/mm_types.h | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index 4aba6c0c2ba8..ef6d3aface8a 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -240,7 +240,11 @@ static inline atomic_t *compound_pincount_ptr(struct page *page) #define PAGE_FRAG_CACHE_MAX_ORDER get_order(PAGE_FRAG_CACHE_MAX_SIZE) #define page_private(page) ((page)->private) -#define set_page_private(page, v) ((page)->private = (v)) + +static inline void set_page_private(struct page *page, unsigned long private) +{ + page->private = private; +} struct page_frag_cache { void * va; -- cgit v1.2.3 From a37b0715ddf3007734c4e2424c14bc7efcdd1190 Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Mon, 1 Jun 2020 21:48:18 -0700 Subject: mm/writeback: replace PF_LESS_THROTTLE with PF_LOCAL_THROTTLE PF_LESS_THROTTLE exists for loop-back nfsd (and a similar need in the loop block driver and callers of prctl(PR_SET_IO_FLUSHER)), where a daemon needs to write to one bdi (the final bdi) in order to free up writes queued to another bdi (the client bdi). The daemon sets PF_LESS_THROTTLE and gets a larger allowance of dirty pages, so that it can still dirty pages after other processes have been throttled. The purpose of this is to avoid deadlocks that happen when the PF_LESS_THROTTLE process must write for any dirty pages to be freed, but it is being throttled and cannot write. This approach was designed when all threads were blocked equally, independently of which device they were writing to, or how fast it was. Since that time the writeback algorithm has changed substantially, with different threads getting different allowances based on non-trivial heuristics. This means the simple "add 25%" heuristic is no longer reliable. The important issue is not that the daemon needs a *larger* dirty page allowance, but that it needs a *private* dirty page allowance, so that dirty pages for the "client" bdi that it is helping to clear (the bdi for an NFS filesystem or loop block device etc) do not affect the throttling of the daemon writing to the "final" bdi. This patch changes the heuristic so that the task is not throttled when the bdi it is writing to has a dirty page count below (or equal to) the free-run threshold for that bdi. This ensures it will always be able to have some pages in flight, and so will not deadlock. In a steady-state, it is expected that PF_LOCAL_THROTTLE tasks might still be throttled by the global threshold, but that is acceptable as it is only the deadlock state that is interesting for this flag. This approach of "only throttle when target bdi is busy" is consistent with the other use of PF_LESS_THROTTLE in current_may_throttle(), where it causes attention to be focussed only on the target bdi. So this patch - renames PF_LESS_THROTTLE to PF_LOCAL_THROTTLE, - removes the 25% bonus that that flag gives, and - If PF_LOCAL_THROTTLE is set, don't delay at all unless the global and the local free-run thresholds are exceeded. Note that previously realtime threads were treated the same as PF_LESS_THROTTLE threads.
This patch does *not* change the behaviour for real-time threads, so it is now different from the behaviour of nfsd and loop tasks. I don't know what is wanted for realtime. [akpm@linux-foundation.org: coding style fixes] Signed-off-by: NeilBrown Signed-off-by: Andrew Morton Reviewed-by: Jan Kara Acked-by: Chuck Lever [nfsd] Cc: Christoph Hellwig Cc: Michal Hocko Cc: Trond Myklebust Link: http://lkml.kernel.org/r/87ftbf7gs3.fsf@notabene.neil.brown.name Signed-off-by: Linus Torvalds --- drivers/block/loop.c | 2 +- fs/nfsd/vfs.c | 9 +++++---- include/linux/sched.h | 3 ++- kernel/sys.c | 2 +- mm/page-writeback.c | 41 +++++++++++++++++++++++++++++++++-------- mm/vmscan.c | 4 ++-- 6 files changed, 44 insertions(+), 17 deletions(-) (limited to 'include/linux') diff --git a/drivers/block/loop.c b/drivers/block/loop.c index da693e6a834e..d89c25ba3b89 100644 --- a/drivers/block/loop.c +++ b/drivers/block/loop.c @@ -919,7 +919,7 @@ static void loop_unprepare_queue(struct loop_device *lo) static int loop_kthread_worker_fn(void *worker_ptr) { - current->flags |= PF_LESS_THROTTLE | PF_MEMALLOC_NOIO; + current->flags |= PF_LOCAL_THROTTLE | PF_MEMALLOC_NOIO; return kthread_worker_fn(worker_ptr); } diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c index 0aa02eb18bd3..c3fbab1753ec 100644 --- a/fs/nfsd/vfs.c +++ b/fs/nfsd/vfs.c @@ -979,12 +979,13 @@ nfsd_vfs_write(struct svc_rqst *rqstp, struct svc_fh *fhp, struct nfsd_file *nf, if (test_bit(RQ_LOCAL, &rqstp->rq_flags)) /* - * We want less throttling in balance_dirty_pages() - * and shrink_inactive_list() so that nfs to + * We want throttling in balance_dirty_pages() + * and shrink_inactive_list() to only consider + * the backingdev we are writing to, so that nfs to * localhost doesn't cause nfsd to lock up due to all * the client's dirty pages or its congested queue. */ - current->flags |= PF_LESS_THROTTLE; + current->flags |= PF_LOCAL_THROTTLE; exp = fhp->fh_export; use_wgather = (rqstp->rq_vers == 2) && EX_WGATHER(exp); @@ -1037,7 +1038,7 @@ out_nfserr: nfserr = nfserrno(host_err); } if (test_bit(RQ_LOCAL, &rqstp->rq_flags)) - current_restore_flags(pflags, PF_LESS_THROTTLE); + current_restore_flags(pflags, PF_LOCAL_THROTTLE); return nfserr; } diff --git a/include/linux/sched.h b/include/linux/sched.h index 4418f5cb8324..12ef0c753284 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1481,7 +1481,8 @@ extern struct pid *cad_pid; #define PF_KSWAPD 0x00020000 /* I am kswapd */ #define PF_MEMALLOC_NOFS 0x00040000 /* All allocation requests will inherit GFP_NOFS */ #define PF_MEMALLOC_NOIO 0x00080000 /* All allocation requests will inherit GFP_NOIO */ -#define PF_LESS_THROTTLE 0x00100000 /* Throttle me less: I clean memory */ +#define PF_LOCAL_THROTTLE 0x00100000 /* Throttle writes only against the bdi I write to, + * I am cleaning dirty pages from some other bdi.
*/ #define PF_KTHREAD 0x00200000 /* I am a kernel thread */ #define PF_RANDOMIZE 0x00400000 /* Randomize virtual address space */ #define PF_SWAPWRITE 0x00800000 /* Allowed to write to swap */ diff --git a/kernel/sys.c b/kernel/sys.c index d325f3ab624a..180a2fa33f7f 100644 --- a/kernel/sys.c +++ b/kernel/sys.c @@ -2262,7 +2262,7 @@ int __weak arch_prctl_spec_ctrl_set(struct task_struct *t, unsigned long which, return -EINVAL; } -#define PR_IO_FLUSHER (PF_MEMALLOC_NOIO | PF_LESS_THROTTLE) +#define PR_IO_FLUSHER (PF_MEMALLOC_NOIO | PF_LOCAL_THROTTLE) SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3, unsigned long, arg4, unsigned long, arg5) diff --git a/mm/page-writeback.c b/mm/page-writeback.c index 2df6fb174983..7ff2290cf43d 100644 --- a/mm/page-writeback.c +++ b/mm/page-writeback.c @@ -387,8 +387,7 @@ static unsigned long global_dirtyable_memory(void) * Calculate @dtc->thresh and ->bg_thresh considering * vm_dirty_{bytes|ratio} and dirty_background_{bytes|ratio}. The caller * must ensure that @dtc->avail is set before calling this function. The - * dirty limits will be lifted by 1/4 for PF_LESS_THROTTLE (ie. nfsd) and - * real-time tasks. + * dirty limits will be lifted by 1/4 for real-time tasks. */ static void domain_dirty_limits(struct dirty_throttle_control *dtc) { @@ -436,7 +435,7 @@ static void domain_dirty_limits(struct dirty_throttle_control *dtc) if (bg_thresh >= thresh) bg_thresh = thresh / 2; tsk = current; - if (tsk->flags & PF_LESS_THROTTLE || rt_task(tsk)) { + if (rt_task(tsk)) { bg_thresh += bg_thresh / 4 + global_wb_domain.dirty_limit / 32; thresh += thresh / 4 + global_wb_domain.dirty_limit / 32; } @@ -486,7 +485,7 @@ static unsigned long node_dirty_limit(struct pglist_data *pgdat) else dirty = vm_dirty_ratio * node_memory / 100; - if (tsk->flags & PF_LESS_THROTTLE || rt_task(tsk)) + if (rt_task(tsk)) dirty += dirty / 4; return dirty; @@ -1653,8 +1652,12 @@ static void balance_dirty_pages(struct bdi_writeback *wb, if (dirty <= dirty_freerun_ceiling(thresh, bg_thresh) && (!mdtc || m_dirty <= dirty_freerun_ceiling(m_thresh, m_bg_thresh))) { - unsigned long intv = dirty_poll_interval(dirty, thresh); - unsigned long m_intv = ULONG_MAX; + unsigned long intv; + unsigned long m_intv; + +free_running: + intv = dirty_poll_interval(dirty, thresh); + m_intv = ULONG_MAX; current->dirty_paused_when = now; current->nr_dirtied = 0; @@ -1673,9 +1676,20 @@ static void balance_dirty_pages(struct bdi_writeback *wb, * Calculate global domain's pos_ratio and select the * global dtc by default. */ - if (!strictlimit) + if (!strictlimit) { wb_dirty_limits(gdtc); + if ((current->flags & PF_LOCAL_THROTTLE) && + gdtc->wb_dirty < + dirty_freerun_ceiling(gdtc->wb_thresh, + gdtc->wb_bg_thresh)) + /* + * LOCAL_THROTTLE tasks must not be throttled + * when below the per-wb freerun ceiling. + */ + goto free_running; + } + dirty_exceeded = (gdtc->wb_dirty > gdtc->wb_thresh) && ((gdtc->dirty > gdtc->thresh) || strictlimit); @@ -1689,9 +1703,20 @@ static void balance_dirty_pages(struct bdi_writeback *wb, * both global and memcg domains. Choose the one * w/ lower pos_ratio. */ - if (!strictlimit) + if (!strictlimit) { wb_dirty_limits(mdtc); + if ((current->flags & PF_LOCAL_THROTTLE) && + mdtc->wb_dirty < + dirty_freerun_ceiling(mdtc->wb_thresh, + mdtc->wb_bg_thresh)) + /* + * LOCAL_THROTTLE tasks must not be + * throttled when below the per-wb + * freerun ceiling. 
+ */ + goto free_running; + } dirty_exceeded |= (mdtc->wb_dirty > mdtc->wb_thresh) && ((mdtc->dirty > mdtc->thresh) || strictlimit); diff --git a/mm/vmscan.c b/mm/vmscan.c index a37c87b5aee2..b2f5deb3603c 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -1878,13 +1878,13 @@ static unsigned noinline_for_stack move_pages_to_lru(struct lruvec *lruvec, /* * If a kernel thread (such as nfsd for loop-back mounts) services - * a backing device by writing to the page cache it sets PF_LESS_THROTTLE. + * a backing device by writing to the page cache it sets PF_LOCAL_THROTTLE. * In that case we should only throttle if the backing device it is * writing to is congested. In other cases it is safe to throttle. */ static int current_may_throttle(void) { - return !(current->flags & PF_LESS_THROTTLE) || + return !(current->flags & PF_LOCAL_THROTTLE) || current->backing_dev_info == NULL || bdi_write_congested(current->backing_dev_info); } -- cgit v1.2.3 From 8d92890bd6b8502d6aee4b37430ae6444ade7a8c Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Mon, 1 Jun 2020 21:48:21 -0700 Subject: mm/writeback: discard NR_UNSTABLE_NFS, use NR_WRITEBACK instead After an NFS page has been written it is considered "unstable" until a COMMIT request succeeds. If the COMMIT fails, the page will be re-written. These "unstable" pages are currently accounted as "reclaimable", either in WB_RECLAIMABLE, or in NR_UNSTABLE_NFS which is included in a 'reclaimable' count. This might have made sense when sending the COMMIT required a separate action by the VFS/MM (e.g. releasepage() used to send a COMMIT). However now that all writes generated by ->writepages() will automatically be followed by a COMMIT (since commit 919e3bd9a875 ("NFS: Ensure we commit after writeback is complete")) it makes more sense to treat them as writeback pages. So this patch removes NR_UNSTABLE_NFS and accounts unstable pages in NR_WRITEBACK and WB_WRITEBACK. A particular effect of this change is that when wb_check_background_flush() calls wb_over_bg_threshold(), the latter will report 'true' a lot less often, as the 'unstable' pages are no longer considered 'dirty' (there is nothing that writeback can do about them anyway). Currently wb_check_background_flush() will trigger writeback to NFS even when there are relatively few dirty pages (if there are lots of unstable pages); this can result in small writes going to the server (tens of kilobytes rather than a megabyte), which hurts throughput. With this patch, there are fewer writes, which are each larger on average. Where the NR_UNSTABLE_NFS count was included in statistics virtual files, the entry is retained, but the value is hard-coded as zero. Static trace points and warning printks which mentioned this counter no longer report it.
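Condensed from the NFS hunks below, the essence of the accounting change is that the writeback counters now bracket the whole commit window, where the 'reclaimable' counters used to (the real call sites are nfs_mark_page_unstable() and nfs_clear_page_commit()):

        /* page written, COMMIT to the server still outstanding */
        inc_node_page_state(page, NR_WRITEBACK);
        inc_wb_stat(&inode_to_bdi(inode)->wb, WB_WRITEBACK);

        /* COMMIT succeeded, page is stable */
        dec_node_page_state(page, NR_WRITEBACK);
        dec_wb_stat(&inode_to_bdi(page_file_mapping(page)->host)->wb,
                    WB_WRITEBACK);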
[akpm@linux-foundation.org: re-layout comment] [akpm@linux-foundation.org: fix printk warning] Signed-off-by: NeilBrown Signed-off-by: Andrew Morton Reviewed-by: Jan Kara Reviewed-by: Christoph Hellwig Acked-by: Trond Myklebust Acked-by: Michal Hocko [mm] Cc: Christoph Hellwig Cc: Chuck Lever Link: http://lkml.kernel.org/r/87d06j7gqa.fsf@notabene.neil.brown.name Signed-off-by: Linus Torvalds --- Documentation/filesystems/proc.rst | 4 ++-- drivers/base/node.c | 2 +- fs/fs-writeback.c | 1 - fs/nfs/internal.h | 10 +++++++--- fs/nfs/write.c | 4 ++-- fs/proc/meminfo.c | 3 +-- include/linux/mmzone.h | 1 - include/trace/events/writeback.h | 5 +---- mm/memcontrol.c | 1 - mm/page-writeback.c | 17 ++++------------- mm/page_alloc.c | 5 +---- mm/vmstat.c | 11 +++++++++-- 12 files changed, 28 insertions(+), 36 deletions(-) (limited to 'include/linux') diff --git a/Documentation/filesystems/proc.rst b/Documentation/filesystems/proc.rst index 38b606991065..092b7b44d158 100644 --- a/Documentation/filesystems/proc.rst +++ b/Documentation/filesystems/proc.rst @@ -1042,8 +1042,8 @@ PageTables amount of memory dedicated to the lowest level of page tables. NFS_Unstable - NFS pages sent to the server, but not yet committed to stable - storage + Always zero. Previous counted pages which had been written to + the server, but has not been committed to stable storage. Bounce Memory used for block device "bounce buffers" WritebackTmp diff --git a/drivers/base/node.c b/drivers/base/node.c index 10d7e818e118..6012574913f7 100644 --- a/drivers/base/node.c +++ b/drivers/base/node.c @@ -439,7 +439,7 @@ static ssize_t node_read_meminfo(struct device *dev, nid, K(i.sharedram), nid, sum_zone_node_page_state(nid, NR_KERNEL_STACK_KB), nid, K(sum_zone_node_page_state(nid, NR_PAGETABLE)), - nid, K(node_page_state(pgdat, NR_UNSTABLE_NFS)), + nid, 0UL, nid, K(sum_zone_node_page_state(nid, NR_BOUNCE)), nid, K(node_page_state(pgdat, NR_WRITEBACK_TEMP)), nid, K(sreclaimable + diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c index 76ac9c7d32ec..c5bdf46e3b4b 100644 --- a/fs/fs-writeback.c +++ b/fs/fs-writeback.c @@ -1070,7 +1070,6 @@ static void bdi_split_work_to_wbs(struct backing_dev_info *bdi, static unsigned long get_nr_dirty_pages(void) { return global_node_page_state(NR_FILE_DIRTY) + - global_node_page_state(NR_UNSTABLE_NFS) + get_nr_dirty_inodes(); } diff --git a/fs/nfs/internal.h b/fs/nfs/internal.h index 1f32a9fbfdaf..6673a77884d9 100644 --- a/fs/nfs/internal.h +++ b/fs/nfs/internal.h @@ -668,7 +668,8 @@ void nfs_super_set_maxbytes(struct super_block *sb, __u64 maxfilesize) } /* - * Record the page as unstable and mark its inode as dirty. + * Record the page as unstable (an extra writeback period) and mark its + * inode as dirty. */ static inline void nfs_mark_page_unstable(struct page *page, struct nfs_commit_info *cinfo) @@ -676,8 +677,11 @@ void nfs_mark_page_unstable(struct page *page, struct nfs_commit_info *cinfo) if (!cinfo->dreq) { struct inode *inode = page_file_mapping(page)->host; - inc_node_page_state(page, NR_UNSTABLE_NFS); - inc_wb_stat(&inode_to_bdi(inode)->wb, WB_RECLAIMABLE); + /* This page is really still in write-back - just that the + * writeback is happening on the server now. 
+ */ + inc_node_page_state(page, NR_WRITEBACK); + inc_wb_stat(&inode_to_bdi(inode)->wb, WB_WRITEBACK); __mark_inode_dirty(inode, I_DIRTY_DATASYNC); } } diff --git a/fs/nfs/write.c b/fs/nfs/write.c index 1e767f779c49..639c34fec04a 100644 --- a/fs/nfs/write.c +++ b/fs/nfs/write.c @@ -946,9 +946,9 @@ nfs_mark_request_commit(struct nfs_page *req, struct pnfs_layout_segment *lseg, static void nfs_clear_page_commit(struct page *page) { - dec_node_page_state(page, NR_UNSTABLE_NFS); + dec_node_page_state(page, NR_WRITEBACK); dec_wb_stat(&inode_to_bdi(page_file_mapping(page)->host)->wb, - WB_RECLAIMABLE); + WB_WRITEBACK); } /* Called holding the request lock on @req */ diff --git a/fs/proc/meminfo.c b/fs/proc/meminfo.c index 8c1f1bb1a5ce..9bd94b5a9658 100644 --- a/fs/proc/meminfo.c +++ b/fs/proc/meminfo.c @@ -106,8 +106,7 @@ static int meminfo_proc_show(struct seq_file *m, void *v) show_val_kb(m, "PageTables: ", global_zone_page_state(NR_PAGETABLE)); - show_val_kb(m, "NFS_Unstable: ", - global_node_page_state(NR_UNSTABLE_NFS)); + show_val_kb(m, "NFS_Unstable: ", 0); show_val_kb(m, "Bounce: ", global_zone_page_state(NR_BOUNCE)); show_val_kb(m, "WritebackTmp: ", diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index 1b9de7d220fb..a89f47515eb1 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -193,7 +193,6 @@ enum node_stat_item { NR_FILE_THPS, NR_FILE_PMDMAPPED, NR_ANON_THPS, - NR_UNSTABLE_NFS, /* NFS unstable pages */ NR_VMSCAN_WRITE, NR_VMSCAN_IMMEDIATE, /* Prioritise for reclaim when writeback ends */ NR_DIRTIED, /* page dirtyings since bootup */ diff --git a/include/trace/events/writeback.h b/include/trace/events/writeback.h index 85a33bea76f1..10f5d1fa7347 100644 --- a/include/trace/events/writeback.h +++ b/include/trace/events/writeback.h @@ -541,7 +541,6 @@ TRACE_EVENT(global_dirty_state, TP_STRUCT__entry( __field(unsigned long, nr_dirty) __field(unsigned long, nr_writeback) - __field(unsigned long, nr_unstable) __field(unsigned long, background_thresh) __field(unsigned long, dirty_thresh) __field(unsigned long, dirty_limit) @@ -552,7 +551,6 @@ TRACE_EVENT(global_dirty_state, TP_fast_assign( __entry->nr_dirty = global_node_page_state(NR_FILE_DIRTY); __entry->nr_writeback = global_node_page_state(NR_WRITEBACK); - __entry->nr_unstable = global_node_page_state(NR_UNSTABLE_NFS); __entry->nr_dirtied = global_node_page_state(NR_DIRTIED); __entry->nr_written = global_node_page_state(NR_WRITTEN); __entry->background_thresh = background_thresh; @@ -560,12 +558,11 @@ TRACE_EVENT(global_dirty_state, __entry->dirty_limit = global_wb_domain.dirty_limit; ), - TP_printk("dirty=%lu writeback=%lu unstable=%lu " + TP_printk("dirty=%lu writeback=%lu " "bg_thresh=%lu thresh=%lu limit=%lu " "dirtied=%lu written=%lu", __entry->nr_dirty, __entry->nr_writeback, - __entry->nr_unstable, __entry->background_thresh, __entry->dirty_thresh, __entry->dirty_limit, diff --git a/mm/memcontrol.c b/mm/memcontrol.c index a3b97f103966..1db4b285c407 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -4330,7 +4330,6 @@ void mem_cgroup_wb_stats(struct bdi_writeback *wb, unsigned long *pfilepages, *pdirty = memcg_exact_page_state(memcg, NR_FILE_DIRTY); - /* this should eventually include NR_UNSTABLE_NFS */ *pwriteback = memcg_exact_page_state(memcg, NR_WRITEBACK); *pfilepages = memcg_exact_page_state(memcg, NR_INACTIVE_FILE) + memcg_exact_page_state(memcg, NR_ACTIVE_FILE); diff --git a/mm/page-writeback.c b/mm/page-writeback.c index 7ff2290cf43d..718565266257 100644 --- a/mm/page-writeback.c +++ 
b/mm/page-writeback.c @@ -504,7 +504,6 @@ bool node_dirty_ok(struct pglist_data *pgdat) unsigned long nr_pages = 0; nr_pages += node_page_state(pgdat, NR_FILE_DIRTY); - nr_pages += node_page_state(pgdat, NR_UNSTABLE_NFS); nr_pages += node_page_state(pgdat, NR_WRITEBACK); return nr_pages <= limit; @@ -758,7 +757,7 @@ static void mdtc_calc_avail(struct dirty_throttle_control *mdtc, * bounded by the bdi->min_ratio and/or bdi->max_ratio parameters, if set. * * Return: @wb's dirty limit in pages. The term "dirty" in the context of - * dirty balancing includes all PG_dirty, PG_writeback and NFS unstable pages. + * dirty balancing includes all PG_dirty and PG_writeback pages. */ static unsigned long __wb_calc_thresh(struct dirty_throttle_control *dtc) { @@ -1566,7 +1565,7 @@ static void balance_dirty_pages(struct bdi_writeback *wb, struct dirty_throttle_control * const mdtc = mdtc_valid(&mdtc_stor) ? &mdtc_stor : NULL; struct dirty_throttle_control *sdtc; - unsigned long nr_reclaimable; /* = file_dirty + unstable_nfs */ + unsigned long nr_reclaimable; /* = file_dirty */ long period; long pause; long max_pause; @@ -1586,14 +1585,7 @@ static void balance_dirty_pages(struct bdi_writeback *wb, unsigned long m_thresh = 0; unsigned long m_bg_thresh = 0; - /* - * Unstable writes are a feature of certain networked - * filesystems (i.e. NFS) in which data may have been - * written to the server's write cache, but has not yet - * been flushed to permanent storage. - */ - nr_reclaimable = global_node_page_state(NR_FILE_DIRTY) + - global_node_page_state(NR_UNSTABLE_NFS); + nr_reclaimable = global_node_page_state(NR_FILE_DIRTY); gdtc->avail = global_dirtyable_memory(); gdtc->dirty = nr_reclaimable + global_node_page_state(NR_WRITEBACK); @@ -1963,8 +1955,7 @@ bool wb_over_bg_thresh(struct bdi_writeback *wb) * as we're trying to decide whether to put more under writeback. */ gdtc->avail = global_dirtyable_memory(); - gdtc->dirty = global_node_page_state(NR_FILE_DIRTY) + - global_node_page_state(NR_UNSTABLE_NFS); + gdtc->dirty = global_node_page_state(NR_FILE_DIRTY); domain_dirty_limits(gdtc); if (gdtc->dirty > gdtc->bg_thresh) diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 13cc653122b7..cc406ee17ad9 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -5319,7 +5319,7 @@ void show_free_areas(unsigned int filter, nodemask_t *nodemask) printk("active_anon:%lu inactive_anon:%lu isolated_anon:%lu\n" " active_file:%lu inactive_file:%lu isolated_file:%lu\n" - " unevictable:%lu dirty:%lu writeback:%lu unstable:%lu\n" + " unevictable:%lu dirty:%lu writeback:%lu\n" " slab_reclaimable:%lu slab_unreclaimable:%lu\n" " mapped:%lu shmem:%lu pagetables:%lu bounce:%lu\n" " free:%lu free_pcp:%lu free_cma:%lu\n", @@ -5332,7 +5332,6 @@ void show_free_areas(unsigned int filter, nodemask_t *nodemask) global_node_page_state(NR_UNEVICTABLE), global_node_page_state(NR_FILE_DIRTY), global_node_page_state(NR_WRITEBACK), - global_node_page_state(NR_UNSTABLE_NFS), global_node_page_state(NR_SLAB_RECLAIMABLE), global_node_page_state(NR_SLAB_UNRECLAIMABLE), global_node_page_state(NR_FILE_MAPPED), @@ -5365,7 +5364,6 @@ void show_free_areas(unsigned int filter, nodemask_t *nodemask) " anon_thp: %lukB" #endif " writeback_tmp:%lukB" - " unstable:%lukB" " all_unreclaimable? 
%s" "\n", pgdat->node_id, @@ -5387,7 +5385,6 @@ void show_free_areas(unsigned int filter, nodemask_t *nodemask) K(node_page_state(pgdat, NR_ANON_THPS) * HPAGE_PMD_NR), #endif K(node_page_state(pgdat, NR_WRITEBACK_TEMP)), - K(node_page_state(pgdat, NR_UNSTABLE_NFS)), pgdat->kswapd_failures >= MAX_RECLAIM_RETRIES ? "yes" : "no"); } diff --git a/mm/vmstat.c b/mm/vmstat.c index 96d21a792b57..b1582fdf757c 100644 --- a/mm/vmstat.c +++ b/mm/vmstat.c @@ -1108,7 +1108,7 @@ int fragmentation_index(struct zone *zone, unsigned int order) TEXT_FOR_HIGHMEM(xx) xx "_movable", const char * const vmstat_text[] = { - /* enum zone_stat_item countes */ + /* enum zone_stat_item counters */ "nr_free_pages", "nr_zone_inactive_anon", "nr_zone_active_anon", @@ -1162,7 +1162,6 @@ const char * const vmstat_text[] = { "nr_file_hugepages", "nr_file_pmdmapped", "nr_anon_transparent_hugepages", - "nr_unstable", "nr_vmscan_write", "nr_vmscan_immediate_reclaim", "nr_dirtied", @@ -1723,6 +1722,14 @@ static int vmstat_show(struct seq_file *m, void *arg) seq_puts(m, vmstat_text[off]); seq_put_decimal_ull(m, " ", *l); seq_putc(m, '\n'); + + if (off == NR_VMSTAT_ITEMS - 1) { + /* + * We've come to the end - add any deprecated counters to avoid + * breaking userspace which might depend on them being present. + */ + seq_puts(m, "nr_unstable 0\n"); + } return 0; } -- cgit v1.2.3 From 91429023342789a89f4b6ae95b47a7df71ab6d95 Mon Sep 17 00:00:00 2001 From: John Hubbard Date: Mon, 1 Jun 2020 21:48:27 -0700 Subject: mm/gup: introduce pin_user_pages_unlocked Introduce pin_user_pages_unlocked(), which is nearly identical to the get_user_pages_unlocked() that it wraps, except that it sets FOLL_PIN and rejects FOLL_GET. Signed-off-by: John Hubbard Signed-off-by: Andrew Morton Reviewed-by: Andrew Morton Cc: Andy Walls Cc: Mauro Carvalho Chehab Link: http://lkml.kernel.org/r/20200518012157.1178336-2-jhubbard@nvidia.com Signed-off-by: Linus Torvalds --- include/linux/mm.h | 2 ++ mm/gup.c | 17 +++++++++++++++++ 2 files changed, 19 insertions(+) (limited to 'include/linux') diff --git a/include/linux/mm.h b/include/linux/mm.h index 92704fde6475..ebbb0acbeee2 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -1713,6 +1713,8 @@ long get_user_pages_locked(unsigned long start, unsigned long nr_pages, unsigned int gup_flags, struct page **pages, int *locked); long get_user_pages_unlocked(unsigned long start, unsigned long nr_pages, struct page **pages, unsigned int gup_flags); +long pin_user_pages_unlocked(unsigned long start, unsigned long nr_pages, + struct page **pages, unsigned int gup_flags); int get_user_pages_fast(unsigned long start, int nr_pages, unsigned int gup_flags, struct page **pages); diff --git a/mm/gup.c b/mm/gup.c index ca723a33268e..c3014669988b 100644 --- a/mm/gup.c +++ b/mm/gup.c @@ -2960,3 +2960,20 @@ long pin_user_pages(unsigned long start, unsigned long nr_pages, pages, vmas, gup_flags); } EXPORT_SYMBOL(pin_user_pages); + +/* + * pin_user_pages_unlocked() is the FOLL_PIN variant of + * get_user_pages_unlocked(). Behavior is the same, except that this one sets + * FOLL_PIN and rejects FOLL_GET. + */ +long pin_user_pages_unlocked(unsigned long start, unsigned long nr_pages, + struct page **pages, unsigned int gup_flags) +{ + /* FOLL_GET and FOLL_PIN are mutually exclusive. 
*/ + if (WARN_ON_ONCE(gup_flags & FOLL_GET)) + return -EINVAL; + + gup_flags |= FOLL_PIN; + return get_user_pages_unlocked(start, nr_pages, pages, gup_flags); +} +EXPORT_SYMBOL(pin_user_pages_unlocked); -- cgit v1.2.3 From 4b4bb6bb451c8b0c1cbc38fa20a89a3988aa4e0b Mon Sep 17 00:00:00 2001 From: Wei Yang Date: Mon, 1 Jun 2020 21:49:13 -0700 Subject: mm/swapfile.c: classify SWAP_MAP_XXX to make it more readable swap_info_struct->swap_map[] encodes both flags and a count. To support some condition checks, it also introduces some special values. Currently those macros are defined in a somewhat magic order, which makes it hard for readers to understand their exact meaning. This patch splits those macros into three categories: flags, special values for the first swap_map, and special values for continued swap_maps. Hopefully this helps readers a little. [akpm@linux-foundation.org: tweak capitalization in comments] Signed-off-by: Wei Yang Signed-off-by: Andrew Morton Reviewed-by: Andrew Morton Link: http://lkml.kernel.org/r/20200501015259.32237-1-richard.weiyang@gmail.com Signed-off-by: Linus Torvalds --- include/linux/swap.h | 15 ++++++++++----- 1 file changed, 10 insertions(+), 5 deletions(-) (limited to 'include/linux') diff --git a/include/linux/swap.h b/include/linux/swap.h index e1bbf7a16b27..873bf5206afb 100644 --- a/include/linux/swap.h +++ b/include/linux/swap.h @@ -183,12 +183,17 @@ enum { #define SWAP_CLUSTER_MAX 32UL #define COMPACT_CLUSTER_MAX SWAP_CLUSTER_MAX -#define SWAP_MAP_MAX 0x3e /* Max duplication count, in first swap_map */ -#define SWAP_MAP_BAD 0x3f /* Note pageblock is bad, in first swap_map */ +/* Bit flag in swap_map */ #define SWAP_HAS_CACHE 0x40 /* Flag page is cached, in first swap_map */ -#define SWAP_CONT_MAX 0x7f /* Max count, in each swap_map continuation */ -#define COUNT_CONTINUED 0x80 /* See swap_map continuation for full count */ -#define SWAP_MAP_SHMEM 0xbf /* Owned by shmem/tmpfs, in first swap_map */ +#define COUNT_CONTINUED 0x80 /* Flag swap_map continuation for full count */ + +/* Special value in first swap_map */ +#define SWAP_MAP_MAX 0x3e /* Max count */ +#define SWAP_MAP_BAD 0x3f /* Note page is bad */ +#define SWAP_MAP_SHMEM 0xbf /* Owned by shmem/tmpfs */ + +/* Special value in each swap_map continuation */ +#define SWAP_CONT_MAX 0x7f /* Max count */ /* * We use this to track usage of a cluster. A cluster is a block of swap disk -- cgit v1.2.3 From 490705888107c3edf8c264ec930909107f76a984 Mon Sep 17 00:00:00 2001 From: Huang Ying Date: Mon, 1 Jun 2020 21:49:22 -0700 Subject: swap: reduce lock contention on swap cache from swap slots allocation In some swap scalability tests, we found heavy lock contention on the swap cache, even though commit 4b3ef9daa4fc ("mm/swap: split swap cache into 64MB trunks") already split the single swap cache radix tree per swap device into one radix tree per 64 MB trunk. The reason is as follows. After the swap device becomes fragmented so that there's no free swap cluster, the swap device will be scanned linearly to find free swap slots. swap_info_struct->cluster_next is the next scanning base and is shared by all CPUs, so nearby free swap slots will be allocated for different CPUs. The probability for multiple CPUs to operate on the same 64 MB trunk is high. This causes the lock contention on the swap cache. To solve the issue, in this patch, for SSD swap devices, a per-CPU next scanning base (cluster_next_cpu) is added. Every CPU will use its own per-CPU next scanning base.
And after finishing scanning a 64MB trunk, the per-CPU scanning base will be changed to the beginning of another randomly selected 64MB trunk. In this way, the probability for multiple CPUs to operate on the same 64 MB trunk is greatly reduced, and thus the lock contention is reduced too. For HDD, because sequential access is more important for IO performance, the original shared next scanning base is used. To test the patch, we have run a 16-process pmbench memory benchmark on a 2-socket server machine with 48 cores. One ram disk is configured as the swap device per socket. The pmbench working-set size is much larger than the available memory so that swapping is triggered. The memory read/write ratio is 80/20 and the accessing pattern is random. In the original implementation, the lock contention on the swap cache is heavy. The perf profiling data of the lock contention code path is as follows:

_raw_spin_lock_irq.add_to_swap_cache.add_to_swap.shrink_page_list: 7.91
_raw_spin_lock_irqsave.__remove_mapping.shrink_page_list: 7.11
_raw_spin_lock.swapcache_free_entries.free_swap_slot.__swap_entry_free: 2.51
_raw_spin_lock_irqsave.swap_cgroup_record.mem_cgroup_uncharge_swap: 1.66
_raw_spin_lock_irq.shrink_inactive_list.shrink_lruvec.shrink_node: 1.29
_raw_spin_lock.free_pcppages_bulk.drain_pages_zone.drain_pages: 1.03
_raw_spin_lock_irq.shrink_active_list.shrink_lruvec.shrink_node: 0.93

After applying this patch, it becomes:

_raw_spin_lock.swapcache_free_entries.free_swap_slot.__swap_entry_free: 3.58
_raw_spin_lock_irq.shrink_inactive_list.shrink_lruvec.shrink_node: 2.3
_raw_spin_lock_irqsave.swap_cgroup_record.mem_cgroup_uncharge_swap: 2.26
_raw_spin_lock_irq.shrink_active_list.shrink_lruvec.shrink_node: 1.8
_raw_spin_lock.free_pcppages_bulk.drain_pages_zone.drain_pages: 1.19

The lock contention on the swap cache is almost eliminated, and the pmbench score increases by 18.5%. The swapin throughput increases 18.7% from 2.96 GB/s to 3.51 GB/s, while the swapout throughput increases 18.5% from 2.99 GB/s to 3.54 GB/s. We need a really fast disk to show the benefit. I have tried this on two Intel P3600 NVMe disks; the performance improvement is only about 1%. The improvement should be better on faster disks, such as an Intel Optane disk.
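In outline, the allocator side reduces to two touch points, condensed here from scan_swap_map_slots() and set_cluster_next() in the diff below (locking and the wrap-around checks are omitted):

        /* scan start: SSDs read a per-CPU base, HDDs keep the shared one */
        if (si->flags & SWP_SOLIDSTATE)
                scan_base = this_cpu_read(*si->cluster_next_cpu);
        else
                scan_base = si->cluster_next;

        /* scan end: if the base crossed a 64MB trunk boundary, hop to a
         * randomly chosen trunk so CPUs spread across swap cache trees */
        next = si->lowest_bit +
               prandom_u32_max(si->highest_bit - si->lowest_bit + 1);
        next = ALIGN_DOWN(next, SWAP_ADDRESS_SPACE_PAGES);
        this_cpu_write(*si->cluster_next_cpu, next);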
[ying.huang@intel.com: fix cluster_next_cpu allocation and freeing, per Daniel] Link: http://lkml.kernel.org/r/20200525002648.336325-1-ying.huang@intel.com [ying.huang@intel.com: v4] Link: http://lkml.kernel.org/r/20200529010840.928819-1-ying.huang@intel.com Signed-off-by: "Huang, Ying" Signed-off-by: Andrew Morton Reviewed-by: Daniel Jordan Cc: Michal Hocko Cc: Minchan Kim Cc: Tim Chen Cc: Hugh Dickins Link: http://lkml.kernel.org/r/20200520031502.175659-1-ying.huang@intel.com Signed-off-by: Linus Torvalds --- include/linux/swap.h | 1 + mm/swapfile.c | 61 +++++++++++++++++++++++++++++++++++++++++++++++----- 2 files changed, 57 insertions(+), 5 deletions(-) (limited to 'include/linux') diff --git a/include/linux/swap.h b/include/linux/swap.h index 873bf5206afb..6c23a6a14012 100644 --- a/include/linux/swap.h +++ b/include/linux/swap.h @@ -252,6 +252,7 @@ struct swap_info_struct { unsigned int inuse_pages; /* number of those currently in use */ unsigned int cluster_next; /* likely index for next allocation */ unsigned int cluster_nr; /* countdown to next cluster search */ + unsigned int __percpu *cluster_next_cpu; /*percpu index for next allocation */ struct percpu_cluster __percpu *percpu_cluster; /* per cpu's swap location */ struct rb_root swap_extent_root;/* root of the swap extent rbtree */ struct block_device *bdev; /* swap device or bdev of swap file */ diff --git a/mm/swapfile.c b/mm/swapfile.c index 18dfccb91123..c531d2687dd0 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -613,10 +613,12 @@ new_cluster: } else if (!cluster_list_empty(&si->discard_clusters)) { /* * we don't have free cluster but have some clusters in - * discarding, do discard now and reclaim them + * discarding, do discard now and reclaim them, then + * reread cluster_next_cpu since we dropped si->lock */ swap_do_scheduled_discard(si); - *scan_base = *offset = si->cluster_next; + *scan_base = this_cpu_read(*si->cluster_next_cpu); + *offset = *scan_base; goto new_cluster; } else return false; @@ -722,6 +724,34 @@ static void swap_range_free(struct swap_info_struct *si, unsigned long offset, } } +static void set_cluster_next(struct swap_info_struct *si, unsigned long next) +{ + unsigned long prev; + + if (!(si->flags & SWP_SOLIDSTATE)) { + si->cluster_next = next; + return; + } + + prev = this_cpu_read(*si->cluster_next_cpu); + /* + * Cross the swap address space size aligned trunk, choose + * another trunk randomly to avoid lock contention on swap + * address space if possible. + */ + if ((prev >> SWAP_ADDRESS_SPACE_SHIFT) != + (next >> SWAP_ADDRESS_SPACE_SHIFT)) { + /* No free swap slots available */ + if (si->highest_bit <= si->lowest_bit) + return; + next = si->lowest_bit + + prandom_u32_max(si->highest_bit - si->lowest_bit + 1); + next = ALIGN_DOWN(next, SWAP_ADDRESS_SPACE_PAGES); + next = max_t(unsigned int, next, si->lowest_bit); + } + this_cpu_write(*si->cluster_next_cpu, next); +} + static int scan_swap_map_slots(struct swap_info_struct *si, unsigned char usage, int nr, swp_entry_t slots[]) @@ -746,7 +776,16 @@ static int scan_swap_map_slots(struct swap_info_struct *si, */ si->flags += SWP_SCANNING; - scan_base = offset = si->cluster_next; + /* + * Use percpu scan base for SSD to reduce lock contention on + * cluster and swap cache. For HDD, sequential access is more + * important. 
+ */ + if (si->flags & SWP_SOLIDSTATE) + scan_base = this_cpu_read(*si->cluster_next_cpu); + else + scan_base = si->cluster_next; + offset = scan_base; /* SSD algorithm */ if (si->cluster_info) { @@ -835,7 +874,6 @@ checks: unlock_cluster(ci); swap_range_alloc(si, offset, 1); - si->cluster_next = offset + 1; slots[n_ret++] = swp_entry(si->type, offset); /* got enough slots or reach max slots? */ @@ -884,6 +922,7 @@ checks: } done: + set_cluster_next(si, offset + 1); si->flags -= SWP_SCANNING; return n_ret; @@ -2653,6 +2692,8 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile) mutex_unlock(&swapon_mutex); free_percpu(p->percpu_cluster); p->percpu_cluster = NULL; + free_percpu(p->cluster_next_cpu); + p->cluster_next_cpu = NULL; vfree(swap_map); kvfree(cluster_info); kvfree(frontswap_map); @@ -3205,11 +3246,19 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags) unsigned long ci, nr_cluster; p->flags |= SWP_SOLIDSTATE; + p->cluster_next_cpu = alloc_percpu(unsigned int); + if (!p->cluster_next_cpu) { + error = -ENOMEM; + goto bad_swap_unlock_inode; + } /* * select a random position to start with to help wear leveling * SSD */ - p->cluster_next = 1 + prandom_u32_max(p->highest_bit); + for_each_possible_cpu(cpu) { + per_cpu(*p->cluster_next_cpu, cpu) = + 1 + prandom_u32_max(p->highest_bit); + } nr_cluster = DIV_ROUND_UP(maxpages, SWAPFILE_CLUSTER); cluster_info = kvcalloc(nr_cluster, sizeof(*cluster_info), @@ -3325,6 +3374,8 @@ bad_swap_unlock_inode: bad_swap: free_percpu(p->percpu_cluster); p->percpu_cluster = NULL; + free_percpu(p->cluster_next_cpu); + p->cluster_next_cpu = NULL; if (inode && S_ISBLK(inode->i_mode) && p->bdev) { set_blocksize(p->bdev, p->old_block_size); blkdev_put(p->bdev, FMODE_READ | FMODE_WRITE | FMODE_EXCL); -- cgit v1.2.3 From 251af0cda614e5ad6bfda059bd120aed7432891d Mon Sep 17 00:00:00 2001 From: Miaohe Lin Date: Mon, 1 Jun 2020 21:49:29 -0700 Subject: include/linux/swap.h: delete meaningless __add_to_swap_cache() declaration Since commit 8d93b41c09d1 ("mm: Convert add_to_swap_cache to XArray"), __add_to_swap_cache and add_to_swap_cache are combined into one function. There is no __add_to_swap_cache() anymore. Signed-off-by: Miaohe Lin Signed-off-by: Andrew Morton Reviewed-by: Andrew Morton Reviewed-by: "Huang, Ying" Link: http://lkml.kernel.org/r/1590810326-2493-1-git-send-email-linmiaohe@huawei.com Signed-off-by: Linus Torvalds --- include/linux/swap.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/swap.h b/include/linux/swap.h index 6c23a6a14012..68ef7638311f 100644 --- a/include/linux/swap.h +++ b/include/linux/swap.h @@ -414,7 +414,6 @@ extern unsigned long total_swapcache_pages(void); extern void show_swap_cache_info(void); extern int add_to_swap(struct page *page); extern int add_to_swap_cache(struct page *, swp_entry_t, gfp_t); -extern int __add_to_swap_cache(struct page *page, swp_entry_t entry); extern void __delete_from_swap_cache(struct page *, swp_entry_t entry); extern void delete_from_swap_cache(struct page *); extern void free_page_and_swap_cache(struct page *); -- cgit v1.2.3 From d1663a907bd348f912b7f7088e83ca1b6fd3309f Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Mon, 1 Jun 2020 21:49:49 -0700 Subject: mm/memcg: move cgroup high memory limit setting into struct page_counter The high memory limit is currently recorded directly in struct mem_cgroup. We are about to add a high limit for swap, so move the field to struct page_counter and add some helpers.
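The resulting helper pair is small; a sketch of the intended usage, condensed from the memcontrol.c hunks below:

        /* writer side: the high mark now lives in the counter itself */
        page_counter_set_high(&memcg->memory, high);

        /* reader side: compare usage against it with READ_ONCE() */
        if (page_counter_read(&memcg->memory) >
            READ_ONCE(memcg->memory.high))
                try_to_free_mem_cgroup_pages(memcg, nr_pages, gfp_mask, true);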
Signed-off-by: Jakub Kicinski Signed-off-by: Andrew Morton Reviewed-by: Shakeel Butt Acked-by: Johannes Weiner Cc: Chris Down Cc: Hugh Dickins Cc: Michal Hocko Cc: Tejun Heo Link: http://lkml.kernel.org/r/20200527195846.102707-4-kuba@kernel.org Signed-off-by: Linus Torvalds --- include/linux/memcontrol.h | 3 --- include/linux/page_counter.h | 8 ++++++++ mm/memcontrol.c | 19 +++++++++++-------- 3 files changed, 19 insertions(+), 11 deletions(-) (limited to 'include/linux') diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index 977edd3b7bd8..95a09a7ec412 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -215,9 +215,6 @@ struct mem_cgroup { struct page_counter kmem; struct page_counter tcpmem; - /* Upper bound of normal memory consumption range */ - unsigned long high; - /* Range enforcement for interrupt charges */ struct work_struct high_work; diff --git a/include/linux/page_counter.h b/include/linux/page_counter.h index bab7e57f659b..85bd413e784e 100644 --- a/include/linux/page_counter.h +++ b/include/linux/page_counter.h @@ -10,6 +10,7 @@ struct page_counter { atomic_long_t usage; unsigned long min; unsigned long low; + unsigned long high; unsigned long max; struct page_counter *parent; @@ -55,6 +56,13 @@ bool page_counter_try_charge(struct page_counter *counter, void page_counter_uncharge(struct page_counter *counter, unsigned long nr_pages); void page_counter_set_min(struct page_counter *counter, unsigned long nr_pages); void page_counter_set_low(struct page_counter *counter, unsigned long nr_pages); + +static inline void page_counter_set_high(struct page_counter *counter, + unsigned long nr_pages) +{ + WRITE_ONCE(counter->high, nr_pages); +} + int page_counter_set_max(struct page_counter *counter, unsigned long nr_pages); int page_counter_memparse(const char *buf, const char *max, unsigned long *nr_pages); diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 6a857b87428d..08cf17b186fb 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -2252,7 +2252,8 @@ static void reclaim_high(struct mem_cgroup *memcg, gfp_t gfp_mask) { do { - if (page_counter_read(&memcg->memory) <= READ_ONCE(memcg->high)) + if (page_counter_read(&memcg->memory) <= + READ_ONCE(memcg->memory.high)) continue; memcg_memory_event(memcg, MEMCG_HIGH); try_to_free_mem_cgroup_pages(memcg, nr_pages, gfp_mask, true); @@ -2345,7 +2346,7 @@ static u64 mem_find_max_overage(struct mem_cgroup *memcg) do { overage = calculate_overage(page_counter_read(&memcg->memory), - READ_ONCE(memcg->high)); + READ_ONCE(memcg->memory.high)); max_overage = max(overage, max_overage); } while ((memcg = parent_mem_cgroup(memcg)) && !mem_cgroup_is_root(memcg)); @@ -2604,7 +2605,8 @@ done_restock: * reclaim, the cost of mismatch is negligible. 
*/ do { - if (page_counter_read(&memcg->memory) > READ_ONCE(memcg->high)) { + if (page_counter_read(&memcg->memory) > + READ_ONCE(memcg->memory.high)) { /* Don't bother a random interrupted task */ if (in_interrupt()) { schedule_work(&memcg->high_work); break; } @@ -4347,7 +4349,7 @@ void mem_cgroup_wb_stats(struct bdi_writeback *wb, unsigned long *pfilepages, while ((parent = parent_mem_cgroup(memcg))) { unsigned long ceiling = min(READ_ONCE(memcg->memory.max), - READ_ONCE(memcg->high)); + READ_ONCE(memcg->memory.high)); unsigned long used = page_counter_read(&memcg->memory); *pheadroom = min(*pheadroom, ceiling - min(ceiling, used)); @@ -5072,7 +5074,7 @@ mem_cgroup_css_alloc(struct cgroup_subsys_state *parent_css) if (IS_ERR(memcg)) return ERR_CAST(memcg); - WRITE_ONCE(memcg->high, PAGE_COUNTER_MAX); + page_counter_set_high(&memcg->memory, PAGE_COUNTER_MAX); memcg->soft_limit = PAGE_COUNTER_MAX; if (parent) { memcg->swappiness = mem_cgroup_swappiness(parent); @@ -5225,7 +5227,7 @@ static void mem_cgroup_css_reset(struct cgroup_subsys_state *css) page_counter_set_max(&memcg->tcpmem, PAGE_COUNTER_MAX); page_counter_set_min(&memcg->memory, 0); page_counter_set_low(&memcg->memory, 0); - WRITE_ONCE(memcg->high, PAGE_COUNTER_MAX); + page_counter_set_high(&memcg->memory, PAGE_COUNTER_MAX); memcg->soft_limit = PAGE_COUNTER_MAX; memcg_wb_domain_size_changed(memcg); } @@ -6024,7 +6026,8 @@ static ssize_t memory_low_write(struct kernfs_open_file *of, static int memory_high_show(struct seq_file *m, void *v) { - return seq_puts_memcg_tunable(m, READ_ONCE(mem_cgroup_from_seq(m)->high)); + return seq_puts_memcg_tunable(m, + READ_ONCE(mem_cgroup_from_seq(m)->memory.high)); } static ssize_t memory_high_write(struct kernfs_open_file *of, @@ -6041,7 +6044,7 @@ static ssize_t memory_high_write(struct kernfs_open_file *of, if (err) return err; - WRITE_ONCE(memcg->high, high); + page_counter_set_high(&memcg->memory, high); for (;;) { unsigned long nr_pages = page_counter_read(&memcg->memory); -- cgit v1.2.3 From 4b82ab4f28836646eca12cb37f408568d3cdc5c3 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Mon, 1 Jun 2020 21:49:52 -0700 Subject: mm/memcg: automatically penalize tasks with high swap use Add a memory.swap.high knob, which can be used to protect the system from SWAP exhaustion. The mechanism used for penalizing is similar to the memory.high penalty (sleep on return to user space). That is not to say that the knob itself is equivalent to memory.high. The objective is more to protect the system from potentially buggy tasks consuming a lot of swap and impacting other tasks, or even bringing the whole system to a standstill with complete SWAP exhaustion, hopefully without the need to find per-task hard limits. Slowing misbehaving tasks down gradually allows user space oom killers or other protection mechanisms to react. oomd and earlyoom already do killing based on swap exhaustion, and memory.swap.high protection will help implement such userspace oom policies more reliably. We can use one counter for the number of pages allocated under pressure to save struct task space and avoid two separate hierarchy walks on the hot path. The exact overage is calculated on return to user space, anyway. Take the new high limit into account when determining if swap is "full".
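Condensed from the memcontrol.c hunks below, the penalty on return to user space is just the memory.high delay calculation applied to both overages and summed:

        penalty_jiffies = calculate_high_delay(memcg, nr_pages,
                                               mem_find_max_overage(memcg));
        penalty_jiffies += calculate_high_delay(memcg, nr_pages,
                                                swap_find_max_overage(memcg));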
Borrowing the explanation from Johannes: The idea behind "swap full" is that as long as the workload has plenty of swap space available and it's not changing its memory contents, it makes sense to generously hold on to copies of data in the swap device, even after the swapin. A later reclaim cycle can drop the page without any IO. Trading disk space for IO. But the only two ways to reclaim a swap slot are when they're faulted in and the references go away, or by scanning the virtual address space like swapoff does - which is very expensive (one could argue it's too expensive even for swapoff; it's often more practical to just reboot). So at some point in the fill level, we have to start freeing up swap slots on fault/swapin. Otherwise we could eventually run out of swap slots while they're filled with copies of data that is also in RAM. We don't want to OOM a workload because its available swap space is filled with redundant cache. Signed-off-by: Jakub Kicinski Signed-off-by: Andrew Morton Acked-by: Johannes Weiner Cc: Tejun Heo Cc: Chris Down Cc: Shakeel Butt Cc: Michal Hocko Cc: Hugh Dickins Link: http://lkml.kernel.org/r/20200527195846.102707-5-kuba@kernel.org Signed-off-by: Linus Torvalds --- Documentation/admin-guide/cgroup-v2.rst | 20 ++++++++ include/linux/memcontrol.h | 1 + mm/memcontrol.c | 88 ++++++++++++++++++++++++++++++--- 3 files changed, 102 insertions(+), 7 deletions(-) (limited to 'include/linux') diff --git a/Documentation/admin-guide/cgroup-v2.rst b/Documentation/admin-guide/cgroup-v2.rst index 5f12f203822e..b8c0460730f3 100644 --- a/Documentation/admin-guide/cgroup-v2.rst +++ b/Documentation/admin-guide/cgroup-v2.rst @@ -1374,6 +1374,22 @@ PAGE_SIZE multiple when read back. The total amount of swap currently being used by the cgroup and its descendants. + memory.swap.high + A read-write single value file which exists on non-root + cgroups. The default is "max". + + Swap usage throttle limit. If a cgroup's swap usage exceeds + this limit, all its further allocations will be throttled to + allow userspace to implement custom out-of-memory procedures. + + This limit marks a point of no return for the cgroup. It is NOT + designed to manage the amount of swapping a workload does + during regular operation. Compare to memory.swap.max, which + prohibits swapping past a set amount, but lets the cgroup + continue unimpeded as long as other memory can be reclaimed. + + Healthy workloads are not expected to reach this limit. + memory.swap.max A read-write single value file which exists on non-root cgroups. The default is "max". @@ -1387,6 +1403,10 @@ PAGE_SIZE multiple when read back. otherwise, a value change in this file generates a file modified event. + high + The number of times the cgroup's swap usage was over + the high threshold.
+ max The number of times the cgroup's swap usage was about to go over the max boundary and swap allocation diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index 95a09a7ec412..bfe9533bb67e 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -45,6 +45,7 @@ enum memcg_memory_event { MEMCG_MAX, MEMCG_OOM, MEMCG_OOM_KILL, + MEMCG_SWAP_HIGH, MEMCG_SWAP_MAX, MEMCG_SWAP_FAIL, MEMCG_NR_MEMORY_EVENTS, diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 08cf17b186fb..f3087e22dfa9 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -2354,6 +2354,22 @@ static u64 mem_find_max_overage(struct mem_cgroup *memcg) return max_overage; } +static u64 swap_find_max_overage(struct mem_cgroup *memcg) +{ + u64 overage, max_overage = 0; + + do { + overage = calculate_overage(page_counter_read(&memcg->swap), + READ_ONCE(memcg->swap.high)); + if (overage) + memcg_memory_event(memcg, MEMCG_SWAP_HIGH); + max_overage = max(overage, max_overage); + } while ((memcg = parent_mem_cgroup(memcg)) && + !mem_cgroup_is_root(memcg)); + + return max_overage; +} + /* * Get the number of jiffies that we should penalise a mischievous cgroup which * is exceeding its memory.high by checking both it and its ancestors. @@ -2415,6 +2431,9 @@ void mem_cgroup_handle_over_high(void) penalty_jiffies = calculate_high_delay(memcg, nr_pages, mem_find_max_overage(memcg)); + penalty_jiffies += calculate_high_delay(memcg, nr_pages, + swap_find_max_overage(memcg)); + /* * Clamp the max delay per usermode return so as to still keep the * application moving forwards and also permit diagnostics, albeit @@ -2605,13 +2624,32 @@ done_restock: * reclaim, the cost of mismatch is negligible. */ do { - if (page_counter_read(&memcg->memory) > - READ_ONCE(memcg->memory.high)) { - /* Don't bother a random interrupted task */ - if (in_interrupt()) { + bool mem_high, swap_high; + + mem_high = page_counter_read(&memcg->memory) > + READ_ONCE(memcg->memory.high); + swap_high = page_counter_read(&memcg->swap) > + READ_ONCE(memcg->swap.high); + + /* Don't bother a random interrupted task */ + if (in_interrupt()) { + if (mem_high) { schedule_work(&memcg->high_work); break; } + continue; + } + + if (mem_high || swap_high) { + /* + * The allocating tasks in this cgroup will need to do + * reclaim or be throttled to prevent further growth + * of the memory or swap footprints. + * + * Target some best-effort fairness between the tasks, + * and distribute reclaim work and delay penalties + * based on how much each task is actually allocating. 
+ */ current->memcg_nr_pages_over_high += batch; set_notify_resume(current); break; @@ -5076,6 +5114,7 @@ mem_cgroup_css_alloc(struct cgroup_subsys_state *parent_css) page_counter_set_high(&memcg->memory, PAGE_COUNTER_MAX); memcg->soft_limit = PAGE_COUNTER_MAX; + page_counter_set_high(&memcg->swap, PAGE_COUNTER_MAX); if (parent) { memcg->swappiness = mem_cgroup_swappiness(parent); memcg->oom_kill_disable = parent->oom_kill_disable; @@ -5229,6 +5268,7 @@ static void mem_cgroup_css_reset(struct cgroup_subsys_state *css) page_counter_set_low(&memcg->memory, 0); page_counter_set_high(&memcg->memory, PAGE_COUNTER_MAX); memcg->soft_limit = PAGE_COUNTER_MAX; + page_counter_set_high(&memcg->swap, PAGE_COUNTER_MAX); memcg_wb_domain_size_changed(memcg); } @@ -7142,10 +7182,13 @@ bool mem_cgroup_swap_full(struct page *page) if (!memcg) return false; - for (; memcg != root_mem_cgroup; memcg = parent_mem_cgroup(memcg)) - if (page_counter_read(&memcg->swap) * 2 >= - READ_ONCE(memcg->swap.max)) + for (; memcg != root_mem_cgroup; memcg = parent_mem_cgroup(memcg)) { + unsigned long usage = page_counter_read(&memcg->swap); + + if (usage * 2 >= READ_ONCE(memcg->swap.high) || + usage * 2 >= READ_ONCE(memcg->swap.max)) return true; + } return false; } @@ -7175,6 +7218,29 @@ static u64 swap_current_read(struct cgroup_subsys_state *css, return (u64)page_counter_read(&memcg->swap) * PAGE_SIZE; } +static int swap_high_show(struct seq_file *m, void *v) +{ + return seq_puts_memcg_tunable(m, + READ_ONCE(mem_cgroup_from_seq(m)->swap.high)); +} + +static ssize_t swap_high_write(struct kernfs_open_file *of, + char *buf, size_t nbytes, loff_t off) +{ + struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of)); + unsigned long high; + int err; + + buf = strstrip(buf); + err = page_counter_memparse(buf, "max", &high); + if (err) + return err; + + page_counter_set_high(&memcg->swap, high); + + return nbytes; +} + static int swap_max_show(struct seq_file *m, void *v) { return seq_puts_memcg_tunable(m, @@ -7202,6 +7268,8 @@ static int swap_events_show(struct seq_file *m, void *v) { struct mem_cgroup *memcg = mem_cgroup_from_seq(m); + seq_printf(m, "high %lu\n", + atomic_long_read(&memcg->memory_events[MEMCG_SWAP_HIGH])); seq_printf(m, "max %lu\n", atomic_long_read(&memcg->memory_events[MEMCG_SWAP_MAX])); seq_printf(m, "fail %lu\n", @@ -7216,6 +7284,12 @@ static struct cftype swap_files[] = { .flags = CFTYPE_NOT_ON_ROOT, .read_u64 = swap_current_read, }, + { + .name = "swap.high", + .flags = CFTYPE_NOT_ON_ROOT, + .seq_show = swap_high_show, + .write = swap_high_write, + }, { .name = "swap.max", .flags = CFTYPE_NOT_ON_ROOT, -- cgit v1.2.3 From 1494e0c38ee903e83aefb58caf54a9217273d49a Mon Sep 17 00:00:00 2001 From: Steven Price Date: Mon, 1 Jun 2020 21:49:58 -0700 Subject: x86: mm: ptdump: calculate effective permissions correctly Patch series "Fix W+X debug feature on x86" Jan alerted me[1] that the W+X detection debug feature was broken in x86 by my change[2] to switch x86 to use the generic ptdump infrastructure. Fundamentally the approach of trying to move the calculation of effective permissions into note_page() was broken because note_page() is only called for 'leaf' entries and the effective permissions are passed down via the internal nodes of the page tree. The solution I've taken here is to create a new (optional) callback which is called for all nodes of the page tree and therefore can calculate the effective permissions. 
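Condensed from the x86 hunks below, the wiring amounts to registering the new hook next to note_page(); effective_prot() then runs for every level while note_page() keeps firing for leaf entries only:

        struct pg_state st = {
                .ptdump = {
                        .note_page      = note_page,       /* leaf entries */
                        .effective_prot = effective_prot,  /* every level */
                        .range          = ptdump_ranges,
                },
                .level = -1,
        };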
Secondly on some configurations (32 bit with PAE) "unsigned long" is not large enough to store the table entries. The fix here is simple - let's just use a u64. [1] https://lore.kernel.org/lkml/d573dc7e-e742-84de-473d-f971142fa319@suse.com/ [2] 2ae27137b2db ("x86: mm: convert dump_pagetables to use walk_page_range") This patch (of 2): By switching the x86 page table dump code to use the generic code the effective permissions are no longer calculated correctly because the note_page() function is only called for *leaf* entries. To calculate the actual effective permissions it is necessary to observe the full hierarchy of the page tree. Introduce a new callback for ptdump which is called for every entry and can therefore update the prot_levels array correctly. note_page() can then simply access the appropriate element in the array. [steven.price@arm.com: make the assignment conditional on val != 0] Link: http://lkml.kernel.org/r/430c8ab4-e7cd-6933-dde6-087fac6db872@arm.com Fixes: 2ae27137b2db ("x86: mm: convert dump_pagetables to use walk_page_range") Reported-by: Jan Beulich Signed-off-by: Steven Price Signed-off-by: Andrew Morton Cc: Qian Cai Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Dave Hansen Cc: Ingo Molnar Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: Link: http://lkml.kernel.org/r/20200521152308.33096-1-steven.price@arm.com Link: http://lkml.kernel.org/r/20200521152308.33096-2-steven.price@arm.com Signed-off-by: Linus Torvalds --- arch/x86/mm/dump_pagetables.c | 33 ++++++++++++++++++++------------- include/linux/ptdump.h | 1 + mm/ptdump.c | 17 ++++++++++++++++- 3 files changed, 37 insertions(+), 14 deletions(-) (limited to 'include/linux') diff --git a/arch/x86/mm/dump_pagetables.c b/arch/x86/mm/dump_pagetables.c index 69309cd56fdf..33093fdedb02 100644 --- a/arch/x86/mm/dump_pagetables.c +++ b/arch/x86/mm/dump_pagetables.c @@ -249,10 +249,22 @@ static void note_wx(struct pg_state *st, unsigned long addr) (void *)st->start_address); } -static inline pgprotval_t effective_prot(pgprotval_t prot1, pgprotval_t prot2) +static void effective_prot(struct ptdump_state *pt_st, int level, u64 val) { - return (prot1 & prot2 & (_PAGE_USER | _PAGE_RW)) | - ((prot1 | prot2) & _PAGE_NX); + struct pg_state *st = container_of(pt_st, struct pg_state, ptdump); + pgprotval_t prot = val & PTE_FLAGS_MASK; + pgprotval_t effective; + + if (level > 0) { + pgprotval_t higher_prot = st->prot_levels[level - 1]; + + effective = (higher_prot & prot & (_PAGE_USER | _PAGE_RW)) | + ((higher_prot | prot) & _PAGE_NX); + } else { + effective = prot; + } + + st->prot_levels[level] = effective; } /* @@ -270,16 +282,10 @@ static void note_page(struct ptdump_state *pt_st, unsigned long addr, int level, struct seq_file *m = st->seq; new_prot = val & PTE_FLAGS_MASK; - - if (level > 0) { - new_eff = effective_prot(st->prot_levels[level - 1], - new_prot); - } else { - new_eff = new_prot; - } - - if (level >= 0) - st->prot_levels[level] = new_eff; + if (!val) + new_eff = 0; + else + new_eff = st->prot_levels[level]; /* * If we have a "break" in the series, we need to flush the state that @@ -374,6 +380,7 @@ static void ptdump_walk_pgd_level_core(struct seq_file *m, struct pg_state st = { .ptdump = { .note_page = note_page, + .effective_prot = effective_prot, .range = ptdump_ranges }, .level = -1, diff --git a/include/linux/ptdump.h b/include/linux/ptdump.h index a67065c403c3..ac01502763bf 100644 --- a/include/linux/ptdump.h +++ b/include/linux/ptdump.h @@ -14,6 +14,7 @@ struct ptdump_state { /* level is 0:PGD to 4:PTE, or -1 if 
unknown */ void (*note_page)(struct ptdump_state *st, unsigned long addr, int level, unsigned long val); + void (*effective_prot)(struct ptdump_state *st, int level, u64 val); const struct ptdump_range *range; }; diff --git a/mm/ptdump.c b/mm/ptdump.c index 26208d0d03b7..f4ce916f5602 100644 --- a/mm/ptdump.c +++ b/mm/ptdump.c @@ -36,6 +36,9 @@ static int ptdump_pgd_entry(pgd_t *pgd, unsigned long addr, return note_kasan_page_table(walk, addr); #endif + if (st->effective_prot) + st->effective_prot(st, 0, pgd_val(val)); + if (pgd_leaf(val)) st->note_page(st, addr, 0, pgd_val(val)); @@ -53,6 +56,9 @@ static int ptdump_p4d_entry(p4d_t *p4d, unsigned long addr, return note_kasan_page_table(walk, addr); #endif + if (st->effective_prot) + st->effective_prot(st, 1, p4d_val(val)); + if (p4d_leaf(val)) st->note_page(st, addr, 1, p4d_val(val)); @@ -70,6 +76,9 @@ static int ptdump_pud_entry(pud_t *pud, unsigned long addr, return note_kasan_page_table(walk, addr); #endif + if (st->effective_prot) + st->effective_prot(st, 2, pud_val(val)); + if (pud_leaf(val)) st->note_page(st, addr, 2, pud_val(val)); @@ -87,6 +96,8 @@ static int ptdump_pmd_entry(pmd_t *pmd, unsigned long addr, return note_kasan_page_table(walk, addr); #endif + if (st->effective_prot) + st->effective_prot(st, 3, pmd_val(val)); if (pmd_leaf(val)) st->note_page(st, addr, 3, pmd_val(val)); @@ -97,8 +108,12 @@ static int ptdump_pte_entry(pte_t *pte, unsigned long addr, unsigned long next, struct mm_walk *walk) { struct ptdump_state *st = walk->private; + pte_t val = READ_ONCE(*pte); + + if (st->effective_prot) + st->effective_prot(st, 4, pte_val(val)); - st->note_page(st, addr, 4, pte_val(READ_ONCE(*pte))); + st->note_page(st, addr, 4, pte_val(val)); return 0; } -- cgit v1.2.3 From 99395ee3f7b4accc3a16a6aa4c2abb3774fc33ca Mon Sep 17 00:00:00 2001 From: Steven Price Date: Mon, 1 Jun 2020 21:50:01 -0700 Subject: mm: ptdump: expand type of 'val' in note_page() The page table entry is passed in the 'val' argument to note_page(); however, this was previously an "unsigned long", which is fine on 64-bit platforms. But for 32-bit x86 it is not always big enough to contain a page table entry, which may be 64 bits. Change the type to u64 to ensure that it is always big enough.
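Concretely, on 32-bit x86 with PAE a table entry is 64 bits wide while unsigned long is 32, so the high word - including the NX bit at bit 63 - was silently truncated before this change. The widened callback signatures, as in the ptdump.h hunk below:

        void (*note_page)(struct ptdump_state *st, unsigned long addr,
                          int level, u64 val);
        void (*effective_prot)(struct ptdump_state *st, int level, u64 val);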
[akpm@linux-foundation.org: fix riscv] Reported-by: Jan Beulich Signed-off-by: Steven Price Signed-off-by: Andrew Morton Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Dave Hansen Cc: Ingo Molnar Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: Link: http://lkml.kernel.org/r/20200521152308.33096-3-steven.price@arm.com Signed-off-by: Linus Torvalds --- arch/arm64/mm/dump.c | 2 +- arch/riscv/mm/ptdump.c | 2 +- arch/x86/mm/dump_pagetables.c | 2 +- include/linux/ptdump.h | 2 +- 4 files changed, 4 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/arch/arm64/mm/dump.c b/arch/arm64/mm/dump.c index 860c00ec8bd3..d4313bc0c4c1 100644 --- a/arch/arm64/mm/dump.c +++ b/arch/arm64/mm/dump.c @@ -247,7 +247,7 @@ static void note_prot_wx(struct pg_state *st, unsigned long addr) } static void note_page(struct ptdump_state *pt_st, unsigned long addr, int level, - unsigned long val) + u64 val) { struct pg_state *st = container_of(pt_st, struct pg_state, ptdump); static const char units[] = "KMGTPE"; diff --git a/arch/riscv/mm/ptdump.c b/arch/riscv/mm/ptdump.c index 7eab76a93106..070505d79b06 100644 --- a/arch/riscv/mm/ptdump.c +++ b/arch/riscv/mm/ptdump.c @@ -204,7 +204,7 @@ static void note_prot_wx(struct pg_state *st, unsigned long addr) } static void note_page(struct ptdump_state *pt_st, unsigned long addr, - int level, unsigned long val) + int level, u64 val) { struct pg_state *st = container_of(pt_st, struct pg_state, ptdump); u64 pa = PFN_PHYS(pte_pfn(__pte(val))); diff --git a/arch/x86/mm/dump_pagetables.c b/arch/x86/mm/dump_pagetables.c index 33093fdedb02..ea9010113f69 100644 --- a/arch/x86/mm/dump_pagetables.c +++ b/arch/x86/mm/dump_pagetables.c @@ -273,7 +273,7 @@ static void effective_prot(struct ptdump_state *pt_st, int level, u64 val) * print what we collected so far. */ static void note_page(struct ptdump_state *pt_st, unsigned long addr, int level, - unsigned long val) + u64 val) { struct pg_state *st = container_of(pt_st, struct pg_state, ptdump); pgprotval_t new_prot, new_eff; diff --git a/include/linux/ptdump.h b/include/linux/ptdump.h index ac01502763bf..2a3a95586425 100644 --- a/include/linux/ptdump.h +++ b/include/linux/ptdump.h @@ -13,7 +13,7 @@ struct ptdump_range { struct ptdump_state { /* level is 0:PGD to 4:PTE, or -1 if unknown */ void (*note_page)(struct ptdump_state *st, unsigned long addr, - int level, unsigned long val); + int level, u64 val); void (*effective_prot)(struct ptdump_state *st, int level, u64 val); const struct ptdump_range *range; }; -- cgit v1.2.3 From 4926627793c0a7e7db2bc674e1d06777e86d8dab Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 1 Jun 2020 21:50:45 -0700 Subject: mm: remove __get_vm_area Switch the two remaining callers to use __get_vm_area_caller instead. Signed-off-by: Christoph Hellwig Signed-off-by: Andrew Morton Acked-by: Peter Zijlstra (Intel) Cc: Christian Borntraeger Cc: Christophe Leroy Cc: Daniel Vetter Cc: David Airlie Cc: Gao Xiang Cc: Greg Kroah-Hartman Cc: Haiyang Zhang Cc: Johannes Weiner Cc: "K. Y. 
Srinivasan" Cc: Laura Abbott Cc: Mark Rutland Cc: Michael Kelley Cc: Minchan Kim Cc: Nitin Gupta Cc: Robin Murphy Cc: Sakari Ailus Cc: Stephen Hemminger Cc: Sumit Semwal Cc: Wei Liu Cc: Benjamin Herrenschmidt Cc: Catalin Marinas Cc: Heiko Carstens Cc: Paul Mackerras Cc: Vasily Gorbik Cc: Will Deacon Link: http://lkml.kernel.org/r/20200414131348.444715-9-hch@lst.de Signed-off-by: Linus Torvalds --- arch/powerpc/kernel/pci_64.c | 3 ++- arch/sh/kernel/cpu/sh4/sq.c | 3 ++- include/linux/vmalloc.h | 2 -- mm/vmalloc.c | 8 -------- 4 files changed, 4 insertions(+), 12 deletions(-) (limited to 'include/linux') diff --git a/arch/powerpc/kernel/pci_64.c b/arch/powerpc/kernel/pci_64.c index 2a976314f169..d9ac980c398c 100644 --- a/arch/powerpc/kernel/pci_64.c +++ b/arch/powerpc/kernel/pci_64.c @@ -132,7 +132,8 @@ void __iomem *ioremap_phb(phys_addr_t paddr, unsigned long size) * address decoding but I'd rather not deal with those outside of the * reserved 64K legacy region. */ - area = __get_vm_area(size, 0, PHB_IO_BASE, PHB_IO_END); + area = __get_vm_area_caller(size, 0, PHB_IO_BASE, PHB_IO_END, + __builtin_return_address(0)); if (!area) return NULL; diff --git a/arch/sh/kernel/cpu/sh4/sq.c b/arch/sh/kernel/cpu/sh4/sq.c index 934ff84844fa..d432164b23b7 100644 --- a/arch/sh/kernel/cpu/sh4/sq.c +++ b/arch/sh/kernel/cpu/sh4/sq.c @@ -103,7 +103,8 @@ static int __sq_remap(struct sq_mapping *map, pgprot_t prot) #if defined(CONFIG_MMU) struct vm_struct *vma; - vma = __get_vm_area(map->size, VM_ALLOC, map->sq_addr, SQ_ADDRMAX); + vma = __get_vm_area_caller(map->size, VM_ALLOC, map->sq_addr, + SQ_ADDRMAX, __builtin_return_address(0)); if (!vma) return -ENOMEM; diff --git a/include/linux/vmalloc.h b/include/linux/vmalloc.h index a95d3cc74d79..ff69d1e037ca 100644 --- a/include/linux/vmalloc.h +++ b/include/linux/vmalloc.h @@ -161,8 +161,6 @@ static inline size_t get_vm_area_size(const struct vm_struct *area) extern struct vm_struct *get_vm_area(unsigned long size, unsigned long flags); extern struct vm_struct *get_vm_area_caller(unsigned long size, unsigned long flags, const void *caller); -extern struct vm_struct *__get_vm_area(unsigned long size, unsigned long flags, - unsigned long start, unsigned long end); extern struct vm_struct *__get_vm_area_caller(unsigned long size, unsigned long flags, unsigned long start, unsigned long end, diff --git a/mm/vmalloc.c b/mm/vmalloc.c index 9a8227afa073..6e9527f131d1 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c @@ -2128,14 +2128,6 @@ static struct vm_struct *__get_vm_area_node(unsigned long size, return area; } -struct vm_struct *__get_vm_area(unsigned long size, unsigned long flags, - unsigned long start, unsigned long end) -{ - return __get_vm_area_node(size, 1, flags, start, end, NUMA_NO_NODE, - GFP_KERNEL, __builtin_return_address(0)); -} -EXPORT_SYMBOL_GPL(__get_vm_area); - struct vm_struct *__get_vm_area_caller(unsigned long size, unsigned long flags, unsigned long start, unsigned long end, const void *caller) -- cgit v1.2.3 From 8b136018da7bf49b988a24064fc45c290baffd93 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 1 Jun 2020 21:50:53 -0700 Subject: mm: rename CONFIG_PGTABLE_MAPPING to CONFIG_ZSMALLOC_PGTABLE_MAPPING Rename the Kconfig variable to clarify the scope. Signed-off-by: Christoph Hellwig Signed-off-by: Andrew Morton Acked-by: Minchan Kim Acked-by: Peter Zijlstra (Intel) Cc: Christian Borntraeger Cc: Christophe Leroy Cc: Daniel Vetter Cc: David Airlie Cc: Gao Xiang Cc: Greg Kroah-Hartman Cc: Haiyang Zhang Cc: Johannes Weiner Cc: "K. Y. 
Srinivasan" Cc: Laura Abbott Cc: Mark Rutland Cc: Michael Kelley Cc: Nitin Gupta Cc: Robin Murphy Cc: Sakari Ailus Cc: Stephen Hemminger Cc: Sumit Semwal Cc: Wei Liu Cc: Benjamin Herrenschmidt Cc: Catalin Marinas Cc: Heiko Carstens Cc: Paul Mackerras Cc: Vasily Gorbik Cc: Will Deacon Link: http://lkml.kernel.org/r/20200414131348.444715-11-hch@lst.de Signed-off-by: Linus Torvalds --- arch/arm/configs/omap2plus_defconfig | 2 +- include/linux/zsmalloc.h | 2 +- mm/Kconfig | 2 +- mm/zsmalloc.c | 8 ++++---- 4 files changed, 7 insertions(+), 7 deletions(-) (limited to 'include/linux') diff --git a/arch/arm/configs/omap2plus_defconfig b/arch/arm/configs/omap2plus_defconfig index 8b83d4a5d309..fe383f5a92fb 100644 --- a/arch/arm/configs/omap2plus_defconfig +++ b/arch/arm/configs/omap2plus_defconfig @@ -81,7 +81,7 @@ CONFIG_PARTITION_ADVANCED=y CONFIG_BINFMT_MISC=y CONFIG_CMA=y CONFIG_ZSMALLOC=m -CONFIG_PGTABLE_MAPPING=y +CONFIG_ZSMALLOC_PGTABLE_MAPPING=y CONFIG_NET=y CONFIG_PACKET=y CONFIG_UNIX=y diff --git a/include/linux/zsmalloc.h b/include/linux/zsmalloc.h index 2219cce81ca4..0fdbf653b173 100644 --- a/include/linux/zsmalloc.h +++ b/include/linux/zsmalloc.h @@ -20,7 +20,7 @@ * zsmalloc mapping modes * * NOTE: These only make a difference when a mapped object spans pages. - * They also have no effect when PGTABLE_MAPPING is selected. + * They also have no effect when ZSMALLOC_PGTABLE_MAPPING is selected. */ enum zs_mapmode { ZS_MM_RW, /* normal read-write mapping */ diff --git a/mm/Kconfig b/mm/Kconfig index c1acc34c1c35..09a9edfb8461 100644 --- a/mm/Kconfig +++ b/mm/Kconfig @@ -705,7 +705,7 @@ config ZSMALLOC returned by an alloc(). This handle must be mapped in order to access the allocated space. -config PGTABLE_MAPPING +config ZSMALLOC_PGTABLE_MAPPING bool "Use page table mapping to access object in zsmalloc" depends on ZSMALLOC help diff --git a/mm/zsmalloc.c b/mm/zsmalloc.c index 2f836a2b993f..ac0524330b9b 100644 --- a/mm/zsmalloc.c +++ b/mm/zsmalloc.c @@ -293,7 +293,7 @@ struct zspage { }; struct mapping_area { -#ifdef CONFIG_PGTABLE_MAPPING +#ifdef CONFIG_ZSMALLOC_PGTABLE_MAPPING struct vm_struct *vm; /* vm area for mapping object that span pages */ #else char *vm_buf; /* copy buffer for objects that span pages */ @@ -1113,7 +1113,7 @@ static struct zspage *find_get_zspage(struct size_class *class) return zspage; } -#ifdef CONFIG_PGTABLE_MAPPING +#ifdef CONFIG_ZSMALLOC_PGTABLE_MAPPING static inline int __zs_cpu_up(struct mapping_area *area) { /* @@ -1151,7 +1151,7 @@ static inline void __zs_unmap_object(struct mapping_area *area, unmap_kernel_range(addr, PAGE_SIZE * 2); } -#else /* CONFIG_PGTABLE_MAPPING */ +#else /* CONFIG_ZSMALLOC_PGTABLE_MAPPING */ static inline int __zs_cpu_up(struct mapping_area *area) { @@ -1233,7 +1233,7 @@ out: pagefault_enable(); } -#endif /* CONFIG_PGTABLE_MAPPING */ +#endif /* CONFIG_ZSMALLOC_PGTABLE_MAPPING */ static int zs_cpu_prepare(unsigned int cpu) { -- cgit v1.2.3 From ed1f324c5fed06c91f30a36aedb66f34244ab86e Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 1 Jun 2020 21:51:19 -0700 Subject: mm: remove map_vm_range Switch all callers to map_kernel_range, which symmetric to the unmap side (as well as the _noflush versions). Signed-off-by: Christoph Hellwig Signed-off-by: Andrew Morton Acked-by: Peter Zijlstra (Intel) Cc: Christian Borntraeger Cc: Christophe Leroy Cc: Daniel Vetter Cc: David Airlie Cc: Gao Xiang Cc: Greg Kroah-Hartman Cc: Haiyang Zhang Cc: Johannes Weiner Cc: "K. Y. 
Srinivasan" Cc: Laura Abbott Cc: Mark Rutland Cc: Michael Kelley Cc: Minchan Kim Cc: Nitin Gupta Cc: Robin Murphy Cc: Sakari Ailus Cc: Stephen Hemminger Cc: Sumit Semwal Cc: Wei Liu Cc: Benjamin Herrenschmidt Cc: Catalin Marinas Cc: Heiko Carstens Cc: Paul Mackerras Cc: Vasily Gorbik Cc: Will Deacon Link: http://lkml.kernel.org/r/20200414131348.444715-17-hch@lst.de Signed-off-by: Linus Torvalds --- Documentation/core-api/cachetlb.rst | 2 +- include/linux/vmalloc.h | 10 ++++------ mm/vmalloc.c | 21 +++++++-------------- mm/zsmalloc.c | 4 +++- net/ceph/ceph_common.c | 3 +-- 5 files changed, 16 insertions(+), 24 deletions(-) (limited to 'include/linux') diff --git a/Documentation/core-api/cachetlb.rst b/Documentation/core-api/cachetlb.rst index 93cb65d52720..a1582cc79f0f 100644 --- a/Documentation/core-api/cachetlb.rst +++ b/Documentation/core-api/cachetlb.rst @@ -213,7 +213,7 @@ Here are the routines, one by one: there will be no entries in the cache for the kernel address space for virtual addresses in the range 'start' to 'end-1'. - The first of these two routines is invoked after map_vm_area() + The first of these two routines is invoked after map_kernel_range() has installed the page table entries. The second is invoked before unmap_kernel_range() deletes the page table entries. diff --git a/include/linux/vmalloc.h b/include/linux/vmalloc.h index ff69d1e037ca..a1e9bdc3ad9e 100644 --- a/include/linux/vmalloc.h +++ b/include/linux/vmalloc.h @@ -168,11 +168,11 @@ extern struct vm_struct *__get_vm_area_caller(unsigned long size, extern struct vm_struct *remove_vm_area(const void *addr); extern struct vm_struct *find_vm_area(const void *addr); -extern int map_vm_area(struct vm_struct *area, pgprot_t prot, - struct page **pages); #ifdef CONFIG_MMU extern int map_kernel_range_noflush(unsigned long start, unsigned long size, pgprot_t prot, struct page **pages); +int map_kernel_range(unsigned long start, unsigned long size, pgprot_t prot, + struct page **pages); extern void unmap_kernel_range_noflush(unsigned long addr, unsigned long size); extern void unmap_kernel_range(unsigned long addr, unsigned long size); static inline void set_vm_flush_reset_perms(void *addr) @@ -189,14 +189,12 @@ map_kernel_range_noflush(unsigned long start, unsigned long size, { return size >> PAGE_SHIFT; } +#define map_kernel_range map_kernel_range_noflush static inline void unmap_kernel_range_noflush(unsigned long addr, unsigned long size) { } -static inline void -unmap_kernel_range(unsigned long addr, unsigned long size) -{ -} +#define unmap_kernel_range unmap_kernel_range_noflush static inline void set_vm_flush_reset_perms(void *addr) { } diff --git a/mm/vmalloc.c b/mm/vmalloc.c index aab00ddee686..49ca687d8853 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c @@ -273,8 +273,8 @@ int map_kernel_range_noflush(unsigned long addr, unsigned long size, return 0; } -static int map_kernel_range(unsigned long start, unsigned long size, - pgprot_t prot, struct page **pages) +int map_kernel_range(unsigned long start, unsigned long size, pgprot_t prot, + struct page **pages) { int ret; @@ -2028,16 +2028,6 @@ void unmap_kernel_range(unsigned long addr, unsigned long size) flush_tlb_kernel_range(addr, end); } -int map_vm_area(struct vm_struct *area, pgprot_t prot, struct page **pages) -{ - unsigned long addr = (unsigned long)area->addr; - int err; - - err = map_kernel_range(addr, get_vm_area_size(area), prot, pages); - - return err > 0 ? 
0 : err; -} - static inline void setup_vmalloc_vm_locked(struct vm_struct *vm, struct vmap_area *va, unsigned long flags, const void *caller) { @@ -2409,7 +2399,8 @@ void *vmap(struct page **pages, unsigned int count, if (!area) return NULL; - if (map_vm_area(area, prot, pages)) { + if (map_kernel_range((unsigned long)area->addr, size, prot, + pages) < 0) { vunmap(area->addr); return NULL; } @@ -2472,8 +2463,10 @@ static void *__vmalloc_area_node(struct vm_struct *area, gfp_t gfp_mask, } atomic_long_add(area->nr_pages, &nr_vmalloc_pages); - if (map_vm_area(area, prot, pages)) + if (map_kernel_range((unsigned long)area->addr, get_vm_area_size(area), + prot, pages) < 0) goto fail; + return area->addr; fail: diff --git a/mm/zsmalloc.c b/mm/zsmalloc.c index ac0524330b9b..f6dc0673e62c 100644 --- a/mm/zsmalloc.c +++ b/mm/zsmalloc.c @@ -1138,7 +1138,9 @@ static inline void __zs_cpu_down(struct mapping_area *area) static inline void *__zs_map_object(struct mapping_area *area, struct page *pages[2], int off, int size) { - BUG_ON(map_vm_area(area->vm, PAGE_KERNEL, pages)); + unsigned long addr = (unsigned long)area->vm->addr; + + BUG_ON(map_kernel_range(addr, PAGE_SIZE * 2, PAGE_KERNEL, pages) < 0); area->vm_addr = area->vm->addr; return area->vm_addr + off; } diff --git a/net/ceph/ceph_common.c b/net/ceph/ceph_common.c index a0e97f6c1072..66f22e8aa529 100644 --- a/net/ceph/ceph_common.c +++ b/net/ceph/ceph_common.c @@ -190,8 +190,7 @@ EXPORT_SYMBOL(ceph_compare_options); * kvmalloc() doesn't fall back to the vmalloc allocator unless flags are * compatible with (a superset of) GFP_KERNEL. This is because while the * actual pages are allocated with the specified flags, the page table pages - * are always allocated with GFP_KERNEL. map_vm_area() doesn't even take - * flags because GFP_KERNEL is hard-coded in {p4d,pud,pmd,pte}_alloc(). + * are always allocated with GFP_KERNEL. * * ceph_kvmalloc() may be called with GFP_KERNEL, GFP_NOFS or GFP_NOIO. */ -- cgit v1.2.3 From d4efd79a81abc7096a418ee3103f261cfb6ab634 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 1 Jun 2020 21:51:27 -0700 Subject: mm: remove the prot argument from vm_map_ram This is always PAGE_KERNEL - for long term mappings with other properties vmap should be used. Signed-off-by: Christoph Hellwig Signed-off-by: Andrew Morton Acked-by: Peter Zijlstra (Intel) Cc: Christian Borntraeger Cc: Christophe Leroy Cc: Daniel Vetter Cc: David Airlie Cc: Gao Xiang Cc: Greg Kroah-Hartman Cc: Haiyang Zhang Cc: Johannes Weiner Cc: "K. Y. 
Srinivasan" Cc: Laura Abbott Cc: Mark Rutland Cc: Michael Kelley Cc: Minchan Kim Cc: Nitin Gupta Cc: Robin Murphy Cc: Sakari Ailus Cc: Stephen Hemminger Cc: Sumit Semwal Cc: Wei Liu Cc: Benjamin Herrenschmidt Cc: Catalin Marinas Cc: Heiko Carstens Cc: Paul Mackerras Cc: Vasily Gorbik Cc: Will Deacon Link: http://lkml.kernel.org/r/20200414131348.444715-19-hch@lst.de Signed-off-by: Linus Torvalds --- drivers/gpu/drm/i915/gem/selftests/mock_dmabuf.c | 2 +- drivers/media/common/videobuf2/videobuf2-dma-sg.c | 3 +-- drivers/media/common/videobuf2/videobuf2-vmalloc.c | 3 +-- fs/erofs/decompressor.c | 2 +- fs/xfs/xfs_buf.c | 2 +- include/linux/vmalloc.h | 3 +-- mm/nommu.c | 2 +- mm/vmalloc.c | 4 ++-- 8 files changed, 9 insertions(+), 12 deletions(-) (limited to 'include/linux') diff --git a/drivers/gpu/drm/i915/gem/selftests/mock_dmabuf.c b/drivers/gpu/drm/i915/gem/selftests/mock_dmabuf.c index 9272bef57092..debaf7b18ab5 100644 --- a/drivers/gpu/drm/i915/gem/selftests/mock_dmabuf.c +++ b/drivers/gpu/drm/i915/gem/selftests/mock_dmabuf.c @@ -66,7 +66,7 @@ static void *mock_dmabuf_vmap(struct dma_buf *dma_buf) { struct mock_dmabuf *mock = to_mock(dma_buf); - return vm_map_ram(mock->pages, mock->npages, 0, PAGE_KERNEL); + return vm_map_ram(mock->pages, mock->npages, 0); } static void mock_dmabuf_vunmap(struct dma_buf *dma_buf, void *vaddr) diff --git a/drivers/media/common/videobuf2/videobuf2-dma-sg.c b/drivers/media/common/videobuf2/videobuf2-dma-sg.c index 6db60e9d5183..92072a08af25 100644 --- a/drivers/media/common/videobuf2/videobuf2-dma-sg.c +++ b/drivers/media/common/videobuf2/videobuf2-dma-sg.c @@ -309,8 +309,7 @@ static void *vb2_dma_sg_vaddr(void *buf_priv) if (buf->db_attach) buf->vaddr = dma_buf_vmap(buf->db_attach->dmabuf); else - buf->vaddr = vm_map_ram(buf->pages, - buf->num_pages, -1, PAGE_KERNEL); + buf->vaddr = vm_map_ram(buf->pages, buf->num_pages, -1); } /* add offset in case userptr is not page-aligned */ diff --git a/drivers/media/common/videobuf2/videobuf2-vmalloc.c b/drivers/media/common/videobuf2/videobuf2-vmalloc.c index 1a4f0ca87c7c..c66fda4a65e4 100644 --- a/drivers/media/common/videobuf2/videobuf2-vmalloc.c +++ b/drivers/media/common/videobuf2/videobuf2-vmalloc.c @@ -107,8 +107,7 @@ static void *vb2_vmalloc_get_userptr(struct device *dev, unsigned long vaddr, buf->vaddr = (__force void *) ioremap(__pfn_to_phys(nums[0]), size + offset); } else { - buf->vaddr = vm_map_ram(frame_vector_pages(vec), n_pages, -1, - PAGE_KERNEL); + buf->vaddr = vm_map_ram(frame_vector_pages(vec), n_pages, -1); } if (!buf->vaddr) diff --git a/fs/erofs/decompressor.c b/fs/erofs/decompressor.c index 5d2d81940679..7628816f2453 100644 --- a/fs/erofs/decompressor.c +++ b/fs/erofs/decompressor.c @@ -274,7 +274,7 @@ static int z_erofs_decompress_generic(struct z_erofs_decompress_req *rq, i = 0; while (1) { - dst = vm_map_ram(rq->out, nrpages_out, -1, PAGE_KERNEL); + dst = vm_map_ram(rq->out, nrpages_out, -1); /* retry two more times (totally 3 times) */ if (dst || ++i >= 3) diff --git a/fs/xfs/xfs_buf.c b/fs/xfs/xfs_buf.c index 9ec3eaf1c618..65538d18e64f 100644 --- a/fs/xfs/xfs_buf.c +++ b/fs/xfs/xfs_buf.c @@ -477,7 +477,7 @@ _xfs_buf_map_pages( nofs_flag = memalloc_nofs_save(); do { bp->b_addr = vm_map_ram(bp->b_pages, bp->b_page_count, - -1, PAGE_KERNEL); + -1); if (bp->b_addr) break; vm_unmap_aliases(); diff --git a/include/linux/vmalloc.h b/include/linux/vmalloc.h index a1e9bdc3ad9e..5488cea5ef11 100644 --- a/include/linux/vmalloc.h +++ b/include/linux/vmalloc.h @@ -88,8 +88,7 @@ struct vmap_area { * 
Highlevel APIs for driver use */ extern void vm_unmap_ram(const void *mem, unsigned int count); -extern void *vm_map_ram(struct page **pages, unsigned int count, - int node, pgprot_t prot); +extern void *vm_map_ram(struct page **pages, unsigned int count, int node); extern void vm_unmap_aliases(void); #ifdef CONFIG_MMU diff --git a/mm/nommu.c b/mm/nommu.c index 318df4e236c9..4f07b7ef0297 100644 --- a/mm/nommu.c +++ b/mm/nommu.c @@ -351,7 +351,7 @@ void vunmap(const void *addr) } EXPORT_SYMBOL(vunmap); -void *vm_map_ram(struct page **pages, unsigned int count, int node, pgprot_t prot) +void *vm_map_ram(struct page **pages, unsigned int count, int node) { BUG(); return NULL; diff --git a/mm/vmalloc.c b/mm/vmalloc.c index 3b9b1366baa3..9848156a1c6a 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c @@ -1835,7 +1835,7 @@ EXPORT_SYMBOL(vm_unmap_ram); * * Returns: a pointer to the address that has been mapped, or %NULL on failure */ -void *vm_map_ram(struct page **pages, unsigned int count, int node, pgprot_t prot) +void *vm_map_ram(struct page **pages, unsigned int count, int node) { unsigned long size = (unsigned long)count << PAGE_SHIFT; unsigned long addr; @@ -1859,7 +1859,7 @@ void *vm_map_ram(struct page **pages, unsigned int count, int node, pgprot_t pro kasan_unpoison_vmalloc(mem, size); - if (map_kernel_range(addr, size, prot, pages) < 0) { + if (map_kernel_range(addr, size, PAGE_KERNEL, pages) < 0) { vm_unmap_ram(mem, count); return NULL; } -- cgit v1.2.3 From 88dca4ca5a93d2c09e5bbc6a62fbfc3af83c4fca Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 1 Jun 2020 21:51:40 -0700 Subject: mm: remove the pgprot argument to __vmalloc The pgprot argument to __vmalloc is always PAGE_KERNEL now, so remove it. Signed-off-by: Christoph Hellwig Signed-off-by: Andrew Morton Reviewed-by: Michael Kelley [hyperv] Acked-by: Gao Xiang [erofs] Acked-by: Peter Zijlstra (Intel) Acked-by: Wei Liu Cc: Christian Borntraeger Cc: Christophe Leroy Cc: Daniel Vetter Cc: David Airlie Cc: Greg Kroah-Hartman Cc: Haiyang Zhang Cc: Johannes Weiner Cc: "K. Y. 
Srinivasan" Cc: Laura Abbott Cc: Mark Rutland Cc: Minchan Kim Cc: Nitin Gupta Cc: Robin Murphy Cc: Sakari Ailus Cc: Stephen Hemminger Cc: Sumit Semwal Cc: Benjamin Herrenschmidt Cc: Catalin Marinas Cc: Heiko Carstens Cc: Paul Mackerras Cc: Vasily Gorbik Cc: Will Deacon Link: http://lkml.kernel.org/r/20200414131348.444715-22-hch@lst.de Signed-off-by: Linus Torvalds --- arch/x86/hyperv/hv_init.c | 3 +-- arch/x86/include/asm/kvm_host.h | 3 +-- arch/x86/kvm/svm/sev.c | 3 +-- drivers/block/drbd/drbd_bitmap.c | 4 +--- drivers/gpu/drm/etnaviv/etnaviv_dump.c | 4 ++-- drivers/lightnvm/pblk-init.c | 5 ++--- drivers/md/dm-bufio.c | 4 ++-- drivers/mtd/ubi/io.c | 4 ++-- drivers/scsi/sd_zbc.c | 3 +-- fs/gfs2/dir.c | 9 ++++----- fs/gfs2/quota.c | 2 +- fs/nfs/blocklayout/extent_tree.c | 2 +- fs/ntfs/malloc.h | 2 +- fs/ubifs/debug.c | 2 +- fs/ubifs/lprops.c | 2 +- fs/ubifs/lpt_commit.c | 4 ++-- fs/ubifs/orphan.c | 2 +- fs/xfs/kmem.c | 2 +- include/linux/vmalloc.h | 2 +- kernel/bpf/core.c | 6 +++--- kernel/groups.c | 2 +- kernel/module.c | 3 +-- mm/nommu.c | 15 +++++++-------- mm/page_alloc.c | 2 +- mm/percpu.c | 2 +- mm/vmalloc.c | 4 ++-- net/bridge/netfilter/ebtables.c | 6 ++---- sound/core/memalloc.c | 2 +- sound/core/pcm_memory.c | 2 +- 29 files changed, 47 insertions(+), 59 deletions(-) (limited to 'include/linux') diff --git a/arch/x86/hyperv/hv_init.c b/arch/x86/hyperv/hv_init.c index 697ddd2afef9..e2137070386a 100644 --- a/arch/x86/hyperv/hv_init.c +++ b/arch/x86/hyperv/hv_init.c @@ -97,8 +97,7 @@ static int hv_cpu_init(unsigned int cpu) * not be stopped in the case of CPU offlining and the VM will hang. */ if (!*hvp) { - *hvp = __vmalloc(PAGE_SIZE, GFP_KERNEL | __GFP_ZERO, - PAGE_KERNEL); + *hvp = __vmalloc(PAGE_SIZE, GFP_KERNEL | __GFP_ZERO); } if (*hvp) { diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 0a6b35353fc7..e94b3de564d6 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -1279,8 +1279,7 @@ extern struct kmem_cache *x86_fpu_cache; #define __KVM_HAVE_ARCH_VM_ALLOC static inline struct kvm *kvm_arch_alloc_vm(void) { - return __vmalloc(kvm_x86_ops.vm_size, - GFP_KERNEL_ACCOUNT | __GFP_ZERO, PAGE_KERNEL); + return __vmalloc(kvm_x86_ops.vm_size, GFP_KERNEL_ACCOUNT | __GFP_ZERO); } void kvm_arch_free_vm(struct kvm *kvm); diff --git a/arch/x86/kvm/svm/sev.c b/arch/x86/kvm/svm/sev.c index 89f7f3aebd31..5573a97f1520 100644 --- a/arch/x86/kvm/svm/sev.c +++ b/arch/x86/kvm/svm/sev.c @@ -336,8 +336,7 @@ static struct page **sev_pin_memory(struct kvm *kvm, unsigned long uaddr, /* Avoid using vmalloc for smaller buffers. 
*/ size = npages * sizeof(struct page *); if (size > PAGE_SIZE) - pages = __vmalloc(size, GFP_KERNEL_ACCOUNT | __GFP_ZERO, - PAGE_KERNEL); + pages = __vmalloc(size, GFP_KERNEL_ACCOUNT | __GFP_ZERO); else pages = kmalloc(size, GFP_KERNEL_ACCOUNT); diff --git a/drivers/block/drbd/drbd_bitmap.c b/drivers/block/drbd/drbd_bitmap.c index 15e99697234a..df53dca5d02c 100644 --- a/drivers/block/drbd/drbd_bitmap.c +++ b/drivers/block/drbd/drbd_bitmap.c @@ -396,9 +396,7 @@ static struct page **bm_realloc_pages(struct drbd_bitmap *b, unsigned long want) bytes = sizeof(struct page *)*want; new_pages = kzalloc(bytes, GFP_NOIO | __GFP_NOWARN); if (!new_pages) { - new_pages = __vmalloc(bytes, - GFP_NOIO | __GFP_ZERO, - PAGE_KERNEL); + new_pages = __vmalloc(bytes, GFP_NOIO | __GFP_ZERO); if (!new_pages) return NULL; } diff --git a/drivers/gpu/drm/etnaviv/etnaviv_dump.c b/drivers/gpu/drm/etnaviv/etnaviv_dump.c index 648cf0207309..706af0304ca4 100644 --- a/drivers/gpu/drm/etnaviv/etnaviv_dump.c +++ b/drivers/gpu/drm/etnaviv/etnaviv_dump.c @@ -154,8 +154,8 @@ void etnaviv_core_dump(struct etnaviv_gem_submit *submit) file_size += sizeof(*iter.hdr) * n_obj; /* Allocate the file in vmalloc memory, it's likely to be big */ - iter.start = __vmalloc(file_size, GFP_KERNEL | __GFP_NOWARN | __GFP_NORETRY, - PAGE_KERNEL); + iter.start = __vmalloc(file_size, GFP_KERNEL | __GFP_NOWARN | + __GFP_NORETRY); if (!iter.start) { mutex_unlock(&gpu->mmu_context->lock); dev_warn(gpu->dev, "failed to allocate devcoredump file\n"); diff --git a/drivers/lightnvm/pblk-init.c b/drivers/lightnvm/pblk-init.c index 9a967a2e83dd..6e677ff62cc9 100644 --- a/drivers/lightnvm/pblk-init.c +++ b/drivers/lightnvm/pblk-init.c @@ -145,9 +145,8 @@ static int pblk_l2p_init(struct pblk *pblk, bool factory_init) int ret = 0; map_size = pblk_trans_map_size(pblk); - pblk->trans_map = __vmalloc(map_size, GFP_KERNEL | __GFP_NOWARN - | __GFP_RETRY_MAYFAIL | __GFP_HIGHMEM, - PAGE_KERNEL); + pblk->trans_map = __vmalloc(map_size, GFP_KERNEL | __GFP_NOWARN | + __GFP_RETRY_MAYFAIL | __GFP_HIGHMEM); if (!pblk->trans_map) { pblk_err(pblk, "failed to allocate L2P (need %zu of memory)\n", map_size); diff --git a/drivers/md/dm-bufio.c b/drivers/md/dm-bufio.c index 2d519c223562..d1786cfd7f22 100644 --- a/drivers/md/dm-bufio.c +++ b/drivers/md/dm-bufio.c @@ -400,13 +400,13 @@ static void *alloc_buffer_data(struct dm_bufio_client *c, gfp_t gfp_mask, */ if (gfp_mask & __GFP_NORETRY) { unsigned noio_flag = memalloc_noio_save(); - void *ptr = __vmalloc(c->block_size, gfp_mask, PAGE_KERNEL); + void *ptr = __vmalloc(c->block_size, gfp_mask); memalloc_noio_restore(noio_flag); return ptr; } - return __vmalloc(c->block_size, gfp_mask, PAGE_KERNEL); + return __vmalloc(c->block_size, gfp_mask); } /* diff --git a/drivers/mtd/ubi/io.c b/drivers/mtd/ubi/io.c index b57b84fb97d0..14d890b00d2c 100644 --- a/drivers/mtd/ubi/io.c +++ b/drivers/mtd/ubi/io.c @@ -1297,7 +1297,7 @@ static int self_check_write(struct ubi_device *ubi, const void *buf, int pnum, if (!ubi_dbg_chk_io(ubi)) return 0; - buf1 = __vmalloc(len, GFP_NOFS, PAGE_KERNEL); + buf1 = __vmalloc(len, GFP_NOFS); if (!buf1) { ubi_err(ubi, "cannot allocate memory to check writes"); return 0; @@ -1361,7 +1361,7 @@ int ubi_self_check_all_ff(struct ubi_device *ubi, int pnum, int offset, int len) if (!ubi_dbg_chk_io(ubi)) return 0; - buf = __vmalloc(len, GFP_NOFS, PAGE_KERNEL); + buf = __vmalloc(len, GFP_NOFS); if (!buf) { ubi_err(ubi, "cannot allocate memory to check for 0xFFs"); return 0; diff --git a/drivers/scsi/sd_zbc.c 
b/drivers/scsi/sd_zbc.c index f45c22b09726..8be27426aa66 100644 --- a/drivers/scsi/sd_zbc.c +++ b/drivers/scsi/sd_zbc.c @@ -136,8 +136,7 @@ static void *sd_zbc_alloc_report_buffer(struct scsi_disk *sdkp, while (bufsize >= SECTOR_SIZE) { buf = __vmalloc(bufsize, - GFP_KERNEL | __GFP_ZERO | __GFP_NORETRY, - PAGE_KERNEL); + GFP_KERNEL | __GFP_ZERO | __GFP_NORETRY); if (buf) { *buflen = bufsize; return buf; diff --git a/fs/gfs2/dir.c b/fs/gfs2/dir.c index c3f7732415be..c0f2875c946c 100644 --- a/fs/gfs2/dir.c +++ b/fs/gfs2/dir.c @@ -354,7 +354,7 @@ static __be64 *gfs2_dir_get_hash_table(struct gfs2_inode *ip) hc = kmalloc(hsize, GFP_NOFS | __GFP_NOWARN); if (hc == NULL) - hc = __vmalloc(hsize, GFP_NOFS, PAGE_KERNEL); + hc = __vmalloc(hsize, GFP_NOFS); if (hc == NULL) return ERR_PTR(-ENOMEM); @@ -1166,7 +1166,7 @@ static int dir_double_exhash(struct gfs2_inode *dip) hc2 = kmalloc_array(hsize_bytes, 2, GFP_NOFS | __GFP_NOWARN); if (hc2 == NULL) - hc2 = __vmalloc(hsize_bytes * 2, GFP_NOFS, PAGE_KERNEL); + hc2 = __vmalloc(hsize_bytes * 2, GFP_NOFS); if (!hc2) return -ENOMEM; @@ -1327,7 +1327,7 @@ static void *gfs2_alloc_sort_buffer(unsigned size) if (size < KMALLOC_MAX_SIZE) ptr = kmalloc(size, GFP_NOFS | __GFP_NOWARN); if (!ptr) - ptr = __vmalloc(size, GFP_NOFS, PAGE_KERNEL); + ptr = __vmalloc(size, GFP_NOFS); return ptr; } @@ -1987,8 +1987,7 @@ static int leaf_dealloc(struct gfs2_inode *dip, u32 index, u32 len, ht = kzalloc(size, GFP_NOFS | __GFP_NOWARN); if (ht == NULL) - ht = __vmalloc(size, GFP_NOFS | __GFP_NOWARN | __GFP_ZERO, - PAGE_KERNEL); + ht = __vmalloc(size, GFP_NOFS | __GFP_NOWARN | __GFP_ZERO); if (!ht) return -ENOMEM; diff --git a/fs/gfs2/quota.c b/fs/gfs2/quota.c index 8259fef3f986..4b67d47a7e00 100644 --- a/fs/gfs2/quota.c +++ b/fs/gfs2/quota.c @@ -1365,7 +1365,7 @@ int gfs2_quota_init(struct gfs2_sbd *sdp) sdp->sd_quota_bitmap = kzalloc(bm_size, GFP_NOFS | __GFP_NOWARN); if (sdp->sd_quota_bitmap == NULL) sdp->sd_quota_bitmap = __vmalloc(bm_size, GFP_NOFS | - __GFP_ZERO, PAGE_KERNEL); + __GFP_ZERO); if (!sdp->sd_quota_bitmap) return error; diff --git a/fs/nfs/blocklayout/extent_tree.c b/fs/nfs/blocklayout/extent_tree.c index 7a57ff2528af..8f7cff7a4293 100644 --- a/fs/nfs/blocklayout/extent_tree.c +++ b/fs/nfs/blocklayout/extent_tree.c @@ -582,7 +582,7 @@ retry: if (!arg->layoutupdate_pages) return -ENOMEM; - start_p = __vmalloc(buffer_size, GFP_NOFS, PAGE_KERNEL); + start_p = __vmalloc(buffer_size, GFP_NOFS); if (!start_p) { kfree(arg->layoutupdate_pages); return -ENOMEM; diff --git a/fs/ntfs/malloc.h b/fs/ntfs/malloc.h index 842b0bfc3ac9..7068425735f1 100644 --- a/fs/ntfs/malloc.h +++ b/fs/ntfs/malloc.h @@ -34,7 +34,7 @@ static inline void *__ntfs_malloc(unsigned long size, gfp_t gfp_mask) /* return (void *)__get_free_page(gfp_mask); */ } if (likely((size >> PAGE_SHIFT) < totalram_pages())) - return __vmalloc(size, gfp_mask, PAGE_KERNEL); + return __vmalloc(size, gfp_mask); return NULL; } diff --git a/fs/ubifs/debug.c b/fs/ubifs/debug.c index 0f5a480fe264..31288d8fa2ce 100644 --- a/fs/ubifs/debug.c +++ b/fs/ubifs/debug.c @@ -815,7 +815,7 @@ void ubifs_dump_leb(const struct ubifs_info *c, int lnum) pr_err("(pid %d) start dumping LEB %d\n", current->pid, lnum); - buf = __vmalloc(c->leb_size, GFP_NOFS, PAGE_KERNEL); + buf = __vmalloc(c->leb_size, GFP_NOFS); if (!buf) { ubifs_err(c, "cannot allocate memory for dumping LEB %d", lnum); return; diff --git a/fs/ubifs/lprops.c b/fs/ubifs/lprops.c index 29826c51883a..22bfda158f7f 100644 --- a/fs/ubifs/lprops.c +++ b/fs/ubifs/lprops.c @@ 
-1095,7 +1095,7 @@ static int scan_check_cb(struct ubifs_info *c, return LPT_SCAN_CONTINUE; } - buf = __vmalloc(c->leb_size, GFP_NOFS, PAGE_KERNEL); + buf = __vmalloc(c->leb_size, GFP_NOFS); if (!buf) return -ENOMEM; diff --git a/fs/ubifs/lpt_commit.c b/fs/ubifs/lpt_commit.c index ff5e0411cf2d..d76a19e460cd 100644 --- a/fs/ubifs/lpt_commit.c +++ b/fs/ubifs/lpt_commit.c @@ -1596,7 +1596,7 @@ static int dbg_check_ltab_lnum(struct ubifs_info *c, int lnum) if (!dbg_is_chk_lprops(c)) return 0; - buf = p = __vmalloc(c->leb_size, GFP_NOFS, PAGE_KERNEL); + buf = p = __vmalloc(c->leb_size, GFP_NOFS); if (!buf) { ubifs_err(c, "cannot allocate memory for ltab checking"); return 0; @@ -1845,7 +1845,7 @@ static void dump_lpt_leb(const struct ubifs_info *c, int lnum) void *buf, *p; pr_err("(pid %d) start dumping LEB %d\n", current->pid, lnum); - buf = p = __vmalloc(c->leb_size, GFP_NOFS, PAGE_KERNEL); + buf = p = __vmalloc(c->leb_size, GFP_NOFS); if (!buf) { ubifs_err(c, "cannot allocate memory to dump LPT"); return; diff --git a/fs/ubifs/orphan.c b/fs/ubifs/orphan.c index 283f9eb48410..2c294085ffed 100644 --- a/fs/ubifs/orphan.c +++ b/fs/ubifs/orphan.c @@ -977,7 +977,7 @@ static int dbg_scan_orphans(struct ubifs_info *c, struct check_info *ci) if (c->no_orphs) return 0; - buf = __vmalloc(c->leb_size, GFP_NOFS, PAGE_KERNEL); + buf = __vmalloc(c->leb_size, GFP_NOFS); if (!buf) { ubifs_err(c, "cannot allocate memory to check orphans"); return 0; diff --git a/fs/xfs/kmem.c b/fs/xfs/kmem.c index 1da94237a8cf..f1366475c389 100644 --- a/fs/xfs/kmem.c +++ b/fs/xfs/kmem.c @@ -48,7 +48,7 @@ __kmem_vmalloc(size_t size, xfs_km_flags_t flags) if (flags & KM_NOFS) nofs_flag = memalloc_nofs_save(); - ptr = __vmalloc(size, lflags, PAGE_KERNEL); + ptr = __vmalloc(size, lflags); if (flags & KM_NOFS) memalloc_nofs_restore(nofs_flag); diff --git a/include/linux/vmalloc.h b/include/linux/vmalloc.h index 5488cea5ef11..1c278e030599 100644 --- a/include/linux/vmalloc.h +++ b/include/linux/vmalloc.h @@ -110,7 +110,7 @@ extern void *vmalloc_user_node_flags(unsigned long size, int node, gfp_t flags); extern void *vmalloc_exec(unsigned long size); extern void *vmalloc_32(unsigned long size); extern void *vmalloc_32_user(unsigned long size); -extern void *__vmalloc(unsigned long size, gfp_t gfp_mask, pgprot_t prot); +extern void *__vmalloc(unsigned long size, gfp_t gfp_mask); extern void *__vmalloc_node_range(unsigned long size, unsigned long align, unsigned long start, unsigned long end, gfp_t gfp_mask, pgprot_t prot, unsigned long vm_flags, int node, diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c index 14aa1f74dd10..cf6fe9107f5c 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c @@ -82,7 +82,7 @@ struct bpf_prog *bpf_prog_alloc_no_stats(unsigned int size, gfp_t gfp_extra_flag struct bpf_prog *fp; size = round_up(size, PAGE_SIZE); - fp = __vmalloc(size, gfp_flags, PAGE_KERNEL); + fp = __vmalloc(size, gfp_flags); if (fp == NULL) return NULL; @@ -232,7 +232,7 @@ struct bpf_prog *bpf_prog_realloc(struct bpf_prog *fp_old, unsigned int size, if (ret) return NULL; - fp = __vmalloc(size, gfp_flags, PAGE_KERNEL); + fp = __vmalloc(size, gfp_flags); if (fp == NULL) { __bpf_prog_uncharge(fp_old->aux->user, delta); } else { @@ -1089,7 +1089,7 @@ static struct bpf_prog *bpf_prog_clone_create(struct bpf_prog *fp_other, gfp_t gfp_flags = GFP_KERNEL | __GFP_ZERO | gfp_extra_flags; struct bpf_prog *fp; - fp = __vmalloc(fp_other->pages * PAGE_SIZE, gfp_flags, PAGE_KERNEL); + fp = __vmalloc(fp_other->pages * PAGE_SIZE, gfp_flags); if (fp != 
NULL) { /* aux->prog still points to the fp_other one, so * when promoting the clone to the real program, diff --git a/kernel/groups.c b/kernel/groups.c index daae2f2dc6d4..6ee6691f6839 100644 --- a/kernel/groups.c +++ b/kernel/groups.c @@ -20,7 +20,7 @@ struct group_info *groups_alloc(int gidsetsize) len = sizeof(struct group_info) + sizeof(kgid_t) * gidsetsize; gi = kmalloc(len, GFP_KERNEL_ACCOUNT|__GFP_NOWARN|__GFP_NORETRY); if (!gi) - gi = __vmalloc(len, GFP_KERNEL_ACCOUNT, PAGE_KERNEL); + gi = __vmalloc(len, GFP_KERNEL_ACCOUNT); if (!gi) return NULL; diff --git a/kernel/module.c b/kernel/module.c index 646f1e2330d2..086618a0058f 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -2946,8 +2946,7 @@ static int copy_module_from_user(const void __user *umod, unsigned long len, return err; /* Suck in entire file: we'll want most of it. */ - info->hdr = __vmalloc(info->len, - GFP_KERNEL | __GFP_NOWARN, PAGE_KERNEL); + info->hdr = __vmalloc(info->len, GFP_KERNEL | __GFP_NOWARN); if (!info->hdr) return -ENOMEM; diff --git a/mm/nommu.c b/mm/nommu.c index 4f07b7ef0297..2df549adb22b 100644 --- a/mm/nommu.c +++ b/mm/nommu.c @@ -140,7 +140,7 @@ void vfree(const void *addr) } EXPORT_SYMBOL(vfree); -void *__vmalloc(unsigned long size, gfp_t gfp_mask, pgprot_t prot) +void *__vmalloc(unsigned long size, gfp_t gfp_mask) { /* * You can't specify __GFP_HIGHMEM with kmalloc() since kmalloc() @@ -152,14 +152,14 @@ EXPORT_SYMBOL(__vmalloc); void *__vmalloc_node_flags(unsigned long size, int node, gfp_t flags) { - return __vmalloc(size, flags, PAGE_KERNEL); + return __vmalloc(size, flags); } static void *__vmalloc_user_flags(unsigned long size, gfp_t flags) { void *ret; - ret = __vmalloc(size, flags, PAGE_KERNEL); + ret = __vmalloc(size, flags); if (ret) { struct vm_area_struct *vma; @@ -230,7 +230,7 @@ long vwrite(char *buf, char *addr, unsigned long count) */ void *vmalloc(unsigned long size) { - return __vmalloc(size, GFP_KERNEL | __GFP_HIGHMEM, PAGE_KERNEL); + return __vmalloc(size, GFP_KERNEL | __GFP_HIGHMEM); } EXPORT_SYMBOL(vmalloc); @@ -248,8 +248,7 @@ EXPORT_SYMBOL(vmalloc); */ void *vzalloc(unsigned long size) { - return __vmalloc(size, GFP_KERNEL | __GFP_HIGHMEM | __GFP_ZERO, - PAGE_KERNEL); + return __vmalloc(size, GFP_KERNEL | __GFP_HIGHMEM | __GFP_ZERO); } EXPORT_SYMBOL(vzalloc); @@ -302,7 +301,7 @@ EXPORT_SYMBOL(vzalloc_node); void *vmalloc_exec(unsigned long size) { - return __vmalloc(size, GFP_KERNEL | __GFP_HIGHMEM, PAGE_KERNEL_EXEC); + return __vmalloc(size, GFP_KERNEL | __GFP_HIGHMEM); } /** @@ -314,7 +313,7 @@ void *vmalloc_exec(unsigned long size) */ void *vmalloc_32(unsigned long size) { - return __vmalloc(size, GFP_KERNEL, PAGE_KERNEL); + return __vmalloc(size, GFP_KERNEL); } EXPORT_SYMBOL(vmalloc_32); diff --git a/mm/page_alloc.c b/mm/page_alloc.c index cc406ee17ad9..45ad73122e82 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -8244,7 +8244,7 @@ void *__init alloc_large_system_hash(const char *tablename, table = memblock_alloc_raw(size, SMP_CACHE_BYTES); } else if (get_order(size) >= MAX_ORDER || hashdist) { - table = __vmalloc(size, gfp_flags, PAGE_KERNEL); + table = __vmalloc(size, gfp_flags); virt = true; } else { /* diff --git a/mm/percpu.c b/mm/percpu.c index 7da7d7737dab..696367b18222 100644 --- a/mm/percpu.c +++ b/mm/percpu.c @@ -482,7 +482,7 @@ static void *pcpu_mem_zalloc(size_t size, gfp_t gfp) if (size <= PAGE_SIZE) return kzalloc(size, gfp); else - return __vmalloc(size, gfp | __GFP_ZERO, PAGE_KERNEL); + return __vmalloc(size, gfp | __GFP_ZERO); } /** diff --git 
a/mm/vmalloc.c b/mm/vmalloc.c index 4d7c7108181a..11194ae18f23 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c @@ -2564,9 +2564,9 @@ static void *__vmalloc_node(unsigned long size, unsigned long align, gfp_mask, prot, 0, node, caller); } -void *__vmalloc(unsigned long size, gfp_t gfp_mask, pgprot_t prot) +void *__vmalloc(unsigned long size, gfp_t gfp_mask) { - return __vmalloc_node(size, 1, gfp_mask, prot, NUMA_NO_NODE, + return __vmalloc_node(size, 1, gfp_mask, PAGE_KERNEL, NUMA_NO_NODE, __builtin_return_address(0)); } EXPORT_SYMBOL(__vmalloc); diff --git a/net/bridge/netfilter/ebtables.c b/net/bridge/netfilter/ebtables.c index 78db58c7aec2..7e869284e052 100644 --- a/net/bridge/netfilter/ebtables.c +++ b/net/bridge/netfilter/ebtables.c @@ -1095,16 +1095,14 @@ static int do_replace(struct net *net, const void __user *user, tmp.name[sizeof(tmp.name) - 1] = 0; countersize = COUNTER_OFFSET(tmp.nentries) * nr_cpu_ids; - newinfo = __vmalloc(sizeof(*newinfo) + countersize, GFP_KERNEL_ACCOUNT, - PAGE_KERNEL); + newinfo = __vmalloc(sizeof(*newinfo) + countersize, GFP_KERNEL_ACCOUNT); if (!newinfo) return -ENOMEM; if (countersize) memset(newinfo->counters, 0, countersize); - newinfo->entries = __vmalloc(tmp.entries_size, GFP_KERNEL_ACCOUNT, - PAGE_KERNEL); + newinfo->entries = __vmalloc(tmp.entries_size, GFP_KERNEL_ACCOUNT); if (!newinfo->entries) { ret = -ENOMEM; goto free_newinfo; diff --git a/sound/core/memalloc.c b/sound/core/memalloc.c index a83553fbedf0..bea46ed157a6 100644 --- a/sound/core/memalloc.c +++ b/sound/core/memalloc.c @@ -143,7 +143,7 @@ int snd_dma_alloc_pages(int type, struct device *device, size_t size, break; case SNDRV_DMA_TYPE_VMALLOC: gfp = snd_mem_get_gfp_flags(device, GFP_KERNEL | __GFP_HIGHMEM); - dmab->area = __vmalloc(size, gfp, PAGE_KERNEL); + dmab->area = __vmalloc(size, gfp); dmab->addr = 0; break; #ifdef CONFIG_HAS_DMA diff --git a/sound/core/pcm_memory.c b/sound/core/pcm_memory.c index fcab37ea6641..860935e3aea4 100644 --- a/sound/core/pcm_memory.c +++ b/sound/core/pcm_memory.c @@ -460,7 +460,7 @@ int _snd_pcm_lib_alloc_vmalloc_buffer(struct snd_pcm_substream *substream, return 0; /* already large enough */ vfree(runtime->dma_area); } - runtime->dma_area = __vmalloc(size, gfp_flags, PAGE_KERNEL); + runtime->dma_area = __vmalloc(size, gfp_flags); if (!runtime->dma_area) return -ENOMEM; runtime->dma_bytes = size; -- cgit v1.2.3 From 4d39d7285f45cc6c72b850f040d3addd626658e4 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 1 Jun 2020 21:51:49 -0700 Subject: mm: remove both instances of __vmalloc_node_flags The real version just had a few callers that can open code it and remove one layer of indirection. The nommu stub was public but only had a single caller, so remove it and avoid a CONFIG_MMU ifdef in vmalloc.h. Signed-off-by: Christoph Hellwig Signed-off-by: Andrew Morton Acked-by: Peter Zijlstra (Intel) Cc: Christian Borntraeger Cc: Christophe Leroy Cc: Daniel Vetter Cc: David Airlie Cc: Gao Xiang Cc: Greg Kroah-Hartman Cc: Haiyang Zhang Cc: Johannes Weiner Cc: "K. Y. 
Srinivasan" Cc: Laura Abbott Cc: Mark Rutland Cc: Michael Kelley Cc: Minchan Kim Cc: Nitin Gupta Cc: Robin Murphy Cc: Sakari Ailus Cc: Stephen Hemminger Cc: Sumit Semwal Cc: Wei Liu Cc: Benjamin Herrenschmidt Cc: Catalin Marinas Cc: Heiko Carstens Cc: Paul Mackerras Cc: Vasily Gorbik Cc: Will Deacon Link: http://lkml.kernel.org/r/20200414131348.444715-24-hch@lst.de Signed-off-by: Linus Torvalds --- include/linux/vmalloc.h | 9 --------- mm/nommu.c | 3 ++- mm/vmalloc.c | 20 ++++++-------------- 3 files changed, 8 insertions(+), 24 deletions(-) (limited to 'include/linux') diff --git a/include/linux/vmalloc.h b/include/linux/vmalloc.h index 1c278e030599..91d143783458 100644 --- a/include/linux/vmalloc.h +++ b/include/linux/vmalloc.h @@ -115,17 +115,8 @@ extern void *__vmalloc_node_range(unsigned long size, unsigned long align, unsigned long start, unsigned long end, gfp_t gfp_mask, pgprot_t prot, unsigned long vm_flags, int node, const void *caller); -#ifndef CONFIG_MMU -extern void *__vmalloc_node_flags(unsigned long size, int node, gfp_t flags); -static inline void *__vmalloc_node_flags_caller(unsigned long size, int node, - gfp_t flags, void *caller) -{ - return __vmalloc_node_flags(size, node, flags); -} -#else extern void *__vmalloc_node_flags_caller(unsigned long size, int node, gfp_t flags, void *caller); -#endif extern void vfree(const void *addr); extern void vfree_atomic(const void *addr); diff --git a/mm/nommu.c b/mm/nommu.c index 2df549adb22b..9553efa59787 100644 --- a/mm/nommu.c +++ b/mm/nommu.c @@ -150,7 +150,8 @@ void *__vmalloc(unsigned long size, gfp_t gfp_mask) } EXPORT_SYMBOL(__vmalloc); -void *__vmalloc_node_flags(unsigned long size, int node, gfp_t flags) +void *__vmalloc_node_flags_caller(unsigned long size, int node, gfp_t flags, + void *caller) { return __vmalloc(size, flags); } diff --git a/mm/vmalloc.c b/mm/vmalloc.c index c9343f1a7268..e9f730092c81 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c @@ -2567,14 +2567,6 @@ void *__vmalloc(unsigned long size, gfp_t gfp_mask) } EXPORT_SYMBOL(__vmalloc); -static inline void *__vmalloc_node_flags(unsigned long size, - int node, gfp_t flags) -{ - return __vmalloc_node(size, 1, flags, node, - __builtin_return_address(0)); -} - - void *__vmalloc_node_flags_caller(unsigned long size, int node, gfp_t flags, void *caller) { @@ -2595,8 +2587,8 @@ void *__vmalloc_node_flags_caller(unsigned long size, int node, gfp_t flags, */ void *vmalloc(unsigned long size) { - return __vmalloc_node_flags(size, NUMA_NO_NODE, - GFP_KERNEL); + return __vmalloc_node(size, 1, GFP_KERNEL, NUMA_NO_NODE, + __builtin_return_address(0)); } EXPORT_SYMBOL(vmalloc); @@ -2615,8 +2607,8 @@ EXPORT_SYMBOL(vmalloc); */ void *vzalloc(unsigned long size) { - return __vmalloc_node_flags(size, NUMA_NO_NODE, - GFP_KERNEL | __GFP_ZERO); + return __vmalloc_node(size, 1, GFP_KERNEL | __GFP_ZERO, NUMA_NO_NODE, + __builtin_return_address(0)); } EXPORT_SYMBOL(vzalloc); @@ -2671,8 +2663,8 @@ EXPORT_SYMBOL(vmalloc_node); */ void *vzalloc_node(unsigned long size, int node) { - return __vmalloc_node_flags(size, node, - GFP_KERNEL | __GFP_ZERO); + return __vmalloc_node(size, 1, GFP_KERNEL | __GFP_ZERO, node, + __builtin_return_address(0)); } EXPORT_SYMBOL(vzalloc_node); -- cgit v1.2.3 From 2b9059489c839e67ca9254913325e18cea11a980 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 1 Jun 2020 21:51:53 -0700 Subject: mm: remove __vmalloc_node_flags_caller Just use __vmalloc_node instead which gets and extra argument. 
To be able to use __vmalloc_node in all callers, make it available outside of vmalloc.c and implement it in nommu.c. [akpm@linux-foundation.org: fix nommu build] Signed-off-by: Christoph Hellwig Signed-off-by: Andrew Morton Acked-by: Peter Zijlstra (Intel) Cc: Christian Borntraeger Cc: Christophe Leroy Cc: Daniel Vetter Cc: David Airlie Cc: Gao Xiang Cc: Greg Kroah-Hartman Cc: Haiyang Zhang Cc: Johannes Weiner Cc: "K. Y. Srinivasan" Cc: Laura Abbott Cc: Mark Rutland Cc: Michael Kelley Cc: Minchan Kim Cc: Nitin Gupta Cc: Robin Murphy Cc: Sakari Ailus Cc: Stephen Hemminger Cc: Sumit Semwal Cc: Wei Liu Cc: Benjamin Herrenschmidt Cc: Catalin Marinas Cc: Heiko Carstens Cc: Paul Mackerras Cc: Vasily Gorbik Cc: Will Deacon Cc: Stephen Rothwell Link: http://lkml.kernel.org/r/20200414131348.444715-25-hch@lst.de Signed-off-by: Linus Torvalds --- include/linux/vmalloc.h | 4 ++-- kernel/bpf/syscall.c | 5 ++--- mm/nommu.c | 6 +++--- mm/util.c | 2 +- mm/vmalloc.c | 10 +--------- 5 files changed, 9 insertions(+), 18 deletions(-) (limited to 'include/linux') diff --git a/include/linux/vmalloc.h b/include/linux/vmalloc.h index 91d143783458..964d58a060ca 100644 --- a/include/linux/vmalloc.h +++ b/include/linux/vmalloc.h @@ -115,8 +115,8 @@ extern void *__vmalloc_node_range(unsigned long size, unsigned long align, unsigned long start, unsigned long end, gfp_t gfp_mask, pgprot_t prot, unsigned long vm_flags, int node, const void *caller); -extern void *__vmalloc_node_flags_caller(unsigned long size, - int node, gfp_t flags, void *caller); +void *__vmalloc_node(unsigned long size, unsigned long align, gfp_t gfp_mask, + int node, const void *caller); extern void vfree(const void *addr); extern void vfree_atomic(const void *addr); diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 4e6dee19a668..9c1cf7a87fb3 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -299,9 +299,8 @@ static void *__bpf_map_area_alloc(u64 size, int numa_node, bool mmapable) return vmalloc_user_node_flags(size, numa_node, GFP_KERNEL | __GFP_RETRY_MAYFAIL | flags); } - return __vmalloc_node_flags_caller(size, numa_node, - GFP_KERNEL | __GFP_RETRY_MAYFAIL | - flags, __builtin_return_address(0)); + return __vmalloc_node(size, 1, GFP_KERNEL | __GFP_RETRY_MAYFAIL | flags, + numa_node, __builtin_return_address(0)); } void *bpf_map_area_alloc(u64 size, int numa_node) diff --git a/mm/nommu.c b/mm/nommu.c index 9553efa59787..08848fc18672 100644 --- a/mm/nommu.c +++ b/mm/nommu.c @@ -150,10 +150,10 @@ void *__vmalloc(unsigned long size, gfp_t gfp_mask) } EXPORT_SYMBOL(__vmalloc); -void *__vmalloc_node_flags_caller(unsigned long size, int node, gfp_t flags, - void *caller) +void *__vmalloc_node(unsigned long size, unsigned long align, gfp_t gfp_mask, + int node, const void *caller) { - return __vmalloc(size, flags); + return __vmalloc(size, gfp_mask); } static void *__vmalloc_user_flags(unsigned long size, gfp_t flags) diff --git a/mm/util.c b/mm/util.c index 988d11e6c17c..6d5868adbe18 100644 --- a/mm/util.c +++ b/mm/util.c @@ -580,7 +580,7 @@ void *kvmalloc_node(size_t size, gfp_t flags, int node) if (ret || size <= PAGE_SIZE) return ret; - return __vmalloc_node_flags_caller(size, node, flags, + return __vmalloc_node(size, 1, flags, node, __builtin_return_address(0)); } EXPORT_SYMBOL(kvmalloc_node); diff --git a/mm/vmalloc.c b/mm/vmalloc.c index e9f730092c81..88f9971a7d6e 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c @@ -2401,8 +2401,6 @@ void *vmap(struct page **pages, unsigned int count, } EXPORT_SYMBOL(vmap); -static void 
*__vmalloc_node(unsigned long size, unsigned long align, - gfp_t gfp_mask, int node, const void *caller); static void *__vmalloc_area_node(struct vm_struct *area, gfp_t gfp_mask, pgprot_t prot, int node) { @@ -2553,7 +2551,7 @@ EXPORT_SYMBOL_GPL(__vmalloc_node_range); * * Return: pointer to the allocated memory or %NULL on error */ -static void *__vmalloc_node(unsigned long size, unsigned long align, +void *__vmalloc_node(unsigned long size, unsigned long align, gfp_t gfp_mask, int node, const void *caller) { return __vmalloc_node_range(size, align, VMALLOC_START, VMALLOC_END, @@ -2567,12 +2565,6 @@ void *__vmalloc(unsigned long size, gfp_t gfp_mask) } EXPORT_SYMBOL(__vmalloc); -void *__vmalloc_node_flags_caller(unsigned long size, int node, gfp_t flags, - void *caller) -{ - return __vmalloc_node(size, 1, flags, node, caller); -} - /** * vmalloc - allocate virtually contiguous memory * @size: allocation size -- cgit v1.2.3 From 041de93ff86fc500aa73e5360039c95f4d31b95f Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 1 Jun 2020 21:52:02 -0700 Subject: mm: remove vmalloc_user_node_flags Open code it in __bpf_map_area_alloc, which is the only caller. Also clean up __bpf_map_area_alloc to have a single vmalloc call with slightly different flags instead of the current two different calls. For this to compile for the nommu case add a __vmalloc_node_range stub to nommu.c. [akpm@linux-foundation.org: fix nommu.c build] Signed-off-by: Christoph Hellwig Signed-off-by: Andrew Morton Acked-by: Peter Zijlstra (Intel) Acked-by: Johannes Weiner Cc: Christian Borntraeger Cc: Christophe Leroy Cc: Daniel Vetter Cc: David Airlie Cc: Gao Xiang Cc: Greg Kroah-Hartman Cc: Haiyang Zhang Cc: "K. Y. Srinivasan" Cc: Laura Abbott Cc: Mark Rutland Cc: Michael Kelley Cc: Minchan Kim Cc: Nitin Gupta Cc: Robin Murphy Cc: Sakari Ailus Cc: Stephen Hemminger Cc: Sumit Semwal Cc: Wei Liu Cc: Benjamin Herrenschmidt Cc: Catalin Marinas Cc: Heiko Carstens Cc: Paul Mackerras Cc: Vasily Gorbik Cc: Will Deacon Cc: Stephen Rothwell Link: http://lkml.kernel.org/r/20200414131348.444715-27-hch@lst.de Signed-off-by: Linus Torvalds --- include/linux/vmalloc.h | 1 - kernel/bpf/syscall.c | 24 ++++++++++++++---------- mm/nommu.c | 14 ++++++++------ mm/vmalloc.c | 20 -------------------- 4 files changed, 22 insertions(+), 37 deletions(-) (limited to 'include/linux') diff --git a/include/linux/vmalloc.h b/include/linux/vmalloc.h index 964d58a060ca..3332926295d4 100644 --- a/include/linux/vmalloc.h +++ b/include/linux/vmalloc.h @@ -106,7 +106,6 @@ extern void *vzalloc(unsigned long size); extern void *vmalloc_user(unsigned long size); extern void *vmalloc_node(unsigned long size, int node); extern void *vzalloc_node(unsigned long size, int node); -extern void *vmalloc_user_node_flags(unsigned long size, int node, gfp_t flags); extern void *vmalloc_exec(unsigned long size); extern void *vmalloc_32(unsigned long size); extern void *vmalloc_32_user(unsigned long size); diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 9c1cf7a87fb3..42c7a42fc9c8 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -25,6 +25,7 @@ #include #include #include +#include #include #define IS_FD_ARRAY(map) ((map)->map_type == BPF_MAP_TYPE_PERF_EVENT_ARRAY || \ @@ -281,26 +282,29 @@ static void *__bpf_map_area_alloc(u64 size, int numa_node, bool mmapable) * __GFP_RETRY_MAYFAIL to avoid such situations. 
*/ - const gfp_t flags = __GFP_NOWARN | __GFP_ZERO; + const gfp_t gfp = __GFP_NOWARN | __GFP_ZERO; + unsigned int flags = 0; + unsigned long align = 1; void *area; if (size >= SIZE_MAX) return NULL; /* kmalloc()'ed memory can't be mmap()'ed */ - if (!mmapable && size <= (PAGE_SIZE << PAGE_ALLOC_COSTLY_ORDER)) { - area = kmalloc_node(size, GFP_USER | __GFP_NORETRY | flags, + if (mmapable) { + BUG_ON(!PAGE_ALIGNED(size)); + align = SHMLBA; + flags = VM_USERMAP; + } else if (size <= (PAGE_SIZE << PAGE_ALLOC_COSTLY_ORDER)) { + area = kmalloc_node(size, gfp | GFP_USER | __GFP_NORETRY, numa_node); if (area != NULL) return area; } - if (mmapable) { - BUG_ON(!PAGE_ALIGNED(size)); - return vmalloc_user_node_flags(size, numa_node, GFP_KERNEL | - __GFP_RETRY_MAYFAIL | flags); - } - return __vmalloc_node(size, 1, GFP_KERNEL | __GFP_RETRY_MAYFAIL | flags, - numa_node, __builtin_return_address(0)); + + return __vmalloc_node_range(size, align, VMALLOC_START, VMALLOC_END, + gfp | GFP_KERNEL | __GFP_RETRY_MAYFAIL, PAGE_KERNEL, + flags, numa_node, __builtin_return_address(0)); } void *bpf_map_area_alloc(u64 size, int numa_node) diff --git a/mm/nommu.c b/mm/nommu.c index 08848fc18672..371697bf372d 100644 --- a/mm/nommu.c +++ b/mm/nommu.c @@ -150,6 +150,14 @@ void *__vmalloc(unsigned long size, gfp_t gfp_mask) } EXPORT_SYMBOL(__vmalloc); +void *__vmalloc_node_range(unsigned long size, unsigned long align, + unsigned long start, unsigned long end, gfp_t gfp_mask, + pgprot_t prot, unsigned long vm_flags, int node, + const void *caller) +{ + return __vmalloc(size, gfp_mask); +} + void *__vmalloc_node(unsigned long size, unsigned long align, gfp_t gfp_mask, int node, const void *caller) { @@ -180,12 +188,6 @@ void *vmalloc_user(unsigned long size) } EXPORT_SYMBOL(vmalloc_user); -void *vmalloc_user_node_flags(unsigned long size, int node, gfp_t flags) -{ - return __vmalloc_user_flags(size, flags | __GFP_ZERO); -} -EXPORT_SYMBOL(vmalloc_user_node_flags); - struct page *vmalloc_to_page(const void *addr) { return virt_to_page(addr); diff --git a/mm/vmalloc.c b/mm/vmalloc.c index 931106654d1f..410bfe26ee73 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c @@ -2659,26 +2659,6 @@ void *vzalloc_node(unsigned long size, int node) } EXPORT_SYMBOL(vzalloc_node); -/** - * vmalloc_user_node_flags - allocate memory for userspace on a specific node - * @size: allocation size - * @node: numa node - * @flags: flags for the page level allocator - * - * The resulting memory area is zeroed so it can be mapped to userspace - * without leaking data. - * - * Return: pointer to the allocated memory or %NULL on error - */ -void *vmalloc_user_node_flags(unsigned long size, int node, gfp_t flags) -{ - return __vmalloc_node_range(size, SHMLBA, VMALLOC_START, VMALLOC_END, - flags | __GFP_ZERO, PAGE_KERNEL, - VM_USERMAP, node, - __builtin_return_address(0)); -} -EXPORT_SYMBOL(vmalloc_user_node_flags); - /** * vmalloc_exec - allocate virtually contiguous, executable memory * @size: allocation size -- cgit v1.2.3 From d8626138009ba58ae2c22356966c2edaa1f1c3b5 Mon Sep 17 00:00:00 2001 From: Joerg Roedel Date: Mon, 1 Jun 2020 21:52:18 -0700 Subject: mm: add functions to track page directory modifications Patch series "mm: Get rid of vmalloc_sync_(un)mappings()", v3. After the recent issue with vmalloc and tracing code[1] on x86 and a long history of previous issues related to the vmalloc_sync_mappings() interface, I thought the time has come to remove it. Please see [2], [3], and [4] for some other issues in the past. 
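In outline, the series makes the page-table allocation helpers record which levels they touched. The following is the pud_alloc_track() helper added by this patch (its full context is in the diff below), shown here with explanatory comments:

static inline pud_t *pud_alloc_track(struct mm_struct *mm, p4d_t *p4d,
				     unsigned long address,
				     pgtbl_mod_mask *mod_mask)
{
	if (unlikely(p4d_none(*p4d))) {
		if (__pud_alloc(mm, p4d, address))
			return NULL;	/* allocation failed */
		/* A new PUD page was installed, i.e. a P4D entry changed. */
		*mod_mask |= PGTBL_P4D_MODIFIED;
	}

	return pud_offset(p4d, address);
}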
The patches add tracking of page-table directory changes to the vmalloc and ioremap code. Depending on which page-table levels have been changed, a new per-arch function is called: arch_sync_kernel_mappings(). On x86-64 with 4-level paging, this function will not be called more than 64 times in a system's runtime (because vmalloc-space takes 64 PGD entries which are only populated, but never cleared). As a side effect this also makes it possible to get rid of vmalloc faults on x86, making it safe to touch vmalloc'ed memory in the page-fault handler. Note that this potentially includes per-cpu memory. This patch (of 7): Add page-table allocation functions which will keep track of changed directory entries. They are needed for new PGD, P4D, PUD, and PMD entries and will be used in vmalloc and ioremap code to decide whether any changes in the kernel mappings need to be synchronized between page-tables in the system. Signed-off-by: Joerg Roedel Signed-off-by: Andrew Morton Acked-by: Andy Lutomirski Acked-by: Peter Zijlstra (Intel) Cc: "H . Peter Anvin" Cc: Dave Hansen Cc: "Rafael J. Wysocki" Cc: Arnd Bergmann Cc: Steven Rostedt (VMware) Cc: Vlastimil Babka Cc: Michal Hocko Cc: Matthew Wilcox (Oracle) Cc: Ingo Molnar Cc: Thomas Gleixner Cc: Christoph Hellwig Link: http://lkml.kernel.org/r/20200515140023.25469-1-joro@8bytes.org Link: http://lkml.kernel.org/r/20200515140023.25469-2-joro@8bytes.org Signed-off-by: Linus Torvalds --- include/asm-generic/5level-fixup.h | 5 +++-- include/asm-generic/pgtable.h | 23 +++++++++++++++++++ include/linux/mm.h | 46 ++++++++++++++++++++++++++++++++++++++ 3 files changed, 72 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/asm-generic/5level-fixup.h b/include/asm-generic/5level-fixup.h index 4c74b1c1d13b..58046ddc08d0 100644 --- a/include/asm-generic/5level-fixup.h +++ b/include/asm-generic/5level-fixup.h @@ -17,8 +17,9 @@ ((unlikely(pgd_none(*(p4d))) && __pud_alloc(mm, p4d, address)) ? \ NULL : pud_offset(p4d, address)) -#define p4d_alloc(mm, pgd, address) (pgd) -#define p4d_offset(pgd, start) (pgd) +#define p4d_alloc(mm, pgd, address) (pgd) +#define p4d_alloc_track(mm, pgd, address, mask) (pgd) +#define p4d_offset(pgd, start) (pgd) #ifndef __ASSEMBLY__ static inline int p4d_none(p4d_t p4d) diff --git a/include/asm-generic/pgtable.h b/include/asm-generic/pgtable.h index 8c5f9c29698b..db7df7daa0d8 100644 --- a/include/asm-generic/pgtable.h +++ b/include/asm-generic/pgtable.h @@ -1213,6 +1213,29 @@ static inline bool arch_has_pfn_modify_check(void) # define PAGE_KERNEL_EXEC PAGE_KERNEL #endif +/* + * Page Table Modification bits for pgtbl_mod_mask. + * + * These are used by the p?d_alloc_track*() set of functions and in the generic + * vmalloc/ioremap code to track at which page-table levels entries have been + * modified. Based on that the code can better decide when vmalloc and ioremap + * mapping changes need to be synchronized to other page-tables in the system. 
+ */ +#define __PGTBL_PGD_MODIFIED 0 +#define __PGTBL_P4D_MODIFIED 1 +#define __PGTBL_PUD_MODIFIED 2 +#define __PGTBL_PMD_MODIFIED 3 +#define __PGTBL_PTE_MODIFIED 4 + +#define PGTBL_PGD_MODIFIED BIT(__PGTBL_PGD_MODIFIED) +#define PGTBL_P4D_MODIFIED BIT(__PGTBL_P4D_MODIFIED) +#define PGTBL_PUD_MODIFIED BIT(__PGTBL_PUD_MODIFIED) +#define PGTBL_PMD_MODIFIED BIT(__PGTBL_PMD_MODIFIED) +#define PGTBL_PTE_MODIFIED BIT(__PGTBL_PTE_MODIFIED) + +/* Page-Table Modification Mask */ +typedef unsigned int pgtbl_mod_mask; + #endif /* !__ASSEMBLY__ */ #ifndef io_remap_pfn_range diff --git a/include/linux/mm.h b/include/linux/mm.h index ebbb0acbeee2..fda41eb7f1c8 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -2091,13 +2091,54 @@ static inline pud_t *pud_alloc(struct mm_struct *mm, p4d_t *p4d, return (unlikely(p4d_none(*p4d)) && __pud_alloc(mm, p4d, address)) ? NULL : pud_offset(p4d, address); } + +static inline p4d_t *p4d_alloc_track(struct mm_struct *mm, pgd_t *pgd, + unsigned long address, + pgtbl_mod_mask *mod_mask) + +{ + if (unlikely(pgd_none(*pgd))) { + if (__p4d_alloc(mm, pgd, address)) + return NULL; + *mod_mask |= PGTBL_PGD_MODIFIED; + } + + return p4d_offset(pgd, address); +} + #endif /* !__ARCH_HAS_5LEVEL_HACK */ +static inline pud_t *pud_alloc_track(struct mm_struct *mm, p4d_t *p4d, + unsigned long address, + pgtbl_mod_mask *mod_mask) +{ + if (unlikely(p4d_none(*p4d))) { + if (__pud_alloc(mm, p4d, address)) + return NULL; + *mod_mask |= PGTBL_P4D_MODIFIED; + } + + return pud_offset(p4d, address); +} + static inline pmd_t *pmd_alloc(struct mm_struct *mm, pud_t *pud, unsigned long address) { return (unlikely(pud_none(*pud)) && __pmd_alloc(mm, pud, address))? NULL: pmd_offset(pud, address); } + +static inline pmd_t *pmd_alloc_track(struct mm_struct *mm, pud_t *pud, + unsigned long address, + pgtbl_mod_mask *mod_mask) +{ + if (unlikely(pud_none(*pud))) { + if (__pmd_alloc(mm, pud, address)) + return NULL; + *mod_mask |= PGTBL_PUD_MODIFIED; + } + + return pmd_offset(pud, address); +} #endif /* CONFIG_MMU */ #if USE_SPLIT_PTE_PTLOCKS @@ -2213,6 +2254,11 @@ static inline void pgtable_pte_page_dtor(struct page *page) ((unlikely(pmd_none(*(pmd))) && __pte_alloc_kernel(pmd))? \ NULL: pte_offset_kernel(pmd, address)) +#define pte_alloc_kernel_track(pmd, address, mask) \ + ((unlikely(pmd_none(*(pmd))) && \ + (__pte_alloc_kernel(pmd) || ({*(mask)|=PGTBL_PMD_MODIFIED;0;})))?\ + NULL: pte_offset_kernel(pmd, address)) + #if USE_SPLIT_PMD_PTLOCKS static struct page *pmd_to_page(pmd_t *pmd) -- cgit v1.2.3 From 2ba3e6947aed9bb9575eb1603c0ac6e39185d32a Mon Sep 17 00:00:00 2001 From: Joerg Roedel Date: Mon, 1 Jun 2020 21:52:22 -0700 Subject: mm/vmalloc: track which page-table levels were modified Track at which page-table levels entries were modified by vmap/vunmap. After the page-table has been modified, use that information to decide whether the new arch_sync_kernel_mappings() needs to be called. [akpm@linux-foundation.org: map_kernel_range_noflush() needs the arch_sync_kernel_mappings() call] Signed-off-by: Joerg Roedel Signed-off-by: Andrew Morton Acked-by: Andy Lutomirski Acked-by: Peter Zijlstra (Intel) Cc: Arnd Bergmann Cc: Christoph Hellwig Cc: Dave Hansen Cc: "H . Peter Anvin" Cc: Ingo Molnar Cc: Matthew Wilcox (Oracle) Cc: Michal Hocko Cc: "Rafael J. 
Wysocki" Cc: Steven Rostedt (VMware) Cc: Thomas Gleixner Cc: Vlastimil Babka Link: http://lkml.kernel.org/r/20200515140023.25469-3-joro@8bytes.org Signed-off-by: Linus Torvalds --- include/linux/vmalloc.h | 16 +++++++++ mm/vmalloc.c | 95 +++++++++++++++++++++++++++++++++++-------------- 2 files changed, 85 insertions(+), 26 deletions(-) (limited to 'include/linux') diff --git a/include/linux/vmalloc.h b/include/linux/vmalloc.h index 3332926295d4..0efc35dc5b25 100644 --- a/include/linux/vmalloc.h +++ b/include/linux/vmalloc.h @@ -133,6 +133,22 @@ extern int remap_vmalloc_range(struct vm_area_struct *vma, void *addr, void vmalloc_sync_mappings(void); void vmalloc_sync_unmappings(void); +/* + * Architectures can set this mask to a combination of PGTBL_P?D_MODIFIED values + * and let generic vmalloc and ioremap code know when arch_sync_kernel_mappings() + * needs to be called. + */ +#ifndef ARCH_PAGE_TABLE_SYNC_MASK +#define ARCH_PAGE_TABLE_SYNC_MASK 0 +#endif + +/* + * There is no default implementation for arch_sync_kernel_mappings(). It is + * relied upon the compiler to optimize calls out if ARCH_PAGE_TABLE_SYNC_MASK + * is 0. + */ +void arch_sync_kernel_mappings(unsigned long start, unsigned long end); + /* * Lowlevel-APIs (not for driver use!) */ diff --git a/mm/vmalloc.c b/mm/vmalloc.c index 410bfe26ee73..154e3396154c 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c @@ -69,7 +69,8 @@ static void free_work(struct work_struct *w) /*** Page table manipulation functions ***/ -static void vunmap_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end) +static void vunmap_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end, + pgtbl_mod_mask *mask) { pte_t *pte; @@ -78,59 +79,81 @@ static void vunmap_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end) pte_t ptent = ptep_get_and_clear(&init_mm, addr, pte); WARN_ON(!pte_none(ptent) && !pte_present(ptent)); } while (pte++, addr += PAGE_SIZE, addr != end); + *mask |= PGTBL_PTE_MODIFIED; } -static void vunmap_pmd_range(pud_t *pud, unsigned long addr, unsigned long end) +static void vunmap_pmd_range(pud_t *pud, unsigned long addr, unsigned long end, + pgtbl_mod_mask *mask) { pmd_t *pmd; unsigned long next; + int cleared; pmd = pmd_offset(pud, addr); do { next = pmd_addr_end(addr, end); - if (pmd_clear_huge(pmd)) + + cleared = pmd_clear_huge(pmd); + if (cleared || pmd_bad(*pmd)) + *mask |= PGTBL_PMD_MODIFIED; + + if (cleared) continue; if (pmd_none_or_clear_bad(pmd)) continue; - vunmap_pte_range(pmd, addr, next); + vunmap_pte_range(pmd, addr, next, mask); } while (pmd++, addr = next, addr != end); } -static void vunmap_pud_range(p4d_t *p4d, unsigned long addr, unsigned long end) +static void vunmap_pud_range(p4d_t *p4d, unsigned long addr, unsigned long end, + pgtbl_mod_mask *mask) { pud_t *pud; unsigned long next; + int cleared; pud = pud_offset(p4d, addr); do { next = pud_addr_end(addr, end); - if (pud_clear_huge(pud)) + + cleared = pud_clear_huge(pud); + if (cleared || pud_bad(*pud)) + *mask |= PGTBL_PUD_MODIFIED; + + if (cleared) continue; if (pud_none_or_clear_bad(pud)) continue; - vunmap_pmd_range(pud, addr, next); + vunmap_pmd_range(pud, addr, next, mask); } while (pud++, addr = next, addr != end); } -static void vunmap_p4d_range(pgd_t *pgd, unsigned long addr, unsigned long end) +static void vunmap_p4d_range(pgd_t *pgd, unsigned long addr, unsigned long end, + pgtbl_mod_mask *mask) { p4d_t *p4d; unsigned long next; + int cleared; p4d = p4d_offset(pgd, addr); do { next = p4d_addr_end(addr, end); - if (p4d_clear_huge(p4d)) + + 
cleared = p4d_clear_huge(p4d); + if (cleared || p4d_bad(*p4d)) + *mask |= PGTBL_P4D_MODIFIED; + + if (cleared) continue; if (p4d_none_or_clear_bad(p4d)) continue; - vunmap_pud_range(p4d, addr, next); + vunmap_pud_range(p4d, addr, next, mask); } while (p4d++, addr = next, addr != end); } /** * unmap_kernel_range_noflush - unmap kernel VM area - * @addr: start of the VM area to unmap + * @start: start of the VM area to unmap * @size: size of the VM area to unmap * * Unmap PFN_UP(@size) pages at @addr. The VM area @addr and @size specify @@ -141,24 +164,33 @@ static void vunmap_p4d_range(pgd_t *pgd, unsigned long addr, unsigned long end) * for calling flush_cache_vunmap() on to-be-mapped areas before calling this * function and flush_tlb_kernel_range() after. */ -void unmap_kernel_range_noflush(unsigned long addr, unsigned long size) +void unmap_kernel_range_noflush(unsigned long start, unsigned long size) { - unsigned long end = addr + size; + unsigned long end = start + size; unsigned long next; pgd_t *pgd; + unsigned long addr = start; + pgtbl_mod_mask mask = 0; BUG_ON(addr >= end); + start = addr; pgd = pgd_offset_k(addr); do { next = pgd_addr_end(addr, end); + if (pgd_bad(*pgd)) + mask |= PGTBL_PGD_MODIFIED; if (pgd_none_or_clear_bad(pgd)) continue; - vunmap_p4d_range(pgd, addr, next); + vunmap_p4d_range(pgd, addr, next, &mask); } while (pgd++, addr = next, addr != end); + + if (mask & ARCH_PAGE_TABLE_SYNC_MASK) + arch_sync_kernel_mappings(start, end); } static int vmap_pte_range(pmd_t *pmd, unsigned long addr, - unsigned long end, pgprot_t prot, struct page **pages, int *nr) + unsigned long end, pgprot_t prot, struct page **pages, int *nr, + pgtbl_mod_mask *mask) { pte_t *pte; @@ -167,7 +199,7 @@ static int vmap_pte_range(pmd_t *pmd, unsigned long addr, * callers keep track of where we're up to. 
*/ - pte = pte_alloc_kernel(pmd, addr); + pte = pte_alloc_kernel_track(pmd, addr, mask); if (!pte) return -ENOMEM; do { @@ -180,55 +212,59 @@ static int vmap_pte_range(pmd_t *pmd, unsigned long addr, set_pte_at(&init_mm, addr, pte, mk_pte(page, prot)); (*nr)++; } while (pte++, addr += PAGE_SIZE, addr != end); + *mask |= PGTBL_PTE_MODIFIED; return 0; } static int vmap_pmd_range(pud_t *pud, unsigned long addr, - unsigned long end, pgprot_t prot, struct page **pages, int *nr) + unsigned long end, pgprot_t prot, struct page **pages, int *nr, + pgtbl_mod_mask *mask) { pmd_t *pmd; unsigned long next; - pmd = pmd_alloc(&init_mm, pud, addr); + pmd = pmd_alloc_track(&init_mm, pud, addr, mask); if (!pmd) return -ENOMEM; do { next = pmd_addr_end(addr, end); - if (vmap_pte_range(pmd, addr, next, prot, pages, nr)) + if (vmap_pte_range(pmd, addr, next, prot, pages, nr, mask)) return -ENOMEM; } while (pmd++, addr = next, addr != end); return 0; } static int vmap_pud_range(p4d_t *p4d, unsigned long addr, - unsigned long end, pgprot_t prot, struct page **pages, int *nr) + unsigned long end, pgprot_t prot, struct page **pages, int *nr, + pgtbl_mod_mask *mask) { pud_t *pud; unsigned long next; - pud = pud_alloc(&init_mm, p4d, addr); + pud = pud_alloc_track(&init_mm, p4d, addr, mask); if (!pud) return -ENOMEM; do { next = pud_addr_end(addr, end); - if (vmap_pmd_range(pud, addr, next, prot, pages, nr)) + if (vmap_pmd_range(pud, addr, next, prot, pages, nr, mask)) return -ENOMEM; } while (pud++, addr = next, addr != end); return 0; } static int vmap_p4d_range(pgd_t *pgd, unsigned long addr, - unsigned long end, pgprot_t prot, struct page **pages, int *nr) + unsigned long end, pgprot_t prot, struct page **pages, int *nr, + pgtbl_mod_mask *mask) { p4d_t *p4d; unsigned long next; - p4d = p4d_alloc(&init_mm, pgd, addr); + p4d = p4d_alloc_track(&init_mm, pgd, addr, mask); if (!p4d) return -ENOMEM; do { next = p4d_addr_end(addr, end); - if (vmap_pud_range(p4d, addr, next, prot, pages, nr)) + if (vmap_pud_range(p4d, addr, next, prot, pages, nr, mask)) return -ENOMEM; } while (p4d++, addr = next, addr != end); return 0; @@ -255,21 +291,28 @@ static int vmap_p4d_range(pgd_t *pgd, unsigned long addr, int map_kernel_range_noflush(unsigned long addr, unsigned long size, pgprot_t prot, struct page **pages) { + unsigned long start = addr; unsigned long end = addr + size; unsigned long next; pgd_t *pgd; int err = 0; int nr = 0; + pgtbl_mod_mask mask = 0; BUG_ON(addr >= end); pgd = pgd_offset_k(addr); do { next = pgd_addr_end(addr, end); - err = vmap_p4d_range(pgd, addr, next, prot, pages, &nr); + if (pgd_bad(*pgd)) + mask |= PGTBL_PGD_MODIFIED; + err = vmap_p4d_range(pgd, addr, next, prot, pages, &nr, &mask); if (err) return err; } while (pgd++, addr = next, addr != end); + if (mask & ARCH_PAGE_TABLE_SYNC_MASK) + arch_sync_kernel_mappings(start, end); + return 0; } -- cgit v1.2.3 From 73f693c3a705756032c2863bfb37570276902d7d Mon Sep 17 00:00:00 2001 From: Joerg Roedel Date: Mon, 1 Jun 2020 21:52:36 -0700 Subject: mm: remove vmalloc_sync_(un)mappings() These functions are not needed anymore because the vmalloc and ioremap mappings are now synchronized when they are created or torn down. Remove all callers and function definitions. Signed-off-by: Joerg Roedel Signed-off-by: Andrew Morton Tested-by: Steven Rostedt (VMware) Acked-by: Andy Lutomirski Acked-by: Peter Zijlstra (Intel) Cc: Arnd Bergmann Cc: Christoph Hellwig Cc: Dave Hansen Cc: "H . 
Peter Anvin" Cc: Ingo Molnar Cc: Matthew Wilcox (Oracle) Cc: Michal Hocko Cc: "Rafael J. Wysocki" Cc: Thomas Gleixner Cc: Vlastimil Babka Link: http://lkml.kernel.org/r/20200515140023.25469-7-joro@8bytes.org Signed-off-by: Linus Torvalds --- arch/x86/mm/fault.c | 37 ------------------------------------- drivers/acpi/apei/ghes.c | 6 ------ include/linux/vmalloc.h | 2 -- kernel/notifier.c | 1 - kernel/trace/trace.c | 12 ------------ mm/nommu.c | 12 ------------ mm/vmalloc.c | 21 --------------------- 7 files changed, 91 deletions(-) (limited to 'include/linux') diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c index edeb2adaf31f..255fc631b042 100644 --- a/arch/x86/mm/fault.c +++ b/arch/x86/mm/fault.c @@ -214,26 +214,6 @@ void arch_sync_kernel_mappings(unsigned long start, unsigned long end) } } -static void vmalloc_sync(void) -{ - unsigned long address; - - if (SHARED_KERNEL_PMD) - return; - - arch_sync_kernel_mappings(VMALLOC_START, VMALLOC_END); -} - -void vmalloc_sync_mappings(void) -{ - vmalloc_sync(); -} - -void vmalloc_sync_unmappings(void) -{ - vmalloc_sync(); -} - /* * 32-bit: * @@ -336,23 +316,6 @@ out: #else /* CONFIG_X86_64: */ -void vmalloc_sync_mappings(void) -{ - /* - * 64-bit mappings might allocate new p4d/pud pages - * that need to be propagated to all tasks' PGDs. - */ - sync_global_pgds(VMALLOC_START & PGDIR_MASK, VMALLOC_END); -} - -void vmalloc_sync_unmappings(void) -{ - /* - * Unmappings never allocate or free p4d/pud pages. - * No work is required here. - */ -} - /* * 64-bit: * diff --git a/drivers/acpi/apei/ghes.c b/drivers/acpi/apei/ghes.c index 24c9642e8fc7..aabe9c5ee515 100644 --- a/drivers/acpi/apei/ghes.c +++ b/drivers/acpi/apei/ghes.c @@ -167,12 +167,6 @@ int ghes_estatus_pool_init(int num_ghes) if (!addr) goto err_pool_alloc; - /* - * New allocation must be visible in all pgd before it can be found by - * an NMI allocating from the pool. - */ - vmalloc_sync_mappings(); - rc = gen_pool_add(ghes_estatus_pool, addr, PAGE_ALIGN(len), -1); if (rc) goto err_pool_add; diff --git a/include/linux/vmalloc.h b/include/linux/vmalloc.h index 0efc35dc5b25..48bb681e6c2a 100644 --- a/include/linux/vmalloc.h +++ b/include/linux/vmalloc.h @@ -130,8 +130,6 @@ extern int remap_vmalloc_range_partial(struct vm_area_struct *vma, extern int remap_vmalloc_range(struct vm_area_struct *vma, void *addr, unsigned long pgoff); -void vmalloc_sync_mappings(void); -void vmalloc_sync_unmappings(void); /* * Architectures can set this mask to a combination of PGTBL_P?D_MODIFIED values diff --git a/kernel/notifier.c b/kernel/notifier.c index 5989bbb93039..84c987dfbe03 100644 --- a/kernel/notifier.c +++ b/kernel/notifier.c @@ -519,7 +519,6 @@ NOKPROBE_SYMBOL(notify_die); int register_die_notifier(struct notifier_block *nb) { - vmalloc_sync_mappings(); return atomic_notifier_chain_register(&die_chain, nb); } EXPORT_SYMBOL_GPL(register_die_notifier); diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 29615f15a820..f12e99b387b2 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -8527,18 +8527,6 @@ static int allocate_trace_buffers(struct trace_array *tr, int size) allocate_snapshot = false; #endif - /* - * Because of some magic with the way alloc_percpu() works on - * x86_64, we need to synchronize the pgd of all the tables, - * otherwise the trace events that happen in x86_64 page fault - * handlers can't cope with accessing the chance that a - * alloc_percpu()'d memory might be touched in the page fault trace - * event. 
Oh, and we need to audit all other alloc_percpu() and vmalloc() - * calls in tracing, because something might get triggered within a - * page fault trace event! - */ - vmalloc_sync_mappings(); - return 0; } diff --git a/mm/nommu.c b/mm/nommu.c index 371697bf372d..dfae55f41901 100644 --- a/mm/nommu.c +++ b/mm/nommu.c @@ -371,18 +371,6 @@ void vm_unmap_aliases(void) } EXPORT_SYMBOL_GPL(vm_unmap_aliases); -/* - * Implement a stub for vmalloc_sync_[un]mapping() if the architecture - * chose not to have one. - */ -void __weak vmalloc_sync_mappings(void) -{ -} - -void __weak vmalloc_sync_unmappings(void) -{ -} - struct vm_struct *alloc_vm_area(size_t size, pte_t **ptes) { BUG(); diff --git a/mm/vmalloc.c b/mm/vmalloc.c index 154e3396154c..1e94497b7388 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c @@ -1353,12 +1353,6 @@ static bool __purge_vmap_area_lazy(unsigned long start, unsigned long end) if (unlikely(valist == NULL)) return false; - /* - * First make sure the mappings are removed from all page-tables - * before they are freed. - */ - vmalloc_sync_unmappings(); - /* * TODO: to calculate a flush range without looping. * The list can be up to lazy_max_pages() elements. @@ -3089,21 +3083,6 @@ int remap_vmalloc_range(struct vm_area_struct *vma, void *addr, } EXPORT_SYMBOL(remap_vmalloc_range); -/* - * Implement stubs for vmalloc_sync_[un]mappings () if the architecture chose - * not to have one. - * - * The purpose of this function is to make sure the vmalloc area - * mappings are identical in all page-tables in the system. - */ -void __weak vmalloc_sync_mappings(void) -{ -} - -void __weak vmalloc_sync_unmappings(void) -{ -} - static int f(pte_t *pte, unsigned long addr, void *data) { pte_t ***p = data; -- cgit v1.2.3 From 836e66c218f355ec01ba57671c85abf32961dcea Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Tue, 2 Jun 2020 16:58:32 +0200 Subject: bpf: Fix up bpf_skb_adjust_room helper's skb csum setting Lorenz recently reported: In our TC classifier cls_redirect [0], we use the following sequence of helper calls to decapsulate a GUE (basically IP + UDP + custom header) encapsulated packet: bpf_skb_adjust_room(skb, -encap_len, BPF_ADJ_ROOM_MAC, BPF_F_ADJ_ROOM_FIXED_GSO) bpf_redirect(skb->ifindex, BPF_F_INGRESS) It seems like some checksums of the inner headers are not validated in this case. For example, a TCP SYN packet with invalid TCP checksum is still accepted by the network stack and elicits a SYN ACK. [...] That is, we receive the following packet from the driver: | ETH | IP | UDP | GUE | IP | TCP | skb->ip_summed == CHECKSUM_UNNECESSARY ip_summed is CHECKSUM_UNNECESSARY because our NICs do rx checksum offloading. On this packet we run skb_adjust_room_mac(-encap_len), and get the following: | ETH | IP | TCP | skb->ip_summed == CHECKSUM_UNNECESSARY Note that ip_summed is still CHECKSUM_UNNECESSARY. After bpf_redirect()'ing into the ingress, we end up in tcp_v4_rcv(). There, skb_checksum_init() is turned into a no-op due to CHECKSUM_UNNECESSARY. The bpf_skb_adjust_room() helper is not aware of protocol specifics. Internally, it handles the CHECKSUM_COMPLETE case via skb_postpull_rcsum(), but that does not cover CHECKSUM_UNNECESSARY. In this case skb->csum_level of the original skb prior to bpf_skb_adjust_room() call was 0, that is, covering UDP. Right now there is no way to adjust the skb->csum_level. NICs that have checksum offload disabled (CHECKSUM_NONE) or that support CHECKSUM_COMPLETE are not affected. 
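To make the failure mode concrete, here is a minimal sketch of the decapsulation sequence described above; the GUE header length, section name and return codes are illustrative assumptions, not taken from cls_redirect:

  #include <linux/bpf.h>
  #include <linux/ip.h>
  #include <linux/udp.h>
  #include <linux/pkt_cls.h>
  #include <bpf/bpf_helpers.h>

  /* Hypothetical 4-byte GUE header, for illustration only. */
  #define GUE_HLEN 4

  SEC("classifier")
  int gue_decap(struct __sk_buff *skb)
  {
          const __s32 encap_len = sizeof(struct iphdr) +
                                  sizeof(struct udphdr) + GUE_HLEN;

          /*
           * Strip the outer IP/UDP/GUE headers. Before this patch, the
           * inner packet keeps skb->ip_summed == CHECKSUM_UNNECESSARY,
           * so the stale csum_level covers headers that no longer exist.
           */
          if (bpf_skb_adjust_room(skb, -encap_len, BPF_ADJ_ROOM_MAC,
                                  BPF_F_ADJ_ROOM_FIXED_GSO))
                  return TC_ACT_SHOT;

          return bpf_redirect(skb->ifindex, BPF_F_INGRESS);
  }
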
Use a safe default for CHECKSUM_UNNECESSARY by resetting to CHECKSUM_NONE and add a flag to the helper called BPF_F_ADJ_ROOM_NO_CSUM_RESET that allows users to opt out. Opting out is useful for the case where we don't remove/add full protocol headers, or for the case where a user wants to adjust the csum level manually e.g. through the bpf_csum_level() helper that is added in a subsequent patch. The bpf_skb_proto_{4_to_6,6_to_4}() for NAT64/46 translation from the BPF bpf_skb_change_proto() helper uses the bpf_skb_net_hdr_{push,pop}() pair internally as well but doesn't change layers, only transitions between v4 and v6 and vice versa, therefore no adaptation is required there. [0] https://lore.kernel.org/bpf/20200424185556.7358-1-lmb@cloudflare.com/ Fixes: 2be7e212d541 ("bpf: add bpf_skb_adjust_room helper") Reported-by: Lorenz Bauer Reported-by: Alan Maguire Signed-off-by: Daniel Borkmann Signed-off-by: Lorenz Bauer Signed-off-by: Alexei Starovoitov Reviewed-by: Alan Maguire Link: https://lore.kernel.org/bpf/CACAyw9-uU_52esMd1JjuA80fRPHJv5vsSg8GnfW3t_qDU4aVKQ@mail.gmail.com/ Link: https://lore.kernel.org/bpf/11a90472e7cce83e76ddbfce81fdfce7bfc68808.1591108731.git.daniel@iogearbox.net --- include/linux/skbuff.h | 8 ++++++++ include/uapi/linux/bpf.h | 8 ++++++++ net/core/filter.c | 8 ++++++-- tools/include/uapi/linux/bpf.h | 8 ++++++++ 4 files changed, 30 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index a0d5c2760103..0c0377fc00c2 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -3919,6 +3919,14 @@ static inline void __skb_incr_checksum_unnecessary(struct sk_buff *skb) } } +static inline void __skb_reset_checksum_unnecessary(struct sk_buff *skb) +{ + if (skb->ip_summed == CHECKSUM_UNNECESSARY) { + skb->ip_summed = CHECKSUM_NONE; + skb->csum_level = 0; + } +} + /* Check if we need to perform checksum complete validation. * * Returns true if checksum complete is needed, false otherwise diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index b9ed9f14f2a2..3ba2bbbed80c 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -1635,6 +1635,13 @@ union bpf_attr { * Grow or shrink the room for data in the packet associated to * *skb* by *len_diff*, and according to the selected *mode*. * + * By default, the helper will reset any offloaded checksum + * indicator of the skb to CHECKSUM_NONE. This can be avoided + * by the following flag: + * + * * **BPF_F_ADJ_ROOM_NO_CSUM_RESET**: Do not reset offloaded + * checksum data of the skb to CHECKSUM_NONE. 
+ * * There are two supported modes at this time: * * * **BPF_ADJ_ROOM_MAC**: Adjust room at the mac layer @@ -3433,6 +3440,7 @@ enum { BPF_F_ADJ_ROOM_ENCAP_L3_IPV6 = (1ULL << 2), BPF_F_ADJ_ROOM_ENCAP_L4_GRE = (1ULL << 3), BPF_F_ADJ_ROOM_ENCAP_L4_UDP = (1ULL << 4), + BPF_F_ADJ_ROOM_NO_CSUM_RESET = (1ULL << 5), }; enum { diff --git a/net/core/filter.c b/net/core/filter.c index ae82bcb03124..278dcc0af961 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -3113,7 +3113,8 @@ static int bpf_skb_net_shrink(struct sk_buff *skb, u32 off, u32 len_diff, { int ret; - if (flags & ~BPF_F_ADJ_ROOM_FIXED_GSO) + if (unlikely(flags & ~(BPF_F_ADJ_ROOM_FIXED_GSO | + BPF_F_ADJ_ROOM_NO_CSUM_RESET))) return -EINVAL; if (skb_is_gso(skb) && !skb_is_gso_tcp(skb)) { @@ -3163,7 +3164,8 @@ BPF_CALL_4(bpf_skb_adjust_room, struct sk_buff *, skb, s32, len_diff, u32 off; int ret; - if (unlikely(flags & ~BPF_F_ADJ_ROOM_MASK)) + if (unlikely(flags & ~(BPF_F_ADJ_ROOM_MASK | + BPF_F_ADJ_ROOM_NO_CSUM_RESET))) return -EINVAL; if (unlikely(len_diff_abs > 0xfffU)) return -EFAULT; @@ -3191,6 +3193,8 @@ BPF_CALL_4(bpf_skb_adjust_room, struct sk_buff *, skb, s32, len_diff, ret = shrink ? bpf_skb_net_shrink(skb, off, len_diff_abs, flags) : bpf_skb_net_grow(skb, off, len_diff_abs, flags); + if (!ret && !(flags & BPF_F_ADJ_ROOM_NO_CSUM_RESET)) + __skb_reset_checksum_unnecessary(skb); bpf_compute_data_pointers(skb); return ret; diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h index b9ed9f14f2a2..3ba2bbbed80c 100644 --- a/tools/include/uapi/linux/bpf.h +++ b/tools/include/uapi/linux/bpf.h @@ -1635,6 +1635,13 @@ union bpf_attr { * Grow or shrink the room for data in the packet associated to * *skb* by *len_diff*, and according to the selected *mode*. * + * By default, the helper will reset any offloaded checksum + * indicator of the skb to CHECKSUM_NONE. This can be avoided + * by the following flag: + * + * * **BPF_F_ADJ_ROOM_NO_CSUM_RESET**: Do not reset offloaded + * checksum data of the skb to CHECKSUM_NONE. + * * There are two supported modes at this time: * * * **BPF_ADJ_ROOM_MAC**: Adjust room at the mac layer @@ -3433,6 +3440,7 @@ enum { BPF_F_ADJ_ROOM_ENCAP_L3_IPV6 = (1ULL << 2), BPF_F_ADJ_ROOM_ENCAP_L4_GRE = (1ULL << 3), BPF_F_ADJ_ROOM_ENCAP_L4_UDP = (1ULL << 4), + BPF_F_ADJ_ROOM_NO_CSUM_RESET = (1ULL << 5), }; enum { -- cgit v1.2.3 From 1f55b7ab907d373581e9abf3fc4b24ed19cf831f Mon Sep 17 00:00:00 2001 From: Max Gurtovoy Date: Thu, 28 May 2020 16:45:50 -0300 Subject: RDMA/mlx4: Remove FMR support for memory registration HCAs driven by the mlx4 driver support the FRWR method for registering memory. Remove the ancient and unsafe FMR method. 
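For background, a hedged sketch of the FRWR flow that kernel ULPs use instead; the helper name and access flags are illustrative, and completion handling is omitted:

  #include <rdma/ib_verbs.h>

  /* Register nents pages of sgl through a registration work request. */
  static int frwr_register(struct ib_pd *pd, struct ib_qp *qp,
                           struct scatterlist *sgl, int nents)
  {
          struct ib_reg_wr wr = {};
          struct ib_mr *mr;

          mr = ib_alloc_mr(pd, IB_MR_TYPE_MEM_REG, nents);
          if (IS_ERR(mr))
                  return PTR_ERR(mr);

          if (ib_map_mr_sg(mr, sgl, nents, NULL, PAGE_SIZE) != nents) {
                  ib_dereg_mr(mr);
                  return -EINVAL;
          }

          /*
           * Unlike FMR's out-of-band remapping, registration is an
           * ordinary work request, ordered with the I/O that uses the MR.
           */
          wr.wr.opcode = IB_WR_REG_MR;
          wr.mr = mr;
          wr.key = mr->rkey;
          wr.access = IB_ACCESS_LOCAL_WRITE | IB_ACCESS_REMOTE_READ;

          return ib_post_send(qp, &wr.wr, NULL);
  }
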
Link: https://lore.kernel.org/r/8-v3-f58e6669d5d3+2cf-fmr_removal_jgg@mellanox.com Reviewed-by: Max Gurtovoy Signed-off-by: Max Gurtovoy Signed-off-by: Jason Gunthorpe --- drivers/infiniband/hw/mlx4/main.c | 11 -- drivers/infiniband/hw/mlx4/mlx4_ib.h | 16 --- drivers/infiniband/hw/mlx4/mr.c | 93 --------------- drivers/net/ethernet/mellanox/mlx4/main.c | 2 - drivers/net/ethernet/mellanox/mlx4/mr.c | 183 ------------------------------ include/linux/mlx4/device.h | 22 +--- 6 files changed, 2 insertions(+), 325 deletions(-) (limited to 'include/linux') diff --git a/drivers/infiniband/hw/mlx4/main.c b/drivers/infiniband/hw/mlx4/main.c index 275722cec8c6..816d28854a8e 100644 --- a/drivers/infiniband/hw/mlx4/main.c +++ b/drivers/infiniband/hw/mlx4/main.c @@ -558,7 +558,6 @@ static int mlx4_ib_query_device(struct ib_device *ibdev, props->max_mcast_qp_attach = dev->dev->caps.num_qp_per_mgm; props->max_total_mcast_qp_attach = props->max_mcast_qp_attach * props->max_mcast_grp; - props->max_map_per_fmr = dev->dev->caps.max_fmr_maps; props->hca_core_clock = dev->dev->caps.hca_core_clock * 1000UL; props->timestamp_mask = 0xFFFFFFFFFFFFULL; props->max_ah = INT_MAX; @@ -2600,13 +2599,6 @@ static const struct ib_device_ops mlx4_ib_dev_wq_ops = { .modify_wq = mlx4_ib_modify_wq, }; -static const struct ib_device_ops mlx4_ib_dev_fmr_ops = { - .alloc_fmr = mlx4_ib_fmr_alloc, - .dealloc_fmr = mlx4_ib_fmr_dealloc, - .map_phys_fmr = mlx4_ib_map_phys_fmr, - .unmap_fmr = mlx4_ib_unmap_fmr, -}; - static const struct ib_device_ops mlx4_ib_dev_mw_ops = { .alloc_mw = mlx4_ib_alloc_mw, .dealloc_mw = mlx4_ib_dealloc_mw, @@ -2724,9 +2716,6 @@ static void *mlx4_ib_add(struct mlx4_dev *dev) ib_set_device_ops(&ibdev->ib_dev, &mlx4_ib_dev_wq_ops); } - if (!mlx4_is_slave(ibdev->dev)) - ib_set_device_ops(&ibdev->ib_dev, &mlx4_ib_dev_fmr_ops); - if (dev->caps.flags & MLX4_DEV_CAP_FLAG_MEM_WINDOW || dev->caps.bmme_flags & MLX4_BMME_FLAG_TYPE_2_WIN) { ibdev->ib_dev.uverbs_cmd_mask |= diff --git a/drivers/infiniband/hw/mlx4/mlx4_ib.h b/drivers/infiniband/hw/mlx4/mlx4_ib.h index 182a237b87f7..6f4ea1067095 100644 --- a/drivers/infiniband/hw/mlx4/mlx4_ib.h +++ b/drivers/infiniband/hw/mlx4/mlx4_ib.h @@ -146,11 +146,6 @@ struct mlx4_ib_mw { struct mlx4_mw mmw; }; -struct mlx4_ib_fmr { - struct ib_fmr ibfmr; - struct mlx4_fmr mfmr; -}; - #define MAX_REGS_PER_FLOW 2 struct mlx4_flow_reg_id { @@ -679,11 +674,6 @@ static inline struct mlx4_ib_mw *to_mmw(struct ib_mw *ibmw) return container_of(ibmw, struct mlx4_ib_mw, ibmw); } -static inline struct mlx4_ib_fmr *to_mfmr(struct ib_fmr *ibfmr) -{ - return container_of(ibfmr, struct mlx4_ib_fmr, ibfmr); -} - static inline struct mlx4_ib_flow *to_mflow(struct ib_flow *ibflow) { return container_of(ibflow, struct mlx4_ib_flow, ibflow); @@ -794,12 +784,6 @@ int mlx4_ib_process_mad(struct ib_device *ibdev, int mad_flags, u8 port_num, int mlx4_ib_mad_init(struct mlx4_ib_dev *dev); void mlx4_ib_mad_cleanup(struct mlx4_ib_dev *dev); -struct ib_fmr *mlx4_ib_fmr_alloc(struct ib_pd *pd, int mr_access_flags, - struct ib_fmr_attr *fmr_attr); -int mlx4_ib_map_phys_fmr(struct ib_fmr *ibfmr, u64 *page_list, int npages, - u64 iova); -int mlx4_ib_unmap_fmr(struct list_head *fmr_list); -int mlx4_ib_fmr_dealloc(struct ib_fmr *fmr); int __mlx4_ib_query_port(struct ib_device *ibdev, u8 port, struct ib_port_attr *props, int netw_view); int __mlx4_ib_query_pkey(struct ib_device *ibdev, u8 port, u16 index, diff --git a/drivers/infiniband/hw/mlx4/mr.c b/drivers/infiniband/hw/mlx4/mr.c index b0121c90c561..e2fb71b23c80 
100644 --- a/drivers/infiniband/hw/mlx4/mr.c +++ b/drivers/infiniband/hw/mlx4/mr.c @@ -698,99 +698,6 @@ err_free: return ERR_PTR(err); } -struct ib_fmr *mlx4_ib_fmr_alloc(struct ib_pd *pd, int acc, - struct ib_fmr_attr *fmr_attr) -{ - struct mlx4_ib_dev *dev = to_mdev(pd->device); - struct mlx4_ib_fmr *fmr; - int err = -ENOMEM; - - fmr = kmalloc(sizeof *fmr, GFP_KERNEL); - if (!fmr) - return ERR_PTR(-ENOMEM); - - err = mlx4_fmr_alloc(dev->dev, to_mpd(pd)->pdn, convert_access(acc), - fmr_attr->max_pages, fmr_attr->max_maps, - fmr_attr->page_shift, &fmr->mfmr); - if (err) - goto err_free; - - err = mlx4_fmr_enable(to_mdev(pd->device)->dev, &fmr->mfmr); - if (err) - goto err_mr; - - fmr->ibfmr.rkey = fmr->ibfmr.lkey = fmr->mfmr.mr.key; - - return &fmr->ibfmr; - -err_mr: - (void) mlx4_mr_free(to_mdev(pd->device)->dev, &fmr->mfmr.mr); - -err_free: - kfree(fmr); - - return ERR_PTR(err); -} - -int mlx4_ib_map_phys_fmr(struct ib_fmr *ibfmr, u64 *page_list, - int npages, u64 iova) -{ - struct mlx4_ib_fmr *ifmr = to_mfmr(ibfmr); - struct mlx4_ib_dev *dev = to_mdev(ifmr->ibfmr.device); - - return mlx4_map_phys_fmr(dev->dev, &ifmr->mfmr, page_list, npages, iova, - &ifmr->ibfmr.lkey, &ifmr->ibfmr.rkey); -} - -int mlx4_ib_unmap_fmr(struct list_head *fmr_list) -{ - struct ib_fmr *ibfmr; - int err; - struct mlx4_dev *mdev = NULL; - - list_for_each_entry(ibfmr, fmr_list, list) { - if (mdev && to_mdev(ibfmr->device)->dev != mdev) - return -EINVAL; - mdev = to_mdev(ibfmr->device)->dev; - } - - if (!mdev) - return 0; - - list_for_each_entry(ibfmr, fmr_list, list) { - struct mlx4_ib_fmr *ifmr = to_mfmr(ibfmr); - - mlx4_fmr_unmap(mdev, &ifmr->mfmr, &ifmr->ibfmr.lkey, &ifmr->ibfmr.rkey); - } - - /* - * Make sure all MPT status updates are visible before issuing - * SYNC_TPT firmware command. - */ - wmb(); - - err = mlx4_SYNC_TPT(mdev); - if (err) - pr_warn("SYNC_TPT error %d when " - "unmapping FMRs\n", err); - - return 0; -} - -int mlx4_ib_fmr_dealloc(struct ib_fmr *ibfmr) -{ - struct mlx4_ib_fmr *ifmr = to_mfmr(ibfmr); - struct mlx4_ib_dev *dev = to_mdev(ibfmr->device); - int err; - - err = mlx4_fmr_free(dev->dev, &ifmr->mfmr); - - if (!err) - kfree(ifmr); - - return err; -} - static int mlx4_set_page(struct ib_mr *ibmr, u64 addr) { struct mlx4_ib_mr *mr = to_mmr(ibmr); diff --git a/drivers/net/ethernet/mellanox/mlx4/main.c b/drivers/net/ethernet/mellanox/mlx4/main.c index c72c4e1ea383..3d9aa7da95e9 100644 --- a/drivers/net/ethernet/mellanox/mlx4/main.c +++ b/drivers/net/ethernet/mellanox/mlx4/main.c @@ -2345,8 +2345,6 @@ static int mlx4_init_hca(struct mlx4_dev *dev) goto out_free; } - dev->caps.max_fmr_maps = (1 << (32 - ilog2(dev->caps.num_mpts))) - 1; - if (enable_4k_uar || !dev->persist->num_vfs) { init_hca->log_uar_sz = ilog2(dev->caps.num_uars) + PAGE_SHIFT - DEFAULT_UAR_PAGE_SHIFT; diff --git a/drivers/net/ethernet/mellanox/mlx4/mr.c b/drivers/net/ethernet/mellanox/mlx4/mr.c index 1a11bc0e1612..d2986f1f2db0 100644 --- a/drivers/net/ethernet/mellanox/mlx4/mr.c +++ b/drivers/net/ethernet/mellanox/mlx4/mr.c @@ -966,189 +966,6 @@ void mlx4_cleanup_mr_table(struct mlx4_dev *dev) mlx4_bitmap_cleanup(&mr_table->mpt_bitmap); } -static inline int mlx4_check_fmr(struct mlx4_fmr *fmr, u64 *page_list, - int npages, u64 iova) -{ - int i, page_mask; - - if (npages > fmr->max_pages) - return -EINVAL; - - page_mask = (1 << fmr->page_shift) - 1; - - /* We are getting page lists, so va must be page aligned. 
*/ - if (iova & page_mask) - return -EINVAL; - - /* Trust the user not to pass misaligned data in page_list */ - if (0) - for (i = 0; i < npages; ++i) { - if (page_list[i] & ~page_mask) - return -EINVAL; - } - - if (fmr->maps >= fmr->max_maps) - return -EINVAL; - - return 0; -} - -int mlx4_map_phys_fmr(struct mlx4_dev *dev, struct mlx4_fmr *fmr, u64 *page_list, - int npages, u64 iova, u32 *lkey, u32 *rkey) -{ - u32 key; - int i, err; - - err = mlx4_check_fmr(fmr, page_list, npages, iova); - if (err) - return err; - - ++fmr->maps; - - key = key_to_hw_index(fmr->mr.key); - key += dev->caps.num_mpts; - *lkey = *rkey = fmr->mr.key = hw_index_to_key(key); - - *(u8 *) fmr->mpt = MLX4_MPT_STATUS_SW; - - /* Make sure MPT status is visible before writing MTT entries */ - wmb(); - - dma_sync_single_for_cpu(&dev->persist->pdev->dev, fmr->dma_handle, - npages * sizeof(u64), DMA_TO_DEVICE); - - for (i = 0; i < npages; ++i) - fmr->mtts[i] = cpu_to_be64(page_list[i] | MLX4_MTT_FLAG_PRESENT); - - dma_sync_single_for_device(&dev->persist->pdev->dev, fmr->dma_handle, - npages * sizeof(u64), DMA_TO_DEVICE); - - fmr->mpt->key = cpu_to_be32(key); - fmr->mpt->lkey = cpu_to_be32(key); - fmr->mpt->length = cpu_to_be64(npages * (1ull << fmr->page_shift)); - fmr->mpt->start = cpu_to_be64(iova); - - /* Make MTT entries are visible before setting MPT status */ - wmb(); - - *(u8 *) fmr->mpt = MLX4_MPT_STATUS_HW; - - /* Make sure MPT status is visible before consumer can use FMR */ - wmb(); - - return 0; -} -EXPORT_SYMBOL_GPL(mlx4_map_phys_fmr); - -int mlx4_fmr_alloc(struct mlx4_dev *dev, u32 pd, u32 access, int max_pages, - int max_maps, u8 page_shift, struct mlx4_fmr *fmr) -{ - struct mlx4_priv *priv = mlx4_priv(dev); - int err = -ENOMEM; - - if (max_maps > dev->caps.max_fmr_maps) - return -EINVAL; - - if (page_shift < (ffs(dev->caps.page_size_cap) - 1) || page_shift >= 32) - return -EINVAL; - - /* All MTTs must fit in the same page */ - if (max_pages * sizeof(*fmr->mtts) > PAGE_SIZE) - return -EINVAL; - - fmr->page_shift = page_shift; - fmr->max_pages = max_pages; - fmr->max_maps = max_maps; - fmr->maps = 0; - - err = mlx4_mr_alloc(dev, pd, 0, 0, access, max_pages, - page_shift, &fmr->mr); - if (err) - return err; - - fmr->mtts = mlx4_table_find(&priv->mr_table.mtt_table, - fmr->mr.mtt.offset, - &fmr->dma_handle); - - if (!fmr->mtts) { - err = -ENOMEM; - goto err_free; - } - - return 0; - -err_free: - (void) mlx4_mr_free(dev, &fmr->mr); - return err; -} -EXPORT_SYMBOL_GPL(mlx4_fmr_alloc); - -int mlx4_fmr_enable(struct mlx4_dev *dev, struct mlx4_fmr *fmr) -{ - struct mlx4_priv *priv = mlx4_priv(dev); - int err; - - err = mlx4_mr_enable(dev, &fmr->mr); - if (err) - return err; - - fmr->mpt = mlx4_table_find(&priv->mr_table.dmpt_table, - key_to_hw_index(fmr->mr.key), NULL); - if (!fmr->mpt) - return -ENOMEM; - - return 0; -} -EXPORT_SYMBOL_GPL(mlx4_fmr_enable); - -void mlx4_fmr_unmap(struct mlx4_dev *dev, struct mlx4_fmr *fmr, - u32 *lkey, u32 *rkey) -{ - if (!fmr->maps) - return; - - /* To unmap: it is sufficient to take back ownership from HW */ - *(u8 *)fmr->mpt = MLX4_MPT_STATUS_SW; - - /* Make sure MPT status is visible */ - wmb(); - - fmr->maps = 0; -} -EXPORT_SYMBOL_GPL(mlx4_fmr_unmap); - -int mlx4_fmr_free(struct mlx4_dev *dev, struct mlx4_fmr *fmr) -{ - int ret; - - if (fmr->maps) - return -EBUSY; - if (fmr->mr.enabled == MLX4_MPT_EN_HW) { - /* In case of FMR was enabled and unmapped - * make sure to give ownership of MPT back to HW - * so HW2SW_MPT command will success. 
- */ - *(u8 *)fmr->mpt = MLX4_MPT_STATUS_SW; - /* Make sure MPT status is visible before changing MPT fields */ - wmb(); - fmr->mpt->length = 0; - fmr->mpt->start = 0; - /* Make sure MPT data is visible after changing MPT status */ - wmb(); - *(u8 *)fmr->mpt = MLX4_MPT_STATUS_HW; - /* make sure MPT status is visible */ - wmb(); - } - - ret = mlx4_mr_free(dev, &fmr->mr); - if (ret) - return ret; - fmr->mr.enabled = MLX4_MPT_DISABLED; - - return 0; -} -EXPORT_SYMBOL_GPL(mlx4_fmr_free); - int mlx4_SYNC_TPT(struct mlx4_dev *dev) { return mlx4_cmd(dev, 0, 0, 0, MLX4_CMD_SYNC_TPT, diff --git a/include/linux/mlx4/device.h b/include/linux/mlx4/device.h index 20372de0b587..06e066e04a4b 100644 --- a/include/linux/mlx4/device.h +++ b/include/linux/mlx4/device.h @@ -573,7 +573,6 @@ struct mlx4_caps { int reserved_eqs; int num_comp_vectors; int num_mpts; - int max_fmr_maps; int num_mtts; int fmr_reserved_mtts; int reserved_mtts; @@ -707,17 +706,6 @@ struct mlx4_mw { int enabled; }; -struct mlx4_fmr { - struct mlx4_mr mr; - struct mlx4_mpt_entry *mpt; - __be64 *mtts; - dma_addr_t dma_handle; - int max_pages; - int max_maps; - int maps; - u8 page_shift; -}; - struct mlx4_uar { unsigned long pfn; int index; @@ -1412,14 +1400,6 @@ int mlx4_find_cached_vlan(struct mlx4_dev *dev, u8 port, u16 vid, int *idx); int mlx4_register_vlan(struct mlx4_dev *dev, u8 port, u16 vlan, int *index); void mlx4_unregister_vlan(struct mlx4_dev *dev, u8 port, u16 vlan); -int mlx4_map_phys_fmr(struct mlx4_dev *dev, struct mlx4_fmr *fmr, u64 *page_list, - int npages, u64 iova, u32 *lkey, u32 *rkey); -int mlx4_fmr_alloc(struct mlx4_dev *dev, u32 pd, u32 access, int max_pages, - int max_maps, u8 page_shift, struct mlx4_fmr *fmr); -int mlx4_fmr_enable(struct mlx4_dev *dev, struct mlx4_fmr *fmr); -void mlx4_fmr_unmap(struct mlx4_dev *dev, struct mlx4_fmr *fmr, - u32 *lkey, u32 *rkey); -int mlx4_fmr_free(struct mlx4_dev *dev, struct mlx4_fmr *fmr); int mlx4_SYNC_TPT(struct mlx4_dev *dev); int mlx4_test_interrupt(struct mlx4_dev *dev, int vector); int mlx4_test_async(struct mlx4_dev *dev); @@ -1522,6 +1502,8 @@ int mlx4_vf_smi_enabled(struct mlx4_dev *dev, int slave, int port); int mlx4_vf_get_enable_smi_admin(struct mlx4_dev *dev, int slave, int port); int mlx4_vf_set_enable_smi_admin(struct mlx4_dev *dev, int slave, int port, int enable); + +struct mlx4_mpt_entry; int mlx4_mr_hw_get_mpt(struct mlx4_dev *dev, struct mlx4_mr *mmr, struct mlx4_mpt_entry ***mpt_entry); int mlx4_mr_hw_write_mpt(struct mlx4_dev *dev, struct mlx4_mr *mmr, -- cgit v1.2.3 From 649392bf75a423287a9c4936b341677f12e8cf0b Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Thu, 28 May 2020 16:45:54 -0300 Subject: RDMA: Remove 'max_fmr' Now that FMR support is gone, this attribute can be deleted from all places. 
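Consumers that previously read max_fmr query the surviving max_mr attribute instead, as the net/rds hunk below now does; a minimal hedged sketch, with the helper name being illustrative:

  #include <rdma/ib_verbs.h>

  /* Illustrative only: max_fmr is gone from struct ib_device_attr. */
  static void print_mr_limit(struct ib_device *device)
  {
          pr_info("%s: max_mr = %d\n", device->name, device->attrs.max_mr);
  }
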
Link: https://lore.kernel.org/r/12-v3-f58e6669d5d3+2cf-fmr_removal_jgg@mellanox.com Reviewed-by: Max Gurtovoy Reviewed-by: Bernard Metzler Signed-off-by: Jason Gunthorpe --- drivers/infiniband/core/uverbs_cmd.c | 1 - drivers/infiniband/hw/ocrdma/ocrdma.h | 1 - drivers/infiniband/hw/ocrdma/ocrdma_hw.c | 1 - drivers/infiniband/hw/ocrdma/ocrdma_verbs.c | 1 - drivers/infiniband/hw/qedr/main.c | 1 - drivers/infiniband/hw/qedr/qedr.h | 1 - drivers/infiniband/hw/qedr/verbs.c | 1 - drivers/infiniband/sw/rdmavt/mr.c | 1 - drivers/infiniband/sw/siw/siw.h | 2 -- drivers/infiniband/sw/siw/siw_main.c | 1 - drivers/infiniband/sw/siw/siw_verbs.c | 1 - drivers/net/ethernet/qlogic/qed/qed_rdma.c | 1 - drivers/net/ethernet/qlogic/qed/qed_rdma.h | 1 - include/linux/qed/qed_rdma_if.h | 1 - include/rdma/ib_verbs.h | 1 - net/rds/ib.c | 2 +- 16 files changed, 1 insertion(+), 17 deletions(-) (limited to 'include/linux') diff --git a/drivers/infiniband/core/uverbs_cmd.c b/drivers/infiniband/core/uverbs_cmd.c index 2067a939788b..56d207405dbd 100644 --- a/drivers/infiniband/core/uverbs_cmd.c +++ b/drivers/infiniband/core/uverbs_cmd.c @@ -356,7 +356,6 @@ static void copy_query_dev_fields(struct ib_ucontext *ucontext, resp->max_mcast_qp_attach = attr->max_mcast_qp_attach; resp->max_total_mcast_qp_attach = attr->max_total_mcast_qp_attach; resp->max_ah = attr->max_ah; - resp->max_fmr = attr->max_fmr; resp->max_map_per_fmr = attr->max_map_per_fmr; resp->max_srq = attr->max_srq; resp->max_srq_wr = attr->max_srq_wr; diff --git a/drivers/infiniband/hw/ocrdma/ocrdma.h b/drivers/infiniband/hw/ocrdma/ocrdma.h index 7baedc74e39d..fcfe0e82197a 100644 --- a/drivers/infiniband/hw/ocrdma/ocrdma.h +++ b/drivers/infiniband/hw/ocrdma/ocrdma.h @@ -98,7 +98,6 @@ struct ocrdma_dev_attr { u64 max_mr_size; u32 max_num_mr_pbl; int max_mw; - int max_fmr; int max_map_per_fmr; int max_pages_per_frmr; u16 max_ord_per_qp; diff --git a/drivers/infiniband/hw/ocrdma/ocrdma_hw.c b/drivers/infiniband/hw/ocrdma/ocrdma_hw.c index d82d3ec3649e..e07bf0b2209a 100644 --- a/drivers/infiniband/hw/ocrdma/ocrdma_hw.c +++ b/drivers/infiniband/hw/ocrdma/ocrdma_hw.c @@ -1190,7 +1190,6 @@ static void ocrdma_get_attr(struct ocrdma_dev *dev, attr->max_mr = rsp->max_mr; attr->max_mr_size = ((u64)rsp->max_mr_size_hi << 32) | rsp->max_mr_size_lo; - attr->max_fmr = 0; attr->max_pages_per_frmr = rsp->max_pages_per_frmr; attr->max_num_mr_pbl = rsp->max_num_mr_pbl; attr->max_cqe = rsp->max_cq_cqes_per_cq & diff --git a/drivers/infiniband/hw/ocrdma/ocrdma_verbs.c b/drivers/infiniband/hw/ocrdma/ocrdma_verbs.c index 10e343894595..890e3fd41d21 100644 --- a/drivers/infiniband/hw/ocrdma/ocrdma_verbs.c +++ b/drivers/infiniband/hw/ocrdma/ocrdma_verbs.c @@ -99,7 +99,6 @@ int ocrdma_query_device(struct ib_device *ibdev, struct ib_device_attr *attr, attr->max_mw = dev->attr.max_mw; attr->max_pd = dev->attr.max_pd; attr->atomic_cap = 0; - attr->max_fmr = 0; attr->max_map_per_fmr = 0; attr->max_qp_rd_atom = min(dev->attr.max_ord_per_qp, dev->attr.max_ird_per_qp); diff --git a/drivers/infiniband/hw/qedr/main.c b/drivers/infiniband/hw/qedr/main.c index dcdc85a1ab25..ccaedfd53e49 100644 --- a/drivers/infiniband/hw/qedr/main.c +++ b/drivers/infiniband/hw/qedr/main.c @@ -632,7 +632,6 @@ static int qedr_set_device_attr(struct qedr_dev *dev) attr->max_mr_size = qed_attr->max_mr_size; attr->max_cqe = min_t(u64, qed_attr->max_cqe, QEDR_MAX_CQES); attr->max_mw = qed_attr->max_mw; - attr->max_fmr = qed_attr->max_fmr; attr->max_mr_mw_fmr_pbl = qed_attr->max_mr_mw_fmr_pbl; attr->max_mr_mw_fmr_size 
= qed_attr->max_mr_mw_fmr_size; attr->max_pd = qed_attr->max_pd; diff --git a/drivers/infiniband/hw/qedr/qedr.h b/drivers/infiniband/hw/qedr/qedr.h index 5488dbd59d3c..fdf90ecb2699 100644 --- a/drivers/infiniband/hw/qedr/qedr.h +++ b/drivers/infiniband/hw/qedr/qedr.h @@ -103,7 +103,6 @@ struct qedr_device_attr { u64 max_mr_size; u32 max_cqe; u32 max_mw; - u32 max_fmr; u32 max_mr_mw_fmr_pbl; u64 max_mr_mw_fmr_size; u32 max_pd; diff --git a/drivers/infiniband/hw/qedr/verbs.c b/drivers/infiniband/hw/qedr/verbs.c index d6b94a713573..ca88006eaa66 100644 --- a/drivers/infiniband/hw/qedr/verbs.c +++ b/drivers/infiniband/hw/qedr/verbs.c @@ -145,7 +145,6 @@ int qedr_query_device(struct ib_device *ibdev, attr->max_mw = qattr->max_mw; attr->max_pd = qattr->max_pd; attr->atomic_cap = dev->atomic_cap; - attr->max_fmr = qattr->max_fmr; attr->max_map_per_fmr = 16; attr->max_qp_init_rd_atom = 1 << (fls(qattr->max_qp_req_rd_atomic_resc) - 1); diff --git a/drivers/infiniband/sw/rdmavt/mr.c b/drivers/infiniband/sw/rdmavt/mr.c index ddb0c0d771c2..60864e5ca7cb 100644 --- a/drivers/infiniband/sw/rdmavt/mr.c +++ b/drivers/infiniband/sw/rdmavt/mr.c @@ -97,7 +97,6 @@ int rvt_driver_mr_init(struct rvt_dev_info *rdi) RCU_INIT_POINTER(rdi->lkey_table.table[i], NULL); rdi->dparms.props.max_mr = rdi->lkey_table.max; - rdi->dparms.props.max_fmr = rdi->lkey_table.max; return 0; } diff --git a/drivers/infiniband/sw/siw/siw.h b/drivers/infiniband/sw/siw/siw.h index 5a58a1cc7a7e..e9753831ac3f 100644 --- a/drivers/infiniband/sw/siw/siw.h +++ b/drivers/infiniband/sw/siw/siw.h @@ -30,7 +30,6 @@ #define SIW_MAX_MR (SIW_MAX_QP * 10) #define SIW_MAX_PD SIW_MAX_QP #define SIW_MAX_MW 0 /* to be set if MW's are supported */ -#define SIW_MAX_FMR SIW_MAX_MR #define SIW_MAX_SRQ SIW_MAX_QP #define SIW_MAX_SRQ_WR (SIW_MAX_QP_WR * 10) #define SIW_MAX_CONTEXT SIW_MAX_PD @@ -59,7 +58,6 @@ struct siw_dev_cap { int max_mr; int max_pd; int max_mw; - int max_fmr; int max_srq; int max_srq_wr; int max_srq_sge; diff --git a/drivers/infiniband/sw/siw/siw_main.c b/drivers/infiniband/sw/siw/siw_main.c index 5cd40fb9e20c..a0b8cc643c5c 100644 --- a/drivers/infiniband/sw/siw/siw_main.c +++ b/drivers/infiniband/sw/siw/siw_main.c @@ -413,7 +413,6 @@ static struct siw_device *siw_device_create(struct net_device *netdev) sdev->attrs.max_mr = SIW_MAX_MR; sdev->attrs.max_pd = SIW_MAX_PD; sdev->attrs.max_mw = SIW_MAX_MW; - sdev->attrs.max_fmr = SIW_MAX_FMR; sdev->attrs.max_srq = SIW_MAX_SRQ; sdev->attrs.max_srq_wr = SIW_MAX_SRQ_WR; sdev->attrs.max_srq_sge = SIW_MAX_SGE; diff --git a/drivers/infiniband/sw/siw/siw_verbs.c b/drivers/infiniband/sw/siw/siw_verbs.c index aeb842bc7a1e..987e2ba05dbc 100644 --- a/drivers/infiniband/sw/siw/siw_verbs.c +++ b/drivers/infiniband/sw/siw/siw_verbs.c @@ -136,7 +136,6 @@ int siw_query_device(struct ib_device *base_dev, struct ib_device_attr *attr, attr->max_cq = sdev->attrs.max_cq; attr->max_cqe = sdev->attrs.max_cqe; attr->max_fast_reg_page_list_len = SIW_MAX_SGE_PBL; - attr->max_fmr = sdev->attrs.max_fmr; attr->max_mr = sdev->attrs.max_mr; attr->max_mw = sdev->attrs.max_mw; attr->max_mr_size = ~0ull; diff --git a/drivers/net/ethernet/qlogic/qed/qed_rdma.c b/drivers/net/ethernet/qlogic/qed/qed_rdma.c index 38b1f402f7ed..5dc18a4bdda4 100644 --- a/drivers/net/ethernet/qlogic/qed/qed_rdma.c +++ b/drivers/net/ethernet/qlogic/qed/qed_rdma.c @@ -499,7 +499,6 @@ static void qed_rdma_init_devinfo(struct qed_hwfn *p_hwfn, dev->max_cqe = QED_RDMA_MAX_CQE_16_BIT; dev->max_mw = 0; - dev->max_fmr = QED_RDMA_MAX_FMR; dev->max_mr_mw_fmr_pbl 
= (PAGE_SIZE / 8) * (PAGE_SIZE / 8); dev->max_mr_mw_fmr_size = dev->max_mr_mw_fmr_pbl * PAGE_SIZE; dev->max_pkey = QED_RDMA_MAX_P_KEY; diff --git a/drivers/net/ethernet/qlogic/qed/qed_rdma.h b/drivers/net/ethernet/qlogic/qed/qed_rdma.h index 3689fe3e5935..dfaa2f552627 100644 --- a/drivers/net/ethernet/qlogic/qed/qed_rdma.h +++ b/drivers/net/ethernet/qlogic/qed/qed_rdma.h @@ -45,7 +45,6 @@ #include "qed_iwarp.h" #include "qed_roce.h" -#define QED_RDMA_MAX_FMR (RDMA_MAX_TIDS) #define QED_RDMA_MAX_P_KEY (1) #define QED_RDMA_MAX_WQE (0x7FFF) #define QED_RDMA_MAX_SRQ_WQE_ELEM (0x7FFF) diff --git a/include/linux/qed/qed_rdma_if.h b/include/linux/qed/qed_rdma_if.h index 74efca15fde7..c90276cda5c1 100644 --- a/include/linux/qed/qed_rdma_if.h +++ b/include/linux/qed/qed_rdma_if.h @@ -91,7 +91,6 @@ struct qed_rdma_device { u64 max_mr_size; u32 max_cqe; u32 max_mw; - u32 max_fmr; u32 max_mr_mw_fmr_pbl; u64 max_mr_mw_fmr_size; u32 max_pd; diff --git a/include/rdma/ib_verbs.h b/include/rdma/ib_verbs.h index ff6a8053ec52..c4708b3243f9 100644 --- a/include/rdma/ib_verbs.h +++ b/include/rdma/ib_verbs.h @@ -430,7 +430,6 @@ struct ib_device_attr { int max_mcast_qp_attach; int max_total_mcast_qp_attach; int max_ah; - int max_fmr; int max_map_per_fmr; int max_srq; int max_srq_wr; diff --git a/net/rds/ib.c b/net/rds/ib.c index 6c43b3e4c736..deecbdcdae84 100644 --- a/net/rds/ib.c +++ b/net/rds/ib.c @@ -217,7 +217,7 @@ static int rds_ib_add_one(struct ib_device *device) } rdsdebug("RDS/IB: max_mr = %d, max_wrs = %d, max_sge = %d, max_1m_mrs = %d, max_8k_mrs = %d\n", - device->attrs.max_fmr, rds_ibdev->max_wrs, rds_ibdev->max_sge, + device->attrs.max_mr, rds_ibdev->max_wrs, rds_ibdev->max_sge, rds_ibdev->max_1m_mrs, rds_ibdev->max_8k_mrs); pr_info("RDS/IB: %s: added\n", device->name); -- cgit v1.2.3 From 6a45b0e2589fafd7fc501a5fbd0ab56689a08124 Mon Sep 17 00:00:00 2001 From: Michael Walle Date: Thu, 28 May 2020 16:58:43 +0200 Subject: gpiolib: Introduce gpiochip_irqchip_add_domain() The function connects an IRQ domain to a gpiochip and reuses gpiochip_to_irq() which is provided by gpiolib. gpiochip_irqchip_* and regmap_irq partially provide the same functionality. This function will help to connect just the minimal functionality of the gpiochip_irqchip which is needed to work together with regmap-irq. Signed-off-by: Michael Walle Reviewed-by: Andy Shevchenko Link: https://lore.kernel.org/r/20200528145845.31436-2-michael@walle.cc Signed-off-by: Linus Walleij --- drivers/gpio/gpiolib.c | 20 ++++++++++++++++++++ include/linux/gpio/driver.h | 3 +++ 2 files changed, 23 insertions(+) (limited to 'include/linux') diff --git a/drivers/gpio/gpiolib.c b/drivers/gpio/gpiolib.c index 167860684be6..0a54a759dd40 100644 --- a/drivers/gpio/gpiolib.c +++ b/drivers/gpio/gpiolib.c @@ -2779,6 +2779,26 @@ int gpiochip_irqchip_add_key(struct gpio_chip *gc, } EXPORT_SYMBOL_GPL(gpiochip_irqchip_add_key); +/** + * gpiochip_irqchip_add_domain() - adds an irqdomain to a gpiochip + * @gc: the gpiochip to add the irqchip to + * @domain: the irqdomain to add to the gpiochip + * + * This function adds an IRQ domain to the gpiochip. 
+ */ +int gpiochip_irqchip_add_domain(struct gpio_chip *gc, + struct irq_domain *domain) +{ + if (!domain) + return -EINVAL; + + gc->to_irq = gpiochip_to_irq; + gc->irq.domain = domain; + + return 0; +} +EXPORT_SYMBOL_GPL(gpiochip_irqchip_add_domain); + #else /* CONFIG_GPIOLIB_IRQCHIP */ static inline int gpiochip_add_irqchip(struct gpio_chip *gc, diff --git a/include/linux/gpio/driver.h b/include/linux/gpio/driver.h index c8bcaf315d03..c4f272af7af5 100644 --- a/include/linux/gpio/driver.h +++ b/include/linux/gpio/driver.h @@ -612,6 +612,9 @@ int gpiochip_irqchip_add_key(struct gpio_chip *gc, bool gpiochip_irqchip_irq_valid(const struct gpio_chip *gc, unsigned int offset); +int gpiochip_irqchip_add_domain(struct gpio_chip *gc, + struct irq_domain *domain); + #ifdef CONFIG_LOCKDEP /* -- cgit v1.2.3 From ebe363197e525ffbd279c729421f6f6c24d8d681 Mon Sep 17 00:00:00 2001 From: Michael Walle Date: Thu, 28 May 2020 16:58:44 +0200 Subject: gpio: add a reusable generic gpio_chip using regmap There are quite a lot of simple GPIO controllers which use regmap to access the hardware. This driver tries to be a base to unify existing code into one place. This won't cover everything but it should be a good starting point. It does not implement its own irq_chip because there is already a generic one for regmap based devices. Instead, the irq_chip will be instantiated in the parent driver and its irq domain will be associated with this driver. For now it consists of the usual registers, like set (and an optional clear) data register, an input register and direction registers. Out-of-the-box, it supports consecutive register mappings and mappings where the registers have gaps between them with a linear mapping between GPIO offset and bit position. For weirder mappings the user can register their own .xlate(). Signed-off-by: Michael Walle Reviewed-by: Andy Shevchenko Link: https://lore.kernel.org/r/20200528145845.31436-3-michael@walle.cc Signed-off-by: Linus Walleij --- drivers/gpio/Kconfig | 4 + drivers/gpio/Makefile | 1 + drivers/gpio/gpio-regmap.c | 349 ++++++++++++++++++++++++++++++++++++++++++++ include/linux/gpio/regmap.h | 86 +++++++++++ 4 files changed, 440 insertions(+) create mode 100644 drivers/gpio/gpio-regmap.c create mode 100644 include/linux/gpio/regmap.h (limited to 'include/linux') diff --git a/drivers/gpio/Kconfig b/drivers/gpio/Kconfig index 7d077be10a0f..bcacd9c74aa8 100644 --- a/drivers/gpio/Kconfig +++ b/drivers/gpio/Kconfig @@ -73,6 +73,10 @@ config GPIO_GENERIC depends on HAS_IOMEM # Only for IOMEM drivers tristate +config GPIO_REGMAP + depends on REGMAP + tristate + # put drivers in the right section, in alphabetical order # This symbol is selected by both I2C and SPI expanders diff --git a/drivers/gpio/Makefile b/drivers/gpio/Makefile index 65bf3940e33c..1e4894e0bf0f 100644 --- a/drivers/gpio/Makefile +++ b/drivers/gpio/Makefile @@ -12,6 +12,7 @@ obj-$(CONFIG_GPIO_SYSFS) += gpiolib-sysfs.o obj-$(CONFIG_GPIO_ACPI) += gpiolib-acpi.o # Device drivers. 
Generally keep list sorted alphabetically +obj-$(CONFIG_GPIO_REGMAP) += gpio-regmap.o obj-$(CONFIG_GPIO_GENERIC) += gpio-generic.o # directly supported by gpio-generic diff --git a/drivers/gpio/gpio-regmap.c b/drivers/gpio/gpio-regmap.c new file mode 100644 index 000000000000..5412cb3b0b2a --- /dev/null +++ b/drivers/gpio/gpio-regmap.c @@ -0,0 +1,349 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * regmap based generic GPIO driver + * + * Copyright 2020 Michael Walle + */ + +#include +#include +#include +#include +#include + +struct gpio_regmap { + struct device *parent; + struct regmap *regmap; + struct gpio_chip gpio_chip; + + int reg_stride; + int ngpio_per_reg; + unsigned int reg_dat_base; + unsigned int reg_set_base; + unsigned int reg_clr_base; + unsigned int reg_dir_in_base; + unsigned int reg_dir_out_base; + + int (*reg_mask_xlate)(struct gpio_regmap *gpio, unsigned int base, + unsigned int offset, unsigned int *reg, + unsigned int *mask); + + void *driver_data; +}; + +static unsigned int gpio_regmap_addr(unsigned int addr) +{ + if (addr == GPIO_REGMAP_ADDR_ZERO) + return 0; + + return addr; +} + +static int gpio_regmap_simple_xlate(struct gpio_regmap *gpio, + unsigned int base, unsigned int offset, + unsigned int *reg, unsigned int *mask) +{ + unsigned int line = offset % gpio->ngpio_per_reg; + unsigned int stride = offset / gpio->ngpio_per_reg; + + *reg = base + stride * gpio->reg_stride; + *mask = BIT(line); + + return 0; +} + +static int gpio_regmap_get(struct gpio_chip *chip, unsigned int offset) +{ + struct gpio_regmap *gpio = gpiochip_get_data(chip); + unsigned int base, val, reg, mask; + int ret; + + /* we might not have an output register if we are input only */ + if (gpio->reg_dat_base) + base = gpio_regmap_addr(gpio->reg_dat_base); + else + base = gpio_regmap_addr(gpio->reg_set_base); + + ret = gpio->reg_mask_xlate(gpio, base, offset, ®, &mask); + if (ret) + return ret; + + ret = regmap_read(gpio->regmap, reg, &val); + if (ret) + return ret; + + return !!(val & mask); +} + +static void gpio_regmap_set(struct gpio_chip *chip, unsigned int offset, + int val) +{ + struct gpio_regmap *gpio = gpiochip_get_data(chip); + unsigned int base = gpio_regmap_addr(gpio->reg_set_base); + unsigned int reg, mask; + + gpio->reg_mask_xlate(gpio, base, offset, ®, &mask); + if (val) + regmap_update_bits(gpio->regmap, reg, mask, mask); + else + regmap_update_bits(gpio->regmap, reg, mask, 0); +} + +static void gpio_regmap_set_with_clear(struct gpio_chip *chip, + unsigned int offset, int val) +{ + struct gpio_regmap *gpio = gpiochip_get_data(chip); + unsigned int base, reg, mask; + + if (val) + base = gpio_regmap_addr(gpio->reg_set_base); + else + base = gpio_regmap_addr(gpio->reg_clr_base); + + gpio->reg_mask_xlate(gpio, base, offset, ®, &mask); + regmap_write(gpio->regmap, reg, mask); +} + +static int gpio_regmap_get_direction(struct gpio_chip *chip, + unsigned int offset) +{ + struct gpio_regmap *gpio = gpiochip_get_data(chip); + unsigned int base, val, reg, mask; + int invert, ret; + + if (gpio->reg_dir_out_base) { + base = gpio_regmap_addr(gpio->reg_dir_out_base); + invert = 0; + } else if (gpio->reg_dir_in_base) { + base = gpio_regmap_addr(gpio->reg_dir_in_base); + invert = 1; + } else { + return -EOPNOTSUPP; + } + + ret = gpio->reg_mask_xlate(gpio, base, offset, ®, &mask); + if (ret) + return ret; + + ret = regmap_read(gpio->regmap, reg, &val); + if (ret) + return ret; + + if (!!(val & mask) ^ invert) + return GPIO_LINE_DIRECTION_OUT; + else + return GPIO_LINE_DIRECTION_IN; +} + 
+static int gpio_regmap_set_direction(struct gpio_chip *chip, + unsigned int offset, bool output) +{ + struct gpio_regmap *gpio = gpiochip_get_data(chip); + unsigned int base, val, reg, mask; + int invert, ret; + + if (gpio->reg_dir_out_base) { + base = gpio_regmap_addr(gpio->reg_dir_out_base); + invert = 0; + } else if (gpio->reg_dir_in_base) { + base = gpio_regmap_addr(gpio->reg_dir_in_base); + invert = 1; + } else { + return -EOPNOTSUPP; + } + + ret = gpio->reg_mask_xlate(gpio, base, offset, ®, &mask); + if (ret) + return ret; + + if (invert) + val = output ? 0 : mask; + else + val = output ? mask : 0; + + return regmap_update_bits(gpio->regmap, reg, mask, val); +} + +static int gpio_regmap_direction_input(struct gpio_chip *chip, + unsigned int offset) +{ + return gpio_regmap_set_direction(chip, offset, false); +} + +static int gpio_regmap_direction_output(struct gpio_chip *chip, + unsigned int offset, int value) +{ + gpio_regmap_set(chip, offset, value); + + return gpio_regmap_set_direction(chip, offset, true); +} + +void gpio_regmap_set_drvdata(struct gpio_regmap *gpio, void *data) +{ + gpio->driver_data = data; +} +EXPORT_SYMBOL_GPL(gpio_regmap_set_drvdata); + +void *gpio_regmap_get_drvdata(struct gpio_regmap *gpio) +{ + return gpio->driver_data; +} +EXPORT_SYMBOL_GPL(gpio_regmap_get_drvdata); + +/** + * gpio_regmap_register() - Register a generic regmap GPIO controller + * @config: configuration for gpio_regmap + * + * Return: A pointer to the registered gpio_regmap or ERR_PTR error value. + */ +struct gpio_regmap *gpio_regmap_register(const struct gpio_regmap_config *config) +{ + struct gpio_regmap *gpio; + struct gpio_chip *chip; + int ret; + + if (!config->parent) + return ERR_PTR(-EINVAL); + + if (!config->ngpio) + return ERR_PTR(-EINVAL); + + /* we need at least one */ + if (!config->reg_dat_base && !config->reg_set_base) + return ERR_PTR(-EINVAL); + + /* if we have a direction register we need both input and output */ + if ((config->reg_dir_out_base || config->reg_dir_in_base) && + (!config->reg_dat_base || !config->reg_set_base)) + return ERR_PTR(-EINVAL); + + /* we don't support having both registers simultaneously for now */ + if (config->reg_dir_out_base && config->reg_dir_in_base) + return ERR_PTR(-EINVAL); + + gpio = kzalloc(sizeof(*gpio), GFP_KERNEL); + if (!gpio) + return ERR_PTR(-ENOMEM); + + gpio->parent = config->parent; + gpio->regmap = config->regmap; + gpio->ngpio_per_reg = config->ngpio_per_reg; + gpio->reg_stride = config->reg_stride; + gpio->reg_mask_xlate = config->reg_mask_xlate; + gpio->reg_dat_base = config->reg_dat_base; + gpio->reg_set_base = config->reg_set_base; + gpio->reg_clr_base = config->reg_clr_base; + gpio->reg_dir_in_base = config->reg_dir_in_base; + gpio->reg_dir_out_base = config->reg_dir_out_base; + + /* if not set, assume there is only one register */ + if (!gpio->ngpio_per_reg) + gpio->ngpio_per_reg = config->ngpio; + + /* if not set, assume they are consecutive */ + if (!gpio->reg_stride) + gpio->reg_stride = 1; + + if (!gpio->reg_mask_xlate) + gpio->reg_mask_xlate = gpio_regmap_simple_xlate; + + chip = &gpio->gpio_chip; + chip->parent = config->parent; + chip->base = -1; + chip->ngpio = config->ngpio; + chip->names = config->names; + chip->label = config->label ?: dev_name(config->parent); + + /* + * If our regmap is fast_io we should probably set can_sleep to false. + * Right now, the regmap doesn't save this property, nor is there any + * access function for it. + * The only regmap type which uses fast_io is regmap-mmio. 
For now, + * assume a safe default of true here. + */ + chip->can_sleep = true; + + chip->get = gpio_regmap_get; + if (gpio->reg_set_base && gpio->reg_clr_base) + chip->set = gpio_regmap_set_with_clear; + else if (gpio->reg_set_base) + chip->set = gpio_regmap_set; + + if (gpio->reg_dir_in_base || gpio->reg_dir_out_base) { + chip->get_direction = gpio_regmap_get_direction; + chip->direction_input = gpio_regmap_direction_input; + chip->direction_output = gpio_regmap_direction_output; + } + + ret = gpiochip_add_data(chip, gpio); + if (ret < 0) + goto err_free_gpio; + + if (config->irq_domain) { + ret = gpiochip_irqchip_add_domain(chip, config->irq_domain); + if (ret) + goto err_remove_gpiochip; + } + + return gpio; + +err_remove_gpiochip: + gpiochip_remove(chip); +err_free_gpio: + kfree(gpio); + return ERR_PTR(ret); +} +EXPORT_SYMBOL_GPL(gpio_regmap_register); + +/** + * gpio_regmap_unregister() - Unregister a generic regmap GPIO controller + * @gpio: gpio_regmap device to unregister + */ +void gpio_regmap_unregister(struct gpio_regmap *gpio) +{ + gpiochip_remove(&gpio->gpio_chip); + kfree(gpio); +} +EXPORT_SYMBOL_GPL(gpio_regmap_unregister); + +static void devm_gpio_regmap_unregister(struct device *dev, void *res) +{ + gpio_regmap_unregister(*(struct gpio_regmap **)res); +} + +/** + * devm_gpio_regmap_register() - resource managed gpio_regmap_register() + * @dev: device that is registering this GPIO device + * @config: configuration for gpio_regmap + * + * Managed gpio_regmap_register(). For generic regmap GPIO device registered by + * this function, gpio_regmap_unregister() is automatically called on driver + * detach. See gpio_regmap_register() for more information. + * + * Return: A pointer to the registered gpio_regmap or ERR_PTR error value. + */ +struct gpio_regmap *devm_gpio_regmap_register(struct device *dev, + const struct gpio_regmap_config *config) +{ + struct gpio_regmap **ptr, *gpio; + + ptr = devres_alloc(devm_gpio_regmap_unregister, sizeof(*ptr), + GFP_KERNEL); + if (!ptr) + return ERR_PTR(-ENOMEM); + + gpio = gpio_regmap_register(config); + if (!IS_ERR(gpio)) { + *ptr = gpio; + devres_add(dev, ptr); + } else { + devres_free(ptr); + } + + return gpio; +} +EXPORT_SYMBOL_GPL(devm_gpio_regmap_register); + +MODULE_AUTHOR("Michael Walle "); +MODULE_DESCRIPTION("GPIO generic regmap driver core"); +MODULE_LICENSE("GPL"); diff --git a/include/linux/gpio/regmap.h b/include/linux/gpio/regmap.h new file mode 100644 index 000000000000..4c1e6b34e824 --- /dev/null +++ b/include/linux/gpio/regmap.h @@ -0,0 +1,86 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ + +#ifndef _LINUX_GPIO_REGMAP_H +#define _LINUX_GPIO_REGMAP_H + +struct device; +struct gpio_regmap; +struct irq_domain; +struct regmap; + +#define GPIO_REGMAP_ADDR_ZERO ((unsigned long)(-1)) +#define GPIO_REGMAP_ADDR(addr) ((addr) ? : GPIO_REGMAP_ADDR_ZERO) + +/** + * struct gpio_regmap_config - Description of a generic regmap gpio_chip. + * @parent: The parent device + * @regmap: The regmap used to access the registers + * given, the name of the device is used + * @label: (Optional) Descriptive name for GPIO controller. + * If not given, the name of the device is used. 
+ * @ngpio: Number of GPIOs + * @names: (Optional) Array of names for gpios + * @reg_dat_base: (Optional) (in) register base address + * @reg_set_base: (Optional) set register base address + * @reg_clr_base: (Optional) clear register base address + * @reg_dir_in_base: (Optional) in setting register base address + * @reg_dir_out_base: (Optional) out setting register base address + * @reg_stride: (Optional) May be set if the registers (of the + * same type, dat, set, etc) are not consecutive. + * @ngpio_per_reg: Number of GPIOs per register + * @irq_domain: (Optional) IRQ domain if the controller is + * interrupt-capable + * @reg_mask_xlate: (Optional) Translates base address and GPIO + * offset to a register/bitmask pair. If not + * given the default gpio_regmap_simple_xlate() + * is used. + * + * The ->reg_mask_xlate translates a given base address and GPIO offset to + * register and mask pair. The base address is one of the given register + * base addresses in this structure. + * + * Although all register base addresses are marked as optional, there are + * several rules: + * 1. if you only have @reg_dat_base set, then it is input-only + * 2. if you only have @reg_set_base set, then it is output-only + * 3. if you have either @reg_dir_in_base or @reg_dir_out_base set, then + * you have to set both @reg_dat_base and @reg_set_base + * 4. if you have @reg_set_base set, you may also set @reg_clr_base to have + * two different registers for setting and clearing the output. This is + * also valid for the output-only case. + * 5. @reg_dir_in_base and @reg_dir_out_base are exclusive; is there really + * hardware which has redundant registers? + * + * Note: All base addresses may have the special value %GPIO_REGMAP_ADDR_ZERO + * which forces the address to the value 0. + */ +struct gpio_regmap_config { + struct device *parent; + struct regmap *regmap; + + const char *label; + int ngpio; + const char *const *names; + + unsigned int reg_dat_base; + unsigned int reg_set_base; + unsigned int reg_clr_base; + unsigned int reg_dir_in_base; + unsigned int reg_dir_out_base; + int reg_stride; + int ngpio_per_reg; + struct irq_domain *irq_domain; + + int (*reg_mask_xlate)(struct gpio_regmap *gpio, unsigned int base, + unsigned int offset, unsigned int *reg, + unsigned int *mask); +}; + +struct gpio_regmap *gpio_regmap_register(const struct gpio_regmap_config *config); +void gpio_regmap_unregister(struct gpio_regmap *gpio); +struct gpio_regmap *devm_gpio_regmap_register(struct device *dev, + const struct gpio_regmap_config *config); +void gpio_regmap_set_drvdata(struct gpio_regmap *gpio, void *data); +void *gpio_regmap_get_drvdata(struct gpio_regmap *gpio); + +#endif /* _LINUX_GPIO_REGMAP_H */ -- cgit v1.2.3 From a645a89d9a780a8fbb6e283f84fc91ad538c2edc Mon Sep 17 00:00:00 2001 From: Leon Romanovsky Date: Tue, 2 Jun 2020 15:55:48 +0300 Subject: RDMA/mlx5: Return ECE DC support DC QPs are a many-to-one QP type, which means that the first connection establishes the ECE options that subsequent connections should follow. Due to this property, the ECE code was removed between the first [1] and second [2] ECE submissions. This patch returns the dropped code, because ECE is a property of a connection and, as with any other connection, users need to manage this data. Allow them to set the ECE parameter for DC too and avoid the need for a compatibility flag for the DC ECE. 
[1] https://lore.kernel.org/linux-rdma/20200523132243.817936-1-leon@kernel.org/ [2] https://lore.kernel.org/linux-rdma/20200525174401.71152-1-leon@kernel.org/ Link: https://lore.kernel.org/r/20200602125548.172654-4-leon@kernel.org Signed-off-by: Leon Romanovsky Signed-off-by: Jason Gunthorpe --- drivers/infiniband/hw/mlx5/qp.c | 45 ++++++++++++++++++++++++++++------------- include/linux/mlx5/mlx5_ifc.h | 5 +++-- 2 files changed, 34 insertions(+), 16 deletions(-) (limited to 'include/linux') diff --git a/drivers/infiniband/hw/mlx5/qp.c b/drivers/infiniband/hw/mlx5/qp.c index 18135f908971..81bf6b975e0e 100644 --- a/drivers/infiniband/hw/mlx5/qp.c +++ b/drivers/infiniband/hw/mlx5/qp.c @@ -2404,7 +2404,8 @@ static void destroy_qp_common(struct mlx5_ib_dev *dev, struct mlx5_ib_qp *qp, destroy_qp(dev, qp, base, udata); } -static int create_dct(struct ib_pd *pd, struct mlx5_ib_qp *qp, +static int create_dct(struct mlx5_ib_dev *dev, struct ib_pd *pd, + struct mlx5_ib_qp *qp, struct mlx5_create_qp_params *params) { struct ib_qp_init_attr *attr = params->attr; @@ -2423,6 +2424,8 @@ static int create_dct(struct ib_pd *pd, struct mlx5_ib_qp *qp, MLX5_SET(dctc, dctc, cqn, to_mcq(attr->recv_cq)->mcq.cqn); MLX5_SET64(dctc, dctc, dc_access_key, ucmd->access_key); MLX5_SET(dctc, dctc, user_index, uidx); + if (MLX5_CAP_GEN(dev->mdev, ece_support)) + MLX5_SET(dctc, dctc, ece, ucmd->ece_options); if (qp->flags_en & MLX5_QP_FLAG_SCATTER_CQE) { int rcqe_sz = mlx5_ib_get_cqe_size(attr->recv_cq); @@ -2768,7 +2771,7 @@ static int create_qp(struct mlx5_ib_dev *dev, struct ib_pd *pd, } if (qp->type == MLX5_IB_QPT_DCT) { - err = create_dct(pd, qp, params); + err = create_dct(dev, pd, qp, params); goto out; } @@ -2882,9 +2885,8 @@ static int check_ucmd_data(struct mlx5_ib_dev *dev, */ last = sizeof(struct mlx5_ib_create_qp_rss); else - /* IB_QPT_RAW_PACKET and IB_QPT_DRIVER don't have ECE data */ + /* IB_QPT_RAW_PACKET doesn't have ECE data */ switch (attr->qp_type) { - case IB_QPT_DRIVER: case IB_QPT_RAW_PACKET: last = offsetof(struct mlx5_ib_create_qp, ece_options); break; @@ -4095,7 +4097,8 @@ static bool modify_dci_qp_is_ok(enum ib_qp_state cur_state, enum ib_qp_state new * Other transitions and attributes are illegal */ static int mlx5_ib_modify_dct(struct ib_qp *ibqp, struct ib_qp_attr *attr, - int attr_mask, struct ib_udata *udata) + int attr_mask, struct mlx5_ib_modify_qp *ucmd, + struct ib_udata *udata) { struct mlx5_ib_qp *qp = to_mqp(ibqp); struct mlx5_ib_dev *dev = to_mdev(ibqp->device); @@ -4111,6 +4114,15 @@ static int mlx5_ib_modify_dct(struct ib_qp *ibqp, struct ib_qp_attr *attr, new_state = attr->qp_state; dctc = MLX5_ADDR_OF(create_dct_in, qp->dct.in, dct_context_entry); + if (MLX5_CAP_GEN(dev->mdev, ece_support) && ucmd->ece_options) + /* + * DCT doesn't initialize QP till modify command is executed, + * so we need to overwrite previously set ECE field if user + * provided any value except zero, which means not set/not + * valid. 
+ */ + MLX5_SET(dctc, dctc, ece, ucmd->ece_options); + if (cur_state == IB_QPS_RESET && new_state == IB_QPS_INIT) { u16 set_id; @@ -4145,14 +4157,21 @@ static int mlx5_ib_modify_dct(struct ib_qp *ibqp, struct ib_qp_attr *attr, MLX5_SET(dctc, dctc, counter_set_id, set_id); } else if (cur_state == IB_QPS_INIT && new_state == IB_QPS_RTR) { struct mlx5_ib_modify_qp_resp resp = {}; - u32 out[MLX5_ST_SZ_DW(create_dct_out)] = {0}; - u32 min_resp_len = offsetof(typeof(resp), dctn) + - sizeof(resp.dctn); + u32 out[MLX5_ST_SZ_DW(create_dct_out)] = {}; + u32 min_resp_len = offsetofend(typeof(resp), dctn); if (udata->outlen < min_resp_len) return -EINVAL; resp.response_length = min_resp_len; + /* + * If we don't have enough space for the ECE options, + * simply indicate it with resp.response_length. + */ + resp.response_length = (udata->outlen < sizeof(resp)) ? + min_resp_len : + sizeof(resp); + required |= IB_QP_MIN_RNR_TIMER | IB_QP_AV | IB_QP_PATH_MTU; if (!is_valid_mask(attr_mask, required, 0)) return -EINVAL; @@ -4169,6 +4188,8 @@ static int mlx5_ib_modify_dct(struct ib_qp *ibqp, struct ib_qp_attr *attr, if (err) return err; resp.dctn = qp->dct.mdct.mqp.qpn; + if (MLX5_CAP_GEN(dev->mdev, ece_support)) + resp.ece_options = MLX5_GET(create_dct_out, out, ece); err = ib_copy_to_udata(udata, &resp, resp.response_length); if (err) { mlx5_core_destroy_dct(dev, &qp->dct.mdct); @@ -4226,12 +4247,8 @@ int mlx5_ib_modify_qp(struct ib_qp *ibqp, struct ib_qp_attr *attr, qp_type = (unlikely(ibqp->qp_type == MLX5_IB_QPT_HW_GSI)) ? IB_QPT_GSI : qp->type; - if (qp_type == MLX5_IB_QPT_DCT) { - if (memchr_inv(&ucmd.ece_options, 0, sizeof(ucmd.ece_options))) - return -EOPNOTSUPP; - - return mlx5_ib_modify_dct(ibqp, attr, attr_mask, udata); - } + if (qp_type == MLX5_IB_QPT_DCT) + return mlx5_ib_modify_dct(ibqp, attr, attr_mask, &ucmd, udata); mutex_lock(&qp->mutex); diff --git a/include/linux/mlx5/mlx5_ifc.h b/include/linux/mlx5/mlx5_ifc.h index 4d2e36dd2c6b..116bd9bb347f 100644 --- a/include/linux/mlx5/mlx5_ifc.h +++ b/include/linux/mlx5/mlx5_ifc.h @@ -3690,7 +3690,8 @@ struct mlx5_ifc_dctc_bits { u8 ecn[0x2]; u8 dscp[0x6]; - u8 reserved_at_1c0[0x40]; + u8 reserved_at_1c0[0x20]; + u8 ece[0x20]; }; enum { @@ -7940,7 +7941,7 @@ struct mlx5_ifc_create_dct_out_bits { u8 reserved_at_40[0x8]; u8 dctn[0x18]; - u8 reserved_at_60[0x20]; + u8 ece[0x20]; }; struct mlx5_ifc_create_dct_in_bits { -- cgit v1.2.3 From 5ea75ae6ae60d13dfa35fd5d2e2a81824cba6662 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Tue, 18 Feb 2020 17:30:05 -0500 Subject: user_regset_copyout_zero(): use clear_user() that's the only caller of __clear_user() in generic code, and it's not hot enough to bother with skipping access_ok(). 
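Editor's note: the only practical difference between the two helpers is the access_ok() range check. A hedged sketch of the usual relationship (the real definitions are per-architecture; this is not the literal source):

  static inline unsigned long clear_user(void __user *to, unsigned long n)
  {
          if (access_ok(to, n))            /* the range check __clear_user() skips */
                  return __clear_user(to, n); /* raw zeroing, may still fault */
          return n;                        /* convention: bytes NOT cleared, 0 on success */
  }

Since user_regset_copyout_zero() is nowhere near a fast path, paying for the access_ok() check is harmless.
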
Signed-off-by: Al Viro --- include/linux/regset.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/regset.h b/include/linux/regset.h index bf0243779738..46d6ae68c455 100644 --- a/include/linux/regset.h +++ b/include/linux/regset.h @@ -320,7 +320,7 @@ static inline int user_regset_copyout_zero(unsigned int *pos, if (*kbuf) { memset(*kbuf, 0, copy); *kbuf += copy; - } else if (__clear_user(*ubuf, copy)) + } else if (clear_user(*ubuf, copy)) return -EFAULT; else *ubuf += copy; -- cgit v1.2.3 From 376a34efa4eeb699d285c1a741b186d44b44c429 Mon Sep 17 00:00:00 2001 From: John Hubbard Date: Wed, 3 Jun 2020 15:56:30 -0700 Subject: mm/gup: refactor and de-duplicate gup_fast() code There were two nearly identical sets of code for gup_fast() style of walking the page tables with interrupts disabled. This has led to the usual maintenance problems that arise from having duplicated code. There is already a core internal routine in gup.c for gup_fast(), so just enhance it very slightly: allow skipping the fall-back to "slow" (regular) get_user_pages(), via the new FOLL_FAST_ONLY flag. Then, just call internal_get_user_pages_fast() from __get_user_pages_fast(), and adjust the API to match pre-existing API behavior. There is a change in behavior from this refactoring: the nested form of interrupt disabling is used in all gup_fast() variants now. That's because there is now only one place where interrupt disabling for page walking is done, and so the safer form is required. This should, if anything, eliminate possible (rare) bugs, because the non-nested form of enabling interrupts was fragile at best. [jhubbard@nvidia.com: fixup] Link: http://lkml.kernel.org/r/20200521233841.1279742-1-jhubbard@nvidia.com Signed-off-by: John Hubbard Signed-off-by: Andrew Morton Reviewed-by: Chris Wilson Cc: Daniel Vetter Cc: David Airlie Cc: Jani Nikula Cc: "Joonas Lahtinen" Cc: Matthew Auld Cc: Matthew Wilcox Cc: Rodrigo Vivi Cc: Souptick Joarder Cc: Tvrtko Ursulin Link: http://lkml.kernel.org/r/20200519002124.2025955-3-jhubbard@nvidia.com Signed-off-by: Linus Torvalds --- include/linux/mm.h | 1 + mm/gup.c | 83 +++++++++++++++++++++++++++--------------------------- 2 files changed, 43 insertions(+), 41 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mm.h b/include/linux/mm.h index 7e07f4f490cb..e6b884715da1 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -2816,6 +2816,7 @@ struct page *follow_page(struct vm_area_struct *vma, unsigned long address, #define FOLL_LONGTERM 0x10000 /* mapping lifetime is indefinite: see below */ #define FOLL_SPLIT_PMD 0x20000 /* split huge pmd before returning */ #define FOLL_PIN 0x40000 /* pages must be released via unpin_user_page */ +#define FOLL_FAST_ONLY 0x80000 /* gup_fast: prevent fall-back to slow gup */ /* * FOLL_PIN and FOLL_LONGTERM may be used in various combinations with each diff --git a/mm/gup.c b/mm/gup.c index 57b5f3e22076..67b5e96cd2c7 100644 --- a/mm/gup.c +++ b/mm/gup.c @@ -2731,10 +2731,12 @@ static int internal_get_user_pages_fast(unsigned long start, int nr_pages, struct page **pages) { unsigned long addr, len, end; + unsigned long flags; int nr_pinned = 0, ret = 0; if (WARN_ON_ONCE(gup_flags & ~(FOLL_WRITE | FOLL_LONGTERM | - FOLL_FORCE | FOLL_PIN | FOLL_GET))) + FOLL_FORCE | FOLL_PIN | FOLL_GET | + FOLL_FAST_ONLY))) return -EINVAL; start = untagged_addr(start) & PAGE_MASK; @@ -2753,16 +2755,36 @@ static int internal_get_user_pages_fast(unsigned long start, int nr_pages, * order to 
avoid confusing the normal COW routines. So only * targets that are already writable are safe to do by just * looking at the page tables. + * + * NOTE! With FOLL_FAST_ONLY we allow read-only gup_fast() here, + * because there is no slow path to fall back on. But you'd + * better be careful about possible COW pages - you'll get _a_ + * COW page, but not necessarily the one you intended to get + * depending on what COW event happens after this. COW may break + * the page copy in a random direction. + * + * Disable interrupts. The nested form is used, in order to allow + * full, general purpose use of this routine. + * + * With interrupts disabled, we block page table pages from being + * freed from under us. See struct mmu_table_batch comments in + * include/asm-generic/tlb.h for more details. + * + * We do not adopt an rcu_read_lock(.) here as we also want to + * block IPIs that come from THPs splitting. */ - if (IS_ENABLED(CONFIG_HAVE_FAST_GUP) && - gup_fast_permitted(start, end)) { - local_irq_disable(); - gup_pgd_range(addr, end, gup_flags | FOLL_WRITE, pages, &nr_pinned); - local_irq_enable(); + if (IS_ENABLED(CONFIG_HAVE_FAST_GUP) && gup_fast_permitted(start, end)) { + unsigned long fast_flags = gup_flags; + if (!(gup_flags & FOLL_FAST_ONLY)) + fast_flags |= FOLL_WRITE; + + local_irq_save(flags); + gup_pgd_range(addr, end, fast_flags, pages, &nr_pinned); + local_irq_restore(flags); ret = nr_pinned; } - if (nr_pinned < nr_pages) { + if (nr_pinned < nr_pages && !(gup_flags & FOLL_FAST_ONLY)) { /* Try to get the remaining pages with get_user_pages */ start += nr_pinned << PAGE_SHIFT; pages += nr_pinned; @@ -2798,51 +2820,30 @@ static int internal_get_user_pages_fast(unsigned long start, int nr_pages, int __get_user_pages_fast(unsigned long start, int nr_pages, int write, struct page **pages) { - unsigned long len, end; - unsigned long flags; - int nr_pinned = 0; + int nr_pinned; /* * Internally (within mm/gup.c), gup fast variants must set FOLL_GET, * because gup fast is always a "pin with a +1 page refcount" request. + * + * FOLL_FAST_ONLY is required in order to match the API description of + * this routine: no fall back to regular ("slow") GUP. */ - unsigned int gup_flags = FOLL_GET; + unsigned int gup_flags = FOLL_GET | FOLL_FAST_ONLY; if (write) gup_flags |= FOLL_WRITE; - start = untagged_addr(start) & PAGE_MASK; - len = (unsigned long) nr_pages << PAGE_SHIFT; - end = start + len; - - if (end <= start) - return 0; - if (unlikely(!access_ok((void __user *)start, len))) - return 0; + nr_pinned = internal_get_user_pages_fast(start, nr_pages, gup_flags, + pages); /* - * Disable interrupts. We use the nested form as we can already have - * interrupts disabled by get_futex_key. - * - * With interrupts disabled, we block page table pages from being - * freed from under us. See struct mmu_table_batch comments in - * include/asm-generic/tlb.h for more details. - * - * We do not adopt an rcu_read_lock(.) here as we also want to - * block IPIs that come from THPs splitting. - * - * NOTE! We allow read-only gup_fast() here, but you'd better be - * careful about possible COW pages. You'll get _a_ COW page, but - * not necessarily the one you intended to get depending on what - * COW event happens after this. COW may break the page copy in a - * random direction. + * As specified in the API description above, this routine is not + * allowed to return negative values. However, the common core + * routine internal_get_user_pages_fast() *can* return -errno. 
+ * Therefore, correct for that here: */ - - if (IS_ENABLED(CONFIG_HAVE_FAST_GUP) && - gup_fast_permitted(start, end)) { - local_irq_save(flags); - gup_pgd_range(start, end, gup_flags, pages, &nr_pinned); - local_irq_restore(flags); - } + if (nr_pinned < 0) + nr_pinned = 0; return nr_pinned; } -- cgit v1.2.3 From 104acc327648b347d1716374586803e82fa1dc95 Mon Sep 17 00:00:00 2001 From: John Hubbard Date: Wed, 3 Jun 2020 15:56:34 -0700 Subject: mm/gup: introduce pin_user_pages_fast_only() This is the FOLL_PIN equivalent of __get_user_pages_fast(), except with a more descriptive name, and gup_flags instead of a boolean "write" in the argument list. Signed-off-by: John Hubbard Signed-off-by: Andrew Morton Reviewed-by: Chris Wilson Cc: Daniel Vetter Cc: David Airlie Cc: Jani Nikula Cc: "Joonas Lahtinen" Cc: Matthew Auld Cc: Matthew Wilcox Cc: Rodrigo Vivi Cc: Souptick Joarder Cc: Tvrtko Ursulin Link: http://lkml.kernel.org/r/20200519002124.2025955-4-jhubbard@nvidia.com Signed-off-by: Linus Torvalds --- include/linux/mm.h | 2 ++ mm/gup.c | 36 ++++++++++++++++++++++++++++++++++++ 2 files changed, 38 insertions(+) (limited to 'include/linux') diff --git a/include/linux/mm.h b/include/linux/mm.h index e6b884715da1..6c236c5b0015 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -1827,6 +1827,8 @@ extern int mprotect_fixup(struct vm_area_struct *vma, */ int __get_user_pages_fast(unsigned long start, int nr_pages, int write, struct page **pages); +int pin_user_pages_fast_only(unsigned long start, int nr_pages, + unsigned int gup_flags, struct page **pages); /* * per-process(per-mm_struct) statistics. */ diff --git a/mm/gup.c b/mm/gup.c index 67b5e96cd2c7..0702c1a7e041 100644 --- a/mm/gup.c +++ b/mm/gup.c @@ -2917,6 +2917,42 @@ int pin_user_pages_fast(unsigned long start, int nr_pages, } EXPORT_SYMBOL_GPL(pin_user_pages_fast); +/* + * This is the FOLL_PIN equivalent of __get_user_pages_fast(). Behavior is the + * same, except that this one sets FOLL_PIN instead of FOLL_GET. + * + * The API rules are the same, too: no negative values may be returned. + */ +int pin_user_pages_fast_only(unsigned long start, int nr_pages, + unsigned int gup_flags, struct page **pages) +{ + int nr_pinned; + + /* + * FOLL_GET and FOLL_PIN are mutually exclusive. Note that the API + * rules require returning 0, rather than -errno: + */ + if (WARN_ON_ONCE(gup_flags & FOLL_GET)) + return 0; + /* + * FOLL_FAST_ONLY is required in order to match the API description of + * this routine: no fall back to regular ("slow") GUP. + */ + gup_flags |= (FOLL_PIN | FOLL_FAST_ONLY); + nr_pinned = internal_get_user_pages_fast(start, nr_pages, gup_flags, + pages); + /* + * This routine is not allowed to return negative values. However, + * internal_get_user_pages_fast() *can* return -errno. Therefore, + * correct for that here: + */ + if (nr_pinned < 0) + nr_pinned = 0; + + return nr_pinned; +} +EXPORT_SYMBOL_GPL(pin_user_pages_fast_only); + /** * pin_user_pages_remote() - pin pages of a remote process (task != current) * -- cgit v1.2.3 From 47227d27e2fcb01a9e8f5958d8997cf47a820afc Mon Sep 17 00:00:00 2001 From: Daniel Axtens Date: Wed, 3 Jun 2020 15:56:46 -0700 Subject: string.h: fix incompatibility between FORTIFY_SOURCE and KASAN The memcmp KASAN self-test fails on a kernel with both KASAN and FORTIFY_SOURCE. When FORTIFY_SOURCE is on, a number of functions are replaced with fortified versions, which attempt to check the sizes of the operands. 
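Editor's note: for illustration, this is the general shape of such a fortified wrapper in include/linux/string.h before this patch (simplified; the gcc excess-stack workaround in the real strlen() is elided):

  __FORTIFY_INLINE __kernel_size_t strlen(const char *p)
  {
          __kernel_size_t ret;
          size_t p_size = __builtin_object_size(p, 0);

          if (p_size == (size_t)-1)          /* size unknown: no check possible */
                  return __builtin_strlen(p); /* may be inlined by the compiler,
                                               * bypassing an instrumented strlen() */
          ret = strnlen(p, p_size);
          if (p_size <= ret)
                  fortify_panic(__func__);
          return ret;
  }
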
However, these functions often directly invoke __builtin_foo() once they have performed the fortify check. Using __builtins may bypass KASAN checks if the compiler decides to inline its own implementation as a sequence of instructions, rather than emit a function call that goes out to a KASAN-instrumented implementation. Why is only memcmp affected? ============================ Of the string and string-like functions that kasan_test tests, only memcmp is replaced by an inline sequence of instructions in my testing on x86 with gcc version 9.2.1 20191008 (Ubuntu 9.2.1-9ubuntu2). I believe this is due to compiler heuristics. For example, if I annotate kmalloc calls with the alloc_size annotation (and disable some fortify compile-time checking!), the compiler will replace every memset except the one in kmalloc_uaf_memset with inline instructions. (I have some WIP patches to add this annotation.) Does this affect other functions in string.h? ============================================= Yes. Anything that uses __builtin_* rather than __real_* could be affected. This looks like: - strncpy - strcat - strlen - strlcpy maybe, under some circumstances? - strncat under some circumstances - memset - memcpy - memmove - memcmp (as noted) - memchr - strcpy Whether a function call is emitted always depends on the compiler. Most bugs should get caught by FORTIFY_SOURCE, but the missed memcmp test shows that this is not always the case. Isn't FORTIFY_SOURCE disabled with KASAN? ========================================= The string headers on all arches supporting KASAN disable fortify with kasan, but only when address sanitisation is _also_ disabled. For example from x86: #if defined(CONFIG_KASAN) && !defined(__SANITIZE_ADDRESS__) /* * For files that are not instrumented (e.g. mm/slub.c) we * should use not instrumented version of mem* functions. */ #define memcpy(dst, src, len) __memcpy(dst, src, len) #define memmove(dst, src, len) __memmove(dst, src, len) #define memset(s, c, n) __memset(s, c, n) #ifndef __NO_FORTIFY #define __NO_FORTIFY /* FORTIFY_SOURCE uses __builtin_memcpy, etc. */ #endif #endif This comes from commit 6974f0c4555e ("include/linux/string.h: add the option of fortified string.h functions"), and doesn't work when KASAN is enabled and the file is supposed to be sanitised - as with test_kasan.c. I'm pretty sure this is not wrong, but not as expansive as it should be: * we shouldn't use __builtin_memcpy etc in files where we don't have instrumentation - it could devolve into a function call to memcpy, which will be instrumented. Rather, we should use __memcpy which by convention is not instrumented. * we also shouldn't be using __builtin_memcpy when we have a KASAN instrumented file, because it could be replaced with inline asm that will not be instrumented. What is correct behaviour? ========================== Firstly, there is some overlap between fortification and KASAN: both provide some level of _runtime_ checking. Only fortify provides compile-time checking. KASAN and fortify can pick up different things at runtime: - Some fortify functions, notably the string functions, could easily be modified to consider sub-object sizes (e.g. members within a struct), and I have some WIP patches to do this. KASAN cannot detect these because it cannot insert poison between members of a struct. - KASAN can detect many over-reads/over-writes when the sizes of both operands are unknown, which fortify cannot (see the sketch just below). 
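Editor's note: a hypothetical sketch of that split, assuming a kernel context (linux/slab.h, linux/string.h); the struct and the sizes are invented for illustration:

  struct foo {
          char a[8];
          char b[8];
  };

  void example(struct foo *f, const char *src)
  {
          char *p = kmalloc(8, GFP_KERNEL);

          if (!p)
                  return;

          /* Overflows a[] into b[]: KASAN cannot see it (no redzone inside
           * a struct), and today's fortify only checks against the size of
           * the whole object; the sub-object WIP patches mentioned above
           * would catch it. */
          memcpy(f->a, src, 12);

          /* Heap object: __builtin_object_size() is unknown here, so the
           * fortify check passes, but KASAN's redzone catches the overflow
           * at runtime. */
          memcpy(p, src, 12);

          kfree(p);
  }
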
So there are a couple of options: 1) Flip the test: disable fortify in sanitised files and enable it in unsanitised files. This at least stops us missing KASAN checking, but we lose the fortify checking. 2) Make the fortify code always call out to real versions. Do this only for KASAN, for fear of losing the inlining opportunities we get from __builtin_*. (We can't use kasan_check_{read,write}: because the fortify functions are _extern inline_, you can't include _static_ inline functions without a compiler warning. kasan_check_{read,write} are static inline so we can't use them even when they would otherwise be suitable.) Take approach 2 and call out to real versions when KASAN is enabled. Use __underlying_foo to distinguish from __real_foo: __real_foo always refers to the kernel's implementation of foo, __underlying_foo could be either the kernel implementation or the __builtin_foo implementation. This is sometimes enough to make the memcmp test succeed with FORTIFY_SOURCE enabled. It is at least enough to get the function call into the module. One more fix is needed to make it reliable: see the next patch. Fixes: 6974f0c4555e ("include/linux/string.h: add the option of fortified string.h functions") Signed-off-by: Daniel Axtens Signed-off-by: Andrew Morton Tested-by: David Gow Reviewed-by: Dmitry Vyukov Cc: Daniel Micay Cc: Andrey Ryabinin Cc: Alexander Potapenko Link: http://lkml.kernel.org/r/20200423154503.5103-3-dja@axtens.net Signed-off-by: Linus Torvalds --- include/linux/string.h | 60 ++++++++++++++++++++++++++++++++++++++++---------- 1 file changed, 48 insertions(+), 12 deletions(-) (limited to 'include/linux') diff --git a/include/linux/string.h b/include/linux/string.h index 6dfbb2efa815..9b7a0632e87a 100644 --- a/include/linux/string.h +++ b/include/linux/string.h @@ -272,6 +272,31 @@ void __read_overflow3(void) __compiletime_error("detected read beyond size of ob void __write_overflow(void) __compiletime_error("detected write beyond size of object passed as 1st parameter"); #if !defined(__NO_FORTIFY) && defined(__OPTIMIZE__) && defined(CONFIG_FORTIFY_SOURCE) + +#ifdef CONFIG_KASAN +extern void *__underlying_memchr(const void *p, int c, __kernel_size_t size) __RENAME(memchr); +extern int __underlying_memcmp(const void *p, const void *q, __kernel_size_t size) __RENAME(memcmp); +extern void *__underlying_memcpy(void *p, const void *q, __kernel_size_t size) __RENAME(memcpy); +extern void *__underlying_memmove(void *p, const void *q, __kernel_size_t size) __RENAME(memmove); +extern void *__underlying_memset(void *p, int c, __kernel_size_t size) __RENAME(memset); +extern char *__underlying_strcat(char *p, const char *q) __RENAME(strcat); +extern char *__underlying_strcpy(char *p, const char *q) __RENAME(strcpy); +extern __kernel_size_t __underlying_strlen(const char *p) __RENAME(strlen); +extern char *__underlying_strncat(char *p, const char *q, __kernel_size_t count) __RENAME(strncat); +extern char *__underlying_strncpy(char *p, const char *q, __kernel_size_t size) __RENAME(strncpy); +#else +#define __underlying_memchr __builtin_memchr +#define __underlying_memcmp __builtin_memcmp +#define __underlying_memcpy __builtin_memcpy +#define __underlying_memmove __builtin_memmove +#define __underlying_memset __builtin_memset +#define __underlying_strcat __builtin_strcat +#define __underlying_strcpy __builtin_strcpy +#define __underlying_strlen __builtin_strlen +#define __underlying_strncat __builtin_strncat +#define __underlying_strncpy __builtin_strncpy +#endif + __FORTIFY_INLINE char 
*strncpy(char *p, const char *q, __kernel_size_t size) { size_t p_size = __builtin_object_size(p, 0); @@ -279,14 +304,14 @@ __FORTIFY_INLINE char *strncpy(char *p, const char *q, __kernel_size_t size) __write_overflow(); if (p_size < size) fortify_panic(__func__); - return __builtin_strncpy(p, q, size); + return __underlying_strncpy(p, q, size); } __FORTIFY_INLINE char *strcat(char *p, const char *q) { size_t p_size = __builtin_object_size(p, 0); if (p_size == (size_t)-1) - return __builtin_strcat(p, q); + return __underlying_strcat(p, q); if (strlcat(p, q, p_size) >= p_size) fortify_panic(__func__); return p; @@ -300,7 +325,7 @@ __FORTIFY_INLINE __kernel_size_t strlen(const char *p) /* Work around gcc excess stack consumption issue */ if (p_size == (size_t)-1 || (__builtin_constant_p(p[p_size - 1]) && p[p_size - 1] == '\0')) - return __builtin_strlen(p); + return __underlying_strlen(p); ret = strnlen(p, p_size); if (p_size <= ret) fortify_panic(__func__); @@ -333,7 +358,7 @@ __FORTIFY_INLINE size_t strlcpy(char *p, const char *q, size_t size) __write_overflow(); if (len >= p_size) fortify_panic(__func__); - __builtin_memcpy(p, q, len); + __underlying_memcpy(p, q, len); p[len] = '\0'; } return ret; @@ -346,12 +371,12 @@ __FORTIFY_INLINE char *strncat(char *p, const char *q, __kernel_size_t count) size_t p_size = __builtin_object_size(p, 0); size_t q_size = __builtin_object_size(q, 0); if (p_size == (size_t)-1 && q_size == (size_t)-1) - return __builtin_strncat(p, q, count); + return __underlying_strncat(p, q, count); p_len = strlen(p); copy_len = strnlen(q, count); if (p_size < p_len + copy_len + 1) fortify_panic(__func__); - __builtin_memcpy(p + p_len, q, copy_len); + __underlying_memcpy(p + p_len, q, copy_len); p[p_len + copy_len] = '\0'; return p; } @@ -363,7 +388,7 @@ __FORTIFY_INLINE void *memset(void *p, int c, __kernel_size_t size) __write_overflow(); if (p_size < size) fortify_panic(__func__); - return __builtin_memset(p, c, size); + return __underlying_memset(p, c, size); } __FORTIFY_INLINE void *memcpy(void *p, const void *q, __kernel_size_t size) @@ -378,7 +403,7 @@ __FORTIFY_INLINE void *memcpy(void *p, const void *q, __kernel_size_t size) } if (p_size < size || q_size < size) fortify_panic(__func__); - return __builtin_memcpy(p, q, size); + return __underlying_memcpy(p, q, size); } __FORTIFY_INLINE void *memmove(void *p, const void *q, __kernel_size_t size) @@ -393,7 +418,7 @@ __FORTIFY_INLINE void *memmove(void *p, const void *q, __kernel_size_t size) } if (p_size < size || q_size < size) fortify_panic(__func__); - return __builtin_memmove(p, q, size); + return __underlying_memmove(p, q, size); } extern void *__real_memscan(void *, int, __kernel_size_t) __RENAME(memscan); @@ -419,7 +444,7 @@ __FORTIFY_INLINE int memcmp(const void *p, const void *q, __kernel_size_t size) } if (p_size < size || q_size < size) fortify_panic(__func__); - return __builtin_memcmp(p, q, size); + return __underlying_memcmp(p, q, size); } __FORTIFY_INLINE void *memchr(const void *p, int c, __kernel_size_t size) @@ -429,7 +454,7 @@ __FORTIFY_INLINE void *memchr(const void *p, int c, __kernel_size_t size) __read_overflow(); if (p_size < size) fortify_panic(__func__); - return __builtin_memchr(p, c, size); + return __underlying_memchr(p, c, size); } void *__real_memchr_inv(const void *s, int c, size_t n) __RENAME(memchr_inv); @@ -460,11 +485,22 @@ __FORTIFY_INLINE char *strcpy(char *p, const char *q) size_t p_size = __builtin_object_size(p, 0); size_t q_size = __builtin_object_size(q, 0); if (p_size == 
(size_t)-1 && q_size == (size_t)-1) - return __builtin_strcpy(p, q); + return __underlying_strcpy(p, q); memcpy(p, q, strlen(q) + 1); return p; } +/* Don't use these outside the FORTIFY_SOURCE implementation */ +#undef __underlying_memchr +#undef __underlying_memcmp +#undef __underlying_memcpy +#undef __underlying_memmove +#undef __underlying_memset +#undef __underlying_strcat +#undef __underlying_strcpy +#undef __underlying_strlen +#undef __underlying_strncat +#undef __underlying_strncpy #endif /** -- cgit v1.2.3 From 574c1ae66c12410a08aeef8474936baa50e0371d Mon Sep 17 00:00:00 2001 From: Michal Hocko Date: Wed, 3 Jun 2020 15:56:49 -0700 Subject: mm: clarify __GFP_MEMALLOC usage It seems that the existing documentation is not explicit enough about the expected usage and the potential risks. While it does call out that users have to free memory when using this flag, it is not really apparent that users have to be careful not to deplete memory reserves, and that they should implement some sort of throttling with respect to the freeing process. This is partly based on Neil's explanation [1]. Let's also call out that a pre-allocated pool allocator should be considered. [1] http://lkml.kernel.org/r/877dz0yxoa.fsf@notabene.neil.brown.name [akpm@linux-foundation.org: coding style fixes] [mhocko@kernel.org: update] Link: http://lkml.kernel.org/r/20200406070137.GC19426@dhcp22.suse.cz Signed-off-by: Michal Hocko Signed-off-by: Andrew Morton Cc: David Rientjes Cc: Joel Fernandes Cc: Neil Brown Cc: Paul E. McKenney Cc: John Hubbard Link: http://lkml.kernel.org/r/20200403083543.11552-2-mhocko@kernel.org Signed-off-by: Linus Torvalds --- include/linux/gfp.h | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'include/linux') diff --git a/include/linux/gfp.h b/include/linux/gfp.h index 4aba4c86c626..fab6d486cbb7 100644 --- a/include/linux/gfp.h +++ b/include/linux/gfp.h @@ -110,6 +110,11 @@ struct vm_area_struct; * the caller guarantees the allocation will allow more memory to be freed * very shortly e.g. process exiting or swapping. Users either should * be the MM or co-ordinating closely with the VM (e.g. swap over NFS). + * Users of this flag have to be extremely careful to not deplete the reserve + * completely and implement a throttling mechanism which controls the + * consumption of the reserve based on the amount of freed memory. + * Usage of a pre-allocated pool (e.g. mempool) should be always considered + * before using this flag. * * %__GFP_NOMEMALLOC is used to explicitly forbid access to emergency reserves. * This takes precedence over the %__GFP_MEMALLOC flag if both are set. -- cgit v1.2.3 From 6f24fbd38c4e05f7905814791806c01dc6c4b9de Mon Sep 17 00:00:00 2001 From: Mike Rapoport Date: Wed, 3 Jun 2020 15:56:57 -0700 Subject: mm: make early_pfn_to_nid() and related definitions close to each other early_pfn_to_nid() and its helper __early_pfn_to_nid() are spread around include/linux/mm.h, include/linux/mmzone.h and mm/page_alloc.c. Drop the unused stub for __early_pfn_to_nid() and move its actual generic implementation close to its users. Signed-off-by: Mike Rapoport Signed-off-by: Andrew Morton Tested-by: Hoan Tran [arm64] Reviewed-by: Baoquan He Cc: Brian Cain Cc: Catalin Marinas Cc: "David S. Miller" Cc: Geert Uytterhoeven Cc: Greentime Hu Cc: Greg Ungerer Cc: Guan Xuetao Cc: Guo Ren Cc: Heiko Carstens Cc: Helge Deller Cc: "James E.J. 
Bottomley" Cc: Jonathan Corbet Cc: Ley Foon Tan Cc: Mark Salter Cc: Matt Turner Cc: Max Filippov Cc: Michael Ellerman Cc: Michal Hocko Cc: Michal Simek Cc: Nick Hu Cc: Paul Walmsley Cc: Richard Weinberger Cc: Rich Felker Cc: Russell King Cc: Stafford Horne Cc: Thomas Bogendoerfer Cc: Tony Luck Cc: Vineet Gupta Cc: Yoshinori Sato Link: http://lkml.kernel.org/r/20200412194859.12663-3-rppt@kernel.org Signed-off-by: Linus Torvalds --- include/linux/mm.h | 4 ++-- include/linux/mmzone.h | 9 --------- mm/page_alloc.c | 51 +++++++++++++++++++++++++------------------------- 3 files changed, 27 insertions(+), 37 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mm.h b/include/linux/mm.h index 6c236c5b0015..4288e6993dc8 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -2445,9 +2445,9 @@ extern void sparse_memory_present_with_active_regions(int nid); #if !defined(CONFIG_HAVE_MEMBLOCK_NODE_MAP) && \ !defined(CONFIG_HAVE_ARCH_EARLY_PFN_TO_NID) -static inline int __early_pfn_to_nid(unsigned long pfn, - struct mminit_pfnnid_cache *state) +static inline int early_pfn_to_nid(unsigned long pfn) { + BUILD_BUG_ON(IS_ENABLED(CONFIG_NUMA)); return 0; } #else diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index fdd9beb5efed..c3a77eb85b42 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -1080,15 +1080,6 @@ static inline struct zoneref *first_zones_zonelist(struct zonelist *zonelist, #include #endif -#if !defined(CONFIG_HAVE_ARCH_EARLY_PFN_TO_NID) && \ - !defined(CONFIG_HAVE_MEMBLOCK_NODE_MAP) -static inline unsigned long early_pfn_to_nid(unsigned long pfn) -{ - BUILD_BUG_ON(IS_ENABLED(CONFIG_NUMA)); - return 0; -} -#endif - #ifdef CONFIG_FLATMEM #define pfn_to_nid(pfn) (0) #endif diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 511602288e2d..8741ae0828e1 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -1504,6 +1504,31 @@ void __free_pages_core(struct page *page, unsigned int order) static struct mminit_pfnnid_cache early_pfnnid_cache __meminitdata; +#ifndef CONFIG_HAVE_ARCH_EARLY_PFN_TO_NID + +/* + * Required by SPARSEMEM. Given a PFN, return what node the PFN is on. + */ +int __meminit __early_pfn_to_nid(unsigned long pfn, + struct mminit_pfnnid_cache *state) +{ + unsigned long start_pfn, end_pfn; + int nid; + + if (state->last_start <= pfn && pfn < state->last_end) + return state->last_nid; + + nid = memblock_search_pfn_nid(pfn, &start_pfn, &end_pfn); + if (nid != NUMA_NO_NODE) { + state->last_start = start_pfn; + state->last_end = end_pfn; + state->last_nid = nid; + } + + return nid; +} +#endif /* CONFIG_HAVE_ARCH_EARLY_PFN_TO_NID */ + int __meminit early_pfn_to_nid(unsigned long pfn) { static DEFINE_SPINLOCK(early_pfn_lock); @@ -6310,32 +6335,6 @@ void __meminit init_currently_empty_zone(struct zone *zone, zone->initialized = 1; } -#ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP -#ifndef CONFIG_HAVE_ARCH_EARLY_PFN_TO_NID - -/* - * Required by SPARSEMEM. Given a PFN, return what node the PFN is on. 
- */ -int __meminit __early_pfn_to_nid(unsigned long pfn, - struct mminit_pfnnid_cache *state) -{ - unsigned long start_pfn, end_pfn; - int nid; - - if (state->last_start <= pfn && pfn < state->last_end) - return state->last_nid; - - nid = memblock_search_pfn_nid(pfn, &start_pfn, &end_pfn); - if (nid != NUMA_NO_NODE) { - state->last_start = start_pfn; - state->last_end = end_pfn; - state->last_nid = nid; - } - - return nid; -} -#endif /* CONFIG_HAVE_ARCH_EARLY_PFN_TO_NID */ - /** * free_bootmem_with_active_regions - Call memblock_free_early_nid for each active range * @nid: The node to free memory on. If MAX_NUMNODES, all nodes are freed. -- cgit v1.2.3 From 3f08a302f533f74ad2e909e7a61274aa7eebc0ab Mon Sep 17 00:00:00 2001 From: Mike Rapoport Date: Wed, 3 Jun 2020 15:57:02 -0700 Subject: mm: remove CONFIG_HAVE_MEMBLOCK_NODE_MAP option CONFIG_HAVE_MEMBLOCK_NODE_MAP is used to differentiate the initialization of node and zone structures between systems that have a region-to-node mapping in memblock and those that don't. Currently all the NUMA architectures enable this option, and for non-NUMA systems we can presume that all the memory belongs to node 0; therefore the compile-time configuration option is not required. The remaining few architectures that use DISCONTIGMEM without NUMA are easily updated to use memblock_add_node() instead of memblock_add() and thus have a proper correspondence of memblock regions to NUMA nodes. Still, free_area_init_node() must have a backward-compatible version because its semantics with and without CONFIG_HAVE_MEMBLOCK_NODE_MAP are different. Once all the architectures use the new semantics, the entire compatibility layer can be dropped. To avoid adding extra run-time memory to store the node id for architectures that keep memblock but have only a single node, the node id field of the memblock_region is guarded by CONFIG_NEED_MULTIPLE_NODES and the corresponding accessors presume that in those cases it is always 0. Signed-off-by: Mike Rapoport Signed-off-by: Andrew Morton Tested-by: Hoan Tran [arm64] Acked-by: Catalin Marinas [arm64] Cc: Baoquan He Cc: Brian Cain Cc: "David S. Miller" Cc: Geert Uytterhoeven Cc: Greentime Hu Cc: Greg Ungerer Cc: Guan Xuetao Cc: Guo Ren Cc: Heiko Carstens Cc: Helge Deller Cc: "James E.J. 
Bottomley" Cc: Jonathan Corbet Cc: Ley Foon Tan Cc: Mark Salter Cc: Matt Turner Cc: Max Filippov Cc: Michael Ellerman Cc: Michal Hocko Cc: Michal Simek Cc: Nick Hu Cc: Paul Walmsley Cc: Richard Weinberger Cc: Rich Felker Cc: Russell King Cc: Stafford Horne Cc: Thomas Bogendoerfer Cc: Tony Luck Cc: Vineet Gupta Cc: Yoshinori Sato Link: http://lkml.kernel.org/r/20200412194859.12663-4-rppt@kernel.org Signed-off-by: Linus Torvalds --- .../features/vm/numa-memblock/arch-support.txt | 34 ------- arch/alpha/mm/numa.c | 4 +- arch/arm64/Kconfig | 1 - arch/ia64/Kconfig | 1 - arch/m68k/mm/motorola.c | 4 +- arch/microblaze/Kconfig | 1 - arch/mips/Kconfig | 1 - arch/powerpc/Kconfig | 1 - arch/riscv/Kconfig | 1 - arch/s390/Kconfig | 1 - arch/sh/Kconfig | 1 - arch/sparc/Kconfig | 1 - arch/x86/Kconfig | 1 - include/linux/memblock.h | 8 +- include/linux/mm.h | 12 +-- include/linux/mmzone.h | 2 +- mm/Kconfig | 3 - mm/memblock.c | 11 +-- mm/memory_hotplug.c | 4 - mm/page_alloc.c | 101 ++++++++++++--------- 20 files changed, 74 insertions(+), 119 deletions(-) delete mode 100644 Documentation/features/vm/numa-memblock/arch-support.txt (limited to 'include/linux') diff --git a/Documentation/features/vm/numa-memblock/arch-support.txt b/Documentation/features/vm/numa-memblock/arch-support.txt deleted file mode 100644 index 3004beb0fd71..000000000000 --- a/Documentation/features/vm/numa-memblock/arch-support.txt +++ /dev/null @@ -1,34 +0,0 @@ -# -# Feature name: numa-memblock -# Kconfig: HAVE_MEMBLOCK_NODE_MAP -# description: arch supports NUMA aware memblocks -# - ----------------------- - | arch |status| - ----------------------- - | alpha: | TODO | - | arc: | .. | - | arm: | .. | - | arm64: | ok | - | c6x: | .. | - | csky: | .. | - | h8300: | .. | - | hexagon: | .. | - | ia64: | ok | - | m68k: | .. | - | microblaze: | ok | - | mips: | ok | - | nds32: | TODO | - | nios2: | .. | - | openrisc: | .. | - | parisc: | .. | - | powerpc: | ok | - | riscv: | ok | - | s390: | ok | - | sh: | ok | - | sparc: | ok | - | um: | .. | - | unicore32: | .. | - | x86: | ok | - | xtensa: | .. | - ----------------------- diff --git a/arch/alpha/mm/numa.c b/arch/alpha/mm/numa.c index d0b73371e985..a24cd13e71cb 100644 --- a/arch/alpha/mm/numa.c +++ b/arch/alpha/mm/numa.c @@ -144,8 +144,8 @@ setup_memory_node(int nid, void *kernel_end) if (!nid && (node_max_pfn < end_kernel_pfn || node_min_pfn > start_kernel_pfn)) panic("kernel loaded out of ram"); - memblock_add(PFN_PHYS(node_min_pfn), - (node_max_pfn - node_min_pfn) << PAGE_SHIFT); + memblock_add_node(PFN_PHYS(node_min_pfn), + (node_max_pfn - node_min_pfn) << PAGE_SHIFT, nid); /* Zone start phys-addr must be 2^(MAX_ORDER-1) aligned. 
Note that we round this down, not up - node memory diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig index 552d36cacc05..1a9b480c6f1d 100644 --- a/arch/arm64/Kconfig +++ b/arch/arm64/Kconfig @@ -162,7 +162,6 @@ config ARM64 select HAVE_GCC_PLUGINS select HAVE_HW_BREAKPOINT if PERF_EVENTS select HAVE_IRQ_TIME_ACCOUNTING - select HAVE_MEMBLOCK_NODE_MAP if NUMA select HAVE_NMI select HAVE_PATA_PLATFORM select HAVE_PERF_EVENTS diff --git a/arch/ia64/Kconfig b/arch/ia64/Kconfig index bab7cd878464..88b05b5256a9 100644 --- a/arch/ia64/Kconfig +++ b/arch/ia64/Kconfig @@ -31,7 +31,6 @@ config IA64 select HAVE_FUNCTION_TRACER select TTY select HAVE_ARCH_TRACEHOOK - select HAVE_MEMBLOCK_NODE_MAP select HAVE_VIRT_CPU_ACCOUNTING select DMA_NONCOHERENT_MMAP select ARCH_HAS_SYNC_DMA_FOR_CPU diff --git a/arch/m68k/mm/motorola.c b/arch/m68k/mm/motorola.c index fc16190ec2d6..84ab5963cabb 100644 --- a/arch/m68k/mm/motorola.c +++ b/arch/m68k/mm/motorola.c @@ -386,7 +386,7 @@ void __init paging_init(void) min_addr = m68k_memory[0].addr; max_addr = min_addr + m68k_memory[0].size; - memblock_add(m68k_memory[0].addr, m68k_memory[0].size); + memblock_add_node(m68k_memory[0].addr, m68k_memory[0].size, 0); for (i = 1; i < m68k_num_memory;) { if (m68k_memory[i].addr < min_addr) { printk("Ignoring memory chunk at 0x%lx:0x%lx before the first chunk\n", @@ -397,7 +397,7 @@ void __init paging_init(void) (m68k_num_memory - i) * sizeof(struct m68k_mem_info)); continue; } - memblock_add(m68k_memory[i].addr, m68k_memory[i].size); + memblock_add_node(m68k_memory[i].addr, m68k_memory[i].size, i); addr = m68k_memory[i].addr + m68k_memory[i].size; if (addr > max_addr) max_addr = addr; diff --git a/arch/microblaze/Kconfig b/arch/microblaze/Kconfig index 9606c244b5b8..d262ac0c8714 100644 --- a/arch/microblaze/Kconfig +++ b/arch/microblaze/Kconfig @@ -32,7 +32,6 @@ config MICROBLAZE select HAVE_FTRACE_MCOUNT_RECORD select HAVE_FUNCTION_GRAPH_TRACER select HAVE_FUNCTION_TRACER - select HAVE_MEMBLOCK_NODE_MAP select HAVE_OPROFILE select HAVE_PCI select IRQ_DOMAIN diff --git a/arch/mips/Kconfig b/arch/mips/Kconfig index 690718b3701a..94a91b5b7759 100644 --- a/arch/mips/Kconfig +++ b/arch/mips/Kconfig @@ -72,7 +72,6 @@ config MIPS select HAVE_KPROBES select HAVE_KRETPROBES select HAVE_LD_DEAD_CODE_DATA_ELIMINATION - select HAVE_MEMBLOCK_NODE_MAP select HAVE_MOD_ARCH_SPECIFIC select HAVE_NMI select HAVE_OPROFILE diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig index b29d7cb38368..41ba42b107c0 100644 --- a/arch/powerpc/Kconfig +++ b/arch/powerpc/Kconfig @@ -211,7 +211,6 @@ config PPC select HAVE_KRETPROBES select HAVE_LD_DEAD_CODE_DATA_ELIMINATION select HAVE_LIVEPATCH if HAVE_DYNAMIC_FTRACE_WITH_REGS - select HAVE_MEMBLOCK_NODE_MAP select HAVE_MOD_ARCH_SPECIFIC select HAVE_NMI if PERF_EVENTS || (PPC64 && PPC_BOOK3S) select HAVE_HARDLOCKUP_DETECTOR_ARCH if (PPC64 && PPC_BOOK3S) diff --git a/arch/riscv/Kconfig b/arch/riscv/Kconfig index a31e1a41913a..5c07ca4d5cd6 100644 --- a/arch/riscv/Kconfig +++ b/arch/riscv/Kconfig @@ -32,7 +32,6 @@ config RISCV select HAVE_ARCH_AUDITSYSCALL select HAVE_ARCH_SECCOMP_FILTER select HAVE_ASM_MODVERSIONS - select HAVE_MEMBLOCK_NODE_MAP select HAVE_DMA_CONTIGUOUS if MMU select HAVE_FUTEX_CMPXCHG if FUTEX select HAVE_PERF_EVENTS diff --git a/arch/s390/Kconfig b/arch/s390/Kconfig index 2167bce993ff..d6dc6933adc2 100644 --- a/arch/s390/Kconfig +++ b/arch/s390/Kconfig @@ -162,7 +162,6 @@ config S390 select HAVE_LIVEPATCH select HAVE_PERF_REGS select HAVE_PERF_USER_STACK_DUMP - select 
HAVE_MEMBLOCK_NODE_MAP select HAVE_MEMBLOCK_PHYS_MAP select MMU_GATHER_NO_GATHER select HAVE_MOD_ARCH_SPECIFIC diff --git a/arch/sh/Kconfig b/arch/sh/Kconfig index 97656d20b9ea..0424b8f2f8d3 100644 --- a/arch/sh/Kconfig +++ b/arch/sh/Kconfig @@ -9,7 +9,6 @@ config SUPERH select CLKDEV_LOOKUP select DMA_DECLARE_COHERENT select HAVE_IDE if HAS_IOPORT_MAP - select HAVE_MEMBLOCK_NODE_MAP select HAVE_OPROFILE select HAVE_ARCH_TRACEHOOK select HAVE_PERF_EVENTS diff --git a/arch/sparc/Kconfig b/arch/sparc/Kconfig index da515fdad83d..795206b7b552 100644 --- a/arch/sparc/Kconfig +++ b/arch/sparc/Kconfig @@ -65,7 +65,6 @@ config SPARC64 select HAVE_KRETPROBES select HAVE_KPROBES select MMU_GATHER_RCU_TABLE_FREE if SMP - select HAVE_MEMBLOCK_NODE_MAP select HAVE_ARCH_TRANSPARENT_HUGEPAGE select HAVE_DYNAMIC_FTRACE select HAVE_FTRACE_MCOUNT_RECORD diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index e5d38cd11df0..c669328abf58 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -192,7 +192,6 @@ config X86 select HAVE_KRETPROBES select HAVE_KVM select HAVE_LIVEPATCH if X86_64 - select HAVE_MEMBLOCK_NODE_MAP select HAVE_MIXED_BREAKPOINTS_REGS select HAVE_MOD_ARCH_SPECIFIC select HAVE_MOVE_PMD diff --git a/include/linux/memblock.h b/include/linux/memblock.h index 6bc37a731d27..45abfc54da37 100644 --- a/include/linux/memblock.h +++ b/include/linux/memblock.h @@ -50,7 +50,7 @@ struct memblock_region { phys_addr_t base; phys_addr_t size; enum memblock_flags flags; -#ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP +#ifdef CONFIG_NEED_MULTIPLE_NODES int nid; #endif }; @@ -215,7 +215,6 @@ static inline bool memblock_is_nomap(struct memblock_region *m) return m->flags & MEMBLOCK_NOMAP; } -#ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP int memblock_search_pfn_nid(unsigned long pfn, unsigned long *start_pfn, unsigned long *end_pfn); void __next_mem_pfn_range(int *idx, int nid, unsigned long *out_start_pfn, @@ -234,7 +233,6 @@ void __next_mem_pfn_range(int *idx, int nid, unsigned long *out_start_pfn, #define for_each_mem_pfn_range(i, nid, p_start, p_end, p_nid) \ for (i = -1, __next_mem_pfn_range(&i, nid, p_start, p_end, p_nid); \ i >= 0; __next_mem_pfn_range(&i, nid, p_start, p_end, p_nid)) -#endif /* CONFIG_HAVE_MEMBLOCK_NODE_MAP */ #ifdef CONFIG_DEFERRED_STRUCT_PAGE_INIT void __next_mem_pfn_range_in_zone(u64 *idx, struct zone *zone, @@ -310,10 +308,10 @@ void __next_mem_pfn_range_in_zone(u64 *idx, struct zone *zone, for_each_mem_range_rev(i, &memblock.memory, &memblock.reserved, \ nid, flags, p_start, p_end, p_nid) -#ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP int memblock_set_node(phys_addr_t base, phys_addr_t size, struct memblock_type *type, int nid); +#ifdef CONFIG_NEED_MULTIPLE_NODES static inline void memblock_set_region_node(struct memblock_region *r, int nid) { r->nid = nid; @@ -332,7 +330,7 @@ static inline int memblock_get_region_node(const struct memblock_region *r) { return 0; } -#endif /* CONFIG_HAVE_MEMBLOCK_NODE_MAP */ +#endif /* CONFIG_NEED_MULTIPLE_NODES */ /* Flags for memblock allocation APIs */ #define MEMBLOCK_ALLOC_ANYWHERE (~(phys_addr_t)0) diff --git a/include/linux/mm.h b/include/linux/mm.h index 4288e6993dc8..5f15d8723167 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -2401,9 +2401,8 @@ static inline unsigned long get_num_physpages(void) return phys_pages; } -#ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP /* - * With CONFIG_HAVE_MEMBLOCK_NODE_MAP set, an architecture may initialise its + * Using memblock node mappings, an architecture may initialise its * zones, allocate the backing mem_map and account for 
memory holes in a more * architecture independent manner. This is a substitute for creating the * zone_sizes[] and zholes_size[] arrays and passing them to @@ -2424,9 +2423,6 @@ static inline unsigned long get_num_physpages(void) * registered physical page range. Similarly * sparse_memory_present_with_active_regions() calls memory_present() for * each range when SPARSEMEM is enabled. - * - * See mm/page_alloc.c for more information on each function exposed by - * CONFIG_HAVE_MEMBLOCK_NODE_MAP. */ extern void free_area_init_nodes(unsigned long *max_zone_pfn); unsigned long node_map_pfn_alignment(void); @@ -2441,13 +2437,9 @@ extern void free_bootmem_with_active_regions(int nid, unsigned long max_low_pfn); extern void sparse_memory_present_with_active_regions(int nid); -#endif /* CONFIG_HAVE_MEMBLOCK_NODE_MAP */ - -#if !defined(CONFIG_HAVE_MEMBLOCK_NODE_MAP) && \ - !defined(CONFIG_HAVE_ARCH_EARLY_PFN_TO_NID) +#ifndef CONFIG_NEED_MULTIPLE_NODES static inline int early_pfn_to_nid(unsigned long pfn) { - BUILD_BUG_ON(IS_ENABLED(CONFIG_NUMA)); return 0; } #else diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index c3a77eb85b42..0c575c3d7feb 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -876,7 +876,7 @@ extern int movable_zone; #ifdef CONFIG_HIGHMEM static inline int zone_movable_is_highmem(void) { -#ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP +#ifdef CONFIG_NEED_MULTIPLE_NODES return movable_zone == ZONE_HIGHMEM; #else return (ZONE_MOVABLE - 1) == ZONE_HIGHMEM; diff --git a/mm/Kconfig b/mm/Kconfig index 5c0362bd8d56..3af64646f343 100644 --- a/mm/Kconfig +++ b/mm/Kconfig @@ -126,9 +126,6 @@ config SPARSEMEM_VMEMMAP pfn_to_page and page_to_pfn operations. This is the most efficient option when sufficient kernel resources are available. -config HAVE_MEMBLOCK_NODE_MAP - bool - config HAVE_MEMBLOCK_PHYS_MAP bool diff --git a/mm/memblock.c b/mm/memblock.c index 43e2fd3006c1..743659d88fc4 100644 --- a/mm/memblock.c +++ b/mm/memblock.c @@ -620,7 +620,7 @@ repeat: * area, insert that portion. */ if (rbase > base) { -#ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP +#ifdef CONFIG_NEED_MULTIPLE_NODES WARN_ON(nid != memblock_get_region_node(rgn)); #endif WARN_ON(flags != rgn->flags); @@ -1197,7 +1197,6 @@ void __init_memblock __next_mem_range_rev(u64 *idx, int nid, *idx = ULLONG_MAX; } -#ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP /* * Common iterator interface used to define for_each_mem_pfn_range(). 
*/ @@ -1247,6 +1246,7 @@ void __init_memblock __next_mem_pfn_range(int *idx, int nid, int __init_memblock memblock_set_node(phys_addr_t base, phys_addr_t size, struct memblock_type *type, int nid) { +#ifdef CONFIG_NEED_MULTIPLE_NODES int start_rgn, end_rgn; int i, ret; @@ -1258,9 +1258,10 @@ int __init_memblock memblock_set_node(phys_addr_t base, phys_addr_t size, memblock_set_region_node(&type->regions[i], nid); memblock_merge_regions(type); +#endif return 0; } -#endif /* CONFIG_HAVE_MEMBLOCK_NODE_MAP */ + #ifdef CONFIG_DEFERRED_STRUCT_PAGE_INIT /** * __next_mem_pfn_range_in_zone - iterator for for_each_*_range_in_zone() @@ -1799,7 +1800,6 @@ bool __init_memblock memblock_is_map_memory(phys_addr_t addr) return !memblock_is_nomap(&memblock.memory.regions[i]); } -#ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP int __init_memblock memblock_search_pfn_nid(unsigned long pfn, unsigned long *start_pfn, unsigned long *end_pfn) { @@ -1814,7 +1814,6 @@ int __init_memblock memblock_search_pfn_nid(unsigned long pfn, return memblock_get_region_node(&type->regions[mid]); } -#endif /** * memblock_is_region_memory - check if a region is a subset of memory @@ -1905,7 +1904,7 @@ static void __init_memblock memblock_dump(struct memblock_type *type) size = rgn->size; end = base + size - 1; flags = rgn->flags; -#ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP +#ifdef CONFIG_NEED_MULTIPLE_NODES if (memblock_get_region_node(rgn) != MAX_NUMNODES) snprintf(nid_buf, sizeof(nid_buf), " on node %d", memblock_get_region_node(rgn)); diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index fc0aad0bc1f5..e67dc501576a 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c @@ -1372,11 +1372,7 @@ check_pages_isolated_cb(unsigned long start_pfn, unsigned long nr_pages, static int __init cmdline_parse_movable_node(char *p) { -#ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP movable_node_enabled = true; -#else - pr_warn("movable_node parameter depends on CONFIG_HAVE_MEMBLOCK_NODE_MAP to work properly\n"); -#endif return 0; } early_param("movable_node", cmdline_parse_movable_node); diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 8741ae0828e1..430e35384b78 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -335,7 +335,6 @@ static unsigned long nr_kernel_pages __initdata; static unsigned long nr_all_pages __initdata; static unsigned long dma_reserve __initdata; -#ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP static unsigned long arch_zone_lowest_possible_pfn[MAX_NR_ZONES] __initdata; static unsigned long arch_zone_highest_possible_pfn[MAX_NR_ZONES] __initdata; static unsigned long required_kernelcore __initdata; @@ -348,7 +347,6 @@ static bool mirrored_kernelcore __meminitdata; /* movable_zone is the "real" zone pages in ZONE_MOVABLE are taken from */ int movable_zone; EXPORT_SYMBOL(movable_zone); -#endif /* CONFIG_HAVE_MEMBLOCK_NODE_MAP */ #if MAX_NUMNODES > 1 unsigned int nr_node_ids __read_mostly = MAX_NUMNODES; @@ -1499,8 +1497,7 @@ void __free_pages_core(struct page *page, unsigned int order) __free_pages(page, order); } -#if defined(CONFIG_HAVE_ARCH_EARLY_PFN_TO_NID) || \ - defined(CONFIG_HAVE_MEMBLOCK_NODE_MAP) +#ifdef CONFIG_NEED_MULTIPLE_NODES static struct mminit_pfnnid_cache early_pfnnid_cache __meminitdata; @@ -1542,7 +1539,7 @@ int __meminit early_pfn_to_nid(unsigned long pfn) return nid; } -#endif +#endif /* CONFIG_NEED_MULTIPLE_NODES */ #ifdef CONFIG_NODES_SPAN_OTHER_NODES /* Only safe to use early in boot when initialisation is single-threaded */ @@ -5936,7 +5933,6 @@ void __ref build_all_zonelists(pg_data_t *pgdat) static bool __meminit 
overlap_memmap_init(unsigned long zone, unsigned long *pfn) { -#ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP static struct memblock_region *r; if (mirrored_kernelcore && zone == ZONE_MOVABLE) { @@ -5952,7 +5948,6 @@ overlap_memmap_init(unsigned long zone, unsigned long *pfn) return true; } } -#endif return false; } @@ -6585,8 +6580,7 @@ static unsigned long __init zone_absent_pages_in_node(int nid, return nr_absent; } -#else /* CONFIG_HAVE_MEMBLOCK_NODE_MAP */ -static inline unsigned long __init zone_spanned_pages_in_node(int nid, +static inline unsigned long __init compat_zone_spanned_pages_in_node(int nid, unsigned long zone_type, unsigned long node_start_pfn, unsigned long node_end_pfn, @@ -6605,7 +6599,7 @@ static inline unsigned long __init zone_spanned_pages_in_node(int nid, return zones_size[zone_type]; } -static inline unsigned long __init zone_absent_pages_in_node(int nid, +static inline unsigned long __init compat_zone_absent_pages_in_node(int nid, unsigned long zone_type, unsigned long node_start_pfn, unsigned long node_end_pfn, @@ -6617,13 +6611,12 @@ static inline unsigned long __init zone_absent_pages_in_node(int nid, return zholes_size[zone_type]; } -#endif /* CONFIG_HAVE_MEMBLOCK_NODE_MAP */ - static void __init calculate_node_totalpages(struct pglist_data *pgdat, unsigned long node_start_pfn, unsigned long node_end_pfn, unsigned long *zones_size, - unsigned long *zholes_size) + unsigned long *zholes_size, + bool compat) { unsigned long realtotalpages = 0, totalpages = 0; enum zone_type i; @@ -6631,17 +6624,38 @@ static void __init calculate_node_totalpages(struct pglist_data *pgdat, for (i = 0; i < MAX_NR_ZONES; i++) { struct zone *zone = pgdat->node_zones + i; unsigned long zone_start_pfn, zone_end_pfn; + unsigned long spanned, absent; unsigned long size, real_size; - size = zone_spanned_pages_in_node(pgdat->node_id, i, - node_start_pfn, - node_end_pfn, - &zone_start_pfn, - &zone_end_pfn, - zones_size); - real_size = size - zone_absent_pages_in_node(pgdat->node_id, i, - node_start_pfn, node_end_pfn, - zholes_size); + if (compat) { + spanned = compat_zone_spanned_pages_in_node( + pgdat->node_id, i, + node_start_pfn, + node_end_pfn, + &zone_start_pfn, + &zone_end_pfn, + zones_size); + absent = compat_zone_absent_pages_in_node( + pgdat->node_id, i, + node_start_pfn, + node_end_pfn, + zholes_size); + } else { + spanned = zone_spanned_pages_in_node(pgdat->node_id, i, + node_start_pfn, + node_end_pfn, + &zone_start_pfn, + &zone_end_pfn, + zones_size); + absent = zone_absent_pages_in_node(pgdat->node_id, i, + node_start_pfn, + node_end_pfn, + zholes_size); + } + + size = spanned; + real_size = size - absent; + if (size) zone->zone_start_pfn = zone_start_pfn; else @@ -6941,10 +6955,8 @@ static void __ref alloc_node_mem_map(struct pglist_data *pgdat) */ if (pgdat == NODE_DATA(0)) { mem_map = NODE_DATA(0)->node_mem_map; -#if defined(CONFIG_HAVE_MEMBLOCK_NODE_MAP) || defined(CONFIG_FLATMEM) if (page_to_pfn(mem_map) != pgdat->node_start_pfn) mem_map -= offset; -#endif /* CONFIG_HAVE_MEMBLOCK_NODE_MAP */ } #endif } @@ -6961,9 +6973,10 @@ static inline void pgdat_set_deferred_range(pg_data_t *pgdat) static inline void pgdat_set_deferred_range(pg_data_t *pgdat) {} #endif -void __init free_area_init_node(int nid, unsigned long *zones_size, - unsigned long node_start_pfn, - unsigned long *zholes_size) +static void __init __free_area_init_node(int nid, unsigned long *zones_size, + unsigned long node_start_pfn, + unsigned long *zholes_size, + bool compat) { pg_data_t *pgdat = NODE_DATA(nid); 
unsigned long start_pfn = 0; @@ -6975,16 +6988,16 @@ void __init free_area_init_node(int nid, unsigned long *zones_size, pgdat->node_id = nid; pgdat->node_start_pfn = node_start_pfn; pgdat->per_cpu_nodestats = NULL; -#ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP - get_pfn_range_for_nid(nid, &start_pfn, &end_pfn); - pr_info("Initmem setup node %d [mem %#018Lx-%#018Lx]\n", nid, - (u64)start_pfn << PAGE_SHIFT, - end_pfn ? ((u64)end_pfn << PAGE_SHIFT) - 1 : 0); -#else - start_pfn = node_start_pfn; -#endif + if (!compat) { + get_pfn_range_for_nid(nid, &start_pfn, &end_pfn); + pr_info("Initmem setup node %d [mem %#018Lx-%#018Lx]\n", nid, + (u64)start_pfn << PAGE_SHIFT, + end_pfn ? ((u64)end_pfn << PAGE_SHIFT) - 1 : 0); + } else { + start_pfn = node_start_pfn; + } calculate_node_totalpages(pgdat, start_pfn, end_pfn, - zones_size, zholes_size); + zones_size, zholes_size, compat); alloc_node_mem_map(pgdat); pgdat_set_deferred_range(pgdat); @@ -6992,6 +7005,14 @@ void __init free_area_init_node(int nid, unsigned long *zones_size, free_area_init_core(pgdat); } +void __init free_area_init_node(int nid, unsigned long *zones_size, + unsigned long node_start_pfn, + unsigned long *zholes_size) +{ + __free_area_init_node(nid, zones_size, node_start_pfn, zholes_size, + true); +} + #if !defined(CONFIG_FLAT_NODE_MEM_MAP) /* * Initialize all valid struct pages in the range [spfn, epfn) and mark them @@ -7075,8 +7096,6 @@ static inline void __init init_unavailable_mem(void) } #endif /* !CONFIG_FLAT_NODE_MEM_MAP */ -#ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP - #if MAX_NUMNODES > 1 /* * Figure out the number of possible node ids. @@ -7505,8 +7524,8 @@ void __init free_area_init_nodes(unsigned long *max_zone_pfn) init_unavailable_mem(); for_each_online_node(nid) { pg_data_t *pgdat = NODE_DATA(nid); - free_area_init_node(nid, NULL, - find_min_pfn_for_node(nid), NULL); + __free_area_init_node(nid, NULL, + find_min_pfn_for_node(nid), NULL, false); /* Any memory on that node */ if (pgdat->node_present_pages) @@ -7571,8 +7590,6 @@ static int __init cmdline_parse_movablecore(char *p) early_param("kernelcore", cmdline_parse_kernelcore); early_param("movablecore", cmdline_parse_movablecore); -#endif /* CONFIG_HAVE_MEMBLOCK_NODE_MAP */ - void adjust_managed_page_count(struct page *page, long count) { atomic_long_add(count, &page_zone(page)->managed_pages); -- cgit v1.2.3 From fa3354e4ea39e97af906c05551a36396541d70b4 Mon Sep 17 00:00:00 2001 From: Mike Rapoport Date: Wed, 3 Jun 2020 15:57:06 -0700 Subject: mm: free_area_init: use maximal zone PFNs rather than zone sizes Currently, architectures that use free_area_init() to initialize memory map and node and zone structures need to calculate zone and hole sizes. We can use free_area_init_nodes() instead and let it detect the zone boundaries while the architectures will only have to supply the possible limits for the zones. Signed-off-by: Mike Rapoport Signed-off-by: Andrew Morton Tested-by: Hoan Tran [arm64] Reviewed-by: Baoquan He Cc: Brian Cain Cc: Catalin Marinas Cc: "David S. Miller" Cc: Geert Uytterhoeven Cc: Greentime Hu Cc: Greg Ungerer Cc: Guan Xuetao Cc: Guo Ren Cc: Heiko Carstens Cc: Helge Deller Cc: "James E.J. 
Bottomley" Cc: Jonathan Corbet Cc: Ley Foon Tan Cc: Mark Salter Cc: Matt Turner Cc: Max Filippov Cc: Michael Ellerman Cc: Michal Hocko Cc: Michal Simek Cc: Nick Hu Cc: Paul Walmsley Cc: Richard Weinberger Cc: Rich Felker Cc: Russell King Cc: Stafford Horne Cc: Thomas Bogendoerfer Cc: Tony Luck Cc: Vineet Gupta Cc: Yoshinori Sato Link: http://lkml.kernel.org/r/20200412194859.12663-5-rppt@kernel.org Signed-off-by: Linus Torvalds --- arch/alpha/mm/init.c | 16 ++++++---------- arch/c6x/mm/init.c | 8 +++----- arch/h8300/mm/init.c | 6 +++--- arch/hexagon/mm/init.c | 6 +++--- arch/m68k/mm/init.c | 6 +++--- arch/m68k/mm/mcfmmu.c | 9 +++------ arch/nds32/mm/init.c | 11 ++++------- arch/nios2/mm/init.c | 8 +++----- arch/openrisc/mm/init.c | 9 +++------ arch/um/kernel/mem.c | 12 ++++-------- include/linux/mm.h | 2 +- mm/page_alloc.c | 5 ++--- 12 files changed, 38 insertions(+), 60 deletions(-) (limited to 'include/linux') diff --git a/arch/alpha/mm/init.c b/arch/alpha/mm/init.c index 12e218d3792a..667cd21393b5 100644 --- a/arch/alpha/mm/init.c +++ b/arch/alpha/mm/init.c @@ -243,21 +243,17 @@ callback_init(void * kernel_end) */ void __init paging_init(void) { - unsigned long zones_size[MAX_NR_ZONES] = {0, }; - unsigned long dma_pfn, high_pfn; + unsigned long max_zone_pfn[MAX_NR_ZONES] = {0, }; + unsigned long dma_pfn; dma_pfn = virt_to_phys((char *)MAX_DMA_ADDRESS) >> PAGE_SHIFT; - high_pfn = max_pfn = max_low_pfn; + max_pfn = max_low_pfn; - if (dma_pfn >= high_pfn) - zones_size[ZONE_DMA] = high_pfn; - else { - zones_size[ZONE_DMA] = dma_pfn; - zones_size[ZONE_NORMAL] = high_pfn - dma_pfn; - } + max_zone_pfn[ZONE_DMA] = dma_pfn; + max_zone_pfn[ZONE_NORMAL] = max_pfn; /* Initialize mem_map[]. */ - free_area_init(zones_size); + free_area_init(max_zone_pfn); /* Initialize the kernel's ZERO_PGE. 
*/ memset((void *)ZERO_PGE, 0, PAGE_SIZE); diff --git a/arch/c6x/mm/init.c b/arch/c6x/mm/init.c index 9b374393a8f4..a97e51a3e26d 100644 --- a/arch/c6x/mm/init.c +++ b/arch/c6x/mm/init.c @@ -33,7 +33,7 @@ EXPORT_SYMBOL(empty_zero_page); void __init paging_init(void) { struct pglist_data *pgdat = NODE_DATA(0); - unsigned long zones_size[MAX_NR_ZONES] = {0, }; + unsigned long max_zone_pfn[MAX_NR_ZONES] = {0, }; empty_zero_page = (unsigned long) memblock_alloc(PAGE_SIZE, PAGE_SIZE); @@ -49,11 +49,9 @@ void __init paging_init(void) /* * Define zones */ - zones_size[ZONE_NORMAL] = (memory_end - PAGE_OFFSET) >> PAGE_SHIFT; - pgdat->node_zones[ZONE_NORMAL].zone_start_pfn = - __pa(PAGE_OFFSET) >> PAGE_SHIFT; + max_zone_pfn[ZONE_NORMAL] = memory_end >> PAGE_SHIFT; - free_area_init(zones_size); + free_area_init(max_zone_pfn); } void __init mem_init(void) diff --git a/arch/h8300/mm/init.c b/arch/h8300/mm/init.c index 1eab16b1a0bc..27a0020e3771 100644 --- a/arch/h8300/mm/init.c +++ b/arch/h8300/mm/init.c @@ -83,10 +83,10 @@ void __init paging_init(void) start_mem, end_mem); { - unsigned long zones_size[MAX_NR_ZONES] = {0, }; + unsigned long max_zone_pfn[MAX_NR_ZONES] = {0, }; - zones_size[ZONE_NORMAL] = (end_mem - PAGE_OFFSET) >> PAGE_SHIFT; - free_area_init(zones_size); + max_zone_pfn[ZONE_NORMAL] = end_mem >> PAGE_SHIFT; + free_area_init(max_zone_pfn); } } diff --git a/arch/hexagon/mm/init.c b/arch/hexagon/mm/init.c index c961773a6fff..f2e6c868e477 100644 --- a/arch/hexagon/mm/init.c +++ b/arch/hexagon/mm/init.c @@ -91,7 +91,7 @@ void sync_icache_dcache(pte_t pte) */ void __init paging_init(void) { - unsigned long zones_sizes[MAX_NR_ZONES] = {0, }; + unsigned long max_zone_pfn[MAX_NR_ZONES] = {0, }; /* * This is not particularly well documented anywhere, but @@ -101,9 +101,9 @@ void __init paging_init(void) * adjust accordingly. */ - zones_sizes[ZONE_NORMAL] = max_low_pfn; + max_zone_pfn[ZONE_NORMAL] = max_low_pfn; - free_area_init(zones_sizes); /* sets up the zonelists and mem_map */ + free_area_init(max_zone_pfn); /* sets up the zonelists and mem_map */ /* * Start of high memory area. Will probably need something more diff --git a/arch/m68k/mm/init.c b/arch/m68k/mm/init.c index b88d510d4fe3..6d3147662ff2 100644 --- a/arch/m68k/mm/init.c +++ b/arch/m68k/mm/init.c @@ -84,7 +84,7 @@ void __init paging_init(void) * page_alloc get different views of the world. 
*/ unsigned long end_mem = memory_end & PAGE_MASK; - unsigned long zones_size[MAX_NR_ZONES] = { 0, }; + unsigned long max_zone_pfn[MAX_NR_ZONES] = { 0, }; high_memory = (void *) end_mem; @@ -98,8 +98,8 @@ void __init paging_init(void) */ set_fs (USER_DS); - zones_size[ZONE_DMA] = (end_mem - PAGE_OFFSET) >> PAGE_SHIFT; - free_area_init(zones_size); + max_zone_pfn[ZONE_DMA] = end_mem >> PAGE_SHIFT; + free_area_init(max_zone_pfn); } #endif /* CONFIG_MMU */ diff --git a/arch/m68k/mm/mcfmmu.c b/arch/m68k/mm/mcfmmu.c index 0ea375607767..80064e6d064f 100644 --- a/arch/m68k/mm/mcfmmu.c +++ b/arch/m68k/mm/mcfmmu.c @@ -39,7 +39,7 @@ void __init paging_init(void) pte_t *pg_table; unsigned long address, size; unsigned long next_pgtable, bootmem_end; - unsigned long zones_size[MAX_NR_ZONES]; + unsigned long max_zone_pfn[MAX_NR_ZONES] = { 0 }; enum zone_type zone; int i; @@ -80,11 +80,8 @@ void __init paging_init(void) } current->mm = NULL; - - for (zone = 0; zone < MAX_NR_ZONES; zone++) - zones_size[zone] = 0x0; - zones_size[ZONE_DMA] = num_pages; - free_area_init(zones_size); + max_zone_pfn[ZONE_DMA] = PFN_DOWN(_ramend); + free_area_init(max_zone_pfn); } int cf_tlb_miss(struct pt_regs *regs, int write, int dtlb, int extension_word) diff --git a/arch/nds32/mm/init.c b/arch/nds32/mm/init.c index 0be3833f6814..91147cca4b64 100644 --- a/arch/nds32/mm/init.c +++ b/arch/nds32/mm/init.c @@ -31,16 +31,13 @@ EXPORT_SYMBOL(empty_zero_page); static void __init zone_sizes_init(void) { - unsigned long zones_size[MAX_NR_ZONES]; + unsigned long max_zone_pfn[MAX_NR_ZONES] = { 0 }; - /* Clear the zone sizes */ - memset(zones_size, 0, sizeof(zones_size)); - - zones_size[ZONE_NORMAL] = max_low_pfn; + max_zone_pfn[ZONE_NORMAL] = max_low_pfn; #ifdef CONFIG_HIGHMEM - zones_size[ZONE_HIGHMEM] = max_pfn; + max_zone_pfn[ZONE_HIGHMEM] = max_pfn; #endif - free_area_init(zones_size); + free_area_init(max_zone_pfn); } diff --git a/arch/nios2/mm/init.c b/arch/nios2/mm/init.c index 2c609c2516b2..9afca77d10b1 100644 --- a/arch/nios2/mm/init.c +++ b/arch/nios2/mm/init.c @@ -46,17 +46,15 @@ pgd_t *pgd_current; */ void __init paging_init(void) { - unsigned long zones_size[MAX_NR_ZONES]; - - memset(zones_size, 0, sizeof(zones_size)); + unsigned long max_zone_pfn[MAX_NR_ZONES] = { 0 }; pagetable_init(); pgd_current = swapper_pg_dir; - zones_size[ZONE_NORMAL] = max_mapnr; + max_zone_pfn[ZONE_NORMAL] = max_mapnr; /* pass the memory from the bootmem allocator to the main allocator */ - free_area_init(zones_size); + free_area_init(max_zone_pfn); flush_dcache_range((unsigned long)empty_zero_page, (unsigned long)empty_zero_page + PAGE_SIZE); diff --git a/arch/openrisc/mm/init.c b/arch/openrisc/mm/init.c index 1f87b524db78..f94fe6d3f499 100644 --- a/arch/openrisc/mm/init.c +++ b/arch/openrisc/mm/init.c @@ -45,17 +45,14 @@ DEFINE_PER_CPU(struct mmu_gather, mmu_gathers); static void __init zone_sizes_init(void) { - unsigned long zones_size[MAX_NR_ZONES]; - - /* Clear the zone sizes */ - memset(zones_size, 0, sizeof(zones_size)); + unsigned long max_zone_pfn[MAX_NR_ZONES] = { 0 }; /* * We use only ZONE_NORMAL */ - zones_size[ZONE_NORMAL] = max_low_pfn; + max_zone_pfn[ZONE_NORMAL] = max_low_pfn; - free_area_init(zones_size); + free_area_init(max_zone_pfn); } extern const char _s_kernel_ro[], _e_kernel_ro[]; diff --git a/arch/um/kernel/mem.c b/arch/um/kernel/mem.c index 30885d0b94ac..401b22f14743 100644 --- a/arch/um/kernel/mem.c +++ b/arch/um/kernel/mem.c @@ -158,8 +158,8 @@ static void __init fixaddr_user_init( void) void __init paging_init(void) 
{ - unsigned long zones_size[MAX_NR_ZONES], vaddr; - int i; + unsigned long max_zone_pfn[MAX_NR_ZONES] = { 0 }; + unsigned long vaddr; empty_zero_page = (unsigned long *) memblock_alloc_low(PAGE_SIZE, PAGE_SIZE); @@ -167,12 +167,8 @@ void __init paging_init(void) panic("%s: Failed to allocate %lu bytes align=%lx\n", __func__, PAGE_SIZE, PAGE_SIZE); - for (i = 0; i < ARRAY_SIZE(zones_size); i++) - zones_size[i] = 0; - - zones_size[ZONE_NORMAL] = (end_iomem >> PAGE_SHIFT) - - (uml_physmem >> PAGE_SHIFT); - free_area_init(zones_size); + max_zone_pfn[ZONE_NORMAL] = end_iomem >> PAGE_SHIFT; + free_area_init(max_zone_pfn); /* * Fixed mappings, only the page table structure has to be diff --git a/include/linux/mm.h b/include/linux/mm.h index 5f15d8723167..788704977de0 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -2329,7 +2329,7 @@ static inline spinlock_t *pud_lock(struct mm_struct *mm, pud_t *pud) } extern void __init pagecache_init(void); -extern void free_area_init(unsigned long * zones_size); +extern void free_area_init(unsigned long * max_zone_pfn); extern void __init free_area_init_node(int nid, unsigned long * zones_size, unsigned long zone_start_pfn, unsigned long *zholes_size); extern void free_initmem(void); diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 430e35384b78..cf420e947d9c 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -7712,11 +7712,10 @@ void __init set_dma_reserve(unsigned long new_dma_reserve) dma_reserve = new_dma_reserve; } -void __init free_area_init(unsigned long *zones_size) +void __init free_area_init(unsigned long *max_zone_pfn) { init_unavailable_mem(); - free_area_init_node(0, zones_size, - __pa(PAGE_OFFSET) >> PAGE_SHIFT, NULL); + free_area_init_nodes(max_zone_pfn); } static int page_alloc_cpu_dead(unsigned int cpu) -- cgit v1.2.3 From 9691a071aa26a21fc8dac804a2b98d3c24f76f9a Mon Sep 17 00:00:00 2001 From: Mike Rapoport Date: Wed, 3 Jun 2020 15:57:10 -0700 Subject: mm: use free_area_init() instead of free_area_init_nodes() free_area_init() has effectively become a wrapper for free_area_init_nodes(), and there is no point in keeping it. Still, the free_area_init() name is shorter and more general, as it does not imply the necessity to initialize multiple nodes. Rename free_area_init_nodes() to free_area_init(), update the callers, and drop the old version of free_area_init(). Signed-off-by: Mike Rapoport Signed-off-by: Andrew Morton Tested-by: Hoan Tran [arm64] Reviewed-by: Baoquan He Acked-by: Catalin Marinas Cc: Brian Cain Cc: "David S. Miller" Cc: Geert Uytterhoeven Cc: Greentime Hu Cc: Greg Ungerer Cc: Guan Xuetao Cc: Guo Ren Cc: Heiko Carstens Cc: Helge Deller Cc: "James E.J.
Bottomley" Cc: Jonathan Corbet Cc: Ley Foon Tan Cc: Mark Salter Cc: Matt Turner Cc: Max Filippov Cc: Michael Ellerman Cc: Michal Hocko Cc: Michal Simek Cc: Nick Hu Cc: Paul Walmsley Cc: Richard Weinberger Cc: Rich Felker Cc: Russell King Cc: Stafford Horne Cc: Thomas Bogendoerfer Cc: Tony Luck Cc: Vineet Gupta Cc: Yoshinori Sato Link: http://lkml.kernel.org/r/20200412194859.12663-6-rppt@kernel.org Signed-off-by: Linus Torvalds --- arch/arm64/mm/init.c | 2 +- arch/ia64/mm/contig.c | 2 +- arch/ia64/mm/discontig.c | 2 +- arch/microblaze/mm/init.c | 2 +- arch/mips/loongson64/numa.c | 2 +- arch/mips/mm/init.c | 2 +- arch/mips/sgi-ip27/ip27-memory.c | 2 +- arch/powerpc/mm/mem.c | 2 +- arch/riscv/mm/init.c | 2 +- arch/s390/mm/init.c | 2 +- arch/sh/mm/init.c | 2 +- arch/sparc/mm/init_64.c | 2 +- arch/x86/mm/init.c | 2 +- include/linux/mm.h | 7 +++---- mm/page_alloc.c | 10 ++-------- 15 files changed, 18 insertions(+), 25 deletions(-) (limited to 'include/linux') diff --git a/arch/arm64/mm/init.c b/arch/arm64/mm/init.c index d2df416b840e..5dae97f2f628 100644 --- a/arch/arm64/mm/init.c +++ b/arch/arm64/mm/init.c @@ -206,7 +206,7 @@ static void __init zone_sizes_init(unsigned long min, unsigned long max) #endif max_zone_pfns[ZONE_NORMAL] = max; - free_area_init_nodes(max_zone_pfns); + free_area_init(max_zone_pfns); } #else diff --git a/arch/ia64/mm/contig.c b/arch/ia64/mm/contig.c index 5b00dc3898e1..8786fa5c7612 100644 --- a/arch/ia64/mm/contig.c +++ b/arch/ia64/mm/contig.c @@ -210,6 +210,6 @@ paging_init (void) printk("Virtual mem_map starts at 0x%p\n", mem_map); } #endif /* !CONFIG_VIRTUAL_MEM_MAP */ - free_area_init_nodes(max_zone_pfns); + free_area_init(max_zone_pfns); zero_page_memmap_ptr = virt_to_page(ia64_imva(empty_zero_page)); } diff --git a/arch/ia64/mm/discontig.c b/arch/ia64/mm/discontig.c index 4f33f6e7e206..dd8284bcbf16 100644 --- a/arch/ia64/mm/discontig.c +++ b/arch/ia64/mm/discontig.c @@ -627,7 +627,7 @@ void __init paging_init(void) max_zone_pfns[ZONE_DMA32] = max_dma; #endif max_zone_pfns[ZONE_NORMAL] = max_pfn; - free_area_init_nodes(max_zone_pfns); + free_area_init(max_zone_pfns); zero_page_memmap_ptr = virt_to_page(ia64_imva(empty_zero_page)); } diff --git a/arch/microblaze/mm/init.c b/arch/microblaze/mm/init.c index 1ffbfa96b9b8..dcaa53d11339 100644 --- a/arch/microblaze/mm/init.c +++ b/arch/microblaze/mm/init.c @@ -112,7 +112,7 @@ static void __init paging_init(void) #endif /* We don't have holes in memory map */ - free_area_init_nodes(zones_size); + free_area_init(zones_size); } void __init setup_memory(void) diff --git a/arch/mips/loongson64/numa.c b/arch/mips/loongson64/numa.c index 1ae072df4831..901f5be5ee76 100644 --- a/arch/mips/loongson64/numa.c +++ b/arch/mips/loongson64/numa.c @@ -247,7 +247,7 @@ void __init paging_init(void) zones_size[ZONE_DMA32] = MAX_DMA32_PFN; #endif zones_size[ZONE_NORMAL] = max_low_pfn; - free_area_init_nodes(zones_size); + free_area_init(zones_size); } void __init mem_init(void) diff --git a/arch/mips/mm/init.c b/arch/mips/mm/init.c index 79684000de0e..19719e8b41a5 100644 --- a/arch/mips/mm/init.c +++ b/arch/mips/mm/init.c @@ -418,7 +418,7 @@ void __init paging_init(void) } #endif - free_area_init_nodes(max_zone_pfns); + free_area_init(max_zone_pfns); } #ifdef CONFIG_64BIT diff --git a/arch/mips/sgi-ip27/ip27-memory.c b/arch/mips/sgi-ip27/ip27-memory.c index a45691e6ab90..1213215ea965 100644 --- a/arch/mips/sgi-ip27/ip27-memory.c +++ b/arch/mips/sgi-ip27/ip27-memory.c @@ -419,7 +419,7 @@ void __init paging_init(void) pagetable_init(); 
zones_size[ZONE_NORMAL] = max_low_pfn; - free_area_init_nodes(zones_size); + free_area_init(zones_size); } void __init mem_init(void) diff --git a/arch/powerpc/mm/mem.c b/arch/powerpc/mm/mem.c index 041ed7cfd341..0fcea21f26b4 100644 --- a/arch/powerpc/mm/mem.c +++ b/arch/powerpc/mm/mem.c @@ -271,7 +271,7 @@ void __init paging_init(void) max_zone_pfns[ZONE_HIGHMEM] = max_pfn; #endif - free_area_init_nodes(max_zone_pfns); + free_area_init(max_zone_pfns); mark_nonram_nosave(); } diff --git a/arch/riscv/mm/init.c b/arch/riscv/mm/init.c index 736de6c8739f..6168b1985b77 100644 --- a/arch/riscv/mm/init.c +++ b/arch/riscv/mm/init.c @@ -39,7 +39,7 @@ static void __init zone_sizes_init(void) #endif max_zone_pfns[ZONE_NORMAL] = max_low_pfn; - free_area_init_nodes(max_zone_pfns); + free_area_init(max_zone_pfns); } static void setup_zero_page(void) diff --git a/arch/s390/mm/init.c b/arch/s390/mm/init.c index 87b2d024e75a..b11bcf4da531 100644 --- a/arch/s390/mm/init.c +++ b/arch/s390/mm/init.c @@ -122,7 +122,7 @@ void __init paging_init(void) memset(max_zone_pfns, 0, sizeof(max_zone_pfns)); max_zone_pfns[ZONE_DMA] = PFN_DOWN(MAX_DMA_ADDRESS); max_zone_pfns[ZONE_NORMAL] = max_low_pfn; - free_area_init_nodes(max_zone_pfns); + free_area_init(max_zone_pfns); } void mark_rodata_ro(void) diff --git a/arch/sh/mm/init.c b/arch/sh/mm/init.c index 8d2a68aea1fc..628f461b8993 100644 --- a/arch/sh/mm/init.c +++ b/arch/sh/mm/init.c @@ -334,7 +334,7 @@ void __init paging_init(void) memset(max_zone_pfns, 0, sizeof(max_zone_pfns)); max_zone_pfns[ZONE_NORMAL] = max_low_pfn; - free_area_init_nodes(max_zone_pfns); + free_area_init(max_zone_pfns); } unsigned int mem_init_done = 0; diff --git a/arch/sparc/mm/init_64.c b/arch/sparc/mm/init_64.c index 1cf0d666dea3..79d3c5e0802e 100644 --- a/arch/sparc/mm/init_64.c +++ b/arch/sparc/mm/init_64.c @@ -2488,7 +2488,7 @@ void __init paging_init(void) max_zone_pfns[ZONE_NORMAL] = end_pfn; - free_area_init_nodes(max_zone_pfns); + free_area_init(max_zone_pfns); } printk("Booting Linux...\n"); diff --git a/arch/x86/mm/init.c b/arch/x86/mm/init.c index a573a3e63f02..1decb645dac0 100644 --- a/arch/x86/mm/init.c +++ b/arch/x86/mm/init.c @@ -947,7 +947,7 @@ void __init zone_sizes_init(void) max_zone_pfns[ZONE_HIGHMEM] = max_pfn; #endif - free_area_init_nodes(max_zone_pfns); + free_area_init(max_zone_pfns); } __visible DEFINE_PER_CPU_SHARED_ALIGNED(struct tlb_state, cpu_tlbstate) = { diff --git a/include/linux/mm.h b/include/linux/mm.h index 788704977de0..ff2c19e14c1e 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -2329,7 +2329,6 @@ static inline spinlock_t *pud_lock(struct mm_struct *mm, pud_t *pud) } extern void __init pagecache_init(void); -extern void free_area_init(unsigned long * max_zone_pfn); extern void __init free_area_init_node(int nid, unsigned long * zones_size, unsigned long zone_start_pfn, unsigned long *zholes_size); extern void free_initmem(void); @@ -2410,21 +2409,21 @@ static inline unsigned long get_num_physpages(void) * * An architecture is expected to register range of page frames backed by * physical memory with memblock_add[_node]() before calling - * free_area_init_nodes() passing in the PFN each zone ends at. At a basic + * free_area_init() passing in the PFN each zone ends at. 
At a basic * usage, an architecture is expected to do something like * * unsigned long max_zone_pfns[MAX_NR_ZONES] = {max_dma, max_normal_pfn, * max_highmem_pfn}; * for_each_valid_physical_page_range() * memblock_add_node(base, size, nid) - * free_area_init_nodes(max_zone_pfns); + * free_area_init(max_zone_pfns); * * free_bootmem_with_active_regions() calls free_bootmem_node() for each * registered physical page range. Similarly * sparse_memory_present_with_active_regions() calls memory_present() for * each range when SPARSEMEM is enabled. */ -extern void free_area_init_nodes(unsigned long *max_zone_pfn); +void free_area_init(unsigned long *max_zone_pfn); unsigned long node_map_pfn_alignment(void); unsigned long __absent_pages_in_range(int nid, unsigned long start_pfn, unsigned long end_pfn); diff --git a/mm/page_alloc.c b/mm/page_alloc.c index cf420e947d9c..644a59d17318 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -7440,7 +7440,7 @@ static void check_for_memory(pg_data_t *pgdat, int nid) } /** - * free_area_init_nodes - Initialise all pg_data_t and zone data + * free_area_init - Initialise all pg_data_t and zone data * @max_zone_pfn: an array of max PFNs for each zone * * This will call free_area_init_node() for each active node in the system. @@ -7452,7 +7452,7 @@ static void check_for_memory(pg_data_t *pgdat, int nid) * starts where the previous one ended. For example, ZONE_DMA32 starts * at arch_max_dma_pfn. */ -void __init free_area_init_nodes(unsigned long *max_zone_pfn) +void __init free_area_init(unsigned long *max_zone_pfn) { unsigned long start_pfn, end_pfn; int i, nid; @@ -7712,12 +7712,6 @@ void __init set_dma_reserve(unsigned long new_dma_reserve) dma_reserve = new_dma_reserve; } -void __init free_area_init(unsigned long *max_zone_pfn) -{ - init_unavailable_mem(); - free_area_init_nodes(max_zone_pfn); -} - static int page_alloc_cpu_dead(unsigned int cpu) { -- cgit v1.2.3 From 51930df5801e4da60e962ea52b811634d257a148 Mon Sep 17 00:00:00 2001 From: Mike Rapoport Date: Wed, 3 Jun 2020 15:58:03 -0700 Subject: mm: free_area_init: allow defining max_zone_pfn in descending order Some architectures (e.g. ARC) have the ZONE_HIGHMEM zone below the ZONE_NORMAL. Allowing free_area_init() to parse the max_zone_pfn array even when it is sorted in descending order allows using free_area_init() on such architectures. Add top -> down traversal of the max_zone_pfn array in free_area_init() and use the latter in ARC node/zone initialization. [rppt@kernel.org: ARC fix] Link: http://lkml.kernel.org/r/20200504153901.GM14260@kernel.org [rppt@linux.ibm.com: arc: free_area_init(): take into account PAE40 mode] Link: http://lkml.kernel.org/r/20200507205900.GH683243@linux.ibm.com [akpm@linux-foundation.org: declare arch_has_descending_max_zone_pfns()] Signed-off-by: Mike Rapoport Signed-off-by: Andrew Morton Tested-by: Hoan Tran [arm64] Reviewed-by: Baoquan He Cc: Brian Cain Cc: Catalin Marinas Cc: "David S. Miller" Cc: Geert Uytterhoeven Cc: Greentime Hu Cc: Greg Ungerer Cc: Guan Xuetao Cc: Guo Ren Cc: Heiko Carstens Cc: Helge Deller Cc: "James E.J.
Bottomley" Cc: Jonathan Corbet Cc: Ley Foon Tan Cc: Mark Salter Cc: Matt Turner Cc: Max Filippov Cc: Michael Ellerman Cc: Michal Hocko Cc: Michal Simek Cc: Nick Hu Cc: Paul Walmsley Cc: Richard Weinberger Cc: Rich Felker Cc: Russell King Cc: Stafford Horne Cc: Thomas Bogendoerfer Cc: Tony Luck Cc: Vineet Gupta Cc: Yoshinori Sato Cc: Guenter Roeck Link: http://lkml.kernel.org/r/20200412194859.12663-18-rppt@kernel.org Signed-off-by: Linus Torvalds --- arch/arc/mm/init.c | 41 ++++++++++++----------------------------- include/linux/mm.h | 1 + mm/page_alloc.c | 26 +++++++++++++++++++++----- 3 files changed, 34 insertions(+), 34 deletions(-) (limited to 'include/linux') diff --git a/arch/arc/mm/init.c b/arch/arc/mm/init.c index 0920c969c466..e7bdc2ac1c87 100644 --- a/arch/arc/mm/init.c +++ b/arch/arc/mm/init.c @@ -63,11 +63,13 @@ void __init early_init_dt_add_memory_arch(u64 base, u64 size) low_mem_sz = size; in_use = 1; + memblock_add_node(base, size, 0); } else { #ifdef CONFIG_HIGHMEM high_mem_start = base; high_mem_sz = size; in_use = 1; + memblock_add_node(base, size, 1); #endif } @@ -75,6 +77,11 @@ void __init early_init_dt_add_memory_arch(u64 base, u64 size) base, TO_MB(size), !in_use ? "Not used":""); } +bool arch_has_descending_max_zone_pfns(void) +{ + return !IS_ENABLED(CONFIG_ARC_HAS_PAE40); +} + /* * First memory setup routine called from setup_arch() * 1. setup swapper's mm @init_mm @@ -83,8 +90,7 @@ void __init early_init_dt_add_memory_arch(u64 base, u64 size) */ void __init setup_arch_memory(void) { - unsigned long zones_size[MAX_NR_ZONES]; - unsigned long zones_holes[MAX_NR_ZONES]; + unsigned long max_zone_pfn[MAX_NR_ZONES] = { 0 }; init_mm.start_code = (unsigned long)_text; init_mm.end_code = (unsigned long)_etext; @@ -115,7 +121,6 @@ void __init setup_arch_memory(void) * the crash */ - memblock_add_node(low_mem_start, low_mem_sz, 0); memblock_reserve(CONFIG_LINUX_LINK_BASE, __pa(_end) - CONFIG_LINUX_LINK_BASE); @@ -133,22 +138,7 @@ void __init setup_arch_memory(void) memblock_dump_all(); /*----------------- node/zones setup --------------------------*/ - memset(zones_size, 0, sizeof(zones_size)); - memset(zones_holes, 0, sizeof(zones_holes)); - - zones_size[ZONE_NORMAL] = max_low_pfn - min_low_pfn; - zones_holes[ZONE_NORMAL] = 0; - - /* - * We can't use the helper free_area_init(zones[]) because it uses - * PAGE_OFFSET to compute the @min_low_pfn which would be wrong - * when our kernel doesn't start at PAGE_OFFSET, i.e. 
- PAGE_OFFSET != CONFIG_LINUX_RAM_BASE */ - free_area_init_node(0, /* node-id */ - zones_size, /* num pages per zone */ - min_low_pfn, /* first pfn of node */ - zones_holes); /* holes */ + max_zone_pfn[ZONE_NORMAL] = max_low_pfn; #ifdef CONFIG_HIGHMEM /* @@ -168,20 +158,13 @@ void __init setup_arch_memory(void) min_high_pfn = PFN_DOWN(high_mem_start); max_high_pfn = PFN_DOWN(high_mem_start + high_mem_sz); - zones_size[ZONE_NORMAL] = 0; - zones_holes[ZONE_NORMAL] = 0; - - zones_size[ZONE_HIGHMEM] = max_high_pfn - min_high_pfn; - zones_holes[ZONE_HIGHMEM] = 0; - - free_area_init_node(1, /* node-id */ - zones_size, /* num pages per zone */ - min_high_pfn, /* first pfn of node */ - zones_holes); /* holes */ + max_zone_pfn[ZONE_HIGHMEM] = max_high_pfn; high_memory = (void *)(min_high_pfn << PAGE_SHIFT); kmap_init(); #endif + + free_area_init(max_zone_pfn); } /* diff --git a/include/linux/mm.h b/include/linux/mm.h index ff2c19e14c1e..21cf171ae9de 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -2473,6 +2473,7 @@ extern void setup_per_cpu_pageset(void); extern int min_free_kbytes; extern int watermark_boost_factor; extern int watermark_scale_factor; +extern bool arch_has_descending_max_zone_pfns(void); /* nommu.c */ extern atomic_long_t mmap_pages_allocated; diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 1f7eff7120d7..36d93c73f2bb 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -7408,6 +7408,15 @@ static void check_for_memory(pg_data_t *pgdat, int nid) } } +/* + * Some architectures, e.g. ARC, may have ZONE_HIGHMEM below ZONE_NORMAL. For + * such cases we allow max_zone_pfn sorted in descending order + */ +bool __weak arch_has_descending_max_zone_pfns(void) +{ + return false; +} + /** * free_area_init - Initialise all pg_data_t and zone data * @max_zone_pfn: an array of max PFNs for each zone @@ -7424,7 +7433,8 @@ static void check_for_memory(pg_data_t *pgdat, int nid) void __init free_area_init(unsigned long *max_zone_pfn) { unsigned long start_pfn, end_pfn; - int i, nid; + int i, nid, zone; + bool descending; /* Record where the zone boundaries are */ memset(arch_zone_lowest_possible_pfn, 0, @@ -7433,14 +7443,20 @@ void __init free_area_init(unsigned long *max_zone_pfn) sizeof(arch_zone_highest_possible_pfn)); start_pfn = find_min_pfn_with_active_regions(); + descending = arch_has_descending_max_zone_pfns(); for (i = 0; i < MAX_NR_ZONES; i++) { - if (i == ZONE_MOVABLE) + if (descending) + zone = MAX_NR_ZONES - i - 1; + else + zone = i; + + if (zone == ZONE_MOVABLE) continue; - end_pfn = max(max_zone_pfn[i], start_pfn); - arch_zone_lowest_possible_pfn[i] = start_pfn; - arch_zone_highest_possible_pfn[i] = end_pfn; + end_pfn = max(max_zone_pfn[zone], start_pfn); + arch_zone_lowest_possible_pfn[zone] = start_pfn; + arch_zone_highest_possible_pfn[zone] = end_pfn; start_pfn = end_pfn; } -- cgit v1.2.3 From bc9331a19d758706493cbebba67ca70382edddac Mon Sep 17 00:00:00 2001 From: Mike Rapoport Date: Wed, 3 Jun 2020 15:58:09 -0700 Subject: mm: rename free_area_init_node() to free_area_init_memoryless_node() free_area_init_node() is only used by x86 to initialize memory-less nodes. Make its name reflect this and drop all the function parameters except node ID as they are anyway zero. Signed-off-by: Mike Rapoport Signed-off-by: Andrew Morton Tested-by: Hoan Tran [arm64] Cc: Baoquan He Cc: Brian Cain Cc: Catalin Marinas Cc: "David S. Miller" Cc: Geert Uytterhoeven Cc: Greentime Hu Cc: Greg Ungerer Cc: Guan Xuetao Cc: Guo Ren Cc: Heiko Carstens Cc: Helge Deller Cc: "James E.J.
Bottomley" Cc: Jonathan Corbet Cc: Ley Foon Tan Cc: Mark Salter Cc: Matt Turner Cc: Max Filippov Cc: Michael Ellerman Cc: Michal Hocko Cc: Michal Simek Cc: Nick Hu Cc: Paul Walmsley Cc: Richard Weinberger Cc: Rich Felker Cc: Russell King Cc: Stafford Horne Cc: Thomas Bogendoerfer Cc: Tony Luck Cc: Vineet Gupta Cc: Yoshinori Sato Link: http://lkml.kernel.org/r/20200412194859.12663-19-rppt@kernel.org Signed-off-by: Linus Torvalds --- arch/x86/mm/numa.c | 5 +---- include/linux/mm.h | 9 +++------ mm/page_alloc.c | 7 ++----- 3 files changed, 6 insertions(+), 15 deletions(-) (limited to 'include/linux') diff --git a/arch/x86/mm/numa.c b/arch/x86/mm/numa.c index fe024b2ac796..8ee952038c80 100644 --- a/arch/x86/mm/numa.c +++ b/arch/x86/mm/numa.c @@ -737,12 +737,9 @@ void __init x86_numa_init(void) static void __init init_memory_less_node(int nid) { - unsigned long zones_size[MAX_NR_ZONES] = {0}; - unsigned long zholes_size[MAX_NR_ZONES] = {0}; - /* Allocate and initialize node data. Memory-less node is now online.*/ alloc_node_data(nid); - free_area_init_node(nid, zones_size, 0, zholes_size); + free_area_init_memoryless_node(nid); /* * All zonelists will be built later in start_kernel() after per cpu diff --git a/include/linux/mm.h b/include/linux/mm.h index 21cf171ae9de..0d998c84231c 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -2329,8 +2329,7 @@ static inline spinlock_t *pud_lock(struct mm_struct *mm, pud_t *pud) } extern void __init pagecache_init(void); -extern void __init free_area_init_node(int nid, unsigned long * zones_size, - unsigned long zone_start_pfn, unsigned long *zholes_size); +extern void __init free_area_init_memoryless_node(int nid); extern void free_initmem(void); /* @@ -2402,10 +2401,8 @@ static inline unsigned long get_num_physpages(void) /* * Using memblock node mappings, an architecture may initialise its - * zones, allocate the backing mem_map and account for memory holes in a more - * architecture independent manner. This is a substitute for creating the - * zone_sizes[] and zholes_size[] arrays and passing them to - * free_area_init_node() + * zones, allocate the backing mem_map and account for memory holes in an + * architecture independent manner. * * An architecture is expected to register range of page frames backed by * physical memory with memblock_add[_node]() before calling diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 36d93c73f2bb..cc96ecbe52f7 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -6974,12 +6974,9 @@ static void __init __free_area_init_node(int nid, unsigned long *zones_size, free_area_init_core(pgdat); } -void __init free_area_init_node(int nid, unsigned long *zones_size, - unsigned long node_start_pfn, - unsigned long *zholes_size) +void __init free_area_init_memoryless_node(int nid) { - __free_area_init_node(nid, zones_size, node_start_pfn, zholes_size, - true); + __free_area_init_node(nid, NULL, 0, NULL, false); } #if !defined(CONFIG_FLAT_NODE_MEM_MAP) -- cgit v1.2.3 From 4ca7be24eeb3198dffdae9472d7464c8b8cadadb Mon Sep 17 00:00:00 2001 From: Baoquan He Date: Wed, 3 Jun 2020 15:58:45 -0700 Subject: mm/page_alloc.c: remove unused free_bootmem_with_active_regions Since commit 397dc00e249ec64e10 ("mips: sgi-ip27: switch from DISCONTIGMEM to SPARSEMEM"), the last caller of free_bootmem_with_active_regions() was gone. Now no user calls it any more. Let's remove it. 
Signed-off-by: Baoquan He Signed-off-by: Andrew Morton Reviewed-by: David Hildenbrand Acked-by: Michal Hocko Link: http://lkml.kernel.org/r/20200402143455.5145-1-bhe@redhat.com Signed-off-by: Linus Torvalds --- include/linux/mm.h | 4 ---- mm/page_alloc.c | 25 ------------------------- 2 files changed, 29 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mm.h b/include/linux/mm.h index 0d998c84231c..4141ebcb3a65 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -2415,8 +2415,6 @@ static inline unsigned long get_num_physpages(void) * memblock_add_node(base, size, nid) * free_area_init(max_zone_pfns); * - * free_bootmem_with_active_regions() calls free_bootmem_node() for each - * registered physical page range. Similarly * sparse_memory_present_with_active_regions() calls memory_present() for * each range when SPARSEMEM is enabled. */ @@ -2429,8 +2427,6 @@ extern unsigned long absent_pages_in_range(unsigned long start_pfn, extern void get_pfn_range_for_nid(unsigned int nid, unsigned long *start_pfn, unsigned long *end_pfn); extern unsigned long find_min_pfn_with_active_regions(void); -extern void free_bootmem_with_active_regions(int nid, - unsigned long max_low_pfn); extern void sparse_memory_present_with_active_regions(int nid); #ifndef CONFIG_NEED_MULTIPLE_NODES diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 5207a9e86388..a1a4f883b7f5 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -6295,31 +6295,6 @@ void __meminit init_currently_empty_zone(struct zone *zone, zone->initialized = 1; } -/** - * free_bootmem_with_active_regions - Call memblock_free_early_nid for each active range - * @nid: The node to free memory on. If MAX_NUMNODES, all nodes are freed. - * @max_low_pfn: The highest PFN that will be passed to memblock_free_early_nid - * - * If an architecture guarantees that all ranges registered contain no holes - * and may be freed, this this function may be used instead of calling - * memblock_free_early_nid() manually. - */ -void __init free_bootmem_with_active_regions(int nid, unsigned long max_low_pfn) -{ - unsigned long start_pfn, end_pfn; - int i, this_nid; - - for_each_mem_pfn_range(i, nid, &start_pfn, &end_pfn, &this_nid) { - start_pfn = min(start_pfn, max_low_pfn); - end_pfn = min(end_pfn, max_low_pfn); - - if (start_pfn < end_pfn) - memblock_free_early_nid(PFN_PHYS(start_pfn), - (end_pfn - start_pfn) << PAGE_SHIFT, - this_nid); - } -} - /** * sparse_memory_present_with_active_regions - Call memory_present for each active range * @nid: The node to call memory_present for. If MAX_NUMNODES, all nodes will be used. -- cgit v1.2.3 From 97a225e69a1f880886f33d2e65a7ace13f152caa Mon Sep 17 00:00:00 2001 From: Joonsoo Kim Date: Wed, 3 Jun 2020 15:59:01 -0700 Subject: mm/page_alloc: integrate classzone_idx and high_zoneidx classzone_idx is just a different name for high_zoneidx now. So, integrate them and add a comment to struct alloc_context in order to reduce future confusion about the meaning of this variable. The accessor ac_classzone_idx() is also removed since it isn't needed after integration. In addition to integration, this patch also renames high_zoneidx to highest_zoneidx since it represents the meaning more precisely.
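The role of the field is easiest to see in the watermark check; the condensed sketch below is modelled on the __zone_watermark_ok() hunk further down and is illustrative only, not the code as merged (the helper name zone_ok_for_request is hypothetical):

#include <linux/mmzone.h>
#include <linux/vmstat.h>

/*
 * Sketch: highest_zoneidx both bounds which zones a request may use and
 * selects the lowmem reserve that protects lower zones from that request.
 * In the allocator it is derived once per allocation as gfp_zone(gfp_mask),
 * as the prepare_alloc_pages() hunk below shows.
 */
static bool zone_ok_for_request(struct zone *z, unsigned long mark,
				enum zone_type highest_zoneidx)
{
	long free_pages = zone_page_state(z, NR_FREE_PAGES);

	/*
	 * A request allowed to reach up to highest_zoneidx must leave
	 * lowmem_reserve[highest_zoneidx] pages untouched in zone @z.
	 */
	return free_pages > mark + z->lowmem_reserve[highest_zoneidx];
}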
Signed-off-by: Joonsoo Kim Signed-off-by: Andrew Morton Reviewed-by: Baoquan He Acked-by: Vlastimil Babka Acked-by: David Rientjes Cc: Johannes Weiner Cc: Mel Gorman Cc: Michal Hocko Cc: Minchan Kim Cc: Ye Xiaolong Link: http://lkml.kernel.org/r/1587095923-7515-3-git-send-email-iamjoonsoo.kim@lge.com Signed-off-by: Linus Torvalds --- include/linux/compaction.h | 9 ++-- include/linux/mmzone.h | 12 ++--- include/trace/events/compaction.h | 22 ++++---- include/trace/events/vmscan.h | 14 +++-- mm/compaction.c | 64 +++++++++++------------ mm/internal.h | 21 +++++--- mm/memory_hotplug.c | 6 +-- mm/oom_kill.c | 4 +- mm/page_alloc.c | 60 +++++++++++----------- mm/slab.c | 4 +- mm/slub.c | 4 +- mm/vmscan.c | 105 ++++++++++++++++++++------------------ 12 files changed, 175 insertions(+), 150 deletions(-) (limited to 'include/linux') diff --git a/include/linux/compaction.h b/include/linux/compaction.h index 4b898cdbdf05..3ed2f22b588a 100644 --- a/include/linux/compaction.h +++ b/include/linux/compaction.h @@ -97,7 +97,7 @@ extern enum compact_result try_to_compact_pages(gfp_t gfp_mask, struct page **page); extern void reset_isolation_suitable(pg_data_t *pgdat); extern enum compact_result compaction_suitable(struct zone *zone, int order, - unsigned int alloc_flags, int classzone_idx); + unsigned int alloc_flags, int highest_zoneidx); extern void defer_compaction(struct zone *zone, int order); extern bool compaction_deferred(struct zone *zone, int order); @@ -182,7 +182,7 @@ bool compaction_zonelist_suitable(struct alloc_context *ac, int order, extern int kcompactd_run(int nid); extern void kcompactd_stop(int nid); -extern void wakeup_kcompactd(pg_data_t *pgdat, int order, int classzone_idx); +extern void wakeup_kcompactd(pg_data_t *pgdat, int order, int highest_zoneidx); #else static inline void reset_isolation_suitable(pg_data_t *pgdat) @@ -190,7 +190,7 @@ static inline void reset_isolation_suitable(pg_data_t *pgdat) } static inline enum compact_result compaction_suitable(struct zone *zone, int order, - int alloc_flags, int classzone_idx) + int alloc_flags, int highest_zoneidx) { return COMPACT_SKIPPED; } @@ -232,7 +232,8 @@ static inline void kcompactd_stop(int nid) { } -static inline void wakeup_kcompactd(pg_data_t *pgdat, int order, int classzone_idx) +static inline void wakeup_kcompactd(pg_data_t *pgdat, + int order, int highest_zoneidx) { } diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index 0c575c3d7feb..cd8bd5f90552 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -699,13 +699,13 @@ typedef struct pglist_data { struct task_struct *kswapd; /* Protected by mem_hotplug_begin/end() */ int kswapd_order; - enum zone_type kswapd_classzone_idx; + enum zone_type kswapd_highest_zoneidx; int kswapd_failures; /* Number of 'reclaimed == 0' runs */ #ifdef CONFIG_COMPACTION int kcompactd_max_order; - enum zone_type kcompactd_classzone_idx; + enum zone_type kcompactd_highest_zoneidx; wait_queue_head_t kcompactd_wait; struct task_struct *kcompactd; #endif @@ -783,15 +783,15 @@ static inline bool pgdat_is_empty(pg_data_t *pgdat) void build_all_zonelists(pg_data_t *pgdat); void wakeup_kswapd(struct zone *zone, gfp_t gfp_mask, int order, - enum zone_type classzone_idx); + enum zone_type highest_zoneidx); bool __zone_watermark_ok(struct zone *z, unsigned int order, unsigned long mark, - int classzone_idx, unsigned int alloc_flags, + int highest_zoneidx, unsigned int alloc_flags, long free_pages); bool zone_watermark_ok(struct zone *z, unsigned int order, - unsigned long mark, int 
classzone_idx, + unsigned long mark, int highest_zoneidx, unsigned int alloc_flags); bool zone_watermark_ok_safe(struct zone *z, unsigned int order, - unsigned long mark, int classzone_idx); + unsigned long mark, int highest_zoneidx); enum memmap_context { MEMMAP_EARLY, MEMMAP_HOTPLUG, diff --git a/include/trace/events/compaction.h b/include/trace/events/compaction.h index e5bf6ee4e814..54e5bf081171 100644 --- a/include/trace/events/compaction.h +++ b/include/trace/events/compaction.h @@ -314,40 +314,44 @@ TRACE_EVENT(mm_compaction_kcompactd_sleep, DECLARE_EVENT_CLASS(kcompactd_wake_template, - TP_PROTO(int nid, int order, enum zone_type classzone_idx), + TP_PROTO(int nid, int order, enum zone_type highest_zoneidx), - TP_ARGS(nid, order, classzone_idx), + TP_ARGS(nid, order, highest_zoneidx), TP_STRUCT__entry( __field(int, nid) __field(int, order) - __field(enum zone_type, classzone_idx) + __field(enum zone_type, highest_zoneidx) ), TP_fast_assign( __entry->nid = nid; __entry->order = order; - __entry->classzone_idx = classzone_idx; + __entry->highest_zoneidx = highest_zoneidx; ), + /* + * classzone_idx is previous name of the highest_zoneidx. + * Reason not to change it is the ABI requirement of the tracepoint. + */ TP_printk("nid=%d order=%d classzone_idx=%-8s", __entry->nid, __entry->order, - __print_symbolic(__entry->classzone_idx, ZONE_TYPE)) + __print_symbolic(__entry->highest_zoneidx, ZONE_TYPE)) ); DEFINE_EVENT(kcompactd_wake_template, mm_compaction_wakeup_kcompactd, - TP_PROTO(int nid, int order, enum zone_type classzone_idx), + TP_PROTO(int nid, int order, enum zone_type highest_zoneidx), - TP_ARGS(nid, order, classzone_idx) + TP_ARGS(nid, order, highest_zoneidx) ); DEFINE_EVENT(kcompactd_wake_template, mm_compaction_kcompactd_wake, - TP_PROTO(int nid, int order, enum zone_type classzone_idx), + TP_PROTO(int nid, int order, enum zone_type highest_zoneidx), - TP_ARGS(nid, order, classzone_idx) + TP_ARGS(nid, order, highest_zoneidx) ); #endif diff --git a/include/trace/events/vmscan.h b/include/trace/events/vmscan.h index 74bb594ccb25..2070df64958e 100644 --- a/include/trace/events/vmscan.h +++ b/include/trace/events/vmscan.h @@ -265,7 +265,7 @@ TRACE_EVENT(mm_shrink_slab_end, ); TRACE_EVENT(mm_vmscan_lru_isolate, - TP_PROTO(int classzone_idx, + TP_PROTO(int highest_zoneidx, int order, unsigned long nr_requested, unsigned long nr_scanned, @@ -274,10 +274,10 @@ TRACE_EVENT(mm_vmscan_lru_isolate, isolate_mode_t isolate_mode, int lru), - TP_ARGS(classzone_idx, order, nr_requested, nr_scanned, nr_skipped, nr_taken, isolate_mode, lru), + TP_ARGS(highest_zoneidx, order, nr_requested, nr_scanned, nr_skipped, nr_taken, isolate_mode, lru), TP_STRUCT__entry( - __field(int, classzone_idx) + __field(int, highest_zoneidx) __field(int, order) __field(unsigned long, nr_requested) __field(unsigned long, nr_scanned) @@ -288,7 +288,7 @@ TRACE_EVENT(mm_vmscan_lru_isolate, ), TP_fast_assign( - __entry->classzone_idx = classzone_idx; + __entry->highest_zoneidx = highest_zoneidx; __entry->order = order; __entry->nr_requested = nr_requested; __entry->nr_scanned = nr_scanned; @@ -298,9 +298,13 @@ TRACE_EVENT(mm_vmscan_lru_isolate, __entry->lru = lru; ), + /* + * classzone is previous name of the highest_zoneidx. + * Reason not to change it is the ABI requirement of the tracepoint. 
+ */ TP_printk("isolate_mode=%d classzone=%d order=%d nr_requested=%lu nr_scanned=%lu nr_skipped=%lu nr_taken=%lu lru=%s", __entry->isolate_mode, - __entry->classzone_idx, + __entry->highest_zoneidx, __entry->order, __entry->nr_requested, __entry->nr_scanned, diff --git a/mm/compaction.c b/mm/compaction.c index 8c2961100840..883355de4ace 100644 --- a/mm/compaction.c +++ b/mm/compaction.c @@ -1968,7 +1968,7 @@ static enum compact_result compact_finished(struct compact_control *cc) */ static enum compact_result __compaction_suitable(struct zone *zone, int order, unsigned int alloc_flags, - int classzone_idx, + int highest_zoneidx, unsigned long wmark_target) { unsigned long watermark; @@ -1981,7 +1981,7 @@ static enum compact_result __compaction_suitable(struct zone *zone, int order, * If watermarks for high-order allocation are already met, there * should be no need for compaction at all. */ - if (zone_watermark_ok(zone, order, watermark, classzone_idx, + if (zone_watermark_ok(zone, order, watermark, highest_zoneidx, alloc_flags)) return COMPACT_SUCCESS; @@ -1991,9 +1991,9 @@ static enum compact_result __compaction_suitable(struct zone *zone, int order, * watermark and alloc_flags have to match, or be more pessimistic than * the check in __isolate_free_page(). We don't use the direct * compactor's alloc_flags, as they are not relevant for freepage - * isolation. We however do use the direct compactor's classzone_idx to - * skip over zones where lowmem reserves would prevent allocation even - * if compaction succeeds. + * isolation. We however do use the direct compactor's highest_zoneidx + * to skip over zones where lowmem reserves would prevent allocation + * even if compaction succeeds. * For costly orders, we require low watermark instead of min for * compaction to proceed to increase its chances. * ALLOC_CMA is used, as pages in CMA pageblocks are considered @@ -2002,7 +2002,7 @@ static enum compact_result __compaction_suitable(struct zone *zone, int order, watermark = (order > PAGE_ALLOC_COSTLY_ORDER) ? low_wmark_pages(zone) : min_wmark_pages(zone); watermark += compact_gap(order); - if (!__zone_watermark_ok(zone, 0, watermark, classzone_idx, + if (!__zone_watermark_ok(zone, 0, watermark, highest_zoneidx, ALLOC_CMA, wmark_target)) return COMPACT_SKIPPED; @@ -2011,12 +2011,12 @@ static enum compact_result __compaction_suitable(struct zone *zone, int order, enum compact_result compaction_suitable(struct zone *zone, int order, unsigned int alloc_flags, - int classzone_idx) + int highest_zoneidx) { enum compact_result ret; int fragindex; - ret = __compaction_suitable(zone, order, alloc_flags, classzone_idx, + ret = __compaction_suitable(zone, order, alloc_flags, highest_zoneidx, zone_page_state(zone, NR_FREE_PAGES)); /* * fragmentation index determines if allocation failures are due to @@ -2057,8 +2057,8 @@ bool compaction_zonelist_suitable(struct alloc_context *ac, int order, * Make sure at least one zone would pass __compaction_suitable if we continue * retrying the reclaim. 
*/ - for_each_zone_zonelist_nodemask(zone, z, ac->zonelist, ac->high_zoneidx, - ac->nodemask) { + for_each_zone_zonelist_nodemask(zone, z, ac->zonelist, + ac->highest_zoneidx, ac->nodemask) { unsigned long available; enum compact_result compact_result; @@ -2071,7 +2071,7 @@ bool compaction_zonelist_suitable(struct alloc_context *ac, int order, available = zone_reclaimable_pages(zone) / order; available += zone_page_state_snapshot(zone, NR_FREE_PAGES); compact_result = __compaction_suitable(zone, order, alloc_flags, - ac_classzone_idx(ac), available); + ac->highest_zoneidx, available); if (compact_result != COMPACT_SKIPPED) return true; } @@ -2102,7 +2102,7 @@ compact_zone(struct compact_control *cc, struct capture_control *capc) cc->migratetype = gfpflags_to_migratetype(cc->gfp_mask); ret = compaction_suitable(cc->zone, cc->order, cc->alloc_flags, - cc->classzone_idx); + cc->highest_zoneidx); /* Compaction is likely to fail */ if (ret == COMPACT_SUCCESS || ret == COMPACT_SKIPPED) return ret; @@ -2293,7 +2293,7 @@ out: static enum compact_result compact_zone_order(struct zone *zone, int order, gfp_t gfp_mask, enum compact_priority prio, - unsigned int alloc_flags, int classzone_idx, + unsigned int alloc_flags, int highest_zoneidx, struct page **capture) { enum compact_result ret; @@ -2305,7 +2305,7 @@ static enum compact_result compact_zone_order(struct zone *zone, int order, .mode = (prio == COMPACT_PRIO_ASYNC) ? MIGRATE_ASYNC : MIGRATE_SYNC_LIGHT, .alloc_flags = alloc_flags, - .classzone_idx = classzone_idx, + .highest_zoneidx = highest_zoneidx, .direct_compaction = true, .whole_zone = (prio == MIN_COMPACT_PRIORITY), .ignore_skip_hint = (prio == MIN_COMPACT_PRIORITY), @@ -2361,8 +2361,8 @@ enum compact_result try_to_compact_pages(gfp_t gfp_mask, unsigned int order, trace_mm_compaction_try_to_compact_pages(order, gfp_mask, prio); /* Compact each zone in the list */ - for_each_zone_zonelist_nodemask(zone, z, ac->zonelist, ac->high_zoneidx, - ac->nodemask) { + for_each_zone_zonelist_nodemask(zone, z, ac->zonelist, + ac->highest_zoneidx, ac->nodemask) { enum compact_result status; if (prio > MIN_COMPACT_PRIORITY @@ -2372,7 +2372,7 @@ enum compact_result try_to_compact_pages(gfp_t gfp_mask, unsigned int order, } status = compact_zone_order(zone, order, gfp_mask, prio, - alloc_flags, ac_classzone_idx(ac), capture); + alloc_flags, ac->highest_zoneidx, capture); rc = max(status, rc); /* The allocation should succeed, stop compacting */ @@ -2507,16 +2507,16 @@ static bool kcompactd_node_suitable(pg_data_t *pgdat) { int zoneid; struct zone *zone; - enum zone_type classzone_idx = pgdat->kcompactd_classzone_idx; + enum zone_type highest_zoneidx = pgdat->kcompactd_highest_zoneidx; - for (zoneid = 0; zoneid <= classzone_idx; zoneid++) { + for (zoneid = 0; zoneid <= highest_zoneidx; zoneid++) { zone = &pgdat->node_zones[zoneid]; if (!populated_zone(zone)) continue; if (compaction_suitable(zone, pgdat->kcompactd_max_order, 0, - classzone_idx) == COMPACT_CONTINUE) + highest_zoneidx) == COMPACT_CONTINUE) return true; } @@ -2534,16 +2534,16 @@ static void kcompactd_do_work(pg_data_t *pgdat) struct compact_control cc = { .order = pgdat->kcompactd_max_order, .search_order = pgdat->kcompactd_max_order, - .classzone_idx = pgdat->kcompactd_classzone_idx, + .highest_zoneidx = pgdat->kcompactd_highest_zoneidx, .mode = MIGRATE_SYNC_LIGHT, .ignore_skip_hint = false, .gfp_mask = GFP_KERNEL, }; trace_mm_compaction_kcompactd_wake(pgdat->node_id, cc.order, - cc.classzone_idx); + cc.highest_zoneidx); 
count_compact_event(KCOMPACTD_WAKE); - for (zoneid = 0; zoneid <= cc.classzone_idx; zoneid++) { + for (zoneid = 0; zoneid <= cc.highest_zoneidx; zoneid++) { int status; zone = &pgdat->node_zones[zoneid]; @@ -2592,16 +2592,16 @@ static void kcompactd_do_work(pg_data_t *pgdat) /* * Regardless of success, we are done until woken up next. But remember - * the requested order/classzone_idx in case it was higher/tighter than - * our current ones + * the requested order/highest_zoneidx in case it was higher/tighter + * than our current ones */ if (pgdat->kcompactd_max_order <= cc.order) pgdat->kcompactd_max_order = 0; - if (pgdat->kcompactd_classzone_idx >= cc.classzone_idx) - pgdat->kcompactd_classzone_idx = pgdat->nr_zones - 1; + if (pgdat->kcompactd_highest_zoneidx >= cc.highest_zoneidx) + pgdat->kcompactd_highest_zoneidx = pgdat->nr_zones - 1; } -void wakeup_kcompactd(pg_data_t *pgdat, int order, int classzone_idx) +void wakeup_kcompactd(pg_data_t *pgdat, int order, int highest_zoneidx) { if (!order) return; @@ -2609,8 +2609,8 @@ void wakeup_kcompactd(pg_data_t *pgdat, int order, int classzone_idx) if (pgdat->kcompactd_max_order < order) pgdat->kcompactd_max_order = order; - if (pgdat->kcompactd_classzone_idx > classzone_idx) - pgdat->kcompactd_classzone_idx = classzone_idx; + if (pgdat->kcompactd_highest_zoneidx > highest_zoneidx) + pgdat->kcompactd_highest_zoneidx = highest_zoneidx; /* * Pairs with implicit barrier in wait_event_freezable() @@ -2623,7 +2623,7 @@ void wakeup_kcompactd(pg_data_t *pgdat, int order, int classzone_idx) return; trace_mm_compaction_wakeup_kcompactd(pgdat->node_id, order, - classzone_idx); + highest_zoneidx); wake_up_interruptible(&pgdat->kcompactd_wait); } @@ -2644,7 +2644,7 @@ static int kcompactd(void *p) set_freezable(); pgdat->kcompactd_max_order = 0; - pgdat->kcompactd_classzone_idx = pgdat->nr_zones - 1; + pgdat->kcompactd_highest_zoneidx = pgdat->nr_zones - 1; while (!kthread_should_stop()) { unsigned long pflags; diff --git a/mm/internal.h b/mm/internal.h index 6220a5e6b3c7..b1f0afcbe016 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -127,10 +127,10 @@ extern pmd_t *mm_find_pmd(struct mm_struct *mm, unsigned long address); * between functions involved in allocations, including the alloc_pages* * family of functions. * - * nodemask, migratetype and high_zoneidx are initialized only once in + * nodemask, migratetype and highest_zoneidx are initialized only once in * __alloc_pages_nodemask() and then never change. * - * zonelist, preferred_zone and classzone_idx are set first in + * zonelist, preferred_zone and highest_zoneidx are set first in * __alloc_pages_nodemask() for the fast path, and might be later changed * in __alloc_pages_slowpath(). All other functions pass the whole strucure * by a const pointer. @@ -140,12 +140,21 @@ struct alloc_context { nodemask_t *nodemask; struct zoneref *preferred_zoneref; int migratetype; - enum zone_type high_zoneidx; + + /* + * highest_zoneidx represents highest usable zone index of + * the allocation request. Due to the nature of the zone, + * memory on lower zone than the highest_zoneidx will be + * protected by lowmem_reserve[highest_zoneidx]. + * + * highest_zoneidx is also used by reclaim/compaction to limit + * the target zone since higher zone than this index cannot be + * usable for this allocation request. 
+ */ + enum zone_type highest_zoneidx; bool spread_dirty_pages; }; -#define ac_classzone_idx(ac) (ac->high_zoneidx) - /* * Locate the struct page for both the matching buddy in our * pair (buddy1) and the combined O(n+1) page they form (page). @@ -224,7 +233,7 @@ struct compact_control { int order; /* order a direct compactor needs */ int migratetype; /* migratetype of direct compactor */ const unsigned int alloc_flags; /* alloc flags of a direct compactor */ - const int classzone_idx; /* zone index of a direct compactor */ + const int highest_zoneidx; /* zone index of a direct compactor */ enum migrate_mode mode; /* Async or sync migration mode */ bool ignore_skip_hint; /* Scan blocks even if marked skip */ bool no_set_skip_hint; /* Don't mark blocks for skipping */ diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index e67dc501576a..926ec704e835 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c @@ -879,13 +879,13 @@ static pg_data_t __ref *hotadd_new_pgdat(int nid, u64 start) } else { int cpu; /* - * Reset the nr_zones, order and classzone_idx before reuse. - * Note that kswapd will init kswapd_classzone_idx properly + * Reset the nr_zones, order and highest_zoneidx before reuse. + * Note that kswapd will init kswapd_highest_zoneidx properly * when it starts in the near future. */ pgdat->nr_zones = 0; pgdat->kswapd_order = 0; - pgdat->kswapd_classzone_idx = 0; + pgdat->kswapd_highest_zoneidx = 0; for_each_online_cpu(cpu) { struct per_cpu_nodestat *p; diff --git a/mm/oom_kill.c b/mm/oom_kill.c index dfc357614e56..4daedf7b91f6 100644 --- a/mm/oom_kill.c +++ b/mm/oom_kill.c @@ -254,7 +254,7 @@ static enum oom_constraint constrained_alloc(struct oom_control *oc) { struct zone *zone; struct zoneref *z; - enum zone_type high_zoneidx = gfp_zone(oc->gfp_mask); + enum zone_type highest_zoneidx = gfp_zone(oc->gfp_mask); bool cpuset_limited = false; int nid; @@ -294,7 +294,7 @@ static enum oom_constraint constrained_alloc(struct oom_control *oc) /* Check this allocation failure is caused by cpuset's wall function */ for_each_zone_zonelist_nodemask(zone, z, oc->zonelist, - high_zoneidx, oc->nodemask) + highest_zoneidx, oc->nodemask) if (!cpuset_zone_allowed(zone, oc->gfp_mask)) cpuset_limited = true; diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 5b8d0966d429..5ef1eff330a2 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -2593,7 +2593,7 @@ static bool unreserve_highatomic_pageblock(const struct alloc_context *ac, int order; bool ret; - for_each_zone_zonelist_nodemask(zone, z, zonelist, ac->high_zoneidx, + for_each_zone_zonelist_nodemask(zone, z, zonelist, ac->highest_zoneidx, ac->nodemask) { /* * Preserve at least one pageblock unless memory pressure @@ -3462,7 +3462,7 @@ ALLOW_ERROR_INJECTION(should_fail_alloc_page, TRUE); * to check in the allocation paths if no pages are free. */ bool __zone_watermark_ok(struct zone *z, unsigned int order, unsigned long mark, - int classzone_idx, unsigned int alloc_flags, + int highest_zoneidx, unsigned int alloc_flags, long free_pages) { long min = mark; @@ -3507,7 +3507,7 @@ bool __zone_watermark_ok(struct zone *z, unsigned int order, unsigned long mark, * are not met, then a high-order request also cannot go ahead * even if a suitable page happened to be free. 
*/ - if (free_pages <= min + z->lowmem_reserve[classzone_idx]) + if (free_pages <= min + z->lowmem_reserve[highest_zoneidx]) return false; /* If this is an order-0 request then the watermark is fine */ @@ -3540,14 +3540,15 @@ bool __zone_watermark_ok(struct zone *z, unsigned int order, unsigned long mark, } bool zone_watermark_ok(struct zone *z, unsigned int order, unsigned long mark, - int classzone_idx, unsigned int alloc_flags) + int highest_zoneidx, unsigned int alloc_flags) { - return __zone_watermark_ok(z, order, mark, classzone_idx, alloc_flags, + return __zone_watermark_ok(z, order, mark, highest_zoneidx, alloc_flags, zone_page_state(z, NR_FREE_PAGES)); } static inline bool zone_watermark_fast(struct zone *z, unsigned int order, - unsigned long mark, int classzone_idx, unsigned int alloc_flags) + unsigned long mark, int highest_zoneidx, + unsigned int alloc_flags) { long free_pages = zone_page_state(z, NR_FREE_PAGES); long cma_pages = 0; @@ -3565,22 +3566,23 @@ static inline bool zone_watermark_fast(struct zone *z, unsigned int order, * the caller is !atomic then it'll uselessly search the free * list. That corner case is then slower but it is harmless. */ - if (!order && (free_pages - cma_pages) > mark + z->lowmem_reserve[classzone_idx]) + if (!order && (free_pages - cma_pages) > + mark + z->lowmem_reserve[highest_zoneidx]) return true; - return __zone_watermark_ok(z, order, mark, classzone_idx, alloc_flags, + return __zone_watermark_ok(z, order, mark, highest_zoneidx, alloc_flags, free_pages); } bool zone_watermark_ok_safe(struct zone *z, unsigned int order, - unsigned long mark, int classzone_idx) + unsigned long mark, int highest_zoneidx) { long free_pages = zone_page_state(z, NR_FREE_PAGES); if (z->percpu_drift_mark && free_pages < z->percpu_drift_mark) free_pages = zone_page_state_snapshot(z, NR_FREE_PAGES); - return __zone_watermark_ok(z, order, mark, classzone_idx, 0, + return __zone_watermark_ok(z, order, mark, highest_zoneidx, 0, free_pages); } @@ -3657,8 +3659,8 @@ retry: */ no_fallback = alloc_flags & ALLOC_NOFRAGMENT; z = ac->preferred_zoneref; - for_next_zone_zonelist_nodemask(zone, z, ac->zonelist, ac->high_zoneidx, - ac->nodemask) { + for_next_zone_zonelist_nodemask(zone, z, ac->zonelist, + ac->highest_zoneidx, ac->nodemask) { struct page *page; unsigned long mark; @@ -3713,7 +3715,7 @@ retry: mark = wmark_pages(zone, alloc_flags & ALLOC_WMARK_MASK); if (!zone_watermark_fast(zone, order, mark, - ac_classzone_idx(ac), alloc_flags)) { + ac->highest_zoneidx, alloc_flags)) { int ret; #ifdef CONFIG_DEFERRED_STRUCT_PAGE_INIT @@ -3746,7 +3748,7 @@ retry: default: /* did we reclaim enough */ if (zone_watermark_ok(zone, order, mark, - ac_classzone_idx(ac), alloc_flags)) + ac->highest_zoneidx, alloc_flags)) goto try_this_zone; continue; @@ -3905,7 +3907,7 @@ __alloc_pages_may_oom(gfp_t gfp_mask, unsigned int order, if (gfp_mask & __GFP_RETRY_MAYFAIL) goto out; /* The OOM killer does not needlessly kill tasks for lowmem */ - if (ac->high_zoneidx < ZONE_NORMAL) + if (ac->highest_zoneidx < ZONE_NORMAL) goto out; if (pm_suspended_storage()) goto out; @@ -4108,10 +4110,10 @@ should_compact_retry(struct alloc_context *ac, unsigned int order, int alloc_fla * Let's give them a good hope and keep retrying while the order-0 * watermarks are OK. 
*/ - for_each_zone_zonelist_nodemask(zone, z, ac->zonelist, ac->high_zoneidx, - ac->nodemask) { + for_each_zone_zonelist_nodemask(zone, z, ac->zonelist, + ac->highest_zoneidx, ac->nodemask) { if (zone_watermark_ok(zone, 0, min_wmark_pages(zone), - ac_classzone_idx(ac), alloc_flags)) + ac->highest_zoneidx, alloc_flags)) return true; } return false; @@ -4235,12 +4237,12 @@ static void wake_all_kswapds(unsigned int order, gfp_t gfp_mask, struct zoneref *z; struct zone *zone; pg_data_t *last_pgdat = NULL; - enum zone_type high_zoneidx = ac->high_zoneidx; + enum zone_type highest_zoneidx = ac->highest_zoneidx; - for_each_zone_zonelist_nodemask(zone, z, ac->zonelist, high_zoneidx, + for_each_zone_zonelist_nodemask(zone, z, ac->zonelist, highest_zoneidx, ac->nodemask) { if (last_pgdat != zone->zone_pgdat) - wakeup_kswapd(zone, gfp_mask, order, high_zoneidx); + wakeup_kswapd(zone, gfp_mask, order, highest_zoneidx); last_pgdat = zone->zone_pgdat; } } @@ -4375,8 +4377,8 @@ should_reclaim_retry(gfp_t gfp_mask, unsigned order, * request even if all reclaimable pages are considered then we are * screwed and have to go OOM. */ - for_each_zone_zonelist_nodemask(zone, z, ac->zonelist, ac->high_zoneidx, - ac->nodemask) { + for_each_zone_zonelist_nodemask(zone, z, ac->zonelist, + ac->highest_zoneidx, ac->nodemask) { unsigned long available; unsigned long reclaimable; unsigned long min_wmark = min_wmark_pages(zone); @@ -4390,7 +4392,7 @@ should_reclaim_retry(gfp_t gfp_mask, unsigned order, * reclaimable pages? */ wmark = __zone_watermark_ok(zone, order, min_wmark, - ac_classzone_idx(ac), alloc_flags, available); + ac->highest_zoneidx, alloc_flags, available); trace_reclaim_retry_zone(z, order, reclaimable, available, min_wmark, *no_progress_loops, wmark); if (wmark) { @@ -4509,7 +4511,7 @@ retry_cpuset: * could end up iterating over non-eligible zones endlessly. */ ac->preferred_zoneref = first_zones_zonelist(ac->zonelist, - ac->high_zoneidx, ac->nodemask); + ac->highest_zoneidx, ac->nodemask); if (!ac->preferred_zoneref->zone) goto nopage; @@ -4596,7 +4598,7 @@ retry: if (!(alloc_flags & ALLOC_CPUSET) || reserve_flags) { ac->nodemask = NULL; ac->preferred_zoneref = first_zones_zonelist(ac->zonelist, - ac->high_zoneidx, ac->nodemask); + ac->highest_zoneidx, ac->nodemask); } /* Attempt with potentially adjusted zonelist and alloc_flags */ @@ -4730,7 +4732,7 @@ static inline bool prepare_alloc_pages(gfp_t gfp_mask, unsigned int order, struct alloc_context *ac, gfp_t *alloc_mask, unsigned int *alloc_flags) { - ac->high_zoneidx = gfp_zone(gfp_mask); + ac->highest_zoneidx = gfp_zone(gfp_mask); ac->zonelist = node_zonelist(preferred_nid, gfp_mask); ac->nodemask = nodemask; ac->migratetype = gfpflags_to_migratetype(gfp_mask); @@ -4769,7 +4771,7 @@ static inline void finalise_ac(gfp_t gfp_mask, struct alloc_context *ac) * may get reset for allocations that ignore memory policies. 
*/ ac->preferred_zoneref = first_zones_zonelist(ac->zonelist, - ac->high_zoneidx, ac->nodemask); + ac->highest_zoneidx, ac->nodemask); } /* @@ -6867,7 +6869,7 @@ static void __init free_area_init_node(int nid) unsigned long end_pfn = 0; /* pg_data_t should be reset to zero when it's allocated */ - WARN_ON(pgdat->nr_zones || pgdat->kswapd_classzone_idx); + WARN_ON(pgdat->nr_zones || pgdat->kswapd_highest_zoneidx); get_pfn_range_for_nid(nid, &start_pfn, &end_pfn); diff --git a/mm/slab.c b/mm/slab.c index a89633603b2d..9350062ffc1a 100644 --- a/mm/slab.c +++ b/mm/slab.c @@ -3106,7 +3106,7 @@ static void *fallback_alloc(struct kmem_cache *cache, gfp_t flags) struct zonelist *zonelist; struct zoneref *z; struct zone *zone; - enum zone_type high_zoneidx = gfp_zone(flags); + enum zone_type highest_zoneidx = gfp_zone(flags); void *obj = NULL; struct page *page; int nid; @@ -3124,7 +3124,7 @@ retry: * Look through allowed nodes for objects available * from existing per node queues. */ - for_each_zone_zonelist(zone, z, zonelist, high_zoneidx) { + for_each_zone_zonelist(zone, z, zonelist, highest_zoneidx) { nid = zone_to_nid(zone); if (cpuset_zone_allowed(zone, flags) && diff --git a/mm/slub.c b/mm/slub.c index 527209d63278..d52487919278 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -1938,7 +1938,7 @@ static void *get_any_partial(struct kmem_cache *s, gfp_t flags, struct zonelist *zonelist; struct zoneref *z; struct zone *zone; - enum zone_type high_zoneidx = gfp_zone(flags); + enum zone_type highest_zoneidx = gfp_zone(flags); void *object; unsigned int cpuset_mems_cookie; @@ -1967,7 +1967,7 @@ static void *get_any_partial(struct kmem_cache *s, gfp_t flags, do { cpuset_mems_cookie = read_mems_allowed_begin(); zonelist = node_zonelist(mempolicy_slab_node(), flags); - for_each_zone_zonelist(zone, z, zonelist, high_zoneidx) { + for_each_zone_zonelist(zone, z, zonelist, highest_zoneidx) { struct kmem_cache_node *n; n = get_node(s, zone_to_nid(zone)); diff --git a/mm/vmscan.c b/mm/vmscan.c index b2f5deb3603c..18bfbee9a581 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -3131,8 +3131,8 @@ static bool allow_direct_reclaim(pg_data_t *pgdat) /* kswapd must be awake if processes are being throttled */ if (!wmark_ok && waitqueue_active(&pgdat->kswapd_wait)) { - if (READ_ONCE(pgdat->kswapd_classzone_idx) > ZONE_NORMAL) - WRITE_ONCE(pgdat->kswapd_classzone_idx, ZONE_NORMAL); + if (READ_ONCE(pgdat->kswapd_highest_zoneidx) > ZONE_NORMAL) + WRITE_ONCE(pgdat->kswapd_highest_zoneidx, ZONE_NORMAL); wake_up_interruptible(&pgdat->kswapd_wait); } @@ -3385,7 +3385,7 @@ static void age_active_anon(struct pglist_data *pgdat, } while (memcg); } -static bool pgdat_watermark_boosted(pg_data_t *pgdat, int classzone_idx) +static bool pgdat_watermark_boosted(pg_data_t *pgdat, int highest_zoneidx) { int i; struct zone *zone; @@ -3397,7 +3397,7 @@ static bool pgdat_watermark_boosted(pg_data_t *pgdat, int classzone_idx) * start prematurely when there is no boosting and a lower * zone is balanced. 
*/ - for (i = classzone_idx; i >= 0; i--) { + for (i = highest_zoneidx; i >= 0; i--) { zone = pgdat->node_zones + i; if (!managed_zone(zone)) continue; @@ -3411,9 +3411,9 @@ static bool pgdat_watermark_boosted(pg_data_t *pgdat, int classzone_idx) /* * Returns true if there is an eligible zone balanced for the request order - * and classzone_idx + * and highest_zoneidx */ -static bool pgdat_balanced(pg_data_t *pgdat, int order, int classzone_idx) +static bool pgdat_balanced(pg_data_t *pgdat, int order, int highest_zoneidx) { int i; unsigned long mark = -1; @@ -3423,19 +3423,19 @@ static bool pgdat_balanced(pg_data_t *pgdat, int order, int classzone_idx) * Check watermarks bottom-up as lower zones are more likely to * meet watermarks. */ - for (i = 0; i <= classzone_idx; i++) { + for (i = 0; i <= highest_zoneidx; i++) { zone = pgdat->node_zones + i; if (!managed_zone(zone)) continue; mark = high_wmark_pages(zone); - if (zone_watermark_ok_safe(zone, order, mark, classzone_idx)) + if (zone_watermark_ok_safe(zone, order, mark, highest_zoneidx)) return true; } /* - * If a node has no populated zone within classzone_idx, it does not + * If a node has no populated zone within highest_zoneidx, it does not * need balancing by definition. This can happen if a zone-restricted * allocation tries to wake a remote kswapd. */ @@ -3461,7 +3461,8 @@ static void clear_pgdat_congested(pg_data_t *pgdat) * * Returns true if kswapd is ready to sleep */ -static bool prepare_kswapd_sleep(pg_data_t *pgdat, int order, int classzone_idx) +static bool prepare_kswapd_sleep(pg_data_t *pgdat, int order, + int highest_zoneidx) { /* * The throttled processes are normally woken up in balance_pgdat() as @@ -3483,7 +3484,7 @@ static bool prepare_kswapd_sleep(pg_data_t *pgdat, int order, int classzone_idx) if (pgdat->kswapd_failures >= MAX_RECLAIM_RETRIES) return true; - if (pgdat_balanced(pgdat, order, classzone_idx)) { + if (pgdat_balanced(pgdat, order, highest_zoneidx)) { clear_pgdat_congested(pgdat); return true; } @@ -3547,7 +3548,7 @@ static bool kswapd_shrink_node(pg_data_t *pgdat, * or lower is eligible for reclaim until at least one usable zone is * balanced. */ -static int balance_pgdat(pg_data_t *pgdat, int order, int classzone_idx) +static int balance_pgdat(pg_data_t *pgdat, int order, int highest_zoneidx) { int i; unsigned long nr_soft_reclaimed; @@ -3575,7 +3576,7 @@ static int balance_pgdat(pg_data_t *pgdat, int order, int classzone_idx) * stall or direct reclaim until kswapd is finished. */ nr_boost_reclaim = 0; - for (i = 0; i <= classzone_idx; i++) { + for (i = 0; i <= highest_zoneidx; i++) { zone = pgdat->node_zones + i; if (!managed_zone(zone)) continue; @@ -3593,7 +3594,7 @@ restart: bool balanced; bool ret; - sc.reclaim_idx = classzone_idx; + sc.reclaim_idx = highest_zoneidx; /* * If the number of buffer_heads exceeds the maximum allowed @@ -3623,7 +3624,7 @@ restart: * on the grounds that the normal reclaim should be enough to * re-evaluate if boosting is required when kswapd next wakes. */ - balanced = pgdat_balanced(pgdat, sc.order, classzone_idx); + balanced = pgdat_balanced(pgdat, sc.order, highest_zoneidx); if (!balanced && nr_boost_reclaim) { nr_boost_reclaim = 0; goto restart; @@ -3723,7 +3724,7 @@ out: if (boosted) { unsigned long flags; - for (i = 0; i <= classzone_idx; i++) { + for (i = 0; i <= highest_zoneidx; i++) { if (!zone_boosts[i]) continue; @@ -3738,7 +3739,7 @@ out: * As there is now likely space, wakeup kcompact to defragment * pageblocks. 
*/ - wakeup_kcompactd(pgdat, pageblock_order, classzone_idx); + wakeup_kcompactd(pgdat, pageblock_order, highest_zoneidx); } snapshot_refaults(NULL, pgdat); @@ -3756,22 +3757,22 @@ out: } /* - * The pgdat->kswapd_classzone_idx is used to pass the highest zone index to be - * reclaimed by kswapd from the waker. If the value is MAX_NR_ZONES which is not - * a valid index then either kswapd runs for first time or kswapd couldn't sleep - * after previous reclaim attempt (node is still unbalanced). In that case - * return the zone index of the previous kswapd reclaim cycle. + * The pgdat->kswapd_highest_zoneidx is used to pass the highest zone index to + * be reclaimed by kswapd from the waker. If the value is MAX_NR_ZONES which is + * not a valid index then either kswapd runs for first time or kswapd couldn't + * sleep after previous reclaim attempt (node is still unbalanced). In that + * case return the zone index of the previous kswapd reclaim cycle. */ -static enum zone_type kswapd_classzone_idx(pg_data_t *pgdat, - enum zone_type prev_classzone_idx) +static enum zone_type kswapd_highest_zoneidx(pg_data_t *pgdat, + enum zone_type prev_highest_zoneidx) { - enum zone_type curr_idx = READ_ONCE(pgdat->kswapd_classzone_idx); + enum zone_type curr_idx = READ_ONCE(pgdat->kswapd_highest_zoneidx); - return curr_idx == MAX_NR_ZONES ? prev_classzone_idx : curr_idx; + return curr_idx == MAX_NR_ZONES ? prev_highest_zoneidx : curr_idx; } static void kswapd_try_to_sleep(pg_data_t *pgdat, int alloc_order, int reclaim_order, - unsigned int classzone_idx) + unsigned int highest_zoneidx) { long remaining = 0; DEFINE_WAIT(wait); @@ -3788,7 +3789,7 @@ static void kswapd_try_to_sleep(pg_data_t *pgdat, int alloc_order, int reclaim_o * eligible zone balanced that it's also unlikely that compaction will * succeed. */ - if (prepare_kswapd_sleep(pgdat, reclaim_order, classzone_idx)) { + if (prepare_kswapd_sleep(pgdat, reclaim_order, highest_zoneidx)) { /* * Compaction records what page blocks it recently failed to * isolate pages from and skips them in the future scanning. @@ -3801,18 +3802,19 @@ static void kswapd_try_to_sleep(pg_data_t *pgdat, int alloc_order, int reclaim_o * We have freed the memory, now we should compact it to make * allocation of the requested order possible. */ - wakeup_kcompactd(pgdat, alloc_order, classzone_idx); + wakeup_kcompactd(pgdat, alloc_order, highest_zoneidx); remaining = schedule_timeout(HZ/10); /* - * If woken prematurely then reset kswapd_classzone_idx and + * If woken prematurely then reset kswapd_highest_zoneidx and * order. The values will either be from a wakeup request or * the previous request that slept prematurely. */ if (remaining) { - WRITE_ONCE(pgdat->kswapd_classzone_idx, - kswapd_classzone_idx(pgdat, classzone_idx)); + WRITE_ONCE(pgdat->kswapd_highest_zoneidx, + kswapd_highest_zoneidx(pgdat, + highest_zoneidx)); if (READ_ONCE(pgdat->kswapd_order) < reclaim_order) WRITE_ONCE(pgdat->kswapd_order, reclaim_order); @@ -3827,7 +3829,7 @@ static void kswapd_try_to_sleep(pg_data_t *pgdat, int alloc_order, int reclaim_o * go fully to sleep until explicitly woken up. 
*/ if (!remaining && - prepare_kswapd_sleep(pgdat, reclaim_order, classzone_idx)) { + prepare_kswapd_sleep(pgdat, reclaim_order, highest_zoneidx)) { trace_mm_vmscan_kswapd_sleep(pgdat->node_id); /* @@ -3869,7 +3871,7 @@ static void kswapd_try_to_sleep(pg_data_t *pgdat, int alloc_order, int reclaim_o static int kswapd(void *p) { unsigned int alloc_order, reclaim_order; - unsigned int classzone_idx = MAX_NR_ZONES - 1; + unsigned int highest_zoneidx = MAX_NR_ZONES - 1; pg_data_t *pgdat = (pg_data_t*)p; struct task_struct *tsk = current; const struct cpumask *cpumask = cpumask_of_node(pgdat->node_id); @@ -3893,22 +3895,24 @@ static int kswapd(void *p) set_freezable(); WRITE_ONCE(pgdat->kswapd_order, 0); - WRITE_ONCE(pgdat->kswapd_classzone_idx, MAX_NR_ZONES); + WRITE_ONCE(pgdat->kswapd_highest_zoneidx, MAX_NR_ZONES); for ( ; ; ) { bool ret; alloc_order = reclaim_order = READ_ONCE(pgdat->kswapd_order); - classzone_idx = kswapd_classzone_idx(pgdat, classzone_idx); + highest_zoneidx = kswapd_highest_zoneidx(pgdat, + highest_zoneidx); kswapd_try_sleep: kswapd_try_to_sleep(pgdat, alloc_order, reclaim_order, - classzone_idx); + highest_zoneidx); - /* Read the new order and classzone_idx */ + /* Read the new order and highest_zoneidx */ alloc_order = reclaim_order = READ_ONCE(pgdat->kswapd_order); - classzone_idx = kswapd_classzone_idx(pgdat, classzone_idx); + highest_zoneidx = kswapd_highest_zoneidx(pgdat, + highest_zoneidx); WRITE_ONCE(pgdat->kswapd_order, 0); - WRITE_ONCE(pgdat->kswapd_classzone_idx, MAX_NR_ZONES); + WRITE_ONCE(pgdat->kswapd_highest_zoneidx, MAX_NR_ZONES); ret = try_to_freeze(); if (kthread_should_stop()) @@ -3929,9 +3933,10 @@ kswapd_try_sleep: * but kcompactd is woken to compact for the original * request (alloc_order). */ - trace_mm_vmscan_kswapd_wake(pgdat->node_id, classzone_idx, + trace_mm_vmscan_kswapd_wake(pgdat->node_id, highest_zoneidx, alloc_order); - reclaim_order = balance_pgdat(pgdat, alloc_order, classzone_idx); + reclaim_order = balance_pgdat(pgdat, alloc_order, + highest_zoneidx); if (reclaim_order < alloc_order) goto kswapd_try_sleep; } @@ -3949,7 +3954,7 @@ kswapd_try_sleep: * needed. */ void wakeup_kswapd(struct zone *zone, gfp_t gfp_flags, int order, - enum zone_type classzone_idx) + enum zone_type highest_zoneidx) { pg_data_t *pgdat; enum zone_type curr_idx; @@ -3961,10 +3966,10 @@ void wakeup_kswapd(struct zone *zone, gfp_t gfp_flags, int order, return; pgdat = zone->zone_pgdat; - curr_idx = READ_ONCE(pgdat->kswapd_classzone_idx); + curr_idx = READ_ONCE(pgdat->kswapd_highest_zoneidx); - if (curr_idx == MAX_NR_ZONES || curr_idx < classzone_idx) - WRITE_ONCE(pgdat->kswapd_classzone_idx, classzone_idx); + if (curr_idx == MAX_NR_ZONES || curr_idx < highest_zoneidx) + WRITE_ONCE(pgdat->kswapd_highest_zoneidx, highest_zoneidx); if (READ_ONCE(pgdat->kswapd_order) < order) WRITE_ONCE(pgdat->kswapd_order, order); @@ -3974,8 +3979,8 @@ void wakeup_kswapd(struct zone *zone, gfp_t gfp_flags, int order, /* Hopeless node, leave it to direct reclaim if possible */ if (pgdat->kswapd_failures >= MAX_RECLAIM_RETRIES || - (pgdat_balanced(pgdat, order, classzone_idx) && - !pgdat_watermark_boosted(pgdat, classzone_idx))) { + (pgdat_balanced(pgdat, order, highest_zoneidx) && + !pgdat_watermark_boosted(pgdat, highest_zoneidx))) { /* * There may be plenty of free memory available, but it's too * fragmented for high-order allocations. Wake up kcompactd @@ -3984,11 +3989,11 @@ void wakeup_kswapd(struct zone *zone, gfp_t gfp_flags, int order, * ratelimit its work. 
*/ if (!(gfp_flags & __GFP_DIRECT_RECLAIM)) - wakeup_kcompactd(pgdat, order, classzone_idx); + wakeup_kcompactd(pgdat, order, highest_zoneidx); return; } - trace_mm_vmscan_wakeup_kswapd(pgdat->node_id, classzone_idx, order, + trace_mm_vmscan_wakeup_kswapd(pgdat->node_id, highest_zoneidx, order, gfp_flags); wake_up_interruptible(&pgdat->kswapd_wait); } -- cgit v1.2.3 From 01c0bfe061f309b848d51619f20495ee2acd7727 Mon Sep 17 00:00:00 2001 From: Wei Yang Date: Wed, 3 Jun 2020 15:59:08 -0700 Subject: mm: rename gfpflags_to_migratetype to gfp_migratetype for same convention Pageblock migrate type is encoded in GFP flags, just as zone_type and zonelist. Currently we use gfp_zone() and gfp_zonelist() to extract related information, it would be proper to use the same naming convention for migrate type. Signed-off-by: Wei Yang Signed-off-by: Andrew Morton Reviewed-by: Pankaj Gupta Link: http://lkml.kernel.org/r/20200329080823.7735-1-richard.weiyang@gmail.com Signed-off-by: Linus Torvalds --- include/linux/gfp.h | 2 +- mm/compaction.c | 2 +- mm/page_alloc.c | 4 ++-- mm/page_owner.c | 7 +++---- 4 files changed, 7 insertions(+), 8 deletions(-) (limited to 'include/linux') diff --git a/include/linux/gfp.h b/include/linux/gfp.h index fab6d486cbb7..67a0774e080b 100644 --- a/include/linux/gfp.h +++ b/include/linux/gfp.h @@ -312,7 +312,7 @@ struct vm_area_struct; #define GFP_MOVABLE_MASK (__GFP_RECLAIMABLE|__GFP_MOVABLE) #define GFP_MOVABLE_SHIFT 3 -static inline int gfpflags_to_migratetype(const gfp_t gfp_flags) +static inline int gfp_migratetype(const gfp_t gfp_flags) { VM_WARN_ON((gfp_flags & GFP_MOVABLE_MASK) == GFP_MOVABLE_MASK); BUILD_BUG_ON((1UL << GFP_MOVABLE_SHIFT) != ___GFP_MOVABLE); diff --git a/mm/compaction.c b/mm/compaction.c index 883355de4ace..5e3e3a972cd2 100644 --- a/mm/compaction.c +++ b/mm/compaction.c @@ -2100,7 +2100,7 @@ compact_zone(struct compact_control *cc, struct capture_control *capc) INIT_LIST_HEAD(&cc->freepages); INIT_LIST_HEAD(&cc->migratepages); - cc->migratetype = gfpflags_to_migratetype(cc->gfp_mask); + cc->migratetype = gfp_migratetype(cc->gfp_mask); ret = compaction_suitable(cc->zone, cc->order, cc->alloc_flags, cc->highest_zoneidx); /* Compaction is likely to fail */ diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 818c2644a200..2bd8d6893b3f 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -4285,7 +4285,7 @@ gfp_to_alloc_flags(gfp_t gfp_mask) alloc_flags |= ALLOC_HARDER; #ifdef CONFIG_CMA - if (gfpflags_to_migratetype(gfp_mask) == MIGRATE_MOVABLE) + if (gfp_migratetype(gfp_mask) == MIGRATE_MOVABLE) alloc_flags |= ALLOC_CMA; #endif return alloc_flags; @@ -4735,7 +4735,7 @@ static inline bool prepare_alloc_pages(gfp_t gfp_mask, unsigned int order, ac->highest_zoneidx = gfp_zone(gfp_mask); ac->zonelist = node_zonelist(preferred_nid, gfp_mask); ac->nodemask = nodemask; - ac->migratetype = gfpflags_to_migratetype(gfp_mask); + ac->migratetype = gfp_migratetype(gfp_mask); if (cpusets_enabled()) { *alloc_mask |= __GFP_HARDWALL; diff --git a/mm/page_owner.c b/mm/page_owner.c index 18ecde9f45b2..360461509423 100644 --- a/mm/page_owner.c +++ b/mm/page_owner.c @@ -312,8 +312,7 @@ void pagetypeinfo_showmixedcount_print(struct seq_file *m, continue; page_owner = get_page_owner(page_ext); - page_mt = gfpflags_to_migratetype( - page_owner->gfp_mask); + page_mt = gfp_migratetype(page_owner->gfp_mask); if (pageblock_mt != page_mt) { if (is_migrate_cma(pageblock_mt)) count[MIGRATE_MOVABLE]++; @@ -359,7 +358,7 @@ print_page_owner(char __user *buf, size_t count, unsigned long pfn, 
/* Print information relevant to grouping pages by mobility */ pageblock_mt = get_pageblock_migratetype(page); - page_mt = gfpflags_to_migratetype(page_owner->gfp_mask); + page_mt = gfp_migratetype(page_owner->gfp_mask); ret += snprintf(kbuf + ret, count - ret, "PFN %lu type %s Block %lu type %s Flags %#lx(%pGp)\n", pfn, @@ -416,7 +415,7 @@ void __dump_page_owner(struct page *page) page_owner = get_page_owner(page_ext); gfp_mask = page_owner->gfp_mask; - mt = gfpflags_to_migratetype(gfp_mask); + mt = gfp_migratetype(gfp_mask); if (!test_bit(PAGE_EXT_OWNER, &page_ext->flags)) { pr_alert("page_owner info is not present (never set?)\n"); -- cgit v1.2.3 From ae70eddd5633fc71dccf210f237c5aefc96f4332 Mon Sep 17 00:00:00 2001 From: Anshuman Khandual Date: Wed, 3 Jun 2020 15:59:17 -0700 Subject: mm/page_alloc: restrict and formalize compound_page_dtors[] Restrict elements in the compound_page_dtors[] array per NR_COMPOUND_DTORS and explicitly position them according to enum compound_dtor_id. This improves protection against possible misalignment between compound_page_dtors[] and enum compound_dtor_id later on. Signed-off-by: Anshuman Khandual Signed-off-by: Andrew Morton Reviewed-by: David Hildenbrand Link: http://lkml.kernel.org/r/1589795958-19317-1-git-send-email-anshuman.khandual@arm.com Signed-off-by: Linus Torvalds --- include/linux/mm.h | 2 +- mm/page_alloc.c | 10 +++++----- 2 files changed, 6 insertions(+), 6 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mm.h b/include/linux/mm.h index 4141ebcb3a65..32f3c17715ac 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -867,7 +867,7 @@ enum compound_dtor_id { #endif NR_COMPOUND_DTORS, }; -extern compound_page_dtor * const compound_page_dtors[]; +extern compound_page_dtor * const compound_page_dtors[NR_COMPOUND_DTORS]; static inline void set_compound_page_dtor(struct page *page, enum compound_dtor_id compound_dtor) diff --git a/mm/page_alloc.c b/mm/page_alloc.c index d95442f7e478..045c4aeeec9a 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -302,14 +302,14 @@ const char * const migratetype_names[MIGRATE_TYPES] = { #endif }; -compound_page_dtor * const compound_page_dtors[] = { - NULL, - free_compound_page, +compound_page_dtor * const compound_page_dtors[NR_COMPOUND_DTORS] = { + [NULL_COMPOUND_DTOR] = NULL, + [COMPOUND_PAGE_DTOR] = free_compound_page, +#ifdef CONFIG_HUGETLB_PAGE - free_huge_page, + [HUGETLB_PAGE_DTOR] = free_huge_page, #endif #ifdef CONFIG_TRANSPARENT_HUGEPAGE - free_transhuge_page, + [TRANSHUGE_PAGE_DTOR] = free_transhuge_page, #endif }; -- cgit v1.2.3 From 3d060856adfc59afb9d029c233141334cfaba418 Mon Sep 17 00:00:00 2001 From: Pavel Tatashin Date: Wed, 3 Jun 2020 15:59:24 -0700 Subject: mm: initialize deferred pages with interrupts enabled Initializing struct pages is a long task and keeping interrupts disabled for the duration of this operation introduces a number of problems. 1. jiffies are not updated for a long period of time, and thus incorrect time is reported. See the proposed solution and discussion here: lkml/20200311123848.118638-1-shile.zhang@linux.alibaba.com 2. It prevents further improvement of deferred page initialization through intra-node multi-threading. We are keeping interrupts disabled to solve a rather theoretical problem that was never observed in the real world (see 3a2d7fa8a3d5). Let's keep interrupts enabled.
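A minimal sketch of the resulting flow (the real hunks follow below; the long init loop is folded into a hypothetical helper):

	static int __init deferred_init_memmap_sketch(pg_data_t *pgdat)
	{
		unsigned long first_init_pfn;
		unsigned long flags;

		/* Short critical section; interrupts are off (irqsave). */
		pgdat_resize_lock(pgdat, &flags);
		first_init_pfn = pgdat->first_deferred_pfn;
		pgdat->first_deferred_pfn = ULONG_MAX;

		/*
		 * Drop the lock before the heavy work. From here on the
		 * zone cannot be grown, so an interrupt thread that needs
		 * memory this early must rely on a pre-grown zone.
		 */
		pgdat_resize_unlock(pgdat, &flags);

		/* The long-running loop now runs with interrupts enabled. */
		init_deferred_struct_pages(pgdat, first_init_pfn); /* hypothetical */

		return 0;
	}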
In case we ever encounter a scenario where an interrupt thread wants to allocate a large amount of memory this early in boot, we can deal with that by growing the zone (see deferred_grow_zone()) by the needed amount before starting deferred_init_memmap() threads. Before: [ 1.232459] node 0 initialised, 12058412 pages in 1ms After: [ 1.632580] node 0 initialised, 12051227 pages in 436ms Fixes: 3a2d7fa8a3d5 ("mm: disable interrupts while initializing deferred pages") Reported-by: Shile Zhang Signed-off-by: Pavel Tatashin Signed-off-by: Andrew Morton Reviewed-by: Daniel Jordan Reviewed-by: David Hildenbrand Acked-by: Michal Hocko Acked-by: Vlastimil Babka Cc: Dan Williams Cc: James Morris Cc: Kirill Tkhai Cc: Sasha Levin Cc: Yiqian Wei Cc: [4.17+] Link: http://lkml.kernel.org/r/20200403140952.17177-3-pasha.tatashin@soleen.com Signed-off-by: Linus Torvalds --- include/linux/mmzone.h | 2 ++ mm/page_alloc.c | 20 +++++++------------- 2 files changed, 9 insertions(+), 13 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index cd8bd5f90552..2f79ff4477ba 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -680,6 +680,8 @@ typedef struct pglist_data { /* * Must be held any time you expect node_start_pfn, * node_present_pages, node_spanned_pages or nr_zones to stay constant. + * Also synchronizes pgdat->first_deferred_pfn during deferred page + * init. * * pgdat_resize_lock() and pgdat_resize_unlock() are provided to * manipulate node_size_lock without checking for CONFIG_MEMORY_HOTPLUG diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 148cf9a73f0b..c75561a3f144 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -1844,6 +1844,13 @@ static int __init deferred_init_memmap(void *data) BUG_ON(pgdat->first_deferred_pfn > pgdat_end_pfn(pgdat)); pgdat->first_deferred_pfn = ULONG_MAX; + /* + * Once we unlock here, the zone cannot be grown anymore, thus if an + * interrupt thread must allocate this early in boot, zone must be + * pre-grown prior to start of deferred page initialization. + */ + pgdat_resize_unlock(pgdat, &flags); + /* Only the highest zone is deferred so find it */ for (zid = 0; zid < MAX_NR_ZONES; zid++) { zone = pgdat->node_zones + zid; @@ -1866,8 +1873,6 @@ static int __init deferred_init_memmap(void *data) touch_nmi_watchdog(); } zone_empty: - pgdat_resize_unlock(pgdat, &flags); - /* Sanity check that the next zone really is unpopulated */ WARN_ON(++zid < MAX_NR_ZONES && populated_zone(++zone)); @@ -1909,17 +1914,6 @@ deferred_grow_zone(struct zone *zone, unsigned int order) pgdat_resize_lock(pgdat, &flags); - /* - * If deferred pages have been initialized while we were waiting for - * the lock, return true, as the zone was grown. The caller will retry - * this zone. We won't return to this function since the caller also - * has this static branch. - */ - if (!static_branch_unlikely(&deferred_pages)) { - pgdat_resize_unlock(pgdat, &flags); - return true; - } - /* * If someone grew this zone while we were waiting for spinlock, return * true, as there might be enough pages already. -- cgit v1.2.3 From f1b192b117cd418bacf42a9583d7a01855a18fe5 Mon Sep 17 00:00:00 2001 From: Daniel Jordan Date: Wed, 3 Jun 2020 15:59:35 -0700 Subject: padata: initialize earlier padata will soon initialize the system's struct pages in parallel, so it needs to be ready by page_alloc_init_late(). The error return from padata_driver_init() triggers an initcall warning, so add a warning to padata_init() to avoid silent failure.
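Condensed from the hunks below, the conversion amounts to an explicit early call plus a stub for !CONFIG_PADATA kernels, so the call site needs no #ifdef:

	/* include/linux/padata.h */
	#ifdef CONFIG_PADATA
	extern void __init padata_init(void);
	#else
	static inline void __init padata_init(void) {}
	#endif

	/* init/main.c: kernel_init_freeable(), ordering is the point */
	smp_init();
	sched_init_smp();

	padata_init();		/* must be ready before deferred page init */
	page_alloc_init_late();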
Signed-off-by: Daniel Jordan Signed-off-by: Andrew Morton Tested-by: Josh Triplett Cc: Alexander Duyck Cc: Alex Williamson Cc: Dan Williams Cc: Dave Hansen Cc: David Hildenbrand Cc: Herbert Xu Cc: Jason Gunthorpe Cc: Jonathan Corbet Cc: Kirill Tkhai Cc: Michal Hocko Cc: Pavel Machek Cc: Pavel Tatashin Cc: Peter Zijlstra Cc: Randy Dunlap Cc: Robert Elliott Cc: Shile Zhang Cc: Steffen Klassert Cc: Steven Sistare Cc: Tejun Heo Cc: Zi Yan Link: http://lkml.kernel.org/r/20200527173608.2885243-3-daniel.m.jordan@oracle.com Signed-off-by: Linus Torvalds --- include/linux/padata.h | 6 ++++++ init/main.c | 2 ++ kernel/padata.c | 17 ++++++++--------- 3 files changed, 16 insertions(+), 9 deletions(-) (limited to 'include/linux') diff --git a/include/linux/padata.h b/include/linux/padata.h index 693cae9bfe66..d50bd1a7756f 100644 --- a/include/linux/padata.h +++ b/include/linux/padata.h @@ -166,6 +166,12 @@ struct padata_instance { #define PADATA_INVALID 4 }; +#ifdef CONFIG_PADATA +extern void __init padata_init(void); +#else +static inline void __init padata_init(void) {} +#endif + extern struct padata_instance *padata_alloc_possible(const char *name); extern void padata_free(struct padata_instance *pinst); extern struct padata_shell *padata_alloc_shell(struct padata_instance *pinst); diff --git a/init/main.c b/init/main.c index 03371976d387..df32f67214d2 100644 --- a/init/main.c +++ b/init/main.c @@ -63,6 +63,7 @@ #include #include #include +#include #include #include #include @@ -1482,6 +1483,7 @@ static noinline void __init kernel_init_freeable(void) smp_init(); sched_init_smp(); + padata_init(); page_alloc_init_late(); /* Initialize page ext after all struct pages are initialized. */ page_ext_init(); diff --git a/kernel/padata.c b/kernel/padata.c index b4c7e3e38a05..0c5f4f78f2c2 100644 --- a/kernel/padata.c +++ b/kernel/padata.c @@ -31,7 +31,6 @@ #include #include #include -#include #define MAX_OBJ_NUM 1000 @@ -1052,26 +1051,26 @@ void padata_free_shell(struct padata_shell *ps) } EXPORT_SYMBOL(padata_free_shell); -#ifdef CONFIG_HOTPLUG_CPU - -static __init int padata_driver_init(void) +void __init padata_init(void) { +#ifdef CONFIG_HOTPLUG_CPU int ret; ret = cpuhp_setup_state_multi(CPUHP_AP_ONLINE_DYN, "padata:online", padata_cpu_online, NULL); if (ret < 0) - return ret; + goto err; hp_online = ret; ret = cpuhp_setup_state_multi(CPUHP_PADATA_DEAD, "padata:dead", NULL, padata_cpu_dead); if (ret < 0) { cpuhp_remove_multi_state(hp_online); - return ret; + goto err; } - return 0; -} -module_init(padata_driver_init); + return; +err: + pr_warn("padata: initialization failed\n"); #endif +} -- cgit v1.2.3 From 4611ce22468895acd61fee9ac1da810d60617d9a Mon Sep 17 00:00:00 2001 From: Daniel Jordan Date: Wed, 3 Jun 2020 15:59:39 -0700 Subject: padata: allocate work structures for parallel jobs from a pool padata allocates per-CPU, per-instance work structs for parallel jobs. A do_parallel call assigns a job to a sequence number and hashes the number to a CPU, where the job will eventually run using the corresponding work. This approach fit with how padata used to bind a job to each CPU round-robin, makes less sense after commit bfde23ce200e6 ("padata: unbind parallel jobs from specific CPUs") because a work isn't bound to a particular CPU anymore, and isn't needed at all for multithreaded jobs because they don't have sequence numbers. Replace the per-CPU works with a preallocated pool, which allows sharing them between existing padata users and the upcoming multithreaded user. 
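Condensed from the hunks below, the pool is a spinlock-protected free list; when it runs dry, the submitter simply runs the job itself, which throttles a system trying to do too much in parallel:

	static DEFINE_SPINLOCK(padata_works_lock);
	static LIST_HEAD(padata_free_works);

	static struct padata_work *padata_work_alloc(void)
	{
		struct padata_work *pw;

		lockdep_assert_held(&padata_works_lock);

		if (list_empty(&padata_free_works))
			return NULL;	/* no more work items may be queued */

		pw = list_first_entry(&padata_free_works, struct padata_work,
				      pw_list);
		list_del(&pw->pw_list);
		return pw;
	}

	/* submission side, simplified: */
	if (pw) {
		padata_work_init(pw, padata_parallel_worker, padata);
		queue_work(pinst->parallel_wq, &pw->pw_work);
	} else {
		padata->parallel(padata);	/* pool empty: run inline */
	}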
The pool will also facilitate setting NUMA-aware concurrency limits with later users. The pool is sized according to the number of possible CPUs. With this limit, MAX_OBJ_NUM no longer makes sense, so remove it. If the global pool is exhausted, a parallel job is run in the current task instead to throttle a system trying to do too much in parallel. Signed-off-by: Daniel Jordan Signed-off-by: Andrew Morton Tested-by: Josh Triplett Cc: Alexander Duyck Cc: Alex Williamson Cc: Dan Williams Cc: Dave Hansen Cc: David Hildenbrand Cc: Herbert Xu Cc: Jason Gunthorpe Cc: Jonathan Corbet Cc: Kirill Tkhai Cc: Michal Hocko Cc: Pavel Machek Cc: Pavel Tatashin Cc: Peter Zijlstra Cc: Randy Dunlap Cc: Robert Elliott Cc: Shile Zhang Cc: Steffen Klassert Cc: Steven Sistare Cc: Tejun Heo Cc: Zi Yan Link: http://lkml.kernel.org/r/20200527173608.2885243-4-daniel.m.jordan@oracle.com Signed-off-by: Linus Torvalds --- include/linux/padata.h | 8 +--- kernel/padata.c | 118 ++++++++++++++++++++++++++++++++----------------- 2 files changed, 78 insertions(+), 48 deletions(-) (limited to 'include/linux') diff --git a/include/linux/padata.h b/include/linux/padata.h index d50bd1a7756f..b4da88f8588c 100644 --- a/include/linux/padata.h +++ b/include/linux/padata.h @@ -24,7 +24,6 @@ * @list: List entry, to attach to the padata lists. * @pd: Pointer to the internal control structure. * @cb_cpu: Callback cpu for serializatioon. - * @cpu: Cpu for parallelization. * @seq_nr: Sequence number of the parallelized data object. * @info: Used to pass information from the parallel to the serial function. * @parallel: Parallel execution function. @@ -34,7 +33,6 @@ struct padata_priv { struct list_head list; struct parallel_data *pd; int cb_cpu; - int cpu; unsigned int seq_nr; int info; void (*parallel)(struct padata_priv *padata); @@ -68,15 +66,11 @@ struct padata_serial_queue { /** * struct padata_parallel_queue - The percpu padata parallel queue * - * @parallel: List to wait for parallelization. * @reorder: List to wait for reordering after parallel processing. - * @work: work struct for parallelization. * @num_obj: Number of objects that are processed by this cpu. 
*/ struct padata_parallel_queue { - struct padata_list parallel; struct padata_list reorder; - struct work_struct work; atomic_t num_obj; }; @@ -111,7 +105,7 @@ struct parallel_data { struct padata_parallel_queue __percpu *pqueue; struct padata_serial_queue __percpu *squeue; atomic_t refcnt; - atomic_t seq_nr; + unsigned int seq_nr; unsigned int processed; int cpu; struct padata_cpumask cpumask; diff --git a/kernel/padata.c b/kernel/padata.c index 0c5f4f78f2c2..d779a73d4d92 100644 --- a/kernel/padata.c +++ b/kernel/padata.c @@ -32,7 +32,15 @@ #include #include -#define MAX_OBJ_NUM 1000 +struct padata_work { + struct work_struct pw_work; + struct list_head pw_list; /* padata_free_works linkage */ + void *pw_data; +}; + +static DEFINE_SPINLOCK(padata_works_lock); +static struct padata_work *padata_works; +static LIST_HEAD(padata_free_works); static void padata_free_pd(struct parallel_data *pd); @@ -58,30 +66,44 @@ static int padata_cpu_hash(struct parallel_data *pd, unsigned int seq_nr) return padata_index_to_cpu(pd, cpu_index); } -static void padata_parallel_worker(struct work_struct *parallel_work) +static struct padata_work *padata_work_alloc(void) { - struct padata_parallel_queue *pqueue; - LIST_HEAD(local_list); + struct padata_work *pw; - local_bh_disable(); - pqueue = container_of(parallel_work, - struct padata_parallel_queue, work); + lockdep_assert_held(&padata_works_lock); - spin_lock(&pqueue->parallel.lock); - list_replace_init(&pqueue->parallel.list, &local_list); - spin_unlock(&pqueue->parallel.lock); + if (list_empty(&padata_free_works)) + return NULL; /* No more work items allowed to be queued. */ - while (!list_empty(&local_list)) { - struct padata_priv *padata; + pw = list_first_entry(&padata_free_works, struct padata_work, pw_list); + list_del(&pw->pw_list); + return pw; +} - padata = list_entry(local_list.next, - struct padata_priv, list); +static void padata_work_init(struct padata_work *pw, work_func_t work_fn, + void *data) +{ + INIT_WORK(&pw->pw_work, work_fn); + pw->pw_data = data; +} - list_del_init(&padata->list); +static void padata_work_free(struct padata_work *pw) +{ + lockdep_assert_held(&padata_works_lock); + list_add(&pw->pw_list, &padata_free_works); +} - padata->parallel(padata); - } +static void padata_parallel_worker(struct work_struct *parallel_work) +{ + struct padata_work *pw = container_of(parallel_work, struct padata_work, + pw_work); + struct padata_priv *padata = pw->pw_data; + local_bh_disable(); + padata->parallel(padata); + spin_lock(&padata_works_lock); + padata_work_free(pw); + spin_unlock(&padata_works_lock); local_bh_enable(); } @@ -105,9 +127,9 @@ int padata_do_parallel(struct padata_shell *ps, struct padata_priv *padata, int *cb_cpu) { struct padata_instance *pinst = ps->pinst; - int i, cpu, cpu_index, target_cpu, err; - struct padata_parallel_queue *queue; + int i, cpu, cpu_index, err; struct parallel_data *pd; + struct padata_work *pw; rcu_read_lock_bh(); @@ -135,25 +157,25 @@ int padata_do_parallel(struct padata_shell *ps, if ((pinst->flags & PADATA_RESET)) goto out; - if (atomic_read(&pd->refcnt) >= MAX_OBJ_NUM) - goto out; - - err = 0; atomic_inc(&pd->refcnt); padata->pd = pd; padata->cb_cpu = *cb_cpu; - padata->seq_nr = atomic_inc_return(&pd->seq_nr); - target_cpu = padata_cpu_hash(pd, padata->seq_nr); - padata->cpu = target_cpu; - queue = per_cpu_ptr(pd->pqueue, target_cpu); - - spin_lock(&queue->parallel.lock); - list_add_tail(&padata->list, &queue->parallel.list); - spin_unlock(&queue->parallel.lock); + rcu_read_unlock_bh(); - 
queue_work(pinst->parallel_wq, &queue->work); + spin_lock(&padata_works_lock); + padata->seq_nr = ++pd->seq_nr; + pw = padata_work_alloc(); + spin_unlock(&padata_works_lock); + if (pw) { + padata_work_init(pw, padata_parallel_worker, padata); + queue_work(pinst->parallel_wq, &pw->pw_work); + } else { + /* Maximum works limit exceeded, run in the current task. */ + padata->parallel(padata); + } + return 0; out: rcu_read_unlock_bh(); @@ -324,8 +346,9 @@ static void padata_serial_worker(struct work_struct *serial_work) void padata_do_serial(struct padata_priv *padata) { struct parallel_data *pd = padata->pd; + int hashed_cpu = padata_cpu_hash(pd, padata->seq_nr); struct padata_parallel_queue *pqueue = per_cpu_ptr(pd->pqueue, - padata->cpu); + hashed_cpu); struct padata_priv *cur; spin_lock(&pqueue->reorder.lock); @@ -416,8 +439,6 @@ static void padata_init_pqueues(struct parallel_data *pd) pqueue = per_cpu_ptr(pd->pqueue, cpu); __padata_list_init(&pqueue->reorder); - __padata_list_init(&pqueue->parallel); - INIT_WORK(&pqueue->work, padata_parallel_worker); atomic_set(&pqueue->num_obj, 0); } } @@ -451,7 +472,7 @@ static struct parallel_data *padata_alloc_pd(struct padata_shell *ps) padata_init_pqueues(pd); padata_init_squeues(pd); - atomic_set(&pd->seq_nr, -1); + pd->seq_nr = -1; atomic_set(&pd->refcnt, 1); spin_lock_init(&pd->lock); pd->cpu = cpumask_first(pd->cpumask.pcpu); @@ -1053,6 +1074,7 @@ EXPORT_SYMBOL(padata_free_shell); void __init padata_init(void) { + unsigned int i, possible_cpus; #ifdef CONFIG_HOTPLUG_CPU int ret; @@ -1064,13 +1086,27 @@ void __init padata_init(void) ret = cpuhp_setup_state_multi(CPUHP_PADATA_DEAD, "padata:dead", NULL, padata_cpu_dead); - if (ret < 0) { - cpuhp_remove_multi_state(hp_online); - goto err; - } + if (ret < 0) + goto remove_online_state; +#endif + + possible_cpus = num_possible_cpus(); + padata_works = kmalloc_array(possible_cpus, sizeof(struct padata_work), + GFP_KERNEL); + if (!padata_works) + goto remove_dead_state; + + for (i = 0; i < possible_cpus; ++i) + list_add(&padata_works[i].pw_list, &padata_free_works); return; + +remove_dead_state: +#ifdef CONFIG_HOTPLUG_CPU + cpuhp_remove_multi_state(CPUHP_PADATA_DEAD); +remove_online_state: + cpuhp_remove_multi_state(hp_online); err: - pr_warn("padata: initialization failed\n"); #endif + pr_warn("padata: initialization failed\n"); } -- cgit v1.2.3 From 004ed42638f4428e70ead59d170f3d17ff761a0f Mon Sep 17 00:00:00 2001 From: Daniel Jordan Date: Wed, 3 Jun 2020 15:59:43 -0700 Subject: padata: add basic support for multithreaded jobs Sometimes the kernel doesn't take full advantage of system memory bandwidth, leading to a single CPU spending excessive time in initialization paths where the data scales with memory size. Multithreading naturally addresses this problem. Extend padata, a framework that handles many parallel yet singlethreaded jobs, to also handle multithreaded jobs by adding support for splitting up the work evenly, specifying a minimum amount of work that's appropriate for one helper thread to do, load balancing between helpers, and coordinating them. This is inspired by work from Pavel Tatashin and Steve Sistare. 
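A hedged usage sketch against the interface added below; the thread function, range units, and tuning values here are hypothetical, and real callers pass job-specific units (e.g. PFNs):

	static void __init init_chunk(unsigned long start, unsigned long end,
				      void *arg)
	{
		/* Process [start, end); may run concurrently on helpers. */
	}

	static void __init run_init_job(unsigned long start, unsigned long size)
	{
		struct padata_mt_job job = {
			.thread_fn   = init_chunk,
			.fn_arg      = NULL,
			.start       = start,
			.size        = size,
			.align       = 1,		/* no boundary needed */
			.min_chunk   = 1024,		/* hypothetical */
			.max_threads = num_online_cpus(),
		};

		padata_do_multithreaded(&job);
	}

Internally, chunks are sized at size / (nworks * 4), never below min_chunk, and rounded up to align (see padata_do_multithreaded() below).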
Signed-off-by: Daniel Jordan Signed-off-by: Andrew Morton Tested-by: Josh Triplett Cc: Alexander Duyck Cc: Alex Williamson Cc: Dan Williams Cc: Dave Hansen Cc: David Hildenbrand Cc: Herbert Xu Cc: Jason Gunthorpe Cc: Jonathan Corbet Cc: Kirill Tkhai Cc: Michal Hocko Cc: Pavel Machek Cc: Pavel Tatashin Cc: Peter Zijlstra Cc: Randy Dunlap Cc: Robert Elliott Cc: Shile Zhang Cc: Steffen Klassert Cc: Steven Sistare Cc: Tejun Heo Cc: Zi Yan Link: http://lkml.kernel.org/r/20200527173608.2885243-5-daniel.m.jordan@oracle.com Signed-off-by: Linus Torvalds --- include/linux/padata.h | 29 ++++++++++ kernel/padata.c | 152 ++++++++++++++++++++++++++++++++++++++++++++++++- 2 files changed, 178 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/padata.h b/include/linux/padata.h index b4da88f8588c..7302efff5e65 100644 --- a/include/linux/padata.h +++ b/include/linux/padata.h @@ -4,6 +4,9 @@ * * Copyright (C) 2008, 2009 secunet Security Networks AG * Copyright (C) 2008, 2009 Steffen Klassert + * + * Copyright (c) 2020 Oracle and/or its affiliates. + * Author: Daniel Jordan */ #ifndef PADATA_H @@ -130,6 +133,31 @@ struct padata_shell { struct list_head list; }; +/** + * struct padata_mt_job - represents one multithreaded job + * + * @thread_fn: Called for each chunk of work that a padata thread does. + * @fn_arg: The thread function argument. + * @start: The start of the job (units are job-specific). + * @size: size of this node's work (units are job-specific). + * @align: Ranges passed to the thread function fall on this boundary, with the + * possible exceptions of the beginning and end of the job. + * @min_chunk: The minimum chunk size in job-specific units. This allows + * the client to communicate the minimum amount of work that's + * appropriate for one worker thread to do at once. + * @max_threads: Max threads to use for the job, actual number may be less + * depending on task size and minimum chunk size. + */ +struct padata_mt_job { + void (*thread_fn)(unsigned long start, unsigned long end, void *arg); + void *fn_arg; + unsigned long start; + unsigned long size; + unsigned long align; + unsigned long min_chunk; + int max_threads; +}; + /** * struct padata_instance - The overall control structure. * @@ -173,6 +201,7 @@ extern void padata_free_shell(struct padata_shell *ps); extern int padata_do_parallel(struct padata_shell *ps, struct padata_priv *padata, int *cb_cpu); extern void padata_do_serial(struct padata_priv *padata); +extern void __init padata_do_multithreaded(struct padata_mt_job *job); extern int padata_set_cpumask(struct padata_instance *pinst, int cpumask_type, cpumask_var_t cpumask); extern int padata_start(struct padata_instance *pinst); diff --git a/kernel/padata.c b/kernel/padata.c index d779a73d4d92..29fc5d87a4cd 100644 --- a/kernel/padata.c +++ b/kernel/padata.c @@ -7,6 +7,9 @@ * Copyright (C) 2008, 2009 secunet Security Networks AG * Copyright (C) 2008, 2009 Steffen Klassert * + * Copyright (c) 2020 Oracle and/or its affiliates. + * Author: Daniel Jordan + * * This program is free software; you can redistribute it and/or modify it * under the terms and conditions of the GNU General Public License, * version 2, as published by the Free Software Foundation. @@ -21,6 +24,7 @@ * 51 Franklin St - Fifth Floor, Boston, MA 02110-1301 USA. 
*/ +#include #include #include #include @@ -32,6 +36,8 @@ #include #include +#define PADATA_WORK_ONSTACK 1 /* Work's memory is on stack */ + struct padata_work { struct work_struct pw_work; struct list_head pw_list; /* padata_free_works linkage */ @@ -42,7 +48,17 @@ static DEFINE_SPINLOCK(padata_works_lock); static struct padata_work *padata_works; static LIST_HEAD(padata_free_works); +struct padata_mt_job_state { + spinlock_t lock; + struct completion completion; + struct padata_mt_job *job; + int nworks; + int nworks_fini; + unsigned long chunk_size; +}; + static void padata_free_pd(struct parallel_data *pd); +static void __init padata_mt_helper(struct work_struct *work); static int padata_index_to_cpu(struct parallel_data *pd, int cpu_index) { @@ -81,18 +97,56 @@ static struct padata_work *padata_work_alloc(void) } static void padata_work_init(struct padata_work *pw, work_func_t work_fn, - void *data) + void *data, int flags) { - INIT_WORK(&pw->pw_work, work_fn); + if (flags & PADATA_WORK_ONSTACK) + INIT_WORK_ONSTACK(&pw->pw_work, work_fn); + else + INIT_WORK(&pw->pw_work, work_fn); pw->pw_data = data; } +static int __init padata_work_alloc_mt(int nworks, void *data, + struct list_head *head) +{ + int i; + + spin_lock(&padata_works_lock); + /* Start at 1 because the current task participates in the job. */ + for (i = 1; i < nworks; ++i) { + struct padata_work *pw = padata_work_alloc(); + + if (!pw) + break; + padata_work_init(pw, padata_mt_helper, data, 0); + list_add(&pw->pw_list, head); + } + spin_unlock(&padata_works_lock); + + return i; +} + static void padata_work_free(struct padata_work *pw) { lockdep_assert_held(&padata_works_lock); list_add(&pw->pw_list, &padata_free_works); } +static void __init padata_works_free(struct list_head *works) +{ + struct padata_work *cur, *next; + + if (list_empty(works)) + return; + + spin_lock(&padata_works_lock); + list_for_each_entry_safe(cur, next, works, pw_list) { + list_del(&cur->pw_list); + padata_work_free(cur); + } + spin_unlock(&padata_works_lock); +} + static void padata_parallel_worker(struct work_struct *parallel_work) { struct padata_work *pw = container_of(parallel_work, struct padata_work, @@ -168,7 +222,7 @@ int padata_do_parallel(struct padata_shell *ps, pw = padata_work_alloc(); spin_unlock(&padata_works_lock); if (pw) { - padata_work_init(pw, padata_parallel_worker, padata); + padata_work_init(pw, padata_parallel_worker, padata, 0); queue_work(pinst->parallel_wq, &pw->pw_work); } else { /* Maximum works limit exceeded, run in the current task. */ @@ -409,6 +463,98 @@ out: return err; } +static void __init padata_mt_helper(struct work_struct *w) +{ + struct padata_work *pw = container_of(w, struct padata_work, pw_work); + struct padata_mt_job_state *ps = pw->pw_data; + struct padata_mt_job *job = ps->job; + bool done; + + spin_lock(&ps->lock); + + while (job->size > 0) { + unsigned long start, size, end; + + start = job->start; + /* So end is chunk size aligned if enough work remains. */ + size = roundup(start + 1, ps->chunk_size) - start; + size = min(size, job->size); + end = start + size; + + job->start = end; + job->size -= size; + + spin_unlock(&ps->lock); + job->thread_fn(start, end, job->fn_arg); + spin_lock(&ps->lock); + } + + ++ps->nworks_fini; + done = (ps->nworks_fini == ps->nworks); + spin_unlock(&ps->lock); + + if (done) + complete(&ps->completion); +} + +/** + * padata_do_multithreaded - run a multithreaded job + * @job: Description of the job. + * + * See the definition of struct padata_mt_job for more details. 
+ */ +void __init padata_do_multithreaded(struct padata_mt_job *job) +{ + /* In case threads finish at different times. */ + static const unsigned long load_balance_factor = 4; + struct padata_work my_work, *pw; + struct padata_mt_job_state ps; + LIST_HEAD(works); + int nworks; + + if (job->size == 0) + return; + + /* Ensure at least one thread when size < min_chunk. */ + nworks = max(job->size / job->min_chunk, 1ul); + nworks = min(nworks, job->max_threads); + + if (nworks == 1) { + /* Single thread, no coordination needed, cut to the chase. */ + job->thread_fn(job->start, job->start + job->size, job->fn_arg); + return; + } + + spin_lock_init(&ps.lock); + init_completion(&ps.completion); + ps.job = job; + ps.nworks = padata_work_alloc_mt(nworks, &ps, &works); + ps.nworks_fini = 0; + + /* + * Chunk size is the amount of work a helper does per call to the + * thread function. Load balance large jobs between threads by + * increasing the number of chunks, guarantee at least the minimum + * chunk size from the caller, and honor the caller's alignment. + */ + ps.chunk_size = job->size / (ps.nworks * load_balance_factor); + ps.chunk_size = max(ps.chunk_size, job->min_chunk); + ps.chunk_size = roundup(ps.chunk_size, job->align); + + list_for_each_entry(pw, &works, pw_list) + queue_work(system_unbound_wq, &pw->pw_work); + + /* Use the current thread, which saves starting a workqueue worker. */ + padata_work_init(&my_work, padata_mt_helper, &ps, PADATA_WORK_ONSTACK); + padata_mt_helper(&my_work.pw_work); + + /* Wait for all the helpers to finish. */ + wait_for_completion(&ps.completion); + + destroy_work_on_stack(&my_work.pw_work); + padata_works_free(&works); +} + static void __padata_list_init(struct padata_list *pd_list) { INIT_LIST_HEAD(&pd_list->list); -- cgit v1.2.3 From ecd096506922332fdb36ff1211e03601befe6e18 Mon Sep 17 00:00:00 2001 From: Daniel Jordan Date: Wed, 3 Jun 2020 15:59:55 -0700 Subject: mm: make deferred init's max threads arch-specific Using padata during deferred init has only been tested on x86, so for now limit it to this architecture. If another arch wants this, it can find the max thread limit that's best for it and override deferred_page_init_max_threads(). Signed-off-by: Daniel Jordan Signed-off-by: Andrew Morton Tested-by: Josh Triplett Cc: Alexander Duyck Cc: Alex Williamson Cc: Dan Williams Cc: Dave Hansen Cc: David Hildenbrand Cc: Herbert Xu Cc: Jason Gunthorpe Cc: Jonathan Corbet Cc: Kirill Tkhai Cc: Michal Hocko Cc: Pavel Machek Cc: Pavel Tatashin Cc: Peter Zijlstra Cc: Randy Dunlap Cc: Robert Elliott Cc: Shile Zhang Cc: Steffen Klassert Cc: Steven Sistare Cc: Tejun Heo Cc: Zi Yan Link: http://lkml.kernel.org/r/20200527173608.2885243-8-daniel.m.jordan@oracle.com Signed-off-by: Linus Torvalds --- arch/x86/mm/init_64.c | 12 ++++++++++++ include/linux/memblock.h | 3 +++ mm/page_alloc.c | 13 ++++++++----- 3 files changed, 23 insertions(+), 5 deletions(-) (limited to 'include/linux') diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c index 96274a90c5ff..e08f1007f776 100644 --- a/arch/x86/mm/init_64.c +++ b/arch/x86/mm/init_64.c @@ -1265,6 +1265,18 @@ void __init mem_init(void) mem_init_print_info(NULL); } +#ifdef CONFIG_DEFERRED_STRUCT_PAGE_INIT +int __init deferred_page_init_max_threads(const struct cpumask *node_cpumask) +{ + /* + * More CPUs always led to greater speedups on tested systems, up to + * all the nodes' CPUs. Use all since the system is otherwise idle + * now. 
+ */ + return max_t(int, cpumask_weight(node_cpumask), 1); +} +#endif + int kernel_set_to_readonly; void mark_rodata_ro(void) diff --git a/include/linux/memblock.h b/include/linux/memblock.h index 45abfc54da37..807ab9daf0cd 100644 --- a/include/linux/memblock.h +++ b/include/linux/memblock.h @@ -273,6 +273,9 @@ void __next_mem_pfn_range_in_zone(u64 *idx, struct zone *zone, #define for_each_free_mem_pfn_range_in_zone_from(i, zone, p_start, p_end) \ for (; i != U64_MAX; \ __next_mem_pfn_range_in_zone(&i, zone, p_start, p_end)) + +int __init deferred_page_init_max_threads(const struct cpumask *node_cpumask); + #endif /* CONFIG_DEFERRED_STRUCT_PAGE_INIT */ /** diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 27ec5dc4db33..fb9dec1c1976 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -1836,6 +1836,13 @@ deferred_init_memmap_chunk(unsigned long start_pfn, unsigned long end_pfn, } } +/* An arch may override for more concurrency. */ +__weak int __init +deferred_page_init_max_threads(const struct cpumask *node_cpumask) +{ + return 1; +} + /* Initialise remaining memory on a node */ static int __init deferred_init_memmap(void *data) { @@ -1884,11 +1891,7 @@ static int __init deferred_init_memmap(void *data) first_init_pfn)) goto zone_empty; - /* - * More CPUs always led to greater speedups on tested systems, up to - * all the nodes' CPUs. Use all since the system is otherwise idle now. - */ - max_threads = max(cpumask_weight(cpumask), 1u); + max_threads = deferred_page_init_max_threads(cpumask); while (spfn < epfn) { unsigned long epfn_align = ALIGN(epfn, PAGES_PER_SECTION); -- cgit v1.2.3 From ae94da898133947c2d1f005da10838478e4548db Mon Sep 17 00:00:00 2001 From: Mike Kravetz Date: Wed, 3 Jun 2020 16:00:34 -0700 Subject: hugetlbfs: add arch_hugetlb_valid_size Patch series "Clean up hugetlb boot command line processing", v4. Longpeng(Mike) reported a weird message from hugetlb command line processing and proposed a solution [1]. While the proposed patch does address the specific issue, there are other related issues in command line processing. As hugetlbfs evolved, updates to command line processing have been made to meet immediate needs and not necessarily in a coordinated manner. The result is that some processing is done in arch specific code, some is done in arch independent code and coordination is problematic. Semantics can vary between architectures. The patch series does the following: - Define arch specific arch_hugetlb_valid_size routine used to validate passed huge page sizes. - Move hugepagesz= command line parsing out of arch specific code and into an arch independent routine. - Clean up command line processing to follow desired semantics and document those semantics. [1] https://lore.kernel.org/linux-mm/20200305033014.1152-1-longpeng2@huawei.com This patch (of 3): The architecture independent routine hugetlb_default_setup sets up the default huge pages size. It has no way to verify if the passed value is valid, so it accepts it and attempts to validate at a later time. This requires undocumented cooperation between the arch specific and arch independent code. For architectures that support more than one huge page size, provide a routine arch_hugetlb_valid_size to validate a huge page size. hugetlb_default_setup can use this to validate passed values. arch_hugetlb_valid_size will also be used in a subsequent patch to move processing of the "hugepagesz=" in arch specific code to a common routine in arch independent code. 
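For a concrete sense of the hook, the x86 implementation added below reduces to a size check, and the weak default added in mm/hugetlb.c covers architectures with a single huge page size:

	/* arch/x86/mm/hugetlbpage.c */
	bool __init arch_hugetlb_valid_size(unsigned long size)
	{
		if (size == PMD_SIZE)
			return true;
		else if (size == PUD_SIZE && boot_cpu_has(X86_FEATURE_GBPAGES))
			return true;
		else
			return false;
	}

	/* mm/hugetlb.c: overridden by architectures with more sizes */
	bool __init __attribute((weak)) arch_hugetlb_valid_size(unsigned long size)
	{
		return size == HPAGE_SIZE;
	}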
Signed-off-by: Mike Kravetz Signed-off-by: Andrew Morton Acked-by: Gerald Schaefer [s390] Acked-by: Will Deacon Cc: Catalin Marinas Cc: Benjamin Herrenschmidt Cc: Paul Mackerras Cc: Paul Walmsley Cc: Palmer Dabbelt Cc: Albert Ou Cc: Heiko Carstens Cc: Vasily Gorbik Cc: Christian Borntraeger Cc: David S. Miller Cc: Thomas Gleixner Cc: Ingo Molnar Cc: Dave Hansen Cc: Jonathan Corbet Cc: Longpeng Cc: Christophe Leroy Cc: Randy Dunlap Cc: Mina Almasry Cc: Peter Xu Cc: Nitesh Narayan Lal Cc: Anders Roxell Cc: "Aneesh Kumar K.V" Cc: Qian Cai Cc: Stephen Rothwell Link: http://lkml.kernel.org/r/20200428205614.246260-1-mike.kravetz@oracle.com Link: http://lkml.kernel.org/r/20200428205614.246260-2-mike.kravetz@oracle.com Link: http://lkml.kernel.org/r/20200417185049.275845-1-mike.kravetz@oracle.com Link: http://lkml.kernel.org/r/20200417185049.275845-2-mike.kravetz@oracle.com Signed-off-by: Linus Torvalds --- arch/arm64/mm/hugetlbpage.c | 17 +++++++++++++---- arch/powerpc/mm/hugetlbpage.c | 20 +++++++++++++------- arch/riscv/mm/hugetlbpage.c | 26 +++++++++++++++++--------- arch/s390/mm/hugetlbpage.c | 16 ++++++++++++---- arch/sparc/mm/init_64.c | 24 ++++++++++++++++-------- arch/x86/mm/hugetlbpage.c | 17 +++++++++++++---- include/linux/hugetlb.h | 1 + mm/hugetlb.c | 21 ++++++++++++++++++--- 8 files changed, 103 insertions(+), 39 deletions(-) (limited to 'include/linux') diff --git a/arch/arm64/mm/hugetlbpage.c b/arch/arm64/mm/hugetlbpage.c index 0be3355e3499..2ac41cefe699 100644 --- a/arch/arm64/mm/hugetlbpage.c +++ b/arch/arm64/mm/hugetlbpage.c @@ -464,17 +464,26 @@ static int __init hugetlbpage_init(void) } arch_initcall(hugetlbpage_init); -static __init int setup_hugepagesz(char *opt) +bool __init arch_hugetlb_valid_size(unsigned long size) { - unsigned long ps = memparse(opt, &opt); - - switch (ps) { + switch (size) { #ifdef CONFIG_ARM64_4K_PAGES case PUD_SIZE: #endif case CONT_PMD_SIZE: case PMD_SIZE: case CONT_PTE_SIZE: + return true; + } + + return false; +} + +static __init int setup_hugepagesz(char *opt) +{ + unsigned long ps = memparse(opt, &opt); + + if (arch_hugetlb_valid_size(ps)) { add_huge_page_size(ps); return 1; } diff --git a/arch/powerpc/mm/hugetlbpage.c b/arch/powerpc/mm/hugetlbpage.c index 33b3461d91e8..de54d2a37830 100644 --- a/arch/powerpc/mm/hugetlbpage.c +++ b/arch/powerpc/mm/hugetlbpage.c @@ -558,7 +558,7 @@ unsigned long vma_mmu_pagesize(struct vm_area_struct *vma) return vma_kernel_pagesize(vma); } -static int __init add_huge_page_size(unsigned long long size) +bool __init arch_hugetlb_valid_size(unsigned long size) { int shift = __ffs(size); int mmu_psize; @@ -566,20 +566,26 @@ static int __init add_huge_page_size(unsigned long long size) /* Check that it is a page size supported by the hardware and * that it fits within pagetable and slice limits. 
*/ if (size <= PAGE_SIZE || !is_power_of_2(size)) - return -EINVAL; + return false; mmu_psize = check_and_get_huge_psize(shift); if (mmu_psize < 0) - return -EINVAL; + return false; BUG_ON(mmu_psize_defs[mmu_psize].shift != shift); - /* Return if huge page size has already been setup */ - if (size_to_hstate(size)) - return 0; + return true; +} - hugetlb_add_hstate(shift - PAGE_SHIFT); +static int __init add_huge_page_size(unsigned long long size) +{ + int shift = __ffs(size); + + if (!arch_hugetlb_valid_size((unsigned long)size)) + return -EINVAL; + if (!size_to_hstate(size)) + hugetlb_add_hstate(shift - PAGE_SHIFT); return 0; } diff --git a/arch/riscv/mm/hugetlbpage.c b/arch/riscv/mm/hugetlbpage.c index a6189ed36c5f..da1f516bc451 100644 --- a/arch/riscv/mm/hugetlbpage.c +++ b/arch/riscv/mm/hugetlbpage.c @@ -12,21 +12,29 @@ int pmd_huge(pmd_t pmd) return pmd_leaf(pmd); } +bool __init arch_hugetlb_valid_size(unsigned long size) +{ + if (size == HPAGE_SIZE) + return true; + else if (IS_ENABLED(CONFIG_64BIT) && size == PUD_SIZE) + return true; + else + return false; +} + static __init int setup_hugepagesz(char *opt) { unsigned long ps = memparse(opt, &opt); - if (ps == HPAGE_SIZE) { - hugetlb_add_hstate(HPAGE_SHIFT - PAGE_SHIFT); - } else if (IS_ENABLED(CONFIG_64BIT) && ps == PUD_SIZE) { - hugetlb_add_hstate(PUD_SHIFT - PAGE_SHIFT); - } else { - hugetlb_bad_size(); - pr_err("hugepagesz: Unsupported page size %lu M\n", ps >> 20); - return 0; + if (arch_hugetlb_valid_size(ps)) { + hugetlb_add_hstate(ilog2(ps) - PAGE_SHIFT); + return 1; } - return 1; + hugetlb_bad_size(); + pr_err("hugepagesz: Unsupported page size %lu M\n", ps >> 20); + return 0; + } __setup("hugepagesz=", setup_hugepagesz); diff --git a/arch/s390/mm/hugetlbpage.c b/arch/s390/mm/hugetlbpage.c index 4632d4e26b66..2f2b6b5b3d29 100644 --- a/arch/s390/mm/hugetlbpage.c +++ b/arch/s390/mm/hugetlbpage.c @@ -254,16 +254,24 @@ follow_huge_pud(struct mm_struct *mm, unsigned long address, return pud_page(*pud) + ((address & ~PUD_MASK) >> PAGE_SHIFT); } +bool __init arch_hugetlb_valid_size(unsigned long size) +{ + if (MACHINE_HAS_EDAT1 && size == PMD_SIZE) + return true; + else if (MACHINE_HAS_EDAT2 && size == PUD_SIZE) + return true; + else + return false; +} + static __init int setup_hugepagesz(char *opt) { unsigned long size; char *string = opt; size = memparse(opt, &opt); - if (MACHINE_HAS_EDAT1 && size == PMD_SIZE) { - hugetlb_add_hstate(PMD_SHIFT - PAGE_SHIFT); - } else if (MACHINE_HAS_EDAT2 && size == PUD_SIZE) { - hugetlb_add_hstate(PUD_SHIFT - PAGE_SHIFT); + if (arch_hugetlb_valid_size(size)) { + hugetlb_add_hstate(ilog2(size) - PAGE_SHIFT); } else { hugetlb_bad_size(); pr_err("hugepagesz= specifies an unsupported page size %s\n", diff --git a/arch/sparc/mm/init_64.c b/arch/sparc/mm/init_64.c index 79d3c5e0802e..24475b73657a 100644 --- a/arch/sparc/mm/init_64.c +++ b/arch/sparc/mm/init_64.c @@ -360,16 +360,11 @@ static void __init pud_huge_patch(void) __asm__ __volatile__("flush %0" : : "r" (addr)); } -static int __init setup_hugepagesz(char *string) +bool __init arch_hugetlb_valid_size(unsigned long size) { - unsigned long long hugepage_size; - unsigned int hugepage_shift; + unsigned int hugepage_shift = ilog2(size); unsigned short hv_pgsz_idx; unsigned int hv_pgsz_mask; - int rc = 0; - - hugepage_size = memparse(string, &string); - hugepage_shift = ilog2(hugepage_size); switch (hugepage_shift) { case HPAGE_16GB_SHIFT: @@ -397,7 +392,20 @@ static int __init setup_hugepagesz(char *string) hv_pgsz_mask = 0; } - if ((hv_pgsz_mask & 
cpu_pgsz_mask) == 0U) { + if ((hv_pgsz_mask & cpu_pgsz_mask) == 0U) + return false; + + return true; +} + +static int __init setup_hugepagesz(char *string) +{ + unsigned long long hugepage_size; + int rc = 0; + + hugepage_size = memparse(string, &string); + + if (!arch_hugetlb_valid_size((unsigned long)hugepage_size)) { hugetlb_bad_size(); pr_err("hugepagesz=%llu not supported by MMU.\n", hugepage_size); diff --git a/arch/x86/mm/hugetlbpage.c b/arch/x86/mm/hugetlbpage.c index 5bfd5aef5378..1c4372bfe782 100644 --- a/arch/x86/mm/hugetlbpage.c +++ b/arch/x86/mm/hugetlbpage.c @@ -181,13 +181,22 @@ get_unmapped_area: #endif /* CONFIG_HUGETLB_PAGE */ #ifdef CONFIG_X86_64 +bool __init arch_hugetlb_valid_size(unsigned long size) +{ + if (size == PMD_SIZE) + return true; + else if (size == PUD_SIZE && boot_cpu_has(X86_FEATURE_GBPAGES)) + return true; + else + return false; +} + static __init int setup_hugepagesz(char *opt) { unsigned long ps = memparse(opt, &opt); - if (ps == PMD_SIZE) { - hugetlb_add_hstate(PMD_SHIFT - PAGE_SHIFT); - } else if (ps == PUD_SIZE && boot_cpu_has(X86_FEATURE_GBPAGES)) { - hugetlb_add_hstate(PUD_SHIFT - PAGE_SHIFT); + + if (arch_hugetlb_valid_size(ps)) { + hugetlb_add_hstate(ilog2(ps) - PAGE_SHIFT); } else { hugetlb_bad_size(); printk(KERN_ERR "hugepagesz: Unsupported page size %lu M\n", diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h index 43a1cef8f0f1..2eb15f5ab01e 100644 --- a/include/linux/hugetlb.h +++ b/include/linux/hugetlb.h @@ -521,6 +521,7 @@ int __init alloc_bootmem_huge_page(struct hstate *h); void __init hugetlb_bad_size(void); void __init hugetlb_add_hstate(unsigned order); +bool __init arch_hugetlb_valid_size(unsigned long size); struct hstate *size_to_hstate(unsigned long size); #ifndef HUGE_MAX_HSTATE diff --git a/mm/hugetlb.c b/mm/hugetlb.c index bcabbe02192b..63ca4241ea87 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -3256,6 +3256,12 @@ static int __init hugetlb_init(void) } subsys_initcall(hugetlb_init); +/* Overwritten by architectures with more huge page sizes */ +bool __init __attribute((weak)) arch_hugetlb_valid_size(unsigned long size) +{ + return size == HPAGE_SIZE; +} + /* Should be called on processing a hugepagesz=... option */ void __init hugetlb_bad_size(void) { @@ -3331,12 +3337,21 @@ static int __init hugetlb_nrpages_setup(char *s) } __setup("hugepages=", hugetlb_nrpages_setup); -static int __init hugetlb_default_setup(char *s) +static int __init default_hugepagesz_setup(char *s) { - default_hstate_size = memparse(s, &s); + unsigned long size; + + size = (unsigned long)memparse(s, NULL); + + if (!arch_hugetlb_valid_size(size)) { + pr_err("HugeTLB: unsupported default_hugepagesz %s\n", s); + return 0; + } + + default_hstate_size = size; return 1; } -__setup("default_hugepagesz=", hugetlb_default_setup); +__setup("default_hugepagesz=", default_hugepagesz_setup); static unsigned int cpuset_mems_nr(unsigned int *array) { -- cgit v1.2.3 From 359f25443a8dada0fb709dd044a422017031790f Mon Sep 17 00:00:00 2001 From: Mike Kravetz Date: Wed, 3 Jun 2020 16:00:38 -0700 Subject: hugetlbfs: move hugepagesz= parsing to arch independent code Now that architectures provide arch_hugetlb_valid_size(), parsing of "hugepagesz=" can be done in architecture independent code. Create a single routine to handle hugepagesz= parsing and remove all arch specific routines. We can also remove the interface hugetlb_bad_size() as this is no longer used outside arch independent code. 
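A sketch of the resulting common handler, built from the helpers introduced in the preceding patch (the exact diagnostics are an assumption, as the mm/hugetlb.c hunk is truncated in this excerpt):

	static int __init hugepagesz_setup(char *s)
	{
		unsigned long size;

		size = (unsigned long)memparse(s, NULL);

		if (!arch_hugetlb_valid_size(size)) {
			pr_err("HugeTLB: unsupported hugepagesz %s\n", s); /* message assumed */
			return 0;
		}

		hugetlb_add_hstate(ilog2(size) - PAGE_SHIFT);
		return 1;
	}
	__setup("hugepagesz=", hugepagesz_setup);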
This also provides consistent behavior of hugetlbfs command line options. The hugepagesz= option should only be specified once for a specific size, but some architectures allow multiple instances. This appears to be more of an oversight when code was added by some architectures to set up ALL huge page sizes. Signed-off-by: Mike Kravetz Signed-off-by: Andrew Morton Tested-by: Sandipan Das Reviewed-by: Peter Xu Acked-by: Mina Almasry Acked-by: Gerald Schaefer [s390] Acked-by: Will Deacon Cc: Albert Ou Cc: Benjamin Herrenschmidt Cc: Catalin Marinas Cc: Christian Borntraeger Cc: Christophe Leroy Cc: Dave Hansen Cc: David S. Miller Cc: Heiko Carstens Cc: Ingo Molnar Cc: Jonathan Corbet Cc: Longpeng Cc: Nitesh Narayan Lal Cc: Palmer Dabbelt Cc: Paul Mackerras Cc: Paul Walmsley Cc: Randy Dunlap Cc: Thomas Gleixner Cc: Vasily Gorbik Cc: Anders Roxell Cc: "Aneesh Kumar K.V" Cc: Qian Cai Cc: Stephen Rothwell Link: http://lkml.kernel.org/r/20200417185049.275845-3-mike.kravetz@oracle.com Link: http://lkml.kernel.org/r/20200428205614.246260-3-mike.kravetz@oracle.com Signed-off-by: Linus Torvalds --- arch/arm64/mm/hugetlbpage.c | 15 --------------- arch/powerpc/mm/hugetlbpage.c | 15 --------------- arch/riscv/mm/hugetlbpage.c | 16 ---------------- arch/s390/mm/hugetlbpage.c | 18 ------------------ arch/sparc/mm/init_64.c | 22 ---------------------- arch/x86/mm/hugetlbpage.c | 16 ---------------- include/linux/hugetlb.h | 1 - mm/hugetlb.c | 23 +++++++++++++++------ 8 files changed, 17 insertions(+), 109 deletions(-) (limited to 'include/linux') diff --git a/arch/arm64/mm/hugetlbpage.c b/arch/arm64/mm/hugetlbpage.c index 2ac41cefe699..d6cb9fe71b44 100644 --- a/arch/arm64/mm/hugetlbpage.c +++ b/arch/arm64/mm/hugetlbpage.c @@ -478,18 +478,3 @@ bool __init arch_hugetlb_valid_size(unsigned long size) return false; } - -static __init int setup_hugepagesz(char *opt) -{ - unsigned long ps = memparse(opt, &opt); - - if (arch_hugetlb_valid_size(ps)) { - add_huge_page_size(ps); - return 1; - } - - hugetlb_bad_size(); - pr_err("hugepagesz: Unsupported page size %lu K\n", ps >> 10); - return 0; -} -__setup("hugepagesz=", setup_hugepagesz); diff --git a/arch/powerpc/mm/hugetlbpage.c b/arch/powerpc/mm/hugetlbpage.c index de54d2a37830..2c3fa0a7787b 100644 --- a/arch/powerpc/mm/hugetlbpage.c +++ b/arch/powerpc/mm/hugetlbpage.c @@ -589,21 +589,6 @@ static int __init add_huge_page_size(unsigned long long size) return 0; } -static int __init hugepage_setup_sz(char *str) -{ - unsigned long long size; - - size = memparse(str, &str); - - if (add_huge_page_size(size) != 0) { - hugetlb_bad_size(); - pr_err("Invalid huge page size specified(%llu)\n", size); - } - - return 1; -} -__setup("hugepagesz=", hugepage_setup_sz); - static int __init hugetlbpage_init(void) { bool configured = false; diff --git a/arch/riscv/mm/hugetlbpage.c b/arch/riscv/mm/hugetlbpage.c index da1f516bc451..4e5d7e9f0eef 100644 --- a/arch/riscv/mm/hugetlbpage.c +++ b/arch/riscv/mm/hugetlbpage.c @@ -22,22 +22,6 @@ bool __init arch_hugetlb_valid_size(unsigned long size) return false; } -static __init int setup_hugepagesz(char *opt) -{ - unsigned long ps = memparse(opt, &opt); - - if (arch_hugetlb_valid_size(ps)) { - hugetlb_add_hstate(ilog2(ps) - PAGE_SHIFT); - return 1; - } - - hugetlb_bad_size(); - pr_err("hugepagesz: Unsupported page size %lu M\n", ps >> 20); - return 0; - -} -__setup("hugepagesz=", setup_hugepagesz); - #ifdef CONFIG_CONTIG_ALLOC static __init int gigantic_pages_init(void) { diff --git a/arch/s390/mm/hugetlbpage.c 
b/arch/s390/mm/hugetlbpage.c index 2f2b6b5b3d29..82df06d720e8 100644 --- a/arch/s390/mm/hugetlbpage.c +++ b/arch/s390/mm/hugetlbpage.c @@ -264,24 +264,6 @@ bool __init arch_hugetlb_valid_size(unsigned long size) return false; } -static __init int setup_hugepagesz(char *opt) -{ - unsigned long size; - char *string = opt; - - size = memparse(opt, &opt); - if (arch_hugetlb_valid_size(size)) { - hugetlb_add_hstate(ilog2(size) - PAGE_SHIFT); - } else { - hugetlb_bad_size(); - pr_err("hugepagesz= specifies an unsupported page size %s\n", - string); - return 0; - } - return 1; -} -__setup("hugepagesz=", setup_hugepagesz); - static unsigned long hugetlb_get_unmapped_area_bottomup(struct file *file, unsigned long addr, unsigned long len, unsigned long pgoff, unsigned long flags) diff --git a/arch/sparc/mm/init_64.c b/arch/sparc/mm/init_64.c index 24475b73657a..1b1f1ac1869e 100644 --- a/arch/sparc/mm/init_64.c +++ b/arch/sparc/mm/init_64.c @@ -397,28 +397,6 @@ bool __init arch_hugetlb_valid_size(unsigned long size) return true; } - -static int __init setup_hugepagesz(char *string) -{ - unsigned long long hugepage_size; - int rc = 0; - - hugepage_size = memparse(string, &string); - - if (!arch_hugetlb_valid_size((unsigned long)hugepage_size)) { - hugetlb_bad_size(); - pr_err("hugepagesz=%llu not supported by MMU.\n", - hugepage_size); - goto out; - } - - add_huge_page_size(hugepage_size); - rc = 1; - -out: - return rc; -} -__setup("hugepagesz=", setup_hugepagesz); #endif /* CONFIG_HUGETLB_PAGE */ void update_mmu_cache(struct vm_area_struct *vma, unsigned long address, pte_t *ptep) diff --git a/arch/x86/mm/hugetlbpage.c b/arch/x86/mm/hugetlbpage.c index 1c4372bfe782..937d640a89e3 100644 --- a/arch/x86/mm/hugetlbpage.c +++ b/arch/x86/mm/hugetlbpage.c @@ -191,22 +191,6 @@ bool __init arch_hugetlb_valid_size(unsigned long size) return false; } -static __init int setup_hugepagesz(char *opt) -{ - unsigned long ps = memparse(opt, &opt); - - if (arch_hugetlb_valid_size(ps)) { - hugetlb_add_hstate(ilog2(ps) - PAGE_SHIFT); - } else { - hugetlb_bad_size(); - printk(KERN_ERR "hugepagesz: Unsupported page size %lu M\n", - ps >> 20); - return 0; - } - return 1; -} -__setup("hugepagesz=", setup_hugepagesz); - #ifdef CONFIG_CONTIG_ALLOC static __init int gigantic_pages_init(void) { diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h index 2eb15f5ab01e..0c13706054ef 100644 --- a/include/linux/hugetlb.h +++ b/include/linux/hugetlb.h @@ -519,7 +519,6 @@ int huge_add_to_page_cache(struct page *page, struct address_space *mapping, int __init __alloc_bootmem_huge_page(struct hstate *h); int __init alloc_bootmem_huge_page(struct hstate *h); -void __init hugetlb_bad_size(void); void __init hugetlb_add_hstate(unsigned order); bool __init arch_hugetlb_valid_size(unsigned long size); struct hstate *size_to_hstate(unsigned long size); diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 63ca4241ea87..6a8454bc2917 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -3262,12 +3262,6 @@ bool __init __attribute((weak)) arch_hugetlb_valid_size(unsigned long size) return size == HPAGE_SIZE; } -/* Should be called on processing a hugepagesz=... 
option */ -void __init hugetlb_bad_size(void) -{ - parsed_valid_hugepagesz = false; -} - void __init hugetlb_add_hstate(unsigned int order) { struct hstate *h; @@ -3337,6 +3331,23 @@ static int __init hugetlb_nrpages_setup(char *s) } __setup("hugepages=", hugetlb_nrpages_setup); +static int __init hugepagesz_setup(char *s) +{ + unsigned long size; + + size = (unsigned long)memparse(s, NULL); + + if (!arch_hugetlb_valid_size(size)) { + parsed_valid_hugepagesz = false; + pr_err("HugeTLB: unsupported hugepagesz %s\n", s); + return 0; + } + + hugetlb_add_hstate(ilog2(size) - PAGE_SHIFT); + return 1; +} +__setup("hugepagesz=", hugepagesz_setup); + static int __init default_hugepagesz_setup(char *s) { unsigned long size; -- cgit v1.2.3 From b0eae98c66fe4ccac3a5a79a1479c057f2c7cdd7 Mon Sep 17 00:00:00 2001 From: Anshuman Khandual Date: Wed, 3 Jun 2020 16:01:01 -0700 Subject: mm/hugetlb: define a generic fallback for is_hugepage_only_range() There are multiple similar definitions for is_hugepage_only_range() on various platforms. Let's just add its generic fallback definition for platforms that do not override it. This helps reduce code duplication. Signed-off-by: Anshuman Khandual Signed-off-by: Andrew Morton Acked-by: Mike Kravetz Cc: Russell King Cc: Catalin Marinas Cc: Will Deacon Cc: Tony Luck Cc: Fenghua Yu Cc: Thomas Bogendoerfer Cc: "James E.J. Bottomley" Cc: Helge Deller Cc: Benjamin Herrenschmidt Cc: Paul Mackerras Cc: Michael Ellerman Cc: Paul Walmsley Cc: Palmer Dabbelt Cc: Heiko Carstens Cc: Vasily Gorbik Cc: Christian Borntraeger Cc: Yoshinori Sato Cc: Rich Felker Cc: "David S. Miller" Cc: Thomas Gleixner Cc: Ingo Molnar Cc: Borislav Petkov Cc: "H. Peter Anvin" Link: http://lkml.kernel.org/r/1588907271-11920-3-git-send-email-anshuman.khandual@arm.com Signed-off-by: Linus Torvalds --- arch/arm/include/asm/hugetlb.h | 6 ------ arch/arm64/include/asm/hugetlb.h | 6 ------ arch/ia64/include/asm/hugetlb.h | 1 + arch/mips/include/asm/hugetlb.h | 7 ------- arch/parisc/include/asm/hugetlb.h | 6 ------ arch/powerpc/include/asm/hugetlb.h | 1 + arch/riscv/include/asm/hugetlb.h | 6 ------ arch/s390/include/asm/hugetlb.h | 7 ------- arch/sh/include/asm/hugetlb.h | 6 ------ arch/sparc/include/asm/hugetlb.h | 6 ------ arch/x86/include/asm/hugetlb.h | 6 ------ include/linux/hugetlb.h | 9 +++++++++ 12 files changed, 11 insertions(+), 56 deletions(-) (limited to 'include/linux') diff --git a/arch/arm/include/asm/hugetlb.h b/arch/arm/include/asm/hugetlb.h index 318dcf5921ab..9ecd516d1ff7 100644 --- a/arch/arm/include/asm/hugetlb.h +++ b/arch/arm/include/asm/hugetlb.h @@ -14,12 +14,6 @@ #include #include -static inline int is_hugepage_only_range(struct mm_struct *mm, - unsigned long addr, unsigned long len) -{ - return 0; -} - static inline void arch_clear_hugepage_flags(struct page *page) { clear_bit(PG_dcache_clean, &page->flags); diff --git a/arch/arm64/include/asm/hugetlb.h b/arch/arm64/include/asm/hugetlb.h index b88878ddc88b..8f58e052697a 100644 --- a/arch/arm64/include/asm/hugetlb.h +++ b/arch/arm64/include/asm/hugetlb.h @@ -17,12 +17,6 @@ extern bool arch_hugetlb_migration_supported(struct hstate *h); #endif -static inline int is_hugepage_only_range(struct mm_struct *mm, - unsigned long addr, unsigned long len) -{ - return 0; -} - static inline void arch_clear_hugepage_flags(struct page *page) { clear_bit(PG_dcache_clean, &page->flags); diff --git a/arch/ia64/include/asm/hugetlb.h b/arch/ia64/include/asm/hugetlb.h index 36cc0396b214..6ef50b9a4bdf 100644 --- a/arch/ia64/include/asm/hugetlb.h +++ 
b/arch/ia64/include/asm/hugetlb.h @@ -20,6 +20,7 @@ static inline int is_hugepage_only_range(struct mm_struct *mm, return (REGION_NUMBER(addr) == RGN_HPAGE || REGION_NUMBER((addr)+(len)-1) == RGN_HPAGE); } +#define is_hugepage_only_range is_hugepage_only_range #define __HAVE_ARCH_HUGE_PTEP_CLEAR_FLUSH static inline void huge_ptep_clear_flush(struct vm_area_struct *vma, diff --git a/arch/mips/include/asm/hugetlb.h b/arch/mips/include/asm/hugetlb.h index 425bb6fc3bda..8b201e281f67 100644 --- a/arch/mips/include/asm/hugetlb.h +++ b/arch/mips/include/asm/hugetlb.h @@ -11,13 +11,6 @@ #include -static inline int is_hugepage_only_range(struct mm_struct *mm, - unsigned long addr, - unsigned long len) -{ - return 0; -} - #define __HAVE_ARCH_PREPARE_HUGEPAGE_RANGE static inline int prepare_hugepage_range(struct file *file, unsigned long addr, diff --git a/arch/parisc/include/asm/hugetlb.h b/arch/parisc/include/asm/hugetlb.h index 7cb595dcb7d7..411d9d867baa 100644 --- a/arch/parisc/include/asm/hugetlb.h +++ b/arch/parisc/include/asm/hugetlb.h @@ -12,12 +12,6 @@ void set_huge_pte_at(struct mm_struct *mm, unsigned long addr, pte_t huge_ptep_get_and_clear(struct mm_struct *mm, unsigned long addr, pte_t *ptep); -static inline int is_hugepage_only_range(struct mm_struct *mm, - unsigned long addr, - unsigned long len) { - return 0; -} - /* * If the arch doesn't supply something else, assume that hugepage * size aligned regions are ok without further preparation. diff --git a/arch/powerpc/include/asm/hugetlb.h b/arch/powerpc/include/asm/hugetlb.h index bd6504c28c2f..b167c869d72d 100644 --- a/arch/powerpc/include/asm/hugetlb.h +++ b/arch/powerpc/include/asm/hugetlb.h @@ -30,6 +30,7 @@ static inline int is_hugepage_only_range(struct mm_struct *mm, return slice_is_hugepage_only_range(mm, addr, len); return 0; } +#define is_hugepage_only_range is_hugepage_only_range #define __HAVE_ARCH_HUGETLB_FREE_PGD_RANGE void hugetlb_free_pgd_range(struct mmu_gather *tlb, unsigned long addr, diff --git a/arch/riscv/include/asm/hugetlb.h b/arch/riscv/include/asm/hugetlb.h index 728a5db66597..866f6ae6467c 100644 --- a/arch/riscv/include/asm/hugetlb.h +++ b/arch/riscv/include/asm/hugetlb.h @@ -5,12 +5,6 @@ #include #include -static inline int is_hugepage_only_range(struct mm_struct *mm, - unsigned long addr, - unsigned long len) { - return 0; -} - static inline void arch_clear_hugepage_flags(struct page *page) { } diff --git a/arch/s390/include/asm/hugetlb.h b/arch/s390/include/asm/hugetlb.h index de8f0bf5f238..7d27ea96ec2f 100644 --- a/arch/s390/include/asm/hugetlb.h +++ b/arch/s390/include/asm/hugetlb.h @@ -21,13 +21,6 @@ pte_t huge_ptep_get(pte_t *ptep); pte_t huge_ptep_get_and_clear(struct mm_struct *mm, unsigned long addr, pte_t *ptep); -static inline bool is_hugepage_only_range(struct mm_struct *mm, - unsigned long addr, - unsigned long len) -{ - return false; -} - /* * If the arch doesn't supply something else, assume that hugepage * size aligned regions are ok without further preparation. diff --git a/arch/sh/include/asm/hugetlb.h b/arch/sh/include/asm/hugetlb.h index 6f025fe18146..536ad2cb8aa4 100644 --- a/arch/sh/include/asm/hugetlb.h +++ b/arch/sh/include/asm/hugetlb.h @@ -5,12 +5,6 @@ #include #include -static inline int is_hugepage_only_range(struct mm_struct *mm, - unsigned long addr, - unsigned long len) { - return 0; -} - /* * If the arch doesn't supply something else, assume that hugepage * size aligned regions are ok without further preparation. 
diff --git a/arch/sparc/include/asm/hugetlb.h b/arch/sparc/include/asm/hugetlb.h index 3963f80d1cb3..a056fe1119f5 100644 --- a/arch/sparc/include/asm/hugetlb.h +++ b/arch/sparc/include/asm/hugetlb.h @@ -20,12 +20,6 @@ void set_huge_pte_at(struct mm_struct *mm, unsigned long addr, pte_t huge_ptep_get_and_clear(struct mm_struct *mm, unsigned long addr, pte_t *ptep); -static inline int is_hugepage_only_range(struct mm_struct *mm, - unsigned long addr, - unsigned long len) { - return 0; -} - #define __HAVE_ARCH_HUGE_PTEP_CLEAR_FLUSH static inline void huge_ptep_clear_flush(struct vm_area_struct *vma, unsigned long addr, pte_t *ptep) diff --git a/arch/x86/include/asm/hugetlb.h b/arch/x86/include/asm/hugetlb.h index f65cfb48cfdd..cc98f79074d0 100644 --- a/arch/x86/include/asm/hugetlb.h +++ b/arch/x86/include/asm/hugetlb.h @@ -7,12 +7,6 @@ #define hugepages_supported() boot_cpu_has(X86_FEATURE_PSE) -static inline int is_hugepage_only_range(struct mm_struct *mm, - unsigned long addr, - unsigned long len) { - return 0; -} - static inline void arch_clear_hugepage_flags(struct page *page) { } diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h index 0c13706054ef..bf97b17ab206 100644 --- a/include/linux/hugetlb.h +++ b/include/linux/hugetlb.h @@ -591,6 +591,15 @@ static inline unsigned int blocks_per_huge_page(struct hstate *h) #include +#ifndef is_hugepage_only_range +static inline int is_hugepage_only_range(struct mm_struct *mm, + unsigned long addr, unsigned long len) +{ + return 0; +} +#define is_hugepage_only_range is_hugepage_only_range +#endif + #ifndef arch_make_huge_pte static inline pte_t arch_make_huge_pte(pte_t entry, struct vm_area_struct *vma, struct page *page, int writable) -- cgit v1.2.3 From 5be993432821750f382809df5e20bf4c129b24f7 Mon Sep 17 00:00:00 2001 From: Anshuman Khandual Date: Wed, 3 Jun 2020 16:01:05 -0700 Subject: mm/hugetlb: define a generic fallback for arch_clear_hugepage_flags() There are multiple similar definitions for arch_clear_hugepage_flags() on various platforms. Let's just add its generic fallback definition for platforms that do not override it. This helps reduce code duplication. Signed-off-by: Anshuman Khandual Signed-off-by: Andrew Morton Acked-by: Mike Kravetz Cc: Russell King Cc: Catalin Marinas Cc: Will Deacon Cc: Tony Luck Cc: Fenghua Yu Cc: Thomas Bogendoerfer Cc: "James E.J. Bottomley" Cc: Helge Deller Cc: Benjamin Herrenschmidt Cc: Paul Mackerras Cc: Michael Ellerman Cc: Paul Walmsley Cc: Palmer Dabbelt Cc: Heiko Carstens Cc: Vasily Gorbik Cc: Christian Borntraeger Cc: Yoshinori Sato Cc: Rich Felker Cc: "David S. Miller" Cc: Thomas Gleixner Cc: Ingo Molnar Cc: Borislav Petkov Cc: "H. 
Peter Anvin" Link: http://lkml.kernel.org/r/1588907271-11920-4-git-send-email-anshuman.khandual@arm.com Signed-off-by: Linus Torvalds --- arch/arm/include/asm/hugetlb.h | 1 + arch/arm64/include/asm/hugetlb.h | 1 + arch/ia64/include/asm/hugetlb.h | 4 ---- arch/mips/include/asm/hugetlb.h | 4 ---- arch/parisc/include/asm/hugetlb.h | 4 ---- arch/powerpc/include/asm/hugetlb.h | 4 ---- arch/riscv/include/asm/hugetlb.h | 4 ---- arch/s390/include/asm/hugetlb.h | 1 + arch/sh/include/asm/hugetlb.h | 1 + arch/sparc/include/asm/hugetlb.h | 4 ---- arch/x86/include/asm/hugetlb.h | 4 ---- include/linux/hugetlb.h | 5 +++++ 12 files changed, 9 insertions(+), 28 deletions(-) (limited to 'include/linux') diff --git a/arch/arm/include/asm/hugetlb.h b/arch/arm/include/asm/hugetlb.h index 9ecd516d1ff7..d02d6ca88e92 100644 --- a/arch/arm/include/asm/hugetlb.h +++ b/arch/arm/include/asm/hugetlb.h @@ -18,5 +18,6 @@ static inline void arch_clear_hugepage_flags(struct page *page) { clear_bit(PG_dcache_clean, &page->flags); } +#define arch_clear_hugepage_flags arch_clear_hugepage_flags #endif /* _ASM_ARM_HUGETLB_H */ diff --git a/arch/arm64/include/asm/hugetlb.h b/arch/arm64/include/asm/hugetlb.h index 8f58e052697a..94ba0c5bced2 100644 --- a/arch/arm64/include/asm/hugetlb.h +++ b/arch/arm64/include/asm/hugetlb.h @@ -21,6 +21,7 @@ static inline void arch_clear_hugepage_flags(struct page *page) { clear_bit(PG_dcache_clean, &page->flags); } +#define arch_clear_hugepage_flags arch_clear_hugepage_flags extern pte_t arch_make_huge_pte(pte_t entry, struct vm_area_struct *vma, struct page *page, int writable); diff --git a/arch/ia64/include/asm/hugetlb.h b/arch/ia64/include/asm/hugetlb.h index 6ef50b9a4bdf..7e46ebde8c0c 100644 --- a/arch/ia64/include/asm/hugetlb.h +++ b/arch/ia64/include/asm/hugetlb.h @@ -28,10 +28,6 @@ static inline void huge_ptep_clear_flush(struct vm_area_struct *vma, { } -static inline void arch_clear_hugepage_flags(struct page *page) -{ -} - #include #endif /* _ASM_IA64_HUGETLB_H */ diff --git a/arch/mips/include/asm/hugetlb.h b/arch/mips/include/asm/hugetlb.h index 8b201e281f67..10e3be870df7 100644 --- a/arch/mips/include/asm/hugetlb.h +++ b/arch/mips/include/asm/hugetlb.h @@ -75,10 +75,6 @@ static inline int huge_ptep_set_access_flags(struct vm_area_struct *vma, return changed; } -static inline void arch_clear_hugepage_flags(struct page *page) -{ -} - #include #endif /* __ASM_HUGETLB_H */ diff --git a/arch/parisc/include/asm/hugetlb.h b/arch/parisc/include/asm/hugetlb.h index 411d9d867baa..a69cf9efb0c1 100644 --- a/arch/parisc/include/asm/hugetlb.h +++ b/arch/parisc/include/asm/hugetlb.h @@ -42,10 +42,6 @@ int huge_ptep_set_access_flags(struct vm_area_struct *vma, unsigned long addr, pte_t *ptep, pte_t pte, int dirty); -static inline void arch_clear_hugepage_flags(struct page *page) -{ -} - #include #endif /* _ASM_PARISC64_HUGETLB_H */ diff --git a/arch/powerpc/include/asm/hugetlb.h b/arch/powerpc/include/asm/hugetlb.h index b167c869d72d..e6dfa63da552 100644 --- a/arch/powerpc/include/asm/hugetlb.h +++ b/arch/powerpc/include/asm/hugetlb.h @@ -61,10 +61,6 @@ int huge_ptep_set_access_flags(struct vm_area_struct *vma, unsigned long addr, pte_t *ptep, pte_t pte, int dirty); -static inline void arch_clear_hugepage_flags(struct page *page) -{ -} - #include #else /* ! 
CONFIG_HUGETLB_PAGE */ diff --git a/arch/riscv/include/asm/hugetlb.h b/arch/riscv/include/asm/hugetlb.h index 866f6ae6467c..a5c2ca1d1cd8 100644 --- a/arch/riscv/include/asm/hugetlb.h +++ b/arch/riscv/include/asm/hugetlb.h @@ -5,8 +5,4 @@ #include #include -static inline void arch_clear_hugepage_flags(struct page *page) -{ -} - #endif /* _ASM_RISCV_HUGETLB_H */ diff --git a/arch/s390/include/asm/hugetlb.h b/arch/s390/include/asm/hugetlb.h index 7d27ea96ec2f..9ddf4a43a590 100644 --- a/arch/s390/include/asm/hugetlb.h +++ b/arch/s390/include/asm/hugetlb.h @@ -39,6 +39,7 @@ static inline void arch_clear_hugepage_flags(struct page *page) { clear_bit(PG_arch_1, &page->flags); } +#define arch_clear_hugepage_flags arch_clear_hugepage_flags static inline void huge_pte_clear(struct mm_struct *mm, unsigned long addr, pte_t *ptep, unsigned long sz) diff --git a/arch/sh/include/asm/hugetlb.h b/arch/sh/include/asm/hugetlb.h index 536ad2cb8aa4..ae4de7b89210 100644 --- a/arch/sh/include/asm/hugetlb.h +++ b/arch/sh/include/asm/hugetlb.h @@ -30,6 +30,7 @@ static inline void arch_clear_hugepage_flags(struct page *page) { clear_bit(PG_dcache_clean, &page->flags); } +#define arch_clear_hugepage_flags arch_clear_hugepage_flags #include diff --git a/arch/sparc/include/asm/hugetlb.h b/arch/sparc/include/asm/hugetlb.h index a056fe1119f5..53838a173f62 100644 --- a/arch/sparc/include/asm/hugetlb.h +++ b/arch/sparc/include/asm/hugetlb.h @@ -47,10 +47,6 @@ static inline int huge_ptep_set_access_flags(struct vm_area_struct *vma, return changed; } -static inline void arch_clear_hugepage_flags(struct page *page) -{ -} - #define __HAVE_ARCH_HUGETLB_FREE_PGD_RANGE void hugetlb_free_pgd_range(struct mmu_gather *tlb, unsigned long addr, unsigned long end, unsigned long floor, diff --git a/arch/x86/include/asm/hugetlb.h b/arch/x86/include/asm/hugetlb.h index cc98f79074d0..1721b1aadeb1 100644 --- a/arch/x86/include/asm/hugetlb.h +++ b/arch/x86/include/asm/hugetlb.h @@ -7,8 +7,4 @@ #define hugepages_supported() boot_cpu_has(X86_FEATURE_PSE) -static inline void arch_clear_hugepage_flags(struct page *page) -{ -} - #endif /* _ASM_X86_HUGETLB_H */ diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h index bf97b17ab206..2e66b71c1236 100644 --- a/include/linux/hugetlb.h +++ b/include/linux/hugetlb.h @@ -600,6 +600,11 @@ static inline int is_hugepage_only_range(struct mm_struct *mm, #define is_hugepage_only_range is_hugepage_only_range #endif +#ifndef arch_clear_hugepage_flags +static inline void arch_clear_hugepage_flags(struct page *page) { } +#define arch_clear_hugepage_flags arch_clear_hugepage_flags +#endif + #ifndef arch_make_huge_pte static inline pte_t arch_make_huge_pte(pte_t entry, struct vm_area_struct *vma, struct page *page, int writable) -- cgit v1.2.3 From ff45fc3ca0f3c38e752d75f71b8d8efcf409e42d Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Wed, 3 Jun 2020 16:01:09 -0700 Subject: mm: simplify calling a compound page destructor None of the three callers of get_compound_page_dtor() want to know the value; they just want to call the function. Replace it with destroy_compound_page() which calls the dtor for them. Signed-off-by: Matthew Wilcox (Oracle) Signed-off-by: Andrew Morton Reviewed-by: Anshuman Khandual Reviewed-by: David Hildenbrand Acked-by: Kirill A. 
Shutemov Link: http://lkml.kernel.org/r/20200517105051.9352-1-willy@infradead.org Signed-off-by: Linus Torvalds --- include/linux/mm.h | 4 ++-- mm/swap.c | 5 +---- mm/vmscan.c | 4 ++-- 3 files changed, 5 insertions(+), 8 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mm.h b/include/linux/mm.h index 32f3c17715ac..ff73187c3fd4 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -876,10 +876,10 @@ static inline void set_compound_page_dtor(struct page *page, page[1].compound_dtor = compound_dtor; } -static inline compound_page_dtor *get_compound_page_dtor(struct page *page) +static inline void destroy_compound_page(struct page *page) { VM_BUG_ON_PAGE(page[1].compound_dtor >= NR_COMPOUND_DTORS, page); - return compound_page_dtors[page[1].compound_dtor]; + compound_page_dtors[page[1].compound_dtor](page); } static inline unsigned int compound_order(struct page *page) diff --git a/mm/swap.c b/mm/swap.c index 0ac463d44cff..6e454a5c5ab9 100644 --- a/mm/swap.c +++ b/mm/swap.c @@ -102,8 +102,6 @@ static void __put_single_page(struct page *page) static void __put_compound_page(struct page *page) { - compound_page_dtor *dtor; - /* * __page_cache_release() is supposed to be called for thp, not for * hugetlb. This is because hugetlb page does never have PageLRU set @@ -112,8 +110,7 @@ static void __put_compound_page(struct page *page) */ if (!PageHuge(page)) __page_cache_release(page); - dtor = get_compound_page_dtor(page); - (*dtor)(page); + destroy_compound_page(page); } void __put_page(struct page *page) diff --git a/mm/vmscan.c b/mm/vmscan.c index 18bfbee9a581..1d1d25306ca2 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -1438,7 +1438,7 @@ free_it: * appear not as the counts should be low */ if (unlikely(PageTransHuge(page))) - (*get_compound_page_dtor(page))(page); + destroy_compound_page(page); else list_add(&page->lru, &free_pages); continue; @@ -1859,7 +1859,7 @@ static unsigned noinline_for_stack move_pages_to_lru(struct lruvec *lruvec, if (unlikely(PageCompound(page))) { spin_unlock_irq(&pgdat->lru_lock); - (*get_compound_page_dtor(page))(page); + destroy_compound_page(page); spin_lock_irq(&pgdat->lru_lock); } else list_add(&page->lru, &pages_to_free); -- cgit v1.2.3 From 1f318a9b0dc3990490e98eef48f21e6f15185781 Mon Sep 17 00:00:00 2001 From: Jaewon Kim Date: Wed, 3 Jun 2020 16:01:15 -0700 Subject: mm/vmscan: count lazyfree pages and fix nr_isolated_* mismatch Fix an nr_isolated_* mismatch problem between cma and dirty lazyfree pages. If try_to_unmap_one is used for reclaim and it detects a dirty lazyfree page, then the lazyfree page is changed to a normal anon page having SwapBacked by commit 802a3a92ad7a ("mm: reclaim MADV_FREE pages"). Even with the change, reclaim context correctly counts isolated files because it uses is_file_lru to distinguish file pages. And the change to anon does not happen if try_to_unmap_one is used for migration. So migration context like compaction also correctly counts isolated files even though it uses page_is_file_lru instead of is_file_lru. Recently page_is_file_cache was renamed to page_is_file_lru by commit 9de4f22a60f7 ("mm: code cleanup for MADV_FREE"). But the nr_isolated_* mismatch problem happens on cma alloc. There is reclaim_clean_pages_from_list which is being used only by cma. It was introduced by commit 02c6de8d757c ("mm: cma: discard clean pages during contiguous allocation instead of migration") to reclaim clean file pages without migration. The counter arithmetic behind the mismatch is sketched below. 
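A toy model of the broken accounting (plain C; the page count is made up, and nr_isolated[] stands in for the kernel's per-node NR_ISOLATED_ANON/NR_ISOLATED_FILE counters):

	#include <stdio.h>

	static long nr_isolated[2];	/* [0] = anon, [1] = file */

	int main(void)
	{
		long lazyfree = 100;	/* dirty MADV_FREE pages grabbed by cma */

		/* isolation: lazyfree pages sit on the file LRU, so they
		 * are accounted as isolated file pages */
		nr_isolated[1] += lazyfree;

		/* try_to_unmap_one() finds them dirty and sets SwapBacked:
		 * they are anon again, so putback decrements the anon count */
		nr_isolated[0] -= lazyfree;

		printf("anon=%ld file=%ld\n", nr_isolated[0], nr_isolated[1]);
		/* prints anon=-100 file=100: the stale file count eventually
		 * trips too_many_isolated(); the fix below compensates both
		 * counters by stat.nr_lazyfree_fail */
		return 0;
	}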
The cma alloc uses both reclaim_clean_pages_from_list and migrate_pages, and it uses page_is_file_lru to count isolated files. If there are dirty lazyfree pages allocated from the cma memory region, the pages are counted as isolated file at the beginning but are counted as isolated anon once reclaim finishes. Mem-Info: Node 0 active_anon:3045904kB inactive_anon:611448kB active_file:14892kB inactive_file:205636kB unevictable:10416kB isolated(anon):0kB isolated(file):37664kB mapped:630216kB dirty:384kB writeback:0kB shmem:42576kB writeback_tmp:0kB unstable:0kB all_unreclaimable? no As in the log above, there were far too many isolated file pages, 37664kB, which triggers too_many_isolated in reclaim even though no file pages are actually isolated system-wide. It can be reproduced by running two programs, one writing to MADV_FREE pages and one doing cma alloc. Although isolated anon is reported as 0, I found that the internal value of isolated anon was the negative of the isolated file count. Fix this by compensating the isolated count for both LRU lists. Count non-discarded lazyfree pages in shrink_page_list, then compensate the counted number in reclaim_clean_pages_from_list. Reported-by: Yong-Taek Lee Suggested-by: Minchan Kim Signed-off-by: Jaewon Kim Signed-off-by: Andrew Morton Acked-by: Minchan Kim Cc: Mel Gorman Cc: Johannes Weiner Cc: Marek Szyprowski Cc: Michal Nazarewicz Cc: Shaohua Li Link: http://lkml.kernel.org/r/20200426011718.30246-1-jaewon31.kim@samsung.com Signed-off-by: Linus Torvalds --- include/linux/vmstat.h | 1 + mm/vmscan.c | 26 ++++++++++++++++++++------ 2 files changed, 21 insertions(+), 6 deletions(-) (limited to 'include/linux') diff --git a/include/linux/vmstat.h b/include/linux/vmstat.h index 292485f3d24d..10cc932e209a 100644 --- a/include/linux/vmstat.h +++ b/include/linux/vmstat.h @@ -29,6 +29,7 @@ struct reclaim_stat { unsigned nr_activate[2]; unsigned nr_ref_keep; unsigned nr_unmap_fail; + unsigned nr_lazyfree_fail; }; enum writeback_stat_item { diff --git a/mm/vmscan.c b/mm/vmscan.c index 197eba50e157..8be3d52548ca 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -1295,11 +1295,15 @@ static unsigned long shrink_page_list(struct list_head *page_list, */ if (page_mapped(page)) { enum ttu_flags flags = ttu_flags | TTU_BATCH_FLUSH; + bool was_swapbacked = PageSwapBacked(page); if (unlikely(PageTransHuge(page))) flags |= TTU_SPLIT_HUGE_PMD; + if (!try_to_unmap(page, flags)) { stat->nr_unmap_fail += nr_pages; + if (!was_swapbacked && PageSwapBacked(page)) + stat->nr_lazyfree_fail += nr_pages; goto activate_locked; } } @@ -1491,8 +1495,8 @@ unsigned long reclaim_clean_pages_from_list(struct zone *zone, .priority = DEF_PRIORITY, .may_unmap = 1, }; - struct reclaim_stat dummy_stat; - unsigned long ret; + struct reclaim_stat stat; + unsigned long nr_reclaimed; struct page *page, *next; LIST_HEAD(clean_pages); @@ -1504,11 +1508,21 @@ unsigned long reclaim_clean_pages_from_list(struct zone *zone, } } - ret = shrink_page_list(&clean_pages, zone->zone_pgdat, &sc, - TTU_IGNORE_ACCESS, &dummy_stat, true); + nr_reclaimed = shrink_page_list(&clean_pages, zone->zone_pgdat, &sc, + TTU_IGNORE_ACCESS, &stat, true); list_splice(&clean_pages, page_list); - mod_node_page_state(zone->zone_pgdat, NR_ISOLATED_FILE, -ret); - return ret; + mod_node_page_state(zone->zone_pgdat, NR_ISOLATED_FILE, -nr_reclaimed); + /* + * Since lazyfree pages are isolated from file LRU from the beginning, + * they will rotate back to anonymous LRU in the end if it failed to discard so isolated count will be mismatched. 
+ * Compensate the isolated count for both LRU lists. + */ + mod_node_page_state(zone->zone_pgdat, NR_ISOLATED_ANON, + stat.nr_lazyfree_fail); + mod_node_page_state(zone->zone_pgdat, NR_ISOLATED_FILE, + -stat.nr_lazyfree_fail); + return nr_reclaimed; } /* -- cgit v1.2.3 From 3fba69a56e16e8dcf182fe6ca77735dd65a898aa Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Wed, 3 Jun 2020 16:01:31 -0700 Subject: mm: memcontrol: drop @compound parameter from memcg charging API The memcg charging API carries a boolean @compound parameter that tells whether the page we're dealing with is a hugepage. mem_cgroup_commit_charge() has another boolean @lrucare that indicates whether the page needs LRU locking or not while charging. The majority of callsites know those parameters at compile time, which results in a lot of naked "false, false" argument lists. This makes for cryptic code and is a breeding ground for subtle mistakes. Thankfully, the huge page state can be inferred from the page itself and doesn't need to be passed along. This is safe because charging completes before the page is published, and only after publication may somebody split it. Simplify the callsites by removing @compound, and let memcg infer the state by using hpage_nr_pages() unconditionally. That function does PageTransHuge() to identify huge pages, which also helpfully asserts that nobody passes in tail pages by accident. The following patches will introduce a new charging API, best not to carry over unnecessary weight. Signed-off-by: Johannes Weiner Signed-off-by: Andrew Morton Reviewed-by: Alex Shi Reviewed-by: Joonsoo Kim Reviewed-by: Shakeel Butt Cc: Hugh Dickins Cc: "Kirill A. Shutemov" Cc: Michal Hocko Cc: Roman Gushchin Cc: Balbir Singh Link: http://lkml.kernel.org/r/20200508183105.225460-4-hannes@cmpxchg.org Signed-off-by: Linus Torvalds --- include/linux/memcontrol.h | 22 ++++++++--------------- kernel/events/uprobes.c | 6 +++--- mm/filemap.c | 6 +++--- mm/huge_memory.c | 8 ++++---- mm/khugepaged.c | 20 ++++++++++---------- mm/memcontrol.c | 38 +++++++++++++++----------------------- mm/memory.c | 32 +++++++++++++++----------------- mm/migrate.c | 6 +++--- mm/shmem.c | 22 +++++++++------------- mm/swapfile.c | 9 ++++----- mm/userfaultfd.c | 6 +++--- 11 files changed, 77 insertions(+), 98 deletions(-) (limited to 'include/linux') diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index bfe9533bb67e..fc400002c7be 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -359,15 +359,12 @@ enum mem_cgroup_protection mem_cgroup_protected(struct mem_cgroup *root, struct mem_cgroup *memcg); int mem_cgroup_try_charge(struct page *page, struct mm_struct *mm, - gfp_t gfp_mask, struct mem_cgroup **memcgp, - bool compound); + gfp_t gfp_mask, struct mem_cgroup **memcgp); int mem_cgroup_try_charge_delay(struct page *page, struct mm_struct *mm, - gfp_t gfp_mask, struct mem_cgroup **memcgp, - bool compound); + gfp_t gfp_mask, struct mem_cgroup **memcgp); void mem_cgroup_commit_charge(struct page *page, struct mem_cgroup *memcg, - bool lrucare, bool compound); + bool lrucare); -void mem_cgroup_cancel_charge(struct page *page, struct mem_cgroup *memcg, - bool compound); +void mem_cgroup_cancel_charge(struct page *page, struct mem_cgroup *memcg); void mem_cgroup_uncharge(struct page *page); void mem_cgroup_uncharge_list(struct list_head *page_list); @@ -849,8 +846,7 @@ static inline enum mem_cgroup_protection mem_cgroup_protected( static inline int mem_cgroup_try_charge(struct page *page, struct mm_struct *mm, gfp_t gfp_mask, 
- struct mem_cgroup **memcgp, - bool compound) + struct mem_cgroup **memcgp) { *memcgp = NULL; return 0; @@ -859,8 +855,7 @@ static inline int mem_cgroup_try_charge(struct page *page, struct mm_struct *mm, static inline int mem_cgroup_try_charge_delay(struct page *page, struct mm_struct *mm, gfp_t gfp_mask, - struct mem_cgroup **memcgp, - bool compound) + struct mem_cgroup **memcgp) { *memcgp = NULL; return 0; @@ -868,13 +863,12 @@ static inline int mem_cgroup_try_charge_delay(struct page *page, static inline void mem_cgroup_commit_charge(struct page *page, struct mem_cgroup *memcg, - bool lrucare, bool compound) + bool lrucare) { } static inline void mem_cgroup_cancel_charge(struct page *page, - struct mem_cgroup *memcg, - bool compound) + struct mem_cgroup *memcg) { } diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c index ece7e13f6e4a..40e7488ce467 100644 --- a/kernel/events/uprobes.c +++ b/kernel/events/uprobes.c @@ -169,7 +169,7 @@ static int __replace_page(struct vm_area_struct *vma, unsigned long addr, if (new_page) { err = mem_cgroup_try_charge(new_page, vma->vm_mm, GFP_KERNEL, - &memcg, false); + &memcg); if (err) return err; } @@ -181,7 +181,7 @@ static int __replace_page(struct vm_area_struct *vma, unsigned long addr, err = -EAGAIN; if (!page_vma_mapped_walk(&pvmw)) { if (new_page) - mem_cgroup_cancel_charge(new_page, memcg, false); + mem_cgroup_cancel_charge(new_page, memcg); goto unlock; } VM_BUG_ON_PAGE(addr != pvmw.address, old_page); @@ -189,7 +189,7 @@ static int __replace_page(struct vm_area_struct *vma, unsigned long addr, if (new_page) { get_page(new_page); page_add_new_anon_rmap(new_page, vma, addr, false); - mem_cgroup_commit_charge(new_page, memcg, false, false); + mem_cgroup_commit_charge(new_page, memcg, false); lru_cache_add_active_or_unevictable(new_page, vma); } else /* no new page, just dec_mm_counter for old_page */ diff --git a/mm/filemap.c b/mm/filemap.c index 9af4af660ef1..f268cfffbc6e 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -842,7 +842,7 @@ static int __add_to_page_cache_locked(struct page *page, if (!huge) { error = mem_cgroup_try_charge(page, current->mm, - gfp_mask, &memcg, false); + gfp_mask, &memcg); if (error) return error; } @@ -878,14 +878,14 @@ unlock: goto error; if (!huge) - mem_cgroup_commit_charge(page, memcg, false, false); + mem_cgroup_commit_charge(page, memcg, false); trace_mm_filemap_add_to_page_cache(page); return 0; error: page->mapping = NULL; /* Leave page->index set: truncation relies upon it */ if (!huge) - mem_cgroup_cancel_charge(page, memcg, false); + mem_cgroup_cancel_charge(page, memcg); put_page(page); return xas_error(&xas); } diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 693c0ff2256c..672e34932b53 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -594,7 +594,7 @@ static vm_fault_t __do_huge_pmd_anonymous_page(struct vm_fault *vmf, VM_BUG_ON_PAGE(!PageCompound(page), page); - if (mem_cgroup_try_charge_delay(page, vma->vm_mm, gfp, &memcg, true)) { + if (mem_cgroup_try_charge_delay(page, vma->vm_mm, gfp, &memcg)) { put_page(page); count_vm_event(THP_FAULT_FALLBACK); count_vm_event(THP_FAULT_FALLBACK_CHARGE); @@ -630,7 +630,7 @@ static vm_fault_t __do_huge_pmd_anonymous_page(struct vm_fault *vmf, vm_fault_t ret2; spin_unlock(vmf->ptl); - mem_cgroup_cancel_charge(page, memcg, true); + mem_cgroup_cancel_charge(page, memcg); put_page(page); pte_free(vma->vm_mm, pgtable); ret2 = handle_userfault(vmf, VM_UFFD_MISSING); @@ -641,7 +641,7 @@ static vm_fault_t __do_huge_pmd_anonymous_page(struct 
vm_fault *vmf, entry = mk_huge_pmd(page, vma->vm_page_prot); entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma); page_add_new_anon_rmap(page, vma, haddr, true); - mem_cgroup_commit_charge(page, memcg, false, true); + mem_cgroup_commit_charge(page, memcg, false); lru_cache_add_active_or_unevictable(page, vma); pgtable_trans_huge_deposit(vma->vm_mm, vmf->pmd, pgtable); set_pmd_at(vma->vm_mm, haddr, vmf->pmd, entry); @@ -658,7 +658,7 @@ unlock_release: release: if (pgtable) pte_free(vma->vm_mm, pgtable); - mem_cgroup_cancel_charge(page, memcg, true); + mem_cgroup_cancel_charge(page, memcg); put_page(page); return ret; diff --git a/mm/khugepaged.c b/mm/khugepaged.c index e7897a7dfbec..0a29b51494fd 100644 --- a/mm/khugepaged.c +++ b/mm/khugepaged.c @@ -1060,7 +1060,7 @@ static void collapse_huge_page(struct mm_struct *mm, goto out_nolock; } - if (unlikely(mem_cgroup_try_charge(new_page, mm, gfp, &memcg, true))) { + if (unlikely(mem_cgroup_try_charge(new_page, mm, gfp, &memcg))) { result = SCAN_CGROUP_CHARGE_FAIL; goto out_nolock; } @@ -1068,7 +1068,7 @@ static void collapse_huge_page(struct mm_struct *mm, down_read(&mm->mmap_sem); result = hugepage_vma_revalidate(mm, address, &vma); if (result) { - mem_cgroup_cancel_charge(new_page, memcg, true); + mem_cgroup_cancel_charge(new_page, memcg); up_read(&mm->mmap_sem); goto out_nolock; } @@ -1076,7 +1076,7 @@ static void collapse_huge_page(struct mm_struct *mm, pmd = mm_find_pmd(mm, address); if (!pmd) { result = SCAN_PMD_NULL; - mem_cgroup_cancel_charge(new_page, memcg, true); + mem_cgroup_cancel_charge(new_page, memcg); up_read(&mm->mmap_sem); goto out_nolock; } @@ -1088,7 +1088,7 @@ static void collapse_huge_page(struct mm_struct *mm, */ if (unmapped && !__collapse_huge_page_swapin(mm, vma, address, pmd, referenced)) { - mem_cgroup_cancel_charge(new_page, memcg, true); + mem_cgroup_cancel_charge(new_page, memcg); up_read(&mm->mmap_sem); goto out_nolock; } @@ -1176,7 +1176,7 @@ static void collapse_huge_page(struct mm_struct *mm, spin_lock(pmd_ptl); BUG_ON(!pmd_none(*pmd)); page_add_new_anon_rmap(new_page, vma, address, true); - mem_cgroup_commit_charge(new_page, memcg, false, true); + mem_cgroup_commit_charge(new_page, memcg, false); count_memcg_events(memcg, THP_COLLAPSE_ALLOC, 1); lru_cache_add_active_or_unevictable(new_page, vma); pgtable_trans_huge_deposit(mm, pmd, pgtable); @@ -1194,7 +1194,7 @@ out_nolock: trace_mm_collapse_huge_page(mm, isolated, result); return; out: - mem_cgroup_cancel_charge(new_page, memcg, true); + mem_cgroup_cancel_charge(new_page, memcg); goto out_up_write; } @@ -1637,7 +1637,7 @@ static void collapse_file(struct mm_struct *mm, goto out; } - if (unlikely(mem_cgroup_try_charge(new_page, mm, gfp, &memcg, true))) { + if (unlikely(mem_cgroup_try_charge(new_page, mm, gfp, &memcg))) { result = SCAN_CGROUP_CHARGE_FAIL; goto out; } @@ -1650,7 +1650,7 @@ static void collapse_file(struct mm_struct *mm, break; xas_unlock_irq(&xas); if (!xas_nomem(&xas, GFP_KERNEL)) { - mem_cgroup_cancel_charge(new_page, memcg, true); + mem_cgroup_cancel_charge(new_page, memcg); result = SCAN_FAIL; goto out; } @@ -1887,7 +1887,7 @@ xa_unlocked: SetPageUptodate(new_page); page_ref_add(new_page, HPAGE_PMD_NR - 1); - mem_cgroup_commit_charge(new_page, memcg, false, true); + mem_cgroup_commit_charge(new_page, memcg, false); if (is_shmem) { set_page_dirty(new_page); @@ -1942,7 +1942,7 @@ xa_unlocked: VM_BUG_ON(nr_none); xas_unlock_irq(&xas); - mem_cgroup_cancel_charge(new_page, memcg, true); + mem_cgroup_cancel_charge(new_page, memcg); 
new_page->mapping = NULL; } diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 8f95896c5aa1..340c580f8363 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -834,7 +834,7 @@ static unsigned long memcg_events_local(struct mem_cgroup *memcg, int event) static void mem_cgroup_charge_statistics(struct mem_cgroup *memcg, struct page *page, - bool compound, int nr_pages) + int nr_pages) { /* * Here, RSS means 'mapped anon' and anon's SwapCache. Shmem/tmpfs is @@ -848,7 +848,7 @@ static void mem_cgroup_charge_statistics(struct mem_cgroup *memcg, __mod_memcg_state(memcg, NR_SHMEM, nr_pages); } - if (compound) { + if (abs(nr_pages) > 1) { VM_BUG_ON_PAGE(!PageTransHuge(page), page); __mod_memcg_state(memcg, MEMCG_RSS_HUGE, nr_pages); } @@ -5501,9 +5501,9 @@ static int mem_cgroup_move_account(struct page *page, ret = 0; local_irq_disable(); - mem_cgroup_charge_statistics(to, page, compound, nr_pages); + mem_cgroup_charge_statistics(to, page, nr_pages); memcg_check_events(to, page); - mem_cgroup_charge_statistics(from, page, compound, -nr_pages); + mem_cgroup_charge_statistics(from, page, -nr_pages); memcg_check_events(from, page); local_irq_enable(); out_unlock: @@ -6494,7 +6494,6 @@ out: * @mm: mm context of the victim * @gfp_mask: reclaim mode * @memcgp: charged memcg return - * @compound: charge the page as compound or small page * * Try to charge @page to the memcg that @mm belongs to, reclaiming * pages according to @gfp_mask if necessary. @@ -6507,11 +6506,10 @@ out: * with mem_cgroup_cancel_charge() in case page instantiation fails. */ int mem_cgroup_try_charge(struct page *page, struct mm_struct *mm, - gfp_t gfp_mask, struct mem_cgroup **memcgp, - bool compound) + gfp_t gfp_mask, struct mem_cgroup **memcgp) { + unsigned int nr_pages = hpage_nr_pages(page); struct mem_cgroup *memcg = NULL; - unsigned int nr_pages = compound ? hpage_nr_pages(page) : 1; int ret = 0; if (mem_cgroup_disabled()) @@ -6553,13 +6551,12 @@ out: } int mem_cgroup_try_charge_delay(struct page *page, struct mm_struct *mm, - gfp_t gfp_mask, struct mem_cgroup **memcgp, - bool compound) + gfp_t gfp_mask, struct mem_cgroup **memcgp) { struct mem_cgroup *memcg; int ret; - ret = mem_cgroup_try_charge(page, mm, gfp_mask, memcgp, compound); + ret = mem_cgroup_try_charge(page, mm, gfp_mask, memcgp); memcg = *memcgp; mem_cgroup_throttle_swaprate(memcg, page_to_nid(page), gfp_mask); return ret; @@ -6570,7 +6567,6 @@ int mem_cgroup_try_charge_delay(struct page *page, struct mm_struct *mm, * @page: page to charge * @memcg: memcg to charge the page to * @lrucare: page might be on LRU already - * @compound: charge the page as compound or small page * * Finalize a charge transaction started by mem_cgroup_try_charge(), * after page->mapping has been set up. This must happen atomically @@ -6583,9 +6579,9 @@ int mem_cgroup_try_charge_delay(struct page *page, struct mm_struct *mm, * Use mem_cgroup_cancel_charge() to cancel the transaction instead. */ void mem_cgroup_commit_charge(struct page *page, struct mem_cgroup *memcg, - bool lrucare, bool compound) + bool lrucare) { - unsigned int nr_pages = compound ? 
hpage_nr_pages(page) : 1; + unsigned int nr_pages = hpage_nr_pages(page); VM_BUG_ON_PAGE(!page->mapping, page); VM_BUG_ON_PAGE(PageLRU(page) && !lrucare, page); @@ -6603,7 +6599,7 @@ void mem_cgroup_commit_charge(struct page *page, struct mem_cgroup *memcg, commit_charge(page, memcg, lrucare); local_irq_disable(); - mem_cgroup_charge_statistics(memcg, page, compound, nr_pages); + mem_cgroup_charge_statistics(memcg, page, nr_pages); memcg_check_events(memcg, page); local_irq_enable(); @@ -6622,14 +6618,12 @@ void mem_cgroup_commit_charge(struct page *page, struct mem_cgroup *memcg, * mem_cgroup_cancel_charge - cancel a page charge * @page: page to charge * @memcg: memcg to charge the page to - * @compound: charge the page as compound or small page * * Cancel a charge transaction started by mem_cgroup_try_charge(). */ -void mem_cgroup_cancel_charge(struct page *page, struct mem_cgroup *memcg, - bool compound) +void mem_cgroup_cancel_charge(struct page *page, struct mem_cgroup *memcg) { - unsigned int nr_pages = compound ? hpage_nr_pages(page) : 1; + unsigned int nr_pages = hpage_nr_pages(page); if (mem_cgroup_disabled()) return; @@ -6844,8 +6838,7 @@ void mem_cgroup_migrate(struct page *oldpage, struct page *newpage) commit_charge(newpage, memcg, false); local_irq_save(flags); - mem_cgroup_charge_statistics(memcg, newpage, PageTransHuge(newpage), - nr_pages); + mem_cgroup_charge_statistics(memcg, newpage, nr_pages); memcg_check_events(memcg, newpage); local_irq_restore(flags); } @@ -7075,8 +7068,7 @@ void mem_cgroup_swapout(struct page *page, swp_entry_t entry) * only synchronisation we have for updating the per-CPU variables. */ VM_BUG_ON(!irqs_disabled()); - mem_cgroup_charge_statistics(memcg, page, PageTransHuge(page), - -nr_entries); + mem_cgroup_charge_statistics(memcg, page, -nr_entries); memcg_check_events(memcg, page); if (!mem_cgroup_is_root(memcg)) diff --git a/mm/memory.c b/mm/memory.c index 21438278afca..6b8c5900e9a4 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -2676,7 +2676,7 @@ static vm_fault_t wp_page_copy(struct vm_fault *vmf) } } - if (mem_cgroup_try_charge_delay(new_page, mm, GFP_KERNEL, &memcg, false)) + if (mem_cgroup_try_charge_delay(new_page, mm, GFP_KERNEL, &memcg)) goto oom_free_new; __SetPageUptodate(new_page); @@ -2711,7 +2711,7 @@ static vm_fault_t wp_page_copy(struct vm_fault *vmf) */ ptep_clear_flush_notify(vma, vmf->address, vmf->pte); page_add_new_anon_rmap(new_page, vma, vmf->address, false); - mem_cgroup_commit_charge(new_page, memcg, false, false); + mem_cgroup_commit_charge(new_page, memcg, false); lru_cache_add_active_or_unevictable(new_page, vma); /* * We call the notify macro here because, when using secondary @@ -2750,7 +2750,7 @@ static vm_fault_t wp_page_copy(struct vm_fault *vmf) new_page = old_page; page_copied = 1; } else { - mem_cgroup_cancel_charge(new_page, memcg, false); + mem_cgroup_cancel_charge(new_page, memcg); } if (new_page) @@ -3193,8 +3193,7 @@ vm_fault_t do_swap_page(struct vm_fault *vmf) goto out_page; } - if (mem_cgroup_try_charge_delay(page, vma->vm_mm, GFP_KERNEL, - &memcg, false)) { + if (mem_cgroup_try_charge_delay(page, vma->vm_mm, GFP_KERNEL, &memcg)) { ret = VM_FAULT_OOM; goto out_page; } @@ -3245,11 +3244,11 @@ vm_fault_t do_swap_page(struct vm_fault *vmf) /* ksm created a completely new copy */ if (unlikely(page != swapcache && swapcache)) { page_add_new_anon_rmap(page, vma, vmf->address, false); - mem_cgroup_commit_charge(page, memcg, false, false); + mem_cgroup_commit_charge(page, memcg, false); 
lru_cache_add_active_or_unevictable(page, vma); } else { do_page_add_anon_rmap(page, vma, vmf->address, exclusive); - mem_cgroup_commit_charge(page, memcg, true, false); + mem_cgroup_commit_charge(page, memcg, true); activate_page(page); } @@ -3285,7 +3284,7 @@ unlock: out: return ret; out_nomap: - mem_cgroup_cancel_charge(page, memcg, false); + mem_cgroup_cancel_charge(page, memcg); pte_unmap_unlock(vmf->pte, vmf->ptl); out_page: unlock_page(page); @@ -3359,8 +3358,7 @@ static vm_fault_t do_anonymous_page(struct vm_fault *vmf) if (!page) goto oom; - if (mem_cgroup_try_charge_delay(page, vma->vm_mm, GFP_KERNEL, &memcg, - false)) + if (mem_cgroup_try_charge_delay(page, vma->vm_mm, GFP_KERNEL, &memcg)) goto oom_free_page; /* @@ -3386,14 +3384,14 @@ static vm_fault_t do_anonymous_page(struct vm_fault *vmf) /* Deliver the page fault to userland, check inside PT lock */ if (userfaultfd_missing(vma)) { pte_unmap_unlock(vmf->pte, vmf->ptl); - mem_cgroup_cancel_charge(page, memcg, false); + mem_cgroup_cancel_charge(page, memcg); put_page(page); return handle_userfault(vmf, VM_UFFD_MISSING); } inc_mm_counter_fast(vma->vm_mm, MM_ANONPAGES); page_add_new_anon_rmap(page, vma, vmf->address, false); - mem_cgroup_commit_charge(page, memcg, false, false); + mem_cgroup_commit_charge(page, memcg, false); lru_cache_add_active_or_unevictable(page, vma); setpte: set_pte_at(vma->vm_mm, vmf->address, vmf->pte, entry); @@ -3404,7 +3402,7 @@ unlock: pte_unmap_unlock(vmf->pte, vmf->ptl); return ret; release: - mem_cgroup_cancel_charge(page, memcg, false); + mem_cgroup_cancel_charge(page, memcg); put_page(page); goto unlock; oom_free_page: @@ -3655,7 +3653,7 @@ vm_fault_t alloc_set_pte(struct vm_fault *vmf, struct mem_cgroup *memcg, if (write && !(vma->vm_flags & VM_SHARED)) { inc_mm_counter_fast(vma->vm_mm, MM_ANONPAGES); page_add_new_anon_rmap(page, vma, vmf->address, false); - mem_cgroup_commit_charge(page, memcg, false, false); + mem_cgroup_commit_charge(page, memcg, false); lru_cache_add_active_or_unevictable(page, vma); } else { inc_mm_counter_fast(vma->vm_mm, mm_counter_file(page)); @@ -3864,8 +3862,8 @@ static vm_fault_t do_cow_fault(struct vm_fault *vmf) if (!vmf->cow_page) return VM_FAULT_OOM; - if (mem_cgroup_try_charge_delay(vmf->cow_page, vma->vm_mm, GFP_KERNEL, - &vmf->memcg, false)) { + if (mem_cgroup_try_charge_delay(vmf->cow_page, vma->vm_mm, + GFP_KERNEL, &vmf->memcg)) { put_page(vmf->cow_page); return VM_FAULT_OOM; } @@ -3886,7 +3884,7 @@ static vm_fault_t do_cow_fault(struct vm_fault *vmf) goto uncharge_out; return ret; uncharge_out: - mem_cgroup_cancel_charge(vmf->cow_page, vmf->memcg, false); + mem_cgroup_cancel_charge(vmf->cow_page, vmf->memcg); put_page(vmf->cow_page); return ret; } diff --git a/mm/migrate.c b/mm/migrate.c index 846af96b84a5..93c732213d12 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -2780,7 +2780,7 @@ static void migrate_vma_insert_page(struct migrate_vma *migrate, if (unlikely(anon_vma_prepare(vma))) goto abort; - if (mem_cgroup_try_charge(page, vma->vm_mm, GFP_KERNEL, &memcg, false)) + if (mem_cgroup_try_charge(page, vma->vm_mm, GFP_KERNEL, &memcg)) goto abort; /* @@ -2826,7 +2826,7 @@ static void migrate_vma_insert_page(struct migrate_vma *migrate, inc_mm_counter(mm, MM_ANONPAGES); page_add_new_anon_rmap(page, vma, addr, false); - mem_cgroup_commit_charge(page, memcg, false, false); + mem_cgroup_commit_charge(page, memcg, false); if (!is_zone_device_page(page)) lru_cache_add_active_or_unevictable(page, vma); get_page(page); @@ -2848,7 +2848,7 @@ static void 
migrate_vma_insert_page(struct migrate_vma *migrate, unlock_abort: pte_unmap_unlock(ptep, ptl); - mem_cgroup_cancel_charge(page, memcg, false); + mem_cgroup_cancel_charge(page, memcg); abort: *src &= ~MIGRATE_PFN_MIGRATE; } diff --git a/mm/shmem.c b/mm/shmem.c index bd8840082c94..d505b6cce4ab 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -1664,8 +1664,7 @@ static int shmem_swapin_page(struct inode *inode, pgoff_t index, goto failed; } - error = mem_cgroup_try_charge_delay(page, charge_mm, gfp, &memcg, - false); + error = mem_cgroup_try_charge_delay(page, charge_mm, gfp, &memcg); if (!error) { error = shmem_add_to_page_cache(page, mapping, index, swp_to_radix_entry(swap), gfp); @@ -1680,14 +1679,14 @@ static int shmem_swapin_page(struct inode *inode, pgoff_t index, * the rest. */ if (error) { - mem_cgroup_cancel_charge(page, memcg, false); + mem_cgroup_cancel_charge(page, memcg); delete_from_swap_cache(page); } } if (error) goto failed; - mem_cgroup_commit_charge(page, memcg, true, false); + mem_cgroup_commit_charge(page, memcg, true); spin_lock_irq(&info->lock); info->swapped--; @@ -1859,8 +1858,7 @@ alloc_nohuge: if (sgp == SGP_WRITE) __SetPageReferenced(page); - error = mem_cgroup_try_charge_delay(page, charge_mm, gfp, &memcg, - PageTransHuge(page)); + error = mem_cgroup_try_charge_delay(page, charge_mm, gfp, &memcg); if (error) { if (PageTransHuge(page)) { count_vm_event(THP_FILE_FALLBACK); @@ -1871,12 +1869,10 @@ alloc_nohuge: error = shmem_add_to_page_cache(page, mapping, hindex, NULL, gfp & GFP_RECLAIM_MASK); if (error) { - mem_cgroup_cancel_charge(page, memcg, - PageTransHuge(page)); + mem_cgroup_cancel_charge(page, memcg); goto unacct; } - mem_cgroup_commit_charge(page, memcg, false, - PageTransHuge(page)); + mem_cgroup_commit_charge(page, memcg, false); lru_cache_add_anon(page); spin_lock_irq(&info->lock); @@ -2364,7 +2360,7 @@ static int shmem_mfill_atomic_pte(struct mm_struct *dst_mm, if (unlikely(offset >= max_off)) goto out_release; - ret = mem_cgroup_try_charge_delay(page, dst_mm, gfp, &memcg, false); + ret = mem_cgroup_try_charge_delay(page, dst_mm, gfp, &memcg); if (ret) goto out_release; @@ -2373,7 +2369,7 @@ static int shmem_mfill_atomic_pte(struct mm_struct *dst_mm, if (ret) goto out_release_uncharge; - mem_cgroup_commit_charge(page, memcg, false, false); + mem_cgroup_commit_charge(page, memcg, false); _dst_pte = mk_pte(page, dst_vma->vm_page_prot); if (dst_vma->vm_flags & VM_WRITE) @@ -2424,7 +2420,7 @@ out_release_uncharge_unlock: ClearPageDirty(page); delete_from_page_cache(page); out_release_uncharge: - mem_cgroup_cancel_charge(page, memcg, false); + mem_cgroup_cancel_charge(page, memcg); out_release: unlock_page(page); put_page(page); diff --git a/mm/swapfile.c b/mm/swapfile.c index 63ac67208453..40a80617cb03 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -1902,15 +1902,14 @@ static int unuse_pte(struct vm_area_struct *vma, pmd_t *pmd, if (unlikely(!page)) return -ENOMEM; - if (mem_cgroup_try_charge(page, vma->vm_mm, GFP_KERNEL, - &memcg, false)) { + if (mem_cgroup_try_charge(page, vma->vm_mm, GFP_KERNEL, &memcg)) { ret = -ENOMEM; goto out_nolock; } pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl); if (unlikely(!pte_same_as_swp(*pte, swp_entry_to_pte(entry)))) { - mem_cgroup_cancel_charge(page, memcg, false); + mem_cgroup_cancel_charge(page, memcg); ret = 0; goto out; } @@ -1922,10 +1921,10 @@ static int unuse_pte(struct vm_area_struct *vma, pmd_t *pmd, pte_mkold(mk_pte(page, vma->vm_page_prot))); if (page == swapcache) { page_add_anon_rmap(page, vma, addr, 
false); - mem_cgroup_commit_charge(page, memcg, true, false); + mem_cgroup_commit_charge(page, memcg, true); } else { /* ksm created a completely new copy */ page_add_new_anon_rmap(page, vma, addr, false); - mem_cgroup_commit_charge(page, memcg, false, false); + mem_cgroup_commit_charge(page, memcg, false); lru_cache_add_active_or_unevictable(page, vma); } swap_free(entry); diff --git a/mm/userfaultfd.c b/mm/userfaultfd.c index 512576e171ce..bb57d0a3fca7 100644 --- a/mm/userfaultfd.c +++ b/mm/userfaultfd.c @@ -97,7 +97,7 @@ static int mcopy_atomic_pte(struct mm_struct *dst_mm, __SetPageUptodate(page); ret = -ENOMEM; - if (mem_cgroup_try_charge(page, dst_mm, GFP_KERNEL, &memcg, false)) + if (mem_cgroup_try_charge(page, dst_mm, GFP_KERNEL, &memcg)) goto out_release; _dst_pte = pte_mkdirty(mk_pte(page, dst_vma->vm_page_prot)); @@ -124,7 +124,7 @@ static int mcopy_atomic_pte(struct mm_struct *dst_mm, inc_mm_counter(dst_mm, MM_ANONPAGES); page_add_new_anon_rmap(page, dst_vma, dst_addr, false); - mem_cgroup_commit_charge(page, memcg, false, false); + mem_cgroup_commit_charge(page, memcg, false); lru_cache_add_active_or_unevictable(page, dst_vma); set_pte_at(dst_mm, dst_addr, dst_pte, _dst_pte); @@ -138,7 +138,7 @@ out: return ret; out_release_uncharge_unlock: pte_unmap_unlock(dst_pte, ptl); - mem_cgroup_cancel_charge(page, memcg, false); + mem_cgroup_cancel_charge(page, memcg); out_release: put_page(page); goto out; -- cgit v1.2.3 From 6caa6a0703e03236f46461342e31ca53d0e3c091 Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Wed, 3 Jun 2020 16:01:38 -0700 Subject: mm: memcontrol: move out cgroup swaprate throttling The cgroup swaprate throttling is about matching new anon allocations to the rate of available IO when that is being throttled. It's the io controller hooking into the VM, rather than a memory controller thing. Rename mem_cgroup_throttle_swaprate() to cgroup_throttle_swaprate(), and drop the @memcg argument which is only used to check whether the preceding page charge has succeeded and the fault is proceeding. We could decouple the call from mem_cgroup_try_charge() here as well, but that would cause unnecessary churn: the following patches convert all callsites to a new charge API and we'll decouple as we go along. Signed-off-by: Johannes Weiner Signed-off-by: Andrew Morton Reviewed-by: Alex Shi Reviewed-by: Joonsoo Kim Reviewed-by: Shakeel Butt Cc: Hugh Dickins Cc: "Kirill A. 
Shutemov" Cc: Michal Hocko Cc: Roman Gushchin Cc: Balbir Singh Link: http://lkml.kernel.org/r/20200508183105.225460-5-hannes@cmpxchg.org Signed-off-by: Linus Torvalds --- include/linux/swap.h | 6 ++---- mm/memcontrol.c | 5 ++--- mm/swapfile.c | 14 +++++++------- 3 files changed, 11 insertions(+), 14 deletions(-) (limited to 'include/linux') diff --git a/include/linux/swap.h b/include/linux/swap.h index e92176fc8824..6cea1eb97d45 100644 --- a/include/linux/swap.h +++ b/include/linux/swap.h @@ -651,11 +651,9 @@ static inline int mem_cgroup_swappiness(struct mem_cgroup *mem) #endif #if defined(CONFIG_SWAP) && defined(CONFIG_MEMCG) && defined(CONFIG_BLK_CGROUP) -extern void mem_cgroup_throttle_swaprate(struct mem_cgroup *memcg, int node, - gfp_t gfp_mask); +extern void cgroup_throttle_swaprate(struct page *page, gfp_t gfp_mask); #else -static inline void mem_cgroup_throttle_swaprate(struct mem_cgroup *memcg, - int node, gfp_t gfp_mask) +static inline void cgroup_throttle_swaprate(struct page *page, gfp_t gfp_mask) { } #endif diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 340c580f8363..bc0f55d0cc08 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -6553,12 +6553,11 @@ out: int mem_cgroup_try_charge_delay(struct page *page, struct mm_struct *mm, gfp_t gfp_mask, struct mem_cgroup **memcgp) { - struct mem_cgroup *memcg; int ret; ret = mem_cgroup_try_charge(page, mm, gfp_mask, memcgp); - memcg = *memcgp; - mem_cgroup_throttle_swaprate(memcg, page_to_nid(page), gfp_mask); + if (*memcgp) + cgroup_throttle_swaprate(page, gfp_mask); return ret; } diff --git a/mm/swapfile.c b/mm/swapfile.c index 40a80617cb03..1829fc4b3ca2 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -3798,11 +3798,12 @@ static void free_swap_count_continuations(struct swap_info_struct *si) } #if defined(CONFIG_MEMCG) && defined(CONFIG_BLK_CGROUP) -void mem_cgroup_throttle_swaprate(struct mem_cgroup *memcg, int node, - gfp_t gfp_mask) +void cgroup_throttle_swaprate(struct page *page, gfp_t gfp_mask) { struct swap_info_struct *si, *next; - if (!(gfp_mask & __GFP_IO) || !memcg) + int nid = page_to_nid(page); + + if (!(gfp_mask & __GFP_IO)) return; if (!blk_cgroup_congested()) @@ -3816,11 +3817,10 @@ void mem_cgroup_throttle_swaprate(struct mem_cgroup *memcg, int node, return; spin_lock(&swap_avail_lock); - plist_for_each_entry_safe(si, next, &swap_avail_heads[node], - avail_lists[node]) { + plist_for_each_entry_safe(si, next, &swap_avail_heads[nid], + avail_lists[nid]) { if (si->bdev) { - blkcg_schedule_throttle(bdev_get_queue(si->bdev), - true); + blkcg_schedule_throttle(bdev_get_queue(si->bdev), true); break; } } -- cgit v1.2.3 From 3fea5a499d57dec46043fcdb08e38eac1767bb0d Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Wed, 3 Jun 2020 16:01:41 -0700 Subject: mm: memcontrol: convert page cache to a new mem_cgroup_charge() API The try/commit/cancel protocol that memcg uses dates back to when pages used to be uncharged upon removal from the page cache, and thus couldn't be committed before the insertion had succeeded. Nowadays, pages are uncharged when they are physically freed; it doesn't matter whether the insertion was successful or not. For the page cache, the transaction dance has become unnecessary. Introduce a mem_cgroup_charge() function that simply charges a newly allocated page to a cgroup and sets up page->mem_cgroup in one single step. If the insertion fails, the caller doesn't have to do anything but free/put the page. Then switch the page cache over to this new API. 
Subsequent patches will also convert anon pages, but it needs a bit more prep work. Right now, memcg depends on page->mapping being already set up at the time of charging, so that it can maintain its own MEMCG_CACHE and MEMCG_RSS counters. For anon, page->mapping is set under the same pte lock under which the page is published, so a single charge point that can block doesn't work there just yet. The following prep patches will replace the private memcg counters with the generic vmstat counters, thus removing the page->mapping dependency, then complete the transition to the new single-point charge API and delete the old transactional scheme. v2: leave shmem swapcache when charging fails to avoid double IO (Joonsoo) v3: rebase on preceding shmem simplification patch Signed-off-by: Johannes Weiner Signed-off-by: Andrew Morton Reviewed-by: Alex Shi Cc: Hugh Dickins Cc: Joonsoo Kim Cc: "Kirill A. Shutemov" Cc: Michal Hocko Cc: Roman Gushchin Cc: Shakeel Butt Cc: Balbir Singh Link: http://lkml.kernel.org/r/20200508183105.225460-6-hannes@cmpxchg.org Signed-off-by: Linus Torvalds --- include/linux/memcontrol.h | 10 +++++++ mm/filemap.c | 24 +++++++-------- mm/memcontrol.c | 29 ++++++++++++++++-- mm/shmem.c | 73 +++++++++++++++++++--------------------------- 4 files changed, 77 insertions(+), 59 deletions(-) (limited to 'include/linux') diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index fc400002c7be..898925bdd676 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -365,6 +365,10 @@ int mem_cgroup_try_charge_delay(struct page *page, struct mm_struct *mm, void mem_cgroup_commit_charge(struct page *page, struct mem_cgroup *memcg, bool lrucare); void mem_cgroup_cancel_charge(struct page *page, struct mem_cgroup *memcg); + +int mem_cgroup_charge(struct page *page, struct mm_struct *mm, gfp_t gfp_mask, + bool lrucare); + void mem_cgroup_uncharge(struct page *page); void mem_cgroup_uncharge_list(struct list_head *page_list); @@ -872,6 +876,12 @@ static inline void mem_cgroup_cancel_charge(struct page *page, { } +static inline int mem_cgroup_charge(struct page *page, struct mm_struct *mm, + gfp_t gfp_mask, bool lrucare) +{ + return 0; +} + static inline void mem_cgroup_uncharge(struct page *page) { } diff --git a/mm/filemap.c b/mm/filemap.c index f268cfffbc6e..38e6a37336a6 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -832,7 +832,6 @@ static int __add_to_page_cache_locked(struct page *page, { XA_STATE(xas, &mapping->i_pages, offset); int huge = PageHuge(page); - struct mem_cgroup *memcg; int error; void *old; @@ -840,17 +839,16 @@ static int __add_to_page_cache_locked(struct page *page, VM_BUG_ON_PAGE(PageSwapBacked(page), page); mapping_set_update(&xas, mapping); - if (!huge) { - error = mem_cgroup_try_charge(page, current->mm, - gfp_mask, &memcg); - if (error) - return error; - } - get_page(page); page->mapping = mapping; page->index = offset; + if (!huge) { + error = mem_cgroup_charge(page, current->mm, gfp_mask, false); + if (error) + goto error; + } + do { xas_lock_irq(&xas); old = xas_load(&xas); @@ -874,20 +872,18 @@ unlock: xas_unlock_irq(&xas); } while (xas_nomem(&xas, gfp_mask & GFP_RECLAIM_MASK)); - if (xas_error(&xas)) + if (xas_error(&xas)) { + error = xas_error(&xas); goto error; + } - if (!huge) - mem_cgroup_commit_charge(page, memcg, false); trace_mm_filemap_add_to_page_cache(page); return 0; error: page->mapping = NULL; /* Leave page->index set: truncation relies upon it */ - if (!huge) - mem_cgroup_cancel_charge(page, memcg);
put_page(page); - return xas_error(&xas); + return error; } ALLOW_ERROR_INJECTION(__add_to_page_cache_locked, ERRNO); diff --git a/mm/memcontrol.c b/mm/memcontrol.c index bc0f55d0cc08..c414b7f85e48 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -6637,6 +6637,33 @@ void mem_cgroup_cancel_charge(struct page *page, struct mem_cgroup *memcg) cancel_charge(memcg, nr_pages); } +/** + * mem_cgroup_charge - charge a newly allocated page to a cgroup + * @page: page to charge + * @mm: mm context of the victim + * @gfp_mask: reclaim mode + * @lrucare: page might be on the LRU already + * + * Try to charge @page to the memcg that @mm belongs to, reclaiming + * pages according to @gfp_mask if necessary. + * + * Returns 0 on success. Otherwise, an error code is returned. + */ +int mem_cgroup_charge(struct page *page, struct mm_struct *mm, gfp_t gfp_mask, + bool lrucare) +{ + struct mem_cgroup *memcg; + int ret; + + VM_BUG_ON_PAGE(!page->mapping, page); + + ret = mem_cgroup_try_charge(page, mm, gfp_mask, &memcg); + if (ret) + return ret; + mem_cgroup_commit_charge(page, memcg, lrucare); + return 0; +} + struct uncharge_gather { struct mem_cgroup *memcg; unsigned long pgpgout; @@ -6684,8 +6711,6 @@ static void uncharge_batch(const struct uncharge_gather *ug) static void uncharge_page(struct page *page, struct uncharge_gather *ug) { VM_BUG_ON_PAGE(PageLRU(page), page); - VM_BUG_ON_PAGE(page_count(page) && !is_zone_device_page(page) && - !PageHWPoison(page) , page); if (!page->mem_cgroup) return; diff --git a/mm/shmem.c b/mm/shmem.c index 729bbb3513cd..0d9615723152 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -605,11 +605,13 @@ static inline bool is_huge_enabled(struct shmem_sb_info *sbinfo) */ static int shmem_add_to_page_cache(struct page *page, struct address_space *mapping, - pgoff_t index, void *expected, gfp_t gfp) + pgoff_t index, void *expected, gfp_t gfp, + struct mm_struct *charge_mm) { XA_STATE_ORDER(xas, &mapping->i_pages, index, compound_order(page)); unsigned long i = 0; unsigned long nr = compound_nr(page); + int error; VM_BUG_ON_PAGE(PageTail(page), page); VM_BUG_ON_PAGE(index != round_down(index, nr), page); @@ -621,6 +623,16 @@ static int shmem_add_to_page_cache(struct page *page, page->mapping = mapping; page->index = index; + error = mem_cgroup_charge(page, charge_mm, gfp, PageSwapCache(page)); + if (error) { + if (!PageSwapCache(page) && PageTransHuge(page)) { + count_vm_event(THP_FILE_FALLBACK); + count_vm_event(THP_FILE_FALLBACK_CHARGE); + } + goto error; + } + cgroup_throttle_swaprate(page, gfp); + do { void *entry; xas_lock_irq(&xas); @@ -648,12 +660,15 @@ unlock: } while (xas_nomem(&xas, gfp)); if (xas_error(&xas)) { - page->mapping = NULL; - page_ref_sub(page, nr); - return xas_error(&xas); + error = xas_error(&xas); + goto error; } return 0; +error: + page->mapping = NULL; + page_ref_sub(page, nr); + return error; } /* @@ -1619,7 +1634,6 @@ static int shmem_swapin_page(struct inode *inode, pgoff_t index, struct address_space *mapping = inode->i_mapping; struct shmem_inode_info *info = SHMEM_I(inode); struct mm_struct *charge_mm = vma ? 
vma->vm_mm : current->mm; - struct mem_cgroup *memcg; struct page *page; swp_entry_t swap; int error; @@ -1664,18 +1678,11 @@ static int shmem_swapin_page(struct inode *inode, pgoff_t index, goto failed; } - error = mem_cgroup_try_charge_delay(page, charge_mm, gfp, &memcg); - if (error) - goto failed; - error = shmem_add_to_page_cache(page, mapping, index, - swp_to_radix_entry(swap), gfp); - if (error) { - mem_cgroup_cancel_charge(page, memcg); + swp_to_radix_entry(swap), gfp, + charge_mm); + if (error) goto failed; - } - - mem_cgroup_commit_charge(page, memcg, true); spin_lock_irq(&info->lock); info->swapped--; @@ -1722,7 +1729,6 @@ static int shmem_getpage_gfp(struct inode *inode, pgoff_t index, struct shmem_inode_info *info = SHMEM_I(inode); struct shmem_sb_info *sbinfo; struct mm_struct *charge_mm; - struct mem_cgroup *memcg; struct page *page; enum sgp_type sgp_huge = sgp; pgoff_t hindex = index; @@ -1847,21 +1853,11 @@ alloc_nohuge: if (sgp == SGP_WRITE) __SetPageReferenced(page); - error = mem_cgroup_try_charge_delay(page, charge_mm, gfp, &memcg); - if (error) { - if (PageTransHuge(page)) { - count_vm_event(THP_FILE_FALLBACK); - count_vm_event(THP_FILE_FALLBACK_CHARGE); - } - goto unacct; - } error = shmem_add_to_page_cache(page, mapping, hindex, - NULL, gfp & GFP_RECLAIM_MASK); - if (error) { - mem_cgroup_cancel_charge(page, memcg); + NULL, gfp & GFP_RECLAIM_MASK, + charge_mm); + if (error) goto unacct; - } - mem_cgroup_commit_charge(page, memcg, false); lru_cache_add_anon(page); spin_lock_irq(&info->lock); @@ -2299,7 +2295,6 @@ static int shmem_mfill_atomic_pte(struct mm_struct *dst_mm, struct address_space *mapping = inode->i_mapping; gfp_t gfp = mapping_gfp_mask(mapping); pgoff_t pgoff = linear_page_index(dst_vma, dst_addr); - struct mem_cgroup *memcg; spinlock_t *ptl; void *page_kaddr; struct page *page; @@ -2349,16 +2344,10 @@ static int shmem_mfill_atomic_pte(struct mm_struct *dst_mm, if (unlikely(offset >= max_off)) goto out_release; - ret = mem_cgroup_try_charge_delay(page, dst_mm, gfp, &memcg); - if (ret) - goto out_release; - ret = shmem_add_to_page_cache(page, mapping, pgoff, NULL, - gfp & GFP_RECLAIM_MASK); + gfp & GFP_RECLAIM_MASK, dst_mm); if (ret) - goto out_release_uncharge; - - mem_cgroup_commit_charge(page, memcg, false); + goto out_release; _dst_pte = mk_pte(page, dst_vma->vm_page_prot); if (dst_vma->vm_flags & VM_WRITE) @@ -2379,11 +2368,11 @@ static int shmem_mfill_atomic_pte(struct mm_struct *dst_mm, ret = -EFAULT; max_off = DIV_ROUND_UP(i_size_read(inode), PAGE_SIZE); if (unlikely(offset >= max_off)) - goto out_release_uncharge_unlock; + goto out_release_unlock; ret = -EEXIST; if (!pte_none(*dst_pte)) - goto out_release_uncharge_unlock; + goto out_release_unlock; lru_cache_add_anon(page); @@ -2404,12 +2393,10 @@ static int shmem_mfill_atomic_pte(struct mm_struct *dst_mm, ret = 0; out: return ret; -out_release_uncharge_unlock: +out_release_unlock: pte_unmap_unlock(dst_pte, ptl); ClearPageDirty(page); delete_from_page_cache(page); -out_release_uncharge: - mem_cgroup_cancel_charge(page, memcg); out_release: unlock_page(page); put_page(page); -- cgit v1.2.3 From 9da7b5216869f80e91f78403a57c72b42357758c Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Wed, 3 Jun 2020 16:01:51 -0700 Subject: mm: memcontrol: prepare cgroup vmstat infrastructure for native anon counters Anonymous compound pages can be mapped by ptes, which means that if we want to track NR_ANON_MAPPED, NR_ANON_THPS on a per-cgroup basis, we have to be prepared to see tail pages in our accounting
functions. Make mod_lruvec_page_state() and lock_page_memcg() deal with tail pages correctly, namely by redirecting to the head page which has the page->mem_cgroup set up. Signed-off-by: Johannes Weiner Signed-off-by: Andrew Morton Reviewed-by: Joonsoo Kim Cc: Alex Shi Cc: Hugh Dickins Cc: "Kirill A. Shutemov" Cc: Michal Hocko Cc: Roman Gushchin Cc: Shakeel Butt Cc: Balbir Singh Link: http://lkml.kernel.org/r/20200508183105.225460-9-hannes@cmpxchg.org Signed-off-by: Linus Torvalds --- include/linux/memcontrol.h | 5 +++-- mm/memcontrol.c | 9 ++++++--- 2 files changed, 9 insertions(+), 5 deletions(-) (limited to 'include/linux') diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index 898925bdd676..8f00dd755818 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -709,16 +709,17 @@ static inline void mod_lruvec_state(struct lruvec *lruvec, static inline void __mod_lruvec_page_state(struct page *page, enum node_stat_item idx, int val) { + struct page *head = compound_head(page); /* rmap on tail pages */ pg_data_t *pgdat = page_pgdat(page); struct lruvec *lruvec; /* Untracked pages have no memcg, no lruvec. Update only the node */ - if (!page->mem_cgroup) { + if (!head->mem_cgroup) { __mod_node_page_state(pgdat, idx, val); return; } - lruvec = mem_cgroup_lruvec(page->mem_cgroup, pgdat); + lruvec = mem_cgroup_lruvec(head->mem_cgroup, pgdat); __mod_lruvec_state(lruvec, idx, val); } diff --git a/mm/memcontrol.c b/mm/memcontrol.c index f82ae3723760..a875a97067a3 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -1981,6 +1981,7 @@ void mem_cgroup_print_oom_group(struct mem_cgroup *memcg) */ struct mem_cgroup *lock_page_memcg(struct page *page) { + struct page *head = compound_head(page); /* rmap on tail pages */ struct mem_cgroup *memcg; unsigned long flags; @@ -2000,7 +2001,7 @@ struct mem_cgroup *lock_page_memcg(struct page *page) if (mem_cgroup_disabled()) return NULL; again: - memcg = page->mem_cgroup; + memcg = head->mem_cgroup; if (unlikely(!memcg)) return NULL; @@ -2008,7 +2009,7 @@ again: return memcg; spin_lock_irqsave(&memcg->move_lock, flags); - if (memcg != page->mem_cgroup) { + if (memcg != head->mem_cgroup) { spin_unlock_irqrestore(&memcg->move_lock, flags); goto again; } @@ -2051,7 +2052,9 @@ void __unlock_page_memcg(struct mem_cgroup *memcg) */ void unlock_page_memcg(struct page *page) { - __unlock_page_memcg(page->mem_cgroup); + struct page *head = compound_head(page); + + __unlock_page_memcg(head->mem_cgroup); } EXPORT_SYMBOL(unlock_page_memcg); -- cgit v1.2.3 From 0d1c20722ab333ac0ac03ae2188922c1021d3abc Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Wed, 3 Jun 2020 16:01:54 -0700 Subject: mm: memcontrol: switch to native NR_FILE_PAGES and NR_SHMEM counters Memcg maintains private MEMCG_CACHE and NR_SHMEM counters. This divergence from the generic VM accounting means unnecessary code overhead, and creates a dependency for memcg that page->mapping is set up at the time of charging, so that page types can be told apart. Convert the generic accounting sites to mod_lruvec_page_state and friends to maintain the per-cgroup vmstat counters of NR_FILE_PAGES and NR_SHMEM. The page is already locked in these places, so page->mem_cgroup is stable; we only need minimal tweaks of two mem_cgroup_migrate() calls to ensure it's set up in time. Then replace MEMCG_CACHE with NR_FILE_PAGES and delete the private NR_SHMEM accounting sites. 
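Sketched for illustration (not taken from the patch), the typical accounting-site conversion this implies, mirroring the mm/filemap.c hunks below:

	/*
	 * Before: only the node counter is updated here; memcg tracks the
	 * same quantity separately in its private MEMCG_CACHE counter.
	 */
	__mod_node_page_state(page_pgdat(page), NR_FILE_PAGES, -nr);

	/*
	 * After: the lruvec helper updates the node counter and the page's
	 * memcg counter together, making MEMCG_CACHE redundant.
	 */
	__mod_lruvec_page_state(page, NR_FILE_PAGES, -nr);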
Signed-off-by: Johannes Weiner Signed-off-by: Andrew Morton Reviewed-by: Joonsoo Kim Cc: Alex Shi Cc: Hugh Dickins Cc: "Kirill A. Shutemov" Cc: Michal Hocko Cc: Roman Gushchin Cc: Shakeel Butt Cc: Balbir Singh Link: http://lkml.kernel.org/r/20200508183105.225460-10-hannes@cmpxchg.org Signed-off-by: Linus Torvalds --- include/linux/memcontrol.h | 3 +-- mm/filemap.c | 17 +++++++++-------- mm/khugepaged.c | 16 +++++++++++----- mm/memcontrol.c | 28 +++++++++++----------------- mm/migrate.c | 15 +++++++++++---- mm/shmem.c | 14 +++++++------- 6 files changed, 50 insertions(+), 43 deletions(-) (limited to 'include/linux') diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index 8f00dd755818..f6ea68ceed2c 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -29,8 +29,7 @@ struct kmem_cache; /* Cgroup-specific page state, on top of universal node page state */ enum memcg_stat_item { - MEMCG_CACHE = NR_VM_NODE_STAT_ITEMS, - MEMCG_RSS, + MEMCG_RSS = NR_VM_NODE_STAT_ITEMS, MEMCG_RSS_HUGE, MEMCG_SWAP, MEMCG_SOCK, diff --git a/mm/filemap.c b/mm/filemap.c index 38e6a37336a6..a6565890cdf0 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -199,9 +199,9 @@ static void unaccount_page_cache_page(struct address_space *mapping, nr = hpage_nr_pages(page); - __mod_node_page_state(page_pgdat(page), NR_FILE_PAGES, -nr); + __mod_lruvec_page_state(page, NR_FILE_PAGES, -nr); if (PageSwapBacked(page)) { - __mod_node_page_state(page_pgdat(page), NR_SHMEM, -nr); + __mod_lruvec_page_state(page, NR_SHMEM, -nr); if (PageTransHuge(page)) __dec_node_page_state(page, NR_SHMEM_THPS); } else if (PageTransHuge(page)) { @@ -802,21 +802,22 @@ int replace_page_cache_page(struct page *old, struct page *new, gfp_t gfp_mask) new->mapping = mapping; new->index = offset; + mem_cgroup_migrate(old, new); + xas_lock_irqsave(&xas, flags); xas_store(&xas, new); old->mapping = NULL; /* hugetlb pages do not participate in page cache accounting. */ if (!PageHuge(old)) - __dec_node_page_state(old, NR_FILE_PAGES); + __dec_lruvec_page_state(old, NR_FILE_PAGES); if (!PageHuge(new)) - __inc_node_page_state(new, NR_FILE_PAGES); + __inc_lruvec_page_state(new, NR_FILE_PAGES); if (PageSwapBacked(old)) - __dec_node_page_state(old, NR_SHMEM); + __dec_lruvec_page_state(old, NR_SHMEM); if (PageSwapBacked(new)) - __inc_node_page_state(new, NR_SHMEM); + __inc_lruvec_page_state(new, NR_SHMEM); xas_unlock_irqrestore(&xas, flags); - mem_cgroup_migrate(old, new); if (freepage) freepage(old); put_page(old); @@ -867,7 +868,7 @@ static int __add_to_page_cache_locked(struct page *page, /* hugetlb pages do not participate in page cache accounting */ if (!huge) - __inc_node_page_state(page, NR_FILE_PAGES); + __inc_lruvec_page_state(page, NR_FILE_PAGES); unlock: xas_unlock_irq(&xas); } while (xas_nomem(&xas, gfp_mask & GFP_RECLAIM_MASK)); diff --git a/mm/khugepaged.c b/mm/khugepaged.c index 0a29b51494fd..ddbdc1e3a694 100644 --- a/mm/khugepaged.c +++ b/mm/khugepaged.c @@ -1844,12 +1844,18 @@ out_unlock: } if (nr_none) { - struct zone *zone = page_zone(new_page); - - __mod_node_page_state(zone->zone_pgdat, NR_FILE_PAGES, nr_none); + struct lruvec *lruvec; + /* + * XXX: We have started try_charge and pinned the + * memcg, but the page isn't committed yet so we + * cannot use mod_lruvec_page_state(). This hackery + * will be cleaned up when remove the page->mapping + * dependency from memcg and fully charge above. 
+ */ + lruvec = mem_cgroup_lruvec(memcg, page_pgdat(new_page)); + __mod_lruvec_state(lruvec, NR_FILE_PAGES, nr_none); if (is_shmem) - __mod_node_page_state(zone->zone_pgdat, - NR_SHMEM, nr_none); + __mod_lruvec_state(lruvec, NR_SHMEM, nr_none); } xa_locked: diff --git a/mm/memcontrol.c b/mm/memcontrol.c index a875a97067a3..ab3497ba0e35 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -842,11 +842,6 @@ static void mem_cgroup_charge_statistics(struct mem_cgroup *memcg, */ if (PageAnon(page)) __mod_memcg_state(memcg, MEMCG_RSS, nr_pages); - else { - __mod_memcg_state(memcg, MEMCG_CACHE, nr_pages); - if (PageSwapBacked(page)) - __mod_memcg_state(memcg, NR_SHMEM, nr_pages); - } if (abs(nr_pages) > 1) { VM_BUG_ON_PAGE(!PageTransHuge(page), page); @@ -1392,7 +1387,7 @@ static char *memory_stat_format(struct mem_cgroup *memcg) (u64)memcg_page_state(memcg, MEMCG_RSS) * PAGE_SIZE); seq_buf_printf(&s, "file %llu\n", - (u64)memcg_page_state(memcg, MEMCG_CACHE) * + (u64)memcg_page_state(memcg, NR_FILE_PAGES) * PAGE_SIZE); seq_buf_printf(&s, "kernel_stack %llu\n", (u64)memcg_page_state(memcg, MEMCG_KERNEL_STACK_KB) * @@ -3357,7 +3352,7 @@ static unsigned long mem_cgroup_usage(struct mem_cgroup *memcg, bool swap) unsigned long val; if (mem_cgroup_is_root(memcg)) { - val = memcg_page_state(memcg, MEMCG_CACHE) + + val = memcg_page_state(memcg, NR_FILE_PAGES) + memcg_page_state(memcg, MEMCG_RSS); if (swap) val += memcg_page_state(memcg, MEMCG_SWAP); @@ -3828,7 +3823,7 @@ static int memcg_numa_stat_show(struct seq_file *m, void *v) #endif /* CONFIG_NUMA */ static const unsigned int memcg1_stats[] = { - MEMCG_CACHE, + NR_FILE_PAGES, MEMCG_RSS, MEMCG_RSS_HUGE, NR_SHMEM, @@ -5461,6 +5456,14 @@ static int mem_cgroup_move_account(struct page *page, lock_page_memcg(page); if (!PageAnon(page)) { + __mod_lruvec_state(from_vec, NR_FILE_PAGES, -nr_pages); + __mod_lruvec_state(to_vec, NR_FILE_PAGES, nr_pages); + + if (PageSwapBacked(page)) { + __mod_lruvec_state(from_vec, NR_SHMEM, -nr_pages); + __mod_lruvec_state(to_vec, NR_SHMEM, nr_pages); + } + if (page_mapped(page)) { __mod_lruvec_state(from_vec, NR_FILE_MAPPED, -nr_pages); __mod_lruvec_state(to_vec, NR_FILE_MAPPED, nr_pages); @@ -6673,10 +6676,8 @@ struct uncharge_gather { unsigned long nr_pages; unsigned long pgpgout; unsigned long nr_anon; - unsigned long nr_file; unsigned long nr_kmem; unsigned long nr_huge; - unsigned long nr_shmem; struct page *dummy_page; }; @@ -6700,9 +6701,7 @@ static void uncharge_batch(const struct uncharge_gather *ug) local_irq_save(flags); __mod_memcg_state(ug->memcg, MEMCG_RSS, -ug->nr_anon); - __mod_memcg_state(ug->memcg, MEMCG_CACHE, -ug->nr_file); __mod_memcg_state(ug->memcg, MEMCG_RSS_HUGE, -ug->nr_huge); - __mod_memcg_state(ug->memcg, NR_SHMEM, -ug->nr_shmem); __count_memcg_events(ug->memcg, PGPGOUT, ug->pgpgout); __this_cpu_add(ug->memcg->vmstats_percpu->nr_page_events, ug->nr_pages); memcg_check_events(ug->memcg, ug->dummy_page); @@ -6743,11 +6742,6 @@ static void uncharge_page(struct page *page, struct uncharge_gather *ug) ug->nr_huge += nr_pages; if (PageAnon(page)) ug->nr_anon += nr_pages; - else { - ug->nr_file += nr_pages; - if (PageSwapBacked(page)) - ug->nr_shmem += nr_pages; - } ug->pgpgout++; } else { ug->nr_kmem += nr_pages; diff --git a/mm/migrate.c b/mm/migrate.c index 93c732213d12..0d1f796df75f 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -490,11 +490,18 @@ int migrate_page_move_mapping(struct address_space *mapping, * are mapped to swap space. 
*/ if (newzone != oldzone) { - __dec_node_state(oldzone->zone_pgdat, NR_FILE_PAGES); - __inc_node_state(newzone->zone_pgdat, NR_FILE_PAGES); + struct lruvec *old_lruvec, *new_lruvec; + struct mem_cgroup *memcg; + + memcg = page_memcg(page); + old_lruvec = mem_cgroup_lruvec(memcg, oldzone->zone_pgdat); + new_lruvec = mem_cgroup_lruvec(memcg, newzone->zone_pgdat); + + __dec_lruvec_state(old_lruvec, NR_FILE_PAGES); + __inc_lruvec_state(new_lruvec, NR_FILE_PAGES); if (PageSwapBacked(page) && !PageSwapCache(page)) { - __dec_node_state(oldzone->zone_pgdat, NR_SHMEM); - __inc_node_state(newzone->zone_pgdat, NR_SHMEM); + __dec_lruvec_state(old_lruvec, NR_SHMEM); + __inc_lruvec_state(new_lruvec, NR_SHMEM); } if (dirty && mapping_cap_account_dirty(mapping)) { __dec_node_state(oldzone->zone_pgdat, NR_FILE_DIRTY); diff --git a/mm/shmem.c b/mm/shmem.c index 0d9615723152..71842fd4a9d0 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -653,8 +653,8 @@ next: __inc_node_page_state(page, NR_SHMEM_THPS); } mapping->nrpages += nr; - __mod_node_page_state(page_pgdat(page), NR_FILE_PAGES, nr); - __mod_node_page_state(page_pgdat(page), NR_SHMEM, nr); + __mod_lruvec_page_state(page, NR_FILE_PAGES, nr); + __mod_lruvec_page_state(page, NR_SHMEM, nr); unlock: xas_unlock_irq(&xas); } while (xas_nomem(&xas, gfp)); @@ -685,8 +685,8 @@ static void shmem_delete_from_page_cache(struct page *page, void *radswap) error = shmem_replace_entry(mapping, page->index, page, radswap); page->mapping = NULL; mapping->nrpages--; - __dec_node_page_state(page, NR_FILE_PAGES); - __dec_node_page_state(page, NR_SHMEM); + __dec_lruvec_page_state(page, NR_FILE_PAGES); + __dec_lruvec_page_state(page, NR_SHMEM); xa_unlock_irq(&mapping->i_pages); put_page(page); BUG_ON(error); @@ -1593,8 +1593,9 @@ static int shmem_replace_page(struct page **pagep, gfp_t gfp, xa_lock_irq(&swap_mapping->i_pages); error = shmem_replace_entry(swap_mapping, swap_index, oldpage, newpage); if (!error) { - __inc_node_page_state(newpage, NR_FILE_PAGES); - __dec_node_page_state(oldpage, NR_FILE_PAGES); + mem_cgroup_migrate(oldpage, newpage); + __inc_lruvec_page_state(newpage, NR_FILE_PAGES); + __dec_lruvec_page_state(oldpage, NR_FILE_PAGES); } xa_unlock_irq(&swap_mapping->i_pages); @@ -1606,7 +1607,6 @@ static int shmem_replace_page(struct page **pagep, gfp_t gfp, */ oldpage = newpage; } else { - mem_cgroup_migrate(oldpage, newpage); lru_cache_add_anon(newpage); *pagep = newpage; } -- cgit v1.2.3 From be5d0a74c62d8da43f9526a5b08cdd18e2bbc37a Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Wed, 3 Jun 2020 16:01:57 -0700 Subject: mm: memcontrol: switch to native NR_ANON_MAPPED counter Memcg maintains a private MEMCG_RSS counter. This divergence from the generic VM accounting means unnecessary code overhead, and creates a dependency for memcg that page->mapping is set up at the time of charging, so that page types can be told apart. Convert the generic accounting sites to mod_lruvec_page_state and friends to maintain the per-cgroup vmstat counter of NR_ANON_MAPPED. We use lock_page_memcg() to stabilize page->mem_cgroup during rmap changes, the same way we do for NR_FILE_MAPPED. With the previous patch removing MEMCG_CACHE and the private NR_SHMEM counter, this patch finally eliminates the need to have page->mapping set up at charge time. However, we need to have page->mem_cgroup set up by the time rmap runs and does the accounting, so switch the commit and the rmap callbacks around. 
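Sketched for illustration, the reordering applied across the fault paths below (variable context elided):

	/*
	 * Before: rmap ran first and accounted NR_ANON_MAPPED against the
	 * node only, because page->mem_cgroup was not yet set up.
	 */
	page_add_new_anon_rmap(page, vma, addr, false);
	mem_cgroup_commit_charge(page, memcg, false);

	/*
	 * After: commit first, so the rmap side can account the counter
	 * through __mod_lruvec_page_state() against the page's memcg too.
	 */
	mem_cgroup_commit_charge(page, memcg, false);
	page_add_new_anon_rmap(page, vma, addr, false);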
v2: fix temporary accounting bug by switching rmap<->commit (Joonsoo) Signed-off-by: Johannes Weiner Signed-off-by: Andrew Morton Cc: Alex Shi Cc: Hugh Dickins Cc: Joonsoo Kim Cc: "Kirill A. Shutemov" Cc: Michal Hocko Cc: Roman Gushchin Cc: Shakeel Butt Cc: Balbir Singh Link: http://lkml.kernel.org/r/20200508183105.225460-11-hannes@cmpxchg.org Signed-off-by: Linus Torvalds --- include/linux/memcontrol.h | 3 +-- kernel/events/uprobes.c | 2 +- mm/huge_memory.c | 2 +- mm/khugepaged.c | 2 +- mm/memcontrol.c | 27 +++++++++----------------- mm/memory.c | 10 +++++----- mm/migrate.c | 2 +- mm/rmap.c | 47 ++++++++++++++++++++++++++++------------------ mm/swapfile.c | 4 ++-- mm/userfaultfd.c | 2 +- 10 files changed, 51 insertions(+), 50 deletions(-) (limited to 'include/linux') diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index f6ea68ceed2c..acacc3018957 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -29,8 +29,7 @@ struct kmem_cache; /* Cgroup-specific page state, on top of universal node page state */ enum memcg_stat_item { - MEMCG_RSS = NR_VM_NODE_STAT_ITEMS, - MEMCG_RSS_HUGE, + MEMCG_RSS_HUGE = NR_VM_NODE_STAT_ITEMS, MEMCG_SWAP, MEMCG_SOCK, /* XXX: why are these zone and not node counters? */ diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c index 40e7488ce467..89ef81b65bcb 100644 --- a/kernel/events/uprobes.c +++ b/kernel/events/uprobes.c @@ -188,8 +188,8 @@ static int __replace_page(struct vm_area_struct *vma, unsigned long addr, if (new_page) { get_page(new_page); - page_add_new_anon_rmap(new_page, vma, addr, false); mem_cgroup_commit_charge(new_page, memcg, false); + page_add_new_anon_rmap(new_page, vma, addr, false); lru_cache_add_active_or_unevictable(new_page, vma); } else /* no new page, just dec_mm_counter for old_page */ diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 672e34932b53..2caf2494db66 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -640,8 +640,8 @@ static vm_fault_t __do_huge_pmd_anonymous_page(struct vm_fault *vmf, entry = mk_huge_pmd(page, vma->vm_page_prot); entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma); - page_add_new_anon_rmap(page, vma, haddr, true); mem_cgroup_commit_charge(page, memcg, false); + page_add_new_anon_rmap(page, vma, haddr, true); lru_cache_add_active_or_unevictable(page, vma); pgtable_trans_huge_deposit(vma->vm_mm, vmf->pmd, pgtable); set_pmd_at(vma->vm_mm, haddr, vmf->pmd, entry); diff --git a/mm/khugepaged.c b/mm/khugepaged.c index ddbdc1e3a694..34eff4dfae80 100644 --- a/mm/khugepaged.c +++ b/mm/khugepaged.c @@ -1175,8 +1175,8 @@ static void collapse_huge_page(struct mm_struct *mm, spin_lock(pmd_ptl); BUG_ON(!pmd_none(*pmd)); - page_add_new_anon_rmap(new_page, vma, address, true); mem_cgroup_commit_charge(new_page, memcg, false); + page_add_new_anon_rmap(new_page, vma, address, true); count_memcg_events(memcg, THP_COLLAPSE_ALLOC, 1); lru_cache_add_active_or_unevictable(new_page, vma); pgtable_trans_huge_deposit(mm, pmd, pgtable); diff --git a/mm/memcontrol.c b/mm/memcontrol.c index ab3497ba0e35..b80125303c02 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -836,13 +836,6 @@ static void mem_cgroup_charge_statistics(struct mem_cgroup *memcg, struct page *page, int nr_pages) { - /* - * Here, RSS means 'mapped anon' and anon's SwapCache. Shmem/tmpfs is - * counted as CACHE even if it's on ANON LRU. 
- */ - if (PageAnon(page)) - __mod_memcg_state(memcg, MEMCG_RSS, nr_pages); - if (abs(nr_pages) > 1) { VM_BUG_ON_PAGE(!PageTransHuge(page), page); __mod_memcg_state(memcg, MEMCG_RSS_HUGE, nr_pages); @@ -1384,7 +1377,7 @@ static char *memory_stat_format(struct mem_cgroup *memcg) */ seq_buf_printf(&s, "anon %llu\n", - (u64)memcg_page_state(memcg, MEMCG_RSS) * + (u64)memcg_page_state(memcg, NR_ANON_MAPPED) * PAGE_SIZE); seq_buf_printf(&s, "file %llu\n", (u64)memcg_page_state(memcg, NR_FILE_PAGES) * @@ -3353,7 +3346,7 @@ static unsigned long mem_cgroup_usage(struct mem_cgroup *memcg, bool swap) if (mem_cgroup_is_root(memcg)) { val = memcg_page_state(memcg, NR_FILE_PAGES) + - memcg_page_state(memcg, MEMCG_RSS); + memcg_page_state(memcg, NR_ANON_MAPPED); if (swap) val += memcg_page_state(memcg, MEMCG_SWAP); } else { @@ -3824,7 +3817,7 @@ static int memcg_numa_stat_show(struct seq_file *m, void *v) static const unsigned int memcg1_stats[] = { NR_FILE_PAGES, - MEMCG_RSS, + NR_ANON_MAPPED, MEMCG_RSS_HUGE, NR_SHMEM, NR_FILE_MAPPED, @@ -5455,7 +5448,12 @@ static int mem_cgroup_move_account(struct page *page, lock_page_memcg(page); - if (!PageAnon(page)) { + if (PageAnon(page)) { + if (page_mapped(page)) { + __mod_lruvec_state(from_vec, NR_ANON_MAPPED, -nr_pages); + __mod_lruvec_state(to_vec, NR_ANON_MAPPED, nr_pages); + } + } else { __mod_lruvec_state(from_vec, NR_FILE_PAGES, -nr_pages); __mod_lruvec_state(to_vec, NR_FILE_PAGES, nr_pages); @@ -6589,7 +6587,6 @@ void mem_cgroup_commit_charge(struct page *page, struct mem_cgroup *memcg, { unsigned int nr_pages = hpage_nr_pages(page); - VM_BUG_ON_PAGE(!page->mapping, page); VM_BUG_ON_PAGE(PageLRU(page) && !lrucare, page); if (mem_cgroup_disabled()) @@ -6662,8 +6659,6 @@ int mem_cgroup_charge(struct page *page, struct mm_struct *mm, gfp_t gfp_mask, struct mem_cgroup *memcg; int ret; - VM_BUG_ON_PAGE(!page->mapping, page); - ret = mem_cgroup_try_charge(page, mm, gfp_mask, &memcg); if (ret) return ret; @@ -6675,7 +6670,6 @@ struct uncharge_gather { struct mem_cgroup *memcg; unsigned long nr_pages; unsigned long pgpgout; - unsigned long nr_anon; unsigned long nr_kmem; unsigned long nr_huge; struct page *dummy_page; @@ -6700,7 +6694,6 @@ static void uncharge_batch(const struct uncharge_gather *ug) } local_irq_save(flags); - __mod_memcg_state(ug->memcg, MEMCG_RSS, -ug->nr_anon); __mod_memcg_state(ug->memcg, MEMCG_RSS_HUGE, -ug->nr_huge); __count_memcg_events(ug->memcg, PGPGOUT, ug->pgpgout); __this_cpu_add(ug->memcg->vmstats_percpu->nr_page_events, ug->nr_pages); @@ -6740,8 +6733,6 @@ static void uncharge_page(struct page *page, struct uncharge_gather *ug) if (!PageKmemcg(page)) { if (PageTransHuge(page)) ug->nr_huge += nr_pages; - if (PageAnon(page)) - ug->nr_anon += nr_pages; ug->pgpgout++; } else { ug->nr_kmem += nr_pages; diff --git a/mm/memory.c b/mm/memory.c index 6b8c5900e9a4..543e41b1d57a 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -2710,8 +2710,8 @@ static vm_fault_t wp_page_copy(struct vm_fault *vmf) * thread doing COW. 
*/ ptep_clear_flush_notify(vma, vmf->address, vmf->pte); - page_add_new_anon_rmap(new_page, vma, vmf->address, false); mem_cgroup_commit_charge(new_page, memcg, false); + page_add_new_anon_rmap(new_page, vma, vmf->address, false); lru_cache_add_active_or_unevictable(new_page, vma); /* * We call the notify macro here because, when using secondary @@ -3243,12 +3243,12 @@ vm_fault_t do_swap_page(struct vm_fault *vmf) /* ksm created a completely new copy */ if (unlikely(page != swapcache && swapcache)) { - page_add_new_anon_rmap(page, vma, vmf->address, false); mem_cgroup_commit_charge(page, memcg, false); + page_add_new_anon_rmap(page, vma, vmf->address, false); lru_cache_add_active_or_unevictable(page, vma); } else { - do_page_add_anon_rmap(page, vma, vmf->address, exclusive); mem_cgroup_commit_charge(page, memcg, true); + do_page_add_anon_rmap(page, vma, vmf->address, exclusive); activate_page(page); } @@ -3390,8 +3390,8 @@ static vm_fault_t do_anonymous_page(struct vm_fault *vmf) } inc_mm_counter_fast(vma->vm_mm, MM_ANONPAGES); - page_add_new_anon_rmap(page, vma, vmf->address, false); mem_cgroup_commit_charge(page, memcg, false); + page_add_new_anon_rmap(page, vma, vmf->address, false); lru_cache_add_active_or_unevictable(page, vma); setpte: set_pte_at(vma->vm_mm, vmf->address, vmf->pte, entry); @@ -3652,8 +3652,8 @@ vm_fault_t alloc_set_pte(struct vm_fault *vmf, struct mem_cgroup *memcg, /* copy-on-write page */ if (write && !(vma->vm_flags & VM_SHARED)) { inc_mm_counter_fast(vma->vm_mm, MM_ANONPAGES); - page_add_new_anon_rmap(page, vma, vmf->address, false); mem_cgroup_commit_charge(page, memcg, false); + page_add_new_anon_rmap(page, vma, vmf->address, false); lru_cache_add_active_or_unevictable(page, vma); } else { inc_mm_counter_fast(vma->vm_mm, mm_counter_file(page)); diff --git a/mm/migrate.c b/mm/migrate.c index 0d1f796df75f..e72ed681634f 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -2832,8 +2832,8 @@ static void migrate_vma_insert_page(struct migrate_vma *migrate, goto unlock_abort; inc_mm_counter(mm, MM_ANONPAGES); - page_add_new_anon_rmap(page, vma, addr, false); mem_cgroup_commit_charge(page, memcg, false); + page_add_new_anon_rmap(page, vma, addr, false); if (!is_zone_device_page(page)) lru_cache_add_active_or_unevictable(page, vma); get_page(page); diff --git a/mm/rmap.c b/mm/rmap.c index f79a206b271a..150513d31efa 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -1114,6 +1114,11 @@ void do_page_add_anon_rmap(struct page *page, bool compound = flags & RMAP_COMPOUND; bool first; + if (unlikely(PageKsm(page))) + lock_page_memcg(page); + else + VM_BUG_ON_PAGE(!PageLocked(page), page); + if (compound) { atomic_t *mapcount; VM_BUG_ON_PAGE(!PageLocked(page), page); @@ -1134,12 +1139,13 @@ void do_page_add_anon_rmap(struct page *page, */ if (compound) __inc_node_page_state(page, NR_ANON_THPS); - __mod_node_page_state(page_pgdat(page), NR_ANON_MAPPED, nr); + __mod_lruvec_page_state(page, NR_ANON_MAPPED, nr); } - if (unlikely(PageKsm(page))) - return; - VM_BUG_ON_PAGE(!PageLocked(page), page); + if (unlikely(PageKsm(page))) { + unlock_page_memcg(page); + return; + } /* address might be in next vma when migration races vma_adjust */ if (first) @@ -1181,7 +1187,7 @@ void page_add_new_anon_rmap(struct page *page, /* increment count (starts at -1) */ atomic_set(&page->_mapcount, 0); } - __mod_node_page_state(page_pgdat(page), NR_ANON_MAPPED, nr); + __mod_lruvec_page_state(page, NR_ANON_MAPPED, nr); __page_set_anon_rmap(page, vma, address, 1); } @@ -1230,13 +1236,12 @@ static void 
page_remove_file_rmap(struct page *page, bool compound) int i, nr = 1; VM_BUG_ON_PAGE(compound && !PageHead(page), page); - lock_page_memcg(page); /* Hugepages are not counted in NR_FILE_MAPPED for now. */ if (unlikely(PageHuge(page))) { /* hugetlb pages are always mapped with pmds */ atomic_dec(compound_mapcount_ptr(page)); - goto out; + return; } /* page still mapped by someone else? */ @@ -1246,14 +1251,14 @@ static void page_remove_file_rmap(struct page *page, bool compound) nr++; } if (!atomic_add_negative(-1, compound_mapcount_ptr(page))) - goto out; + return; if (PageSwapBacked(page)) __dec_node_page_state(page, NR_SHMEM_PMDMAPPED); else __dec_node_page_state(page, NR_FILE_PMDMAPPED); } else { if (!atomic_add_negative(-1, &page->_mapcount)) - goto out; + return; } /* @@ -1265,8 +1270,6 @@ static void page_remove_file_rmap(struct page *page, bool compound) if (unlikely(PageMlocked(page))) clear_page_mlock(page); -out: - unlock_page_memcg(page); } static void page_remove_anon_compound_rmap(struct page *page) @@ -1310,7 +1313,7 @@ static void page_remove_anon_compound_rmap(struct page *page) clear_page_mlock(page); if (nr) - __mod_node_page_state(page_pgdat(page), NR_ANON_MAPPED, -nr); + __mod_lruvec_page_state(page, NR_ANON_MAPPED, -nr); } /** @@ -1322,22 +1325,28 @@ static void page_remove_anon_compound_rmap(struct page *page) */ void page_remove_rmap(struct page *page, bool compound) { - if (!PageAnon(page)) - return page_remove_file_rmap(page, compound); + lock_page_memcg(page); - if (compound) - return page_remove_anon_compound_rmap(page); + if (!PageAnon(page)) { + page_remove_file_rmap(page, compound); + goto out; + } + + if (compound) { + page_remove_anon_compound_rmap(page); + goto out; + } /* page still mapped by someone else? */ if (!atomic_add_negative(-1, &page->_mapcount)) - return; + goto out; /* * We use the irq-unsafe __{inc|mod}_zone_page_stat because * these counters are not modified in interrupt context, and * pte lock(a spinlock) is held, which implies preemption disabled. */ - __dec_node_page_state(page, NR_ANON_MAPPED); + __dec_lruvec_page_state(page, NR_ANON_MAPPED); if (unlikely(PageMlocked(page))) clear_page_mlock(page); @@ -1354,6 +1363,8 @@ void page_remove_rmap(struct page *page, bool compound) * Leaving it set also helps swapoff to reinstate ptes * faster for those pages still in swapcache. 
*/ +out: + unlock_page_memcg(page); } /* diff --git a/mm/swapfile.c b/mm/swapfile.c index 1829fc4b3ca2..01f6538bad2d 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -1920,11 +1920,11 @@ static int unuse_pte(struct vm_area_struct *vma, pmd_t *pmd, set_pte_at(vma->vm_mm, addr, pte, pte_mkold(mk_pte(page, vma->vm_page_prot))); if (page == swapcache) { - page_add_anon_rmap(page, vma, addr, false); mem_cgroup_commit_charge(page, memcg, true); + page_add_anon_rmap(page, vma, addr, false); } else { /* ksm created a completely new copy */ - page_add_new_anon_rmap(page, vma, addr, false); mem_cgroup_commit_charge(page, memcg, false); + page_add_new_anon_rmap(page, vma, addr, false); lru_cache_add_active_or_unevictable(page, vma); } swap_free(entry); diff --git a/mm/userfaultfd.c b/mm/userfaultfd.c index bb57d0a3fca7..3dea268d2850 100644 --- a/mm/userfaultfd.c +++ b/mm/userfaultfd.c @@ -123,8 +123,8 @@ static int mcopy_atomic_pte(struct mm_struct *dst_mm, goto out_release_uncharge_unlock; inc_mm_counter(dst_mm, MM_ANONPAGES); - page_add_new_anon_rmap(page, dst_vma, dst_addr, false); mem_cgroup_commit_charge(page, memcg, false); + page_add_new_anon_rmap(page, dst_vma, dst_addr, false); lru_cache_add_active_or_unevictable(page, dst_vma); set_pte_at(dst_mm, dst_addr, dst_pte, _dst_pte); -- cgit v1.2.3 From 468c398233da208521a0f84c2068012a66a7489d Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Wed, 3 Jun 2020 16:02:01 -0700 Subject: mm: memcontrol: switch to native NR_ANON_THPS counter With rmap memcg locking already in place for NR_ANON_MAPPED, it's just a small step to remove the MEMCG_RSS_HUGE wart and switch memcg to the native NR_ANON_THPS accounting sites. [hannes@cmpxchg.org: fixes] Link: http://lkml.kernel.org/r/20200512121750.GA397968@cmpxchg.org Signed-off-by: Johannes Weiner Signed-off-by: Andrew Morton Tested-by: Naresh Kamboju Reviewed-by: Joonsoo Kim Acked-by: Randy Dunlap [build-tested] Cc: Alex Shi Cc: Hugh Dickins Cc: "Kirill A. Shutemov" Cc: Michal Hocko Cc: Roman Gushchin Cc: Shakeel Butt Cc: Balbir Singh Link: http://lkml.kernel.org/r/20200508183105.225460-12-hannes@cmpxchg.org Signed-off-by: Linus Torvalds --- include/linux/memcontrol.h | 3 +-- mm/huge_memory.c | 4 +++- mm/memcontrol.c | 47 +++++++++++++++++++++++----------------------- mm/rmap.c | 6 +++--- 4 files changed, 31 insertions(+), 29 deletions(-) (limited to 'include/linux') diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index acacc3018957..63a31a6c3c69 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -29,8 +29,7 @@ struct kmem_cache; /* Cgroup-specific page state, on top of universal node page state */ enum memcg_stat_item { - MEMCG_RSS_HUGE = NR_VM_NODE_STAT_ITEMS, - MEMCG_SWAP, + MEMCG_SWAP = NR_VM_NODE_STAT_ITEMS, MEMCG_SOCK, /* XXX: why are these zone and not node counters? */ MEMCG_KERNEL_STACK_KB, diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 2caf2494db66..1fe980dafe03 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -2159,15 +2159,17 @@ static void __split_huge_pmd_locked(struct vm_area_struct *vma, pmd_t *pmd, atomic_inc(&page[i]._mapcount); } + lock_page_memcg(page); if (atomic_add_negative(-1, compound_mapcount_ptr(page))) { /* Last compound_mapcount is gone. 
*/ - __dec_node_page_state(page, NR_ANON_THPS); + __dec_lruvec_page_state(page, NR_ANON_THPS); if (TestClearPageDoubleMap(page)) { /* No need in mapcount reference anymore */ for (i = 0; i < HPAGE_PMD_NR; i++) atomic_dec(&page[i]._mapcount); } } + unlock_page_memcg(page); smp_wmb(); /* make pte visible before pmd */ pmd_populate(mm, pmd, pgtable); diff --git a/mm/memcontrol.c b/mm/memcontrol.c index b80125303c02..17587ea2745a 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -836,11 +836,6 @@ static void mem_cgroup_charge_statistics(struct mem_cgroup *memcg, struct page *page, int nr_pages) { - if (abs(nr_pages) > 1) { - VM_BUG_ON_PAGE(!PageTransHuge(page), page); - __mod_memcg_state(memcg, MEMCG_RSS_HUGE, nr_pages); - } - /* pagein of a big page is an event. So, ignore page size */ if (nr_pages > 0) __count_memcg_events(memcg, PGPGIN, 1); @@ -1406,15 +1401,11 @@ static char *memory_stat_format(struct mem_cgroup *memcg) (u64)memcg_page_state(memcg, NR_WRITEBACK) * PAGE_SIZE); - /* - * TODO: We should eventually replace our own MEMCG_RSS_HUGE counter - * with the NR_ANON_THP vm counter, but right now it's a pain in the - * arse because it requires migrating the work out of rmap to a place - * where the page->mem_cgroup is set up and stable. - */ +#ifdef CONFIG_TRANSPARENT_HUGEPAGE seq_buf_printf(&s, "anon_thp %llu\n", - (u64)memcg_page_state(memcg, MEMCG_RSS_HUGE) * - PAGE_SIZE); + (u64)memcg_page_state(memcg, NR_ANON_THPS) * + HPAGE_PMD_SIZE); +#endif for (i = 0; i < NR_LRU_LISTS; i++) seq_buf_printf(&s, "%s %llu\n", lru_list_name(i), @@ -3061,8 +3052,6 @@ void mem_cgroup_split_huge_fixup(struct page *head) for (i = 1; i < HPAGE_PMD_NR; i++) head[i].mem_cgroup = head->mem_cgroup; - - __mod_memcg_state(head->mem_cgroup, MEMCG_RSS_HUGE, -HPAGE_PMD_NR); } #endif /* CONFIG_TRANSPARENT_HUGEPAGE */ @@ -3818,7 +3807,9 @@ static int memcg_numa_stat_show(struct seq_file *m, void *v) static const unsigned int memcg1_stats[] = { NR_FILE_PAGES, NR_ANON_MAPPED, - MEMCG_RSS_HUGE, +#ifdef CONFIG_TRANSPARENT_HUGEPAGE + NR_ANON_THPS, +#endif NR_SHMEM, NR_FILE_MAPPED, NR_FILE_DIRTY, @@ -3829,7 +3820,9 @@ static const unsigned int memcg1_stats[] = { static const char *const memcg1_stat_names[] = { "cache", "rss", +#ifdef CONFIG_TRANSPARENT_HUGEPAGE "rss_huge", +#endif "shmem", "mapped_file", "dirty", @@ -3855,11 +3848,16 @@ static int memcg_stat_show(struct seq_file *m, void *v) BUILD_BUG_ON(ARRAY_SIZE(memcg1_stat_names) != ARRAY_SIZE(memcg1_stats)); for (i = 0; i < ARRAY_SIZE(memcg1_stats); i++) { + unsigned long nr; + if (memcg1_stats[i] == MEMCG_SWAP && !do_memsw_account()) continue; - seq_printf(m, "%s %lu\n", memcg1_stat_names[i], - memcg_page_state_local(memcg, memcg1_stats[i]) * - PAGE_SIZE); + nr = memcg_page_state_local(memcg, memcg1_stats[i]); +#ifdef CONFIG_TRANSPARENT_HUGEPAGE + if (memcg1_stats[i] == NR_ANON_THPS) + nr *= HPAGE_PMD_NR; +#endif + seq_printf(m, "%s %lu\n", memcg1_stat_names[i], nr * PAGE_SIZE); } for (i = 0; i < ARRAY_SIZE(memcg1_events); i++) @@ -5452,6 +5450,13 @@ static int mem_cgroup_move_account(struct page *page, if (page_mapped(page)) { __mod_lruvec_state(from_vec, NR_ANON_MAPPED, -nr_pages); __mod_lruvec_state(to_vec, NR_ANON_MAPPED, nr_pages); + if (PageTransHuge(page)) { + __mod_lruvec_state(from_vec, NR_ANON_THPS, + -nr_pages); + __mod_lruvec_state(to_vec, NR_ANON_THPS, + nr_pages); + } + } } else { __mod_lruvec_state(from_vec, NR_FILE_PAGES, -nr_pages); @@ -6671,7 +6676,6 @@ struct uncharge_gather { unsigned long nr_pages; unsigned long pgpgout; unsigned long 
nr_kmem; - unsigned long nr_huge; struct page *dummy_page; }; @@ -6694,7 +6698,6 @@ static void uncharge_batch(const struct uncharge_gather *ug) } local_irq_save(flags); - __mod_memcg_state(ug->memcg, MEMCG_RSS_HUGE, -ug->nr_huge); __count_memcg_events(ug->memcg, PGPGOUT, ug->pgpgout); __this_cpu_add(ug->memcg->vmstats_percpu->nr_page_events, ug->nr_pages); memcg_check_events(ug->memcg, ug->dummy_page); @@ -6731,8 +6734,6 @@ static void uncharge_page(struct page *page, struct uncharge_gather *ug) ug->nr_pages += nr_pages; if (!PageKmemcg(page)) { - if (PageTransHuge(page)) - ug->nr_huge += nr_pages; ug->pgpgout++; } else { ug->nr_kmem += nr_pages; diff --git a/mm/rmap.c b/mm/rmap.c index 150513d31efa..ad4a0fdcc94c 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -1138,7 +1138,7 @@ void do_page_add_anon_rmap(struct page *page, * disabled. */ if (compound) - __inc_node_page_state(page, NR_ANON_THPS); + __inc_lruvec_page_state(page, NR_ANON_THPS); __mod_lruvec_page_state(page, NR_ANON_MAPPED, nr); } @@ -1180,7 +1180,7 @@ void page_add_new_anon_rmap(struct page *page, if (hpage_pincount_available(page)) atomic_set(compound_pincount_ptr(page), 0); - __inc_node_page_state(page, NR_ANON_THPS); + __inc_lruvec_page_state(page, NR_ANON_THPS); } else { /* Anon THP always mapped first with PMD */ VM_BUG_ON_PAGE(PageTransCompound(page), page); @@ -1286,7 +1286,7 @@ static void page_remove_anon_compound_rmap(struct page *page) if (!IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE)) return; - __dec_node_page_state(page, NR_ANON_THPS); + __dec_lruvec_page_state(page, NR_ANON_THPS); if (TestClearPageDoubleMap(page)) { /* -- cgit v1.2.3 From 9d82c69438d0dff8809061edbcce43a5a4bcf09f Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Wed, 3 Jun 2020 16:02:04 -0700 Subject: mm: memcontrol: convert anon and file-thp to new mem_cgroup_charge() API With the page->mapping requirement gone from memcg, we can charge anon and file-thp pages in one single step, right after they're allocated. This removes two out of three API calls - especially the tricky commit step that needed to happen at just the right time between when the page is "set up" and when it's "published" - somewhat vague and fluid concepts that varied by page type. All we need is a freshly allocated page and a memcg context to charge. v2: prevent double charges on pre-allocated hugepages in khugepaged [hannes@cmpxchg.org: Fix crash - *hpage could be ERR_PTR instead of NULL] Link: http://lkml.kernel.org/r/20200512215813.GA487759@cmpxchg.org Signed-off-by: Johannes Weiner Signed-off-by: Andrew Morton Reviewed-by: Joonsoo Kim Cc: Alex Shi Cc: Hugh Dickins Cc: "Kirill A. 
Shutemov" Cc: Michal Hocko Cc: Roman Gushchin Cc: Shakeel Butt Cc: Balbir Singh Cc: Qian Cai Link: http://lkml.kernel.org/r/20200508183105.225460-13-hannes@cmpxchg.org Signed-off-by: Linus Torvalds --- include/linux/mm.h | 4 +--- kernel/events/uprobes.c | 11 +++-------- mm/filemap.c | 2 +- mm/huge_memory.c | 9 +++------ mm/khugepaged.c | 35 ++++++++++------------------------- mm/memory.c | 36 ++++++++++-------------------------- mm/migrate.c | 5 +---- mm/swapfile.c | 6 +----- mm/userfaultfd.c | 5 +---- 9 files changed, 31 insertions(+), 82 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mm.h b/include/linux/mm.h index ff73187c3fd4..4ef044fa09fa 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -501,7 +501,6 @@ struct vm_fault { pte_t orig_pte; /* Value of PTE at the time of fault */ struct page *cow_page; /* Page handler may use for COW fault */ - struct mem_cgroup *memcg; /* Cgroup cow_page belongs to */ struct page *page; /* ->fault handlers should return a * page here, unless VM_FAULT_NOPAGE * is set (which is also implied by @@ -946,8 +945,7 @@ static inline pte_t maybe_mkwrite(pte_t pte, struct vm_area_struct *vma) return pte; } -vm_fault_t alloc_set_pte(struct vm_fault *vmf, struct mem_cgroup *memcg, - struct page *page); +vm_fault_t alloc_set_pte(struct vm_fault *vmf, struct page *page); vm_fault_t finish_fault(struct vm_fault *vmf); vm_fault_t finish_mkwrite_fault(struct vm_fault *vmf); #endif diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c index 89ef81b65bcb..4253c153e985 100644 --- a/kernel/events/uprobes.c +++ b/kernel/events/uprobes.c @@ -162,14 +162,13 @@ static int __replace_page(struct vm_area_struct *vma, unsigned long addr, }; int err; struct mmu_notifier_range range; - struct mem_cgroup *memcg; mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma, mm, addr, addr + PAGE_SIZE); if (new_page) { - err = mem_cgroup_try_charge(new_page, vma->vm_mm, GFP_KERNEL, - &memcg); + err = mem_cgroup_charge(new_page, vma->vm_mm, GFP_KERNEL, + false); if (err) return err; } @@ -179,16 +178,12 @@ static int __replace_page(struct vm_area_struct *vma, unsigned long addr, mmu_notifier_invalidate_range_start(&range); err = -EAGAIN; - if (!page_vma_mapped_walk(&pvmw)) { - if (new_page) - mem_cgroup_cancel_charge(new_page, memcg); + if (!page_vma_mapped_walk(&pvmw)) goto unlock; - } VM_BUG_ON_PAGE(addr != pvmw.address, old_page); if (new_page) { get_page(new_page); - mem_cgroup_commit_charge(new_page, memcg, false); page_add_new_anon_rmap(new_page, vma, addr, false); lru_cache_add_active_or_unevictable(new_page, vma); } else diff --git a/mm/filemap.c b/mm/filemap.c index a6565890cdf0..f08b0ca34e31 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -2633,7 +2633,7 @@ void filemap_map_pages(struct vm_fault *vmf, if (vmf->pte) vmf->pte += xas.xa_index - last_pgoff; last_pgoff = xas.xa_index; - if (alloc_set_pte(vmf, NULL, page)) + if (alloc_set_pte(vmf, page)) goto unlock; unlock_page(page); goto next; diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 1fe980dafe03..e9201a88157e 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -587,19 +587,19 @@ static vm_fault_t __do_huge_pmd_anonymous_page(struct vm_fault *vmf, struct page *page, gfp_t gfp) { struct vm_area_struct *vma = vmf->vma; - struct mem_cgroup *memcg; pgtable_t pgtable; unsigned long haddr = vmf->address & HPAGE_PMD_MASK; vm_fault_t ret = 0; VM_BUG_ON_PAGE(!PageCompound(page), page); - if (mem_cgroup_try_charge_delay(page, vma->vm_mm, gfp, &memcg)) { + if (mem_cgroup_charge(page, 
vma->vm_mm, gfp, false)) { put_page(page); count_vm_event(THP_FAULT_FALLBACK); count_vm_event(THP_FAULT_FALLBACK_CHARGE); return VM_FAULT_FALLBACK; } + cgroup_throttle_swaprate(page, gfp); pgtable = pte_alloc_one(vma->vm_mm); if (unlikely(!pgtable)) { @@ -630,7 +630,6 @@ static vm_fault_t __do_huge_pmd_anonymous_page(struct vm_fault *vmf, vm_fault_t ret2; spin_unlock(vmf->ptl); - mem_cgroup_cancel_charge(page, memcg); put_page(page); pte_free(vma->vm_mm, pgtable); ret2 = handle_userfault(vmf, VM_UFFD_MISSING); @@ -640,7 +639,6 @@ static vm_fault_t __do_huge_pmd_anonymous_page(struct vm_fault *vmf, entry = mk_huge_pmd(page, vma->vm_page_prot); entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma); - mem_cgroup_commit_charge(page, memcg, false); page_add_new_anon_rmap(page, vma, haddr, true); lru_cache_add_active_or_unevictable(page, vma); pgtable_trans_huge_deposit(vma->vm_mm, vmf->pmd, pgtable); @@ -649,7 +647,7 @@ static vm_fault_t __do_huge_pmd_anonymous_page(struct vm_fault *vmf, mm_inc_nr_ptes(vma->vm_mm); spin_unlock(vmf->ptl); count_vm_event(THP_FAULT_ALLOC); - count_memcg_events(memcg, THP_FAULT_ALLOC, 1); + count_memcg_event_mm(vma->vm_mm, THP_FAULT_ALLOC); } return 0; @@ -658,7 +656,6 @@ unlock_release: release: if (pgtable) pte_free(vma->vm_mm, pgtable); - mem_cgroup_cancel_charge(page, memcg); put_page(page); return ret; diff --git a/mm/khugepaged.c b/mm/khugepaged.c index 34eff4dfae80..32c85b81837a 100644 --- a/mm/khugepaged.c +++ b/mm/khugepaged.c @@ -1037,7 +1037,6 @@ static void collapse_huge_page(struct mm_struct *mm, struct page *new_page; spinlock_t *pmd_ptl, *pte_ptl; int isolated = 0, result = 0; - struct mem_cgroup *memcg; struct vm_area_struct *vma; struct mmu_notifier_range range; gfp_t gfp; @@ -1060,15 +1059,15 @@ static void collapse_huge_page(struct mm_struct *mm, goto out_nolock; } - if (unlikely(mem_cgroup_try_charge(new_page, mm, gfp, &memcg))) { + if (unlikely(mem_cgroup_charge(new_page, mm, gfp, false))) { result = SCAN_CGROUP_CHARGE_FAIL; goto out_nolock; } + count_memcg_page_event(new_page, THP_COLLAPSE_ALLOC); down_read(&mm->mmap_sem); result = hugepage_vma_revalidate(mm, address, &vma); if (result) { - mem_cgroup_cancel_charge(new_page, memcg); up_read(&mm->mmap_sem); goto out_nolock; } @@ -1076,7 +1075,6 @@ static void collapse_huge_page(struct mm_struct *mm, pmd = mm_find_pmd(mm, address); if (!pmd) { result = SCAN_PMD_NULL; - mem_cgroup_cancel_charge(new_page, memcg); up_read(&mm->mmap_sem); goto out_nolock; } @@ -1088,7 +1086,6 @@ static void collapse_huge_page(struct mm_struct *mm, */ if (unmapped && !__collapse_huge_page_swapin(mm, vma, address, pmd, referenced)) { - mem_cgroup_cancel_charge(new_page, memcg); up_read(&mm->mmap_sem); goto out_nolock; } @@ -1175,9 +1172,7 @@ static void collapse_huge_page(struct mm_struct *mm, spin_lock(pmd_ptl); BUG_ON(!pmd_none(*pmd)); - mem_cgroup_commit_charge(new_page, memcg, false); page_add_new_anon_rmap(new_page, vma, address, true); - count_memcg_events(memcg, THP_COLLAPSE_ALLOC, 1); lru_cache_add_active_or_unevictable(new_page, vma); pgtable_trans_huge_deposit(mm, pmd, pgtable); set_pmd_at(mm, address, pmd, _pmd); @@ -1191,10 +1186,11 @@ static void collapse_huge_page(struct mm_struct *mm, out_up_write: up_write(&mm->mmap_sem); out_nolock: + if (!IS_ERR_OR_NULL(*hpage)) + mem_cgroup_uncharge(*hpage); trace_mm_collapse_huge_page(mm, isolated, result); return; out: - mem_cgroup_cancel_charge(new_page, memcg); goto out_up_write; } @@ -1618,7 +1614,6 @@ static void collapse_file(struct mm_struct *mm, struct 
address_space *mapping = file->f_mapping; gfp_t gfp; struct page *new_page; - struct mem_cgroup *memcg; pgoff_t index, end = start + HPAGE_PMD_NR; LIST_HEAD(pagelist); XA_STATE_ORDER(xas, &mapping->i_pages, start, HPAGE_PMD_ORDER); @@ -1637,10 +1632,11 @@ static void collapse_file(struct mm_struct *mm, goto out; } - if (unlikely(mem_cgroup_try_charge(new_page, mm, gfp, &memcg))) { + if (unlikely(mem_cgroup_charge(new_page, mm, gfp, false))) { result = SCAN_CGROUP_CHARGE_FAIL; goto out; } + count_memcg_page_event(new_page, THP_COLLAPSE_ALLOC); /* This will be less messy when we use multi-index entries */ do { @@ -1650,7 +1646,6 @@ static void collapse_file(struct mm_struct *mm, break; xas_unlock_irq(&xas); if (!xas_nomem(&xas, GFP_KERNEL)) { - mem_cgroup_cancel_charge(new_page, memcg); result = SCAN_FAIL; goto out; } @@ -1844,18 +1839,9 @@ out_unlock: } if (nr_none) { - struct lruvec *lruvec; - /* - * XXX: We have started try_charge and pinned the - * memcg, but the page isn't committed yet so we - * cannot use mod_lruvec_page_state(). This hackery - * will be cleaned up when remove the page->mapping - * dependency from memcg and fully charge above. - */ - lruvec = mem_cgroup_lruvec(memcg, page_pgdat(new_page)); - __mod_lruvec_state(lruvec, NR_FILE_PAGES, nr_none); + __mod_lruvec_page_state(new_page, NR_FILE_PAGES, nr_none); if (is_shmem) - __mod_lruvec_state(lruvec, NR_SHMEM, nr_none); + __mod_lruvec_page_state(new_page, NR_SHMEM, nr_none); } xa_locked: @@ -1893,7 +1879,6 @@ xa_unlocked: SetPageUptodate(new_page); page_ref_add(new_page, HPAGE_PMD_NR - 1); - mem_cgroup_commit_charge(new_page, memcg, false); if (is_shmem) { set_page_dirty(new_page); @@ -1901,7 +1886,6 @@ xa_unlocked: } else { lru_cache_add_file(new_page); } - count_memcg_events(memcg, THP_COLLAPSE_ALLOC, 1); /* * Remove pte page tables, so we can re-fault the page as huge. @@ -1948,13 +1932,14 @@ xa_unlocked: VM_BUG_ON(nr_none); xas_unlock_irq(&xas); - mem_cgroup_cancel_charge(new_page, memcg); new_page->mapping = NULL; } unlock_page(new_page); out: VM_BUG_ON(!list_empty(&pagelist)); + if (!IS_ERR_OR_NULL(*hpage)) + mem_cgroup_uncharge(*hpage); /* TODO: tracepoints */ } diff --git a/mm/memory.c b/mm/memory.c index 543e41b1d57a..27e225bef5d0 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -2645,7 +2645,6 @@ static vm_fault_t wp_page_copy(struct vm_fault *vmf) struct page *new_page = NULL; pte_t entry; int page_copied = 0; - struct mem_cgroup *memcg; struct mmu_notifier_range range; if (unlikely(anon_vma_prepare(vma))) @@ -2676,8 +2675,9 @@ static vm_fault_t wp_page_copy(struct vm_fault *vmf) } } - if (mem_cgroup_try_charge_delay(new_page, mm, GFP_KERNEL, &memcg)) + if (mem_cgroup_charge(new_page, mm, GFP_KERNEL, false)) goto oom_free_new; + cgroup_throttle_swaprate(new_page, GFP_KERNEL); __SetPageUptodate(new_page); @@ -2710,7 +2710,6 @@ static vm_fault_t wp_page_copy(struct vm_fault *vmf) * thread doing COW. */ ptep_clear_flush_notify(vma, vmf->address, vmf->pte); - mem_cgroup_commit_charge(new_page, memcg, false); page_add_new_anon_rmap(new_page, vma, vmf->address, false); lru_cache_add_active_or_unevictable(new_page, vma); /* @@ -2749,8 +2748,6 @@ static vm_fault_t wp_page_copy(struct vm_fault *vmf) /* Free the old page.. 
*/ new_page = old_page; page_copied = 1; - } else { - mem_cgroup_cancel_charge(new_page, memcg); } if (new_page) @@ -3088,7 +3085,6 @@ vm_fault_t do_swap_page(struct vm_fault *vmf) { struct vm_area_struct *vma = vmf->vma; struct page *page = NULL, *swapcache; - struct mem_cgroup *memcg; swp_entry_t entry; pte_t pte; int locked; @@ -3193,10 +3189,11 @@ vm_fault_t do_swap_page(struct vm_fault *vmf) goto out_page; } - if (mem_cgroup_try_charge_delay(page, vma->vm_mm, GFP_KERNEL, &memcg)) { + if (mem_cgroup_charge(page, vma->vm_mm, GFP_KERNEL, true)) { ret = VM_FAULT_OOM; goto out_page; } + cgroup_throttle_swaprate(page, GFP_KERNEL); /* * Back out if somebody else already faulted in this pte. @@ -3243,11 +3240,9 @@ vm_fault_t do_swap_page(struct vm_fault *vmf) /* ksm created a completely new copy */ if (unlikely(page != swapcache && swapcache)) { - mem_cgroup_commit_charge(page, memcg, false); page_add_new_anon_rmap(page, vma, vmf->address, false); lru_cache_add_active_or_unevictable(page, vma); } else { - mem_cgroup_commit_charge(page, memcg, true); do_page_add_anon_rmap(page, vma, vmf->address, exclusive); activate_page(page); } @@ -3284,7 +3279,6 @@ unlock: out: return ret; out_nomap: - mem_cgroup_cancel_charge(page, memcg); pte_unmap_unlock(vmf->pte, vmf->ptl); out_page: unlock_page(page); @@ -3305,7 +3299,6 @@ out_release: static vm_fault_t do_anonymous_page(struct vm_fault *vmf) { struct vm_area_struct *vma = vmf->vma; - struct mem_cgroup *memcg; struct page *page; vm_fault_t ret = 0; pte_t entry; @@ -3358,8 +3351,9 @@ static vm_fault_t do_anonymous_page(struct vm_fault *vmf) if (!page) goto oom; - if (mem_cgroup_try_charge_delay(page, vma->vm_mm, GFP_KERNEL, &memcg)) + if (mem_cgroup_charge(page, vma->vm_mm, GFP_KERNEL, false)) goto oom_free_page; + cgroup_throttle_swaprate(page, GFP_KERNEL); /* * The memory barrier inside __SetPageUptodate makes sure that @@ -3384,13 +3378,11 @@ static vm_fault_t do_anonymous_page(struct vm_fault *vmf) /* Deliver the page fault to userland, check inside PT lock */ if (userfaultfd_missing(vma)) { pte_unmap_unlock(vmf->pte, vmf->ptl); - mem_cgroup_cancel_charge(page, memcg); put_page(page); return handle_userfault(vmf, VM_UFFD_MISSING); } inc_mm_counter_fast(vma->vm_mm, MM_ANONPAGES); - mem_cgroup_commit_charge(page, memcg, false); page_add_new_anon_rmap(page, vma, vmf->address, false); lru_cache_add_active_or_unevictable(page, vma); setpte: @@ -3402,7 +3394,6 @@ unlock: pte_unmap_unlock(vmf->pte, vmf->ptl); return ret; release: - mem_cgroup_cancel_charge(page, memcg); put_page(page); goto unlock; oom_free_page: @@ -3607,7 +3598,6 @@ static vm_fault_t do_set_pmd(struct vm_fault *vmf, struct page *page) * mapping. If needed, the fucntion allocates page table or use pre-allocated. * * @vmf: fault environment - * @memcg: memcg to charge page (only for private mappings) * @page: page to map * * Caller must take care of unlocking vmf->ptl, if vmf->pte is non-NULL on @@ -3618,8 +3608,7 @@ static vm_fault_t do_set_pmd(struct vm_fault *vmf, struct page *page) * * Return: %0 on success, %VM_FAULT_ code in case of error. */ -vm_fault_t alloc_set_pte(struct vm_fault *vmf, struct mem_cgroup *memcg, - struct page *page) +vm_fault_t alloc_set_pte(struct vm_fault *vmf, struct page *page) { struct vm_area_struct *vma = vmf->vma; bool write = vmf->flags & FAULT_FLAG_WRITE; @@ -3627,9 +3616,6 @@ vm_fault_t alloc_set_pte(struct vm_fault *vmf, struct mem_cgroup *memcg, vm_fault_t ret; if (pmd_none(*vmf->pmd) && PageTransCompound(page)) { - /* THP on COW? 
*/ - VM_BUG_ON_PAGE(memcg, page); - ret = do_set_pmd(vmf, page); if (ret != VM_FAULT_FALLBACK) return ret; @@ -3652,7 +3638,6 @@ vm_fault_t alloc_set_pte(struct vm_fault *vmf, struct mem_cgroup *memcg, /* copy-on-write page */ if (write && !(vma->vm_flags & VM_SHARED)) { inc_mm_counter_fast(vma->vm_mm, MM_ANONPAGES); - mem_cgroup_commit_charge(page, memcg, false); page_add_new_anon_rmap(page, vma, vmf->address, false); lru_cache_add_active_or_unevictable(page, vma); } else { @@ -3702,7 +3687,7 @@ vm_fault_t finish_fault(struct vm_fault *vmf) if (!(vmf->vma->vm_flags & VM_SHARED)) ret = check_stable_address_space(vmf->vma->vm_mm); if (!ret) - ret = alloc_set_pte(vmf, vmf->memcg, page); + ret = alloc_set_pte(vmf, page); if (vmf->pte) pte_unmap_unlock(vmf->pte, vmf->ptl); return ret; @@ -3862,11 +3847,11 @@ static vm_fault_t do_cow_fault(struct vm_fault *vmf) if (!vmf->cow_page) return VM_FAULT_OOM; - if (mem_cgroup_try_charge_delay(vmf->cow_page, vma->vm_mm, - GFP_KERNEL, &vmf->memcg)) { + if (mem_cgroup_charge(vmf->cow_page, vma->vm_mm, GFP_KERNEL, false)) { put_page(vmf->cow_page); return VM_FAULT_OOM; } + cgroup_throttle_swaprate(vmf->cow_page, GFP_KERNEL); ret = __do_fault(vmf); if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE | VM_FAULT_RETRY))) @@ -3884,7 +3869,6 @@ static vm_fault_t do_cow_fault(struct vm_fault *vmf) goto uncharge_out; return ret; uncharge_out: - mem_cgroup_cancel_charge(vmf->cow_page, vmf->memcg); put_page(vmf->cow_page); return ret; } diff --git a/mm/migrate.c b/mm/migrate.c index e72ed681634f..44cee40221ec 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -2740,7 +2740,6 @@ static void migrate_vma_insert_page(struct migrate_vma *migrate, { struct vm_area_struct *vma = migrate->vma; struct mm_struct *mm = vma->vm_mm; - struct mem_cgroup *memcg; bool flush = false; spinlock_t *ptl; pte_t entry; @@ -2787,7 +2786,7 @@ static void migrate_vma_insert_page(struct migrate_vma *migrate, if (unlikely(anon_vma_prepare(vma))) goto abort; - if (mem_cgroup_try_charge(page, vma->vm_mm, GFP_KERNEL, &memcg)) + if (mem_cgroup_charge(page, vma->vm_mm, GFP_KERNEL, false)) goto abort; /* @@ -2832,7 +2831,6 @@ static void migrate_vma_insert_page(struct migrate_vma *migrate, goto unlock_abort; inc_mm_counter(mm, MM_ANONPAGES); - mem_cgroup_commit_charge(page, memcg, false); page_add_new_anon_rmap(page, vma, addr, false); if (!is_zone_device_page(page)) lru_cache_add_active_or_unevictable(page, vma); @@ -2855,7 +2853,6 @@ static void migrate_vma_insert_page(struct migrate_vma *migrate, unlock_abort: pte_unmap_unlock(ptep, ptl); - mem_cgroup_cancel_charge(page, memcg); abort: *src &= ~MIGRATE_PFN_MIGRATE; } diff --git a/mm/swapfile.c b/mm/swapfile.c index 01f6538bad2d..720e9a924c01 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -1892,7 +1892,6 @@ static int unuse_pte(struct vm_area_struct *vma, pmd_t *pmd, unsigned long addr, swp_entry_t entry, struct page *page) { struct page *swapcache; - struct mem_cgroup *memcg; spinlock_t *ptl; pte_t *pte; int ret = 1; @@ -1902,14 +1901,13 @@ static int unuse_pte(struct vm_area_struct *vma, pmd_t *pmd, if (unlikely(!page)) return -ENOMEM; - if (mem_cgroup_try_charge(page, vma->vm_mm, GFP_KERNEL, &memcg)) { + if (mem_cgroup_charge(page, vma->vm_mm, GFP_KERNEL, true)) { ret = -ENOMEM; goto out_nolock; } pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl); if (unlikely(!pte_same_as_swp(*pte, swp_entry_to_pte(entry)))) { - mem_cgroup_cancel_charge(page, memcg); ret = 0; goto out; } @@ -1920,10 +1918,8 @@ static int unuse_pte(struct vm_area_struct 
*vma, pmd_t *pmd, set_pte_at(vma->vm_mm, addr, pte, pte_mkold(mk_pte(page, vma->vm_page_prot))); if (page == swapcache) { - mem_cgroup_commit_charge(page, memcg, true); page_add_anon_rmap(page, vma, addr, false); } else { /* ksm created a completely new copy */ - mem_cgroup_commit_charge(page, memcg, false); page_add_new_anon_rmap(page, vma, addr, false); lru_cache_add_active_or_unevictable(page, vma); } diff --git a/mm/userfaultfd.c b/mm/userfaultfd.c index 3dea268d2850..2745489415cc 100644 --- a/mm/userfaultfd.c +++ b/mm/userfaultfd.c @@ -56,7 +56,6 @@ static int mcopy_atomic_pte(struct mm_struct *dst_mm, struct page **pagep, bool wp_copy) { - struct mem_cgroup *memcg; pte_t _dst_pte, *dst_pte; spinlock_t *ptl; void *page_kaddr; @@ -97,7 +96,7 @@ static int mcopy_atomic_pte(struct mm_struct *dst_mm, __SetPageUptodate(page); ret = -ENOMEM; - if (mem_cgroup_try_charge(page, dst_mm, GFP_KERNEL, &memcg)) + if (mem_cgroup_charge(page, dst_mm, GFP_KERNEL, false)) goto out_release; _dst_pte = pte_mkdirty(mk_pte(page, dst_vma->vm_page_prot)); @@ -123,7 +122,6 @@ static int mcopy_atomic_pte(struct mm_struct *dst_mm, goto out_release_uncharge_unlock; inc_mm_counter(dst_mm, MM_ANONPAGES); - mem_cgroup_commit_charge(page, memcg, false); page_add_new_anon_rmap(page, dst_vma, dst_addr, false); lru_cache_add_active_or_unevictable(page, dst_vma); @@ -138,7 +136,6 @@ out: return ret; out_release_uncharge_unlock: pte_unmap_unlock(dst_pte, ptl); - mem_cgroup_cancel_charge(page, memcg); out_release: put_page(page); goto out; -- cgit v1.2.3 From f0e45fb4da29746a116e810eb91423ccfa4830fc Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Wed, 3 Jun 2020 16:02:07 -0700 Subject: mm: memcontrol: drop unused try/commit/cancel charge API There are no more users. RIP in peace. [arnd@arndb.de: fix an unused-function warning] Link: http://lkml.kernel.org/r/20200528095640.151454-1-arnd@arndb.de Signed-off-by: Johannes Weiner Signed-off-by: Arnd Bergmann Signed-off-by: Andrew Morton Reviewed-by: Joonsoo Kim Cc: Alex Shi Cc: Hugh Dickins Cc: "Kirill A. 
Shutemov" Cc: Michal Hocko Cc: Roman Gushchin Cc: Shakeel Butt Cc: Balbir Singh Link: http://lkml.kernel.org/r/20200508183105.225460-14-hannes@cmpxchg.org Signed-off-by: Linus Torvalds --- include/linux/memcontrol.h | 36 ------------- mm/memcontrol.c | 128 ++++++--------------------------------------- 2 files changed, 17 insertions(+), 147 deletions(-) (limited to 'include/linux') diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index 63a31a6c3c69..46620c6343ef 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -355,14 +355,6 @@ static inline unsigned long mem_cgroup_protection(struct mem_cgroup *memcg, enum mem_cgroup_protection mem_cgroup_protected(struct mem_cgroup *root, struct mem_cgroup *memcg); -int mem_cgroup_try_charge(struct page *page, struct mm_struct *mm, - gfp_t gfp_mask, struct mem_cgroup **memcgp); -int mem_cgroup_try_charge_delay(struct page *page, struct mm_struct *mm, - gfp_t gfp_mask, struct mem_cgroup **memcgp); -void mem_cgroup_commit_charge(struct page *page, struct mem_cgroup *memcg, - bool lrucare); -void mem_cgroup_cancel_charge(struct page *page, struct mem_cgroup *memcg); - int mem_cgroup_charge(struct page *page, struct mm_struct *mm, gfp_t gfp_mask, bool lrucare); @@ -846,34 +838,6 @@ static inline enum mem_cgroup_protection mem_cgroup_protected( return MEMCG_PROT_NONE; } -static inline int mem_cgroup_try_charge(struct page *page, struct mm_struct *mm, - gfp_t gfp_mask, - struct mem_cgroup **memcgp) -{ - *memcgp = NULL; - return 0; -} - -static inline int mem_cgroup_try_charge_delay(struct page *page, - struct mm_struct *mm, - gfp_t gfp_mask, - struct mem_cgroup **memcgp) -{ - *memcgp = NULL; - return 0; -} - -static inline void mem_cgroup_commit_charge(struct page *page, - struct mem_cgroup *memcg, - bool lrucare) -{ -} - -static inline void mem_cgroup_cancel_charge(struct page *page, - struct mem_cgroup *memcg) -{ -} - static inline int mem_cgroup_charge(struct page *page, struct mm_struct *mm, gfp_t gfp_mask, bool lrucare) { diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 17587ea2745a..1e7a10b450b2 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -2641,6 +2641,7 @@ done_restock: return 0; } +#if defined(CONFIG_MEMCG_KMEM) || defined(CONFIG_MMU) static void cancel_charge(struct mem_cgroup *memcg, unsigned int nr_pages) { if (mem_cgroup_is_root(memcg)) @@ -2652,6 +2653,7 @@ static void cancel_charge(struct mem_cgroup *memcg, unsigned int nr_pages) css_put_many(&memcg->css, nr_pages); } +#endif static void lock_page_lru(struct page *page, int *isolated) { @@ -6499,29 +6501,26 @@ out: } /** - * mem_cgroup_try_charge - try charging a page + * mem_cgroup_charge - charge a newly allocated page to a cgroup * @page: page to charge * @mm: mm context of the victim * @gfp_mask: reclaim mode - * @memcgp: charged memcg return + * @lrucare: page might be on the LRU already * * Try to charge @page to the memcg that @mm belongs to, reclaiming * pages according to @gfp_mask if necessary. * - * Returns 0 on success, with *@memcgp pointing to the charged memcg. - * Otherwise, an error code is returned. - * - * After page->mapping has been set up, the caller must finalize the - * charge with mem_cgroup_commit_charge(). Or abort the transaction - * with mem_cgroup_cancel_charge() in case page instantiation fails. + * Returns 0 on success. Otherwise, an error code is returned. 
*/ -int mem_cgroup_try_charge(struct page *page, struct mm_struct *mm, - gfp_t gfp_mask, struct mem_cgroup **memcgp) +int mem_cgroup_charge(struct page *page, struct mm_struct *mm, gfp_t gfp_mask, + bool lrucare) { unsigned int nr_pages = hpage_nr_pages(page); struct mem_cgroup *memcg = NULL; int ret = 0; + VM_BUG_ON_PAGE(PageLRU(page) && !lrucare, page); + if (mem_cgroup_disabled()) goto out; @@ -6553,56 +6552,8 @@ int mem_cgroup_try_charge(struct page *page, struct mm_struct *mm, memcg = get_mem_cgroup_from_mm(mm); ret = try_charge(memcg, gfp_mask, nr_pages); - - css_put(&memcg->css); -out: - *memcgp = memcg; - return ret; -} - -int mem_cgroup_try_charge_delay(struct page *page, struct mm_struct *mm, - gfp_t gfp_mask, struct mem_cgroup **memcgp) -{ - int ret; - - ret = mem_cgroup_try_charge(page, mm, gfp_mask, memcgp); - if (*memcgp) - cgroup_throttle_swaprate(page, gfp_mask); - return ret; -} - -/** - * mem_cgroup_commit_charge - commit a page charge - * @page: page to charge - * @memcg: memcg to charge the page to - * @lrucare: page might be on LRU already - * - * Finalize a charge transaction started by mem_cgroup_try_charge(), - * after page->mapping has been set up. This must happen atomically - * as part of the page instantiation, i.e. under the page table lock - * for anonymous pages, under the page lock for page and swap cache. - * - * In addition, the page must not be on the LRU during the commit, to - * prevent racing with task migration. If it might be, use @lrucare. - * - * Use mem_cgroup_cancel_charge() to cancel the transaction instead. - */ -void mem_cgroup_commit_charge(struct page *page, struct mem_cgroup *memcg, - bool lrucare) -{ - unsigned int nr_pages = hpage_nr_pages(page); - - VM_BUG_ON_PAGE(PageLRU(page) && !lrucare, page); - - if (mem_cgroup_disabled()) - return; - /* - * Swap faults will attempt to charge the same page multiple - * times. But reuse_swap_page() might have removed the page - * from swapcache already, so we can't check PageSwapCache(). - */ - if (!memcg) - return; + if (ret) + goto out_put; commit_charge(page, memcg, lrucare); @@ -6620,55 +6571,11 @@ void mem_cgroup_commit_charge(struct page *page, struct mem_cgroup *memcg, */ mem_cgroup_uncharge_swap(entry, nr_pages); } -} -/** - * mem_cgroup_cancel_charge - cancel a page charge - * @page: page to charge - * @memcg: memcg to charge the page to - * - * Cancel a charge transaction started by mem_cgroup_try_charge(). - */ -void mem_cgroup_cancel_charge(struct page *page, struct mem_cgroup *memcg) -{ - unsigned int nr_pages = hpage_nr_pages(page); - - if (mem_cgroup_disabled()) - return; - /* - * Swap faults will attempt to charge the same page multiple - * times. But reuse_swap_page() might have removed the page - * from swapcache already, so we can't check PageSwapCache(). - */ - if (!memcg) - return; - - cancel_charge(memcg, nr_pages); -} - -/** - * mem_cgroup_charge - charge a newly allocated page to a cgroup - * @page: page to charge - * @mm: mm context of the victim - * @gfp_mask: reclaim mode - * @lrucare: page might be on the LRU already - * - * Try to charge @page to the memcg that @mm belongs to, reclaiming - * pages according to @gfp_mask if necessary. - * - * Returns 0 on success. Otherwise, an error code is returned. 
- */ -int mem_cgroup_charge(struct page *page, struct mm_struct *mm, gfp_t gfp_mask, - bool lrucare) -{ - struct mem_cgroup *memcg; - int ret; - - ret = mem_cgroup_try_charge(page, mm, gfp_mask, &memcg); - if (ret) - return ret; - mem_cgroup_commit_charge(page, memcg, lrucare); - return 0; +out_put: + css_put(&memcg->css); +out: + return ret; } struct uncharge_gather { @@ -6773,8 +6680,7 @@ static void uncharge_list(struct list_head *page_list) * mem_cgroup_uncharge - uncharge a page * @page: page to uncharge * - * Uncharge a page previously charged with mem_cgroup_try_charge() and - * mem_cgroup_commit_charge(). + * Uncharge a page previously charged with mem_cgroup_charge(). */ void mem_cgroup_uncharge(struct page *page) { @@ -6797,7 +6703,7 @@ void mem_cgroup_uncharge(struct page *page) * @page_list: list of pages to uncharge * * Uncharge a list of pages previously charged with - * mem_cgroup_try_charge() and mem_cgroup_commit_charge(). + * mem_cgroup_charge(). */ void mem_cgroup_uncharge_list(struct list_head *page_list) { -- cgit v1.2.3 From eccb52e7880973f221ab2606e4d22ce04d96a1a9 Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Wed, 3 Jun 2020 16:02:11 -0700 Subject: mm: memcontrol: prepare swap controller setup for integration A few cleanups to streamline the swap controller setup: - Replace the do_swap_account flag with cgroup_memory_noswap. This brings it in line with other functionality that is usually available unless explicitly opted out of - nosocket, nokmem. - Remove the really_do_swap_account flag that stores the boot option and is later used to switch the do_swap_account. It's not clear why this indirection is/was necessary. Use do_swap_account directly. - Minor coding style polishing Signed-off-by: Johannes Weiner Signed-off-by: Andrew Morton Reviewed-by: Joonsoo Kim Cc: Alex Shi Cc: Hugh Dickins Cc: "Kirill A. 
Shutemov" Cc: Michal Hocko Cc: Roman Gushchin Cc: Shakeel Butt Cc: Balbir Singh Link: http://lkml.kernel.org/r/20200508183105.225460-15-hannes@cmpxchg.org Signed-off-by: Linus Torvalds --- include/linux/memcontrol.h | 2 +- mm/memcontrol.c | 59 ++++++++++++++++++++++------------------------ mm/swap_cgroup.c | 4 ++-- 3 files changed, 31 insertions(+), 34 deletions(-) (limited to 'include/linux') diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index 46620c6343ef..96257f995caa 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -558,7 +558,7 @@ struct mem_cgroup *mem_cgroup_get_oom_group(struct task_struct *victim, void mem_cgroup_print_oom_group(struct mem_cgroup *memcg); #ifdef CONFIG_MEMCG_SWAP -extern int do_swap_account; +extern bool cgroup_memory_noswap; #endif struct mem_cgroup *lock_page_memcg(struct page *page); diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 1e7a10b450b2..2d035e68a127 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -83,10 +83,14 @@ static bool cgroup_memory_nokmem; /* Whether the swap controller is active */ #ifdef CONFIG_MEMCG_SWAP -int do_swap_account __read_mostly; +#ifdef CONFIG_MEMCG_SWAP_ENABLED +bool cgroup_memory_noswap __read_mostly; #else -#define do_swap_account 0 -#endif +bool cgroup_memory_noswap __read_mostly = 1; +#endif /* CONFIG_MEMCG_SWAP_ENABLED */ +#else +#define cgroup_memory_noswap 1 +#endif /* CONFIG_MEMCG_SWAP */ #ifdef CONFIG_CGROUP_WRITEBACK static DECLARE_WAIT_QUEUE_HEAD(memcg_cgwb_frn_waitq); @@ -95,7 +99,7 @@ static DECLARE_WAIT_QUEUE_HEAD(memcg_cgwb_frn_waitq); /* Whether legacy memory+swap accounting is active */ static bool do_memsw_account(void) { - return !cgroup_subsys_on_dfl(memory_cgrp_subsys) && do_swap_account; + return !cgroup_subsys_on_dfl(memory_cgrp_subsys) && !cgroup_memory_noswap; } #define THRESHOLDS_EVENTS_TARGET 128 @@ -6528,18 +6532,19 @@ int mem_cgroup_charge(struct page *page, struct mm_struct *mm, gfp_t gfp_mask, /* * Every swap fault against a single page tries to charge the * page, bail as early as possible. shmem_unuse() encounters - * already charged pages, too. The USED bit is protected by - * the page lock, which serializes swap cache removal, which + * already charged pages, too. page->mem_cgroup is protected + * by the page lock, which serializes swap cache removal, which * in turn serializes uncharging. 
*/ VM_BUG_ON_PAGE(!PageLocked(page), page); if (compound_head(page)->mem_cgroup) goto out; - if (do_swap_account) { + if (!cgroup_memory_noswap) { swp_entry_t ent = { .val = page_private(page), }; - unsigned short id = lookup_swap_cgroup_id(ent); + unsigned short id; + id = lookup_swap_cgroup_id(ent); rcu_read_lock(); memcg = mem_cgroup_from_id(id); if (memcg && !css_tryget_online(&memcg->css)) @@ -7012,7 +7017,7 @@ int mem_cgroup_try_charge_swap(struct page *page, swp_entry_t entry) struct mem_cgroup *memcg; unsigned short oldid; - if (!cgroup_subsys_on_dfl(memory_cgrp_subsys) || !do_swap_account) + if (!cgroup_subsys_on_dfl(memory_cgrp_subsys) || cgroup_memory_noswap) return 0; memcg = page->mem_cgroup; @@ -7056,7 +7061,7 @@ void mem_cgroup_uncharge_swap(swp_entry_t entry, unsigned int nr_pages) struct mem_cgroup *memcg; unsigned short id; - if (!do_swap_account) + if (cgroup_memory_noswap) return; id = swap_cgroup_record(entry, 0, nr_pages); @@ -7079,7 +7084,7 @@ long mem_cgroup_get_nr_swap_pages(struct mem_cgroup *memcg) { long nr_swap_pages = get_nr_swap_pages(); - if (!do_swap_account || !cgroup_subsys_on_dfl(memory_cgrp_subsys)) + if (cgroup_memory_noswap || !cgroup_subsys_on_dfl(memory_cgrp_subsys)) return nr_swap_pages; for (; memcg != root_mem_cgroup; memcg = parent_mem_cgroup(memcg)) nr_swap_pages = min_t(long, nr_swap_pages, @@ -7096,7 +7101,7 @@ bool mem_cgroup_swap_full(struct page *page) if (vm_swap_full()) return true; - if (!do_swap_account || !cgroup_subsys_on_dfl(memory_cgrp_subsys)) + if (cgroup_memory_noswap || !cgroup_subsys_on_dfl(memory_cgrp_subsys)) return false; memcg = page->mem_cgroup; @@ -7114,22 +7119,15 @@ bool mem_cgroup_swap_full(struct page *page) return false; } -/* for remember boot option*/ -#ifdef CONFIG_MEMCG_SWAP_ENABLED -static int really_do_swap_account __initdata = 1; -#else -static int really_do_swap_account __initdata; -#endif - -static int __init enable_swap_account(char *s) +static int __init setup_swap_account(char *s) { if (!strcmp(s, "1")) - really_do_swap_account = 1; + cgroup_memory_noswap = 0; else if (!strcmp(s, "0")) - really_do_swap_account = 0; + cgroup_memory_noswap = 1; return 1; } -__setup("swapaccount=", enable_swap_account); +__setup("swapaccount=", setup_swap_account); static u64 swap_current_read(struct cgroup_subsys_state *css, struct cftype *cft) @@ -7226,7 +7224,7 @@ static struct cftype swap_files[] = { { } /* terminate */ }; -static struct cftype memsw_cgroup_files[] = { +static struct cftype memsw_files[] = { { .name = "memsw.usage_in_bytes", .private = MEMFILE_PRIVATE(_MEMSWAP, RES_USAGE), @@ -7255,13 +7253,12 @@ static struct cftype memsw_cgroup_files[] = { static int __init mem_cgroup_swap_init(void) { - if (!mem_cgroup_disabled() && really_do_swap_account) { - do_swap_account = 1; - WARN_ON(cgroup_add_dfl_cftypes(&memory_cgrp_subsys, - swap_files)); - WARN_ON(cgroup_add_legacy_cftypes(&memory_cgrp_subsys, - memsw_cgroup_files)); - } + if (mem_cgroup_disabled() || cgroup_memory_noswap) + return 0; + + WARN_ON(cgroup_add_dfl_cftypes(&memory_cgrp_subsys, swap_files)); + WARN_ON(cgroup_add_legacy_cftypes(&memory_cgrp_subsys, memsw_files)); + return 0; } subsys_initcall(mem_cgroup_swap_init); diff --git a/mm/swap_cgroup.c b/mm/swap_cgroup.c index 45affaef3bc6..7aa764f09079 100644 --- a/mm/swap_cgroup.c +++ b/mm/swap_cgroup.c @@ -171,7 +171,7 @@ int swap_cgroup_swapon(int type, unsigned long max_pages) unsigned long length; struct swap_cgroup_ctrl *ctrl; - if (!do_swap_account) + if (cgroup_memory_noswap) return 0; length 
= DIV_ROUND_UP(max_pages, SC_PER_PAGE); @@ -209,7 +209,7 @@ void swap_cgroup_swapoff(int type) unsigned long i, length; struct swap_cgroup_ctrl *ctrl; - if (!do_swap_account) + if (cgroup_memory_noswap) return; mutex_lock(&swap_cgroup_mutex); -- cgit v1.2.3 From d9eb1ea2bf8734afd8ec7d995270437a7242f82b Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Wed, 3 Jun 2020 16:02:24 -0700 Subject: mm: memcontrol: delete unused lrucare handling Swapin faults were the last event to charge pages after they had already been put on the LRU list. Now that we charge directly on swapin, the lrucare portion of the charge code is unused. Signed-off-by: Johannes Weiner Signed-off-by: Andrew Morton Reviewed-by: Joonsoo Kim Cc: Alex Shi Cc: Hugh Dickins Cc: "Kirill A. Shutemov" Cc: Michal Hocko Cc: Roman Gushchin Cc: Balbir Singh Cc: Shakeel Butt Link: http://lkml.kernel.org/r/20200508183105.225460-19-hannes@cmpxchg.org Signed-off-by: Linus Torvalds --- include/linux/memcontrol.h | 5 ++-- kernel/events/uprobes.c | 3 +-- mm/filemap.c | 2 +- mm/huge_memory.c | 2 +- mm/khugepaged.c | 4 ++-- mm/memcontrol.c | 57 ++++------------------------------------------ mm/memory.c | 8 +++---- mm/migrate.c | 2 +- mm/shmem.c | 2 +- mm/swap_state.c | 2 +- mm/userfaultfd.c | 2 +- 11 files changed, 19 insertions(+), 70 deletions(-) (limited to 'include/linux') diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index 96257f995caa..d5bf3b5bfe6d 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -355,8 +355,7 @@ static inline unsigned long mem_cgroup_protection(struct mem_cgroup *memcg, enum mem_cgroup_protection mem_cgroup_protected(struct mem_cgroup *root, struct mem_cgroup *memcg); -int mem_cgroup_charge(struct page *page, struct mm_struct *mm, gfp_t gfp_mask, - bool lrucare); +int mem_cgroup_charge(struct page *page, struct mm_struct *mm, gfp_t gfp_mask); void mem_cgroup_uncharge(struct page *page); void mem_cgroup_uncharge_list(struct list_head *page_list); @@ -839,7 +838,7 @@ static inline enum mem_cgroup_protection mem_cgroup_protected( } static inline int mem_cgroup_charge(struct page *page, struct mm_struct *mm, - gfp_t gfp_mask, bool lrucare) + gfp_t gfp_mask) { return 0; } diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c index 4253c153e985..eddc8db96027 100644 --- a/kernel/events/uprobes.c +++ b/kernel/events/uprobes.c @@ -167,8 +167,7 @@ static int __replace_page(struct vm_area_struct *vma, unsigned long addr, addr + PAGE_SIZE); if (new_page) { - err = mem_cgroup_charge(new_page, vma->vm_mm, GFP_KERNEL, - false); + err = mem_cgroup_charge(new_page, vma->vm_mm, GFP_KERNEL); if (err) return err; } diff --git a/mm/filemap.c b/mm/filemap.c index f08b0ca34e31..455990621989 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -845,7 +845,7 @@ static int __add_to_page_cache_locked(struct page *page, page->index = offset; if (!huge) { - error = mem_cgroup_charge(page, current->mm, gfp_mask, false); + error = mem_cgroup_charge(page, current->mm, gfp_mask); if (error) goto error; } diff --git a/mm/huge_memory.c b/mm/huge_memory.c index e9201a88157e..6df182a18d2c 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -593,7 +593,7 @@ static vm_fault_t __do_huge_pmd_anonymous_page(struct vm_fault *vmf, VM_BUG_ON_PAGE(!PageCompound(page), page); - if (mem_cgroup_charge(page, vma->vm_mm, gfp, false)) { + if (mem_cgroup_charge(page, vma->vm_mm, gfp)) { put_page(page); count_vm_event(THP_FAULT_FALLBACK); count_vm_event(THP_FAULT_FALLBACK_CHARGE); diff --git a/mm/khugepaged.c 
b/mm/khugepaged.c index 32c85b81837a..f29038c485e0 100644 --- a/mm/khugepaged.c +++ b/mm/khugepaged.c @@ -1059,7 +1059,7 @@ static void collapse_huge_page(struct mm_struct *mm, goto out_nolock; } - if (unlikely(mem_cgroup_charge(new_page, mm, gfp, false))) { + if (unlikely(mem_cgroup_charge(new_page, mm, gfp))) { result = SCAN_CGROUP_CHARGE_FAIL; goto out_nolock; } @@ -1632,7 +1632,7 @@ static void collapse_file(struct mm_struct *mm, goto out; } - if (unlikely(mem_cgroup_charge(new_page, mm, gfp, false))) { + if (unlikely(mem_cgroup_charge(new_page, mm, gfp))) { result = SCAN_CGROUP_CHARGE_FAIL; goto out; } diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 425a265dc2a0..316a84025090 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -2655,51 +2655,9 @@ static void cancel_charge(struct mem_cgroup *memcg, unsigned int nr_pages) } #endif -static void lock_page_lru(struct page *page, int *isolated) +static void commit_charge(struct page *page, struct mem_cgroup *memcg) { - pg_data_t *pgdat = page_pgdat(page); - - spin_lock_irq(&pgdat->lru_lock); - if (PageLRU(page)) { - struct lruvec *lruvec; - - lruvec = mem_cgroup_page_lruvec(page, pgdat); - ClearPageLRU(page); - del_page_from_lru_list(page, lruvec, page_lru(page)); - *isolated = 1; - } else - *isolated = 0; -} - -static void unlock_page_lru(struct page *page, int isolated) -{ - pg_data_t *pgdat = page_pgdat(page); - - if (isolated) { - struct lruvec *lruvec; - - lruvec = mem_cgroup_page_lruvec(page, pgdat); - VM_BUG_ON_PAGE(PageLRU(page), page); - SetPageLRU(page); - add_page_to_lru_list(page, lruvec, page_lru(page)); - } - spin_unlock_irq(&pgdat->lru_lock); -} - -static void commit_charge(struct page *page, struct mem_cgroup *memcg, - bool lrucare) -{ - int isolated; - VM_BUG_ON_PAGE(page->mem_cgroup, page); - - /* - * In some cases, SwapCache and FUSE(splice_buf->radixtree), the page - * may already be on some other mem_cgroup's LRU. Take care of it. - */ - if (lrucare) - lock_page_lru(page, &isolated); - /* * Nobody should be changing or seriously looking at * page->mem_cgroup at this point: @@ -2715,9 +2673,6 @@ static void commit_charge(struct page *page, struct mem_cgroup *memcg, * have the page locked */ page->mem_cgroup = memcg; - - if (lrucare) - unlock_page_lru(page, isolated); } #ifdef CONFIG_MEMCG_KMEM @@ -6503,22 +6458,18 @@ out: * @page: page to charge * @mm: mm context of the victim * @gfp_mask: reclaim mode - * @lrucare: page might be on the LRU already * * Try to charge @page to the memcg that @mm belongs to, reclaiming * pages according to @gfp_mask if necessary. * * Returns 0 on success. Otherwise, an error code is returned. 
*/ -int mem_cgroup_charge(struct page *page, struct mm_struct *mm, gfp_t gfp_mask, - bool lrucare) +int mem_cgroup_charge(struct page *page, struct mm_struct *mm, gfp_t gfp_mask) { unsigned int nr_pages = hpage_nr_pages(page); struct mem_cgroup *memcg = NULL; int ret = 0; - VM_BUG_ON_PAGE(PageLRU(page) && !lrucare, page); - if (mem_cgroup_disabled()) goto out; @@ -6552,7 +6503,7 @@ int mem_cgroup_charge(struct page *page, struct mm_struct *mm, gfp_t gfp_mask, if (ret) goto out_put; - commit_charge(page, memcg, lrucare); + commit_charge(page, memcg); local_irq_disable(); mem_cgroup_charge_statistics(memcg, page, nr_pages); @@ -6753,7 +6704,7 @@ void mem_cgroup_migrate(struct page *oldpage, struct page *newpage) page_counter_charge(&memcg->memsw, nr_pages); css_get_many(&memcg->css, nr_pages); - commit_charge(newpage, memcg, false); + commit_charge(newpage, memcg); local_irq_save(flags); mem_cgroup_charge_statistics(memcg, newpage, nr_pages); diff --git a/mm/memory.c b/mm/memory.c index 9c886e4207a2..d50d8b498af5 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -2675,7 +2675,7 @@ static vm_fault_t wp_page_copy(struct vm_fault *vmf) } } - if (mem_cgroup_charge(new_page, mm, GFP_KERNEL, false)) + if (mem_cgroup_charge(new_page, mm, GFP_KERNEL)) goto oom_free_new; cgroup_throttle_swaprate(new_page, GFP_KERNEL); @@ -3134,7 +3134,7 @@ vm_fault_t do_swap_page(struct vm_fault *vmf) /* Tell memcg to use swap ownership records */ SetPageSwapCache(page); err = mem_cgroup_charge(page, vma->vm_mm, - GFP_KERNEL, false); + GFP_KERNEL); ClearPageSwapCache(page); if (err) goto out_page; @@ -3358,7 +3358,7 @@ static vm_fault_t do_anonymous_page(struct vm_fault *vmf) if (!page) goto oom; - if (mem_cgroup_charge(page, vma->vm_mm, GFP_KERNEL, false)) + if (mem_cgroup_charge(page, vma->vm_mm, GFP_KERNEL)) goto oom_free_page; cgroup_throttle_swaprate(page, GFP_KERNEL); @@ -3854,7 +3854,7 @@ static vm_fault_t do_cow_fault(struct vm_fault *vmf) if (!vmf->cow_page) return VM_FAULT_OOM; - if (mem_cgroup_charge(vmf->cow_page, vma->vm_mm, GFP_KERNEL, false)) { + if (mem_cgroup_charge(vmf->cow_page, vma->vm_mm, GFP_KERNEL)) { put_page(vmf->cow_page); return VM_FAULT_OOM; } diff --git a/mm/migrate.c b/mm/migrate.c index 44cee40221ec..7bfd0962149e 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -2786,7 +2786,7 @@ static void migrate_vma_insert_page(struct migrate_vma *migrate, if (unlikely(anon_vma_prepare(vma))) goto abort; - if (mem_cgroup_charge(page, vma->vm_mm, GFP_KERNEL, false)) + if (mem_cgroup_charge(page, vma->vm_mm, GFP_KERNEL)) goto abort; /* diff --git a/mm/shmem.c b/mm/shmem.c index b79116185046..e83de27ce8f4 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -624,7 +624,7 @@ static int shmem_add_to_page_cache(struct page *page, page->index = index; if (!PageSwapCache(page)) { - error = mem_cgroup_charge(page, charge_mm, gfp, false); + error = mem_cgroup_charge(page, charge_mm, gfp); if (error) { if (PageTransHuge(page)) { count_vm_event(THP_FILE_FALLBACK); diff --git a/mm/swap_state.c b/mm/swap_state.c index f841257a3014..ab0462819a5b 100644 --- a/mm/swap_state.c +++ b/mm/swap_state.c @@ -435,7 +435,7 @@ struct page *__read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask, goto fail_unlock; } - if (mem_cgroup_charge(page, NULL, gfp_mask, false)) { + if (mem_cgroup_charge(page, NULL, gfp_mask)) { delete_from_swap_cache(page); goto fail_unlock; } diff --git a/mm/userfaultfd.c b/mm/userfaultfd.c index 2745489415cc..7f5194046b01 100644 --- a/mm/userfaultfd.c +++ b/mm/userfaultfd.c @@ -96,7 +96,7 @@ static int 
mcopy_atomic_pte(struct mm_struct *dst_mm, __SetPageUptodate(page); ret = -ENOMEM; - if (mem_cgroup_charge(page, dst_mm, GFP_KERNEL, false)) + if (mem_cgroup_charge(page, dst_mm, GFP_KERNEL)) goto out_release; _dst_pte = pte_mkdirty(mk_pte(page, dst_vma->vm_page_prot)); -- cgit v1.2.3 From 497a6c1b09902b22ceccc0f25ba4dd623e1ddb7d Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Wed, 3 Jun 2020 16:02:34 -0700 Subject: mm: keep separate anon and file statistics on page reclaim activity Having statistics on pages scanned and pages reclaimed for both anon and file pages makes it easier to evaluate changes to LRU balancing. While at it, clean up the stat-keeping mess for isolation, putback, reclaim stats etc. a bit: first the physical LRU operation (isolation and putback), followed by vmstats, reclaim_stats, and then vm events. Signed-off-by: Johannes Weiner Signed-off-by: Andrew Morton Cc: Joonsoo Kim Cc: Michal Hocko Cc: Minchan Kim Cc: Rik van Riel Link: http://lkml.kernel.org/r/20200520232525.798933-3-hannes@cmpxchg.org Signed-off-by: Linus Torvalds --- include/linux/vm_event_item.h | 4 ++++ mm/vmscan.c | 17 +++++++++-------- mm/vmstat.c | 4 ++++ 3 files changed, 17 insertions(+), 8 deletions(-) (limited to 'include/linux') diff --git a/include/linux/vm_event_item.h b/include/linux/vm_event_item.h index ffef0f279747..24fc7c3ae7d6 100644 --- a/include/linux/vm_event_item.h +++ b/include/linux/vm_event_item.h @@ -35,6 +35,10 @@ enum vm_event_item { PGPGIN, PGPGOUT, PSWPIN, PSWPOUT, PGSCAN_KSWAPD, PGSCAN_DIRECT, PGSCAN_DIRECT_THROTTLE, + PGSCAN_ANON, + PGSCAN_FILE, + PGSTEAL_ANON, + PGSTEAL_FILE, #ifdef CONFIG_NUMA PGSCAN_ZONE_RECLAIM_FAILED, #endif diff --git a/mm/vmscan.c b/mm/vmscan.c index 779edd05b75e..1fa2417f7c2a 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -1913,7 +1913,7 @@ shrink_inactive_list(unsigned long nr_to_scan, struct lruvec *lruvec, unsigned int nr_reclaimed = 0; unsigned long nr_taken; struct reclaim_stat stat; - int file = is_file_lru(lru); + bool file = is_file_lru(lru); enum vm_event_item item; struct pglist_data *pgdat = lruvec_pgdat(lruvec); struct zone_reclaim_stat *reclaim_stat = &lruvec->reclaim_stat; @@ -1941,11 +1941,12 @@ shrink_inactive_list(unsigned long nr_to_scan, struct lruvec *lruvec, __mod_node_page_state(pgdat, NR_ISOLATED_ANON + file, nr_taken); reclaim_stat->recent_scanned[file] += nr_taken; - item = current_is_kswapd() ? PGSCAN_KSWAPD : PGSCAN_DIRECT; if (!cgroup_reclaim(sc)) __count_vm_events(item, nr_scanned); __count_memcg_events(lruvec_memcg(lruvec), item, nr_scanned); + __count_vm_events(PGSCAN_ANON + file, nr_scanned); + spin_unlock_irq(&pgdat->lru_lock); if (nr_taken == 0) @@ -1956,16 +1957,16 @@ shrink_inactive_list(unsigned long nr_to_scan, struct lruvec *lruvec, spin_lock_irq(&pgdat->lru_lock); + move_pages_to_lru(lruvec, &page_list); + + __mod_node_page_state(pgdat, NR_ISOLATED_ANON + file, -nr_taken); + reclaim_stat->recent_rotated[0] += stat.nr_activate[0]; + reclaim_stat->recent_rotated[1] += stat.nr_activate[1]; item = current_is_kswapd() ? 
PGSTEAL_KSWAPD : PGSTEAL_DIRECT; if (!cgroup_reclaim(sc)) __count_vm_events(item, nr_reclaimed); __count_memcg_events(lruvec_memcg(lruvec), item, nr_reclaimed); - reclaim_stat->recent_rotated[0] += stat.nr_activate[0]; - reclaim_stat->recent_rotated[1] += stat.nr_activate[1]; - - move_pages_to_lru(lruvec, &page_list); - - __mod_node_page_state(pgdat, NR_ISOLATED_ANON + file, -nr_taken); + __count_vm_events(PGSTEAL_ANON + file, nr_reclaimed); spin_unlock_irq(&pgdat->lru_lock); diff --git a/mm/vmstat.c b/mm/vmstat.c index 068706a0a1a7..e55eda3b1a71 100644 --- a/mm/vmstat.c +++ b/mm/vmstat.c @@ -1203,6 +1203,10 @@ const char * const vmstat_text[] = { "pgscan_kswapd", "pgscan_direct", "pgscan_direct_throttle", + "pgscan_anon", + "pgscan_file", + "pgsteal_anon", + "pgsteal_file", #ifdef CONFIG_NUMA "zone_reclaim_failed", -- cgit v1.2.3 From 6058eaec816f29fbe33c9d35694614c9a4ed75ba Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Wed, 3 Jun 2020 16:02:40 -0700 Subject: mm: fold and remove lru_cache_add_anon() and lru_cache_add_file() They're the same function, and for the purpose of all callers they are equivalent to lru_cache_add(). [akpm@linux-foundation.org: fix it for local_lock changes] Signed-off-by: Johannes Weiner Signed-off-by: Andrew Morton Reviewed-by: Rik van Riel Acked-by: Michal Hocko Acked-by: Minchan Kim Cc: Joonsoo Kim Link: http://lkml.kernel.org/r/20200520232525.798933-5-hannes@cmpxchg.org Signed-off-by: Linus Torvalds --- fs/cifs/file.c | 10 +++++----- fs/fuse/dev.c | 2 +- include/linux/swap.h | 2 -- mm/khugepaged.c | 8 ++------ mm/memory.c | 2 +- mm/shmem.c | 6 +++--- mm/swap.c | 42 ++++++++++-------------------------------- mm/swap_state.c | 2 +- 8 files changed, 23 insertions(+), 51 deletions(-) (limited to 'include/linux') diff --git a/fs/cifs/file.c b/fs/cifs/file.c index 75ddce8ef456..17a4f49c34f5 100644 --- a/fs/cifs/file.c +++ b/fs/cifs/file.c @@ -4162,7 +4162,7 @@ cifs_readv_complete(struct work_struct *work) for (i = 0; i < rdata->nr_pages; i++) { struct page *page = rdata->pages[i]; - lru_cache_add_file(page); + lru_cache_add(page); if (rdata->result == 0 || (rdata->result == -EAGAIN && got_bytes)) { @@ -4232,7 +4232,7 @@ readpages_fill_pages(struct TCP_Server_Info *server, * fill them until the writes are flushed. 
*/ zero_user(page, 0, PAGE_SIZE); - lru_cache_add_file(page); + lru_cache_add(page); flush_dcache_page(page); SetPageUptodate(page); unlock_page(page); @@ -4242,7 +4242,7 @@ readpages_fill_pages(struct TCP_Server_Info *server, continue; } else { /* no need to hold page hostage */ - lru_cache_add_file(page); + lru_cache_add(page); unlock_page(page); put_page(page); rdata->pages[i] = NULL; @@ -4437,7 +4437,7 @@ static int cifs_readpages(struct file *file, struct address_space *mapping, /* best to give up if we're out of mem */ list_for_each_entry_safe(page, tpage, &tmplist, lru) { list_del(&page->lru); - lru_cache_add_file(page); + lru_cache_add(page); unlock_page(page); put_page(page); } @@ -4475,7 +4475,7 @@ static int cifs_readpages(struct file *file, struct address_space *mapping, add_credits_and_wake_if(server, &rdata->credits, 0); for (i = 0; i < rdata->nr_pages; i++) { page = rdata->pages[i]; - lru_cache_add_file(page); + lru_cache_add(page); unlock_page(page); put_page(page); } diff --git a/fs/fuse/dev.c b/fs/fuse/dev.c index c7a65cf2bcca..a01540b22122 100644 --- a/fs/fuse/dev.c +++ b/fs/fuse/dev.c @@ -840,7 +840,7 @@ static int fuse_try_move_page(struct fuse_copy_state *cs, struct page **pagep) get_page(newpage); if (!(buf->flags & PIPE_BUF_FLAG_LRU)) - lru_cache_add_file(newpage); + lru_cache_add(newpage); err = 0; spin_lock(&cs->req->waitq.lock); diff --git a/include/linux/swap.h b/include/linux/swap.h index 6cea1eb97d45..217bc8e13feb 100644 --- a/include/linux/swap.h +++ b/include/linux/swap.h @@ -335,8 +335,6 @@ extern unsigned long nr_free_pagecache_pages(void); /* linux/mm/swap.c */ extern void lru_cache_add(struct page *); -extern void lru_cache_add_anon(struct page *page); -extern void lru_cache_add_file(struct page *page); extern void lru_add_page_tail(struct page *page, struct page *page_tail, struct lruvec *lruvec, struct list_head *head); extern void activate_page(struct page *); diff --git a/mm/khugepaged.c b/mm/khugepaged.c index f29038c485e0..3f032487825b 100644 --- a/mm/khugepaged.c +++ b/mm/khugepaged.c @@ -1879,13 +1879,9 @@ xa_unlocked: SetPageUptodate(new_page); page_ref_add(new_page, HPAGE_PMD_NR - 1); - - if (is_shmem) { + if (is_shmem) set_page_dirty(new_page); - lru_cache_add_anon(new_page); - } else { - lru_cache_add_file(new_page); - } + lru_cache_add(new_page); /* * Remove pte page tables, so we can re-fault the page as huge. 
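[Editor's note: a minimal sketch of what the surviving entry point does, condensed from the mm/swap.c hunk further below; the pagevec lookup and the local_lock plumbing are simplified away here:

	void lru_cache_add(struct page *page)
	{
		/* pin the page while it sits in the per-CPU batch */
		get_page(page);
		/*
		 * Park it in this CPU's pending pagevec; the batch is
		 * drained onto the real LRU lists once it fills up, or
		 * immediately for compound pages.
		 */
		if (!pagevec_add(pvec, page) || PageCompound(page))
			__pagevec_lru_add(pvec);
	}

Since lru_cache_add_anon() and lru_cache_add_file() had identical bodies, every caller can switch to this single function without a behavior change.]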
diff --git a/mm/memory.c b/mm/memory.c index d50d8b498af5..3431e76d0e75 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -3139,7 +3139,7 @@ vm_fault_t do_swap_page(struct vm_fault *vmf) if (err) goto out_page; - lru_cache_add_anon(page); + lru_cache_add(page); swap_readpage(page, true); } } else { diff --git a/mm/shmem.c b/mm/shmem.c index e83de27ce8f4..ea95a3e46fbb 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -1609,7 +1609,7 @@ static int shmem_replace_page(struct page **pagep, gfp_t gfp, */ oldpage = newpage; } else { - lru_cache_add_anon(newpage); + lru_cache_add(newpage); *pagep = newpage; } @@ -1860,7 +1860,7 @@ alloc_nohuge: charge_mm); if (error) goto unacct; - lru_cache_add_anon(page); + lru_cache_add(page); spin_lock_irq(&info->lock); info->alloced += compound_nr(page); @@ -2376,7 +2376,7 @@ static int shmem_mfill_atomic_pte(struct mm_struct *dst_mm, if (!pte_none(*dst_pte)) goto out_release_unlock; - lru_cache_add_anon(page); + lru_cache_add(page); spin_lock_irq(&info->lock); info->alloced++; diff --git a/mm/swap.c b/mm/swap.c index f7026f72aca9..6196d792c952 100644 --- a/mm/swap.c +++ b/mm/swap.c @@ -424,37 +424,6 @@ void mark_page_accessed(struct page *page) } EXPORT_SYMBOL(mark_page_accessed); -static void __lru_cache_add(struct page *page) -{ - struct pagevec *pvec; - - local_lock(&lru_pvecs.lock); - pvec = this_cpu_ptr(&lru_pvecs.lru_add); - get_page(page); - if (!pagevec_add(pvec, page) || PageCompound(page)) - __pagevec_lru_add(pvec); - local_unlock(&lru_pvecs.lock); -} - -/** - * lru_cache_add_anon - add a page to the page lists - * @page: the page to add - */ -void lru_cache_add_anon(struct page *page) -{ - if (PageActive(page)) - ClearPageActive(page); - __lru_cache_add(page); -} - -void lru_cache_add_file(struct page *page) -{ - if (PageActive(page)) - ClearPageActive(page); - __lru_cache_add(page); -} -EXPORT_SYMBOL(lru_cache_add_file); - /** * lru_cache_add - add a page to a page list * @page: the page to be added to the LRU. @@ -466,10 +435,19 @@ EXPORT_SYMBOL(lru_cache_add_file); */ void lru_cache_add(struct page *page) { + struct pagevec *pvec; + VM_BUG_ON_PAGE(PageActive(page) && PageUnevictable(page), page); VM_BUG_ON_PAGE(PageLRU(page), page); - __lru_cache_add(page); + + get_page(page); + local_lock(&lru_pvecs.lock); + pvec = this_cpu_ptr(&lru_pvecs.lru_add); + if (!pagevec_add(pvec, page) || PageCompound(page)) + __pagevec_lru_add(pvec); + local_unlock(&lru_pvecs.lock); } +EXPORT_SYMBOL(lru_cache_add); /** * lru_cache_add_active_or_unevictable diff --git a/mm/swap_state.c b/mm/swap_state.c index ab0462819a5b..fa089002125f 100644 --- a/mm/swap_state.c +++ b/mm/swap_state.c @@ -442,7 +442,7 @@ struct page *__read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask, /* Caller will initiate read into locked page */ SetPageWorkingset(page); - lru_cache_add_anon(page); + lru_cache_add(page); *new_page_allocated = true; return page; -- cgit v1.2.3 From 1431d4d11abb265e79cd44bed2f5ea93f1bcc57b Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Wed, 3 Jun 2020 16:02:53 -0700 Subject: mm: base LRU balancing on an explicit cost model Currently, scan pressure between the anon and file LRU lists is balanced based on a mixture of reclaim efficiency and a somewhat vague notion of "value" of having certain pages in memory over others. 
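[Editor's note: the concrete balancing arithmetic this change converges on, condensed from the get_scan_count() hunk below, with illustrative numbers only. Scan pressure on each list becomes

	ap = anon_prio * (totalcost + 1) / (lruvec->anon_cost + 1);
	fp = file_prio * (totalcost + 1) / (lruvec->file_cost + 1);

With the default vm.swappiness of 60 (anon_prio = 60, file_prio = 140), an observed anon_cost of 300 and file_cost of 100 give totalcost = 400, so ap = 60 * 401 / 301 = 79 and fp = 140 * 401 / 101 = 555. Only 79 / (79 + 555), about 12%, of the scan pressure then lands on the anon LRU - the list that has recently been the more expensive one to reclaim.]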
That concept of value is problematic, because it has caused us to count any event that remotely makes one LRU list more or less preferable for reclaim, even when these events are not directly comparable and impose very different costs on the system. One example is referenced file pages that we still deactivate and referenced anonymous pages that we actually rotate back to the head of the list. There is also conceptual overlap with the LRU algorithm itself. By rotating recently used pages instead of reclaiming them, the algorithm already biases the applied scan pressure based on page value. Thus, when rebalancing scan pressure due to rotations, we should think of reclaim cost, and leave assessing the page value to the LRU algorithm. Lastly, considering both value-increasing as well as value-decreasing events can sometimes cause the same type of event to be counted twice, i.e. how rotating a page increases the LRU value, while reclaiming it successfully decreases the value. In itself this will balance out fine, but it quietly skews the impact of events that are only recorded once. The abstract metric of "value", the murky relationship with the LRU algorithm, and accounting both negative and positive events make the current pressure balancing model hard to reason about and modify. This patch switches to a balancing model of accounting the concrete, actually observed cost of reclaiming one LRU over another. For now, that cost includes pages that are scanned but rotated back to the list head. Subsequent patches will add consideration for IO caused by refaulting of recently evicted pages. Replace struct zone_reclaim_stat with two cost counters in the lruvec, and make everything that affects cost go through a new lru_note_cost() function. Signed-off-by: Johannes Weiner Signed-off-by: Andrew Morton Acked-by: Michal Hocko Cc: Joonsoo Kim Cc: Minchan Kim Cc: Rik van Riel Link: http://lkml.kernel.org/r/20200520232525.798933-9-hannes@cmpxchg.org Signed-off-by: Linus Torvalds --- include/linux/mmzone.h | 21 +++++++-------------- include/linux/swap.h | 2 ++ mm/memcontrol.c | 18 ++++++------------ mm/swap.c | 19 ++++++++----------- mm/vmscan.c | 44 ++++++++++++++++++++++---------------------- 5 files changed, 45 insertions(+), 59 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index 2f79ff4477ba..e57248ccb63d 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -242,19 +242,6 @@ static inline bool is_active_lru(enum lru_list lru) return (lru == LRU_ACTIVE_ANON || lru == LRU_ACTIVE_FILE); } -struct zone_reclaim_stat { - /* - * The pageout code in vmscan.c keeps track of how many of the - * mem/swap backed and file backed pages are referenced. - * The higher the rotated/scanned ratio, the more valuable - * that cache is. - * - * The anon LRU stats live in [0], file LRU stats in [1] - */ - unsigned long recent_rotated[2]; - unsigned long recent_scanned[2]; -}; - enum lruvec_flags { LRUVEC_CONGESTED, /* lruvec has many dirty pages * backed by a congested BDI @@ -263,7 +250,13 @@ enum lruvec_flags { struct lruvec { struct list_head lists[NR_LRU_LISTS]; - struct zone_reclaim_stat reclaim_stat; + /* + * These track the cost of reclaiming one LRU - file or anon - + * over the other. As the observed cost of reclaiming one LRU + * increases, the reclaim scan balance tips toward the other. 
+ */ + unsigned long anon_cost; + unsigned long file_cost; /* Evictions & activations on the inactive file list */ atomic_long_t inactive_age; /* Refaults at the time of last reclaim cycle */ diff --git a/include/linux/swap.h b/include/linux/swap.h index 217bc8e13feb..ce3c55747006 100644 --- a/include/linux/swap.h +++ b/include/linux/swap.h @@ -334,6 +334,8 @@ extern unsigned long nr_free_pagecache_pages(void); /* linux/mm/swap.c */ +extern void lru_note_cost(struct lruvec *lruvec, bool file, + unsigned int nr_pages); extern void lru_cache_add(struct page *); extern void lru_add_page_tail(struct page *page, struct page *page_tail, struct lruvec *lruvec, struct list_head *head); diff --git a/mm/memcontrol.c b/mm/memcontrol.c index ee6fe8bb0bff..5381afb23d58 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -3853,23 +3853,17 @@ static int memcg_stat_show(struct seq_file *m, void *v) { pg_data_t *pgdat; struct mem_cgroup_per_node *mz; - struct zone_reclaim_stat *rstat; - unsigned long recent_rotated[2] = {0, 0}; - unsigned long recent_scanned[2] = {0, 0}; + unsigned long anon_cost = 0; + unsigned long file_cost = 0; for_each_online_pgdat(pgdat) { mz = mem_cgroup_nodeinfo(memcg, pgdat->node_id); - rstat = &mz->lruvec.reclaim_stat; - recent_rotated[0] += rstat->recent_rotated[0]; - recent_rotated[1] += rstat->recent_rotated[1]; - recent_scanned[0] += rstat->recent_scanned[0]; - recent_scanned[1] += rstat->recent_scanned[1]; + anon_cost += mz->lruvec.anon_cost; + file_cost += mz->lruvec.file_cost; } - seq_printf(m, "recent_rotated_anon %lu\n", recent_rotated[0]); - seq_printf(m, "recent_rotated_file %lu\n", recent_rotated[1]); - seq_printf(m, "recent_scanned_anon %lu\n", recent_scanned[0]); - seq_printf(m, "recent_scanned_file %lu\n", recent_scanned[1]); + seq_printf(m, "anon_cost %lu\n", anon_cost); + seq_printf(m, "file_cost %lu\n", file_cost); } #endif diff --git a/mm/swap.c b/mm/swap.c index 116b609c25c1..fedeb925dbfe 100644 --- a/mm/swap.c +++ b/mm/swap.c @@ -278,15 +278,12 @@ void rotate_reclaimable_page(struct page *page) } } -static void update_page_reclaim_stat(struct lruvec *lruvec, - int file, int rotated, - unsigned int nr_pages) +void lru_note_cost(struct lruvec *lruvec, bool file, unsigned int nr_pages) { - struct zone_reclaim_stat *reclaim_stat = &lruvec->reclaim_stat; - - reclaim_stat->recent_scanned[file] += nr_pages; - if (rotated) - reclaim_stat->recent_rotated[file] += nr_pages; + if (file) + lruvec->file_cost += nr_pages; + else + lruvec->anon_cost += nr_pages; } static void __activate_page(struct page *page, struct lruvec *lruvec, @@ -541,7 +538,7 @@ static void lru_deactivate_file_fn(struct page *page, struct lruvec *lruvec, if (active) __count_vm_event(PGDEACTIVATE); - update_page_reclaim_stat(lruvec, file, 0, hpage_nr_pages(page)); + lru_note_cost(lruvec, !file, hpage_nr_pages(page)); } static void lru_deactivate_fn(struct page *page, struct lruvec *lruvec, @@ -557,7 +554,7 @@ static void lru_deactivate_fn(struct page *page, struct lruvec *lruvec, add_page_to_lru_list(page, lruvec, lru); __count_vm_events(PGDEACTIVATE, hpage_nr_pages(page)); - update_page_reclaim_stat(lruvec, file, 0, hpage_nr_pages(page)); + lru_note_cost(lruvec, !file, hpage_nr_pages(page)); } } @@ -582,7 +579,7 @@ static void lru_lazyfree_fn(struct page *page, struct lruvec *lruvec, __count_vm_events(PGLAZYFREE, hpage_nr_pages(page)); count_memcg_page_event(page, PGLAZYFREE); - update_page_reclaim_stat(lruvec, 1, 0, hpage_nr_pages(page)); + lru_note_cost(lruvec, 0, hpage_nr_pages(page)); } } diff 
--git a/mm/vmscan.c b/mm/vmscan.c index a5a7a8d0764c..c5b2a68f4ef6 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -1916,7 +1916,6 @@ shrink_inactive_list(unsigned long nr_to_scan, struct lruvec *lruvec, bool file = is_file_lru(lru); enum vm_event_item item; struct pglist_data *pgdat = lruvec_pgdat(lruvec); - struct zone_reclaim_stat *reclaim_stat = &lruvec->reclaim_stat; bool stalled = false; while (unlikely(too_many_isolated(pgdat, file, sc))) { @@ -1940,7 +1939,6 @@ shrink_inactive_list(unsigned long nr_to_scan, struct lruvec *lruvec, &nr_scanned, sc, lru); __mod_node_page_state(pgdat, NR_ISOLATED_ANON + file, nr_taken); - reclaim_stat->recent_scanned[file] += nr_taken; item = current_is_kswapd() ? PGSCAN_KSWAPD : PGSCAN_DIRECT; if (!cgroup_reclaim(sc)) __count_vm_events(item, nr_scanned); @@ -1960,8 +1958,12 @@ shrink_inactive_list(unsigned long nr_to_scan, struct lruvec *lruvec, move_pages_to_lru(lruvec, &page_list); __mod_node_page_state(pgdat, NR_ISOLATED_ANON + file, -nr_taken); - reclaim_stat->recent_rotated[0] += stat.nr_activate[0]; - reclaim_stat->recent_rotated[1] += stat.nr_activate[1]; + /* + * Rotating pages costs CPU without actually + * progressing toward the reclaim goal. + */ + lru_note_cost(lruvec, 0, stat.nr_activate[0]); + lru_note_cost(lruvec, 1, stat.nr_activate[1]); item = current_is_kswapd() ? PGSTEAL_KSWAPD : PGSTEAL_DIRECT; if (!cgroup_reclaim(sc)) __count_vm_events(item, nr_reclaimed); @@ -2013,7 +2015,6 @@ static void shrink_active_list(unsigned long nr_to_scan, LIST_HEAD(l_active); LIST_HEAD(l_inactive); struct page *page; - struct zone_reclaim_stat *reclaim_stat = &lruvec->reclaim_stat; unsigned nr_deactivate, nr_activate; unsigned nr_rotated = 0; int file = is_file_lru(lru); @@ -2027,7 +2028,6 @@ static void shrink_active_list(unsigned long nr_to_scan, &nr_scanned, sc, lru); __mod_node_page_state(pgdat, NR_ISOLATED_ANON + file, nr_taken); - reclaim_stat->recent_scanned[file] += nr_taken; __count_vm_events(PGREFILL, nr_scanned); __count_memcg_events(lruvec_memcg(lruvec), PGREFILL, nr_scanned); @@ -2085,7 +2085,7 @@ static void shrink_active_list(unsigned long nr_to_scan, * helps balance scan pressure between file and anonymous pages in * get_scan_count. 
*/ - reclaim_stat->recent_rotated[file] += nr_rotated; + lru_note_cost(lruvec, file, nr_rotated); nr_activate = move_pages_to_lru(lruvec, &l_active); nr_deactivate = move_pages_to_lru(lruvec, &l_inactive); @@ -2242,13 +2242,13 @@ static void get_scan_count(struct lruvec *lruvec, struct scan_control *sc, { struct mem_cgroup *memcg = lruvec_memcg(lruvec); int swappiness = mem_cgroup_swappiness(memcg); - struct zone_reclaim_stat *reclaim_stat = &lruvec->reclaim_stat; u64 fraction[2]; u64 denominator = 0; /* gcc */ struct pglist_data *pgdat = lruvec_pgdat(lruvec); unsigned long anon_prio, file_prio; enum scan_balance scan_balance; unsigned long anon, file; + unsigned long totalcost; unsigned long ap, fp; enum lru_list lru; @@ -2324,26 +2324,26 @@ static void get_scan_count(struct lruvec *lruvec, struct scan_control *sc, lruvec_lru_size(lruvec, LRU_INACTIVE_FILE, MAX_NR_ZONES); spin_lock_irq(&pgdat->lru_lock); - if (unlikely(reclaim_stat->recent_scanned[0] > anon / 4)) { - reclaim_stat->recent_scanned[0] /= 2; - reclaim_stat->recent_rotated[0] /= 2; - } - - if (unlikely(reclaim_stat->recent_scanned[1] > file / 4)) { - reclaim_stat->recent_scanned[1] /= 2; - reclaim_stat->recent_rotated[1] /= 2; + totalcost = lruvec->anon_cost + lruvec->file_cost; + if (unlikely(totalcost > (anon + file) / 4)) { + lruvec->anon_cost /= 2; + lruvec->file_cost /= 2; + totalcost /= 2; } /* * The amount of pressure on anon vs file pages is inversely - * proportional to the fraction of recently scanned pages on - * each list that were recently referenced and in active use. + * proportional to the assumed cost of reclaiming each list, + * as determined by the share of pages that are likely going + * to refault or rotate on each list (recently referenced), + * times the relative IO cost of bringing back a swapped out + * anonymous page vs reloading a filesystem page (swappiness). */ - ap = anon_prio * (reclaim_stat->recent_scanned[0] + 1); - ap /= reclaim_stat->recent_rotated[0] + 1; + ap = anon_prio * (totalcost + 1); + ap /= lruvec->anon_cost + 1; - fp = file_prio * (reclaim_stat->recent_scanned[1] + 1); - fp /= reclaim_stat->recent_rotated[1] + 1; + fp = file_prio * (totalcost + 1); + fp /= lruvec->file_cost + 1; spin_unlock_irq(&pgdat->lru_lock); fraction[0] = ap; -- cgit v1.2.3 From 314b57fb0460001a090b35ff8be987f2c868ad3c Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Wed, 3 Jun 2020 16:03:03 -0700 Subject: mm: balance LRU lists based on relative thrashing Since the LRUs were split into anon and file lists, the VM has been balancing between page cache and anonymous pages based on per-list ratios of scanned vs. rotated pages. In most cases that tips page reclaim towards the list that is easier to reclaim and has the fewest actively used pages, but there are a few problems with it: 1. Refaults and LRU rotations are weighted the same way, even though one costs IO and the other costs a bit of CPU. 2. The less we scan an LRU list based on already observed rotations, the more we increase the sampling interval for new references, and rotations become even more likely on that list. This can enter a death spiral in which we stop looking at one list completely until the other one is all but annihilated by page reclaim. Since commit a528910e12ec ("mm: thrash detection-based file cache sizing") we have refault detection for the page cache. Along with swapin events, they are good indicators of when the file or anon list, respectively, is too small for its workingset and needs to grow. 
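[Editor's note: the mapping of events to cost buckets changes as follows. Both new call sites in the patch below invoke lru_note_cost(page) under the node's lru_lock, and the charged bucket is picked by page_is_file_lru(page):

	old model (CPU cost)                  new model (IO cost)
	rotation on anon LRU -> anon_cost     swapin             -> anon_cost
	rotation on file LRU -> file_cost     workingset refault -> file_cost

A rising file_cost shrinks fp and shifts scan pressure toward the anon list, and vice versa.]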
For example, if the page cache is thrashing, the cache pages need more time in memory, while there may be colder pages on the anonymous list. Likewise, if swapped pages are faulting back in, it indicates that we reclaim anonymous pages too aggressively and should back off. Replace LRU rotations with refaults and swapins as the basis for relative reclaim cost of the two LRUs. This will have the VM target list balances that incur the least amount of IO on aggregate. Signed-off-by: Johannes Weiner Signed-off-by: Andrew Morton Cc: Joonsoo Kim Cc: Michal Hocko Cc: Minchan Kim Cc: Rik van Riel Link: http://lkml.kernel.org/r/20200520232525.798933-12-hannes@cmpxchg.org Signed-off-by: Linus Torvalds --- include/linux/swap.h | 3 +-- mm/swap.c | 11 +++++++---- mm/swap_state.c | 5 +++++ mm/vmscan.c | 39 ++++++++++----------------------------- mm/workingset.c | 4 ++++ 5 files changed, 27 insertions(+), 35 deletions(-) (limited to 'include/linux') diff --git a/include/linux/swap.h b/include/linux/swap.h index ce3c55747006..0b71bf75fb67 100644 --- a/include/linux/swap.h +++ b/include/linux/swap.h @@ -334,8 +334,7 @@ extern unsigned long nr_free_pagecache_pages(void); /* linux/mm/swap.c */ -extern void lru_note_cost(struct lruvec *lruvec, bool file, - unsigned int nr_pages); +extern void lru_note_cost(struct page *); extern void lru_cache_add(struct page *); extern void lru_add_page_tail(struct page *page, struct page *page_tail, struct lruvec *lruvec, struct list_head *head); diff --git a/mm/swap.c b/mm/swap.c index 7d552af25797..2dc7d392642f 100644 --- a/mm/swap.c +++ b/mm/swap.c @@ -278,12 +278,15 @@ void rotate_reclaimable_page(struct page *page) } } -void lru_note_cost(struct lruvec *lruvec, bool file, unsigned int nr_pages) +void lru_note_cost(struct page *page) { - if (file) - lruvec->file_cost += nr_pages; + struct lruvec *lruvec = mem_cgroup_page_lruvec(page, page_pgdat(page)); + + /* Record new data point */ + if (page_is_file_lru(page)) + lruvec->file_cost++; else - lruvec->anon_cost += nr_pages; + lruvec->anon_cost++; } static void __activate_page(struct page *page, struct lruvec *lruvec, diff --git a/mm/swap_state.c b/mm/swap_state.c index fa089002125f..1cd0b345ff7e 100644 --- a/mm/swap_state.c +++ b/mm/swap_state.c @@ -440,6 +440,11 @@ struct page *__read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask, goto fail_unlock; } + /* XXX: Move to lru_cache_add() when it supports new vs putback */ + spin_lock_irq(&page_pgdat(page)->lru_lock); + lru_note_cost(page); + spin_unlock_irq(&page_pgdat(page)->lru_lock); + /* Caller will initiate read into locked page */ SetPageWorkingset(page); lru_cache_add(page); diff --git a/mm/vmscan.c b/mm/vmscan.c index 3c89eac629f3..76e823db21a7 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -1958,12 +1958,6 @@ shrink_inactive_list(unsigned long nr_to_scan, struct lruvec *lruvec, move_pages_to_lru(lruvec, &page_list); __mod_node_page_state(pgdat, NR_ISOLATED_ANON + file, -nr_taken); - /* - * Rotating pages costs CPU without actually - * progressing toward the reclaim goal. - */ - lru_note_cost(lruvec, 0, stat.nr_activate[0]); - lru_note_cost(lruvec, 1, stat.nr_activate[1]); item = current_is_kswapd() ? PGSTEAL_KSWAPD : PGSTEAL_DIRECT; if (!cgroup_reclaim(sc)) __count_vm_events(item, nr_reclaimed); @@ -2079,11 +2073,6 @@ static void shrink_active_list(unsigned long nr_to_scan, * Move pages back to the lru list. */ spin_lock_irq(&pgdat->lru_lock); - /* - * Rotating pages costs CPU without actually - * progressing toward the reclaim goal. 
- */ - lru_note_cost(lruvec, file, nr_rotated); nr_activate = move_pages_to_lru(lruvec, &l_active); nr_deactivate = move_pages_to_lru(lruvec, &l_inactive); @@ -2298,22 +2287,23 @@ static void get_scan_count(struct lruvec *lruvec, struct scan_control *sc, scan_balance = SCAN_FRACT; /* - * With swappiness at 100, anonymous and file have the same priority. - * This scanning priority is essentially the inverse of IO cost. + * Calculate the pressure balance between anon and file pages. + * + * The amount of pressure we put on each LRU is inversely + * proportional to the cost of reclaiming each list, as + * determined by the share of pages that are refaulting, times + * the relative IO cost of bringing back a swapped out + * anonymous page vs reloading a filesystem page (swappiness). + * + * With swappiness at 100, anon and file have equal IO cost. */ anon_prio = swappiness; file_prio = 200 - anon_prio; /* - * OK, so we have swap space and a fair amount of page cache - * pages. We use the recently rotated / recently scanned - * ratios to determine how valuable each cache is. - * * Because workloads change over time (and to avoid overflow) * we keep these statistics as a floating average, which ends - * up weighing recent references more than old ones. - * - * anon in [0], file in [1] + * up weighing recent refaults more than old ones. */ anon = lruvec_lru_size(lruvec, LRU_ACTIVE_ANON, MAX_NR_ZONES) + @@ -2328,15 +2318,6 @@ static void get_scan_count(struct lruvec *lruvec, struct scan_control *sc, lruvec->file_cost /= 2; totalcost /= 2; } - - /* - * The amount of pressure on anon vs file pages is inversely - * proportional to the assumed cost of reclaiming each list, - * as determined by the share of pages that are likely going - * to refault or rotate on each list (recently referenced), - * times the relative IO cost of bringing back a swapped out - * anonymous page vs reloading a filesystem page (swappiness). - */ ap = anon_prio * (totalcost + 1); ap /= lruvec->anon_cost + 1; diff --git a/mm/workingset.c b/mm/workingset.c index e69865739539..a6a2a740ed0b 100644 --- a/mm/workingset.c +++ b/mm/workingset.c @@ -365,6 +365,10 @@ void workingset_refault(struct page *page, void *shadow) /* Page was active prior to eviction */ if (workingset) { SetPageWorkingset(page); + /* XXX: Move to lru_cache_add() when it supports new vs putback */ + spin_lock_irq(&page_pgdat(page)->lru_lock); + lru_note_cost(page); + spin_unlock_irq(&page_pgdat(page)->lru_lock); inc_lruvec_state(lruvec, WORKINGSET_RESTORE); } out: -- cgit v1.2.3 From 7cf111bc39f6792abedcdfbc4e6291a5603b0ef0 Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Wed, 3 Jun 2020 16:03:06 -0700 Subject: mm: vmscan: determine anon/file pressure balance at the reclaim root We split the LRU lists into anon and file, and we rebalance the scan pressure between them when one of them begins thrashing: if the file cache experiences workingset refaults, we increase the pressure on anonymous pages; if the workload is stalled on swapins, we increase the pressure on the file cache instead. With cgroups and their nested LRU lists, we currently don't do this correctly. While recursive cgroup reclaim establishes a relative LRU order among the pages of all involved cgroups, LRU pressure balancing is done on an individual cgroup LRU level. As a result, when one cgroup is thrashing on the filesystem cache while a sibling may have cold anonymous pages, pressure doesn't get equalized between them. 
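The remedy, implemented in the patch below, is to charge each cost event to the lruvec where it occurs and to every ancestor, so the reclaim root sees the aggregate. A minimal sketch of that recursive bookkeeping (simplified, invented types; the real lru_note_cost() also decays old events under the LRU lock):

struct lruvec_sketch {
	struct lruvec_sketch *parent;	/* NULL at the root */
	unsigned long anon_cost;
	unsigned long file_cost;
};

static void note_cost(struct lruvec_sketch *lruvec, int file,
		      unsigned long nr_pages)
{
	/*
	 * Charge this level and all ancestors, so the cgroup that
	 * initiated reclaim sees the aggregate cost of its subtree.
	 */
	do {
		if (file)
			lruvec->file_cost += nr_pages;
		else
			lruvec->anon_cost += nr_pages;
	} while ((lruvec = lruvec->parent));
}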
This patch moves LRU balancing decision to the root of reclaim - the same level where the LRU order is established. It does this by tracking LRU cost recursively, so that every level of the cgroup tree knows the aggregate LRU cost of all memory within its domain. When the page scanner calculates the scan balance for any given individual cgroup's LRU list, it uses the values from the ancestor cgroup that initiated the reclaim cycle. If one sibling is then thrashing on the cache, it will tip the pressure balance inside its ancestors, and the next hierarchical reclaim iteration will go more after the anon pages in the tree. Signed-off-by: Johannes Weiner Signed-off-by: Andrew Morton Cc: Joonsoo Kim Cc: Michal Hocko Cc: Minchan Kim Cc: Rik van Riel Link: http://lkml.kernel.org/r/20200520232525.798933-13-hannes@cmpxchg.org Signed-off-by: Linus Torvalds --- include/linux/memcontrol.h | 13 +++++++++++++ mm/swap.c | 32 +++++++++++++++++++++++++++----- mm/vmscan.c | 41 +++++++++++++++++------------------------ 3 files changed, 57 insertions(+), 29 deletions(-) (limited to 'include/linux') diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index d5bf3b5bfe6d..e77197a62809 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -1242,6 +1242,19 @@ static inline void dec_lruvec_page_state(struct page *page, mod_lruvec_page_state(page, idx, -1); } +static inline struct lruvec *parent_lruvec(struct lruvec *lruvec) +{ + struct mem_cgroup *memcg; + + memcg = lruvec_memcg(lruvec); + if (!memcg) + return NULL; + memcg = parent_mem_cgroup(memcg); + if (!memcg) + return NULL; + return mem_cgroup_lruvec(memcg, lruvec_pgdat(lruvec)); +} + #ifdef CONFIG_CGROUP_WRITEBACK struct wb_domain *mem_cgroup_wb_domain(struct bdi_writeback *wb); diff --git a/mm/swap.c b/mm/swap.c index 2dc7d392642f..4dff2123f695 100644 --- a/mm/swap.c +++ b/mm/swap.c @@ -282,11 +282,33 @@ void lru_note_cost(struct page *page) { struct lruvec *lruvec = mem_cgroup_page_lruvec(page, page_pgdat(page)); - /* Record new data point */ - if (page_is_file_lru(page)) - lruvec->file_cost++; - else - lruvec->anon_cost++; + do { + unsigned long lrusize; + + /* Record cost event */ + if (page_is_file_lru(page)) + lruvec->file_cost++; + else + lruvec->anon_cost++; + + /* + * Decay previous events + * + * Because workloads change over time (and to avoid + * overflow) we keep these statistics as a floating + * average, which ends up weighing recent refaults + * more than old ones. + */ + lrusize = lruvec_page_state(lruvec, NR_INACTIVE_ANON) + + lruvec_page_state(lruvec, NR_ACTIVE_ANON) + + lruvec_page_state(lruvec, NR_INACTIVE_FILE) + + lruvec_page_state(lruvec, NR_ACTIVE_FILE); + + if (lruvec->file_cost + lruvec->anon_cost > lrusize / 4) { + lruvec->file_cost /= 2; + lruvec->anon_cost /= 2; + } + } while ((lruvec = parent_lruvec(lruvec))); } static void __activate_page(struct page *page, struct lruvec *lruvec, diff --git a/mm/vmscan.c b/mm/vmscan.c index 76e823db21a7..d08640f0235c 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -79,6 +79,12 @@ struct scan_control { */ struct mem_cgroup *target_mem_cgroup; + /* + * Scan pressure balancing between anon and file LRUs + */ + unsigned long anon_cost; + unsigned long file_cost; + /* Can active pages be deactivated as part of reclaim? 
*/ #define DEACTIVATE_ANON 1 #define DEACTIVATE_FILE 2 @@ -2231,10 +2237,8 @@ static void get_scan_count(struct lruvec *lruvec, struct scan_control *sc, int swappiness = mem_cgroup_swappiness(memcg); u64 fraction[2]; u64 denominator = 0; /* gcc */ - struct pglist_data *pgdat = lruvec_pgdat(lruvec); unsigned long anon_prio, file_prio; enum scan_balance scan_balance; - unsigned long anon, file; unsigned long totalcost; unsigned long ap, fp; enum lru_list lru; @@ -2285,7 +2289,6 @@ static void get_scan_count(struct lruvec *lruvec, struct scan_control *sc, } scan_balance = SCAN_FRACT; - /* * Calculate the pressure balance between anon and file pages. * @@ -2300,30 +2303,12 @@ static void get_scan_count(struct lruvec *lruvec, struct scan_control *sc, anon_prio = swappiness; file_prio = 200 - anon_prio; - /* - * Because workloads change over time (and to avoid overflow) - * we keep these statistics as a floating average, which ends - * up weighing recent refaults more than old ones. - */ - - anon = lruvec_lru_size(lruvec, LRU_ACTIVE_ANON, MAX_NR_ZONES) + - lruvec_lru_size(lruvec, LRU_INACTIVE_ANON, MAX_NR_ZONES); - file = lruvec_lru_size(lruvec, LRU_ACTIVE_FILE, MAX_NR_ZONES) + - lruvec_lru_size(lruvec, LRU_INACTIVE_FILE, MAX_NR_ZONES); - - spin_lock_irq(&pgdat->lru_lock); - totalcost = lruvec->anon_cost + lruvec->file_cost; - if (unlikely(totalcost > (anon + file) / 4)) { - lruvec->anon_cost /= 2; - lruvec->file_cost /= 2; - totalcost /= 2; - } + totalcost = sc->anon_cost + sc->file_cost; ap = anon_prio * (totalcost + 1); - ap /= lruvec->anon_cost + 1; + ap /= sc->anon_cost + 1; fp = file_prio * (totalcost + 1); - fp /= lruvec->file_cost + 1; - spin_unlock_irq(&pgdat->lru_lock); + fp /= sc->file_cost + 1; fraction[0] = ap; fraction[1] = fp; @@ -2687,6 +2672,14 @@ again: nr_reclaimed = sc->nr_reclaimed; nr_scanned = sc->nr_scanned; + /* + * Determine the scan balance between anon and file LRUs. + */ + spin_lock_irq(&pgdat->lru_lock); + sc->anon_cost = target_lruvec->anon_cost; + sc->file_cost = target_lruvec->file_cost; + spin_unlock_irq(&pgdat->lru_lock); + /* * Target desirable inactive:active list ratios for the anon * and file LRU lists. -- cgit v1.2.3 From 96f8bf4fb1dd2656ae3e92326be9ebf003bbfd45 Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Wed, 3 Jun 2020 16:03:09 -0700 Subject: mm: vmscan: reclaim writepage is IO cost The VM tries to balance reclaim pressure between anon and file so as to reduce the amount of IO incurred due to the memory shortage. It already counts refaults and swapins, but in addition it should also count writepage calls during reclaim. For swap, this is obvious: it's IO that wouldn't have occurred if the anonymous memory hadn't been under memory pressure. From a relative balancing point of view this makes sense as well: even if anon is cold and reclaimable, a cache that isn't thrashing may have equally cold pages that don't require IO to reclaim. For file writeback, it's trickier: some of the reclaim writepage IO would have likely occurred anyway due to dirty expiration. But not all of it - premature writeback reduces batching and generates additional writes. Since the flushers are already woken up by the time the VM starts writing cache pages one by one, let's assume that we're likely causing writes that wouldn't have happened without memory pressure. In addition, the per-page cost of IO would have probably been much cheaper if written in larger batches from the flusher thread rather than the single-page-writes from kswapd.
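To see how such cost events steer reclaim, here is a standalone sketch of the fraction arithmetic that get_scan_count() uses in this series (illustrative userspace C with made-up event counts, not kernel code):

#include <stdio.h>

int main(void)
{
	unsigned long swappiness = 60;		/* default vm.swappiness */
	unsigned long anon_prio = swappiness;
	unsigned long file_prio = 200 - anon_prio;
	/* assumed cost events: refaults, swapins, reclaim writepage IO */
	unsigned long anon_cost = 100;
	unsigned long file_cost = 900;		/* the file list is thrashing */
	unsigned long totalcost = anon_cost + file_cost;

	/* the +1 keeps the math defined before any events are recorded */
	unsigned long ap = anon_prio * (totalcost + 1) / (anon_cost + 1);
	unsigned long fp = file_prio * (totalcost + 1) / (file_cost + 1);

	/* the high-cost (thrashing) list gets the smaller scan share */
	printf("anon : file scan weight = %lu : %lu\n", ap, fp);
	return 0;
}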
For our purposes - getting the trend right to accelerate convergence on a stable state that doesn't require paging at all - this is sufficiently accurate. If we later wanted to optimize for sustained thrashing, we can still refine the measurements. Count all writepage calls from kswapd as IO cost toward the LRU that the page belongs to. Why do this dynamically? Don't we know in advance that anon pages require IO to reclaim, and so could build in a static bias? First, scanning is not the same as reclaiming. If all the anon pages are referenced, we may not swap for a while just because we're scanning the anon list. During this time, however, it's important that we age anonymous memory and the page cache at the same rate so that their hot-cold gradients are comparable. Everything else being equal, we still want to reclaim the coldest memory overall. Second, we keep copies in swap unless the page changes. If there is swap-backed data that's mostly read (tmpfs file) and has been swapped out before, we can reclaim it without incurring additional IO. Signed-off-by: Johannes Weiner Signed-off-by: Andrew Morton Cc: Joonsoo Kim Cc: Michal Hocko Cc: Minchan Kim Cc: Rik van Riel Link: http://lkml.kernel.org/r/20200520232525.798933-14-hannes@cmpxchg.org Signed-off-by: Linus Torvalds --- include/linux/swap.h | 4 +++- include/linux/vmstat.h | 1 + mm/swap.c | 16 ++++++++++------ mm/swap_state.c | 2 +- mm/vmscan.c | 3 +++ mm/workingset.c | 2 +- 6 files changed, 19 insertions(+), 9 deletions(-) (limited to 'include/linux') diff --git a/include/linux/swap.h b/include/linux/swap.h index 0b71bf75fb67..4c5974bb9ba9 100644 --- a/include/linux/swap.h +++ b/include/linux/swap.h @@ -334,7 +334,9 @@ extern unsigned long nr_free_pagecache_pages(void); /* linux/mm/swap.c */ -extern void lru_note_cost(struct page *); +extern void lru_note_cost(struct lruvec *lruvec, bool file, + unsigned int nr_pages); +extern void lru_note_cost_page(struct page *); extern void lru_cache_add(struct page *); extern void lru_add_page_tail(struct page *page, struct page *page_tail, struct lruvec *lruvec, struct list_head *head); diff --git a/include/linux/vmstat.h b/include/linux/vmstat.h index 10cc932e209a..3d12c34cd42a 100644 --- a/include/linux/vmstat.h +++ b/include/linux/vmstat.h @@ -26,6 +26,7 @@ struct reclaim_stat { unsigned nr_congested; unsigned nr_writeback; unsigned nr_immediate; + unsigned nr_pageout; unsigned nr_activate[2]; unsigned nr_ref_keep; unsigned nr_unmap_fail; diff --git a/mm/swap.c b/mm/swap.c index 4dff2123f695..343675d629ae 100644 --- a/mm/swap.c +++ b/mm/swap.c @@ -278,18 +278,16 @@ void rotate_reclaimable_page(struct page *page) } } -void lru_note_cost(struct page *page) +void lru_note_cost(struct lruvec *lruvec, bool file, unsigned int nr_pages) { - struct lruvec *lruvec = mem_cgroup_page_lruvec(page, page_pgdat(page)); - do { unsigned long lrusize; /* Record cost event */ - if (page_is_file_lru(page)) - lruvec->file_cost++; + if (file) + lruvec->file_cost += nr_pages; else - lruvec->anon_cost++; + lruvec->anon_cost += nr_pages; /* * Decay previous events @@ -311,6 +309,12 @@ void lru_note_cost(struct page *page) } while ((lruvec = parent_lruvec(lruvec))); } +void lru_note_cost_page(struct page *page) +{ + lru_note_cost(mem_cgroup_page_lruvec(page, page_pgdat(page)), + page_is_file_lru(page), hpage_nr_pages(page)); +} + static void __activate_page(struct page *page, struct lruvec *lruvec, void *arg) { diff --git a/mm/swap_state.c b/mm/swap_state.c index 1cd0b345ff7e..9d20b00627af 100644 --- a/mm/swap_state.c 
+++ b/mm/swap_state.c @@ -442,7 +442,7 @@ struct page *__read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask, /* XXX: Move to lru_cache_add() when it supports new vs putback */ spin_lock_irq(&page_pgdat(page)->lru_lock); - lru_note_cost(page); + lru_note_cost_page(page); spin_unlock_irq(&page_pgdat(page)->lru_lock); /* Caller will initiate read into locked page */ diff --git a/mm/vmscan.c b/mm/vmscan.c index d08640f0235c..14ffe9ccf7ef 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -1359,6 +1359,8 @@ static unsigned int shrink_page_list(struct list_head *page_list, case PAGE_ACTIVATE: goto activate_locked; case PAGE_SUCCESS: + stat->nr_pageout += hpage_nr_pages(page); + if (PageWriteback(page)) goto keep; if (PageDirty(page)) @@ -1964,6 +1966,7 @@ shrink_inactive_list(unsigned long nr_to_scan, struct lruvec *lruvec, move_pages_to_lru(lruvec, &page_list); __mod_node_page_state(pgdat, NR_ISOLATED_ANON + file, -nr_taken); + lru_note_cost(lruvec, file, stat.nr_pageout); item = current_is_kswapd() ? PGSTEAL_KSWAPD : PGSTEAL_DIRECT; if (!cgroup_reclaim(sc)) __count_vm_events(item, nr_reclaimed); diff --git a/mm/workingset.c b/mm/workingset.c index a6a2a740ed0b..d481ea452eeb 100644 --- a/mm/workingset.c +++ b/mm/workingset.c @@ -367,7 +367,7 @@ void workingset_refault(struct page *page, void *shadow) SetPageWorkingset(page); /* XXX: Move to lru_cache_add() when it supports new vs putback */ spin_lock_irq(&page_pgdat(page)->lru_lock); - lru_note_cost(page); + lru_note_cost_page(page); spin_unlock_irq(&page_pgdat(page)->lru_lock); inc_lruvec_state(lruvec, WORKINGSET_RESTORE); } -- cgit v1.2.3 From 8cbd54f52997f43be3d09acd9fa9a10d202c5374 Mon Sep 17 00:00:00 2001 From: chenqiwu Date: Wed, 3 Jun 2020 16:03:28 -0700 Subject: include/linux/memblock.h: fix minor typo and unclear comment Fix a minor typo "usabe->usable" in the current description of the member variable "memory" in struct memblock. Also, the member variable "base" in struct memblock_region is currently described as the physical address of the memory region, which is unclear; "base address of the region" is clearer, since the variable is declared as phys_addr_t. Signed-off-by: chenqiwu Signed-off-by: Andrew Morton Reviewed-by: Mike Rapoport Link: http://lkml.kernel.org/r/1588846952-32166-1-git-send-email-qiwuchen55@gmail.com Signed-off-by: Linus Torvalds --- include/linux/memblock.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/memblock.h b/include/linux/memblock.h index 807ab9daf0cd..017fae833d4a 100644 --- a/include/linux/memblock.h +++ b/include/linux/memblock.h @@ -41,7 +41,7 @@ enum memblock_flags { /** * struct memblock_region - represents a memory region - * @base: physical address of the region + * @base: base address of the region * @size: size of the region * @flags: memory region attributes * @nid: NUMA node id @@ -75,7 +75,7 @@ struct memblock_type { * struct memblock - memblock allocator metadata * @bottom_up: is bottom up direction?
* @current_limit: physical address of the current allocation limit - * @memory: usabe memory regions + * @memory: usable memory regions * @reserved: reserved memory regions * @physmem: all physical memory */ -- cgit v1.2.3 From 4301efa4c7cca11556dd89899eee066d49b47bf7 Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Tue, 21 Apr 2020 10:54:44 +0200 Subject: writeback: Export inode_io_list_del() Ext4 needs to remove inode from writeback lists after it is out of visibility of its journalling machinery (which can still dirty the inode). Export inode_io_list_del() for it. Signed-off-by: Jan Kara Link: https://lore.kernel.org/r/20200421085445.5731-3-jack@suse.cz Signed-off-by: Theodore Ts'o --- fs/fs-writeback.c | 1 + fs/internal.h | 2 -- include/linux/writeback.h | 1 + 3 files changed, 2 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c index 76ac9c7d32ec..e58bd5f758d0 100644 --- a/fs/fs-writeback.c +++ b/fs/fs-writeback.c @@ -1126,6 +1126,7 @@ void inode_io_list_del(struct inode *inode) inode_io_list_del_locked(inode, wb); spin_unlock(&wb->list_lock); } +EXPORT_SYMBOL(inode_io_list_del); /* * mark an inode as under writeback on the sb diff --git a/fs/internal.h b/fs/internal.h index aa5d45524e87..8819d0d58b03 100644 --- a/fs/internal.h +++ b/fs/internal.h @@ -143,8 +143,6 @@ extern int dentry_needs_remove_privs(struct dentry *dentry); /* * fs-writeback.c */ -extern void inode_io_list_del(struct inode *inode); - extern long get_nr_dirty_inodes(void); extern int invalidate_inodes(struct super_block *, bool); diff --git a/include/linux/writeback.h b/include/linux/writeback.h index a19d845dd7eb..902aa317621b 100644 --- a/include/linux/writeback.h +++ b/include/linux/writeback.h @@ -197,6 +197,7 @@ void wakeup_flusher_threads(enum wb_reason reason); void wakeup_flusher_threads_bdi(struct backing_dev_info *bdi, enum wb_reason reason); void inode_wait_for_writeback(struct inode *inode); +void inode_io_list_del(struct inode *inode); /* writeback.h requires fs.h; it, too, is not included from here. */ static inline void wait_on_inode(struct inode *inode) -- cgit v1.2.3 From 44ebcd06bbb3ab3ee446b933800aca32fc4ca9b1 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Sat, 23 May 2020 09:30:10 +0200 Subject: fs: mark __generic_block_fiemap static There is no caller left outside of ioctl.c. Signed-off-by: Christoph Hellwig Reviewed-by: Ritesh Harjani Reviewed-by: Jan Kara Reviewed-by: Darrick J. Wong Link: https://lore.kernel.org/r/20200523073016.2944131-4-hch@lst.de Signed-off-by: Theodore Ts'o --- fs/ioctl.c | 4 +--- include/linux/fs.h | 4 ---- 2 files changed, 1 insertion(+), 7 deletions(-) (limited to 'include/linux') diff --git a/fs/ioctl.c b/fs/ioctl.c index 5e80b40bc1b5..8fe5131b1dee 100644 --- a/fs/ioctl.c +++ b/fs/ioctl.c @@ -307,8 +307,7 @@ static inline loff_t blk_to_logical(struct inode *inode, sector_t blk) * If you use this function directly, you need to do your own locking. Use * generic_block_fiemap if you want the locking done for you. 
*/ - -int __generic_block_fiemap(struct inode *inode, +static int __generic_block_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, loff_t start, loff_t len, get_block_t *get_block) { @@ -453,7 +452,6 @@ int __generic_block_fiemap(struct inode *inode, return ret; } -EXPORT_SYMBOL(__generic_block_fiemap); /** * generic_block_fiemap - FIEMAP for block based inodes diff --git a/include/linux/fs.h b/include/linux/fs.h index 4f6f59b4f22a..3104c6f7527b 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -3299,10 +3299,6 @@ static inline int vfs_fstat(int fd, struct kstat *stat) extern const char *vfs_get_link(struct dentry *, struct delayed_call *); extern int vfs_readlink(struct dentry *, char __user *, int); -extern int __generic_block_fiemap(struct inode *inode, - struct fiemap_extent_info *fieinfo, - loff_t start, loff_t len, - get_block_t *get_block); extern int generic_block_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, u64 start, u64 len, get_block_t *get_block); -- cgit v1.2.3 From 10c5db286452b8c60e8f58e9a4c1cbc5a91e4e5b Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Sat, 23 May 2020 09:30:11 +0200 Subject: fs: move the fiemap definitions out of fs.h No need to pull the fiemap definitions into almost every file in the kernel build. Signed-off-by: Christoph Hellwig Reviewed-by: Ritesh Harjani Reviewed-by: Darrick J. Wong Link: https://lore.kernel.org/r/20200523073016.2944131-5-hch@lst.de Signed-off-by: Theodore Ts'o --- fs/bad_inode.c | 1 + fs/btrfs/extent_io.h | 1 + fs/cifs/inode.c | 1 + fs/cifs/smb2ops.c | 1 + fs/ext2/inode.c | 1 + fs/ext4/ext4.h | 1 + fs/f2fs/data.c | 1 + fs/f2fs/inline.c | 1 + fs/gfs2/inode.c | 1 + fs/hpfs/file.c | 1 + fs/ioctl.c | 1 + fs/iomap/fiemap.c | 1 + fs/nilfs2/inode.c | 1 + fs/overlayfs/inode.c | 1 + fs/xfs/xfs_iops.c | 1 + include/linux/fiemap.h | 24 ++++++++++++++++++++++++ include/linux/fs.h | 19 +------------------ include/uapi/linux/fiemap.h | 6 +++--- 18 files changed, 43 insertions(+), 21 deletions(-) create mode 100644 include/linux/fiemap.h (limited to 'include/linux') diff --git a/fs/bad_inode.c b/fs/bad_inode.c index 8035d2a44561..54f0ce444272 100644 --- a/fs/bad_inode.c +++ b/fs/bad_inode.c @@ -15,6 +15,7 @@ #include #include #include +#include static int bad_file_open(struct inode *inode, struct file *filp) { diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h index 2ed65bd0760e..817698bc0669 100644 --- a/fs/btrfs/extent_io.h +++ b/fs/btrfs/extent_io.h @@ -5,6 +5,7 @@ #include #include +#include #include "ulist.h" /* diff --git a/fs/cifs/inode.c b/fs/cifs/inode.c index 390d2b15ef6e..3f276eb8ca68 100644 --- a/fs/cifs/inode.c +++ b/fs/cifs/inode.c @@ -25,6 +25,7 @@ #include #include #include +#include #include #include "cifsfs.h" diff --git a/fs/cifs/smb2ops.c b/fs/cifs/smb2ops.c index f829f4165d38..09047f1ddfb6 100644 --- a/fs/cifs/smb2ops.c +++ b/fs/cifs/smb2ops.c @@ -12,6 +12,7 @@ #include #include #include +#include #include "cifsfs.h" #include "cifsglob.h" #include "smb2pdu.h" diff --git a/fs/ext2/inode.c b/fs/ext2/inode.c index c885cf7d724b..0f12a0e8a8d9 100644 --- a/fs/ext2/inode.c +++ b/fs/ext2/inode.c @@ -36,6 +36,7 @@ #include #include #include +#include #include "ext2.h" #include "acl.h" #include "xattr.h" diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index 1eb07ca91fca..9e5c332a2b94 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -36,6 +36,7 @@ #include #include #include +#include #ifdef __KERNEL__ #include #endif diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index 
cdf2f626bea7..25abbbb65ba0 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -19,6 +19,7 @@ #include #include #include +#include #include "f2fs.h" #include "node.h" diff --git a/fs/f2fs/inline.c b/fs/f2fs/inline.c index 4167e5408151..9686ffea177e 100644 --- a/fs/f2fs/inline.c +++ b/fs/f2fs/inline.c @@ -8,6 +8,7 @@ #include #include +#include #include "f2fs.h" #include "node.h" diff --git a/fs/gfs2/inode.c b/fs/gfs2/inode.c index 70b2d3a1e866..4842f313a808 100644 --- a/fs/gfs2/inode.c +++ b/fs/gfs2/inode.c @@ -17,6 +17,7 @@ #include #include #include +#include #include #include "gfs2.h" diff --git a/fs/hpfs/file.c b/fs/hpfs/file.c index b36abf9cb345..62959a8e43ad 100644 --- a/fs/hpfs/file.c +++ b/fs/hpfs/file.c @@ -9,6 +9,7 @@ #include "hpfs_fn.h" #include +#include #define BLOCKS(size) (((size) + 511) >> 9) diff --git a/fs/ioctl.c b/fs/ioctl.c index 8fe5131b1dee..3f300cc07dee 100644 --- a/fs/ioctl.c +++ b/fs/ioctl.c @@ -18,6 +18,7 @@ #include #include #include +#include #include "internal.h" diff --git a/fs/iomap/fiemap.c b/fs/iomap/fiemap.c index d55e8f491a5e..0a807bbb2b4a 100644 --- a/fs/iomap/fiemap.c +++ b/fs/iomap/fiemap.c @@ -6,6 +6,7 @@ #include #include #include +#include struct fiemap_ctx { struct fiemap_extent_info *fi; diff --git a/fs/nilfs2/inode.c b/fs/nilfs2/inode.c index 671085512e0f..6e1aca38931f 100644 --- a/fs/nilfs2/inode.c +++ b/fs/nilfs2/inode.c @@ -14,6 +14,7 @@ #include #include #include +#include #include "nilfs.h" #include "btnode.h" #include "segment.h" diff --git a/fs/overlayfs/inode.c b/fs/overlayfs/inode.c index b0d42ece4d7c..b5fec3410556 100644 --- a/fs/overlayfs/inode.c +++ b/fs/overlayfs/inode.c @@ -10,6 +10,7 @@ #include #include #include +#include #include "overlayfs.h" diff --git a/fs/xfs/xfs_iops.c b/fs/xfs/xfs_iops.c index f7a99b3bbcf7..44c353998ac5 100644 --- a/fs/xfs/xfs_iops.c +++ b/fs/xfs/xfs_iops.c @@ -25,6 +25,7 @@ #include #include #include +#include /* * Directories have different lock order w.r.t. 
mmap_sem compared to regular diff --git a/include/linux/fiemap.h b/include/linux/fiemap.h new file mode 100644 index 000000000000..240d4f7d9116 --- /dev/null +++ b/include/linux/fiemap.h @@ -0,0 +1,24 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _LINUX_FIEMAP_H +#define _LINUX_FIEMAP_H 1 + +#include +#include + +struct fiemap_extent_info { + unsigned int fi_flags; /* Flags as passed from user */ + unsigned int fi_extents_mapped; /* Number of mapped extents */ + unsigned int fi_extents_max; /* Size of fiemap_extent array */ + struct fiemap_extent __user *fi_extents_start; /* Start of + fiemap_extent array */ +}; + +int fiemap_fill_next_extent(struct fiemap_extent_info *info, u64 logical, + u64 phys, u64 len, u32 flags); +int fiemap_check_flags(struct fiemap_extent_info *fieinfo, u32 fs_flags); + +int generic_block_fiemap(struct inode *inode, + struct fiemap_extent_info *fieinfo, u64 start, u64 len, + get_block_t *get_block); + +#endif /* _LINUX_FIEMAP_H 1 */ diff --git a/include/linux/fs.h b/include/linux/fs.h index 3104c6f7527b..09bcd329c062 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -24,7 +24,6 @@ #include #include #include -#include #include #include #include @@ -48,6 +47,7 @@ struct backing_dev_info; struct bdi_writeback; struct bio; struct export_operations; +struct fiemap_extent_info; struct hd_geometry; struct iovec; struct kiocb; @@ -1745,19 +1745,6 @@ extern long compat_ptr_ioctl(struct file *file, unsigned int cmd, extern void inode_init_owner(struct inode *inode, const struct inode *dir, umode_t mode); extern bool may_open_dev(const struct path *path); -/* - * VFS FS_IOC_FIEMAP helper definitions. - */ -struct fiemap_extent_info { - unsigned int fi_flags; /* Flags as passed from user */ - unsigned int fi_extents_mapped; /* Number of mapped extents */ - unsigned int fi_extents_max; /* Size of fiemap_extent array */ - struct fiemap_extent __user *fi_extents_start; /* Start of - fiemap_extent array */ -}; -int fiemap_fill_next_extent(struct fiemap_extent_info *info, u64 logical, - u64 phys, u64 len, u32 flags); -int fiemap_check_flags(struct fiemap_extent_info *fieinfo, u32 fs_flags); /* * This is the "filldir" function type, used by readdir() to let @@ -3299,10 +3286,6 @@ static inline int vfs_fstat(int fd, struct kstat *stat) extern const char *vfs_get_link(struct dentry *, struct delayed_call *); extern int vfs_readlink(struct dentry *, char __user *, int); -extern int generic_block_fiemap(struct inode *inode, - struct fiemap_extent_info *fieinfo, u64 start, - u64 len, get_block_t *get_block); - extern struct file_system_type *get_filesystem(struct file_system_type *fs); extern void put_filesystem(struct file_system_type *fs); extern struct file_system_type *get_fs_type(const char *name); diff --git a/include/uapi/linux/fiemap.h b/include/uapi/linux/fiemap.h index 7a900b2377b6..24ca0c00cae3 100644 --- a/include/uapi/linux/fiemap.h +++ b/include/uapi/linux/fiemap.h @@ -9,8 +9,8 @@ * Andreas Dilger */ -#ifndef _LINUX_FIEMAP_H -#define _LINUX_FIEMAP_H +#ifndef _UAPI_LINUX_FIEMAP_H +#define _UAPI_LINUX_FIEMAP_H #include @@ -67,4 +67,4 @@ struct fiemap { #define FIEMAP_EXTENT_SHARED 0x00002000 /* Space shared with other * files. 
*/ -#endif /* _LINUX_FIEMAP_H */ +#endif /* _UAPI_LINUX_FIEMAP_H */ -- cgit v1.2.3 From 2732881894714f545ffac42dad7ba7730069874d Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Sat, 23 May 2020 09:30:12 +0200 Subject: iomap: fix the iomap_fiemap prototype iomap_fiemap should take u64 start and len arguments, just like the ->fiemap prototype. Signed-off-by: Christoph Hellwig Reviewed-by: Ritesh Harjani Reviewed-by: Darrick J. Wong Link: https://lore.kernel.org/r/20200523073016.2944131-6-hch@lst.de Signed-off-by: Theodore Ts'o --- fs/iomap/fiemap.c | 2 +- include/linux/iomap.h | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/fs/iomap/fiemap.c b/fs/iomap/fiemap.c index 0a807bbb2b4a..449705575acf 100644 --- a/fs/iomap/fiemap.c +++ b/fs/iomap/fiemap.c @@ -66,7 +66,7 @@ iomap_fiemap_actor(struct inode *inode, loff_t pos, loff_t length, void *data, } int iomap_fiemap(struct inode *inode, struct fiemap_extent_info *fi, - loff_t start, loff_t len, const struct iomap_ops *ops) + u64 start, u64 len, const struct iomap_ops *ops) { struct fiemap_ctx ctx; loff_t ret; diff --git a/include/linux/iomap.h b/include/linux/iomap.h index 8b09463dae0d..63db02528b70 100644 --- a/include/linux/iomap.h +++ b/include/linux/iomap.h @@ -178,7 +178,7 @@ int iomap_truncate_page(struct inode *inode, loff_t pos, bool *did_zero, vm_fault_t iomap_page_mkwrite(struct vm_fault *vmf, const struct iomap_ops *ops); int iomap_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, - loff_t start, loff_t len, const struct iomap_ops *ops); + u64 start, u64 len, const struct iomap_ops *ops); loff_t iomap_seek_hole(struct inode *inode, loff_t offset, const struct iomap_ops *ops); loff_t iomap_seek_data(struct inode *inode, loff_t offset, -- cgit v1.2.3 From cddf8a2c4a8286ae60fc866eab59c8bc524e93a0 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Sat, 23 May 2020 09:30:13 +0200 Subject: fs: move fiemap range validation into the file systems instances Replace fiemap_check_flags with a fiemap_prep helper that also takes the inode and mapped range, and performs the sanity check and truncation previously done in fiemap_check_range. This way the validation is inside the file system itself and thus properly works for the stacked overlayfs case as well. Signed-off-by: Christoph Hellwig Reviewed-by: Amir Goldstein Reviewed-by: Darrick J. Wong Link: https://lore.kernel.org/r/20200523073016.2944131-7-hch@lst.de Signed-off-by: Theodore Ts'o --- Documentation/filesystems/fiemap.txt | 12 ++++--- fs/btrfs/inode.c | 2 +- fs/cifs/smb2ops.c | 6 ++-- fs/ext4/extents.c | 5 +-- fs/f2fs/data.c | 3 +- fs/ioctl.c | 63 ++++++++++++++---------------------- fs/iomap/fiemap.c | 2 +- fs/nilfs2/inode.c | 2 +- fs/ocfs2/extent_map.c | 3 +- include/linux/fiemap.h | 3 +- 10 files changed, 47 insertions(+), 54 deletions(-) (limited to 'include/linux') diff --git a/Documentation/filesystems/fiemap.txt b/Documentation/filesystems/fiemap.txt index ac87e6fda842..35c8571eccb6 100644 --- a/Documentation/filesystems/fiemap.txt +++ b/Documentation/filesystems/fiemap.txt @@ -203,16 +203,18 @@ EINTR once fatal signal received. 
Flag checking should be done at the beginning of the ->fiemap callback via the -fiemap_check_flags() helper: +fiemap_prep() helper: -int fiemap_check_flags(struct fiemap_extent_info *fieinfo, u32 fs_flags); +int fiemap_prep(struct inode *inode, struct fiemap_extent_info *fieinfo, + u64 start, u64 *len, u32 supported_flags); The struct fieinfo should be passed in as received from ioctl_fiemap(). The set of fiemap flags which the fs understands should be passed via fs_flags. If -fiemap_check_flags finds invalid user flags, it will place the bad values in +fiemap_prep finds invalid user flags, it will place the bad values in fieinfo->fi_flags and return -EBADR. If the file system gets -EBADR, from -fiemap_check_flags(), it should immediately exit, returning that error back to -ioctl_fiemap(). +fiemap_prep(), it should immediately exit, returning that error back to +ioctl_fiemap(). Additionally the range is validate against the supported +maximum file size. For each extent in the request range, the file system should call diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 320d1062068d..1f1ec361089b 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -8250,7 +8250,7 @@ static int btrfs_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, { int ret; - ret = fiemap_check_flags(fieinfo, BTRFS_FIEMAP_FLAGS); + ret = fiemap_prep(inode, fieinfo, start, &len, BTRFS_FIEMAP_FLAGS); if (ret) return ret; diff --git a/fs/cifs/smb2ops.c b/fs/cifs/smb2ops.c index 09047f1ddfb6..828e53e795c6 100644 --- a/fs/cifs/smb2ops.c +++ b/fs/cifs/smb2ops.c @@ -3408,8 +3408,10 @@ static int smb3_fiemap(struct cifs_tcon *tcon, int i, num, rc, flags, last_blob; u64 next; - if (fiemap_check_flags(fei, FIEMAP_FLAG_SYNC)) - return -EBADR; + rc = fiemap_prep(d_inode(cfile->dentry), fei, start, &len, + FIEMAP_FLAG_SYNC); + if (rc) + return rc; xid = get_xid(); again: diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c index b365c8b407c4..cea083efb650 100644 --- a/fs/ext4/extents.c +++ b/fs/ext4/extents.c @@ -4938,8 +4938,9 @@ int ext4_get_es_cache(struct inode *inode, struct fiemap_extent_info *fieinfo, fieinfo->fi_flags &= ~FIEMAP_FLAG_CACHE; } - if (fiemap_check_flags(fieinfo, FIEMAP_FLAG_SYNC)) - return -EBADR; + error = fiemap_prep(inode, fieinfo, start, &len, FIEMAP_FLAG_SYNC); + if (error) + return error; error = ext4_fiemap_check_ranges(inode, start, &len); if (error) diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index 25abbbb65ba0..03faafc591b1 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -1825,7 +1825,8 @@ int f2fs_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, return ret; } - ret = fiemap_check_flags(fieinfo, FIEMAP_FLAG_SYNC | FIEMAP_FLAG_XATTR); + ret = fiemap_prep(inode, fieinfo, start, &len, + FIEMAP_FLAG_SYNC | FIEMAP_FLAG_XATTR); if (ret) return ret; diff --git a/fs/ioctl.c b/fs/ioctl.c index 3f300cc07dee..56bbf02209ae 100644 --- a/fs/ioctl.c +++ b/fs/ioctl.c @@ -149,61 +149,50 @@ int fiemap_fill_next_extent(struct fiemap_extent_info *fieinfo, u64 logical, EXPORT_SYMBOL(fiemap_fill_next_extent); /** - * fiemap_check_flags - check validity of requested flags for fiemap + * fiemap_prep - check validity of requested flags for fiemap + * @inode: Inode to operate on * @fieinfo: Fiemap context passed into ->fiemap - * @fs_flags: Set of fiemap flags that the file system understands + * @start: Start of the mapped range + * @len: Length of the mapped range, can be truncated by this function. 
+ * @supported_flags: Set of fiemap flags that the file system understands * - * Called from file system ->fiemap callback. This will compute the - * intersection of valid fiemap flags and those that the fs supports. That - * value is then compared against the user supplied flags. In case of bad user - * flags, the invalid values will be written into the fieinfo structure, and - * -EBADR is returned, which tells ioctl_fiemap() to return those values to - * userspace. For this reason, a return code of -EBADR should be preserved. + * This function must be called from each ->fiemap instance to validate the + * fiemap request against the file system parameters. * - * Returns 0 on success, -EBADR on bad flags. + * Returns 0 on success, or a negative error on failure. */ -int fiemap_check_flags(struct fiemap_extent_info *fieinfo, u32 fs_flags) +int fiemap_prep(struct inode *inode, struct fiemap_extent_info *fieinfo, + u64 start, u64 *len, u32 supported_flags) { + u64 maxbytes = inode->i_sb->s_maxbytes; u32 incompat_flags; - incompat_flags = fieinfo->fi_flags & ~(FIEMAP_FLAGS_COMPAT & fs_flags); - if (incompat_flags) { - fieinfo->fi_flags = incompat_flags; - return -EBADR; - } - return 0; -} -EXPORT_SYMBOL(fiemap_check_flags); - -static int fiemap_check_ranges(struct super_block *sb, - u64 start, u64 len, u64 *new_len) -{ - u64 maxbytes = (u64) sb->s_maxbytes; - - *new_len = len; - - if (len == 0) + if (*len == 0) return -EINVAL; - if (start > maxbytes) return -EFBIG; /* * Shrink request scope to what the fs can actually handle. */ - if (len > maxbytes || (maxbytes - len) < start) - *new_len = maxbytes - start; + if (*len > maxbytes || (maxbytes - *len) < start) + *len = maxbytes - start; + supported_flags &= FIEMAP_FLAGS_COMPAT; + incompat_flags = fieinfo->fi_flags & ~supported_flags; + if (incompat_flags) { + fieinfo->fi_flags = incompat_flags; + return -EBADR; + } return 0; } +EXPORT_SYMBOL(fiemap_prep); static int ioctl_fiemap(struct file *filp, struct fiemap __user *ufiemap) { struct fiemap fiemap; struct fiemap_extent_info fieinfo = { 0, }; struct inode *inode = file_inode(filp); - struct super_block *sb = inode->i_sb; - u64 len; int error; if (!inode->i_op->fiemap) @@ -215,11 +204,6 @@ static int ioctl_fiemap(struct file *filp, struct fiemap __user *ufiemap) if (fiemap.fm_extent_count > FIEMAP_MAX_EXTENTS) return -EINVAL; - error = fiemap_check_ranges(sb, fiemap.fm_start, fiemap.fm_length, - &len); - if (error) - return error; - fieinfo.fi_flags = fiemap.fm_flags; fieinfo.fi_extents_max = fiemap.fm_extent_count; fieinfo.fi_extents_start = ufiemap->fm_extents; @@ -232,7 +216,8 @@ static int ioctl_fiemap(struct file *filp, struct fiemap __user *ufiemap) if (fieinfo.fi_flags & FIEMAP_FLAG_SYNC) filemap_write_and_wait(inode->i_mapping); - error = inode->i_op->fiemap(inode, &fieinfo, fiemap.fm_start, len); + error = inode->i_op->fiemap(inode, &fieinfo, fiemap.fm_start, + fiemap.fm_length); fiemap.fm_flags = fieinfo.fi_flags; fiemap.fm_mapped_extents = fieinfo.fi_extents_mapped; if (copy_to_user(ufiemap, &fiemap, sizeof(fiemap))) @@ -320,7 +305,7 @@ static int __generic_block_fiemap(struct inode *inode, bool past_eof = false, whole_file = false; int ret = 0; - ret = fiemap_check_flags(fieinfo, FIEMAP_FLAG_SYNC); + ret = fiemap_prep(inode, fieinfo, start, &len, FIEMAP_FLAG_SYNC); if (ret) return ret; diff --git a/fs/iomap/fiemap.c b/fs/iomap/fiemap.c index 449705575acf..89dca4a97e4a 100644 --- a/fs/iomap/fiemap.c +++ b/fs/iomap/fiemap.c @@ -75,7 +75,7 @@ int iomap_fiemap(struct inode *inode, 
struct fiemap_extent_info *fi, ctx.fi = fi; ctx.prev.type = IOMAP_HOLE; - ret = fiemap_check_flags(fi, FIEMAP_FLAG_SYNC); + ret = fiemap_prep(inode, fi, start, &len, FIEMAP_FLAG_SYNC); if (ret) return ret; diff --git a/fs/nilfs2/inode.c b/fs/nilfs2/inode.c index 6e1aca38931f..052c2da11e4d 100644 --- a/fs/nilfs2/inode.c +++ b/fs/nilfs2/inode.c @@ -1006,7 +1006,7 @@ int nilfs_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, unsigned int blkbits = inode->i_blkbits; int ret, n; - ret = fiemap_check_flags(fieinfo, FIEMAP_FLAG_SYNC); + ret = fiemap_prep(inode, fieinfo, start, &len, FIEMAP_FLAG_SYNC); if (ret) return ret; diff --git a/fs/ocfs2/extent_map.c b/fs/ocfs2/extent_map.c index e3e2d1b2af51..3744179b73fa 100644 --- a/fs/ocfs2/extent_map.c +++ b/fs/ocfs2/extent_map.c @@ -746,7 +746,8 @@ int ocfs2_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, struct buffer_head *di_bh = NULL; struct ocfs2_extent_rec rec; - ret = fiemap_check_flags(fieinfo, OCFS2_FIEMAP_FLAGS); + ret = fiemap_prep(inode, fieinfo, map_start, &map_len, + OCFS2_FIEMAP_FLAGS); if (ret) return ret; diff --git a/include/linux/fiemap.h b/include/linux/fiemap.h index 240d4f7d9116..4e624c466583 100644 --- a/include/linux/fiemap.h +++ b/include/linux/fiemap.h @@ -13,9 +13,10 @@ struct fiemap_extent_info { fiemap_extent array */ }; +int fiemap_prep(struct inode *inode, struct fiemap_extent_info *fieinfo, + u64 start, u64 *len, u32 supported_flags); int fiemap_fill_next_extent(struct fiemap_extent_info *info, u64 logical, u64 phys, u64 len, u32 flags); -int fiemap_check_flags(struct fiemap_extent_info *fieinfo, u32 fs_flags); int generic_block_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, u64 start, u64 len, -- cgit v1.2.3 From df820f8de4e481222b17f9bcee7b909ae8167529 Mon Sep 17 00:00:00 2001 From: Miklos Szeredi Date: Thu, 4 Jun 2020 10:48:19 +0200 Subject: ovl: make private mounts longterm Overlayfs is using clone_private_mount() to create internal mounts for underlying layers. These are used for operations requiring a path, such as dentry_open(). Since these private mounts are not in any namespace they are treated as short term, "detached" mounts and mntput() involves taking the global mount_lock, which can result in serious cacheline pingpong. Make these private mounts longterm instead, which trade the penalty on mntput() for a slightly longer shutdown time due to an added RCU grace period when putting these mounts. Introduce a new helper kern_unmount_many() that can take care of multiple longterm mounts with a single RCU grace period. Cc: Al Viro Signed-off-by: Miklos Szeredi --- Documentation/filesystems/porting.rst | 7 +++++++ fs/namespace.c | 16 ++++++++++++++++ fs/overlayfs/super.c | 7 ++++++- include/linux/mount.h | 2 ++ 4 files changed, 31 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/Documentation/filesystems/porting.rst b/Documentation/filesystems/porting.rst index 26c093969573..867036aa90b8 100644 --- a/Documentation/filesystems/porting.rst +++ b/Documentation/filesystems/porting.rst @@ -858,3 +858,10 @@ be misspelled d_alloc_anon(). [should've been added in 2016] stale comment in finish_open() nonwithstanding, failure exits in ->atomic_open() instances should *NOT* fput() the file, no matter what. Everything is handled by the caller. + +--- + +**mandatory** + +clone_private_mount() returns a longterm mount now, so the proper destructor of +its result is kern_unmount() or kern_unmount_array(). 
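For code following this porting note, the expected lifecycle is roughly as follows (a sketch; NR_LAYERS and the paths array are assumptions, and error handling is elided):

#include <linux/mount.h>

#define NR_LAYERS 3	/* hypothetical layer count */

static struct vfsmount *layer_mnt[NR_LAYERS];

static void setup_layers(const struct path paths[NR_LAYERS])
{
	unsigned int i;

	/* each clone is a longterm mount from the start now */
	for (i = 0; i < NR_LAYERS; i++)
		layer_mnt[i] = clone_private_mount(&paths[i]);
}

static void teardown_layers(void)
{
	/* one RCU grace period for all layers, not one per mount */
	kern_unmount_array(layer_mnt, NR_LAYERS);
}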
diff --git a/fs/namespace.c b/fs/namespace.c index a28e4db075ed..d53517f1d741 100644 --- a/fs/namespace.c +++ b/fs/namespace.c @@ -1879,6 +1879,9 @@ struct vfsmount *clone_private_mount(const struct path *path) if (IS_ERR(new_mnt)) return ERR_CAST(new_mnt); + /* Longterm mount to be removed by kern_unmount*() */ + new_mnt->mnt_ns = MNT_NS_INTERNAL; + return &new_mnt->mnt; } EXPORT_SYMBOL_GPL(clone_private_mount); @@ -3804,6 +3807,19 @@ void kern_unmount(struct vfsmount *mnt) } EXPORT_SYMBOL(kern_unmount); +void kern_unmount_array(struct vfsmount *mnt[], unsigned int num) +{ + unsigned int i; + + for (i = 0; i < num; i++) + if (mnt[i]) + real_mount(mnt[i])->mnt_ns = NULL; + synchronize_rcu_expedited(); + for (i = 0; i < num; i++) + mntput(mnt[i]); +} +EXPORT_SYMBOL(kern_unmount_array); + bool our_mnt(struct vfsmount *mnt) { return check_mnt(real_mount(mnt)); diff --git a/fs/overlayfs/super.c b/fs/overlayfs/super.c index eb81d8760a6a..8d8cd46e1482 100644 --- a/fs/overlayfs/super.c +++ b/fs/overlayfs/super.c @@ -211,6 +211,7 @@ static void ovl_destroy_inode(struct inode *inode) static void ovl_free_fs(struct ovl_fs *ofs) { + struct vfsmount **mounts; unsigned i; iput(ofs->workbasedir_trap); @@ -224,10 +225,14 @@ static void ovl_free_fs(struct ovl_fs *ofs) dput(ofs->workbasedir); if (ofs->upperdir_locked) ovl_inuse_unlock(ovl_upper_mnt(ofs)->mnt_root); + + /* Hack! Reuse ofs->layers as a vfsmount array before freeing it */ + mounts = (struct vfsmount **) ofs->layers; for (i = 0; i < ofs->numlayer; i++) { iput(ofs->layers[i].trap); - mntput(ofs->layers[i].mnt); + mounts[i] = ofs->layers[i].mnt; } + kern_unmount_array(mounts, ofs->numlayer); kfree(ofs->layers); for (i = 0; i < ofs->numfs; i++) free_anon_bdev(ofs->fs[i].pseudo_dev); diff --git a/include/linux/mount.h b/include/linux/mount.h index bf8cc4108b8f..8de95a0bec8d 100644 --- a/include/linux/mount.h +++ b/include/linux/mount.h @@ -109,4 +109,6 @@ extern unsigned int sysctl_mount_max; extern bool path_is_mountpoint(const struct path *path); +extern void kern_unmount_array(struct vfsmount *mnt[], unsigned int num); + #endif /* _LINUX_MOUNT_H */ -- cgit v1.2.3 From d56f5136b01020155b6b0a29f69d924687529bee Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Thu, 4 Jun 2020 15:16:52 +0200 Subject: KVM: let kvm_destroy_vm_debugfs clean up vCPU debugfs directories After commit 63d0434 ("KVM: x86: move kvm_create_vcpu_debugfs after last failure point") we are creating the per-vCPU debugfs files after the creation of the vCPU file descriptor. This makes it possible for userspace to reach kvm_vcpu_release before kvm_create_vcpu_debugfs has finished. The vcpu->debugfs_dentry then does not have any associated inode anymore, and this causes a NULL-pointer dereference in debugfs_create_file. The solution is simply to avoid removing the files; they are cleaned up when the VM file descriptor is closed (and that must be after KVM_CREATE_VCPU returns). We can stop storing the dentry in struct kvm_vcpu too, because it is not needed anywhere after kvm_create_vcpu_debugfs returns.
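The lifetime rule the fix relies on is that debugfs removal is recursive: the per-vCPU files live under the VM's debugfs directory, so removing that single directory at VM destruction tears down every vcpuN/ subdirectory, and no per-vCPU dentry has to be kept around. In outline (a sketch of the pattern, not the patch itself):

/* creation: the directory dentry is passed down and then forgotten */
static void create_vcpu_debugfs_sketch(struct kvm_vcpu *vcpu)
{
	char name[16];
	struct dentry *dir;

	snprintf(name, sizeof(name), "vcpu%d", vcpu->vcpu_id);
	dir = debugfs_create_dir(name, vcpu->kvm->debugfs_dentry);
	kvm_arch_create_vcpu_debugfs(vcpu, dir);
}

/* destruction: one recursive removal covers all vCPU subdirectories */
static void destroy_vm_debugfs_sketch(struct kvm *kvm)
{
	debugfs_remove_recursive(kvm->debugfs_dentry);
}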
Reported-by: syzbot+705f4401d5a93a59b87d@syzkaller.appspotmail.com Fixes: 63d04348371b ("KVM: x86: move kvm_create_vcpu_debugfs after last failure point") Signed-off-by: Paolo Bonzini --- arch/arm64/kvm/arm.c | 5 ----- arch/x86/kvm/debugfs.c | 10 +++++----- include/linux/kvm_host.h | 3 +-- virt/kvm/kvm_main.c | 8 ++++---- 4 files changed, 10 insertions(+), 16 deletions(-) (limited to 'include/linux') diff --git a/arch/arm64/kvm/arm.c b/arch/arm64/kvm/arm.c index 7a57381c05e8..45276ed50dd6 100644 --- a/arch/arm64/kvm/arm.c +++ b/arch/arm64/kvm/arm.c @@ -144,11 +144,6 @@ out_fail_alloc: return ret; } -int kvm_arch_create_vcpu_debugfs(struct kvm_vcpu *vcpu) -{ - return 0; -} - vm_fault_t kvm_arch_vcpu_fault(struct kvm_vcpu *vcpu, struct vm_fault *vmf) { return VM_FAULT_SIGBUS; diff --git a/arch/x86/kvm/debugfs.c b/arch/x86/kvm/debugfs.c index 018aebce33ff..7e818d64bb4d 100644 --- a/arch/x86/kvm/debugfs.c +++ b/arch/x86/kvm/debugfs.c @@ -43,22 +43,22 @@ static int vcpu_get_tsc_scaling_frac_bits(void *data, u64 *val) DEFINE_SIMPLE_ATTRIBUTE(vcpu_tsc_scaling_frac_fops, vcpu_get_tsc_scaling_frac_bits, NULL, "%llu\n"); -void kvm_arch_create_vcpu_debugfs(struct kvm_vcpu *vcpu) +void kvm_arch_create_vcpu_debugfs(struct kvm_vcpu *vcpu, struct dentry *debugfs_dentry) { - debugfs_create_file("tsc-offset", 0444, vcpu->debugfs_dentry, vcpu, + debugfs_create_file("tsc-offset", 0444, debugfs_dentry, vcpu, &vcpu_tsc_offset_fops); if (lapic_in_kernel(vcpu)) debugfs_create_file("lapic_timer_advance_ns", 0444, - vcpu->debugfs_dentry, vcpu, + debugfs_dentry, vcpu, &vcpu_timer_advance_ns_fops); if (kvm_has_tsc_control) { debugfs_create_file("tsc-scaling-ratio", 0444, - vcpu->debugfs_dentry, vcpu, + debugfs_dentry, vcpu, &vcpu_tsc_scaling_fops); debugfs_create_file("tsc-scaling-ratio-frac-bits", 0444, - vcpu->debugfs_dentry, vcpu, + debugfs_dentry, vcpu, &vcpu_tsc_scaling_frac_fops); } } diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index f43b59b1294c..d38d6b9c24be 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -318,7 +318,6 @@ struct kvm_vcpu { bool preempted; bool ready; struct kvm_vcpu_arch arch; - struct dentry *debugfs_dentry; }; static inline int kvm_vcpu_exiting_guest_mode(struct kvm_vcpu *vcpu) @@ -888,7 +887,7 @@ void kvm_arch_vcpu_postcreate(struct kvm_vcpu *vcpu); void kvm_arch_vcpu_destroy(struct kvm_vcpu *vcpu); #ifdef __KVM_HAVE_ARCH_VCPU_DEBUGFS -void kvm_arch_create_vcpu_debugfs(struct kvm_vcpu *vcpu); +void kvm_arch_create_vcpu_debugfs(struct kvm_vcpu *vcpu, struct dentry *debugfs_dentry); #endif int kvm_arch_hardware_enable(void); diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index 7fa1e38e1659..3577eb84eac0 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -2973,7 +2973,6 @@ static int kvm_vcpu_release(struct inode *inode, struct file *filp) { struct kvm_vcpu *vcpu = filp->private_data; - debugfs_remove_recursive(vcpu->debugfs_dentry); kvm_put_kvm(vcpu->kvm); return 0; } @@ -3000,16 +2999,17 @@ static int create_vcpu_fd(struct kvm_vcpu *vcpu) static void kvm_create_vcpu_debugfs(struct kvm_vcpu *vcpu) { #ifdef __KVM_HAVE_ARCH_VCPU_DEBUGFS + struct dentry *debugfs_dentry; char dir_name[ITOA_MAX_LEN * 2]; if (!debugfs_initialized()) return; snprintf(dir_name, sizeof(dir_name), "vcpu%d", vcpu->vcpu_id); - vcpu->debugfs_dentry = debugfs_create_dir(dir_name, - vcpu->kvm->debugfs_dentry); + debugfs_dentry = debugfs_create_dir(dir_name, + vcpu->kvm->debugfs_dentry); - kvm_arch_create_vcpu_debugfs(vcpu); + 
kvm_arch_create_vcpu_debugfs(vcpu, debugfs_dentry); #endif } -- cgit v1.2.3 From c25a26e653a61b93339dcd1b734c889df60b3eef Mon Sep 17 00:00:00 2001 From: Jason Wang Date: Fri, 29 May 2020 16:03:00 +0800 Subject: vdpa: introduce get_vq_notification method This patch introduces a new method in the vdpa_config_ops which reports the physical address and the size of the doorbell for a specific virtqueue. This will be used by future patches that map the doorbell to userspace. Signed-off-by: Jason Wang Link: https://lore.kernel.org/r/20200529080303.15449-4-jasowang@redhat.com Signed-off-by: Michael S. Tsirkin --- include/linux/vdpa.h | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) (limited to 'include/linux') diff --git a/include/linux/vdpa.h b/include/linux/vdpa.h index 5453af87a33e..239db794357c 100644 --- a/include/linux/vdpa.h +++ b/include/linux/vdpa.h @@ -17,6 +17,16 @@ struct vdpa_callback { void *private; }; +/** + * vDPA notification area + * @addr: base address of the notification area + * @size: size of the notification area + */ +struct vdpa_notification_area { + resource_size_t addr; + resource_size_t size; +}; + /** * vDPA device - representation of a vDPA device * @dev: underlying device @@ -73,6 +83,10 @@ struct vdpa_device { * @vdev: vdpa device * @idx: virtqueue index * Returns virtqueue state (last_avail_idx) + * @get_vq_notification: Get the notification area for a virtqueue + * @vdev: vdpa device + * @idx: virtqueue index + * Returns the notification area * @get_vq_align: Get the virtqueue align requirement * for the device * @vdev: vdpa device @@ -162,6 +176,8 @@ struct vdpa_config_ops { bool (*get_vq_ready)(struct vdpa_device *vdev, u16 idx); int (*set_vq_state)(struct vdpa_device *vdev, u16 idx, u64 state); u64 (*get_vq_state)(struct vdpa_device *vdev, u16 idx); + struct vdpa_notification_area + (*get_vq_notification)(struct vdpa_device *vdev, u16 idx); /* Device ops */ u32 (*get_vq_align)(struct vdpa_device *vdev); -- cgit v1.2.3 From aa218795cb5fd583c94fc838dc76b7379dc4976a Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Thu, 7 May 2020 16:01:30 +0200 Subject: mm: Allow to offline unmovable PageOffline() pages via MEM_GOING_OFFLINE virtio-mem wants to allow to offline memory blocks of which some parts were unplugged (allocated via alloc_contig_range()), especially, to later offline and remove completely unplugged memory blocks. The important part is that PageOffline() has to remain set until the section is offline, so these pages will never get accessed (e.g., when dumping). The pages should not be handed back to the buddy (which would require clearing PageOffline() and result in issues if offlining fails and the pages are suddenly in the buddy). Let's make that possible by allowing any PageOffline() page to be isolated when offlining. This way, we can reach the memory hotplug notifier MEM_GOING_OFFLINE, where the driver can signal that it is fine with offlining this page by dropping its reference count. PageOffline() pages with a reference count of 0 can then be skipped when offlining the pages (as if they were free, although they are not in the buddy). Anybody who uses PageOffline() pages and does not agree to offline them (e.g., Hyper-V balloon, XEN balloon, VMWare balloon for 2MB pages) will not decrement the reference count, making offlining fail when trying to migrate such an unmovable page. So there should be no observable change. The same applies to balloon compaction users (movable PageOffline() pages): the pages will simply be migrated.
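A driver-side sketch of this protocol (the my_driver_*() helpers are hypothetical bookkeeping for the driver's own PageOffline() ranges; the notifier plumbing itself is the standard memory-hotplug API):

#include <linux/memory.h>
#include <linux/notifier.h>

static int my_memory_notifier(struct notifier_block *nb,
			      unsigned long action, void *arg)
{
	struct memory_notify *mhp = arg;

	switch (action) {
	case MEM_GOING_OFFLINE:
		/*
		 * Drop our references so our PageOffline() pages in this
		 * range reach a refcount of 0 and can be skipped.
		 */
		my_driver_put_offline_pages(mhp->start_pfn, mhp->nr_pages);
		break;
	case MEM_CANCEL_OFFLINE:
		/* offlining failed: take the references back (Note 1) */
		my_driver_get_offline_pages(mhp->start_pfn, mhp->nr_pages);
		break;
	}
	return NOTIFY_OK;
}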
Note 1: If offlining fails, a driver has to increment the reference count again in MEM_CANCEL_OFFLINE. Note 2: A driver that makes use of this has to be aware that re-onlining the memory block has to be handled by hooking into onlining code (online_page_callback_t), re-setting the pages PageOffline() and not giving them to the buddy. Reviewed-by: Alexander Duyck Acked-by: Michal Hocko Tested-by: Pankaj Gupta Acked-by: Andrew Morton Cc: Andrew Morton Cc: Juergen Gross Cc: Konrad Rzeszutek Wilk Cc: Pavel Tatashin Cc: Alexander Duyck Cc: Vlastimil Babka Cc: Johannes Weiner Cc: Anthony Yznaga Cc: Michal Hocko Cc: Oscar Salvador Cc: Mel Gorman Cc: Mike Rapoport Cc: Dan Williams Cc: Anshuman Khandual Cc: Qian Cai Cc: Pingfan Liu Signed-off-by: David Hildenbrand Link: https://lore.kernel.org/r/20200507140139.17083-7-david@redhat.com Signed-off-by: Michael S. Tsirkin --- include/linux/page-flags.h | 10 ++++++++++ mm/memory_hotplug.c | 44 ++++++++++++++++++++++++++++++++++---------- mm/page_alloc.c | 24 ++++++++++++++++++++++++ mm/page_isolation.c | 9 +++++++++ 4 files changed, 77 insertions(+), 10 deletions(-) (limited to 'include/linux') diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h index 222f6f7b2bb3..6be1aa559b1e 100644 --- a/include/linux/page-flags.h +++ b/include/linux/page-flags.h @@ -777,6 +777,16 @@ PAGE_TYPE_OPS(Buddy, buddy) * not onlined when onlining the section). * The content of these pages is effectively stale. Such pages should not * be touched (read/write/dump/save) except by their owner. + * + * If a driver wants to allow to offline unmovable PageOffline() pages without + * putting them back to the buddy, it can do so via the memory notifier by + * decrementing the reference count in MEM_GOING_OFFLINE and incrementing the + * reference count in MEM_CANCEL_OFFLINE. When offlining, the PageOffline() + * pages (now with a reference count of zero) are treated like free pages, + * allowing the containing memory block to get offlined. A driver that + * relies on this feature is aware that re-onlining the memory block will + * require to re-set the pages PageOffline() and not giving them to the + * buddy via online_page_callback_t. */ PAGE_TYPE_OPS(Offline, offline) diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index fc0aad0bc1f5..008e4a7ed8bc 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c @@ -1224,11 +1224,17 @@ struct zone *test_pages_in_a_zone(unsigned long start_pfn, /* * Scan pfn range [start,end) to find movable/migratable pages (LRU pages, - * non-lru movable pages and hugepages). We scan pfn because it's much - * easier than scanning over linked list. This function returns the pfn - * of the first found movable page if it's found, otherwise 0. + * non-lru movable pages and hugepages). Will skip over most unmovable + * pages (esp., pages that can be skipped when offlining), but bail out on + * definitely unmovable pages. + * + * Returns: + * 0 in case a movable page is found and movable_pfn was updated. + * -ENOENT in case no movable page was found. + * -EBUSY in case a definitely unmovable page was found.
*/ -static unsigned long scan_movable_pages(unsigned long start, unsigned long end) +static int scan_movable_pages(unsigned long start, unsigned long end, + unsigned long *movable_pfn) { unsigned long pfn; @@ -1240,18 +1246,30 @@ static unsigned long scan_movable_pages(unsigned long start, unsigned long end) continue; page = pfn_to_page(pfn); if (PageLRU(page)) - return pfn; + goto found; if (__PageMovable(page)) - return pfn; + goto found; + + /* + * PageOffline() pages that are not marked __PageMovable() and + * have a reference count > 0 (after MEM_GOING_OFFLINE) are + * definitely unmovable. If their reference count would be 0, + * they could at least be skipped when offlining memory. + */ + if (PageOffline(page) && page_count(page)) + return -EBUSY; if (!PageHuge(page)) continue; head = compound_head(page); if (page_huge_active(head)) - return pfn; + goto found; skip = compound_nr(head) - (page - head); pfn += skip - 1; } + return -ENOENT; +found: + *movable_pfn = pfn; return 0; } @@ -1518,7 +1536,8 @@ static int __ref __offline_pages(unsigned long start_pfn, } do { - for (pfn = start_pfn; pfn;) { + pfn = start_pfn; + do { if (signal_pending(current)) { ret = -EINTR; reason = "signal backoff"; @@ -1528,14 +1547,19 @@ static int __ref __offline_pages(unsigned long start_pfn, cond_resched(); lru_add_drain_all(); - pfn = scan_movable_pages(pfn, end_pfn); - if (pfn) { + ret = scan_movable_pages(pfn, end_pfn, &pfn); + if (!ret) { /* * TODO: fatal migration failures should bail * out */ do_migrate_range(pfn, end_pfn); } + } while (!ret); + + if (ret != -ENOENT) { + reason = "unmovable page"; + goto failed_removal_isolated; } /* diff --git a/mm/page_alloc.c b/mm/page_alloc.c index ce1c9df54eac..9b1c49c3097d 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -8372,6 +8372,19 @@ struct page *has_unmovable_pages(struct zone *zone, struct page *page, if ((flags & MEMORY_OFFLINE) && PageHWPoison(page)) continue; + /* + * We treat all PageOffline() pages as movable when offlining + * to give drivers a chance to decrement their reference count + * in MEM_GOING_OFFLINE in order to indicate that these pages + * can be offlined as there are no direct references anymore. + * For actually unmovable PageOffline() where the driver does + * not support this, we will fail later when trying to actually + * move these pages that still have a reference count > 0. + * (false negatives in this function only) + */ + if ((flags & MEMORY_OFFLINE) && PageOffline(page)) + continue; + if (__PageMovable(page) || PageLRU(page)) continue; @@ -8792,6 +8805,17 @@ __offline_isolated_pages(unsigned long start_pfn, unsigned long end_pfn) offlined_pages++; continue; } + /* + * At this point all remaining PageOffline() pages have a + * reference count of 0 and can simply be skipped. + */ + if (PageOffline(page)) { + BUG_ON(page_count(page)); + BUG_ON(PageBuddy(page)); + pfn++; + offlined_pages++; + continue; + } BUG_ON(page_count(page)); BUG_ON(!PageBuddy(page)); diff --git a/mm/page_isolation.c b/mm/page_isolation.c index 2c11a38d6e87..f6d07c5f0d34 100644 --- a/mm/page_isolation.c +++ b/mm/page_isolation.c @@ -151,6 +151,7 @@ __first_valid_page(unsigned long pfn, unsigned long nr_pages) * a bit mask) * MEMORY_OFFLINE - isolate to offline (!allocate) memory * e.g., skip over PageHWPoison() pages + * and PageOffline() pages. 
* REPORT_FAILURE - report details about the failure to * isolate the range * @@ -259,6 +260,14 @@ __test_page_isolated_in_pageblock(unsigned long pfn, unsigned long end_pfn, else if ((flags & MEMORY_OFFLINE) && PageHWPoison(page)) /* A HWPoisoned page cannot be also PageBuddy */ pfn++; + else if ((flags & MEMORY_OFFLINE) && PageOffline(page) && + !page_count(page)) + /* + * The responsible driver agreed to skip PageOffline() + * pages when offlining memory by dropping its + * reference in MEM_GOING_OFFLINE. + */ + pfn++; else break; } -- cgit v1.2.3 From 08b3acd7a68fc17902e1cb6b146389322840deab Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Thu, 7 May 2020 16:01:32 +0200 Subject: mm/memory_hotplug: Introduce offline_and_remove_memory() virtio-mem wants to offline and remove a memory block once it has unplugged all subblocks (e.g., using alloc_contig_range()). Let's provide an interface to do that from a driver. virtio-mem already supports offlining partially unplugged memory blocks. Offlining a fully unplugged memory block will not require migrating any pages. All unplugged subblocks are PageOffline() and have a reference count of 0 - so offlining code will simply skip them. All we need is an interface to offline and remove the memory from kernel module context, where we don't have access to the memory block devices (esp. find_memory_block() and device_offline()) and the device hotplug lock. To keep things simple, only allow working on a single memory block. Acked-by: Michal Hocko Tested-by: Pankaj Gupta Acked-by: Andrew Morton Cc: Andrew Morton Cc: David Hildenbrand Cc: Oscar Salvador Cc: Michal Hocko Cc: Pavel Tatashin Cc: Wei Yang Cc: Dan Williams Cc: Qian Cai Signed-off-by: David Hildenbrand Link: https://lore.kernel.org/r/20200507140139.17083-9-david@redhat.com Signed-off-by: Michael S. Tsirkin --- include/linux/memory_hotplug.h | 1 + mm/memory_hotplug.c | 37 +++++++++++++++++++++++++++++++++++++ 2 files changed, 38 insertions(+) (limited to 'include/linux') diff --git a/include/linux/memory_hotplug.h b/include/linux/memory_hotplug.h index 93d9ada74ddd..cb7499843f5c 100644 --- a/include/linux/memory_hotplug.h +++ b/include/linux/memory_hotplug.h @@ -319,6 +319,7 @@ extern void try_offline_node(int nid); extern int offline_pages(unsigned long start_pfn, unsigned long nr_pages); extern int remove_memory(int nid, u64 start, u64 size); extern void __remove_memory(int nid, u64 start, u64 size); +extern int offline_and_remove_memory(int nid, u64 start, u64 size); #else static inline bool is_mem_section_removable(unsigned long pfn, diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index 008e4a7ed8bc..4acb99aa9bf4 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c @@ -1821,4 +1821,41 @@ int remove_memory(int nid, u64 start, u64 size) return rc; } EXPORT_SYMBOL_GPL(remove_memory); + +/* + * Try to offline and remove a memory block. Might take a long time to + * finish in case memory is still in use. Primarily useful for memory devices + * that logically unplugged all memory (so it's no longer in use) and want to + * offline + remove the memory block. + */ +int offline_and_remove_memory(int nid, u64 start, u64 size) +{ + struct memory_block *mem; + int rc = -EINVAL; + + if (!IS_ALIGNED(start, memory_block_size_bytes()) || + size != memory_block_size_bytes()) + return rc; + + lock_device_hotplug(); + mem = find_memory_block(__pfn_to_section(PFN_DOWN(start))); + if (mem) + rc = device_offline(&mem->dev);
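
For context, a hedged sketch of how a driver such as virtio-mem might call the interface added here (my_dev, its nid field and my_drv_unplug_block() are made up for illustration):

static int my_drv_unplug_block(struct my_dev *dev, u64 addr)
{
        u64 size = memory_block_size_bytes();

        if (WARN_ON(!IS_ALIGNED(addr, size)))
                return -EINVAL;

        /* May take a long time if the memory is still in use. */
        return offline_and_remove_memory(dev->nid, addr, size);
}

+ /* Ignore if the device is already offline.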
*/ + if (rc > 0) + rc = 0; + + /* + * In case we succeeded to offline the memory block, remove it. + * This cannot fail as it cannot get onlined in the meantime. + */ + if (!rc) { + rc = try_remove_memory(nid, start, size); + WARN_ON_ONCE(rc); + } + unlock_device_hotplug(); + + return rc; +} +EXPORT_SYMBOL_GPL(offline_and_remove_memory); #endif /* CONFIG_MEMORY_HOTREMOVE */ -- cgit v1.2.3 From 6501bf87602f799b7e502014f8bc0aa58b868277 Mon Sep 17 00:00:00 2001 From: "Ahmed S. Darwish" Date: Wed, 3 Jun 2020 16:49:46 +0200 Subject: u64_stats: Document writer non-preemptibility requirement MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The u64_stats mechanism uses sequence counters to protect against 64-bit values tearing on 32-bit architectures. Updating such statistics is a sequence counter write side critical section. Preemption must be disabled before entering this seqcount write critical section. Failing to do so, the seqcount read side can preempt the write side section and spin for the entire scheduler tick. If that reader belongs to a real-time scheduling class, it can spin forever and the kernel will livelock. Document this statistics update side non-preemptibility requirement. Reword the introductory paragraph to highlight u64_stats raison d'être: 64-bit values tearing protection on 32-bit architectures. Divide documentation on a basis of internal design vs. usage constraints. Reword the u64_stats header file top comment to always mention "Reader" or "Writer" at the start of each bullet point, making it easier to follow which side each point is actually for. Clarify the statement "whole thing is a NOOP on 64bit arches or UP kernels". For 32-bit UP kernels, preemption is always disabled for the statistics read side section. Signed-off-by: Ahmed S. Darwish Reviewed-by: Sebastian Andrzej Siewior Signed-off-by: David S. Miller --- include/linux/u64_stats_sync.h | 43 ++++++++++++++++++++++-------------------- 1 file changed, 23 insertions(+), 20 deletions(-) (limited to 'include/linux') diff --git a/include/linux/u64_stats_sync.h b/include/linux/u64_stats_sync.h index 9de5c10293f5..c6abb79501b3 100644 --- a/include/linux/u64_stats_sync.h +++ b/include/linux/u64_stats_sync.h @@ -3,33 +3,36 @@ #define _LINUX_U64_STATS_SYNC_H /* - * To properly implement 64bits network statistics on 32bit and 64bit hosts, - * we provide a synchronization point, that is a noop on 64bit or UP kernels. + * Protect against 64-bit values tearing on 32-bit architectures. This is + * typically used for statistics read/update in different subsystems. * * Key points : - * 1) Use a seqcount on SMP 32bits, with low overhead. - * 2) Whole thing is a noop on 64bit arches or UP kernels. - * 3) Write side must ensure mutual exclusion or one seqcount update could + * + * - Use a seqcount on 32-bit SMP, only disable preemption for 32-bit UP. + * - The whole thing is a no-op on 64-bit architectures. + * + * Usage constraints: + * + * 1) Write side must ensure mutual exclusion, or one seqcount update could * be lost, thus blocking readers forever. - * If this synchronization point is not a mutex, but a spinlock or - * spinlock_bh() or disable_bh() : - * 3.1) Write side should not sleep. - * 3.2) Write side should not allow preemption. - * 3.3) If applicable, interrupts should be disabled. 
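
To make the usage constraints being documented here concrete, a sketch (an editor's illustration; struct my_stats and the helpers are made up, while the u64_stats calls are the real API): the writer stays serialized and non-preemptible, the reader simply retries, matching the rewritten points below.

struct my_stats {
        u64                     bytes;
        struct u64_stats_sync   syncp;  /* u64_stats_init() at setup time */
};

/* Writer: must be serialized against other writers and must not be
 * preempted inside the critical section (points 1 and 2 below). */
static void my_stats_add(struct my_stats *s, u64 n)
{
        u64_stats_update_begin(&s->syncp);
        s->bytes += n;
        u64_stats_update_end(&s->syncp);
}

/* Reader: may sleep or be preempted; it performs pure reads and just
 * retries on a concurrent update (point 5 below). */
static u64 my_stats_read(struct my_stats *s)
{
        unsigned int start;
        u64 bytes;

        do {
                start = u64_stats_fetch_begin(&s->syncp);
                bytes = s->bytes;
        } while (u64_stats_fetch_retry(&s->syncp, start));

        return bytes;
}
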
* - * 4) If reader fetches several counters, there is no guarantee the whole values - * are consistent (remember point 1) : this is a noop on 64bit arches anyway) + * 2) Write side must disable preemption, or a seqcount reader can preempt the + * writer and also spin forever. + * + * 3) Write side must use the _irqsave() variant if other writers, or a reader, + * can be invoked from an IRQ context. * - * 5) readers are allowed to sleep or be preempted/interrupted : They perform - * pure reads. But if they have to fetch many values, it's better to not allow - * preemptions/interruptions to avoid many retries. + * 4) If reader fetches several counters, there is no guarantee the whole values + * are consistent w.r.t. each other (remember point #2: seqcounts are not + * used for 64bit architectures). * - * 6) If counter might be written by an interrupt, readers should block interrupts. - * (On UP, there is no seqcount_t protection, a reader allowing interrupts could - * read partial values) + * 5) Readers are allowed to sleep or be preempted/interrupted: they perform + * pure reads. * - * 7) For irq and softirq uses, readers can use u64_stats_fetch_begin_irq() and - * u64_stats_fetch_retry_irq() helpers + * 6) Readers must use both u64_stats_fetch_{begin,retry}_irq() if the stats + * might be updated from a hardirq or softirq context (remember point #1: + * seqcounts are not used for UP kernels). 32-bit UP stat readers could read + * corrupted 64-bit values otherwise. * * Usage : * -- cgit v1.2.3 From a1c979f330cb82cae7a3b19464f9815e43060fe3 Mon Sep 17 00:00:00 2001 From: Mikulas Patocka Date: Wed, 27 May 2020 07:04:46 -0400 Subject: dm bufio: delete unused and inefficient dm_bufio_discard_buffers There is no user for this interface. If in future it is needed it can be reimplemented to walk the rbtree of buffers instead of doing block-by-block lookups. Signed-off-by: Mikulas Patocka Signed-off-by: Mike Snitzer --- drivers/md/dm-bufio.c | 26 -------------------------- include/linux/dm-bufio.h | 7 ------- 2 files changed, 33 deletions(-) (limited to 'include/linux') diff --git a/drivers/md/dm-bufio.c b/drivers/md/dm-bufio.c index bf289be1ee3a..993e624e506c 100644 --- a/drivers/md/dm-bufio.c +++ b/drivers/md/dm-bufio.c @@ -1358,32 +1358,6 @@ int dm_bufio_issue_discard(struct dm_bufio_client *c, sector_t block, sector_t c } EXPORT_SYMBOL_GPL(dm_bufio_issue_discard); -/* - * Free the specified range of buffers. If a buffer is held by other process, it - * is not freed. If a buffer is dirty, it is discarded without writeback. - * Finally, send the discard request to the device. - */ -int dm_bufio_discard_buffers(struct dm_bufio_client *c, sector_t block, sector_t count) -{ - sector_t i; - - for (i = block; i < block + count; i++) { - struct dm_buffer *b; - dm_bufio_lock(c); - b = __find(c, i); - if (b && likely(!b->hold_count)) { - wait_on_bit_io(&b->state, B_READING, TASK_UNINTERRUPTIBLE); - wait_on_bit_io(&b->state, B_WRITING, TASK_UNINTERRUPTIBLE); - __unlink_buffer(b); - __free_buffer_wake(b); - } - dm_bufio_unlock(c); - } - - return dm_bufio_issue_discard(c, block, count); -} -EXPORT_SYMBOL_GPL(dm_bufio_discard_buffers); - /* * We first delete any other buffer that may be at that new location. 
* diff --git a/include/linux/dm-bufio.h b/include/linux/dm-bufio.h index 07e1f163e299..5ec6bfbde9ae 100644 --- a/include/linux/dm-bufio.h +++ b/include/linux/dm-bufio.h @@ -123,13 +123,6 @@ int dm_bufio_issue_flush(struct dm_bufio_client *c); */ int dm_bufio_issue_discard(struct dm_bufio_client *c, sector_t block, sector_t count); -/* - * Free the specified range of buffers. If a buffer is held by other process, it - * is not freed. If a buffer is dirty, it is discarded without writeback. - * Finally, send the discard request to the device. - */ -int dm_bufio_discard_buffers(struct dm_bufio_client *c, sector_t block, sector_t count); - /* * Like dm_bufio_release but also move the buffer to the new * block. dm_bufio_write_dirty_buffers is needed to commit the new block. -- cgit v1.2.3 From 5ff3b30ab57da82d8db4f14662a2858cabfbc2c0 Mon Sep 17 00:00:00 2001 From: Andrey Konovalov Date: Thu, 4 Jun 2020 16:46:04 -0700 Subject: kcov: collect coverage from interrupts This change extends kcov remote coverage support to allow collecting coverage from soft interrupts in addition to kernel background threads. To collect coverage from code that is executed in softirq context, a part of that code has to be annotated with kcov_remote_start/stop() in a similar way as how it is done for global kernel background threads. Then the handle used for the annotations has to be passed to the KCOV_REMOTE_ENABLE ioctl. Internally this patch adjusts the __sanitizer_cov_trace_pc() compiler inserted callback to not bail out when called from softirq context. kcov_remote_start/stop() are updated to save/restore the current per task kcov state in a per-cpu area (in case the softirq came when the kernel was already collecting coverage in task context). Coverage from softirqs is collected into pre-allocated per-cpu areas, whose size is controlled by the new CONFIG_KCOV_IRQ_AREA_SIZE. [andreyknvl@google.com: turn current->kcov_softirq into unsigned int to fix objtool warning] Link: http://lkml.kernel.org/r/841c778aa3849c5cb8c3761f56b87ce653a88671.1585233617.git.andreyknvl@google.com Signed-off-by: Andrey Konovalov Signed-off-by: Andrew Morton Reviewed-by: Dmitry Vyukov Cc: Alan Stern Cc: Alexander Potapenko Cc: Andrey Konovalov Cc: Greg Kroah-Hartman Cc: Marco Elver Link: http://lkml.kernel.org/r/469bd385c431d050bc38a593296eff4baae50666.1584655448.git.andreyknvl@google.com Signed-off-by: Linus Torvalds --- Documentation/dev-tools/kcov.rst | 17 ++-- include/linux/sched.h | 3 + kernel/kcov.c | 194 +++++++++++++++++++++++++++++++-------- lib/Kconfig.debug | 9 ++ 4 files changed, 176 insertions(+), 47 deletions(-) (limited to 'include/linux') diff --git a/Documentation/dev-tools/kcov.rst b/Documentation/dev-tools/kcov.rst index 1c4e1825d769..8548b0b04e43 100644 --- a/Documentation/dev-tools/kcov.rst +++ b/Documentation/dev-tools/kcov.rst @@ -217,14 +217,15 @@ This allows to collect coverage from two types of kernel background threads: the global ones, that are spawned during kernel boot in a limited number of instances (e.g. one USB hub_event() worker thread is spawned per USB HCD); and the local ones, that are spawned when a user interacts with -some kernel interface (e.g. vhost workers). +some kernel interface (e.g. vhost workers); as well as from soft +interrupts. -To enable collecting coverage from a global background thread, a unique -global handle must be assigned and passed to the corresponding -kcov_remote_start() call. 
Then a userspace process can pass a list of such -handles to the KCOV_REMOTE_ENABLE ioctl in the handles array field of the -kcov_remote_arg struct. This will attach the used kcov device to the code -sections, that are referenced by those handles. +To enable collecting coverage from a global background thread or from a +softirq, a unique global handle must be assigned and passed to the +corresponding kcov_remote_start() call. Then a userspace process can pass +a list of such handles to the KCOV_REMOTE_ENABLE ioctl in the handles +array field of the kcov_remote_arg struct. This will attach the used kcov +device to the code sections, that are referenced by those handles. Since there might be many local background threads spawned from different userspace processes, we can't use a single global handle per annotation. @@ -242,7 +243,7 @@ handles as they don't belong to a particular subsystem. The bytes 4-7 are currently reserved and must be zero. In the future the number of bytes used for the subsystem or handle ids might be increased. -When a particular userspace proccess collects coverage by via a common +When a particular userspace proccess collects coverage via a common handle, kcov will collect coverage for each code section that is annotated to use the common handle obtained as kcov_handle from the current task_struct. However non common handles allow to collect coverage diff --git a/include/linux/sched.h b/include/linux/sched.h index 57a5ce9f33c5..c5d96e3e7fff 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1247,6 +1247,9 @@ struct task_struct { /* KCOV sequence number: */ int kcov_sequence; + + /* Collect coverage from softirq context: */ + unsigned int kcov_softirq; #endif #ifdef CONFIG_MEMCG diff --git a/kernel/kcov.c b/kernel/kcov.c index 93b28ad2da28..55c5d883a93e 100644 --- a/kernel/kcov.c +++ b/kernel/kcov.c @@ -86,6 +86,18 @@ static DEFINE_SPINLOCK(kcov_remote_lock); static DEFINE_HASHTABLE(kcov_remote_map, 4); static struct list_head kcov_remote_areas = LIST_HEAD_INIT(kcov_remote_areas); +struct kcov_percpu_data { + void *irq_area; + + unsigned int saved_mode; + unsigned int saved_size; + void *saved_area; + struct kcov *saved_kcov; + int saved_sequence; +}; + +DEFINE_PER_CPU(struct kcov_percpu_data, kcov_percpu_data); + /* Must be called with kcov_remote_lock locked. */ static struct kcov_remote *kcov_remote_find(u64 handle) { @@ -145,9 +157,10 @@ static notrace bool check_kcov_mode(enum kcov_mode needed_mode, struct task_stru /* * We are interested in code coverage as a function of a syscall inputs, - * so we ignore code executed in interrupts. + * so we ignore code executed in interrupts, unless we are in a remote + * coverage collection section in a softirq. */ - if (!in_task()) + if (!in_task() && !(in_serving_softirq() && t->kcov_softirq)) return false; mode = READ_ONCE(t->kcov_mode); /* @@ -360,8 +373,9 @@ static void kcov_remote_reset(struct kcov *kcov) int bkt; struct kcov_remote *remote; struct hlist_node *tmp; + unsigned long flags; - spin_lock(&kcov_remote_lock); + spin_lock_irqsave(&kcov_remote_lock, flags); hash_for_each_safe(kcov_remote_map, bkt, tmp, remote, hnode) { if (remote->kcov != kcov) continue; @@ -370,7 +384,7 @@ static void kcov_remote_reset(struct kcov *kcov) } /* Do reset before unlock to prevent races with kcov_remote_start(). 
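
As a hedged userspace sketch of the flow described in the kcov.rst hunk above (fd is an open /sys/kernel/debug/kcov descriptor already set up with KCOV_INIT_TRACE and mmap() as in kcov.rst; COVER_SIZE and handle are assumed, error handling mostly omitted):

struct kcov_remote_arg *arg;

arg = calloc(1, sizeof(*arg) + sizeof(uint64_t));
arg->trace_mode = KCOV_TRACE_PC;
arg->area_size = COVER_SIZE;
arg->num_handles = 1;
arg->handles[0] = handle;       /* e.g. built with kcov_remote_handle() */
if (ioctl(fd, KCOV_REMOTE_ENABLE, arg))
        perror("ioctl(KCOV_REMOTE_ENABLE)");
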
*/ kcov_reset(kcov); - spin_unlock(&kcov_remote_lock); + spin_unlock_irqrestore(&kcov_remote_lock, flags); } static void kcov_disable(struct task_struct *t, struct kcov *kcov) @@ -399,12 +413,13 @@ static void kcov_put(struct kcov *kcov) void kcov_task_exit(struct task_struct *t) { struct kcov *kcov; + unsigned long flags; kcov = t->kcov; if (kcov == NULL) return; - spin_lock(&kcov->lock); + spin_lock_irqsave(&kcov->lock, flags); kcov_debug("t = %px, kcov->t = %px\n", t, kcov->t); /* * For KCOV_ENABLE devices we want to make sure that t->kcov->t == t, @@ -428,12 +443,12 @@ void kcov_task_exit(struct task_struct *t) * By combining all three checks into one we get: */ if (WARN_ON(kcov->t != t)) { - spin_unlock(&kcov->lock); + spin_unlock_irqrestore(&kcov->lock, flags); return; } /* Just to not leave dangling references behind. */ kcov_disable(t, kcov); - spin_unlock(&kcov->lock); + spin_unlock_irqrestore(&kcov->lock, flags); kcov_put(kcov); } @@ -444,12 +459,13 @@ static int kcov_mmap(struct file *filep, struct vm_area_struct *vma) struct kcov *kcov = vma->vm_file->private_data; unsigned long size, off; struct page *page; + unsigned long flags; area = vmalloc_user(vma->vm_end - vma->vm_start); if (!area) return -ENOMEM; - spin_lock(&kcov->lock); + spin_lock_irqsave(&kcov->lock, flags); size = kcov->size * sizeof(unsigned long); if (kcov->mode != KCOV_MODE_INIT || vma->vm_pgoff != 0 || vma->vm_end - vma->vm_start != size) { @@ -459,7 +475,7 @@ static int kcov_mmap(struct file *filep, struct vm_area_struct *vma) if (!kcov->area) { kcov->area = area; vma->vm_flags |= VM_DONTEXPAND; - spin_unlock(&kcov->lock); + spin_unlock_irqrestore(&kcov->lock, flags); for (off = 0; off < size; off += PAGE_SIZE) { page = vmalloc_to_page(kcov->area + off); if (vm_insert_page(vma, vma->vm_start + off, page)) @@ -468,7 +484,7 @@ static int kcov_mmap(struct file *filep, struct vm_area_struct *vma) return 0; } exit: - spin_unlock(&kcov->lock); + spin_unlock_irqrestore(&kcov->lock, flags); vfree(area); return res; } @@ -548,6 +564,7 @@ static int kcov_ioctl_locked(struct kcov *kcov, unsigned int cmd, int mode, i; struct kcov_remote_arg *remote_arg; struct kcov_remote *remote; + unsigned long flags; switch (cmd) { case KCOV_INIT_TRACE: @@ -620,17 +637,19 @@ static int kcov_ioctl_locked(struct kcov *kcov, unsigned int cmd, kcov->t = t; kcov->remote = true; kcov->remote_size = remote_arg->area_size; - spin_lock(&kcov_remote_lock); + spin_lock_irqsave(&kcov_remote_lock, flags); for (i = 0; i < remote_arg->num_handles; i++) { if (!kcov_check_handle(remote_arg->handles[i], false, true, false)) { - spin_unlock(&kcov_remote_lock); + spin_unlock_irqrestore(&kcov_remote_lock, + flags); kcov_disable(t, kcov); return -EINVAL; } remote = kcov_remote_add(kcov, remote_arg->handles[i]); if (IS_ERR(remote)) { - spin_unlock(&kcov_remote_lock); + spin_unlock_irqrestore(&kcov_remote_lock, + flags); kcov_disable(t, kcov); return PTR_ERR(remote); } @@ -638,20 +657,22 @@ static int kcov_ioctl_locked(struct kcov *kcov, unsigned int cmd, if (remote_arg->common_handle) { if (!kcov_check_handle(remote_arg->common_handle, true, false, false)) { - spin_unlock(&kcov_remote_lock); + spin_unlock_irqrestore(&kcov_remote_lock, + flags); kcov_disable(t, kcov); return -EINVAL; } remote = kcov_remote_add(kcov, remote_arg->common_handle); if (IS_ERR(remote)) { - spin_unlock(&kcov_remote_lock); + spin_unlock_irqrestore(&kcov_remote_lock, + flags); kcov_disable(t, kcov); return PTR_ERR(remote); } t->kcov_handle = remote_arg->common_handle; } - 
spin_unlock(&kcov_remote_lock); + spin_unlock_irqrestore(&kcov_remote_lock, flags); /* Put either in kcov_task_exit() or in KCOV_DISABLE. */ kcov_get(kcov); return 0; @@ -667,6 +688,7 @@ static long kcov_ioctl(struct file *filep, unsigned int cmd, unsigned long arg) struct kcov_remote_arg *remote_arg = NULL; unsigned int remote_num_handles; unsigned long remote_arg_size; + unsigned long flags; if (cmd == KCOV_REMOTE_ENABLE) { if (get_user(remote_num_handles, (unsigned __user *)(arg + @@ -687,9 +709,9 @@ static long kcov_ioctl(struct file *filep, unsigned int cmd, unsigned long arg) } kcov = filep->private_data; - spin_lock(&kcov->lock); + spin_lock_irqsave(&kcov->lock, flags); res = kcov_ioctl_locked(kcov, cmd, arg); - spin_unlock(&kcov->lock); + spin_unlock_irqrestore(&kcov->lock, flags); kfree(remote_arg); @@ -706,8 +728,8 @@ static const struct file_operations kcov_fops = { /* * kcov_remote_start() and kcov_remote_stop() can be used to annotate a section - * of code in a kernel background thread to allow kcov to be used to collect - * coverage from that part of code. + * of code in a kernel background thread or in a softirq to allow kcov to be + * used to collect coverage from that part of code. * * The handle argument of kcov_remote_start() identifies a code section that is * used for coverage collection. A userspace process passes this handle to @@ -718,9 +740,9 @@ static const struct file_operations kcov_fops = { * the type of the kernel thread whose code is being annotated. * * For global kernel threads that are spawned in a limited number of instances - * (e.g. one USB hub_event() worker thread is spawned per USB HCD), each - * instance must be assigned a unique 4-byte instance id. The instance id is - * then combined with a 1-byte subsystem id to get a handle via + * (e.g. one USB hub_event() worker thread is spawned per USB HCD) and for + * softirqs, each instance must be assigned a unique 4-byte instance id. The + * instance id is then combined with a 1-byte subsystem id to get a handle via * kcov_remote_handle(subsystem_id, instance_id). * * For local kernel threads that are spawned from system calls handler when a @@ -739,7 +761,7 @@ static const struct file_operations kcov_fops = { * * See Documentation/dev-tools/kcov.rst for more details. 
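
A hedged sketch of the annotation pattern this comment block describes, applied to the new softirq case (the tasklet, MY_DRV_KCOV_SUBSYS and my_drv_process() are made up; kcov_remote_start/stop() and kcov_remote_handle() are the real API):

static void my_drv_tasklet_fn(unsigned long data)
{
        kcov_remote_start(kcov_remote_handle(MY_DRV_KCOV_SUBSYS, 1));
        my_drv_process(data);   /* coverage collected into the remote area */
        kcov_remote_stop();
}
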
* - * Internally, this function looks up the kcov device associated with the + * Internally, kcov_remote_start() looks up the kcov device associated with the * provided handle, allocates an area for coverage collection, and saves the * pointers to kcov and area into the current task_struct to allow coverage to * be collected via __sanitizer_cov_trace_pc() @@ -752,6 +774,39 @@ static inline bool kcov_mode_enabled(unsigned int mode) return (mode & ~KCOV_IN_CTXSW) != KCOV_MODE_DISABLED; } +void kcov_remote_softirq_start(struct task_struct *t) +{ + struct kcov_percpu_data *data = this_cpu_ptr(&kcov_percpu_data); + unsigned int mode; + + mode = READ_ONCE(t->kcov_mode); + barrier(); + if (kcov_mode_enabled(mode)) { + data->saved_mode = mode; + data->saved_size = t->kcov_size; + data->saved_area = t->kcov_area; + data->saved_sequence = t->kcov_sequence; + data->saved_kcov = t->kcov; + kcov_stop(t); + } +} + +void kcov_remote_softirq_stop(struct task_struct *t) +{ + struct kcov_percpu_data *data = this_cpu_ptr(&kcov_percpu_data); + + if (data->saved_kcov) { + kcov_start(t, data->saved_kcov, data->saved_size, + data->saved_area, data->saved_mode, + data->saved_sequence); + data->saved_mode = 0; + data->saved_size = 0; + data->saved_area = NULL; + data->saved_sequence = 0; + data->saved_kcov = NULL; + } +} + void kcov_remote_start(u64 handle) { struct task_struct *t = current; @@ -761,28 +816,42 @@ void kcov_remote_start(u64 handle) void *area; unsigned int size; int sequence; + unsigned long flags; if (WARN_ON(!kcov_check_handle(handle, true, true, true))) return; - if (WARN_ON(!in_task())) + if (!in_task() && !in_serving_softirq()) return; + local_irq_save(flags); + /* - * Check that kcov_remote_start is not called twice - * nor called by user tasks (with enabled kcov). + * Check that kcov_remote_start() is not called twice in background + * threads nor called by user tasks (with enabled kcov). */ mode = READ_ONCE(t->kcov_mode); - if (WARN_ON(kcov_mode_enabled(mode))) + if (WARN_ON(in_task() && kcov_mode_enabled(mode))) { + local_irq_restore(flags); return; - - kcov_debug("handle = %llx\n", handle); + } + /* + * Check that kcov_remote_start() is not called twice in softirqs. + * Note, that kcov_remote_start() can be called from a softirq that + * happened while collecting coverage from a background thread. + */ + if (WARN_ON(in_serving_softirq() && t->kcov_softirq)) { + local_irq_restore(flags); + return; + } spin_lock(&kcov_remote_lock); remote = kcov_remote_find(handle); if (!remote) { - spin_unlock(&kcov_remote_lock); + spin_unlock_irqrestore(&kcov_remote_lock, flags); return; } + kcov_debug("handle = %llx, context: %s\n", handle, + in_task() ? "task" : "softirq"); kcov = remote->kcov; /* Put in kcov_remote_stop(). */ kcov_get(kcov); @@ -790,12 +859,18 @@ void kcov_remote_start(u64 handle) * Read kcov fields before unlock to prevent races with * KCOV_DISABLE / kcov_remote_reset(). */ - size = kcov->remote_size; mode = kcov->mode; sequence = kcov->sequence; - area = kcov_remote_area_get(size); - spin_unlock(&kcov_remote_lock); + if (in_task()) { + size = kcov->remote_size; + area = kcov_remote_area_get(size); + } else { + size = CONFIG_KCOV_IRQ_AREA_SIZE; + area = this_cpu_ptr(&kcov_percpu_data)->irq_area; + } + spin_unlock_irqrestore(&kcov_remote_lock, flags); + /* Can only happen when in_task(). */ if (!area) { area = vmalloc(size * sizeof(unsigned long)); if (!area) { @@ -803,11 +878,20 @@ void kcov_remote_start(u64 handle) return; } } + + local_irq_save(flags); + /* Reset coverage size. 
*/ *(u64 *)area = 0; + if (in_serving_softirq()) { + kcov_remote_softirq_start(t); + t->kcov_softirq = 1; + } kcov_start(t, kcov, size, area, mode, sequence); + local_irq_restore(flags); + } EXPORT_SYMBOL(kcov_remote_start); @@ -875,31 +959,53 @@ void kcov_remote_stop(void) void *area; unsigned int size; int sequence; + unsigned long flags; + + if (!in_task() && !in_serving_softirq()) + return; + + local_irq_save(flags); mode = READ_ONCE(t->kcov_mode); barrier(); - if (!kcov_mode_enabled(mode)) + if (!kcov_mode_enabled(mode)) { + local_irq_restore(flags); return; + } kcov = t->kcov; area = t->kcov_area; size = t->kcov_size; sequence = t->kcov_sequence; + if (WARN_ON(!in_serving_softirq() && t->kcov_softirq)) { + local_irq_restore(flags); + return; + } + kcov_stop(t); + if (in_serving_softirq()) { + t->kcov_softirq = 0; + kcov_remote_softirq_stop(t); + } spin_lock(&kcov->lock); /* * KCOV_DISABLE could have been called between kcov_remote_start() - * and kcov_remote_stop(), hence the check. + * and kcov_remote_stop(), hence the sequence check. */ if (sequence == kcov->sequence && kcov->remote) kcov_move_area(kcov->mode, kcov->area, kcov->size, area); spin_unlock(&kcov->lock); - spin_lock(&kcov_remote_lock); - kcov_remote_area_put(area, size); - spin_unlock(&kcov_remote_lock); + if (in_task()) { + spin_lock(&kcov_remote_lock); + kcov_remote_area_put(area, size); + spin_unlock(&kcov_remote_lock); + } + local_irq_restore(flags); + + /* Get in kcov_remote_start(). */ kcov_put(kcov); } EXPORT_SYMBOL(kcov_remote_stop); @@ -913,6 +1019,16 @@ EXPORT_SYMBOL(kcov_common_handle); static int __init kcov_init(void) { + int cpu; + + for_each_possible_cpu(cpu) { + void *area = vmalloc(CONFIG_KCOV_IRQ_AREA_SIZE * + sizeof(unsigned long)); + if (!area) + return -ENOMEM; + per_cpu_ptr(&kcov_percpu_data, cpu)->irq_area = area; + } + /* * The kcov debugfs file won't ever get removed and thus, * there is no need to protect it against removal races. The diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug index 0217ed126f77..d07ab3e056cd 100644 --- a/lib/Kconfig.debug +++ b/lib/Kconfig.debug @@ -1774,6 +1774,15 @@ config KCOV_INSTRUMENT_ALL filesystem fuzzing with AFL) then you will want to enable coverage for more specific subsets of files, and should say n here. +config KCOV_IRQ_AREA_SIZE + hex "Size of interrupt coverage collection area in words" + depends on KCOV + default 0x40000 + help + KCOV uses preallocated per-cpu areas to collect coverage from + soft interrupts. This specifies the size of those areas in the + number of unsigned long words. 
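
For scale (editor's note, assuming 8-byte unsigned long on 64-bit): the default of 0x40000 words works out to

        0x40000 words * 8 bytes = 2 MiB of irq_area per possible CPU,

allocated up front in kcov_init() as shown above.
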
+ menuconfig RUNTIME_TESTING_MENU bool "Runtime Testing" def_bool y -- cgit v1.2.3 From f089dcc74226b874a4d4b122854e0dea91ff72d8 Mon Sep 17 00:00:00 2001 From: Mike Rapoport Date: Thu, 4 Jun 2020 16:47:08 -0700 Subject: mm: remove __ARCH_HAS_5LEVEL_HACK and include/asm-generic/5level-fixup.h There are no architectures that use include/asm-generic/5level-fixup.h therefore it can be removed along with __ARCH_HAS_5LEVEL_HACK define and the code it surrounds Signed-off-by: Mike Rapoport Signed-off-by: Andrew Morton Cc: Arnd Bergmann Cc: Benjamin Herrenschmidt Cc: Brian Cain Cc: Catalin Marinas Cc: Christophe Leroy Cc: Fenghua Yu Cc: Geert Uytterhoeven Cc: Guan Xuetao Cc: James Morse Cc: Jonas Bonn Cc: Julien Thierry Cc: Ley Foon Tan Cc: Marc Zyngier Cc: Michael Ellerman Cc: Paul Mackerras Cc: Rich Felker Cc: Russell King Cc: Stafford Horne Cc: Stefan Kristiansson Cc: Suzuki K Poulose Cc: Tony Luck Cc: Will Deacon Cc: Yoshinori Sato Link: http://lkml.kernel.org/r/20200414153455.21744-15-rppt@kernel.org Signed-off-by: Linus Torvalds --- include/asm-generic/5level-fixup.h | 59 -------------------------------------- include/linux/mm.h | 7 ----- mm/kasan/init.c | 11 ------- mm/memory.c | 8 ------ 4 files changed, 85 deletions(-) delete mode 100644 include/asm-generic/5level-fixup.h (limited to 'include/linux') diff --git a/include/asm-generic/5level-fixup.h b/include/asm-generic/5level-fixup.h deleted file mode 100644 index 58046ddc08d0..000000000000 --- a/include/asm-generic/5level-fixup.h +++ /dev/null @@ -1,59 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -#ifndef _5LEVEL_FIXUP_H -#define _5LEVEL_FIXUP_H - -#define __ARCH_HAS_5LEVEL_HACK -#define __PAGETABLE_P4D_FOLDED 1 - -#define P4D_SHIFT PGDIR_SHIFT -#define P4D_SIZE PGDIR_SIZE -#define P4D_MASK PGDIR_MASK -#define MAX_PTRS_PER_P4D 1 -#define PTRS_PER_P4D 1 - -#define p4d_t pgd_t - -#define pud_alloc(mm, p4d, address) \ - ((unlikely(pgd_none(*(p4d))) && __pud_alloc(mm, p4d, address)) ? \ - NULL : pud_offset(p4d, address)) - -#define p4d_alloc(mm, pgd, address) (pgd) -#define p4d_alloc_track(mm, pgd, address, mask) (pgd) -#define p4d_offset(pgd, start) (pgd) - -#ifndef __ASSEMBLY__ -static inline int p4d_none(p4d_t p4d) -{ - return 0; -} - -static inline int p4d_bad(p4d_t p4d) -{ - return 0; -} - -static inline int p4d_present(p4d_t p4d) -{ - return 1; -} -#endif - -#define p4d_ERROR(p4d) do { } while (0) -#define p4d_clear(p4d) pgd_clear(p4d) -#define p4d_val(p4d) pgd_val(p4d) -#define p4d_populate(mm, p4d, pud) pgd_populate(mm, p4d, pud) -#define p4d_populate_safe(mm, p4d, pud) pgd_populate(mm, p4d, pud) -#define p4d_page(p4d) pgd_page(p4d) -#define p4d_page_vaddr(p4d) pgd_page_vaddr(p4d) - -#define __p4d(x) __pgd(x) -#define set_p4d(p4dp, p4d) set_pgd(p4dp, p4d) - -#undef p4d_free_tlb -#define p4d_free_tlb(tlb, x, addr) do { } while (0) -#define p4d_free(mm, x) do { } while (0) - -#undef p4d_addr_end -#define p4d_addr_end(addr, end) (end) - -#endif diff --git a/include/linux/mm.h b/include/linux/mm.h index 66e0977f970a..e220ce5185ad 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -2069,11 +2069,6 @@ int __pte_alloc_kernel(pmd_t *pmd); #if defined(CONFIG_MMU) -/* - * The following ifdef needed to get the 5level-fixup.h header to work. - * Remove it when 5level-fixup.h has been removed. 
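
With the hack gone, a page-table allocation walk is spelled the same way on every architecture (editor's sketch; on machines with fewer than five levels, pgtable-nop4d.h folds p4d into pgd, so the extra call compiles away):

        pgd = pgd_offset(mm, addr);
        p4d = p4d_alloc(mm, pgd, addr);
        pud = pud_alloc(mm, p4d, addr);
        pmd = pmd_alloc(mm, pud, addr);
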
- */ -#ifndef __ARCH_HAS_5LEVEL_HACK static inline p4d_t *p4d_alloc(struct mm_struct *mm, pgd_t *pgd, unsigned long address) { @@ -2102,8 +2097,6 @@ static inline p4d_t *p4d_alloc_track(struct mm_struct *mm, pgd_t *pgd, return p4d_offset(pgd, address); } -#endif /* !__ARCH_HAS_5LEVEL_HACK */ - static inline pud_t *pud_alloc_track(struct mm_struct *mm, p4d_t *p4d, unsigned long address, pgtbl_mod_mask *mod_mask) diff --git a/mm/kasan/init.c b/mm/kasan/init.c index ce45c491ebcd..fe6be0be1f76 100644 --- a/mm/kasan/init.c +++ b/mm/kasan/init.c @@ -250,20 +250,9 @@ int __ref kasan_populate_early_shadow(const void *shadow_start, * 3,2 - level page tables where we don't have * puds,pmds, so pgd_populate(), pud_populate() * is noops. - * - * The ifndef is required to avoid build breakage. - * - * With 5level-fixup.h, pgd_populate() is not nop and - * we reference kasan_early_shadow_p4d. It's not defined - * unless 5-level paging enabled. - * - * The ifndef can be dropped once all KASAN-enabled - * architectures will switch to pgtable-nop4d.h. */ -#ifndef __ARCH_HAS_5LEVEL_HACK pgd_populate(&init_mm, pgd, lm_alias(kasan_early_shadow_p4d)); -#endif p4d = p4d_offset(pgd, addr); p4d_populate(&init_mm, p4d, lm_alias(kasan_early_shadow_pud)); diff --git a/mm/memory.c b/mm/memory.c index 7b70398f76a0..60c279295fce 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -4436,19 +4436,11 @@ int __pud_alloc(struct mm_struct *mm, p4d_t *p4d, unsigned long address) smp_wmb(); /* See comment in __pte_alloc */ spin_lock(&mm->page_table_lock); -#ifndef __ARCH_HAS_5LEVEL_HACK if (!p4d_present(*p4d)) { mm_inc_nr_puds(mm); p4d_populate(mm, p4d, new); } else /* Another has populated it */ pud_free(mm, new); -#else - if (!pgd_present(*p4d)) { - mm_inc_nr_puds(mm); - pgd_populate(mm, p4d, new); - } else /* Another has populated it */ - pud_free(mm, new); -#endif /* __ARCH_HAS_5LEVEL_HACK */ spin_unlock(&mm->page_table_lock); return 0; } -- cgit v1.2.3 From 525aaf9bad00e7454b9f9b3873e92795afb59f8e Mon Sep 17 00:00:00 2001 From: Ira Weiny Date: Thu, 4 Jun 2020 16:47:30 -0700 Subject: arch/kmap: remove redundant arch specific kmaps MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The kmap code for all the architectures is almost 100% identical. Lift the common code to the core. Use ARCH_HAS_KMAP_FLUSH_TLB to indicate if an arch defines kmap_flush_tlb() and call it if needed. This also has the benefit of changing kmap() on a number of architectures to be an inline call rather than an actual function. Signed-off-by: Ira Weiny Signed-off-by: Andrew Morton Reviewed-by: Christoph Hellwig Cc: Al Viro Cc: Andy Lutomirski Cc: Benjamin Herrenschmidt Cc: Borislav Petkov Cc: Christian König Cc: Chris Zankel Cc: Daniel Vetter Cc: Dan Williams Cc: Dave Hansen Cc: "David S. Miller" Cc: Helge Deller Cc: "H. Peter Anvin" Cc: Ingo Molnar Cc: "James E.J.
Bottomley" Cc: Max Filippov Cc: Paul Mackerras Cc: Peter Zijlstra Cc: Thomas Bogendoerfer Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/20200507150004.1423069-4-ira.weiny@intel.com Signed-off-by: Linus Torvalds --- arch/arc/include/asm/highmem.h | 2 -- arch/arc/mm/highmem.c | 10 ---------- arch/arm/include/asm/highmem.h | 2 -- arch/arm/mm/highmem.c | 9 --------- arch/csky/include/asm/highmem.h | 4 ++-- arch/csky/mm/highmem.c | 14 ++++---------- arch/microblaze/include/asm/highmem.h | 9 --------- arch/mips/include/asm/highmem.h | 4 ++-- arch/mips/mm/highmem.c | 14 +++----------- arch/nds32/include/asm/highmem.h | 2 -- arch/nds32/mm/highmem.c | 12 ------------ arch/powerpc/include/asm/highmem.h | 9 --------- arch/sparc/include/asm/highmem.h | 9 --------- arch/x86/include/asm/highmem.h | 2 -- arch/x86/mm/highmem_32.c | 9 --------- arch/xtensa/include/asm/highmem.h | 9 --------- include/linux/highmem.h | 18 ++++++++++++++++++ 17 files changed, 29 insertions(+), 109 deletions(-) (limited to 'include/linux') diff --git a/arch/arc/include/asm/highmem.h b/arch/arc/include/asm/highmem.h index 042e92921c4c..96eb67c86961 100644 --- a/arch/arc/include/asm/highmem.h +++ b/arch/arc/include/asm/highmem.h @@ -30,8 +30,6 @@ #include -extern void *kmap(struct page *page); -extern void *kmap_high(struct page *page); extern void *kmap_atomic(struct page *page); extern void __kunmap_atomic(void *kvaddr); extern void kunmap_high(struct page *page); diff --git a/arch/arc/mm/highmem.c b/arch/arc/mm/highmem.c index 39ef7b9a3aa9..4db13a6b9f3b 100644 --- a/arch/arc/mm/highmem.c +++ b/arch/arc/mm/highmem.c @@ -49,16 +49,6 @@ extern pte_t * pkmap_page_table; static pte_t * fixmap_page_table; -void *kmap(struct page *page) -{ - might_sleep(); - if (!PageHighMem(page)) - return page_address(page); - - return kmap_high(page); -} -EXPORT_SYMBOL(kmap); - void *kmap_atomic(struct page *page) { int idx, cpu_idx; diff --git a/arch/arm/include/asm/highmem.h b/arch/arm/include/asm/highmem.h index eb4e4207cd3c..c917522541de 100644 --- a/arch/arm/include/asm/highmem.h +++ b/arch/arm/include/asm/highmem.h @@ -20,7 +20,6 @@ extern pte_t *pkmap_page_table; -extern void *kmap_high(struct page *page); extern void kunmap_high(struct page *page); /* @@ -63,7 +62,6 @@ static inline void *kmap_high_get(struct page *page) * when CONFIG_HIGHMEM is not set. 
*/ #ifdef CONFIG_HIGHMEM -extern void *kmap(struct page *page); extern void kunmap(struct page *page); extern void *kmap_atomic(struct page *page); extern void __kunmap_atomic(void *kvaddr); diff --git a/arch/arm/mm/highmem.c b/arch/arm/mm/highmem.c index cc6eb79ef20c..e8ba37c36590 100644 --- a/arch/arm/mm/highmem.c +++ b/arch/arm/mm/highmem.c @@ -31,15 +31,6 @@ static inline pte_t get_fixmap_pte(unsigned long vaddr) return *ptep; } -void *kmap(struct page *page) -{ - might_sleep(); - if (!PageHighMem(page)) - return page_address(page); - return kmap_high(page); -} -EXPORT_SYMBOL(kmap); - void kunmap(struct page *page) { might_sleep(); diff --git a/arch/csky/include/asm/highmem.h b/arch/csky/include/asm/highmem.h index a345a2f2c22e..9d0516e38110 100644 --- a/arch/csky/include/asm/highmem.h +++ b/arch/csky/include/asm/highmem.h @@ -30,10 +30,10 @@ extern pte_t *pkmap_page_table; #define PKMAP_NR(virt) ((virt-PKMAP_BASE) >> PAGE_SHIFT) #define PKMAP_ADDR(nr) (PKMAP_BASE + ((nr) << PAGE_SHIFT)) -extern void *kmap_high(struct page *page); extern void kunmap_high(struct page *page); -extern void *kmap(struct page *page); +#define ARCH_HAS_KMAP_FLUSH_TLB +extern void kmap_flush_tlb(unsigned long addr); extern void kunmap(struct page *page); extern void *kmap_atomic(struct page *page); extern void __kunmap_atomic(void *kvaddr); diff --git a/arch/csky/mm/highmem.c b/arch/csky/mm/highmem.c index 690d678649d1..4a3c273bc8b9 100644 --- a/arch/csky/mm/highmem.c +++ b/arch/csky/mm/highmem.c @@ -13,18 +13,12 @@ static pte_t *kmap_pte; unsigned long highstart_pfn, highend_pfn; -void *kmap(struct page *page) +void kmap_flush_tlb(unsigned long addr) { - void *addr; - - might_sleep(); - if (!PageHighMem(page)) - return page_address(page); - addr = kmap_high(page); - flush_tlb_one((unsigned long)addr); - - return addr; + flush_tlb_one(addr); } +EXPORT_SYMBOL(kmap_flush_tlb); + EXPORT_SYMBOL(kmap); void kunmap(struct page *page) diff --git a/arch/microblaze/include/asm/highmem.h b/arch/microblaze/include/asm/highmem.h index 99ced7278b5c..8c5bfd228bd8 100644 --- a/arch/microblaze/include/asm/highmem.h +++ b/arch/microblaze/include/asm/highmem.h @@ -51,19 +51,10 @@ extern pte_t *pkmap_page_table; #define PKMAP_NR(virt) ((virt - PKMAP_BASE) >> PAGE_SHIFT) #define PKMAP_ADDR(nr) (PKMAP_BASE + ((nr) << PAGE_SHIFT)) -extern void *kmap_high(struct page *page); extern void kunmap_high(struct page *page); extern void *kmap_atomic_prot(struct page *page, pgprot_t prot); extern void __kunmap_atomic(void *kvaddr); -static inline void *kmap(struct page *page) -{ - might_sleep(); - if (!PageHighMem(page)) - return page_address(page); - return kmap_high(page); -} - static inline void kunmap(struct page *page) { might_sleep(); diff --git a/arch/mips/include/asm/highmem.h b/arch/mips/include/asm/highmem.h index 9d84aafc33d0..1f741e3ecabf 100644 --- a/arch/mips/include/asm/highmem.h +++ b/arch/mips/include/asm/highmem.h @@ -46,10 +46,10 @@ extern pte_t *pkmap_page_table; #define PKMAP_NR(virt) ((virt-PKMAP_BASE) >> PAGE_SHIFT) #define PKMAP_ADDR(nr) (PKMAP_BASE + ((nr) << PAGE_SHIFT)) -extern void * kmap_high(struct page *page); extern void kunmap_high(struct page *page); -extern void *kmap(struct page *page); +#define ARCH_HAS_KMAP_FLUSH_TLB +extern void kmap_flush_tlb(unsigned long addr); extern void kunmap(struct page *page); extern void *kmap_atomic(struct page *page); extern void __kunmap_atomic(void *kvaddr); diff --git a/arch/mips/mm/highmem.c b/arch/mips/mm/highmem.c index edd889f6cede..c72058bfead6 100644 --- 
a/arch/mips/mm/highmem.c +++ b/arch/mips/mm/highmem.c @@ -12,19 +12,11 @@ static pte_t *kmap_pte; unsigned long highstart_pfn, highend_pfn; -void *kmap(struct page *page) +void kmap_flush_tlb(unsigned long addr) { - void *addr; - - might_sleep(); - if (!PageHighMem(page)) - return page_address(page); - addr = kmap_high(page); - flush_tlb_one((unsigned long)addr); - - return addr; + flush_tlb_one(addr); } -EXPORT_SYMBOL(kmap); +EXPORT_SYMBOL(kmap_flush_tlb); void kunmap(struct page *page) { diff --git a/arch/nds32/include/asm/highmem.h b/arch/nds32/include/asm/highmem.h index b3a82c97ded3..b13654a79069 100644 --- a/arch/nds32/include/asm/highmem.h +++ b/arch/nds32/include/asm/highmem.h @@ -44,7 +44,6 @@ extern unsigned long highstart_pfn, highend_pfn; extern pte_t *pkmap_page_table; -extern void *kmap_high(struct page *page); extern void kunmap_high(struct page *page); extern void kmap_init(void); @@ -54,7 +53,6 @@ extern void kmap_init(void); * when CONFIG_HIGHMEM is not set. */ #ifdef CONFIG_HIGHMEM -extern void *kmap(struct page *page); extern void kunmap(struct page *page); extern void *kmap_atomic(struct page *page); extern void __kunmap_atomic(void *kvaddr); diff --git a/arch/nds32/mm/highmem.c b/arch/nds32/mm/highmem.c index 4c7c28e994ea..d0cde53b84ae 100644 --- a/arch/nds32/mm/highmem.c +++ b/arch/nds32/mm/highmem.c @@ -10,18 +10,6 @@ #include #include -void *kmap(struct page *page) -{ - unsigned long vaddr; - might_sleep(); - if (!PageHighMem(page)) - return page_address(page); - vaddr = (unsigned long)kmap_high(page); - return (void *)vaddr; -} - -EXPORT_SYMBOL(kmap); - void kunmap(struct page *page) { might_sleep(); diff --git a/arch/powerpc/include/asm/highmem.h b/arch/powerpc/include/asm/highmem.h index 529512f6d65a..f14e4feef6d5 100644 --- a/arch/powerpc/include/asm/highmem.h +++ b/arch/powerpc/include/asm/highmem.h @@ -59,19 +59,10 @@ extern pte_t *pkmap_page_table; #define PKMAP_NR(virt) ((virt-PKMAP_BASE) >> PAGE_SHIFT) #define PKMAP_ADDR(nr) (PKMAP_BASE + ((nr) << PAGE_SHIFT)) -extern void *kmap_high(struct page *page); extern void kunmap_high(struct page *page); extern void *kmap_atomic_prot(struct page *page, pgprot_t prot); extern void __kunmap_atomic(void *kvaddr); -static inline void *kmap(struct page *page) -{ - might_sleep(); - if (!PageHighMem(page)) - return page_address(page); - return kmap_high(page); -} - static inline void kunmap(struct page *page) { might_sleep(); diff --git a/arch/sparc/include/asm/highmem.h b/arch/sparc/include/asm/highmem.h index 7dd2d4b3f980..2ff1192047f7 100644 --- a/arch/sparc/include/asm/highmem.h +++ b/arch/sparc/include/asm/highmem.h @@ -50,17 +50,8 @@ void kmap_init(void) __init; #define PKMAP_END (PKMAP_ADDR(LAST_PKMAP)) -void *kmap_high(struct page *page); void kunmap_high(struct page *page); -static inline void *kmap(struct page *page) -{ - might_sleep(); - if (!PageHighMem(page)) - return page_address(page); - return kmap_high(page); -} - static inline void kunmap(struct page *page) { might_sleep(); diff --git a/arch/x86/include/asm/highmem.h b/arch/x86/include/asm/highmem.h index a8059930056d..c916a28a9738 100644 --- a/arch/x86/include/asm/highmem.h +++ b/arch/x86/include/asm/highmem.h @@ -58,10 +58,8 @@ extern unsigned long highstart_pfn, highend_pfn; #define PKMAP_NR(virt) ((virt-PKMAP_BASE) >> PAGE_SHIFT) #define PKMAP_ADDR(nr) (PKMAP_BASE + ((nr) << PAGE_SHIFT)) -extern void *kmap_high(struct page *page); extern void kunmap_high(struct page *page); -void *kmap(struct page *page); void kunmap(struct page *page); void 
*kmap_atomic_prot(struct page *page, pgprot_t prot); diff --git a/arch/x86/mm/highmem_32.c b/arch/x86/mm/highmem_32.c index 8af66382672b..12591a81b85c 100644 --- a/arch/x86/mm/highmem_32.c +++ b/arch/x86/mm/highmem_32.c @@ -4,15 +4,6 @@ #include /* for totalram_pages */ #include -void *kmap(struct page *page) -{ - might_sleep(); - if (!PageHighMem(page)) - return page_address(page); - return kmap_high(page); -} -EXPORT_SYMBOL(kmap); - void kunmap(struct page *page) { might_sleep(); diff --git a/arch/xtensa/include/asm/highmem.h b/arch/xtensa/include/asm/highmem.h index a9587c85be85..2546b88ddecf 100644 --- a/arch/xtensa/include/asm/highmem.h +++ b/arch/xtensa/include/asm/highmem.h @@ -63,17 +63,8 @@ static inline wait_queue_head_t *get_pkmap_wait_queue_head(unsigned int color) extern pte_t *pkmap_page_table; -void *kmap_high(struct page *page); void kunmap_high(struct page *page); -static inline void *kmap(struct page *page) -{ - might_sleep(); - if (!PageHighMem(page)) - return page_address(page); - return kmap_high(page); -} - static inline void kunmap(struct page *page) { might_sleep(); diff --git a/include/linux/highmem.h b/include/linux/highmem.h index ea5cdbd8c2c3..fc3adc51254a 100644 --- a/include/linux/highmem.h +++ b/include/linux/highmem.h @@ -34,6 +34,24 @@ static inline void invalidate_kernel_vmap_range(void *vaddr, int size) #ifdef CONFIG_HIGHMEM #include +#ifndef ARCH_HAS_KMAP_FLUSH_TLB +static inline void kmap_flush_tlb(unsigned long addr) { } +#endif + +void *kmap_high(struct page *page); +static inline void *kmap(struct page *page) +{ + void *addr; + + might_sleep(); + if (!PageHighMem(page)) + addr = page_address(page); + else + addr = kmap_high(page); + kmap_flush_tlb((unsigned long)addr); + return addr; +} + /* declarations for linux/mm/highmem.c */ unsigned int nr_free_highpages(void); extern atomic_long_t _totalhigh_pages; -- cgit v1.2.3 From e23c45976f82ac789469c37e4d5a72ea2ce30bba Mon Sep 17 00:00:00 2001 From: Ira Weiny Date: Thu, 4 Jun 2020 16:47:34 -0700 Subject: arch/kunmap: remove duplicate kunmap implementations MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit All architectures do exactly the same thing for kunmap(); remove all the duplicate definitions and lift the call to the core. This also has the benefit of changing kmap_unmap() on a number of architectures to be an inline call rather than an actual function. [akpm@linux-foundation.org: fix CONFIG_HIGHMEM=n build on various architectures] Signed-off-by: Ira Weiny Signed-off-by: Andrew Morton Reviewed-by: Christoph Hellwig Cc: Al Viro Cc: Andy Lutomirski Cc: Benjamin Herrenschmidt Cc: Borislav Petkov Cc: Christian König Cc: Chris Zankel Cc: Daniel Vetter Cc: Dan Williams Cc: Dave Hansen Cc: "David S. Miller" Cc: Helge Deller Cc: "H. Peter Anvin" Cc: Ingo Molnar Cc: "James E.J. 
Bottomley" Cc: Max Filippov Cc: Paul Mackerras Cc: Peter Zijlstra Cc: Thomas Bogendoerfer Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/20200507150004.1423069-5-ira.weiny@intel.com Signed-off-by: Linus Torvalds --- arch/arc/include/asm/highmem.h | 10 ---------- arch/arm/include/asm/highmem.h | 3 --- arch/arm/mm/highmem.c | 9 --------- arch/csky/include/asm/highmem.h | 3 --- arch/csky/mm/highmem.c | 9 --------- arch/microblaze/include/asm/highmem.h | 9 --------- arch/mips/include/asm/highmem.h | 3 --- arch/mips/mm/highmem.c | 9 --------- arch/nds32/include/asm/highmem.h | 3 --- arch/nds32/mm/highmem.c | 10 ---------- arch/powerpc/include/asm/highmem.h | 9 --------- arch/sparc/include/asm/highmem.h | 10 ---------- arch/x86/include/asm/highmem.h | 4 ---- arch/x86/mm/highmem_32.c | 9 --------- arch/xtensa/include/asm/highmem.h | 10 ---------- include/linux/highmem.h | 14 ++++++++++++++ 16 files changed, 14 insertions(+), 110 deletions(-) (limited to 'include/linux') diff --git a/arch/arc/include/asm/highmem.h b/arch/arc/include/asm/highmem.h index 96eb67c86961..8387a5596a91 100644 --- a/arch/arc/include/asm/highmem.h +++ b/arch/arc/include/asm/highmem.h @@ -32,7 +32,6 @@ extern void *kmap_atomic(struct page *page); extern void __kunmap_atomic(void *kvaddr); -extern void kunmap_high(struct page *page); extern void kmap_init(void); @@ -41,15 +40,6 @@ static inline void flush_cache_kmaps(void) flush_cache_all(); } -static inline void kunmap(struct page *page) -{ - might_sleep(); - if (!PageHighMem(page)) - return; - kunmap_high(page); -} - - #endif #endif diff --git a/arch/arm/include/asm/highmem.h b/arch/arm/include/asm/highmem.h index c917522541de..736f65283e7b 100644 --- a/arch/arm/include/asm/highmem.h +++ b/arch/arm/include/asm/highmem.h @@ -20,8 +20,6 @@ extern pte_t *pkmap_page_table; -extern void kunmap_high(struct page *page); - /* * The reason for kmap_high_get() is to ensure that the currently kmap'd * page usage count does not decrease to zero while we're using its @@ -62,7 +60,6 @@ static inline void *kmap_high_get(struct page *page) * when CONFIG_HIGHMEM is not set. 
*/ #ifdef CONFIG_HIGHMEM -extern void kunmap(struct page *page); extern void *kmap_atomic(struct page *page); extern void __kunmap_atomic(void *kvaddr); extern void *kmap_atomic_pfn(unsigned long pfn); diff --git a/arch/arm/mm/highmem.c b/arch/arm/mm/highmem.c index e8ba37c36590..c700b32350ee 100644 --- a/arch/arm/mm/highmem.c +++ b/arch/arm/mm/highmem.c @@ -31,15 +31,6 @@ static inline pte_t get_fixmap_pte(unsigned long vaddr) return *ptep; } -void kunmap(struct page *page) -{ - might_sleep(); - if (!PageHighMem(page)) - return; - kunmap_high(page); -} -EXPORT_SYMBOL(kunmap); - void *kmap_atomic(struct page *page) { unsigned int idx; diff --git a/arch/csky/include/asm/highmem.h b/arch/csky/include/asm/highmem.h index 9d0516e38110..be11c5b67122 100644 --- a/arch/csky/include/asm/highmem.h +++ b/arch/csky/include/asm/highmem.h @@ -30,11 +30,8 @@ extern pte_t *pkmap_page_table; #define PKMAP_NR(virt) ((virt-PKMAP_BASE) >> PAGE_SHIFT) #define PKMAP_ADDR(nr) (PKMAP_BASE + ((nr) << PAGE_SHIFT)) -extern void kunmap_high(struct page *page); - #define ARCH_HAS_KMAP_FLUSH_TLB extern void kmap_flush_tlb(unsigned long addr); -extern void kunmap(struct page *page); extern void *kmap_atomic(struct page *page); extern void __kunmap_atomic(void *kvaddr); extern void *kmap_atomic_pfn(unsigned long pfn); diff --git a/arch/csky/mm/highmem.c b/arch/csky/mm/highmem.c index 4a3c273bc8b9..e9952211264b 100644 --- a/arch/csky/mm/highmem.c +++ b/arch/csky/mm/highmem.c @@ -21,15 +21,6 @@ EXPORT_SYMBOL(kmap_flush_tlb); EXPORT_SYMBOL(kmap); -void kunmap(struct page *page) -{ - might_sleep(); - if (!PageHighMem(page)) - return; - kunmap_high(page); -} -EXPORT_SYMBOL(kunmap); - void *kmap_atomic(struct page *page) { unsigned long vaddr; diff --git a/arch/microblaze/include/asm/highmem.h b/arch/microblaze/include/asm/highmem.h index 8c5bfd228bd8..0c94046f2d58 100644 --- a/arch/microblaze/include/asm/highmem.h +++ b/arch/microblaze/include/asm/highmem.h @@ -51,18 +51,9 @@ extern pte_t *pkmap_page_table; #define PKMAP_NR(virt) ((virt - PKMAP_BASE) >> PAGE_SHIFT) #define PKMAP_ADDR(nr) (PKMAP_BASE + ((nr) << PAGE_SHIFT)) -extern void kunmap_high(struct page *page); extern void *kmap_atomic_prot(struct page *page, pgprot_t prot); extern void __kunmap_atomic(void *kvaddr); -static inline void kunmap(struct page *page) -{ - might_sleep(); - if (!PageHighMem(page)) - return; - kunmap_high(page); -} - static inline void *kmap_atomic(struct page *page) { return kmap_atomic_prot(page, kmap_prot); diff --git a/arch/mips/include/asm/highmem.h b/arch/mips/include/asm/highmem.h index 1f741e3ecabf..24e7e7e5cc7b 100644 --- a/arch/mips/include/asm/highmem.h +++ b/arch/mips/include/asm/highmem.h @@ -46,11 +46,8 @@ extern pte_t *pkmap_page_table; #define PKMAP_NR(virt) ((virt-PKMAP_BASE) >> PAGE_SHIFT) #define PKMAP_ADDR(nr) (PKMAP_BASE + ((nr) << PAGE_SHIFT)) -extern void kunmap_high(struct page *page); - #define ARCH_HAS_KMAP_FLUSH_TLB extern void kmap_flush_tlb(unsigned long addr); -extern void kunmap(struct page *page); extern void *kmap_atomic(struct page *page); extern void __kunmap_atomic(void *kvaddr); extern void *kmap_atomic_pfn(unsigned long pfn); diff --git a/arch/mips/mm/highmem.c b/arch/mips/mm/highmem.c index c72058bfead6..eb8ec8493f2f 100644 --- a/arch/mips/mm/highmem.c +++ b/arch/mips/mm/highmem.c @@ -18,15 +18,6 @@ void kmap_flush_tlb(unsigned long addr) } EXPORT_SYMBOL(kmap_flush_tlb); -void kunmap(struct page *page) -{ - might_sleep(); - if (!PageHighMem(page)) - return; - kunmap_high(page); -} -EXPORT_SYMBOL(kunmap); 
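
A usage reminder (editor's sketch; page and src are illustrative): after the consolidation every architecture shares this pairing, and both calls may sleep, so this is process-context only:

static void my_copy_to_page(struct page *page, const void *src)
{
        void *vaddr = kmap(page);       /* may sleep */

        memcpy(vaddr, src, PAGE_SIZE);
        kunmap(page);                   /* pairs with kmap(), may sleep */
}
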
- /* * kmap_atomic/kunmap_atomic is significantly faster than kmap/kunmap because * no global lock is needed and because the kmap code must perform a global TLB diff --git a/arch/nds32/include/asm/highmem.h b/arch/nds32/include/asm/highmem.h index b13654a79069..c93c7368bb3f 100644 --- a/arch/nds32/include/asm/highmem.h +++ b/arch/nds32/include/asm/highmem.h @@ -44,8 +44,6 @@ extern unsigned long highstart_pfn, highend_pfn; extern pte_t *pkmap_page_table; -extern void kunmap_high(struct page *page); - extern void kmap_init(void); /* @@ -53,7 +51,6 @@ extern void kmap_init(void); * when CONFIG_HIGHMEM is not set. */ #ifdef CONFIG_HIGHMEM -extern void kunmap(struct page *page); extern void *kmap_atomic(struct page *page); extern void __kunmap_atomic(void *kvaddr); extern void *kmap_atomic_pfn(unsigned long pfn); diff --git a/arch/nds32/mm/highmem.c b/arch/nds32/mm/highmem.c index d0cde53b84ae..f9348bec0ecb 100644 --- a/arch/nds32/mm/highmem.c +++ b/arch/nds32/mm/highmem.c @@ -10,16 +10,6 @@ #include #include -void kunmap(struct page *page) -{ - might_sleep(); - if (!PageHighMem(page)) - return; - kunmap_high(page); -} - -EXPORT_SYMBOL(kunmap); - void *kmap_atomic(struct page *page) { unsigned int idx; diff --git a/arch/powerpc/include/asm/highmem.h b/arch/powerpc/include/asm/highmem.h index f14e4feef6d5..ba3371977d49 100644 --- a/arch/powerpc/include/asm/highmem.h +++ b/arch/powerpc/include/asm/highmem.h @@ -59,18 +59,9 @@ extern pte_t *pkmap_page_table; #define PKMAP_NR(virt) ((virt-PKMAP_BASE) >> PAGE_SHIFT) #define PKMAP_ADDR(nr) (PKMAP_BASE + ((nr) << PAGE_SHIFT)) -extern void kunmap_high(struct page *page); extern void *kmap_atomic_prot(struct page *page, pgprot_t prot); extern void __kunmap_atomic(void *kvaddr); -static inline void kunmap(struct page *page) -{ - might_sleep(); - if (!PageHighMem(page)) - return; - kunmap_high(page); -} - static inline void *kmap_atomic(struct page *page) { return kmap_atomic_prot(page, kmap_prot); diff --git a/arch/sparc/include/asm/highmem.h b/arch/sparc/include/asm/highmem.h index 2ff1192047f7..4bdb79fed02c 100644 --- a/arch/sparc/include/asm/highmem.h +++ b/arch/sparc/include/asm/highmem.h @@ -50,16 +50,6 @@ void kmap_init(void) __init; #define PKMAP_END (PKMAP_ADDR(LAST_PKMAP)) -void kunmap_high(struct page *page); - -static inline void kunmap(struct page *page) -{ - might_sleep(); - if (!PageHighMem(page)) - return; - kunmap_high(page); -} - void *kmap_atomic(struct page *page); void __kunmap_atomic(void *kvaddr); diff --git a/arch/x86/include/asm/highmem.h b/arch/x86/include/asm/highmem.h index c916a28a9738..90b96594d6c5 100644 --- a/arch/x86/include/asm/highmem.h +++ b/arch/x86/include/asm/highmem.h @@ -58,10 +58,6 @@ extern unsigned long highstart_pfn, highend_pfn; #define PKMAP_NR(virt) ((virt-PKMAP_BASE) >> PAGE_SHIFT) #define PKMAP_ADDR(nr) (PKMAP_BASE + ((nr) << PAGE_SHIFT)) -extern void kunmap_high(struct page *page); - -void kunmap(struct page *page); - void *kmap_atomic_prot(struct page *page, pgprot_t prot); void *kmap_atomic(struct page *page); void __kunmap_atomic(void *kvaddr); diff --git a/arch/x86/mm/highmem_32.c b/arch/x86/mm/highmem_32.c index 12591a81b85c..c4ebfd0ae401 100644 --- a/arch/x86/mm/highmem_32.c +++ b/arch/x86/mm/highmem_32.c @@ -4,15 +4,6 @@ #include /* for totalram_pages */ #include -void kunmap(struct page *page) -{ - might_sleep(); - if (!PageHighMem(page)) - return; - kunmap_high(page); -} -EXPORT_SYMBOL(kunmap); - /* * kmap_atomic/kunmap_atomic is significantly faster than kmap/kunmap because * no global lock is 
needed and because the kmap code must perform a global TLB diff --git a/arch/xtensa/include/asm/highmem.h b/arch/xtensa/include/asm/highmem.h index 2546b88ddecf..5a481f7def0b 100644 --- a/arch/xtensa/include/asm/highmem.h +++ b/arch/xtensa/include/asm/highmem.h @@ -63,16 +63,6 @@ static inline wait_queue_head_t *get_pkmap_wait_queue_head(unsigned int color) extern pte_t *pkmap_page_table; -void kunmap_high(struct page *page); - -static inline void kunmap(struct page *page) -{ - might_sleep(); - if (!PageHighMem(page)) - return; - kunmap_high(page); -} - static inline void flush_cache_kmaps(void) { flush_cache_all(); diff --git a/include/linux/highmem.h b/include/linux/highmem.h index fc3adc51254a..216a647ed7db 100644 --- a/include/linux/highmem.h +++ b/include/linux/highmem.h @@ -52,6 +52,16 @@ static inline void *kmap(struct page *page) return addr; } +void kunmap_high(struct page *page); + +static inline void kunmap(struct page *page) +{ + might_sleep(); + if (!PageHighMem(page)) + return; + kunmap_high(page); +} + /* declarations for linux/mm/highmem.c */ unsigned int nr_free_highpages(void); extern atomic_long_t _totalhigh_pages; @@ -102,6 +112,10 @@ static inline void *kmap(struct page *page) return page_address(page); } +static inline void kunmap_high(struct page *page) +{ +} + static inline void kunmap(struct page *page) { } -- cgit v1.2.3 From 78b6d91ec7bbfc5bcc2dd05bb2cf13c9de1dc7cd Mon Sep 17 00:00:00 2001 From: Ira Weiny Date: Thu, 4 Jun 2020 16:47:42 -0700 Subject: arch/kmap_atomic: consolidate duplicate code MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Every arch has the same code to ensure atomic operations and a check for !HIGHMEM page. Remove the duplicate code by defining a core kmap_atomic() which only calls the arch specific kmap_atomic_high() when the page is high memory. [akpm@linux-foundation.org: coding style fixes] Signed-off-by: Ira Weiny Signed-off-by: Andrew Morton Reviewed-by: Christoph Hellwig Cc: Al Viro Cc: Andy Lutomirski Cc: Benjamin Herrenschmidt Cc: Borislav Petkov Cc: Christian König Cc: Chris Zankel Cc: Daniel Vetter Cc: Dan Williams Cc: Dave Hansen Cc: "David S. Miller" Cc: Helge Deller Cc: "H. Peter Anvin" Cc: Ingo Molnar Cc: "James E.J. 
Bottomley" Cc: Max Filippov Cc: Paul Mackerras Cc: Peter Zijlstra Cc: Thomas Bogendoerfer Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/20200507150004.1423069-7-ira.weiny@intel.com Signed-off-by: Linus Torvalds --- arch/arc/include/asm/highmem.h | 1 - arch/arc/mm/highmem.c | 9 ++------- arch/arm/include/asm/highmem.h | 1 - arch/arm/mm/highmem.c | 9 ++------- arch/csky/include/asm/highmem.h | 1 - arch/csky/mm/highmem.c | 9 ++------- arch/microblaze/include/asm/highmem.h | 4 ++-- arch/mips/include/asm/highmem.h | 1 - arch/mips/mm/cache.c | 2 +- arch/mips/mm/highmem.c | 18 ++---------------- arch/nds32/include/asm/highmem.h | 1 - arch/nds32/mm/highmem.c | 11 ++--------- arch/powerpc/include/asm/highmem.h | 4 ++-- arch/powerpc/mm/highmem.c | 6 ------ arch/sparc/include/asm/highmem.h | 1 - arch/sparc/mm/highmem.c | 9 ++------- arch/x86/include/asm/highmem.h | 5 ++++- arch/x86/mm/highmem_32.c | 14 -------------- arch/xtensa/include/asm/highmem.h | 1 - arch/xtensa/mm/highmem.c | 9 ++------- include/linux/highmem.h | 23 +++++++++++++++++++++++ 21 files changed, 46 insertions(+), 93 deletions(-) (limited to 'include/linux') diff --git a/arch/arc/include/asm/highmem.h b/arch/arc/include/asm/highmem.h index 8387a5596a91..db425cd38545 100644 --- a/arch/arc/include/asm/highmem.h +++ b/arch/arc/include/asm/highmem.h @@ -30,7 +30,6 @@ #include -extern void *kmap_atomic(struct page *page); extern void __kunmap_atomic(void *kvaddr); extern void kmap_init(void); diff --git a/arch/arc/mm/highmem.c b/arch/arc/mm/highmem.c index 4db13a6b9f3b..0964b011c29f 100644 --- a/arch/arc/mm/highmem.c +++ b/arch/arc/mm/highmem.c @@ -49,16 +49,11 @@ extern pte_t * pkmap_page_table; static pte_t * fixmap_page_table; -void *kmap_atomic(struct page *page) +void *kmap_atomic_high(struct page *page) { int idx, cpu_idx; unsigned long vaddr; - preempt_disable(); - pagefault_disable(); - if (!PageHighMem(page)) - return page_address(page); - cpu_idx = kmap_atomic_idx_push(); idx = cpu_idx + KM_TYPE_NR * smp_processor_id(); vaddr = FIXMAP_ADDR(idx); @@ -68,7 +63,7 @@ void *kmap_atomic(struct page *page) return (void *)vaddr; } -EXPORT_SYMBOL(kmap_atomic); +EXPORT_SYMBOL(kmap_atomic_high); void __kunmap_atomic(void *kv) { diff --git a/arch/arm/include/asm/highmem.h b/arch/arm/include/asm/highmem.h index 736f65283e7b..8c80bfe18a34 100644 --- a/arch/arm/include/asm/highmem.h +++ b/arch/arm/include/asm/highmem.h @@ -60,7 +60,6 @@ static inline void *kmap_high_get(struct page *page) * when CONFIG_HIGHMEM is not set. 
*/ #ifdef CONFIG_HIGHMEM -extern void *kmap_atomic(struct page *page); extern void __kunmap_atomic(void *kvaddr); extern void *kmap_atomic_pfn(unsigned long pfn); #endif diff --git a/arch/arm/mm/highmem.c b/arch/arm/mm/highmem.c index c700b32350ee..075fdc235091 100644 --- a/arch/arm/mm/highmem.c +++ b/arch/arm/mm/highmem.c @@ -31,18 +31,13 @@ static inline pte_t get_fixmap_pte(unsigned long vaddr) return *ptep; } -void *kmap_atomic(struct page *page) +void *kmap_atomic_high(struct page *page) { unsigned int idx; unsigned long vaddr; void *kmap; int type; - preempt_disable(); - pagefault_disable(); - if (!PageHighMem(page)) - return page_address(page); - #ifdef CONFIG_DEBUG_HIGHMEM /* * There is no cache coherency issue when non VIVT, so force the @@ -76,7 +71,7 @@ void *kmap_atomic(struct page *page) return (void *)vaddr; } -EXPORT_SYMBOL(kmap_atomic); +EXPORT_SYMBOL(kmap_atomic_high); void __kunmap_atomic(void *kvaddr) { diff --git a/arch/csky/include/asm/highmem.h b/arch/csky/include/asm/highmem.h index be11c5b67122..8ceee12f9bc1 100644 --- a/arch/csky/include/asm/highmem.h +++ b/arch/csky/include/asm/highmem.h @@ -32,7 +32,6 @@ extern pte_t *pkmap_page_table; #define ARCH_HAS_KMAP_FLUSH_TLB extern void kmap_flush_tlb(unsigned long addr); -extern void *kmap_atomic(struct page *page); extern void __kunmap_atomic(void *kvaddr); extern void *kmap_atomic_pfn(unsigned long pfn); extern struct page *kmap_atomic_to_page(void *ptr); diff --git a/arch/csky/mm/highmem.c b/arch/csky/mm/highmem.c index e9952211264b..63d74b47eee6 100644 --- a/arch/csky/mm/highmem.c +++ b/arch/csky/mm/highmem.c @@ -21,16 +21,11 @@ EXPORT_SYMBOL(kmap_flush_tlb); EXPORT_SYMBOL(kmap); -void *kmap_atomic(struct page *page) +void *kmap_atomic_high(struct page *page) { unsigned long vaddr; int idx, type; - preempt_disable(); - pagefault_disable(); - if (!PageHighMem(page)) - return page_address(page); - type = kmap_atomic_idx_push(); idx = type + KM_TYPE_NR*smp_processor_id(); vaddr = __fix_to_virt(FIX_KMAP_BEGIN + idx); @@ -42,7 +37,7 @@ void *kmap_atomic(struct page *page) return (void *)vaddr; } -EXPORT_SYMBOL(kmap_atomic); +EXPORT_SYMBOL(kmap_atomic_high); void __kunmap_atomic(void *kvaddr) { diff --git a/arch/microblaze/include/asm/highmem.h b/arch/microblaze/include/asm/highmem.h index c38d920a1171..f7c5467df5ad 100644 --- a/arch/microblaze/include/asm/highmem.h +++ b/arch/microblaze/include/asm/highmem.h @@ -63,9 +63,9 @@ static inline void *kmap_atomic_prot(struct page *page, pgprot_t prot) } extern void __kunmap_atomic(void *kvaddr); -static inline void *kmap_atomic(struct page *page) +static inline void *kmap_atomic_high(struct page *page) { - return kmap_atomic_prot(page, kmap_prot); + return kmap_atomic_high_prot(page, kmap_prot); } #define flush_cache_kmaps() { flush_icache(); flush_dcache(); } diff --git a/arch/mips/include/asm/highmem.h b/arch/mips/include/asm/highmem.h index 24e7e7e5cc7b..8bdbbfc322ad 100644 --- a/arch/mips/include/asm/highmem.h +++ b/arch/mips/include/asm/highmem.h @@ -48,7 +48,6 @@ extern pte_t *pkmap_page_table; #define ARCH_HAS_KMAP_FLUSH_TLB extern void kmap_flush_tlb(unsigned long addr); -extern void *kmap_atomic(struct page *page); extern void __kunmap_atomic(void *kvaddr); extern void *kmap_atomic_pfn(unsigned long pfn); diff --git a/arch/mips/mm/cache.c b/arch/mips/mm/cache.c index ad6df1cea866..295bfda7da97 100644 --- a/arch/mips/mm/cache.c +++ b/arch/mips/mm/cache.c @@ -14,9 +14,9 @@ #include #include #include +#include #include -#include #include #include #include diff --git 
a/arch/mips/mm/highmem.c b/arch/mips/mm/highmem.c index eb8ec8493f2f..2bda56372995 100644 --- a/arch/mips/mm/highmem.c +++ b/arch/mips/mm/highmem.c @@ -18,25 +18,11 @@ void kmap_flush_tlb(unsigned long addr) } EXPORT_SYMBOL(kmap_flush_tlb); -/* - * kmap_atomic/kunmap_atomic is significantly faster than kmap/kunmap because - * no global lock is needed and because the kmap code must perform a global TLB - * invalidation when the kmap pool wraps. - * - * However when holding an atomic kmap is is not legal to sleep, so atomic - * kmaps are appropriate for short, tight code paths only. - */ - -void *kmap_atomic(struct page *page) +void *kmap_atomic_high(struct page *page) { unsigned long vaddr; int idx, type; - preempt_disable(); - pagefault_disable(); - if (!PageHighMem(page)) - return page_address(page); - type = kmap_atomic_idx_push(); idx = type + KM_TYPE_NR*smp_processor_id(); vaddr = __fix_to_virt(FIX_KMAP_BEGIN + idx); @@ -48,7 +34,7 @@ void *kmap_atomic(struct page *page) return (void*) vaddr; } -EXPORT_SYMBOL(kmap_atomic); +EXPORT_SYMBOL(kmap_atomic_high); void __kunmap_atomic(void *kvaddr) { diff --git a/arch/nds32/include/asm/highmem.h b/arch/nds32/include/asm/highmem.h index c93c7368bb3f..a3970e566ede 100644 --- a/arch/nds32/include/asm/highmem.h +++ b/arch/nds32/include/asm/highmem.h @@ -51,7 +51,6 @@ extern void kmap_init(void); * when CONFIG_HIGHMEM is not set. */ #ifdef CONFIG_HIGHMEM -extern void *kmap_atomic(struct page *page); extern void __kunmap_atomic(void *kvaddr); extern void *kmap_atomic_pfn(unsigned long pfn); extern struct page *kmap_atomic_to_page(void *ptr); diff --git a/arch/nds32/mm/highmem.c b/arch/nds32/mm/highmem.c index f9348bec0ecb..d4387d835870 100644 --- a/arch/nds32/mm/highmem.c +++ b/arch/nds32/mm/highmem.c @@ -10,18 +10,13 @@ #include #include -void *kmap_atomic(struct page *page) +void *kmap_atomic_high(struct page *page) { unsigned int idx; unsigned long vaddr, pte; int type; pte_t *ptep; - preempt_disable(); - pagefault_disable(); - if (!PageHighMem(page)) - return page_address(page); - type = kmap_atomic_idx_push(); idx = type + KM_TYPE_NR * smp_processor_id(); @@ -36,8 +31,7 @@ void *kmap_atomic(struct page *page) __nds32__isb(); return (void *)vaddr; } - -EXPORT_SYMBOL(kmap_atomic); +EXPORT_SYMBOL(kmap_atomic_high); void __kunmap_atomic(void *kvaddr) { @@ -53,5 +47,4 @@ void __kunmap_atomic(void *kvaddr) pagefault_enable(); preempt_enable(); } - EXPORT_SYMBOL(__kunmap_atomic); diff --git a/arch/powerpc/include/asm/highmem.h b/arch/powerpc/include/asm/highmem.h index d049806a8354..74fa2c726fde 100644 --- a/arch/powerpc/include/asm/highmem.h +++ b/arch/powerpc/include/asm/highmem.h @@ -71,9 +71,9 @@ static inline void *kmap_atomic_prot(struct page *page, pgprot_t prot) } extern void __kunmap_atomic(void *kvaddr); -static inline void *kmap_atomic(struct page *page) +static inline void *kmap_atomic_high(struct page *page) { - return kmap_atomic_prot(page, kmap_prot); + return kmap_atomic_high_prot(page, kmap_prot); } diff --git a/arch/powerpc/mm/highmem.c b/arch/powerpc/mm/highmem.c index f075cef6d663..67aaa5217f7f 100644 --- a/arch/powerpc/mm/highmem.c +++ b/arch/powerpc/mm/highmem.c @@ -24,12 +24,6 @@ #include #include -/* - * The use of kmap_atomic/kunmap_atomic is discouraged - kmap/kunmap - * gives a more generic (and caching) interface. But kmap_atomic can - * be used in IRQ contexts, so in some (very limited) cases we need - * it. 
- */ void *kmap_atomic_high_prot(struct page *page, pgprot_t prot) { unsigned long vaddr; diff --git a/arch/sparc/include/asm/highmem.h b/arch/sparc/include/asm/highmem.h index 4bdb79fed02c..458210c5bc38 100644 --- a/arch/sparc/include/asm/highmem.h +++ b/arch/sparc/include/asm/highmem.h @@ -50,7 +50,6 @@ void kmap_init(void) __init; #define PKMAP_END (PKMAP_ADDR(LAST_PKMAP)) -void *kmap_atomic(struct page *page); void __kunmap_atomic(void *kvaddr); #define flush_cache_kmaps() flush_cache_all() diff --git a/arch/sparc/mm/highmem.c b/arch/sparc/mm/highmem.c index d4a80adea7e5..b53070ab6a31 100644 --- a/arch/sparc/mm/highmem.c +++ b/arch/sparc/mm/highmem.c @@ -53,16 +53,11 @@ void __init kmap_init(void) kmap_prot = __pgprot(SRMMU_ET_PTE | SRMMU_PRIV | SRMMU_CACHE); } -void *kmap_atomic(struct page *page) +void *kmap_atomic_high(struct page *page) { unsigned long vaddr; long idx, type; - preempt_disable(); - pagefault_disable(); - if (!PageHighMem(page)) - return page_address(page); - type = kmap_atomic_idx_push(); idx = type + KM_TYPE_NR*smp_processor_id(); vaddr = __fix_to_virt(FIX_KMAP_BEGIN + idx); @@ -87,7 +82,7 @@ void *kmap_atomic(struct page *page) return (void*) vaddr; } -EXPORT_SYMBOL(kmap_atomic); +EXPORT_SYMBOL(kmap_atomic_high); void __kunmap_atomic(void *kvaddr) { diff --git a/arch/x86/include/asm/highmem.h b/arch/x86/include/asm/highmem.h index 61f47fef40e5..9393d55a2adb 100644 --- a/arch/x86/include/asm/highmem.h +++ b/arch/x86/include/asm/highmem.h @@ -68,7 +68,10 @@ static inline void *kmap_atomic_prot(struct page *page, pgprot_t prot) return kmap_atomic_high_prot(page, prot); } -void *kmap_atomic(struct page *page); +static inline void *kmap_atomic_high(struct page *page) +{ + return kmap_atomic_high_prot(page, kmap_prot); +} void __kunmap_atomic(void *kvaddr); void *kmap_atomic_pfn(unsigned long pfn); void *kmap_atomic_prot_pfn(unsigned long pfn, pgprot_t prot); diff --git a/arch/x86/mm/highmem_32.c b/arch/x86/mm/highmem_32.c index 48b56b1af902..c3e272a759e0 100644 --- a/arch/x86/mm/highmem_32.c +++ b/arch/x86/mm/highmem_32.c @@ -4,14 +4,6 @@ #include /* for totalram_pages */ #include -/* - * kmap_atomic/kunmap_atomic is significantly faster than kmap/kunmap because - * no global lock is needed and because the kmap code must perform a global TLB - * invalidation when the kmap pool wraps. - * - * However when holding an atomic kmap it is not legal to sleep, so atomic - * kmaps are appropriate for short, tight code paths only. - */ void *kmap_atomic_high_prot(struct page *page, pgprot_t prot) { unsigned long vaddr; @@ -28,12 +20,6 @@ void *kmap_atomic_high_prot(struct page *page, pgprot_t prot) } EXPORT_SYMBOL(kmap_atomic_high_prot); -void *kmap_atomic(struct page *page) -{ - return kmap_atomic_prot(page, kmap_prot); -} -EXPORT_SYMBOL(kmap_atomic); - /* * This is the same as kmap_atomic() but can map memory that doesn't * have a struct page associated with it. 
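For illustration only - this sketch is not part of the patch, and the helper name read_from_highpage() is hypothetical - a typical caller of the consolidated interface looks roughly like this. Only when PageHighMem(page) is true does the generic kmap_atomic() fall through to the arch-specific kmap_atomic_high():

#include <linux/highmem.h>
#include <linux/string.h>

/*
 * Hypothetical helper, for illustration only: copy the first @len bytes
 * of a (possibly highmem) page into a kernel buffer. kmap_atomic()
 * disables pagefaults and preemption before mapping, so no sleeping is
 * allowed between kmap_atomic() and kunmap_atomic().
 */
static void read_from_highpage(struct page *page, void *buf, size_t len)
{
	void *vaddr = kmap_atomic(page);

	memcpy(buf, vaddr, len);
	kunmap_atomic(vaddr);
}

The preempt_disable()/pagefault_disable() pair now lives in the generic wrapper - exactly the code this patch deletes from every architecture.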
diff --git a/arch/xtensa/include/asm/highmem.h b/arch/xtensa/include/asm/highmem.h index 5a481f7def0b..1e6aa15c4bdf 100644 --- a/arch/xtensa/include/asm/highmem.h +++ b/arch/xtensa/include/asm/highmem.h @@ -68,7 +68,6 @@ static inline void flush_cache_kmaps(void) flush_cache_all(); } -void *kmap_atomic(struct page *page); void __kunmap_atomic(void *kvaddr); void kmap_init(void); diff --git a/arch/xtensa/mm/highmem.c b/arch/xtensa/mm/highmem.c index da734a2ed641..90b85a897cb0 100644 --- a/arch/xtensa/mm/highmem.c +++ b/arch/xtensa/mm/highmem.c @@ -37,16 +37,11 @@ static inline enum fixed_addresses kmap_idx(int type, unsigned long color) color; } -void *kmap_atomic(struct page *page) +void *kmap_atomic_high(struct page *page) { enum fixed_addresses idx; unsigned long vaddr; - preempt_disable(); - pagefault_disable(); - if (!PageHighMem(page)) - return page_address(page); - idx = kmap_idx(kmap_atomic_idx_push(), DCACHE_ALIAS(page_to_phys(page))); vaddr = __fix_to_virt(FIX_KMAP_BEGIN + idx); @@ -57,7 +52,7 @@ void *kmap_atomic(struct page *page) return (void *)vaddr; } -EXPORT_SYMBOL(kmap_atomic); +EXPORT_SYMBOL(kmap_atomic_high); void __kunmap_atomic(void *kvaddr) { diff --git a/include/linux/highmem.h b/include/linux/highmem.h index 216a647ed7db..d2209ae8be99 100644 --- a/include/linux/highmem.h +++ b/include/linux/highmem.h @@ -32,6 +32,7 @@ static inline void invalidate_kernel_vmap_range(void *vaddr, int size) #include #ifdef CONFIG_HIGHMEM +extern void *kmap_atomic_high(struct page *page); #include #ifndef ARCH_HAS_KMAP_FLUSH_TLB @@ -62,6 +63,28 @@ static inline void kunmap(struct page *page) kunmap_high(page); } +/* + * kmap_atomic/kunmap_atomic is significantly faster than kmap/kunmap because + * no global lock is needed and because the kmap code must perform a global TLB + * invalidation when the kmap pool wraps. + * + * However when holding an atomic kmap it is not legal to sleep, so atomic + * kmaps are appropriate for short, tight code paths only. + * + * The use of kmap_atomic/kunmap_atomic is discouraged - kmap/kunmap + * gives a more generic (and caching) interface. But kmap_atomic can + * be used in IRQ contexts, so in some (very limited) cases we need + * it. + */ +static inline void *kmap_atomic(struct page *page) +{ + preempt_disable(); + pagefault_disable(); + if (!PageHighMem(page)) + return page_address(page); + return kmap_atomic_high(page); +} + /* declarations for linux/mm/highmem.c */ unsigned int nr_free_highpages(void); extern atomic_long_t _totalhigh_pages; -- cgit v1.2.3 From abca2500c0c1b20c3e552f259da4c4a99db3b4d1 Mon Sep 17 00:00:00 2001 From: Ira Weiny Date: Thu, 4 Jun 2020 16:47:46 -0700 Subject: arch/kunmap_atomic: consolidate duplicate code MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Every single architecture (including !CONFIG_HIGHMEM) calls... pagefault_enable(); preempt_enable(); ... before returning from __kunmap_atomic(). Lift this code into the kunmap_atomic() macro. While we are at it, rename __kunmap_atomic() to kunmap_atomic_high() to be consistent. [ira.weiny@intel.com: don't enable pagefault/preempt twice] Link: http://lkml.kernel.org/r/20200518184843.3029640-1-ira.weiny@intel.com [akpm@linux-foundation.org: coding style fixes] Signed-off-by: Ira Weiny Signed-off-by: Andrew Morton Reviewed-by: Christoph Hellwig Cc: Al Viro Cc: Andy Lutomirski Cc: Benjamin Herrenschmidt Cc: Borislav Petkov Cc: Christian König Cc: Chris Zankel Cc: Daniel Vetter Cc: Dan Williams Cc: Dave Hansen Cc: "David S.
Miller" Cc: Helge Deller Cc: "H. Peter Anvin" Cc: Ingo Molnar Cc: "James E.J. Bottomley" Cc: Max Filippov Cc: Paul Mackerras Cc: Peter Zijlstra Cc: Thomas Bogendoerfer Cc: Thomas Gleixner Cc: Guenter Roeck Link: http://lkml.kernel.org/r/20200507150004.1423069-8-ira.weiny@intel.com Signed-off-by: Linus Torvalds --- arch/arc/include/asm/highmem.h | 2 -- arch/arc/mm/highmem.c | 7 ++----- arch/arm/include/asm/highmem.h | 1 - arch/arm/mm/highmem.c | 6 ++---- arch/csky/include/asm/highmem.h | 1 - arch/csky/mm/highmem.c | 9 +++------ arch/microblaze/include/asm/highmem.h | 1 - arch/microblaze/mm/highmem.c | 11 +++-------- arch/mips/include/asm/highmem.h | 1 - arch/mips/mm/cache.c | 4 ++-- arch/mips/mm/highmem.c | 11 +++-------- arch/nds32/include/asm/highmem.h | 1 - arch/nds32/mm/highmem.c | 6 ++---- arch/parisc/include/asm/cacheflush.h | 4 +--- arch/powerpc/include/asm/highmem.h | 1 - arch/powerpc/mm/highmem.c | 11 +++-------- arch/sparc/include/asm/highmem.h | 2 -- arch/sparc/mm/highmem.c | 11 +++-------- arch/x86/include/asm/highmem.h | 1 - arch/x86/mm/highmem_32.c | 7 ++----- arch/xtensa/include/asm/highmem.h | 2 -- arch/xtensa/mm/highmem.c | 7 ++----- include/linux/highmem.h | 13 +++++++++---- 23 files changed, 37 insertions(+), 83 deletions(-) (limited to 'include/linux') diff --git a/arch/arc/include/asm/highmem.h b/arch/arc/include/asm/highmem.h index db425cd38545..70900a73bfc8 100644 --- a/arch/arc/include/asm/highmem.h +++ b/arch/arc/include/asm/highmem.h @@ -30,8 +30,6 @@ #include -extern void __kunmap_atomic(void *kvaddr); - extern void kmap_init(void); static inline void flush_cache_kmaps(void) diff --git a/arch/arc/mm/highmem.c b/arch/arc/mm/highmem.c index 0964b011c29f..5d3eab4ac0b0 100644 --- a/arch/arc/mm/highmem.c +++ b/arch/arc/mm/highmem.c @@ -65,7 +65,7 @@ void *kmap_atomic_high(struct page *page) } EXPORT_SYMBOL(kmap_atomic_high); -void __kunmap_atomic(void *kv) +void kunmap_atomic_high(void *kv) { unsigned long kvaddr = (unsigned long)kv; @@ -87,11 +87,8 @@ void __kunmap_atomic(void *kv) kmap_atomic_idx_pop(); } - - pagefault_enable(); - preempt_enable(); } -EXPORT_SYMBOL(__kunmap_atomic); +EXPORT_SYMBOL(kunmap_atomic_high); static noinline pte_t * __init alloc_kmap_pgtable(unsigned long kvaddr) { diff --git a/arch/arm/include/asm/highmem.h b/arch/arm/include/asm/highmem.h index 8c80bfe18a34..b0d4bd8dc3c1 100644 --- a/arch/arm/include/asm/highmem.h +++ b/arch/arm/include/asm/highmem.h @@ -60,7 +60,6 @@ static inline void *kmap_high_get(struct page *page) * when CONFIG_HIGHMEM is not set. 
*/ #ifdef CONFIG_HIGHMEM -extern void __kunmap_atomic(void *kvaddr); extern void *kmap_atomic_pfn(unsigned long pfn); #endif diff --git a/arch/arm/mm/highmem.c b/arch/arm/mm/highmem.c index 075fdc235091..ac8394655a6e 100644 --- a/arch/arm/mm/highmem.c +++ b/arch/arm/mm/highmem.c @@ -73,7 +73,7 @@ void *kmap_atomic_high(struct page *page) } EXPORT_SYMBOL(kmap_atomic_high); -void __kunmap_atomic(void *kvaddr) +void kunmap_atomic_high(void *kvaddr) { unsigned long vaddr = (unsigned long) kvaddr & PAGE_MASK; int idx, type; @@ -95,10 +95,8 @@ void __kunmap_atomic(void *kvaddr) /* this address was obtained through kmap_high_get() */ kunmap_high(pte_page(pkmap_page_table[PKMAP_NR(vaddr)])); } - pagefault_enable(); - preempt_enable(); } -EXPORT_SYMBOL(__kunmap_atomic); +EXPORT_SYMBOL(kunmap_atomic_high); void *kmap_atomic_pfn(unsigned long pfn) { diff --git a/arch/csky/include/asm/highmem.h b/arch/csky/include/asm/highmem.h index 8ceee12f9bc1..263fbddcd0a3 100644 --- a/arch/csky/include/asm/highmem.h +++ b/arch/csky/include/asm/highmem.h @@ -32,7 +32,6 @@ extern pte_t *pkmap_page_table; #define ARCH_HAS_KMAP_FLUSH_TLB extern void kmap_flush_tlb(unsigned long addr); -extern void __kunmap_atomic(void *kvaddr); extern void *kmap_atomic_pfn(unsigned long pfn); extern struct page *kmap_atomic_to_page(void *ptr); diff --git a/arch/csky/mm/highmem.c b/arch/csky/mm/highmem.c index 63d74b47eee6..0aafbbbe651c 100644 --- a/arch/csky/mm/highmem.c +++ b/arch/csky/mm/highmem.c @@ -39,13 +39,13 @@ void *kmap_atomic_high(struct page *page) } EXPORT_SYMBOL(kmap_atomic_high); -void __kunmap_atomic(void *kvaddr) +void kunmap_atomic_high(void *kvaddr) { unsigned long vaddr = (unsigned long) kvaddr & PAGE_MASK; int idx; if (vaddr < FIXADDR_START) - goto out; + return; #ifdef CONFIG_DEBUG_HIGHMEM idx = KM_TYPE_NR*smp_processor_id() + kmap_atomic_idx(); @@ -58,11 +58,8 @@ void __kunmap_atomic(void *kvaddr) (void) idx; /* to kill a warning */ #endif kmap_atomic_idx_pop(); -out: - pagefault_enable(); - preempt_enable(); } -EXPORT_SYMBOL(__kunmap_atomic); +EXPORT_SYMBOL(kunmap_atomic_high); /* * This is the same as kmap_atomic() but can map memory that doesn't diff --git a/arch/microblaze/include/asm/highmem.h b/arch/microblaze/include/asm/highmem.h index f7c5467df5ad..c3cbda90391d 100644 --- a/arch/microblaze/include/asm/highmem.h +++ b/arch/microblaze/include/asm/highmem.h @@ -61,7 +61,6 @@ static inline void *kmap_atomic_prot(struct page *page, pgprot_t prot) return kmap_atomic_high_prot(page, prot); } -extern void __kunmap_atomic(void *kvaddr); static inline void *kmap_atomic_high(struct page *page) { diff --git a/arch/microblaze/mm/highmem.c b/arch/microblaze/mm/highmem.c index 0e3efaa8a004..92e0890416c9 100644 --- a/arch/microblaze/mm/highmem.c +++ b/arch/microblaze/mm/highmem.c @@ -51,17 +51,14 @@ void *kmap_atomic_high_prot(struct page *page, pgprot_t prot) } EXPORT_SYMBOL(kmap_atomic_high_prot); -void __kunmap_atomic(void *kvaddr) +void kunmap_atomic_high(void *kvaddr) { unsigned long vaddr = (unsigned long) kvaddr & PAGE_MASK; int type; unsigned int idx; - if (vaddr < __fix_to_virt(FIX_KMAP_END)) { - pagefault_enable(); - preempt_enable(); + if (vaddr < __fix_to_virt(FIX_KMAP_END)) return; - } type = kmap_atomic_idx(); @@ -77,7 +74,5 @@ void __kunmap_atomic(void *kvaddr) local_flush_tlb_page(NULL, vaddr); kmap_atomic_idx_pop(); - pagefault_enable(); - preempt_enable(); } -EXPORT_SYMBOL(__kunmap_atomic); +EXPORT_SYMBOL(kunmap_atomic_high); diff --git a/arch/mips/include/asm/highmem.h 
b/arch/mips/include/asm/highmem.h index 8bdbbfc322ad..76dec0bd4f59 100644 --- a/arch/mips/include/asm/highmem.h +++ b/arch/mips/include/asm/highmem.h @@ -48,7 +48,6 @@ extern pte_t *pkmap_page_table; #define ARCH_HAS_KMAP_FLUSH_TLB extern void kmap_flush_tlb(unsigned long addr); -extern void __kunmap_atomic(void *kvaddr); extern void *kmap_atomic_pfn(unsigned long pfn); #define flush_cache_kmaps() BUG_ON(cpu_has_dc_aliases) diff --git a/arch/mips/mm/cache.c b/arch/mips/mm/cache.c index 295bfda7da97..3e81ba000096 100644 --- a/arch/mips/mm/cache.c +++ b/arch/mips/mm/cache.c @@ -103,7 +103,7 @@ void __flush_dcache_page(struct page *page) flush_data_cache_page(addr); if (PageHighMem(page)) - __kunmap_atomic((void *)addr); + kunmap_atomic((void *)addr); } EXPORT_SYMBOL(__flush_dcache_page); @@ -146,7 +146,7 @@ void __update_cache(unsigned long address, pte_t pte) flush_data_cache_page(addr); if (PageHighMem(page)) - __kunmap_atomic((void *)addr); + kunmap_atomic((void *)addr); ClearPageDcacheDirty(page); } diff --git a/arch/mips/mm/highmem.c b/arch/mips/mm/highmem.c index 2bda56372995..73ef4004fe5f 100644 --- a/arch/mips/mm/highmem.c +++ b/arch/mips/mm/highmem.c @@ -36,16 +36,13 @@ void *kmap_atomic_high(struct page *page) } EXPORT_SYMBOL(kmap_atomic_high); -void __kunmap_atomic(void *kvaddr) +void kunmap_atomic_high(void *kvaddr) { unsigned long vaddr = (unsigned long) kvaddr & PAGE_MASK; int type __maybe_unused; - if (vaddr < FIXADDR_START) { // FIXME - pagefault_enable(); - preempt_enable(); + if (vaddr < FIXADDR_START) return; - } type = kmap_atomic_idx(); #ifdef CONFIG_DEBUG_HIGHMEM @@ -63,10 +60,8 @@ void __kunmap_atomic(void *kvaddr) } #endif kmap_atomic_idx_pop(); - pagefault_enable(); - preempt_enable(); } -EXPORT_SYMBOL(__kunmap_atomic); +EXPORT_SYMBOL(kunmap_atomic_high); /* * This is the same as kmap_atomic() but can map memory that doesn't diff --git a/arch/nds32/include/asm/highmem.h b/arch/nds32/include/asm/highmem.h index a3970e566ede..4d21308549c9 100644 --- a/arch/nds32/include/asm/highmem.h +++ b/arch/nds32/include/asm/highmem.h @@ -51,7 +51,6 @@ extern void kmap_init(void); * when CONFIG_HIGHMEM is not set. 
*/ #ifdef CONFIG_HIGHMEM -extern void __kunmap_atomic(void *kvaddr); extern void *kmap_atomic_pfn(unsigned long pfn); extern struct page *kmap_atomic_to_page(void *ptr); #endif diff --git a/arch/nds32/mm/highmem.c b/arch/nds32/mm/highmem.c index d4387d835870..d25c815fda21 100644 --- a/arch/nds32/mm/highmem.c +++ b/arch/nds32/mm/highmem.c @@ -33,7 +33,7 @@ void *kmap_atomic_high(struct page *page) } EXPORT_SYMBOL(kmap_atomic_high); -void __kunmap_atomic(void *kvaddr) +void kunmap_atomic_high(void *kvaddr) { if (kvaddr >= (void *)FIXADDR_START) { unsigned long vaddr = (unsigned long)kvaddr; @@ -44,7 +44,5 @@ void __kunmap_atomic(void *kvaddr) ptep = pte_offset_kernel(pmd_off_k(vaddr), vaddr); set_pte(ptep, 0); } - pagefault_enable(); - preempt_enable(); } -EXPORT_SYMBOL(__kunmap_atomic); +EXPORT_SYMBOL(kunmap_atomic_high); diff --git a/arch/parisc/include/asm/cacheflush.h b/arch/parisc/include/asm/cacheflush.h index 0c83644bfa5c..119c9a7681bc 100644 --- a/arch/parisc/include/asm/cacheflush.h +++ b/arch/parisc/include/asm/cacheflush.h @@ -122,11 +122,9 @@ static inline void *kmap_atomic(struct page *page) return page_address(page); } -static inline void __kunmap_atomic(void *addr) +static inline void kunmap_atomic_high(void *addr) { flush_kernel_dcache_page_addr(addr); - pagefault_enable(); - preempt_enable(); } #define kmap_atomic_prot(page, prot) kmap_atomic(page) diff --git a/arch/powerpc/include/asm/highmem.h b/arch/powerpc/include/asm/highmem.h index 74fa2c726fde..373a470df205 100644 --- a/arch/powerpc/include/asm/highmem.h +++ b/arch/powerpc/include/asm/highmem.h @@ -69,7 +69,6 @@ static inline void *kmap_atomic_prot(struct page *page, pgprot_t prot) return kmap_atomic_high_prot(page, prot); } -extern void __kunmap_atomic(void *kvaddr); static inline void *kmap_atomic_high(struct page *page) { diff --git a/arch/powerpc/mm/highmem.c b/arch/powerpc/mm/highmem.c index 67aaa5217f7f..624b4438aff9 100644 --- a/arch/powerpc/mm/highmem.c +++ b/arch/powerpc/mm/highmem.c @@ -40,15 +40,12 @@ void *kmap_atomic_high_prot(struct page *page, pgprot_t prot) } EXPORT_SYMBOL(kmap_atomic_high_prot); -void __kunmap_atomic(void *kvaddr) +void kunmap_atomic_high(void *kvaddr) { unsigned long vaddr = (unsigned long) kvaddr & PAGE_MASK; - if (vaddr < __fix_to_virt(FIX_KMAP_END)) { - pagefault_enable(); - preempt_enable(); + if (vaddr < __fix_to_virt(FIX_KMAP_END)) return; - } if (IS_ENABLED(CONFIG_DEBUG_HIGHMEM)) { int type = kmap_atomic_idx(); @@ -66,7 +63,5 @@ void __kunmap_atomic(void *kvaddr) } kmap_atomic_idx_pop(); - pagefault_enable(); - preempt_enable(); } -EXPORT_SYMBOL(__kunmap_atomic); +EXPORT_SYMBOL(kunmap_atomic_high); diff --git a/arch/sparc/include/asm/highmem.h b/arch/sparc/include/asm/highmem.h index 458210c5bc38..f4babe67cb5d 100644 --- a/arch/sparc/include/asm/highmem.h +++ b/arch/sparc/include/asm/highmem.h @@ -50,8 +50,6 @@ void kmap_init(void) __init; #define PKMAP_END (PKMAP_ADDR(LAST_PKMAP)) -void __kunmap_atomic(void *kvaddr); - #define flush_cache_kmaps() flush_cache_all() #endif /* __KERNEL__ */ diff --git a/arch/sparc/mm/highmem.c b/arch/sparc/mm/highmem.c index b53070ab6a31..06798ae813b9 100644 --- a/arch/sparc/mm/highmem.c +++ b/arch/sparc/mm/highmem.c @@ -84,16 +84,13 @@ void *kmap_atomic_high(struct page *page) } EXPORT_SYMBOL(kmap_atomic_high); -void __kunmap_atomic(void *kvaddr) +void kunmap_atomic_high(void *kvaddr) { unsigned long vaddr = (unsigned long) kvaddr & PAGE_MASK; int type; - if (vaddr < FIXADDR_START) { // FIXME - pagefault_enable(); - preempt_enable(); + if 
(vaddr < FIXADDR_START) return; - } type = kmap_atomic_idx(); @@ -126,7 +123,5 @@ void __kunmap_atomic(void *kvaddr) #endif kmap_atomic_idx_pop(); - pagefault_enable(); - preempt_enable(); } -EXPORT_SYMBOL(__kunmap_atomic); +EXPORT_SYMBOL(kunmap_atomic_high); diff --git a/arch/x86/include/asm/highmem.h b/arch/x86/include/asm/highmem.h index 9393d55a2adb..be66b77885a0 100644 --- a/arch/x86/include/asm/highmem.h +++ b/arch/x86/include/asm/highmem.h @@ -72,7 +72,6 @@ static inline void *kmap_atomic_high(struct page *page) { return kmap_atomic_high_prot(page, kmap_prot); } -void __kunmap_atomic(void *kvaddr); void *kmap_atomic_pfn(unsigned long pfn); void *kmap_atomic_prot_pfn(unsigned long pfn, pgprot_t prot); diff --git a/arch/x86/mm/highmem_32.c b/arch/x86/mm/highmem_32.c index c3e272a759e0..075fe51317b0 100644 --- a/arch/x86/mm/highmem_32.c +++ b/arch/x86/mm/highmem_32.c @@ -30,7 +30,7 @@ void *kmap_atomic_pfn(unsigned long pfn) } EXPORT_SYMBOL_GPL(kmap_atomic_pfn); -void __kunmap_atomic(void *kvaddr) +void kunmap_atomic_high(void *kvaddr) { unsigned long vaddr = (unsigned long) kvaddr & PAGE_MASK; @@ -60,11 +60,8 @@ void __kunmap_atomic(void *kvaddr) BUG_ON(vaddr >= (unsigned long)high_memory); } #endif - - pagefault_enable(); - preempt_enable(); } -EXPORT_SYMBOL(__kunmap_atomic); +EXPORT_SYMBOL(kunmap_atomic_high); void __init set_highmem_pages_init(void) { diff --git a/arch/xtensa/include/asm/highmem.h b/arch/xtensa/include/asm/highmem.h index 1e6aa15c4bdf..d6a10704307a 100644 --- a/arch/xtensa/include/asm/highmem.h +++ b/arch/xtensa/include/asm/highmem.h @@ -68,8 +68,6 @@ static inline void flush_cache_kmaps(void) flush_cache_all(); } -void __kunmap_atomic(void *kvaddr); - void kmap_init(void); #endif diff --git a/arch/xtensa/mm/highmem.c b/arch/xtensa/mm/highmem.c index 90b85a897cb0..4de323e43682 100644 --- a/arch/xtensa/mm/highmem.c +++ b/arch/xtensa/mm/highmem.c @@ -54,7 +54,7 @@ void *kmap_atomic_high(struct page *page) } EXPORT_SYMBOL(kmap_atomic_high); -void __kunmap_atomic(void *kvaddr) +void kunmap_atomic_high(void *kvaddr) { if (kvaddr >= (void *)FIXADDR_START && kvaddr < (void *)FIXADDR_TOP) { @@ -73,11 +73,8 @@ void __kunmap_atomic(void *kvaddr) kmap_atomic_idx_pop(); } - - pagefault_enable(); - preempt_enable(); } -EXPORT_SYMBOL(__kunmap_atomic); +EXPORT_SYMBOL(kunmap_atomic_high); void __init kmap_init(void) { diff --git a/include/linux/highmem.h b/include/linux/highmem.h index d2209ae8be99..945b58d8a57b 100644 --- a/include/linux/highmem.h +++ b/include/linux/highmem.h @@ -33,6 +33,7 @@ static inline void invalidate_kernel_vmap_range(void *vaddr, int size) #ifdef CONFIG_HIGHMEM extern void *kmap_atomic_high(struct page *page); +extern void kunmap_atomic_high(void *kvaddr); #include #ifndef ARCH_HAS_KMAP_FLUSH_TLB @@ -151,10 +152,12 @@ static inline void *kmap_atomic(struct page *page) } #define kmap_atomic_prot(page, prot) kmap_atomic(page) -static inline void __kunmap_atomic(void *addr) +static inline void kunmap_atomic_high(void *addr) { - pagefault_enable(); - preempt_enable(); + /* + * Nothing to do in the CONFIG_HIGHMEM=n case as kunmap_atomic() + * handles re-enabling faults + preemption + */ } #define kmap_atomic_pfn(pfn) kmap_atomic(pfn_to_page(pfn)) @@ -204,7 +207,9 @@ static inline void kmap_atomic_idx_pop(void) #define kunmap_atomic(addr) \ do { \ BUILD_BUG_ON(__same_type((addr), struct page *)); \ - __kunmap_atomic(addr); \ + kunmap_atomic_high(addr); \ + pagefault_enable(); \ + preempt_enable(); \ } while (0) -- cgit v1.2.3 From 
20b271dfe9d932b02b067a1f7ba9805c5b8d79bd Mon Sep 17 00:00:00 2001 From: Ira Weiny Date: Thu, 4 Jun 2020 16:47:58 -0700 Subject: arch/kmap: define kmap_atomic_prot() for all arch's MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit To support kmap_atomic_prot(), all architectures need to support protections passed to their kmap_atomic_high() function. Pass protections into kmap_atomic_high() and change the name to kmap_atomic_high_prot() to match. Then define kmap_atomic_prot() as a core function which calls kmap_atomic_high_prot() when needed. Finally, redefine kmap_atomic() as a wrapper of kmap_atomic_prot() with the default kmap_prot exported by the architectures. Signed-off-by: Ira Weiny Signed-off-by: Andrew Morton Reviewed-by: Christoph Hellwig Cc: Al Viro Cc: Andy Lutomirski Cc: Benjamin Herrenschmidt Cc: Borislav Petkov Cc: Christian König Cc: Chris Zankel Cc: Daniel Vetter Cc: Dan Williams Cc: Dave Hansen Cc: "David S. Miller" Cc: Helge Deller Cc: "H. Peter Anvin" Cc: Ingo Molnar Cc: "James E.J. Bottomley" Cc: Max Filippov Cc: Paul Mackerras Cc: Peter Zijlstra Cc: Thomas Bogendoerfer Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/20200507150004.1423069-11-ira.weiny@intel.com Signed-off-by: Linus Torvalds --- arch/arc/mm/highmem.c | 6 +++--- arch/arm/mm/highmem.c | 6 +++--- arch/csky/mm/highmem.c | 6 +++--- arch/microblaze/include/asm/highmem.h | 16 ---------------- arch/mips/mm/highmem.c | 6 +++--- arch/nds32/mm/highmem.c | 6 +++--- arch/powerpc/include/asm/highmem.h | 17 ----------------- arch/sparc/mm/highmem.c | 6 +++--- arch/x86/include/asm/highmem.h | 14 -------------- arch/xtensa/mm/highmem.c | 6 +++--- include/linux/highmem.h | 7 ++++--- 11 files changed, 25 insertions(+), 71 deletions(-) (limited to 'include/linux') diff --git a/arch/arc/mm/highmem.c b/arch/arc/mm/highmem.c index 5d3eab4ac0b0..479b0d72d3cf 100644 --- a/arch/arc/mm/highmem.c +++ b/arch/arc/mm/highmem.c @@ -49,7 +49,7 @@ extern pte_t * pkmap_page_table; static pte_t * fixmap_page_table; -void *kmap_atomic_high(struct page *page) +void *kmap_atomic_high_prot(struct page *page, pgprot_t prot) { int idx, cpu_idx; unsigned long vaddr; @@ -59,11 +59,11 @@ void *kmap_atomic_high(struct page *page) vaddr = FIXMAP_ADDR(idx); set_pte_at(&init_mm, vaddr, fixmap_page_table + idx, - mk_pte(page, kmap_prot)); + mk_pte(page, prot)); return (void *)vaddr; } -EXPORT_SYMBOL(kmap_atomic_high); +EXPORT_SYMBOL(kmap_atomic_high_prot); void kunmap_atomic_high(void *kv) { diff --git a/arch/arm/mm/highmem.c b/arch/arm/mm/highmem.c index ac8394655a6e..e013f6b81328 100644 --- a/arch/arm/mm/highmem.c +++ b/arch/arm/mm/highmem.c @@ -31,7 +31,7 @@ static inline pte_t get_fixmap_pte(unsigned long vaddr) return *ptep; } -void *kmap_atomic_high(struct page *page) +void *kmap_atomic_high_prot(struct page *page, pgprot_t prot) { unsigned int idx; unsigned long vaddr; @@ -67,11 +67,11 @@ void *kmap_atomic_high(struct page *page) * in place, so the contained TLB flush ensures the TLB is updated * with the new mapping. 
*/ - set_fixmap_pte(idx, mk_pte(page, kmap_prot)); + set_fixmap_pte(idx, mk_pte(page, prot)); return (void *)vaddr; } -EXPORT_SYMBOL(kmap_atomic_high); +EXPORT_SYMBOL(kmap_atomic_high_prot); void kunmap_atomic_high(void *kvaddr) { diff --git a/arch/csky/mm/highmem.c b/arch/csky/mm/highmem.c index f4311669b5bb..3ae5c8cd7619 100644 --- a/arch/csky/mm/highmem.c +++ b/arch/csky/mm/highmem.c @@ -21,7 +21,7 @@ EXPORT_SYMBOL(kmap_flush_tlb); EXPORT_SYMBOL(kmap); -void *kmap_atomic_high(struct page *page) +void *kmap_atomic_high_prot(struct page *page, pgprot_t prot) { unsigned long vaddr; int idx, type; @@ -32,12 +32,12 @@ void *kmap_atomic_high(struct page *page) #ifdef CONFIG_DEBUG_HIGHMEM BUG_ON(!pte_none(*(kmap_pte - idx))); #endif - set_pte(kmap_pte-idx, mk_pte(page, kmap_prot)); + set_pte(kmap_pte-idx, mk_pte(page, prot)); flush_tlb_one((unsigned long)vaddr); return (void *)vaddr; } -EXPORT_SYMBOL(kmap_atomic_high); +EXPORT_SYMBOL(kmap_atomic_high_prot); void kunmap_atomic_high(void *kvaddr) { diff --git a/arch/microblaze/include/asm/highmem.h b/arch/microblaze/include/asm/highmem.h index 90d96239152f..d7c55cfd27bd 100644 --- a/arch/microblaze/include/asm/highmem.h +++ b/arch/microblaze/include/asm/highmem.h @@ -51,22 +51,6 @@ extern pte_t *pkmap_page_table; #define PKMAP_NR(virt) ((virt - PKMAP_BASE) >> PAGE_SHIFT) #define PKMAP_ADDR(nr) (PKMAP_BASE + ((nr) << PAGE_SHIFT)) -extern void *kmap_atomic_high_prot(struct page *page, pgprot_t prot); -static inline void *kmap_atomic_prot(struct page *page, pgprot_t prot) -{ - preempt_disable(); - pagefault_disable(); - if (!PageHighMem(page)) - return page_address(page); - - return kmap_atomic_high_prot(page, prot); -} - -static inline void *kmap_atomic_high(struct page *page) -{ - return kmap_atomic_high_prot(page, kmap_prot); -} - #define flush_cache_kmaps() { flush_icache(); flush_dcache(); } #endif /* __KERNEL__ */ diff --git a/arch/mips/mm/highmem.c b/arch/mips/mm/highmem.c index 96a486777a18..8e8726992720 100644 --- a/arch/mips/mm/highmem.c +++ b/arch/mips/mm/highmem.c @@ -18,7 +18,7 @@ void kmap_flush_tlb(unsigned long addr) } EXPORT_SYMBOL(kmap_flush_tlb); -void *kmap_atomic_high(struct page *page) +void *kmap_atomic_high_prot(struct page *page, pgprot_t prot) { unsigned long vaddr; int idx, type; @@ -29,12 +29,12 @@ void *kmap_atomic_high(struct page *page) #ifdef CONFIG_DEBUG_HIGHMEM BUG_ON(!pte_none(*(kmap_pte - idx))); #endif - set_pte(kmap_pte-idx, mk_pte(page, kmap_prot)); + set_pte(kmap_pte-idx, mk_pte(page, prot)); local_flush_tlb_one((unsigned long)vaddr); return (void*) vaddr; } -EXPORT_SYMBOL(kmap_atomic_high); +EXPORT_SYMBOL(kmap_atomic_high_prot); void kunmap_atomic_high(void *kvaddr) { diff --git a/arch/nds32/mm/highmem.c b/arch/nds32/mm/highmem.c index b11b88956353..4284cd59e21a 100644 --- a/arch/nds32/mm/highmem.c +++ b/arch/nds32/mm/highmem.c @@ -10,7 +10,7 @@ #include #include -void *kmap_atomic_high(struct page *page) +void *kmap_atomic_high_prot(struct page *page, pgprot_t prot) { unsigned int idx; unsigned long vaddr, pte; @@ -21,7 +21,7 @@ void *kmap_atomic_high(struct page *page) idx = type + KM_TYPE_NR * smp_processor_id(); vaddr = __fix_to_virt(FIX_KMAP_BEGIN + idx); - pte = (page_to_pfn(page) << PAGE_SHIFT) | (kmap_prot); + pte = (page_to_pfn(page) << PAGE_SHIFT) | prot; ptep = pte_offset_kernel(pmd_off_k(vaddr), vaddr); set_pte(ptep, pte); @@ -31,7 +31,7 @@ void *kmap_atomic_high(struct page *page) __nds32__isb(); return (void *)vaddr; } -EXPORT_SYMBOL(kmap_atomic_high); +EXPORT_SYMBOL(kmap_atomic_high_prot); 
void kunmap_atomic_high(void *kvaddr) { diff --git a/arch/powerpc/include/asm/highmem.h b/arch/powerpc/include/asm/highmem.h index ee5de974c5ef..8d8ee3fcd800 100644 --- a/arch/powerpc/include/asm/highmem.h +++ b/arch/powerpc/include/asm/highmem.h @@ -59,23 +59,6 @@ extern pte_t *pkmap_page_table; #define PKMAP_NR(virt) ((virt-PKMAP_BASE) >> PAGE_SHIFT) #define PKMAP_ADDR(nr) (PKMAP_BASE + ((nr) << PAGE_SHIFT)) -extern void *kmap_atomic_high_prot(struct page *page, pgprot_t prot); -static inline void *kmap_atomic_prot(struct page *page, pgprot_t prot) -{ - preempt_disable(); - pagefault_disable(); - if (!PageHighMem(page)) - return page_address(page); - - return kmap_atomic_high_prot(page, prot); -} - -static inline void *kmap_atomic_high(struct page *page) -{ - return kmap_atomic_high_prot(page, kmap_prot); -} - - #define flush_cache_kmaps() flush_cache_all() #endif /* __KERNEL__ */ diff --git a/arch/sparc/mm/highmem.c b/arch/sparc/mm/highmem.c index 02e23a677c58..9309bcab4ae6 100644 --- a/arch/sparc/mm/highmem.c +++ b/arch/sparc/mm/highmem.c @@ -54,7 +54,7 @@ void __init kmap_init(void) kmap_prot = __pgprot(SRMMU_ET_PTE | SRMMU_PRIV | SRMMU_CACHE); } -void *kmap_atomic_high(struct page *page) +void *kmap_atomic_high_prot(struct page *page, pgprot_t prot) { unsigned long vaddr; long idx, type; @@ -73,7 +73,7 @@ void *kmap_atomic_high(struct page *page) #ifdef CONFIG_DEBUG_HIGHMEM BUG_ON(!pte_none(*(kmap_pte-idx))); #endif - set_pte(kmap_pte-idx, mk_pte(page, kmap_prot)); + set_pte(kmap_pte-idx, mk_pte(page, prot)); /* XXX Fix - Anton */ #if 0 __flush_tlb_one(vaddr); @@ -83,7 +83,7 @@ void *kmap_atomic_high(struct page *page) return (void*) vaddr; } -EXPORT_SYMBOL(kmap_atomic_high); +EXPORT_SYMBOL(kmap_atomic_high_prot); void kunmap_atomic_high(void *kvaddr) { diff --git a/arch/x86/include/asm/highmem.h b/arch/x86/include/asm/highmem.h index be66b77885a0..0f420b24e0fc 100644 --- a/arch/x86/include/asm/highmem.h +++ b/arch/x86/include/asm/highmem.h @@ -58,20 +58,6 @@ extern unsigned long highstart_pfn, highend_pfn; #define PKMAP_NR(virt) ((virt-PKMAP_BASE) >> PAGE_SHIFT) #define PKMAP_ADDR(nr) (PKMAP_BASE + ((nr) << PAGE_SHIFT)) -extern void *kmap_atomic_high_prot(struct page *page, pgprot_t prot); -static inline void *kmap_atomic_prot(struct page *page, pgprot_t prot) -{ - preempt_disable(); - pagefault_disable(); - if (!PageHighMem(page)) - return page_address(page); - - return kmap_atomic_high_prot(page, prot); -} -static inline void *kmap_atomic_high(struct page *page) -{ - return kmap_atomic_high_prot(page, kmap_prot); -} void *kmap_atomic_pfn(unsigned long pfn); void *kmap_atomic_prot_pfn(unsigned long pfn, pgprot_t prot); diff --git a/arch/xtensa/mm/highmem.c b/arch/xtensa/mm/highmem.c index 50168b09510a..99b5ad137ab5 100644 --- a/arch/xtensa/mm/highmem.c +++ b/arch/xtensa/mm/highmem.c @@ -37,7 +37,7 @@ static inline enum fixed_addresses kmap_idx(int type, unsigned long color) color; } -void *kmap_atomic_high(struct page *page) +void *kmap_atomic_high_prot(struct page *page, pgprot_t prot) { enum fixed_addresses idx; unsigned long vaddr; @@ -48,11 +48,11 @@ void *kmap_atomic_high(struct page *page) #ifdef CONFIG_DEBUG_HIGHMEM BUG_ON(!pte_none(*(kmap_pte + idx))); #endif - set_pte(kmap_pte + idx, mk_pte(page, kmap_prot)); + set_pte(kmap_pte + idx, mk_pte(page, prot)); return (void *)vaddr; } -EXPORT_SYMBOL(kmap_atomic_high); +EXPORT_SYMBOL(kmap_atomic_high_prot); void kunmap_atomic_high(void *kvaddr) { diff --git a/include/linux/highmem.h b/include/linux/highmem.h index 
945b58d8a57b..9c559c670299 100644 --- a/include/linux/highmem.h +++ b/include/linux/highmem.h @@ -32,7 +32,7 @@ static inline void invalidate_kernel_vmap_range(void *vaddr, int size) #include #ifdef CONFIG_HIGHMEM -extern void *kmap_atomic_high(struct page *page); +extern void *kmap_atomic_high_prot(struct page *page, pgprot_t prot); extern void kunmap_atomic_high(void *kvaddr); #include @@ -77,14 +77,15 @@ static inline void kunmap(struct page *page) * be used in IRQ contexts, so in some (very limited) cases we need * it. */ -static inline void *kmap_atomic(struct page *page) +static inline void *kmap_atomic_prot(struct page *page, pgprot_t prot) { preempt_disable(); pagefault_disable(); if (!PageHighMem(page)) return page_address(page); - return kmap_atomic_high(page); + return kmap_atomic_high_prot(page, prot); } +#define kmap_atomic(page) kmap_atomic_prot(page, kmap_prot) /* declarations for linux/mm/highmem.c */ unsigned int nr_free_highpages(void); -- cgit v1.2.3 From 7438f36310ddd9fe536fc7403187f63427cecaba Mon Sep 17 00:00:00 2001 From: Ira Weiny Date: Thu, 4 Jun 2020 16:48:10 -0700 Subject: parisc/kmap: remove duplicate kmap code MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit parisc reimplements the kmap calls solely so that it can flush its dcache. This is arguably an abuse of kmap but, regardless, it is messy and confusing. Remove the duplicate code and have parisc define ARCH_HAS_FLUSH_ON_KUNMAP for a kunmap_flush_on_unmap() architecture-specific call to flush the cache. Suggested-by: Al Viro Signed-off-by: Ira Weiny Signed-off-by: Andrew Morton Cc: Andy Lutomirski Cc: Benjamin Herrenschmidt Cc: Borislav Petkov Cc: Christian König Cc: Christoph Hellwig Cc: Chris Zankel Cc: Daniel Vetter Cc: Dan Williams Cc: Dave Hansen Cc: "David S. Miller" Cc: Helge Deller Cc: "H. Peter Anvin" Cc: Ingo Molnar Cc: "James E.J.
Bottomley" Cc: Max Filippov Cc: Paul Mackerras Cc: Peter Zijlstra Cc: Thomas Bogendoerfer Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/20200507150004.1423069-14-ira.weiny@intel.com Signed-off-by: Linus Torvalds --- arch/parisc/include/asm/cacheflush.h | 28 ++-------------------------- include/linux/highmem.h | 10 +++++++--- 2 files changed, 9 insertions(+), 29 deletions(-) (limited to 'include/linux') diff --git a/arch/parisc/include/asm/cacheflush.h b/arch/parisc/include/asm/cacheflush.h index 119c9a7681bc..99663fc1f997 100644 --- a/arch/parisc/include/asm/cacheflush.h +++ b/arch/parisc/include/asm/cacheflush.h @@ -100,35 +100,11 @@ flush_anon_page(struct vm_area_struct *vma, struct page *page, unsigned long vma } } -#include - -#define ARCH_HAS_KMAP - -static inline void *kmap(struct page *page) -{ - might_sleep(); - return page_address(page); -} - -static inline void kunmap(struct page *page) -{ - flush_kernel_dcache_page_addr(page_address(page)); -} - -static inline void *kmap_atomic(struct page *page) -{ - preempt_disable(); - pagefault_disable(); - return page_address(page); -} - -static inline void kunmap_atomic_high(void *addr) +#define ARCH_HAS_FLUSH_ON_KUNMAP +static inline void kunmap_flush_on_unmap(void *addr) { flush_kernel_dcache_page_addr(addr); } -#define kmap_atomic_prot(page, prot) kmap_atomic(page) -#define kmap_atomic_pfn(pfn) kmap_atomic(pfn_to_page(pfn)) - #endif /* _PARISC_CACHEFLUSH_H */ diff --git a/include/linux/highmem.h b/include/linux/highmem.h index 9c559c670299..091b32dff2d1 100644 --- a/include/linux/highmem.h +++ b/include/linux/highmem.h @@ -130,7 +130,6 @@ static inline struct page *kmap_to_page(void *addr) static inline unsigned long totalhigh_pages(void) { return 0UL; } -#ifndef ARCH_HAS_KMAP static inline void *kmap(struct page *page) { might_sleep(); @@ -143,6 +142,9 @@ static inline void kunmap_high(struct page *page) static inline void kunmap(struct page *page) { +#ifdef ARCH_HAS_FLUSH_ON_KUNMAP + kunmap_flush_on_unmap(page_address(page)); +#endif } static inline void *kmap_atomic(struct page *page) @@ -156,15 +158,17 @@ static inline void *kmap_atomic(struct page *page) static inline void kunmap_atomic_high(void *addr) { /* - * Nothing to do in the CONFIG_HIGHMEM=n case as kunmap_atomic() + * Mostly nothing to do in the CONFIG_HIGHMEM=n case as kunmap_atomic() * handles re-enabling faults + preemption */ +#ifdef ARCH_HAS_FLUSH_ON_KUNMAP + kunmap_flush_on_unmap(addr); +#endif } #define kmap_atomic_pfn(pfn) kmap_atomic(pfn_to_page(pfn)) #define kmap_flush_unused() do {} while(0) -#endif #endif /* CONFIG_HIGHMEM */ -- cgit v1.2.3 From 090e77e166334b83f555de408df64b9ab394ea08 Mon Sep 17 00:00:00 2001 From: Ira Weiny Date: Thu, 4 Jun 2020 16:48:18 -0700 Subject: kmap: consolidate kmap_prot definitions MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Most architectures define kmap_prot to be PAGE_KERNEL. Let sparc and xtensa define there own and define PAGE_KERNEL as the default if not overridden. [akpm@linux-foundation.org: coding style fixes] Suggested-by: Christoph Hellwig Signed-off-by: Ira Weiny Signed-off-by: Andrew Morton Cc: Al Viro Cc: Andy Lutomirski Cc: Benjamin Herrenschmidt Cc: Borislav Petkov Cc: Christian König Cc: Chris Zankel Cc: Daniel Vetter Cc: Dan Williams Cc: Dave Hansen Cc: "David S. Miller" Cc: Helge Deller Cc: "H. Peter Anvin" Cc: Ingo Molnar Cc: "James E.J. 
Bottomley" Cc: Max Filippov Cc: Paul Mackerras Cc: Peter Zijlstra Cc: Thomas Bogendoerfer Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/20200507150004.1423069-16-ira.weiny@intel.com Signed-off-by: Linus Torvalds --- arch/arc/include/asm/highmem.h | 3 --- arch/arm/include/asm/highmem.h | 2 -- arch/csky/include/asm/highmem.h | 2 -- arch/microblaze/include/asm/highmem.h | 1 - arch/mips/include/asm/highmem.h | 2 -- arch/nds32/include/asm/highmem.h | 1 - arch/powerpc/include/asm/highmem.h | 1 - arch/sparc/include/asm/highmem.h | 3 ++- arch/sparc/mm/highmem.c | 4 ---- arch/x86/include/asm/fixmap.h | 1 - include/linux/highmem.h | 4 ++++ 11 files changed, 6 insertions(+), 18 deletions(-) (limited to 'include/linux') diff --git a/arch/arc/include/asm/highmem.h b/arch/arc/include/asm/highmem.h index 70900a73bfc8..6e5eafb3afdd 100644 --- a/arch/arc/include/asm/highmem.h +++ b/arch/arc/include/asm/highmem.h @@ -25,9 +25,6 @@ #define PKMAP_ADDR(nr) (PKMAP_BASE + ((nr) << PAGE_SHIFT)) #define PKMAP_NR(virt) (((virt) - PKMAP_BASE) >> PAGE_SHIFT) -#define kmap_prot PAGE_KERNEL - - #include extern void kmap_init(void); diff --git a/arch/arm/include/asm/highmem.h b/arch/arm/include/asm/highmem.h index b0d4bd8dc3c1..31811be38d78 100644 --- a/arch/arm/include/asm/highmem.h +++ b/arch/arm/include/asm/highmem.h @@ -10,8 +10,6 @@ #define PKMAP_NR(virt) (((virt) - PKMAP_BASE) >> PAGE_SHIFT) #define PKMAP_ADDR(nr) (PKMAP_BASE + ((nr) << PAGE_SHIFT)) -#define kmap_prot PAGE_KERNEL - #define flush_cache_kmaps() \ do { \ if (cache_is_vivt()) \ diff --git a/arch/csky/include/asm/highmem.h b/arch/csky/include/asm/highmem.h index ea2f3f39174d..14645e3d5cd5 100644 --- a/arch/csky/include/asm/highmem.h +++ b/arch/csky/include/asm/highmem.h @@ -38,8 +38,6 @@ extern void *kmap_atomic_pfn(unsigned long pfn); extern void kmap_init(void); -#define kmap_prot PAGE_KERNEL - #endif /* __KERNEL__ */ #endif /* __ASM_CSKY_HIGHMEM_H */ diff --git a/arch/microblaze/include/asm/highmem.h b/arch/microblaze/include/asm/highmem.h index d7c55cfd27bd..284ca8fb54c1 100644 --- a/arch/microblaze/include/asm/highmem.h +++ b/arch/microblaze/include/asm/highmem.h @@ -25,7 +25,6 @@ #include #include -#define kmap_prot PAGE_KERNEL extern pte_t *kmap_pte; extern pte_t *pkmap_page_table; diff --git a/arch/mips/include/asm/highmem.h b/arch/mips/include/asm/highmem.h index 76dec0bd4f59..f1f788b57166 100644 --- a/arch/mips/include/asm/highmem.h +++ b/arch/mips/include/asm/highmem.h @@ -54,8 +54,6 @@ extern void *kmap_atomic_pfn(unsigned long pfn); extern void kmap_init(void); -#define kmap_prot PAGE_KERNEL - #endif /* __KERNEL__ */ #endif /* _ASM_HIGHMEM_H */ diff --git a/arch/nds32/include/asm/highmem.h b/arch/nds32/include/asm/highmem.h index a48a6536d41a..5717647d14d1 100644 --- a/arch/nds32/include/asm/highmem.h +++ b/arch/nds32/include/asm/highmem.h @@ -32,7 +32,6 @@ #define LAST_PKMAP_MASK (LAST_PKMAP - 1) #define PKMAP_NR(virt) (((virt) - (PKMAP_BASE)) >> PAGE_SHIFT) #define PKMAP_ADDR(nr) (PKMAP_BASE + ((nr) << PAGE_SHIFT)) -#define kmap_prot PAGE_KERNEL static inline void flush_cache_kmaps(void) { diff --git a/arch/powerpc/include/asm/highmem.h b/arch/powerpc/include/asm/highmem.h index 8d8ee3fcd800..104026f7d6bc 100644 --- a/arch/powerpc/include/asm/highmem.h +++ b/arch/powerpc/include/asm/highmem.h @@ -29,7 +29,6 @@ #include #include -#define kmap_prot PAGE_KERNEL extern pte_t *kmap_pte; extern pte_t *pkmap_page_table; diff --git a/arch/sparc/include/asm/highmem.h b/arch/sparc/include/asm/highmem.h index f4babe67cb5d..ddb03c04f1f3 100644 
--- a/arch/sparc/include/asm/highmem.h +++ b/arch/sparc/include/asm/highmem.h @@ -25,11 +25,12 @@ #include #include #include +#include /* declarations for highmem.c */ extern unsigned long highstart_pfn, highend_pfn; -extern pgprot_t kmap_prot; +#define kmap_prot __pgprot(SRMMU_ET_PTE | SRMMU_PRIV | SRMMU_CACHE) extern pte_t *pkmap_page_table; void kmap_init(void) __init; diff --git a/arch/sparc/mm/highmem.c b/arch/sparc/mm/highmem.c index 9309bcab4ae6..6ff6e2a9f9b3 100644 --- a/arch/sparc/mm/highmem.c +++ b/arch/sparc/mm/highmem.c @@ -32,9 +32,6 @@ #include #include -pgprot_t kmap_prot; -EXPORT_SYMBOL(kmap_prot); - static pte_t *kmap_pte; void __init kmap_init(void) @@ -51,7 +48,6 @@ void __init kmap_init(void) /* cache the first kmap pte */ kmap_pte = pte_offset_kernel(dir, address); - kmap_prot = __pgprot(SRMMU_ET_PTE | SRMMU_PRIV | SRMMU_CACHE); } void *kmap_atomic_high_prot(struct page *page, pgprot_t prot) diff --git a/arch/x86/include/asm/fixmap.h b/arch/x86/include/asm/fixmap.h index 28183ee3cc42..b9527a54db99 100644 --- a/arch/x86/include/asm/fixmap.h +++ b/arch/x86/include/asm/fixmap.h @@ -152,7 +152,6 @@ extern void reserve_top_address(unsigned long reserve); extern int fixmaps_set; extern pte_t *kmap_pte; -#define kmap_prot PAGE_KERNEL extern pte_t *pkmap_page_table; void __native_set_fixmap(enum fixed_addresses idx, pte_t pte); diff --git a/include/linux/highmem.h b/include/linux/highmem.h index 091b32dff2d1..d6e82e3de027 100644 --- a/include/linux/highmem.h +++ b/include/linux/highmem.h @@ -40,6 +40,10 @@ extern void kunmap_atomic_high(void *kvaddr); static inline void kmap_flush_tlb(unsigned long addr) { } #endif +#ifndef kmap_prot +#define kmap_prot PAGE_KERNEL +#endif + void *kmap_high(struct page *page); static inline void *kmap(struct page *page) { -- cgit v1.2.3 From d4eaa2837851db2bfed572898bfc17f9a9f9151e Mon Sep 17 00:00:00 2001 From: Waiman Long Date: Thu, 4 Jun 2020 16:48:21 -0700 Subject: mm: add kvfree_sensitive() for freeing sensitive data objects For kvmalloc'ed data objects that contain sensitive information like cryptographic keys, we need to make sure that the buffer is always cleared before freeing it. Using memset() alone for buffer clearing may not provide certainty as the compiler may compile it away. To be sure, the special memzero_explicit() has to be used. This patch introduces a new kvfree_sensitive() for freeing those sensitive data objects allocated by kvmalloc(). The relevant places where kvfree_sensitive() can be used are modified to use it. Fixes: 4f0882491a14 ("KEYS: Avoid false positive ENOMEM error on key read") Suggested-by: Linus Torvalds Signed-off-by: Waiman Long Signed-off-by: Andrew Morton Reviewed-by: Eric Biggers Acked-by: David Howells Cc: Jarkko Sakkinen Cc: James Morris Cc: "Serge E.
Hallyn" Cc: Joe Perches Cc: Matthew Wilcox Cc: David Rientjes Cc: Uladzislau Rezki Link: http://lkml.kernel.org/r/20200407200318.11711-1-longman@redhat.com Signed-off-by: Linus Torvalds --- include/linux/mm.h | 1 + mm/util.c | 18 ++++++++++++++++++ security/keys/internal.h | 11 ----------- security/keys/keyctl.c | 16 +++++----------- 4 files changed, 24 insertions(+), 22 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mm.h b/include/linux/mm.h index e220ce5185ad..5bfc36320e3c 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -776,6 +776,7 @@ static inline void *kvcalloc(size_t n, size_t size, gfp_t flags) } extern void kvfree(const void *addr); +extern void kvfree_sensitive(const void *addr, size_t len); /* * Mapcount of compound page as a whole, does not include mapped sub-pages. diff --git a/mm/util.c b/mm/util.c index fd9efe6bd463..cd62e6fb5318 100644 --- a/mm/util.c +++ b/mm/util.c @@ -604,6 +604,24 @@ void kvfree(const void *addr) } EXPORT_SYMBOL(kvfree); +/** + * kvfree_sensitive - Free a data object containing sensitive information. + * @addr: address of the data object to be freed. + * @len: length of the data object. + * + * Use the special memzero_explicit() function to clear the content of a + * kvmalloc'ed object containing sensitive data to make sure that the + * compiler won't optimize out the data clearing. + */ +void kvfree_sensitive(const void *addr, size_t len) +{ + if (likely(!ZERO_OR_NULL_PTR(addr))) { + memzero_explicit((void *)addr, len); + kvfree(addr); + } +} +EXPORT_SYMBOL(kvfree_sensitive); + static inline void *__page_rmapping(struct page *page) { unsigned long mapping; diff --git a/security/keys/internal.h b/security/keys/internal.h index 6d0ca48ae9a5..153d35c20d3d 100644 --- a/security/keys/internal.h +++ b/security/keys/internal.h @@ -350,15 +350,4 @@ static inline void key_check(const struct key *key) #define key_check(key) do {} while(0) #endif - -/* - * Helper function to clear and free a kvmalloc'ed memory object. 
- */ -static inline void __kvzfree(const void *addr, size_t len) -{ - if (addr) { - memset((void *)addr, 0, len); - kvfree(addr); - } -} #endif /* _INTERNAL_H */ diff --git a/security/keys/keyctl.c b/security/keys/keyctl.c index 5e01192e222a..edde63a63007 100644 --- a/security/keys/keyctl.c +++ b/security/keys/keyctl.c @@ -142,10 +142,7 @@ SYSCALL_DEFINE5(add_key, const char __user *, _type, key_ref_put(keyring_ref); error3: - if (payload) { - memzero_explicit(payload, plen); - kvfree(payload); - } + kvfree_sensitive(payload, plen); error2: kfree(description); error: @@ -360,7 +357,7 @@ long keyctl_update_key(key_serial_t id, key_ref_put(key_ref); error2: - __kvzfree(payload, plen); + kvfree_sensitive(payload, plen); error: return ret; } @@ -914,7 +911,7 @@ can_read_key: */ if (ret > key_data_len) { if (unlikely(key_data)) - __kvzfree(key_data, key_data_len); + kvfree_sensitive(key_data, key_data_len); key_data_len = ret; continue; /* Allocate buffer */ } @@ -923,7 +920,7 @@ can_read_key: ret = -EFAULT; break; } - __kvzfree(key_data, key_data_len); + kvfree_sensitive(key_data, key_data_len); key_put_out: key_put(key); @@ -1225,10 +1222,7 @@ long keyctl_instantiate_key_common(key_serial_t id, keyctl_change_reqkey_auth(NULL); error2: - if (payload) { - memzero_explicit(payload, plen); - kvfree(payload); - } + kvfree_sensitive(payload, plen); error: return ret; } -- cgit v1.2.3 From 04f3465c98665b7c5a3484d7194f1858954069f5 Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Thu, 4 Jun 2020 16:48:31 -0700 Subject: mm/memory_hotplug: remove is_mem_section_removable() Fortunately, all users of is_mem_section_removable() are gone. Get rid of it, including some now unnecessary functions. Signed-off-by: David Hildenbrand Signed-off-by: Andrew Morton Reviewed-by: Wei Yang Reviewed-by: Baoquan He Acked-by: Michal Hocko Cc: Michael Ellerman Cc: Benjamin Herrenschmidt Cc: Oscar Salvador Link: http://lkml.kernel.org/r/20200407135416.24093-3-david@redhat.com Signed-off-by: Linus Torvalds --- include/linux/memory_hotplug.h | 7 ---- mm/memory_hotplug.c | 75 ------------------------------------------ 2 files changed, 82 deletions(-) (limited to 'include/linux') diff --git a/include/linux/memory_hotplug.h b/include/linux/memory_hotplug.h index 93d9ada74ddd..7dca9cd6076b 100644 --- a/include/linux/memory_hotplug.h +++ b/include/linux/memory_hotplug.h @@ -314,19 +314,12 @@ static inline void pgdat_resize_init(struct pglist_data *pgdat) {} #ifdef CONFIG_MEMORY_HOTREMOVE -extern bool is_mem_section_removable(unsigned long pfn, unsigned long nr_pages); extern void try_offline_node(int nid); extern int offline_pages(unsigned long start_pfn, unsigned long nr_pages); extern int remove_memory(int nid, u64 start, u64 size); extern void __remove_memory(int nid, u64 start, u64 size); #else -static inline bool is_mem_section_removable(unsigned long pfn, - unsigned long nr_pages) -{ - return false; -} - static inline void try_offline_node(int nid) {} static inline int offline_pages(unsigned long start_pfn, unsigned long nr_pages) diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index 8907426e44d9..bfe8cd2a685f 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c @@ -1113,81 +1113,6 @@ int add_memory(int nid, u64 start, u64 size) EXPORT_SYMBOL_GPL(add_memory); #ifdef CONFIG_MEMORY_HOTREMOVE -/* - * A free page on the buddy free lists (not the per-cpu lists) has PageBuddy - * set and the size of the free page is given by page_order(). 
Using this, - the function determines if the pageblock contains only free pages. - Due to buddy contraints, a free page at least the size of a pageblock will - be located at the start of the pageblock - */ -static inline int pageblock_free(struct page *page) -{ - return PageBuddy(page) && page_order(page) >= pageblock_order; -} - -/* Return the pfn of the start of the next active pageblock after a given pfn */ -static unsigned long next_active_pageblock(unsigned long pfn) -{ - struct page *page = pfn_to_page(pfn); - - /* Ensure the starting page is pageblock-aligned */ - BUG_ON(pfn & (pageblock_nr_pages - 1)); - - /* If the entire pageblock is free, move to the end of free page */ - if (pageblock_free(page)) { - int order; - /* be careful. we don't have locks, page_order can be changed.*/ - order = page_order(page); - if ((order < MAX_ORDER) && (order >= pageblock_order)) - return pfn + (1 << order); - } - - return pfn + pageblock_nr_pages; -} - -static bool is_pageblock_removable_nolock(unsigned long pfn) -{ - struct page *page = pfn_to_page(pfn); - struct zone *zone; - - /* - * We have to be careful here because we are iterating over memory - * sections which are not zone aware so we might end up outside of - * the zone but still within the section. - * We have to take care about the node as well. If the node is offline - * its NODE_DATA will be NULL - see page_zone. - */ - if (!node_online(page_to_nid(page))) - return false; - - zone = page_zone(page); - pfn = page_to_pfn(page); - if (!zone_spans_pfn(zone, pfn)) - return false; - - return !has_unmovable_pages(zone, page, MIGRATE_MOVABLE, - MEMORY_OFFLINE); -} - -/* Checks if this range of memory is likely to be hot-removable. */ -bool is_mem_section_removable(unsigned long start_pfn, unsigned long nr_pages) -{ - unsigned long end_pfn, pfn; - - end_pfn = min(start_pfn + nr_pages, - zone_end_pfn(page_zone(pfn_to_page(start_pfn)))); - - /* Check the starting page of each pageblock within the range */ - for (pfn = start_pfn; pfn < end_pfn; pfn = next_active_pageblock(pfn)) { - if (!is_pageblock_removable_nolock(pfn)) - return false; - cond_resched(); - } - - /* All pageblocks in the memory block are likely to be hot-removable */ - return true; -} - /* * Confirm all pages in a range [start, end) belong to the same zone (skipping * memory holes). When true, return the zone. -- cgit v1.2.3 From 7b7b27214bba1966772f9213cd2d8e5d67f8487f Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Thu, 4 Jun 2020 16:48:41 -0700 Subject: mm/memory_hotplug: introduce add_memory_driver_managed() Patch series "mm/memory_hotplug: Interface to add driver-managed system ram", v4. kexec (via kexec_load()) can currently not properly handle memory added via dax/kmem, and will have similar issues with virtio-mem. kexec-tools will currently add all memory to the fixed-up initial firmware memmap. In case of dax/kmem, this means that - in contrast to a proper reboot - how that persistent memory will be used can no longer be configured by the kexec'd kernel. In case of virtio-mem it will be harmful, because that memory might contain inaccessible pieces that require coordination with the hypervisor first. In both cases, we want to let the driver in the kexec'd kernel handle detecting and adding the memory, like during an ordinary reboot. Introduce add_memory_driver_managed(). More on the semantics can be found in patch #1.
In the future, we might want to make this behavior configurable for dax/kmem- either by configuring it in the kernel (which would then also allow to configure kexec_file_load()) or in kexec-tools by also adding "System RAM (kmem)" memory from /proc/iomem to the fixed-up initial firmware memmap. More on the motivation can be found in [1] and [2]. [1] https://lkml.kernel.org/r/20200429160803.109056-1-david@redhat.com [2] https://lkml.kernel.org/r/20200430102908.10107-1-david@redhat.com This patch (of 3): Some device drivers rely on memory they managed to not get added to the initial (firmware) memmap as system RAM - so it's not used as initial system RAM by the kernel and the driver is under control. While this is the case during cold boot and after a reboot, kexec is not aware of that and might add such memory to the initial (firmware) memmap of the kexec kernel. We need ways to teach kernel and userspace that this system ram is different. For example, dax/kmem allows to decide at runtime if persistent memory is to be used as system ram. Another future user is virtio-mem, which has to coordinate with its hypervisor to deal with inaccessible parts within memory resources. We want to let users in the kernel (esp. kexec) but also user space (esp. kexec-tools) know that this memory has different semantics and needs to be handled differently: 1. Don't create entries in /sys/firmware/memmap/ 2. Name the memory resource "System RAM ($DRIVER)" (exposed via /proc/iomem) ($DRIVER might be "kmem", "virtio_mem"). 3. Flag the memory resource IORESOURCE_MEM_DRIVER_MANAGED /sys/firmware/memmap/ [1] represents the "raw firmware-provided memory map" because "on most architectures that firmware-provided memory map is modified afterwards by the kernel itself". The primary user is kexec on x86-64. Since commit d96ae5309165 ("memory-hotplug: create /sys/firmware/memmap entry for new memory"), we add all hotplugged memory to that firmware memmap - which makes perfect sense for traditional memory hotplug on x86-64, where real HW will also add hotplugged DIMMs to the firmware memmap. We replicate what the "raw firmware-provided memory map" looks like after hot(un)plug. To keep things simple, let the user provide the full resource name instead of only the driver name - this way, we don't have to manually allocate/craft strings for memory resources. Also use the resource name to make decisions, to avoid passing additional flags. In case the name isn't "System RAM", it's special. We don't have to worry about firmware_map_remove() on the removal path. If there is no entry, it will simply return with -EINVAL. We'll adapt dax/kmem in a follow-up patch. 
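For illustration only (not part of this series): a minimal sketch of how a driver in the dax/kmem mold might consume the new interface; the function name and range values here are hypothetical.

	/* Hypothetical driver probe path: expose a device-managed range as RAM. */
	static int example_probe_range(int nid, u64 start, u64 size)
	{
		int rc;

		/*
		 * The resource name must have the form "System RAM ($DRIVER)";
		 * anything else makes add_memory_driver_managed() return -EINVAL.
		 */
		rc = add_memory_driver_managed(nid, start, size, "System RAM (kmem)");
		if (rc)
			pr_err("adding driver-managed memory failed: %d\n", rc);
		return rc;
	}

The range then appears in /proc/iomem as "System RAM (kmem)", gets no /sys/firmware/memmap entry, and carries IORESOURCE_MEM_DRIVER_MANAGED so in-kernel users (esp. kexec) can special-case it.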
[1] https://www.kernel.org/doc/Documentation/ABI/testing/sysfs-firmware-memmap Signed-off-by: David Hildenbrand Signed-off-by: Andrew Morton Acked-by: Pankaj Gupta Cc: Michal Hocko Cc: Pankaj Gupta Cc: Wei Yang Cc: Baoquan He Cc: Dave Hansen Cc: Eric Biederman Cc: Pavel Tatashin Cc: Dan Williams Link: http://lkml.kernel.org/r/20200508084217.9160-1-david@redhat.com Link: http://lkml.kernel.org/r/20200508084217.9160-3-david@redhat.com Signed-off-by: Linus Torvalds --- include/linux/ioport.h | 1 + include/linux/memory_hotplug.h | 2 ++ mm/memory_hotplug.c | 62 +++++++++++++++++++++++++++++++++++++++--- 3 files changed, 61 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/ioport.h b/include/linux/ioport.h index a9b9170b5dd2..cc9a5b4593ca 100644 --- a/include/linux/ioport.h +++ b/include/linux/ioport.h @@ -103,6 +103,7 @@ struct resource { #define IORESOURCE_MEM_32BIT (3<<3) #define IORESOURCE_MEM_SHADOWABLE (1<<5) /* dup: IORESOURCE_SHADOWABLE */ #define IORESOURCE_MEM_EXPANSIONROM (1<<6) +#define IORESOURCE_MEM_DRIVER_MANAGED (1<<7) /* PnP I/O specific bits (IORESOURCE_BITS) */ #define IORESOURCE_IO_16BIT_ADDR (1<<0) diff --git a/include/linux/memory_hotplug.h b/include/linux/memory_hotplug.h index 7dca9cd6076b..fee7fab5d706 100644 --- a/include/linux/memory_hotplug.h +++ b/include/linux/memory_hotplug.h @@ -342,6 +342,8 @@ extern void __ref free_area_init_core_hotplug(int nid); extern int __add_memory(int nid, u64 start, u64 size); extern int add_memory(int nid, u64 start, u64 size); extern int add_memory_resource(int nid, struct resource *resource); +extern int add_memory_driver_managed(int nid, u64 start, u64 size, + const char *resource_name); extern void move_pfn_range_to_zone(struct zone *zone, unsigned long start_pfn, unsigned long nr_pages, struct vmem_altmap *altmap); extern void remove_pfn_range_from_zone(struct zone *zone, diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index 21bc3363a829..c82722c3fe32 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c @@ -98,11 +98,14 @@ void mem_hotplug_done(void) u64 max_mem_size = U64_MAX; /* add this memory to iomem resource */ -static struct resource *register_memory_resource(u64 start, u64 size) +static struct resource *register_memory_resource(u64 start, u64 size, + const char *resource_name) { struct resource *res; unsigned long flags = IORESOURCE_SYSTEM_RAM | IORESOURCE_BUSY; - char *resource_name = "System RAM"; + + if (strcmp(resource_name, "System RAM")) + flags |= IORESOURCE_MEM_DRIVER_MANAGED; /* * Make sure value parsed from 'mem=' only restricts memory adding @@ -1057,7 +1060,8 @@ int __ref add_memory_resource(int nid, struct resource *res) BUG_ON(ret); /* create new memmap entry */ - firmware_map_add_hotplug(start, start + size, "System RAM"); + if (!strcmp(res->name, "System RAM")) + firmware_map_add_hotplug(start, start + size, "System RAM"); /* device_online() will take the lock when calling online_pages() */ mem_hotplug_done(); @@ -1083,7 +1087,7 @@ int __ref __add_memory(int nid, u64 start, u64 size) struct resource *res; int ret; - res = register_memory_resource(start, size); + res = register_memory_resource(start, size, "System RAM"); if (IS_ERR(res)) return PTR_ERR(res); @@ -1105,6 +1109,56 @@ int add_memory(int nid, u64 start, u64 size) } EXPORT_SYMBOL_GPL(add_memory); +/* + * Add special, driver-managed memory to the system as system RAM. 
Such + * memory is not exposed via the raw firmware-provided memmap as system + * RAM, instead, it is detected and added by a driver - during cold boot, + * after a reboot, and after kexec. + * + * Reasons why this memory should not be used for the initial memmap of a + * kexec kernel or for placing kexec images: + * - The booting kernel is in charge of determining how this memory will be + * used (e.g., use persistent memory as system RAM) + * - Coordination with a hypervisor is required before this memory + * can be used (e.g., inaccessible parts). + * + * For this memory, no entries in /sys/firmware/memmap ("raw firmware-provided + * memory map") are created. Also, the created memory resource is flagged + * with IORESOURCE_MEM_DRIVER_MANAGED, so in-kernel users can special-case + * this memory as well (esp., not place kexec images onto it). + * + * The resource_name (visible via /proc/iomem) has to have the format + * "System RAM ($DRIVER)". + */ +int add_memory_driver_managed(int nid, u64 start, u64 size, + const char *resource_name) +{ + struct resource *res; + int rc; + + if (!resource_name || + strstr(resource_name, "System RAM (") != resource_name || + resource_name[strlen(resource_name) - 1] != ')') + return -EINVAL; + + lock_device_hotplug(); + + res = register_memory_resource(start, size, resource_name); + if (IS_ERR(res)) { + rc = PTR_ERR(res); + goto out_unlock; + } + + rc = add_memory_resource(nid, res); + if (rc < 0) + release_memory_resource(res); + +out_unlock: + unlock_device_hotplug(); + return rc; +} +EXPORT_SYMBOL_GPL(add_memory_driver_managed); + #ifdef CONFIG_MEMORY_HOTREMOVE /* * Confirm all pages in a range [start, end) belong to the same zone (skipping -- cgit v1.2.3 From 57e86fa16a707db9e05dec77eb88a398f05a022b Mon Sep 17 00:00:00 2001 From: chenqiwu Date: Thu, 4 Jun 2020 16:48:55 -0700 Subject: mm: replace zero-length array with flexible-array member The current codebase makes use of the zero-length array language extension to the C90 standard, but the preferred mechanism to declare variable-length types such as these ones is a flexible array member[1][2], introduced in C99: struct foo { int stuff; struct boo array[]; }; By making use of the mechanism above, we will get a compiler warning in case the flexible array does not occur last in the structure, which will help us prevent some kind of undefined behavior bugs from being inadvertently introduced[3] to the codebase from now on. Also, notice that, dynamic memory allocations won't be affected by this change: "Flexible array members have incomplete type, and so the sizeof operator may not be applied. As a quirk of the original implementation of zero-length arrays, sizeof evaluates to zero."[1] This issue was found with the help of Coccinelle. 
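As a standalone sketch of the conversion pattern (struct names are illustrative, not from the tree):

	/* Before: zero-length array, a GCC extension. */
	struct foo_old {
		int stuff;
		void *ptrs[0];
	};

	/* After: C99 flexible array member. */
	struct foo {
		int stuff;
		void *ptrs[];
	};

	/*
	 * Allocations stay the same: sizeof(struct foo) excludes the trailing
	 * array in both variants, so
	 *	kmalloc(sizeof(struct foo) + n * sizeof(void *), GFP_KERNEL)
	 * still reserves room for n trailing elements, while the flexible
	 * array lets the compiler warn if it is not the last member.
	 */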
[1] https://gcc.gnu.org/onlinedocs/gcc/Zero-Length.html [2] https://github.com/KSPP/linux/issues/21 [3] commit 76497732932f ("cxgb3/l2t: Fix undefined behaviour") [akpm@linux-foundation.org: fix build] Signed-off-by: chenqiwu Signed-off-by: Andrew Morton Reviewed-by: Andrew Morton Reviewed-by: Wei Yang Cc: Matthew Wilcox Cc: David Hildenbrand Cc: Michal Hocko Cc: Pankaj Gupta Cc: Yang Shi Cc: Qian Cai Cc: Baoquan He Link: http://lkml.kernel.org/r/1586599916-15456-1-git-send-email-qiwuchen55@gmail.com Signed-off-by: Linus Torvalds --- include/linux/mm.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/mm.h b/include/linux/mm.h index 5bfc36320e3c..1744081a34d4 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -1726,7 +1726,7 @@ struct frame_vector { unsigned int nr_frames; /* Number of frames stored in ptrs array */ bool got_ref; /* Did we pin pages by getting page ref? */ bool is_pfns; /* Does array contain pages or pfns? */ - void *ptrs[0]; /* Array of pinned pfns / pages. Use + void *ptrs[]; /* Array of pinned pfns / pages. Use * pfns_vector_pages() or pfns_vector_pfns() * for access */ }; -- cgit v1.2.3 From 2b7874490243e014112100925405c4a17a8c40aa Mon Sep 17 00:00:00 2001 From: Jason Yan Date: Thu, 4 Jun 2020 16:49:49 -0700 Subject: include/linux/mm.h: return true in cpupid_pid_unset() Fix the following coccicheck warning: include/linux/mm.h:1371:8-9: WARNING: return of 0/1 in function 'cpupid_pid_unset' with return type bool Signed-off-by: Jason Yan Signed-off-by: Andrew Morton Reviewed-by: Andrew Morton Link: http://lkml.kernel.org/r/20200422071816.48879-1-yanaijie@huawei.com Signed-off-by: Linus Torvalds --- include/linux/mm.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/mm.h b/include/linux/mm.h index 1744081a34d4..86adc71a972f 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -1373,7 +1373,7 @@ static inline int cpu_pid_to_cpupid(int nid, int pid) static inline bool cpupid_pid_unset(int cpupid) { - return 1; + return true; } static inline void page_cpupid_reset_last(struct page *page) -- cgit v1.2.3 From bd93f003b7462ae39a43c531abca37fe7073b866 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Thu, 4 Jun 2020 16:50:30 -0700 Subject: include/linux/bitops.h: avoid clang shift-count-overflow warnings Clang normally does not warn about certain issues in inline functions when it only happens in an eliminated code path. However if something else goes wrong, it does tend to complain about the definition of hweight_long() on 32-bit targets: include/linux/bitops.h:75:41: error: shift count >= width of type [-Werror,-Wshift-count-overflow] return sizeof(w) == 4 ? hweight32(w) : hweight64(w); ^~~~~~~~~~~~ include/asm-generic/bitops/const_hweight.h:29:49: note: expanded from macro 'hweight64' define hweight64(w) (__builtin_constant_p(w) ? 
__const_hweight64(w) : __arch_hweight64(w)) ^~~~~~~~~~~~~~~~~~~~ include/asm-generic/bitops/const_hweight.h:21:76: note: expanded from macro '__const_hweight64' define __const_hweight64(w) (__const_hweight32(w) + __const_hweight32((w) >> 32)) ^ ~~ include/asm-generic/bitops/const_hweight.h:20:49: note: expanded from macro '__const_hweight32' define __const_hweight32(w) (__const_hweight16(w) + __const_hweight16((w) >> 16)) ^ include/asm-generic/bitops/const_hweight.h:19:72: note: expanded from macro '__const_hweight16' define __const_hweight16(w) (__const_hweight8(w) + __const_hweight8((w) >> 8 )) ^ include/asm-generic/bitops/const_hweight.h:12:9: note: expanded from macro '__const_hweight8' (!!((w) & (1ULL << 2))) + \ Adding an explicit cast to __u64 avoids that warning and makes it easier to read other output. Signed-off-by: Arnd Bergmann Signed-off-by: Andrew Morton Acked-by: Christian Brauner Cc: Andy Shevchenko Cc: Rasmus Villemoes Cc: Josh Poimboeuf Cc: Nick Desaulniers Link: http://lkml.kernel.org/r/20200505135513.65265-1-arnd@arndb.de Signed-off-by: Linus Torvalds --- include/linux/bitops.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/bitops.h b/include/linux/bitops.h index 9acf654f0b19..99f2ac30b1d9 100644 --- a/include/linux/bitops.h +++ b/include/linux/bitops.h @@ -72,7 +72,7 @@ static inline int get_bitmask_order(unsigned int count) static __always_inline unsigned long hweight_long(unsigned long w) { - return sizeof(w) == 4 ? hweight32(w) : hweight64(w); + return sizeof(w) == 4 ? hweight32(w) : hweight64((__u64)w); } /** -- cgit v1.2.3 From 51da9dfb7f20911ae4e79e9b412a9c2d4c373d4b Mon Sep 17 00:00:00 2001 From: Nick Desaulniers Date: Thu, 4 Jun 2020 16:50:49 -0700 Subject: elfnote: mark all .note sections SHF_ALLOC ELFNOTE_START allows callers to specify flags for .pushsection assembler directives. All callsites but ELF_NOTE use "a" for SHF_ALLOC. For vdso's that explicitly use ELF_NOTE_START and BUILD_SALT, the same section is specified twice after preprocessing, once with "a" flag, once without. Example: .pushsection .note.Linux, "a", @note ; .pushsection .note.Linux, "", @note ; While GNU as allows this ordering, it warns for the opposite ordering, making these directives position dependent. We'd prefer not to precisely match this behavior in Clang's integrated assembler. Instead, the non __ASSEMBLY__ definition of ELF_NOTE uses __attribute__((section(".note.Linux"))) which is created with SHF_ALLOC, so let's make the __ASSEMBLY__ definition of ELF_NOTE consistent with C and just always use "a" flag. 
This allows Clang to assemble a working mainline (5.6) kernel via: $ make CC=clang AS=clang Signed-off-by: Nick Desaulniers Signed-off-by: Andrew Morton Reviewed-by: Nathan Chancellor Reviewed-by: Fangrui Song Cc: Jeremy Fitzhardinge Cc: Thomas Gleixner Cc: Vincenzo Frascino Link: https://github.com/ClangBuiltLinux/linux/issues/913 Link: http://lkml.kernel.org/r/20200325231250.99205-1-ndesaulniers@google.com Debugged-by: Ilie Halip Signed-off-by: Linus Torvalds --- include/linux/elfnote.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/elfnote.h b/include/linux/elfnote.h index 594d4e78654f..69b136e4dd2b 100644 --- a/include/linux/elfnote.h +++ b/include/linux/elfnote.h @@ -54,7 +54,7 @@ .popsection ; #define ELFNOTE(name, type, desc) \ - ELFNOTE_START(name, type, "") \ + ELFNOTE_START(name, type, "a") \ desc ; \ ELFNOTE_END -- cgit v1.2.3 From d2c0e6e91c7990c67921005f44f9b2b326ff2906 Mon Sep 17 00:00:00 2001 From: Kefeng Wang Date: Thu, 4 Jun 2020 16:51:05 -0700 Subject: include/linux/seq_file.h: introduce DEFINE_SEQ_ATTRIBUTE() helper macro Patch series "seq_file: Introduce DEFINE_SEQ_ATTRIBUTE() helper macro". As discussed in https://lore.kernel.org/lkml/20191129222310.GA3712618@kroah.com/, we could introduce a new helper macro to reduce lots of boilerplate code; vmstat and kprobes are examples which were converted to use it. If this is accepted, I will send out more cleanups. This patch (of 3): Introduce DEFINE_SEQ_ATTRIBUTE() helper macro to decrease code duplication. [akpm@linux-foundation.org: coding style fixes] Signed-off-by: Kefeng Wang Signed-off-by: Andrew Morton Cc: Greg KH Cc: Ingo Molnar Cc: Kefeng Wang Cc: Anil S Keshavamurthy Cc: "David S. Miller" Cc: Masami Hiramatsu Cc: Al Viro Link: http://lkml.kernel.org/r/20200509064031.181091-1-wangkefeng.wang@huawei.com Link: http://lkml.kernel.org/r/20200509064031.181091-2-wangkefeng.wang@huawei.com Signed-off-by: Linus Torvalds --- include/linux/seq_file.h | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) (limited to 'include/linux') diff --git a/include/linux/seq_file.h b/include/linux/seq_file.h index 1672cf6f7614..813614d4b71f 100644 --- a/include/linux/seq_file.h +++ b/include/linux/seq_file.h @@ -145,6 +145,25 @@ void *__seq_open_private(struct file *, const struct seq_operations *, int); int seq_open_private(struct file *, const struct seq_operations *, int); int seq_release_private(struct inode *, struct file *); +#define DEFINE_SEQ_ATTRIBUTE(__name) \ +static int __name ## _open(struct inode *inode, struct file *file) \ +{ \ + int ret = seq_open(file, &__name ## _sops); \ + if (!ret && inode->i_private) { \ + struct seq_file *seq_f = file->private_data; \ + seq_f->private = inode->i_private; \ + } \ + return ret; \ +} \ + \ +static const struct file_operations __name ## _fops = { \ + .owner = THIS_MODULE, \ + .open = __name ## _open, \ + .read = seq_read, \ + .llseek = seq_lseek, \ + .release = seq_release, \ +} + #define DEFINE_SHOW_ATTRIBUTE(__name) \ static int __name ## _open(struct inode *inode, struct file *file) \ { \ -- cgit v1.2.3 From 986db2d14a6dca6456b63b4f5c410ae2aab4ec9d Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 4 Jun 2020 16:51:14 -0700 Subject: exec: simplify the copy_strings_kernel calling convention copy_strings_kernel is always used with a single argument, adjust the calling convention to that.
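As a sketch of the resulting pattern (this only restates the hunks below, nothing new): each call site collapses from a one-element argv to a direct call.

	/* Before: wrap the single string in a one-element "argv". */
	retval = copy_strings_kernel(1, &bprm->filename, bprm);

	/* After: pass the string itself. */
	retval = copy_string_kernel(bprm->filename, bprm);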
Signed-off-by: Christoph Hellwig Signed-off-by: Andrew Morton Cc: Alexander Viro Link: http://lkml.kernel.org/r/20200501104105.2621149-2-hch@lst.de Signed-off-by: Linus Torvalds --- fs/binfmt_em86.c | 6 +++--- fs/binfmt_misc.c | 4 ++-- fs/binfmt_script.c | 6 +++--- fs/exec.c | 13 ++++++------- include/linux/binfmts.h | 3 +-- 5 files changed, 15 insertions(+), 17 deletions(-) (limited to 'include/linux') diff --git a/fs/binfmt_em86.c b/fs/binfmt_em86.c index 466497860c62..f33fa668c91f 100644 --- a/fs/binfmt_em86.c +++ b/fs/binfmt_em86.c @@ -68,15 +68,15 @@ static int load_em86(struct linux_binprm *bprm) * user environment and arguments are stored. */ remove_arg_zero(bprm); - retval = copy_strings_kernel(1, &bprm->filename, bprm); + retval = copy_string_kernel(bprm->filename, bprm); if (retval < 0) return retval; bprm->argc++; if (i_arg) { - retval = copy_strings_kernel(1, &i_arg, bprm); + retval = copy_string_kernel(i_arg, bprm); if (retval < 0) return retval; bprm->argc++; } - retval = copy_strings_kernel(1, &i_name, bprm); + retval = copy_string_kernel(i_name, bprm); if (retval < 0) return retval; bprm->argc++; diff --git a/fs/binfmt_misc.c b/fs/binfmt_misc.c index cdb45829354d..b15257d8ff5e 100644 --- a/fs/binfmt_misc.c +++ b/fs/binfmt_misc.c @@ -190,13 +190,13 @@ static int load_misc_binary(struct linux_binprm *bprm) bprm->file = NULL; } /* make argv[1] be the path to the binary */ - retval = copy_strings_kernel(1, &bprm->interp, bprm); + retval = copy_string_kernel(bprm->interp, bprm); if (retval < 0) goto error; bprm->argc++; /* add the interp as argv[0] */ - retval = copy_strings_kernel(1, &fmt->interpreter, bprm); + retval = copy_string_kernel(fmt->interpreter, bprm); if (retval < 0) goto error; bprm->argc++; diff --git a/fs/binfmt_script.c b/fs/binfmt_script.c index e9e6a6f4a35f..c4fb7f52a46e 100644 --- a/fs/binfmt_script.c +++ b/fs/binfmt_script.c @@ -117,17 +117,17 @@ static int load_script(struct linux_binprm *bprm) retval = remove_arg_zero(bprm); if (retval) return retval; - retval = copy_strings_kernel(1, &bprm->interp, bprm); + retval = copy_string_kernel(bprm->interp, bprm); if (retval < 0) return retval; bprm->argc++; if (i_arg) { - retval = copy_strings_kernel(1, &i_arg, bprm); + retval = copy_string_kernel(i_arg, bprm); if (retval < 0) return retval; bprm->argc++; } - retval = copy_strings_kernel(1, &i_name, bprm); + retval = copy_string_kernel(i_name, bprm); if (retval) return retval; bprm->argc++; diff --git a/fs/exec.c b/fs/exec.c index 2c465119affc..4814b26a56fb 100644 --- a/fs/exec.c +++ b/fs/exec.c @@ -588,24 +588,23 @@ out: } /* - * Like copy_strings, but get argv and its values from kernel memory. + * Copy and argument/environment string from the kernel to the processes stack. 
*/ -int copy_strings_kernel(int argc, const char *const *__argv, - struct linux_binprm *bprm) +int copy_string_kernel(const char *arg, struct linux_binprm *bprm) { int r; mm_segment_t oldfs = get_fs(); struct user_arg_ptr argv = { - .ptr.native = (const char __user *const __user *)__argv, + .ptr.native = (const char __user *const __user *)&arg, }; set_fs(KERNEL_DS); - r = copy_strings(argc, argv, bprm); + r = copy_strings(1, argv, bprm); set_fs(oldfs); return r; } -EXPORT_SYMBOL(copy_strings_kernel); +EXPORT_SYMBOL(copy_string_kernel); #ifdef CONFIG_MMU @@ -1865,7 +1864,7 @@ static int __do_execve_file(int fd, struct filename *filename, if (retval < 0) goto out; - retval = copy_strings_kernel(1, &bprm->filename, bprm); + retval = copy_string_kernel(bprm->filename, bprm); if (retval < 0) goto out; diff --git a/include/linux/binfmts.h b/include/linux/binfmts.h index a345d9fed3d8..3d3afe094c97 100644 --- a/include/linux/binfmts.h +++ b/include/linux/binfmts.h @@ -144,8 +144,7 @@ extern int setup_arg_pages(struct linux_binprm * bprm, extern int transfer_args_to_stack(struct linux_binprm *bprm, unsigned long *sp_location); extern int bprm_change_interp(const char *interp, struct linux_binprm *bprm); -extern int copy_strings_kernel(int argc, const char *const *argv, - struct linux_binprm *bprm); +int copy_string_kernel(const char *arg, struct linux_binprm *bprm); extern void install_exec_creds(struct linux_binprm *bprm); extern void set_binfmt(struct linux_binfmt *new); extern ssize_t read_code(struct file *, unsigned long, loff_t, size_t); -- cgit v1.2.3 From 5872f1a2e5c783783d51e96468f0ff6aede61182 Mon Sep 17 00:00:00 2001 From: Will Deacon Date: Mon, 11 May 2020 21:59:51 +0100 Subject: READ_ONCE: Fix comment describing 2x32-bit atomicity READ_ONCE() permits 64-bit accesses on 32-bit architectures, since this crops up in a few places and is generally harmless because either the upper bits are always zero (e.g. for a virtual address or 32-bit time_t) or the architecture provides 64-bit atomicity anyway. Update the corresponding comment above compiletime_assert_rwonce_type(), which incorrectly states that 32-bit x86 provides 64-bit atomicity, and instead reference 32-bit Armv7 with LPAE. Cc: Thomas Gleixner Cc: Peter Zijlstra Reported-by: Jann Horn Signed-off-by: Will Deacon --- include/linux/compiler.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/compiler.h b/include/linux/compiler.h index c363d8debc43..657e4fd38a77 100644 --- a/include/linux/compiler.h +++ b/include/linux/compiler.h @@ -332,9 +332,9 @@ static inline void *offset_to_ptr(const int *off) /* * Yes, this permits 64-bit accesses on 32-bit architectures. These will - * actually be atomic in many cases (namely x86), but for others we rely on - * the access being split into 2x32-bit accesses for a 32-bit quantity (e.g. - * a virtual address) and a strong prevailing wind. + * actually be atomic in some cases (namely Armv7 + LPAE), but for others we + * rely on the access being split into 2x32-bit accesses for a 32-bit quantity + * (e.g. a virtual address) and a strong prevailing wind. 
*/ #define compiletime_assert_rwonce_type(t) \ compiletime_assert(__native_word(t) || sizeof(t) == sizeof(long long), \ -- cgit v1.2.3 From 8d4beed7bbc71666de2630b79899c8852c3bf5cd Mon Sep 17 00:00:00 2001 From: Will Deacon Date: Fri, 5 Jun 2020 11:05:51 +0100 Subject: compiler-types.h: Include naked type in __pick_integer_type() match __pick_integer_type() checks whether the type of its first argument is compatible with an explicitly signed or unsigned integer type, returning the compatible type if it exists. Unfortunately, 'char' is neither compatible with 'signed char' nor 'unsigned char', so add a check against the naked type to allow the __unqual_scalar_typeof() macro to strip qualifiers from char types without an explicit signedness. Reported-by: Rasmus Villemoes Signed-off-by: Will Deacon --- include/linux/compiler_types.h | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/compiler_types.h b/include/linux/compiler_types.h index 233066c92f6f..6ed0612bc143 100644 --- a/include/linux/compiler_types.h +++ b/include/linux/compiler_types.h @@ -220,9 +220,14 @@ struct ftrace_likely_data { #define __pick_scalar_type(x, type, otherwise) \ __builtin_choose_expr(__same_type(x, type), (type)0, otherwise) +/* + * 'char' is not type-compatible with either 'signed char' or 'unsigned char', + * so we include the naked type here as well as the signed/unsigned variants. + */ #define __pick_integer_type(x, type, otherwise) \ - __pick_scalar_type(x, unsigned type, \ - __pick_scalar_type(x, signed type, otherwise)) + __pick_scalar_type(x, type, \ + __pick_scalar_type(x, unsigned type, \ + __pick_scalar_type(x, signed type, otherwise))) #define __unqual_scalar_typeof(x) typeof( \ __pick_integer_type(x, char, \ -- cgit v1.2.3 From b16d8ecf4fa17e16fff20638364f9bd2205615e7 Mon Sep 17 00:00:00 2001 From: Will Deacon Date: Fri, 5 Jun 2020 11:19:46 +0100 Subject: compiler.h: Enforce that READ_ONCE_NOCHECK() access size is sizeof(long) READ_ONCE_NOCHECK() unconditionally performs a sizeof(long)-sized access, so enforce that the size of the pointed-to object that we are loading from is the same size as 'long'. Reported-by: Marco Elver Signed-off-by: Will Deacon --- include/linux/compiler.h | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/compiler.h b/include/linux/compiler.h index 657e4fd38a77..a0aa56e6b782 100644 --- a/include/linux/compiler.h +++ b/include/linux/compiler.h @@ -254,9 +254,12 @@ unsigned long __read_once_word_nocheck(const void *addr) */ #define READ_ONCE_NOCHECK(x) \ ({ \ - unsigned long __x = __read_once_word_nocheck(&(x)); \ + unsigned long __x; \ + compiletime_assert(sizeof(x) == sizeof(__x), \ + "Unsupported access size for READ_ONCE_NOCHECK()."); \ + __x = __read_once_word_nocheck(&(x)); \ smp_read_barrier_depends(); \ - __x; \ + (typeof(x))__x; \ }) static __no_kasan_or_inline -- cgit v1.2.3 From 1fd76043ecb04b8567a76b106db09ac778e1e2b8 Mon Sep 17 00:00:00 2001 From: Marco Elver Date: Wed, 27 May 2020 12:32:36 +0200 Subject: compiler_types.h: Optimize __unqual_scalar_typeof compilation time If the compiler supports C11's _Generic, use it to speed up compilation times of __unqual_scalar_typeof(). GCC version 4.9 or later and all supported versions of Clang support the feature (the oldest supported compiler that doesn't support _Generic is GCC 4.8, for which we use the slower alternative). 
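As a hedged, standalone illustration of the _Generic approach (userspace C11, reduced to the int case only; the kernel macro additionally covers char through long long and a default arm):

	#include <stdio.h>

	/*
	 * _Generic's controlling expression undergoes lvalue conversion, which
	 * drops qualifiers, so a const volatile int selects the plain int arm
	 * and typeof() of the chosen value is an unqualified int.
	 */
	#define unqual_int_typeof(x) typeof(_Generic((x),	\
		signed int: (signed int)0,			\
		unsigned int: (unsigned int)0,			\
		default: (x)))

	int main(void)
	{
		const volatile int x = 42;
		unqual_int_typeof(x) y;	/* plain int, freely assignable */

		y = x + 1;
		printf("%d\n", y);
		return 0;
	}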
The non-_Generic variant relies on multiple expansions of __pick_integer_type -> __pick_scalar_type -> __builtin_choose_expr, which increases pre-processed code size, and can cause compile times to increase in files with numerous expansions of READ_ONCE(), or other users of __unqual_scalar_typeof(). Summary of compile-time benchmarking done by Arnd Bergmann: clang-11 gcc-9 this patch 0.78 0.91 ideal 0.76 0.86 See https://lkml.kernel.org/r/CAK8P3a3UYQeXhiufUevz=rwe09WM_vSTCd9W+KvJHJcOeQyWVA@mail.gmail.com Further compile-testing done with: gcc 4.8, 4.9, 5.5, 6.4, 7.5, 8.4; clang 9, 10. Reported-by: Arnd Bergmann Signed-off-by: Marco Elver Signed-off-by: Borislav Petkov Acked-by: Peter Zijlstra Tested-by: Arnd Bergmann Link: https://lkml.kernel.org/r/20200527103236.148700-1-elver@google.com Link: https://lkml.kernel.org/r/CAK8P3a0RJtbVi1JMsfik=jkHCNFv+DJn_FeDg-YLW+ueQW3tNg@mail.gmail.com [will: tweak new macros to make them a bit more readable] Signed-off-by: Will Deacon --- include/linux/compiler_types.h | 23 ++++++++++++++++++++++- 1 file changed, 22 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/compiler_types.h b/include/linux/compiler_types.h index 6ed0612bc143..31416b60eabf 100644 --- a/include/linux/compiler_types.h +++ b/include/linux/compiler_types.h @@ -213,7 +213,9 @@ struct ftrace_likely_data { /* * __unqual_scalar_typeof(x) - Declare an unqualified scalar type, leaving * non-scalar types unchanged. - * + */ +#if defined(CONFIG_CC_IS_GCC) && CONFIG_GCC_VERSION < 40900 +/* * We build this out of a couple of helper macros in a vain attempt to * help you keep your lunch down while reading it. */ @@ -235,6 +237,25 @@ struct ftrace_likely_data { __pick_integer_type(x, int, \ __pick_integer_type(x, long, \ __pick_integer_type(x, long long, x)))))) +#else +/* + * If supported, prefer C11 _Generic for better compile-times. As above, 'char' + * is not type-compatible with 'signed char', and we define a separate case. + */ +#define __scalar_type_to_expr_cases(type) \ + unsigned type: (unsigned type)0, \ + signed type: (signed type)0 + +#define __unqual_scalar_typeof(x) typeof( \ + _Generic((x), \ + char: (char)0, \ + __scalar_type_to_expr_cases(char), \ + __scalar_type_to_expr_cases(short), \ + __scalar_type_to_expr_cases(int), \ + __scalar_type_to_expr_cases(long), \ + __scalar_type_to_expr_cases(long long), \ + default: (x))) +#endif /* Is this type a native word size -- useful for atomic operations */ #define __native_word(t) \ -- cgit v1.2.3 From b398ace5d2ea0b7f00d9f1ce23c647e289c206ca Mon Sep 17 00:00:00 2001 From: Marco Elver Date: Thu, 28 May 2020 09:43:13 +0200 Subject: compiler_types.h: Use unoptimized __unqual_scalar_typeof for sparse If the file is being checked with sparse, use the unoptimized version of __unqual_scalar_typeof(), since sparse does not support _Generic. Reported-by: kbuild test robot Signed-off-by: Marco Elver Signed-off-by: Borislav Petkov Link: https://lkml.kernel.org/r/202005280727.lXn1VnTw%lkp@intel.com Signed-off-by: Will Deacon --- include/linux/compiler_types.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/compiler_types.h b/include/linux/compiler_types.h index 31416b60eabf..cd73e3857a87 100644 --- a/include/linux/compiler_types.h +++ b/include/linux/compiler_types.h @@ -214,7 +214,7 @@ struct ftrace_likely_data { * __unqual_scalar_typeof(x) - Declare an unqualified scalar type, leaving * non-scalar types unchanged. 
*/ -#if defined(CONFIG_CC_IS_GCC) && CONFIG_GCC_VERSION < 40900 +#if (defined(CONFIG_CC_IS_GCC) && CONFIG_GCC_VERSION < 40900) || defined(__CHECKER__) /* * We build this out of a couple of helper macros in a vain attempt to * help you keep your lunch down while reading it. -- cgit v1.2.3 From cf6fada71543ceea0f6228ffdc0b85778f3f5a6e Mon Sep 17 00:00:00 2001 From: Xiongfeng Wang Date: Sat, 30 May 2020 10:08:30 +0800 Subject: cpufreq: change '.set_boost' to act on one policy Macro 'for_each_active_policy()' is defined internally. To avoid some cpufreq driver needing this macro to iterate over all the policies in '.set_boost' callback, we redefine '.set_boost' to act on only one policy and pass the policy as an argument. 'cpufreq_boost_trigger_state()' iterates over all the policies to set boost for the system. This is preparation for adding SW BOOST support for CPPC. To protect Boost enable/disable by sysfs from CPU online/offline, add 'cpu_hotplug_lock' before calling '.set_boost' for each CPU. Also move the lock from 'set_boost()' to 'store_cpb()' in acpi_cpufreq. Signed-off-by: Xiongfeng Wang Suggested-by: Viresh Kumar Acked-by: Viresh Kumar [ rjw: Subject & changelog ] Signed-off-by: Rafael J. Wysocki --- drivers/cpufreq/acpi-cpufreq.c | 14 ++++++----- drivers/cpufreq/cpufreq.c | 57 +++++++++++++++++++++++------------------- include/linux/cpufreq.h | 2 +- 3 files changed, 40 insertions(+), 33 deletions(-) (limited to 'include/linux') diff --git a/drivers/cpufreq/acpi-cpufreq.c b/drivers/cpufreq/acpi-cpufreq.c index 289e8ce3fd13..429e5a36c08a 100644 --- a/drivers/cpufreq/acpi-cpufreq.c +++ b/drivers/cpufreq/acpi-cpufreq.c @@ -126,12 +126,12 @@ static void boost_set_msr_each(void *p_en) boost_set_msr(enable); } -static int set_boost(int val) +static int set_boost(struct cpufreq_policy *policy, int val) { - get_online_cpus(); - on_each_cpu(boost_set_msr_each, (void *)(long)val, 1); - put_online_cpus(); - pr_debug("Core Boosting %sabled.\n", val ? "en" : "dis"); + on_each_cpu_mask(policy->cpus, boost_set_msr_each, + (void *)(long)val, 1); + pr_debug("CPU %*pbl: Core Boosting %sabled.\n", + cpumask_pr_args(policy->cpus), val ? 
"en" : "dis"); return 0; } @@ -162,7 +162,9 @@ static ssize_t store_cpb(struct cpufreq_policy *policy, const char *buf, if (ret || val > 1) return -EINVAL; - set_boost(val); + get_online_cpus(); + set_boost(policy, val); + put_online_cpus(); return count; } diff --git a/drivers/cpufreq/cpufreq.c b/drivers/cpufreq/cpufreq.c index d03f250f68e4..0128de3603df 100644 --- a/drivers/cpufreq/cpufreq.c +++ b/drivers/cpufreq/cpufreq.c @@ -2532,34 +2532,29 @@ EXPORT_SYMBOL_GPL(cpufreq_update_limits); /********************************************************************* * BOOST * *********************************************************************/ -static int cpufreq_boost_set_sw(int state) +static int cpufreq_boost_set_sw(struct cpufreq_policy *policy, int state) { - struct cpufreq_policy *policy; - - for_each_active_policy(policy) { - int ret; + int ret; - if (!policy->freq_table) - return -ENXIO; + if (!policy->freq_table) + return -ENXIO; - ret = cpufreq_frequency_table_cpuinfo(policy, - policy->freq_table); - if (ret) { - pr_err("%s: Policy frequency update failed\n", - __func__); - return ret; - } - - ret = freq_qos_update_request(policy->max_freq_req, policy->max); - if (ret < 0) - return ret; + ret = cpufreq_frequency_table_cpuinfo(policy, policy->freq_table); + if (ret) { + pr_err("%s: Policy frequency update failed\n", __func__); + return ret; } + ret = freq_qos_update_request(policy->max_freq_req, policy->max); + if (ret < 0) + return ret; + return 0; } int cpufreq_boost_trigger_state(int state) { + struct cpufreq_policy *policy; unsigned long flags; int ret = 0; @@ -2570,15 +2565,25 @@ int cpufreq_boost_trigger_state(int state) cpufreq_driver->boost_enabled = state; write_unlock_irqrestore(&cpufreq_driver_lock, flags); - ret = cpufreq_driver->set_boost(state); - if (ret) { - write_lock_irqsave(&cpufreq_driver_lock, flags); - cpufreq_driver->boost_enabled = !state; - write_unlock_irqrestore(&cpufreq_driver_lock, flags); - - pr_err("%s: Cannot %s BOOST\n", - __func__, state ? "enable" : "disable"); + get_online_cpus(); + for_each_active_policy(policy) { + ret = cpufreq_driver->set_boost(policy, state); + if (ret) + goto err_reset_state; } + put_online_cpus(); + + return 0; + +err_reset_state: + put_online_cpus(); + + write_lock_irqsave(&cpufreq_driver_lock, flags); + cpufreq_driver->boost_enabled = !state; + write_unlock_irqrestore(&cpufreq_driver_lock, flags); + + pr_err("%s: Cannot %s BOOST\n", + __func__, state ? "enable" : "disable"); return ret; } diff --git a/include/linux/cpufreq.h b/include/linux/cpufreq.h index 67d5950bd878..3494f6763597 100644 --- a/include/linux/cpufreq.h +++ b/include/linux/cpufreq.h @@ -367,7 +367,7 @@ struct cpufreq_driver { /* platform specific boost support code */ bool boost_enabled; - int (*set_boost)(int state); + int (*set_boost)(struct cpufreq_policy *policy, int state); }; /* flags */ -- cgit v1.2.3 From 33a180623b6c35f2727daecb63763955af3af1df Mon Sep 17 00:00:00 2001 From: Mikulas Patocka Date: Tue, 2 Jun 2020 15:34:40 +0200 Subject: dm bufio: introduce forget_buffer_locked Introduce a function forget_buffer_locked that forgets a range of buffers. It is more efficient than calling forget_buffer in a loop. 
Signed-off-by: Mikulas Patocka Signed-off-by: Mike Snitzer --- drivers/md/dm-bufio.c | 60 ++++++++++++++++++++++++++++++++++++++++++++---- include/linux/dm-bufio.h | 7 ++++++ 2 files changed, 63 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/drivers/md/dm-bufio.c b/drivers/md/dm-bufio.c index ff19add97e0b..95f6c544aa01 100644 --- a/drivers/md/dm-bufio.c +++ b/drivers/md/dm-bufio.c @@ -262,6 +262,29 @@ static struct dm_buffer *__find(struct dm_bufio_client *c, sector_t block) return NULL; } +static struct dm_buffer *__find_next(struct dm_bufio_client *c, sector_t block) +{ + struct rb_node *n = c->buffer_tree.rb_node; + struct dm_buffer *b; + struct dm_buffer *best = NULL; + + while (n) { + b = container_of(n, struct dm_buffer, node); + + if (b->block == block) + return b; + + if (block <= b->block) { + n = n->rb_left; + best = b; + } else { + n = n->rb_right; + } + } + + return best; +} + static void __insert(struct dm_bufio_client *c, struct dm_buffer *b) { struct rb_node **new = &c->buffer_tree.rb_node, *parent = NULL; @@ -1434,6 +1457,14 @@ retry: } EXPORT_SYMBOL_GPL(dm_bufio_release_move); +static void forget_buffer_locked(struct dm_buffer *b) +{ + if (likely(!b->hold_count) && likely(!b->state)) { + __unlink_buffer(b); + __free_buffer_wake(b); + } +} + /* * Free the given buffer. * @@ -1447,15 +1478,36 @@ void dm_bufio_forget(struct dm_bufio_client *c, sector_t block) dm_bufio_lock(c); b = __find(c, block); - if (b && likely(!b->hold_count) && likely(!b->state)) { - __unlink_buffer(b); - __free_buffer_wake(b); - } + if (b) + forget_buffer_locked(b); dm_bufio_unlock(c); } EXPORT_SYMBOL_GPL(dm_bufio_forget); +void dm_bufio_forget_buffers(struct dm_bufio_client *c, sector_t block, sector_t n_blocks) +{ + struct dm_buffer *b; + sector_t end_block = block + n_blocks; + + while (block < end_block) { + dm_bufio_lock(c); + + b = __find_next(c, block); + if (b) { + block = b->block + 1; + forget_buffer_locked(b); + } + + dm_bufio_unlock(c); + + if (!b) + break; + } + +} +EXPORT_SYMBOL_GPL(dm_bufio_forget_buffers); + void dm_bufio_set_minimum_buffers(struct dm_bufio_client *c, unsigned n) { c->minimum_buffers = n; diff --git a/include/linux/dm-bufio.h b/include/linux/dm-bufio.h index 5ec6bfbde9ae..29d255fdd5d6 100644 --- a/include/linux/dm-bufio.h +++ b/include/linux/dm-bufio.h @@ -136,6 +136,13 @@ void dm_bufio_release_move(struct dm_buffer *b, sector_t new_block); */ void dm_bufio_forget(struct dm_bufio_client *c, sector_t block); +/* + * Free the given range of buffers. + * This is just a hint, if the buffer is in use or dirty, this function + * does nothing. + */ +void dm_bufio_forget_buffers(struct dm_bufio_client *c, sector_t block, sector_t n_blocks); + /* * Set the minimum number of buffers before cleanup happens. */ -- cgit v1.2.3 From 90c56b3877995d948dc382fe833a1c5eeaaee2ad Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Fri, 5 Jun 2020 07:21:22 -0700 Subject: net: ethtool: Fix comment mentioning typo in IS_ENABLED() This has no code changes, but it's a typo noticed in other clean-ups, so we might as well fix it. IS_ENABLED() takes full names, and should have the "CONFIG_" prefix. Reported-by: Joe Perches Link: https://lore.kernel.org/lkml/b08611018fdb6d88757c6008a5c02fa0e07b32fb.camel@perches.com Signed-off-by: Kees Cook Signed-off-by: David S. 
Miller --- include/linux/ethtool_netlink.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/ethtool_netlink.h b/include/linux/ethtool_netlink.h index 8fbe4f97ffad..1e7bf78cb382 100644 --- a/include/linux/ethtool_netlink.h +++ b/include/linux/ethtool_netlink.h @@ -67,5 +67,5 @@ static inline int ethnl_cable_test_step(struct phy_device *phydev, u32 first, { return -EOPNOTSUPP; } -#endif /* IS_ENABLED(ETHTOOL_NETLINK) */ +#endif /* IS_ENABLED(CONFIG_ETHTOOL_NETLINK) */ #endif /* _LINUX_ETHTOOL_NETLINK_H_ */ -- cgit v1.2.3 From 46f21af862656c477387f2256adc1005503db67d Mon Sep 17 00:00:00 2001 From: Wesley Sheng Date: Tue, 26 May 2020 15:59:43 +0800 Subject: NTB: correct ntb_peer_spad_addr and ntb_peer_spad_read comment typos The comment for ntb_peer_spad_addr and ntb_peer_spad_read incorrectly referred to peer doorbell register and local scratchpad register. Signed-off-by: Wesley Sheng Signed-off-by: Jon Mason --- include/linux/ntb.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/ntb.h b/include/linux/ntb.h index a07252f78770..191b524e5c0d 100644 --- a/include/linux/ntb.h +++ b/include/linux/ntb.h @@ -1351,7 +1351,7 @@ static inline int ntb_spad_write(struct ntb_dev *ntb, int sidx, u32 val) * @sidx: Scratchpad index. * @spad_addr: OUT - The address of the peer scratchpad register. * - * Return the address of the peer doorbell register. This may be used, for + * Return the address of the peer scratchpad register. This may be used, for * example, by drivers that offload memory copy operations to a dma engine. * * Return: Zero on success, otherwise an error number. @@ -1373,7 +1373,7 @@ static inline int ntb_peer_spad_addr(struct ntb_dev *ntb, int pidx, int sidx, * * Read the peer scratchpad register, and return the value. * - * Return: The value of the local scratchpad register. + * Return: The value of the peer scratchpad register. */ static inline u32 ntb_peer_spad_read(struct ntb_dev *ntb, int pidx, int sidx) { -- cgit v1.2.3 From f5152f4ded3ce6d332d5e4f9d7e325c3b81cae1b Mon Sep 17 00:00:00 2001 From: Erwan Velu Date: Sat, 6 Jun 2020 11:35:50 +0200 Subject: firmware/dmi: Report DMI Bios & EC firmware release Some vendors like HPe or Dell, encode the release version of their BIOS in the "System BIOS {Major|Minor} Release" fields of Type 0. This information is used to know which bios release actually runs. It could be used for some quirks, debugging sessions or inventory tasks. A typical output for a Dell system running the 65.27 bios is : [root@t1700 ~]# cat /sys/devices/virtual/dmi/id/bios_release 65.27 [root@t1700 ~]# Servers that have a BMC encode the release version of their firmware in the "Embedded Controller Firmware {Major|Minor} Release" fields of Type 0. This information is used to know which BMC release actually runs. It could be used for some quirks, debugging sessions or inventory tasks. 
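Before the EC firmware example below, a hedged sketch of the quirk use case (a hypothetical table, not from this patch):

	/* Hypothetical quirk keyed on the new DMI_BIOS_RELEASE field. */
	static const struct dmi_system_id bios_release_quirks[] = {
		{
			.ident = "Example: Dell system on BIOS release 65.27",
			.matches = {
				DMI_MATCH(DMI_SYS_VENDOR, "Dell Inc."),
				DMI_EXACT_MATCH(DMI_BIOS_RELEASE, "65.27"),
			},
		},
		{ }
	};

A driver would then gate its workaround on dmi_check_system(bios_release_quirks).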
A typical output for a Dell system running the 3.75 bmc release is : [root@t1700 ~]# cat /sys/devices/virtual/dmi/id/ec_firmware_release 3.75 [root@t1700 ~]# Signed-off-by: Erwan Velu Signed-off-by: Jean Delvare --- drivers/firmware/dmi-id.c | 6 ++++++ drivers/firmware/dmi_scan.c | 30 ++++++++++++++++++++++++++++++ include/linux/mod_devicetable.h | 2 ++ scripts/mod/file2alias.c | 2 ++ 4 files changed, 40 insertions(+) (limited to 'include/linux') diff --git a/drivers/firmware/dmi-id.c b/drivers/firmware/dmi-id.c index ff39f64f2aae..86d71b0212b1 100644 --- a/drivers/firmware/dmi-id.c +++ b/drivers/firmware/dmi-id.c @@ -42,6 +42,8 @@ DEFINE_DMI_ATTR_WITH_SHOW(bios_vendor, 0444, DMI_BIOS_VENDOR); DEFINE_DMI_ATTR_WITH_SHOW(bios_version, 0444, DMI_BIOS_VERSION); DEFINE_DMI_ATTR_WITH_SHOW(bios_date, 0444, DMI_BIOS_DATE); DEFINE_DMI_ATTR_WITH_SHOW(sys_vendor, 0444, DMI_SYS_VENDOR); +DEFINE_DMI_ATTR_WITH_SHOW(bios_release, 0444, DMI_BIOS_RELEASE); +DEFINE_DMI_ATTR_WITH_SHOW(ec_firmware_release, 0444, DMI_EC_FIRMWARE_RELEASE); DEFINE_DMI_ATTR_WITH_SHOW(product_name, 0444, DMI_PRODUCT_NAME); DEFINE_DMI_ATTR_WITH_SHOW(product_version, 0444, DMI_PRODUCT_VERSION); DEFINE_DMI_ATTR_WITH_SHOW(product_serial, 0400, DMI_PRODUCT_SERIAL); @@ -78,6 +80,8 @@ static ssize_t get_modalias(char *buffer, size_t buffer_size) { "bvn", DMI_BIOS_VENDOR }, { "bvr", DMI_BIOS_VERSION }, { "bd", DMI_BIOS_DATE }, + { "br", DMI_BIOS_RELEASE }, + { "efr", DMI_EC_FIRMWARE_RELEASE }, { "svn", DMI_SYS_VENDOR }, { "pn", DMI_PRODUCT_NAME }, { "pvr", DMI_PRODUCT_VERSION }, @@ -187,6 +191,8 @@ static void __init dmi_id_init_attr_table(void) ADD_DMI_ATTR(bios_vendor, DMI_BIOS_VENDOR); ADD_DMI_ATTR(bios_version, DMI_BIOS_VERSION); ADD_DMI_ATTR(bios_date, DMI_BIOS_DATE); + ADD_DMI_ATTR(bios_release, DMI_BIOS_RELEASE); + ADD_DMI_ATTR(ec_firmware_release, DMI_EC_FIRMWARE_RELEASE); ADD_DMI_ATTR(sys_vendor, DMI_SYS_VENDOR); ADD_DMI_ATTR(product_name, DMI_PRODUCT_NAME); ADD_DMI_ATTR(product_version, DMI_PRODUCT_VERSION); diff --git a/drivers/firmware/dmi_scan.c b/drivers/firmware/dmi_scan.c index f59163cb7cba..5066d1f1d687 100644 --- a/drivers/firmware/dmi_scan.c +++ b/drivers/firmware/dmi_scan.c @@ -186,6 +186,34 @@ static void __init dmi_save_ident(const struct dmi_header *dm, int slot, dmi_ident[slot] = p; } +static void __init dmi_save_release(const struct dmi_header *dm, int slot, + int index) +{ + const u8 *minor, *major; + char *s; + + /* If the table doesn't have the field, let's return */ + if (dmi_ident[slot] || dm->length < index) + return; + + minor = (u8 *) dm + index; + major = (u8 *) dm + index - 1; + + /* As per the spec, if the system doesn't support this field, + * the value is FF + */ + if (*major == 0xFF && *minor == 0xFF) + return; + + s = dmi_alloc(8); + if (!s) + return; + + sprintf(s, "%u.%u", *major, *minor); + + dmi_ident[slot] = s; +} + static void __init dmi_save_uuid(const struct dmi_header *dm, int slot, int index) { @@ -444,6 +472,8 @@ static void __init dmi_decode(const struct dmi_header *dm, void *dummy) dmi_save_ident(dm, DMI_BIOS_VENDOR, 4); dmi_save_ident(dm, DMI_BIOS_VERSION, 5); dmi_save_ident(dm, DMI_BIOS_DATE, 8); + dmi_save_release(dm, DMI_BIOS_RELEASE, 21); + dmi_save_release(dm, DMI_EC_FIRMWARE_RELEASE, 23); break; case 1: /* System Information */ dmi_save_ident(dm, DMI_SYS_VENDOR, 4); diff --git a/include/linux/mod_devicetable.h b/include/linux/mod_devicetable.h index 4c2ddd0941a7..4b3d0a4945df 100644 --- a/include/linux/mod_devicetable.h +++ b/include/linux/mod_devicetable.h @@ -532,6 +532,8 @@ 
enum dmi_field { DMI_BIOS_VENDOR, DMI_BIOS_VERSION, DMI_BIOS_DATE, + DMI_BIOS_RELEASE, + DMI_EC_FIRMWARE_RELEASE, DMI_SYS_VENDOR, DMI_PRODUCT_NAME, DMI_PRODUCT_VERSION, diff --git a/scripts/mod/file2alias.c b/scripts/mod/file2alias.c index 02d5d79da284..9599e2a3f1e6 100644 --- a/scripts/mod/file2alias.c +++ b/scripts/mod/file2alias.c @@ -954,6 +954,8 @@ static const struct dmifield { { "bvn", DMI_BIOS_VENDOR }, { "bvr", DMI_BIOS_VERSION }, { "bd", DMI_BIOS_DATE }, + { "br", DMI_BIOS_RELEASE }, + { "efr", DMI_EC_FIRMWARE_RELEASE }, { "svn", DMI_SYS_VENDOR }, { "pn", DMI_PRODUCT_NAME }, { "pvr", DMI_PRODUCT_VERSION }, -- cgit v1.2.3 From e649b3f0188f8fd34dd0dde8d43fd3312b902fb2 Mon Sep 17 00:00:00 2001 From: Eiichi Tsukata Date: Sat, 6 Jun 2020 13:26:27 +0900 Subject: KVM: x86: Fix APIC page invalidation race Commit b1394e745b94 ("KVM: x86: fix APIC page invalidation") tried to fix inappropriate APIC page invalidation by re-introducing arch specific kvm_arch_mmu_notifier_invalidate_range() and calling it from kvm_mmu_notifier_invalidate_range_start. However, the patch left a possible race where the VMCS APIC address cache is updated *before* it is unmapped: (Invalidator) kvm_mmu_notifier_invalidate_range_start() (Invalidator) kvm_make_all_cpus_request(kvm, KVM_REQ_APIC_PAGE_RELOAD) (KVM VCPU) vcpu_enter_guest() (KVM VCPU) kvm_vcpu_reload_apic_access_page() (Invalidator) actually unmap page Because of the above race, there can be a mismatch between the host physical address stored in the APIC_ACCESS_PAGE VMCS field and the host physical address stored in the EPT entry for the APIC GPA (0xfee0000). When this happens, the processor will not trap APIC accesses, and will instead show the raw contents of the APIC-access page. Because Windows OS periodically checks for unexpected modifications to the LAPIC register, this will show up as a BSOD crash with BugCheck CRITICAL_STRUCTURE_CORRUPTION (109) we are currently seeing in https://bugzilla.redhat.com/show_bug.cgi?id=1751017. The root cause of the issue is that kvm_arch_mmu_notifier_invalidate_range() cannot guarantee that no additional references are taken to the pages in the range before kvm_mmu_notifier_invalidate_range_end(). Fortunately, this case is supported by the MMU notifier API, as documented in include/linux/mmu_notifier.h: * If the subsystem * can't guarantee that no additional references are taken to * the pages in the range, it has to implement the * invalidate_range() notifier to remove any references taken * after invalidate_range_start(). The fix therefore is to reload the APIC-access page field in the VMCS from kvm_mmu_notifier_invalidate_range() instead of ..._range_start(). 
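With the fix, the reload request is only issued once the page is actually unmapped; a sketch of the corrected ordering, mirroring the race diagram above:

	(Invalidator) kvm_mmu_notifier_invalidate_range_start()
	(Invalidator) clear PTEs / actually unmap page
	(Invalidator) kvm_mmu_notifier_invalidate_range()
	(Invalidator)   kvm_make_all_cpus_request(kvm, KVM_REQ_APIC_PAGE_RELOAD)
	(KVM VCPU)    vcpu_enter_guest()
	(KVM VCPU)    kvm_vcpu_reload_apic_access_page()  /* caches the up-to-date address */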
Cc: stable@vger.kernel.org Fixes: b1394e745b94 ("KVM: x86: fix APIC page invalidation") Fixes: https://bugzilla.kernel.org/show_bug.cgi?id=197951 Signed-off-by: Eiichi Tsukata Message-Id: <20200606042627.61070-1-eiichi.tsukata@nutanix.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/x86.c | 7 ++----- include/linux/kvm_host.h | 4 ++-- virt/kvm/kvm_main.c | 26 ++++++++++++++++---------- 3 files changed, 20 insertions(+), 17 deletions(-) (limited to 'include/linux') diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index c26dd1363151..24de847af52e 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -8270,9 +8270,8 @@ static void vcpu_load_eoi_exitmap(struct kvm_vcpu *vcpu) kvm_x86_ops.load_eoi_exitmap(vcpu, eoi_exit_bitmap); } -int kvm_arch_mmu_notifier_invalidate_range(struct kvm *kvm, - unsigned long start, unsigned long end, - bool blockable) +void kvm_arch_mmu_notifier_invalidate_range(struct kvm *kvm, + unsigned long start, unsigned long end) { unsigned long apic_address; @@ -8283,8 +8282,6 @@ int kvm_arch_mmu_notifier_invalidate_range(struct kvm *kvm, apic_address = gfn_to_hva(kvm, APIC_DEFAULT_PHYS_BASE >> PAGE_SHIFT); if (start <= apic_address && apic_address < end) kvm_make_all_cpus_request(kvm, KVM_REQ_APIC_PAGE_RELOAD); - - return 0; } void kvm_vcpu_reload_apic_access_page(struct kvm_vcpu *vcpu) diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index d38d6b9c24be..e2f82131bb3e 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -1420,8 +1420,8 @@ static inline long kvm_arch_vcpu_async_ioctl(struct file *filp, } #endif /* CONFIG_HAVE_KVM_VCPU_ASYNC_IOCTL */ -int kvm_arch_mmu_notifier_invalidate_range(struct kvm *kvm, - unsigned long start, unsigned long end, bool blockable); +void kvm_arch_mmu_notifier_invalidate_range(struct kvm *kvm, + unsigned long start, unsigned long end); #ifdef CONFIG_HAVE_KVM_VCPU_RUN_PID_CHANGE int kvm_arch_vcpu_run_pid_change(struct kvm_vcpu *vcpu); diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index 4db151f6101e..7b6013f2ba19 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -155,10 +155,9 @@ static void kvm_uevent_notify_change(unsigned int type, struct kvm *kvm); static unsigned long long kvm_createvm_count; static unsigned long long kvm_active_vms; -__weak int kvm_arch_mmu_notifier_invalidate_range(struct kvm *kvm, - unsigned long start, unsigned long end, bool blockable) +__weak void kvm_arch_mmu_notifier_invalidate_range(struct kvm *kvm, + unsigned long start, unsigned long end) { - return 0; } bool kvm_is_zone_device_pfn(kvm_pfn_t pfn) @@ -384,6 +383,18 @@ static inline struct kvm *mmu_notifier_to_kvm(struct mmu_notifier *mn) return container_of(mn, struct kvm, mmu_notifier); } +static void kvm_mmu_notifier_invalidate_range(struct mmu_notifier *mn, + struct mm_struct *mm, + unsigned long start, unsigned long end) +{ + struct kvm *kvm = mmu_notifier_to_kvm(mn); + int idx; + + idx = srcu_read_lock(&kvm->srcu); + kvm_arch_mmu_notifier_invalidate_range(kvm, start, end); + srcu_read_unlock(&kvm->srcu, idx); +} + static void kvm_mmu_notifier_change_pte(struct mmu_notifier *mn, struct mm_struct *mm, unsigned long address, @@ -408,7 +419,6 @@ static int kvm_mmu_notifier_invalidate_range_start(struct mmu_notifier *mn, { struct kvm *kvm = mmu_notifier_to_kvm(mn); int need_tlb_flush = 0, idx; - int ret; idx = srcu_read_lock(&kvm->srcu); spin_lock(&kvm->mmu_lock); @@ -425,14 +435,9 @@ static int kvm_mmu_notifier_invalidate_range_start(struct mmu_notifier *mn, kvm_flush_remote_tlbs(kvm); 
spin_unlock(&kvm->mmu_lock); - - ret = kvm_arch_mmu_notifier_invalidate_range(kvm, range->start, - range->end, - mmu_notifier_range_blockable(range)); - srcu_read_unlock(&kvm->srcu, idx); - return ret; + return 0; } static void kvm_mmu_notifier_invalidate_range_end(struct mmu_notifier *mn, @@ -538,6 +543,7 @@ static void kvm_mmu_notifier_release(struct mmu_notifier *mn, } static const struct mmu_notifier_ops kvm_mmu_notifier_ops = { + .invalidate_range = kvm_mmu_notifier_invalidate_range, .invalidate_range_start = kvm_mmu_notifier_invalidate_range_start, .invalidate_range_end = kvm_mmu_notifier_invalidate_range_end, .clear_flush_young = kvm_mmu_notifier_clear_flush_young, -- cgit v1.2.3 From 7ff0d4490ebadae8037a079a70c5cac13385a808 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 3 Jun 2020 07:52:37 +0200 Subject: trace: fix an incorrect __user annotation on stack_trace_sysctl No user pointers for sysctls anymore. Fixes: 32927393dc1c ("sysctl: pass kernel pointers to ->proc_handler") Reported-by: build test robot Signed-off-by: Christoph Hellwig Signed-off-by: Al Viro --- include/linux/ftrace.h | 5 ++--- kernel/trace/trace_stack.c | 5 ++--- 2 files changed, 4 insertions(+), 6 deletions(-) (limited to 'include/linux') diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h index ddfc377de0d2..fce81238f304 100644 --- a/include/linux/ftrace.h +++ b/include/linux/ftrace.h @@ -319,9 +319,8 @@ static inline void arch_ftrace_set_direct_caller(struct pt_regs *regs, extern int stack_tracer_enabled; -int stack_trace_sysctl(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, - loff_t *ppos); +int stack_trace_sysctl(struct ctl_table *table, int write, void *buffer, + size_t *lenp, loff_t *ppos); /* DO NOT MODIFY THIS VARIABLE DIRECTLY! */ DECLARE_PER_CPU(int, disable_stack_tracer); diff --git a/kernel/trace/trace_stack.c b/kernel/trace/trace_stack.c index c557f42a9397..98bba4764c52 100644 --- a/kernel/trace/trace_stack.c +++ b/kernel/trace/trace_stack.c @@ -515,9 +515,8 @@ static const struct file_operations stack_trace_filter_fops = { #endif /* CONFIG_DYNAMIC_FTRACE */ int -stack_trace_sysctl(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, - loff_t *ppos) +stack_trace_sysctl(struct ctl_table *table, int write, void *buffer, + size_t *lenp, loff_t *ppos) { int was_enabled; int ret; -- cgit v1.2.3 From e1eb26fa62d04ec0955432be1aa8722a97cb52e7 Mon Sep 17 00:00:00 2001 From: Giuseppe Scrivano Date: Sun, 7 Jun 2020 21:40:10 -0700 Subject: ipc/namespace.c: use a work queue to free_ipc the reason is to avoid a delay caused by the synchronize_rcu() call in kern_umount() when the mqueue mount is freed. the code: #define _GNU_SOURCE #include #include #include #include int main() { int i; for (i = 0; i < 1000; i++) if (unshare(CLONE_NEWIPC) < 0) error(EXIT_FAILURE, errno, "unshare"); } goes from Command being timed: "./ipc-namespace" User time (seconds): 0.00 System time (seconds): 0.06 Percent of CPU this job got: 0% Elapsed (wall clock) time (h:mm:ss or m:ss): 0:08.05 to Command being timed: "./ipc-namespace" User time (seconds): 0.00 System time (seconds): 0.02 Percent of CPU this job got: 96% Elapsed (wall clock) time (h:mm:ss or m:ss): 0:00.03 Signed-off-by: Giuseppe Scrivano Signed-off-by: Andrew Morton Reviewed-by: Paul E. 
McKenney Reviewed-by: Waiman Long Cc: Davidlohr Bueso Cc: Manfred Spraul Link: http://lkml.kernel.org/r/20200225145419.527994-1-gscrivan@redhat.com Signed-off-by: Linus Torvalds --- include/linux/ipc_namespace.h | 2 ++ ipc/namespace.c | 24 ++++++++++++++++++++++-- 2 files changed, 24 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/ipc_namespace.h b/include/linux/ipc_namespace.h index c309f43bde45..a06a78c67f19 100644 --- a/include/linux/ipc_namespace.h +++ b/include/linux/ipc_namespace.h @@ -68,6 +68,8 @@ struct ipc_namespace { struct user_namespace *user_ns; struct ucounts *ucounts; + struct llist_node mnt_llist; + struct ns_common ns; } __randomize_layout; diff --git a/ipc/namespace.c b/ipc/namespace.c index fdc3b5f3f53a..24e7b45320f7 100644 --- a/ipc/namespace.c +++ b/ipc/namespace.c @@ -117,6 +117,10 @@ void free_ipcs(struct ipc_namespace *ns, struct ipc_ids *ids, static void free_ipc_ns(struct ipc_namespace *ns) { + /* mq_put_mnt() waits for a grace period as kern_unmount() + * uses synchronize_rcu(). + */ + mq_put_mnt(ns); sem_exit_ns(ns); msg_exit_ns(ns); shm_exit_ns(ns); @@ -127,6 +131,21 @@ static void free_ipc_ns(struct ipc_namespace *ns) kfree(ns); } +static LLIST_HEAD(free_ipc_list); +static void free_ipc(struct work_struct *unused) +{ + struct llist_node *node = llist_del_all(&free_ipc_list); + struct ipc_namespace *n, *t; + + llist_for_each_entry_safe(n, t, node, mnt_llist) + free_ipc_ns(n); +} + +/* + * The work queue is used to avoid the cost of synchronize_rcu in kern_unmount. + */ +static DECLARE_WORK(free_ipc_work, free_ipc); + /* * put_ipc_ns - drop a reference to an ipc namespace. * @ns: the namespace to put @@ -148,8 +167,9 @@ void put_ipc_ns(struct ipc_namespace *ns) if (refcount_dec_and_lock(&ns->count, &mq_lock)) { mq_clear_sbinfo(ns); spin_unlock(&mq_lock); - mq_put_mnt(ns); - free_ipc_ns(ns); + + if (llist_add(&ns->mnt_llist, &free_ipc_list)) + schedule_work(&free_ipc_work); } } -- cgit v1.2.3 From ceabef7dd71720aef58bd182943352c9c307a3de Mon Sep 17 00:00:00 2001 From: Orson Zhai Date: Sun, 7 Jun 2020 21:40:14 -0700 Subject: dynamic_debug: add an option to enable dynamic debug for modules only Instead of enabling dynamic debug globally with CONFIG_DYNAMIC_DEBUG, CONFIG_DYNAMIC_DEBUG_CORE enables only the core functionality of dynamic debug. With DYNAMIC_DEBUG_MODULE defined for a given module, dynamic debug is then enabled for that module alone. This is useful for people who only want to enable dynamic debug for kernel modules, without the kernel image size and memory consumption increasing too much.
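For illustration, a minimal sketch of a module opting in follows; the module name and messages are hypothetical, and the exact ccflags spelling depends on the module's Makefile (the documentation hunk below uses ccflags := -DDYNAMIC_DEBUG_MODULE):

	/* In the module's Makefile (illustrative): ccflags-y += -DDYNAMIC_DEBUG_MODULE */
	#include <linux/module.h>
	#include <linux/printk.h>

	static int __init ddebug_demo_init(void)
	{
		/* With CONFIG_DYNAMIC_DEBUG_CORE=y and DYNAMIC_DEBUG_MODULE defined,
		 * this pr_debug() becomes a dynamic debug callsite that can be
		 * toggled via <debugfs>/dynamic_debug/control.
		 */
		pr_debug("ddebug_demo: loaded\n");
		return 0;
	}

	static void __exit ddebug_demo_exit(void)
	{
		pr_debug("ddebug_demo: unloaded\n");
	}

	module_init(ddebug_demo_init);
	module_exit(ddebug_demo_exit);
	MODULE_LICENSE("GPL");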
[orson.zhai@unisoc.com: v2] Link: http://lkml.kernel.org/r/1587408228-10861-1-git-send-email-orson.unisoc@gmail.com Signed-off-by: Orson Zhai Signed-off-by: Andrew Morton Acked-by: Greg Kroah-Hartman Acked-by: Petr Mladek Cc: Jonathan Corbet Cc: Sergey Senozhatsky Cc: Steven Rostedt Cc: Jason Baron Cc: Randy Dunlap Link: http://lkml.kernel.org/r/1586521984-5890-1-git-send-email-orson.unisoc@gmail.com Signed-off-by: Linus Torvalds --- Documentation/admin-guide/dynamic-debug-howto.rst | 5 +++++ include/linux/dev_printk.h | 6 ++++-- include/linux/dynamic_debug.h | 2 +- include/linux/net.h | 3 ++- include/linux/netdevice.h | 6 ++++-- include/linux/printk.h | 9 ++++++--- include/rdma/ib_verbs.h | 6 ++++-- lib/Kconfig.debug | 12 ++++++++++++ lib/Makefile | 2 +- lib/dynamic_debug.c | 9 +++++++-- 10 files changed, 46 insertions(+), 14 deletions(-) (limited to 'include/linux') diff --git a/Documentation/admin-guide/dynamic-debug-howto.rst b/Documentation/admin-guide/dynamic-debug-howto.rst index 0dc2eb8e44e5..1012bd9305e9 100644 --- a/Documentation/admin-guide/dynamic-debug-howto.rst +++ b/Documentation/admin-guide/dynamic-debug-howto.rst @@ -13,6 +13,11 @@ kernel code to obtain additional kernel information. Currently, if ``print_hex_dump_debug()``/``print_hex_dump_bytes()`` calls can be dynamically enabled per-callsite. +If you do not want to enable dynamic debug globally (i.e. in some embedded +system), you may set ``CONFIG_DYNAMIC_DEBUG_CORE`` as basic support of dynamic +debug and add ``ccflags := -DDYNAMIC_DEBUG_MODULE`` into the Makefile of any +modules which you'd like to dynamically debug later. + If ``CONFIG_DYNAMIC_DEBUG`` is not set, ``print_hex_dump_debug()`` is just shortcut for ``print_hex_dump(KERN_DEBUG)``. diff --git a/include/linux/dev_printk.h b/include/linux/dev_printk.h index 5aad06b4ca7b..3028b644b4fb 100644 --- a/include/linux/dev_printk.h +++ b/include/linux/dev_printk.h @@ -109,7 +109,8 @@ void _dev_info(const struct device *dev, const char *fmt, ...) #define dev_info(dev, fmt, ...) \ _dev_info(dev, dev_fmt(fmt), ##__VA_ARGS__) -#if defined(CONFIG_DYNAMIC_DEBUG) +#if defined(CONFIG_DYNAMIC_DEBUG) || \ + (defined(CONFIG_DYNAMIC_DEBUG_CORE) && defined(DYNAMIC_DEBUG_MODULE)) #define dev_dbg(dev, fmt, ...) \ dynamic_dev_dbg(dev, dev_fmt(fmt), ##__VA_ARGS__) #elif defined(DEBUG) @@ -181,7 +182,8 @@ do { \ dev_level_ratelimited(dev_notice, dev, fmt, ##__VA_ARGS__) #define dev_info_ratelimited(dev, fmt, ...) \ dev_level_ratelimited(dev_info, dev, fmt, ##__VA_ARGS__) -#if defined(CONFIG_DYNAMIC_DEBUG) +#if defined(CONFIG_DYNAMIC_DEBUG) || \ + (defined(CONFIG_DYNAMIC_DEBUG_CORE) && defined(DYNAMIC_DEBUG_MODULE)) /* descriptor check is first to prevent flooding with "callbacks suppressed" */ #define dev_dbg_ratelimited(dev, fmt, ...) \ do { \ diff --git a/include/linux/dynamic_debug.h b/include/linux/dynamic_debug.h index 4cf02ecd67de..abcd5fde30eb 100644 --- a/include/linux/dynamic_debug.h +++ b/include/linux/dynamic_debug.h @@ -48,7 +48,7 @@ struct _ddebug { -#if defined(CONFIG_DYNAMIC_DEBUG) +#if defined(CONFIG_DYNAMIC_DEBUG_CORE) int ddebug_add_module(struct _ddebug *tab, unsigned int n, const char *modname); extern int ddebug_remove_module(const char *mod_name); diff --git a/include/linux/net.h b/include/linux/net.h index e10f378194a5..016a9c5faa34 100644 --- a/include/linux/net.h +++ b/include/linux/net.h @@ -264,7 +264,8 @@ do { \ net_ratelimited_function(pr_warn, fmt, ##__VA_ARGS__) #define net_info_ratelimited(fmt, ...) 
\ net_ratelimited_function(pr_info, fmt, ##__VA_ARGS__) -#if defined(CONFIG_DYNAMIC_DEBUG) +#if defined(CONFIG_DYNAMIC_DEBUG) || \ + (defined(CONFIG_DYNAMIC_DEBUG_CORE) && defined(DYNAMIC_DEBUG_MODULE)) #define net_dbg_ratelimited(fmt, ...) \ do { \ DEFINE_DYNAMIC_DEBUG_METADATA(descriptor, fmt); \ diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 1a96e9c4ec36..5b364a2e0006 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -4942,7 +4942,8 @@ do { \ #define MODULE_ALIAS_NETDEV(device) \ MODULE_ALIAS("netdev-" device) -#if defined(CONFIG_DYNAMIC_DEBUG) +#if defined(CONFIG_DYNAMIC_DEBUG) || \ + (defined(CONFIG_DYNAMIC_DEBUG_CORE) && defined(DYNAMIC_DEBUG_MODULE)) #define netdev_dbg(__dev, format, args...) \ do { \ dynamic_netdev_dbg(__dev, format, ##args); \ @@ -5012,7 +5013,8 @@ do { \ #define netif_info(priv, type, dev, fmt, args...) \ netif_level(info, priv, type, dev, fmt, ##args) -#if defined(CONFIG_DYNAMIC_DEBUG) +#if defined(CONFIG_DYNAMIC_DEBUG) || \ + (defined(CONFIG_DYNAMIC_DEBUG_CORE) && defined(DYNAMIC_DEBUG_MODULE)) #define netif_dbg(priv, type, netdev, format, args...) \ do { \ if (netif_msg_##type(priv)) \ diff --git a/include/linux/printk.h b/include/linux/printk.h index 3cc2f178bf06..fc8f03c54543 100644 --- a/include/linux/printk.h +++ b/include/linux/printk.h @@ -399,7 +399,8 @@ extern int kptr_restrict; /* If you are writing a driver, please use dev_dbg instead */ -#if defined(CONFIG_DYNAMIC_DEBUG) +#if defined(CONFIG_DYNAMIC_DEBUG) || \ + (defined(CONFIG_DYNAMIC_DEBUG_CORE) && defined(DYNAMIC_DEBUG_MODULE)) #include /** @@ -535,7 +536,8 @@ extern int kptr_restrict; #endif /* If you are writing a driver, please use dev_dbg instead */ -#if defined(CONFIG_DYNAMIC_DEBUG) +#if defined(CONFIG_DYNAMIC_DEBUG) || \ + (defined(CONFIG_DYNAMIC_DEBUG_CORE) && defined(DYNAMIC_DEBUG_MODULE)) /* descriptor check is first to prevent flooding with "callbacks suppressed" */ #define pr_debug_ratelimited(fmt, ...) \ do { \ @@ -582,7 +584,8 @@ static inline void print_hex_dump_bytes(const char *prefix_str, int prefix_type, #endif -#if defined(CONFIG_DYNAMIC_DEBUG) +#if defined(CONFIG_DYNAMIC_DEBUG) || \ + (defined(CONFIG_DYNAMIC_DEBUG_CORE) && defined(DYNAMIC_DEBUG_MODULE)) #define print_hex_dump_debug(prefix_str, prefix_type, rowsize, \ groupsize, buf, len, ascii) \ dynamic_hex_dump(prefix_str, prefix_type, rowsize, \ diff --git a/include/rdma/ib_verbs.h b/include/rdma/ib_verbs.h index 033e7044f29c..ef2f3986c493 100644 --- a/include/rdma/ib_verbs.h +++ b/include/rdma/ib_verbs.h @@ -100,7 +100,8 @@ void ibdev_notice(const struct ib_device *ibdev, const char *format, ...); __printf(2, 3) __cold void ibdev_info(const struct ib_device *ibdev, const char *format, ...); -#if defined(CONFIG_DYNAMIC_DEBUG) +#if defined(CONFIG_DYNAMIC_DEBUG) || \ + (defined(CONFIG_DYNAMIC_DEBUG_CORE) && defined(DYNAMIC_DEBUG_MODULE)) #define ibdev_dbg(__dev, format, args...) \ dynamic_ibdev_dbg(__dev, format, ##args) #else @@ -133,7 +134,8 @@ do { \ #define ibdev_info_ratelimited(ibdev, fmt, ...) \ ibdev_level_ratelimited(ibdev_info, ibdev, fmt, ##__VA_ARGS__) -#if defined(CONFIG_DYNAMIC_DEBUG) +#if defined(CONFIG_DYNAMIC_DEBUG) || \ + (defined(CONFIG_DYNAMIC_DEBUG_CORE) && defined(DYNAMIC_DEBUG_MODULE)) /* descriptor check is first to prevent flooding with "callbacks suppressed" */ #define ibdev_dbg_ratelimited(ibdev, fmt, ...) 
\ do { \ diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug index 9bd4eb7f5ec1..333e878d8af9 100644 --- a/lib/Kconfig.debug +++ b/lib/Kconfig.debug @@ -99,6 +99,7 @@ config DYNAMIC_DEBUG default n depends on PRINTK depends on (DEBUG_FS || PROC_FS) + select DYNAMIC_DEBUG_CORE help Compiles debug level messages into the kernel, which would not @@ -165,6 +166,17 @@ config DYNAMIC_DEBUG See Documentation/admin-guide/dynamic-debug-howto.rst for additional information. +config DYNAMIC_DEBUG_CORE + bool "Enable core function of dynamic debug support" + depends on PRINTK + depends on (DEBUG_FS || PROC_FS) + help + Enable core functional support of dynamic debug. It is useful + when you want to tie dynamic debug to your kernel modules with + DYNAMIC_DEBUG_MODULE defined for each of them, especially for + the case of embedded system where the kernel image size is + sensitive for people. + config SYMBOLIC_ERRNAME bool "Support symbolic error names in printf" default y if PRINTK diff --git a/lib/Makefile b/lib/Makefile index 32f19b4d1d2a..315516fa4ef4 100644 --- a/lib/Makefile +++ b/lib/Makefile @@ -190,7 +190,7 @@ lib-$(CONFIG_GENERIC_BUG) += bug.o obj-$(CONFIG_HAVE_ARCH_TRACEHOOK) += syscall.o -obj-$(CONFIG_DYNAMIC_DEBUG) += dynamic_debug.o +obj-$(CONFIG_DYNAMIC_DEBUG_CORE) += dynamic_debug.o obj-$(CONFIG_SYMBOLIC_ERRNAME) += errname.o obj-$(CONFIG_NLATTR) += nlattr.o diff --git a/lib/dynamic_debug.c b/lib/dynamic_debug.c index 8f199f403ab5..321437bbf87d 100644 --- a/lib/dynamic_debug.c +++ b/lib/dynamic_debug.c @@ -1032,8 +1032,13 @@ static int __init dynamic_debug_init(void) int verbose_bytes = 0; if (&__start___verbose == &__stop___verbose) { - pr_warn("_ddebug table is empty in a CONFIG_DYNAMIC_DEBUG build\n"); - return 1; + if (IS_ENABLED(CONFIG_DYNAMIC_DEBUG)) { + pr_warn("_ddebug table is empty in a CONFIG_DYNAMIC_DEBUG build\n"); + return 1; + } + pr_info("Ignore empty _ddebug table in a CONFIG_DYNAMIC_DEBUG_CORE build\n"); + ddebug_init_success = 1; + return 0; } iter = __start___verbose; modname = iter->modname; -- cgit v1.2.3 From db38d5c106dfdd7cb7207c83267d82fdf4950b61 Mon Sep 17 00:00:00 2001 From: Rafael Aquini Date: Sun, 7 Jun 2020 21:40:17 -0700 Subject: kernel: add panic_on_taint Analogously to the introduction of panic_on_warn, this patch introduces a kernel option named panic_on_taint in order to provide a simple and generic way to stop execution and catch a coredump when the kernel gets tainted by any given flag. This is useful for debugging sessions as it avoids having to rebuild the kernel to explicitly add calls to panic() into the code sites that introduce the taint flags of interest. For instance, if one is interested in proceeding with a post-mortem analysis at the point a given code path is hitting a bad page (i.e. unaccount_page_cache_page(), or slab_bug()), a coredump can be collected by rebooting the kernel with 'panic_on_taint=0x20' amended to the command line. Another, perhaps less frequent, use for this option would be as a means for assuring a security policy case where only a subset of taints, or no single taint (in paranoid mode), is allowed for the running system. The optional switch 'nousertaint' is handy in this particular scenario, as it will avoid userspace induced crashes by writes to sysctl interface /proc/sys/kernel/tainted causing false positive hits for such policies. 
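To make the bitmask composition concrete, here is a small userspace sketch, not part of the patch, that builds a panic_on_taint value by ORing (1 << TAINT_*) bits; the two bit numbers below are taken from include/linux/kernel.h as of this patch and should be checked against tainted-kernels.rst:

	#include <stdio.h>

	#define TAINT_BAD_PAGE	5	/* 'B': bad page reference, e.g. unaccount_page_cache_page() */
	#define TAINT_WARN	9	/* 'W': kernel issued a WARN() */

	int main(void)
	{
		/* Panic on bad pages or on WARN taint: boot with panic_on_taint=0x220 */
		unsigned long mask = (1UL << TAINT_BAD_PAGE) | (1UL << TAINT_WARN);

		printf("panic_on_taint=%#lx\n", mask);
		return 0;
	}

For the bad-page-only case quoted above, the mask is 1UL << 5 == 0x20, matching the 'panic_on_taint=0x20' example.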
[akpm@linux-foundation.org: tweak kernel-parameters.txt wording] Suggested-by: Qian Cai Signed-off-by: Rafael Aquini Signed-off-by: Andrew Morton Reviewed-by: Luis Chamberlain Cc: Dave Young Cc: Baoquan He Cc: Jonathan Corbet Cc: Kees Cook Cc: Randy Dunlap Cc: "Theodore Ts'o" Cc: Adrian Bunk Cc: Greg Kroah-Hartman Cc: Laura Abbott Cc: Jeff Mahoney Cc: Jiri Kosina Cc: Takashi Iwai Link: http://lkml.kernel.org/r/20200515175502.146720-1-aquini@redhat.com Signed-off-by: Linus Torvalds --- Documentation/admin-guide/kdump/kdump.rst | 8 ++++++ Documentation/admin-guide/kernel-parameters.txt | 13 ++++++++++ Documentation/admin-guide/sysctl/kernel.rst | 7 +++++ include/linux/kernel.h | 3 +++ kernel/panic.c | 34 +++++++++++++++++++++++++ kernel/sysctl.c | 11 +++++++- 6 files changed, 75 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/Documentation/admin-guide/kdump/kdump.rst b/Documentation/admin-guide/kdump/kdump.rst index ac7e131d2935..2da65fef2a1c 100644 --- a/Documentation/admin-guide/kdump/kdump.rst +++ b/Documentation/admin-guide/kdump/kdump.rst @@ -521,6 +521,14 @@ will cause a kdump to occur at the panic() call. In cases where a user wants to specify this during runtime, /proc/sys/kernel/panic_on_warn can be set to 1 to achieve the same behaviour. +Trigger Kdump on add_taint() +============================ + +The kernel parameter panic_on_taint facilitates a conditional call to panic() +from within add_taint() whenever the value set in this bitmask matches with the +bit flag being set by add_taint(). +This will cause a kdump to occur at the add_taint()->panic() call. + Contact ======= diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt index f3eeecbb3f63..df9b0fe2ed60 100644 --- a/Documentation/admin-guide/kernel-parameters.txt +++ b/Documentation/admin-guide/kernel-parameters.txt @@ -3447,6 +3447,19 @@ bit 4: print ftrace buffer bit 5: print all printk messages in buffer + panic_on_taint= Bitmask for conditionally calling panic() in add_taint() + Format: [,nousertaint] + Hexadecimal bitmask representing the set of TAINT flags + that will cause the kernel to panic when add_taint() is + called with any of the flags in this set. + The optional switch "nousertaint" can be utilized to + prevent userspace forced crashes by writing to sysctl + /proc/sys/kernel/tainted any flagset matching with the + bitmask set on panic_on_taint. + See Documentation/admin-guide/tainted-kernels.rst for + extra details on the taint flags that users can pick + to compose the bitmask to assign to panic_on_taint. + panic_on_warn panic() instead of WARN(). Useful to cause kdump on a WARN(). diff --git a/Documentation/admin-guide/sysctl/kernel.rst b/Documentation/admin-guide/sysctl/kernel.rst index 1ebf68d01141..3b00b9223157 100644 --- a/Documentation/admin-guide/sysctl/kernel.rst +++ b/Documentation/admin-guide/sysctl/kernel.rst @@ -1239,6 +1239,13 @@ ORed together. The letters are seen in "Tainted" line of Oops reports. See :doc:`/admin-guide/tainted-kernels` for more information. +Note: + writes to this sysctl interface will fail with ``EINVAL`` if the kernel is + booted with the command line option ``panic_on_taint=,nousertaint`` + and any of the ORed together values being written to ``tainted`` match with + the bitmask declared on panic_on_taint. + See :doc:`/admin-guide/kernel-parameters` for more details on that particular + kernel command line option and its optional ``nousertaint`` switch. 
threads-max =========== diff --git a/include/linux/kernel.h b/include/linux/kernel.h index 9b7a8d74a9d6..f7835db7102e 100644 --- a/include/linux/kernel.h +++ b/include/linux/kernel.h @@ -528,6 +528,8 @@ extern int panic_on_oops; extern int panic_on_unrecovered_nmi; extern int panic_on_io_nmi; extern int panic_on_warn; +extern unsigned long panic_on_taint; +extern bool panic_on_taint_nousertaint; extern int sysctl_panic_on_rcu_stall; extern int sysctl_panic_on_stackoverflow; @@ -596,6 +598,7 @@ extern enum system_states { #define TAINT_AUX 16 #define TAINT_RANDSTRUCT 17 #define TAINT_FLAGS_COUNT 18 +#define TAINT_FLAGS_MAX ((1UL << TAINT_FLAGS_COUNT) - 1) struct taint_flag { char c_true; /* character printed when tainted */ diff --git a/kernel/panic.c b/kernel/panic.c index b69ee9e76cb2..94b5c973770c 100644 --- a/kernel/panic.c +++ b/kernel/panic.c @@ -44,6 +44,8 @@ static int pause_on_oops_flag; static DEFINE_SPINLOCK(pause_on_oops_lock); bool crash_kexec_post_notifiers; int panic_on_warn __read_mostly; +unsigned long panic_on_taint; +bool panic_on_taint_nousertaint = false; int panic_timeout = CONFIG_PANIC_TIMEOUT; EXPORT_SYMBOL_GPL(panic_timeout); @@ -434,6 +436,11 @@ void add_taint(unsigned flag, enum lockdep_ok lockdep_ok) pr_warn("Disabling lock debugging due to kernel taint\n"); set_bit(flag, &tainted_mask); + + if (tainted_mask & panic_on_taint) { + panic_on_taint = 0; + panic("panic_on_taint set ..."); + } } EXPORT_SYMBOL(add_taint); @@ -686,3 +693,30 @@ static int __init oops_setup(char *s) return 0; } early_param("oops", oops_setup); + +static int __init panic_on_taint_setup(char *s) +{ + char *taint_str; + + if (!s) + return -EINVAL; + + taint_str = strsep(&s, ","); + if (kstrtoul(taint_str, 16, &panic_on_taint)) + return -EINVAL; + + /* make sure panic_on_taint doesn't hold out-of-range TAINT flags */ + panic_on_taint &= TAINT_FLAGS_MAX; + + if (!panic_on_taint) + return -EINVAL; + + if (s && !strcmp(s, "nousertaint")) + panic_on_taint_nousertaint = true; + + pr_info("panic_on_taint: bitmask=0x%lx nousertaint_mode=%sabled\n", + panic_on_taint, panic_on_taint_nousertaint ? "en" : "dis"); + + return 0; +} +early_param("panic_on_taint", panic_on_taint_setup); diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 715774d8c55f..587ed0494f2f 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -866,11 +866,20 @@ static int proc_taint(struct ctl_table *table, int write, return err; if (write) { + int i; + + /* + * If we are relying on panic_on_taint not producing + * false positives due to userspace input, bail out + * before setting the requested taint flags. + */ + if (panic_on_taint_nousertaint && (tmptaint & panic_on_taint)) + return -EINVAL; + /* * Poor man's atomic or. Not worth adding a primitive * to everyone's atomic.h for this */ - int i; for (i = 0; i < BITS_PER_LONG && tmptaint >> i; i++) { if ((tmptaint >> i) & 1) add_taint(i, LOCKDEP_STILL_OK); -- cgit v1.2.3 From 01f39c1c11ee5bf44a1df49e47eb53a86515b9dc Mon Sep 17 00:00:00 2001 From: Manfred Spraul Date: Sun, 7 Jun 2020 21:40:20 -0700 Subject: xarray.h: correct return code documentation for xa_store_{bh,irq}() __xa_store() and xa_store() document that the functions can fail, and that the return code can be an xa_err() encoded error code. xa_store_bh() and xa_store_irq() do not document that the functions can fail and that they can also return xa_err() encoded error codes. Thus: Update the documentation. 
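In caller terms, the updated wording amounts to a check like the following sketch (the xarray, index and entry are illustrative); xa_is_err() detects the encoded error and xa_err() extracts the negative errno:

	static int example_store(struct xarray *xa, unsigned long index, void *entry)
	{
		void *old = xa_store_bh(xa, index, entry, GFP_KERNEL);

		if (xa_is_err(old))
			return xa_err(old);	/* negative errno, e.g. -ENOMEM */

		/* old is the previous entry at this index (may be NULL) */
		return 0;
	}

The same pattern applies to xa_store_irq().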
Signed-off-by: Manfred Spraul Signed-off-by: Andrew Morton Cc: Matthew Wilcox Link: http://lkml.kernel.org/r/20200430111424.16634-1-manfred@colorfullife.com Signed-off-by: Linus Torvalds --- include/linux/xarray.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/xarray.h b/include/linux/xarray.h index 14c893433139..b4d70e7568b2 100644 --- a/include/linux/xarray.h +++ b/include/linux/xarray.h @@ -576,7 +576,7 @@ void __xa_clear_mark(struct xarray *, unsigned long index, xa_mark_t); * * Context: Any context. Takes and releases the xa_lock while * disabling softirqs. - * Return: The entry which used to be at this index. + * Return: The old entry at this index or xa_err() if an error happened. */ static inline void *xa_store_bh(struct xarray *xa, unsigned long index, void *entry, gfp_t gfp) @@ -602,7 +602,7 @@ static inline void *xa_store_bh(struct xarray *xa, unsigned long index, * * Context: Process context. Takes and releases the xa_lock while * disabling interrupts. - * Return: The entry which used to be at this index. + * Return: The old entry at this index or xa_err() if an error happened. */ static inline void *xa_store_irq(struct xarray *xa, unsigned long index, void *entry, gfp_t gfp) -- cgit v1.2.3 From 3db978d480e2843979a2b56f2f7da726f2b295b2 Mon Sep 17 00:00:00 2001 From: Vlastimil Babka Date: Sun, 7 Jun 2020 21:40:24 -0700 Subject: kernel/sysctl: support setting sysctl parameters from kernel command line Patch series "support setting sysctl parameters from kernel command line", v3. This series adds support for something that many people seem to have always wanted but nobody has added yet: the ability to set sysctl parameters via kernel command line options in the form of sysctl.vm.something=1 The important part is Patch 1. The second, not so important part is an attempt to clean up legacy one-off parameters that do the same thing as a sysctl. I don't want to remove them completely for compatibility reasons, but with generic sysctl support the idea is to remove the one-off param handlers and treat the parameters as aliases for the sysctl variants. I have identified several parameters that mention sysctl counterparts in Documentation/admin-guide/kernel-parameters.txt but there might be more. The conversion also has a varying level of success: - numa_zonelist_order is converted in Patch 2 together with adding the necessary infrastructure. It's easy as it doesn't really do anything but warn on a deprecated value these days. - hung_task_panic is converted in Patch 3, but there's a downside that now it only accepts 0 and 1, while previously it accepted any integer value - nmi_watchdog maps to two sysctls, nmi_watchdog and hardlockup_panic, so there's no straightforward conversion possible - traceoff_on_warning is a flag without a value and it would be required to handle that somehow in the conversion infrastructure, which seems pointless for a single flag This patch (of 5): A recently proposed patch to add a vm_swappiness command line parameter in addition to the existing sysctl [1] made me wonder why we don't have general support for passing sysctl parameters via the command line. Googling found only somebody else wondering the same [2], but I haven't found any prior discussion with reasons why not to do this.
Setting the vm_swappiness issue aside (the underlying issue might be solved in a different way), a quick search of kernel-parameters.txt shows there are already some parameters that exist as both a sysctl and a kernel parameter - hung_task_panic, nmi_watchdog, numa_zonelist_order, traceoff_on_warning. A general mechanism would remove the need to add more of those one-offs and might be handy in situations where configuration by e.g. /etc/sysctl.d/ is impractical. Hence, this patch adds a new parse_args() pass that looks for parameters prefixed by 'sysctl.' and tries to interpret them as writes to the corresponding sys/ files using a temporary in-kernel procfs mount. This mechanism was suggested by Eric W. Biederman [3], as it handles all dynamically registered sysctl tables, even though we don't handle modular sysctls. Errors due to e.g. an invalid parameter name or value are reported in the kernel log. The processing is hooked right before the init process is loaded, as some handlers might be more complicated than simple setters and might need some subsystems to be initialized. Since at that moment the init process can already be started and could eventually execute a process writing to /proc/sys/, it should also be fine to do the same writes directly from the kernel. Sysctls registered later, at module load time, are not set by this mechanism - it's expected that in such scenarios, setting sysctl values from userspace is practical enough. [1] https://lore.kernel.org/r/BL0PR02MB560167492CA4094C91589930E9FC0@BL0PR02MB5601.namprd02.prod.outlook.com/ [2] https://unix.stackexchange.com/questions/558802/how-to-set-sysctl-using-kernel-command-line-parameter [3] https://lore.kernel.org/r/87bloj2skm.fsf@x220.int.ebiederm.org/ Signed-off-by: Vlastimil Babka Signed-off-by: Andrew Morton Reviewed-by: Luis Chamberlain Reviewed-by: Masami Hiramatsu Acked-by: Kees Cook Acked-by: Michal Hocko Cc: Iurii Zaikin Cc: Ivan Teterevkov Cc: Michal Hocko Cc: David Rientjes Cc: Matthew Wilcox Cc: "Eric W . Biederman" Cc: "Guilherme G . Piccoli" Cc: Alexey Dobriyan Cc: Thomas Gleixner Cc: Greg Kroah-Hartman Cc: Christian Brauner Link: http://lkml.kernel.org/r/20200427180433.7029-1-vbabka@suse.cz Link: http://lkml.kernel.org/r/20200427180433.7029-2-vbabka@suse.cz Signed-off-by: Linus Torvalds --- Documentation/admin-guide/kernel-parameters.txt | 9 ++ fs/proc/proc_sysctl.c | 107 ++++++++++++++++++++++++ include/linux/sysctl.h | 4 + init/main.c | 2 + 4 files changed, 122 insertions(+) (limited to 'include/linux') diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt index df9b0fe2ed60..d02b9d023660 100644 --- a/Documentation/admin-guide/kernel-parameters.txt +++ b/Documentation/admin-guide/kernel-parameters.txt @@ -4969,6 +4969,15 @@ switches= [HW,M68k] + sysctl.*= [KNL] + Set a sysctl parameter, right before loading the init + process, as if the value was written to the respective + /proc/sys/... file. Both '.' and '/' are recognized as + separators. Unrecognized parameters and invalid values + are reported in the kernel log. Sysctls registered + later by a loaded module cannot be set this way. + Example: sysctl.vm.swappiness=40 + sysfs.deprecated=0|1 [KNL] Enable/disable old style sysfs layout for old udev on older distributions.
When this option is enabled diff --git a/fs/proc/proc_sysctl.c b/fs/proc/proc_sysctl.c index df2143e05c57..973acf96f37c 100644 --- a/fs/proc/proc_sysctl.c +++ b/fs/proc/proc_sysctl.c @@ -14,6 +14,7 @@ #include #include #include +#include #include "internal.h" static const struct dentry_operations proc_sys_dentry_operations; @@ -1703,3 +1704,109 @@ int __init proc_sys_init(void) return sysctl_init(); } + +/* Set sysctl value passed on kernel command line. */ +static int process_sysctl_arg(char *param, char *val, + const char *unused, void *arg) +{ + char *path; + struct vfsmount **proc_mnt = arg; + struct file_system_type *proc_fs_type; + struct file *file; + int len; + int err; + loff_t pos = 0; + ssize_t wret; + + if (strncmp(param, "sysctl", sizeof("sysctl") - 1)) + return 0; + + param += sizeof("sysctl") - 1; + + if (param[0] != '/' && param[0] != '.') + return 0; + + param++; + + /* + * To set sysctl options, we use a temporary mount of proc, look up the + * respective sys/ file and write to it. To avoid mounting it when no + * options were given, we mount it only when the first sysctl option is + * found. Why not a persistent mount? There are problems with a + * persistent mount of proc in that it forces userspace not to use any + * proc mount options. + */ + if (!*proc_mnt) { + proc_fs_type = get_fs_type("proc"); + if (!proc_fs_type) { + pr_err("Failed to find procfs to set sysctl from command line\n"); + return 0; + } + *proc_mnt = kern_mount(proc_fs_type); + put_filesystem(proc_fs_type); + if (IS_ERR(*proc_mnt)) { + pr_err("Failed to mount procfs to set sysctl from command line\n"); + return 0; + } + } + + path = kasprintf(GFP_KERNEL, "sys/%s", param); + if (!path) + panic("%s: Failed to allocate path for %s\n", __func__, param); + strreplace(path, '.', '/'); + + file = file_open_root((*proc_mnt)->mnt_root, *proc_mnt, path, O_WRONLY, 0); + if (IS_ERR(file)) { + err = PTR_ERR(file); + if (err == -ENOENT) + pr_err("Failed to set sysctl parameter '%s=%s': parameter not found\n", + param, val); + else if (err == -EACCES) + pr_err("Failed to set sysctl parameter '%s=%s': permission denied (read-only?)\n", + param, val); + else + pr_err("Error %pe opening proc file to set sysctl parameter '%s=%s'\n", + file, param, val); + goto out; + } + len = strlen(val); + wret = kernel_write(file, val, len, &pos); + if (wret < 0) { + err = wret; + if (err == -EINVAL) + pr_err("Failed to set sysctl parameter '%s=%s': invalid value\n", + param, val); + else + pr_err("Error %pe writing to proc file to set sysctl parameter '%s=%s'\n", + ERR_PTR(err), param, val); + } else if (wret != len) { + pr_err("Wrote only %zd bytes of %d writing to proc file %s to set sysctl parameter '%s=%s\n", + wret, len, path, param, val); + } + + err = filp_close(file, NULL); + if (err) + pr_err("Error %pe closing proc file to set sysctl parameter '%s=%s\n", + ERR_PTR(err), param, val); +out: + kfree(path); + return 0; +} + +void do_sysctl_args(void) +{ + char *command_line; + struct vfsmount *proc_mnt = NULL; + + command_line = kstrdup(saved_command_line, GFP_KERNEL); + if (!command_line) + panic("%s: Failed to allocate copy of command line\n", __func__); + + parse_args("Setting sysctl args", command_line, + NULL, 0, -1, -1, &proc_mnt, process_sysctl_arg); + + if (proc_mnt) + kern_unmount(proc_mnt); + + kfree(command_line); +} diff --git a/include/linux/sysctl.h b/include/linux/sysctl.h index f2401e45a3c2..50bb7f383a1b 100644 --- a/include/linux/sysctl.h +++ b/include/linux/sysctl.h @@ -197,6 +197,7 @@ struct ctl_table_header 
*register_sysctl_paths(const struct ctl_path *path, void unregister_sysctl_table(struct ctl_table_header * table); extern int sysctl_init(void); +void do_sysctl_args(void); extern int pwrsw_enabled; extern int unaligned_enabled; @@ -235,6 +236,9 @@ static inline void setup_sysctl_set(struct ctl_table_set *p, { } +static inline void do_sysctl_args(void) +{ +} #endif /* CONFIG_SYSCTL */ int sysctl_max_threads(struct ctl_table *table, int write, void *buffer, diff --git a/init/main.c b/init/main.c index 76df62fc3e2c..b59e09353881 100644 --- a/init/main.c +++ b/init/main.c @@ -1412,6 +1412,8 @@ static int __ref kernel_init(void *unused) rcu_end_inkernel_boot(); + do_sysctl_args(); + if (ramdisk_execute_command) { ret = run_init_process(ramdisk_execute_command); if (!ret) -- cgit v1.2.3 From 0ec9dc9bcba0a62b0844e54c1caf6b8b0bf6b5b4 Mon Sep 17 00:00:00 2001 From: "Guilherme G. Piccoli" Date: Sun, 7 Jun 2020 21:40:45 -0700 Subject: kernel/hung_task.c: introduce sysctl to print all traces when a hung task is detected Commit 401c636a0eeb ("kernel/hung_task.c: show all hung tasks before panic") introduced a change in that we started to show all CPUs' backtraces when a hung task is detected _and_ the sysctl/kernel parameter "hung_task_panic" is set. The idea is good, because usually when observing deadlocks (that may lead to hung tasks), the culprit is another task holding a lock and not necessarily the task detected as hung. The problem with this approach is that dumping backtraces is a somewhat expensive task, especially printing them on the console (and especially on machines with many CPUs, such as the servers commonly found nowadays). So, users that plan to collect a kdump to investigate the hung tasks and narrow down the deadlock definitely don't need the CPUs' backtraces on dmesg/console, which will delay the panic and pollute the log (the crash tool would easily grab all CPUs' traces with the 'bt -a' command). Also, there's the reciprocal scenario: some users may be interested in seeing the CPUs' backtraces but not in having the system panic when a hung task is detected. The current approach is hence almost like embedding a policy in the kernel, by forcing the CPUs' backtrace dump (only) on hung_task_panic. This patch decouples the panic event on hung task from the CPUs' backtrace dump, by creating (and documenting) a new sysctl called "hung_task_all_cpu_backtrace", analogous to the approach taken for soft/hard lockups, which have both a panic and an "all_cpu_backtrace" sysctl to allow individual control. The new mechanism for dumping the CPUs' backtraces on hung task detection respects "hung_task_warnings" by not dumping the traces in case there are no warnings left. Signed-off-by: Guilherme G. Piccoli Signed-off-by: Andrew Morton Reviewed-by: Kees Cook Cc: Tetsuo Handa Link: http://lkml.kernel.org/r/20200327223646.20779-1-gpiccoli@canonical.com Signed-off-by: Linus Torvalds --- Documentation/admin-guide/sysctl/kernel.rst | 14 ++++++++++++++ include/linux/sched/sysctl.h | 7 +++++++ kernel/hung_task.c | 20 ++++++++++++++++++-- kernel/sysctl.c | 11 +++++++++++ 4 files changed, 50 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/Documentation/admin-guide/sysctl/kernel.rst b/Documentation/admin-guide/sysctl/kernel.rst index 3b00b9223157..861820d27c19 100644 --- a/Documentation/admin-guide/sysctl/kernel.rst +++ b/Documentation/admin-guide/sysctl/kernel.rst @@ -335,6 +335,20 @@ Path for the hotplug policy agent. Default value is "``/sbin/hotplug``".
+hung_task_all_cpu_backtrace: +================ + +If this option is set, the kernel will send an NMI to all CPUs to dump +their backtraces when a hung task is detected. This file shows up if +CONFIG_DETECT_HUNG_TASK and CONFIG_SMP are enabled. + +0: Won't show all CPUs backtraces when a hung task is detected. +This is the default behavior. + +1: Will non-maskably interrupt all CPUs and dump their backtraces when +a hung task is detected. + + hung_task_panic =============== diff --git a/include/linux/sched/sysctl.h b/include/linux/sched/sysctl.h index 7b4d3a49b6c5..660ac49f2b53 100644 --- a/include/linux/sched/sysctl.h +++ b/include/linux/sched/sysctl.h @@ -7,6 +7,13 @@ struct ctl_table; #ifdef CONFIG_DETECT_HUNG_TASK + +#ifdef CONFIG_SMP +extern unsigned int sysctl_hung_task_all_cpu_backtrace; +#else +#define sysctl_hung_task_all_cpu_backtrace 0 +#endif /* CONFIG_SMP */ + extern int sysctl_hung_task_check_count; extern unsigned int sysctl_hung_task_panic; extern unsigned long sysctl_hung_task_timeout_secs; diff --git a/kernel/hung_task.c b/kernel/hung_task.c index b22b5eeab3cb..ce76f490126c 100644 --- a/kernel/hung_task.c +++ b/kernel/hung_task.c @@ -53,9 +53,18 @@ int __read_mostly sysctl_hung_task_warnings = 10; static int __read_mostly did_panic; static bool hung_task_show_lock; static bool hung_task_call_panic; +static bool hung_task_show_all_bt; static struct task_struct *watchdog_task; +#ifdef CONFIG_SMP +/* + * Should we dump all CPUs backtraces in a hung task event? + * Defaults to 0, can be changed via sysctl. + */ +unsigned int __read_mostly sysctl_hung_task_all_cpu_backtrace; +#endif /* CONFIG_SMP */ + /* * Should we panic (and reboot, if panic_timeout= is set) when a * hung task is detected: @@ -127,6 +136,9 @@ static void check_hung_task(struct task_struct *t, unsigned long timeout) " disables this message.\n"); sched_show_task(t); hung_task_show_lock = true; + + if (sysctl_hung_task_all_cpu_backtrace) + hung_task_show_all_bt = true; } touch_nmi_watchdog(); @@ -191,10 +203,14 @@ static void check_hung_uninterruptible_tasks(unsigned long timeout) rcu_read_unlock(); if (hung_task_show_lock) debug_show_all_locks(); - if (hung_task_call_panic) { + + if (hung_task_show_all_bt) { + hung_task_show_all_bt = false; trigger_all_cpu_backtrace(); - panic("hung_task: blocked tasks"); } + + if (hung_task_call_panic) + panic("hung_task: blocked tasks"); } static long hung_timeout_jiffies(unsigned long last_checked, diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 587ed0494f2f..34c1278951b9 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -2437,6 +2437,17 @@ static struct ctl_table kern_table[] = { }, #endif #ifdef CONFIG_DETECT_HUNG_TASK +#ifdef CONFIG_SMP + { + .procname = "hung_task_all_cpu_backtrace", + .data = &sysctl_hung_task_all_cpu_backtrace, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = SYSCTL_ZERO, + .extra2 = SYSCTL_ONE, + }, +#endif /* CONFIG_SMP */ { .procname = "hung_task_panic", .data = &sysctl_hung_task_panic, -- cgit v1.2.3 From 60c958d8df9cfc40b745d6cd583cfbfa7525ead6 Mon Sep 17 00:00:00 2001 From: "Guilherme G. Piccoli" Date: Sun, 7 Jun 2020 21:40:48 -0700 Subject: panic: add sysctl to dump all CPUs backtraces on oops event Usually when the kernel reaches an oops condition, it's a point of no return; in case not enough debug information is available in the kernel splat, one of the last resorts would be to collect a kernel crash dump and analyze it. 
The problem with this approach is that in order to collect the dump, a panic is required (to kexec-load the crash kernel). When in an environment of multiple virtual machines, users may prefer to try living with the oops, at least until being able to properly shutdown their VMs / finish their important tasks. This patch implements a way to collect a bit more debug details when an oops event is reached, by printing all the CPUs backtraces through the usage of NMIs (on architectures that support that). The sysctl added (and documented) here was called "oops_all_cpu_backtrace", and when set will (as the name suggests) dump all CPUs backtraces. Far from ideal, this may be the last option though for users that for some reason cannot panic on oops. Most of times oopses are clear enough to indicate the kernel portion that must be investigated, but in virtual environments it's possible to observe hypervisor/KVM issues that could lead to oopses shown in other guests CPUs (like virtual APIC crashes). This patch hence aims to help debug such complex issues without resorting to kdump. Signed-off-by: Guilherme G. Piccoli Signed-off-by: Andrew Morton Reviewed-by: Kees Cook Cc: Luis Chamberlain Cc: Iurii Zaikin Cc: Thomas Gleixner Cc: Vlastimil Babka Cc: Randy Dunlap Cc: Matthew Wilcox Link: http://lkml.kernel.org/r/20200327224116.21030-1-gpiccoli@canonical.com Signed-off-by: Linus Torvalds --- Documentation/admin-guide/sysctl/kernel.rst | 16 ++++++++++++++++ include/linux/kernel.h | 6 ++++++ kernel/panic.c | 11 +++++++++++ kernel/sysctl.c | 11 +++++++++++ 4 files changed, 44 insertions(+) (limited to 'include/linux') diff --git a/Documentation/admin-guide/sysctl/kernel.rst b/Documentation/admin-guide/sysctl/kernel.rst index 861820d27c19..83acf5025488 100644 --- a/Documentation/admin-guide/sysctl/kernel.rst +++ b/Documentation/admin-guide/sysctl/kernel.rst @@ -646,6 +646,22 @@ rate for each task. scanned for a given scan. +oops_all_cpu_backtrace: +================ + +If this option is set, the kernel will send an NMI to all CPUs to dump +their backtraces when an oops event occurs. It should be used as a last +resort in case a panic cannot be triggered (to protect VMs running, for +example) or kdump can't be collected. This file shows up if CONFIG_SMP +is enabled. + +0: Won't show all CPUs backtraces when an oops is detected. +This is the default behavior. + +1: Will non-maskably interrupt all CPUs and dump their backtraces when +an oops event is detected. + + osrelease, ostype & version =========================== diff --git a/include/linux/kernel.h b/include/linux/kernel.h index f7835db7102e..82d91547d122 100644 --- a/include/linux/kernel.h +++ b/include/linux/kernel.h @@ -520,6 +520,12 @@ static inline u32 int_sqrt64(u64 x) } #endif +#ifdef CONFIG_SMP +extern unsigned int sysctl_oops_all_cpu_backtrace; +#else +#define sysctl_oops_all_cpu_backtrace 0 +#endif /* CONFIG_SMP */ + extern void bust_spinlocks(int yes); extern int oops_in_progress; /* If set, an oops, panic(), BUG() or die() is in progress */ extern int panic_timeout; diff --git a/kernel/panic.c b/kernel/panic.c index 94b5c973770c..85568bbfb12b 100644 --- a/kernel/panic.c +++ b/kernel/panic.c @@ -36,6 +36,14 @@ #define PANIC_TIMER_STEP 100 #define PANIC_BLINK_SPD 18 +#ifdef CONFIG_SMP +/* + * Should we dump all CPUs backtraces in an oops event? + * Defaults to 0, can be changed via sysctl. 
+ */ +unsigned int __read_mostly sysctl_oops_all_cpu_backtrace; +#endif /* CONFIG_SMP */ + int panic_on_oops = CONFIG_PANIC_ON_OOPS_VALUE; static unsigned long tainted_mask = IS_ENABLED(CONFIG_GCC_PLUGIN_RANDSTRUCT) ? (1 << TAINT_RANDSTRUCT) : 0; @@ -522,6 +530,9 @@ void oops_enter(void) /* can't trust the integrity of the kernel anymore: */ debug_locks_off(); do_oops_enter_exit(); + + if (sysctl_oops_all_cpu_backtrace) + trigger_all_cpu_backtrace(); } /* diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 34c1278951b9..f69d581d39c3 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -2150,6 +2150,17 @@ static struct ctl_table kern_table[] = { .proc_handler = proc_dointvec, }, #endif +#ifdef CONFIG_SMP + { + .procname = "oops_all_cpu_backtrace", + .data = &sysctl_oops_all_cpu_backtrace, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = SYSCTL_ZERO, + .extra2 = SYSCTL_ONE, + }, +#endif /* CONFIG_SMP */ { .procname = "pid_max", .data = &pid_max, -- cgit v1.2.3 From dadbb612f6e50bbf9101c2f5d82690ce9ea4d66b Mon Sep 17 00:00:00 2001 From: Souptick Joarder Date: Sun, 7 Jun 2020 21:40:55 -0700 Subject: mm/gup.c: convert to use get_user_{page|pages}_fast_only() API __get_user_pages_fast() renamed to get_user_pages_fast_only() to align with pin_user_pages_fast_only(). As part of this we will get rid of write parameter. Instead caller will pass FOLL_WRITE to get_user_pages_fast_only(). This will not change any existing functionality of the API. All the callers are changed to pass FOLL_WRITE. Also introduce get_user_page_fast_only(), and use it in a few places that hard-code nr_pages to 1. Updated the documentation of the API. Signed-off-by: Souptick Joarder Signed-off-by: Andrew Morton Reviewed-by: John Hubbard Reviewed-by: Paul Mackerras [arch/powerpc/kvm] Cc: Matthew Wilcox Cc: Michael Ellerman Cc: Benjamin Herrenschmidt Cc: Peter Zijlstra Cc: Ingo Molnar Cc: Mark Rutland Cc: Alexander Shishkin Cc: Jiri Olsa Cc: Namhyung Kim Cc: Paolo Bonzini Cc: Stephen Rothwell Cc: Mike Rapoport Cc: Aneesh Kumar K.V Cc: Michal Suchanek Link: http://lkml.kernel.org/r/1590396812-31277-1-git-send-email-jrdr.linux@gmail.com Signed-off-by: Linus Torvalds --- arch/powerpc/kvm/book3s_64_mmu_hv.c | 2 +- arch/powerpc/kvm/book3s_64_mmu_radix.c | 2 +- arch/powerpc/perf/callchain_64.c | 4 +--- include/linux/mm.h | 10 ++++++++-- kernel/events/core.c | 4 ++-- mm/gup.c | 29 ++++++++++++++++------------- virt/kvm/kvm_main.c | 8 +++----- 7 files changed, 32 insertions(+), 27 deletions(-) (limited to 'include/linux') diff --git a/arch/powerpc/kvm/book3s_64_mmu_hv.c b/arch/powerpc/kvm/book3s_64_mmu_hv.c index 18aed9775a3c..ddfc4c90ebb6 100644 --- a/arch/powerpc/kvm/book3s_64_mmu_hv.c +++ b/arch/powerpc/kvm/book3s_64_mmu_hv.c @@ -581,7 +581,7 @@ int kvmppc_book3s_hv_page_fault(struct kvm_run *run, struct kvm_vcpu *vcpu, * We always ask for write permission since the common case * is that the page is writable. */ - if (__get_user_pages_fast(hva, 1, 1, &page) == 1) { + if (get_user_page_fast_only(hva, FOLL_WRITE, &page)) { write_ok = true; } else { /* Call KVM generic code to do the slow-path check */ diff --git a/arch/powerpc/kvm/book3s_64_mmu_radix.c b/arch/powerpc/kvm/book3s_64_mmu_radix.c index 02219e28b1e4..a47fa8b4d0f0 100644 --- a/arch/powerpc/kvm/book3s_64_mmu_radix.c +++ b/arch/powerpc/kvm/book3s_64_mmu_radix.c @@ -795,7 +795,7 @@ int kvmppc_book3s_instantiate_page(struct kvm_vcpu *vcpu, * is that the page is writable. 
*/ hva = gfn_to_hva_memslot(memslot, gfn); - if (!kvm_ro && __get_user_pages_fast(hva, 1, 1, &page) == 1) { + if (!kvm_ro && get_user_page_fast_only(hva, FOLL_WRITE, &page)) { upgrade_write = true; } else { unsigned long pfn; diff --git a/arch/powerpc/perf/callchain_64.c b/arch/powerpc/perf/callchain_64.c index b63086b663ef..9cc1a129737e 100644 --- a/arch/powerpc/perf/callchain_64.c +++ b/arch/powerpc/perf/callchain_64.c @@ -30,11 +30,9 @@ int read_user_stack_slow(void __user *ptr, void *buf, int nb) unsigned long addr = (unsigned long) ptr; unsigned long offset; struct page *page; - int nrpages; void *kaddr; - nrpages = __get_user_pages_fast(addr, 1, 1, &page); - if (nrpages == 1) { + if (get_user_page_fast_only(addr, FOLL_WRITE, &page)) { kaddr = page_address(page); /* align address to page boundary */ diff --git a/include/linux/mm.h b/include/linux/mm.h index 86adc71a972f..22010c4175f2 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -1824,10 +1824,16 @@ extern int mprotect_fixup(struct vm_area_struct *vma, /* * doesn't attempt to fault and will return short. */ -int __get_user_pages_fast(unsigned long start, int nr_pages, int write, - struct page **pages); +int get_user_pages_fast_only(unsigned long start, int nr_pages, + unsigned int gup_flags, struct page **pages); int pin_user_pages_fast_only(unsigned long start, int nr_pages, unsigned int gup_flags, struct page **pages); + +static inline bool get_user_page_fast_only(unsigned long addr, + unsigned int gup_flags, struct page **pagep) +{ + return get_user_pages_fast_only(addr, 1, gup_flags, pagep) == 1; +} /* * per-process(per-mm_struct) statistics. */ diff --git a/kernel/events/core.c b/kernel/events/core.c index fcfadecd3a08..63d66bbebbd5 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -6934,12 +6934,12 @@ static u64 perf_virt_to_phys(u64 virt) * Walking the pages tables for user address. * Interrupts are disabled, so it prevents any tear down * of the page tables. - * Try IRQ-safe __get_user_pages_fast first. + * Try IRQ-safe get_user_page_fast_only first. * If failed, leave phys_addr as 0. */ if (current->mm != NULL) { pagefault_disable(); - if (__get_user_pages_fast(virt, 1, 0, &p) == 1) + if (get_user_page_fast_only(virt, 0, &p)) phys_addr = page_to_phys(p) + virt % PAGE_SIZE; pagefault_enable(); } diff --git a/mm/gup.c b/mm/gup.c index e19ff770eb4c..9d1bf3ec8e39 100644 --- a/mm/gup.c +++ b/mm/gup.c @@ -2294,7 +2294,7 @@ pte_unmap: * to be special. * * For a futex to be placed on a THP tail page, get_futex_key requires a - * __get_user_pages_fast implementation that can pin pages. Thus it's still + * get_user_pages_fast_only implementation that can pin pages. Thus it's still * useful to have gup_huge_pmd even if we can't operate on ptes. 
*/ static int gup_pte_range(pmd_t pmd, unsigned long addr, unsigned long end, @@ -2699,7 +2699,7 @@ static inline void gup_pgd_range(unsigned long addr, unsigned long end, #ifndef gup_fast_permitted /* - * Check if it's allowed to use __get_user_pages_fast() for the range, or + * Check if it's allowed to use get_user_pages_fast_only() for the range, or * we need to fall back to the slow version: */ static bool gup_fast_permitted(unsigned long start, unsigned long end) @@ -2811,8 +2811,14 @@ static int internal_get_user_pages_fast(unsigned long start, int nr_pages, return ret; } - -/* +/** + * get_user_pages_fast_only() - pin user pages in memory + * @start: starting user address + * @nr_pages: number of pages from start to pin + * @gup_flags: flags modifying pin behaviour + * @pages: array that receives pointers to the pages pinned. + * Should be at least nr_pages long. + * * Like get_user_pages_fast() except it's IRQ-safe in that it won't fall back to * the regular GUP. * Note a difference with get_user_pages_fast: this always returns the @@ -2825,8 +2831,8 @@ static int internal_get_user_pages_fast(unsigned long start, int nr_pages, * access can get ambiguous page results. If you call this function without * 'write' set, you'd better be sure that you're ok with that ambiguity. */ -int __get_user_pages_fast(unsigned long start, int nr_pages, int write, - struct page **pages) +int get_user_pages_fast_only(unsigned long start, int nr_pages, + unsigned int gup_flags, struct page **pages) { int nr_pinned; /* @@ -2836,10 +2842,7 @@ int __get_user_pages_fast(unsigned long start, int nr_pages, int write, * FOLL_FAST_ONLY is required in order to match the API description of * this routine: no fall back to regular ("slow") GUP. */ - unsigned int gup_flags = FOLL_GET | FOLL_FAST_ONLY; - - if (write) - gup_flags |= FOLL_WRITE; + gup_flags |= FOLL_GET | FOLL_FAST_ONLY; nr_pinned = internal_get_user_pages_fast(start, nr_pages, gup_flags, pages); @@ -2855,7 +2858,7 @@ int __get_user_pages_fast(unsigned long start, int nr_pages, int write, return nr_pinned; } -EXPORT_SYMBOL_GPL(__get_user_pages_fast); +EXPORT_SYMBOL_GPL(get_user_pages_fast_only); /** * get_user_pages_fast() - pin user pages in memory @@ -2926,8 +2929,8 @@ int pin_user_pages_fast(unsigned long start, int nr_pages, EXPORT_SYMBOL_GPL(pin_user_pages_fast); /* - * This is the FOLL_PIN equivalent of __get_user_pages_fast(). Behavior is the - * same, except that this one sets FOLL_PIN instead of FOLL_GET. + * This is the FOLL_PIN equivalent of get_user_pages_fast_only(). Behavior + * is the same, except that this one sets FOLL_PIN instead of FOLL_GET. * * The API rules are the same, too: no negative values may be returned. 
*/ diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index 7fa1e38e1659..7b0da1c28e51 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -1740,7 +1740,6 @@ static bool hva_to_pfn_fast(unsigned long addr, bool write_fault, bool *writable, kvm_pfn_t *pfn) { struct page *page[1]; - int npages; /* * Fast pin a writable pfn only if it is a write fault request @@ -1750,8 +1749,7 @@ static bool hva_to_pfn_fast(unsigned long addr, bool write_fault, if (!(write_fault || writable)) return false; - npages = __get_user_pages_fast(addr, 1, 1, page); - if (npages == 1) { + if (get_user_page_fast_only(addr, FOLL_WRITE, page)) { *pfn = page_to_pfn(page[0]); if (writable) @@ -1791,7 +1789,7 @@ static int hva_to_pfn_slow(unsigned long addr, bool *async, bool write_fault, if (unlikely(!write_fault) && writable) { struct page *wpage; - if (__get_user_pages_fast(addr, 1, 1, &wpage) == 1) { + if (get_user_page_fast_only(addr, FOLL_WRITE, &wpage)) { *writable = true; put_page(page); page = wpage; @@ -2003,7 +2001,7 @@ int gfn_to_page_many_atomic(struct kvm_memory_slot *slot, gfn_t gfn, if (entry < nr_pages) return 0; - return __get_user_pages_fast(addr, nr_pages, 1, pages); + return get_user_pages_fast_only(addr, nr_pages, FOLL_WRITE, pages); } EXPORT_SYMBOL_GPL(gfn_to_page_many_atomic); -- cgit v1.2.3 From 420c2091b65a95b1daea4c36d27df4ac5f38033d Mon Sep 17 00:00:00 2001 From: John Hubbard Date: Sun, 7 Jun 2020 21:41:02 -0700 Subject: mm/gup: introduce pin_user_pages_locked() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Patch series "mm/gup: introduce pin_user_pages_locked(), use it in frame_vector.c", v2. This adds yet one more pin_user_pages*() variant, and uses that to convert mm/frame_vector.c. With this, along with maybe 20 or 30 other recent patches in various trees, we are close to having the relevant gup call sites converted--with the notable exception of the bio/block layer. This patch (of 2): Introduce pin_user_pages_locked(), which is nearly identical to get_user_pages_locked() except that it sets FOLL_PIN and rejects FOLL_GET. As with other pairs of get_user_pages*() and pin_user_pages() API calls, it's prudent to assert that FOLL_PIN is *not* set in the get_user_pages*() call, so add that as part of this. 
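A rough usage sketch from process context follows; the function name, buffer address and page count are illustrative, the caller takes mmap_sem for read per the get_user_pages_locked() convention of this time, and FOLL_PIN pages are released with unpin_user_pages():

	static long example_pin(unsigned long uaddr, struct page **pages, int nr)
	{
		int locked = 1;
		long pinned;

		down_read(&current->mm->mmap_sem);
		pinned = pin_user_pages_locked(uaddr, nr, FOLL_WRITE, pages, &locked);
		if (locked)
			up_read(&current->mm->mmap_sem);	/* may already have been dropped */
		if (pinned < 0)
			return pinned;

		/* ... access the pinned pages ... */

		unpin_user_pages(pages, pinned);
		return pinned;
	}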
[jhubbard@nvidia.com: v2] Link: http://lkml.kernel.org/r/20200531234131.770697-2-jhubbard@nvidia.com Signed-off-by: John Hubbard Signed-off-by: Andrew Morton Reviewed-by: David Hildenbrand Acked-by: Pankaj Gupta Cc: Daniel Vetter Cc: Jérôme Glisse Cc: Vlastimil Babka Cc: Jan Kara Cc: Dave Chinner Cc: Souptick Joarder Link: http://lkml.kernel.org/r/20200531234131.770697-1-jhubbard@nvidia.com Link: http://lkml.kernel.org/r/20200527223243.884385-1-jhubbard@nvidia.com Link: http://lkml.kernel.org/r/20200527223243.884385-2-jhubbard@nvidia.com Signed-off-by: Linus Torvalds --- include/linux/mm.h | 2 ++ mm/gup.c | 35 +++++++++++++++++++++++++++++++++++ 2 files changed, 37 insertions(+) (limited to 'include/linux') diff --git a/include/linux/mm.h b/include/linux/mm.h index 22010c4175f2..9d6042178ca7 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -1706,6 +1706,8 @@ long pin_user_pages(unsigned long start, unsigned long nr_pages, struct vm_area_struct **vmas); long get_user_pages_locked(unsigned long start, unsigned long nr_pages, unsigned int gup_flags, struct page **pages, int *locked); +long pin_user_pages_locked(unsigned long start, unsigned long nr_pages, + unsigned int gup_flags, struct page **pages, int *locked); long get_user_pages_unlocked(unsigned long start, unsigned long nr_pages, struct page **pages, unsigned int gup_flags); long pin_user_pages_unlocked(unsigned long start, unsigned long nr_pages, diff --git a/mm/gup.c b/mm/gup.c index 9d1bf3ec8e39..5ce0a99dd8f3 100644 --- a/mm/gup.c +++ b/mm/gup.c @@ -2035,6 +2035,12 @@ long get_user_pages_locked(unsigned long start, unsigned long nr_pages, */ if (WARN_ON_ONCE(gup_flags & FOLL_LONGTERM)) return -EINVAL; + /* + * FOLL_PIN must only be set internally by the pin_user_pages*() APIs, + * never directly by the caller, so enforce that: + */ + if (WARN_ON_ONCE(gup_flags & FOLL_PIN)) + return -EINVAL; return __get_user_pages_locked(current, current->mm, start, nr_pages, pages, NULL, locked, @@ -3058,3 +3064,32 @@ long pin_user_pages_unlocked(unsigned long start, unsigned long nr_pages, return get_user_pages_unlocked(start, nr_pages, pages, gup_flags); } EXPORT_SYMBOL(pin_user_pages_unlocked); + +/* + * pin_user_pages_locked() is the FOLL_PIN variant of get_user_pages_locked(). + * Behavior is the same, except that this one sets FOLL_PIN and rejects + * FOLL_GET. + */ +long pin_user_pages_locked(unsigned long start, unsigned long nr_pages, + unsigned int gup_flags, struct page **pages, + int *locked) +{ + /* + * FIXME: Current FOLL_LONGTERM behavior is incompatible with + * FAULT_FLAG_ALLOW_RETRY because of the FS DAX check requirement on + * vmas. As there are no users of this flag in this call we simply + * disallow this option for now. + */ + if (WARN_ON_ONCE(gup_flags & FOLL_LONGTERM)) + return -EINVAL; + + /* FOLL_GET and FOLL_PIN are mutually exclusive. */ + if (WARN_ON_ONCE(gup_flags & FOLL_GET)) + return -EINVAL; + + gup_flags |= FOLL_PIN; + return __get_user_pages_locked(current, current->mm, start, nr_pages, + pages, NULL, locked, + gup_flags | FOLL_TOUCH); +} +EXPORT_SYMBOL(pin_user_pages_locked); -- cgit v1.2.3 From 2062a4e8ae9f486847652927aaf88e21ab8d195d Mon Sep 17 00:00:00 2001 From: Dmitry Safonov Date: Mon, 8 Jun 2020 21:29:56 -0700 Subject: kallsyms/printk: add loglvl to print_ip_sym() Patch series "Add log level to show_stack()", v3. Add log level argument to show_stack(). Done in three stages: 1. Introducing show_stack_loglvl() for every architecture 2. Migrating old users with an explicit log level 3. 
Renaming show_stack_loglvl() into show_stack() Justification: - It's a design mistake to move a business-logic decision into platform realization detail. - I have currently two patches sets that would benefit from this work: Removing console_loglevel jumps in sysrq driver [1] Hung task warning before panic [2] - suggested by Tetsuo (but he probably didn't realise what it would involve). - While doing (1), (2) the backtraces were adjusted to headers and other messages for each situation - so there won't be a situation when the backtrace is printed, but the headers are missing because they have lesser log level (or the reverse). - As the result in (2) plays with console_loglevel for kdb are removed. The least important for upstream, but maybe still worth to note that every company I've worked in so far had an off-list patch to print backtrace with the needed log level (but only for the architecture they cared about). If you have other ideas how you will benefit from show_stack() with a log level - please, reply to this cover letter. See also discussion on v1: https://lore.kernel.org/linux-riscv/20191106083538.z5nlpuf64cigxigh@pathway.suse.cz/ This patch (of 50): print_ip_sym() needs to have a log level parameter to comply with other parts being printed. Otherwise, half of the expected backtrace would be printed and other may be missing with some logging level. The following callee(s) are using now the adjusted log level: - microblaze/unwind: the same level as headers & userspace unwind. Note that pr_debug()'s there are for debugging the unwinder itself. - nds32/traps: symbol addresses are printed with the same log level as backtrace headers. - lockdep: ip for locking issues is printed with the same log level as other part of the warning. - sched: ip where preemption was disabled is printed as error like the rest part of the message. - ftrace: bug reports are now consistent in the log level being used. Signed-off-by: Dmitry Safonov Signed-off-by: Andrew Morton Acked-by: Steven Rostedt (VMware) Cc: Albert Ou Cc: Ben Segall Cc: Dietmar Eggemann Cc: Greentime Hu Cc: Greg Kroah-Hartman Cc: Ingo Molnar Cc: James Hogan Cc: Juri Lelli Cc: Mel Gorman Cc: Michal Simek Cc: Palmer Dabbelt Cc: Paul Burton Cc: Paul Walmsley Cc: Peter Zijlstra Cc: Ralf Baechle Cc: Thomas Gleixner Cc: Vincent Chen Cc: Vincent Guittot Cc: Will Deacon Cc: Dmitry Safonov <0x7f454c46@gmail.com> Cc: Dmitry Safonov Cc: Jiri Slaby Cc: Petr Mladek Cc: Sergey Senozhatsky Cc: Ivan Kokshaysky Cc: Matt Turner Cc: Richard Henderson Cc: Vineet Gupta Cc: Russell King Cc: Catalin Marinas Cc: Aurelien Jacquiot Cc: Mark Salter Cc: Guo Ren Cc: Yoshinori Sato Cc: Brian Cain Cc: Fenghua Yu Cc: Tony Luck Cc: Geert Uytterhoeven Cc: Ley Foon Tan Cc: Jonas Bonn Cc: Stafford Horne Cc: Stefan Kristiansson Cc: Helge Deller Cc: "James E.J. Bottomley" Cc: Benjamin Herrenschmidt Cc: Michael Ellerman Cc: Paul Mackerras Cc: Christian Borntraeger Cc: Heiko Carstens Cc: Vasily Gorbik Cc: Rich Felker Cc: "David S. Miller" Cc: Anton Ivanov Cc: Jeff Dike Cc: Richard Weinberger Cc: Guan Xuetao Cc: Borislav Petkov Cc: "H. Peter Anvin" Cc: Chris Zankel Cc: Max Filippov Cc: Len Brown Cc: Pavel Machek Cc: "Rafael J. Wysocki" Cc: "Rafael J. 
Wysocki" Cc: Daniel Thompson Cc: Douglas Anderson Cc: Jason Wessel Link: http://lkml.kernel.org/r/20200418201944.482088-2-dima@arista.com Signed-off-by: Linus Torvalds --- arch/microblaze/kernel/unwind.c | 2 +- arch/mips/kernel/traps.c | 4 ++-- arch/nds32/kernel/traps.c | 4 ++-- arch/riscv/kernel/stacktrace.c | 2 +- include/linux/kallsyms.h | 4 ++-- kernel/locking/lockdep.c | 4 ++-- kernel/sched/core.c | 6 ++---- kernel/trace/ftrace.c | 8 ++++---- tools/include/linux/kallsyms.h | 2 +- 9 files changed, 17 insertions(+), 19 deletions(-) (limited to 'include/linux') diff --git a/arch/microblaze/kernel/unwind.c b/arch/microblaze/kernel/unwind.c index 34c270cb11fc..4241cdd28ee7 100644 --- a/arch/microblaze/kernel/unwind.c +++ b/arch/microblaze/kernel/unwind.c @@ -254,7 +254,7 @@ static void microblaze_unwind_inner(struct task_struct *task, task->comm); break; } else - print_ip_sym(pc); + print_ip_sym(KERN_INFO, pc); } /* Stop when we reach anything not part of the kernel */ diff --git a/arch/mips/kernel/traps.c b/arch/mips/kernel/traps.c index 22f805a73921..210fea63de75 100644 --- a/arch/mips/kernel/traps.c +++ b/arch/mips/kernel/traps.c @@ -125,7 +125,7 @@ static void show_raw_backtrace(unsigned long reg29) break; } if (__kernel_text_address(addr)) - print_ip_sym(addr); + print_ip_sym(KERN_DEFAULT, addr); } printk("\n"); } @@ -155,7 +155,7 @@ static void show_backtrace(struct task_struct *task, const struct pt_regs *regs) } printk("Call Trace:\n"); do { - print_ip_sym(pc); + print_ip_sym(KERN_DEFAULT, pc); pc = unwind_stack(task, &sp, pc, &ra); } while (pc); pr_cont("\n"); diff --git a/arch/nds32/kernel/traps.c b/arch/nds32/kernel/traps.c index f4d386b52622..40625760a125 100644 --- a/arch/nds32/kernel/traps.c +++ b/arch/nds32/kernel/traps.c @@ -108,7 +108,7 @@ static void __dump(struct task_struct *tsk, unsigned long *base_reg) if (__kernel_text_address(ret_addr)) { ret_addr = ftrace_graph_ret_addr( tsk, &graph, ret_addr, NULL); - print_ip_sym(ret_addr); + print_ip_sym(KERN_EMERG, ret_addr); } if (--cnt < 0) break; @@ -124,7 +124,7 @@ static void __dump(struct task_struct *tsk, unsigned long *base_reg) ret_addr = ftrace_graph_ret_addr( tsk, &graph, ret_addr, NULL); - print_ip_sym(ret_addr); + print_ip_sym(KERN_EMERG, ret_addr); } if (--cnt < 0) break; diff --git a/arch/riscv/kernel/stacktrace.c b/arch/riscv/kernel/stacktrace.c index 837b9b38f825..9f1ac258482f 100644 --- a/arch/riscv/kernel/stacktrace.c +++ b/arch/riscv/kernel/stacktrace.c @@ -99,7 +99,7 @@ void notrace walk_stackframe(struct task_struct *task, static bool print_trace_address(unsigned long pc, void *arg) { - print_ip_sym(pc); + print_ip_sym(KERN_DEFAULT, pc); return false; } diff --git a/include/linux/kallsyms.h b/include/linux/kallsyms.h index 657a83b943f0..98338dc6b5d2 100644 --- a/include/linux/kallsyms.h +++ b/include/linux/kallsyms.h @@ -165,9 +165,9 @@ static inline int kallsyms_show_value(void) #endif /*CONFIG_KALLSYMS*/ -static inline void print_ip_sym(unsigned long ip) +static inline void print_ip_sym(const char *loglvl, unsigned long ip) { - printk("[<%px>] %pS\n", (void *) ip, (void *) ip); + printk("%s[<%px>] %pS\n", loglvl, (void *) ip, (void *) ip); } #endif /*_LINUX_KALLSYMS_H*/ diff --git a/kernel/locking/lockdep.c b/kernel/locking/lockdep.c index 4c057dd8e93b..38cce34d03dc 100644 --- a/kernel/locking/lockdep.c +++ b/kernel/locking/lockdep.c @@ -4424,7 +4424,7 @@ static void print_unlock_imbalance_bug(struct task_struct *curr, curr->comm, task_pid_nr(curr)); print_lockdep_cache(lock); pr_cont(") at:\n"); - 
print_ip_sym(ip); + print_ip_sym(KERN_WARNING, ip); pr_warn("but there are no more locks to release!\n"); pr_warn("\nother info that might help us debug this:\n"); lockdep_print_held_locks(curr); @@ -5075,7 +5075,7 @@ static void print_lock_contention_bug(struct task_struct *curr, curr->comm, task_pid_nr(curr)); print_lockdep_cache(lock); pr_cont(") at:\n"); - print_ip_sym(ip); + print_ip_sym(KERN_WARNING, ip); pr_warn("but there are no locks held!\n"); pr_warn("\nother info that might help us debug this:\n"); lockdep_print_held_locks(curr); diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 8298b2c240ce..c06da3c3e317 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -3922,8 +3922,7 @@ static noinline void __schedule_bug(struct task_struct *prev) if (IS_ENABLED(CONFIG_DEBUG_PREEMPT) && in_atomic_preempt_off()) { pr_err("Preemption disabled at:"); - print_ip_sym(preempt_disable_ip); - pr_cont("\n"); + print_ip_sym(KERN_ERR, preempt_disable_ip); } if (panic_on_warn) panic("scheduling while atomic\n"); @@ -6871,8 +6870,7 @@ void ___might_sleep(const char *file, int line, int preempt_offset) if (IS_ENABLED(CONFIG_DEBUG_PREEMPT) && !preempt_count_equals(preempt_offset)) { pr_err("Preemption disabled at:"); - print_ip_sym(preempt_disable_ip); - pr_cont("\n"); + print_ip_sym(KERN_ERR, preempt_disable_ip); } dump_stack(); add_taint(TAINT_WARN, LOCKDEP_STILL_OK); diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index b5765aeea698..7d0ebd104706 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -2020,12 +2020,12 @@ void ftrace_bug(int failed, struct dyn_ftrace *rec) case -EFAULT: FTRACE_WARN_ON_ONCE(1); pr_info("ftrace faulted on modifying "); - print_ip_sym(ip); + print_ip_sym(KERN_INFO, ip); break; case -EINVAL: FTRACE_WARN_ON_ONCE(1); pr_info("ftrace failed to modify "); - print_ip_sym(ip); + print_ip_sym(KERN_INFO, ip); print_ip_ins(" actual: ", (unsigned char *)ip); pr_cont("\n"); if (ftrace_expected) { @@ -2036,12 +2036,12 @@ void ftrace_bug(int failed, struct dyn_ftrace *rec) case -EPERM: FTRACE_WARN_ON_ONCE(1); pr_info("ftrace faulted on writing "); - print_ip_sym(ip); + print_ip_sym(KERN_INFO, ip); break; default: FTRACE_WARN_ON_ONCE(1); pr_info("ftrace faulted on unknown error "); - print_ip_sym(ip); + print_ip_sym(KERN_INFO, ip); } print_bug_type(); if (rec) { diff --git a/tools/include/linux/kallsyms.h b/tools/include/linux/kallsyms.h index 89ca6fe257cc..efb6c3f5f2a9 100644 --- a/tools/include/linux/kallsyms.h +++ b/tools/include/linux/kallsyms.h @@ -20,7 +20,7 @@ static inline const char *kallsyms_lookup(unsigned long addr, #include #include -static inline void print_ip_sym(unsigned long ip) +static inline void print_ip_sym(const char *loglvl, unsigned long ip) { char **name; -- cgit v1.2.3 From ab34b46d1a74e98996e67a7da7e5d683ecfd9f57 Mon Sep 17 00:00:00 2001 From: Dmitry Safonov Date: Mon, 8 Jun 2020 21:32:10 -0700 Subject: sysrq: use show_stack_loglvl() Show the stack trace on a CPU with the same log level as "CPU%d" header. 
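To make the conversion pattern concrete before the diff below, here is a minimal sketch of a CPU-dump helper in the same style. The function name showacpu_sketch is hypothetical; the sketch assumes the transitional show_stack_loglvl() API introduced earlier in the series (it is renamed back to show_stack() by the final patch):

#include <linux/printk.h>
#include <linux/sched/debug.h>	/* show_stack_loglvl() */
#include <linux/smp.h>

static void showacpu_sketch(void *dummy)
{
	/*
	 * The header and the backtrace are emitted at the same log
	 * level, so neither half can be filtered out at the console
	 * while the other survives. In the real sysrq driver this runs
	 * on each CPU via an IPI, so smp_processor_id() is stable here.
	 */
	pr_info("CPU%d:\n", smp_processor_id());
	show_stack_loglvl(NULL, NULL, KERN_INFO);
}

After the rename at the end of the series, the same call site reads show_stack(NULL, NULL, KERN_INFO), as the follow-up commit below shows.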
Signed-off-by: Dmitry Safonov Signed-off-by: Andrew Morton Cc: Greg Kroah-Hartman Cc: Jiri Slaby Link: http://lkml.kernel.org/r/20200418201944.482088-45-dima@arista.com Signed-off-by: Linus Torvalds --- drivers/tty/sysrq.c | 2 +- include/linux/sched/debug.h | 2 ++ 2 files changed, 3 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/drivers/tty/sysrq.c b/drivers/tty/sysrq.c index 477cdc1e9cf3..7bd935379dec 100644 --- a/drivers/tty/sysrq.c +++ b/drivers/tty/sysrq.c @@ -235,7 +235,7 @@ static void showacpu(void *dummy) raw_spin_lock_irqsave(&show_lock, flags); pr_info("CPU%d:\n", smp_processor_id()); - show_stack(NULL, NULL); + show_stack_loglvl(NULL, NULL, KERN_INFO); raw_spin_unlock_irqrestore(&show_lock, flags); } diff --git a/include/linux/sched/debug.h b/include/linux/sched/debug.h index 95fb9e025247..373e4e3faf2a 100644 --- a/include/linux/sched/debug.h +++ b/include/linux/sched/debug.h @@ -31,6 +31,8 @@ extern void show_regs(struct pt_regs *); * trace (or NULL if the entire call-chain of the task should be shown). */ extern void show_stack(struct task_struct *task, unsigned long *sp); +extern void show_stack_loglvl(struct task_struct *task, unsigned long *sp, + const char *loglvl); extern void sched_show_task(struct task_struct *p); -- cgit v1.2.3 From 9cb8f069deeed708bf19486d5893e297dc467ae0 Mon Sep 17 00:00:00 2001 From: Dmitry Safonov Date: Mon, 8 Jun 2020 21:32:29 -0700 Subject: kernel: rename show_stack_loglvl() => show_stack() Now that the last users of show_stack() have been converted to use an explicit log level, show_stack_loglvl() can drop its redundant suffix and once again become the well-known show_stack(). Signed-off-by: Dmitry Safonov Signed-off-by: Andrew Morton Link: http://lkml.kernel.org/r/20200418201944.482088-51-dima@arista.com Signed-off-by: Linus Torvalds --- arch/alpha/kernel/traps.c | 8 +------- arch/arc/kernel/stacktrace.c | 8 +------- arch/arm/kernel/traps.c | 8 +------- arch/arm64/kernel/traps.c | 8 +------- arch/c6x/kernel/traps.c | 7 +------ arch/csky/kernel/ptrace.c | 4 ++-- arch/csky/kernel/stacktrace.c | 9 +-------- arch/h8300/kernel/traps.c | 8 +------- arch/hexagon/kernel/traps.c | 8 +------- arch/ia64/kernel/mca.c | 2 +- arch/ia64/kernel/process.c | 11 ++--------- arch/m68k/kernel/traps.c | 11 +++-------- arch/microblaze/kernel/traps.c | 8 +------- arch/mips/kernel/traps.c | 8 +------- arch/nds32/kernel/traps.c | 8 +------- arch/nios2/kernel/traps.c | 12 +++--------- arch/openrisc/kernel/traps.c | 10 ++-------- arch/parisc/kernel/traps.c | 8 +------- arch/powerpc/kernel/process.c | 11 +++-------- arch/powerpc/kernel/stacktrace.c | 2 +- arch/riscv/kernel/stacktrace.c | 8 +------- arch/s390/kernel/dumpstack.c | 9 ++------- arch/sh/kernel/dumpstack.c | 8 +------- arch/sparc/kernel/process_32.c | 11 ++--------- arch/sparc/kernel/process_64.c | 2 +- arch/sparc/kernel/traps_64.c | 8 +------- arch/um/drivers/mconsole_kern.c | 2 +- arch/um/kernel/sysrq.c | 7 +------ arch/unicore32/kernel/traps.c | 7 +------ arch/x86/kernel/amd_gart_64.c | 2 +- arch/x86/kernel/dumpstack.c | 7 +------ arch/xtensa/kernel/traps.c | 10 ++-------- drivers/base/power/main.c | 2 +- drivers/tty/sysrq.c | 2 +- include/linux/sched/debug.h | 5 ++--- kernel/debug/kdb/kdb_bt.c | 2 +- kernel/locking/rtmutex-debug.c | 2 +- kernel/sched/core.c | 2 +- lib/dump_stack.c | 2 +- 39 files changed, 52 insertions(+), 205 deletions(-) (limited to 'include/linux') diff --git a/arch/alpha/kernel/traps.c b/arch/alpha/kernel/traps.c index 2402f1777f54..8383ccfaccdc 100644 ---
a/arch/alpha/kernel/traps.c +++ b/arch/alpha/kernel/traps.c @@ -144,8 +144,7 @@ dik_show_trace(unsigned long *sp, const char *loglvl) static int kstack_depth_to_print = 24; -void show_stack_loglvl(struct task_struct *task, unsigned long *sp, - const char *loglvl) +void show_stack(struct task_struct *task, unsigned long *sp, const char *loglvl) { unsigned long *stack; int i; @@ -174,11 +173,6 @@ void show_stack_loglvl(struct task_struct *task, unsigned long *sp, dik_show_trace(sp, loglvl); } -void show_stack(struct task_struct *task, unsigned long *sp) -{ - show_stack_loglvl(task, sp, KERN_DEFAULT); -} - void die_if_kernel(char * str, struct pt_regs *regs, long err, unsigned long *r9_15) { diff --git a/arch/arc/kernel/stacktrace.c b/arch/arc/kernel/stacktrace.c index 24f9cd8a12c9..feba91c9d969 100644 --- a/arch/arc/kernel/stacktrace.c +++ b/arch/arc/kernel/stacktrace.c @@ -228,17 +228,11 @@ noinline void show_stacktrace(struct task_struct *tsk, struct pt_regs *regs, EXPORT_SYMBOL(show_stacktrace); /* Expected by sched Code */ -void show_stack_loglvl(struct task_struct *tsk, unsigned long *sp, - const char *loglvl) +void show_stack(struct task_struct *tsk, unsigned long *sp, const char *loglvl) { show_stacktrace(tsk, NULL, loglvl); } -void show_stack(struct task_struct *tsk, unsigned long *sp) -{ - show_stack_loglvl(tsk, sp, KERN_DEFAULT); -} - /* Another API expected by schedular, shows up in "ps" as Wait Channel * Of course just returning schedule( ) would be pointless so unwind until * the function is not in schedular code diff --git a/arch/arm/kernel/traps.c b/arch/arm/kernel/traps.c index 2ae2c23799ad..65a3b1e75480 100644 --- a/arch/arm/kernel/traps.c +++ b/arch/arm/kernel/traps.c @@ -247,18 +247,12 @@ static void dump_backtrace(struct pt_regs *regs, struct task_struct *tsk, } #endif -void show_stack_loglvl(struct task_struct *tsk, unsigned long *sp, - const char *loglvl) +void show_stack(struct task_struct *tsk, unsigned long *sp, const char *loglvl) { dump_backtrace(NULL, tsk, loglvl); barrier(); } -void show_stack(struct task_struct *tsk, unsigned long *sp) -{ - show_stack_loglvl(tsk, sp, KERN_DEFAULT); -} - #ifdef CONFIG_PREEMPT #define S_PREEMPT " PREEMPT" #elif defined(CONFIG_PREEMPT_RT) diff --git a/arch/arm64/kernel/traps.c b/arch/arm64/kernel/traps.c index 3621868b2fcc..3fd9e2731e0d 100644 --- a/arch/arm64/kernel/traps.c +++ b/arch/arm64/kernel/traps.c @@ -137,18 +137,12 @@ void dump_backtrace(struct pt_regs *regs, struct task_struct *tsk, put_task_stack(tsk); } -void show_stack_loglvl(struct task_struct *tsk, unsigned long *sp, - const char *loglvl) +void show_stack(struct task_struct *tsk, unsigned long *sp, const char *loglvl) { dump_backtrace(NULL, tsk, loglvl); barrier(); } -void show_stack(struct task_struct *tsk, unsigned long *sp) -{ - show_stack_loglvl(tsk, sp, KERN_DEFAULT); -} - #ifdef CONFIG_PREEMPT #define S_PREEMPT " PREEMPT" #elif defined(CONFIG_PREEMPT_RT) diff --git a/arch/c6x/kernel/traps.c b/arch/c6x/kernel/traps.c index 4afbf48f1ce0..2b9121c755be 100644 --- a/arch/c6x/kernel/traps.c +++ b/arch/c6x/kernel/traps.c @@ -374,7 +374,7 @@ static void show_trace(unsigned long *stack, unsigned long *endstack, printk("%s\n", loglvl); } -void show_stack_loglvl(struct task_struct *task, unsigned long *stack, +void show_stack(struct task_struct *task, unsigned long *stack, const char *loglvl) { unsigned long *p, *endstack; @@ -403,11 +403,6 @@ void show_stack_loglvl(struct task_struct *task, unsigned long *stack, show_trace(stack, endstack, loglvl); } -void 
show_stack(struct task_struct *task, unsigned long *stack) -{ - show_stack_loglvl(task, stack, KERN_DEBUG); -} - int is_valid_bugaddr(unsigned long addr) { return __kernel_text_address(addr); diff --git a/arch/csky/kernel/ptrace.c b/arch/csky/kernel/ptrace.c index 5a82230bddf9..bbd801f86eb5 100644 --- a/arch/csky/kernel/ptrace.c +++ b/arch/csky/kernel/ptrace.c @@ -344,7 +344,7 @@ asmlinkage void syscall_trace_exit(struct pt_regs *regs) trace_sys_exit(regs, syscall_get_return_value(current, regs)); } -extern void show_stack(struct task_struct *task, unsigned long *stack); +extern void show_stack(struct task_struct *task, unsigned long *stack, const char *loglvl); void show_regs(struct pt_regs *fp) { unsigned long *sp; @@ -420,6 +420,6 @@ void show_regs(struct pt_regs *fp) } pr_cont("\n"); - show_stack(NULL, (unsigned long *)fp->regs[4]); + show_stack(NULL, (unsigned long *)fp->regs[4], KERN_INFO); return; } diff --git a/arch/csky/kernel/stacktrace.c b/arch/csky/kernel/stacktrace.c index ca135f13cc13..16ae20a0af34 100644 --- a/arch/csky/kernel/stacktrace.c +++ b/arch/csky/kernel/stacktrace.c @@ -95,19 +95,12 @@ static bool print_trace_address(unsigned long pc, void *arg) return false; } -void show_stack_loglvl(struct task_struct *task, unsigned long *sp, - const char *loglvl) +void show_stack(struct task_struct *task, unsigned long *sp, const char *loglvl) { pr_cont("Call Trace:\n"); walk_stackframe(task, NULL, print_trace_address, (void *)loglvl); } -void show_stack(struct task_struct *task, unsigned long *sp) -{ - pr_cont("Call Trace:\n"); - walk_stackframe(task, NULL, print_trace_address, KERN_INFO); -} - static bool save_wchan(unsigned long pc, void *arg) { if (!in_sched_functions(pc)) { diff --git a/arch/h8300/kernel/traps.c b/arch/h8300/kernel/traps.c index 6362446563d6..5d8b969cd8f3 100644 --- a/arch/h8300/kernel/traps.c +++ b/arch/h8300/kernel/traps.c @@ -115,8 +115,7 @@ void die(const char *str, struct pt_regs *fp, unsigned long err) static int kstack_depth_to_print = 24; -void show_stack_loglvl(struct task_struct *task, unsigned long *esp, - const char *loglvl) +void show_stack(struct task_struct *task, unsigned long *esp, const char *loglvl) { unsigned long *stack, addr; int i; @@ -158,8 +157,3 @@ void show_stack_loglvl(struct task_struct *task, unsigned long *esp, } printk("%s\n", loglvl); } - -void show_stack(struct task_struct *task, unsigned long *esp) -{ - show_stack_loglvl(task, esp, KERN_INFO); -} diff --git a/arch/hexagon/kernel/traps.c b/arch/hexagon/kernel/traps.c index a8a3a210d781..904134b37232 100644 --- a/arch/hexagon/kernel/traps.c +++ b/arch/hexagon/kernel/traps.c @@ -175,18 +175,12 @@ static void do_show_stack(struct task_struct *task, unsigned long *fp, } } -void show_stack_loglvl(struct task_struct *task, unsigned long *fp, - const char *loglvl) +void show_stack(struct task_struct *task, unsigned long *fp, const char *loglvl) { /* Saved link reg is one word above FP */ do_show_stack(task, fp, 0, loglvl); } -void show_stack(struct task_struct *task, unsigned long *fp) -{ - show_stack_loglvl(task, fp, 0, KERN_INFO); -} - int die(const char *str, struct pt_regs *regs, long err) { static struct { diff --git a/arch/ia64/kernel/mca.c b/arch/ia64/kernel/mca.c index 6fb54dfa1350..2703f7795672 100644 --- a/arch/ia64/kernel/mca.c +++ b/arch/ia64/kernel/mca.c @@ -1631,7 +1631,7 @@ default_monarch_init_process(struct notifier_block *self, unsigned long val, voi if (read_trylock(&tasklist_lock)) { do_each_thread (g, t) { printk("\nBacktrace of pid %d (%s)\n", t->pid, 
t->comm); - show_stack(t, NULL); + show_stack(t, NULL, KERN_DEFAULT); } while_each_thread (g, t); read_unlock(&tasklist_lock); } diff --git a/arch/ia64/kernel/process.c b/arch/ia64/kernel/process.c index 913d9a01cbf9..96dfb9e4b16f 100644 --- a/arch/ia64/kernel/process.c +++ b/arch/ia64/kernel/process.c @@ -85,8 +85,7 @@ ia64_do_show_stack (struct unw_frame_info *info, void *arg) } void -show_stack_loglvl (struct task_struct *task, unsigned long *sp, - const char *loglvl) +show_stack (struct task_struct *task, unsigned long *sp, const char *loglvl) { if (!task) unw_init_running(ia64_do_show_stack, (void *)loglvl); @@ -98,12 +97,6 @@ show_stack_loglvl (struct task_struct *task, unsigned long *sp, } } -void -show_stack (struct task_struct *task, unsigned long *sp) -{ - show_stack_loglvl(task, sp, KERN_DEFAULT); -} - void show_regs (struct pt_regs *regs) { @@ -158,7 +151,7 @@ show_regs (struct pt_regs *regs) ((i == sof - 1) || (i % 3) == 2) ? "\n" : " "); } } else - show_stack(NULL, NULL); + show_stack(NULL, NULL, KERN_DEFAULT); } /* local support for deprecated console_print */ diff --git a/arch/m68k/kernel/traps.c b/arch/m68k/kernel/traps.c index ffcc5ec4fac3..df6fc782754f 100644 --- a/arch/m68k/kernel/traps.c +++ b/arch/m68k/kernel/traps.c @@ -916,7 +916,7 @@ void show_registers(struct pt_regs *regs) default: pr_cont("\n"); } - show_stack(NULL, (unsigned long *)addr); + show_stack(NULL, (unsigned long *)addr, KERN_INFO); pr_info("Code:"); set_fs(KERNEL_DS); @@ -935,8 +935,8 @@ void show_registers(struct pt_regs *regs) pr_cont("\n"); } -void show_stack_loglvl(struct task_struct *task, unsigned long *stack, - const char *loglvl) +void show_stack(struct task_struct *task, unsigned long *stack, + const char *loglvl) { unsigned long *p; unsigned long *endstack; @@ -963,11 +963,6 @@ void show_stack_loglvl(struct task_struct *task, unsigned long *stack, show_trace(stack, loglvl); } -void show_stack(struct task_struct *task, unsigned long *stack) -{ - show_stack_loglvl(task, stack, KERN_INFO); -} - /* * The vector number returned in the frame pointer may also contain * the "fs" (Fault Status) bits on ColdFire. 
These are in the bottom diff --git a/arch/microblaze/kernel/traps.c b/arch/microblaze/kernel/traps.c index 149ae534937e..94b6fe93147d 100644 --- a/arch/microblaze/kernel/traps.c +++ b/arch/microblaze/kernel/traps.c @@ -31,8 +31,7 @@ static int __init kstack_setup(char *s) } __setup("kstack=", kstack_setup); -void show_stack_loglvl(struct task_struct *task, unsigned long *sp, - const char *loglvl) +void show_stack(struct task_struct *task, unsigned long *sp, const char *loglvl) { unsigned long words_to_show; u32 fp = (u32) sp; @@ -77,8 +76,3 @@ void show_stack_loglvl(struct task_struct *task, unsigned long *sp, debug_show_held_locks(task); } - -void show_stack(struct task_struct *task, unsigned long *sp) -{ - show_stack_loglvl(task, sp, KERN_INFO); -} diff --git a/arch/mips/kernel/traps.c b/arch/mips/kernel/traps.c index e49040739b61..320797bd03f6 100644 --- a/arch/mips/kernel/traps.c +++ b/arch/mips/kernel/traps.c @@ -198,8 +198,7 @@ static void show_stacktrace(struct task_struct *task, show_backtrace(task, regs, loglvl); } -void show_stack_loglvl(struct task_struct *task, unsigned long *sp, - const char *loglvl) +void show_stack(struct task_struct *task, unsigned long *sp, const char *loglvl) { struct pt_regs regs; mm_segment_t old_fs = get_fs(); @@ -227,11 +226,6 @@ void show_stack_loglvl(struct task_struct *task, unsigned long *sp, set_fs(old_fs); } -void show_stack(struct task_struct *task, unsigned long *sp) -{ - show_stack_loglvl(task, sp, KERN_DEFAULT) -} - static void show_code(unsigned int __user *pc) { long i; diff --git a/arch/nds32/kernel/traps.c b/arch/nds32/kernel/traps.c index 90f12582c218..6a9772ba7392 100644 --- a/arch/nds32/kernel/traps.c +++ b/arch/nds32/kernel/traps.c @@ -135,8 +135,7 @@ static void __dump(struct task_struct *tsk, unsigned long *base_reg, printk("%s\n", loglvl); } -void show_stack_loglvl(struct task_struct *tsk, unsigned long *sp, - const char *loglvl) +void show_stack(struct task_struct *tsk, unsigned long *sp, const char *loglvl) { unsigned long *base_reg; @@ -157,11 +156,6 @@ void show_stack_loglvl(struct task_struct *tsk, unsigned long *sp, barrier(); } -void show_stack(struct task_struct *tsk, unsigned long *sp) -{ - show_stack_loglvl(tsk, sp, KERN_EMERG); -} - DEFINE_SPINLOCK(die_lock); /* diff --git a/arch/nios2/kernel/traps.c b/arch/nios2/kernel/traps.c index 08071caa9b36..b172da4eb1a9 100644 --- a/arch/nios2/kernel/traps.c +++ b/arch/nios2/kernel/traps.c @@ -52,14 +52,13 @@ void _exception(int signo, struct pt_regs *regs, int code, unsigned long addr) } /* - * The show_stack(), show_stack_loglvl() are external API - * which we do not use ourselves. + * The show_stack() is external API which we do not use ourselves. 
*/ int kstack_depth_to_print = 48; -void show_stack_loglvl(struct task_struct *task, unsigned long *stack, - const char *loglvl) +void show_stack(struct task_struct *task, unsigned long *stack, + const char *loglvl) { unsigned long *endstack, addr; int i; @@ -106,11 +105,6 @@ void show_stack_loglvl(struct task_struct *task, unsigned long *stack, printk("%s\n", loglvl); } -void show_stack(struct task_struct *task, unsigned long *stack) -{ - show_stack_loglvl(task, stack, KERN_EMERG); -} - void __init trap_init(void) { /* Nothing to do here */ diff --git a/arch/openrisc/kernel/traps.c b/arch/openrisc/kernel/traps.c index 3b7978a22d68..3022b0ad142c 100644 --- a/arch/openrisc/kernel/traps.c +++ b/arch/openrisc/kernel/traps.c @@ -48,8 +48,7 @@ void print_trace(void *data, unsigned long addr, int reliable) } /* displays a short stack trace */ -void show_stack_loglvl(struct task_struct *task, unsigned long *esp, - const char *loglvl) +void show_stack(struct task_struct *task, unsigned long *esp, const char *loglvl) { if (esp == NULL) esp = (unsigned long *)&esp; @@ -58,11 +57,6 @@ void show_stack_loglvl(struct task_struct *task, unsigned long *esp, unwind_stack((void *)loglvl, esp, print_trace); } -void show_stack(struct task_struct *task, unsigned long *esp) -{ - show_stack_loglvl(task, esp, KERN_EMERG); -} - void show_registers(struct pt_regs *regs) { int i; @@ -104,7 +98,7 @@ void show_registers(struct pt_regs *regs) if (in_kernel) { printk("\nStack: "); - show_stack(NULL, (unsigned long *)esp); + show_stack(NULL, (unsigned long *)esp, KERN_EMERG); printk("\nCode: "); if (regs->pc < PAGE_OFFSET) diff --git a/arch/parisc/kernel/traps.c b/arch/parisc/kernel/traps.c index c2411de3730f..0a89899f154a 100644 --- a/arch/parisc/kernel/traps.c +++ b/arch/parisc/kernel/traps.c @@ -198,17 +198,11 @@ static void parisc_show_stack(struct task_struct *task, do_show_stack(&info, loglvl); } -void show_stack_loglvl(struct task_struct *t, unsigned long *sp, - const char *loglvl) +void show_stack(struct task_struct *t, unsigned long *sp, const char *loglvl) { parisc_show_stack(t, NULL, loglvl); } -void show_stack(struct task_struct *t, unsigned long *sp) -{ - show_stack_loglvl(t, sp, KERN_CRIT) -} - int is_valid_bugaddr(unsigned long iaoq) { return 1; diff --git a/arch/powerpc/kernel/process.c b/arch/powerpc/kernel/process.c index a456b4454b3f..99d619f81cb5 100644 --- a/arch/powerpc/kernel/process.c +++ b/arch/powerpc/kernel/process.c @@ -1456,7 +1456,7 @@ void show_regs(struct pt_regs * regs) printk("NIP ["REG"] %pS\n", regs->nip, (void *)regs->nip); printk("LR ["REG"] %pS\n", regs->link, (void *)regs->link); #endif - show_stack(current, (unsigned long *) regs->gpr[1]); + show_stack(current, (unsigned long *) regs->gpr[1], KERN_DEFAULT); if (!user_mode(regs)) show_instructions(regs); } @@ -2063,8 +2063,8 @@ unsigned long get_wchan(struct task_struct *p) static int kstack_depth_to_print = CONFIG_PRINT_STACK_DEPTH; -void show_stack_loglvl(struct task_struct *tsk, unsigned long *stack, - const char *loglvl) +void show_stack(struct task_struct *tsk, unsigned long *stack, + const char *loglvl) { unsigned long sp, ip, lr, newsp; int count = 0; @@ -2133,11 +2133,6 @@ void show_stack_loglvl(struct task_struct *tsk, unsigned long *stack, put_task_stack(tsk); } -void show_stack(struct task_struct *tsk, unsigned long *stack) -{ - show_stack_loglvl(tsk, stack, KERN_DEFAULT); -} - #ifdef CONFIG_PPC64 /* Called with hard IRQs off */ void notrace __ppc64_runlatch_on(void) diff --git a/arch/powerpc/kernel/stacktrace.c 
b/arch/powerpc/kernel/stacktrace.c index c477b8585a29..b6440657ef92 100644 --- a/arch/powerpc/kernel/stacktrace.c +++ b/arch/powerpc/kernel/stacktrace.c @@ -260,7 +260,7 @@ static void raise_backtrace_ipi(cpumask_t *mask) pr_cont(" current pointer corrupt? (%px)\n", p->__current); pr_warn("Back trace of paca->saved_r1 (0x%016llx) (possibly stale):\n", p->saved_r1); - show_stack(p->__current, (unsigned long *)p->saved_r1); + show_stack(p->__current, (unsigned long *)p->saved_r1, KERN_WARNING); } } diff --git a/arch/riscv/kernel/stacktrace.c b/arch/riscv/kernel/stacktrace.c index aaa64bf007f8..595342910c3f 100644 --- a/arch/riscv/kernel/stacktrace.c +++ b/arch/riscv/kernel/stacktrace.c @@ -105,18 +105,12 @@ static bool print_trace_address(unsigned long pc, void *arg) return false; } -void show_stack_loglvl(struct task_struct *task, unsigned long *sp, - const char *loglvl) +void show_stack(struct task_struct *task, unsigned long *sp, const char *loglvl) { pr_cont("Call Trace:\n"); walk_stackframe(task, NULL, print_trace_address, (void *)loglvl); } -void show_stack(struct task_struct *task, unsigned long *sp) -{ - show_stack_loglvl(task, sp, KERN_DEFAULT); -} - static bool save_wchan(unsigned long pc, void *arg) { if (!in_sched_functions(pc)) { diff --git a/arch/s390/kernel/dumpstack.c b/arch/s390/kernel/dumpstack.c index 887a054919fc..0dc4b258b98d 100644 --- a/arch/s390/kernel/dumpstack.c +++ b/arch/s390/kernel/dumpstack.c @@ -126,7 +126,7 @@ unknown: return -EINVAL; } -void show_stack_loglvl(struct task_struct *task, unsigned long *stack, +void show_stack(struct task_struct *task, unsigned long *stack, const char *loglvl) { struct unwind_state state; @@ -139,11 +139,6 @@ void show_stack_loglvl(struct task_struct *task, unsigned long *stack, debug_show_held_locks(task ? : current); } -void show_stack(struct task_struct *task, unsigned long *stack) -{ - show_stack_loglvl(task, stack, KERN_DEFAULT); -} - static void show_last_breaking_event(struct pt_regs *regs) { printk("Last Breaking-Event-Address:\n"); @@ -181,7 +176,7 @@ void show_regs(struct pt_regs *regs) show_registers(regs); /* Show stack backtrace if pt_regs is from kernel mode */ if (!user_mode(regs)) - show_stack(NULL, (unsigned long *) regs->gprs[15]); + show_stack(NULL, (unsigned long *) regs->gprs[15], KERN_DEFAULT); show_last_breaking_event(regs); } diff --git a/arch/sh/kernel/dumpstack.c b/arch/sh/kernel/dumpstack.c index cc51e9d74667..a13c045804ed 100644 --- a/arch/sh/kernel/dumpstack.c +++ b/arch/sh/kernel/dumpstack.c @@ -144,8 +144,7 @@ void show_trace(struct task_struct *tsk, unsigned long *sp, debug_show_held_locks(tsk); } -void show_stack_loglvl(struct task_struct *tsk, unsigned long *sp, - const char *loglvl) +void show_stack(struct task_struct *tsk, unsigned long *sp, const char *loglvl) { unsigned long stack; @@ -161,8 +160,3 @@ void show_stack_loglvl(struct task_struct *tsk, unsigned long *sp, (unsigned long)task_stack_page(tsk)); show_trace(tsk, sp, NULL, loglvl); } - -void show_stack(struct task_struct *task, unsigned long *sp) -{ - show_stack_loglvl(task, sp, KERN_DEFAULT); -} diff --git a/arch/sparc/kernel/process_32.c b/arch/sparc/kernel/process_32.c index 0b07de5618e5..65c0d5207b0c 100644 --- a/arch/sparc/kernel/process_32.c +++ b/arch/sparc/kernel/process_32.c @@ -145,12 +145,10 @@ void show_regs(struct pt_regs *r) } /* - * The show_stack(), show_stack_loglvl() are external APIs which - * we do not use ourselves. + * The show_stack() is external API which we do not use ourselves. 
* The oops is printed in die_if_kernel. */ -void show_stack_loglvl(struct task_struct *tsk, unsigned long *_ksp, - const char *loglvl) +void show_stack(struct task_struct *tsk, unsigned long *_ksp, const char *loglvl) { unsigned long pc, fp; unsigned long task_base; @@ -179,11 +177,6 @@ void show_stack_loglvl(struct task_struct *tsk, unsigned long *_ksp, printk("%s\n", loglvl); } -void show_stack(struct task_struct *task, unsigned long *sp) -{ - show_stack_loglvl(task, sp, KERN_DEFAULT); -} - /* * Free current thread data structures etc.. */ diff --git a/arch/sparc/kernel/process_64.c b/arch/sparc/kernel/process_64.c index 423011e60982..5a4d9c8f8903 100644 --- a/arch/sparc/kernel/process_64.c +++ b/arch/sparc/kernel/process_64.c @@ -195,7 +195,7 @@ void show_regs(struct pt_regs *regs) regs->u_regs[15]); printk("RPC: <%pS>\n", (void *) regs->u_regs[15]); show_regwindow(regs); - show_stack(current, (unsigned long *) regs->u_regs[UREG_FP]); + show_stack(current, (unsigned long *)regs->u_regs[UREG_FP], KERN_DEFAULT); } union global_cpu_snapshot global_cpu_snapshot[NR_CPUS]; diff --git a/arch/sparc/kernel/traps_64.c b/arch/sparc/kernel/traps_64.c index 8715bc93bd9d..96d92f107551 100644 --- a/arch/sparc/kernel/traps_64.c +++ b/arch/sparc/kernel/traps_64.c @@ -2453,8 +2453,7 @@ static void user_instruction_dump(unsigned int __user *pc) printk("\n"); } -void show_stack_loglvl(struct task_struct *tsk, unsigned long *_ksp, - const char *loglvl) +void show_stack(struct task_struct *tsk, unsigned long *_ksp, const char *loglvl) { unsigned long fp, ksp; struct thread_info *tp; @@ -2514,11 +2513,6 @@ void show_stack_loglvl(struct task_struct *tsk, unsigned long *_ksp, } while (++count < 16); } -void show_stack(struct task_struct *tsk, unsigned long *_ksp) -{ - show_stack_loglvl(task, sp, KERN_DEFAULT); -} - static inline struct reg_window *kernel_stack_up(struct reg_window *rw) { unsigned long fp = rw->ins[6]; diff --git a/arch/um/drivers/mconsole_kern.c b/arch/um/drivers/mconsole_kern.c index 30575bd92975..a2e680f7d39f 100644 --- a/arch/um/drivers/mconsole_kern.c +++ b/arch/um/drivers/mconsole_kern.c @@ -648,7 +648,7 @@ static void stack_proc(void *arg) { struct task_struct *task = arg; - show_stack(task, NULL); + show_stack(task, NULL, KERN_INFO); } /* diff --git a/arch/um/kernel/sysrq.c b/arch/um/kernel/sysrq.c index 1b54b6431499..acbc879d2773 100644 --- a/arch/um/kernel/sysrq.c +++ b/arch/um/kernel/sysrq.c @@ -27,7 +27,7 @@ static const struct stacktrace_ops stackops = { .address = _print_addr }; -void show_stack_loglvl(struct task_struct *task, unsigned long *stack, +void show_stack(struct task_struct *task, unsigned long *stack, const char *loglvl) { struct pt_regs *segv_regs = current->thread.segv_regs; @@ -56,8 +56,3 @@ void show_stack_loglvl(struct task_struct *task, unsigned long *stack, dump_trace(current, &stackops, (void *)loglvl); printk("%s\n", loglvl); } - -void show_stack(struct task_struct *task, unsigned long *stack) -{ - show_stack_loglvl(task, stack, KERN_INFO); -} diff --git a/arch/unicore32/kernel/traps.c b/arch/unicore32/kernel/traps.c index 8b1335997f50..a3ac01df1a2e 100644 --- a/arch/unicore32/kernel/traps.c +++ b/arch/unicore32/kernel/traps.c @@ -167,18 +167,13 @@ static void dump_backtrace(struct pt_regs *regs, struct task_struct *tsk, c_backtrace(fp, loglvl); } -void show_stack_loglvl(struct task_struct *tsk, unsigned long *sp, +void show_stack(struct task_struct *tsk, unsigned long *sp, const char *loglvl) { dump_backtrace(NULL, tsk, loglvl); barrier(); } -void 
show_stack(struct task_struct *tsk, unsigned long *sp) -{ - show_stack_loglvl(tsk, sp, KERN_DEFAULT) -} - static int __die(const char *str, int err, struct thread_info *thread, struct pt_regs *regs) { diff --git a/arch/x86/kernel/amd_gart_64.c b/arch/x86/kernel/amd_gart_64.c index 9d2c076be37a..5f816861f5d2 100644 --- a/arch/x86/kernel/amd_gart_64.c +++ b/arch/x86/kernel/amd_gart_64.c @@ -159,7 +159,7 @@ static void dump_leak(void) return; dump = 1; - show_stack_loglvl(NULL, NULL, KERN_ERR); + show_stack(NULL, NULL, KERN_ERR); debug_dma_dump_mappings(NULL); } #endif diff --git a/arch/x86/kernel/dumpstack.c b/arch/x86/kernel/dumpstack.c index 4396f2cfad19..456511b2284e 100644 --- a/arch/x86/kernel/dumpstack.c +++ b/arch/x86/kernel/dumpstack.c @@ -279,7 +279,7 @@ next: } } -void show_stack_loglvl(struct task_struct *task, unsigned long *sp, +void show_stack(struct task_struct *task, unsigned long *sp, const char *loglvl) { task = task ? : current; @@ -294,11 +294,6 @@ void show_stack_loglvl(struct task_struct *task, unsigned long *sp, show_trace_log_lvl(task, NULL, sp, loglvl); } -void show_stack(struct task_struct *task, unsigned long *sp) -{ - show_stack_loglvl(task, sp, KERN_DEFAULT); -} - void show_stack_regs(struct pt_regs *regs) { show_trace_log_lvl(current, regs, NULL, KERN_DEFAULT); diff --git a/arch/xtensa/kernel/traps.c b/arch/xtensa/kernel/traps.c index 1013acc2e03e..e880460741d2 100644 --- a/arch/xtensa/kernel/traps.c +++ b/arch/xtensa/kernel/traps.c @@ -501,8 +501,7 @@ static void show_trace(struct task_struct *task, unsigned long *sp, #define STACK_DUMP_LINE_SIZE 32 static size_t kstack_depth_to_print = CONFIG_PRINT_STACK_DEPTH; -void show_stack_loglvl(struct task_struct *task, unsigned long *sp, - const char *loglvl) +void show_stack(struct task_struct *task, unsigned long *sp, const char *loglvl) { size_t len; @@ -519,11 +518,6 @@ void show_stack_loglvl(struct task_struct *task, unsigned long *sp, show_trace(task, sp, loglvl); } -void show_stack(struct task_struct *task, unsigned long *sp) -{ - show_stack_loglvl(task, sp, KERN_INFO); -} - DEFINE_SPINLOCK(die_lock); void die(const char * str, struct pt_regs * regs, long err) @@ -540,7 +534,7 @@ void die(const char * str, struct pt_regs * regs, long err) pr_info("%s: sig: %ld [#%d]%s\n", str, err, ++die_counter, pr); show_regs(regs); if (!user_mode(regs)) - show_stack(NULL, (unsigned long*)regs->areg[1]); + show_stack(NULL, (unsigned long *)regs->areg[1], KERN_INFO); add_taint(TAINT_DIE, LOCKDEP_NOW_UNRELIABLE); spin_unlock_irq(&die_lock); diff --git a/drivers/base/power/main.c b/drivers/base/power/main.c index be5af89bbff1..9dd85bea4026 100644 --- a/drivers/base/power/main.c +++ b/drivers/base/power/main.c @@ -519,7 +519,7 @@ static void dpm_watchdog_handler(struct timer_list *t) struct dpm_watchdog *wd = from_timer(wd, t, timer); dev_emerg(wd->dev, "**** DPM device timeout ****\n"); - show_stack_loglvl(wd->tsk, NULL, KERN_EMERG); + show_stack(wd->tsk, NULL, KERN_EMERG); panic("%s %s: unrecoverable failure\n", dev_driver_string(wd->dev), dev_name(wd->dev)); } diff --git a/drivers/tty/sysrq.c b/drivers/tty/sysrq.c index 7bd935379dec..7c95afa905a0 100644 --- a/drivers/tty/sysrq.c +++ b/drivers/tty/sysrq.c @@ -235,7 +235,7 @@ static void showacpu(void *dummy) raw_spin_lock_irqsave(&show_lock, flags); pr_info("CPU%d:\n", smp_processor_id()); - show_stack_loglvl(NULL, NULL, KERN_INFO); + show_stack(NULL, NULL, KERN_INFO); raw_spin_unlock_irqrestore(&show_lock, flags); } diff --git a/include/linux/sched/debug.h 
b/include/linux/sched/debug.h index 373e4e3faf2a..00c45a0e6abe 100644 --- a/include/linux/sched/debug.h +++ b/include/linux/sched/debug.h @@ -30,9 +30,8 @@ extern void show_regs(struct pt_regs *); * task), SP is the stack pointer of the first frame that should be shown in the back * trace (or NULL if the entire call-chain of the task should be shown). */ -extern void show_stack(struct task_struct *task, unsigned long *sp); -extern void show_stack_loglvl(struct task_struct *task, unsigned long *sp, - const char *loglvl); +extern void show_stack(struct task_struct *task, unsigned long *sp, + const char *loglvl); extern void sched_show_task(struct task_struct *p); diff --git a/kernel/debug/kdb/kdb_bt.c b/kernel/debug/kdb/kdb_bt.c index 43f5dcd2b9ac..18e03aba2cfc 100644 --- a/kernel/debug/kdb/kdb_bt.c +++ b/kernel/debug/kdb/kdb_bt.c @@ -30,7 +30,7 @@ static void kdb_show_stack(struct task_struct *p, void *addr) kdb_dump_stack_on_cpu(kdb_process_cpu(p)); console_loglevel = old_lvl; } else { - show_stack_loglvl(p, addr, KERN_EMERG); + show_stack(p, addr, KERN_EMERG); } kdb_trap_printk--; diff --git a/kernel/locking/rtmutex-debug.c b/kernel/locking/rtmutex-debug.c index 5e63d6e8a223..36e69100e8e0 100644 --- a/kernel/locking/rtmutex-debug.c +++ b/kernel/locking/rtmutex-debug.c @@ -125,7 +125,7 @@ void debug_rt_mutex_print_deadlock(struct rt_mutex_waiter *waiter) printk("\n%s/%d's [blocked] stackdump:\n\n", task->comm, task_pid_nr(task)); - show_stack_loglvl(task, NULL, KERN_DEFAULT); + show_stack(task, NULL, KERN_DEFAULT); printk("\n%s/%d's [current] stackdump:\n\n", current->comm, task_pid_nr(current)); dump_stack(); diff --git a/kernel/sched/core.c b/kernel/sched/core.c index c68a6e7b306f..8f360326861e 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -6025,7 +6025,7 @@ void sched_show_task(struct task_struct *p) (unsigned long)task_thread_info(p)->flags); print_worker_info(KERN_INFO, p); - show_stack_loglvl(p, NULL, KERN_INFO); + show_stack(p, NULL, KERN_INFO); put_task_stack(p); } EXPORT_SYMBOL_GPL(sched_show_task); diff --git a/lib/dump_stack.c b/lib/dump_stack.c index 5595e8962cf6..a00ee6eedc7c 100644 --- a/lib/dump_stack.c +++ b/lib/dump_stack.c @@ -74,7 +74,7 @@ void show_regs_print_info(const char *log_lvl) static void __dump_stack(void) { dump_stack_print_info(KERN_DEFAULT); - show_stack_loglvl(NULL, NULL, KERN_DEFAULT); + show_stack(NULL, NULL, KERN_DEFAULT); } /** -- cgit v1.2.3 From e31cf2f4ca422ac9b14ecc4a1295b8977a20f812 Mon Sep 17 00:00:00 2001 From: Mike Rapoport Date: Mon, 8 Jun 2020 21:32:33 -0700 Subject: mm: don't include asm/pgtable.h if linux/mm.h is already included Patch series "mm: consolidate definitions of page table accessors", v2. The low level page table accessors (pXY_index(), pXY_offset()) are duplicated across all architectures and sometimes more than once. For instance, we have 31 definitions of pgd_offset() for 25 supported architectures. Most of these definitions are actually identical and typically it boils down to, e.g. static inline unsigned long pmd_index(unsigned long address) { return (address >> PMD_SHIFT) & (PTRS_PER_PMD - 1); } static inline pmd_t *pmd_offset(pud_t *pud, unsigned long address) { return (pmd_t *)pud_page_vaddr(*pud) + pmd_index(address); } These definitions can be shared among 90% of the arches provided XYZ_SHIFT, PTRS_PER_XYZ and xyz_page_vaddr() are defined. For architectures that really need a custom version there is always the possibility to override the generic version with the usual ifdefs magic, as sketched below.
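As a concrete sketch of that "ifdefs magic" in a shared header: the generic accessors quoted above can be wrapped in guards so that an architecture which pre-defines the macro keeps its own version and the fallback is skipped. The guard style here is illustrative, not necessarily the exact form the series uses:

/* Generic PMD accessors; an arch can override by pre-defining the macro. */
#ifndef pmd_index
static inline unsigned long pmd_index(unsigned long address)
{
	/* Index of the PMD entry that maps @address within its table. */
	return (address >> PMD_SHIFT) & (PTRS_PER_PMD - 1);
}
#define pmd_index pmd_index
#endif

#ifndef pmd_offset
static inline pmd_t *pmd_offset(pud_t *pud, unsigned long address)
{
	/* Base of the PMD table mapped by @pud, plus the index above. */
	return (pmd_t *)pud_page_vaddr(*pud) + pmd_index(address);
}
#define pmd_offset pmd_offset
#endif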
These patches introduce include/linux/pgtable.h that replaces include/asm-generic/pgtable.h and add the definitions of the page table accessors to the new header. This patch (of 12): The linux/mm.h header includes <asm/pgtable.h> to allow inlining of the functions involving page table manipulations, e.g. pte_alloc() and pmd_alloc(). So, there is no point in explicitly including <asm/pgtable.h> in files that already include <linux/mm.h>. The include statements in such cases are removed with a simple loop: for f in $(git grep -l "include <asm/pgtable.h>") ; do sed -i -e '/include <asm\/pgtable.h>/ d' $f done Signed-off-by: Mike Rapoport Signed-off-by: Andrew Morton Cc: Arnd Bergmann Cc: Borislav Petkov Cc: Brian Cain Cc: Catalin Marinas Cc: Chris Zankel Cc: "David S. Miller" Cc: Geert Uytterhoeven Cc: Greentime Hu Cc: Greg Ungerer Cc: Guan Xuetao Cc: Guo Ren Cc: Heiko Carstens Cc: Helge Deller Cc: Ingo Molnar Cc: Ley Foon Tan Cc: Mark Salter Cc: Matthew Wilcox Cc: Matt Turner Cc: Max Filippov Cc: Michael Ellerman Cc: Michal Simek Cc: Mike Rapoport Cc: Nick Hu Cc: Paul Walmsley Cc: Richard Weinberger Cc: Rich Felker Cc: Russell King Cc: Stafford Horne Cc: Thomas Bogendoerfer Cc: Thomas Gleixner Cc: Tony Luck Cc: Vincent Chen Cc: Vineet Gupta Cc: Will Deacon Cc: Yoshinori Sato Link: http://lkml.kernel.org/r/20200514170327.31389-1-rppt@kernel.org Link: http://lkml.kernel.org/r/20200514170327.31389-2-rppt@kernel.org Signed-off-by: Linus Torvalds --- arch/alpha/boot/bootp.c | 1 - arch/alpha/boot/bootpz.c | 1 - arch/alpha/boot/main.c | 1 - arch/alpha/include/asm/io.h | 1 - arch/alpha/kernel/process.c | 1 - arch/alpha/kernel/ptrace.c | 1 - arch/alpha/kernel/setup.c | 1 - arch/alpha/kernel/smp.c | 1 - arch/alpha/kernel/sys_alcor.c | 1 - arch/alpha/kernel/sys_cabriolet.c | 1 - arch/alpha/kernel/sys_dp264.c | 1 - arch/alpha/kernel/sys_eb64p.c | 1 - arch/alpha/kernel/sys_eiger.c | 1 - arch/alpha/kernel/sys_jensen.c | 1 - arch/alpha/kernel/sys_marvel.c | 1 - arch/alpha/kernel/sys_miata.c | 1 - arch/alpha/kernel/sys_mikasa.c | 1 - arch/alpha/kernel/sys_nautilus.c | 1 - arch/alpha/kernel/sys_noritake.c | 1 - arch/alpha/kernel/sys_rawhide.c | 1 - arch/alpha/kernel/sys_ruffian.c | 1 - arch/alpha/kernel/sys_rx164.c | 1 - arch/alpha/kernel/sys_sable.c | 1 - arch/alpha/kernel/sys_sio.c | 1 - arch/alpha/kernel/sys_sx164.c | 1 - arch/alpha/kernel/sys_takara.c | 1 - arch/alpha/kernel/sys_titan.c | 1 - arch/alpha/kernel/sys_wildfire.c | 1 - arch/alpha/mm/init.c | 1 - arch/arm/kernel/machine_kexec.c | 1 - arch/arm/kernel/module.c | 1 - arch/arm/kernel/ptrace.c | 1 - arch/arm/kernel/smp.c | 1 - arch/arm/mach-ebsa110/core.c | 1 - arch/arm/mach-footbridge/common.c | 1 - arch/arm/mach-imx/mm-imx21.c | 1 - arch/arm/mach-imx/mm-imx27.c | 1 - arch/arm/mach-imx/mm-imx3.c | 1 - arch/arm/mach-iop32x/i2c.c | 1 - arch/arm/mach-iop32x/iq31244.c | 1 - arch/arm/mach-iop32x/iq80321.c | 1 - arch/arm/mach-iop32x/n2100.c | 1 - arch/arm/mach-ixp4xx/common.c | 1 - arch/arm/mach-sa1100/assabet.c | 1 - arch/arm/mm/copypage-v4mc.c | 1 - arch/arm/mm/copypage-v6.c | 1 - arch/arm/mm/copypage-xscale.c | 1 - arch/arm/mm/dump.c | 1 - arch/arm/mm/fault-armv.c | 1 - arch/arm/mm/fault.c | 1 - arch/arm/mm/pageattr.c | 1 - arch/arm64/kernel/hibernate.c | 1 - arch/arm64/kernel/ptrace.c | 1 - arch/arm64/kernel/smp.c | 1 - arch/arm64/mm/dump.c | 1 - arch/arm64/mm/fault.c | 1 - arch/arm64/mm/kasan_init.c | 1 - arch/arm64/mm/pageattr.c | 1 - arch/csky/kernel/module.c | 1 - arch/csky/kernel/ptrace.c | 1 - arch/csky/mm/init.c | 1 - arch/csky/mm/tlb.c | 1 - arch/h8300/kernel/process.c | 1 - arch/h8300/kernel/setup.c | 1 -
arch/h8300/kernel/signal.c | 1 - arch/h8300/mm/fault.c | 1 - arch/h8300/mm/init.c | 1 - arch/h8300/mm/memory.c | 1 - arch/hexagon/mm/vm_fault.c | 1 - arch/ia64/kernel/efi.c | 1 - arch/ia64/kernel/ptrace.c | 1 - arch/ia64/kernel/smp.c | 1 - arch/ia64/kernel/smpboot.c | 1 - arch/ia64/mm/contig.c | 1 - arch/ia64/mm/fault.c | 1 - arch/m68k/68000/timers.c | 1 - arch/m68k/amiga/config.c | 1 - arch/m68k/apollo/config.c | 1 - arch/m68k/atari/atasound.c | 1 - arch/m68k/atari/stram.c | 1 - arch/m68k/bvme6000/config.c | 1 - arch/m68k/kernel/process.c | 1 - arch/m68k/kernel/ptrace.c | 1 - arch/m68k/kernel/setup_no.c | 1 - arch/m68k/kernel/signal.c | 1 - arch/m68k/kernel/uboot.c | 1 - arch/m68k/mac/config.c | 1 - arch/m68k/mm/mcfmmu.c | 1 - arch/m68k/mm/sun3kmap.c | 1 - arch/m68k/mm/sun3mmu.c | 1 - arch/m68k/mvme147/config.c | 1 - arch/m68k/mvme16x/config.c | 1 - arch/m68k/q40/config.c | 1 - arch/m68k/sun3/config.c | 1 - arch/m68k/sun3/dvma.c | 1 - arch/m68k/sun3/mmu_emu.c | 1 - arch/m68k/sun3/sun3dvma.c | 1 - arch/m68k/sun3x/dvma.c | 1 - arch/m68k/sun3x/prom.c | 1 - arch/microblaze/kernel/signal.c | 1 - arch/microblaze/mm/fault.c | 1 - arch/mips/fw/arc/memory.c | 1 - arch/mips/include/asm/mach-generic/floppy.h | 1 - arch/mips/include/asm/mach-jazz/floppy.h | 1 - arch/mips/jazz/jazzdma.c | 1 - arch/mips/kernel/module.c | 1 - arch/mips/kernel/process.c | 1 - arch/mips/kernel/ptrace.c | 1 - arch/mips/kernel/ptrace32.c | 1 - arch/mips/kernel/smp-bmips.c | 1 - arch/mips/kernel/traps.c | 1 - arch/mips/kvm/tlb.c | 1 - arch/mips/lib/dump_tlb.c | 1 - arch/mips/lib/r3k_dump_tlb.c | 1 - arch/mips/mm/c-octeon.c | 1 - arch/mips/mm/c-r3k.c | 1 - arch/mips/mm/c-r4k.c | 1 - arch/mips/mm/c-tx39.c | 1 - arch/mips/mm/init.c | 1 - arch/mips/mm/page.c | 1 - arch/mips/mm/pgtable-32.c | 1 - arch/mips/mm/pgtable-64.c | 1 - arch/mips/mm/sc-ip22.c | 1 - arch/mips/mm/sc-mips.c | 1 - arch/mips/mm/sc-r5k.c | 1 - arch/mips/mm/tlb-r3k.c | 1 - arch/mips/mm/tlb-r4k.c | 1 - arch/mips/sgi-ip27/ip27-init.c | 1 - arch/mips/sgi-ip27/ip27-timer.c | 1 - arch/mips/sgi-ip32/ip32-memory.c | 1 - arch/nds32/mm/fault.c | 1 - arch/nds32/mm/proc.c | 1 - arch/nios2/kernel/module.c | 1 - arch/nios2/mm/init.c | 1 - arch/nios2/mm/pgtable.c | 1 - arch/nios2/mm/tlb.c | 1 - arch/openrisc/include/asm/tlbflush.h | 1 - arch/openrisc/kernel/asm-offsets.c | 1 - arch/openrisc/kernel/process.c | 1 - arch/openrisc/kernel/ptrace.c | 1 - arch/openrisc/kernel/setup.c | 1 - arch/openrisc/kernel/traps.c | 1 - arch/openrisc/mm/init.c | 1 - arch/openrisc/mm/tlb.c | 1 - arch/parisc/include/asm/mmu_context.h | 1 - arch/parisc/kernel/module.c | 1 - arch/parisc/kernel/ptrace.c | 1 - arch/parisc/kernel/smp.c | 1 - arch/parisc/mm/init.c | 1 - arch/powerpc/include/asm/io.h | 1 - arch/powerpc/kernel/asm-offsets.c | 1 - arch/powerpc/kernel/process.c | 1 - arch/powerpc/kernel/signal_32.c | 1 - arch/powerpc/kernel/signal_64.c | 1 - arch/powerpc/kernel/traps.c | 1 - arch/powerpc/kernel/vdso.c | 1 - arch/powerpc/lib/code-patching.c | 1 - arch/powerpc/mm/book3s64/hash_hugetlbpage.c | 1 - arch/powerpc/mm/book3s64/hash_pgtable.c | 1 - arch/powerpc/mm/book3s64/radix_hugetlbpage.c | 1 - arch/powerpc/mm/book3s64/radix_pgtable.c | 1 - arch/powerpc/mm/fault.c | 1 - arch/powerpc/mm/hugetlbpage.c | 1 - arch/powerpc/mm/init_32.c | 1 - arch/powerpc/mm/init_64.c | 1 - arch/powerpc/mm/mem.c | 1 - arch/powerpc/mm/nohash/40x.c | 1 - arch/powerpc/mm/nohash/fsl_booke.c | 1 - arch/powerpc/mm/pgtable_32.c | 1 - arch/powerpc/mm/pgtable_64.c | 1 - arch/powerpc/mm/ptdump/hashpagetable.c | 1 - 
arch/powerpc/mm/ptdump/ptdump.c | 1 - arch/powerpc/perf/callchain.c | 1 - arch/powerpc/perf/callchain_32.c | 1 - arch/powerpc/perf/callchain_64.c | 1 - arch/powerpc/platforms/8xx/cpm1.c | 1 - arch/powerpc/platforms/8xx/micropatch.c | 1 - arch/powerpc/platforms/cell/setup.c | 1 - arch/powerpc/platforms/chrp/setup.c | 1 - arch/powerpc/platforms/maple/setup.c | 1 - arch/powerpc/platforms/maple/time.c | 1 - arch/powerpc/platforms/powermac/setup.c | 1 - arch/powerpc/platforms/powermac/time.c | 1 - arch/powerpc/platforms/pseries/setup.c | 1 - arch/powerpc/sysdev/cpm2.c | 1 - arch/powerpc/xmon/xmon.c | 1 - arch/riscv/kernel/setup.c | 1 - arch/riscv/mm/init.c | 1 - arch/s390/include/asm/tlbflush.h | 1 - arch/s390/kernel/machine_kexec.c | 1 - arch/s390/kernel/ptrace.c | 1 - arch/s390/kernel/vdso.c | 1 - arch/s390/mm/dump_pagetables.c | 1 - arch/s390/mm/fault.c | 1 - arch/s390/mm/init.c | 1 - arch/s390/mm/pageattr.c | 1 - arch/s390/mm/pgtable.c | 1 - arch/s390/mm/vmem.c | 1 - arch/sh/kernel/machine_kexec.c | 1 - arch/sh/kernel/ptrace_32.c | 1 - arch/sh/kernel/signal_32.c | 1 - arch/sh/mm/cache-sh3.c | 1 - arch/sh/mm/cache-sh4.c | 1 - arch/sh/mm/cache-sh7705.c | 1 - arch/sh/mm/nommu.c | 1 - arch/sparc/kernel/leon_smp.c | 1 - arch/sparc/kernel/process_32.c | 1 - arch/sparc/kernel/process_64.c | 1 - arch/sparc/kernel/ptrace_32.c | 1 - arch/sparc/kernel/ptrace_64.c | 1 - arch/sparc/kernel/setup_32.c | 1 - arch/sparc/kernel/setup_64.c | 1 - arch/sparc/kernel/signal32.c | 1 - arch/sparc/kernel/signal_32.c | 1 - arch/sparc/kernel/signal_64.c | 1 - arch/sparc/kernel/smp_32.c | 1 - arch/sparc/kernel/smp_64.c | 1 - arch/sparc/kernel/traps_64.c | 1 - arch/sparc/mm/fault_32.c | 1 - arch/sparc/mm/fault_64.c | 1 - arch/sparc/mm/hugetlbpage.c | 1 - arch/sparc/mm/init_32.c | 1 - arch/sparc/mm/init_64.c | 1 - arch/sparc/mm/io-unit.c | 1 - arch/sparc/mm/iommu.c | 1 - arch/sparc/mm/tlb.c | 1 - arch/um/kernel/process.c | 1 - arch/um/kernel/skas/mmu.c | 1 - arch/um/kernel/skas/uaccess.c | 1 - arch/um/kernel/tlb.c | 1 - arch/um/kernel/trap.c | 1 - arch/um/kernel/um_arch.c | 1 - arch/unicore32/kernel/module.c | 1 - arch/unicore32/mm/fault.c | 1 - arch/x86/include/asm/iomap.h | 1 - arch/x86/include/asm/xen/page.h | 1 - arch/x86/kernel/alternative.c | 1 - arch/x86/kernel/amd_gart_64.c | 1 - arch/x86/kernel/doublefault_32.c | 1 - arch/x86/kernel/machine_kexec_32.c | 1 - arch/x86/kernel/machine_kexec_64.c | 1 - arch/x86/kernel/module.c | 1 - arch/x86/kernel/process_32.c | 1 - arch/x86/kernel/process_64.c | 1 - arch/x86/kernel/ptrace.c | 1 - arch/x86/kernel/tboot.c | 1 - arch/x86/mm/dump_pagetables.c | 1 - arch/x86/mm/init_32.c | 1 - arch/x86/mm/init_64.c | 1 - arch/x86/mm/kasan_init_64.c | 1 - arch/x86/mm/pat/cpa-test.c | 1 - arch/x86/mm/pat/memtype.c | 1 - arch/x86/mm/pgtable.c | 1 - arch/x86/mm/pgtable_32.c | 1 - arch/x86/mm/pti.c | 1 - arch/x86/platform/efi/efi_64.c | 1 - arch/x86/xen/enlighten_pv.c | 1 - arch/x86/xen/grant-table.c | 1 - arch/xtensa/kernel/process.c | 1 - arch/xtensa/kernel/ptrace.c | 1 - arch/xtensa/kernel/setup.c | 1 - drivers/char/agp/frontend.c | 1 - drivers/char/agp/generic.c | 1 - drivers/char/bsr.c | 1 - drivers/char/mspec.c | 1 - drivers/gpu/drm/i915/i915_mm.c | 1 - drivers/infiniband/sw/rdmavt/mmap.c | 1 - drivers/infiniband/sw/rxe/rxe_mmap.c | 1 - drivers/media/platform/davinci/vpbe_display.c | 1 - drivers/media/v4l2-core/v4l2-common.c | 1 - drivers/misc/sgi-gru/grufault.c | 1 - drivers/net/ethernet/sun/sunhme.c | 1 - drivers/sbus/char/flash.c | 1 - drivers/sbus/char/uctrl.c | 1 - 
drivers/scsi/a2091.c | 1 - drivers/scsi/a3000.c | 1 - drivers/scsi/gvp11.c | 1 - drivers/scsi/lasi700.c | 1 - drivers/scsi/mvme147.c | 1 - drivers/scsi/sni_53c710.c | 1 - drivers/video/console/newport_con.c | 1 - drivers/video/fbdev/acornfb.c | 1 - drivers/video/fbdev/atafb.c | 1 - drivers/video/fbdev/cirrusfb.c | 1 - drivers/video/fbdev/cyber2000fb.c | 1 - drivers/video/fbdev/fb-puv3.c | 1 - drivers/video/fbdev/hitfb.c | 1 - drivers/video/fbdev/neofb.c | 1 - drivers/video/fbdev/q40fb.c | 1 - drivers/video/fbdev/savage/savagefb_driver.c | 1 - drivers/xen/balloon.c | 1 - drivers/xen/grant-table.c | 1 - drivers/xen/privcmd.c | 1 - drivers/xen/xenbus/xenbus_probe.c | 1 - drivers/xen/xenbus/xenbus_probe_backend.c | 1 - drivers/xen/xenbus/xenbus_probe_frontend.c | 1 - fs/proc/array.c | 1 - fs/proc/meminfo.c | 1 - fs/proc/nommu.c | 1 - fs/proc/vmcore.c | 1 - include/linux/dax.h | 1 - init/init_task.c | 1 - kernel/exit.c | 1 - kernel/fork.c | 1 - kernel/power/snapshot.c | 1 - lib/ioremap.c | 1 - mm/debug_vm_pgtable.c | 1 - mm/gup.c | 1 - mm/hugetlb.c | 1 - mm/memory.c | 1 - mm/page_io.c | 1 - mm/shmem.c | 1 - mm/sparse-vmemmap.c | 1 - mm/sparse.c | 1 - mm/swap_state.c | 1 - mm/swapfile.c | 1 - mm/vmacache.c | 1 - sound/core/sgbuf.c | 1 - virt/kvm/kvm_main.c | 1 - 319 files changed, 319 deletions(-) (limited to 'include/linux') diff --git a/arch/alpha/boot/bootp.c b/arch/alpha/boot/bootp.c index 95c0359f4858..00266e6e1b71 100644 --- a/arch/alpha/boot/bootp.c +++ b/arch/alpha/boot/bootp.c @@ -16,7 +16,6 @@ #include #include -#include #include #include diff --git a/arch/alpha/boot/bootpz.c b/arch/alpha/boot/bootpz.c index 99b8d7dc344b..43af71835adf 100644 --- a/arch/alpha/boot/bootpz.c +++ b/arch/alpha/boot/bootpz.c @@ -18,7 +18,6 @@ #include #include -#include #include #include diff --git a/arch/alpha/boot/main.c b/arch/alpha/boot/main.c index 8f5ed8610970..e5347a080008 100644 --- a/arch/alpha/boot/main.c +++ b/arch/alpha/boot/main.c @@ -14,7 +14,6 @@ #include #include -#include #include diff --git a/arch/alpha/include/asm/io.h b/arch/alpha/include/asm/io.h index d1ed5a8133c5..13bea465f1c0 100644 --- a/arch/alpha/include/asm/io.h +++ b/arch/alpha/include/asm/io.h @@ -7,7 +7,6 @@ #include #include #include -#include #include #include diff --git a/arch/alpha/kernel/process.c b/arch/alpha/kernel/process.c index 48b81d015d8a..b45f0b0d6511 100644 --- a/arch/alpha/kernel/process.c +++ b/arch/alpha/kernel/process.c @@ -37,7 +37,6 @@ #include #include #include -#include #include #include diff --git a/arch/alpha/kernel/ptrace.c b/arch/alpha/kernel/ptrace.c index cb8d599e72d6..8c43212ae38e 100644 --- a/arch/alpha/kernel/ptrace.c +++ b/arch/alpha/kernel/ptrace.c @@ -19,7 +19,6 @@ #include #include -#include #include #include "proto.h" diff --git a/arch/alpha/kernel/setup.c b/arch/alpha/kernel/setup.c index 6fa802c495b4..f5c42a8fcf9c 100644 --- a/arch/alpha/kernel/setup.c +++ b/arch/alpha/kernel/setup.c @@ -55,7 +55,6 @@ static struct notifier_block alpha_panic_block = { }; #include -#include #include #include #include diff --git a/arch/alpha/kernel/smp.c b/arch/alpha/kernel/smp.c index 52995bf413fe..631cc17410d1 100644 --- a/arch/alpha/kernel/smp.c +++ b/arch/alpha/kernel/smp.c @@ -36,7 +36,6 @@ #include #include -#include #include #include #include diff --git a/arch/alpha/kernel/sys_alcor.c b/arch/alpha/kernel/sys_alcor.c index ce5430056f65..e063b3857b3d 100644 --- a/arch/alpha/kernel/sys_alcor.c +++ b/arch/alpha/kernel/sys_alcor.c @@ -23,7 +23,6 @@ #include #include #include -#include #include #include diff 
--git a/arch/alpha/kernel/sys_cabriolet.c b/arch/alpha/kernel/sys_cabriolet.c index 0aa6a27d0e2f..47459b73cdb7 100644 --- a/arch/alpha/kernel/sys_cabriolet.c +++ b/arch/alpha/kernel/sys_cabriolet.c @@ -23,7 +23,6 @@ #include #include #include -#include #include #include #include diff --git a/arch/alpha/kernel/sys_dp264.c b/arch/alpha/kernel/sys_dp264.c index d33508621820..9fb445d7dca5 100644 --- a/arch/alpha/kernel/sys_dp264.c +++ b/arch/alpha/kernel/sys_dp264.c @@ -26,7 +26,6 @@ #include #include #include -#include #include #include #include diff --git a/arch/alpha/kernel/sys_eb64p.c b/arch/alpha/kernel/sys_eb64p.c index 1cdfe55fb987..3c43fd347526 100644 --- a/arch/alpha/kernel/sys_eb64p.c +++ b/arch/alpha/kernel/sys_eb64p.c @@ -22,7 +22,6 @@ #include #include #include -#include #include #include #include diff --git a/arch/alpha/kernel/sys_eiger.c b/arch/alpha/kernel/sys_eiger.c index 016f79251141..bf99dcfd40c4 100644 --- a/arch/alpha/kernel/sys_eiger.c +++ b/arch/alpha/kernel/sys_eiger.c @@ -23,7 +23,6 @@ #include #include #include -#include #include #include #include diff --git a/arch/alpha/kernel/sys_jensen.c b/arch/alpha/kernel/sys_jensen.c index d0d44f543d77..0a2ab6cb18db 100644 --- a/arch/alpha/kernel/sys_jensen.c +++ b/arch/alpha/kernel/sys_jensen.c @@ -25,7 +25,6 @@ #include #include #include -#include #include #include "proto.h" diff --git a/arch/alpha/kernel/sys_marvel.c b/arch/alpha/kernel/sys_marvel.c index 533899a4a1a1..83d6c53d6d4d 100644 --- a/arch/alpha/kernel/sys_marvel.c +++ b/arch/alpha/kernel/sys_marvel.c @@ -18,7 +18,6 @@ #include #include #include -#include #include #include #include diff --git a/arch/alpha/kernel/sys_miata.c b/arch/alpha/kernel/sys_miata.c index 702292af2225..e1bee8f84c58 100644 --- a/arch/alpha/kernel/sys_miata.c +++ b/arch/alpha/kernel/sys_miata.c @@ -22,7 +22,6 @@ #include #include #include -#include #include #include diff --git a/arch/alpha/kernel/sys_mikasa.c b/arch/alpha/kernel/sys_mikasa.c index 3af4f94113e1..7690dfd57cb6 100644 --- a/arch/alpha/kernel/sys_mikasa.c +++ b/arch/alpha/kernel/sys_mikasa.c @@ -23,7 +23,6 @@ #include #include #include -#include #include #include #include diff --git a/arch/alpha/kernel/sys_nautilus.c b/arch/alpha/kernel/sys_nautilus.c index 32850e45834b..53adf43dcd44 100644 --- a/arch/alpha/kernel/sys_nautilus.c +++ b/arch/alpha/kernel/sys_nautilus.c @@ -40,7 +40,6 @@ #include #include #include -#include #include #include #include diff --git a/arch/alpha/kernel/sys_noritake.c b/arch/alpha/kernel/sys_noritake.c index b106f327f765..47f3ce4f719a 100644 --- a/arch/alpha/kernel/sys_noritake.c +++ b/arch/alpha/kernel/sys_noritake.c @@ -24,7 +24,6 @@ #include #include #include -#include #include #include #include diff --git a/arch/alpha/kernel/sys_rawhide.c b/arch/alpha/kernel/sys_rawhide.c index b76f65d0e8b5..b5846ffdadce 100644 --- a/arch/alpha/kernel/sys_rawhide.c +++ b/arch/alpha/kernel/sys_rawhide.c @@ -21,7 +21,6 @@ #include #include #include -#include #include #include diff --git a/arch/alpha/kernel/sys_ruffian.c b/arch/alpha/kernel/sys_ruffian.c index d33074011960..4b1c8d85c4f0 100644 --- a/arch/alpha/kernel/sys_ruffian.c +++ b/arch/alpha/kernel/sys_ruffian.c @@ -23,7 +23,6 @@ #include #include #include -#include #include #include diff --git a/arch/alpha/kernel/sys_rx164.c b/arch/alpha/kernel/sys_rx164.c index 4d85eaeb44aa..94046f9aea08 100644 --- a/arch/alpha/kernel/sys_rx164.c +++ b/arch/alpha/kernel/sys_rx164.c @@ -22,7 +22,6 @@ #include #include #include -#include #include #include diff --git 
a/arch/alpha/kernel/sys_sable.c b/arch/alpha/kernel/sys_sable.c index 3cf0d32da5d8..930005b2f630 100644 --- a/arch/alpha/kernel/sys_sable.c +++ b/arch/alpha/kernel/sys_sable.c @@ -21,7 +21,6 @@ #include #include #include -#include #include #include diff --git a/arch/alpha/kernel/sys_sio.c b/arch/alpha/kernel/sys_sio.c index a6bdc1da47ad..7c420d8dac53 100644 --- a/arch/alpha/kernel/sys_sio.c +++ b/arch/alpha/kernel/sys_sio.c @@ -25,7 +25,6 @@ #include #include #include -#include #include #include #include diff --git a/arch/alpha/kernel/sys_sx164.c b/arch/alpha/kernel/sys_sx164.c index 17cc203176c8..dd9de84b630c 100644 --- a/arch/alpha/kernel/sys_sx164.c +++ b/arch/alpha/kernel/sys_sx164.c @@ -22,7 +22,6 @@ #include #include #include -#include #include #include #include diff --git a/arch/alpha/kernel/sys_takara.c b/arch/alpha/kernel/sys_takara.c index e230c6864088..9e2adb69bc74 100644 --- a/arch/alpha/kernel/sys_takara.c +++ b/arch/alpha/kernel/sys_takara.c @@ -21,7 +21,6 @@ #include #include #include -#include #include #include diff --git a/arch/alpha/kernel/sys_titan.c b/arch/alpha/kernel/sys_titan.c index c8390d8de140..b1f3b4fcf99b 100644 --- a/arch/alpha/kernel/sys_titan.c +++ b/arch/alpha/kernel/sys_titan.c @@ -26,7 +26,6 @@ #include #include #include -#include #include #include #include diff --git a/arch/alpha/kernel/sys_wildfire.c b/arch/alpha/kernel/sys_wildfire.c index 2191bde161fd..2c54d707142a 100644 --- a/arch/alpha/kernel/sys_wildfire.c +++ b/arch/alpha/kernel/sys_wildfire.c @@ -20,7 +20,6 @@ #include #include #include -#include #include #include #include diff --git a/arch/alpha/mm/init.c b/arch/alpha/mm/init.c index 667cd21393b5..3c42b3147fd6 100644 --- a/arch/alpha/mm/init.c +++ b/arch/alpha/mm/init.c @@ -24,7 +24,6 @@ #include #include -#include #include #include #include diff --git a/arch/arm/kernel/machine_kexec.c b/arch/arm/kernel/machine_kexec.c index 76300f3813e8..974b6c64d3e6 100644 --- a/arch/arm/kernel/machine_kexec.c +++ b/arch/arm/kernel/machine_kexec.c @@ -10,7 +10,6 @@ #include #include #include -#include #include #include #include diff --git a/arch/arm/kernel/module.c b/arch/arm/kernel/module.c index af0a8500a24e..e15444b25ca0 100644 --- a/arch/arm/kernel/module.c +++ b/arch/arm/kernel/module.c @@ -17,7 +17,6 @@ #include #include -#include #include #include #include diff --git a/arch/arm/kernel/ptrace.c b/arch/arm/kernel/ptrace.c index 4cc6a7eff635..d0f7c8896c96 100644 --- a/arch/arm/kernel/ptrace.c +++ b/arch/arm/kernel/ptrace.c @@ -25,7 +25,6 @@ #include #include -#include #include #define CREATE_TRACE_POINTS diff --git a/arch/arm/kernel/smp.c b/arch/arm/kernel/smp.c index 46e1be9e57a8..9a6432557871 100644 --- a/arch/arm/kernel/smp.c +++ b/arch/arm/kernel/smp.c @@ -37,7 +37,6 @@ #include #include #include -#include #include #include #include diff --git a/arch/arm/mach-ebsa110/core.c b/arch/arm/mach-ebsa110/core.c index 575b2e2b6759..5960e3dfd2bf 100644 --- a/arch/arm/mach-ebsa110/core.c +++ b/arch/arm/mach-ebsa110/core.c @@ -17,7 +17,6 @@ #include #include #include -#include #include #include diff --git a/arch/arm/mach-footbridge/common.c b/arch/arm/mach-footbridge/common.c index 015f75d1c98d..eee095f0e2f6 100644 --- a/arch/arm/mach-footbridge/common.c +++ b/arch/arm/mach-footbridge/common.c @@ -14,7 +14,6 @@ #include #include