From 07236eab7a3139da97aef9f5f21f403be82a82ea Mon Sep 17 00:00:00 2001
From: Andrii Nakryiko <andrii@kernel.org>
Date: Wed, 8 Mar 2023 10:41:14 -0800
Subject: bpf: factor out fetching basic kfunc metadata

Factor out logic to fetch basic kfunc metadata based on struct bpf_insn.
This is not exactly short or trivial code to just copy/paste and this
information is sometimes necessary in other parts of the verifier logic.
Subsequent patches will rely on this to determine if an instruction is
a kfunc call to iterator next method.

No functional changes intended, including that verbose() warning
behavior when kfunc is not allowed for a particular program type.

Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
Link: https://lore.kernel.org/r/20230308184121.1165081-2-andrii@kernel.org
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 kernel/bpf/verifier.c | 92 +++++++++++++++++++++++++++++++++------------------
 1 file changed, 59 insertions(+), 33 deletions(-)

(limited to 'kernel')

diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index b2116ca78d9a..8d40fba6a1c0 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -10079,24 +10079,21 @@ static int check_kfunc_args(struct bpf_verifier_env *env, struct bpf_kfunc_call_
 	return 0;
 }
 
-static int check_kfunc_call(struct bpf_verifier_env *env, struct bpf_insn *insn,
-			    int *insn_idx_p)
+static int fetch_kfunc_meta(struct bpf_verifier_env *env,
+			    struct bpf_insn *insn,
+			    struct bpf_kfunc_call_arg_meta *meta,
+			    const char **kfunc_name)
 {
-	const struct btf_type *t, *func, *func_proto, *ptr_type;
-	u32 i, nargs, func_id, ptr_type_id, release_ref_obj_id;
-	struct bpf_reg_state *regs = cur_regs(env);
-	const char *func_name, *ptr_type_name;
-	bool sleepable, rcu_lock, rcu_unlock;
-	struct bpf_kfunc_call_arg_meta meta;
-	int err, insn_idx = *insn_idx_p;
-	const struct btf_param *args;
-	const struct btf_type *ret_t;
+	const struct btf_type *func, *func_proto;
+	u32 func_id, *kfunc_flags;
+	const char *func_name;
 	struct btf *desc_btf;
-	u32 *kfunc_flags;
 
-	/* skip for now, but return error when we find this in fixup_kfunc_call */
+	if (kfunc_name)
+		*kfunc_name = NULL;
+
 	if (!insn->imm)
-		return 0;
+		return -EINVAL;
 
 	desc_btf = find_kfunc_desc_btf(env, insn->off);
 	if (IS_ERR(desc_btf))
@@ -10105,22 +10102,51 @@ static int check_kfunc_call(struct bpf_verifier_env *env, struct bpf_insn *insn,
 	func_id = insn->imm;
 	func = btf_type_by_id(desc_btf, func_id);
 	func_name = btf_name_by_offset(desc_btf, func->name_off);
+	if (kfunc_name)
+		*kfunc_name = func_name;
 	func_proto = btf_type_by_id(desc_btf, func->type);
 
 	kfunc_flags = btf_kfunc_id_set_contains(desc_btf, resolve_prog_type(env->prog), func_id);
 	if (!kfunc_flags) {
-		verbose(env, "calling kernel function %s is not allowed\n",
-			func_name);
 		return -EACCES;
 	}
 
-	/* Prepare kfunc call metadata */
-	memset(&meta, 0, sizeof(meta));
-	meta.btf = desc_btf;
-	meta.func_id = func_id;
-	meta.kfunc_flags = *kfunc_flags;
-	meta.func_proto = func_proto;
-	meta.func_name = func_name;
+	memset(meta, 0, sizeof(*meta));
+	meta->btf = desc_btf;
+	meta->func_id = func_id;
+	meta->kfunc_flags = *kfunc_flags;
+	meta->func_proto = func_proto;
+	meta->func_name = func_name;
+
+	return 0;
+}
+
+static int check_kfunc_call(struct bpf_verifier_env *env, struct bpf_insn *insn,
+			    int *insn_idx_p)
+{
+	const struct btf_type *t, *ptr_type;
+	u32 i, nargs, ptr_type_id, release_ref_obj_id;
+	struct bpf_reg_state *regs = cur_regs(env);
+	const char *func_name, *ptr_type_name;
+	bool sleepable, rcu_lock, rcu_unlock;
+	struct bpf_kfunc_call_arg_meta meta;
+	struct bpf_insn_aux_data *insn_aux;
+	int err, insn_idx = *insn_idx_p;
+	const struct btf_param *args;
+	const struct btf_type *ret_t;
+	struct btf *desc_btf;
+
+	/* skip for now, but return error when we find this in fixup_kfunc_call */
+	if (!insn->imm)
+		return 0;
+
+	err = fetch_kfunc_meta(env, insn, &meta, &func_name);
+	if (err == -EACCES && func_name)
+		verbose(env, "calling kernel function %s is not allowed\n", func_name);
+	if (err)
+		return err;
+	desc_btf = meta.btf;
+	insn_aux = &env->insn_aux_data[insn_idx];
 
 	if (is_kfunc_destructive(&meta) && !capable(CAP_SYS_BOOT)) {
 		verbose(env, "destructive kfunc calls require CAP_SYS_BOOT capability\n");
@@ -10173,7 +10199,7 @@ static int check_kfunc_call(struct bpf_verifier_env *env, struct bpf_insn *insn,
 		err = release_reference(env, regs[meta.release_regno].ref_obj_id);
 		if (err) {
 			verbose(env, "kfunc %s#%d reference has not been acquired before\n",
-				func_name, func_id);
+				func_name, meta.func_id);
 			return err;
 		}
 	}
@@ -10185,14 +10211,14 @@ static int check_kfunc_call(struct bpf_verifier_env *env, struct bpf_insn *insn,
 		err = ref_convert_owning_non_owning(env, release_ref_obj_id);
 		if (err) {
 			verbose(env, "kfunc %s#%d conversion of owning ref to non-owning failed\n",
-				func_name, func_id);
+				func_name, meta.func_id);
 			return err;
 		}
 
 		err = release_reference(env, release_ref_obj_id);
 		if (err) {
 			verbose(env, "kfunc %s#%d reference has not been acquired before\n",
-				func_name, func_id);
+				func_name, meta.func_id);
 			return err;
 		}
 	}
@@ -10202,7 +10228,7 @@ static int check_kfunc_call(struct bpf_verifier_env *env, struct bpf_insn *insn,
 					set_rbtree_add_callback_state);
 		if (err) {
 			verbose(env, "kfunc %s#%d failed callback verification\n",
-				func_name, func_id);
+				func_name, meta.func_id);
 			return err;
 		}
 	}
@@ -10211,7 +10237,7 @@ static int check_kfunc_call(struct bpf_verifier_env *env, struct bpf_insn *insn,
 		mark_reg_not_init(env, regs, caller_saved[i]);
 
 	/* Check return type */
-	t = btf_type_skip_modifiers(desc_btf, func_proto->type, NULL);
+	t = btf_type_skip_modifiers(desc_btf, meta.func_proto->type, NULL);
 
 	if (is_kfunc_acquire(&meta) && !btf_type_is_struct_ptr(meta.btf, t)) {
 		/* Only exception is bpf_obj_new_impl */
@@ -10260,11 +10286,11 @@ static int check_kfunc_call(struct bpf_verifier_env *env, struct bpf_insn *insn,
 				regs[BPF_REG_0].btf = ret_btf;
 				regs[BPF_REG_0].btf_id = ret_btf_id;
 
-				env->insn_aux_data[insn_idx].obj_new_size = ret_t->size;
-				env->insn_aux_data[insn_idx].kptr_struct_meta =
+				insn_aux->obj_new_size = ret_t->size;
+				insn_aux->kptr_struct_meta =
 					btf_find_struct_meta(ret_btf, ret_btf_id);
 			} else if (meta.func_id == special_kfunc_list[KF_bpf_obj_drop_impl]) {
-				env->insn_aux_data[insn_idx].kptr_struct_meta =
+				insn_aux->kptr_struct_meta =
 					btf_find_struct_meta(meta.arg_obj_drop.btf,
 							     meta.arg_obj_drop.btf_id);
 			} else if (meta.func_id == special_kfunc_list[KF_bpf_list_pop_front] ||
@@ -10397,8 +10423,8 @@ static int check_kfunc_call(struct bpf_verifier_env *env, struct bpf_insn *insn,
 			regs[BPF_REG_0].id = ++env->id_gen;
 	} /* else { add_kfunc_call() ensures it is btf_type_is_void(t) } */
 
-	nargs = btf_type_vlen(func_proto);
-	args = (const struct btf_param *)(func_proto + 1);
+	nargs = btf_type_vlen(meta.func_proto);
+	args = (const struct btf_param *)(meta.func_proto + 1);
 	for (i = 0; i < nargs; i++) {
 		u32 regno = i + 1;
 
-- 
cgit v1.2.3


From 215bf4962f6c9605710012fad222a5fec001b3ad Mon Sep 17 00:00:00 2001
From: Andrii Nakryiko <andrii@kernel.org>
Date: Wed, 8 Mar 2023 10:41:15 -0800
Subject: bpf: add iterator kfuncs registration and validation logic

Add ability to register kfuncs that implement BPF open-coded iterator
contract and enforce naming and function proto convention. Enforcement
happens at the time of kfunc registration and significantly simplifies
the rest of iterators logic in the verifier.

More details follow in subsequent patches, but we enforce the following
conditions.

All kfuncs (constructor, next, destructor) have to be named consistenly
as bpf_iter_<type>_{new,next,destroy}(), respectively. <type> represents
iterator type, and iterator state should be represented as a matching
`struct bpf_iter_<type>` state type. Also, all iter kfuncs should have
a pointer to this `struct bpf_iter_<type>` as the very first argument.

Additionally:
  - Constructor, i.e., bpf_iter_<type>_new(), can have arbitrary extra
  number of arguments. Return type is not enforced either.
  - Next method, i.e., bpf_iter_<type>_next(), has to return a pointer
  type and should have exactly one argument: `struct bpf_iter_<type> *`
  (const/volatile/restrict and typedefs are ignored).
  - Destructor, i.e., bpf_iter_<type>_destroy(), should return void and
  should have exactly one argument, similar to the next method.
  - struct bpf_iter_<type> size is enforced to be positive and
  a multiple of 8 bytes (to fit stack slots correctly).

Such strictness and consistency allows to build generic helpers
abstracting important, but boilerplate, details to be able to use
open-coded iterators effectively and ergonomically (see bpf_for_each()
in subsequent patches). It also simplifies the verifier logic in some
places. At the same time, this doesn't hurt generality of possible
iterator implementations. Win-win.

Constructor kfunc is marked with a new KF_ITER_NEW flags, next method is
marked with KF_ITER_NEXT (and should also have KF_RET_NULL, of course),
while destructor kfunc is marked as KF_ITER_DESTROY.

Additionally, we add a trivial kfunc name validation: it should be
a valid non-NULL and non-empty string.

Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
Link: https://lore.kernel.org/r/20230308184121.1165081-3-andrii@kernel.org
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 include/linux/bpf_verifier.h |   2 +
 include/linux/btf.h          |   4 ++
 kernel/bpf/btf.c             | 112 ++++++++++++++++++++++++++++++++++++++++++-
 3 files changed, 117 insertions(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h
index 18538bad2b8c..e2dc7f064449 100644
--- a/include/linux/bpf_verifier.h
+++ b/include/linux/bpf_verifier.h
@@ -59,6 +59,8 @@ struct bpf_active_lock {
 	u32 id;
 };
 
+#define ITER_PREFIX "bpf_iter_"
+
 struct bpf_reg_state {
 	/* Ordering of fields matters.  See states_equal() */
 	enum bpf_reg_type type;
diff --git a/include/linux/btf.h b/include/linux/btf.h
index 556b3e2e7471..1bba0827e8c4 100644
--- a/include/linux/btf.h
+++ b/include/linux/btf.h
@@ -71,6 +71,10 @@
 #define KF_SLEEPABLE    (1 << 5) /* kfunc may sleep */
 #define KF_DESTRUCTIVE  (1 << 6) /* kfunc performs destructive actions */
 #define KF_RCU          (1 << 7) /* kfunc takes either rcu or trusted pointer arguments */
+/* only one of KF_ITER_{NEW,NEXT,DESTROY} could be specified per kfunc */
+#define KF_ITER_NEW     (1 << 8) /* kfunc implements BPF iter constructor */
+#define KF_ITER_NEXT    (1 << 9) /* kfunc implements BPF iter next method */
+#define KF_ITER_DESTROY (1 << 10) /* kfunc implements BPF iter destructor */
 
 /*
  * Tag marking a kernel function as a kfunc. This is meant to minimize the
diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c
index a8cb09e5973b..71758cd15b07 100644
--- a/kernel/bpf/btf.c
+++ b/kernel/bpf/btf.c
@@ -7596,6 +7596,108 @@ BTF_ID_LIST_GLOBAL(btf_tracing_ids, MAX_BTF_TRACING_TYPE)
 BTF_TRACING_TYPE_xxx
 #undef BTF_TRACING_TYPE
 
+static int btf_check_iter_kfuncs(struct btf *btf, const char *func_name,
+				 const struct btf_type *func, u32 func_flags)
+{
+	u32 flags = func_flags & (KF_ITER_NEW | KF_ITER_NEXT | KF_ITER_DESTROY);
+	const char *name, *sfx, *iter_name;
+	const struct btf_param *arg;
+	const struct btf_type *t;
+	char exp_name[128];
+	u32 nr_args;
+
+	/* exactly one of KF_ITER_{NEW,NEXT,DESTROY} can be set */
+	if (!flags || (flags & (flags - 1)))
+		return -EINVAL;
+
+	/* any BPF iter kfunc should have `struct bpf_iter_<type> *` first arg */
+	nr_args = btf_type_vlen(func);
+	if (nr_args < 1)
+		return -EINVAL;
+
+	arg = &btf_params(func)[0];
+	t = btf_type_skip_modifiers(btf, arg->type, NULL);
+	if (!t || !btf_type_is_ptr(t))
+		return -EINVAL;
+	t = btf_type_skip_modifiers(btf, t->type, NULL);
+	if (!t || !__btf_type_is_struct(t))
+		return -EINVAL;
+
+	name = btf_name_by_offset(btf, t->name_off);
+	if (!name || strncmp(name, ITER_PREFIX, sizeof(ITER_PREFIX) - 1))
+		return -EINVAL;
+
+	/* sizeof(struct bpf_iter_<type>) should be a multiple of 8 to
+	 * fit nicely in stack slots
+	 */
+	if (t->size == 0 || (t->size % 8))
+		return -EINVAL;
+
+	/* validate bpf_iter_<type>_{new,next,destroy}(struct bpf_iter_<type> *)
+	 * naming pattern
+	 */
+	iter_name = name + sizeof(ITER_PREFIX) - 1;
+	if (flags & KF_ITER_NEW)
+		sfx = "new";
+	else if (flags & KF_ITER_NEXT)
+		sfx = "next";
+	else /* (flags & KF_ITER_DESTROY) */
+		sfx = "destroy";
+
+	snprintf(exp_name, sizeof(exp_name), "bpf_iter_%s_%s", iter_name, sfx);
+	if (strcmp(func_name, exp_name))
+		return -EINVAL;
+
+	/* only iter constructor should have extra arguments */
+	if (!(flags & KF_ITER_NEW) && nr_args != 1)
+		return -EINVAL;
+
+	if (flags & KF_ITER_NEXT) {
+		/* bpf_iter_<type>_next() should return pointer */
+		t = btf_type_skip_modifiers(btf, func->type, NULL);
+		if (!t || !btf_type_is_ptr(t))
+			return -EINVAL;
+	}
+
+	if (flags & KF_ITER_DESTROY) {
+		/* bpf_iter_<type>_destroy() should return void */
+		t = btf_type_by_id(btf, func->type);
+		if (!t || !btf_type_is_void(t))
+			return -EINVAL;
+	}
+
+	return 0;
+}
+
+static int btf_check_kfunc_protos(struct btf *btf, u32 func_id, u32 func_flags)
+{
+	const struct btf_type *func;
+	const char *func_name;
+	int err;
+
+	/* any kfunc should be FUNC -> FUNC_PROTO */
+	func = btf_type_by_id(btf, func_id);
+	if (!func || !btf_type_is_func(func))
+		return -EINVAL;
+
+	/* sanity check kfunc name */
+	func_name = btf_name_by_offset(btf, func->name_off);
+	if (!func_name || !func_name[0])
+		return -EINVAL;
+
+	func = btf_type_by_id(btf, func->type);
+	if (!func || !btf_type_is_func_proto(func))
+		return -EINVAL;
+
+	if (func_flags & (KF_ITER_NEW | KF_ITER_NEXT | KF_ITER_DESTROY)) {
+		err = btf_check_iter_kfuncs(btf, func_name, func, func_flags);
+		if (err)
+			return err;
+	}
+
+	return 0;
+}
+
 /* Kernel Function (kfunc) BTF ID set registration API */
 
 static int btf_populate_kfunc_set(struct btf *btf, enum btf_kfunc_hook hook,
@@ -7772,7 +7874,7 @@ static int __register_btf_kfunc_id_set(enum btf_kfunc_hook hook,
 				       const struct btf_kfunc_id_set *kset)
 {
 	struct btf *btf;
-	int ret;
+	int ret, i;
 
 	btf = btf_get_module_btf(kset->owner);
 	if (!btf) {
@@ -7789,7 +7891,15 @@ static int __register_btf_kfunc_id_set(enum btf_kfunc_hook hook,
 	if (IS_ERR(btf))
 		return PTR_ERR(btf);
 
+	for (i = 0; i < kset->set->cnt; i++) {
+		ret = btf_check_kfunc_protos(btf, kset->set->pairs[i].id,
+					     kset->set->pairs[i].flags);
+		if (ret)
+			goto err_out;
+	}
+
 	ret = btf_populate_kfunc_set(btf, hook, kset->set);
+err_out:
 	btf_put(btf);
 	return ret;
 }
-- 
cgit v1.2.3


From 06accc8779c1d558a5b5a21f2ac82b0c95827ddd Mon Sep 17 00:00:00 2001
From: Andrii Nakryiko <andrii@kernel.org>
Date: Wed, 8 Mar 2023 10:41:16 -0800
Subject: bpf: add support for open-coded iterator loops

Teach verifier about the concept of the open-coded (or inline) iterators.

This patch adds generic iterator loop verification logic, new STACK_ITER
stack slot type to contain iterator state, and necessary kfunc plumbing
for iterator's constructor, destructor and next methods. Next patch
implements first specific iterator (numbers iterator for implementing
for() loop logic). Such split allows to have more focused commits for
verifier logic and separate commit that we could point later to
demonstrating  what does it take to add a new kind of iterator.

Each kind of iterator has its own associated struct bpf_iter_<type>,
where <type> denotes a specific type of iterator. struct bpf_iter_<type>
state is supposed to live on BPF program stack, so there will be no way
to change its size later on without breaking backwards compatibility, so
choose wisely! But given this struct is specific to a given <type> of
iterator, this allows a lot of flexibility: simple iterators could be
fine with just one stack slot (8 bytes), like numbers iterator in the
next patch, while some other more complicated iterators might need way
more to keep their iterator state. Either way, such design allows to
avoid runtime memory allocations, which otherwise would be necessary if
we fixed on-the-stack size and it turned out to be too small for a given
iterator implementation.

The way BPF verifier logic is implemented, there are no artificial
restrictions on a number of active iterators, it should work correctly
using multiple active iterators at the same time. This also means you
can have multiple nested iteration loops. struct bpf_iter_<type>
reference can be safely passed to subprograms as well.

General flow is easiest to demonstrate with a simple example using
number iterator implemented in next patch. Here's the simplest possible
loop:

  struct bpf_iter_num it;
  int *v;

  bpf_iter_num_new(&it, 2, 5);
  while ((v = bpf_iter_num_next(&it))) {
      bpf_printk("X = %d", *v);
  }
  bpf_iter_num_destroy(&it);

Above snippet should output "X = 2", "X = 3", "X = 4". Note that 5 is
exclusive and is not returned. This matches similar APIs (e.g., slices
in Go or Rust) that implement a range of elements, where end index is
non-inclusive.

In the above example, we see a trio of function:
  - constructor, bpf_iter_num_new(), which initializes iterator state
  (struct bpf_iter_num it) on the stack. If any of the input arguments
  are invalid, constructor should make sure to still initialize it such
  that subsequent bpf_iter_num_next() calls will return NULL. I.e., on
  error, return error and construct empty iterator.
  - next method, bpf_iter_num_next(), which accepts pointer to iterator
  state and produces an element. Next method should always return
  a pointer. The contract between BPF verifier is that next method will
  always eventually return NULL when elements are exhausted. Once NULL is
  returned, subsequent next calls should keep returning NULL. In the
  case of numbers iterator, bpf_iter_num_next() returns a pointer to an int
  (storage for this integer is inside the iterator state itself),
  which can be dereferenced after corresponding NULL check.
  - once done with the iterator, it's mandated that user cleans up its
  state with the call to destructor, bpf_iter_num_destroy() in this
  case. Destructor frees up any resources and marks stack space used by
  struct bpf_iter_num as usable for something else.

Any other iterator implementation will have to implement at least these
three methods. It is enforced that for any given type of iterator only
applicable constructor/destructor/next are callable. I.e., verifier
ensures you can't pass number iterator state into, say, cgroup
iterator's next method.

It is important to keep the naming pattern consistent to be able to
create generic macros to help with BPF iter usability. E.g., one
of the follow up patches adds generic bpf_for_each() macro to bpf_misc.h
in selftests, which allows to utilize iterator "trio" nicely without
having to code the above somewhat tedious loop explicitly every time.
This is enforced at kfunc registration point by one of the previous
patches in this series.

At the implementation level, iterator state tracking for verification
purposes is very similar to dynptr. We add STACK_ITER stack slot type,
reserve necessary number of slots, depending on
sizeof(struct bpf_iter_<type>), and keep track of necessary extra state
in the "main" slot, which is marked with non-zero ref_obj_id. Other
slots are also marked as STACK_ITER, but have zero ref_obj_id. This is
simpler than having a separate "is_first_slot" flag.

Another big distinction is that STACK_ITER is *always refcounted*, which
simplifies implementation without sacrificing usability. So no need for
extra "iter_id", no need to anticipate reuse of STACK_ITER slots for new
constructors, etc. Keeping it simple here.

As far as the verification logic goes, there are two extensive comments:
in process_iter_next_call() and iter_active_depths_differ() explaining
some important and sometimes subtle aspects. Please refer to them for
details.

But from 10,000-foot point of view, next methods are the points of
forking a verification state, which are conceptually similar to what
verifier is doing when validating conditional jump. We branch out at
a `call bpf_iter_<type>_next` instruction and simulate two outcomes:
NULL (iteration is done) and non-NULL (new element is returned). NULL is
simulated first and is supposed to reach exit without looping. After
that non-NULL case is validated and it either reaches exit (for trivial
examples with no real loop), or reaches another `call bpf_iter_<type>_next`
instruction with the state equivalent to already (partially) validated
one. State equivalency at that point means we technically are going to
be looping forever without "breaking out" out of established "state
envelope" (i.e., subsequent iterations don't add any new knowledge or
constraints to the verifier state, so running 1, 2, 10, or a million of
them doesn't matter). But taking into account the contract stating that
iterator next method *has to* return NULL eventually, we can conclude
that loop body is safe and will eventually terminate. Given we validated
logic outside of the loop (NULL case), and concluded that loop body is
safe (though potentially looping many times), verifier can claim safety
of the overall program logic.

The rest of the patch is necessary plumbing for state tracking, marking,
validation, and necessary further kfunc plumbing to allow implementing
iterator constructor, destructor, and next methods.

Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
Link: https://lore.kernel.org/r/20230308184121.1165081-4-andrii@kernel.org
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 include/linux/bpf_verifier.h |  23 ++
 kernel/bpf/verifier.c        | 595 ++++++++++++++++++++++++++++++++++++++++++-
 2 files changed, 610 insertions(+), 8 deletions(-)

(limited to 'kernel')

diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h
index e2dc7f064449..0c052bc79940 100644
--- a/include/linux/bpf_verifier.h
+++ b/include/linux/bpf_verifier.h
@@ -61,6 +61,12 @@ struct bpf_active_lock {
 
 #define ITER_PREFIX "bpf_iter_"
 
+enum bpf_iter_state {
+	BPF_ITER_STATE_INVALID, /* for non-first slot */
+	BPF_ITER_STATE_ACTIVE,
+	BPF_ITER_STATE_DRAINED,
+};
+
 struct bpf_reg_state {
 	/* Ordering of fields matters.  See states_equal() */
 	enum bpf_reg_type type;
@@ -105,6 +111,18 @@ struct bpf_reg_state {
 			bool first_slot;
 		} dynptr;
 
+		/* For bpf_iter stack slots */
+		struct {
+			/* BTF container and BTF type ID describing
+			 * struct bpf_iter_<type> of an iterator state
+			 */
+			struct btf *btf;
+			u32 btf_id;
+			/* packing following two fields to fit iter state into 16 bytes */
+			enum bpf_iter_state state:2;
+			int depth:30;
+		} iter;
+
 		/* Max size from any of the above. */
 		struct {
 			unsigned long raw1;
@@ -143,6 +161,8 @@ struct bpf_reg_state {
 	 * same reference to the socket, to determine proper reference freeing.
 	 * For stack slots that are dynptrs, this is used to track references to
 	 * the dynptr to determine proper reference freeing.
+	 * Similarly to dynptrs, we use ID to track "belonging" of a reference
+	 * to a specific instance of bpf_iter.
 	 */
 	u32 id;
 	/* PTR_TO_SOCKET and PTR_TO_TCP_SOCK could be a ptr returned
@@ -213,9 +233,11 @@ enum bpf_stack_slot_type {
 	 * is stored in bpf_stack_state->spilled_ptr.dynptr.type
 	 */
 	STACK_DYNPTR,
+	STACK_ITER,
 };
 
 #define BPF_REG_SIZE 8	/* size of eBPF register in bytes */
+
 #define BPF_DYNPTR_SIZE		sizeof(struct bpf_dynptr_kern)
 #define BPF_DYNPTR_NR_SLOTS		(BPF_DYNPTR_SIZE / BPF_REG_SIZE)
 
@@ -450,6 +472,7 @@ struct bpf_insn_aux_data {
 	bool sanitize_stack_spill; /* subject to Spectre v4 sanitation */
 	bool zext_dst; /* this insn zero extends dst reg */
 	bool storage_get_func_atomic; /* bpf_*_storage_get() with atomic memory alloc */
+	bool is_iter_next; /* bpf_iter_<type>_next() kfunc call */
 	u8 alu_state; /* used in combination with alu_limit */
 
 	/* below fields are initialized once */
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index 8d40fba6a1c0..45a082284464 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -302,6 +302,10 @@ struct bpf_kfunc_call_arg_meta {
 		enum bpf_dynptr_type type;
 		u32 id;
 	} initialized_dynptr;
+	struct {
+		u8 spi;
+		u8 frameno;
+	} iter;
 	u64 mem_size;
 };
 
@@ -668,6 +672,7 @@ static char slot_type_char[] = {
 	[STACK_MISC]	= 'm',
 	[STACK_ZERO]	= '0',
 	[STACK_DYNPTR]	= 'd',
+	[STACK_ITER]	= 'i',
 };
 
 static void print_liveness(struct bpf_verifier_env *env,
@@ -742,6 +747,11 @@ static int dynptr_get_spi(struct bpf_verifier_env *env, struct bpf_reg_state *re
 	return stack_slot_obj_get_spi(env, reg, "dynptr", BPF_DYNPTR_NR_SLOTS);
 }
 
+static int iter_get_spi(struct bpf_verifier_env *env, struct bpf_reg_state *reg, int nr_slots)
+{
+	return stack_slot_obj_get_spi(env, reg, "iter", nr_slots);
+}
+
 static const char *kernel_type_name(const struct btf* btf, u32 id)
 {
 	return btf_name_by_offset(btf, btf_type_by_id(btf, id)->name_off);
@@ -766,6 +776,30 @@ static const char *dynptr_type_str(enum bpf_dynptr_type type)
 	}
 }
 
+static const char *iter_type_str(const struct btf *btf, u32 btf_id)
+{
+	if (!btf || btf_id == 0)
+		return "<invalid>";
+
+	/* we already validated that type is valid and has conforming name */
+	return kernel_type_name(btf, btf_id) + sizeof(ITER_PREFIX) - 1;
+}
+
+static const char *iter_state_str(enum bpf_iter_state state)
+{
+	switch (state) {
+	case BPF_ITER_STATE_ACTIVE:
+		return "active";
+	case BPF_ITER_STATE_DRAINED:
+		return "drained";
+	case BPF_ITER_STATE_INVALID:
+		return "<invalid>";
+	default:
+		WARN_ONCE(1, "unknown iter state %d\n", state);
+		return "<unknown>";
+	}
+}
+
 static void mark_reg_scratched(struct bpf_verifier_env *env, u32 regno)
 {
 	env->scratched_regs |= 1U << regno;
@@ -1118,6 +1152,157 @@ static bool is_dynptr_type_expected(struct bpf_verifier_env *env, struct bpf_reg
 	}
 }
 
+static void __mark_reg_known_zero(struct bpf_reg_state *reg);
+
+static int mark_stack_slots_iter(struct bpf_verifier_env *env,
+				 struct bpf_reg_state *reg, int insn_idx,
+				 struct btf *btf, u32 btf_id, int nr_slots)
+{
+	struct bpf_func_state *state = func(env, reg);
+	int spi, i, j, id;
+
+	spi = iter_get_spi(env, reg, nr_slots);
+	if (spi < 0)
+		return spi;
+
+	id = acquire_reference_state(env, insn_idx);
+	if (id < 0)
+		return id;
+
+	for (i = 0; i < nr_slots; i++) {
+		struct bpf_stack_state *slot = &state->stack[spi - i];
+		struct bpf_reg_state *st = &slot->spilled_ptr;
+
+		__mark_reg_known_zero(st);
+		st->type = PTR_TO_STACK; /* we don't have dedicated reg type */
+		st->live |= REG_LIVE_WRITTEN;
+		st->ref_obj_id = i == 0 ? id : 0;
+		st->iter.btf = btf;
+		st->iter.btf_id = btf_id;
+		st->iter.state = BPF_ITER_STATE_ACTIVE;
+		st->iter.depth = 0;
+
+		for (j = 0; j < BPF_REG_SIZE; j++)
+			slot->slot_type[j] = STACK_ITER;
+
+		mark_stack_slot_scratched(env, spi - i);
+	}
+
+	return 0;
+}
+
+static int unmark_stack_slots_iter(struct bpf_verifier_env *env,
+				   struct bpf_reg_state *reg, int nr_slots)
+{
+	struct bpf_func_state *state = func(env, reg);
+	int spi, i, j;
+
+	spi = iter_get_spi(env, reg, nr_slots);
+	if (spi < 0)
+		return spi;
+
+	for (i = 0; i < nr_slots; i++) {
+		struct bpf_stack_state *slot = &state->stack[spi - i];
+		struct bpf_reg_state *st = &slot->spilled_ptr;
+
+		if (i == 0)
+			WARN_ON_ONCE(release_reference(env, st->ref_obj_id));
+
+		__mark_reg_not_init(env, st);
+
+		/* see unmark_stack_slots_dynptr() for why we need to set REG_LIVE_WRITTEN */
+		st->live |= REG_LIVE_WRITTEN;
+
+		for (j = 0; j < BPF_REG_SIZE; j++)
+			slot->slot_type[j] = STACK_INVALID;
+
+		mark_stack_slot_scratched(env, spi - i);
+	}
+
+	return 0;
+}
+
+static bool is_iter_reg_valid_uninit(struct bpf_verifier_env *env,
+				     struct bpf_reg_state *reg, int nr_slots)
+{
+	struct bpf_func_state *state = func(env, reg);
+	int spi, i, j;
+
+	/* For -ERANGE (i.e. spi not falling into allocated stack slots), we
+	 * will do check_mem_access to check and update stack bounds later, so
+	 * return true for that case.
+	 */
+	spi = iter_get_spi(env, reg, nr_slots);
+	if (spi == -ERANGE)
+		return true;
+	if (spi < 0)
+		return false;
+
+	for (i = 0; i < nr_slots; i++) {
+		struct bpf_stack_state *slot = &state->stack[spi - i];
+
+		for (j = 0; j < BPF_REG_SIZE; j++)
+			if (slot->slot_type[j] == STACK_ITER)
+				return false;
+	}
+
+	return true;
+}
+
+static bool is_iter_reg_valid_init(struct bpf_verifier_env *env, struct bpf_reg_state *reg,
+				   struct btf *btf, u32 btf_id, int nr_slots)
+{
+	struct bpf_func_state *state = func(env, reg);
+	int spi, i, j;
+
+	spi = iter_get_spi(env, reg, nr_slots);
+	if (spi < 0)
+		return false;
+
+	for (i = 0; i < nr_slots; i++) {
+		struct bpf_stack_state *slot = &state->stack[spi - i];
+		struct bpf_reg_state *st = &slot->spilled_ptr;
+
+		/* only main (first) slot has ref_obj_id set */
+		if (i == 0 && !st->ref_obj_id)
+			return false;
+		if (i != 0 && st->ref_obj_id)
+			return false;
+		if (st->iter.btf != btf || st->iter.btf_id != btf_id)
+			return false;
+
+		for (j = 0; j < BPF_REG_SIZE; j++)
+			if (slot->slot_type[j] != STACK_ITER)
+				return false;
+	}
+
+	return true;
+}
+
+/* Check if given stack slot is "special":
+ *   - spilled register state (STACK_SPILL);
+ *   - dynptr state (STACK_DYNPTR);
+ *   - iter state (STACK_ITER).
+ */
+static bool is_stack_slot_special(const struct bpf_stack_state *stack)
+{
+	enum bpf_stack_slot_type type = stack->slot_type[BPF_REG_SIZE - 1];
+
+	switch (type) {
+	case STACK_SPILL:
+	case STACK_DYNPTR:
+	case STACK_ITER:
+		return true;
+	case STACK_INVALID:
+	case STACK_MISC:
+	case STACK_ZERO:
+		return false;
+	default:
+		WARN_ONCE(1, "unknown stack slot type %d\n", type);
+		return true;
+	}
+}
+
 /* The reg state of a pointer or a bounded scalar was saved when
  * it was spilled to the stack.
  */
@@ -1267,6 +1452,19 @@ static void print_verifier_state(struct bpf_verifier_env *env,
 			if (reg->ref_obj_id)
 				verbose(env, "(ref_id=%d)", reg->ref_obj_id);
 			break;
+		case STACK_ITER:
+			/* only main slot has ref_obj_id set; skip others */
+			reg = &state->stack[i].spilled_ptr;
+			if (!reg->ref_obj_id)
+				continue;
+
+			verbose(env, " fp%d", (-i - 1) * BPF_REG_SIZE);
+			print_liveness(env, reg->live);
+			verbose(env, "=iter_%s(ref_id=%d,state=%s,depth=%u)",
+				iter_type_str(reg->iter.btf, reg->iter.btf_id),
+				reg->ref_obj_id, iter_state_str(reg->iter.state),
+				reg->iter.depth);
+			break;
 		case STACK_MISC:
 		case STACK_ZERO:
 		default:
@@ -2710,6 +2908,25 @@ static int mark_dynptr_read(struct bpf_verifier_env *env, struct bpf_reg_state *
 			     state->stack[spi - 1].spilled_ptr.parent, REG_LIVE_READ64);
 }
 
+static int mark_iter_read(struct bpf_verifier_env *env, struct bpf_reg_state *reg,
+			  int spi, int nr_slots)
+{
+	struct bpf_func_state *state = func(env, reg);
+	int err, i;
+
+	for (i = 0; i < nr_slots; i++) {
+		struct bpf_reg_state *st = &state->stack[spi - i].spilled_ptr;
+
+		err = mark_reg_read(env, st, st->parent, REG_LIVE_READ64);
+		if (err)
+			return err;
+
+		mark_stack_slot_scratched(env, spi - i);
+	}
+
+	return 0;
+}
+
 /* This function is supposed to be used by the following 32-bit optimization
  * code only. It returns TRUE if the source or destination register operates
  * on 64-bit, otherwise return FALSE.
@@ -3691,8 +3908,8 @@ static int check_stack_write_fixed_off(struct bpf_verifier_env *env,
 
 		/* regular write of data into stack destroys any spilled ptr */
 		state->stack[spi].spilled_ptr.type = NOT_INIT;
-		/* Mark slots as STACK_MISC if they belonged to spilled ptr. */
-		if (is_spilled_reg(&state->stack[spi]))
+		/* Mark slots as STACK_MISC if they belonged to spilled ptr/dynptr/iter. */
+		if (is_stack_slot_special(&state->stack[spi]))
 			for (i = 0; i < BPF_REG_SIZE; i++)
 				scrub_spilled_slot(&state->stack[spi].slot_type[i]);
 
@@ -6506,6 +6723,203 @@ static int process_dynptr_func(struct bpf_verifier_env *env, int regno, int insn
 	return err;
 }
 
+static u32 iter_ref_obj_id(struct bpf_verifier_env *env, struct bpf_reg_state *reg, int spi)
+{
+	struct bpf_func_state *state = func(env, reg);
+
+	return state->stack[spi].spilled_ptr.ref_obj_id;
+}
+
+static bool is_iter_kfunc(struct bpf_kfunc_call_arg_meta *meta)
+{
+	return meta->kfunc_flags & (KF_ITER_NEW | KF_ITER_NEXT | KF_ITER_DESTROY);
+}
+
+static bool is_iter_new_kfunc(struct bpf_kfunc_call_arg_meta *meta)
+{
+	return meta->kfunc_flags & KF_ITER_NEW;
+}
+
+static bool is_iter_next_kfunc(struct bpf_kfunc_call_arg_meta *meta)
+{
+	return meta->kfunc_flags & KF_ITER_NEXT;
+}
+
+static bool is_iter_destroy_kfunc(struct bpf_kfunc_call_arg_meta *meta)
+{
+	return meta->kfunc_flags & KF_ITER_DESTROY;
+}
+
+static bool is_kfunc_arg_iter(struct bpf_kfunc_call_arg_meta *meta, int arg)
+{
+	/* btf_check_iter_kfuncs() guarantees that first argument of any iter
+	 * kfunc is iter state pointer
+	 */
+	return arg == 0 && is_iter_kfunc(meta);
+}
+
+static int process_iter_arg(struct bpf_verifier_env *env, int regno, int insn_idx,
+			    struct bpf_kfunc_call_arg_meta *meta)
+{
+	struct bpf_reg_state *regs = cur_regs(env), *reg = &regs[regno];
+	const struct btf_type *t;
+	const struct btf_param *arg;
+	int spi, err, i, nr_slots;
+	u32 btf_id;
+
+	/* btf_check_iter_kfuncs() ensures we don't need to validate anything here */
+	arg = &btf_params(meta->func_proto)[0];
+	t = btf_type_skip_modifiers(meta->btf, arg->type, NULL);	/* PTR */
+	t = btf_type_skip_modifiers(meta->btf, t->type, &btf_id);	/* STRUCT */
+	nr_slots = t->size / BPF_REG_SIZE;
+
+	spi = iter_get_spi(env, reg, nr_slots);
+	if (spi < 0 && spi != -ERANGE)
+		return spi;
+
+	meta->iter.spi = spi;
+	meta->iter.frameno = reg->frameno;
+
+	if (is_iter_new_kfunc(meta)) {
+		/* bpf_iter_<type>_new() expects pointer to uninit iter state */
+		if (!is_iter_reg_valid_uninit(env, reg, nr_slots)) {
+			verbose(env, "expected uninitialized iter_%s as arg #%d\n",
+				iter_type_str(meta->btf, btf_id), regno);
+			return -EINVAL;
+		}
+
+		for (i = 0; i < nr_slots * 8; i += BPF_REG_SIZE) {
+			err = check_mem_access(env, insn_idx, regno,
+					       i, BPF_DW, BPF_WRITE, -1, false);
+			if (err)
+				return err;
+		}
+
+		err = mark_stack_slots_iter(env, reg, insn_idx, meta->btf, btf_id, nr_slots);
+		if (err)
+			return err;
+	} else {
+		/* iter_next() or iter_destroy() expect initialized iter state*/
+		if (!is_iter_reg_valid_init(env, reg, meta->btf, btf_id, nr_slots)) {
+			verbose(env, "expected an initialized iter_%s as arg #%d\n",
+				iter_type_str(meta->btf, btf_id), regno);
+			return -EINVAL;
+		}
+
+		err = mark_iter_read(env, reg, spi, nr_slots);
+		if (err)
+			return err;
+
+		meta->ref_obj_id = iter_ref_obj_id(env, reg, spi);
+
+		if (is_iter_destroy_kfunc(meta)) {
+			err = unmark_stack_slots_iter(env, reg, nr_slots);
+			if (err)
+				return err;
+		}
+	}
+
+	return 0;
+}
+
+/* process_iter_next_call() is called when verifier gets to iterator's next
+ * "method" (e.g., bpf_iter_num_next() for numbers iterator) call. We'll refer
+ * to it as just "iter_next()" in comments below.
+ *
+ * BPF verifier relies on a crucial contract for any iter_next()
+ * implementation: it should *eventually* return NULL, and once that happens
+ * it should keep returning NULL. That is, once iterator exhausts elements to
+ * iterate, it should never reset or spuriously return new elements.
+ *
+ * With the assumption of such contract, process_iter_next_call() simulates
+ * a fork in the verifier state to validate loop logic correctness and safety
+ * without having to simulate infinite amount of iterations.
+ *
+ * In current state, we first assume that iter_next() returned NULL and
+ * iterator state is set to DRAINED (BPF_ITER_STATE_DRAINED). In such
+ * conditions we should not form an infinite loop and should eventually reach
+ * exit.
+ *
+ * Besides that, we also fork current state and enqueue it for later
+ * verification. In a forked state we keep iterator state as ACTIVE
+ * (BPF_ITER_STATE_ACTIVE) and assume non-NULL return from iter_next(). We
+ * also bump iteration depth to prevent erroneous infinite loop detection
+ * later on (see iter_active_depths_differ() comment for details). In this
+ * state we assume that we'll eventually loop back to another iter_next()
+ * calls (it could be in exactly same location or in some other instruction,
+ * it doesn't matter, we don't make any unnecessary assumptions about this,
+ * everything revolves around iterator state in a stack slot, not which
+ * instruction is calling iter_next()). When that happens, we either will come
+ * to iter_next() with equivalent state and can conclude that next iteration
+ * will proceed in exactly the same way as we just verified, so it's safe to
+ * assume that loop converges. If not, we'll go on another iteration
+ * simulation with a different input state, until all possible starting states
+ * are validated or we reach maximum number of instructions limit.
+ *
+ * This way, we will either exhaustively discover all possible input states
+ * that iterator loop can start with and eventually will converge, or we'll
+ * effectively regress into bounded loop simulation logic and either reach
+ * maximum number of instructions if loop is not provably convergent, or there
+ * is some statically known limit on number of iterations (e.g., if there is
+ * an explicit `if n > 100 then break;` statement somewhere in the loop).
+ *
+ * One very subtle but very important aspect is that we *always* simulate NULL
+ * condition first (as the current state) before we simulate non-NULL case.
+ * This has to do with intricacies of scalar precision tracking. By simulating
+ * "exit condition" of iter_next() returning NULL first, we make sure all the
+ * relevant precision marks *that will be set **after** we exit iterator loop*
+ * are propagated backwards to common parent state of NULL and non-NULL
+ * branches. Thanks to that, state equivalence checks done later in forked
+ * state, when reaching iter_next() for ACTIVE iterator, can assume that
+ * precision marks are finalized and won't change. Because simulating another
+ * ACTIVE iterator iteration won't change them (because given same input
+ * states we'll end up with exactly same output states which we are currently
+ * comparing; and verification after the loop already propagated back what
+ * needs to be **additionally** tracked as precise). It's subtle, grok
+ * precision tracking for more intuitive understanding.
+ */
+static int process_iter_next_call(struct bpf_verifier_env *env, int insn_idx,
+				  struct bpf_kfunc_call_arg_meta *meta)
+{
+	struct bpf_verifier_state *cur_st = env->cur_state, *queued_st;
+	struct bpf_func_state *cur_fr = cur_st->frame[cur_st->curframe], *queued_fr;
+	struct bpf_reg_state *cur_iter, *queued_iter;
+	int iter_frameno = meta->iter.frameno;
+	int iter_spi = meta->iter.spi;
+
+	BTF_TYPE_EMIT(struct bpf_iter);
+
+	cur_iter = &env->cur_state->frame[iter_frameno]->stack[iter_spi].spilled_ptr;
+
+	if (cur_iter->iter.state != BPF_ITER_STATE_ACTIVE &&
+	    cur_iter->iter.state != BPF_ITER_STATE_DRAINED) {
+		verbose(env, "verifier internal error: unexpected iterator state %d (%s)\n",
+			cur_iter->iter.state, iter_state_str(cur_iter->iter.state));
+		return -EFAULT;
+	}
+
+	if (cur_iter->iter.state == BPF_ITER_STATE_ACTIVE) {
+		/* branch out active iter state */
+		queued_st = push_stack(env, insn_idx + 1, insn_idx, false);
+		if (!queued_st)
+			return -ENOMEM;
+
+		queued_iter = &queued_st->frame[iter_frameno]->stack[iter_spi].spilled_ptr;
+		queued_iter->iter.state = BPF_ITER_STATE_ACTIVE;
+		queued_iter->iter.depth++;
+
+		queued_fr = queued_st->frame[queued_st->curframe];
+		mark_ptr_not_null_reg(&queued_fr->regs[BPF_REG_0]);
+	}
+
+	/* switch to DRAINED state, but keep the depth unchanged */
+	/* mark current iter state as drained and assume returned NULL */
+	cur_iter->iter.state = BPF_ITER_STATE_DRAINED;
+	__mark_reg_const_zero(&cur_fr->regs[BPF_REG_0]);
+
+	return 0;
+}
+
 static bool arg_type_is_mem_size(enum bpf_arg_type type)
 {
 	return type == ARG_CONST_SIZE ||
@@ -9099,6 +9513,7 @@ enum kfunc_ptr_arg_type {
 	KF_ARG_PTR_TO_ALLOC_BTF_ID,  /* Allocated object */
 	KF_ARG_PTR_TO_KPTR,	     /* PTR_TO_KPTR but type specific */
 	KF_ARG_PTR_TO_DYNPTR,
+	KF_ARG_PTR_TO_ITER,
 	KF_ARG_PTR_TO_LIST_HEAD,
 	KF_ARG_PTR_TO_LIST_NODE,
 	KF_ARG_PTR_TO_BTF_ID,	     /* Also covers reg2btf_ids conversions */
@@ -9220,6 +9635,9 @@ get_kfunc_ptr_arg_type(struct bpf_verifier_env *env,
 	if (is_kfunc_arg_dynptr(meta->btf, &args[argno]))
 		return KF_ARG_PTR_TO_DYNPTR;
 
+	if (is_kfunc_arg_iter(meta, argno))
+		return KF_ARG_PTR_TO_ITER;
+
 	if (is_kfunc_arg_list_head(meta->btf, &args[argno]))
 		return KF_ARG_PTR_TO_LIST_HEAD;
 
@@ -9848,6 +10266,7 @@ static int check_kfunc_args(struct bpf_verifier_env *env, struct bpf_kfunc_call_
 			break;
 		case KF_ARG_PTR_TO_KPTR:
 		case KF_ARG_PTR_TO_DYNPTR:
+		case KF_ARG_PTR_TO_ITER:
 		case KF_ARG_PTR_TO_LIST_HEAD:
 		case KF_ARG_PTR_TO_LIST_NODE:
 		case KF_ARG_PTR_TO_RB_ROOT:
@@ -9944,6 +10363,11 @@ static int check_kfunc_args(struct bpf_verifier_env *env, struct bpf_kfunc_call_
 
 			break;
 		}
+		case KF_ARG_PTR_TO_ITER:
+			ret = process_iter_arg(env, regno, insn_idx, meta);
+			if (ret < 0)
+				return ret;
+			break;
 		case KF_ARG_PTR_TO_LIST_HEAD:
 			if (reg->type != PTR_TO_MAP_VALUE &&
 			    reg->type != (PTR_TO_BTF_ID | MEM_ALLOC)) {
@@ -10148,6 +10572,8 @@ static int check_kfunc_call(struct bpf_verifier_env *env, struct bpf_insn *insn,
 	desc_btf = meta.btf;
 	insn_aux = &env->insn_aux_data[insn_idx];
 
+	insn_aux->is_iter_next = is_iter_next_kfunc(&meta);
+
 	if (is_kfunc_destructive(&meta) && !capable(CAP_SYS_BOOT)) {
 		verbose(env, "destructive kfunc calls require CAP_SYS_BOOT capability\n");
 		return -EACCES;
@@ -10436,6 +10862,12 @@ static int check_kfunc_call(struct bpf_verifier_env *env, struct bpf_insn *insn,
 			mark_btf_func_reg_size(env, regno, t->size);
 	}
 
+	if (is_iter_next_kfunc(&meta)) {
+		err = process_iter_next_call(env, insn_idx, &meta);
+		if (err)
+			return err;
+	}
+
 	return 0;
 }
 
@@ -13548,6 +13980,13 @@ static int visit_insn(int t, struct bpf_verifier_env *env)
 			 * async state will be pushed for further exploration.
 			 */
 			mark_prune_point(env, t);
+		if (insn->src_reg == BPF_PSEUDO_KFUNC_CALL) {
+			struct bpf_kfunc_call_arg_meta meta;
+
+			ret = fetch_kfunc_meta(env, insn, &meta, NULL);
+			if (ret == 0 && is_iter_next_kfunc(&meta))
+				mark_prune_point(env, t);
+		}
 		return visit_func_call_insn(t, insns, env, insn->src_reg == BPF_PSEUDO_CALL);
 
 	case BPF_JA:
@@ -14301,6 +14740,8 @@ static bool stacksafe(struct bpf_verifier_env *env, struct bpf_func_state *old,
 	 * didn't use them
 	 */
 	for (i = 0; i < old->allocated_stack; i++) {
+		struct bpf_reg_state *old_reg, *cur_reg;
+
 		spi = i / BPF_REG_SIZE;
 
 		if (!(old->stack[spi].spilled_ptr.live & REG_LIVE_READ)) {
@@ -14357,9 +14798,6 @@ static bool stacksafe(struct bpf_verifier_env *env, struct bpf_func_state *old,
 				return false;
 			break;
 		case STACK_DYNPTR:
-		{
-			const struct bpf_reg_state *old_reg, *cur_reg;
-
 			old_reg = &old->stack[spi].spilled_ptr;
 			cur_reg = &cur->stack[spi].spilled_ptr;
 			if (old_reg->dynptr.type != cur_reg->dynptr.type ||
@@ -14367,7 +14805,22 @@ static bool stacksafe(struct bpf_verifier_env *env, struct bpf_func_state *old,
 			    !check_ids(old_reg->ref_obj_id, cur_reg->ref_obj_id, idmap))
 				return false;
 			break;
-		}
+		case STACK_ITER:
+			old_reg = &old->stack[spi].spilled_ptr;
+			cur_reg = &cur->stack[spi].spilled_ptr;
+			/* iter.depth is not compared between states as it
+			 * doesn't matter for correctness and would otherwise
+			 * prevent convergence; we maintain it only to prevent
+			 * infinite loop check triggering, see
+			 * iter_active_depths_differ()
+			 */
+			if (old_reg->iter.btf != cur_reg->iter.btf ||
+			    old_reg->iter.btf_id != cur_reg->iter.btf_id ||
+			    old_reg->iter.state != cur_reg->iter.state ||
+			    /* ignore {old_reg,cur_reg}->iter.depth, see above */
+			    !check_ids(old_reg->ref_obj_id, cur_reg->ref_obj_id, idmap))
+				return false;
+			break;
 		case STACK_MISC:
 		case STACK_ZERO:
 		case STACK_INVALID:
@@ -14626,6 +15079,92 @@ static bool states_maybe_looping(struct bpf_verifier_state *old,
 	return true;
 }
 
+static bool is_iter_next_insn(struct bpf_verifier_env *env, int insn_idx)
+{
+	return env->insn_aux_data[insn_idx].is_iter_next;
+}
+
+/* is_state_visited() handles iter_next() (see process_iter_next_call() for
+ * terminology) calls specially: as opposed to bounded BPF loops, it *expects*
+ * states to match, which otherwise would look like an infinite loop. So while
+ * iter_next() calls are taken care of, we still need to be careful and
+ * prevent erroneous and too eager declaration of "ininite loop", when
+ * iterators are involved.
+ *
+ * Here's a situation in pseudo-BPF assembly form:
+ *
+ *   0: again:                          ; set up iter_next() call args
+ *   1:   r1 = &it                      ; <CHECKPOINT HERE>
+ *   2:   call bpf_iter_num_next        ; this is iter_next() call
+ *   3:   if r0 == 0 goto done
+ *   4:   ... something useful here ...
+ *   5:   goto again                    ; another iteration
+ *   6: done:
+ *   7:   r1 = &it
+ *   8:   call bpf_iter_num_destroy     ; clean up iter state
+ *   9:   exit
+ *
+ * This is a typical loop. Let's assume that we have a prune point at 1:,
+ * before we get to `call bpf_iter_num_next` (e.g., because of that `goto
+ * again`, assuming other heuristics don't get in a way).
+ *
+ * When we first time come to 1:, let's say we have some state X. We proceed
+ * to 2:, fork states, enqueue ACTIVE, validate NULL case successfully, exit.
+ * Now we come back to validate that forked ACTIVE state. We proceed through
+ * 3-5, come to goto, jump to 1:. Let's assume our state didn't change, so we
+ * are converging. But the problem is that we don't know that yet, as this
+ * convergence has to happen at iter_next() call site only. So if nothing is
+ * done, at 1: verifier will use bounded loop logic and declare infinite
+ * looping (and would be *technically* correct, if not for iterator's
+ * "eventual sticky NULL" contract, see process_iter_next_call()). But we
+ * don't want that. So what we do in process_iter_next_call() when we go on
+ * another ACTIVE iteration, we bump slot->iter.depth, to mark that it's
+ * a different iteration. So when we suspect an infinite loop, we additionally
+ * check if any of the *ACTIVE* iterator states depths differ. If yes, we
+ * pretend we are not looping and wait for next iter_next() call.
+ *
+ * This only applies to ACTIVE state. In DRAINED state we don't expect to
+ * loop, because that would actually mean infinite loop, as DRAINED state is
+ * "sticky", and so we'll keep returning into the same instruction with the
+ * same state (at least in one of possible code paths).
+ *
+ * This approach allows to keep infinite loop heuristic even in the face of
+ * active iterator. E.g., C snippet below is and will be detected as
+ * inifintely looping:
+ *
+ *   struct bpf_iter_num it;
+ *   int *p, x;
+ *
+ *   bpf_iter_num_new(&it, 0, 10);
+ *   while ((p = bpf_iter_num_next(&t))) {
+ *       x = p;
+ *       while (x--) {} // <<-- infinite loop here
+ *   }
+ *
+ */
+static bool iter_active_depths_differ(struct bpf_verifier_state *old, struct bpf_verifier_state *cur)
+{
+	struct bpf_reg_state *slot, *cur_slot;
+	struct bpf_func_state *state;
+	int i, fr;
+
+	for (fr = old->curframe; fr >= 0; fr--) {
+		state = old->frame[fr];
+		for (i = 0; i < state->allocated_stack / BPF_REG_SIZE; i++) {
+			if (state->stack[i].slot_type[0] != STACK_ITER)
+				continue;
+
+			slot = &state->stack[i].spilled_ptr;
+			if (slot->iter.state != BPF_ITER_STATE_ACTIVE)
+				continue;
+
+			cur_slot = &cur->frame[fr]->stack[i].spilled_ptr;
+			if (cur_slot->iter.depth != slot->iter.depth)
+				return true;
+		}
+	}
+	return false;
+}
 
 static int is_state_visited(struct bpf_verifier_env *env, int insn_idx)
 {
@@ -14673,8 +15212,46 @@ static int is_state_visited(struct bpf_verifier_env *env, int insn_idx)
 				 * Since the verifier still needs to catch infinite loops
 				 * inside async callbacks.
 				 */
-			} else if (states_maybe_looping(&sl->state, cur) &&
-				   states_equal(env, &sl->state, cur)) {
+				goto skip_inf_loop_check;
+			}
+			/* BPF open-coded iterators loop detection is special.
+			 * states_maybe_looping() logic is too simplistic in detecting
+			 * states that *might* be equivalent, because it doesn't know
+			 * about ID remapping, so don't even perform it.
+			 * See process_iter_next_call() and iter_active_depths_differ()
+			 * for overview of the logic. When current and one of parent
+			 * states are detected as equivalent, it's a good thing: we prove
+			 * convergence and can stop simulating further iterations.
+			 * It's safe to assume that iterator loop will finish, taking into
+			 * account iter_next() contract of eventually returning
+			 * sticky NULL result.
+			 */
+			if (is_iter_next_insn(env, insn_idx)) {
+				if (states_equal(env, &sl->state, cur)) {
+					struct bpf_func_state *cur_frame;
+					struct bpf_reg_state *iter_state, *iter_reg;
+					int spi;
+
+					cur_frame = cur->frame[cur->curframe];
+					/* btf_check_iter_kfuncs() enforces that
+					 * iter state pointer is always the first arg
+					 */
+					iter_reg = &cur_frame->regs[BPF_REG_1];
+					/* current state is valid due to states_equal(),
+					 * so we can assume valid iter and reg state,
+					 * no need for extra (re-)validations
+					 */
+					spi = __get_spi(iter_reg->off + iter_reg->var_off.value);
+					iter_state = &func(env, iter_reg)->stack[spi].spilled_ptr;
+					if (iter_state->iter.state == BPF_ITER_STATE_ACTIVE)
+						goto hit;
+				}
+				goto skip_inf_loop_check;
+			}
+			/* attempt to detect infinite loop to avoid unnecessary doomed work */
+			if (states_maybe_looping(&sl->state, cur) &&
+			    states_equal(env, &sl->state, cur) &&
+			    !iter_active_depths_differ(&sl->state, cur)) {
 				verbose_linfo(env, insn_idx, "; ");
 				verbose(env, "infinite loop detected at insn %d\n", insn_idx);
 				return -EINVAL;
@@ -14691,6 +15268,7 @@ static int is_state_visited(struct bpf_verifier_env *env, int insn_idx)
 			 * This threshold shouldn't be too high either, since states
 			 * at the end of the loop are likely to be useful in pruning.
 			 */
+skip_inf_loop_check:
 			if (!env->test_state_freq &&
 			    env->jmps_processed - env->prev_jmps_processed < 20 &&
 			    env->insn_processed - env->prev_insn_processed < 100)
@@ -14698,6 +15276,7 @@ static int is_state_visited(struct bpf_verifier_env *env, int insn_idx)
 			goto miss;
 		}
 		if (states_equal(env, &sl->state, cur)) {
+hit:
 			sl->hit_cnt++;
 			/* reached equivalent register/stack state,
 			 * prune the search.
-- 
cgit v1.2.3


From 6018e1f407cccf39b804d1f75ad4de7be4e6cc45 Mon Sep 17 00:00:00 2001
From: Andrii Nakryiko <andrii@kernel.org>
Date: Wed, 8 Mar 2023 10:41:17 -0800
Subject: bpf: implement numbers iterator

Implement the first open-coded iterator type over a range of integers.

It's public API consists of:
  - bpf_iter_num_new() constructor, which accepts [start, end) range
    (that is, start is inclusive, end is exclusive).
  - bpf_iter_num_next() which will keep returning read-only pointer to int
    until the range is exhausted, at which point NULL will be returned.
    If bpf_iter_num_next() is kept calling after this, NULL will be
    persistently returned.
  - bpf_iter_num_destroy() destructor, which needs to be called at some
    point to clean up iterator state. BPF verifier enforces that iterator
    destructor is called at some point before BPF program exits.

Note that `start = end = X` is a valid combination to setup an empty
iterator. bpf_iter_num_new() will return 0 (success) for any such
combination.

If bpf_iter_num_new() detects invalid combination of input arguments, it
returns error, resets iterator state to, effectively, empty iterator, so
any subsequent call to bpf_iter_num_next() will keep returning NULL.

BPF verifier has no knowledge that returned integers are in the
[start, end) value range, as both `start` and `end` are not statically
known and enforced: they are runtime values.

While the implementation is pretty trivial, some care needs to be taken
to avoid overflows and underflows. Subsequent selftests will validate
correctness of [start, end) semantics, especially around extremes
(INT_MIN and INT_MAX).

Similarly to bpf_loop(), we enforce that no more than BPF_MAX_LOOPS can
be specified.

bpf_iter_num_{new,next,destroy}() is a logical evolution from bounded
BPF loops and bpf_loop() helper and is the basis for implementing
ergonomic BPF loops with no statically known or verified bounds.
Subsequent patches implement bpf_for() macro, demonstrating how this can
be wrapped into something that works and feels like a normal for() loop
in C language.

Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
Link: https://lore.kernel.org/r/20230308184121.1165081-5-andrii@kernel.org
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 include/linux/bpf.h            |  8 +++--
 include/uapi/linux/bpf.h       |  8 +++++
 kernel/bpf/bpf_iter.c          | 70 ++++++++++++++++++++++++++++++++++++++++++
 kernel/bpf/helpers.c           |  3 ++
 tools/include/uapi/linux/bpf.h |  8 +++++
 5 files changed, 95 insertions(+), 2 deletions(-)

(limited to 'kernel')

diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index 6792a7940e1e..e64ff1e89fb2 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -1617,8 +1617,12 @@ struct bpf_array {
 #define BPF_COMPLEXITY_LIMIT_INSNS      1000000 /* yes. 1M insns */
 #define MAX_TAIL_CALL_CNT 33
 
-/* Maximum number of loops for bpf_loop */
-#define BPF_MAX_LOOPS	BIT(23)
+/* Maximum number of loops for bpf_loop and bpf_iter_num.
+ * It's enum to expose it (and thus make it discoverable) through BTF.
+ */
+enum {
+	BPF_MAX_LOOPS = 8 * 1024 * 1024,
+};
 
 #define BPF_F_ACCESS_MASK	(BPF_F_RDONLY |		\
 				 BPF_F_RDONLY_PROG |	\
diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index 976b194eb775..4abddb668a10 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -7112,4 +7112,12 @@ enum {
 	BPF_F_TIMER_ABS = (1ULL << 0),
 };
 
+/* BPF numbers iterator state */
+struct bpf_iter_num {
+	/* opaque iterator state; having __u64 here allows to preserve correct
+	 * alignment requirements in vmlinux.h, generated from BTF
+	 */
+	__u64 __opaque[1];
+} __attribute__((aligned(8)));
+
 #endif /* _UAPI__LINUX_BPF_H__ */
diff --git a/kernel/bpf/bpf_iter.c b/kernel/bpf/bpf_iter.c
index 5dc307bdeaeb..96856f130cbf 100644
--- a/kernel/bpf/bpf_iter.c
+++ b/kernel/bpf/bpf_iter.c
@@ -776,3 +776,73 @@ const struct bpf_func_proto bpf_loop_proto = {
 	.arg3_type	= ARG_PTR_TO_STACK_OR_NULL,
 	.arg4_type	= ARG_ANYTHING,
 };
+
+struct bpf_iter_num_kern {
+	int cur; /* current value, inclusive */
+	int end; /* final value, exclusive */
+} __aligned(8);
+
+__diag_push();
+__diag_ignore_all("-Wmissing-prototypes",
+		  "Global functions as their definitions will be in vmlinux BTF");
+
+__bpf_kfunc int bpf_iter_num_new(struct bpf_iter_num *it, int start, int end)
+{
+	struct bpf_iter_num_kern *s = (void *)it;
+
+	BUILD_BUG_ON(sizeof(struct bpf_iter_num_kern) != sizeof(struct bpf_iter_num));
+	BUILD_BUG_ON(__alignof__(struct bpf_iter_num_kern) != __alignof__(struct bpf_iter_num));
+
+	BTF_TYPE_EMIT(struct btf_iter_num);
+
+	/* start == end is legit, it's an empty range and we'll just get NULL
+	 * on first (and any subsequent) bpf_iter_num_next() call
+	 */
+	if (start > end) {
+		s->cur = s->end = 0;
+		return -EINVAL;
+	}
+
+	/* avoid overflows, e.g., if start == INT_MIN and end == INT_MAX */
+	if ((s64)end - (s64)start > BPF_MAX_LOOPS) {
+		s->cur = s->end = 0;
+		return -E2BIG;
+	}
+
+	/* user will call bpf_iter_num_next() first,
+	 * which will set s->cur to exactly start value;
+	 * underflow shouldn't matter
+	 */
+	s->cur = start - 1;
+	s->end = end;
+
+	return 0;
+}
+
+__bpf_kfunc int *bpf_iter_num_next(struct bpf_iter_num* it)
+{
+	struct bpf_iter_num_kern *s = (void *)it;
+
+	/* check failed initialization or if we are done (same behavior);
+	 * need to be careful about overflow, so convert to s64 for checks,
+	 * e.g., if s->cur == s->end == INT_MAX, we can't just do
+	 * s->cur + 1 >= s->end
+	 */
+	if ((s64)(s->cur + 1) >= s->end) {
+		s->cur = s->end = 0;
+		return NULL;
+	}
+
+	s->cur++;
+
+	return &s->cur;
+}
+
+__bpf_kfunc void bpf_iter_num_destroy(struct bpf_iter_num *it)
+{
+	struct bpf_iter_num_kern *s = (void *)it;
+
+	s->cur = s->end = 0;
+}
+
+__diag_pop();
diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c
index 637ac4e92e75..f9b7eeedce08 100644
--- a/kernel/bpf/helpers.c
+++ b/kernel/bpf/helpers.c
@@ -2411,6 +2411,9 @@ BTF_ID_FLAGS(func, bpf_rcu_read_lock)
 BTF_ID_FLAGS(func, bpf_rcu_read_unlock)
 BTF_ID_FLAGS(func, bpf_dynptr_slice, KF_RET_NULL)
 BTF_ID_FLAGS(func, bpf_dynptr_slice_rdwr, KF_RET_NULL)
+BTF_ID_FLAGS(func, bpf_iter_num_new, KF_ITER_NEW)
+BTF_ID_FLAGS(func, bpf_iter_num_next, KF_ITER_NEXT | KF_RET_NULL)
+BTF_ID_FLAGS(func, bpf_iter_num_destroy, KF_ITER_DESTROY)
 BTF_SET8_END(common_btf_ids)
 
 static const struct btf_kfunc_id_set common_kfunc_set = {
diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h
index 976b194eb775..4abddb668a10 100644
--- a/tools/include/uapi/linux/bpf.h
+++ b/tools/include/uapi/linux/bpf.h
@@ -7112,4 +7112,12 @@ enum {
 	BPF_F_TIMER_ABS = (1ULL << 0),
 };
 
+/* BPF numbers iterator state */
+struct bpf_iter_num {
+	/* opaque iterator state; having __u64 here allows to preserve correct
+	 * alignment requirements in vmlinux.h, generated from BTF
+	 */
+	__u64 __opaque[1];
+} __attribute__((aligned(8)));
+
 #endif /* _UAPI__LINUX_BPF_H__ */
-- 
cgit v1.2.3


From 4b5ce570dbef57a20acdd71b0c65376009012354 Mon Sep 17 00:00:00 2001
From: Andrii Nakryiko <andrii@kernel.org>
Date: Thu, 9 Mar 2023 22:01:49 -0800
Subject: bpf: ensure state checkpointing at iter_next() call sites

State equivalence check and checkpointing performed in is_state_visited()
employs certain heuristics to try to save memory by avoiding state checkpoints
if not enough jumps and instructions happened since last checkpoint. This leads
to unpredictability of whether a particular instruction will be checkpointed
and how regularly. While normally this is not causing much problems (except
inconveniences for predictable verifier tests, which we overcome with
BPF_F_TEST_STATE_FREQ flag), turns out it's not the case for open-coded
iterators.

Checking and saving state checkpoints at iter_next() call is crucial for fast
convergence of open-coded iterator loop logic, so we need to force it. If we
don't do that, is_state_visited() might skip saving a checkpoint, causing
unnecessarily long sequence of not checkpointed instructions and jumps, leading
to exhaustion of jump history buffer, and potentially other undesired outcomes.
It is expected that with correct open-coded iterators convergence will happen
quickly, so we don't run a risk of exhausting memory.

This patch adds, in addition to prune and jump instruction marks, also a
"forced checkpoint" mark, and makes sure that any iter_next() call instruction
is marked as such.

Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
Link: https://lore.kernel.org/r/20230310060149.625887-1-andrii@kernel.org
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 include/linux/bpf_verifier.h |  6 +++++-
 kernel/bpf/verifier.c        | 31 ++++++++++++++++++++++++++++---
 2 files changed, 33 insertions(+), 4 deletions(-)

(limited to 'kernel')

diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h
index 0c052bc79940..81d525d057c7 100644
--- a/include/linux/bpf_verifier.h
+++ b/include/linux/bpf_verifier.h
@@ -477,8 +477,12 @@ struct bpf_insn_aux_data {
 
 	/* below fields are initialized once */
 	unsigned int orig_idx; /* original instruction index */
-	bool prune_point;
 	bool jmp_point;
+	bool prune_point;
+	/* ensure we check state equivalence and save state checkpoint and
+	 * this instruction, regardless of any heuristics
+	 */
+	bool force_checkpoint;
 };
 
 #define MAX_USED_MAPS 64 /* max number of maps accessed by one eBPF program */
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index 45a082284464..13fd4c893f3b 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -13865,6 +13865,17 @@ static bool is_prune_point(struct bpf_verifier_env *env, int insn_idx)
 	return env->insn_aux_data[insn_idx].prune_point;
 }
 
+static void mark_force_checkpoint(struct bpf_verifier_env *env, int idx)
+{
+	env->insn_aux_data[idx].force_checkpoint = true;
+}
+
+static bool is_force_checkpoint(struct bpf_verifier_env *env, int insn_idx)
+{
+	return env->insn_aux_data[insn_idx].force_checkpoint;
+}
+
+
 enum {
 	DONE_EXPLORING = 0,
 	KEEP_EXPLORING = 1,
@@ -13984,8 +13995,21 @@ static int visit_insn(int t, struct bpf_verifier_env *env)
 			struct bpf_kfunc_call_arg_meta meta;
 
 			ret = fetch_kfunc_meta(env, insn, &meta, NULL);
-			if (ret == 0 && is_iter_next_kfunc(&meta))
+			if (ret == 0 && is_iter_next_kfunc(&meta)) {
 				mark_prune_point(env, t);
+				/* Checking and saving state checkpoints at iter_next() call
+				 * is crucial for fast convergence of open-coded iterator loop
+				 * logic, so we need to force it. If we don't do that,
+				 * is_state_visited() might skip saving a checkpoint, causing
+				 * unnecessarily long sequence of not checkpointed
+				 * instructions and jumps, leading to exhaustion of jump
+				 * history buffer, and potentially other undesired outcomes.
+				 * It is expected that with correct open-coded iterators
+				 * convergence will happen quickly, so we don't run a risk of
+				 * exhausting memory.
+				 */
+				mark_force_checkpoint(env, t);
+			}
 		}
 		return visit_func_call_insn(t, insns, env, insn->src_reg == BPF_PSEUDO_CALL);
 
@@ -15172,7 +15196,8 @@ static int is_state_visited(struct bpf_verifier_env *env, int insn_idx)
 	struct bpf_verifier_state_list *sl, **pprev;
 	struct bpf_verifier_state *cur = env->cur_state, *new;
 	int i, j, err, states_cnt = 0;
-	bool add_new_state = env->test_state_freq ? true : false;
+	bool force_new_state = env->test_state_freq || is_force_checkpoint(env, insn_idx);
+	bool add_new_state = force_new_state;
 
 	/* bpf progs typically have pruning point every 4 instructions
 	 * http://vger.kernel.org/bpfconf2019.html#session-1
@@ -15269,7 +15294,7 @@ static int is_state_visited(struct bpf_verifier_env *env, int insn_idx)
 			 * at the end of the loop are likely to be useful in pruning.
 			 */
 skip_inf_loop_check:
-			if (!env->test_state_freq &&
+			if (!force_new_state &&
 			    env->jmps_processed - env->prev_jmps_processed < 20 &&
 			    env->insn_processed - env->prev_insn_processed < 100)
 				add_new_state = false;
-- 
cgit v1.2.3


From 52c2b005a3c18c565fc70cfd0ca49375f301e952 Mon Sep 17 00:00:00 2001
From: Andrii Nakryiko <andrii@kernel.org>
Date: Thu, 9 Mar 2023 14:41:31 -0800
Subject: bpf: take into account liveness when propagating precision

When doing state comparison, if old state has register that is not
marked as REG_LIVE_READ, then we just skip comparison, regardless what's
the state of corresponing register in current state. This is because not
REG_LIVE_READ register is irrelevant for further program execution and
correctness. All good here.

But when we get to precision propagation, after two states were declared
equivalent, we don't take into account old register's liveness, and thus
attempt to propagate precision for register in current state even if
that register in old state was not REG_LIVE_READ anymore. This is bad,
because register in current state could be anything at all and this
could cause -EFAULT due to internal logic bugs.

Fix by taking into account REG_LIVE_READ liveness mark to keep the logic
in state comparison in sync with precision propagation.

Fixes: a3ce685dd01a ("bpf: fix precision tracking")
Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
Link: https://lore.kernel.org/r/20230309224131.57449-1-andrii@kernel.org
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 kernel/bpf/verifier.c | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

(limited to 'kernel')

diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index 13fd4c893f3b..0aaf7703326b 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -15058,7 +15058,8 @@ static int propagate_precision(struct bpf_verifier_env *env,
 		state_reg = state->regs;
 		for (i = 0; i < BPF_REG_FP; i++, state_reg++) {
 			if (state_reg->type != SCALAR_VALUE ||
-			    !state_reg->precise)
+			    !state_reg->precise ||
+			    !(state_reg->live & REG_LIVE_READ))
 				continue;
 			if (env->log.level & BPF_LOG_LEVEL2)
 				verbose(env, "frame %d: propagating r%d\n", i, fr);
@@ -15072,7 +15073,8 @@ static int propagate_precision(struct bpf_verifier_env *env,
 				continue;
 			state_reg = &state->stack[i].spilled_ptr;
 			if (state_reg->type != SCALAR_VALUE ||
-			    !state_reg->precise)
+			    !state_reg->precise ||
+			    !(state_reg->live & REG_LIVE_READ))
 				continue;
 			if (env->log.level & BPF_LOG_LEVEL2)
 				verbose(env, "frame %d: propagating fp%d\n",
-- 
cgit v1.2.3


From 4cbd23cc92c49173e402753cab62b8a7754ed18f Mon Sep 17 00:00:00 2001
From: Martin KaFai Lau <martin.lau@kernel.org>
Date: Tue, 7 Mar 2023 22:59:20 -0800
Subject: bpf: Move a few bpf_local_storage functions to static scope

This patch moves the bpf_local_storage_free_rcu() and
bpf_selem_unlink_map() to static because they are
not used outside of bpf_local_storage.c.

Signed-off-by: Martin KaFai Lau <martin.lau@kernel.org>
Link: https://lore.kernel.org/r/20230308065936.1550103-2-martin.lau@linux.dev
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 include/linux/bpf_local_storage.h | 3 ---
 kernel/bpf/bpf_local_storage.c    | 4 ++--
 2 files changed, 2 insertions(+), 5 deletions(-)

(limited to 'kernel')

diff --git a/include/linux/bpf_local_storage.h b/include/linux/bpf_local_storage.h
index d934248b8e81..502ad7093f13 100644
--- a/include/linux/bpf_local_storage.h
+++ b/include/linux/bpf_local_storage.h
@@ -147,8 +147,6 @@ void bpf_selem_unlink(struct bpf_local_storage_elem *selem, bool use_trace_rcu);
 void bpf_selem_link_map(struct bpf_local_storage_map *smap,
 			struct bpf_local_storage_elem *selem);
 
-void bpf_selem_unlink_map(struct bpf_local_storage_elem *selem);
-
 struct bpf_local_storage_elem *
 bpf_selem_alloc(struct bpf_local_storage_map *smap, void *owner, void *value,
 		bool charge_mem, gfp_t gfp_flags);
@@ -163,7 +161,6 @@ struct bpf_local_storage_data *
 bpf_local_storage_update(void *owner, struct bpf_local_storage_map *smap,
 			 void *value, u64 map_flags, gfp_t gfp_flags);
 
-void bpf_local_storage_free_rcu(struct rcu_head *rcu);
 u64 bpf_local_storage_map_mem_usage(const struct bpf_map *map);
 
 #endif /* _BPF_LOCAL_STORAGE_H */
diff --git a/kernel/bpf/bpf_local_storage.c b/kernel/bpf/bpf_local_storage.c
index d3ba3f2db640..1904a4245ebe 100644
--- a/kernel/bpf/bpf_local_storage.c
+++ b/kernel/bpf/bpf_local_storage.c
@@ -95,7 +95,7 @@ bpf_selem_alloc(struct bpf_local_storage_map *smap, void *owner,
 	return NULL;
 }
 
-void bpf_local_storage_free_rcu(struct rcu_head *rcu)
+static void bpf_local_storage_free_rcu(struct rcu_head *rcu)
 {
 	struct bpf_local_storage *local_storage;
 
@@ -251,7 +251,7 @@ void bpf_selem_link_storage_nolock(struct bpf_local_storage *local_storage,
 	hlist_add_head_rcu(&selem->snode, &local_storage->list);
 }
 
-void bpf_selem_unlink_map(struct bpf_local_storage_elem *selem)
+static void bpf_selem_unlink_map(struct bpf_local_storage_elem *selem)
 {
 	struct bpf_local_storage_map *smap;
 	struct bpf_local_storage_map_bucket *b;
-- 
cgit v1.2.3


From 2ffcb6fc50174d1efc8f98633eb2647d84483c68 Mon Sep 17 00:00:00 2001
From: Martin KaFai Lau <martin.lau@kernel.org>
Date: Tue, 7 Mar 2023 22:59:21 -0800
Subject: bpf: Refactor codes into bpf_local_storage_destroy

This patch first renames bpf_local_storage_unlink_nolock to
bpf_local_storage_destroy(). It better reflects that it is only
used when the storage's owner (sk/task/cgrp/inode) is being kfree().

All bpf_local_storage_destroy's caller is taking the spin lock and
then free the storage. This patch also moves these two steps into
the bpf_local_storage_destroy.

This is a preparation work for a later patch that uses
bpf_mem_cache_alloc/free in the bpf_local_storage.

Signed-off-by: Martin KaFai Lau <martin.lau@kernel.org>
Link: https://lore.kernel.org/r/20230308065936.1550103-3-martin.lau@linux.dev
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 include/linux/bpf_local_storage.h | 2 +-
 kernel/bpf/bpf_cgrp_storage.c     | 9 +--------
 kernel/bpf/bpf_inode_storage.c    | 8 +-------
 kernel/bpf/bpf_local_storage.c    | 8 ++++++--
 kernel/bpf/bpf_task_storage.c     | 9 +--------
 net/core/bpf_sk_storage.c         | 8 +-------
 6 files changed, 11 insertions(+), 33 deletions(-)

(limited to 'kernel')

diff --git a/include/linux/bpf_local_storage.h b/include/linux/bpf_local_storage.h
index 502ad7093f13..5908a954ddc2 100644
--- a/include/linux/bpf_local_storage.h
+++ b/include/linux/bpf_local_storage.h
@@ -128,7 +128,7 @@ bpf_local_storage_lookup(struct bpf_local_storage *local_storage,
 			 struct bpf_local_storage_map *smap,
 			 bool cacheit_lockit);
 
-bool bpf_local_storage_unlink_nolock(struct bpf_local_storage *local_storage);
+void bpf_local_storage_destroy(struct bpf_local_storage *local_storage);
 
 void bpf_local_storage_map_free(struct bpf_map *map,
 				struct bpf_local_storage_cache *cache,
diff --git a/kernel/bpf/bpf_cgrp_storage.c b/kernel/bpf/bpf_cgrp_storage.c
index 9ae07aedaf23..492594d69a86 100644
--- a/kernel/bpf/bpf_cgrp_storage.c
+++ b/kernel/bpf/bpf_cgrp_storage.c
@@ -46,8 +46,6 @@ static struct bpf_local_storage __rcu **cgroup_storage_ptr(void *owner)
 void bpf_cgrp_storage_free(struct cgroup *cgroup)
 {
 	struct bpf_local_storage *local_storage;
-	bool free_cgroup_storage = false;
-	unsigned long flags;
 
 	rcu_read_lock();
 	local_storage = rcu_dereference(cgroup->bpf_cgrp_storage);
@@ -57,14 +55,9 @@ void bpf_cgrp_storage_free(struct cgroup *cgroup)
 	}
 
 	bpf_cgrp_storage_lock();
-	raw_spin_lock_irqsave(&local_storage->lock, flags);
-	free_cgroup_storage = bpf_local_storage_unlink_nolock(local_storage);
-	raw_spin_unlock_irqrestore(&local_storage->lock, flags);
+	bpf_local_storage_destroy(local_storage);
 	bpf_cgrp_storage_unlock();
 	rcu_read_unlock();
-
-	if (free_cgroup_storage)
-		kfree_rcu(local_storage, rcu);
 }
 
 static struct bpf_local_storage_data *
diff --git a/kernel/bpf/bpf_inode_storage.c b/kernel/bpf/bpf_inode_storage.c
index 43e2619c8167..2d25bcfa371b 100644
--- a/kernel/bpf/bpf_inode_storage.c
+++ b/kernel/bpf/bpf_inode_storage.c
@@ -57,7 +57,6 @@ static struct bpf_local_storage_data *inode_storage_lookup(struct inode *inode,
 void bpf_inode_storage_free(struct inode *inode)
 {
 	struct bpf_local_storage *local_storage;
-	bool free_inode_storage = false;
 	struct bpf_storage_blob *bsb;
 
 	bsb = bpf_inode(inode);
@@ -72,13 +71,8 @@ void bpf_inode_storage_free(struct inode *inode)
 		return;
 	}
 
-	raw_spin_lock_bh(&local_storage->lock);
-	free_inode_storage = bpf_local_storage_unlink_nolock(local_storage);
-	raw_spin_unlock_bh(&local_storage->lock);
+	bpf_local_storage_destroy(local_storage);
 	rcu_read_unlock();
-
-	if (free_inode_storage)
-		kfree_rcu(local_storage, rcu);
 }
 
 static void *bpf_fd_inode_storage_lookup_elem(struct bpf_map *map, void *key)
diff --git a/kernel/bpf/bpf_local_storage.c b/kernel/bpf/bpf_local_storage.c
index 1904a4245ebe..e19f9f50a60d 100644
--- a/kernel/bpf/bpf_local_storage.c
+++ b/kernel/bpf/bpf_local_storage.c
@@ -652,11 +652,12 @@ int bpf_local_storage_map_check_btf(const struct bpf_map *map,
 	return 0;
 }
 
-bool bpf_local_storage_unlink_nolock(struct bpf_local_storage *local_storage)
+void bpf_local_storage_destroy(struct bpf_local_storage *local_storage)
 {
 	struct bpf_local_storage_elem *selem;
 	bool free_storage = false;
 	struct hlist_node *n;
+	unsigned long flags;
 
 	/* Neither the bpf_prog nor the bpf_map's syscall
 	 * could be modifying the local_storage->list now.
@@ -667,6 +668,7 @@ bool bpf_local_storage_unlink_nolock(struct bpf_local_storage *local_storage)
 	 * when unlinking elem from the local_storage->list and
 	 * the map's bucket->list.
 	 */
+	raw_spin_lock_irqsave(&local_storage->lock, flags);
 	hlist_for_each_entry_safe(selem, n, &local_storage->list, snode) {
 		/* Always unlink from map before unlinking from
 		 * local_storage.
@@ -681,8 +683,10 @@ bool bpf_local_storage_unlink_nolock(struct bpf_local_storage *local_storage)
 		free_storage = bpf_selem_unlink_storage_nolock(
 			local_storage, selem, false, false);
 	}
+	raw_spin_unlock_irqrestore(&local_storage->lock, flags);
 
-	return free_storage;
+	if (free_storage)
+		kfree_rcu(local_storage, rcu);
 }
 
 u64 bpf_local_storage_map_mem_usage(const struct bpf_map *map)
diff --git a/kernel/bpf/bpf_task_storage.c b/kernel/bpf/bpf_task_storage.c
index 20f942229f3c..4dcef28744d1 100644
--- a/kernel/bpf/bpf_task_storage.c
+++ b/kernel/bpf/bpf_task_storage.c
@@ -72,8 +72,6 @@ task_storage_lookup(struct task_struct *task, struct bpf_map *map,
 void bpf_task_storage_free(struct task_struct *task)
 {
 	struct bpf_local_storage *local_storage;
-	bool free_task_storage = false;
-	unsigned long flags;
 
 	rcu_read_lock();
 
@@ -84,14 +82,9 @@ void bpf_task_storage_free(struct task_struct *task)
 	}
 
 	bpf_task_storage_lock();
-	raw_spin_lock_irqsave(&local_storage->lock, flags);
-	free_task_storage = bpf_local_storage_unlink_nolock(local_storage);
-	raw_spin_unlock_irqrestore(&local_storage->lock, flags);
+	bpf_local_storage_destroy(local_storage);
 	bpf_task_storage_unlock();
 	rcu_read_unlock();
-
-	if (free_task_storage)
-		kfree_rcu(local_storage, rcu);
 }
 
 static void *bpf_pid_task_storage_lookup_elem(struct bpf_map *map, void *key)
diff --git a/net/core/bpf_sk_storage.c b/net/core/bpf_sk_storage.c
index 7a36353dbc22..8f56438c104b 100644
--- a/net/core/bpf_sk_storage.c
+++ b/net/core/bpf_sk_storage.c
@@ -49,7 +49,6 @@ static int bpf_sk_storage_del(struct sock *sk, struct bpf_map *map)
 void bpf_sk_storage_free(struct sock *sk)
 {
 	struct bpf_local_storage *sk_storage;
-	bool free_sk_storage = false;
 
 	rcu_read_lock();
 	sk_storage = rcu_dereference(sk->sk_bpf_storage);
@@ -58,13 +57,8 @@ void bpf_sk_storage_free(struct sock *sk)
 		return;
 	}
 
-	raw_spin_lock_bh(&sk_storage->lock);
-	free_sk_storage = bpf_local_storage_unlink_nolock(sk_storage);
-	raw_spin_unlock_bh(&sk_storage->lock);
+	bpf_local_storage_destroy(sk_storage);
 	rcu_read_unlock();
-
-	if (free_sk_storage)
-		kfree_rcu(sk_storage, rcu);
 }
 
 static void bpf_sk_storage_map_free(struct bpf_map *map)
-- 
cgit v1.2.3


From 62827d612ae525695799b3635a087cb49c55e977 Mon Sep 17 00:00:00 2001
From: Martin KaFai Lau <martin.lau@kernel.org>
Date: Tue, 7 Mar 2023 22:59:22 -0800
Subject: bpf: Remove __bpf_local_storage_map_alloc

bpf_local_storage_map_alloc() is the only caller of
__bpf_local_storage_map_alloc().  The remaining logic in
bpf_local_storage_map_alloc() is only a one liner setting
the smap->cache_idx.

Remove __bpf_local_storage_map_alloc() to simplify code.

Signed-off-by: Martin KaFai Lau <martin.lau@kernel.org>
Link: https://lore.kernel.org/r/20230308065936.1550103-4-martin.lau@linux.dev
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 kernel/bpf/bpf_local_storage.c | 63 +++++++++++++++++-------------------------
 1 file changed, 26 insertions(+), 37 deletions(-)

(limited to 'kernel')

diff --git a/kernel/bpf/bpf_local_storage.c b/kernel/bpf/bpf_local_storage.c
index e19f9f50a60d..f7234a8d4959 100644
--- a/kernel/bpf/bpf_local_storage.c
+++ b/kernel/bpf/bpf_local_storage.c
@@ -601,40 +601,6 @@ int bpf_local_storage_map_alloc_check(union bpf_attr *attr)
 	return 0;
 }
 
-static struct bpf_local_storage_map *__bpf_local_storage_map_alloc(union bpf_attr *attr)
-{
-	struct bpf_local_storage_map *smap;
-	unsigned int i;
-	u32 nbuckets;
-
-	smap = bpf_map_area_alloc(sizeof(*smap), NUMA_NO_NODE);
-	if (!smap)
-		return ERR_PTR(-ENOMEM);
-	bpf_map_init_from_attr(&smap->map, attr);
-
-	nbuckets = roundup_pow_of_two(num_possible_cpus());
-	/* Use at least 2 buckets, select_bucket() is undefined behavior with 1 bucket */
-	nbuckets = max_t(u32, 2, nbuckets);
-	smap->bucket_log = ilog2(nbuckets);
-
-	smap->buckets = bpf_map_kvcalloc(&smap->map, sizeof(*smap->buckets),
-					 nbuckets, GFP_USER | __GFP_NOWARN);
-	if (!smap->buckets) {
-		bpf_map_area_free(smap);
-		return ERR_PTR(-ENOMEM);
-	}
-
-	for (i = 0; i < nbuckets; i++) {
-		INIT_HLIST_HEAD(&smap->buckets[i].list);
-		raw_spin_lock_init(&smap->buckets[i].lock);
-	}
-
-	smap->elem_size = offsetof(struct bpf_local_storage_elem,
-				   sdata.data[attr->value_size]);
-
-	return smap;
-}
-
 int bpf_local_storage_map_check_btf(const struct bpf_map *map,
 				    const struct btf *btf,
 				    const struct btf_type *key_type,
@@ -704,10 +670,33 @@ bpf_local_storage_map_alloc(union bpf_attr *attr,
 			    struct bpf_local_storage_cache *cache)
 {
 	struct bpf_local_storage_map *smap;
+	unsigned int i;
+	u32 nbuckets;
+
+	smap = bpf_map_area_alloc(sizeof(*smap), NUMA_NO_NODE);
+	if (!smap)
+		return ERR_PTR(-ENOMEM);
+	bpf_map_init_from_attr(&smap->map, attr);
+
+	nbuckets = roundup_pow_of_two(num_possible_cpus());
+	/* Use at least 2 buckets, select_bucket() is undefined behavior with 1 bucket */
+	nbuckets = max_t(u32, 2, nbuckets);
+	smap->bucket_log = ilog2(nbuckets);
 
-	smap = __bpf_local_storage_map_alloc(attr);
-	if (IS_ERR(smap))
-		return ERR_CAST(smap);
+	smap->buckets = bpf_map_kvcalloc(&smap->map, sizeof(*smap->buckets),
+					 nbuckets, GFP_USER | __GFP_NOWARN);
+	if (!smap->buckets) {
+		bpf_map_area_free(smap);
+		return ERR_PTR(-ENOMEM);
+	}
+
+	for (i = 0; i < nbuckets; i++) {
+		INIT_HLIST_HEAD(&smap->buckets[i].list);
+		raw_spin_lock_init(&smap->buckets[i].lock);
+	}
+
+	smap->elem_size = offsetof(struct bpf_local_storage_elem,
+				   sdata.data[attr->value_size]);
 
 	smap->cache_idx = bpf_local_storage_cache_idx_get(cache);
 	return &smap->map;
-- 
cgit v1.2.3


From 121f31f3e00dfc1acbca43f6f35779e050b56cfc Mon Sep 17 00:00:00 2001
From: Martin KaFai Lau <martin.lau@kernel.org>
Date: Tue, 7 Mar 2023 22:59:23 -0800
Subject: bpf: Remove the preceding __ from __bpf_selem_unlink_storage

__bpf_selem_unlink_storage is taking the spin lock and there is
no name collision also. Having the preceding '__' is confusing
when reviewing the later patch.

Signed-off-by: Martin KaFai Lau <martin.lau@kernel.org>
Link: https://lore.kernel.org/r/20230308065936.1550103-5-martin.lau@linux.dev
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 kernel/bpf/bpf_local_storage.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

(limited to 'kernel')

diff --git a/kernel/bpf/bpf_local_storage.c b/kernel/bpf/bpf_local_storage.c
index f7234a8d4959..70df8dcb2066 100644
--- a/kernel/bpf/bpf_local_storage.c
+++ b/kernel/bpf/bpf_local_storage.c
@@ -216,8 +216,8 @@ static bool bpf_selem_unlink_storage_nolock(struct bpf_local_storage *local_stor
 	return free_local_storage;
 }
 
-static void __bpf_selem_unlink_storage(struct bpf_local_storage_elem *selem,
-				       bool use_trace_rcu)
+static void bpf_selem_unlink_storage(struct bpf_local_storage_elem *selem,
+				     bool use_trace_rcu)
 {
 	struct bpf_local_storage *local_storage;
 	bool free_local_storage = false;
@@ -288,7 +288,7 @@ void bpf_selem_unlink(struct bpf_local_storage_elem *selem, bool use_trace_rcu)
 	 * the local_storage.
 	 */
 	bpf_selem_unlink_map(selem);
-	__bpf_selem_unlink_storage(selem, use_trace_rcu);
+	bpf_selem_unlink_storage(selem, use_trace_rcu);
 }
 
 /* If cacheit_lockit is false, this lookup function is lockless */
-- 
cgit v1.2.3


From fc6652aab6ad545de70b772550da9043d0b47f1c Mon Sep 17 00:00:00 2001
From: Martin KaFai Lau <martin.lau@kernel.org>
Date: Tue, 7 Mar 2023 22:59:24 -0800
Subject: bpf: Remember smap in bpf_local_storage

This patch remembers which smap triggers the allocation
of a 'struct bpf_local_storage' object. The local_storage is
allocated during the very first selem added to the owner.
The smap pointer is needed when using the bpf_mem_cache_free
in a later patch because it needs to free to the correct
smap's bpf_mem_alloc object.

When a selem is being removed, it needs to check if it is
the selem that triggers the creation of the local_storage.
If it is, the local_storage->smap pointer will be reset to NULL.
This NULL reset is done under the local_storage->lock in
bpf_selem_unlink_storage_nolock() when a selem is being removed.
Also note that the local_storage may not go away even
local_storage->smap is NULL because there may be other
selem still stored in the local_storage.

Signed-off-by: Martin KaFai Lau <martin.lau@kernel.org>
Link: https://lore.kernel.org/r/20230308065936.1550103-6-martin.lau@linux.dev
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 include/linux/bpf_local_storage.h | 1 +
 kernel/bpf/bpf_local_storage.c    | 4 ++++
 2 files changed, 5 insertions(+)

(limited to 'kernel')

diff --git a/include/linux/bpf_local_storage.h b/include/linux/bpf_local_storage.h
index 5908a954ddc2..613b1805ed9f 100644
--- a/include/linux/bpf_local_storage.h
+++ b/include/linux/bpf_local_storage.h
@@ -83,6 +83,7 @@ struct bpf_local_storage_elem {
 
 struct bpf_local_storage {
 	struct bpf_local_storage_data __rcu *cache[BPF_LOCAL_STORAGE_CACHE_SIZE];
+	struct bpf_local_storage_map __rcu *smap;
 	struct hlist_head list; /* List of bpf_local_storage_elem */
 	void *owner;		/* The object that owns the above "list" of
 				 * bpf_local_storage_elem.
diff --git a/kernel/bpf/bpf_local_storage.c b/kernel/bpf/bpf_local_storage.c
index 70df8dcb2066..5585dbfd9c66 100644
--- a/kernel/bpf/bpf_local_storage.c
+++ b/kernel/bpf/bpf_local_storage.c
@@ -213,6 +213,9 @@ static bool bpf_selem_unlink_storage_nolock(struct bpf_local_storage *local_stor
 			kfree_rcu(selem, rcu);
 	}
 
+	if (rcu_access_pointer(local_storage->smap) == smap)
+		RCU_INIT_POINTER(local_storage->smap, NULL);
+
 	return free_local_storage;
 }
 
@@ -368,6 +371,7 @@ int bpf_local_storage_alloc(void *owner,
 		goto uncharge;
 	}
 
+	RCU_INIT_POINTER(storage->smap, smap);
 	INIT_HLIST_HEAD(&storage->list);
 	raw_spin_lock_init(&storage->lock);
 	storage->owner = owner;
-- 
cgit v1.2.3


From a47eabf216f77cb6f22ceb38d46f1bb95968579c Mon Sep 17 00:00:00 2001
From: Martin KaFai Lau <martin.lau@kernel.org>
Date: Tue, 7 Mar 2023 22:59:25 -0800
Subject: bpf: Repurpose use_trace_rcu to reuse_now in bpf_local_storage

This patch re-purpose the use_trace_rcu to mean
if the freed memory can be reused immediately or not.
The use_trace_rcu is renamed to reuse_now. Other than
the boolean test is reversed, it should be a no-op.

The following explains the reason for the rename and how it will
be used in a later patch.

In a later patch, bpf_mem_cache_alloc/free will be used
in the bpf_local_storage. The bpf mem allocator will reuse
the freed memory immediately. Some of the free paths in
bpf_local_storage does not support memory to be reused immediately.
These paths are the "delete" elem cases from the bpf_*_storage_delete()
helper and the map_delete_elem() syscall. Note that "delete" elem
before the owner's (sk/task/cgrp/inode) lifetime ended is not
the common usage for the local storage.

The common free path, bpf_local_storage_destroy(), can reuse the
memory immediately. This common path means the storage stays with
its owner until the owner is destroyed.

The above mentioned "delete" elem paths that cannot
reuse immediately always has the 'use_trace_rcu ==  true'.
The cases that is safe for immediate reuse always have
'use_trace_rcu == false'. Instead of adding another arg
in a later patch, this patch re-purpose this arg
to reuse_now and have the test logic reversed.

In a later patch, 'reuse_now == true' will free to the
bpf_mem_cache_free() where the memory can be reused
immediately. 'reuse_now == false' will go through the
call_rcu_tasks_trace().

Signed-off-by: Martin KaFai Lau <martin.lau@kernel.org>
Link: https://lore.kernel.org/r/20230308065936.1550103-7-martin.lau@linux.dev
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 include/linux/bpf_local_storage.h |  2 +-
 kernel/bpf/bpf_cgrp_storage.c     |  2 +-
 kernel/bpf/bpf_inode_storage.c    |  2 +-
 kernel/bpf/bpf_local_storage.c    | 24 ++++++++++++------------
 kernel/bpf/bpf_task_storage.c     |  2 +-
 net/core/bpf_sk_storage.c         |  2 +-
 6 files changed, 17 insertions(+), 17 deletions(-)

(limited to 'kernel')

diff --git a/include/linux/bpf_local_storage.h b/include/linux/bpf_local_storage.h
index 613b1805ed9f..18a31add2255 100644
--- a/include/linux/bpf_local_storage.h
+++ b/include/linux/bpf_local_storage.h
@@ -143,7 +143,7 @@ int bpf_local_storage_map_check_btf(const struct bpf_map *map,
 void bpf_selem_link_storage_nolock(struct bpf_local_storage *local_storage,
 				   struct bpf_local_storage_elem *selem);
 
-void bpf_selem_unlink(struct bpf_local_storage_elem *selem, bool use_trace_rcu);
+void bpf_selem_unlink(struct bpf_local_storage_elem *selem, bool reuse_now);
 
 void bpf_selem_link_map(struct bpf_local_storage_map *smap,
 			struct bpf_local_storage_elem *selem);
diff --git a/kernel/bpf/bpf_cgrp_storage.c b/kernel/bpf/bpf_cgrp_storage.c
index 492594d69a86..c975cacdd16b 100644
--- a/kernel/bpf/bpf_cgrp_storage.c
+++ b/kernel/bpf/bpf_cgrp_storage.c
@@ -121,7 +121,7 @@ static int cgroup_storage_delete(struct cgroup *cgroup, struct bpf_map *map)
 	if (!sdata)
 		return -ENOENT;
 
-	bpf_selem_unlink(SELEM(sdata), true);
+	bpf_selem_unlink(SELEM(sdata), false);
 	return 0;
 }
 
diff --git a/kernel/bpf/bpf_inode_storage.c b/kernel/bpf/bpf_inode_storage.c
index 2d25bcfa371b..ad2ab0187e45 100644
--- a/kernel/bpf/bpf_inode_storage.c
+++ b/kernel/bpf/bpf_inode_storage.c
@@ -122,7 +122,7 @@ static int inode_storage_delete(struct inode *inode, struct bpf_map *map)
 	if (!sdata)
 		return -ENOENT;
 
-	bpf_selem_unlink(SELEM(sdata), true);
+	bpf_selem_unlink(SELEM(sdata), false);
 
 	return 0;
 }
diff --git a/kernel/bpf/bpf_local_storage.c b/kernel/bpf/bpf_local_storage.c
index 5585dbfd9c66..70c34a948c3c 100644
--- a/kernel/bpf/bpf_local_storage.c
+++ b/kernel/bpf/bpf_local_storage.c
@@ -147,7 +147,7 @@ static void bpf_selem_free_trace_rcu(struct rcu_head *rcu)
  */
 static bool bpf_selem_unlink_storage_nolock(struct bpf_local_storage *local_storage,
 					    struct bpf_local_storage_elem *selem,
-					    bool uncharge_mem, bool use_trace_rcu)
+					    bool uncharge_mem, bool reuse_now)
 {
 	struct bpf_local_storage_map *smap;
 	bool free_local_storage;
@@ -201,7 +201,7 @@ static bool bpf_selem_unlink_storage_nolock(struct bpf_local_storage *local_stor
 	 * any special fields.
 	 */
 	rec = smap->map.record;
-	if (use_trace_rcu) {
+	if (!reuse_now) {
 		if (!IS_ERR_OR_NULL(rec))
 			call_rcu_tasks_trace(&selem->rcu, bpf_selem_free_fields_trace_rcu);
 		else
@@ -220,7 +220,7 @@ static bool bpf_selem_unlink_storage_nolock(struct bpf_local_storage *local_stor
 }
 
 static void bpf_selem_unlink_storage(struct bpf_local_storage_elem *selem,
-				     bool use_trace_rcu)
+				     bool reuse_now)
 {
 	struct bpf_local_storage *local_storage;
 	bool free_local_storage = false;
@@ -235,11 +235,11 @@ static void bpf_selem_unlink_storage(struct bpf_local_storage_elem *selem,
 	raw_spin_lock_irqsave(&local_storage->lock, flags);
 	if (likely(selem_linked_to_storage(selem)))
 		free_local_storage = bpf_selem_unlink_storage_nolock(
-			local_storage, selem, true, use_trace_rcu);
+			local_storage, selem, true, reuse_now);
 	raw_spin_unlock_irqrestore(&local_storage->lock, flags);
 
 	if (free_local_storage) {
-		if (use_trace_rcu)
+		if (!reuse_now)
 			call_rcu_tasks_trace(&local_storage->rcu,
 				     bpf_local_storage_free_rcu);
 		else
@@ -284,14 +284,14 @@ void bpf_selem_link_map(struct bpf_local_storage_map *smap,
 	raw_spin_unlock_irqrestore(&b->lock, flags);
 }
 
-void bpf_selem_unlink(struct bpf_local_storage_elem *selem, bool use_trace_rcu)
+void bpf_selem_unlink(struct bpf_local_storage_elem *selem, bool reuse_now)
 {
 	/* Always unlink from map before unlinking from local_storage
 	 * because selem will be freed after successfully unlinked from
 	 * the local_storage.
 	 */
 	bpf_selem_unlink_map(selem);
-	bpf_selem_unlink_storage(selem, use_trace_rcu);
+	bpf_selem_unlink_storage(selem, reuse_now);
 }
 
 /* If cacheit_lockit is false, this lookup function is lockless */
@@ -538,7 +538,7 @@ bpf_local_storage_update(void *owner, struct bpf_local_storage_map *smap,
 	if (old_sdata) {
 		bpf_selem_unlink_map(SELEM(old_sdata));
 		bpf_selem_unlink_storage_nolock(local_storage, SELEM(old_sdata),
-						false, true);
+						false, false);
 	}
 
 unlock:
@@ -651,7 +651,7 @@ void bpf_local_storage_destroy(struct bpf_local_storage *local_storage)
 		 * of the loop will set the free_cgroup_storage to true.
 		 */
 		free_storage = bpf_selem_unlink_storage_nolock(
-			local_storage, selem, false, false);
+			local_storage, selem, false, true);
 	}
 	raw_spin_unlock_irqrestore(&local_storage->lock, flags);
 
@@ -745,7 +745,7 @@ void bpf_local_storage_map_free(struct bpf_map *map,
 				migrate_disable();
 				this_cpu_inc(*busy_counter);
 			}
-			bpf_selem_unlink(selem, false);
+			bpf_selem_unlink(selem, true);
 			if (busy_counter) {
 				this_cpu_dec(*busy_counter);
 				migrate_enable();
@@ -783,8 +783,8 @@ void bpf_local_storage_map_free(struct bpf_map *map,
 		/* We cannot skip rcu_barrier() when rcu_trace_implies_rcu_gp()
 		 * is true, because while call_rcu invocation is skipped in that
 		 * case in bpf_selem_free_fields_trace_rcu (and all local
-		 * storage maps pass use_trace_rcu = true), there can be
-		 * call_rcu callbacks based on use_trace_rcu = false in the
+		 * storage maps pass reuse_now = false), there can be
+		 * call_rcu callbacks based on reuse_now = true in the
 		 * while ((selem = ...)) loop above or when owner's free path
 		 * calls bpf_local_storage_unlink_nolock.
 		 */
diff --git a/kernel/bpf/bpf_task_storage.c b/kernel/bpf/bpf_task_storage.c
index 4dcef28744d1..c88cc04c17c1 100644
--- a/kernel/bpf/bpf_task_storage.c
+++ b/kernel/bpf/bpf_task_storage.c
@@ -168,7 +168,7 @@ static int task_storage_delete(struct task_struct *task, struct bpf_map *map,
 	if (!nobusy)
 		return -EBUSY;
 
-	bpf_selem_unlink(SELEM(sdata), true);
+	bpf_selem_unlink(SELEM(sdata), false);
 
 	return 0;
 }
diff --git a/net/core/bpf_sk_storage.c b/net/core/bpf_sk_storage.c
index 8f56438c104b..a5f185b8e50a 100644
--- a/net/core/bpf_sk_storage.c
+++ b/net/core/bpf_sk_storage.c
@@ -40,7 +40,7 @@ static int bpf_sk_storage_del(struct sock *sk, struct bpf_map *map)
 	if (!sdata)
 		return -ENOENT;
 
-	bpf_selem_unlink(SELEM(sdata), true);
+	bpf_selem_unlink(SELEM(sdata), false);
 
 	return 0;
 }
-- 
cgit v1.2.3


From c609981342dca634e5dea8c6ca175b6533581261 Mon Sep 17 00:00:00 2001
From: Martin KaFai Lau <martin.lau@kernel.org>
Date: Tue, 7 Mar 2023 22:59:26 -0800
Subject: bpf: Remove bpf_selem_free_fields*_rcu

This patch removes the bpf_selem_free_fields*_rcu. The
bpf_obj_free_fields() can be done before the call_rcu_trasks_trace()
and kfree_rcu(). It is needed when a later patch uses
bpf_mem_cache_alloc/free. In bpf hashtab, bpf_obj_free_fields()
is also called before calling bpf_mem_cache_free. The discussion
can be found in
https://lore.kernel.org/bpf/f67021ee-21d9-bfae-6134-4ca542fab843@linux.dev/

Acked-by: Kumar Kartikeya Dwivedi <memxor@gmail.com>
Signed-off-by: Martin KaFai Lau <martin.lau@kernel.org>
Link: https://lore.kernel.org/r/20230308065936.1550103-8-martin.lau@linux.dev
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 kernel/bpf/bpf_local_storage.c | 67 ++++--------------------------------------
 1 file changed, 5 insertions(+), 62 deletions(-)

(limited to 'kernel')

diff --git a/kernel/bpf/bpf_local_storage.c b/kernel/bpf/bpf_local_storage.c
index 70c34a948c3c..715deaaefe13 100644
--- a/kernel/bpf/bpf_local_storage.c
+++ b/kernel/bpf/bpf_local_storage.c
@@ -109,27 +109,6 @@ static void bpf_local_storage_free_rcu(struct rcu_head *rcu)
 		kfree_rcu(local_storage, rcu);
 }
 
-static void bpf_selem_free_fields_rcu(struct rcu_head *rcu)
-{
-	struct bpf_local_storage_elem *selem;
-	struct bpf_local_storage_map *smap;
-
-	selem = container_of(rcu, struct bpf_local_storage_elem, rcu);
-	/* protected by the rcu_barrier*() */
-	smap = rcu_dereference_protected(SDATA(selem)->smap, true);
-	bpf_obj_free_fields(smap->map.record, SDATA(selem)->data);
-	kfree(selem);
-}
-
-static void bpf_selem_free_fields_trace_rcu(struct rcu_head *rcu)
-{
-	/* Free directly if Tasks Trace RCU GP also implies RCU GP */
-	if (rcu_trace_implies_rcu_gp())
-		bpf_selem_free_fields_rcu(rcu);
-	else
-		call_rcu(rcu, bpf_selem_free_fields_rcu);
-}
-
 static void bpf_selem_free_trace_rcu(struct rcu_head *rcu)
 {
 	struct bpf_local_storage_elem *selem;
@@ -151,7 +130,6 @@ static bool bpf_selem_unlink_storage_nolock(struct bpf_local_storage *local_stor
 {
 	struct bpf_local_storage_map *smap;
 	bool free_local_storage;
-	struct btf_record *rec;
 	void *owner;
 
 	smap = rcu_dereference_check(SDATA(selem)->smap, bpf_rcu_lock_held());
@@ -192,26 +170,11 @@ static bool bpf_selem_unlink_storage_nolock(struct bpf_local_storage *local_stor
 	    SDATA(selem))
 		RCU_INIT_POINTER(local_storage->cache[smap->cache_idx], NULL);
 
-	/* A different RCU callback is chosen whenever we need to free
-	 * additional fields in selem data before freeing selem.
-	 * bpf_local_storage_map_free only executes rcu_barrier to wait for RCU
-	 * callbacks when it has special fields, hence we can only conditionally
-	 * dereference smap, as by this time the map might have already been
-	 * freed without waiting for our call_rcu callback if it did not have
-	 * any special fields.
-	 */
-	rec = smap->map.record;
-	if (!reuse_now) {
-		if (!IS_ERR_OR_NULL(rec))
-			call_rcu_tasks_trace(&selem->rcu, bpf_selem_free_fields_trace_rcu);
-		else
-			call_rcu_tasks_trace(&selem->rcu, bpf_selem_free_trace_rcu);
-	} else {
-		if (!IS_ERR_OR_NULL(rec))
-			call_rcu(&selem->rcu, bpf_selem_free_fields_rcu);
-		else
-			kfree_rcu(selem, rcu);
-	}
+	bpf_obj_free_fields(smap->map.record, SDATA(selem)->data);
+	if (!reuse_now)
+		call_rcu_tasks_trace(&selem->rcu, bpf_selem_free_trace_rcu);
+	else
+		kfree_rcu(selem, rcu);
 
 	if (rcu_access_pointer(local_storage->smap) == smap)
 		RCU_INIT_POINTER(local_storage->smap, NULL);
@@ -769,26 +732,6 @@ void bpf_local_storage_map_free(struct bpf_map *map,
 	 */
 	synchronize_rcu();
 
-	/* Only delay freeing of smap, buckets are not needed anymore */
 	kvfree(smap->buckets);
-
-	/* When local storage has special fields, callbacks for
-	 * bpf_selem_free_fields_rcu and bpf_selem_free_fields_trace_rcu will
-	 * keep using the map BTF record, we need to execute an RCU barrier to
-	 * wait for them as the record will be freed right after our map_free
-	 * callback.
-	 */
-	if (!IS_ERR_OR_NULL(smap->map.record)) {
-		rcu_barrier_tasks_trace();
-		/* We cannot skip rcu_barrier() when rcu_trace_implies_rcu_gp()
-		 * is true, because while call_rcu invocation is skipped in that
-		 * case in bpf_selem_free_fields_trace_rcu (and all local
-		 * storage maps pass reuse_now = false), there can be
-		 * call_rcu callbacks based on reuse_now = true in the
-		 * while ((selem = ...)) loop above or when owner's free path
-		 * calls bpf_local_storage_unlink_nolock.
-		 */
-		rcu_barrier();
-	}
 	bpf_map_area_free(smap);
 }
-- 
cgit v1.2.3


From f8ccf30c179ec1ac16654f6e6ceb40cce1530b91 Mon Sep 17 00:00:00 2001
From: Martin KaFai Lau <martin.lau@kernel.org>
Date: Tue, 7 Mar 2023 22:59:27 -0800
Subject: bpf: Add bpf_selem_free_rcu callback

Add bpf_selem_free_rcu() callback to do the kfree() instead
of using kfree_rcu. It is a preparation work for using
bpf_mem_cache_alloc/free in a later patch.

Signed-off-by: Martin KaFai Lau <martin.lau@kernel.org>
Link: https://lore.kernel.org/r/20230308065936.1550103-9-martin.lau@linux.dev
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 kernel/bpf/bpf_local_storage.c | 13 +++++++++----
 1 file changed, 9 insertions(+), 4 deletions(-)

(limited to 'kernel')

diff --git a/kernel/bpf/bpf_local_storage.c b/kernel/bpf/bpf_local_storage.c
index 715deaaefe13..146e9caeda96 100644
--- a/kernel/bpf/bpf_local_storage.c
+++ b/kernel/bpf/bpf_local_storage.c
@@ -109,15 +109,20 @@ static void bpf_local_storage_free_rcu(struct rcu_head *rcu)
 		kfree_rcu(local_storage, rcu);
 }
 
-static void bpf_selem_free_trace_rcu(struct rcu_head *rcu)
+static void bpf_selem_free_rcu(struct rcu_head *rcu)
 {
 	struct bpf_local_storage_elem *selem;
 
 	selem = container_of(rcu, struct bpf_local_storage_elem, rcu);
+	kfree(selem);
+}
+
+static void bpf_selem_free_trace_rcu(struct rcu_head *rcu)
+{
 	if (rcu_trace_implies_rcu_gp())
-		kfree(selem);
+		bpf_selem_free_rcu(rcu);
 	else
-		kfree_rcu(selem, rcu);
+		call_rcu(rcu, bpf_selem_free_rcu);
 }
 
 /* local_storage->lock must be held and selem->local_storage == local_storage.
@@ -174,7 +179,7 @@ static bool bpf_selem_unlink_storage_nolock(struct bpf_local_storage *local_stor
 	if (!reuse_now)
 		call_rcu_tasks_trace(&selem->rcu, bpf_selem_free_trace_rcu);
 	else
-		kfree_rcu(selem, rcu);
+		call_rcu(&selem->rcu, bpf_selem_free_rcu);
 
 	if (rcu_access_pointer(local_storage->smap) == smap)
 		RCU_INIT_POINTER(local_storage->smap, NULL);
-- 
cgit v1.2.3


From c0d63f309186d8492577c67c67984c714b6b72bc Mon Sep 17 00:00:00 2001
From: Martin KaFai Lau <martin.lau@kernel.org>
Date: Tue, 7 Mar 2023 22:59:28 -0800
Subject: bpf: Add bpf_selem_free()

This patch refactors the selem freeing logic into bpf_selem_free().
It is a preparation work for a later patch using
bpf_mem_cache_alloc/free. The other kfree(selem) cases
are also changed to bpf_selem_free(..., reuse_now = true).

Signed-off-by: Martin KaFai Lau <martin.lau@kernel.org>
Link: https://lore.kernel.org/r/20230308065936.1550103-10-martin.lau@linux.dev
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 include/linux/bpf_local_storage.h |  4 ++++
 kernel/bpf/bpf_local_storage.c    | 21 ++++++++++++++-------
 net/core/bpf_sk_storage.c         |  2 +-
 3 files changed, 19 insertions(+), 8 deletions(-)

(limited to 'kernel')

diff --git a/include/linux/bpf_local_storage.h b/include/linux/bpf_local_storage.h
index 18a31add2255..a34f61467a2f 100644
--- a/include/linux/bpf_local_storage.h
+++ b/include/linux/bpf_local_storage.h
@@ -152,6 +152,10 @@ struct bpf_local_storage_elem *
 bpf_selem_alloc(struct bpf_local_storage_map *smap, void *owner, void *value,
 		bool charge_mem, gfp_t gfp_flags);
 
+void bpf_selem_free(struct bpf_local_storage_elem *selem,
+		    struct bpf_local_storage_map *smap,
+		    bool reuse_now);
+
 int
 bpf_local_storage_alloc(void *owner,
 			struct bpf_local_storage_map *smap,
diff --git a/kernel/bpf/bpf_local_storage.c b/kernel/bpf/bpf_local_storage.c
index 146e9caeda96..512943aac435 100644
--- a/kernel/bpf/bpf_local_storage.c
+++ b/kernel/bpf/bpf_local_storage.c
@@ -125,6 +125,17 @@ static void bpf_selem_free_trace_rcu(struct rcu_head *rcu)
 		call_rcu(rcu, bpf_selem_free_rcu);
 }
 
+void bpf_selem_free(struct bpf_local_storage_elem *selem,
+		    struct bpf_local_storage_map *smap,
+		    bool reuse_now)
+{
+	bpf_obj_free_fields(smap->map.record, SDATA(selem)->data);
+	if (!reuse_now)
+		call_rcu_tasks_trace(&selem->rcu, bpf_selem_free_trace_rcu);
+	else
+		call_rcu(&selem->rcu, bpf_selem_free_rcu);
+}
+
 /* local_storage->lock must be held and selem->local_storage == local_storage.
  * The caller must ensure selem->smap is still valid to be
  * dereferenced for its smap->elem_size and smap->cache_idx.
@@ -175,11 +186,7 @@ static bool bpf_selem_unlink_storage_nolock(struct bpf_local_storage *local_stor
 	    SDATA(selem))
 		RCU_INIT_POINTER(local_storage->cache[smap->cache_idx], NULL);
 
-	bpf_obj_free_fields(smap->map.record, SDATA(selem)->data);
-	if (!reuse_now)
-		call_rcu_tasks_trace(&selem->rcu, bpf_selem_free_trace_rcu);
-	else
-		call_rcu(&selem->rcu, bpf_selem_free_rcu);
+	bpf_selem_free(selem, smap, reuse_now);
 
 	if (rcu_access_pointer(local_storage->smap) == smap)
 		RCU_INIT_POINTER(local_storage->smap, NULL);
@@ -423,7 +430,7 @@ bpf_local_storage_update(void *owner, struct bpf_local_storage_map *smap,
 
 		err = bpf_local_storage_alloc(owner, smap, selem, gfp_flags);
 		if (err) {
-			kfree(selem);
+			bpf_selem_free(selem, smap, true);
 			mem_uncharge(smap, owner, smap->elem_size);
 			return ERR_PTR(err);
 		}
@@ -517,7 +524,7 @@ unlock_err:
 	raw_spin_unlock_irqrestore(&local_storage->lock, flags);
 	if (selem) {
 		mem_uncharge(smap, owner, smap->elem_size);
-		kfree(selem);
+		bpf_selem_free(selem, smap, true);
 	}
 	return ERR_PTR(err);
 }
diff --git a/net/core/bpf_sk_storage.c b/net/core/bpf_sk_storage.c
index a5f185b8e50a..24c3dc0d62e5 100644
--- a/net/core/bpf_sk_storage.c
+++ b/net/core/bpf_sk_storage.c
@@ -197,7 +197,7 @@ int bpf_sk_storage_clone(const struct sock *sk, struct sock *newsk)
 		} else {
 			ret = bpf_local_storage_alloc(newsk, smap, copy_selem, GFP_ATOMIC);
 			if (ret) {
-				kfree(copy_selem);
+				bpf_selem_free(copy_selem, smap, true);
 				atomic_sub(smap->elem_size,
 					   &newsk->sk_omem_alloc);
 				bpf_map_put(map);
-- 
cgit v1.2.3


From 1288aaa2786b1e58c9e88e53f7654d520ebe0f3b Mon Sep 17 00:00:00 2001
From: Martin KaFai Lau <martin.lau@kernel.org>
Date: Tue, 7 Mar 2023 22:59:29 -0800
Subject: bpf: Add bpf_local_storage_rcu callback

The existing bpf_local_storage_free_rcu is renamed to
bpf_local_storage_free_trace_rcu. A new bpf_local_storage_rcu
callback is added to do the kfree instead of using kfree_rcu.
It is a preparation work for a later patch using
bpf_mem_cache_alloc/free.

Signed-off-by: Martin KaFai Lau <martin.lau@kernel.org>
Link: https://lore.kernel.org/r/20230308065936.1550103-11-martin.lau@linux.dev
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 kernel/bpf/bpf_local_storage.c | 17 +++++++++++------
 1 file changed, 11 insertions(+), 6 deletions(-)

(limited to 'kernel')

diff --git a/kernel/bpf/bpf_local_storage.c b/kernel/bpf/bpf_local_storage.c
index 512943aac435..0fbc477c8b27 100644
--- a/kernel/bpf/bpf_local_storage.c
+++ b/kernel/bpf/bpf_local_storage.c
@@ -99,14 +99,19 @@ static void bpf_local_storage_free_rcu(struct rcu_head *rcu)
 {
 	struct bpf_local_storage *local_storage;
 
+	local_storage = container_of(rcu, struct bpf_local_storage, rcu);
+	kfree(local_storage);
+}
+
+static void bpf_local_storage_free_trace_rcu(struct rcu_head *rcu)
+{
 	/* If RCU Tasks Trace grace period implies RCU grace period, do
 	 * kfree(), else do kfree_rcu().
 	 */
-	local_storage = container_of(rcu, struct bpf_local_storage, rcu);
 	if (rcu_trace_implies_rcu_gp())
-		kfree(local_storage);
+		bpf_local_storage_free_rcu(rcu);
 	else
-		kfree_rcu(local_storage, rcu);
+		call_rcu(rcu, bpf_local_storage_free_rcu);
 }
 
 static void bpf_selem_free_rcu(struct rcu_head *rcu)
@@ -216,9 +221,9 @@ static void bpf_selem_unlink_storage(struct bpf_local_storage_elem *selem,
 	if (free_local_storage) {
 		if (!reuse_now)
 			call_rcu_tasks_trace(&local_storage->rcu,
-				     bpf_local_storage_free_rcu);
+				     bpf_local_storage_free_trace_rcu);
 		else
-			kfree_rcu(local_storage, rcu);
+			call_rcu(&local_storage->rcu, bpf_local_storage_free_rcu);
 	}
 }
 
@@ -631,7 +636,7 @@ void bpf_local_storage_destroy(struct bpf_local_storage *local_storage)
 	raw_spin_unlock_irqrestore(&local_storage->lock, flags);
 
 	if (free_storage)
-		kfree_rcu(local_storage, rcu);
+		call_rcu(&local_storage->rcu, bpf_local_storage_free_rcu);
 }
 
 u64 bpf_local_storage_map_mem_usage(const struct bpf_map *map)
-- 
cgit v1.2.3


From 7e30a8477b0bdd13dfd0b24e4f32b26d22b96e6c Mon Sep 17 00:00:00 2001
From: Martin KaFai Lau <martin.lau@kernel.org>
Date: Tue, 7 Mar 2023 22:59:30 -0800
Subject: bpf: Add bpf_local_storage_free()

This patch refactors local_storage freeing logic into
bpf_local_storage_free(). It is a preparation work for a later
patch that uses bpf_mem_cache_alloc/free. The other kfree(local_storage)
cases are also changed to bpf_local_storage_free(..., reuse_now = true).

Signed-off-by: Martin KaFai Lau <martin.lau@kernel.org>
Link: https://lore.kernel.org/r/20230308065936.1550103-12-martin.lau@linux.dev
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 kernel/bpf/bpf_local_storage.c | 23 ++++++++++++++---------
 1 file changed, 14 insertions(+), 9 deletions(-)

(limited to 'kernel')

diff --git a/kernel/bpf/bpf_local_storage.c b/kernel/bpf/bpf_local_storage.c
index 0fbc477c8b27..351d991694cb 100644
--- a/kernel/bpf/bpf_local_storage.c
+++ b/kernel/bpf/bpf_local_storage.c
@@ -114,6 +114,16 @@ static void bpf_local_storage_free_trace_rcu(struct rcu_head *rcu)
 		call_rcu(rcu, bpf_local_storage_free_rcu);
 }
 
+static void bpf_local_storage_free(struct bpf_local_storage *local_storage,
+				   bool reuse_now)
+{
+	if (!reuse_now)
+		call_rcu_tasks_trace(&local_storage->rcu,
+				     bpf_local_storage_free_trace_rcu);
+	else
+		call_rcu(&local_storage->rcu, bpf_local_storage_free_rcu);
+}
+
 static void bpf_selem_free_rcu(struct rcu_head *rcu)
 {
 	struct bpf_local_storage_elem *selem;
@@ -218,13 +228,8 @@ static void bpf_selem_unlink_storage(struct bpf_local_storage_elem *selem,
 			local_storage, selem, true, reuse_now);
 	raw_spin_unlock_irqrestore(&local_storage->lock, flags);
 
-	if (free_local_storage) {
-		if (!reuse_now)
-			call_rcu_tasks_trace(&local_storage->rcu,
-				     bpf_local_storage_free_trace_rcu);
-		else
-			call_rcu(&local_storage->rcu, bpf_local_storage_free_rcu);
-	}
+	if (free_local_storage)
+		bpf_local_storage_free(local_storage, reuse_now);
 }
 
 void bpf_selem_link_storage_nolock(struct bpf_local_storage *local_storage,
@@ -391,7 +396,7 @@ int bpf_local_storage_alloc(void *owner,
 	return 0;
 
 uncharge:
-	kfree(storage);
+	bpf_local_storage_free(storage, true);
 	mem_uncharge(smap, owner, sizeof(*storage));
 	return err;
 }
@@ -636,7 +641,7 @@ void bpf_local_storage_destroy(struct bpf_local_storage *local_storage)
 	raw_spin_unlock_irqrestore(&local_storage->lock, flags);
 
 	if (free_storage)
-		call_rcu(&local_storage->rcu, bpf_local_storage_free_rcu);
+		bpf_local_storage_free(local_storage, true);
 }
 
 u64 bpf_local_storage_map_mem_usage(const struct bpf_map *map)
-- 
cgit v1.2.3


From b32a5dae44cc7346835839c2e92356ff4609c823 Mon Sep 17 00:00:00 2001
From: Dave Marchevsky <davemarchevsky@fb.com>
Date: Thu, 9 Mar 2023 10:01:06 -0800
Subject: bpf: verifier: Rename kernel_type_name helper to btf_type_name

kernel_type_name was introduced in commit 9e15db66136a ("bpf: Implement accurate raw_tp context access via BTF")
with type signature:

  const char *kernel_type_name(u32 id)

At that time the function used global btf_vmlinux BTF for all id lookups. Later,
in commit 22dc4a0f5ed1 ("bpf: Remove hard-coded btf_vmlinux assumption from BPF verifier"),
the type signature was changed to:

  static const char *kernel_type_name(const struct btf* btf, u32 id)

With the btf parameter used for lookups instead of global btf_vmlinux.

The helper will function as expected for type name lookup using non-kernel BTFs,
and will be used for such in further patches in the series. Let's rename it to
avoid incorrect assumptions that might arise when seeing the current name.

Signed-off-by: Dave Marchevsky <davemarchevsky@fb.com>
Link: https://lore.kernel.org/r/20230309180111.1618459-2-davemarchevsky@fb.com
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 kernel/bpf/verifier.c | 16 ++++++++--------
 1 file changed, 8 insertions(+), 8 deletions(-)

(limited to 'kernel')

diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index 0aaf7703326b..073c56dbec63 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -752,7 +752,7 @@ static int iter_get_spi(struct bpf_verifier_env *env, struct bpf_reg_state *reg,
 	return stack_slot_obj_get_spi(env, reg, "iter", nr_slots);
 }
 
-static const char *kernel_type_name(const struct btf* btf, u32 id)
+static const char *btf_type_name(const struct btf *btf, u32 id)
 {
 	return btf_name_by_offset(btf, btf_type_by_id(btf, id)->name_off);
 }
@@ -782,7 +782,7 @@ static const char *iter_type_str(const struct btf *btf, u32 btf_id)
 		return "<invalid>";
 
 	/* we already validated that type is valid and has conforming name */
-	return kernel_type_name(btf, btf_id) + sizeof(ITER_PREFIX) - 1;
+	return btf_type_name(btf, btf_id) + sizeof(ITER_PREFIX) - 1;
 }
 
 static const char *iter_state_str(enum bpf_iter_state state)
@@ -1349,7 +1349,7 @@ static void print_verifier_state(struct bpf_verifier_env *env,
 
 			verbose(env, "%s", reg_type_str(env, t));
 			if (base_type(t) == PTR_TO_BTF_ID)
-				verbose(env, "%s", kernel_type_name(reg->btf, reg->btf_id));
+				verbose(env, "%s", btf_type_name(reg->btf, reg->btf_id));
 			verbose(env, "(");
 /*
  * _a stands for append, was shortened to avoid multiline statements below.
@@ -4518,7 +4518,7 @@ static int map_kptr_match_type(struct bpf_verifier_env *env,
 			       struct btf_field *kptr_field,
 			       struct bpf_reg_state *reg, u32 regno)
 {
-	const char *targ_name = kernel_type_name(kptr_field->kptr.btf, kptr_field->kptr.btf_id);
+	const char *targ_name = btf_type_name(kptr_field->kptr.btf, kptr_field->kptr.btf_id);
 	int perm_flags = PTR_MAYBE_NULL | PTR_TRUSTED | MEM_RCU;
 	const char *reg_name = "";
 
@@ -4534,7 +4534,7 @@ static int map_kptr_match_type(struct bpf_verifier_env *env,
 		return -EINVAL;
 	}
 	/* We need to verify reg->type and reg->btf, before accessing reg->btf */
-	reg_name = kernel_type_name(reg->btf, reg->btf_id);
+	reg_name = btf_type_name(reg->btf, reg->btf_id);
 
 	/* For ref_ptr case, release function check should ensure we get one
 	 * referenced PTR_TO_BTF_ID, and that its fixed offset is 0. For the
@@ -7177,8 +7177,8 @@ found:
 						  btf_vmlinux, *arg_btf_id,
 						  strict_type_match)) {
 				verbose(env, "R%d is of type %s but %s is expected\n",
-					regno, kernel_type_name(reg->btf, reg->btf_id),
-					kernel_type_name(btf_vmlinux, *arg_btf_id));
+					regno, btf_type_name(reg->btf, reg->btf_id),
+					btf_type_name(btf_vmlinux, *arg_btf_id));
 				return -EACCES;
 			}
 		}
@@ -7248,7 +7248,7 @@ int check_func_arg_reg_off(struct bpf_verifier_env *env,
 			verbose(env, "R%d must have zero offset when passed to release func\n",
 				regno);
 			verbose(env, "No graph node or root found at R%d type:%s off:%d\n", regno,
-				kernel_type_name(reg->btf, reg->btf_id), reg->off);
+				btf_type_name(reg->btf, reg->btf_id), reg->off);
 			return -EINVAL;
 		}
 
-- 
cgit v1.2.3


From a4aa38897b6a6dad4318bed036edc7ed0c8a4578 Mon Sep 17 00:00:00 2001
From: Dave Marchevsky <davemarchevsky@fb.com>
Date: Thu, 9 Mar 2023 10:01:07 -0800
Subject: bpf: btf: Remove unused btf_field_info_type enum

This enum was added and used in commit aa3496accc41 ("bpf: Refactor kptr_off_tab
into btf_record"). Later refactoring in commit db559117828d ("bpf: Consolidate
spin_lock, timer management into btf_record") resulted in the enum
values no longer being used anywhere.

Let's remove them.

Signed-off-by: Dave Marchevsky <davemarchevsky@fb.com>
Link: https://lore.kernel.org/r/20230309180111.1618459-3-davemarchevsky@fb.com
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 kernel/bpf/btf.c | 6 ------
 1 file changed, 6 deletions(-)

(limited to 'kernel')

diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c
index 71758cd15b07..37779ceefd09 100644
--- a/kernel/bpf/btf.c
+++ b/kernel/bpf/btf.c
@@ -3231,12 +3231,6 @@ static void btf_struct_log(struct btf_verifier_env *env,
 	btf_verifier_log(env, "size=%u vlen=%u", t->size, btf_type_vlen(t));
 }
 
-enum btf_field_info_type {
-	BTF_FIELD_SPIN_LOCK,
-	BTF_FIELD_TIMER,
-	BTF_FIELD_KPTR,
-};
-
 enum {
 	BTF_FIELD_IGNORE = 0,
 	BTF_FIELD_FOUND  = 1,
-- 
cgit v1.2.3


From 74843b57ec70af7b67b7e6153374834ee18d139f Mon Sep 17 00:00:00 2001
From: Dave Marchevsky <davemarchevsky@fb.com>
Date: Thu, 9 Mar 2023 10:01:08 -0800
Subject: bpf: Change btf_record_find enum parameter to field_mask

btf_record_find's 3rd parameter can be multiple enum btf_field_type's
masked together. The function is called with BPF_KPTR in two places in
verifier.c, so it works with masked values already.

Signed-off-by: Dave Marchevsky <davemarchevsky@fb.com>
Link: https://lore.kernel.org/r/20230309180111.1618459-4-davemarchevsky@fb.com
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 include/linux/bpf.h  | 2 +-
 kernel/bpf/syscall.c | 6 +++---
 2 files changed, 4 insertions(+), 4 deletions(-)

(limited to 'kernel')

diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index e64ff1e89fb2..3a38db315f7f 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -1925,7 +1925,7 @@ void bpf_prog_free_id(struct bpf_prog *prog);
 void bpf_map_free_id(struct bpf_map *map);
 
 struct btf_field *btf_record_find(const struct btf_record *rec,
-				  u32 offset, enum btf_field_type type);
+				  u32 offset, u32 field_mask);
 void btf_record_free(struct btf_record *rec);
 void bpf_map_free_record(struct bpf_map *map);
 struct btf_record *btf_record_dup(const struct btf_record *rec);
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index f406dfa13792..cc4b7684910c 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -520,14 +520,14 @@ static int btf_field_cmp(const void *a, const void *b)
 }
 
 struct btf_field *btf_record_find(const struct btf_record *rec, u32 offset,
-				  enum btf_field_type type)
+				  u32 field_mask)
 {
 	struct btf_field *field;
 
-	if (IS_ERR_OR_NULL(rec) || !(rec->field_mask & type))
+	if (IS_ERR_OR_NULL(rec) || !(rec->field_mask & field_mask))
 		return NULL;
 	field = bsearch(&offset, rec->fields, rec->cnt, sizeof(rec->fields[0]), btf_field_cmp);
-	if (!field || !(field->type & type))
+	if (!field || !(field->type & field_mask))
 		return NULL;
 	return field;
 }
-- 
cgit v1.2.3


From c8e18754091479fac3f5b6c053c6bc4be0b7fb11 Mon Sep 17 00:00:00 2001
From: Dave Marchevsky <davemarchevsky@fb.com>
Date: Fri, 10 Mar 2023 15:07:41 -0800
Subject: bpf: Support __kptr to local kptrs

If a PTR_TO_BTF_ID type comes from program BTF - not vmlinux or module
BTF - it must have been allocated by bpf_obj_new and therefore must be
free'd with bpf_obj_drop. Such a PTR_TO_BTF_ID is considered a "local
kptr" and is tagged with MEM_ALLOC type tag by bpf_obj_new.

This patch adds support for treating __kptr-tagged pointers to "local
kptrs" as having an implicit bpf_obj_drop destructor for referenced kptr
acquire / release semantics. Consider the following example:

  struct node_data {
          long key;
          long data;
          struct bpf_rb_node node;
  };

  struct map_value {
          struct node_data __kptr *node;
  };

  struct {
          __uint(type, BPF_MAP_TYPE_ARRAY);
          __type(key, int);
          __type(value, struct map_value);
          __uint(max_entries, 1);
  } some_nodes SEC(".maps");

If struct node_data had a matching definition in kernel BTF, the verifier would
expect a destructor for the type to be registered. Since struct node_data does
not match any type in kernel BTF, the verifier knows that there is no kfunc
that provides a PTR_TO_BTF_ID to this type, and that such a PTR_TO_BTF_ID can
only come from bpf_obj_new. So instead of searching for a registered dtor,
a bpf_obj_drop dtor can be assumed.

This allows the runtime to properly destruct such kptrs in
bpf_obj_free_fields, which enables maps to clean up map_vals w/ such
kptrs when going away.

Implementation notes:
  * "kernel_btf" variable is renamed to "kptr_btf" in btf_parse_kptr.
    Before this patch, the variable would only ever point to vmlinux or
    module BTFs, but now it can point to some program BTF for local kptr
    type. It's later used to populate the (btf, btf_id) pair in kptr btf
    field.
  * It's necessary to btf_get the program BTF when populating btf_field
    for local kptr. btf_record_free later does a btf_put.
  * Behavior for non-local referenced kptrs is not modified, as
    bpf_find_btf_id helper only searches vmlinux and module BTFs for
    matching BTF type. If such a type is found, btf_field_kptr's btf will
    pass btf_is_kernel check, and the associated release function is
    some one-argument dtor. If btf_is_kernel check fails, associated
    release function is two-arg bpf_obj_drop_impl. Before this patch
    only btf_field_kptr's w/ kernel or module BTFs were created.

Signed-off-by: Dave Marchevsky <davemarchevsky@fb.com>
Link: https://lore.kernel.org/r/20230310230743.2320707-2-davemarchevsky@fb.com
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 include/linux/bpf.h  | 11 ++++++++++-
 include/linux/btf.h  |  2 --
 kernel/bpf/btf.c     | 37 ++++++++++++++++++++++++++++---------
 kernel/bpf/helpers.c | 11 ++++++++---
 kernel/bpf/syscall.c | 14 +++++++++++++-
 5 files changed, 59 insertions(+), 16 deletions(-)

(limited to 'kernel')

diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index 3a38db315f7f..756b85f0d0d3 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -189,10 +189,19 @@ enum btf_field_type {
 				 BPF_RB_NODE | BPF_RB_ROOT,
 };
 
+typedef void (*btf_dtor_kfunc_t)(void *);
+typedef void (*btf_dtor_obj_drop)(void *, const struct btf_record *);
+
 struct btf_field_kptr {
 	struct btf *btf;
 	struct module *module;
-	btf_dtor_kfunc_t dtor;
+	union {
+		/* dtor used if btf_is_kernel(btf), otherwise the type
+		 * is program-allocated and obj_drop is used
+		 */
+		btf_dtor_kfunc_t dtor;
+		btf_dtor_obj_drop obj_drop;
+	};
 	u32 btf_id;
 };
 
diff --git a/include/linux/btf.h b/include/linux/btf.h
index 1bba0827e8c4..d53b10cc55f2 100644
--- a/include/linux/btf.h
+++ b/include/linux/btf.h
@@ -121,8 +121,6 @@ struct btf_struct_metas {
 	struct btf_struct_meta types[];
 };
 
-typedef void (*btf_dtor_kfunc_t)(void *);
-
 extern const struct file_operations btf_fops;
 
 void btf_get(struct btf *btf);
diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c
index 37779ceefd09..66fad7a16b6c 100644
--- a/kernel/bpf/btf.c
+++ b/kernel/bpf/btf.c
@@ -3551,12 +3551,17 @@ static int btf_find_field(const struct btf *btf, const struct btf_type *t,
 	return -EINVAL;
 }
 
+extern void __bpf_obj_drop_impl(void *p, const struct btf_record *rec);
+
 static int btf_parse_kptr(const struct btf *btf, struct btf_field *field,
 			  struct btf_field_info *info)
 {
 	struct module *mod = NULL;
 	const struct btf_type *t;
-	struct btf *kernel_btf;
+	/* If a matching btf type is found in kernel or module BTFs, kptr_ref
+	 * is that BTF, otherwise it's program BTF
+	 */
+	struct btf *kptr_btf;
 	int ret;
 	s32 id;
 
@@ -3565,7 +3570,20 @@ static int btf_parse_kptr(const struct btf *btf, struct btf_field *field,
 	 */
 	t = btf_type_by_id(btf, info->kptr.type_id);
 	id = bpf_find_btf_id(__btf_name_by_offset(btf, t->name_off), BTF_INFO_KIND(t->info),
-			     &kernel_btf);
+			     &kptr_btf);
+	if (id == -ENOENT) {
+		/* btf_parse_kptr should only be called w/ btf = program BTF */
+		WARN_ON_ONCE(btf_is_kernel(btf));
+
+		/* Type exists only in program BTF. Assume that it's a MEM_ALLOC
+		 * kptr allocated via bpf_obj_new
+		 */
+		field->kptr.dtor = (void *)&__bpf_obj_drop_impl;
+		id = info->kptr.type_id;
+		kptr_btf = (struct btf *)btf;
+		btf_get(kptr_btf);
+		goto found_dtor;
+	}
 	if (id < 0)
 		return id;
 
@@ -3582,20 +3600,20 @@ static int btf_parse_kptr(const struct btf *btf, struct btf_field *field,
 		 * can be used as a referenced pointer and be stored in a map at
 		 * the same time.
 		 */
-		dtor_btf_id = btf_find_dtor_kfunc(kernel_btf, id);
+		dtor_btf_id = btf_find_dtor_kfunc(kptr_btf, id);
 		if (dtor_btf_id < 0) {
 			ret = dtor_btf_id;
 			goto end_btf;
 		}
 
-		dtor_func = btf_type_by_id(kernel_btf, dtor_btf_id);
+		dtor_func = btf_type_by_id(kptr_btf, dtor_btf_id);
 		if (!dtor_func) {
 			ret = -ENOENT;
 			goto end_btf;
 		}
 
-		if (btf_is_module(kernel_btf)) {
-			mod = btf_try_get_module(kernel_btf);
+		if (btf_is_module(kptr_btf)) {
+			mod = btf_try_get_module(kptr_btf);
 			if (!mod) {
 				ret = -ENXIO;
 				goto end_btf;
@@ -3605,7 +3623,7 @@ static int btf_parse_kptr(const struct btf *btf, struct btf_field *field,
 		/* We already verified dtor_func to be btf_type_is_func
 		 * in register_btf_id_dtor_kfuncs.
 		 */
-		dtor_func_name = __btf_name_by_offset(kernel_btf, dtor_func->name_off);
+		dtor_func_name = __btf_name_by_offset(kptr_btf, dtor_func->name_off);
 		addr = kallsyms_lookup_name(dtor_func_name);
 		if (!addr) {
 			ret = -EINVAL;
@@ -3614,14 +3632,15 @@ static int btf_parse_kptr(const struct btf *btf, struct btf_field *field,
 		field->kptr.dtor = (void *)addr;
 	}
 
+found_dtor:
 	field->kptr.btf_id = id;
-	field->kptr.btf = kernel_btf;
+	field->kptr.btf = kptr_btf;
 	field->kptr.module = mod;
 	return 0;
 end_mod:
 	module_put(mod);
 end_btf:
-	btf_put(kernel_btf);
+	btf_put(kptr_btf);
 	return ret;
 }
 
diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c
index f9b7eeedce08..77d64b6951b9 100644
--- a/kernel/bpf/helpers.c
+++ b/kernel/bpf/helpers.c
@@ -1896,14 +1896,19 @@ __bpf_kfunc void *bpf_obj_new_impl(u64 local_type_id__k, void *meta__ign)
 	return p;
 }
 
+void __bpf_obj_drop_impl(void *p, const struct btf_record *rec)
+{
+	if (rec)
+		bpf_obj_free_fields(rec, p);
+	bpf_mem_free(&bpf_global_ma, p);
+}
+
 __bpf_kfunc void bpf_obj_drop_impl(void *p__alloc, void *meta__ign)
 {
 	struct btf_struct_meta *meta = meta__ign;
 	void *p = p__alloc;
 
-	if (meta)
-		bpf_obj_free_fields(meta->record, p);
-	bpf_mem_free(&bpf_global_ma, p);
+	__bpf_obj_drop_impl(p, meta ? meta->record : NULL);
 }
 
 static void __bpf_list_add(struct bpf_list_node *node, struct bpf_list_head *head, bool tail)
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index cc4b7684910c..0684febc447a 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -659,8 +659,10 @@ void bpf_obj_free_fields(const struct btf_record *rec, void *obj)
 		return;
 	fields = rec->fields;
 	for (i = 0; i < rec->cnt; i++) {
+		struct btf_struct_meta *pointee_struct_meta;
 		const struct btf_field *field = &fields[i];
 		void *field_ptr = obj + field->offset;
+		void *xchgd_field;
 
 		switch (fields[i].type) {
 		case BPF_SPIN_LOCK:
@@ -672,7 +674,17 @@ void bpf_obj_free_fields(const struct btf_record *rec, void *obj)
 			WRITE_ONCE(*(u64 *)field_ptr, 0);
 			break;
 		case BPF_KPTR_REF:
-			field->kptr.dtor((void *)xchg((unsigned long *)field_ptr, 0));
+			xchgd_field = (void *)xchg((unsigned long *)field_ptr, 0);
+			if (!btf_is_kernel(field->kptr.btf)) {
+				pointee_struct_meta = btf_find_struct_meta(field->kptr.btf,
+									   field->kptr.btf_id);
+				WARN_ON_ONCE(!pointee_struct_meta);
+				field->kptr.obj_drop(xchgd_field, pointee_struct_meta ?
+								  pointee_struct_meta->record :
+								  NULL);
+			} else {
+				field->kptr.dtor(xchgd_field);
+			}
 			break;
 		case BPF_LIST_HEAD:
 			if (WARN_ON_ONCE(rec->spin_lock_off < 0))
-- 
cgit v1.2.3


From 738c96d5e2e3700b370f02ac84a9dc159ca81f25 Mon Sep 17 00:00:00 2001
From: Dave Marchevsky <davemarchevsky@fb.com>
Date: Fri, 10 Mar 2023 15:07:42 -0800
Subject: bpf: Allow local kptrs to be exchanged via bpf_kptr_xchg

The previous patch added necessary plumbing for verifier and runtime to
know what to do with non-kernel PTR_TO_BTF_IDs in map values, but didn't
provide any way to get such local kptrs into a map value. This patch
modifies verifier handling of bpf_kptr_xchg to allow MEM_ALLOC kptr
types.

check_reg_type is modified accept MEM_ALLOC-flagged input to
bpf_kptr_xchg despite such types not being in btf_ptr_types. This could
have been done with a MAYBE_MEM_ALLOC equivalent to MAYBE_NULL, but
bpf_kptr_xchg is the only helper that I can forsee using
MAYBE_MEM_ALLOC, so keep it special-cased for now.

The verifier tags bpf_kptr_xchg retval MEM_ALLOC if and only if the BTF
associated with the retval is not kernel BTF.

Signed-off-by: Dave Marchevsky <davemarchevsky@fb.com>
Link: https://lore.kernel.org/r/20230310230743.2320707-3-davemarchevsky@fb.com
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 kernel/bpf/verifier.c | 8 +++++++-
 1 file changed, 7 insertions(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index 073c56dbec63..519d465407ef 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -7123,6 +7123,9 @@ static int check_reg_type(struct bpf_verifier_env *env, u32 regno,
 	if (arg_type & PTR_MAYBE_NULL)
 		type &= ~PTR_MAYBE_NULL;
 
+	if (meta->func_id == BPF_FUNC_kptr_xchg && type & MEM_ALLOC)
+		type &= ~MEM_ALLOC;
+
 	for (i = 0; i < ARRAY_SIZE(compatible->types); i++) {
 		expected = compatible->types[i];
 		if (expected == NOT_INIT)
@@ -7185,7 +7188,8 @@ found:
 		break;
 	}
 	case PTR_TO_BTF_ID | MEM_ALLOC:
-		if (meta->func_id != BPF_FUNC_spin_lock && meta->func_id != BPF_FUNC_spin_unlock) {
+		if (meta->func_id != BPF_FUNC_spin_lock && meta->func_id != BPF_FUNC_spin_unlock &&
+		    meta->func_id != BPF_FUNC_kptr_xchg) {
 			verbose(env, "verifier internal error: unimplemented handling of MEM_ALLOC\n");
 			return -EFAULT;
 		}
@@ -9151,6 +9155,8 @@ static int check_helper_call(struct bpf_verifier_env *env, struct bpf_insn *insn
 		if (func_id == BPF_FUNC_kptr_xchg) {
 			ret_btf = meta.kptr_field->kptr.btf;
 			ret_btf_id = meta.kptr_field->kptr.btf_id;
+			if (!btf_is_kernel(ret_btf))
+				regs[BPF_REG_0].type |= MEM_ALLOC;
 		} else {
 			if (fn->ret_btf_id == BPF_PTR_POISON) {
 				verbose(env, "verifier internal error:");
-- 
cgit v1.2.3


From 34f0677e7afd3a292bc1aadda7ce8e35faedb204 Mon Sep 17 00:00:00 2001
From: Andrii Nakryiko <andrii@kernel.org>
Date: Mon, 13 Mar 2023 11:40:17 -0700
Subject: bpf: fix precision propagation verbose logging

Fix wrong order of frame index vs register/slot index in precision
propagation verbose (level 2) output. It's wrong and very confusing as is.

Fixes: 529409ea92d5 ("bpf: propagate precision across all frames, not just the last one")
Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
Link: https://lore.kernel.org/r/20230313184017.4083374-1-andrii@kernel.org
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 kernel/bpf/verifier.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'kernel')

diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index 519d465407ef..883d4ff2e288 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -15068,7 +15068,7 @@ static int propagate_precision(struct bpf_verifier_env *env,
 			    !(state_reg->live & REG_LIVE_READ))
 				continue;
 			if (env->log.level & BPF_LOG_LEVEL2)
-				verbose(env, "frame %d: propagating r%d\n", i, fr);
+				verbose(env, "frame %d: propagating r%d\n", fr, i);
 			err = mark_chain_precision_frame(env, fr, i);
 			if (err < 0)
 				return err;
@@ -15084,7 +15084,7 @@ static int propagate_precision(struct bpf_verifier_env *env,
 				continue;
 			if (env->log.level & BPF_LOG_LEVEL2)
 				verbose(env, "frame %d: propagating fp%d\n",
-					(-i - 1) * BPF_REG_SIZE, fr);
+					fr, (-i - 1) * BPF_REG_SIZE);
 			err = mark_chain_precision_stack_frame(env, fr, i);
 			if (err < 0)
 				return err;
-- 
cgit v1.2.3


From 9e36a204bd43553a9cd4bd574612cd9a5df791ea Mon Sep 17 00:00:00 2001
From: Dave Marchevsky <davemarchevsky@fb.com>
Date: Mon, 13 Mar 2023 14:46:41 -0700
Subject: bpf: Disable migration when freeing stashed local kptr using obj drop

When a local kptr is stashed in a map and freed when the map goes away,
currently an error like the below appears:

[   39.195695] BUG: using smp_processor_id() in preemptible [00000000] code: kworker/u32:15/2875
[   39.196549] caller is bpf_mem_free+0x56/0xc0
[   39.196958] CPU: 15 PID: 2875 Comm: kworker/u32:15 Tainted: G           O       6.2.0-13016-g22df776a9a86 #4477
[   39.197897] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS rel-1.12.0-59-gc9ba5276e321-prebuilt.qemu.org 04/01/2014
[   39.198949] Workqueue: events_unbound bpf_map_free_deferred
[   39.199470] Call Trace:
[   39.199703]  <TASK>
[   39.199911]  dump_stack_lvl+0x60/0x70
[   39.200267]  check_preemption_disabled+0xbf/0xe0
[   39.200704]  bpf_mem_free+0x56/0xc0
[   39.201032]  ? bpf_obj_new_impl+0xa0/0xa0
[   39.201430]  bpf_obj_free_fields+0x1cd/0x200
[   39.201838]  array_map_free+0xad/0x220
[   39.202193]  ? finish_task_switch+0xe5/0x3c0
[   39.202614]  bpf_map_free_deferred+0xea/0x210
[   39.203006]  ? lockdep_hardirqs_on_prepare+0xe/0x220
[   39.203460]  process_one_work+0x64f/0xbe0
[   39.203822]  ? pwq_dec_nr_in_flight+0x110/0x110
[   39.204264]  ? do_raw_spin_lock+0x107/0x1c0
[   39.204662]  ? lockdep_hardirqs_on_prepare+0xe/0x220
[   39.205107]  worker_thread+0x74/0x7a0
[   39.205451]  ? process_one_work+0xbe0/0xbe0
[   39.205818]  kthread+0x171/0x1a0
[   39.206111]  ? kthread_complete_and_exit+0x20/0x20
[   39.206552]  ret_from_fork+0x1f/0x30
[   39.206886]  </TASK>

This happens because the call to __bpf_obj_drop_impl I added in the patch
adding support for stashing local kptrs doesn't disable migration. Prior
to that patch, __bpf_obj_drop_impl logic only ran when called by a BPF
progarm, whereas now it can be called from map free path, so it's
necessary to explicitly disable migration.

Also, refactor a bit to just call __bpf_obj_drop_impl directly instead
of bothering w/ dtor union and setting pointer-to-obj_drop.

Fixes: c8e187540914 ("bpf: Support __kptr to local kptrs")
Reported-by: Alexei Starovoitov <ast@kernel.org>
Signed-off-by: Dave Marchevsky <davemarchevsky@fb.com>
Link: https://lore.kernel.org/r/20230313214641.3731908-1-davemarchevsky@fb.com
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 include/linux/bpf.h  | 12 ++++--------
 kernel/bpf/btf.c     |  4 +---
 kernel/bpf/syscall.c | 10 +++++++---
 3 files changed, 12 insertions(+), 14 deletions(-)

(limited to 'kernel')

diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index 756b85f0d0d3..71cc92a4ba48 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -190,18 +190,14 @@ enum btf_field_type {
 };
 
 typedef void (*btf_dtor_kfunc_t)(void *);
-typedef void (*btf_dtor_obj_drop)(void *, const struct btf_record *);
 
 struct btf_field_kptr {
 	struct btf *btf;
 	struct module *module;
-	union {
-		/* dtor used if btf_is_kernel(btf), otherwise the type
-		 * is program-allocated and obj_drop is used
-		 */
-		btf_dtor_kfunc_t dtor;
-		btf_dtor_obj_drop obj_drop;
-	};
+	/* dtor used if btf_is_kernel(btf), otherwise the type is
+	 * program-allocated, dtor is NULL,  and __bpf_obj_drop_impl is used
+	 */
+	btf_dtor_kfunc_t dtor;
 	u32 btf_id;
 };
 
diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c
index 66fad7a16b6c..b7e5a5510b91 100644
--- a/kernel/bpf/btf.c
+++ b/kernel/bpf/btf.c
@@ -3551,8 +3551,6 @@ static int btf_find_field(const struct btf *btf, const struct btf_type *t,
 	return -EINVAL;
 }
 
-extern void __bpf_obj_drop_impl(void *p, const struct btf_record *rec);
-
 static int btf_parse_kptr(const struct btf *btf, struct btf_field *field,
 			  struct btf_field_info *info)
 {
@@ -3578,7 +3576,7 @@ static int btf_parse_kptr(const struct btf *btf, struct btf_field *field,
 		/* Type exists only in program BTF. Assume that it's a MEM_ALLOC
 		 * kptr allocated via bpf_obj_new
 		 */
-		field->kptr.dtor = (void *)&__bpf_obj_drop_impl;
+		field->kptr.dtor = NULL;
 		id = info->kptr.type_id;
 		kptr_btf = (struct btf *)btf;
 		btf_get(kptr_btf);
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index 0684febc447a..5b88301a2ae0 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -650,6 +650,8 @@ void bpf_obj_free_timer(const struct btf_record *rec, void *obj)
 	bpf_timer_cancel_and_free(obj + rec->timer_off);
 }
 
+extern void __bpf_obj_drop_impl(void *p, const struct btf_record *rec);
+
 void bpf_obj_free_fields(const struct btf_record *rec, void *obj)
 {
 	const struct btf_field *fields;
@@ -679,9 +681,11 @@ void bpf_obj_free_fields(const struct btf_record *rec, void *obj)
 				pointee_struct_meta = btf_find_struct_meta(field->kptr.btf,
 									   field->kptr.btf_id);
 				WARN_ON_ONCE(!pointee_struct_meta);
-				field->kptr.obj_drop(xchgd_field, pointee_struct_meta ?
-								  pointee_struct_meta->record :
-								  NULL);
+				migrate_disable();
+				__bpf_obj_drop_impl(xchgd_field, pointee_struct_meta ?
+								 pointee_struct_meta->record :
+								 NULL);
+				migrate_enable();
 			} else {
 				field->kptr.dtor(xchgd_field);
 			}
-- 
cgit v1.2.3


From c9267aa8b794c2188d49c7d7bd2990e98b2d6b84 Mon Sep 17 00:00:00 2001
From: Alexei Starovoitov <ast@kernel.org>
Date: Mon, 13 Mar 2023 16:58:43 -0700
Subject: bpf: Fix bpf_strncmp proto.

bpf_strncmp() doesn't write into its first argument.
Make sure that the verifier knows about it.

Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Acked-by: David Vernet <void@manifault.com>
Link: https://lore.kernel.org/r/20230313235845.61029-2-alexei.starovoitov@gmail.com
Signed-off-by: Martin KaFai Lau <martin.lau@kernel.org>
---
 kernel/bpf/helpers.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c
index 77d64b6951b9..f753676ef652 100644
--- a/kernel/bpf/helpers.c
+++ b/kernel/bpf/helpers.c
@@ -571,7 +571,7 @@ static const struct bpf_func_proto bpf_strncmp_proto = {
 	.func		= bpf_strncmp,
 	.gpl_only	= false,
 	.ret_type	= RET_INTEGER,
-	.arg1_type	= ARG_PTR_TO_MEM,
+	.arg1_type	= ARG_PTR_TO_MEM | MEM_RDONLY,
 	.arg2_type	= ARG_CONST_SIZE,
 	.arg3_type	= ARG_PTR_TO_CONST_STR,
 };
-- 
cgit v1.2.3


From 3e30be4288b31702d4898487a74e80ba14150a9f Mon Sep 17 00:00:00 2001
From: Alexei Starovoitov <ast@kernel.org>
Date: Mon, 13 Mar 2023 16:58:44 -0700
Subject: bpf: Allow helpers access trusted PTR_TO_BTF_ID.

The verifier rejects the code:
  bpf_strncmp(task->comm, 16, "my_task");
with the message:
  16: (85) call bpf_strncmp#182
  R1 type=trusted_ptr_ expected=fp, pkt, pkt_meta, map_key, map_value, mem, ringbuf_mem, buf

Teach the verifier that such access pattern is safe.
Do not allow untrusted and legacy ptr_to_btf_id to be passed into helpers.

Reported-by: David Vernet <void@manifault.com>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Acked-by: David Vernet <void@manifault.com>
Link: https://lore.kernel.org/r/20230313235845.61029-3-alexei.starovoitov@gmail.com
Signed-off-by: Martin KaFai Lau <martin.lau@kernel.org>
---
 kernel/bpf/verifier.c | 15 +++++++++++++++
 1 file changed, 15 insertions(+)

(limited to 'kernel')

diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index 883d4ff2e288..2bbd89279070 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -6303,6 +6303,9 @@ static int check_helper_mem_access(struct bpf_verifier_env *env, int regno,
 				env,
 				regno, reg->off, access_size,
 				zero_size_allowed, ACCESS_HELPER, meta);
+	case PTR_TO_BTF_ID:
+		return check_ptr_to_btf_access(env, regs, regno, reg->off,
+					       access_size, BPF_READ, -1);
 	case PTR_TO_CTX:
 		/* in case the function doesn't know how to access the context,
 		 * (because we are in a program of type SYSCALL for example), we
@@ -7014,6 +7017,7 @@ static const struct bpf_reg_types mem_types = {
 		PTR_TO_MEM,
 		PTR_TO_MEM | MEM_RINGBUF,
 		PTR_TO_BUF,
+		PTR_TO_BTF_ID | PTR_TRUSTED,
 	},
 };
 
@@ -7145,6 +7149,17 @@ found:
 	if (base_type(reg->type) != PTR_TO_BTF_ID)
 		return 0;
 
+	if (compatible == &mem_types) {
+		if (!(arg_type & MEM_RDONLY)) {
+			verbose(env,
+				"%s() may write into memory pointed by R%d type=%s\n",
+				func_id_name(meta->func_id),
+				regno, reg_type_str(env, reg->type));
+			return -EACCES;
+		}
+		return 0;
+	}
+
 	switch ((int)reg->type) {
 	case PTR_TO_BTF_ID:
 	case PTR_TO_BTF_ID | PTR_TRUSTED:
-- 
cgit v1.2.3


From b8a2e3f93d412114a1539ea97b59b3e6ed6e1f9a Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Tue, 14 Mar 2023 11:59:49 -1000
Subject: cgroup: Make current_cgns_cgroup_dfl() safe to call after
 exit_task_namespace()

The commit 332ea1f697be ("bpf: Add bpf_cgroup_from_id() kfunc") added
bpf_cgroup_from_id() which calls current_cgns_cgroup_dfl() through
cgroup_get_from_id(). However, BPF programs may be attached to a point where
current->nsproxy has already been cleared to NULL by exit_task_namespace()
and calling bpf_cgroup_from_id() would cause an oops.

Just return the system-wide root if nsproxy has been cleared. This allows
all cgroups to be looked up after the task passed through
exit_task_namespace(), which semantically makes sense. Given that the only
way to get this behavior is through BPF programs, it seems safe but let's
see what others think.

Fixes: 332ea1f697be ("bpf: Add bpf_cgroup_from_id() kfunc")
Signed-off-by: Tejun Heo <tj@kernel.org>
Link: https://lore.kernel.org/r/ZBDuVWiFj2jiz3i8@slm.duckdns.org
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 kernel/cgroup/cgroup.c | 14 ++++++++++++--
 1 file changed, 12 insertions(+), 2 deletions(-)

(limited to 'kernel')

diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c
index 935e8121b21e..8a5294f4ce72 100644
--- a/kernel/cgroup/cgroup.c
+++ b/kernel/cgroup/cgroup.c
@@ -1465,8 +1465,18 @@ static struct cgroup *current_cgns_cgroup_dfl(void)
 {
 	struct css_set *cset;
 
-	cset = current->nsproxy->cgroup_ns->root_cset;
-	return __cset_cgroup_from_root(cset, &cgrp_dfl_root);
+	if (current->nsproxy) {
+		cset = current->nsproxy->cgroup_ns->root_cset;
+		return __cset_cgroup_from_root(cset, &cgrp_dfl_root);
+	} else {
+		/*
+		 * NOTE: This function may be called from bpf_cgroup_from_id()
+		 * on a task which has already passed exit_task_namespaces() and
+		 * nsproxy == NULL. Fall back to cgrp_dfl_root which will make all
+		 * cgroups visible for lookups.
+		 */
+		return &cgrp_dfl_root.cgrp;
+	}
 }
 
 /* look up cgroup associated with given css_set on the specified hierarchy */
-- 
cgit v1.2.3


From 31bf1dbccfb0a9861d4846755096b3fff5687f8a Mon Sep 17 00:00:00 2001
From: Viktor Malik <vmalik@redhat.com>
Date: Fri, 10 Mar 2023 08:40:59 +0100
Subject: bpf: Fix attaching fentry/fexit/fmod_ret/lsm to modules

This resolves two problems with attachment of fentry/fexit/fmod_ret/lsm
to functions located in modules:

1. The verifier tries to find the address to attach to in kallsyms. This
   is always done by searching the entire kallsyms, not respecting the
   module in which the function is located. Such approach causes an
   incorrect attachment address to be computed if the function to attach
   to is shadowed by a function of the same name located earlier in
   kallsyms.

2. If the address to attach to is located in a module, the module
   reference is only acquired in register_fentry. If the module is
   unloaded between the place where the address is found
   (bpf_check_attach_target in the verifier) and register_fentry, it is
   possible that another module is loaded to the same address which may
   lead to potential errors.

Since the attachment must contain the BTF of the program to attach to,
we extract the module from it and search for the function address in the
correct module (resolving problem no. 1). Then, the module reference is
taken directly in bpf_check_attach_target and stored in the bpf program
(in bpf_prog_aux). The reference is only released when the program is
unloaded (resolving problem no. 2).

Signed-off-by: Viktor Malik <vmalik@redhat.com>
Acked-by: Jiri Olsa <jolsa@kernel.org>
Reviewed-by: Luis Chamberlain <mcgrof@kernel.org>
Link: https://lore.kernel.org/r/3f6a9d8ae850532b5ef864ef16327b0f7a669063.1678432753.git.vmalik@redhat.com
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 include/linux/bpf.h      |  2 ++
 kernel/bpf/syscall.c     |  6 ++++++
 kernel/bpf/trampoline.c  | 28 ----------------------------
 kernel/bpf/verifier.c    | 18 +++++++++++++++++-
 kernel/module/internal.h |  5 +++++
 5 files changed, 30 insertions(+), 29 deletions(-)

(limited to 'kernel')

diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index 71cc92a4ba48..3ef98fb92987 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -1103,6 +1103,7 @@ struct bpf_trampoline {
 struct bpf_attach_target_info {
 	struct btf_func_model fmodel;
 	long tgt_addr;
+	struct module *tgt_mod;
 	const char *tgt_name;
 	const struct btf_type *tgt_type;
 };
@@ -1406,6 +1407,7 @@ struct bpf_prog_aux {
 	 * main prog always has linfo_idx == 0
 	 */
 	u32 linfo_idx;
+	struct module *mod;
 	u32 num_exentries;
 	struct exception_table_entry *extable;
 	union {
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index 5b88301a2ae0..099e9068bcdd 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -2067,6 +2067,7 @@ static void __bpf_prog_put_noref(struct bpf_prog *prog, bool deferred)
 {
 	bpf_prog_kallsyms_del_all(prog);
 	btf_put(prog->aux->btf);
+	module_put(prog->aux->mod);
 	kvfree(prog->aux->jited_linfo);
 	kvfree(prog->aux->linfo);
 	kfree(prog->aux->kfunc_tab);
@@ -3113,6 +3114,11 @@ static int bpf_tracing_prog_attach(struct bpf_prog *prog,
 		if (err)
 			goto out_unlock;
 
+		if (tgt_info.tgt_mod) {
+			module_put(prog->aux->mod);
+			prog->aux->mod = tgt_info.tgt_mod;
+		}
+
 		tr = bpf_trampoline_get(key, &tgt_info);
 		if (!tr) {
 			err = -ENOMEM;
diff --git a/kernel/bpf/trampoline.c b/kernel/bpf/trampoline.c
index d0ed7d6f5eec..f61d5138b12b 100644
--- a/kernel/bpf/trampoline.c
+++ b/kernel/bpf/trampoline.c
@@ -9,7 +9,6 @@
 #include <linux/btf.h>
 #include <linux/rcupdate_trace.h>
 #include <linux/rcupdate_wait.h>
-#include <linux/module.h>
 #include <linux/static_call.h>
 #include <linux/bpf_verifier.h>
 #include <linux/bpf_lsm.h>
@@ -172,26 +171,6 @@ out:
 	return tr;
 }
 
-static int bpf_trampoline_module_get(struct bpf_trampoline *tr)
-{
-	struct module *mod;
-	int err = 0;
-
-	preempt_disable();
-	mod = __module_text_address((unsigned long) tr->func.addr);
-	if (mod && !try_module_get(mod))
-		err = -ENOENT;
-	preempt_enable();
-	tr->mod = mod;
-	return err;
-}
-
-static void bpf_trampoline_module_put(struct bpf_trampoline *tr)
-{
-	module_put(tr->mod);
-	tr->mod = NULL;
-}
-
 static int unregister_fentry(struct bpf_trampoline *tr, void *old_addr)
 {
 	void *ip = tr->func.addr;
@@ -202,8 +181,6 @@ static int unregister_fentry(struct bpf_trampoline *tr, void *old_addr)
 	else
 		ret = bpf_arch_text_poke(ip, BPF_MOD_CALL, old_addr, NULL);
 
-	if (!ret)
-		bpf_trampoline_module_put(tr);
 	return ret;
 }
 
@@ -238,9 +215,6 @@ static int register_fentry(struct bpf_trampoline *tr, void *new_addr)
 		tr->func.ftrace_managed = true;
 	}
 
-	if (bpf_trampoline_module_get(tr))
-		return -ENOENT;
-
 	if (tr->func.ftrace_managed) {
 		ftrace_set_filter_ip(tr->fops, (unsigned long)ip, 0, 1);
 		ret = register_ftrace_direct_multi(tr->fops, (long)new_addr);
@@ -248,8 +222,6 @@ static int register_fentry(struct bpf_trampoline *tr, void *new_addr)
 		ret = bpf_arch_text_poke(ip, BPF_MOD_CALL, NULL, new_addr);
 	}
 
-	if (ret)
-		bpf_trampoline_module_put(tr);
 	return ret;
 }
 
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index 2bbd89279070..60793f793ca6 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -24,6 +24,7 @@
 #include <linux/bpf_lsm.h>
 #include <linux/btf_ids.h>
 #include <linux/poison.h>
+#include "../module/internal.h"
 
 #include "disasm.h"
 
@@ -18307,6 +18308,7 @@ int bpf_check_attach_target(struct bpf_verifier_log *log,
 	const char *tname;
 	struct btf *btf;
 	long addr = 0;
+	struct module *mod = NULL;
 
 	if (!btf_id) {
 		bpf_log(log, "Tracing programs must provide btf_id\n");
@@ -18480,8 +18482,17 @@ int bpf_check_attach_target(struct bpf_verifier_log *log,
 			else
 				addr = (long) tgt_prog->aux->func[subprog]->bpf_func;
 		} else {
-			addr = kallsyms_lookup_name(tname);
+			if (btf_is_module(btf)) {
+				mod = btf_try_get_module(btf);
+				if (mod)
+					addr = find_kallsyms_symbol_value(mod, tname);
+				else
+					addr = 0;
+			} else {
+				addr = kallsyms_lookup_name(tname);
+			}
 			if (!addr) {
+				module_put(mod);
 				bpf_log(log,
 					"The address of function %s cannot be found\n",
 					tname);
@@ -18521,11 +18532,13 @@ int bpf_check_attach_target(struct bpf_verifier_log *log,
 				break;
 			}
 			if (ret) {
+				module_put(mod);
 				bpf_log(log, "%s is not sleepable\n", tname);
 				return ret;
 			}
 		} else if (prog->expected_attach_type == BPF_MODIFY_RETURN) {
 			if (tgt_prog) {
+				module_put(mod);
 				bpf_log(log, "can't modify return codes of BPF programs\n");
 				return -EINVAL;
 			}
@@ -18534,6 +18547,7 @@ int bpf_check_attach_target(struct bpf_verifier_log *log,
 			    !check_attach_modify_return(addr, tname))
 				ret = 0;
 			if (ret) {
+				module_put(mod);
 				bpf_log(log, "%s() is not modifiable\n", tname);
 				return ret;
 			}
@@ -18544,6 +18558,7 @@ int bpf_check_attach_target(struct bpf_verifier_log *log,
 	tgt_info->tgt_addr = addr;
 	tgt_info->tgt_name = tname;
 	tgt_info->tgt_type = t;
+	tgt_info->tgt_mod = mod;
 	return 0;
 }
 
@@ -18623,6 +18638,7 @@ static int check_attach_btf_id(struct bpf_verifier_env *env)
 	/* store info about the attachment target that will be used later */
 	prog->aux->attach_func_proto = tgt_info.tgt_type;
 	prog->aux->attach_func_name = tgt_info.tgt_name;
+	prog->aux->mod = tgt_info.tgt_mod;
 
 	if (tgt_prog) {
 		prog->aux->saved_dst_prog_type = tgt_prog->type;
diff --git a/kernel/module/internal.h b/kernel/module/internal.h
index 2e2bf236f558..5c9170f9135c 100644
--- a/kernel/module/internal.h
+++ b/kernel/module/internal.h
@@ -256,6 +256,11 @@ static inline bool sect_empty(const Elf_Shdr *sect)
 static inline void init_build_id(struct module *mod, const struct load_info *info) { }
 static inline void layout_symtab(struct module *mod, struct load_info *info) { }
 static inline void add_kallsyms(struct module *mod, const struct load_info *info) { }
+static inline unsigned long find_kallsyms_symbol_value(struct module *mod,
+						       const char *name)
+{
+	return 0;
+}
 #endif /* CONFIG_KALLSYMS */
 
 #ifdef CONFIG_SYSFS
-- 
cgit v1.2.3


From 77473d1a962f3d4f7ba48324502b6d27b8ef2591 Mon Sep 17 00:00:00 2001
From: David Vernet <void@manifault.com>
Date: Thu, 16 Mar 2023 00:40:24 -0500
Subject: bpf: Free struct bpf_cpumask in call_rcu handler

The struct bpf_cpumask type uses the bpf_mem_cache_{alloc,free}() APIs
to allocate and free its cpumasks. The bpf_mem allocator may currently
immediately reuse some memory when its freed, without waiting for an RCU
read cycle to elapse. We want to be able to treat struct bpf_cpumask
objects as completely RCU safe.

This is necessary for two reasons:

1. bpf_cpumask_kptr_get() currently does an RCU-protected
   refcnt_inc_not_zero(). This of course assumes that the underlying
   memory is not reused, and is therefore unsafe in its current form.

2. We want to be able to get rid of bpf_cpumask_kptr_get() entirely, and
   intead use the superior kptr RCU semantics now afforded by the
   verifier.

This patch fixes (1), and enables (2), by making struct bpf_cpumask RCU
safe. A subsequent patch will update the verifier to allow struct
bpf_cpumask * pointers to be passed to KF_RCU kfuncs, and then a latter
patch will remove bpf_cpumask_kptr_get().

Fixes: 516f4d3397c9 ("bpf: Enable cpumasks to be queried and used as kptrs")
Signed-off-by: David Vernet <void@manifault.com>
Link: https://lore.kernel.org/r/20230316054028.88924-2-void@manifault.com
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 kernel/bpf/cpumask.c | 19 ++++++++++++++-----
 1 file changed, 14 insertions(+), 5 deletions(-)

(limited to 'kernel')

diff --git a/kernel/bpf/cpumask.c b/kernel/bpf/cpumask.c
index b6587ec40f1b..98eea62b6b7b 100644
--- a/kernel/bpf/cpumask.c
+++ b/kernel/bpf/cpumask.c
@@ -9,6 +9,7 @@
 /**
  * struct bpf_cpumask - refcounted BPF cpumask wrapper structure
  * @cpumask:	The actual cpumask embedded in the struct.
+ * @rcu:	The RCU head used to free the cpumask with RCU safety.
  * @usage:	Object reference counter. When the refcount goes to 0, the
  *		memory is released back to the BPF allocator, which provides
  *		RCU safety.
@@ -24,6 +25,7 @@
  */
 struct bpf_cpumask {
 	cpumask_t cpumask;
+	struct rcu_head rcu;
 	refcount_t usage;
 };
 
@@ -108,6 +110,16 @@ __bpf_kfunc struct bpf_cpumask *bpf_cpumask_kptr_get(struct bpf_cpumask **cpumas
 	return cpumask;
 }
 
+static void cpumask_free_cb(struct rcu_head *head)
+{
+	struct bpf_cpumask *cpumask;
+
+	cpumask = container_of(head, struct bpf_cpumask, rcu);
+	migrate_disable();
+	bpf_mem_cache_free(&bpf_cpumask_ma, cpumask);
+	migrate_enable();
+}
+
 /**
  * bpf_cpumask_release() - Release a previously acquired BPF cpumask.
  * @cpumask: The cpumask being released.
@@ -121,11 +133,8 @@ __bpf_kfunc void bpf_cpumask_release(struct bpf_cpumask *cpumask)
 	if (!cpumask)
 		return;
 
-	if (refcount_dec_and_test(&cpumask->usage)) {
-		migrate_disable();
-		bpf_mem_cache_free(&bpf_cpumask_ma, cpumask);
-		migrate_enable();
-	}
+	if (refcount_dec_and_test(&cpumask->usage))
+		call_rcu(&cpumask->rcu, cpumask_free_cb);
 }
 
 /**
-- 
cgit v1.2.3


From 63d2d83d21a6e2c6f019da5b2d5cdabe6d1cb951 Mon Sep 17 00:00:00 2001
From: David Vernet <void@manifault.com>
Date: Thu, 16 Mar 2023 00:40:25 -0500
Subject: bpf: Mark struct bpf_cpumask as rcu protected

struct bpf_cpumask is a BPF-wrapper around the struct cpumask type which
can be instantiated by a BPF program, and then queried as a cpumask in
similar fashion to normal kernel code. The previous patch in this series
makes the type fully RCU safe, so the type can be included in the
rcu_protected_type BTF ID list.

A subsequent patch will remove bpf_cpumask_kptr_get(), as it's no longer
useful now that we can just treat the type as RCU safe by default and do
our own if check.

Signed-off-by: David Vernet <void@manifault.com>
Link: https://lore.kernel.org/r/20230316054028.88924-3-void@manifault.com
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 kernel/bpf/verifier.c | 1 +
 1 file changed, 1 insertion(+)

(limited to 'kernel')

diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index 60793f793ca6..15b5c5c729f9 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -4599,6 +4599,7 @@ static bool in_rcu_cs(struct bpf_verifier_env *env)
 BTF_SET_START(rcu_protected_types)
 BTF_ID(struct, prog_test_ref_kfunc)
 BTF_ID(struct, cgroup)
+BTF_ID(struct, bpf_cpumask)
 BTF_SET_END(rcu_protected_types)
 
 static bool rcu_protected_object(const struct btf *btf, u32 btf_id)
-- 
cgit v1.2.3


From 1b403ce77dfbf234723a91bc411dfb03a0499d6e Mon Sep 17 00:00:00 2001
From: David Vernet <void@manifault.com>
Date: Thu, 16 Mar 2023 00:40:27 -0500
Subject: bpf: Remove bpf_cpumask_kptr_get() kfunc

Now that struct bpf_cpumask is RCU safe, there's no need for this kfunc.
Rather than doing the following:

private(MASK) static struct bpf_cpumask __kptr *global;

int BPF_PROG(prog, s32 cpu, ...)
{
	struct bpf_cpumask *cpumask;

	bpf_rcu_read_lock();
	cpumask = bpf_cpumask_kptr_get(&global);
	if (!cpumask) {
		bpf_rcu_read_unlock();
		return -1;
	}
	bpf_cpumask_setall(cpumask);
	...
	bpf_cpumask_release(cpumask);
	bpf_rcu_read_unlock();
}

Programs can instead simply do (assume same global cpumask):

int BPF_PROG(prog, ...)
{
	struct bpf_cpumask *cpumask;

	bpf_rcu_read_lock();
	cpumask = global;
	if (!cpumask) {
		bpf_rcu_read_unlock();
		return -1;
	}
	bpf_cpumask_setall(cpumask);
	...
	bpf_rcu_read_unlock();
}

In other words, no extra atomic acquire / release, and less boilerplate
code.

This patch removes both the kfunc, as well as its selftests and
documentation.

Signed-off-by: David Vernet <void@manifault.com>
Link: https://lore.kernel.org/r/20230316054028.88924-5-void@manifault.com
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 kernel/bpf/cpumask.c                               | 29 ---------------------
 tools/testing/selftests/bpf/prog_tests/cpumask.c   |  1 -
 tools/testing/selftests/bpf/progs/cpumask_common.h |  1 -
 .../testing/selftests/bpf/progs/cpumask_failure.c  | 24 -----------------
 .../testing/selftests/bpf/progs/cpumask_success.c  | 30 ----------------------
 5 files changed, 85 deletions(-)

(limited to 'kernel')

diff --git a/kernel/bpf/cpumask.c b/kernel/bpf/cpumask.c
index 98eea62b6b7b..db9da2194c1a 100644
--- a/kernel/bpf/cpumask.c
+++ b/kernel/bpf/cpumask.c
@@ -82,34 +82,6 @@ __bpf_kfunc struct bpf_cpumask *bpf_cpumask_acquire(struct bpf_cpumask *cpumask)
 	return cpumask;
 }
 
-/**
- * bpf_cpumask_kptr_get() - Attempt to acquire a reference to a BPF cpumask
- *			    stored in a map.
- * @cpumaskp: A pointer to a BPF cpumask map value.
- *
- * Attempts to acquire a reference to a BPF cpumask stored in a map value. The
- * cpumask returned by this function must either be embedded in a map as a
- * kptr, or freed with bpf_cpumask_release(). This function may return NULL if
- * no BPF cpumask was found in the specified map value.
- */
-__bpf_kfunc struct bpf_cpumask *bpf_cpumask_kptr_get(struct bpf_cpumask **cpumaskp)
-{
-	struct bpf_cpumask *cpumask;
-
-	/* The BPF memory allocator frees memory backing its caches in an RCU
-	 * callback. Thus, we can safely use RCU to ensure that the cpumask is
-	 * safe to read.
-	 */
-	rcu_read_lock();
-
-	cpumask = READ_ONCE(*cpumaskp);
-	if (cpumask && !refcount_inc_not_zero(&cpumask->usage))
-		cpumask = NULL;
-
-	rcu_read_unlock();
-	return cpumask;
-}
-
 static void cpumask_free_cb(struct rcu_head *head)
 {
 	struct bpf_cpumask *cpumask;
@@ -435,7 +407,6 @@ BTF_SET8_START(cpumask_kfunc_btf_ids)
 BTF_ID_FLAGS(func, bpf_cpumask_create, KF_ACQUIRE | KF_RET_NULL)
 BTF_ID_FLAGS(func, bpf_cpumask_release, KF_RELEASE | KF_TRUSTED_ARGS)
 BTF_ID_FLAGS(func, bpf_cpumask_acquire, KF_ACQUIRE | KF_TRUSTED_ARGS)
-BTF_ID_FLAGS(func, bpf_cpumask_kptr_get, KF_ACQUIRE | KF_KPTR_GET | KF_RET_NULL)
 BTF_ID_FLAGS(func, bpf_cpumask_first, KF_RCU)
 BTF_ID_FLAGS(func, bpf_cpumask_first_zero, KF_RCU)
 BTF_ID_FLAGS(func, bpf_cpumask_set_cpu, KF_RCU)
diff --git a/tools/testing/selftests/bpf/prog_tests/cpumask.c b/tools/testing/selftests/bpf/prog_tests/cpumask.c
index 6c0fe23498c7..cdf4acc18e4c 100644
--- a/tools/testing/selftests/bpf/prog_tests/cpumask.c
+++ b/tools/testing/selftests/bpf/prog_tests/cpumask.c
@@ -16,7 +16,6 @@ static const char * const cpumask_success_testcases[] = {
 	"test_copy_any_anyand",
 	"test_insert_leave",
 	"test_insert_remove_release",
-	"test_insert_kptr_get_release",
 	"test_global_mask_rcu",
 };
 
diff --git a/tools/testing/selftests/bpf/progs/cpumask_common.h b/tools/testing/selftests/bpf/progs/cpumask_common.h
index 7623782fbd62..0c5b785a93e4 100644
--- a/tools/testing/selftests/bpf/progs/cpumask_common.h
+++ b/tools/testing/selftests/bpf/progs/cpumask_common.h
@@ -26,7 +26,6 @@ struct array_map {
 struct bpf_cpumask *bpf_cpumask_create(void) __ksym;
 void bpf_cpumask_release(struct bpf_cpumask *cpumask) __ksym;
 struct bpf_cpumask *bpf_cpumask_acquire(struct bpf_cpumask *cpumask) __ksym;
-struct bpf_cpumask *bpf_cpumask_kptr_get(struct bpf_cpumask **cpumask) __ksym;
 u32 bpf_cpumask_first(const struct cpumask *cpumask) __ksym;
 u32 bpf_cpumask_first_zero(const struct cpumask *cpumask) __ksym;
 void bpf_cpumask_set_cpu(u32 cpu, struct bpf_cpumask *cpumask) __ksym;
diff --git a/tools/testing/selftests/bpf/progs/cpumask_failure.c b/tools/testing/selftests/bpf/progs/cpumask_failure.c
index 9f726d55f747..db4f94e72b61 100644
--- a/tools/testing/selftests/bpf/progs/cpumask_failure.c
+++ b/tools/testing/selftests/bpf/progs/cpumask_failure.c
@@ -94,30 +94,6 @@ int BPF_PROG(test_insert_remove_no_release, struct task_struct *task, u64 clone_
 	return 0;
 }
 
-SEC("tp_btf/task_newtask")
-__failure __msg("Unreleased reference")
-int BPF_PROG(test_kptr_get_no_release, struct task_struct *task, u64 clone_flags)
-{
-	struct bpf_cpumask *cpumask;
-	struct __cpumask_map_value *v;
-
-	cpumask = create_cpumask();
-	if (!cpumask)
-		return 0;
-
-	if (cpumask_map_insert(cpumask))
-		return 0;
-
-	v = cpumask_map_value_lookup();
-	if (!v)
-		return 0;
-
-	cpumask = bpf_cpumask_kptr_get(&v->cpumask);
-
-	/* cpumask is never released. */
-	return 0;
-}
-
 SEC("tp_btf/task_newtask")
 __failure __msg("NULL pointer passed to trusted arg0")
 int BPF_PROG(test_cpumask_null, struct task_struct *task, u64 clone_flags)
diff --git a/tools/testing/selftests/bpf/progs/cpumask_success.c b/tools/testing/selftests/bpf/progs/cpumask_success.c
index fe928ff72a06..2fcdd7f68ac7 100644
--- a/tools/testing/selftests/bpf/progs/cpumask_success.c
+++ b/tools/testing/selftests/bpf/progs/cpumask_success.c
@@ -394,36 +394,6 @@ int BPF_PROG(test_insert_remove_release, struct task_struct *task, u64 clone_fla
 	return 0;
 }
 
-SEC("tp_btf/task_newtask")
-int BPF_PROG(test_insert_kptr_get_release, struct task_struct *task, u64 clone_flags)
-{
-	struct bpf_cpumask *cpumask;
-	struct __cpumask_map_value *v;
-
-	cpumask = create_cpumask();
-	if (!cpumask)
-		return 0;
-
-	if (cpumask_map_insert(cpumask)) {
-		err = 3;
-		return 0;
-	}
-
-	v = cpumask_map_value_lookup();
-	if (!v) {
-		err = 4;
-		return 0;
-	}
-
-	cpumask = bpf_cpumask_kptr_get(&v->cpumask);
-	if (cpumask)
-		bpf_cpumask_release(cpumask);
-	else
-		err = 5;
-
-	return 0;
-}
-
 SEC("tp_btf/task_newtask")
 int BPF_PROG(test_global_mask_rcu, struct task_struct *task, u64 clone_flags)
 {
-- 
cgit v1.2.3


From 082cdc69a4651dd2a77539d69416a359ed1214f5 Mon Sep 17 00:00:00 2001
From: Luis Gerhorst <gerhorst@cs.fau.de>
Date: Wed, 15 Mar 2023 17:54:00 +0100
Subject: bpf: Remove misleading spec_v1 check on var-offset stack read

For every BPF_ADD/SUB involving a pointer, adjust_ptr_min_max_vals()
ensures that the resulting pointer has a constant offset if
bypass_spec_v1 is false. This is ensured by calling sanitize_check_bounds()
which in turn calls check_stack_access_for_ptr_arithmetic(). There,
-EACCESS is returned if the register's offset is not constant, thereby
rejecting the program.

In summary, an unprivileged user must never be able to create stack
pointers with a variable offset. That is also the case, because a
respective check in check_stack_write() is missing. If they were able
to create a variable-offset pointer, users could still use it in a
stack-write operation to trigger unsafe speculative behavior [1].

Because unprivileged users must already be prevented from creating
variable-offset stack pointers, viable options are to either remove
this check (replacing it with a clarifying comment), or to turn it
into a "verifier BUG"-message, also adding a similar check in
check_stack_write() (for consistency, as a second-level defense).
This patch implements the first option to reduce verifier bloat.

This check was introduced by commit 01f810ace9ed ("bpf: Allow
variable-offset stack access") which correctly notes that
"variable-offset reads and writes are disallowed (they were already
disallowed for the indirect access case) because the speculative
execution checking code doesn't support them". However, it does not
further discuss why the check in check_stack_read() is necessary.
The code which made this check obsolete was also introduced in this
commit.

I have compiled ~650 programs from the Linux selftests, Linux samples,
Cilium, and libbpf/examples projects and confirmed that none of these
trigger the check in check_stack_read() [2]. Instead, all of these
programs are, as expected, already rejected when constructing the
variable-offset pointers. Note that the check in
check_stack_access_for_ptr_arithmetic() also prints "off=%d" while the
code removed by this patch does not (the error removed does not appear
in the "verification_error" values). For reproducibility, the
repository linked includes the raw data and scripts used to create
the plot.

  [1] https://arxiv.org/pdf/1807.03757.pdf
  [2] https://gitlab.cs.fau.de/un65esoq/bpf-spectre/-/raw/53dc19fcf459c186613b1156a81504b39c8d49db/data/plots/23-02-26_23-56_bpftool/bpftool/0004-errors.pdf?inline=false

Fixes: 01f810ace9ed ("bpf: Allow variable-offset stack access")
Signed-off-by: Luis Gerhorst <gerhorst@cs.fau.de>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Acked-by: Daniel Borkmann <daniel@iogearbox.net>
Link: https://lore.kernel.org/bpf/20230315165358.23701-1-gerhorst@cs.fau.de
---
 kernel/bpf/verifier.c | 16 ++++++----------
 1 file changed, 6 insertions(+), 10 deletions(-)

(limited to 'kernel')

diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index 15b5c5c729f9..d62b7127ff2a 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -4303,17 +4303,13 @@ static int check_stack_read(struct bpf_verifier_env *env,
 	}
 	/* Variable offset is prohibited for unprivileged mode for simplicity
 	 * since it requires corresponding support in Spectre masking for stack
-	 * ALU. See also retrieve_ptr_limit().
+	 * ALU. See also retrieve_ptr_limit(). The check in
+	 * check_stack_access_for_ptr_arithmetic() called by
+	 * adjust_ptr_min_max_vals() prevents users from creating stack pointers
+	 * with variable offsets, therefore no check is required here. Further,
+	 * just checking it here would be insufficient as speculative stack
+	 * writes could still lead to unsafe speculative behaviour.
 	 */
-	if (!env->bypass_spec_v1 && var_off) {
-		char tn_buf[48];
-
-		tnum_strn(tn_buf, sizeof(tn_buf), reg->var_off);
-		verbose(env, "R%d variable offset stack access prohibited for !root, var_off=%s\n",
-				ptr_regno, tn_buf);
-		return -EACCES;
-	}
-
 	if (!var_off) {
 		off += reg->var_off.value;
 		err = check_stack_read_fixed_off(env, state, off, size,
-- 
cgit v1.2.3


From bd5314f8dd2d41330eecb60f0490c3fcfe1fc99d Mon Sep 17 00:00:00 2001
From: Viktor Malik <vmalik@redhat.com>
Date: Fri, 17 Mar 2023 10:56:01 +0100
Subject: kallsyms, bpf: Move find_kallsyms_symbol_value out of internal header

Moving find_kallsyms_symbol_value from kernel/module/internal.h to
include/linux/module.h. The reason is that internal.h is not prepared to
be included when CONFIG_MODULES=n. find_kallsyms_symbol_value is used by
kernel/bpf/verifier.c and including internal.h from it (without modules)
leads into a compilation error:

  In file included from ../include/linux/container_of.h:5,
                   from ../include/linux/list.h:5,
                   from ../include/linux/timer.h:5,
                   from ../include/linux/workqueue.h:9,
                   from ../include/linux/bpf.h:10,
                   from ../include/linux/bpf-cgroup.h:5,
                   from ../kernel/bpf/verifier.c:7:
  ../kernel/bpf/../module/internal.h: In function 'mod_find':
  ../include/linux/container_of.h:20:54: error: invalid use of undefined type 'struct module'
     20 |         static_assert(__same_type(*(ptr), ((type *)0)->member) ||       \
        |                                                      ^~
  [...]

This patch fixes the above error.

Fixes: 31bf1dbccfb0 ("bpf: Fix attaching fentry/fexit/fmod_ret/lsm to modules")
Reported-by: kernel test robot <lkp@intel.com>
Signed-off-by: Viktor Malik <vmalik@redhat.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Link: https://lore.kernel.org/oe-kbuild-all/202303161404.OrmfCy09-lkp@intel.com/
Link: https://lore.kernel.org/bpf/20230317095601.386738-1-vmalik@redhat.com
---
 include/linux/module.h   | 8 ++++++++
 kernel/bpf/verifier.c    | 2 +-
 kernel/module/internal.h | 6 ------
 3 files changed, 9 insertions(+), 7 deletions(-)

(limited to 'kernel')

diff --git a/include/linux/module.h b/include/linux/module.h
index 4435ad9439ab..41cfd3be57e5 100644
--- a/include/linux/module.h
+++ b/include/linux/module.h
@@ -616,6 +616,8 @@ int module_get_kallsym(unsigned int symnum, unsigned long *value, char *type,
 /* Look for this name: can be of form module:name. */
 unsigned long module_kallsyms_lookup_name(const char *name);
 
+unsigned long find_kallsyms_symbol_value(struct module *mod, const char *name);
+
 extern void __noreturn __module_put_and_kthread_exit(struct module *mod,
 			long code);
 #define module_put_and_kthread_exit(code) __module_put_and_kthread_exit(THIS_MODULE, code)
@@ -796,6 +798,12 @@ static inline unsigned long module_kallsyms_lookup_name(const char *name)
 	return 0;
 }
 
+static inline unsigned long find_kallsyms_symbol_value(struct module *mod,
+						       const char *name)
+{
+	return 0;
+}
+
 static inline int register_module_notifier(struct notifier_block *nb)
 {
 	/* no events will happen anyway, so this can always succeed */
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index d62b7127ff2a..99394a2f7ee4 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -24,7 +24,7 @@
 #include <linux/bpf_lsm.h>
 #include <linux/btf_ids.h>
 #include <linux/poison.h>
-#include "../module/internal.h"
+#include <linux/module.h>
 
 #include "disasm.h"
 
diff --git a/kernel/module/internal.h b/kernel/module/internal.h
index 5c9170f9135c..1c877561a7d2 100644
--- a/kernel/module/internal.h
+++ b/kernel/module/internal.h
@@ -246,7 +246,6 @@ static inline void kmemleak_load_module(const struct module *mod,
 void init_build_id(struct module *mod, const struct load_info *info);
 void layout_symtab(struct module *mod, struct load_info *info);
 void add_kallsyms(struct module *mod, const struct load_info *info);
-unsigned long find_kallsyms_symbol_value(struct module *mod, const char *name);
 
 static inline bool sect_empty(const Elf_Shdr *sect)
 {
@@ -256,11 +255,6 @@ static inline bool sect_empty(const Elf_Shdr *sect)
 static inline void init_build_id(struct module *mod, const struct load_info *info) { }
 static inline void layout_symtab(struct module *mod, struct load_info *info) { }
 static inline void add_kallsyms(struct module *mod, const struct load_info *info) { }
-static inline unsigned long find_kallsyms_symbol_value(struct module *mod,
-						       const char *name)
-{
-	return 0;
-}
 #endif /* CONFIG_KALLSYMS */
 
 #ifdef CONFIG_SYSFS
-- 
cgit v1.2.3


From 58aa2afbb1e61fcf35bfcc819952a3c13d9f9203 Mon Sep 17 00:00:00 2001
From: Alexei Starovoitov <ast@kernel.org>
Date: Fri, 17 Mar 2023 13:19:17 -0700
Subject: bpf: Allow ld_imm64 instruction to point to kfunc.
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Allow ld_imm64 insn with BPF_PSEUDO_BTF_ID to hold the address of kfunc. The
ld_imm64 pointing to a valid kfunc will be seen as non-null PTR_TO_MEM by
is_branch_taken() logic of the verifier, while libbpf will resolve address to
unknown kfunc as ld_imm64 reg, 0 which will also be recognized by
is_branch_taken() and the verifier will proceed dead code elimination. BPF
programs can use this logic to detect at load time whether kfunc is present in
the kernel with bpf_ksym_exists() macro that is introduced in the next patches.

Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
Reviewed-by: Martin KaFai Lau <martin.lau@kernel.org>
Reviewed-by: Toke Høiland-Jørgensen <toke@redhat.com>
Acked-by: John Fastabend <john.fastabend@gmail.com>
Link: https://lore.kernel.org/bpf/20230317201920.62030-2-alexei.starovoitov@gmail.com
---
 kernel/bpf/verifier.c | 17 +++++++++++------
 1 file changed, 11 insertions(+), 6 deletions(-)

(limited to 'kernel')

diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index 99394a2f7ee4..8bc44f5dc5b6 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -15952,8 +15952,8 @@ static int check_pseudo_btf_id(struct bpf_verifier_env *env,
 		goto err_put;
 	}
 
-	if (!btf_type_is_var(t)) {
-		verbose(env, "pseudo btf_id %d in ldimm64 isn't KIND_VAR.\n", id);
+	if (!btf_type_is_var(t) && !btf_type_is_func(t)) {
+		verbose(env, "pseudo btf_id %d in ldimm64 isn't KIND_VAR or KIND_FUNC\n", id);
 		err = -EINVAL;
 		goto err_put;
 	}
@@ -15966,6 +15966,14 @@ static int check_pseudo_btf_id(struct bpf_verifier_env *env,
 		err = -ENOENT;
 		goto err_put;
 	}
+	insn[0].imm = (u32)addr;
+	insn[1].imm = addr >> 32;
+
+	if (btf_type_is_func(t)) {
+		aux->btf_var.reg_type = PTR_TO_MEM | MEM_RDONLY;
+		aux->btf_var.mem_size = 0;
+		goto check_btf;
+	}
 
 	datasec_id = find_btf_percpu_datasec(btf);
 	if (datasec_id > 0) {
@@ -15978,9 +15986,6 @@ static int check_pseudo_btf_id(struct bpf_verifier_env *env,
 		}
 	}
 
-	insn[0].imm = (u32)addr;
-	insn[1].imm = addr >> 32;
-
 	type = t->type;
 	t = btf_type_skip_modifiers(btf, type, NULL);
 	if (percpu) {
@@ -16008,7 +16013,7 @@ static int check_pseudo_btf_id(struct bpf_verifier_env *env,
 		aux->btf_var.btf = btf;
 		aux->btf_var.btf_id = type;
 	}
-
+check_btf:
 	/* check whether we recorded this BTF (and maybe module) already */
 	for (i = 0; i < env->used_btf_cnt; i++) {
 		if (env->used_btfs[i].btf == btf) {
-- 
cgit v1.2.3


From 1057d299459657b85e593a4b6294a000f920672a Mon Sep 17 00:00:00 2001
From: Alexei Starovoitov <ast@kernel.org>
Date: Tue, 21 Mar 2023 13:38:52 -0700
Subject: bpf: Teach the verifier to recognize rdonly_mem as not null.

Teach the verifier to recognize PTR_TO_MEM | MEM_RDONLY as not NULL
otherwise if (!bpf_ksym_exists(known_kfunc)) doesn't go through
dead code elimination.

Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
Acked-by: David Vernet <void@manifault.com>
Link: https://lore.kernel.org/bpf/20230321203854.3035-3-alexei.starovoitov@gmail.com
---
 kernel/bpf/verifier.c | 14 +++++++++-----
 1 file changed, 9 insertions(+), 5 deletions(-)

(limited to 'kernel')

diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index 8bc44f5dc5b6..5693e4a92752 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -486,8 +486,17 @@ static bool type_is_sk_pointer(enum bpf_reg_type type)
 		type == PTR_TO_XDP_SOCK;
 }
 
+static bool type_may_be_null(u32 type)
+{
+	return type & PTR_MAYBE_NULL;
+}
+
 static bool reg_type_not_null(enum bpf_reg_type type)
 {
+	if (type_may_be_null(type))
+		return false;
+
+	type = base_type(type);
 	return type == PTR_TO_SOCKET ||
 		type == PTR_TO_TCP_SOCK ||
 		type == PTR_TO_MAP_VALUE ||
@@ -531,11 +540,6 @@ static bool type_is_rdonly_mem(u32 type)
 	return type & MEM_RDONLY;
 }
 
-static bool type_may_be_null(u32 type)
-{
-	return type & PTR_MAYBE_NULL;
-}
-
 static bool is_acquire_function(enum bpf_func_id func_id,
 				const struct bpf_map *map)
 {
-- 
cgit v1.2.3


From d7ba4cc900bf1eea2d8c807c6b1fc6bd61f41237 Mon Sep 17 00:00:00 2001
From: JP Kobryn <inwardvessel@gmail.com>
Date: Wed, 22 Mar 2023 12:47:54 -0700
Subject: bpf: return long from bpf_map_ops funcs

This patch changes the return types of bpf_map_ops functions to long, where
previously int was returned. Using long allows for bpf programs to maintain
the sign bit in the absence of sign extension during situations where
inlined bpf helper funcs make calls to the bpf_map_ops funcs and a negative
error is returned.

The definitions of the helper funcs are generated from comments in the bpf
uapi header at `include/uapi/linux/bpf.h`. The return type of these
helpers was previously changed from int to long in commit bdb7b79b4ce8. For
any case where one of the map helpers call the bpf_map_ops funcs that are
still returning 32-bit int, a compiler might not include sign extension
instructions to properly convert the 32-bit negative value a 64-bit
negative value.

For example:
bpf assembly excerpt of an inlined helper calling a kernel function and
checking for a specific error:

; err = bpf_map_update_elem(&mymap, &key, &val, BPF_NOEXIST);
  ...
  46:	call   0xffffffffe103291c	; htab_map_update_elem
; if (err && err != -EEXIST) {
  4b:	cmp    $0xffffffffffffffef,%rax ; cmp -EEXIST,%rax

kernel function assembly excerpt of return value from
`htab_map_update_elem` returning 32-bit int:

movl $0xffffffef, %r9d
...
movl %r9d, %eax

...results in the comparison:
cmp $0xffffffffffffffef, $0x00000000ffffffef

Fixes: bdb7b79b4ce8 ("bpf: Switch most helper return values from 32-bit int to 64-bit long")
Tested-by: Eduard Zingerman <eddyz87@gmail.com>
Signed-off-by: JP Kobryn <inwardvessel@gmail.com>
Link: https://lore.kernel.org/r/20230322194754.185781-3-inwardvessel@gmail.com
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 include/linux/bpf.h            | 14 +++++++-------
 include/linux/filter.h         |  6 +++---
 kernel/bpf/arraymap.c          | 12 ++++++------
 kernel/bpf/bloom_filter.c      | 12 ++++++------
 kernel/bpf/bpf_cgrp_storage.c  |  6 +++---
 kernel/bpf/bpf_inode_storage.c |  6 +++---
 kernel/bpf/bpf_struct_ops.c    |  6 +++---
 kernel/bpf/bpf_task_storage.c  |  6 +++---
 kernel/bpf/cpumap.c            |  8 ++++----
 kernel/bpf/devmap.c            | 24 ++++++++++++------------
 kernel/bpf/hashtab.c           | 36 ++++++++++++++++++------------------
 kernel/bpf/local_storage.c     |  6 +++---
 kernel/bpf/lpm_trie.c          |  6 +++---
 kernel/bpf/queue_stack_maps.c  | 22 +++++++++++-----------
 kernel/bpf/reuseport_array.c   |  2 +-
 kernel/bpf/ringbuf.c           |  6 +++---
 kernel/bpf/stackmap.c          |  6 +++---
 kernel/bpf/verifier.c          | 14 +++++++-------
 net/core/bpf_sk_storage.c      |  6 +++---
 net/core/sock_map.c            |  8 ++++----
 net/xdp/xskmap.c               |  8 ++++----
 21 files changed, 110 insertions(+), 110 deletions(-)

(limited to 'kernel')

diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index 3ef98fb92987..ec0df059f562 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -96,11 +96,11 @@ struct bpf_map_ops {
 
 	/* funcs callable from userspace and from eBPF programs */
 	void *(*map_lookup_elem)(struct bpf_map *map, void *key);
-	int (*map_update_elem)(struct bpf_map *map, void *key, void *value, u64 flags);
-	int (*map_delete_elem)(struct bpf_map *map, void *key);
-	int (*map_push_elem)(struct bpf_map *map, void *value, u64 flags);
-	int (*map_pop_elem)(struct bpf_map *map, void *value);
-	int (*map_peek_elem)(struct bpf_map *map, void *value);
+	long (*map_update_elem)(struct bpf_map *map, void *key, void *value, u64 flags);
+	long (*map_delete_elem)(struct bpf_map *map, void *key);
+	long (*map_push_elem)(struct bpf_map *map, void *value, u64 flags);
+	long (*map_pop_elem)(struct bpf_map *map, void *value);
+	long (*map_peek_elem)(struct bpf_map *map, void *value);
 	void *(*map_lookup_percpu_elem)(struct bpf_map *map, void *key, u32 cpu);
 
 	/* funcs called by prog_array and perf_event_array map */
@@ -139,7 +139,7 @@ struct bpf_map_ops {
 	struct bpf_local_storage __rcu ** (*map_owner_storage_ptr)(void *owner);
 
 	/* Misc helpers.*/
-	int (*map_redirect)(struct bpf_map *map, u64 key, u64 flags);
+	long (*map_redirect)(struct bpf_map *map, u64 key, u64 flags);
 
 	/* map_meta_equal must be implemented for maps that can be
 	 * used as an inner map.  It is a runtime check to ensure
@@ -157,7 +157,7 @@ struct bpf_map_ops {
 	int (*map_set_for_each_callback_args)(struct bpf_verifier_env *env,
 					      struct bpf_func_state *caller,
 					      struct bpf_func_state *callee);
-	int (*map_for_each_callback)(struct bpf_map *map,
+	long (*map_for_each_callback)(struct bpf_map *map,
 				     bpf_callback_t callback_fn,
 				     void *callback_ctx, u64 flags);
 
diff --git a/include/linux/filter.h b/include/linux/filter.h
index efa5d4a1677e..23c08c31bea9 100644
--- a/include/linux/filter.h
+++ b/include/linux/filter.h
@@ -1504,9 +1504,9 @@ static inline bool bpf_sk_lookup_run_v6(struct net *net, int protocol,
 }
 #endif /* IS_ENABLED(CONFIG_IPV6) */
 
-static __always_inline int __bpf_xdp_redirect_map(struct bpf_map *map, u64 index,
-						  u64 flags, const u64 flag_mask,
-						  void *lookup_elem(struct bpf_map *map, u32 key))
+static __always_inline long __bpf_xdp_redirect_map(struct bpf_map *map, u64 index,
+						   u64 flags, const u64 flag_mask,
+						   void *lookup_elem(struct bpf_map *map, u32 key))
 {
 	struct bpf_redirect_info *ri = this_cpu_ptr(&bpf_redirect_info);
 	const u64 action_mask = XDP_ABORTED | XDP_DROP | XDP_PASS | XDP_TX;
diff --git a/kernel/bpf/arraymap.c b/kernel/bpf/arraymap.c
index 1588c793a715..2058e89b5ddd 100644
--- a/kernel/bpf/arraymap.c
+++ b/kernel/bpf/arraymap.c
@@ -307,8 +307,8 @@ static int array_map_get_next_key(struct bpf_map *map, void *key, void *next_key
 }
 
 /* Called from syscall or from eBPF program */
-static int array_map_update_elem(struct bpf_map *map, void *key, void *value,
-				 u64 map_flags)
+static long array_map_update_elem(struct bpf_map *map, void *key, void *value,
+				  u64 map_flags)
 {
 	struct bpf_array *array = container_of(map, struct bpf_array, map);
 	u32 index = *(u32 *)key;
@@ -386,7 +386,7 @@ int bpf_percpu_array_update(struct bpf_map *map, void *key, void *value,
 }
 
 /* Called from syscall or from eBPF program */
-static int array_map_delete_elem(struct bpf_map *map, void *key)
+static long array_map_delete_elem(struct bpf_map *map, void *key)
 {
 	return -EINVAL;
 }
@@ -686,8 +686,8 @@ static const struct bpf_iter_seq_info iter_seq_info = {
 	.seq_priv_size		= sizeof(struct bpf_iter_seq_array_map_info),
 };
 
-static int bpf_for_each_array_elem(struct bpf_map *map, bpf_callback_t callback_fn,
-				   void *callback_ctx, u64 flags)
+static long bpf_for_each_array_elem(struct bpf_map *map, bpf_callback_t callback_fn,
+				    void *callback_ctx, u64 flags)
 {
 	u32 i, key, num_elems = 0;
 	struct bpf_array *array;
@@ -871,7 +871,7 @@ int bpf_fd_array_map_update_elem(struct bpf_map *map, struct file *map_file,
 	return 0;
 }
 
-static int fd_array_map_delete_elem(struct bpf_map *map, void *key)
+static long fd_array_map_delete_elem(struct bpf_map *map, void *key)
 {
 	struct bpf_array *array = container_of(map, struct bpf_array, map);
 	void *old_ptr;
diff --git a/kernel/bpf/bloom_filter.c b/kernel/bpf/bloom_filter.c
index 6350c5d35a9b..db19784601a7 100644
--- a/kernel/bpf/bloom_filter.c
+++ b/kernel/bpf/bloom_filter.c
@@ -41,7 +41,7 @@ static u32 hash(struct bpf_bloom_filter *bloom, void *value,
 	return h & bloom->bitset_mask;
 }
 
-static int bloom_map_peek_elem(struct bpf_map *map, void *value)
+static long bloom_map_peek_elem(struct bpf_map *map, void *value)
 {
 	struct bpf_bloom_filter *bloom =
 		container_of(map, struct bpf_bloom_filter, map);
@@ -56,7 +56,7 @@ static int bloom_map_peek_elem(struct bpf_map *map, void *value)
 	return 0;
 }
 
-static int bloom_map_push_elem(struct bpf_map *map, void *value, u64 flags)
+static long bloom_map_push_elem(struct bpf_map *map, void *value, u64 flags)
 {
 	struct bpf_bloom_filter *bloom =
 		container_of(map, struct bpf_bloom_filter, map);
@@ -73,12 +73,12 @@ static int bloom_map_push_elem(struct bpf_map *map, void *value, u64 flags)
 	return 0;
 }
 
-static int bloom_map_pop_elem(struct bpf_map *map, void *value)
+static long bloom_map_pop_elem(struct bpf_map *map, void *value)
 {
 	return -EOPNOTSUPP;
 }
 
-static int bloom_map_delete_elem(struct bpf_map *map, void *value)
+static long bloom_map_delete_elem(struct bpf_map *map, void *value)
 {
 	return -EOPNOTSUPP;
 }
@@ -177,8 +177,8 @@ static void *bloom_map_lookup_elem(struct bpf_map *map, void *key)
 	return ERR_PTR(-EINVAL);
 }
 
-static int bloom_map_update_elem(struct bpf_map *map, void *key,
-				 void *value, u64 flags)
+static long bloom_map_update_elem(struct bpf_map *map, void *key,
+				  void *value, u64 flags)
 {
 	/* The eBPF program should use map_push_elem instead */
 	return -EINVAL;
diff --git a/kernel/bpf/bpf_cgrp_storage.c b/kernel/bpf/bpf_cgrp_storage.c
index c975cacdd16b..f5b016a5484d 100644
--- a/kernel/bpf/bpf_cgrp_storage.c
+++ b/kernel/bpf/bpf_cgrp_storage.c
@@ -93,8 +93,8 @@ static void *bpf_cgrp_storage_lookup_elem(struct bpf_map *map, void *key)
 	return sdata ? sdata->data : NULL;
 }
 
-static int bpf_cgrp_storage_update_elem(struct bpf_map *map, void *key,
-					  void *value, u64 map_flags)
+static long bpf_cgrp_storage_update_elem(struct bpf_map *map, void *key,
+					 void *value, u64 map_flags)
 {
 	struct bpf_local_storage_data *sdata;
 	struct cgroup *cgroup;
@@ -125,7 +125,7 @@ static int cgroup_storage_delete(struct cgroup *cgroup, struct bpf_map *map)
 	return 0;
 }
 
-static int bpf_cgrp_storage_delete_elem(struct bpf_map *map, void *key)
+static long bpf_cgrp_storage_delete_elem(struct bpf_map *map, void *key)
 {
 	struct cgroup *cgroup;
 	int err, fd;
diff --git a/kernel/bpf/bpf_inode_storage.c b/kernel/bpf/bpf_inode_storage.c
index ad2ab0187e45..9a5f05151898 100644
--- a/kernel/bpf/bpf_inode_storage.c
+++ b/kernel/bpf/bpf_inode_storage.c
@@ -91,8 +91,8 @@ static void *bpf_fd_inode_storage_lookup_elem(struct bpf_map *map, void *key)
 	return sdata ? sdata->data : NULL;
 }
 
-static int bpf_fd_inode_storage_update_elem(struct bpf_map *map, void *key,
-					 void *value, u64 map_flags)
+static long bpf_fd_inode_storage_update_elem(struct bpf_map *map, void *key,
+					     void *value, u64 map_flags)
 {
 	struct bpf_local_storage_data *sdata;
 	struct file *f;
@@ -127,7 +127,7 @@ static int inode_storage_delete(struct inode *inode, struct bpf_map *map)
 	return 0;
 }
 
-static int bpf_fd_inode_storage_delete_elem(struct bpf_map *map, void *key)
+static long bpf_fd_inode_storage_delete_elem(struct bpf_map *map, void *key)
 {
 	struct file *f;
 	int fd, err;
diff --git a/kernel/bpf/bpf_struct_ops.c b/kernel/bpf/bpf_struct_ops.c
index 38903fb52f98..ba7a94276e3b 100644
--- a/kernel/bpf/bpf_struct_ops.c
+++ b/kernel/bpf/bpf_struct_ops.c
@@ -349,8 +349,8 @@ int bpf_struct_ops_prepare_trampoline(struct bpf_tramp_links *tlinks,
 					   model, flags, tlinks, NULL);
 }
 
-static int bpf_struct_ops_map_update_elem(struct bpf_map *map, void *key,
-					  void *value, u64 flags)
+static long bpf_struct_ops_map_update_elem(struct bpf_map *map, void *key,
+					   void *value, u64 flags)
 {
 	struct bpf_struct_ops_map *st_map = (struct bpf_struct_ops_map *)map;
 	const struct bpf_struct_ops *st_ops = st_map->st_ops;
@@ -524,7 +524,7 @@ unlock:
 	return err;
 }
 
-static int bpf_struct_ops_map_delete_elem(struct bpf_map *map, void *key)
+static long bpf_struct_ops_map_delete_elem(struct bpf_map *map, void *key)
 {
 	enum bpf_struct_ops_state prev_state;
 	struct bpf_struct_ops_map *st_map;
diff --git a/kernel/bpf/bpf_task_storage.c b/kernel/bpf/bpf_task_storage.c
index c88cc04c17c1..ab5bd1ef58c4 100644
--- a/kernel/bpf/bpf_task_storage.c
+++ b/kernel/bpf/bpf_task_storage.c
@@ -120,8 +120,8 @@ out:
 	return ERR_PTR(err);
 }
 
-static int bpf_pid_task_storage_update_elem(struct bpf_map *map, void *key,
-					    void *value, u64 map_flags)
+static long bpf_pid_task_storage_update_elem(struct bpf_map *map, void *key,
+					     void *value, u64 map_flags)
 {
 	struct bpf_local_storage_data *sdata;
 	struct task_struct *task;
@@ -173,7 +173,7 @@ static int task_storage_delete(struct task_struct *task, struct bpf_map *map,
 	return 0;
 }
 
-static int bpf_pid_task_storage_delete_elem(struct bpf_map *map, void *key)
+static long bpf_pid_task_storage_delete_elem(struct bpf_map *map, void *key)
 {
 	struct task_struct *task;
 	unsigned int f_flags;
diff --git a/kernel/bpf/cpumap.c b/kernel/bpf/cpumap.c
index 871809e71b4e..8ec18faa74ac 100644
--- a/kernel/bpf/cpumap.c
+++ b/kernel/bpf/cpumap.c
@@ -540,7 +540,7 @@ static void __cpu_map_entry_replace(struct bpf_cpu_map *cmap,
 	}
 }
 
-static int cpu_map_delete_elem(struct bpf_map *map, void *key)
+static long cpu_map_delete_elem(struct bpf_map *map, void *key)
 {
 	struct bpf_cpu_map *cmap = container_of(map, struct bpf_cpu_map, map);
 	u32 key_cpu = *(u32 *)key;
@@ -553,8 +553,8 @@ static int cpu_map_delete_elem(struct bpf_map *map, void *key)
 	return 0;
 }
 
-static int cpu_map_update_elem(struct bpf_map *map, void *key, void *value,
-			       u64 map_flags)
+static long cpu_map_update_elem(struct bpf_map *map, void *key, void *value,
+				u64 map_flags)
 {
 	struct bpf_cpu_map *cmap = container_of(map, struct bpf_cpu_map, map);
 	struct bpf_cpumap_val cpumap_value = {};
@@ -667,7 +667,7 @@ static int cpu_map_get_next_key(struct bpf_map *map, void *key, void *next_key)
 	return 0;
 }
 
-static int cpu_map_redirect(struct bpf_map *map, u64 index, u64 flags)
+static long cpu_map_redirect(struct bpf_map *map, u64 index, u64 flags)
 {
 	return __bpf_xdp_redirect_map(map, index, flags, 0,
 				      __cpu_map_lookup_elem);
diff --git a/kernel/bpf/devmap.c b/kernel/bpf/devmap.c
index 19b036a228f7..802692fa3905 100644
--- a/kernel/bpf/devmap.c
+++ b/kernel/bpf/devmap.c
@@ -809,7 +809,7 @@ static void __dev_map_entry_free(struct rcu_head *rcu)
 	kfree(dev);
 }
 
-static int dev_map_delete_elem(struct bpf_map *map, void *key)
+static long dev_map_delete_elem(struct bpf_map *map, void *key)
 {
 	struct bpf_dtab *dtab = container_of(map, struct bpf_dtab, map);
 	struct bpf_dtab_netdev *old_dev;
@@ -826,7 +826,7 @@ static int dev_map_delete_elem(struct bpf_map *map, void *key)
 	return 0;
 }
 
-static int dev_map_hash_delete_elem(struct bpf_map *map, void *key)
+static long dev_map_hash_delete_elem(struct bpf_map *map, void *key)
 {
 	struct bpf_dtab *dtab = container_of(map, struct bpf_dtab, map);
 	struct bpf_dtab_netdev *old_dev;
@@ -897,8 +897,8 @@ err_out:
 	return ERR_PTR(-EINVAL);
 }
 
-static int __dev_map_update_elem(struct net *net, struct bpf_map *map,
-				 void *key, void *value, u64 map_flags)
+static long __dev_map_update_elem(struct net *net, struct bpf_map *map,
+				  void *key, void *value, u64 map_flags)
 {
 	struct bpf_dtab *dtab = container_of(map, struct bpf_dtab, map);
 	struct bpf_dtab_netdev *dev, *old_dev;
@@ -939,15 +939,15 @@ static int __dev_map_update_elem(struct net *net, struct bpf_map *map,
 	return 0;
 }
 
-static int dev_map_update_elem(struct bpf_map *map, void *key, void *value,
-			       u64 map_flags)
+static long dev_map_update_elem(struct bpf_map *map, void *key, void *value,
+				u64 map_flags)
 {
 	return __dev_map_update_elem(current->nsproxy->net_ns,
 				     map, key, value, map_flags);
 }
 
-static int __dev_map_hash_update_elem(struct net *net, struct bpf_map *map,
-				     void *key, void *value, u64 map_flags)
+static long __dev_map_hash_update_elem(struct net *net, struct bpf_map *map,
+				       void *key, void *value, u64 map_flags)
 {
 	struct bpf_dtab *dtab = container_of(map, struct bpf_dtab, map);
 	struct bpf_dtab_netdev *dev, *old_dev;
@@ -999,21 +999,21 @@ out_err:
 	return err;
 }
 
-static int dev_map_hash_update_elem(struct bpf_map *map, void *key, void *value,
-				   u64 map_flags)
+static long dev_map_hash_update_elem(struct bpf_map *map, void *key, void *value,
+				     u64 map_flags)
 {
 	return __dev_map_hash_update_elem(current->nsproxy->net_ns,
 					 map, key, value, map_flags);
 }
 
-static int dev_map_redirect(struct bpf_map *map, u64 ifindex, u64 flags)
+static long dev_map_redirect(struct bpf_map *map, u64 ifindex, u64 flags)
 {
 	return __bpf_xdp_redirect_map(map, ifindex, flags,
 				      BPF_F_BROADCAST | BPF_F_EXCLUDE_INGRESS,
 				      __dev_map_lookup_elem);
 }
 
-static int dev_hash_map_redirect(struct bpf_map *map, u64 ifindex, u64 flags)
+static long dev_hash_map_redirect(struct bpf_map *map, u64 ifindex, u64 flags)
 {
 	return __bpf_xdp_redirect_map(map, ifindex, flags,
 				      BPF_F_BROADCAST | BPF_F_EXCLUDE_INGRESS,
diff --git a/kernel/bpf/hashtab.c b/kernel/bpf/hashtab.c
index 0df4b0c10f59..96b645bba3a4 100644
--- a/kernel/bpf/hashtab.c
+++ b/kernel/bpf/hashtab.c
@@ -1073,8 +1073,8 @@ static int check_flags(struct bpf_htab *htab, struct htab_elem *l_old,
 }
 
 /* Called from syscall or from eBPF program */
-static int htab_map_update_elem(struct bpf_map *map, void *key, void *value,
-				u64 map_flags)
+static long htab_map_update_elem(struct bpf_map *map, void *key, void *value,
+				 u64 map_flags)
 {
 	struct bpf_htab *htab = container_of(map, struct bpf_htab, map);
 	struct htab_elem *l_new = NULL, *l_old;
@@ -1175,8 +1175,8 @@ static void htab_lru_push_free(struct bpf_htab *htab, struct htab_elem *elem)
 	bpf_lru_push_free(&htab->lru, &elem->lru_node);
 }
 
-static int htab_lru_map_update_elem(struct bpf_map *map, void *key, void *value,
-				    u64 map_flags)
+static long htab_lru_map_update_elem(struct bpf_map *map, void *key, void *value,
+				     u64 map_flags)
 {
 	struct bpf_htab *htab = container_of(map, struct bpf_htab, map);
 	struct htab_elem *l_new, *l_old = NULL;
@@ -1242,9 +1242,9 @@ err:
 	return ret;
 }
 
-static int __htab_percpu_map_update_elem(struct bpf_map *map, void *key,
-					 void *value, u64 map_flags,
-					 bool onallcpus)
+static long __htab_percpu_map_update_elem(struct bpf_map *map, void *key,
+					  void *value, u64 map_flags,
+					  bool onallcpus)
 {
 	struct bpf_htab *htab = container_of(map, struct bpf_htab, map);
 	struct htab_elem *l_new = NULL, *l_old;
@@ -1297,9 +1297,9 @@ err:
 	return ret;
 }
 
-static int __htab_lru_percpu_map_update_elem(struct bpf_map *map, void *key,
-					     void *value, u64 map_flags,
-					     bool onallcpus)
+static long __htab_lru_percpu_map_update_elem(struct bpf_map *map, void *key,
+					      void *value, u64 map_flags,
+					      bool onallcpus)
 {
 	struct bpf_htab *htab = container_of(map, struct bpf_htab, map);
 	struct htab_elem *l_new = NULL, *l_old;
@@ -1364,21 +1364,21 @@ err:
 	return ret;
 }
 
-static int htab_percpu_map_update_elem(struct bpf_map *map, void *key,
-				       void *value, u64 map_flags)
+static long htab_percpu_map_update_elem(struct bpf_map *map, void *key,
+					void *value, u64 map_flags)
 {
 	return __htab_percpu_map_update_elem(map, key, value, map_flags, false);
 }
 
-static int htab_lru_percpu_map_update_elem(struct bpf_map *map, void *key,
-					   void *value, u64 map_flags)
+static long htab_lru_percpu_map_update_elem(struct bpf_map *map, void *key,
+					    void *value, u64 map_flags)
 {
 	return __htab_lru_percpu_map_update_elem(map, key, value, map_flags,
 						 false);
 }
 
 /* Called from syscall or from eBPF program */
-static int htab_map_delete_elem(struct bpf_map *map, void *key)
+static long htab_map_delete_elem(struct bpf_map *map, void *key)
 {
 	struct bpf_htab *htab = container_of(map, struct bpf_htab, map);
 	struct hlist_nulls_head *head;
@@ -1414,7 +1414,7 @@ static int htab_map_delete_elem(struct bpf_map *map, void *key)
 	return ret;
 }
 
-static int htab_lru_map_delete_elem(struct bpf_map *map, void *key)
+static long htab_lru_map_delete_elem(struct bpf_map *map, void *key)
 {
 	struct bpf_htab *htab = container_of(map, struct bpf_htab, map);
 	struct hlist_nulls_head *head;
@@ -2134,8 +2134,8 @@ static const struct bpf_iter_seq_info iter_seq_info = {
 	.seq_priv_size		= sizeof(struct bpf_iter_seq_hash_map_info),
 };
 
-static int bpf_for_each_hash_elem(struct bpf_map *map, bpf_callback_t callback_fn,
-				  void *callback_ctx, u64 flags)
+static long bpf_for_each_hash_elem(struct bpf_map *map, bpf_callback_t callback_fn,
+				   void *callback_ctx, u64 flags)
 {
 	struct bpf_htab *htab = container_of(map, struct bpf_htab, map);
 	struct hlist_nulls_head *head;
diff --git a/kernel/bpf/local_storage.c b/kernel/bpf/local_storage.c
index a993560f200a..4c7bbec4a9e4 100644
--- a/kernel/bpf/local_storage.c
+++ b/kernel/bpf/local_storage.c
@@ -141,8 +141,8 @@ static void *cgroup_storage_lookup_elem(struct bpf_map *_map, void *key)
 	return &READ_ONCE(storage->buf)->data[0];
 }
 
-static int cgroup_storage_update_elem(struct bpf_map *map, void *key,
-				      void *value, u64 flags)
+static long cgroup_storage_update_elem(struct bpf_map *map, void *key,
+				       void *value, u64 flags)
 {
 	struct bpf_cgroup_storage *storage;
 	struct bpf_storage_buffer *new;
@@ -348,7 +348,7 @@ static void cgroup_storage_map_free(struct bpf_map *_map)
 	bpf_map_area_free(map);
 }
 
-static int cgroup_storage_delete_elem(struct bpf_map *map, void *key)
+static long cgroup_storage_delete_elem(struct bpf_map *map, void *key)
 {
 	return -EINVAL;
 }
diff --git a/kernel/bpf/lpm_trie.c b/kernel/bpf/lpm_trie.c
index dc23f2ac9cde..e0d3ddf2037a 100644
--- a/kernel/bpf/lpm_trie.c
+++ b/kernel/bpf/lpm_trie.c
@@ -300,8 +300,8 @@ static struct lpm_trie_node *lpm_trie_node_alloc(const struct lpm_trie *trie,
 }
 
 /* Called from syscall or from eBPF program */
-static int trie_update_elem(struct bpf_map *map,
-			    void *_key, void *value, u64 flags)
+static long trie_update_elem(struct bpf_map *map,
+			     void *_key, void *value, u64 flags)
 {
 	struct lpm_trie *trie = container_of(map, struct lpm_trie, map);
 	struct lpm_trie_node *node, *im_node = NULL, *new_node = NULL;
@@ -431,7 +431,7 @@ out:
 }
 
 /* Called from syscall or from eBPF program */
-static int trie_delete_elem(struct bpf_map *map, void *_key)
+static long trie_delete_elem(struct bpf_map *map, void *_key)
 {
 	struct lpm_trie *trie = container_of(map, struct lpm_trie, map);
 	struct bpf_lpm_trie_key *key = _key;
diff --git a/kernel/bpf/queue_stack_maps.c b/kernel/bpf/queue_stack_maps.c
index 63ecbbcb349d..601609164ef3 100644
--- a/kernel/bpf/queue_stack_maps.c
+++ b/kernel/bpf/queue_stack_maps.c
@@ -95,7 +95,7 @@ static void queue_stack_map_free(struct bpf_map *map)
 	bpf_map_area_free(qs);
 }
 
-static int __queue_map_get(struct bpf_map *map, void *value, bool delete)
+static long __queue_map_get(struct bpf_map *map, void *value, bool delete)
 {
 	struct bpf_queue_stack *qs = bpf_queue_stack(map);
 	unsigned long flags;
@@ -124,7 +124,7 @@ out:
 }
 
 
-static int __stack_map_get(struct bpf_map *map, void *value, bool delete)
+static long __stack_map_get(struct bpf_map *map, void *value, bool delete)
 {
 	struct bpf_queue_stack *qs = bpf_queue_stack(map);
 	unsigned long flags;
@@ -156,32 +156,32 @@ out:
 }
 
 /* Called from syscall or from eBPF program */
-static int queue_map_peek_elem(struct bpf_map *map, void *value)
+static long queue_map_peek_elem(struct bpf_map *map, void *value)
 {
 	return __queue_map_get(map, value, false);
 }
 
 /* Called from syscall or from eBPF program */
-static int stack_map_peek_elem(struct bpf_map *map, void *value)
+static long stack_map_peek_elem(struct bpf_map *map, void *value)
 {
 	return __stack_map_get(map, value, false);
 }
 
 /* Called from syscall or from eBPF program */
-static int queue_map_pop_elem(struct bpf_map *map, void *value)
+static long queue_map_pop_elem(struct bpf_map *map, void *value)
 {
 	return __queue_map_get(map, value, true);
 }
 
 /* Called from syscall or from eBPF program */
-static int stack_map_pop_elem(struct bpf_map *map, void *value)
+static long stack_map_pop_elem(struct bpf_map *map, void *value)
 {
 	return __stack_map_get(map, value, true);
 }
 
 /* Called from syscall or from eBPF program */
-static int queue_stack_map_push_elem(struct bpf_map *map, void *value,
-				     u64 flags)
+static long queue_stack_map_push_elem(struct bpf_map *map, void *value,
+				      u64 flags)
 {
 	struct bpf_queue_stack *qs = bpf_queue_stack(map);
 	unsigned long irq_flags;
@@ -227,14 +227,14 @@ static void *queue_stack_map_lookup_elem(struct bpf_map *map, void *key)
 }
 
 /* Called from syscall or from eBPF program */
-static int queue_stack_map_update_elem(struct bpf_map *map, void *key,
-				       void *value, u64 flags)
+static long queue_stack_map_update_elem(struct bpf_map *map, void *key,
+					void *value, u64 flags)
 {
 	return -EINVAL;
 }
 
 /* Called from syscall or from eBPF program */
-static int queue_stack_map_delete_elem(struct bpf_map *map, void *key)
+static long queue_stack_map_delete_elem(struct bpf_map *map, void *key)
 {
 	return -EINVAL;
 }
diff --git a/kernel/bpf/reuseport_array.c b/kernel/bpf/reuseport_array.c
index 71cb72f5b733..cbf2d8d784b8 100644
--- a/kernel/bpf/reuseport_array.c
+++ b/kernel/bpf/reuseport_array.c
@@ -59,7 +59,7 @@ static void *reuseport_array_lookup_elem(struct bpf_map *map, void *key)
 }
 
 /* Called from syscall only */
-static int reuseport_array_delete_elem(struct bpf_map *map, void *key)
+static long reuseport_array_delete_elem(struct bpf_map *map, void *key)
 {
 	struct reuseport_array *array = reuseport_array(map);
 	u32 index = *(u32 *)key;
diff --git a/kernel/bpf/ringbuf.c b/kernel/bpf/ringbuf.c
index 0d2a45ff83f1..875ac9b698d9 100644
--- a/kernel/bpf/ringbuf.c
+++ b/kernel/bpf/ringbuf.c
@@ -242,13 +242,13 @@ static void *ringbuf_map_lookup_elem(struct bpf_map *map, void *key)
 	return ERR_PTR(-ENOTSUPP);
 }
 
-static int ringbuf_map_update_elem(struct bpf_map *map, void *key, void *value,
-				   u64 flags)
+static long ringbuf_map_update_elem(struct bpf_map *map, void *key, void *value,
+				    u64 flags)
 {
 	return -ENOTSUPP;
 }
 
-static int ringbuf_map_delete_elem(struct bpf_map *map, void *key)
+static long ringbuf_map_delete_elem(struct bpf_map *map, void *key)
 {
 	return -ENOTSUPP;
 }
diff --git a/kernel/bpf/stackmap.c b/kernel/bpf/stackmap.c
index 0f1d8dced933..b25fce425b2c 100644
--- a/kernel/bpf/stackmap.c
+++ b/kernel/bpf/stackmap.c
@@ -618,14 +618,14 @@ static int stack_map_get_next_key(struct bpf_map *map, void *key,
 	return 0;
 }
 
-static int stack_map_update_elem(struct bpf_map *map, void *key, void *value,
-				 u64 map_flags)
+static long stack_map_update_elem(struct bpf_map *map, void *key, void *value,
+				  u64 map_flags)
 {
 	return -EINVAL;
 }
 
 /* Called from syscall or from eBPF program */
-static int stack_map_delete_elem(struct bpf_map *map, void *key)
+static long stack_map_delete_elem(struct bpf_map *map, void *key)
 {
 	struct bpf_stack_map *smap = container_of(map, struct bpf_stack_map, map);
 	struct stack_map_bucket *old_bucket;
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index 5693e4a92752..50c995697f0e 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -17692,21 +17692,21 @@ static int do_misc_fixups(struct bpf_verifier_env *env)
 			BUILD_BUG_ON(!__same_type(ops->map_lookup_elem,
 				     (void *(*)(struct bpf_map *map, void *key))NULL));
 			BUILD_BUG_ON(!__same_type(ops->map_delete_elem,
-				     (int (*)(struct bpf_map *map, void *key))NULL));
+				     (long (*)(struct bpf_map *map, void *key))NULL));
 			BUILD_BUG_ON(!__same_type(ops->map_update_elem,
-				     (int (*)(struct bpf_map *map, void *key, void *value,
+				     (long (*)(struct bpf_map *map, void *key, void *value,
 					      u64 flags))NULL));
 			BUILD_BUG_ON(!__same_type(ops->map_push_elem,
-				     (int (*)(struct bpf_map *map, void *value,
+				     (long (*)(struct bpf_map *map, void *value,
 					      u64 flags))NULL));
 			BUILD_BUG_ON(!__same_type(ops->map_pop_elem,
-				     (int (*)(struct bpf_map *map, void *value))NULL));
+				     (long (*)(struct bpf_map *map, void *value))NULL));
 			BUILD_BUG_ON(!__same_type(ops->map_peek_elem,
-				     (int (*)(struct bpf_map *map, void *value))NULL));
+				     (long (*)(struct bpf_map *map, void *value))NULL));
 			BUILD_BUG_ON(!__same_type(ops->map_redirect,
-				     (int (*)(struct bpf_map *map, u64 index, u64 flags))NULL));
+				     (long (*)(struct bpf_map *map, u64 index, u64 flags))NULL));
 			BUILD_BUG_ON(!__same_type(ops->map_for_each_callback,
-				     (int (*)(struct bpf_map *map,
+				     (long (*)(struct bpf_map *map,
 					      bpf_callback_t callback_fn,
 					      void *callback_ctx,
 					      u64 flags))NULL));
diff --git a/net/core/bpf_sk_storage.c b/net/core/bpf_sk_storage.c
index 24c3dc0d62e5..cb0f5a105b89 100644
--- a/net/core/bpf_sk_storage.c
+++ b/net/core/bpf_sk_storage.c
@@ -94,8 +94,8 @@ static void *bpf_fd_sk_storage_lookup_elem(struct bpf_map *map, void *key)
 	return ERR_PTR(err);
 }
 
-static int bpf_fd_sk_storage_update_elem(struct bpf_map *map, void *key,
-					 void *value, u64 map_flags)
+static long bpf_fd_sk_storage_update_elem(struct bpf_map *map, void *key,
+					  void *value, u64 map_flags)
 {
 	struct bpf_local_storage_data *sdata;
 	struct socket *sock;
@@ -114,7 +114,7 @@ static int bpf_fd_sk_storage_update_elem(struct bpf_map *map, void *key,
 	return err;
 }
 
-static int bpf_fd_sk_storage_delete_elem(struct bpf_map *map, void *key)
+static long bpf_fd_sk_storage_delete_elem(struct bpf_map *map, void *key)
 {
 	struct socket *sock;
 	int fd, err;
diff --git a/net/core/sock_map.c b/net/core/sock_map.c
index 9b854e236d23..7c189c2e2fbf 100644
--- a/net/core/sock_map.c
+++ b/net/core/sock_map.c
@@ -437,7 +437,7 @@ static void sock_map_delete_from_link(struct bpf_map *map, struct sock *sk,
 	__sock_map_delete(stab, sk, link_raw);
 }
 
-static int sock_map_delete_elem(struct bpf_map *map, void *key)
+static long sock_map_delete_elem(struct bpf_map *map, void *key)
 {
 	struct bpf_stab *stab = container_of(map, struct bpf_stab, map);
 	u32 i = *(u32 *)key;
@@ -587,8 +587,8 @@ out:
 	return ret;
 }
 
-static int sock_map_update_elem(struct bpf_map *map, void *key,
-				void *value, u64 flags)
+static long sock_map_update_elem(struct bpf_map *map, void *key,
+				 void *value, u64 flags)
 {
 	struct sock *sk = (struct sock *)value;
 	int ret;
@@ -925,7 +925,7 @@ static void sock_hash_delete_from_link(struct bpf_map *map, struct sock *sk,
 	raw_spin_unlock_bh(&bucket->lock);
 }
 
-static int sock_hash_delete_elem(struct bpf_map *map, void *key)
+static long sock_hash_delete_elem(struct bpf_map *map, void *key)
 {
 	struct bpf_shtab *htab = container_of(map, struct bpf_shtab, map);
 	u32 hash, key_size = map->key_size;
diff --git a/net/xdp/xskmap.c b/net/xdp/xskmap.c
index 0c38d7175922..2c1427074a3b 100644
--- a/net/xdp/xskmap.c
+++ b/net/xdp/xskmap.c
@@ -162,8 +162,8 @@ static void *xsk_map_lookup_elem_sys_only(struct bpf_map *map, void *key)
 	return ERR_PTR(-EOPNOTSUPP);
 }
 
-static int xsk_map_update_elem(struct bpf_map *map, void *key, void *value,
-			       u64 map_flags)
+static long xsk_map_update_elem(struct bpf_map *map, void *key, void *value,
+				u64 map_flags)
 {
 	struct xsk_map *m = container_of(map, struct xsk_map, map);
 	struct xdp_sock __rcu **map_entry;
@@ -223,7 +223,7 @@ out:
 	return err;
 }
 
-static int xsk_map_delete_elem(struct bpf_map *map, void *key)
+static long xsk_map_delete_elem(struct bpf_map *map, void *key)
 {
 	struct xsk_map *m = container_of(map, struct xsk_map, map);
 	struct xdp_sock __rcu **map_entry;
@@ -243,7 +243,7 @@ static int xsk_map_delete_elem(struct bpf_map *map, void *key)
 	return 0;
 }
 
-static int xsk_map_redirect(struct bpf_map *map, u64 index, u64 flags)
+static long xsk_map_redirect(struct bpf_map *map, u64 index, u64 flags)
 {
 	return __bpf_xdp_redirect_map(map, index, flags, 0,
 				      __xsk_map_lookup_elem);
-- 
cgit v1.2.3


From 7be14c1c9030f73cc18b4ff23b78a0a081f16188 Mon Sep 17 00:00:00 2001
From: Daniel Borkmann <daniel@iogearbox.net>
Date: Wed, 22 Mar 2023 22:30:55 +0100
Subject: bpf: Fix __reg_bound_offset 64->32 var_off subreg propagation

Xu reports that after commit 3f50f132d840 ("bpf: Verifier, do explicit ALU32
bounds tracking"), the following BPF program is rejected by the verifier:

   0: (61) r2 = *(u32 *)(r1 +0)          ; R2_w=pkt(off=0,r=0,imm=0)
   1: (61) r3 = *(u32 *)(r1 +4)          ; R3_w=pkt_end(off=0,imm=0)
   2: (bf) r1 = r2
   3: (07) r1 += 1
   4: (2d) if r1 > r3 goto pc+8
   5: (71) r1 = *(u8 *)(r2 +0)           ; R1_w=scalar(umax=255,var_off=(0x0; 0xff))
   6: (18) r0 = 0x7fffffffffffff10
   8: (0f) r1 += r0                      ; R1_w=scalar(umin=0x7fffffffffffff10,umax=0x800000000000000f)
   9: (18) r0 = 0x8000000000000000
  11: (07) r0 += 1
  12: (ad) if r0 < r1 goto pc-2
  13: (b7) r0 = 0
  14: (95) exit

And the verifier log says:

  func#0 @0
  0: R1=ctx(off=0,imm=0) R10=fp0
  0: (61) r2 = *(u32 *)(r1 +0)          ; R1=ctx(off=0,imm=0) R2_w=pkt(off=0,r=0,imm=0)
  1: (61) r3 = *(u32 *)(r1 +4)          ; R1=ctx(off=0,imm=0) R3_w=pkt_end(off=0,imm=0)
  2: (bf) r1 = r2                       ; R1_w=pkt(off=0,r=0,imm=0) R2_w=pkt(off=0,r=0,imm=0)
  3: (07) r1 += 1                       ; R1_w=pkt(off=1,r=0,imm=0)
  4: (2d) if r1 > r3 goto pc+8          ; R1_w=pkt(off=1,r=1,imm=0) R3_w=pkt_end(off=0,imm=0)
  5: (71) r1 = *(u8 *)(r2 +0)           ; R1_w=scalar(umax=255,var_off=(0x0; 0xff)) R2_w=pkt(off=0,r=1,imm=0)
  6: (18) r0 = 0x7fffffffffffff10       ; R0_w=9223372036854775568
  8: (0f) r1 += r0                      ; R0_w=9223372036854775568 R1_w=scalar(umin=9223372036854775568,umax=9223372036854775823,s32_min=-240,s32_max=15)
  9: (18) r0 = 0x8000000000000000       ; R0_w=-9223372036854775808
  11: (07) r0 += 1                      ; R0_w=-9223372036854775807
  12: (ad) if r0 < r1 goto pc-2         ; R0_w=-9223372036854775807 R1_w=scalar(umin=9223372036854775568,umax=9223372036854775809)
  13: (b7) r0 = 0                       ; R0_w=0
  14: (95) exit

  from 12 to 11: R0_w=-9223372036854775807 R1_w=scalar(umin=9223372036854775810,umax=9223372036854775823,var_off=(0x8000000000000000; 0xffffffff)) R2_w=pkt(off=0,r=1,imm=0) R3_w=pkt_end(off=0,imm=0) R10=fp0
  11: (07) r0 += 1                      ; R0_w=-9223372036854775806
  12: (ad) if r0 < r1 goto pc-2         ; R0_w=-9223372036854775806 R1_w=scalar(umin=9223372036854775810,umax=9223372036854775810,var_off=(0x8000000000000000; 0xffffffff))
  13: safe

  [...]

  from 12 to 11: R0_w=-9223372036854775795 R1=scalar(umin=9223372036854775822,umax=9223372036854775823,var_off=(0x8000000000000000; 0xffffffff)) R2=pkt(off=0,r=1,imm=0) R3=pkt_end(off=0,imm=0) R10=fp0
  11: (07) r0 += 1                      ; R0_w=-9223372036854775794
  12: (ad) if r0 < r1 goto pc-2         ; R0_w=-9223372036854775794 R1=scalar(umin=9223372036854775822,umax=9223372036854775822,var_off=(0x8000000000000000; 0xffffffff))
  13: safe

  from 12 to 11: R0_w=-9223372036854775794 R1=scalar(umin=9223372036854775823,umax=9223372036854775823,var_off=(0x8000000000000000; 0xffffffff)) R2=pkt(off=0,r=1,imm=0) R3=pkt_end(off=0,imm=0) R10=fp0
  11: (07) r0 += 1                      ; R0_w=-9223372036854775793
  12: (ad) if r0 < r1 goto pc-2         ; R0_w=-9223372036854775793 R1=scalar(umin=9223372036854775823,umax=9223372036854775823,var_off=(0x8000000000000000; 0xffffffff))
  13: safe

  from 12 to 11: R0_w=-9223372036854775793 R1=scalar(umin=9223372036854775824,umax=9223372036854775823,var_off=(0x8000000000000000; 0xffffffff)) R2=pkt(off=0,r=1,imm=0) R3=pkt_end(off=0,imm=0) R10=fp0
  11: (07) r0 += 1                      ; R0_w=-9223372036854775792
  12: (ad) if r0 < r1 goto pc-2         ; R0_w=-9223372036854775792 R1=scalar(umin=9223372036854775824,umax=9223372036854775823,var_off=(0x8000000000000000; 0xffffffff))
  13: safe

  [...]

The 64bit umin=9223372036854775810 bound continuously bumps by +1 while
umax=9223372036854775823 stays as-is until the verifier complexity limit
is reached and the program gets finally rejected. During this simulation,
the umin also eventually surpasses umax. Looking at the first 'from 12
to 11' output line from the loop, R1 has the following state:

  R1_w=scalar(umin=0x8000000000000002 (9223372036854775810),
              umax=0x800000000000000f (9223372036854775823),
          var_off=(0x8000000000000000;
                           0xffffffff))

The var_off has technically not an inconsistent state but it's very
imprecise and far off surpassing 64bit umax bounds whereas the expected
output with refined known bits in var_off should have been like:

  R1_w=scalar(umin=0x8000000000000002 (9223372036854775810),
              umax=0x800000000000000f (9223372036854775823),
          var_off=(0x8000000000000000;
                                  0xf))

In the above log, var_off stays as var_off=(0x8000000000000000; 0xffffffff)
and does not converge into a narrower mask where more bits become known,
eventually transforming R1 into a constant upon umin=9223372036854775823,
umax=9223372036854775823 case where the verifier would have terminated and
let the program pass.

The __reg_combine_64_into_32() marks the subregister unknown and propagates
64bit {s,u}min/{s,u}max bounds to their 32bit equivalents iff they are within
the 32bit universe. The question came up whether __reg_combine_64_into_32()
should special case the situation that when 64bit {s,u}min bounds have
the same value as 64bit {s,u}max bounds to then assign the latter as
well to the 32bit reg->{s,u}32_{min,max}_value. As can be seen from the
above example however, that is just /one/ special case and not a /generic/
solution given above example would still not be addressed this way and
remain at an imprecise var_off=(0x8000000000000000; 0xffffffff).

The improvement is needed in __reg_bound_offset() to refine var32_off with
the updated var64_off instead of the prior reg->var_off. The reg_bounds_sync()
code first refines information about the register's min/max bounds via
__update_reg_bounds() from the current var_off, then in __reg_deduce_bounds()
from sign bit and with the potentially learned bits from bounds it'll
update the var_off tnum in __reg_bound_offset(). For example, intersecting
with the old var_off might have improved bounds slightly, e.g. if umax
was 0x7f...f and var_off was (0; 0xf...fc), then new var_off will then
result in (0; 0x7f...fc). The intersected var64_off holds then the
universe which is a superset of var32_off. The point for the latter is
not to broaden, but to further refine known bits based on the intersection
of var_off with 32 bit bounds, so that we later construct the final var_off
from upper and lower 32 bits. The final __update_reg_bounds() can then
potentially still slightly refine bounds if more bits became known from the
new var_off.

After the improvement, we can see R1 converging successively:

  func#0 @0
  0: R1=ctx(off=0,imm=0) R10=fp0
  0: (61) r2 = *(u32 *)(r1 +0)          ; R1=ctx(off=0,imm=0) R2_w=pkt(off=0,r=0,imm=0)
  1: (61) r3 = *(u32 *)(r1 +4)          ; R1=ctx(off=0,imm=0) R3_w=pkt_end(off=0,imm=0)
  2: (bf) r1 = r2                       ; R1_w=pkt(off=0,r=0,imm=0) R2_w=pkt(off=0,r=0,imm=0)
  3: (07) r1 += 1                       ; R1_w=pkt(off=1,r=0,imm=0)
  4: (2d) if r1 > r3 goto pc+8          ; R1_w=pkt(off=1,r=1,imm=0) R3_w=pkt_end(off=0,imm=0)
  5: (71) r1 = *(u8 *)(r2 +0)           ; R1_w=scalar(umax=255,var_off=(0x0; 0xff)) R2_w=pkt(off=0,r=1,imm=0)
  6: (18) r0 = 0x7fffffffffffff10       ; R0_w=9223372036854775568
  8: (0f) r1 += r0                      ; R0_w=9223372036854775568 R1_w=scalar(umin=9223372036854775568,umax=9223372036854775823,s32_min=-240,s32_max=15)
  9: (18) r0 = 0x8000000000000000       ; R0_w=-9223372036854775808
  11: (07) r0 += 1                      ; R0_w=-9223372036854775807
  12: (ad) if r0 < r1 goto pc-2         ; R0_w=-9223372036854775807 R1_w=scalar(umin=9223372036854775568,umax=9223372036854775809)
  13: (b7) r0 = 0                       ; R0_w=0
  14: (95) exit

  from 12 to 11: R0_w=-9223372036854775807 R1_w=scalar(umin=9223372036854775810,umax=9223372036854775823,var_off=(0x8000000000000000; 0xf),s32_min=0,s32_max=15,u32_max=15) R2_w=pkt(off=0,r=1,imm=0) R3_w=pkt_end(off=0,imm=0) R10=fp0
  11: (07) r0 += 1                      ; R0_w=-9223372036854775806
  12: (ad) if r0 < r1 goto pc-2         ; R0_w=-9223372036854775806 R1_w=-9223372036854775806
  13: safe

  from 12 to 11: R0_w=-9223372036854775806 R1_w=scalar(umin=9223372036854775811,umax=9223372036854775823,var_off=(0x8000000000000000; 0xf),s32_min=0,s32_max=15,u32_max=15) R2_w=pkt(off=0,r=1,imm=0) R3_w=pkt_end(off=0,imm=0) R10=fp0
  11: (07) r0 += 1                      ; R0_w=-9223372036854775805
  12: (ad) if r0 < r1 goto pc-2         ; R0_w=-9223372036854775805 R1_w=-9223372036854775805
  13: safe

  [...]

  from 12 to 11: R0_w=-9223372036854775798 R1=scalar(umin=9223372036854775819,umax=9223372036854775823,var_off=(0x8000000000000008; 0x7),s32_min=8,s32_max=15,u32_min=8,u32_max=15) R2=pkt(off=0,r=1,imm=0) R3=pkt_end(off=0,imm=0) R10=fp0
  11: (07) r0 += 1                      ; R0_w=-9223372036854775797
  12: (ad) if r0 < r1 goto pc-2         ; R0_w=-9223372036854775797 R1=-9223372036854775797
  13: safe

  from 12 to 11: R0_w=-9223372036854775797 R1=scalar(umin=9223372036854775820,umax=9223372036854775823,var_off=(0x800000000000000c; 0x3),s32_min=12,s32_max=15,u32_min=12,u32_max=15) R2=pkt(off=0,r=1,imm=0) R3=pkt_end(off=0,imm=0) R10=fp0
  11: (07) r0 += 1                      ; R0_w=-9223372036854775796
  12: (ad) if r0 < r1 goto pc-2         ; R0_w=-9223372036854775796 R1=-9223372036854775796
  13: safe

  from 12 to 11: R0_w=-9223372036854775796 R1=scalar(umin=9223372036854775821,umax=9223372036854775823,var_off=(0x800000000000000c; 0x3),s32_min=12,s32_max=15,u32_min=12,u32_max=15) R2=pkt(off=0,r=1,imm=0) R3=pkt_end(off=0,imm=0) R10=fp0
  11: (07) r0 += 1                      ; R0_w=-9223372036854775795
  12: (ad) if r0 < r1 goto pc-2         ; R0_w=-9223372036854775795 R1=-9223372036854775795
  13: safe

  from 12 to 11: R0_w=-9223372036854775795 R1=scalar(umin=9223372036854775822,umax=9223372036854775823,var_off=(0x800000000000000e; 0x1),s32_min=14,s32_max=15,u32_min=14,u32_max=15) R2=pkt(off=0,r=1,imm=0) R3=pkt_end(off=0,imm=0) R10=fp0
  11: (07) r0 += 1                      ; R0_w=-9223372036854775794
  12: (ad) if r0 < r1 goto pc-2         ; R0_w=-9223372036854775794 R1=-9223372036854775794
  13: safe

  from 12 to 11: R0_w=-9223372036854775794 R1=-9223372036854775793 R2=pkt(off=0,r=1,imm=0) R3=pkt_end(off=0,imm=0) R10=fp0
  11: (07) r0 += 1                      ; R0_w=-9223372036854775793
  12: (ad) if r0 < r1 goto pc-2
  last_idx 12 first_idx 12
  parent didn't have regs=1 stack=0 marks: R0_rw=P-9223372036854775801 R1_r=scalar(umin=9223372036854775815,umax=9223372036854775823,var_off=(0x8000000000000000; 0xf),s32_min=0,s32_max=15,u32_max=15) R2=pkt(off=0,r=1,imm=0) R3=pkt_end(off=0,imm=0) R10=fp0
  last_idx 11 first_idx 11
  regs=1 stack=0 before 11: (07) r0 += 1
  parent didn't have regs=1 stack=0 marks: R0_rw=P-9223372036854775805 R1_rw=scalar(umin=9223372036854775812,umax=9223372036854775823,var_off=(0x8000000000000000; 0xf),s32_min=0,s32_max=15,u32_max=15) R2_w=pkt(off=0,r=1,imm=0) R3_w=pkt_end(off=0,imm=0) R10=fp0
  last_idx 12 first_idx 0
  regs=1 stack=0 before 12: (ad) if r0 < r1 goto pc-2
  regs=1 stack=0 before 11: (07) r0 += 1
  regs=1 stack=0 before 12: (ad) if r0 < r1 goto pc-2
  regs=1 stack=0 before 11: (07) r0 += 1
  regs=1 stack=0 before 12: (ad) if r0 < r1 goto pc-2
  regs=1 stack=0 before 11: (07) r0 += 1
  regs=1 stack=0 before 9: (18) r0 = 0x8000000000000000
  last_idx 12 first_idx 12
  parent didn't have regs=2 stack=0 marks: R0_rw=P-9223372036854775801 R1_r=Pscalar(umin=9223372036854775815,umax=9223372036854775823,var_off=(0x8000000000000000; 0xf),s32_min=0,s32_max=15,u32_max=15) R2=pkt(off=0,r=1,imm=0) R3=pkt_end(off=0,imm=0) R10=fp0
  last_idx 11 first_idx 11
  regs=2 stack=0 before 11: (07) r0 += 1
  parent didn't have regs=2 stack=0 marks: R0_rw=P-9223372036854775805 R1_rw=Pscalar(umin=9223372036854775812,umax=9223372036854775823,var_off=(0x8000000000000000; 0xf),s32_min=0,s32_max=15,u32_max=15) R2_w=pkt(off=0,r=1,imm=0) R3_w=pkt_end(off=0,imm=0) R10=fp0
  last_idx 12 first_idx 0
  regs=2 stack=0 before 12: (ad) if r0 < r1 goto pc-2
  regs=2 stack=0 before 11: (07) r0 += 1
  regs=2 stack=0 before 12: (ad) if r0 < r1 goto pc-2
  regs=2 stack=0 before 11: (07) r0 += 1
  regs=2 stack=0 before 12: (ad) if r0 < r1 goto pc-2
  regs=2 stack=0 before 11: (07) r0 += 1
  regs=2 stack=0 before 9: (18) r0 = 0x8000000000000000
  regs=2 stack=0 before 8: (0f) r1 += r0
  regs=3 stack=0 before 6: (18) r0 = 0x7fffffffffffff10
  regs=2 stack=0 before 5: (71) r1 = *(u8 *)(r2 +0)
  13: safe

  from 4 to 13: safe
  verification time 322 usec
  stack depth 0
  processed 56 insns (limit 1000000) max_states_per_insn 1 total_states 3 peak_states 3 mark_read 1

This also fixes up a test case along with this improvement where we match
on the verifier log. The updated log now has a refined var_off, too.

Fixes: 3f50f132d840 ("bpf: Verifier, do explicit ALU32 bounds tracking")
Reported-by: Xu Kuohai <xukuohai@huaweicloud.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
Reviewed-by: John Fastabend <john.fastabend@gmail.com>
Link: https://lore.kernel.org/bpf/20230314203424.4015351-2-xukuohai@huaweicloud.com
Link: https://lore.kernel.org/bpf/20230322213056.2470-1-daniel@iogearbox.net
---
 kernel/bpf/verifier.c                          | 6 +++---
 tools/testing/selftests/bpf/prog_tests/align.c | 4 ++--
 2 files changed, 5 insertions(+), 5 deletions(-)

(limited to 'kernel')

diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index 50c995697f0e..fd2f216de920 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -2149,9 +2149,9 @@ static void __reg_bound_offset(struct bpf_reg_state *reg)
 	struct tnum var64_off = tnum_intersect(reg->var_off,
 					       tnum_range(reg->umin_value,
 							  reg->umax_value));
-	struct tnum var32_off = tnum_intersect(tnum_subreg(reg->var_off),
-						tnum_range(reg->u32_min_value,
-							   reg->u32_max_value));
+	struct tnum var32_off = tnum_intersect(tnum_subreg(var64_off),
+					       tnum_range(reg->u32_min_value,
+							  reg->u32_max_value));
 
 	reg->var_off = tnum_or(tnum_clear_subreg(var64_off), var32_off);
 }
diff --git a/tools/testing/selftests/bpf/prog_tests/align.c b/tools/testing/selftests/bpf/prog_tests/align.c
index c94fa8d6c4f6..b92770592563 100644
--- a/tools/testing/selftests/bpf/prog_tests/align.c
+++ b/tools/testing/selftests/bpf/prog_tests/align.c
@@ -575,14 +575,14 @@ static struct bpf_align_test tests[] = {
 			/* New unknown value in R7 is (4n), >= 76 */
 			{14, "R7_w=scalar(umin=76,umax=1096,var_off=(0x0; 0x7fc))"},
 			/* Adding it to packet pointer gives nice bounds again */
-			{16, "R5_w=pkt(id=3,off=0,r=0,umin=2,umax=1082,var_off=(0x2; 0xfffffffc)"},
+			{16, "R5_w=pkt(id=3,off=0,r=0,umin=2,umax=1082,var_off=(0x2; 0x7fc)"},
 			/* At the time the word size load is performed from R5,
 			 * its total fixed offset is NET_IP_ALIGN + reg->off (0)
 			 * which is 2.  Then the variable offset is (4n+2), so
 			 * the total offset is 4-byte aligned and meets the
 			 * load's requirements.
 			 */
-			{20, "R5=pkt(id=3,off=0,r=4,umin=2,umax=1082,var_off=(0x2; 0xfffffffc)"},
+			{20, "R5=pkt(id=3,off=0,r=4,umin=2,umax=1082,var_off=(0x2; 0x7fc)"},
 		},
 	},
 };
-- 
cgit v1.2.3


From b63cbc490e18d893632929b8faa55bb28da3fcd4 Mon Sep 17 00:00:00 2001
From: Andrii Nakryiko <andrii@kernel.org>
Date: Wed, 22 Mar 2023 16:25:02 -0700
Subject: bpf: remember meta->iter info only for initialized iters

For iter_new() functions iterator state's slot might not be yet
initialized, in which case iter_get_spi() will return -ERANGE. This is
expected and is handled properly. But for iter_next() and iter_destroy()
cases iter slot is supposed to be initialized and correct, so -ERANGE is
not possible.

Move meta->iter.{spi,frameno} initialization into iter_next/iter_destroy
handling branch to make it more explicit that valid information will be
remembered in meta->iter block for subsequent use in process_iter_next_call(),
avoiding confusingly looking -ERANGE assignment for meta->iter.spi.

Reported-by: Dan Carpenter <error27@gmail.com>
Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
Link: https://lore.kernel.org/r/20230322232502.836171-1-andrii@kernel.org
Signed-off-by: Martin KaFai Lau <martin.lau@kernel.org>
---
 kernel/bpf/verifier.c | 14 +++++++-------
 1 file changed, 7 insertions(+), 7 deletions(-)

(limited to 'kernel')

diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index fd2f216de920..64f06f6e16bf 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -6778,13 +6778,6 @@ static int process_iter_arg(struct bpf_verifier_env *env, int regno, int insn_id
 	t = btf_type_skip_modifiers(meta->btf, t->type, &btf_id);	/* STRUCT */
 	nr_slots = t->size / BPF_REG_SIZE;
 
-	spi = iter_get_spi(env, reg, nr_slots);
-	if (spi < 0 && spi != -ERANGE)
-		return spi;
-
-	meta->iter.spi = spi;
-	meta->iter.frameno = reg->frameno;
-
 	if (is_iter_new_kfunc(meta)) {
 		/* bpf_iter_<type>_new() expects pointer to uninit iter state */
 		if (!is_iter_reg_valid_uninit(env, reg, nr_slots)) {
@@ -6811,10 +6804,17 @@ static int process_iter_arg(struct bpf_verifier_env *env, int regno, int insn_id
 			return -EINVAL;
 		}
 
+		spi = iter_get_spi(env, reg, nr_slots);
+		if (spi < 0)
+			return spi;
+
 		err = mark_iter_read(env, reg, spi, nr_slots);
 		if (err)
 			return err;
 
+		/* remember meta->iter info for process_iter_next_call() */
+		meta->iter.spi = spi;
+		meta->iter.frameno = reg->frameno;
 		meta->ref_obj_id = iter_ref_obj_id(env, reg, spi);
 
 		if (is_iter_destroy_kfunc(meta)) {
-- 
cgit v1.2.3


From b671c2067a04c0668df174ff5dfdb573d1f9b074 Mon Sep 17 00:00:00 2001
From: Kui-Feng Lee <kuifeng@meta.com>
Date: Wed, 22 Mar 2023 20:23:58 -0700
Subject: bpf: Retire the struct_ops map kvalue->refcnt.

We have replaced kvalue-refcnt with synchronize_rcu() to wait for an
RCU grace period.

Maintenance of kvalue->refcnt was a complicated task, as we had to
simultaneously keep track of two reference counts: one for the
reference count of bpf_map. When the kvalue->refcnt reaches zero, we
also have to reduce the reference count on bpf_map - yet these steps
are not performed in an atomic manner and require us to be vigilant
when managing them. By eliminating kvalue->refcnt, we can make our
maintenance more straightforward as the refcount of bpf_map is now
solely managed!

To prevent the trampoline image of a struct_ops from being released
while it is still in use, we wait for an RCU grace period. The
setsockopt(TCP_CONGESTION, "...") command allows you to change your
socket's congestion control algorithm and can result in releasing the
old struct_ops implementation. It is fine. However, this function is
exposed through bpf_setsockopt(), it may be accessed by BPF programs
as well. To ensure that the trampoline image belonging to struct_op
can be safely called while its method is in use, the trampoline
safeguarde the BPF program with rcu_read_lock(). Doing so prevents any
destruction of the associated images before returning from a
trampoline and requires us to wait for an RCU grace period.

Signed-off-by: Kui-Feng Lee <kuifeng@meta.com>
Link: https://lore.kernel.org/r/20230323032405.3735486-2-kuifeng@meta.com
Signed-off-by: Martin KaFai Lau <martin.lau@kernel.org>
---
 include/linux/bpf.h         |  1 +
 kernel/bpf/bpf_struct_ops.c | 77 ++++++++++++++++++++++++++-------------------
 kernel/bpf/syscall.c        |  6 ++--
 3 files changed, 49 insertions(+), 35 deletions(-)

(limited to 'kernel')

diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index ec0df059f562..f04098468d7a 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -1945,6 +1945,7 @@ struct bpf_map *bpf_map_get_with_uref(u32 ufd);
 struct bpf_map *__bpf_map_get(struct fd f);
 void bpf_map_inc(struct bpf_map *map);
 void bpf_map_inc_with_uref(struct bpf_map *map);
+struct bpf_map *__bpf_map_inc_not_zero(struct bpf_map *map, bool uref);
 struct bpf_map * __must_check bpf_map_inc_not_zero(struct bpf_map *map);
 void bpf_map_put_with_uref(struct bpf_map *map);
 void bpf_map_put(struct bpf_map *map);
diff --git a/kernel/bpf/bpf_struct_ops.c b/kernel/bpf/bpf_struct_ops.c
index ba7a94276e3b..2f3c4a0e03ee 100644
--- a/kernel/bpf/bpf_struct_ops.c
+++ b/kernel/bpf/bpf_struct_ops.c
@@ -11,6 +11,7 @@
 #include <linux/refcount.h>
 #include <linux/mutex.h>
 #include <linux/btf_ids.h>
+#include <linux/rcupdate_wait.h>
 
 enum bpf_struct_ops_state {
 	BPF_STRUCT_OPS_STATE_INIT,
@@ -249,6 +250,7 @@ int bpf_struct_ops_map_sys_lookup_elem(struct bpf_map *map, void *key,
 	struct bpf_struct_ops_map *st_map = (struct bpf_struct_ops_map *)map;
 	struct bpf_struct_ops_value *uvalue, *kvalue;
 	enum bpf_struct_ops_state state;
+	s64 refcnt;
 
 	if (unlikely(*(u32 *)key != 0))
 		return -ENOENT;
@@ -267,7 +269,14 @@ int bpf_struct_ops_map_sys_lookup_elem(struct bpf_map *map, void *key,
 	uvalue = value;
 	memcpy(uvalue, st_map->uvalue, map->value_size);
 	uvalue->state = state;
-	refcount_set(&uvalue->refcnt, refcount_read(&kvalue->refcnt));
+
+	/* This value offers the user space a general estimate of how
+	 * many sockets are still utilizing this struct_ops for TCP
+	 * congestion control. The number might not be exact, but it
+	 * should sufficiently meet our present goals.
+	 */
+	refcnt = atomic64_read(&map->refcnt) - atomic64_read(&map->usercnt);
+	refcount_set(&uvalue->refcnt, max_t(s64, refcnt, 0));
 
 	return 0;
 }
@@ -491,7 +500,6 @@ static long bpf_struct_ops_map_update_elem(struct bpf_map *map, void *key,
 		*(unsigned long *)(udata + moff) = prog->aux->id;
 	}
 
-	refcount_set(&kvalue->refcnt, 1);
 	bpf_map_inc(map);
 
 	set_memory_rox((long)st_map->image, 1);
@@ -536,8 +544,7 @@ static long bpf_struct_ops_map_delete_elem(struct bpf_map *map, void *key)
 	switch (prev_state) {
 	case BPF_STRUCT_OPS_STATE_INUSE:
 		st_map->st_ops->unreg(&st_map->kvalue.data);
-		if (refcount_dec_and_test(&st_map->kvalue.refcnt))
-			bpf_map_put(map);
+		bpf_map_put(map);
 		return 0;
 	case BPF_STRUCT_OPS_STATE_TOBEFREE:
 		return -EINPROGRESS;
@@ -570,7 +577,7 @@ static void bpf_struct_ops_map_seq_show_elem(struct bpf_map *map, void *key,
 	kfree(value);
 }
 
-static void bpf_struct_ops_map_free(struct bpf_map *map)
+static void __bpf_struct_ops_map_free(struct bpf_map *map)
 {
 	struct bpf_struct_ops_map *st_map = (struct bpf_struct_ops_map *)map;
 
@@ -582,6 +589,28 @@ static void bpf_struct_ops_map_free(struct bpf_map *map)
 	bpf_map_area_free(st_map);
 }
 
+static void bpf_struct_ops_map_free(struct bpf_map *map)
+{
+	/* The struct_ops's function may switch to another struct_ops.
+	 *
+	 * For example, bpf_tcp_cc_x->init() may switch to
+	 * another tcp_cc_y by calling
+	 * setsockopt(TCP_CONGESTION, "tcp_cc_y").
+	 * During the switch,  bpf_struct_ops_put(tcp_cc_x) is called
+	 * and its refcount may reach 0 which then free its
+	 * trampoline image while tcp_cc_x is still running.
+	 *
+	 * A vanilla rcu gp is to wait for all bpf-tcp-cc prog
+	 * to finish. bpf-tcp-cc prog is non sleepable.
+	 * A rcu_tasks gp is to wait for the last few insn
+	 * in the tramopline image to finish before releasing
+	 * the trampoline image.
+	 */
+	synchronize_rcu_mult(call_rcu, call_rcu_tasks);
+
+	__bpf_struct_ops_map_free(map);
+}
+
 static int bpf_struct_ops_map_alloc_check(union bpf_attr *attr)
 {
 	if (attr->key_size != sizeof(unsigned int) || attr->max_entries != 1 ||
@@ -630,7 +659,7 @@ static struct bpf_map *bpf_struct_ops_map_alloc(union bpf_attr *attr)
 				   NUMA_NO_NODE);
 	st_map->image = bpf_jit_alloc_exec(PAGE_SIZE);
 	if (!st_map->uvalue || !st_map->links || !st_map->image) {
-		bpf_struct_ops_map_free(map);
+		__bpf_struct_ops_map_free(map);
 		return ERR_PTR(-ENOMEM);
 	}
 
@@ -676,41 +705,23 @@ const struct bpf_map_ops bpf_struct_ops_map_ops = {
 bool bpf_struct_ops_get(const void *kdata)
 {
 	struct bpf_struct_ops_value *kvalue;
+	struct bpf_struct_ops_map *st_map;
+	struct bpf_map *map;
 
 	kvalue = container_of(kdata, struct bpf_struct_ops_value, data);
+	st_map = container_of(kvalue, struct bpf_struct_ops_map, kvalue);
 
-	return refcount_inc_not_zero(&kvalue->refcnt);
-}
-
-static void bpf_struct_ops_put_rcu(struct rcu_head *head)
-{
-	struct bpf_struct_ops_map *st_map;
-
-	st_map = container_of(head, struct bpf_struct_ops_map, rcu);
-	bpf_map_put(&st_map->map);
+	map = __bpf_map_inc_not_zero(&st_map->map, false);
+	return !IS_ERR(map);
 }
 
 void bpf_struct_ops_put(const void *kdata)
 {
 	struct bpf_struct_ops_value *kvalue;
+	struct bpf_struct_ops_map *st_map;
 
 	kvalue = container_of(kdata, struct bpf_struct_ops_value, data);
-	if (refcount_dec_and_test(&kvalue->refcnt)) {
-		struct bpf_struct_ops_map *st_map;
-
-		st_map = container_of(kvalue, struct bpf_struct_ops_map,
-				      kvalue);
-		/* The struct_ops's function may switch to another struct_ops.
-		 *
-		 * For example, bpf_tcp_cc_x->init() may switch to
-		 * another tcp_cc_y by calling
-		 * setsockopt(TCP_CONGESTION, "tcp_cc_y").
-		 * During the switch,  bpf_struct_ops_put(tcp_cc_x) is called
-		 * and its map->refcnt may reach 0 which then free its
-		 * trampoline image while tcp_cc_x is still running.
-		 *
-		 * Thus, a rcu grace period is needed here.
-		 */
-		call_rcu(&st_map->rcu, bpf_struct_ops_put_rcu);
-	}
+	st_map = container_of(kvalue, struct bpf_struct_ops_map, kvalue);
+
+	bpf_map_put(&st_map->map);
 }
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index 099e9068bcdd..cff0348a2871 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -1303,8 +1303,10 @@ struct bpf_map *bpf_map_get_with_uref(u32 ufd)
 	return map;
 }
 
-/* map_idr_lock should have been held */
-static struct bpf_map *__bpf_map_inc_not_zero(struct bpf_map *map, bool uref)
+/* map_idr_lock should have been held or the map should have been
+ * protected by rcu read lock.
+ */
+struct bpf_map *__bpf_map_inc_not_zero(struct bpf_map *map, bool uref)
 {
 	int refold;
 
-- 
cgit v1.2.3


From 68b04864ca425d1894c96b8141d4fba1181f11cb Mon Sep 17 00:00:00 2001
From: Kui-Feng Lee <kuifeng@meta.com>
Date: Wed, 22 Mar 2023 20:24:00 -0700
Subject: bpf: Create links for BPF struct_ops maps.

Make bpf_link support struct_ops.  Previously, struct_ops were always
used alone without any associated links. Upon updating its value, a
struct_ops would be activated automatically. Yet other BPF program
types required to make a bpf_link with their instances before they
could become active. Now, however, you can create an inactive
struct_ops, and create a link to activate it later.

With bpf_links, struct_ops has a behavior similar to other BPF program
types. You can pin/unpin them from their links and the struct_ops will
be deactivated when its link is removed while previously need someone
to delete the value for it to be deactivated.

bpf_links are responsible for registering their associated
struct_ops. You can only use a struct_ops that has the BPF_F_LINK flag
set to create a bpf_link, while a structs without this flag behaves in
the same manner as before and is registered upon updating its value.

The BPF_LINK_TYPE_STRUCT_OPS serves a dual purpose. Not only is it
used to craft the links for BPF struct_ops programs, but also to
create links for BPF struct_ops them-self.  Since the links of BPF
struct_ops programs are only used to create trampolines internally,
they are never seen in other contexts. Thus, they can be reused for
struct_ops themself.

To maintain a reference to the map supporting this link, we add
bpf_struct_ops_link as an additional type. The pointer of the map is
RCU and won't be necessary until later in the patchset.

Signed-off-by: Kui-Feng Lee <kuifeng@meta.com>
Link: https://lore.kernel.org/r/20230323032405.3735486-4-kuifeng@meta.com
Signed-off-by: Martin KaFai Lau <martin.lau@kernel.org>
---
 include/linux/bpf.h            |   7 ++
 include/uapi/linux/bpf.h       |  12 +++-
 kernel/bpf/bpf_struct_ops.c    | 143 ++++++++++++++++++++++++++++++++++++++++-
 kernel/bpf/syscall.c           |  23 ++++---
 net/ipv4/bpf_tcp_ca.c          |   8 ++-
 tools/include/uapi/linux/bpf.h |  12 +++-
 6 files changed, 190 insertions(+), 15 deletions(-)

(limited to 'kernel')

diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index f04098468d7a..8552279efe46 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -1518,6 +1518,7 @@ struct bpf_struct_ops {
 			   void *kdata, const void *udata);
 	int (*reg)(void *kdata);
 	void (*unreg)(void *kdata);
+	int (*validate)(void *kdata);
 	const struct btf_type *type;
 	const struct btf_type *value_type;
 	const char *name;
@@ -1552,6 +1553,7 @@ static inline void bpf_module_put(const void *data, struct module *owner)
 	else
 		module_put(owner);
 }
+int bpf_struct_ops_link_create(union bpf_attr *attr);
 
 #ifdef CONFIG_NET
 /* Define it here to avoid the use of forward declaration */
@@ -1592,6 +1594,11 @@ static inline int bpf_struct_ops_map_sys_lookup_elem(struct bpf_map *map,
 {
 	return -EINVAL;
 }
+static inline int bpf_struct_ops_link_create(union bpf_attr *attr)
+{
+	return -EOPNOTSUPP;
+}
+
 #endif
 
 #if defined(CONFIG_CGROUP_BPF) && defined(CONFIG_BPF_LSM)
diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index 13129df937cd..42f40ee083bf 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -1033,6 +1033,7 @@ enum bpf_attach_type {
 	BPF_PERF_EVENT,
 	BPF_TRACE_KPROBE_MULTI,
 	BPF_LSM_CGROUP,
+	BPF_STRUCT_OPS,
 	__MAX_BPF_ATTACH_TYPE
 };
 
@@ -1266,6 +1267,9 @@ enum {
 
 /* Create a map that is suitable to be an inner map with dynamic max entries */
 	BPF_F_INNER_MAP		= (1U << 12),
+
+/* Create a map that will be registered/unregesitered by the backed bpf_link */
+	BPF_F_LINK		= (1U << 13),
 };
 
 /* Flags for BPF_PROG_QUERY. */
@@ -1507,7 +1511,10 @@ union bpf_attr {
 	} task_fd_query;
 
 	struct { /* struct used by BPF_LINK_CREATE command */
-		__u32		prog_fd;	/* eBPF program to attach */
+		union {
+			__u32		prog_fd;	/* eBPF program to attach */
+			__u32		map_fd;		/* struct_ops to attach */
+		};
 		union {
 			__u32		target_fd;	/* object to attach to */
 			__u32		target_ifindex; /* target ifindex */
@@ -6379,6 +6386,9 @@ struct bpf_link_info {
 		struct {
 			__u32 ifindex;
 		} xdp;
+		struct {
+			__u32 map_id;
+		} struct_ops;
 	};
 } __attribute__((aligned(8)));
 
diff --git a/kernel/bpf/bpf_struct_ops.c b/kernel/bpf/bpf_struct_ops.c
index 2f3c4a0e03ee..3d6b5240c25a 100644
--- a/kernel/bpf/bpf_struct_ops.c
+++ b/kernel/bpf/bpf_struct_ops.c
@@ -17,6 +17,7 @@ enum bpf_struct_ops_state {
 	BPF_STRUCT_OPS_STATE_INIT,
 	BPF_STRUCT_OPS_STATE_INUSE,
 	BPF_STRUCT_OPS_STATE_TOBEFREE,
+	BPF_STRUCT_OPS_STATE_READY,
 };
 
 #define BPF_STRUCT_OPS_COMMON_VALUE			\
@@ -59,6 +60,11 @@ struct bpf_struct_ops_map {
 	struct bpf_struct_ops_value kvalue;
 };
 
+struct bpf_struct_ops_link {
+	struct bpf_link link;
+	struct bpf_map __rcu *map;
+};
+
 #define VALUE_PREFIX "bpf_struct_ops_"
 #define VALUE_PREFIX_LEN (sizeof(VALUE_PREFIX) - 1)
 
@@ -500,11 +506,29 @@ static long bpf_struct_ops_map_update_elem(struct bpf_map *map, void *key,
 		*(unsigned long *)(udata + moff) = prog->aux->id;
 	}
 
-	bpf_map_inc(map);
+	if (st_map->map.map_flags & BPF_F_LINK) {
+		err = st_ops->validate(kdata);
+		if (err)
+			goto reset_unlock;
+		set_memory_rox((long)st_map->image, 1);
+		/* Let bpf_link handle registration & unregistration.
+		 *
+		 * Pair with smp_load_acquire() during lookup_elem().
+		 */
+		smp_store_release(&kvalue->state, BPF_STRUCT_OPS_STATE_READY);
+		goto unlock;
+	}
 
 	set_memory_rox((long)st_map->image, 1);
 	err = st_ops->reg(kdata);
 	if (likely(!err)) {
+		/* This refcnt increment on the map here after
+		 * 'st_ops->reg()' is secure since the state of the
+		 * map must be set to INIT at this moment, and thus
+		 * bpf_struct_ops_map_delete_elem() can't unregister
+		 * or transition it to TOBEFREE concurrently.
+		 */
+		bpf_map_inc(map);
 		/* Pair with smp_load_acquire() during lookup_elem().
 		 * It ensures the above udata updates (e.g. prog->aux->id)
 		 * can be seen once BPF_STRUCT_OPS_STATE_INUSE is set.
@@ -520,7 +544,6 @@ static long bpf_struct_ops_map_update_elem(struct bpf_map *map, void *key,
 	 */
 	set_memory_nx((long)st_map->image, 1);
 	set_memory_rw((long)st_map->image, 1);
-	bpf_map_put(map);
 
 reset_unlock:
 	bpf_struct_ops_map_put_progs(st_map);
@@ -538,6 +561,9 @@ static long bpf_struct_ops_map_delete_elem(struct bpf_map *map, void *key)
 	struct bpf_struct_ops_map *st_map;
 
 	st_map = (struct bpf_struct_ops_map *)map;
+	if (st_map->map.map_flags & BPF_F_LINK)
+		return -EOPNOTSUPP;
+
 	prev_state = cmpxchg(&st_map->kvalue.state,
 			     BPF_STRUCT_OPS_STATE_INUSE,
 			     BPF_STRUCT_OPS_STATE_TOBEFREE);
@@ -614,7 +640,7 @@ static void bpf_struct_ops_map_free(struct bpf_map *map)
 static int bpf_struct_ops_map_alloc_check(union bpf_attr *attr)
 {
 	if (attr->key_size != sizeof(unsigned int) || attr->max_entries != 1 ||
-	    attr->map_flags || !attr->btf_vmlinux_value_type_id)
+	    (attr->map_flags & ~BPF_F_LINK) || !attr->btf_vmlinux_value_type_id)
 		return -EINVAL;
 	return 0;
 }
@@ -638,6 +664,9 @@ static struct bpf_map *bpf_struct_ops_map_alloc(union bpf_attr *attr)
 	if (attr->value_size != vt->size)
 		return ERR_PTR(-EINVAL);
 
+	if (attr->map_flags & BPF_F_LINK && !st_ops->validate)
+		return ERR_PTR(-EOPNOTSUPP);
+
 	t = st_ops->type;
 
 	st_map_size = sizeof(*st_map) +
@@ -725,3 +754,111 @@ void bpf_struct_ops_put(const void *kdata)
 
 	bpf_map_put(&st_map->map);
 }
+
+static bool bpf_struct_ops_valid_to_reg(struct bpf_map *map)
+{
+	struct bpf_struct_ops_map *st_map = (struct bpf_struct_ops_map *)map;
+
+	return map->map_type == BPF_MAP_TYPE_STRUCT_OPS &&
+		map->map_flags & BPF_F_LINK &&
+		/* Pair with smp_store_release() during map_update */
+		smp_load_acquire(&st_map->kvalue.state) == BPF_STRUCT_OPS_STATE_READY;
+}
+
+static void bpf_struct_ops_map_link_dealloc(struct bpf_link *link)
+{
+	struct bpf_struct_ops_link *st_link;
+	struct bpf_struct_ops_map *st_map;
+
+	st_link = container_of(link, struct bpf_struct_ops_link, link);
+	st_map = (struct bpf_struct_ops_map *)
+		rcu_dereference_protected(st_link->map, true);
+	if (st_map) {
+		/* st_link->map can be NULL if
+		 * bpf_struct_ops_link_create() fails to register.
+		 */
+		st_map->st_ops->unreg(&st_map->kvalue.data);
+		bpf_map_put(&st_map->map);
+	}
+	kfree(st_link);
+}
+
+static void bpf_struct_ops_map_link_show_fdinfo(const struct bpf_link *link,
+					    struct seq_file *seq)
+{
+	struct bpf_struct_ops_link *st_link;
+	struct bpf_map *map;
+
+	st_link = container_of(link, struct bpf_struct_ops_link, link);
+	rcu_read_lock();
+	map = rcu_dereference(st_link->map);
+	seq_printf(seq, "map_id:\t%d\n", map->id);
+	rcu_read_unlock();
+}
+
+static int bpf_struct_ops_map_link_fill_link_info(const struct bpf_link *link,
+					       struct bpf_link_info *info)
+{
+	struct bpf_struct_ops_link *st_link;
+	struct bpf_map *map;
+
+	st_link = container_of(link, struct bpf_struct_ops_link, link);
+	rcu_read_lock();
+	map = rcu_dereference(st_link->map);
+	info->struct_ops.map_id = map->id;
+	rcu_read_unlock();
+	return 0;
+}
+
+static const struct bpf_link_ops bpf_struct_ops_map_lops = {
+	.dealloc = bpf_struct_ops_map_link_dealloc,
+	.show_fdinfo = bpf_struct_ops_map_link_show_fdinfo,
+	.fill_link_info = bpf_struct_ops_map_link_fill_link_info,
+};
+
+int bpf_struct_ops_link_create(union bpf_attr *attr)
+{
+	struct bpf_struct_ops_link *link = NULL;
+	struct bpf_link_primer link_primer;
+	struct bpf_struct_ops_map *st_map;
+	struct bpf_map *map;
+	int err;
+
+	map = bpf_map_get(attr->link_create.map_fd);
+	if (!map)
+		return -EINVAL;
+
+	st_map = (struct bpf_struct_ops_map *)map;
+
+	if (!bpf_struct_ops_valid_to_reg(map)) {
+		err = -EINVAL;
+		goto err_out;
+	}
+
+	link = kzalloc(sizeof(*link), GFP_USER);
+	if (!link) {
+		err = -ENOMEM;
+		goto err_out;
+	}
+	bpf_link_init(&link->link, BPF_LINK_TYPE_STRUCT_OPS, &bpf_struct_ops_map_lops, NULL);
+
+	err = bpf_link_prime(&link->link, &link_primer);
+	if (err)
+		goto err_out;
+
+	err = st_map->st_ops->reg(st_map->kvalue.data);
+	if (err) {
+		bpf_link_cleanup(&link_primer);
+		link = NULL;
+		goto err_out;
+	}
+	RCU_INIT_POINTER(link->map, map);
+
+	return bpf_link_settle(&link_primer);
+
+err_out:
+	bpf_map_put(map);
+	kfree(link);
+	return err;
+}
+
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index cff0348a2871..21f76698875c 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -2825,16 +2825,19 @@ static void bpf_link_show_fdinfo(struct seq_file *m, struct file *filp)
 	const struct bpf_prog *prog = link->prog;
 	char prog_tag[sizeof(prog->tag) * 2 + 1] = { };
 
-	bin2hex(prog_tag, prog->tag, sizeof(prog->tag));
 	seq_printf(m,
 		   "link_type:\t%s\n"
-		   "link_id:\t%u\n"
-		   "prog_tag:\t%s\n"
-		   "prog_id:\t%u\n",
+		   "link_id:\t%u\n",
 		   bpf_link_type_strs[link->type],
-		   link->id,
-		   prog_tag,
-		   prog->aux->id);
+		   link->id);
+	if (prog) {
+		bin2hex(prog_tag, prog->tag, sizeof(prog->tag));
+		seq_printf(m,
+			   "prog_tag:\t%s\n"
+			   "prog_id:\t%u\n",
+			   prog_tag,
+			   prog->aux->id);
+	}
 	if (link->ops->show_fdinfo)
 		link->ops->show_fdinfo(link, m);
 }
@@ -4314,7 +4317,8 @@ static int bpf_link_get_info_by_fd(struct file *file,
 
 	info.type = link->type;
 	info.id = link->id;
-	info.prog_id = link->prog->aux->id;
+	if (link->prog)
+		info.prog_id = link->prog->aux->id;
 
 	if (link->ops->fill_link_info) {
 		err = link->ops->fill_link_info(link, &info);
@@ -4577,6 +4581,9 @@ static int link_create(union bpf_attr *attr, bpfptr_t uattr)
 	if (CHECK_ATTR(BPF_LINK_CREATE))
 		return -EINVAL;
 
+	if (attr->link_create.attach_type == BPF_STRUCT_OPS)
+		return bpf_struct_ops_link_create(attr);
+
 	prog = bpf_prog_get(attr->link_create.prog_fd);
 	if (IS_ERR(prog))
 		return PTR_ERR(prog);
diff --git a/net/ipv4/bpf_tcp_ca.c b/net/ipv4/bpf_tcp_ca.c
index 13fc0c185cd9..bbbd5eb94db2 100644
--- a/net/ipv4/bpf_tcp_ca.c
+++ b/net/ipv4/bpf_tcp_ca.c
@@ -239,8 +239,6 @@ static int bpf_tcp_ca_init_member(const struct btf_type *t,
 		if (bpf_obj_name_cpy(tcp_ca->name, utcp_ca->name,
 				     sizeof(tcp_ca->name)) <= 0)
 			return -EINVAL;
-		if (tcp_ca_find(utcp_ca->name))
-			return -EEXIST;
 		return 1;
 	}
 
@@ -266,6 +264,11 @@ static void bpf_tcp_ca_unreg(void *kdata)
 	tcp_unregister_congestion_control(kdata);
 }
 
+static int bpf_tcp_ca_validate(void *kdata)
+{
+	return tcp_validate_congestion_control(kdata);
+}
+
 struct bpf_struct_ops bpf_tcp_congestion_ops = {
 	.verifier_ops = &bpf_tcp_ca_verifier_ops,
 	.reg = bpf_tcp_ca_reg,
@@ -273,6 +276,7 @@ struct bpf_struct_ops bpf_tcp_congestion_ops = {
 	.check_member = bpf_tcp_ca_check_member,
 	.init_member = bpf_tcp_ca_init_member,
 	.init = bpf_tcp_ca_init,
+	.validate = bpf_tcp_ca_validate,
 	.name = "tcp_congestion_ops",
 };
 
diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h
index 13129df937cd..9cf1deaf21f2 100644
--- a/tools/include/uapi/linux/bpf.h
+++ b/tools/include/uapi/linux/bpf.h
@@ -1033,6 +1033,7 @@ enum bpf_attach_type {
 	BPF_PERF_EVENT,
 	BPF_TRACE_KPROBE_MULTI,
 	BPF_LSM_CGROUP,
+	BPF_STRUCT_OPS,
 	__MAX_BPF_ATTACH_TYPE
 };
 
@@ -1266,6 +1267,9 @@ enum {
 
 /* Create a map that is suitable to be an inner map with dynamic max entries */
 	BPF_F_INNER_MAP		= (1U << 12),
+
+/* Create a map that will be registered/unregesitered by the backed bpf_link */
+	BPF_F_LINK		= (1U << 13),
 };
 
 /* Flags for BPF_PROG_QUERY. */
@@ -1507,7 +1511,10 @@ union bpf_attr {
 	} task_fd_query;
 
 	struct { /* struct used by BPF_LINK_CREATE command */
-		__u32		prog_fd;	/* eBPF program to attach */
+		union {
+			__u32		prog_fd;	/* eBPF program to attach */
+			__u32		map_fd;		/* eBPF struct_ops to attach */
+		};
 		union {
 			__u32		target_fd;	/* object to attach to */
 			__u32		target_ifindex; /* target ifindex */
@@ -6379,6 +6386,9 @@ struct bpf_link_info {
 		struct {
 			__u32 ifindex;
 		} xdp;
+		struct {
+			__u32 map_id;
+		} struct_ops;
 	};
 } __attribute__((aligned(8)));
 
-- 
cgit v1.2.3


From aef56f2e918bf8fc8de25f0b36e8c2aba44116ec Mon Sep 17 00:00:00 2001
From: Kui-Feng Lee <kuifeng@meta.com>
Date: Wed, 22 Mar 2023 20:24:02 -0700
Subject: bpf: Update the struct_ops of a bpf_link.

By improving the BPF_LINK_UPDATE command of bpf(), it should allow you
to conveniently switch between different struct_ops on a single
bpf_link. This would enable smoother transitions from one struct_ops
to another.

The struct_ops maps passing along with BPF_LINK_UPDATE should have the
BPF_F_LINK flag.

Signed-off-by: Kui-Feng Lee <kuifeng@meta.com>
Acked-by: Andrii Nakryiko <andrii@kernel.org>
Link: https://lore.kernel.org/r/20230323032405.3735486-6-kuifeng@meta.com
Signed-off-by: Martin KaFai Lau <martin.lau@kernel.org>
---
 include/linux/bpf.h            |  3 +++
 include/uapi/linux/bpf.h       | 21 +++++++++++++-----
 kernel/bpf/bpf_struct_ops.c    | 48 +++++++++++++++++++++++++++++++++++++++++-
 kernel/bpf/syscall.c           | 34 ++++++++++++++++++++++++++++++
 net/ipv4/bpf_tcp_ca.c          |  6 ++++++
 tools/include/uapi/linux/bpf.h | 21 +++++++++++++-----
 6 files changed, 122 insertions(+), 11 deletions(-)

(limited to 'kernel')

diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index 8552279efe46..2d8f3f639e68 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -1476,6 +1476,8 @@ struct bpf_link_ops {
 	void (*show_fdinfo)(const struct bpf_link *link, struct seq_file *seq);
 	int (*fill_link_info)(const struct bpf_link *link,
 			      struct bpf_link_info *info);
+	int (*update_map)(struct bpf_link *link, struct bpf_map *new_map,
+			  struct bpf_map *old_map);
 };
 
 struct bpf_tramp_link {
@@ -1518,6 +1520,7 @@ struct bpf_struct_ops {
 			   void *kdata, const void *udata);
 	int (*reg)(void *kdata);
 	void (*unreg)(void *kdata);
+	int (*update)(void *kdata, void *old_kdata);
 	int (*validate)(void *kdata);
 	const struct btf_type *type;
 	const struct btf_type *value_type;
diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index 42f40ee083bf..e3d3b5160d26 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -1555,12 +1555,23 @@ union bpf_attr {
 
 	struct { /* struct used by BPF_LINK_UPDATE command */
 		__u32		link_fd;	/* link fd */
-		/* new program fd to update link with */
-		__u32		new_prog_fd;
+		union {
+			/* new program fd to update link with */
+			__u32		new_prog_fd;
+			/* new struct_ops map fd to update link with */
+			__u32           new_map_fd;
+		};
 		__u32		flags;		/* extra flags */
-		/* expected link's program fd; is specified only if
-		 * BPF_F_REPLACE flag is set in flags */
-		__u32		old_prog_fd;
+		union {
+			/* expected link's program fd; is specified only if
+			 * BPF_F_REPLACE flag is set in flags.
+			 */
+			__u32		old_prog_fd;
+			/* expected link's map fd; is specified only
+			 * if BPF_F_REPLACE flag is set.
+			 */
+			__u32           old_map_fd;
+		};
 	} link_update;
 
 	struct {
diff --git a/kernel/bpf/bpf_struct_ops.c b/kernel/bpf/bpf_struct_ops.c
index 3d6b5240c25a..6401deca3b56 100644
--- a/kernel/bpf/bpf_struct_ops.c
+++ b/kernel/bpf/bpf_struct_ops.c
@@ -65,6 +65,8 @@ struct bpf_struct_ops_link {
 	struct bpf_map __rcu *map;
 };
 
+static DEFINE_MUTEX(update_mutex);
+
 #define VALUE_PREFIX "bpf_struct_ops_"
 #define VALUE_PREFIX_LEN (sizeof(VALUE_PREFIX) - 1)
 
@@ -664,7 +666,7 @@ static struct bpf_map *bpf_struct_ops_map_alloc(union bpf_attr *attr)
 	if (attr->value_size != vt->size)
 		return ERR_PTR(-EINVAL);
 
-	if (attr->map_flags & BPF_F_LINK && !st_ops->validate)
+	if (attr->map_flags & BPF_F_LINK && (!st_ops->validate || !st_ops->update))
 		return ERR_PTR(-EOPNOTSUPP);
 
 	t = st_ops->type;
@@ -810,10 +812,54 @@ static int bpf_struct_ops_map_link_fill_link_info(const struct bpf_link *link,
 	return 0;
 }
 
+static int bpf_struct_ops_map_link_update(struct bpf_link *link, struct bpf_map *new_map,
+					  struct bpf_map *expected_old_map)
+{
+	struct bpf_struct_ops_map *st_map, *old_st_map;
+	struct bpf_map *old_map;
+	struct bpf_struct_ops_link *st_link;
+	int err = 0;
+
+	st_link = container_of(link, struct bpf_struct_ops_link, link);
+	st_map = container_of(new_map, struct bpf_struct_ops_map, map);
+
+	if (!bpf_struct_ops_valid_to_reg(new_map))
+		return -EINVAL;
+
+	mutex_lock(&update_mutex);
+
+	old_map = rcu_dereference_protected(st_link->map, lockdep_is_held(&update_mutex));
+	if (expected_old_map && old_map != expected_old_map) {
+		err = -EPERM;
+		goto err_out;
+	}
+
+	old_st_map = container_of(old_map, struct bpf_struct_ops_map, map);
+	/* The new and old struct_ops must be the same type. */
+	if (st_map->st_ops != old_st_map->st_ops) {
+		err = -EINVAL;
+		goto err_out;
+	}
+
+	err = st_map->st_ops->update(st_map->kvalue.data, old_st_map->kvalue.data);
+	if (err)
+		goto err_out;
+
+	bpf_map_inc(new_map);
+	rcu_assign_pointer(st_link->map, new_map);
+	bpf_map_put(old_map);
+
+err_out:
+	mutex_unlock(&update_mutex);
+
+	return err;
+}
+
 static const struct bpf_link_ops bpf_struct_ops_map_lops = {
 	.dealloc = bpf_struct_ops_map_link_dealloc,
 	.show_fdinfo = bpf_struct_ops_map_link_show_fdinfo,
 	.fill_link_info = bpf_struct_ops_map_link_fill_link_info,
+	.update_map = bpf_struct_ops_map_link_update,
 };
 
 int bpf_struct_ops_link_create(union bpf_attr *attr)
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index 21f76698875c..b4d758fa5981 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -4682,6 +4682,35 @@ out:
 	return ret;
 }
 
+static int link_update_map(struct bpf_link *link, union bpf_attr *attr)
+{
+	struct bpf_map *new_map, *old_map = NULL;
+	int ret;
+
+	new_map = bpf_map_get(attr->link_update.new_map_fd);
+	if (IS_ERR(new_map))
+		return -EINVAL;
+
+	if (attr->link_update.flags & BPF_F_REPLACE) {
+		old_map = bpf_map_get(attr->link_update.old_map_fd);
+		if (IS_ERR(old_map)) {
+			ret = -EINVAL;
+			goto out_put;
+		}
+	} else if (attr->link_update.old_map_fd) {
+		ret = -EINVAL;
+		goto out_put;
+	}
+
+	ret = link->ops->update_map(link, new_map, old_map);
+
+	if (old_map)
+		bpf_map_put(old_map);
+out_put:
+	bpf_map_put(new_map);
+	return ret;
+}
+
 #define BPF_LINK_UPDATE_LAST_FIELD link_update.old_prog_fd
 
 static int link_update(union bpf_attr *attr)
@@ -4702,6 +4731,11 @@ static int link_update(union bpf_attr *attr)
 	if (IS_ERR(link))
 		return PTR_ERR(link);
 
+	if (link->ops->update_map) {
+		ret = link_update_map(link, attr);
+		goto out_put_link;
+	}
+
 	new_prog = bpf_prog_get(attr->link_update.new_prog_fd);
 	if (IS_ERR(new_prog)) {
 		ret = PTR_ERR(new_prog);
diff --git a/net/ipv4/bpf_tcp_ca.c b/net/ipv4/bpf_tcp_ca.c
index bbbd5eb94db2..e8b27826283e 100644
--- a/net/ipv4/bpf_tcp_ca.c
+++ b/net/ipv4/bpf_tcp_ca.c
@@ -264,6 +264,11 @@ static void bpf_tcp_ca_unreg(void *kdata)
 	tcp_unregister_congestion_control(kdata);
 }
 
+static int bpf_tcp_ca_update(void *kdata, void *old_kdata)
+{
+	return tcp_update_congestion_control(kdata, old_kdata);
+}
+
 static int bpf_tcp_ca_validate(void *kdata)
 {
 	return tcp_validate_congestion_control(kdata);
@@ -273,6 +278,7 @@ struct bpf_struct_ops bpf_tcp_congestion_ops = {
 	.verifier_ops = &bpf_tcp_ca_verifier_ops,
 	.reg = bpf_tcp_ca_reg,
 	.unreg = bpf_tcp_ca_unreg,
+	.update = bpf_tcp_ca_update,
 	.check_member = bpf_tcp_ca_check_member,
 	.init_member = bpf_tcp_ca_init_member,
 	.init = bpf_tcp_ca_init,
diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h
index 9cf1deaf21f2..d6c5a022ae28 100644
--- a/tools/include/uapi/linux/bpf.h
+++ b/tools/include/uapi/linux/bpf.h
@@ -1555,12 +1555,23 @@ union bpf_attr {
 
 	struct { /* struct used by BPF_LINK_UPDATE command */
 		__u32		link_fd;	/* link fd */
-		/* new program fd to update link with */
-		__u32		new_prog_fd;
+		union {
+			/* new program fd to update link with */
+			__u32		new_prog_fd;
+			/* new struct_ops map fd to update link with */
+			__u32           new_map_fd;
+		};
 		__u32		flags;		/* extra flags */
-		/* expected link's program fd; is specified only if
-		 * BPF_F_REPLACE flag is set in flags */
-		__u32		old_prog_fd;
+		union {
+			/* expected link's program fd; is specified only if
+			 * BPF_F_REPLACE flag is set in flags.
+			 */
+			__u32		old_prog_fd;
+			/* expected link's map fd; is specified only
+			 * if BPF_F_REPLACE flag is set.
+			 */
+			__u32           old_map_fd;
+		};
 	} link_update;
 
 	struct {
-- 
cgit v1.2.3


From 55fbae05476df65e5eee8be54f61d0257af0240b Mon Sep 17 00:00:00 2001
From: Martin KaFai Lau <martin.lau@kernel.org>
Date: Fri, 24 Mar 2023 11:42:41 -0700
Subject: bpf: Check IS_ERR for the bpf_map_get() return value

This patch fixes a mistake in checking NULL instead of
checking IS_ERR for the bpf_map_get() return value.

It also fixes the return value in link_update_map() from -EINVAL
to PTR_ERR(*_map).

Reported-by: syzbot+71ccc0fe37abb458406b@syzkaller.appspotmail.com
Fixes: 68b04864ca42 ("bpf: Create links for BPF struct_ops maps.")
Fixes: aef56f2e918b ("bpf: Update the struct_ops of a bpf_link.")
Signed-off-by: Martin KaFai Lau <martin.lau@kernel.org>
Acked-by: Kui-Feng Lee <kuifeng@meta.com>
Acked-by: Stanislav Fomichev <sdf@google.com>
Link: https://lore.kernel.org/r/20230324184241.1387437-1-martin.lau@linux.dev
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 kernel/bpf/bpf_struct_ops.c | 4 ++--
 kernel/bpf/syscall.c        | 4 ++--
 2 files changed, 4 insertions(+), 4 deletions(-)

(limited to 'kernel')

diff --git a/kernel/bpf/bpf_struct_ops.c b/kernel/bpf/bpf_struct_ops.c
index 6401deca3b56..d3f0a4825fa6 100644
--- a/kernel/bpf/bpf_struct_ops.c
+++ b/kernel/bpf/bpf_struct_ops.c
@@ -871,8 +871,8 @@ int bpf_struct_ops_link_create(union bpf_attr *attr)
 	int err;
 
 	map = bpf_map_get(attr->link_create.map_fd);
-	if (!map)
-		return -EINVAL;
+	if (IS_ERR(map))
+		return PTR_ERR(map);
 
 	st_map = (struct bpf_struct_ops_map *)map;
 
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index b4d758fa5981..a09597c95029 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -4689,12 +4689,12 @@ static int link_update_map(struct bpf_link *link, union bpf_attr *attr)
 
 	new_map = bpf_map_get(attr->link_update.new_map_fd);
 	if (IS_ERR(new_map))
-		return -EINVAL;
+		return PTR_ERR(new_map);
 
 	if (attr->link_update.flags & BPF_F_REPLACE) {
 		old_map = bpf_map_get(attr->link_update.old_map_fd);
 		if (IS_ERR(old_map)) {
-			ret = -EINVAL;
+			ret = PTR_ERR(old_map);
 			goto out_put;
 		}
 	} else if (attr->link_update.old_map_fd) {
-- 
cgit v1.2.3


From 1431d0b584a673ea690c88a5f7e1aedd9caf0e84 Mon Sep 17 00:00:00 2001
From: David Vernet <void@manifault.com>
Date: Sat, 25 Mar 2023 16:31:42 -0500
Subject: bpf: Only invoke kptr dtor following non-NULL xchg

When a map value is being freed, we loop over all of the fields of the
corresponding BPF object and issue the appropriate cleanup calls
corresponding to the field's type. If the field is a referenced kptr, we
atomically xchg the value out of the map, and invoke the kptr's
destructor on whatever was there before (or bpf_obj_drop() it if it was
a local kptr).

Currently, we always invoke the destructor (either bpf_obj_drop() or the
kptr's registered destructor) on any KPTR_REF-type field in a map, even
if there wasn't a value in the map. This means that any function serving
as the kptr's KF_RELEASE destructor must always treat the argument as
possibly NULL, as the following can and regularly does happen:

void *xchgd_field;

/* No value was in the map, so xchgd_field is NULL */
xchgd_field = (void *)xchg(unsigned long *field_ptr, 0);
field->kptr.dtor(xchgd_field);

These are odd semantics to impose on KF_RELEASE kfuncs -- BPF programs
are prohibited by the verifier from passing NULL pointers to KF_RELEASE
kfuncs, so it doesn't make sense to require this of BPF programs, but
not the main kernel destructor path. It's also unnecessary to invoke any
cleanup logic for local kptrs. If there is no object there, there's
nothing to drop.

So as to allow KF_RELEASE kfuncs to fully assume that an argument is
non-NULL, this patch updates a KPTR_REF's destructor to only be invoked
when a non-NULL value is xchg'd out of the kptr map field.

Signed-off-by: David Vernet <void@manifault.com>
Link: https://lore.kernel.org/r/20230325213144.486885-2-void@manifault.com
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 kernel/bpf/syscall.c | 3 +++
 1 file changed, 3 insertions(+)

(limited to 'kernel')

diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index a09597c95029..e18ac7fdc210 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -677,6 +677,9 @@ void bpf_obj_free_fields(const struct btf_record *rec, void *obj)
 			break;
 		case BPF_KPTR_REF:
 			xchgd_field = (void *)xchg((unsigned long *)field_ptr, 0);
+			if (!xchgd_field)
+				break;
+
 			if (!btf_is_kernel(field->kptr.btf)) {
 				pointee_struct_meta = btf_find_struct_meta(field->kptr.btf,
 									   field->kptr.btf_id);
-- 
cgit v1.2.3


From fb2211a57c110b4ced3cb7f8570bd7246acf2d04 Mon Sep 17 00:00:00 2001
From: David Vernet <void@manifault.com>
Date: Sat, 25 Mar 2023 16:31:45 -0500
Subject: bpf: Remove now-unnecessary NULL checks for KF_RELEASE kfuncs

Now that we're not invoking kfunc destructors when the kptr in a map was
NULL, we no longer require NULL checks in many of our KF_RELEASE kfuncs.
This patch removes those NULL checks.

Signed-off-by: David Vernet <void@manifault.com>
Link: https://lore.kernel.org/r/20230325213144.486885-3-void@manifault.com
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 drivers/hid/bpf/hid_bpf_dispatch.c | 3 ---
 kernel/bpf/cpumask.c               | 3 ---
 kernel/bpf/helpers.c               | 6 ------
 net/bpf/test_run.c                 | 3 ---
 net/netfilter/nf_conntrack_bpf.c   | 2 --
 5 files changed, 17 deletions(-)

(limited to 'kernel')

diff --git a/drivers/hid/bpf/hid_bpf_dispatch.c b/drivers/hid/bpf/hid_bpf_dispatch.c
index 8a034a555d4c..d9ef45fcaeab 100644
--- a/drivers/hid/bpf/hid_bpf_dispatch.c
+++ b/drivers/hid/bpf/hid_bpf_dispatch.c
@@ -342,9 +342,6 @@ hid_bpf_release_context(struct hid_bpf_ctx *ctx)
 {
 	struct hid_bpf_ctx_kern *ctx_kern;
 
-	if (!ctx)
-		return;
-
 	ctx_kern = container_of(ctx, struct hid_bpf_ctx_kern, ctx);
 
 	kfree(ctx_kern);
diff --git a/kernel/bpf/cpumask.c b/kernel/bpf/cpumask.c
index db9da2194c1a..e991af7dc13c 100644
--- a/kernel/bpf/cpumask.c
+++ b/kernel/bpf/cpumask.c
@@ -102,9 +102,6 @@ static void cpumask_free_cb(struct rcu_head *head)
  */
 __bpf_kfunc void bpf_cpumask_release(struct bpf_cpumask *cpumask)
 {
-	if (!cpumask)
-		return;
-
 	if (refcount_dec_and_test(&cpumask->usage))
 		call_rcu(&cpumask->rcu, cpumask_free_cb);
 }
diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c
index f753676ef652..8980f6859443 100644
--- a/kernel/bpf/helpers.c
+++ b/kernel/bpf/helpers.c
@@ -2089,9 +2089,6 @@ __bpf_kfunc struct task_struct *bpf_task_kptr_get(struct task_struct **pp)
  */
 __bpf_kfunc void bpf_task_release(struct task_struct *p)
 {
-	if (!p)
-		return;
-
 	put_task_struct(p);
 }
 
@@ -2148,9 +2145,6 @@ __bpf_kfunc struct cgroup *bpf_cgroup_kptr_get(struct cgroup **cgrpp)
  */
 __bpf_kfunc void bpf_cgroup_release(struct cgroup *cgrp)
 {
-	if (!cgrp)
-		return;
-
 	cgroup_put(cgrp);
 }
 
diff --git a/net/bpf/test_run.c b/net/bpf/test_run.c
index 8d6b31209bd6..27587f1c5f36 100644
--- a/net/bpf/test_run.c
+++ b/net/bpf/test_run.c
@@ -615,9 +615,6 @@ bpf_kfunc_call_memb_acquire(void)
 
 __bpf_kfunc void bpf_kfunc_call_test_release(struct prog_test_ref_kfunc *p)
 {
-	if (!p)
-		return;
-
 	refcount_dec(&p->cnt);
 }
 
diff --git a/net/netfilter/nf_conntrack_bpf.c b/net/netfilter/nf_conntrack_bpf.c
index cd99e6dc1f35..002e9d24a1e9 100644
--- a/net/netfilter/nf_conntrack_bpf.c
+++ b/net/netfilter/nf_conntrack_bpf.c
@@ -401,8 +401,6 @@ __bpf_kfunc struct nf_conn *bpf_ct_insert_entry(struct nf_conn___init *nfct_i)
  */
 __bpf_kfunc void bpf_ct_release(struct nf_conn *nfct)
 {
-	if (!nfct)
-		return;
 	nf_ct_put(nfct);
 }
 
-- 
cgit v1.2.3


From 6c831c4684124a544f73f7c9b83bc7b2eb0b23d3 Mon Sep 17 00:00:00 2001
From: David Vernet <void@manifault.com>
Date: Sat, 25 Mar 2023 16:31:46 -0500
Subject: bpf: Treat KF_RELEASE kfuncs as KF_TRUSTED_ARGS

KF_RELEASE kfuncs are not currently treated as having KF_TRUSTED_ARGS,
even though they have a superset of the requirements of KF_TRUSTED_ARGS.
Like KF_TRUSTED_ARGS, KF_RELEASE kfuncs require a 0-offset argument, and
don't allow NULL-able arguments. Unlike KF_TRUSTED_ARGS which require
_either_ an argument with ref_obj_id > 0, _or_ (ref->type &
BPF_REG_TRUSTED_MODIFIERS) (and no unsafe modifiers allowed), KF_RELEASE
only allows for ref_obj_id > 0.  Because KF_RELEASE today doesn't
automatically imply KF_TRUSTED_ARGS, some of these requirements are
enforced in different ways that can make the behavior of the verifier
feel unpredictable. For example, a KF_RELEASE kfunc with a NULL-able
argument will currently fail in the verifier with a message like, "arg#0
is ptr_or_null_ expected ptr_ or socket" rather than "Possibly NULL
pointer passed to trusted arg0". Our intention is the same, but the
semantics are different due to implemenetation details that kfunc authors
and BPF program writers should not need to care about.

Let's make the behavior of the verifier more consistent and intuitive by
having KF_RELEASE kfuncs imply the presence of KF_TRUSTED_ARGS. Our
eventual goal is to have all kfuncs assume KF_TRUSTED_ARGS by default
anyways, so this takes us a step in that direction.

Note that it does not make sense to assume KF_TRUSTED_ARGS for all
KF_ACQUIRE kfuncs. KF_ACQUIRE kfuncs can have looser semantics than
KF_RELEASE, with e.g. KF_RCU | KF_RET_NULL. We may want to have
KF_ACQUIRE imply KF_TRUSTED_ARGS _unless_ KF_RCU is specified, but that
can be left to another patch set, and there are no such subtleties to
address for KF_RELEASE.

Signed-off-by: David Vernet <void@manifault.com>
Link: https://lore.kernel.org/r/20230325213144.486885-4-void@manifault.com
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 Documentation/bpf/kfuncs.rst                           |  7 ++++---
 kernel/bpf/cpumask.c                                   |  2 +-
 kernel/bpf/verifier.c                                  |  2 +-
 net/bpf/test_run.c                                     |  6 ++++++
 tools/testing/selftests/bpf/progs/cgrp_kfunc_failure.c |  4 ++--
 tools/testing/selftests/bpf/progs/task_kfunc_failure.c |  6 +++---
 tools/testing/selftests/bpf/verifier/calls.c           | 10 +++++++---
 tools/testing/selftests/bpf/verifier/ref_tracking.c    |  6 +++---
 8 files changed, 27 insertions(+), 16 deletions(-)

(limited to 'kernel')

diff --git a/Documentation/bpf/kfuncs.rst b/Documentation/bpf/kfuncs.rst
index 69eccf6f98ef..bf1b85941452 100644
--- a/Documentation/bpf/kfuncs.rst
+++ b/Documentation/bpf/kfuncs.rst
@@ -179,9 +179,10 @@ both are orthogonal to each other.
 ---------------------
 
 The KF_RELEASE flag is used to indicate that the kfunc releases the pointer
-passed in to it. There can be only one referenced pointer that can be passed in.
-All copies of the pointer being released are invalidated as a result of invoking
-kfunc with this flag.
+passed in to it. There can be only one referenced pointer that can be passed
+in. All copies of the pointer being released are invalidated as a result of
+invoking kfunc with this flag. KF_RELEASE kfuncs automatically receive the
+protection afforded by the KF_TRUSTED_ARGS flag described below.
 
 2.4.4 KF_KPTR_GET flag
 ----------------------
diff --git a/kernel/bpf/cpumask.c b/kernel/bpf/cpumask.c
index e991af7dc13c..7efdf5d770ca 100644
--- a/kernel/bpf/cpumask.c
+++ b/kernel/bpf/cpumask.c
@@ -402,7 +402,7 @@ __diag_pop();
 
 BTF_SET8_START(cpumask_kfunc_btf_ids)
 BTF_ID_FLAGS(func, bpf_cpumask_create, KF_ACQUIRE | KF_RET_NULL)
-BTF_ID_FLAGS(func, bpf_cpumask_release, KF_RELEASE | KF_TRUSTED_ARGS)
+BTF_ID_FLAGS(func, bpf_cpumask_release, KF_RELEASE)
 BTF_ID_FLAGS(func, bpf_cpumask_acquire, KF_ACQUIRE | KF_TRUSTED_ARGS)
 BTF_ID_FLAGS(func, bpf_cpumask_first, KF_RCU)
 BTF_ID_FLAGS(func, bpf_cpumask_first_zero, KF_RCU)
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index 64f06f6e16bf..20eb2015842f 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -9307,7 +9307,7 @@ static bool is_kfunc_release(struct bpf_kfunc_call_arg_meta *meta)
 
 static bool is_kfunc_trusted_args(struct bpf_kfunc_call_arg_meta *meta)
 {
-	return meta->kfunc_flags & KF_TRUSTED_ARGS;
+	return (meta->kfunc_flags & KF_TRUSTED_ARGS) || is_kfunc_release(meta);
 }
 
 static bool is_kfunc_sleepable(struct bpf_kfunc_call_arg_meta *meta)
diff --git a/net/bpf/test_run.c b/net/bpf/test_run.c
index 27587f1c5f36..f1652f5fbd2e 100644
--- a/net/bpf/test_run.c
+++ b/net/bpf/test_run.c
@@ -606,6 +606,11 @@ bpf_kfunc_call_test_acquire(unsigned long *scalar_ptr)
 	return &prog_test_struct;
 }
 
+__bpf_kfunc void bpf_kfunc_call_test_offset(struct prog_test_ref_kfunc *p)
+{
+	WARN_ON_ONCE(1);
+}
+
 __bpf_kfunc struct prog_test_member *
 bpf_kfunc_call_memb_acquire(void)
 {
@@ -800,6 +805,7 @@ BTF_ID_FLAGS(func, bpf_kfunc_call_test_mem_len_fail2)
 BTF_ID_FLAGS(func, bpf_kfunc_call_test_ref, KF_TRUSTED_ARGS | KF_RCU)
 BTF_ID_FLAGS(func, bpf_kfunc_call_test_destructive, KF_DESTRUCTIVE)
 BTF_ID_FLAGS(func, bpf_kfunc_call_test_static_unused_arg)
+BTF_ID_FLAGS(func, bpf_kfunc_call_test_offset)
 BTF_SET8_END(test_sk_check_kfunc_ids)
 
 static void *bpf_test_init(const union bpf_attr *kattr, u32 user_size,
diff --git a/tools/testing/selftests/bpf/progs/cgrp_kfunc_failure.c b/tools/testing/selftests/bpf/progs/cgrp_kfunc_failure.c
index 807fb0ac41e9..48b2034cadb3 100644
--- a/tools/testing/selftests/bpf/progs/cgrp_kfunc_failure.c
+++ b/tools/testing/selftests/bpf/progs/cgrp_kfunc_failure.c
@@ -206,7 +206,7 @@ int BPF_PROG(cgrp_kfunc_get_unreleased, struct cgroup *cgrp, const char *path)
 }
 
 SEC("tp_btf/cgroup_mkdir")
-__failure __msg("expects refcounted")
+__failure __msg("Possibly NULL pointer passed to trusted arg0")
 int BPF_PROG(cgrp_kfunc_release_untrusted, struct cgroup *cgrp, const char *path)
 {
 	struct __cgrps_kfunc_map_value *v;
@@ -234,7 +234,7 @@ int BPF_PROG(cgrp_kfunc_release_fp, struct cgroup *cgrp, const char *path)
 }
 
 SEC("tp_btf/cgroup_mkdir")
-__failure __msg("arg#0 is ptr_or_null_ expected ptr_ or socket")
+__failure __msg("Possibly NULL pointer passed to trusted arg0")
 int BPF_PROG(cgrp_kfunc_release_null, struct cgroup *cgrp, const char *path)
 {
 	struct __cgrps_kfunc_map_value local, *v;
diff --git a/tools/testing/selftests/bpf/progs/task_kfunc_failure.c b/tools/testing/selftests/bpf/progs/task_kfunc_failure.c
index 27994d6b2914..2c374a7ffece 100644
--- a/tools/testing/selftests/bpf/progs/task_kfunc_failure.c
+++ b/tools/testing/selftests/bpf/progs/task_kfunc_failure.c
@@ -206,7 +206,7 @@ int BPF_PROG(task_kfunc_get_unreleased, struct task_struct *task, u64 clone_flag
 }
 
 SEC("tp_btf/task_newtask")
-__failure __msg("arg#0 is untrusted_ptr_or_null_ expected ptr_ or socket")
+__failure __msg("Possibly NULL pointer passed to trusted arg0")
 int BPF_PROG(task_kfunc_release_untrusted, struct task_struct *task, u64 clone_flags)
 {
 	struct __tasks_kfunc_map_value *v;
@@ -234,7 +234,7 @@ int BPF_PROG(task_kfunc_release_fp, struct task_struct *task, u64 clone_flags)
 }
 
 SEC("tp_btf/task_newtask")
-__failure __msg("arg#0 is ptr_or_null_ expected ptr_ or socket")
+__failure __msg("Possibly NULL pointer passed to trusted arg0")
 int BPF_PROG(task_kfunc_release_null, struct task_struct *task, u64 clone_flags)
 {
 	struct __tasks_kfunc_map_value local, *v;
@@ -277,7 +277,7 @@ int BPF_PROG(task_kfunc_release_unacquired, struct task_struct *task, u64 clone_
 }
 
 SEC("tp_btf/task_newtask")
-__failure __msg("arg#0 is ptr_or_null_ expected ptr_ or socket")
+__failure __msg("Possibly NULL pointer passed to trusted arg0")
 int BPF_PROG(task_kfunc_from_pid_no_null_check, struct task_struct *task, u64 clone_flags)
 {
 	struct task_struct *acquired;
diff --git a/tools/testing/selftests/bpf/verifier/calls.c b/tools/testing/selftests/bpf/verifier/calls.c
index 5702fc9761ef..1bdf2b43e49e 100644
--- a/tools/testing/selftests/bpf/verifier/calls.c
+++ b/tools/testing/selftests/bpf/verifier/calls.c
@@ -109,7 +109,7 @@
 	},
 	.prog_type = BPF_PROG_TYPE_SCHED_CLS,
 	.result = REJECT,
-	.errstr = "arg#0 is ptr_or_null_ expected ptr_ or socket",
+	.errstr = "Possibly NULL pointer passed to trusted arg0",
 	.fixup_kfunc_btf_id = {
 		{ "bpf_kfunc_call_test_acquire", 3 },
 		{ "bpf_kfunc_call_test_release", 5 },
@@ -165,19 +165,23 @@
 	BPF_ALU64_IMM(BPF_ADD, BPF_REG_1, -8),
 	BPF_ST_MEM(BPF_DW, BPF_REG_1, 0, 0),
 	BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, BPF_PSEUDO_KFUNC_CALL, 0, 0),
+	BPF_MOV64_REG(BPF_REG_2, BPF_REG_0),
 	BPF_JMP_IMM(BPF_JNE, BPF_REG_0, 0, 1),
 	BPF_EXIT_INSN(),
 	BPF_MOV64_REG(BPF_REG_1, BPF_REG_0),
-	BPF_LDX_MEM(BPF_DW, BPF_REG_1, BPF_REG_1, 16),
 	BPF_ALU64_IMM(BPF_ADD, BPF_REG_1, -4),
 	BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, BPF_PSEUDO_KFUNC_CALL, 0, 0),
 	BPF_MOV64_IMM(BPF_REG_0, 0),
+	BPF_MOV64_REG(BPF_REG_1, BPF_REG_2),
+	BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, BPF_PSEUDO_KFUNC_CALL, 0, 0),
+	BPF_MOV64_IMM(BPF_REG_0, 0),
 	BPF_EXIT_INSN(),
 	},
 	.prog_type = BPF_PROG_TYPE_SCHED_CLS,
 	.fixup_kfunc_btf_id = {
 		{ "bpf_kfunc_call_test_acquire", 3 },
-		{ "bpf_kfunc_call_test_release", 9 },
+		{ "bpf_kfunc_call_test_offset", 9 },
+		{ "bpf_kfunc_call_test_release", 12 },
 	},
 	.result_unpriv = REJECT,
 	.result = REJECT,
diff --git a/tools/testing/selftests/bpf/verifier/ref_tracking.c b/tools/testing/selftests/bpf/verifier/ref_tracking.c
index 9540164712b7..5a2e154dd1e0 100644
--- a/tools/testing/selftests/bpf/verifier/ref_tracking.c
+++ b/tools/testing/selftests/bpf/verifier/ref_tracking.c
@@ -142,7 +142,7 @@
 	.kfunc = "bpf",
 	.expected_attach_type = BPF_LSM_MAC,
 	.flags = BPF_F_SLEEPABLE,
-	.errstr = "arg#0 is ptr_or_null_ expected ptr_ or socket",
+	.errstr = "Possibly NULL pointer passed to trusted arg0",
 	.fixup_kfunc_btf_id = {
 		{ "bpf_lookup_user_key", 2 },
 		{ "bpf_key_put", 4 },
@@ -163,7 +163,7 @@
 	.kfunc = "bpf",
 	.expected_attach_type = BPF_LSM_MAC,
 	.flags = BPF_F_SLEEPABLE,
-	.errstr = "arg#0 is ptr_or_null_ expected ptr_ or socket",
+	.errstr = "Possibly NULL pointer passed to trusted arg0",
 	.fixup_kfunc_btf_id = {
 		{ "bpf_lookup_system_key", 1 },
 		{ "bpf_key_put", 3 },
@@ -182,7 +182,7 @@
 	.kfunc = "bpf",
 	.expected_attach_type = BPF_LSM_MAC,
 	.flags = BPF_F_SLEEPABLE,
-	.errstr = "arg#0 pointer type STRUCT bpf_key must point to scalar, or struct with scalar",
+	.errstr = "Possibly NULL pointer passed to trusted arg0",
 	.fixup_kfunc_btf_id = {
 		{ "bpf_key_put", 1 },
 	},
-- 
cgit v1.2.3


From e65a5c6edbc6ca4853e6076bd81db1a410592a09 Mon Sep 17 00:00:00 2001
From: Martin KaFai Lau <martin.lau@kernel.org>
Date: Wed, 22 Mar 2023 14:52:42 -0700
Subject: bpf: Add a few bpf mem allocator functions

This patch adds a few bpf mem allocator functions which will
be used in the bpf_local_storage in a later patch.

bpf_mem_cache_alloc_flags(..., gfp_t flags) is added. When the
flags == GFP_KERNEL, it will fallback to __alloc(..., GFP_KERNEL).
bpf_local_storage knows its running context is sleepable (GFP_KERNEL)
and provides a better guarantee on memory allocation.

bpf_local_storage has some uncommon cases that its selem
cannot be reused immediately. It handles its own
rcu_head and goes through a rcu_trace gp and then free it.
bpf_mem_cache_raw_free() is added for direct free purpose
without leaking the LLIST_NODE_SZ internal knowledge.
During free time, the 'struct bpf_mem_alloc *ma' is no longer
available. However, the caller should know if it is
percpu memory or not and it can call different raw_free functions.
bpf_local_storage does not support percpu value, so only
the non-percpu 'bpf_mem_cache_raw_free()' is added in
this patch.

Signed-off-by: Martin KaFai Lau <martin.lau@kernel.org>
Link: https://lore.kernel.org/r/20230322215246.1675516-2-martin.lau@linux.dev
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 include/linux/bpf_mem_alloc.h |  2 ++
 kernel/bpf/memalloc.c         | 59 ++++++++++++++++++++++++++++++++++++-------
 2 files changed, 52 insertions(+), 9 deletions(-)

(limited to 'kernel')

diff --git a/include/linux/bpf_mem_alloc.h b/include/linux/bpf_mem_alloc.h
index a7104af61ab4..3929be5743f4 100644
--- a/include/linux/bpf_mem_alloc.h
+++ b/include/linux/bpf_mem_alloc.h
@@ -31,5 +31,7 @@ void bpf_mem_free(struct bpf_mem_alloc *ma, void *ptr);
 /* kmem_cache_alloc/free equivalent: */
 void *bpf_mem_cache_alloc(struct bpf_mem_alloc *ma);
 void bpf_mem_cache_free(struct bpf_mem_alloc *ma, void *ptr);
+void bpf_mem_cache_raw_free(void *ptr);
+void *bpf_mem_cache_alloc_flags(struct bpf_mem_alloc *ma, gfp_t flags);
 
 #endif /* _BPF_MEM_ALLOC_H */
diff --git a/kernel/bpf/memalloc.c b/kernel/bpf/memalloc.c
index 5fcdacbb8439..410637c225fb 100644
--- a/kernel/bpf/memalloc.c
+++ b/kernel/bpf/memalloc.c
@@ -121,15 +121,8 @@ static struct llist_node notrace *__llist_del_first(struct llist_head *head)
 	return entry;
 }
 
-static void *__alloc(struct bpf_mem_cache *c, int node)
+static void *__alloc(struct bpf_mem_cache *c, int node, gfp_t flags)
 {
-	/* Allocate, but don't deplete atomic reserves that typical
-	 * GFP_ATOMIC would do. irq_work runs on this cpu and kmalloc
-	 * will allocate from the current numa node which is what we
-	 * want here.
-	 */
-	gfp_t flags = GFP_NOWAIT | __GFP_NOWARN | __GFP_ACCOUNT;
-
 	if (c->percpu_size) {
 		void **obj = kmalloc_node(c->percpu_size, flags, node);
 		void *pptr = __alloc_percpu_gfp(c->unit_size, 8, flags);
@@ -185,7 +178,12 @@ static void alloc_bulk(struct bpf_mem_cache *c, int cnt, int node)
 		 */
 		obj = __llist_del_first(&c->free_by_rcu);
 		if (!obj) {
-			obj = __alloc(c, node);
+			/* Allocate, but don't deplete atomic reserves that typical
+			 * GFP_ATOMIC would do. irq_work runs on this cpu and kmalloc
+			 * will allocate from the current numa node which is what we
+			 * want here.
+			 */
+			obj = __alloc(c, node, GFP_NOWAIT | __GFP_NOWARN | __GFP_ACCOUNT);
 			if (!obj)
 				break;
 		}
@@ -676,3 +674,46 @@ void notrace bpf_mem_cache_free(struct bpf_mem_alloc *ma, void *ptr)
 
 	unit_free(this_cpu_ptr(ma->cache), ptr);
 }
+
+/* Directly does a kfree() without putting 'ptr' back to the free_llist
+ * for reuse and without waiting for a rcu_tasks_trace gp.
+ * The caller must first go through the rcu_tasks_trace gp for 'ptr'
+ * before calling bpf_mem_cache_raw_free().
+ * It could be used when the rcu_tasks_trace callback does not have
+ * a hold on the original bpf_mem_alloc object that allocated the
+ * 'ptr'. This should only be used in the uncommon code path.
+ * Otherwise, the bpf_mem_alloc's free_llist cannot be refilled
+ * and may affect performance.
+ */
+void bpf_mem_cache_raw_free(void *ptr)
+{
+	if (!ptr)
+		return;
+
+	kfree(ptr - LLIST_NODE_SZ);
+}
+
+/* When flags == GFP_KERNEL, it signals that the caller will not cause
+ * deadlock when using kmalloc. bpf_mem_cache_alloc_flags() will use
+ * kmalloc if the free_llist is empty.
+ */
+void notrace *bpf_mem_cache_alloc_flags(struct bpf_mem_alloc *ma, gfp_t flags)
+{
+	struct bpf_mem_cache *c;
+	void *ret;
+
+	c = this_cpu_ptr(ma->cache);
+
+	ret = unit_alloc(c);
+	if (!ret && flags == GFP_KERNEL) {
+		struct mem_cgroup *memcg, *old_memcg;
+
+		memcg = get_memcg(c);
+		old_memcg = set_active_memcg(memcg);
+		ret = __alloc(c, NUMA_NO_NODE, GFP_KERNEL | __GFP_NOWARN | __GFP_ACCOUNT);
+		set_active_memcg(old_memcg);
+		mem_cgroup_put(memcg);
+	}
+
+	return !ret ? NULL : ret + LLIST_NODE_SZ;
+}
-- 
cgit v1.2.3


From 08a7ce384e33e53e0732c500a8af67a73f8fceca Mon Sep 17 00:00:00 2001
From: Martin KaFai Lau <martin.lau@kernel.org>
Date: Wed, 22 Mar 2023 14:52:43 -0700
Subject: bpf: Use bpf_mem_cache_alloc/free in bpf_local_storage_elem

This patch uses bpf_mem_alloc for the task and cgroup local storage that
the bpf prog can easily get a hold of the storage owner's PTR_TO_BTF_ID.
eg. bpf_get_current_task_btf() can be used in some of the kmalloc code
path which will cause deadlock/recursion. bpf_mem_cache_alloc is
deadlock free and will solve a legit use case in [1].

For sk storage, its batch creation benchmark shows a few percent
regression when the sk create/destroy batch size is larger than 32.
The sk creation/destruction happens much more often and
depends on external traffic. Considering it is hypothetical
to be able to cause deadlock with sk storage, it can cross
the bridge to use bpf_mem_alloc till a legit (ie. useful)
use case comes up.

For inode storage, bpf_local_storage_destroy() is called before
waiting for a rcu gp and its memory cannot be reused immediately.
inode stays with kmalloc/kfree after the rcu [or tasks_trace] gp.

A 'bool bpf_ma' argument is added to bpf_local_storage_map_alloc().
Only task and cgroup storage have 'bpf_ma == true' which
means to use bpf_mem_cache_alloc/free(). This patch only changes
selem to use bpf_mem_alloc for task and cgroup. The next patch
will change the local_storage to use bpf_mem_alloc also for
task and cgroup.

Here is some more details on the changes:

* memory allocation:
After bpf_mem_cache_alloc(), the SDATA(selem)->data is zero-ed because
bpf_mem_cache_alloc() could return a reused selem. It is to keep
the existing bpf_map_kzalloc() behavior. Only SDATA(selem)->data
is zero-ed. SDATA(selem)->data is the visible part to the bpf prog.
No need to use zero_map_value() to do the zeroing because
bpf_selem_free(..., reuse_now = true) ensures no bpf prog is using
the selem before returning the selem through bpf_mem_cache_free().
For the internal fields of selem, they will be initialized when
linking to the new smap and the new local_storage.

When 'bpf_ma == false', nothing changes in this patch. It will
stay with the bpf_map_kzalloc().

* memory free:
The bpf_selem_free() and bpf_selem_free_rcu() are modified to handle
the bpf_ma == true case.

For the common selem free path where its owner is also being destroyed,
the mem is freed in bpf_local_storage_destroy(), the owner (task
and cgroup) has gone through a rcu gp. The memory can be reused
immediately, so bpf_local_storage_destroy() will call
bpf_selem_free(..., reuse_now = true) which will do
bpf_mem_cache_free() for immediate reuse consideration.

An exception is the delete elem code path. The delete elem code path
is called from the helper bpf_*_storage_delete() and the syscall
bpf_map_delete_elem(). This path is an unusual case for local
storage because the common use case is to have the local storage
staying with its owner life time so that the bpf prog and the user
space does not have to monitor the owner's destruction. For the delete
elem path, the selem cannot be reused immediately because there could
be bpf prog using it. It will call bpf_selem_free(..., reuse_now = false)
and it will wait for a rcu tasks trace gp before freeing the elem. The
rcu callback is changed to do bpf_mem_cache_raw_free() instead of kfree().

When 'bpf_ma == false', it should be the same as before.
__bpf_selem_free() is added to do the kfree_rcu and call_tasks_trace_rcu().
A few words on the 'reuse_now == true'. When 'reuse_now == true',
it is still racing with bpf_local_storage_map_free which is under rcu
protection, so it still needs to wait for a rcu gp instead of kfree().
Otherwise, the selem may be reused by slab for a totally different struct
while the bpf_local_storage_map_free() is still using it (as a
rcu reader). For the inode case, there may be other rcu readers also.
In short, when bpf_ma == false and reuse_now == true => vanilla rcu.

[1]: https://lore.kernel.org/bpf/20221118190109.1512674-1-namhyung@kernel.org/

Cc: Namhyung Kim <namhyung@kernel.org>
Signed-off-by: Martin KaFai Lau <martin.lau@kernel.org>
Link: https://lore.kernel.org/r/20230322215246.1675516-3-martin.lau@linux.dev
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 include/linux/bpf_local_storage.h |  6 ++-
 kernel/bpf/bpf_cgrp_storage.c     |  2 +-
 kernel/bpf/bpf_inode_storage.c    |  2 +-
 kernel/bpf/bpf_local_storage.c    | 95 +++++++++++++++++++++++++++++++++++----
 kernel/bpf/bpf_task_storage.c     |  2 +-
 net/core/bpf_sk_storage.c         |  2 +-
 6 files changed, 95 insertions(+), 14 deletions(-)

(limited to 'kernel')

diff --git a/include/linux/bpf_local_storage.h b/include/linux/bpf_local_storage.h
index a34f61467a2f..30efbcab2798 100644
--- a/include/linux/bpf_local_storage.h
+++ b/include/linux/bpf_local_storage.h
@@ -13,6 +13,7 @@
 #include <linux/list.h>
 #include <linux/hash.h>
 #include <linux/types.h>
+#include <linux/bpf_mem_alloc.h>
 #include <uapi/linux/btf.h>
 
 #define BPF_LOCAL_STORAGE_CACHE_SIZE	16
@@ -55,6 +56,8 @@ struct bpf_local_storage_map {
 	u32 bucket_log;
 	u16 elem_size;
 	u16 cache_idx;
+	struct bpf_mem_alloc selem_ma;
+	bool bpf_ma;
 };
 
 struct bpf_local_storage_data {
@@ -122,7 +125,8 @@ int bpf_local_storage_map_alloc_check(union bpf_attr *attr);
 
 struct bpf_map *
 bpf_local_storage_map_alloc(union bpf_attr *attr,
-			    struct bpf_local_storage_cache *cache);
+			    struct bpf_local_storage_cache *cache,
+			    bool bpf_ma);
 
 struct bpf_local_storage_data *
 bpf_local_storage_lookup(struct bpf_local_storage *local_storage,
diff --git a/kernel/bpf/bpf_cgrp_storage.c b/kernel/bpf/bpf_cgrp_storage.c
index f5b016a5484d..d17d5b694668 100644
--- a/kernel/bpf/bpf_cgrp_storage.c
+++ b/kernel/bpf/bpf_cgrp_storage.c
@@ -149,7 +149,7 @@ static int notsupp_get_next_key(struct bpf_map *map, void *key, void *next_key)
 
 static struct bpf_map *cgroup_storage_map_alloc(union bpf_attr *attr)
 {
-	return bpf_local_storage_map_alloc(attr, &cgroup_cache);
+	return bpf_local_storage_map_alloc(attr, &cgroup_cache, true);
 }
 
 static void cgroup_storage_map_free(struct bpf_map *map)
diff --git a/kernel/bpf/bpf_inode_storage.c b/kernel/bpf/bpf_inode_storage.c
index 9a5f05151898..e17ad581b9be 100644
--- a/kernel/bpf/bpf_inode_storage.c
+++ b/kernel/bpf/bpf_inode_storage.c
@@ -199,7 +199,7 @@ static int notsupp_get_next_key(struct bpf_map *map, void *key,
 
 static struct bpf_map *inode_storage_map_alloc(union bpf_attr *attr)
 {
-	return bpf_local_storage_map_alloc(attr, &inode_cache);
+	return bpf_local_storage_map_alloc(attr, &inode_cache, false);
 }
 
 static void inode_storage_map_free(struct bpf_map *map)
diff --git a/kernel/bpf/bpf_local_storage.c b/kernel/bpf/bpf_local_storage.c
index 351d991694cb..309ea727a5cb 100644
--- a/kernel/bpf/bpf_local_storage.c
+++ b/kernel/bpf/bpf_local_storage.c
@@ -80,8 +80,24 @@ bpf_selem_alloc(struct bpf_local_storage_map *smap, void *owner,
 	if (charge_mem && mem_charge(smap, owner, smap->elem_size))
 		return NULL;
 
-	selem = bpf_map_kzalloc(&smap->map, smap->elem_size,
-				gfp_flags | __GFP_NOWARN);
+	if (smap->bpf_ma) {
+		migrate_disable();
+		selem = bpf_mem_cache_alloc_flags(&smap->selem_ma, gfp_flags);
+		migrate_enable();
+		if (selem)
+			/* Keep the original bpf_map_kzalloc behavior
+			 * before started using the bpf_mem_cache_alloc.
+			 *
+			 * No need to use zero_map_value. The bpf_selem_free()
+			 * only does bpf_mem_cache_free when there is
+			 * no other bpf prog is using the selem.
+			 */
+			memset(SDATA(selem)->data, 0, smap->map.value_size);
+	} else {
+		selem = bpf_map_kzalloc(&smap->map, smap->elem_size,
+					gfp_flags | __GFP_NOWARN);
+	}
+
 	if (selem) {
 		if (value)
 			copy_map_value(&smap->map, SDATA(selem)->data, value);
@@ -124,12 +140,34 @@ static void bpf_local_storage_free(struct bpf_local_storage *local_storage,
 		call_rcu(&local_storage->rcu, bpf_local_storage_free_rcu);
 }
 
+/* rcu tasks trace callback for bpf_ma == false */
+static void __bpf_selem_free_trace_rcu(struct rcu_head *rcu)
+{
+	struct bpf_local_storage_elem *selem;
+
+	selem = container_of(rcu, struct bpf_local_storage_elem, rcu);
+	if (rcu_trace_implies_rcu_gp())
+		kfree(selem);
+	else
+		kfree_rcu(selem, rcu);
+}
+
+/* Handle bpf_ma == false */
+static void __bpf_selem_free(struct bpf_local_storage_elem *selem,
+			     bool vanilla_rcu)
+{
+	if (vanilla_rcu)
+		kfree_rcu(selem, rcu);
+	else
+		call_rcu_tasks_trace(&selem->rcu, __bpf_selem_free_trace_rcu);
+}
+
 static void bpf_selem_free_rcu(struct rcu_head *rcu)
 {
 	struct bpf_local_storage_elem *selem;
 
 	selem = container_of(rcu, struct bpf_local_storage_elem, rcu);
-	kfree(selem);
+	bpf_mem_cache_raw_free(selem);
 }
 
 static void bpf_selem_free_trace_rcu(struct rcu_head *rcu)
@@ -145,10 +183,23 @@ void bpf_selem_free(struct bpf_local_storage_elem *selem,
 		    bool reuse_now)
 {
 	bpf_obj_free_fields(smap->map.record, SDATA(selem)->data);
-	if (!reuse_now)
+
+	if (!smap->bpf_ma) {
+		__bpf_selem_free(selem, reuse_now);
+		return;
+	}
+
+	if (!reuse_now) {
 		call_rcu_tasks_trace(&selem->rcu, bpf_selem_free_trace_rcu);
-	else
-		call_rcu(&selem->rcu, bpf_selem_free_rcu);
+	} else {
+		/* Instead of using the vanilla call_rcu(),
+		 * bpf_mem_cache_free will be able to reuse selem
+		 * immediately.
+		 */
+		migrate_disable();
+		bpf_mem_cache_free(&smap->selem_ma, selem);
+		migrate_enable();
+	}
 }
 
 /* local_storage->lock must be held and selem->local_storage == local_storage.
@@ -654,13 +705,25 @@ u64 bpf_local_storage_map_mem_usage(const struct bpf_map *map)
 	return usage;
 }
 
+/* When bpf_ma == true, the bpf_mem_alloc is used to allocate and free memory.
+ * A deadlock free allocator is useful for storage that the bpf prog can easily
+ * get a hold of the owner PTR_TO_BTF_ID in any context. eg. bpf_get_current_task_btf.
+ * The task and cgroup storage fall into this case. The bpf_mem_alloc reuses
+ * memory immediately. To be reuse-immediate safe, the owner destruction
+ * code path needs to go through a rcu grace period before calling
+ * bpf_local_storage_destroy().
+ *
+ * When bpf_ma == false, the kmalloc and kfree are used.
+ */
 struct bpf_map *
 bpf_local_storage_map_alloc(union bpf_attr *attr,
-			    struct bpf_local_storage_cache *cache)
+			    struct bpf_local_storage_cache *cache,
+			    bool bpf_ma)
 {
 	struct bpf_local_storage_map *smap;
 	unsigned int i;
 	u32 nbuckets;
+	int err;
 
 	smap = bpf_map_area_alloc(sizeof(*smap), NUMA_NO_NODE);
 	if (!smap)
@@ -675,8 +738,8 @@ bpf_local_storage_map_alloc(union bpf_attr *attr,
 	smap->buckets = bpf_map_kvcalloc(&smap->map, sizeof(*smap->buckets),
 					 nbuckets, GFP_USER | __GFP_NOWARN);
 	if (!smap->buckets) {
-		bpf_map_area_free(smap);
-		return ERR_PTR(-ENOMEM);
+		err = -ENOMEM;
+		goto free_smap;
 	}
 
 	for (i = 0; i < nbuckets; i++) {
@@ -687,8 +750,20 @@ bpf_local_storage_map_alloc(union bpf_attr *attr,
 	smap->elem_size = offsetof(struct bpf_local_storage_elem,
 				   sdata.data[attr->value_size]);
 
+	smap->bpf_ma = bpf_ma;
+	if (bpf_ma) {
+		err = bpf_mem_alloc_init(&smap->selem_ma, smap->elem_size, false);
+		if (err)
+			goto free_smap;
+	}
+
 	smap->cache_idx = bpf_local_storage_cache_idx_get(cache);
 	return &smap->map;
+
+free_smap:
+	kvfree(smap->buckets);
+	bpf_map_area_free(smap);
+	return ERR_PTR(err);
 }
 
 void bpf_local_storage_map_free(struct bpf_map *map,
@@ -754,6 +829,8 @@ void bpf_local_storage_map_free(struct bpf_map *map,
 	 */
 	synchronize_rcu();
 
+	if (smap->bpf_ma)
+		bpf_mem_alloc_destroy(&smap->selem_ma);
 	kvfree(smap->buckets);
 	bpf_map_area_free(smap);
 }
diff --git a/kernel/bpf/bpf_task_storage.c b/kernel/bpf/bpf_task_storage.c
index ab5bd1ef58c4..d1af0c8f9ce4 100644
--- a/kernel/bpf/bpf_task_storage.c
+++ b/kernel/bpf/bpf_task_storage.c
@@ -309,7 +309,7 @@ static int notsupp_get_next_key(struct bpf_map *map, void *key, void *next_key)
 
 static struct bpf_map *task_storage_map_alloc(union bpf_attr *attr)
 {
-	return bpf_local_storage_map_alloc(attr, &task_cache);
+	return bpf_local_storage_map_alloc(attr, &task_cache, true);
 }
 
 static void task_storage_map_free(struct bpf_map *map)
diff --git a/net/core/bpf_sk_storage.c b/net/core/bpf_sk_storage.c
index cb0f5a105b89..085025c7130a 100644
--- a/net/core/bpf_sk_storage.c
+++ b/net/core/bpf_sk_storage.c
@@ -68,7 +68,7 @@ static void bpf_sk_storage_map_free(struct bpf_map *map)
 
 static struct bpf_map *bpf_sk_storage_map_alloc(union bpf_attr *attr)
 {
-	return bpf_local_storage_map_alloc(attr, &sk_cache);
+	return bpf_local_storage_map_alloc(attr, &sk_cache, false);
 }
 
 static int notsupp_get_next_key(struct bpf_map *map, void *key,
-- 
cgit v1.2.3


From 6ae9d5e99e1dd26babdd9502759fa25a3fd348ad Mon Sep 17 00:00:00 2001
From: Martin KaFai Lau <martin.lau@kernel.org>
Date: Wed, 22 Mar 2023 14:52:44 -0700
Subject: bpf: Use bpf_mem_cache_alloc/free for bpf_local_storage

This patch uses bpf_mem_cache_alloc/free for allocating and freeing
bpf_local_storage for task and cgroup storage.

The changes are similar to the previous patch. A few things that
worth to mention for bpf_local_storage:

The local_storage is freed when the last selem is deleted.
Before deleting a selem from local_storage, it needs to retrieve the
local_storage->smap because the bpf_selem_unlink_storage_nolock()
may have set it to NULL. Note that local_storage->smap may have
already been NULL when the selem created this local_storage has
been removed. In this case, call_rcu will be used to free the
local_storage.
Also, the bpf_ma (true or false) value is needed before calling
bpf_local_storage_free(). The bpf_ma can either be obtained from
the local_storage->smap (if available) or any of its selem's smap.
A new helper check_storage_bpf_ma() is added to obtain
bpf_ma for a deleting bpf_local_storage.

When bpf_local_storage_alloc getting a reused memory, all
fields are either in the correct values or will be initialized.
'cache[]' must already be all NULLs. 'list' must be empty.
Others will be initialized.

Cc: Namhyung Kim <namhyung@kernel.org>
Signed-off-by: Martin KaFai Lau <martin.lau@kernel.org>
Link: https://lore.kernel.org/r/20230322215246.1675516-4-martin.lau@linux.dev
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 include/linux/bpf_local_storage.h |   1 +
 kernel/bpf/bpf_local_storage.c    | 130 +++++++++++++++++++++++++++++++++-----
 2 files changed, 116 insertions(+), 15 deletions(-)

(limited to 'kernel')

diff --git a/include/linux/bpf_local_storage.h b/include/linux/bpf_local_storage.h
index 30efbcab2798..173ec7f43ed1 100644
--- a/include/linux/bpf_local_storage.h
+++ b/include/linux/bpf_local_storage.h
@@ -57,6 +57,7 @@ struct bpf_local_storage_map {
 	u16 elem_size;
 	u16 cache_idx;
 	struct bpf_mem_alloc selem_ma;
+	struct bpf_mem_alloc storage_ma;
 	bool bpf_ma;
 };
 
diff --git a/kernel/bpf/bpf_local_storage.c b/kernel/bpf/bpf_local_storage.c
index 309ea727a5cb..dab2ff4c99d9 100644
--- a/kernel/bpf/bpf_local_storage.c
+++ b/kernel/bpf/bpf_local_storage.c
@@ -111,33 +111,74 @@ bpf_selem_alloc(struct bpf_local_storage_map *smap, void *owner,
 	return NULL;
 }
 
+/* rcu tasks trace callback for bpf_ma == false */
+static void __bpf_local_storage_free_trace_rcu(struct rcu_head *rcu)
+{
+	struct bpf_local_storage *local_storage;
+
+	/* If RCU Tasks Trace grace period implies RCU grace period, do
+	 * kfree(), else do kfree_rcu().
+	 */
+	local_storage = container_of(rcu, struct bpf_local_storage, rcu);
+	if (rcu_trace_implies_rcu_gp())
+		kfree(local_storage);
+	else
+		kfree_rcu(local_storage, rcu);
+}
+
 static void bpf_local_storage_free_rcu(struct rcu_head *rcu)
 {
 	struct bpf_local_storage *local_storage;
 
 	local_storage = container_of(rcu, struct bpf_local_storage, rcu);
-	kfree(local_storage);
+	bpf_mem_cache_raw_free(local_storage);
 }
 
 static void bpf_local_storage_free_trace_rcu(struct rcu_head *rcu)
 {
-	/* If RCU Tasks Trace grace period implies RCU grace period, do
-	 * kfree(), else do kfree_rcu().
-	 */
 	if (rcu_trace_implies_rcu_gp())
 		bpf_local_storage_free_rcu(rcu);
 	else
 		call_rcu(rcu, bpf_local_storage_free_rcu);
 }
 
+/* Handle bpf_ma == false */
+static void __bpf_local_storage_free(struct bpf_local_storage *local_storage,
+				     bool vanilla_rcu)
+{
+	if (vanilla_rcu)
+		kfree_rcu(local_storage, rcu);
+	else
+		call_rcu_tasks_trace(&local_storage->rcu,
+				     __bpf_local_storage_free_trace_rcu);
+}
+
 static void bpf_local_storage_free(struct bpf_local_storage *local_storage,
-				   bool reuse_now)
+				   struct bpf_local_storage_map *smap,
+				   bool bpf_ma, bool reuse_now)
 {
-	if (!reuse_now)
+	if (!bpf_ma) {
+		__bpf_local_storage_free(local_storage, reuse_now);
+		return;
+	}
+
+	if (!reuse_now) {
 		call_rcu_tasks_trace(&local_storage->rcu,
 				     bpf_local_storage_free_trace_rcu);
-	else
+		return;
+	}
+
+	if (smap) {
+		migrate_disable();
+		bpf_mem_cache_free(&smap->storage_ma, local_storage);
+		migrate_enable();
+	} else {
+		/* smap could be NULL if the selem that triggered
+		 * this 'local_storage' creation had been long gone.
+		 * In this case, directly do call_rcu().
+		 */
 		call_rcu(&local_storage->rcu, bpf_local_storage_free_rcu);
+	}
 }
 
 /* rcu tasks trace callback for bpf_ma == false */
@@ -260,11 +301,47 @@ static bool bpf_selem_unlink_storage_nolock(struct bpf_local_storage *local_stor
 	return free_local_storage;
 }
 
+static bool check_storage_bpf_ma(struct bpf_local_storage *local_storage,
+				 struct bpf_local_storage_map *storage_smap,
+				 struct bpf_local_storage_elem *selem)
+{
+
+	struct bpf_local_storage_map *selem_smap;
+
+	/* local_storage->smap may be NULL. If it is, get the bpf_ma
+	 * from any selem in the local_storage->list. The bpf_ma of all
+	 * local_storage and selem should have the same value
+	 * for the same map type.
+	 *
+	 * If the local_storage->list is already empty, the caller will not
+	 * care about the bpf_ma value also because the caller is not
+	 * responsibile to free the local_storage.
+	 */
+
+	if (storage_smap)
+		return storage_smap->bpf_ma;
+
+	if (!selem) {
+		struct hlist_node *n;
+
+		n = rcu_dereference_check(hlist_first_rcu(&local_storage->list),
+					  bpf_rcu_lock_held());
+		if (!n)
+			return false;
+
+		selem = hlist_entry(n, struct bpf_local_storage_elem, snode);
+	}
+	selem_smap = rcu_dereference_check(SDATA(selem)->smap, bpf_rcu_lock_held());
+
+	return selem_smap->bpf_ma;
+}
+
 static void bpf_selem_unlink_storage(struct bpf_local_storage_elem *selem,
 				     bool reuse_now)
 {
+	struct bpf_local_storage_map *storage_smap;
 	struct bpf_local_storage *local_storage;
-	bool free_local_storage = false;
+	bool bpf_ma, free_local_storage = false;
 	unsigned long flags;
 
 	if (unlikely(!selem_linked_to_storage_lockless(selem)))
@@ -273,6 +350,10 @@ static void bpf_selem_unlink_storage(struct bpf_local_storage_elem *selem,
 
 	local_storage = rcu_dereference_check(selem->local_storage,
 					      bpf_rcu_lock_held());
+	storage_smap = rcu_dereference_check(local_storage->smap,
+					     bpf_rcu_lock_held());
+	bpf_ma = check_storage_bpf_ma(local_storage, storage_smap, selem);
+
 	raw_spin_lock_irqsave(&local_storage->lock, flags);
 	if (likely(selem_linked_to_storage(selem)))
 		free_local_storage = bpf_selem_unlink_storage_nolock(
@@ -280,7 +361,7 @@ static void bpf_selem_unlink_storage(struct bpf_local_storage_elem *selem,
 	raw_spin_unlock_irqrestore(&local_storage->lock, flags);
 
 	if (free_local_storage)
-		bpf_local_storage_free(local_storage, reuse_now);
+		bpf_local_storage_free(local_storage, storage_smap, bpf_ma, reuse_now);
 }
 
 void bpf_selem_link_storage_nolock(struct bpf_local_storage *local_storage,
@@ -400,8 +481,15 @@ int bpf_local_storage_alloc(void *owner,
 	if (err)
 		return err;
 
-	storage = bpf_map_kzalloc(&smap->map, sizeof(*storage),
-				  gfp_flags | __GFP_NOWARN);
+	if (smap->bpf_ma) {
+		migrate_disable();
+		storage = bpf_mem_cache_alloc_flags(&smap->storage_ma, gfp_flags);
+		migrate_enable();
+	} else {
+		storage = bpf_map_kzalloc(&smap->map, sizeof(*storage),
+					  gfp_flags | __GFP_NOWARN);
+	}
+
 	if (!storage) {
 		err = -ENOMEM;
 		goto uncharge;
@@ -447,7 +535,7 @@ int bpf_local_storage_alloc(void *owner,
 	return 0;
 
 uncharge:
-	bpf_local_storage_free(storage, true);
+	bpf_local_storage_free(storage, smap, smap->bpf_ma, true);
 	mem_uncharge(smap, owner, sizeof(*storage));
 	return err;
 }
@@ -660,11 +748,15 @@ int bpf_local_storage_map_check_btf(const struct bpf_map *map,
 
 void bpf_local_storage_destroy(struct bpf_local_storage *local_storage)
 {
+	struct bpf_local_storage_map *storage_smap;
 	struct bpf_local_storage_elem *selem;
-	bool free_storage = false;
+	bool bpf_ma, free_storage = false;
 	struct hlist_node *n;
 	unsigned long flags;
 
+	storage_smap = rcu_dereference_check(local_storage->smap, bpf_rcu_lock_held());
+	bpf_ma = check_storage_bpf_ma(local_storage, storage_smap, NULL);
+
 	/* Neither the bpf_prog nor the bpf_map's syscall
 	 * could be modifying the local_storage->list now.
 	 * Thus, no elem can be added to or deleted from the
@@ -692,7 +784,7 @@ void bpf_local_storage_destroy(struct bpf_local_storage *local_storage)
 	raw_spin_unlock_irqrestore(&local_storage->lock, flags);
 
 	if (free_storage)
-		bpf_local_storage_free(local_storage, true);
+		bpf_local_storage_free(local_storage, storage_smap, bpf_ma, true);
 }
 
 u64 bpf_local_storage_map_mem_usage(const struct bpf_map *map)
@@ -755,6 +847,12 @@ bpf_local_storage_map_alloc(union bpf_attr *attr,
 		err = bpf_mem_alloc_init(&smap->selem_ma, smap->elem_size, false);
 		if (err)
 			goto free_smap;
+
+		err = bpf_mem_alloc_init(&smap->storage_ma, sizeof(struct bpf_local_storage), false);
+		if (err) {
+			bpf_mem_alloc_destroy(&smap->selem_ma);
+			goto free_smap;
+		}
 	}
 
 	smap->cache_idx = bpf_local_storage_cache_idx_get(cache);
@@ -829,8 +927,10 @@ void bpf_local_storage_map_free(struct bpf_map *map,
 	 */
 	synchronize_rcu();
 
-	if (smap->bpf_ma)
+	if (smap->bpf_ma) {
 		bpf_mem_alloc_destroy(&smap->selem_ma);
+		bpf_mem_alloc_destroy(&smap->storage_ma);
+	}
 	kvfree(smap->buckets);
 	bpf_map_area_free(smap);
 }
-- 
cgit v1.2.3


From e4c2acab95a5947fe7948140a83e4f4918c6c048 Mon Sep 17 00:00:00 2001
From: David Vernet <void@manifault.com>
Date: Thu, 30 Mar 2023 09:52:02 -0500
Subject: bpf: Handle PTR_MAYBE_NULL case in PTR_TO_BTF_ID helper call arg

When validating a helper function argument, we use check_reg_type() to
ensure that the register containing the argument is of the correct type.
When the register's base type is PTR_TO_BTF_ID, there is some
supplemental logic where we do extra checks for various combinations of
PTR_TO_BTF_ID type modifiers. For example, for PTR_TO_BTF_ID,
PTR_TO_BTF_ID | PTR_TRUSTED, and PTR_TO_BTF_ID | MEM_RCU, we call
map_kptr_match_type() for bpf_kptr_xchg() calls, and
btf_struct_ids_match() for other helper calls.

When an unhandled PTR_TO_BTF_ID type modifier combination is passed to
check_reg_type(), the verifier fails with an internal verifier error
message. This can currently be triggered by passing a PTR_MAYBE_NULL
pointer to helper functions (currently just bpf_kptr_xchg()) with an
ARG_PTR_TO_BTF_ID_OR_NULL arg type. For example, by callin
bpf_kptr_xchg(&v->kptr, bpf_cpumask_create()).

Whether or not passing a PTR_MAYBE_NULL arg to an
ARG_PTR_TO_BTF_ID_OR_NULL argument is valid is an interesting question.
In a vacuum, it seems fine. A helper function with an
ARG_PTR_TO_BTF_ID_OR_NULL arg would seem to be implying that it can
handle either a NULL or non-NULL arg, and has logic in place to detect
and gracefully handle each. This is the case for bpf_kptr_xchg(), which
of course simply does an xchg(). On the other hand, bpf_kptr_xchg() also
specifies OBJ_RELEASE, and refcounting semantics for a PTR_MAYBE_NULL
pointer is different than handling it for a NULL _OR_ non-NULL pointer.
For example, with a non-NULL arg, we should always fail if there was not
a nonzero refcount for the value in the register being passed to the
helper. For PTR_MAYBE_NULL on the other hand, it's unclear. If the
pointer is NULL it would be fine, but if it's not NULL, it would be
incorrect to load the program.

The current solution to this is to just fail if PTR_MAYBE_NULL is
passed, and to instead require programs to have a NULL check to
explicitly handle the NULL and non-NULL cases. This seems reasonable.
Not only would it possibly be quite complicated to correctly handle
PTR_MAYBE_NULL refcounting in the verifier, but it's also an arguably
odd programming pattern in general to not explicitly handle the NULL
case anyways. For example, it seems odd to not care about whether a
pointer you're passing to bpf_kptr_xchg() was successfully allocated in
a program such as the following:

private(MASK) static struct bpf_cpumask __kptr * global_mask;

SEC("tp_btf/task_newtask")
int BPF_PROG(example, struct task_struct *task, u64 clone_flags)
{
        struct bpf_cpumask *prev;

	/* bpf_cpumask_create() returns PTR_MAYBE_NULL */
	prev = bpf_kptr_xchg(&global_mask, bpf_cpumask_create());
	if (prev)
		bpf_cpumask_release(prev);

	return 0;
}

This patch therefore updates the verifier to explicitly check for
PTR_MAYBE_NULL in check_reg_type(), and fail gracefully if it's
observed. This isn't really "fixing" anything unsafe or incorrect. We're
just updating the verifier to fail gracefully, and explicitly handle
this pattern rather than unintentionally falling back to an internal
verifier error path. A subsequent patch will update selftests.

Signed-off-by: David Vernet <void@manifault.com>
Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
Link: https://lore.kernel.org/bpf/20230330145203.80506-1-void@manifault.com
---
 kernel/bpf/verifier.c | 4 ++++
 1 file changed, 4 insertions(+)

(limited to 'kernel')

diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index 20eb2015842f..52738f9dcb15 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -7204,6 +7204,10 @@ found:
 		}
 		break;
 	}
+	case PTR_TO_BTF_ID | PTR_MAYBE_NULL:
+	case PTR_TO_BTF_ID | PTR_MAYBE_NULL | MEM_RCU:
+		verbose(env, "Possibly NULL pointer passed to helper arg%d\n", regno);
+		return -EACCES;
 	case PTR_TO_BTF_ID | MEM_ALLOC:
 		if (meta->func_id != BPF_FUNC_spin_lock && meta->func_id != BPF_FUNC_spin_unlock &&
 		    meta->func_id != BPF_FUNC_kptr_xchg) {
-- 
cgit v1.2.3


From d02c48fa113953aba0b330ec6c35f50c7d1d7986 Mon Sep 17 00:00:00 2001
From: David Vernet <void@manifault.com>
Date: Fri, 31 Mar 2023 14:57:31 -0500
Subject: bpf: Make struct task_struct an RCU-safe type

struct task_struct objects are a bit interesting in terms of how their
lifetime is protected by refcounts. task structs have two refcount
fields:

1. refcount_t usage: Protects the memory backing the task struct. When
   this refcount drops to 0, the task is immediately freed, without
   waiting for an RCU grace period to elapse. This is the field that
   most callers in the kernel currently use to ensure that a task
   remains valid while it's being referenced, and is what's currently
   tracked with bpf_task_acquire() and bpf_task_release().

2. refcount_t rcu_users: A refcount field which, when it drops to 0,
   schedules an RCU callback that drops a reference held on the 'usage'
   field above (which is acquired when the task is first created). This
   field therefore provides a form of RCU protection on the task by
   ensuring that at least one 'usage' refcount will be held until an RCU
   grace period has elapsed. The qualifier "a form of" is important
   here, as a task can remain valid after task->rcu_users has dropped to
   0 and the subsequent RCU gp has elapsed.

In terms of BPF, we want to use task->rcu_users to protect tasks that
function as referenced kptrs, and to allow tasks stored as referenced
kptrs in maps to be accessed with RCU protection.

Let's first determine whether we can safely use task->rcu_users to
protect tasks stored in maps. All of the bpf_task* kfuncs can only be
called from tracepoint, struct_ops, or BPF_PROG_TYPE_SCHED_CLS, program
types. For tracepoint and struct_ops programs, the struct task_struct
passed to a program handler will always be trusted, so it will always be
safe to call bpf_task_acquire() with any task passed to a program.
Note, however, that we must update bpf_task_acquire() to be KF_RET_NULL,
as it is possible that the task has exited by the time the program is
invoked, even if the pointer is still currently valid because the main
kernel holds a task->usage refcount. For BPF_PROG_TYPE_SCHED_CLS, tasks
should never be passed as an argument to the any program handlers, so it
should not be relevant.

The second question is whether it's safe to use RCU to access a task
that was acquired with bpf_task_acquire(), and stored in a map. Because
bpf_task_acquire() now uses task->rcu_users, it follows that if the task
is present in the map, that it must have had at least one
task->rcu_users refcount by the time the current RCU cs was started.
Therefore, it's safe to access that task until the end of the current
RCU cs.

With all that said, this patch makes struct task_struct is an
RCU-protected object. In doing so, we also change bpf_task_acquire() to
be KF_ACQUIRE | KF_RCU | KF_RET_NULL, and adjust any selftests as
necessary. A subsequent patch will remove bpf_task_kptr_get(), and
bpf_task_acquire_not_zero() respectively.

Signed-off-by: David Vernet <void@manifault.com>
Link: https://lore.kernel.org/r/20230331195733.699708-2-void@manifault.com
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 kernel/bpf/helpers.c                               | 11 +--
 kernel/bpf/verifier.c                              |  1 +
 .../testing/selftests/bpf/prog_tests/task_kfunc.c  |  1 +
 .../selftests/bpf/progs/task_kfunc_common.h        |  5 ++
 .../selftests/bpf/progs/task_kfunc_failure.c       | 80 +++++++++++++++++++---
 .../selftests/bpf/progs/task_kfunc_success.c       | 26 ++++++-
 6 files changed, 108 insertions(+), 16 deletions(-)

(limited to 'kernel')

diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c
index 8980f6859443..e71a4a54ce99 100644
--- a/kernel/bpf/helpers.c
+++ b/kernel/bpf/helpers.c
@@ -18,6 +18,7 @@
 #include <linux/pid_namespace.h>
 #include <linux/poison.h>
 #include <linux/proc_ns.h>
+#include <linux/sched/task.h>
 #include <linux/security.h>
 #include <linux/btf_ids.h>
 #include <linux/bpf_mem_alloc.h>
@@ -2013,7 +2014,9 @@ __bpf_kfunc struct bpf_rb_node *bpf_rbtree_first(struct bpf_rb_root *root)
  */
 __bpf_kfunc struct task_struct *bpf_task_acquire(struct task_struct *p)
 {
-	return get_task_struct(p);
+	if (refcount_inc_not_zero(&p->rcu_users))
+		return p;
+	return NULL;
 }
 
 /**
@@ -2089,7 +2092,7 @@ __bpf_kfunc struct task_struct *bpf_task_kptr_get(struct task_struct **pp)
  */
 __bpf_kfunc void bpf_task_release(struct task_struct *p)
 {
-	put_task_struct(p);
+	put_task_struct_rcu_user(p);
 }
 
 #ifdef CONFIG_CGROUPS
@@ -2199,7 +2202,7 @@ __bpf_kfunc struct task_struct *bpf_task_from_pid(s32 pid)
 	rcu_read_lock();
 	p = find_task_by_pid_ns(pid, &init_pid_ns);
 	if (p)
-		bpf_task_acquire(p);
+		p = bpf_task_acquire(p);
 	rcu_read_unlock();
 
 	return p;
@@ -2371,7 +2374,7 @@ BTF_ID_FLAGS(func, bpf_list_push_front)
 BTF_ID_FLAGS(func, bpf_list_push_back)
 BTF_ID_FLAGS(func, bpf_list_pop_front, KF_ACQUIRE | KF_RET_NULL)
 BTF_ID_FLAGS(func, bpf_list_pop_back, KF_ACQUIRE | KF_RET_NULL)
-BTF_ID_FLAGS(func, bpf_task_acquire, KF_ACQUIRE | KF_TRUSTED_ARGS)
+BTF_ID_FLAGS(func, bpf_task_acquire, KF_ACQUIRE | KF_RCU | KF_RET_NULL)
 BTF_ID_FLAGS(func, bpf_task_acquire_not_zero, KF_ACQUIRE | KF_RCU | KF_RET_NULL)
 BTF_ID_FLAGS(func, bpf_task_kptr_get, KF_ACQUIRE | KF_KPTR_GET | KF_RET_NULL)
 BTF_ID_FLAGS(func, bpf_task_release, KF_RELEASE)
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index 52738f9dcb15..92ae4e8ab87b 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -4600,6 +4600,7 @@ BTF_SET_START(rcu_protected_types)
 BTF_ID(struct, prog_test_ref_kfunc)
 BTF_ID(struct, cgroup)
 BTF_ID(struct, bpf_cpumask)
+BTF_ID(struct, task_struct)
 BTF_SET_END(rcu_protected_types)
 
 static bool rcu_protected_object(const struct btf *btf, u32 btf_id)
diff --git a/tools/testing/selftests/bpf/prog_tests/task_kfunc.c b/tools/testing/selftests/bpf/prog_tests/task_kfunc.c
index f79fa5bc9a8d..330133ece3f6 100644
--- a/tools/testing/selftests/bpf/prog_tests/task_kfunc.c
+++ b/tools/testing/selftests/bpf/prog_tests/task_kfunc.c
@@ -78,6 +78,7 @@ static const char * const success_tests[] = {
 	"test_task_from_pid_arg",
 	"test_task_from_pid_current",
 	"test_task_from_pid_invalid",
+	"task_kfunc_acquire_trusted_walked",
 };
 
 void test_task_kfunc(void)
diff --git a/tools/testing/selftests/bpf/progs/task_kfunc_common.h b/tools/testing/selftests/bpf/progs/task_kfunc_common.h
index 4c2a4b0e3a25..bf0d1da9aff8 100644
--- a/tools/testing/selftests/bpf/progs/task_kfunc_common.h
+++ b/tools/testing/selftests/bpf/progs/task_kfunc_common.h
@@ -24,6 +24,8 @@ struct task_struct *bpf_task_acquire(struct task_struct *p) __ksym;
 struct task_struct *bpf_task_kptr_get(struct task_struct **pp) __ksym;
 void bpf_task_release(struct task_struct *p) __ksym;
 struct task_struct *bpf_task_from_pid(s32 pid) __ksym;
+void bpf_rcu_read_lock(void) __ksym;
+void bpf_rcu_read_unlock(void) __ksym;
 
 static inline struct __tasks_kfunc_map_value *tasks_kfunc_map_value_lookup(struct task_struct *p)
 {
@@ -60,6 +62,9 @@ static inline int tasks_kfunc_map_insert(struct task_struct *p)
 	}
 
 	acquired = bpf_task_acquire(p);
+	if (!acquired)
+		return -ENOENT;
+
 	old = bpf_kptr_xchg(&v->task, acquired);
 	if (old) {
 		bpf_task_release(old);
diff --git a/tools/testing/selftests/bpf/progs/task_kfunc_failure.c b/tools/testing/selftests/bpf/progs/task_kfunc_failure.c
index 2c374a7ffece..63aef547da87 100644
--- a/tools/testing/selftests/bpf/progs/task_kfunc_failure.c
+++ b/tools/testing/selftests/bpf/progs/task_kfunc_failure.c
@@ -40,6 +40,9 @@ int BPF_PROG(task_kfunc_acquire_untrusted, struct task_struct *task, u64 clone_f
 
 	/* Can't invoke bpf_task_acquire() on an untrusted pointer. */
 	acquired = bpf_task_acquire(v->task);
+	if (!acquired)
+		return 0;
+
 	bpf_task_release(acquired);
 
 	return 0;
@@ -53,38 +56,49 @@ int BPF_PROG(task_kfunc_acquire_fp, struct task_struct *task, u64 clone_flags)
 
 	/* Can't invoke bpf_task_acquire() on a random frame pointer. */
 	acquired = bpf_task_acquire((struct task_struct *)&stack_task);
+	if (!acquired)
+		return 0;
+
 	bpf_task_release(acquired);
 
 	return 0;
 }
 
 SEC("kretprobe/free_task")
-__failure __msg("reg type unsupported for arg#0 function")
+__failure __msg("calling kernel function bpf_task_acquire is not allowed")
 int BPF_PROG(task_kfunc_acquire_unsafe_kretprobe, struct task_struct *task, u64 clone_flags)
 {
 	struct task_struct *acquired;
 
+	/* Can't call bpf_task_acquire() or bpf_task_release() in an untrusted prog. */
 	acquired = bpf_task_acquire(task);
-	/* Can't release a bpf_task_acquire()'d task without a NULL check. */
+	if (!acquired)
+		return 0;
 	bpf_task_release(acquired);
 
 	return 0;
 }
 
-SEC("tp_btf/task_newtask")
-__failure __msg("R1 must be referenced or trusted")
-int BPF_PROG(task_kfunc_acquire_trusted_walked, struct task_struct *task, u64 clone_flags)
+SEC("kretprobe/free_task")
+__failure __msg("calling kernel function bpf_task_acquire is not allowed")
+int BPF_PROG(task_kfunc_acquire_unsafe_kretprobe_rcu, struct task_struct *task, u64 clone_flags)
 {
 	struct task_struct *acquired;
 
-	/* Can't invoke bpf_task_acquire() on a trusted pointer obtained from walking a struct. */
-	acquired = bpf_task_acquire(task->group_leader);
-	bpf_task_release(acquired);
+	bpf_rcu_read_lock();
+	if (!task) {
+		bpf_rcu_read_unlock();
+		return 0;
+	}
+	/* Can't call bpf_task_acquire() or bpf_task_release() in an untrusted prog. */
+	acquired = bpf_task_acquire(task);
+	if (acquired)
+		bpf_task_release(acquired);
+	bpf_rcu_read_unlock();
 
 	return 0;
 }
 
-
 SEC("tp_btf/task_newtask")
 __failure __msg("Possibly NULL pointer passed to trusted arg0")
 int BPF_PROG(task_kfunc_acquire_null, struct task_struct *task, u64 clone_flags)
@@ -137,6 +151,8 @@ int BPF_PROG(task_kfunc_get_non_kptr_acquired, struct task_struct *task, u64 clo
 	struct task_struct *kptr, *acquired;
 
 	acquired = bpf_task_acquire(task);
+	if (!acquired)
+		return 0;
 
 	/* Cannot use bpf_task_kptr_get() on a non-kptr, even if it was acquired. */
 	kptr = bpf_task_kptr_get(&acquired);
@@ -185,6 +201,19 @@ int BPF_PROG(task_kfunc_xchg_unreleased, struct task_struct *task, u64 clone_fla
 	return 0;
 }
 
+SEC("tp_btf/task_newtask")
+__failure __msg("Possibly NULL pointer passed to trusted arg0")
+int BPF_PROG(task_kfunc_acquire_release_no_null_check, struct task_struct *task, u64 clone_flags)
+{
+	struct task_struct *acquired;
+
+	acquired = bpf_task_acquire(task);
+	/* Can't invoke bpf_task_release() on an acquired task without a NULL check. */
+	bpf_task_release(acquired);
+
+	return 0;
+}
+
 SEC("tp_btf/task_newtask")
 __failure __msg("Unreleased reference")
 int BPF_PROG(task_kfunc_get_unreleased, struct task_struct *task, u64 clone_flags)
@@ -256,12 +285,13 @@ int BPF_PROG(task_kfunc_release_null, struct task_struct *task, u64 clone_flags)
 		return -ENOENT;
 
 	acquired = bpf_task_acquire(task);
+	if (!acquired)
+		return -EEXIST;
 
 	old = bpf_kptr_xchg(&v->task, acquired);
 
 	/* old cannot be passed to bpf_task_release() without a NULL check. */
 	bpf_task_release(old);
-	bpf_task_release(old);
 
 	return 0;
 }
@@ -298,6 +328,9 @@ int BPF_PROG(task_kfunc_from_lsm_task_free, struct task_struct *task)
 
 	/* the argument of lsm task_free hook is untrusted. */
 	acquired = bpf_task_acquire(task);
+	if (!acquired)
+		return 0;
+
 	bpf_task_release(acquired);
 	return 0;
 }
@@ -337,3 +370,30 @@ int BPF_PROG(task_access_comm4, struct task_struct *task, const char *buf, bool
 	bpf_strncmp(task->comm, 16, "foo");
 	return 0;
 }
+
+SEC("tp_btf/task_newtask")
+__failure __msg("R1 must be referenced or trusted")
+int BPF_PROG(task_kfunc_release_in_map, struct task_struct *task, u64 clone_flags)
+{
+	struct task_struct *local;
+	struct __tasks_kfunc_map_value *v;
+
+	if (tasks_kfunc_map_insert(task))
+		return 0;
+
+	v = tasks_kfunc_map_value_lookup(task);
+	if (!v)
+		return 0;
+
+	bpf_rcu_read_lock();
+	local = v->task;
+	if (!local) {
+		bpf_rcu_read_unlock();
+		return 0;
+	}
+	/* Can't release a kptr that's still stored in a map. */
+	bpf_task_release(local);
+	bpf_rcu_read_unlock();
+
+	return 0;
+}
diff --git a/tools/testing/selftests/bpf/progs/task_kfunc_success.c b/tools/testing/selftests/bpf/progs/task_kfunc_success.c
index cfa7f12b84e8..a75304a5e860 100644
--- a/tools/testing/selftests/bpf/progs/task_kfunc_success.c
+++ b/tools/testing/selftests/bpf/progs/task_kfunc_success.c
@@ -47,7 +47,10 @@ static int test_acquire_release(struct task_struct *task)
 	}
 
 	acquired = bpf_task_acquire(task);
-	bpf_task_release(acquired);
+	if (acquired)
+		bpf_task_release(acquired);
+	else
+		err = 6;
 
 	return 0;
 }
@@ -166,7 +169,10 @@ int BPF_PROG(test_task_current_acquire_release, struct task_struct *task, u64 cl
 
 	current = bpf_get_current_task_btf();
 	acquired = bpf_task_acquire(current);
-	bpf_task_release(acquired);
+	if (acquired)
+		bpf_task_release(acquired);
+	else
+		err = 1;
 
 	return 0;
 }
@@ -241,3 +247,19 @@ int BPF_PROG(test_task_from_pid_invalid, struct task_struct *task, u64 clone_fla
 
 	return 0;
 }
+
+SEC("tp_btf/task_newtask")
+int BPF_PROG(task_kfunc_acquire_trusted_walked, struct task_struct *task, u64 clone_flags)
+{
+	struct task_struct *acquired;
+
+	/* task->group_leader is listed as a trusted, non-NULL field of task struct. */
+	acquired = bpf_task_acquire(task->group_leader);
+	if (acquired)
+		bpf_task_release(acquired);
+	else
+		err = 1;
+
+
+	return 0;
+}
-- 
cgit v1.2.3


From f85671c6ef46d490a90dac719e0c0e0adbacfd9b Mon Sep 17 00:00:00 2001
From: David Vernet <void@manifault.com>
Date: Fri, 31 Mar 2023 14:57:32 -0500
Subject: bpf: Remove now-defunct task kfuncs

In commit 22df776a9a86 ("tasks: Extract rcu_users out of union"), the
'refcount_t rcu_users' field was extracted out of a union with the
'struct rcu_head rcu' field. This allows us to safely perform a
refcount_inc_not_zero() on task->rcu_users when acquiring a reference on
a task struct. A prior patch leveraged this by making struct task_struct
an RCU-protected object in the verifier, and by bpf_task_acquire() to
use the task->rcu_users field for synchronization.

Now that we can use RCU to protect tasks, we no longer need
bpf_task_kptr_get(), or bpf_task_acquire_not_zero(). bpf_task_kptr_get()
is truly completely unnecessary, as we can just use RCU to get the
object. bpf_task_acquire_not_zero() is now equivalent to
bpf_task_acquire().

In addition to these changes, this patch also updates the associated
selftests to no longer use these kfuncs.

Signed-off-by: David Vernet <void@manifault.com>
Link: https://lore.kernel.org/r/20230331195733.699708-3-void@manifault.com
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 kernel/bpf/helpers.c                               | 69 --------------------
 .../testing/selftests/bpf/prog_tests/task_kfunc.c  |  2 +-
 tools/testing/selftests/bpf/progs/rcu_read_lock.c  |  9 +--
 .../selftests/bpf/progs/task_kfunc_common.h        |  1 -
 .../selftests/bpf/progs/task_kfunc_failure.c       | 73 ----------------------
 .../selftests/bpf/progs/task_kfunc_success.c       | 22 +++----
 6 files changed, 14 insertions(+), 162 deletions(-)

(limited to 'kernel')

diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c
index e71a4a54ce99..6be16db9f188 100644
--- a/kernel/bpf/helpers.c
+++ b/kernel/bpf/helpers.c
@@ -2019,73 +2019,6 @@ __bpf_kfunc struct task_struct *bpf_task_acquire(struct task_struct *p)
 	return NULL;
 }
 
-/**
- * bpf_task_acquire_not_zero - Acquire a reference to a rcu task object. A task
- * acquired by this kfunc which is not stored in a map as a kptr, must be
- * released by calling bpf_task_release().
- * @p: The task on which a reference is being acquired.
- */
-__bpf_kfunc struct task_struct *bpf_task_acquire_not_zero(struct task_struct *p)
-{
-	/* For the time being this function returns NULL, as it's not currently
-	 * possible to safely acquire a reference to a task with RCU protection
-	 * using get_task_struct() and put_task_struct(). This is due to the
-	 * slightly odd mechanics of p->rcu_users, and how task RCU protection
-	 * works.
-	 *
-	 * A struct task_struct is refcounted by two different refcount_t
-	 * fields:
-	 *
-	 * 1. p->usage:     The "true" refcount field which tracks a task's
-	 *		    lifetime. The task is freed as soon as this
-	 *		    refcount drops to 0.
-	 *
-	 * 2. p->rcu_users: An "RCU users" refcount field which is statically
-	 *		    initialized to 2, and is co-located in a union with
-	 *		    a struct rcu_head field (p->rcu). p->rcu_users
-	 *		    essentially encapsulates a single p->usage
-	 *		    refcount, and when p->rcu_users goes to 0, an RCU
-	 *		    callback is scheduled on the struct rcu_head which
-	 *		    decrements the p->usage refcount.
-	 *
-	 * There are two important implications to this task refcounting logic
-	 * described above. The first is that
-	 * refcount_inc_not_zero(&p->rcu_users) cannot be used anywhere, as
-	 * after the refcount goes to 0, the RCU callback being scheduled will
-	 * cause the memory backing the refcount to again be nonzero due to the
-	 * fields sharing a union. The other is that we can't rely on RCU to
-	 * guarantee that a task is valid in a BPF program. This is because a
-	 * task could have already transitioned to being in the TASK_DEAD
-	 * state, had its rcu_users refcount go to 0, and its rcu callback
-	 * invoked in which it drops its single p->usage reference. At this
-	 * point the task will be freed as soon as the last p->usage reference
-	 * goes to 0, without waiting for another RCU gp to elapse. The only
-	 * way that a BPF program can guarantee that a task is valid is in this
-	 * scenario is to hold a p->usage refcount itself.
-	 *
-	 * Until we're able to resolve this issue, either by pulling
-	 * p->rcu_users and p->rcu out of the union, or by getting rid of
-	 * p->usage and just using p->rcu_users for refcounting, we'll just
-	 * return NULL here.
-	 */
-	return NULL;
-}
-
-/**
- * bpf_task_kptr_get - Acquire a reference on a struct task_struct kptr. A task
- * kptr acquired by this kfunc which is not subsequently stored in a map, must
- * be released by calling bpf_task_release().
- * @pp: A pointer to a task kptr on which a reference is being acquired.
- */
-__bpf_kfunc struct task_struct *bpf_task_kptr_get(struct task_struct **pp)
-{
-	/* We must return NULL here until we have clarity on how to properly
-	 * leverage RCU for ensuring a task's lifetime. See the comment above
-	 * in bpf_task_acquire_not_zero() for more details.
-	 */
-	return NULL;
-}
-
 /**
  * bpf_task_release - Release the reference acquired on a task.
  * @p: The task on which a reference is being released.
@@ -2375,8 +2308,6 @@ BTF_ID_FLAGS(func, bpf_list_push_back)
 BTF_ID_FLAGS(func, bpf_list_pop_front, KF_ACQUIRE | KF_RET_NULL)
 BTF_ID_FLAGS(func, bpf_list_pop_back, KF_ACQUIRE | KF_RET_NULL)
 BTF_ID_FLAGS(func, bpf_task_acquire, KF_ACQUIRE | KF_RCU | KF_RET_NULL)
-BTF_ID_FLAGS(func, bpf_task_acquire_not_zero, KF_ACQUIRE | KF_RCU | KF_RET_NULL)
-BTF_ID_FLAGS(func, bpf_task_kptr_get, KF_ACQUIRE | KF_KPTR_GET | KF_RET_NULL)
 BTF_ID_FLAGS(func, bpf_task_release, KF_RELEASE)
 BTF_ID_FLAGS(func, bpf_rbtree_remove, KF_ACQUIRE)
 BTF_ID_FLAGS(func, bpf_rbtree_add)
diff --git a/tools/testing/selftests/bpf/prog_tests/task_kfunc.c b/tools/testing/selftests/bpf/prog_tests/task_kfunc.c
index 330133ece3f6..740d5f644b40 100644
--- a/tools/testing/selftests/bpf/prog_tests/task_kfunc.c
+++ b/tools/testing/selftests/bpf/prog_tests/task_kfunc.c
@@ -73,7 +73,7 @@ static const char * const success_tests[] = {
 	"test_task_acquire_release_current",
 	"test_task_acquire_leave_in_map",
 	"test_task_xchg_release",
-	"test_task_get_release",
+	"test_task_map_acquire_release",
 	"test_task_current_acquire_release",
 	"test_task_from_pid_arg",
 	"test_task_from_pid_current",
diff --git a/tools/testing/selftests/bpf/progs/rcu_read_lock.c b/tools/testing/selftests/bpf/progs/rcu_read_lock.c
index 6a8c88e58df2..14fb01437fb8 100644
--- a/tools/testing/selftests/bpf/progs/rcu_read_lock.c
+++ b/tools/testing/selftests/bpf/progs/rcu_read_lock.c
@@ -23,7 +23,7 @@ struct bpf_key *bpf_lookup_user_key(__u32 serial, __u64 flags) __ksym;
 void bpf_key_put(struct bpf_key *key) __ksym;
 void bpf_rcu_read_lock(void) __ksym;
 void bpf_rcu_read_unlock(void) __ksym;
-struct task_struct *bpf_task_acquire_not_zero(struct task_struct *p) __ksym;
+struct task_struct *bpf_task_acquire(struct task_struct *p) __ksym;
 void bpf_task_release(struct task_struct *p) __ksym;
 
 SEC("?fentry.s/" SYS_PREFIX "sys_getpgid")
@@ -159,13 +159,8 @@ int task_acquire(void *ctx)
 		goto out;
 
 	/* acquire a reference which can be used outside rcu read lock region */
-	gparent = bpf_task_acquire_not_zero(gparent);
+	gparent = bpf_task_acquire(gparent);
 	if (!gparent)
-		/* Until we resolve the issues with using task->rcu_users, we
-		 * expect bpf_task_acquire_not_zero() to return a NULL task.
-		 * See the comment at the definition of
-		 * bpf_task_acquire_not_zero() for more details.
-		 */
 		goto out;
 
 	(void)bpf_task_storage_get(&map_a, gparent, 0, 0);
diff --git a/tools/testing/selftests/bpf/progs/task_kfunc_common.h b/tools/testing/selftests/bpf/progs/task_kfunc_common.h
index bf0d1da9aff8..41f2d44f49cb 100644
--- a/tools/testing/selftests/bpf/progs/task_kfunc_common.h
+++ b/tools/testing/selftests/bpf/progs/task_kfunc_common.h
@@ -21,7 +21,6 @@ struct hash_map {
 } __tasks_kfunc_map SEC(".maps");
 
 struct task_struct *bpf_task_acquire(struct task_struct *p) __ksym;
-struct task_struct *bpf_task_kptr_get(struct task_struct **pp) __ksym;
 void bpf_task_release(struct task_struct *p) __ksym;
 struct task_struct *bpf_task_from_pid(s32 pid) __ksym;
 void bpf_rcu_read_lock(void) __ksym;
diff --git a/tools/testing/selftests/bpf/progs/task_kfunc_failure.c b/tools/testing/selftests/bpf/progs/task_kfunc_failure.c
index 63aef547da87..dcdea3127086 100644
--- a/tools/testing/selftests/bpf/progs/task_kfunc_failure.c
+++ b/tools/testing/selftests/bpf/progs/task_kfunc_failure.c
@@ -128,59 +128,6 @@ int BPF_PROG(task_kfunc_acquire_unreleased, struct task_struct *task, u64 clone_
 	return 0;
 }
 
-SEC("tp_btf/task_newtask")
-__failure __msg("arg#0 expected pointer to map value")
-int BPF_PROG(task_kfunc_get_non_kptr_param, struct task_struct *task, u64 clone_flags)
-{
-	struct task_struct *kptr;
-
-	/* Cannot use bpf_task_kptr_get() on a non-kptr, even on a valid task. */
-	kptr = bpf_task_kptr_get(&task);
-	if (!kptr)
-		return 0;
-
-	bpf_task_release(kptr);
-
-	return 0;
-}
-
-SEC("tp_btf/task_newtask")
-__failure __msg("arg#0 expected pointer to map value")
-int BPF_PROG(task_kfunc_get_non_kptr_acquired, struct task_struct *task, u64 clone_flags)
-{
-	struct task_struct *kptr, *acquired;
-
-	acquired = bpf_task_acquire(task);
-	if (!acquired)
-		return 0;
-
-	/* Cannot use bpf_task_kptr_get() on a non-kptr, even if it was acquired. */
-	kptr = bpf_task_kptr_get(&acquired);
-	bpf_task_release(acquired);
-	if (!kptr)
-		return 0;
-
-	bpf_task_release(kptr);
-
-	return 0;
-}
-
-SEC("tp_btf/task_newtask")
-__failure __msg("arg#0 expected pointer to map value")
-int BPF_PROG(task_kfunc_get_null, struct task_struct *task, u64 clone_flags)
-{
-	struct task_struct *kptr;
-
-	/* Cannot use bpf_task_kptr_get() on a NULL pointer. */
-	kptr = bpf_task_kptr_get(NULL);
-	if (!kptr)
-		return 0;
-
-	bpf_task_release(kptr);
-
-	return 0;
-}
-
 SEC("tp_btf/task_newtask")
 __failure __msg("Unreleased reference")
 int BPF_PROG(task_kfunc_xchg_unreleased, struct task_struct *task, u64 clone_flags)
@@ -214,26 +161,6 @@ int BPF_PROG(task_kfunc_acquire_release_no_null_check, struct task_struct *task,
 	return 0;
 }
 
-SEC("tp_btf/task_newtask")
-__failure __msg("Unreleased reference")
-int BPF_PROG(task_kfunc_get_unreleased, struct task_struct *task, u64 clone_flags)
-{
-	struct task_struct *kptr;
-	struct __tasks_kfunc_map_value *v;
-
-	v = insert_lookup_task(task);
-	if (!v)
-		return 0;
-
-	kptr = bpf_task_kptr_get(&v->task);
-	if (!kptr)
-		return 0;
-
-	/* Kptr acquired above is never released. */
-
-	return 0;
-}
-
 SEC("tp_btf/task_newtask")
 __failure __msg("Possibly NULL pointer passed to trusted arg0")
 int BPF_PROG(task_kfunc_release_untrusted, struct task_struct *task, u64 clone_flags)
diff --git a/tools/testing/selftests/bpf/progs/task_kfunc_success.c b/tools/testing/selftests/bpf/progs/task_kfunc_success.c
index a75304a5e860..b09371bba204 100644
--- a/tools/testing/selftests/bpf/progs/task_kfunc_success.c
+++ b/tools/testing/selftests/bpf/progs/task_kfunc_success.c
@@ -122,7 +122,7 @@ int BPF_PROG(test_task_xchg_release, struct task_struct *task, u64 clone_flags)
 }
 
 SEC("tp_btf/task_newtask")
-int BPF_PROG(test_task_get_release, struct task_struct *task, u64 clone_flags)
+int BPF_PROG(test_task_map_acquire_release, struct task_struct *task, u64 clone_flags)
 {
 	struct task_struct *kptr;
 	struct __tasks_kfunc_map_value *v;
@@ -143,18 +143,18 @@ int BPF_PROG(test_task_get_release, struct task_struct *task, u64 clone_flags)
 		return 0;
 	}
 
-	kptr = bpf_task_kptr_get(&v->task);
-	if (kptr) {
-		/* Until we resolve the issues with using task->rcu_users, we
-		 * expect bpf_task_kptr_get() to return a NULL task. See the
-		 * comment at the definition of bpf_task_acquire_not_zero() for
-		 * more details.
-		 */
-		bpf_task_release(kptr);
+	bpf_rcu_read_lock();
+	kptr = v->task;
+	if (!kptr) {
 		err = 3;
-		return 0;
+	} else {
+		kptr = bpf_task_acquire(kptr);
+		if (!kptr)
+			err = 4;
+		else
+			bpf_task_release(kptr);
 	}
-
+	bpf_rcu_read_unlock();
 
 	return 0;
 }
-- 
cgit v1.2.3


From 5b85575ad4280171c382e888b006cb8d12c35bde Mon Sep 17 00:00:00 2001
From: Anton Protopopov <aspsk@isovalent.com>
Date: Sat, 1 Apr 2023 20:06:02 +0000
Subject: bpf: optimize hashmap lookups when key_size is divisible by 4

The BPF hashmap uses the jhash() hash function. There is an optimized version
of this hash function which may be used if hash size is a multiple of 4. Apply
this optimization to the hashmap in a similar way as it is done in the bloom
filter map.

On practice the optimization is only noticeable for smaller key sizes, which,
however, is sufficient for many applications. An example is listed in the
following table of measurements (a hashmap of 65536 elements was used):

    --------------------------------------------------------------------
    | key_size | fullness | lookups /sec | lookups (opt) /sec |   gain |
    --------------------------------------------------------------------
    |        4 |      25% |      42.990M |            46.000M |   7.0% |
    |        4 |      50% |      37.910M |            39.094M |   3.1% |
    |        4 |      75% |      34.486M |            36.124M |   4.7% |
    |        4 |     100% |      31.760M |            32.719M |   3.0% |
    --------------------------------------------------------------------
    |        8 |      25% |      43.855M |            49.626M |  13.2% |
    |        8 |      50% |      38.328M |            42.152M |  10.0% |
    |        8 |      75% |      34.483M |            38.088M |  10.5% |
    |        8 |     100% |      31.306M |            34.686M |  10.8% |
    --------------------------------------------------------------------
    |       12 |      25% |      38.398M |            43.770M |  14.0% |
    |       12 |      50% |      33.336M |            37.712M |  13.1% |
    |       12 |      75% |      29.917M |            34.440M |  15.1% |
    |       12 |     100% |      27.322M |            30.480M |  11.6% |
    --------------------------------------------------------------------
    |       16 |      25% |      41.491M |            41.921M |   1.0% |
    |       16 |      50% |      36.206M |            36.474M |   0.7% |
    |       16 |      75% |      32.529M |            33.027M |   1.5% |
    |       16 |     100% |      29.581M |            30.325M |   2.5% |
    --------------------------------------------------------------------
    |       20 |      25% |      34.240M |            36.787M |   7.4% |
    |       20 |      50% |      30.328M |            32.663M |   7.7% |
    |       20 |      75% |      27.536M |            29.354M |   6.6% |
    |       20 |     100% |      24.847M |            26.505M |   6.7% |
    --------------------------------------------------------------------
    |       24 |      25% |      36.329M |            40.608M |  11.8% |
    |       24 |      50% |      31.444M |            35.059M |  11.5% |
    |       24 |      75% |      28.426M |            31.452M |  10.6% |
    |       24 |     100% |      26.278M |            28.741M |   9.4% |
    --------------------------------------------------------------------
    |       28 |      25% |      31.540M |            31.944M |   1.3% |
    |       28 |      50% |      27.739M |            28.063M |   1.2% |
    |       28 |      75% |      24.993M |            25.814M |   3.3% |
    |       28 |     100% |      23.513M |            23.500M |  -0.1% |
    --------------------------------------------------------------------
    |       32 |      25% |      32.116M |            33.953M |   5.7% |
    |       32 |      50% |      28.879M |            29.859M |   3.4% |
    |       32 |      75% |      26.227M |            26.948M |   2.7% |
    |       32 |     100% |      23.829M |            24.613M |   3.3% |
    --------------------------------------------------------------------
    |       64 |      25% |      22.535M |            22.554M |   0.1% |
    |       64 |      50% |      20.471M |            20.675M |   1.0% |
    |       64 |      75% |      19.077M |            19.146M |   0.4% |
    |       64 |     100% |      17.710M |            18.131M |   2.4% |
    --------------------------------------------------------------------

The following script was used to gather the results (SMT & frequency off):

    cd tools/testing/selftests/bpf
    for key_size in 4 8 12 16 20 24 28 32 64; do
            for nr_entries in `seq 16384 16384 65536`; do
                    fullness=$(printf '%3s' $((nr_entries*100/65536)))
                    echo -n "key_size=$key_size: $fullness% full: "
                    sudo ./bench -d2 -a bpf-hashmap-lookup --key_size=$key_size --nr_entries=$nr_entries --max_entries=65536 --nr_loops=2000000 --map_flags=0x40 | grep cpu
            done
            echo
    done

Signed-off-by: Anton Protopopov <aspsk@isovalent.com>
Link: https://lore.kernel.org/r/20230401200602.3275-1-aspsk@isovalent.com
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 kernel/bpf/hashtab.c | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'kernel')

diff --git a/kernel/bpf/hashtab.c b/kernel/bpf/hashtab.c
index 96b645bba3a4..00c253b84bf5 100644
--- a/kernel/bpf/hashtab.c
+++ b/kernel/bpf/hashtab.c
@@ -607,6 +607,8 @@ free_htab:
 
 static inline u32 htab_map_hash(const void *key, u32 key_len, u32 hashrnd)
 {
+	if (likely(key_len % 4 == 0))
+		return jhash2(key, key_len / 4, hashrnd);
 	return jhash(key, key_len, hashrnd);
 }
 
-- 
cgit v1.2.3


From 92b2e810f0d3a2c05d8cf12a800592b238d458df Mon Sep 17 00:00:00 2001
From: Anton Protopopov <aspsk@isovalent.com>
Date: Sun, 2 Apr 2023 11:43:40 +0000
Subject: bpf: compute hashes in bloom filter similar to hashmap

If the value size in a bloom filter is a multiple of 4, then the jhash2()
function is used to compute hashes. The length parameter of this function
equals to the number of 32-bit words in input. Compute it in the hot path
instead of pre-computing it, as this is translated to one extra shift to
divide the length by four vs. one extra memory load of a pre-computed length.

Signed-off-by: Anton Protopopov <aspsk@isovalent.com>
Link: https://lore.kernel.org/r/20230402114340.3441-1-aspsk@isovalent.com
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 kernel/bpf/bloom_filter.c | 17 ++---------------
 1 file changed, 2 insertions(+), 15 deletions(-)

(limited to 'kernel')

diff --git a/kernel/bpf/bloom_filter.c b/kernel/bpf/bloom_filter.c
index db19784601a7..540331b610a9 100644
--- a/kernel/bpf/bloom_filter.c
+++ b/kernel/bpf/bloom_filter.c
@@ -16,13 +16,6 @@ struct bpf_bloom_filter {
 	struct bpf_map map;
 	u32 bitset_mask;
 	u32 hash_seed;
-	/* If the size of the values in the bloom filter is u32 aligned,
-	 * then it is more performant to use jhash2 as the underlying hash
-	 * function, else we use jhash. This tracks the number of u32s
-	 * in an u32-aligned value size. If the value size is not u32 aligned,
-	 * this will be 0.
-	 */
-	u32 aligned_u32_count;
 	u32 nr_hash_funcs;
 	unsigned long bitset[];
 };
@@ -32,9 +25,8 @@ static u32 hash(struct bpf_bloom_filter *bloom, void *value,
 {
 	u32 h;
 
-	if (bloom->aligned_u32_count)
-		h = jhash2(value, bloom->aligned_u32_count,
-			   bloom->hash_seed + index);
+	if (likely(value_size % 4 == 0))
+		h = jhash2(value, value_size / 4, bloom->hash_seed + index);
 	else
 		h = jhash(value, value_size, bloom->hash_seed + index);
 
@@ -152,11 +144,6 @@ static struct bpf_map *bloom_map_alloc(union bpf_attr *attr)
 	bloom->nr_hash_funcs = nr_hash_funcs;
 	bloom->bitset_mask = bitset_mask;
 
-	/* Check whether the value size is u32-aligned */
-	if ((attr->value_size & (sizeof(u32) - 1)) == 0)
-		bloom->aligned_u32_count =
-			attr->value_size / sizeof(u32);
-
 	if (!(attr->map_flags & BPF_F_ZERO_SEED))
 		bloom->hash_seed = get_random_u32();
 
-- 
cgit v1.2.3


From f6a6a5a976288e4d0d94eb1c6c9e983e8e5cdb31 Mon Sep 17 00:00:00 2001
From: Dave Marchevsky <davemarchevsky@fb.com>
Date: Mon, 3 Apr 2023 13:00:27 -0700
Subject: bpf: Fix struct_meta lookup for bpf_obj_free_fields kfunc call

bpf_obj_drop_impl has a void return type. In check_kfunc_call, the "else
if" which sets insn_aux->kptr_struct_meta for bpf_obj_drop_impl is
surrounded by a larger if statement which checks btf_type_is_ptr. As a
result:

  * The bpf_obj_drop_impl-specific code will never execute
  * The btf_struct_meta input to bpf_obj_drop is always NULL
  * __bpf_obj_drop_impl will always see a NULL btf_record when called
    from BPF program, and won't call bpf_obj_free_fields
  * program-allocated kptrs which have fields that should be cleaned up
    by bpf_obj_free_fields may instead leak resources

This patch adds a btf_type_is_void branch to the larger if and moves
special handling for bpf_obj_drop_impl there, fixing the issue.

Fixes: ac9f06050a35 ("bpf: Introduce bpf_obj_drop")
Cc: Kumar Kartikeya Dwivedi <memxor@gmail.com>
Signed-off-by: Dave Marchevsky <davemarchevsky@fb.com>
Link: https://lore.kernel.org/r/20230403200027.2271029-1-davemarchevsky@fb.com
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 kernel/bpf/verifier.c | 14 +++++++++-----
 1 file changed, 9 insertions(+), 5 deletions(-)

(limited to 'kernel')

diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index 92ae4e8ab87b..eaf9c5291cf0 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -10743,10 +10743,6 @@ static int check_kfunc_call(struct bpf_verifier_env *env, struct bpf_insn *insn,
 				insn_aux->obj_new_size = ret_t->size;
 				insn_aux->kptr_struct_meta =
 					btf_find_struct_meta(ret_btf, ret_btf_id);
-			} else if (meta.func_id == special_kfunc_list[KF_bpf_obj_drop_impl]) {
-				insn_aux->kptr_struct_meta =
-					btf_find_struct_meta(meta.arg_obj_drop.btf,
-							     meta.arg_obj_drop.btf_id);
 			} else if (meta.func_id == special_kfunc_list[KF_bpf_list_pop_front] ||
 				   meta.func_id == special_kfunc_list[KF_bpf_list_pop_back]) {
 				struct btf_field *field = meta.arg_list_head.field;
@@ -10875,7 +10871,15 @@ static int check_kfunc_call(struct bpf_verifier_env *env, struct bpf_insn *insn,
 
 		if (reg_may_point_to_spin_lock(&regs[BPF_REG_0]) && !regs[BPF_REG_0].id)
 			regs[BPF_REG_0].id = ++env->id_gen;
-	} /* else { add_kfunc_call() ensures it is btf_type_is_void(t) } */
+	} else if (btf_type_is_void(t)) {
+		if (meta.btf == btf_vmlinux && btf_id_set_contains(&special_kfunc_set, meta.func_id)) {
+			if (meta.func_id == special_kfunc_list[KF_bpf_obj_drop_impl]) {
+				insn_aux->kptr_struct_meta =
+					btf_find_struct_meta(meta.arg_obj_drop.btf,
+							     meta.arg_obj_drop.btf_id);
+			}
+		}
+	}
 
 	nargs = btf_type_vlen(meta.func_proto);
 	args = (const struct btf_param *)(meta.func_proto + 1);
-- 
cgit v1.2.3


From 7d64c513284408fee5178a0953a686e9410f2399 Mon Sep 17 00:00:00 2001
From: Alexei Starovoitov <ast@kernel.org>
Date: Mon, 3 Apr 2023 21:50:22 -0700
Subject: bpf: Invoke btf_struct_access() callback only for writes.

Remove duplicated if (atype == BPF_READ) btf_struct_access() from
btf_struct_access() callback and invoke it only for writes. This is
possible to do because currently btf_struct_access() custom callback
always delegates to generic btf_struct_access() helper for BPF_READ
accesses.

Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
Acked-by: David Vernet <void@manifault.com>
Link: https://lore.kernel.org/bpf/20230404045029.82870-2-alexei.starovoitov@gmail.com
---
 kernel/bpf/verifier.c          | 2 +-
 net/bpf/bpf_dummy_struct_ops.c | 2 +-
 net/core/filter.c              | 6 ------
 net/ipv4/bpf_tcp_ca.c          | 3 ---
 4 files changed, 2 insertions(+), 11 deletions(-)

(limited to 'kernel')

diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index eaf9c5291cf0..83984568ccb4 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -5504,7 +5504,7 @@ static int check_ptr_to_btf_access(struct bpf_verifier_env *env,
 		return -EACCES;
 	}
 
-	if (env->ops->btf_struct_access && !type_is_alloc(reg->type)) {
+	if (env->ops->btf_struct_access && !type_is_alloc(reg->type) && atype == BPF_WRITE) {
 		if (!btf_is_kernel(reg->btf)) {
 			verbose(env, "verifier internal error: reg->btf must be kernel btf\n");
 			return -EFAULT;
diff --git a/net/bpf/bpf_dummy_struct_ops.c b/net/bpf/bpf_dummy_struct_ops.c
index ff4f89a2b02a..9535c8506cda 100644
--- a/net/bpf/bpf_dummy_struct_ops.c
+++ b/net/bpf/bpf_dummy_struct_ops.c
@@ -198,7 +198,7 @@ static int bpf_dummy_ops_btf_struct_access(struct bpf_verifier_log *log,
 	if (err < 0)
 		return err;
 
-	return atype == BPF_READ ? err : NOT_INIT;
+	return NOT_INIT;
 }
 
 static const struct bpf_verifier_ops bpf_dummy_verifier_ops = {
diff --git a/net/core/filter.c b/net/core/filter.c
index 3370efad1dda..8b9f409a2ec3 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -8753,9 +8753,6 @@ static int tc_cls_act_btf_struct_access(struct bpf_verifier_log *log,
 {
 	int ret = -EACCES;
 
-	if (atype == BPF_READ)
-		return btf_struct_access(log, reg, off, size, atype, next_btf_id, flag);
-
 	mutex_lock(&nf_conn_btf_access_lock);
 	if (nfct_btf_struct_access)
 		ret = nfct_btf_struct_access(log, reg, off, size, atype, next_btf_id, flag);
@@ -8830,9 +8827,6 @@ static int xdp_btf_struct_access(struct bpf_verifier_log *log,
 {
 	int ret = -EACCES;
 
-	if (atype == BPF_READ)
-		return btf_struct_access(log, reg, off, size, atype, next_btf_id, flag);
-
 	mutex_lock(&nf_conn_btf_access_lock);
 	if (nfct_btf_struct_access)
 		ret = nfct_btf_struct_access(log, reg, off, size, atype, next_btf_id, flag);
diff --git a/net/ipv4/bpf_tcp_ca.c b/net/ipv4/bpf_tcp_ca.c
index ea21c96c03aa..d6465876bbf6 100644
--- a/net/ipv4/bpf_tcp_ca.c
+++ b/net/ipv4/bpf_tcp_ca.c
@@ -78,9 +78,6 @@ static int bpf_tcp_ca_btf_struct_access(struct bpf_verifier_log *log,
 	const struct btf_type *t;
 	size_t end;
 
-	if (atype == BPF_READ)
-		return btf_struct_access(log, reg, off, size, atype, next_btf_id, flag);
-
 	t = btf_type_by_id(reg->btf, reg->btf_id);
 	if (t != tcp_sock_type) {
 		bpf_log(log, "only read is supported\n");
-- 
cgit v1.2.3


From b7e852a9ec96635168c04204fb7cf1f7390b9a8c Mon Sep 17 00:00:00 2001
From: Alexei Starovoitov <ast@kernel.org>
Date: Mon, 3 Apr 2023 21:50:23 -0700
Subject: bpf: Remove unused arguments from btf_struct_access().

Remove unused arguments from btf_struct_access() callback.

Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
Acked-by: David Vernet <void@manifault.com>
Link: https://lore.kernel.org/bpf/20230404045029.82870-3-alexei.starovoitov@gmail.com
---
 include/linux/bpf.h              |  3 +--
 include/linux/filter.h           |  3 +--
 kernel/bpf/verifier.c            |  4 ++--
 net/bpf/bpf_dummy_struct_ops.c   | 12 +++++-------
 net/core/filter.c                | 13 +++++--------
 net/ipv4/bpf_tcp_ca.c            |  3 +--
 net/netfilter/nf_conntrack_bpf.c |  3 +--
 7 files changed, 16 insertions(+), 25 deletions(-)

(limited to 'kernel')

diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index 2d8f3f639e68..4f689dda748f 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -893,8 +893,7 @@ struct bpf_verifier_ops {
 				  struct bpf_prog *prog, u32 *target_size);
 	int (*btf_struct_access)(struct bpf_verifier_log *log,
 				 const struct bpf_reg_state *reg,
-				 int off, int size, enum bpf_access_type atype,
-				 u32 *next_btf_id, enum bpf_type_flag *flag);
+				 int off, int size);
 };
 
 struct bpf_prog_offload_ops {
diff --git a/include/linux/filter.h b/include/linux/filter.h
index 23c08c31bea9..5364b0c52c1d 100644
--- a/include/linux/filter.h
+++ b/include/linux/filter.h
@@ -571,8 +571,7 @@ DECLARE_STATIC_KEY_FALSE(bpf_stats_enabled_key);
 extern struct mutex nf_conn_btf_access_lock;
 extern int (*nfct_btf_struct_access)(struct bpf_verifier_log *log,
 				     const struct bpf_reg_state *reg,
-				     int off, int size, enum bpf_access_type atype,
-				     u32 *next_btf_id, enum bpf_type_flag *flag);
+				     int off, int size);
 
 typedef unsigned int (*bpf_dispatcher_fn)(const void *ctx,
 					  const struct bpf_insn *insnsi,
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index 83984568ccb4..5ca520e5eddf 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -5459,7 +5459,7 @@ static int check_ptr_to_btf_access(struct bpf_verifier_env *env,
 	const struct btf_type *t = btf_type_by_id(reg->btf, reg->btf_id);
 	const char *tname = btf_name_by_offset(reg->btf, t->name_off);
 	enum bpf_type_flag flag = 0;
-	u32 btf_id;
+	u32 btf_id = 0;
 	int ret;
 
 	if (!env->allow_ptr_leaks) {
@@ -5509,7 +5509,7 @@ static int check_ptr_to_btf_access(struct bpf_verifier_env *env,
 			verbose(env, "verifier internal error: reg->btf must be kernel btf\n");
 			return -EFAULT;
 		}
-		ret = env->ops->btf_struct_access(&env->log, reg, off, size, atype, &btf_id, &flag);
+		ret = env->ops->btf_struct_access(&env->log, reg, off, size);
 	} else {
 		/* Writes are permitted with default btf_struct_access for
 		 * program allocated objects (which always have ref_obj_id > 0),
diff --git a/net/bpf/bpf_dummy_struct_ops.c b/net/bpf/bpf_dummy_struct_ops.c
index 9535c8506cda..5918d1b32e19 100644
--- a/net/bpf/bpf_dummy_struct_ops.c
+++ b/net/bpf/bpf_dummy_struct_ops.c
@@ -173,14 +173,11 @@ static int bpf_dummy_ops_check_member(const struct btf_type *t,
 
 static int bpf_dummy_ops_btf_struct_access(struct bpf_verifier_log *log,
 					   const struct bpf_reg_state *reg,
-					   int off, int size, enum bpf_access_type atype,
-					   u32 *next_btf_id,
-					   enum bpf_type_flag *flag)
+					   int off, int size)
 {
 	const struct btf_type *state;
 	const struct btf_type *t;
 	s32 type_id;
-	int err;
 
 	type_id = btf_find_by_name_kind(reg->btf, "bpf_dummy_ops_state",
 					BTF_KIND_STRUCT);
@@ -194,9 +191,10 @@ static int bpf_dummy_ops_btf_struct_access(struct bpf_verifier_log *log,
 		return -EACCES;
 	}
 
-	err = btf_struct_access(log, reg, off, size, atype, next_btf_id, flag);
-	if (err < 0)
-		return err;
+	if (off + size > sizeof(struct bpf_dummy_ops_state)) {
+		bpf_log(log, "write access at off %d with size %d\n", off, size);
+		return -EACCES;
+	}
 
 	return NOT_INIT;
 }
diff --git a/net/core/filter.c b/net/core/filter.c
index 8b9f409a2ec3..1f2abf0f60e6 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -8742,20 +8742,18 @@ EXPORT_SYMBOL_GPL(nf_conn_btf_access_lock);
 
 int (*nfct_btf_struct_access)(struct bpf_verifier_log *log,
 			      const struct bpf_reg_state *reg,
-			      int off, int size, enum bpf_access_type atype,
-			      u32 *next_btf_id, enum bpf_type_flag *flag);
+			      int off, int size);
 EXPORT_SYMBOL_GPL(nfct_btf_struct_access);
 
 static int tc_cls_act_btf_struct_access(struct bpf_verifier_log *log,
 					const struct bpf_reg_state *reg,
-					int off, int size, enum bpf_access_type atype,
-					u32 *next_btf_id, enum bpf_type_flag *flag)
+					int off, int size)
 {
 	int ret = -EACCES;
 
 	mutex_lock(&nf_conn_btf_access_lock);
 	if (nfct_btf_struct_access)
-		ret = nfct_btf_struct_access(log, reg, off, size, atype, next_btf_id, flag);
+		ret = nfct_btf_struct_access(log, reg, off, size);
 	mutex_unlock(&nf_conn_btf_access_lock);
 
 	return ret;
@@ -8822,14 +8820,13 @@ EXPORT_SYMBOL_GPL(bpf_warn_invalid_xdp_action);
 
 static int xdp_btf_struct_access(struct bpf_verifier_log *log,
 				 const struct bpf_reg_state *reg,
-				 int off, int size, enum bpf_access_type atype,
-				 u32 *next_btf_id, enum bpf_type_flag *flag)
+				 int off, int size)
 {
 	int ret = -EACCES;
 
 	mutex_lock(&nf_conn_btf_access_lock);
 	if (nfct_btf_struct_access)
-		ret = nfct_btf_struct_access(log, reg, off, size, atype, next_btf_id, flag);
+		ret = nfct_btf_struct_access(log, reg, off, size);
 	mutex_unlock(&nf_conn_btf_access_lock);
 
 	return ret;
diff --git a/net/ipv4/bpf_tcp_ca.c b/net/ipv4/bpf_tcp_ca.c
index d6465876bbf6..4406d796cc2f 100644
--- a/net/ipv4/bpf_tcp_ca.c
+++ b/net/ipv4/bpf_tcp_ca.c
@@ -72,8 +72,7 @@ static bool bpf_tcp_ca_is_valid_access(int off, int size,
 
 static int bpf_tcp_ca_btf_struct_access(struct bpf_verifier_log *log,
 					const struct bpf_reg_state *reg,
-					int off, int size, enum bpf_access_type atype,
-					u32 *next_btf_id, enum bpf_type_flag *flag)
+					int off, int size)
 {
 	const struct btf_type *t;
 	size_t end;
diff --git a/net/netfilter/nf_conntrack_bpf.c b/net/netfilter/nf_conntrack_bpf.c
index 002e9d24a1e9..3f821b7ba646 100644
--- a/net/netfilter/nf_conntrack_bpf.c
+++ b/net/netfilter/nf_conntrack_bpf.c
@@ -192,8 +192,7 @@ BTF_ID(struct, nf_conn___init)
 /* Check writes into `struct nf_conn` */
 static int _nf_conntrack_btf_struct_access(struct bpf_verifier_log *log,
 					   const struct bpf_reg_state *reg,
-					   int off, int size, enum bpf_access_type atype,
-					   u32 *next_btf_id, enum bpf_type_flag *flag)
+					   int off, int size)
 {
 	const struct btf_type *ncit, *nct, *t;
 	size_t end;
-- 
cgit v1.2.3


From 63260df1396578226ac3134cf7f764690002e70e Mon Sep 17 00:00:00 2001
From: Alexei Starovoitov <ast@kernel.org>
Date: Mon, 3 Apr 2023 21:50:24 -0700
Subject: bpf: Refactor btf_nested_type_is_trusted().

btf_nested_type_is_trusted() tries to find a struct member at corresponding offset.
It works for flat structures and falls apart in more complex structs with nested structs.
The offset->member search is already performed by btf_struct_walk() including nested structs.
Reuse this work and pass {field name, field btf id} into btf_nested_type_is_trusted()
instead of offset to make BTF_TYPE_SAFE*() logic more robust.

Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
Acked-by: David Vernet <void@manifault.com>
Link: https://lore.kernel.org/bpf/20230404045029.82870-4-alexei.starovoitov@gmail.com
---
 include/linux/bpf.h   |  7 ++++---
 kernel/bpf/btf.c      | 44 +++++++++++++++++---------------------------
 kernel/bpf/verifier.c | 23 ++++++++++++-----------
 3 files changed, 33 insertions(+), 41 deletions(-)

(limited to 'kernel')

diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index 4f689dda748f..002a811b6b90 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -2263,7 +2263,7 @@ static inline bool bpf_tracing_btf_ctx_access(int off, int size,
 int btf_struct_access(struct bpf_verifier_log *log,
 		      const struct bpf_reg_state *reg,
 		      int off, int size, enum bpf_access_type atype,
-		      u32 *next_btf_id, enum bpf_type_flag *flag);
+		      u32 *next_btf_id, enum bpf_type_flag *flag, const char **field_name);
 bool btf_struct_ids_match(struct bpf_verifier_log *log,
 			  const struct btf *btf, u32 id, int off,
 			  const struct btf *need_btf, u32 need_type_id,
@@ -2302,7 +2302,7 @@ struct bpf_core_ctx {
 
 bool btf_nested_type_is_trusted(struct bpf_verifier_log *log,
 				const struct bpf_reg_state *reg,
-				int off, const char *suffix);
+				const char *field_name, u32 btf_id, const char *suffix);
 
 bool btf_type_ids_nocast_alias(struct bpf_verifier_log *log,
 			       const struct btf *reg_btf, u32 reg_id,
@@ -2517,7 +2517,8 @@ static inline struct bpf_prog *bpf_prog_by_id(u32 id)
 static inline int btf_struct_access(struct bpf_verifier_log *log,
 				    const struct bpf_reg_state *reg,
 				    int off, int size, enum bpf_access_type atype,
-				    u32 *next_btf_id, enum bpf_type_flag *flag)
+				    u32 *next_btf_id, enum bpf_type_flag *flag,
+				    const char **field_name)
 {
 	return -EACCES;
 }
diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c
index b7e5a5510b91..593c45a294d0 100644
--- a/kernel/bpf/btf.c
+++ b/kernel/bpf/btf.c
@@ -6166,7 +6166,8 @@ enum bpf_struct_walk_result {
 
 static int btf_struct_walk(struct bpf_verifier_log *log, const struct btf *btf,
 			   const struct btf_type *t, int off, int size,
-			   u32 *next_btf_id, enum bpf_type_flag *flag)
+			   u32 *next_btf_id, enum bpf_type_flag *flag,
+			   const char **field_name)
 {
 	u32 i, moff, mtrue_end, msize = 0, total_nelems = 0;
 	const struct btf_type *mtype, *elem_type = NULL;
@@ -6395,6 +6396,8 @@ error:
 			if (btf_type_is_struct(stype)) {
 				*next_btf_id = id;
 				*flag |= tmp_flag;
+				if (field_name)
+					*field_name = mname;
 				return WALK_PTR;
 			}
 		}
@@ -6421,7 +6424,8 @@ error:
 int btf_struct_access(struct bpf_verifier_log *log,
 		      const struct bpf_reg_state *reg,
 		      int off, int size, enum bpf_access_type atype __maybe_unused,
-		      u32 *next_btf_id, enum bpf_type_flag *flag)
+		      u32 *next_btf_id, enum bpf_type_flag *flag,
+		      const char **field_name)
 {
 	const struct btf *btf = reg->btf;
 	enum bpf_type_flag tmp_flag = 0;
@@ -6453,7 +6457,7 @@ int btf_struct_access(struct bpf_verifier_log *log,
 
 	t = btf_type_by_id(btf, id);
 	do {
-		err = btf_struct_walk(log, btf, t, off, size, &id, &tmp_flag);
+		err = btf_struct_walk(log, btf, t, off, size, &id, &tmp_flag, field_name);
 
 		switch (err) {
 		case WALK_PTR:
@@ -6528,7 +6532,7 @@ again:
 	type = btf_type_by_id(btf, id);
 	if (!type)
 		return false;
-	err = btf_struct_walk(log, btf, type, off, 1, &id, &flag);
+	err = btf_struct_walk(log, btf, type, off, 1, &id, &flag, NULL);
 	if (err != WALK_STRUCT)
 		return false;
 
@@ -8488,16 +8492,15 @@ out:
 
 bool btf_nested_type_is_trusted(struct bpf_verifier_log *log,
 				const struct bpf_reg_state *reg,
-				int off, const char *suffix)
+				const char *field_name, u32 btf_id, const char *suffix)
 {
 	struct btf *btf = reg->btf;
 	const struct btf_type *walk_type, *safe_type;
 	const char *tname;
 	char safe_tname[64];
 	long ret, safe_id;
-	const struct btf_member *member, *m_walk = NULL;
+	const struct btf_member *member;
 	u32 i;
-	const char *walk_name;
 
 	walk_type = btf_type_by_id(btf, reg->btf_id);
 	if (!walk_type)
@@ -8517,30 +8520,17 @@ bool btf_nested_type_is_trusted(struct bpf_verifier_log *log,
 	if (!safe_type)
 		return false;
 
-	for_each_member(i, walk_type, member) {
-		u32 moff;
-
-		/* We're looking for the PTR_TO_BTF_ID member in the struct
-		 * type we're walking which matches the specified offset.
-		 * Below, we'll iterate over the fields in the safe variant of
-		 * the struct and see if any of them has a matching type /
-		 * name.
-		 */
-		moff = __btf_member_bit_offset(walk_type, member) / 8;
-		if (off == moff) {
-			m_walk = member;
-			break;
-		}
-	}
-	if (m_walk == NULL)
-		return false;
-
-	walk_name = __btf_name_by_offset(btf, m_walk->name_off);
 	for_each_member(i, safe_type, member) {
 		const char *m_name = __btf_name_by_offset(btf, member->name_off);
+		const struct btf_type *mtype = btf_type_by_id(btf, member->type);
+		u32 id;
+
+		if (!btf_type_is_ptr(mtype))
+			continue;
 
+		btf_type_skip_modifiers(btf, mtype->type, &id);
 		/* If we match on both type and name, the field is considered trusted. */
-		if (m_walk->type == member->type && !strcmp(walk_name, m_name))
+		if (btf_id == id && !strcmp(field_name, m_name))
 			return true;
 	}
 
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index 5ca520e5eddf..2cd2e0b725cd 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -5400,12 +5400,12 @@ BTF_TYPE_SAFE_RCU(struct css_set) {
 
 /* full trusted: these fields are trusted even outside of RCU CS and never NULL */
 BTF_TYPE_SAFE_TRUSTED(struct bpf_iter_meta) {
-	__bpf_md_ptr(struct seq_file *, seq);
+	struct seq_file *seq;
 };
 
 BTF_TYPE_SAFE_TRUSTED(struct bpf_iter__task) {
-	__bpf_md_ptr(struct bpf_iter_meta *, meta);
-	__bpf_md_ptr(struct task_struct *, task);
+	struct bpf_iter_meta *meta;
+	struct task_struct *task;
 };
 
 BTF_TYPE_SAFE_TRUSTED(struct linux_binprm) {
@@ -5427,17 +5427,17 @@ BTF_TYPE_SAFE_TRUSTED(struct socket) {
 
 static bool type_is_rcu(struct bpf_verifier_env *env,
 			struct bpf_reg_state *reg,
-			int off)
+			const char *field_name, u32 btf_id)
 {
 	BTF_TYPE_EMIT(BTF_TYPE_SAFE_RCU(struct task_struct));
 	BTF_TYPE_EMIT(BTF_TYPE_SAFE_RCU(struct css_set));
 
-	return btf_nested_type_is_trusted(&env->log, reg, off, "__safe_rcu");
+	return btf_nested_type_is_trusted(&env->log, reg, field_name, btf_id, "__safe_rcu");
 }
 
 static bool type_is_trusted(struct bpf_verifier_env *env,
 			    struct bpf_reg_state *reg,
-			    int off)
+			    const char *field_name, u32 btf_id)
 {
 	BTF_TYPE_EMIT(BTF_TYPE_SAFE_TRUSTED(struct bpf_iter_meta));
 	BTF_TYPE_EMIT(BTF_TYPE_SAFE_TRUSTED(struct bpf_iter__task));
@@ -5446,7 +5446,7 @@ static bool type_is_trusted(struct bpf_verifier_env *env,
 	BTF_TYPE_EMIT(BTF_TYPE_SAFE_TRUSTED(struct dentry));
 	BTF_TYPE_EMIT(BTF_TYPE_SAFE_TRUSTED(struct socket));
 
-	return btf_nested_type_is_trusted(&env->log, reg, off, "__safe_trusted");
+	return btf_nested_type_is_trusted(&env->log, reg, field_name, btf_id, "__safe_trusted");
 }
 
 static int check_ptr_to_btf_access(struct bpf_verifier_env *env,
@@ -5458,6 +5458,7 @@ static int check_ptr_to_btf_access(struct bpf_verifier_env *env,
 	struct bpf_reg_state *reg = regs + regno;
 	const struct btf_type *t = btf_type_by_id(reg->btf, reg->btf_id);
 	const char *tname = btf_name_by_offset(reg->btf, t->name_off);
+	const char *field_name = NULL;
 	enum bpf_type_flag flag = 0;
 	u32 btf_id = 0;
 	int ret;
@@ -5526,7 +5527,7 @@ static int check_ptr_to_btf_access(struct bpf_verifier_env *env,
 			return -EFAULT;
 		}
 
-		ret = btf_struct_access(&env->log, reg, off, size, atype, &btf_id, &flag);
+		ret = btf_struct_access(&env->log, reg, off, size, atype, &btf_id, &flag, &field_name);
 	}
 
 	if (ret < 0)
@@ -5554,10 +5555,10 @@ static int check_ptr_to_btf_access(struct bpf_verifier_env *env,
 		 * A regular RCU-protected pointer with __rcu tag can also be deemed
 		 * trusted if we are in an RCU CS. Such pointer can be NULL.
 		 */
-		if (type_is_trusted(env, reg, off)) {
+		if (type_is_trusted(env, reg, field_name, btf_id)) {
 			flag |= PTR_TRUSTED;
 		} else if (in_rcu_cs(env) && !type_may_be_null(reg->type)) {
-			if (type_is_rcu(env, reg, off)) {
+			if (type_is_rcu(env, reg, field_name, btf_id)) {
 				/* ignore __rcu tag and mark it MEM_RCU */
 				flag |= MEM_RCU;
 			} else if (flag & MEM_RCU) {
@@ -5640,7 +5641,7 @@ static int check_ptr_to_map_access(struct bpf_verifier_env *env,
 	/* Simulate access to a PTR_TO_BTF_ID */
 	memset(&map_reg, 0, sizeof(map_reg));
 	mark_btf_ld_reg(env, &map_reg, 0, PTR_TO_BTF_ID, btf_vmlinux, *map->ops->map_btf_id, 0);
-	ret = btf_struct_access(&env->log, &map_reg, off, size, atype, &btf_id, &flag);
+	ret = btf_struct_access(&env->log, &map_reg, off, size, atype, &btf_id, &flag, NULL);
 	if (ret < 0)
 		return ret;
 
-- 
cgit v1.2.3


From 91571a515d1bcdc280bb46423bb697ea7eb42ff3 Mon Sep 17 00:00:00 2001
From: Alexei Starovoitov <ast@kernel.org>
Date: Mon, 3 Apr 2023 21:50:25 -0700
Subject: bpf: Teach verifier that certain helpers accept NULL pointer.

bpf_[sk|inode|task|cgrp]_storage_[get|delete]() and bpf_get_socket_cookie() helpers
perform run-time check that sk|inode|task|cgrp pointer != NULL.
Teach verifier about this fact and allow bpf programs to pass
PTR_TO_BTF_ID | PTR_MAYBE_NULL into such helpers.
It will be used in the subsequent patch that will do
bpf_sk_storage_get(.., skb->sk, ...);
Even when 'skb' pointer is trusted the 'sk' pointer may be NULL.

Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
Acked-by: David Vernet <void@manifault.com>
Link: https://lore.kernel.org/bpf/20230404045029.82870-5-alexei.starovoitov@gmail.com
---
 kernel/bpf/bpf_cgrp_storage.c  | 4 ++--
 kernel/bpf/bpf_inode_storage.c | 4 ++--
 kernel/bpf/bpf_task_storage.c  | 8 ++++----
 net/core/bpf_sk_storage.c      | 4 ++--
 net/core/filter.c              | 2 +-
 5 files changed, 11 insertions(+), 11 deletions(-)

(limited to 'kernel')

diff --git a/kernel/bpf/bpf_cgrp_storage.c b/kernel/bpf/bpf_cgrp_storage.c
index d17d5b694668..d44fe8dd9732 100644
--- a/kernel/bpf/bpf_cgrp_storage.c
+++ b/kernel/bpf/bpf_cgrp_storage.c
@@ -224,7 +224,7 @@ const struct bpf_func_proto bpf_cgrp_storage_get_proto = {
 	.gpl_only	= false,
 	.ret_type	= RET_PTR_TO_MAP_VALUE_OR_NULL,
 	.arg1_type	= ARG_CONST_MAP_PTR,
-	.arg2_type	= ARG_PTR_TO_BTF_ID,
+	.arg2_type	= ARG_PTR_TO_BTF_ID_OR_NULL,
 	.arg2_btf_id	= &bpf_cgroup_btf_id[0],
 	.arg3_type	= ARG_PTR_TO_MAP_VALUE_OR_NULL,
 	.arg4_type	= ARG_ANYTHING,
@@ -235,6 +235,6 @@ const struct bpf_func_proto bpf_cgrp_storage_delete_proto = {
 	.gpl_only	= false,
 	.ret_type	= RET_INTEGER,
 	.arg1_type	= ARG_CONST_MAP_PTR,
-	.arg2_type	= ARG_PTR_TO_BTF_ID,
+	.arg2_type	= ARG_PTR_TO_BTF_ID_OR_NULL,
 	.arg2_btf_id	= &bpf_cgroup_btf_id[0],
 };
diff --git a/kernel/bpf/bpf_inode_storage.c b/kernel/bpf/bpf_inode_storage.c
index e17ad581b9be..a4d93df78c75 100644
--- a/kernel/bpf/bpf_inode_storage.c
+++ b/kernel/bpf/bpf_inode_storage.c
@@ -229,7 +229,7 @@ const struct bpf_func_proto bpf_inode_storage_get_proto = {
 	.gpl_only	= false,
 	.ret_type	= RET_PTR_TO_MAP_VALUE_OR_NULL,
 	.arg1_type	= ARG_CONST_MAP_PTR,
-	.arg2_type	= ARG_PTR_TO_BTF_ID,
+	.arg2_type	= ARG_PTR_TO_BTF_ID_OR_NULL,
 	.arg2_btf_id	= &bpf_inode_storage_btf_ids[0],
 	.arg3_type	= ARG_PTR_TO_MAP_VALUE_OR_NULL,
 	.arg4_type	= ARG_ANYTHING,
@@ -240,6 +240,6 @@ const struct bpf_func_proto bpf_inode_storage_delete_proto = {
 	.gpl_only	= false,
 	.ret_type	= RET_INTEGER,
 	.arg1_type	= ARG_CONST_MAP_PTR,
-	.arg2_type	= ARG_PTR_TO_BTF_ID,
+	.arg2_type	= ARG_PTR_TO_BTF_ID_OR_NULL,
 	.arg2_btf_id	= &bpf_inode_storage_btf_ids[0],
 };
diff --git a/kernel/bpf/bpf_task_storage.c b/kernel/bpf/bpf_task_storage.c
index d1af0c8f9ce4..adf6dfe0ba68 100644
--- a/kernel/bpf/bpf_task_storage.c
+++ b/kernel/bpf/bpf_task_storage.c
@@ -338,7 +338,7 @@ const struct bpf_func_proto bpf_task_storage_get_recur_proto = {
 	.gpl_only = false,
 	.ret_type = RET_PTR_TO_MAP_VALUE_OR_NULL,
 	.arg1_type = ARG_CONST_MAP_PTR,
-	.arg2_type = ARG_PTR_TO_BTF_ID,
+	.arg2_type = ARG_PTR_TO_BTF_ID_OR_NULL,
 	.arg2_btf_id = &btf_tracing_ids[BTF_TRACING_TYPE_TASK],
 	.arg3_type = ARG_PTR_TO_MAP_VALUE_OR_NULL,
 	.arg4_type = ARG_ANYTHING,
@@ -349,7 +349,7 @@ const struct bpf_func_proto bpf_task_storage_get_proto = {
 	.gpl_only = false,
 	.ret_type = RET_PTR_TO_MAP_VALUE_OR_NULL,
 	.arg1_type = ARG_CONST_MAP_PTR,
-	.arg2_type = ARG_PTR_TO_BTF_ID,
+	.arg2_type = ARG_PTR_TO_BTF_ID_OR_NULL,
 	.arg2_btf_id = &btf_tracing_ids[BTF_TRACING_TYPE_TASK],
 	.arg3_type = ARG_PTR_TO_MAP_VALUE_OR_NULL,
 	.arg4_type = ARG_ANYTHING,
@@ -360,7 +360,7 @@ const struct bpf_func_proto bpf_task_storage_delete_recur_proto = {
 	.gpl_only = false,
 	.ret_type = RET_INTEGER,
 	.arg1_type = ARG_CONST_MAP_PTR,
-	.arg2_type = ARG_PTR_TO_BTF_ID,
+	.arg2_type = ARG_PTR_TO_BTF_ID_OR_NULL,
 	.arg2_btf_id = &btf_tracing_ids[BTF_TRACING_TYPE_TASK],
 };
 
@@ -369,6 +369,6 @@ const struct bpf_func_proto bpf_task_storage_delete_proto = {
 	.gpl_only = false,
 	.ret_type = RET_INTEGER,
 	.arg1_type = ARG_CONST_MAP_PTR,
-	.arg2_type = ARG_PTR_TO_BTF_ID,
+	.arg2_type = ARG_PTR_TO_BTF_ID_OR_NULL,
 	.arg2_btf_id = &btf_tracing_ids[BTF_TRACING_TYPE_TASK],
 };
diff --git a/net/core/bpf_sk_storage.c b/net/core/bpf_sk_storage.c
index 085025c7130a..d4172534dfa8 100644
--- a/net/core/bpf_sk_storage.c
+++ b/net/core/bpf_sk_storage.c
@@ -412,7 +412,7 @@ const struct bpf_func_proto bpf_sk_storage_get_tracing_proto = {
 	.gpl_only	= false,
 	.ret_type	= RET_PTR_TO_MAP_VALUE_OR_NULL,
 	.arg1_type	= ARG_CONST_MAP_PTR,
-	.arg2_type	= ARG_PTR_TO_BTF_ID,
+	.arg2_type	= ARG_PTR_TO_BTF_ID_OR_NULL,
 	.arg2_btf_id	= &btf_sock_ids[BTF_SOCK_TYPE_SOCK_COMMON],
 	.arg3_type	= ARG_PTR_TO_MAP_VALUE_OR_NULL,
 	.arg4_type	= ARG_ANYTHING,
@@ -424,7 +424,7 @@ const struct bpf_func_proto bpf_sk_storage_delete_tracing_proto = {
 	.gpl_only	= false,
 	.ret_type	= RET_INTEGER,
 	.arg1_type	= ARG_CONST_MAP_PTR,
-	.arg2_type	= ARG_PTR_TO_BTF_ID,
+	.arg2_type	= ARG_PTR_TO_BTF_ID_OR_NULL,
 	.arg2_btf_id	= &btf_sock_ids[BTF_SOCK_TYPE_SOCK_COMMON],
 	.allowed	= bpf_sk_storage_tracing_allowed,
 };
diff --git a/net/core/filter.c b/net/core/filter.c
index 1f2abf0f60e6..727c5269867d 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -4998,7 +4998,7 @@ const struct bpf_func_proto bpf_get_socket_ptr_cookie_proto = {
 	.func		= bpf_get_socket_ptr_cookie,
 	.gpl_only	= false,
 	.ret_type	= RET_INTEGER,
-	.arg1_type	= ARG_PTR_TO_BTF_ID_SOCK_COMMON,
+	.arg1_type	= ARG_PTR_TO_BTF_ID_SOCK_COMMON | PTR_MAYBE_NULL,
 };
 
 BPF_CALL_1(bpf_get_socket_cookie_sock_ops, struct bpf_sock_ops_kern *, ctx)
-- 
cgit v1.2.3


From add68b843f33d4e5dcbdc7ba6dffe7750a964159 Mon Sep 17 00:00:00 2001
From: Alexei Starovoitov <ast@kernel.org>
Date: Mon, 3 Apr 2023 21:50:26 -0700
Subject: bpf: Refactor NULL-ness check in check_reg_type().

check_reg_type() unconditionally disallows PTR_TO_BTF_ID | PTR_MAYBE_NULL.
It's problematic for helpers that allow ARG_PTR_TO_BTF_ID_OR_NULL like
bpf_sk_storage_get(). Allow passing PTR_TO_BTF_ID | PTR_MAYBE_NULL into such
helpers. That technically includes bpf_kptr_xchg() helper, but in practice:
  bpf_kptr_xchg(..., bpf_cpumask_create());
is still disallowed because bpf_cpumask_create() returns ref counted pointer
with ref_obj_id > 0.

Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
Acked-by: David Vernet <void@manifault.com>
Link: https://lore.kernel.org/bpf/20230404045029.82870-6-alexei.starovoitov@gmail.com
---
 kernel/bpf/verifier.c | 12 ++++++++----
 1 file changed, 8 insertions(+), 4 deletions(-)

(limited to 'kernel')

diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index 2cd2e0b725cd..4e7d671497f4 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -7168,6 +7168,8 @@ found:
 	case PTR_TO_BTF_ID:
 	case PTR_TO_BTF_ID | PTR_TRUSTED:
 	case PTR_TO_BTF_ID | MEM_RCU:
+	case PTR_TO_BTF_ID | PTR_MAYBE_NULL:
+	case PTR_TO_BTF_ID | PTR_MAYBE_NULL | MEM_RCU:
 	{
 		/* For bpf_sk_release, it needs to match against first member
 		 * 'struct sock_common', hence make an exception for it. This
@@ -7176,6 +7178,12 @@ found:
 		bool strict_type_match = arg_type_is_release(arg_type) &&
 					 meta->func_id != BPF_FUNC_sk_release;
 
+		if (type_may_be_null(reg->type) &&
+		    (!type_may_be_null(arg_type) || arg_type_is_release(arg_type))) {
+			verbose(env, "Possibly NULL pointer passed to helper arg%d\n", regno);
+			return -EACCES;
+		}
+
 		if (!arg_btf_id) {
 			if (!compatible->btf_id) {
 				verbose(env, "verifier internal error: missing arg compatible BTF ID\n");
@@ -7206,10 +7214,6 @@ found:
 		}
 		break;
 	}
-	case PTR_TO_BTF_ID | PTR_MAYBE_NULL:
-	case PTR_TO_BTF_ID | PTR_MAYBE_NULL | MEM_RCU:
-		verbose(env, "Possibly NULL pointer passed to helper arg%d\n", regno);
-		return -EACCES;
 	case PTR_TO_BTF_ID | MEM_ALLOC:
 		if (meta->func_id != BPF_FUNC_spin_lock && meta->func_id != BPF_FUNC_spin_unlock &&
 		    meta->func_id != BPF_FUNC_kptr_xchg) {
-- 
cgit v1.2.3


From 30ee9821f9430c34a6254134a5f0e8db227510be Mon Sep 17 00:00:00 2001
From: Alexei Starovoitov <ast@kernel.org>
Date: Mon, 3 Apr 2023 21:50:27 -0700
Subject: bpf: Allowlist few fields similar to __rcu tag.

Allow bpf program access cgrp->kn, mm->exe_file, skb->sk, req->sk.

Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
Acked-by: David Vernet <void@manifault.com>
Link: https://lore.kernel.org/bpf/20230404045029.82870-7-alexei.starovoitov@gmail.com
---
 kernel/bpf/verifier.c | 39 +++++++++++++++++++++++++++++++++++++--
 1 file changed, 37 insertions(+), 2 deletions(-)

(limited to 'kernel')

diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index 4e7d671497f4..fd90ba498ccc 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -5378,6 +5378,7 @@ static int bpf_map_direct_read(struct bpf_map *map, int off, int size, u64 *val)
 }
 
 #define BTF_TYPE_SAFE_RCU(__type)  __PASTE(__type, __safe_rcu)
+#define BTF_TYPE_SAFE_RCU_OR_NULL(__type)  __PASTE(__type, __safe_rcu_or_null)
 #define BTF_TYPE_SAFE_TRUSTED(__type)  __PASTE(__type, __safe_trusted)
 
 /*
@@ -5394,10 +5395,31 @@ BTF_TYPE_SAFE_RCU(struct task_struct) {
 	struct task_struct *group_leader;
 };
 
+BTF_TYPE_SAFE_RCU(struct cgroup) {
+	/* cgrp->kn is always accessible as documented in kernel/cgroup/cgroup.c */
+	struct kernfs_node *kn;
+};
+
 BTF_TYPE_SAFE_RCU(struct css_set) {
 	struct cgroup *dfl_cgrp;
 };
 
+/* RCU trusted: these fields are trusted in RCU CS and can be NULL */
+BTF_TYPE_SAFE_RCU_OR_NULL(struct mm_struct) {
+	struct file __rcu *exe_file;
+};
+
+/* skb->sk, req->sk are not RCU protected, but we mark them as such
+ * because bpf prog accessible sockets are SOCK_RCU_FREE.
+ */
+BTF_TYPE_SAFE_RCU_OR_NULL(struct sk_buff) {
+	struct sock *sk;
+};
+
+BTF_TYPE_SAFE_RCU_OR_NULL(struct request_sock) {
+	struct sock *sk;
+};
+
 /* full trusted: these fields are trusted even outside of RCU CS and never NULL */
 BTF_TYPE_SAFE_TRUSTED(struct bpf_iter_meta) {
 	struct seq_file *seq;
@@ -5430,11 +5452,23 @@ static bool type_is_rcu(struct bpf_verifier_env *env,
 			const char *field_name, u32 btf_id)
 {
 	BTF_TYPE_EMIT(BTF_TYPE_SAFE_RCU(struct task_struct));
+	BTF_TYPE_EMIT(BTF_TYPE_SAFE_RCU(struct cgroup));
 	BTF_TYPE_EMIT(BTF_TYPE_SAFE_RCU(struct css_set));
 
 	return btf_nested_type_is_trusted(&env->log, reg, field_name, btf_id, "__safe_rcu");
 }
 
+static bool type_is_rcu_or_null(struct bpf_verifier_env *env,
+				struct bpf_reg_state *reg,
+				const char *field_name, u32 btf_id)
+{
+	BTF_TYPE_EMIT(BTF_TYPE_SAFE_RCU_OR_NULL(struct mm_struct));
+	BTF_TYPE_EMIT(BTF_TYPE_SAFE_RCU_OR_NULL(struct sk_buff));
+	BTF_TYPE_EMIT(BTF_TYPE_SAFE_RCU_OR_NULL(struct request_sock));
+
+	return btf_nested_type_is_trusted(&env->log, reg, field_name, btf_id, "__safe_rcu_or_null");
+}
+
 static bool type_is_trusted(struct bpf_verifier_env *env,
 			    struct bpf_reg_state *reg,
 			    const char *field_name, u32 btf_id)
@@ -5561,9 +5595,10 @@ static int check_ptr_to_btf_access(struct bpf_verifier_env *env,
 			if (type_is_rcu(env, reg, field_name, btf_id)) {
 				/* ignore __rcu tag and mark it MEM_RCU */
 				flag |= MEM_RCU;
-			} else if (flag & MEM_RCU) {
+			} else if (flag & MEM_RCU ||
+				   type_is_rcu_or_null(env, reg, field_name, btf_id)) {
 				/* __rcu tagged pointers can be NULL */
-				flag |= PTR_MAYBE_NULL;
+				flag |= MEM_RCU | PTR_MAYBE_NULL;
 			} else if (flag & (MEM_PERCPU | MEM_USER)) {
 				/* keep as-is */
 			} else {
-- 
cgit v1.2.3


From afeebf9f57a4965e0be90e4dc87f8f3a1417376d Mon Sep 17 00:00:00 2001
From: Alexei Starovoitov <ast@kernel.org>
Date: Mon, 3 Apr 2023 21:50:28 -0700
Subject: bpf: Undo strict enforcement for walking untagged fields.

The commit 6fcd486b3a0a ("bpf: Refactor RCU enforcement in the verifier.")
broke several tracing bpf programs. Even in clang compiled kernels there are
many fields that are not marked with __rcu that are safe to read and pass into
helpers, but the verifier doesn't know that they're safe. Aggressively marking
them as PTR_UNTRUSTED was premature.

Fixes: 6fcd486b3a0a ("bpf: Refactor RCU enforcement in the verifier.")
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
Acked-by: David Vernet <void@manifault.com>
Link: https://lore.kernel.org/bpf/20230404045029.82870-8-alexei.starovoitov@gmail.com
---
 kernel/bpf/verifier.c | 11 ++++++++---
 1 file changed, 8 insertions(+), 3 deletions(-)

(limited to 'kernel')

diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index fd90ba498ccc..56f569811f70 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -4974,6 +4974,11 @@ static bool is_rcu_reg(const struct bpf_reg_state *reg)
 	return reg->type & MEM_RCU;
 }
 
+static void clear_trusted_flags(enum bpf_type_flag *flag)
+{
+	*flag &= ~(BPF_REG_TRUSTED_MODIFIERS | MEM_RCU);
+}
+
 static int check_pkt_ptr_alignment(struct bpf_verifier_env *env,
 				   const struct bpf_reg_state *reg,
 				   int off, int size, bool strict)
@@ -5602,8 +5607,8 @@ static int check_ptr_to_btf_access(struct bpf_verifier_env *env,
 			} else if (flag & (MEM_PERCPU | MEM_USER)) {
 				/* keep as-is */
 			} else {
-				/* walking unknown pointers yields untrusted pointer */
-				flag = PTR_UNTRUSTED;
+				/* walking unknown pointers yields old deprecated PTR_TO_BTF_ID */
+				clear_trusted_flags(&flag);
 			}
 		} else {
 			/*
@@ -5617,7 +5622,7 @@ static int check_ptr_to_btf_access(struct bpf_verifier_env *env,
 		}
 	} else {
 		/* Old compat. Deprecated */
-		flag &= ~PTR_TRUSTED;
+		clear_trusted_flags(&flag);
 	}
 
 	if (atype == BPF_READ && value_regno >= 0)
-- 
cgit v1.2.3


From d099f594ad5650e8d8232b5f31f5f90104e65def Mon Sep 17 00:00:00 2001
From: Jiri Olsa <jolsa@kernel.org>
Date: Tue, 4 Apr 2023 00:02:54 +0200
Subject: kallsyms: Disable preemption for find_kallsyms_symbol_value

Artem reported suspicious RCU usage [1]. The reason is that verifier
calls find_kallsyms_symbol_value with preemption enabled which will
trigger suspicious RCU usage warning in rcu_dereference_sched call.

Disabling preemption in find_kallsyms_symbol_value and adding
__find_kallsyms_symbol_value function.

Fixes: 31bf1dbccfb0 ("bpf: Fix attaching fentry/fexit/fmod_ret/lsm to modules")
Reported-by: Artem Savkov <asavkov@redhat.com>
Signed-off-by: Jiri Olsa <jolsa@kernel.org>
Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
Tested-by: Artem Savkov <asavkov@redhat.com>
Reviewed-by: Zhen Lei <thunder.leizhen@huawei.com>
Link: https://lore.kernel.org/bpf/20230403220254.2191240-1-jolsa@kernel.org

[1] https://lore.kernel.org/bpf/ZBrPMkv8YVRiWwCR@samus.usersys.redhat.com/
---
 kernel/module/kallsyms.c | 16 +++++++++++++---
 1 file changed, 13 insertions(+), 3 deletions(-)

(limited to 'kernel')

diff --git a/kernel/module/kallsyms.c b/kernel/module/kallsyms.c
index ab2376a1be88..bdc911dbcde5 100644
--- a/kernel/module/kallsyms.c
+++ b/kernel/module/kallsyms.c
@@ -442,7 +442,7 @@ int module_get_kallsym(unsigned int symnum, unsigned long *value, char *type,
 }
 
 /* Given a module and name of symbol, find and return the symbol's value */
-unsigned long find_kallsyms_symbol_value(struct module *mod, const char *name)
+static unsigned long __find_kallsyms_symbol_value(struct module *mod, const char *name)
 {
 	unsigned int i;
 	struct mod_kallsyms *kallsyms = rcu_dereference_sched(mod->kallsyms);
@@ -466,7 +466,7 @@ static unsigned long __module_kallsyms_lookup_name(const char *name)
 	if (colon) {
 		mod = find_module_all(name, colon - name, false);
 		if (mod)
-			return find_kallsyms_symbol_value(mod, colon + 1);
+			return __find_kallsyms_symbol_value(mod, colon + 1);
 		return 0;
 	}
 
@@ -475,7 +475,7 @@ static unsigned long __module_kallsyms_lookup_name(const char *name)
 
 		if (mod->state == MODULE_STATE_UNFORMED)
 			continue;
-		ret = find_kallsyms_symbol_value(mod, name);
+		ret = __find_kallsyms_symbol_value(mod, name);
 		if (ret)
 			return ret;
 	}
@@ -494,6 +494,16 @@ unsigned long module_kallsyms_lookup_name(const char *name)
 	return ret;
 }
 
+unsigned long find_kallsyms_symbol_value(struct module *mod, const char *name)
+{
+	unsigned long ret;
+
+	preempt_disable();
+	ret = __find_kallsyms_symbol_value(mod, name);
+	preempt_enable();
+	return ret;
+}
+
 int module_kallsyms_on_each_symbol(const char *modname,
 				   int (*fn)(void *, const char *,
 					     struct module *, unsigned long),
-- 
cgit v1.2.3


From 13fbcee55706db45ce047a7cea14811d68f94ee3 Mon Sep 17 00:00:00 2001
From: Yonghong Song <yhs@fb.com>
Date: Thu, 6 Apr 2023 09:44:55 -0700
Subject: bpf: Improve verifier JEQ/JNE insn branch taken checking

Currently, for BPF_JEQ/BPF_JNE insn, verifier determines
whether the branch is taken or not only if both operands
are constants. Therefore, for the following code snippet,
  0: (85) call bpf_ktime_get_ns#5       ; R0_w=scalar()
  1: (a5) if r0 < 0x3 goto pc+2         ; R0_w=scalar(umin=3)
  2: (b7) r2 = 2                        ; R2_w=2
  3: (1d) if r0 == r2 goto pc+2 6

At insn 3, since r0 is not a constant, verifier assumes both branch
can be taken which may lead inproper verification failure.

Add comparing umin/umax value and the constant. If the umin value
is greater than the constant, or umax value is smaller than the constant,
for JEQ the branch must be not-taken, and for JNE the branch must be taken.
The jmp32 mode JEQ/JNE branch taken checking is also handled similarly.

The following lists the veristat result w.r.t. changed number
of processes insns during verification:

File                                                   Program                                               Insns (A)  Insns (B)  Insns    (DIFF)
-----------------------------------------------------  ----------------------------------------------------  ---------  ---------  ---------------
test_cls_redirect.bpf.linked3.o                        cls_redirect                                              64980      73472  +8492 (+13.07%)
test_seg6_loop.bpf.linked3.o                           __add_egr_x                                               12425      12423      -2 (-0.02%)
test_tcp_hdr_options.bpf.linked3.o                     estab                                                      2634       2558     -76 (-2.89%)
test_parse_tcp_hdr_opt.bpf.linked3.o                   xdp_ingress_v6                                             1421       1420      -1 (-0.07%)
test_parse_tcp_hdr_opt_dynptr.bpf.linked3.o            xdp_ingress_v6                                             1238       1237      -1 (-0.08%)
test_tc_dtime.bpf.linked3.o                            egress_fwdns_prio100                                        414        411      -3 (-0.72%)

Mostly a small improvement but test_cls_redirect.bpf.linked3.o has a 13% regression.
I checked with verifier log and found it this is due to pruning.
For some JEQ/JNE branches impacted by this patch,
one branch is explored and the other has state equivalence and
pruned.

Signed-off-by: Yonghong Song <yhs@fb.com>
Acked-by: Dave Marchevsky <davemarchevsky@fb.com>
Acked-by: Andrii Nakryiko <andrii@kernel.org>
Link: https://lore.kernel.org/r/20230406164455.1045294-1-yhs@fb.com
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 kernel/bpf/verifier.c | 8 ++++++++
 1 file changed, 8 insertions(+)

(limited to 'kernel')

diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index 56f569811f70..5c6b90e384a5 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -12651,10 +12651,14 @@ static int is_branch32_taken(struct bpf_reg_state *reg, u32 val, u8 opcode)
 	case BPF_JEQ:
 		if (tnum_is_const(subreg))
 			return !!tnum_equals_const(subreg, val);
+		else if (val < reg->u32_min_value || val > reg->u32_max_value)
+			return 0;
 		break;
 	case BPF_JNE:
 		if (tnum_is_const(subreg))
 			return !tnum_equals_const(subreg, val);
+		else if (val < reg->u32_min_value || val > reg->u32_max_value)
+			return 1;
 		break;
 	case BPF_JSET:
 		if ((~subreg.mask & subreg.value) & val)
@@ -12724,10 +12728,14 @@ static int is_branch64_taken(struct bpf_reg_state *reg, u64 val, u8 opcode)
 	case BPF_JEQ:
 		if (tnum_is_const(reg->var_off))
 			return !!tnum_equals_const(reg->var_off, val);
+		else if (val < reg->umin_value || val > reg->umax_value)
+			return 0;
 		break;
 	case BPF_JNE:
 		if (tnum_is_const(reg->var_off))
 			return !tnum_equals_const(reg->var_off, val);
+		else if (val < reg->umin_value || val > reg->umax_value)
+			return 1;
 		break;
 	case BPF_JSET:
 		if ((~reg->var_off.mask & reg->var_off.value) & val)
-- 
cgit v1.2.3


From 953d9f5beaf75e88c69a13d70ce424cd606a29f5 Mon Sep 17 00:00:00 2001
From: Yonghong Song <yhs@fb.com>
Date: Thu, 6 Apr 2023 09:45:05 -0700
Subject: bpf: Improve handling of pattern '<const> <cond_op> <non_const>' in
 verifier

Currently, the verifier does not handle '<const> <cond_op> <non_const>' well.
For example,
  ...
  10: (79) r1 = *(u64 *)(r10 -16)       ; R1_w=scalar() R10=fp0
  11: (b7) r2 = 0                       ; R2_w=0
  12: (2d) if r2 > r1 goto pc+2
  13: (b7) r0 = 0
  14: (95) exit
  15: (65) if r1 s> 0x1 goto pc+3
  16: (0f) r0 += r1
  ...
At insn 12, verifier decides both true and false branch are possible, but
actually only false branch is possible.

Currently, the verifier already supports patterns '<non_const> <cond_op> <const>.
Add support for patterns '<const> <cond_op> <non_const>' in a similar way.

Also fix selftest 'verifier_bounds_mix_sign_unsign/bounds checks mixing signed and unsigned, variant 10'
due to this change.

Signed-off-by: Yonghong Song <yhs@fb.com>
Acked-by: Dave Marchevsky <davemarchevsky@fb.com>
Acked-by: Andrii Nakryiko <andrii@kernel.org>
Link: https://lore.kernel.org/r/20230406164505.1046801-1-yhs@fb.com
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 kernel/bpf/verifier.c                                        | 12 ++++++++++++
 .../selftests/bpf/progs/verifier_bounds_mix_sign_unsign.c    |  2 +-
 2 files changed, 13 insertions(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index 5c6b90e384a5..3660b573048a 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -13356,6 +13356,18 @@ static int check_cond_jmp_op(struct bpf_verifier_env *env,
 				       src_reg->var_off.value,
 				       opcode,
 				       is_jmp32);
+	} else if (dst_reg->type == SCALAR_VALUE &&
+		   is_jmp32 && tnum_is_const(tnum_subreg(dst_reg->var_off))) {
+		pred = is_branch_taken(src_reg,
+				       tnum_subreg(dst_reg->var_off).value,
+				       flip_opcode(opcode),
+				       is_jmp32);
+	} else if (dst_reg->type == SCALAR_VALUE &&
+		   !is_jmp32 && tnum_is_const(dst_reg->var_off)) {
+		pred = is_branch_taken(src_reg,
+				       dst_reg->var_off.value,
+				       flip_opcode(opcode),
+				       is_jmp32);
 	} else if (reg_is_pkt_pointer_any(dst_reg) &&
 		   reg_is_pkt_pointer_any(src_reg) &&
 		   !is_jmp32) {
diff --git a/tools/testing/selftests/bpf/progs/verifier_bounds_mix_sign_unsign.c b/tools/testing/selftests/bpf/progs/verifier_bounds_mix_sign_unsign.c
index 91a66357896a..4f40144748a5 100644
--- a/tools/testing/selftests/bpf/progs/verifier_bounds_mix_sign_unsign.c
+++ b/tools/testing/selftests/bpf/progs/verifier_bounds_mix_sign_unsign.c
@@ -354,7 +354,7 @@ __naked void signed_and_unsigned_variant_10(void)
 	call %[bpf_map_lookup_elem];			\
 	if r0 == 0 goto l0_%=;				\
 	r1 = *(u64*)(r10 - 16);				\
-	r2 = 0;						\
+	r2 = -1;						\
 	if r2 > r1 goto l1_%=;				\
 	r0 = 0;						\
 	exit;						\
-- 
cgit v1.2.3


From f3f21349779776135349a8e6f114a1485b2476b7 Mon Sep 17 00:00:00 2001
From: Barret Rhoden <brho@google.com>
Date: Thu, 6 Apr 2023 20:18:08 -0400
Subject: bpf: ensure all memory is initialized in bpf_get_current_comm

BPF helpers that take an ARG_PTR_TO_UNINIT_MEM must ensure that all of
the memory is set, including beyond the end of the string.

Signed-off-by: Barret Rhoden <brho@google.com>
Link: https://lore.kernel.org/r/20230407001808.1622968-1-brho@google.com
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 kernel/bpf/helpers.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c
index 6be16db9f188..b6a5cda5bb59 100644
--- a/kernel/bpf/helpers.c
+++ b/kernel/bpf/helpers.c
@@ -258,7 +258,7 @@ BPF_CALL_2(bpf_get_current_comm, char *, buf, u32, size)
 		goto err_clear;
 
 	/* Verifier guarantees that size > 0 */
-	strscpy(buf, task->comm, size);
+	strscpy_pad(buf, task->comm, size);
 	return 0;
 err_clear:
 	memset(buf, 0, size);
-- 
cgit v1.2.3


From 4294a0a7ab6282c3d92f03de84e762dda993c93d Mon Sep 17 00:00:00 2001
From: Andrii Nakryiko <andrii@kernel.org>
Date: Thu, 6 Apr 2023 16:41:47 -0700
Subject: bpf: Split off basic BPF verifier log into separate file

kernel/bpf/verifier.c file is large and growing larger all the time. So
it's good to start splitting off more or less self-contained parts into
separate files to keep source code size (somewhat) somewhat under
control.

This patch is a one step in this direction, moving some of BPF verifier log
routines into a separate kernel/bpf/log.c. Right now it's most low-level
and isolated routines to append data to log, reset log to previous
position, etc. Eventually we could probably move verifier state
printing logic here as well, but this patch doesn't attempt to do that
yet.

Subsequent patches will add more logic to verifier log management, so
having basics in a separate file will make sure verifier.c doesn't grow
more with new changes.

Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Acked-by: Lorenz Bauer <lmb@isovalent.com>
Link: https://lore.kernel.org/bpf/20230406234205.323208-2-andrii@kernel.org
---
 include/linux/bpf_verifier.h | 19 ++++------
 kernel/bpf/Makefile          |  3 +-
 kernel/bpf/log.c             | 85 ++++++++++++++++++++++++++++++++++++++++++++
 kernel/bpf/verifier.c        | 69 -----------------------------------
 4 files changed, 94 insertions(+), 82 deletions(-)
 create mode 100644 kernel/bpf/log.c

(limited to 'kernel')

diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h
index 81d525d057c7..83dff25545ee 100644
--- a/include/linux/bpf_verifier.h
+++ b/include/linux/bpf_verifier.h
@@ -498,11 +498,6 @@ struct bpf_verifier_log {
 	u32 len_total;
 };
 
-static inline bool bpf_verifier_log_full(const struct bpf_verifier_log *log)
-{
-	return log->len_used >= log->len_total - 1;
-}
-
 #define BPF_LOG_LEVEL1	1
 #define BPF_LOG_LEVEL2	2
 #define BPF_LOG_STATS	4
@@ -512,6 +507,11 @@ static inline bool bpf_verifier_log_full(const struct bpf_verifier_log *log)
 #define BPF_LOG_MIN_ALIGNMENT 8U
 #define BPF_LOG_ALIGNMENT 40U
 
+static inline bool bpf_verifier_log_full(const struct bpf_verifier_log *log)
+{
+	return log->len_used >= log->len_total - 1;
+}
+
 static inline bool bpf_verifier_log_needed(const struct bpf_verifier_log *log)
 {
 	return log &&
@@ -519,13 +519,6 @@ static inline bool bpf_verifier_log_needed(const struct bpf_verifier_log *log)
 		 log->level == BPF_LOG_KERNEL);
 }
 
-static inline bool
-bpf_verifier_log_attr_valid(const struct bpf_verifier_log *log)
-{
-	return log->len_total >= 128 && log->len_total <= UINT_MAX >> 2 &&
-	       log->level && log->ubuf && !(log->level & ~BPF_LOG_MASK);
-}
-
 #define BPF_MAX_SUBPROGS 256
 
 struct bpf_subprog_info {
@@ -608,12 +601,14 @@ struct bpf_verifier_env {
 	char type_str_buf[TYPE_STR_BUF_LEN];
 };
 
+bool bpf_verifier_log_attr_valid(const struct bpf_verifier_log *log);
 __printf(2, 0) void bpf_verifier_vlog(struct bpf_verifier_log *log,
 				      const char *fmt, va_list args);
 __printf(2, 3) void bpf_verifier_log_write(struct bpf_verifier_env *env,
 					   const char *fmt, ...);
 __printf(2, 3) void bpf_log(struct bpf_verifier_log *log,
 			    const char *fmt, ...);
+void bpf_vlog_reset(struct bpf_verifier_log *log, u32 new_pos);
 
 static inline struct bpf_func_state *cur_func(struct bpf_verifier_env *env)
 {
diff --git a/kernel/bpf/Makefile b/kernel/bpf/Makefile
index 02242614dcc7..1d3892168d32 100644
--- a/kernel/bpf/Makefile
+++ b/kernel/bpf/Makefile
@@ -6,7 +6,8 @@ cflags-nogcse-$(CONFIG_X86)$(CONFIG_CC_IS_GCC) := -fno-gcse
 endif
 CFLAGS_core.o += $(call cc-disable-warning, override-init) $(cflags-nogcse-yy)
 
-obj-$(CONFIG_BPF_SYSCALL) += syscall.o verifier.o inode.o helpers.o tnum.o bpf_iter.o map_iter.o task_iter.o prog_iter.o link_iter.o
+obj-$(CONFIG_BPF_SYSCALL) += syscall.o verifier.o inode.o helpers.o tnum.o log.o
+obj-$(CONFIG_BPF_SYSCALL) += bpf_iter.o map_iter.o task_iter.o prog_iter.o link_iter.o
 obj-$(CONFIG_BPF_SYSCALL) += hashtab.o arraymap.o percpu_freelist.o bpf_lru_list.o lpm_trie.o map_in_map.o bloom_filter.o
 obj-$(CONFIG_BPF_SYSCALL) += local_storage.o queue_stack_maps.o ringbuf.o
 obj-$(CONFIG_BPF_SYSCALL) += bpf_local_storage.o bpf_task_storage.o
diff --git a/kernel/bpf/log.c b/kernel/bpf/log.c
new file mode 100644
index 000000000000..920061e38d2e
--- /dev/null
+++ b/kernel/bpf/log.c
@@ -0,0 +1,85 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/* Copyright (c) 2011-2014 PLUMgrid, http://plumgrid.com
+ * Copyright (c) 2016 Facebook
+ * Copyright (c) 2018 Covalent IO, Inc. http://covalent.io
+ */
+#include <uapi/linux/btf.h>
+#include <linux/kernel.h>
+#include <linux/types.h>
+#include <linux/bpf.h>
+#include <linux/bpf_verifier.h>
+
+bool bpf_verifier_log_attr_valid(const struct bpf_verifier_log *log)
+{
+	return log->len_total >= 128 && log->len_total <= UINT_MAX >> 2 &&
+	       log->level && log->ubuf && !(log->level & ~BPF_LOG_MASK);
+}
+
+void bpf_verifier_vlog(struct bpf_verifier_log *log, const char *fmt,
+		       va_list args)
+{
+	unsigned int n;
+
+	n = vscnprintf(log->kbuf, BPF_VERIFIER_TMP_LOG_SIZE, fmt, args);
+
+	WARN_ONCE(n >= BPF_VERIFIER_TMP_LOG_SIZE - 1,
+		  "verifier log line truncated - local buffer too short\n");
+
+	if (log->level == BPF_LOG_KERNEL) {
+		bool newline = n > 0 && log->kbuf[n - 1] == '\n';
+
+		pr_err("BPF: %s%s", log->kbuf, newline ? "" : "\n");
+		return;
+	}
+
+	n = min(log->len_total - log->len_used - 1, n);
+	log->kbuf[n] = '\0';
+	if (!copy_to_user(log->ubuf + log->len_used, log->kbuf, n + 1))
+		log->len_used += n;
+	else
+		log->ubuf = NULL;
+}
+
+void bpf_vlog_reset(struct bpf_verifier_log *log, u32 new_pos)
+{
+	char zero = 0;
+
+	if (!bpf_verifier_log_needed(log))
+		return;
+
+	log->len_used = new_pos;
+	if (put_user(zero, log->ubuf + new_pos))
+		log->ubuf = NULL;
+}
+
+/* log_level controls verbosity level of eBPF verifier.
+ * bpf_verifier_log_write() is used to dump the verification trace to the log,
+ * so the user can figure out what's wrong with the program
+ */
+__printf(2, 3) void bpf_verifier_log_write(struct bpf_verifier_env *env,
+					   const char *fmt, ...)
+{
+	va_list args;
+
+	if (!bpf_verifier_log_needed(&env->log))
+		return;
+
+	va_start(args, fmt);
+	bpf_verifier_vlog(&env->log, fmt, args);
+	va_end(args);
+}
+EXPORT_SYMBOL_GPL(bpf_verifier_log_write);
+
+__printf(2, 3) void bpf_log(struct bpf_verifier_log *log,
+			    const char *fmt, ...)
+{
+	va_list args;
+
+	if (!bpf_verifier_log_needed(log))
+		return;
+
+	va_start(args, fmt);
+	bpf_verifier_vlog(log, fmt, args);
+	va_end(args);
+}
+EXPORT_SYMBOL_GPL(bpf_log);
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index 3660b573048a..745ae0cd01d4 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -335,61 +335,6 @@ find_linfo(const struct bpf_verifier_env *env, u32 insn_off)
 	return &linfo[i - 1];
 }
 
-void bpf_verifier_vlog(struct bpf_verifier_log *log, const char *fmt,
-		       va_list args)
-{
-	unsigned int n;
-
-	n = vscnprintf(log->kbuf, BPF_VERIFIER_TMP_LOG_SIZE, fmt, args);
-
-	WARN_ONCE(n >= BPF_VERIFIER_TMP_LOG_SIZE - 1,
-		  "verifier log line truncated - local buffer too short\n");
-
-	if (log->level == BPF_LOG_KERNEL) {
-		bool newline = n > 0 && log->kbuf[n - 1] == '\n';
-
-		pr_err("BPF: %s%s", log->kbuf, newline ? "" : "\n");
-		return;
-	}
-
-	n = min(log->len_total - log->len_used - 1, n);
-	log->kbuf[n] = '\0';
-	if (!copy_to_user(log->ubuf + log->len_used, log->kbuf, n + 1))
-		log->len_used += n;
-	else
-		log->ubuf = NULL;
-}
-
-static void bpf_vlog_reset(struct bpf_verifier_log *log, u32 new_pos)
-{
-	char zero = 0;
-
-	if (!bpf_verifier_log_needed(log))
-		return;
-
-	log->len_used = new_pos;
-	if (put_user(zero, log->ubuf + new_pos))
-		log->ubuf = NULL;
-}
-
-/* log_level controls verbosity level of eBPF verifier.
- * bpf_verifier_log_write() is used to dump the verification trace to the log,
- * so the user can figure out what's wrong with the program
- */
-__printf(2, 3) void bpf_verifier_log_write(struct bpf_verifier_env *env,
-					   const char *fmt, ...)
-{
-	va_list args;
-
-	if (!bpf_verifier_log_needed(&env->log))
-		return;
-
-	va_start(args, fmt);
-	bpf_verifier_vlog(&env->log, fmt, args);
-	va_end(args);
-}
-EXPORT_SYMBOL_GPL(bpf_verifier_log_write);
-
 __printf(2, 3) static void verbose(void *private_data, const char *fmt, ...)
 {
 	struct bpf_verifier_env *env = private_data;
@@ -403,20 +348,6 @@ __printf(2, 3) static void verbose(void *private_data, const char *fmt, ...)
 	va_end(args);
 }
 
-__printf(2, 3) void bpf_log(struct bpf_verifier_log *log,
-			    const char *fmt, ...)
-{
-	va_list args;
-
-	if (!bpf_verifier_log_needed(log))
-		return;
-
-	va_start(args, fmt);
-	bpf_verifier_vlog(log, fmt, args);
-	va_end(args);
-}
-EXPORT_SYMBOL_GPL(bpf_log);
-
 static const char *ltrim(const char *s)
 {
 	while (isspace(*s))
-- 
cgit v1.2.3


From 03cc3aa6a53394481f01c16231f99298332066f9 Mon Sep 17 00:00:00 2001
From: Andrii Nakryiko <andrii@kernel.org>
Date: Thu, 6 Apr 2023 16:41:48 -0700
Subject: bpf: Remove minimum size restrictions on verifier log buffer

It's not clear why we have 128 as minimum size, but it makes testing
harder and seems unnecessary, as we carefully handle truncation
scenarios and use proper snprintf variants. So remove this limitation
and just enforce positive length for log buffer.

Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Acked-by: Lorenz Bauer <lmb@isovalent.com>
Link: https://lore.kernel.org/bpf/20230406234205.323208-3-andrii@kernel.org
---
 kernel/bpf/log.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/bpf/log.c b/kernel/bpf/log.c
index 920061e38d2e..1974891fc324 100644
--- a/kernel/bpf/log.c
+++ b/kernel/bpf/log.c
@@ -11,7 +11,7 @@
 
 bool bpf_verifier_log_attr_valid(const struct bpf_verifier_log *log)
 {
-	return log->len_total >= 128 && log->len_total <= UINT_MAX >> 2 &&
+	return log->len_total > 0 && log->len_total <= UINT_MAX >> 2 &&
 	       log->level && log->ubuf && !(log->level & ~BPF_LOG_MASK);
 }
 
-- 
cgit v1.2.3


From 1216640938035e63bdbd32438e91c9bcc1fd8ee1 Mon Sep 17 00:00:00 2001
From: Andrii Nakryiko <andrii@kernel.org>
Date: Thu, 6 Apr 2023 16:41:49 -0700
Subject: bpf: Switch BPF verifier log to be a rotating log by default

Currently, if user-supplied log buffer to collect BPF verifier log turns
out to be too small to contain full log, bpf() syscall returns -ENOSPC,
fails BPF program verification/load, and preserves first N-1 bytes of
the verifier log (where N is the size of user-supplied buffer).

This is problematic in a bunch of common scenarios, especially when
working with real-world BPF programs that tend to be pretty complex as
far as verification goes and require big log buffers. Typically, it's
when debugging tricky cases at log level 2 (verbose). Also, when BPF program
is successfully validated, log level 2 is the only way to actually see
verifier state progression and all the important details.

Even with log level 1, it's possible to get -ENOSPC even if the final
verifier log fits in log buffer, if there is a code path that's deep
enough to fill up entire log, even if normally it would be reset later
on (there is a logic to chop off successfully validated portions of BPF
verifier log).

In short, it's not always possible to pre-size log buffer. Also, what's
worse, in practice, the end of the log most often is way more important
than the beginning, but verifier stops emitting log as soon as initial
log buffer is filled up.

This patch switches BPF verifier log behavior to effectively behave as
rotating log. That is, if user-supplied log buffer turns out to be too
short, verifier will keep overwriting previously written log,
effectively treating user's log buffer as a ring buffer. -ENOSPC is
still going to be returned at the end, to notify user that log contents
was truncated, but the important last N bytes of the log would be
returned, which might be all that user really needs. This consistent
-ENOSPC behavior, regardless of rotating or fixed log behavior, allows
to prevent backwards compatibility breakage. The only user-visible
change is which portion of verifier log user ends up seeing *if buffer
is too small*. Given contents of verifier log itself is not an ABI,
there is no breakage due to this behavior change. Specialized tools that
rely on specific contents of verifier log in -ENOSPC scenario are
expected to be easily adapted to accommodate old and new behaviors.

Importantly, though, to preserve good user experience and not require
every user-space application to adopt to this new behavior, before
exiting to user-space verifier will rotate log (in place) to make it
start at the very beginning of user buffer as a continuous
zero-terminated string. The contents will be a chopped off N-1 last
bytes of full verifier log, of course.

Given beginning of log is sometimes important as well, we add
BPF_LOG_FIXED (which equals 8) flag to force old behavior, which allows
tools like veristat to request first part of verifier log, if necessary.
BPF_LOG_FIXED flag is also a simple and straightforward way to check if
BPF verifier supports rotating behavior.

On the implementation side, conceptually, it's all simple. We maintain
64-bit logical start and end positions. If we need to truncate the log,
start position will be adjusted accordingly to lag end position by
N bytes. We then use those logical positions to calculate their matching
actual positions in user buffer and handle wrap around the end of the
buffer properly. Finally, right before returning from bpf_check(), we
rotate user log buffer contents in-place as necessary, to make log
contents contiguous. See comments in relevant functions for details.

Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Reviewed-by: Lorenz Bauer <lmb@isovalent.com>
Link: https://lore.kernel.org/bpf/20230406234205.323208-4-andrii@kernel.org
---
 include/linux/bpf_verifier.h                       |  33 +++-
 kernel/bpf/btf.c                                   |   3 +-
 kernel/bpf/log.c                                   | 198 ++++++++++++++++++++-
 kernel/bpf/verifier.c                              |  19 +-
 tools/testing/selftests/bpf/prog_tests/log_fixup.c |   1 +
 5 files changed, 228 insertions(+), 26 deletions(-)

(limited to 'kernel')

diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h
index 83dff25545ee..4c926227f612 100644
--- a/include/linux/bpf_verifier.h
+++ b/include/linux/bpf_verifier.h
@@ -491,25 +491,42 @@ struct bpf_insn_aux_data {
 #define BPF_VERIFIER_TMP_LOG_SIZE	1024
 
 struct bpf_verifier_log {
-	u32 level;
-	char kbuf[BPF_VERIFIER_TMP_LOG_SIZE];
+	/* Logical start and end positions of a "log window" of the verifier log.
+	 * start_pos == 0 means we haven't truncated anything.
+	 * Once truncation starts to happen, start_pos + len_total == end_pos,
+	 * except during log reset situations, in which (end_pos - start_pos)
+	 * might get smaller than len_total (see bpf_vlog_reset()).
+	 * Generally, (end_pos - start_pos) gives number of useful data in
+	 * user log buffer.
+	 */
+	u64 start_pos;
+	u64 end_pos;
 	char __user *ubuf;
-	u32 len_used;
+	u32 level;
 	u32 len_total;
+	char kbuf[BPF_VERIFIER_TMP_LOG_SIZE];
 };
 
 #define BPF_LOG_LEVEL1	1
 #define BPF_LOG_LEVEL2	2
 #define BPF_LOG_STATS	4
+#define BPF_LOG_FIXED	8
 #define BPF_LOG_LEVEL	(BPF_LOG_LEVEL1 | BPF_LOG_LEVEL2)
-#define BPF_LOG_MASK	(BPF_LOG_LEVEL | BPF_LOG_STATS)
+#define BPF_LOG_MASK	(BPF_LOG_LEVEL | BPF_LOG_STATS | BPF_LOG_FIXED)
 #define BPF_LOG_KERNEL	(BPF_LOG_MASK + 1) /* kernel internal flag */
 #define BPF_LOG_MIN_ALIGNMENT 8U
 #define BPF_LOG_ALIGNMENT 40U
 
+static inline u32 bpf_log_used(const struct bpf_verifier_log *log)
+{
+	return log->end_pos - log->start_pos;
+}
+
 static inline bool bpf_verifier_log_full(const struct bpf_verifier_log *log)
 {
-	return log->len_used >= log->len_total - 1;
+	if (log->level & BPF_LOG_FIXED)
+		return bpf_log_used(log) >= log->len_total - 1;
+	return false;
 }
 
 static inline bool bpf_verifier_log_needed(const struct bpf_verifier_log *log)
@@ -596,7 +613,7 @@ struct bpf_verifier_env {
 	u32 scratched_regs;
 	/* Same as scratched_regs but for stack slots */
 	u64 scratched_stack_slots;
-	u32 prev_log_len, prev_insn_print_len;
+	u64 prev_log_pos, prev_insn_print_pos;
 	/* buffer used in reg_type_str() to generate reg_type string */
 	char type_str_buf[TYPE_STR_BUF_LEN];
 };
@@ -608,7 +625,9 @@ __printf(2, 3) void bpf_verifier_log_write(struct bpf_verifier_env *env,
 					   const char *fmt, ...);
 __printf(2, 3) void bpf_log(struct bpf_verifier_log *log,
 			    const char *fmt, ...);
-void bpf_vlog_reset(struct bpf_verifier_log *log, u32 new_pos);
+void bpf_vlog_reset(struct bpf_verifier_log *log, u64 new_pos);
+void bpf_vlog_finalize(struct bpf_verifier_log *log);
+bool bpf_vlog_truncated(const struct bpf_verifier_log *log);
 
 static inline struct bpf_func_state *cur_func(struct bpf_verifier_env *env)
 {
diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c
index 593c45a294d0..20a05b8932db 100644
--- a/kernel/bpf/btf.c
+++ b/kernel/bpf/btf.c
@@ -5593,7 +5593,8 @@ static struct btf *btf_parse(bpfptr_t btf_data, u32 btf_data_size,
 		}
 	}
 
-	if (log->level && bpf_verifier_log_full(log)) {
+	bpf_vlog_finalize(log);
+	if (log->level && bpf_vlog_truncated(log)) {
 		err = -ENOSPC;
 		goto errout_meta;
 	}
diff --git a/kernel/bpf/log.c b/kernel/bpf/log.c
index 1974891fc324..92b1c8ad6601 100644
--- a/kernel/bpf/log.c
+++ b/kernel/bpf/log.c
@@ -8,6 +8,7 @@
 #include <linux/types.h>
 #include <linux/bpf.h>
 #include <linux/bpf_verifier.h>
+#include <linux/math64.h>
 
 bool bpf_verifier_log_attr_valid(const struct bpf_verifier_log *log)
 {
@@ -32,23 +33,202 @@ void bpf_verifier_vlog(struct bpf_verifier_log *log, const char *fmt,
 		return;
 	}
 
-	n = min(log->len_total - log->len_used - 1, n);
-	log->kbuf[n] = '\0';
-	if (!copy_to_user(log->ubuf + log->len_used, log->kbuf, n + 1))
-		log->len_used += n;
-	else
-		log->ubuf = NULL;
+	if (log->level & BPF_LOG_FIXED) {
+		n = min(log->len_total - bpf_log_used(log) - 1, n);
+		log->kbuf[n] = '\0';
+		n += 1;
+
+		if (copy_to_user(log->ubuf + log->end_pos, log->kbuf, n))
+			goto fail;
+
+		log->end_pos += n - 1; /* don't count terminating '\0' */
+	} else {
+		u64 new_end, new_start, cur_pos;
+		u32 buf_start, buf_end, new_n;
+
+		n += 1;
+
+		new_end = log->end_pos + n;
+		if (new_end - log->start_pos >= log->len_total)
+			new_start = new_end - log->len_total;
+		else
+			new_start = log->start_pos;
+		new_n = min(n, log->len_total);
+		cur_pos = new_end - new_n;
+
+		div_u64_rem(cur_pos, log->len_total, &buf_start);
+		div_u64_rem(new_end, log->len_total, &buf_end);
+		/* new_end and buf_end are exclusive indices, so if buf_end is
+		 * exactly zero, then it actually points right to the end of
+		 * ubuf and there is no wrap around
+		 */
+		if (buf_end == 0)
+			buf_end = log->len_total;
+
+		/* if buf_start > buf_end, we wrapped around;
+		 * if buf_start == buf_end, then we fill ubuf completely; we
+		 * can't have buf_start == buf_end to mean that there is
+		 * nothing to write, because we always write at least
+		 * something, even if terminal '\0'
+		 */
+		if (buf_start < buf_end) {
+			/* message fits within contiguous chunk of ubuf */
+			if (copy_to_user(log->ubuf + buf_start,
+					 log->kbuf + n - new_n,
+					 buf_end - buf_start))
+				goto fail;
+		} else {
+			/* message wraps around the end of ubuf, copy in two chunks */
+			if (copy_to_user(log->ubuf + buf_start,
+					 log->kbuf + n - new_n,
+					 log->len_total - buf_start))
+				goto fail;
+			if (copy_to_user(log->ubuf,
+					 log->kbuf + n - buf_end,
+					 buf_end))
+				goto fail;
+		}
+
+		log->start_pos = new_start;
+		log->end_pos = new_end - 1; /* don't count terminating '\0' */
+	}
+
+	return;
+fail:
+	log->ubuf = NULL;
 }
 
-void bpf_vlog_reset(struct bpf_verifier_log *log, u32 new_pos)
+void bpf_vlog_reset(struct bpf_verifier_log *log, u64 new_pos)
 {
 	char zero = 0;
+	u32 pos;
+
+	if (WARN_ON_ONCE(new_pos > log->end_pos))
+		return;
 
 	if (!bpf_verifier_log_needed(log))
 		return;
 
-	log->len_used = new_pos;
-	if (put_user(zero, log->ubuf + new_pos))
+	/* if position to which we reset is beyond current log window,
+	 * then we didn't preserve any useful content and should adjust
+	 * start_pos to end up with an empty log (start_pos == end_pos)
+	 */
+	log->end_pos = new_pos;
+	if (log->end_pos < log->start_pos)
+		log->start_pos = log->end_pos;
+	div_u64_rem(new_pos, log->len_total, &pos);
+	if (put_user(zero, log->ubuf + pos))
+		log->ubuf = NULL;
+}
+
+static void bpf_vlog_reverse_kbuf(char *buf, int len)
+{
+	int i, j;
+
+	for (i = 0, j = len - 1; i < j; i++, j--)
+		swap(buf[i], buf[j]);
+}
+
+static int bpf_vlog_reverse_ubuf(struct bpf_verifier_log *log, int start, int end)
+{
+	/* we split log->kbuf into two equal parts for both ends of array */
+	int n = sizeof(log->kbuf) / 2, nn;
+	char *lbuf = log->kbuf, *rbuf = log->kbuf + n;
+
+	/* Read ubuf's section [start, end) two chunks at a time, from left
+	 * and right side; within each chunk, swap all the bytes; after that
+	 * reverse the order of lbuf and rbuf and write result back to ubuf.
+	 * This way we'll end up with swapped contents of specified
+	 * [start, end) ubuf segment.
+	 */
+	while (end - start > 1) {
+		nn = min(n, (end - start ) / 2);
+
+		if (copy_from_user(lbuf, log->ubuf + start, nn))
+			return -EFAULT;
+		if (copy_from_user(rbuf, log->ubuf + end - nn, nn))
+			return -EFAULT;
+
+		bpf_vlog_reverse_kbuf(lbuf, nn);
+		bpf_vlog_reverse_kbuf(rbuf, nn);
+
+		/* we write lbuf to the right end of ubuf, while rbuf to the
+		 * left one to end up with properly reversed overall ubuf
+		 */
+		if (copy_to_user(log->ubuf + start, rbuf, nn))
+			return -EFAULT;
+		if (copy_to_user(log->ubuf + end - nn, lbuf, nn))
+			return -EFAULT;
+
+		start += nn;
+		end -= nn;
+	}
+
+	return 0;
+}
+
+bool bpf_vlog_truncated(const struct bpf_verifier_log *log)
+{
+	if (log->level & BPF_LOG_FIXED)
+		return bpf_log_used(log) >= log->len_total - 1;
+	else
+		return log->start_pos > 0;
+}
+
+void bpf_vlog_finalize(struct bpf_verifier_log *log)
+{
+	u32 sublen;
+	int err;
+
+	if (!log || !log->level || !log->ubuf)
+		return;
+	if ((log->level & BPF_LOG_FIXED) || log->level == BPF_LOG_KERNEL)
+		return;
+
+	/* If we never truncated log, there is nothing to move around. */
+	if (log->start_pos == 0)
+		return;
+
+	/* Otherwise we need to rotate log contents to make it start from the
+	 * buffer beginning and be a continuous zero-terminated string. Note
+	 * that if log->start_pos != 0 then we definitely filled up entire log
+	 * buffer with no gaps, and we just need to shift buffer contents to
+	 * the left by (log->start_pos % log->len_total) bytes.
+	 *
+	 * Unfortunately, user buffer could be huge and we don't want to
+	 * allocate temporary kernel memory of the same size just to shift
+	 * contents in a straightforward fashion. Instead, we'll be clever and
+	 * do in-place array rotation. This is a leetcode-style problem, which
+	 * could be solved by three rotations.
+	 *
+	 * Let's say we have log buffer that has to be shifted left by 7 bytes
+	 * (spaces and vertical bar is just for demonstrative purposes):
+	 *   E F G H I J K | A B C D
+	 *
+	 * First, we reverse entire array:
+	 *   D C B A | K J I H G F E
+	 *
+	 * Then we rotate first 4 bytes (DCBA) and separately last 7 bytes
+	 * (KJIHGFE), resulting in a properly rotated array:
+	 *   A B C D | E F G H I J K
+	 *
+	 * We'll utilize log->kbuf to read user memory chunk by chunk, swap
+	 * bytes, and write them back. Doing it byte-by-byte would be
+	 * unnecessarily inefficient. Altogether we are going to read and
+	 * write each byte twice, for total 4 memory copies between kernel and
+	 * user space.
+	 */
+
+	/* length of the chopped off part that will be the beginning;
+	 * len(ABCD) in the example above
+	 */
+	div_u64_rem(log->start_pos, log->len_total, &sublen);
+	sublen = log->len_total - sublen;
+
+	err = bpf_vlog_reverse_ubuf(log, 0, log->len_total);
+	err = err ?: bpf_vlog_reverse_ubuf(log, 0, sublen);
+	err = err ?: bpf_vlog_reverse_ubuf(log, sublen, log->len_total);
+	if (err)
 		log->ubuf = NULL;
 }
 
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index 745ae0cd01d4..a476bb319685 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -1439,10 +1439,10 @@ static inline u32 vlog_alignment(u32 pos)
 static void print_insn_state(struct bpf_verifier_env *env,
 			     const struct bpf_func_state *state)
 {
-	if (env->prev_log_len && env->prev_log_len == env->log.len_used) {
+	if (env->prev_log_pos && env->prev_log_pos == env->log.end_pos) {
 		/* remove new line character */
-		bpf_vlog_reset(&env->log, env->prev_log_len - 1);
-		verbose(env, "%*c;", vlog_alignment(env->prev_insn_print_len), ' ');
+		bpf_vlog_reset(&env->log, env->prev_log_pos - 1);
+		verbose(env, "%*c;", vlog_alignment(env->prev_insn_print_pos), ' ');
 	} else {
 		verbose(env, "%d:", env->insn_idx);
 	}
@@ -1750,7 +1750,7 @@ static struct bpf_verifier_state *push_stack(struct bpf_verifier_env *env,
 	elem->insn_idx = insn_idx;
 	elem->prev_insn_idx = prev_insn_idx;
 	elem->next = env->head;
-	elem->log_pos = env->log.len_used;
+	elem->log_pos = env->log.end_pos;
 	env->head = elem;
 	env->stack_size++;
 	err = copy_verifier_state(&elem->st, cur);
@@ -2286,7 +2286,7 @@ static struct bpf_verifier_state *push_async_cb(struct bpf_verifier_env *env,
 	elem->insn_idx = insn_idx;
 	elem->prev_insn_idx = prev_insn_idx;
 	elem->next = env->head;
-	elem->log_pos = env->log.len_used;
+	elem->log_pos = env->log.end_pos;
 	env->head = elem;
 	env->stack_size++;
 	if (env->stack_size > BPF_COMPLEXITY_LIMIT_JMP_SEQ) {
@@ -15638,11 +15638,11 @@ static int do_check(struct bpf_verifier_env *env)
 				print_insn_state(env, state->frame[state->curframe]);
 
 			verbose_linfo(env, env->insn_idx, "; ");
-			env->prev_log_len = env->log.len_used;
+			env->prev_log_pos = env->log.end_pos;
 			verbose(env, "%d: ", env->insn_idx);
 			print_bpf_insn(&cbs, insn, env->allow_ptr_leaks);
-			env->prev_insn_print_len = env->log.len_used - env->prev_log_len;
-			env->prev_log_len = env->log.len_used;
+			env->prev_insn_print_pos = env->log.end_pos - env->prev_log_pos;
+			env->prev_log_pos = env->log.end_pos;
 		}
 
 		if (bpf_prog_is_offloaded(env->prog->aux)) {
@@ -18860,7 +18860,8 @@ skip_full_check:
 	print_verification_stats(env);
 	env->prog->aux->verified_insns = env->insn_processed;
 
-	if (log->level && bpf_verifier_log_full(log))
+	bpf_vlog_finalize(log);
+	if (log->level && bpf_vlog_truncated(log))
 		ret = -ENOSPC;
 	if (log->level && !log->ubuf) {
 		ret = -EFAULT;
diff --git a/tools/testing/selftests/bpf/prog_tests/log_fixup.c b/tools/testing/selftests/bpf/prog_tests/log_fixup.c
index 239e1c5753b0..bc27170bdeb0 100644
--- a/tools/testing/selftests/bpf/prog_tests/log_fixup.c
+++ b/tools/testing/selftests/bpf/prog_tests/log_fixup.c
@@ -24,6 +24,7 @@ static void bad_core_relo(size_t log_buf_size, enum trunc_type trunc_type)
 	bpf_program__set_autoload(skel->progs.bad_relo, true);
 	memset(log_buf, 0, sizeof(log_buf));
 	bpf_program__set_log_buf(skel->progs.bad_relo, log_buf, log_buf_size ?: sizeof(log_buf));
+	bpf_program__set_log_level(skel->progs.bad_relo, 1 | 8); /* BPF_LOG_FIXED to force truncation */
 
 	err = test_log_fixup__load(skel);
 	if (!ASSERT_ERR(err, "load_fail"))
-- 
cgit v1.2.3


From 24bc80887adb4d6fc0057d4f14fabeaa4502b2a0 Mon Sep 17 00:00:00 2001
From: Andrii Nakryiko <andrii@kernel.org>
Date: Thu, 6 Apr 2023 16:41:53 -0700
Subject: bpf: Ignore verifier log reset in BPF_LOG_KERNEL mode

Verifier log position reset is meaningless in BPF_LOG_KERNEL mode, so
just exit early in bpf_vlog_reset() if log->level is BPF_LOG_KERNEL.

This avoid meaningless put_user() into NULL log->ubuf.

Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Acked-by: Lorenz Bauer <lmb@isovalent.com>
Link: https://lore.kernel.org/bpf/20230406234205.323208-8-andrii@kernel.org
---
 kernel/bpf/log.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/bpf/log.c b/kernel/bpf/log.c
index 92b1c8ad6601..d99a50f07187 100644
--- a/kernel/bpf/log.c
+++ b/kernel/bpf/log.c
@@ -106,7 +106,7 @@ void bpf_vlog_reset(struct bpf_verifier_log *log, u64 new_pos)
 	if (WARN_ON_ONCE(new_pos > log->end_pos))
 		return;
 
-	if (!bpf_verifier_log_needed(log))
+	if (!bpf_verifier_log_needed(log) || log->level == BPF_LOG_KERNEL)
 		return;
 
 	/* if position to which we reset is beyond current log window,
-- 
cgit v1.2.3


From 971fb5057d787d0a7e7c8cb910207c82e2db920e Mon Sep 17 00:00:00 2001
From: Andrii Nakryiko <andrii@kernel.org>
Date: Thu, 6 Apr 2023 16:41:54 -0700
Subject: bpf: Fix missing -EFAULT return on user log buf error in btf_parse()

btf_parse() is missing -EFAULT error return if log->ubuf was NULL-ed out
due to error while copying data into user-provided buffer. Add it, but
handle a special case of BPF_LOG_KERNEL in which log->ubuf is always NULL.

Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Acked-by: Lorenz Bauer <lmb@isovalent.com>
Link: https://lore.kernel.org/bpf/20230406234205.323208-9-andrii@kernel.org
---
 kernel/bpf/btf.c | 4 ++++
 1 file changed, 4 insertions(+)

(limited to 'kernel')

diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c
index 20a05b8932db..6372c144a294 100644
--- a/kernel/bpf/btf.c
+++ b/kernel/bpf/btf.c
@@ -5598,6 +5598,10 @@ static struct btf *btf_parse(bpfptr_t btf_data, u32 btf_data_size,
 		err = -ENOSPC;
 		goto errout_meta;
 	}
+	if (log->level && log->level != BPF_LOG_KERNEL && !log->ubuf) {
+		err = -EFAULT;
+		goto errout_meta;
+	}
 
 	btf_verifier_env_free(env);
 	refcount_set(&btf->refcnt, 1);
-- 
cgit v1.2.3


From cbedb42a0da3bb48819b2200af4b4cb5d922c518 Mon Sep 17 00:00:00 2001
From: Andrii Nakryiko <andrii@kernel.org>
Date: Thu, 6 Apr 2023 16:41:55 -0700
Subject: bpf: Avoid incorrect -EFAULT error in BPF_LOG_KERNEL mode

If verifier log is in BPF_LOG_KERNEL mode, no log->ubuf is expected and
it stays NULL throughout entire verification process. Don't erroneously
return -EFAULT in such case.

Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Acked-by: Lorenz Bauer <lmb@isovalent.com>
Link: https://lore.kernel.org/bpf/20230406234205.323208-10-andrii@kernel.org
---
 kernel/bpf/verifier.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index a476bb319685..0323149803f5 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -18863,7 +18863,7 @@ skip_full_check:
 	bpf_vlog_finalize(log);
 	if (log->level && bpf_vlog_truncated(log))
 		ret = -ENOSPC;
-	if (log->level && !log->ubuf) {
+	if (log->level && log->level != BPF_LOG_KERNEL && !log->ubuf) {
 		ret = -EFAULT;
 		goto err_release_maps;
 	}
-- 
cgit v1.2.3


From 8a6ca6bc553e3c878fa53c506bc6ec281efdc039 Mon Sep 17 00:00:00 2001
From: Andrii Nakryiko <andrii@kernel.org>
Date: Thu, 6 Apr 2023 16:41:56 -0700
Subject: bpf: Simplify logging-related error conditions handling

Move log->level == 0 check into bpf_vlog_truncated() instead of doing it
explicitly. Also remove unnecessary goto in kernel/bpf/verifier.c.

Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Acked-by: Lorenz Bauer <lmb@isovalent.com>
Link: https://lore.kernel.org/bpf/20230406234205.323208-11-andrii@kernel.org
---
 kernel/bpf/btf.c      | 2 +-
 kernel/bpf/log.c      | 4 +++-
 kernel/bpf/verifier.c | 6 ++----
 3 files changed, 6 insertions(+), 6 deletions(-)

(limited to 'kernel')

diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c
index 6372c144a294..5aa540ee611f 100644
--- a/kernel/bpf/btf.c
+++ b/kernel/bpf/btf.c
@@ -5594,7 +5594,7 @@ static struct btf *btf_parse(bpfptr_t btf_data, u32 btf_data_size,
 	}
 
 	bpf_vlog_finalize(log);
-	if (log->level && bpf_vlog_truncated(log)) {
+	if (bpf_vlog_truncated(log)) {
 		err = -ENOSPC;
 		goto errout_meta;
 	}
diff --git a/kernel/bpf/log.c b/kernel/bpf/log.c
index d99a50f07187..c778f3b290cb 100644
--- a/kernel/bpf/log.c
+++ b/kernel/bpf/log.c
@@ -169,7 +169,9 @@ static int bpf_vlog_reverse_ubuf(struct bpf_verifier_log *log, int start, int en
 
 bool bpf_vlog_truncated(const struct bpf_verifier_log *log)
 {
-	if (log->level & BPF_LOG_FIXED)
+	if (!log->level)
+		return false;
+	else if (log->level & BPF_LOG_FIXED)
 		return bpf_log_used(log) >= log->len_total - 1;
 	else
 		return log->start_pos > 0;
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index 0323149803f5..a98cbc046d1e 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -18861,12 +18861,10 @@ skip_full_check:
 	env->prog->aux->verified_insns = env->insn_processed;
 
 	bpf_vlog_finalize(log);
-	if (log->level && bpf_vlog_truncated(log))
+	if (bpf_vlog_truncated(log))
 		ret = -ENOSPC;
-	if (log->level && log->level != BPF_LOG_KERNEL && !log->ubuf) {
+	if (log->level && log->level != BPF_LOG_KERNEL && !log->ubuf)
 		ret = -EFAULT;
-		goto err_release_maps;
-	}
 
 	if (ret)
 		goto err_release_maps;
-- 
cgit v1.2.3


From fa1c7d5cc404ac3b6e6b4ab6d00b07c76bd819be Mon Sep 17 00:00:00 2001
From: Andrii Nakryiko <andrii@kernel.org>
Date: Thu, 6 Apr 2023 16:41:57 -0700
Subject: bpf: Keep track of total log content size in both fixed and rolling
 modes

Change how we do accounting in BPF_LOG_FIXED mode and adopt log->end_pos
as *logical* log position. This means that we can go beyond physical log
buffer size now and be able to tell what log buffer size should be to
fit entire log contents without -ENOSPC.

To do this for BPF_LOG_FIXED mode, we need to remove a short-circuiting
logic of not vsnprintf()'ing further log content once we filled up
user-provided buffer, which is done by bpf_verifier_log_needed() checks.
We modify these checks to always keep going if log->level is non-zero
(i.e., log is requested), even if log->ubuf was NULL'ed out due to
copying data to user-space, or if entire log buffer is physically full.
We adopt bpf_verifier_vlog() routine to work correctly with
log->ubuf == NULL condition, performing log formatting into temporary
kernel buffer, doing all the necessary accounting, but just avoiding
copying data out if buffer is full or NULL'ed out.

With these changes, it's now possible to do this sort of determination of
log contents size in both BPF_LOG_FIXED and default rolling log mode.
We need to keep in mind bpf_vlog_reset(), though, which shrinks log
contents after successful verification of a particular code path. This
log reset means that log->end_pos isn't always increasing, so to return
back to users what should be the log buffer size to fit all log content
without causing -ENOSPC even in the presence of log resetting, we need
to keep maximum over "lifetime" of logging. We do this accounting in
bpf_vlog_update_len_max() helper.

A related and subtle aspect is that with this logical log->end_pos even in
BPF_LOG_FIXED mode we could temporary "overflow" buffer, but then reset
it back with bpf_vlog_reset() to a position inside user-supplied
log_buf. In such situation we still want to properly maintain
terminating zero. We will eventually return -ENOSPC even if final log
buffer is small (we detect this through log->len_max check). This
behavior is simpler to reason about and is consistent with current
behavior of verifier log. Handling of this required a small addition to
bpf_vlog_reset() logic to avoid doing put_user() beyond physical log
buffer dimensions.

Another issue to keep in mind is that we limit log buffer size to 32-bit
value and keep such log length as u32, but theoretically verifier could
produce huge log stretching beyond 4GB. Instead of keeping (and later
returning) 64-bit log length, we cap it at UINT_MAX. Current UAPI makes
it impossible to specify log buffer size bigger than 4GB anyways, so we
don't really loose anything here and keep everything consistently 32-bit
in UAPI. This property will be utilized in next patch.

Doing the same determination of maximum log buffer for rolling mode is
trivial, as log->end_pos and log->start_pos are already logical
positions, so there is nothing new there.

These changes do incidentally fix one small issue with previous logging
logic. Previously, if use provided log buffer of size N, and actual log
output was exactly N-1 bytes + terminating \0, kernel logic coun't
distinguish this condition from log truncation scenario which would end
up with truncated log contents of N-1 bytes + terminating \0 as well.

But now with log->end_pos being logical position that could go beyond
actual log buffer size, we can distinguish these two conditions, which
we do in this patch. This plays nicely with returning log_size_actual
(implemented in UAPI in the next patch), as we can now guarantee that if
user takes such log_size_actual and provides log buffer of that exact
size, they will not get -ENOSPC in return.

All in all, all these changes do conceptually unify fixed and rolling
log modes much better, and allow a nice feature requested by users:
knowing what should be the size of the buffer to avoid -ENOSPC.

We'll plumb this through the UAPI and the code in the next patch.

Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Acked-by: Lorenz Bauer <lmb@isovalent.com>
Link: https://lore.kernel.org/bpf/20230406234205.323208-12-andrii@kernel.org
---
 include/linux/bpf_verifier.h | 12 ++------
 kernel/bpf/log.c             | 67 ++++++++++++++++++++++++++++++--------------
 2 files changed, 49 insertions(+), 30 deletions(-)

(limited to 'kernel')

diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h
index 4c926227f612..98d2eb382dbb 100644
--- a/include/linux/bpf_verifier.h
+++ b/include/linux/bpf_verifier.h
@@ -504,6 +504,7 @@ struct bpf_verifier_log {
 	char __user *ubuf;
 	u32 level;
 	u32 len_total;
+	u32 len_max;
 	char kbuf[BPF_VERIFIER_TMP_LOG_SIZE];
 };
 
@@ -517,23 +518,16 @@ struct bpf_verifier_log {
 #define BPF_LOG_MIN_ALIGNMENT 8U
 #define BPF_LOG_ALIGNMENT 40U
 
-static inline u32 bpf_log_used(const struct bpf_verifier_log *log)
-{
-	return log->end_pos - log->start_pos;
-}
-
 static inline bool bpf_verifier_log_full(const struct bpf_verifier_log *log)
 {
 	if (log->level & BPF_LOG_FIXED)
-		return bpf_log_used(log) >= log->len_total - 1;
+		return log->end_pos >= log->len_total;
 	return false;
 }
 
 static inline bool bpf_verifier_log_needed(const struct bpf_verifier_log *log)
 {
-	return log &&
-		((log->level && log->ubuf && !bpf_verifier_log_full(log)) ||
-		 log->level == BPF_LOG_KERNEL);
+	return log && log->level;
 }
 
 #define BPF_MAX_SUBPROGS 256
diff --git a/kernel/bpf/log.c b/kernel/bpf/log.c
index c778f3b290cb..47bea2fad6fe 100644
--- a/kernel/bpf/log.c
+++ b/kernel/bpf/log.c
@@ -16,10 +16,26 @@ bool bpf_verifier_log_attr_valid(const struct bpf_verifier_log *log)
 	       log->level && log->ubuf && !(log->level & ~BPF_LOG_MASK);
 }
 
+static void bpf_vlog_update_len_max(struct bpf_verifier_log *log, u32 add_len)
+{
+	/* add_len includes terminal \0, so no need for +1. */
+	u64 len = log->end_pos + add_len;
+
+	/* log->len_max could be larger than our current len due to
+	 * bpf_vlog_reset() calls, so we maintain the max of any length at any
+	 * previous point
+	 */
+	if (len > UINT_MAX)
+		log->len_max = UINT_MAX;
+	else if (len > log->len_max)
+		log->len_max = len;
+}
+
 void bpf_verifier_vlog(struct bpf_verifier_log *log, const char *fmt,
 		       va_list args)
 {
-	unsigned int n;
+	u64 cur_pos;
+	u32 new_n, n;
 
 	n = vscnprintf(log->kbuf, BPF_VERIFIER_TMP_LOG_SIZE, fmt, args);
 
@@ -33,21 +49,27 @@ void bpf_verifier_vlog(struct bpf_verifier_log *log, const char *fmt,
 		return;
 	}
 
-	if (log->level & BPF_LOG_FIXED) {
-		n = min(log->len_total - bpf_log_used(log) - 1, n);
-		log->kbuf[n] = '\0';
-		n += 1;
+	n += 1; /* include terminating zero */
+	bpf_vlog_update_len_max(log, n);
 
-		if (copy_to_user(log->ubuf + log->end_pos, log->kbuf, n))
-			goto fail;
+	if (log->level & BPF_LOG_FIXED) {
+		/* check if we have at least something to put into user buf */
+		new_n = 0;
+		if (log->end_pos < log->len_total) {
+			new_n = min_t(u32, log->len_total - log->end_pos, n);
+			log->kbuf[new_n - 1] = '\0';
+		}
 
+		cur_pos = log->end_pos;
 		log->end_pos += n - 1; /* don't count terminating '\0' */
+
+		if (log->ubuf && new_n &&
+		    copy_to_user(log->ubuf + cur_pos, log->kbuf, new_n))
+			goto fail;
 	} else {
-		u64 new_end, new_start, cur_pos;
+		u64 new_end, new_start;
 		u32 buf_start, buf_end, new_n;
 
-		n += 1;
-
 		new_end = log->end_pos + n;
 		if (new_end - log->start_pos >= log->len_total)
 			new_start = new_end - log->len_total;
@@ -65,6 +87,12 @@ void bpf_verifier_vlog(struct bpf_verifier_log *log, const char *fmt,
 		if (buf_end == 0)
 			buf_end = log->len_total;
 
+		log->start_pos = new_start;
+		log->end_pos = new_end - 1; /* don't count terminating '\0' */
+
+		if (!log->ubuf)
+			return;
+
 		/* if buf_start > buf_end, we wrapped around;
 		 * if buf_start == buf_end, then we fill ubuf completely; we
 		 * can't have buf_start == buf_end to mean that there is
@@ -88,9 +116,6 @@ void bpf_verifier_vlog(struct bpf_verifier_log *log, const char *fmt,
 					 buf_end))
 				goto fail;
 		}
-
-		log->start_pos = new_start;
-		log->end_pos = new_end - 1; /* don't count terminating '\0' */
 	}
 
 	return;
@@ -116,8 +141,13 @@ void bpf_vlog_reset(struct bpf_verifier_log *log, u64 new_pos)
 	log->end_pos = new_pos;
 	if (log->end_pos < log->start_pos)
 		log->start_pos = log->end_pos;
-	div_u64_rem(new_pos, log->len_total, &pos);
-	if (put_user(zero, log->ubuf + pos))
+
+	if (log->level & BPF_LOG_FIXED)
+		pos = log->end_pos + 1;
+	else
+		div_u64_rem(new_pos, log->len_total, &pos);
+
+	if (log->ubuf && pos < log->len_total && put_user(zero, log->ubuf + pos))
 		log->ubuf = NULL;
 }
 
@@ -169,12 +199,7 @@ static int bpf_vlog_reverse_ubuf(struct bpf_verifier_log *log, int start, int en
 
 bool bpf_vlog_truncated(const struct bpf_verifier_log *log)
 {
-	if (!log->level)
-		return false;
-	else if (log->level & BPF_LOG_FIXED)
-		return bpf_log_used(log) >= log->len_total - 1;
-	else
-		return log->start_pos > 0;
+	return log->len_max > log->len_total;
 }
 
 void bpf_vlog_finalize(struct bpf_verifier_log *log)
-- 
cgit v1.2.3


From 47a71c1f9af0a334c9dfa97633c41de4feda4287 Mon Sep 17 00:00:00 2001
From: Andrii Nakryiko <andrii@kernel.org>
Date: Thu, 6 Apr 2023 16:41:58 -0700
Subject: bpf: Add log_true_size output field to return necessary log buffer
 size

Add output-only log_true_size and btf_log_true_size field to
BPF_PROG_LOAD and BPF_BTF_LOAD commands, respectively. It will return
the size of log buffer necessary to fit in all the log contents at
specified log_level. This is very useful for BPF loader libraries like
libbpf to be able to size log buffer correctly, but could be used by
users directly, if necessary, as well.

This patch plumbs all this through the code, taking into account actual
bpf_attr size provided by user to determine if these new fields are
expected by users. And if they are, set them from kernel on return.

We refactory btf_parse() function to accommodate this, moving attr and
uattr handling inside it. The rest is very straightforward code, which
is split from the logging accounting changes in the previous patch to
make it simpler to review logic vs UAPI changes.

Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Acked-by: Lorenz Bauer <lmb@isovalent.com>
Link: https://lore.kernel.org/bpf/20230406234205.323208-13-andrii@kernel.org
---
 include/linux/bpf.h            |  2 +-
 include/linux/btf.h            |  2 +-
 include/uapi/linux/bpf.h       | 10 ++++++++++
 kernel/bpf/btf.c               | 32 ++++++++++++++++++--------------
 kernel/bpf/syscall.c           | 16 ++++++++--------
 kernel/bpf/verifier.c          |  8 +++++++-
 tools/include/uapi/linux/bpf.h | 12 +++++++++++-
 7 files changed, 56 insertions(+), 26 deletions(-)

(limited to 'kernel')

diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index 002a811b6b90..2c6095bd7d69 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -2175,7 +2175,7 @@ int bpf_check_uarg_tail_zero(bpfptr_t uaddr, size_t expected_size,
 			     size_t actual_size);
 
 /* verify correctness of eBPF program */
-int bpf_check(struct bpf_prog **fp, union bpf_attr *attr, bpfptr_t uattr);
+int bpf_check(struct bpf_prog **fp, union bpf_attr *attr, bpfptr_t uattr, u32 uattr_size);
 
 #ifndef CONFIG_BPF_JIT_ALWAYS_ON
 void bpf_patch_call_args(struct bpf_insn *insn, u32 stack_depth);
diff --git a/include/linux/btf.h b/include/linux/btf.h
index d53b10cc55f2..495250162422 100644
--- a/include/linux/btf.h
+++ b/include/linux/btf.h
@@ -125,7 +125,7 @@ extern const struct file_operations btf_fops;
 
 void btf_get(struct btf *btf);
 void btf_put(struct btf *btf);
-int btf_new_fd(const union bpf_attr *attr, bpfptr_t uattr);
+int btf_new_fd(const union bpf_attr *attr, bpfptr_t uattr, u32 uattr_sz);
 struct btf *btf_get_by_fd(int fd);
 int btf_get_info_by_fd(const struct btf *btf,
 		       const union bpf_attr *attr,
diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index e3d3b5160d26..3823100b7934 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -1407,6 +1407,11 @@ union bpf_attr {
 		__aligned_u64	fd_array;	/* array of FDs */
 		__aligned_u64	core_relos;
 		__u32		core_relo_rec_size; /* sizeof(struct bpf_core_relo) */
+		/* output: actual total log contents size (including termintaing zero).
+		 * It could be both larger than original log_size (if log was
+		 * truncated), or smaller (if log buffer wasn't filled completely).
+		 */
+		__u32		log_true_size;
 	};
 
 	struct { /* anonymous struct used by BPF_OBJ_* commands */
@@ -1492,6 +1497,11 @@ union bpf_attr {
 		__u32		btf_size;
 		__u32		btf_log_size;
 		__u32		btf_log_level;
+		/* output: actual total log contents size (including termintaing zero).
+		 * It could be both larger than original log_size (if log was
+		 * truncated), or smaller (if log buffer wasn't filled completely).
+		 */
+		__u32		btf_log_true_size;
 	};
 
 	struct {
diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c
index 5aa540ee611f..0748cf4b8ab6 100644
--- a/kernel/bpf/btf.c
+++ b/kernel/bpf/btf.c
@@ -5504,9 +5504,10 @@ static int btf_check_type_tags(struct btf_verifier_env *env,
 	return 0;
 }
 
-static struct btf *btf_parse(bpfptr_t btf_data, u32 btf_data_size,
-			     u32 log_level, char __user *log_ubuf, u32 log_size)
+static struct btf *btf_parse(const union bpf_attr *attr, bpfptr_t uattr, u32 uattr_size)
 {
+	bpfptr_t btf_data = make_bpfptr(attr->btf, uattr.is_kernel);
+	char __user *log_ubuf = u64_to_user_ptr(attr->btf_log_buf);
 	struct btf_struct_metas *struct_meta_tab;
 	struct btf_verifier_env *env = NULL;
 	struct bpf_verifier_log *log;
@@ -5514,7 +5515,7 @@ static struct btf *btf_parse(bpfptr_t btf_data, u32 btf_data_size,
 	u8 *data;
 	int err;
 
-	if (btf_data_size > BTF_MAX_SIZE)
+	if (attr->btf_size > BTF_MAX_SIZE)
 		return ERR_PTR(-E2BIG);
 
 	env = kzalloc(sizeof(*env), GFP_KERNEL | __GFP_NOWARN);
@@ -5522,13 +5523,13 @@ static struct btf *btf_parse(bpfptr_t btf_data, u32 btf_data_size,
 		return ERR_PTR(-ENOMEM);
 
 	log = &env->log;
-	if (log_level || log_ubuf || log_size) {
+	if (attr->btf_log_level || log_ubuf || attr->btf_log_size) {
 		/* user requested verbose verifier output
 		 * and supplied buffer to store the verification trace
 		 */
-		log->level = log_level;
+		log->level = attr->btf_log_level;
 		log->ubuf = log_ubuf;
-		log->len_total = log_size;
+		log->len_total = attr->btf_log_size;
 
 		/* log attributes have to be sane */
 		if (!bpf_verifier_log_attr_valid(log)) {
@@ -5544,16 +5545,16 @@ static struct btf *btf_parse(bpfptr_t btf_data, u32 btf_data_size,
 	}
 	env->btf = btf;
 
-	data = kvmalloc(btf_data_size, GFP_KERNEL | __GFP_NOWARN);
+	data = kvmalloc(attr->btf_size, GFP_KERNEL | __GFP_NOWARN);
 	if (!data) {
 		err = -ENOMEM;
 		goto errout;
 	}
 
 	btf->data = data;
-	btf->data_size = btf_data_size;
+	btf->data_size = attr->btf_size;
 
-	if (copy_from_bpfptr(data, btf_data, btf_data_size)) {
+	if (copy_from_bpfptr(data, btf_data, attr->btf_size)) {
 		err = -EFAULT;
 		goto errout;
 	}
@@ -5594,6 +5595,12 @@ static struct btf *btf_parse(bpfptr_t btf_data, u32 btf_data_size,
 	}
 
 	bpf_vlog_finalize(log);
+	if (uattr_size >= offsetofend(union bpf_attr, btf_log_true_size) &&
+	    copy_to_bpfptr_offset(uattr, offsetof(union bpf_attr, btf_log_true_size),
+				  &log->len_max, sizeof(log->len_max))) {
+		err = -EFAULT;
+		goto errout_meta;
+	}
 	if (bpf_vlog_truncated(log)) {
 		err = -ENOSPC;
 		goto errout_meta;
@@ -7218,15 +7225,12 @@ static int __btf_new_fd(struct btf *btf)
 	return anon_inode_getfd("btf", &btf_fops, btf, O_RDONLY | O_CLOEXEC);
 }
 
-int btf_new_fd(const union bpf_attr *attr, bpfptr_t uattr)
+int btf_new_fd(const union bpf_attr *attr, bpfptr_t uattr, u32 uattr_size)
 {
 	struct btf *btf;
 	int ret;
 
-	btf = btf_parse(make_bpfptr(attr->btf, uattr.is_kernel),
-			attr->btf_size, attr->btf_log_level,
-			u64_to_user_ptr(attr->btf_log_buf),
-			attr->btf_log_size);
+	btf = btf_parse(attr, uattr, uattr_size);
 	if (IS_ERR(btf))
 		return PTR_ERR(btf);
 
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index e18ac7fdc210..6d575505f89c 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -2501,9 +2501,9 @@ static bool is_perfmon_prog_type(enum bpf_prog_type prog_type)
 }
 
 /* last field in 'union bpf_attr' used by this command */
-#define	BPF_PROG_LOAD_LAST_FIELD core_relo_rec_size
+#define	BPF_PROG_LOAD_LAST_FIELD log_true_size
 
-static int bpf_prog_load(union bpf_attr *attr, bpfptr_t uattr)
+static int bpf_prog_load(union bpf_attr *attr, bpfptr_t uattr, u32 uattr_size)
 {
 	enum bpf_prog_type type = attr->prog_type;
 	struct bpf_prog *prog, *dst_prog = NULL;
@@ -2653,7 +2653,7 @@ static int bpf_prog_load(union bpf_attr *attr, bpfptr_t uattr)
 		goto free_prog_sec;
 
 	/* run eBPF verifier */
-	err = bpf_check(&prog, attr, uattr);
+	err = bpf_check(&prog, attr, uattr, uattr_size);
 	if (err < 0)
 		goto free_used_maps;
 
@@ -4371,9 +4371,9 @@ static int bpf_obj_get_info_by_fd(const union bpf_attr *attr,
 	return err;
 }
 
-#define BPF_BTF_LOAD_LAST_FIELD btf_log_level
+#define BPF_BTF_LOAD_LAST_FIELD btf_log_true_size
 
-static int bpf_btf_load(const union bpf_attr *attr, bpfptr_t uattr)
+static int bpf_btf_load(const union bpf_attr *attr, bpfptr_t uattr, __u32 uattr_size)
 {
 	if (CHECK_ATTR(BPF_BTF_LOAD))
 		return -EINVAL;
@@ -4381,7 +4381,7 @@ static int bpf_btf_load(const union bpf_attr *attr, bpfptr_t uattr)
 	if (!bpf_capable())
 		return -EPERM;
 
-	return btf_new_fd(attr, uattr);
+	return btf_new_fd(attr, uattr, uattr_size);
 }
 
 #define BPF_BTF_GET_FD_BY_ID_LAST_FIELD btf_id
@@ -5059,7 +5059,7 @@ static int __sys_bpf(int cmd, bpfptr_t uattr, unsigned int size)
 		err = map_freeze(&attr);
 		break;
 	case BPF_PROG_LOAD:
-		err = bpf_prog_load(&attr, uattr);
+		err = bpf_prog_load(&attr, uattr, size);
 		break;
 	case BPF_OBJ_PIN:
 		err = bpf_obj_pin(&attr);
@@ -5104,7 +5104,7 @@ static int __sys_bpf(int cmd, bpfptr_t uattr, unsigned int size)
 		err = bpf_raw_tracepoint_open(&attr);
 		break;
 	case BPF_BTF_LOAD:
-		err = bpf_btf_load(&attr, uattr);
+		err = bpf_btf_load(&attr, uattr, size);
 		break;
 	case BPF_BTF_GET_FD_BY_ID:
 		err = bpf_btf_get_fd_by_id(&attr);
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index a98cbc046d1e..308e7abeb979 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -18694,7 +18694,7 @@ struct btf *bpf_get_btf_vmlinux(void)
 	return btf_vmlinux;
 }
 
-int bpf_check(struct bpf_prog **prog, union bpf_attr *attr, bpfptr_t uattr)
+int bpf_check(struct bpf_prog **prog, union bpf_attr *attr, bpfptr_t uattr, __u32 uattr_size)
 {
 	u64 start_time = ktime_get_ns();
 	struct bpf_verifier_env *env;
@@ -18861,6 +18861,12 @@ skip_full_check:
 	env->prog->aux->verified_insns = env->insn_processed;
 
 	bpf_vlog_finalize(log);
+	if (uattr_size >= offsetofend(union bpf_attr, log_true_size) &&
+	    copy_to_bpfptr_offset(uattr, offsetof(union bpf_attr, log_true_size),
+				  &log->len_max, sizeof(log->len_max))) {
+		ret = -EFAULT;
+		goto err_release_maps;
+	}
 	if (bpf_vlog_truncated(log))
 		ret = -ENOSPC;
 	if (log->level && log->level != BPF_LOG_KERNEL && !log->ubuf)
diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h
index d6c5a022ae28..3823100b7934 100644
--- a/tools/include/uapi/linux/bpf.h
+++ b/tools/include/uapi/linux/bpf.h
@@ -1407,6 +1407,11 @@ union bpf_attr {
 		__aligned_u64	fd_array;	/* array of FDs */
 		__aligned_u64	core_relos;
 		__u32		core_relo_rec_size; /* sizeof(struct bpf_core_relo) */
+		/* output: actual total log contents size (including termintaing zero).
+		 * It could be both larger than original log_size (if log was
+		 * truncated), or smaller (if log buffer wasn't filled completely).
+		 */
+		__u32		log_true_size;
 	};
 
 	struct { /* anonymous struct used by BPF_OBJ_* commands */
@@ -1492,6 +1497,11 @@ union bpf_attr {
 		__u32		btf_size;
 		__u32		btf_log_size;
 		__u32		btf_log_level;
+		/* output: actual total log contents size (including termintaing zero).
+		 * It could be both larger than original log_size (if log was
+		 * truncated), or smaller (if log buffer wasn't filled completely).
+		 */
+		__u32		btf_log_true_size;
 	};
 
 	struct {
@@ -1513,7 +1523,7 @@ union bpf_attr {
 	struct { /* struct used by BPF_LINK_CREATE command */
 		union {
 			__u32		prog_fd;	/* eBPF program to attach */
-			__u32		map_fd;		/* eBPF struct_ops to attach */
+			__u32		map_fd;		/* struct_ops to attach */
 		};
 		union {
 			__u32		target_fd;	/* object to attach to */
-- 
cgit v1.2.3


From bdcab4144f5da97cc0fa7e1dd63b8475e10c8f0a Mon Sep 17 00:00:00 2001
From: Andrii Nakryiko <andrii@kernel.org>
Date: Thu, 6 Apr 2023 16:41:59 -0700
Subject: bpf: Simplify internal verifier log interface

Simplify internal verifier log API down to bpf_vlog_init() and
bpf_vlog_finalize(). The former handles input arguments validation in
one place and makes it easier to change it. The latter subsumes -ENOSPC
(truncation) and -EFAULT handling and simplifies both caller's code
(bpf_check() and btf_parse()).

For btf_parse(), this patch also makes sure that verifier log
finalization happens even if there is some error condition during BTF
verification process prior to normal finalization step.

Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Acked-by: Lorenz Bauer <lmb@isovalent.com>
Link: https://lore.kernel.org/bpf/20230406234205.323208-14-andrii@kernel.org
---
 include/linux/bpf_verifier.h | 13 ++-------
 kernel/bpf/btf.c             | 65 ++++++++++++++++++++++----------------------
 kernel/bpf/log.c             | 48 ++++++++++++++++++++++++++------
 kernel/bpf/verifier.c        | 39 +++++++++++---------------
 4 files changed, 90 insertions(+), 75 deletions(-)

(limited to 'kernel')

diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h
index 98d2eb382dbb..f03852b89d28 100644
--- a/include/linux/bpf_verifier.h
+++ b/include/linux/bpf_verifier.h
@@ -518,13 +518,6 @@ struct bpf_verifier_log {
 #define BPF_LOG_MIN_ALIGNMENT 8U
 #define BPF_LOG_ALIGNMENT 40U
 
-static inline bool bpf_verifier_log_full(const struct bpf_verifier_log *log)
-{
-	if (log->level & BPF_LOG_FIXED)
-		return log->end_pos >= log->len_total;
-	return false;
-}
-
 static inline bool bpf_verifier_log_needed(const struct bpf_verifier_log *log)
 {
 	return log && log->level;
@@ -612,16 +605,16 @@ struct bpf_verifier_env {
 	char type_str_buf[TYPE_STR_BUF_LEN];
 };
 
-bool bpf_verifier_log_attr_valid(const struct bpf_verifier_log *log);
 __printf(2, 0) void bpf_verifier_vlog(struct bpf_verifier_log *log,
 				      const char *fmt, va_list args);
 __printf(2, 3) void bpf_verifier_log_write(struct bpf_verifier_env *env,
 					   const char *fmt, ...);
 __printf(2, 3) void bpf_log(struct bpf_verifier_log *log,
 			    const char *fmt, ...);
+int bpf_vlog_init(struct bpf_verifier_log *log, u32 log_level,
+		  char __user *log_buf, u32 log_size);
 void bpf_vlog_reset(struct bpf_verifier_log *log, u64 new_pos);
-void bpf_vlog_finalize(struct bpf_verifier_log *log);
-bool bpf_vlog_truncated(const struct bpf_verifier_log *log);
+int bpf_vlog_finalize(struct bpf_verifier_log *log, u32 *log_size_actual);
 
 static inline struct bpf_func_state *cur_func(struct bpf_verifier_env *env)
 {
diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c
index 0748cf4b8ab6..ffc31a1c84af 100644
--- a/kernel/bpf/btf.c
+++ b/kernel/bpf/btf.c
@@ -5504,16 +5504,30 @@ static int btf_check_type_tags(struct btf_verifier_env *env,
 	return 0;
 }
 
+static int finalize_log(struct bpf_verifier_log *log, bpfptr_t uattr, u32 uattr_size)
+{
+	u32 log_true_size;
+	int err;
+
+	err = bpf_vlog_finalize(log, &log_true_size);
+
+	if (uattr_size >= offsetofend(union bpf_attr, btf_log_true_size) &&
+	    copy_to_bpfptr_offset(uattr, offsetof(union bpf_attr, btf_log_true_size),
+				  &log_true_size, sizeof(log_true_size)))
+		err = -EFAULT;
+
+	return err;
+}
+
 static struct btf *btf_parse(const union bpf_attr *attr, bpfptr_t uattr, u32 uattr_size)
 {
 	bpfptr_t btf_data = make_bpfptr(attr->btf, uattr.is_kernel);
 	char __user *log_ubuf = u64_to_user_ptr(attr->btf_log_buf);
 	struct btf_struct_metas *struct_meta_tab;
 	struct btf_verifier_env *env = NULL;
-	struct bpf_verifier_log *log;
 	struct btf *btf = NULL;
 	u8 *data;
-	int err;
+	int err, ret;
 
 	if (attr->btf_size > BTF_MAX_SIZE)
 		return ERR_PTR(-E2BIG);
@@ -5522,21 +5536,13 @@ static struct btf *btf_parse(const union bpf_attr *attr, bpfptr_t uattr, u32 uat
 	if (!env)
 		return ERR_PTR(-ENOMEM);
 
-	log = &env->log;
-	if (attr->btf_log_level || log_ubuf || attr->btf_log_size) {
-		/* user requested verbose verifier output
-		 * and supplied buffer to store the verification trace
-		 */
-		log->level = attr->btf_log_level;
-		log->ubuf = log_ubuf;
-		log->len_total = attr->btf_log_size;
-
-		/* log attributes have to be sane */
-		if (!bpf_verifier_log_attr_valid(log)) {
-			err = -EINVAL;
-			goto errout;
-		}
-	}
+	/* user could have requested verbose verifier output
+	 * and supplied buffer to store the verification trace
+	 */
+	err = bpf_vlog_init(&env->log, attr->btf_log_level,
+			    log_ubuf, attr->btf_log_size);
+	if (err)
+		goto errout_free;
 
 	btf = kzalloc(sizeof(*btf), GFP_KERNEL | __GFP_NOWARN);
 	if (!btf) {
@@ -5577,7 +5583,7 @@ static struct btf *btf_parse(const union bpf_attr *attr, bpfptr_t uattr, u32 uat
 	if (err)
 		goto errout;
 
-	struct_meta_tab = btf_parse_struct_metas(log, btf);
+	struct_meta_tab = btf_parse_struct_metas(&env->log, btf);
 	if (IS_ERR(struct_meta_tab)) {
 		err = PTR_ERR(struct_meta_tab);
 		goto errout;
@@ -5594,21 +5600,9 @@ static struct btf *btf_parse(const union bpf_attr *attr, bpfptr_t uattr, u32 uat
 		}
 	}
 
-	bpf_vlog_finalize(log);
-	if (uattr_size >= offsetofend(union bpf_attr, btf_log_true_size) &&
-	    copy_to_bpfptr_offset(uattr, offsetof(union bpf_attr, btf_log_true_size),
-				  &log->len_max, sizeof(log->len_max))) {
-		err = -EFAULT;
-		goto errout_meta;
-	}
-	if (bpf_vlog_truncated(log)) {
-		err = -ENOSPC;
-		goto errout_meta;
-	}
-	if (log->level && log->level != BPF_LOG_KERNEL && !log->ubuf) {
-		err = -EFAULT;
-		goto errout_meta;
-	}
+	err = finalize_log(&env->log, uattr, uattr_size);
+	if (err)
+		goto errout_free;
 
 	btf_verifier_env_free(env);
 	refcount_set(&btf->refcnt, 1);
@@ -5617,6 +5611,11 @@ static struct btf *btf_parse(const union bpf_attr *attr, bpfptr_t uattr, u32 uat
 errout_meta:
 	btf_free_struct_meta_tab(btf);
 errout:
+	/* overwrite err with -ENOSPC or -EFAULT */
+	ret = finalize_log(&env->log, uattr, uattr_size);
+	if (ret)
+		err = ret;
+errout_free:
 	btf_verifier_env_free(env);
 	if (btf)
 		btf_free(btf);
diff --git a/kernel/bpf/log.c b/kernel/bpf/log.c
index 47bea2fad6fe..1fae2c5d7ae4 100644
--- a/kernel/bpf/log.c
+++ b/kernel/bpf/log.c
@@ -10,12 +10,26 @@
 #include <linux/bpf_verifier.h>
 #include <linux/math64.h>
 
-bool bpf_verifier_log_attr_valid(const struct bpf_verifier_log *log)
+static bool bpf_verifier_log_attr_valid(const struct bpf_verifier_log *log)
 {
 	return log->len_total > 0 && log->len_total <= UINT_MAX >> 2 &&
 	       log->level && log->ubuf && !(log->level & ~BPF_LOG_MASK);
 }
 
+int bpf_vlog_init(struct bpf_verifier_log *log, u32 log_level,
+		  char __user *log_buf, u32 log_size)
+{
+	log->level = log_level;
+	log->ubuf = log_buf;
+	log->len_total = log_size;
+
+	/* log attributes have to be sane */
+	if (!bpf_verifier_log_attr_valid(log))
+		return -EINVAL;
+
+	return 0;
+}
+
 static void bpf_vlog_update_len_max(struct bpf_verifier_log *log, u32 add_len)
 {
 	/* add_len includes terminal \0, so no need for +1. */
@@ -197,24 +211,25 @@ static int bpf_vlog_reverse_ubuf(struct bpf_verifier_log *log, int start, int en
 	return 0;
 }
 
-bool bpf_vlog_truncated(const struct bpf_verifier_log *log)
+static bool bpf_vlog_truncated(const struct bpf_verifier_log *log)
 {
 	return log->len_max > log->len_total;
 }
 
-void bpf_vlog_finalize(struct bpf_verifier_log *log)
+int bpf_vlog_finalize(struct bpf_verifier_log *log, u32 *log_size_actual)
 {
 	u32 sublen;
 	int err;
 
-	if (!log || !log->level || !log->ubuf)
-		return;
-	if ((log->level & BPF_LOG_FIXED) || log->level == BPF_LOG_KERNEL)
-		return;
+	*log_size_actual = 0;
+	if (!log || log->level == 0 || log->level == BPF_LOG_KERNEL)
+		return 0;
 
+	if (!log->ubuf)
+		goto skip_log_rotate;
 	/* If we never truncated log, there is nothing to move around. */
-	if (log->start_pos == 0)
-		return;
+	if ((log->level & BPF_LOG_FIXED) || log->start_pos == 0)
+		goto skip_log_rotate;
 
 	/* Otherwise we need to rotate log contents to make it start from the
 	 * buffer beginning and be a continuous zero-terminated string. Note
@@ -257,6 +272,21 @@ void bpf_vlog_finalize(struct bpf_verifier_log *log)
 	err = err ?: bpf_vlog_reverse_ubuf(log, sublen, log->len_total);
 	if (err)
 		log->ubuf = NULL;
+
+skip_log_rotate:
+	*log_size_actual = log->len_max;
+
+	/* properly initialized log has either both ubuf!=NULL and len_total>0
+	 * or ubuf==NULL and len_total==0, so if this condition doesn't hold,
+	 * we got a fault somewhere along the way, so report it back
+	 */
+	if (!!log->ubuf != !!log->len_total)
+		return -EFAULT;
+
+	if (bpf_vlog_truncated(log))
+		return -ENOSPC;
+
+	return 0;
 }
 
 /* log_level controls verbosity level of eBPF verifier.
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index 308e7abeb979..d6db6de3e9ea 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -18698,8 +18698,8 @@ int bpf_check(struct bpf_prog **prog, union bpf_attr *attr, bpfptr_t uattr, __u3
 {
 	u64 start_time = ktime_get_ns();
 	struct bpf_verifier_env *env;
-	struct bpf_verifier_log *log;
-	int i, len, ret = -EINVAL;
+	int i, len, ret = -EINVAL, err;
+	u32 log_true_size;
 	bool is_priv;
 
 	/* no program is valid */
@@ -18712,7 +18712,6 @@ int bpf_check(struct bpf_prog **prog, union bpf_attr *attr, bpfptr_t uattr, __u3
 	env = kzalloc(sizeof(struct bpf_verifier_env), GFP_KERNEL);
 	if (!env)
 		return -ENOMEM;
-	log = &env->log;
 
 	len = (*prog)->len;
 	env->insn_aux_data =
@@ -18733,20 +18732,14 @@ int bpf_check(struct bpf_prog **prog, union bpf_attr *attr, bpfptr_t uattr, __u3
 	if (!is_priv)
 		mutex_lock(&bpf_verifier_lock);
 
-	if (attr->log_level || attr->log_buf || attr->log_size) {
-		/* user requested verbose verifier output
-		 * and supplied buffer to store the verification trace
-		 */
-		log->level = attr->log_level;
-		log->ubuf = (char __user *) (unsigned long) attr->log_buf;
-		log->len_total = attr->log_size;
-
-		/* log attributes have to be sane */
-		if (!bpf_verifier_log_attr_valid(log)) {
-			ret = -EINVAL;
-			goto err_unlock;
-		}
-	}
+	/* user could have requested verbose verifier output
+	 * and supplied buffer to store the verification trace
+	 */
+	ret = bpf_vlog_init(&env->log, attr->log_level,
+			    (char __user *) (unsigned long) attr->log_buf,
+			    attr->log_size);
+	if (ret)
+		goto err_unlock;
 
 	mark_verifier_state_clean(env);
 
@@ -18860,17 +18853,17 @@ skip_full_check:
 	print_verification_stats(env);
 	env->prog->aux->verified_insns = env->insn_processed;
 
-	bpf_vlog_finalize(log);
+	/* preserve original error even if log finalization is successful */
+	err = bpf_vlog_finalize(&env->log, &log_true_size);
+	if (err)
+		ret = err;
+
 	if (uattr_size >= offsetofend(union bpf_attr, log_true_size) &&
 	    copy_to_bpfptr_offset(uattr, offsetof(union bpf_attr, log_true_size),
-				  &log->len_max, sizeof(log->len_max))) {
+				  &log_true_size, sizeof(log_true_size))) {
 		ret = -EFAULT;
 		goto err_release_maps;
 	}
-	if (bpf_vlog_truncated(log))
-		ret = -ENOSPC;
-	if (log->level && log->level != BPF_LOG_KERNEL && !log->ubuf)
-		ret = -EFAULT;
 
 	if (ret)
 		goto err_release_maps;
-- 
cgit v1.2.3


From fac08d45e2531f91d8fb3d11fc6576f588049476 Mon Sep 17 00:00:00 2001
From: Andrii Nakryiko <andrii@kernel.org>
Date: Thu, 6 Apr 2023 16:42:00 -0700
Subject: bpf: Relax log_buf NULL conditions when log_level>0 is requested

Drop the log_size>0 and log_buf!=NULL condition when log_level>0. This
allows users to request log_true_size of a full log without providing
actual (even if small) log buffer. Verifier log handling code was mostly
ready to handle NULL log->ubuf, so only few small changes were necessary
to prevent NULL log->ubuf from causing problems.

Note, that if user provided NULL log_buf with log_level>0 we don't
consider this a log truncation, and thus won't return -ENOSPC.

We also enforce that either (log_buf==NULL && log_size==0) or
(log_buf!=NULL && log_size>0).

Suggested-by: Lorenz Bauer <lmb@isovalent.com>
Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Reviewed-by: Lorenz Bauer <lmb@isovalent.com>
Link: https://lore.kernel.org/bpf/20230406234205.323208-15-andrii@kernel.org
---
 kernel/bpf/log.c | 42 +++++++++++++++++++++++++-----------------
 1 file changed, 25 insertions(+), 17 deletions(-)

(limited to 'kernel')

diff --git a/kernel/bpf/log.c b/kernel/bpf/log.c
index 1fae2c5d7ae4..046ddff37a76 100644
--- a/kernel/bpf/log.c
+++ b/kernel/bpf/log.c
@@ -12,8 +12,17 @@
 
 static bool bpf_verifier_log_attr_valid(const struct bpf_verifier_log *log)
 {
-	return log->len_total > 0 && log->len_total <= UINT_MAX >> 2 &&
-	       log->level && log->ubuf && !(log->level & ~BPF_LOG_MASK);
+	/* ubuf and len_total should both be specified (or not) together */
+	if (!!log->ubuf != !!log->len_total)
+		return false;
+	/* log buf without log_level is meaningless */
+	if (log->ubuf && log->level == 0)
+		return false;
+	if (log->level & ~BPF_LOG_MASK)
+		return false;
+	if (log->len_total > UINT_MAX >> 2)
+		return false;
+	return true;
 }
 
 int bpf_vlog_init(struct bpf_verifier_log *log, u32 log_level,
@@ -89,9 +98,15 @@ void bpf_verifier_vlog(struct bpf_verifier_log *log, const char *fmt,
 			new_start = new_end - log->len_total;
 		else
 			new_start = log->start_pos;
+
+		log->start_pos = new_start;
+		log->end_pos = new_end - 1; /* don't count terminating '\0' */
+
+		if (!log->ubuf)
+			return;
+
 		new_n = min(n, log->len_total);
 		cur_pos = new_end - new_n;
-
 		div_u64_rem(cur_pos, log->len_total, &buf_start);
 		div_u64_rem(new_end, log->len_total, &buf_end);
 		/* new_end and buf_end are exclusive indices, so if buf_end is
@@ -101,12 +116,6 @@ void bpf_verifier_vlog(struct bpf_verifier_log *log, const char *fmt,
 		if (buf_end == 0)
 			buf_end = log->len_total;
 
-		log->start_pos = new_start;
-		log->end_pos = new_end - 1; /* don't count terminating '\0' */
-
-		if (!log->ubuf)
-			return;
-
 		/* if buf_start > buf_end, we wrapped around;
 		 * if buf_start == buf_end, then we fill ubuf completely; we
 		 * can't have buf_start == buf_end to mean that there is
@@ -156,12 +165,15 @@ void bpf_vlog_reset(struct bpf_verifier_log *log, u64 new_pos)
 	if (log->end_pos < log->start_pos)
 		log->start_pos = log->end_pos;
 
+	if (!log->ubuf)
+		return;
+
 	if (log->level & BPF_LOG_FIXED)
 		pos = log->end_pos + 1;
 	else
 		div_u64_rem(new_pos, log->len_total, &pos);
 
-	if (log->ubuf && pos < log->len_total && put_user(zero, log->ubuf + pos))
+	if (pos < log->len_total && put_user(zero, log->ubuf + pos))
 		log->ubuf = NULL;
 }
 
@@ -211,11 +223,6 @@ static int bpf_vlog_reverse_ubuf(struct bpf_verifier_log *log, int start, int en
 	return 0;
 }
 
-static bool bpf_vlog_truncated(const struct bpf_verifier_log *log)
-{
-	return log->len_max > log->len_total;
-}
-
 int bpf_vlog_finalize(struct bpf_verifier_log *log, u32 *log_size_actual)
 {
 	u32 sublen;
@@ -228,7 +235,7 @@ int bpf_vlog_finalize(struct bpf_verifier_log *log, u32 *log_size_actual)
 	if (!log->ubuf)
 		goto skip_log_rotate;
 	/* If we never truncated log, there is nothing to move around. */
-	if ((log->level & BPF_LOG_FIXED) || log->start_pos == 0)
+	if (log->start_pos == 0)
 		goto skip_log_rotate;
 
 	/* Otherwise we need to rotate log contents to make it start from the
@@ -283,7 +290,8 @@ skip_log_rotate:
 	if (!!log->ubuf != !!log->len_total)
 		return -EFAULT;
 
-	if (bpf_vlog_truncated(log))
+	/* did truncation actually happen? */
+	if (log->ubuf && log->len_max > log->len_total)
 		return -ENOSPC;
 
 	return 0;
-- 
cgit v1.2.3


From 91f2dc6838c19342f7f2993627c622835cc24890 Mon Sep 17 00:00:00 2001
From: Feng Zhou <zhoufeng.zf@bytedance.com>
Date: Mon, 10 Apr 2023 16:59:07 +0800
Subject: bpf/btf: Fix is_int_ptr()

When tracing a kernel function with arg type is u32*, btf_ctx_access()
would report error: arg2 type INT is not a struct.

The commit bb6728d75611 ("bpf: Allow access to int pointer arguments
in tracing programs") added support for int pointer, but did not skip
modifiers before checking it's type. This patch fixes it.

Fixes: bb6728d75611 ("bpf: Allow access to int pointer arguments in tracing programs")
Co-developed-by: Chengming Zhou <zhouchengming@bytedance.com>
Signed-off-by: Chengming Zhou <zhouchengming@bytedance.com>
Signed-off-by: Feng Zhou <zhoufeng.zf@bytedance.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Acked-by: Jiri Olsa <jolsa@kernel.org>
Link: https://lore.kernel.org/bpf/20230410085908.98493-2-zhoufeng.zf@bytedance.com
---
 kernel/bpf/btf.c | 8 ++------
 1 file changed, 2 insertions(+), 6 deletions(-)

(limited to 'kernel')

diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c
index ffc31a1c84af..913b9d717a4a 100644
--- a/kernel/bpf/btf.c
+++ b/kernel/bpf/btf.c
@@ -5921,12 +5921,8 @@ struct btf *bpf_prog_get_target_btf(const struct bpf_prog *prog)
 
 static bool is_int_ptr(struct btf *btf, const struct btf_type *t)
 {
-	/* t comes in already as a pointer */
-	t = btf_type_by_id(btf, t->type);
-
-	/* allow const */
-	if (BTF_INFO_KIND(t->info) == BTF_KIND_CONST)
-		t = btf_type_by_id(btf, t->type);
+	/* skip modifiers */
+	t = btf_type_skip_modifiers(btf, t->type, NULL);
 
 	return btf_type_is_int(t);
 }
-- 
cgit v1.2.3


From 10fd5f70c397782a97f411f25bfb312ea92b55bc Mon Sep 17 00:00:00 2001
From: Alexei Starovoitov <ast@kernel.org>
Date: Wed, 12 Apr 2023 10:12:52 -0700
Subject: bpf: Handle NULL in bpf_local_storage_free.

During OOM bpf_local_storage_alloc() may fail to allocate 'storage' and
call to bpf_local_storage_free() with NULL pointer will cause a crash like:
[ 271718.917646] BUG: kernel NULL pointer dereference, address: 00000000000000a0
[ 271719.019620] RIP: 0010:call_rcu+0x2d/0x240
[ 271719.216274]  bpf_local_storage_alloc+0x19e/0x1e0
[ 271719.250121]  bpf_local_storage_update+0x33b/0x740

Fixes: 7e30a8477b0b ("bpf: Add bpf_local_storage_free()")
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
Link: https://lore.kernel.org/bpf/20230412171252.15635-1-alexei.starovoitov@gmail.com
---
 kernel/bpf/bpf_local_storage.c | 3 +++
 1 file changed, 3 insertions(+)

(limited to 'kernel')

diff --git a/kernel/bpf/bpf_local_storage.c b/kernel/bpf/bpf_local_storage.c
index dab2ff4c99d9..47d9948d768f 100644
--- a/kernel/bpf/bpf_local_storage.c
+++ b/kernel/bpf/bpf_local_storage.c
@@ -157,6 +157,9 @@ static void bpf_local_storage_free(struct bpf_local_storage *local_storage,
 				   struct bpf_local_storage_map *smap,
 				   bool bpf_ma, bool reuse_now)
 {
+	if (!local_storage)
+		return;
+
 	if (!bpf_ma) {
 		__bpf_local_storage_free(local_storage, reuse_now);
 		return;
-- 
cgit v1.2.3


From 1d71283987c729dceccce834a864c27301ba155e Mon Sep 17 00:00:00 2001
From: David Vernet <void@manifault.com>
Date: Mon, 10 Apr 2023 23:16:31 -0500
Subject: bpf: Make bpf_cgroup_acquire() KF_RCU | KF_RET_NULL

struct cgroup is already an RCU-safe type in the verifier. We can
therefore update bpf_cgroup_acquire() to be KF_RCU | KF_RET_NULL, and
subsequently remove bpf_cgroup_kptr_get(). This patch does the first of
these by updating bpf_cgroup_acquire() to be KF_RCU | KF_RET_NULL, and
also updates selftests accordingly.

Signed-off-by: David Vernet <void@manifault.com>
Link: https://lore.kernel.org/r/20230411041633.179404-1-void@manifault.com
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 kernel/bpf/helpers.c                               |  5 ++--
 .../selftests/bpf/progs/cgrp_kfunc_common.h        |  5 ++++
 .../selftests/bpf/progs/cgrp_kfunc_failure.c       | 35 ++++++++++++++++++----
 .../selftests/bpf/progs/cgrp_kfunc_success.c       |  5 +++-
 4 files changed, 40 insertions(+), 10 deletions(-)

(limited to 'kernel')

diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c
index b6a5cda5bb59..71f0604bdc97 100644
--- a/kernel/bpf/helpers.c
+++ b/kernel/bpf/helpers.c
@@ -2037,8 +2037,7 @@ __bpf_kfunc void bpf_task_release(struct task_struct *p)
  */
 __bpf_kfunc struct cgroup *bpf_cgroup_acquire(struct cgroup *cgrp)
 {
-	cgroup_get(cgrp);
-	return cgrp;
+	return cgroup_tryget(cgrp) ? cgrp : NULL;
 }
 
 /**
@@ -2314,7 +2313,7 @@ BTF_ID_FLAGS(func, bpf_rbtree_add)
 BTF_ID_FLAGS(func, bpf_rbtree_first, KF_RET_NULL)
 
 #ifdef CONFIG_CGROUPS
-BTF_ID_FLAGS(func, bpf_cgroup_acquire, KF_ACQUIRE | KF_TRUSTED_ARGS)
+BTF_ID_FLAGS(func, bpf_cgroup_acquire, KF_ACQUIRE | KF_RCU | KF_RET_NULL)
 BTF_ID_FLAGS(func, bpf_cgroup_kptr_get, KF_ACQUIRE | KF_KPTR_GET | KF_RET_NULL)
 BTF_ID_FLAGS(func, bpf_cgroup_release, KF_RELEASE)
 BTF_ID_FLAGS(func, bpf_cgroup_ancestor, KF_ACQUIRE | KF_RCU | KF_RET_NULL)
diff --git a/tools/testing/selftests/bpf/progs/cgrp_kfunc_common.h b/tools/testing/selftests/bpf/progs/cgrp_kfunc_common.h
index d0b7cd0d09d7..b0e279f4652b 100644
--- a/tools/testing/selftests/bpf/progs/cgrp_kfunc_common.h
+++ b/tools/testing/selftests/bpf/progs/cgrp_kfunc_common.h
@@ -61,6 +61,11 @@ static inline int cgrps_kfunc_map_insert(struct cgroup *cgrp)
 	}
 
 	acquired = bpf_cgroup_acquire(cgrp);
+	if (!acquired) {
+		bpf_map_delete_elem(&__cgrps_kfunc_map, &id);
+		return -ENOENT;
+	}
+
 	old = bpf_kptr_xchg(&v->cgrp, acquired);
 	if (old) {
 		bpf_cgroup_release(old);
diff --git a/tools/testing/selftests/bpf/progs/cgrp_kfunc_failure.c b/tools/testing/selftests/bpf/progs/cgrp_kfunc_failure.c
index 48b2034cadb3..49347f12de39 100644
--- a/tools/testing/selftests/bpf/progs/cgrp_kfunc_failure.c
+++ b/tools/testing/selftests/bpf/progs/cgrp_kfunc_failure.c
@@ -41,6 +41,23 @@ int BPF_PROG(cgrp_kfunc_acquire_untrusted, struct cgroup *cgrp, const char *path
 
 	/* Can't invoke bpf_cgroup_acquire() on an untrusted pointer. */
 	acquired = bpf_cgroup_acquire(v->cgrp);
+	if (acquired)
+		bpf_cgroup_release(acquired);
+
+	return 0;
+}
+
+SEC("tp_btf/cgroup_mkdir")
+__failure __msg("Possibly NULL pointer passed to trusted arg0")
+int BPF_PROG(cgrp_kfunc_acquire_no_null_check, struct cgroup *cgrp, const char *path)
+{
+	struct cgroup *acquired;
+
+	acquired = bpf_cgroup_acquire(cgrp);
+	/*
+	 * Can't invoke bpf_cgroup_release() without checking the return value
+	 * of bpf_cgroup_acquire().
+	 */
 	bpf_cgroup_release(acquired);
 
 	return 0;
@@ -54,7 +71,8 @@ int BPF_PROG(cgrp_kfunc_acquire_fp, struct cgroup *cgrp, const char *path)
 
 	/* Can't invoke bpf_cgroup_acquire() on a random frame pointer. */
 	acquired = bpf_cgroup_acquire((struct cgroup *)&stack_cgrp);
-	bpf_cgroup_release(acquired);
+	if (acquired)
+		bpf_cgroup_release(acquired);
 
 	return 0;
 }
@@ -67,7 +85,8 @@ int BPF_PROG(cgrp_kfunc_acquire_unsafe_kretprobe, struct cgroup *cgrp)
 
 	/* Can't acquire an untrusted struct cgroup * pointer. */
 	acquired = bpf_cgroup_acquire(cgrp);
-	bpf_cgroup_release(acquired);
+	if (acquired)
+		bpf_cgroup_release(acquired);
 
 	return 0;
 }
@@ -80,7 +99,8 @@ int BPF_PROG(cgrp_kfunc_acquire_trusted_walked, struct cgroup *cgrp, const char
 
 	/* Can't invoke bpf_cgroup_acquire() on a pointer obtained from walking a trusted cgroup. */
 	acquired = bpf_cgroup_acquire(cgrp->old_dom_cgrp);
-	bpf_cgroup_release(acquired);
+	if (acquired)
+		bpf_cgroup_release(acquired);
 
 	return 0;
 }
@@ -93,9 +113,8 @@ int BPF_PROG(cgrp_kfunc_acquire_null, struct cgroup *cgrp, const char *path)
 
 	/* Can't invoke bpf_cgroup_acquire() on a NULL pointer. */
 	acquired = bpf_cgroup_acquire(NULL);
-	if (!acquired)
-		return 0;
-	bpf_cgroup_release(acquired);
+	if (acquired)
+		bpf_cgroup_release(acquired);
 
 	return 0;
 }
@@ -137,6 +156,8 @@ int BPF_PROG(cgrp_kfunc_get_non_kptr_acquired, struct cgroup *cgrp, const char *
 	struct cgroup *kptr, *acquired;
 
 	acquired = bpf_cgroup_acquire(cgrp);
+	if (!acquired)
+		return 0;
 
 	/* Cannot use bpf_cgroup_kptr_get() on a non-map-value, even if the kptr was acquired. */
 	kptr = bpf_cgroup_kptr_get(&acquired);
@@ -256,6 +277,8 @@ int BPF_PROG(cgrp_kfunc_release_null, struct cgroup *cgrp, const char *path)
 		return -ENOENT;
 
 	acquired = bpf_cgroup_acquire(cgrp);
+	if (!acquired)
+		return -ENOENT;
 
 	old = bpf_kptr_xchg(&v->cgrp, acquired);
 
diff --git a/tools/testing/selftests/bpf/progs/cgrp_kfunc_success.c b/tools/testing/selftests/bpf/progs/cgrp_kfunc_success.c
index 030aff700084..e9dbd1af05a7 100644
--- a/tools/testing/selftests/bpf/progs/cgrp_kfunc_success.c
+++ b/tools/testing/selftests/bpf/progs/cgrp_kfunc_success.c
@@ -38,7 +38,10 @@ int BPF_PROG(test_cgrp_acquire_release_argument, struct cgroup *cgrp, const char
 		return 0;
 
 	acquired = bpf_cgroup_acquire(cgrp);
-	bpf_cgroup_release(acquired);
+	if (!acquired)
+		err = 1;
+	else
+		bpf_cgroup_release(acquired);
 
 	return 0;
 }
-- 
cgit v1.2.3


From 6499fe6edc4fd5b91aed4d5cd84bd113e1c58d5f Mon Sep 17 00:00:00 2001
From: David Vernet <void@manifault.com>
Date: Mon, 10 Apr 2023 23:16:32 -0500
Subject: bpf: Remove bpf_cgroup_kptr_get() kfunc

Now that bpf_cgroup_acquire() is KF_RCU | KF_RET_NULL,
bpf_cgroup_kptr_get() is redundant. Let's remove it, and update
selftests to instead use bpf_cgroup_acquire() where appropriate. The
next patch will update the BPF documentation to not mention
bpf_cgroup_kptr_get().

Signed-off-by: David Vernet <void@manifault.com>
Link: https://lore.kernel.org/r/20230411041633.179404-2-void@manifault.com
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 kernel/bpf/helpers.c                               | 32 ----------
 .../selftests/bpf/progs/cgrp_kfunc_common.h        |  3 +-
 .../selftests/bpf/progs/cgrp_kfunc_failure.c       | 68 +++-------------------
 .../selftests/bpf/progs/cgrp_kfunc_success.c       | 10 ++--
 4 files changed, 14 insertions(+), 99 deletions(-)

(limited to 'kernel')

diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c
index 71f0604bdc97..f04e60a4847f 100644
--- a/kernel/bpf/helpers.c
+++ b/kernel/bpf/helpers.c
@@ -2040,37 +2040,6 @@ __bpf_kfunc struct cgroup *bpf_cgroup_acquire(struct cgroup *cgrp)
 	return cgroup_tryget(cgrp) ? cgrp : NULL;
 }
 
-/**
- * bpf_cgroup_kptr_get - Acquire a reference on a struct cgroup kptr. A cgroup
- * kptr acquired by this kfunc which is not subsequently stored in a map, must
- * be released by calling bpf_cgroup_release().
- * @cgrpp: A pointer to a cgroup kptr on which a reference is being acquired.
- */
-__bpf_kfunc struct cgroup *bpf_cgroup_kptr_get(struct cgroup **cgrpp)
-{
-	struct cgroup *cgrp;
-
-	rcu_read_lock();
-	/* Another context could remove the cgroup from the map and release it
-	 * at any time, including after we've done the lookup above. This is
-	 * safe because we're in an RCU read region, so the cgroup is
-	 * guaranteed to remain valid until at least the rcu_read_unlock()
-	 * below.
-	 */
-	cgrp = READ_ONCE(*cgrpp);
-
-	if (cgrp && !cgroup_tryget(cgrp))
-		/* If the cgroup had been removed from the map and freed as
-		 * described above, cgroup_tryget() will return false. The
-		 * cgroup will be freed at some point after the current RCU gp
-		 * has ended, so just return NULL to the user.
-		 */
-		cgrp = NULL;
-	rcu_read_unlock();
-
-	return cgrp;
-}
-
 /**
  * bpf_cgroup_release - Release the reference acquired on a cgroup.
  * If this kfunc is invoked in an RCU read region, the cgroup is guaranteed to
@@ -2314,7 +2283,6 @@ BTF_ID_FLAGS(func, bpf_rbtree_first, KF_RET_NULL)
 
 #ifdef CONFIG_CGROUPS
 BTF_ID_FLAGS(func, bpf_cgroup_acquire, KF_ACQUIRE | KF_RCU | KF_RET_NULL)
-BTF_ID_FLAGS(func, bpf_cgroup_kptr_get, KF_ACQUIRE | KF_KPTR_GET | KF_RET_NULL)
 BTF_ID_FLAGS(func, bpf_cgroup_release, KF_RELEASE)
 BTF_ID_FLAGS(func, bpf_cgroup_ancestor, KF_ACQUIRE | KF_RCU | KF_RET_NULL)
 BTF_ID_FLAGS(func, bpf_cgroup_from_id, KF_ACQUIRE | KF_RET_NULL)
diff --git a/tools/testing/selftests/bpf/progs/cgrp_kfunc_common.h b/tools/testing/selftests/bpf/progs/cgrp_kfunc_common.h
index b0e279f4652b..22914a70db54 100644
--- a/tools/testing/selftests/bpf/progs/cgrp_kfunc_common.h
+++ b/tools/testing/selftests/bpf/progs/cgrp_kfunc_common.h
@@ -21,10 +21,11 @@ struct hash_map {
 } __cgrps_kfunc_map SEC(".maps");
 
 struct cgroup *bpf_cgroup_acquire(struct cgroup *p) __ksym;
-struct cgroup *bpf_cgroup_kptr_get(struct cgroup **pp) __ksym;
 void bpf_cgroup_release(struct cgroup *p) __ksym;
 struct cgroup *bpf_cgroup_ancestor(struct cgroup *cgrp, int level) __ksym;
 struct cgroup *bpf_cgroup_from_id(u64 cgid) __ksym;
+void bpf_rcu_read_lock(void) __ksym;
+void bpf_rcu_read_unlock(void) __ksym;
 
 static inline struct __cgrps_kfunc_map_value *cgrps_kfunc_map_value_lookup(struct cgroup *cgrp)
 {
diff --git a/tools/testing/selftests/bpf/progs/cgrp_kfunc_failure.c b/tools/testing/selftests/bpf/progs/cgrp_kfunc_failure.c
index 49347f12de39..0fa564a5cc5b 100644
--- a/tools/testing/selftests/bpf/progs/cgrp_kfunc_failure.c
+++ b/tools/testing/selftests/bpf/progs/cgrp_kfunc_failure.c
@@ -133,59 +133,6 @@ int BPF_PROG(cgrp_kfunc_acquire_unreleased, struct cgroup *cgrp, const char *pat
 	return 0;
 }
 
-SEC("tp_btf/cgroup_mkdir")
-__failure __msg("arg#0 expected pointer to map value")
-int BPF_PROG(cgrp_kfunc_get_non_kptr_param, struct cgroup *cgrp, const char *path)
-{
-	struct cgroup *kptr;
-
-	/* Cannot use bpf_cgroup_kptr_get() on a non-kptr, even on a valid cgroup. */
-	kptr = bpf_cgroup_kptr_get(&cgrp);
-	if (!kptr)
-		return 0;
-
-	bpf_cgroup_release(kptr);
-
-	return 0;
-}
-
-SEC("tp_btf/cgroup_mkdir")
-__failure __msg("arg#0 expected pointer to map value")
-int BPF_PROG(cgrp_kfunc_get_non_kptr_acquired, struct cgroup *cgrp, const char *path)
-{
-	struct cgroup *kptr, *acquired;
-
-	acquired = bpf_cgroup_acquire(cgrp);
-	if (!acquired)
-		return 0;
-
-	/* Cannot use bpf_cgroup_kptr_get() on a non-map-value, even if the kptr was acquired. */
-	kptr = bpf_cgroup_kptr_get(&acquired);
-	bpf_cgroup_release(acquired);
-	if (!kptr)
-		return 0;
-
-	bpf_cgroup_release(kptr);
-
-	return 0;
-}
-
-SEC("tp_btf/cgroup_mkdir")
-__failure __msg("arg#0 expected pointer to map value")
-int BPF_PROG(cgrp_kfunc_get_null, struct cgroup *cgrp, const char *path)
-{
-	struct cgroup *kptr;
-
-	/* Cannot use bpf_cgroup_kptr_get() on a NULL pointer. */
-	kptr = bpf_cgroup_kptr_get(NULL);
-	if (!kptr)
-		return 0;
-
-	bpf_cgroup_release(kptr);
-
-	return 0;
-}
-
 SEC("tp_btf/cgroup_mkdir")
 __failure __msg("Unreleased reference")
 int BPF_PROG(cgrp_kfunc_xchg_unreleased, struct cgroup *cgrp, const char *path)
@@ -207,8 +154,8 @@ int BPF_PROG(cgrp_kfunc_xchg_unreleased, struct cgroup *cgrp, const char *path)
 }
 
 SEC("tp_btf/cgroup_mkdir")
-__failure __msg("Unreleased reference")
-int BPF_PROG(cgrp_kfunc_get_unreleased, struct cgroup *cgrp, const char *path)
+__failure __msg("must be referenced or trusted")
+int BPF_PROG(cgrp_kfunc_rcu_get_release, struct cgroup *cgrp, const char *path)
 {
 	struct cgroup *kptr;
 	struct __cgrps_kfunc_map_value *v;
@@ -217,11 +164,12 @@ int BPF_PROG(cgrp_kfunc_get_unreleased, struct cgroup *cgrp, const char *path)
 	if (!v)
 		return 0;
 
-	kptr = bpf_cgroup_kptr_get(&v->cgrp);
-	if (!kptr)
-		return 0;
-
-	/* Kptr acquired above is never released. */
+	bpf_rcu_read_lock();
+	kptr = v->cgrp;
+	if (kptr)
+		/* Can't release a cgroup kptr stored in a map. */
+		bpf_cgroup_release(kptr);
+	bpf_rcu_read_unlock();
 
 	return 0;
 }
diff --git a/tools/testing/selftests/bpf/progs/cgrp_kfunc_success.c b/tools/testing/selftests/bpf/progs/cgrp_kfunc_success.c
index e9dbd1af05a7..5354455a01be 100644
--- a/tools/testing/selftests/bpf/progs/cgrp_kfunc_success.c
+++ b/tools/testing/selftests/bpf/progs/cgrp_kfunc_success.c
@@ -126,13 +126,11 @@ int BPF_PROG(test_cgrp_get_release, struct cgroup *cgrp, const char *path)
 		return 0;
 	}
 
-	kptr = bpf_cgroup_kptr_get(&v->cgrp);
-	if (!kptr) {
+	bpf_rcu_read_lock();
+	kptr = v->cgrp;
+	if (!kptr)
 		err = 3;
-		return 0;
-	}
-
-	bpf_cgroup_release(kptr);
+	bpf_rcu_read_unlock();
 
 	return 0;
 }
-- 
cgit v1.2.3