path: root/security/landlock/tsync.c
// SPDX-License-Identifier: GPL-2.0-only
/*
 * Landlock - Cross-thread ruleset enforcement
 *
 * Copyright © 2025 Google LLC
 */
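
/*
 * This file implements the cross-thread part of Landlock ruleset enforcement:
 * a task_work based protocol that applies a prepared ruleset to every sibling
 * thread of the calling task with all-or-nothing semantics.
 */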

#include <linux/atomic.h>
#include <linux/cleanup.h>
#include <linux/completion.h>
#include <linux/cred.h>
#include <linux/errno.h>
#include <linux/overflow.h>
#include <linux/rcupdate.h>
#include <linux/sched.h>
#include <linux/sched/signal.h>
#include <linux/sched/task.h>
#include <linux/slab.h>
#include <linux/task_work.h>

#include "cred.h"
#include "tsync.h"

/*
 * Shared state between multiple threads which are enforcing Landlock rulesets
 * in lockstep with each other.
 */
struct tsync_shared_context {
	/* The old and tentative new creds of the calling thread. */
	const struct cred *old_cred;
	const struct cred *new_cred;

	/* True if sibling tasks need to set the no_new_privs flag. */
	bool set_no_new_privs;

	/* An error encountered in the preparation step, or 0. */
	atomic_t preparation_error;

	/*
	 * Barrier after preparation step in restrict_one_thread.
	 * The calling thread waits for completion.
	 *
	 * Re-initialized on every round of looking for newly spawned threads.
	 */
	atomic_t num_preparing;
	struct completion all_prepared;

	/* Sibling threads wait for completion. */
	struct completion ready_to_commit;

	/*
	 * Barrier after commit step (used by syscall impl to wait for
	 * completion).
	 */
	atomic_t num_unfinished;
	struct completion all_finished;
};

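/*
 * struct tsync_work - per-thread task_work context
 *
 * Each entry owns a reference to the task its work was queued on (NULL if
 * queueing the task_work failed) and points back to the shared context.
 */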
struct tsync_work {
	struct callback_head work;
	struct task_struct *task;
	struct tsync_shared_context *shared_ctx;
};

/*
 * restrict_one_thread - update a thread's Landlock domain in lockstep with the
 * other threads in the same process
 *
 * When this runs, the same function runs concurrently in all other threads of
 * the same process (except for the calling thread, which invoked
 * landlock_restrict_self).  The concurrent invocations of restrict_one_thread
 * coordinate through the shared ctx object to do their work in lockstep and
 * implement all-or-nothing semantics for enforcing the new Landlock domain.
 *
 * Afterwards, depending on the presence of an error, all threads either commit
 * or abort the prepared credentials.  The commit operation cannot fail any
 * more.
 */
static void restrict_one_thread(struct tsync_shared_context *ctx)
{
	int err;
	struct cred *cred = NULL;

	if (current_cred() == ctx->old_cred) {
		/*
		 * Switch out old_cred with new_cred, if possible.
		 *
		 * In the common case, where all threads initially point to the same
		 * struct cred, this optimization avoids creating separate redundant
		 * credentials objects for each, which would all have the same contents.
		 *
		 * Note: We are intentionally dropping the const qualifier here, because
		 * it is required by commit_creds() and abort_creds().
		 */
		cred = (struct cred *)get_cred(ctx->new_cred);
	} else {
		/* Else, prepare new creds and populate them. */
		cred = prepare_creds();

		if (!cred) {
			atomic_set(&ctx->preparation_error, -ENOMEM);

			/*
			 * Even on error, we need to adhere to the protocol and coordinate
			 * with concurrently running invocations.
			 */
			if (atomic_dec_return(&ctx->num_preparing) == 0)
				complete_all(&ctx->all_prepared);

			goto out;
		}

		landlock_cred_copy(landlock_cred(cred),
				   landlock_cred(ctx->new_cred));
	}

	/*
	 * Barrier: Wait until all threads are done preparing.
	 * After this point, we can have no more failures.
	 */
	if (atomic_dec_return(&ctx->num_preparing) == 0)
		complete_all(&ctx->all_prepared);

	/*
	 * Wait for signal from calling thread that it's safe to read the
	 * preparation error now and we are ready to commit (or abort).
	 */
	wait_for_completion(&ctx->ready_to_commit);

	/* Abort the commit if any of the other threads had an error. */
	err = atomic_read(&ctx->preparation_error);
	if (err) {
		abort_creds(cred);
		goto out;
	}

	/*
	 * Make sure that all sibling tasks fulfill the no_new_privs prerequisite.
	 * (This is in line with Seccomp's SECCOMP_FILTER_FLAG_TSYNC logic in
	 * kernel/seccomp.c)
	 */
	if (ctx->set_no_new_privs)
		task_set_no_new_privs(current);

	commit_creds(cred);

out:
	/* Notify the calling thread once all threads are done */
	if (atomic_dec_return(&ctx->num_unfinished) == 0)
		complete_all(&ctx->all_finished);
}

/*
 * restrict_one_thread_callback - task_work callback for restricting a thread
 *
 * Calls restrict_one_thread() with the struct tsync_shared_context from the
 * enclosing struct tsync_work.
 */
static void restrict_one_thread_callback(struct callback_head *work)
{
	struct tsync_work *ctx = container_of(work, struct tsync_work, work);

	restrict_one_thread(ctx->shared_ctx);
}

/*
 * struct tsync_works - a growable array of per-task contexts
 *
 * The zero-initialized struct represents the empty array.
 */
struct tsync_works {
	struct tsync_work **works;
	size_t size;
	size_t capacity;
};
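
/*
 * Typical lifecycle of a struct tsync_works, as used below: reserve capacity
 * with tsync_works_grow_by(), hand out one preallocated entry per discovered
 * thread with tsync_works_provide(), and finally drop all task references and
 * memory with tsync_works_release().
 */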

/*
 * tsync_works_provide - provides a preallocated tsync_work for the given task
 *
 * This also stores a task pointer in the context and increments the reference
 * count of the task.
 *
 * This function may fail in the case where we did not preallocate sufficient
 * capacity.  This can legitimately happen if new threads get started after we
 * grew the capacity.
 *
 * Returns:
 *   A pointer to the preallocated context struct, with task filled in.
 *
 *   NULL, if we ran out of preallocated context structs.
 */
static struct tsync_work *tsync_works_provide(struct tsync_works *s,
					      struct task_struct *task)
{
	struct tsync_work *ctx;

	if (s->size >= s->capacity)
		return NULL;

	ctx = s->works[s->size];
	s->size++;

	ctx->task = get_task_struct(task);
	return ctx;
}

/*
 * tsync_works_grow_by - preallocates space for n more contexts in s
 *
 * On a successful return, the subsequent n calls to tsync_works_provide() are
 * guaranteed to succeed.  (size + n <= capacity)
 *
 * Returns:
 *   -EOVERFLOW if the required capacity calculation overflows
 *
 *   -ENOMEM    if a (re)allocation fails; a partial allocation leaves s in a
 *              consistent state, but still returns -ENOMEM
 *
 *   0          if the allocation succeeds or no reallocation was needed
 */
static int tsync_works_grow_by(struct tsync_works *s, size_t n, gfp_t flags)
{
	size_t i;
	size_t new_capacity;
	struct tsync_work **works;
	struct tsync_work *work;

	if (check_add_overflow(s->size, n, &new_capacity))
		return -EOVERFLOW;

	/* No need to reallocate if s already has sufficient capacity. */
	if (new_capacity <= s->capacity)
		return 0;

	works = krealloc_array(s->works, new_capacity, sizeof(s->works[0]),
			       flags);
	if (!works)
		return -ENOMEM;

	s->works = works;

	for (i = s->capacity; i < new_capacity; i++) {
		work = kzalloc(sizeof(*work), flags);
		if (!work) {
			/*
			 * Leave the object in a consistent state,
			 * but return an error.
			 */
			s->capacity = i;
			return -ENOMEM;
		}
		s->works[i] = work;
	}
	s->capacity = new_capacity;
	return 0;
}

/*
 * tsync_works_contains_task - checks for presence of task in s
 */
static bool tsync_works_contains_task(const struct tsync_works *s,
				      struct task_struct *task)
{
	size_t i;

	for (i = 0; i < s->size; i++)
		if (s->works[i]->task == task)
			return true;
	return false;
}

/*
 * tsync_works_release - frees memory held by s and drops all task references
 *
 * This does not free s itself, only the data structures held by it.
 */
static void tsync_works_release(struct tsync_works *s)
{
	size_t i;

	for (i = 0; i < s->size; i++) {
		if (!s->works[i]->task)
			continue;

		put_task_struct(s->works[i]->task);
	}

	for (i = 0; i < s->capacity; i++)
		kfree(s->works[i]);
	kfree(s->works);
	s->works = NULL;
	s->size = 0;
	s->capacity = 0;
}

/*
 * count_additional_threads - counts the sibling threads that are not in works
 */
static size_t count_additional_threads(const struct tsync_works *works)
{
	struct task_struct *thread, *caller;
	size_t n = 0;

	caller = current;

	guard(rcu)();

	for_each_thread(caller, thread) {
		/* Skip current, since it is initiating the sync. */
		if (thread == caller)
			continue;

		/* Skip exited threads. */
		if (thread->flags & PF_EXITING)
			continue;

		/* Skip threads that we have already seen. */
		if (tsync_works_contains_task(works, thread))
			continue;

		n++;
	}
	return n;
}

/*
 * schedule_task_work - adds a task_work to each eligible sibling thread
 *                      that does not have one queued yet
 *
 * For each added task_work, atomically increments shared_ctx->num_preparing and
 * shared_ctx->num_unfinished.
 *
 * Returns:
 *     true, if at least one eligible sibling thread was found
 */
static bool schedule_task_work(struct tsync_works *works,
			       struct tsync_shared_context *shared_ctx)
{
	int err;
	struct task_struct *thread, *caller;
	struct tsync_work *ctx;
	bool found_more_threads = false;

	caller = current;

	guard(rcu)();

	for_each_thread(caller, thread) {
		/* Skip current, since it is initiating the sync. */
		if (thread == caller)
			continue;

		/* Skip exited threads. */
		if (thread->flags & PF_EXITING)
			continue;

		/* Skip threads that we already looked at. */
		if (tsync_works_contains_task(works, thread))
			continue;

		/*
		 * We found a sibling thread that is not doing its task_work yet, and
		 * which might spawn new threads before our task work runs, so we need
		 * at least one more round in the outer loop.
		 */
		found_more_threads = true;

		ctx = tsync_works_provide(works, thread);
		if (!ctx) {
			/*
			 * We ran out of preallocated contexts -- we need to try again with
			 * this thread at a later time!
			 * found_more_threads is already true at this point.
			 */
			break;
		}

		ctx->shared_ctx = shared_ctx;

		atomic_inc(&shared_ctx->num_preparing);
		atomic_inc(&shared_ctx->num_unfinished);

		init_task_work(&ctx->work, restrict_one_thread_callback);
		err = task_work_add(thread, &ctx->work, TWA_SIGNAL);
		if (err) {
			/*
			 * task_work_add() only fails if the task is about to exit.  We
			 * checked that earlier, but it can happen as a race.  Resume
			 * without setting an error, as the task is probably gone in the
			 * next loop iteration.  For consistency, remove the task from ctx
			 * so that it does not look like we handed it a task_work.
			 */
			put_task_struct(ctx->task);
			ctx->task = NULL;

			atomic_dec(&shared_ctx->num_preparing);
			atomic_dec(&shared_ctx->num_unfinished);
		}
	}

	return found_more_threads;
}

/*
 * cancel_tsync_works - cancels all task works that can still be canceled
 *
 * Task works can be canceled as long as they are still queued and have not
 * started running.  For each canceled work, we decrement
 * shared_ctx->num_preparing and shared_ctx->num_unfinished and mark the two
 * completions if needed, as if the task work had never been scheduled.
 */
static void cancel_tsync_works(struct tsync_works *works,
			       struct tsync_shared_context *shared_ctx)
{
	size_t i;

	for (i = 0; i < works->size; i++) {
		/* Skip entries for which no task_work was ever queued. */
		if (!works->works[i]->task)
			continue;

		if (!task_work_cancel(works->works[i]->task,
				      &works->works[i]->work))
			continue;

		/* After dequeueing, act as if the task work had executed. */

		if (atomic_dec_return(&shared_ctx->num_preparing) == 0)
			complete_all(&shared_ctx->all_prepared);

		if (atomic_dec_return(&shared_ctx->num_unfinished) == 0)
			complete_all(&shared_ctx->all_finished);
	}
}
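
/*
 * A sketch of how this is expected to be used from the landlock_restrict_self()
 * syscall path (hypothetical call site, shown for illustration only; the real
 * caller lives outside this file and may differ):
 *
 *	new_cred = prepare_creds();
 *	if (!new_cred)
 *		return -ENOMEM;
 *	... install the new Landlock domain into landlock_cred(new_cred) ...
 *	err = landlock_restrict_sibling_threads(current_cred(), new_cred);
 *	if (err) {
 *		abort_creds(new_cred);
 *		return err;
 *	}
 *	return commit_creds(new_cred);
 */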

/*
 * landlock_restrict_sibling_threads - enables a Landlock policy for all
 *                                     sibling threads of the calling task
 *
 * Returns 0 on success, or a negative error encountered during the
 * preparation step (for example -ENOMEM or -ERESTARTNOINTR).
 */
int landlock_restrict_sibling_threads(const struct cred *old_cred,
				      const struct cred *new_cred)
{
	int err;
	struct tsync_shared_context shared_ctx;
	struct tsync_works works = {};
	size_t newly_discovered_threads;
	bool found_more_threads;

	atomic_set(&shared_ctx.preparation_error, 0);
	init_completion(&shared_ctx.all_prepared);
	init_completion(&shared_ctx.ready_to_commit);
	atomic_set(&shared_ctx.num_unfinished, 1);
	init_completion(&shared_ctx.all_finished);
	shared_ctx.old_cred = old_cred;
	shared_ctx.new_cred = new_cred;
	shared_ctx.set_no_new_privs = task_no_new_privs(current);

	/*
	 * We schedule a pseudo-signal task_work for each of the calling task's
	 * sibling threads.  In the task work, each thread:
	 *
	 * 1) runs prepare_creds() and writes back the error to
	 *    shared_ctx.preparation_error, if needed.
	 *
	 * 2) signals that it's done with prepare_creds() to the calling task.
	 *    (completion "all_prepared").
	 *
	 * 3) waits for the completion "ready_to_commit".  This is sent by the
	 *    calling task after ensuring that all sibling threads are done
	 *    with the "preparation" stage.
	 *
	 *    After this barrier is reached, it's safe to read
	 *    shared_ctx.preparation_error.
	 *
	 * 4) reads shared_ctx.preparation_error and then either does commit_creds()
	 *    or abort_creds().
	 *
	 * 5) signals that it's done altogether (barrier synchronization
	 *    "all_finished")
	 *
	 * Unlike seccomp, which modifies sibling tasks directly, we do not need to
	 * acquire the cred_guard_mutex and sighand->siglock:
	 *
	 * - Since, in our case, every thread exchanges its own struct cred itself
	 *   through the credentials API, no locks are needed for that.
	 * - Our for_each_thread() loops are protected by RCU.
	 * - We do not acquire a lock to keep the list of sibling threads stable
	 *   between our for_each_thread loops.  If the list of available sibling
	 *   threads changes between these for_each_thread loops, we make up for
	 *   that by continuing to look for threads until they are all discovered
	 *   and have entered their task_work, where they are unable to spawn new
	 *   threads.
	 */
	do {
		/* In RCU read-lock, count the threads we need. */
		newly_discovered_threads = count_additional_threads(&works);

		if (newly_discovered_threads == 0)
			break; /* done */

		err = tsync_works_grow_by(&works, newly_discovered_threads,
					  GFP_KERNEL_ACCOUNT);
		if (err) {
			atomic_set(&shared_ctx.preparation_error, err);
			break;
		}

		/*
		 * The "all_prepared" barrier is local to this loop body and its use of
		 * for_each_thread().  We can reset it on each loop iteration because
		 * all previous loop iterations are already done with it.
		 *
		 * num_preparing is initialized to 1 so that the counter cannot drop to 0
		 * and mark the completion as done before all task works are registered.
		 * We decrement it at the end of the loop body.
		 */
		atomic_set(&shared_ctx.num_preparing, 1);
		reinit_completion(&shared_ctx.all_prepared);

		/*
		 * In RCU read-lock, schedule task work on newly discovered sibling
		 * tasks.
		 */
		found_more_threads = schedule_task_work(&works, &shared_ctx);

		/*
		 * Decrement num_preparing on behalf of the current thread, undoing its
		 * initialization to 1 a few lines above.
		 */
		if (atomic_dec_return(&shared_ctx.num_preparing) > 0) {
			if (wait_for_completion_interruptible(
				    &shared_ctx.all_prepared)) {
				/* In case of interruption, we need to retry the system call. */
				atomic_set(&shared_ctx.preparation_error,
					   -ERESTARTNOINTR);

				/*
				 * Cancel task works for tasks that have not started running yet,
				 * and decrement num_preparing and num_unfinished accordingly.
				 */
				cancel_tsync_works(&works, &shared_ctx);

				/*
				 * The remaining task works have already started running, so
				 * waiting for their completion will not block indefinitely.
				 */
				wait_for_completion(&shared_ctx.all_prepared);
			}
		}
	} while (found_more_threads &&
		 !atomic_read(&shared_ctx.preparation_error));

	/*
	 * All scheduled sibling threads are now done preparing and are blocking
	 * in their task work.  Ask them to commit (or abort, if a preparation
	 * error was recorded).
	 */
	complete_all(&shared_ctx.ready_to_commit);

	/*
	 * Decrement num_unfinished on behalf of the current thread, undoing its
	 * initialization to 1 at the beginning.
	 */
	if (atomic_dec_return(&shared_ctx.num_unfinished) > 0)
		wait_for_completion(&shared_ctx.all_finished);

	tsync_works_release(&works);

	return atomic_read(&shared_ctx.preparation_error);
}