Diffstat (limited to 'kernel/bpf/rqspinlock.c')
-rw-r--r--  kernel/bpf/rqspinlock.c | 90
1 file changed, 46 insertions(+), 44 deletions(-)
diff --git a/kernel/bpf/rqspinlock.c b/kernel/bpf/rqspinlock.c
index a00561b1d3e5..f7d0c8d4644e 100644
--- a/kernel/bpf/rqspinlock.c
+++ b/kernel/bpf/rqspinlock.c
@@ -89,15 +89,14 @@ struct rqspinlock_timeout {
DEFINE_PER_CPU_ALIGNED(struct rqspinlock_held, rqspinlock_held_locks);
EXPORT_SYMBOL_GPL(rqspinlock_held_locks);
-static bool is_lock_released(rqspinlock_t *lock, u32 mask, struct rqspinlock_timeout *ts)
+static bool is_lock_released(rqspinlock_t *lock, u32 mask)
{
if (!(atomic_read_acquire(&lock->val) & (mask)))
return true;
return false;
}
-static noinline int check_deadlock_AA(rqspinlock_t *lock, u32 mask,
- struct rqspinlock_timeout *ts)
+static noinline int check_deadlock_AA(rqspinlock_t *lock)
{
struct rqspinlock_held *rqh = this_cpu_ptr(&rqspinlock_held_locks);
int cnt = min(RES_NR_HELD, rqh->cnt);
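The body of check_deadlock_AA() falls outside the hunk context, but it only consults this CPU's held-locks table, which is why the mask and timeout arguments can be dropped. A rough sketch of its shape, reconstructed from the fields visible in this hunk rather than quoted from the source; the top entry is skipped because it corresponds to the acquisition currently in progress:

        /* Sketch: AA deadlock means this CPU already holds the lock it is
         * now attempting, so scan our own held-locks entries (excluding the
         * freshly grabbed top entry for this attempt).
         */
        for (int i = 0; i < cnt - 1; i++) {
                if (rqh->locks[i] == lock)
                        return -EDEADLK;
        }
        return 0;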
@@ -118,8 +117,7 @@ static noinline int check_deadlock_AA(rqspinlock_t *lock, u32 mask,
* more locks, which reduce to ABBA). This is not exhaustive, and we rely on
* timeouts as the final line of defense.
*/
-static noinline int check_deadlock_ABBA(rqspinlock_t *lock, u32 mask,
- struct rqspinlock_timeout *ts)
+static noinline int check_deadlock_ABBA(rqspinlock_t *lock, u32 mask)
{
struct rqspinlock_held *rqh = this_cpu_ptr(&rqspinlock_held_locks);
int rqh_cnt = min(RES_NR_HELD, rqh->cnt);
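For readers unfamiliar with the terminology, the ABBA case the comment refers to is the classic two-CPU, two-lock cycle. A minimal illustrative example follows; the lock names and the raw_res_spin_lock()/raw_res_spin_unlock() wrappers are assumptions made for illustration and are not part of this patch:

        static rqspinlock_t lock_a, lock_b;     /* hypothetical locks, for illustration */

        /* Runs on CPU 0: takes A, then B. */
        static void cpu0_path(void)
        {
                if (raw_res_spin_lock(&lock_a))
                        return;
                /* May fail with -EDEADLK if CPU 1 holds B and is waiting on A. */
                if (raw_res_spin_lock(&lock_b)) {
                        raw_res_spin_unlock(&lock_a);
                        return;
                }
                raw_res_spin_unlock(&lock_b);
                raw_res_spin_unlock(&lock_a);
        }

        /* Runs on CPU 1: takes B, then A, closing the cycle. */
        static void cpu1_path(void)
        {
                if (raw_res_spin_lock(&lock_b))
                        return;
                /* May fail with -EDEADLK if CPU 0 holds A and is waiting on B. */
                if (raw_res_spin_lock(&lock_a)) {
                        raw_res_spin_unlock(&lock_b);
                        return;
                }
                raw_res_spin_unlock(&lock_a);
                raw_res_spin_unlock(&lock_b);
        }

check_deadlock_ABBA() detects this cycle by scanning the other CPUs' held-locks tables, which is why, as the check_timeout() hunk further down shows, it is only re-run roughly once per millisecond, while the cheap AA check is performed once up front.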
@@ -142,7 +140,7 @@ static noinline int check_deadlock_ABBA(rqspinlock_t *lock, u32 mask,
* Let's ensure to break out of this loop if the lock is available for
* us to potentially acquire.
*/
- if (is_lock_released(lock, mask, ts))
+ if (is_lock_released(lock, mask))
return 0;
/*
@@ -198,33 +196,21 @@ static noinline int check_deadlock_ABBA(rqspinlock_t *lock, u32 mask,
return 0;
}
-static noinline int check_deadlock(rqspinlock_t *lock, u32 mask,
- struct rqspinlock_timeout *ts)
-{
- int ret;
-
- ret = check_deadlock_AA(lock, mask, ts);
- if (ret)
- return ret;
- ret = check_deadlock_ABBA(lock, mask, ts);
- if (ret)
- return ret;
-
- return 0;
-}
-
static noinline int check_timeout(rqspinlock_t *lock, u32 mask,
struct rqspinlock_timeout *ts)
{
- u64 time = ktime_get_mono_fast_ns();
u64 prev = ts->cur;
+ u64 time;
if (!ts->timeout_end) {
- ts->cur = time;
- ts->timeout_end = time + ts->duration;
+ if (check_deadlock_AA(lock))
+ return -EDEADLK;
+ ts->cur = ktime_get_mono_fast_ns();
+ ts->timeout_end = ts->cur + ts->duration;
return 0;
}
+ time = ktime_get_mono_fast_ns();
if (time > ts->timeout_end)
return -ETIMEDOUT;
@@ -234,7 +220,7 @@ static noinline int check_timeout(rqspinlock_t *lock, u32 mask,
*/
if (prev + NSEC_PER_MSEC < time) {
ts->cur = time;
- return check_deadlock(lock, mask, ts);
+ return check_deadlock_ABBA(lock, mask);
}
return 0;
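Stitching the two hunks above together, the restructured check_timeout() reads roughly as follows. The lines elided by the diff context are filled in from the surrounding code and the comments are descriptive rather than quoted, so treat this as a reconstruction, not the authoritative post-patch source:

        static noinline int check_timeout(rqspinlock_t *lock, u32 mask,
                                          struct rqspinlock_timeout *ts)
        {
                u64 prev = ts->cur;
                u64 time;

                if (!ts->timeout_end) {
                        /* First invocation: run AA detection once, then arm the timeout. */
                        if (check_deadlock_AA(lock))
                                return -EDEADLK;
                        ts->cur = ktime_get_mono_fast_ns();
                        ts->timeout_end = ts->cur + ts->duration;
                        return 0;
                }

                time = ktime_get_mono_fast_ns();
                if (time > ts->timeout_end)
                        return -ETIMEDOUT;

                /* Re-run the more expensive ABBA scan at most once per millisecond. */
                if (prev + NSEC_PER_MSEC < time) {
                        ts->cur = time;
                        return check_deadlock_ABBA(lock, mask);
                }

                return 0;
        }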
@@ -278,6 +264,10 @@ int __lockfunc resilient_tas_spin_lock(rqspinlock_t *lock)
int val, ret = 0;
RES_INIT_TIMEOUT(ts);
+ /*
+ * The fast path is not invoked for the TAS fallback, so we must grab
+ * the deadlock detection entry here.
+ */
grab_held_lock_entry(lock);
/*
@@ -400,10 +390,7 @@ int __lockfunc resilient_queued_spin_lock_slowpath(rqspinlock_t *lock, u32 val)
goto queue;
}
- /*
- * Grab an entry in the held locks array, to enable deadlock detection.
- */
- grab_held_lock_entry(lock);
+ /* Deadlock detection entry already held after failing fast path. */
/*
* We're pending, wait for the owner to go away.
@@ -450,12 +437,21 @@ int __lockfunc resilient_queued_spin_lock_slowpath(rqspinlock_t *lock, u32 val)
* queuing.
*/
queue:
- lockevent_inc(lock_slowpath);
/*
- * Grab deadlock detection entry for the queue path.
+ * Do not queue if we're a waiter and someone is attempting this lock on
+ * the same CPU. In case of NMIs, this prevents long timeouts where we
+ * interrupt the pending waiter, and the owner, that will eventually
+ * signal the head of our queue, both of which are logically but not
+ * physically part of the queue, hence outside the scope of the idx > 0
+ * check above for the trylock fallback.
*/
- grab_held_lock_entry(lock);
+ if (check_deadlock_AA(lock)) {
+ ret = -EDEADLK;
+ goto err_release_entry;
+ }
+ lockevent_inc(lock_slowpath);
+ /* Deadlock detection entry already held after failing fast path. */
node = this_cpu_ptr(&rqnodes[0].mcs);
idx = node->count++;
tail = encode_tail(smp_processor_id(), idx);
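The AA check can catch the NMI-vs-waiter case described above because the deadlock-detection entry is deposited into the per-CPU table before the lock is actually acquired, so a nested attempt on the same CPU sees the interrupted attempt's entry. A rough, illustrative sketch of that bookkeeping helper (the real helper is defined elsewhere in the rqspinlock sources; this shows the idea, not the exact body):

        static __always_inline void grab_held_lock_entry(void *lock)
        {
                int cnt = this_cpu_inc_return(rqspinlock_held_locks.cnt);

                /* Table full: keep the count bumped so release stays balanced. */
                if (unlikely(cnt > RES_NR_HELD))
                        return;
                this_cpu_write(rqspinlock_held_locks.locks[cnt - 1], lock);
        }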
@@ -467,19 +463,17 @@ queue:
* not be nested NMIs taking spinlocks. That may not be true in
* some architectures even though the chance of needing more than
* 4 nodes will still be extremely unlikely. When that happens,
- * we fall back to spinning on the lock directly without using
- * any MCS node. This is not the most elegant solution, but is
- * simple enough.
+ * we fall back to attempting a trylock operation without using
+ * any MCS node. Unlike qspinlock which cannot fail, we have the
+ * option of failing the slow path, and under contention, such a
+ * trylock spinning will likely be treated unfairly due to lack of
+ * queueing, hence do not spin.
*/
- if (unlikely(idx >= _Q_MAX_NODES || in_nmi())) {
+ if (unlikely(idx >= _Q_MAX_NODES || (in_nmi() && idx > 0))) {
lockevent_inc(lock_no_node);
- RES_RESET_TIMEOUT(ts, RES_DEF_TIMEOUT);
- while (!queued_spin_trylock(lock)) {
- if (RES_CHECK_TIMEOUT(ts, ret, ~0u)) {
- lockevent_inc(rqspinlock_lock_timeout);
- goto err_release_node;
- }
- cpu_relax();
+ if (!queued_spin_trylock(lock)) {
+ ret = -EDEADLK;
+ goto err_release_node;
}
goto release;
}
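Since the no-node fallback now fails immediately instead of spinning on trylock with a timeout, the error simply propagates to the caller, which is what makes failing the slow path an acceptable option. A minimal usage sketch, assuming the raw_res_spin_lock()/raw_res_spin_unlock() wrappers from the rqspinlock header; the lock and function names are hypothetical and not part of this diff:

        static rqspinlock_t demo_lock;  /* hypothetical lock, for illustration */

        static int demo_critical_section(void)
        {
                int ret = raw_res_spin_lock(&demo_lock);

                /* -EDEADLK or -ETIMEDOUT: give up instead of hanging. */
                if (ret)
                        return ret;
                /* ... work under the lock ... */
                raw_res_spin_unlock(&demo_lock);
                return 0;
        }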
@@ -540,7 +534,7 @@ queue:
val = arch_mcs_spin_lock_contended(&node->locked);
if (val == RES_TIMEOUT_VAL) {
- ret = -EDEADLK;
+ ret = -ETIMEDOUT;
goto waitq_timeout;
}
@@ -575,6 +569,14 @@ queue:
val = res_atomic_cond_read_acquire(&lock->val, !(VAL & _Q_LOCKED_PENDING_MASK) ||
RES_CHECK_TIMEOUT(ts, ret, _Q_LOCKED_PENDING_MASK));
+ /* Disable queue destruction when we detect deadlocks. */
+ if (ret == -EDEADLK) {
+ if (!next)
+ next = smp_cond_load_relaxed(&node->next, (VAL));
+ arch_mcs_spin_unlock_contended(&next->locked);
+ goto err_release_node;
+ }
+
waitq_timeout:
if (ret) {
/*