Skip to content

Commit 9147566

Browse files
committed
Merge tag 'sched_ext-for-7.0-rc6-fixes' of git://git.kernel.org/pub/scm/linux/kernel/git/tj/sched_ext
Pull sched_ext fixes from Tejun Heo: - Fix SCX_KICK_WAIT deadlock where multiple CPUs waiting for each other in hardirq context form a cycle. Move the wait to a balance callback which can drop the rq lock and process IPIs. - Fix inconsistent NUMA node lookup in scx_select_cpu_dfl() where the waker_node used cpu_to_node() while prev_cpu used scx_cpu_node_if_enabled(), leading to undefined behavior when per-node idle tracking is disabled. * tag 'sched_ext-for-7.0-rc6-fixes' of git://git.kernel.org/pub/scm/linux/kernel/git/tj/sched_ext: selftests/sched_ext: Add cyclic SCX_KICK_WAIT stress test sched_ext: Fix SCX_KICK_WAIT deadlock by deferring wait to balance callback sched_ext: Fix inconsistent NUMA node lookup in scx_select_cpu_dfl()
2 parents 0958d65 + 090d34f commit 9147566

6 files changed

Lines changed: 337 additions & 26 deletions

File tree

kernel/sched/ext.c

Lines changed: 70 additions & 25 deletions
Original file line numberDiff line numberDiff line change
@@ -2404,7 +2404,7 @@ static void put_prev_task_scx(struct rq *rq, struct task_struct *p,
24042404
{
24052405
struct scx_sched *sch = scx_root;
24062406

2407-
/* see kick_cpus_irq_workfn() */
2407+
/* see kick_sync_wait_bal_cb() */
24082408
smp_store_release(&rq->scx.kick_sync, rq->scx.kick_sync + 1);
24092409

24102410
update_curr_scx(rq);
@@ -2447,6 +2447,48 @@ static void put_prev_task_scx(struct rq *rq, struct task_struct *p,
24472447
switch_class(rq, next);
24482448
}
24492449

2450+
/*
 * Balance callback that performs the SCX_KICK_WAIT wait deferred from
 * kick_cpus_irq_workfn(): spin until every CPU in rq->scx.cpus_to_sync has
 * advanced its kick_sync counter past the snapshot recorded in this CPU's
 * scx_kick_syncs array (snapshots are taken in kick_one_cpu()).
 *
 * Runs with the rq lock held and IRQs disabled; the lock is dropped and
 * IRQs are enabled around each busy-wait, see the comment below for why.
 */
static void kick_sync_wait_bal_cb(struct rq *rq)
{
	struct scx_kick_syncs __rcu *ks = __this_cpu_read(scx_kick_syncs);
	unsigned long *ksyncs = rcu_dereference_sched(ks)->syncs;
	bool waited;
	s32 cpu;

	/*
	 * Drop rq lock and enable IRQs while waiting. IRQs must be enabled
	 * — a target CPU may be waiting for us to process an IPI (e.g. TLB
	 * flush) while we wait for its kick_sync to advance.
	 *
	 * Also, keep advancing our own kick_sync so that new kick_sync waits
	 * targeting us, which can start after we drop the lock, cannot form
	 * cyclic dependencies.
	 */
retry:
	waited = false;
	for_each_cpu(cpu, rq->scx.cpus_to_sync) {
		/*
		 * smp_load_acquire() pairs with smp_store_release() on
		 * kick_sync updates on the target CPUs.
		 */
		if (cpu == cpu_of(rq) ||
		    smp_load_acquire(&cpu_rq(cpu)->scx.kick_sync) != ksyncs[cpu]) {
			/*
			 * Never wait on ourselves. A CPU whose kick_sync has
			 * already advanced is done; this also lazily clears,
			 * on the retry pass, CPUs we finished waiting on
			 * below.
			 */
			cpumask_clear_cpu(cpu, rq->scx.cpus_to_sync);
			continue;
		}

		/* Unlocked, IRQs-on busy wait; see the block comment above. */
		raw_spin_rq_unlock_irq(rq);
		while (READ_ONCE(cpu_rq(cpu)->scx.kick_sync) == ksyncs[cpu]) {
			/* Advance our own counter to break potential cycles. */
			smp_store_release(&rq->scx.kick_sync, rq->scx.kick_sync + 1);
			cpu_relax();
		}
		raw_spin_rq_lock_irq(rq);
		waited = true;
	}

	/*
	 * The mask may have gained members while the lock was dropped;
	 * rescan until a full pass completes without waiting.
	 */
	if (waited)
		goto retry;
}
2491+
24502492
static struct task_struct *first_local_task(struct rq *rq)
24512493
{
24522494
return list_first_entry_or_null(&rq->scx.local_dsq.list,
@@ -2460,7 +2502,7 @@ do_pick_task_scx(struct rq *rq, struct rq_flags *rf, bool force_scx)
24602502
bool keep_prev;
24612503
struct task_struct *p;
24622504

2463-
/* see kick_cpus_irq_workfn() */
2505+
/* see kick_sync_wait_bal_cb() */
24642506
smp_store_release(&rq->scx.kick_sync, rq->scx.kick_sync + 1);
24652507

24662508
rq_modified_begin(rq, &ext_sched_class);
@@ -2470,6 +2512,17 @@ do_pick_task_scx(struct rq *rq, struct rq_flags *rf, bool force_scx)
24702512
rq_repin_lock(rq, rf);
24712513
maybe_queue_balance_callback(rq);
24722514

2515+
/*
2516+
* Defer to a balance callback which can drop rq lock and enable
2517+
* IRQs. Waiting directly in the pick path would deadlock against
2518+
* CPUs sending us IPIs (e.g. TLB flushes) while we wait for them.
2519+
*/
2520+
if (unlikely(rq->scx.kick_sync_pending)) {
2521+
rq->scx.kick_sync_pending = false;
2522+
queue_balance_callback(rq, &rq->scx.kick_sync_bal_cb,
2523+
kick_sync_wait_bal_cb);
2524+
}
2525+
24732526
/*
24742527
* If any higher-priority sched class enqueued a runnable task on
24752528
* this rq during balance_one(), abort and return RETRY_TASK, so
@@ -4713,6 +4766,9 @@ static void scx_dump_state(struct scx_exit_info *ei, size_t dump_len)
47134766
if (!cpumask_empty(rq->scx.cpus_to_wait))
47144767
dump_line(&ns, " cpus_to_wait : %*pb",
47154768
cpumask_pr_args(rq->scx.cpus_to_wait));
4769+
if (!cpumask_empty(rq->scx.cpus_to_sync))
4770+
dump_line(&ns, " cpus_to_sync : %*pb",
4771+
cpumask_pr_args(rq->scx.cpus_to_sync));
47164772

47174773
used = seq_buf_used(&ns);
47184774
if (SCX_HAS_OP(sch, dump_cpu)) {
@@ -5610,11 +5666,11 @@ static bool kick_one_cpu(s32 cpu, struct rq *this_rq, unsigned long *ksyncs)
56105666

56115667
if (cpumask_test_cpu(cpu, this_scx->cpus_to_wait)) {
56125668
if (cur_class == &ext_sched_class) {
5669+
cpumask_set_cpu(cpu, this_scx->cpus_to_sync);
56135670
ksyncs[cpu] = rq->scx.kick_sync;
56145671
should_wait = true;
5615-
} else {
5616-
cpumask_clear_cpu(cpu, this_scx->cpus_to_wait);
56175672
}
5673+
cpumask_clear_cpu(cpu, this_scx->cpus_to_wait);
56185674
}
56195675

56205676
resched_curr(rq);
@@ -5669,27 +5725,15 @@ static void kick_cpus_irq_workfn(struct irq_work *irq_work)
56695725
cpumask_clear_cpu(cpu, this_scx->cpus_to_kick_if_idle);
56705726
}
56715727

5672-
if (!should_wait)
5673-
return;
5674-
5675-
for_each_cpu(cpu, this_scx->cpus_to_wait) {
5676-
unsigned long *wait_kick_sync = &cpu_rq(cpu)->scx.kick_sync;
5677-
5678-
/*
5679-
* Busy-wait until the task running at the time of kicking is no
5680-
* longer running. This can be used to implement e.g. core
5681-
* scheduling.
5682-
*
5683-
* smp_cond_load_acquire() pairs with store_releases in
5684-
* pick_task_scx() and put_prev_task_scx(). The former breaks
5685-
* the wait if SCX's scheduling path is entered even if the same
5686-
* task is picked subsequently. The latter is necessary to break
5687-
* the wait when $cpu is taken by a higher sched class.
5688-
*/
5689-
if (cpu != cpu_of(this_rq))
5690-
smp_cond_load_acquire(wait_kick_sync, VAL != ksyncs[cpu]);
5691-
5692-
cpumask_clear_cpu(cpu, this_scx->cpus_to_wait);
5728+
/*
5729+
* Can't wait in hardirq — kick_sync can't advance, deadlocking if
5730+
* CPUs wait for each other. Defer to kick_sync_wait_bal_cb().
5731+
*/
5732+
if (should_wait) {
5733+
raw_spin_rq_lock(this_rq);
5734+
this_scx->kick_sync_pending = true;
5735+
resched_curr(this_rq);
5736+
raw_spin_rq_unlock(this_rq);
56935737
}
56945738
}
56955739

@@ -5794,6 +5838,7 @@ void __init init_sched_ext_class(void)
57945838
BUG_ON(!zalloc_cpumask_var_node(&rq->scx.cpus_to_kick_if_idle, GFP_KERNEL, n));
57955839
BUG_ON(!zalloc_cpumask_var_node(&rq->scx.cpus_to_preempt, GFP_KERNEL, n));
57965840
BUG_ON(!zalloc_cpumask_var_node(&rq->scx.cpus_to_wait, GFP_KERNEL, n));
5841+
BUG_ON(!zalloc_cpumask_var_node(&rq->scx.cpus_to_sync, GFP_KERNEL, n));
57975842
rq->scx.deferred_irq_work = IRQ_WORK_INIT_HARD(deferred_irq_workfn);
57985843
rq->scx.kick_cpus_irq_work = IRQ_WORK_INIT_HARD(kick_cpus_irq_workfn);
57995844

kernel/sched/ext_idle.c

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -543,7 +543,7 @@ s32 scx_select_cpu_dfl(struct task_struct *p, s32 prev_cpu, u64 wake_flags,
543543
* piled up on it even if there is an idle core elsewhere on
544544
* the system.
545545
*/
546-
waker_node = cpu_to_node(cpu);
546+
waker_node = scx_cpu_node_if_enabled(cpu);
547547
if (!(current->flags & PF_EXITING) &&
548548
cpu_rq(cpu)->scx.local_dsq.nr == 0 &&
549549
(!(flags & SCX_PICK_IDLE_IN_NODE) || (waker_node == node)) &&

kernel/sched/sched.h

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -805,9 +805,12 @@ struct scx_rq {
805805
cpumask_var_t cpus_to_kick_if_idle;
806806
cpumask_var_t cpus_to_preempt;
807807
cpumask_var_t cpus_to_wait;
808+
cpumask_var_t cpus_to_sync;
809+
bool kick_sync_pending;
808810
unsigned long kick_sync;
809811
local_t reenq_local_deferred;
810812
struct balance_callback deferred_bal_cb;
813+
struct balance_callback kick_sync_bal_cb;
811814
struct irq_work deferred_irq_work;
812815
struct irq_work kick_cpus_irq_work;
813816
struct scx_dispatch_q bypass_dsq;

tools/testing/selftests/sched_ext/Makefile

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -188,6 +188,7 @@ auto-test-targets := \
188188
rt_stall \
189189
test_example \
190190
total_bw \
191+
cyclic_kick_wait \
191192

192193
testcase-targets := $(addsuffix .o,$(addprefix $(SCXOBJ_DIR)/,$(auto-test-targets)))
193194

Lines changed: 68 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,68 @@
1+
/* SPDX-License-Identifier: GPL-2.0 */
2+
/*
3+
* Stress concurrent SCX_KICK_WAIT calls to reproduce wait-cycle deadlock.
4+
*
5+
* Three CPUs are designated from userspace. Every enqueue from one of the
6+
* three CPUs kicks the next CPU in the ring with SCX_KICK_WAIT, creating a
7+
* persistent A -> B -> C -> A wait cycle pressure.
8+
*/
9+
#include <scx/common.bpf.h>
10+
11+
char _license[] SEC("license") = "GPL";
12+
13+
const volatile s32 test_cpu_a;
14+
const volatile s32 test_cpu_b;
15+
const volatile s32 test_cpu_c;
16+
17+
u64 nr_enqueues;
18+
u64 nr_wait_kicks;
19+
20+
UEI_DEFINE(uei);
21+
22+
/*
 * Map one of the three designated CPUs to the next CPU in the
 * A -> B -> C -> A kick ring. CPUs outside the ring map to -1.
 */
static s32 target_cpu(s32 cpu)
{
	if (cpu == test_cpu_a)
		return test_cpu_b;
	if (cpu == test_cpu_b)
		return test_cpu_c;

	return cpu == test_cpu_c ? test_cpu_a : -1;
}
32+
33+
/*
 * Enqueue path: queue the task normally, then, if running on one of the
 * three designated CPUs, kick the next CPU in the ring with SCX_KICK_WAIT
 * to keep cyclic wait pressure up.
 */
void BPF_STRUCT_OPS(cyclic_kick_wait_enqueue, struct task_struct *p,
		    u64 enq_flags)
{
	s32 self = bpf_get_smp_processor_id();
	s32 victim;

	__sync_fetch_and_add(&nr_enqueues, 1);

	/* Keep kthreads responsive: run locally, preempting, with an
	 * infinite slice. */
	if (p->flags & PF_KTHREAD) {
		scx_bpf_dsq_insert(p, SCX_DSQ_LOCAL, SCX_SLICE_INF,
				   enq_flags | SCX_ENQ_PREEMPT);
		return;
	}

	scx_bpf_dsq_insert(p, SCX_DSQ_GLOBAL, SCX_SLICE_DFL, enq_flags);

	victim = target_cpu(self);
	if (victim >= 0 && victim != self) {
		__sync_fetch_and_add(&nr_wait_kicks, 1);
		scx_bpf_kick_cpu(victim, SCX_KICK_WAIT);
	}
}
56+
57+
/* Record exit info so the userspace driver can check why the scheduler
 * exited (e.g. watchdog timeout vs. clean unload). */
void BPF_STRUCT_OPS(cyclic_kick_wait_exit, struct scx_exit_info *ei)
{
	UEI_RECORD(uei, ei);
}
61+
62+
/*
 * struct_ops registration, attached via BPF link by the test driver.
 * timeout_ms of 1000 is the runnable-task watchdog limit — presumably
 * kept short so a reproduced deadlock aborts the scheduler quickly
 * rather than hanging the test (verify against the userspace side).
 */
SEC(".struct_ops.link")
struct sched_ext_ops cyclic_kick_wait_ops = {
	.enqueue	= cyclic_kick_wait_enqueue,
	.exit		= cyclic_kick_wait_exit,
	.name		= "cyclic_kick_wait",
	.timeout_ms	= 1000U,
};

0 commit comments

Comments
 (0)