@@ -2404,7 +2404,7 @@ static void put_prev_task_scx(struct rq *rq, struct task_struct *p,
24042404{
24052405 struct scx_sched * sch = scx_root ;
24062406
2407- /* see kick_cpus_irq_workfn () */
2407+ /* see kick_sync_wait_bal_cb () */
24082408 smp_store_release (& rq -> scx .kick_sync , rq -> scx .kick_sync + 1 );
24092409
24102410 update_curr_scx (rq );
@@ -2447,6 +2447,48 @@ static void put_prev_task_scx(struct rq *rq, struct task_struct *p,
24472447 switch_class (rq , next );
24482448}
24492449
/*
 * Balance callback that waits for the CPUs in rq->scx.cpus_to_sync to move
 * past the kick_sync snapshots recorded in the per-CPU scx_kick_syncs array
 * (presumably taken in kick_one_cpu() at kick time — confirm against caller).
 * Runs with @rq locked; may drop the lock and enable IRQs while spinning.
 */
static void kick_sync_wait_bal_cb(struct rq *rq)
{
	struct scx_kick_syncs __rcu *ks = __this_cpu_read(scx_kick_syncs);
	unsigned long *ksyncs = rcu_dereference_sched(ks)->syncs;
	bool waited;
	s32 cpu;

	/*
	 * Drop rq lock and enable IRQs while waiting. IRQs must be enabled
	 * - a target CPU may be waiting for us to process an IPI (e.g. TLB
	 * flush) while we wait for its kick_sync to advance.
	 *
	 * Also, keep advancing our own kick_sync so that new kick_sync waits
	 * targeting us, which can start after we drop the lock, cannot form
	 * cyclic dependencies.
	 */
retry:
	waited = false;
	for_each_cpu(cpu, rq->scx.cpus_to_sync) {
		/*
		 * smp_load_acquire() pairs with smp_store_release() on
		 * kick_sync updates on the target CPUs.
		 *
		 * Skip ourselves (waiting on our own counter could never
		 * terminate) and any CPU whose kick_sync already moved past
		 * the snapshot; such CPUs are dropped from cpus_to_sync.
		 */
		if (cpu == cpu_of(rq) ||
		    smp_load_acquire(&cpu_rq(cpu)->scx.kick_sync) != ksyncs[cpu]) {
			cpumask_clear_cpu(cpu, rq->scx.cpus_to_sync);
			continue;
		}

		/*
		 * Busy-wait for this CPU with the lock dropped and IRQs on,
		 * bumping our own kick_sync each iteration so waiters
		 * targeting us can make progress (see comment above).
		 */
		raw_spin_rq_unlock_irq(rq);
		while (READ_ONCE(cpu_rq(cpu)->scx.kick_sync) == ksyncs[cpu]) {
			smp_store_release(&rq->scx.kick_sync, rq->scx.kick_sync + 1);
			cpu_relax();
		}
		raw_spin_rq_lock_irq(rq);
		waited = true;
	}

	/*
	 * If the lock was dropped, cpus_to_sync may have gained new members
	 * and the already-scanned portion may be stale — rescan until a full
	 * pass completes without waiting.
	 */
	if (waited)
		goto retry;
}
2491+
24502492static struct task_struct * first_local_task (struct rq * rq )
24512493{
24522494 return list_first_entry_or_null (& rq -> scx .local_dsq .list ,
@@ -2460,7 +2502,7 @@ do_pick_task_scx(struct rq *rq, struct rq_flags *rf, bool force_scx)
24602502 bool keep_prev ;
24612503 struct task_struct * p ;
24622504
2463- /* see kick_cpus_irq_workfn () */
2505+ /* see kick_sync_wait_bal_cb () */
24642506 smp_store_release (& rq -> scx .kick_sync , rq -> scx .kick_sync + 1 );
24652507
24662508 rq_modified_begin (rq , & ext_sched_class );
@@ -2470,6 +2512,17 @@ do_pick_task_scx(struct rq *rq, struct rq_flags *rf, bool force_scx)
24702512 rq_repin_lock (rq , rf );
24712513 maybe_queue_balance_callback (rq );
24722514
2515+ /*
2516+ * Defer to a balance callback which can drop rq lock and enable
2517+ * IRQs. Waiting directly in the pick path would deadlock against
2518+ * CPUs sending us IPIs (e.g. TLB flushes) while we wait for them.
2519+ */
2520+ if (unlikely (rq -> scx .kick_sync_pending )) {
2521+ rq -> scx .kick_sync_pending = false;
2522+ queue_balance_callback (rq , & rq -> scx .kick_sync_bal_cb ,
2523+ kick_sync_wait_bal_cb );
2524+ }
2525+
24732526 /*
24742527 * If any higher-priority sched class enqueued a runnable task on
24752528 * this rq during balance_one(), abort and return RETRY_TASK, so
@@ -4713,6 +4766,9 @@ static void scx_dump_state(struct scx_exit_info *ei, size_t dump_len)
47134766 if (!cpumask_empty (rq -> scx .cpus_to_wait ))
47144767 dump_line (& ns , " cpus_to_wait : %*pb" ,
47154768 cpumask_pr_args (rq -> scx .cpus_to_wait ));
4769+ if (!cpumask_empty (rq -> scx .cpus_to_sync ))
4770+ dump_line (& ns , " cpus_to_sync : %*pb" ,
4771+ cpumask_pr_args (rq -> scx .cpus_to_sync ));
47164772
47174773 used = seq_buf_used (& ns );
47184774 if (SCX_HAS_OP (sch , dump_cpu )) {
@@ -5610,11 +5666,11 @@ static bool kick_one_cpu(s32 cpu, struct rq *this_rq, unsigned long *ksyncs)
56105666
56115667 if (cpumask_test_cpu (cpu , this_scx -> cpus_to_wait )) {
56125668 if (cur_class == & ext_sched_class ) {
5669+ cpumask_set_cpu (cpu , this_scx -> cpus_to_sync );
56135670 ksyncs [cpu ] = rq -> scx .kick_sync ;
56145671 should_wait = true;
5615- } else {
5616- cpumask_clear_cpu (cpu , this_scx -> cpus_to_wait );
56175672 }
5673+ cpumask_clear_cpu (cpu , this_scx -> cpus_to_wait );
56185674 }
56195675
56205676 resched_curr (rq );
@@ -5669,27 +5725,15 @@ static void kick_cpus_irq_workfn(struct irq_work *irq_work)
56695725 cpumask_clear_cpu (cpu , this_scx -> cpus_to_kick_if_idle );
56705726 }
56715727
5672- if (!should_wait )
5673- return ;
5674-
5675- for_each_cpu (cpu , this_scx -> cpus_to_wait ) {
5676- unsigned long * wait_kick_sync = & cpu_rq (cpu )-> scx .kick_sync ;
5677-
5678- /*
5679- * Busy-wait until the task running at the time of kicking is no
5680- * longer running. This can be used to implement e.g. core
5681- * scheduling.
5682- *
5683- * smp_cond_load_acquire() pairs with store_releases in
5684- * pick_task_scx() and put_prev_task_scx(). The former breaks
5685- * the wait if SCX's scheduling path is entered even if the same
5686- * task is picked subsequently. The latter is necessary to break
5687- * the wait when $cpu is taken by a higher sched class.
5688- */
5689- if (cpu != cpu_of (this_rq ))
5690- smp_cond_load_acquire (wait_kick_sync , VAL != ksyncs [cpu ]);
5691-
5692- cpumask_clear_cpu (cpu , this_scx -> cpus_to_wait );
5728+ /*
5729+ * Can't wait in hardirq — kick_sync can't advance, deadlocking if
5730+ * CPUs wait for each other. Defer to kick_sync_wait_bal_cb().
5731+ */
5732+ if (should_wait ) {
5733+ raw_spin_rq_lock (this_rq );
5734+ this_scx -> kick_sync_pending = true;
5735+ resched_curr (this_rq );
5736+ raw_spin_rq_unlock (this_rq );
56935737 }
56945738}
56955739
@@ -5794,6 +5838,7 @@ void __init init_sched_ext_class(void)
57945838 BUG_ON (!zalloc_cpumask_var_node (& rq -> scx .cpus_to_kick_if_idle , GFP_KERNEL , n ));
57955839 BUG_ON (!zalloc_cpumask_var_node (& rq -> scx .cpus_to_preempt , GFP_KERNEL , n ));
57965840 BUG_ON (!zalloc_cpumask_var_node (& rq -> scx .cpus_to_wait , GFP_KERNEL , n ));
5841+ BUG_ON (!zalloc_cpumask_var_node (& rq -> scx .cpus_to_sync , GFP_KERNEL , n ));
57975842 rq -> scx .deferred_irq_work = IRQ_WORK_INIT_HARD (deferred_irq_workfn );
57985843 rq -> scx .kick_cpus_irq_work = IRQ_WORK_INIT_HARD (kick_cpus_irq_workfn );
57995844
0 commit comments