sched/rt: Fix PI handling vs. sched_setscheduler()

Andrea Parri reported:

> I found that the following scenario (with CONFIG_RT_GROUP_SCHED=y) is not
> handled correctly:
>
>     T1 (prio = 20)
>        lock(rtmutex);
>
>     T2 (prio = 20)
>        blocks on rtmutex  (rt_nr_boosted = 0 on T1's rq)
>
>     T1 (prio = 20)
>        sys_set_scheduler(prio = 0)
>           [new_effective_prio == oldprio]
>           T1 prio = 20    (rt_nr_boosted = 0 on T1's rq)
>
> The last step is incorrect as T1 is now boosted (c.f., rt_se_boosted());
> in particular, if we continue with
>
>    T1 (prio = 20)
>       unlock(rtmutex)
>          wakeup(T2)
>          adjust_prio(T1)
>             [prio != rt_mutex_getprio(T1)]
>	    dequeue(T1)
>	       rt_nr_boosted = (unsigned long)(-1)
>	       ...
>             T1 prio = 0
>
> then we end up leaving rt_nr_boosted in an "inconsistent" state.
>
> The simple program attached could reproduce the previous scenario; note
> that, as a consequence of the presence of this state, the "assertion"
>
>     WARN_ON(!rt_nr_running && rt_nr_boosted)
>
> from dec_rt_group() may trigger.

So normally we dequeue/enqueue tasks in sched_setscheduler(), which
would ensure the accounting stays correct. However in the early PI path
we fail to do so.

So this was introduced at around v3.14, by:

  c365c292d0 ("sched: Consider pi boosting in setscheduler()")

which fixed another problem exactly because that dequeue/enqueue, joy.

Fix this by teaching rt about DEQUEUE_SAVE/ENQUEUE_RESTORE and have it
preserve runqueue location with that option. This requires decoupling
the on_rt_rq() state from being on the list.

In order to allow for explicit movement during the SAVE/RESTORE,
introduce {DE,EN}QUEUE_MOVE. We still must use SAVE/RESTORE in these
cases to preserve other invariants.

Respecting the SAVE/RESTORE flags also has the (nice) side-effect that
things like sys_nice()/sys_sched_setaffinity() also do not reorder
FIFO tasks (whereas they used to before this patch).

Reported-by: Andrea Parri <parri.andrea@gmail.com>
Tested-by: Andrea Parri <parri.andrea@gmail.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: Juri Lelli <juri.lelli@arm.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Steven Rostedt <rostedt@goodmis.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Ingo Molnar <mingo@kernel.org>
This commit is contained in:
Peter Zijlstra
2016-01-18 15:27:07 +01:00
committed by Ingo Molnar
parent 41d9339733
commit ff77e46853
4 changed files with 113 additions and 50 deletions

View File

@@ -2221,6 +2221,10 @@ static void __sched_fork(unsigned long clone_flags, struct task_struct *p)
__dl_clear_params(p);
INIT_LIST_HEAD(&p->rt.run_list);
p->rt.timeout = 0;
p->rt.time_slice = sched_rr_timeslice;
p->rt.on_rq = 0;
p->rt.on_list = 0;
#ifdef CONFIG_PREEMPT_NOTIFIERS
INIT_HLIST_HEAD(&p->preempt_notifiers);
@@ -3531,7 +3535,7 @@ EXPORT_SYMBOL(default_wake_function);
*/
void rt_mutex_setprio(struct task_struct *p, int prio)
{
int oldprio, queued, running, enqueue_flag = ENQUEUE_RESTORE;
int oldprio, queued, running, queue_flag = DEQUEUE_SAVE | DEQUEUE_MOVE;
struct rq *rq;
const struct sched_class *prev_class;
@@ -3559,11 +3563,15 @@ void rt_mutex_setprio(struct task_struct *p, int prio)
trace_sched_pi_setprio(p, prio);
oldprio = p->prio;
if (oldprio == prio)
queue_flag &= ~DEQUEUE_MOVE;
prev_class = p->sched_class;
queued = task_on_rq_queued(p);
running = task_current(rq, p);
if (queued)
dequeue_task(rq, p, DEQUEUE_SAVE);
dequeue_task(rq, p, queue_flag);
if (running)
put_prev_task(rq, p);
@@ -3581,7 +3589,7 @@ void rt_mutex_setprio(struct task_struct *p, int prio)
if (!dl_prio(p->normal_prio) ||
(pi_task && dl_entity_preempt(&pi_task->dl, &p->dl))) {
p->dl.dl_boosted = 1;
enqueue_flag |= ENQUEUE_REPLENISH;
queue_flag |= ENQUEUE_REPLENISH;
} else
p->dl.dl_boosted = 0;
p->sched_class = &dl_sched_class;
@@ -3589,7 +3597,7 @@ void rt_mutex_setprio(struct task_struct *p, int prio)
if (dl_prio(oldprio))
p->dl.dl_boosted = 0;
if (oldprio < prio)
enqueue_flag |= ENQUEUE_HEAD;
queue_flag |= ENQUEUE_HEAD;
p->sched_class = &rt_sched_class;
} else {
if (dl_prio(oldprio))
@@ -3604,7 +3612,7 @@ void rt_mutex_setprio(struct task_struct *p, int prio)
if (running)
p->sched_class->set_curr_task(rq);
if (queued)
enqueue_task(rq, p, enqueue_flag);
enqueue_task(rq, p, queue_flag);
check_class_changed(rq, p, prev_class, oldprio);
out_unlock:
@@ -3960,6 +3968,7 @@ static int __sched_setscheduler(struct task_struct *p,
const struct sched_class *prev_class;
struct rq *rq;
int reset_on_fork;
int queue_flags = DEQUEUE_SAVE | DEQUEUE_MOVE;
/* may grab non-irq protected spin_locks */
BUG_ON(in_interrupt());
@@ -4142,17 +4151,14 @@ change:
* itself.
*/
new_effective_prio = rt_mutex_get_effective_prio(p, newprio);
if (new_effective_prio == oldprio) {
__setscheduler_params(p, attr);
task_rq_unlock(rq, p, &flags);
return 0;
}
if (new_effective_prio == oldprio)
queue_flags &= ~DEQUEUE_MOVE;
}
queued = task_on_rq_queued(p);
running = task_current(rq, p);
if (queued)
dequeue_task(rq, p, DEQUEUE_SAVE);
dequeue_task(rq, p, queue_flags);
if (running)
put_prev_task(rq, p);
@@ -4162,15 +4168,14 @@ change:
if (running)
p->sched_class->set_curr_task(rq);
if (queued) {
int enqueue_flags = ENQUEUE_RESTORE;
/*
* We enqueue to tail when the priority of a task is
* increased (user space view).
*/
if (oldprio <= p->prio)
enqueue_flags |= ENQUEUE_HEAD;
if (oldprio < p->prio)
queue_flags |= ENQUEUE_HEAD;
enqueue_task(rq, p, enqueue_flags);
enqueue_task(rq, p, queue_flags);
}
check_class_changed(rq, p, prev_class, oldprio);
@@ -7958,7 +7963,7 @@ void sched_move_task(struct task_struct *tsk)
queued = task_on_rq_queued(tsk);
if (queued)
dequeue_task(rq, tsk, DEQUEUE_SAVE);
dequeue_task(rq, tsk, DEQUEUE_SAVE | DEQUEUE_MOVE);
if (unlikely(running))
put_prev_task(rq, tsk);
@@ -7982,7 +7987,7 @@ void sched_move_task(struct task_struct *tsk)
if (unlikely(running))
tsk->sched_class->set_curr_task(rq);
if (queued)
enqueue_task(rq, tsk, ENQUEUE_RESTORE);
enqueue_task(rq, tsk, ENQUEUE_RESTORE | ENQUEUE_MOVE);
task_rq_unlock(rq, tsk, &flags);
}