Merge branch 'timers-core-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip

Pull core timer updates from Thomas Gleixner:
 "Timers and timekeeping updates:

   - A large overhaul of the posix CPU timer code which is a preparation
     for moving the CPU timer expiry out into task work so it can be
     properly accounted on the task/process.

     An update to the bogus permission checks will come later during the
     merge window as feedback was not complete before heading of for
     travel.

   - Switch the timerqueue code to use cached rbtrees and get rid of the
     homebrewn caching of the leftmost node.

   - Consolidate hrtimer_init() + hrtimer_init_sleeper() calls into a
     single function

   - Implement the separation of hrtimers to be forced to expire in hard
     interrupt context even when PREEMPT_RT is enabled and mark the
     affected timers accordingly.

   - Implement a mechanism for hrtimers and the timer wheel to protect
     RT against priority inversion and live lock issues when a (hr)timer
     which should be canceled is currently executing the callback.
     Instead of infinitely spinning, the task which tries to cancel the
     timer blocks on a per cpu base expiry lock which is held and
     released by the (hr)timer expiry code.

   - Enable the Hyper-V TSC page based sched_clock for Hyper-V guests
     resulting in faster access to timekeeping functions.

   - Updates to various clocksource/clockevent drivers and their device
     tree bindings.

   - The usual small improvements all over the place"

* 'timers-core-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip: (101 commits)
  posix-cpu-timers: Fix permission check regression
  posix-cpu-timers: Always clear head pointer on dequeue
  hrtimer: Add a missing bracket and hide `migration_base' on !SMP
  posix-cpu-timers: Make expiry_active check actually work correctly
  posix-timers: Unbreak CONFIG_POSIX_TIMERS=n build
  tick: Mark sched_timer to expire in hard interrupt context
  hrtimer: Add kernel doc annotation for HRTIMER_MODE_HARD
  x86/hyperv: Hide pv_ops access for CONFIG_PARAVIRT=n
  posix-cpu-timers: Utilize timerqueue for storage
  posix-cpu-timers: Move state tracking to struct posix_cputimers
  posix-cpu-timers: Deduplicate rlimit handling
  posix-cpu-timers: Remove pointless comparisons
  posix-cpu-timers: Get rid of 64bit divisions
  posix-cpu-timers: Consolidate timer expiry further
  posix-cpu-timers: Get rid of zero checks
  rlimit: Rewrite non-sensical RLIMIT_CPU comment
  posix-cpu-timers: Respect INFINITY for hard RTTIME limit
  posix-cpu-timers: Switch thread group sampling to array
  posix-cpu-timers: Restructure expiry array
  posix-cpu-timers: Remove cputime_expires
  ...
This commit is contained in:
Linus Torvalds
2019-09-17 12:35:15 -07:00
61 changed files with 1484 additions and 903 deletions

View File

@@ -5,7 +5,8 @@
#include <linux/time.h>
#include <linux/hrtimer.h>
#include <linux/timerqueue.h>
#include <linux/rtc.h>
struct rtc_device;
enum alarmtimer_type {
ALARM_REALTIME,

View File

@@ -32,12 +32,15 @@ struct hrtimer_cpu_base;
* when starting the timer)
* HRTIMER_MODE_SOFT - Timer callback function will be executed in
* soft irq context
* HRTIMER_MODE_HARD - Timer callback function will be executed in
* hard irq context even on PREEMPT_RT.
*/
enum hrtimer_mode {
HRTIMER_MODE_ABS = 0x00,
HRTIMER_MODE_REL = 0x01,
HRTIMER_MODE_PINNED = 0x02,
HRTIMER_MODE_SOFT = 0x04,
HRTIMER_MODE_HARD = 0x08,
HRTIMER_MODE_ABS_PINNED = HRTIMER_MODE_ABS | HRTIMER_MODE_PINNED,
HRTIMER_MODE_REL_PINNED = HRTIMER_MODE_REL | HRTIMER_MODE_PINNED,
@@ -48,6 +51,11 @@ enum hrtimer_mode {
HRTIMER_MODE_ABS_PINNED_SOFT = HRTIMER_MODE_ABS_PINNED | HRTIMER_MODE_SOFT,
HRTIMER_MODE_REL_PINNED_SOFT = HRTIMER_MODE_REL_PINNED | HRTIMER_MODE_SOFT,
HRTIMER_MODE_ABS_HARD = HRTIMER_MODE_ABS | HRTIMER_MODE_HARD,
HRTIMER_MODE_REL_HARD = HRTIMER_MODE_REL | HRTIMER_MODE_HARD,
HRTIMER_MODE_ABS_PINNED_HARD = HRTIMER_MODE_ABS_PINNED | HRTIMER_MODE_HARD,
HRTIMER_MODE_REL_PINNED_HARD = HRTIMER_MODE_REL_PINNED | HRTIMER_MODE_HARD,
};
/*
@@ -101,6 +109,8 @@ enum hrtimer_restart {
* @state: state information (See bit values above)
* @is_rel: Set if the timer was armed relative
* @is_soft: Set if hrtimer will be expired in soft interrupt context.
* @is_hard: Set if hrtimer will be expired in hard interrupt context
* even on RT.
*
* The hrtimer structure must be initialized by hrtimer_init()
*/
@@ -112,6 +122,7 @@ struct hrtimer {
u8 state;
u8 is_rel;
u8 is_soft;
u8 is_hard;
};
/**
@@ -183,6 +194,10 @@ enum hrtimer_base_type {
* @nr_retries: Total number of hrtimer interrupt retries
* @nr_hangs: Total number of hrtimer interrupt hangs
* @max_hang_time: Maximum time spent in hrtimer_interrupt
* @softirq_expiry_lock: Lock which is taken while softirq based hrtimer are
* expired
* @timer_waiters: A hrtimer_cancel() invocation waits for the timer
* callback to finish.
* @expires_next: absolute time of the next event, is required for remote
* hrtimer enqueue; it is the total first expiry time (hard
* and soft hrtimer are taken into account)
@@ -209,6 +224,10 @@ struct hrtimer_cpu_base {
unsigned short nr_retries;
unsigned short nr_hangs;
unsigned int max_hang_time;
#endif
#ifdef CONFIG_PREEMPT_RT
spinlock_t softirq_expiry_lock;
atomic_t timer_waiters;
#endif
ktime_t expires_next;
struct hrtimer *next_timer;
@@ -341,16 +360,29 @@ extern void hrtimers_resume(void);
DECLARE_PER_CPU(struct tick_device, tick_cpu_device);
#ifdef CONFIG_PREEMPT_RT
void hrtimer_cancel_wait_running(const struct hrtimer *timer);
#else
static inline void hrtimer_cancel_wait_running(struct hrtimer *timer)
{
cpu_relax();
}
#endif
/* Exported timer functions: */
/* Initialize timers: */
extern void hrtimer_init(struct hrtimer *timer, clockid_t which_clock,
enum hrtimer_mode mode);
extern void hrtimer_init_sleeper(struct hrtimer_sleeper *sl, clockid_t clock_id,
enum hrtimer_mode mode);
#ifdef CONFIG_DEBUG_OBJECTS_TIMERS
extern void hrtimer_init_on_stack(struct hrtimer *timer, clockid_t which_clock,
enum hrtimer_mode mode);
extern void hrtimer_init_sleeper_on_stack(struct hrtimer_sleeper *sl,
clockid_t clock_id,
enum hrtimer_mode mode);
extern void destroy_hrtimer_on_stack(struct hrtimer *timer);
#else
@@ -360,6 +392,14 @@ static inline void hrtimer_init_on_stack(struct hrtimer *timer,
{
hrtimer_init(timer, which_clock, mode);
}
static inline void hrtimer_init_sleeper_on_stack(struct hrtimer_sleeper *sl,
clockid_t clock_id,
enum hrtimer_mode mode)
{
hrtimer_init_sleeper(sl, clock_id, mode);
}
static inline void destroy_hrtimer_on_stack(struct hrtimer *timer) { }
#endif
@@ -395,6 +435,9 @@ static inline void hrtimer_start_expires(struct hrtimer *timer,
hrtimer_start_range_ns(timer, soft, delta, mode);
}
void hrtimer_sleeper_start_expires(struct hrtimer_sleeper *sl,
enum hrtimer_mode mode);
static inline void hrtimer_restart(struct hrtimer *timer)
{
hrtimer_start_expires(timer, HRTIMER_MODE_ABS);
@@ -463,11 +506,8 @@ extern long hrtimer_nanosleep(const struct timespec64 *rqtp,
const enum hrtimer_mode mode,
const clockid_t clockid);
extern void hrtimer_init_sleeper(struct hrtimer_sleeper *sl,
struct task_struct *tsk);
extern int schedule_hrtimeout_range(ktime_t *expires, u64 delta,
const enum hrtimer_mode mode);
const enum hrtimer_mode mode);
extern int schedule_hrtimeout_range_clock(ktime_t *expires,
u64 delta,
const enum hrtimer_mode mode,

View File

@@ -36,17 +36,6 @@ extern struct cred init_cred;
#define INIT_PREV_CPUTIME(x)
#endif
#ifdef CONFIG_POSIX_TIMERS
#define INIT_CPU_TIMERS(s) \
.cpu_timers = { \
LIST_HEAD_INIT(s.cpu_timers[0]), \
LIST_HEAD_INIT(s.cpu_timers[1]), \
LIST_HEAD_INIT(s.cpu_timers[2]), \
},
#else
#define INIT_CPU_TIMERS(s)
#endif
#define INIT_TASK_COMM "swapper"
/* Attach to the init_task data structure for proper alignment */

View File

@@ -4,18 +4,11 @@
#include <linux/spinlock.h>
#include <linux/list.h>
#include <linux/sched.h>
#include <linux/timex.h>
#include <linux/alarmtimer.h>
#include <linux/timerqueue.h>
struct siginfo;
struct cpu_timer_list {
struct list_head entry;
u64 expires;
struct task_struct *task;
int firing;
};
struct kernel_siginfo;
struct task_struct;
/*
* Bit fields within a clockid:
@@ -63,6 +56,115 @@ static inline int clockid_to_fd(const clockid_t clk)
return ~(clk >> 3);
}
#ifdef CONFIG_POSIX_TIMERS
/**
* cpu_timer - Posix CPU timer representation for k_itimer
* @node: timerqueue node to queue in the task/sig
* @head: timerqueue head on which this timer is queued
* @task: Pointer to target task
* @elist: List head for the expiry list
* @firing: Timer is currently firing
*/
struct cpu_timer {
struct timerqueue_node node;
struct timerqueue_head *head;
struct task_struct *task;
struct list_head elist;
int firing;
};
static inline bool cpu_timer_enqueue(struct timerqueue_head *head,
struct cpu_timer *ctmr)
{
ctmr->head = head;
return timerqueue_add(head, &ctmr->node);
}
static inline void cpu_timer_dequeue(struct cpu_timer *ctmr)
{
if (ctmr->head) {
timerqueue_del(ctmr->head, &ctmr->node);
ctmr->head = NULL;
}
}
static inline u64 cpu_timer_getexpires(struct cpu_timer *ctmr)
{
return ctmr->node.expires;
}
static inline void cpu_timer_setexpires(struct cpu_timer *ctmr, u64 exp)
{
ctmr->node.expires = exp;
}
/**
* posix_cputimer_base - Container per posix CPU clock
* @nextevt: Earliest-expiration cache
* @tqhead: timerqueue head for cpu_timers
*/
struct posix_cputimer_base {
u64 nextevt;
struct timerqueue_head tqhead;
};
/**
* posix_cputimers - Container for posix CPU timer related data
* @bases: Base container for posix CPU clocks
* @timers_active: Timers are queued.
* @expiry_active: Timer expiry is active. Used for
* process wide timers to avoid multiple
* task trying to handle expiry concurrently
*
* Used in task_struct and signal_struct
*/
struct posix_cputimers {
struct posix_cputimer_base bases[CPUCLOCK_MAX];
unsigned int timers_active;
unsigned int expiry_active;
};
static inline void posix_cputimers_init(struct posix_cputimers *pct)
{
memset(pct, 0, sizeof(*pct));
pct->bases[0].nextevt = U64_MAX;
pct->bases[1].nextevt = U64_MAX;
pct->bases[2].nextevt = U64_MAX;
}
void posix_cputimers_group_init(struct posix_cputimers *pct, u64 cpu_limit);
static inline void posix_cputimers_rt_watchdog(struct posix_cputimers *pct,
u64 runtime)
{
pct->bases[CPUCLOCK_SCHED].nextevt = runtime;
}
/* Init task static initializer */
#define INIT_CPU_TIMERBASE(b) { \
.nextevt = U64_MAX, \
}
#define INIT_CPU_TIMERBASES(b) { \
INIT_CPU_TIMERBASE(b[0]), \
INIT_CPU_TIMERBASE(b[1]), \
INIT_CPU_TIMERBASE(b[2]), \
}
#define INIT_CPU_TIMERS(s) \
.posix_cputimers = { \
.bases = INIT_CPU_TIMERBASES(s.posix_cputimers.bases), \
},
#else
struct posix_cputimers { };
struct cpu_timer { };
#define INIT_CPU_TIMERS(s)
static inline void posix_cputimers_init(struct posix_cputimers *pct) { }
static inline void posix_cputimers_group_init(struct posix_cputimers *pct,
u64 cpu_limit) { }
#endif
#define REQUEUE_PENDING 1
/**
@@ -85,7 +187,8 @@ static inline int clockid_to_fd(const clockid_t clk)
* @it_process: The task to wakeup on clock_nanosleep (CPU timers)
* @sigq: Pointer to preallocated sigqueue
* @it: Union representing the various posix timer type
* internals. Also used for rcu freeing the timer.
* internals.
* @rcu: RCU head for freeing the timer.
*/
struct k_itimer {
struct list_head list;
@@ -110,15 +213,15 @@ struct k_itimer {
struct {
struct hrtimer timer;
} real;
struct cpu_timer_list cpu;
struct cpu_timer cpu;
struct {
struct alarm alarmtimer;
} alarm;
struct rcu_head rcu;
} it;
struct rcu_head rcu;
};
void run_posix_cpu_timers(struct task_struct *task);
void run_posix_cpu_timers(void);
void posix_cpu_timers_exit(struct task_struct *task);
void posix_cpu_timers_exit_group(struct task_struct *task);
void set_process_cpu_timer(struct task_struct *task, unsigned int clock_idx,

View File

@@ -25,9 +25,11 @@
#include <linux/resource.h>
#include <linux/latencytop.h>
#include <linux/sched/prio.h>
#include <linux/sched/types.h>
#include <linux/signal_types.h>
#include <linux/mm_types_task.h>
#include <linux/task_io_accounting.h>
#include <linux/posix-timers.h>
#include <linux/rseq.h>
/* task_struct member predeclarations (sorted alphabetically): */
@@ -244,27 +246,6 @@ struct prev_cputime {
#endif
};
/**
* struct task_cputime - collected CPU time counts
* @utime: time spent in user mode, in nanoseconds
* @stime: time spent in kernel mode, in nanoseconds
* @sum_exec_runtime: total time spent on the CPU, in nanoseconds
*
* This structure groups together three kinds of CPU time that are tracked for
* threads and thread groups. Most things considering CPU time want to group
* these counts together and treat all three of them in parallel.
*/
struct task_cputime {
u64 utime;
u64 stime;
unsigned long long sum_exec_runtime;
};
/* Alternate field names when used on cache expirations: */
#define virt_exp utime
#define prof_exp stime
#define sched_exp sum_exec_runtime
enum vtime_state {
/* Task is sleeping or running in a CPU with VTIME inactive: */
VTIME_INACTIVE = 0,
@@ -881,10 +862,8 @@ struct task_struct {
unsigned long min_flt;
unsigned long maj_flt;
#ifdef CONFIG_POSIX_TIMERS
struct task_cputime cputime_expires;
struct list_head cpu_timers[3];
#endif
/* Empty if CONFIG_POSIX_CPUTIMERS=n */
struct posix_cputimers posix_cputimers;
/* Process credentials: */

View File

@@ -61,8 +61,7 @@ extern void cputime_adjust(struct task_cputime *curr, struct prev_cputime *prev,
* Thread group CPU time accounting.
*/
void thread_group_cputime(struct task_struct *tsk, struct task_cputime *times);
void thread_group_cputimer(struct task_struct *tsk, struct task_cputime *times);
void thread_group_sample_cputime(struct task_struct *tsk, u64 *samples);
/*
* The following are functions that support scheduler-internal time accounting.
@@ -71,7 +70,7 @@ void thread_group_cputimer(struct task_struct *tsk, struct task_cputime *times);
*/
/**
* get_running_cputimer - return &tsk->signal->cputimer if cputimer is running
* get_running_cputimer - return &tsk->signal->cputimer if cputimers are active
*
* @tsk: Pointer to target task.
*/
@@ -81,8 +80,11 @@ struct thread_group_cputimer *get_running_cputimer(struct task_struct *tsk)
{
struct thread_group_cputimer *cputimer = &tsk->signal->cputimer;
/* Check if cputimer isn't running. This is accessed without locking. */
if (!READ_ONCE(cputimer->running))
/*
* Check whether posix CPU timers are active. If not the thread
* group accounting is not active either. Lockless check.
*/
if (!READ_ONCE(tsk->signal->posix_cputimers.timers_active))
return NULL;
/*

View File

@@ -9,6 +9,7 @@
#include <linux/sched/task.h>
#include <linux/cred.h>
#include <linux/refcount.h>
#include <linux/posix-timers.h>
/*
* Types defining task->signal and task->sighand and APIs using them:
@@ -56,18 +57,12 @@ struct task_cputime_atomic {
/**
* struct thread_group_cputimer - thread group interval timer counts
* @cputime_atomic: atomic thread group interval timers.
* @running: true when there are timers running and
* @cputime_atomic receives updates.
* @checking_timer: true when a thread in the group is in the
* process of checking for thread group timers.
*
* This structure contains the version of task_cputime, above, that is
* used for thread group CPU timer calculations.
*/
struct thread_group_cputimer {
struct task_cputime_atomic cputime_atomic;
bool running;
bool checking_timer;
};
struct multiprocess_signals {
@@ -148,12 +143,9 @@ struct signal_struct {
*/
struct thread_group_cputimer cputimer;
/* Earliest-expiration cache. */
struct task_cputime cputime_expires;
struct list_head cpu_timers[3];
#endif
/* Empty if CONFIG_POSIX_TIMERS=n */
struct posix_cputimers posix_cputimers;
/* PID/PID hash table linkage. */
struct pid *pids[PIDTYPE_MAX];

View File

@@ -0,0 +1,23 @@
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _LINUX_SCHED_TYPES_H
#define _LINUX_SCHED_TYPES_H
#include <linux/types.h>
/**
* struct task_cputime - collected CPU time counts
* @stime: time spent in kernel mode, in nanoseconds
* @utime: time spent in user mode, in nanoseconds
* @sum_exec_runtime: total time spent on the CPU, in nanoseconds
*
* This structure groups together three kinds of CPU time that are tracked for
* threads and thread groups. Most things considering CPU time want to group
* these counts together and treat all three of them in parallel.
*/
struct task_cputime {
u64 stime;
u64 utime;
unsigned long long sum_exec_runtime;
};
#endif

View File

@@ -183,7 +183,7 @@ extern void add_timer(struct timer_list *timer);
extern int try_to_del_timer_sync(struct timer_list *timer);
#ifdef CONFIG_SMP
#if defined(CONFIG_SMP) || defined(CONFIG_PREEMPT_RT)
extern int del_timer_sync(struct timer_list *timer);
#else
# define del_timer_sync(t) del_timer(t)

View File

@@ -12,8 +12,7 @@ struct timerqueue_node {
};
struct timerqueue_head {
struct rb_root head;
struct timerqueue_node *next;
struct rb_root_cached rb_root;
};
@@ -29,13 +28,14 @@ extern struct timerqueue_node *timerqueue_iterate_next(
*
* @head: head of timerqueue
*
* Returns a pointer to the timer node that has the
* earliest expiration time.
* Returns a pointer to the timer node that has the earliest expiration time.
*/
static inline
struct timerqueue_node *timerqueue_getnext(struct timerqueue_head *head)
{
return head->next;
struct rb_node *leftmost = rb_first_cached(&head->rb_root);
return rb_entry(leftmost, struct timerqueue_node, node);
}
static inline void timerqueue_init(struct timerqueue_node *node)
@@ -43,9 +43,18 @@ static inline void timerqueue_init(struct timerqueue_node *node)
RB_CLEAR_NODE(&node->node);
}
static inline bool timerqueue_node_queued(struct timerqueue_node *node)
{
return !RB_EMPTY_NODE(&node->node);
}
static inline bool timerqueue_node_expires(struct timerqueue_node *node)
{
return node->expires;
}
static inline void timerqueue_init_head(struct timerqueue_head *head)
{
head->head = RB_ROOT;
head->next = NULL;
head->rb_root = RB_ROOT_CACHED;
}
#endif /* _LINUX_TIMERQUEUE_H */

View File

@@ -501,8 +501,8 @@ do { \
int __ret = 0; \
struct hrtimer_sleeper __t; \
\
hrtimer_init_on_stack(&__t.timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); \
hrtimer_init_sleeper(&__t, current); \
hrtimer_init_sleeper_on_stack(&__t, CLOCK_MONOTONIC, \
HRTIMER_MODE_REL); \
if ((timeout) != KTIME_MAX) \
hrtimer_start_range_ns(&__t.timer, timeout, \
current->timer_slack_ns, \