Merge branch 'for-mingo' of git://git.kernel.org/pub/scm/linux/kernel/git/paulmck/linux-rcu into core/rcu

Pull the v5.9 RCU bits from Paul E. McKenney:

 - Documentation updates
 - Miscellaneous fixes
 - kfree_rcu updates
 - RCU tasks updates
 - Read-side scalability tests
 - SRCU updates
 - Torture-test updates

Signed-off-by: Ingo Molnar <mingo@kernel.org>
This commit is contained in:
Ingo Molnar
2020-07-31 00:15:53 +02:00
61 changed files with 2385 additions and 670 deletions

View File

@@ -61,6 +61,25 @@ config RCU_TORTURE_TEST
Say M if you want the RCU torture tests to build as a module.
Say N if you are unsure.
config RCU_REF_SCALE_TEST
tristate "Scalability tests for read-side synchronization (RCU and others)"
depends on DEBUG_KERNEL
select TORTURE_TEST
select SRCU
select TASKS_RCU
select TASKS_RUDE_RCU
select TASKS_TRACE_RCU
default n
help
This option provides a kernel module that runs performance tests
useful for comparing RCU with various read-side synchronization mechanisms.
The kernel module may be built after the fact on the running kernel to be
tested, if desired.
Say Y here if you want these performance tests built into the kernel.
Say M if you want to build it as a module instead.
Say N if you are unsure.
config RCU_CPU_STALL_TIMEOUT
int "RCU CPU stall timeout in seconds"
depends on RCU_STALL_COMMON

View File

@@ -12,6 +12,7 @@ obj-$(CONFIG_TREE_SRCU) += srcutree.o
obj-$(CONFIG_TINY_SRCU) += srcutiny.o
obj-$(CONFIG_RCU_TORTURE_TEST) += rcutorture.o
obj-$(CONFIG_RCU_PERF_TEST) += rcuperf.o
obj-$(CONFIG_RCU_REF_SCALE_TEST) += refscale.o
obj-$(CONFIG_TREE_RCU) += tree.o
obj-$(CONFIG_TINY_RCU) += tiny.o
obj-$(CONFIG_RCU_NEED_SEGCBLIST) += rcu_segcblist.o

View File

@@ -69,6 +69,11 @@ MODULE_AUTHOR("Paul E. McKenney <paulmck@linux.ibm.com>");
* value specified by nr_cpus for a read-only test.
*
* Various other use cases may of course be specified.
*
* Note that this test's readers are intended only as a test load for
* the writers. The reader performance statistics will be overly
* pessimistic due to the per-critical-section interrupt disabling,
* test-end checks, and the pair of calls through pointers.
*/
#ifdef MODULE
@@ -309,8 +314,10 @@ static void rcu_perf_wait_shutdown(void)
}
/*
* RCU perf reader kthread. Repeatedly does empty RCU read-side
* critical section, minimizing update-side interference.
* RCU perf reader kthread. Repeatedly does empty RCU read-side critical
* section, minimizing update-side interference. However, the point of
* this test is not to evaluate reader performance, but instead to serve
* as a test load for update-side performance testing.
*/
static int
rcu_perf_reader(void *arg)
@@ -576,11 +583,8 @@ static int compute_real(int n)
static int
rcu_perf_shutdown(void *arg)
{
do {
wait_event(shutdown_wq,
atomic_read(&n_rcu_perf_writer_finished) >=
nrealwriters);
} while (atomic_read(&n_rcu_perf_writer_finished) < nrealwriters);
wait_event(shutdown_wq,
atomic_read(&n_rcu_perf_writer_finished) >= nrealwriters);
smp_mb(); /* Wake before output. */
rcu_perf_cleanup();
kernel_power_off();
@@ -693,11 +697,8 @@ kfree_perf_cleanup(void)
static int
kfree_perf_shutdown(void *arg)
{
do {
wait_event(shutdown_wq,
atomic_read(&n_kfree_perf_thread_ended) >=
kfree_nrealthreads);
} while (atomic_read(&n_kfree_perf_thread_ended) < kfree_nrealthreads);
wait_event(shutdown_wq,
atomic_read(&n_kfree_perf_thread_ended) >= kfree_nrealthreads);
smp_mb(); /* Wake before output. */

View File

@@ -7,7 +7,7 @@
* Authors: Paul E. McKenney <paulmck@linux.ibm.com>
* Josh Triplett <josh@joshtriplett.org>
*
* See also: Documentation/RCU/torture.txt
* See also: Documentation/RCU/torture.rst
*/
#define pr_fmt(fmt) fmt
@@ -109,6 +109,10 @@ torture_param(int, object_debug, 0,
torture_param(int, onoff_holdoff, 0, "Time after boot before CPU hotplugs (s)");
torture_param(int, onoff_interval, 0,
"Time between CPU hotplugs (jiffies), 0=disable");
torture_param(int, read_exit_delay, 13,
"Delay between read-then-exit episodes (s)");
torture_param(int, read_exit_burst, 16,
"# of read-then-exit bursts per episode, zero to disable");
torture_param(int, shuffle_interval, 3, "Number of seconds between shuffles");
torture_param(int, shutdown_secs, 0, "Shutdown time (s), <= zero to disable.");
torture_param(int, stall_cpu, 0, "Stall duration (s), zero to disable.");
@@ -146,6 +150,7 @@ static struct task_struct *stall_task;
static struct task_struct *fwd_prog_task;
static struct task_struct **barrier_cbs_tasks;
static struct task_struct *barrier_task;
static struct task_struct *read_exit_task;
#define RCU_TORTURE_PIPE_LEN 10
@@ -177,6 +182,7 @@ static long n_rcu_torture_boosts;
static atomic_long_t n_rcu_torture_timers;
static long n_barrier_attempts;
static long n_barrier_successes; /* did rcu_barrier test succeed? */
static unsigned long n_read_exits;
static struct list_head rcu_torture_removed;
static unsigned long shutdown_jiffies;
@@ -1166,6 +1172,7 @@ rcu_torture_writer(void *arg)
WARN(1, "%s: rtort_pipe_count: %d\n", __func__, rcu_tortures[i].rtort_pipe_count);
}
} while (!torture_must_stop());
rcu_torture_current = NULL; // Let stats task know that we are done.
/* Reset expediting back to unexpedited. */
if (expediting > 0)
expediting = -expediting;
@@ -1370,6 +1377,7 @@ static bool rcu_torture_one_read(struct torture_random_state *trsp)
struct rt_read_seg *rtrsp1;
unsigned long long ts;
WARN_ON_ONCE(!rcu_is_watching());
newstate = rcutorture_extend_mask(readstate, trsp);
rcutorture_one_extend(&readstate, newstate, trsp, rtrsp++);
started = cur_ops->get_gp_seq();
@@ -1539,10 +1547,11 @@ rcu_torture_stats_print(void)
n_rcu_torture_boosts,
atomic_long_read(&n_rcu_torture_timers));
torture_onoff_stats();
pr_cont("barrier: %ld/%ld:%ld\n",
pr_cont("barrier: %ld/%ld:%ld ",
data_race(n_barrier_successes),
data_race(n_barrier_attempts),
data_race(n_rcu_torture_barrier_error));
pr_cont("read-exits: %ld\n", data_race(n_read_exits));
pr_alert("%s%s ", torture_type, TORTURE_FLAG);
if (atomic_read(&n_rcu_torture_mberror) ||
@@ -1634,7 +1643,8 @@ rcu_torture_print_module_parms(struct rcu_torture_ops *cur_ops, const char *tag)
"stall_cpu=%d stall_cpu_holdoff=%d stall_cpu_irqsoff=%d "
"stall_cpu_block=%d "
"n_barrier_cbs=%d "
"onoff_interval=%d onoff_holdoff=%d\n",
"onoff_interval=%d onoff_holdoff=%d "
"read_exit_delay=%d read_exit_burst=%d\n",
torture_type, tag, nrealreaders, nfakewriters,
stat_interval, verbose, test_no_idle_hz, shuffle_interval,
stutter, irqreader, fqs_duration, fqs_holdoff, fqs_stutter,
@@ -1643,7 +1653,8 @@ rcu_torture_print_module_parms(struct rcu_torture_ops *cur_ops, const char *tag)
stall_cpu, stall_cpu_holdoff, stall_cpu_irqsoff,
stall_cpu_block,
n_barrier_cbs,
onoff_interval, onoff_holdoff);
onoff_interval, onoff_holdoff,
read_exit_delay, read_exit_burst);
}
static int rcutorture_booster_cleanup(unsigned int cpu)
@@ -2175,7 +2186,7 @@ static void rcu_torture_barrier1cb(void *rcu_void)
static int rcu_torture_barrier_cbs(void *arg)
{
long myid = (long)arg;
bool lastphase = 0;
bool lastphase = false;
bool newphase;
struct rcu_head rcu;
@@ -2338,6 +2349,99 @@ static bool rcu_torture_can_boost(void)
return true;
}
// Set by rcu_torture_read_exit_cleanup() to tell the read-exit parent
// kthread to stop spawning children; polled there with READ_ONCE().
static bool read_exit_child_stop;
// Set by the read-exit parent kthread once it has left its spawn loop;
// rcu_torture_read_exit_cleanup() waits for it to become true.
static bool read_exit_child_stopped;
// Wait queue on which cleanup sleeps until read_exit_child_stopped is set.
static wait_queue_head_t read_exit_wq;
// Child kthread for read-then-exit testing.  It idles at the lowest
// priority until its parent asks it to stop, then performs exactly one
// rcutorture read and returns, so that the read is immediately followed
// by the kthread's exit.
//
// @trsp_in: the parent's torture_random_state, passed as a void pointer.
//
// Always returns 0.
static int rcu_torture_read_exit_child(void *trsp_in)
{
	struct torture_random_state *rand_state = trsp_in;

	set_user_nice(current, MAX_NICE);

	// Minimize time between reading and exiting.
	for (;;) {
		if (kthread_should_stop())
			break;
		schedule_timeout_uninterruptible(1);
	}

	(void)rcu_torture_one_read(rand_state);
	return 0;
}
// Parent kthread which creates and destroys read-exit child kthreads.
//
// Children are spawned one at a time and stopped right away, which makes
// each child do one rcutorture read and exit.  After read_exit_burst
// children the parent calls rcu_barrier() and pauses for read_exit_delay
// seconds before starting the next episode.  The loop ends when
// read_exit_child_stop is set or when a child cannot be created.
//
// @unused: not used.
//
// Always returns 0.
static int rcu_torture_read_exit(void *unused)
{
	int count = 0;		// Children spawned so far in this episode.
	bool errexit = false;	// Set when kthread_run() fails.
	int i;
	struct task_struct *tsp;
	DEFINE_TORTURE_RANDOM(trs);

	// Allocate and initialize.
	set_user_nice(current, MAX_NICE);
	VERBOSE_TOROUT_STRING("rcu_torture_read_exit: Start of test");

	// Each pass through this loop does one read-exit episode.
	do {
		if (++count > read_exit_burst) {
			VERBOSE_TOROUT_STRING("rcu_torture_read_exit: End of episode");
			rcu_barrier(); // Wait for task_struct free, avoid OOM.
			// Sleep one second at a time so that a stop request
			// is noticed without waiting out the whole delay.
			for (i = 0; i < read_exit_delay; i++) {
				schedule_timeout_uninterruptible(HZ);
				if (READ_ONCE(read_exit_child_stop))
					break;
			}
			if (!READ_ONCE(read_exit_child_stop))
				VERBOSE_TOROUT_STRING("rcu_torture_read_exit: Start of episode");
			count = 0;
		}
		if (READ_ONCE(read_exit_child_stop))
			break;
		// Spawn child.
		tsp = kthread_run(rcu_torture_read_exit_child,
				  &trs, "%s",
				  "rcu_torture_read_exit_child");
		if (IS_ERR(tsp)) {
			VERBOSE_TOROUT_ERRSTRING("out of memory");
			errexit = true;
			tsp = NULL;
			break;
		}
		cond_resched();
		// Stopping the child is what releases it from its wait loop
		// to do its single read and exit.
		kthread_stop(tsp);
		n_read_exits ++;
		stutter_wait("rcu_torture_read_exit");
	} while (!errexit && !READ_ONCE(read_exit_child_stop));

	// Clean up and exit.
	smp_store_release(&read_exit_child_stopped, true); // After reaping.
	smp_mb(); // Store before wakeup.
	wake_up(&read_exit_wq);
	// Stay around until the torture framework asks this kthread to stop.
	while (!torture_must_stop())
		schedule_timeout_uninterruptible(1);
	torture_kthread_stopping("rcu_torture_read_exit");
	return 0;
}
// Set up read-then-exit testing and spawn its parent kthread.
//
// A read_exit_burst value of zero (or less) disables this test, as the
// module-parameter description promises.  Being disabled is not an
// error, so 0 is returned in that case and no kthread is created;
// rcu_torture_read_exit_cleanup() then sees a NULL read_exit_task and
// does nothing.
//
// Returns 0 on success or when disabled, otherwise the error returned
// by torture_create_kthread().
static int rcu_torture_read_exit_init(void)
{
	if (read_exit_burst <= 0)
		return 0;
	init_waitqueue_head(&read_exit_wq);
	read_exit_child_stop = false;
	read_exit_child_stopped = false;
	return torture_create_kthread(rcu_torture_read_exit, NULL,
				      read_exit_task);
}
static void rcu_torture_read_exit_cleanup(void)
{
if (!read_exit_task)
return;
WRITE_ONCE(read_exit_child_stop, true);
smp_mb(); // Above write before wait.
wait_event(read_exit_wq, smp_load_acquire(&read_exit_child_stopped));
torture_stop_kthread(rcutorture_read_exit, read_exit_task);
}
static enum cpuhp_state rcutor_hp;
static void
@@ -2359,6 +2463,7 @@ rcu_torture_cleanup(void)
}
show_rcu_gp_kthreads();
rcu_torture_read_exit_cleanup();
rcu_torture_barrier_cleanup();
torture_stop_kthread(rcu_torture_fwd_prog, fwd_prog_task);
torture_stop_kthread(rcu_torture_stall, stall_task);
@@ -2370,7 +2475,6 @@ rcu_torture_cleanup(void)
reader_tasks[i]);
kfree(reader_tasks);
}
rcu_torture_current = NULL;
if (fakewriter_tasks) {
for (i = 0; i < nfakewriters; i++) {
@@ -2680,6 +2784,9 @@ rcu_torture_init(void)
if (firsterr)
goto unwind;
firsterr = rcu_torture_barrier_init();
if (firsterr)
goto unwind;
firsterr = rcu_torture_read_exit_init();
if (firsterr)
goto unwind;
if (object_debug)

717
kernel/rcu/refscale.c Normal file
View File

@@ -0,0 +1,717 @@
// SPDX-License-Identifier: GPL-2.0+
//
// Scalability test comparing RCU vs other mechanisms
// for acquiring references on objects.
//
// Copyright (C) Google, 2020.
//
// Author: Joel Fernandes <joel@joelfernandes.org>
#define pr_fmt(fmt) fmt
#include <linux/atomic.h>
#include <linux/bitops.h>
#include <linux/completion.h>
#include <linux/cpu.h>
#include <linux/delay.h>
#include <linux/err.h>
#include <linux/init.h>
#include <linux/interrupt.h>
#include <linux/kthread.h>
#include <linux/kernel.h>
#include <linux/mm.h>
#include <linux/module.h>
#include <linux/moduleparam.h>
#include <linux/notifier.h>
#include <linux/percpu.h>
#include <linux/rcupdate.h>
#include <linux/rcupdate_trace.h>
#include <linux/reboot.h>
#include <linux/sched.h>
#include <linux/spinlock.h>
#include <linux/smp.h>
#include <linux/stat.h>
#include <linux/srcu.h>
#include <linux/slab.h>
#include <linux/torture.h>
#include <linux/types.h>
#include "rcu.h"
// Tag placed right after the test type in every message from this module.
#define SCALE_FLAG "-ref-scale: "

// Print a message prefixed with scale_type and SCALE_FLAG.
#define SCALEOUT(s, x...) \
	pr_alert("%s" SCALE_FLAG s, scale_type, ## x)

// Same as SCALEOUT(), but only when the "verbose" parameter is nonzero.
#define VERBOSE_SCALEOUT(s, x...) \
	do { if (verbose) pr_alert("%s" SCALE_FLAG s, scale_type, ## x); } while (0)

// Same as VERBOSE_SCALEOUT(), with a "!!! " marker to flag an error.
#define VERBOSE_SCALEOUT_ERRSTRING(s, x...) \
	do { if (verbose) pr_alert("%s" SCALE_FLAG "!!! " s, scale_type, ## x); } while (0)
MODULE_LICENSE("GPL");
MODULE_AUTHOR("Joel Fernandes (Google) <joel@joelfernandes.org>");
// Name of the synchronization mechanism to measure.  ref_scale_init()
// compares it against the .name field of each ref_scale_ops structure,
// so the description below lists exactly those names.
static char *scale_type = "rcu";
module_param(scale_type, charp, 0444);
MODULE_PARM_DESC(scale_type, "Type of test (rcu, srcu, rcu-trace, rcu-tasks, refcnt, rwlock, rwsem).");
// Nonzero enables the VERBOSE_SCALEOUT*() messages.
torture_param(int, verbose, 0, "Enable verbose debugging printk()s");

// Wait until there are multiple CPUs before starting test.
// Defaults to 10 seconds when built in and to no holdoff otherwise.
torture_param(int, holdoff, IS_BUILTIN(CONFIG_RCU_REF_SCALE_TEST) ? 10 : 0,
	      "Holdoff time before test start (s)");
// Number of loops per experiment, all readers execute operations concurrently.
torture_param(long, loops, 10000, "Number of loops per experiment.");
// Number of readers, with -1 defaulting to about 75% of the CPUs.
torture_param(int, nreaders, -1, "Number of readers, -1 for 75% of CPUs.");
// Number of runs.
torture_param(int, nruns, 30, "Number of experiments to run.");
// Reader delay in nanoseconds, 0 for no delay.
torture_param(int, readdelay, 0, "Read-side delay in nanoseconds.");

// Default for the "shutdown" parameter: off when built as a module,
// on when built into the kernel.
#ifdef MODULE
# define REFSCALE_SHUTDOWN 0
#else
# define REFSCALE_SHUTDOWN 1
#endif

torture_param(bool, shutdown, REFSCALE_SHUTDOWN,
	      "Shutdown at end of scalability tests.");
// Per-reader state; reader_tasks[] holds one of these for each reader kthread.
struct reader_task {
	struct task_struct *task;	// The reader kthread itself.
	int start_reader;		// Set to 1 by main_func() to start an
					// experiment, cleared by the reader.
	wait_queue_head_t wq;		// The reader sleeps here between experiments.
	u64 last_duration_ns;		// Duration of the last timed invocation.
};
// Kthread running ref_scale_shutdown(); created only when "shutdown" is set.
static struct task_struct *shutdown_task;
// ref_scale_shutdown() sleeps here until shutdown_start becomes nonzero.
static wait_queue_head_t shutdown_wq;

// Kthread running main_func().
static struct task_struct *main_task;
// main_func() sleeps here until the readers of an experiment have finished.
static wait_queue_head_t main_wq;
// Set to 1 by main_func() to let ref_scale_shutdown() proceed.
static int shutdown_start;

// Array of per-reader state, allocated in ref_scale_init().
static struct reader_task *reader_tasks;

// Number of readers that are part of the current experiment.
static atomic_t nreaders_exp;

// Use to wait for all threads to start.
static atomic_t n_init;
// Per-experiment countdowns.  main_func() sets each one to nreaders and
// every reader decrements each of them once: n_started when it begins,
// n_warmedup after its warm-up invocation, n_cooleddown after its timed
// invocation.
static atomic_t n_started;
static atomic_t n_warmedup;
static atomic_t n_cooleddown;

// Track which experiment is currently running.
static int exp_idx;
// Operations vector for selecting different types of tests.
struct ref_scale_ops {
	// Optional one-time setup, called from ref_scale_init() if non-NULL.
	void (*init)(void);
	// Optional teardown, called from ref_scale_cleanup() if non-NULL.
	void (*cleanup)(void);
	// Run the bare read-side primitive; used when readdelay <= 0.
	void (*readsection)(const int nloops);
	// Run the read-side primitive with a delay of udl microseconds plus
	// ndl nanoseconds inside each section; used when readdelay > 0.
	void (*delaysection)(const int nloops, const int udl, const int ndl);
	// Name compared against the scale_type module parameter.
	const char *name;
};

// The operations selected by scale_type, or NULL if none was selected.
static struct ref_scale_ops *cur_ops;
// Busy-wait for udl microseconds followed by ndl nanoseconds.  A zero
// component is skipped entirely rather than passed to the delay primitive.
static void un_delay(const int udl, const int ndl)
{
	if (udl != 0)
		udelay(udl);

	if (ndl != 0)
		ndelay(ndl);
}
// Definitions for RCU ref scale testing.

// Enter and immediately leave an RCU read-side critical section.  The
// counter runs from nloops down to zero inclusive.
static void ref_rcu_read_section(const int nloops)
{
	int remaining = nloops;

	while (remaining >= 0) {
		rcu_read_lock();
		rcu_read_unlock();
		remaining--;
	}
}

// Same as ref_rcu_read_section(), but with a delay of udl microseconds
// plus ndl nanoseconds inside each critical section.
static void ref_rcu_delay_section(const int nloops, const int udl, const int ndl)
{
	int remaining = nloops;

	while (remaining >= 0) {
		rcu_read_lock();
		un_delay(udl, ndl);
		rcu_read_unlock();
		remaining--;
	}
}

// Setup hook for test types that need no setup at all.
static void rcu_sync_scale_init(void)
{
}

static struct ref_scale_ops rcu_ops = {
	.init		= rcu_sync_scale_init,
	.readsection	= ref_rcu_read_section,
	.delaysection	= ref_rcu_delay_section,
	.name		= "rcu",
};
// Definitions for SRCU ref scale testing.
DEFINE_STATIC_SRCU(srcu_refctl_scale);
static struct srcu_struct *srcu_ctlp = &srcu_refctl_scale;

// Enter and immediately leave an SRCU read-side critical section on
// srcu_ctlp.  The counter runs from nloops down to zero inclusive.
static void srcu_ref_scale_read_section(const int nloops)
{
	int remaining = nloops;
	int cookie;

	while (remaining >= 0) {
		cookie = srcu_read_lock(srcu_ctlp);
		srcu_read_unlock(srcu_ctlp, cookie);
		remaining--;
	}
}

// Same as srcu_ref_scale_read_section(), but with a delay of udl
// microseconds plus ndl nanoseconds inside each critical section.
static void srcu_ref_scale_delay_section(const int nloops, const int udl, const int ndl)
{
	int remaining = nloops;
	int cookie;

	while (remaining >= 0) {
		cookie = srcu_read_lock(srcu_ctlp);
		un_delay(udl, ndl);
		srcu_read_unlock(srcu_ctlp, cookie);
		remaining--;
	}
}

static struct ref_scale_ops srcu_ops = {
	.init		= rcu_sync_scale_init,
	.readsection	= srcu_ref_scale_read_section,
	.delaysection	= srcu_ref_scale_delay_section,
	.name		= "srcu",
};
// Definitions for RCU Tasks ref scale testing: Empty read markers.
// These definitions also work for RCU Rude readers.

// With empty read markers there is nothing to call, so this is only the
// loop itself.  The counter runs from nloops down to zero inclusive.
static void rcu_tasks_ref_scale_read_section(const int nloops)
{
	int remaining = nloops;

	while (remaining >= 0)
		remaining--;
}
// Delay variant for the marker-less readers: only the delay of udl
// microseconds plus ndl nanoseconds is executed on each pass.  The
// counter runs from nloops down to zero inclusive.
static void rcu_tasks_ref_scale_delay_section(const int nloops, const int udl, const int ndl)
{
	int remaining = nloops;

	while (remaining >= 0) {
		un_delay(udl, ndl);
		remaining--;
	}
}

static struct ref_scale_ops rcu_tasks_ops = {
	.init		= rcu_sync_scale_init,
	.readsection	= rcu_tasks_ref_scale_read_section,
	.delaysection	= rcu_tasks_ref_scale_delay_section,
	.name		= "rcu-tasks",
};
// Definitions for RCU Tasks Trace ref scale testing.

// Enter and immediately leave an RCU Tasks Trace read-side critical
// section.  The counter runs from nloops down to zero inclusive.
static void rcu_trace_ref_scale_read_section(const int nloops)
{
	int remaining = nloops;

	while (remaining >= 0) {
		rcu_read_lock_trace();
		rcu_read_unlock_trace();
		remaining--;
	}
}

// Same as rcu_trace_ref_scale_read_section(), but with a delay of udl
// microseconds plus ndl nanoseconds inside each critical section.
static void rcu_trace_ref_scale_delay_section(const int nloops, const int udl, const int ndl)
{
	int remaining = nloops;

	while (remaining >= 0) {
		rcu_read_lock_trace();
		un_delay(udl, ndl);
		rcu_read_unlock_trace();
		remaining--;
	}
}

static struct ref_scale_ops rcu_trace_ops = {
	.init		= rcu_sync_scale_init,
	.readsection	= rcu_trace_ref_scale_read_section,
	.delaysection	= rcu_trace_ref_scale_delay_section,
	.name		= "rcu-trace",
};
// Definitions for reference count
static atomic_t refcnt;

// Take and immediately drop a reference on the shared counter.  The
// loop counter runs from nloops down to zero inclusive.
static void ref_refcnt_section(const int nloops)
{
	int remaining = nloops;

	while (remaining >= 0) {
		atomic_inc(&refcnt);
		atomic_dec(&refcnt);
		remaining--;
	}
}

// Same as ref_refcnt_section(), but holding the reference across a
// delay of udl microseconds plus ndl nanoseconds.
static void ref_refcnt_delay_section(const int nloops, const int udl, const int ndl)
{
	int remaining = nloops;

	while (remaining >= 0) {
		atomic_inc(&refcnt);
		un_delay(udl, ndl);
		atomic_dec(&refcnt);
		remaining--;
	}
}

static struct ref_scale_ops refcnt_ops = {
	.init		= rcu_sync_scale_init,
	.readsection	= ref_refcnt_section,
	.delaysection	= ref_refcnt_delay_section,
	.name		= "refcnt",
};
// Definitions for rwlock
static rwlock_t test_rwlock;

// Setup hook: initialize the reader-writer lock used by this test type.
static void ref_rwlock_init(void)
{
	rwlock_init(&test_rwlock);
}

// Read-acquire and immediately release test_rwlock.  The counter runs
// from nloops down to zero inclusive.
static void ref_rwlock_section(const int nloops)
{
	int remaining = nloops;

	while (remaining >= 0) {
		read_lock(&test_rwlock);
		read_unlock(&test_rwlock);
		remaining--;
	}
}

// Same as ref_rwlock_section(), but holding the lock across a delay of
// udl microseconds plus ndl nanoseconds.
static void ref_rwlock_delay_section(const int nloops, const int udl, const int ndl)
{
	int remaining = nloops;

	while (remaining >= 0) {
		read_lock(&test_rwlock);
		un_delay(udl, ndl);
		read_unlock(&test_rwlock);
		remaining--;
	}
}

static struct ref_scale_ops rwlock_ops = {
	.init		= ref_rwlock_init,
	.readsection	= ref_rwlock_section,
	.delaysection	= ref_rwlock_delay_section,
	.name		= "rwlock",
};
// Definitions for rwsem
static struct rw_semaphore test_rwsem;

// Setup hook: initialize the reader-writer semaphore used by this test type.
static void ref_rwsem_init(void)
{
	init_rwsem(&test_rwsem);
}

// Read-acquire and immediately release test_rwsem.  The counter runs
// from nloops down to zero inclusive.
static void ref_rwsem_section(const int nloops)
{
	int remaining = nloops;

	while (remaining >= 0) {
		down_read(&test_rwsem);
		up_read(&test_rwsem);
		remaining--;
	}
}

// Same as ref_rwsem_section(), but holding the semaphore across a delay
// of udl microseconds plus ndl nanoseconds.
static void ref_rwsem_delay_section(const int nloops, const int udl, const int ndl)
{
	int remaining = nloops;

	while (remaining >= 0) {
		down_read(&test_rwsem);
		un_delay(udl, ndl);
		up_read(&test_rwsem);
		remaining--;
	}
}

static struct ref_scale_ops rwsem_ops = {
	.init		= ref_rwsem_init,
	.readsection	= ref_rwsem_section,
	.delaysection	= ref_rwsem_delay_section,
	.name		= "rwsem",
};
// Run one full pass of the selected read-side primitive.  A positive
// readdelay (nanoseconds) selects the delay variant and is split into
// whole microseconds plus the remaining nanoseconds; otherwise the bare
// variant is used.
static void rcu_scale_one_reader(void)
{
	if (readdelay > 0) {
		cur_ops->delaysection(loops, readdelay / 1000, readdelay % 1000);
		return;
	}

	cur_ops->readsection(loops);
}
// Reader kthread.  Each pass through the "repeat" loop takes part in one
// experiment: sleep until main_func() starts this reader, run the
// selected read-side primitive as warm-up, time one invocation with
// interrupts disabled, store the result in ->last_duration_ns, keep
// the load up while checking out, and finally tell main_func() when the
// last reader of the experiment is done.
//
// @arg: index of this reader in reader_tasks[], passed as a pointer value.
//
// Returns 0 once torture_must_stop() is observed.
static int
ref_scale_reader(void *arg)
{
	unsigned long flags;
	long me = (long)arg;
	struct reader_task *rt = &(reader_tasks[me]);
	u64 start;
	s64 duration;

	VERBOSE_SCALEOUT("ref_scale_reader %ld: task started", me);
	// Pin this reader to one CPU; indexes wrap around nr_cpu_ids.
	set_cpus_allowed_ptr(current, cpumask_of(me % nr_cpu_ids));
	set_user_nice(current, MAX_NICE);
	// Check in with main_func(), which waits for every thread to start.
	atomic_inc(&n_init);
	if (holdoff)
		schedule_timeout_interruptible(holdoff * HZ);
repeat:
	VERBOSE_SCALEOUT("ref_scale_reader %ld: waiting to start next experiment on cpu %d", me, smp_processor_id());

	// Wait for signal that this reader can start.
	wait_event(rt->wq, (atomic_read(&nreaders_exp) && smp_load_acquire(&rt->start_reader)) ||
			   torture_must_stop());

	if (torture_must_stop())
		goto end;

	// Make sure that the CPU is affinitized appropriately during testing.
	// The affinity set above wraps modulo nr_cpu_ids, so the expected
	// CPU must wrap the same way; comparing against the raw index would
	// warn spuriously whenever there are more readers than CPU ids.
	WARN_ON_ONCE(smp_processor_id() != me % nr_cpu_ids);

	WRITE_ONCE(rt->start_reader, 0);
	// NOTE(review): only the reader whose decrement brings n_started
	// to zero enters this polling loop -- confirm that is the intent.
	if (!atomic_dec_return(&n_started))
		while (atomic_read_acquire(&n_started))
			cpu_relax();

	VERBOSE_SCALEOUT("ref_scale_reader %ld: experiment %d started", me, exp_idx);

	// To reduce noise, do an initial cache-warming invocation, check
	// in, and then keep warming until everyone has checked in.
	rcu_scale_one_reader();
	if (!atomic_dec_return(&n_warmedup))
		while (atomic_read_acquire(&n_warmedup))
			rcu_scale_one_reader();
	// Also keep interrupts disabled.  This also has the effect
	// of preventing entries into slow path for rcu_read_unlock().
	local_irq_save(flags);
	start = ktime_get_mono_fast_ns();

	rcu_scale_one_reader();

	duration = ktime_get_mono_fast_ns() - start;
	local_irq_restore(flags);

	// A negative duration is reported once and recorded as zero.
	rt->last_duration_ns = WARN_ON_ONCE(duration < 0) ? 0 : duration;
	// To reduce runtime-skew noise, do maintain-load invocations until
	// everyone is done.
	if (!atomic_dec_return(&n_cooleddown))
		while (atomic_read_acquire(&n_cooleddown))
			rcu_scale_one_reader();

	// The last reader to finish wakes up main_func().
	if (atomic_dec_and_test(&nreaders_exp))
		wake_up(&main_wq);

	VERBOSE_SCALEOUT("ref_scale_reader %ld: experiment %d ended, (readers remaining=%d)",
			 me, exp_idx, atomic_read(&nreaders_exp));

	if (!torture_must_stop())
		goto repeat;
end:
	torture_kthread_stopping("ref_scale_reader");
	return 0;
}
static void reset_readers(void)
{
int i;
struct reader_task *rt;
for (i = 0; i < nreaders; i++) {
rt = &(reader_tasks[i]);
rt->last_duration_ns = 0;
}
}
// Print the results of each reader and return the sum of all their durations.
static u64 process_durations(int n)
{
int i;
struct reader_task *rt;
char buf1[64];
char *buf;
u64 sum = 0;
buf = kmalloc(128 + nreaders * 32, GFP_KERNEL);
if (!buf)
return 0;
buf[0] = 0;
sprintf(buf, "Experiment #%d (Format: <THREAD-NUM>:<Total loop time in ns>)",
exp_idx);
for (i = 0; i < n && !torture_must_stop(); i++) {
rt = &(reader_tasks[i]);
sprintf(buf1, "%d: %llu\t", i, rt->last_duration_ns);
if (i % 5 == 0)
strcat(buf, "\n");
strcat(buf, buf1);
sum += rt->last_duration_ns;
}
strcat(buf, "\n");
SCALEOUT("%s\n", buf);
kfree(buf);
return sum;
}
// The main_func is the main orchestrator, it performs a bunch of
// experiments. For every experiment, it orders all the readers
// involved to start and waits for them to finish the experiment. It
// then reads their timestamps and starts the next experiment. Each
// experiment progresses from 1 concurrent reader to N of them at which
// point all the timestamps are printed.
static int main_func(void *arg)
{
bool errexit = false;
int exp, r;
char buf1[64];
char *buf;
u64 *result_avg;
set_cpus_allowed_ptr(current, cpumask_of(nreaders % nr_cpu_ids));
set_user_nice(current, MAX_NICE);
VERBOSE_SCALEOUT("main_func task started");
result_avg = kzalloc(nruns * sizeof(*result_avg), GFP_KERNEL);
buf = kzalloc(64 + nruns * 32, GFP_KERNEL);
if (!result_avg || !buf) {
VERBOSE_SCALEOUT_ERRSTRING("out of memory");
errexit = true;
}
if (holdoff)
schedule_timeout_interruptible(holdoff * HZ);
// Wait for all threads to start.
atomic_inc(&n_init);
while (atomic_read(&n_init) < nreaders + 1)
schedule_timeout_uninterruptible(1);
// Start exp readers up per experiment
for (exp = 0; exp < nruns && !torture_must_stop(); exp++) {
if (errexit)
break;
if (torture_must_stop())
goto end;
reset_readers();
atomic_set(&nreaders_exp, nreaders);
atomic_set(&n_started, nreaders);
atomic_set(&n_warmedup, nreaders);
atomic_set(&n_cooleddown, nreaders);
exp_idx = exp;
for (r = 0; r < nreaders; r++) {
smp_store_release(&reader_tasks[r].start_reader, 1);
wake_up(&reader_tasks[r].wq);
}
VERBOSE_SCALEOUT("main_func: experiment started, waiting for %d readers",
nreaders);
wait_event(main_wq,
!atomic_read(&nreaders_exp) || torture_must_stop());
VERBOSE_SCALEOUT("main_func: experiment ended");
if (torture_must_stop())
goto end;
result_avg[exp] = div_u64(1000 * process_durations(nreaders), nreaders * loops);
}
// Print the average of all experiments
SCALEOUT("END OF TEST. Calculating average duration per loop (nanoseconds)...\n");
buf[0] = 0;
strcat(buf, "\n");
strcat(buf, "Runs\tTime(ns)\n");
for (exp = 0; exp < nruns; exp++) {
u64 avg;
u32 rem;
if (errexit)
break;
avg = div_u64_rem(result_avg[exp], 1000, &rem);
sprintf(buf1, "%d\t%llu.%03u\n", exp + 1, avg, rem);
strcat(buf, buf1);
}
if (!errexit)
SCALEOUT("%s", buf);
// This will shutdown everything including us.
if (shutdown) {
shutdown_start = 1;
wake_up(&shutdown_wq);
}
// Wait for torture to stop us
while (!torture_must_stop())
schedule_timeout_uninterruptible(1);
end:
torture_kthread_stopping("main_func");
kfree(result_avg);
kfree(buf);
return 0;
}
// Print the current values of all module parameters on one line.
//
// @cur_ops: not used by this function; it shadows the file-scope
//           variable of the same name.
// @tag: text identifying where in the test the values were printed.
static void
ref_scale_print_module_parms(struct ref_scale_ops *cur_ops, const char *tag)
{
	pr_alert("%s" SCALE_FLAG
		 "--- %s:  verbose=%d shutdown=%d holdoff=%d loops=%ld nreaders=%d nruns=%d readdelay=%d\n", scale_type, tag,
		 verbose, shutdown, holdoff, loops, nreaders, nruns, readdelay);
}
// Tear down the test: stop the reader kthreads and the main kthread,
// free the per-reader array, and run the selected test type's cleanup
// hook if it has one.  Used as the module exit handler, on the error
// path of ref_scale_init(), and by ref_scale_shutdown().
//
// Nothing is torn down when torture_cleanup_begin() returns nonzero, or
// when no test type was selected (cur_ops is NULL).
static void
ref_scale_cleanup(void)
{
	int i;

	if (torture_cleanup_begin())
		return;

	if (!cur_ops) {
		torture_cleanup_end();
		return;
	}

	if (reader_tasks) {
		for (i = 0; i < nreaders; i++)
			torture_stop_kthread("ref_scale_reader",
					     reader_tasks[i].task);
	}
	kfree(reader_tasks);

	// main_task is the task_struct of a kthread, not memory obtained
	// from kmalloc(), so it is stopped here and never passed to kfree().
	torture_stop_kthread("main_task", main_task);

	// Do scale-type-specific cleanup operations.
	if (cur_ops->cleanup != NULL)
		cur_ops->cleanup();

	torture_cleanup_end();
}
// Shutdown kthread.  Just waits to be awakened, then shuts down system.
//
// @arg: not used.
//
// Sleeps until main_func() sets shutdown_start, cleans up the test and
// powers the machine off.  The return statement is reached only if
// kernel_power_off() returns, in which case -EINVAL is returned.
static int
ref_scale_shutdown(void *arg)
{
	wait_event(shutdown_wq, shutdown_start);

	smp_mb(); // Wake before output.
	ref_scale_cleanup();
	kernel_power_off();

	return -EINVAL;
}
// Module initialization: select the test type named by scale_type, run
// its setup hook, then create the optional shutdown kthread, the reader
// kthreads and the main kthread.
//
// Returns 0 on success, -EBUSY if torture_init_begin() refuses to start,
// -EINVAL for an unknown scale_type, -ENOMEM if the per-reader array
// cannot be allocated, or the error from a failed kthread creation.
// On any failure after torture_init_begin(), everything created so far
// is torn down through ref_scale_cleanup().
static int __init
ref_scale_init(void)
{
	long i;
	int firsterr = 0;
	static struct ref_scale_ops *scale_ops[] = {
		&rcu_ops, &srcu_ops, &rcu_trace_ops, &rcu_tasks_ops,
		&refcnt_ops, &rwlock_ops, &rwsem_ops,
	};

	if (!torture_init_begin(scale_type, verbose))
		return -EBUSY;

	// Look up the requested test type by name.
	for (i = 0; i < ARRAY_SIZE(scale_ops); i++) {
		cur_ops = scale_ops[i];
		if (strcmp(scale_type, cur_ops->name) == 0)
			break;
	}
	if (i == ARRAY_SIZE(scale_ops)) {
		pr_alert("rcu-scale: invalid scale type: \"%s\"\n", scale_type);
		pr_alert("rcu-scale types:");
		for (i = 0; i < ARRAY_SIZE(scale_ops); i++)
			pr_cont(" %s", scale_ops[i]->name);
		pr_cont("\n");
		WARN_ON(!IS_MODULE(CONFIG_RCU_REF_SCALE_TEST));
		firsterr = -EINVAL;
		// Tell ref_scale_cleanup() that there is nothing to tear down.
		cur_ops = NULL;
		goto unwind;
	}
	if (cur_ops->init)
		cur_ops->init();

	ref_scale_print_module_parms(cur_ops, "Start of test");

	// Shutdown task
	if (shutdown) {
		init_waitqueue_head(&shutdown_wq);
		firsterr = torture_create_kthread(ref_scale_shutdown, NULL,
						  shutdown_task);
		if (firsterr)
			goto unwind;
		schedule_timeout_uninterruptible(1);
	}

	// Reader tasks (default to ~75% of online CPUs).
	if (nreaders < 0)
		nreaders = (num_online_cpus() >> 1) + (num_online_cpus() >> 2);
	// The 75% default rounds down to zero on a single-CPU system, and
	// the averages are divided by nreaders * loops, so force every
	// count to be at least one.
	if (WARN_ONCE(loops <= 0, "%s: loops = %ld, adjusted to 1\n", __func__, loops))
		loops = 1;
	if (WARN_ONCE(nreaders <= 0, "%s: nreaders = %d, adjusted to 1\n", __func__, nreaders))
		nreaders = 1;
	if (WARN_ONCE(nruns <= 0, "%s: nruns = %d, adjusted to 1\n", __func__, nruns))
		nruns = 1;
	reader_tasks = kcalloc(nreaders, sizeof(reader_tasks[0]),
			       GFP_KERNEL);
	if (!reader_tasks) {
		VERBOSE_SCALEOUT_ERRSTRING("out of memory");
		firsterr = -ENOMEM;
		goto unwind;
	}

	VERBOSE_SCALEOUT("Starting %d reader threads\n", nreaders);

	for (i = 0; i < nreaders; i++) {
		// The wait queue must be ready before the reader exists:
		// a new reader may sleep on it as soon as it starts running.
		init_waitqueue_head(&(reader_tasks[i].wq));
		firsterr = torture_create_kthread(ref_scale_reader, (void *)i,
						  reader_tasks[i].task);
		if (firsterr)
			goto unwind;
	}

	// Main Task
	init_waitqueue_head(&main_wq);
	firsterr = torture_create_kthread(main_func, NULL, main_task);
	if (firsterr)
		goto unwind;

	torture_init_end();
	return 0;

unwind:
	torture_init_end();
	ref_scale_cleanup();
	return firsterr;
}

module_init(ref_scale_init);
module_exit(ref_scale_cleanup);

View File

@@ -766,7 +766,7 @@ static void srcu_flip(struct srcu_struct *ssp)
* it, if this function was preempted for enough time for the counters
* to wrap, it really doesn't matter whether or not we expedite the grace
* period. The extra overhead of a needlessly expedited grace period is
* negligible when amoritized over that time period, and the extra latency
* negligible when amortized over that time period, and the extra latency
* of a needlessly non-expedited grace period is similarly negligible.
*/
static bool srcu_might_be_idle(struct srcu_struct *ssp)
@@ -777,14 +777,15 @@ static bool srcu_might_be_idle(struct srcu_struct *ssp)
unsigned long t;
unsigned long tlast;
check_init_srcu_struct(ssp);
/* If the local srcu_data structure has callbacks, not idle. */
local_irq_save(flags);
sdp = this_cpu_ptr(ssp->sda);
sdp = raw_cpu_ptr(ssp->sda);
spin_lock_irqsave_rcu_node(sdp, flags);
if (rcu_segcblist_pend_cbs(&sdp->srcu_cblist)) {
local_irq_restore(flags);
spin_unlock_irqrestore_rcu_node(sdp, flags);
return false; /* Callbacks already present, so not idle. */
}
local_irq_restore(flags);
spin_unlock_irqrestore_rcu_node(sdp, flags);
/*
* No local callbacks, so probabilistically probe global state.
@@ -864,9 +865,8 @@ static void __call_srcu(struct srcu_struct *ssp, struct rcu_head *rhp,
}
rhp->func = func;
idx = srcu_read_lock(ssp);
local_irq_save(flags);
sdp = this_cpu_ptr(ssp->sda);
spin_lock_rcu_node(sdp);
sdp = raw_cpu_ptr(ssp->sda);
spin_lock_irqsave_rcu_node(sdp, flags);
rcu_segcblist_enqueue(&sdp->srcu_cblist, rhp);
rcu_segcblist_advance(&sdp->srcu_cblist,
rcu_seq_current(&ssp->srcu_gp_seq));

View File

@@ -103,6 +103,7 @@ module_param(rcu_task_stall_timeout, int, 0644);
#define RTGS_WAIT_READERS 9
#define RTGS_INVOKE_CBS 10
#define RTGS_WAIT_CBS 11
#ifndef CONFIG_TINY_RCU
static const char * const rcu_tasks_gp_state_names[] = {
"RTGS_INIT",
"RTGS_WAIT_WAIT_CBS",
@@ -117,6 +118,7 @@ static const char * const rcu_tasks_gp_state_names[] = {
"RTGS_INVOKE_CBS",
"RTGS_WAIT_CBS",
};
#endif /* #ifndef CONFIG_TINY_RCU */
////////////////////////////////////////////////////////////////////////
//
@@ -129,6 +131,7 @@ static void set_tasks_gp_state(struct rcu_tasks *rtp, int newstate)
rtp->gp_jiffies = jiffies;
}
#ifndef CONFIG_TINY_RCU
/* Return state name. */
static const char *tasks_gp_state_getname(struct rcu_tasks *rtp)
{
@@ -139,6 +142,7 @@ static const char *tasks_gp_state_getname(struct rcu_tasks *rtp)
return "???";
return rcu_tasks_gp_state_names[j];
}
#endif /* #ifndef CONFIG_TINY_RCU */
// Enqueue a callback for the specified flavor of Tasks RCU.
static void call_rcu_tasks_generic(struct rcu_head *rhp, rcu_callback_t func,
@@ -205,7 +209,7 @@ static int __noreturn rcu_tasks_kthread(void *arg)
if (!rtp->cbs_head) {
WARN_ON(signal_pending(current));
set_tasks_gp_state(rtp, RTGS_WAIT_WAIT_CBS);
schedule_timeout_interruptible(HZ/10);
schedule_timeout_idle(HZ/10);
}
continue;
}
@@ -227,7 +231,7 @@ static int __noreturn rcu_tasks_kthread(void *arg)
cond_resched();
}
/* Paranoid sleep to keep this from entering a tight loop */
schedule_timeout_uninterruptible(HZ/10);
schedule_timeout_idle(HZ/10);
set_tasks_gp_state(rtp, RTGS_WAIT_CBS);
}
@@ -268,6 +272,7 @@ static void __init rcu_tasks_bootup_oddness(void)
#endif /* #ifndef CONFIG_TINY_RCU */
#ifndef CONFIG_TINY_RCU
/* Dump out rcutorture-relevant state common to all RCU-tasks flavors. */
static void show_rcu_tasks_generic_gp_kthread(struct rcu_tasks *rtp, char *s)
{
@@ -281,6 +286,7 @@ static void show_rcu_tasks_generic_gp_kthread(struct rcu_tasks *rtp, char *s)
".C"[!!data_race(rtp->cbs_head)],
s);
}
#endif /* #ifndef CONFIG_TINY_RCU */
static void exit_tasks_rcu_finish_trace(struct task_struct *t);
@@ -336,7 +342,7 @@ static void rcu_tasks_wait_gp(struct rcu_tasks *rtp)
/* Slowly back off waiting for holdouts */
set_tasks_gp_state(rtp, RTGS_WAIT_SCAN_HOLDOUTS);
schedule_timeout_interruptible(HZ/fract);
schedule_timeout_idle(HZ/fract);
if (fract > 1)
fract--;
@@ -402,7 +408,7 @@ static void rcu_tasks_pertask(struct task_struct *t, struct list_head *hop)
}
/* Processing between scanning taskslist and draining the holdout list. */
void rcu_tasks_postscan(struct list_head *hop)
static void rcu_tasks_postscan(struct list_head *hop)
{
/*
* Wait for tasks that are in the process of exiting. This
@@ -557,10 +563,12 @@ static int __init rcu_spawn_tasks_kthread(void)
}
core_initcall(rcu_spawn_tasks_kthread);
#ifndef CONFIG_TINY_RCU
static void show_rcu_tasks_classic_gp_kthread(void)
{
show_rcu_tasks_generic_gp_kthread(&rcu_tasks, "");
}
#endif /* #ifndef CONFIG_TINY_RCU */
/* Do the srcu_read_lock() for the above synchronize_srcu(). */
void exit_tasks_rcu_start(void) __acquires(&tasks_rcu_exit_srcu)
@@ -682,10 +690,12 @@ static int __init rcu_spawn_tasks_rude_kthread(void)
}
core_initcall(rcu_spawn_tasks_rude_kthread);
#ifndef CONFIG_TINY_RCU
static void show_rcu_tasks_rude_gp_kthread(void)
{
show_rcu_tasks_generic_gp_kthread(&rcu_tasks_rude, "");
}
#endif /* #ifndef CONFIG_TINY_RCU */
#else /* #ifdef CONFIG_TASKS_RUDE_RCU */
static void show_rcu_tasks_rude_gp_kthread(void) {}
@@ -727,8 +737,8 @@ EXPORT_SYMBOL_GPL(rcu_trace_lock_map);
#ifdef CONFIG_TASKS_TRACE_RCU
atomic_t trc_n_readers_need_end; // Number of waited-for readers.
DECLARE_WAIT_QUEUE_HEAD(trc_wait); // List of holdout tasks.
static atomic_t trc_n_readers_need_end; // Number of waited-for readers.
static DECLARE_WAIT_QUEUE_HEAD(trc_wait); // List of holdout tasks.
// Record outstanding IPIs to each CPU. No point in sending two...
static DEFINE_PER_CPU(bool, trc_ipi_to_cpu);
@@ -835,7 +845,7 @@ static bool trc_inspect_reader(struct task_struct *t, void *arg)
bool ofl = cpu_is_offline(cpu);
if (task_curr(t)) {
WARN_ON_ONCE(ofl & !is_idle_task(t));
WARN_ON_ONCE(ofl && !is_idle_task(t));
// If no chance of heavyweight readers, do it the hard way.
if (!ofl && !IS_ENABLED(CONFIG_TASKS_TRACE_RCU_READ_MB))
@@ -1118,11 +1128,10 @@ EXPORT_SYMBOL_GPL(call_rcu_tasks_trace);
* synchronize_rcu_tasks_trace - wait for a trace rcu-tasks grace period
*
* Control will return to the caller some time after a trace rcu-tasks
* grace period has elapsed, in other words after all currently
* executing rcu-tasks read-side critical sections have elapsed. These
* read-side critical sections are delimited by calls to schedule(),
* cond_resched_tasks_rcu_qs(), userspace execution, and (in theory,
* anyway) cond_resched().
* grace period has elapsed, in other words after all currently executing
* rcu-tasks read-side critical sections have elapsed. These read-side
* critical sections are delimited by calls to rcu_read_lock_trace()
* and rcu_read_unlock_trace().
*
* This is a very specialized primitive, intended only for a few uses in
* tracing and other situations requiring manipulation of function preambles
@@ -1164,6 +1173,7 @@ static int __init rcu_spawn_tasks_trace_kthread(void)
}
core_initcall(rcu_spawn_tasks_trace_kthread);
#ifndef CONFIG_TINY_RCU
static void show_rcu_tasks_trace_gp_kthread(void)
{
char buf[64];
@@ -1174,18 +1184,21 @@ static void show_rcu_tasks_trace_gp_kthread(void)
data_race(n_heavy_reader_attempts));
show_rcu_tasks_generic_gp_kthread(&rcu_tasks_trace, buf);
}
#endif /* #ifndef CONFIG_TINY_RCU */
#else /* #ifdef CONFIG_TASKS_TRACE_RCU */
static void exit_tasks_rcu_finish_trace(struct task_struct *t) { }
static inline void show_rcu_tasks_trace_gp_kthread(void) {}
#endif /* #else #ifdef CONFIG_TASKS_TRACE_RCU */
#ifndef CONFIG_TINY_RCU
/*
 * Dump the state of every RCU-tasks flavor, one after another: classic,
 * then rude, then trace. Each per-flavor helper is invoked unconditionally
 * from here.
 */
void show_rcu_tasks_gp_kthreads(void)
{
show_rcu_tasks_classic_gp_kthread();
show_rcu_tasks_rude_gp_kthread();
show_rcu_tasks_trace_gp_kthread();
}
#endif /* #ifndef CONFIG_TINY_RCU */
#else /* #ifdef CONFIG_TASKS_RCU_GENERIC */
static inline void rcu_tasks_bootup_oddness(void) {}

View File

@@ -23,6 +23,7 @@
#include <linux/cpu.h>
#include <linux/prefetch.h>
#include <linux/slab.h>
#include <linux/mm.h>
#include "rcu.h"
@@ -84,9 +85,9 @@ static inline bool rcu_reclaim_tiny(struct rcu_head *head)
unsigned long offset = (unsigned long)head->func;
rcu_lock_acquire(&rcu_callback_map);
if (__is_kfree_rcu_offset(offset)) {
trace_rcu_invoke_kfree_callback("", head, offset);
kfree((void *)head - offset);
if (__is_kvfree_rcu_offset(offset)) {
trace_rcu_invoke_kvfree_callback("", head, offset);
kvfree((void *)head - offset);
rcu_lock_release(&rcu_callback_map);
return true;
}

View File

@@ -57,6 +57,8 @@
#include <linux/slab.h>
#include <linux/sched/isolation.h>
#include <linux/sched/clock.h>
#include <linux/vmalloc.h>
#include <linux/mm.h>
#include "../time/tick-internal.h"
#include "tree.h"
@@ -175,6 +177,15 @@ module_param(gp_init_delay, int, 0444);
static int gp_cleanup_delay;
module_param(gp_cleanup_delay, int, 0444);
/*
* This rcu parameter is runtime-read-only. It reflects
* a minimum allowed number of objects which can be cached
* per-CPU. Object size is equal to one page. This value
* can be changed at boot time.
*/
static int rcu_min_cached_objs = 2;
module_param(rcu_min_cached_objs, int, 0444);
/* Retrieve RCU kthreads priority for rcutorture */
int rcu_get_gp_kthreads_prio(void)
{
@@ -954,7 +965,6 @@ void __rcu_irq_enter_check_tick(void)
/**
* rcu_nmi_enter - inform RCU of entry to NMI context
* @irq: Is this call from rcu_irq_enter?
*
* If the CPU was idle from RCU's viewpoint, update rdp->dynticks and
* rdp->dynticks_nmi_nesting to let the RCU grace-period handling know
@@ -990,8 +1000,11 @@ noinstr void rcu_nmi_enter(void)
rcu_dynticks_eqs_exit();
// ... but is watching here.
if (!in_nmi())
if (!in_nmi()) {
instrumentation_begin();
rcu_cleanup_after_idle();
instrumentation_end();
}
instrumentation_begin();
// instrumentation for the noinstr rcu_dynticks_curr_cpu_in_eqs()
@@ -1638,7 +1651,7 @@ static void rcu_gp_slow(int delay)
if (delay > 0 &&
!(rcu_seq_ctr(rcu_state.gp_seq) %
(rcu_num_nodes * PER_RCU_NODE_PERIOD * delay)))
schedule_timeout_uninterruptible(delay);
schedule_timeout_idle(delay);
}
static unsigned long sleep_duration;
@@ -1661,7 +1674,7 @@ static void rcu_gp_torture_wait(void)
duration = xchg(&sleep_duration, 0UL);
if (duration > 0) {
pr_alert("%s: Waiting %lu jiffies\n", __func__, duration);
schedule_timeout_uninterruptible(duration);
schedule_timeout_idle(duration);
pr_alert("%s: Wait complete\n", __func__);
}
}
@@ -2443,6 +2456,7 @@ static void rcu_do_batch(struct rcu_data *rdp)
local_irq_save(flags);
rcu_nocb_lock(rdp);
count = -rcl.len;
rdp->n_cbs_invoked += count;
trace_rcu_batch_end(rcu_state.name, count, !!rcl.head, need_resched(),
is_idle_task(current), rcu_is_callbacks_kthread());
@@ -2726,7 +2740,7 @@ static void rcu_cpu_kthread(unsigned int cpu)
}
*statusp = RCU_KTHREAD_YIELDING;
trace_rcu_utilization(TPS("Start CPU kthread@rcu_yield"));
schedule_timeout_interruptible(2);
schedule_timeout_idle(2);
trace_rcu_utilization(TPS("End CPU kthread@rcu_yield"));
*statusp = RCU_KTHREAD_WAITING;
}
@@ -2894,8 +2908,8 @@ __call_rcu(struct rcu_head *head, rcu_callback_t func)
return; // Enqueued onto ->nocb_bypass, so just leave.
// If no-CBs CPU gets here, rcu_nocb_try_bypass() acquired ->nocb_lock.
rcu_segcblist_enqueue(&rdp->cblist, head);
if (__is_kfree_rcu_offset((unsigned long)func))
trace_rcu_kfree_callback(rcu_state.name, head,
if (__is_kvfree_rcu_offset((unsigned long)func))
trace_rcu_kvfree_callback(rcu_state.name, head,
(unsigned long)func,
rcu_segcblist_n_cbs(&rdp->cblist));
else
@@ -2957,53 +2971,53 @@ EXPORT_SYMBOL_GPL(call_rcu);
/* Maximum number of jiffies to wait before draining a batch. */
#define KFREE_DRAIN_JIFFIES (HZ / 50)
#define KFREE_N_BATCHES 2
#define FREE_N_CHANNELS 2
/**
 * struct kvfree_rcu_bulk_data - single block to store kvfree_rcu() pointers
 * @nr_records: Number of active pointers in the array
 * @next: Next bulk object in the block chain
 * @records: Array of the kvfree_rcu() pointers
 *
 * @records is a flexible array member, so sizeof() of this structure covers
 * only the header fields. Blocks are obtained one page at a time and
 * KVFREE_BULK_MAX_ENTR is the number of pointers that fit in the rest of
 * that page. A block is chained in front of the current head via @next and
 * starts with @nr_records set to zero.
 */
struct kvfree_rcu_bulk_data {
unsigned long nr_records;
struct kvfree_rcu_bulk_data *next;
void *records[];
};
/*
* This macro defines how many entries the "records" array
* will contain. It is based on the fact that the size of
* kfree_rcu_bulk_data structure becomes exactly one page.
* kvfree_rcu_bulk_data structure becomes exactly one page.
*/
#define KFREE_BULK_MAX_ENTR ((PAGE_SIZE / sizeof(void *)) - 3)
/**
* struct kfree_rcu_bulk_data - single block to store kfree_rcu() pointers
* @nr_records: Number of active pointers in the array
* @records: Array of the kfree_rcu() pointers
* @next: Next bulk object in the block chain
* @head_free_debug: For debug, when CONFIG_DEBUG_OBJECTS_RCU_HEAD is set
*/
struct kfree_rcu_bulk_data {
unsigned long nr_records;
void *records[KFREE_BULK_MAX_ENTR];
struct kfree_rcu_bulk_data *next;
struct rcu_head *head_free_debug;
};
#define KVFREE_BULK_MAX_ENTR \
((PAGE_SIZE - sizeof(struct kvfree_rcu_bulk_data)) / sizeof(void *))
/**
* struct kfree_rcu_cpu_work - single batch of kfree_rcu() requests
* @rcu_work: Let queue_rcu_work() invoke workqueue handler after grace period
* @head_free: List of kfree_rcu() objects waiting for a grace period
* @bhead_free: Bulk-List of kfree_rcu() objects waiting for a grace period
* @bkvhead_free: Bulk-List of kvfree_rcu() objects waiting for a grace period
* @krcp: Pointer to @kfree_rcu_cpu structure
*/
struct kfree_rcu_cpu_work {
struct rcu_work rcu_work;
struct rcu_head *head_free;
struct kfree_rcu_bulk_data *bhead_free;
struct kvfree_rcu_bulk_data *bkvhead_free[FREE_N_CHANNELS];
struct kfree_rcu_cpu *krcp;
};
/**
* struct kfree_rcu_cpu - batch up kfree_rcu() requests for RCU grace period
* @head: List of kfree_rcu() objects not yet waiting for a grace period
* @bhead: Bulk-List of kfree_rcu() objects not yet waiting for a grace period
* @bcached: Keeps at most one object for later reuse when build chain blocks
* @bkvhead: Bulk-List of kvfree_rcu() objects not yet waiting for a grace period
* @krw_arr: Array of batches of kfree_rcu() objects waiting for a grace period
* @lock: Synchronize access to this structure
* @monitor_work: Promote @head to @head_free after KFREE_DRAIN_JIFFIES
* @monitor_todo: Tracks whether a @monitor_work delayed work is pending
* @initialized: The @lock and @rcu_work fields have been initialized
* @initialized: The @rcu_work fields have been initialized
* @count: Number of objects for which GP not started
*
* This is a per-CPU structure. The reason that it is not included in
* the rcu_data structure is to permit this code to be extracted from
@@ -3012,28 +3026,84 @@ struct kfree_rcu_cpu_work {
*/
struct kfree_rcu_cpu {
struct rcu_head *head;
struct kfree_rcu_bulk_data *bhead;
struct kfree_rcu_bulk_data *bcached;
struct kvfree_rcu_bulk_data *bkvhead[FREE_N_CHANNELS];
struct kfree_rcu_cpu_work krw_arr[KFREE_N_BATCHES];
spinlock_t lock;
raw_spinlock_t lock;
struct delayed_work monitor_work;
bool monitor_todo;
bool initialized;
// Number of objects for which GP not started
int count;
/*
* A simple cache list that holds objects for later
* reuse. In order to save some per-CPU space the
* list is singly linked. Even though the list type
* is lockless, access to it has to be protected by
* the per-CPU lock.
*/
struct llist_head bkvcache;
int nr_bkv_objs;
};
static DEFINE_PER_CPU(struct kfree_rcu_cpu, krc);
static DEFINE_PER_CPU(struct kfree_rcu_cpu, krc) = {
.lock = __RAW_SPIN_LOCK_UNLOCKED(krc.lock),
};
static __always_inline void
debug_rcu_head_unqueue_bulk(struct rcu_head *head)
debug_rcu_bhead_unqueue(struct kvfree_rcu_bulk_data *bhead)
{
#ifdef CONFIG_DEBUG_OBJECTS_RCU_HEAD
for (; head; head = head->next)
debug_rcu_head_unqueue(head);
int i;
for (i = 0; i < bhead->nr_records; i++)
debug_rcu_head_unqueue((struct rcu_head *)(bhead->records[i]));
#endif
}
/*
 * Disable local interrupts (saving the previous state in *flags), then look
 * up the current CPU's kfree_rcu_cpu structure and acquire its raw spinlock.
 *
 * Returns the locked per-CPU structure. Release it with
 * krc_this_cpu_unlock(), passing back the flags saved here.
 */
static inline struct kfree_rcu_cpu *
krc_this_cpu_lock(unsigned long *flags)
{
struct kfree_rcu_cpu *krcp;
local_irq_save(*flags); // For safely calling this_cpu_ptr().
krcp = this_cpu_ptr(&krc);
// Interrupts are already off, so the plain (non-irqsave) lock is used.
raw_spin_lock(&krcp->lock);
return krcp;
}
/*
 * Counterpart of krc_this_cpu_lock(): drop @krcp's raw spinlock first and
 * only then restore the interrupt state saved in @flags, i.e. the reverse
 * of the acquisition order.
 */
static inline void
krc_this_cpu_unlock(struct kfree_rcu_cpu *krcp, unsigned long flags)
{
raw_spin_unlock(&krcp->lock);
local_irq_restore(flags);
}
/*
 * Take one spare block out of @krcp's per-CPU cache.
 *
 * Returns NULL when the cache counter says nothing is cached; otherwise
 * decrements the counter and returns the first entry removed from
 * ->bkvcache. The cache is expected to be accessed under krcp->lock.
 */
static inline struct kvfree_rcu_bulk_data *
get_cached_bnode(struct kfree_rcu_cpu *krcp)
{
	struct llist_node *node;

	if (krcp->nr_bkv_objs == 0)
		return NULL;

	krcp->nr_bkv_objs--;
	node = llist_del_first(&krcp->bkvcache);

	return (struct kvfree_rcu_bulk_data *)node;
}
/*
 * Try to stash @bnode in @krcp's per-CPU cache of spare blocks.
 *
 * Returns true when the block was added to ->bkvcache and the counter was
 * bumped. Returns false, leaving @bnode untouched and still owned by the
 * caller, when the cache already holds rcu_min_cached_objs entries.
 */
static inline bool
put_cached_bnode(struct kfree_rcu_cpu *krcp,
	struct kvfree_rcu_bulk_data *bnode)
{
	// Room is available only while below the configured limit.
	bool has_room = krcp->nr_bkv_objs < rcu_min_cached_objs;

	if (has_room) {
		llist_add((struct llist_node *)bnode, &krcp->bkvcache);
		krcp->nr_bkv_objs++;
	}

	return has_room;
}
/*
* This function is invoked in workqueue context after a grace period.
* It frees all the objects queued on ->bkvhead_free or ->head_free.
@@ -3041,38 +3111,63 @@ debug_rcu_head_unqueue_bulk(struct rcu_head *head)
static void kfree_rcu_work(struct work_struct *work)
{
unsigned long flags;
struct kvfree_rcu_bulk_data *bkvhead[FREE_N_CHANNELS], *bnext;
struct rcu_head *head, *next;
struct kfree_rcu_bulk_data *bhead, *bnext;
struct kfree_rcu_cpu *krcp;
struct kfree_rcu_cpu_work *krwp;
int i, j;
krwp = container_of(to_rcu_work(work),
struct kfree_rcu_cpu_work, rcu_work);
krcp = krwp->krcp;
spin_lock_irqsave(&krcp->lock, flags);
raw_spin_lock_irqsave(&krcp->lock, flags);
// Channels 1 and 2.
for (i = 0; i < FREE_N_CHANNELS; i++) {
bkvhead[i] = krwp->bkvhead_free[i];
krwp->bkvhead_free[i] = NULL;
}
// Channel 3.
head = krwp->head_free;
krwp->head_free = NULL;
bhead = krwp->bhead_free;
krwp->bhead_free = NULL;
spin_unlock_irqrestore(&krcp->lock, flags);
raw_spin_unlock_irqrestore(&krcp->lock, flags);
/* "bhead" is now private, so traverse locklessly. */
for (; bhead; bhead = bnext) {
bnext = bhead->next;
// Handle two first channels.
for (i = 0; i < FREE_N_CHANNELS; i++) {
for (; bkvhead[i]; bkvhead[i] = bnext) {
bnext = bkvhead[i]->next;
debug_rcu_bhead_unqueue(bkvhead[i]);
debug_rcu_head_unqueue_bulk(bhead->head_free_debug);
rcu_lock_acquire(&rcu_callback_map);
if (i == 0) { // kmalloc() / kfree().
trace_rcu_invoke_kfree_bulk_callback(
rcu_state.name, bkvhead[i]->nr_records,
bkvhead[i]->records);
rcu_lock_acquire(&rcu_callback_map);
trace_rcu_invoke_kfree_bulk_callback(rcu_state.name,
bhead->nr_records, bhead->records);
kfree_bulk(bkvhead[i]->nr_records,
bkvhead[i]->records);
} else { // vmalloc() / vfree().
for (j = 0; j < bkvhead[i]->nr_records; j++) {
trace_rcu_invoke_kvfree_callback(
rcu_state.name,
bkvhead[i]->records[j], 0);
kfree_bulk(bhead->nr_records, bhead->records);
rcu_lock_release(&rcu_callback_map);
vfree(bkvhead[i]->records[j]);
}
}
rcu_lock_release(&rcu_callback_map);
if (cmpxchg(&krcp->bcached, NULL, bhead))
free_page((unsigned long) bhead);
krcp = krc_this_cpu_lock(&flags);
if (put_cached_bnode(krcp, bkvhead[i]))
bkvhead[i] = NULL;
krc_this_cpu_unlock(krcp, flags);
cond_resched_tasks_rcu_qs();
if (bkvhead[i])
free_page((unsigned long) bkvhead[i]);
cond_resched_tasks_rcu_qs();
}
}
/*
@@ -3082,14 +3177,15 @@ static void kfree_rcu_work(struct work_struct *work)
*/
for (; head; head = next) {
unsigned long offset = (unsigned long)head->func;
void *ptr = (void *)head - offset;
next = head->next;
debug_rcu_head_unqueue(head);
debug_rcu_head_unqueue((struct rcu_head *)ptr);
rcu_lock_acquire(&rcu_callback_map);
trace_rcu_invoke_kfree_callback(rcu_state.name, head, offset);
trace_rcu_invoke_kvfree_callback(rcu_state.name, head, offset);
if (!WARN_ON_ONCE(!__is_kfree_rcu_offset(offset)))
kfree((void *)head - offset);
if (!WARN_ON_ONCE(!__is_kvfree_rcu_offset(offset)))
kvfree(ptr);
rcu_lock_release(&rcu_callback_map);
cond_resched_tasks_rcu_qs();
@@ -3105,8 +3201,8 @@ static void kfree_rcu_work(struct work_struct *work)
static inline bool queue_kfree_rcu_work(struct kfree_rcu_cpu *krcp)
{
struct kfree_rcu_cpu_work *krwp;
bool queued = false;
int i;
bool repeat = false;
int i, j;
lockdep_assert_held(&krcp->lock);
@@ -3114,21 +3210,25 @@ static inline bool queue_kfree_rcu_work(struct kfree_rcu_cpu *krcp)
krwp = &(krcp->krw_arr[i]);
/*
* Try to detach bhead or head and attach it over any
* Try to detach bkvhead or head and attach it over any
* available corresponding free channel. It can be that
* a previous RCU batch is in progress, it means that
* immediately to queue another one is not possible so
* return false to tell caller to retry.
*/
if ((krcp->bhead && !krwp->bhead_free) ||
if ((krcp->bkvhead[0] && !krwp->bkvhead_free[0]) ||
(krcp->bkvhead[1] && !krwp->bkvhead_free[1]) ||
(krcp->head && !krwp->head_free)) {
/* Channel 1. */
if (!krwp->bhead_free) {
krwp->bhead_free = krcp->bhead;
krcp->bhead = NULL;
// Channel 1 corresponds to SLAB ptrs.
// Channel 2 corresponds to vmalloc ptrs.
for (j = 0; j < FREE_N_CHANNELS; j++) {
if (!krwp->bkvhead_free[j]) {
krwp->bkvhead_free[j] = krcp->bkvhead[j];
krcp->bkvhead[j] = NULL;
}
}
/* Channel 2. */
// Channel 3 corresponds to emergency path.
if (!krwp->head_free) {
krwp->head_free = krcp->head;
krcp->head = NULL;
@@ -3137,17 +3237,21 @@ static inline bool queue_kfree_rcu_work(struct kfree_rcu_cpu *krcp)
WRITE_ONCE(krcp->count, 0);
/*
* One work is per one batch, so there are two "free channels",
* "bhead_free" and "head_free" the batch can handle. It can be
* that the work is in the pending state when two channels have
* been detached following each other, one by one.
* One work is per one batch, so there are three
* "free channels", the batch can handle. It can
* be that the work is in the pending state when
* channels have been detached following by each
* other.
*/
queue_rcu_work(system_wq, &krwp->rcu_work);
queued = true;
}
// Repeat if any "free" corresponding channel is still busy.
if (krcp->bkvhead[0] || krcp->bkvhead[1] || krcp->head)
repeat = true;
}
return queued;
return !repeat;
}
static inline void kfree_rcu_drain_unlock(struct kfree_rcu_cpu *krcp,
@@ -3157,14 +3261,14 @@ static inline void kfree_rcu_drain_unlock(struct kfree_rcu_cpu *krcp,
krcp->monitor_todo = false;
if (queue_kfree_rcu_work(krcp)) {
// Success! Our job is done here.
spin_unlock_irqrestore(&krcp->lock, flags);
raw_spin_unlock_irqrestore(&krcp->lock, flags);
return;
}
// Previous RCU batch still in progress, try again later.
krcp->monitor_todo = true;
schedule_delayed_work(&krcp->monitor_work, KFREE_DRAIN_JIFFIES);
spin_unlock_irqrestore(&krcp->lock, flags);
raw_spin_unlock_irqrestore(&krcp->lock, flags);
}
/*
@@ -3177,32 +3281,50 @@ static void kfree_rcu_monitor(struct work_struct *work)
struct kfree_rcu_cpu *krcp = container_of(work, struct kfree_rcu_cpu,
monitor_work.work);
spin_lock_irqsave(&krcp->lock, flags);
raw_spin_lock_irqsave(&krcp->lock, flags);
if (krcp->monitor_todo)
kfree_rcu_drain_unlock(krcp, flags);
else
spin_unlock_irqrestore(&krcp->lock, flags);
raw_spin_unlock_irqrestore(&krcp->lock, flags);
}
static inline bool
kfree_call_rcu_add_ptr_to_bulk(struct kfree_rcu_cpu *krcp,
struct rcu_head *head, rcu_callback_t func)
kvfree_call_rcu_add_ptr_to_bulk(struct kfree_rcu_cpu *krcp, void *ptr)
{
struct kfree_rcu_bulk_data *bnode;
struct kvfree_rcu_bulk_data *bnode;
int idx;
if (unlikely(!krcp->initialized))
return false;
lockdep_assert_held(&krcp->lock);
idx = !!is_vmalloc_addr(ptr);
/* Check if a new block is required. */
if (!krcp->bhead ||
krcp->bhead->nr_records == KFREE_BULK_MAX_ENTR) {
bnode = xchg(&krcp->bcached, NULL);
if (!krcp->bkvhead[idx] ||
krcp->bkvhead[idx]->nr_records == KVFREE_BULK_MAX_ENTR) {
bnode = get_cached_bnode(krcp);
if (!bnode) {
WARN_ON_ONCE(sizeof(struct kfree_rcu_bulk_data) > PAGE_SIZE);
/*
* To keep this path working on raw non-preemptible
* sections, prevent the optional entry into the
* allocator as it uses sleeping locks. In fact, even
* if the caller of kfree_rcu() is preemptible, this
* path still is not, as krcp->lock is a raw spinlock.
* With additional page pre-allocation in the works,
* hitting this return is going to be much less likely.
*/
if (IS_ENABLED(CONFIG_PREEMPT_RT))
return false;
bnode = (struct kfree_rcu_bulk_data *)
/*
* NOTE: For one argument of kvfree_rcu() we can
* drop the lock and get the page in sleepable
* context. That would allow to maintain an array
* for the CONFIG_PREEMPT_RT as well if no cached
* pages are available.
*/
bnode = (struct kvfree_rcu_bulk_data *)
__get_free_page(GFP_NOWAIT | __GFP_NOWARN);
}
@@ -3212,53 +3334,62 @@ kfree_call_rcu_add_ptr_to_bulk(struct kfree_rcu_cpu *krcp,
/* Initialize the new block. */
bnode->nr_records = 0;
bnode->next = krcp->bhead;
bnode->head_free_debug = NULL;
bnode->next = krcp->bkvhead[idx];
/* Attach it to the head. */
krcp->bhead = bnode;
krcp->bkvhead[idx] = bnode;
}
#ifdef CONFIG_DEBUG_OBJECTS_RCU_HEAD
head->func = func;
head->next = krcp->bhead->head_free_debug;
krcp->bhead->head_free_debug = head;
#endif
/* Finally insert. */
krcp->bhead->records[krcp->bhead->nr_records++] =
(void *) head - (unsigned long) func;
krcp->bkvhead[idx]->records
[krcp->bkvhead[idx]->nr_records++] = ptr;
return true;
}
/*
* Queue a request for lazy invocation of kfree_bulk()/kfree() after a grace
* period. Please note there are two paths are maintained, one is the main one
* that uses kfree_bulk() interface and second one is emergency one, that is
* used only when the main path can not be maintained temporary, due to memory
* pressure.
* Queue a request for lazy invocation of appropriate free routine after a
* grace period. Please note there are three paths are maintained, two are the
* main ones that use array of pointers interface and third one is emergency
* one, that is used only when the main path can not be maintained temporary,
* due to memory pressure.
*
* Each kfree_call_rcu() request is added to a batch. The batch will be drained
* Each kvfree_call_rcu() request is added to a batch. The batch will be drained
* every KFREE_DRAIN_JIFFIES number of jiffies. All the objects in the batch will
* be free'd in workqueue context. This allows us to: batch requests together to
* reduce the number of grace periods during heavy kfree_rcu() load.
* reduce the number of grace periods during heavy kfree_rcu()/kvfree_rcu() load.
*/
void kfree_call_rcu(struct rcu_head *head, rcu_callback_t func)
void kvfree_call_rcu(struct rcu_head *head, rcu_callback_t func)
{
unsigned long flags;
struct kfree_rcu_cpu *krcp;
bool success;
void *ptr;
local_irq_save(flags); // For safely calling this_cpu_ptr().
krcp = this_cpu_ptr(&krc);
if (krcp->initialized)
spin_lock(&krcp->lock);
if (head) {
ptr = (void *) head - (unsigned long) func;
} else {
/*
* Please note there is a limitation for the head-less
* variant, that is why there is a clear rule for such
* objects: it can be used from might_sleep() context
* only. For other places please embed an rcu_head to
* your data.
*/
might_sleep();
ptr = (unsigned long *) func;
}
krcp = krc_this_cpu_lock(&flags);
// Queue the object but don't yet schedule the batch.
if (debug_rcu_head_queue(head)) {
if (debug_rcu_head_queue(ptr)) {
// Probable double kfree_rcu(), just leak.
WARN_ONCE(1, "%s(): Double-freed call. rcu_head %p\n",
__func__, head);
// Mark as success and leave.
success = true;
goto unlock_return;
}
@@ -3266,10 +3397,16 @@ void kfree_call_rcu(struct rcu_head *head, rcu_callback_t func)
* Under high memory pressure GFP_NOWAIT can fail,
* in that case the emergency path is maintained.
*/
if (unlikely(!kfree_call_rcu_add_ptr_to_bulk(krcp, head, func))) {
success = kvfree_call_rcu_add_ptr_to_bulk(krcp, ptr);
if (!success) {
if (head == NULL)
// Inline if kvfree_rcu(one_arg) call.
goto unlock_return;
head->func = func;
head->next = krcp->head;
krcp->head = head;
success = true;
}
WRITE_ONCE(krcp->count, krcp->count + 1);
@@ -3282,11 +3419,20 @@ void kfree_call_rcu(struct rcu_head *head, rcu_callback_t func)
}
unlock_return:
if (krcp->initialized)
spin_unlock(&krcp->lock);
local_irq_restore(flags);
krc_this_cpu_unlock(krcp, flags);
/*
* Inline kvfree() after synchronize_rcu(). We can do
* it from might_sleep() context only, so the current
* CPU can pass the QS state.
*/
if (!success) {
debug_rcu_head_unqueue((struct rcu_head *) ptr);
synchronize_rcu();
kvfree(ptr);
}
}
EXPORT_SYMBOL_GPL(kfree_call_rcu);
EXPORT_SYMBOL_GPL(kvfree_call_rcu);
static unsigned long
kfree_rcu_shrink_count(struct shrinker *shrink, struct shrink_control *sc)
@@ -3315,11 +3461,11 @@ kfree_rcu_shrink_scan(struct shrinker *shrink, struct shrink_control *sc)
struct kfree_rcu_cpu *krcp = per_cpu_ptr(&krc, cpu);
count = krcp->count;
spin_lock_irqsave(&krcp->lock, flags);
raw_spin_lock_irqsave(&krcp->lock, flags);
if (krcp->monitor_todo)
kfree_rcu_drain_unlock(krcp, flags);
else
spin_unlock_irqrestore(&krcp->lock, flags);
raw_spin_unlock_irqrestore(&krcp->lock, flags);
sc->nr_to_scan -= count;
freed += count;
@@ -3328,7 +3474,7 @@ kfree_rcu_shrink_scan(struct shrinker *shrink, struct shrink_control *sc)
break;
}
return freed;
return freed == 0 ? SHRINK_STOP : freed;
}
static struct shrinker kfree_rcu_shrinker = {
@@ -3346,15 +3492,15 @@ void __init kfree_rcu_scheduler_running(void)
for_each_online_cpu(cpu) {
struct kfree_rcu_cpu *krcp = per_cpu_ptr(&krc, cpu);
spin_lock_irqsave(&krcp->lock, flags);
raw_spin_lock_irqsave(&krcp->lock, flags);
if (!krcp->head || krcp->monitor_todo) {
spin_unlock_irqrestore(&krcp->lock, flags);
raw_spin_unlock_irqrestore(&krcp->lock, flags);
continue;
}
krcp->monitor_todo = true;
schedule_delayed_work_on(cpu, &krcp->monitor_work,
KFREE_DRAIN_JIFFIES);
spin_unlock_irqrestore(&krcp->lock, flags);
raw_spin_unlock_irqrestore(&krcp->lock, flags);
}
}
@@ -3842,10 +3988,9 @@ void rcu_cpu_starting(unsigned int cpu)
{
unsigned long flags;
unsigned long mask;
int nbits;
unsigned long oldmask;
struct rcu_data *rdp;
struct rcu_node *rnp;
bool newcpu;
if (per_cpu(rcu_cpu_started, cpu))
return;
@@ -3857,12 +4002,10 @@ void rcu_cpu_starting(unsigned int cpu)
mask = rdp->grpmask;
raw_spin_lock_irqsave_rcu_node(rnp, flags);
WRITE_ONCE(rnp->qsmaskinitnext, rnp->qsmaskinitnext | mask);
oldmask = rnp->expmaskinitnext;
newcpu = !(rnp->expmaskinitnext & mask);
rnp->expmaskinitnext |= mask;
oldmask ^= rnp->expmaskinitnext;
nbits = bitmap_weight(&oldmask, BITS_PER_LONG);
/* Allow lockless access for expedited grace periods. */
smp_store_release(&rcu_state.ncpus, rcu_state.ncpus + nbits); /* ^^^ */
smp_store_release(&rcu_state.ncpus, rcu_state.ncpus + newcpu); /* ^^^ */
ASSERT_EXCLUSIVE_WRITER(rcu_state.ncpus);
rcu_gpnum_ovf(rnp, rdp); /* Offline-induced counter wrap? */
rdp->rcu_onl_gp_seq = READ_ONCE(rcu_state.gp_seq);
@@ -4249,13 +4392,23 @@ static void __init kfree_rcu_batch_init(void)
for_each_possible_cpu(cpu) {
struct kfree_rcu_cpu *krcp = per_cpu_ptr(&krc, cpu);
struct kvfree_rcu_bulk_data *bnode;
spin_lock_init(&krcp->lock);
for (i = 0; i < KFREE_N_BATCHES; i++) {
INIT_RCU_WORK(&krcp->krw_arr[i].rcu_work, kfree_rcu_work);
krcp->krw_arr[i].krcp = krcp;
}
for (i = 0; i < rcu_min_cached_objs; i++) {
bnode = (struct kvfree_rcu_bulk_data *)
__get_free_page(GFP_NOWAIT | __GFP_NOWARN);
if (bnode)
put_cached_bnode(krcp, bnode);
else
pr_err("Failed to preallocate for %d CPU!\n", cpu);
}
INIT_DELAYED_WORK(&krcp->monitor_work, kfree_rcu_monitor);
krcp->initialized = true;
}

View File

@@ -41,7 +41,7 @@ struct rcu_node {
raw_spinlock_t __private lock; /* Root rcu_node's lock protects */
/* some rcu_state fields as well as */
/* following. */
unsigned long gp_seq; /* Track rsp->rcu_gp_seq. */
unsigned long gp_seq; /* Track rsp->gp_seq. */
unsigned long gp_seq_needed; /* Track furthest future GP request. */
unsigned long completedqs; /* All QSes done for this node. */
unsigned long qsmask; /* CPUs or groups that need to switch in */
@@ -73,9 +73,9 @@ struct rcu_node {
unsigned long ffmask; /* Fully functional CPUs. */
unsigned long grpmask; /* Mask to apply to parent qsmask. */
/* Only one bit will be set in this mask. */
int grplo; /* lowest-numbered CPU or group here. */
int grphi; /* highest-numbered CPU or group here. */
u8 grpnum; /* CPU/group number for next level up. */
int grplo; /* lowest-numbered CPU here. */
int grphi; /* highest-numbered CPU here. */
u8 grpnum; /* group number for next level up. */
u8 level; /* root is at level 0. */
bool wait_blkd_tasks;/* Necessary to wait for blocked tasks to */
/* exit RCU read-side critical sections */
@@ -149,7 +149,7 @@ union rcu_noqs {
/* Per-CPU data for read-copy update. */
struct rcu_data {
/* 1) quiescent-state and grace-period handling : */
unsigned long gp_seq; /* Track rsp->rcu_gp_seq counter. */
unsigned long gp_seq; /* Track rsp->gp_seq counter. */
unsigned long gp_seq_needed; /* Track furthest future GP request. */
union rcu_noqs cpu_no_qs; /* No QSes yet for this CPU. */
bool core_needs_qs; /* Core waits for quiesc state. */
@@ -171,6 +171,7 @@ struct rcu_data {
/* different grace periods. */
long qlen_last_fqs_check;
/* qlen at last check for QS forcing */
unsigned long n_cbs_invoked; /* # callbacks invoked since boot. */
unsigned long n_force_qs_snap;
/* did other CPU force QS recently? */
long blimit; /* Upper limit on a processed batch */
@@ -301,6 +302,8 @@ struct rcu_state {
u8 boost ____cacheline_internodealigned_in_smp;
/* Subject to priority boost. */
unsigned long gp_seq; /* Grace-period sequence #. */
unsigned long gp_max; /* Maximum GP duration in */
/* jiffies. */
struct task_struct *gp_kthread; /* Task for grace periods. */
struct swait_queue_head gp_wq; /* Where GP task waits. */
short gp_flags; /* Commands for GP task. */
@@ -346,8 +349,6 @@ struct rcu_state {
/* a reluctant CPU. */
unsigned long n_force_qs_gpstart; /* Snapshot of n_force_qs at */
/* GP start. */
unsigned long gp_max; /* Maximum GP duration in */
/* jiffies. */
const char *name; /* Name of structure. */
char abbr; /* Abbreviated name. */

View File

@@ -403,7 +403,7 @@ retry_ipi:
/* Online, so delay for a bit and try again. */
raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
trace_rcu_exp_grace_period(rcu_state.name, rcu_exp_gp_seq_endval(), TPS("selectofl"));
schedule_timeout_uninterruptible(1);
schedule_timeout_idle(1);
goto retry_ipi;
}
/* CPU really is offline, so we must report its QS. */

View File

@@ -1033,7 +1033,7 @@ static int rcu_boost_kthread(void *arg)
if (spincnt > 10) {
WRITE_ONCE(rnp->boost_kthread_status, RCU_KTHREAD_YIELDING);
trace_rcu_utilization(TPS("End boost kthread@rcu_yield"));
schedule_timeout_interruptible(2);
schedule_timeout_idle(2);
trace_rcu_utilization(TPS("Start boost kthread@rcu_yield"));
spincnt = 0;
}
@@ -2005,7 +2005,7 @@ static void nocb_gp_wait(struct rcu_data *my_rdp)
/* Polling, so trace if first poll in the series. */
if (gotcbs)
trace_rcu_nocb_wake(rcu_state.name, cpu, TPS("Poll"));
schedule_timeout_interruptible(1);
schedule_timeout_idle(1);
} else if (!needwait_gp) {
/* Wait for callbacks to appear. */
trace_rcu_nocb_wake(rcu_state.name, cpu, TPS("Sleep"));

View File

@@ -237,14 +237,12 @@ struct rcu_stall_chk_rdr {
*/
static bool check_slow_task(struct task_struct *t, void *arg)
{
	// @arg is the struct rcu_stall_chk_rdr that receives the snapshot.
	struct rcu_stall_chk_rdr *rscrp = arg;

	if (task_curr(t))
		return false; // It is running, so decline to inspect it.

	// Task is not running: record its reader nesting depth, its
	// unlock-special state, and whether it is queued on a blocked list.
	rscrp->nesting = t->rcu_read_lock_nesting;
	rscrp->rs = t->rcu_read_unlock_special;
	rscrp->on_blkd_list = !list_empty(&t->rcu_node_entry);
	return true; // Snapshot taken.
}
@@ -468,7 +466,7 @@ static void print_other_cpu_stall(unsigned long gp_seq, unsigned long gps)
/*
* OK, time to rat on our buddy...
* See Documentation/RCU/stallwarn.txt for info on how to debug
* See Documentation/RCU/stallwarn.rst for info on how to debug
* RCU CPU stall warnings.
*/
pr_err("INFO: %s detected stalls on CPUs/tasks:\n", rcu_state.name);
@@ -535,7 +533,7 @@ static void print_cpu_stall(unsigned long gps)
/*
* OK, time to rat on ourselves...
* See Documentation/RCU/stallwarn.txt for info on how to debug
* See Documentation/RCU/stallwarn.rst for info on how to debug
* RCU CPU stall warnings.
*/
pr_err("INFO: %s self-detected stall on CPU\n", rcu_state.name);
@@ -649,6 +647,7 @@ static void check_cpu_stall(struct rcu_data *rdp)
*/
void show_rcu_gp_kthreads(void)
{
unsigned long cbs = 0;
int cpu;
unsigned long j;
unsigned long ja;
@@ -690,9 +689,11 @@ void show_rcu_gp_kthreads(void)
}
for_each_possible_cpu(cpu) {
rdp = per_cpu_ptr(&rcu_data, cpu);
cbs += data_race(rdp->n_cbs_invoked);
if (rcu_segcblist_is_offloaded(&rdp->cblist))
show_rcu_nocb_state(rdp);
}
pr_info("RCU callbacks invoked since boot: %lu\n", cbs);
show_rcu_tasks_gp_kthreads();
}
EXPORT_SYMBOL_GPL(show_rcu_gp_kthreads);

View File

@@ -42,6 +42,7 @@
#include <linux/kprobes.h>
#include <linux/slab.h>
#include <linux/irq_work.h>
#include <linux/rcupdate_trace.h>
#define CREATE_TRACE_POINTS
@@ -207,7 +208,7 @@ void rcu_end_inkernel_boot(void)
rcu_unexpedite_gp();
if (rcu_normal_after_boot)
WRITE_ONCE(rcu_normal, 1);
rcu_boot_ended = 1;
rcu_boot_ended = true;
}
/*
@@ -279,6 +280,7 @@ struct lockdep_map rcu_sched_lock_map = {
};
EXPORT_SYMBOL_GPL(rcu_sched_lock_map);
// Tell lockdep when RCU callbacks are being invoked.
static struct lock_class_key rcu_callback_key;
struct lockdep_map rcu_callback_map =
STATIC_LOCKDEP_MAP_INIT("rcu_callback", &rcu_callback_key);
@@ -390,13 +392,14 @@ void __wait_rcu_gp(bool checktiny, int n, call_rcu_func_t *crcu_array,
might_sleep();
continue;
}
init_rcu_head_on_stack(&rs_array[i].head);
init_completion(&rs_array[i].completion);
for (j = 0; j < i; j++)
if (crcu_array[j] == crcu_array[i])
break;
if (j == i)
if (j == i) {
init_rcu_head_on_stack(&rs_array[i].head);
init_completion(&rs_array[i].completion);
(crcu_array[i])(&rs_array[i].head, wakeme_after_rcu);
}
}
/* Wait for all callbacks to be invoked. */
@@ -407,9 +410,10 @@ void __wait_rcu_gp(bool checktiny, int n, call_rcu_func_t *crcu_array,
for (j = 0; j < i; j++)
if (crcu_array[j] == crcu_array[i])
break;
if (j == i)
if (j == i) {
wait_for_completion(&rs_array[i].completion);
destroy_rcu_head_on_stack(&rs_array[i].head);
destroy_rcu_head_on_stack(&rs_array[i].head);
}
}
}
EXPORT_SYMBOL_GPL(__wait_rcu_gp);