Merge branch 'locking-core-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip
Pull locking updates from Ingo Molnar:
 "The main changes in this cycle are:

   - rwsem scalability improvements, phase #2, by Waiman Long, which are
     rather impressive:

       "On a 2-socket 40-core 80-thread Skylake system with 40
        reader and writer locking threads, the min/mean/max locking
        operations done in a 5-second testing window before the
        patchset were:

         40 readers, Iterations Min/Mean/Max = 1,807/1,808/1,810
         40 writers, Iterations Min/Mean/Max = 1,807/50,344/151,255

        After the patchset, they became:

         40 readers, Iterations Min/Mean/Max = 30,057/31,359/32,741
         40 writers, Iterations Min/Mean/Max = 94,466/95,845/97,098"

     There are a lot of changes to the locking implementation that make
     it similar to qrwlock, including owner handoff for more fair
     locking.

     Another microbenchmark shows the improvements across the whole
     spectrum of thread counts:

       "With a locking microbenchmark running on a 5.1 based kernel, the
        total locking rates (in kops/s) on a 2-socket Skylake system
        with equal numbers of readers and writers (mixed) before and
        after this patchset were:

        # of Threads   Before Patch   After Patch
        ------------   ------------   -----------
             2            2,618          4,193
             4            1,202          3,726
             8              802          3,622
            16              729          3,359
            32              319          2,826
            64              102          2,744"

     The changes are extensive and the patch-set has been through
     several iterations addressing various locking workloads. There
     might be more regressions, but unless they are pathological I
     believe we want to use this new implementation as the baseline
     going forward.

   - jump-label optimizations by Daniel Bristot de Oliveira: the
     primary motivation was to remove IPI disturbance of isolated
     RT-workload CPUs, which resulted in the implementation of batched
     jump-label updates. Beyond improving the kernel's real-time
     characteristics, in one test this patchset improved static key
     update overhead from 57 msecs to just 1.4 msecs - which is a nice
     speedup as well.

   - atomic64_t cross-arch type cleanups by Mark Rutland: over the last
     ~10 years of atomic64_t existence the various types used by the
     APIs only had to be self-consistent within each architecture -
     which means they became wildly inconsistent across architectures.
     Mark puts an end to this by reworking all the atomic64
     implementations to use 's64' as the base type for atomic64_t, and
     by ensuring that this type is consistently used for parameters and
     return values in the API, avoiding further problems in this area.

   - A large set of small improvements to lockdep by Yuyang Du: type
     cleanups, output cleanups, function return type cleanups and other
     cleanups all over the place.

   - A set of percpu ops cleanups and fixes by Peter Zijlstra.
   - Misc other changes - please see the Git log for more details"

* 'locking-core-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip: (82 commits)
  locking/lockdep: increase size of counters for lockdep statistics
  locking/atomics: Use sed(1) instead of non-standard head(1) option
  locking/lockdep: Move mark_lock() inside CONFIG_TRACE_IRQFLAGS && CONFIG_PROVE_LOCKING
  x86/jump_label: Make tp_vec_nr static
  x86/percpu: Optimize raw_cpu_xchg()
  x86/percpu, sched/fair: Avoid local_clock()
  x86/percpu, x86/irq: Relax {set,get}_irq_regs()
  x86/percpu: Relax smp_processor_id()
  x86/percpu: Differentiate this_cpu_{}() and __this_cpu_{}()
  locking/rwsem: Guard against making count negative
  locking/rwsem: Adaptive disabling of reader optimistic spinning
  locking/rwsem: Enable time-based spinning on reader-owned rwsem
  locking/rwsem: Make rwsem->owner an atomic_long_t
  locking/rwsem: Enable readers spinning on writer
  locking/rwsem: Clarify usage of owner's nonspinnable bit
  locking/rwsem: Wake up almost all readers in wait queue
  locking/rwsem: More optimal RT task handling of null owner
  locking/rwsem: Always release wait_lock before waking up tasks
  locking/rwsem: Implement lock handoff to prevent lock starvation
  locking/rwsem: Make rwsem_spin_on_owner() return owner state
  ...
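To illustrate the atomic64_t convention described above, here is a minimal sketch (simplified, not the kernel's actual generic header; arch_atomic64_fetch_add() stands in for whatever per-architecture implementation backs it) of what the cleanup standardizes on - s64 for the counter, the parameters and the return values alike:

  #include <linux/types.h>      /* s64 */
  #include <linux/compiler.h>   /* READ_ONCE() */

  /* Sketch only: the base type is s64 on every architecture. */
  typedef struct {
          s64 counter;
  } atomic64_t;

  static inline s64 atomic64_read(const atomic64_t *v)
  {
          return READ_ONCE(v->counter);           /* s64 return value */
  }

  static inline s64 atomic64_fetch_add(s64 i, atomic64_t *v)
  {
          return arch_atomic64_fetch_add(i, v);   /* s64 parameter and return */
  }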
@@ -14,6 +14,7 @@
 #include <linux/kdebug.h>
 #include <linux/kprobes.h>
 #include <linux/mmu_context.h>
+#include <linux/bsearch.h>
 #include <asm/text-patching.h>
 #include <asm/alternative.h>
 #include <asm/sections.h>
@@ -848,81 +849,133 @@ static void do_sync_core(void *info)
 	sync_core();
 }
 
-static bool bp_patching_in_progress;
-static void *bp_int3_handler, *bp_int3_addr;
+static struct bp_patching_desc {
+	struct text_poke_loc *vec;
+	int nr_entries;
+} bp_patching;
+
+static int patch_cmp(const void *key, const void *elt)
+{
+	struct text_poke_loc *tp = (struct text_poke_loc *) elt;
+
+	if (key < tp->addr)
+		return -1;
+	if (key > tp->addr)
+		return 1;
+	return 0;
+}
+NOKPROBE_SYMBOL(patch_cmp);
 
 int poke_int3_handler(struct pt_regs *regs)
 {
+	struct text_poke_loc *tp;
 	unsigned char int3 = 0xcc;
+	void *ip;
 
 	/*
 	 * Having observed our INT3 instruction, we now must observe
-	 * bp_patching_in_progress.
+	 * bp_patching.nr_entries.
 	 *
-	 *	in_progress = TRUE		INT3
+	 *	nr_entries != 0			INT3
 	 *	WMB				RMB
-	 *	write INT3			if (in_progress)
+	 *	write INT3			if (nr_entries)
 	 *
-	 * Idem for bp_int3_handler.
+	 * Idem for other elements in bp_patching.
	 */
 	smp_rmb();
 
-	if (likely(!bp_patching_in_progress))
+	if (likely(!bp_patching.nr_entries))
 		return 0;
 
-	if (user_mode(regs) || regs->ip != (unsigned long)bp_int3_addr)
+	if (user_mode(regs))
 		return 0;
 
-	/* set up the specified breakpoint handler */
-	regs->ip = (unsigned long) bp_int3_handler;
+	/*
+	 * Discount the sizeof(int3). See text_poke_bp_batch().
+	 */
+	ip = (void *) regs->ip - sizeof(int3);
+
+	/*
+	 * Skip the binary search if there is a single member in the vector.
+	 */
+	if (unlikely(bp_patching.nr_entries > 1)) {
+		tp = bsearch(ip, bp_patching.vec, bp_patching.nr_entries,
+			     sizeof(struct text_poke_loc),
+			     patch_cmp);
+		if (!tp)
+			return 0;
+	} else {
+		tp = bp_patching.vec;
+		if (tp->addr != ip)
+			return 0;
+	}
+
+	/* set up the specified breakpoint detour */
+	regs->ip = (unsigned long) tp->detour;
 
 	return 1;
 }
 NOKPROBE_SYMBOL(poke_int3_handler);
 
 /**
- * text_poke_bp() -- update instructions on live kernel on SMP
- * @addr:	address to patch
- * @opcode:	opcode of new instruction
- * @len:	length to copy
- * @handler:	address to jump to when the temporary breakpoint is hit
+ * text_poke_bp_batch() -- update instructions on live kernel on SMP
+ * @tp:			vector of instructions to patch
+ * @nr_entries:		number of entries in the vector
  *
  * Modify multi-byte instruction by using int3 breakpoint on SMP.
  * We completely avoid stop_machine() here, and achieve the
  * synchronization using int3 breakpoint.
  *
  * The way it is done:
- *	- add a int3 trap to the address that will be patched
+ *	- For each entry in the vector:
+ *		- add a int3 trap to the address that will be patched
  *	- sync cores
- *	- update all but the first byte of the patched range
+ *	- For each entry in the vector:
+ *		- update all but the first byte of the patched range
  *	- sync cores
- *	- replace the first byte (int3) by the first byte of
- *	  replacing opcode
+ *	- For each entry in the vector:
+ *		- replace the first byte (int3) by the first byte of
+ *		  replacing opcode
  *	- sync cores
  */
-void text_poke_bp(void *addr, const void *opcode, size_t len, void *handler)
+void text_poke_bp_batch(struct text_poke_loc *tp, unsigned int nr_entries)
 {
+	int patched_all_but_first = 0;
 	unsigned char int3 = 0xcc;
+	unsigned int i;
 
-	bp_int3_handler = handler;
-	bp_int3_addr = (u8 *)addr + sizeof(int3);
-	bp_patching_in_progress = true;
-
 	lockdep_assert_held(&text_mutex);
 
+	bp_patching.vec = tp;
+	bp_patching.nr_entries = nr_entries;
+
 	/*
	 * Corresponding read barrier in int3 notifier for making sure the
-	 * in_progress and handler are correctly ordered wrt. patching.
+	 * nr_entries and handler are correctly ordered wrt. patching.
	 */
 	smp_wmb();
 
-	text_poke(addr, &int3, sizeof(int3));
+	/*
+	 * First step: add a int3 trap to the address that will be patched.
+	 */
+	for (i = 0; i < nr_entries; i++)
+		text_poke(tp[i].addr, &int3, sizeof(int3));
 
 	on_each_cpu(do_sync_core, NULL, 1);
 
-	if (len - sizeof(int3) > 0) {
-		/* patch all but the first byte */
-		text_poke((char *)addr + sizeof(int3),
-			  (const char *) opcode + sizeof(int3),
-			  len - sizeof(int3));
+	/*
	 * Second step: update all but the first byte of the patched range.
	 */
+	for (i = 0; i < nr_entries; i++) {
+		if (tp[i].len - sizeof(int3) > 0) {
+			text_poke((char *)tp[i].addr + sizeof(int3),
+				  (const char *)tp[i].opcode + sizeof(int3),
+				  tp[i].len - sizeof(int3));
+			patched_all_but_first++;
+		}
+	}
+
+	if (patched_all_but_first) {
 		/*
		 * According to Intel, this core syncing is very likely
		 * not necessary and we'd be safe even without it. But
@@ -931,14 +984,47 @@ void text_poke_bp(void *addr, const void *opcode, size_t len, void *handler)
 		on_each_cpu(do_sync_core, NULL, 1);
 	}
 
-	/* patch the first byte */
-	text_poke(addr, opcode, sizeof(int3));
+	/*
	 * Third step: replace the first byte (int3) by the first byte of
	 * replacing opcode.
	 */
+	for (i = 0; i < nr_entries; i++)
+		text_poke(tp[i].addr, tp[i].opcode, sizeof(int3));
 
 	on_each_cpu(do_sync_core, NULL, 1);
 	/*
	 * sync_core() implies an smp_mb() and orders this store against
	 * the writing of the new instruction.
	 */
-	bp_patching_in_progress = false;
+	bp_patching.vec = NULL;
+	bp_patching.nr_entries = 0;
 }
 
+/**
+ * text_poke_bp() -- update instructions on live kernel on SMP
+ * @addr:	address to patch
+ * @opcode:	opcode of new instruction
+ * @len:	length to copy
+ * @handler:	address to jump to when the temporary breakpoint is hit
+ *
+ * Update a single instruction with the vector in the stack, avoiding
+ * dynamically allocated memory. This function should be used when it is
+ * not possible to allocate memory.
+ */
+void text_poke_bp(void *addr, const void *opcode, size_t len, void *handler)
+{
+	struct text_poke_loc tp = {
+		.detour = handler,
+		.addr = addr,
+		.len = len,
+	};
+
+	if (len > POKE_MAX_OPCODE_SIZE) {
+		WARN_ONCE(1, "len is larger than %d\n", POKE_MAX_OPCODE_SIZE);
+		return;
+	}
+
+	memcpy((void *)tp.opcode, opcode, len);
+
+	text_poke_bp_batch(&tp, 1);
+}
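The hunks above use struct text_poke_loc and POKE_MAX_OPCODE_SIZE without showing their definitions (they come from a header, presumably arch/x86/include/asm/text-patching.h). A sketch that is consistent with how the fields are used in poke_int3_handler() and text_poke_bp_batch() - field names taken from that usage, and the value of POKE_MAX_OPCODE_SIZE is an assumption (room for a 5-byte JMP) - looks like this:

  /* Sketch only - reconstructed from the usage above, not the real header. */
  #define POKE_MAX_OPCODE_SIZE    5       /* assumed: large enough for a 5-byte JMP/NOP */

  struct text_poke_loc {
          void *detour;                   /* where poke_int3_handler() redirects regs->ip */
          void *addr;                     /* first byte of the instruction being patched */
          size_t len;                     /* total length of the replacement instruction */
          const char opcode[POKE_MAX_OPCODE_SIZE];        /* the replacement bytes */
  };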
@@ -35,41 +35,43 @@ static void bug_at(unsigned char *ip, int line)
 	BUG();
 }
 
-static void __ref __jump_label_transform(struct jump_entry *entry,
-					 enum jump_label_type type,
-					 int init)
+static void __jump_label_set_jump_code(struct jump_entry *entry,
+				       enum jump_label_type type,
+				       union jump_code_union *code,
+				       int init)
 {
-	union jump_code_union jmp;
 	const unsigned char default_nop[] = { STATIC_KEY_INIT_NOP };
 	const unsigned char *ideal_nop = ideal_nops[NOP_ATOMIC5];
-	const void *expect, *code;
+	const void *expect;
 	int line;
 
-	jmp.jump = 0xe9;
-	jmp.offset = jump_entry_target(entry) -
-		     (jump_entry_code(entry) + JUMP_LABEL_NOP_SIZE);
+	code->jump = 0xe9;
+	code->offset = jump_entry_target(entry) -
+		       (jump_entry_code(entry) + JUMP_LABEL_NOP_SIZE);
 
-	if (type == JUMP_LABEL_JMP) {
-		if (init) {
-			expect = default_nop; line = __LINE__;
-		} else {
-			expect = ideal_nop; line = __LINE__;
-		}
-
-		code = &jmp.code;
+	if (init) {
+		expect = default_nop; line = __LINE__;
+	} else if (type == JUMP_LABEL_JMP) {
+		expect = ideal_nop; line = __LINE__;
 	} else {
-		if (init) {
-			expect = default_nop; line = __LINE__;
-		} else {
-			expect = &jmp.code; line = __LINE__;
-		}
-
-		code = ideal_nop;
+		expect = code->code; line = __LINE__;
 	}
 
 	if (memcmp((void *)jump_entry_code(entry), expect, JUMP_LABEL_NOP_SIZE))
 		bug_at((void *)jump_entry_code(entry), line);
 
+	if (type == JUMP_LABEL_NOP)
+		memcpy(code, ideal_nop, JUMP_LABEL_NOP_SIZE);
+}
+
+static void __ref __jump_label_transform(struct jump_entry *entry,
+					 enum jump_label_type type,
+					 int init)
+{
+	union jump_code_union code;
+
+	__jump_label_set_jump_code(entry, type, &code, init);
+
 	/*
	 * As long as only a single processor is running and the code is still
	 * not marked as RO, text_poke_early() can be used; Checking that
@@ -82,12 +84,12 @@ static void __ref __jump_label_transform(struct jump_entry *entry,
	 * always nop being the 'currently valid' instruction
	 */
 	if (init || system_state == SYSTEM_BOOTING) {
-		text_poke_early((void *)jump_entry_code(entry), code,
+		text_poke_early((void *)jump_entry_code(entry), &code,
 				JUMP_LABEL_NOP_SIZE);
 		return;
 	}
 
-	text_poke_bp((void *)jump_entry_code(entry), code, JUMP_LABEL_NOP_SIZE,
+	text_poke_bp((void *)jump_entry_code(entry), &code, JUMP_LABEL_NOP_SIZE,
 		     (void *)jump_entry_code(entry) + JUMP_LABEL_NOP_SIZE);
 }
 
@@ -99,6 +101,75 @@ void arch_jump_label_transform(struct jump_entry *entry,
 	mutex_unlock(&text_mutex);
 }
 
+#define TP_VEC_MAX (PAGE_SIZE / sizeof(struct text_poke_loc))
+static struct text_poke_loc tp_vec[TP_VEC_MAX];
+static int tp_vec_nr;
+
+bool arch_jump_label_transform_queue(struct jump_entry *entry,
+				     enum jump_label_type type)
+{
+	struct text_poke_loc *tp;
+	void *entry_code;
+
+	if (system_state == SYSTEM_BOOTING) {
+		/*
+		 * Fallback to the non-batching mode.
+		 */
+		arch_jump_label_transform(entry, type);
+		return true;
+	}
+
+	/*
+	 * No more space in the vector, tell upper layer to apply
+	 * the queue before continuing.
+	 */
+	if (tp_vec_nr == TP_VEC_MAX)
+		return false;
+
+	tp = &tp_vec[tp_vec_nr];
+
+	entry_code = (void *)jump_entry_code(entry);
+
+	/*
+	 * The INT3 handler will do a bsearch in the queue, so we need entries
+	 * to be sorted. We can survive an unsorted list by rejecting the entry,
+	 * forcing the generic jump_label code to apply the queue. Warning once,
+	 * to raise the attention to the case of an unsorted entry that is
+	 * better not happen, because, in the worst case we will perform in the
+	 * same way as we do without batching - with some more overhead.
+	 */
+	if (tp_vec_nr > 0) {
+		int prev = tp_vec_nr - 1;
+		struct text_poke_loc *prev_tp = &tp_vec[prev];
+
+		if (WARN_ON_ONCE(prev_tp->addr > entry_code))
+			return false;
+	}
+
+	__jump_label_set_jump_code(entry, type,
+				   (union jump_code_union *) &tp->opcode, 0);
+
+	tp->addr = entry_code;
+	tp->detour = entry_code + JUMP_LABEL_NOP_SIZE;
+	tp->len = JUMP_LABEL_NOP_SIZE;
+
+	tp_vec_nr++;
+
+	return true;
+}
+
+void arch_jump_label_transform_apply(void)
+{
+	if (!tp_vec_nr)
+		return;
+
+	mutex_lock(&text_mutex);
+	text_poke_bp_batch(tp_vec, tp_vec_nr);
+	mutex_unlock(&text_mutex);
+
+	tp_vec_nr = 0;
+}
+
 static enum {
 	JL_STATE_START,
 	JL_STATE_NO_UPDATE,
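The arch_jump_label_transform_queue()/arch_jump_label_transform_apply() pair above is driven by the generic jump-label code, which the comments refer to ("tell upper layer to apply the queue before continuing", "forcing the generic jump_label code to apply the queue"). A rough sketch of that caller - an illustration of the calling convention only, not the exact kernel/jump_label.c code; jump_label_can_update() and jump_label_type() are assumed helper names - would be:

  /* Sketch of the batching caller implied by the arch API above. */
  static void __jump_label_update(struct static_key *key,
                                  struct jump_entry *entry,
                                  struct jump_entry *stop, bool init)
  {
          for (; entry < stop && jump_entry_key(entry) == key; entry++) {
                  if (!jump_label_can_update(entry, init))
                          continue;

                  /* Queue the update; on failure (full or unsorted vector), flush and retry. */
                  if (!arch_jump_label_transform_queue(entry, jump_label_type(entry))) {
                          arch_jump_label_transform_apply();
                          arch_jump_label_transform_queue(entry, jump_label_type(entry));
                  }
          }

          /* Apply whatever is still queued as a single text_poke_bp_batch() invocation. */
          arch_jump_label_transform_apply();
  }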