Merge branch 'locking-core-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip

Pull locking updates from Ingo Molnar:
 "The main changes in this cycle are:

   - rwsem scalability improvements, phase #2, by Waiman Long, which are
     rather impressive:

       "On a 2-socket 40-core 80-thread Skylake system with 40 reader
        and writer locking threads, the min/mean/max locking operations
        done in a 5-second testing window before the patchset were:

         40 readers, Iterations Min/Mean/Max = 1,807/1,808/1,810
         40 writers, Iterations Min/Mean/Max = 1,807/50,344/151,255

        After the patchset, they became:

         40 readers, Iterations Min/Mean/Max = 30,057/31,359/32,741
         40 writers, Iterations Min/Mean/Max = 94,466/95,845/97,098"

     There are a lot of changes to the locking implementation that make
     it similar to qrwlock, including owner handoff for more fair
     locking.

     Another microbenchmark shows how the improvements hold across the
     spectrum:

       "With a locking microbenchmark running on 5.1 based kernel, the
        total locking rates (in kops/s) on a 2-socket Skylake system
        with equal numbers of readers and writers (mixed) before and
        after this patchset were:

        # of Threads   Before Patch      After Patch
        ------------   ------------      -----------
             2            2,618             4,193
             4            1,202             3,726
             8              802             3,622
            16              729             3,359
            32              319             2,826
            64              102             2,744"

     The changes are extensive and the patch-set has been through
     several iterations addressing various locking workloads. There
     might be more regressions, but unless they are pathological I
     believe we want to use this new implementation as the baseline
     going forward.

   - jump-label optimizations by Daniel Bristot de Oliveira: the primary
     motivation was to remove IPI disturbance of isolated RT-workload
     CPUs, which resulted in the implementation of batched jump-label
     updates. Beyond improving the real-time characteristics of the
     kernel, in one test this patchset improved static key update
     overhead from 57 msecs to just 1.4 msecs - which is a nice speedup
     as well.

   - atomic64_t cross-arch type cleanups by Mark Rutland: over the last
     ~10 years of atomic64_t existence the various types used by the
     APIs only had to be self-consistent within each architecture -
     which means they became wildly inconsistent across architectures.
     Mark puts an end to this by reworking all the atomic64
     implementations to use 's64' as the base type for atomic64_t, and
     to ensure that this type is consistently used for parameters and
     return values in the API, avoiding further problems in this area.

   - A large set of small improvements to lockdep by Yuyang Du: type
     cleanups, output cleanups, function return type and other cleanups
     all around the place.

   - A set of percpu ops cleanups and fixes by Peter Zijlstra.

   - Misc other changes - please see the Git log for more details"

* 'locking-core-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip: (82 commits)
  locking/lockdep: increase size of counters for lockdep statistics
  locking/atomics: Use sed(1) instead of non-standard head(1) option
  locking/lockdep: Move mark_lock() inside CONFIG_TRACE_IRQFLAGS && CONFIG_PROVE_LOCKING
  x86/jump_label: Make tp_vec_nr static
  x86/percpu: Optimize raw_cpu_xchg()
  x86/percpu, sched/fair: Avoid local_clock()
  x86/percpu, x86/irq: Relax {set,get}_irq_regs()
  x86/percpu: Relax smp_processor_id()
  x86/percpu: Differentiate this_cpu_{}() and __this_cpu_{}()
  locking/rwsem: Guard against making count negative
  locking/rwsem: Adaptive disabling of reader optimistic spinning
  locking/rwsem: Enable time-based spinning on reader-owned rwsem
  locking/rwsem: Make rwsem->owner an atomic_long_t
  locking/rwsem: Enable readers spinning on writer
  locking/rwsem: Clarify usage of owner's nonspinaable bit
  locking/rwsem: Wake up almost all readers in wait queue
  locking/rwsem: More optimal RT task handling of null owner
  locking/rwsem: Always release wait_lock before waking up tasks
  locking/rwsem: Implement lock handoff to prevent lock starvation
  locking/rwsem: Make rwsem_spin_on_owner() return owner state
  ...
Cette révision appartient à :
Linus Torvalds
2019-07-08 16:12:03 -07:00
révision e192832869
55 fichiers modifiés avec 2785 ajouts et 2017 suppressions

Voir le fichier

@@ -14,6 +14,7 @@
#include <linux/kdebug.h>
#include <linux/kprobes.h>
#include <linux/mmu_context.h>
#include <linux/bsearch.h>
#include <asm/text-patching.h>
#include <asm/alternative.h>
#include <asm/sections.h>
@@ -848,81 +849,133 @@ static void do_sync_core(void *info)
sync_core();
}
static bool bp_patching_in_progress;
static void *bp_int3_handler, *bp_int3_addr;
static struct bp_patching_desc {
struct text_poke_loc *vec;
int nr_entries;
} bp_patching;
static int patch_cmp(const void *key, const void *elt)
{
struct text_poke_loc *tp = (struct text_poke_loc *) elt;
if (key < tp->addr)
return -1;
if (key > tp->addr)
return 1;
return 0;
}
NOKPROBE_SYMBOL(patch_cmp);
int poke_int3_handler(struct pt_regs *regs)
{
struct text_poke_loc *tp;
unsigned char int3 = 0xcc;
void *ip;
/*
* Having observed our INT3 instruction, we now must observe
* bp_patching_in_progress.
* bp_patching.nr_entries.
*
* in_progress = TRUE INT3
* nr_entries != 0 INT3
* WMB RMB
* write INT3 if (in_progress)
* write INT3 if (nr_entries)
*
* Idem for bp_int3_handler.
* Idem for other elements in bp_patching.
*/
smp_rmb();
if (likely(!bp_patching_in_progress))
if (likely(!bp_patching.nr_entries))
return 0;
if (user_mode(regs) || regs->ip != (unsigned long)bp_int3_addr)
if (user_mode(regs))
return 0;
/* set up the specified breakpoint handler */
regs->ip = (unsigned long) bp_int3_handler;
/*
* Discount the sizeof(int3). See text_poke_bp_batch().
*/
ip = (void *) regs->ip - sizeof(int3);
/*
* Skip the binary search if there is a single member in the vector.
*/
if (unlikely(bp_patching.nr_entries > 1)) {
tp = bsearch(ip, bp_patching.vec, bp_patching.nr_entries,
sizeof(struct text_poke_loc),
patch_cmp);
if (!tp)
return 0;
} else {
tp = bp_patching.vec;
if (tp->addr != ip)
return 0;
}
/* set up the specified breakpoint detour */
regs->ip = (unsigned long) tp->detour;
return 1;
}
NOKPROBE_SYMBOL(poke_int3_handler);
/**
* text_poke_bp() -- update instructions on live kernel on SMP
* @addr: address to patch
* @opcode: opcode of new instruction
* @len: length to copy
* @handler: address to jump to when the temporary breakpoint is hit
* text_poke_bp_batch() -- update instructions on live kernel on SMP
* @tp: vector of instructions to patch
* @nr_entries: number of entries in the vector
*
* Modify multi-byte instruction by using int3 breakpoint on SMP.
* We completely avoid stop_machine() here, and achieve the
* synchronization using int3 breakpoint.
*
* The way it is done:
* - add a int3 trap to the address that will be patched
* - For each entry in the vector:
* - add a int3 trap to the address that will be patched
* - sync cores
* - update all but the first byte of the patched range
* - For each entry in the vector:
* - update all but the first byte of the patched range
* - sync cores
* - replace the first byte (int3) by the first byte of
* replacing opcode
* - For each entry in the vector:
* - replace the first byte (int3) by the first byte of
* replacing opcode
* - sync cores
*/
void text_poke_bp(void *addr, const void *opcode, size_t len, void *handler)
void text_poke_bp_batch(struct text_poke_loc *tp, unsigned int nr_entries)
{
int patched_all_but_first = 0;
unsigned char int3 = 0xcc;
bp_int3_handler = handler;
bp_int3_addr = (u8 *)addr + sizeof(int3);
bp_patching_in_progress = true;
unsigned int i;
lockdep_assert_held(&text_mutex);
bp_patching.vec = tp;
bp_patching.nr_entries = nr_entries;
/*
* Corresponding read barrier in int3 notifier for making sure the
* in_progress and handler are correctly ordered wrt. patching.
* nr_entries and handler are correctly ordered wrt. patching.
*/
smp_wmb();
text_poke(addr, &int3, sizeof(int3));
/*
* First step: add a int3 trap to the address that will be patched.
*/
for (i = 0; i < nr_entries; i++)
text_poke(tp[i].addr, &int3, sizeof(int3));
on_each_cpu(do_sync_core, NULL, 1);
if (len - sizeof(int3) > 0) {
/* patch all but the first byte */
text_poke((char *)addr + sizeof(int3),
(const char *) opcode + sizeof(int3),
len - sizeof(int3));
/*
* Second step: update all but the first byte of the patched range.
*/
for (i = 0; i < nr_entries; i++) {
if (tp[i].len - sizeof(int3) > 0) {
text_poke((char *)tp[i].addr + sizeof(int3),
(const char *)tp[i].opcode + sizeof(int3),
tp[i].len - sizeof(int3));
patched_all_but_first++;
}
}
if (patched_all_but_first) {
/*
* According to Intel, this core syncing is very likely
* not necessary and we'd be safe even without it. But
@@ -931,14 +984,47 @@ void text_poke_bp(void *addr, const void *opcode, size_t len, void *handler)
on_each_cpu(do_sync_core, NULL, 1);
}
/* patch the first byte */
text_poke(addr, opcode, sizeof(int3));
/*
* Third step: replace the first byte (int3) by the first byte of
* replacing opcode.
*/
for (i = 0; i < nr_entries; i++)
text_poke(tp[i].addr, tp[i].opcode, sizeof(int3));
on_each_cpu(do_sync_core, NULL, 1);
/*
* sync_core() implies an smp_mb() and orders this store against
* the writing of the new instruction.
*/
bp_patching_in_progress = false;
bp_patching.vec = NULL;
bp_patching.nr_entries = 0;
}
/**
* text_poke_bp() -- update instructions on live kernel on SMP
* @addr: address to patch
* @opcode: opcode of new instruction
* @len: length to copy
* @handler: address to jump to when the temporary breakpoint is hit
*
* Update a single instruction with the vector in the stack, avoiding
* dynamically allocated memory. This function should be used when it is
* not possible to allocate memory.
*/
void text_poke_bp(void *addr, const void *opcode, size_t len, void *handler)
{
struct text_poke_loc tp = {
.detour = handler,
.addr = addr,
.len = len,
};
if (len > POKE_MAX_OPCODE_SIZE) {
WARN_ONCE(1, "len is larger than %d\n", POKE_MAX_OPCODE_SIZE);
return;
}
memcpy((void *)tp.opcode, opcode, len);
text_poke_bp_batch(&tp, 1);
}

Voir le fichier

@@ -35,41 +35,43 @@ static void bug_at(unsigned char *ip, int line)
BUG();
}
static void __ref __jump_label_transform(struct jump_entry *entry,
enum jump_label_type type,
int init)
static void __jump_label_set_jump_code(struct jump_entry *entry,
enum jump_label_type type,
union jump_code_union *code,
int init)
{
union jump_code_union jmp;
const unsigned char default_nop[] = { STATIC_KEY_INIT_NOP };
const unsigned char *ideal_nop = ideal_nops[NOP_ATOMIC5];
const void *expect, *code;
const void *expect;
int line;
jmp.jump = 0xe9;
jmp.offset = jump_entry_target(entry) -
(jump_entry_code(entry) + JUMP_LABEL_NOP_SIZE);
code->jump = 0xe9;
code->offset = jump_entry_target(entry) -
(jump_entry_code(entry) + JUMP_LABEL_NOP_SIZE);
if (type == JUMP_LABEL_JMP) {
if (init) {
expect = default_nop; line = __LINE__;
} else {
expect = ideal_nop; line = __LINE__;
}
code = &jmp.code;
if (init) {
expect = default_nop; line = __LINE__;
} else if (type == JUMP_LABEL_JMP) {
expect = ideal_nop; line = __LINE__;
} else {
if (init) {
expect = default_nop; line = __LINE__;
} else {
expect = &jmp.code; line = __LINE__;
}
code = ideal_nop;
expect = code->code; line = __LINE__;
}
if (memcmp((void *)jump_entry_code(entry), expect, JUMP_LABEL_NOP_SIZE))
bug_at((void *)jump_entry_code(entry), line);
if (type == JUMP_LABEL_NOP)
memcpy(code, ideal_nop, JUMP_LABEL_NOP_SIZE);
}
static void __ref __jump_label_transform(struct jump_entry *entry,
enum jump_label_type type,
int init)
{
union jump_code_union code;
__jump_label_set_jump_code(entry, type, &code, init);
/*
* As long as only a single processor is running and the code is still
* not marked as RO, text_poke_early() can be used; Checking that
@@ -82,12 +84,12 @@ static void __ref __jump_label_transform(struct jump_entry *entry,
* always nop being the 'currently valid' instruction
*/
if (init || system_state == SYSTEM_BOOTING) {
text_poke_early((void *)jump_entry_code(entry), code,
text_poke_early((void *)jump_entry_code(entry), &code,
JUMP_LABEL_NOP_SIZE);
return;
}
text_poke_bp((void *)jump_entry_code(entry), code, JUMP_LABEL_NOP_SIZE,
text_poke_bp((void *)jump_entry_code(entry), &code, JUMP_LABEL_NOP_SIZE,
(void *)jump_entry_code(entry) + JUMP_LABEL_NOP_SIZE);
}
@@ -99,6 +101,75 @@ void arch_jump_label_transform(struct jump_entry *entry,
mutex_unlock(&text_mutex);
}
/* Queue capacity: as many text_poke_loc entries as fit in one page. */
#define TP_VEC_MAX (PAGE_SIZE / sizeof(struct text_poke_loc))
/*
 * Pending patch sites, appended by arch_jump_label_transform_queue() and
 * flushed by arch_jump_label_transform_apply().
 */
static struct text_poke_loc tp_vec[TP_VEC_MAX];
/* Number of entries currently queued in tp_vec. */
static int tp_vec_nr;
/*
 * Queue the transformation of @entry to @type for a later batched update.
 *
 * Returns true when the entry was handled (either queued, or patched right
 * away while the system is still booting). Returns false when the entry
 * could not be queued - the vector is full or the entry would break the
 * address ordering - so the caller has to apply the queue and retry.
 */
bool arch_jump_label_transform_queue(struct jump_entry *entry,
				     enum jump_label_type type)
{
	struct text_poke_loc *slot;
	void *code_addr;

	/* While booting, fall back to the non-batching mode. */
	if (system_state == SYSTEM_BOOTING) {
		arch_jump_label_transform(entry, type);
		return true;
	}

	/*
	 * The vector is full: tell the upper layer to apply the queue
	 * before continuing.
	 */
	if (tp_vec_nr == TP_VEC_MAX)
		return false;

	code_addr = (void *)jump_entry_code(entry);

	/*
	 * The INT3 handler does a bsearch over the queue, so entries must be
	 * sorted by address. An out-of-order entry is survivable: reject it,
	 * which forces the generic jump_label code to apply the queue first.
	 * Warn once, because this should not happen - in the worst case we
	 * degrade to non-batched behaviour with some extra overhead.
	 */
	if (tp_vec_nr > 0 &&
	    WARN_ON_ONCE(tp_vec[tp_vec_nr - 1].addr > code_addr))
		return false;

	slot = &tp_vec[tp_vec_nr];

	__jump_label_set_jump_code(entry, type,
				   (union jump_code_union *) &slot->opcode, 0);

	slot->addr = code_addr;
	slot->detour = code_addr + JUMP_LABEL_NOP_SIZE;
	slot->len = JUMP_LABEL_NOP_SIZE;

	tp_vec_nr++;

	return true;
}
/*
 * Apply every queued jump-label update in a single batch and empty the
 * queue. Does nothing when no entries are pending.
 */
void arch_jump_label_transform_apply(void)
{
	if (tp_vec_nr) {
		/* text_mutex is held across the batched patching. */
		mutex_lock(&text_mutex);
		text_poke_bp_batch(tp_vec, tp_vec_nr);
		mutex_unlock(&text_mutex);

		tp_vec_nr = 0;
	}
}
static enum {
JL_STATE_START,
JL_STATE_NO_UPDATE,