Merge tag 'virtio-next-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/rusty/linux

Pull virtio updates from Rusty Russell:
 "OK, this has the big virtio 1.0 implementation, as specified by OASIS.

  On top of that is the major rework of lguest, to use PCI and virtio
  1.0, to double-check the implementation.

  Then come the inevitable fixes and cleanups from that work"

* tag 'virtio-next-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/rusty/linux: (80 commits)
  virtio: don't set VIRTIO_CONFIG_S_DRIVER_OK twice.
  virtio_net: unconditionally define struct virtio_net_hdr_v1.
  tools/lguest: don't use legacy definitions for net device in example launcher.
  virtio: Don't expose legacy net features when VIRTIO_NET_NO_LEGACY defined.
  tools/lguest: use common error macros in the example launcher.
  tools/lguest: give virtqueues names for better error messages
  tools/lguest: more documentation and checking of virtio 1.0 compliance.
  lguest: don't look in console features to find emerg_wr.
  tools/lguest: don't start devices until DRIVER_OK status set.
  tools/lguest: handle indirect partway through chain.
  tools/lguest: insert driver references from the 1.0 spec (4.1 Virtio Over PCI)
  tools/lguest: insert device references from the 1.0 spec (4.1 Virtio Over PCI)
  tools/lguest: rename virtio_pci_cfg_cap field to match spec.
  tools/lguest: fix features_accepted logic in example launcher.
  tools/lguest: handle device reset correctly in example launcher.
  virtual: Documentation: simplify and generalize paravirt_ops.txt
  lguest: remove NOTIFY call and eventfd facility.
  lguest: remove NOTIFY facility from demonstration launcher.
  lguest: use the PCI console device's emerg_wr for early boot messages.
  lguest: always put console in PCI slot #1.
  ...
This commit is contained in:
Linus Torvalds
2015-02-18 09:24:01 -08:00
43 changed files with 3397 additions and 1635 deletions

View File

@@ -182,6 +182,52 @@ static void run_guest_once(struct lg_cpu *cpu, struct lguest_pages *pages)
}
/*:*/
/*
 * Translate a byte offset inside struct pt_regs into a pointer to the
 * corresponding saved Guest register in cpu->regs.
 *
 * The general-purpose registers, eip and esp are always handed out. The
 * segment registers and eflags are only handed out when @any is true: the
 * Launcher may read those, but is not allowed to set them.
 *
 * Returns NULL if @reg_off does not name a register the caller may access.
 */
unsigned long *lguest_arch_regptr(struct lg_cpu *cpu, size_t reg_off, bool any)
{
	/* Registers that are available to every caller. */
	switch (reg_off) {
	case offsetof(struct pt_regs, ax):
		return &cpu->regs->eax;
	case offsetof(struct pt_regs, bx):
		return &cpu->regs->ebx;
	case offsetof(struct pt_regs, cx):
		return &cpu->regs->ecx;
	case offsetof(struct pt_regs, dx):
		return &cpu->regs->edx;
	case offsetof(struct pt_regs, si):
		return &cpu->regs->esi;
	case offsetof(struct pt_regs, di):
		return &cpu->regs->edi;
	case offsetof(struct pt_regs, bp):
		return &cpu->regs->ebp;
	case offsetof(struct pt_regs, sp):
		return &cpu->regs->esp;
	case offsetof(struct pt_regs, ip):
		return &cpu->regs->eip;
	default:
		break;
	}

	/* Everything below is off limits unless the caller asked for "any". */
	if (!any)
		return NULL;

	switch (reg_off) {
	case offsetof(struct pt_regs, cs):
		return &cpu->regs->cs;
	case offsetof(struct pt_regs, ds):
		return &cpu->regs->ds;
	case offsetof(struct pt_regs, es):
		return &cpu->regs->es;
	case offsetof(struct pt_regs, fs):
		return &cpu->regs->fs;
	case offsetof(struct pt_regs, gs):
		return &cpu->regs->gs;
	case offsetof(struct pt_regs, ss):
		return &cpu->regs->ss;
	case offsetof(struct pt_regs, flags):
		return &cpu->regs->eflags;
	default:
		return NULL;
	}
}
/*M:002
* There are hooks in the scheduler which we can register to tell when we
* get kicked off the CPU (preempt_notifier_register()). This would allow us
@@ -269,110 +315,73 @@ void lguest_arch_run_guest(struct lg_cpu *cpu)
* usually attached to a PC.
*
* When the Guest uses one of these instructions, we get a trap (General
* Protection Fault) and come here. We see if it's one of those troublesome
* instructions and skip over it. We return true if we did.
* Protection Fault) and come here. We queue this to be sent out to the
* Launcher to handle.
*/
static int emulate_insn(struct lg_cpu *cpu)
/*
* The eip contains the *virtual* address of the Guest's instruction:
* we copy the instruction here so the Launcher doesn't have to walk
* the page tables to decode it. We handle the case (eg. in a kernel
* module) where the instruction is over two pages, and the pages are
* virtually but not physically contiguous.
*
* The longest possible x86 instruction is 15 bytes, but we don't handle
* anything that strange.
*/
static void copy_from_guest(struct lg_cpu *cpu,
void *dst, unsigned long vaddr, size_t len)
{
u8 insn;
unsigned int insnlen = 0, in = 0, small_operand = 0;
/*
* The eip contains the *virtual* address of the Guest's instruction:
* walk the Guest's page tables to find the "physical" address.
*/
unsigned long physaddr = guest_pa(cpu, cpu->regs->eip);
size_t to_page_end = PAGE_SIZE - (vaddr % PAGE_SIZE);
unsigned long paddr;
/*
* This must be the Guest kernel trying to do something, not userspace!
* The bottom two bits of the CS segment register are the privilege
* level.
*/
if ((cpu->regs->cs & 3) != GUEST_PL)
return 0;
BUG_ON(len > PAGE_SIZE);
/* Decoding x86 instructions is icky. */
insn = lgread(cpu, physaddr, u8);
/*
* Around 2.6.33, the kernel started using an emulation for the
* cmpxchg8b instruction in early boot on many configurations. This
* code isn't paravirtualized, and it tries to disable interrupts.
* Ignore it, which will Mostly Work.
*/
if (insn == 0xfa) {
/* "cli", or Clear Interrupt Enable instruction. Skip it. */
cpu->regs->eip++;
return 1;
/* If it goes over a page, copy in two parts. */
if (len > to_page_end) {
/* But make sure the next page is mapped! */
if (__guest_pa(cpu, vaddr + to_page_end, &paddr))
copy_from_guest(cpu, dst + to_page_end,
vaddr + to_page_end,
len - to_page_end);
else
/* Otherwise fill with zeroes. */
memset(dst + to_page_end, 0, len - to_page_end);
len = to_page_end;
}
/*
* 0x66 is an "operand prefix". It means a 16, not 32 bit in/out.
*/
if (insn == 0x66) {
small_operand = 1;
/* The instruction is 1 byte so far, read the next byte. */
insnlen = 1;
insn = lgread(cpu, physaddr + insnlen, u8);
}
/* This will kill the guest if it isn't mapped, but that
* shouldn't happen. */
__lgread(cpu, dst, guest_pa(cpu, vaddr), len);
}
/*
* We can ignore the lower bit for the moment and decode the 4 opcodes
* we need to emulate.
*/
switch (insn & 0xFE) {
case 0xE4: /* in <next byte>,%al */
insnlen += 2;
in = 1;
break;
case 0xEC: /* in (%dx),%al */
insnlen += 1;
in = 1;
break;
case 0xE6: /* out %al,<next byte> */
insnlen += 2;
break;
case 0xEE: /* out %al,(%dx) */
insnlen += 1;
break;
default:
/* OK, we don't know what this is, can't emulate. */
return 0;
}
/*
* If it was an "IN" instruction, they expect the result to be read
* into %eax, so we change %eax. We always return all-ones, which
* traditionally means "there's nothing there".
*/
if (in) {
/* The lower bit set means it's a 32/16 bit access */
if (insn & 0x1) {
if (small_operand)
cpu->regs->eax |= 0xFFFF;
else
cpu->regs->eax = 0xFFFFFFFF;
} else
cpu->regs->eax |= 0xFF;
}
/* Finally, we've "done" the instruction, so move past it. */
cpu->regs->eip += insnlen;
/* Success! */
return 1;
/*
 * Record a trap 13 (General Protection Fault) in cpu->pending, together
 * with a copy of the instruction bytes found at the Guest's eip.
 */
static void setup_emulate_insn(struct lg_cpu *cpu)
{
	/* eip holds the Guest-virtual address of the instruction. */
	const unsigned long guest_eip = cpu->regs->eip;

	cpu->pending.trap = 13;
	copy_from_guest(cpu, cpu->pending.insn, guest_eip,
			sizeof(cpu->pending.insn));
}
/*
 * Record a trap 14 (Page Fault) in cpu->pending, together with the IO
 * address passed in as @iomem_addr and a copy of the instruction bytes
 * found at the Guest's eip.
 */
static void setup_iomem_insn(struct lg_cpu *cpu, unsigned long iomem_addr)
{
	/* eip holds the Guest-virtual address of the instruction. */
	const unsigned long guest_eip = cpu->regs->eip;

	cpu->pending.trap = 14;
	cpu->pending.addr = iomem_addr;
	copy_from_guest(cpu, cpu->pending.insn, guest_eip,
			sizeof(cpu->pending.insn));
}
/*H:050 Once we've re-enabled interrupts, we look at why the Guest exited. */
void lguest_arch_handle_trap(struct lg_cpu *cpu)
{
unsigned long iomem_addr;
switch (cpu->regs->trapnum) {
case 13: /* We've intercepted a General Protection Fault. */
/*
* Check if this was one of those annoying IN or OUT
* instructions which we need to emulate. If so, we just go
* back into the Guest after we've done it.
*/
/* Hand to Launcher to emulate those pesky IN and OUT insns */
if (cpu->regs->errcode == 0) {
if (emulate_insn(cpu))
return;
setup_emulate_insn(cpu);
return;
}
break;
case 14: /* We've intercepted a Page Fault. */
@@ -387,9 +396,16 @@ void lguest_arch_handle_trap(struct lg_cpu *cpu)
* whether kernel or userspace code.
*/
if (demand_page(cpu, cpu->arch.last_pagefault,
cpu->regs->errcode))
cpu->regs->errcode, &iomem_addr))
return;
/* Was this an access to memory mapped IO? */
if (iomem_addr) {
/* Tell Launcher, let it handle it. */
setup_iomem_insn(cpu, iomem_addr);
return;
}
/*
* OK, it's really not there (or not OK): the Guest needs to
* know. We write out the cr2 value so it knows where the