Merge branch 'x86-pat-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/linux-2.6-tip

* 'x86-pat-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/linux-2.6-tip:
  x86, pat: Fix cacheflush address in change_page_attr_set_clr()
  mm: remove !NUMA condition from PAGEFLAGS_EXTENDED condition set
  x86: Fix earlyprintk=dbgp for machines without NX
  x86, pat: Sanity check remap_pfn_range for RAM region
  x86, pat: Lookup the protection from memtype list on vm_insert_pfn()
  x86, pat: Add lookup_memtype to get the current memtype of a paddr
  x86, pat: Use page flags to track memtypes of RAM pages
  x86, pat: Generalize the use of page flag PG_uncached
  x86, pat: Add rbtree to do quick lookup in memtype tracking
  x86, pat: Add PAT reserve free to io_mapping* APIs
  x86, pat: New i/f for driver to request memtype for IO regions
  x86, pat: ioremap to follow same PAT restrictions as other PAT users
  x86, pat: Keep identity maps consistent with mmaps even when pat_disabled
  x86, mtrr: make mtrr_aps_delayed_init static bool
  x86, pat/mtrr: Rendezvous all the cpus for MTRR/PAT init
  generic-ipi: Allow cpus not yet online to call smp_call_function with irqs disabled
  x86: Fix an incorrect argument of reserve_bootmem()
  x86: Fix system crash when loading with "reservetop" parameter
This commit is contained in:
Linus Torvalds
2009-09-15 09:19:38 -07:00
19 changed files with 510 additions and 143 deletions

View File

@@ -15,6 +15,7 @@
#include <linux/gfp.h>
#include <linux/mm.h>
#include <linux/fs.h>
#include <linux/rbtree.h>
#include <asm/cacheflush.h>
#include <asm/processor.h>
@@ -148,11 +149,10 @@ static char *cattr_name(unsigned long flags)
* areas). All the aliases have the same cache attributes of course.
* Zero attributes are represented as holes.
*
* Currently the data structure is a list because the number of mappings
* are expected to be relatively small. If this should be a problem
* it could be changed to a rbtree or similar.
* The data structure is a list that is also organized as an rbtree
* sorted on the start address of memtype range.
*
* memtype_lock protects the whole list.
* memtype_lock protects both the linear list and rbtree.
*/
struct memtype {
@@ -160,11 +160,53 @@ struct memtype {
u64 end;
unsigned long type;
struct list_head nd;
struct rb_node rb;
};
static struct rb_root memtype_rbroot = RB_ROOT;
static LIST_HEAD(memtype_list);
static DEFINE_SPINLOCK(memtype_lock); /* protects memtype list */
static struct memtype *memtype_rb_search(struct rb_root *root, u64 start)
{
struct rb_node *node = root->rb_node;
struct memtype *last_lower = NULL;
while (node) {
struct memtype *data = container_of(node, struct memtype, rb);
if (data->start < start) {
last_lower = data;
node = node->rb_right;
} else if (data->start > start) {
node = node->rb_left;
} else
return data;
}
/* Will return NULL if there is no entry with its start <= start */
return last_lower;
}
static void memtype_rb_insert(struct rb_root *root, struct memtype *data)
{
struct rb_node **new = &(root->rb_node);
struct rb_node *parent = NULL;
while (*new) {
struct memtype *this = container_of(*new, struct memtype, rb);
parent = *new;
if (data->start <= this->start)
new = &((*new)->rb_left);
else if (data->start > this->start)
new = &((*new)->rb_right);
}
rb_link_node(&data->rb, parent, new);
rb_insert_color(&data->rb, root);
}
/*
* Does intersection of PAT memory type and MTRR memory type and returns
* the resulting memory type as PAT understands it.
@@ -218,9 +260,6 @@ chk_conflict(struct memtype *new, struct memtype *entry, unsigned long *type)
return -EBUSY;
}
static struct memtype *cached_entry;
static u64 cached_start;
static int pat_pagerange_is_ram(unsigned long start, unsigned long end)
{
int ram_page = 0, not_rampage = 0;
@@ -249,63 +288,61 @@ static int pat_pagerange_is_ram(unsigned long start, unsigned long end)
}
/*
* For RAM pages, mark the pages as non WB memory type using
* PageNonWB (PG_arch_1). We allow only one set_memory_uc() or
* set_memory_wc() on a RAM page at a time before marking it as WB again.
* This is ok, because only one driver will be owning the page and
* doing set_memory_*() calls.
* For RAM pages, we use page flags to mark the pages with appropriate type.
* Here we do two pass:
* - Find the memtype of all the pages in the range, look for any conflicts
* - In case of no conflicts, set the new memtype for pages in the range
*
* For now, we use PageNonWB to track that the RAM page is being mapped
* as non WB. In future, we will have to use one more flag
* (or some other mechanism in page_struct) to distinguish between
* UC and WC mapping.
* Caller must hold memtype_lock for atomicity.
*/
static int reserve_ram_pages_type(u64 start, u64 end, unsigned long req_type,
unsigned long *new_type)
{
struct page *page;
u64 pfn, end_pfn;
u64 pfn;
if (req_type == _PAGE_CACHE_UC) {
/* We do not support strong UC */
WARN_ON_ONCE(1);
req_type = _PAGE_CACHE_UC_MINUS;
}
for (pfn = (start >> PAGE_SHIFT); pfn < (end >> PAGE_SHIFT); ++pfn) {
unsigned long type;
page = pfn_to_page(pfn);
type = get_page_memtype(page);
if (type != -1) {
printk(KERN_INFO "reserve_ram_pages_type failed "
"0x%Lx-0x%Lx, track 0x%lx, req 0x%lx\n",
start, end, type, req_type);
if (new_type)
*new_type = type;
return -EBUSY;
}
}
if (new_type)
*new_type = req_type;
for (pfn = (start >> PAGE_SHIFT); pfn < (end >> PAGE_SHIFT); ++pfn) {
page = pfn_to_page(pfn);
if (page_mapped(page) || PageNonWB(page))
goto out;
SetPageNonWB(page);
set_page_memtype(page, req_type);
}
return 0;
out:
end_pfn = pfn;
for (pfn = (start >> PAGE_SHIFT); pfn < end_pfn; ++pfn) {
page = pfn_to_page(pfn);
ClearPageNonWB(page);
}
return -EINVAL;
}
static int free_ram_pages_type(u64 start, u64 end)
{
struct page *page;
u64 pfn, end_pfn;
u64 pfn;
for (pfn = (start >> PAGE_SHIFT); pfn < (end >> PAGE_SHIFT); ++pfn) {
page = pfn_to_page(pfn);
if (page_mapped(page) || !PageNonWB(page))
goto out;
ClearPageNonWB(page);
set_page_memtype(page, -1);
}
return 0;
out:
end_pfn = pfn;
for (pfn = (start >> PAGE_SHIFT); pfn < end_pfn; ++pfn) {
page = pfn_to_page(pfn);
SetPageNonWB(page);
}
return -EINVAL;
}
/*
@@ -339,6 +376,8 @@ int reserve_memtype(u64 start, u64 end, unsigned long req_type,
if (new_type) {
if (req_type == -1)
*new_type = _PAGE_CACHE_WB;
else if (req_type == _PAGE_CACHE_WC)
*new_type = _PAGE_CACHE_UC_MINUS;
else
*new_type = req_type & _PAGE_CACHE_MASK;
}
@@ -364,11 +403,16 @@ int reserve_memtype(u64 start, u64 end, unsigned long req_type,
*new_type = actual_type;
is_range_ram = pat_pagerange_is_ram(start, end);
if (is_range_ram == 1)
return reserve_ram_pages_type(start, end, req_type,
new_type);
else if (is_range_ram < 0)
if (is_range_ram == 1) {
spin_lock(&memtype_lock);
err = reserve_ram_pages_type(start, end, req_type, new_type);
spin_unlock(&memtype_lock);
return err;
} else if (is_range_ram < 0) {
return -EINVAL;
}
new = kmalloc(sizeof(struct memtype), GFP_KERNEL);
if (!new)
@@ -380,17 +424,19 @@ int reserve_memtype(u64 start, u64 end, unsigned long req_type,
spin_lock(&memtype_lock);
if (cached_entry && start >= cached_start)
entry = cached_entry;
else
entry = memtype_rb_search(&memtype_rbroot, new->start);
if (likely(entry != NULL)) {
/* To work correctly with list_for_each_entry_continue */
entry = list_entry(entry->nd.prev, struct memtype, nd);
} else {
entry = list_entry(&memtype_list, struct memtype, nd);
}
/* Search for existing mapping that overlaps the current range */
where = NULL;
list_for_each_entry_continue(entry, &memtype_list, nd) {
if (end <= entry->start) {
where = entry->nd.prev;
cached_entry = list_entry(where, struct memtype, nd);
break;
} else if (start <= entry->start) { /* end > entry->start */
err = chk_conflict(new, entry, new_type);
@@ -398,8 +444,6 @@ int reserve_memtype(u64 start, u64 end, unsigned long req_type,
dprintk("Overlap at 0x%Lx-0x%Lx\n",
entry->start, entry->end);
where = entry->nd.prev;
cached_entry = list_entry(where,
struct memtype, nd);
}
break;
} else if (start < entry->end) { /* start > entry->start */
@@ -407,8 +451,6 @@ int reserve_memtype(u64 start, u64 end, unsigned long req_type,
if (!err) {
dprintk("Overlap at 0x%Lx-0x%Lx\n",
entry->start, entry->end);
cached_entry = list_entry(entry->nd.prev,
struct memtype, nd);
/*
* Move to right position in the linked
@@ -436,13 +478,13 @@ int reserve_memtype(u64 start, u64 end, unsigned long req_type,
return err;
}
cached_start = start;
if (where)
list_add(&new->nd, where);
else
list_add_tail(&new->nd, &memtype_list);
memtype_rb_insert(&memtype_rbroot, new);
spin_unlock(&memtype_lock);
dprintk("reserve_memtype added 0x%Lx-0x%Lx, track %s, req %s, ret %s\n",
@@ -454,7 +496,7 @@ int reserve_memtype(u64 start, u64 end, unsigned long req_type,
int free_memtype(u64 start, u64 end)
{
struct memtype *entry;
struct memtype *entry, *saved_entry;
int err = -EINVAL;
int is_range_ram;
@@ -466,23 +508,58 @@ int free_memtype(u64 start, u64 end)
return 0;
is_range_ram = pat_pagerange_is_ram(start, end);
if (is_range_ram == 1)
return free_ram_pages_type(start, end);
else if (is_range_ram < 0)
if (is_range_ram == 1) {
spin_lock(&memtype_lock);
err = free_ram_pages_type(start, end);
spin_unlock(&memtype_lock);
return err;
} else if (is_range_ram < 0) {
return -EINVAL;
}
spin_lock(&memtype_lock);
entry = memtype_rb_search(&memtype_rbroot, start);
if (unlikely(entry == NULL))
goto unlock_ret;
/*
* Saved entry points to an entry with start same or less than what
* we searched for. Now go through the list in both directions to look
* for the entry that matches with both start and end, with list stored
* in sorted start address
*/
saved_entry = entry;
list_for_each_entry(entry, &memtype_list, nd) {
if (entry->start == start && entry->end == end) {
if (cached_entry == entry || cached_start == start)
cached_entry = NULL;
rb_erase(&entry->rb, &memtype_rbroot);
list_del(&entry->nd);
kfree(entry);
err = 0;
break;
} else if (entry->start > start) {
break;
}
}
if (!err)
goto unlock_ret;
entry = saved_entry;
list_for_each_entry_reverse(entry, &memtype_list, nd) {
if (entry->start == start && entry->end == end) {
rb_erase(&entry->rb, &memtype_rbroot);
list_del(&entry->nd);
kfree(entry);
err = 0;
break;
} else if (entry->start < start) {
break;
}
}
unlock_ret:
spin_unlock(&memtype_lock);
if (err) {
@@ -496,6 +573,101 @@ int free_memtype(u64 start, u64 end)
}
/**
* lookup_memtype - Looksup the memory type for a physical address
* @paddr: physical address of which memory type needs to be looked up
*
* Only to be called when PAT is enabled
*
* Returns _PAGE_CACHE_WB, _PAGE_CACHE_WC, _PAGE_CACHE_UC_MINUS or
* _PAGE_CACHE_UC
*/
static unsigned long lookup_memtype(u64 paddr)
{
int rettype = _PAGE_CACHE_WB;
struct memtype *entry;
if (is_ISA_range(paddr, paddr + PAGE_SIZE - 1))
return rettype;
if (pat_pagerange_is_ram(paddr, paddr + PAGE_SIZE)) {
struct page *page;
spin_lock(&memtype_lock);
page = pfn_to_page(paddr >> PAGE_SHIFT);
rettype = get_page_memtype(page);
spin_unlock(&memtype_lock);
/*
* -1 from get_page_memtype() implies RAM page is in its
* default state and not reserved, and hence of type WB
*/
if (rettype == -1)
rettype = _PAGE_CACHE_WB;
return rettype;
}
spin_lock(&memtype_lock);
entry = memtype_rb_search(&memtype_rbroot, paddr);
if (entry != NULL)
rettype = entry->type;
else
rettype = _PAGE_CACHE_UC_MINUS;
spin_unlock(&memtype_lock);
return rettype;
}
/**
* io_reserve_memtype - Request a memory type mapping for a region of memory
* @start: start (physical address) of the region
* @end: end (physical address) of the region
* @type: A pointer to memtype, with requested type. On success, requested
* or any other compatible type that was available for the region is returned
*
* On success, returns 0
* On failure, returns non-zero
*/
int io_reserve_memtype(resource_size_t start, resource_size_t end,
unsigned long *type)
{
resource_size_t size = end - start;
unsigned long req_type = *type;
unsigned long new_type;
int ret;
WARN_ON_ONCE(iomem_map_sanity_check(start, size));
ret = reserve_memtype(start, end, req_type, &new_type);
if (ret)
goto out_err;
if (!is_new_memtype_allowed(start, size, req_type, new_type))
goto out_free;
if (kernel_map_sync_memtype(start, size, new_type) < 0)
goto out_free;
*type = new_type;
return 0;
out_free:
free_memtype(start, end);
ret = -EBUSY;
out_err:
return ret;
}
/**
* io_free_memtype - Release a memory type mapping for a region of memory
* @start: start (physical address) of the region
* @end: end (physical address) of the region
*/
void io_free_memtype(resource_size_t start, resource_size_t end)
{
free_memtype(start, end);
}
pgprot_t phys_mem_access_prot(struct file *file, unsigned long pfn,
unsigned long size, pgprot_t vma_prot)
{
@@ -577,7 +749,7 @@ int kernel_map_sync_memtype(u64 base, unsigned long size, unsigned long flags)
{
unsigned long id_sz;
if (!pat_enabled || base >= __pa(high_memory))
if (base >= __pa(high_memory))
return 0;
id_sz = (__pa(high_memory) < base + size) ?
@@ -612,11 +784,29 @@ static int reserve_pfn_range(u64 paddr, unsigned long size, pgprot_t *vma_prot,
is_ram = pat_pagerange_is_ram(paddr, paddr + size);
/*
* reserve_pfn_range() doesn't support RAM pages. Maintain the current
* behavior with RAM pages by returning success.
* reserve_pfn_range() for RAM pages. We do not refcount to keep
* track of number of mappings of RAM pages. We can assert that
* the type requested matches the type of first page in the range.
*/
if (is_ram != 0)
if (is_ram) {
if (!pat_enabled)
return 0;
flags = lookup_memtype(paddr);
if (want_flags != flags) {
printk(KERN_WARNING
"%s:%d map pfn RAM range req %s for %Lx-%Lx, got %s\n",
current->comm, current->pid,
cattr_name(want_flags),
(unsigned long long)paddr,
(unsigned long long)(paddr + size),
cattr_name(flags));
*vma_prot = __pgprot((pgprot_val(*vma_prot) &
(~_PAGE_CACHE_MASK)) |
flags);
}
return 0;
}
ret = reserve_memtype(paddr, paddr + size, want_flags, &flags);
if (ret)
@@ -678,14 +868,6 @@ int track_pfn_vma_copy(struct vm_area_struct *vma)
unsigned long vma_size = vma->vm_end - vma->vm_start;
pgprot_t pgprot;
if (!pat_enabled)
return 0;
/*
* For now, only handle remap_pfn_range() vmas where
* is_linear_pfn_mapping() == TRUE. Handling of
* vm_insert_pfn() is TBD.
*/
if (is_linear_pfn_mapping(vma)) {
/*
* reserve the whole chunk covered by vma. We need the
@@ -713,23 +895,24 @@ int track_pfn_vma_copy(struct vm_area_struct *vma)
int track_pfn_vma_new(struct vm_area_struct *vma, pgprot_t *prot,
unsigned long pfn, unsigned long size)
{
unsigned long flags;
resource_size_t paddr;
unsigned long vma_size = vma->vm_end - vma->vm_start;
if (!pat_enabled)
return 0;
/*
* For now, only handle remap_pfn_range() vmas where
* is_linear_pfn_mapping() == TRUE. Handling of
* vm_insert_pfn() is TBD.
*/
if (is_linear_pfn_mapping(vma)) {
/* reserve the whole chunk starting from vm_pgoff */
paddr = (resource_size_t)vma->vm_pgoff << PAGE_SHIFT;
return reserve_pfn_range(paddr, vma_size, prot, 0);
}
if (!pat_enabled)
return 0;
/* for vm_insert_pfn and friends, we set prot based on lookup */
flags = lookup_memtype(pfn << PAGE_SHIFT);
*prot = __pgprot((pgprot_val(vma->vm_page_prot) & (~_PAGE_CACHE_MASK)) |
flags);
return 0;
}
@@ -744,14 +927,6 @@ void untrack_pfn_vma(struct vm_area_struct *vma, unsigned long pfn,
resource_size_t paddr;
unsigned long vma_size = vma->vm_end - vma->vm_start;
if (!pat_enabled)
return;
/*
* For now, only handle remap_pfn_range() vmas where
* is_linear_pfn_mapping() == TRUE. Handling of
* vm_insert_pfn() is TBD.
*/
if (is_linear_pfn_mapping(vma)) {
/* free the whole chunk starting from vm_pgoff */
paddr = (resource_size_t)vma->vm_pgoff << PAGE_SHIFT;