Merge branch 'akpm' (patches from Andrew)

Merge more updates from Andrew Morton:

 - a few misc things

 - kexec updates

 - DMA-mapping updates to better support networking DMA operations

 - IPC updates

 - various MM changes to improve DAX fault handling

 - lots of radix-tree changes, mainly to the test suite. All leading up
   to reimplementing the IDA/IDR code to be a wrapper layer over the
   radix-tree. However the final trigger-pulling patch is held off for
   4.11.

* emailed patches from Andrew Morton <akpm@linux-foundation.org>: (114 commits)
  radix tree test suite: delete unused rcupdate.c
  radix tree test suite: add new tag check
  radix-tree: ensure counts are initialised
  radix tree test suite: cache recently freed objects
  radix tree test suite: add some more functionality
  idr: reduce the number of bits per level from 8 to 6
  rxrpc: abstract away knowledge of IDR internals
  tpm: use idr_find(), not idr_find_slowpath()
  idr: add ida_is_empty
  radix tree test suite: check multiorder iteration
  radix-tree: fix replacement for multiorder entries
  radix-tree: add radix_tree_split_preload()
  radix-tree: add radix_tree_split
  radix-tree: add radix_tree_join
  radix-tree: delete radix_tree_range_tag_if_tagged()
  radix-tree: delete radix_tree_locate_item()
  radix-tree: improve multiorder iterators
  btrfs: fix race in btrfs_free_dummy_fs_info()
  radix-tree: improve dump output
  radix-tree: make radix_tree_find_next_bit more useful
  ...
Merged by Linus Torvalds on 2016-12-14 17:25:18 -08:00
140 changed files with 3435 additions and 2225 deletions
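Most of the per-architecture hunks below follow one pattern: the dma_map_ops implementations learn to honour DMA_ATTR_SKIP_CPU_SYNC, so a network driver that recycles receive pages can map them once, skip the implicit whole-buffer cache maintenance, and issue targeted partial syncs around each received frame instead. A minimal sketch of how a driver might use the attribute; the helper names and the one-page-per-buffer layout are illustrative, not taken from this merge:

#include <linux/dma-mapping.h>

/* Hypothetical RX-buffer helpers; only the DMA API calls are real. */
static dma_addr_t rx_map_page(struct device *dev, struct page *page)
{
	/* Map without a CPU cache sync; the device may rewrite this page
	 * many times before the CPU ever looks at it.
	 */
	return dma_map_page_attrs(dev, page, 0, PAGE_SIZE,
				  DMA_FROM_DEVICE, DMA_ATTR_SKIP_CPU_SYNC);
}

static void rx_process_frag(struct device *dev, dma_addr_t dma,
			    unsigned int offset, unsigned int len)
{
	/* Sync only the fragment the device actually wrote ... */
	dma_sync_single_range_for_cpu(dev, dma, offset, len, DMA_FROM_DEVICE);

	/* ... consume the data, then hand the fragment back to the device. */
	dma_sync_single_range_for_device(dev, dma, offset, len, DMA_FROM_DEVICE);
}

static void rx_unmap_page(struct device *dev, dma_addr_t dma)
{
	/* Tear down the mapping without another full-page sync. */
	dma_unmap_page_attrs(dev, dma, PAGE_SIZE, DMA_FROM_DEVICE,
			     DMA_ATTR_SKIP_CPU_SYNC);
}

The igb changes at the end of this merge apply exactly this split: map and unmap with DMA_ATTR_SKIP_CPU_SYNC, plus dma_sync_single_range_for_cpu() only on the fragment being consumed.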

@@ -556,7 +556,7 @@ till "end_pgoff". ->map_pages() is called with page table locked and must
 not block. If it's not possible to reach a page without blocking,
 filesystem should skip it. Filesystem should use do_set_pte() to setup
 page table entry. Pointer to entry associated with the page is passed in
-"pte" field in fault_env structure. Pointers to entries for other offsets
+"pte" field in vm_fault structure. Pointers to entries for other offsets
 should be calculated relative to "pte".
 
 	->page_mkwrite() is called when a previously read-only pte is
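The driver hunks further down convert fault handlers from the removed vmf->virtual_address (a void __user *) to the new vmf->address (an unsigned long) that struct vm_fault now carries. A minimal sketch of a handler in the new style, assuming a hypothetical example_base_pfn() helper:

#include <linux/mm.h>

/* Illustrative fault handler; example_base_pfn() is hypothetical. */
static int example_vm_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
{
	/* vmf->address is already an unsigned long, no cast needed. */
	unsigned long pgoff = (vmf->address - vma->vm_start) >> PAGE_SHIFT;
	unsigned long pfn = example_base_pfn(vma) + pgoff;

	if (vm_insert_pfn(vma, vmf->address, pfn))
		return VM_FAULT_SIGBUS;

	return VM_FAULT_NOPAGE;
}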

@@ -158,7 +158,10 @@ static dma_addr_t arc_dma_map_page(struct device *dev, struct page *page,
 			unsigned long attrs)
 {
 	phys_addr_t paddr = page_to_phys(page) + offset;
-	_dma_cache_sync(paddr, size, dir);
+
+	if (!(attrs & DMA_ATTR_SKIP_CPU_SYNC))
+		_dma_cache_sync(paddr, size, dir);
+
 	return plat_phys_to_dma(dev, paddr);
 }

@@ -243,7 +243,8 @@ static int needs_bounce(struct device *dev, dma_addr_t dma_addr, size_t size)
 }
 
 static inline dma_addr_t map_single(struct device *dev, void *ptr, size_t size,
-		enum dma_data_direction dir)
+		enum dma_data_direction dir,
+		unsigned long attrs)
 {
 	struct dmabounce_device_info *device_info = dev->archdata.dmabounce;
 	struct safe_buffer *buf;
@@ -262,7 +263,8 @@ static inline dma_addr_t map_single(struct device *dev, void *ptr, size_t size,
 		__func__, buf->ptr, virt_to_dma(dev, buf->ptr),
 		buf->safe, buf->safe_dma_addr);
 
-	if (dir == DMA_TO_DEVICE || dir == DMA_BIDIRECTIONAL) {
+	if ((dir == DMA_TO_DEVICE || dir == DMA_BIDIRECTIONAL) &&
+	    !(attrs & DMA_ATTR_SKIP_CPU_SYNC)) {
 		dev_dbg(dev, "%s: copy unsafe %p to safe %p, size %d\n",
 			__func__, ptr, buf->safe, size);
 		memcpy(buf->safe, ptr, size);
@@ -272,7 +274,8 @@ static inline dma_addr_t map_single(struct device *dev, void *ptr, size_t size,
 }
 
 static inline void unmap_single(struct device *dev, struct safe_buffer *buf,
-		size_t size, enum dma_data_direction dir)
+		size_t size, enum dma_data_direction dir,
+		unsigned long attrs)
 {
 	BUG_ON(buf->size != size);
 	BUG_ON(buf->direction != dir);
@@ -283,7 +286,8 @@ static inline void unmap_single(struct device *dev, struct safe_buffer *buf,
 	DO_STATS(dev->archdata.dmabounce->bounce_count++);
 
-	if (dir == DMA_FROM_DEVICE || dir == DMA_BIDIRECTIONAL) {
+	if ((dir == DMA_FROM_DEVICE || dir == DMA_BIDIRECTIONAL) &&
+	    !(attrs & DMA_ATTR_SKIP_CPU_SYNC)) {
 		void *ptr = buf->ptr;
 
 		dev_dbg(dev, "%s: copy back safe %p to unsafe %p size %d\n",
@@ -334,7 +338,7 @@ static dma_addr_t dmabounce_map_page(struct device *dev, struct page *page,
 		return DMA_ERROR_CODE;
 	}
 
-	return map_single(dev, page_address(page) + offset, size, dir);
+	return map_single(dev, page_address(page) + offset, size, dir, attrs);
 }
 
 /*
@@ -357,7 +361,7 @@ static void dmabounce_unmap_page(struct device *dev, dma_addr_t dma_addr, size_t
 		return;
 	}
 
-	unmap_single(dev, buf, size, dir);
+	unmap_single(dev, buf, size, dir, attrs);
 }
 
 static int __dmabounce_sync_for_cpu(struct device *dev, dma_addr_t addr,

@@ -146,7 +146,8 @@ static dma_addr_t avr32_dma_map_page(struct device *dev, struct page *page,
 {
 	void *cpu_addr = page_address(page) + offset;
 
-	dma_cache_sync(dev, cpu_addr, size, direction);
+	if (!(attrs & DMA_ATTR_SKIP_CPU_SYNC))
+		dma_cache_sync(dev, cpu_addr, size, direction);
 	return virt_to_bus(cpu_addr);
 }
@@ -162,6 +163,10 @@ static int avr32_dma_map_sg(struct device *dev, struct scatterlist *sglist,
 		sg->dma_address = page_to_bus(sg_page(sg)) + sg->offset;
 		virt = sg_virt(sg);
+
+		if (attrs & DMA_ATTR_SKIP_CPU_SYNC)
+			continue;
+
 		dma_cache_sync(dev, virt, sg->length, direction);
 	}

@@ -118,6 +118,10 @@ static int bfin_dma_map_sg(struct device *dev, struct scatterlist *sg_list,
 	for_each_sg(sg_list, sg, nents, i) {
 		sg->dma_address = (dma_addr_t) sg_virt(sg);
+
+		if (attrs & DMA_ATTR_SKIP_CPU_SYNC)
+			continue;
+
 		__dma_sync(sg_dma_address(sg), sg_dma_len(sg), direction);
 	}
@@ -143,7 +147,9 @@ static dma_addr_t bfin_dma_map_page(struct device *dev, struct page *page,
 {
 	dma_addr_t handle = (dma_addr_t)(page_address(page) + offset);
 
-	_dma_sync(handle, size, dir);
+	if (!(attrs & DMA_ATTR_SKIP_CPU_SYNC))
+		_dma_sync(handle, size, dir);
+
 	return handle;
 }

@@ -42,14 +42,17 @@ static dma_addr_t c6x_dma_map_page(struct device *dev, struct page *page,
 {
 	dma_addr_t handle = virt_to_phys(page_address(page) + offset);
 
-	c6x_dma_sync(handle, size, dir);
+	if (!(attrs & DMA_ATTR_SKIP_CPU_SYNC))
+		c6x_dma_sync(handle, size, dir);
+
 	return handle;
 }
 
 static void c6x_dma_unmap_page(struct device *dev, dma_addr_t handle,
 		size_t size, enum dma_data_direction dir, unsigned long attrs)
 {
-	c6x_dma_sync(handle, size, dir);
+	if (!(attrs & DMA_ATTR_SKIP_CPU_SYNC))
+		c6x_dma_sync(handle, size, dir);
 }
 
 static int c6x_dma_map_sg(struct device *dev, struct scatterlist *sglist,
@@ -60,7 +63,8 @@ static int c6x_dma_map_sg(struct device *dev, struct scatterlist *sglist,
 	for_each_sg(sglist, sg, nents, i) {
 		sg->dma_address = sg_phys(sg);
-		c6x_dma_sync(sg->dma_address, sg->length, dir);
+		if (!(attrs & DMA_ATTR_SKIP_CPU_SYNC))
+			c6x_dma_sync(sg->dma_address, sg->length, dir);
 	}
 
 	return nents;
@@ -72,9 +76,11 @@ static void c6x_dma_unmap_sg(struct device *dev, struct scatterlist *sglist,
 	struct scatterlist *sg;
 	int i;
 
+	if (attrs & DMA_ATTR_SKIP_CPU_SYNC)
+		return;
+
 	for_each_sg(sglist, sg, nents, i)
 		c6x_dma_sync(sg_dma_address(sg), sg->length, dir);
 }
 
 static void c6x_dma_sync_single_for_cpu(struct device *dev, dma_addr_t handle,

@@ -109,16 +109,19 @@ static int frv_dma_map_sg(struct device *dev, struct scatterlist *sglist,
 		int nents, enum dma_data_direction direction,
 		unsigned long attrs)
 {
-	int i;
 	struct scatterlist *sg;
+	int i;
+
+	BUG_ON(direction == DMA_NONE);
+
+	if (attrs & DMA_ATTR_SKIP_CPU_SYNC)
+		return nents;
 
 	for_each_sg(sglist, sg, nents, i) {
 		frv_cache_wback_inv(sg_dma_address(sg),
 				    sg_dma_address(sg) + sg_dma_len(sg));
 	}
 
-	BUG_ON(direction == DMA_NONE);
-
 	return nents;
 }
@@ -127,7 +130,10 @@ static dma_addr_t frv_dma_map_page(struct device *dev, struct page *page,
 		enum dma_data_direction direction, unsigned long attrs)
 {
 	BUG_ON(direction == DMA_NONE);
-	flush_dcache_page(page);
+
+	if (!(attrs & DMA_ATTR_SKIP_CPU_SYNC))
+		flush_dcache_page(page);
+
 	return (dma_addr_t) page_to_phys(page) + offset;
 }

@@ -40,13 +40,16 @@ static int frv_dma_map_sg(struct device *dev, struct scatterlist *sglist,
 		int nents, enum dma_data_direction direction,
 		unsigned long attrs)
 {
+	struct scatterlist *sg;
 	unsigned long dampr2;
 	void *vaddr;
 	int i;
-	struct scatterlist *sg;
 
 	BUG_ON(direction == DMA_NONE);
 
+	if (attrs & DMA_ATTR_SKIP_CPU_SYNC)
+		return nents;
+
 	dampr2 = __get_DAMPR(2);
 
 	for_each_sg(sglist, sg, nents, i) {
@@ -70,7 +73,9 @@ static dma_addr_t frv_dma_map_page(struct device *dev, struct page *page,
 		unsigned long offset, size_t size,
 		enum dma_data_direction direction, unsigned long attrs)
 {
-	flush_dcache_page(page);
+	if (!(attrs & DMA_ATTR_SKIP_CPU_SYNC))
+		flush_dcache_page(page);
+
 	return (dma_addr_t) page_to_phys(page) + offset;
 }

@@ -119,6 +119,9 @@ static int hexagon_map_sg(struct device *hwdev, struct scatterlist *sg,
 		s->dma_length = s->length;
 
+		if (attrs & DMA_ATTR_SKIP_CPU_SYNC)
+			continue;
+
 		flush_dcache_range(dma_addr_to_virt(s->dma_address),
 				   dma_addr_to_virt(s->dma_address + s->length));
 	}
@@ -180,7 +183,8 @@ static dma_addr_t hexagon_map_page(struct device *dev, struct page *page,
 	if (!check_addr("map_single", dev, bus, size))
 		return bad_dma_address;
 
-	dma_sync(dma_addr_to_virt(bus), size, dir);
+	if (!(attrs & DMA_ATTR_SKIP_CPU_SYNC))
+		dma_sync(dma_addr_to_virt(bus), size, dir);
 
 	return bus;
 }

@@ -134,7 +134,9 @@ static dma_addr_t m68k_dma_map_page(struct device *dev, struct page *page,
 {
 	dma_addr_t handle = page_to_phys(page) + offset;
 
-	dma_sync_single_for_device(dev, handle, size, dir);
+	if (!(attrs & DMA_ATTR_SKIP_CPU_SYNC))
+		dma_sync_single_for_device(dev, handle, size, dir);
+
 	return handle;
 }
@@ -146,6 +148,10 @@ static int m68k_dma_map_sg(struct device *dev, struct scatterlist *sglist,
 	for_each_sg(sglist, sg, nents, i) {
 		sg->dma_address = sg_phys(sg);
 
+		if (attrs & DMA_ATTR_SKIP_CPU_SYNC)
+			continue;
+
 		dma_sync_single_for_device(dev, sg->dma_address, sg->length,
 					   dir);
 	}

@@ -484,8 +484,9 @@ static dma_addr_t metag_dma_map_page(struct device *dev, struct page *page,
 		unsigned long offset, size_t size,
 		enum dma_data_direction direction, unsigned long attrs)
 {
-	dma_sync_for_device((void *)(page_to_phys(page) + offset), size,
-			    direction);
+	if (!(attrs & DMA_ATTR_SKIP_CPU_SYNC))
+		dma_sync_for_device((void *)(page_to_phys(page) + offset),
+				    size, direction);
 	return page_to_phys(page) + offset;
 }
@@ -493,7 +494,8 @@ static void metag_dma_unmap_page(struct device *dev, dma_addr_t dma_address,
 		size_t size, enum dma_data_direction direction,
 		unsigned long attrs)
 {
-	dma_sync_for_cpu(phys_to_virt(dma_address), size, direction);
+	if (!(attrs & DMA_ATTR_SKIP_CPU_SYNC))
+		dma_sync_for_cpu(phys_to_virt(dma_address), size, direction);
 }
 
 static int metag_dma_map_sg(struct device *dev, struct scatterlist *sglist,
@@ -507,6 +509,10 @@ static int metag_dma_map_sg(struct device *dev, struct scatterlist *sglist,
 		BUG_ON(!sg_page(sg));
 
 		sg->dma_address = sg_phys(sg);
+
+		if (attrs & DMA_ATTR_SKIP_CPU_SYNC)
+			continue;
+
 		dma_sync_for_device(sg_virt(sg), sg->length, direction);
 	}
@@ -525,6 +531,10 @@ static void metag_dma_unmap_sg(struct device *dev, struct scatterlist *sglist,
 		BUG_ON(!sg_page(sg));
 
 		sg->dma_address = sg_phys(sg);
+
+		if (attrs & DMA_ATTR_SKIP_CPU_SYNC)
+			continue;
+
 		dma_sync_for_cpu(sg_virt(sg), sg->length, direction);
 	}
 }

@@ -61,6 +61,10 @@ static int dma_direct_map_sg(struct device *dev, struct scatterlist *sgl,
 	/* FIXME this part of code is untested */
 	for_each_sg(sgl, sg, nents, i) {
 		sg->dma_address = sg_phys(sg);
+
+		if (attrs & DMA_ATTR_SKIP_CPU_SYNC)
+			continue;
+
 		__dma_sync(page_to_phys(sg_page(sg)) + sg->offset,
 			   sg->length, direction);
 	}
@@ -80,7 +84,8 @@ static inline dma_addr_t dma_direct_map_page(struct device *dev,
 		enum dma_data_direction direction,
 		unsigned long attrs)
 {
-	__dma_sync(page_to_phys(page) + offset, size, direction);
+	if (!(attrs & DMA_ATTR_SKIP_CPU_SYNC))
+		__dma_sync(page_to_phys(page) + offset, size, direction);
 	return page_to_phys(page) + offset;
 }
@@ -95,7 +100,8 @@ static inline void dma_direct_unmap_page(struct device *dev,
 	 * phys_to_virt is here because in __dma_sync_page is __virt_to_phys and
 	 * dma_address is physical address
 	 */
-	__dma_sync(dma_address, size, direction);
+	if (!(attrs & DMA_ATTR_SKIP_CPU_SYNC))
+		__dma_sync(dma_address, size, direction);
 }
 
 static inline void

@@ -61,7 +61,7 @@ static int loongson_dma_map_sg(struct device *dev, struct scatterlist *sg,
 		int nents, enum dma_data_direction dir,
 		unsigned long attrs)
 {
-	int r = swiotlb_map_sg_attrs(dev, sg, nents, dir, 0);
+	int r = swiotlb_map_sg_attrs(dev, sg, nents, dir, attrs);
 	mb();
 
 	return r;

@@ -293,7 +293,7 @@ static inline void __dma_sync(struct page *page,
 static void mips_dma_unmap_page(struct device *dev, dma_addr_t dma_addr,
 	size_t size, enum dma_data_direction direction, unsigned long attrs)
 {
-	if (cpu_needs_post_dma_flush(dev))
+	if (cpu_needs_post_dma_flush(dev) && !(attrs & DMA_ATTR_SKIP_CPU_SYNC))
 		__dma_sync(dma_addr_to_page(dev, dma_addr),
 			   dma_addr & ~PAGE_MASK, size, direction);
 	plat_post_dma_flush(dev);
@@ -307,7 +307,8 @@ static int mips_dma_map_sg(struct device *dev, struct scatterlist *sglist,
 	struct scatterlist *sg;
 
 	for_each_sg(sglist, sg, nents, i) {
-		if (!plat_device_is_coherent(dev))
+		if (!plat_device_is_coherent(dev) &&
+		    !(attrs & DMA_ATTR_SKIP_CPU_SYNC))
 			__dma_sync(sg_page(sg), sg->offset, sg->length,
 				   direction);
 #ifdef CONFIG_NEED_SG_DMA_LENGTH
@@ -324,7 +325,7 @@ static dma_addr_t mips_dma_map_page(struct device *dev, struct page *page,
 	unsigned long offset, size_t size, enum dma_data_direction direction,
 	unsigned long attrs)
 {
-	if (!plat_device_is_coherent(dev))
+	if (!plat_device_is_coherent(dev) && !(attrs & DMA_ATTR_SKIP_CPU_SYNC))
 		__dma_sync(page, offset, size, direction);
 
 	return plat_map_dma_mem_page(dev, page) + offset;
@@ -339,6 +340,7 @@ static void mips_dma_unmap_sg(struct device *dev, struct scatterlist *sglist,
 	for_each_sg(sglist, sg, nhwentries, i) {
 		if (!plat_device_is_coherent(dev) &&
+		    !(attrs & DMA_ATTR_SKIP_CPU_SYNC) &&
 		    direction != DMA_TO_DEVICE)
 			__dma_sync(sg_page(sg), sg->offset, sg->length,
 				   direction);

@@ -98,13 +98,17 @@ static int nios2_dma_map_sg(struct device *dev, struct scatterlist *sg,
 	int i;
 
 	for_each_sg(sg, sg, nents, i) {
-		void *addr;
+		void *addr = sg_virt(sg);
 
-		addr = sg_virt(sg);
-		if (addr) {
-			__dma_sync_for_device(addr, sg->length, direction);
-			sg->dma_address = sg_phys(sg);
-		}
+		if (!addr)
+			continue;
+
+		sg->dma_address = sg_phys(sg);
+
+		if (attrs & DMA_ATTR_SKIP_CPU_SYNC)
+			continue;
+
+		__dma_sync_for_device(addr, sg->length, direction);
 	}
 
 	return nents;
@@ -117,7 +121,9 @@ static dma_addr_t nios2_dma_map_page(struct device *dev, struct page *page,
 {
 	void *addr = page_address(page) + offset;
 
-	__dma_sync_for_device(addr, size, direction);
+	if (!(attrs & DMA_ATTR_SKIP_CPU_SYNC))
+		__dma_sync_for_device(addr, size, direction);
+
 	return page_to_phys(page) + offset;
 }
@@ -125,7 +131,8 @@ static void nios2_dma_unmap_page(struct device *dev, dma_addr_t dma_address,
 		size_t size, enum dma_data_direction direction,
 		unsigned long attrs)
 {
-	__dma_sync_for_cpu(phys_to_virt(dma_address), size, direction);
+	if (!(attrs & DMA_ATTR_SKIP_CPU_SYNC))
+		__dma_sync_for_cpu(phys_to_virt(dma_address), size, direction);
 }
 
 static void nios2_dma_unmap_sg(struct device *dev, struct scatterlist *sg,
@@ -138,6 +145,9 @@ static void nios2_dma_unmap_sg(struct device *dev, struct scatterlist *sg,
 	if (direction == DMA_TO_DEVICE)
 		return;
 
+	if (attrs & DMA_ATTR_SKIP_CPU_SYNC)
+		return;
+
 	for_each_sg(sg, sg, nhwentries, i) {
 		addr = sg_virt(sg);
 		if (addr)

@@ -141,6 +141,9 @@ or1k_map_page(struct device *dev, struct page *page,
 	unsigned long cl;
 	dma_addr_t addr = page_to_phys(page) + offset;
 
+	if (attrs & DMA_ATTR_SKIP_CPU_SYNC)
+		return addr;
+
 	switch (dir) {
 	case DMA_TO_DEVICE:
 		/* Flush the dcache for the requested range */

@@ -459,7 +459,9 @@ static dma_addr_t pa11_dma_map_page(struct device *dev, struct page *page,
 	void *addr = page_address(page) + offset;
 	BUG_ON(direction == DMA_NONE);
 
-	flush_kernel_dcache_range((unsigned long) addr, size);
+	if (!(attrs & DMA_ATTR_SKIP_CPU_SYNC))
+		flush_kernel_dcache_range((unsigned long) addr, size);
+
 	return virt_to_phys(addr);
 }
@@ -469,8 +471,11 @@ static void pa11_dma_unmap_page(struct device *dev, dma_addr_t dma_handle,
 {
 	BUG_ON(direction == DMA_NONE);
 
+	if (attrs & DMA_ATTR_SKIP_CPU_SYNC)
+		return;
+
 	if (direction == DMA_TO_DEVICE)
 		return;
 
 	/*
 	 * For PCI_DMA_FROMDEVICE this flush is not necessary for the
@@ -479,7 +484,6 @@ static void pa11_dma_unmap_page(struct device *dev, dma_addr_t dma_handle,
 	 */
 
 	flush_kernel_dcache_range((unsigned long) phys_to_virt(dma_handle), size);
-	return;
 }
 
 static int pa11_dma_map_sg(struct device *dev, struct scatterlist *sglist,
@@ -496,6 +500,10 @@ static int pa11_dma_map_sg(struct device *dev, struct scatterlist *sglist,
 		sg_dma_address(sg) = (dma_addr_t) virt_to_phys(vaddr);
 		sg_dma_len(sg) = sg->length;
+
+		if (attrs & DMA_ATTR_SKIP_CPU_SYNC)
+			continue;
+
 		flush_kernel_dcache_range(vaddr, sg->length);
 	}
 	return nents;
@@ -510,14 +518,16 @@ static void pa11_dma_unmap_sg(struct device *dev, struct scatterlist *sglist,
 	BUG_ON(direction == DMA_NONE);
 
+	if (attrs & DMA_ATTR_SKIP_CPU_SYNC)
+		return;
+
 	if (direction == DMA_TO_DEVICE)
 		return;
 
 	/* once we do combining we'll need to use phys_to_virt(sg_dma_address(sglist)) */
 
 	for_each_sg(sglist, sg, nents, i)
 		flush_kernel_vmap_range(sg_virt(sg), sg->length);
-	return;
 }
 
 static void pa11_dma_sync_single_for_cpu(struct device *dev,

@@ -203,6 +203,10 @@ static int dma_direct_map_sg(struct device *dev, struct scatterlist *sgl,
 	for_each_sg(sgl, sg, nents, i) {
 		sg->dma_address = sg_phys(sg) + get_dma_offset(dev);
 		sg->dma_length = sg->length;
+
+		if (attrs & DMA_ATTR_SKIP_CPU_SYNC)
+			continue;
+
 		__dma_sync_page(sg_page(sg), sg->offset, sg->length, direction);
 	}
@@ -235,7 +239,10 @@ static inline dma_addr_t dma_direct_map_page(struct device *dev,
 					     unsigned long attrs)
 {
 	BUG_ON(dir == DMA_NONE);
-	__dma_sync_page(page, offset, size, dir);
+
+	if (!(attrs & DMA_ATTR_SKIP_CPU_SYNC))
+		__dma_sync_page(page, offset, size, dir);
+
 	return page_to_phys(page) + offset + get_dma_offset(dev);
 }

@@ -236,7 +236,6 @@ static int
 spufs_mem_mmap_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
 {
 	struct spu_context *ctx = vma->vm_file->private_data;
-	unsigned long address = (unsigned long)vmf->virtual_address;
 	unsigned long pfn, offset;
 
 	offset = vmf->pgoff << PAGE_SHIFT;
@@ -244,7 +243,7 @@ spufs_mem_mmap_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
 		return VM_FAULT_SIGBUS;
 
 	pr_debug("spufs_mem_mmap_fault address=0x%lx, offset=0x%lx\n",
-			address, offset);
+			vmf->address, offset);
 
 	if (spu_acquire(ctx))
 		return VM_FAULT_NOPAGE;
@@ -256,7 +255,7 @@ spufs_mem_mmap_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
 		vma->vm_page_prot = pgprot_noncached_wc(vma->vm_page_prot);
 		pfn = (ctx->spu->local_store_phys + offset) >> PAGE_SHIFT;
 	}
-	vm_insert_pfn(vma, address, pfn);
+	vm_insert_pfn(vma, vmf->address, pfn);
 
 	spu_release(ctx);
@@ -355,8 +354,7 @@ static int spufs_ps_fault(struct vm_area_struct *vma,
 		down_read(&current->mm->mmap_sem);
 	} else {
 		area = ctx->spu->problem_phys + ps_offs;
-		vm_insert_pfn(vma, (unsigned long)vmf->virtual_address,
-				(area + offset) >> PAGE_SHIFT);
+		vm_insert_pfn(vma, vmf->address, (area + offset) >> PAGE_SHIFT);
 		spu_context_trace(spufs_ps_fault__insert, ctx, ctx->spu);
 	}

@@ -18,7 +18,9 @@ static dma_addr_t nommu_map_page(struct device *dev, struct page *page,
 	dma_addr_t addr = page_to_phys(page) + offset;
 
 	WARN_ON(size == 0);
-	dma_cache_sync(dev, page_address(page) + offset, size, dir);
+
+	if (!(attrs & DMA_ATTR_SKIP_CPU_SYNC))
+		dma_cache_sync(dev, page_address(page) + offset, size, dir);
 
 	return addr;
 }
@@ -35,7 +37,8 @@ static int nommu_map_sg(struct device *dev, struct scatterlist *sg,
 	for_each_sg(sg, s, nents, i) {
 		BUG_ON(!sg_page(s));
 
-		dma_cache_sync(dev, sg_virt(s), s->length, dir);
+		if (!(attrs & DMA_ATTR_SKIP_CPU_SYNC))
+			dma_cache_sync(dev, sg_virt(s), s->length, dir);
 
 		s->dma_address = sg_phys(s);
 		s->dma_length = s->length;

@@ -415,7 +415,7 @@ static void dma_4u_unmap_page(struct device *dev, dma_addr_t bus_addr,
 		ctx = (iopte_val(*base) & IOPTE_CONTEXT) >> 47UL;
 
 	/* Step 1: Kick data out of streaming buffers if necessary. */
-	if (strbuf->strbuf_enabled)
+	if (strbuf->strbuf_enabled && !(attrs & DMA_ATTR_SKIP_CPU_SYNC))
 		strbuf_flush(strbuf, iommu, bus_addr, ctx,
 			     npages, direction);
@@ -640,7 +640,7 @@ static void dma_4u_unmap_sg(struct device *dev, struct scatterlist *sglist,
 	base = iommu->page_table + entry;
 
 	dma_handle &= IO_PAGE_MASK;
-	if (strbuf->strbuf_enabled)
+	if (strbuf->strbuf_enabled && !(attrs & DMA_ATTR_SKIP_CPU_SYNC))
 		strbuf_flush(strbuf, iommu, dma_handle, ctx,
 			     npages, direction);

@@ -527,7 +527,7 @@ static dma_addr_t pci32_map_page(struct device *dev, struct page *page,
 static void pci32_unmap_page(struct device *dev, dma_addr_t ba, size_t size,
 			     enum dma_data_direction dir, unsigned long attrs)
 {
-	if (dir != PCI_DMA_TODEVICE)
+	if (dir != PCI_DMA_TODEVICE && !(attrs & DMA_ATTR_SKIP_CPU_SYNC))
 		dma_make_coherent(ba, PAGE_ALIGN(size));
 }
@@ -572,7 +572,7 @@ static void pci32_unmap_sg(struct device *dev, struct scatterlist *sgl,
 	struct scatterlist *sg;
 	int n;
 
-	if (dir != PCI_DMA_TODEVICE) {
+	if (dir != PCI_DMA_TODEVICE && !(attrs & DMA_ATTR_SKIP_CPU_SYNC)) {
 		for_each_sg(sgl, sg, nents, n) {
 			dma_make_coherent(sg_phys(sg), PAGE_ALIGN(sg->length));
 		}

@@ -42,7 +42,7 @@ static int panic_on_timeout;
  */
 atomic_t nmi_active = ATOMIC_INIT(0);		/* oprofile uses this */
 EXPORT_SYMBOL(nmi_active);
-
+static int nmi_init_done;
 static unsigned int nmi_hz = HZ;
 static DEFINE_PER_CPU(short, wd_enabled);
 static int endflag __initdata;
@@ -153,6 +153,8 @@ static void report_broken_nmi(int cpu, int *prev_nmi_count)
 void stop_nmi_watchdog(void *unused)
 {
+	if (!__this_cpu_read(wd_enabled))
+		return;
 	pcr_ops->write_pcr(0, pcr_ops->pcr_nmi_disable);
 	__this_cpu_write(wd_enabled, 0);
 	atomic_dec(&nmi_active);
@@ -207,6 +209,9 @@ error:
 void start_nmi_watchdog(void *unused)
 {
+	if (__this_cpu_read(wd_enabled))
+		return;
+
 	__this_cpu_write(wd_enabled, 1);
 	atomic_inc(&nmi_active);
@@ -259,6 +264,8 @@ int __init nmi_init(void)
 		}
 	}
 
+	nmi_init_done = 1;
+
 	return err;
 }
@@ -270,3 +277,38 @@ static int __init setup_nmi_watchdog(char *str)
 	return 0;
 }
 __setup("nmi_watchdog=", setup_nmi_watchdog);
+
+/*
+ * sparc specific NMI watchdog enable function.
+ * Enables watchdog if it is not enabled already.
+ */
+int watchdog_nmi_enable(unsigned int cpu)
+{
+	if (atomic_read(&nmi_active) == -1) {
+		pr_warn("NMI watchdog cannot be enabled or disabled\n");
+		return -1;
+	}
+
+	/*
+	 * watchdog thread could start even before nmi_init is called.
+	 * Just Return in that case. Let nmi_init finish the init
+	 * process first.
+	 */
+	if (!nmi_init_done)
+		return 0;
+
+	smp_call_function_single(cpu, start_nmi_watchdog, NULL, 1);
+
+	return 0;
+}
+
+/*
+ * sparc specific NMI watchdog disable function.
+ * Disables watchdog if it is not disabled already.
+ */
+void watchdog_nmi_disable(unsigned int cpu)
+{
+	if (atomic_read(&nmi_active) == -1)
+		pr_warn_once("NMI watchdog cannot be enabled or disabled\n");
+	else
+		smp_call_function_single(cpu, stop_nmi_watchdog, NULL, 1);
+}

@@ -213,10 +213,12 @@ static int tile_dma_map_sg(struct device *dev, struct scatterlist *sglist,
 	for_each_sg(sglist, sg, nents, i) {
 		sg->dma_address = sg_phys(sg);
-		__dma_prep_pa_range(sg->dma_address, sg->length, direction);
 #ifdef CONFIG_NEED_SG_DMA_LENGTH
 		sg->dma_length = sg->length;
 #endif
+		if (attrs & DMA_ATTR_SKIP_CPU_SYNC)
+			continue;
+		__dma_prep_pa_range(sg->dma_address, sg->length, direction);
 	}
 
 	return nents;
@@ -232,6 +234,8 @@ static void tile_dma_unmap_sg(struct device *dev, struct scatterlist *sglist,
 	BUG_ON(!valid_dma_direction(direction));
 	for_each_sg(sglist, sg, nents, i) {
 		sg->dma_address = sg_phys(sg);
+		if (attrs & DMA_ATTR_SKIP_CPU_SYNC)
+			continue;
 		__dma_complete_pa_range(sg->dma_address, sg->length,
 					direction);
 	}
@@ -245,7 +249,8 @@ static dma_addr_t tile_dma_map_page(struct device *dev, struct page *page,
 	BUG_ON(!valid_dma_direction(direction));
 
 	BUG_ON(offset + size > PAGE_SIZE);
-	__dma_prep_page(page, offset, size, direction);
+	if (!(attrs & DMA_ATTR_SKIP_CPU_SYNC))
+		__dma_prep_page(page, offset, size, direction);
 
 	return page_to_pa(page) + offset;
 }
@@ -256,6 +261,9 @@ static void tile_dma_unmap_page(struct device *dev, dma_addr_t dma_address,
 {
 	BUG_ON(!valid_dma_direction(direction));
 
+	if (attrs & DMA_ATTR_SKIP_CPU_SYNC)
+		return;
+
 	__dma_complete_page(pfn_to_page(PFN_DOWN(dma_address)),
 			    dma_address & (PAGE_SIZE - 1), size, direction);
 }

@@ -109,7 +109,7 @@ static int vvar_fault(const struct vm_special_mapping *sm,
 		return VM_FAULT_SIGBUS;
 
 	if (sym_offset == image->sym_vvar_page) {
-		ret = vm_insert_pfn(vma, (unsigned long)vmf->virtual_address,
+		ret = vm_insert_pfn(vma, vmf->address,
 				    __pa_symbol(&__vvar_page) >> PAGE_SHIFT);
 	} else if (sym_offset == image->sym_pvclock_page) {
 		struct pvclock_vsyscall_time_info *pvti =
@@ -117,7 +117,7 @@ static int vvar_fault(const struct vm_special_mapping *sm,
 		if (pvti && vclock_was_used(VCLOCK_PVCLOCK)) {
 			ret = vm_insert_pfn(
 				vma,
-				(unsigned long)vmf->virtual_address,
+				vmf->address,
 				__pa(pvti) >> PAGE_SHIFT);
 		}
 	}

@@ -328,7 +328,7 @@ void machine_kexec(struct kimage *image)
 
 void arch_crash_save_vmcoreinfo(void)
 {
-	VMCOREINFO_SYMBOL(phys_base);
+	VMCOREINFO_NUMBER(phys_base);
 	VMCOREINFO_SYMBOL(init_level4_pgt);
 
 #ifdef CONFIG_NUMA
@@ -337,9 +337,7 @@ void arch_crash_save_vmcoreinfo(void)
 #endif
 	vmcoreinfo_append_str("KERNELOFFSET=%lx\n",
 			      kaslr_offset());
-	VMCOREINFO_PAGE_OFFSET(PAGE_OFFSET);
-	VMCOREINFO_VMALLOC_START(VMALLOC_START);
-	VMCOREINFO_VMEMMAP_START(VMEMMAP_START);
+	VMCOREINFO_NUMBER(KERNEL_IMAGE_SIZE);
 }
 
 /* arch-dependent functionality related to kexec file-based syscall */

@@ -189,7 +189,9 @@ static dma_addr_t xtensa_map_page(struct device *dev, struct page *page,
 {
 	dma_addr_t dma_handle = page_to_phys(page) + offset;
 
-	xtensa_sync_single_for_device(dev, dma_handle, size, dir);
+	if (!(attrs & DMA_ATTR_SKIP_CPU_SYNC))
+		xtensa_sync_single_for_device(dev, dma_handle, size, dir);
+
 	return dma_handle;
 }
@@ -197,7 +199,8 @@ static void xtensa_unmap_page(struct device *dev, dma_addr_t dma_handle,
 			      size_t size, enum dma_data_direction dir,
 			      unsigned long attrs)
 {
-	xtensa_sync_single_for_cpu(dev, dma_handle, size, dir);
+	if (!(attrs & DMA_ATTR_SKIP_CPU_SYNC))
+		xtensa_sync_single_for_cpu(dev, dma_handle, size, dir);
 }
 
 static int xtensa_map_sg(struct device *dev, struct scatterlist *sg,

@@ -19,8 +19,7 @@ static int alpha_core_agp_vm_fault(struct vm_area_struct *vma,
 	unsigned long pa;
 	struct page *page;
 
-	dma_addr = (unsigned long)vmf->virtual_address - vma->vm_start
-						+ agp->aperture.bus_base;
+	dma_addr = vmf->address - vma->vm_start + agp->aperture.bus_base;
 	pa = agp->ops->translate(agp, dma_addr);
 
 	if (pa == (unsigned long)-EINVAL)

@@ -227,7 +227,7 @@ mspec_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
 	 * be because another thread has installed the pte first, so it
 	 * is no problem.
 	 */
-	vm_insert_pfn(vma, (unsigned long)vmf->virtual_address, pfn);
+	vm_insert_pfn(vma, vmf->address, pfn);
 
 	return VM_FAULT_NOPAGE;
 }

@@ -84,7 +84,7 @@ EXPORT_SYMBOL_GPL(tpm_put_ops);
  *
  * The return'd chip has been tpm_try_get_ops'd and must be released via
  * tpm_put_ops
  */
 struct tpm_chip *tpm_chip_find_get(int chip_num)
 {
 	struct tpm_chip *chip, *res = NULL;
@@ -103,7 +103,7 @@ struct tpm_chip *tpm_chip_find_get(int chip_num)
 			}
 		} while (chip_prev != chip_num);
 	} else {
-		chip = idr_find_slowpath(&dev_nums_idr, chip_num);
+		chip = idr_find(&dev_nums_idr, chip_num);
 		if (chip && !tpm_try_get_ops(chip))
 			res = chip;
 	}

@@ -328,7 +328,6 @@ static phys_addr_t pgoff_to_phys(struct dax_dev *dax_dev, pgoff_t pgoff,
 static int __dax_dev_fault(struct dax_dev *dax_dev, struct vm_area_struct *vma,
 		struct vm_fault *vmf)
 {
-	unsigned long vaddr = (unsigned long) vmf->virtual_address;
 	struct device *dev = &dax_dev->dev;
 	struct dax_region *dax_region;
 	int rc = VM_FAULT_SIGBUS;
@@ -353,7 +352,7 @@ static int __dax_dev_fault(struct dax_dev *dax_dev, struct vm_area_struct *vma,
 
 	pfn = phys_to_pfn_t(phys, dax_region->pfn_flags);
 
-	rc = vm_insert_mixed(vma, vaddr, pfn);
+	rc = vm_insert_mixed(vma, vmf->address, pfn);
 
 	if (rc == -ENOMEM)
 		return VM_FAULT_OOM;

@@ -17,12 +17,11 @@
 static int armada_gem_vm_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
 {
 	struct armada_gem_object *obj = drm_to_armada_gem(vma->vm_private_data);
-	unsigned long addr = (unsigned long)vmf->virtual_address;
 	unsigned long pfn = obj->phys_addr >> PAGE_SHIFT;
 	int ret;
 
-	pfn += (addr - vma->vm_start) >> PAGE_SHIFT;
-	ret = vm_insert_pfn(vma, addr, pfn);
+	pfn += (vmf->address - vma->vm_start) >> PAGE_SHIFT;
+	ret = vm_insert_pfn(vma, vmf->address, pfn);
 
 	switch (ret) {
 	case 0:

@@ -124,8 +124,7 @@ static int drm_do_vm_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
 	 * Using vm_pgoff as a selector forces us to use this unusual
 	 * addressing scheme.
 	 */
-	resource_size_t offset = (unsigned long)vmf->virtual_address -
-		vma->vm_start;
+	resource_size_t offset = vmf->address - vma->vm_start;
 	resource_size_t baddr = map->offset + offset;
 	struct drm_agp_mem *agpmem;
 	struct page *page;
@@ -195,7 +194,7 @@ static int drm_do_vm_shm_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
 	if (!map)
 		return VM_FAULT_SIGBUS;	/* Nothing allocated */
 
-	offset = (unsigned long)vmf->virtual_address - vma->vm_start;
+	offset = vmf->address - vma->vm_start;
 	i = (unsigned long)map->handle + offset;
 	page = vmalloc_to_page((void *)i);
 	if (!page)
@@ -301,7 +300,8 @@ static int drm_do_vm_dma_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
 	if (!dma->pagelist)
 		return VM_FAULT_SIGBUS;	/* Nothing allocated */
 
-	offset = (unsigned long)vmf->virtual_address - vma->vm_start;	/* vm_[pg]off[set] should be 0 */
+	offset = vmf->address - vma->vm_start;
+					/* vm_[pg]off[set] should be 0 */
 	page_nr = offset >> PAGE_SHIFT; /* page_nr could just be vmf->pgoff */
 	page = virt_to_page((void *)dma->pagelist[page_nr]);
@@ -337,7 +337,7 @@ static int drm_do_vm_sg_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
 	if (!entry->pagelist)
 		return VM_FAULT_SIGBUS;	/* Nothing allocated */
 
-	offset = (unsigned long)vmf->virtual_address - vma->vm_start;
+	offset = vmf->address - vma->vm_start;
 	map_offset = map->offset - (unsigned long)dev->sg->virtual;
 	page_offset = (offset >> PAGE_SHIFT) + (map_offset >> PAGE_SHIFT);
 	page = entry->pagelist[page_offset];

@@ -202,15 +202,14 @@ int etnaviv_gem_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
 	}
 
 	/* We don't use vmf->pgoff since that has the fake offset: */
-	pgoff = ((unsigned long)vmf->virtual_address -
-			vma->vm_start) >> PAGE_SHIFT;
+	pgoff = (vmf->address - vma->vm_start) >> PAGE_SHIFT;
 
 	page = pages[pgoff];
 
-	VERB("Inserting %p pfn %lx, pa %lx", vmf->virtual_address,
+	VERB("Inserting %p pfn %lx, pa %lx", (void *)vmf->address,
 	     page_to_pfn(page), page_to_pfn(page) << PAGE_SHIFT);
 
-	ret = vm_insert_page(vma, (unsigned long)vmf->virtual_address, page);
+	ret = vm_insert_page(vma, vmf->address, page);
 
 out:
 	switch (ret) {
@@ -759,7 +758,7 @@ static struct page **etnaviv_gem_userptr_do_get_pages(
 	down_read(&mm->mmap_sem);
 	while (pinned < npages) {
 		ret = get_user_pages_remote(task, mm, ptr, npages - pinned,
-					    flags, pvec + pinned, NULL);
+					    flags, pvec + pinned, NULL, NULL);
 		if (ret < 0)
 			break;

@@ -455,8 +455,7 @@ int exynos_drm_gem_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
 	pgoff_t page_offset;
 	int ret;
 
-	page_offset = ((unsigned long)vmf->virtual_address -
-			vma->vm_start) >> PAGE_SHIFT;
+	page_offset = (vmf->address - vma->vm_start) >> PAGE_SHIFT;
 
 	if (page_offset >= (exynos_gem->size >> PAGE_SHIFT)) {
 		DRM_ERROR("invalid page offset\n");
@@ -465,8 +464,7 @@ int exynos_drm_gem_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
 	}
 
 	pfn = page_to_pfn(exynos_gem->pages[page_offset]);
-	ret = vm_insert_mixed(vma, (unsigned long)vmf->virtual_address,
-			__pfn_to_pfn_t(pfn, PFN_DEV));
+	ret = vm_insert_mixed(vma, vmf->address, __pfn_to_pfn_t(pfn, PFN_DEV));
 
 out:
 	switch (ret) {

@@ -125,7 +125,7 @@ static int psbfb_vm_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
 		  psbfb->gtt->offset;
 
 	page_num = vma_pages(vma);
-	address = (unsigned long)vmf->virtual_address - (vmf->pgoff << PAGE_SHIFT);
+	address = vmf->address - (vmf->pgoff << PAGE_SHIFT);
 
 	vma->vm_page_prot = pgprot_noncached(vma->vm_page_prot);

@@ -197,15 +197,14 @@ int psb_gem_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
 
 	/* Page relative to the VMA start - we must calculate this ourselves
 	   because vmf->pgoff is the fake GEM offset */
-	page_offset = ((unsigned long) vmf->virtual_address - vma->vm_start)
-				>> PAGE_SHIFT;
+	page_offset = (vmf->address - vma->vm_start) >> PAGE_SHIFT;
 
 	/* CPU view of the page, don't go via the GART for CPU writes */
 	if (r->stolen)
 		pfn = (dev_priv->stolen_base + r->offset) >> PAGE_SHIFT;
 	else
 		pfn = page_to_pfn(r->pages[page_offset]);
-	ret = vm_insert_pfn(vma, (unsigned long)vmf->virtual_address, pfn);
+	ret = vm_insert_pfn(vma, vmf->address, pfn);
 
 fail:
 	mutex_unlock(&dev_priv->mmap_mutex);

@@ -1796,8 +1796,7 @@ int i915_gem_fault(struct vm_area_struct *area, struct vm_fault *vmf)
 	int ret;
 
 	/* We don't use vmf->pgoff since that has the fake offset */
-	page_offset = ((unsigned long)vmf->virtual_address - area->vm_start) >>
-		PAGE_SHIFT;
+	page_offset = (vmf->address - area->vm_start) >> PAGE_SHIFT;
 
 	trace_i915_gem_object_fault(obj, page_offset, true, write);

@@ -515,7 +515,7 @@ __i915_gem_userptr_get_pages_worker(struct work_struct *_work)
 					obj->userptr.ptr + pinned * PAGE_SIZE,
 					npages - pinned,
 					flags,
-					pvec + pinned, NULL);
+					pvec + pinned, NULL, NULL);
 			if (ret < 0)
 				break;

@@ -225,16 +225,14 @@ int msm_gem_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
 	}
 
 	/* We don't use vmf->pgoff since that has the fake offset: */
-	pgoff = ((unsigned long)vmf->virtual_address -
-			vma->vm_start) >> PAGE_SHIFT;
+	pgoff = (vmf->address - vma->vm_start) >> PAGE_SHIFT;
 
 	pfn = page_to_pfn(pages[pgoff]);
 
-	VERB("Inserting %p pfn %lx, pa %lx", vmf->virtual_address,
+	VERB("Inserting %p pfn %lx, pa %lx", (void *)vmf->address,
 			pfn, pfn << PAGE_SHIFT);
 
-	ret = vm_insert_mixed(vma, (unsigned long)vmf->virtual_address,
-			__pfn_to_pfn_t(pfn, PFN_DEV));
+	ret = vm_insert_mixed(vma, vmf->address, __pfn_to_pfn_t(pfn, PFN_DEV));
 
 out_unlock:
 	mutex_unlock(&dev->struct_mutex);

@@ -398,8 +398,7 @@ static int fault_1d(struct drm_gem_object *obj,
 	pgoff_t pgoff;
 
 	/* We don't use vmf->pgoff since that has the fake offset: */
-	pgoff = ((unsigned long)vmf->virtual_address -
-			vma->vm_start) >> PAGE_SHIFT;
+	pgoff = (vmf->address - vma->vm_start) >> PAGE_SHIFT;
 
 	if (omap_obj->pages) {
 		omap_gem_cpu_sync(obj, pgoff);
@@ -409,11 +408,10 @@ static int fault_1d(struct drm_gem_object *obj,
 		pfn = (omap_obj->paddr >> PAGE_SHIFT) + pgoff;
 	}
 
-	VERB("Inserting %p pfn %lx, pa %lx", vmf->virtual_address,
+	VERB("Inserting %p pfn %lx, pa %lx", (void *)vmf->address,
 			pfn, pfn << PAGE_SHIFT);
 
-	return vm_insert_mixed(vma, (unsigned long)vmf->virtual_address,
-			__pfn_to_pfn_t(pfn, PFN_DEV));
+	return vm_insert_mixed(vma, vmf->address, __pfn_to_pfn_t(pfn, PFN_DEV));
 }
 
 /* Special handling for the case of faulting in 2d tiled buffers */
@@ -427,7 +425,7 @@ static int fault_2d(struct drm_gem_object *obj,
 	struct page *pages[64];  /* XXX is this too much to have on stack? */
 	unsigned long pfn;
 	pgoff_t pgoff, base_pgoff;
-	void __user *vaddr;
+	unsigned long vaddr;
 	int i, ret, slots;
 
 	/*
@@ -447,8 +445,7 @@ static int fault_2d(struct drm_gem_object *obj,
 	const int m = 1 + ((omap_obj->width << fmt) / PAGE_SIZE);
 
 	/* We don't use vmf->pgoff since that has the fake offset: */
-	pgoff = ((unsigned long)vmf->virtual_address -
-			vma->vm_start) >> PAGE_SHIFT;
+	pgoff = (vmf->address - vma->vm_start) >> PAGE_SHIFT;
 
 	/*
 	 * Actual address we start mapping at is rounded down to previous slot
@@ -459,7 +456,7 @@ static int fault_2d(struct drm_gem_object *obj,
 	/* figure out buffer width in slots */
 	slots = omap_obj->width >> priv->usergart[fmt].slot_shift;
 
-	vaddr = vmf->virtual_address - ((pgoff - base_pgoff) << PAGE_SHIFT);
+	vaddr = vmf->address - ((pgoff - base_pgoff) << PAGE_SHIFT);
 
 	entry = &priv->usergart[fmt].entry[priv->usergart[fmt].last];
@@ -503,12 +500,11 @@ static int fault_2d(struct drm_gem_object *obj,
 	pfn = entry->paddr >> PAGE_SHIFT;
 
-	VERB("Inserting %p pfn %lx, pa %lx", vmf->virtual_address,
+	VERB("Inserting %p pfn %lx, pa %lx", (void *)vmf->address,
 			pfn, pfn << PAGE_SHIFT);
 
 	for (i = n; i > 0; i--) {
-		vm_insert_mixed(vma, (unsigned long)vaddr,
-				__pfn_to_pfn_t(pfn, PFN_DEV));
+		vm_insert_mixed(vma, vaddr, __pfn_to_pfn_t(pfn, PFN_DEV));
 		pfn += priv->usergart[fmt].stride_pfn;
 		vaddr += PAGE_SIZE * m;
 	}

@@ -452,10 +452,10 @@ static int tegra_bo_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
 	if (!bo->pages)
 		return VM_FAULT_SIGBUS;
 
-	offset = ((unsigned long)vmf->virtual_address - vma->vm_start) >> PAGE_SHIFT;
+	offset = (vmf->address - vma->vm_start) >> PAGE_SHIFT;
 	page = bo->pages[offset];
 
-	err = vm_insert_page(vma, (unsigned long)vmf->virtual_address, page);
+	err = vm_insert_page(vma, vmf->address, page);
 	switch (err) {
 	case -EAGAIN:
 	case 0:

@@ -101,7 +101,7 @@ static int ttm_bo_vm_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
 	struct page *page;
 	int ret;
 	int i;
-	unsigned long address = (unsigned long)vmf->virtual_address;
+	unsigned long address = vmf->address;
 	int retval = VM_FAULT_NOPAGE;
 	struct ttm_mem_type_manager *man =
 		&bdev->man[bo->mem.mem_type];

@@ -107,14 +107,13 @@ int udl_gem_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
 	unsigned int page_offset;
 	int ret = 0;
 
-	page_offset = ((unsigned long)vmf->virtual_address - vma->vm_start) >>
-		PAGE_SHIFT;
+	page_offset = (vmf->address - vma->vm_start) >> PAGE_SHIFT;
 
 	if (!obj->pages)
 		return VM_FAULT_SIGBUS;
 
 	page = obj->pages[page_offset];
-	ret = vm_insert_page(vma, (unsigned long)vmf->virtual_address, page);
+	ret = vm_insert_page(vma, vmf->address, page);
 	switch (ret) {
 	case -EAGAIN:
 	case 0:

@@ -54,7 +54,7 @@ static int vgem_gem_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
 {
 	struct drm_vgem_gem_object *obj = vma->vm_private_data;
 	/* We don't use vmf->pgoff since that has the fake offset */
-	unsigned long vaddr = (unsigned long)vmf->virtual_address;
+	unsigned long vaddr = vmf->address;
 	struct page *page;
 
 	page = shmem_read_mapping_page(file_inode(obj->base.filp)->i_mapping,

@@ -578,7 +578,7 @@ int ib_umem_odp_map_dma_pages(struct ib_umem *umem, u64 user_virt, u64 bcnt,
 		 */
 		npages = get_user_pages_remote(owning_process, owning_mm,
 				user_virt, gup_num_pages,
-				flags, local_page_list, NULL);
+				flags, local_page_list, NULL, NULL);
 		up_read(&owning_mm->mmap_sem);
 
 		if (npages < 0)
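The extra NULL appended to the converted get_user_pages_remote() callers here and in the i915/etnaviv hunks is the new "locked" parameter; passing NULL keeps the old behaviour. A caller that wants to let the core drop and retake mmap_sem for it might use the parameter roughly like this sketch (the wrapper name is hypothetical; only the get_user_pages_remote() call itself is from the kernel API):

#include <linux/mm.h>
#include <linux/sched.h>

static long example_pin_pages(struct task_struct *tsk, struct mm_struct *mm,
			      unsigned long start, unsigned long nr_pages,
			      unsigned int gup_flags, struct page **pages)
{
	int locked = 1;
	long npages;

	down_read(&mm->mmap_sem);
	npages = get_user_pages_remote(tsk, mm, start, nr_pages, gup_flags,
				       pages, NULL, &locked);
	/* If the core dropped mmap_sem on our behalf, it cleared "locked". */
	if (locked)
		up_read(&mm->mmap_sem);

	return npages;
}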

@@ -439,13 +439,12 @@ static int videobuf_vm_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
 	struct page *page;
 
 	dprintk(3, "fault: fault @ %08lx [vma %08lx-%08lx]\n",
-		(unsigned long)vmf->virtual_address,
-		vma->vm_start, vma->vm_end);
+		vmf->address, vma->vm_start, vma->vm_end);
 	page = alloc_page(GFP_USER | __GFP_DMA32);
 	if (!page)
 		return VM_FAULT_OOM;
-	clear_user_highpage(page, (unsigned long)vmf->virtual_address);
+	clear_user_highpage(page, vmf->address);
 	vmf->page = page;
 
 	return 0;

@@ -117,13 +117,12 @@ int cxl_context_init(struct cxl_context *ctx, struct cxl_afu *afu, bool master,
 static int cxl_mmap_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
 {
 	struct cxl_context *ctx = vma->vm_file->private_data;
-	unsigned long address = (unsigned long)vmf->virtual_address;
 	u64 area, offset;
 
 	offset = vmf->pgoff << PAGE_SHIFT;
 
 	pr_devel("%s: pe: %i address: 0x%lx offset: 0x%llx\n",
-			__func__, ctx->pe, address, offset);
+			__func__, ctx->pe, vmf->address, offset);
 
 	if (ctx->afu->current_mode == CXL_MODE_DEDICATED) {
 		area = ctx->afu->psn_phys;
@@ -155,7 +154,7 @@ static int cxl_mmap_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
 		return VM_FAULT_SIGBUS;
 	}
 
-	vm_insert_pfn(vma, address, (area + offset) >> PAGE_SHIFT);
+	vm_insert_pfn(vma, vmf->address, (area + offset) >> PAGE_SHIFT);
 
 	mutex_unlock(&ctx->status_mutex);

@@ -932,7 +932,7 @@ int gru_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
 	unsigned long paddr, vaddr;
 	unsigned long expires;
 
-	vaddr = (unsigned long)vmf->virtual_address;
+	vaddr = vmf->address;
 	gru_dbg(grudev, "vma %p, vaddr 0x%lx (0x%lx)\n",
 		vma, vaddr, GSEG_BASE(vaddr));
 	STAT(nopfn);

@@ -210,7 +210,12 @@ struct igb_tx_buffer {
 struct igb_rx_buffer {
 	dma_addr_t dma;
 	struct page *page;
-	unsigned int page_offset;
+#if (BITS_PER_LONG > 32) || (PAGE_SIZE >= 65536)
+	__u32 page_offset;
+#else
+	__u16 page_offset;
+#endif
+	__u16 pagecnt_bias;
 };
 
 struct igb_tx_queue_stats {
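The new pagecnt_bias field lets the driver take a large block of page references up front and pay only a local decrement per recycled buffer, instead of a page_ref_inc() per frame; the igb_main.c hunks below compare page_ref_count() against the bias to detect references held by anyone else. A stripped-down sketch of the idea, with illustrative struct and function names rather than the driver's own:

#include <linux/kernel.h>
#include <linux/mm.h>

/* Hypothetical per-buffer state mirroring the new igb_rx_buffer member. */
struct example_rx_buffer {
	struct page *page;
	u16 pagecnt_bias;
};

static bool example_can_reuse_page(struct example_rx_buffer *rxb)
{
	/* Local copy holds the pre-decrement value, as in the igb code. */
	unsigned int pagecnt_bias = rxb->pagecnt_bias--;
	struct page *page = rxb->page;

	/* Someone other than the driver holds a reference: give the page up. */
	if (page_ref_count(page) != pagecnt_bias)
		return false;

	/* Refill the bulk of references once the local budget runs low, so
	 * one page_ref_add() replaces a per-frame page_ref_inc().
	 */
	if (unlikely(pagecnt_bias == 1)) {
		page_ref_add(page, USHRT_MAX);
		rxb->pagecnt_bias = USHRT_MAX;
	}

	return true;
}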

@@ -3947,11 +3947,23 @@ static void igb_clean_rx_ring(struct igb_ring *rx_ring)
 		if (!buffer_info->page)
 			continue;
 
-		dma_unmap_page(rx_ring->dev,
-			       buffer_info->dma,
-			       PAGE_SIZE,
-			       DMA_FROM_DEVICE);
-		__free_page(buffer_info->page);
+		/* Invalidate cache lines that may have been written to by
+		 * device so that we avoid corrupting memory.
+		 */
+		dma_sync_single_range_for_cpu(rx_ring->dev,
+					      buffer_info->dma,
+					      buffer_info->page_offset,
+					      IGB_RX_BUFSZ,
+					      DMA_FROM_DEVICE);
+
+		/* free resources associated with mapping */
+		dma_unmap_page_attrs(rx_ring->dev,
+				     buffer_info->dma,
+				     PAGE_SIZE,
+				     DMA_FROM_DEVICE,
+				     DMA_ATTR_SKIP_CPU_SYNC);
+		__page_frag_drain(buffer_info->page, 0,
+				  buffer_info->pagecnt_bias);
 
 		buffer_info->page = NULL;
 	}
@@ -6812,12 +6824,6 @@ static void igb_reuse_rx_page(struct igb_ring *rx_ring,
 
 	/* transfer page from old buffer to new buffer */
 	*new_buff = *old_buff;
-
-	/* sync the buffer for use by the device */
-	dma_sync_single_range_for_device(rx_ring->dev, old_buff->dma,
-					 old_buff->page_offset,
-					 IGB_RX_BUFSZ,
-					 DMA_FROM_DEVICE);
 }
 
 static inline bool igb_page_is_reserved(struct page *page)
@@ -6829,13 +6835,15 @@ static bool igb_can_reuse_rx_page(struct igb_rx_buffer *rx_buffer,
 				  struct page *page,
 				  unsigned int truesize)
 {
+	unsigned int pagecnt_bias = rx_buffer->pagecnt_bias--;
+
 	/* avoid re-using remote pages */
 	if (unlikely(igb_page_is_reserved(page)))
 		return false;
 
 #if (PAGE_SIZE < 8192)
 	/* if we are only owner of page we can reuse it */
-	if (unlikely(page_count(page) != 1))
+	if (unlikely(page_ref_count(page) != pagecnt_bias))
 		return false;
 
 	/* flip page offset to other buffer */
@@ -6848,10 +6856,14 @@ static bool igb_can_reuse_rx_page(struct igb_rx_buffer *rx_buffer,
 		return false;
 #endif
 
-	/* Even if we own the page, we are not allowed to use atomic_set()
-	 * This would break get_page_unless_zero() users.
+	/* If we have drained the page fragment pool we need to update
+	 * the pagecnt_bias and page count so that we fully restock the
+	 * number of references the driver holds.
 	 */
-	page_ref_inc(page);
+	if (unlikely(pagecnt_bias == 1)) {
+		page_ref_add(page, USHRT_MAX);
+		rx_buffer->pagecnt_bias = USHRT_MAX;
+	}
 
 	return true;
 }
@@ -6903,7 +6915,6 @@ static bool igb_add_rx_frag(struct igb_ring *rx_ring,
 			return true;
 
 		/* this page cannot be reused so discard it */
-		__free_page(page);
 		return false;
 	}
@@ -6938,6 +6949,13 @@ static struct sk_buff *igb_fetch_rx_buffer(struct igb_ring *rx_ring,
 	page = rx_buffer->page;
 	prefetchw(page);
 
+	/* we are reusing so sync this buffer for CPU use */
+	dma_sync_single_range_for_cpu(rx_ring->dev,
+				      rx_buffer->dma,
+				      rx_buffer->page_offset,
+				      size,
+				      DMA_FROM_DEVICE);
+
 	if (likely(!skb)) {
 		void *page_addr = page_address(page) +
 				  rx_buffer->page_offset;
@@ -6962,21 +6980,18 @@ static struct sk_buff *igb_fetch_rx_buffer(struct igb_ring *rx_ring,
 		prefetchw(skb->data);
 	}
 
-	/* we are reusing so sync this buffer for CPU use */
-	dma_sync_single_range_for_cpu(rx_ring->dev,
-				      rx_buffer->dma,
-				      rx_buffer->page_offset,
-				      size,
-				      DMA_FROM_DEVICE);
-
 	/* pull page into skb */
 	if (igb_add_rx_frag(rx_ring, rx_buffer, size, rx_desc, skb)) {
 		/* hand second half of page back to the ring */
 		igb_reuse_rx_page(rx_ring, rx_buffer);
 	} else {
-		/* we are not reusing the buffer so unmap it */
-		dma_unmap_page(rx_ring->dev, rx_buffer->dma,
-			       PAGE_SIZE, DMA_FROM_DEVICE);
+		/* We are not reusing the buffer so unmap it and free
+		 * any references we are holding to it
+		 */
+		dma_unmap_page_attrs(rx_ring->dev, rx_buffer->dma,
+				     PAGE_SIZE, DMA_FROM_DEVICE,
+				     DMA_ATTR_SKIP_CPU_SYNC);
+		__page_frag_drain(page, 0, rx_buffer->pagecnt_bias);
 	}
 
 	/* clear contents of rx_buffer */
@@ -7234,7 +7249,8 @@ static bool igb_alloc_mapped_page(struct igb_ring *rx_ring,
 	}
 
 	/* map page for use */
dma = dma_map_page(rx_ring->dev, page, 0, PAGE_SIZE, DMA_FROM_DEVICE); dma = dma_map_page_attrs(rx_ring->dev, page, 0, PAGE_SIZE,
DMA_FROM_DEVICE, DMA_ATTR_SKIP_CPU_SYNC);
/* if mapping failed free memory back to system since /* if mapping failed free memory back to system since
* there isn't much point in holding memory we can't use * there isn't much point in holding memory we can't use
@@ -7249,6 +7265,7 @@ static bool igb_alloc_mapped_page(struct igb_ring *rx_ring,
bi->dma = dma; bi->dma = dma;
bi->page = page; bi->page = page;
bi->page_offset = 0; bi->page_offset = 0;
bi->pagecnt_bias = 1;
return true; return true;
} }
@@ -7275,6 +7292,12 @@ void igb_alloc_rx_buffers(struct igb_ring *rx_ring, u16 cleaned_count)
if (!igb_alloc_mapped_page(rx_ring, bi)) if (!igb_alloc_mapped_page(rx_ring, bi))
break; break;
/* sync the buffer for use by the device */
dma_sync_single_range_for_device(rx_ring->dev, bi->dma,
bi->page_offset,
IGB_RX_BUFSZ,
DMA_FROM_DEVICE);
/* Refresh the desc even if buffer_addrs didn't change /* Refresh the desc even if buffer_addrs didn't change
* because each write-back erases this info. * because each write-back erases this info.
*/ */
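
Not part of the patch: a minimal sketch of the Rx-buffer DMA pattern the igb changes above adopt. The page is mapped once with DMA_ATTR_SKIP_CPU_SYNC, only the fragment the device actually wrote is synced before the CPU reads it, and the region is synced back for the device when the half-page is recycled. struct example_rx_buffer, example_refill() and example_receive() are illustrative placeholders, not driver code; <linux/dma-mapping.h> and <linux/skbuff.h> (for dev_alloc_page()) are assumed.

struct example_rx_buffer {
	dma_addr_t dma;
	struct page *page;
	unsigned int page_offset;
};

/* Allocate and map a page without an up-front CPU sync. */
static int example_refill(struct device *dev, struct example_rx_buffer *buf)
{
	struct page *page = dev_alloc_page();

	if (!page)
		return -ENOMEM;

	buf->dma = dma_map_page_attrs(dev, page, 0, PAGE_SIZE,
				      DMA_FROM_DEVICE,
				      DMA_ATTR_SKIP_CPU_SYNC);
	if (dma_mapping_error(dev, buf->dma)) {
		__free_page(page);
		return -ENOMEM;
	}
	buf->page = page;
	buf->page_offset = 0;
	return 0;
}

/* Sync only the region the device wrote before the CPU touches it. */
static void example_receive(struct device *dev, struct example_rx_buffer *buf,
			    unsigned int size)
{
	dma_sync_single_range_for_cpu(dev, buf->dma, buf->page_offset,
				      size, DMA_FROM_DEVICE);
	/* ... attach the fragment to an skb here ... */
	dma_sync_single_range_for_device(dev, buf->dma, buf->page_offset,
					 size, DMA_FROM_DEVICE);
}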


@@ -900,8 +900,7 @@ static void iwlagn_gain_computation(struct iwl_priv *priv,
/* bound gain by 2 bits value max, 3rd bit is sign */ /* bound gain by 2 bits value max, 3rd bit is sign */
data->delta_gain_code[i] = data->delta_gain_code[i] =
min(abs(delta_g), min(abs(delta_g), CHAIN_NOISE_MAX_DELTA_GAIN_CODE);
(s32) CHAIN_NOISE_MAX_DELTA_GAIN_CODE);
if (delta_g < 0) if (delta_g < 0)
/* /*


@@ -882,7 +882,7 @@ static int ion_vm_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
BUG_ON(!buffer->pages || !buffer->pages[vmf->pgoff]); BUG_ON(!buffer->pages || !buffer->pages[vmf->pgoff]);
pfn = page_to_pfn(ion_buffer_page(buffer->pages[vmf->pgoff])); pfn = page_to_pfn(ion_buffer_page(buffer->pages[vmf->pgoff]));
ret = vm_insert_pfn(vma, (unsigned long)vmf->virtual_address, pfn); ret = vm_insert_pfn(vma, vmf->address, pfn);
mutex_unlock(&buffer->lock); mutex_unlock(&buffer->lock);
if (ret) if (ret)
return VM_FAULT_ERROR; return VM_FAULT_ERROR;


@@ -1014,7 +1014,7 @@ static int vvp_io_kernel_fault(struct vvp_fault_io *cfio)
"page %p map %p index %lu flags %lx count %u priv %0lx: got addr %p type NOPAGE\n", "page %p map %p index %lu flags %lx count %u priv %0lx: got addr %p type NOPAGE\n",
vmf->page, vmf->page->mapping, vmf->page->index, vmf->page, vmf->page->mapping, vmf->page->index,
(long)vmf->page->flags, page_count(vmf->page), (long)vmf->page->flags, page_count(vmf->page),
page_private(vmf->page), vmf->virtual_address); page_private(vmf->page), (void *)vmf->address);
if (unlikely(!(cfio->ft_flags & VM_FAULT_LOCKED))) { if (unlikely(!(cfio->ft_flags & VM_FAULT_LOCKED))) {
lock_page(vmf->page); lock_page(vmf->page);
cfio->ft_flags |= VM_FAULT_LOCKED; cfio->ft_flags |= VM_FAULT_LOCKED;
@@ -1025,12 +1025,12 @@ static int vvp_io_kernel_fault(struct vvp_fault_io *cfio)
} }
if (cfio->ft_flags & (VM_FAULT_SIGBUS | VM_FAULT_SIGSEGV)) { if (cfio->ft_flags & (VM_FAULT_SIGBUS | VM_FAULT_SIGSEGV)) {
CDEBUG(D_PAGE, "got addr %p - SIGBUS\n", vmf->virtual_address); CDEBUG(D_PAGE, "got addr %p - SIGBUS\n", (void *)vmf->address);
return -EFAULT; return -EFAULT;
} }
if (cfio->ft_flags & VM_FAULT_OOM) { if (cfio->ft_flags & VM_FAULT_OOM) {
CDEBUG(D_PAGE, "got addr %p - OOM\n", vmf->virtual_address); CDEBUG(D_PAGE, "got addr %p - OOM\n", (void *)vmf->address);
return -ENOMEM; return -ENOMEM;
} }


@@ -905,7 +905,7 @@ static void hidg_free_inst(struct usb_function_instance *f)
mutex_lock(&hidg_ida_lock); mutex_lock(&hidg_ida_lock);
hidg_put_minor(opts->minor); hidg_put_minor(opts->minor);
if (idr_is_empty(&hidg_ida.idr)) if (ida_is_empty(&hidg_ida))
ghid_cleanup(); ghid_cleanup();
mutex_unlock(&hidg_ida_lock); mutex_unlock(&hidg_ida_lock);
@@ -931,7 +931,7 @@ static struct usb_function_instance *hidg_alloc_inst(void)
mutex_lock(&hidg_ida_lock); mutex_lock(&hidg_ida_lock);
if (idr_is_empty(&hidg_ida.idr)) { if (ida_is_empty(&hidg_ida)) {
status = ghid_setup(NULL, HIDG_MINORS); status = ghid_setup(NULL, HIDG_MINORS);
if (status) { if (status) {
ret = ERR_PTR(status); ret = ERR_PTR(status);
@@ -944,7 +944,7 @@ static struct usb_function_instance *hidg_alloc_inst(void)
if (opts->minor < 0) { if (opts->minor < 0) {
ret = ERR_PTR(opts->minor); ret = ERR_PTR(opts->minor);
kfree(opts); kfree(opts);
if (idr_is_empty(&hidg_ida.idr)) if (ida_is_empty(&hidg_ida))
ghid_cleanup(); ghid_cleanup();
goto unlock; goto unlock;
} }


@@ -1265,7 +1265,7 @@ static void gprinter_free_inst(struct usb_function_instance *f)
mutex_lock(&printer_ida_lock); mutex_lock(&printer_ida_lock);
gprinter_put_minor(opts->minor); gprinter_put_minor(opts->minor);
if (idr_is_empty(&printer_ida.idr)) if (ida_is_empty(&printer_ida))
gprinter_cleanup(); gprinter_cleanup();
mutex_unlock(&printer_ida_lock); mutex_unlock(&printer_ida_lock);
@@ -1289,7 +1289,7 @@ static struct usb_function_instance *gprinter_alloc_inst(void)
mutex_lock(&printer_ida_lock); mutex_lock(&printer_ida_lock);
if (idr_is_empty(&printer_ida.idr)) { if (ida_is_empty(&printer_ida)) {
status = gprinter_setup(PRINTER_MINORS); status = gprinter_setup(PRINTER_MINORS);
if (status) { if (status) {
ret = ERR_PTR(status); ret = ERR_PTR(status);
@@ -1302,7 +1302,7 @@ static struct usb_function_instance *gprinter_alloc_inst(void)
if (opts->minor < 0) { if (opts->minor < 0) {
ret = ERR_PTR(opts->minor); ret = ERR_PTR(opts->minor);
kfree(opts); kfree(opts);
if (idr_is_empty(&printer_ida.idr)) if (ida_is_empty(&printer_ida))
gprinter_cleanup(); gprinter_cleanup();
goto unlock; goto unlock;
} }


@@ -362,7 +362,7 @@ static int vaddr_get_pfn(struct mm_struct *mm, unsigned long vaddr,
down_read(&mm->mmap_sem); down_read(&mm->mmap_sem);
ret = get_user_pages_remote(NULL, mm, vaddr, 1, flags, page, ret = get_user_pages_remote(NULL, mm, vaddr, 1, flags, page,
NULL); NULL, NULL);
up_read(&mm->mmap_sem); up_read(&mm->mmap_sem);
} }


@@ -602,7 +602,7 @@ static int privcmd_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
{ {
printk(KERN_DEBUG "privcmd_fault: vma=%p %lx-%lx, pgoff=%lx, uv=%p\n", printk(KERN_DEBUG "privcmd_fault: vma=%p %lx-%lx, pgoff=%lx, uv=%p\n",
vma, vma->vm_start, vma->vm_end, vma, vma->vm_start, vma->vm_end,
vmf->pgoff, vmf->virtual_address); vmf->pgoff, (void *)vmf->address);
return VM_FAULT_SIGBUS; return VM_FAULT_SIGBUS;
} }


@@ -202,12 +202,12 @@ static struct ratelimit_state printk_limits[] = {
void btrfs_printk(const struct btrfs_fs_info *fs_info, const char *fmt, ...) void btrfs_printk(const struct btrfs_fs_info *fs_info, const char *fmt, ...)
{ {
struct super_block *sb = fs_info->sb; struct super_block *sb = fs_info->sb;
char lvl[PRINTK_MAX_SINGLE_HEADER_LEN + 1]; char lvl[PRINTK_MAX_SINGLE_HEADER_LEN + 1] = "\0";
struct va_format vaf; struct va_format vaf;
va_list args; va_list args;
const char *type = NULL;
int kern_level; int kern_level;
struct ratelimit_state *ratelimit; const char *type = logtypes[4];
struct ratelimit_state *ratelimit = &printk_limits[4];
va_start(args, fmt); va_start(args, fmt);
@@ -223,12 +223,6 @@ void btrfs_printk(const struct btrfs_fs_info *fs_info, const char *fmt, ...)
fmt += size; fmt += size;
} }
if (!type) {
*lvl = '\0';
type = logtypes[4];
ratelimit = &printk_limits[4];
}
vaf.fmt = fmt; vaf.fmt = fmt;
vaf.va = &args; vaf.va = &args;


@@ -162,6 +162,7 @@ void btrfs_free_dummy_fs_info(struct btrfs_fs_info *fs_info)
slot = radix_tree_iter_retry(&iter); slot = radix_tree_iter_retry(&iter);
continue; continue;
} }
slot = radix_tree_iter_resume(slot, &iter);
spin_unlock(&fs_info->buffer_lock); spin_unlock(&fs_info->buffer_lock);
free_extent_buffer_stale(eb); free_extent_buffer_stale(eb);
spin_lock(&fs_info->buffer_lock); spin_lock(&fs_info->buffer_lock);

fs/dax.c

@@ -31,6 +31,7 @@
#include <linux/vmstat.h> #include <linux/vmstat.h>
#include <linux/pfn_t.h> #include <linux/pfn_t.h>
#include <linux/sizes.h> #include <linux/sizes.h>
#include <linux/mmu_notifier.h>
#include <linux/iomap.h> #include <linux/iomap.h>
#include "internal.h" #include "internal.h"
@@ -240,6 +241,23 @@ static void *get_unlocked_mapping_entry(struct address_space *mapping,
} }
} }
static void dax_unlock_mapping_entry(struct address_space *mapping,
pgoff_t index)
{
void *entry, **slot;
spin_lock_irq(&mapping->tree_lock);
entry = __radix_tree_lookup(&mapping->page_tree, index, NULL, &slot);
if (WARN_ON_ONCE(!entry || !radix_tree_exceptional_entry(entry) ||
!slot_locked(mapping, slot))) {
spin_unlock_irq(&mapping->tree_lock);
return;
}
unlock_slot(mapping, slot);
spin_unlock_irq(&mapping->tree_lock);
dax_wake_mapping_entry_waiter(mapping, index, entry, false);
}
static void put_locked_mapping_entry(struct address_space *mapping, static void put_locked_mapping_entry(struct address_space *mapping,
pgoff_t index, void *entry) pgoff_t index, void *entry)
{ {
@@ -433,22 +451,6 @@ void dax_wake_mapping_entry_waiter(struct address_space *mapping,
__wake_up(wq, TASK_NORMAL, wake_all ? 0 : 1, &key); __wake_up(wq, TASK_NORMAL, wake_all ? 0 : 1, &key);
} }
void dax_unlock_mapping_entry(struct address_space *mapping, pgoff_t index)
{
void *entry, **slot;
spin_lock_irq(&mapping->tree_lock);
entry = __radix_tree_lookup(&mapping->page_tree, index, NULL, &slot);
if (WARN_ON_ONCE(!entry || !radix_tree_exceptional_entry(entry) ||
!slot_locked(mapping, slot))) {
spin_unlock_irq(&mapping->tree_lock);
return;
}
unlock_slot(mapping, slot);
spin_unlock_irq(&mapping->tree_lock);
dax_wake_mapping_entry_waiter(mapping, index, entry, false);
}
/* /*
* Delete exceptional DAX entry at @index from @mapping. Wait for radix tree * Delete exceptional DAX entry at @index from @mapping. Wait for radix tree
* entry to get unlocked before deleting it. * entry to get unlocked before deleting it.
@@ -500,10 +502,8 @@ static int dax_load_hole(struct address_space *mapping, void *entry,
/* This will replace locked radix tree entry with a hole page */ /* This will replace locked radix tree entry with a hole page */
page = find_or_create_page(mapping, vmf->pgoff, page = find_or_create_page(mapping, vmf->pgoff,
vmf->gfp_mask | __GFP_ZERO); vmf->gfp_mask | __GFP_ZERO);
if (!page) { if (!page)
put_locked_mapping_entry(mapping, vmf->pgoff, entry);
return VM_FAULT_OOM; return VM_FAULT_OOM;
}
vmf->page = page; vmf->page = page;
return VM_FAULT_LOCKED; return VM_FAULT_LOCKED;
} }
@@ -615,36 +615,107 @@ static void *dax_insert_mapping_entry(struct address_space *mapping,
return new_entry; return new_entry;
} }
static inline unsigned long
pgoff_address(pgoff_t pgoff, struct vm_area_struct *vma)
{
unsigned long address;
address = vma->vm_start + ((pgoff - vma->vm_pgoff) << PAGE_SHIFT);
VM_BUG_ON_VMA(address < vma->vm_start || address >= vma->vm_end, vma);
return address;
}
/* Walk all mappings of a given index of a file and writeprotect them */
static void dax_mapping_entry_mkclean(struct address_space *mapping,
pgoff_t index, unsigned long pfn)
{
struct vm_area_struct *vma;
pte_t *ptep;
pte_t pte;
spinlock_t *ptl;
bool changed;
i_mmap_lock_read(mapping);
vma_interval_tree_foreach(vma, &mapping->i_mmap, index, index) {
unsigned long address;
cond_resched();
if (!(vma->vm_flags & VM_SHARED))
continue;
address = pgoff_address(index, vma);
changed = false;
if (follow_pte(vma->vm_mm, address, &ptep, &ptl))
continue;
if (pfn != pte_pfn(*ptep))
goto unlock;
if (!pte_dirty(*ptep) && !pte_write(*ptep))
goto unlock;
flush_cache_page(vma, address, pfn);
pte = ptep_clear_flush(vma, address, ptep);
pte = pte_wrprotect(pte);
pte = pte_mkclean(pte);
set_pte_at(vma->vm_mm, address, ptep, pte);
changed = true;
unlock:
pte_unmap_unlock(ptep, ptl);
if (changed)
mmu_notifier_invalidate_page(vma->vm_mm, address);
}
i_mmap_unlock_read(mapping);
}
static int dax_writeback_one(struct block_device *bdev, static int dax_writeback_one(struct block_device *bdev,
struct address_space *mapping, pgoff_t index, void *entry) struct address_space *mapping, pgoff_t index, void *entry)
{ {
struct radix_tree_root *page_tree = &mapping->page_tree; struct radix_tree_root *page_tree = &mapping->page_tree;
struct radix_tree_node *node;
struct blk_dax_ctl dax; struct blk_dax_ctl dax;
void **slot; void *entry2, **slot;
int ret = 0; int ret = 0;
spin_lock_irq(&mapping->tree_lock);
/* /*
* Regular page slots are stabilized by the page lock even * A page got tagged dirty in DAX mapping? Something is seriously
* without the tree itself locked. These unlocked entries * wrong.
* need verification under the tree lock.
*/ */
if (!__radix_tree_lookup(page_tree, index, &node, &slot)) if (WARN_ON(!radix_tree_exceptional_entry(entry)))
goto unlock; return -EIO;
if (*slot != entry)
goto unlock;
/* another fsync thread may have already written back this entry */
if (!radix_tree_tag_get(page_tree, index, PAGECACHE_TAG_TOWRITE))
goto unlock;
spin_lock_irq(&mapping->tree_lock);
entry2 = get_unlocked_mapping_entry(mapping, index, &slot);
/* Entry got punched out / reallocated? */
if (!entry2 || !radix_tree_exceptional_entry(entry2))
goto put_unlocked;
/*
* Entry got reallocated elsewhere? No need to writeback. We have to
* compare sectors as we must not bail out due to difference in lockbit
* or entry type.
*/
if (dax_radix_sector(entry2) != dax_radix_sector(entry))
goto put_unlocked;
if (WARN_ON_ONCE(dax_is_empty_entry(entry) || if (WARN_ON_ONCE(dax_is_empty_entry(entry) ||
dax_is_zero_entry(entry))) { dax_is_zero_entry(entry))) {
ret = -EIO; ret = -EIO;
goto unlock; goto put_unlocked;
} }
/* Another fsync thread may have already written back this entry */
if (!radix_tree_tag_get(page_tree, index, PAGECACHE_TAG_TOWRITE))
goto put_unlocked;
/* Lock the entry to serialize with page faults */
entry = lock_slot(mapping, slot);
/*
* We can clear the tag now but we have to be careful so that concurrent
* dax_writeback_one() calls for the same index cannot finish before we
* actually flush the caches. This is achieved as the calls will look
* at the entry only under tree_lock and once they do that they will
* see the entry locked and wait for it to unlock.
*/
radix_tree_tag_clear(page_tree, index, PAGECACHE_TAG_TOWRITE);
spin_unlock_irq(&mapping->tree_lock);
/* /*
* Even if dax_writeback_mapping_range() was given a wbc->range_start * Even if dax_writeback_mapping_range() was given a wbc->range_start
* in the middle of a PMD, the 'index' we are given will be aligned to * in the middle of a PMD, the 'index' we are given will be aligned to
@@ -654,31 +725,40 @@ static int dax_writeback_one(struct block_device *bdev,
*/ */
dax.sector = dax_radix_sector(entry); dax.sector = dax_radix_sector(entry);
dax.size = PAGE_SIZE << dax_radix_order(entry); dax.size = PAGE_SIZE << dax_radix_order(entry);
spin_unlock_irq(&mapping->tree_lock);
/* /*
* We cannot hold tree_lock while calling dax_map_atomic() because it * We cannot hold tree_lock while calling dax_map_atomic() because it
* eventually calls cond_resched(). * eventually calls cond_resched().
*/ */
ret = dax_map_atomic(bdev, &dax); ret = dax_map_atomic(bdev, &dax);
if (ret < 0) if (ret < 0) {
put_locked_mapping_entry(mapping, index, entry);
return ret; return ret;
}
if (WARN_ON_ONCE(ret < dax.size)) { if (WARN_ON_ONCE(ret < dax.size)) {
ret = -EIO; ret = -EIO;
goto unmap; goto unmap;
} }
dax_mapping_entry_mkclean(mapping, index, pfn_t_to_pfn(dax.pfn));
wb_cache_pmem(dax.addr, dax.size); wb_cache_pmem(dax.addr, dax.size);
/*
* After we have flushed the cache, we can clear the dirty tag. There
* cannot be new dirty data in the pfn after the flush has completed as
* the pfn mappings are writeprotected and fault waits for mapping
* entry lock.
*/
spin_lock_irq(&mapping->tree_lock); spin_lock_irq(&mapping->tree_lock);
radix_tree_tag_clear(page_tree, index, PAGECACHE_TAG_TOWRITE); radix_tree_tag_clear(page_tree, index, PAGECACHE_TAG_DIRTY);
spin_unlock_irq(&mapping->tree_lock); spin_unlock_irq(&mapping->tree_lock);
unmap: unmap:
dax_unmap_atomic(bdev, &dax); dax_unmap_atomic(bdev, &dax);
put_locked_mapping_entry(mapping, index, entry);
return ret; return ret;
unlock: put_unlocked:
put_unlocked_mapping_entry(mapping, index, entry2);
spin_unlock_irq(&mapping->tree_lock); spin_unlock_irq(&mapping->tree_lock);
return ret; return ret;
} }
@@ -738,7 +818,7 @@ static int dax_insert_mapping(struct address_space *mapping,
struct block_device *bdev, sector_t sector, size_t size, struct block_device *bdev, sector_t sector, size_t size,
void **entryp, struct vm_area_struct *vma, struct vm_fault *vmf) void **entryp, struct vm_area_struct *vma, struct vm_fault *vmf)
{ {
unsigned long vaddr = (unsigned long)vmf->virtual_address; unsigned long vaddr = vmf->address;
struct blk_dax_ctl dax = { struct blk_dax_ctl dax = {
.sector = sector, .sector = sector,
.size = size, .size = size,
@@ -767,17 +847,27 @@ int dax_pfn_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
{ {
struct file *file = vma->vm_file; struct file *file = vma->vm_file;
struct address_space *mapping = file->f_mapping; struct address_space *mapping = file->f_mapping;
void *entry; void *entry, **slot;
pgoff_t index = vmf->pgoff; pgoff_t index = vmf->pgoff;
spin_lock_irq(&mapping->tree_lock); spin_lock_irq(&mapping->tree_lock);
entry = get_unlocked_mapping_entry(mapping, index, NULL); entry = get_unlocked_mapping_entry(mapping, index, &slot);
if (!entry || !radix_tree_exceptional_entry(entry)) if (!entry || !radix_tree_exceptional_entry(entry)) {
goto out; if (entry)
put_unlocked_mapping_entry(mapping, index, entry);
spin_unlock_irq(&mapping->tree_lock);
return VM_FAULT_NOPAGE;
}
radix_tree_tag_set(&mapping->page_tree, index, PAGECACHE_TAG_DIRTY); radix_tree_tag_set(&mapping->page_tree, index, PAGECACHE_TAG_DIRTY);
put_unlocked_mapping_entry(mapping, index, entry); entry = lock_slot(mapping, slot);
out:
spin_unlock_irq(&mapping->tree_lock); spin_unlock_irq(&mapping->tree_lock);
/*
* If we race with somebody updating the PTE and finish_mkwrite_fault()
* fails, we don't care. We need to return VM_FAULT_NOPAGE and retry
* the fault in either case.
*/
finish_mkwrite_fault(vmf);
put_locked_mapping_entry(mapping, index, entry);
return VM_FAULT_NOPAGE; return VM_FAULT_NOPAGE;
} }
EXPORT_SYMBOL_GPL(dax_pfn_mkwrite); EXPORT_SYMBOL_GPL(dax_pfn_mkwrite);
@@ -948,13 +1038,13 @@ int dax_iomap_fault(struct vm_area_struct *vma, struct vm_fault *vmf,
{ {
struct address_space *mapping = vma->vm_file->f_mapping; struct address_space *mapping = vma->vm_file->f_mapping;
struct inode *inode = mapping->host; struct inode *inode = mapping->host;
unsigned long vaddr = (unsigned long)vmf->virtual_address; unsigned long vaddr = vmf->address;
loff_t pos = (loff_t)vmf->pgoff << PAGE_SHIFT; loff_t pos = (loff_t)vmf->pgoff << PAGE_SHIFT;
sector_t sector; sector_t sector;
struct iomap iomap = { 0 }; struct iomap iomap = { 0 };
unsigned flags = IOMAP_FAULT; unsigned flags = IOMAP_FAULT;
int error, major = 0; int error, major = 0;
int locked_status = 0; int vmf_ret = 0;
void *entry; void *entry;
/* /*
@@ -1007,13 +1097,11 @@ int dax_iomap_fault(struct vm_area_struct *vma, struct vm_fault *vmf,
if (error) if (error)
goto finish_iomap; goto finish_iomap;
if (!radix_tree_exceptional_entry(entry)) {
vmf->page = entry; __SetPageUptodate(vmf->cow_page);
locked_status = VM_FAULT_LOCKED; vmf_ret = finish_fault(vmf);
} else { if (!vmf_ret)
vmf->entry = entry; vmf_ret = VM_FAULT_DONE_COW;
locked_status = VM_FAULT_DAX_LOCKED;
}
goto finish_iomap; goto finish_iomap;
} }
@@ -1030,7 +1118,7 @@ int dax_iomap_fault(struct vm_area_struct *vma, struct vm_fault *vmf,
case IOMAP_UNWRITTEN: case IOMAP_UNWRITTEN:
case IOMAP_HOLE: case IOMAP_HOLE:
if (!(vmf->flags & FAULT_FLAG_WRITE)) { if (!(vmf->flags & FAULT_FLAG_WRITE)) {
locked_status = dax_load_hole(mapping, entry, vmf); vmf_ret = dax_load_hole(mapping, entry, vmf);
break; break;
} }
/*FALLTHRU*/ /*FALLTHRU*/
@@ -1042,7 +1130,7 @@ int dax_iomap_fault(struct vm_area_struct *vma, struct vm_fault *vmf,
finish_iomap: finish_iomap:
if (ops->iomap_end) { if (ops->iomap_end) {
if (error) { if (error || (vmf_ret & VM_FAULT_ERROR)) {
/* keep previous error */ /* keep previous error */
ops->iomap_end(inode, pos, PAGE_SIZE, 0, flags, ops->iomap_end(inode, pos, PAGE_SIZE, 0, flags,
&iomap); &iomap);
@@ -1052,7 +1140,7 @@ int dax_iomap_fault(struct vm_area_struct *vma, struct vm_fault *vmf,
} }
} }
unlock_entry: unlock_entry:
if (!locked_status || error) if (vmf_ret != VM_FAULT_LOCKED || error)
put_locked_mapping_entry(mapping, vmf->pgoff, entry); put_locked_mapping_entry(mapping, vmf->pgoff, entry);
out: out:
if (error == -ENOMEM) if (error == -ENOMEM)
@@ -1060,9 +1148,9 @@ int dax_iomap_fault(struct vm_area_struct *vma, struct vm_fault *vmf,
/* -EBUSY is fine, somebody else faulted on the same PTE */ /* -EBUSY is fine, somebody else faulted on the same PTE */
if (error < 0 && error != -EBUSY) if (error < 0 && error != -EBUSY)
return VM_FAULT_SIGBUS | major; return VM_FAULT_SIGBUS | major;
if (locked_status) { if (vmf_ret) {
WARN_ON_ONCE(error); /* -EBUSY from ops->iomap_end? */ WARN_ON_ONCE(error); /* -EBUSY from ops->iomap_end? */
return locked_status; return vmf_ret;
} }
return VM_FAULT_NOPAGE | major; return VM_FAULT_NOPAGE | major;
} }


@@ -209,7 +209,7 @@ static struct page *get_arg_page(struct linux_binprm *bprm, unsigned long pos,
* doing the exec and bprm->mm is the new process's mm. * doing the exec and bprm->mm is the new process's mm.
*/ */
ret = get_user_pages_remote(current, bprm->mm, pos, 1, gup_flags, ret = get_user_pages_remote(current, bprm->mm, pos, 1, gup_flags,
&page, NULL); &page, NULL, NULL);
if (ret <= 0) if (ret <= 0)
return NULL; return NULL;
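
The conversions above pass NULL for the new trailing argument of get_user_pages_remote(), which keeps the old behaviour where the caller holds mmap_sem across the whole call. Not part of the patch: a hedged sketch, with placeholder names, of a caller that instead lets the helper drop the lock during a blocking fault and learn about it through *locked.

static struct page *example_grab_remote_page(struct task_struct *tsk,
					     struct mm_struct *mm,
					     unsigned long addr)
{
	struct page *page;
	int locked = 1;
	long nr;

	down_read(&mm->mmap_sem);
	nr = get_user_pages_remote(tsk, mm, addr, 1, FOLL_WRITE,
				   &page, NULL, &locked);
	if (locked)
		up_read(&mm->mmap_sem);	/* gup may have dropped it itself */
	return nr == 1 ? page : NULL;
}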


@@ -257,9 +257,9 @@ out:
* fatal_signal_pending()s, and the mmap_sem must be released before * fatal_signal_pending()s, and the mmap_sem must be released before
* returning it. * returning it.
*/ */
int handle_userfault(struct fault_env *fe, unsigned long reason) int handle_userfault(struct vm_fault *vmf, unsigned long reason)
{ {
struct mm_struct *mm = fe->vma->vm_mm; struct mm_struct *mm = vmf->vma->vm_mm;
struct userfaultfd_ctx *ctx; struct userfaultfd_ctx *ctx;
struct userfaultfd_wait_queue uwq; struct userfaultfd_wait_queue uwq;
int ret; int ret;
@@ -268,7 +268,7 @@ int handle_userfault(struct fault_env *fe, unsigned long reason)
BUG_ON(!rwsem_is_locked(&mm->mmap_sem)); BUG_ON(!rwsem_is_locked(&mm->mmap_sem));
ret = VM_FAULT_SIGBUS; ret = VM_FAULT_SIGBUS;
ctx = fe->vma->vm_userfaultfd_ctx.ctx; ctx = vmf->vma->vm_userfaultfd_ctx.ctx;
if (!ctx) if (!ctx)
goto out; goto out;
@@ -301,17 +301,18 @@ int handle_userfault(struct fault_env *fe, unsigned long reason)
* without first stopping userland access to the memory. For * without first stopping userland access to the memory. For
* VM_UFFD_MISSING userfaults this is enough for now. * VM_UFFD_MISSING userfaults this is enough for now.
*/ */
if (unlikely(!(fe->flags & FAULT_FLAG_ALLOW_RETRY))) { if (unlikely(!(vmf->flags & FAULT_FLAG_ALLOW_RETRY))) {
/* /*
* Validate the invariant that nowait must allow retry * Validate the invariant that nowait must allow retry
* to be sure not to return SIGBUS erroneously on * to be sure not to return SIGBUS erroneously on
* nowait invocations. * nowait invocations.
*/ */
BUG_ON(fe->flags & FAULT_FLAG_RETRY_NOWAIT); BUG_ON(vmf->flags & FAULT_FLAG_RETRY_NOWAIT);
#ifdef CONFIG_DEBUG_VM #ifdef CONFIG_DEBUG_VM
if (printk_ratelimit()) { if (printk_ratelimit()) {
printk(KERN_WARNING printk(KERN_WARNING
"FAULT_FLAG_ALLOW_RETRY missing %x\n", fe->flags); "FAULT_FLAG_ALLOW_RETRY missing %x\n",
vmf->flags);
dump_stack(); dump_stack();
} }
#endif #endif
@@ -323,7 +324,7 @@ int handle_userfault(struct fault_env *fe, unsigned long reason)
* and wait. * and wait.
*/ */
ret = VM_FAULT_RETRY; ret = VM_FAULT_RETRY;
if (fe->flags & FAULT_FLAG_RETRY_NOWAIT) if (vmf->flags & FAULT_FLAG_RETRY_NOWAIT)
goto out; goto out;
/* take the reference before dropping the mmap_sem */ /* take the reference before dropping the mmap_sem */
@@ -331,11 +332,11 @@ int handle_userfault(struct fault_env *fe, unsigned long reason)
init_waitqueue_func_entry(&uwq.wq, userfaultfd_wake_function); init_waitqueue_func_entry(&uwq.wq, userfaultfd_wake_function);
uwq.wq.private = current; uwq.wq.private = current;
uwq.msg = userfault_msg(fe->address, fe->flags, reason); uwq.msg = userfault_msg(vmf->address, vmf->flags, reason);
uwq.ctx = ctx; uwq.ctx = ctx;
return_to_userland = return_to_userland =
(fe->flags & (FAULT_FLAG_USER|FAULT_FLAG_KILLABLE)) == (vmf->flags & (FAULT_FLAG_USER|FAULT_FLAG_KILLABLE)) ==
(FAULT_FLAG_USER|FAULT_FLAG_KILLABLE); (FAULT_FLAG_USER|FAULT_FLAG_KILLABLE);
spin_lock(&ctx->fault_pending_wqh.lock); spin_lock(&ctx->fault_pending_wqh.lock);
@@ -353,7 +354,8 @@ int handle_userfault(struct fault_env *fe, unsigned long reason)
TASK_KILLABLE); TASK_KILLABLE);
spin_unlock(&ctx->fault_pending_wqh.lock); spin_unlock(&ctx->fault_pending_wqh.lock);
must_wait = userfaultfd_must_wait(ctx, fe->address, fe->flags, reason); must_wait = userfaultfd_must_wait(ctx, vmf->address, vmf->flags,
reason);
up_read(&mm->mmap_sem); up_read(&mm->mmap_sem);
if (likely(must_wait && !ACCESS_ONCE(ctx->released) && if (likely(must_wait && !ACCESS_ONCE(ctx->released) &&


@@ -46,7 +46,6 @@ void dax_wake_mapping_entry_waiter(struct address_space *mapping,
#ifdef CONFIG_FS_DAX #ifdef CONFIG_FS_DAX
struct page *read_dax_sector(struct block_device *bdev, sector_t n); struct page *read_dax_sector(struct block_device *bdev, sector_t n);
void dax_unlock_mapping_entry(struct address_space *mapping, pgoff_t index);
int __dax_zero_page_range(struct block_device *bdev, sector_t sector, int __dax_zero_page_range(struct block_device *bdev, sector_t sector,
unsigned int offset, unsigned int length); unsigned int offset, unsigned int length);
#else #else
@@ -55,12 +54,6 @@ static inline struct page *read_dax_sector(struct block_device *bdev,
{ {
return ERR_PTR(-ENXIO); return ERR_PTR(-ENXIO);
} }
/* Shouldn't ever be called when dax is disabled. */
static inline void dax_unlock_mapping_entry(struct address_space *mapping,
pgoff_t index)
{
BUG();
}
static inline int __dax_zero_page_range(struct block_device *bdev, static inline int __dax_zero_page_range(struct block_device *bdev,
sector_t sector, unsigned int offset, unsigned int length) sector_t sector, unsigned int offset, unsigned int length)
{ {


@@ -243,29 +243,33 @@ static inline void dma_unmap_sg_attrs(struct device *dev, struct scatterlist *sg
ops->unmap_sg(dev, sg, nents, dir, attrs); ops->unmap_sg(dev, sg, nents, dir, attrs);
} }
static inline dma_addr_t dma_map_page(struct device *dev, struct page *page, static inline dma_addr_t dma_map_page_attrs(struct device *dev,
size_t offset, size_t size, struct page *page,
enum dma_data_direction dir) size_t offset, size_t size,
enum dma_data_direction dir,
unsigned long attrs)
{ {
struct dma_map_ops *ops = get_dma_ops(dev); struct dma_map_ops *ops = get_dma_ops(dev);
dma_addr_t addr; dma_addr_t addr;
kmemcheck_mark_initialized(page_address(page) + offset, size); kmemcheck_mark_initialized(page_address(page) + offset, size);
BUG_ON(!valid_dma_direction(dir)); BUG_ON(!valid_dma_direction(dir));
addr = ops->map_page(dev, page, offset, size, dir, 0); addr = ops->map_page(dev, page, offset, size, dir, attrs);
debug_dma_map_page(dev, page, offset, size, dir, addr, false); debug_dma_map_page(dev, page, offset, size, dir, addr, false);
return addr; return addr;
} }
static inline void dma_unmap_page(struct device *dev, dma_addr_t addr, static inline void dma_unmap_page_attrs(struct device *dev,
size_t size, enum dma_data_direction dir) dma_addr_t addr, size_t size,
enum dma_data_direction dir,
unsigned long attrs)
{ {
struct dma_map_ops *ops = get_dma_ops(dev); struct dma_map_ops *ops = get_dma_ops(dev);
BUG_ON(!valid_dma_direction(dir)); BUG_ON(!valid_dma_direction(dir));
if (ops->unmap_page) if (ops->unmap_page)
ops->unmap_page(dev, addr, size, dir, 0); ops->unmap_page(dev, addr, size, dir, attrs);
debug_dma_unmap_page(dev, addr, size, dir, false); debug_dma_unmap_page(dev, addr, size, dir, false);
} }
@@ -385,6 +389,8 @@ dma_sync_sg_for_device(struct device *dev, struct scatterlist *sg,
#define dma_unmap_single(d, a, s, r) dma_unmap_single_attrs(d, a, s, r, 0) #define dma_unmap_single(d, a, s, r) dma_unmap_single_attrs(d, a, s, r, 0)
#define dma_map_sg(d, s, n, r) dma_map_sg_attrs(d, s, n, r, 0) #define dma_map_sg(d, s, n, r) dma_map_sg_attrs(d, s, n, r, 0)
#define dma_unmap_sg(d, s, n, r) dma_unmap_sg_attrs(d, s, n, r, 0) #define dma_unmap_sg(d, s, n, r) dma_unmap_sg_attrs(d, s, n, r, 0)
#define dma_map_page(d, p, o, s, r) dma_map_page_attrs(d, p, o, s, r, 0)
#define dma_unmap_page(d, a, s, r) dma_unmap_page_attrs(d, a, s, r, 0)
extern int dma_common_mmap(struct device *dev, struct vm_area_struct *vma, extern int dma_common_mmap(struct device *dev, struct vm_area_struct *vma,
void *cpu_addr, dma_addr_t dma_addr, size_t size); void *cpu_addr, dma_addr_t dma_addr, size_t size);
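
Not from the patch text: existing dma_map_page()/dma_unmap_page() callers are unaffected, since the old names are now thin wrappers that pass attrs == 0, while drivers can opt in to attributes such as DMA_ATTR_SKIP_CPU_SYNC through the new *_attrs variants. A trivial sketch with "dev" and "page" as placeholders:

static void example_map_unmap(struct device *dev, struct page *page)
{
	/* Same mapping the old dma_map_page(dev, page, 0, PAGE_SIZE, dir)
	 * macro now expands to: */
	dma_addr_t addr = dma_map_page_attrs(dev, page, 0, PAGE_SIZE,
					     DMA_FROM_DEVICE, 0);

	dma_unmap_page_attrs(dev, addr, PAGE_SIZE, DMA_FROM_DEVICE, 0);
}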


@@ -506,6 +506,8 @@ extern void free_hot_cold_page(struct page *page, bool cold);
extern void free_hot_cold_page_list(struct list_head *list, bool cold); extern void free_hot_cold_page_list(struct list_head *list, bool cold);
struct page_frag_cache; struct page_frag_cache;
extern void __page_frag_drain(struct page *page, unsigned int order,
unsigned int count);
extern void *__alloc_page_frag(struct page_frag_cache *nc, extern void *__alloc_page_frag(struct page_frag_cache *nc,
unsigned int fragsz, gfp_t gfp_mask); unsigned int fragsz, gfp_t gfp_mask);
extern void __free_page_frag(void *addr); extern void __free_page_frag(void *addr);


@@ -1,12 +1,12 @@
#ifndef _LINUX_HUGE_MM_H #ifndef _LINUX_HUGE_MM_H
#define _LINUX_HUGE_MM_H #define _LINUX_HUGE_MM_H
extern int do_huge_pmd_anonymous_page(struct fault_env *fe); extern int do_huge_pmd_anonymous_page(struct vm_fault *vmf);
extern int copy_huge_pmd(struct mm_struct *dst_mm, struct mm_struct *src_mm, extern int copy_huge_pmd(struct mm_struct *dst_mm, struct mm_struct *src_mm,
pmd_t *dst_pmd, pmd_t *src_pmd, unsigned long addr, pmd_t *dst_pmd, pmd_t *src_pmd, unsigned long addr,
struct vm_area_struct *vma); struct vm_area_struct *vma);
extern void huge_pmd_set_accessed(struct fault_env *fe, pmd_t orig_pmd); extern void huge_pmd_set_accessed(struct vm_fault *vmf, pmd_t orig_pmd);
extern int do_huge_pmd_wp_page(struct fault_env *fe, pmd_t orig_pmd); extern int do_huge_pmd_wp_page(struct vm_fault *vmf, pmd_t orig_pmd);
extern struct page *follow_trans_huge_pmd(struct vm_area_struct *vma, extern struct page *follow_trans_huge_pmd(struct vm_area_struct *vma,
unsigned long addr, unsigned long addr,
pmd_t *pmd, pmd_t *pmd,
@@ -142,7 +142,7 @@ static inline int hpage_nr_pages(struct page *page)
return 1; return 1;
} }
extern int do_huge_pmd_numa_page(struct fault_env *fe, pmd_t orig_pmd); extern int do_huge_pmd_numa_page(struct vm_fault *vmf, pmd_t orig_pmd);
extern struct page *huge_zero_page; extern struct page *huge_zero_page;
@@ -212,7 +212,7 @@ static inline spinlock_t *pmd_trans_huge_lock(pmd_t *pmd,
return NULL; return NULL;
} }
static inline int do_huge_pmd_numa_page(struct fault_env *fe, pmd_t orig_pmd) static inline int do_huge_pmd_numa_page(struct vm_fault *vmf, pmd_t orig_pmd)
{ {
return 0; return 0;
} }


@@ -18,12 +18,11 @@
#include <linux/rcupdate.h> #include <linux/rcupdate.h>
/* /*
* We want shallower trees and thus more bits covered at each layer. 8 * Using 6 bits at each layer allows us to allocate 7 layers out of each page.
* bits gives us large enough first layer for most use cases and maximum * 8 bits only gave us 3 layers out of every pair of pages, which is less
* tree depth of 4. Each idr_layer is slightly larger than 2k on 64bit and * efficient except for trees with a largest element between 192-255 inclusive.
* 1k on 32bit.
*/ */
#define IDR_BITS 8 #define IDR_BITS 6
#define IDR_SIZE (1 << IDR_BITS) #define IDR_SIZE (1 << IDR_BITS)
#define IDR_MASK ((1 << IDR_BITS)-1) #define IDR_MASK ((1 << IDR_BITS)-1)
@@ -55,6 +54,32 @@ struct idr {
} }
#define DEFINE_IDR(name) struct idr name = IDR_INIT(name) #define DEFINE_IDR(name) struct idr name = IDR_INIT(name)
/**
* idr_get_cursor - Return the current position of the cyclic allocator
* @idr: idr handle
*
* The value returned is the value that will be next returned from
* idr_alloc_cyclic() if it is free (otherwise the search will start from
* this position).
*/
static inline unsigned int idr_get_cursor(struct idr *idr)
{
return READ_ONCE(idr->cur);
}
/**
* idr_set_cursor - Set the current position of the cyclic allocator
* @idr: idr handle
* @val: new position
*
* The next call to idr_alloc_cyclic() will return @val if it is free
* (otherwise the search will start from this position).
*/
static inline void idr_set_cursor(struct idr *idr, unsigned int val)
{
WRITE_ONCE(idr->cur, val);
}
/** /**
* DOC: idr sync * DOC: idr sync
* idr synchronization (stolen from radix-tree.h) * idr synchronization (stolen from radix-tree.h)
@@ -195,6 +220,11 @@ static inline int ida_get_new(struct ida *ida, int *p_id)
return ida_get_new_above(ida, 0, p_id); return ida_get_new_above(ida, 0, p_id);
} }
static inline bool ida_is_empty(struct ida *ida)
{
return idr_is_empty(&ida->idr);
}
void __init idr_init_cache(void); void __init idr_init_cache(void);
#endif /* __IDR_H__ */ #endif /* __IDR_H__ */
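
Not from the patch: minimal usage sketches for the helpers added above, with example_ida and example_idr as placeholder instances (ida_simple_get()/ida_simple_remove() are pre-existing APIs used only for illustration).

static DEFINE_IDA(example_ida);
static DEFINE_IDR(example_idr);

static void example_idr_helpers(void)
{
	int id = ida_simple_get(&example_ida, 0, 0, GFP_KERNEL);

	if (id >= 0)
		ida_simple_remove(&example_ida, id);

	/* No need to peek at ida->idr internals any more: */
	if (ida_is_empty(&example_ida))
		pr_info("no IDs currently allocated\n");

	/* Save and restore the idr_alloc_cyclic() hint, e.g. across
	 * checkpoint/restore: */
	idr_set_cursor(&example_idr, idr_get_cursor(&example_idr));
}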


@@ -77,7 +77,6 @@ extern int kdb_poll_idx;
* number whenever the kernel debugger is entered. * number whenever the kernel debugger is entered.
*/ */
extern int kdb_initial_cpu; extern int kdb_initial_cpu;
extern atomic_t kdb_event;
/* Types and messages used for dynamically added kdb shell commands */ /* Types and messages used for dynamically added kdb shell commands */
@@ -162,6 +161,7 @@ enum kdb_msgsrc {
}; };
extern int kdb_trap_printk; extern int kdb_trap_printk;
extern int kdb_printf_cpu;
extern __printf(2, 0) int vkdb_printf(enum kdb_msgsrc src, const char *fmt, extern __printf(2, 0) int vkdb_printf(enum kdb_msgsrc src, const char *fmt,
va_list args); va_list args);
extern __printf(1, 2) int kdb_printf(const char *, ...); extern __printf(1, 2) int kdb_printf(const char *, ...);


@@ -259,12 +259,6 @@ phys_addr_t paddr_vmcoreinfo_note(void);
vmcoreinfo_append_str("NUMBER(%s)=%ld\n", #name, (long)name) vmcoreinfo_append_str("NUMBER(%s)=%ld\n", #name, (long)name)
#define VMCOREINFO_CONFIG(name) \ #define VMCOREINFO_CONFIG(name) \
vmcoreinfo_append_str("CONFIG_%s=y\n", #name) vmcoreinfo_append_str("CONFIG_%s=y\n", #name)
#define VMCOREINFO_PAGE_OFFSET(value) \
vmcoreinfo_append_str("PAGE_OFFSET=%lx\n", (unsigned long)value)
#define VMCOREINFO_VMALLOC_START(value) \
vmcoreinfo_append_str("VMALLOC_START=%lx\n", (unsigned long)value)
#define VMCOREINFO_VMEMMAP_START(value) \
vmcoreinfo_append_str("VMEMMAP_START=%lx\n", (unsigned long)value)
extern struct kimage *kexec_image; extern struct kimage *kexec_image;
extern struct kimage *kexec_crash_image; extern struct kimage *kexec_crash_image;


@@ -292,36 +292,23 @@ extern pgprot_t protection_map[16];
* pgoff should be used in favour of virtual_address, if possible. * pgoff should be used in favour of virtual_address, if possible.
*/ */
struct vm_fault { struct vm_fault {
struct vm_area_struct *vma; /* Target VMA */
unsigned int flags; /* FAULT_FLAG_xxx flags */ unsigned int flags; /* FAULT_FLAG_xxx flags */
gfp_t gfp_mask; /* gfp mask to be used for allocations */ gfp_t gfp_mask; /* gfp mask to be used for allocations */
pgoff_t pgoff; /* Logical page offset based on vma */ pgoff_t pgoff; /* Logical page offset based on vma */
void __user *virtual_address; /* Faulting virtual address */ unsigned long address; /* Faulting virtual address */
pmd_t *pmd; /* Pointer to pmd entry matching
* the 'address' */
pte_t orig_pte; /* Value of PTE at the time of fault */
struct page *cow_page; /* Handler may choose to COW */ struct page *cow_page; /* Page handler may use for COW fault */
struct mem_cgroup *memcg; /* Cgroup cow_page belongs to */
struct page *page; /* ->fault handlers should return a struct page *page; /* ->fault handlers should return a
* page here, unless VM_FAULT_NOPAGE * page here, unless VM_FAULT_NOPAGE
* is set (which is also implied by * is set (which is also implied by
* VM_FAULT_ERROR). * VM_FAULT_ERROR).
*/ */
void *entry; /* ->fault handler can alternatively /* These three entries are valid only while holding ptl lock */
* return locked DAX entry. In that
* case handler should return
* VM_FAULT_DAX_LOCKED and fill in
* entry here.
*/
};
/*
* Page fault context: passes though page fault handler instead of endless list
* of function arguments.
*/
struct fault_env {
struct vm_area_struct *vma; /* Target VMA */
unsigned long address; /* Faulting virtual address */
unsigned int flags; /* FAULT_FLAG_xxx flags */
pmd_t *pmd; /* Pointer to pmd entry matching
* the 'address'
*/
pte_t *pte; /* Pointer to pte entry matching pte_t *pte; /* Pointer to pte entry matching
* the 'address'. NULL if the page * the 'address'. NULL if the page
* table hasn't been allocated. * table hasn't been allocated.
@@ -351,7 +338,7 @@ struct vm_operations_struct {
int (*fault)(struct vm_area_struct *vma, struct vm_fault *vmf); int (*fault)(struct vm_area_struct *vma, struct vm_fault *vmf);
int (*pmd_fault)(struct vm_area_struct *, unsigned long address, int (*pmd_fault)(struct vm_area_struct *, unsigned long address,
pmd_t *, unsigned int flags); pmd_t *, unsigned int flags);
void (*map_pages)(struct fault_env *fe, void (*map_pages)(struct vm_fault *vmf,
pgoff_t start_pgoff, pgoff_t end_pgoff); pgoff_t start_pgoff, pgoff_t end_pgoff);
/* notification that a previously read-only page is about to become /* notification that a previously read-only page is about to become
@@ -625,8 +612,10 @@ static inline pte_t maybe_mkwrite(pte_t pte, struct vm_area_struct *vma)
return pte; return pte;
} }
int alloc_set_pte(struct fault_env *fe, struct mem_cgroup *memcg, int alloc_set_pte(struct vm_fault *vmf, struct mem_cgroup *memcg,
struct page *page); struct page *page);
int finish_fault(struct vm_fault *vmf);
int finish_mkwrite_fault(struct vm_fault *vmf);
#endif #endif
/* /*
@@ -1110,7 +1099,7 @@ static inline void clear_page_pfmemalloc(struct page *page)
#define VM_FAULT_LOCKED 0x0200 /* ->fault locked the returned page */ #define VM_FAULT_LOCKED 0x0200 /* ->fault locked the returned page */
#define VM_FAULT_RETRY 0x0400 /* ->fault blocked, must retry */ #define VM_FAULT_RETRY 0x0400 /* ->fault blocked, must retry */
#define VM_FAULT_FALLBACK 0x0800 /* huge page fault failed, fall back to small */ #define VM_FAULT_FALLBACK 0x0800 /* huge page fault failed, fall back to small */
#define VM_FAULT_DAX_LOCKED 0x1000 /* ->fault has locked DAX entry */ #define VM_FAULT_DONE_COW 0x1000 /* ->fault has fully handled COW */
#define VM_FAULT_HWPOISON_LARGE_MASK 0xf000 /* encodes hpage index for large hwpoison */ #define VM_FAULT_HWPOISON_LARGE_MASK 0xf000 /* encodes hpage index for large hwpoison */
@@ -1221,6 +1210,8 @@ int copy_page_range(struct mm_struct *dst, struct mm_struct *src,
struct vm_area_struct *vma); struct vm_area_struct *vma);
void unmap_mapping_range(struct address_space *mapping, void unmap_mapping_range(struct address_space *mapping,
loff_t const holebegin, loff_t const holelen, int even_cows); loff_t const holebegin, loff_t const holelen, int even_cows);
int follow_pte(struct mm_struct *mm, unsigned long address, pte_t **ptepp,
spinlock_t **ptlp);
int follow_pfn(struct vm_area_struct *vma, unsigned long address, int follow_pfn(struct vm_area_struct *vma, unsigned long address,
unsigned long *pfn); unsigned long *pfn);
int follow_phys(struct vm_area_struct *vma, unsigned long address, int follow_phys(struct vm_area_struct *vma, unsigned long address,
@@ -1276,15 +1267,12 @@ extern int __access_remote_vm(struct task_struct *tsk, struct mm_struct *mm,
long get_user_pages_remote(struct task_struct *tsk, struct mm_struct *mm, long get_user_pages_remote(struct task_struct *tsk, struct mm_struct *mm,
unsigned long start, unsigned long nr_pages, unsigned long start, unsigned long nr_pages,
unsigned int gup_flags, struct page **pages, unsigned int gup_flags, struct page **pages,
struct vm_area_struct **vmas); struct vm_area_struct **vmas, int *locked);
long get_user_pages(unsigned long start, unsigned long nr_pages, long get_user_pages(unsigned long start, unsigned long nr_pages,
unsigned int gup_flags, struct page **pages, unsigned int gup_flags, struct page **pages,
struct vm_area_struct **vmas); struct vm_area_struct **vmas);
long get_user_pages_locked(unsigned long start, unsigned long nr_pages, long get_user_pages_locked(unsigned long start, unsigned long nr_pages,
unsigned int gup_flags, struct page **pages, int *locked); unsigned int gup_flags, struct page **pages, int *locked);
long __get_user_pages_unlocked(struct task_struct *tsk, struct mm_struct *mm,
unsigned long start, unsigned long nr_pages,
struct page **pages, unsigned int gup_flags);
long get_user_pages_unlocked(unsigned long start, unsigned long nr_pages, long get_user_pages_unlocked(unsigned long start, unsigned long nr_pages,
struct page **pages, unsigned int gup_flags); struct page **pages, unsigned int gup_flags);
int get_user_pages_fast(unsigned long start, int nr_pages, int write, int get_user_pages_fast(unsigned long start, int nr_pages, int write,
@@ -2099,7 +2087,7 @@ extern void truncate_inode_pages_final(struct address_space *);
/* generic vm_area_ops exported for stackable file systems */ /* generic vm_area_ops exported for stackable file systems */
extern int filemap_fault(struct vm_area_struct *, struct vm_fault *); extern int filemap_fault(struct vm_area_struct *, struct vm_fault *);
extern void filemap_map_pages(struct fault_env *fe, extern void filemap_map_pages(struct vm_fault *vmf,
pgoff_t start_pgoff, pgoff_t end_pgoff); pgoff_t start_pgoff, pgoff_t end_pgoff);
extern int filemap_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf); extern int filemap_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf);
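
Not from the patch: a sketch of what a driver ->fault handler looks like against the consolidated vm_fault, using vmf->address (a plain unsigned long) where the removed void __user *virtual_address used to be. example_fault(), struct example_dev and base_pfn are placeholders and the error handling is deliberately simplified.

struct example_dev {
	unsigned long base_pfn;
};

static int example_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
{
	struct example_dev *edev = vma->vm_private_data;

	if (vm_insert_pfn(vma, vmf->address, edev->base_pfn + vmf->pgoff))
		return VM_FAULT_SIGBUS;
	return VM_FAULT_NOPAGE;
}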


@@ -7,6 +7,23 @@
#include <linux/sched.h> #include <linux/sched.h>
#include <asm/irq.h> #include <asm/irq.h>
/*
* The run state of the lockup detectors is controlled by the content of the
* 'watchdog_enabled' variable. Each lockup detector has its dedicated bit -
* bit 0 for the hard lockup detector and bit 1 for the soft lockup detector.
*
* 'watchdog_user_enabled', 'nmi_watchdog_enabled' and 'soft_watchdog_enabled'
* are variables that are only used as an 'interface' between the parameters
* in /proc/sys/kernel and the internal state bits in 'watchdog_enabled'. The
* 'watchdog_thresh' variable is handled differently because its value is not
* boolean, and the lockup detectors are 'suspended' while 'watchdog_thresh'
* is equal zero.
*/
#define NMI_WATCHDOG_ENABLED_BIT 0
#define SOFT_WATCHDOG_ENABLED_BIT 1
#define NMI_WATCHDOG_ENABLED (1 << NMI_WATCHDOG_ENABLED_BIT)
#define SOFT_WATCHDOG_ENABLED (1 << SOFT_WATCHDOG_ENABLED_BIT)
/** /**
* touch_nmi_watchdog - restart NMI watchdog timeout. * touch_nmi_watchdog - restart NMI watchdog timeout.
* *
@@ -91,9 +108,16 @@ extern int nmi_watchdog_enabled;
extern int soft_watchdog_enabled; extern int soft_watchdog_enabled;
extern int watchdog_user_enabled; extern int watchdog_user_enabled;
extern int watchdog_thresh; extern int watchdog_thresh;
extern unsigned long watchdog_enabled;
extern unsigned long *watchdog_cpumask_bits; extern unsigned long *watchdog_cpumask_bits;
#ifdef CONFIG_SMP
extern int sysctl_softlockup_all_cpu_backtrace; extern int sysctl_softlockup_all_cpu_backtrace;
extern int sysctl_hardlockup_all_cpu_backtrace; extern int sysctl_hardlockup_all_cpu_backtrace;
#else
#define sysctl_softlockup_all_cpu_backtrace 0
#define sysctl_hardlockup_all_cpu_backtrace 0
#endif
extern bool is_hardlockup(void);
struct ctl_table; struct ctl_table;
extern int proc_watchdog(struct ctl_table *, int , extern int proc_watchdog(struct ctl_table *, int ,
void __user *, size_t *, loff_t *); void __user *, size_t *, loff_t *);
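
Not from the patch: the bit layout documented above, in use; example_report_watchdog_state() is a placeholder.

static void example_report_watchdog_state(void)
{
	pr_info("hard lockup detector %s, soft lockup detector %s\n",
		(watchdog_enabled & NMI_WATCHDOG_ENABLED) ? "on" : "off",
		(watchdog_enabled & SOFT_WATCHDOG_ENABLED) ? "on" : "off");
}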


@@ -80,23 +80,25 @@ static inline bool radix_tree_is_internal_node(void *ptr)
#define RADIX_TREE_MAX_PATH (DIV_ROUND_UP(RADIX_TREE_INDEX_BITS, \ #define RADIX_TREE_MAX_PATH (DIV_ROUND_UP(RADIX_TREE_INDEX_BITS, \
RADIX_TREE_MAP_SHIFT)) RADIX_TREE_MAP_SHIFT))
/*
* @count is the count of every non-NULL element in the ->slots array
* whether that is an exceptional entry, a retry entry, a user pointer,
* a sibling entry or a pointer to the next level of the tree.
* @exceptional is the count of every element in ->slots which is
* either radix_tree_exceptional_entry() or is a sibling entry for an
* exceptional entry.
*/
struct radix_tree_node { struct radix_tree_node {
unsigned char shift; /* Bits remaining in each slot */ unsigned char shift; /* Bits remaining in each slot */
unsigned char offset; /* Slot offset in parent */ unsigned char offset; /* Slot offset in parent */
unsigned char count; /* Total entry count */ unsigned char count; /* Total entry count */
unsigned char exceptional; /* Exceptional entry count */ unsigned char exceptional; /* Exceptional entry count */
struct radix_tree_node *parent; /* Used when ascending tree */
void *private_data; /* For tree user */
union { union {
struct { struct list_head private_list; /* For tree user */
/* Used when ascending tree */ struct rcu_head rcu_head; /* Used when freeing node */
struct radix_tree_node *parent;
/* For tree user */
void *private_data;
};
/* Used when freeing node */
struct rcu_head rcu_head;
}; };
/* For tree user */
struct list_head private_list;
void __rcu *slots[RADIX_TREE_MAP_SIZE]; void __rcu *slots[RADIX_TREE_MAP_SIZE];
unsigned long tags[RADIX_TREE_MAX_TAGS][RADIX_TREE_TAG_LONGS]; unsigned long tags[RADIX_TREE_MAX_TAGS][RADIX_TREE_TAG_LONGS];
}; };
@@ -126,6 +128,41 @@ static inline bool radix_tree_empty(struct radix_tree_root *root)
return root->rnode == NULL; return root->rnode == NULL;
} }
/**
* struct radix_tree_iter - radix tree iterator state
*
* @index: index of current slot
* @next_index: one beyond the last index for this chunk
* @tags: bit-mask for tag-iterating
* @node: node that contains current slot
* @shift: shift for the node that holds our slots
*
* This radix tree iterator works in terms of "chunks" of slots. A chunk is a
* subinterval of slots contained within one radix tree leaf node. It is
* described by a pointer to its first slot and a struct radix_tree_iter
* which holds the chunk's position in the tree and its size. For tagged
* iteration radix_tree_iter also holds the slots' bit-mask for one chosen
* radix tree tag.
*/
struct radix_tree_iter {
unsigned long index;
unsigned long next_index;
unsigned long tags;
struct radix_tree_node *node;
#ifdef CONFIG_RADIX_TREE_MULTIORDER
unsigned int shift;
#endif
};
static inline unsigned int iter_shift(const struct radix_tree_iter *iter)
{
#ifdef CONFIG_RADIX_TREE_MULTIORDER
return iter->shift;
#else
return 0;
#endif
}
/** /**
* Radix-tree synchronization * Radix-tree synchronization
* *
@@ -264,6 +301,8 @@ void __radix_tree_replace(struct radix_tree_root *root,
struct radix_tree_node *node, struct radix_tree_node *node,
void **slot, void *item, void **slot, void *item,
radix_tree_update_node_t update_node, void *private); radix_tree_update_node_t update_node, void *private);
void radix_tree_iter_replace(struct radix_tree_root *,
const struct radix_tree_iter *, void **slot, void *item);
void radix_tree_replace_slot(struct radix_tree_root *root, void radix_tree_replace_slot(struct radix_tree_root *root,
void **slot, void *item); void **slot, void *item);
void __radix_tree_delete_node(struct radix_tree_root *root, void __radix_tree_delete_node(struct radix_tree_root *root,
@@ -289,6 +328,8 @@ void *radix_tree_tag_clear(struct radix_tree_root *root,
unsigned long index, unsigned int tag); unsigned long index, unsigned int tag);
int radix_tree_tag_get(struct radix_tree_root *root, int radix_tree_tag_get(struct radix_tree_root *root,
unsigned long index, unsigned int tag); unsigned long index, unsigned int tag);
void radix_tree_iter_tag_set(struct radix_tree_root *root,
const struct radix_tree_iter *iter, unsigned int tag);
unsigned int unsigned int
radix_tree_gang_lookup_tag(struct radix_tree_root *root, void **results, radix_tree_gang_lookup_tag(struct radix_tree_root *root, void **results,
unsigned long first_index, unsigned int max_items, unsigned long first_index, unsigned int max_items,
@@ -297,50 +338,18 @@ unsigned int
radix_tree_gang_lookup_tag_slot(struct radix_tree_root *root, void ***results, radix_tree_gang_lookup_tag_slot(struct radix_tree_root *root, void ***results,
unsigned long first_index, unsigned int max_items, unsigned long first_index, unsigned int max_items,
unsigned int tag); unsigned int tag);
unsigned long radix_tree_range_tag_if_tagged(struct radix_tree_root *root,
unsigned long *first_indexp, unsigned long last_index,
unsigned long nr_to_tag,
unsigned int fromtag, unsigned int totag);
int radix_tree_tagged(struct radix_tree_root *root, unsigned int tag); int radix_tree_tagged(struct radix_tree_root *root, unsigned int tag);
unsigned long radix_tree_locate_item(struct radix_tree_root *root, void *item);
static inline void radix_tree_preload_end(void) static inline void radix_tree_preload_end(void)
{ {
preempt_enable(); preempt_enable();
} }
/** int radix_tree_split_preload(unsigned old_order, unsigned new_order, gfp_t);
* struct radix_tree_iter - radix tree iterator state int radix_tree_split(struct radix_tree_root *, unsigned long index,
* unsigned new_order);
* @index: index of current slot int radix_tree_join(struct radix_tree_root *, unsigned long index,
* @next_index: one beyond the last index for this chunk unsigned new_order, void *);
* @tags: bit-mask for tag-iterating
* @shift: shift for the node that holds our slots
*
* This radix tree iterator works in terms of "chunks" of slots. A chunk is a
* subinterval of slots contained within one radix tree leaf node. It is
* described by a pointer to its first slot and a struct radix_tree_iter
* which holds the chunk's position in the tree and its size. For tagged
* iteration radix_tree_iter also holds the slots' bit-mask for one chosen
* radix tree tag.
*/
struct radix_tree_iter {
unsigned long index;
unsigned long next_index;
unsigned long tags;
#ifdef CONFIG_RADIX_TREE_MULTIORDER
unsigned int shift;
#endif
};
static inline unsigned int iter_shift(struct radix_tree_iter *iter)
{
#ifdef CONFIG_RADIX_TREE_MULTIORDER
return iter->shift;
#else
return 0;
#endif
}
#define RADIX_TREE_ITER_TAG_MASK 0x00FF /* tag index in lower byte */ #define RADIX_TREE_ITER_TAG_MASK 0x00FF /* tag index in lower byte */
#define RADIX_TREE_ITER_TAGGED 0x0100 /* lookup tagged slots */ #define RADIX_TREE_ITER_TAGGED 0x0100 /* lookup tagged slots */
@@ -409,20 +418,17 @@ __radix_tree_iter_add(struct radix_tree_iter *iter, unsigned long slots)
} }
/** /**
* radix_tree_iter_next - resume iterating when the chunk may be invalid * radix_tree_iter_resume - resume iterating when the chunk may be invalid
* @iter: iterator state * @slot: pointer to current slot
* @iter: iterator state
* Returns: New slot pointer
* *
* If the iterator needs to release then reacquire a lock, the chunk may * If the iterator needs to release then reacquire a lock, the chunk may
* have been invalidated by an insertion or deletion. Call this function * have been invalidated by an insertion or deletion. Call this function
* to continue the iteration from the next index. * before releasing the lock to continue the iteration from the next index.
*/ */
static inline __must_check void **__must_check radix_tree_iter_resume(void **slot,
void **radix_tree_iter_next(struct radix_tree_iter *iter) struct radix_tree_iter *iter);
{
iter->next_index = __radix_tree_iter_add(iter, 1);
iter->tags = 0;
return NULL;
}
/** /**
* radix_tree_chunk_size - get current chunk size * radix_tree_chunk_size - get current chunk size
@@ -436,10 +442,17 @@ radix_tree_chunk_size(struct radix_tree_iter *iter)
return (iter->next_index - iter->index) >> iter_shift(iter); return (iter->next_index - iter->index) >> iter_shift(iter);
} }
static inline struct radix_tree_node *entry_to_node(void *ptr) #ifdef CONFIG_RADIX_TREE_MULTIORDER
void ** __radix_tree_next_slot(void **slot, struct radix_tree_iter *iter,
unsigned flags);
#else
/* Can't happen without sibling entries, but the compiler can't tell that */
static inline void ** __radix_tree_next_slot(void **slot,
struct radix_tree_iter *iter, unsigned flags)
{ {
return (void *)((unsigned long)ptr & ~RADIX_TREE_INTERNAL_NODE); return slot;
} }
#endif
/** /**
* radix_tree_next_slot - find next slot in chunk * radix_tree_next_slot - find next slot in chunk
@@ -453,7 +466,7 @@ static inline struct radix_tree_node *entry_to_node(void *ptr)
* For tagged lookup it also eats @iter->tags. * For tagged lookup it also eats @iter->tags.
* *
* There are several cases where 'slot' can be passed in as NULL to this * There are several cases where 'slot' can be passed in as NULL to this
* function. These cases result from the use of radix_tree_iter_next() or * function. These cases result from the use of radix_tree_iter_resume() or
* radix_tree_iter_retry(). In these cases we don't end up dereferencing * radix_tree_iter_retry(). In these cases we don't end up dereferencing
* 'slot' because either: * 'slot' because either:
* a) we are doing tagged iteration and iter->tags has been set to 0, or * a) we are doing tagged iteration and iter->tags has been set to 0, or
@@ -464,51 +477,31 @@ static __always_inline void **
radix_tree_next_slot(void **slot, struct radix_tree_iter *iter, unsigned flags) radix_tree_next_slot(void **slot, struct radix_tree_iter *iter, unsigned flags)
{ {
if (flags & RADIX_TREE_ITER_TAGGED) { if (flags & RADIX_TREE_ITER_TAGGED) {
void *canon = slot;
iter->tags >>= 1; iter->tags >>= 1;
if (unlikely(!iter->tags)) if (unlikely(!iter->tags))
return NULL; return NULL;
while (IS_ENABLED(CONFIG_RADIX_TREE_MULTIORDER) &&
radix_tree_is_internal_node(slot[1])) {
if (entry_to_node(slot[1]) == canon) {
iter->tags >>= 1;
iter->index = __radix_tree_iter_add(iter, 1);
slot++;
continue;
}
iter->next_index = __radix_tree_iter_add(iter, 1);
return NULL;
}
if (likely(iter->tags & 1ul)) { if (likely(iter->tags & 1ul)) {
iter->index = __radix_tree_iter_add(iter, 1); iter->index = __radix_tree_iter_add(iter, 1);
return slot + 1; slot++;
goto found;
} }
if (!(flags & RADIX_TREE_ITER_CONTIG)) { if (!(flags & RADIX_TREE_ITER_CONTIG)) {
unsigned offset = __ffs(iter->tags); unsigned offset = __ffs(iter->tags);
iter->tags >>= offset; iter->tags >>= offset++;
iter->index = __radix_tree_iter_add(iter, offset + 1); iter->index = __radix_tree_iter_add(iter, offset);
return slot + offset + 1; slot += offset;
goto found;
} }
} else { } else {
long count = radix_tree_chunk_size(iter); long count = radix_tree_chunk_size(iter);
void *canon = slot;
while (--count > 0) { while (--count > 0) {
slot++; slot++;
iter->index = __radix_tree_iter_add(iter, 1); iter->index = __radix_tree_iter_add(iter, 1);
if (IS_ENABLED(CONFIG_RADIX_TREE_MULTIORDER) &&
radix_tree_is_internal_node(*slot)) {
if (entry_to_node(*slot) == canon)
continue;
iter->next_index = iter->index;
break;
}
if (likely(*slot)) if (likely(*slot))
return slot; goto found;
if (flags & RADIX_TREE_ITER_CONTIG) { if (flags & RADIX_TREE_ITER_CONTIG) {
/* forbid switching to the next chunk */ /* forbid switching to the next chunk */
iter->next_index = 0; iter->next_index = 0;
@@ -517,6 +510,11 @@ radix_tree_next_slot(void **slot, struct radix_tree_iter *iter, unsigned flags)
} }
} }
return NULL; return NULL;
found:
if (unlikely(radix_tree_is_internal_node(*slot)))
return __radix_tree_next_slot(slot, iter, flags);
return slot;
} }
/** /**
@@ -567,6 +565,6 @@ radix_tree_next_slot(void **slot, struct radix_tree_iter *iter, unsigned flags)
slot || (slot = radix_tree_next_chunk(root, iter, \ slot || (slot = radix_tree_next_chunk(root, iter, \
RADIX_TREE_ITER_TAGGED | tag)) ; \ RADIX_TREE_ITER_TAGGED | tag)) ; \
slot = radix_tree_next_slot(slot, iter, \ slot = radix_tree_next_slot(slot, iter, \
RADIX_TREE_ITER_TAGGED)) RADIX_TREE_ITER_TAGGED | tag))
#endif /* _LINUX_RADIX_TREE_H */ #endif /* _LINUX_RADIX_TREE_H */
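
To make the new resume semantics concrete, here is a minimal sketch (not from this patch) of a walker that periodically drops its tree lock; scan_all_entries(), process_entry() and the external spinlock are hypothetical, and the radix_tree_iter_resume() prototype is assumed as declared above.

#include <linux/radix-tree.h>
#include <linux/sched.h>
#include <linux/spinlock.h>

/* Hypothetical per-entry work. */
static void process_entry(void *entry)
{
	(void)entry;
}

static void scan_all_entries(struct radix_tree_root *root, spinlock_t *lock)
{
	struct radix_tree_iter iter;
	void **slot;

	spin_lock(lock);
	radix_tree_for_each_slot(slot, root, &iter, 0) {
		process_entry(radix_tree_deref_slot_protected(slot, lock));

		if (need_resched()) {
			/* The chunk may be invalidated once the lock is
			 * dropped; tell the iterator before unlocking. */
			slot = radix_tree_iter_resume(slot, &iter);
			spin_unlock(lock);
			cond_resched();
			spin_lock(lock);
		}
	}
	spin_unlock(lock);
}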

View File

@@ -97,6 +97,23 @@ static inline int sigisemptyset(sigset_t *set)
} }
} }
static inline int sigequalsets(const sigset_t *set1, const sigset_t *set2)
{
switch (_NSIG_WORDS) {
case 4:
return (set1->sig[3] == set2->sig[3]) &&
(set1->sig[2] == set2->sig[2]) &&
(set1->sig[1] == set2->sig[1]) &&
(set1->sig[0] == set2->sig[0]);
case 2:
return (set1->sig[1] == set2->sig[1]) &&
(set1->sig[0] == set2->sig[0]);
case 1:
return set1->sig[0] == set2->sig[0];
}
return 0;
}
#define sigmask(sig) (1UL << ((sig) - 1)) #define sigmask(sig) (1UL << ((sig) - 1))
#ifndef __HAVE_ARCH_SIG_SETOPS #ifndef __HAVE_ARCH_SIG_SETOPS

View File

@@ -27,7 +27,7 @@
#define UFFD_SHARED_FCNTL_FLAGS (O_CLOEXEC | O_NONBLOCK) #define UFFD_SHARED_FCNTL_FLAGS (O_CLOEXEC | O_NONBLOCK)
#define UFFD_FLAGS_SET (EFD_SHARED_FCNTL_FLAGS) #define UFFD_FLAGS_SET (EFD_SHARED_FCNTL_FLAGS)
extern int handle_userfault(struct fault_env *fe, unsigned long reason); extern int handle_userfault(struct vm_fault *vmf, unsigned long reason);
extern ssize_t mcopy_atomic(struct mm_struct *dst_mm, unsigned long dst_start, extern ssize_t mcopy_atomic(struct mm_struct *dst_mm, unsigned long dst_start,
unsigned long src_start, unsigned long len); unsigned long src_start, unsigned long len);
@@ -55,7 +55,7 @@ static inline bool userfaultfd_armed(struct vm_area_struct *vma)
#else /* CONFIG_USERFAULTFD */ #else /* CONFIG_USERFAULTFD */
/* mm helpers */ /* mm helpers */
static inline int handle_userfault(struct fault_env *fe, unsigned long reason) static inline int handle_userfault(struct vm_fault *vmf, unsigned long reason)
{ {
return VM_FAULT_SIGBUS; return VM_FAULT_SIGBUS;
} }

View File

@@ -763,7 +763,10 @@ static inline int convert_mode(long *msgtyp, int msgflg)
if (*msgtyp == 0) if (*msgtyp == 0)
return SEARCH_ANY; return SEARCH_ANY;
if (*msgtyp < 0) { if (*msgtyp < 0) {
*msgtyp = -*msgtyp; if (*msgtyp == LONG_MIN) /* -LONG_MIN is undefined */
*msgtyp = LONG_MAX;
else
*msgtyp = -*msgtyp;
return SEARCH_LESSEQUAL; return SEARCH_LESSEQUAL;
} }
if (msgflg & MSG_EXCEPT) if (msgflg & MSG_EXCEPT)
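
The reason for the clamp: on two's-complement machines -LONG_MIN does not fit in a long, so the negation is undefined behaviour. A small user-space sketch of the same guard (illustration only):

#include <limits.h>
#include <stdio.h>

int main(void)
{
	long msgtyp = LONG_MIN;
	long key;

	/* Mirror the fix above: never compute -LONG_MIN. */
	if (msgtyp == LONG_MIN)
		key = LONG_MAX;
	else
		key = -msgtyp;

	printf("search key: %ld\n", key);
	return 0;
}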

ipc/sem.c
View File

@@ -11,6 +11,7 @@
* (c) 2001 Red Hat Inc * (c) 2001 Red Hat Inc
* Lockless wakeup * Lockless wakeup
* (c) 2003 Manfred Spraul <manfred@colorfullife.com> * (c) 2003 Manfred Spraul <manfred@colorfullife.com>
* (c) 2016 Davidlohr Bueso <dave@stgolabs.net>
* Further wakeup optimizations, documentation * Further wakeup optimizations, documentation
* (c) 2010 Manfred Spraul <manfred@colorfullife.com> * (c) 2010 Manfred Spraul <manfred@colorfullife.com>
* *
@@ -53,15 +54,11 @@
* Semaphores are actively given to waiting tasks (necessary for FIFO). * Semaphores are actively given to waiting tasks (necessary for FIFO).
* (see update_queue()) * (see update_queue())
* - To improve the scalability, the actual wake-up calls are performed after * - To improve the scalability, the actual wake-up calls are performed after
* dropping all locks. (see wake_up_sem_queue_prepare(), * dropping all locks. (see wake_up_sem_queue_prepare())
* wake_up_sem_queue_do())
* - All work is done by the waker, the woken up task does not have to do * - All work is done by the waker, the woken up task does not have to do
* anything - not even acquiring a lock or dropping a refcount. * anything - not even acquiring a lock or dropping a refcount.
* - A woken up task may not even touch the semaphore array anymore, it may * - A woken up task may not even touch the semaphore array anymore, it may
* have been destroyed already by a semctl(RMID). * have been destroyed already by a semctl(RMID).
* - The synchronizations between wake-ups due to a timeout/signal and a
* wake-up due to a completed semaphore operation is achieved by using an
* intermediate state (IN_WAKEUP).
* - UNDO values are stored in an array (one per process and per * - UNDO values are stored in an array (one per process and per
* semaphore array, lazily allocated). For backwards compatibility, multiple * semaphore array, lazily allocated). For backwards compatibility, multiple
* modes for the UNDO variables are supported (per process, per thread) * modes for the UNDO variables are supported (per process, per thread)
@@ -118,7 +115,8 @@ struct sem_queue {
struct sembuf *sops; /* array of pending operations */ struct sembuf *sops; /* array of pending operations */
struct sembuf *blocking; /* the operation that blocked */ struct sembuf *blocking; /* the operation that blocked */
int nsops; /* number of operations */ int nsops; /* number of operations */
int alter; /* does *sops alter the array? */ bool alter; /* does *sops alter the array? */
bool dupsop; /* sops on more than one sem_num */
}; };
/* Each task has a list of undo requests. They are executed automatically /* Each task has a list of undo requests. They are executed automatically
@@ -416,29 +414,6 @@ static inline void sem_unlock(struct sem_array *sma, int locknum)
* *
* The caller holds the RCU read lock. * The caller holds the RCU read lock.
*/ */
static inline struct sem_array *sem_obtain_lock(struct ipc_namespace *ns,
int id, struct sembuf *sops, int nsops, int *locknum)
{
struct kern_ipc_perm *ipcp;
struct sem_array *sma;
ipcp = ipc_obtain_object_idr(&sem_ids(ns), id);
if (IS_ERR(ipcp))
return ERR_CAST(ipcp);
sma = container_of(ipcp, struct sem_array, sem_perm);
*locknum = sem_lock(sma, sops, nsops);
/* ipc_rmid() may have already freed the ID while sem_lock
* was spinning: verify that the structure is still valid
*/
if (ipc_valid_object(ipcp))
return container_of(ipcp, struct sem_array, sem_perm);
sem_unlock(sma, *locknum);
return ERR_PTR(-EINVAL);
}
static inline struct sem_array *sem_obtain_object(struct ipc_namespace *ns, int id) static inline struct sem_array *sem_obtain_object(struct ipc_namespace *ns, int id)
{ {
struct kern_ipc_perm *ipcp = ipc_obtain_object_idr(&sem_ids(ns), id); struct kern_ipc_perm *ipcp = ipc_obtain_object_idr(&sem_ids(ns), id);
@@ -471,40 +446,6 @@ static inline void sem_rmid(struct ipc_namespace *ns, struct sem_array *s)
ipc_rmid(&sem_ids(ns), &s->sem_perm); ipc_rmid(&sem_ids(ns), &s->sem_perm);
} }
/*
* Lockless wakeup algorithm:
* Without the check/retry algorithm a lockless wakeup is possible:
* - queue.status is initialized to -EINTR before blocking.
* - wakeup is performed by
* * unlinking the queue entry from the pending list
* * setting queue.status to IN_WAKEUP
* This is the notification for the blocked thread that a
* result value is imminent.
* * call wake_up_process
* * set queue.status to the final value.
* - the previously blocked thread checks queue.status:
* * if it's IN_WAKEUP, then it must wait until the value changes
* * if it's not -EINTR, then the operation was completed by
* update_queue. semtimedop can return queue.status without
* performing any operation on the sem array.
* * otherwise it must acquire the spinlock and check what's up.
*
* The two-stage algorithm is necessary to protect against the following
* races:
* - if queue.status is set after wake_up_process, then the woken up idle
* thread could race forward and try (and fail) to acquire sma->lock
* before update_queue had a chance to set queue.status
* - if queue.status is written before wake_up_process and if the
* blocked process is woken up by a signal between writing
* queue.status and the wake_up_process, then the woken up
* process could return from semtimedop and die by calling
* sys_exit before wake_up_process is called. Then wake_up_process
* will oops, because the task structure is already invalid.
* (yes, this happened on s390 with sysv msg).
*
*/
#define IN_WAKEUP 1
/** /**
* newary - Create a new semaphore set * newary - Create a new semaphore set
* @ns: namespace * @ns: namespace
@@ -624,15 +565,23 @@ SYSCALL_DEFINE3(semget, key_t, key, int, nsems, int, semflg)
} }
/** /**
* perform_atomic_semop - Perform (if possible) a semaphore operation * perform_atomic_semop[_slow] - Attempt to perform semaphore
* operations on a given array.
* @sma: semaphore array * @sma: semaphore array
* @q: struct sem_queue that describes the operation * @q: struct sem_queue that describes the operation
* *
* Caller blocking is as follows, based on the value
* indicated by the semaphore operation (sem_op):
*
* (1) >0 never blocks.
* (2) 0 (wait-for-zero operation): semval is non-zero.
* (3) <0 attempting to decrement semval to a value smaller than zero.
*
* Returns 0 if the operation was possible. * Returns 0 if the operation was possible.
* Returns 1 if the operation is impossible, the caller must sleep. * Returns 1 if the operation is impossible, the caller must sleep.
* Negative values are error codes. * Returns <0 for error codes.
*/ */
static int perform_atomic_semop(struct sem_array *sma, struct sem_queue *q) static int perform_atomic_semop_slow(struct sem_array *sma, struct sem_queue *q)
{ {
int result, sem_op, nsops, pid; int result, sem_op, nsops, pid;
struct sembuf *sop; struct sembuf *sop;
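
The three sem_op cases documented above, seen from user space (illustration only; the semaphore set layout and indices are made up):

#include <sys/ipc.h>
#include <sys/sem.h>
#include <sys/types.h>

/* Assumes a set with at least three semaphores. */
static int demo_semops(int semid)
{
	struct sembuf ops[3] = {
		{ .sem_num = 0, .sem_op =  1, .sem_flg = 0 },        /* (1) >0: never blocks */
		{ .sem_num = 1, .sem_op =  0, .sem_flg = 0 },        /* (2) ==0: blocks while semval != 0 */
		{ .sem_num = 2, .sem_op = -1, .sem_flg = SEM_UNDO }, /* (3) <0: blocks if semval would go negative */
	};

	return semop(semid, ops, 3);	/* applied atomically, or not at all */
}
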
@@ -703,51 +652,84 @@ undo:
return result; return result;
} }
/** wake_up_sem_queue_prepare(q, error): Prepare wake-up static int perform_atomic_semop(struct sem_array *sma, struct sem_queue *q)
* @q: queue entry that must be signaled
* @error: Error value for the signal
*
* Prepare the wake-up of the queue entry q.
*/
static void wake_up_sem_queue_prepare(struct list_head *pt,
struct sem_queue *q, int error)
{ {
if (list_empty(pt)) { int result, sem_op, nsops;
/* struct sembuf *sop;
* Hold preempt off so that we don't get preempted and have the struct sem *curr;
* wakee busy-wait until we're scheduled back on. struct sembuf *sops;
*/ struct sem_undo *un;
preempt_disable();
}
q->status = IN_WAKEUP;
q->pid = error;
list_add_tail(&q->list, pt); sops = q->sops;
nsops = q->nsops;
un = q->undo;
if (unlikely(q->dupsop))
return perform_atomic_semop_slow(sma, q);
/*
* We scan the semaphore set twice, first to ensure that the entire
* operation can succeed, therefore avoiding any pointless writes
* to shared memory and having to undo such changes in order to block
* until the operations can go through.
*/
for (sop = sops; sop < sops + nsops; sop++) {
curr = sma->sem_base + sop->sem_num;
sem_op = sop->sem_op;
result = curr->semval;
if (!sem_op && result)
goto would_block; /* wait-for-zero */
result += sem_op;
if (result < 0)
goto would_block;
if (result > SEMVMX)
return -ERANGE;
if (sop->sem_flg & SEM_UNDO) {
int undo = un->semadj[sop->sem_num] - sem_op;
/* Exceeding the undo range is an error. */
if (undo < (-SEMAEM - 1) || undo > SEMAEM)
return -ERANGE;
}
}
for (sop = sops; sop < sops + nsops; sop++) {
curr = sma->sem_base + sop->sem_num;
sem_op = sop->sem_op;
result = curr->semval;
if (sop->sem_flg & SEM_UNDO) {
int undo = un->semadj[sop->sem_num] - sem_op;
un->semadj[sop->sem_num] = undo;
}
curr->semval += sem_op;
curr->sempid = q->pid;
}
return 0;
would_block:
q->blocking = sop;
return sop->sem_flg & IPC_NOWAIT ? -EAGAIN : 1;
} }
/** static inline void wake_up_sem_queue_prepare(struct sem_queue *q, int error,
* wake_up_sem_queue_do - do the actual wake-up struct wake_q_head *wake_q)
* @pt: list of tasks to be woken up
*
* Do the actual wake-up.
* The function is called without any locks held, thus the semaphore array
* could be destroyed already and the tasks can disappear as soon as the
* status is set to the actual return code.
*/
static void wake_up_sem_queue_do(struct list_head *pt)
{ {
struct sem_queue *q, *t; wake_q_add(wake_q, q->sleeper);
int did_something; /*
* Rely on the above implicit barrier, such that we can
did_something = !list_empty(pt); * ensure that we hold reference to the task before setting
list_for_each_entry_safe(q, t, pt, list) { * q->status. Otherwise we could race with do_exit if the
wake_up_process(q->sleeper); * task is awoken by an external event before calling
/* q can disappear immediately after writing q->status. */ * wake_up_process().
smp_wmb(); */
q->status = q->pid; WRITE_ONCE(q->status, error);
}
if (did_something)
preempt_enable();
} }
static void unlink_queue(struct sem_array *sma, struct sem_queue *q) static void unlink_queue(struct sem_array *sma, struct sem_queue *q)
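
The shape of the wake_q conversion above, reduced to a stand-alone sketch: the waiter structure, pending list and lock are hypothetical, and the wake_q helpers are assumed to live in <linux/sched.h> as they do in this kernel version. Wake-ups are queued under the lock and issued only after every lock has been dropped.

#include <linux/list.h>
#include <linux/sched.h>	/* DEFINE_WAKE_Q, wake_q_add, wake_up_q */
#include <linux/spinlock.h>

struct waiter {			/* hypothetical */
	struct list_head list;
	struct task_struct *task;
	int status;
};

static void complete_waiters(spinlock_t *lock, struct list_head *pending, int error)
{
	DEFINE_WAKE_Q(wake_q);
	struct waiter *w, *tmp;

	spin_lock(lock);
	list_for_each_entry_safe(w, tmp, pending, list) {
		list_del(&w->list);
		wake_q_add(&wake_q, w->task);	/* takes a task reference */
		/* The waiter may run and read this as soon as it is woken. */
		WRITE_ONCE(w->status, error);
	}
	spin_unlock(lock);

	wake_up_q(&wake_q);	/* actual wake-ups, with no locks held */
}
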
@@ -767,7 +749,7 @@ static void unlink_queue(struct sem_array *sma, struct sem_queue *q)
* modified the array. * modified the array.
* Note that wait-for-zero operations are handled without restart. * Note that wait-for-zero operations are handled without restart.
*/ */
static int check_restart(struct sem_array *sma, struct sem_queue *q) static inline int check_restart(struct sem_array *sma, struct sem_queue *q)
{ {
/* pending complex alter operations are too difficult to analyse */ /* pending complex alter operations are too difficult to analyse */
if (!list_empty(&sma->pending_alter)) if (!list_empty(&sma->pending_alter))
@@ -795,21 +777,20 @@ static int check_restart(struct sem_array *sma, struct sem_queue *q)
* wake_const_ops - wake up non-alter tasks * wake_const_ops - wake up non-alter tasks
* @sma: semaphore array. * @sma: semaphore array.
* @semnum: semaphore that was modified. * @semnum: semaphore that was modified.
* @pt: list head for the tasks that must be woken up. * @wake_q: lockless wake-queue head.
* *
* wake_const_ops must be called after a semaphore in a semaphore array * wake_const_ops must be called after a semaphore in a semaphore array
* was set to 0. If complex const operations are pending, wake_const_ops must * was set to 0. If complex const operations are pending, wake_const_ops must
* be called with semnum = -1, as well as with the number of each modified * be called with semnum = -1, as well as with the number of each modified
* semaphore. * semaphore.
* The tasks that must be woken up are added to @pt. The return code * The tasks that must be woken up are added to @wake_q. The return code
* is stored in q->pid. * is stored in q->pid.
* The function returns 1 if at least one operation was completed successfully. * The function returns 1 if at least one operation was completed successfully.
*/ */
static int wake_const_ops(struct sem_array *sma, int semnum, static int wake_const_ops(struct sem_array *sma, int semnum,
struct list_head *pt) struct wake_q_head *wake_q)
{ {
struct sem_queue *q; struct sem_queue *q, *tmp;
struct list_head *walk;
struct list_head *pending_list; struct list_head *pending_list;
int semop_completed = 0; int semop_completed = 0;
@@ -818,25 +799,19 @@ static int wake_const_ops(struct sem_array *sma, int semnum,
else else
pending_list = &sma->sem_base[semnum].pending_const; pending_list = &sma->sem_base[semnum].pending_const;
walk = pending_list->next; list_for_each_entry_safe(q, tmp, pending_list, list) {
while (walk != pending_list) { int error = perform_atomic_semop(sma, q);
int error;
q = container_of(walk, struct sem_queue, list); if (error > 0)
walk = walk->next; continue;
/* operation completed, remove from queue & wakeup */
unlink_queue(sma, q);
error = perform_atomic_semop(sma, q); wake_up_sem_queue_prepare(q, error, wake_q);
if (error == 0)
if (error <= 0) { semop_completed = 1;
/* operation completed, remove from queue & wakeup */
unlink_queue(sma, q);
wake_up_sem_queue_prepare(pt, q, error);
if (error == 0)
semop_completed = 1;
}
} }
return semop_completed; return semop_completed;
} }
@@ -845,14 +820,14 @@ static int wake_const_ops(struct sem_array *sma, int semnum,
* @sma: semaphore array * @sma: semaphore array
* @sops: operations that were performed * @sops: operations that were performed
* @nsops: number of operations * @nsops: number of operations
* @pt: list head of the tasks that must be woken up. * @wake_q: lockless wake-queue head
* *
* Checks all required queue for wait-for-zero operations, based * Checks all required queue for wait-for-zero operations, based
* on the actual changes that were performed on the semaphore array. * on the actual changes that were performed on the semaphore array.
* The function returns 1 if at least one operation was completed successfully. * The function returns 1 if at least one operation was completed successfully.
*/ */
static int do_smart_wakeup_zero(struct sem_array *sma, struct sembuf *sops, static int do_smart_wakeup_zero(struct sem_array *sma, struct sembuf *sops,
int nsops, struct list_head *pt) int nsops, struct wake_q_head *wake_q)
{ {
int i; int i;
int semop_completed = 0; int semop_completed = 0;
@@ -865,7 +840,7 @@ static int do_smart_wakeup_zero(struct sem_array *sma, struct sembuf *sops,
if (sma->sem_base[num].semval == 0) { if (sma->sem_base[num].semval == 0) {
got_zero = 1; got_zero = 1;
semop_completed |= wake_const_ops(sma, num, pt); semop_completed |= wake_const_ops(sma, num, wake_q);
} }
} }
} else { } else {
@@ -876,7 +851,7 @@ static int do_smart_wakeup_zero(struct sem_array *sma, struct sembuf *sops,
for (i = 0; i < sma->sem_nsems; i++) { for (i = 0; i < sma->sem_nsems; i++) {
if (sma->sem_base[i].semval == 0) { if (sma->sem_base[i].semval == 0) {
got_zero = 1; got_zero = 1;
semop_completed |= wake_const_ops(sma, i, pt); semop_completed |= wake_const_ops(sma, i, wake_q);
} }
} }
} }
@@ -885,7 +860,7 @@ static int do_smart_wakeup_zero(struct sem_array *sma, struct sembuf *sops,
* then check the global queue, too. * then check the global queue, too.
*/ */
if (got_zero) if (got_zero)
semop_completed |= wake_const_ops(sma, -1, pt); semop_completed |= wake_const_ops(sma, -1, wake_q);
return semop_completed; return semop_completed;
} }
@@ -895,22 +870,21 @@ static int do_smart_wakeup_zero(struct sem_array *sma, struct sembuf *sops,
* update_queue - look for tasks that can be completed. * update_queue - look for tasks that can be completed.
* @sma: semaphore array. * @sma: semaphore array.
* @semnum: semaphore that was modified. * @semnum: semaphore that was modified.
* @pt: list head for the tasks that must be woken up. * @wake_q: lockless wake-queue head.
* *
* update_queue must be called after a semaphore in a semaphore array * update_queue must be called after a semaphore in a semaphore array
* was modified. If multiple semaphores were modified, update_queue must * was modified. If multiple semaphores were modified, update_queue must
* be called with semnum = -1, as well as with the number of each modified * be called with semnum = -1, as well as with the number of each modified
* semaphore. * semaphore.
* The tasks that must be woken up are added to @pt. The return code * The tasks that must be woken up are added to @wake_q. The return code
* is stored in q->pid. * is stored in q->pid.
* The function internally checks if const operations can now succeed. * The function internally checks if const operations can now succeed.
* *
* The function return 1 if at least one semop was completed successfully. * The function return 1 if at least one semop was completed successfully.
*/ */
static int update_queue(struct sem_array *sma, int semnum, struct list_head *pt) static int update_queue(struct sem_array *sma, int semnum, struct wake_q_head *wake_q)
{ {
struct sem_queue *q; struct sem_queue *q, *tmp;
struct list_head *walk;
struct list_head *pending_list; struct list_head *pending_list;
int semop_completed = 0; int semop_completed = 0;
@@ -920,13 +894,9 @@ static int update_queue(struct sem_array *sma, int semnum, struct list_head *pt)
pending_list = &sma->sem_base[semnum].pending_alter; pending_list = &sma->sem_base[semnum].pending_alter;
again: again:
walk = pending_list->next; list_for_each_entry_safe(q, tmp, pending_list, list) {
while (walk != pending_list) {
int error, restart; int error, restart;
q = container_of(walk, struct sem_queue, list);
walk = walk->next;
/* If we are scanning the single sop, per-semaphore list of /* If we are scanning the single sop, per-semaphore list of
* one semaphore and that semaphore is 0, then it is not * one semaphore and that semaphore is 0, then it is not
* necessary to scan further: simple increments * necessary to scan further: simple increments
@@ -949,11 +919,11 @@ again:
restart = 0; restart = 0;
} else { } else {
semop_completed = 1; semop_completed = 1;
do_smart_wakeup_zero(sma, q->sops, q->nsops, pt); do_smart_wakeup_zero(sma, q->sops, q->nsops, wake_q);
restart = check_restart(sma, q); restart = check_restart(sma, q);
} }
wake_up_sem_queue_prepare(pt, q, error); wake_up_sem_queue_prepare(q, error, wake_q);
if (restart) if (restart)
goto again; goto again;
} }
@@ -984,24 +954,24 @@ static void set_semotime(struct sem_array *sma, struct sembuf *sops)
* @sops: operations that were performed * @sops: operations that were performed
* @nsops: number of operations * @nsops: number of operations
* @otime: force setting otime * @otime: force setting otime
* @pt: list head of the tasks that must be woken up. * @wake_q: lockless wake-queue head
* *
* do_smart_update() does the required calls to update_queue and wakeup_zero, * do_smart_update() does the required calls to update_queue and wakeup_zero,
* based on the actual changes that were performed on the semaphore array. * based on the actual changes that were performed on the semaphore array.
* Note that the function does not do the actual wake-up: the caller is * Note that the function does not do the actual wake-up: the caller is
* responsible for calling wake_up_sem_queue_do(@pt). * responsible for calling wake_up_q().
* It is safe to perform this call after dropping all locks. * It is safe to perform this call after dropping all locks.
*/ */
static void do_smart_update(struct sem_array *sma, struct sembuf *sops, int nsops, static void do_smart_update(struct sem_array *sma, struct sembuf *sops, int nsops,
int otime, struct list_head *pt) int otime, struct wake_q_head *wake_q)
{ {
int i; int i;
otime |= do_smart_wakeup_zero(sma, sops, nsops, pt); otime |= do_smart_wakeup_zero(sma, sops, nsops, wake_q);
if (!list_empty(&sma->pending_alter)) { if (!list_empty(&sma->pending_alter)) {
/* semaphore array uses the global queue - just process it. */ /* semaphore array uses the global queue - just process it. */
otime |= update_queue(sma, -1, pt); otime |= update_queue(sma, -1, wake_q);
} else { } else {
if (!sops) { if (!sops) {
/* /*
@@ -1009,7 +979,7 @@ static void do_smart_update(struct sem_array *sma, struct sembuf *sops, int nsop
* known. Check all. * known. Check all.
*/ */
for (i = 0; i < sma->sem_nsems; i++) for (i = 0; i < sma->sem_nsems; i++)
otime |= update_queue(sma, i, pt); otime |= update_queue(sma, i, wake_q);
} else { } else {
/* /*
* Check the semaphores that were increased: * Check the semaphores that were increased:
@@ -1023,7 +993,7 @@ static void do_smart_update(struct sem_array *sma, struct sembuf *sops, int nsop
for (i = 0; i < nsops; i++) { for (i = 0; i < nsops; i++) {
if (sops[i].sem_op > 0) { if (sops[i].sem_op > 0) {
otime |= update_queue(sma, otime |= update_queue(sma,
sops[i].sem_num, pt); sops[i].sem_num, wake_q);
} }
} }
} }
@@ -1111,8 +1081,8 @@ static void freeary(struct ipc_namespace *ns, struct kern_ipc_perm *ipcp)
struct sem_undo *un, *tu; struct sem_undo *un, *tu;
struct sem_queue *q, *tq; struct sem_queue *q, *tq;
struct sem_array *sma = container_of(ipcp, struct sem_array, sem_perm); struct sem_array *sma = container_of(ipcp, struct sem_array, sem_perm);
struct list_head tasks;
int i; int i;
DEFINE_WAKE_Q(wake_q);
/* Free the existing undo structures for this semaphore set. */ /* Free the existing undo structures for this semaphore set. */
ipc_assert_locked_object(&sma->sem_perm); ipc_assert_locked_object(&sma->sem_perm);
@@ -1126,25 +1096,24 @@ static void freeary(struct ipc_namespace *ns, struct kern_ipc_perm *ipcp)
} }
/* Wake up all pending processes and let them fail with EIDRM. */ /* Wake up all pending processes and let them fail with EIDRM. */
INIT_LIST_HEAD(&tasks);
list_for_each_entry_safe(q, tq, &sma->pending_const, list) { list_for_each_entry_safe(q, tq, &sma->pending_const, list) {
unlink_queue(sma, q); unlink_queue(sma, q);
wake_up_sem_queue_prepare(&tasks, q, -EIDRM); wake_up_sem_queue_prepare(q, -EIDRM, &wake_q);
} }
list_for_each_entry_safe(q, tq, &sma->pending_alter, list) { list_for_each_entry_safe(q, tq, &sma->pending_alter, list) {
unlink_queue(sma, q); unlink_queue(sma, q);
wake_up_sem_queue_prepare(&tasks, q, -EIDRM); wake_up_sem_queue_prepare(q, -EIDRM, &wake_q);
} }
for (i = 0; i < sma->sem_nsems; i++) { for (i = 0; i < sma->sem_nsems; i++) {
struct sem *sem = sma->sem_base + i; struct sem *sem = sma->sem_base + i;
list_for_each_entry_safe(q, tq, &sem->pending_const, list) { list_for_each_entry_safe(q, tq, &sem->pending_const, list) {
unlink_queue(sma, q); unlink_queue(sma, q);
wake_up_sem_queue_prepare(&tasks, q, -EIDRM); wake_up_sem_queue_prepare(q, -EIDRM, &wake_q);
} }
list_for_each_entry_safe(q, tq, &sem->pending_alter, list) { list_for_each_entry_safe(q, tq, &sem->pending_alter, list) {
unlink_queue(sma, q); unlink_queue(sma, q);
wake_up_sem_queue_prepare(&tasks, q, -EIDRM); wake_up_sem_queue_prepare(q, -EIDRM, &wake_q);
} }
} }
@@ -1153,7 +1122,7 @@ static void freeary(struct ipc_namespace *ns, struct kern_ipc_perm *ipcp)
sem_unlock(sma, -1); sem_unlock(sma, -1);
rcu_read_unlock(); rcu_read_unlock();
wake_up_sem_queue_do(&tasks); wake_up_q(&wake_q);
ns->used_sems -= sma->sem_nsems; ns->used_sems -= sma->sem_nsems;
ipc_rcu_putref(sma, sem_rcu_free); ipc_rcu_putref(sma, sem_rcu_free);
} }
@@ -1292,9 +1261,9 @@ static int semctl_setval(struct ipc_namespace *ns, int semid, int semnum,
struct sem_undo *un; struct sem_undo *un;
struct sem_array *sma; struct sem_array *sma;
struct sem *curr; struct sem *curr;
int err; int err, val;
struct list_head tasks; DEFINE_WAKE_Q(wake_q);
int val;
#if defined(CONFIG_64BIT) && defined(__BIG_ENDIAN) #if defined(CONFIG_64BIT) && defined(__BIG_ENDIAN)
/* big-endian 64bit */ /* big-endian 64bit */
val = arg >> 32; val = arg >> 32;
@@ -1306,8 +1275,6 @@ static int semctl_setval(struct ipc_namespace *ns, int semid, int semnum,
if (val > SEMVMX || val < 0) if (val > SEMVMX || val < 0)
return -ERANGE; return -ERANGE;
INIT_LIST_HEAD(&tasks);
rcu_read_lock(); rcu_read_lock();
sma = sem_obtain_object_check(ns, semid); sma = sem_obtain_object_check(ns, semid);
if (IS_ERR(sma)) { if (IS_ERR(sma)) {
@@ -1350,10 +1317,10 @@ static int semctl_setval(struct ipc_namespace *ns, int semid, int semnum,
curr->sempid = task_tgid_vnr(current); curr->sempid = task_tgid_vnr(current);
sma->sem_ctime = get_seconds(); sma->sem_ctime = get_seconds();
/* maybe some queued-up processes were waiting for this */ /* maybe some queued-up processes were waiting for this */
do_smart_update(sma, NULL, 0, 0, &tasks); do_smart_update(sma, NULL, 0, 0, &wake_q);
sem_unlock(sma, -1); sem_unlock(sma, -1);
rcu_read_unlock(); rcu_read_unlock();
wake_up_sem_queue_do(&tasks); wake_up_q(&wake_q);
return 0; return 0;
} }
@@ -1365,9 +1332,7 @@ static int semctl_main(struct ipc_namespace *ns, int semid, int semnum,
int err, nsems; int err, nsems;
ushort fast_sem_io[SEMMSL_FAST]; ushort fast_sem_io[SEMMSL_FAST];
ushort *sem_io = fast_sem_io; ushort *sem_io = fast_sem_io;
struct list_head tasks; DEFINE_WAKE_Q(wake_q);
INIT_LIST_HEAD(&tasks);
rcu_read_lock(); rcu_read_lock();
sma = sem_obtain_object_check(ns, semid); sma = sem_obtain_object_check(ns, semid);
@@ -1478,7 +1443,7 @@ static int semctl_main(struct ipc_namespace *ns, int semid, int semnum,
} }
sma->sem_ctime = get_seconds(); sma->sem_ctime = get_seconds();
/* maybe some queued-up processes were waiting for this */ /* maybe some queued-up processes were waiting for this */
do_smart_update(sma, NULL, 0, 0, &tasks); do_smart_update(sma, NULL, 0, 0, &wake_q);
err = 0; err = 0;
goto out_unlock; goto out_unlock;
} }
@@ -1514,7 +1479,7 @@ out_unlock:
sem_unlock(sma, -1); sem_unlock(sma, -1);
out_rcu_wakeup: out_rcu_wakeup:
rcu_read_unlock(); rcu_read_unlock();
wake_up_sem_queue_do(&tasks); wake_up_q(&wake_q);
out_free: out_free:
if (sem_io != fast_sem_io) if (sem_io != fast_sem_io)
ipc_free(sem_io); ipc_free(sem_io);
@@ -1787,32 +1752,6 @@ out:
return un; return un;
} }
/**
* get_queue_result - retrieve the result code from sem_queue
* @q: Pointer to queue structure
*
* Retrieve the return code from the pending queue. If IN_WAKEUP is found in
* q->status, then we must loop until the value is replaced with the final
* value: This may happen if a task is woken up by an unrelated event (e.g.
* signal) and in parallel the task is woken up by another task because it got
* the requested semaphores.
*
* The function can be called with or without holding the semaphore spinlock.
*/
static int get_queue_result(struct sem_queue *q)
{
int error;
error = q->status;
while (unlikely(error == IN_WAKEUP)) {
cpu_relax();
error = q->status;
}
return error;
}
SYSCALL_DEFINE4(semtimedop, int, semid, struct sembuf __user *, tsops, SYSCALL_DEFINE4(semtimedop, int, semid, struct sembuf __user *, tsops,
unsigned, nsops, const struct timespec __user *, timeout) unsigned, nsops, const struct timespec __user *, timeout)
{ {
@@ -1821,11 +1760,11 @@ SYSCALL_DEFINE4(semtimedop, int, semid, struct sembuf __user *, tsops,
struct sembuf fast_sops[SEMOPM_FAST]; struct sembuf fast_sops[SEMOPM_FAST];
struct sembuf *sops = fast_sops, *sop; struct sembuf *sops = fast_sops, *sop;
struct sem_undo *un; struct sem_undo *un;
int undos = 0, alter = 0, max, locknum; int max, locknum;
bool undos = false, alter = false, dupsop = false;
struct sem_queue queue; struct sem_queue queue;
unsigned long jiffies_left = 0; unsigned long dup = 0, jiffies_left = 0;
struct ipc_namespace *ns; struct ipc_namespace *ns;
struct list_head tasks;
ns = current->nsproxy->ipc_ns; ns = current->nsproxy->ipc_ns;
@@ -1838,10 +1777,12 @@ SYSCALL_DEFINE4(semtimedop, int, semid, struct sembuf __user *, tsops,
if (sops == NULL) if (sops == NULL)
return -ENOMEM; return -ENOMEM;
} }
if (copy_from_user(sops, tsops, nsops * sizeof(*tsops))) { if (copy_from_user(sops, tsops, nsops * sizeof(*tsops))) {
error = -EFAULT; error = -EFAULT;
goto out_free; goto out_free;
} }
if (timeout) { if (timeout) {
struct timespec _timeout; struct timespec _timeout;
if (copy_from_user(&_timeout, timeout, sizeof(*timeout))) { if (copy_from_user(&_timeout, timeout, sizeof(*timeout))) {
@@ -1855,18 +1796,30 @@ SYSCALL_DEFINE4(semtimedop, int, semid, struct sembuf __user *, tsops,
} }
jiffies_left = timespec_to_jiffies(&_timeout); jiffies_left = timespec_to_jiffies(&_timeout);
} }
max = 0; max = 0;
for (sop = sops; sop < sops + nsops; sop++) { for (sop = sops; sop < sops + nsops; sop++) {
unsigned long mask = 1ULL << ((sop->sem_num) % BITS_PER_LONG);
if (sop->sem_num >= max) if (sop->sem_num >= max)
max = sop->sem_num; max = sop->sem_num;
if (sop->sem_flg & SEM_UNDO) if (sop->sem_flg & SEM_UNDO)
undos = 1; undos = true;
if (sop->sem_op != 0) if (dup & mask) {
alter = 1; /*
* There was a previous alter access that appears
* to have accessed the same semaphore, thus use
* the dupsop logic. "appears", because the detection
* can only check % BITS_PER_LONG.
*/
dupsop = true;
}
if (sop->sem_op != 0) {
alter = true;
dup |= mask;
}
} }
INIT_LIST_HEAD(&tasks);
if (undos) { if (undos) {
/* On success, find_alloc_undo takes the rcu_read_lock */ /* On success, find_alloc_undo takes the rcu_read_lock */
un = find_alloc_undo(ns, semid); un = find_alloc_undo(ns, semid);
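
Why the bitmap above is enough: the single-scan fast path reads each semval only once, so two altering operations on the same semaphore could be mis-validated, and any suspected duplicate therefore falls back to perform_atomic_semop_slow(). The mask is a one-word filter over sem_num % BITS_PER_LONG, so it can give false positives (semaphores 64 apart on 64-bit), never false negatives; a false positive only costs the slow path. Stand-alone sketch of the detection, with a hypothetical helper name:

#include <linux/bitops.h>	/* BITS_PER_LONG */
#include <linux/sem.h>		/* struct sembuf */
#include <linux/types.h>

static bool sops_may_have_dup(const struct sembuf *sops, unsigned int nsops)
{
	unsigned long seen = 0;
	unsigned int i;

	for (i = 0; i < nsops; i++) {
		unsigned long mask = 1UL << (sops[i].sem_num % BITS_PER_LONG);

		if (seen & mask)
			return true;	/* may collide with an earlier altering op */
		if (sops[i].sem_op != 0)
			seen |= mask;	/* only altering ops are recorded */
	}
	return false;
}
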
@@ -1887,16 +1840,22 @@ SYSCALL_DEFINE4(semtimedop, int, semid, struct sembuf __user *, tsops,
} }
error = -EFBIG; error = -EFBIG;
if (max >= sma->sem_nsems) if (max >= sma->sem_nsems) {
goto out_rcu_wakeup; rcu_read_unlock();
goto out_free;
}
error = -EACCES; error = -EACCES;
if (ipcperms(ns, &sma->sem_perm, alter ? S_IWUGO : S_IRUGO)) if (ipcperms(ns, &sma->sem_perm, alter ? S_IWUGO : S_IRUGO)) {
goto out_rcu_wakeup; rcu_read_unlock();
goto out_free;
}
error = security_sem_semop(sma, sops, nsops, alter); error = security_sem_semop(sma, sops, nsops, alter);
if (error) if (error) {
goto out_rcu_wakeup; rcu_read_unlock();
goto out_free;
}
error = -EIDRM; error = -EIDRM;
locknum = sem_lock(sma, sops, nsops); locknum = sem_lock(sma, sops, nsops);
@@ -1925,24 +1884,34 @@ SYSCALL_DEFINE4(semtimedop, int, semid, struct sembuf __user *, tsops,
queue.undo = un; queue.undo = un;
queue.pid = task_tgid_vnr(current); queue.pid = task_tgid_vnr(current);
queue.alter = alter; queue.alter = alter;
queue.dupsop = dupsop;
error = perform_atomic_semop(sma, &queue); error = perform_atomic_semop(sma, &queue);
if (error == 0) { if (error == 0) { /* non-blocking succesfull path */
/* If the operation was successful, then do DEFINE_WAKE_Q(wake_q);
/*
* If the operation was successful, then do
* the required updates. * the required updates.
*/ */
if (alter) if (alter)
do_smart_update(sma, sops, nsops, 1, &tasks); do_smart_update(sma, sops, nsops, 1, &wake_q);
else else
set_semotime(sma, sops); set_semotime(sma, sops);
sem_unlock(sma, locknum);
rcu_read_unlock();
wake_up_q(&wake_q);
goto out_free;
} }
if (error <= 0) if (error < 0) /* non-blocking error path */
goto out_unlock_free; goto out_unlock_free;
/* We need to sleep on this operation, so we put the current /*
* We need to sleep on this operation, so we put the current
* task into the pending queue and go to sleep. * task into the pending queue and go to sleep.
*/ */
if (nsops == 1) { if (nsops == 1) {
struct sem *curr; struct sem *curr;
curr = &sma->sem_base[sops->sem_num]; curr = &sma->sem_base[sops->sem_num];
@@ -1971,77 +1940,69 @@ SYSCALL_DEFINE4(semtimedop, int, semid, struct sembuf __user *, tsops,
sma->complex_count++; sma->complex_count++;
} }
queue.status = -EINTR; do {
queue.sleeper = current; queue.status = -EINTR;
queue.sleeper = current;
sleep_again: __set_current_state(TASK_INTERRUPTIBLE);
__set_current_state(TASK_INTERRUPTIBLE); sem_unlock(sma, locknum);
sem_unlock(sma, locknum);
rcu_read_unlock();
if (timeout)
jiffies_left = schedule_timeout(jiffies_left);
else
schedule();
error = get_queue_result(&queue);
if (error != -EINTR) {
/* fast path: update_queue already obtained all requested
* resources.
* Perform a smp_mb(): User space could assume that semop()
* is a memory barrier: Without the mb(), the cpu could
* speculatively read in user space stale data that was
* overwritten by the previous owner of the semaphore.
*/
smp_mb();
goto out_free;
}
rcu_read_lock();
sma = sem_obtain_lock(ns, semid, sops, nsops, &locknum);
/*
* Wait until it's guaranteed that no wakeup_sem_queue_do() is ongoing.
*/
error = get_queue_result(&queue);
/*
* Array removed? If yes, leave without sem_unlock().
*/
if (IS_ERR(sma)) {
rcu_read_unlock(); rcu_read_unlock();
goto out_free;
}
if (timeout)
jiffies_left = schedule_timeout(jiffies_left);
else
schedule();
/* /*
* If queue.status != -EINTR we are woken up by another process. * fastpath: the semop has completed, either successfully or
* Leave without unlink_queue(), but with sem_unlock(). * not, from the syscall pov, is quite irrelevant to us at this
*/ * point; we're done.
if (error != -EINTR) *
goto out_unlock_free; * We _do_ care, nonetheless, about being awoken by a signal or
* spuriously. The queue.status is checked again in the
* slowpath (aka after taking sem_lock), such that we can detect
* scenarios where we were awakened externally, during the
* window between wake_q_add() and wake_up_q().
*/
error = READ_ONCE(queue.status);
if (error != -EINTR) {
/*
* User space could assume that semop() is a memory
* barrier: Without the mb(), the cpu could
* speculatively read in userspace stale data that was
* overwritten by the previous owner of the semaphore.
*/
smp_mb();
goto out_free;
}
/* rcu_read_lock();
* If an interrupt occurred we have to clean up the queue sem_lock(sma, sops, nsops);
*/
if (timeout && jiffies_left == 0)
error = -EAGAIN;
/* if (!ipc_valid_object(&sma->sem_perm))
* If the wakeup was spurious, just retry goto out_unlock_free;
*/
if (error == -EINTR && !signal_pending(current)) error = READ_ONCE(queue.status);
goto sleep_again;
/*
* If queue.status != -EINTR we are woken up by another process.
* Leave without unlink_queue(), but with sem_unlock().
*/
if (error != -EINTR)
goto out_unlock_free;
/*
* If an interrupt occurred we have to clean up the queue.
*/
if (timeout && jiffies_left == 0)
error = -EAGAIN;
} while (error == -EINTR && !signal_pending(current)); /* spurious */
unlink_queue(sma, &queue); unlink_queue(sma, &queue);
out_unlock_free: out_unlock_free:
sem_unlock(sma, locknum); sem_unlock(sma, locknum);
out_rcu_wakeup:
rcu_read_unlock(); rcu_read_unlock();
wake_up_sem_queue_do(&tasks);
out_free: out_free:
if (sops != fast_sops) if (sops != fast_sops)
kfree(sops); kfree(sops);
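
For symmetry with the waker sketch earlier, the sleeper side of the loop above reduces to roughly the following. It is a sketch only, with the same hypothetical struct waiter (->list, ->task, ->status) and lock, and with error handling trimmed: the -EINTR marker means "still waiting", the lockless read after schedule() is the fast path, and a wake-up that raced with a signal is sorted out by re-reading the status under the lock.

#include <linux/errno.h>
#include <linux/list.h>
#include <linux/sched.h>
#include <linux/spinlock.h>

struct waiter {			/* hypothetical, as in the earlier sketch */
	struct list_head list;
	struct task_struct *task;
	int status;
};

static int wait_for_waker(spinlock_t *lock, struct list_head *pending,
			  struct waiter *w)
{
	int status;

	spin_lock(lock);
	w->task = current;
	w->status = -EINTR;			/* "still waiting" marker */
	list_add_tail(&w->list, pending);

	do {
		__set_current_state(TASK_INTERRUPTIBLE);
		spin_unlock(lock);

		schedule();

		/* Fast path: the waker finished before we got back here. */
		status = READ_ONCE(w->status);
		if (status != -EINTR)
			return status;

		/* Slow path: re-check under the lock to tell real wake-ups
		 * apart from signals and spurious wake-ups. */
		spin_lock(lock);
		status = READ_ONCE(w->status);
	} while (status == -EINTR && !signal_pending(current));

	if (status == -EINTR)			/* interrupted: unlink ourselves */
		list_del(&w->list);
	spin_unlock(lock);
	return status;
}
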
@@ -2102,8 +2063,8 @@ void exit_sem(struct task_struct *tsk)
for (;;) { for (;;) {
struct sem_array *sma; struct sem_array *sma;
struct sem_undo *un; struct sem_undo *un;
struct list_head tasks;
int semid, i; int semid, i;
DEFINE_WAKE_Q(wake_q);
cond_resched(); cond_resched();
@@ -2191,11 +2152,10 @@ void exit_sem(struct task_struct *tsk)
} }
} }
/* maybe some queued-up processes were waiting for this */ /* maybe some queued-up processes were waiting for this */
INIT_LIST_HEAD(&tasks); do_smart_update(sma, NULL, 0, 1, &wake_q);
do_smart_update(sma, NULL, 0, 1, &tasks);
sem_unlock(sma, -1); sem_unlock(sma, -1);
rcu_read_unlock(); rcu_read_unlock();
wake_up_sem_queue_do(&tasks); wake_up_q(&wake_q);
kfree_rcu(un, rcu); kfree_rcu(un, rcu);
} }

View File

@@ -89,6 +89,7 @@ void shm_init_ns(struct ipc_namespace *ns)
static void do_shm_rmid(struct ipc_namespace *ns, struct kern_ipc_perm *ipcp) static void do_shm_rmid(struct ipc_namespace *ns, struct kern_ipc_perm *ipcp)
{ {
struct shmid_kernel *shp; struct shmid_kernel *shp;
shp = container_of(ipcp, struct shmid_kernel, shm_perm); shp = container_of(ipcp, struct shmid_kernel, shm_perm);
if (shp->shm_nattch) { if (shp->shm_nattch) {
@@ -387,6 +388,7 @@ static int shm_set_policy(struct vm_area_struct *vma, struct mempolicy *new)
struct file *file = vma->vm_file; struct file *file = vma->vm_file;
struct shm_file_data *sfd = shm_file_data(file); struct shm_file_data *sfd = shm_file_data(file);
int err = 0; int err = 0;
if (sfd->vm_ops->set_policy) if (sfd->vm_ops->set_policy)
err = sfd->vm_ops->set_policy(vma, new); err = sfd->vm_ops->set_policy(vma, new);
return err; return err;
@@ -417,7 +419,7 @@ static int shm_mmap(struct file *file, struct vm_area_struct *vma)
* In case of remap_file_pages() emulation, the file can represent * In case of remap_file_pages() emulation, the file can represent
* removed IPC ID: propogate shm_lock() error to caller. * removed IPC ID: propogate shm_lock() error to caller.
*/ */
ret =__shm_open(vma); ret = __shm_open(vma);
if (ret) if (ret)
return ret; return ret;
@@ -468,6 +470,7 @@ static unsigned long shm_get_unmapped_area(struct file *file,
unsigned long flags) unsigned long flags)
{ {
struct shm_file_data *sfd = shm_file_data(file); struct shm_file_data *sfd = shm_file_data(file);
return sfd->file->f_op->get_unmapped_area(sfd->file, addr, len, return sfd->file->f_op->get_unmapped_area(sfd->file, addr, len,
pgoff, flags); pgoff, flags);
} }
@@ -766,6 +769,7 @@ static void shm_add_rss_swap(struct shmid_kernel *shp,
} else { } else {
#ifdef CONFIG_SHMEM #ifdef CONFIG_SHMEM
struct shmem_inode_info *info = SHMEM_I(inode); struct shmem_inode_info *info = SHMEM_I(inode);
spin_lock_irq(&info->lock); spin_lock_irq(&info->lock);
*rss_add += inode->i_mapping->nrpages; *rss_add += inode->i_mapping->nrpages;
*swp_add += info->swapped; *swp_add += info->swapped;
@@ -1028,6 +1032,7 @@ SYSCALL_DEFINE3(shmctl, int, shmid, int, cmd, struct shmid_ds __user *, buf)
if (!ns_capable(ns->user_ns, CAP_IPC_LOCK)) { if (!ns_capable(ns->user_ns, CAP_IPC_LOCK)) {
kuid_t euid = current_euid(); kuid_t euid = current_euid();
if (!uid_eq(euid, shp->shm_perm.uid) && if (!uid_eq(euid, shp->shm_perm.uid) &&
!uid_eq(euid, shp->shm_perm.cuid)) { !uid_eq(euid, shp->shm_perm.cuid)) {
err = -EPERM; err = -EPERM;
@@ -1045,6 +1050,7 @@ SYSCALL_DEFINE3(shmctl, int, shmid, int, cmd, struct shmid_ds __user *, buf)
if (cmd == SHM_LOCK) { if (cmd == SHM_LOCK) {
struct user_struct *user = current_user(); struct user_struct *user = current_user();
err = shmem_lock(shm_file, 1, user); err = shmem_lock(shm_file, 1, user);
if (!err && !(shp->shm_perm.mode & SHM_LOCKED)) { if (!err && !(shp->shm_perm.mode & SHM_LOCKED)) {
shp->shm_perm.mode |= SHM_LOCKED; shp->shm_perm.mode |= SHM_LOCKED;
@@ -1354,9 +1360,10 @@ SYSCALL_DEFINE1(shmdt, char __user *, shmaddr)
vma = next; vma = next;
} }
#else /* CONFIG_MMU */ #else /* CONFIG_MMU */
/* under NOMMU conditions, the exact address to be destroyed must be /* under NOMMU conditions, the exact address to be destroyed must be
* given */ * given
*/
if (vma && vma->vm_start == addr && vma->vm_ops == &shm_vm_ops) { if (vma && vma->vm_start == addr && vma->vm_ops == &shm_vm_ops) {
do_munmap(mm, vma->vm_start, vma->vm_end - vma->vm_start); do_munmap(mm, vma->vm_start, vma->vm_end - vma->vm_start);
retval = 0; retval = 0;

View File

@@ -84,6 +84,7 @@ obj-$(CONFIG_KPROBES) += kprobes.o
obj-$(CONFIG_KGDB) += debug/ obj-$(CONFIG_KGDB) += debug/
obj-$(CONFIG_DETECT_HUNG_TASK) += hung_task.o obj-$(CONFIG_DETECT_HUNG_TASK) += hung_task.o
obj-$(CONFIG_LOCKUP_DETECTOR) += watchdog.o obj-$(CONFIG_LOCKUP_DETECTOR) += watchdog.o
obj-$(CONFIG_HARDLOCKUP_DETECTOR) += watchdog_hld.o
obj-$(CONFIG_SECCOMP) += seccomp.o obj-$(CONFIG_SECCOMP) += seccomp.o
obj-$(CONFIG_RELAY) += relay.o obj-$(CONFIG_RELAY) += relay.o
obj-$(CONFIG_SYSCTL) += utsname_sysctl.o obj-$(CONFIG_SYSCTL) += utsname_sysctl.o

View File

@@ -598,11 +598,11 @@ return_normal:
/* /*
* Wait for the other CPUs to be notified and be waiting for us: * Wait for the other CPUs to be notified and be waiting for us:
*/ */
time_left = loops_per_jiffy * HZ; time_left = MSEC_PER_SEC;
while (kgdb_do_roundup && --time_left && while (kgdb_do_roundup && --time_left &&
(atomic_read(&masters_in_kgdb) + atomic_read(&slaves_in_kgdb)) != (atomic_read(&masters_in_kgdb) + atomic_read(&slaves_in_kgdb)) !=
online_cpus) online_cpus)
cpu_relax(); udelay(1000);
if (!time_left) if (!time_left)
pr_crit("Timed out waiting for secondary CPUs.\n"); pr_crit("Timed out waiting for secondary CPUs.\n");
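
The replacement bounds the wait by wall-clock time (roughly one second of 1ms delays) instead of a loops_per_jiffy-scaled busy spin, so the timeout no longer depends on CPU speed. The same idea in isolation, with a hypothetical counter:

#include <linux/atomic.h>
#include <linux/delay.h>
#include <linux/time64.h>	/* MSEC_PER_SEC */
#include <linux/types.h>

/* Wait up to about one second for @cpus_in to reach @expected. */
static bool wait_for_cpus(atomic_t *cpus_in, int expected)
{
	unsigned int time_left = MSEC_PER_SEC;

	while (--time_left && atomic_read(cpus_in) != expected)
		udelay(1000);		/* 1ms per iteration, as in the patch */

	return time_left != 0;
}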

View File

@@ -30,6 +30,7 @@
char kdb_prompt_str[CMD_BUFLEN]; char kdb_prompt_str[CMD_BUFLEN];
int kdb_trap_printk; int kdb_trap_printk;
int kdb_printf_cpu = -1;
static int kgdb_transition_check(char *buffer) static int kgdb_transition_check(char *buffer)
{ {
@@ -554,31 +555,26 @@ int vkdb_printf(enum kdb_msgsrc src, const char *fmt, va_list ap)
int linecount; int linecount;
int colcount; int colcount;
int logging, saved_loglevel = 0; int logging, saved_loglevel = 0;
int saved_trap_printk;
int got_printf_lock = 0;
int retlen = 0; int retlen = 0;
int fnd, len; int fnd, len;
int this_cpu, old_cpu;
char *cp, *cp2, *cphold = NULL, replaced_byte = ' '; char *cp, *cp2, *cphold = NULL, replaced_byte = ' ';
char *moreprompt = "more> "; char *moreprompt = "more> ";
struct console *c = console_drivers; struct console *c = console_drivers;
static DEFINE_SPINLOCK(kdb_printf_lock);
unsigned long uninitialized_var(flags); unsigned long uninitialized_var(flags);
preempt_disable();
saved_trap_printk = kdb_trap_printk;
kdb_trap_printk = 0;
/* Serialize kdb_printf if multiple cpus try to write at once. /* Serialize kdb_printf if multiple cpus try to write at once.
* But if any cpu goes recursive in kdb, just print the output, * But if any cpu goes recursive in kdb, just print the output,
* even if it is interleaved with any other text. * even if it is interleaved with any other text.
*/ */
if (!KDB_STATE(PRINTF_LOCK)) { local_irq_save(flags);
KDB_STATE_SET(PRINTF_LOCK); this_cpu = smp_processor_id();
spin_lock_irqsave(&kdb_printf_lock, flags); for (;;) {
got_printf_lock = 1; old_cpu = cmpxchg(&kdb_printf_cpu, -1, this_cpu);
atomic_inc(&kdb_event); if (old_cpu == -1 || old_cpu == this_cpu)
} else { break;
__acquire(kdb_printf_lock);
cpu_relax();
} }
diag = kdbgetintenv("LINES", &linecount); diag = kdbgetintenv("LINES", &linecount);
@@ -847,16 +843,9 @@ kdb_print_out:
suspend_grep = 0; /* end of what may have been a recursive call */ suspend_grep = 0; /* end of what may have been a recursive call */
if (logging) if (logging)
console_loglevel = saved_loglevel; console_loglevel = saved_loglevel;
if (KDB_STATE(PRINTF_LOCK) && got_printf_lock) { /* kdb_printf_cpu locked the code above. */
got_printf_lock = 0; smp_store_release(&kdb_printf_cpu, old_cpu);
spin_unlock_irqrestore(&kdb_printf_lock, flags); local_irq_restore(flags);
KDB_STATE_CLEAR(PRINTF_LOCK);
atomic_dec(&kdb_event);
} else {
__release(kdb_printf_lock);
}
kdb_trap_printk = saved_trap_printk;
preempt_enable();
return retlen; return retlen;
} }
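
The serialization above is a CPU-owned, recursion-tolerant lock: cmpxchg() installs the CPU number as the owner (or lets the same CPU straight back in), and smp_store_release() publishes either -1 or, on a recursive call, the previous owner. A reduced sketch with hypothetical names; hypothetical_emit() stands in for the real output path and may itself recurse:

#include <linux/atomic.h>
#include <linux/irqflags.h>
#include <linux/sched.h>	/* cpu_relax() */
#include <linux/smp.h>

static int printf_cpu = -1;		/* hypothetical; -1 means unowned */

static void hypothetical_emit(const char *msg)
{
	(void)msg;
}

static void serialized_emit(const char *msg)
{
	unsigned long flags;
	int this_cpu, old_cpu;

	local_irq_save(flags);
	this_cpu = smp_processor_id();
	for (;;) {
		old_cpu = cmpxchg(&printf_cpu, -1, this_cpu);
		if (old_cpu == -1 || old_cpu == this_cpu)
			break;		/* acquired, or recursing on the owner CPU */
		cpu_relax();
	}

	hypothetical_emit(msg);

	/* Outermost caller stores -1; a recursive caller restores its own CPU. */
	smp_store_release(&printf_cpu, old_cpu);
	local_irq_restore(flags);
}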

View File

@@ -60,7 +60,6 @@ int kdb_grep_trailing;
* Kernel debugger state flags * Kernel debugger state flags
*/ */
int kdb_flags; int kdb_flags;
atomic_t kdb_event;
/* /*
* kdb_lock protects updates to kdb_initial_cpu. Used to * kdb_lock protects updates to kdb_initial_cpu. Used to

View File

@@ -132,7 +132,6 @@ extern int kdb_state;
#define KDB_STATE_PAGER 0x00000400 /* pager is available */ #define KDB_STATE_PAGER 0x00000400 /* pager is available */
#define KDB_STATE_GO_SWITCH 0x00000800 /* go is switching #define KDB_STATE_GO_SWITCH 0x00000800 /* go is switching
* back to initial cpu */ * back to initial cpu */
#define KDB_STATE_PRINTF_LOCK 0x00001000 /* Holds kdb_printf lock */
#define KDB_STATE_WAIT_IPI 0x00002000 /* Waiting for kdb_ipi() NMI */ #define KDB_STATE_WAIT_IPI 0x00002000 /* Waiting for kdb_ipi() NMI */
#define KDB_STATE_RECURSE 0x00004000 /* Recursive entry to kdb */ #define KDB_STATE_RECURSE 0x00004000 /* Recursive entry to kdb */
#define KDB_STATE_IP_ADJUSTED 0x00008000 /* Restart IP has been #define KDB_STATE_IP_ADJUSTED 0x00008000 /* Restart IP has been

View File

@@ -301,7 +301,7 @@ int uprobe_write_opcode(struct mm_struct *mm, unsigned long vaddr,
retry: retry:
/* Read the page with vaddr into memory */ /* Read the page with vaddr into memory */
ret = get_user_pages_remote(NULL, mm, vaddr, 1, FOLL_FORCE, &old_page, ret = get_user_pages_remote(NULL, mm, vaddr, 1, FOLL_FORCE, &old_page,
&vma); &vma, NULL);
if (ret <= 0) if (ret <= 0)
return ret; return ret;
@@ -1712,7 +1712,7 @@ static int is_trap_at_addr(struct mm_struct *mm, unsigned long vaddr)
* essentially a kernel access to the memory. * essentially a kernel access to the memory.
*/ */
result = get_user_pages_remote(NULL, mm, vaddr, 1, FOLL_FORCE, &page, result = get_user_pages_remote(NULL, mm, vaddr, 1, FOLL_FORCE, &page,
NULL); NULL, NULL);
if (result < 0) if (result < 0)
return result; return result;

View File

@@ -1,11 +1,16 @@
#define pr_fmt(fmt) "kcov: " fmt #define pr_fmt(fmt) "kcov: " fmt
#define DISABLE_BRANCH_PROFILING #define DISABLE_BRANCH_PROFILING
#include <linux/atomic.h>
#include <linux/compiler.h> #include <linux/compiler.h>
#include <linux/errno.h>
#include <linux/export.h>
#include <linux/types.h> #include <linux/types.h>
#include <linux/file.h> #include <linux/file.h>
#include <linux/fs.h> #include <linux/fs.h>
#include <linux/init.h>
#include <linux/mm.h> #include <linux/mm.h>
#include <linux/preempt.h>
#include <linux/printk.h> #include <linux/printk.h>
#include <linux/sched.h> #include <linux/sched.h>
#include <linux/slab.h> #include <linux/slab.h>

View File

@@ -441,6 +441,8 @@ static struct page *kimage_alloc_crash_control_pages(struct kimage *image,
while (hole_end <= crashk_res.end) { while (hole_end <= crashk_res.end) {
unsigned long i; unsigned long i;
cond_resched();
if (hole_end > KEXEC_CRASH_CONTROL_MEMORY_LIMIT) if (hole_end > KEXEC_CRASH_CONTROL_MEMORY_LIMIT)
break; break;
/* See if I overlap any of the segments */ /* See if I overlap any of the segments */
@@ -1467,9 +1469,6 @@ static int __init crash_save_vmcoreinfo_init(void)
#endif #endif
VMCOREINFO_NUMBER(PG_head_mask); VMCOREINFO_NUMBER(PG_head_mask);
VMCOREINFO_NUMBER(PAGE_BUDDY_MAPCOUNT_VALUE); VMCOREINFO_NUMBER(PAGE_BUDDY_MAPCOUNT_VALUE);
#ifdef CONFIG_X86
VMCOREINFO_NUMBER(KERNEL_IMAGE_SIZE);
#endif
#ifdef CONFIG_HUGETLB_PAGE #ifdef CONFIG_HUGETLB_PAGE
VMCOREINFO_NUMBER(HUGETLB_PAGE_DTOR); VMCOREINFO_NUMBER(HUGETLB_PAGE_DTOR);
#endif #endif

View File

@@ -1926,7 +1926,8 @@ int vprintk_default(const char *fmt, va_list args)
int r; int r;
#ifdef CONFIG_KGDB_KDB #ifdef CONFIG_KGDB_KDB
if (unlikely(kdb_trap_printk)) { /* Allow to pass printk() to kdb but avoid a recursion. */
if (unlikely(kdb_trap_printk && kdb_printf_cpu < 0)) {
r = vkdb_printf(KDB_MSGSRC_PRINTK, fmt, args); r = vkdb_printf(KDB_MSGSRC_PRINTK, fmt, args);
return r; return r;
} }

View File

@@ -809,11 +809,11 @@ void relay_subbufs_consumed(struct rchan *chan,
{ {
struct rchan_buf *buf; struct rchan_buf *buf;
if (!chan) if (!chan || cpu >= NR_CPUS)
return; return;
buf = *per_cpu_ptr(chan->buf, cpu); buf = *per_cpu_ptr(chan->buf, cpu);
if (cpu >= NR_CPUS || !buf || subbufs_consumed > chan->n_subbufs) if (!buf || subbufs_consumed > chan->n_subbufs)
return; return;
if (subbufs_consumed > buf->subbufs_produced - buf->subbufs_consumed) if (subbufs_consumed > buf->subbufs_produced - buf->subbufs_consumed)
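
The point of the reordering: @cpu is used to compute a per-cpu pointer, so it must be range-checked before per_cpu_ptr() is called, not after the result has already been dereferenced. The same shape in isolation (helper name is hypothetical):

#include <linux/percpu.h>
#include <linux/relay.h>
#include <linux/threads.h>	/* NR_CPUS */

static struct rchan_buf *chan_buf_for_cpu(struct rchan *chan, unsigned int cpu)
{
	if (!chan || cpu >= NR_CPUS)		/* validate before use */
		return NULL;

	return *per_cpu_ptr(chan->buf, cpu);	/* cpu is known to be in range */
}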

View File

@@ -2491,6 +2491,13 @@ void __set_current_blocked(const sigset_t *newset)
{ {
struct task_struct *tsk = current; struct task_struct *tsk = current;
/*
* In case the signal mask hasn't changed, there is nothing we need
* to do. The current->blocked shouldn't be modified by other task.
*/
if (sigequalsets(&tsk->blocked, newset))
return;
spin_lock_irq(&tsk->sighand->siglock); spin_lock_irq(&tsk->sighand->siglock);
__set_task_blocked(tsk, newset); __set_task_blocked(tsk, newset);
spin_unlock_irq(&tsk->sighand->siglock); spin_unlock_irq(&tsk->sighand->siglock);
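
The user-visible effect, as a sketch: re-installing an unchanged signal mask, a common pattern around critical sections, can now return before taking siglock.

#include <signal.h>

/* Save the current mask and immediately set it back; with the check above the
 * second call can return early without touching the per-thread siglock. */
static void reinstall_current_mask(void)
{
	sigset_t cur;

	sigprocmask(SIG_SETMASK, NULL, &cur);	/* query only */
	sigprocmask(SIG_SETMASK, &cur, NULL);	/* same set as before */
}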

View File

@@ -2389,9 +2389,11 @@ static void validate_coredump_safety(void)
#ifdef CONFIG_COREDUMP #ifdef CONFIG_COREDUMP
if (suid_dumpable == SUID_DUMP_ROOT && if (suid_dumpable == SUID_DUMP_ROOT &&
core_pattern[0] != '/' && core_pattern[0] != '|') { core_pattern[0] != '/' && core_pattern[0] != '|') {
printk(KERN_WARNING "Unsafe core_pattern used with "\ printk(KERN_WARNING
"suid_dumpable=2. Pipe handler or fully qualified "\ "Unsafe core_pattern used with fs.suid_dumpable=2.\n"
"core dump path required.\n"); "Pipe handler or fully qualified core dump path required.\n"
"Set kernel.core_pattern before fs.suid_dumpable.\n"
);
} }
#endif #endif
} }

View File

@@ -1354,8 +1354,8 @@ static void deprecated_sysctl_warning(const int *name, int nlen)
"warning: process `%s' used the deprecated sysctl " "warning: process `%s' used the deprecated sysctl "
"system call with ", current->comm); "system call with ", current->comm);
for (i = 0; i < nlen; i++) for (i = 0; i < nlen; i++)
printk("%d.", name[i]); printk(KERN_CONT "%d.", name[i]);
printk("\n"); printk(KERN_CONT "\n");
} }
return; return;
} }
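
Since printk() treats every call as a separate record unless KERN_CONT is given, continuation fragments have to say so explicitly. A minimal sketch of the corrected pattern (function name is made up):

#include <linux/kernel.h>
#include <linux/printk.h>

static void print_name_vector(const int *name, int nlen)
{
	int i;

	printk(KERN_INFO "sysctl name:");
	for (i = 0; i < nlen; i++)
		printk(KERN_CONT " %d", name[i]);	/* stays on the same line */
	printk(KERN_CONT "\n");				/* terminate the record */
}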

View File

@@ -516,7 +516,8 @@ static enum alarmtimer_restart alarm_handle_timer(struct alarm *alarm,
spin_lock_irqsave(&ptr->it_lock, flags); spin_lock_irqsave(&ptr->it_lock, flags);
if ((ptr->it_sigev_notify & ~SIGEV_THREAD_ID) != SIGEV_NONE) { if ((ptr->it_sigev_notify & ~SIGEV_THREAD_ID) != SIGEV_NONE) {
if (posix_timer_event(ptr, 0) != 0) if (IS_ENABLED(CONFIG_POSIX_TIMERS) &&
posix_timer_event(ptr, 0) != 0)
ptr->it_overrun++; ptr->it_overrun++;
} }
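
IS_ENABLED() evaluates to a compile-time 0 or 1, so when CONFIG_POSIX_TIMERS is off the short-circuited call above is still type-checked but compiled away. A generic sketch with a hypothetical option and helper:

#include <linux/kconfig.h>
#include <linux/printk.h>

static int my_feature_report(void)	/* stand-in for posix_timer_event() */
{
	return 0;
}

static void maybe_report(void)
{
	/* Dead code when CONFIG_MY_FEATURE is not set, but still compiled. */
	if (IS_ENABLED(CONFIG_MY_FEATURE) && my_feature_report() != 0)
		pr_warn("report failed\n");
}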

View File

@@ -24,32 +24,14 @@
#include <asm/irq_regs.h> #include <asm/irq_regs.h>
#include <linux/kvm_para.h> #include <linux/kvm_para.h>
#include <linux/perf_event.h>
#include <linux/kthread.h> #include <linux/kthread.h>
/*
* The run state of the lockup detectors is controlled by the content of the
* 'watchdog_enabled' variable. Each lockup detector has its dedicated bit -
* bit 0 for the hard lockup detector and bit 1 for the soft lockup detector.
*
* 'watchdog_user_enabled', 'nmi_watchdog_enabled' and 'soft_watchdog_enabled'
* are variables that are only used as an 'interface' between the parameters
* in /proc/sys/kernel and the internal state bits in 'watchdog_enabled'. The
* 'watchdog_thresh' variable is handled differently because its value is not
* boolean, and the lockup detectors are 'suspended' while 'watchdog_thresh'
* is equal zero.
*/
#define NMI_WATCHDOG_ENABLED_BIT 0
#define SOFT_WATCHDOG_ENABLED_BIT 1
#define NMI_WATCHDOG_ENABLED (1 << NMI_WATCHDOG_ENABLED_BIT)
#define SOFT_WATCHDOG_ENABLED (1 << SOFT_WATCHDOG_ENABLED_BIT)
static DEFINE_MUTEX(watchdog_proc_mutex); static DEFINE_MUTEX(watchdog_proc_mutex);
#ifdef CONFIG_HARDLOCKUP_DETECTOR #if defined(CONFIG_HAVE_NMI_WATCHDOG) || defined(CONFIG_HARDLOCKUP_DETECTOR)
static unsigned long __read_mostly watchdog_enabled = SOFT_WATCHDOG_ENABLED|NMI_WATCHDOG_ENABLED; unsigned long __read_mostly watchdog_enabled = SOFT_WATCHDOG_ENABLED|NMI_WATCHDOG_ENABLED;
#else #else
static unsigned long __read_mostly watchdog_enabled = SOFT_WATCHDOG_ENABLED; unsigned long __read_mostly watchdog_enabled = SOFT_WATCHDOG_ENABLED;
#endif #endif
int __read_mostly nmi_watchdog_enabled; int __read_mostly nmi_watchdog_enabled;
int __read_mostly soft_watchdog_enabled; int __read_mostly soft_watchdog_enabled;
@@ -59,9 +41,6 @@ int __read_mostly watchdog_thresh = 10;
#ifdef CONFIG_SMP #ifdef CONFIG_SMP
int __read_mostly sysctl_softlockup_all_cpu_backtrace; int __read_mostly sysctl_softlockup_all_cpu_backtrace;
int __read_mostly sysctl_hardlockup_all_cpu_backtrace; int __read_mostly sysctl_hardlockup_all_cpu_backtrace;
#else
#define sysctl_softlockup_all_cpu_backtrace 0
#define sysctl_hardlockup_all_cpu_backtrace 0
#endif #endif
static struct cpumask watchdog_cpumask __read_mostly; static struct cpumask watchdog_cpumask __read_mostly;
unsigned long *watchdog_cpumask_bits = cpumask_bits(&watchdog_cpumask); unsigned long *watchdog_cpumask_bits = cpumask_bits(&watchdog_cpumask);
@@ -100,50 +79,9 @@ static DEFINE_PER_CPU(bool, soft_watchdog_warn);
static DEFINE_PER_CPU(unsigned long, hrtimer_interrupts); static DEFINE_PER_CPU(unsigned long, hrtimer_interrupts);
static DEFINE_PER_CPU(unsigned long, soft_lockup_hrtimer_cnt); static DEFINE_PER_CPU(unsigned long, soft_lockup_hrtimer_cnt);
static DEFINE_PER_CPU(struct task_struct *, softlockup_task_ptr_saved); static DEFINE_PER_CPU(struct task_struct *, softlockup_task_ptr_saved);
#ifdef CONFIG_HARDLOCKUP_DETECTOR
static DEFINE_PER_CPU(bool, hard_watchdog_warn);
static DEFINE_PER_CPU(bool, watchdog_nmi_touch);
static DEFINE_PER_CPU(unsigned long, hrtimer_interrupts_saved); static DEFINE_PER_CPU(unsigned long, hrtimer_interrupts_saved);
static DEFINE_PER_CPU(struct perf_event *, watchdog_ev);
#endif
static unsigned long soft_lockup_nmi_warn; static unsigned long soft_lockup_nmi_warn;
/* boot commands */
/*
* Should we panic when a soft-lockup or hard-lockup occurs:
*/
#ifdef CONFIG_HARDLOCKUP_DETECTOR
unsigned int __read_mostly hardlockup_panic =
CONFIG_BOOTPARAM_HARDLOCKUP_PANIC_VALUE;
static unsigned long hardlockup_allcpu_dumped;
/*
* We may not want to enable hard lockup detection by default in all cases,
* for example when running the kernel as a guest on a hypervisor. In these
* cases this function can be called to disable hard lockup detection. This
* function should only be executed once by the boot processor before the
* kernel command line parameters are parsed, because otherwise it is not
* possible to override this in hardlockup_panic_setup().
*/
void hardlockup_detector_disable(void)
{
watchdog_enabled &= ~NMI_WATCHDOG_ENABLED;
}
static int __init hardlockup_panic_setup(char *str)
{
if (!strncmp(str, "panic", 5))
hardlockup_panic = 1;
else if (!strncmp(str, "nopanic", 7))
hardlockup_panic = 0;
else if (!strncmp(str, "0", 1))
watchdog_enabled &= ~NMI_WATCHDOG_ENABLED;
else if (!strncmp(str, "1", 1))
watchdog_enabled |= NMI_WATCHDOG_ENABLED;
return 1;
}
__setup("nmi_watchdog=", hardlockup_panic_setup);
#endif
unsigned int __read_mostly softlockup_panic = unsigned int __read_mostly softlockup_panic =
CONFIG_BOOTPARAM_SOFTLOCKUP_PANIC_VALUE; CONFIG_BOOTPARAM_SOFTLOCKUP_PANIC_VALUE;
@@ -264,32 +202,14 @@ void touch_all_softlockup_watchdogs(void)
wq_watchdog_touch(-1); wq_watchdog_touch(-1);
} }
#ifdef CONFIG_HARDLOCKUP_DETECTOR
void touch_nmi_watchdog(void)
{
/*
* Using __raw here because some code paths have
* preemption enabled. If preemption is enabled
* then interrupts should be enabled too, in which
* case we shouldn't have to worry about the watchdog
* going off.
*/
raw_cpu_write(watchdog_nmi_touch, true);
touch_softlockup_watchdog();
}
EXPORT_SYMBOL(touch_nmi_watchdog);
#endif
void touch_softlockup_watchdog_sync(void) void touch_softlockup_watchdog_sync(void)
{ {
__this_cpu_write(softlockup_touch_sync, true); __this_cpu_write(softlockup_touch_sync, true);
__this_cpu_write(watchdog_touch_ts, 0); __this_cpu_write(watchdog_touch_ts, 0);
} }
#ifdef CONFIG_HARDLOCKUP_DETECTOR
/* watchdog detector functions */ /* watchdog detector functions */
- static bool is_hardlockup(void)
+ bool is_hardlockup(void)
{ {
unsigned long hrint = __this_cpu_read(hrtimer_interrupts); unsigned long hrint = __this_cpu_read(hrtimer_interrupts);
@@ -299,7 +219,6 @@ static bool is_hardlockup(void)
__this_cpu_write(hrtimer_interrupts_saved, hrint); __this_cpu_write(hrtimer_interrupts_saved, hrint);
return false; return false;
} }
#endif
static int is_softlockup(unsigned long touch_ts) static int is_softlockup(unsigned long touch_ts)
{ {
@@ -313,78 +232,22 @@ static int is_softlockup(unsigned long touch_ts)
return 0; return 0;
} }
#ifdef CONFIG_HARDLOCKUP_DETECTOR
static struct perf_event_attr wd_hw_attr = {
.type = PERF_TYPE_HARDWARE,
.config = PERF_COUNT_HW_CPU_CYCLES,
.size = sizeof(struct perf_event_attr),
.pinned = 1,
.disabled = 1,
};
/* Callback function for perf event subsystem */
static void watchdog_overflow_callback(struct perf_event *event,
struct perf_sample_data *data,
struct pt_regs *regs)
{
/* Ensure the watchdog never gets throttled */
event->hw.interrupts = 0;
if (__this_cpu_read(watchdog_nmi_touch) == true) {
__this_cpu_write(watchdog_nmi_touch, false);
return;
}
/* check for a hardlockup
* This is done by making sure our timer interrupt
* is incrementing. The timer interrupt should have
* fired multiple times before we overflow'd. If it hasn't
* then this is a good indication the cpu is stuck
*/
if (is_hardlockup()) {
int this_cpu = smp_processor_id();
struct pt_regs *regs = get_irq_regs();
/* only print hardlockups once */
if (__this_cpu_read(hard_watchdog_warn) == true)
return;
pr_emerg("Watchdog detected hard LOCKUP on cpu %d", this_cpu);
print_modules();
print_irqtrace_events(current);
if (regs)
show_regs(regs);
else
dump_stack();
/*
* Perform all-CPU dump only once to avoid multiple hardlockups
* generating interleaving traces
*/
if (sysctl_hardlockup_all_cpu_backtrace &&
!test_and_set_bit(0, &hardlockup_allcpu_dumped))
trigger_allbutself_cpu_backtrace();
if (hardlockup_panic)
nmi_panic(regs, "Hard LOCKUP");
__this_cpu_write(hard_watchdog_warn, true);
return;
}
__this_cpu_write(hard_watchdog_warn, false);
return;
}
#endif /* CONFIG_HARDLOCKUP_DETECTOR */
static void watchdog_interrupt_count(void) static void watchdog_interrupt_count(void)
{ {
__this_cpu_inc(hrtimer_interrupts); __this_cpu_inc(hrtimer_interrupts);
} }
- static int watchdog_nmi_enable(unsigned int cpu);
- static void watchdog_nmi_disable(unsigned int cpu);
+ /*
+  * These two functions are mostly architecture specific
+  * defining them as weak here.
+  */
+ int __weak watchdog_nmi_enable(unsigned int cpu)
+ {
+ 	return 0;
+ }
+ void __weak watchdog_nmi_disable(unsigned int cpu)
+ {
+ }
static int watchdog_enable_all_cpus(void); static int watchdog_enable_all_cpus(void);
static void watchdog_disable_all_cpus(void); static void watchdog_disable_all_cpus(void);
@@ -577,109 +440,6 @@ static void watchdog(unsigned int cpu)
watchdog_nmi_disable(cpu); watchdog_nmi_disable(cpu);
} }
#ifdef CONFIG_HARDLOCKUP_DETECTOR
/*
* People like the simple clean cpu node info on boot.
* Reduce the watchdog noise by only printing messages
* that are different from what cpu0 displayed.
*/
static unsigned long cpu0_err;
static int watchdog_nmi_enable(unsigned int cpu)
{
struct perf_event_attr *wd_attr;
struct perf_event *event = per_cpu(watchdog_ev, cpu);
/* nothing to do if the hard lockup detector is disabled */
if (!(watchdog_enabled & NMI_WATCHDOG_ENABLED))
goto out;
/* is it already setup and enabled? */
if (event && event->state > PERF_EVENT_STATE_OFF)
goto out;
/* it is setup but not enabled */
if (event != NULL)
goto out_enable;
wd_attr = &wd_hw_attr;
wd_attr->sample_period = hw_nmi_get_sample_period(watchdog_thresh);
/* Try to register using hardware perf events */
event = perf_event_create_kernel_counter(wd_attr, cpu, NULL, watchdog_overflow_callback, NULL);
/* save cpu0 error for future comparision */
if (cpu == 0 && IS_ERR(event))
cpu0_err = PTR_ERR(event);
if (!IS_ERR(event)) {
/* only print for cpu0 or different than cpu0 */
if (cpu == 0 || cpu0_err)
pr_info("enabled on all CPUs, permanently consumes one hw-PMU counter.\n");
goto out_save;
}
/*
* Disable the hard lockup detector if _any_ CPU fails to set up
* set up the hardware perf event. The watchdog() function checks
* the NMI_WATCHDOG_ENABLED bit periodically.
*
* The barriers are for syncing up watchdog_enabled across all the
* cpus, as clear_bit() does not use barriers.
*/
smp_mb__before_atomic();
clear_bit(NMI_WATCHDOG_ENABLED_BIT, &watchdog_enabled);
smp_mb__after_atomic();
/* skip displaying the same error again */
if (cpu > 0 && (PTR_ERR(event) == cpu0_err))
return PTR_ERR(event);
/* vary the KERN level based on the returned errno */
if (PTR_ERR(event) == -EOPNOTSUPP)
pr_info("disabled (cpu%i): not supported (no LAPIC?)\n", cpu);
else if (PTR_ERR(event) == -ENOENT)
pr_warn("disabled (cpu%i): hardware events not enabled\n",
cpu);
else
pr_err("disabled (cpu%i): unable to create perf event: %ld\n",
cpu, PTR_ERR(event));
pr_info("Shutting down hard lockup detector on all cpus\n");
return PTR_ERR(event);
/* success path */
out_save:
per_cpu(watchdog_ev, cpu) = event;
out_enable:
perf_event_enable(per_cpu(watchdog_ev, cpu));
out:
return 0;
}
static void watchdog_nmi_disable(unsigned int cpu)
{
struct perf_event *event = per_cpu(watchdog_ev, cpu);
if (event) {
perf_event_disable(event);
per_cpu(watchdog_ev, cpu) = NULL;
/* should be in cleanup, but blocks oprofile */
perf_event_release_kernel(event);
}
if (cpu == 0) {
/* watchdog_nmi_enable() expects this to be zero initially. */
cpu0_err = 0;
}
}
#else
static int watchdog_nmi_enable(unsigned int cpu) { return 0; }
static void watchdog_nmi_disable(unsigned int cpu) { return; }
#endif /* CONFIG_HARDLOCKUP_DETECTOR */
static struct smp_hotplug_thread watchdog_threads = { static struct smp_hotplug_thread watchdog_threads = {
.store = &softlockup_watchdog, .store = &softlockup_watchdog,
.thread_should_run = watchdog_should_run, .thread_should_run = watchdog_should_run,
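The restructuring above leans on weak symbols: kernel/watchdog.c now carries empty __weak definitions of watchdog_nmi_enable()/watchdog_nmi_disable(), and a hardware back end (such as the new watchdog_hld.c below) overrides them simply by supplying ordinary strong definitions at link time. A small userspace sketch of that override mechanism follows; it is not part of the patch, and the file names default.c, hw.c and main.c are hypothetical.

/*
 * Three hypothetical files shown in one listing.  Build either way:
 *   cc -o demo default.c main.c          -> prints the weak stub
 *   cc -o demo default.c hw.c main.c     -> the strong definition wins
 */

/* ---- default.c: weak fallback, analogous to kernel/watchdog.c ---- */
#include <stdio.h>

__attribute__((weak)) int detector_enable(unsigned int cpu)
{
	printf("stub: no hardware detector for cpu %u\n", cpu);
	return 0;
}

/* ---- hw.c: strong override, analogous to kernel/watchdog_hld.c ---- */
#include <stdio.h>

int detector_enable(unsigned int cpu)
{
	printf("perf-backed detector enabled on cpu %u\n", cpu);
	return 0;
}

/* ---- main.c: generic caller, analogous to the watchdog core ---- */
int detector_enable(unsigned int cpu);

int main(void)
{
	return detector_enable(0);
}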

kernel/watchdog_hld.c (new file, 227 lines)

@@ -0,0 +1,227 @@
/*
* Detect hard lockups on a system
*
* started by Don Zickus, Copyright (C) 2010 Red Hat, Inc.
*
* Note: Most of this code is borrowed heavily from the original softlockup
* detector, so thanks to Ingo for the initial implementation.
* Some chunks also taken from the old x86-specific nmi watchdog code, thanks
* to those contributors as well.
*/
#define pr_fmt(fmt) "NMI watchdog: " fmt
#include <linux/nmi.h>
#include <linux/module.h>
#include <asm/irq_regs.h>
#include <linux/perf_event.h>
static DEFINE_PER_CPU(bool, hard_watchdog_warn);
static DEFINE_PER_CPU(bool, watchdog_nmi_touch);
static DEFINE_PER_CPU(struct perf_event *, watchdog_ev);
/* boot commands */
/*
* Should we panic when a soft-lockup or hard-lockup occurs:
*/
unsigned int __read_mostly hardlockup_panic =
CONFIG_BOOTPARAM_HARDLOCKUP_PANIC_VALUE;
static unsigned long hardlockup_allcpu_dumped;
/*
* We may not want to enable hard lockup detection by default in all cases,
* for example when running the kernel as a guest on a hypervisor. In these
* cases this function can be called to disable hard lockup detection. This
* function should only be executed once by the boot processor before the
* kernel command line parameters are parsed, because otherwise it is not
* possible to override this in hardlockup_panic_setup().
*/
void hardlockup_detector_disable(void)
{
watchdog_enabled &= ~NMI_WATCHDOG_ENABLED;
}
static int __init hardlockup_panic_setup(char *str)
{
if (!strncmp(str, "panic", 5))
hardlockup_panic = 1;
else if (!strncmp(str, "nopanic", 7))
hardlockup_panic = 0;
else if (!strncmp(str, "0", 1))
watchdog_enabled &= ~NMI_WATCHDOG_ENABLED;
else if (!strncmp(str, "1", 1))
watchdog_enabled |= NMI_WATCHDOG_ENABLED;
return 1;
}
__setup("nmi_watchdog=", hardlockup_panic_setup);
void touch_nmi_watchdog(void)
{
/*
* Using __raw here because some code paths have
* preemption enabled. If preemption is enabled
* then interrupts should be enabled too, in which
* case we shouldn't have to worry about the watchdog
* going off.
*/
raw_cpu_write(watchdog_nmi_touch, true);
touch_softlockup_watchdog();
}
EXPORT_SYMBOL(touch_nmi_watchdog);
static struct perf_event_attr wd_hw_attr = {
.type = PERF_TYPE_HARDWARE,
.config = PERF_COUNT_HW_CPU_CYCLES,
.size = sizeof(struct perf_event_attr),
.pinned = 1,
.disabled = 1,
};
/* Callback function for perf event subsystem */
static void watchdog_overflow_callback(struct perf_event *event,
struct perf_sample_data *data,
struct pt_regs *regs)
{
/* Ensure the watchdog never gets throttled */
event->hw.interrupts = 0;
if (__this_cpu_read(watchdog_nmi_touch) == true) {
__this_cpu_write(watchdog_nmi_touch, false);
return;
}
/* check for a hardlockup
* This is done by making sure our timer interrupt
* is incrementing. The timer interrupt should have
* fired multiple times before we overflow'd. If it hasn't
* then this is a good indication the cpu is stuck
*/
if (is_hardlockup()) {
int this_cpu = smp_processor_id();
/* only print hardlockups once */
if (__this_cpu_read(hard_watchdog_warn) == true)
return;
pr_emerg("Watchdog detected hard LOCKUP on cpu %d", this_cpu);
print_modules();
print_irqtrace_events(current);
if (regs)
show_regs(regs);
else
dump_stack();
/*
* Perform all-CPU dump only once to avoid multiple hardlockups
* generating interleaving traces
*/
if (sysctl_hardlockup_all_cpu_backtrace &&
!test_and_set_bit(0, &hardlockup_allcpu_dumped))
trigger_allbutself_cpu_backtrace();
if (hardlockup_panic)
nmi_panic(regs, "Hard LOCKUP");
__this_cpu_write(hard_watchdog_warn, true);
return;
}
__this_cpu_write(hard_watchdog_warn, false);
return;
}
/*
* People like the simple clean cpu node info on boot.
* Reduce the watchdog noise by only printing messages
* that are different from what cpu0 displayed.
*/
static unsigned long cpu0_err;
int watchdog_nmi_enable(unsigned int cpu)
{
struct perf_event_attr *wd_attr;
struct perf_event *event = per_cpu(watchdog_ev, cpu);
/* nothing to do if the hard lockup detector is disabled */
if (!(watchdog_enabled & NMI_WATCHDOG_ENABLED))
goto out;
/* is it already setup and enabled? */
if (event && event->state > PERF_EVENT_STATE_OFF)
goto out;
/* it is setup but not enabled */
if (event != NULL)
goto out_enable;
wd_attr = &wd_hw_attr;
wd_attr->sample_period = hw_nmi_get_sample_period(watchdog_thresh);
/* Try to register using hardware perf events */
event = perf_event_create_kernel_counter(wd_attr, cpu, NULL, watchdog_overflow_callback, NULL);
/* save cpu0 error for future comparison */
if (cpu == 0 && IS_ERR(event))
cpu0_err = PTR_ERR(event);
if (!IS_ERR(event)) {
/* only print for cpu0 or different than cpu0 */
if (cpu == 0 || cpu0_err)
pr_info("enabled on all CPUs, permanently consumes one hw-PMU counter.\n");
goto out_save;
}
/*
* Disable the hard lockup detector if _any_ CPU fails to set up
* the hardware perf event. The watchdog() function checks
* the NMI_WATCHDOG_ENABLED bit periodically.
*
* The barriers are for syncing up watchdog_enabled across all the
* cpus, as clear_bit() does not use barriers.
*/
smp_mb__before_atomic();
clear_bit(NMI_WATCHDOG_ENABLED_BIT, &watchdog_enabled);
smp_mb__after_atomic();
/* skip displaying the same error again */
if (cpu > 0 && (PTR_ERR(event) == cpu0_err))
return PTR_ERR(event);
/* vary the KERN level based on the returned errno */
if (PTR_ERR(event) == -EOPNOTSUPP)
pr_info("disabled (cpu%i): not supported (no LAPIC?)\n", cpu);
else if (PTR_ERR(event) == -ENOENT)
pr_warn("disabled (cpu%i): hardware events not enabled\n",
cpu);
else
pr_err("disabled (cpu%i): unable to create perf event: %ld\n",
cpu, PTR_ERR(event));
pr_info("Shutting down hard lockup detector on all cpus\n");
return PTR_ERR(event);
/* success path */
out_save:
per_cpu(watchdog_ev, cpu) = event;
out_enable:
perf_event_enable(per_cpu(watchdog_ev, cpu));
out:
return 0;
}
void watchdog_nmi_disable(unsigned int cpu)
{
struct perf_event *event = per_cpu(watchdog_ev, cpu);
if (event) {
perf_event_disable(event);
per_cpu(watchdog_ev, cpu) = NULL;
/* should be in cleanup, but blocks oprofile */
perf_event_release_kernel(event);
}
if (cpu == 0) {
/* watchdog_nmi_enable() expects this to be zero initially. */
cpu0_err = 0;
}
}
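The core of the file above is the sampling idea in watchdog_overflow_callback()/is_hardlockup(): the perf NMI fires periodically and declares a hard lockup if the hrtimer interrupt counter has not advanced since the previous sample. A compact userspace sketch of that idea, not part of the patch and using stand-in names rather than kernel symbols:

#include <stdbool.h>
#include <stdio.h>

static unsigned long timer_interrupts;        /* bumped by the "timer" side */
static unsigned long timer_interrupts_saved;  /* snapshot from the last check */

static void timer_tick(void)
{
	timer_interrupts++;
}

/* NMI-side check: a stalled counter since the previous sample means lockup. */
static bool check_hardlockup(void)
{
	bool stuck = (timer_interrupts == timer_interrupts_saved);

	timer_interrupts_saved = timer_interrupts;
	return stuck;
}

int main(void)
{
	timer_tick();
	printf("after a tick: %s\n", check_hardlockup() ? "LOCKUP" : "ok");
	/* no tick before the next sample -> the counter stalled */
	printf("no tick:      %s\n", check_hardlockup() ? "LOCKUP" : "ok");
	return 0;
}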


@@ -194,8 +194,8 @@ config GDB_SCRIPTS
build directory. If you load vmlinux into gdb, the helper build directory. If you load vmlinux into gdb, the helper
scripts will be automatically imported by gdb as well, and scripts will be automatically imported by gdb as well, and
additional functions are available to analyze a Linux kernel additional functions are available to analyze a Linux kernel
- instance. See Documentation/gdb-kernel-debugging.txt for further
- details.
+ instance. See Documentation/dev-tools/gdb-kernel-debugging.rst
+ for further details.
config ENABLE_WARN_DEPRECATED config ENABLE_WARN_DEPRECATED
bool "Enable __deprecated logic" bool "Enable __deprecated logic"
@@ -542,7 +542,7 @@ config DEBUG_KMEMLEAK
difference being that the orphan objects are not freed but difference being that the orphan objects are not freed but
only shown in /sys/kernel/debug/kmemleak. Enabling this only shown in /sys/kernel/debug/kmemleak. Enabling this
feature will introduce an overhead to memory feature will introduce an overhead to memory
- allocations. See Documentation/kmemleak.txt for more
+ allocations. See Documentation/dev-tools/kmemleak.rst for more
details. details.
Enabling DEBUG_SLAB or SLUB_DEBUG may increase the chances Enabling DEBUG_SLAB or SLUB_DEBUG may increase the chances
@@ -739,7 +739,7 @@ config KCOV
different machines and across reboots. If you need stable PC values, different machines and across reboots. If you need stable PC values,
disable RANDOMIZE_BASE. disable RANDOMIZE_BASE.
- For more details, see Documentation/kcov.txt.
+ For more details, see Documentation/dev-tools/kcov.rst.
config KCOV_INSTRUMENT_ALL config KCOV_INSTRUMENT_ALL
bool "Instrument all code by default" bool "Instrument all code by default"


@@ -10,7 +10,8 @@ config UBSAN
This option enables undefined behaviour sanity checker This option enables undefined behaviour sanity checker
Compile-time instrumentation is used to detect various undefined Compile-time instrumentation is used to detect various undefined
behaviours in runtime. Various types of checks may be enabled behaviours in runtime. Various types of checks may be enabled
- via boot parameter ubsan_handle (see: Documentation/ubsan.txt).
+ via boot parameter ubsan_handle
+ (see: Documentation/dev-tools/ubsan.rst).
config UBSAN_SANITIZE_ALL config UBSAN_SANITIZE_ALL
bool "Enable instrumentation for the entire kernel" bool "Enable instrumentation for the entire kernel"

File diff suppressed because it is too large.


@@ -818,6 +818,13 @@ isolate_migratepages_block(struct compact_control *cc, unsigned long low_pfn,
page_count(page) > page_mapcount(page)) page_count(page) > page_mapcount(page))
goto isolate_fail; goto isolate_fail;
/*
* Only allow to migrate anonymous pages in GFP_NOFS context
* because those do not depend on fs locks.
*/
if (!(cc->gfp_mask & __GFP_FS) && page_mapping(page))
goto isolate_fail;
/* If we already hold the lock, we can skip some rechecking */ /* If we already hold the lock, we can skip some rechecking */
if (!locked) { if (!locked) {
locked = compact_trylock_irqsave(zone_lru_lock(zone), locked = compact_trylock_irqsave(zone_lru_lock(zone),
@@ -1677,14 +1684,16 @@ enum compact_result try_to_compact_pages(gfp_t gfp_mask, unsigned int order,
unsigned int alloc_flags, const struct alloc_context *ac, unsigned int alloc_flags, const struct alloc_context *ac,
enum compact_priority prio) enum compact_priority prio)
{ {
int may_enter_fs = gfp_mask & __GFP_FS;
int may_perform_io = gfp_mask & __GFP_IO; int may_perform_io = gfp_mask & __GFP_IO;
struct zoneref *z; struct zoneref *z;
struct zone *zone; struct zone *zone;
enum compact_result rc = COMPACT_SKIPPED; enum compact_result rc = COMPACT_SKIPPED;
- /* Check if the GFP flags allow compaction */
- if (!may_enter_fs || !may_perform_io)
+ /*
+  * Check if the GFP flags allow compaction - GFP_NOIO is really
+  * tricky context because the migration might require IO
+  */
+ if (!may_perform_io)
return COMPACT_SKIPPED; return COMPACT_SKIPPED;
trace_mm_compaction_try_to_compact_pages(order, gfp_mask, prio); trace_mm_compaction_try_to_compact_pages(order, gfp_mask, prio);
@@ -1751,6 +1760,7 @@ static void compact_node(int nid)
.mode = MIGRATE_SYNC, .mode = MIGRATE_SYNC,
.ignore_skip_hint = true, .ignore_skip_hint = true,
.whole_zone = true, .whole_zone = true,
.gfp_mask = GFP_KERNEL,
}; };
@@ -1876,6 +1886,7 @@ static void kcompactd_do_work(pg_data_t *pgdat)
.classzone_idx = pgdat->kcompactd_classzone_idx, .classzone_idx = pgdat->kcompactd_classzone_idx,
.mode = MIGRATE_SYNC_LIGHT, .mode = MIGRATE_SYNC_LIGHT,
.ignore_skip_hint = true, .ignore_skip_hint = true,
.gfp_mask = GFP_KERNEL,
}; };
trace_mm_compaction_kcompactd_wake(pgdat->node_id, cc.order, trace_mm_compaction_kcompactd_wake(pgdat->node_id, cc.order,
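Both compaction hunks above gate work on the caller's gfp_mask: without __GFP_IO compaction is skipped outright, and in GFP_NOFS context (__GFP_IO set, __GFP_FS clear) only pages with no filesystem mapping may be isolated, since migrating page-cache pages can depend on fs locks. A toy sketch of that gating logic, not part of the patch, with made-up flag values rather than the kernel's:

#include <stdbool.h>
#include <stdio.h>

#define DEMO_GFP_IO 0x1u
#define DEMO_GFP_FS 0x2u

/* mirrors: if (!may_perform_io) return COMPACT_SKIPPED; */
static bool may_compact(unsigned int gfp_mask)
{
	return gfp_mask & DEMO_GFP_IO;
}

/* mirrors: if (!(cc->gfp_mask & __GFP_FS) && page_mapping(page)) skip page */
static bool may_isolate(unsigned int gfp_mask, bool has_fs_mapping)
{
	return (gfp_mask & DEMO_GFP_FS) || !has_fs_mapping;
}

int main(void)
{
	unsigned int nofs = DEMO_GFP_IO;	/* GFP_NOFS-like: IO yes, FS no */

	printf("compact under NOFS?       %d\n", may_compact(nofs));
	printf("isolate anonymous page?   %d\n", may_isolate(nofs, false));
	printf("isolate page-cache page?  %d\n", may_isolate(nofs, true));
	return 0;
}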


@@ -2164,12 +2164,12 @@ page_not_uptodate:
} }
EXPORT_SYMBOL(filemap_fault); EXPORT_SYMBOL(filemap_fault);
- void filemap_map_pages(struct fault_env *fe,
+ void filemap_map_pages(struct vm_fault *vmf,
pgoff_t start_pgoff, pgoff_t end_pgoff) pgoff_t start_pgoff, pgoff_t end_pgoff)
{ {
struct radix_tree_iter iter; struct radix_tree_iter iter;
void **slot; void **slot;
- struct file *file = fe->vma->vm_file;
+ struct file *file = vmf->vma->vm_file;
struct address_space *mapping = file->f_mapping; struct address_space *mapping = file->f_mapping;
pgoff_t last_pgoff = start_pgoff; pgoff_t last_pgoff = start_pgoff;
loff_t size; loff_t size;
@@ -2225,11 +2225,11 @@ repeat:
if (file->f_ra.mmap_miss > 0) if (file->f_ra.mmap_miss > 0)
file->f_ra.mmap_miss--; file->f_ra.mmap_miss--;
- fe->address += (iter.index - last_pgoff) << PAGE_SHIFT;
- if (fe->pte)
- 	fe->pte += iter.index - last_pgoff;
+ vmf->address += (iter.index - last_pgoff) << PAGE_SHIFT;
+ if (vmf->pte)
+ 	vmf->pte += iter.index - last_pgoff;
last_pgoff = iter.index; last_pgoff = iter.index;
- if (alloc_set_pte(fe, NULL, page))
+ if (alloc_set_pte(vmf, NULL, page))
goto unlock; goto unlock;
unlock_page(page); unlock_page(page);
goto next; goto next;
@@ -2239,7 +2239,7 @@ skip:
put_page(page); put_page(page);
next: next:
/* Huge page is mapped? No need to proceed. */ /* Huge page is mapped? No need to proceed. */
- if (pmd_trans_huge(*fe->pmd))
+ if (pmd_trans_huge(*vmf->pmd))
break; break;
if (iter.index == end_pgoff) if (iter.index == end_pgoff)
break; break;
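The fault_env to vm_fault conversion above keeps the established shape of the code: all per-fault state (address, pte and pmd pointers, and so on) travels in a single struct that helpers such as filemap_map_pages() advance as they walk a range of page offsets. A simplified, self-contained sketch of that walking pattern, not part of the patch; the struct and field names are stand-ins, not the kernel's vm_fault layout.

#include <stdio.h>

#define DEMO_PAGE_SHIFT 12

/* One bundle of fault state, in the spirit of vm_fault (names simplified). */
struct demo_fault {
	unsigned long address;	/* faulting address, advanced per page  */
	int *pte;		/* points at the current page-table slot */
};

static void map_range(struct demo_fault *vmf, unsigned long start_idx,
		      unsigned long end_idx)
{
	unsigned long last = start_idx, idx;

	for (idx = start_idx; idx <= end_idx; idx++) {
		/* same arithmetic as the vmf->address / vmf->pte updates above */
		vmf->address += (idx - last) << DEMO_PAGE_SHIFT;
		vmf->pte += idx - last;
		last = idx;
		*vmf->pte = 1;	/* pretend to install a mapping */
	}
}

int main(void)
{
	int ptes[4] = { 0 };
	struct demo_fault vmf = { .address = 0x1000, .pte = ptes };

	map_range(&vmf, 0, 3);
	printf("final address %#lx, ptes %d %d %d %d\n",
	       vmf.address, ptes[0], ptes[1], ptes[2], ptes[3]);
	return 0;
}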


@@ -865,9 +865,10 @@ EXPORT_SYMBOL(get_user_pages_locked);
* caller if required (just like with __get_user_pages). "FOLL_GET" * caller if required (just like with __get_user_pages). "FOLL_GET"
* is set implicitly if "pages" is non-NULL. * is set implicitly if "pages" is non-NULL.
*/ */
- __always_inline long __get_user_pages_unlocked(struct task_struct *tsk, struct mm_struct *mm,
- 	unsigned long start, unsigned long nr_pages,
- 	struct page **pages, unsigned int gup_flags)
+ static __always_inline long __get_user_pages_unlocked(struct task_struct *tsk,
+ 	struct mm_struct *mm, unsigned long start,
+ 	unsigned long nr_pages, struct page **pages,
+ 	unsigned int gup_flags)
{ {
long ret; long ret;
int locked = 1; int locked = 1;
@@ -879,7 +880,6 @@ __always_inline long __get_user_pages_unlocked(struct task_struct *tsk, struct m
up_read(&mm->mmap_sem); up_read(&mm->mmap_sem);
return ret; return ret;
} }
EXPORT_SYMBOL(__get_user_pages_unlocked);
/* /*
* get_user_pages_unlocked() is suitable to replace the form: * get_user_pages_unlocked() is suitable to replace the form:
@@ -917,6 +917,9 @@ EXPORT_SYMBOL(get_user_pages_unlocked);
* only intends to ensure the pages are faulted in. * only intends to ensure the pages are faulted in.
* @vmas: array of pointers to vmas corresponding to each page. * @vmas: array of pointers to vmas corresponding to each page.
* Or NULL if the caller does not require them. * Or NULL if the caller does not require them.
* @locked: pointer to lock flag indicating whether lock is held and
* subsequently whether VM_FAULT_RETRY functionality can be
* utilised. Lock must initially be held.
* *
* Returns number of pages pinned. This may be fewer than the number * Returns number of pages pinned. This may be fewer than the number
* requested. If nr_pages is 0 or negative, returns 0. If no pages * requested. If nr_pages is 0 or negative, returns 0. If no pages
@@ -960,10 +963,10 @@ EXPORT_SYMBOL(get_user_pages_unlocked);
long get_user_pages_remote(struct task_struct *tsk, struct mm_struct *mm, long get_user_pages_remote(struct task_struct *tsk, struct mm_struct *mm,
unsigned long start, unsigned long nr_pages, unsigned long start, unsigned long nr_pages,
unsigned int gup_flags, struct page **pages, unsigned int gup_flags, struct page **pages,
- struct vm_area_struct **vmas)
+ struct vm_area_struct **vmas, int *locked)
{ {
return __get_user_pages_locked(tsk, mm, start, nr_pages, pages, vmas, return __get_user_pages_locked(tsk, mm, start, nr_pages, pages, vmas,
- NULL, false,
+ locked, true,
gup_flags | FOLL_TOUCH | FOLL_REMOTE); gup_flags | FOLL_TOUCH | FOLL_REMOTE);
} }
EXPORT_SYMBOL(get_user_pages_remote); EXPORT_SYMBOL(get_user_pages_remote);
@@ -971,8 +974,9 @@ EXPORT_SYMBOL(get_user_pages_remote);
/* /*
* This is the same as get_user_pages_remote(), just with a * This is the same as get_user_pages_remote(), just with a
* less-flexible calling convention where we assume that the task * less-flexible calling convention where we assume that the task
* and mm being operated on are the current task's. We also * and mm being operated on are the current task's and don't allow
* obviously don't pass FOLL_REMOTE in here. * passing of a locked parameter. We also obviously don't pass
* FOLL_REMOTE in here.
*/ */
long get_user_pages(unsigned long start, unsigned long nr_pages, long get_user_pages(unsigned long start, unsigned long nr_pages,
unsigned int gup_flags, struct page **pages, unsigned int gup_flags, struct page **pages,
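The new locked argument of get_user_pages_remote() follows the protocol described in the comment added above: the caller holds mmap_sem and passes a flag by reference, and if the callee has to block it may drop the lock and clear the flag (the VM_FAULT_RETRY case), so the caller must only unlock when the flag is still set. A userspace sketch of that hand-off using a pthread mutex, not part of the patch; all names and the need_to_block switch are purely illustrative.

/* build: cc -pthread locked_demo.c (hypothetical file name) */
#include <pthread.h>
#include <stdio.h>

static pthread_mutex_t mmap_lock = PTHREAD_MUTEX_INITIALIZER;

/* Callee may drop the lock to "block"; it reports that through *locked. */
static long pin_pages(int *locked, int need_to_block)
{
	if (need_to_block) {
		pthread_mutex_unlock(&mmap_lock);	/* VM_FAULT_RETRY-like path */
		*locked = 0;
	}
	return 1;	/* pretend one page was pinned */
}

int main(void)
{
	int locked = 1;
	long n;

	pthread_mutex_lock(&mmap_lock);
	n = pin_pages(&locked, 1);
	if (locked)				/* only unlock if the callee kept it */
		pthread_mutex_unlock(&mmap_lock);
	printf("pinned %ld page(s), lock %s by callee\n",
	       n, locked ? "kept" : "dropped");
	return 0;
}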

Some files were not shown because too many files have changed in this diff.