Merge branch 'writeback' of git://git.kernel.dk/linux-2.6-block

* 'writeback' of git://git.kernel.dk/linux-2.6-block:
  writeback: check for registered bdi in flusher add and inode dirty
  writeback: add name to backing_dev_info
  writeback: add some debug inode list counters to bdi stats
  writeback: get rid of pdflush completely
  writeback: switch to per-bdi threads for flushing data
  writeback: move dirty inodes from super_block to backing_dev_info
  writeback: get rid of generic_sync_sb_inodes() export
This commit is contained in:
Linus Torvalds
2009-09-11 09:17:05 -07:00
29 changed files with 1490 additions and 983 deletions

View File

@@ -8,7 +8,7 @@ mmu-$(CONFIG_MMU) := fremap.o highmem.o madvise.o memory.o mincore.o \
vmalloc.o
obj-y := bootmem.o filemap.o mempool.o oom_kill.o fadvise.o \
maccess.o page_alloc.o page-writeback.o pdflush.o \
maccess.o page_alloc.o page-writeback.o \
readahead.o swap.o truncate.o vmscan.o shmem.o \
prio_tree.o util.o mmzone.o vmstat.o backing-dev.o \
page_isolation.o mm_init.o $(mmu-y)

View File

@@ -1,8 +1,11 @@
#include <linux/wait.h>
#include <linux/backing-dev.h>
#include <linux/kthread.h>
#include <linux/freezer.h>
#include <linux/fs.h>
#include <linux/pagemap.h>
#include <linux/mm.h>
#include <linux/sched.h>
#include <linux/module.h>
#include <linux/writeback.h>
@@ -14,6 +17,7 @@ void default_unplug_io_fn(struct backing_dev_info *bdi, struct page *page)
EXPORT_SYMBOL(default_unplug_io_fn);
struct backing_dev_info default_backing_dev_info = {
.name = "default",
.ra_pages = VM_MAX_READAHEAD * 1024 / PAGE_CACHE_SIZE,
.state = 0,
.capabilities = BDI_CAP_MAP_COPY,
@@ -22,6 +26,18 @@ struct backing_dev_info default_backing_dev_info = {
EXPORT_SYMBOL_GPL(default_backing_dev_info);
static struct class *bdi_class;
DEFINE_SPINLOCK(bdi_lock);
LIST_HEAD(bdi_list);
LIST_HEAD(bdi_pending_list);
static struct task_struct *sync_supers_tsk;
static struct timer_list sync_supers_timer;
static int bdi_sync_supers(void *);
static void sync_supers_timer_fn(unsigned long);
static void arm_supers_timer(void);
static void bdi_add_default_flusher_task(struct backing_dev_info *bdi);
#ifdef CONFIG_DEBUG_FS
#include <linux/debugfs.h>
@@ -37,9 +53,29 @@ static void bdi_debug_init(void)
static int bdi_debug_stats_show(struct seq_file *m, void *v)
{
struct backing_dev_info *bdi = m->private;
struct bdi_writeback *wb;
unsigned long background_thresh;
unsigned long dirty_thresh;
unsigned long bdi_thresh;
unsigned long nr_dirty, nr_io, nr_more_io, nr_wb;
struct inode *inode;
/*
* inode lock is enough here, the bdi->wb_list is protected by
* RCU on the reader side
*/
nr_wb = nr_dirty = nr_io = nr_more_io = 0;
spin_lock(&inode_lock);
list_for_each_entry(wb, &bdi->wb_list, list) {
nr_wb++;
list_for_each_entry(inode, &wb->b_dirty, i_list)
nr_dirty++;
list_for_each_entry(inode, &wb->b_io, i_list)
nr_io++;
list_for_each_entry(inode, &wb->b_more_io, i_list)
nr_more_io++;
}
spin_unlock(&inode_lock);
get_dirty_limits(&background_thresh, &dirty_thresh, &bdi_thresh, bdi);
@@ -49,12 +85,22 @@ static int bdi_debug_stats_show(struct seq_file *m, void *v)
"BdiReclaimable: %8lu kB\n"
"BdiDirtyThresh: %8lu kB\n"
"DirtyThresh: %8lu kB\n"
"BackgroundThresh: %8lu kB\n",
"BackgroundThresh: %8lu kB\n"
"WriteBack threads:%8lu\n"
"b_dirty: %8lu\n"
"b_io: %8lu\n"
"b_more_io: %8lu\n"
"bdi_list: %8u\n"
"state: %8lx\n"
"wb_mask: %8lx\n"
"wb_list: %8u\n"
"wb_cnt: %8u\n",
(unsigned long) K(bdi_stat(bdi, BDI_WRITEBACK)),
(unsigned long) K(bdi_stat(bdi, BDI_RECLAIMABLE)),
K(bdi_thresh),
K(dirty_thresh),
K(background_thresh));
K(bdi_thresh), K(dirty_thresh),
K(background_thresh), nr_wb, nr_dirty, nr_io, nr_more_io,
!list_empty(&bdi->bdi_list), bdi->state, bdi->wb_mask,
!list_empty(&bdi->wb_list), bdi->wb_cnt);
#undef K
return 0;
@@ -185,6 +231,13 @@ static int __init default_bdi_init(void)
{
int err;
sync_supers_tsk = kthread_run(bdi_sync_supers, NULL, "sync_supers");
BUG_ON(IS_ERR(sync_supers_tsk));
init_timer(&sync_supers_timer);
setup_timer(&sync_supers_timer, sync_supers_timer_fn, 0);
arm_supers_timer();
err = bdi_init(&default_backing_dev_info);
if (!err)
bdi_register(&default_backing_dev_info, NULL, "default");
@@ -193,6 +246,248 @@ static int __init default_bdi_init(void)
}
subsys_initcall(default_bdi_init);
/*
 * Bring @wb to a pristine state and attach it to @bdi.
 *
 * The structure is zeroed first, so every field that is not assigned
 * explicitly below starts out as 0/NULL.
 */
static void bdi_wb_init(struct bdi_writeback *wb, struct backing_dev_info *bdi)
{
	memset(wb, 0, sizeof(*wb));

	/* All three inode lists start out empty. */
	INIT_LIST_HEAD(&wb->b_more_io);
	INIT_LIST_HEAD(&wb->b_io);
	INIT_LIST_HEAD(&wb->b_dirty);

	wb->last_old_flush = jiffies;
	wb->bdi = bdi;
}
/*
 * Setup that a writeback thread performs on itself: link @wb onto the
 * wb_list of @bdi, then flag the calling task as a freezable flusher
 * and reset its nice value to 0.
 */
static void bdi_task_init(struct backing_dev_info *bdi,
			  struct bdi_writeback *wb)
{
	struct task_struct *self = current;

	/* wb_list updates are serialised by wb_lock; the insert is RCU-safe. */
	spin_lock(&bdi->wb_lock);
	list_add_tail_rcu(&wb->list, &bdi->wb_list);
	spin_unlock(&bdi->wb_lock);

	self->flags |= PF_SWAPWRITE | PF_FLUSHER;
	set_freezable();

	/* Whoever spawned us may run at another priority; go back to normal. */
	set_user_nice(self, 0);
}
/*
* Thread function of a per-bdi flusher thread, started by
* bdi_forker_task() through kthread_run().
*
* @ptr: the struct bdi_writeback this thread services.
*
* Puts the bdi on the global bdi_list, registers the wb via
* bdi_task_init(), clears BDI_pending, then runs bdi_writeback_task()
* until it returns. Afterwards the wb is unlinked from the bdi and any
* work still queued is flushed.
*
* Returns whatever bdi_writeback_task() returned.
*/
static int bdi_start_fn(void *ptr)
{
struct bdi_writeback *wb = ptr;
struct backing_dev_info *bdi = wb->bdi;
int ret;
/*
* Add us to the active bdi_list. The forker removed the bdi from
* bdi_pending_list (list_del_init) before starting this thread.
*/
spin_lock(&bdi_lock);
list_add(&bdi->bdi_list, &bdi_list);
spin_unlock(&bdi_lock);
bdi_task_init(bdi, wb);
/*
* Clear pending bit and wakeup anybody waiting to tear us down
* (bdi_wb_shutdown() sleeps in wait_on_bit() on BDI_pending). The
* barrier orders the bit clear before the wakeup.
*/
clear_bit(BDI_pending, &bdi->state);
smp_mb__after_clear_bit();
wake_up_bit(&bdi->state, BDI_pending);
/* Main writeback loop; we only get past this when the thread is exiting */
ret = bdi_writeback_task(wb);
/*
* Remove us from the bdi's wb_list
*/
spin_lock(&bdi->wb_lock);
list_del_rcu(&wb->list);
spin_unlock(&bdi->wb_lock);
/*
* Flush any work that raced with us exiting. No new work
* will be added, since this bdi isn't discoverable anymore.
*/
if (!list_empty(&bdi->work_list))
wb_do_writeback(wb, 1);
/* bdi_forker_task() treats a NULL ->task as "no thread registered" */
wb->task = NULL;
return ret;
}
/*
 * Returns the result of wb_has_dirty_io() for the bdi_writeback that is
 * embedded in @bdi.
 */
int bdi_has_dirty_io(struct backing_dev_info *bdi)
{
	struct bdi_writeback *wb = &bdi->wb;

	return wb_has_dirty_io(wb);
}
/*
 * Run one WB_SYNC_NONE writeback pass against @bdi with a budget of
 * 1024 pages. bdi_forker_task() uses this when it could not create a
 * flusher thread for the bdi.
 */
static void bdi_flush_io(struct backing_dev_info *bdi)
{
	struct writeback_control wbc = {
		.nr_to_write		= 1024,
		.range_cyclic		= 1,
		.older_than_this	= NULL,
		.sync_mode		= WB_SYNC_NONE,
		.bdi			= bdi,
	};

	writeback_inodes_wbc(&wbc);
}
/*
 * Kernel thread that periodically calls sync_supers(), a job kupdated()
 * used to do. It cannot be folded into bdi_forker_task() because that
 * risks a deadlock on ->s_umount. Longer term, something like
 * sync_supers_bdi() could let each bdi writeback task do this itself.
 *
 * @unused: kthread argument, ignored.
 *
 * Returns 0 once kthread_should_stop() reports true.
 */
static int bdi_sync_supers(void *unused)
{
	set_user_nice(current, 0);

	for (;;) {
		if (kthread_should_stop())
			break;

		/* Sleep until woken, e.g. by sync_supers_timer_fn(). */
		set_current_state(TASK_INTERRUPTIBLE);
		schedule();

		/* Periodic superblock sync, as kupdated() did before. */
		sync_supers();
	}

	return 0;
}
/*
 * (Re)arm sync_supers_timer to fire one writeback interval from now.
 * The delay is dirty_writeback_interval * 10, interpreted as
 * milliseconds, and the expiry is rounded up by round_jiffies_up().
 */
static void arm_supers_timer(void)
{
	unsigned long delay = msecs_to_jiffies(dirty_writeback_interval * 10);
	unsigned long expires = round_jiffies_up(jiffies + delay);

	mod_timer(&sync_supers_timer, expires);
}
/*
 * Timer callback for sync_supers_timer: wake the sync_supers thread,
 * then schedule the next expiry.
 *
 * @unused: timer data word, ignored.
 */
static void sync_supers_timer_fn(unsigned long unused)
{
	/* Kick the thread sleeping in bdi_sync_supers(). */
	wake_up_process(sync_supers_tsk);

	/* Re-arm so the wakeups keep repeating. */
	arm_supers_timer();
}
/*
* Thread function of the forker thread that bdi_register() starts for a
* bdi with bdi_cap_flush_forker().
*
* @ptr: the struct bdi_writeback of the bdi that owns this thread.
*
* Each loop iteration:
*  1. writes back anything queued on the forker's own bdi,
*  2. scans bdi_list for bdis that have work or dirty IO but no thread
*     and hands them to bdi_add_default_flusher_task(),
*  3. sleeps for one writeback interval if bdi_pending_list is empty,
*     otherwise takes the first pending bdi and starts a "flush-%s"
*     thread for it.
*
* The loop has no break, so the trailing "return 0" is never reached.
*/
static int bdi_forker_task(void *ptr)
{
struct bdi_writeback *me = ptr;
bdi_task_init(me->bdi, me);
for (;;) {
struct backing_dev_info *bdi, *tmp;
struct bdi_writeback *wb;
/*
* Temporary measure, we want to make sure we don't see
* dirty data on the default backing_dev_info
*/
if (wb_has_dirty_io(me) || !list_empty(&me->bdi->work_list))
wb_do_writeback(me, 0);
spin_lock(&bdi_lock);
/*
* Check if any existing bdi's have dirty data without
* a thread registered. If so, set that up. The _safe iterator
* is needed because bdi_add_default_flusher_task() may move the
* current entry over to bdi_pending_list.
*/
list_for_each_entry_safe(bdi, tmp, &bdi_list, bdi_list) {
if (bdi->wb.task)
continue;
if (list_empty(&bdi->work_list) &&
!bdi_has_dirty_io(bdi))
continue;
bdi_add_default_flusher_task(bdi);
}
/*
* Mark ourselves sleeping before looking at the pending list, still
* under bdi_lock. NOTE(review): presumably so that a wake_up_process()
* issued after the check is not lost -- confirm.
*/
set_current_state(TASK_INTERRUPTIBLE);
if (list_empty(&bdi_pending_list)) {
unsigned long wait;
spin_unlock(&bdi_lock);
/* Nothing pending: sleep for up to one writeback interval */
wait = msecs_to_jiffies(dirty_writeback_interval * 10);
schedule_timeout(wait);
try_to_freeze();
continue;
}
/* There is work to do, so undo the sleeping state set above */
__set_current_state(TASK_RUNNING);
/*
* This is our real job - check for pending entries in
* bdi_pending_list, and create the tasks that got added
*/
bdi = list_entry(bdi_pending_list.next, struct backing_dev_info,
bdi_list);
list_del_init(&bdi->bdi_list);
spin_unlock(&bdi_lock);
wb = &bdi->wb;
/* On success the new thread puts the bdi back on bdi_list itself */
wb->task = kthread_run(bdi_start_fn, wb, "flush-%s",
dev_name(bdi->dev));
/*
* If task creation fails, then readd the bdi to
* the pending list and force writeout of the bdi
* from this forker thread. That will free some memory
* and we can try again.
*/
if (IS_ERR(wb->task)) {
/* Do not leave an ERR_PTR behind; NULL means "no thread" */
wb->task = NULL;
/*
* Add this 'bdi' to the back, so we get
* a chance to flush other bdi's to free
* memory.
*/
spin_lock(&bdi_lock);
list_add_tail(&bdi->bdi_list, &bdi_pending_list);
spin_unlock(&bdi_lock);
bdi_flush_io(bdi);
}
}
return 0;
}
/*
 * Queue @bdi for creation of its default flusher thread. Such a thread
 * gets created for any bdi that has dirty data pending writeout.
 *
 * Nothing happens if the bdi does not do dirty writeback, or if it has
 * not been registered yet (that case also warns and logs an error).
 * Otherwise BDI_pending is set, the bdi is moved to bdi_pending_list
 * and the forker thread is woken up to create the thread.
 *
 * The caller visible in this file, bdi_forker_task(), invokes this with
 * bdi_lock held, which covers the list move below.
 */
static void bdi_add_default_flusher_task(struct backing_dev_info *bdi)
{
	if (!bdi_cap_writeback_dirty(bdi))
		return;

	if (WARN_ON(!test_bit(BDI_registered, &bdi->state))) {
		printk(KERN_ERR "bdi %p/%s is not registered!\n",
		       bdi, bdi->name);
		return;
	}

	/*
	 * Check with the helper whether to proceed adding a task. Will only
	 * abort if two or more simultaneous calls to
	 * bdi_add_default_flusher_task() occurred; further additions will
	 * block waiting for previous additions to finish.
	 */
	if (!test_and_set_bit(BDI_pending, &bdi->state)) {
		list_move_tail(&bdi->bdi_list, &bdi_pending_list);

		/*
		 * We are now on the pending list, wake up bdi_forker_task()
		 * to finish the job and add us back to the active bdi_list
		 */
		wake_up_process(default_backing_dev_info.wb.task);
	}
}
int bdi_register(struct backing_dev_info *bdi, struct device *parent,
const char *fmt, ...)
{
@@ -211,9 +506,35 @@ int bdi_register(struct backing_dev_info *bdi, struct device *parent,
goto exit;
}
bdi->dev = dev;
bdi_debug_register(bdi, dev_name(dev));
spin_lock(&bdi_lock);
list_add_tail(&bdi->bdi_list, &bdi_list);
spin_unlock(&bdi_lock);
bdi->dev = dev;
/*
* Just start the forker thread for our default backing_dev_info,
* and add other bdi's to the list. They will get a thread created
* on-demand when they need it.
*/
if (bdi_cap_flush_forker(bdi)) {
struct bdi_writeback *wb = &bdi->wb;
wb->task = kthread_run(bdi_forker_task, wb, "bdi-%s",
dev_name(dev));
if (IS_ERR(wb->task)) {
wb->task = NULL;
ret = -ENOMEM;
spin_lock(&bdi_lock);
list_del(&bdi->bdi_list);
spin_unlock(&bdi_lock);
goto exit;
}
}
bdi_debug_register(bdi, dev_name(dev));
set_bit(BDI_registered, &bdi->state);
exit:
return ret;
}
@@ -225,9 +546,42 @@ int bdi_register_dev(struct backing_dev_info *bdi, dev_t dev)
}
EXPORT_SYMBOL(bdi_register_dev);
/*
* Remove bdi from the global list and shutdown any threads we have running
*
* Does nothing for a bdi without dirty-writeback capability. Called from
* bdi_unregister() for every bdi except the one that runs the forker
* thread (bdi_cap_flush_forker()).
*/
static void bdi_wb_shutdown(struct backing_dev_info *bdi)
{
struct bdi_writeback *wb;
if (!bdi_cap_writeback_dirty(bdi))
return;
/*
* If setup is pending, wait for that to complete first.
* bdi_start_fn() clears BDI_pending and issues the wakeup once the
* new flusher thread has registered itself.
*/
wait_on_bit(&bdi->state, BDI_pending, bdi_sched_wait,
TASK_UNINTERRUPTIBLE);
/*
* Make sure nobody finds us on the bdi_list anymore
*/
spin_lock(&bdi_lock);
list_del(&bdi->bdi_list);
spin_unlock(&bdi_lock);
/*
* Finally, kill the kernel threads. We don't need to be RCU
* safe anymore, since the bdi is gone from visibility.
*
* NOTE(review): bdi_start_fn() unlinks wb->list from wb_list on its way
* out while this loop walks the same list -- confirm that this is safe
* with the single embedded wb used here.
*/
list_for_each_entry(wb, &bdi->wb_list, list)
kthread_stop(wb->task);
}
void bdi_unregister(struct backing_dev_info *bdi)
{
if (bdi->dev) {
if (!bdi_cap_flush_forker(bdi))
bdi_wb_shutdown(bdi);
bdi_debug_unregister(bdi);
device_unregister(bdi->dev);
bdi->dev = NULL;
@@ -237,14 +591,25 @@ EXPORT_SYMBOL(bdi_unregister);
int bdi_init(struct backing_dev_info *bdi)
{
int i;
int err;
int i, err;
bdi->dev = NULL;
bdi->min_ratio = 0;
bdi->max_ratio = 100;
bdi->max_prop_frac = PROP_FRAC_BASE;
spin_lock_init(&bdi->wb_lock);
INIT_LIST_HEAD(&bdi->bdi_list);
INIT_LIST_HEAD(&bdi->wb_list);
INIT_LIST_HEAD(&bdi->work_list);
bdi_wb_init(&bdi->wb, bdi);
/*
* Just one thread support for now, hard code mask and count
*/
bdi->wb_mask = 1;
bdi->wb_cnt = 1;
for (i = 0; i < NR_BDI_STAT_ITEMS; i++) {
err = percpu_counter_init(&bdi->bdi_stat[i], 0);
@@ -269,6 +634,8 @@ void bdi_destroy(struct backing_dev_info *bdi)
{
int i;
WARN_ON(bdi_has_dirty_io(bdi));
bdi_unregister(bdi);
for (i = 0; i < NR_BDI_STAT_ITEMS; i++)

View File

@@ -35,15 +35,6 @@
#include <linux/buffer_head.h>
#include <linux/pagevec.h>
/*
* The maximum number of pages to write out in a single bdflush/kupdate
* operation. We do this so we don't hold I_SYNC against an inode for
* enormous amounts of time, which would block a userspace task which has
* been forced to throttle against that inode. Also, the code reevaluates
* the dirty limits each time it has written this many pages.
*/
#define MAX_WRITEBACK_PAGES 1024
/*
* After a CPU has dirtied this many pages, balance_dirty_pages_ratelimited
* will look to see if it needs to force writeback or throttling.
@@ -117,8 +108,6 @@ EXPORT_SYMBOL(laptop_mode);
/* End of sysctl-exported parameters */
static void background_writeout(unsigned long _min_pages);
/*
* Scale the writeback cache size proportional to the relative writeout speeds.
*
@@ -320,15 +309,13 @@ static void task_dirty_limit(struct task_struct *tsk, unsigned long *pdirty)
/*
*
*/
static DEFINE_SPINLOCK(bdi_lock);
static unsigned int bdi_min_ratio;
int bdi_set_min_ratio(struct backing_dev_info *bdi, unsigned int min_ratio)
{
int ret = 0;
unsigned long flags;
spin_lock_irqsave(&bdi_lock, flags);
spin_lock(&bdi_lock);
if (min_ratio > bdi->max_ratio) {
ret = -EINVAL;
} else {
@@ -340,27 +327,26 @@ int bdi_set_min_ratio(struct backing_dev_info *bdi, unsigned int min_ratio)
ret = -EINVAL;
}
}
spin_unlock_irqrestore(&bdi_lock, flags);
spin_unlock(&bdi_lock);
return ret;
}
int bdi_set_max_ratio(struct backing_dev_info *bdi, unsigned max_ratio)
{
unsigned long flags;
int ret = 0;
if (max_ratio > 100)
return -EINVAL;
spin_lock_irqsave(&bdi_lock, flags);
spin_lock(&bdi_lock);
if (bdi->min_ratio > max_ratio) {
ret = -EINVAL;
} else {
bdi->max_ratio = max_ratio;
bdi->max_prop_frac = (PROP_FRAC_BASE * max_ratio) / 100;
}
spin_unlock_irqrestore(&bdi_lock, flags);
spin_unlock(&bdi_lock);
return ret;
}
@@ -546,7 +532,7 @@ static void balance_dirty_pages(struct address_space *mapping)
* up.
*/
if (bdi_nr_reclaimable > bdi_thresh) {
writeback_inodes(&wbc);
writeback_inodes_wbc(&wbc);
pages_written += write_chunk - wbc.nr_to_write;
get_dirty_limits(&background_thresh, &dirty_thresh,
&bdi_thresh, bdi);
@@ -575,7 +561,7 @@ static void balance_dirty_pages(struct address_space *mapping)
if (pages_written >= write_chunk)
break; /* We've done our duty */
congestion_wait(BLK_RW_ASYNC, HZ/10);
schedule_timeout(1);
}
if (bdi_nr_reclaimable + bdi_nr_writeback < bdi_thresh &&
@@ -594,10 +580,18 @@ static void balance_dirty_pages(struct address_space *mapping)
* background_thresh, to keep the amount of dirty memory low.
*/
if ((laptop_mode && pages_written) ||
(!laptop_mode && (global_page_state(NR_FILE_DIRTY)
+ global_page_state(NR_UNSTABLE_NFS)
> background_thresh)))
pdflush_operation(background_writeout, 0);
(!laptop_mode && ((nr_writeback = global_page_state(NR_FILE_DIRTY)
+ global_page_state(NR_UNSTABLE_NFS))
> background_thresh))) {
struct writeback_control wbc = {
.bdi = bdi,
.sync_mode = WB_SYNC_NONE,
.nr_to_write = nr_writeback,
};
bdi_start_writeback(&wbc);
}
}
void set_page_dirty_balance(struct page *page, int page_mkwrite)
@@ -681,124 +675,10 @@ void throttle_vm_writeout(gfp_t gfp_mask)
}
}
/*
 * Write back at least @_min_pages pages, then keep going until the
 * amount of dirty memory has dropped below the background threshold or
 * there is nothing left to write.
 *
 * Work is issued in chunks of MAX_WRITEBACK_PAGES. The dirty limits are
 * recomputed before every chunk.
 */
static void background_writeout(unsigned long _min_pages)
{
	struct writeback_control wbc = {
		.bdi			= NULL,
		.sync_mode		= WB_SYNC_NONE,
		.older_than_this	= NULL,
		.nr_to_write		= 0,
		.nonblocking		= 1,
		.range_cyclic		= 1,
	};
	long min_pages = _min_pages;

	for (;;) {
		unsigned long background_thresh, dirty_thresh;
		unsigned long nr_reclaimable;

		get_dirty_limits(&background_thresh, &dirty_thresh, NULL, NULL);
		nr_reclaimable = global_page_state(NR_FILE_DIRTY) +
				 global_page_state(NR_UNSTABLE_NFS);

		/* Done: quota met and we are below the background limit. */
		if (nr_reclaimable < background_thresh && min_pages <= 0)
			break;

		/* Reset the per-chunk state before each pass. */
		wbc.more_io = 0;
		wbc.encountered_congestion = 0;
		wbc.nr_to_write = MAX_WRITEBACK_PAGES;
		wbc.pages_skipped = 0;
		writeback_inodes(&wbc);

		/* Charge what this pass actually wrote against the quota. */
		min_pages -= MAX_WRITEBACK_PAGES - wbc.nr_to_write;

		/* A full chunk went out with nothing skipped: go again. */
		if (wbc.nr_to_write <= 0 && wbc.pages_skipped <= 0)
			continue;

		/* Wrote less than expected, and not for lack of progress. */
		if (!wbc.encountered_congestion && !wbc.more_io)
			break;

		congestion_wait(BLK_RW_ASYNC, HZ/10);
	}
}
/*
 * Ask a pdflush thread to write back @nr_pages pages. Passing zero
 * means "everything": the current number of dirty plus unstable-NFS
 * pages is used instead.
 *
 * Returns the result of pdflush_operation(): 0 when a pdflush thread
 * was dispatched, -1 when no idle thread was available.
 */
int wakeup_pdflush(long nr_pages)
{
	long target = nr_pages;

	if (target == 0)
		target = global_page_state(NR_FILE_DIRTY) +
			 global_page_state(NR_UNSTABLE_NFS);

	return pdflush_operation(background_writeout, target);
}
static void wb_timer_fn(unsigned long unused);
static void laptop_timer_fn(unsigned long unused);
static DEFINE_TIMER(wb_timer, wb_timer_fn, 0, 0);
static DEFINE_TIMER(laptop_mode_wb_timer, laptop_timer_fn, 0, 0);
/*
* Periodic writeback of "old" data.
*
* Define "old": the first time one of an inode's pages is dirtied, we mark the
* dirtying-time in the inode's address_space. So this periodic writeback code
* just walks the superblock inode list, writing back any inodes which are
* older than a specific point in time.
*
* Try to run once per dirty_writeback_interval. But if a writeback event
* takes longer than a dirty_writeback_interval interval, then leave a
* one-second gap.
*
* older_than_this takes precedence over nr_to_write. So we'll only write back
* all dirty pages if they are all attached to "old" mappings.
*
* @arg is not used by the body.
*/
static void wb_kupdate(unsigned long arg)
{
unsigned long oldest_jif;
unsigned long start_jif;
unsigned long next_jif;
long nr_to_write;
struct writeback_control wbc = {
.bdi = NULL,
.sync_mode = WB_SYNC_NONE,
.older_than_this = &oldest_jif,
.nr_to_write = 0,
.nonblocking = 1,
.for_kupdate = 1,
.range_cyclic = 1,
};
sync_supers();
/* Cutoff handed to writeback through wbc.older_than_this */
oldest_jif = jiffies - msecs_to_jiffies(dirty_expire_interval * 10);
start_jif = jiffies;
next_jif = start_jif + msecs_to_jiffies(dirty_writeback_interval * 10);
/* Total budget: dirty pages + unstable NFS pages + in-use inodes */
nr_to_write = global_page_state(NR_FILE_DIRTY) +
global_page_state(NR_UNSTABLE_NFS) +
(inodes_stat.nr_inodes - inodes_stat.nr_unused);
/* Spend the budget in chunks of at most MAX_WRITEBACK_PAGES */
while (nr_to_write > 0) {
wbc.more_io = 0;
wbc.encountered_congestion = 0;
wbc.nr_to_write = MAX_WRITEBACK_PAGES;
writeback_inodes(&wbc);
if (wbc.nr_to_write > 0) {
/* Chunk not used up: wait if held up, otherwise we are done */
if (wbc.encountered_congestion || wbc.more_io)
congestion_wait(BLK_RW_ASYNC, HZ/10);
else
break; /* All the old data is written */
}
nr_to_write -= MAX_WRITEBACK_PAGES - wbc.nr_to_write;
}
/* Never schedule the next run less than one second from now */
if (time_before(next_jif, jiffies + HZ))
next_jif = jiffies + HZ;
/* With a zero interval the timer is simply not re-armed */
if (dirty_writeback_interval)
mod_timer(&wb_timer, next_jif);
}
/*
* sysctl handler for /proc/sys/vm/dirty_writeback_centisecs
*/
@@ -806,28 +686,24 @@ int dirty_writeback_centisecs_handler(ctl_table *table, int write,
struct file *file, void __user *buffer, size_t *length, loff_t *ppos)
{
proc_dointvec(table, write, file, buffer, length, ppos);
if (dirty_writeback_interval)
mod_timer(&wb_timer, jiffies +
msecs_to_jiffies(dirty_writeback_interval * 10));
else
del_timer(&wb_timer);
return 0;
}
static void wb_timer_fn(unsigned long unused)
static void do_laptop_sync(struct work_struct *work)
{
if (pdflush_operation(wb_kupdate, 0) < 0)
mod_timer(&wb_timer, jiffies + HZ); /* delay 1 second */
}
static void laptop_flush(unsigned long unused)
{
sys_sync();
wakeup_flusher_threads(0);
kfree(work);
}
static void laptop_timer_fn(unsigned long unused)
{
pdflush_operation(laptop_flush, 0);
struct work_struct *work;
work = kmalloc(sizeof(*work), GFP_ATOMIC);
if (work) {
INIT_WORK(work, do_laptop_sync);
schedule_work(work);
}
}
/*
@@ -910,8 +786,6 @@ void __init page_writeback_init(void)
{
int shift;
mod_timer(&wb_timer,
jiffies + msecs_to_jiffies(dirty_writeback_interval * 10));
writeback_set_ratelimit();
register_cpu_notifier(&ratelimit_nb);

View File

@@ -1,269 +0,0 @@
/*
* mm/pdflush.c - worker threads for writing back filesystem data
*
* Copyright (C) 2002, Linus Torvalds.
*
* 09Apr2002 Andrew Morton
* Initial version
* 29Feb2004 kaos@sgi.com
* Move worker thread creation to kthread to avoid chewing
* up stack space with nested calls to kernel_thread.
*/
#include <linux/sched.h>
#include <linux/list.h>
#include <linux/signal.h>
#include <linux/spinlock.h>
#include <linux/gfp.h>
#include <linux/init.h>
#include <linux/module.h>
#include <linux/fs.h> /* Needed by writeback.h */
#include <linux/writeback.h> /* Prototypes pdflush_operation() */
#include <linux/kthread.h>
#include <linux/cpuset.h>
#include <linux/freezer.h>
/*
* Minimum and maximum number of pdflush instances
*/
#define MIN_PDFLUSH_THREADS 2
#define MAX_PDFLUSH_THREADS 8
static void start_one_pdflush_thread(void);
/*
* The pdflush threads are worker threads for writing back dirty data.
* Ideally, we'd like one thread per active disk spindle. But the disk
* topology is very hard to divine at this level. Instead, we take
* care in various places to prevent more than one pdflush thread from
* performing writeback against a single filesystem. pdflush threads
* have the PF_FLUSHER flag set in current->flags to aid in this.
*/
/*
* All the pdflush threads. Protected by pdflush_lock
*/
static LIST_HEAD(pdflush_list);
static DEFINE_SPINLOCK(pdflush_lock);
/*
* The count of currently-running pdflush threads. Protected
* by pdflush_lock.
*
* Readable by sysctl, but not writable. Published to userspace at
* /proc/sys/vm/nr_pdflush_threads.
*/
int nr_pdflush_threads = 0;
/*
* The time at which the pdflush thread pool last went empty
*/
static unsigned long last_empty_jifs;
/*
* The pdflush thread.
*
* Thread pool management algorithm:
*
* - The minimum and maximum number of pdflush instances are bound
* by MIN_PDFLUSH_THREADS and MAX_PDFLUSH_THREADS.
*
* - If there have been no idle pdflush instances for 1 second, create
* a new one.
*
* - If the least-recently-went-to-sleep pdflush thread has been asleep
* for more than one second, terminate a thread.
*/
/*
* A structure for passing work to a pdflush thread. Also for passing
* state information between pdflush threads. Protected by pdflush_lock.
*
* One instance exists per pdflush thread; it lives on the stack of
* pdflush() and is handed to __pdflush().
*/
struct pdflush_work {
struct task_struct *who; /* The thread that owns this structure */
void (*fn)(unsigned long); /* Callback to run; NULL while no work is assigned */
unsigned long arg0; /* Argument passed to the callback */
struct list_head list; /* On pdflush_list, when idle */
unsigned long when_i_went_to_sleep; /* jiffies stamp taken just before sleeping */
};
/*
* Main loop of one pdflush worker thread.
*
* @my_work: this thread's control structure (see struct pdflush_work).
*
* The thread parks itself on pdflush_list and sleeps until
* pdflush_operation() takes it off the list, fills in fn/arg0 and wakes
* it. After running the callback it may start another thread (if the
* pool has been empty for over a second) or leave the loop and exit (if
* the longest-idle thread has slept for over a second and more than
* MIN_PDFLUSH_THREADS exist).
*
* pdflush_lock is held at the top of every loop iteration and at the
* break. Returns 0 when the thread exits.
*/
static int __pdflush(struct pdflush_work *my_work)
{
current->flags |= PF_FLUSHER | PF_SWAPWRITE;
set_freezable();
my_work->fn = NULL;
my_work->who = current;
INIT_LIST_HEAD(&my_work->list);
spin_lock_irq(&pdflush_lock);
for ( ; ; ) {
struct pdflush_work *pdf;
set_current_state(TASK_INTERRUPTIBLE);
/* Go to the head of the idle list; the tail is the longest sleeper */
list_move(&my_work->list, &pdflush_list);
my_work->when_i_went_to_sleep = jiffies;
spin_unlock_irq(&pdflush_lock);
schedule();
try_to_freeze();
spin_lock_irq(&pdflush_lock);
if (!list_empty(&my_work->list)) {
/*
* Someone woke us up, but without removing our control
* structure from the global list. swsusp will do this
* in try_to_freeze()->refrigerator(). Handle it.
*/
my_work->fn = NULL;
continue;
}
if (my_work->fn == NULL) {
printk("pdflush: bogus wakeup\n");
continue;
}
/* Run the callback with the lock dropped */
spin_unlock_irq(&pdflush_lock);
(*my_work->fn)(my_work->arg0);
spin_lock_irq(&pdflush_lock);
/*
* Thread creation: For how long have there been zero
* available threads?
*
* To throttle creation, we reset last_empty_jifs.
*/
if (time_after(jiffies, last_empty_jifs + 1 * HZ)) {
if (list_empty(&pdflush_list)) {
if (nr_pdflush_threads < MAX_PDFLUSH_THREADS) {
last_empty_jifs = jiffies;
/* Count the thread up front; undone if creation fails */
nr_pdflush_threads++;
spin_unlock_irq(&pdflush_lock);
start_one_pdflush_thread();
spin_lock_irq(&pdflush_lock);
}
}
}
my_work->fn = NULL;
/*
* Thread destruction: For how long has the sleepiest
* thread slept?
*/
if (list_empty(&pdflush_list))
continue;
if (nr_pdflush_threads <= MIN_PDFLUSH_THREADS)
continue;
/* Tail of the list == thread that has been idle the longest */
pdf = list_entry(pdflush_list.prev, struct pdflush_work, list);
if (time_after(jiffies, pdf->when_i_went_to_sleep + 1 * HZ)) {
/* Limit exit rate */
pdf->when_i_went_to_sleep = jiffies;
break; /* exeunt */
}
}
/* Still holding pdflush_lock here: account for our own exit */
nr_pdflush_threads--;
spin_unlock_irq(&pdflush_lock);
return 0;
}
/*
* Thread entry point for a pdflush worker; @dummy is not used.
*
* Of course, my_work wants to be just a local in __pdflush(). It is
* separated out in this manner to hopefully prevent the compiler from
* performing unfortunate optimisations against the auto variables. Because
* these are visible to other tasks and CPUs. (No problem has actually
* been observed. This is just paranoia).
*
* Returns 0 if the cpumask allocation fails, otherwise the return value
* of __pdflush().
*/
static int pdflush(void *dummy)
{
struct pdflush_work my_work;
cpumask_var_t cpus_allowed;
/*
* Since the caller doesn't even check kthread_run() worked, let's not
* freak out too much if this fails.
*/
if (!alloc_cpumask_var(&cpus_allowed, GFP_KERNEL)) {
printk(KERN_WARNING "pdflush failed to allocate cpumask\n");
return 0;
}
/*
* pdflush can spend a lot of time doing encryption via dm-crypt. We
* don't want to do that at keventd's priority.
*/
set_user_nice(current, 0);
/*
* Some configs put our parent kthread in a limited cpuset,
* which kthread() overrides, forcing cpus_allowed == cpu_all_mask.
* Our needs are more modest - cut back to our cpusets cpus_allowed.
* This is needed as pdflush's are dynamically created and destroyed.
* The boottime pdflush's are easily placed w/o these 2 lines.
*/
cpuset_cpus_allowed(current, cpus_allowed);
set_cpus_allowed_ptr(current, cpus_allowed);
/* The mask was only needed for the call above */
free_cpumask_var(cpus_allowed);
return __pdflush(&my_work);
}
/*
* Attempt to wake up a pdflush thread, and get it to do some work for you.
* Returns zero if it indeed managed to find a worker thread, and passed your
* payload to it.
*/
int pdflush_operation(void (*fn)(unsigned long), unsigned long arg0)
{
unsigned long flags;
int ret = 0;
BUG_ON(fn == NULL); /* Hard to diagnose if it's deferred */
spin_lock_irqsave(&pdflush_lock, flags);
if (list_empty(&pdflush_list)) {
ret = -1;
} else {
struct pdflush_work *pdf;
pdf = list_entry(pdflush_list.next, struct pdflush_work, list);
list_del_init(&pdf->list);
if (list_empty(&pdflush_list))
last_empty_jifs = jiffies;
pdf->fn = fn;
pdf->arg0 = arg0;
wake_up_process(pdf->who);
}
spin_unlock_irqrestore(&pdflush_lock, flags);
return ret;
}
/*
 * Spawn one more pdflush thread. Callers bump nr_pdflush_threads before
 * calling this, so the count is dropped again here if creation fails.
 */
static void start_one_pdflush_thread(void)
{
	struct task_struct *tsk;

	tsk = kthread_run(pdflush, NULL, "pdflush");
	if (likely(!IS_ERR(tsk)))
		return;

	/* Creation failed: undo the caller's pre-increment. */
	spin_lock_irq(&pdflush_lock);
	nr_pdflush_threads--;
	spin_unlock_irq(&pdflush_lock);
}
/*
 * Boot-time setup: start the minimum number of pdflush threads.
 * Always returns 0.
 */
static int __init pdflush_init(void)
{
	int remaining = MIN_PDFLUSH_THREADS;

	/*
	 * Pre-set nr_pdflush_threads... If we fail to create,
	 * the count will be decremented.
	 */
	nr_pdflush_threads = MIN_PDFLUSH_THREADS;

	while (remaining-- > 0)
		start_one_pdflush_thread();

	return 0;
}
module_init(pdflush_init);

View File

@@ -34,6 +34,7 @@ static const struct address_space_operations swap_aops = {
};
static struct backing_dev_info swap_backing_dev_info = {
.name = "swap",
.capabilities = BDI_CAP_NO_ACCT_AND_WRITEBACK | BDI_CAP_SWAP_BACKED,
.unplug_io_fn = swap_unplug_io_fn,
};

View File

@@ -1720,7 +1720,7 @@ static unsigned long do_try_to_free_pages(struct zonelist *zonelist,
*/
if (total_scanned > sc->swap_cluster_max +
sc->swap_cluster_max / 2) {
wakeup_pdflush(laptop_mode ? 0 : total_scanned);
wakeup_flusher_threads(laptop_mode ? 0 : total_scanned);
sc->may_writepage = 1;
}