@@ -19,171 +19,223 @@
 #include <linux/sched.h>
 #include <linux/fs.h>
 #include <linux/mm.h>
+#include <linux/kthread.h>
+#include <linux/freezer.h>
 #include <linux/writeback.h>
 #include <linux/blkdev.h>
 #include <linux/backing-dev.h>
 #include <linux/buffer_head.h>
 #include "internal.h"

+#define inode_to_bdi(inode)	((inode)->i_mapping->backing_dev_info)

-/**
- * writeback_acquire - attempt to get exclusive writeback access to a device
- * @bdi: the device's backing_dev_info structure
- *
- * It is a waste of resources to have more than one pdflush thread blocked on
- * a single request queue. Exclusion at the request_queue level is obtained
- * via a flag in the request_queue's backing_dev_info.state.
- *
- * Non-request_queue-backed address_spaces will share default_backing_dev_info,
- * unless they implement their own. Which is somewhat inefficient, as this
- * may prevent concurrent writeback against multiple devices.
+/*
+ * We don't actually have pdflush, but this one is exported through /proc...
  */
-static int writeback_acquire(struct backing_dev_info *bdi)
+int nr_pdflush_threads;
+
+/*
+ * Work items for the bdi_writeback threads
+ */
+struct bdi_work {
+	struct list_head list;
+	struct list_head wait_list;
+	struct rcu_head rcu_head;
+
+	unsigned long seen;
+	atomic_t pending;
+
+	struct super_block *sb;
+	unsigned long nr_pages;
+	enum writeback_sync_modes sync_mode;
+
+	unsigned long state;
+};
+
+enum {
+	WS_USED_B = 0,
+	WS_ONSTACK_B,
+};
+
+#define WS_USED (1 << WS_USED_B)
+#define WS_ONSTACK (1 << WS_ONSTACK_B)
+
+static inline bool bdi_work_on_stack(struct bdi_work *work)
 {
-	return !test_and_set_bit(BDI_pdflush, &bdi->state);
+	return test_bit(WS_ONSTACK_B, &work->state);
+}
+
+static inline void bdi_work_init(struct bdi_work *work,
+				 struct writeback_control *wbc)
+{
+	INIT_RCU_HEAD(&work->rcu_head);
+	work->sb = wbc->sb;
+	work->nr_pages = wbc->nr_to_write;
+	work->sync_mode = wbc->sync_mode;
+	work->state = WS_USED;
+}
+
+static inline void bdi_work_init_on_stack(struct bdi_work *work,
+					  struct writeback_control *wbc)
+{
+	bdi_work_init(work, wbc);
+	work->state |= WS_ONSTACK;
 }

 /**
  * writeback_in_progress - determine whether there is writeback in progress
  * @bdi: the device's backing_dev_info structure.
  *
- * Determine whether there is writeback in progress against a backing device.
+ * Determine whether there is writeback waiting to be handled against a
+ * backing device.
  */
 int writeback_in_progress(struct backing_dev_info *bdi)
 {
-	return test_bit(BDI_pdflush, &bdi->state);
+	return !list_empty(&bdi->work_list);
 }

-/**
- * writeback_release - relinquish exclusive writeback access against a device.
- * @bdi: the device's backing_dev_info structure
- */
-static void writeback_release(struct backing_dev_info *bdi)
+static void bdi_work_clear(struct bdi_work *work)
 {
-	BUG_ON(!writeback_in_progress(bdi));
-	clear_bit(BDI_pdflush, &bdi->state);
+	clear_bit(WS_USED_B, &work->state);
+	smp_mb__after_clear_bit();
+	wake_up_bit(&work->state, WS_USED_B);
 }

-static noinline void block_dump___mark_inode_dirty(struct inode *inode)
+static void bdi_work_free(struct rcu_head *head)
 {
-	if (inode->i_ino || strcmp(inode->i_sb->s_id, "bdev")) {
-		struct dentry *dentry;
-		const char *name = "?";
+	struct bdi_work *work = container_of(head, struct bdi_work, rcu_head);

-		dentry = d_find_alias(inode);
-		if (dentry) {
-			spin_lock(&dentry->d_lock);
-			name = (const char *) dentry->d_name.name;
-		}
-		printk(KERN_DEBUG
-		       "%s(%d): dirtied inode %lu (%s) on %s\n",
-		       current->comm, task_pid_nr(current), inode->i_ino,
-		       name, inode->i_sb->s_id);
-		if (dentry) {
-			spin_unlock(&dentry->d_lock);
-			dput(dentry);
-		}
-	}
+	if (!bdi_work_on_stack(work))
+		kfree(work);
+	else
+		bdi_work_clear(work);
 }

-/**
- * __mark_inode_dirty - internal function
- * @inode: inode to mark
- * @flags: what kind of dirty (i.e. I_DIRTY_SYNC)
- * Mark an inode as dirty. Callers should use mark_inode_dirty or
- * mark_inode_dirty_sync.
- *
- * Put the inode on the super block's dirty list.
- *
- * CAREFUL! We mark it dirty unconditionally, but move it onto the
- * dirty list only if it is hashed or if it refers to a blockdev.
- * If it was not hashed, it will never be added to the dirty list
- * even if it is later hashed, as it will have been marked dirty already.
- *
- * In short, make sure you hash any inodes _before_ you start marking
- * them dirty.
- *
- * This function *must* be atomic for the I_DIRTY_PAGES case -
- * set_page_dirty() is called under spinlock in several places.
- *
- * Note that for blockdevs, inode->dirtied_when represents the dirtying time of
- * the block-special inode (/dev/hda1) itself. And the ->dirtied_when field of
- * the kernel-internal blockdev inode represents the dirtying time of the
- * blockdev's pages. This is why for I_DIRTY_PAGES we always use
- * page->mapping->host, so the page-dirtying time is recorded in the internal
- * blockdev inode.
- */
-void __mark_inode_dirty(struct inode *inode, int flags)
+static void wb_work_complete(struct bdi_work *work)
 {
-	struct super_block *sb = inode->i_sb;
+	const enum writeback_sync_modes sync_mode = work->sync_mode;

 	/*
-	 * Don't do this for I_DIRTY_PAGES - that doesn't actually
-	 * dirty the inode itself
+	 * For allocated work, we can clear the done/seen bit right here.
+	 * For on-stack work, we need to postpone both the clear and free
+	 * to after the RCU grace period, since the stack could be invalidated
+	 * as soon as bdi_work_clear() has done the wakeup.
 	 */
-	if (flags & (I_DIRTY_SYNC | I_DIRTY_DATASYNC)) {
-		if (sb->s_op->dirty_inode)
-			sb->s_op->dirty_inode(inode);
-	}
+	if (!bdi_work_on_stack(work))
+		bdi_work_clear(work);
+	if (sync_mode == WB_SYNC_NONE || bdi_work_on_stack(work))
+		call_rcu(&work->rcu_head, bdi_work_free);
+}

+static void wb_clear_pending(struct bdi_writeback *wb, struct bdi_work *work)
+{
 	/*
-	 * make sure that changes are seen by all cpus before we test i_state
-	 * -- mikulas
+	 * The caller has retrieved the work arguments from this work;
+	 * drop our reference. If this is the last ref, delete and free it
 	 */
-	smp_mb();
+	if (atomic_dec_and_test(&work->pending)) {
+		struct backing_dev_info *bdi = wb->bdi;

-	/* avoid the locking if we can */
-	if ((inode->i_state & flags) == flags)
-		return;
+		spin_lock(&bdi->wb_lock);
+		list_del_rcu(&work->list);
+		spin_unlock(&bdi->wb_lock);

-	if (unlikely(block_dump))
-		block_dump___mark_inode_dirty(inode);
-
-	spin_lock(&inode_lock);
-	if ((inode->i_state & flags) != flags) {
-		const int was_dirty = inode->i_state & I_DIRTY;
+		wb_work_complete(work);
+	}
+}

-		inode->i_state |= flags;
+static void bdi_queue_work(struct backing_dev_info *bdi, struct bdi_work *work)
+{
+	if (work) {
+		work->seen = bdi->wb_mask;
+		BUG_ON(!work->seen);
+		atomic_set(&work->pending, bdi->wb_cnt);
+		BUG_ON(!bdi->wb_cnt);

 		/*
-		 * If the inode is being synced, just update its dirty state.
-		 * The unlocker will place the inode on the appropriate
-		 * superblock list, based upon its state.
+		 * Make sure stores are seen before it appears on the list
 		 */
-		if (inode->i_state & I_SYNC)
-			goto out;
+		smp_mb();

-		/*
-		 * Only add valid (hashed) inodes to the superblock's
-		 * dirty list. Add blockdev inodes as well.
-		 */
-		if (!S_ISBLK(inode->i_mode)) {
-			if (hlist_unhashed(&inode->i_hash))
-				goto out;
-		}
-		if (inode->i_state & (I_FREEING|I_CLEAR))
-			goto out;
+		spin_lock(&bdi->wb_lock);
+		list_add_tail_rcu(&work->list, &bdi->work_list);
+		spin_unlock(&bdi->wb_lock);
+	}
+
+	/*
+	 * If the default thread isn't there, make sure we add it. When
+	 * it gets created and wakes up, we'll run this work.
+	 */
+	if (unlikely(list_empty_careful(&bdi->wb_list)))
+		wake_up_process(default_backing_dev_info.wb.task);
+	else {
+		struct bdi_writeback *wb = &bdi->wb;

 		/*
-		 * If the inode was already on s_dirty/s_io/s_more_io, don't
-		 * reposition it (that would break s_dirty time-ordering).
+		 * If we failed allocating the bdi work item, wake up the wb
+		 * thread always. As a safety precaution, it'll flush out
+		 * everything
 		 */
-		if (!was_dirty) {
-			inode->dirtied_when = jiffies;
-			list_move(&inode->i_list, &sb->s_dirty);
-		}
+		if (!wb_has_dirty_io(wb)) {
+			if (work)
+				wb_clear_pending(wb, work);
+		} else if (wb->task)
+			wake_up_process(wb->task);
 	}
-out:
-	spin_unlock(&inode_lock);
 }

-EXPORT_SYMBOL(__mark_inode_dirty);
+/*
+ * Used for on-stack allocated work items. The caller needs to wait until
+ * the wb threads have acked the work before it's safe to continue.
+ */
+static void bdi_wait_on_work_clear(struct bdi_work *work)
+{
+	wait_on_bit(&work->state, WS_USED_B, bdi_sched_wait,
+		    TASK_UNINTERRUPTIBLE);
+}
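
For reference, bdi_sched_wait -- the bit-wait action passed to wait_on_bit()
above -- is not defined in this hunk; in this series it is expected to be the
usual "just schedule" helper living alongside the other bdi declarations,
roughly:

	static int bdi_sched_wait(void *word)
	{
		schedule();	/* wake_up_bit() in bdi_work_clear() ends the wait */
		return 0;
	}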

-static int write_inode(struct inode *inode, int sync)
+static struct bdi_work *bdi_alloc_work(struct writeback_control *wbc)
 {
-	if (inode->i_sb->s_op->write_inode && !is_bad_inode(inode))
-		return inode->i_sb->s_op->write_inode(inode, sync);
-	return 0;
+	struct bdi_work *work;
+
+	work = kmalloc(sizeof(*work), GFP_ATOMIC);
+	if (work)
+		bdi_work_init(work, wbc);
+
+	return work;
+}
+
+void bdi_start_writeback(struct writeback_control *wbc)
+{
+	const bool must_wait = wbc->sync_mode == WB_SYNC_ALL;
+	struct bdi_work work_stack, *work = NULL;
+
+	if (!must_wait)
+		work = bdi_alloc_work(wbc);
+
+	if (!work) {
+		work = &work_stack;
+		bdi_work_init_on_stack(work, wbc);
+	}
+
+	bdi_queue_work(wbc->bdi, work);
+
+	/*
+	 * If the sync mode is WB_SYNC_ALL, block waiting for the work to
+	 * complete. If not, we only need to wait for the work to be started,
+	 * if we allocated it on-stack. We use the same mechanism: if the
+	 * wait bit is set in the bdi_work struct, then threads will not
+	 * clear pending until after they are done.
+	 *
+	 * Note that work == &work_stack if must_wait is true, so we don't
+	 * need to do call_rcu() here ever, since the completion path will
+	 * have done that for us.
+	 */
+	if (must_wait || work == &work_stack) {
+		bdi_wait_on_work_clear(work);
+		if (work != &work_stack)
+			call_rcu(&work->rcu_head, bdi_work_free);
+	}
 }
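
A usage sketch for the interface above (hypothetical caller; the
writeback_control fields shown are the ones bdi_work_init() copies into the
work item -- sb, nr_to_write, sync_mode -- plus the target bdi):

	struct writeback_control wbc = {
		.bdi		= bdi,		/* target device */
		.sb		= NULL,		/* any superblock on this bdi */
		.sync_mode	= WB_SYNC_NONE,
		.nr_to_write	= 1024,
	};

	bdi_start_writeback(&wbc);	/* returns once the work is queued */

With WB_SYNC_ALL the work item lives on the caller's stack instead, and the
call blocks in bdi_wait_on_work_clear() until the flusher threads are done
with it.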

 /*
@@ -191,31 +243,32 @@ static int write_inode(struct inode *inode, int sync)
  * furthest end of its superblock's dirty-inode list.
  *
  * Before stamping the inode's ->dirtied_when, we check to see whether it is
- * already the most-recently-dirtied inode on the s_dirty list. If that is
+ * already the most-recently-dirtied inode on the b_dirty list. If that is
  * the case then the inode must have been redirtied while it was being written
  * out and we don't reset its dirtied_when.
  */
 static void redirty_tail(struct inode *inode)
 {
-	struct super_block *sb = inode->i_sb;
+	struct bdi_writeback *wb = &inode_to_bdi(inode)->wb;

-	if (!list_empty(&sb->s_dirty)) {
-		struct inode *tail_inode;
+	if (!list_empty(&wb->b_dirty)) {
+		struct inode *tail;

-		tail_inode = list_entry(sb->s_dirty.next, struct inode, i_list);
-		if (time_before(inode->dirtied_when,
-				tail_inode->dirtied_when))
+		tail = list_entry(wb->b_dirty.next, struct inode, i_list);
+		if (time_before(inode->dirtied_when, tail->dirtied_when))
 			inode->dirtied_when = jiffies;
 	}
-	list_move(&inode->i_list, &sb->s_dirty);
+	list_move(&inode->i_list, &wb->b_dirty);
 }

 /*
- * requeue inode for re-scanning after sb->s_io list is exhausted.
+ * requeue inode for re-scanning after bdi->b_io list is exhausted.
  */
 static void requeue_io(struct inode *inode)
 {
-	list_move(&inode->i_list, &inode->i_sb->s_more_io);
+	struct bdi_writeback *wb = &inode_to_bdi(inode)->wb;
+
+	list_move(&inode->i_list, &wb->b_more_io);
 }

 static void inode_sync_complete(struct inode *inode)
@@ -262,20 +315,18 @@ static void move_expired_inodes(struct list_head *delaying_queue,
 /*
  * Queue all expired dirty inodes for io, eldest first.
  */
-static void queue_io(struct super_block *sb,
-		     unsigned long *older_than_this)
+static void queue_io(struct bdi_writeback *wb, unsigned long *older_than_this)
 {
-	list_splice_init(&sb->s_more_io, sb->s_io.prev);
-	move_expired_inodes(&sb->s_dirty, &sb->s_io, older_than_this);
+	list_splice_init(&wb->b_more_io, wb->b_io.prev);
+	move_expired_inodes(&wb->b_dirty, &wb->b_io, older_than_this);
 }

-int sb_has_dirty_inodes(struct super_block *sb)
+static int write_inode(struct inode *inode, int sync)
 {
-	return !list_empty(&sb->s_dirty) ||
-	       !list_empty(&sb->s_io) ||
-	       !list_empty(&sb->s_more_io);
+	if (inode->i_sb->s_op->write_inode && !is_bad_inode(inode))
+		return inode->i_sb->s_op->write_inode(inode, sync);
+	return 0;
 }
-EXPORT_SYMBOL(sb_has_dirty_inodes);

 /*
  * Wait for writeback on an inode to complete.
@@ -322,11 +373,11 @@ writeback_single_inode(struct inode *inode, struct writeback_control *wbc)
 	if (inode->i_state & I_SYNC) {
 		/*
 		 * If this inode is locked for writeback and we are not doing
-		 * writeback-for-data-integrity, move it to s_more_io so that
+		 * writeback-for-data-integrity, move it to b_more_io so that
 		 * writeback can proceed with the other inodes on s_io.
 		 *
 		 * We'll have another go at writing back this inode when we
-		 * completed a full scan of s_io.
+		 * completed a full scan of b_io.
 		 */
 		if (!wait) {
 			requeue_io(inode);
@@ -371,11 +422,11 @@ writeback_single_inode(struct inode *inode, struct writeback_control *wbc)
 		/*
 		 * We didn't write back all the pages. nfs_writepages()
 		 * sometimes bales out without doing anything. Redirty
-		 * the inode; Move it from s_io onto s_more_io/s_dirty.
+		 * the inode; Move it from b_io onto b_more_io/b_dirty.
 		 */
 		/*
 		 * akpm: if the caller was the kupdate function we put
-		 * this inode at the head of s_dirty so it gets first
+		 * this inode at the head of b_dirty so it gets first
 		 * consideration. Otherwise, move it to the tail, for
 		 * the reasons described there. I'm not really sure
 		 * how much sense this makes. Presumably I had a good
@@ -385,7 +436,7 @@ writeback_single_inode(struct inode *inode, struct writeback_control *wbc)
 		if (wbc->for_kupdate) {
 			/*
 			 * For the kupdate function we move the inode
-			 * to s_more_io so it will get more writeout as
+			 * to b_more_io so it will get more writeout as
 			 * soon as the queue becomes uncongested.
 			 */
 			inode->i_state |= I_DIRTY_PAGES;
@@ -434,50 +485,84 @@ writeback_single_inode(struct inode *inode, struct writeback_control *wbc)
 }

 /*
- * Write out a superblock's list of dirty inodes. A wait will be performed
- * upon no inodes, all inodes or the final one, depending upon sync_mode.
- *
- * If older_than_this is non-NULL, then only write out inodes which
- * had their first dirtying at a time earlier than *older_than_this.
- *
- * If we're a pdflush thread, then implement pdflush collision avoidance
- * against the entire list.
+ * For WB_SYNC_NONE writeback, the caller does not have the sb pinned
+ * before calling writeback. So make sure that we do pin it, so it doesn't
+ * go away while we are writing inodes from it.
  *
- * If `bdi' is non-zero then we're being asked to writeback a specific queue.
- * This function assumes that the blockdev superblock's inodes are backed by
- * a variety of queues, so all inodes are searched. For other superblocks,
- * assume that all inodes are backed by the same queue.
- *
- * FIXME: this linear search could get expensive with many fileystems. But
- * how to fix? We need to go from an address_space to all inodes which share
- * a queue with that address_space. (Easy: have a global "dirty superblocks"
- * list).
- *
- * The inodes to be written are parked on sb->s_io. They are moved back onto
- * sb->s_dirty as they are selected for writing. This way, none can be missed
- * on the writer throttling path, and we get decent balancing between many
- * throttled threads: we don't want them all piling up on inode_sync_wait.
+ * Returns 0 if the super was successfully pinned (or pinning wasn't needed),
+ * 1 if we failed.
  */
-void generic_sync_sb_inodes(struct super_block *sb,
+static int pin_sb_for_writeback(struct writeback_control *wbc,
+				struct inode *inode)
+{
+	struct super_block *sb = inode->i_sb;
+
+	/*
+	 * Caller must already hold the ref for this
+	 */
+	if (wbc->sync_mode == WB_SYNC_ALL) {
+		WARN_ON(!rwsem_is_locked(&sb->s_umount));
+		return 0;
+	}
+
+	spin_lock(&sb_lock);
+	sb->s_count++;
+	if (down_read_trylock(&sb->s_umount)) {
+		if (sb->s_root) {
+			spin_unlock(&sb_lock);
+			return 0;
+		}
+		/*
+		 * umounted, drop rwsem again and fall through to failure
+		 */
+		up_read(&sb->s_umount);
+	}
+
+	sb->s_count--;
+	spin_unlock(&sb_lock);
+	return 1;
+}
+
+static void unpin_sb_for_writeback(struct writeback_control *wbc,
+				   struct inode *inode)
+{
+	struct super_block *sb = inode->i_sb;
+
+	if (wbc->sync_mode == WB_SYNC_ALL)
+		return;
+
+	up_read(&sb->s_umount);
+	put_super(sb);
+}
+
+static void writeback_inodes_wb(struct bdi_writeback *wb,
 				struct writeback_control *wbc)
 {
+	struct super_block *sb = wbc->sb;
+	const int is_blkdev_sb = sb_is_blkdev_sb(sb);
 	const unsigned long start = jiffies;	/* livelock avoidance */
-	int sync = wbc->sync_mode == WB_SYNC_ALL;

 	spin_lock(&inode_lock);
-	if (!wbc->for_kupdate || list_empty(&sb->s_io))
-		queue_io(sb, wbc->older_than_this);

-	while (!list_empty(&sb->s_io)) {
-		struct inode *inode = list_entry(sb->s_io.prev,
+	if (!wbc->for_kupdate || list_empty(&wb->b_io))
+		queue_io(wb, wbc->older_than_this);
+
+	while (!list_empty(&wb->b_io)) {
+		struct inode *inode = list_entry(wb->b_io.prev,
 						struct inode, i_list);
-		struct address_space *mapping = inode->i_mapping;
-		struct backing_dev_info *bdi = mapping->backing_dev_info;
 		long pages_skipped;

-		if (!bdi_cap_writeback_dirty(bdi)) {
+		/*
+		 * super block given and doesn't match, skip this inode
+		 */
+		if (sb && sb != inode->i_sb) {
+			redirty_tail(inode);
+			continue;
+		}
+
+		if (!bdi_cap_writeback_dirty(wb->bdi)) {
 			redirty_tail(inode);
-			if (sb_is_blkdev_sb(sb)) {
+			if (is_blkdev_sb) {
 				/*
 				 * Dirty memory-backed blockdev: the ramdisk
 				 * driver does this. Skip just this inode
@@ -497,21 +582,14 @@ void generic_sync_sb_inodes(struct super_block *sb,
 				continue;
 			}

-		if (wbc->nonblocking && bdi_write_congested(bdi)) {
+		if (wbc->nonblocking && bdi_write_congested(wb->bdi)) {
 			wbc->encountered_congestion = 1;
-			if (!sb_is_blkdev_sb(sb))
+			if (!is_blkdev_sb)
 				break;		/* Skip a congested fs */
 			requeue_io(inode);
 			continue;		/* Skip a congested blockdev */
 		}

-		if (wbc->bdi && bdi != wbc->bdi) {
-			if (!sb_is_blkdev_sb(sb))
-				break;		/* fs has the wrong queue */
-			requeue_io(inode);
-			continue;	/* blockdev has wrong queue */
-		}
-
 		/*
 		 * Was this inode dirtied after sync_sb_inodes was called?
 		 * This keeps sync from extra jobs and livelock.
@@ -519,16 +597,16 @@ void generic_sync_sb_inodes(struct super_block *sb,
 		if (inode_dirtied_after(inode, start))
 			break;

-		/* Is another pdflush already flushing this queue? */
-		if (current_is_pdflush() && !writeback_acquire(bdi))
-			break;
+		if (pin_sb_for_writeback(wbc, inode)) {
+			requeue_io(inode);
+			continue;
+		}

 		BUG_ON(inode->i_state & (I_FREEING | I_CLEAR));
 		__iget(inode);
 		pages_skipped = wbc->pages_skipped;
 		writeback_single_inode(inode, wbc);
-		if (current_is_pdflush())
-			writeback_release(bdi);
+		unpin_sb_for_writeback(wbc, inode);
 		if (wbc->pages_skipped != pages_skipped) {
 			/*
 			 * writeback is not making progress due to locked
@@ -544,144 +622,571 @@ void generic_sync_sb_inodes(struct super_block *sb,
 			wbc->more_io = 1;
 			break;
 		}
-		if (!list_empty(&sb->s_more_io))
+		if (!list_empty(&wb->b_more_io))
 			wbc->more_io = 1;
 	}

-	if (sync) {
-		struct inode *inode, *old_inode = NULL;
+	spin_unlock(&inode_lock);
+	/* Leave any unwritten inodes on b_io */
+}
+
+void writeback_inodes_wbc(struct writeback_control *wbc)
+{
+	struct backing_dev_info *bdi = wbc->bdi;

+	writeback_inodes_wb(&bdi->wb, wbc);
+}
+
+/*
+ * The maximum number of pages to writeout in a single bdi flush/kupdate
+ * operation. We do this so we don't hold I_SYNC against an inode for
+ * enormous amounts of time, which would block a userspace task which has
+ * been forced to throttle against that inode. Also, the code reevaluates
+ * the dirty state each time it has written this many pages.
+ */
+#define MAX_WRITEBACK_PAGES	1024
+
+static inline bool over_bground_thresh(void)
+{
+	unsigned long background_thresh, dirty_thresh;
+
+	get_dirty_limits(&background_thresh, &dirty_thresh, NULL, NULL);
+
+	return (global_page_state(NR_FILE_DIRTY) +
+		global_page_state(NR_UNSTABLE_NFS) >= background_thresh);
+}
+
+/*
+ * Explicit flushing or periodic writeback of "old" data.
+ *
+ * Define "old": the first time one of an inode's pages is dirtied, we mark the
+ * dirtying-time in the inode's address_space. So this periodic writeback code
+ * just walks the superblock inode list, writing back any inodes which are
+ * older than a specific point in time.
+ *
+ * Try to run once per dirty_writeback_interval. But if a writeback event
+ * takes longer than a dirty_writeback_interval interval, then leave a
+ * one-second gap.
+ *
+ * older_than_this takes precedence over nr_to_write. So we'll only write back
+ * all dirty pages if they are all attached to "old" mappings.
+ */
+static long wb_writeback(struct bdi_writeback *wb, long nr_pages,
+			 struct super_block *sb,
+			 enum writeback_sync_modes sync_mode, int for_kupdate)
+{
+	struct writeback_control wbc = {
+		.bdi			= wb->bdi,
+		.sb			= sb,
+		.sync_mode		= sync_mode,
+		.older_than_this	= NULL,
+		.for_kupdate		= for_kupdate,
+		.range_cyclic		= 1,
+	};
+	unsigned long oldest_jif;
+	long wrote = 0;
+
+	if (wbc.for_kupdate) {
+		wbc.older_than_this = &oldest_jif;
+		oldest_jif = jiffies -
+				msecs_to_jiffies(dirty_expire_interval * 10);
+	}
+
+	for (;;) {
 		/*
-		 * Data integrity sync. Must wait for all pages under writeback,
-		 * because there may have been pages dirtied before our sync
-		 * call, but which had writeout started before we write it out.
-		 * In which case, the inode may not be on the dirty list, but
-		 * we still have to wait for that writeout.
+		 * Don't flush anything for non-integrity writeback where
+		 * no nr_pages was given
 		 */
-		list_for_each_entry(inode, &sb->s_inodes, i_sb_list) {
-			struct address_space *mapping;
+		if (!for_kupdate && nr_pages <= 0 && sync_mode == WB_SYNC_NONE)
+			break;

-			if (inode->i_state &
-					(I_FREEING|I_CLEAR|I_WILL_FREE|I_NEW))
-				continue;
-			mapping = inode->i_mapping;
-			if (mapping->nrpages == 0)
+		/*
+		 * If no specific pages were given and this is just a
+		 * periodic background writeout and we are below the
+		 * background dirty threshold, don't do anything
+		 */
+		if (for_kupdate && nr_pages <= 0 && !over_bground_thresh())
+			break;
+
+		wbc.more_io = 0;
+		wbc.encountered_congestion = 0;
+		wbc.nr_to_write = MAX_WRITEBACK_PAGES;
+		wbc.pages_skipped = 0;
+		writeback_inodes_wb(wb, &wbc);
+		nr_pages -= MAX_WRITEBACK_PAGES - wbc.nr_to_write;
+		wrote += MAX_WRITEBACK_PAGES - wbc.nr_to_write;
+
+		/*
+		 * If we ran out of stuff to write, bail unless more_io got set
+		 */
+		if (wbc.nr_to_write > 0 || wbc.pages_skipped > 0) {
+			if (wbc.more_io && !wbc.for_kupdate)
 				continue;
-			__iget(inode);
-			spin_unlock(&inode_lock);
+			break;
+		}
+	}
+
+	return wrote;
+}
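
To make the chunking above concrete: with MAX_WRITEBACK_PAGES at 1024, a work
item asking for nr_pages = 2500 makes at most three passes through the loop,
since each pass subtracts (MAX_WRITEBACK_PAGES - wbc.nr_to_write) from
nr_pages:

	pass 1: writes 1024, nr_pages 2500 -> 1476
	pass 2: writes 1024, nr_pages 1476 ->  452
	pass 3: writes  452, wbc.nr_to_write > 0, loop ends

(a worked example assuming plenty of dirty data, no congestion and no skipped
pages).
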
+
+/*
+ * Return the next bdi_work struct that hasn't been processed by this
+ * wb thread yet
+ */
+static struct bdi_work *get_next_work_item(struct backing_dev_info *bdi,
+					   struct bdi_writeback *wb)
+{
+	struct bdi_work *work, *ret = NULL;
+
+	rcu_read_lock();
+
+	list_for_each_entry_rcu(work, &bdi->work_list, list) {
+		if (!test_and_clear_bit(wb->nr, &work->seen))
+			continue;
+
+		ret = work;
+		break;
+	}
+
+	rcu_read_unlock();
+	return ret;
+}
+
+static long wb_check_old_data_flush(struct bdi_writeback *wb)
+{
+	unsigned long expired;
+	long nr_pages;
+
+	expired = wb->last_old_flush +
+			msecs_to_jiffies(dirty_writeback_interval * 10);
+	if (time_before(jiffies, expired))
+		return 0;
+
+	wb->last_old_flush = jiffies;
+	nr_pages = global_page_state(NR_FILE_DIRTY) +
+			global_page_state(NR_UNSTABLE_NFS) +
+			(inodes_stat.nr_inodes - inodes_stat.nr_unused);
+
+	if (nr_pages)
+		return wb_writeback(wb, nr_pages, NULL, WB_SYNC_NONE, 1);
+
+	return 0;
+}
+
+/*
+ * Retrieve work items and do the writeback they describe
+ */
+long wb_do_writeback(struct bdi_writeback *wb, int force_wait)
+{
+	struct backing_dev_info *bdi = wb->bdi;
+	struct bdi_work *work;
+	long nr_pages, wrote = 0;
+
+	while ((work = get_next_work_item(bdi, wb)) != NULL) {
+		enum writeback_sync_modes sync_mode;
+
+		nr_pages = work->nr_pages;
+
+		/*
+		 * Override sync mode, in case we must wait for completion
+		 */
+		if (force_wait)
+			work->sync_mode = sync_mode = WB_SYNC_ALL;
+		else
+			sync_mode = work->sync_mode;
+
+		/*
+		 * If this isn't a data integrity operation, just notify
+		 * that we have seen this work and we are now starting it.
+		 */
+		if (sync_mode == WB_SYNC_NONE)
+			wb_clear_pending(wb, work);
+
+		wrote += wb_writeback(wb, nr_pages, work->sb, sync_mode, 0);
+
+		/*
+		 * This is a data integrity writeback, so only do the
+		 * notification when we have completed the work.
+		 */
+		if (sync_mode == WB_SYNC_ALL)
+			wb_clear_pending(wb, work);
+	}
+
+	/*
+	 * Check for periodic writeback, kupdated() style
+	 */
+	wrote += wb_check_old_data_flush(wb);
+
+	return wrote;
+}
+
+/*
+ * Handle writeback of dirty data for the device backed by this bdi. Also
+ * wakes up periodically and does kupdated style flushing.
+ */
+int bdi_writeback_task(struct bdi_writeback *wb)
+{
+	unsigned long last_active = jiffies;
+	unsigned long wait_jiffies = -1UL;
+	long pages_written;
+
+	while (!kthread_should_stop()) {
+		pages_written = wb_do_writeback(wb, 0);
+
+		if (pages_written)
+			last_active = jiffies;
+		else if (wait_jiffies != -1UL) {
+			unsigned long max_idle;
+
 			/*
-			 * We hold a reference to 'inode' so it couldn't have
-			 * been removed from s_inodes list while we dropped the
-			 * inode_lock. We cannot iput the inode now as we can
-			 * be holding the last reference and we cannot iput it
-			 * under inode_lock. So we keep the reference and iput
-			 * it later.
+			 * Longest period of inactivity that we tolerate. If we
+			 * see dirty data again later, the task will get
+			 * recreated automatically.
 			 */
-			iput(old_inode);
-			old_inode = inode;
+			max_idle = max(5UL * 60 * HZ, wait_jiffies);
+			if (time_after(jiffies, max_idle + last_active))
+				break;
+		}

-			filemap_fdatawait(mapping);
+		wait_jiffies = msecs_to_jiffies(dirty_writeback_interval * 10);
+		set_current_state(TASK_INTERRUPTIBLE);
+		schedule_timeout(wait_jiffies);
+		try_to_freeze();
+	}

-			cond_resched();
+	return 0;
+}
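
A sketch of how such a flusher thread might be brought up (hypothetical
wrapper, shown for illustration only; in this series the real kthread entry
point is expected to live in mm/backing-dev.c, not in this file):

	static int bdi_flusher_thread(void *data)	/* hypothetical name */
	{
		struct bdi_writeback *wb = data;

		/* runs until idle timeout or kthread_should_stop() */
		return bdi_writeback_task(wb);
	}

	/* started along the lines of:
	 *	kthread_run(bdi_flusher_thread, wb, "flush-%d", wb->nr);
	 */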
+
+/*
+ * Schedule writeback for all backing devices. Expensive! If this is a data
+ * integrity operation, writeback will be complete when this returns. If
+ * we are simply called for WB_SYNC_NONE, then writeback will merely be
+ * scheduled to run.
+ */
+static void bdi_writeback_all(struct writeback_control *wbc)
+{
+	const bool must_wait = wbc->sync_mode == WB_SYNC_ALL;
+	struct backing_dev_info *bdi;
+	struct bdi_work *work;
+	LIST_HEAD(list);
+
+restart:
+	spin_lock(&bdi_lock);
+
+	list_for_each_entry(bdi, &bdi_list, bdi_list) {
+		struct bdi_work *work;

-		spin_lock(&inode_lock);
+		if (!bdi_has_dirty_io(bdi))
+			continue;
+
+		/*
+		 * If work allocation fails, do the writes inline. We drop
+		 * the lock and restart the list writeout. This should be OK,
+		 * since this happens rarely and because the writeout should
+		 * eventually make more free memory available.
+		 */
+		work = bdi_alloc_work(wbc);
+		if (!work) {
+			struct writeback_control __wbc;
+
+			/*
+			 * Not a data integrity writeout, just continue
+			 */
+			if (!must_wait)
+				continue;
+
+			spin_unlock(&bdi_lock);
+			__wbc = *wbc;
+			__wbc.bdi = bdi;
+			writeback_inodes_wbc(&__wbc);
+			goto restart;
 		}
-		spin_unlock(&inode_lock);
-		iput(old_inode);
-	} else
-		spin_unlock(&inode_lock);
+		if (must_wait)
+			list_add_tail(&work->wait_list, &list);
+
+		bdi_queue_work(bdi, work);
+	}
+
+	spin_unlock(&bdi_lock);

-	return;		/* Leave any unwritten inodes on s_io */
+	/*
+	 * If this is for WB_SYNC_ALL, wait for pending work to complete
+	 * before returning.
+	 */
+	while (!list_empty(&list)) {
+		work = list_entry(list.next, struct bdi_work, wait_list);
+		list_del(&work->wait_list);
+		bdi_wait_on_work_clear(work);
+		call_rcu(&work->rcu_head, bdi_work_free);
+	}
 }
-EXPORT_SYMBOL_GPL(generic_sync_sb_inodes);

-static void sync_sb_inodes(struct super_block *sb,
-			   struct writeback_control *wbc)
+/*
+ * Start writeback of `nr_pages' pages. If `nr_pages' is zero, write back
+ * the whole world.
+ */
+void wakeup_flusher_threads(long nr_pages)
 {
-	generic_sync_sb_inodes(sb, wbc);
+	struct writeback_control wbc = {
+		.sync_mode	= WB_SYNC_NONE,
+		.older_than_this = NULL,
+		.range_cyclic	= 1,
+	};
+
+	if (nr_pages == 0)
+		nr_pages = global_page_state(NR_FILE_DIRTY) +
+				global_page_state(NR_UNSTABLE_NFS);
+	wbc.nr_to_write = nr_pages;
+	bdi_writeback_all(&wbc);
 }
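
Two representative calls, following the comment above (illustrative only):

	wakeup_flusher_threads(1024);	/* schedule ~1024 pages of writeback */
	wakeup_flusher_threads(0);	/* write back the whole world */

Both use WB_SYNC_NONE, so they only queue work for the flusher threads and
return without waiting for the IO to complete.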

-/*
- * Start writeback of dirty pagecache data against all unlocked inodes.
+static noinline void block_dump___mark_inode_dirty(struct inode *inode)
+{
+	if (inode->i_ino || strcmp(inode->i_sb->s_id, "bdev")) {
+		struct dentry *dentry;
+		const char *name = "?";
+
+		dentry = d_find_alias(inode);
+		if (dentry) {
+			spin_lock(&dentry->d_lock);
+			name = (const char *) dentry->d_name.name;
+		}
+		printk(KERN_DEBUG
+		       "%s(%d): dirtied inode %lu (%s) on %s\n",
+		       current->comm, task_pid_nr(current), inode->i_ino,
+		       name, inode->i_sb->s_id);
+		if (dentry) {
+			spin_unlock(&dentry->d_lock);
+			dput(dentry);
+		}
+	}
+}
+
+/**
+ * __mark_inode_dirty - internal function
+ * @inode: inode to mark
+ * @flags: what kind of dirty (i.e. I_DIRTY_SYNC)
+ * Mark an inode as dirty. Callers should use mark_inode_dirty or
+ * mark_inode_dirty_sync.
  *
- * Note:
- * We don't need to grab a reference to superblock here. If it has non-empty
- * ->s_dirty it's hadn't been killed yet and kill_super() won't proceed
- * past sync_inodes_sb() until the ->s_dirty/s_io/s_more_io lists are all
- * empty. Since __sync_single_inode() regains inode_lock before it finally moves
- * inode from superblock lists we are OK.
+ * Put the inode on the super block's dirty list.
+ *
+ * CAREFUL! We mark it dirty unconditionally, but move it onto the
+ * dirty list only if it is hashed or if it refers to a blockdev.
+ * If it was not hashed, it will never be added to the dirty list
+ * even if it is later hashed, as it will have been marked dirty already.
  *
- * If `older_than_this' is non-zero then only flush inodes which have a
- * flushtime older than *older_than_this.
+ * In short, make sure you hash any inodes _before_ you start marking
+ * them dirty.
  *
- * If `bdi' is non-zero then we will scan the first inode against each
- * superblock until we find the matching ones. One group will be the dirty
- * inodes against a filesystem. Then when we hit the dummy blockdev superblock,
- * sync_sb_inodes will seekout the blockdev which matches `bdi'. Maybe not
- * super-efficient but we're about to do a ton of I/O...
+ * This function *must* be atomic for the I_DIRTY_PAGES case -
+ * set_page_dirty() is called under spinlock in several places.
+ *
+ * Note that for blockdevs, inode->dirtied_when represents the dirtying time of
+ * the block-special inode (/dev/hda1) itself. And the ->dirtied_when field of
+ * the kernel-internal blockdev inode represents the dirtying time of the
+ * blockdev's pages. This is why for I_DIRTY_PAGES we always use
+ * page->mapping->host, so the page-dirtying time is recorded in the internal
+ * blockdev inode.
  */
-void
-writeback_inodes(struct writeback_control *wbc)
+void __mark_inode_dirty(struct inode *inode, int flags)
 {
-	struct super_block *sb;
+	struct super_block *sb = inode->i_sb;

-	might_sleep();
-	spin_lock(&sb_lock);
-restart:
-	list_for_each_entry_reverse(sb, &super_blocks, s_list) {
-		if (sb_has_dirty_inodes(sb)) {
-			/* we're making our own get_super here */
-			sb->s_count++;
-			spin_unlock(&sb_lock);
-			/*
-			 * If we can't get the readlock, there's no sense in
-			 * waiting around, most of the time the FS is going to
-			 * be unmounted by the time it is released.
-			 */
-			if (down_read_trylock(&sb->s_umount)) {
-				if (sb->s_root)
-					sync_sb_inodes(sb, wbc);
-				up_read(&sb->s_umount);
+	/*
+	 * Don't do this for I_DIRTY_PAGES - that doesn't actually
+	 * dirty the inode itself
+	 */
+	if (flags & (I_DIRTY_SYNC | I_DIRTY_DATASYNC)) {
+		if (sb->s_op->dirty_inode)
+			sb->s_op->dirty_inode(inode);
+	}
+
+	/*
+	 * make sure that changes are seen by all cpus before we test i_state
+	 * -- mikulas
+	 */
+	smp_mb();
+
+	/* avoid the locking if we can */
+	if ((inode->i_state & flags) == flags)
+		return;
+
+	if (unlikely(block_dump))
+		block_dump___mark_inode_dirty(inode);
+
+	spin_lock(&inode_lock);
+	if ((inode->i_state & flags) != flags) {
+		const int was_dirty = inode->i_state & I_DIRTY;
+
+		inode->i_state |= flags;
+
+		/*
+		 * If the inode is being synced, just update its dirty state.
+		 * The unlocker will place the inode on the appropriate
+		 * superblock list, based upon its state.
+		 */
+		if (inode->i_state & I_SYNC)
+			goto out;
+
+		/*
+		 * Only add valid (hashed) inodes to the superblock's
+		 * dirty list. Add blockdev inodes as well.
+		 */
+		if (!S_ISBLK(inode->i_mode)) {
+			if (hlist_unhashed(&inode->i_hash))
+				goto out;
+		}
+		if (inode->i_state & (I_FREEING|I_CLEAR))
+			goto out;
+
+		/*
+		 * If the inode was already on b_dirty/b_io/b_more_io, don't
+		 * reposition it (that would break b_dirty time-ordering).
+		 */
+		if (!was_dirty) {
+			struct bdi_writeback *wb = &inode_to_bdi(inode)->wb;
+			struct backing_dev_info *bdi = wb->bdi;
+
+			if (bdi_cap_writeback_dirty(bdi) &&
+			    !test_bit(BDI_registered, &bdi->state)) {
+				WARN_ON(1);
+				printk(KERN_ERR "bdi-%s not registered\n",
+								bdi->name);
 			}
-			spin_lock(&sb_lock);
-			if (__put_super_and_need_restart(sb))
-				goto restart;
+
+			inode->dirtied_when = jiffies;
+			list_move(&inode->i_list, &wb->b_dirty);
 		}
-		if (wbc->nr_to_write <= 0)
-			break;
 	}
-	spin_unlock(&sb_lock);
+out:
+	spin_unlock(&inode_lock);
 }
+EXPORT_SYMBOL(__mark_inode_dirty);

 /*
- * writeback and wait upon the filesystem's dirty inodes. The caller will
- * do this in two passes - one to write, and one to wait.
+ * Write out a superblock's list of dirty inodes. A wait will be performed
+ * upon no inodes, all inodes or the final one, depending upon sync_mode.
+ *
+ * If older_than_this is non-NULL, then only write out inodes which
+ * had their first dirtying at a time earlier than *older_than_this.
  *
- * A finite limit is set on the number of pages which will be written.
- * To prevent infinite livelock of sys_sync().
+ * If we're a pdflush thread, then implement pdflush collision avoidance
+ * against the entire list.
  *
- * We add in the number of potentially dirty inodes, because each inode write
- * can dirty pagecache in the underlying blockdev.
+ * If `bdi' is non-zero then we're being asked to writeback a specific queue.
+ * This function assumes that the blockdev superblock's inodes are backed by
+ * a variety of queues, so all inodes are searched. For other superblocks,
+ * assume that all inodes are backed by the same queue.
+ *
+ * The inodes to be written are parked on bdi->b_io. They are moved back onto
+ * bdi->b_dirty as they are selected for writing. This way, none can be missed
+ * on the writer throttling path, and we get decent balancing between many
+ * throttled threads: we don't want them all piling up on inode_sync_wait.
  */
-void sync_inodes_sb(struct super_block *sb, int wait)
+static void wait_sb_inodes(struct writeback_control *wbc)
+{
+	struct inode *inode, *old_inode = NULL;
+
+	/*
+	 * We need to be protected against the filesystem going from
+	 * r/o to r/w or vice versa.
+	 */
+	WARN_ON(!rwsem_is_locked(&wbc->sb->s_umount));
+
+	spin_lock(&inode_lock);
+
+	/*
+	 * Data integrity sync. Must wait for all pages under writeback,
+	 * because there may have been pages dirtied before our sync
+	 * call, but which had writeout started before we write it out.
+	 * In which case, the inode may not be on the dirty list, but
+	 * we still have to wait for that writeout.
+	 */
+	list_for_each_entry(inode, &wbc->sb->s_inodes, i_sb_list) {
+		struct address_space *mapping;
+
+		if (inode->i_state & (I_FREEING|I_CLEAR|I_WILL_FREE|I_NEW))
+			continue;
+		mapping = inode->i_mapping;
+		if (mapping->nrpages == 0)
+			continue;
+		__iget(inode);
+		spin_unlock(&inode_lock);
+		/*
+		 * We hold a reference to 'inode' so it couldn't have
+		 * been removed from s_inodes list while we dropped the
+		 * inode_lock. We cannot iput the inode now as we can
+		 * be holding the last reference and we cannot iput it
+		 * under inode_lock. So we keep the reference and iput
+		 * it later.
+		 */
+		iput(old_inode);
+		old_inode = inode;
+
+		filemap_fdatawait(mapping);
+
+		cond_resched();
+
+		spin_lock(&inode_lock);
+	}
+	spin_unlock(&inode_lock);
+	iput(old_inode);
+}
+
+/**
+ * writeback_inodes_sb - writeback dirty inodes from given super_block
+ * @sb: the superblock
+ *
+ * Start writeback on some inodes on this super_block. No guarantees are made
+ * on how many (if any) will be written, and this function does not wait
+ * for IO completion of submitted IO. The number of pages submitted is
+ * returned.
+ */
+long writeback_inodes_sb(struct super_block *sb)
 {
 	struct writeback_control wbc = {
-		.sync_mode	= wait ? WB_SYNC_ALL : WB_SYNC_NONE,
+		.sb		= sb,
+		.sync_mode	= WB_SYNC_NONE,
 		.range_start	= 0,
 		.range_end	= LLONG_MAX,
 	};
+	unsigned long nr_dirty = global_page_state(NR_FILE_DIRTY);
+	unsigned long nr_unstable = global_page_state(NR_UNSTABLE_NFS);
+	long nr_to_write;

-	if (!wait) {
-		unsigned long nr_dirty = global_page_state(NR_FILE_DIRTY);
-		unsigned long nr_unstable = global_page_state(NR_UNSTABLE_NFS);
-
-		wbc.nr_to_write = nr_dirty + nr_unstable +
+	nr_to_write = nr_dirty + nr_unstable +
 			(inodes_stat.nr_inodes - inodes_stat.nr_unused);
-	} else
-		wbc.nr_to_write = LONG_MAX; /* doesn't actually matter */

-	sync_sb_inodes(sb, &wbc);
+	wbc.nr_to_write = nr_to_write;
+	bdi_writeback_all(&wbc);
+	return nr_to_write - wbc.nr_to_write;
+}
+EXPORT_SYMBOL(writeback_inodes_sb);
+
+/**
+ * sync_inodes_sb - sync sb inode pages
+ * @sb: the superblock
+ *
+ * This function writes and waits on any dirty inode belonging to this
+ * super_block. The number of pages synced is returned.
+ */
+long sync_inodes_sb(struct super_block *sb)
+{
+	struct writeback_control wbc = {
+		.sb		= sb,
+		.sync_mode	= WB_SYNC_ALL,
+		.range_start	= 0,
+		.range_end	= LLONG_MAX,
+	};
+	long nr_to_write = LONG_MAX; /* doesn't actually matter */
+
+	wbc.nr_to_write = nr_to_write;
+	bdi_writeback_all(&wbc);
+	wait_sb_inodes(&wbc);
+	return nr_to_write - wbc.nr_to_write;
 }
+EXPORT_SYMBOL(sync_inodes_sb);
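
A usage sketch for the two exports above (illustrative; a sync(2)-style
caller would typically do a non-waiting pass followed by a data-integrity
pass):

	writeback_inodes_sb(sb);	/* WB_SYNC_NONE: start IO, don't wait */
	sync_inodes_sb(sb);		/* WB_SYNC_ALL: write and wait */

Note that sync_inodes_sb() expects the caller to hold sb->s_umount; see the
WARN_ON in wait_sb_inodes().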

 /**
  * write_inode_now - write an inode to disk