Merge branch 'master' into for-linus-merged

This merge pulls the XFS master branch into the latest Linus master.
This results in a merge conflict whose best fix is not obvious.
I manually fixed the conflict in "fs/xfs/xfs_iget.c".

Dave Chinner had done work that resulted in RCU freeing of inodes
separate from what Nick Piggin had done, and their results differed
slightly in xfs_inode_free().  The fix updates Nick's call_rcu()
with the use of VFS_I(), while incorporating needed updates to some
XFS inode fields implemented in Dave's series.  Dave's RCU callback
function has also been removed.

Signed-off-by: Alex Elder <aelder@sgi.com>
Alex Elder, 14 years ago · commit 92f1c008ae
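
For reference, a minimal sketch of what the resolved xfs_inode_free() in fs/xfs/xfs_iget.c might look like after this merge is shown below. It is an illustration of the resolution described above, not the verbatim merge result: the callback name xfs_inode_free_callback and the xfs_inode_zone slab cache are assumptions that do not appear in this diff, while VFS_I(), XFS_IRECLAIM, ip->i_flags_lock and the "freed inodes have ip->i_ino == 0" convention come from the commit message and the xfs_sync.c hunks further down.

/* Hypothetical sketch only -- callback name and inode zone are assumed. */
STATIC void
xfs_inode_free_callback(
	struct rcu_head		*head)
{
	struct inode		*inode = container_of(head, struct inode, i_rcu);

	kmem_zone_free(xfs_inode_zone, XFS_I(inode));
}

void
xfs_inode_free(
	struct xfs_inode	*ip)
{
	/* ... tear down forks, quota and other per-inode state as before ... */

	/*
	 * An RCU-freed inode must look freed to the lockless lookups added
	 * in this series: clear the inode number and mark it reclaimed under
	 * i_flags_lock.  The !ip->i_ino checks in the xfs_sync.c changes
	 * below rely on exactly this.
	 */
	spin_lock(&ip->i_flags_lock);
	ip->i_flags = XFS_IRECLAIM;
	ip->i_ino = 0;
	spin_unlock(&ip->i_flags_lock);

	/* Nick's call_rcu(), driven through VFS_I() as the message describes */
	call_rcu(&VFS_I(ip)->i_rcu, xfs_inode_free_callback);
}

Routing call_rcu() through VFS_I() reuses the rcu_head embedded in the VFS inode, which is presumably why Dave's separate RCU callback could be dropped.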

+ 0 - 59
fs/xfs/linux-2.6/sv.h

@@ -1,59 +0,0 @@
-/*
- * Copyright (c) 2000-2002,2005 Silicon Graphics, Inc.
- * All Rights Reserved.
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License as
- * published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it would be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write the Free Software Foundation,
- * Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
- */
-#ifndef __XFS_SUPPORT_SV_H__
-#define __XFS_SUPPORT_SV_H__
-
-#include <linux/wait.h>
-#include <linux/sched.h>
-#include <linux/spinlock.h>
-
-/*
- * Synchronisation variables.
- *
- * (Parameters "pri", "svf" and "rts" are not implemented)
- */
-
-typedef struct sv_s {
-	wait_queue_head_t waiters;
-} sv_t;
-
-static inline void _sv_wait(sv_t *sv, spinlock_t *lock)
-{
-	DECLARE_WAITQUEUE(wait, current);
-
-	add_wait_queue_exclusive(&sv->waiters, &wait);
-	__set_current_state(TASK_UNINTERRUPTIBLE);
-	spin_unlock(lock);
-
-	schedule();
-
-	remove_wait_queue(&sv->waiters, &wait);
-}
-
-#define sv_init(sv,flag,name) \
-	init_waitqueue_head(&(sv)->waiters)
-#define sv_destroy(sv) \
-	/*NOTHING*/
-#define sv_wait(sv, pri, lock, s) \
-	_sv_wait(sv, lock)
-#define sv_signal(sv) \
-	wake_up(&(sv)->waiters)
-#define sv_broadcast(sv) \
-	wake_up_all(&(sv)->waiters)
-
-#endif /* __XFS_SUPPORT_SV_H__ */

+ 173 - 252
fs/xfs/linux-2.6/xfs_aops.c

@@ -38,15 +38,6 @@
 #include <linux/pagevec.h>
 #include <linux/writeback.h>
 
-/*
- * Types of I/O for bmap clustering and I/O completion tracking.
- */
-enum {
-	IO_READ,	/* mapping for a read */
-	IO_DELAY,	/* mapping covers delalloc region */
-	IO_UNWRITTEN,	/* mapping covers allocated but uninitialized data */
-	IO_NEW		/* just allocated */
-};
 
 /*
  * Prime number of hash buckets since address is used as the key.
@@ -182,9 +173,6 @@ xfs_setfilesize(
 	xfs_inode_t		*ip = XFS_I(ioend->io_inode);
 	xfs_fsize_t		isize;
 
-	ASSERT((ip->i_d.di_mode & S_IFMT) == S_IFREG);
-	ASSERT(ioend->io_type != IO_READ);
-
 	if (unlikely(ioend->io_error))
 		return 0;
 
@@ -244,10 +232,8 @@ xfs_end_io(
 	 * We might have to update the on-disk file size after extending
 	 * writes.
 	 */
-	if (ioend->io_type != IO_READ) {
-		error = xfs_setfilesize(ioend);
-		ASSERT(!error || error == EAGAIN);
-	}
+	error = xfs_setfilesize(ioend);
+	ASSERT(!error || error == EAGAIN);
 
 	/*
 	 * If we didn't complete processing of the ioend, requeue it to the
@@ -318,14 +304,63 @@ STATIC int
 xfs_map_blocks(
 	struct inode		*inode,
 	loff_t			offset,
-	ssize_t			count,
 	struct xfs_bmbt_irec	*imap,
-	int			flags)
+	int			type,
+	int			nonblocking)
 {
-	int			nmaps = 1;
-	int			new = 0;
+	struct xfs_inode	*ip = XFS_I(inode);
+	struct xfs_mount	*mp = ip->i_mount;
+	ssize_t			count = 1 << inode->i_blkbits;
+	xfs_fileoff_t		offset_fsb, end_fsb;
+	int			error = 0;
+	int			bmapi_flags = XFS_BMAPI_ENTIRE;
+	int			nimaps = 1;
+
+	if (XFS_FORCED_SHUTDOWN(mp))
+		return -XFS_ERROR(EIO);
+
+	if (type == IO_UNWRITTEN)
+		bmapi_flags |= XFS_BMAPI_IGSTATE;
+
+	if (!xfs_ilock_nowait(ip, XFS_ILOCK_SHARED)) {
+		if (nonblocking)
+			return -XFS_ERROR(EAGAIN);
+		xfs_ilock(ip, XFS_ILOCK_SHARED);
+	}
 
-	return -xfs_iomap(XFS_I(inode), offset, count, flags, imap, &nmaps, &new);
+	ASSERT(ip->i_d.di_format != XFS_DINODE_FMT_BTREE ||
+	       (ip->i_df.if_flags & XFS_IFEXTENTS));
+	ASSERT(offset <= mp->m_maxioffset);
+
+	if (offset + count > mp->m_maxioffset)
+		count = mp->m_maxioffset - offset;
+	end_fsb = XFS_B_TO_FSB(mp, (xfs_ufsize_t)offset + count);
+	offset_fsb = XFS_B_TO_FSBT(mp, offset);
+	error = xfs_bmapi(NULL, ip, offset_fsb, end_fsb - offset_fsb,
+			  bmapi_flags,  NULL, 0, imap, &nimaps, NULL);
+	xfs_iunlock(ip, XFS_ILOCK_SHARED);
+
+	if (error)
+		return -XFS_ERROR(error);
+
+	if (type == IO_DELALLOC &&
+	    (!nimaps || isnullstartblock(imap->br_startblock))) {
+		error = xfs_iomap_write_allocate(ip, offset, count, imap);
+		if (!error)
+			trace_xfs_map_blocks_alloc(ip, offset, count, type, imap);
+		return -XFS_ERROR(error);
+	}
+
+#ifdef DEBUG
+	if (type == IO_UNWRITTEN) {
+		ASSERT(nimaps);
+		ASSERT(imap->br_startblock != HOLESTARTBLOCK);
+		ASSERT(imap->br_startblock != DELAYSTARTBLOCK);
+	}
+#endif
+	if (nimaps)
+		trace_xfs_map_blocks_found(ip, offset, count, type, imap);
+	return 0;
 }
 
 STATIC int
@@ -380,26 +415,18 @@ xfs_submit_ioend_bio(
 
 	submit_bio(wbc->sync_mode == WB_SYNC_ALL ?
 		   WRITE_SYNC_PLUG : WRITE, bio);
-	ASSERT(!bio_flagged(bio, BIO_EOPNOTSUPP));
-	bio_put(bio);
 }
 
 STATIC struct bio *
 xfs_alloc_ioend_bio(
 	struct buffer_head	*bh)
 {
-	struct bio		*bio;
 	int			nvecs = bio_get_nr_vecs(bh->b_bdev);
-
-	do {
-		bio = bio_alloc(GFP_NOIO, nvecs);
-		nvecs >>= 1;
-	} while (!bio);
+	struct bio		*bio = bio_alloc(GFP_NOIO, nvecs);
 
 	ASSERT(bio->bi_private == NULL);
 	bio->bi_sector = bh->b_blocknr * (bh->b_size >> 9);
 	bio->bi_bdev = bh->b_bdev;
-	bio_get(bio);
 	return bio;
 }
 
@@ -470,9 +497,8 @@ xfs_submit_ioend(
 	/* Pass 1 - start writeback */
 	do {
 		next = ioend->io_list;
-		for (bh = ioend->io_buffer_head; bh; bh = bh->b_private) {
+		for (bh = ioend->io_buffer_head; bh; bh = bh->b_private)
 			xfs_start_buffer_writeback(bh);
-		}
 	} while ((ioend = next) != NULL);
 
 	/* Pass 2 - submit I/O */
@@ -600,116 +626,12 @@ xfs_map_at_offset(
 	ASSERT(imap->br_startblock != HOLESTARTBLOCK);
 	ASSERT(imap->br_startblock != DELAYSTARTBLOCK);
 
-	lock_buffer(bh);
 	xfs_map_buffer(inode, bh, imap, offset);
-	bh->b_bdev = xfs_find_bdev_for_inode(inode);
 	set_buffer_mapped(bh);
 	clear_buffer_delay(bh);
 	clear_buffer_unwritten(bh);
 }
 
-/*
- * Look for a page at index that is suitable for clustering.
- */
-STATIC unsigned int
-xfs_probe_page(
-	struct page		*page,
-	unsigned int		pg_offset)
-{
-	struct buffer_head	*bh, *head;
-	int			ret = 0;
-
-	if (PageWriteback(page))
-		return 0;
-	if (!PageDirty(page))
-		return 0;
-	if (!page->mapping)
-		return 0;
-	if (!page_has_buffers(page))
-		return 0;
-
-	bh = head = page_buffers(page);
-	do {
-		if (!buffer_uptodate(bh))
-			break;
-		if (!buffer_mapped(bh))
-			break;
-		ret += bh->b_size;
-		if (ret >= pg_offset)
-			break;
-	} while ((bh = bh->b_this_page) != head);
-
-	return ret;
-}
-
-STATIC size_t
-xfs_probe_cluster(
-	struct inode		*inode,
-	struct page		*startpage,
-	struct buffer_head	*bh,
-	struct buffer_head	*head)
-{
-	struct pagevec		pvec;
-	pgoff_t			tindex, tlast, tloff;
-	size_t			total = 0;
-	int			done = 0, i;
-
-	/* First sum forwards in this page */
-	do {
-		if (!buffer_uptodate(bh) || !buffer_mapped(bh))
-			return total;
-		total += bh->b_size;
-	} while ((bh = bh->b_this_page) != head);
-
-	/* if we reached the end of the page, sum forwards in following pages */
-	tlast = i_size_read(inode) >> PAGE_CACHE_SHIFT;
-	tindex = startpage->index + 1;
-
-	/* Prune this back to avoid pathological behavior */
-	tloff = min(tlast, startpage->index + 64);
-
-	pagevec_init(&pvec, 0);
-	while (!done && tindex <= tloff) {
-		unsigned len = min_t(pgoff_t, PAGEVEC_SIZE, tlast - tindex + 1);
-
-		if (!pagevec_lookup(&pvec, inode->i_mapping, tindex, len))
-			break;
-
-		for (i = 0; i < pagevec_count(&pvec); i++) {
-			struct page *page = pvec.pages[i];
-			size_t pg_offset, pg_len = 0;
-
-			if (tindex == tlast) {
-				pg_offset =
-				    i_size_read(inode) & (PAGE_CACHE_SIZE - 1);
-				if (!pg_offset) {
-					done = 1;
-					break;
-				}
-			} else
-				pg_offset = PAGE_CACHE_SIZE;
-
-			if (page->index == tindex && trylock_page(page)) {
-				pg_len = xfs_probe_page(page, pg_offset);
-				unlock_page(page);
-			}
-
-			if (!pg_len) {
-				done = 1;
-				break;
-			}
-
-			total += pg_len;
-			tindex++;
-		}
-
-		pagevec_release(&pvec);
-		cond_resched();
-	}
-
-	return total;
-}
-
 /*
  * Test if a given page is suitable for writing as part of an unwritten
  * or delayed allocate extent.
@@ -731,9 +653,9 @@ xfs_is_delayed_page(
 			if (buffer_unwritten(bh))
 				acceptable = (type == IO_UNWRITTEN);
 			else if (buffer_delay(bh))
-				acceptable = (type == IO_DELAY);
+				acceptable = (type == IO_DELALLOC);
 			else if (buffer_dirty(bh) && buffer_mapped(bh))
-				acceptable = (type == IO_NEW);
+				acceptable = (type == IO_OVERWRITE);
 			else
 				break;
 		} while ((bh = bh->b_this_page) != head);
@@ -758,8 +680,7 @@ xfs_convert_page(
 	loff_t			tindex,
 	struct xfs_bmbt_irec	*imap,
 	xfs_ioend_t		**ioendp,
-	struct writeback_control *wbc,
-	int			all_bh)
+	struct writeback_control *wbc)
 {
 	struct buffer_head	*bh, *head;
 	xfs_off_t		end_offset;
@@ -814,37 +735,30 @@ xfs_convert_page(
 			continue;
 		}
 
-		if (buffer_unwritten(bh) || buffer_delay(bh)) {
+		if (buffer_unwritten(bh) || buffer_delay(bh) ||
+		    buffer_mapped(bh)) {
 			if (buffer_unwritten(bh))
 				type = IO_UNWRITTEN;
+			else if (buffer_delay(bh))
+				type = IO_DELALLOC;
 			else
-				type = IO_DELAY;
+				type = IO_OVERWRITE;
 
 			if (!xfs_imap_valid(inode, imap, offset)) {
 				done = 1;
 				continue;
 			}
 
-			ASSERT(imap->br_startblock != HOLESTARTBLOCK);
-			ASSERT(imap->br_startblock != DELAYSTARTBLOCK);
-
-			xfs_map_at_offset(inode, bh, imap, offset);
+			lock_buffer(bh);
+			if (type != IO_OVERWRITE)
+				xfs_map_at_offset(inode, bh, imap, offset);
 			xfs_add_to_ioend(inode, bh, offset, type,
 					 ioendp, done);
 
 			page_dirty--;
 			count++;
 		} else {
-			type = IO_NEW;
-			if (buffer_mapped(bh) && all_bh) {
-				lock_buffer(bh);
-				xfs_add_to_ioend(inode, bh, offset,
-						type, ioendp, done);
-				count++;
-				page_dirty--;
-			} else {
-				done = 1;
-			}
+			done = 1;
 		}
 	} while (offset += len, (bh = bh->b_this_page) != head);
 
@@ -876,7 +790,6 @@ xfs_cluster_write(
 	struct xfs_bmbt_irec	*imap,
 	xfs_ioend_t		**ioendp,
 	struct writeback_control *wbc,
-	int			all_bh,
 	pgoff_t			tlast)
 {
 	struct pagevec		pvec;
@@ -891,7 +804,7 @@ xfs_cluster_write(
 
 		for (i = 0; i < pagevec_count(&pvec); i++) {
 			done = xfs_convert_page(inode, pvec.pages[i], tindex++,
-					imap, ioendp, wbc, all_bh);
+					imap, ioendp, wbc);
 			if (done)
 				break;
 		}
@@ -935,7 +848,7 @@ xfs_aops_discard_page(
 	struct buffer_head	*bh, *head;
 	loff_t			offset = page_offset(page);
 
-	if (!xfs_is_delayed_page(page, IO_DELAY))
+	if (!xfs_is_delayed_page(page, IO_DELALLOC))
 		goto out_invalidate;
 
 	if (XFS_FORCED_SHUTDOWN(ip->i_mount))
@@ -1002,10 +915,10 @@ xfs_vm_writepage(
 	unsigned int		type;
 	__uint64_t              end_offset;
 	pgoff_t                 end_index, last_index;
-	ssize_t			size, len;
-	int			flags, err, imap_valid = 0, uptodate = 1;
+	ssize_t			len;
+	int			err, imap_valid = 0, uptodate = 1;
 	int			count = 0;
-	int			all_bh = 0;
+	int			nonblocking = 0;
 
 	trace_xfs_writepage(inode, page, 0);
 
@@ -1056,10 +969,14 @@ xfs_vm_writepage(
 
 	bh = head = page_buffers(page);
 	offset = page_offset(page);
-	flags = BMAPI_READ;
-	type = IO_NEW;
+	type = IO_OVERWRITE;
+
+	if (wbc->sync_mode == WB_SYNC_NONE && wbc->nonblocking)
+		nonblocking = 1;
 
 	do {
+		int new_ioend = 0;
+
 		if (offset >= end_offset)
 			break;
 		if (!buffer_uptodate(bh))
@@ -1076,90 +993,54 @@ xfs_vm_writepage(
 			continue;
 		}
 
-		if (imap_valid)
-			imap_valid = xfs_imap_valid(inode, &imap, offset);
-
-		if (buffer_unwritten(bh) || buffer_delay(bh)) {
-			int new_ioend = 0;
-
-			/*
-			 * Make sure we don't use a read-only iomap
-			 */
-			if (flags == BMAPI_READ)
-				imap_valid = 0;
-
-			if (buffer_unwritten(bh)) {
+		if (buffer_unwritten(bh)) {
+			if (type != IO_UNWRITTEN) {
 				type = IO_UNWRITTEN;
-				flags = BMAPI_WRITE | BMAPI_IGNSTATE;
-			} else if (buffer_delay(bh)) {
-				type = IO_DELAY;
-				flags = BMAPI_ALLOCATE;
-
-				if (wbc->sync_mode == WB_SYNC_NONE)
-					flags |= BMAPI_TRYLOCK;
-			}
-
-			if (!imap_valid) {
-				/*
-				 * If we didn't have a valid mapping then we
-				 * need to ensure that we put the new mapping
-				 * in a new ioend structure. This needs to be
-				 * done to ensure that the ioends correctly
-				 * reflect the block mappings at io completion
-				 * for unwritten extent conversion.
-				 */
-				new_ioend = 1;
-				err = xfs_map_blocks(inode, offset, len,
-						&imap, flags);
-				if (err)
-					goto error;
-				imap_valid = xfs_imap_valid(inode, &imap,
-							    offset);
+				imap_valid = 0;
 			}
-			if (imap_valid) {
-				xfs_map_at_offset(inode, bh, &imap, offset);
-				xfs_add_to_ioend(inode, bh, offset, type,
-						 &ioend, new_ioend);
-				count++;
+		} else if (buffer_delay(bh)) {
+			if (type != IO_DELALLOC) {
+				type = IO_DELALLOC;
+				imap_valid = 0;
 			}
 		} else if (buffer_uptodate(bh)) {
-			/*
-			 * we got here because the buffer is already mapped.
-			 * That means it must already have extents allocated
-			 * underneath it. Map the extent by reading it.
-			 */
-			if (!imap_valid || flags != BMAPI_READ) {
-				flags = BMAPI_READ;
-				size = xfs_probe_cluster(inode, page, bh, head);
-				err = xfs_map_blocks(inode, offset, size,
-						&imap, flags);
-				if (err)
-					goto error;
-				imap_valid = xfs_imap_valid(inode, &imap,
-							    offset);
+			if (type != IO_OVERWRITE) {
+				type = IO_OVERWRITE;
+				imap_valid = 0;
 			}
+		} else {
+			if (PageUptodate(page)) {
+				ASSERT(buffer_mapped(bh));
+				imap_valid = 0;
+			}
+			continue;
+		}
 
+		if (imap_valid)
+			imap_valid = xfs_imap_valid(inode, &imap, offset);
+		if (!imap_valid) {
 			/*
-			 * We set the type to IO_NEW in case we are doing a
-			 * small write at EOF that is extending the file but
-			 * without needing an allocation. We need to update the
-			 * file size on I/O completion in this case so it is
-			 * the same case as having just allocated a new extent
-			 * that we are writing into for the first time.
+			 * If we didn't have a valid mapping then we need to
+			 * put the new mapping into a separate ioend structure.
+			 * This ensures non-contiguous extents always have
+			 * separate ioends, which is particularly important
+			 * for unwritten extent conversion at I/O completion
+			 * time.
 			 */
-			type = IO_NEW;
-			if (trylock_buffer(bh)) {
-				if (imap_valid)
-					all_bh = 1;
-				xfs_add_to_ioend(inode, bh, offset, type,
-						&ioend, !imap_valid);
-				count++;
-			} else {
-				imap_valid = 0;
-			}
-		} else if (PageUptodate(page)) {
-			ASSERT(buffer_mapped(bh));
-			imap_valid = 0;
+			new_ioend = 1;
+			err = xfs_map_blocks(inode, offset, &imap, type,
+					     nonblocking);
+			if (err)
+				goto error;
+			imap_valid = xfs_imap_valid(inode, &imap, offset);
+		}
+		if (imap_valid) {
+			lock_buffer(bh);
+			if (type != IO_OVERWRITE)
+				xfs_map_at_offset(inode, bh, &imap, offset);
+			xfs_add_to_ioend(inode, bh, offset, type, &ioend,
+					 new_ioend);
+			count++;
 		}
 
 		if (!iohead)
@@ -1188,7 +1069,7 @@ xfs_vm_writepage(
 			end_index = last_index;
 
 		xfs_cluster_write(inode, page->index + 1, &imap, &ioend,
-					wbc, all_bh, end_index);
+				  wbc, end_index);
 	}
 
 	if (iohead)
@@ -1257,13 +1138,19 @@ __xfs_get_blocks(
 	int			create,
 	int			direct)
 {
-	int			flags = create ? BMAPI_WRITE : BMAPI_READ;
+	struct xfs_inode	*ip = XFS_I(inode);
+	struct xfs_mount	*mp = ip->i_mount;
+	xfs_fileoff_t		offset_fsb, end_fsb;
+	int			error = 0;
+	int			lockmode = 0;
 	struct xfs_bmbt_irec	imap;
+	int			nimaps = 1;
 	xfs_off_t		offset;
 	ssize_t			size;
-	int			nimap = 1;
 	int			new = 0;
-	int			error;
+
+	if (XFS_FORCED_SHUTDOWN(mp))
+		return -XFS_ERROR(EIO);
 
 	offset = (xfs_off_t)iblock << inode->i_blkbits;
 	ASSERT(bh_result->b_size >= (1 << inode->i_blkbits));
@@ -1272,15 +1159,45 @@ __xfs_get_blocks(
 	if (!create && direct && offset >= i_size_read(inode))
 		return 0;
 
-	if (direct && create)
-		flags |= BMAPI_DIRECT;
+	if (create) {
+		lockmode = XFS_ILOCK_EXCL;
+		xfs_ilock(ip, lockmode);
+	} else {
+		lockmode = xfs_ilock_map_shared(ip);
+	}
+
+	ASSERT(offset <= mp->m_maxioffset);
+	if (offset + size > mp->m_maxioffset)
+		size = mp->m_maxioffset - offset;
+	end_fsb = XFS_B_TO_FSB(mp, (xfs_ufsize_t)offset + size);
+	offset_fsb = XFS_B_TO_FSBT(mp, offset);
 
-	error = xfs_iomap(XFS_I(inode), offset, size, flags, &imap, &nimap,
-			  &new);
+	error = xfs_bmapi(NULL, ip, offset_fsb, end_fsb - offset_fsb,
+			  XFS_BMAPI_ENTIRE,  NULL, 0, &imap, &nimaps, NULL);
 	if (error)
-		return -error;
-	if (nimap == 0)
-		return 0;
+		goto out_unlock;
+
+	if (create &&
+	    (!nimaps ||
+	     (imap.br_startblock == HOLESTARTBLOCK ||
+	      imap.br_startblock == DELAYSTARTBLOCK))) {
+		if (direct) {
+			error = xfs_iomap_write_direct(ip, offset, size,
+						       &imap, nimaps);
+		} else {
+			error = xfs_iomap_write_delay(ip, offset, size, &imap);
+		}
+		if (error)
+			goto out_unlock;
+
+		trace_xfs_get_blocks_alloc(ip, offset, size, 0, &imap);
+	} else if (nimaps) {
+		trace_xfs_get_blocks_found(ip, offset, size, 0, &imap);
+	} else {
+		trace_xfs_get_blocks_notfound(ip, offset, size);
+		goto out_unlock;
+	}
+	xfs_iunlock(ip, lockmode);
 
 	if (imap.br_startblock != HOLESTARTBLOCK &&
 	    imap.br_startblock != DELAYSTARTBLOCK) {
@@ -1347,6 +1264,10 @@ __xfs_get_blocks(
 	}
 
 	return 0;
+
+out_unlock:
+	xfs_iunlock(ip, lockmode);
+	return -error;
 }
 
 int
@@ -1434,7 +1355,7 @@ xfs_vm_direct_IO(
 	ssize_t			ret;
 
 	if (rw & WRITE) {
-		iocb->private = xfs_alloc_ioend(inode, IO_NEW);
+		iocb->private = xfs_alloc_ioend(inode, IO_DIRECT);
 
 		ret = __blockdev_direct_IO(rw, iocb, inode, bdev, iov,
 					    offset, nr_segs,

+ 16 - 0
fs/xfs/linux-2.6/xfs_aops.h

@@ -22,6 +22,22 @@ extern struct workqueue_struct *xfsdatad_workqueue;
 extern struct workqueue_struct *xfsconvertd_workqueue;
 extern mempool_t *xfs_ioend_pool;
 
+/*
+ * Types of I/O for bmap clustering and I/O completion tracking.
+ */
+enum {
+	IO_DIRECT = 0,	/* special case for direct I/O ioends */
+	IO_DELALLOC,	/* mapping covers delalloc region */
+	IO_UNWRITTEN,	/* mapping covers allocated but uninitialized data */
+	IO_OVERWRITE,	/* mapping covers already allocated extent */
+};
+
+#define XFS_IO_TYPES \
+	{ 0,			"" }, \
+	{ IO_DELALLOC,		"delalloc" }, \
+	{ IO_UNWRITTEN,		"unwritten" }, \
+	{ IO_OVERWRITE,		"overwrite" }
+
 /*
  * xfs_ioend struct manages large extent writes for XFS.
  * It can manage several multi-page bio's at once.

+ 158 - 77
fs/xfs/linux-2.6/xfs_buf.c

@@ -44,12 +44,7 @@
 
 static kmem_zone_t *xfs_buf_zone;
 STATIC int xfsbufd(void *);
-STATIC int xfsbufd_wakeup(struct shrinker *, int, gfp_t);
 STATIC void xfs_buf_delwri_queue(xfs_buf_t *, int);
-static struct shrinker xfs_buf_shake = {
-	.shrink = xfsbufd_wakeup,
-	.seeks = DEFAULT_SEEKS,
-};
 
 static struct workqueue_struct *xfslogd_workqueue;
 struct workqueue_struct *xfsdatad_workqueue;
@@ -168,8 +163,79 @@ test_page_region(
 }
 
 /*
- *	Internal xfs_buf_t object manipulation
+ * xfs_buf_lru_add - add a buffer to the LRU.
+ *
+ * The LRU takes a new reference to the buffer so that it will only be freed
+ * once the shrinker takes the buffer off the LRU.
  */
+STATIC void
+xfs_buf_lru_add(
+	struct xfs_buf	*bp)
+{
+	struct xfs_buftarg *btp = bp->b_target;
+
+	spin_lock(&btp->bt_lru_lock);
+	if (list_empty(&bp->b_lru)) {
+		atomic_inc(&bp->b_hold);
+		list_add_tail(&bp->b_lru, &btp->bt_lru);
+		btp->bt_lru_nr++;
+	}
+	spin_unlock(&btp->bt_lru_lock);
+}
+
+/*
+ * xfs_buf_lru_del - remove a buffer from the LRU
+ *
+ * The unlocked check is safe here because it only occurs when there are no
+ * b_lru_ref counts left on the buffer under the pag->pag_buf_lock. It is there
+ * to optimise the shrinker removing the buffer from the LRU and calling
+ * xfs_buf_free(), i.e. it removes an unnecessary round trip on the
+ * bt_lru_lock.
+ */
+STATIC void
+xfs_buf_lru_del(
+	struct xfs_buf	*bp)
+{
+	struct xfs_buftarg *btp = bp->b_target;
+
+	if (list_empty(&bp->b_lru))
+		return;
+
+	spin_lock(&btp->bt_lru_lock);
+	if (!list_empty(&bp->b_lru)) {
+		list_del_init(&bp->b_lru);
+		btp->bt_lru_nr--;
+	}
+	spin_unlock(&btp->bt_lru_lock);
+}
+
+/*
+ * When we mark a buffer stale, we remove the buffer from the LRU and clear the
+ * b_lru_ref count so that the buffer is freed immediately when the buffer
+ * reference count falls to zero. If the buffer is already on the LRU, we need
+ * to remove the reference that LRU holds on the buffer.
+ *
+ * This prevents build-up of stale buffers on the LRU.
+ */
+void
+xfs_buf_stale(
+	struct xfs_buf	*bp)
+{
+	bp->b_flags |= XBF_STALE;
+	atomic_set(&(bp)->b_lru_ref, 0);
+	if (!list_empty(&bp->b_lru)) {
+		struct xfs_buftarg *btp = bp->b_target;
+
+		spin_lock(&btp->bt_lru_lock);
+		if (!list_empty(&bp->b_lru)) {
+			list_del_init(&bp->b_lru);
+			btp->bt_lru_nr--;
+			atomic_dec(&bp->b_hold);
+		}
+		spin_unlock(&btp->bt_lru_lock);
+	}
+	ASSERT(atomic_read(&bp->b_hold) >= 1);
+}
 
 STATIC void
 _xfs_buf_initialize(
@@ -186,7 +252,9 @@ _xfs_buf_initialize(
 
 	memset(bp, 0, sizeof(xfs_buf_t));
 	atomic_set(&bp->b_hold, 1);
+	atomic_set(&bp->b_lru_ref, 1);
 	init_completion(&bp->b_iowait);
+	INIT_LIST_HEAD(&bp->b_lru);
 	INIT_LIST_HEAD(&bp->b_list);
 	RB_CLEAR_NODE(&bp->b_rbnode);
 	sema_init(&bp->b_sema, 0); /* held, no waiters */
@@ -262,6 +330,8 @@ xfs_buf_free(
 {
 	trace_xfs_buf_free(bp, _RET_IP_);
 
+	ASSERT(list_empty(&bp->b_lru));
+
 	if (bp->b_flags & (_XBF_PAGE_CACHE|_XBF_PAGES)) {
 		uint		i;
 
@@ -337,7 +407,6 @@ _xfs_buf_lookup_pages(
 					__func__, gfp_mask);
 
 			XFS_STATS_INC(xb_page_retries);
-			xfsbufd_wakeup(NULL, 0, gfp_mask);
 			congestion_wait(BLK_RW_ASYNC, HZ/50);
 			goto retry;
 		}
@@ -828,6 +897,7 @@ xfs_buf_rele(
 
 	if (!pag) {
 		ASSERT(!bp->b_relse);
+		ASSERT(list_empty(&bp->b_lru));
 		ASSERT(RB_EMPTY_NODE(&bp->b_rbnode));
 		if (atomic_dec_and_test(&bp->b_hold))
 			xfs_buf_free(bp);
@@ -835,13 +905,19 @@ xfs_buf_rele(
 	}
 
 	ASSERT(!RB_EMPTY_NODE(&bp->b_rbnode));
+
 	ASSERT(atomic_read(&bp->b_hold) > 0);
 	if (atomic_dec_and_lock(&bp->b_hold, &pag->pag_buf_lock)) {
 		if (bp->b_relse) {
 			atomic_inc(&bp->b_hold);
 			spin_unlock(&pag->pag_buf_lock);
 			bp->b_relse(bp);
+		} else if (!(bp->b_flags & XBF_STALE) &&
+			   atomic_read(&bp->b_lru_ref)) {
+			xfs_buf_lru_add(bp);
+			spin_unlock(&pag->pag_buf_lock);
 		} else {
+			xfs_buf_lru_del(bp);
 			ASSERT(!(bp->b_flags & (XBF_DELWRI|_XBF_DELWRI_Q)));
 			rb_erase(&bp->b_rbnode, &pag->pag_buf_tree);
 			spin_unlock(&pag->pag_buf_lock);
@@ -1438,51 +1514,84 @@ xfs_buf_iomove(
  */
 
 /*
- *	Wait for any bufs with callbacks that have been submitted but
- *	have not yet returned... walk the hash list for the target.
+ * Wait for any bufs with callbacks that have been submitted but have not yet
+ * returned. These buffers will have an elevated hold count, so wait on those
+ * while freeing all the buffers only held by the LRU.
  */
 void
 xfs_wait_buftarg(
 	struct xfs_buftarg	*btp)
 {
-	struct xfs_perag	*pag;
-	uint			i;
+	struct xfs_buf		*bp;
 
-	for (i = 0; i < btp->bt_mount->m_sb.sb_agcount; i++) {
-		pag = xfs_perag_get(btp->bt_mount, i);
-		spin_lock(&pag->pag_buf_lock);
-		while (rb_first(&pag->pag_buf_tree)) {
-			spin_unlock(&pag->pag_buf_lock);
+restart:
+	spin_lock(&btp->bt_lru_lock);
+	while (!list_empty(&btp->bt_lru)) {
+		bp = list_first_entry(&btp->bt_lru, struct xfs_buf, b_lru);
+		if (atomic_read(&bp->b_hold) > 1) {
+			spin_unlock(&btp->bt_lru_lock);
 			delay(100);
-			spin_lock(&pag->pag_buf_lock);
+			goto restart;
 		}
-		spin_unlock(&pag->pag_buf_lock);
-		xfs_perag_put(pag);
+		/*
+		 * clear the LRU reference count so the buffer doesn't get
+		 * ignored in xfs_buf_rele().
+		 */
+		atomic_set(&bp->b_lru_ref, 0);
+		spin_unlock(&btp->bt_lru_lock);
+		xfs_buf_rele(bp);
+		spin_lock(&btp->bt_lru_lock);
 	}
+	spin_unlock(&btp->bt_lru_lock);
 }
 
-/*
- *	buftarg list for delwrite queue processing
- */
-static LIST_HEAD(xfs_buftarg_list);
-static DEFINE_SPINLOCK(xfs_buftarg_lock);
-
-STATIC void
-xfs_register_buftarg(
-	xfs_buftarg_t           *btp)
+int
+xfs_buftarg_shrink(
+	struct shrinker		*shrink,
+	int			nr_to_scan,
+	gfp_t			mask)
 {
-	spin_lock(&xfs_buftarg_lock);
-	list_add(&btp->bt_list, &xfs_buftarg_list);
-	spin_unlock(&xfs_buftarg_lock);
-}
+	struct xfs_buftarg	*btp = container_of(shrink,
+					struct xfs_buftarg, bt_shrinker);
+	struct xfs_buf		*bp;
+	LIST_HEAD(dispose);
 
-STATIC void
-xfs_unregister_buftarg(
-	xfs_buftarg_t           *btp)
-{
-	spin_lock(&xfs_buftarg_lock);
-	list_del(&btp->bt_list);
-	spin_unlock(&xfs_buftarg_lock);
+	if (!nr_to_scan)
+		return btp->bt_lru_nr;
+
+	spin_lock(&btp->bt_lru_lock);
+	while (!list_empty(&btp->bt_lru)) {
+		if (nr_to_scan-- <= 0)
+			break;
+
+		bp = list_first_entry(&btp->bt_lru, struct xfs_buf, b_lru);
+
+		/*
+		 * Decrement the b_lru_ref count unless the value is already
+		 * zero. If the value is already zero, we need to reclaim the
+		 * buffer, otherwise it gets another trip through the LRU.
+		 */
+		if (!atomic_add_unless(&bp->b_lru_ref, -1, 0)) {
+			list_move_tail(&bp->b_lru, &btp->bt_lru);
+			continue;
+		}
+
+		/*
+		 * remove the buffer from the LRU now to avoid needing another
+		 * lock round trip inside xfs_buf_rele().
+		 */
+		list_move(&bp->b_lru, &dispose);
+		btp->bt_lru_nr--;
+	}
+	spin_unlock(&btp->bt_lru_lock);
+
+	while (!list_empty(&dispose)) {
+		bp = list_first_entry(&dispose, struct xfs_buf, b_lru);
+		list_del_init(&bp->b_lru);
+		xfs_buf_rele(bp);
+	}
+
+	return btp->bt_lru_nr;
 }
 
 void
@@ -1490,17 +1599,14 @@ xfs_free_buftarg(
 	struct xfs_mount	*mp,
 	struct xfs_buftarg	*btp)
 {
+	unregister_shrinker(&btp->bt_shrinker);
+
 	xfs_flush_buftarg(btp, 1);
 	if (mp->m_flags & XFS_MOUNT_BARRIER)
 		xfs_blkdev_issue_flush(btp);
 	iput(btp->bt_mapping->host);
 
-	/* Unregister the buftarg first so that we don't get a
-	 * wakeup finding a non-existent task
-	 */
-	xfs_unregister_buftarg(btp);
 	kthread_stop(btp->bt_task);
-
 	kmem_free(btp);
 }
 
@@ -1597,20 +1703,13 @@ xfs_alloc_delwrite_queue(
 	xfs_buftarg_t		*btp,
 	const char		*fsname)
 {
-	int	error = 0;
-
-	INIT_LIST_HEAD(&btp->bt_list);
 	INIT_LIST_HEAD(&btp->bt_delwrite_queue);
 	spin_lock_init(&btp->bt_delwrite_lock);
 	btp->bt_flags = 0;
 	btp->bt_task = kthread_run(xfsbufd, btp, "xfsbufd/%s", fsname);
-	if (IS_ERR(btp->bt_task)) {
-		error = PTR_ERR(btp->bt_task);
-		goto out_error;
-	}
-	xfs_register_buftarg(btp);
-out_error:
-	return error;
+	if (IS_ERR(btp->bt_task))
+		return PTR_ERR(btp->bt_task);
+	return 0;
 }
 
 xfs_buftarg_t *
@@ -1627,12 +1726,17 @@ xfs_alloc_buftarg(
 	btp->bt_mount = mp;
 	btp->bt_dev =  bdev->bd_dev;
 	btp->bt_bdev = bdev;
+	INIT_LIST_HEAD(&btp->bt_lru);
+	spin_lock_init(&btp->bt_lru_lock);
 	if (xfs_setsize_buftarg_early(btp, bdev))
 		goto error;
 	if (xfs_mapping_buftarg(btp, bdev))
 		goto error;
 	if (xfs_alloc_delwrite_queue(btp, fsname))
 		goto error;
+	btp->bt_shrinker.shrink = xfs_buftarg_shrink;
+	btp->bt_shrinker.seeks = DEFAULT_SEEKS;
+	register_shrinker(&btp->bt_shrinker);
 	return btp;
 
 error:
@@ -1737,27 +1841,6 @@ xfs_buf_runall_queues(
 	flush_workqueue(queue);
 }
 
-STATIC int
-xfsbufd_wakeup(
-	struct shrinker		*shrink,
-	int			priority,
-	gfp_t			mask)
-{
-	xfs_buftarg_t		*btp;
-
-	spin_lock(&xfs_buftarg_lock);
-	list_for_each_entry(btp, &xfs_buftarg_list, bt_list) {
-		if (test_bit(XBT_FORCE_SLEEP, &btp->bt_flags))
-			continue;
-		if (list_empty(&btp->bt_delwrite_queue))
-			continue;
-		set_bit(XBT_FORCE_FLUSH, &btp->bt_flags);
-		wake_up_process(btp->bt_task);
-	}
-	spin_unlock(&xfs_buftarg_lock);
-	return 0;
-}
-
 /*
  * Move as many buffers as specified to the supplied list
  * indicating if we skipped any buffers to prevent deadlocks.
@@ -1952,7 +2035,6 @@ xfs_buf_init(void)
 	if (!xfsconvertd_workqueue)
 		goto out_destroy_xfsdatad_workqueue;
 
-	register_shrinker(&xfs_buf_shake);
 	return 0;
 
  out_destroy_xfsdatad_workqueue:
@@ -1968,7 +2050,6 @@ xfs_buf_init(void)
 void
 xfs_buf_terminate(void)
 {
-	unregister_shrinker(&xfs_buf_shake);
 	destroy_workqueue(xfsconvertd_workqueue);
 	destroy_workqueue(xfsdatad_workqueue);
 	destroy_workqueue(xfslogd_workqueue);

+ 18 - 4
fs/xfs/linux-2.6/xfs_buf.h

@@ -128,10 +128,15 @@ typedef struct xfs_buftarg {
 
 	/* per device delwri queue */
 	struct task_struct	*bt_task;
-	struct list_head	bt_list;
 	struct list_head	bt_delwrite_queue;
 	spinlock_t		bt_delwrite_lock;
 	unsigned long		bt_flags;
+
+	/* LRU control structures */
+	struct shrinker		bt_shrinker;
+	struct list_head	bt_lru;
+	spinlock_t		bt_lru_lock;
+	unsigned int		bt_lru_nr;
 } xfs_buftarg_t;
 
 /*
@@ -164,9 +169,11 @@ typedef struct xfs_buf {
 	xfs_off_t		b_file_offset;	/* offset in file */
 	size_t			b_buffer_length;/* size of buffer in bytes */
 	atomic_t		b_hold;		/* reference count */
+	atomic_t		b_lru_ref;	/* lru reclaim ref count */
 	xfs_buf_flags_t		b_flags;	/* status flags */
 	struct semaphore	b_sema;		/* semaphore for lockables */
 
+	struct list_head	b_lru;		/* lru list */
 	wait_queue_head_t	b_waiters;	/* unpin waiters */
 	struct list_head	b_list;
 	struct xfs_perag	*b_pag;		/* contains rbtree root */
@@ -264,7 +271,8 @@ extern void xfs_buf_terminate(void);
 #define XFS_BUF_ZEROFLAGS(bp)	((bp)->b_flags &= \
 		~(XBF_READ|XBF_WRITE|XBF_ASYNC|XBF_DELWRI|XBF_ORDERED))
 
-#define XFS_BUF_STALE(bp)	((bp)->b_flags |= XBF_STALE)
+void xfs_buf_stale(struct xfs_buf *bp);
+#define XFS_BUF_STALE(bp)	xfs_buf_stale(bp);
 #define XFS_BUF_UNSTALE(bp)	((bp)->b_flags &= ~XBF_STALE)
 #define XFS_BUF_ISSTALE(bp)	((bp)->b_flags & XBF_STALE)
 #define XFS_BUF_SUPER_STALE(bp)	do {				\
@@ -328,9 +336,15 @@ extern void xfs_buf_terminate(void);
 #define XFS_BUF_SIZE(bp)		((bp)->b_buffer_length)
 #define XFS_BUF_SET_SIZE(bp, cnt)	((bp)->b_buffer_length = (cnt))
 
-#define XFS_BUF_SET_VTYPE_REF(bp, type, ref)	do { } while (0)
+static inline void
+xfs_buf_set_ref(
+	struct xfs_buf	*bp,
+	int		lru_ref)
+{
+	atomic_set(&bp->b_lru_ref, lru_ref);
+}
+#define XFS_BUF_SET_VTYPE_REF(bp, type, ref)	xfs_buf_set_ref(bp, ref)
 #define XFS_BUF_SET_VTYPE(bp, type)		do { } while (0)
-#define XFS_BUF_SET_REF(bp, ref)		do { } while (0)
 
 #define XFS_BUF_ISPINNED(bp)	atomic_read(&((bp)->b_pin_count))
 

+ 10 - 2
fs/xfs/linux-2.6/xfs_export.c

@@ -70,8 +70,16 @@ xfs_fs_encode_fh(
 	else
 		fileid_type = FILEID_INO32_GEN_PARENT;
 
-	/* filesystem may contain 64bit inode numbers */
-	if (!(XFS_M(inode->i_sb)->m_flags & XFS_MOUNT_SMALL_INUMS))
+	/*
+	 * If the filesystem may contain 64bit inode numbers, we need
+	 * to use larger file handles that can represent them.
+	 *
+	 * While we only allocate inodes that do not fit into 32 bits any
+	 * large enough filesystem may contain them, thus the slightly
+	 * confusing looking conditional below.
+	 */
+	if (!(XFS_M(inode->i_sb)->m_flags & XFS_MOUNT_SMALL_INUMS) ||
+	    (XFS_M(inode->i_sb)->m_flags & XFS_MOUNT_32BITINODES))
 		fileid_type |= XFS_FILEID_TYPE_64FLAG;
 
 	/*

+ 0 - 1
fs/xfs/linux-2.6/xfs_linux.h

@@ -37,7 +37,6 @@
 
 #include <kmem.h>
 #include <mrlock.h>
-#include <sv.h>
 #include <time.h>
 
 #include <support/debug.h>

+ 18 - 4
fs/xfs/linux-2.6/xfs_super.c

@@ -834,8 +834,11 @@ xfsaild_wakeup(
 	struct xfs_ail		*ailp,
 	xfs_lsn_t		threshold_lsn)
 {
-	ailp->xa_target = threshold_lsn;
-	wake_up_process(ailp->xa_task);
+	/* only ever move the target forwards */
+	if (XFS_LSN_CMP(threshold_lsn, ailp->xa_target) > 0) {
+		ailp->xa_target = threshold_lsn;
+		wake_up_process(ailp->xa_task);
+	}
 }
 
 STATIC int
@@ -847,8 +850,17 @@ xfsaild(
 	long		tout = 0; /* milliseconds */
 
 	while (!kthread_should_stop()) {
-		schedule_timeout_interruptible(tout ?
-				msecs_to_jiffies(tout) : MAX_SCHEDULE_TIMEOUT);
+		/*
+		 * for short sleeps indicating congestion, don't allow us to
+		 * get woken early. Otherwise all we do is bang on the AIL lock
+		 * without making progress.
+		 */
+		if (tout && tout <= 20)
+			__set_current_state(TASK_KILLABLE);
+		else
+			__set_current_state(TASK_INTERRUPTIBLE);
+		schedule_timeout(tout ?
+				 msecs_to_jiffies(tout) : MAX_SCHEDULE_TIMEOUT);
 
 		/* swsusp */
 		try_to_freeze();
@@ -1118,6 +1130,8 @@ xfs_fs_evict_inode(
 	 */
 	ASSERT(!rwsem_is_locked(&ip->i_iolock.mr_lock));
 	mrlock_init(&ip->i_iolock, MRLOCK_BARRIER, "xfsio", ip->i_ino);
+	lockdep_set_class_and_name(&ip->i_iolock.mr_lock,
+			&xfs_iolock_reclaimable, "xfs_iolock_reclaimable");
 
 	xfs_inactive(ip);
 }

+ 70 - 22
fs/xfs/linux-2.6/xfs_sync.c

@@ -53,14 +53,30 @@ xfs_inode_ag_walk_grab(
 {
 	struct inode		*inode = VFS_I(ip);
 
+	ASSERT(rcu_read_lock_held());
+
+	/*
+	 * check for stale RCU freed inode
+	 *
+	 * If the inode has been reallocated, it doesn't matter if it's not in
+	 * the AG we are walking - we are walking for writeback, so if it
+	 * passes all the "valid inode" checks and is dirty, then we'll write
+	 * it back anyway.  If it has been reallocated and is still being
+	 * initialised, the XFS_INEW check below will catch it.
+	 */
+	spin_lock(&ip->i_flags_lock);
+	if (!ip->i_ino)
+		goto out_unlock_noent;
+
+	/* avoid new or reclaimable inodes. Leave for reclaim code to flush */
+	if (__xfs_iflags_test(ip, XFS_INEW | XFS_IRECLAIMABLE | XFS_IRECLAIM))
+		goto out_unlock_noent;
+	spin_unlock(&ip->i_flags_lock);
+
 	/* nothing to sync during shutdown */
 	if (XFS_FORCED_SHUTDOWN(ip->i_mount))
 		return EFSCORRUPTED;
 
-	/* avoid new or reclaimable inodes. Leave for reclaim code to flush */
-	if (xfs_iflags_test(ip, XFS_INEW | XFS_IRECLAIMABLE | XFS_IRECLAIM))
-		return ENOENT;
-
 	/* If we can't grab the inode, it must be on its way to reclaim. */
 	if (!igrab(inode))
 		return ENOENT;
@@ -72,6 +88,10 @@ xfs_inode_ag_walk_grab(
 
 	/* inode is valid */
 	return 0;
+
+out_unlock_noent:
+	spin_unlock(&ip->i_flags_lock);
+	return ENOENT;
 }
 
 STATIC int
@@ -98,12 +118,12 @@ restart:
 		int		error = 0;
 		int		i;
 
-		read_lock(&pag->pag_ici_lock);
+		rcu_read_lock();
 		nr_found = radix_tree_gang_lookup(&pag->pag_ici_root,
 					(void **)batch, first_index,
 					XFS_LOOKUP_BATCH);
 		if (!nr_found) {
-			read_unlock(&pag->pag_ici_lock);
+			rcu_read_unlock();
 			break;
 		}
 
@@ -118,18 +138,26 @@ restart:
 				batch[i] = NULL;
 
 			/*
-			 * Update the index for the next lookup. Catch overflows
-			 * into the next AG range which can occur if we have inodes
-			 * in the last block of the AG and we are currently
-			 * pointing to the last inode.
+			 * Update the index for the next lookup. Catch
+			 * overflows into the next AG range which can occur if
+			 * we have inodes in the last block of the AG and we
+			 * are currently pointing to the last inode.
+			 *
+			 * Because we may see inodes that are from the wrong AG
+			 * due to RCU freeing and reallocation, only update the
+			 * index if it lies in this AG. It was a race that led
+			 * us to see this inode, so another lookup from the
+			 * same index will not find it again.
 			 */
+			if (XFS_INO_TO_AGNO(mp, ip->i_ino) != pag->pag_agno)
+				continue;
 			first_index = XFS_INO_TO_AGINO(mp, ip->i_ino + 1);
 			if (first_index < XFS_INO_TO_AGINO(mp, ip->i_ino))
 				done = 1;
 		}
 
 		/* unlock now we've grabbed the inodes. */
-		read_unlock(&pag->pag_ici_lock);
+		rcu_read_unlock();
 
 		for (i = 0; i < nr_found; i++) {
 			if (!batch[i])
@@ -592,12 +620,12 @@ xfs_inode_set_reclaim_tag(
 	struct xfs_perag *pag;
 
 	pag = xfs_perag_get(mp, XFS_INO_TO_AGNO(mp, ip->i_ino));
-	write_lock(&pag->pag_ici_lock);
+	spin_lock(&pag->pag_ici_lock);
 	spin_lock(&ip->i_flags_lock);
 	__xfs_inode_set_reclaim_tag(pag, ip);
 	__xfs_iflags_set(ip, XFS_IRECLAIMABLE);
 	spin_unlock(&ip->i_flags_lock);
-	write_unlock(&pag->pag_ici_lock);
+	spin_unlock(&pag->pag_ici_lock);
 	xfs_perag_put(pag);
 }
 
@@ -639,9 +667,14 @@ xfs_reclaim_inode_grab(
 	struct xfs_inode	*ip,
 	int			flags)
 {
+	ASSERT(rcu_read_lock_held());
+
+	/* quick check for stale RCU freed inode */
+	if (!ip->i_ino)
+		return 1;
 
 	/*
-	 * do some unlocked checks first to avoid unnecceary lock traffic.
+	 * do some unlocked checks first to avoid unnecessary lock traffic.
 	 * The first is a flush lock check, the second is an already-in-reclaim
 	 * check. Only do these checks if we are not going to block on locks.
 	 */
@@ -654,11 +687,16 @@ xfs_reclaim_inode_grab(
 	 * The radix tree lock here protects a thread in xfs_iget from racing
 	 * with us starting reclaim on the inode.  Once we have the
 	 * XFS_IRECLAIM flag set it will not touch us.
+	 *
+	 * Due to RCU lookup, we may find inodes that have been freed and only
+	 * have XFS_IRECLAIM set.  Indeed, we may see reallocated inodes that
+	 * aren't candidates for reclaim at all, so we must check that
+	 * XFS_IRECLAIMABLE is set first before proceeding to reclaim.
 	 */
 	spin_lock(&ip->i_flags_lock);
-	ASSERT_ALWAYS(__xfs_iflags_test(ip, XFS_IRECLAIMABLE));
-	if (__xfs_iflags_test(ip, XFS_IRECLAIM)) {
-		/* ignore as it is already under reclaim */
+	if (!__xfs_iflags_test(ip, XFS_IRECLAIMABLE) ||
+	    __xfs_iflags_test(ip, XFS_IRECLAIM)) {
+		/* not a reclaim candidate. */
 		spin_unlock(&ip->i_flags_lock);
 		return 1;
 	}
@@ -795,12 +833,12 @@ reclaim:
 	 * added to the tree assert that it's been there before to catch
 	 * problems with the inode life time early on.
 	 */
-	write_lock(&pag->pag_ici_lock);
+	spin_lock(&pag->pag_ici_lock);
 	if (!radix_tree_delete(&pag->pag_ici_root,
 				XFS_INO_TO_AGINO(ip->i_mount, ip->i_ino)))
 		ASSERT(0);
 	__xfs_inode_clear_reclaim(pag, ip);
-	write_unlock(&pag->pag_ici_lock);
+	spin_unlock(&pag->pag_ici_lock);
 
 	/*
 	 * Here we do an (almost) spurious inode lock in order to coordinate
@@ -864,14 +902,14 @@ restart:
 			struct xfs_inode *batch[XFS_LOOKUP_BATCH];
 			int	i;
 
-			write_lock(&pag->pag_ici_lock);
+			rcu_read_lock();
 			nr_found = radix_tree_gang_lookup_tag(
 					&pag->pag_ici_root,
 					(void **)batch, first_index,
 					XFS_LOOKUP_BATCH,
 					XFS_ICI_RECLAIM_TAG);
 			if (!nr_found) {
-				write_unlock(&pag->pag_ici_lock);
+				rcu_read_unlock();
 				break;
 			}
 
@@ -891,14 +929,24 @@ restart:
 				 * occur if we have inodes in the last block of
 				 * the AG and we are currently pointing to the
 				 * last inode.
+				 *
+				 * Because we may see inodes that are from the
+				 * wrong AG due to RCU freeing and
+				 * reallocation, only update the index if it
+				 * lies in this AG. It was a race that led us
+				 * to see this inode, so another lookup from
+				 * the same index will not find it again.
 				 */
+				if (XFS_INO_TO_AGNO(mp, ip->i_ino) !=
+								pag->pag_agno)
+					continue;
 				first_index = XFS_INO_TO_AGINO(mp, ip->i_ino + 1);
 				if (first_index < XFS_INO_TO_AGINO(mp, ip->i_ino))
 					done = 1;
 			}
 
 			/* unlock now we've grabbed the inodes. */
-			write_unlock(&pag->pag_ici_lock);
+			rcu_read_unlock();
 
 			for (i = 0; i < nr_found; i++) {
 				if (!batch[i])

+ 33 - 26
fs/xfs/linux-2.6/xfs_trace.h

@@ -766,8 +766,8 @@ DECLARE_EVENT_CLASS(xfs_loggrant_class,
 		__field(int, curr_res)
 		__field(int, unit_res)
 		__field(unsigned int, flags)
-		__field(void *, reserve_headq)
-		__field(void *, write_headq)
+		__field(int, reserveq)
+		__field(int, writeq)
 		__field(int, grant_reserve_cycle)
 		__field(int, grant_reserve_bytes)
 		__field(int, grant_write_cycle)
@@ -784,19 +784,21 @@ DECLARE_EVENT_CLASS(xfs_loggrant_class,
 		__entry->curr_res = tic->t_curr_res;
 		__entry->unit_res = tic->t_unit_res;
 		__entry->flags = tic->t_flags;
-		__entry->reserve_headq = log->l_reserve_headq;
-		__entry->write_headq = log->l_write_headq;
-		__entry->grant_reserve_cycle = log->l_grant_reserve_cycle;
-		__entry->grant_reserve_bytes = log->l_grant_reserve_bytes;
-		__entry->grant_write_cycle = log->l_grant_write_cycle;
-		__entry->grant_write_bytes = log->l_grant_write_bytes;
+		__entry->reserveq = list_empty(&log->l_reserveq);
+		__entry->writeq = list_empty(&log->l_writeq);
+		xlog_crack_grant_head(&log->l_grant_reserve_head,
+				&__entry->grant_reserve_cycle,
+				&__entry->grant_reserve_bytes);
+		xlog_crack_grant_head(&log->l_grant_write_head,
+				&__entry->grant_write_cycle,
+				&__entry->grant_write_bytes);
 		__entry->curr_cycle = log->l_curr_cycle;
 		__entry->curr_block = log->l_curr_block;
-		__entry->tail_lsn = log->l_tail_lsn;
+		__entry->tail_lsn = atomic64_read(&log->l_tail_lsn);
 	),
 	TP_printk("dev %d:%d type %s t_ocnt %u t_cnt %u t_curr_res %u "
-		  "t_unit_res %u t_flags %s reserve_headq 0x%p "
-		  "write_headq 0x%p grant_reserve_cycle %d "
+		  "t_unit_res %u t_flags %s reserveq %s "
+		  "writeq %s grant_reserve_cycle %d "
 		  "grant_reserve_bytes %d grant_write_cycle %d "
 		  "grant_write_bytes %d curr_cycle %d curr_block %d "
 		  "tail_cycle %d tail_block %d",
@@ -807,8 +809,8 @@ DECLARE_EVENT_CLASS(xfs_loggrant_class,
 		  __entry->curr_res,
 		  __entry->unit_res,
 		  __print_flags(__entry->flags, "|", XLOG_TIC_FLAGS),
-		  __entry->reserve_headq,
-		  __entry->write_headq,
+		  __entry->reserveq ? "empty" : "active",
+		  __entry->writeq ? "empty" : "active",
 		  __entry->grant_reserve_cycle,
 		  __entry->grant_reserve_bytes,
 		  __entry->grant_write_cycle,
@@ -835,6 +837,7 @@ DEFINE_LOGGRANT_EVENT(xfs_log_grant_sleep1);
 DEFINE_LOGGRANT_EVENT(xfs_log_grant_wake1);
 DEFINE_LOGGRANT_EVENT(xfs_log_grant_sleep2);
 DEFINE_LOGGRANT_EVENT(xfs_log_grant_wake2);
+DEFINE_LOGGRANT_EVENT(xfs_log_grant_wake_up);
 DEFINE_LOGGRANT_EVENT(xfs_log_regrant_write_enter);
 DEFINE_LOGGRANT_EVENT(xfs_log_regrant_write_exit);
 DEFINE_LOGGRANT_EVENT(xfs_log_regrant_write_error);
@@ -842,6 +845,7 @@ DEFINE_LOGGRANT_EVENT(xfs_log_regrant_write_sleep1);
 DEFINE_LOGGRANT_EVENT(xfs_log_regrant_write_wake1);
 DEFINE_LOGGRANT_EVENT(xfs_log_regrant_write_sleep2);
 DEFINE_LOGGRANT_EVENT(xfs_log_regrant_write_wake2);
+DEFINE_LOGGRANT_EVENT(xfs_log_regrant_write_wake_up);
 DEFINE_LOGGRANT_EVENT(xfs_log_regrant_reserve_enter);
 DEFINE_LOGGRANT_EVENT(xfs_log_regrant_reserve_exit);
 DEFINE_LOGGRANT_EVENT(xfs_log_regrant_reserve_sub);
@@ -935,10 +939,10 @@ DEFINE_PAGE_EVENT(xfs_writepage);
 DEFINE_PAGE_EVENT(xfs_releasepage);
 DEFINE_PAGE_EVENT(xfs_invalidatepage);
 
-DECLARE_EVENT_CLASS(xfs_iomap_class,
+DECLARE_EVENT_CLASS(xfs_imap_class,
 	TP_PROTO(struct xfs_inode *ip, xfs_off_t offset, ssize_t count,
-		 int flags, struct xfs_bmbt_irec *irec),
-	TP_ARGS(ip, offset, count, flags, irec),
+		 int type, struct xfs_bmbt_irec *irec),
+	TP_ARGS(ip, offset, count, type, irec),
 	TP_STRUCT__entry(
 		__field(dev_t, dev)
 		__field(xfs_ino_t, ino)
@@ -946,7 +950,7 @@ DECLARE_EVENT_CLASS(xfs_iomap_class,
 		__field(loff_t, new_size)
 		__field(loff_t, offset)
 		__field(size_t, count)
-		__field(int, flags)
+		__field(int, type)
 		__field(xfs_fileoff_t, startoff)
 		__field(xfs_fsblock_t, startblock)
 		__field(xfs_filblks_t, blockcount)
@@ -958,13 +962,13 @@ DECLARE_EVENT_CLASS(xfs_iomap_class,
 		__entry->new_size = ip->i_new_size;
 		__entry->offset = offset;
 		__entry->count = count;
-		__entry->flags = flags;
+		__entry->type = type;
 		__entry->startoff = irec ? irec->br_startoff : 0;
 		__entry->startblock = irec ? irec->br_startblock : 0;
 		__entry->blockcount = irec ? irec->br_blockcount : 0;
 	),
 	TP_printk("dev %d:%d ino 0x%llx size 0x%llx new_size 0x%llx "
-		  "offset 0x%llx count %zd flags %s "
+		  "offset 0x%llx count %zd type %s "
 		  "startoff 0x%llx startblock %lld blockcount 0x%llx",
 		  MAJOR(__entry->dev), MINOR(__entry->dev),
 		  __entry->ino,
@@ -972,20 +976,21 @@ DECLARE_EVENT_CLASS(xfs_iomap_class,
 		  __entry->new_size,
 		  __entry->offset,
 		  __entry->count,
-		  __print_flags(__entry->flags, "|", BMAPI_FLAGS),
+		  __print_symbolic(__entry->type, XFS_IO_TYPES),
 		  __entry->startoff,
 		  (__int64_t)__entry->startblock,
 		  __entry->blockcount)
 )
 
 #define DEFINE_IOMAP_EVENT(name)	\
-DEFINE_EVENT(xfs_iomap_class, name,	\
+DEFINE_EVENT(xfs_imap_class, name,	\
 	TP_PROTO(struct xfs_inode *ip, xfs_off_t offset, ssize_t count,	\
-		 int flags, struct xfs_bmbt_irec *irec),		\
-	TP_ARGS(ip, offset, count, flags, irec))
-DEFINE_IOMAP_EVENT(xfs_iomap_enter);
-DEFINE_IOMAP_EVENT(xfs_iomap_found);
-DEFINE_IOMAP_EVENT(xfs_iomap_alloc);
+		 int type, struct xfs_bmbt_irec *irec),		\
+	TP_ARGS(ip, offset, count, type, irec))
+DEFINE_IOMAP_EVENT(xfs_map_blocks_found);
+DEFINE_IOMAP_EVENT(xfs_map_blocks_alloc);
+DEFINE_IOMAP_EVENT(xfs_get_blocks_found);
+DEFINE_IOMAP_EVENT(xfs_get_blocks_alloc);
 
 DECLARE_EVENT_CLASS(xfs_simple_io_class,
 	TP_PROTO(struct xfs_inode *ip, xfs_off_t offset, ssize_t count),
@@ -1022,6 +1027,7 @@ DEFINE_EVENT(xfs_simple_io_class, name,	\
 	TP_ARGS(ip, offset, count))
 DEFINE_SIMPLE_IO_EVENT(xfs_delalloc_enospc);
 DEFINE_SIMPLE_IO_EVENT(xfs_unwritten_convert);
+DEFINE_SIMPLE_IO_EVENT(xfs_get_blocks_notfound);
 
 
 TRACE_EVENT(xfs_itruncate_start,
@@ -1420,6 +1426,7 @@ DEFINE_EVENT(xfs_alloc_class, name, \
 	TP_PROTO(struct xfs_alloc_arg *args), \
 	TP_ARGS(args))
 DEFINE_ALLOC_EVENT(xfs_alloc_exact_done);
+DEFINE_ALLOC_EVENT(xfs_alloc_exact_notfound);
 DEFINE_ALLOC_EVENT(xfs_alloc_exact_error);
 DEFINE_ALLOC_EVENT(xfs_alloc_near_nominleft);
 DEFINE_ALLOC_EVENT(xfs_alloc_near_first);

+ 0 - 1
fs/xfs/quota/xfs_dquot.c

@@ -149,7 +149,6 @@ xfs_qm_dqdestroy(
 	ASSERT(list_empty(&dqp->q_freelist));
 
 	mutex_destroy(&dqp->q_qlock);
-	sv_destroy(&dqp->q_pinwait);
 	kmem_zone_free(xfs_Gqm->qm_dqzone, dqp);
 
 	atomic_dec(&xfs_Gqm->qm_totaldquots);

+ 1 - 1
fs/xfs/xfs_ag.h

@@ -227,7 +227,7 @@ typedef struct xfs_perag {
 
 	atomic_t        pagf_fstrms;    /* # of filestreams active in this AG */
 
-	rwlock_t	pag_ici_lock;	/* incore inode lock */
+	spinlock_t	pag_ici_lock;	/* incore inode cache lock */
 	struct radix_tree_root pag_ici_root;	/* incore inode cache root */
 	int		pag_ici_reclaimable;	/* reclaimable inodes */
 	struct mutex	pag_ici_reclaim_lock;	/* serialisation point */

+ 145 - 206
fs/xfs/xfs_alloc.c

@@ -577,61 +577,58 @@ xfs_alloc_ag_vextent_exact(
 	xfs_extlen_t	rlen;	/* length of returned extent */
 
 	ASSERT(args->alignment == 1);
+
 	/*
 	 * Allocate/initialize a cursor for the by-number freespace btree.
 	 */
 	bno_cur = xfs_allocbt_init_cursor(args->mp, args->tp, args->agbp,
-		args->agno, XFS_BTNUM_BNO);
+					  args->agno, XFS_BTNUM_BNO);
+
 	/*
 	 * Lookup bno and minlen in the btree (minlen is irrelevant, really).
 	 * Look for the closest free block <= bno, it must contain bno
 	 * if any free block does.
 	 */
-	if ((error = xfs_alloc_lookup_le(bno_cur, args->agbno, args->minlen, &i)))
+	error = xfs_alloc_lookup_le(bno_cur, args->agbno, args->minlen, &i);
+	if (error)
 		goto error0;
-	if (!i) {
-		/*
-		 * Didn't find it, return null.
-		 */
-		xfs_btree_del_cursor(bno_cur, XFS_BTREE_NOERROR);
-		args->agbno = NULLAGBLOCK;
-		return 0;
-	}
+	if (!i)
+		goto not_found;
+
 	/*
 	 * Grab the freespace record.
 	 */
-	if ((error = xfs_alloc_get_rec(bno_cur, &fbno, &flen, &i)))
+	error = xfs_alloc_get_rec(bno_cur, &fbno, &flen, &i);
+	if (error)
 		goto error0;
 	XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
 	ASSERT(fbno <= args->agbno);
 	minend = args->agbno + args->minlen;
 	maxend = args->agbno + args->maxlen;
 	fend = fbno + flen;
+
 	/*
 	 * Give up if the freespace isn't long enough for the minimum request.
 	 */
-	if (fend < minend) {
-		xfs_btree_del_cursor(bno_cur, XFS_BTREE_NOERROR);
-		args->agbno = NULLAGBLOCK;
-		return 0;
-	}
+	if (fend < minend)
+		goto not_found;
+
 	/*
 	 * End of extent will be smaller of the freespace end and the
 	 * maximal requested end.
-	 */
-	end = XFS_AGBLOCK_MIN(fend, maxend);
-	/*
+	 *
 	 * Fix the length according to mod and prod if given.
 	 */
+	end = XFS_AGBLOCK_MIN(fend, maxend);
 	args->len = end - args->agbno;
 	xfs_alloc_fix_len(args);
-	if (!xfs_alloc_fix_minleft(args)) {
-		xfs_btree_del_cursor(bno_cur, XFS_BTREE_NOERROR);
-		return 0;
-	}
+	if (!xfs_alloc_fix_minleft(args))
+		goto not_found;
+
 	rlen = args->len;
 	ASSERT(args->agbno + rlen <= fend);
 	end = args->agbno + rlen;
+
 	/*
 	 * We are allocating agbno for rlen [agbno .. end]
 	 * Allocate/initialize a cursor for the by-size btree.
@@ -640,16 +637,25 @@ xfs_alloc_ag_vextent_exact(
 		args->agno, XFS_BTNUM_CNT);
 	ASSERT(args->agbno + args->len <=
 		be32_to_cpu(XFS_BUF_TO_AGF(args->agbp)->agf_length));
-	if ((error = xfs_alloc_fixup_trees(cnt_cur, bno_cur, fbno, flen,
-			args->agbno, args->len, XFSA_FIXUP_BNO_OK))) {
+	error = xfs_alloc_fixup_trees(cnt_cur, bno_cur, fbno, flen, args->agbno,
+				      args->len, XFSA_FIXUP_BNO_OK);
+	if (error) {
 		xfs_btree_del_cursor(cnt_cur, XFS_BTREE_ERROR);
 		goto error0;
 	}
+
 	xfs_btree_del_cursor(bno_cur, XFS_BTREE_NOERROR);
 	xfs_btree_del_cursor(cnt_cur, XFS_BTREE_NOERROR);
 
-	trace_xfs_alloc_exact_done(args);
 	args->wasfromfl = 0;
+	trace_xfs_alloc_exact_done(args);
+	return 0;
+
+not_found:
+	/* Didn't find it, return null. */
+	xfs_btree_del_cursor(bno_cur, XFS_BTREE_NOERROR);
+	args->agbno = NULLAGBLOCK;
+	trace_xfs_alloc_exact_notfound(args);
 	return 0;
 
 error0:
@@ -658,6 +664,95 @@ error0:
 	return error;
 }
 
+/*
+ * Search the btree in a given direction via the search cursor and compare
+ * the records found against the good extent we've already found.
+ */
+STATIC int
+xfs_alloc_find_best_extent(
+	struct xfs_alloc_arg	*args,	/* allocation argument structure */
+	struct xfs_btree_cur	**gcur,	/* good cursor */
+	struct xfs_btree_cur	**scur,	/* searching cursor */
+	xfs_agblock_t		gdiff,	/* difference for search comparison */
+	xfs_agblock_t		*sbno,	/* extent found by search */
+	xfs_extlen_t		*slen,
+	xfs_extlen_t		*slena,	/* aligned length */
+	int			dir)	/* 0 = search right, 1 = search left */
+{
+	xfs_agblock_t		bno;
+	xfs_agblock_t		new;
+	xfs_agblock_t		sdiff;
+	int			error;
+	int			i;
+
+	/* The good extent is perfect, no need to search. */
+	if (!gdiff)
+		goto out_use_good;
+
+	/*
+	 * Look until we find a better one, run out of space or run off the end.
+	 */
+	do {
+		error = xfs_alloc_get_rec(*scur, sbno, slen, &i);
+		if (error)
+			goto error0;
+		XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
+		xfs_alloc_compute_aligned(*sbno, *slen, args->alignment,
+					  args->minlen, &bno, slena);
+
+		/*
+		 * The good extent is closer than this one.
+		 */
+		if (!dir) {
+			if (bno >= args->agbno + gdiff)
+				goto out_use_good;
+		} else {
+			if (bno <= args->agbno - gdiff)
+				goto out_use_good;
+		}
+
+		/*
+		 * Same distance, compare length and pick the best.
+		 */
+		if (*slena >= args->minlen) {
+			args->len = XFS_EXTLEN_MIN(*slena, args->maxlen);
+			xfs_alloc_fix_len(args);
+
+			sdiff = xfs_alloc_compute_diff(args->agbno, args->len,
+						       args->alignment, *sbno,
+						       *slen, &new);
+
+			/*
+			 * Choose closer size and invalidate other cursor.
+			 */
+			if (sdiff < gdiff)
+				goto out_use_search;
+			goto out_use_good;
+		}
+
+		if (!dir)
+			error = xfs_btree_increment(*scur, 0, &i);
+		else
+			error = xfs_btree_decrement(*scur, 0, &i);
+		if (error)
+			goto error0;
+	} while (i);
+
+out_use_good:
+	xfs_btree_del_cursor(*scur, XFS_BTREE_NOERROR);
+	*scur = NULL;
+	return 0;
+
+out_use_search:
+	xfs_btree_del_cursor(*gcur, XFS_BTREE_NOERROR);
+	*gcur = NULL;
+	return 0;
+
+error0:
+	/* caller invalidates cursors */
+	return error;
+}
+
 /*
  * Allocate a variable extent near bno in the allocation group agno.
  * Extent's length (returned in len) will be between minlen and maxlen,
@@ -925,203 +1020,45 @@ xfs_alloc_ag_vextent_near(
 			}
 		}
 	} while (bno_cur_lt || bno_cur_gt);
+
 	/*
 	 * Got both cursors still active, need to find better entry.
 	 */
 	if (bno_cur_lt && bno_cur_gt) {
-		/*
-		 * Left side is long enough, look for a right side entry.
-		 */
 		if (ltlena >= args->minlen) {
 			/*
-			 * Fix up the length.
+			 * Left side is good, look for a right side entry.
 			 */
 			args->len = XFS_EXTLEN_MIN(ltlena, args->maxlen);
 			xfs_alloc_fix_len(args);
-			rlen = args->len;
-			ltdiff = xfs_alloc_compute_diff(args->agbno, rlen,
+			ltdiff = xfs_alloc_compute_diff(args->agbno, args->len,
 				args->alignment, ltbno, ltlen, &ltnew);
+
+			error = xfs_alloc_find_best_extent(args,
+						&bno_cur_lt, &bno_cur_gt,
+						ltdiff, &gtbno, &gtlen, &gtlena,
+						0 /* search right */);
+		} else {
+			ASSERT(gtlena >= args->minlen);
+
 			/*
-			 * Not perfect.
-			 */
-			if (ltdiff) {
-				/*
-				 * Look until we find a better one, run out of
-				 * space, or run off the end.
-				 */
-				while (bno_cur_lt && bno_cur_gt) {
-					if ((error = xfs_alloc_get_rec(
-							bno_cur_gt, &gtbno,
-							&gtlen, &i)))
-						goto error0;
-					XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
-					xfs_alloc_compute_aligned(gtbno, gtlen,
-						args->alignment, args->minlen,
-						&gtbnoa, &gtlena);
-					/*
-					 * The left one is clearly better.
-					 */
-					if (gtbnoa >= args->agbno + ltdiff) {
-						xfs_btree_del_cursor(
-							bno_cur_gt,
-							XFS_BTREE_NOERROR);
-						bno_cur_gt = NULL;
-						break;
-					}
-					/*
-					 * If we reach a big enough entry,
-					 * compare the two and pick the best.
-					 */
-					if (gtlena >= args->minlen) {
-						args->len =
-							XFS_EXTLEN_MIN(gtlena,
-								args->maxlen);
-						xfs_alloc_fix_len(args);
-						rlen = args->len;
-						gtdiff = xfs_alloc_compute_diff(
-							args->agbno, rlen,
-							args->alignment,
-							gtbno, gtlen, &gtnew);
-						/*
-						 * Right side is better.
-						 */
-						if (gtdiff < ltdiff) {
-							xfs_btree_del_cursor(
-								bno_cur_lt,
-								XFS_BTREE_NOERROR);
-							bno_cur_lt = NULL;
-						}
-						/*
-						 * Left side is better.
-						 */
-						else {
-							xfs_btree_del_cursor(
-								bno_cur_gt,
-								XFS_BTREE_NOERROR);
-							bno_cur_gt = NULL;
-						}
-						break;
-					}
-					/*
-					 * Fell off the right end.
-					 */
-					if ((error = xfs_btree_increment(
-							bno_cur_gt, 0, &i)))
-						goto error0;
-					if (!i) {
-						xfs_btree_del_cursor(
-							bno_cur_gt,
-							XFS_BTREE_NOERROR);
-						bno_cur_gt = NULL;
-						break;
-					}
-				}
-			}
-			/*
-			 * The left side is perfect, trash the right side.
-			 */
-			else {
-				xfs_btree_del_cursor(bno_cur_gt,
-						     XFS_BTREE_NOERROR);
-				bno_cur_gt = NULL;
-			}
-		}
-		/*
-		 * It's the right side that was found first, look left.
-		 */
-		else {
-			/*
-			 * Fix up the length.
+			 * Right side is good, look for a left side entry.
 			 */
 			args->len = XFS_EXTLEN_MIN(gtlena, args->maxlen);
 			xfs_alloc_fix_len(args);
-			rlen = args->len;
-			gtdiff = xfs_alloc_compute_diff(args->agbno, rlen,
+			gtdiff = xfs_alloc_compute_diff(args->agbno, args->len,
 				args->alignment, gtbno, gtlen, &gtnew);
-			/*
-			 * Right side entry isn't perfect.
-			 */
-			if (gtdiff) {
-				/*
-				 * Look until we find a better one, run out of
-				 * space, or run off the end.
-				 */
-				while (bno_cur_lt && bno_cur_gt) {
-					if ((error = xfs_alloc_get_rec(
-							bno_cur_lt, &ltbno,
-							&ltlen, &i)))
-						goto error0;
-					XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
-					xfs_alloc_compute_aligned(ltbno, ltlen,
-						args->alignment, args->minlen,
-						&ltbnoa, &ltlena);
-					/*
-					 * The right one is clearly better.
-					 */
-					if (ltbnoa <= args->agbno - gtdiff) {
-						xfs_btree_del_cursor(
-							bno_cur_lt,
-							XFS_BTREE_NOERROR);
-						bno_cur_lt = NULL;
-						break;
-					}
-					/*
-					 * If we reach a big enough entry,
-					 * compare the two and pick the best.
-					 */
-					if (ltlena >= args->minlen) {
-						args->len = XFS_EXTLEN_MIN(
-							ltlena, args->maxlen);
-						xfs_alloc_fix_len(args);
-						rlen = args->len;
-						ltdiff = xfs_alloc_compute_diff(
-							args->agbno, rlen,
-							args->alignment,
-							ltbno, ltlen, &ltnew);
-						/*
-						 * Left side is better.
-						 */
-						if (ltdiff < gtdiff) {
-							xfs_btree_del_cursor(
-								bno_cur_gt,
-								XFS_BTREE_NOERROR);
-							bno_cur_gt = NULL;
-						}
-						/*
-						 * Right side is better.
-						 */
-						else {
-							xfs_btree_del_cursor(
-								bno_cur_lt,
-								XFS_BTREE_NOERROR);
-							bno_cur_lt = NULL;
-						}
-						break;
-					}
-					/*
-					 * Fell off the left end.
-					 */
-					if ((error = xfs_btree_decrement(
-							bno_cur_lt, 0, &i)))
-						goto error0;
-					if (!i) {
-						xfs_btree_del_cursor(bno_cur_lt,
-							XFS_BTREE_NOERROR);
-						bno_cur_lt = NULL;
-						break;
-					}
-				}
-			}
-			/*
-			 * The right side is perfect, trash the left side.
-			 */
-			else {
-				xfs_btree_del_cursor(bno_cur_lt,
-					XFS_BTREE_NOERROR);
-				bno_cur_lt = NULL;
-			}
+
+			error = xfs_alloc_find_best_extent(args,
+						&bno_cur_gt, &bno_cur_lt,
+						gtdiff, &ltbno, &ltlen, &ltlena,
+						1 /* search left */);
 		}
+
+		if (error)
+			goto error0;
 	}
+
 	/*
 	 * If we couldn't get anything, give up.
 	 */
@@ -1130,6 +1067,7 @@ xfs_alloc_ag_vextent_near(
 		args->agbno = NULLAGBLOCK;
 		return 0;
 	}
+
 	/*
 	 * At this point we have selected a freespace entry, either to the
 	 * left or to the right.  If it's on the right, copy all the
@@ -1146,6 +1084,7 @@ xfs_alloc_ag_vextent_near(
 		j = 1;
 	} else
 		j = 0;
+
 	/*
 	 * Fix up the length and compute the useful address.
 	 */

+ 2 - 2
fs/xfs/xfs_attr_leaf.c

@@ -637,7 +637,7 @@ xfs_attr_shortform_list(xfs_attr_list_context_t *context)
 	 * It didn't all fit, so we have to sort everything on hashval.
 	 */
 	sbsize = sf->hdr.count * sizeof(*sbuf);
-	sbp = sbuf = kmem_alloc(sbsize, KM_SLEEP);
+	sbp = sbuf = kmem_alloc(sbsize, KM_SLEEP | KM_NOFS);
 
 	/*
 	 * Scan the attribute list for the rest of the entries, storing
@@ -2386,7 +2386,7 @@ xfs_attr_leaf_list_int(xfs_dabuf_t *bp, xfs_attr_list_context_t *context)
 				args.dp = context->dp;
 				args.whichfork = XFS_ATTR_FORK;
 				args.valuelen = valuelen;
-				args.value = kmem_alloc(valuelen, KM_SLEEP);
+				args.value = kmem_alloc(valuelen, KM_SLEEP | KM_NOFS);
 				args.rmtblkno = be32_to_cpu(name_rmt->valueblk);
 				args.rmtblkcnt = XFS_B_TO_FSB(args.dp->i_mount, valuelen);
 				retval = xfs_attr_rmtval_get(&args);

+ 4 - 5
fs/xfs/xfs_btree.c

@@ -634,9 +634,8 @@ xfs_btree_read_bufl(
 		return error;
 	}
 	ASSERT(!bp || !XFS_BUF_GETERROR(bp));
-	if (bp != NULL) {
+	if (bp)
 		XFS_BUF_SET_VTYPE_REF(bp, B_FS_MAP, refval);
-	}
 	*bpp = bp;
 	return 0;
 }
@@ -944,13 +943,13 @@ xfs_btree_set_refs(
 	switch (cur->bc_btnum) {
 	case XFS_BTNUM_BNO:
 	case XFS_BTNUM_CNT:
-		XFS_BUF_SET_VTYPE_REF(*bpp, B_FS_MAP, XFS_ALLOC_BTREE_REF);
+		XFS_BUF_SET_VTYPE_REF(bp, B_FS_MAP, XFS_ALLOC_BTREE_REF);
 		break;
 	case XFS_BTNUM_INO:
-		XFS_BUF_SET_VTYPE_REF(*bpp, B_FS_INOMAP, XFS_INO_BTREE_REF);
+		XFS_BUF_SET_VTYPE_REF(bp, B_FS_INOMAP, XFS_INO_BTREE_REF);
 		break;
 	case XFS_BTNUM_BMAP:
-		XFS_BUF_SET_VTYPE_REF(*bpp, B_FS_MAP, XFS_BMAP_BTREE_REF);
+		XFS_BUF_SET_VTYPE_REF(bp, B_FS_MAP, XFS_BMAP_BTREE_REF);
 		break;
 	default:
 		ASSERT(0);

+ 21 - 11
fs/xfs/xfs_buf_item.c

@@ -142,7 +142,7 @@ xfs_buf_item_log_check(
 #endif
 
 STATIC void	xfs_buf_error_relse(xfs_buf_t *bp);
-STATIC void	xfs_buf_do_callbacks(xfs_buf_t *bp, xfs_log_item_t *lip);
+STATIC void	xfs_buf_do_callbacks(struct xfs_buf *bp);
 
 /*
  * This returns the number of log iovecs needed to log the
@@ -450,7 +450,7 @@ xfs_buf_item_unpin(
 		 * xfs_trans_ail_delete() drops the AIL lock.
 		 */
 		if (bip->bli_flags & XFS_BLI_STALE_INODE) {
-			xfs_buf_do_callbacks(bp, (xfs_log_item_t *)bip);
+			xfs_buf_do_callbacks(bp);
 			XFS_BUF_SET_FSPRIVATE(bp, NULL);
 			XFS_BUF_CLR_IODONE_FUNC(bp);
 		} else {
@@ -918,15 +918,26 @@ xfs_buf_attach_iodone(
 	XFS_BUF_SET_IODONE_FUNC(bp, xfs_buf_iodone_callbacks);
 }
 
+/*
+ * We can have many callbacks on a buffer. Running the callbacks individually
+ * can cause a lot of contention on the AIL lock, so we allow a single
+ * callback to scan the remaining lip->li_bio_list for other items with the
+ * same type and callback and process them all in the first call.
+ *
+ * As a result, the loop walking the callback list below will also modify the
+ * list. It removes the first item from the list and then runs the callback.
+ * The loop then restarts from the new head of the list. This allows the
+ * callback to scan and modify the list attached to the buffer and we don't
+ * have to care about maintaining a next item pointer.
+ */
 STATIC void
 xfs_buf_do_callbacks(
-	xfs_buf_t	*bp,
-	xfs_log_item_t	*lip)
+	struct xfs_buf		*bp)
 {
-	xfs_log_item_t	*nlip;
+	struct xfs_log_item	*lip;
 
-	while (lip != NULL) {
-		nlip = lip->li_bio_list;
+	while ((lip = XFS_BUF_FSPRIVATE(bp, xfs_log_item_t *)) != NULL) {
+		XFS_BUF_SET_FSPRIVATE(bp, lip->li_bio_list);
 		ASSERT(lip->li_cb != NULL);
 		/*
 		 * Clear the next pointer so we don't have any
@@ -936,7 +947,6 @@ xfs_buf_do_callbacks(
 		 */
 		lip->li_bio_list = NULL;
 		lip->li_cb(bp, lip);
-		lip = nlip;
 	}
 }
 
@@ -970,7 +980,7 @@ xfs_buf_iodone_callbacks(
 			ASSERT(XFS_BUF_TARGET(bp) == mp->m_ddev_targp);
 			XFS_BUF_SUPER_STALE(bp);
 			trace_xfs_buf_item_iodone(bp, _RET_IP_);
-			xfs_buf_do_callbacks(bp, lip);
+			xfs_buf_do_callbacks(bp);
 			XFS_BUF_SET_FSPRIVATE(bp, NULL);
 			XFS_BUF_CLR_IODONE_FUNC(bp);
 			xfs_buf_ioend(bp, 0);
@@ -1029,7 +1039,7 @@ xfs_buf_iodone_callbacks(
 		return;
 	}
 
-	xfs_buf_do_callbacks(bp, lip);
+	xfs_buf_do_callbacks(bp);
 	XFS_BUF_SET_FSPRIVATE(bp, NULL);
 	XFS_BUF_CLR_IODONE_FUNC(bp);
 	xfs_buf_ioend(bp, 0);
@@ -1063,7 +1073,7 @@ xfs_buf_error_relse(
 	 * We have to unpin the pinned buffers so do the
 	 * callbacks.
 	 */
-	xfs_buf_do_callbacks(bp, lip);
+	xfs_buf_do_callbacks(bp);
 	XFS_BUF_SET_FSPRIVATE(bp, NULL);
 	XFS_BUF_CLR_IODONE_FUNC(bp);
 	XFS_BUF_SET_BRELSE_FUNC(bp,NULL);
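
The comment added above xfs_buf_do_callbacks() describes a loop that never caches a next pointer: it detaches the current head and restarts from whatever the new head is, because a callback may itself consume further list entries. A minimal user-space sketch of that consumption pattern (hypothetical item/queue names, a plain singly linked list standing in for the buffer's log item chain):

#include <stdio.h>
#include <stdlib.h>

struct item {
	struct item	*next;
	void		(*cb)(struct item **headp, struct item *it);
};

/* A callback is free to dequeue additional items via *headp. */
static void print_cb(struct item **headp, struct item *it)
{
	(void)headp;
	printf("processing %p\n", (void *)it);
	free(it);
}

static void run_callbacks(struct item **headp)
{
	struct item *it;

	/* Re-read the head every iteration; never keep a stale next pointer. */
	while ((it = *headp) != NULL) {
		*headp = it->next;	/* detach before running the callback */
		it->next = NULL;
		it->cb(headp, it);	/* may remove more items from *headp */
	}
}

int main(void)
{
	struct item *head = NULL;

	for (int i = 0; i < 3; i++) {
		struct item *it = malloc(sizeof(*it));

		it->cb = print_cb;
		it->next = head;
		head = it;
	}
	run_callbacks(&head);
	return 0;
}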

+ 0 - 11
fs/xfs/xfs_buf_item.h

@@ -105,17 +105,6 @@ typedef struct xfs_buf_log_item {
 	xfs_buf_log_format_t	bli_format;	/* in-log header */
 } xfs_buf_log_item_t;
 
-/*
- * This structure is used during recovery to record the buf log
- * items which have been canceled and should not be replayed.
- */
-typedef struct xfs_buf_cancel {
-	xfs_daddr_t		bc_blkno;
-	uint			bc_len;
-	int			bc_refcount;
-	struct xfs_buf_cancel	*bc_next;
-} xfs_buf_cancel_t;
-
 void	xfs_buf_item_init(struct xfs_buf *, struct xfs_mount *);
 void	xfs_buf_item_relse(struct xfs_buf *);
 void	xfs_buf_item_log(xfs_buf_log_item_t *, uint, uint);

+ 51 - 46
fs/xfs/xfs_extfree_item.c

@@ -47,6 +47,28 @@ xfs_efi_item_free(
 		kmem_zone_free(xfs_efi_zone, efip);
 }
 
+/*
+ * Freeing the efi requires that we remove it from the AIL if it has already
+ * been placed there. However, the EFI may not yet have been placed in the AIL
+ * when called by xfs_efi_release() from EFD processing due to the ordering of
+ * committed vs unpin operations in bulk insert operations. Hence the
+ * test_and_clear_bit(XFS_EFI_COMMITTED) to ensure only the last caller frees
+ * the EFI.
+ */
+STATIC void
+__xfs_efi_release(
+	struct xfs_efi_log_item	*efip)
+{
+	struct xfs_ail		*ailp = efip->efi_item.li_ailp;
+
+	if (!test_and_clear_bit(XFS_EFI_COMMITTED, &efip->efi_flags)) {
+		spin_lock(&ailp->xa_lock);
+		/* xfs_trans_ail_delete() drops the AIL lock. */
+		xfs_trans_ail_delete(ailp, &efip->efi_item);
+		xfs_efi_item_free(efip);
+	}
+}
+
 /*
  * This returns the number of iovecs needed to log the given efi item.
  * We only need 1 iovec for an efi item.  It just logs the efi_log_format
@@ -74,7 +96,8 @@ xfs_efi_item_format(
 	struct xfs_efi_log_item	*efip = EFI_ITEM(lip);
 	uint			size;
 
-	ASSERT(efip->efi_next_extent == efip->efi_format.efi_nextents);
+	ASSERT(atomic_read(&efip->efi_next_extent) ==
+				efip->efi_format.efi_nextents);
 
 	efip->efi_format.efi_type = XFS_LI_EFI;
 
@@ -99,10 +122,12 @@ xfs_efi_item_pin(
 }
 
 /*
- * While EFIs cannot really be pinned, the unpin operation is the
- * last place at which the EFI is manipulated during a transaction.
- * Here we coordinate with xfs_efi_cancel() to determine who gets to
- * free the EFI.
+ * While EFIs cannot really be pinned, the unpin operation is the last place at
+ * which the EFI is manipulated during a transaction.  If we are being asked to
+ * remove the EFI it's because the transaction has been cancelled and by
+ * definition that means the EFI cannot be in the AIL so remove it from the
+ * transaction and free it.  Otherwise coordinate with xfs_efi_release() (via
+ * XFS_EFI_COMMITTED) to determine who gets to free the EFI.
  */
 STATIC void
 xfs_efi_item_unpin(
@@ -110,20 +135,14 @@ xfs_efi_item_unpin(
 	int			remove)
 {
 	struct xfs_efi_log_item	*efip = EFI_ITEM(lip);
-	struct xfs_ail		*ailp = lip->li_ailp;
-
-	spin_lock(&ailp->xa_lock);
-	if (efip->efi_flags & XFS_EFI_CANCELED) {
-		if (remove)
-			xfs_trans_del_item(lip);
 
-		/* xfs_trans_ail_delete() drops the AIL lock. */
-		xfs_trans_ail_delete(ailp, lip);
+	if (remove) {
+		ASSERT(!(lip->li_flags & XFS_LI_IN_AIL));
+		xfs_trans_del_item(lip);
 		xfs_efi_item_free(efip);
-	} else {
-		efip->efi_flags |= XFS_EFI_COMMITTED;
-		spin_unlock(&ailp->xa_lock);
+		return;
 	}
+	__xfs_efi_release(efip);
 }
 
 /*
@@ -152,16 +171,20 @@ xfs_efi_item_unlock(
 }
 
 /*
- * The EFI is logged only once and cannot be moved in the log, so
- * simply return the lsn at which it's been logged.  The canceled
- * flag is not paid any attention here.  Checking for that is delayed
- * until the EFI is unpinned.
+ * The EFI is logged only once and cannot be moved in the log, so simply return
+ * the lsn at which it's been logged.  For bulk transaction committed
+ * processing, the EFI may be processed but not yet unpinned prior to the EFD
+ * being processed. Set the XFS_EFI_COMMITTED flag so this case can be detected
+ * when processing the EFD.
  */
 STATIC xfs_lsn_t
 xfs_efi_item_committed(
 	struct xfs_log_item	*lip,
 	xfs_lsn_t		lsn)
 {
+	struct xfs_efi_log_item	*efip = EFI_ITEM(lip);
+
+	set_bit(XFS_EFI_COMMITTED, &efip->efi_flags);
 	return lsn;
 }
 
@@ -230,6 +253,7 @@ xfs_efi_init(
 	xfs_log_item_init(mp, &efip->efi_item, XFS_LI_EFI, &xfs_efi_item_ops);
 	efip->efi_format.efi_nextents = nextents;
 	efip->efi_format.efi_id = (__psint_t)(void*)efip;
+	atomic_set(&efip->efi_next_extent, 0);
 
 	return efip;
 }
@@ -289,37 +313,18 @@ xfs_efi_copy_format(xfs_log_iovec_t *buf, xfs_efi_log_format_t *dst_efi_fmt)
 }
 
 /*
- * This is called by the efd item code below to release references to
- * the given efi item.  Each efd calls this with the number of
- * extents that it has logged, and when the sum of these reaches
- * the total number of extents logged by this efi item we can free
- * the efi item.
- *
- * Freeing the efi item requires that we remove it from the AIL.
- * We'll use the AIL lock to protect our counters as well as
- * the removal from the AIL.
+ * This is called by the efd item code below to release references to the given
+ * efi item.  Each efd calls this with the number of extents that it has
+ * logged, and when the sum of these reaches the total number of extents logged
+ * by this efi item we can free the efi item.
  */
 void
 xfs_efi_release(xfs_efi_log_item_t	*efip,
 		uint			nextents)
 {
-	struct xfs_ail		*ailp = efip->efi_item.li_ailp;
-	int			extents_left;
-
-	ASSERT(efip->efi_next_extent > 0);
-	ASSERT(efip->efi_flags & XFS_EFI_COMMITTED);
-
-	spin_lock(&ailp->xa_lock);
-	ASSERT(efip->efi_next_extent >= nextents);
-	efip->efi_next_extent -= nextents;
-	extents_left = efip->efi_next_extent;
-	if (extents_left == 0) {
-		/* xfs_trans_ail_delete() drops the AIL lock. */
-		xfs_trans_ail_delete(ailp, (xfs_log_item_t *)efip);
-		xfs_efi_item_free(efip);
-	} else {
-		spin_unlock(&ailp->xa_lock);
-	}
+	ASSERT(atomic_read(&efip->efi_next_extent) >= nextents);
+	if (atomic_sub_and_test(nextents, &efip->efi_next_extent))
+		__xfs_efi_release(efip);
 }
 
 static inline struct xfs_efd_log_item *EFD_ITEM(struct xfs_log_item *lip)
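
The reworked EFI release path above leans on two lock-free idioms: atomic_sub_and_test(), so that only the caller whose decrement reaches zero proceeds to free, and test_and_clear_bit(), so that only one of the racing unpin/EFD paths performs the final teardown. A rough user-space equivalent using C11 atomics (the names and the main() sequence are illustrative, not the kernel API):

#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

struct intent {
	atomic_int	remaining;	/* plays the role of efi_next_extent */
	atomic_bool	committed;	/* plays the role of the COMMITTED bit */
};

/* True for exactly one caller: the one whose decrement reaches zero. */
static bool intent_sub_and_test(struct intent *in, int n)
{
	return atomic_fetch_sub(&in->remaining, n) == n;
}

/* True only for the single caller that observed the flag still set. */
static bool intent_test_and_clear_committed(struct intent *in)
{
	return atomic_exchange(&in->committed, false);
}

int main(void)
{
	struct intent in;

	atomic_init(&in.remaining, 3);
	atomic_init(&in.committed, true);

	printf("drop 1, last? %d\n", intent_sub_and_test(&in, 1));		/* 0 */
	printf("drop 2, last? %d\n", intent_sub_and_test(&in, 2));		/* 1 */
	printf("clear #1: %d\n", intent_test_and_clear_committed(&in));	/* 1 */
	printf("clear #2: %d\n", intent_test_and_clear_committed(&in));	/* 0 */
	return 0;
}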

+ 5 - 6
fs/xfs/xfs_extfree_item.h

@@ -111,11 +111,10 @@ typedef struct xfs_efd_log_format_64 {
 #define	XFS_EFI_MAX_FAST_EXTENTS	16
 
 /*
- * Define EFI flags.
+ * Define EFI flag bits. Manipulated by set/clear/test_bit operators.
  */
-#define	XFS_EFI_RECOVERED	0x1
-#define	XFS_EFI_COMMITTED	0x2
-#define	XFS_EFI_CANCELED	0x4
+#define	XFS_EFI_RECOVERED	1
+#define	XFS_EFI_COMMITTED	2
 
 /*
  * This is the "extent free intention" log item.  It is used
@@ -125,8 +124,8 @@ typedef struct xfs_efd_log_format_64 {
  */
 typedef struct xfs_efi_log_item {
 	xfs_log_item_t		efi_item;
-	uint			efi_flags;	/* misc flags */
-	uint			efi_next_extent;
+	atomic_t		efi_next_extent;
+	unsigned long		efi_flags;	/* misc flags */
 	xfs_efi_log_format_t	efi_format;
 } xfs_efi_log_item_t;
 

+ 1 - 0
fs/xfs/xfs_fsops.c

@@ -374,6 +374,7 @@ xfs_growfs_data_private(
 		mp->m_maxicount = icount << mp->m_sb.sb_inopblog;
 	} else
 		mp->m_maxicount = 0;
+	xfs_set_low_space_thresholds(mp);
 
 	/* update secondary superblocks. */
 	for (agno = 1; agno < nagcount; agno++) {

+ 61 - 18
fs/xfs/xfs_iget.c

@@ -42,6 +42,17 @@
 #include "xfs_trace.h"
 
 
+/*
+ * Define xfs inode iolock lockdep classes. We need to ensure that all active
+ * inodes are considered the same for lockdep purposes, including inodes that
+ * are recycled through the XFS_IRECLAIMABLE state. This is the only way to
+ * guarantee the locks are considered the same when there are multiple lock
+ * initialisation sites. Also, define a reclaimable inode class so it is
+ * obvious in lockdep reports which class the report is against.
+ */
+static struct lock_class_key xfs_iolock_active;
+struct lock_class_key xfs_iolock_reclaimable;
+
 /*
  * Allocate and initialise an xfs_inode.
  */
@@ -69,8 +80,11 @@ xfs_inode_alloc(
 	ASSERT(atomic_read(&ip->i_pincount) == 0);
 	ASSERT(!spin_is_locked(&ip->i_flags_lock));
 	ASSERT(completion_done(&ip->i_flush));
+	ASSERT(ip->i_ino == 0);
 
 	mrlock_init(&ip->i_iolock, MRLOCK_BARRIER, "xfsio", ip->i_ino);
+	lockdep_set_class_and_name(&ip->i_iolock.mr_lock,
+			&xfs_iolock_active, "xfs_iolock_active");
 
 	/* initialise the xfs inode */
 	ip->i_ino = ino;
@@ -85,9 +99,6 @@ xfs_inode_alloc(
 	ip->i_size = 0;
 	ip->i_new_size = 0;
 
-	/* prevent anyone from using this yet */
-	VFS_I(ip)->i_state = I_NEW;
-
 	return ip;
 }
 
@@ -145,7 +156,18 @@ xfs_inode_free(
 	ASSERT(!spin_is_locked(&ip->i_flags_lock));
 	ASSERT(completion_done(&ip->i_flush));
 
-	call_rcu(&ip->i_vnode.i_rcu, xfs_inode_free_callback);
+	/*
+	 * Because we use RCU freeing we need to ensure the inode always
+	 * appears to be reclaimed with an invalid inode number when in the
+	 * free state. The ip->i_flags_lock provides the barrier against lookup
+	 * races.
+	 */
+	spin_lock(&ip->i_flags_lock);
+	ip->i_flags = XFS_IRECLAIM;
+	ip->i_ino = 0;
+	spin_unlock(&ip->i_flags_lock);
+
+	call_rcu(&VFS_I(ip)->i_rcu, xfs_inode_free_callback);
 }
 
 /*
@@ -155,14 +177,29 @@ static int
 xfs_iget_cache_hit(
 	struct xfs_perag	*pag,
 	struct xfs_inode	*ip,
+	xfs_ino_t		ino,
 	int			flags,
-	int			lock_flags) __releases(pag->pag_ici_lock)
+	int			lock_flags) __releases(RCU)
 {
 	struct inode		*inode = VFS_I(ip);
 	struct xfs_mount	*mp = ip->i_mount;
 	int			error;
 
+	/*
+	 * check for re-use of an inode within an RCU grace period due to the
+	 * radix tree nodes not being updated yet. We monitor for this by
+	 * setting the inode number to zero before freeing the inode structure.
+	 * If the inode has been reallocated and set up, then the inode number
+	 * will not match, so check for that, too.
+	 */
 	spin_lock(&ip->i_flags_lock);
+	if (ip->i_ino != ino) {
+		trace_xfs_iget_skip(ip);
+		XFS_STATS_INC(xs_ig_frecycle);
+		error = EAGAIN;
+		goto out_error;
+	}
+
 
 	/*
 	 * If we are racing with another cache hit that is currently
@@ -205,7 +242,7 @@ xfs_iget_cache_hit(
 		ip->i_flags |= XFS_IRECLAIM;
 
 		spin_unlock(&ip->i_flags_lock);
-		read_unlock(&pag->pag_ici_lock);
+		rcu_read_unlock();
 
 		error = -inode_init_always(mp->m_super, inode);
 		if (error) {
@@ -213,7 +250,7 @@ xfs_iget_cache_hit(
 			 * Re-initializing the inode failed, and we are in deep
 			 * trouble.  Try to re-add it to the reclaim list.
 			 */
-			read_lock(&pag->pag_ici_lock);
+			rcu_read_lock();
 			spin_lock(&ip->i_flags_lock);
 
 			ip->i_flags &= ~XFS_INEW;
@@ -223,14 +260,20 @@ xfs_iget_cache_hit(
 			goto out_error;
 		}
 
-		write_lock(&pag->pag_ici_lock);
+		spin_lock(&pag->pag_ici_lock);
 		spin_lock(&ip->i_flags_lock);
 		ip->i_flags &= ~(XFS_IRECLAIMABLE | XFS_IRECLAIM);
 		ip->i_flags |= XFS_INEW;
 		__xfs_inode_clear_reclaim_tag(mp, pag, ip);
 		inode->i_state = I_NEW;
+
+		ASSERT(!rwsem_is_locked(&ip->i_iolock.mr_lock));
+		mrlock_init(&ip->i_iolock, MRLOCK_BARRIER, "xfsio", ip->i_ino);
+		lockdep_set_class_and_name(&ip->i_iolock.mr_lock,
+				&xfs_iolock_active, "xfs_iolock_active");
+
 		spin_unlock(&ip->i_flags_lock);
-		write_unlock(&pag->pag_ici_lock);
+		spin_unlock(&pag->pag_ici_lock);
 	} else {
 		/* If the VFS inode is being torn down, pause and try again. */
 		if (!igrab(inode)) {
@@ -241,7 +284,7 @@ xfs_iget_cache_hit(
 
 		/* We've got a live one. */
 		spin_unlock(&ip->i_flags_lock);
-		read_unlock(&pag->pag_ici_lock);
+		rcu_read_unlock();
 		trace_xfs_iget_hit(ip);
 	}
 
@@ -255,7 +298,7 @@ xfs_iget_cache_hit(
 
 out_error:
 	spin_unlock(&ip->i_flags_lock);
-	read_unlock(&pag->pag_ici_lock);
+	rcu_read_unlock();
 	return error;
 }
 
@@ -308,7 +351,7 @@ xfs_iget_cache_miss(
 			BUG();
 	}
 
-	write_lock(&pag->pag_ici_lock);
+	spin_lock(&pag->pag_ici_lock);
 
 	/* insert the new inode */
 	error = radix_tree_insert(&pag->pag_ici_root, agino, ip);
@@ -323,14 +366,14 @@ xfs_iget_cache_miss(
 	ip->i_udquot = ip->i_gdquot = NULL;
 	xfs_iflags_set(ip, XFS_INEW);
 
-	write_unlock(&pag->pag_ici_lock);
+	spin_unlock(&pag->pag_ici_lock);
 	radix_tree_preload_end();
 
 	*ipp = ip;
 	return 0;
 
 out_preload_end:
-	write_unlock(&pag->pag_ici_lock);
+	spin_unlock(&pag->pag_ici_lock);
 	radix_tree_preload_end();
 	if (lock_flags)
 		xfs_iunlock(ip, lock_flags);
@@ -377,7 +420,7 @@ xfs_iget(
 	xfs_agino_t	agino;
 
 	/* reject inode numbers outside existing AGs */
-	if (XFS_INO_TO_AGNO(mp, ino) >= mp->m_sb.sb_agcount)
+	if (!ino || XFS_INO_TO_AGNO(mp, ino) >= mp->m_sb.sb_agcount)
 		return EINVAL;
 
 	/* get the perag structure and ensure that it's inode capable */
@@ -386,15 +429,15 @@ xfs_iget(
 
 again:
 	error = 0;
-	read_lock(&pag->pag_ici_lock);
+	rcu_read_lock();
 	ip = radix_tree_lookup(&pag->pag_ici_root, agino);
 
 	if (ip) {
-		error = xfs_iget_cache_hit(pag, ip, flags, lock_flags);
+		error = xfs_iget_cache_hit(pag, ip, ino, flags, lock_flags);
 		if (error)
 			goto out_error_or_again;
 	} else {
-		read_unlock(&pag->pag_ici_lock);
+		rcu_read_unlock();
 		XFS_STATS_INC(xs_ig_missed);
 
 		error = xfs_iget_cache_miss(mp, pag, tp, ino, &ip,
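
The xfs_iget.c changes above follow the usual RCU-lookup discipline: an object found by a lockless radix tree walk may already have been freed and recycled, so its identity (ip->i_ino) is re-checked under the object's own spinlock, and the free side zeroes that identity under the same lock before deferring the actual free to call_rcu(). The sketch below shows only that revalidation ordering, with a pthread mutex standing in for i_flags_lock and the RCU grace-period machinery assumed rather than implemented; all names are illustrative:

#include <pthread.h>
#include <stdbool.h>
#include <stdint.h>

struct node {
	pthread_mutex_t	lock;	/* stands in for ip->i_flags_lock */
	uint64_t	ino;	/* 0 means "being freed, do not use" */
};

/* Free side: invalidate the identity before the deferred (RCU) free. */
static void node_mark_freed(struct node *n)
{
	pthread_mutex_lock(&n->lock);
	n->ino = 0;
	pthread_mutex_unlock(&n->lock);
	/* a call_rcu()-style deferred free would be queued here */
}

/* Lookup side: revalidate the identity after the lockless lookup. */
static bool node_still_matches(struct node *n, uint64_t want_ino)
{
	bool ok;

	pthread_mutex_lock(&n->lock);
	ok = (n->ino == want_ino);
	pthread_mutex_unlock(&n->lock);
	return ok;	/* on false, the caller retries the whole lookup */
}

int main(void)
{
	struct node n = { .lock = PTHREAD_MUTEX_INITIALIZER, .ino = 42 };
	int hit = node_still_matches(&n, 42);	/* 1: identity still valid */

	node_mark_freed(&n);
	return (hit && !node_still_matches(&n, 42)) ? 0 : 1;
}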

+ 41 - 13
fs/xfs/xfs_inode.c

@@ -887,7 +887,7 @@ xfs_iread(
 	 * around for a while.  This helps to keep recently accessed
 	 * meta-data in-core longer.
 	 */
-	XFS_BUF_SET_REF(bp, XFS_INO_REF);
+	xfs_buf_set_ref(bp, XFS_INO_REF);
 
 	/*
 	 * Use xfs_trans_brelse() to release the buffer containing the
@@ -2000,16 +2000,32 @@ xfs_ifree_cluster(
 		 */
 		for (i = 0; i < ninodes; i++) {
 retry:
-			read_lock(&pag->pag_ici_lock);
+			rcu_read_lock();
 			ip = radix_tree_lookup(&pag->pag_ici_root,
 					XFS_INO_TO_AGINO(mp, (inum + i)));
 
-			/* Inode not in memory or stale, nothing to do */
-			if (!ip || xfs_iflags_test(ip, XFS_ISTALE)) {
-				read_unlock(&pag->pag_ici_lock);
+			/* Inode not in memory, nothing to do */
+			if (!ip) {
+				rcu_read_unlock();
 				continue;
 			}
 
+			/*
+			 * because this is an RCU protected lookup, we could
+			 * find a recently freed or even reallocated inode
+			 * during the lookup. We need to check under the
+			 * i_flags_lock for a valid inode here. Skip it if it
+			 * is not valid, the wrong inode or stale.
+			 */
+			spin_lock(&ip->i_flags_lock);
+			if (ip->i_ino != inum + i ||
+			    __xfs_iflags_test(ip, XFS_ISTALE)) {
+				spin_unlock(&ip->i_flags_lock);
+				rcu_read_unlock();
+				continue;
+			}
+			spin_unlock(&ip->i_flags_lock);
+
 			/*
 			 * Don't try to lock/unlock the current inode, but we
 			 * _cannot_ skip the other inodes that we did not find
@@ -2019,11 +2035,11 @@ retry:
 			 */
 			if (ip != free_ip &&
 			    !xfs_ilock_nowait(ip, XFS_ILOCK_EXCL)) {
-				read_unlock(&pag->pag_ici_lock);
+				rcu_read_unlock();
 				delay(1);
 				goto retry;
 			}
-			read_unlock(&pag->pag_ici_lock);
+			rcu_read_unlock();
 
 			xfs_iflock(ip);
 			xfs_iflags_set(ip, XFS_ISTALE);
@@ -2629,7 +2645,7 @@ xfs_iflush_cluster(
 
 	mask = ~(((XFS_INODE_CLUSTER_SIZE(mp) >> mp->m_sb.sb_inodelog)) - 1);
 	first_index = XFS_INO_TO_AGINO(mp, ip->i_ino) & mask;
-	read_lock(&pag->pag_ici_lock);
+	rcu_read_lock();
 	/* really need a gang lookup range call here */
 	nr_found = radix_tree_gang_lookup(&pag->pag_ici_root, (void**)ilist,
 					first_index, inodes_per_cluster);
@@ -2640,9 +2656,21 @@ xfs_iflush_cluster(
 		iq = ilist[i];
 		if (iq == ip)
 			continue;
-		/* if the inode lies outside this cluster, we're done. */
-		if ((XFS_INO_TO_AGINO(mp, iq->i_ino) & mask) != first_index)
-			break;
+
+		/*
+		 * because this is an RCU protected lookup, we could find a
+		 * recently freed or even reallocated inode during the lookup.
+		 * We need to check under the i_flags_lock for a valid inode
+		 * here. Skip it if it is not valid or the wrong inode.
+		 */
+		spin_lock(&ip->i_flags_lock);
+		if (!ip->i_ino ||
+		    (XFS_INO_TO_AGINO(mp, iq->i_ino) & mask) != first_index) {
+			spin_unlock(&ip->i_flags_lock);
+			continue;
+		}
+		spin_unlock(&ip->i_flags_lock);
+
 		/*
 		 * Do an un-protected check to see if the inode is dirty and
 		 * is a candidate for flushing.  These checks will be repeated
@@ -2692,7 +2720,7 @@ xfs_iflush_cluster(
 	}
 
 out_free:
-	read_unlock(&pag->pag_ici_lock);
+	rcu_read_unlock();
 	kmem_free(ilist);
 out_put:
 	xfs_perag_put(pag);
@@ -2704,7 +2732,7 @@ cluster_corrupt_out:
 	 * Corruption detected in the clustering loop.  Invalidate the
 	 * inode buffer and shut down the filesystem.
 	 */
-	read_unlock(&pag->pag_ici_lock);
+	rcu_read_unlock();
 	/*
 	 * Clean up the buffer.  If it was B_DELWRI, just release it --
 	 * brelse can handle it with no problems.  If not, shut down the

+ 9 - 6
fs/xfs/xfs_inode.h

@@ -376,12 +376,13 @@ static inline void xfs_ifunlock(xfs_inode_t *ip)
 /*
  * In-core inode flags.
  */
-#define XFS_IRECLAIM    0x0001  /* we have started reclaiming this inode    */
-#define XFS_ISTALE	0x0002	/* inode has been staled */
-#define XFS_IRECLAIMABLE 0x0004 /* inode can be reclaimed */
-#define XFS_INEW	0x0008	/* inode has just been allocated */
-#define XFS_IFILESTREAM	0x0010	/* inode is in a filestream directory */
-#define XFS_ITRUNCATED	0x0020	/* truncated down so flush-on-close */
+#define XFS_IRECLAIM		0x0001  /* started reclaiming this inode */
+#define XFS_ISTALE		0x0002	/* inode has been staled */
+#define XFS_IRECLAIMABLE	0x0004	/* inode can be reclaimed */
+#define XFS_INEW		0x0008	/* inode has just been allocated */
+#define XFS_IFILESTREAM		0x0010	/* inode is in a filestream directory */
+#define XFS_ITRUNCATED		0x0020	/* truncated down so flush-on-close */
+#define XFS_IDIRTY_RELEASE	0x0040	/* dirty release already seen */
 
 /*
  * Flags for inode locking.
@@ -438,6 +439,8 @@ static inline void xfs_ifunlock(xfs_inode_t *ip)
 #define XFS_IOLOCK_DEP(flags)	(((flags) & XFS_IOLOCK_DEP_MASK) >> XFS_IOLOCK_SHIFT)
 #define XFS_ILOCK_DEP(flags)	(((flags) & XFS_ILOCK_DEP_MASK) >> XFS_ILOCK_SHIFT)
 
+extern struct lock_class_key xfs_iolock_reclaimable;
+
 /*
  * Flags for xfs_itruncate_start().
  */

+ 74 - 16
fs/xfs/xfs_inode_item.c

@@ -842,15 +842,64 @@ xfs_inode_item_destroy(
  * flushed to disk.  It is responsible for removing the inode item
  * from the AIL if it has not been re-logged, and unlocking the inode's
  * flush lock.
+ *
+ * To reduce AIL lock traffic as much as possible, we scan the buffer log item
+ * list for other inodes that will run this function. We remove them from the
+ * buffer list so we can process all the inode IO completions in one AIL lock
+ * traversal.
  */
 void
 xfs_iflush_done(
 	struct xfs_buf		*bp,
 	struct xfs_log_item	*lip)
 {
-	struct xfs_inode_log_item *iip = INODE_ITEM(lip);
-	xfs_inode_t		*ip = iip->ili_inode;
+	struct xfs_inode_log_item *iip;
+	struct xfs_log_item	*blip;
+	struct xfs_log_item	*next;
+	struct xfs_log_item	*prev;
 	struct xfs_ail		*ailp = lip->li_ailp;
+	int			need_ail = 0;
+
+	/*
+	 * Scan the buffer IO completions for other inodes being completed and
+	 * attach them to the current inode log item.
+	 */
+	blip = XFS_BUF_FSPRIVATE(bp, xfs_log_item_t *);
+	prev = NULL;
+	while (blip != NULL) {
+		if (lip->li_cb != xfs_iflush_done) {
+			prev = blip;
+			blip = blip->li_bio_list;
+			continue;
+		}
+
+		/* remove from list */
+		next = blip->li_bio_list;
+		if (!prev) {
+			XFS_BUF_SET_FSPRIVATE(bp, next);
+		} else {
+			prev->li_bio_list = next;
+		}
+
+		/* add to current list */
+		blip->li_bio_list = lip->li_bio_list;
+		lip->li_bio_list = blip;
+
+		/*
+		 * while we have the item, do the unlocked check for needing
+		 * the AIL lock.
+		 */
+		iip = INODE_ITEM(blip);
+		if (iip->ili_logged && blip->li_lsn == iip->ili_flush_lsn)
+			need_ail++;
+
+		blip = next;
+	}
+
+	/* make sure we capture the state of the initial inode. */
+	iip = INODE_ITEM(lip);
+	if (iip->ili_logged && lip->li_lsn == iip->ili_flush_lsn)
+		need_ail++;
 
 	/*
 	 * We only want to pull the item from the AIL if it is
@@ -861,28 +910,37 @@ xfs_iflush_done(
 	 * the lock since it's cheaper, and then we recheck while
 	 * holding the lock before removing the inode from the AIL.
 	 */
-	if (iip->ili_logged && lip->li_lsn == iip->ili_flush_lsn) {
+	if (need_ail) {
+		struct xfs_log_item *log_items[need_ail];
+		int i = 0;
 		spin_lock(&ailp->xa_lock);
-		if (lip->li_lsn == iip->ili_flush_lsn) {
-			/* xfs_trans_ail_delete() drops the AIL lock. */
-			xfs_trans_ail_delete(ailp, lip);
-		} else {
-			spin_unlock(&ailp->xa_lock);
+		for (blip = lip; blip; blip = blip->li_bio_list) {
+			iip = INODE_ITEM(blip);
+			if (iip->ili_logged &&
+			    blip->li_lsn == iip->ili_flush_lsn) {
+				log_items[i++] = blip;
+			}
+			ASSERT(i <= need_ail);
 		}
+		/* xfs_trans_ail_delete_bulk() drops the AIL lock. */
+		xfs_trans_ail_delete_bulk(ailp, log_items, i);
 	}
 
-	iip->ili_logged = 0;
 
 	/*
-	 * Clear the ili_last_fields bits now that we know that the
-	 * data corresponding to them is safely on disk.
+	 * clean up and unlock the flush lock now we are done. We can clear the
+	 * ili_last_fields bits now that we know that the data corresponding to
+	 * them is safely on disk.
 	 */
-	iip->ili_last_fields = 0;
+	for (blip = lip; blip; blip = next) {
+		next = blip->li_bio_list;
+		blip->li_bio_list = NULL;
 
-	/*
-	 * Release the inode's flush lock since we're done with it.
-	 */
-	xfs_ifunlock(ip);
+		iip = INODE_ITEM(blip);
+		iip->ili_logged = 0;
+		iip->ili_last_fields = 0;
+		xfs_ifunlock(iip->ili_inode);
+	}
 }
 
 /*

+ 83 - 150
fs/xfs/xfs_iomap.c

@@ -47,127 +47,8 @@
 
 #define XFS_WRITEIO_ALIGN(mp,off)	(((off) >> mp->m_writeio_log) \
 						<< mp->m_writeio_log)
-#define XFS_STRAT_WRITE_IMAPS	2
 #define XFS_WRITE_IMAPS		XFS_BMAP_MAX_NMAP
 
-STATIC int xfs_iomap_write_direct(struct xfs_inode *, xfs_off_t, size_t,
-				  int, struct xfs_bmbt_irec *, int *);
-STATIC int xfs_iomap_write_delay(struct xfs_inode *, xfs_off_t, size_t, int,
-				 struct xfs_bmbt_irec *, int *);
-STATIC int xfs_iomap_write_allocate(struct xfs_inode *, xfs_off_t, size_t,
-				struct xfs_bmbt_irec *, int *);
-
-int
-xfs_iomap(
-	struct xfs_inode	*ip,
-	xfs_off_t		offset,
-	ssize_t			count,
-	int			flags,
-	struct xfs_bmbt_irec	*imap,
-	int			*nimaps,
-	int			*new)
-{
-	struct xfs_mount	*mp = ip->i_mount;
-	xfs_fileoff_t		offset_fsb, end_fsb;
-	int			error = 0;
-	int			lockmode = 0;
-	int			bmapi_flags = 0;
-
-	ASSERT((ip->i_d.di_mode & S_IFMT) == S_IFREG);
-
-	*new = 0;
-
-	if (XFS_FORCED_SHUTDOWN(mp))
-		return XFS_ERROR(EIO);
-
-	trace_xfs_iomap_enter(ip, offset, count, flags, NULL);
-
-	switch (flags & (BMAPI_READ | BMAPI_WRITE | BMAPI_ALLOCATE)) {
-	case BMAPI_READ:
-		lockmode = xfs_ilock_map_shared(ip);
-		bmapi_flags = XFS_BMAPI_ENTIRE;
-		break;
-	case BMAPI_WRITE:
-		lockmode = XFS_ILOCK_EXCL;
-		if (flags & BMAPI_IGNSTATE)
-			bmapi_flags |= XFS_BMAPI_IGSTATE|XFS_BMAPI_ENTIRE;
-		xfs_ilock(ip, lockmode);
-		break;
-	case BMAPI_ALLOCATE:
-		lockmode = XFS_ILOCK_SHARED;
-		bmapi_flags = XFS_BMAPI_ENTIRE;
-
-		/* Attempt non-blocking lock */
-		if (flags & BMAPI_TRYLOCK) {
-			if (!xfs_ilock_nowait(ip, lockmode))
-				return XFS_ERROR(EAGAIN);
-		} else {
-			xfs_ilock(ip, lockmode);
-		}
-		break;
-	default:
-		BUG();
-	}
-
-	ASSERT(offset <= mp->m_maxioffset);
-	if ((xfs_fsize_t)offset + count > mp->m_maxioffset)
-		count = mp->m_maxioffset - offset;
-	end_fsb = XFS_B_TO_FSB(mp, (xfs_ufsize_t)offset + count);
-	offset_fsb = XFS_B_TO_FSBT(mp, offset);
-
-	error = xfs_bmapi(NULL, ip, offset_fsb,
-			(xfs_filblks_t)(end_fsb - offset_fsb),
-			bmapi_flags,  NULL, 0, imap,
-			nimaps, NULL);
-
-	if (error)
-		goto out;
-
-	switch (flags & (BMAPI_WRITE|BMAPI_ALLOCATE)) {
-	case BMAPI_WRITE:
-		/* If we found an extent, return it */
-		if (*nimaps &&
-		    (imap->br_startblock != HOLESTARTBLOCK) &&
-		    (imap->br_startblock != DELAYSTARTBLOCK)) {
-			trace_xfs_iomap_found(ip, offset, count, flags, imap);
-			break;
-		}
-
-		if (flags & BMAPI_DIRECT) {
-			error = xfs_iomap_write_direct(ip, offset, count, flags,
-						       imap, nimaps);
-		} else {
-			error = xfs_iomap_write_delay(ip, offset, count, flags,
-						      imap, nimaps);
-		}
-		if (!error) {
-			trace_xfs_iomap_alloc(ip, offset, count, flags, imap);
-		}
-		*new = 1;
-		break;
-	case BMAPI_ALLOCATE:
-		/* If we found an extent, return it */
-		xfs_iunlock(ip, lockmode);
-		lockmode = 0;
-
-		if (*nimaps && !isnullstartblock(imap->br_startblock)) {
-			trace_xfs_iomap_found(ip, offset, count, flags, imap);
-			break;
-		}
-
-		error = xfs_iomap_write_allocate(ip, offset, count,
-						 imap, nimaps);
-		break;
-	}
-
-	ASSERT(*nimaps <= 1);
-
-out:
-	if (lockmode)
-		xfs_iunlock(ip, lockmode);
-	return XFS_ERROR(error);
-}
-
 STATIC int
 xfs_iomap_eof_align_last_fsb(
 	xfs_mount_t	*mp,
@@ -236,14 +117,13 @@ xfs_cmn_err_fsblock_zero(
 	return EFSCORRUPTED;
 }
 
-STATIC int
+int
 xfs_iomap_write_direct(
 	xfs_inode_t	*ip,
 	xfs_off_t	offset,
 	size_t		count,
-	int		flags,
 	xfs_bmbt_irec_t *imap,
-	int		*nmaps)
+	int		nmaps)
 {
 	xfs_mount_t	*mp = ip->i_mount;
 	xfs_fileoff_t	offset_fsb;
@@ -279,7 +159,7 @@ xfs_iomap_write_direct(
 		if (error)
 			goto error_out;
 	} else {
-		if (*nmaps && (imap->br_startblock == HOLESTARTBLOCK))
+		if (nmaps && (imap->br_startblock == HOLESTARTBLOCK))
 			last_fsb = MIN(last_fsb, (xfs_fileoff_t)
 					imap->br_blockcount +
 					imap->br_startoff);
@@ -331,7 +211,7 @@ xfs_iomap_write_direct(
 	xfs_trans_ijoin(tp, ip);
 
 	bmapi_flag = XFS_BMAPI_WRITE;
-	if ((flags & BMAPI_DIRECT) && (offset < ip->i_size || extsz))
+	if (offset < ip->i_size || extsz)
 		bmapi_flag |= XFS_BMAPI_PREALLOC;
 
 	/*
@@ -370,7 +250,6 @@ xfs_iomap_write_direct(
 		goto error_out;
 	}
 
-	*nmaps = 1;
 	return 0;
 
 error0:	/* Cancel bmap, unlock inode, unreserve quota blocks, cancel trans */
@@ -379,7 +258,6 @@ error0:	/* Cancel bmap, unlock inode, unreserve quota blocks, cancel trans */
 
 error1:	/* Just cancel transaction */
 	xfs_trans_cancel(tp, XFS_TRANS_RELEASE_LOG_RES | XFS_TRANS_ABORT);
-	*nmaps = 0;	/* nothing set-up here */
 
 error_out:
 	return XFS_ERROR(error);
@@ -389,6 +267,9 @@ error_out:
  * If the caller is doing a write at the end of the file, then extend the
  * allocation out to the file system's write iosize.  We clean up any extra
  * space left over when the file is closed in xfs_inactive().
+ *
+ * If we find we already have delalloc preallocation beyond EOF, don't do more
+ * preallocation as it is not needed.
  */
 STATIC int
 xfs_iomap_eof_want_preallocate(
@@ -396,7 +277,6 @@ xfs_iomap_eof_want_preallocate(
 	xfs_inode_t	*ip,
 	xfs_off_t	offset,
 	size_t		count,
-	int		ioflag,
 	xfs_bmbt_irec_t *imap,
 	int		nimaps,
 	int		*prealloc)
@@ -405,6 +285,7 @@ xfs_iomap_eof_want_preallocate(
 	xfs_filblks_t   count_fsb;
 	xfs_fsblock_t	firstblock;
 	int		n, error, imaps;
+	int		found_delalloc = 0;
 
 	*prealloc = 0;
 	if ((offset + count) <= ip->i_size)
@@ -429,20 +310,66 @@ xfs_iomap_eof_want_preallocate(
 				return 0;
 			start_fsb += imap[n].br_blockcount;
 			count_fsb -= imap[n].br_blockcount;
+
+			if (imap[n].br_startblock == DELAYSTARTBLOCK)
+				found_delalloc = 1;
 		}
 	}
-	*prealloc = 1;
+	if (!found_delalloc)
+		*prealloc = 1;
 	return 0;
 }
 
-STATIC int
+/*
+ * If we don't have a user specified preallocation size, dynamically increase
+ * the preallocation size as the size of the file grows. Cap the maximum size
+ * at a single extent or less if the filesystem is near full. The closer the
+ * filesystem is to full, the smaller the maximum preallocation.
+ */
+STATIC xfs_fsblock_t
+xfs_iomap_prealloc_size(
+	struct xfs_mount	*mp,
+	struct xfs_inode	*ip)
+{
+	xfs_fsblock_t		alloc_blocks = 0;
+
+	if (!(mp->m_flags & XFS_MOUNT_DFLT_IOSIZE)) {
+		int shift = 0;
+		int64_t freesp;
+
+		alloc_blocks = XFS_B_TO_FSB(mp, ip->i_size);
+		alloc_blocks = XFS_FILEOFF_MIN(MAXEXTLEN,
+					rounddown_pow_of_two(alloc_blocks));
+
+		xfs_icsb_sync_counters(mp, XFS_ICSB_LAZY_COUNT);
+		freesp = mp->m_sb.sb_fdblocks;
+		if (freesp < mp->m_low_space[XFS_LOWSP_5_PCNT]) {
+			shift = 2;
+			if (freesp < mp->m_low_space[XFS_LOWSP_4_PCNT])
+				shift++;
+			if (freesp < mp->m_low_space[XFS_LOWSP_3_PCNT])
+				shift++;
+			if (freesp < mp->m_low_space[XFS_LOWSP_2_PCNT])
+				shift++;
+			if (freesp < mp->m_low_space[XFS_LOWSP_1_PCNT])
+				shift++;
+		}
+		if (shift)
+			alloc_blocks >>= shift;
+	}
+
+	if (alloc_blocks < mp->m_writeio_blocks)
+		alloc_blocks = mp->m_writeio_blocks;
+
+	return alloc_blocks;
+}
+
+int
 xfs_iomap_write_delay(
 	xfs_inode_t	*ip,
 	xfs_off_t	offset,
 	size_t		count,
-	int		ioflag,
-	xfs_bmbt_irec_t *ret_imap,
-	int		*nmaps)
+	xfs_bmbt_irec_t *ret_imap)
 {
 	xfs_mount_t	*mp = ip->i_mount;
 	xfs_fileoff_t	offset_fsb;
@@ -469,16 +396,19 @@ xfs_iomap_write_delay(
 	extsz = xfs_get_extsz_hint(ip);
 	offset_fsb = XFS_B_TO_FSBT(mp, offset);
 
+
 	error = xfs_iomap_eof_want_preallocate(mp, ip, offset, count,
-				ioflag, imap, XFS_WRITE_IMAPS, &prealloc);
+				imap, XFS_WRITE_IMAPS, &prealloc);
 	if (error)
 		return error;
 
 retry:
 	if (prealloc) {
+		xfs_fsblock_t	alloc_blocks = xfs_iomap_prealloc_size(mp, ip);
+
 		aligned_offset = XFS_WRITEIO_ALIGN(mp, (offset + count - 1));
 		ioalign = XFS_B_TO_FSBT(mp, aligned_offset);
-		last_fsb = ioalign + mp->m_writeio_blocks;
+		last_fsb = ioalign + alloc_blocks;
 	} else {
 		last_fsb = XFS_B_TO_FSB(mp, ((xfs_ufsize_t)(offset + count)));
 	}
@@ -496,22 +426,31 @@ retry:
 			  XFS_BMAPI_DELAY | XFS_BMAPI_WRITE |
 			  XFS_BMAPI_ENTIRE, &firstblock, 1, imap,
 			  &nimaps, NULL);
-	if (error && (error != ENOSPC))
+	switch (error) {
+	case 0:
+	case ENOSPC:
+	case EDQUOT:
+		break;
+	default:
 		return XFS_ERROR(error);
+	}
 
 	/*
-	 * If bmapi returned us nothing, and if we didn't get back EDQUOT,
-	 * then we must have run out of space - flush all other inodes with
-	 * delalloc blocks and retry without EOF preallocation.
+	 * If bmapi returned us nothing, we got either ENOSPC or EDQUOT.  For
+	 * ENOSPC, flush all other inodes with delalloc blocks to free up
+	 * some of the excess reserved metadata space. For both cases, retry
+	 * without EOF preallocation.
 	 */
 	if (nimaps == 0) {
 		trace_xfs_delalloc_enospc(ip, offset, count);
 		if (flushed)
-			return XFS_ERROR(ENOSPC);
+			return XFS_ERROR(error ? error : ENOSPC);
 
-		xfs_iunlock(ip, XFS_ILOCK_EXCL);
-		xfs_flush_inodes(ip);
-		xfs_ilock(ip, XFS_ILOCK_EXCL);
+		if (error == ENOSPC) {
+			xfs_iunlock(ip, XFS_ILOCK_EXCL);
+			xfs_flush_inodes(ip);
+			xfs_ilock(ip, XFS_ILOCK_EXCL);
+		}
 
 		flushed = 1;
 		error = 0;
@@ -523,8 +462,6 @@ retry:
 		return xfs_cmn_err_fsblock_zero(ip, &imap[0]);
 
 	*ret_imap = imap[0];
-	*nmaps = 1;
-
 	return 0;
 }
 
@@ -538,13 +475,12 @@ retry:
  * We no longer bother to look at the incoming map - all we have to
  * guarantee is that whatever we allocate fills the required range.
  */
-STATIC int
+int
 xfs_iomap_write_allocate(
 	xfs_inode_t	*ip,
 	xfs_off_t	offset,
 	size_t		count,
-	xfs_bmbt_irec_t *imap,
-	int		*retmap)
+	xfs_bmbt_irec_t *imap)
 {
 	xfs_mount_t	*mp = ip->i_mount;
 	xfs_fileoff_t	offset_fsb, last_block;
@@ -557,8 +493,6 @@ xfs_iomap_write_allocate(
 	int		error = 0;
 	int		nres;
 
-	*retmap = 0;
-
 	/*
 	 * Make sure that the dquots are there.
 	 */
@@ -680,7 +614,6 @@ xfs_iomap_write_allocate(
 		if ((offset_fsb >= imap->br_startoff) &&
 		    (offset_fsb < (imap->br_startoff +
 				   imap->br_blockcount))) {
-			*retmap = 1;
 			XFS_STATS_INC(xs_xstrat_quick);
 			return 0;
 		}
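
xfs_iomap_prealloc_size() above rounds the file size down to a power of two, caps it, and then shifts the result right as free space falls through the precomputed 5%..1% thresholds, so speculative preallocation shrinks as the filesystem fills. A self-contained version of just that arithmetic (the threshold values, extent cap and minimum below are illustrative stand-ins, not the mount-specific numbers):

#include <stdint.h>
#include <stdio.h>

#define MAX_EXTENT_BLOCKS	(1U << 21)	/* stand-in for MAXEXTLEN */

/* Largest power of two <= x (for x > 0), like rounddown_pow_of_two(). */
static uint64_t rounddown_pow2(uint64_t x)
{
	uint64_t p = 1;

	while (p <= x / 2)
		p <<= 1;
	return p;
}

static uint64_t prealloc_blocks(uint64_t isize_blocks, uint64_t freesp,
				const uint64_t low[5], uint64_t min_blocks)
{
	uint64_t alloc = rounddown_pow2(isize_blocks);
	int shift = 0, i;

	if (alloc > MAX_EXTENT_BLOCKS)
		alloc = MAX_EXTENT_BLOCKS;

	/* low[4] is the 5% threshold, low[0] the 1% threshold */
	if (freesp < low[4]) {
		shift = 2;
		for (i = 3; i >= 0; i--)
			if (freesp < low[i])
				shift++;
	}
	alloc >>= shift;
	return alloc < min_blocks ? min_blocks : alloc;
}

int main(void)
{
	/* e.g. 1000 data blocks: thresholds at 10, 20, 30, 40, 50 free blocks */
	const uint64_t low[5] = { 10, 20, 30, 40, 50 };

	printf("plenty of space: %llu blocks\n",
	       (unsigned long long)prealloc_blocks(4096, 500, low, 16));
	printf("below 3%% free:   %llu blocks\n",
	       (unsigned long long)prealloc_blocks(4096, 25, low, 16));
	return 0;
}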

+ 6 - 21
fs/xfs/xfs_iomap.h

@@ -18,30 +18,15 @@
 #ifndef __XFS_IOMAP_H__
 #define __XFS_IOMAP_H__
 
-/* base extent manipulation calls */
-#define BMAPI_READ	(1 << 0)	/* read extents */
-#define BMAPI_WRITE	(1 << 1)	/* create extents */
-#define BMAPI_ALLOCATE	(1 << 2)	/* delayed allocate to real extents */
-
-/* modifiers */
-#define BMAPI_IGNSTATE	(1 << 4)	/* ignore unwritten state on read */
-#define BMAPI_DIRECT	(1 << 5)	/* direct instead of buffered write */
-#define BMAPI_MMA	(1 << 6)	/* allocate for mmap write */
-#define BMAPI_TRYLOCK	(1 << 7)	/* non-blocking request */
-
-#define BMAPI_FLAGS \
-	{ BMAPI_READ,		"READ" }, \
-	{ BMAPI_WRITE,		"WRITE" }, \
-	{ BMAPI_ALLOCATE,	"ALLOCATE" }, \
-	{ BMAPI_IGNSTATE,	"IGNSTATE" }, \
-	{ BMAPI_DIRECT,		"DIRECT" }, \
-	{ BMAPI_TRYLOCK,	"TRYLOCK" }
-
 struct xfs_inode;
 struct xfs_bmbt_irec;
 
-extern int xfs_iomap(struct xfs_inode *, xfs_off_t, ssize_t, int,
-		     struct xfs_bmbt_irec *, int *, int *);
+extern int xfs_iomap_write_direct(struct xfs_inode *, xfs_off_t, size_t,
+			struct xfs_bmbt_irec *, int);
+extern int xfs_iomap_write_delay(struct xfs_inode *, xfs_off_t, size_t,
+			struct xfs_bmbt_irec *);
+extern int xfs_iomap_write_allocate(struct xfs_inode *, xfs_off_t, size_t,
+			struct xfs_bmbt_irec *);
 extern int xfs_iomap_write_unwritten(struct xfs_inode *, xfs_off_t, size_t);
 
 #endif /* __XFS_IOMAP_H__*/

File diff suppressed because it is too large
+ 282 - 350
fs/xfs/xfs_log.c


+ 6 - 11
fs/xfs/xfs_log_cil.c

@@ -61,7 +61,7 @@ xlog_cil_init(
 	INIT_LIST_HEAD(&cil->xc_committing);
 	spin_lock_init(&cil->xc_cil_lock);
 	init_rwsem(&cil->xc_ctx_lock);
-	sv_init(&cil->xc_commit_wait, SV_DEFAULT, "cilwait");
+	init_waitqueue_head(&cil->xc_commit_wait);
 
 	INIT_LIST_HEAD(&ctx->committing);
 	INIT_LIST_HEAD(&ctx->busy_extents);
@@ -361,15 +361,10 @@ xlog_cil_committed(
 	int	abort)
 {
 	struct xfs_cil_ctx	*ctx = args;
-	struct xfs_log_vec	*lv;
-	int			abortflag = abort ? XFS_LI_ABORTED : 0;
 	struct xfs_busy_extent	*busyp, *n;
 
-	/* unpin all the log items */
-	for (lv = ctx->lv_chain; lv; lv = lv->lv_next ) {
-		xfs_trans_item_committed(lv->lv_item, ctx->start_lsn,
-							abortflag);
-	}
+	xfs_trans_committed_bulk(ctx->cil->xc_log->l_ailp, ctx->lv_chain,
+					ctx->start_lsn, abort);
 
 	list_for_each_entry_safe(busyp, n, &ctx->busy_extents, list)
 		xfs_alloc_busy_clear(ctx->cil->xc_log->l_mp, busyp);
@@ -568,7 +563,7 @@ restart:
 			 * It is still being pushed! Wait for the push to
 			 * complete, then start again from the beginning.
 			 */
-			sv_wait(&cil->xc_commit_wait, 0, &cil->xc_cil_lock, 0);
+			xlog_wait(&cil->xc_commit_wait, &cil->xc_cil_lock);
 			goto restart;
 		}
 	}
@@ -592,7 +587,7 @@ restart:
 	 */
 	spin_lock(&cil->xc_cil_lock);
 	ctx->commit_lsn = commit_lsn;
-	sv_broadcast(&cil->xc_commit_wait);
+	wake_up_all(&cil->xc_commit_wait);
 	spin_unlock(&cil->xc_cil_lock);
 
 	/* release the hounds! */
@@ -757,7 +752,7 @@ restart:
 			 * It is still being pushed! Wait for the push to
 			 * complete, then start again from the beginning.
 			 */
-			sv_wait(&cil->xc_commit_wait, 0, &cil->xc_cil_lock, 0);
+			xlog_wait(&cil->xc_commit_wait, &cil->xc_cil_lock);
 			goto restart;
 		}
 		if (ctx->sequence != sequence)

+ 103 - 24
fs/xfs/xfs_log_priv.h

@@ -21,7 +21,6 @@
 struct xfs_buf;
 struct log;
 struct xlog_ticket;
-struct xfs_buf_cancel;
 struct xfs_mount;
 
 /*
@@ -54,7 +53,6 @@ struct xfs_mount;
 	BTOBB(XLOG_MAX_ICLOGS << (xfs_sb_version_haslogv2(&log->l_mp->m_sb) ? \
 	 XLOG_MAX_RECORD_BSHIFT : XLOG_BIG_RECORD_BSHIFT))
 
-
 static inline xfs_lsn_t xlog_assign_lsn(uint cycle, uint block)
 {
 	return ((xfs_lsn_t)cycle << 32) | block;
@@ -133,12 +131,10 @@ static inline uint xlog_get_client_id(__be32 i)
  */
 #define XLOG_TIC_INITED		0x1	/* has been initialized */
 #define XLOG_TIC_PERM_RESERV	0x2	/* permanent reservation */
-#define XLOG_TIC_IN_Q		0x4
 
 #define XLOG_TIC_FLAGS \
 	{ XLOG_TIC_INITED,	"XLOG_TIC_INITED" }, \
-	{ XLOG_TIC_PERM_RESERV,	"XLOG_TIC_PERM_RESERV" }, \
-	{ XLOG_TIC_IN_Q,	"XLOG_TIC_IN_Q" }
+	{ XLOG_TIC_PERM_RESERV,	"XLOG_TIC_PERM_RESERV" }
 
 #endif	/* __KERNEL__ */
 
@@ -244,9 +240,8 @@ typedef struct xlog_res {
 } xlog_res_t;
 
 typedef struct xlog_ticket {
-	sv_t		   t_wait;	 /* ticket wait queue            : 20 */
-	struct xlog_ticket *t_next;	 /*			         :4|8 */
-	struct xlog_ticket *t_prev;	 /*				 :4|8 */
+	wait_queue_head_t  t_wait;	 /* ticket wait queue */
+	struct list_head   t_queue;	 /* reserve/write queue */
 	xlog_tid_t	   t_tid;	 /* transaction identifier	 : 4  */
 	atomic_t	   t_ref;	 /* ticket reference count       : 4  */
 	int		   t_curr_res;	 /* current reservation in bytes : 4  */
@@ -353,8 +348,8 @@ typedef union xlog_in_core2 {
  * and move everything else out to subsequent cachelines.
  */
 typedef struct xlog_in_core {
-	sv_t			ic_force_wait;
-	sv_t			ic_write_wait;
+	wait_queue_head_t	ic_force_wait;
+	wait_queue_head_t	ic_write_wait;
 	struct xlog_in_core	*ic_next;
 	struct xlog_in_core	*ic_prev;
 	struct xfs_buf		*ic_bp;
@@ -421,7 +416,7 @@ struct xfs_cil {
 	struct xfs_cil_ctx	*xc_ctx;
 	struct rw_semaphore	xc_ctx_lock;
 	struct list_head	xc_committing;
-	sv_t			xc_commit_wait;
+	wait_queue_head_t	xc_commit_wait;
 	xfs_lsn_t		xc_current_sequence;
 };
 
@@ -491,7 +486,7 @@ typedef struct log {
 	struct xfs_buftarg	*l_targ;        /* buftarg of log */
 	uint			l_flags;
 	uint			l_quotaoffs_flag; /* XFS_DQ_*, for QUOTAOFFs */
-	struct xfs_buf_cancel	**l_buf_cancel_table;
+	struct list_head	*l_buf_cancel_table;
 	int			l_iclog_hsize;  /* size of iclog header */
 	int			l_iclog_heads;  /* # of iclog header sectors */
 	uint			l_sectBBsize;   /* sector size in BBs (2^n) */
@@ -503,29 +498,40 @@ typedef struct log {
 	int			l_logBBsize;    /* size of log in BB chunks */
 
 	/* The following block of fields are changed while holding icloglock */
-	sv_t			l_flush_wait ____cacheline_aligned_in_smp;
+	wait_queue_head_t	l_flush_wait ____cacheline_aligned_in_smp;
 						/* waiting for iclog flush */
 	int			l_covered_state;/* state of "covering disk
 						 * log entries" */
 	xlog_in_core_t		*l_iclog;       /* head log queue	*/
 	spinlock_t		l_icloglock;    /* grab to change iclog state */
-	xfs_lsn_t		l_tail_lsn;     /* lsn of 1st LR with unflushed
-						 * buffers */
-	xfs_lsn_t		l_last_sync_lsn;/* lsn of last LR on disk */
 	int			l_curr_cycle;   /* Cycle number of log writes */
 	int			l_prev_cycle;   /* Cycle number before last
 						 * block increment */
 	int			l_curr_block;   /* current logical log block */
 	int			l_prev_block;   /* previous logical log block */
 
-	/* The following block of fields are changed while holding grant_lock */
-	spinlock_t		l_grant_lock ____cacheline_aligned_in_smp;
-	xlog_ticket_t		*l_reserve_headq;
-	xlog_ticket_t		*l_write_headq;
-	int			l_grant_reserve_cycle;
-	int			l_grant_reserve_bytes;
-	int			l_grant_write_cycle;
-	int			l_grant_write_bytes;
+	/*
+	 * l_last_sync_lsn and l_tail_lsn are atomics so they can be set and
+	 * read without needing to hold specific locks. To avoid operations
+	 * contending with other hot objects, place each of them on a separate
+	 * cacheline.
+	 */
+	/* lsn of last LR on disk */
+	atomic64_t		l_last_sync_lsn ____cacheline_aligned_in_smp;
+	/* lsn of 1st LR with unflushed buffers */
+	atomic64_t		l_tail_lsn ____cacheline_aligned_in_smp;
+
+	/*
+	 * ticket grant locks, queues and accounting have their own cachelines
+	 * as these are quite hot and can be operated on concurrently.
+	 */
+	spinlock_t		l_grant_reserve_lock ____cacheline_aligned_in_smp;
+	struct list_head	l_reserveq;
+	atomic64_t		l_grant_reserve_head;
+
+	spinlock_t		l_grant_write_lock ____cacheline_aligned_in_smp;
+	struct list_head	l_writeq;
+	atomic64_t		l_grant_write_head;
 
 	/* The following field are used for debugging; need to hold icloglock */
 #ifdef DEBUG
@@ -534,6 +540,9 @@ typedef struct log {
 
 } xlog_t;
 
+#define XLOG_BUF_CANCEL_BUCKET(log, blkno) \
+	((log)->l_buf_cancel_table + ((__uint64_t)blkno % XLOG_BC_TABLE_SIZE))
+
 #define XLOG_FORCED_SHUTDOWN(log)	((log)->l_flags & XLOG_IO_ERROR)
 
 /* common routines */
@@ -561,6 +570,61 @@ int	xlog_write(struct log *log, struct xfs_log_vec *log_vector,
 				struct xlog_ticket *tic, xfs_lsn_t *start_lsn,
 				xlog_in_core_t **commit_iclog, uint flags);
 
+/*
+ * When we crack an atomic LSN, we sample it first so that the value will not
+ * change while we are cracking it into the component values. This means we
+ * will always get consistent component values to work from. This should always
+ * be used to sample and crack LSNs that are stored and updated in atomic
+ * variables.
+ */
+static inline void
+xlog_crack_atomic_lsn(atomic64_t *lsn, uint *cycle, uint *block)
+{
+	xfs_lsn_t val = atomic64_read(lsn);
+
+	*cycle = CYCLE_LSN(val);
+	*block = BLOCK_LSN(val);
+}
+
+/*
+ * Calculate and assign a value to an atomic LSN variable from component pieces.
+ */
+static inline void
+xlog_assign_atomic_lsn(atomic64_t *lsn, uint cycle, uint block)
+{
+	atomic64_set(lsn, xlog_assign_lsn(cycle, block));
+}
+
+/*
+ * When we crack the grant head, we sample it first so that the value will not
+ * change while we are cracking it into the component values. This means we
+ * will always get consistent component values to work from.
+ */
+static inline void
+xlog_crack_grant_head_val(int64_t val, int *cycle, int *space)
+{
+	*cycle = val >> 32;
+	*space = val & 0xffffffff;
+}
+
+static inline void
+xlog_crack_grant_head(atomic64_t *head, int *cycle, int *space)
+{
+	xlog_crack_grant_head_val(atomic64_read(head), cycle, space);
+}
+
+static inline int64_t
+xlog_assign_grant_head_val(int cycle, int space)
+{
+	return ((int64_t)cycle << 32) | space;
+}
+
+static inline void
+xlog_assign_grant_head(atomic64_t *head, int cycle, int space)
+{
+	atomic64_set(head, xlog_assign_grant_head_val(cycle, space));
+}
+
 /*
  * Committed Item List interfaces
  */
@@ -585,6 +649,21 @@ xlog_cil_force(struct log *log)
  */
 #define XLOG_UNMOUNT_REC_TYPE	(-1U)
 
+/*
+ * Wrapper function for waiting on a wait queue serialised against wakeups
+ * by a spinlock. This matches the semantics of all the wait queues used in the
+ * log code.
+ */
+static inline void xlog_wait(wait_queue_head_t *wq, spinlock_t *lock)
+{
+	DECLARE_WAITQUEUE(wait, current);
+
+	add_wait_queue_exclusive(wq, &wait);
+	__set_current_state(TASK_UNINTERRUPTIBLE);
+	spin_unlock(lock);
+	schedule();
+	remove_wait_queue(wq, &wait);
+}
 #endif	/* __KERNEL__ */
 
 #endif	/* __XFS_LOG_PRIV_H__ */
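
The crack/assign helpers added above pack a 32-bit cycle and a 32-bit block or byte count into a single 64-bit value so the pair can be read and updated atomically, then split it back into its components after sampling it once. The packing itself is plain bit arithmetic; a standalone sketch using C11 atomics in place of the kernel's atomic64_t:

#include <stdatomic.h>
#include <stdint.h>
#include <stdio.h>

static int64_t pack_head(int cycle, int space)
{
	return ((int64_t)cycle << 32) | (uint32_t)space;
}

static void crack_head(_Atomic int64_t *head, int *cycle, int *space)
{
	int64_t val = atomic_load(head);	/* sample once, then decompose */

	*cycle = (int)(val >> 32);
	*space = (int)(val & 0xffffffff);
}

int main(void)
{
	_Atomic int64_t head;
	int cycle, space;

	atomic_init(&head, pack_head(7, 123456));
	crack_head(&head, &cycle, &space);
	printf("cycle=%d space=%d\n", cycle, space);	/* cycle=7 space=123456 */
	return 0;
}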

File diff suppressed because it is too large
+ 237 - 373
fs/xfs/xfs_log_recover.c


+ 22 - 1
fs/xfs/xfs_mount.c

@@ -472,7 +472,7 @@ xfs_initialize_perag(
 			goto out_unwind;
 		pag->pag_agno = index;
 		pag->pag_mount = mp;
-		rwlock_init(&pag->pag_ici_lock);
+		spin_lock_init(&pag->pag_ici_lock);
 		mutex_init(&pag->pag_ici_reclaim_lock);
 		INIT_RADIX_TREE(&pag->pag_ici_root, GFP_ATOMIC);
 		spin_lock_init(&pag->pag_buf_lock);
@@ -974,6 +974,24 @@ xfs_set_rw_sizes(xfs_mount_t *mp)
 	mp->m_writeio_blocks = 1 << (mp->m_writeio_log - sbp->sb_blocklog);
 }
 
+/*
+ * Precalculate the low space thresholds for dynamic speculative preallocation.
+ */
+void
+xfs_set_low_space_thresholds(
+	struct xfs_mount	*mp)
+{
+	int i;
+
+	for (i = 0; i < XFS_LOWSP_MAX; i++) {
+		__uint64_t space = mp->m_sb.sb_dblocks;
+
+		do_div(space, 100);
+		mp->m_low_space[i] = space * (i + 1);
+	}
+}
+
+
 /*
  * Set whether we're using inode alignment.
  */
@@ -1196,6 +1214,9 @@ xfs_mountfs(
 	 */
 	xfs_set_rw_sizes(mp);
 
+	/* set the low space thresholds for dynamic preallocation */
+	xfs_set_low_space_thresholds(mp);
+
 	/*
 	 * Set the inode cluster size.
 	 * This may still be overridden by the file system

+ 14 - 0
fs/xfs/xfs_mount.h

@@ -103,6 +103,16 @@ extern int	xfs_icsb_modify_counters(struct xfs_mount *, xfs_sb_field_t,
 	xfs_mod_incore_sb(mp, field, delta, rsvd)
 #endif
 
+/* dynamic preallocation free space thresholds, 5% down to 1% */
+enum {
+	XFS_LOWSP_1_PCNT = 0,
+	XFS_LOWSP_2_PCNT,
+	XFS_LOWSP_3_PCNT,
+	XFS_LOWSP_4_PCNT,
+	XFS_LOWSP_5_PCNT,
+	XFS_LOWSP_MAX,
+};
+
 typedef struct xfs_mount {
 	struct super_block	*m_super;
 	xfs_tid_t		m_tid;		/* next unused tid for fs */
@@ -202,6 +212,8 @@ typedef struct xfs_mount {
 	__int64_t		m_update_flags;	/* sb flags we need to update
 						   on the next remount,rw */
 	struct shrinker		m_inode_shrink;	/* inode reclaim shrinker */
+	int64_t			m_low_space[XFS_LOWSP_MAX];
+						/* low free space thresholds */
 } xfs_mount_t;
 
 /*
@@ -379,6 +391,8 @@ extern int	xfs_sb_validate_fsb_count(struct xfs_sb *, __uint64_t);
 
 extern int	xfs_dev_is_read_only(struct xfs_mount *, char *);
 
+extern void	xfs_set_low_space_thresholds(struct xfs_mount *);
+
 #endif	/* __KERNEL__ */
 
 extern void	xfs_mod_sb(struct xfs_trans *, __int64_t);
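
xfs_set_low_space_thresholds() in the xfs_mount.c hunk above fills m_low_space[] with 1% through 5% of the filesystem's data blocks, indexed by the XFS_LOWSP_* enum added here, so the allocation paths only ever compare against precomputed values. The same arithmetic in isolation (illustrative block count, plain division standing in for do_div()):

#include <inttypes.h>
#include <stdio.h>

#define LOWSP_MAX	5	/* thresholds for 1% .. 5% */

int main(void)
{
	uint64_t dblocks = 26214400;	/* e.g. a 100 GiB fs with 4 KiB blocks */
	uint64_t low_space[LOWSP_MAX];
	int i;

	for (i = 0; i < LOWSP_MAX; i++) {
		uint64_t space = dblocks / 100;		/* do_div(space, 100) */

		low_space[i] = space * (i + 1);
		printf("low_space[%d] = %" PRIu64 " blocks (%d%%)\n",
		       i, low_space[i], i + 1);
	}
	return 0;
}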

+ 78 - 1
fs/xfs/xfs_trans.c

@@ -1350,7 +1350,7 @@ xfs_trans_fill_vecs(
  * they could be immediately flushed and we'd have to race with the flusher
  * trying to pull the item from the AIL as we add it.
  */
-void
+static void
 xfs_trans_item_committed(
 	struct xfs_log_item	*lip,
 	xfs_lsn_t		commit_lsn,
@@ -1425,6 +1425,83 @@ xfs_trans_committed(
 	xfs_trans_free(tp);
 }
 
+static inline void
+xfs_log_item_batch_insert(
+	struct xfs_ail		*ailp,
+	struct xfs_log_item	**log_items,
+	int			nr_items,
+	xfs_lsn_t		commit_lsn)
+{
+	int	i;
+
+	spin_lock(&ailp->xa_lock);
+	/* xfs_trans_ail_update_bulk drops ailp->xa_lock */
+	xfs_trans_ail_update_bulk(ailp, log_items, nr_items, commit_lsn);
+
+	for (i = 0; i < nr_items; i++)
+		IOP_UNPIN(log_items[i], 0);
+}
+
+/*
+ * Bulk operation version of xfs_trans_committed that takes a log vector of
+ * items to insert into the AIL. This uses bulk AIL insertion techniques to
+ * minimise lock traffic.
+ */
+void
+xfs_trans_committed_bulk(
+	struct xfs_ail		*ailp,
+	struct xfs_log_vec	*log_vector,
+	xfs_lsn_t		commit_lsn,
+	int			aborted)
+{
+#define LOG_ITEM_BATCH_SIZE	32
+	struct xfs_log_item	*log_items[LOG_ITEM_BATCH_SIZE];
+	struct xfs_log_vec	*lv;
+	int			i = 0;
+
+	/* unpin all the log items */
+	for (lv = log_vector; lv; lv = lv->lv_next ) {
+		struct xfs_log_item	*lip = lv->lv_item;
+		xfs_lsn_t		item_lsn;
+
+		if (aborted)
+			lip->li_flags |= XFS_LI_ABORTED;
+		item_lsn = IOP_COMMITTED(lip, commit_lsn);
+
+		/* item_lsn of -1 means the item was freed */
+		if (XFS_LSN_CMP(item_lsn, (xfs_lsn_t)-1) == 0)
+			continue;
+
+		if (item_lsn != commit_lsn) {
+
+			/*
+			 * Not a bulk update option due to unusual item_lsn.
+			 * Push into AIL immediately, rechecking the lsn once
+			 * we have the ail lock. Then unpin the item.
+			 */
+			spin_lock(&ailp->xa_lock);
+			if (XFS_LSN_CMP(item_lsn, lip->li_lsn) > 0)
+				xfs_trans_ail_update(ailp, lip, item_lsn);
+			else
+				spin_unlock(&ailp->xa_lock);
+			IOP_UNPIN(lip, 0);
+			continue;
+		}
+
+		/* Item is a candidate for bulk AIL insert.  */
+		log_items[i++] = lv->lv_item;
+		if (i >= LOG_ITEM_BATCH_SIZE) {
+			xfs_log_item_batch_insert(ailp, log_items,
+					LOG_ITEM_BATCH_SIZE, commit_lsn);
+			i = 0;
+		}
+	}
+
+	/* make sure we insert the remainder! */
+	if (i)
+		xfs_log_item_batch_insert(ailp, log_items, i, commit_lsn);
+}
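
xfs_trans_committed_bulk() above accumulates candidate items in a fixed-size on-stack array and flushes the batch whenever it fills, so the AIL lock is taken once per LOG_ITEM_BATCH_SIZE items rather than once per item, with a final flush for the remainder. The batching skeleton on its own (generic integer payloads and hypothetical names):

#include <stdio.h>

#define BATCH_SIZE	32

/* Stand-in for the locked bulk insert: one lock round trip per call. */
static void flush_batch(const int *batch, int nr)
{
	printf("flushing %d items (first=%d) under one lock acquisition\n",
	       nr, batch[0]);
}

static void process_all(const int *items, int nr_items)
{
	int batch[BATCH_SIZE];
	int i, n = 0;

	for (i = 0; i < nr_items; i++) {
		batch[n++] = items[i];
		if (n == BATCH_SIZE) {
			flush_batch(batch, n);
			n = 0;
		}
	}
	if (n)		/* make sure we insert the remainder */
		flush_batch(batch, n);
}

int main(void)
{
	int items[100];

	for (int i = 0; i < 100; i++)
		items[i] = i;
	process_all(items, 100);	/* three full batches plus 4 left over */
	return 0;
}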
+
 /*
  * Called from the trans_commit code when we notice that
  * the filesystem is in the middle of a forced shutdown.

+ 1 - 1
fs/xfs/xfs_trans.h

@@ -294,8 +294,8 @@ struct xfs_log_item_desc {
 #define	XFS_ALLOC_BTREE_REF	2
 #define	XFS_BMAP_BTREE_REF	2
 #define	XFS_DIR_BTREE_REF	2
+#define	XFS_INO_REF		2
 #define	XFS_ATTR_BTREE_REF	1
-#define	XFS_INO_REF		1
 #define	XFS_DQUOT_REF		1
 
 #ifdef __KERNEL__

+ 122 - 110
fs/xfs/xfs_trans_ail.c

@@ -28,8 +28,8 @@
 #include "xfs_trans_priv.h"
 #include "xfs_error.h"
 
-STATIC void xfs_ail_insert(struct xfs_ail *, xfs_log_item_t *);
-STATIC xfs_log_item_t * xfs_ail_delete(struct xfs_ail *, xfs_log_item_t *);
+STATIC void xfs_ail_splice(struct xfs_ail *, struct list_head *, xfs_lsn_t);
+STATIC void xfs_ail_delete(struct xfs_ail *, xfs_log_item_t *);
 STATIC xfs_log_item_t * xfs_ail_min(struct xfs_ail *);
 STATIC xfs_log_item_t * xfs_ail_next(struct xfs_ail *, xfs_log_item_t *);
 
@@ -449,129 +449,152 @@ xfs_trans_unlocked_item(
 		xfs_log_move_tail(ailp->xa_mount, 1);
 }	/* xfs_trans_unlocked_item */
 
-
 /*
- * Update the position of the item in the AIL with the new
- * lsn.  If it is not yet in the AIL, add it.  Otherwise, move
- * it to its new position by removing it and re-adding it.
+ * xfs_trans_ail_update - bulk AIL insertion operation.
+ *
+ * @xfs_trans_ail_update takes an array of log items that all need to be
+ * positioned at the same LSN in the AIL. If an item is not in the AIL, it will
+ * be added.  Otherwise, it will be repositioned  by removing it and re-adding
+ * it to the AIL. If we move the first item in the AIL, update the log tail to
+ * match the new minimum LSN in the AIL.
  *
- * Wakeup anyone with an lsn less than the item's lsn.  If the item
- * we move in the AIL is the minimum one, update the tail lsn in the
- * log manager.
+ * This function takes the AIL lock once to execute the update operations on
+ * all the items in the array, and as such should not be called with the AIL
+ * lock held. As a result, once we have the AIL lock, we need to check each log
+ * item LSN to confirm it needs to be moved forward in the AIL.
  *
- * This function must be called with the AIL lock held.  The lock
- * is dropped before returning.
+ * To optimise the insert operation, we delete all the items from the AIL in
+ * the first pass, moving them into a temporary list, then splice the temporary
+ * list into the correct position in the AIL. This avoids needing to do an
+ * insert operation on every item.
+ *
+ * This function must be called with the AIL lock held.  The lock is dropped
+ * before returning.
  */
 void
-xfs_trans_ail_update(
-	struct xfs_ail	*ailp,
-	xfs_log_item_t	*lip,
-	xfs_lsn_t	lsn) __releases(ailp->xa_lock)
+xfs_trans_ail_update_bulk(
+	struct xfs_ail		*ailp,
+	struct xfs_log_item	**log_items,
+	int			nr_items,
+	xfs_lsn_t		lsn) __releases(ailp->xa_lock)
 {
-	xfs_log_item_t		*dlip = NULL;
-	xfs_log_item_t		*mlip;	/* ptr to minimum lip */
+	xfs_log_item_t		*mlip;
 	xfs_lsn_t		tail_lsn;
+	int			mlip_changed = 0;
+	int			i;
+	LIST_HEAD(tmp);
 
 	mlip = xfs_ail_min(ailp);
 
-	if (lip->li_flags & XFS_LI_IN_AIL) {
-		dlip = xfs_ail_delete(ailp, lip);
-		ASSERT(dlip == lip);
-		xfs_trans_ail_cursor_clear(ailp, dlip);
-	} else {
-		lip->li_flags |= XFS_LI_IN_AIL;
+	for (i = 0; i < nr_items; i++) {
+		struct xfs_log_item *lip = log_items[i];
+		if (lip->li_flags & XFS_LI_IN_AIL) {
+			/* check if we really need to move the item */
+			if (XFS_LSN_CMP(lsn, lip->li_lsn) <= 0)
+				continue;
+
+			xfs_ail_delete(ailp, lip);
+			if (mlip == lip)
+				mlip_changed = 1;
+		} else {
+			lip->li_flags |= XFS_LI_IN_AIL;
+		}
+		lip->li_lsn = lsn;
+		list_add(&lip->li_ail, &tmp);
 	}
 
-	lip->li_lsn = lsn;
-	xfs_ail_insert(ailp, lip);
+	xfs_ail_splice(ailp, &tmp, lsn);
 
-	if (mlip == dlip) {
-		mlip = xfs_ail_min(ailp);
-		/*
-		 * It is not safe to access mlip after the AIL lock is
-		 * dropped, so we must get a copy of li_lsn before we do
-		 * so.  This is especially important on 32-bit platforms
-		 * where accessing and updating 64-bit values like li_lsn
-		 * is not atomic.
-		 */
-		tail_lsn = mlip->li_lsn;
-		spin_unlock(&ailp->xa_lock);
-		xfs_log_move_tail(ailp->xa_mount, tail_lsn);
-	} else {
+	if (!mlip_changed) {
 		spin_unlock(&ailp->xa_lock);
+		return;
 	}
 
-
-}	/* xfs_trans_update_ail */
+	/*
+	 * It is not safe to access mlip after the AIL lock is dropped, so we
+	 * must get a copy of li_lsn before we do so.  This is especially
+	 * important on 32-bit platforms where accessing and updating 64-bit
+	 * values like li_lsn is not atomic.
+	 */
+	mlip = xfs_ail_min(ailp);
+	tail_lsn = mlip->li_lsn;
+	spin_unlock(&ailp->xa_lock);
+	xfs_log_move_tail(ailp->xa_mount, tail_lsn);
+}
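As a usage illustration, here is a minimal, hypothetical caller sketch; it is not part of this commit, and the function name and the source of the item array are invented. A commit path that has already collected the log items of one checkpoint into a local array can insert them into the AIL with a single round trip on xa_lock, which xfs_trans_ail_update_bulk() drops before returning.

/*
 * Hypothetical caller sketch, not from this commit: the item array, its
 * length and the commit LSN are assumed to come from the caller's own
 * bookkeeping.
 */
static void
example_ail_insert_checkpoint(
	struct xfs_ail		*ailp,
	struct xfs_log_item	**log_items,	/* items of one checkpoint */
	int			nr_items,
	xfs_lsn_t		commit_lsn)
{
	spin_lock(&ailp->xa_lock);
	/* drops ailp->xa_lock before returning */
	xfs_trans_ail_update_bulk(ailp, log_items, nr_items, commit_lsn);
}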
 
 /*
- * Delete the given item from the AIL.  It must already be in
- * the AIL.
+ * xfs_trans_ail_delete_bulk - remove multiple log items from the AIL
  *
- * Wakeup anyone with an lsn less than item's lsn.    If the item
- * we delete in the AIL is the minimum one, update the tail lsn in the
- * log manager.
+ * @xfs_trans_ail_delete_bulk takes an array of log items that all need to be
+ * removed from the AIL. The caller is already holding the AIL lock, and has
+ * done all the checks necessary to ensure the items passed in via @log_items
+ * are ready for deletion. This includes checking that the items are in the
+ * AIL.
  *
- * Clear the IN_AIL flag from the item, reset its lsn to 0, and
- * bump the AIL's generation count to indicate that the tree
- * has changed.
+ * For each log item to be removed, unlink it from the AIL, clear the IN_AIL
+ * flag from the item and reset the item's lsn to 0. If we remove the first
+ * item in the AIL, update the log tail to match the new minimum LSN in the
+ * AIL.
  *
- * This function must be called with the AIL lock held.  The lock
- * is dropped before returning.
+ * This function will not drop the AIL lock until all items are removed from
+ * the AIL to minimise the amount of lock traffic on the AIL. This does not
+ * greatly increase the AIL hold time, but does significantly reduce the amount
+ * of traffic on the lock, especially during IO completion.
+ *
+ * This function must be called with the AIL lock held.  The lock is dropped
+ * before returning.
  */
 void
-xfs_trans_ail_delete(
-	struct xfs_ail	*ailp,
-	xfs_log_item_t	*lip) __releases(ailp->xa_lock)
+xfs_trans_ail_delete_bulk(
+	struct xfs_ail		*ailp,
+	struct xfs_log_item	**log_items,
+	int			nr_items) __releases(ailp->xa_lock)
 {
-	xfs_log_item_t		*dlip;
 	xfs_log_item_t		*mlip;
 	xfs_lsn_t		tail_lsn;
+	int			mlip_changed = 0;
+	int			i;
 
-	if (lip->li_flags & XFS_LI_IN_AIL) {
-		mlip = xfs_ail_min(ailp);
-		dlip = xfs_ail_delete(ailp, lip);
-		ASSERT(dlip == lip);
-		xfs_trans_ail_cursor_clear(ailp, dlip);
-
+	mlip = xfs_ail_min(ailp);
 
-		lip->li_flags &= ~XFS_LI_IN_AIL;
-		lip->li_lsn = 0;
+	for (i = 0; i < nr_items; i++) {
+		struct xfs_log_item *lip = log_items[i];
+		if (!(lip->li_flags & XFS_LI_IN_AIL)) {
+			struct xfs_mount	*mp = ailp->xa_mount;
 
-		if (mlip == dlip) {
-			mlip = xfs_ail_min(ailp);
-			/*
-			 * It is not safe to access mlip after the AIL lock
-			 * is dropped, so we must get a copy of li_lsn
-			 * before we do so.  This is especially important
-			 * on 32-bit platforms where accessing and updating
-			 * 64-bit values like li_lsn is not atomic.
-			 */
-			tail_lsn = mlip ? mlip->li_lsn : 0;
-			spin_unlock(&ailp->xa_lock);
-			xfs_log_move_tail(ailp->xa_mount, tail_lsn);
-		} else {
 			spin_unlock(&ailp->xa_lock);
+			if (!XFS_FORCED_SHUTDOWN(mp)) {
+				xfs_cmn_err(XFS_PTAG_AILDELETE, CE_ALERT, mp,
+		"%s: attempting to delete a log item that is not in the AIL",
+						__func__);
+				xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_INCORE);
+			}
+			return;
 		}
+
+		xfs_ail_delete(ailp, lip);
+		lip->li_flags &= ~XFS_LI_IN_AIL;
+		lip->li_lsn = 0;
+		if (mlip == lip)
+			mlip_changed = 1;
 	}
-	else {
-		/*
-		 * If the file system is not being shutdown, we are in
-		 * serious trouble if we get to this stage.
-		 */
-		struct xfs_mount	*mp = ailp->xa_mount;
 
+	if (!mlip_changed) {
 		spin_unlock(&ailp->xa_lock);
-		if (!XFS_FORCED_SHUTDOWN(mp)) {
-			xfs_cmn_err(XFS_PTAG_AILDELETE, CE_ALERT, mp,
-		"%s: attempting to delete a log item that is not in the AIL",
-					__func__);
-			xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_INCORE);
-		}
+		return;
 	}
-}
-
 
+	/*
+	 * It is not safe to access mlip after the AIL lock is dropped, so we
+	 * must get a copy of li_lsn before we do so.  This is especially
+	 * important on 32-bit platforms where accessing and updating 64-bit
+	 * values like li_lsn is not atomic. It is possible we've emptied the
+	 * AIL here, so if that is the case, pass an LSN of 0 to the tail move.
+	 */
+	mlip = xfs_ail_min(ailp);
+	tail_lsn = mlip ? mlip->li_lsn : 0;
+	spin_unlock(&ailp->xa_lock);
+	xfs_log_move_tail(ailp->xa_mount, tail_lsn);
+}
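A matching hypothetical sketch for the bulk delete side, again not part of this commit: an I/O completion path that has gathered the log items attached to a just-written buffer into a local array removes them from the AIL in one pass. How the array is gathered is assumed to be the caller's business.

/* Hypothetical caller sketch, not from this commit. */
static void
example_ail_remove_written_items(
	struct xfs_ail		*ailp,
	struct xfs_log_item	**log_items,	/* items known to be in the AIL */
	int			nr_items)
{
	spin_lock(&ailp->xa_lock);
	/* drops ailp->xa_lock before returning */
	xfs_trans_ail_delete_bulk(ailp, log_items, nr_items);
}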
 
 /*
  * The active item list (AIL) is a doubly linked list of log
@@ -623,16 +646,13 @@ xfs_trans_ail_destroy(
 }
 
 /*
- * Insert the given log item into the AIL.
- * We almost always insert at the end of the list, so on inserts
- * we search from the end of the list to find where the
- * new item belongs.
+ * Splice the log item list into the AIL at the given LSN.
  */
 STATIC void
-xfs_ail_insert(
+xfs_ail_splice(
 	struct xfs_ail	*ailp,
-	xfs_log_item_t	*lip)
-/* ARGSUSED */
+	struct list_head *list,
+	xfs_lsn_t	lsn)
 {
 	xfs_log_item_t	*next_lip;
 
@@ -640,39 +660,33 @@ xfs_ail_insert(
 	 * If the list is empty, just insert the item.
 	 */
 	if (list_empty(&ailp->xa_ail)) {
-		list_add(&lip->li_ail, &ailp->xa_ail);
+		list_splice(list, &ailp->xa_ail);
 		return;
 	}
 
 	list_for_each_entry_reverse(next_lip, &ailp->xa_ail, li_ail) {
-		if (XFS_LSN_CMP(next_lip->li_lsn, lip->li_lsn) <= 0)
+		if (XFS_LSN_CMP(next_lip->li_lsn, lsn) <= 0)
 			break;
 	}
 
 	ASSERT((&next_lip->li_ail == &ailp->xa_ail) ||
-	       (XFS_LSN_CMP(next_lip->li_lsn, lip->li_lsn) <= 0));
-
-	list_add(&lip->li_ail, &next_lip->li_ail);
+	       (XFS_LSN_CMP(next_lip->li_lsn, lsn) <= 0));
 
-	xfs_ail_check(ailp, lip);
+	list_splice_init(list, &next_lip->li_ail);
 	return;
 }
 
 /*
  * Delete the given item from the AIL and clear any cursor that points to it.
  */
-/*ARGSUSED*/
-STATIC xfs_log_item_t *
+STATIC void
 xfs_ail_delete(
 	struct xfs_ail	*ailp,
 	xfs_log_item_t	*lip)
-/* ARGSUSED */
 {
 	xfs_ail_check(ailp, lip);
-
 	list_del(&lip->li_ail);
-
-	return lip;
+	xfs_trans_ail_cursor_clear(ailp, lip);
 }
 
 /*
@@ -682,7 +696,6 @@ xfs_ail_delete(
 STATIC xfs_log_item_t *
 xfs_ail_min(
 	struct xfs_ail	*ailp)
-/* ARGSUSED */
 {
 	if (list_empty(&ailp->xa_ail))
 		return NULL;
@@ -699,7 +712,6 @@ STATIC xfs_log_item_t *
 xfs_ail_next(
 	struct xfs_ail	*ailp,
 	xfs_log_item_t	*lip)
-/* ARGSUSED */
 {
 	if (lip->li_ail.next == &ailp->xa_ail)
 		return NULL;
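To make the delete-then-splice optimisation above concrete, here is a small, self-contained userspace sketch of the list-splice idea behind xfs_ail_splice(): a reverse scan finds the last entry whose key is not greater than the target, then the whole batched list is linked in after it in one operation. The list helpers are simplified stand-ins for <linux/list.h> and the LSNs are plain integers; none of this is XFS code.

#include <stddef.h>
#include <stdio.h>

struct list_head {
	struct list_head *next, *prev;
};

static void list_init(struct list_head *h)
{
	h->next = h->prev = h;
}

/* insert @new immediately after @head */
static void list_add(struct list_head *new, struct list_head *head)
{
	new->next = head->next;
	new->prev = head;
	head->next->prev = new;
	head->next = new;
}

/* move the whole of @list to just after @pos, leaving @list empty */
static void list_splice_init(struct list_head *list, struct list_head *pos)
{
	if (list->next == list)
		return;				/* nothing to splice */
	list->next->prev = pos;
	list->prev->next = pos->next;
	pos->next->prev = list->prev;
	pos->next = list->next;
	list_init(list);
}

struct item {
	struct list_head	link;
	int			lsn;
};

#define item_of(p)	((struct item *)((char *)(p) - offsetof(struct item, link)))

int main(void)
{
	struct item a = { .lsn = 10 }, b = { .lsn = 30 };
	struct item c = { .lsn = 20 }, d = { .lsn = 20 };
	struct list_head ail, tmp, *p;

	list_init(&ail);
	list_init(&tmp);
	list_add(&b.link, &ail);		/* ail: 10, 30 */
	list_add(&a.link, &ail);
	list_add(&d.link, &tmp);		/* tmp: 20, 20 (same target LSN) */
	list_add(&c.link, &tmp);

	/* a reverse scan for LSN 20 stops at the item with LSN 10 */
	list_splice_init(&tmp, &a.link);

	for (p = ail.next; p != &ail; p = p->next)
		printf("%d\n", item_of(p)->lsn);	/* prints 10 20 20 30 */
	return 0;
}

Compiled with any C99/C11 compiler this prints 10 20 20 30, matching the effect of splicing the batched items in after the reverse-scan insertion point instead of inserting them one by one.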

+ 6 - 2
fs/xfs/xfs_trans_extfree.c

@@ -69,12 +69,16 @@ xfs_trans_log_efi_extent(xfs_trans_t		*tp,
 	tp->t_flags |= XFS_TRANS_DIRTY;
 	efip->efi_item.li_desc->lid_flags |= XFS_LID_DIRTY;
 
-	next_extent = efip->efi_next_extent;
+	/*
+	 * atomic_inc_return gives us the value after the increment;
+	 * we want to use it as an array index so we need to subtract 1 from
+	 * it.
+	 */
+	next_extent = atomic_inc_return(&efip->efi_next_extent) - 1;
 	ASSERT(next_extent < efip->efi_format.efi_nextents);
 	extp = &(efip->efi_format.efi_extents[next_extent]);
 	extp->ext_start = start_block;
 	extp->ext_len = ext_len;
-	efip->efi_next_extent++;
 }
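The pattern in this hunk, claiming an array slot with a single atomic increment so that concurrent callers never receive the same index, can be shown with a tiny userspace sketch. It is not XFS code; note that C11's atomic_fetch_add() returns the value before the increment, whereas the kernel's atomic_inc_return() returns the value after it, which is exactly why the hunk subtracts 1.

#include <stdatomic.h>
#include <stdio.h>

#define NSLOTS	8

static atomic_int next_slot;		/* next free index, starts at 0 */
static int slots[NSLOTS];

/* safe to call from any number of threads */
static int claim_slot(int value)
{
	int idx = atomic_fetch_add(&next_slot, 1);	/* old value is ours */

	if (idx >= NSLOTS)
		return -1;			/* array is full */
	slots[idx] = value;
	return idx;
}

int main(void)
{
	printf("first claim  -> slot %d\n", claim_slot(100));
	printf("second claim -> slot %d\n", claim_slot(200));
	return 0;
}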
 
 

+ 27 - 8
fs/xfs/xfs_trans_priv.h

@@ -22,15 +22,17 @@ struct xfs_log_item;
 struct xfs_log_item_desc;
 struct xfs_mount;
 struct xfs_trans;
+struct xfs_ail;
+struct xfs_log_vec;
 
 void	xfs_trans_add_item(struct xfs_trans *, struct xfs_log_item *);
 void	xfs_trans_del_item(struct xfs_log_item *);
 void	xfs_trans_free_items(struct xfs_trans *tp, xfs_lsn_t commit_lsn,
 				int flags);
-void	xfs_trans_item_committed(struct xfs_log_item *lip,
-				xfs_lsn_t commit_lsn, int aborted);
 void	xfs_trans_unreserve_and_mod_sb(struct xfs_trans *tp);
 
+void	xfs_trans_committed_bulk(struct xfs_ail *ailp, struct xfs_log_vec *lv,
+				xfs_lsn_t commit_lsn, int aborted);
 /*
  * AIL traversal cursor.
  *
@@ -73,12 +75,29 @@ struct xfs_ail {
 /*
  * From xfs_trans_ail.c
  */
-void			xfs_trans_ail_update(struct xfs_ail *ailp,
-					struct xfs_log_item *lip, xfs_lsn_t lsn)
-					__releases(ailp->xa_lock);
-void			xfs_trans_ail_delete(struct xfs_ail *ailp,
-					struct xfs_log_item *lip)
-					__releases(ailp->xa_lock);
+void	xfs_trans_ail_update_bulk(struct xfs_ail *ailp,
+				struct xfs_log_item **log_items, int nr_items,
+				xfs_lsn_t lsn) __releases(ailp->xa_lock);
+static inline void
+xfs_trans_ail_update(
+	struct xfs_ail		*ailp,
+	struct xfs_log_item	*lip,
+	xfs_lsn_t		lsn) __releases(ailp->xa_lock)
+{
+	xfs_trans_ail_update_bulk(ailp, &lip, 1, lsn);
+}
+
+void	xfs_trans_ail_delete_bulk(struct xfs_ail *ailp,
+				struct xfs_log_item **log_items, int nr_items)
+				__releases(ailp->xa_lock);
+static inline void
+xfs_trans_ail_delete(
+	struct xfs_ail	*ailp,
+	xfs_log_item_t	*lip) __releases(ailp->xa_lock)
+{
+	xfs_trans_ail_delete_bulk(ailp, &lip, 1);
+}
+
 void			xfs_trans_ail_push(struct xfs_ail *, xfs_lsn_t);
 void			xfs_trans_unlocked_item(struct xfs_ail *,
 					xfs_log_item_t *);

+ 40 - 21
fs/xfs/xfs_vnodeops.c

@@ -964,29 +964,48 @@ xfs_release(
 			xfs_flush_pages(ip, 0, -1, XBF_ASYNC, FI_NONE);
 	}
 
-	if (ip->i_d.di_nlink != 0) {
-		if ((((ip->i_d.di_mode & S_IFMT) == S_IFREG) &&
-		     ((ip->i_size > 0) || (VN_CACHED(VFS_I(ip)) > 0 ||
-		       ip->i_delayed_blks > 0)) &&
-		     (ip->i_df.if_flags & XFS_IFEXTENTS))  &&
-		    (!(ip->i_d.di_flags &
-				(XFS_DIFLAG_PREALLOC | XFS_DIFLAG_APPEND)))) {
+	if (ip->i_d.di_nlink == 0)
+		return 0;
 
-			/*
-			 * If we can't get the iolock just skip truncating
-			 * the blocks past EOF because we could deadlock
-			 * with the mmap_sem otherwise.  We'll get another
-			 * chance to drop them once the last reference to
-			 * the inode is dropped, so we'll never leak blocks
-			 * permanently.
-			 */
-			error = xfs_free_eofblocks(mp, ip,
-						   XFS_FREE_EOF_TRYLOCK);
-			if (error)
-				return error;
-		}
-	}
+	if ((((ip->i_d.di_mode & S_IFMT) == S_IFREG) &&
+	     ((ip->i_size > 0) || (VN_CACHED(VFS_I(ip)) > 0 ||
+	       ip->i_delayed_blks > 0)) &&
+	     (ip->i_df.if_flags & XFS_IFEXTENTS))  &&
+	    (!(ip->i_d.di_flags & (XFS_DIFLAG_PREALLOC | XFS_DIFLAG_APPEND)))) {
 
+		/*
+		 * If we can't get the iolock just skip truncating the blocks
+		 * past EOF because we could deadlock with the mmap_sem
+		 * otherwise.  We'll get another chance to drop them once the
+		 * last reference to the inode is dropped, so we'll never leak
+		 * blocks permanently.
+		 *
+		 * Further, if the inode is being opened, written and closed
+		 * frequently and we have delayed allocation blocks outstanding
+		 * (e.g. streaming writes from the NFS server), truncating the
+		 * blocks past EOF will cause fragmentation to occur.
+		 *
+		 * In this case don't do the truncation, either, but we have to
+		 * be careful how we detect this case. Blocks beyond EOF show
+		 * up as i_delayed_blks even when the inode is clean, so we
+		 * need to truncate them away first before checking for a dirty
+		 * release. Hence on the first dirty close we will still remove
+		 * the speculative allocation, but after that we will leave it
+		 * in place.
+		 */
+		if (xfs_iflags_test(ip, XFS_IDIRTY_RELEASE))
+			return 0;
+
+		error = xfs_free_eofblocks(mp, ip,
+					   XFS_FREE_EOF_TRYLOCK);
+		if (error)
+			return error;
+
+		/* delalloc blocks after truncation means it really is dirty */
+		if (ip->i_delayed_blks)
+			xfs_iflags_set(ip, XFS_IDIRTY_RELEASE);
+	}
 	return 0;
 }
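The new XFS_IDIRTY_RELEASE heuristic (trim the speculative preallocation on the first dirty close, leave it alone on later closes) can be modelled with a short userspace sketch. All names here are invented analogues rather than the real XFS fields.

#include <stdbool.h>
#include <stdio.h>

struct fake_inode {
	bool	dirty_release;		/* analogue of XFS_IDIRTY_RELEASE */
	int	delalloc_blocks;	/* analogue of ip->i_delayed_blks */
	int	eof_blocks;		/* speculative allocation past EOF */
};

/* called on every close of a linked, regular file with blocks past EOF */
static void release(struct fake_inode *ip)
{
	if (ip->dirty_release)
		return;			/* leave the preallocation in place */

	ip->eof_blocks = 0;		/* stand-in for xfs_free_eofblocks() */

	/* delalloc blocks left after the trim: the file is being rewritten */
	if (ip->delalloc_blocks)
		ip->dirty_release = true;
}

int main(void)
{
	struct fake_inode ip = { .delalloc_blocks = 4, .eof_blocks = 16 };

	release(&ip);			/* first dirty close trims EOF blocks */
	ip.eof_blocks = 16;		/* the next write preallocates again */
	release(&ip);			/* later closes leave it alone */
	printf("eof_blocks=%d dirty_release=%d\n",
	       ip.eof_blocks, ip.dirty_release);
	return 0;
}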
 

Some files were not shown because too many files changed in this diff