17 年之前 · 6483d152ac
--- a/fs/xfs/linux-2.6/xfs_buf.c
+++ b/fs/xfs/linux-2.6/xfs_buf.c
@@ -387,6 +387,8 @@ _xfs_buf_lookup_pages(
 
				 		if (unlikely(page == NULL)) {
			
 
				 			if (flags & XBF_READ_AHEAD) {
			
 
				 				bp->b_page_count = i;
			
 
				+				for (i = 0; i < bp->b_page_count; i++)
			
 
				+					unlock_page(bp->b_pages[i]);
			
 
				 				return -ENOMEM;
			
 
				 			}
			
 
				 
			
@@ -416,17 +418,24 @@ _xfs_buf_lookup_pages(
 
				 		ASSERT(!PagePrivate(page));
			
 
				 		if (!PageUptodate(page)) {
			
 
				 			page_count--;
			
 
				-			if (blocksize < PAGE_CACHE_SIZE && !PagePrivate(page)) {
			
 
				+			if (blocksize >= PAGE_CACHE_SIZE) {
			
 
				+				if (flags & XBF_READ)
			
 
				+					bp->b_flags |= _XBF_PAGE_LOCKED;
			
 
				+			} else if (!PagePrivate(page)) {
			
 
				 				if (test_page_region(page, offset, nbytes))
			
 
				 					page_count++;
			
 
				 			}
			
 
				 		}
			
 
				 
			
 
				-		unlock_page(page);
			
 
				 		bp->b_pages[i] = page;
			
 
				 		offset = 0;
			
 
				 	}
			
 
				 
			
 
				+	if (!(bp->b_flags & _XBF_PAGE_LOCKED)) {
			
 
				+		for (i = 0; i < bp->b_page_count; i++)
			
 
				+			unlock_page(bp->b_pages[i]);
			
 
				+	}
			
 
				+
			
 
				 	if (page_count == bp->b_page_count)
			
 
				 		bp->b_flags |= XBF_DONE;
			
 
				 
			
@@ -746,6 +755,7 @@ xfs_buf_associate_memory(
 
				 	bp->b_count_desired = len;
			
 
				 	bp->b_buffer_length = buflen;
			
 
				 	bp->b_flags |= XBF_MAPPED;
			
 
				+	bp->b_flags &= ~_XBF_PAGE_LOCKED;
			
 
				 
			
 
				 	return 0;
			
 
				 }
			
@@ -1093,8 +1103,10 @@ _xfs_buf_ioend(
 
				 	xfs_buf_t		*bp,
			
 
				 	int			schedule)
			
 
				 {
			
 
				-	if (atomic_dec_and_test(&bp->b_io_remaining) == 1)
			
 
				+	if (atomic_dec_and_test(&bp->b_io_remaining) == 1) {
			
 
				+		bp->b_flags &= ~_XBF_PAGE_LOCKED;
			
 
				 		xfs_buf_ioend(bp, schedule);
			
 
				+	}
			
 
				 }
			
 
				 
			
 
				 STATIC void
			
@@ -1125,6 +1137,9 @@ xfs_buf_bio_end_io(
 
				 
			
 
				 		if (--bvec >= bio->bi_io_vec)
			
 
				 			prefetchw(&bvec->bv_page->flags);
			
 
				+
			
 
				+		if (bp->b_flags & _XBF_PAGE_LOCKED)
			
 
				+			unlock_page(page);
			
 
				 	} while (bvec >= bio->bi_io_vec);
			
 
				 
			
 
				 	_xfs_buf_ioend(bp, 1);
			
@@ -1163,7 +1178,8 @@ _xfs_buf_ioapply(
 
				 	 * filesystem block size is not smaller than the page size.
			
 
				 	 */
			
 
				 	if ((bp->b_buffer_length < PAGE_CACHE_SIZE) &&
			
 
				-	    (bp->b_flags & XBF_READ) &&
			
 
				+	    ((bp->b_flags & (XBF_READ|_XBF_PAGE_LOCKED)) ==
			
 
				+	      (XBF_READ|_XBF_PAGE_LOCKED)) &&
			
 
				 	    (blocksize >= PAGE_CACHE_SIZE)) {
			
 
				 		bio = bio_alloc(GFP_NOIO, 1);
			
 
				 
			
--- a/fs/xfs/linux-2.6/xfs_buf.h
+++ b/fs/xfs/linux-2.6/xfs_buf.h
@@ -66,6 +66,25 @@ typedef enum {
 
				 	_XBF_PAGES = (1 << 18),	    /* backed by refcounted pages	   */
			
 
				 	_XBF_RUN_QUEUES = (1 << 19),/* run block device task queue	   */
			
 
				 	_XBF_DELWRI_Q = (1 << 21),   /* buffer on delwri queue		   */
			
 
				+
			
 
				+	/*
			
 
				+	 * Special flag for supporting metadata blocks smaller than a FSB.
			
 
				+	 *
			
 
				+	 * In this case we can have multiple xfs_buf_t on a single page and
			
 
				+	 * need to lock out concurrent xfs_buf_t readers as they only
			
 
				+	 * serialise access to the buffer.
			
 
				+	 *
			
 
				+	 * In the FSB size >= PAGE_CACHE_SIZE case, we have no serialisation
			
 
				+	 * between reads of the page. Hence we can have one thread read the
			
 
				+	 * page and modify it, but then race with another thread that thinks
			
 
				+	 * the page is not up-to-date and hence reads it again.
			
 
				+	 *
			
 
				+	 * The result is that the first modification to the page is lost.
			
 
				+	 * This sort of AGF/AGI reading race can happen when unlinking inodes
			
 
				+	 * that require truncation and results in the AGI unlinked list
			
 
				+	 * modifications being lost.
			
 
				+	 */
			
 
				+	_XBF_PAGE_LOCKED = (1 << 22),
			
 
				 } xfs_buf_flags_t;
			
 
				 
			
 
				 typedef enum {
			
--- a/fs/xfs/linux-2.6/xfs_file.c
+++ b/fs/xfs/linux-2.6/xfs_file.c
@@ -184,19 +184,24 @@ xfs_file_release(
 
				 	return -xfs_release(XFS_I(inode));
			
 
				 }
			
 
				 
			
 
				+/*
			
 
				+ * We ignore the datasync flag here because a datasync is effectively
			
 
				+ * identical to an fsync. That is, datasync implies that we need to write
			
 
				+ * only the metadata needed to be able to access the data that is written
			
 
				+ * if we crash after the call completes. Hence if we are writing beyond
			
 
				+ * EOF we have to log the inode size change as well, which makes it a
			
 
				+ * full fsync. If we don't write beyond EOF, the inode core will be
			
 
				+ * clean in memory and so we don't need to log the inode, just like
			
 
				+ * fsync.
			
 
				+ */
			
 
				 STATIC int
			
 
				 xfs_file_fsync(
			
 
				 	struct file	*filp,
			
 
				 	struct dentry	*dentry,
			
 
				 	int		datasync)
			
 
				 {
			
 
				-	int		flags = FSYNC_WAIT;
			
 
				-
			
 
				-	if (datasync)
			
 
				-		flags |= FSYNC_DATA;
			
 
				 	xfs_iflags_clear(XFS_I(dentry->d_inode), XFS_ITRUNCATED);
			
 
				-	return -xfs_fsync(XFS_I(dentry->d_inode), flags,
			
 
				-			(xfs_off_t)0, (xfs_off_t)-1);
			
 
				+	return -xfs_fsync(XFS_I(dentry->d_inode));
			
 
				 }
			
 
				 
			
 
				 /*
			
--- a/fs/xfs/linux-2.6/xfs_vnode.h
+++ b/fs/xfs/linux-2.6/xfs_vnode.h
@@ -229,14 +229,6 @@ static inline void vn_atime_to_time_t(bhv_vnode_t *vp, time_t *tt)
 
				 #define ATTR_NOLOCK	0x200	/* Don't grab any conflicting locks */
			
 
				 #define ATTR_NOSIZETOK	0x400	/* Don't get the SIZE token */
			
 
				 
			
 
				-/*
			
 
				- * Flags to vop_fsync/reclaim.
			
 
				- */
			
 
				-#define FSYNC_NOWAIT	0	/* asynchronous flush */
			
 
				-#define FSYNC_WAIT	0x1	/* synchronous fsync or forced reclaim */
			
 
				-#define FSYNC_INVAL	0x2	/* flush and invalidate cached data */
			
 
				-#define FSYNC_DATA	0x4	/* synchronous fsync of data only */
			
 
				-
			
 
				 /*
			
 
				  * Tracking vnode activity.
			
 
				  */
			
--- a/fs/xfs/xfs_inode.c
+++ b/fs/xfs/xfs_inode.c
@@ -2974,6 +2974,7 @@ xfs_iflush_cluster(
 
				 	xfs_mount_t		*mp = ip->i_mount;
			
 
				 	xfs_perag_t		*pag = xfs_get_perag(mp, ip->i_ino);
			
 
				 	unsigned long		first_index, mask;
			
 
				+	unsigned long		inodes_per_cluster;
			
 
				 	int			ilist_size;
			
 
				 	xfs_inode_t		**ilist;
			
 
				 	xfs_inode_t		*iq;
			
@@ -2985,8 +2986,9 @@ xfs_iflush_cluster(
 
				 	ASSERT(pag->pagi_inodeok);
			
 
				 	ASSERT(pag->pag_ici_init);
			
 
				 
			
 
				-	ilist_size = XFS_INODE_CLUSTER_SIZE(mp) * sizeof(xfs_inode_t *);
			
 
				-	ilist = kmem_alloc(ilist_size, KM_MAYFAIL);
			
 
				+	inodes_per_cluster = XFS_INODE_CLUSTER_SIZE(mp) >> mp->m_sb.sb_inodelog;
			
 
				+	ilist_size = inodes_per_cluster * sizeof(xfs_inode_t *);
			
 
				+	ilist = kmem_alloc(ilist_size, KM_MAYFAIL|KM_NOFS);
			
 
				 	if (!ilist)
			
 
				 		return 0;
			
 
				 
			
@@ -2995,8 +2997,7 @@ xfs_iflush_cluster(
 
				 	read_lock(&pag->pag_ici_lock);
			
 
				 	/* really need a gang lookup range call here */
			
 
				 	nr_found = radix_tree_gang_lookup(&pag->pag_ici_root, (void**)ilist,
			
 
				-					first_index,
			
 
				-					XFS_INODE_CLUSTER_SIZE(mp));
			
 
				+					first_index, inodes_per_cluster);
			
 
				 	if (nr_found == 0)
			
 
				 		goto out_free;
			
 
				 
			
--- a/fs/xfs/xfs_vnodeops.c
+++ b/fs/xfs/xfs_vnodeops.c
@@ -856,18 +856,14 @@ xfs_readlink(
 
				 /*
			
 
				  * xfs_fsync
			
 
				  *
			
 
				- * This is called to sync the inode and its data out to disk.
			
 
				- * We need to hold the I/O lock while flushing the data, and
			
 
				- * the inode lock while flushing the inode.  The inode lock CANNOT
			
 
				- * be held while flushing the data, so acquire after we're done
			
 
				- * with that.
			
 
				+ * This is called to sync the inode and its data out to disk.  We need to hold
			
 
				+ * the I/O lock while flushing the data, and the inode lock while flushing the
			
 
				+ * inode.  The inode lock CANNOT be held while flushing the data, so acquire
			
 
				+ * after we're done with that.
			
 
				  */
			
 
				 int
			
 
				 xfs_fsync(
			
 
				-	xfs_inode_t	*ip,
			
 
				-	int		flag,
			
 
				-	xfs_off_t	start,
			
 
				-	xfs_off_t	stop)
			
 
				+	xfs_inode_t	*ip)
			
 
				 {
			
 
				 	xfs_trans_t	*tp;
			
 
				 	int		error;
			
@@ -875,103 +871,79 @@ xfs_fsync(
 
				 
			
 
				 	xfs_itrace_entry(ip);
			
 
				 
			
 
				-	ASSERT(start >= 0 && stop >= -1);
			
 
				-
			
 
				 	if (XFS_FORCED_SHUTDOWN(ip->i_mount))
			
 
				 		return XFS_ERROR(EIO);
			
 
				 
			
 
				-	if (flag & FSYNC_DATA)
			
 
				-		filemap_fdatawait(vn_to_inode(XFS_ITOV(ip))->i_mapping);
			
 
				+	/* capture size updates in I/O completion before writing the inode. */
			
 
				+	error = filemap_fdatawait(vn_to_inode(XFS_ITOV(ip))->i_mapping);
			
 
				+	if (error)
			
 
				+		return XFS_ERROR(error);
			
 
				 
			
 
				 	/*
			
 
				-	 * We always need to make sure that the required inode state
			
 
				-	 * is safe on disk.  The vnode might be clean but because
			
 
				-	 * of committed transactions that haven't hit the disk yet.
			
 
				-	 * Likewise, there could be unflushed non-transactional
			
 
				-	 * changes to the inode core that have to go to disk.
			
 
				+	 * We always need to make sure that the required inode state is safe on
			
 
				+	 * disk.  The vnode might be clean but we still might need to force the
			
 
				+	 * log because of committed transactions that haven't hit the disk yet.
			
 
				+	 * Likewise, there could be unflushed non-transactional changes to the
			
 
				+	 * inode core that have to go to disk and this requires us to issue
			
 
				+	 * a synchronous transaction to capture these changes correctly.
			
 
				 	 *
			
 
				-	 * The following code depends on one assumption:  that
			
 
				-	 * any transaction that changes an inode logs the core
			
 
				-	 * because it has to change some field in the inode core
			
 
				-	 * (typically nextents or nblocks).  That assumption
			
 
				-	 * implies that any transactions against an inode will
			
 
				-	 * catch any non-transactional updates.  If inode-altering
			
 
				-	 * transactions exist that violate this assumption, the
			
 
				-	 * code breaks.  Right now, it figures that if the involved
			
 
				-	 * update_* field is clear and the inode is unpinned, the
			
 
				-	 * inode is clean.  Either it's been flushed or it's been
			
 
				-	 * committed and the commit has hit the disk unpinning the inode.
			
 
				-	 * (Note that xfs_inode_item_format() called at commit clears
			
 
				-	 * the update_* fields.)
			
 
				+	 * This code relies on the assumption that if the update_* fields
			
 
				+	 * of the inode are clear and the inode is unpinned then it is clean
			
 
				+	 * and no action is required.
			
 
				 	 */
			
 
				 	xfs_ilock(ip, XFS_ILOCK_SHARED);
			
 
				 
			
 
				-	/* If we are flushing data then we care about update_size
			
 
				-	 * being set, otherwise we care about update_core
			
 
				-	 */
			
 
				-	if ((flag & FSYNC_DATA) ?
			
 
				-			(ip->i_update_size == 0) :
			
 
				-			(ip->i_update_core == 0)) {
			
 
				+	if (!(ip->i_update_size || ip->i_update_core)) {
			
 
				 		/*
			
 
				-		 * Timestamps/size haven't changed since last inode
			
 
				-		 * flush or inode transaction commit.  That means
			
 
				-		 * either nothing got written or a transaction
			
 
				-		 * committed which caught the updates.	If the
			
 
				-		 * latter happened and the transaction hasn't
			
 
				-		 * hit the disk yet, the inode will be still
			
 
				-		 * be pinned.  If it is, force the log.
			
 
				+		 * Timestamps/size haven't changed since last inode flush or
			
 
				+		 * inode transaction commit.  That means either nothing got
			
 
				+		 * written or a transaction committed which caught the updates.
			
 
				+		 * If the latter happened and the transaction hasn't hit the
			
 
				+		 * disk yet, the inode will still be pinned.  If it is,
			
 
				+		 * force the log.
			
 
				 		 */
			
 
				 
			
 
				 		xfs_iunlock(ip, XFS_ILOCK_SHARED);
			
 
				 
			
 
				 		if (xfs_ipincount(ip)) {
			
 
				-			_xfs_log_force(ip->i_mount, (xfs_lsn_t)0,
			
 
				-				      XFS_LOG_FORCE |
			
 
				-				      ((flag & FSYNC_WAIT)
			
 
				-				       ? XFS_LOG_SYNC : 0),
			
 
				+			error = _xfs_log_force(ip->i_mount, (xfs_lsn_t)0,
			
 
				+				      XFS_LOG_FORCE | XFS_LOG_SYNC,
			
 
				 				      &log_flushed);
			
 
				 		} else {
			
 
				 			/*
			
 
				-			 * If the inode is not pinned and nothing
			
 
				-			 * has changed we don't need to flush the
			
 
				-			 * cache.
			
 
				+			 * If the inode is not pinned and nothing has changed
			
 
				+			 * we don't need to flush the cache.
			
 
				 			 */
			
 
				 			changed = 0;
			
 
				 		}
			
 
				-		error = 0;
			
 
				 	} else	{
			
 
				 		/*
			
 
				-		 * Kick off a transaction to log the inode
			
 
				-		 * core to get the updates.  Make it
			
 
				-		 * sync if FSYNC_WAIT is passed in (which
			
 
				-		 * is done by everybody but specfs).  The
			
 
				-		 * sync transaction will also force the log.
			
 
				+		 * Kick off a transaction to log the inode core to get the
			
 
				+		 * updates.  The sync transaction will also force the log.
			
 
				 		 */
			
 
				 		xfs_iunlock(ip, XFS_ILOCK_SHARED);
			
 
				 		tp = xfs_trans_alloc(ip->i_mount, XFS_TRANS_FSYNC_TS);
			
 
				-		if ((error = xfs_trans_reserve(tp, 0,
			
 
				-				XFS_FSYNC_TS_LOG_RES(ip->i_mount),
			
 
				-				0, 0, 0)))  {
			
 
				+		error = xfs_trans_reserve(tp, 0,
			
 
				+				XFS_FSYNC_TS_LOG_RES(ip->i_mount), 0, 0, 0);
			
 
				+		if (error) {
			
 
				 			xfs_trans_cancel(tp, 0);
			
 
				 			return error;
			
 
				 		}
			
 
				 		xfs_ilock(ip, XFS_ILOCK_EXCL);
			
 
				 
			
 
				 		/*
			
 
				-		 * Note - it's possible that we might have pushed
			
 
				-		 * ourselves out of the way during trans_reserve
			
 
				-		 * which would flush the inode.	 But there's no
			
 
				-		 * guarantee that the inode buffer has actually
			
 
				-		 * gone out yet (it's delwri).	Plus the buffer
			
 
				-		 * could be pinned anyway if it's part of an
			
 
				-		 * inode in another recent transaction.	 So we
			
 
				-		 * play it safe and fire off the transaction anyway.
			
 
				+		 * Note - it's possible that we might have pushed ourselves out
			
 
				+		 * of the way during trans_reserve which would flush the inode.
			
 
				+		 * But there's no guarantee that the inode buffer has actually
			
 
				+		 * gone out yet (it's delwri).	Plus the buffer could be pinned
			
 
				+		 * anyway if it's part of an inode in another recent
			
 
				+		 * transaction.	 So we play it safe and fire off the
			
 
				+		 * transaction anyway.
			
 
				 		 */
			
 
				 		xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL);
			
 
				 		xfs_trans_ihold(tp, ip);
			
 
				 		xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
			
 
				-		if (flag & FSYNC_WAIT)
			
 
				-			xfs_trans_set_sync(tp);
			
 
				+		xfs_trans_set_sync(tp);
			
 
				 		error = _xfs_trans_commit(tp, 0, &log_flushed);
			
 
				 
			
 
				 		xfs_iunlock(ip, XFS_ILOCK_EXCL);
			
--- a/fs/xfs/xfs_vnodeops.h
+++ b/fs/xfs/xfs_vnodeops.h
@@ -18,8 +18,7 @@ int xfs_open(struct xfs_inode *ip);
 
				 int xfs_setattr(struct xfs_inode *ip, struct bhv_vattr *vap, int flags,
			
 
				 		struct cred *credp);
			
 
				 int xfs_readlink(struct xfs_inode *ip, char *link);
			
 
				-int xfs_fsync(struct xfs_inode *ip, int flag, xfs_off_t start,
			
 
				-		xfs_off_t stop);
			
 
				+int xfs_fsync(struct xfs_inode *ip);
			
 
				 int xfs_release(struct xfs_inode *ip);
			
 
				 int xfs_inactive(struct xfs_inode *ip);
			
 
				 int xfs_lookup(struct xfs_inode *dp, struct xfs_name *name,