@@ -53,14 +53,30 @@ xfs_inode_ag_walk_grab(
 {
 	struct inode		*inode = VFS_I(ip);
 
+	ASSERT(rcu_read_lock_held());
+
+	/*
+	 * check for stale RCU freed inode
+	 *
+	 * If the inode has been reallocated, it doesn't matter if it's not in
+	 * the AG we are walking - we are walking for writeback, so if it
+	 * passes all the "valid inode" checks and is dirty, then we'll write
+	 * it back anyway.  If it has been reallocated and still being
+	 * initialised, the XFS_INEW check below will catch it.
+	 */
+	spin_lock(&ip->i_flags_lock);
+	if (!ip->i_ino)
+		goto out_unlock_noent;
+
+	/* avoid new or reclaimable inodes. Leave for reclaim code to flush */
+	if (__xfs_iflags_test(ip, XFS_INEW | XFS_IRECLAIMABLE | XFS_IRECLAIM))
+		goto out_unlock_noent;
+	spin_unlock(&ip->i_flags_lock);
+
 	/* nothing to sync during shutdown */
 	if (XFS_FORCED_SHUTDOWN(ip->i_mount))
 		return EFSCORRUPTED;
 
-	/* avoid new or reclaimable inodes. Leave for reclaim code to flush */
-	if (xfs_iflags_test(ip, XFS_INEW | XFS_IRECLAIMABLE | XFS_IRECLAIM))
-		return ENOENT;
-
 	/* If we can't grab the inode, it must on it's way to reclaim. */
 	if (!igrab(inode))
 		return ENOENT;
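
The grab-side checks added above rely on two conventions that the freeing path (not shown in these hunks) is expected to follow: a dead inode has ip->i_ino cleared, and both i_ino and the inode flags are only meaningful when read under ip->i_flags_lock, so a lookup racing with an RCU free or a reallocation can detect it. Below is a small self-contained userspace model of that "validate under the object's flags lock, then take a reference" step; the RCU read-side protection itself is elided, and all names (struct obj, F_NEW, F_RECLAIMABLE, obj_grab) are illustrative stand-ins rather than kernel identifiers.

/*
 * Userspace model of the lookup-side validation pattern (illustrative
 * names only; this is not kernel code).  Build with: cc -pthread model.c
 */
#include <pthread.h>
#include <stdbool.h>
#include <stdio.h>

#define F_NEW		0x1	/* object still being initialised */
#define F_RECLAIMABLE	0x2	/* object queued for reclaim */

struct obj {
	pthread_mutex_t	flags_lock;	/* models ip->i_flags_lock */
	unsigned long	ino;		/* 0 => stale/freed identity */
	unsigned int	flags;
	int		refcount;	/* models the VFS inode reference */
};

/* Take a reference only if the object is safe to use right now. */
static bool obj_grab(struct obj *o)
{
	bool ok = false;

	pthread_mutex_lock(&o->flags_lock);
	if (o->ino &&					/* not freed/recycled */
	    !(o->flags & (F_NEW | F_RECLAIMABLE))) {
		o->refcount++;				/* models igrab() */
		ok = true;
	}
	pthread_mutex_unlock(&o->flags_lock);
	return ok;
}

int main(void)
{
	struct obj o = {
		.flags_lock = PTHREAD_MUTEX_INITIALIZER,
		.ino = 42,
	};

	printf("grab valid object: %s\n", obj_grab(&o) ? "ok" : "skipped");

	/* Model a concurrent free: the identity is cleared under the lock. */
	pthread_mutex_lock(&o.flags_lock);
	o.ino = 0;
	pthread_mutex_unlock(&o.flags_lock);

	printf("grab freed object: %s\n", obj_grab(&o) ? "ok" : "skipped");
	return 0;
}
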
@@ -72,6 +88,10 @@ xfs_inode_ag_walk_grab(
 
 	/* inode is valid */
 	return 0;
+
+out_unlock_noent:
+	spin_unlock(&ip->i_flags_lock);
+	return ENOENT;
 }
 
 STATIC int
@@ -98,12 +118,12 @@ restart:
 		int	error = 0;
 		int	i;
 
-		read_lock(&pag->pag_ici_lock);
+		rcu_read_lock();
 		nr_found = radix_tree_gang_lookup(&pag->pag_ici_root,
 					(void **)batch, first_index,
 					XFS_LOOKUP_BATCH);
 		if (!nr_found) {
-			read_unlock(&pag->pag_ici_lock);
+			rcu_read_unlock();
 			break;
 		}
 
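
The walk these lock/RCU substitutions sit in follows a batched-lookup shape: enter the read side, ask the radix tree for up to XFS_LOOKUP_BATCH inodes starting at first_index, grab what can be grabbed, leave the read side, then do the real work on the batch. The sketch below is a deliberately simplified userspace model of that shape only, with an array standing in for the radix tree and the index advanced once per batch rather than per inode; comments mark where the RCU calls sit in the kernel loop.

/* Userspace model of a batched index walk (illustrative only). */
#include <stdio.h>

#define BATCH 4

/* A toy "index": keys present in the structure, in ascending order. */
static const unsigned int keys[] = { 3, 5, 9, 10, 17, 21, 40 };
#define NKEYS (sizeof(keys) / sizeof(keys[0]))

/* Return up to 'max' keys >= first_index, like a gang lookup would. */
static int gang_lookup(unsigned int first_index, unsigned int *batch, int max)
{
	int nr = 0;

	for (unsigned int i = 0; i < NKEYS && nr < max; i++)
		if (keys[i] >= first_index)
			batch[nr++] = keys[i];
	return nr;
}

int main(void)
{
	unsigned int first_index = 0;

	for (;;) {
		unsigned int batch[BATCH];
		int nr_found, i;

		/* rcu_read_lock() goes here in the kernel version */
		nr_found = gang_lookup(first_index, batch, BATCH);
		if (!nr_found)
			break;		/* ... and rcu_read_unlock() */

		/* Advance past the last entry seen before processing. */
		first_index = batch[nr_found - 1] + 1;
		/* rcu_read_unlock() here, then work on the batch */

		for (i = 0; i < nr_found; i++)
			printf("visit %u\n", batch[i]);
	}
	return 0;
}
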
@@ -118,18 +138,26 @@ restart:
 			batch[i] = NULL;
 
 			/*
-			 * Update the index for the next lookup. Catch overflows
-			 * into the next AG range which can occur if we have inodes
-			 * in the last block of the AG and we are currently
-			 * pointing to the last inode.
+			 * Update the index for the next lookup. Catch
+			 * overflows into the next AG range which can occur if
+			 * we have inodes in the last block of the AG and we
+			 * are currently pointing to the last inode.
+			 *
+			 * Because we may see inodes that are from the wrong AG
+			 * due to RCU freeing and reallocation, only update the
+			 * index if it lies in this AG. It was a race that led
+			 * us to see this inode, so another lookup from the
+			 * same index will not find it again.
 			 */
+			if (XFS_INO_TO_AGNO(mp, ip->i_ino) != pag->pag_agno)
+				continue;
 			first_index = XFS_INO_TO_AGINO(mp, ip->i_ino + 1);
 			if (first_index < XFS_INO_TO_AGINO(mp, ip->i_ino))
 				done = 1;
 		}
 
 		/* unlock now we've grabbed the inodes. */
-		read_unlock(&pag->pag_ici_lock);
+		rcu_read_unlock();
 
 		for (i = 0; i < nr_found; i++) {
 			if (!batch[i])
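
Two details in the index update above are easy to miss. First, the new wrong-AG test refuses to advance first_index based on an inode that only showed up because RCU allowed a freed inode to be reallocated into another AG mid-walk. Second, the pre-existing wrap test works because AG inode numbers are fixed-width unsigned values: adding one to the last representable agino wraps to zero, which compares below the old value and sets done. A tiny standalone illustration of that wrap test, using uint32_t as a stand-in for xfs_agino_t:

#include <stdint.h>
#include <stdio.h>

int main(void)
{
	uint32_t agino = UINT32_MAX;		/* stand-in for the last AG inode */
	uint32_t first_index = agino + 1;	/* unsigned arithmetic wraps to 0 */

	/* Same shape as: if (first_index < XFS_INO_TO_AGINO(mp, ip->i_ino)) */
	if (first_index < agino)
		printf("wrapped: stop walking this AG (done = 1)\n");
	else
		printf("no wrap: continue from %u\n", first_index);
	return 0;
}
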
@@ -639,9 +667,14 @@ xfs_reclaim_inode_grab(
 	struct xfs_inode	*ip,
 	int			flags)
 {
+	ASSERT(rcu_read_lock_held());
+
+	/* quick check for stale RCU freed inode */
+	if (!ip->i_ino)
+		return 1;
 
 	/*
-	 * do some unlocked checks first to avoid unnecceary lock traffic.
+	 * do some unlocked checks first to avoid unnecessary lock traffic.
 	 * The first is a flush lock check, the second is a already in reclaim
 	 * check. Only do these checks if we are not going to block on locks.
 	 */
@@ -654,11 +687,16 @@ xfs_reclaim_inode_grab(
 	 * The radix tree lock here protects a thread in xfs_iget from racing
 	 * with us starting reclaim on the inode.  Once we have the
 	 * XFS_IRECLAIM flag set it will not touch us.
+	 *
+	 * Due to RCU lookup, we may find inodes that have been freed and only
+	 * have XFS_IRECLAIM set.  Indeed, we may see reallocated inodes that
+	 * aren't candidates for reclaim at all, so we must check that
+	 * XFS_IRECLAIMABLE is set first before proceeding to reclaim.
 	 */
 	spin_lock(&ip->i_flags_lock);
-	ASSERT_ALWAYS(__xfs_iflags_test(ip, XFS_IRECLAIMABLE));
-	if (__xfs_iflags_test(ip, XFS_IRECLAIM)) {
-		/* ignore as it is already under reclaim */
+	if (!__xfs_iflags_test(ip, XFS_IRECLAIMABLE) ||
+	    __xfs_iflags_test(ip, XFS_IRECLAIM)) {
+		/* not a reclaim candidate. */
 		spin_unlock(&ip->i_flags_lock);
 		return 1;
 	}
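
Viewed together with the earlier xfs_reclaim_inode_grab hunk, the reclaim-side grab now follows a "cheap unlocked screen, then locked re-check and claim" pattern: obviously stale inodes (i_ino == 0) are skipped without taking any lock, and only inodes still marked XFS_IRECLAIMABLE and not yet XFS_IRECLAIM are claimed, with the claim made before i_flags_lock is dropped so a second reclaimer cannot take the same inode. Here is a self-contained userspace model of that pattern; the names are invented, there is no RCU, and the boolean return is inverted relative to the kernel helper (which returns 0 when the inode is grabbed and 1 when it should be skipped).

/* Userspace model of "unlocked screen, locked re-check, claim" (illustrative). */
#include <pthread.h>
#include <stdbool.h>
#include <stdio.h>

#define F_RECLAIMABLE	0x1	/* eligible for reclaim */
#define F_RECLAIM	0x2	/* already claimed by a reclaimer */

struct obj {
	pthread_mutex_t	flags_lock;
	unsigned long	ino;		/* 0 => stale/freed identity */
	unsigned int	flags;
};

/* Return true if we claimed the object for reclaim, false to skip it. */
static bool reclaim_grab(struct obj *o)
{
	/* Cheap unlocked screen: a stale identity means nothing to do. */
	if (!o->ino)
		return false;

	pthread_mutex_lock(&o->flags_lock);
	if (!(o->flags & F_RECLAIMABLE) || (o->flags & F_RECLAIM)) {
		/* not a reclaim candidate, or someone else got here first */
		pthread_mutex_unlock(&o->flags_lock);
		return false;
	}
	o->flags |= F_RECLAIM;		/* claim it before dropping the lock */
	pthread_mutex_unlock(&o->flags_lock);
	return true;
}

int main(void)
{
	struct obj o = {
		.flags_lock = PTHREAD_MUTEX_INITIALIZER,
		.ino = 7,
		.flags = F_RECLAIMABLE,
	};

	printf("first grab:  %s\n", reclaim_grab(&o) ? "claimed" : "skipped");
	printf("second grab: %s\n", reclaim_grab(&o) ? "claimed" : "skipped");
	return 0;
}
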
@@ -864,14 +902,14 @@ restart:
 			struct xfs_inode *batch[XFS_LOOKUP_BATCH];
 			int	i;
 
-			write_lock(&pag->pag_ici_lock);
+			rcu_read_lock();
 			nr_found = radix_tree_gang_lookup_tag(
 					&pag->pag_ici_root,
 					(void **)batch, first_index,
 					XFS_LOOKUP_BATCH,
 					XFS_ICI_RECLAIM_TAG);
 			if (!nr_found) {
-				write_unlock(&pag->pag_ici_lock);
+				rcu_read_unlock();
 				break;
 			}
 
@@ -891,14 +929,24 @@ restart:
 				 * occur if we have inodes in the last block of
 				 * the AG and we are currently pointing to the
 				 * last inode.
+				 *
+				 * Because we may see inodes that are from the
+				 * wrong AG due to RCU freeing and
+				 * reallocation, only update the index if it
+				 * lies in this AG. It was a race that led us
+				 * to see this inode, so another lookup from
+				 * the same index will not find it again.
 				 */
+				if (XFS_INO_TO_AGNO(mp, ip->i_ino) !=
+								pag->pag_agno)
+					continue;
 				first_index = XFS_INO_TO_AGINO(mp, ip->i_ino + 1);
 				if (first_index < XFS_INO_TO_AGINO(mp, ip->i_ino))
 					done = 1;
 			}
 
 			/* unlock now we've grabbed the inodes. */
-			write_unlock(&pag->pag_ici_lock);
+			rcu_read_unlock();
 
 			for (i = 0; i < nr_found; i++) {
 				if (!batch[i])