20 anni fa · ba6d2377c8
--- a/Documentation/filesystems/ntfs.txt
+++ b/Documentation/filesystems/ntfs.txt
@@ -451,9 +451,12 @@ Note, a technical ChangeLog aimed at kernel hackers is in fs/ntfs/ChangeLog.
 
															 	- Implement extension of resident files using the normal file write
														
 
															 	  code paths, i.e. most very small files can be extended to be a little
														
 
															 	  bit bigger but not by much.
														
 
															+	- Add new mount option "disable_sparse".  (See list of mount options
														
 
															+	  above for details.)
														
 
															 	- Improve handling of ntfs volumes with errors and strange boot sectors
														
 
															 	  in particular.
														
 
															-	- Fix various bugs.
														
 
															+	- Fix various bugs including a nasty deadlock that appeared in recent
														
 
															+	  kernels (around 2.6.11-2.6.12 timeframe).
														
 
															 2.1.22:
														
 
															 	- Improve handling of ntfs volumes with errors.
														
 
															 	- Fix various bugs and race conditions.
														
--- a/fs/ntfs/ChangeLog
+++ b/fs/ntfs/ChangeLog
@@ -132,6 +132,48 @@ ToDo/Notes:
 
															 	  the already mapped runlist fragment which causes
														
 
															 	  ntfs_mapping_pairs_decompress() to fail and return error.  Update
														
 
															 	  ntfs_attr_find_vcn_nolock() accordingly.
														
 
															+	- Fix a nasty deadlock that appeared in recent kernels.
														
 
															+	  The situation: VFS inode X on a mounted ntfs volume is dirty.  For
														
 
															+	  same inode X, the ntfs_inode is dirty and thus corresponding on-disk
														
 
															+	  inode, i.e. mft record, which is in a dirty PAGE_CACHE_PAGE belonging
														
 
															+	  to the table of inodes, i.e. $MFT, inode 0.
														
 
															+	  What happens:
														
 
															+	  Process 1: sys_sync()/umount()/whatever...  calls
														
 
															+	  __sync_single_inode() for $MFT -> do_writepages() -> write_page for
														
 
															+	  the dirty page containing the on-disk inode X, the page is now locked
														
 
															+	  -> ntfs_write_mst_block() which clears PageUptodate() on the page to
														
 
															+	  prevent anyone else getting hold of it whilst it does the write out.
														
 
															+	  This is necessary as the on-disk inode needs "fixups" applied before
														
 
															+	  the write to disk which are removed again after the write and
														
 
															+	  PageUptodate is then set again.  It then analyses the page looking
														
 
															+	  for dirty on-disk inodes and when it finds one it calls
														
 
															+	  ntfs_may_write_mft_record() to see if it is safe to write this
														
 
															+	  on-disk inode.  This then calls ilookup5() to check if the
														
 
															+	  corresponding VFS inode is in icache().  This in turn calls ifind()
														
 
															+	  which waits on the inode lock via wait_on_inode whilst holding the
														
 
															+	  global inode_lock.
														
 
															+	  Process 2: pdflush results in a call to __sync_single_inode for the
														
 
															+	  same VFS inode X on the ntfs volume.  This locks the inode (I_LOCK)
														
 
															+	  then calls write-inode -> ntfs_write_inode -> map_mft_record() ->
														
 
															+	  read_cache_page() for the page (in page cache of table of inodes
														
 
															+	  $MFT, inode 0) containing the on-disk inode.  This page has
														
 
															+	  PageUptodate() clear because of Process 1 (see above) so
														
 
															+	  read_cache_page() blocks when it tries to take the page lock for the
														
 
															+	  page so it can call ntfs_read_page().
														
 
															+	  Thus Process 1 is holding the page lock on the page containing the
														
 
															+	  on-disk inode X and it is waiting on the inode X to be unlocked in
														
 
															+	  ifind() so it can write the page out and then unlock the page.
														
 
															+	  And Process 2 is holding the inode lock on inode X and is waiting for
														
 
															+	  the page to be unlocked so it can call ntfs_readpage() or discover
														
 
															+	  that Process 1 set PageUptodate() again and use the page.
														
 
															+	  Thus we have a deadlock due to ifind() waiting on the inode lock.
														
 
															+	  The solution: The fix is to use the newly introduced
														
 
															+	  ilookup5_nowait() which does not wait on the inode's lock and hence
														
 
															+	  avoids the deadlock.  This is safe as we do not care about the VFS
														
 
															+	  inode and only use the fact that it is in the VFS inode cache and the
														
 
															+	  fact that the vfs and ntfs inodes are one struct in memory to find
														
 
															+	  the ntfs inode in memory if present.  Also, the ntfs inode has its
														
 
															+	  own locking so it does not matter if the vfs inode is locked.
														
 
															 2.1.22 - Many bug and race fixes and error handling improvements.
														
--- a/fs/ntfs/mft.c
+++ b/fs/ntfs/mft.c
@@ -948,20 +948,23 @@ BOOL ntfs_may_write_mft_record(ntfs_volume *vol, const unsigned long mft_no,
 
															 	na.name_len = 0;
														
 
															 	na.type = AT_UNUSED;
														
 
															 	/*
														
 
															-	 * For inode 0, i.e. $MFT itself, we cannot use ilookup5() from here or
														
 
															-	 * we deadlock because the inode is already locked by the kernel
														
 
															-	 * (fs/fs-writeback.c::__sync_single_inode()) and ilookup5() waits
														
 
															-	 * until the inode is unlocked before returning it and it never gets
														
 
															-	 * unlocked because ntfs_should_write_mft_record() never returns.  )-:
														
 
															-	 * Fortunately, we have inode 0 pinned in icache for the duration of
														
 
															-	 * the mount so we can access it directly.
														
 
															+	 * Optimize inode 0, i.e. $MFT itself, since we have it in memory and
														
 
															+	 * we get here for it rather often.
														
 
															 	 */
														
 
															 	if (!mft_no) {
														
 
															 		/* Balance the below iput(). */
														
 
															 		vi = igrab(mft_vi);
														
 
															 		BUG_ON(vi != mft_vi);
														
 
															-	} else
														
 
															-		vi = ilookup5(sb, mft_no, (test_t)ntfs_test_inode, &na);
														
 
															+	} else {
														
 
															+		/*
														
 
															+		 * Have to use ilookup5_nowait() since ilookup5() waits for the
														
 
															+		 * inode lock which causes ntfs to deadlock when a concurrent
														
 
															+		 * inode write via the inode dirty code paths and the page
														
 
															+		 * dirty code path of the inode dirty code path when writing
														
 
															+		 * $MFT occurs.
														
 
															+		 */
														
 
															+		vi = ilookup5_nowait(sb, mft_no, (test_t)ntfs_test_inode, &na);
														
 
															+	}
														
 
															 	if (vi) {
														
 
															 		ntfs_debug("Base inode 0x%lx is in icache.", mft_no);
														
 
															 		/* The inode is in icache. */
														
@@ -1016,7 +1019,13 @@ BOOL ntfs_may_write_mft_record(ntfs_volume *vol, const unsigned long mft_no,
 
															 	na.mft_no = MREF_LE(m->base_mft_record);
														
 
															 	ntfs_debug("Mft record 0x%lx is an extent record.  Looking for base "
														
 
															 			"inode 0x%lx in icache.", mft_no, na.mft_no);
														
 
															-	vi = ilookup5(sb, na.mft_no, (test_t)ntfs_test_inode, &na);
														
 
															+	if (!na.mft_no) {
														
 
															+		/* Balance the below iput(). */
														
 
															+		vi = igrab(mft_vi);
														
 
															+		BUG_ON(vi != mft_vi);
														
 
															+	} else
														
 
															+		vi = ilookup5_nowait(sb, na.mft_no, (test_t)ntfs_test_inode,
														
 
															+				&na);
														
 
															 	if (!vi) {
														
 
															 		/*
														
 
															 		 * The base inode is not in icache, write this extent mft