Browse Source

ext4: Use readahead when reading an inode from the inode table

With modern hard drives, reading 64k takes roughly the same time as
reading a 4k block.  So request readahead for adjacent inode table
blocks to reduce the time it takes when iterating over directories
(especially when doing this in htree sort order) in a cold cache case.
With this patch, the time it takes to run "git status" on a kernel
tree after flushing the caches via "echo 3 > /proc/sys/vm/drop_caches"
is reduced by 21%.

Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
Theodore Ts'o 16 years ago
parent
commit
240799cdf2
6 changed files with 101 additions and 72 deletions
  1. 6 0
      Documentation/filesystems/ext4.txt
  2. 3 0
      Documentation/filesystems/proc.txt
  3. 2 0
      fs/ext4/ext4.h
  4. 1 0
      fs/ext4/ext4_sb.h
  5. 64 70
      fs/ext4/inode.c
  6. 25 2
      fs/ext4/super.c

+ 6 - 0
Documentation/filesystems/ext4.txt

@@ -177,6 +177,11 @@ barrier=<0|1(*)>	This enables/disables the use of write barriers in
 			your disks are battery-backed in one way or another,
 			your disks are battery-backed in one way or another,
 			disabling barriers may safely improve performance.
 			disabling barriers may safely improve performance.
 
 
+inode_readahead=n	This tuning parameter controls the maximum
+			number of inode table blocks that ext4's inode
+			table readahead algorithm will pre-read into
+			the buffer cache.  The default value is 32 blocks.
+
 orlov		(*)	This enables the new Orlov block allocator. It is
 orlov		(*)	This enables the new Orlov block allocator. It is
 			enabled by default.
 			enabled by default.
 
 
@@ -252,6 +257,7 @@ stripe=n		Number of filesystem blocks that mballoc will try
 delalloc	(*)	Deferring block allocation until write-out time.
 delalloc	(*)	Deferring block allocation until write-out time.
 nodelalloc		Disable delayed allocation. Blocks are allocation
 nodelalloc		Disable delayed allocation. Blocks are allocation
 			when data is copied from user to page cache.
 			when data is copied from user to page cache.
+
 Data Mode
 Data Mode
 =========
 =========
 There are 3 different data modes:
 There are 3 different data modes:

+ 3 - 0
Documentation/filesystems/proc.txt

@@ -956,6 +956,9 @@ Table 1-10: Files in /proc/fs/ext4/<devname>
                  files are packed closely together.  Each large file
                  files are packed closely together.  Each large file
                  will have its blocks allocated out of its own unique
                  will have its blocks allocated out of its own unique
                  preallocation pool.
                  preallocation pool.
+inode_readahead  Tuning parameter which controls the maximum number of
+                 inode table blocks that ext4's inode table readahead
+                 algorithm will pre-read into the buffer cache
 ..............................................................................
 ..............................................................................
 
 
 
 

+ 2 - 0
fs/ext4/ext4.h

@@ -790,6 +790,8 @@ static inline int ext4_valid_inum(struct super_block *sb, unsigned long ino)
 #define	EXT4_DEF_RESUID		0
 #define	EXT4_DEF_RESUID		0
 #define	EXT4_DEF_RESGID		0
 #define	EXT4_DEF_RESGID		0
 
 
+#define EXT4_DEF_INODE_READAHEAD_BLKS	32
+
 /*
 /*
  * Default mount options
  * Default mount options
  */
  */

+ 1 - 0
fs/ext4/ext4_sb.h

@@ -52,6 +52,7 @@ struct ext4_sb_info {
 	int s_desc_per_block_bits;
 	int s_desc_per_block_bits;
 	int s_inode_size;
 	int s_inode_size;
 	int s_first_ino;
 	int s_first_ino;
+	unsigned int s_inode_readahead_blks;
 	spinlock_t s_next_gen_lock;
 	spinlock_t s_next_gen_lock;
 	u32 s_next_generation;
 	u32 s_next_generation;
 	u32 s_hash_seed[4];
 	u32 s_hash_seed[4];

+ 64 - 70
fs/ext4/inode.c

@@ -3833,41 +3833,6 @@ out_stop:
 	ext4_journal_stop(handle);
 	ext4_journal_stop(handle);
 }
 }
 
 
-static ext4_fsblk_t ext4_get_inode_block(struct super_block *sb,
-		unsigned long ino, struct ext4_iloc *iloc)
-{
-	ext4_group_t block_group;
-	unsigned long offset;
-	ext4_fsblk_t block;
-	struct ext4_group_desc *gdp;
-
-	if (!ext4_valid_inum(sb, ino)) {
-		/*
-		 * This error is already checked for in namei.c unless we are
-		 * looking at an NFS filehandle, in which case no error
-		 * report is needed
-		 */
-		return 0;
-	}
-
-	block_group = (ino - 1) / EXT4_INODES_PER_GROUP(sb);
-	gdp = ext4_get_group_desc(sb, block_group, NULL);
-	if (!gdp)
-		return 0;
-
-	/*
-	 * Figure out the offset within the block group inode table
-	 */
-	offset = ((ino - 1) % EXT4_INODES_PER_GROUP(sb)) *
-		EXT4_INODE_SIZE(sb);
-	block = ext4_inode_table(sb, gdp) +
-		(offset >> EXT4_BLOCK_SIZE_BITS(sb));
-
-	iloc->block_group = block_group;
-	iloc->offset = offset & (EXT4_BLOCK_SIZE(sb) - 1);
-	return block;
-}
-
 /*
 /*
  * ext4_get_inode_loc returns with an extra refcount against the inode's
  * ext4_get_inode_loc returns with an extra refcount against the inode's
  * underlying buffer_head on success. If 'in_mem' is true, we have all
  * underlying buffer_head on success. If 'in_mem' is true, we have all
@@ -3877,19 +3842,35 @@ static ext4_fsblk_t ext4_get_inode_block(struct super_block *sb,
 static int __ext4_get_inode_loc(struct inode *inode,
 static int __ext4_get_inode_loc(struct inode *inode,
 				struct ext4_iloc *iloc, int in_mem)
 				struct ext4_iloc *iloc, int in_mem)
 {
 {
-	ext4_fsblk_t block;
-	struct buffer_head *bh;
+	struct ext4_group_desc	*gdp;
+	struct buffer_head	*bh;
+	struct super_block	*sb = inode->i_sb;
+	ext4_fsblk_t		block;
+	int			inodes_per_block, inode_offset;
+
+	iloc->bh = 0;
+	if (!ext4_valid_inum(sb, inode->i_ino))
+		return -EIO;
 
 
-	block = ext4_get_inode_block(inode->i_sb, inode->i_ino, iloc);
-	if (!block)
+	iloc->block_group = (inode->i_ino - 1) / EXT4_INODES_PER_GROUP(sb);
+	gdp = ext4_get_group_desc(sb, iloc->block_group, NULL);
+	if (!gdp)
 		return -EIO;
 		return -EIO;
 
 
-	bh = sb_getblk(inode->i_sb, block);
+	/*
+	 * Figure out the offset within the block group inode table
+	 */
+	inodes_per_block = (EXT4_BLOCK_SIZE(sb) / EXT4_INODE_SIZE(sb));
+	inode_offset = ((inode->i_ino - 1) %
+			EXT4_INODES_PER_GROUP(sb));
+	block = ext4_inode_table(sb, gdp) + (inode_offset / inodes_per_block);
+	iloc->offset = (inode_offset % inodes_per_block) * EXT4_INODE_SIZE(sb);
+
+	bh = sb_getblk(sb, block);
 	if (!bh) {
 	if (!bh) {
-		ext4_error (inode->i_sb, "ext4_get_inode_loc",
-				"unable to read inode block - "
-				"inode=%lu, block=%llu",
-				 inode->i_ino, block);
+		ext4_error(sb, "ext4_get_inode_loc", "unable to read "
+			   "inode block - inode=%lu, block=%llu",
+			   inode->i_ino, block);
 		return -EIO;
 		return -EIO;
 	}
 	}
 	if (!buffer_uptodate(bh)) {
 	if (!buffer_uptodate(bh)) {
@@ -3917,28 +3898,12 @@ static int __ext4_get_inode_loc(struct inode *inode,
 		 */
 		 */
 		if (in_mem) {
 		if (in_mem) {
 			struct buffer_head *bitmap_bh;
 			struct buffer_head *bitmap_bh;
-			struct ext4_group_desc *desc;
-			int inodes_per_buffer;
-			int inode_offset, i;
-			ext4_group_t block_group;
-			int start;
-
-			block_group = (inode->i_ino - 1) /
-					EXT4_INODES_PER_GROUP(inode->i_sb);
-			inodes_per_buffer = bh->b_size /
-				EXT4_INODE_SIZE(inode->i_sb);
-			inode_offset = ((inode->i_ino - 1) %
-					EXT4_INODES_PER_GROUP(inode->i_sb));
-			start = inode_offset & ~(inodes_per_buffer - 1);
+			int i, start;
 
 
-			/* Is the inode bitmap in cache? */
-			desc = ext4_get_group_desc(inode->i_sb,
-						block_group, NULL);
-			if (!desc)
-				goto make_io;
+			start = inode_offset & ~(inodes_per_block - 1);
 
 
-			bitmap_bh = sb_getblk(inode->i_sb,
-				ext4_inode_bitmap(inode->i_sb, desc));
+			/* Is the inode bitmap in cache? */
+			bitmap_bh = sb_getblk(sb, ext4_inode_bitmap(sb, gdp));
 			if (!bitmap_bh)
 			if (!bitmap_bh)
 				goto make_io;
 				goto make_io;
 
 
@@ -3951,14 +3916,14 @@ static int __ext4_get_inode_loc(struct inode *inode,
 				brelse(bitmap_bh);
 				brelse(bitmap_bh);
 				goto make_io;
 				goto make_io;
 			}
 			}
-			for (i = start; i < start + inodes_per_buffer; i++) {
+			for (i = start; i < start + inodes_per_block; i++) {
 				if (i == inode_offset)
 				if (i == inode_offset)
 					continue;
 					continue;
 				if (ext4_test_bit(i, bitmap_bh->b_data))
 				if (ext4_test_bit(i, bitmap_bh->b_data))
 					break;
 					break;
 			}
 			}
 			brelse(bitmap_bh);
 			brelse(bitmap_bh);
-			if (i == start + inodes_per_buffer) {
+			if (i == start + inodes_per_block) {
 				/* all other inodes are free, so skip I/O */
 				/* all other inodes are free, so skip I/O */
 				memset(bh->b_data, 0, bh->b_size);
 				memset(bh->b_data, 0, bh->b_size);
 				set_buffer_uptodate(bh);
 				set_buffer_uptodate(bh);
@@ -3968,6 +3933,36 @@ static int __ext4_get_inode_loc(struct inode *inode,
 		}
 		}
 
 
 make_io:
 make_io:
+		/*
+		 * If we need to do any I/O, try to pre-readahead extra
+		 * blocks from the inode table.
+		 */
+		if (EXT4_SB(sb)->s_inode_readahead_blks) {
+			ext4_fsblk_t b, end, table;
+			unsigned num;
+
+			table = ext4_inode_table(sb, gdp);
+			/* Make sure s_inode_readahead_blks is a power of 2 */
+			while (EXT4_SB(sb)->s_inode_readahead_blks &
+			       (EXT4_SB(sb)->s_inode_readahead_blks-1))
+				EXT4_SB(sb)->s_inode_readahead_blks = 
+				   (EXT4_SB(sb)->s_inode_readahead_blks &
+				    (EXT4_SB(sb)->s_inode_readahead_blks-1));
+			b = block & ~(EXT4_SB(sb)->s_inode_readahead_blks-1);
+			if (table > b)
+				b = table;
+			end = b + EXT4_SB(sb)->s_inode_readahead_blks;
+			num = EXT4_INODES_PER_GROUP(sb);
+			if (EXT4_HAS_RO_COMPAT_FEATURE(sb,
+				       EXT4_FEATURE_RO_COMPAT_GDT_CSUM))
+				num -= le16_to_cpu(gdp->bg_itable_unused);
+			table += num / inodes_per_block;
+			if (end > table)
+				end = table;
+			while (b <= end)
+				sb_breadahead(sb, b++);
+		}
+
 		/*
 		/*
 		 * There are other valid inodes in the buffer, this inode
 		 * There are other valid inodes in the buffer, this inode
 		 * has in-inode xattrs, or we don't have this inode in memory.
 		 * has in-inode xattrs, or we don't have this inode in memory.
@@ -3978,10 +3973,9 @@ make_io:
 		submit_bh(READ_META, bh);
 		submit_bh(READ_META, bh);
 		wait_on_buffer(bh);
 		wait_on_buffer(bh);
 		if (!buffer_uptodate(bh)) {
 		if (!buffer_uptodate(bh)) {
-			ext4_error(inode->i_sb, "ext4_get_inode_loc",
-					"unable to read inode block - "
-					"inode=%lu, block=%llu",
-					inode->i_ino, block);
+			ext4_error(sb, __func__,
+				   "unable to read inode block - inode=%lu, "
+				   "block=%llu", inode->i_ino, block);
 			brelse(bh);
 			brelse(bh);
 			return -EIO;
 			return -EIO;
 		}
 		}

+ 25 - 2
fs/ext4/super.c

@@ -515,8 +515,10 @@ static void ext4_put_super(struct super_block *sb)
 		mark_buffer_dirty(sbi->s_sbh);
 		mark_buffer_dirty(sbi->s_sbh);
 		ext4_commit_super(sb, es, 1);
 		ext4_commit_super(sb, es, 1);
 	}
 	}
-	if (sbi->s_proc)
+	if (sbi->s_proc) {
+		remove_proc_entry("inode_readahead_blks", sbi->s_proc);
 		remove_proc_entry(sb->s_id, ext4_proc_root);
 		remove_proc_entry(sb->s_id, ext4_proc_root);
+	}
 
 
 	for (i = 0; i < sbi->s_gdb_count; i++)
 	for (i = 0; i < sbi->s_gdb_count; i++)
 		brelse(sbi->s_group_desc[i]);
 		brelse(sbi->s_group_desc[i]);
@@ -779,6 +781,10 @@ static int ext4_show_options(struct seq_file *seq, struct vfsmount *vfs)
 	else if (test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_WRITEBACK_DATA)
 	else if (test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_WRITEBACK_DATA)
 		seq_puts(seq, ",data=writeback");
 		seq_puts(seq, ",data=writeback");
 
 
+	if (sbi->s_inode_readahead_blks != EXT4_DEF_INODE_READAHEAD_BLKS)
+		seq_printf(seq, ",inode_readahead_blks=%u",
+			   sbi->s_inode_readahead_blks);
+
 	ext4_show_quota_options(seq, sb);
 	ext4_show_quota_options(seq, sb);
 	return 0;
 	return 0;
 }
 }
@@ -913,6 +919,7 @@ enum {
 	Opt_ignore, Opt_barrier, Opt_err, Opt_resize, Opt_usrquota,
 	Opt_ignore, Opt_barrier, Opt_err, Opt_resize, Opt_usrquota,
 	Opt_grpquota, Opt_extents, Opt_noextents, Opt_i_version,
 	Opt_grpquota, Opt_extents, Opt_noextents, Opt_i_version,
 	Opt_mballoc, Opt_nomballoc, Opt_stripe, Opt_delalloc, Opt_nodelalloc,
 	Opt_mballoc, Opt_nomballoc, Opt_stripe, Opt_delalloc, Opt_nodelalloc,
+	Opt_inode_readahead_blks
 };
 };
 
 
 static match_table_t tokens = {
 static match_table_t tokens = {
@@ -973,6 +980,7 @@ static match_table_t tokens = {
 	{Opt_resize, "resize"},
 	{Opt_resize, "resize"},
 	{Opt_delalloc, "delalloc"},
 	{Opt_delalloc, "delalloc"},
 	{Opt_nodelalloc, "nodelalloc"},
 	{Opt_nodelalloc, "nodelalloc"},
+	{Opt_inode_readahead_blks, "inode_readahead_blks=%u"},
 	{Opt_err, NULL},
 	{Opt_err, NULL},
 };
 };
 
 
@@ -1381,6 +1389,13 @@ set_qf_format:
 		case Opt_delalloc:
 		case Opt_delalloc:
 			set_opt(sbi->s_mount_opt, DELALLOC);
 			set_opt(sbi->s_mount_opt, DELALLOC);
 			break;
 			break;
+		case Opt_inode_readahead_blks:
+			if (match_int(&args[0], &option))
+				return 0;
+			if (option < 0 || option > (1 << 30))
+				return 0;
+			sbi->s_inode_readahead_blks = option;
+			break;
 		default:
 		default:
 			printk(KERN_ERR
 			printk(KERN_ERR
 			       "EXT4-fs: Unrecognized mount option \"%s\" "
 			       "EXT4-fs: Unrecognized mount option \"%s\" "
@@ -1938,6 +1953,7 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
 	sbi->s_mount_opt = 0;
 	sbi->s_mount_opt = 0;
 	sbi->s_resuid = EXT4_DEF_RESUID;
 	sbi->s_resuid = EXT4_DEF_RESUID;
 	sbi->s_resgid = EXT4_DEF_RESGID;
 	sbi->s_resgid = EXT4_DEF_RESGID;
+	sbi->s_inode_readahead_blks = EXT4_DEF_INODE_READAHEAD_BLKS;
 	sbi->s_sb_block = sb_block;
 	sbi->s_sb_block = sb_block;
 
 
 	unlock_kernel();
 	unlock_kernel();
@@ -2234,6 +2250,11 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
 	if (ext4_proc_root)
 	if (ext4_proc_root)
 		sbi->s_proc = proc_mkdir(sb->s_id, ext4_proc_root);
 		sbi->s_proc = proc_mkdir(sb->s_id, ext4_proc_root);
 
 
+	if (sbi->s_proc)
+		proc_create_data("inode_readahead_blks", 0644, sbi->s_proc,
+				 &ext4_ui_proc_fops,
+				 &sbi->s_inode_readahead_blks);
+
 	bgl_lock_init(&sbi->s_blockgroup_lock);
 	bgl_lock_init(&sbi->s_blockgroup_lock);
 
 
 	for (i = 0; i < db_count; i++) {
 	for (i = 0; i < db_count; i++) {
@@ -2513,8 +2534,10 @@ failed_mount2:
 		brelse(sbi->s_group_desc[i]);
 		brelse(sbi->s_group_desc[i]);
 	kfree(sbi->s_group_desc);
 	kfree(sbi->s_group_desc);
 failed_mount:
 failed_mount:
-	if (sbi->s_proc)
+	if (sbi->s_proc) {
+		remove_proc_entry("inode_readahead_blks", sbi->s_proc);
 		remove_proc_entry(sb->s_id, ext4_proc_root);
 		remove_proc_entry(sb->s_id, ext4_proc_root);
+	}
 #ifdef CONFIG_QUOTA
 #ifdef CONFIG_QUOTA
 	for (i = 0; i < MAXQUOTAS; i++)
 	for (i = 0; i < MAXQUOTAS; i++)
 		kfree(sbi->s_qf_names[i]);
 		kfree(sbi->s_qf_names[i]);