Browse Source

ext4: add fsync batch tuning knobs

Add new mount options, min_batch_time and max_batch_time, which
controls how long the jbd2 layer should wait for additional filesystem
operations to get batched with a synchronous write transaction.

Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
Theodore Ts'o 16 years ago
parent
commit
30773840c1
7 changed files with 91 additions and 8 deletions
  1. 29 0
      Documentation/filesystems/ext4.txt
  2. 7 0
      fs/ext4/ext4.h
  3. 2 0
      fs/ext4/ext4_sb.h
  4. 40 7
      fs/ext4/super.c
  5. 2 0
      fs/jbd2/journal.c
  6. 3 1
      fs/jbd2/transaction.c
  7. 8 0
      include/linux/jbd2.h

+ 29 - 0
Documentation/filesystems/ext4.txt

@@ -283,6 +283,35 @@ delalloc	(*)	Deferring block allocation until write-out time.
 nodelalloc		Disable delayed allocation. Blocks are allocation
 nodelalloc		Disable delayed allocation. Blocks are allocation
 			when data is copied from user to page cache.
 			when data is copied from user to page cache.
 
 
+max_batch_time=usec	Maximum amount of time ext4 should wait for
+			additional filesystem operations to be batch
+			together with a synchronous write operation.
+			Since a synchronous write operation is going to
+			force a commit and then a wait for the I/O
+			complete, it doesn't cost much, and can be a
+			huge throughput win, we wait for a small amount
+			of time to see if any other transactions can
+			piggyback on the synchronous write.   The
+			algorithm used is designed to automatically tune
+			for the speed of the disk, by measuring the
+			amount of time (on average) that it takes to
+			finish committing a transaction.  Call this time
+			the "commit time".  If the time that the
+			transactoin has been running is less than the
+			commit time, ext4 will try sleeping for the
+			commit time to see if other operations will join
+			the transaction.   The commit time is capped by
+			the max_batch_time, which defaults to 15000us
+			(15ms).   This optimization can be turned off
+			entirely by setting max_batch_time to 0.
+
+min_batch_time=usec	This parameter sets the commit time (as
+			described above) to be at least min_batch_time.
+			It defaults to zero microseconds.  Increasing
+			this parameter may improve the throughput of
+			multi-threaded, synchronous workloads on very
+			fast disks, at the cost of increasing latency.
+
 Data Mode
 Data Mode
 =========
 =========
 There are 3 different data modes:
 There are 3 different data modes:

+ 7 - 0
fs/ext4/ext4.h

@@ -328,6 +328,7 @@ struct ext4_mount_options {
 	uid_t s_resuid;
 	uid_t s_resuid;
 	gid_t s_resgid;
 	gid_t s_resgid;
 	unsigned long s_commit_interval;
 	unsigned long s_commit_interval;
+	u32 s_min_batch_time, s_max_batch_time;
 #ifdef CONFIG_QUOTA
 #ifdef CONFIG_QUOTA
 	int s_jquota_fmt;
 	int s_jquota_fmt;
 	char *s_qf_names[MAXQUOTAS];
 	char *s_qf_names[MAXQUOTAS];
@@ -805,6 +806,12 @@ static inline int ext4_valid_inum(struct super_block *sb, unsigned long ino)
 #define EXT4_DEFM_JMODE_ORDERED	0x0040
 #define EXT4_DEFM_JMODE_ORDERED	0x0040
 #define EXT4_DEFM_JMODE_WBACK	0x0060
 #define EXT4_DEFM_JMODE_WBACK	0x0060
 
 
+/*
+ * Default journal batch times
+ */
+#define EXT4_DEF_MIN_BATCH_TIME	0
+#define EXT4_DEF_MAX_BATCH_TIME	15000 /* 15ms */
+
 /*
 /*
  * Structure of a directory entry
  * Structure of a directory entry
  */
  */

+ 2 - 0
fs/ext4/ext4_sb.h

@@ -74,6 +74,8 @@ struct ext4_sb_info {
 	struct journal_s *s_journal;
 	struct journal_s *s_journal;
 	struct list_head s_orphan;
 	struct list_head s_orphan;
 	unsigned long s_commit_interval;
 	unsigned long s_commit_interval;
+	u32 s_max_batch_time;
+	u32 s_min_batch_time;
 	struct block_device *journal_bdev;
 	struct block_device *journal_bdev;
 #ifdef CONFIG_JBD2_DEBUG
 #ifdef CONFIG_JBD2_DEBUG
 	struct timer_list turn_ro_timer;	/* For turning read-only (crash simulation) */
 	struct timer_list turn_ro_timer;	/* For turning read-only (crash simulation) */

+ 40 - 7
fs/ext4/super.c

@@ -705,10 +705,19 @@ static int ext4_show_options(struct seq_file *seq, struct vfsmount *vfs)
 #endif
 #endif
 	if (!test_opt(sb, RESERVATION))
 	if (!test_opt(sb, RESERVATION))
 		seq_puts(seq, ",noreservation");
 		seq_puts(seq, ",noreservation");
-	if (sbi->s_commit_interval) {
+	if (sbi->s_commit_interval != JBD2_DEFAULT_MAX_COMMIT_AGE*HZ) {
 		seq_printf(seq, ",commit=%u",
 		seq_printf(seq, ",commit=%u",
 			   (unsigned) (sbi->s_commit_interval / HZ));
 			   (unsigned) (sbi->s_commit_interval / HZ));
 	}
 	}
+	if (sbi->s_min_batch_time != EXT4_DEF_MIN_BATCH_TIME) {
+		seq_printf(seq, ",min_batch_time=%u",
+			   (unsigned) sbi->s_min_batch_time);
+	}
+	if (sbi->s_max_batch_time != EXT4_DEF_MAX_BATCH_TIME) {
+		seq_printf(seq, ",max_batch_time=%u",
+			   (unsigned) sbi->s_min_batch_time);
+	}
+
 	/*
 	/*
 	 * We're changing the default of barrier mount option, so
 	 * We're changing the default of barrier mount option, so
 	 * let's always display its mount state so it's clear what its
 	 * let's always display its mount state so it's clear what its
@@ -874,7 +883,8 @@ enum {
 	Opt_nouid32, Opt_debug, Opt_oldalloc, Opt_orlov,
 	Opt_nouid32, Opt_debug, Opt_oldalloc, Opt_orlov,
 	Opt_user_xattr, Opt_nouser_xattr, Opt_acl, Opt_noacl,
 	Opt_user_xattr, Opt_nouser_xattr, Opt_acl, Opt_noacl,
 	Opt_reservation, Opt_noreservation, Opt_noload, Opt_nobh, Opt_bh,
 	Opt_reservation, Opt_noreservation, Opt_noload, Opt_nobh, Opt_bh,
-	Opt_commit, Opt_journal_update, Opt_journal_inum, Opt_journal_dev,
+	Opt_commit, Opt_min_batch_time, Opt_max_batch_time,
+	Opt_journal_update, Opt_journal_inum, Opt_journal_dev,
 	Opt_journal_checksum, Opt_journal_async_commit,
 	Opt_journal_checksum, Opt_journal_async_commit,
 	Opt_abort, Opt_data_journal, Opt_data_ordered, Opt_data_writeback,
 	Opt_abort, Opt_data_journal, Opt_data_ordered, Opt_data_writeback,
 	Opt_data_err_abort, Opt_data_err_ignore,
 	Opt_data_err_abort, Opt_data_err_ignore,
@@ -913,6 +923,8 @@ static const match_table_t tokens = {
 	{Opt_nobh, "nobh"},
 	{Opt_nobh, "nobh"},
 	{Opt_bh, "bh"},
 	{Opt_bh, "bh"},
 	{Opt_commit, "commit=%u"},
 	{Opt_commit, "commit=%u"},
+	{Opt_min_batch_time, "min_batch_time=%u"},
+	{Opt_max_batch_time, "max_batch_time=%u"},
 	{Opt_journal_update, "journal=update"},
 	{Opt_journal_update, "journal=update"},
 	{Opt_journal_inum, "journal=%u"},
 	{Opt_journal_inum, "journal=%u"},
 	{Opt_journal_dev, "journal_dev=%u"},
 	{Opt_journal_dev, "journal_dev=%u"},
@@ -1131,6 +1143,22 @@ static int parse_options(char *options, struct super_block *sb,
 				option = JBD2_DEFAULT_MAX_COMMIT_AGE;
 				option = JBD2_DEFAULT_MAX_COMMIT_AGE;
 			sbi->s_commit_interval = HZ * option;
 			sbi->s_commit_interval = HZ * option;
 			break;
 			break;
+		case Opt_max_batch_time:
+			if (match_int(&args[0], &option))
+				return 0;
+			if (option < 0)
+				return 0;
+			if (option == 0)
+				option = EXT4_DEF_MAX_BATCH_TIME;
+			sbi->s_max_batch_time = option;
+			break;
+		case Opt_min_batch_time:
+			if (match_int(&args[0], &option))
+				return 0;
+			if (option < 0)
+				return 0;
+			sbi->s_min_batch_time = option;
+			break;
 		case Opt_data_journal:
 		case Opt_data_journal:
 			data_opt = EXT4_MOUNT_JOURNAL_DATA;
 			data_opt = EXT4_MOUNT_JOURNAL_DATA;
 			goto datacheck;
 			goto datacheck;
@@ -1979,6 +2007,9 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
 
 
 	sbi->s_resuid = le16_to_cpu(es->s_def_resuid);
 	sbi->s_resuid = le16_to_cpu(es->s_def_resuid);
 	sbi->s_resgid = le16_to_cpu(es->s_def_resgid);
 	sbi->s_resgid = le16_to_cpu(es->s_def_resgid);
+	sbi->s_commit_interval = JBD2_DEFAULT_MAX_COMMIT_AGE * HZ;
+	sbi->s_min_batch_time = EXT4_DEF_MIN_BATCH_TIME;
+	sbi->s_max_batch_time = EXT4_DEF_MAX_BATCH_TIME;
 
 
 	set_opt(sbi->s_mount_opt, RESERVATION);
 	set_opt(sbi->s_mount_opt, RESERVATION);
 	set_opt(sbi->s_mount_opt, BARRIER);
 	set_opt(sbi->s_mount_opt, BARRIER);
@@ -2524,11 +2555,9 @@ static void ext4_init_journal_params(struct super_block *sb, journal_t *journal)
 {
 {
 	struct ext4_sb_info *sbi = EXT4_SB(sb);
 	struct ext4_sb_info *sbi = EXT4_SB(sb);
 
 
-	if (sbi->s_commit_interval)
-		journal->j_commit_interval = sbi->s_commit_interval;
-	/* We could also set up an ext4-specific default for the commit
-	 * interval here, but for now we'll just fall back to the jbd
-	 * default. */
+	journal->j_commit_interval = sbi->s_commit_interval;
+	journal->j_min_batch_time = sbi->s_min_batch_time;
+	journal->j_max_batch_time = sbi->s_max_batch_time;
 
 
 	spin_lock(&journal->j_state_lock);
 	spin_lock(&journal->j_state_lock);
 	if (test_opt(sb, BARRIER))
 	if (test_opt(sb, BARRIER))
@@ -3042,6 +3071,8 @@ static int ext4_remount(struct super_block *sb, int *flags, char *data)
 	old_opts.s_resuid = sbi->s_resuid;
 	old_opts.s_resuid = sbi->s_resuid;
 	old_opts.s_resgid = sbi->s_resgid;
 	old_opts.s_resgid = sbi->s_resgid;
 	old_opts.s_commit_interval = sbi->s_commit_interval;
 	old_opts.s_commit_interval = sbi->s_commit_interval;
+	old_opts.s_min_batch_time = sbi->s_min_batch_time;
+	old_opts.s_max_batch_time = sbi->s_max_batch_time;
 #ifdef CONFIG_QUOTA
 #ifdef CONFIG_QUOTA
 	old_opts.s_jquota_fmt = sbi->s_jquota_fmt;
 	old_opts.s_jquota_fmt = sbi->s_jquota_fmt;
 	for (i = 0; i < MAXQUOTAS; i++)
 	for (i = 0; i < MAXQUOTAS; i++)
@@ -3178,6 +3209,8 @@ restore_opts:
 	sbi->s_resuid = old_opts.s_resuid;
 	sbi->s_resuid = old_opts.s_resuid;
 	sbi->s_resgid = old_opts.s_resgid;
 	sbi->s_resgid = old_opts.s_resgid;
 	sbi->s_commit_interval = old_opts.s_commit_interval;
 	sbi->s_commit_interval = old_opts.s_commit_interval;
+	sbi->s_min_batch_time = old_opts.s_min_batch_time;
+	sbi->s_max_batch_time = old_opts.s_max_batch_time;
 #ifdef CONFIG_QUOTA
 #ifdef CONFIG_QUOTA
 	sbi->s_jquota_fmt = old_opts.s_jquota_fmt;
 	sbi->s_jquota_fmt = old_opts.s_jquota_fmt;
 	for (i = 0; i < MAXQUOTAS; i++) {
 	for (i = 0; i < MAXQUOTAS; i++) {

+ 2 - 0
fs/jbd2/journal.c

@@ -964,6 +964,8 @@ static journal_t * journal_init_common (void)
 	spin_lock_init(&journal->j_state_lock);
 	spin_lock_init(&journal->j_state_lock);
 
 
 	journal->j_commit_interval = (HZ * JBD2_DEFAULT_MAX_COMMIT_AGE);
 	journal->j_commit_interval = (HZ * JBD2_DEFAULT_MAX_COMMIT_AGE);
+	journal->j_min_batch_time = 0;
+	journal->j_max_batch_time = 15000; /* 15ms */
 
 
 	/* The journal is marked for error until we succeed with recovery! */
 	/* The journal is marked for error until we succeed with recovery! */
 	journal->j_flags = JBD2_ABORT;
 	journal->j_flags = JBD2_ABORT;

+ 3 - 1
fs/jbd2/transaction.c

@@ -1255,8 +1255,10 @@ int jbd2_journal_stop(handle_t *handle)
 		trans_time = ktime_to_ns(ktime_sub(ktime_get(),
 		trans_time = ktime_to_ns(ktime_sub(ktime_get(),
 						   transaction->t_start_time));
 						   transaction->t_start_time));
 
 
+		commit_time = max_t(u64, commit_time,
+				    1000*journal->j_min_batch_time);
 		commit_time = min_t(u64, commit_time,
 		commit_time = min_t(u64, commit_time,
-				    1000*jiffies_to_usecs(1));
+				    1000*journal->j_max_batch_time);
 
 
 		if (trans_time < commit_time) {
 		if (trans_time < commit_time) {
 			ktime_t expires = ktime_add_ns(ktime_get(),
 			ktime_t expires = ktime_add_ns(ktime_get(),

+ 8 - 0
include/linux/jbd2.h

@@ -956,6 +956,14 @@ struct journal_s
 	 */
 	 */
 	u64			j_average_commit_time;
 	u64			j_average_commit_time;
 
 
+	/*
+	 * minimum and maximum times that we should wait for
+	 * additional filesystem operations to get batched into a
+	 * synchronous handle in microseconds
+	 */
+	u32			j_min_batch_time;
+	u32			j_max_batch_time;
+
 	/* This function is called when a transaction is closed */
 	/* This function is called when a transaction is closed */
 	void			(*j_commit_callback)(journal_t *,
 	void			(*j_commit_callback)(journal_t *,
 						     transaction_t *);
 						     transaction_t *);