فهرست منبع

[PATCH] Kill PF_SYNCWRITE flag

A process flag to indicate whether we are doing sync io is incredibly
ugly. It also causes performance problems when one does a lot of async
io and then proceeds to sync it. Part of the io will go out as async,
and the other part as sync. This causes a disconnect between the
previously submitted io and the synced io. For io schedulers such as CFQ,
this causes lost merges and suboptimal scheduling behaviour.

Remove PF_SYNCWRITE completely from the fsync/msync paths, and let
the O_DIRECT path indicate directly that its writes are sync by
submitting them with WRITE_SYNC instead.

Signed-off-by: Jens Axboe <axboe@suse.de>
Jens Axboe 19 سال پیش
والد
کامیت
b31dc66a54
10 فایلهای تغییر یافته به همراه 20 افزوده شده و 29 حذف شده
  1. 1 1
      block/as-iosched.c
  2. 1 3
      block/cfq-iosched.c
  3. 3 0
      block/ll_rw_blk.c
  4. 0 2
      drivers/usb/gadget/file_storage.c
  5. 0 2
      fs/buffer.c
  6. 8 10
      fs/direct-io.c
  7. 0 2
      fs/fs-writeback.c
  8. 2 0
      include/linux/blkdev.h
  9. 5 6
      include/linux/sched.h
  10. 0 3
      mm/msync.c

+ 1 - 1
block/as-iosched.c

@@ -1339,7 +1339,7 @@ static void as_add_request(request_queue_t *q, struct request *rq)
 	arq->state = AS_RQ_NEW;
 	arq->state = AS_RQ_NEW;
 
 
 	if (rq_data_dir(arq->request) == READ
 	if (rq_data_dir(arq->request) == READ
-			|| current->flags&PF_SYNCWRITE)
+			|| (arq->request->flags & REQ_RW_SYNC))
 		arq->is_sync = 1;
 		arq->is_sync = 1;
 	else
 	else
 		arq->is_sync = 0;
 		arq->is_sync = 0;

+ 1 - 3
block/cfq-iosched.c

@@ -277,8 +277,6 @@ static struct cfq_queue *cfq_find_cfq_hash(struct cfq_data *, unsigned int, unsi
 static void cfq_dispatch_insert(request_queue_t *, struct cfq_rq *);
 static void cfq_dispatch_insert(request_queue_t *, struct cfq_rq *);
 static struct cfq_queue *cfq_get_queue(struct cfq_data *cfqd, unsigned int key, struct task_struct *tsk, gfp_t gfp_mask);
 static struct cfq_queue *cfq_get_queue(struct cfq_data *cfqd, unsigned int key, struct task_struct *tsk, gfp_t gfp_mask);
 
 
-#define process_sync(tsk)	((tsk)->flags & PF_SYNCWRITE)
-
 /*
 /*
  * lots of deadline iosched dupes, can be abstracted later...
  * lots of deadline iosched dupes, can be abstracted later...
  */
  */
@@ -334,7 +332,7 @@ static int cfq_queue_empty(request_queue_t *q)
 
 
 static inline pid_t cfq_queue_pid(struct task_struct *task, int rw)
 static inline pid_t cfq_queue_pid(struct task_struct *task, int rw)
 {
 {
-	if (rw == READ || process_sync(task))
+	if (rw == READ || rw == WRITE_SYNC)
 		return task->pid;
 		return task->pid;
 
 
 	return CFQ_KEY_ASYNC;
 	return CFQ_KEY_ASYNC;

+ 3 - 0
block/ll_rw_blk.c

@@ -2827,6 +2827,9 @@ static void init_request_from_bio(struct request *req, struct bio *bio)
 	if (unlikely(bio_barrier(bio)))
 	if (unlikely(bio_barrier(bio)))
 		req->flags |= (REQ_HARDBARRIER | REQ_NOMERGE);
 		req->flags |= (REQ_HARDBARRIER | REQ_NOMERGE);
 
 
+	if (bio_sync(bio))
+		req->flags |= REQ_RW_SYNC;
+
 	req->errors = 0;
 	req->errors = 0;
 	req->hard_sector = req->sector = bio->bi_sector;
 	req->hard_sector = req->sector = bio->bi_sector;
 	req->hard_nr_sectors = req->nr_sectors = bio_sectors(bio);
 	req->hard_nr_sectors = req->nr_sectors = bio_sectors(bio);

+ 0 - 2
drivers/usb/gadget/file_storage.c

@@ -1906,7 +1906,6 @@ static int fsync_sub(struct lun *curlun)
 
 
 	inode = filp->f_dentry->d_inode;
 	inode = filp->f_dentry->d_inode;
 	mutex_lock(&inode->i_mutex);
 	mutex_lock(&inode->i_mutex);
-	current->flags |= PF_SYNCWRITE;
 	rc = filemap_fdatawrite(inode->i_mapping);
 	rc = filemap_fdatawrite(inode->i_mapping);
 	err = filp->f_op->fsync(filp, filp->f_dentry, 1);
 	err = filp->f_op->fsync(filp, filp->f_dentry, 1);
 	if (!rc)
 	if (!rc)
@@ -1914,7 +1913,6 @@ static int fsync_sub(struct lun *curlun)
 	err = filemap_fdatawait(inode->i_mapping);
 	err = filemap_fdatawait(inode->i_mapping);
 	if (!rc)
 	if (!rc)
 		rc = err;
 		rc = err;
-	current->flags &= ~PF_SYNCWRITE;
 	mutex_unlock(&inode->i_mutex);
 	mutex_unlock(&inode->i_mutex);
 	VLDBG(curlun, "fdatasync -> %d\n", rc);
 	VLDBG(curlun, "fdatasync -> %d\n", rc);
 	return rc;
 	return rc;

+ 0 - 2
fs/buffer.c

@@ -331,7 +331,6 @@ long do_fsync(struct file *file, int datasync)
 		goto out;
 		goto out;
 	}
 	}
 
 
-	current->flags |= PF_SYNCWRITE;
 	ret = filemap_fdatawrite(mapping);
 	ret = filemap_fdatawrite(mapping);
 
 
 	/*
 	/*
@@ -346,7 +345,6 @@ long do_fsync(struct file *file, int datasync)
 	err = filemap_fdatawait(mapping);
 	err = filemap_fdatawait(mapping);
 	if (!ret)
 	if (!ret)
 		ret = err;
 		ret = err;
-	current->flags &= ~PF_SYNCWRITE;
 out:
 out:
 	return ret;
 	return ret;
 }
 }

+ 8 - 10
fs/direct-io.c

@@ -162,7 +162,7 @@ static int dio_refill_pages(struct dio *dio)
 		NULL);				/* vmas */
 		NULL);				/* vmas */
 	up_read(&current->mm->mmap_sem);
 	up_read(&current->mm->mmap_sem);
 
 
-	if (ret < 0 && dio->blocks_available && (dio->rw == WRITE)) {
+	if (ret < 0 && dio->blocks_available && (dio->rw & WRITE)) {
 		struct page *page = ZERO_PAGE(dio->curr_user_address);
 		struct page *page = ZERO_PAGE(dio->curr_user_address);
 		/*
 		/*
 		 * A memory fault, but the filesystem has some outstanding
 		 * A memory fault, but the filesystem has some outstanding
@@ -535,7 +535,7 @@ static int get_more_blocks(struct dio *dio)
 		map_bh->b_state = 0;
 		map_bh->b_state = 0;
 		map_bh->b_size = fs_count << dio->inode->i_blkbits;
 		map_bh->b_size = fs_count << dio->inode->i_blkbits;
 
 
-		create = dio->rw == WRITE;
+		create = dio->rw & WRITE;
 		if (dio->lock_type == DIO_LOCKING) {
 		if (dio->lock_type == DIO_LOCKING) {
 			if (dio->block_in_file < (i_size_read(dio->inode) >>
 			if (dio->block_in_file < (i_size_read(dio->inode) >>
 							dio->blkbits))
 							dio->blkbits))
@@ -867,7 +867,7 @@ do_holes:
 				loff_t i_size_aligned;
 				loff_t i_size_aligned;
 
 
 				/* AKPM: eargh, -ENOTBLK is a hack */
 				/* AKPM: eargh, -ENOTBLK is a hack */
-				if (dio->rw == WRITE) {
+				if (dio->rw & WRITE) {
 					page_cache_release(page);
 					page_cache_release(page);
 					return -ENOTBLK;
 					return -ENOTBLK;
 				}
 				}
@@ -1045,7 +1045,7 @@ direct_io_worker(int rw, struct kiocb *iocb, struct inode *inode,
 		}
 		}
 	} /* end iovec loop */
 	} /* end iovec loop */
 
 
-	if (ret == -ENOTBLK && rw == WRITE) {
+	if (ret == -ENOTBLK && (rw & WRITE)) {
 		/*
 		/*
 		 * The remaining part of the request will be
 		 * The remaining part of the request will be
 		 * be handled by buffered I/O when we return
 		 * be handled by buffered I/O when we return
@@ -1089,7 +1089,7 @@ direct_io_worker(int rw, struct kiocb *iocb, struct inode *inode,
 	if (dio->is_async) {
 	if (dio->is_async) {
 		int should_wait = 0;
 		int should_wait = 0;
 
 
-		if (dio->result < dio->size && rw == WRITE) {
+		if (dio->result < dio->size && (rw & WRITE)) {
 			dio->waiter = current;
 			dio->waiter = current;
 			should_wait = 1;
 			should_wait = 1;
 		}
 		}
@@ -1142,7 +1142,7 @@ direct_io_worker(int rw, struct kiocb *iocb, struct inode *inode,
 			ret = transferred;
 			ret = transferred;
 
 
 		/* We could have also come here on an AIO file extend */
 		/* We could have also come here on an AIO file extend */
-		if (!is_sync_kiocb(iocb) && rw == WRITE &&
+		if (!is_sync_kiocb(iocb) && (rw & WRITE) &&
 		    ret >= 0 && dio->result == dio->size)
 		    ret >= 0 && dio->result == dio->size)
 			/*
 			/*
 			 * For AIO writes where we have completed the
 			 * For AIO writes where we have completed the
@@ -1194,7 +1194,7 @@ __blockdev_direct_IO(int rw, struct kiocb *iocb, struct inode *inode,
 	int acquire_i_mutex = 0;
 	int acquire_i_mutex = 0;
 
 
 	if (rw & WRITE)
 	if (rw & WRITE)
-		current->flags |= PF_SYNCWRITE;
+		rw = WRITE_SYNC;
 
 
 	if (bdev)
 	if (bdev)
 		bdev_blkbits = blksize_bits(bdev_hardsect_size(bdev));
 		bdev_blkbits = blksize_bits(bdev_hardsect_size(bdev));
@@ -1270,7 +1270,7 @@ __blockdev_direct_IO(int rw, struct kiocb *iocb, struct inode *inode,
 	 * even for AIO, we need to wait for i/o to complete before
 	 * even for AIO, we need to wait for i/o to complete before
 	 * returning in this case.
 	 * returning in this case.
 	 */
 	 */
-	dio->is_async = !is_sync_kiocb(iocb) && !((rw == WRITE) &&
+	dio->is_async = !is_sync_kiocb(iocb) && !((rw & WRITE) &&
 		(end > i_size_read(inode)));
 		(end > i_size_read(inode)));
 
 
 	retval = direct_io_worker(rw, iocb, inode, iov, offset,
 	retval = direct_io_worker(rw, iocb, inode, iov, offset,
@@ -1284,8 +1284,6 @@ out:
 		mutex_unlock(&inode->i_mutex);
 		mutex_unlock(&inode->i_mutex);
 	else if (acquire_i_mutex)
 	else if (acquire_i_mutex)
 		mutex_lock(&inode->i_mutex);
 		mutex_lock(&inode->i_mutex);
-	if (rw & WRITE)
-		current->flags &= ~PF_SYNCWRITE;
 	return retval;
 	return retval;
 }
 }
 EXPORT_SYMBOL(__blockdev_direct_IO);
 EXPORT_SYMBOL(__blockdev_direct_IO);

+ 0 - 2
fs/fs-writeback.c

@@ -623,7 +623,6 @@ int generic_osync_inode(struct inode *inode, struct address_space *mapping, int
 	int need_write_inode_now = 0;
 	int need_write_inode_now = 0;
 	int err2;
 	int err2;
 
 
-	current->flags |= PF_SYNCWRITE;
 	if (what & OSYNC_DATA)
 	if (what & OSYNC_DATA)
 		err = filemap_fdatawrite(mapping);
 		err = filemap_fdatawrite(mapping);
 	if (what & (OSYNC_METADATA|OSYNC_DATA)) {
 	if (what & (OSYNC_METADATA|OSYNC_DATA)) {
@@ -636,7 +635,6 @@ int generic_osync_inode(struct inode *inode, struct address_space *mapping, int
 		if (!err)
 		if (!err)
 			err = err2;
 			err = err2;
 	}
 	}
-	current->flags &= ~PF_SYNCWRITE;
 
 
 	spin_lock(&inode_lock);
 	spin_lock(&inode_lock);
 	if ((inode->i_state & I_DIRTY) &&
 	if ((inode->i_state & I_DIRTY) &&

+ 2 - 0
include/linux/blkdev.h

@@ -241,6 +241,7 @@ enum rq_flag_bits {
 	__REQ_PM_RESUME,	/* resume request */
 	__REQ_PM_RESUME,	/* resume request */
 	__REQ_PM_SHUTDOWN,	/* shutdown request */
 	__REQ_PM_SHUTDOWN,	/* shutdown request */
 	__REQ_ORDERED_COLOR,	/* is before or after barrier */
 	__REQ_ORDERED_COLOR,	/* is before or after barrier */
+	__REQ_RW_SYNC,		/* request is sync (O_DIRECT) */
 	__REQ_NR_BITS,		/* stops here */
 	__REQ_NR_BITS,		/* stops here */
 };
 };
 
 
@@ -270,6 +271,7 @@ enum rq_flag_bits {
 #define REQ_PM_RESUME	(1 << __REQ_PM_RESUME)
 #define REQ_PM_RESUME	(1 << __REQ_PM_RESUME)
 #define REQ_PM_SHUTDOWN	(1 << __REQ_PM_SHUTDOWN)
 #define REQ_PM_SHUTDOWN	(1 << __REQ_PM_SHUTDOWN)
 #define REQ_ORDERED_COLOR	(1 << __REQ_ORDERED_COLOR)
 #define REQ_ORDERED_COLOR	(1 << __REQ_ORDERED_COLOR)
+#define REQ_RW_SYNC	(1 << __REQ_RW_SYNC)
 
 
 /*
 /*
  * State information carried for REQ_PM_SUSPEND and REQ_PM_RESUME
  * State information carried for REQ_PM_SUSPEND and REQ_PM_RESUME

+ 5 - 6
include/linux/sched.h

@@ -941,12 +941,11 @@ static inline void put_task_struct(struct task_struct *t)
 #define PF_KSWAPD	0x00040000	/* I am kswapd */
 #define PF_KSWAPD	0x00040000	/* I am kswapd */
 #define PF_SWAPOFF	0x00080000	/* I am in swapoff */
 #define PF_SWAPOFF	0x00080000	/* I am in swapoff */
 #define PF_LESS_THROTTLE 0x00100000	/* Throttle me less: I clean memory */
 #define PF_LESS_THROTTLE 0x00100000	/* Throttle me less: I clean memory */
-#define PF_SYNCWRITE	0x00200000	/* I am doing a sync write */
-#define PF_BORROWED_MM	0x00400000	/* I am a kthread doing use_mm */
-#define PF_RANDOMIZE	0x00800000	/* randomize virtual address space */
-#define PF_SWAPWRITE	0x01000000	/* Allowed to write to swap */
-#define PF_SPREAD_PAGE	0x04000000	/* Spread page cache over cpuset */
-#define PF_SPREAD_SLAB	0x08000000	/* Spread some slab caches over cpuset */
+#define PF_BORROWED_MM	0x00200000	/* I am a kthread doing use_mm */
+#define PF_RANDOMIZE	0x00400000	/* randomize virtual address space */
+#define PF_SWAPWRITE	0x00800000	/* Allowed to write to swap */
+#define PF_SPREAD_PAGE	0x01000000	/* Spread page cache over cpuset */
+#define PF_SPREAD_SLAB	0x02000000	/* Spread some slab caches over cpuset */
 #define PF_MEMPOLICY	0x10000000	/* Non-default NUMA mempolicy */
 #define PF_MEMPOLICY	0x10000000	/* Non-default NUMA mempolicy */
 
 
 /*
 /*

+ 0 - 3
mm/msync.c

@@ -170,8 +170,6 @@ asmlinkage long sys_msync(unsigned long start, size_t len, int flags)
 	 * just ignore them, but return -ENOMEM at the end.
 	 * just ignore them, but return -ENOMEM at the end.
 	 */
 	 */
 	down_read(&current->mm->mmap_sem);
 	down_read(&current->mm->mmap_sem);
-	if (flags & MS_SYNC)
-		current->flags |= PF_SYNCWRITE;
 	vma = find_vma(current->mm, start);
 	vma = find_vma(current->mm, start);
 	if (!vma) {
 	if (!vma) {
 		error = -ENOMEM;
 		error = -ENOMEM;
@@ -228,7 +226,6 @@ asmlinkage long sys_msync(unsigned long start, size_t len, int flags)
 		}
 		}
 	} while (vma && !done);
 	} while (vma && !done);
 out_unlock:
 out_unlock:
-	current->flags &= ~PF_SYNCWRITE;
 	up_read(&current->mm->mmap_sem);
 	up_read(&current->mm->mmap_sem);
 out:
 out:
 	return error;
 	return error;