|
@@ -35,6 +35,7 @@
|
|
|
#include <linux/mount.h>
|
|
|
#include <linux/namei.h>
|
|
|
#include <linux/bio.h> /* struct bio */
|
|
|
+#include <linux/buffer_head.h> /* various write calls */
|
|
|
|
|
|
#include "blocklayout.h"
|
|
|
|
|
@@ -79,12 +80,8 @@ static int is_hole(struct pnfs_block_extent *be, sector_t isect)
|
|
|
*/
|
|
|
static int is_writable(struct pnfs_block_extent *be, sector_t isect)
|
|
|
{
|
|
|
- if (be->be_state == PNFS_BLOCK_READWRITE_DATA)
|
|
|
- return 1;
|
|
|
- else if (be->be_state != PNFS_BLOCK_INVALID_DATA)
|
|
|
- return 0;
|
|
|
- else
|
|
|
- return bl_is_sector_init(be->be_inval, isect);
|
|
|
+ return (be->be_state == PNFS_BLOCK_READWRITE_DATA ||
|
|
|
+ be->be_state == PNFS_BLOCK_INVALID_DATA);
|
|
|
}
|
|
|
|
|
|
/* The data we are handed might be spread across several bios. We need
|
|
@@ -353,6 +350,31 @@ static void mark_extents_written(struct pnfs_block_layout *bl,
|
|
|
}
|
|
|
}
|
|
|
|
|
|
+static void bl_end_io_write_zero(struct bio *bio, int err)
|
|
|
+{
|
|
|
+ struct parallel_io *par = bio->bi_private;
|
|
|
+ const int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags);
|
|
|
+ struct bio_vec *bvec = bio->bi_io_vec + bio->bi_vcnt - 1;
|
|
|
+ struct nfs_write_data *wdata = (struct nfs_write_data *)par->data;
|
|
|
+
|
|
|
+ do {
|
|
|
+ struct page *page = bvec->bv_page;
|
|
|
+
|
|
|
+ if (--bvec >= bio->bi_io_vec)
|
|
|
+ prefetchw(&bvec->bv_page->flags);
|
|
|
+ /* This is the zeroing page we added */
|
|
|
+ end_page_writeback(page);
|
|
|
+ page_cache_release(page);
|
|
|
+ } while (bvec >= bio->bi_io_vec);
|
|
|
+ if (!uptodate) {
|
|
|
+ if (!wdata->pnfs_error)
|
|
|
+ wdata->pnfs_error = -EIO;
|
|
|
+ bl_set_lo_fail(wdata->lseg);
|
|
|
+ }
|
|
|
+ bio_put(bio);
|
|
|
+ put_parallel(par);
|
|
|
+}
|
|
|
+
|
|
|
/* This is basically copied from mpage_end_io_read */
|
|
|
static void bl_end_io_write(struct bio *bio, int err)
|
|
|
{
|
|
@@ -379,11 +401,8 @@ static void bl_write_cleanup(struct work_struct *work)
|
|
|
dprintk("%s enter\n", __func__);
|
|
|
task = container_of(work, struct rpc_task, u.tk_work);
|
|
|
wdata = container_of(task, struct nfs_write_data, task);
|
|
|
- if (!wdata->task.tk_status) {
|
|
|
+ if (!wdata->pnfs_error) {
|
|
|
/* Marks for LAYOUTCOMMIT */
|
|
|
- /* BUG - this should be called after each bio, not after
|
|
|
- * all finish, unless have some way of storing success/failure
|
|
|
- */
|
|
|
mark_extents_written(BLK_LSEG2EXT(wdata->lseg),
|
|
|
wdata->args.offset, wdata->args.count);
|
|
|
}
|
|
@@ -391,38 +410,110 @@ static void bl_write_cleanup(struct work_struct *work)
|
|
|
}
|
|
|
|
|
|
/* Called when last of bios associated with a bl_write_pagelist call finishes */
|
|
|
-static void
|
|
|
-bl_end_par_io_write(void *data)
|
|
|
+static void bl_end_par_io_write(void *data)
|
|
|
{
|
|
|
struct nfs_write_data *wdata = data;
|
|
|
|
|
|
- /* STUB - ignoring error handling */
|
|
|
wdata->task.tk_status = 0;
|
|
|
wdata->verf.committed = NFS_FILE_SYNC;
|
|
|
INIT_WORK(&wdata->task.u.tk_work, bl_write_cleanup);
|
|
|
schedule_work(&wdata->task.u.tk_work);
|
|
|
}
|
|
|
|
|
|
+/* FIXME STUB - mark intersection of layout and page as bad, so is not
|
|
|
+ * used again.
|
|
|
+ */
|
|
|
+static void mark_bad_read(void)
|
|
|
+{
|
|
|
+ return;
|
|
|
+}
|
|
|
+
|
|
|
+/*
|
|
|
+ * map_block: map a requested I/0 block (isect) into an offset in the LVM
|
|
|
+ * block_device
|
|
|
+ */
|
|
|
+static void
|
|
|
+map_block(struct buffer_head *bh, sector_t isect, struct pnfs_block_extent *be)
|
|
|
+{
|
|
|
+ dprintk("%s enter be=%p\n", __func__, be);
|
|
|
+
|
|
|
+ set_buffer_mapped(bh);
|
|
|
+ bh->b_bdev = be->be_mdev;
|
|
|
+ bh->b_blocknr = (isect - be->be_f_offset + be->be_v_offset) >>
|
|
|
+ (be->be_mdev->bd_inode->i_blkbits - SECTOR_SHIFT);
|
|
|
+
|
|
|
+ dprintk("%s isect %llu, bh->b_blocknr %ld, using bsize %Zd\n",
|
|
|
+ __func__, (unsigned long long)isect, (long)bh->b_blocknr,
|
|
|
+ bh->b_size);
|
|
|
+ return;
|
|
|
+}
|
|
|
+
|
|
|
+/* Given an unmapped page, zero it or read in page for COW, page is locked
|
|
|
+ * by caller.
|
|
|
+ */
|
|
|
+static int
|
|
|
+init_page_for_write(struct page *page, struct pnfs_block_extent *cow_read)
|
|
|
+{
|
|
|
+ struct buffer_head *bh = NULL;
|
|
|
+ int ret = 0;
|
|
|
+ sector_t isect;
|
|
|
+
|
|
|
+ dprintk("%s enter, %p\n", __func__, page);
|
|
|
+ BUG_ON(PageUptodate(page));
|
|
|
+ if (!cow_read) {
|
|
|
+ zero_user_segment(page, 0, PAGE_SIZE);
|
|
|
+ SetPageUptodate(page);
|
|
|
+ goto cleanup;
|
|
|
+ }
|
|
|
+
|
|
|
+ bh = alloc_page_buffers(page, PAGE_CACHE_SIZE, 0);
|
|
|
+ if (!bh) {
|
|
|
+ ret = -ENOMEM;
|
|
|
+ goto cleanup;
|
|
|
+ }
|
|
|
+
|
|
|
+ isect = (sector_t) page->index << PAGE_CACHE_SECTOR_SHIFT;
|
|
|
+ map_block(bh, isect, cow_read);
|
|
|
+ if (!bh_uptodate_or_lock(bh))
|
|
|
+ ret = bh_submit_read(bh);
|
|
|
+ if (ret)
|
|
|
+ goto cleanup;
|
|
|
+ SetPageUptodate(page);
|
|
|
+
|
|
|
+cleanup:
|
|
|
+ bl_put_extent(cow_read);
|
|
|
+ if (bh)
|
|
|
+ free_buffer_head(bh);
|
|
|
+ if (ret) {
|
|
|
+ /* Need to mark layout with bad read...should now
|
|
|
+ * just use nfs4 for reads and writes.
|
|
|
+ */
|
|
|
+ mark_bad_read();
|
|
|
+ }
|
|
|
+ return ret;
|
|
|
+}
|
|
|
+
|
|
|
static enum pnfs_try_status
|
|
|
bl_write_pagelist(struct nfs_write_data *wdata, int sync)
|
|
|
{
|
|
|
- int i;
|
|
|
+ int i, ret, npg_zero, pg_index, last = 0;
|
|
|
struct bio *bio = NULL;
|
|
|
- struct pnfs_block_extent *be = NULL;
|
|
|
- sector_t isect, extent_length = 0;
|
|
|
+ struct pnfs_block_extent *be = NULL, *cow_read = NULL;
|
|
|
+ sector_t isect, last_isect = 0, extent_length = 0;
|
|
|
struct parallel_io *par;
|
|
|
loff_t offset = wdata->args.offset;
|
|
|
size_t count = wdata->args.count;
|
|
|
struct page **pages = wdata->args.pages;
|
|
|
- int pg_index = wdata->args.pgbase >> PAGE_CACHE_SHIFT;
|
|
|
+ struct page *page;
|
|
|
+ pgoff_t index;
|
|
|
+ u64 temp;
|
|
|
+ int npg_per_block =
|
|
|
+ NFS_SERVER(wdata->inode)->pnfs_blksize >> PAGE_CACHE_SHIFT;
|
|
|
|
|
|
dprintk("%s enter, %Zu@%lld\n", __func__, count, offset);
|
|
|
/* At this point, wdata->pages is a (sequential) list of nfs_pages.
|
|
|
- * We want to write each, and if there is an error remove it from
|
|
|
- * list and call
|
|
|
- * nfs_retry_request(req) to have it redone using nfs.
|
|
|
- * QUEST? Do as block or per req? Think have to do per block
|
|
|
- * as part of end_bio
|
|
|
+ * We want to write each, and if there is an error set pnfs_error
|
|
|
+ * to have it redone using nfs.
|
|
|
*/
|
|
|
par = alloc_parallel(wdata);
|
|
|
if (!par)
|
|
@@ -433,7 +524,91 @@ bl_write_pagelist(struct nfs_write_data *wdata, int sync)
|
|
|
/* At this point, have to be more careful with error handling */
|
|
|
|
|
|
isect = (sector_t) ((offset & (long)PAGE_CACHE_MASK) >> SECTOR_SHIFT);
|
|
|
- for (i = pg_index; i < wdata->npages ; i++) {
|
|
|
+ be = bl_find_get_extent(BLK_LSEG2EXT(wdata->lseg), isect, &cow_read);
|
|
|
+ if (!be || !is_writable(be, isect)) {
|
|
|
+ dprintk("%s no matching extents!\n", __func__);
|
|
|
+ wdata->pnfs_error = -EINVAL;
|
|
|
+ goto out;
|
|
|
+ }
|
|
|
+
|
|
|
+ /* First page inside INVALID extent */
|
|
|
+ if (be->be_state == PNFS_BLOCK_INVALID_DATA) {
|
|
|
+ temp = offset >> PAGE_CACHE_SHIFT;
|
|
|
+ npg_zero = do_div(temp, npg_per_block);
|
|
|
+ isect = (sector_t) (((offset - npg_zero * PAGE_CACHE_SIZE) &
|
|
|
+ (long)PAGE_CACHE_MASK) >> SECTOR_SHIFT);
|
|
|
+ extent_length = be->be_length - (isect - be->be_f_offset);
|
|
|
+
|
|
|
+fill_invalid_ext:
|
|
|
+ dprintk("%s need to zero %d pages\n", __func__, npg_zero);
|
|
|
+ for (;npg_zero > 0; npg_zero--) {
|
|
|
+ /* page ref released in bl_end_io_write_zero */
|
|
|
+ index = isect >> PAGE_CACHE_SECTOR_SHIFT;
|
|
|
+ dprintk("%s zero %dth page: index %lu isect %llu\n",
|
|
|
+ __func__, npg_zero, index,
|
|
|
+ (unsigned long long)isect);
|
|
|
+ page =
|
|
|
+ find_or_create_page(wdata->inode->i_mapping, index,
|
|
|
+ GFP_NOFS);
|
|
|
+ if (!page) {
|
|
|
+ dprintk("%s oom\n", __func__);
|
|
|
+ wdata->pnfs_error = -ENOMEM;
|
|
|
+ goto out;
|
|
|
+ }
|
|
|
+
|
|
|
+ /* PageDirty: Other will write this out
|
|
|
+ * PageWriteback: Other is writing this out
|
|
|
+ * PageUptodate: It was read before
|
|
|
+ * sector_initialized: already written out
|
|
|
+ */
|
|
|
+ if (PageDirty(page) || PageWriteback(page) ||
|
|
|
+ bl_is_sector_init(be->be_inval, isect)) {
|
|
|
+ print_page(page);
|
|
|
+ unlock_page(page);
|
|
|
+ page_cache_release(page);
|
|
|
+ goto next_page;
|
|
|
+ }
|
|
|
+ if (!PageUptodate(page)) {
|
|
|
+ /* New page, readin or zero it */
|
|
|
+ init_page_for_write(page, cow_read);
|
|
|
+ }
|
|
|
+ set_page_writeback(page);
|
|
|
+ unlock_page(page);
|
|
|
+
|
|
|
+ ret = bl_mark_sectors_init(be->be_inval, isect,
|
|
|
+ PAGE_CACHE_SECTORS,
|
|
|
+ NULL);
|
|
|
+ if (unlikely(ret)) {
|
|
|
+ dprintk("%s bl_mark_sectors_init fail %d\n",
|
|
|
+ __func__, ret);
|
|
|
+ end_page_writeback(page);
|
|
|
+ page_cache_release(page);
|
|
|
+ wdata->pnfs_error = ret;
|
|
|
+ goto out;
|
|
|
+ }
|
|
|
+ bio = bl_add_page_to_bio(bio, npg_zero, WRITE,
|
|
|
+ isect, page, be,
|
|
|
+ bl_end_io_write_zero, par);
|
|
|
+ if (IS_ERR(bio)) {
|
|
|
+ wdata->pnfs_error = PTR_ERR(bio);
|
|
|
+ goto out;
|
|
|
+ }
|
|
|
+ /* FIXME: This should be done in bi_end_io */
|
|
|
+ mark_extents_written(BLK_LSEG2EXT(wdata->lseg),
|
|
|
+ page->index << PAGE_CACHE_SHIFT,
|
|
|
+ PAGE_CACHE_SIZE);
|
|
|
+next_page:
|
|
|
+ isect += PAGE_CACHE_SECTORS;
|
|
|
+ extent_length -= PAGE_CACHE_SECTORS;
|
|
|
+ }
|
|
|
+ if (last)
|
|
|
+ goto write_done;
|
|
|
+ }
|
|
|
+ bio = bl_submit_bio(WRITE, bio);
|
|
|
+
|
|
|
+ /* Middle pages */
|
|
|
+ pg_index = wdata->args.pgbase >> PAGE_CACHE_SHIFT;
|
|
|
+ for (i = pg_index; i < wdata->npages; i++) {
|
|
|
if (!extent_length) {
|
|
|
/* We've used up the previous extent */
|
|
|
bl_put_extent(be);
|
|
@@ -442,35 +617,51 @@ bl_write_pagelist(struct nfs_write_data *wdata, int sync)
|
|
|
be = bl_find_get_extent(BLK_LSEG2EXT(wdata->lseg),
|
|
|
isect, NULL);
|
|
|
if (!be || !is_writable(be, isect)) {
|
|
|
- wdata->pnfs_error = -ENOMEM;
|
|
|
+ wdata->pnfs_error = -EINVAL;
|
|
|
goto out;
|
|
|
}
|
|
|
extent_length = be->be_length -
|
|
|
- (isect - be->be_f_offset);
|
|
|
+ (isect - be->be_f_offset);
|
|
|
}
|
|
|
- for (;;) {
|
|
|
- if (!bio) {
|
|
|
- bio = bio_alloc(GFP_NOIO, wdata->npages - i);
|
|
|
- if (!bio) {
|
|
|
- wdata->pnfs_error = -ENOMEM;
|
|
|
- goto out;
|
|
|
- }
|
|
|
- bio->bi_sector = isect - be->be_f_offset +
|
|
|
- be->be_v_offset;
|
|
|
- bio->bi_bdev = be->be_mdev;
|
|
|
- bio->bi_end_io = bl_end_io_write;
|
|
|
- bio->bi_private = par;
|
|
|
+ if (be->be_state == PNFS_BLOCK_INVALID_DATA) {
|
|
|
+ ret = bl_mark_sectors_init(be->be_inval, isect,
|
|
|
+ PAGE_CACHE_SECTORS,
|
|
|
+ NULL);
|
|
|
+ if (unlikely(ret)) {
|
|
|
+ dprintk("%s bl_mark_sectors_init fail %d\n",
|
|
|
+ __func__, ret);
|
|
|
+ wdata->pnfs_error = ret;
|
|
|
+ goto out;
|
|
|
}
|
|
|
- if (bio_add_page(bio, pages[i], PAGE_SIZE, 0))
|
|
|
- break;
|
|
|
- bio = bl_submit_bio(WRITE, bio);
|
|
|
+ }
|
|
|
+ bio = bl_add_page_to_bio(bio, wdata->npages - i, WRITE,
|
|
|
+ isect, pages[i], be,
|
|
|
+ bl_end_io_write, par);
|
|
|
+ if (IS_ERR(bio)) {
|
|
|
+ wdata->pnfs_error = PTR_ERR(bio);
|
|
|
+ goto out;
|
|
|
}
|
|
|
isect += PAGE_CACHE_SECTORS;
|
|
|
+ last_isect = isect;
|
|
|
extent_length -= PAGE_CACHE_SECTORS;
|
|
|
}
|
|
|
- wdata->res.count = (isect << SECTOR_SHIFT) - (offset);
|
|
|
- if (count < wdata->res.count)
|
|
|
+
|
|
|
+ /* Last page inside INVALID extent */
|
|
|
+ if (be->be_state == PNFS_BLOCK_INVALID_DATA) {
|
|
|
+ bio = bl_submit_bio(WRITE, bio);
|
|
|
+ temp = last_isect >> PAGE_CACHE_SECTOR_SHIFT;
|
|
|
+ npg_zero = npg_per_block - do_div(temp, npg_per_block);
|
|
|
+ if (npg_zero < npg_per_block) {
|
|
|
+ last = 1;
|
|
|
+ goto fill_invalid_ext;
|
|
|
+ }
|
|
|
+ }
|
|
|
+
|
|
|
+write_done:
|
|
|
+ wdata->res.count = (last_isect << SECTOR_SHIFT) - (offset);
|
|
|
+ if (count < wdata->res.count) {
|
|
|
wdata->res.count = count;
|
|
|
+ }
|
|
|
out:
|
|
|
bl_put_extent(be);
|
|
|
bl_submit_bio(WRITE, bio);
|