@@ -236,15 +236,21 @@ static int ceph_readpage(struct file *filp, struct page *page)
 static void finish_read(struct ceph_osd_request *req, struct ceph_msg *msg)
 {
 	struct inode *inode = req->r_inode;
+	struct ceph_osd_data *osd_data;
 	int rc = req->r_result;
 	int bytes = le32_to_cpu(msg->hdr.data_len);
+	int num_pages;
 	int i;
 
 	dout("finish_read %p req %p rc %d bytes %d\n", inode, req, rc, bytes);
 
 	/* unlock all pages, zeroing any data we didn't read */
-	for (i = 0; i < req->r_num_pages; i++, bytes -= PAGE_CACHE_SIZE) {
-		struct page *page = req->r_pages[i];
+	osd_data = osd_req_op_extent_osd_data(req, 0);
+	BUG_ON(osd_data->type != CEPH_OSD_DATA_TYPE_PAGES);
+	num_pages = calc_pages_for((u64)osd_data->alignment,
+					(u64)osd_data->length);
+	for (i = 0; i < num_pages; i++) {
+		struct page *page = osd_data->pages[i];
 
 		if (bytes < (int)PAGE_CACHE_SIZE) {
 			/* zero (remainder of) page */
@@ -257,8 +263,9 @@ static void finish_read(struct ceph_osd_request *req, struct ceph_msg *msg)
 		SetPageUptodate(page);
 		unlock_page(page);
 		page_cache_release(page);
+		bytes -= PAGE_CACHE_SIZE;
 	}
-	kfree(req->r_pages);
+	kfree(osd_data->pages);
 }
 
 static void ceph_unlock_page_vector(struct page **pages, int num_pages)
@@ -279,6 +286,7 @@ static int start_read(struct inode *inode, struct list_head *page_list, int max)
 		&ceph_inode_to_client(inode)->client->osdc;
 	struct ceph_inode_info *ci = ceph_inode(inode);
 	struct page *page = list_entry(page_list->prev, struct page, lru);
+	struct ceph_vino vino;
 	struct ceph_osd_request *req;
 	u64 off;
 	u64 len;
@@ -303,18 +311,17 @@ static int start_read(struct inode *inode, struct list_head *page_list, int max)
 	len = nr_pages << PAGE_CACHE_SHIFT;
 	dout("start_read %p nr_pages %d is %lld~%lld\n", inode, nr_pages,
 	     off, len);
-
-	req = ceph_osdc_new_request(osdc, &ci->i_layout, ceph_vino(inode),
-				    off, &len,
-				    CEPH_OSD_OP_READ, CEPH_OSD_FLAG_READ,
-				    NULL, 0,
+	vino = ceph_vino(inode);
+	req = ceph_osdc_new_request(osdc, &ci->i_layout, vino, off, &len,
+				    1, CEPH_OSD_OP_READ,
+				    CEPH_OSD_FLAG_READ, NULL,
 				    ci->i_truncate_seq, ci->i_truncate_size,
-				    NULL, false, 0);
+				    false);
 	if (IS_ERR(req))
 		return PTR_ERR(req);
 
 	/* build page vector */
-	nr_pages = len >> PAGE_CACHE_SHIFT;
+	nr_pages = calc_pages_for(0, len);
 	pages = kmalloc(sizeof(*pages) * nr_pages, GFP_NOFS);
 	ret = -ENOMEM;
 	if (!pages)
@@ -336,11 +343,12 @@ static int start_read(struct inode *inode, struct list_head *page_list, int max)
 		}
 		pages[i] = page;
 	}
-	req->r_pages = pages;
-	req->r_num_pages = nr_pages;
+	osd_req_op_extent_osd_data_pages(req, 0, pages, len, 0, false, false);
 	req->r_callback = finish_read;
 	req->r_inode = inode;
 
+	ceph_osdc_build_request(req, off, NULL, vino.snap, NULL);
+
 	dout("start_read %p starting %p %lld~%lld\n", inode, req, off, len);
 	ret = ceph_osdc_start_request(osdc, req, false);
 	if (ret < 0)
@@ -373,7 +381,8 @@ static int ceph_readpages(struct file *file, struct address_space *mapping,
 		max = (fsc->mount_options->rsize + PAGE_CACHE_SIZE - 1)
 			>> PAGE_SHIFT;
 
-	dout("readpages %p file %p nr_pages %d max %d\n", inode, file, nr_pages,
+	dout("readpages %p file %p nr_pages %d max %d\n", inode,
+	     file, nr_pages,
 	     max);
 	while (!list_empty(page_list)) {
 		rc = start_read(inode, page_list, max);
@@ -548,17 +557,23 @@ static void writepages_finish(struct ceph_osd_request *req,
 {
 	struct inode *inode = req->r_inode;
 	struct ceph_inode_info *ci = ceph_inode(inode);
+	struct ceph_osd_data *osd_data;
 	unsigned wrote;
 	struct page *page;
+	int num_pages;
 	int i;
 	struct ceph_snap_context *snapc = req->r_snapc;
 	struct address_space *mapping = inode->i_mapping;
 	int rc = req->r_result;
-	u64 bytes = le64_to_cpu(req->r_request_ops[0].extent.length);
+	u64 bytes = req->r_ops[0].extent.length;
 	struct ceph_fs_client *fsc = ceph_inode_to_client(inode);
 	long writeback_stat;
 	unsigned issued = ceph_caps_issued(ci);
 
+	osd_data = osd_req_op_extent_osd_data(req, 0);
+	BUG_ON(osd_data->type != CEPH_OSD_DATA_TYPE_PAGES);
+	num_pages = calc_pages_for((u64)osd_data->alignment,
+					(u64)osd_data->length);
 	if (rc >= 0) {
 		/*
 		 * Assume we wrote the pages we originally sent. The
@@ -566,7 +581,7 @@ static void writepages_finish(struct ceph_osd_request *req,
 		 * raced with a truncation and was adjusted at the osd,
 		 * so don't believe the reply.
 		 */
-		wrote = req->r_num_pages;
+		wrote = num_pages;
 	} else {
 		wrote = 0;
 		mapping_set_error(mapping, rc);
@@ -575,8 +590,8 @@ static void writepages_finish(struct ceph_osd_request *req,
 	     inode, rc, bytes, wrote);
 
 	/* clean all pages */
-	for (i = 0; i < req->r_num_pages; i++) {
-		page = req->r_pages[i];
+	for (i = 0; i < num_pages; i++) {
+		page = osd_data->pages[i];
 		BUG_ON(!page);
 		WARN_ON(!PageUptodate(page));
 
@@ -605,32 +620,34 @@ static void writepages_finish(struct ceph_osd_request *req,
 		unlock_page(page);
 	}
 	dout("%p wrote+cleaned %d pages\n", inode, wrote);
-	ceph_put_wrbuffer_cap_refs(ci, req->r_num_pages, snapc);
+	ceph_put_wrbuffer_cap_refs(ci, num_pages, snapc);
 
-	ceph_release_pages(req->r_pages, req->r_num_pages);
-	if (req->r_pages_from_pool)
-		mempool_free(req->r_pages,
+	ceph_release_pages(osd_data->pages, num_pages);
+	if (osd_data->pages_from_pool)
+		mempool_free(osd_data->pages,
 			     ceph_sb_to_client(inode->i_sb)->wb_pagevec_pool);
 	else
-		kfree(req->r_pages);
+		kfree(osd_data->pages);
 	ceph_osdc_put_request(req);
 }
 
-/*
- * allocate a page vec, either directly, or if necessary, via a the
- * mempool. we avoid the mempool if we can because req->r_num_pages
- * may be less than the maximum write size.
- */
-static void alloc_page_vec(struct ceph_fs_client *fsc,
-			   struct ceph_osd_request *req)
+static struct ceph_osd_request *
+ceph_writepages_osd_request(struct inode *inode, u64 offset, u64 *len,
+				struct ceph_snap_context *snapc, int num_ops)
 {
-	req->r_pages = kmalloc(sizeof(struct page *) * req->r_num_pages,
-			       GFP_NOFS);
-	if (!req->r_pages) {
-		req->r_pages = mempool_alloc(fsc->wb_pagevec_pool, GFP_NOFS);
-		req->r_pages_from_pool = 1;
-		WARN_ON(!req->r_pages);
-	}
+	struct ceph_fs_client *fsc;
+	struct ceph_inode_info *ci;
+	struct ceph_vino vino;
+
+	fsc = ceph_inode_to_client(inode);
+	ci = ceph_inode(inode);
+	vino = ceph_vino(inode);
+	/* BUG_ON(vino.snap != CEPH_NOSNAP); */
+
+	return ceph_osdc_new_request(&fsc->client->osdc, &ci->i_layout,
+			vino, offset, len, num_ops, CEPH_OSD_OP_WRITE,
+			CEPH_OSD_FLAG_WRITE|CEPH_OSD_FLAG_ONDISK,
+			snapc, ci->i_truncate_seq, ci->i_truncate_size, true);
 }
 
 /*
@@ -653,7 +670,7 @@ static int ceph_writepages_start(struct address_space *mapping,
 	unsigned wsize = 1 << inode->i_blkbits;
 	struct ceph_osd_request *req = NULL;
 	int do_sync;
-	u64 snap_size = 0;
+	u64 snap_size;
 
 	/*
 	 * Include a 'sync' in the OSD request if this is a data
@@ -699,6 +716,7 @@ static int ceph_writepages_start(struct address_space *mapping,
 retry:
 	/* find oldest snap context with dirty data */
 	ceph_put_snap_context(snapc);
+	snap_size = 0;
 	snapc = get_oldest_context(inode, &snap_size);
 	if (!snapc) {
 		/* hmm, why does writepages get called when there
@@ -706,6 +724,8 @@ retry:
 		dout(" no snap context with dirty data?\n");
 		goto out;
 	}
+	if (snap_size == 0)
+		snap_size = i_size_read(inode);
 	dout(" oldest snapc is %p seq %lld (%d snaps)\n",
 	     snapc, snapc->seq, snapc->num_snaps);
 	if (last_snapc && snapc != last_snapc) {
@@ -718,10 +738,14 @@ retry:
 	last_snapc = snapc;
 
 	while (!done && index <= end) {
+		int num_ops = do_sync ? 2 : 1;
+		struct ceph_vino vino;
 		unsigned i;
 		int first;
 		pgoff_t next;
 		int pvec_pages, locked_pages;
+		struct page **pages = NULL;
+		mempool_t *pool = NULL;	/* Becomes non-null if mempool used */
 		struct page *page;
 		int want;
 		u64 offset, len;
@@ -773,11 +797,8 @@ get_more_pages:
 				dout("waiting on writeback %p\n", page);
 				wait_on_page_writeback(page);
 			}
-			if ((snap_size && page_offset(page) > snap_size) ||
-			    (!snap_size &&
-			     page_offset(page) > i_size_read(inode))) {
-				dout("%p page eof %llu\n", page, snap_size ?
-				     snap_size : i_size_read(inode));
+			if (page_offset(page) >= snap_size) {
+				dout("%p page eof %llu\n", page, snap_size);
 				done = 1;
 				unlock_page(page);
 				break;
@@ -805,22 +826,23 @@ get_more_pages:
 				break;
 			}
 
-			/* ok */
+			/*
+			 * We have something to write. If this is
+			 * the first locked page this time through,
+			 * allocate an osd request and a page array
+			 * that it will use.
+			 */
 			if (locked_pages == 0) {
+				size_t size;
+
+				BUG_ON(pages);
+
 				/* prepare async write request */
-				offset = (u64) page_offset(page);
+				offset = (u64)page_offset(page);
 				len = wsize;
-				req = ceph_osdc_new_request(&fsc->client->osdc,
-					    &ci->i_layout,
-					    ceph_vino(inode),
-					    offset, &len,
-					    CEPH_OSD_OP_WRITE,
-					    CEPH_OSD_FLAG_WRITE |
-					    CEPH_OSD_FLAG_ONDISK,
-					    snapc, do_sync,
-					    ci->i_truncate_seq,
-					    ci->i_truncate_size,
-					    &inode->i_mtime, true, 0);
+				req = ceph_writepages_osd_request(inode,
							offset, &len, snapc,
							num_ops);
 
 				if (IS_ERR(req)) {
 					rc = PTR_ERR(req);
@@ -828,11 +850,17 @@ get_more_pages:
 					break;
 				}
 
-				max_pages = req->r_num_pages;
-
-				alloc_page_vec(fsc, req);
 				req->r_callback = writepages_finish;
 				req->r_inode = inode;
+
+				max_pages = calc_pages_for(0, (u64)len);
+				size = max_pages * sizeof (*pages);
+				pages = kmalloc(size, GFP_NOFS);
+				if (!pages) {
+					pool = fsc->wb_pagevec_pool;
+					pages = mempool_alloc(pool, GFP_NOFS);
+					BUG_ON(!pages);
+				}
 			}
 
 			/* note position of first page in pvec */
@@ -850,7 +878,7 @@ get_more_pages:
 			}
 
 			set_page_writeback(page);
-			req->r_pages[locked_pages] = page;
+			pages[locked_pages] = page;
 			locked_pages++;
 			next = page->index + 1;
 		}
@@ -879,18 +907,27 @@ get_more_pages:
 			pvec.nr -= i-first;
 		}
 
-		/* submit the write */
-		offset = req->r_pages[0]->index << PAGE_CACHE_SHIFT;
-		len = min((snap_size ? snap_size : i_size_read(inode)) - offset,
+		/* Format the osd request message and submit the write */
+
+		offset = page_offset(pages[0]);
+		len = min(snap_size - offset,
 			  (u64)locked_pages << PAGE_CACHE_SHIFT);
 		dout("writepages got %d pages at %llu~%llu\n",
 		     locked_pages, offset, len);
 
-		/* revise final length, page count */
-		req->r_num_pages = locked_pages;
-		req->r_request_ops[0].extent.length = cpu_to_le64(len);
-		req->r_request_ops[0].payload_len = cpu_to_le32(len);
-		req->r_request->hdr.data_len = cpu_to_le32(len);
+		osd_req_op_extent_osd_data_pages(req, 0, pages, len, 0,
+						!!pool, false);
+
+		pages = NULL;	/* request message now owns the pages array */
+		pool = NULL;
+
+		/* Update the write op length in case we changed it */
+
+		osd_req_op_extent_update(req, 0, len);
+
+		vino = ceph_vino(inode);
+		ceph_osdc_build_request(req, offset, snapc, vino.snap,
+					&inode->i_mtime);
 
 		rc = ceph_osdc_start_request(&fsc->client->osdc, req, true);
 		BUG_ON(rc);
@@ -1067,51 +1104,23 @@ static int ceph_write_begin(struct file *file, struct address_space *mapping,
 			    struct page **pagep, void **fsdata)
 {
 	struct inode *inode = file_inode(file);
-	struct ceph_inode_info *ci = ceph_inode(inode);
-	struct ceph_file_info *fi = file->private_data;
 	struct page *page;
 	pgoff_t index = pos >> PAGE_CACHE_SHIFT;
-	int r, want, got = 0;
-
-	if (fi->fmode & CEPH_FILE_MODE_LAZY)
-		want = CEPH_CAP_FILE_BUFFER | CEPH_CAP_FILE_LAZYIO;
-	else
-		want = CEPH_CAP_FILE_BUFFER;
-
-	dout("write_begin %p %llx.%llx %llu~%u getting caps. i_size %llu\n",
-	     inode, ceph_vinop(inode), pos, len, inode->i_size);
-	r = ceph_get_caps(ci, CEPH_CAP_FILE_WR, want, &got, pos+len);
-	if (r < 0)
-		return r;
-	dout("write_begin %p %llx.%llx %llu~%u got cap refs on %s\n",
-	     inode, ceph_vinop(inode), pos, len, ceph_cap_string(got));
-	if (!(got & (CEPH_CAP_FILE_BUFFER|CEPH_CAP_FILE_LAZYIO))) {
-		ceph_put_cap_refs(ci, got);
-		return -EAGAIN;
-	}
+	int r;
 
 	do {
 		/* get a page */
 		page = grab_cache_page_write_begin(mapping, index, 0);
-		if (!page) {
-			r = -ENOMEM;
-			break;
-		}
+		if (!page)
+			return -ENOMEM;
+		*pagep = page;
 
 		dout("write_begin file %p inode %p page %p %d~%d\n", file,
 		     inode, page, (int)pos, (int)len);
 
 		r = ceph_update_writeable_page(file, pos, len, page);
-		if (r)
-			page_cache_release(page);
 	} while (r == -EAGAIN);
 
-	if (r) {
-		ceph_put_cap_refs(ci, got);
-	} else {
-		*pagep = page;
-		*(int *)fsdata = got;
-	}
 	return r;
 }
 
@@ -1125,12 +1134,10 @@ static int ceph_write_end(struct file *file, struct address_space *mapping,
 			  struct page *page, void *fsdata)
 {
 	struct inode *inode = file_inode(file);
-	struct ceph_inode_info *ci = ceph_inode(inode);
 	struct ceph_fs_client *fsc = ceph_inode_to_client(inode);
 	struct ceph_mds_client *mdsc = fsc->mdsc;
 	unsigned from = pos & (PAGE_CACHE_SIZE - 1);
 	int check_cap = 0;
-	int got = (unsigned long)fsdata;
 
 	dout("write_end file %p inode %p page %p %d~%d (%d)\n", file,
 	     inode, page, (int)pos, (int)copied, (int)len);
@@ -1153,19 +1160,6 @@ static int ceph_write_end(struct file *file, struct address_space *mapping,
 	up_read(&mdsc->snap_rwsem);
 	page_cache_release(page);
 
-	if (copied > 0) {
-		int dirty;
-		spin_lock(&ci->i_ceph_lock);
-		dirty = __ceph_mark_dirty_caps(ci, CEPH_CAP_FILE_WR);
-		spin_unlock(&ci->i_ceph_lock);
-		if (dirty)
-			__mark_inode_dirty(inode, dirty);
-	}
-
-	dout("write_end %p %llx.%llx %llu~%u dropping cap refs on %s\n",
-	     inode, ceph_vinop(inode), pos, len, ceph_cap_string(got));
-	ceph_put_cap_refs(ci, got);
-
 	if (check_cap)
 		ceph_check_caps(ceph_inode(inode), CHECK_CAPS_AUTHONLY, NULL);
 