12 years ago · 40889e8d9f
--- a/Documentation/ABI/testing/sysfs-bus-rbd
+++ b/Documentation/ABI/testing/sysfs-bus-rbd
@@ -70,6 +70,10 @@ snap_*
 
				 
			
 
				 	A directory per each snapshot
			
 
				 
			
 
				+parent
			
 
				+
			
 
				+	Information identifying the pool, image, and snapshot id for
			
 
				+	the parent image in a layered rbd image (format 2 only).
			
 
				 
			
 
				 Entries under /sys/bus/rbd/devices/<dev-id>/snap_<snap-name>
			
 
				 -------------------------------------------------------------
			
--- a/drivers/block/rbd.c
+++ b/drivers/block/rbd.c
--- a/drivers/block/rbd_types.h
+++ b/drivers/block/rbd_types.h
@@ -46,8 +46,6 @@
 
				 #define RBD_MIN_OBJ_ORDER       16
			
 
				 #define RBD_MAX_OBJ_ORDER       30
			
 
				 
			
 
				-#define RBD_MAX_SEG_NAME_LEN	128
			
 
				-
			
 
				 #define RBD_COMP_NONE		0
			
 
				 #define RBD_CRYPT_NONE		0
			
 
				 
			
--- a/fs/ceph/addr.c
+++ b/fs/ceph/addr.c
@@ -267,6 +267,14 @@ static void finish_read(struct ceph_osd_request *req, struct ceph_msg *msg)
 
				 	kfree(req->r_pages);
			
 
				 }
			
 
				 
			
 
				+static void ceph_unlock_page_vector(struct page **pages, int num_pages)
			
 
				+{
			
 
				+	int i;
			
 
				+
			
 
				+	for (i = 0; i < num_pages; i++)
			
 
				+		unlock_page(pages[i]);
			
 
				+}
			
 
				+
			
 
				 /*
			
 
				  * start an async read(ahead) operation.  return nr_pages we submitted
			
 
				  * a read for on success, or negative error code.
			
@@ -347,6 +355,7 @@ static int start_read(struct inode *inode, struct list_head *page_list, int max)
 
				 	return nr_pages;
			
 
				 
			
 
				 out_pages:
			
 
				+	ceph_unlock_page_vector(pages, nr_pages);
			
 
				 	ceph_release_page_vector(pages, nr_pages);
			
 
				 out:
			
 
				 	ceph_osdc_put_request(req);
			
@@ -1078,23 +1087,51 @@ static int ceph_write_begin(struct file *file, struct address_space *mapping,
 
				 			    struct page **pagep, void **fsdata)
			
 
				 {
			
 
				 	struct inode *inode = file->f_dentry->d_inode;
			
 
				+	struct ceph_inode_info *ci = ceph_inode(inode);
			
 
				+	struct ceph_file_info *fi = file->private_data;
			
 
				 	struct page *page;
			
 
				 	pgoff_t index = pos >> PAGE_CACHE_SHIFT;
			
 
				-	int r;
			
 
				+	int r, want, got = 0;
			
 
				+
			
 
				+	if (fi->fmode & CEPH_FILE_MODE_LAZY)
			
 
				+		want = CEPH_CAP_FILE_BUFFER | CEPH_CAP_FILE_LAZYIO;
			
 
				+	else
			
 
				+		want = CEPH_CAP_FILE_BUFFER;
			
 
				+
			
 
				+	dout("write_begin %p %llx.%llx %llu~%u getting caps. i_size %llu\n",
			
 
				+	     inode, ceph_vinop(inode), pos, len, inode->i_size);
			
 
				+	r = ceph_get_caps(ci, CEPH_CAP_FILE_WR, want, &got, pos+len);
			
 
				+	if (r < 0)
			
 
				+		return r;
			
 
				+	dout("write_begin %p %llx.%llx %llu~%u  got cap refs on %s\n",
			
 
				+	     inode, ceph_vinop(inode), pos, len, ceph_cap_string(got));
			
 
				+	if (!(got & (CEPH_CAP_FILE_BUFFER|CEPH_CAP_FILE_LAZYIO))) {
			
 
				+		ceph_put_cap_refs(ci, got);
			
 
				+		return -EAGAIN;
			
 
				+	}
			
 
				 
			
 
				 	do {
			
 
				 		/* get a page */
			
 
				 		page = grab_cache_page_write_begin(mapping, index, 0);
			
 
				-		if (!page)
			
 
				-			return -ENOMEM;
			
 
				-		*pagep = page;
			
 
				+		if (!page) {
			
 
				+			r = -ENOMEM;
			
 
				+			break;
			
 
				+		}
			
 
				 
			
 
				 		dout("write_begin file %p inode %p page %p %d~%d\n", file,
			
 
				 		     inode, page, (int)pos, (int)len);
			
 
				 
			
 
				 		r = ceph_update_writeable_page(file, pos, len, page);
			
 
				+		if (r)
			
 
				+			page_cache_release(page);
			
 
				 	} while (r == -EAGAIN);
			
 
				 
			
 
				+	if (r) {
			
 
				+		ceph_put_cap_refs(ci, got);
			
 
				+	} else {
			
 
				+		*pagep = page;
			
 
				+		*(int *)fsdata = got;
			
 
				+	}
			
 
				 	return r;
			
 
				 }
			
 
				 
			
@@ -1108,10 +1145,12 @@ static int ceph_write_end(struct file *file, struct address_space *mapping,
 
				 			  struct page *page, void *fsdata)
			
 
				 {
			
 
				 	struct inode *inode = file->f_dentry->d_inode;
			
 
				+	struct ceph_inode_info *ci = ceph_inode(inode);
			
 
				 	struct ceph_fs_client *fsc = ceph_inode_to_client(inode);
			
 
				 	struct ceph_mds_client *mdsc = fsc->mdsc;
			
 
				 	unsigned from = pos & (PAGE_CACHE_SIZE - 1);
			
 
				 	int check_cap = 0;
			
 
				+	int got = (unsigned long)fsdata;
			
 
				 
			
 
				 	dout("write_end file %p inode %p page %p %d~%d (%d)\n", file,
			
 
				 	     inode, page, (int)pos, (int)copied, (int)len);
			
@@ -1134,6 +1173,19 @@ static int ceph_write_end(struct file *file, struct address_space *mapping,
 
				 	up_read(&mdsc->snap_rwsem);
			
 
				 	page_cache_release(page);
			
 
				 
			
 
				+	if (copied > 0) {
			
 
				+		int dirty;
			
 
				+		spin_lock(&ci->i_ceph_lock);
			
 
				+		dirty = __ceph_mark_dirty_caps(ci, CEPH_CAP_FILE_WR);
			
 
				+		spin_unlock(&ci->i_ceph_lock);
			
 
				+		if (dirty)
			
 
				+			__mark_inode_dirty(inode, dirty);
			
 
				+	}
			
 
				+
			
 
				+	dout("write_end %p %llx.%llx %llu~%u  dropping cap refs on %s\n",
			
 
				+	     inode, ceph_vinop(inode), pos, len, ceph_cap_string(got));
			
 
				+	ceph_put_cap_refs(ci, got);
			
 
				+
			
 
				 	if (check_cap)
			
 
				 		ceph_check_caps(ceph_inode(inode), CHECK_CAPS_AUTHONLY, NULL);
			
 
				 
			
--- a/fs/ceph/caps.c
+++ b/fs/ceph/caps.c
@@ -236,8 +236,10 @@ static struct ceph_cap *get_cap(struct ceph_mds_client *mdsc,
 
				 	if (!ctx) {
			
 
				 		cap = kmem_cache_alloc(ceph_cap_cachep, GFP_NOFS);
			
 
				 		if (cap) {
			
 
				+			spin_lock(&mdsc->caps_list_lock);
			
 
				 			mdsc->caps_use_count++;
			
 
				 			mdsc->caps_total_count++;
			
 
				+			spin_unlock(&mdsc->caps_list_lock);
			
 
				 		}
			
 
				 		return cap;
			
 
				 	}
			
@@ -1349,11 +1351,15 @@ int __ceph_mark_dirty_caps(struct ceph_inode_info *ci, int mask)
 
				 		if (!ci->i_head_snapc)
			
 
				 			ci->i_head_snapc = ceph_get_snap_context(
			
 
				 				ci->i_snap_realm->cached_context);
			
 
				-		dout(" inode %p now dirty snapc %p\n", &ci->vfs_inode,
			
 
				-			ci->i_head_snapc);
			
 
				+		dout(" inode %p now dirty snapc %p auth cap %p\n",
			
 
				+		     &ci->vfs_inode, ci->i_head_snapc, ci->i_auth_cap);
			
 
				 		BUG_ON(!list_empty(&ci->i_dirty_item));
			
 
				 		spin_lock(&mdsc->cap_dirty_lock);
			
 
				-		list_add(&ci->i_dirty_item, &mdsc->cap_dirty);
			
 
				+		if (ci->i_auth_cap)
			
 
				+			list_add(&ci->i_dirty_item, &mdsc->cap_dirty);
			
 
				+		else
			
 
				+			list_add(&ci->i_dirty_item,
			
 
				+				 &mdsc->cap_dirty_migrating);
			
 
				 		spin_unlock(&mdsc->cap_dirty_lock);
			
 
				 		if (ci->i_flushing_caps == 0) {
			
 
				 			ihold(inode);
			
@@ -2388,7 +2394,7 @@ static void handle_cap_grant(struct inode *inode, struct ceph_mds_caps *grant,
 
				 			    &atime);
			
 
				 
			
 
				 	/* max size increase? */
			
 
				-	if (max_size != ci->i_max_size) {
			
 
				+	if (ci->i_auth_cap == cap && max_size != ci->i_max_size) {
			
 
				 		dout("max_size %lld -> %llu\n", ci->i_max_size, max_size);
			
 
				 		ci->i_max_size = max_size;
			
 
				 		if (max_size >= ci->i_wanted_max_size) {
			
@@ -2745,6 +2751,7 @@ static void handle_cap_import(struct ceph_mds_client *mdsc,
 
				 
			
 
				 	/* make sure we re-request max_size, if necessary */
			
 
				 	spin_lock(&ci->i_ceph_lock);
			
 
				+	ci->i_wanted_max_size = 0;  /* reset */
			
 
				 	ci->i_requested_max_size = 0;
			
 
				 	spin_unlock(&ci->i_ceph_lock);
			
 
				 }
			
@@ -2840,8 +2847,6 @@ void ceph_handle_caps(struct ceph_mds_session *session,
 
				 	case CEPH_CAP_OP_IMPORT:
			
 
				 		handle_cap_import(mdsc, inode, h, session,
			
 
				 				  snaptrace, snaptrace_len);
			
 
				-		ceph_check_caps(ceph_inode(inode), 0, session);
			
 
				-		goto done_unlocked;
			
 
				 	}
			
 
				 
			
 
				 	/* the rest require a cap */
			
@@ -2858,6 +2863,7 @@ void ceph_handle_caps(struct ceph_mds_session *session,
 
				 	switch (op) {
			
 
				 	case CEPH_CAP_OP_REVOKE:
			
 
				 	case CEPH_CAP_OP_GRANT:
			
 
				+	case CEPH_CAP_OP_IMPORT:
			
 
				 		handle_cap_grant(inode, h, session, cap, msg->middle);
			
 
				 		goto done_unlocked;
			
 
				 
			
--- a/fs/ceph/file.c
+++ b/fs/ceph/file.c
@@ -712,63 +712,53 @@ static ssize_t ceph_aio_write(struct kiocb *iocb, const struct iovec *iov,
 
				 	struct ceph_osd_client *osdc =
			
 
				 		&ceph_sb_to_client(inode->i_sb)->client->osdc;
			
 
				 	loff_t endoff = pos + iov->iov_len;
			
 
				-	int want, got = 0;
			
 
				-	int ret, err;
			
 
				+	int got = 0;
			
 
				+	int ret, err, written;
			
 
				 
			
 
				 	if (ceph_snap(inode) != CEPH_NOSNAP)
			
 
				 		return -EROFS;
			
 
				 
			
 
				 retry_snap:
			
 
				+	written = 0;
			
 
				 	if (ceph_osdmap_flag(osdc->osdmap, CEPH_OSDMAP_FULL))
			
 
				 		return -ENOSPC;
			
 
				 	__ceph_do_pending_vmtruncate(inode);
			
 
				-	dout("aio_write %p %llx.%llx %llu~%u getting caps. i_size %llu\n",
			
 
				-	     inode, ceph_vinop(inode), pos, (unsigned)iov->iov_len,
			
 
				-	     inode->i_size);
			
 
				-	if (fi->fmode & CEPH_FILE_MODE_LAZY)
			
 
				-		want = CEPH_CAP_FILE_BUFFER | CEPH_CAP_FILE_LAZYIO;
			
 
				-	else
			
 
				-		want = CEPH_CAP_FILE_BUFFER;
			
 
				-	ret = ceph_get_caps(ci, CEPH_CAP_FILE_WR, want, &got, endoff);
			
 
				-	if (ret < 0)
			
 
				-		goto out_put;
			
 
				-
			
 
				-	dout("aio_write %p %llx.%llx %llu~%u  got cap refs on %s\n",
			
 
				-	     inode, ceph_vinop(inode), pos, (unsigned)iov->iov_len,
			
 
				-	     ceph_cap_string(got));
			
 
				-
			
 
				-	if ((got & (CEPH_CAP_FILE_BUFFER|CEPH_CAP_FILE_LAZYIO)) == 0 ||
			
 
				-	    (iocb->ki_filp->f_flags & O_DIRECT) ||
			
 
				-	    (inode->i_sb->s_flags & MS_SYNCHRONOUS) ||
			
 
				-	    (fi->flags & CEPH_F_SYNC)) {
			
 
				-		ret = ceph_sync_write(file, iov->iov_base, iov->iov_len,
			
 
				-			&iocb->ki_pos);
			
 
				-	} else {
			
 
				-		/*
			
 
				-		 * buffered write; drop Fw early to avoid slow
			
 
				-		 * revocation if we get stuck on balance_dirty_pages
			
 
				-		 */
			
 
				-		int dirty;
			
 
				-
			
 
				-		spin_lock(&ci->i_ceph_lock);
			
 
				-		dirty = __ceph_mark_dirty_caps(ci, CEPH_CAP_FILE_WR);
			
 
				-		spin_unlock(&ci->i_ceph_lock);
			
 
				-		ceph_put_cap_refs(ci, got);
			
 
				 
			
 
				+	/*
			
 
				+	 * try to do a buffered write.  if we don't have sufficient
			
 
				+	 * caps, we'll get -EAGAIN from generic_file_aio_write, or a
			
 
				+	 * short write if we only get caps for some pages.
			
 
				+	 */
			
 
				+	if (!(iocb->ki_filp->f_flags & O_DIRECT) &&
			
 
				+	    !(inode->i_sb->s_flags & MS_SYNCHRONOUS) &&
			
 
				+	    !(fi->flags & CEPH_F_SYNC)) {
			
 
				 		ret = generic_file_aio_write(iocb, iov, nr_segs, pos);
			
 
				+		if (ret >= 0)
			
 
				+			written = ret;
			
 
				+
			
 
				 		if ((ret >= 0 || ret == -EIOCBQUEUED) &&
			
 
				 		    ((file->f_flags & O_SYNC) || IS_SYNC(file->f_mapping->host)
			
 
				 		     || ceph_osdmap_flag(osdc->osdmap, CEPH_OSDMAP_NEARFULL))) {
			
 
				-			err = vfs_fsync_range(file, pos, pos + ret - 1, 1);
			
 
				+			err = vfs_fsync_range(file, pos, pos + written - 1, 1);
			
 
				 			if (err < 0)
			
 
				 				ret = err;
			
 
				 		}
			
 
				+		if ((ret < 0 && ret != -EAGAIN) || pos + written >= endoff)
			
 
				+			goto out;
			
 
				+	}
			
 
				 
			
 
				-		if (dirty)
			
 
				-			__mark_inode_dirty(inode, dirty);
			
 
				+	dout("aio_write %p %llx.%llx %llu~%u getting caps. i_size %llu\n",
			
 
				+	     inode, ceph_vinop(inode), pos + written,
			
 
				+	     (unsigned)iov->iov_len - written, inode->i_size);
			
 
				+	ret = ceph_get_caps(ci, CEPH_CAP_FILE_WR, 0, &got, endoff);
			
 
				+	if (ret < 0)
			
 
				 		goto out;
			
 
				-	}
			
 
				 
			
 
				+	dout("aio_write %p %llx.%llx %llu~%u  got cap refs on %s\n",
			
 
				+	     inode, ceph_vinop(inode), pos + written,
			
 
				+	     (unsigned)iov->iov_len - written, ceph_cap_string(got));
			
 
				+	ret = ceph_sync_write(file, iov->iov_base + written,
			
 
				+			      iov->iov_len - written, &iocb->ki_pos);
			
 
				 	if (ret >= 0) {
			
 
				 		int dirty;
			
 
				 		spin_lock(&ci->i_ceph_lock);
			
@@ -777,13 +767,10 @@ retry_snap:
 
				 		if (dirty)
			
 
				 			__mark_inode_dirty(inode, dirty);
			
 
				 	}
			
 
				-
			
 
				-out_put:
			
 
				 	dout("aio_write %p %llx.%llx %llu~%u  dropping cap refs on %s\n",
			
 
				-	     inode, ceph_vinop(inode), pos, (unsigned)iov->iov_len,
			
 
				-	     ceph_cap_string(got));
			
 
				+	     inode, ceph_vinop(inode), pos + written,
			
 
				+	     (unsigned)iov->iov_len - written, ceph_cap_string(got));
			
 
				 	ceph_put_cap_refs(ci, got);
			
 
				-
			
 
				 out:
			
 
				 	if (ret == -EOLDSNAPC) {
			
 
				 		dout("aio_write %p %llx.%llx %llu~%u got EOLDSNAPC, retrying\n",
			
--- a/fs/ceph/inode.c
+++ b/fs/ceph/inode.c
@@ -1466,7 +1466,7 @@ void __ceph_do_pending_vmtruncate(struct inode *inode)
 
				 {
			
 
				 	struct ceph_inode_info *ci = ceph_inode(inode);
			
 
				 	u64 to;
			
 
				-	int wrbuffer_refs, wake = 0;
			
 
				+	int wrbuffer_refs, finish = 0;
			
 
				 
			
 
				 retry:
			
 
				 	spin_lock(&ci->i_ceph_lock);
			
@@ -1498,15 +1498,18 @@ retry:
 
				 	truncate_inode_pages(inode->i_mapping, to);
			
 
				 
			
 
				 	spin_lock(&ci->i_ceph_lock);
			
 
				-	ci->i_truncate_pending--;
			
 
				-	if (ci->i_truncate_pending == 0)
			
 
				-		wake = 1;
			
 
				+	if (to == ci->i_truncate_size) {
			
 
				+		ci->i_truncate_pending = 0;
			
 
				+		finish = 1;
			
 
				+	}
			
 
				 	spin_unlock(&ci->i_ceph_lock);
			
 
				+	if (!finish)
			
 
				+		goto retry;
			
 
				 
			
 
				 	if (wrbuffer_refs == 0)
			
 
				 		ceph_check_caps(ci, CHECK_CAPS_AUTHONLY, NULL);
			
 
				-	if (wake)
			
 
				-		wake_up_all(&ci->i_cap_wq);
			
 
				+
			
 
				+	wake_up_all(&ci->i_cap_wq);
			
 
				 }
			
 
				 
			
 
				 
			
--- a/fs/ceph/mds_client.c
+++ b/fs/ceph/mds_client.c
@@ -1590,7 +1590,7 @@ static int set_request_path_attr(struct inode *rinode, struct dentry *rdentry,
 
				 	} else if (rpath || rino) {
			
 
				 		*ino = rino;
			
 
				 		*ppath = rpath;
			
 
				-		*pathlen = strlen(rpath);
			
 
				+		*pathlen = rpath ? strlen(rpath) : 0;
			
 
				 		dout(" path %.*s\n", *pathlen, rpath);
			
 
				 	}
			
 
				 
			
@@ -1876,9 +1876,14 @@ finish:
 
				 static void __wake_requests(struct ceph_mds_client *mdsc,
			
 
				 			    struct list_head *head)
			
 
				 {
			
 
				-	struct ceph_mds_request *req, *nreq;
			
 
				+	struct ceph_mds_request *req;
			
 
				+	LIST_HEAD(tmp_list);
			
 
				+
			
 
				+	list_splice_init(head, &tmp_list);
			
 
				 
			
 
				-	list_for_each_entry_safe(req, nreq, head, r_wait) {
			
 
				+	while (!list_empty(&tmp_list)) {
			
 
				+		req = list_entry(tmp_list.next,
			
 
				+				 struct ceph_mds_request, r_wait);
			
 
				 		list_del_init(&req->r_wait);
			
 
				 		__do_request(mdsc, req);
			
 
				 	}
			
--- a/fs/ceph/super.c
+++ b/fs/ceph/super.c
@@ -403,8 +403,6 @@ static int ceph_show_options(struct seq_file *m, struct dentry *root)
 
				 		seq_printf(m, ",mount_timeout=%d", opt->mount_timeout);
			
 
				 	if (opt->osd_idle_ttl != CEPH_OSD_IDLE_TTL_DEFAULT)
			
 
				 		seq_printf(m, ",osd_idle_ttl=%d", opt->osd_idle_ttl);
			
 
				-	if (opt->osd_timeout != CEPH_OSD_TIMEOUT_DEFAULT)
			
 
				-		seq_printf(m, ",osdtimeout=%d", opt->osd_timeout);
			
 
				 	if (opt->osd_keepalive_timeout != CEPH_OSD_KEEPALIVE_DEFAULT)
			
 
				 		seq_printf(m, ",osdkeepalivetimeout=%d",
			
 
				 			   opt->osd_keepalive_timeout);
			
@@ -849,7 +847,7 @@ static int ceph_register_bdi(struct super_block *sb,
 
				 		fsc->backing_dev_info.ra_pages =
			
 
				 			default_backing_dev_info.ra_pages;
			
 
				 
			
 
				-	err = bdi_register(&fsc->backing_dev_info, NULL, "ceph-%d",
			
 
				+	err = bdi_register(&fsc->backing_dev_info, NULL, "ceph-%ld",
			
 
				 			   atomic_long_inc_return(&bdi_seq));
			
 
				 	if (!err)
			
 
				 		sb->s_bdi = &fsc->backing_dev_info;
			
--- a/include/linux/backing-dev.h
+++ b/include/linux/backing-dev.h
@@ -114,6 +114,7 @@ struct backing_dev_info {
 
				 int bdi_init(struct backing_dev_info *bdi);
			
 
				 void bdi_destroy(struct backing_dev_info *bdi);
			
 
				 
			
 
				+__printf(3, 4)
			
 
				 int bdi_register(struct backing_dev_info *bdi, struct device *parent,
			
 
				 		const char *fmt, ...);
			
 
				 int bdi_register_dev(struct backing_dev_info *bdi, dev_t dev);
			
--- a/include/linux/ceph/libceph.h
+++ b/include/linux/ceph/libceph.h
@@ -43,7 +43,6 @@ struct ceph_options {
 
				 	struct ceph_entity_addr my_addr;
			
 
				 	int mount_timeout;
			
 
				 	int osd_idle_ttl;
			
 
				-	int osd_timeout;
			
 
				 	int osd_keepalive_timeout;
			
 
				 
			
 
				 	/*
			
@@ -63,7 +62,6 @@ struct ceph_options {
 
				  * defaults
			
 
				  */
			
 
				 #define CEPH_MOUNT_TIMEOUT_DEFAULT  60
			
 
				-#define CEPH_OSD_TIMEOUT_DEFAULT    60  /* seconds */
			
 
				 #define CEPH_OSD_KEEPALIVE_DEFAULT  5
			
 
				 #define CEPH_OSD_IDLE_TTL_DEFAULT    60
			
 
				 
			
--- a/include/linux/ceph/osdmap.h
+++ b/include/linux/ceph/osdmap.h
@@ -123,6 +123,7 @@ extern int ceph_calc_pg_acting(struct ceph_osdmap *osdmap, struct ceph_pg pgid,
 
				 extern int ceph_calc_pg_primary(struct ceph_osdmap *osdmap,
			
 
				 				struct ceph_pg pgid);
			
 
				 
			
 
				+extern const char *ceph_pg_pool_name_by_id(struct ceph_osdmap *map, u64 id);
			
 
				 extern int ceph_pg_poolid_by_name(struct ceph_osdmap *map, const char *name);
			
 
				 
			
 
				 #endif
			
--- a/include/linux/ceph/rados.h
+++ b/include/linux/ceph/rados.h
@@ -87,6 +87,8 @@ struct ceph_pg {
 
				  *
			
 
				  *  lpgp_num -- as above.
			
 
				  */
			
 
				+#define CEPH_NOPOOL  ((__u64) (-1))  /* pool id not defined */
			
 
				+
			
 
				 #define CEPH_PG_TYPE_REP     1
			
 
				 #define CEPH_PG_TYPE_RAID4   2
			
 
				 #define CEPH_PG_POOL_VERSION 2
			
--- a/net/ceph/ceph_common.c
+++ b/net/ceph/ceph_common.c
@@ -305,7 +305,6 @@ ceph_parse_options(char *options, const char *dev_name,
 
				 
			
 
				 	/* start with defaults */
			
 
				 	opt->flags = CEPH_OPT_DEFAULT;
			
 
				-	opt->osd_timeout = CEPH_OSD_TIMEOUT_DEFAULT;
			
 
				 	opt->osd_keepalive_timeout = CEPH_OSD_KEEPALIVE_DEFAULT;
			
 
				 	opt->mount_timeout = CEPH_MOUNT_TIMEOUT_DEFAULT; /* seconds */
			
 
				 	opt->osd_idle_ttl = CEPH_OSD_IDLE_TTL_DEFAULT;   /* seconds */
			
@@ -391,7 +390,7 @@ ceph_parse_options(char *options, const char *dev_name,
 
				 
			
 
				 			/* misc */
			
 
				 		case Opt_osdtimeout:
			
 
				-			opt->osd_timeout = intval;
			
 
				+			pr_warning("ignoring deprecated osdtimeout option\n");
			
 
				 			break;
			
 
				 		case Opt_osdkeepalivetimeout:
			
 
				 			opt->osd_keepalive_timeout = intval;
			
--- a/net/ceph/messenger.c
+++ b/net/ceph/messenger.c
@@ -2244,22 +2244,62 @@ bad_tag:
 
				 
			
 
				 
			
 
				 /*
			
 
				- * Atomically queue work on a connection.  Bump @con reference to
			
 
				- * avoid races with connection teardown.
			
 
				+ * Atomically queue work on a connection after the specified delay.
			
 
				+ * Bump @con reference to avoid races with connection teardown.
			
 
				+ * Returns 0 if work was queued, or an error code otherwise.
			
 
				  */
			
 
				-static void queue_con(struct ceph_connection *con)
			
 
				+static int queue_con_delay(struct ceph_connection *con, unsigned long delay)
			
 
				 {
			
 
				 	if (!con->ops->get(con)) {
			
 
				-		dout("queue_con %p ref count 0\n", con);
			
 
				-		return;
			
 
				+		dout("%s %p ref count 0\n", __func__, con);
			
 
				+
			
 
				+		return -ENOENT;
			
 
				 	}
			
 
				 
			
 
				-	if (!queue_delayed_work(ceph_msgr_wq, &con->work, 0)) {
			
 
				-		dout("queue_con %p - already queued\n", con);
			
 
				+	if (!queue_delayed_work(ceph_msgr_wq, &con->work, delay)) {
			
 
				+		dout("%s %p - already queued\n", __func__, con);
			
 
				 		con->ops->put(con);
			
 
				-	} else {
			
 
				-		dout("queue_con %p\n", con);
			
 
				+
			
 
				+		return -EBUSY;
			
 
				 	}
			
 
				+
			
 
				+	dout("%s %p %lu\n", __func__, con, delay);
			
 
				+
			
 
				+	return 0;
			
 
				+}
			
 
				+
			
 
				+static void queue_con(struct ceph_connection *con)
			
 
				+{
			
 
				+	(void) queue_con_delay(con, 0);
			
 
				+}
			
 
				+
			
 
				+static bool con_sock_closed(struct ceph_connection *con)
			
 
				+{
			
 
				+	if (!test_and_clear_bit(CON_FLAG_SOCK_CLOSED, &con->flags))
			
 
				+		return false;
			
 
				+
			
 
				+#define CASE(x)								\
			
 
				+	case CON_STATE_ ## x:						\
			
 
				+		con->error_msg = "socket closed (con state " #x ")";	\
			
 
				+		break;
			
 
				+
			
 
				+	switch (con->state) {
			
 
				+	CASE(CLOSED);
			
 
				+	CASE(PREOPEN);
			
 
				+	CASE(CONNECTING);
			
 
				+	CASE(NEGOTIATING);
			
 
				+	CASE(OPEN);
			
 
				+	CASE(STANDBY);
			
 
				+	default:
			
 
				+		pr_warning("%s con %p unrecognized state %lu\n",
			
 
				+			__func__, con, con->state);
			
 
				+		con->error_msg = "unrecognized con state";
			
 
				+		BUG();
			
 
				+		break;
			
 
				+	}
			
 
				+#undef CASE
			
 
				+
			
 
				+	return true;
			
 
				 }
			
 
				 
			
 
				 /*
			
@@ -2273,35 +2313,16 @@ static void con_work(struct work_struct *work)
 
				 
			
 
				 	mutex_lock(&con->mutex);
			
 
				 restart:
			
 
				-	if (test_and_clear_bit(CON_FLAG_SOCK_CLOSED, &con->flags)) {
			
 
				-		switch (con->state) {
			
 
				-		case CON_STATE_CONNECTING:
			
 
				-			con->error_msg = "connection failed";
			
 
				-			break;
			
 
				-		case CON_STATE_NEGOTIATING:
			
 
				-			con->error_msg = "negotiation failed";
			
 
				-			break;
			
 
				-		case CON_STATE_OPEN:
			
 
				-			con->error_msg = "socket closed";
			
 
				-			break;
			
 
				-		default:
			
 
				-			dout("unrecognized con state %d\n", (int)con->state);
			
 
				-			con->error_msg = "unrecognized con state";
			
 
				-			BUG();
			
 
				-		}
			
 
				+	if (con_sock_closed(con))
			
 
				 		goto fault;
			
 
				-	}
			
 
				 
			
 
				 	if (test_and_clear_bit(CON_FLAG_BACKOFF, &con->flags)) {
			
 
				 		dout("con_work %p backing off\n", con);
			
 
				-		if (queue_delayed_work(ceph_msgr_wq, &con->work,
			
 
				-				       round_jiffies_relative(con->delay))) {
			
 
				-			dout("con_work %p backoff %lu\n", con, con->delay);
			
 
				-			mutex_unlock(&con->mutex);
			
 
				-			return;
			
 
				-		} else {
			
 
				+		ret = queue_con_delay(con, round_jiffies_relative(con->delay));
			
 
				+		if (ret) {
			
 
				 			dout("con_work %p FAILED to back off %lu\n", con,
			
 
				 			     con->delay);
			
 
				+			BUG_ON(ret == -ENOENT);
			
 
				 			set_bit(CON_FLAG_BACKOFF, &con->flags);
			
 
				 		}
			
 
				 		goto done;
			
@@ -2356,7 +2377,7 @@ fault:
 
				 static void ceph_fault(struct ceph_connection *con)
			
 
				 	__releases(con->mutex)
			
 
				 {
			
 
				-	pr_err("%s%lld %s %s\n", ENTITY_NAME(con->peer_name),
			
 
				+	pr_warning("%s%lld %s %s\n", ENTITY_NAME(con->peer_name),
			
 
				 	       ceph_pr_addr(&con->peer_addr.in_addr), con->error_msg);
			
 
				 	dout("fault %p state %lu to peer %s\n",
			
 
				 	     con, con->state, ceph_pr_addr(&con->peer_addr.in_addr));
			
@@ -2398,24 +2419,8 @@ static void ceph_fault(struct ceph_connection *con)
 
				 			con->delay = BASE_DELAY_INTERVAL;
			
 
				 		else if (con->delay < MAX_DELAY_INTERVAL)
			
 
				 			con->delay *= 2;
			
 
				-		con->ops->get(con);
			
 
				-		if (queue_delayed_work(ceph_msgr_wq, &con->work,
			
 
				-				       round_jiffies_relative(con->delay))) {
			
 
				-			dout("fault queued %p delay %lu\n", con, con->delay);
			
 
				-		} else {
			
 
				-			con->ops->put(con);
			
 
				-			dout("fault failed to queue %p delay %lu, backoff\n",
			
 
				-			     con, con->delay);
			
 
				-			/*
			
 
				-			 * In many cases we see a socket state change
			
 
				-			 * while con_work is running and end up
			
 
				-			 * queuing (non-delayed) work, such that we
			
 
				-			 * can't backoff with a delay.  Set a flag so
			
 
				-			 * that when con_work restarts we schedule the
			
 
				-			 * delay then.
			
 
				-			 */
			
 
				-			set_bit(CON_FLAG_BACKOFF, &con->flags);
			
 
				-		}
			
 
				+		set_bit(CON_FLAG_BACKOFF, &con->flags);
			
 
				+		queue_con(con);
			
 
				 	}
			
 
				 
			
 
				 out_unlock:
			
--- a/net/ceph/osd_client.c
+++ b/net/ceph/osd_client.c
@@ -221,6 +221,7 @@ struct ceph_osd_request *ceph_osdc_alloc_request(struct ceph_osd_client *osdc,
 
				 	kref_init(&req->r_kref);
			
 
				 	init_completion(&req->r_completion);
			
 
				 	init_completion(&req->r_safe_completion);
			
 
				+	RB_CLEAR_NODE(&req->r_node);
			
 
				 	INIT_LIST_HEAD(&req->r_unsafe_item);
			
 
				 	INIT_LIST_HEAD(&req->r_linger_item);
			
 
				 	INIT_LIST_HEAD(&req->r_linger_osd);
			
@@ -580,7 +581,7 @@ static void __kick_osd_requests(struct ceph_osd_client *osdc,
 
				 
			
 
				 	dout("__kick_osd_requests osd%d\n", osd->o_osd);
			
 
				 	err = __reset_osd(osdc, osd);
			
 
				-	if (err == -EAGAIN)
			
 
				+	if (err)
			
 
				 		return;
			
 
				 
			
 
				 	list_for_each_entry(req, &osd->o_requests, r_osd_item) {
			
@@ -607,14 +608,6 @@ static void __kick_osd_requests(struct ceph_osd_client *osdc,
 
				 	}
			
 
				 }
			
 
				 
			
 
				-static void kick_osd_requests(struct ceph_osd_client *osdc,
			
 
				-			      struct ceph_osd *kickosd)
			
 
				-{
			
 
				-	mutex_lock(&osdc->request_mutex);
			
 
				-	__kick_osd_requests(osdc, kickosd);
			
 
				-	mutex_unlock(&osdc->request_mutex);
			
 
				-}
			
 
				-
			
 
				 /*
			
 
				  * If the osd connection drops, we need to resubmit all requests.
			
 
				  */
			
@@ -628,7 +621,9 @@ static void osd_reset(struct ceph_connection *con)
 
				 	dout("osd_reset osd%d\n", osd->o_osd);
			
 
				 	osdc = osd->o_osdc;
			
 
				 	down_read(&osdc->map_sem);
			
 
				-	kick_osd_requests(osdc, osd);
			
 
				+	mutex_lock(&osdc->request_mutex);
			
 
				+	__kick_osd_requests(osdc, osd);
			
 
				+	mutex_unlock(&osdc->request_mutex);
			
 
				 	send_queued(osdc);
			
 
				 	up_read(&osdc->map_sem);
			
 
				 }
			
@@ -647,6 +642,7 @@ static struct ceph_osd *create_osd(struct ceph_osd_client *osdc, int onum)
 
				 	atomic_set(&osd->o_ref, 1);
			
 
				 	osd->o_osdc = osdc;
			
 
				 	osd->o_osd = onum;
			
 
				+	RB_CLEAR_NODE(&osd->o_node);
			
 
				 	INIT_LIST_HEAD(&osd->o_requests);
			
 
				 	INIT_LIST_HEAD(&osd->o_linger_requests);
			
 
				 	INIT_LIST_HEAD(&osd->o_osd_lru);
			
@@ -750,6 +746,7 @@ static int __reset_osd(struct ceph_osd_client *osdc, struct ceph_osd *osd)
 
				 	if (list_empty(&osd->o_requests) &&
			
 
				 	    list_empty(&osd->o_linger_requests)) {
			
 
				 		__remove_osd(osdc, osd);
			
 
				+		ret = -ENODEV;
			
 
				 	} else if (memcmp(&osdc->osdmap->osd_addr[osd->o_osd],
			
 
				 			  &osd->o_con.peer_addr,
			
 
				 			  sizeof(osd->o_con.peer_addr)) == 0 &&
			
@@ -876,9 +873,9 @@ static void __unregister_request(struct ceph_osd_client *osdc,
 
				 			req->r_osd = NULL;
			
 
				 	}
			
 
				 
			
 
				+	list_del_init(&req->r_req_lru_item);
			
 
				 	ceph_osdc_put_request(req);
			
 
				 
			
 
				-	list_del_init(&req->r_req_lru_item);
			
 
				 	if (osdc->num_requests == 0) {
			
 
				 		dout(" no requests, canceling timeout\n");
			
 
				 		__cancel_osd_timeout(osdc);
			
@@ -910,8 +907,8 @@ static void __unregister_linger_request(struct ceph_osd_client *osdc,
 
				 					struct ceph_osd_request *req)
			
 
				 {
			
 
				 	dout("__unregister_linger_request %p\n", req);
			
 
				+	list_del_init(&req->r_linger_item);
			
 
				 	if (req->r_osd) {
			
 
				-		list_del_init(&req->r_linger_item);
			
 
				 		list_del_init(&req->r_linger_osd);
			
 
				 
			
 
				 		if (list_empty(&req->r_osd->o_requests) &&
			
@@ -1090,12 +1087,10 @@ static void handle_timeout(struct work_struct *work)
 
				 {
			
 
				 	struct ceph_osd_client *osdc =
			
 
				 		container_of(work, struct ceph_osd_client, timeout_work.work);
			
 
				-	struct ceph_osd_request *req, *last_req = NULL;
			
 
				+	struct ceph_osd_request *req;
			
 
				 	struct ceph_osd *osd;
			
 
				-	unsigned long timeout = osdc->client->options->osd_timeout * HZ;
			
 
				 	unsigned long keepalive =
			
 
				 		osdc->client->options->osd_keepalive_timeout * HZ;
			
 
				-	unsigned long last_stamp = 0;
			
 
				 	struct list_head slow_osds;
			
 
				 	dout("timeout\n");
			
 
				 	down_read(&osdc->map_sem);
			
@@ -1104,37 +1099,6 @@ static void handle_timeout(struct work_struct *work)
 
				 
			
 
				 	mutex_lock(&osdc->request_mutex);
			
 
				 
			
 
				-	/*
			
 
				-	 * reset osds that appear to be _really_ unresponsive.  this
			
 
				-	 * is a failsafe measure.. we really shouldn't be getting to
			
 
				-	 * this point if the system is working properly.  the monitors
			
 
				-	 * should mark the osd as failed and we should find out about
			
 
				-	 * it from an updated osd map.
			
 
				-	 */
			
 
				-	while (timeout && !list_empty(&osdc->req_lru)) {
			
 
				-		req = list_entry(osdc->req_lru.next, struct ceph_osd_request,
			
 
				-				 r_req_lru_item);
			
 
				-
			
 
				-		/* hasn't been long enough since we sent it? */
			
 
				-		if (time_before(jiffies, req->r_stamp + timeout))
			
 
				-			break;
			
 
				-
			
 
				-		/* hasn't been long enough since it was acked? */
			
 
				-		if (req->r_request->ack_stamp == 0 ||
			
 
				-		    time_before(jiffies, req->r_request->ack_stamp + timeout))
			
 
				-			break;
			
 
				-
			
 
				-		BUG_ON(req == last_req && req->r_stamp == last_stamp);
			
 
				-		last_req = req;
			
 
				-		last_stamp = req->r_stamp;
			
 
				-
			
 
				-		osd = req->r_osd;
			
 
				-		BUG_ON(!osd);
			
 
				-		pr_warning(" tid %llu timed out on osd%d, will reset osd\n",
			
 
				-			   req->r_tid, osd->o_osd);
			
 
				-		__kick_osd_requests(osdc, osd);
			
 
				-	}
			
 
				-
			
 
				 	/*
			
 
				 	 * ping osds that are a bit slow.  this ensures that if there
			
 
				 	 * is a break in the TCP connection we will notice, and reopen
			
@@ -1364,8 +1328,8 @@ static void kick_requests(struct ceph_osd_client *osdc, int force_resend)
 
				 
			
 
				 		dout("kicking lingering %p tid %llu osd%d\n", req, req->r_tid,
			
 
				 		     req->r_osd ? req->r_osd->o_osd : -1);
			
 
				-		__unregister_linger_request(osdc, req);
			
 
				 		__register_request(osdc, req);
			
 
				+		__unregister_linger_request(osdc, req);
			
 
				 	}
			
 
				 	mutex_unlock(&osdc->request_mutex);
			
 
				 
			
@@ -1599,6 +1563,7 @@ int ceph_osdc_create_event(struct ceph_osd_client *osdc,
 
				 	event->data = data;
			
 
				 	event->osdc = osdc;
			
 
				 	INIT_LIST_HEAD(&event->osd_node);
			
 
				+	RB_CLEAR_NODE(&event->node);
			
 
				 	kref_init(&event->kref);   /* one ref for us */
			
 
				 	kref_get(&event->kref);    /* one ref for the caller */
			
 
				 	init_completion(&event->completion);
			
--- a/net/ceph/osdmap.c
+++ b/net/ceph/osdmap.c
@@ -469,6 +469,22 @@ static struct ceph_pg_pool_info *__lookup_pg_pool(struct rb_root *root, int id)
 
				 	return NULL;
			
 
				 }
			
 
				 
			
 
				+const char *ceph_pg_pool_name_by_id(struct ceph_osdmap *map, u64 id)
			
 
				+{
			
 
				+	struct ceph_pg_pool_info *pi;
			
 
				+
			
 
				+	if (id == CEPH_NOPOOL)
			
 
				+		return NULL;
			
 
				+
			
 
				+	if (WARN_ON_ONCE(id > (u64) INT_MAX))
			
 
				+		return NULL;
			
 
				+
			
 
				+	pi = __lookup_pg_pool(&map->pg_pools, (int) id);
			
 
				+
			
 
				+	return pi ? pi->name : NULL;
			
 
				+}
			
 
				+EXPORT_SYMBOL(ceph_pg_pool_name_by_id);
			
 
				+
			
 
				 int ceph_pg_poolid_by_name(struct ceph_osdmap *map, const char *name)
			
 
				 {
			
 
				 	struct rb_node *rbp;
			
@@ -645,10 +661,12 @@ struct ceph_osdmap *osdmap_decode(void **p, void *end)
 
				 	ceph_decode_32_safe(p, end, max, bad);
			
 
				 	while (max--) {
			
 
				 		ceph_decode_need(p, end, 4 + 1 + sizeof(pi->v), bad);
			
 
				+		err = -ENOMEM;
			
 
				 		pi = kzalloc(sizeof(*pi), GFP_NOFS);
			
 
				 		if (!pi)
			
 
				 			goto bad;
			
 
				 		pi->id = ceph_decode_32(p);
			
 
				+		err = -EINVAL;
			
 
				 		ev = ceph_decode_8(p); /* encoding version */
			
 
				 		if (ev > CEPH_PG_POOL_VERSION) {
			
 
				 			pr_warning("got unknown v %d > %d of ceph_pg_pool\n",
			
@@ -664,8 +682,13 @@ struct ceph_osdmap *osdmap_decode(void **p, void *end)
 
				 		__insert_pg_pool(&map->pg_pools, pi);
			
 
				 	}
			
 
				 
			
 
				-	if (version >= 5 && __decode_pool_names(p, end, map) < 0)
			
 
				-		goto bad;
			
 
				+	if (version >= 5) {
			
 
				+		err = __decode_pool_names(p, end, map);
			
 
				+		if (err < 0) {
			
 
				+			dout("fail to decode pool names");
			
 
				+			goto bad;
			
 
				+		}
			
 
				+	}
			
 
				 
			
 
				 	ceph_decode_32_safe(p, end, map->pool_max, bad);
			
 
				 
			
@@ -745,7 +768,7 @@ struct ceph_osdmap *osdmap_decode(void **p, void *end)
 
				 	return map;
			
 
				 
			
 
				 bad:
			
 
				-	dout("osdmap_decode fail\n");
			
 
				+	dout("osdmap_decode fail err %d\n", err);
			
 
				 	ceph_osdmap_destroy(map);
			
 
				 	return ERR_PTR(err);
			
 
				 }
			
@@ -839,6 +862,7 @@ struct ceph_osdmap *osdmap_apply_incremental(void **p, void *end,
 
				 		if (ev > CEPH_PG_POOL_VERSION) {
			
 
				 			pr_warning("got unknown v %d > %d of ceph_pg_pool\n",
			
 
				 				   ev, CEPH_PG_POOL_VERSION);
			
 
				+			err = -EINVAL;
			
 
				 			goto bad;
			
 
				 		}
			
 
				 		pi = __lookup_pg_pool(&map->pg_pools, pool);
			
@@ -855,8 +879,11 @@ struct ceph_osdmap *osdmap_apply_incremental(void **p, void *end,
 
				 		if (err < 0)
			
 
				 			goto bad;
			
 
				 	}
			
 
				-	if (version >= 5 && __decode_pool_names(p, end, map) < 0)
			
 
				-		goto bad;
			
 
				+	if (version >= 5) {
			
 
				+		err = __decode_pool_names(p, end, map);
			
 
				+		if (err < 0)
			
 
				+			goto bad;
			
 
				+	}
			
 
				 
			
 
				 	/* old_pool */
			
 
				 	ceph_decode_32_safe(p, end, len, bad);
			
@@ -932,15 +959,13 @@ struct ceph_osdmap *osdmap_apply_incremental(void **p, void *end,
 
				 			(void) __remove_pg_mapping(&map->pg_temp, pgid);
			
 
				 
			
 
				 			/* insert */
			
 
				-			if (pglen > (UINT_MAX - sizeof(*pg)) / sizeof(u32)) {
			
 
				-				err = -EINVAL;
			
 
				+			err = -EINVAL;
			
 
				+			if (pglen > (UINT_MAX - sizeof(*pg)) / sizeof(u32))
			
 
				 				goto bad;
			
 
				-			}
			
 
				+			err = -ENOMEM;
			
 
				 			pg = kmalloc(sizeof(*pg) + sizeof(u32)*pglen, GFP_NOFS);
			
 
				-			if (!pg) {
			
 
				-				err = -ENOMEM;
			
 
				+			if (!pg)
			
 
				 				goto bad;
			
 
				-			}
			
 
				 			pg->pgid = pgid;
			
 
				 			pg->len = pglen;
			
 
				 			for (j = 0; j < pglen; j++)