Explorar o código

Merge branch 'for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/sage/ceph-client

Pull Ceph updates for 3.4-rc1 from Sage Weil:
 "Alex has been busy.  There are a range of rbd and libceph cleanups,
  especially surrounding device setup and teardown, and a few critical
  fixes in that code.  There are more cleanups in the messenger code,
  virtual xattrs, a fix for CRC calculation/checks, and lots of other
  miscellaneous stuff.

  There's a patch from Amon Ott to make inos behave a bit better on
  32-bit boxes, some decode check fixes from Xi Wang, and network
  throttling fix from Jim Schutt, and a couple RBD fixes from Josh
  Durgin.

  No new functionality, just a lot of cleanup and bug fixing."

* 'for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/sage/ceph-client: (65 commits)
  rbd: move snap_rwsem to the device, rename to header_rwsem
  ceph: fix three bugs, two in ceph_vxattrcb_file_layout()
  libceph: isolate kmap() call in write_partial_msg_pages()
  libceph: rename "page_shift" variable to something sensible
  libceph: get rid of zero_page_address
  libceph: only call kernel_sendpage() via helper
  libceph: use kernel_sendpage() for sending zeroes
  libceph: fix inverted crc option logic
  libceph: some simple changes
  libceph: small refactor in write_partial_kvec()
  libceph: do crc calculations outside loop
  libceph: separate CRC calculation from byte swapping
  libceph: use "do" in CRC-related Boolean variables
  ceph: ensure Boolean options support both senses
  libceph: a few small changes
  libceph: make ceph_tcp_connect() return int
  libceph: encapsulate some messenger cleanup code
  libceph: make ceph_msgr_wq private
  libceph: encapsulate connection kvec operations
  libceph: move prepare_write_banner()
  ...
Linus Torvalds %!s(int64=13) %!d(string=hai) anos
pai
achega
56b59b429b

A diferenza do arquivo foi suprimida porque é demasiado grande
+ 419 - 244
drivers/block/rbd.c


+ 0 - 4
drivers/block/rbd_types.h

@@ -41,10 +41,6 @@
 #define RBD_HEADER_SIGNATURE	"RBD"
 #define RBD_HEADER_VERSION	"001.005"
 
-struct rbd_info {
-	__le64 max_id;
-} __attribute__ ((packed));
-
 struct rbd_image_snap_ondisk {
 	__le64 id;
 	__le64 image_size;

+ 6 - 5
fs/ceph/inode.c

@@ -677,18 +677,19 @@ static int fill_inode(struct inode *inode,
 	case S_IFLNK:
 		inode->i_op = &ceph_symlink_iops;
 		if (!ci->i_symlink) {
-			int symlen = iinfo->symlink_len;
+			u32 symlen = iinfo->symlink_len;
 			char *sym;
 
-			BUG_ON(symlen != inode->i_size);
 			spin_unlock(&ci->i_ceph_lock);
 
+			err = -EINVAL;
+			if (WARN_ON(symlen != inode->i_size))
+				goto out;
+
 			err = -ENOMEM;
-			sym = kmalloc(symlen+1, GFP_NOFS);
+			sym = kstrndup(iinfo->symlink, symlen, GFP_NOFS);
 			if (!sym)
 				goto out;
-			memcpy(sym, iinfo->symlink, symlen);
-			sym[symlen] = 0;
 
 			spin_lock(&ci->i_ceph_lock);
 			if (!ci->i_symlink)

+ 3 - 4
fs/ceph/mds_client.c

@@ -402,7 +402,7 @@ static struct ceph_mds_session *register_session(struct ceph_mds_client *mdsc,
 
 	spin_lock_init(&s->s_gen_ttl_lock);
 	s->s_cap_gen = 0;
-	s->s_cap_ttl = 0;
+	s->s_cap_ttl = jiffies - 1;
 
 	spin_lock_init(&s->s_cap_lock);
 	s->s_renew_requested = 0;
@@ -1083,8 +1083,7 @@ static void renewed_caps(struct ceph_mds_client *mdsc,
 	int wake = 0;
 
 	spin_lock(&session->s_cap_lock);
-	was_stale = is_renew && (session->s_cap_ttl == 0 ||
-				 time_after_eq(jiffies, session->s_cap_ttl));
+	was_stale = is_renew && time_after_eq(jiffies, session->s_cap_ttl);
 
 	session->s_cap_ttl = session->s_renew_requested +
 		mdsc->mdsmap->m_session_timeout*HZ;
@@ -2332,7 +2331,7 @@ static void handle_session(struct ceph_mds_session *session,
 			session->s_mds);
 		spin_lock(&session->s_gen_ttl_lock);
 		session->s_cap_gen++;
-		session->s_cap_ttl = 0;
+		session->s_cap_ttl = jiffies - 1;
 		spin_unlock(&session->s_gen_ttl_lock);
 		send_renew_caps(mdsc, session);
 		break;

+ 1 - 1
fs/ceph/snap.c

@@ -331,7 +331,7 @@ static int build_snap_context(struct ceph_snap_realm *realm)
 
 	/* alloc new snap context */
 	err = -ENOMEM;
-	if (num > ULONG_MAX / sizeof(u64) - sizeof(*snapc))
+	if (num > (ULONG_MAX - sizeof(*snapc)) / sizeof(u64))
 		goto fail;
 	snapc = kzalloc(sizeof(*snapc) + num*sizeof(u64), GFP_NOFS);
 	if (!snapc)

+ 17 - 2
fs/ceph/super.c

@@ -130,10 +130,12 @@ enum {
 	Opt_nodirstat,
 	Opt_rbytes,
 	Opt_norbytes,
+	Opt_asyncreaddir,
 	Opt_noasyncreaddir,
 	Opt_dcache,
 	Opt_nodcache,
 	Opt_ino32,
+	Opt_noino32,
 };
 
 static match_table_t fsopt_tokens = {
@@ -153,10 +155,12 @@ static match_table_t fsopt_tokens = {
 	{Opt_nodirstat, "nodirstat"},
 	{Opt_rbytes, "rbytes"},
 	{Opt_norbytes, "norbytes"},
+	{Opt_asyncreaddir, "asyncreaddir"},
 	{Opt_noasyncreaddir, "noasyncreaddir"},
 	{Opt_dcache, "dcache"},
 	{Opt_nodcache, "nodcache"},
 	{Opt_ino32, "ino32"},
+	{Opt_noino32, "noino32"},
 	{-1, NULL}
 };
 
@@ -232,6 +236,9 @@ static int parse_fsopt_token(char *c, void *private)
 	case Opt_norbytes:
 		fsopt->flags &= ~CEPH_MOUNT_OPT_RBYTES;
 		break;
+	case Opt_asyncreaddir:
+		fsopt->flags &= ~CEPH_MOUNT_OPT_NOASYNCREADDIR;
+		break;
 	case Opt_noasyncreaddir:
 		fsopt->flags |= CEPH_MOUNT_OPT_NOASYNCREADDIR;
 		break;
@@ -244,6 +251,9 @@ static int parse_fsopt_token(char *c, void *private)
 	case Opt_ino32:
 		fsopt->flags |= CEPH_MOUNT_OPT_INO32;
 		break;
+	case Opt_noino32:
+		fsopt->flags &= ~CEPH_MOUNT_OPT_INO32;
+		break;
 	default:
 		BUG_ON(token);
 	}
@@ -334,10 +344,12 @@ static int parse_mount_options(struct ceph_mount_options **pfsopt,
 	*path += 2;
 	dout("server path '%s'\n", *path);
 
-	err = ceph_parse_options(popt, options, dev_name, dev_name_end,
+	*popt = ceph_parse_options(options, dev_name, dev_name_end,
 				 parse_fsopt_token, (void *)fsopt);
-	if (err)
+	if (IS_ERR(*popt)) {
+		err = PTR_ERR(*popt);
 		goto out;
+	}
 
 	/* success */
 	*pfsopt = fsopt;
@@ -926,6 +938,7 @@ static int __init init_ceph(void)
 	if (ret)
 		goto out;
 
+	ceph_xattr_init();
 	ret = register_filesystem(&ceph_fs_type);
 	if (ret)
 		goto out_icache;
@@ -935,6 +948,7 @@ static int __init init_ceph(void)
 	return 0;
 
 out_icache:
+	ceph_xattr_exit();
 	destroy_caches();
 out:
 	return ret;
@@ -944,6 +958,7 @@ static void __exit exit_ceph(void)
 {
 	dout("exit_ceph\n");
 	unregister_filesystem(&ceph_fs_type);
+	ceph_xattr_exit();
 	destroy_caches();
 }
 

+ 3 - 1
fs/ceph/super.h

@@ -367,7 +367,7 @@ static inline u32 ceph_ino_to_ino32(__u64 vino)
 	u32 ino = vino & 0xffffffff;
 	ino ^= vino >> 32;
 	if (!ino)
-		ino = 1;
+		ino = 2;
 	return ino;
 }
 
@@ -733,6 +733,8 @@ extern ssize_t ceph_listxattr(struct dentry *, char *, size_t);
 extern int ceph_removexattr(struct dentry *, const char *);
 extern void __ceph_build_xattrs_blob(struct ceph_inode_info *ci);
 extern void __ceph_destroy_xattrs(struct ceph_inode_info *ci);
+extern void __init ceph_xattr_init(void);
+extern void ceph_xattr_exit(void);
 
 /* caps.c */
 extern const char *ceph_cap_string(int c);

+ 133 - 69
fs/ceph/xattr.c

@@ -8,9 +8,12 @@
 #include <linux/xattr.h>
 #include <linux/slab.h>
 
+#define XATTR_CEPH_PREFIX "ceph."
+#define XATTR_CEPH_PREFIX_LEN (sizeof (XATTR_CEPH_PREFIX) - 1)
+
 static bool ceph_is_valid_xattr(const char *name)
 {
-	return !strncmp(name, "ceph.", 5) ||
+	return !strncmp(name, XATTR_CEPH_PREFIX, XATTR_CEPH_PREFIX_LEN) ||
 	       !strncmp(name, XATTR_SECURITY_PREFIX,
 			XATTR_SECURITY_PREFIX_LEN) ||
 	       !strncmp(name, XATTR_TRUSTED_PREFIX, XATTR_TRUSTED_PREFIX_LEN) ||
@@ -21,79 +24,91 @@ static bool ceph_is_valid_xattr(const char *name)
  * These define virtual xattrs exposing the recursive directory
  * statistics and layout metadata.
  */
-struct ceph_vxattr_cb {
-	bool readonly;
+struct ceph_vxattr {
 	char *name;
+	size_t name_size;	/* strlen(name) + 1 (for '\0') */
 	size_t (*getxattr_cb)(struct ceph_inode_info *ci, char *val,
 			      size_t size);
+	bool readonly;
 };
 
 /* directories */
 
-static size_t ceph_vxattrcb_entries(struct ceph_inode_info *ci, char *val,
+static size_t ceph_vxattrcb_dir_entries(struct ceph_inode_info *ci, char *val,
 					size_t size)
 {
 	return snprintf(val, size, "%lld", ci->i_files + ci->i_subdirs);
 }
 
-static size_t ceph_vxattrcb_files(struct ceph_inode_info *ci, char *val,
+static size_t ceph_vxattrcb_dir_files(struct ceph_inode_info *ci, char *val,
 				      size_t size)
 {
 	return snprintf(val, size, "%lld", ci->i_files);
 }
 
-static size_t ceph_vxattrcb_subdirs(struct ceph_inode_info *ci, char *val,
+static size_t ceph_vxattrcb_dir_subdirs(struct ceph_inode_info *ci, char *val,
 					size_t size)
 {
 	return snprintf(val, size, "%lld", ci->i_subdirs);
 }
 
-static size_t ceph_vxattrcb_rentries(struct ceph_inode_info *ci, char *val,
+static size_t ceph_vxattrcb_dir_rentries(struct ceph_inode_info *ci, char *val,
 					 size_t size)
 {
 	return snprintf(val, size, "%lld", ci->i_rfiles + ci->i_rsubdirs);
 }
 
-static size_t ceph_vxattrcb_rfiles(struct ceph_inode_info *ci, char *val,
+static size_t ceph_vxattrcb_dir_rfiles(struct ceph_inode_info *ci, char *val,
 				       size_t size)
 {
 	return snprintf(val, size, "%lld", ci->i_rfiles);
 }
 
-static size_t ceph_vxattrcb_rsubdirs(struct ceph_inode_info *ci, char *val,
+static size_t ceph_vxattrcb_dir_rsubdirs(struct ceph_inode_info *ci, char *val,
 					 size_t size)
 {
 	return snprintf(val, size, "%lld", ci->i_rsubdirs);
 }
 
-static size_t ceph_vxattrcb_rbytes(struct ceph_inode_info *ci, char *val,
+static size_t ceph_vxattrcb_dir_rbytes(struct ceph_inode_info *ci, char *val,
 				       size_t size)
 {
 	return snprintf(val, size, "%lld", ci->i_rbytes);
 }
 
-static size_t ceph_vxattrcb_rctime(struct ceph_inode_info *ci, char *val,
+static size_t ceph_vxattrcb_dir_rctime(struct ceph_inode_info *ci, char *val,
 				       size_t size)
 {
-	return snprintf(val, size, "%ld.%ld", (long)ci->i_rctime.tv_sec,
+	return snprintf(val, size, "%ld.09%ld", (long)ci->i_rctime.tv_sec,
 			(long)ci->i_rctime.tv_nsec);
 }
 
-static struct ceph_vxattr_cb ceph_dir_vxattrs[] = {
-	{ true, "ceph.dir.entries", ceph_vxattrcb_entries},
-	{ true, "ceph.dir.files", ceph_vxattrcb_files},
-	{ true, "ceph.dir.subdirs", ceph_vxattrcb_subdirs},
-	{ true, "ceph.dir.rentries", ceph_vxattrcb_rentries},
-	{ true, "ceph.dir.rfiles", ceph_vxattrcb_rfiles},
-	{ true, "ceph.dir.rsubdirs", ceph_vxattrcb_rsubdirs},
-	{ true, "ceph.dir.rbytes", ceph_vxattrcb_rbytes},
-	{ true, "ceph.dir.rctime", ceph_vxattrcb_rctime},
-	{ true, NULL, NULL }
+#define CEPH_XATTR_NAME(_type, _name)	XATTR_CEPH_PREFIX #_type "." #_name
+
+#define XATTR_NAME_CEPH(_type, _name) \
+		{ \
+			.name = CEPH_XATTR_NAME(_type, _name), \
+			.name_size = sizeof (CEPH_XATTR_NAME(_type, _name)), \
+			.getxattr_cb = ceph_vxattrcb_ ## _type ## _ ## _name, \
+			.readonly = true, \
+		}
+
+static struct ceph_vxattr ceph_dir_vxattrs[] = {
+	XATTR_NAME_CEPH(dir, entries),
+	XATTR_NAME_CEPH(dir, files),
+	XATTR_NAME_CEPH(dir, subdirs),
+	XATTR_NAME_CEPH(dir, rentries),
+	XATTR_NAME_CEPH(dir, rfiles),
+	XATTR_NAME_CEPH(dir, rsubdirs),
+	XATTR_NAME_CEPH(dir, rbytes),
+	XATTR_NAME_CEPH(dir, rctime),
+	{ 0 }	/* Required table terminator */
 };
+static size_t ceph_dir_vxattrs_name_size;	/* total size of all names */
 
 /* files */
 
-static size_t ceph_vxattrcb_layout(struct ceph_inode_info *ci, char *val,
+static size_t ceph_vxattrcb_file_layout(struct ceph_inode_info *ci, char *val,
 				   size_t size)
 {
 	int ret;
@@ -103,21 +118,32 @@ static size_t ceph_vxattrcb_layout(struct ceph_inode_info *ci, char *val,
 		(unsigned long long)ceph_file_layout_su(ci->i_layout),
 		(unsigned long long)ceph_file_layout_stripe_count(ci->i_layout),
 		(unsigned long long)ceph_file_layout_object_size(ci->i_layout));
-	if (ceph_file_layout_pg_preferred(ci->i_layout))
-		ret += snprintf(val + ret, size, "preferred_osd=%lld\n",
+
+	if (ceph_file_layout_pg_preferred(ci->i_layout) >= 0) {
+		val += ret;
+		size -= ret;
+		ret += snprintf(val, size, "preferred_osd=%lld\n",
 			    (unsigned long long)ceph_file_layout_pg_preferred(
 				    ci->i_layout));
+	}
+
 	return ret;
 }
 
-static struct ceph_vxattr_cb ceph_file_vxattrs[] = {
-	{ true, "ceph.file.layout", ceph_vxattrcb_layout},
+static struct ceph_vxattr ceph_file_vxattrs[] = {
+	XATTR_NAME_CEPH(file, layout),
 	/* The following extended attribute name is deprecated */
-	{ true, "ceph.layout", ceph_vxattrcb_layout},
-	{ true, NULL, NULL }
+	{
+		.name = XATTR_CEPH_PREFIX "layout",
+		.name_size = sizeof (XATTR_CEPH_PREFIX "layout"),
+		.getxattr_cb = ceph_vxattrcb_file_layout,
+		.readonly = true,
+	},
+	{ 0 }	/* Required table terminator */
 };
+static size_t ceph_file_vxattrs_name_size;	/* total size of all names */
 
-static struct ceph_vxattr_cb *ceph_inode_vxattrs(struct inode *inode)
+static struct ceph_vxattr *ceph_inode_vxattrs(struct inode *inode)
 {
 	if (S_ISDIR(inode->i_mode))
 		return ceph_dir_vxattrs;
@@ -126,14 +152,59 @@ static struct ceph_vxattr_cb *ceph_inode_vxattrs(struct inode *inode)
 	return NULL;
 }
 
-static struct ceph_vxattr_cb *ceph_match_vxattr(struct ceph_vxattr_cb *vxattr,
+static size_t ceph_vxattrs_name_size(struct ceph_vxattr *vxattrs)
+{
+	if (vxattrs == ceph_dir_vxattrs)
+		return ceph_dir_vxattrs_name_size;
+	if (vxattrs == ceph_file_vxattrs)
+		return ceph_file_vxattrs_name_size;
+	BUG();
+
+	return 0;
+}
+
+/*
+ * Compute the aggregate size (including terminating '\0') of all
+ * virtual extended attribute names in the given vxattr table.
+ */
+static size_t __init vxattrs_name_size(struct ceph_vxattr *vxattrs)
+{
+	struct ceph_vxattr *vxattr;
+	size_t size = 0;
+
+	for (vxattr = vxattrs; vxattr->name; vxattr++)
+		size += vxattr->name_size;
+
+	return size;
+}
+
+/* Routines called at initialization and exit time */
+
+void __init ceph_xattr_init(void)
+{
+	ceph_dir_vxattrs_name_size = vxattrs_name_size(ceph_dir_vxattrs);
+	ceph_file_vxattrs_name_size = vxattrs_name_size(ceph_file_vxattrs);
+}
+
+void ceph_xattr_exit(void)
+{
+	ceph_dir_vxattrs_name_size = 0;
+	ceph_file_vxattrs_name_size = 0;
+}
+
+static struct ceph_vxattr *ceph_match_vxattr(struct inode *inode,
 						const char *name)
 {
-	do {
-		if (strcmp(vxattr->name, name) == 0)
-			return vxattr;
-		vxattr++;
-	} while (vxattr->name);
+	struct ceph_vxattr *vxattr = ceph_inode_vxattrs(inode);
+
+	if (vxattr) {
+		while (vxattr->name) {
+			if (!strcmp(vxattr->name, name))
+				return vxattr;
+			vxattr++;
+		}
+	}
+
 	return NULL;
 }
 
@@ -502,17 +573,15 @@ ssize_t ceph_getxattr(struct dentry *dentry, const char *name, void *value,
 {
 	struct inode *inode = dentry->d_inode;
 	struct ceph_inode_info *ci = ceph_inode(inode);
-	struct ceph_vxattr_cb *vxattrs = ceph_inode_vxattrs(inode);
 	int err;
 	struct ceph_inode_xattr *xattr;
-	struct ceph_vxattr_cb *vxattr = NULL;
+	struct ceph_vxattr *vxattr = NULL;
 
 	if (!ceph_is_valid_xattr(name))
 		return -ENODATA;
 
 	/* let's see if a virtual xattr was requested */
-	if (vxattrs)
-		vxattr = ceph_match_vxattr(vxattrs, name);
+	vxattr = ceph_match_vxattr(inode, name);
 
 	spin_lock(&ci->i_ceph_lock);
 	dout("getxattr %p ver=%lld index_ver=%lld\n", inode,
@@ -568,7 +637,7 @@ ssize_t ceph_listxattr(struct dentry *dentry, char *names, size_t size)
 {
 	struct inode *inode = dentry->d_inode;
 	struct ceph_inode_info *ci = ceph_inode(inode);
-	struct ceph_vxattr_cb *vxattrs = ceph_inode_vxattrs(inode);
+	struct ceph_vxattr *vxattrs = ceph_inode_vxattrs(inode);
 	u32 vir_namelen = 0;
 	u32 namelen;
 	int err;
@@ -596,11 +665,12 @@ ssize_t ceph_listxattr(struct dentry *dentry, char *names, size_t size)
 		goto out;
 
 list_xattr:
-	vir_namelen = 0;
-	/* include virtual dir xattrs */
-	if (vxattrs)
-		for (i = 0; vxattrs[i].name; i++)
-			vir_namelen += strlen(vxattrs[i].name) + 1;
+	/*
+	 * Start with virtual dir xattr names (if any) (including
+	 * terminating '\0' characters for each).
+	 */
+	vir_namelen = ceph_vxattrs_name_size(vxattrs);
+
 	/* adding 1 byte per each variable due to the null termination */
 	namelen = vir_namelen + ci->i_xattrs.names_size + ci->i_xattrs.count;
 	err = -ERANGE;
@@ -698,17 +768,17 @@ int ceph_setxattr(struct dentry *dentry, const char *name,
 		  const void *value, size_t size, int flags)
 {
 	struct inode *inode = dentry->d_inode;
+	struct ceph_vxattr *vxattr;
 	struct ceph_inode_info *ci = ceph_inode(inode);
-	struct ceph_vxattr_cb *vxattrs = ceph_inode_vxattrs(inode);
+	int issued;
 	int err;
+	int dirty;
 	int name_len = strlen(name);
 	int val_len = size;
 	char *newname = NULL;
 	char *newval = NULL;
 	struct ceph_inode_xattr *xattr = NULL;
-	int issued;
 	int required_blob_size;
-	int dirty;
 
 	if (ceph_snap(inode) != CEPH_NOSNAP)
 		return -EROFS;
@@ -716,12 +786,9 @@ int ceph_setxattr(struct dentry *dentry, const char *name,
 	if (!ceph_is_valid_xattr(name))
 		return -EOPNOTSUPP;
 
-	if (vxattrs) {
-		struct ceph_vxattr_cb *vxattr =
-			ceph_match_vxattr(vxattrs, name);
-		if (vxattr && vxattr->readonly)
-			return -EOPNOTSUPP;
-	}
+	vxattr = ceph_match_vxattr(inode, name);
+	if (vxattr && vxattr->readonly)
+		return -EOPNOTSUPP;
 
 	/* preallocate memory for xattr name, value, index node */
 	err = -ENOMEM;
@@ -730,11 +797,9 @@ int ceph_setxattr(struct dentry *dentry, const char *name,
 		goto out;
 
 	if (val_len) {
-		newval = kmalloc(val_len + 1, GFP_NOFS);
+		newval = kmemdup(value, val_len, GFP_NOFS);
 		if (!newval)
 			goto out;
-		memcpy(newval, value, val_len);
-		newval[val_len] = '\0';
 	}
 
 	xattr = kmalloc(sizeof(struct ceph_inode_xattr), GFP_NOFS);
@@ -744,6 +809,7 @@ int ceph_setxattr(struct dentry *dentry, const char *name,
 	spin_lock(&ci->i_ceph_lock);
 retry:
 	issued = __ceph_caps_issued(ci, NULL);
+	dout("setxattr %p issued %s\n", inode, ceph_cap_string(issued));
 	if (!(issued & CEPH_CAP_XATTR_EXCL))
 		goto do_sync;
 	__build_xattrs(inode);
@@ -752,7 +818,7 @@ retry:
 
 	if (!ci->i_xattrs.prealloc_blob ||
 	    required_blob_size > ci->i_xattrs.prealloc_blob->alloc_len) {
-		struct ceph_buffer *blob = NULL;
+		struct ceph_buffer *blob;
 
 		spin_unlock(&ci->i_ceph_lock);
 		dout(" preaallocating new blob size=%d\n", required_blob_size);
@@ -766,12 +832,13 @@ retry:
 		goto retry;
 	}
 
-	dout("setxattr %p issued %s\n", inode, ceph_cap_string(issued));
 	err = __set_xattr(ci, newname, name_len, newval,
 			  val_len, 1, 1, 1, &xattr);
+
 	dirty = __ceph_mark_dirty_caps(ci, CEPH_CAP_XATTR_EXCL);
 	ci->i_xattrs.dirty = true;
 	inode->i_ctime = CURRENT_TIME;
+
 	spin_unlock(&ci->i_ceph_lock);
 	if (dirty)
 		__mark_inode_dirty(inode, dirty);
@@ -816,8 +883,8 @@ static int ceph_send_removexattr(struct dentry *dentry, const char *name)
 int ceph_removexattr(struct dentry *dentry, const char *name)
 {
 	struct inode *inode = dentry->d_inode;
+	struct ceph_vxattr *vxattr;
 	struct ceph_inode_info *ci = ceph_inode(inode);
-	struct ceph_vxattr_cb *vxattrs = ceph_inode_vxattrs(inode);
 	int issued;
 	int err;
 	int required_blob_size;
@@ -829,22 +896,19 @@ int ceph_removexattr(struct dentry *dentry, const char *name)
 	if (!ceph_is_valid_xattr(name))
 		return -EOPNOTSUPP;
 
-	if (vxattrs) {
-		struct ceph_vxattr_cb *vxattr =
-			ceph_match_vxattr(vxattrs, name);
-		if (vxattr && vxattr->readonly)
-			return -EOPNOTSUPP;
-	}
+	vxattr = ceph_match_vxattr(inode, name);
+	if (vxattr && vxattr->readonly)
+		return -EOPNOTSUPP;
 
 	err = -ENOMEM;
 	spin_lock(&ci->i_ceph_lock);
-	__build_xattrs(inode);
 retry:
 	issued = __ceph_caps_issued(ci, NULL);
 	dout("removexattr %p issued %s\n", inode, ceph_cap_string(issued));
 
 	if (!(issued & CEPH_CAP_XATTR_EXCL))
 		goto do_sync;
+	__build_xattrs(inode);
 
 	required_blob_size = __get_required_blob_size(ci, 0, 0);
 
@@ -865,10 +929,10 @@ retry:
 	}
 
 	err = __remove_xattr_by_name(ceph_inode(inode), name);
+
 	dirty = __ceph_mark_dirty_caps(ci, CEPH_CAP_XATTR_EXCL);
 	ci->i_xattrs.dirty = true;
 	inode->i_ctime = CURRENT_TIME;
-
 	spin_unlock(&ci->i_ceph_lock);
 	if (dirty)
 		__mark_inode_dirty(inode, dirty);

+ 1 - 1
include/linux/ceph/libceph.h

@@ -208,7 +208,7 @@ extern struct kmem_cache *ceph_cap_cachep;
 extern struct kmem_cache *ceph_dentry_cachep;
 extern struct kmem_cache *ceph_file_cachep;
 
-extern int ceph_parse_options(struct ceph_options **popt, char *options,
+extern struct ceph_options *ceph_parse_options(char *options,
 			      const char *dev_name, const char *dev_name_end,
 			      int (*parse_extra_token)(char *c, void *private),
 			      void *private);

+ 1 - 4
include/linux/ceph/messenger.h

@@ -14,8 +14,6 @@
 struct ceph_msg;
 struct ceph_connection;
 
-extern struct workqueue_struct *ceph_msgr_wq;       /* receive work queue */
-
 /*
  * Ceph defines these callbacks for handling connection events.
  */
@@ -54,7 +52,6 @@ struct ceph_connection_operations {
 struct ceph_messenger {
 	struct ceph_entity_inst inst;    /* my name+address */
 	struct ceph_entity_addr my_enc_addr;
-	struct page *zero_page;          /* used in certain error cases */
 
 	bool nocrc;
 
@@ -101,7 +98,7 @@ struct ceph_msg {
 struct ceph_msg_pos {
 	int page, page_pos;  /* which page; offset in page */
 	int data_pos;        /* offset in data payload */
-	int did_page_crc;    /* true if we've calculated crc for current page */
+	bool did_page_crc;   /* true if we've calculated crc for current page */
 };
 
 /* ceph connection fault delay defaults, for exponential backoff */

+ 18 - 8
net/ceph/ceph_common.c

@@ -201,7 +201,9 @@ enum {
 	Opt_ip,
 	Opt_last_string,
 	/* string args above */
+	Opt_share,
 	Opt_noshare,
+	Opt_crc,
 	Opt_nocrc,
 };
 
@@ -217,7 +219,9 @@ static match_table_t opt_tokens = {
 	{Opt_key, "key=%s"},
 	{Opt_ip, "ip=%s"},
 	/* string args above */
+	{Opt_share, "share"},
 	{Opt_noshare, "noshare"},
+	{Opt_crc, "crc"},
 	{Opt_nocrc, "nocrc"},
 	{-1, NULL}
 };
@@ -277,10 +281,11 @@ out:
 	return err;
 }
 
-int ceph_parse_options(struct ceph_options **popt, char *options,
-		       const char *dev_name, const char *dev_name_end,
-		       int (*parse_extra_token)(char *c, void *private),
-		       void *private)
+struct ceph_options *
+ceph_parse_options(char *options, const char *dev_name,
+			const char *dev_name_end,
+			int (*parse_extra_token)(char *c, void *private),
+			void *private)
 {
 	struct ceph_options *opt;
 	const char *c;
@@ -289,7 +294,7 @@ int ceph_parse_options(struct ceph_options **popt, char *options,
 
 	opt = kzalloc(sizeof(*opt), GFP_KERNEL);
 	if (!opt)
-		return err;
+		return ERR_PTR(-ENOMEM);
 	opt->mon_addr = kcalloc(CEPH_MAX_MON, sizeof(*opt->mon_addr),
 				GFP_KERNEL);
 	if (!opt->mon_addr)
@@ -398,10 +403,16 @@ int ceph_parse_options(struct ceph_options **popt, char *options,
 			opt->mount_timeout = intval;
 			break;
 
+		case Opt_share:
+			opt->flags &= ~CEPH_OPT_NOSHARE;
+			break;
 		case Opt_noshare:
 			opt->flags |= CEPH_OPT_NOSHARE;
 			break;
 
+		case Opt_crc:
+			opt->flags &= ~CEPH_OPT_NOCRC;
+			break;
 		case Opt_nocrc:
 			opt->flags |= CEPH_OPT_NOCRC;
 			break;
@@ -412,12 +423,11 @@ int ceph_parse_options(struct ceph_options **popt, char *options,
 	}
 
 	/* success */
-	*popt = opt;
-	return 0;
+	return opt;
 
 out:
 	ceph_destroy_options(opt);
-	return err;
+	return ERR_PTR(err);
 }
 EXPORT_SYMBOL(ceph_parse_options);
 

+ 235 - 221
net/ceph/messenger.c

@@ -38,48 +38,54 @@ static char tag_keepalive = CEPH_MSGR_TAG_KEEPALIVE;
 static struct lock_class_key socket_class;
 #endif
 
+/*
+ * When skipping (ignoring) a block of input we read it into a "skip
+ * buffer," which is this many bytes in size.
+ */
+#define SKIP_BUF_SIZE	1024
 
 static void queue_con(struct ceph_connection *con);
 static void con_work(struct work_struct *);
 static void ceph_fault(struct ceph_connection *con);
 
 /*
- * nicely render a sockaddr as a string.
+ * Nicely render a sockaddr as a string.  An array of formatted
+ * strings is used, to approximate reentrancy.
  */
-#define MAX_ADDR_STR 20
-#define MAX_ADDR_STR_LEN 60
-static char addr_str[MAX_ADDR_STR][MAX_ADDR_STR_LEN];
-static DEFINE_SPINLOCK(addr_str_lock);
-static int last_addr_str;
+#define ADDR_STR_COUNT_LOG	5	/* log2(# address strings in array) */
+#define ADDR_STR_COUNT		(1 << ADDR_STR_COUNT_LOG)
+#define ADDR_STR_COUNT_MASK	(ADDR_STR_COUNT - 1)
+#define MAX_ADDR_STR_LEN	64	/* 54 is enough */
+
+static char addr_str[ADDR_STR_COUNT][MAX_ADDR_STR_LEN];
+static atomic_t addr_str_seq = ATOMIC_INIT(0);
+
+static struct page *zero_page;		/* used in certain error cases */
 
 const char *ceph_pr_addr(const struct sockaddr_storage *ss)
 {
 	int i;
 	char *s;
-	struct sockaddr_in *in4 = (void *)ss;
-	struct sockaddr_in6 *in6 = (void *)ss;
-
-	spin_lock(&addr_str_lock);
-	i = last_addr_str++;
-	if (last_addr_str == MAX_ADDR_STR)
-		last_addr_str = 0;
-	spin_unlock(&addr_str_lock);
+	struct sockaddr_in *in4 = (struct sockaddr_in *) ss;
+	struct sockaddr_in6 *in6 = (struct sockaddr_in6 *) ss;
+
+	i = atomic_inc_return(&addr_str_seq) & ADDR_STR_COUNT_MASK;
 	s = addr_str[i];
 
 	switch (ss->ss_family) {
 	case AF_INET:
-		snprintf(s, MAX_ADDR_STR_LEN, "%pI4:%u", &in4->sin_addr,
-			 (unsigned int)ntohs(in4->sin_port));
+		snprintf(s, MAX_ADDR_STR_LEN, "%pI4:%hu", &in4->sin_addr,
+			 ntohs(in4->sin_port));
 		break;
 
 	case AF_INET6:
-		snprintf(s, MAX_ADDR_STR_LEN, "[%pI6c]:%u", &in6->sin6_addr,
-			 (unsigned int)ntohs(in6->sin6_port));
+		snprintf(s, MAX_ADDR_STR_LEN, "[%pI6c]:%hu", &in6->sin6_addr,
+			 ntohs(in6->sin6_port));
 		break;
 
 	default:
-		snprintf(s, MAX_ADDR_STR_LEN, "(unknown sockaddr family %d)",
-			 (int)ss->ss_family);
+		snprintf(s, MAX_ADDR_STR_LEN, "(unknown sockaddr family %hu)",
+			 ss->ss_family);
 	}
 
 	return s;
@@ -95,22 +101,43 @@ static void encode_my_addr(struct ceph_messenger *msgr)
 /*
  * work queue for all reading and writing to/from the socket.
  */
-struct workqueue_struct *ceph_msgr_wq;
+static struct workqueue_struct *ceph_msgr_wq;
+
+void _ceph_msgr_exit(void)
+{
+	if (ceph_msgr_wq) {
+		destroy_workqueue(ceph_msgr_wq);
+		ceph_msgr_wq = NULL;
+	}
+
+	BUG_ON(zero_page == NULL);
+	kunmap(zero_page);
+	page_cache_release(zero_page);
+	zero_page = NULL;
+}
 
 int ceph_msgr_init(void)
 {
+	BUG_ON(zero_page != NULL);
+	zero_page = ZERO_PAGE(0);
+	page_cache_get(zero_page);
+
 	ceph_msgr_wq = alloc_workqueue("ceph-msgr", WQ_NON_REENTRANT, 0);
-	if (!ceph_msgr_wq) {
-		pr_err("msgr_init failed to create workqueue\n");
-		return -ENOMEM;
-	}
-	return 0;
+	if (ceph_msgr_wq)
+		return 0;
+
+	pr_err("msgr_init failed to create workqueue\n");
+	_ceph_msgr_exit();
+
+	return -ENOMEM;
 }
 EXPORT_SYMBOL(ceph_msgr_init);
 
 void ceph_msgr_exit(void)
 {
-	destroy_workqueue(ceph_msgr_wq);
+	BUG_ON(ceph_msgr_wq == NULL);
+
+	_ceph_msgr_exit();
 }
 EXPORT_SYMBOL(ceph_msgr_exit);
 
@@ -128,8 +155,8 @@ EXPORT_SYMBOL(ceph_msgr_flush);
 /* data available on socket, or listen socket received a connect */
 static void ceph_data_ready(struct sock *sk, int count_unused)
 {
-	struct ceph_connection *con =
-		(struct ceph_connection *)sk->sk_user_data;
+	struct ceph_connection *con = sk->sk_user_data;
+
 	if (sk->sk_state != TCP_CLOSE_WAIT) {
 		dout("ceph_data_ready on %p state = %lu, queueing work\n",
 		     con, con->state);
@@ -140,26 +167,30 @@ static void ceph_data_ready(struct sock *sk, int count_unused)
 /* socket has buffer space for writing */
 static void ceph_write_space(struct sock *sk)
 {
-	struct ceph_connection *con =
-		(struct ceph_connection *)sk->sk_user_data;
+	struct ceph_connection *con = sk->sk_user_data;
 
-	/* only queue to workqueue if there is data we want to write. */
+	/* only queue to workqueue if there is data we want to write,
+	 * and there is sufficient space in the socket buffer to accept
+	 * more data.  clear SOCK_NOSPACE so that ceph_write_space()
+	 * doesn't get called again until try_write() fills the socket
+	 * buffer. See net/ipv4/tcp_input.c:tcp_check_space()
+	 * and net/core/stream.c:sk_stream_write_space().
+	 */
 	if (test_bit(WRITE_PENDING, &con->state)) {
-		dout("ceph_write_space %p queueing write work\n", con);
-		queue_con(con);
+		if (sk_stream_wspace(sk) >= sk_stream_min_wspace(sk)) {
+			dout("ceph_write_space %p queueing write work\n", con);
+			clear_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
+			queue_con(con);
+		}
 	} else {
 		dout("ceph_write_space %p nothing to write\n", con);
 	}
-
-	/* since we have our own write_space, clear the SOCK_NOSPACE flag */
-	clear_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
 }
 
 /* socket's state has changed */
 static void ceph_state_change(struct sock *sk)
 {
-	struct ceph_connection *con =
-		(struct ceph_connection *)sk->sk_user_data;
+	struct ceph_connection *con = sk->sk_user_data;
 
 	dout("ceph_state_change %p state = %lu sk_state = %u\n",
 	     con, con->state, sk->sk_state);
@@ -184,6 +215,8 @@ static void ceph_state_change(struct sock *sk)
 		dout("ceph_state_change TCP_ESTABLISHED\n");
 		queue_con(con);
 		break;
+	default:	/* Everything else is uninteresting */
+		break;
 	}
 }
 
@@ -194,7 +227,7 @@ static void set_sock_callbacks(struct socket *sock,
 			       struct ceph_connection *con)
 {
 	struct sock *sk = sock->sk;
-	sk->sk_user_data = (void *)con;
+	sk->sk_user_data = con;
 	sk->sk_data_ready = ceph_data_ready;
 	sk->sk_write_space = ceph_write_space;
 	sk->sk_state_change = ceph_state_change;
@@ -208,7 +241,7 @@ static void set_sock_callbacks(struct socket *sock,
 /*
  * initiate connection to a remote socket.
  */
-static struct socket *ceph_tcp_connect(struct ceph_connection *con)
+static int ceph_tcp_connect(struct ceph_connection *con)
 {
 	struct sockaddr_storage *paddr = &con->peer_addr.in_addr;
 	struct socket *sock;
@@ -218,8 +251,7 @@ static struct socket *ceph_tcp_connect(struct ceph_connection *con)
 	ret = sock_create_kern(con->peer_addr.in_addr.ss_family, SOCK_STREAM,
 			       IPPROTO_TCP, &sock);
 	if (ret)
-		return ERR_PTR(ret);
-	con->sock = sock;
+		return ret;
 	sock->sk->sk_allocation = GFP_NOFS;
 
 #ifdef CONFIG_LOCKDEP
@@ -236,19 +268,17 @@ static struct socket *ceph_tcp_connect(struct ceph_connection *con)
 		dout("connect %s EINPROGRESS sk_state = %u\n",
 		     ceph_pr_addr(&con->peer_addr.in_addr),
 		     sock->sk->sk_state);
-		ret = 0;
-	}
-	if (ret < 0) {
+	} else if (ret < 0) {
 		pr_err("connect %s error %d\n",
 		       ceph_pr_addr(&con->peer_addr.in_addr), ret);
 		sock_release(sock);
-		con->sock = NULL;
 		con->error_msg = "connect error";
+
+		return ret;
 	}
+	con->sock = sock;
 
-	if (ret < 0)
-		return ERR_PTR(ret);
-	return sock;
+	return 0;
 }
 
 static int ceph_tcp_recvmsg(struct socket *sock, void *buf, size_t len)
@@ -284,6 +314,19 @@ static int ceph_tcp_sendmsg(struct socket *sock, struct kvec *iov,
 	return r;
 }
 
+static int ceph_tcp_sendpage(struct socket *sock, struct page *page,
+		     int offset, size_t size, int more)
+{
+	int flags = MSG_DONTWAIT | MSG_NOSIGNAL | (more ? MSG_MORE : MSG_EOR);
+	int ret;
+
+	ret = kernel_sendpage(sock, page, offset, size, flags);
+	if (ret == -EAGAIN)
+		ret = 0;
+
+	return ret;
+}
+
 
 /*
  * Shutdown/close the socket for the given connection.
@@ -391,22 +434,23 @@ bool ceph_con_opened(struct ceph_connection *con)
  */
 struct ceph_connection *ceph_con_get(struct ceph_connection *con)
 {
-	dout("con_get %p nref = %d -> %d\n", con,
-	     atomic_read(&con->nref), atomic_read(&con->nref) + 1);
-	if (atomic_inc_not_zero(&con->nref))
-		return con;
-	return NULL;
+	int nref = __atomic_add_unless(&con->nref, 1, 0);
+
+	dout("con_get %p nref = %d -> %d\n", con, nref, nref + 1);
+
+	return nref ? con : NULL;
 }
 
 void ceph_con_put(struct ceph_connection *con)
 {
-	dout("con_put %p nref = %d -> %d\n", con,
-	     atomic_read(&con->nref), atomic_read(&con->nref) - 1);
-	BUG_ON(atomic_read(&con->nref) == 0);
-	if (atomic_dec_and_test(&con->nref)) {
+	int nref = atomic_dec_return(&con->nref);
+
+	BUG_ON(nref < 0);
+	if (nref == 0) {
 		BUG_ON(con->sock);
 		kfree(con);
 	}
+	dout("con_put %p nref = %d -> %d\n", con, nref + 1, nref);
 }
 
 /*
@@ -442,14 +486,35 @@ static u32 get_global_seq(struct ceph_messenger *msgr, u32 gt)
 	return ret;
 }
 
+static void ceph_con_out_kvec_reset(struct ceph_connection *con)
+{
+	con->out_kvec_left = 0;
+	con->out_kvec_bytes = 0;
+	con->out_kvec_cur = &con->out_kvec[0];
+}
+
+static void ceph_con_out_kvec_add(struct ceph_connection *con,
+				size_t size, void *data)
+{
+	int index;
+
+	index = con->out_kvec_left;
+	BUG_ON(index >= ARRAY_SIZE(con->out_kvec));
+
+	con->out_kvec[index].iov_len = size;
+	con->out_kvec[index].iov_base = data;
+	con->out_kvec_left++;
+	con->out_kvec_bytes += size;
+}
 
 /*
  * Prepare footer for currently outgoing message, and finish things
  * off.  Assumes out_kvec* are already valid.. we just add on to the end.
  */
-static void prepare_write_message_footer(struct ceph_connection *con, int v)
+static void prepare_write_message_footer(struct ceph_connection *con)
 {
 	struct ceph_msg *m = con->out_msg;
+	int v = con->out_kvec_left;
 
 	dout("prepare_write_message_footer %p\n", con);
 	con->out_kvec_is_msg = true;
@@ -467,9 +532,9 @@ static void prepare_write_message_footer(struct ceph_connection *con, int v)
 static void prepare_write_message(struct ceph_connection *con)
 {
 	struct ceph_msg *m;
-	int v = 0;
+	u32 crc;
 
-	con->out_kvec_bytes = 0;
+	ceph_con_out_kvec_reset(con);
 	con->out_kvec_is_msg = true;
 	con->out_msg_done = false;
 
@@ -477,16 +542,13 @@ static void prepare_write_message(struct ceph_connection *con)
 	 * TCP packet that's a good thing. */
 	if (con->in_seq > con->in_seq_acked) {
 		con->in_seq_acked = con->in_seq;
-		con->out_kvec[v].iov_base = &tag_ack;
-		con->out_kvec[v++].iov_len = 1;
+		ceph_con_out_kvec_add(con, sizeof (tag_ack), &tag_ack);
 		con->out_temp_ack = cpu_to_le64(con->in_seq_acked);
-		con->out_kvec[v].iov_base = &con->out_temp_ack;
-		con->out_kvec[v++].iov_len = sizeof(con->out_temp_ack);
-		con->out_kvec_bytes = 1 + sizeof(con->out_temp_ack);
+		ceph_con_out_kvec_add(con, sizeof (con->out_temp_ack),
+			&con->out_temp_ack);
 	}
 
-	m = list_first_entry(&con->out_queue,
-		       struct ceph_msg, list_head);
+	m = list_first_entry(&con->out_queue, struct ceph_msg, list_head);
 	con->out_msg = m;
 
 	/* put message on sent list */
@@ -510,30 +572,26 @@ static void prepare_write_message(struct ceph_connection *con)
 	BUG_ON(le32_to_cpu(m->hdr.front_len) != m->front.iov_len);
 
 	/* tag + hdr + front + middle */
-	con->out_kvec[v].iov_base = &tag_msg;
-	con->out_kvec[v++].iov_len = 1;
-	con->out_kvec[v].iov_base = &m->hdr;
-	con->out_kvec[v++].iov_len = sizeof(m->hdr);
-	con->out_kvec[v++] = m->front;
+	ceph_con_out_kvec_add(con, sizeof (tag_msg), &tag_msg);
+	ceph_con_out_kvec_add(con, sizeof (m->hdr), &m->hdr);
+	ceph_con_out_kvec_add(con, m->front.iov_len, m->front.iov_base);
+
 	if (m->middle)
-		con->out_kvec[v++] = m->middle->vec;
-	con->out_kvec_left = v;
-	con->out_kvec_bytes += 1 + sizeof(m->hdr) + m->front.iov_len +
-		(m->middle ? m->middle->vec.iov_len : 0);
-	con->out_kvec_cur = con->out_kvec;
+		ceph_con_out_kvec_add(con, m->middle->vec.iov_len,
+			m->middle->vec.iov_base);
 
 	/* fill in crc (except data pages), footer */
-	con->out_msg->hdr.crc =
-		cpu_to_le32(crc32c(0, (void *)&m->hdr,
-				      sizeof(m->hdr) - sizeof(m->hdr.crc)));
+	crc = crc32c(0, &m->hdr, offsetof(struct ceph_msg_header, crc));
+	con->out_msg->hdr.crc = cpu_to_le32(crc);
 	con->out_msg->footer.flags = CEPH_MSG_FOOTER_COMPLETE;
-	con->out_msg->footer.front_crc =
-		cpu_to_le32(crc32c(0, m->front.iov_base, m->front.iov_len));
-	if (m->middle)
-		con->out_msg->footer.middle_crc =
-			cpu_to_le32(crc32c(0, m->middle->vec.iov_base,
-					   m->middle->vec.iov_len));
-	else
+
+	crc = crc32c(0, m->front.iov_base, m->front.iov_len);
+	con->out_msg->footer.front_crc = cpu_to_le32(crc);
+	if (m->middle) {
+		crc = crc32c(0, m->middle->vec.iov_base,
+				m->middle->vec.iov_len);
+		con->out_msg->footer.middle_crc = cpu_to_le32(crc);
+	} else
 		con->out_msg->footer.middle_crc = 0;
 	con->out_msg->footer.data_crc = 0;
 	dout("prepare_write_message front_crc %u data_crc %u\n",
@@ -549,11 +607,11 @@ static void prepare_write_message(struct ceph_connection *con)
 		else
 			con->out_msg_pos.page_pos = 0;
 		con->out_msg_pos.data_pos = 0;
-		con->out_msg_pos.did_page_crc = 0;
+		con->out_msg_pos.did_page_crc = false;
 		con->out_more = 1;  /* data + footer will follow */
 	} else {
 		/* no, queue up footer too and be done */
-		prepare_write_message_footer(con, v);
+		prepare_write_message_footer(con);
 	}
 
 	set_bit(WRITE_PENDING, &con->state);
@@ -568,14 +626,14 @@ static void prepare_write_ack(struct ceph_connection *con)
 	     con->in_seq_acked, con->in_seq);
 	con->in_seq_acked = con->in_seq;
 
-	con->out_kvec[0].iov_base = &tag_ack;
-	con->out_kvec[0].iov_len = 1;
+	ceph_con_out_kvec_reset(con);
+
+	ceph_con_out_kvec_add(con, sizeof (tag_ack), &tag_ack);
+
 	con->out_temp_ack = cpu_to_le64(con->in_seq_acked);
-	con->out_kvec[1].iov_base = &con->out_temp_ack;
-	con->out_kvec[1].iov_len = sizeof(con->out_temp_ack);
-	con->out_kvec_left = 2;
-	con->out_kvec_bytes = 1 + sizeof(con->out_temp_ack);
-	con->out_kvec_cur = con->out_kvec;
+	ceph_con_out_kvec_add(con, sizeof (con->out_temp_ack),
+				&con->out_temp_ack);
+
 	con->out_more = 1;  /* more will follow.. eventually.. */
 	set_bit(WRITE_PENDING, &con->state);
 }
@@ -586,11 +644,8 @@ static void prepare_write_ack(struct ceph_connection *con)
 static void prepare_write_keepalive(struct ceph_connection *con)
 {
 	dout("prepare_write_keepalive %p\n", con);
-	con->out_kvec[0].iov_base = &tag_keepalive;
-	con->out_kvec[0].iov_len = 1;
-	con->out_kvec_left = 1;
-	con->out_kvec_bytes = 1;
-	con->out_kvec_cur = con->out_kvec;
+	ceph_con_out_kvec_reset(con);
+	ceph_con_out_kvec_add(con, sizeof (tag_keepalive), &tag_keepalive);
 	set_bit(WRITE_PENDING, &con->state);
 }
 
@@ -619,12 +674,9 @@ static int prepare_connect_authorizer(struct ceph_connection *con)
 	con->out_connect.authorizer_protocol = cpu_to_le32(auth_protocol);
 	con->out_connect.authorizer_len = cpu_to_le32(auth_len);
 
-	if (auth_len) {
-		con->out_kvec[con->out_kvec_left].iov_base = auth_buf;
-		con->out_kvec[con->out_kvec_left].iov_len = auth_len;
-		con->out_kvec_left++;
-		con->out_kvec_bytes += auth_len;
-	}
+	if (auth_len)
+		ceph_con_out_kvec_add(con, auth_len, auth_buf);
+
 	return 0;
 }
 
@@ -634,22 +686,18 @@ static int prepare_connect_authorizer(struct ceph_connection *con)
 static void prepare_write_banner(struct ceph_messenger *msgr,
 				 struct ceph_connection *con)
 {
-	int len = strlen(CEPH_BANNER);
+	ceph_con_out_kvec_reset(con);
+	ceph_con_out_kvec_add(con, strlen(CEPH_BANNER), CEPH_BANNER);
+	ceph_con_out_kvec_add(con, sizeof (msgr->my_enc_addr),
+					&msgr->my_enc_addr);
 
-	con->out_kvec[0].iov_base = CEPH_BANNER;
-	con->out_kvec[0].iov_len = len;
-	con->out_kvec[1].iov_base = &msgr->my_enc_addr;
-	con->out_kvec[1].iov_len = sizeof(msgr->my_enc_addr);
-	con->out_kvec_left = 2;
-	con->out_kvec_bytes = len + sizeof(msgr->my_enc_addr);
-	con->out_kvec_cur = con->out_kvec;
 	con->out_more = 0;
 	set_bit(WRITE_PENDING, &con->state);
 }
 
 static int prepare_write_connect(struct ceph_messenger *msgr,
 				 struct ceph_connection *con,
-				 int after_banner)
+				 int include_banner)
 {
 	unsigned global_seq = get_global_seq(con->msgr, 0);
 	int proto;
@@ -678,22 +726,18 @@ static int prepare_write_connect(struct ceph_messenger *msgr,
 	con->out_connect.protocol_version = cpu_to_le32(proto);
 	con->out_connect.flags = 0;
 
-	if (!after_banner) {
-		con->out_kvec_left = 0;
-		con->out_kvec_bytes = 0;
-	}
-	con->out_kvec[con->out_kvec_left].iov_base = &con->out_connect;
-	con->out_kvec[con->out_kvec_left].iov_len = sizeof(con->out_connect);
-	con->out_kvec_left++;
-	con->out_kvec_bytes += sizeof(con->out_connect);
-	con->out_kvec_cur = con->out_kvec;
+	if (include_banner)
+		prepare_write_banner(msgr, con);
+	else
+		ceph_con_out_kvec_reset(con);
+	ceph_con_out_kvec_add(con, sizeof (con->out_connect), &con->out_connect);
+
 	con->out_more = 0;
 	set_bit(WRITE_PENDING, &con->state);
 
 	return prepare_connect_authorizer(con);
 }
 
-
 /*
  * write as much of pending kvecs to the socket as we can.
  *  1 -> done
@@ -714,17 +758,18 @@ static int write_partial_kvec(struct ceph_connection *con)
 		con->out_kvec_bytes -= ret;
 		if (con->out_kvec_bytes == 0)
 			break;            /* done */
-		while (ret > 0) {
-			if (ret >= con->out_kvec_cur->iov_len) {
-				ret -= con->out_kvec_cur->iov_len;
-				con->out_kvec_cur++;
-				con->out_kvec_left--;
-			} else {
-				con->out_kvec_cur->iov_len -= ret;
-				con->out_kvec_cur->iov_base += ret;
-				ret = 0;
-				break;
-			}
+
+		/* account for full iov entries consumed */
+		while (ret >= con->out_kvec_cur->iov_len) {
+			BUG_ON(!con->out_kvec_left);
+			ret -= con->out_kvec_cur->iov_len;
+			con->out_kvec_cur++;
+			con->out_kvec_left--;
+		}
+		/* and for a partially-consumed entry */
+		if (ret) {
+			con->out_kvec_cur->iov_len -= ret;
+			con->out_kvec_cur->iov_base += ret;
 		}
 	}
 	con->out_kvec_left = 0;
@@ -773,7 +818,7 @@ static int write_partial_msg_pages(struct ceph_connection *con)
 	struct ceph_msg *msg = con->out_msg;
 	unsigned data_len = le32_to_cpu(msg->hdr.data_len);
 	size_t len;
-	int crc = con->msgr->nocrc;
+	bool do_datacrc = !con->msgr->nocrc;
 	int ret;
 	int total_max_write;
 	int in_trail = 0;
@@ -790,9 +835,8 @@ static int write_partial_msg_pages(struct ceph_connection *con)
 
 	while (data_len > con->out_msg_pos.data_pos) {
 		struct page *page = NULL;
-		void *kaddr = NULL;
 		int max_write = PAGE_SIZE;
-		int page_shift = 0;
+		int bio_offset = 0;
 
 		total_max_write = data_len - trail_len -
 			con->out_msg_pos.data_pos;
@@ -811,58 +855,47 @@ static int write_partial_msg_pages(struct ceph_connection *con)
 
 			page = list_first_entry(&msg->trail->head,
 						struct page, lru);
-			if (crc)
-				kaddr = kmap(page);
 			max_write = PAGE_SIZE;
 		} else if (msg->pages) {
 			page = msg->pages[con->out_msg_pos.page];
-			if (crc)
-				kaddr = kmap(page);
 		} else if (msg->pagelist) {
 			page = list_first_entry(&msg->pagelist->head,
 						struct page, lru);
-			if (crc)
-				kaddr = kmap(page);
 #ifdef CONFIG_BLOCK
 		} else if (msg->bio) {
 			struct bio_vec *bv;
 
 			bv = bio_iovec_idx(msg->bio_iter, msg->bio_seg);
 			page = bv->bv_page;
-			page_shift = bv->bv_offset;
-			if (crc)
-				kaddr = kmap(page) + page_shift;
+			bio_offset = bv->bv_offset;
 			max_write = bv->bv_len;
 #endif
 		} else {
-			page = con->msgr->zero_page;
-			if (crc)
-				kaddr = page_address(con->msgr->zero_page);
+			page = zero_page;
 		}
 		len = min_t(int, max_write - con->out_msg_pos.page_pos,
 			    total_max_write);
 
-		if (crc && !con->out_msg_pos.did_page_crc) {
-			void *base = kaddr + con->out_msg_pos.page_pos;
+		if (do_datacrc && !con->out_msg_pos.did_page_crc) {
+			void *base;
+			u32 crc;
 			u32 tmpcrc = le32_to_cpu(con->out_msg->footer.data_crc);
+			char *kaddr;
 
+			kaddr = kmap(page);
 			BUG_ON(kaddr == NULL);
-			con->out_msg->footer.data_crc =
-				cpu_to_le32(crc32c(tmpcrc, base, len));
-			con->out_msg_pos.did_page_crc = 1;
+			base = kaddr + con->out_msg_pos.page_pos + bio_offset;
+			crc = crc32c(tmpcrc, base, len);
+			con->out_msg->footer.data_crc = cpu_to_le32(crc);
+			con->out_msg_pos.did_page_crc = true;
 		}
-		ret = kernel_sendpage(con->sock, page,
-				      con->out_msg_pos.page_pos + page_shift,
-				      len,
-				      MSG_DONTWAIT | MSG_NOSIGNAL |
-				      MSG_MORE);
-
-		if (crc &&
-		    (msg->pages || msg->pagelist || msg->bio || in_trail))
+		ret = ceph_tcp_sendpage(con->sock, page,
+				      con->out_msg_pos.page_pos + bio_offset,
+				      len, 1);
+
+		if (do_datacrc)
 			kunmap(page);
 
-		if (ret == -EAGAIN)
-			ret = 0;
 		if (ret <= 0)
 			goto out;
 
@@ -871,7 +904,7 @@ static int write_partial_msg_pages(struct ceph_connection *con)
 		if (ret == len) {
 			con->out_msg_pos.page_pos = 0;
 			con->out_msg_pos.page++;
-			con->out_msg_pos.did_page_crc = 0;
+			con->out_msg_pos.did_page_crc = false;
 			if (in_trail)
 				list_move_tail(&page->lru,
 					       &msg->trail->head);
@@ -888,12 +921,10 @@ static int write_partial_msg_pages(struct ceph_connection *con)
 	dout("write_partial_msg_pages %p msg %p done\n", con, msg);
 
 	/* prepare and queue up footer, too */
-	if (!crc)
+	if (!do_datacrc)
 		con->out_msg->footer.flags |= CEPH_MSG_FOOTER_NOCRC;
-	con->out_kvec_bytes = 0;
-	con->out_kvec_left = 0;
-	con->out_kvec_cur = con->out_kvec;
-	prepare_write_message_footer(con, 0);
+	ceph_con_out_kvec_reset(con);
+	prepare_write_message_footer(con);
 	ret = 1;
 out:
 	return ret;
@@ -907,12 +938,9 @@ static int write_partial_skip(struct ceph_connection *con)
 	int ret;
 
 	while (con->out_skip > 0) {
-		struct kvec iov = {
-			.iov_base = page_address(con->msgr->zero_page),
-			.iov_len = min(con->out_skip, (int)PAGE_CACHE_SIZE)
-		};
+		size_t size = min(con->out_skip, (int) PAGE_CACHE_SIZE);
 
-		ret = ceph_tcp_sendmsg(con->sock, &iov, 1, iov.iov_len, 1);
+		ret = ceph_tcp_sendpage(con->sock, zero_page, 0, size, 1);
 		if (ret <= 0)
 			goto out;
 		con->out_skip -= ret;
@@ -1085,8 +1113,8 @@ static void addr_set_port(struct sockaddr_storage *ss, int p)
 static int ceph_pton(const char *str, size_t len, struct sockaddr_storage *ss,
 		char delim, const char **ipend)
 {
-	struct sockaddr_in *in4 = (void *)ss;
-	struct sockaddr_in6 *in6 = (void *)ss;
+	struct sockaddr_in *in4 = (struct sockaddr_in *) ss;
+	struct sockaddr_in6 *in6 = (struct sockaddr_in6 *) ss;
 
 	memset(ss, 0, sizeof(*ss));
 
@@ -1512,10 +1540,9 @@ static int read_partial_message_section(struct ceph_connection *con,
 		if (ret <= 0)
 			return ret;
 		section->iov_len += ret;
-		if (section->iov_len == sec_len)
-			*crc = crc32c(0, section->iov_base,
-				      section->iov_len);
 	}
+	if (section->iov_len == sec_len)
+		*crc = crc32c(0, section->iov_base, section->iov_len);
 
 	return 1;
 }
@@ -1527,7 +1554,7 @@ static struct ceph_msg *ceph_alloc_msg(struct ceph_connection *con,
 
 static int read_partial_message_pages(struct ceph_connection *con,
 				      struct page **pages,
-				      unsigned data_len, int datacrc)
+				      unsigned data_len, bool do_datacrc)
 {
 	void *p;
 	int ret;
@@ -1540,7 +1567,7 @@ static int read_partial_message_pages(struct ceph_connection *con,
 	p = kmap(pages[con->in_msg_pos.page]);
 	ret = ceph_tcp_recvmsg(con->sock, p + con->in_msg_pos.page_pos,
 			       left);
-	if (ret > 0 && datacrc)
+	if (ret > 0 && do_datacrc)
 		con->in_data_crc =
 			crc32c(con->in_data_crc,
 				  p + con->in_msg_pos.page_pos, ret);
@@ -1560,7 +1587,7 @@ static int read_partial_message_pages(struct ceph_connection *con,
 #ifdef CONFIG_BLOCK
 static int read_partial_message_bio(struct ceph_connection *con,
 				    struct bio **bio_iter, int *bio_seg,
-				    unsigned data_len, int datacrc)
+				    unsigned data_len, bool do_datacrc)
 {
 	struct bio_vec *bv = bio_iovec_idx(*bio_iter, *bio_seg);
 	void *p;
@@ -1576,7 +1603,7 @@ static int read_partial_message_bio(struct ceph_connection *con,
 
 	ret = ceph_tcp_recvmsg(con->sock, p + con->in_msg_pos.page_pos,
 			       left);
-	if (ret > 0 && datacrc)
+	if (ret > 0 && do_datacrc)
 		con->in_data_crc =
 			crc32c(con->in_data_crc,
 				  p + con->in_msg_pos.page_pos, ret);
@@ -1603,9 +1630,10 @@ static int read_partial_message(struct ceph_connection *con)
 	int ret;
 	int to, left;
 	unsigned front_len, middle_len, data_len;
-	int datacrc = con->msgr->nocrc;
+	bool do_datacrc = !con->msgr->nocrc;
 	int skip;
 	u64 seq;
+	u32 crc;
 
 	dout("read_partial_message con %p msg %p\n", con, m);
 
@@ -1618,17 +1646,16 @@ static int read_partial_message(struct ceph_connection *con)
 		if (ret <= 0)
 			return ret;
 		con->in_base_pos += ret;
-		if (con->in_base_pos == sizeof(con->in_hdr)) {
-			u32 crc = crc32c(0, (void *)&con->in_hdr,
-				 sizeof(con->in_hdr) - sizeof(con->in_hdr.crc));
-			if (crc != le32_to_cpu(con->in_hdr.crc)) {
-				pr_err("read_partial_message bad hdr "
-				       " crc %u != expected %u\n",
-				       crc, con->in_hdr.crc);
-				return -EBADMSG;
-			}
-		}
 	}
+
+	crc = crc32c(0, &con->in_hdr, offsetof(struct ceph_msg_header, crc));
+	if (cpu_to_le32(crc) != con->in_hdr.crc) {
+		pr_err("read_partial_message bad hdr "
+		       " crc %u != expected %u\n",
+		       crc, con->in_hdr.crc);
+		return -EBADMSG;
+	}
+
 	front_len = le32_to_cpu(con->in_hdr.front_len);
 	if (front_len > CEPH_MSG_MAX_FRONT_LEN)
 		return -EIO;
@@ -1714,7 +1741,7 @@ static int read_partial_message(struct ceph_connection *con)
 	while (con->in_msg_pos.data_pos < data_len) {
 		if (m->pages) {
 			ret = read_partial_message_pages(con, m->pages,
-						 data_len, datacrc);
+						 data_len, do_datacrc);
 			if (ret <= 0)
 				return ret;
 #ifdef CONFIG_BLOCK
@@ -1722,7 +1749,7 @@ static int read_partial_message(struct ceph_connection *con)
 
 			ret = read_partial_message_bio(con,
 						 &m->bio_iter, &m->bio_seg,
-						 data_len, datacrc);
+						 data_len, do_datacrc);
 			if (ret <= 0)
 				return ret;
 #endif
@@ -1757,7 +1784,7 @@ static int read_partial_message(struct ceph_connection *con)
 		       m, con->in_middle_crc, m->footer.middle_crc);
 		return -EBADMSG;
 	}
-	if (datacrc &&
+	if (do_datacrc &&
 	    (m->footer.flags & CEPH_MSG_FOOTER_NOCRC) == 0 &&
 	    con->in_data_crc != le32_to_cpu(m->footer.data_crc)) {
 		pr_err("read_partial_message %p data crc %u != exp. %u\n", m,
@@ -1819,7 +1846,6 @@ more:
 
 	/* open the socket first? */
 	if (con->sock == NULL) {
-		prepare_write_banner(msgr, con);
 		prepare_write_connect(msgr, con, 1);
 		prepare_read_banner(con);
 		set_bit(CONNECTING, &con->state);
@@ -1829,11 +1855,9 @@ more:
 		con->in_tag = CEPH_MSGR_TAG_READY;
 		dout("try_write initiating connect on %p new state %lu\n",
 		     con, con->state);
-		con->sock = ceph_tcp_connect(con);
-		if (IS_ERR(con->sock)) {
-			con->sock = NULL;
+		ret = ceph_tcp_connect(con);
+		if (ret < 0) {
 			con->error_msg = "connect error";
-			ret = -1;
 			goto out;
 		}
 	}
@@ -1953,8 +1977,9 @@ more:
 		 *
 		 * FIXME: there must be a better way to do this!
 		 */
-		static char buf[1024];
-		int skip = min(1024, -con->in_base_pos);
+		static char buf[SKIP_BUF_SIZE];
+		int skip = min((int) sizeof (buf), -con->in_base_pos);
+
 		dout("skipping %d / %d bytes\n", skip, -con->in_base_pos);
 		ret = ceph_tcp_recvmsg(con->sock, buf, skip);
 		if (ret <= 0)
@@ -2216,15 +2241,6 @@ struct ceph_messenger *ceph_messenger_create(struct ceph_entity_addr *myaddr,
 
 	spin_lock_init(&msgr->global_seq_lock);
 
-	/* the zero page is needed if a request is "canceled" while the message
-	 * is being written over the socket */
-	msgr->zero_page = __page_cache_alloc(GFP_KERNEL | __GFP_ZERO);
-	if (!msgr->zero_page) {
-		kfree(msgr);
-		return ERR_PTR(-ENOMEM);
-	}
-	kmap(msgr->zero_page);
-
 	if (myaddr)
 		msgr->inst.addr = *myaddr;
 
@@ -2241,8 +2257,6 @@ EXPORT_SYMBOL(ceph_messenger_create);
 void ceph_messenger_destroy(struct ceph_messenger *msgr)
 {
 	dout("destroy %p\n", msgr);
-	kunmap(msgr->zero_page);
-	__free_page(msgr->zero_page);
 	kfree(msgr);
 	dout("destroyed messenger %p\n", msgr);
 }

+ 2 - 1
net/ceph/osdmap.c

@@ -283,7 +283,8 @@ static struct crush_map *crush_decode(void *pbyval, void *end)
 		ceph_decode_32_safe(p, end, yes, bad);
 #if BITS_PER_LONG == 32
 		err = -EINVAL;
-		if (yes > ULONG_MAX / sizeof(struct crush_rule_step))
+		if (yes > (ULONG_MAX - sizeof(*r))
+			  / sizeof(struct crush_rule_step))
 			goto bad;
 #endif
 		r = c->rules[i] = kmalloc(sizeof(*r) +

Algúns arquivos non se mostraron porque demasiados arquivos cambiaron neste cambio