@@ -0,0 +1,2927 @@
+#include "ceph_debug.h"
+
+#include <linux/fs.h>
+#include <linux/kernel.h>
+#include <linux/sched.h>
+#include <linux/vmalloc.h>
+#include <linux/wait.h>
+#include <linux/writeback.h>
+
+#include "super.h"
+#include "decode.h"
+#include "messenger.h"
+
+/*
+ * Capability management
+ *
+ * The Ceph metadata servers control client access to inode metadata
+ * and file data by issuing capabilities, granting clients permission
+ * to read and/or write both inode fields and file data to OSDs
+ * (storage nodes). Each capability consists of a set of bits
+ * indicating which operations are allowed.
+ *
+ * If the client holds a *_SHARED cap, the client has a coherent value
+ * that can be safely read from the cached inode.
+ *
+ * In the case of *_EXCL (exclusive) or FILE_WR capabilities, the
+ * client is allowed to change inode attributes (e.g., file size,
+ * mtime), note its dirty state in the ceph_cap, and asynchronously
+ * flush that metadata change to the MDS.
+ *
+ * In the event of a conflicting operation (perhaps by another
+ * client), the MDS will revoke the conflicting client capabilities.
+ *
+ * In order for a client to cache an inode, it must hold a capability
+ * with at least one MDS server. When inodes are released, release
+ * notifications are batched and periodically sent en masse to the MDS
+ * cluster to release server state.
+ */
+
+
+/*
+ * Generate readable cap strings for debugging output.
+ */
+#define MAX_CAP_STR 20
+static char cap_str[MAX_CAP_STR][40];
+static DEFINE_SPINLOCK(cap_str_lock);
+static int last_cap_str;
+
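+/* append one letter per generic cap bit set in @c, in sxcrwbl order */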
+static char *gcap_string(char *s, int c)
+{
+ if (c & CEPH_CAP_GSHARED)
+ *s++ = 's';
+ if (c & CEPH_CAP_GEXCL)
+ *s++ = 'x';
+ if (c & CEPH_CAP_GCACHE)
+ *s++ = 'c';
+ if (c & CEPH_CAP_GRD)
+ *s++ = 'r';
+ if (c & CEPH_CAP_GWR)
+ *s++ = 'w';
+ if (c & CEPH_CAP_GBUFFER)
+ *s++ = 'b';
+ if (c & CEPH_CAP_GLAZYIO)
+ *s++ = 'l';
+ return s;
+}
+
+const char *ceph_cap_string(int caps)
+{
+ int i;
+ char *s;
+ int c;
+
+ spin_lock(&cap_str_lock);
+ i = last_cap_str++;
+ if (last_cap_str == MAX_CAP_STR)
+ last_cap_str = 0;
+ spin_unlock(&cap_str_lock);
+
+ s = cap_str[i];
+
+ if (caps & CEPH_CAP_PIN)
+ *s++ = 'p';
+
+ c = (caps >> CEPH_CAP_SAUTH) & 3;
+ if (c) {
+ *s++ = 'A';
+ s = gcap_string(s, c);
+ }
+
+ c = (caps >> CEPH_CAP_SLINK) & 3;
+ if (c) {
+ *s++ = 'L';
+ s = gcap_string(s, c);
+ }
+
+ c = (caps >> CEPH_CAP_SXATTR) & 3;
+ if (c) {
+ *s++ = 'X';
+ s = gcap_string(s, c);
+ }
+
+ c = caps >> CEPH_CAP_SFILE;
+ if (c) {
+ *s++ = 'F';
+ s = gcap_string(s, c);
+ }
+
+ if (s == cap_str[i])
+ *s++ = '-';
+ *s = 0;
+ return cap_str[i];
+}
+
+/*
+ * Cap reservations
+ *
+ * Maintain a global pool of preallocated struct ceph_caps, referenced
+ * by ceph_cap_reservation contexts. This ensures that we preallocate
+ * memory needed to successfully process an MDS response. (If an MDS
+ * sends us cap information and we fail to process it, we will have
+ * problems due to the client and MDS being out of sync.)
+ *
+ * Reservations are 'owned' by a ceph_cap_reservation context.
+ */
+static spinlock_t caps_list_lock;
+static struct list_head caps_list; /* unused (reserved or unreserved) */
+static int caps_total_count; /* total caps allocated */
+static int caps_use_count; /* in use */
+static int caps_reserve_count; /* unused, reserved */
+static int caps_avail_count; /* unused, unreserved */
+static int caps_min_count; /* keep at least this many (unreserved) */
+
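+/* initialize the global cap pool */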
+void __init ceph_caps_init(void)
+{
+ INIT_LIST_HEAD(&caps_list);
+ spin_lock_init(&caps_list_lock);
+}
+
+void ceph_caps_finalize(void)
+{
+ struct ceph_cap *cap;
+
+ spin_lock(&caps_list_lock);
+ while (!list_empty(&caps_list)) {
+ cap = list_first_entry(&caps_list, struct ceph_cap, caps_item);
+ list_del(&cap->caps_item);
+ kmem_cache_free(ceph_cap_cachep, cap);
+ }
+ caps_total_count = 0;
+ caps_avail_count = 0;
+ caps_use_count = 0;
+ caps_reserve_count = 0;
+ caps_min_count = 0;
+ spin_unlock(&caps_list_lock);
+}
+
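+/*
+ * Adjust caps_min_count, the minimum number of unreserved caps we
+ * keep preallocated to smooth out alloc/free churn.
+ */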
+void ceph_adjust_min_caps(int delta)
+{
+ spin_lock(&caps_list_lock);
+ caps_min_count += delta;
+ BUG_ON(caps_min_count < 0);
+ spin_unlock(&caps_list_lock);
+}
+
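+/*
+ * Reserve @need caps for @ctx, drawing on the unreserved pool first
+ * and allocating whatever is still missing.
+ */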
+int ceph_reserve_caps(struct ceph_cap_reservation *ctx, int need)
+{
+ int i;
+ struct ceph_cap *cap;
+ int have;
+ int alloc = 0;
+ LIST_HEAD(newcaps);
+ int ret = 0;
+
+ dout("reserve caps ctx=%p need=%d\n", ctx, need);
+
+ /* first reserve any caps that are already allocated */
+ spin_lock(&caps_list_lock);
+ if (caps_avail_count >= need)
+ have = need;
+ else
+ have = caps_avail_count;
+ caps_avail_count -= have;
+ caps_reserve_count += have;
+ BUG_ON(caps_total_count != caps_use_count + caps_reserve_count +
+ caps_avail_count);
+ spin_unlock(&caps_list_lock);
+
+ for (i = have; i < need; i++) {
+ cap = kmem_cache_alloc(ceph_cap_cachep, GFP_NOFS);
+ if (!cap) {
+ ret = -ENOMEM;
+ goto out_alloc_count;
+ }
+ list_add(&cap->caps_item, &newcaps);
+ alloc++;
+ }
+ BUG_ON(have + alloc != need);
+
+ spin_lock(&caps_list_lock);
+ caps_total_count += alloc;
+ caps_reserve_count += alloc;
+ list_splice(&newcaps, &caps_list);
+
+ BUG_ON(caps_total_count != caps_use_count + caps_reserve_count +
+ caps_avail_count);
+ spin_unlock(&caps_list_lock);
+
+ ctx->count = need;
+ dout("reserve caps ctx=%p %d = %d used + %d resv + %d avail\n",
+ ctx, caps_total_count, caps_use_count, caps_reserve_count,
+ caps_avail_count);
+ return 0;
+
+out_alloc_count:
+ /* we didn't manage to reserve as much as we needed */
+ pr_warning("reserve caps ctx=%p ENOMEM need=%d got=%d\n",
+ ctx, need, have);
+ return ret;
+}
+
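+/* return any still-unused caps in @ctx to the unreserved pool */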
+int ceph_unreserve_caps(struct ceph_cap_reservation *ctx)
+{
+ dout("unreserve caps ctx=%p count=%d\n", ctx, ctx->count);
+ if (ctx->count) {
+ spin_lock(&caps_list_lock);
+ BUG_ON(caps_reserve_count < ctx->count);
+ caps_reserve_count -= ctx->count;
+ caps_avail_count += ctx->count;
+ ctx->count = 0;
+ dout("unreserve caps %d = %d used + %d resv + %d avail\n",
+ caps_total_count, caps_use_count, caps_reserve_count,
+ caps_avail_count);
+ BUG_ON(caps_total_count != caps_use_count + caps_reserve_count +
+ caps_avail_count);
+ spin_unlock(&caps_list_lock);
+ }
+ return 0;
+}
+
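+/*
+ * Consume one cap from the given reservation (or allocate one
+ * outright if @ctx is NULL).
+ */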
+static struct ceph_cap *get_cap(struct ceph_cap_reservation *ctx)
+{
+ struct ceph_cap *cap = NULL;
+
+ /* temporary, until we do something about cap import/export */
+ if (!ctx)
+ return kmem_cache_alloc(ceph_cap_cachep, GFP_NOFS);
+
+ spin_lock(&caps_list_lock);
+ dout("get_cap ctx=%p (%d) %d = %d used + %d resv + %d avail\n",
+ ctx, ctx->count, caps_total_count, caps_use_count,
+ caps_reserve_count, caps_avail_count);
+ BUG_ON(!ctx->count);
+ BUG_ON(ctx->count > caps_reserve_count);
+ BUG_ON(list_empty(&caps_list));
+
+ ctx->count--;
+ caps_reserve_count--;
+ caps_use_count++;
+
+ cap = list_first_entry(&caps_list, struct ceph_cap, caps_item);
+ list_del(&cap->caps_item);
+
+ BUG_ON(caps_total_count != caps_use_count + caps_reserve_count +
+ caps_avail_count);
+ spin_unlock(&caps_list_lock);
+ return cap;
+}
+
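+/*
+ * Return a cap to the pool, freeing it outright if we already have
+ * enough unreserved caps lying around.
+ */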
+void ceph_put_cap(struct ceph_cap *cap)
+{
+ spin_lock(&caps_list_lock);
+ dout("put_cap %p %d = %d used + %d resv + %d avail\n",
+ cap, caps_total_count, caps_use_count,
+ caps_reserve_count, caps_avail_count);
+ caps_use_count--;
+ /*
+ * Keep some preallocated caps around (caps_min_count), to
+ * avoid lots of free/alloc churn.
+ */
+ if (caps_avail_count >= caps_reserve_count + caps_min_count) {
+ caps_total_count--;
+ kmem_cache_free(ceph_cap_cachep, cap);
+ } else {
+ caps_avail_count++;
+ list_add(&cap->caps_item, &caps_list);
+ }
+
+ BUG_ON(caps_total_count != caps_use_count + caps_reserve_count +
+ caps_avail_count);
+ spin_unlock(&caps_list_lock);
+}
+
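+/*
+ * Report the current cap pool counters.  The counters are sampled
+ * without caps_list_lock, so the snapshot may be slightly stale.
+ */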
+void ceph_reservation_status(struct ceph_client *client,
+ int *total, int *avail, int *used, int *reserved,
+ int *min)
+{
+ if (total)
+ *total = caps_total_count;
+ if (avail)
+ *avail = caps_avail_count;
+ if (used)
+ *used = caps_use_count;
+ if (reserved)
+ *reserved = caps_reserve_count;
+ if (min)
+ *min = caps_min_count;
+}
+
+/*
+ * Find ceph_cap for given mds, if any.
+ *
+ * Called with i_lock held.
+ */
+static struct ceph_cap *__get_cap_for_mds(struct ceph_inode_info *ci, int mds)
+{
+ struct ceph_cap *cap;
+ struct rb_node *n = ci->i_caps.rb_node;
+
+ while (n) {
+ cap = rb_entry(n, struct ceph_cap, ci_node);
+ if (mds < cap->mds)
+ n = n->rb_left;
+ else if (mds > cap->mds)
+ n = n->rb_right;
+ else
+ return cap;
+ }
+ return NULL;
+}
+
+/*
+ * Return id of any MDS with a cap, preferably FILE_WR|WRBUFFER|EXCL, else
+ * -1.
+ */
+static int __ceph_get_cap_mds(struct ceph_inode_info *ci, u32 *mseq)
+{
+ struct ceph_cap *cap;
+ int mds = -1;
+ struct rb_node *p;
+
+ /* prefer mds with WR|WRBUFFER|EXCL caps */
+ for (p = rb_first(&ci->i_caps); p; p = rb_next(p)) {
+ cap = rb_entry(p, struct ceph_cap, ci_node);
+ mds = cap->mds;
+ if (mseq)
+ *mseq = cap->mseq;
+ if (cap->issued & (CEPH_CAP_FILE_WR |
+ CEPH_CAP_FILE_BUFFER |
+ CEPH_CAP_FILE_EXCL))
+ break;
+ }
+ return mds;
+}
+
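+/* like __ceph_get_cap_mds, but takes and drops i_lock itself */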
+int ceph_get_cap_mds(struct inode *inode)
+{
+ int mds;
+ spin_lock(&inode->i_lock);
+ mds = __ceph_get_cap_mds(ceph_inode(inode), NULL);
+ spin_unlock(&inode->i_lock);
+ return mds;
+}
+
+/*
+ * Called under i_lock.
+ */
+static void __insert_cap_node(struct ceph_inode_info *ci,
+ struct ceph_cap *new)
+{
+ struct rb_node **p = &ci->i_caps.rb_node;
+ struct rb_node *parent = NULL;
+ struct ceph_cap *cap = NULL;
+
+ while (*p) {
+ parent = *p;
+ cap = rb_entry(parent, struct ceph_cap, ci_node);
+ if (new->mds < cap->mds)
+ p = &(*p)->rb_left;
+ else if (new->mds > cap->mds)
+ p = &(*p)->rb_right;
+ else
+ BUG();
+ }
+
+ rb_link_node(&new->ci_node, parent, p);
+ rb_insert_color(&new->ci_node, &ci->i_caps);
+}
+
+/*
+ * (re)set cap hold timeouts, which control the delayed release
+ * of unused caps back to the MDS. Should be called on cap use.
+ */
+static void __cap_set_timeouts(struct ceph_mds_client *mdsc,
+ struct ceph_inode_info *ci)
+{
+ struct ceph_mount_args *ma = mdsc->client->mount_args;
+
+ ci->i_hold_caps_min = round_jiffies(jiffies +
+ ma->caps_wanted_delay_min * HZ);
+ ci->i_hold_caps_max = round_jiffies(jiffies +
+ ma->caps_wanted_delay_max * HZ);
+ dout("__cap_set_timeouts %p min %lu max %lu\n", &ci->vfs_inode,
+ ci->i_hold_caps_min - jiffies, ci->i_hold_caps_max - jiffies);
+}
+
+/*
+ * (Re)queue cap at the end of the delayed cap release list.
+ *
+ * If I_FLUSH is set, leave the inode at the front of the list.
+ *
+ * Caller holds i_lock
+ * -> we take mdsc->cap_delay_lock
+ */
+static void __cap_delay_requeue(struct ceph_mds_client *mdsc,
+ struct ceph_inode_info *ci)
+{
+ __cap_set_timeouts(mdsc, ci);
+ dout("__cap_delay_requeue %p flags %d at %lu\n", &ci->vfs_inode,
+ ci->i_ceph_flags, ci->i_hold_caps_max);
+ if (!mdsc->stopping) {
+ spin_lock(&mdsc->cap_delay_lock);
+ if (!list_empty(&ci->i_cap_delay_list)) {
+ if (ci->i_ceph_flags & CEPH_I_FLUSH)
+ goto no_change;
+ list_del_init(&ci->i_cap_delay_list);
+ }
+ list_add_tail(&ci->i_cap_delay_list, &mdsc->cap_delay_list);
+no_change:
+ spin_unlock(&mdsc->cap_delay_lock);
+ }
+}
+
+/*
+ * Queue an inode for immediate writeback.  Mark inode with I_FLUSH,
+ * indicating we should send a cap message to flush dirty metadata
+ * asap, and move to the front of the delayed cap list.
+ */
+static void __cap_delay_requeue_front(struct ceph_mds_client *mdsc,
+ struct ceph_inode_info *ci)
+{
+ dout("__cap_delay_requeue_front %p\n", &ci->vfs_inode);
+ spin_lock(&mdsc->cap_delay_lock);
+ ci->i_ceph_flags |= CEPH_I_FLUSH;
+ if (!list_empty(&ci->i_cap_delay_list))
+ list_del_init(&ci->i_cap_delay_list);
+ list_add(&ci->i_cap_delay_list, &mdsc->cap_delay_list);
+ spin_unlock(&mdsc->cap_delay_lock);
+}
+
+/*
+ * Cancel delayed work on cap.
+ *
+ * Caller must hold i_lock.
+ */
+static void __cap_delay_cancel(struct ceph_mds_client *mdsc,
+ struct ceph_inode_info *ci)
+{
+ dout("__cap_delay_cancel %p\n", &ci->vfs_inode);
+ if (list_empty(&ci->i_cap_delay_list))
+ return;
+ spin_lock(&mdsc->cap_delay_lock);
+ list_del_init(&ci->i_cap_delay_list);
+ spin_unlock(&mdsc->cap_delay_lock);
+}
+
+/*
+ * Common issue checks for add_cap, handle_cap_grant.
+ */
+static void __check_cap_issue(struct ceph_inode_info *ci, struct ceph_cap *cap,
+ unsigned issued)
+{
+ unsigned had = __ceph_caps_issued(ci, NULL);
+
+ /*
+ * Each time we receive FILE_CACHE anew, we increment
+ * i_rdcache_gen.
+ */
+ if ((issued & CEPH_CAP_FILE_CACHE) &&
+ (had & CEPH_CAP_FILE_CACHE) == 0)
+ ci->i_rdcache_gen++;
+
+ /*
+ * if we are newly issued FILE_SHARED, clear I_COMPLETE; we
+ * don't know what happened to this directory while we didn't
+ * have the cap.
+ */
+ if ((issued & CEPH_CAP_FILE_SHARED) &&
+ (had & CEPH_CAP_FILE_SHARED) == 0) {
+ ci->i_shared_gen++;
+ if (S_ISDIR(ci->vfs_inode.i_mode)) {
+ dout(" marking %p NOT complete\n", &ci->vfs_inode);
+ ci->i_ceph_flags &= ~CEPH_I_COMPLETE;
+ }
+ }
+}
+
+/*
+ * Add a capability under the given MDS session.
+ *
+ * Caller should hold session snap_rwsem (read) and s_mutex.
+ *
+ * @fmode is the open file mode, if we are opening a file, otherwise
+ * it is < 0.  (This is so we can atomically add the cap and add an
+ * open file reference to it.)
+ */
+int ceph_add_cap(struct inode *inode,
+ struct ceph_mds_session *session, u64 cap_id,
+ int fmode, unsigned issued, unsigned wanted,
+ unsigned seq, unsigned mseq, u64 realmino, int flags,
+ struct ceph_cap_reservation *caps_reservation)
+{
+ struct ceph_mds_client *mdsc = &ceph_inode_to_client(inode)->mdsc;
+ struct ceph_inode_info *ci = ceph_inode(inode);
+ struct ceph_cap *new_cap = NULL;
+ struct ceph_cap *cap;
+ int mds = session->s_mds;
+ int actual_wanted;
+
+ dout("add_cap %p mds%d cap %llx %s seq %d\n", inode,
+ session->s_mds, cap_id, ceph_cap_string(issued), seq);
+
+ /*
+ * If we are opening the file, include file mode wanted bits
+ * in wanted.
+ */
+ if (fmode >= 0)
+ wanted |= ceph_caps_for_mode(fmode);
+
+retry:
+ spin_lock(&inode->i_lock);
+ cap = __get_cap_for_mds(ci, mds);
+ if (!cap) {
+ if (new_cap) {
+ cap = new_cap;
+ new_cap = NULL;
+ } else {
+ spin_unlock(&inode->i_lock);
+ new_cap = get_cap(caps_reservation);
+ if (new_cap == NULL)
+ return -ENOMEM;
+ goto retry;
+ }
+
+ cap->issued = 0;
+ cap->implemented = 0;
+ cap->mds = mds;
+ cap->mds_wanted = 0;
+
+ cap->ci = ci;
+ __insert_cap_node(ci, cap);
+
+ /* clear out old exporting info? (i.e. on cap import) */
+ if (ci->i_cap_exporting_mds == mds) {
+ ci->i_cap_exporting_issued = 0;
+ ci->i_cap_exporting_mseq = 0;
+ ci->i_cap_exporting_mds = -1;
+ }
+
+ /* add to session cap list */
+ cap->session = session;
+ spin_lock(&session->s_cap_lock);
+ list_add_tail(&cap->session_caps, &session->s_caps);
+ session->s_nr_caps++;
+ spin_unlock(&session->s_cap_lock);
+ }
+
+ if (!ci->i_snap_realm) {
+ /*
+ * add this inode to the appropriate snap realm
+ */
+ struct ceph_snap_realm *realm = ceph_lookup_snap_realm(mdsc,
+ realmino);
+ if (realm) {
+ ceph_get_snap_realm(mdsc, realm);
+ spin_lock(&realm->inodes_with_caps_lock);
+ ci->i_snap_realm = realm;
+ list_add(&ci->i_snap_realm_item,
+ &realm->inodes_with_caps);
+ spin_unlock(&realm->inodes_with_caps_lock);
+ } else {
+ pr_err("ceph_add_cap: couldn't find snap realm %llx\n",
+ realmino);
+ }
+ }
+
+ __check_cap_issue(ci, cap, issued);
+
+ /*
+ * If we are issued caps we don't want, or the mds' wanted
+ * value appears to be off, queue a check so we'll release
+ * later and/or update the mds wanted value.
+ */
+ actual_wanted = __ceph_caps_wanted(ci);
+ if ((wanted & ~actual_wanted) ||
+ (issued & ~actual_wanted & CEPH_CAP_ANY_WR)) {
+ dout(" issued %s, mds wanted %s, actual %s, queueing\n",
+ ceph_cap_string(issued), ceph_cap_string(wanted),
+ ceph_cap_string(actual_wanted));
+ __cap_delay_requeue(mdsc, ci);
+ }
+
+ if (flags & CEPH_CAP_FLAG_AUTH)
+ ci->i_auth_cap = cap;
+ else if (ci->i_auth_cap == cap)
+ ci->i_auth_cap = NULL;
+
+ dout("add_cap inode %p (%llx.%llx) cap %p %s now %s seq %d mds%d\n",
+ inode, ceph_vinop(inode), cap, ceph_cap_string(issued),
+ ceph_cap_string(issued|cap->issued), seq, mds);
+ cap->cap_id = cap_id;
+ cap->issued = issued;
+ cap->implemented |= issued;
+ cap->mds_wanted |= wanted;
+ cap->seq = seq;
+ cap->issue_seq = seq;
+ cap->mseq = mseq;
+ cap->cap_gen = session->s_cap_gen;
+
+ if (fmode >= 0)
+ __ceph_get_fmode(ci, fmode);
+ spin_unlock(&inode->i_lock);
+ wake_up(&ci->i_cap_wq);
+ return 0;
+}
+
+/*
+ * Return true if cap has not timed out and belongs to the current
+ * generation of the MDS session (i.e. has not gone 'stale' due to
+ * us losing touch with the mds).
+ */
+static int __cap_is_valid(struct ceph_cap *cap)
+{
+ unsigned long ttl;
+ u32 gen;
+
+ spin_lock(&cap->session->s_cap_lock);
+ gen = cap->session->s_cap_gen;
+ ttl = cap->session->s_cap_ttl;
+ spin_unlock(&cap->session->s_cap_lock);
+
+ if (cap->cap_gen < gen || time_after_eq(jiffies, ttl)) {
+ dout("__cap_is_valid %p cap %p issued %s "
+ "but STALE (gen %u vs %u)\n", &cap->ci->vfs_inode,
+ cap, ceph_cap_string(cap->issued), cap->cap_gen, gen);
+ return 0;
+ }
+
+ return 1;
+}
+
+/*
+ * Return set of valid cap bits issued to us.  Note that caps time
+ * out, and may be invalidated in bulk if the client session times out
+ * and session->s_cap_gen is bumped.
+ */
+int __ceph_caps_issued(struct ceph_inode_info *ci, int *implemented)
+{
+ int have = ci->i_snap_caps | ci->i_cap_exporting_issued;
+ struct ceph_cap *cap;
+ struct rb_node *p;
+
+ if (implemented)
+ *implemented = 0;
+ for (p = rb_first(&ci->i_caps); p; p = rb_next(p)) {
+ cap = rb_entry(p, struct ceph_cap, ci_node);
+ if (!__cap_is_valid(cap))
+ continue;
+ dout("__ceph_caps_issued %p cap %p issued %s\n",
+ &ci->vfs_inode, cap, ceph_cap_string(cap->issued));
+ have |= cap->issued;
+ if (implemented)
+ *implemented |= cap->implemented;
+ }
+ return have;
+}
+
+/*
+ * Get cap bits issued by caps other than @ocap
+ */
+int __ceph_caps_issued_other(struct ceph_inode_info *ci, struct ceph_cap *ocap)
+{
+ int have = ci->i_snap_caps;
+ struct ceph_cap *cap;
+ struct rb_node *p;
+
+ for (p = rb_first(&ci->i_caps); p; p = rb_next(p)) {
+ cap = rb_entry(p, struct ceph_cap, ci_node);
+ if (cap == ocap)
+ continue;
+ if (!__cap_is_valid(cap))
+ continue;
+ have |= cap->issued;
+ }
+ return have;
+}
+
+/*
+ * Move a cap to the end of the LRU (oldest caps at list head, newest
+ * at list tail).
+ */
+static void __touch_cap(struct ceph_cap *cap)
+{
+ struct ceph_mds_session *s = cap->session;
+
+ spin_lock(&s->s_cap_lock);
+ if (s->s_cap_iterator == NULL) {
+ dout("__touch_cap %p cap %p mds%d\n", &cap->ci->vfs_inode, cap,
+ s->s_mds);
+ list_move_tail(&cap->session_caps, &s->s_caps);
+ } else {
+ dout("__touch_cap %p cap %p mds%d NOP, iterating over caps\n",
+ &cap->ci->vfs_inode, cap, s->s_mds);
+ }
+ spin_unlock(&s->s_cap_lock);
+}
+
+/*
+ * Check if we hold the given mask.  If so, move the cap(s) to the
+ * end of their respective LRUs.  (This is the preferred way for
+ * callers to check for caps they want.)
+ */
+int __ceph_caps_issued_mask(struct ceph_inode_info *ci, int mask, int touch)
+{
+ struct ceph_cap *cap;
+ struct rb_node *p;
+ int have = ci->i_snap_caps;
+
+ if ((have & mask) == mask) {
+ dout("__ceph_caps_issued_mask %p snap issued %s"
+ " (mask %s)\n", &ci->vfs_inode,
+ ceph_cap_string(have),
+ ceph_cap_string(mask));
+ return 1;
+ }
+
+ for (p = rb_first(&ci->i_caps); p; p = rb_next(p)) {
+ cap = rb_entry(p, struct ceph_cap, ci_node);
+ if (!__cap_is_valid(cap))
+ continue;
+ if ((cap->issued & mask) == mask) {
+ dout("__ceph_caps_issued_mask %p cap %p issued %s"
+ " (mask %s)\n", &ci->vfs_inode, cap,
+ ceph_cap_string(cap->issued),
+ ceph_cap_string(mask));
+ if (touch)
+ __touch_cap(cap);
+ return 1;
+ }
+
+ /* does a combination of caps satisfy mask? */
+ have |= cap->issued;
+ if ((have & mask) == mask) {
+ dout("__ceph_caps_issued_mask %p combo issued %s"
+ " (mask %s)\n", &ci->vfs_inode,
+ ceph_cap_string(cap->issued),
+ ceph_cap_string(mask));
+ if (touch) {
+ struct rb_node *q;
+
+ /* touch this + preceding caps */
+ __touch_cap(cap);
+ for (q = rb_first(&ci->i_caps); q != p;
+ q = rb_next(q)) {
+ cap = rb_entry(q, struct ceph_cap,
+ ci_node);
+ if (!__cap_is_valid(cap))
+ continue;
+ __touch_cap(cap);
+ }
+ }
+ return 1;
+ }
+ }
+
+ return 0;
+}
+
+/*
+ * Return true if mask caps are currently being revoked by an MDS.
+ */
+int ceph_caps_revoking(struct ceph_inode_info *ci, int mask)
+{
+ struct inode *inode = &ci->vfs_inode;
+ struct ceph_cap *cap;
+ struct rb_node *p;
+ int ret = 0;
+
+ spin_lock(&inode->i_lock);
+ for (p = rb_first(&ci->i_caps); p; p = rb_next(p)) {
+ cap = rb_entry(p, struct ceph_cap, ci_node);
+ if (__cap_is_valid(cap) &&
+ (cap->implemented & ~cap->issued & mask)) {
+ ret = 1;
+ break;
+ }
+ }
+ spin_unlock(&inode->i_lock);
+ dout("ceph_caps_revoking %p %s = %d\n", inode,
+ ceph_cap_string(mask), ret);
+ return ret;
+}
+
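+/*
+ * Return caps currently in active use, as implied by our reference
+ * counts (pin, read, page cache, write, and buffered-write refs).
+ */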
+int __ceph_caps_used(struct ceph_inode_info *ci)
+{
+ int used = 0;
+ if (ci->i_pin_ref)
+ used |= CEPH_CAP_PIN;
+ if (ci->i_rd_ref)
+ used |= CEPH_CAP_FILE_RD;
+ if (ci->i_rdcache_ref || ci->i_rdcache_gen)
+ used |= CEPH_CAP_FILE_CACHE;
+ if (ci->i_wr_ref)
+ used |= CEPH_CAP_FILE_WR;
+ if (ci->i_wrbuffer_ref)
+ used |= CEPH_CAP_FILE_BUFFER;
+ return used;
+}
+
+/*
+ * wanted, by virtue of open file modes
+ */
+int __ceph_caps_file_wanted(struct ceph_inode_info *ci)
+{
+ int want = 0;
+ int mode;
+ for (mode = 0; mode < 4; mode++)
+ if (ci->i_nr_by_mode[mode])
+ want |= ceph_caps_for_mode(mode);
+ return want;
+}
+
+/*
+ * Return caps we have registered with the MDS(s) as 'wanted'.
+ */
+int __ceph_caps_mds_wanted(struct ceph_inode_info *ci)
+{
+ struct ceph_cap *cap;
+ struct rb_node *p;
+ int mds_wanted = 0;
+
+ for (p = rb_first(&ci->i_caps); p; p = rb_next(p)) {
+ cap = rb_entry(p, struct ceph_cap, ci_node);
+ if (!__cap_is_valid(cap))
+ continue;
+ mds_wanted |= cap->mds_wanted;
+ }
+ return mds_wanted;
+}
+
+/*
+ * called under i_lock
+ */
+static int __ceph_is_any_caps(struct ceph_inode_info *ci)
+{
+ return !RB_EMPTY_ROOT(&ci->i_caps) || ci->i_cap_exporting_mds >= 0;
+}
+
+/*
+ * caller should hold i_lock.
+ * caller will not hold session s_mutex if called from destroy_inode.
+ */
+void __ceph_remove_cap(struct ceph_cap *cap)
+{
+ struct ceph_mds_session *session = cap->session;
+ struct ceph_inode_info *ci = cap->ci;
+ struct ceph_mds_client *mdsc = &ceph_client(ci->vfs_inode.i_sb)->mdsc;
+
+ dout("__ceph_remove_cap %p from %p\n", cap, &ci->vfs_inode);
+
+ /* remove from inode list */
+ rb_erase(&cap->ci_node, &ci->i_caps);
+ cap->ci = NULL;
+ if (ci->i_auth_cap == cap)
+ ci->i_auth_cap = NULL;
+
+ /* remove from session list */
+ spin_lock(&session->s_cap_lock);
+ if (session->s_cap_iterator == cap) {
+ /* not yet, we are iterating over this very cap */
+ dout("__ceph_remove_cap delaying %p removal from session %p\n",
+ cap, cap->session);
+ } else {
+ list_del_init(&cap->session_caps);
+ session->s_nr_caps--;
+ cap->session = NULL;
+ }
+ spin_unlock(&session->s_cap_lock);
+
+ if (cap->session == NULL)
+ ceph_put_cap(cap);
+
+ if (!__ceph_is_any_caps(ci) && ci->i_snap_realm) {
+ struct ceph_snap_realm *realm = ci->i_snap_realm;
+ spin_lock(&realm->inodes_with_caps_lock);
+ list_del_init(&ci->i_snap_realm_item);
+ ci->i_snap_realm_counter++;
+ ci->i_snap_realm = NULL;
+ spin_unlock(&realm->inodes_with_caps_lock);
+ ceph_put_snap_realm(mdsc, realm);
+ }
+ if (!__ceph_is_any_real_caps(ci))
+ __cap_delay_cancel(mdsc, ci);
+}
+
+/*
+ * Build and send a cap message to the given MDS.
+ *
+ * Caller should be holding s_mutex.
+ */
+static int send_cap_msg(struct ceph_mds_session *session,
+ u64 ino, u64 cid, int op,
+ int caps, int wanted, int dirty,
+ u32 seq, u64 flush_tid, u32 issue_seq, u32 mseq,
+ u64 size, u64 max_size,
+ struct timespec *mtime, struct timespec *atime,
+ u64 time_warp_seq,
+ uid_t uid, gid_t gid, mode_t mode,
+ u64 xattr_version,
+ struct ceph_buffer *xattrs_buf,
+ u64 follows)
+{
+ struct ceph_mds_caps *fc;
+ struct ceph_msg *msg;
+
+ dout("send_cap_msg %s %llx %llx caps %s wanted %s dirty %s"
+ " seq %u/%u mseq %u follows %lld size %llu/%llu"
+ " xattr_ver %llu xattr_len %d\n", ceph_cap_op_name(op),
+ cid, ino, ceph_cap_string(caps), ceph_cap_string(wanted),
+ ceph_cap_string(dirty),
+ seq, issue_seq, mseq, follows, size, max_size,
+ xattr_version, xattrs_buf ? (int)xattrs_buf->vec.iov_len : 0);
+
+ msg = ceph_msg_new(CEPH_MSG_CLIENT_CAPS, sizeof(*fc), 0, 0, NULL);
+ if (IS_ERR(msg))
+ return PTR_ERR(msg);
+
+ msg->hdr.tid = cpu_to_le64(flush_tid);
+
+ fc = msg->front.iov_base;
+ memset(fc, 0, sizeof(*fc));
+
+ fc->cap_id = cpu_to_le64(cid);
+ fc->op = cpu_to_le32(op);
+ fc->seq = cpu_to_le32(seq);
+ fc->issue_seq = cpu_to_le32(issue_seq);
+ fc->migrate_seq = cpu_to_le32(mseq);
+ fc->caps = cpu_to_le32(caps);
+ fc->wanted = cpu_to_le32(wanted);
+ fc->dirty = cpu_to_le32(dirty);
+ fc->ino = cpu_to_le64(ino);
+ fc->snap_follows = cpu_to_le64(follows);
+
+ fc->size = cpu_to_le64(size);
+ fc->max_size = cpu_to_le64(max_size);
+ if (mtime)
+ ceph_encode_timespec(&fc->mtime, mtime);
+ if (atime)
+ ceph_encode_timespec(&fc->atime, atime);
+ fc->time_warp_seq = cpu_to_le32(time_warp_seq);
+
+ fc->uid = cpu_to_le32(uid);
+ fc->gid = cpu_to_le32(gid);
+ fc->mode = cpu_to_le32(mode);
+
+ fc->xattr_version = cpu_to_le64(xattr_version);
+ if (xattrs_buf) {
+ msg->middle = ceph_buffer_get(xattrs_buf);
+ fc->xattr_len = cpu_to_le32(xattrs_buf->vec.iov_len);
+ msg->hdr.middle_len = cpu_to_le32(xattrs_buf->vec.iov_len);
+ }
+
+ ceph_con_send(&session->s_con, msg);
+ return 0;
+}
+
+/*
+ * Queue cap releases when an inode is dropped from our cache. Since
+ * the inode is about to be destroyed, there is no need for i_lock.
+ */
+void ceph_queue_caps_release(struct inode *inode)
+{
+ struct ceph_inode_info *ci = ceph_inode(inode);
+ struct rb_node *p;
+
+ p = rb_first(&ci->i_caps);
+ while (p) {
+ struct ceph_cap *cap = rb_entry(p, struct ceph_cap, ci_node);
+ struct ceph_mds_session *session = cap->session;
+ struct ceph_msg *msg;
+ struct ceph_mds_cap_release *head;
+ struct ceph_mds_cap_item *item;
+
+ spin_lock(&session->s_cap_lock);
+ BUG_ON(!session->s_num_cap_releases);
+ msg = list_first_entry(&session->s_cap_releases,
+ struct ceph_msg, list_head);
+
+ dout(" adding %p release to mds%d msg %p (%d left)\n",
+ inode, session->s_mds, msg, session->s_num_cap_releases);
+
+ BUG_ON(msg->front.iov_len + sizeof(*item) > PAGE_CACHE_SIZE);
+ head = msg->front.iov_base;
+ head->num = cpu_to_le32(le32_to_cpu(head->num) + 1);
+ item = msg->front.iov_base + msg->front.iov_len;
+ item->ino = cpu_to_le64(ceph_ino(inode));
+ item->cap_id = cpu_to_le64(cap->cap_id);
+ item->migrate_seq = cpu_to_le32(cap->mseq);
+ item->seq = cpu_to_le32(cap->issue_seq);
+
+ session->s_num_cap_releases--;
+
+ msg->front.iov_len += sizeof(*item);
+ if (le32_to_cpu(head->num) == CEPH_CAPS_PER_RELEASE) {
+ dout(" release msg %p full\n", msg);
+ list_move_tail(&msg->list_head,
+ &session->s_cap_releases_done);
+ } else {
+ dout(" release msg %p at %d/%d (%d)\n", msg,
+ (int)le32_to_cpu(head->num),
+ (int)CEPH_CAPS_PER_RELEASE,
+ (int)msg->front.iov_len);
+ }
+ spin_unlock(&session->s_cap_lock);
+ p = rb_next(p);
+ __ceph_remove_cap(cap);
+ }
+}
+
+/*
+ * Send a cap msg on the given inode.  Update our caps state, then
+ * drop i_lock and send the message.
+ *
+ * Make note of max_size reported/requested from mds, revoked caps
+ * that have now been implemented.
+ *
+ * Make a half-hearted attempt to invalidate the page cache if we are
+ * dropping RDCACHE. Note that this will leave behind locked pages
+ * that we'll then need to deal with elsewhere.
+ *
+ * Return non-zero if delayed release, or we experienced an error
+ * such that the caller should requeue + retry later.
+ *
+ * called with i_lock, then drops it.
+ * caller should hold snap_rwsem (read), s_mutex.
+ */
+static int __send_cap(struct ceph_mds_client *mdsc, struct ceph_cap *cap,
+ int op, int used, int want, int retain, int flushing,
+ unsigned *pflush_tid)
+ __releases(cap->ci->vfs_inode->i_lock)
+{
+ struct ceph_inode_info *ci = cap->ci;
+ struct inode *inode = &ci->vfs_inode;
+ u64 cap_id = cap->cap_id;
+ int held, revoking, dropping, keep;
+ u64 seq, issue_seq, mseq, time_warp_seq, follows;
+ u64 size, max_size;
+ struct timespec mtime, atime;
+ int wake = 0;
+ mode_t mode;
+ uid_t uid;
+ gid_t gid;
+ struct ceph_mds_session *session;
+ u64 xattr_version = 0;
+ int delayed = 0;
+ u64 flush_tid = 0;
+ int i;
+ int ret;
+
+ held = cap->issued | cap->implemented;
+ revoking = cap->implemented & ~cap->issued;
+ retain &= ~revoking;
+ dropping = cap->issued & ~retain;
+
+ dout("__send_cap %p cap %p session %p %s -> %s (revoking %s)\n",
+ inode, cap, cap->session,
+ ceph_cap_string(held), ceph_cap_string(held & retain),
+ ceph_cap_string(revoking));
+ BUG_ON((retain & CEPH_CAP_PIN) == 0);
+
+ session = cap->session;
+
+ /* don't release wanted unless we've waited a bit. */
+ if ((ci->i_ceph_flags & CEPH_I_NODELAY) == 0 &&
+ time_before(jiffies, ci->i_hold_caps_min)) {
+ dout(" delaying issued %s -> %s, wanted %s -> %s on send\n",
+ ceph_cap_string(cap->issued),
+ ceph_cap_string(cap->issued & retain),
+ ceph_cap_string(cap->mds_wanted),
+ ceph_cap_string(want));
+ want |= cap->mds_wanted;
+ retain |= cap->issued;
+ delayed = 1;
+ }
+ ci->i_ceph_flags &= ~(CEPH_I_NODELAY | CEPH_I_FLUSH);
+
+ cap->issued &= retain; /* drop bits we don't want */
+ if (cap->implemented & ~cap->issued) {
+ /*
+ * Wake up any waiters on wanted -> needed transition.
+ * This is due to the weird transition from buffered
+ * to sync IO... we need to flush dirty pages _before_
+ * allowing sync writes to avoid reordering.
+ */
+ wake = 1;
+ }
+ cap->implemented &= cap->issued | used;
+ cap->mds_wanted = want;
+
+ if (flushing) {
+ /*
+ * assign a tid for flush operations so we can avoid
+ * flush1 -> dirty1 -> flush2 -> flushack1 -> mark
+ * clean type races.  track latest tid for every bit
+ * so we can handle flush AxFw, flush Fw, and have the
+ * first ack clean Ax.
+ */
+ flush_tid = ++ci->i_cap_flush_last_tid;
+ if (pflush_tid)
+ *pflush_tid = flush_tid;
+ dout(" cap_flush_tid %d\n", (int)flush_tid);
+ for (i = 0; i < CEPH_CAP_BITS; i++)
+ if (flushing & (1 << i))
+ ci->i_cap_flush_tid[i] = flush_tid;
+ }
+
+ keep = cap->implemented;
+ seq = cap->seq;
+ issue_seq = cap->issue_seq;
+ mseq = cap->mseq;
+ size = inode->i_size;
+ ci->i_reported_size = size;
+ max_size = ci->i_wanted_max_size;
+ ci->i_requested_max_size = max_size;
+ mtime = inode->i_mtime;
+ atime = inode->i_atime;
+ time_warp_seq = ci->i_time_warp_seq;
+ follows = ci->i_snap_realm->cached_context->seq;
+ uid = inode->i_uid;
+ gid = inode->i_gid;
+ mode = inode->i_mode;
+
+ if (dropping & CEPH_CAP_XATTR_EXCL) {
+ __ceph_build_xattrs_blob(ci);
+ xattr_version = ci->i_xattrs.version + 1;
+ }
+
+ spin_unlock(&inode->i_lock);
+
+ ret = send_cap_msg(session, ceph_vino(inode).ino, cap_id,
+ op, keep, want, flushing, seq, flush_tid, issue_seq, mseq,
+ size, max_size, &mtime, &atime, time_warp_seq,
+ uid, gid, mode,
+ xattr_version,
+ (flushing & CEPH_CAP_XATTR_EXCL) ? ci->i_xattrs.blob : NULL,
+ follows);
+ if (ret < 0) {
+ dout("error sending cap msg, must requeue %p\n", inode);
+ delayed = 1;
+ }
+
+ if (wake)
+ wake_up(&ci->i_cap_wq);
+
+ return delayed;
+}
+
+/*
+ * When a snapshot is taken, clients accumulate dirty metadata on
+ * inodes with capabilities in ceph_cap_snaps to describe the file
+ * state at the time the snapshot was taken.  This must be flushed
+ * asynchronously back to the MDS once sync writes complete and dirty
+ * data is written out.
+ *
+ * Called under i_lock.  Takes s_mutex as needed.
+ */
+void __ceph_flush_snaps(struct ceph_inode_info *ci,
+ struct ceph_mds_session **psession)
+{
+ struct inode *inode = &ci->vfs_inode;
+ int mds;
+ struct ceph_cap_snap *capsnap;
+ u32 mseq;
+ struct ceph_mds_client *mdsc = &ceph_inode_to_client(inode)->mdsc;
+ struct ceph_mds_session *session = NULL; /* if session != NULL, we hold
+ session->s_mutex */
+ u64 next_follows = 0; /* keep track of how far we've gotten through the
+ i_cap_snaps list, and skip these entries next time
+ around to avoid an infinite loop */
+
+ if (psession)
+ session = *psession;
+
+ dout("__flush_snaps %p\n", inode);
+retry:
+ list_for_each_entry(capsnap, &ci->i_cap_snaps, ci_item) {
+ /* avoid an infinite loop after retry */
+ if (capsnap->follows < next_follows)
+ continue;
+ /*
+ * we need to wait for sync writes to complete and for dirty
+ * pages to be written out.
+ */
+ if (capsnap->dirty_pages || capsnap->writing)
+ continue;
+
+ /* pick mds, take s_mutex */
+ mds = __ceph_get_cap_mds(ci, &mseq);
+ if (session && session->s_mds != mds) {
+ dout("oops, wrong session %p mutex\n", session);
+ mutex_unlock(&session->s_mutex);
+ ceph_put_mds_session(session);
+ session = NULL;
+ }
+ if (!session) {
+ spin_unlock(&inode->i_lock);
+ mutex_lock(&mdsc->mutex);
+ session = __ceph_lookup_mds_session(mdsc, mds);
+ mutex_unlock(&mdsc->mutex);
+ if (session) {
+ dout("inverting session/ino locks on %p\n",
+ session);
+ mutex_lock(&session->s_mutex);
+ }
+ /*
+ * if session == NULL, we raced against a cap
+ * deletion.  retry, and we'll get a better
+ * @mds value next time.
+ */
+ spin_lock(&inode->i_lock);
+ goto retry;
+ }
+
+ capsnap->flush_tid = ++ci->i_cap_flush_last_tid;
+ atomic_inc(&capsnap->nref);
+ if (!list_empty(&capsnap->flushing_item))
+ list_del_init(&capsnap->flushing_item);
+ list_add_tail(&capsnap->flushing_item,
+ &session->s_cap_snaps_flushing);
+ spin_unlock(&inode->i_lock);
+
+ dout("flush_snaps %p cap_snap %p follows %lld size %llu\n",
+ inode, capsnap, next_follows, capsnap->size);
+ send_cap_msg(session, ceph_vino(inode).ino, 0,
+ CEPH_CAP_OP_FLUSHSNAP, capsnap->issued, 0,
+ capsnap->dirty, 0, capsnap->flush_tid, 0, mseq,
+ capsnap->size, 0,
+ &capsnap->mtime, &capsnap->atime,
+ capsnap->time_warp_seq,
+ capsnap->uid, capsnap->gid, capsnap->mode,
+ 0, NULL,
+ capsnap->follows);
+
+ next_follows = capsnap->follows + 1;
+ ceph_put_cap_snap(capsnap);
+
+ spin_lock(&inode->i_lock);
+ goto retry;
+ }
+
+ /* we flushed them all; remove this inode from the queue */
+ spin_lock(&mdsc->snap_flush_lock);
+ list_del_init(&ci->i_snap_flush_item);
+ spin_unlock(&mdsc->snap_flush_lock);
+
+ if (psession)
+ *psession = session;
+ else if (session) {
+ mutex_unlock(&session->s_mutex);
+ ceph_put_mds_session(session);
+ }
+}
+
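+/* convenience wrapper: take i_lock around __ceph_flush_snaps */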
+static void ceph_flush_snaps(struct ceph_inode_info *ci)
+{
+ struct inode *inode = &ci->vfs_inode;
+
+ spin_lock(&inode->i_lock);
+ __ceph_flush_snaps(ci, NULL);
+ spin_unlock(&inode->i_lock);
+}
+
+/*
+ * Mark caps dirty.  If inode is newly dirty, add to the global dirty
+ * list.
+ */
+void __ceph_mark_dirty_caps(struct ceph_inode_info *ci, int mask)
+{
+ struct ceph_mds_client *mdsc = &ceph_client(ci->vfs_inode.i_sb)->mdsc;
+ struct inode *inode = &ci->vfs_inode;
+ int was = ci->i_dirty_caps;
+ int dirty = 0;
+
+ dout("__mark_dirty_caps %p %s dirty %s -> %s\n", &ci->vfs_inode,
+ ceph_cap_string(mask), ceph_cap_string(was),
+ ceph_cap_string(was | mask));
+ ci->i_dirty_caps |= mask;
+ if (was == 0) {
+ dout(" inode %p now dirty\n", &ci->vfs_inode);
+ BUG_ON(!list_empty(&ci->i_dirty_item));
+ spin_lock(&mdsc->cap_dirty_lock);
+ list_add(&ci->i_dirty_item, &mdsc->cap_dirty);
+ spin_unlock(&mdsc->cap_dirty_lock);
+ if (ci->i_flushing_caps == 0) {
+ igrab(inode);
+ dirty |= I_DIRTY_SYNC;
+ }
+ }
+ BUG_ON(list_empty(&ci->i_dirty_item));
+ if (((was | ci->i_flushing_caps) & CEPH_CAP_FILE_BUFFER) &&
+ (mask & CEPH_CAP_FILE_BUFFER))
+ dirty |= I_DIRTY_DATASYNC;
+ if (dirty)
+ __mark_inode_dirty(inode, dirty);
+ __cap_delay_requeue(mdsc, ci);
+}
+
+/*
+ * Add dirty inode to the flushing list. Assign a seq number so we
+ * can wait for caps to flush without starving.
+ *
+ * Called under i_lock.
+ */
+static int __mark_caps_flushing(struct inode *inode,
+ struct ceph_mds_session *session)
+{
+ struct ceph_mds_client *mdsc = &ceph_client(inode->i_sb)->mdsc;
+ struct ceph_inode_info *ci = ceph_inode(inode);
+ int flushing;
+
+ BUG_ON(ci->i_dirty_caps == 0);
+ BUG_ON(list_empty(&ci->i_dirty_item));
+
+ flushing = ci->i_dirty_caps;
+ dout("__mark_caps_flushing flushing %s, flushing_caps %s -> %s\n",
+ ceph_cap_string(flushing),
+ ceph_cap_string(ci->i_flushing_caps),
+ ceph_cap_string(ci->i_flushing_caps | flushing));
+ ci->i_flushing_caps |= flushing;
+ ci->i_dirty_caps = 0;
+ dout(" inode %p now !dirty\n", inode);
+
+ spin_lock(&mdsc->cap_dirty_lock);
+ list_del_init(&ci->i_dirty_item);
+
+ ci->i_cap_flush_seq = ++mdsc->cap_flush_seq;
+ if (list_empty(&ci->i_flushing_item)) {
+ list_add_tail(&ci->i_flushing_item, &session->s_cap_flushing);
+ mdsc->num_cap_flushing++;
+ dout(" inode %p now flushing seq %lld\n", inode,
+ ci->i_cap_flush_seq);
+ } else {
+ list_move_tail(&ci->i_flushing_item, &session->s_cap_flushing);
+ dout(" inode %p now flushing (more) seq %lld\n", inode,
+ ci->i_cap_flush_seq);
+ }
+ spin_unlock(&mdsc->cap_dirty_lock);
+
+ return flushing;
+}
+
+/*
+ * try to invalidate mapping pages without blocking.
+ */
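+/* cheap emptiness check: treat the mapping as empty if page 0 is absent */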
+static int mapping_is_empty(struct address_space *mapping)
+{
+ struct page *page = find_get_page(mapping, 0);
+
+ if (!page)
+ return 1;
+
+ put_page(page);
+ return 0;
+}
+
+static int try_nonblocking_invalidate(struct inode *inode)
+{
+ struct ceph_inode_info *ci = ceph_inode(inode);
+ u32 invalidating_gen = ci->i_rdcache_gen;
+
+ spin_unlock(&inode->i_lock);
+ invalidate_mapping_pages(&inode->i_data, 0, -1);
+ spin_lock(&inode->i_lock);
+
+ if (mapping_is_empty(&inode->i_data) &&
+ invalidating_gen == ci->i_rdcache_gen) {
+ /* success. */
+ dout("try_nonblocking_invalidate %p success\n", inode);
+ ci->i_rdcache_gen = 0;
+ ci->i_rdcache_revoking = 0;
+ return 0;
+ }
+ dout("try_nonblocking_invalidate %p failed\n", inode);
+ return -1;
+}
+
+/*
+ * Swiss army knife function to examine currently used and wanted
+ * versus held caps.  Release, flush, ack revoked caps to mds as
+ * appropriate.
+ *
+ * CHECK_CAPS_NODELAY - caller is delayed work and we should not delay
+ * cap release further.
+ * CHECK_CAPS_AUTHONLY - we should only check the auth cap
+ * CHECK_CAPS_FLUSH - we should flush any dirty caps immediately, without
+ * further delay.
+ */
+void ceph_check_caps(struct ceph_inode_info *ci, int flags,
+ struct ceph_mds_session *session)
+{
+ struct ceph_client *client = ceph_inode_to_client(&ci->vfs_inode);
+ struct ceph_mds_client *mdsc = &client->mdsc;
+ struct inode *inode = &ci->vfs_inode;
+ struct ceph_cap *cap;
+ int file_wanted, used;
+ int took_snap_rwsem = 0; /* true if mdsc->snap_rwsem held */
+ int drop_session_lock = session ? 0 : 1;
+ int issued, implemented, want, retain, revoking, flushing = 0;
+ int mds = -1; /* keep track of how far we've gone through i_caps list
+ to avoid an infinite loop on retry */
+ struct rb_node *p;
+ int tried_invalidate = 0;
+ int delayed = 0, sent = 0, force_requeue = 0, num;
+ int queue_invalidate = 0;
+ int is_delayed = flags & CHECK_CAPS_NODELAY;
+
+ /* if we are unmounting, flush any unused caps immediately. */
+ if (mdsc->stopping)
+ is_delayed = 1;
+
+ spin_lock(&inode->i_lock);
+
+ if (ci->i_ceph_flags & CEPH_I_FLUSH)
+ flags |= CHECK_CAPS_FLUSH;
+
+ /* flush snaps first time around only */
+ if (!list_empty(&ci->i_cap_snaps))
+ __ceph_flush_snaps(ci, &session);
+ goto retry_locked;
+retry:
+ spin_lock(&inode->i_lock);
+retry_locked:
+ file_wanted = __ceph_caps_file_wanted(ci);
+ used = __ceph_caps_used(ci);
+ want = file_wanted | used;
+ issued = __ceph_caps_issued(ci, &implemented);
+ revoking = implemented & ~issued;
+
+ retain = want | CEPH_CAP_PIN;
+ if (!mdsc->stopping && inode->i_nlink > 0) {
+ if (want) {
+ retain |= CEPH_CAP_ANY; /* be greedy */
+ } else {
+ retain |= CEPH_CAP_ANY_SHARED;
+ /*
+ * keep RD only if we didn't have the file open RW,
+ * because then the mds would revoke it anyway to
+ * journal max_size=0.
+ */
+ if (ci->i_max_size == 0)
+ retain |= CEPH_CAP_ANY_RD;
+ }
+ }
+
+ dout("check_caps %p file_want %s used %s dirty %s flushing %s"
+ " issued %s revoking %s retain %s %s%s%s\n", inode,
+ ceph_cap_string(file_wanted),
+ ceph_cap_string(used), ceph_cap_string(ci->i_dirty_caps),
+ ceph_cap_string(ci->i_flushing_caps),
+ ceph_cap_string(issued), ceph_cap_string(revoking),
+ ceph_cap_string(retain),
+ (flags & CHECK_CAPS_AUTHONLY) ? " AUTHONLY" : "",
+ (flags & CHECK_CAPS_NODELAY) ? " NODELAY" : "",
+ (flags & CHECK_CAPS_FLUSH) ? " FLUSH" : "");
+
+ /*
+ * If we no longer need to hold on to our old caps, and we may
+ * have cached pages, but don't want them, then try to invalidate.
+ * If we fail, it's because pages are locked... try again later.
|
|
|
+ */
|
|
|
+ if ((!is_delayed || mdsc->stopping) &&
|
|
|
+ ci->i_wrbuffer_ref == 0 && /* no dirty pages... */
|
|
|
+ ci->i_rdcache_gen && /* may have cached pages */
|
|
|
+ (file_wanted == 0 || /* no open files */
|
|
|
+ (revoking & CEPH_CAP_FILE_CACHE)) && /* or revoking cache */
|
|
|
+ !tried_invalidate) {
|
|
|
+ dout("check_caps trying to invalidate on %p\n", inode);
|
|
|
+ if (try_nonblocking_invalidate(inode) < 0) {
|
|
|
+ if (revoking & CEPH_CAP_FILE_CACHE) {
|
|
|
+ dout("check_caps queuing invalidate\n");
|
|
|
+ queue_invalidate = 1;
|
|
|
+ ci->i_rdcache_revoking = ci->i_rdcache_gen;
|
|
|
+ } else {
|
|
|
+ dout("check_caps failed to invalidate pages\n");
|
|
|
+ /* we failed to invalidate pages. check these
|
|
|
+ caps again later. */
|
|
|
+ force_requeue = 1;
|
|
|
+ __cap_set_timeouts(mdsc, ci);
|
|
|
+ }
|
|
|
+ }
|
|
|
+ tried_invalidate = 1;
|
|
|
+ goto retry_locked;
|
|
|
+ }
|
|
|
+
|
|
|
+ num = 0;
|
|
|
+ for (p = rb_first(&ci->i_caps); p; p = rb_next(p)) {
|
|
|
+ cap = rb_entry(p, struct ceph_cap, ci_node);
|
|
|
+ num++;
|
|
|
+
|
|
|
+ /* avoid looping forever */
|
|
|
+ if (mds >= cap->mds ||
|
|
|
+ ((flags & CHECK_CAPS_AUTHONLY) && cap != ci->i_auth_cap))
|
|
|
+ continue;
|
|
|
+
|
|
|
+ /* NOTE: no side-effects allowed, until we take s_mutex */
|
|
|
+
|
|
|
+ revoking = cap->implemented & ~cap->issued;
|
|
|
+ if (revoking)
|
|
|
+ dout(" mds%d revoking %s\n", cap->mds,
|
|
|
+ ceph_cap_string(revoking));
|
|
|
+
|
|
|
+ if (cap == ci->i_auth_cap &&
|
|
|
+ (cap->issued & CEPH_CAP_FILE_WR)) {
|
|
|
+ /* request larger max_size from MDS? */
|
|
|
+ if (ci->i_wanted_max_size > ci->i_max_size &&
|
|
|
+ ci->i_wanted_max_size > ci->i_requested_max_size) {
|
|
|
+ dout("requesting new max_size\n");
|
|
|
+ goto ack;
|
|
|
+ }
|
|
|
+
|
|
|
+ /* approaching file_max? */
|
|
|
+ if ((inode->i_size << 1) >= ci->i_max_size &&
|
|
|
+ (ci->i_reported_size << 1) < ci->i_max_size) {
|
|
|
+ dout("i_size approaching max_size\n");
|
|
|
+ goto ack;
|
|
|
+ }
|
|
|
+ }
|
|
|
+ /* flush anything dirty? */
|
|
|
+ if (cap == ci->i_auth_cap && (flags & CHECK_CAPS_FLUSH) &&
|
|
|
+ ci->i_dirty_caps) {
|
|
|
+ dout("flushing dirty caps\n");
|
|
|
+ goto ack;
|
|
|
+ }
|
|
|
+
|
|
|
+ /* completed revocation? going down and there are no caps? */
|
|
|
+ if (revoking && (revoking & used) == 0) {
|
|
|
+ dout("completed revocation of %s\n",
|
|
|
+ ceph_cap_string(cap->implemented & ~cap->issued));
|
|
|
+ goto ack;
|
|
|
+ }
|
|
|
+
|
|
|
+ /* want more caps from mds? */
|
|
|
+ if (want & ~(cap->mds_wanted | cap->issued))
|
|
|
+ goto ack;
|
|
|
+
|
|
|
+ /* things we might delay */
|
|
|
+ if ((cap->issued & ~retain) == 0 &&
|
|
|
+ cap->mds_wanted == want)
|
|
|
+ continue; /* nope, all good */
|
|
|
+
|
|
|
+ if (is_delayed)
|
|
|
+ goto ack;
|
|
|
+
|
|
|
+ /* delay? */
|
|
|
+ if ((ci->i_ceph_flags & CEPH_I_NODELAY) == 0 &&
|
|
|
+ time_before(jiffies, ci->i_hold_caps_max)) {
|
|
|
+ dout(" delaying issued %s -> %s, wanted %s -> %s\n",
|
|
|
+ ceph_cap_string(cap->issued),
|
|
|
+ ceph_cap_string(cap->issued & retain),
|
|
|
+ ceph_cap_string(cap->mds_wanted),
|
|
|
+ ceph_cap_string(want));
|
|
|
+ delayed++;
|
|
|
+ continue;
|
|
|
+ }
|
|
|
+
|
|
|
+ack:
|
|
|
+ if (ci->i_ceph_flags & CEPH_I_NOFLUSH) {
|
|
|
+ dout(" skipping %p I_NOFLUSH set\n", inode);
|
|
|
+ continue;
|
|
|
+ }
|
|
|
+
|
|
|
+ if (session && session != cap->session) {
|
|
|
+ dout("oops, wrong session %p mutex\n", session);
|
|
|
+ mutex_unlock(&session->s_mutex);
|
|
|
+ session = NULL;
|
|
|
+ }
|
|
|
+ if (!session) {
|
|
|
+ session = cap->session;
|
|
|
+ if (mutex_trylock(&session->s_mutex) == 0) {
|
|
|
+ dout("inverting session/ino locks on %p\n",
|
|
|
+ session);
|
|
|
+ spin_unlock(&inode->i_lock);
|
|
|
+ if (took_snap_rwsem) {
|
|
|
+ up_read(&mdsc->snap_rwsem);
|
|
|
+ took_snap_rwsem = 0;
|
|
|
+ }
|
|
|
+ mutex_lock(&session->s_mutex);
|
|
|
+ goto retry;
|
|
|
+ }
|
|
|
+ }
|
|
|
+ /* take snap_rwsem after session mutex */
|
|
|
+ if (!took_snap_rwsem) {
|
|
|
+ if (down_read_trylock(&mdsc->snap_rwsem) == 0) {
|
|
|
+ dout("inverting snap/in locks on %p\n",
|
|
|
+ inode);
|
|
|
+ spin_unlock(&inode->i_lock);
|
|
|
+ down_read(&mdsc->snap_rwsem);
|
|
|
+ took_snap_rwsem = 1;
|
|
|
+ goto retry;
|
|
|
+ }
|
|
|
+ took_snap_rwsem = 1;
|
|
|
+ }
|
|
|
+
|
|
|
+ if (cap == ci->i_auth_cap && ci->i_dirty_caps)
|
|
|
+ flushing = __mark_caps_flushing(inode, session);
|
|
|
+
|
|
|
+ mds = cap->mds; /* remember mds, so we don't repeat */
|
|
|
+ sent++;
|
|
|
+
|
|
|
+ /* __send_cap drops i_lock */
|
|
|
+ delayed += __send_cap(mdsc, cap, CEPH_CAP_OP_UPDATE, used, want,
|
|
|
+ retain, flushing, NULL);
|
|
|
+ goto retry; /* retake i_lock and restart our cap scan. */
|
|
|
+ }
|
|
|
+
|
|
|
+ /*
|
|
|
+ * Reschedule delayed caps release if we delayed anything,
|
|
|
+ * otherwise cancel.
|
|
|
+ */
|
|
|
+ if (delayed && is_delayed)
|
|
|
+ force_requeue = 1; /* __send_cap delayed release; requeue */
|
|
|
+ if (!delayed && !is_delayed)
|
|
|
+ __cap_delay_cancel(mdsc, ci);
|
|
|
+ else if (!is_delayed || force_requeue)
|
|
|
+ __cap_delay_requeue(mdsc, ci);
|
|
|
+
|
|
|
+ spin_unlock(&inode->i_lock);
|
|
|
+
|
|
|
+ if (queue_invalidate)
|
|
|
+ ceph_queue_invalidate(inode);
|
|
|
+
|
|
|
+ if (session && drop_session_lock)
|
|
|
+ mutex_unlock(&session->s_mutex);
|
|
|
+ if (took_snap_rwsem)
|
|
|
+ up_read(&mdsc->snap_rwsem);
|
|
|
+}
|
|
|
+
|
|
|
+/*
|
|
|
+ * Try to flush dirty caps back to the auth mds.
|
|
|
+ */
|
|
|
+static int try_flush_caps(struct inode *inode, struct ceph_mds_session *session,
|
|
|
+ unsigned *flush_tid)
|
|
|
+{
|
|
|
+ struct ceph_mds_client *mdsc = &ceph_client(inode->i_sb)->mdsc;
|
|
|
+ struct ceph_inode_info *ci = ceph_inode(inode);
|
|
|
+ int unlock_session = session ? 0 : 1;
|
|
|
+ int flushing = 0;
|
|
|
+
|
|
|
+retry:
|
|
|
+ spin_lock(&inode->i_lock);
|
|
|
+ if (ci->i_ceph_flags & CEPH_I_NOFLUSH) {
|
|
|
+ dout("try_flush_caps skipping %p I_NOFLUSH set\n", inode);
|
|
|
+ goto out;
|
|
|
+ }
|
|
|
+ if (ci->i_dirty_caps && ci->i_auth_cap) {
|
|
|
+ struct ceph_cap *cap = ci->i_auth_cap;
|
|
|
+ int used = __ceph_caps_used(ci);
|
|
|
+ int want = __ceph_caps_wanted(ci);
|
|
|
+ int delayed;
|
|
|
+
|
|
|
+ if (!session) {
|
|
|
+ spin_unlock(&inode->i_lock);
|
|
|
+ session = cap->session;
|
|
|
+ mutex_lock(&session->s_mutex);
|
|
|
+ goto retry;
|
|
|
+ }
+		BUG_ON(session != cap->session);
+		if (cap->session->s_state < CEPH_MDS_SESSION_OPEN)
+			goto out;
+
+		flushing = __mark_caps_flushing(inode, session);
+
+		/* __send_cap drops i_lock */
+		delayed = __send_cap(mdsc, cap, CEPH_CAP_OP_FLUSH, used, want,
+				     cap->issued | cap->implemented, flushing,
+				     flush_tid);
+		if (!delayed)
+			goto out_unlocked;
+
+		spin_lock(&inode->i_lock);
+		__cap_delay_requeue(mdsc, ci);
+	}
+out:
+	spin_unlock(&inode->i_lock);
+out_unlocked:
+	if (session && unlock_session)
+		mutex_unlock(&session->s_mutex);
+	return flushing;
+}
+
+/*
+ * Return true if we've flushed caps through the given flush_tid.
+ */
+static int caps_are_flushed(struct inode *inode, unsigned tid)
+{
+	struct ceph_inode_info *ci = ceph_inode(inode);
+	int dirty, i, ret = 1;
+
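+	/*
+	 * i_cap_flush_tid[] records, for each dirty cap bit, the tid
+	 * of the flush that carried it; everything through 'tid' has
+	 * been flushed once no still-flushing bit was sent at or
+	 * before it.
+	 */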
+	spin_lock(&inode->i_lock);
+	dirty = __ceph_caps_dirty(ci);
+	for (i = 0; i < CEPH_CAP_BITS; i++)
+		if ((ci->i_flushing_caps & (1 << i)) &&
+		    ci->i_cap_flush_tid[i] <= tid) {
+			/* still flushing this bit */
+			ret = 0;
+			break;
+		}
+	spin_unlock(&inode->i_lock);
+	return ret;
+}
+
+/*
+ * Wait on any unsafe replies for the given inode. First wait on the
+ * newest request, and make that the upper bound. Then, if there are
+ * more requests, keep waiting on the oldest as long as it is still older
+ * than the original request.
+ */
+static void sync_write_wait(struct inode *inode)
+{
+	struct ceph_inode_info *ci = ceph_inode(inode);
+	struct list_head *head = &ci->i_unsafe_writes;
+	struct ceph_osd_request *req;
+	u64 last_tid;
+
+	spin_lock(&ci->i_unsafe_lock);
+	if (list_empty(head))
+		goto out;
+
+	/* set upper bound as _last_ entry in chain */
+	req = list_entry(head->prev, struct ceph_osd_request,
+			 r_unsafe_item);
+	last_tid = req->r_tid;
+
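+	/*
+	 * Hold a ref on each request while waiting for its completion;
+	 * we drop i_unsafe_lock across the wait, and the ref keeps the
+	 * request from being freed out from under us.
+	 */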
+	do {
+		ceph_osdc_get_request(req);
+		spin_unlock(&ci->i_unsafe_lock);
+		dout("sync_write_wait on tid %llu (until %llu)\n",
+		     req->r_tid, last_tid);
+		wait_for_completion(&req->r_safe_completion);
+		spin_lock(&ci->i_unsafe_lock);
+		ceph_osdc_put_request(req);
+
+		/*
+		 * from here on look at first entry in chain, since we
+		 * only want to wait for anything older than last_tid
+		 */
+		if (list_empty(head))
+			break;
+		req = list_entry(head->next, struct ceph_osd_request,
+				 r_unsafe_item);
+	} while (req->r_tid < last_tid);
+out:
+	spin_unlock(&ci->i_unsafe_lock);
+}
+
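+/*
+ * Wait for any unsafe OSD writes and dirty page writeback, then flush
+ * dirty caps to the MDS and, unless this is a datasync, wait for the
+ * metadata flush to be acked.
+ */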
+int ceph_fsync(struct file *file, struct dentry *dentry, int datasync)
+{
+	struct inode *inode = dentry->d_inode;
+	struct ceph_inode_info *ci = ceph_inode(inode);
+	unsigned flush_tid;
+	int ret;
+	int dirty;
+
+	dout("fsync %p%s\n", inode, datasync ? " datasync" : "");
+	sync_write_wait(inode);
+
+	ret = filemap_write_and_wait(inode->i_mapping);
+	if (ret < 0)
+		return ret;
+
+	dirty = try_flush_caps(inode, NULL, &flush_tid);
+	dout("fsync dirty caps are %s\n", ceph_cap_string(dirty));
+
+	/*
+	 * only wait on non-file metadata writeback (the mds
+	 * can recover size and mtime, so we don't need to
+	 * wait for that)
+	 */
+	if (!datasync && (dirty & ~CEPH_CAP_ANY_FILE_WR)) {
+		dout("fsync waiting for flush_tid %u\n", flush_tid);
+		ret = wait_event_interruptible(ci->i_cap_wq,
+				       caps_are_flushed(inode, flush_tid));
+	}
+
+	dout("fsync %p%s done\n", inode, datasync ? " datasync" : "");
+	return ret;
+}
+
+/*
+ * Flush any dirty caps back to the mds. If we aren't asked to wait,
+ * queue inode for flush but don't do so immediately, because we can
+ * get by with fewer MDS messages if we wait for data writeback to
+ * complete first.
+ */
+int ceph_write_inode(struct inode *inode, struct writeback_control *wbc)
+{
+	struct ceph_inode_info *ci = ceph_inode(inode);
+	unsigned flush_tid;
+	int err = 0;
+	int dirty;
+	int wait = wbc->sync_mode == WB_SYNC_ALL;
+
+	dout("write_inode %p wait=%d\n", inode, wait);
+	if (wait) {
+		dirty = try_flush_caps(inode, NULL, &flush_tid);
+		if (dirty)
+			err = wait_event_interruptible(ci->i_cap_wq,
+				       caps_are_flushed(inode, flush_tid));
+	} else {
+		struct ceph_mds_client *mdsc = &ceph_client(inode->i_sb)->mdsc;
+
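+		/*
+		 * Queue the inode at the front of the delayed-cap
+		 * list so its caps are flushed soon, but not here,
+		 * giving data writeback a chance to complete first.
+		 */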
+		spin_lock(&inode->i_lock);
+		if (__ceph_caps_dirty(ci))
+			__cap_delay_requeue_front(mdsc, ci);
+		spin_unlock(&inode->i_lock);
+	}
+	return err;
+}
+
+/*
+ * After a recovering MDS goes active, we need to resend any caps
+ * we were flushing.
+ *
+ * Caller holds session->s_mutex.
+ */
+static void kick_flushing_capsnaps(struct ceph_mds_client *mdsc,
+				   struct ceph_mds_session *session)
+{
+	struct ceph_cap_snap *capsnap;
+
+	dout("kick_flushing_capsnaps mds%d\n", session->s_mds);
+	list_for_each_entry(capsnap, &session->s_cap_snaps_flushing,
+			    flushing_item) {
+		struct ceph_inode_info *ci = capsnap->ci;
+		struct inode *inode = &ci->vfs_inode;
+		struct ceph_cap *cap;
+
+		spin_lock(&inode->i_lock);
+		cap = ci->i_auth_cap;
+		if (cap && cap->session == session) {
+			dout("kick_flushing_caps %p cap %p capsnap %p\n", inode,
+			     cap, capsnap);
+			__ceph_flush_snaps(ci, &session);
+		} else {
+			pr_err("%p auth cap %p not mds%d ???\n", inode,
+			       cap, session->s_mds);
+			spin_unlock(&inode->i_lock);
+		}
+	}
+}
+
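+/*
+ * Resend in-flight cap flushes to this session's MDS: first any
+ * snapped-metadata (capsnap) flushes, then regular cap flushes.
+ */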
+void ceph_kick_flushing_caps(struct ceph_mds_client *mdsc,
+			     struct ceph_mds_session *session)
+{
+	struct ceph_inode_info *ci;
+
+	kick_flushing_capsnaps(mdsc, session);
+
+	dout("kick_flushing_caps mds%d\n", session->s_mds);
+	list_for_each_entry(ci, &session->s_cap_flushing, i_flushing_item) {
+		struct inode *inode = &ci->vfs_inode;
+		struct ceph_cap *cap;
+		int delayed = 0;
+
+		spin_lock(&inode->i_lock);
+		cap = ci->i_auth_cap;
+		if (cap && cap->session == session) {
+			dout("kick_flushing_caps %p cap %p %s\n", inode,
+			     cap, ceph_cap_string(ci->i_flushing_caps));
+			delayed = __send_cap(mdsc, cap, CEPH_CAP_OP_FLUSH,
+					     __ceph_caps_used(ci),
+					     __ceph_caps_wanted(ci),
+					     cap->issued | cap->implemented,
+					     ci->i_flushing_caps, NULL);
+			if (delayed) {
+				spin_lock(&inode->i_lock);
+				__cap_delay_requeue(mdsc, ci);
+				spin_unlock(&inode->i_lock);
+			}
+		} else {
+			pr_err("%p auth cap %p not mds%d ???\n", inode,
+			       cap, session->s_mds);
+			spin_unlock(&inode->i_lock);
+		}
+	}
+}
+
+
+/*
+ * Take references to capabilities we hold, so that we don't release
+ * them to the MDS prematurely.
+ *
+ * Protected by i_lock.
+ */
+static void __take_cap_refs(struct ceph_inode_info *ci, int got)
+{
+	if (got & CEPH_CAP_PIN)
+		ci->i_pin_ref++;
+	if (got & CEPH_CAP_FILE_RD)
+		ci->i_rd_ref++;
+	if (got & CEPH_CAP_FILE_CACHE)
+		ci->i_rdcache_ref++;
+	if (got & CEPH_CAP_FILE_WR)
+		ci->i_wr_ref++;
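+	/*
+	 * The first FILE_BUFFER ref pins the inode with igrab(); the
+	 * matching iput() happens when the last buffered ref goes away
+	 * in ceph_put_cap_refs() or ceph_put_wrbuffer_cap_refs().
+	 */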
+	if (got & CEPH_CAP_FILE_BUFFER) {
+		if (ci->i_wrbuffer_ref == 0)
+			igrab(&ci->vfs_inode);
+		ci->i_wrbuffer_ref++;
+		dout("__take_cap_refs %p wrbuffer %d -> %d (?)\n",
+		     &ci->vfs_inode, ci->i_wrbuffer_ref-1, ci->i_wrbuffer_ref);
+	}
+}
+
+/*
+ * Try to grab cap references. Specify those refs we @want, and the
+ * minimal set we @need. Also include the larger offset we are writing
+ * to (when applicable), and check against max_size here as well.
+ * Note that caller is responsible for ensuring max_size increases are
+ * requested from the MDS.
+ */
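+/*
+ * Returns 0 if the caller should block and retry; nonzero once a
+ * decision has been made (refs taken, *err set, or *check_max set),
+ * which is what lets ceph_get_caps() use this as its wait_event
+ * condition.
+ */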
+static int try_get_cap_refs(struct ceph_inode_info *ci, int need, int want,
+			    int *got, loff_t endoff, int *check_max, int *err)
+{
+	struct inode *inode = &ci->vfs_inode;
+	int ret = 0;
+	int have, implemented;
+	int file_wanted;
+
+	dout("get_cap_refs %p need %s want %s\n", inode,
+	     ceph_cap_string(need), ceph_cap_string(want));
+	spin_lock(&inode->i_lock);
+
+	/* make sure file is actually open */
+	file_wanted = __ceph_caps_file_wanted(ci);
+	if ((file_wanted & need) == 0) {
+		dout("try_get_cap_refs need %s file_wanted %s, EBADF\n",
+		     ceph_cap_string(need), ceph_cap_string(file_wanted));
+		*err = -EBADF;
+		ret = 1;
+		goto out;
+	}
+
+	if (need & CEPH_CAP_FILE_WR) {
+		if (endoff >= 0 && endoff > (loff_t)ci->i_max_size) {
+			dout("get_cap_refs %p endoff %llu > maxsize %llu\n",
+			     inode, endoff, ci->i_max_size);
+			if (endoff > ci->i_wanted_max_size) {
+				*check_max = 1;
+				ret = 1;
+			}
+			goto out;
+		}
+		/*
+		 * If a sync write is in progress, we must wait, so that we
+		 * can get a final snapshot value for size+mtime.
+		 */
+		if (__ceph_have_pending_cap_snap(ci)) {
+			dout("get_cap_refs %p cap_snap_pending\n", inode);
+			goto out;
+		}
+	}
+	have = __ceph_caps_issued(ci, &implemented);
+
+	/*
+	 * disallow writes while a truncate is pending
+	 */
+	if (ci->i_truncate_pending)
+		have &= ~CEPH_CAP_FILE_WR;
+
+	if ((have & need) == need) {
+		/*
+		 * Look at (implemented & ~have & not) so that we keep waiting
+		 * on transition from wanted -> needed caps. This is needed
+		 * for WRBUFFER|WR -> WR to avoid a new WR sync write from
+		 * going before a prior buffered writeback happens.
+		 */
+		int not = want & ~(have & need);
+		int revoking = implemented & ~have;
+		dout("get_cap_refs %p have %s but not %s (revoking %s)\n",
+		     inode, ceph_cap_string(have), ceph_cap_string(not),
+		     ceph_cap_string(revoking));
+		if ((revoking & not) == 0) {
+			*got = need | (have & want);
+			__take_cap_refs(ci, *got);
+			ret = 1;
+		}
+	} else {
+		dout("get_cap_refs %p have %s needed %s\n", inode,
+		     ceph_cap_string(have), ceph_cap_string(need));
+	}
+out:
+	spin_unlock(&inode->i_lock);
+	dout("get_cap_refs %p ret %d got %s\n", inode,
+	     ret, ceph_cap_string(*got));
+	return ret;
+}
+
+/*
+ * Check the offset we are writing up to against our current
+ * max_size. If necessary, tell the MDS we want to write to
+ * a larger offset.
+ */
+static void check_max_size(struct inode *inode, loff_t endoff)
+{
+	struct ceph_inode_info *ci = ceph_inode(inode);
+	int check = 0;
+
+	/* do we need to explicitly request a larger max_size? */
+	spin_lock(&inode->i_lock);
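+	/*
+	 * Ask once we pass the current limit, or once we are writing
+	 * well past i_size (more than double it), and only if we
+	 * haven't already requested at least this much.
+	 */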
+	if ((endoff >= ci->i_max_size ||
+	     endoff > (inode->i_size << 1)) &&
+	    endoff > ci->i_wanted_max_size) {
+		dout("write %p at large endoff %llu, req max_size\n",
+		     inode, endoff);
+		ci->i_wanted_max_size = endoff;
+		check = 1;
+	}
+	spin_unlock(&inode->i_lock);
+	if (check)
+		ceph_check_caps(ci, CHECK_CAPS_AUTHONLY, NULL);
+}
+
+/*
+ * Wait for caps, and take cap references. If we can't get a WR cap
+ * due to a small max_size, make sure we check_max_size (and possibly
+ * ask the mds) so we don't get hung up indefinitely.
+ */
+int ceph_get_caps(struct ceph_inode_info *ci, int need, int want, int *got,
+		  loff_t endoff)
+{
+	int check_max, ret, err;
+
+retry:
+	if (endoff > 0)
+		check_max_size(&ci->vfs_inode, endoff);
+	check_max = 0;
+	err = 0;
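+	/*
+	 * try_get_cap_refs() returns nonzero once it has taken refs,
+	 * hit an error, or flagged check_max; in the last case we go
+	 * back, request a larger max_size, and wait again.
+	 */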
+	ret = wait_event_interruptible(ci->i_cap_wq,
+				       try_get_cap_refs(ci, need, want,
+							got, endoff,
+							&check_max, &err));
+	if (err)
+		ret = err;
+	if (check_max)
+		goto retry;
+	return ret;
+}
+
+/*
+ * Take cap refs. Caller must already know we hold at least one ref
+ * on the caps in question or we don't know this is safe.
+ */
+void ceph_get_cap_refs(struct ceph_inode_info *ci, int caps)
+{
+	spin_lock(&ci->vfs_inode.i_lock);
+	__take_cap_refs(ci, caps);
+	spin_unlock(&ci->vfs_inode.i_lock);
+}
+
+/*
+ * Release cap refs.
+ *
+ * If we released the last ref on any given cap, call ceph_check_caps
+ * to release (or schedule a release).
+ *
+ * If we are releasing a WR cap (from a sync write), finalize any affected
+ * cap_snap, and wake up any waiters.
+ */
+void ceph_put_cap_refs(struct ceph_inode_info *ci, int had)
+{
+	struct inode *inode = &ci->vfs_inode;
+	int last = 0, put = 0, flushsnaps = 0, wake = 0;
+	struct ceph_cap_snap *capsnap;
+
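+	/*
+	 * 'last' counts cap classes whose refcount just hit zero
+	 * (which triggers a check_caps below); 'put' means we must
+	 * drop the inode ref taken for the first buffered ref.
+	 */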
+	spin_lock(&inode->i_lock);
+	if (had & CEPH_CAP_PIN)
+		--ci->i_pin_ref;
+	if (had & CEPH_CAP_FILE_RD)
+		if (--ci->i_rd_ref == 0)
+			last++;
+	if (had & CEPH_CAP_FILE_CACHE)
+		if (--ci->i_rdcache_ref == 0)
+			last++;
+	if (had & CEPH_CAP_FILE_BUFFER) {
+		if (--ci->i_wrbuffer_ref == 0) {
+			last++;
+			put++;
+		}
+		dout("put_cap_refs %p wrbuffer %d -> %d (?)\n",
+		     inode, ci->i_wrbuffer_ref+1, ci->i_wrbuffer_ref);
+	}
+	if (had & CEPH_CAP_FILE_WR)
+		if (--ci->i_wr_ref == 0) {
+			last++;
+			if (!list_empty(&ci->i_cap_snaps)) {
+				capsnap = list_first_entry(&ci->i_cap_snaps,
+						     struct ceph_cap_snap,
+						     ci_item);
+				if (capsnap->writing) {
+					capsnap->writing = 0;
+					flushsnaps =
+						__ceph_finish_cap_snap(ci,
+								       capsnap);
+					wake = 1;
+				}
+			}
+		}
+	spin_unlock(&inode->i_lock);
+
+	dout("put_cap_refs %p had %s %s\n", inode, ceph_cap_string(had),
+	     last ? "last" : "");
+
+	if (last && !flushsnaps)
+		ceph_check_caps(ci, 0, NULL);
+	else if (flushsnaps)
+		ceph_flush_snaps(ci);
+	if (wake)
+		wake_up(&ci->i_cap_wq);
+	if (put)
+		iput(inode);
+}
+
+/*
+ * Release @nr WRBUFFER refs on dirty pages for the given @snapc snap
+ * context. Adjust per-snap dirty page accounting as appropriate.
+ * Once all dirty data for a cap_snap is flushed, flush snapped file
+ * metadata back to the MDS. If we dropped the last ref, call
+ * ceph_check_caps.
+ */
+void ceph_put_wrbuffer_cap_refs(struct ceph_inode_info *ci, int nr,
+				struct ceph_snap_context *snapc)
+{
+	struct inode *inode = &ci->vfs_inode;
+	int last = 0;
+	int last_snap = 0;
+	int found = 0;
+	struct ceph_cap_snap *capsnap = NULL;
+
+	spin_lock(&inode->i_lock);
+	ci->i_wrbuffer_ref -= nr;
+	last = !ci->i_wrbuffer_ref;
+
+	if (ci->i_head_snapc == snapc) {
+		ci->i_wrbuffer_ref_head -= nr;
+		if (!ci->i_wrbuffer_ref_head) {
+			ceph_put_snap_context(ci->i_head_snapc);
+			ci->i_head_snapc = NULL;
+		}
+		dout("put_wrbuffer_cap_refs on %p head %d/%d -> %d/%d %s\n",
+		     inode,
+		     ci->i_wrbuffer_ref+nr, ci->i_wrbuffer_ref_head+nr,
+		     ci->i_wrbuffer_ref, ci->i_wrbuffer_ref_head,
+		     last ? " LAST" : "");
+	} else {
+		list_for_each_entry(capsnap, &ci->i_cap_snaps, ci_item) {
+			if (capsnap->context == snapc) {
+				found = 1;
+				capsnap->dirty_pages -= nr;
+				last_snap = !capsnap->dirty_pages;
+				break;
+			}
+		}
+		BUG_ON(!found);
+		dout("put_wrbuffer_cap_refs on %p cap_snap %p "
+		     "snap %lld %d/%d -> %d/%d %s%s\n",
+		     inode, capsnap, capsnap->context->seq,
+		     ci->i_wrbuffer_ref+nr, capsnap->dirty_pages + nr,
+		     ci->i_wrbuffer_ref, capsnap->dirty_pages,
+		     last ? " (wrbuffer last)" : "",
+		     last_snap ? " (capsnap last)" : "");
+	}
+
+	spin_unlock(&inode->i_lock);
+
+	if (last) {
+		ceph_check_caps(ci, CHECK_CAPS_AUTHONLY, NULL);
+		iput(inode);
+	} else if (last_snap) {
+		ceph_flush_snaps(ci);
+		wake_up(&ci->i_cap_wq);
+	}
+}
+
+/*
+ * Handle a cap GRANT message from the MDS. (Note that a GRANT may
+ * actually be a revocation if it specifies a smaller cap set.)
+ *
+ * caller holds s_mutex.
+ * return value:
+ *  0 - ok
+ *  1 - check_caps on auth cap only (writeback)
+ *  2 - check_caps (ack revoke)
+ */
+static int handle_cap_grant(struct inode *inode, struct ceph_mds_caps *grant,
+			    struct ceph_mds_session *session,
+			    struct ceph_cap *cap,
+			    struct ceph_buffer *xattr_buf)
+	__releases(inode->i_lock)
+{
+	struct ceph_inode_info *ci = ceph_inode(inode);
+	int mds = session->s_mds;
+	int seq = le32_to_cpu(grant->seq);
+	int newcaps = le32_to_cpu(grant->caps);
+	int issued, implemented, used, wanted, dirty;
+	u64 size = le64_to_cpu(grant->size);
+	u64 max_size = le64_to_cpu(grant->max_size);
+	struct timespec mtime, atime, ctime;
+	int reply = 0;
+	int wake = 0;
+	int writeback = 0;
+	int revoked_rdcache = 0;
+	int queue_invalidate = 0;
+
+	dout("handle_cap_grant inode %p cap %p mds%d seq %d %s\n",
+	     inode, cap, mds, seq, ceph_cap_string(newcaps));
+	dout(" size %llu max_size %llu, i_size %llu\n", size, max_size,
+	     inode->i_size);
+
+	/*
+	 * If CACHE is being revoked, and we have no dirty buffers,
+	 * try to invalidate (once). (If there are dirty buffers, we
+	 * will invalidate _after_ writeback.)
+	 */
+	if (((cap->issued & ~newcaps) & CEPH_CAP_FILE_CACHE) &&
+	    !ci->i_wrbuffer_ref) {
+		if (try_nonblocking_invalidate(inode) == 0) {
+			revoked_rdcache = 1;
+		} else {
+			/* there were locked pages.. invalidate later
+			   in a separate thread. */
+			if (ci->i_rdcache_revoking != ci->i_rdcache_gen) {
+				queue_invalidate = 1;
+				ci->i_rdcache_revoking = ci->i_rdcache_gen;
+			}
+		}
+	}
+
+	/* side effects now are allowed */
+
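+	/*
+	 * Count anything we currently have (implemented) or have
+	 * dirtied locally as issued, so that stale fields in the
+	 * grant don't clobber local state below.
+	 */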
+	issued = __ceph_caps_issued(ci, &implemented);
+	issued |= implemented | __ceph_caps_dirty(ci);
+
+	cap->cap_gen = session->s_cap_gen;
+
+	__check_cap_issue(ci, cap, newcaps);
+
+	if ((issued & CEPH_CAP_AUTH_EXCL) == 0) {
+		inode->i_mode = le32_to_cpu(grant->mode);
+		inode->i_uid = le32_to_cpu(grant->uid);
+		inode->i_gid = le32_to_cpu(grant->gid);
+		dout("%p mode 0%o uid.gid %d.%d\n", inode, inode->i_mode,
+		     inode->i_uid, inode->i_gid);
+	}
+
+	if ((issued & CEPH_CAP_LINK_EXCL) == 0)
+		inode->i_nlink = le32_to_cpu(grant->nlink);
+
+	if ((issued & CEPH_CAP_XATTR_EXCL) == 0 && grant->xattr_len) {
+		int len = le32_to_cpu(grant->xattr_len);
+		u64 version = le64_to_cpu(grant->xattr_version);
+
+		if (version > ci->i_xattrs.version) {
+			dout(" got new xattrs v%llu on %p len %d\n",
+			     version, inode, len);
+			if (ci->i_xattrs.blob)
+				ceph_buffer_put(ci->i_xattrs.blob);
+			ci->i_xattrs.blob = ceph_buffer_get(xattr_buf);
+			ci->i_xattrs.version = version;
+		}
+	}
+
+	/* size/ctime/mtime/atime? */
+	ceph_fill_file_size(inode, issued,
+			    le32_to_cpu(grant->truncate_seq),
+			    le64_to_cpu(grant->truncate_size), size);
+	ceph_decode_timespec(&mtime, &grant->mtime);
+	ceph_decode_timespec(&atime, &grant->atime);
+	ceph_decode_timespec(&ctime, &grant->ctime);
+	ceph_fill_file_time(inode, issued,
+			    le32_to_cpu(grant->time_warp_seq), &ctime, &mtime,
+			    &atime);
+
+	/* max size increase? */
+	if (max_size != ci->i_max_size) {
+		dout("max_size %lld -> %llu\n", ci->i_max_size, max_size);
+		ci->i_max_size = max_size;
+		if (max_size >= ci->i_wanted_max_size) {
+			ci->i_wanted_max_size = 0;  /* reset */
+			ci->i_requested_max_size = 0;
+		}
+		wake = 1;
+	}
+
+	/* check cap bits */
+	wanted = __ceph_caps_wanted(ci);
+	used = __ceph_caps_used(ci);
+	dirty = __ceph_caps_dirty(ci);
+	dout(" my wanted = %s, used = %s, dirty %s\n",
+	     ceph_cap_string(wanted),
+	     ceph_cap_string(used),
+	     ceph_cap_string(dirty));
+	if (wanted != le32_to_cpu(grant->wanted)) {
+		dout("mds wanted %s -> %s\n",
+		     ceph_cap_string(le32_to_cpu(grant->wanted)),
+		     ceph_cap_string(wanted));
+		grant->wanted = cpu_to_le32(wanted);
+	}
+
+	cap->seq = seq;
+
+	/* file layout may have changed */
+	ci->i_layout = grant->layout;
+
+	/* revocation, grant, or no-op? */
+	if (cap->issued & ~newcaps) {
+		dout("revocation: %s -> %s\n", ceph_cap_string(cap->issued),
+		     ceph_cap_string(newcaps));
+		if ((used & ~newcaps) & CEPH_CAP_FILE_BUFFER)
+			writeback = 1; /* will delay ack */
+		else if (dirty & ~newcaps)
+			reply = 1;     /* initiate writeback in check_caps */
+		else if (((used & ~newcaps) & CEPH_CAP_FILE_CACHE) == 0 ||
+			   revoked_rdcache)
+			reply = 2;     /* send revoke ack in check_caps */
+		cap->issued = newcaps;
+	} else if (cap->issued == newcaps) {
+		dout("caps unchanged: %s -> %s\n",
+		     ceph_cap_string(cap->issued), ceph_cap_string(newcaps));
+	} else {
+		dout("grant: %s -> %s\n", ceph_cap_string(cap->issued),
+		     ceph_cap_string(newcaps));
+		cap->issued = newcaps;
+		cap->implemented |= newcaps; /* add bits only, to
+					      * avoid stepping on a
+					      * pending revocation */
+		wake = 1;
+	}
+
+	spin_unlock(&inode->i_lock);
+	if (writeback)
+		/*
+		 * queue inode for writeback: we can't actually call
+		 * filemap_write_and_wait, etc. from message handler
+		 * context.
+		 */
+		ceph_queue_writeback(inode);
+	if (queue_invalidate)
+		ceph_queue_invalidate(inode);
+	if (wake)
+		wake_up(&ci->i_cap_wq);
+	return reply;
+}
+
+/*
+ * Handle FLUSH_ACK from MDS, indicating that metadata we sent to the
+ * MDS has been safely committed.
+ */
+static void handle_cap_flush_ack(struct inode *inode, u64 flush_tid,
+				 struct ceph_mds_caps *m,
+				 struct ceph_mds_session *session,
+				 struct ceph_cap *cap)
+	__releases(inode->i_lock)
+{
+	struct ceph_inode_info *ci = ceph_inode(inode);
+	struct ceph_mds_client *mdsc = &ceph_client(inode->i_sb)->mdsc;
+	unsigned seq = le32_to_cpu(m->seq);
+	int dirty = le32_to_cpu(m->dirty);
+	int cleaned = 0;
+	int drop = 0;
+	int i;
+
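+	/*
+	 * Only clean bits whose recorded flush tid matches this ack;
+	 * acks for older flushes that have since been resent with a
+	 * newer tid are ignored.
+	 */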
+	for (i = 0; i < CEPH_CAP_BITS; i++)
+		if ((dirty & (1 << i)) &&
+		    flush_tid == ci->i_cap_flush_tid[i])
+			cleaned |= 1 << i;
+
+	dout("handle_cap_flush_ack inode %p mds%d seq %d on %s cleaned %s,"
+	     " flushing %s -> %s\n",
+	     inode, session->s_mds, seq, ceph_cap_string(dirty),
+	     ceph_cap_string(cleaned), ceph_cap_string(ci->i_flushing_caps),
+	     ceph_cap_string(ci->i_flushing_caps & ~cleaned));
+
+	if (ci->i_flushing_caps == (ci->i_flushing_caps & ~cleaned))
+		goto out;
+
+	ci->i_flushing_caps &= ~cleaned;
+
+	spin_lock(&mdsc->cap_dirty_lock);
+	if (ci->i_flushing_caps == 0) {
+		list_del_init(&ci->i_flushing_item);
+		if (!list_empty(&session->s_cap_flushing))
+			dout(" mds%d still flushing cap on %p\n",
+			     session->s_mds,
+			     &list_entry(session->s_cap_flushing.next,
+					 struct ceph_inode_info,
+					 i_flushing_item)->vfs_inode);
+		mdsc->num_cap_flushing--;
+		wake_up(&mdsc->cap_flushing_wq);
+		dout(" inode %p now !flushing\n", inode);
+
+		if (ci->i_dirty_caps == 0) {
+			dout(" inode %p now clean\n", inode);
+			BUG_ON(!list_empty(&ci->i_dirty_item));
+			drop = 1;
+		} else {
+			BUG_ON(list_empty(&ci->i_dirty_item));
+		}
+	}
+	spin_unlock(&mdsc->cap_dirty_lock);
+	wake_up(&ci->i_cap_wq);
+
+out:
+	spin_unlock(&inode->i_lock);
+	if (drop)
+		iput(inode);
+}
+
+/*
+ * Handle FLUSHSNAP_ACK. MDS has flushed snap data to disk and we can
+ * throw away our cap_snap.
+ *
+ * Caller holds s_mutex.
+ */
+static void handle_cap_flushsnap_ack(struct inode *inode, u64 flush_tid,
+				     struct ceph_mds_caps *m,
+				     struct ceph_mds_session *session)
+{
+	struct ceph_inode_info *ci = ceph_inode(inode);
+	u64 follows = le64_to_cpu(m->snap_follows);
+	struct ceph_cap_snap *capsnap;
+	int drop = 0;
+
+	dout("handle_cap_flushsnap_ack inode %p ci %p mds%d follows %lld\n",
+	     inode, ci, session->s_mds, follows);
+
+	spin_lock(&inode->i_lock);
+	list_for_each_entry(capsnap, &ci->i_cap_snaps, ci_item) {
+		if (capsnap->follows == follows) {
+			if (capsnap->flush_tid != flush_tid) {
+				dout(" cap_snap %p follows %lld tid %lld !="
+				     " %lld\n", capsnap, follows,
+				     flush_tid, capsnap->flush_tid);
+				break;
+			}
+			WARN_ON(capsnap->dirty_pages || capsnap->writing);
+			dout(" removing cap_snap %p follows %lld\n",
+			     capsnap, follows);
+			ceph_put_snap_context(capsnap->context);
+			list_del(&capsnap->ci_item);
+			list_del(&capsnap->flushing_item);
+			ceph_put_cap_snap(capsnap);
+			drop = 1;
+			break;
+		} else {
+			dout(" skipping cap_snap %p follows %lld\n",
+			     capsnap, capsnap->follows);
+		}
+	}
+	spin_unlock(&inode->i_lock);
+	if (drop)
+		iput(inode);
+}
+
+/*
+ * Handle TRUNC from MDS, indicating file truncation.
+ *
+ * caller holds s_mutex.
+ */
+static void handle_cap_trunc(struct inode *inode,
+			     struct ceph_mds_caps *trunc,
+			     struct ceph_mds_session *session)
+	__releases(inode->i_lock)
+{
+	struct ceph_inode_info *ci = ceph_inode(inode);
+	int mds = session->s_mds;
+	int seq = le32_to_cpu(trunc->seq);
+	u32 truncate_seq = le32_to_cpu(trunc->truncate_seq);
+	u64 truncate_size = le64_to_cpu(trunc->truncate_size);
+	u64 size = le64_to_cpu(trunc->size);
+	int implemented = 0;
+	int dirty = __ceph_caps_dirty(ci);
+	int issued = __ceph_caps_issued(ci, &implemented);
+	int queue_trunc = 0;
+
+	issued |= implemented | dirty;
+
+	dout("handle_cap_trunc inode %p mds%d seq %d to %lld seq %d\n",
+	     inode, mds, seq, truncate_size, truncate_seq);
+	queue_trunc = ceph_fill_file_size(inode, issued,
+					  truncate_seq, truncate_size, size);
+	spin_unlock(&inode->i_lock);
+
+	if (queue_trunc)
+		ceph_queue_vmtruncate(inode);
+}
+
+/*
+ * Handle EXPORT from MDS. Cap is being migrated _from_ this mds to a
+ * different one. If we are the most recent migration we've seen (as
+ * indicated by mseq), make note of the migrating cap bits for the
+ * duration (until we see the corresponding IMPORT).
+ *
+ * caller holds s_mutex
+ */
+static void handle_cap_export(struct inode *inode, struct ceph_mds_caps *ex,
+			      struct ceph_mds_session *session)
+{
+	struct ceph_inode_info *ci = ceph_inode(inode);
+	int mds = session->s_mds;
+	unsigned mseq = le32_to_cpu(ex->migrate_seq);
+	struct ceph_cap *cap = NULL, *t;
+	struct rb_node *p;
+	int remember = 1;
+
+	dout("handle_cap_export inode %p ci %p mds%d mseq %d\n",
+	     inode, ci, mds, mseq);
+
+	spin_lock(&inode->i_lock);
+
+	/* make sure we haven't seen a higher mseq */
+	for (p = rb_first(&ci->i_caps); p; p = rb_next(p)) {
+		t = rb_entry(p, struct ceph_cap, ci_node);
+		if (ceph_seq_cmp(t->mseq, mseq) > 0) {
+			dout(" higher mseq on cap from mds%d\n",
+			     t->session->s_mds);
+			remember = 0;
+		}
+		if (t->session->s_mds == mds)
+			cap = t;
+	}
+
+	if (cap) {
+		if (remember) {
+			/* make note */
+			ci->i_cap_exporting_mds = mds;
+			ci->i_cap_exporting_mseq = mseq;
+			ci->i_cap_exporting_issued = cap->issued;
+		}
+		__ceph_remove_cap(cap);
+	} else {
+		WARN_ON(!cap);
+	}
+
+	spin_unlock(&inode->i_lock);
+}
+
+/*
+ * Handle cap IMPORT. If there are temp bits from an older EXPORT,
+ * clean them up.
+ *
+ * caller holds s_mutex.
+ */
+static void handle_cap_import(struct ceph_mds_client *mdsc,
+			      struct inode *inode, struct ceph_mds_caps *im,
+			      struct ceph_mds_session *session,
+			      void *snaptrace, int snaptrace_len)
+{
+	struct ceph_inode_info *ci = ceph_inode(inode);
+	int mds = session->s_mds;
+	unsigned issued = le32_to_cpu(im->caps);
+	unsigned wanted = le32_to_cpu(im->wanted);
+	unsigned seq = le32_to_cpu(im->seq);
+	unsigned mseq = le32_to_cpu(im->migrate_seq);
+	u64 realmino = le64_to_cpu(im->realm);
+	u64 cap_id = le64_to_cpu(im->cap_id);
+
+	if (ci->i_cap_exporting_mds >= 0 &&
+	    ceph_seq_cmp(ci->i_cap_exporting_mseq, mseq) < 0) {
+		dout("handle_cap_import inode %p ci %p mds%d mseq %d"
+		     " - cleared exporting from mds%d\n",
+		     inode, ci, mds, mseq,
+		     ci->i_cap_exporting_mds);
+		ci->i_cap_exporting_issued = 0;
+		ci->i_cap_exporting_mseq = 0;
+		ci->i_cap_exporting_mds = -1;
+	} else {
+		dout("handle_cap_import inode %p ci %p mds%d mseq %d\n",
+		     inode, ci, mds, mseq);
+	}
+
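+	/*
+	 * Process the snap trace under snap_rwsem held for write, then
+	 * downgrade to read for add_cap and the flush; the final
+	 * up_read() drops it.
+	 */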
+	down_write(&mdsc->snap_rwsem);
+	ceph_update_snap_trace(mdsc, snaptrace, snaptrace+snaptrace_len,
+			       false);
+	downgrade_write(&mdsc->snap_rwsem);
+	ceph_add_cap(inode, session, cap_id, -1,
+		     issued, wanted, seq, mseq, realmino, CEPH_CAP_FLAG_AUTH,
+		     NULL /* no caps context */);
+	try_flush_caps(inode, session, NULL);
+	up_read(&mdsc->snap_rwsem);
+}
+
+/*
+ * Handle a caps message from the MDS.
+ *
+ * Identify the appropriate session, inode, and call the right handler
+ * based on the cap op.
+ */
+void ceph_handle_caps(struct ceph_mds_session *session,
+		      struct ceph_msg *msg)
+{
+	struct ceph_mds_client *mdsc = session->s_mdsc;
+	struct super_block *sb = mdsc->client->sb;
+	struct inode *inode;
+	struct ceph_cap *cap;
+	struct ceph_mds_caps *h;
+	int mds = session->s_mds;
+	int op;
+	u32 seq;
+	struct ceph_vino vino;
+	u64 cap_id;
+	u64 size, max_size;
+	u64 tid;
+	int check_caps = 0;
+	void *snaptrace;
+	int r;
+
+	dout("handle_caps from mds%d\n", mds);
+
+	/* decode */
+	tid = le64_to_cpu(msg->hdr.tid);
+	if (msg->front.iov_len < sizeof(*h))
+		goto bad;
+	h = msg->front.iov_base;
+	snaptrace = h + 1;
+	op = le32_to_cpu(h->op);
+	vino.ino = le64_to_cpu(h->ino);
+	vino.snap = CEPH_NOSNAP;
+	cap_id = le64_to_cpu(h->cap_id);
+	seq = le32_to_cpu(h->seq);
+	size = le64_to_cpu(h->size);
+	max_size = le64_to_cpu(h->max_size);
+
+	mutex_lock(&session->s_mutex);
+	session->s_seq++;
+	dout(" mds%d seq %lld cap seq %u\n", session->s_mds, session->s_seq,
+	     (unsigned)seq);
+
+	/* lookup ino */
+	inode = ceph_find_inode(sb, vino);
+	dout(" op %s ino %llx.%llx inode %p\n", ceph_cap_op_name(op), vino.ino,
+	     vino.snap, inode);
+	if (!inode) {
+		dout(" i don't have ino %llx\n", vino.ino);
+		goto done;
+	}
+
+	/* these will work even if we don't have a cap yet */
+	switch (op) {
+	case CEPH_CAP_OP_FLUSHSNAP_ACK:
+		handle_cap_flushsnap_ack(inode, tid, h, session);
+		goto done;
+
+	case CEPH_CAP_OP_EXPORT:
+		handle_cap_export(inode, h, session);
+		goto done;
+
+	case CEPH_CAP_OP_IMPORT:
+		handle_cap_import(mdsc, inode, h, session,
+				  snaptrace, le32_to_cpu(h->snap_trace_len));
+		check_caps = 1; /* we may have sent a RELEASE to the old auth */
+		goto done;
+	}
+
+	/* the rest require a cap */
+	spin_lock(&inode->i_lock);
+	cap = __get_cap_for_mds(ceph_inode(inode), mds);
+	if (!cap) {
+		dout("no cap on %p ino %llx.%llx from mds%d, releasing\n",
+		     inode, ceph_ino(inode), ceph_snap(inode), mds);
+		spin_unlock(&inode->i_lock);
+		goto done;
+	}
+
+	/* note that each of these drops i_lock for us */
+	switch (op) {
+	case CEPH_CAP_OP_REVOKE:
+	case CEPH_CAP_OP_GRANT:
+		r = handle_cap_grant(inode, h, session, cap, msg->middle);
+		if (r == 1)
+			ceph_check_caps(ceph_inode(inode),
+					CHECK_CAPS_NODELAY|CHECK_CAPS_AUTHONLY,
+					session);
+		else if (r == 2)
+			ceph_check_caps(ceph_inode(inode),
+					CHECK_CAPS_NODELAY,
+					session);
+		break;
+
+	case CEPH_CAP_OP_FLUSH_ACK:
+		handle_cap_flush_ack(inode, tid, h, session, cap);
+		break;
+
+	case CEPH_CAP_OP_TRUNC:
+		handle_cap_trunc(inode, h, session);
+		break;
+
+	default:
+		spin_unlock(&inode->i_lock);
+		pr_err("ceph_handle_caps: unknown cap op %d %s\n", op,
+		       ceph_cap_op_name(op));
+	}
+
+done:
+	mutex_unlock(&session->s_mutex);
+
+	if (check_caps)
+		ceph_check_caps(ceph_inode(inode), CHECK_CAPS_NODELAY, NULL);
+	if (inode)
+		iput(inode);
+	return;
+
+bad:
+	pr_err("ceph_handle_caps: corrupt message\n");
+	ceph_msg_dump(msg);
+	return;
+}
+
+/*
+ * Delayed work handler to process end of delayed cap release LRU list.
+ */
+void ceph_check_delayed_caps(struct ceph_mds_client *mdsc)
+{
+	struct ceph_inode_info *ci;
+	int flags = CHECK_CAPS_NODELAY;
+
+	dout("check_delayed_caps\n");
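+	/*
+	 * Note: both breaks below exit with cap_delay_lock held; the
+	 * unlock after the loop drops it.
+	 */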
+	while (1) {
+		spin_lock(&mdsc->cap_delay_lock);
+		if (list_empty(&mdsc->cap_delay_list))
+			break;
+		ci = list_first_entry(&mdsc->cap_delay_list,
+				      struct ceph_inode_info,
+				      i_cap_delay_list);
+		if ((ci->i_ceph_flags & CEPH_I_FLUSH) == 0 &&
+		    time_before(jiffies, ci->i_hold_caps_max))
+			break;
+		list_del_init(&ci->i_cap_delay_list);
+		spin_unlock(&mdsc->cap_delay_lock);
+		dout("check_delayed_caps on %p\n", &ci->vfs_inode);
+		ceph_check_caps(ci, flags, NULL);
+	}
+	spin_unlock(&mdsc->cap_delay_lock);
+}
+
+/*
+ * Flush all dirty caps to the mds
+ */
+void ceph_flush_dirty_caps(struct ceph_mds_client *mdsc)
+{
+	struct ceph_inode_info *ci, *nci = NULL;
+	struct inode *inode, *ninode = NULL;
+	struct list_head *p, *n;
+
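+	/*
+	 * We can't hold cap_dirty_lock across ceph_check_caps(), so
+	 * pin the *next* entry (igrab + CEPH_I_NOFLUSH) before
+	 * dropping the lock; NOFLUSH keeps it dirty, and therefore on
+	 * the list, until we get to it.
+	 */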
+	dout("flush_dirty_caps\n");
+	spin_lock(&mdsc->cap_dirty_lock);
+	list_for_each_safe(p, n, &mdsc->cap_dirty) {
+		if (nci) {
+			ci = nci;
+			inode = ninode;
+			ci->i_ceph_flags &= ~CEPH_I_NOFLUSH;
+			dout("flush_dirty_caps inode %p (was next inode)\n",
+			     inode);
+		} else {
+			ci = list_entry(p, struct ceph_inode_info,
+					i_dirty_item);
+			inode = igrab(&ci->vfs_inode);
+			BUG_ON(!inode);
+			dout("flush_dirty_caps inode %p\n", inode);
+		}
+		if (n != &mdsc->cap_dirty) {
+			nci = list_entry(n, struct ceph_inode_info,
+					 i_dirty_item);
+			ninode = igrab(&nci->vfs_inode);
+			BUG_ON(!ninode);
+			nci->i_ceph_flags |= CEPH_I_NOFLUSH;
+			dout("flush_dirty_caps next inode %p, noflush\n",
+			     ninode);
+		} else {
+			nci = NULL;
+			ninode = NULL;
+		}
+		spin_unlock(&mdsc->cap_dirty_lock);
+		if (inode) {
+			ceph_check_caps(ci, CHECK_CAPS_NODELAY|CHECK_CAPS_FLUSH,
+					NULL);
+			iput(inode);
+		}
+		spin_lock(&mdsc->cap_dirty_lock);
+	}
+	spin_unlock(&mdsc->cap_dirty_lock);
+}
+
+/*
+ * Drop open file reference. If we were the last open file,
+ * we may need to release capabilities to the MDS (or schedule
+ * their delayed release).
+ */
+void ceph_put_fmode(struct ceph_inode_info *ci, int fmode)
+{
+	struct inode *inode = &ci->vfs_inode;
+	int last = 0;
+
+	spin_lock(&inode->i_lock);
+	dout("put_fmode %p fmode %d %d -> %d\n", inode, fmode,
+	     ci->i_nr_by_mode[fmode], ci->i_nr_by_mode[fmode]-1);
+	BUG_ON(ci->i_nr_by_mode[fmode] == 0);
+	if (--ci->i_nr_by_mode[fmode] == 0)
+		last++;
+	spin_unlock(&inode->i_lock);
+
+	if (last && ci->i_vino.snap == CEPH_NOSNAP)
+		ceph_check_caps(ci, 0, NULL);
+}
+
+/*
+ * Helpers for embedding cap and dentry lease releases into mds
+ * requests.
+ *
+ * @force is used by dentry_release (below) to force inclusion of a
+ * record for the directory inode, even when there aren't any caps to
+ * drop.
+ */
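+/*
+ * The caller is expected to have reserved room at *p for the release
+ * record (and, for dentry releases, the dname); on success *p is
+ * advanced past what we wrote and 1 is returned.
+ */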
+int ceph_encode_inode_release(void **p, struct inode *inode,
+			      int mds, int drop, int unless, int force)
+{
+	struct ceph_inode_info *ci = ceph_inode(inode);
+	struct ceph_cap *cap;
+	struct ceph_mds_request_release *rel = *p;
+	int ret = 0;
+
+	dout("encode_inode_release %p mds%d drop %s unless %s\n", inode,
+	     mds, ceph_cap_string(drop), ceph_cap_string(unless));
+
+	spin_lock(&inode->i_lock);
+	cap = __get_cap_for_mds(ci, mds);
+	if (cap && __cap_is_valid(cap)) {
+		if (force ||
+		    ((cap->issued & drop) &&
+		     (cap->issued & unless) == 0)) {
+			if ((cap->issued & drop) &&
+			    (cap->issued & unless) == 0) {
+				dout("encode_inode_release %p cap %p %s -> "
+				     "%s\n", inode, cap,
+				     ceph_cap_string(cap->issued),
+				     ceph_cap_string(cap->issued & ~drop));
+				cap->issued &= ~drop;
+				cap->implemented &= ~drop;
+				if (ci->i_ceph_flags & CEPH_I_NODELAY) {
+					int wanted = __ceph_caps_wanted(ci);
+					dout(" wanted %s -> %s (act %s)\n",
+					     ceph_cap_string(cap->mds_wanted),
+					     ceph_cap_string(cap->mds_wanted &
+							     ~wanted),
+					     ceph_cap_string(wanted));
+					cap->mds_wanted &= wanted;
+				}
+			} else {
+				dout("encode_inode_release %p cap %p %s"
+				     " (force)\n", inode, cap,
+				     ceph_cap_string(cap->issued));
+			}
+
+			rel->ino = cpu_to_le64(ceph_ino(inode));
+			rel->cap_id = cpu_to_le64(cap->cap_id);
+			rel->seq = cpu_to_le32(cap->seq);
+			rel->issue_seq = cpu_to_le32(cap->issue_seq);
+			rel->mseq = cpu_to_le32(cap->mseq);
+			rel->caps = cpu_to_le32(cap->issued);
+			rel->wanted = cpu_to_le32(cap->mds_wanted);
+			rel->dname_len = 0;
+			rel->dname_seq = 0;
+			*p += sizeof(*rel);
+			ret = 1;
+		} else {
+			dout("encode_inode_release %p cap %p %s\n",
+			     inode, cap, ceph_cap_string(cap->issued));
+		}
+	}
+	spin_unlock(&inode->i_lock);
+	return ret;
+}
+
+int ceph_encode_dentry_release(void **p, struct dentry *dentry,
+			       int mds, int drop, int unless)
+{
+	struct inode *dir = dentry->d_parent->d_inode;
+	struct ceph_mds_request_release *rel = *p;
+	struct ceph_dentry_info *di = ceph_dentry(dentry);
+	int force = 0;
+	int ret;
+
+	/*
+	 * force a record for the directory caps if we have a dentry lease.
+	 * this is racy (can't take i_lock and d_lock together), but it
+	 * doesn't have to be perfect; the mds will revoke anything we don't
+	 * release.
+	 */
+	spin_lock(&dentry->d_lock);
+	if (di->lease_session && di->lease_session->s_mds == mds)
+		force = 1;
+	spin_unlock(&dentry->d_lock);
+
+	ret = ceph_encode_inode_release(p, dir, mds, drop, unless, force);
+
+	spin_lock(&dentry->d_lock);
+	if (ret && di->lease_session && di->lease_session->s_mds == mds) {
+		dout("encode_dentry_release %p mds%d seq %d\n",
+		     dentry, mds, (int)di->lease_seq);
+		rel->dname_len = cpu_to_le32(dentry->d_name.len);
+		memcpy(*p, dentry->d_name.name, dentry->d_name.len);
+		*p += dentry->d_name.len;
+		rel->dname_seq = cpu_to_le32(di->lease_seq);
+	}
+	spin_unlock(&dentry->d_lock);
+	return ret;
+}