@@ -17,6 +17,7 @@
 #include <linux/quotaops.h>
 #include <linux/acct.h>
 #include <linux/capability.h>
+#include <linux/cpumask.h>
 #include <linux/module.h>
 #include <linux/sysfs.h>
 #include <linux/seq_file.h>
@@ -55,6 +56,8 @@ static inline unsigned long hash(struct vfsmount *mnt, struct dentry *dentry)
 return tmp & (HASH_SIZE - 1);
 }
 
+#define MNT_WRITER_UNDERFLOW_LIMIT -(1<<16)
+
 struct vfsmount *alloc_vfsmnt(const char *name)
 {
 struct vfsmount *mnt = kmem_cache_zalloc(mnt_cache, GFP_KERNEL);
@@ -68,6 +71,7 @@ struct vfsmount *alloc_vfsmnt(const char *name)
 INIT_LIST_HEAD(&mnt->mnt_share);
 INIT_LIST_HEAD(&mnt->mnt_slave_list);
 INIT_LIST_HEAD(&mnt->mnt_slave);
+ atomic_set(&mnt->__mnt_writers, 0);
 if (name) {
 int size = strlen(name) + 1;
 char *newname = kmalloc(size, GFP_KERNEL);
@@ -80,6 +84,92 @@ struct vfsmount *alloc_vfsmnt(const char *name)
 return mnt;
 }
 
+/*
+ * Most r/o checks on a fs are for operations that take
+ * discrete amounts of time, like a write() or unlink().
+ * We must keep track of when those operations start
+ * (for permission checks) and when they end, so that
+ * we can determine when writes are able to occur to
+ * a filesystem.
+ */
+/*
+ * __mnt_is_readonly: check whether a mount is read-only
+ * @mnt: the mount to check for its write status
+ *
+ * This shouldn't be used directly outside of the VFS.
+ * It does not guarantee that the filesystem will stay
+ * r/w, just that it is right *now*. This can not and
+ * should not be used in place of IS_RDONLY(inode).
+ * mnt_want/drop_write() will _keep_ the filesystem
+ * r/w.
+ */
+int __mnt_is_readonly(struct vfsmount *mnt)
+{
+ return (mnt->mnt_sb->s_flags & MS_RDONLY);
+}
+EXPORT_SYMBOL_GPL(__mnt_is_readonly);
+
+struct mnt_writer {
+ /*
+ * If holding multiple instances of this lock, they
+ * must be ordered by cpu number.
+ */
+ spinlock_t lock;
+ struct lock_class_key lock_class; /* compiles out with !lockdep */
+ unsigned long count;
+ struct vfsmount *mnt;
+} ____cacheline_aligned_in_smp;
+static DEFINE_PER_CPU(struct mnt_writer, mnt_writers);
+
+static int __init init_mnt_writers(void)
+{
+ int cpu;
+ for_each_possible_cpu(cpu) {
+ struct mnt_writer *writer = &per_cpu(mnt_writers, cpu);
+ spin_lock_init(&writer->lock);
+ lockdep_set_class(&writer->lock, &writer->lock_class);
+ writer->count = 0;
+ }
+ return 0;
+}
+fs_initcall(init_mnt_writers);
+
+static void unlock_mnt_writers(void)
+{
+ int cpu;
+ struct mnt_writer *cpu_writer;
+
+ for_each_possible_cpu(cpu) {
+ cpu_writer = &per_cpu(mnt_writers, cpu);
+ spin_unlock(&cpu_writer->lock);
+ }
+}
+
+static inline void __clear_mnt_count(struct mnt_writer *cpu_writer)
+{
+ if (!cpu_writer->mnt)
+ return;
+ /*
+ * This is in case anyone ever leaves an invalid,
+ * old ->mnt and a count of 0.
+ */
+ if (!cpu_writer->count)
+ return;
+ atomic_add(cpu_writer->count, &cpu_writer->mnt->__mnt_writers);
+ cpu_writer->count = 0;
+}
+ /*
+ * must hold cpu_writer->lock
+ */
+static inline void use_cpu_writer_for_mount(struct mnt_writer *cpu_writer,
+ struct vfsmount *mnt)
+{
+ if (cpu_writer->mnt == mnt)
+ return;
+ __clear_mnt_count(cpu_writer);
+ cpu_writer->mnt = mnt;
+}
+
 /*
  * Most r/o checks on a fs are for operations that take
  * discrete amounts of time, like a write() or unlink().
@@ -100,12 +190,77 @@ struct vfsmount *alloc_vfsmnt(const char *name)
  */
 int mnt_want_write(struct vfsmount *mnt)
 {
- if (__mnt_is_readonly(mnt))
- return -EROFS;
- return 0;
+ int ret = 0;
+ struct mnt_writer *cpu_writer;
+
+ cpu_writer = &get_cpu_var(mnt_writers);
+ spin_lock(&cpu_writer->lock);
+ if (__mnt_is_readonly(mnt)) {
+ ret = -EROFS;
+ goto out;
+ }
+ use_cpu_writer_for_mount(cpu_writer, mnt);
+ cpu_writer->count++;
+out:
+ spin_unlock(&cpu_writer->lock);
+ put_cpu_var(mnt_writers);
+ return ret;
 }
 EXPORT_SYMBOL_GPL(mnt_want_write);
 
+static void lock_mnt_writers(void)
+{
+ int cpu;
+ struct mnt_writer *cpu_writer;
+
+ for_each_possible_cpu(cpu) {
+ cpu_writer = &per_cpu(mnt_writers, cpu);
+ spin_lock(&cpu_writer->lock);
+ __clear_mnt_count(cpu_writer);
+ cpu_writer->mnt = NULL;
+ }
+}
+
+/*
+ * These per-cpu write counts are not guaranteed to have
+ * matched increments and decrements on any given cpu.
+ * A file open()ed for write on one cpu and close()d on
+ * another cpu will imbalance this count. Make sure it
+ * does not get too far out of whack.
+ */
+static void handle_write_count_underflow(struct vfsmount *mnt)
+{
+ if (atomic_read(&mnt->__mnt_writers) >=
+ MNT_WRITER_UNDERFLOW_LIMIT)
+ return;
+ /*
+ * It isn't necessary to hold all of the locks
+ * at the same time, but doing it this way makes
+ * us share a lot more code.
+ */
+ lock_mnt_writers();
+ /*
+ * vfsmount_lock is for mnt_flags.
+ */
+ spin_lock(&vfsmount_lock);
+ /*
+ * If coalescing the per-cpu writer counts did not
+ * get us back to a positive writer count, we have
+ * a bug.
+ */
+ if ((atomic_read(&mnt->__mnt_writers) < 0) &&
+ !(mnt->mnt_flags & MNT_IMBALANCED_WRITE_COUNT)) {
+ printk(KERN_DEBUG "leak detected on mount(%p) writers "
+ "count: %d\n",
+ mnt, atomic_read(&mnt->__mnt_writers));
+ WARN_ON(1);
+ /* use the flag to keep the dmesg spam down */
+ mnt->mnt_flags |= MNT_IMBALANCED_WRITE_COUNT;
+ }
+ spin_unlock(&vfsmount_lock);
+ unlock_mnt_writers();
+}
+
 /**
  * mnt_drop_write - give up write access to a mount
  * @mnt: the mount on which to give up write access
@@ -116,23 +271,61 @@ EXPORT_SYMBOL_GPL(mnt_want_write);
  */
 void mnt_drop_write(struct vfsmount *mnt)
 {
+ int must_check_underflow = 0;
+ struct mnt_writer *cpu_writer;
+
+ cpu_writer = &get_cpu_var(mnt_writers);
+ spin_lock(&cpu_writer->lock);
+
+ use_cpu_writer_for_mount(cpu_writer, mnt);
+ if (cpu_writer->count > 0) {
+ cpu_writer->count--;
+ } else {
+ must_check_underflow = 1;
+ atomic_dec(&mnt->__mnt_writers);
+ }
+
+ spin_unlock(&cpu_writer->lock);
+ /*
+ * Logically, we could call this each time,
+ * but the __mnt_writers cacheline tends to
+ * be cold, and makes this expensive.
+ */
+ if (must_check_underflow)
+ handle_write_count_underflow(mnt);
+ /*
+ * This could be done right after the spinlock
+ * is taken because the spinlock keeps us on
+ * the cpu, and disables preemption. However,
+ * putting it here bounds the amount that
+ * __mnt_writers can underflow. Without it,
+ * we could theoretically wrap __mnt_writers.
+ */
+ put_cpu_var(mnt_writers);
 }
 EXPORT_SYMBOL_GPL(mnt_drop_write);
 
-/*
- * __mnt_is_readonly: check whether a mount is read-only
- * @mnt: the mount to check for its write status
- *
- * This shouldn't be used directly ouside of the VFS.
- * It does not guarantee that the filesystem will stay
- * r/w, just that it is right *now*. This can not and
- * should not be used in place of IS_RDONLY(inode).
- */
-int __mnt_is_readonly(struct vfsmount *mnt)
+int mnt_make_readonly(struct vfsmount *mnt)
 {
- return (mnt->mnt_sb->s_flags & MS_RDONLY);
+ int ret = 0;
+
+ lock_mnt_writers();
+ /*
+ * With all the locks held, this value is stable
+ */
+ if (atomic_read(&mnt->__mnt_writers) > 0) {
+ ret = -EBUSY;
+ goto out;
+ }
+ /*
+ * actually set mount's r/o flag here to make
+ * __mnt_is_readonly() true, which keeps anyone
+ * from doing a successful mnt_want_write().
+ */
+out:
+ unlock_mnt_writers();
+ return ret;
 }
-EXPORT_SYMBOL_GPL(__mnt_is_readonly);
 
 int simple_set_mnt(struct vfsmount *mnt, struct super_block *sb)
 {
@@ -325,7 +518,36 @@ static struct vfsmount *clone_mnt(struct vfsmount *old, struct dentry *root,
 
 static inline void __mntput(struct vfsmount *mnt)
 {
+ int cpu;
 struct super_block *sb = mnt->mnt_sb;
+ /*
+ * We don't have to hold all of the locks at the
+ * same time here because we know that we're the
+ * last reference to mnt and that no new writers
+ * can come in.
+ */
+ for_each_possible_cpu(cpu) {
+ struct mnt_writer *cpu_writer = &per_cpu(mnt_writers, cpu);
+ if (cpu_writer->mnt != mnt)
+ continue;
+ spin_lock(&cpu_writer->lock);
+ atomic_add(cpu_writer->count, &mnt->__mnt_writers);
+ cpu_writer->count = 0;
+ /*
+ * Might as well do this so that no one
+ * ever sees the pointer and expects
+ * it to be valid.
+ */
+ cpu_writer->mnt = NULL;
+ spin_unlock(&cpu_writer->lock);
+ }
+ /*
+ * This probably indicates that somebody messed
+ * up a mnt_want/drop_write() pair. If this
+ * happens, the filesystem was probably unable
+ * to make r/w->r/o transitions.
+ */
+ WARN_ON(atomic_read(&mnt->__mnt_writers));
 dput(mnt->mnt_root);
 free_vfsmnt(mnt);
 deactivate_super(sb);
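
For context only, and not part of the patch itself: a minimal sketch of how a caller is expected to pair the two helpers the patch introduces. Only mnt_want_write() and mnt_drop_write() come from the patch; the caller do_write_to_mount() and its do_the_write() body are hypothetical placeholders.

static int do_write_to_mount(struct vfsmount *mnt)
{
	int err;

	/* take a write reference; bumps this cpu's per-cpu writer count */
	err = mnt_want_write(mnt);
	if (err)
		return err;	/* -EROFS if the mount is read-only */

	err = do_the_write(mnt);	/* hypothetical write-side work */

	/*
	 * Drop the reference. This may run on a different cpu than
	 * mnt_want_write() did, which is exactly the imbalance the
	 * per-cpu counters and handle_write_count_underflow() tolerate.
	 */
	mnt_drop_write(mnt);
	return err;
}

While any such reference is outstanding, mnt_make_readonly() sees a positive coalesced __mnt_writers count and returns -EBUSY, which is what makes the r/w to r/o transition safe.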