16 年之前 · d3ef3d7351
--- a/fs/namespace.c
+++ b/fs/namespace.c
@@ -131,10 +131,20 @@ struct vfsmount *alloc_vfsmnt(const char *name)
 
															 		INIT_LIST_HEAD(&mnt->mnt_share);
														
 
															 		INIT_LIST_HEAD(&mnt->mnt_slave_list);
														
 
															 		INIT_LIST_HEAD(&mnt->mnt_slave);
														
 
															-		atomic_set(&mnt->__mnt_writers, 0);
														
 
															+#ifdef CONFIG_SMP
														
 
															+		mnt->mnt_writers = alloc_percpu(int);
														
 
															+		if (!mnt->mnt_writers)
														
 
															+			goto out_free_devname;
														
 
															+#else
														
 
															+		mnt->mnt_writers = 0;
														
 
															+#endif
														
 
															 	}
														
 
															 	return mnt;
														
 
															+#ifdef CONFIG_SMP
														
 
															+out_free_devname:
														
 
															+	kfree(mnt->mnt_devname);
														
 
															+#endif
														
 
															 out_free_id:
														
 
															 	mnt_free_id(mnt);
														
 
															 out_free_cache:
														
@@ -171,65 +181,38 @@ int __mnt_is_readonly(struct vfsmount *mnt)
 
															 }
														
 
															 EXPORT_SYMBOL_GPL(__mnt_is_readonly);
														
 
															-struct mnt_writer {
														
 
															-	/*
														
 
															-	 * If holding multiple instances of this lock, they
														
 
															-	 * must be ordered by cpu number.
														
 
															-	 */
														
 
															-	spinlock_t lock;
														
 
															-	struct lock_class_key lock_class; /* compiles out with !lockdep */
														
 
															-	unsigned long count;
														
 
															-	struct vfsmount *mnt;
														
 
															-} ____cacheline_aligned_in_smp;
														
 
															-static DEFINE_PER_CPU(struct mnt_writer, mnt_writers);
														
 
															+static inline void inc_mnt_writers(struct vfsmount *mnt)
														
 
															+{
														
 
															+#ifdef CONFIG_SMP
														
 
															+	(*per_cpu_ptr(mnt->mnt_writers, smp_processor_id()))++;
														
 
															+#else
														
 
															+	mnt->mnt_writers++;
														
 
															+#endif
														
 
															+}
														
 
															-static int __init init_mnt_writers(void)
														
 
															+static inline void dec_mnt_writers(struct vfsmount *mnt)
														
 
															 {
														
 
															-	int cpu;
														
 
															-	for_each_possible_cpu(cpu) {
														
 
															-		struct mnt_writer *writer = &per_cpu(mnt_writers, cpu);
														
 
															-		spin_lock_init(&writer->lock);
														
 
															-		lockdep_set_class(&writer->lock, &writer->lock_class);
														
 
															-		writer->count = 0;
														
 
															-	}
														
 
															-	return 0;
														
 
															+#ifdef CONFIG_SMP
														
 
															+	(*per_cpu_ptr(mnt->mnt_writers, smp_processor_id()))--;
														
 
															+#else
														
 
															+	mnt->mnt_writers--;
														
 
															+#endif
														
 
															 }
														
 
															-fs_initcall(init_mnt_writers);
														
 
															-static void unlock_mnt_writers(void)
														
 
															+static unsigned int count_mnt_writers(struct vfsmount *mnt)
														
 
															 {
														
 
															+#ifdef CONFIG_SMP
														
 
															+	unsigned int count = 0;
														
 
															 	int cpu;
														
 
															-	struct mnt_writer *cpu_writer;
														
 
															 	for_each_possible_cpu(cpu) {
														
 
															-		cpu_writer = &per_cpu(mnt_writers, cpu);
														
 
															-		spin_unlock(&cpu_writer->lock);
														
 
															+		count += *per_cpu_ptr(mnt->mnt_writers, cpu);
														
 
															 	}
														
 
															-}
														
 
															-static inline void __clear_mnt_count(struct mnt_writer *cpu_writer)
														
 
															-{
														
 
															-	if (!cpu_writer->mnt)
														
 
															-		return;
														
 
															-	/*
														
 
															-	 * This is in case anyone ever leaves an invalid,
														
 
															-	 * old ->mnt and a count of 0.
														
 
															-	 */
														
 
															-	if (!cpu_writer->count)
														
 
															-		return;
														
 
															-	atomic_add(cpu_writer->count, &cpu_writer->mnt->__mnt_writers);
														
 
															-	cpu_writer->count = 0;
														
 
															-}
														
 
															- /*
														
 
															- * must hold cpu_writer->lock
														
 
															- */
														
 
															-static inline void use_cpu_writer_for_mount(struct mnt_writer *cpu_writer,
														
 
															-					  struct vfsmount *mnt)
														
 
															-{
														
 
															-	if (cpu_writer->mnt == mnt)
														
 
															-		return;
														
 
															-	__clear_mnt_count(cpu_writer);
														
 
															-	cpu_writer->mnt = mnt;
														
 
															+	return count;
														
 
															+#else
														
 
															+	return mnt->mnt_writers;
														
 
															+#endif
														
 
															 }
														
 
															 /*
														
@@ -253,75 +236,34 @@ static inline void use_cpu_writer_for_mount(struct mnt_writer *cpu_writer,
 
															 int mnt_want_write(struct vfsmount *mnt)
														
 
															 {
														
 
															 	int ret = 0;
														
 
															-	struct mnt_writer *cpu_writer;
														
 
															-	cpu_writer = &get_cpu_var(mnt_writers);
														
 
															-	spin_lock(&cpu_writer->lock);
														
 
															+	preempt_disable();
														
 
															+	inc_mnt_writers(mnt);
														
 
															+	/*
														
 
															+	 * The store in inc_mnt_writers() must be visible before we pass the
														
 
															+	 * MNT_WRITE_HOLD loop below, so that the slowpath can see our
														
 
															+	 * incremented count after it has set MNT_WRITE_HOLD.
														
 
															+	 */
														
 
															+	smp_mb();
														
 
															+	while (mnt->mnt_flags & MNT_WRITE_HOLD)
														
 
															+		cpu_relax();
														
 
															+	/*
														
 
															+	 * After the slowpath clears MNT_WRITE_HOLD, mnt_is_readonly will
														
 
															+	 * be set to match its requirements. So we must not load that until
														
 
															+	 * MNT_WRITE_HOLD is cleared.
														
 
															+	 */
														
 
															+	smp_rmb();
														
 
															 	if (__mnt_is_readonly(mnt)) {
														
 
															+		dec_mnt_writers(mnt);
														
 
															 		ret = -EROFS;
														
 
															 		goto out;
														
 
															 	}
														
 
															-	use_cpu_writer_for_mount(cpu_writer, mnt);
														
 
															-	cpu_writer->count++;
														
 
															 out:
														
 
															-	spin_unlock(&cpu_writer->lock);
														
 
															-	put_cpu_var(mnt_writers);
														
 
															+	preempt_enable();
														
 
															 	return ret;
														
 
															 }
														
 
															 EXPORT_SYMBOL_GPL(mnt_want_write);
														
 
															-static void lock_mnt_writers(void)
														
 
															-{
														
 
															-	int cpu;
														
 
															-	struct mnt_writer *cpu_writer;
														
 
															-
														
 
															-	for_each_possible_cpu(cpu) {
														
 
															-		cpu_writer = &per_cpu(mnt_writers, cpu);
														
 
															-		spin_lock(&cpu_writer->lock);
														
 
															-		__clear_mnt_count(cpu_writer);
														
 
															-		cpu_writer->mnt = NULL;
														
 
															-	}
														
 
															-}
														
 
															-
														
 
															-/*
														
 
															- * These per-cpu write counts are not guaranteed to have
														
 
															- * matched increments and decrements on any given cpu.
														
 
															- * A file open()ed for write on one cpu and close()d on
														
 
															- * another cpu will imbalance this count.  Make sure it
														
 
															- * does not get too far out of whack.
														
 
															- */
														
 
															-static void handle_write_count_underflow(struct vfsmount *mnt)
														
 
															-{
														
 
															-	if (atomic_read(&mnt->__mnt_writers) >=
														
 
															-	    MNT_WRITER_UNDERFLOW_LIMIT)
														
 
															-		return;
														
 
															-	/*
														
 
															-	 * It isn't necessary to hold all of the locks
														
 
															-	 * at the same time, but doing it this way makes
														
 
															-	 * us share a lot more code.
														
 
															-	 */
														
 
															-	lock_mnt_writers();
														
 
															-	/*
														
 
															-	 * vfsmount_lock is for mnt_flags.
														
 
															-	 */
														
 
															-	spin_lock(&vfsmount_lock);
														
 
															-	/*
														
 
															-	 * If coalescing the per-cpu writer counts did not
														
 
															-	 * get us back to a positive writer count, we have
														
 
															-	 * a bug.
														
 
															-	 */
														
 
															-	if ((atomic_read(&mnt->__mnt_writers) < 0) &&
														
 
															-	    !(mnt->mnt_flags & MNT_IMBALANCED_WRITE_COUNT)) {
														
 
															-		WARN(1, KERN_DEBUG "leak detected on mount(%p) writers "
														
 
															-				"count: %d\n",
														
 
															-			mnt, atomic_read(&mnt->__mnt_writers));
														
 
															-		/* use the flag to keep the dmesg spam down */
														
 
															-		mnt->mnt_flags |= MNT_IMBALANCED_WRITE_COUNT;
														
 
															-	}
														
 
															-	spin_unlock(&vfsmount_lock);
														
 
															-	unlock_mnt_writers();
														
 
															-}
														
 
															-
														
 
															 /**
														
 
															  * mnt_drop_write - give up write access to a mount
														
 
															  * @mnt: the mount on which to give up write access
														
@@ -332,37 +274,9 @@ static void handle_write_count_underflow(struct vfsmount *mnt)
 
															  */
														
 
															 void mnt_drop_write(struct vfsmount *mnt)
														
 
															 {
														
 
															-	int must_check_underflow = 0;
														
 
															-	struct mnt_writer *cpu_writer;
														
 
															-
														
 
															-	cpu_writer = &get_cpu_var(mnt_writers);
														
 
															-	spin_lock(&cpu_writer->lock);
														
 
															-
														
 
															-	use_cpu_writer_for_mount(cpu_writer, mnt);
														
 
															-	if (cpu_writer->count > 0) {
														
 
															-		cpu_writer->count--;
														
 
															-	} else {
														
 
															-		must_check_underflow = 1;
														
 
															-		atomic_dec(&mnt->__mnt_writers);
														
 
															-	}
														
 
															-
														
 
															-	spin_unlock(&cpu_writer->lock);
														
 
															-	/*
														
 
															-	 * Logically, we could call this each time,
														
 
															-	 * but the __mnt_writers cacheline tends to
														
 
															-	 * be cold, and makes this expensive.
														
 
															-	 */
														
 
															-	if (must_check_underflow)
														
 
															-		handle_write_count_underflow(mnt);
														
 
															-	/*
														
 
															-	 * This could be done right after the spinlock
														
 
															-	 * is taken because the spinlock keeps us on
														
 
															-	 * the cpu, and disables preemption.  However,
														
 
															-	 * putting it here bounds the amount that
														
 
															-	 * __mnt_writers can underflow.  Without it,
														
 
															-	 * we could theoretically wrap __mnt_writers.
														
 
															-	 */
														
 
															-	put_cpu_var(mnt_writers);
														
 
															+	preempt_disable();
														
 
															+	dec_mnt_writers(mnt);
														
 
															+	preempt_enable();
														
 
															 }
														
 
															 EXPORT_SYMBOL_GPL(mnt_drop_write);
														
@@ -370,24 +284,41 @@ static int mnt_make_readonly(struct vfsmount *mnt)
 
															 {
														
 
															 	int ret = 0;
														
 
															-	lock_mnt_writers();
														
 
															+	spin_lock(&vfsmount_lock);
														
 
															+	mnt->mnt_flags |= MNT_WRITE_HOLD;
														
 
															 	/*
														
 
															-	 * With all the locks held, this value is stable
														
 
															+	 * After storing MNT_WRITE_HOLD, we'll read the counters. This store
														
 
															+	 * should be visible before we do.
														
 
															 	 */
														
 
															-	if (atomic_read(&mnt->__mnt_writers) > 0) {
														
 
															-		ret = -EBUSY;
														
 
															-		goto out;
														
 
															-	}
														
 
															+	smp_mb();
														
 
															+
														
 
															 	/*
														
 
															-	 * nobody can do a successful mnt_want_write() with all
														
 
															-	 * of the counts in MNT_DENIED_WRITE and the locks held.
														
 
															+	 * With writers on hold, if this value is zero, then there are
														
 
															+	 * definitely no active writers (although held writers may subsequently
														
 
															+	 * increment the count, they'll have to wait, and decrement it after
														
 
															+	 * seeing MNT_READONLY).
														
 
															+	 *
														
 
															+	 * It is OK to have a counter incremented on one CPU and decremented on
														
 
															+	 * another: the sum will add up correctly. The danger would be when we
														
 
															+	 * sum up each counter, if we read a counter before it is incremented,
														
 
															+	 * but then read another CPU's counter after the matching decrement has
														
 
															+	 * already landed on it -- we would see more decrements than we should.
														
 
															+	 * MNT_WRITE_HOLD protects against this scenario, because
														
 
															+	 * mnt_want_write first increments count, then smp_mb, then spins on
														
 
															+	 * MNT_WRITE_HOLD, so it can't be decremented by another CPU while
														
 
															+	 * we're counting up here.
														
 
															 	 */
														
 
															-	spin_lock(&vfsmount_lock);
														
 
															-	if (!ret)
														
 
															+	if (count_mnt_writers(mnt) > 0)
														
 
															+		ret = -EBUSY;
														
 
															+	else
														
 
															 		mnt->mnt_flags |= MNT_READONLY;
														
 
															+	/*
														
 
															+	 * MNT_READONLY must become visible before ~MNT_WRITE_HOLD, so writers
														
 
															+	 * that become unheld will see MNT_READONLY.
														
 
															+	 */
														
 
															+	smp_wmb();
														
 
															+	mnt->mnt_flags &= ~MNT_WRITE_HOLD;
														
 
															 	spin_unlock(&vfsmount_lock);
														
 
															-out:
														
 
															-	unlock_mnt_writers();
														
 
															 	return ret;
														
 
															 }
														
@@ -410,6 +341,9 @@ void free_vfsmnt(struct vfsmount *mnt)
 
															 {
														
 
															 	kfree(mnt->mnt_devname);
														
 
															 	mnt_free_id(mnt);
														
 
															+#ifdef CONFIG_SMP
														
 
															+	free_percpu(mnt->mnt_writers);
														
 
															+#endif
														
 
															 	kmem_cache_free(mnt_cache, mnt);
														
 
															 }
														
@@ -604,38 +538,18 @@ static struct vfsmount *clone_mnt(struct vfsmount *old, struct dentry *root,
 
															 static inline void __mntput(struct vfsmount *mnt)
														
 
															 {
														
 
															-	int cpu;
														
 
															 	struct super_block *sb = mnt->mnt_sb;
														
 
															-	/*
														
 
															-	 * We don't have to hold all of the locks at the
														
 
															-	 * same time here because we know that we're the
														
 
															-	 * last reference to mnt and that no new writers
														
 
															-	 * can come in.
														
 
															-	 */
														
 
															-	for_each_possible_cpu(cpu) {
														
 
															-		struct mnt_writer *cpu_writer = &per_cpu(mnt_writers, cpu);
														
 
															-		spin_lock(&cpu_writer->lock);
														
 
															-		if (cpu_writer->mnt != mnt) {
														
 
															-			spin_unlock(&cpu_writer->lock);
														
 
															-			continue;
														
 
															-		}
														
 
															-		atomic_add(cpu_writer->count, &mnt->__mnt_writers);
														
 
															-		cpu_writer->count = 0;
														
 
															-		/*
														
 
															-		 * Might as well do this so that no one
														
 
															-		 * ever sees the pointer and expects
														
 
															-		 * it to be valid.
														
 
															-		 */
														
 
															-		cpu_writer->mnt = NULL;
														
 
															-		spin_unlock(&cpu_writer->lock);
														
 
															-	}
														
 
															 	/*
														
 
															 	 * This probably indicates that somebody messed
														
 
															 	 * up a mnt_want/drop_write() pair.  If this
														
 
															 	 * happens, the filesystem was probably unable
														
 
															 	 * to make r/w->r/o transitions.
														
 
															 	 */
														
 
															-	WARN_ON(atomic_read(&mnt->__mnt_writers));
														
 
															+	/*
														
 
															+	 * atomic_dec_and_lock() used to deal with ->mnt_count decrements
														
 
															+	 * provides barriers, so count_mnt_writers() below is safe.  AV
														
 
															+	 */
														
 
															+	WARN_ON(count_mnt_writers(mnt));
														
 
															 	dput(mnt->mnt_root);
														
 
															 	free_vfsmnt(mnt);
														
 
															 	deactivate_super(sb);
														
--- a/include/linux/mount.h
+++ b/include/linux/mount.h
@@ -30,7 +30,7 @@ struct mnt_namespace;
 
															 #define MNT_STRICTATIME 0x80
														
 
															 #define MNT_SHRINKABLE	0x100
														
 
															-#define MNT_IMBALANCED_WRITE_COUNT	0x200 /* just for debugging */
														
 
															+#define MNT_WRITE_HOLD	0x200
														
 
															 #define MNT_SHARED	0x1000	/* if the vfsmount is a shared mount */
														
 
															 #define MNT_UNBINDABLE	0x2000	/* if the vfsmount is a unbindable mount */
														
@@ -65,13 +65,22 @@ struct vfsmount {
 
															 	int mnt_expiry_mark;		/* true if marked for expiry */
														
 
															 	int mnt_pinned;
														
 
															 	int mnt_ghosts;
														
 
															-	/*
														
 
															-	 * This value is not stable unless all of the mnt_writers[] spinlocks
														
 
															-	 * are held, and all mnt_writer[]s on this mount have 0 as their ->count
														
 
															-	 */
														
 
															-	atomic_t __mnt_writers;
														
 
															+#ifdef CONFIG_SMP
														
 
															+	int *mnt_writers;
														
 
															+#else
														
 
															+	int mnt_writers;
														
 
															+#endif
														
 
															 };
														
 
															+static inline int *get_mnt_writers_ptr(struct vfsmount *mnt)
														
 
															+{
														
 
															+#ifdef CONFIG_SMP
														
 
															+	return mnt->mnt_writers;
														
 
															+#else
														
 
															+	return &mnt->mnt_writers;
														
 
															+#endif
														
 
															+}
														
 
															+
														
 
															 static inline struct vfsmount *mntget(struct vfsmount *mnt)
														
 
															 {
														
 
															 	if (mnt)