12 years ago · 5a53748568
--- a/fs/fuse/inode.c
+++ b/fs/fuse/inode.c
@@ -930,7 +930,7 @@ static int fuse_bdi_init(struct fuse_conn *fc, struct super_block *sb)
 
				 	fc->bdi.name = "fuse";
			
 
				 	fc->bdi.ra_pages = (VM_MAX_READAHEAD * 1024) / PAGE_CACHE_SIZE;
			
 
				 	/* fuse does it's own writeback accounting */
			
 
				-	fc->bdi.capabilities = BDI_CAP_NO_ACCT_WB;
			
 
				+	fc->bdi.capabilities = BDI_CAP_NO_ACCT_WB | BDI_CAP_STRICTLIMIT;
			
 
				 
			
 
				 	err = bdi_init(&fc->bdi);
			
 
				 	if (err)
			
--- a/include/linux/backing-dev.h
+++ b/include/linux/backing-dev.h
@@ -243,6 +243,8 @@ int bdi_set_max_ratio(struct backing_dev_info *bdi, unsigned int max_ratio);
 
				  * BDI_CAP_EXEC_MAP:       Can be mapped for execution
			
 
				  *
			
 
				  * BDI_CAP_SWAP_BACKED:    Count shmem/tmpfs objects as swap-backed.
			
 
				+ *
			
 
				+ * BDI_CAP_STRICTLIMIT:    Keep number of dirty pages below bdi threshold.
			
 
				  */
			
 
				 #define BDI_CAP_NO_ACCT_DIRTY	0x00000001
			
 
				 #define BDI_CAP_NO_WRITEBACK	0x00000002
			
@@ -254,6 +256,7 @@ int bdi_set_max_ratio(struct backing_dev_info *bdi, unsigned int max_ratio);
 
				 #define BDI_CAP_NO_ACCT_WB	0x00000080
			
 
				 #define BDI_CAP_SWAP_BACKED	0x00000100
			
 
				 #define BDI_CAP_STABLE_WRITES	0x00000200
			
 
				+#define BDI_CAP_STRICTLIMIT	0x00000400
			
 
				 
			
 
				 #define BDI_CAP_VMFLAGS \
			
 
				 	(BDI_CAP_READ_MAP | BDI_CAP_WRITE_MAP | BDI_CAP_EXEC_MAP)
			
--- a/mm/page-writeback.c
+++ b/mm/page-writeback.c
@@ -584,6 +584,37 @@ unsigned long bdi_dirty_limit(struct backing_dev_info *bdi, unsigned long dirty)
 
				 	return bdi_dirty;
			
 
				 }
			
 
				 
			
 
				+/*
			
 
				+ *                           setpoint - dirty 3
			
 
				+ *        f(dirty) := 1.0 + (----------------)
			
 
				+ *                           limit - setpoint
			
 
				+ *
			
 
				+ * Returned in RATELIMIT_CALC_SHIFT fixed point, clamped to [0, 2.0].
			
 
				+ * It's a 3rd order polynomial that is subject to
			
 
				+ * (1) f(freerun)  = 2.0 => rampup dirty_ratelimit reasonably fast
			
 
				+ * (2) f(setpoint) = 1.0 => the balance point
			
 
				+ * (3) f(limit)    = 0   => the hard limit
			
 
				+ * (4) df/dx      <= 0	 => negative feedback control
			
 
				+ * (5) the closer to setpoint, the smaller |df/dx| (and the reverse)
			
 
				+ *     => fast response on large errors; small oscillation near setpoint
			
 
				+ */
			
 
				+static inline long long pos_ratio_polynom(unsigned long setpoint,
			
 
				+					  unsigned long dirty,
			
 
				+					  unsigned long limit)
			
 
				+{
			
 
				+	long long pos_ratio;
			
 
				+	long x;
			
 
				+
			
 
				+	x = div64_s64(((s64)setpoint - (s64)dirty) << RATELIMIT_CALC_SHIFT,
			
 
				+		      (limit - setpoint) | 1);	/* 64-bit, never 0 */
			
 
				+	pos_ratio = x;
			
 
				+	pos_ratio = pos_ratio * x >> RATELIMIT_CALC_SHIFT;
			
 
				+	pos_ratio = pos_ratio * x >> RATELIMIT_CALC_SHIFT;
			
 
				+	pos_ratio += 1 << RATELIMIT_CALC_SHIFT;
			
 
				+
			
 
				+	return clamp(pos_ratio, 0LL, 2LL << RATELIMIT_CALC_SHIFT);
			
 
				+}
			
 
				+
			
 
				 /*
			
 
				  * Dirty position control.
			
 
				  *
			
@@ -682,26 +713,80 @@ static unsigned long bdi_position_ratio(struct backing_dev_info *bdi,
 
				 	/*
			
 
				 	 * global setpoint
			
 
				 	 *
			
 
				-	 *                           setpoint - dirty 3
			
 
				-	 *        f(dirty) := 1.0 + (----------------)
			
 
				-	 *                           limit - setpoint
			
 
				+	 * See comment for pos_ratio_polynom().
			
 
				+	 */
			
 
				+	setpoint = (freerun + limit) / 2;
			
 
				+	pos_ratio = pos_ratio_polynom(setpoint, dirty, limit);
			
 
				+
			
 
				+	/*
			
 
				+	 * The strictlimit feature is a tool preventing mistrusted filesystems
			
 
				+	 * from growing a large number of dirty pages before throttling. For
			
 
				+	 * such filesystems balance_dirty_pages always checks bdi counters
			
 
				+	 * against bdi limits. Even if global "nr_dirty" is under "freerun".
			
 
				+	 * This is especially important for fuse which sets bdi->max_ratio to
			
 
				+	 * 1% by default. Without strictlimit feature, fuse writeback may
			
 
				+	 * consume arbitrary amount of RAM because it is accounted in
			
 
				+	 * NR_WRITEBACK_TEMP which is not involved in calculating "nr_dirty".
			
 
				 	 *
			
 
				-	 * it's a 3rd order polynomial that subjects to
			
 
				+	 * Here, in bdi_position_ratio(), we calculate pos_ratio based on
			
 
				+	 * two values: bdi_dirty and bdi_thresh. Let's consider an example:
			
 
				+	 * total amount of RAM is 16GB, bdi->max_ratio is equal to 1%, global
			
 
				+	 * limits are set by default to 10% and 20% (background and throttle).
			
 
				+	 * Then bdi_thresh is 1% of 20% of 16GB. This amounts to ~8K pages.
			
 
				+	 * bdi_dirty_limit(bdi, bg_thresh) is about ~4K pages. bdi_setpoint is
			
 
				+	 * about ~6K pages (as the average of background and throttle bdi
			
 
				+	 * limits). The 3rd order polynomial will provide positive feedback if
			
 
				+	 * bdi_dirty is under bdi_setpoint and vice versa.
			
 
				 	 *
			
 
				-	 * (1) f(freerun)  = 2.0 => rampup dirty_ratelimit reasonably fast
			
 
				-	 * (2) f(setpoint) = 1.0 => the balance point
			
 
				-	 * (3) f(limit)    = 0   => the hard limit
			
 
				-	 * (4) df/dx      <= 0	 => negative feedback control
			
 
				-	 * (5) the closer to setpoint, the smaller |df/dx| (and the reverse)
			
 
				-	 *     => fast response on large errors; small oscillation near setpoint
			
 
				+	 * Note that we cannot use global counters in these calculations
			
 
				+	 * because we want to throttle process writing to a strictlimit BDI
			
 
				+	 * much earlier than global "freerun" is reached (~23MB vs. ~2.3GB
			
 
				+	 * in the example above).
			
 
				 	 */
			
 
				-	setpoint = (freerun + limit) / 2;
			
 
				-	x = div_s64(((s64)setpoint - (s64)dirty) << RATELIMIT_CALC_SHIFT,
			
 
				-		    limit - setpoint + 1);
			
 
				-	pos_ratio = x;
			
 
				-	pos_ratio = pos_ratio * x >> RATELIMIT_CALC_SHIFT;
			
 
				-	pos_ratio = pos_ratio * x >> RATELIMIT_CALC_SHIFT;
			
 
				-	pos_ratio += 1 << RATELIMIT_CALC_SHIFT;
			
 
				+	if (unlikely(bdi->capabilities & BDI_CAP_STRICTLIMIT)) {
			
 
				+		long long bdi_pos_ratio;
			
 
				+		unsigned long bdi_bg_thresh;
			
 
				+
			
 
				+		if (bdi_dirty < 8)
			
 
				+			return min_t(long long, pos_ratio * 2,
			
 
				+				     2 << RATELIMIT_CALC_SHIFT);
			
 
				+
			
 
				+		if (bdi_dirty >= bdi_thresh)
			
 
				+			return 0;
			
 
				+
			
 
				+		bdi_bg_thresh = div_u64((u64)bdi_thresh * bg_thresh, thresh);
			
 
				+		bdi_setpoint = dirty_freerun_ceiling(bdi_thresh,
			
 
				+						     bdi_bg_thresh);
			
 
				+
			
 
				+		if (bdi_setpoint == 0 || bdi_setpoint == bdi_thresh)
			
 
				+			return 0;
			
 
				+
			
 
				+		bdi_pos_ratio = pos_ratio_polynom(bdi_setpoint, bdi_dirty,
			
 
				+						  bdi_thresh);
			
 
				+
			
 
				+		/*
			
 
				+		 * Typically, for strictlimit case, bdi_setpoint << setpoint
			
 
				+		 * and pos_ratio >> bdi_pos_ratio. In other words, the global
			
 
				+		 * state ("dirty") is not the limiting factor and we have to
			
 
				+		 * make decision based on bdi counters. But there is an
			
 
				+		 * important case when global pos_ratio should get precedence:
			
 
				+		 * global limits are exceeded (e.g. due to activities on other
			
 
				+		 * BDIs) while given strictlimit BDI is below limit.
			
 
				+		 *
			
 
				+		 * "pos_ratio * bdi_pos_ratio" would work for the case above,
			
 
				+		 * but it would look too non-natural for the case of all
			
 
				+		 * activity in the system coming from a single strictlimit BDI
			
 
				+		 * with bdi->max_ratio == 100%.
			
 
				+		 *
			
 
				+		 * Note that min() below somewhat changes the dynamics of the
			
 
				+		 * control system. Normally, pos_ratio value can be well over 3
			
 
				+		 * (when globally we are at freerun and bdi is well below bdi
			
 
				+		 * setpoint). Now the maximum pos_ratio in the same situation
			
 
				+		 * is 2. We might want to tweak this if we observe the control
			
 
				+		 * system is too slow to adapt.
			
 
				+		 */
			
 
				+		return min(pos_ratio, bdi_pos_ratio);
			
 
				+	}
			
 
				 
			
 
				 	/*
			
 
				 	 * We have computed basic pos_ratio above based on global situation. If
			
@@ -994,6 +1079,27 @@ static void bdi_update_dirty_ratelimit(struct backing_dev_info *bdi,
 
				 	 * keep that period small to reduce time lags).
			
 
				 	 */
			
 
				 	step = 0;
			
 
				+
			
 
				+	/*
			
 
				+	 * For strictlimit case, calculations above were based on bdi counters
			
 
				+	 * and limits (starting from pos_ratio = bdi_position_ratio() and up to
			
 
				+	 * balanced_dirty_ratelimit = task_ratelimit * write_bw / dirty_rate).
			
 
				+	 * Hence, to calculate "step" properly, we have to use bdi_dirty as
			
 
				+	 * "dirty" and bdi_setpoint as "setpoint".
			
 
				+	 *
			
 
				+	 * We rampup dirty_ratelimit forcibly if bdi_dirty is low because
			
 
				+	 * it's possible that bdi_thresh is close to zero due to inactivity
			
 
				+	 * of backing device (see the implementation of bdi_dirty_limit()).
			
 
				+	 */
			
 
				+	if (unlikely(bdi->capabilities & BDI_CAP_STRICTLIMIT)) {
			
 
				+		dirty = bdi_dirty;
			
 
				+		if (bdi_dirty < 8)
			
 
				+			setpoint = bdi_dirty + 1;
			
 
				+		else
			
 
				+			setpoint = (bdi_thresh +
			
 
				+				    bdi_dirty_limit(bdi, bg_thresh)) / 2;
			
 
				+	}
			
 
				+
			
 
				 	if (dirty < setpoint) {
			
 
				 		x = min(bdi->balanced_dirty_ratelimit,
			
 
				 			 min(balanced_dirty_ratelimit, task_ratelimit));
			
@@ -1198,6 +1304,56 @@ static long bdi_min_pause(struct backing_dev_info *bdi,
 
				 	return pages >= DIRTY_POLL_THRESH ? 1 + t / 2 : t;
			
 
				 }
			
 
				 
			
 
				+/* Compute *bdi_thresh, *bdi_dirty and, if non-NULL, *bdi_bg_thresh for bdi. */
			
 
				+static inline void bdi_dirty_limits(struct backing_dev_info *bdi,
			
 
				+				    unsigned long dirty_thresh,
			
 
				+				    unsigned long background_thresh,
			
 
				+				    unsigned long *bdi_dirty,
			
 
				+				    unsigned long *bdi_thresh,
			
 
				+				    unsigned long *bdi_bg_thresh)
			
 
				+{
			
 
				+	unsigned long bdi_reclaimable;
			
 
				+
			
 
				+	/*
			
 
				+	 * bdi_thresh is not treated as some limiting factor as
			
 
				+	 * dirty_thresh, due to reasons
			
 
				+	 * - in JBOD setup, bdi_thresh can fluctuate a lot
			
 
				+	 * - in a system with HDD and USB key, the USB key may somehow
			
 
				+	 *   go into state (bdi_dirty >> bdi_thresh) either because
			
 
				+	 *   bdi_dirty starts high, or because bdi_thresh drops low.
			
 
				+	 *   In this case we don't want to hard throttle the USB key
			
 
				+	 *   dirtiers for 100 seconds until bdi_dirty drops under
			
 
				+	 *   bdi_thresh. Instead the auxiliary bdi control line in
			
 
				+	 *   bdi_position_ratio() will let the dirtier task progress
			
 
				+	 *   at some rate <= (write_bw / 2) for bringing down bdi_dirty.
			
 
				+	 */
			
 
				+	*bdi_thresh = bdi_dirty_limit(bdi, dirty_thresh);
			
 
				+
			
 
				+	/* A zero dirty_thresh must not be used as a divisor. */
			
 
				+	if (bdi_bg_thresh)
			
 
				+		*bdi_bg_thresh = dirty_thresh ? div_u64((u64)*bdi_thresh *
			
 
				+							background_thresh,
			
 
				+							dirty_thresh) : 0;
			
 
				+
			
 
				+	/*
			
 
				+	 * In order to avoid the stacked BDI deadlock we need to ensure we
			
 
				+	 * accurately count the 'dirty' pages when the threshold is low.
			
 
				+	 *
			
 
				+	 * Otherwise it would be possible to get thresh+n pages reported
			
 
				+	 * dirty, even though there are thresh-m pages actually dirty; with
			
 
				+	 * m+n sitting in the percpu deltas.
			
 
				+	 */
			
 
				+	if (*bdi_thresh < 2 * bdi_stat_error(bdi)) {
			
 
				+		bdi_reclaimable = bdi_stat_sum(bdi, BDI_RECLAIMABLE);
			
 
				+		*bdi_dirty = bdi_reclaimable +
			
 
				+			bdi_stat_sum(bdi, BDI_WRITEBACK);
			
 
				+	} else {
			
 
				+		bdi_reclaimable = bdi_stat(bdi, BDI_RECLAIMABLE);
			
 
				+		*bdi_dirty = bdi_reclaimable +
			
 
				+			bdi_stat(bdi, BDI_WRITEBACK);
			
 
				+	}
			
 
				+}
			
 
				+
			
 
				 /*
			
 
				  * balance_dirty_pages() must be called by processes which are generating dirty
			
 
				  * data.  It looks at the number of dirty pages in the machine and will force
			
@@ -1209,13 +1365,9 @@ static void balance_dirty_pages(struct address_space *mapping,
 
				 				unsigned long pages_dirtied)
			
 
				 {
			
 
				 	unsigned long nr_reclaimable;	/* = file_dirty + unstable_nfs */
			
 
				-	unsigned long bdi_reclaimable;
			
 
				 	unsigned long nr_dirty;  /* = file_dirty + writeback + unstable_nfs */
			
 
				-	unsigned long bdi_dirty;
			
 
				-	unsigned long freerun;
			
 
				 	unsigned long background_thresh;
			
 
				 	unsigned long dirty_thresh;
			
 
				-	unsigned long bdi_thresh;
			
 
				 	long period;
			
 
				 	long pause;
			
 
				 	long max_pause;
			
@@ -1226,10 +1378,16 @@ static void balance_dirty_pages(struct address_space *mapping,
 
				 	unsigned long dirty_ratelimit;
			
 
				 	unsigned long pos_ratio;
			
 
				 	struct backing_dev_info *bdi = mapping->backing_dev_info;
			
 
				+	bool strictlimit = bdi->capabilities & BDI_CAP_STRICTLIMIT;
			
 
				 	unsigned long start_time = jiffies;
			
 
				 
			
 
				 	for (;;) {
			
 
				 		unsigned long now = jiffies;
			
 
				+		unsigned long uninitialized_var(bdi_thresh);
			
 
				+		unsigned long thresh;
			
 
				+		unsigned long uninitialized_var(bdi_dirty);
			
 
				+		unsigned long dirty;
			
 
				+		unsigned long bg_thresh;
			
 
				 
			
 
				 		/*
			
 
				 		 * Unstable writes are a feature of certain networked
			
@@ -1243,61 +1401,44 @@ static void balance_dirty_pages(struct address_space *mapping,
 
				 
			
 
				 		global_dirty_limits(&background_thresh, &dirty_thresh);
			
 
				 
			
 
				+		if (unlikely(strictlimit)) {
			
 
				+			bdi_dirty_limits(bdi, dirty_thresh, background_thresh,
			
 
				+					 &bdi_dirty, &bdi_thresh, &bg_thresh);
			
 
				+
			
 
				+			dirty = bdi_dirty;
			
 
				+			thresh = bdi_thresh;
			
 
				+		} else {
			
 
				+			dirty = nr_dirty;
			
 
				+			thresh = dirty_thresh;
			
 
				+			bg_thresh = background_thresh;
			
 
				+		}
			
 
				+
			
 
				 		/*
			
 
				 		 * Throttle it only when the background writeback cannot
			
 
				 		 * catch-up. This avoids (excessively) small writeouts
			
 
				-		 * when the bdi limits are ramping up.
			
 
				+		 * when the bdi limits are ramping up in case of !strictlimit.
			
 
				+		 *
			
 
				+		 * In strictlimit case make decision based on the bdi counters
			
 
				+		 * and limits. Small writeouts when the bdi limits are ramping
			
 
				+		 * up are the price we consciously pay for strictlimit-ing.
			
 
				 		 */
			
 
				-		freerun = dirty_freerun_ceiling(dirty_thresh,
			
 
				-						background_thresh);
			
 
				-		if (nr_dirty <= freerun) {
			
 
				+		if (dirty <= dirty_freerun_ceiling(thresh, bg_thresh)) {
			
 
				 			current->dirty_paused_when = now;
			
 
				 			current->nr_dirtied = 0;
			
 
				 			current->nr_dirtied_pause =
			
 
				-				dirty_poll_interval(nr_dirty, dirty_thresh);
			
 
				+				dirty_poll_interval(dirty, thresh);
			
 
				 			break;
			
 
				 		}
			
 
				 
			
 
				 		if (unlikely(!writeback_in_progress(bdi)))
			
 
				 			bdi_start_background_writeback(bdi);
			
 
				 
			
 
				-		/*
			
 
				-		 * bdi_thresh is not treated as some limiting factor as
			
 
				-		 * dirty_thresh, due to reasons
			
 
				-		 * - in JBOD setup, bdi_thresh can fluctuate a lot
			
 
				-		 * - in a system with HDD and USB key, the USB key may somehow
			
 
				-		 *   go into state (bdi_dirty >> bdi_thresh) either because
			
 
				-		 *   bdi_dirty starts high, or because bdi_thresh drops low.
			
 
				-		 *   In this case we don't want to hard throttle the USB key
			
 
				-		 *   dirtiers for 100 seconds until bdi_dirty drops under
			
 
				-		 *   bdi_thresh. Instead the auxiliary bdi control line in
			
 
				-		 *   bdi_position_ratio() will let the dirtier task progress
			
 
				-		 *   at some rate <= (write_bw / 2) for bringing down bdi_dirty.
			
 
				-		 */
			
 
				-		bdi_thresh = bdi_dirty_limit(bdi, dirty_thresh);
			
 
				-
			
 
				-		/*
			
 
				-		 * In order to avoid the stacked BDI deadlock we need
			
 
				-		 * to ensure we accurately count the 'dirty' pages when
			
 
				-		 * the threshold is low.
			
 
				-		 *
			
 
				-		 * Otherwise it would be possible to get thresh+n pages
			
 
				-		 * reported dirty, even though there are thresh-m pages
			
 
				-		 * actually dirty; with m+n sitting in the percpu
			
 
				-		 * deltas.
			
 
				-		 */
			
 
				-		if (bdi_thresh < 2 * bdi_stat_error(bdi)) {
			
 
				-			bdi_reclaimable = bdi_stat_sum(bdi, BDI_RECLAIMABLE);
			
 
				-			bdi_dirty = bdi_reclaimable +
			
 
				-				    bdi_stat_sum(bdi, BDI_WRITEBACK);
			
 
				-		} else {
			
 
				-			bdi_reclaimable = bdi_stat(bdi, BDI_RECLAIMABLE);
			
 
				-			bdi_dirty = bdi_reclaimable +
			
 
				-				    bdi_stat(bdi, BDI_WRITEBACK);
			
 
				-		}
			
 
				+		if (!strictlimit)
			
 
				+			bdi_dirty_limits(bdi, dirty_thresh, background_thresh,
			
 
				+					 &bdi_dirty, &bdi_thresh, NULL);
			
 
				 
			
 
				 		dirty_exceeded = (bdi_dirty > bdi_thresh) &&
			
 
				-				  (nr_dirty > dirty_thresh);
			
 
				+				 ((nr_dirty > dirty_thresh) || strictlimit);
			
 
				 		if (dirty_exceeded && !bdi->dirty_exceeded)
			
 
				 			bdi->dirty_exceeded = 1;