|
@@ -34,6 +34,7 @@
|
|
#include <linux/syscalls.h>
|
|
#include <linux/syscalls.h>
|
|
#include <linux/buffer_head.h> /* __set_page_dirty_buffers */
|
|
#include <linux/buffer_head.h> /* __set_page_dirty_buffers */
|
|
#include <linux/pagevec.h>
|
|
#include <linux/pagevec.h>
|
|
|
|
+#include <linux/timer.h>
|
|
#include <trace/events/writeback.h>
|
|
#include <trace/events/writeback.h>
|
|
|
|
|
|
/*
|
|
/*
|
|
@@ -135,7 +136,20 @@ unsigned long global_dirty_limit;
|
|
* measured in page writeback completions.
|
|
* measured in page writeback completions.
|
|
*
|
|
*
|
|
*/
|
|
*/
|
|
-static struct prop_descriptor vm_completions;
|
|
|
|
|
|
+static struct fprop_global writeout_completions;
|
|
|
|
+
|
|
|
|
+static void writeout_period(unsigned long t);
|
|
|
|
+/* Deferred timer whose handler (writeout_period) ages writeout_completions */
|
|
|
|
+static struct timer_list writeout_period_timer =
|
|
|
|
+		TIMER_DEFERRED_INITIALIZER(writeout_period, 0, 0);
|
|
|
|
+static unsigned long writeout_period_time;	/* 0: aging timer not armed */
|
|
|
|
+
|
|
|
|
+/*
|
|
|
|
+ * Length of period for aging writeout fractions of bdis. This is an
|
|
|
|
+ * arbitrarily chosen number. The longer the period, the slower fractions will
|
|
|
|
+ * reflect changes in current writeout rate.
|
|
|
|
+ */
|
|
|
|
+#define VM_COMPLETIONS_PERIOD_LEN (3*HZ)
|
|
|
|
|
|
/*
|
|
/*
|
|
* Work out the current dirty-memory clamping and background writeout
|
|
* Work out the current dirty-memory clamping and background writeout
|
|
@@ -322,34 +336,6 @@ bool zone_dirty_ok(struct zone *zone)
|
|
zone_page_state(zone, NR_WRITEBACK) <= limit;
|
|
zone_page_state(zone, NR_WRITEBACK) <= limit;
|
|
}
|
|
}
|
|
|
|
|
|
-/*
|
|
|
|
- * couple the period to the dirty_ratio:
|
|
|
|
- *
|
|
|
|
- * period/2 ~ roundup_pow_of_two(dirty limit)
|
|
|
|
- */
|
|
|
|
-static int calc_period_shift(void)
|
|
|
|
-{
|
|
|
|
- unsigned long dirty_total;
|
|
|
|
-
|
|
|
|
- if (vm_dirty_bytes)
|
|
|
|
- dirty_total = vm_dirty_bytes / PAGE_SIZE;
|
|
|
|
- else
|
|
|
|
- dirty_total = (vm_dirty_ratio * global_dirtyable_memory()) /
|
|
|
|
- 100;
|
|
|
|
- return 2 + ilog2(dirty_total - 1);
|
|
|
|
-}
|
|
|
|
-
|
|
|
|
-/*
|
|
|
|
- * update the period when the dirty threshold changes.
|
|
|
|
- */
|
|
|
|
-static void update_completion_period(void)
|
|
|
|
-{
|
|
|
|
- int shift = calc_period_shift();
|
|
|
|
- prop_change_shift(&vm_completions, shift);
|
|
|
|
-
|
|
|
|
- writeback_set_ratelimit();
|
|
|
|
-}
|
|
|
|
-
|
|
|
|
int dirty_background_ratio_handler(struct ctl_table *table, int write,
|
|
int dirty_background_ratio_handler(struct ctl_table *table, int write,
|
|
void __user *buffer, size_t *lenp,
|
|
void __user *buffer, size_t *lenp,
|
|
loff_t *ppos)
|
|
loff_t *ppos)
|
|
@@ -383,7 +369,7 @@ int dirty_ratio_handler(struct ctl_table *table, int write,
|
|
|
|
|
|
ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos);
|
|
ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos);
|
|
if (ret == 0 && write && vm_dirty_ratio != old_ratio) {
|
|
if (ret == 0 && write && vm_dirty_ratio != old_ratio) {
|
|
- update_completion_period();
|
|
|
|
|
|
+ writeback_set_ratelimit();
|
|
vm_dirty_bytes = 0;
|
|
vm_dirty_bytes = 0;
|
|
}
|
|
}
|
|
return ret;
|
|
return ret;
|
|
@@ -398,12 +384,21 @@ int dirty_bytes_handler(struct ctl_table *table, int write,
|
|
|
|
|
|
ret = proc_doulongvec_minmax(table, write, buffer, lenp, ppos);
|
|
ret = proc_doulongvec_minmax(table, write, buffer, lenp, ppos);
|
|
if (ret == 0 && write && vm_dirty_bytes != old_bytes) {
|
|
if (ret == 0 && write && vm_dirty_bytes != old_bytes) {
|
|
- update_completion_period();
|
|
|
|
|
|
+ writeback_set_ratelimit();
|
|
vm_dirty_ratio = 0;
|
|
vm_dirty_ratio = 0;
|
|
}
|
|
}
|
|
return ret;
|
|
return ret;
|
|
}
|
|
}
|
|
|
|
|
|
|
|
+/* Return the time one aging period after @cur_time; the result is never 0. */
|
|
|
|
+static unsigned long wp_next_time(unsigned long cur_time)
|
|
|
|
+{
|
|
|
|
+	unsigned long next_time = cur_time + VM_COMPLETIONS_PERIOD_LEN;
|
|
|
|
+
|
|
|
|
+	/* 0 in writeout_period_time marks a stopped timer, so step past it */
|
|
|
|
+	return next_time ? next_time : 1;
|
|
|
|
+}
|
|
|
|
+
|
|
/*
|
|
/*
|
|
* Increment the BDI's writeout completion count and the global writeout
|
|
* Increment the BDI's writeout completion count and the global writeout
|
|
* completion count. Called from test_clear_page_writeback().
|
|
* completion count. Called from test_clear_page_writeback().
|
|
@@ -411,8 +406,19 @@ int dirty_bytes_handler(struct ctl_table *table, int write,
|
|
static inline void __bdi_writeout_inc(struct backing_dev_info *bdi)
|
|
static inline void __bdi_writeout_inc(struct backing_dev_info *bdi)
|
|
{
|
|
{
|
|
__inc_bdi_stat(bdi, BDI_WRITTEN);
|
|
__inc_bdi_stat(bdi, BDI_WRITTEN);
|
|
- __prop_inc_percpu_max(&vm_completions, &bdi->completions,
|
|
|
|
- bdi->max_prop_frac);
|
|
|
|
|
|
+ __fprop_inc_percpu_max(&writeout_completions, &bdi->completions,
|
|
|
|
+ bdi->max_prop_frac);
|
|
|
|
+ /* writeout_period_time == 0: aging timer is stopped, so re-arm it */
|
|
|
|
+ if (unlikely(!writeout_period_time)) {
|
|
|
|
+ /*
|
|
|
|
+ * We can race with other __bdi_writeout_inc calls here but
|
|
|
|
+ * it does not cause any harm since the resulting time when
|
|
|
|
+ * timer will fire and what is in writeout_period_time will be
|
|
|
|
+ * roughly the same.
|
|
|
|
+ */
|
|
|
|
+ writeout_period_time = wp_next_time(jiffies);
|
|
|
|
+ mod_timer(&writeout_period_timer, writeout_period_time);
|
|
|
|
+ }
|
|
}
|
|
}
|
|
|
|
|
|
void bdi_writeout_inc(struct backing_dev_info *bdi)
|
|
void bdi_writeout_inc(struct backing_dev_info *bdi)
|
|
@@ -431,10 +437,32 @@ EXPORT_SYMBOL_GPL(bdi_writeout_inc);
|
|
static void bdi_writeout_fraction(struct backing_dev_info *bdi,
|
|
static void bdi_writeout_fraction(struct backing_dev_info *bdi,
|
|
long *numerator, long *denominator)
|
|
long *numerator, long *denominator)
|
|
{
|
|
{
|
|
- prop_fraction_percpu(&vm_completions, &bdi->completions,
|
|
|
|
|
|
+ fprop_fraction_percpu(&writeout_completions, &bdi->completions,
|
|
numerator, denominator);
|
|
numerator, denominator);
|
|
}
|
|
}
|
|
|
|
|
|
|
|
+/*
|
|
|
|
+ * On an idle system we can be called long after we were scheduled because
|
|
|
|
+ * we use deferred timers, so account for the periods missed in between.
|
|
|
|
+ */
|
|
|
|
+static void writeout_period(unsigned long t)
|
|
|
|
+{
|
|
|
|
+	unsigned long elapsed = jiffies - writeout_period_time;
|
|
|
|
+	int missed = elapsed / VM_COMPLETIONS_PERIOD_LEN;
|
|
|
|
+
|
|
|
|
+	if (!fprop_new_period(&writeout_completions, missed + 1)) {
|
|
|
|
+		/*
|
|
|
|
+		 * Aging has zeroed all fractions: leave the timer unarmed
|
|
|
|
+		 * and record that as a zero writeout_period_time.
|
|
|
|
+		 */
|
|
|
|
+		writeout_period_time = 0;
|
|
|
|
+		return;
|
|
|
|
+	}
|
|
|
|
+	writeout_period_time = wp_next_time(writeout_period_time +
|
|
|
|
+				missed * VM_COMPLETIONS_PERIOD_LEN);
|
|
|
|
+	mod_timer(&writeout_period_timer, writeout_period_time);
|
|
|
|
+}
|
|
|
|
+
|
|
/*
|
|
/*
|
|
* bdi_min_ratio keeps the sum of the minimum dirty shares of all
|
|
* bdi_min_ratio keeps the sum of the minimum dirty shares of all
|
|
* registered backing devices, which, for obvious reasons, can not
|
|
* registered backing devices, which, for obvious reasons, can not
|
|
@@ -475,7 +503,7 @@ int bdi_set_max_ratio(struct backing_dev_info *bdi, unsigned max_ratio)
|
|
ret = -EINVAL;
|
|
ret = -EINVAL;
|
|
} else {
|
|
} else {
|
|
bdi->max_ratio = max_ratio;
|
|
bdi->max_ratio = max_ratio;
|
|
- bdi->max_prop_frac = (PROP_FRAC_BASE * max_ratio) / 100;
|
|
|
|
|
|
+ bdi->max_prop_frac = (FPROP_FRAC_BASE * max_ratio) / 100;
|
|
}
|
|
}
|
|
spin_unlock_bh(&bdi_lock);
|
|
spin_unlock_bh(&bdi_lock);
|
|
|
|
|
|
@@ -918,7 +946,7 @@ static void bdi_update_dirty_ratelimit(struct backing_dev_info *bdi,
|
|
* bdi->dirty_ratelimit = balanced_dirty_ratelimit;
|
|
* bdi->dirty_ratelimit = balanced_dirty_ratelimit;
|
|
*
|
|
*
|
|
* However to get a more stable dirty_ratelimit, the below elaborated
|
|
* However to get a more stable dirty_ratelimit, the below elaborated
|
|
- * code makes use of task_ratelimit to filter out sigular points and
|
|
|
|
|
|
+ * code makes use of task_ratelimit to filter out singular points and
|
|
* limit the step size.
|
|
* limit the step size.
|
|
*
|
|
*
|
|
* The below code essentially only uses the relative value of
|
|
* The below code essentially only uses the relative value of
|
|
@@ -941,7 +969,7 @@ static void bdi_update_dirty_ratelimit(struct backing_dev_info *bdi,
|
|
* feel and care are stable dirty rate and small position error.
|
|
* feel and care are stable dirty rate and small position error.
|
|
*
|
|
*
|
|
* |task_ratelimit - dirty_ratelimit| is used to limit the step size
|
|
* |task_ratelimit - dirty_ratelimit| is used to limit the step size
|
|
- * and filter out the sigular points of balanced_dirty_ratelimit. Which
|
|
|
|
|
|
+ * and filter out the singular points of balanced_dirty_ratelimit. Which
|
|
* keeps jumping around randomly and can even leap far away at times
|
|
* keeps jumping around randomly and can even leap far away at times
|
|
* due to the small 200ms estimation period of dirty_rate (we want to
|
|
* due to the small 200ms estimation period of dirty_rate (we want to
|
|
* keep that period small to reduce time lags).
|
|
* keep that period small to reduce time lags).
|
|
@@ -1606,13 +1634,10 @@ static struct notifier_block __cpuinitdata ratelimit_nb = {
|
|
*/
|
|
*/
|
|
void __init page_writeback_init(void)
|
|
void __init page_writeback_init(void)
|
|
{
|
|
{
|
|
- int shift;
|
|
|
|
-
|
|
|
|
writeback_set_ratelimit();
|
|
writeback_set_ratelimit();
|
|
register_cpu_notifier(&ratelimit_nb);
|
|
register_cpu_notifier(&ratelimit_nb);
|
|
|
|
|
|
- shift = calc_period_shift();
|
|
|
|
- prop_descriptor_init(&vm_completions, shift);
|
|
|
|
|
|
+ fprop_global_init(&writeout_completions);
|
|
}
|
|
}
|
|
|
|
|
|
/**
|
|
/**
|