@@ -155,14 +155,31 @@ static unsigned long get_lru_size(struct lruvec *lruvec, enum lru_list lru)
 }
 
 /*
- * Add a shrinker callback to be called from the vm
+ * Add a shrinker callback to be called from the vm.
  */
-void register_shrinker(struct shrinker *shrinker)
+int register_shrinker(struct shrinker *shrinker)
 {
-	atomic_long_set(&shrinker->nr_in_batch, 0);
+	size_t size = sizeof(*shrinker->nr_deferred);
+
+	/*
+	 * If we only have one possible node in the system anyway, save
+	 * ourselves the trouble and disable NUMA aware behavior. This way we
+	 * will save memory and some small loop time later.
+	 */
+	if (nr_node_ids == 1)
+		shrinker->flags &= ~SHRINKER_NUMA_AWARE;
+
+	if (shrinker->flags & SHRINKER_NUMA_AWARE)
+		size *= nr_node_ids;
+
+	shrinker->nr_deferred = kzalloc(size, GFP_KERNEL);
+	if (!shrinker->nr_deferred)
+		return -ENOMEM;
+
 	down_write(&shrinker_rwsem);
 	list_add_tail(&shrinker->list, &shrinker_list);
 	up_write(&shrinker_rwsem);
+	return 0;
 }
 EXPORT_SYMBOL(register_shrinker);
 
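Since register_shrinker() now returns int, callers can no longer ignore registration: the per-node nr_deferred array may fail to allocate. The sketch below is a hypothetical caller, not part of this patch, showing how a module would propagate that error; the demo_* names are invented, and the count_objects/scan_objects callbacks plus the SHRINKER_NUMA_AWARE flag are assumed from earlier patches in this series.

#include <linux/module.h>
#include <linux/shrinker.h>

/* Hypothetical shrinker, for illustration only. */
static unsigned long demo_count(struct shrinker *s, struct shrink_control *sc)
{
	return 0;			/* nothing cached yet */
}

static unsigned long demo_scan(struct shrinker *s, struct shrink_control *sc)
{
	return SHRINK_STOP;		/* nothing to reclaim */
}

static struct shrinker demo_shrinker = {
	.count_objects	= demo_count,
	.scan_objects	= demo_scan,
	.seeks		= DEFAULT_SEEKS,
	.flags		= SHRINKER_NUMA_AWARE,
};

static int __init demo_init(void)
{
	/* May now fail with -ENOMEM if nr_deferred cannot be allocated. */
	return register_shrinker(&demo_shrinker);
}
module_init(demo_init);

static void __exit demo_exit(void)
{
	unregister_shrinker(&demo_shrinker);
}
module_exit(demo_exit);
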
@@ -186,6 +203,118 @@ static inline int do_shrinker_shrink(struct shrinker *shrinker,
 }
 
 #define SHRINK_BATCH 128
+
+static unsigned long
+shrink_slab_node(struct shrink_control *shrinkctl, struct shrinker *shrinker,
+		 unsigned long nr_pages_scanned, unsigned long lru_pages)
+{
+	unsigned long freed = 0;
+	unsigned long long delta;
+	long total_scan;
+	long max_pass;
+	long nr;
+	long new_nr;
+	int nid = shrinkctl->nid;
+	long batch_size = shrinker->batch ? shrinker->batch
+					  : SHRINK_BATCH;
+
+	if (shrinker->count_objects)
+		max_pass = shrinker->count_objects(shrinker, shrinkctl);
+	else
+		max_pass = do_shrinker_shrink(shrinker, shrinkctl, 0);
+	if (max_pass == 0)
+		return 0;
+
+	/*
+	 * copy the current shrinker scan count into a local variable
+	 * and zero it so that other concurrent shrinker invocations
+	 * don't also do this scanning work.
+	 */
+	nr = atomic_long_xchg(&shrinker->nr_deferred[nid], 0);
+
+	total_scan = nr;
+	delta = (4 * nr_pages_scanned) / shrinker->seeks;
+	delta *= max_pass;
+	do_div(delta, lru_pages + 1);
+	total_scan += delta;
+	if (total_scan < 0) {
+		printk(KERN_ERR
+		"shrink_slab: %pF negative objects to delete nr=%ld\n",
+		       shrinker->shrink, total_scan);
+		total_scan = max_pass;
+	}
+
+	/*
+	 * We need to avoid excessive windup on filesystem shrinkers
+	 * due to large numbers of GFP_NOFS allocations causing the
+	 * shrinkers to return -1 all the time. This results in a large
+	 * nr being built up so when a shrink that can do some work
+	 * comes along it empties the entire cache due to nr >>>
+	 * max_pass. This is bad for sustaining a working set in
+	 * memory.
+	 *
+	 * Hence only allow the shrinker to scan the entire cache when
+	 * a large delta change is calculated directly.
+	 */
+	if (delta < max_pass / 4)
+		total_scan = min(total_scan, max_pass / 2);
+
+	/*
+	 * Avoid risking looping forever due to too large nr value:
+	 * never try to free more than twice the estimate number of
+	 * freeable entries.
+	 */
+	if (total_scan > max_pass * 2)
+		total_scan = max_pass * 2;
+
+	trace_mm_shrink_slab_start(shrinker, shrinkctl, nr,
+				nr_pages_scanned, lru_pages,
+				max_pass, delta, total_scan);
+
+	while (total_scan >= batch_size) {
+
+		if (shrinker->scan_objects) {
+			unsigned long ret;
+			shrinkctl->nr_to_scan = batch_size;
+			ret = shrinker->scan_objects(shrinker, shrinkctl);
+
+			if (ret == SHRINK_STOP)
+				break;
+			freed += ret;
+		} else {
+			int nr_before;
+			long ret;
+
+			nr_before = do_shrinker_shrink(shrinker, shrinkctl, 0);
+			ret = do_shrinker_shrink(shrinker, shrinkctl,
+							batch_size);
+			if (ret == -1)
+				break;
+			if (ret < nr_before)
+				freed += nr_before - ret;
+		}
+
+		count_vm_events(SLABS_SCANNED, batch_size);
+		total_scan -= batch_size;
+
+		cond_resched();
+	}
+
+	/*
+	 * move the unused scan count back into the shrinker in a
+	 * manner that handles concurrent updates. If we exhausted the
+	 * scan, there is no need to do an update.
+	 */
+	if (total_scan > 0)
+		new_nr = atomic_long_add_return(total_scan,
+				&shrinker->nr_deferred[nid]);
+	else
+		new_nr = atomic_long_read(&shrinker->nr_deferred[nid]);
+
+	trace_mm_shrink_slab_end(shrinker, freed, nr, new_nr);
+	return freed;
+}
+
 /*
  * Call the shrink functions to age shrinkable caches
  *
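The proportionality math in shrink_slab_node() is easier to see with concrete numbers. The following is a standalone userspace sketch of the same arithmetic (do_div() replaced by plain division, min() by an if); all figures are invented for illustration.

#include <stdio.h>

int main(void)
{
	long nr_pages_scanned = 1024;	/* LRU pages the VM just scanned */
	long lru_pages = 100000;	/* LRU pages under consideration */
	long max_pass = 50000;		/* objects reported by the shrinker */
	int seeks = 2;			/* DEFAULT_SEEKS */
	long nr = 300;			/* work deferred from earlier calls */

	unsigned long long delta = (4ULL * nr_pages_scanned) / seeks;
	delta *= max_pass;
	delta /= lru_pages + 1;		/* stands in for do_div() */

	long total_scan = nr + (long)delta;

	/* small delta: don't let deferred work flush the whole cache */
	if (delta < max_pass / 4 && total_scan > max_pass / 2)
		total_scan = max_pass / 2;

	/* never scan more than twice the reported object count */
	if (total_scan > max_pass * 2)
		total_scan = max_pass * 2;

	printf("delta=%llu total_scan=%ld\n", delta, total_scan);
	return 0;
}

With these numbers the shrinker is asked for roughly 1300 of its 50000 objects: the 300 deferred earlier plus a delta of about 2% of the cache, i.e. the roughly 1% of LRU pages just scanned scaled by 4/seeks.
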
@@ -227,108 +356,18 @@ unsigned long shrink_slab(struct shrink_control *shrinkctl,
 	}
 
 	list_for_each_entry(shrinker, &shrinker_list, list) {
-		unsigned long long delta;
-		long total_scan;
-		long max_pass;
-		long nr;
-		long new_nr;
-		long batch_size = shrinker->batch ? shrinker->batch
-						  : SHRINK_BATCH;
-
-		if (shrinker->count_objects)
-			max_pass = shrinker->count_objects(shrinker, shrinkctl);
-		else
-			max_pass = do_shrinker_shrink(shrinker, shrinkctl, 0);
-		if (max_pass == 0)
-			continue;
-
-		/*
-		 * copy the current shrinker scan count into a local variable
-		 * and zero it so that other concurrent shrinker invocations
-		 * don't also do this scanning work.
-		 */
-		nr = atomic_long_xchg(&shrinker->nr_in_batch, 0);
-
-		total_scan = nr;
-		delta = (4 * nr_pages_scanned) / shrinker->seeks;
-		delta *= max_pass;
-		do_div(delta, lru_pages + 1);
-		total_scan += delta;
-		if (total_scan < 0) {
-			printk(KERN_ERR
-			"shrink_slab: %pF negative objects to delete nr=%ld\n",
-			       shrinker->shrink, total_scan);
-			total_scan = max_pass;
-		}
-
-		/*
-		 * We need to avoid excessive windup on filesystem shrinkers
-		 * due to large numbers of GFP_NOFS allocations causing the
-		 * shrinkers to return -1 all the time. This results in a large
-		 * nr being built up so when a shrink that can do some work
-		 * comes along it empties the entire cache due to nr >>>
-		 * max_pass. This is bad for sustaining a working set in
-		 * memory.
-		 *
-		 * Hence only allow the shrinker to scan the entire cache when
-		 * a large delta change is calculated directly.
-		 */
-		if (delta < max_pass / 4)
-			total_scan = min(total_scan, max_pass / 2);
-
-		/*
-		 * Avoid risking looping forever due to too large nr value:
-		 * never try to free more than twice the estimate number of
-		 * freeable entries.
-		 */
-		if (total_scan > max_pass * 2)
-			total_scan = max_pass * 2;
-
-		trace_mm_shrink_slab_start(shrinker, shrinkctl, nr,
-					nr_pages_scanned, lru_pages,
-					max_pass, delta, total_scan);
-
-		while (total_scan >= batch_size) {
-
-			if (shrinker->scan_objects) {
-				unsigned long ret;
-				shrinkctl->nr_to_scan = batch_size;
-				ret = shrinker->scan_objects(shrinker, shrinkctl);
+		for_each_node_mask(shrinkctl->nid, shrinkctl->nodes_to_scan) {
+			if (!node_online(shrinkctl->nid))
+				continue;
 
-				if (ret == SHRINK_STOP)
-					break;
-				freed += ret;
-			} else {
-				int nr_before;
-				long ret;
-
-				nr_before = do_shrinker_shrink(shrinker, shrinkctl, 0);
-				ret = do_shrinker_shrink(shrinker, shrinkctl,
-								batch_size);
-				if (ret == -1)
-					break;
-				if (ret < nr_before)
-					freed += nr_before - ret;
-			}
+			if (!(shrinker->flags & SHRINKER_NUMA_AWARE) &&
+			    (shrinkctl->nid != 0))
+				break;
 
-			count_vm_events(SLABS_SCANNED, batch_size);
-			total_scan -= batch_size;
+			freed += shrink_slab_node(shrinkctl, shrinker,
+				 nr_pages_scanned, lru_pages);
 
-			cond_resched();
 		}
-
-		/*
-		 * move the unused scan count back into the shrinker in a
-		 * manner that handles concurrent updates. If we exhausted the
-		 * scan, there is no need to do an update.
-		 */
-		if (total_scan > 0)
-			new_nr = atomic_long_add_return(total_scan,
-					&shrinker->nr_in_batch);
-		else
-			new_nr = atomic_long_read(&shrinker->nr_in_batch);
-
-		trace_mm_shrink_slab_end(shrinker, freed, nr, new_nr);
 	}
 	up_read(&shrinker_rwsem);
 out:
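shrink_slab() now walks shrinkctl->nodes_to_scan and hands one nid at a time to shrink_slab_node(), so reclaim paths are expected to mark the nodes whose LRUs they actually scanned. Below is a minimal hypothetical call-site sketch, roughly as it might appear in reclaim code; only the nodes_to_scan/nid usage comes from this patch, while the gfp_mask field, the demo_* name and the surrounding reclaim logic are assumptions.

/* Hypothetical reclaim path: shrink slab caches for one node only. */
static unsigned long demo_shrink_node_slabs(int nid, gfp_t gfp_mask,
					    unsigned long nr_scanned,
					    unsigned long lru_pages)
{
	struct shrink_control shrink = {
		.gfp_mask = gfp_mask,
	};

	nodes_clear(shrink.nodes_to_scan);
	node_set(nid, shrink.nodes_to_scan);	/* only this node was scanned */

	return shrink_slab(&shrink, nr_scanned, lru_pages);
}

As the loop in shrink_slab() shows, a shrinker without SHRINKER_NUMA_AWARE set is invoked at most once per call, and only for node 0.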