@@ -62,6 +62,7 @@
 #include <linux/pfn.h>
 #include <linux/rbtree.h>
 #include <linux/slab.h>
+#include <linux/spinlock.h>
 #include <linux/vmalloc.h>
 #include <linux/workqueue.h>
 
@@ -101,20 +102,28 @@ static struct pcpu_chunk *pcpu_reserved_chunk;
 static int pcpu_reserved_chunk_limit;
 
 /*
- * One mutex to rule them all.
- *
- * The following mutex is grabbed in the outermost public alloc/free
- * interface functions and released only when the operation is
- * complete. As such, every function in this file other than the
- * outermost functions are called under pcpu_mutex.
- *
- * It can easily be switched to use spinlock such that only the area
- * allocation and page population commit are protected with it doing
- * actual [de]allocation without holding any lock. However, given
- * what this allocator does, I think it's better to let them run
- * sequentially.
+ * Synchronization rules.
+ *
+ * There are two locks - pcpu_alloc_mutex and pcpu_lock. The former
+ * protects allocation/reclaim paths, chunks and chunk->page arrays.
+ * The latter is a spinlock and protects the index data structures -
+ * chunk slots, rbtree, chunks and area maps in chunks.
+ *
+ * During allocation, pcpu_alloc_mutex is kept locked all the time and
+ * pcpu_lock is grabbed and released as necessary. All actual memory
+ * allocations are done using GFP_KERNEL with pcpu_lock released.
+ *
+ * Free path accesses and alters only the index data structures, so it
+ * can be safely called from atomic context. When memory needs to be
+ * returned to the system, free path schedules reclaim_work which
+ * grabs both pcpu_alloc_mutex and pcpu_lock, unlinks chunks to be
+ * reclaimed, releases both locks and frees the chunks. Note that it's
+ * necessary to grab both locks to remove a chunk from circulation as
+ * allocation path might be referencing the chunk with only
+ * pcpu_alloc_mutex locked.
  */
-static DEFINE_MUTEX(pcpu_mutex);
+static DEFINE_MUTEX(pcpu_alloc_mutex);  /* protects whole alloc and reclaim */
+static DEFINE_SPINLOCK(pcpu_lock);      /* protects index data structures */
 
 static struct list_head *pcpu_slot __read_mostly; /* chunk list slots */
 static struct rb_root pcpu_addr_root = RB_ROOT; /* chunks by address */
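The scheme described in the comment above reduces to a common pattern: a sleeping mutex serializes the whole slow path, a spinlock guards the index, and anything that can sleep (the GFP_KERNEL allocations) runs with the spinlock dropped. A minimal sketch of that split, not part of the patch and using hypothetical names (my_alloc_mutex, my_lock, my_alloc, my_free_hint):

/* Illustrative sketch only -- not part of the patch. */
#include <linux/mutex.h>
#include <linux/slab.h>
#include <linux/spinlock.h>
#include <linux/types.h>

static DEFINE_MUTEX(my_alloc_mutex);            /* whole alloc/reclaim path */
static DEFINE_SPINLOCK(my_lock);                /* index data structures */

static void *my_alloc(size_t size)
{
        void *obj;

        mutex_lock(&my_alloc_mutex);            /* serializes allocators, may sleep */

        obj = kzalloc(size, GFP_KERNEL);        /* sleeping allocation, no spinlock held */
        if (!obj) {
                mutex_unlock(&my_alloc_mutex);
                return NULL;
        }

        spin_lock_irq(&my_lock);                /* held only around the index update */
        /* ... link @obj into the index structures ... */
        spin_unlock_irq(&my_lock);

        mutex_unlock(&my_alloc_mutex);
        return obj;
}

/* Touches only the index, so this side is safe from atomic context. */
static void my_free_hint(void *obj)
{
        unsigned long flags;

        spin_lock_irqsave(&my_lock, flags);
        /* ... unlink @obj / mark its area free ... */
        spin_unlock_irqrestore(&my_lock, flags);
}

The rest of the patch applies exactly this split to pcpu_alloc(), pcpu_reclaim() and free_percpu().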
@@ -176,6 +185,9 @@ static bool pcpu_chunk_page_occupied(struct pcpu_chunk *chunk,
  * kzalloc() is used; otherwise, vmalloc() is used. The returned
  * memory is always zeroed.
  *
+ * CONTEXT:
+ * Does GFP_KERNEL allocation.
+ *
  * RETURNS:
  * Pointer to the allocated area on success, NULL on failure.
  */
@@ -215,6 +227,9 @@ static void pcpu_mem_free(void *ptr, size_t size)
  * New slot according to the changed state is determined and @chunk is
  * moved to the slot. Note that the reserved chunk is never put on
  * chunk slots.
+ *
+ * CONTEXT:
+ * pcpu_lock.
  */
 static void pcpu_chunk_relocate(struct pcpu_chunk *chunk, int oslot)
 {
@@ -260,6 +275,9 @@ static struct rb_node **pcpu_chunk_rb_search(void *addr,
  * searchs for the chunk with the highest start address which isn't
  * beyond @addr.
  *
+ * CONTEXT:
+ * pcpu_lock.
+ *
  * RETURNS:
  * The address of the found chunk.
  */
@@ -300,6 +318,9 @@ static struct pcpu_chunk *pcpu_chunk_addr_search(void *addr)
  * @new: chunk to insert
  *
  * Insert @new into address rb tree.
+ *
+ * CONTEXT:
+ * pcpu_lock.
  */
 static void pcpu_chunk_addr_insert(struct pcpu_chunk *new)
 {
@@ -319,6 +340,10 @@ static void pcpu_chunk_addr_insert(struct pcpu_chunk *new)
  * A single allocation can split an area into three areas, so this
  * function makes sure that @chunk->map has at least two extra slots.
  *
+ * CONTEXT:
+ * pcpu_alloc_mutex, pcpu_lock. pcpu_lock is released and reacquired
+ * if area map is extended.
+ *
  * RETURNS:
  * 0 if noop, 1 if successfully extended, -errno on failure.
  */
@@ -332,13 +357,25 @@ static int pcpu_extend_area_map(struct pcpu_chunk *chunk)
         if (chunk->map_alloc >= chunk->map_used + 2)
                 return 0;
 
+        spin_unlock_irq(&pcpu_lock);
+
         new_alloc = PCPU_DFL_MAP_ALLOC;
         while (new_alloc < chunk->map_used + 2)
                 new_alloc *= 2;
 
         new = pcpu_mem_alloc(new_alloc * sizeof(new[0]));
-        if (!new)
+        if (!new) {
+                spin_lock_irq(&pcpu_lock);
                 return -ENOMEM;
+        }
+
+        /*
+         * Acquire pcpu_lock and switch to new area map. Only free
+         * could have happened in between, so map_used couldn't have
+         * grown.
+         */
+        spin_lock_irq(&pcpu_lock);
+        BUG_ON(new_alloc < chunk->map_used + 2);
 
         size = chunk->map_alloc * sizeof(chunk->map[0]);
         memcpy(new, chunk->map, size);
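pcpu_extend_area_map() above is the "unlock, allocate, relock, revalidate" step: the spinlock cannot be held across a GFP_KERNEL allocation, so it is dropped, and the assumption that the map could not have grown in the meantime is asserted after relocking. A rough sketch of the same step, not part of the patch, with hypothetical names (table_lock, table, grow_table) and the same only-frees-can-race assumption:

/* Illustrative sketch only -- not part of the patch. */
#include <linux/bug.h>
#include <linux/errno.h>
#include <linux/slab.h>
#include <linux/spinlock.h>
#include <linux/string.h>

static DEFINE_SPINLOCK(table_lock);
static int *table;                      /* starts NULL; kfree(NULL) is fine */
static int table_used, table_alloc;

/* Called with table_lock held; drops and re-takes it if it has to grow. */
static int grow_table(void)
{
        int new_alloc, *new;

        if (table_alloc >= table_used + 2)
                return 0;                       /* nothing to do */

        spin_unlock_irq(&table_lock);           /* can't sleep under the spinlock */

        new_alloc = 16;
        while (new_alloc < table_used + 2)
                new_alloc *= 2;

        new = kcalloc(new_alloc, sizeof(*new), GFP_KERNEL);
        if (!new) {
                spin_lock_irq(&table_lock);
                return -ENOMEM;
        }

        spin_lock_irq(&table_lock);
        /*
         * Assumption mirrored from the comment above: only frees could
         * have run while the lock was dropped, so table_used cannot
         * have grown and the new array is still large enough.
         */
        BUG_ON(new_alloc < table_used + 2);

        memcpy(new, table, table_used * sizeof(*new));
        kfree(table);
        table = new;
        table_alloc = new_alloc;
        return 1;                               /* extended; the lock was dropped */
}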
@@ -371,6 +408,9 @@ static int pcpu_extend_area_map(struct pcpu_chunk *chunk)
  * is inserted after the target block.
  *
  * @chunk->map must have enough free slots to accomodate the split.
+ *
+ * CONTEXT:
+ * pcpu_lock.
  */
 static void pcpu_split_block(struct pcpu_chunk *chunk, int i,
                              int head, int tail)
@@ -406,6 +446,9 @@ static void pcpu_split_block(struct pcpu_chunk *chunk, int i,
  *
  * @chunk->map must have at least two free slots.
  *
+ * CONTEXT:
+ * pcpu_lock.
+ *
  * RETURNS:
  * Allocated offset in @chunk on success, -1 if no matching area is
  * found.
@@ -495,6 +538,9 @@ static int pcpu_alloc_area(struct pcpu_chunk *chunk, int size, int align)
  * Free area starting from @freeme to @chunk. Note that this function
  * only modifies the allocation map. It doesn't depopulate or unmap
  * the area.
+ *
+ * CONTEXT:
+ * pcpu_lock.
  */
 static void pcpu_free_area(struct pcpu_chunk *chunk, int freeme)
 {
@@ -580,6 +626,9 @@ static void pcpu_unmap(struct pcpu_chunk *chunk, int page_start, int page_end,
  * For each cpu, depopulate and unmap pages [@page_start,@page_end)
  * from @chunk. If @flush is true, vcache is flushed before unmapping
  * and tlb after.
+ *
+ * CONTEXT:
+ * pcpu_alloc_mutex.
  */
 static void pcpu_depopulate_chunk(struct pcpu_chunk *chunk, int off, int size,
                                   bool flush)
@@ -658,6 +707,9 @@ static int pcpu_map(struct pcpu_chunk *chunk, int page_start, int page_end)
  *
  * For each cpu, populate and map pages [@page_start,@page_end) into
  * @chunk. The area is cleared on return.
+ *
+ * CONTEXT:
+ * pcpu_alloc_mutex, does GFP_KERNEL allocation.
  */
 static int pcpu_populate_chunk(struct pcpu_chunk *chunk, int off, int size)
 {
@@ -748,15 +800,16 @@ static struct pcpu_chunk *alloc_pcpu_chunk(void)
  * @align: alignment of area (max PAGE_SIZE)
  * @reserved: allocate from the reserved chunk if available
  *
- * Allocate percpu area of @size bytes aligned at @align. Might
- * sleep. Might trigger writeouts.
+ * Allocate percpu area of @size bytes aligned at @align.
+ *
+ * CONTEXT:
+ * Does GFP_KERNEL allocation.
  *
  * RETURNS:
  * Percpu pointer to the allocated area on success, NULL on failure.
  */
 static void *pcpu_alloc(size_t size, size_t align, bool reserved)
 {
-        void *ptr = NULL;
         struct pcpu_chunk *chunk;
         int slot, off;
 
@@ -766,27 +819,37 @@ static void *pcpu_alloc(size_t size, size_t align, bool reserved)
                 return NULL;
         }
 
-        mutex_lock(&pcpu_mutex);
+        mutex_lock(&pcpu_alloc_mutex);
+        spin_lock_irq(&pcpu_lock);
 
         /* serve reserved allocations from the reserved chunk if available */
         if (reserved && pcpu_reserved_chunk) {
                 chunk = pcpu_reserved_chunk;
                 if (size > chunk->contig_hint ||
                     pcpu_extend_area_map(chunk) < 0)
-                        goto out_unlock;
+                        goto fail_unlock;
                 off = pcpu_alloc_area(chunk, size, align);
                 if (off >= 0)
                         goto area_found;
-                goto out_unlock;
+                goto fail_unlock;
         }
 
+restart:
         /* search through normal chunks */
         for (slot = pcpu_size_to_slot(size); slot < pcpu_nr_slots; slot++) {
                 list_for_each_entry(chunk, &pcpu_slot[slot], list) {
                         if (size > chunk->contig_hint)
                                 continue;
-                        if (pcpu_extend_area_map(chunk) < 0)
-                                goto out_unlock;
+
+                        switch (pcpu_extend_area_map(chunk)) {
+                        case 0:
+                                break;
+                        case 1:
+                                goto restart;   /* pcpu_lock dropped, restart */
+                        default:
+                                goto fail_unlock;
+                        }
+
                         off = pcpu_alloc_area(chunk, size, align);
                         if (off >= 0)
                                 goto area_found;
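The switch above gives pcpu_extend_area_map()'s three-way return value its caller-side meaning: 0 keeps the scan going, 1 means pcpu_lock was dropped so the chunk lists may have changed and the scan restarts from the top, and a negative value aborts. A compact sketch of that restart discipline, not part of the patch, with hypothetical names (pool_lock, pools, pool_grow, pool_carve) and stubbed helpers:

/* Illustrative sketch only -- not part of the patch. */
#include <linux/list.h>
#include <linux/spinlock.h>
#include <linux/types.h>

struct pool {
        struct list_head list;
        size_t free_bytes;
};

static DEFINE_SPINLOCK(pool_lock);
static LIST_HEAD(pools);

/* Stand-in for pcpu_extend_area_map(): 0 = nothing done, 1 = grown
 * (pool_lock was dropped and re-taken), negative = error. */
static int pool_grow(struct pool *p)
{
        return 0;       /* stub: pretend the map never needs growing */
}

/* Stand-in for pcpu_alloc_area(): offset on success, -1 on failure. */
static long pool_carve(struct pool *p, size_t size)
{
        if (p->free_bytes < size)
                return -1;
        p->free_bytes -= size;
        return 0;
}

static long pool_alloc(size_t size)
{
        struct pool *p;
        long off;

        spin_lock_irq(&pool_lock);
restart:
        list_for_each_entry(p, &pools, list) {
                switch (pool_grow(p)) {
                case 0:
                        break;          /* lock held throughout, iteration still valid */
                case 1:
                        goto restart;   /* lock was dropped: rescan from the top */
                default:
                        goto fail;      /* -errno */
                }

                off = pool_carve(p, size);
                if (off >= 0) {
                        spin_unlock_irq(&pool_lock);
                        return off;
                }
        }
fail:
        spin_unlock_irq(&pool_lock);
        return -1;
}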
@@ -794,27 +857,36 @@ static void *pcpu_alloc(size_t size, size_t align, bool reserved)
         }
 
         /* hmmm... no space left, create a new chunk */
+        spin_unlock_irq(&pcpu_lock);
+
         chunk = alloc_pcpu_chunk();
         if (!chunk)
-                goto out_unlock;
+                goto fail_unlock_mutex;
+
+        spin_lock_irq(&pcpu_lock);
         pcpu_chunk_relocate(chunk, -1);
         pcpu_chunk_addr_insert(chunk);
-
-        off = pcpu_alloc_area(chunk, size, align);
-        if (off < 0)
-                goto out_unlock;
+        goto restart;
 
 area_found:
+        spin_unlock_irq(&pcpu_lock);
+
         /* populate, map and clear the area */
         if (pcpu_populate_chunk(chunk, off, size)) {
+                spin_lock_irq(&pcpu_lock);
                 pcpu_free_area(chunk, off);
-                goto out_unlock;
+                goto fail_unlock;
         }
 
-        ptr = __addr_to_pcpu_ptr(chunk->vm->addr + off);
-out_unlock:
-        mutex_unlock(&pcpu_mutex);
-        return ptr;
+        mutex_unlock(&pcpu_alloc_mutex);
+
+        return __addr_to_pcpu_ptr(chunk->vm->addr + off);
+
+fail_unlock:
+        spin_unlock_irq(&pcpu_lock);
+fail_unlock_mutex:
+        mutex_unlock(&pcpu_alloc_mutex);
+        return NULL;
 }
 
 /**
@@ -825,6 +897,9 @@ out_unlock:
  * Allocate percpu area of @size bytes aligned at @align. Might
  * sleep. Might trigger writeouts.
  *
+ * CONTEXT:
+ * Does GFP_KERNEL allocation.
+ *
  * RETURNS:
  * Percpu pointer to the allocated area on success, NULL on failure.
  */
@@ -843,6 +918,9 @@ EXPORT_SYMBOL_GPL(__alloc_percpu);
  * percpu area if arch has set it up; otherwise, allocation is served
  * from the same dynamic area. Might sleep. Might trigger writeouts.
  *
+ * CONTEXT:
+ * Does GFP_KERNEL allocation.
+ *
  * RETURNS:
  * Percpu pointer to the allocated area on success, NULL on failure.
  */
@@ -856,6 +934,9 @@ void *__alloc_reserved_percpu(size_t size, size_t align)
  * @work: unused
  *
  * Reclaim all fully free chunks except for the first one.
+ *
+ * CONTEXT:
+ * workqueue context.
  */
 static void pcpu_reclaim(struct work_struct *work)
 {
@@ -863,7 +944,8 @@ static void pcpu_reclaim(struct work_struct *work)
         struct list_head *head = &pcpu_slot[pcpu_nr_slots - 1];
         struct pcpu_chunk *chunk, *next;
 
-        mutex_lock(&pcpu_mutex);
+        mutex_lock(&pcpu_alloc_mutex);
+        spin_lock_irq(&pcpu_lock);
 
         list_for_each_entry_safe(chunk, next, head, list) {
                 WARN_ON(chunk->immutable);
@@ -876,7 +958,8 @@ static void pcpu_reclaim(struct work_struct *work)
                 list_move(&chunk->list, &todo);
         }
 
-        mutex_unlock(&pcpu_mutex);
+        spin_unlock_irq(&pcpu_lock);
+        mutex_unlock(&pcpu_alloc_mutex);
 
         list_for_each_entry_safe(chunk, next, &todo, list) {
                 pcpu_depopulate_chunk(chunk, 0, pcpu_unit_size, false);
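pcpu_reclaim() shows the reclaim half of the rules: both locks are held only long enough to unlink the victims onto a private list, and the expensive depopulate/free then runs with no locks held. A stripped-down sketch of the same shape, not part of the patch; reclaim_mutex, index_lock, struct item and free_items are hypothetical:

/* Illustrative sketch only -- not part of the patch. */
#include <linux/list.h>
#include <linux/mutex.h>
#include <linux/slab.h>
#include <linux/spinlock.h>

struct item {
        struct list_head list;
};

static DEFINE_MUTEX(reclaim_mutex);             /* alloc/reclaim path */
static DEFINE_SPINLOCK(index_lock);             /* index data structures */
static LIST_HEAD(free_items);                   /* fully free, reclaimable items */

static void reclaim_items(void)
{
        LIST_HEAD(todo);
        struct item *it, *next;

        /* both locks are needed to pull items out of circulation */
        mutex_lock(&reclaim_mutex);
        spin_lock_irq(&index_lock);

        list_for_each_entry_safe(it, next, &free_items, list)
                list_move(&it->list, &todo);

        spin_unlock_irq(&index_lock);
        mutex_unlock(&reclaim_mutex);

        /* the expensive teardown runs with no locks held */
        list_for_each_entry_safe(it, next, &todo, list) {
                list_del(&it->list);
                kfree(it);
        }
}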
@@ -888,18 +971,22 @@ static void pcpu_reclaim(struct work_struct *work)
  * free_percpu - free percpu area
  * @ptr: pointer to area to free
  *
- * Free percpu area @ptr. Might sleep.
+ * Free percpu area @ptr.
+ *
+ * CONTEXT:
+ * Can be called from atomic context.
  */
 void free_percpu(void *ptr)
 {
         void *addr = __pcpu_ptr_to_addr(ptr);
         struct pcpu_chunk *chunk;
+        unsigned long flags;
         int off;
 
         if (!ptr)
                 return;
 
-        mutex_lock(&pcpu_mutex);
+        spin_lock_irqsave(&pcpu_lock, flags);
 
         chunk = pcpu_chunk_addr_search(addr);
         off = addr - chunk->vm->addr;
@@ -917,7 +1004,7 @@ void free_percpu(void *ptr)
                 }
         }
 
-        mutex_unlock(&pcpu_mutex);
+        spin_unlock_irqrestore(&pcpu_lock, flags);
 }
 EXPORT_SYMBOL_GPL(free_percpu);
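With free_percpu() now taking only pcpu_lock with interrupts saved, percpu areas may be freed from atomic context; returning the backing pages to the system is deferred to pcpu_reclaim() in workqueue context. A hypothetical caller (obj_lock, drop_counters) relying on that:

/* Hypothetical caller -- not part of the patch. */
#include <linux/percpu.h>
#include <linux/spinlock.h>

static DEFINE_SPINLOCK(obj_lock);

static void drop_counters(void **pcpu_ctr)
{
        unsigned long flags;
        void *ctr;

        spin_lock_irqsave(&obj_lock, flags);
        ctr = *pcpu_ctr;
        *pcpu_ctr = NULL;
        free_percpu(ctr);       /* now safe with a spinlock held / IRQs off */
        spin_unlock_irqrestore(&obj_lock, flags);
}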