@@ -62,6 +62,7 @@
 #include <linux/pfn.h>
 #include <linux/rbtree.h>
 #include <linux/slab.h>
+#include <linux/spinlock.h>
 #include <linux/vmalloc.h>
 #include <linux/workqueue.h>
 
@@ -101,20 +102,28 @@ static struct pcpu_chunk *pcpu_reserved_chunk;
 static int pcpu_reserved_chunk_limit;
 
 /*
- * One mutex to rule them all.
- *
- * The following mutex is grabbed in the outermost public alloc/free
- * interface functions and released only when the operation is
- * complete. As such, every function in this file other than the
- * outermost functions are called under pcpu_mutex.
- *
- * It can easily be switched to use spinlock such that only the area
- * allocation and page population commit are protected with it doing
- * actual [de]allocation without holding any lock. However, given
- * what this allocator does, I think it's better to let them run
- * sequentially.
+ * Synchronization rules.
+ *
+ * There are two locks - pcpu_alloc_mutex and pcpu_lock. The former
+ * protects allocation/reclaim paths, chunks and chunk->page arrays.
+ * The latter is a spinlock and protects the index data structures -
+ * chunk slots, rbtree, chunks and area maps in chunks.
+ *
+ * During allocation, pcpu_alloc_mutex is kept locked all the time and
+ * pcpu_lock is grabbed and released as necessary. All actual memory
+ * allocations are done using GFP_KERNEL with pcpu_lock released.
+ *
+ * Free path accesses and alters only the index data structures, so it
+ * can be safely called from atomic context. When memory needs to be
+ * returned to the system, free path schedules reclaim_work which
+ * grabs both pcpu_alloc_mutex and pcpu_lock, unlinks chunks to be
+ * reclaimed, releases both locks and frees the chunks. Note that it's
+ * necessary to grab both locks to remove a chunk from circulation as
+ * allocation path might be referencing the chunk with only
+ * pcpu_alloc_mutex locked.
  */
-static DEFINE_MUTEX(pcpu_mutex);
+static DEFINE_MUTEX(pcpu_alloc_mutex);  /* protects whole alloc and reclaim */
+static DEFINE_SPINLOCK(pcpu_lock);      /* protects index data structures */
 
 static struct list_head *pcpu_slot __read_mostly; /* chunk list slots */
 static struct rb_root pcpu_addr_root = RB_ROOT; /* chunks by address */
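The scheme described in the comment above reduces to a common pattern: a sleeping mutex serializes the whole slow path, a spinlock guards the index, and anything that can sleep (the GFP_KERNEL allocations) runs with the spinlock dropped. A minimal sketch of that split, not part of the patch and using hypothetical names (my_alloc_mutex, my_lock, my_alloc, my_free_hint):

/* Illustrative sketch only -- not part of the patch. */
#include <linux/mutex.h>
#include <linux/slab.h>
#include <linux/spinlock.h>
#include <linux/types.h>

static DEFINE_MUTEX(my_alloc_mutex);            /* whole alloc/reclaim path */
static DEFINE_SPINLOCK(my_lock);                /* index data structures */

static void *my_alloc(size_t size)
{
        void *obj;

        mutex_lock(&my_alloc_mutex);            /* serializes allocators, may sleep */

        obj = kzalloc(size, GFP_KERNEL);        /* sleeping allocation, no spinlock held */
        if (!obj) {
                mutex_unlock(&my_alloc_mutex);
                return NULL;
        }

        spin_lock_irq(&my_lock);                /* held only around the index update */
        /* ... link @obj into the index structures ... */
        spin_unlock_irq(&my_lock);

        mutex_unlock(&my_alloc_mutex);
        return obj;
}

/* Touches only the index, so this side is safe from atomic context. */
static void my_free_hint(void *obj)
{
        unsigned long flags;

        spin_lock_irqsave(&my_lock, flags);
        /* ... unlink @obj / mark its area free ... */
        spin_unlock_irqrestore(&my_lock, flags);
}

The rest of the patch applies exactly this split to pcpu_alloc(), pcpu_reclaim() and free_percpu().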
@@ -176,6 +185,9 @@ static bool pcpu_chunk_page_occupied(struct pcpu_chunk *chunk,
  * kzalloc() is used; otherwise, vmalloc() is used. The returned
  * memory is always zeroed.
  *
+ * CONTEXT:
+ * Does GFP_KERNEL allocation.
+ *
  * RETURNS:
  * Pointer to the allocated area on success, NULL on failure.
  */
@@ -215,6 +227,9 @@ static void pcpu_mem_free(void *ptr, size_t size)
  * New slot according to the changed state is determined and @chunk is
  * moved to the slot. Note that the reserved chunk is never put on
  * chunk slots.
+ *
+ * CONTEXT:
+ * pcpu_lock.
  */
 static void pcpu_chunk_relocate(struct pcpu_chunk *chunk, int oslot)
 {
@@ -260,6 +275,9 @@ static struct rb_node **pcpu_chunk_rb_search(void *addr,
  * searchs for the chunk with the highest start address which isn't
  * beyond @addr.
  *
+ * CONTEXT:
+ * pcpu_lock.
+ *
  * RETURNS:
  * The address of the found chunk.
  */
@@ -300,6 +318,9 @@ static struct pcpu_chunk *pcpu_chunk_addr_search(void *addr)
  * @new: chunk to insert
  *
  * Insert @new into address rb tree.
+ *
+ * CONTEXT:
+ * pcpu_lock.
  */
 static void pcpu_chunk_addr_insert(struct pcpu_chunk *new)
 {
@@ -319,6 +340,10 @@ static void pcpu_chunk_addr_insert(struct pcpu_chunk *new)
  * A single allocation can split an area into three areas, so this
  * function makes sure that @chunk->map has at least two extra slots.
  *
+ * CONTEXT:
+ * pcpu_alloc_mutex, pcpu_lock. pcpu_lock is released and reacquired
+ * if area map is extended.
+ *
  * RETURNS:
  * 0 if noop, 1 if successfully extended, -errno on failure.
  */
@@ -332,13 +357,25 @@ static int pcpu_extend_area_map(struct pcpu_chunk *chunk)
         if (chunk->map_alloc >= chunk->map_used + 2)
                 return 0;
 
+        spin_unlock_irq(&pcpu_lock);
+
         new_alloc = PCPU_DFL_MAP_ALLOC;
         while (new_alloc < chunk->map_used + 2)
                 new_alloc *= 2;
 
         new = pcpu_mem_alloc(new_alloc * sizeof(new[0]));
-        if (!new)
+        if (!new) {
+                spin_lock_irq(&pcpu_lock);
                 return -ENOMEM;
+        }
+
+        /*
+         * Acquire pcpu_lock and switch to new area map. Only free
+         * could have happened in between, so map_used couldn't have
+         * grown.
+         */
+        spin_lock_irq(&pcpu_lock);
+        BUG_ON(new_alloc < chunk->map_used + 2);
 
         size = chunk->map_alloc * sizeof(chunk->map[0]);
         memcpy(new, chunk->map, size);
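pcpu_extend_area_map() above is the "unlock, allocate, relock, revalidate" step: the spinlock cannot be held across a GFP_KERNEL allocation, so it is dropped, and the assumption that the map could not have grown in the meantime is asserted after relocking. A rough sketch of the same step, not part of the patch, with hypothetical names (table_lock, table, grow_table) and the same only-frees-can-race assumption:

/* Illustrative sketch only -- not part of the patch. */
#include <linux/bug.h>
#include <linux/errno.h>
#include <linux/slab.h>
#include <linux/spinlock.h>
#include <linux/string.h>

static DEFINE_SPINLOCK(table_lock);
static int *table;                      /* starts NULL; kfree(NULL) is fine */
static int table_used, table_alloc;

/* Called with table_lock held; drops and re-takes it if it has to grow. */
static int grow_table(void)
{
        int new_alloc, *new;

        if (table_alloc >= table_used + 2)
                return 0;                       /* nothing to do */

        spin_unlock_irq(&table_lock);           /* can't sleep under the spinlock */

        new_alloc = 16;
        while (new_alloc < table_used + 2)
                new_alloc *= 2;

        new = kcalloc(new_alloc, sizeof(*new), GFP_KERNEL);
        if (!new) {
                spin_lock_irq(&table_lock);
                return -ENOMEM;
        }

        spin_lock_irq(&table_lock);
        /*
         * Assumption mirrored from the comment above: only frees could
         * have run while the lock was dropped, so table_used cannot
         * have grown and the new array is still large enough.
         */
        BUG_ON(new_alloc < table_used + 2);

        memcpy(new, table, table_used * sizeof(*new));
        kfree(table);
        table = new;
        table_alloc = new_alloc;
        return 1;                               /* extended; the lock was dropped */
}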
@@ -371,6 +408,9 @@ static int pcpu_extend_area_map(struct pcpu_chunk *chunk)
  * is inserted after the target block.
  *
  * @chunk->map must have enough free slots to accomodate the split.
+ *
+ * CONTEXT:
+ * pcpu_lock.
  */
 static void pcpu_split_block(struct pcpu_chunk *chunk, int i,
                              int head, int tail)
@@ -406,6 +446,9 @@ static void pcpu_split_block(struct pcpu_chunk *chunk, int i,
  *
  * @chunk->map must have at least two free slots.
  *
+ * CONTEXT:
+ * pcpu_lock.
+ *
  * RETURNS:
  * Allocated offset in @chunk on success, -1 if no matching area is
  * found.
@@ -495,6 +538,9 @@ static int pcpu_alloc_area(struct pcpu_chunk *chunk, int size, int align)
  * Free area starting from @freeme to @chunk. Note that this function
  * only modifies the allocation map. It doesn't depopulate or unmap
  * the area.
+ *
+ * CONTEXT:
+ * pcpu_lock.
  */
 static void pcpu_free_area(struct pcpu_chunk *chunk, int freeme)
 {
@@ -580,6 +626,9 @@ static void pcpu_unmap(struct pcpu_chunk *chunk, int page_start, int page_end,
  * For each cpu, depopulate and unmap pages [@page_start,@page_end)
  * from @chunk. If @flush is true, vcache is flushed before unmapping
  * and tlb after.
+ *
+ * CONTEXT:
+ * pcpu_alloc_mutex.
  */
 static void pcpu_depopulate_chunk(struct pcpu_chunk *chunk, int off, int size,
                                   bool flush)
@@ -658,6 +707,9 @@ static int pcpu_map(struct pcpu_chunk *chunk, int page_start, int page_end)
  *
  * For each cpu, populate and map pages [@page_start,@page_end) into
  * @chunk. The area is cleared on return.
+ *
+ * CONTEXT:
+ * pcpu_alloc_mutex, does GFP_KERNEL allocation.
  */
 static int pcpu_populate_chunk(struct pcpu_chunk *chunk, int off, int size)
 {
@@ -748,15 +800,16 @@ static struct pcpu_chunk *alloc_pcpu_chunk(void)
  * @align: alignment of area (max PAGE_SIZE)
  * @reserved: allocate from the reserved chunk if available
  *
- * Allocate percpu area of @size bytes aligned at @align. Might
- * sleep. Might trigger writeouts.
+ * Allocate percpu area of @size bytes aligned at @align.
+ *
+ * CONTEXT:
+ * Does GFP_KERNEL allocation.
  *
  * RETURNS:
  * Percpu pointer to the allocated area on success, NULL on failure.
  */
 static void *pcpu_alloc(size_t size, size_t align, bool reserved)
 {
-        void *ptr = NULL;
         struct pcpu_chunk *chunk;
         int slot, off;
 
@@ -766,27 +819,37 @@ static void *pcpu_alloc(size_t size, size_t align, bool reserved)
                 return NULL;
         }
 
-        mutex_lock(&pcpu_mutex);
+        mutex_lock(&pcpu_alloc_mutex);
+        spin_lock_irq(&pcpu_lock);
 
         /* serve reserved allocations from the reserved chunk if available */
         if (reserved && pcpu_reserved_chunk) {
                 chunk = pcpu_reserved_chunk;
                 if (size > chunk->contig_hint ||
                     pcpu_extend_area_map(chunk) < 0)
-                        goto out_unlock;
+                        goto fail_unlock;
                 off = pcpu_alloc_area(chunk, size, align);
                 if (off >= 0)
                         goto area_found;
-                goto out_unlock;
+                goto fail_unlock;
         }
 
+restart:
         /* search through normal chunks */
         for (slot = pcpu_size_to_slot(size); slot < pcpu_nr_slots; slot++) {
                 list_for_each_entry(chunk, &pcpu_slot[slot], list) {
                         if (size > chunk->contig_hint)
                                 continue;
-                        if (pcpu_extend_area_map(chunk) < 0)
-                                goto out_unlock;
+
+                        switch (pcpu_extend_area_map(chunk)) {
+                        case 0:
+                                break;
+                        case 1:
+                                goto restart;   /* pcpu_lock dropped, restart */
+                        default:
+                                goto fail_unlock;
+                        }
+
                         off = pcpu_alloc_area(chunk, size, align);
                         if (off >= 0)
                                 goto area_found;
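The switch above gives pcpu_extend_area_map()'s three-way return value its caller-side meaning: 0 keeps the scan going, 1 means pcpu_lock was dropped so the chunk lists may have changed and the scan restarts from the top, and a negative value aborts. A compact sketch of that restart discipline, not part of the patch, with hypothetical names (pool_lock, pools, pool_grow, pool_carve) and stubbed helpers:

/* Illustrative sketch only -- not part of the patch. */
#include <linux/list.h>
#include <linux/spinlock.h>
#include <linux/types.h>

struct pool {
        struct list_head list;
        size_t free_bytes;
};

static DEFINE_SPINLOCK(pool_lock);
static LIST_HEAD(pools);

/* Stand-in for pcpu_extend_area_map(): 0 = nothing done, 1 = grown
 * (pool_lock was dropped and re-taken), negative = error. */
static int pool_grow(struct pool *p)
{
        return 0;       /* stub: pretend the map never needs growing */
}

/* Stand-in for pcpu_alloc_area(): offset on success, -1 on failure. */
static long pool_carve(struct pool *p, size_t size)
{
        if (p->free_bytes < size)
                return -1;
        p->free_bytes -= size;
        return 0;
}

static long pool_alloc(size_t size)
{
        struct pool *p;
        long off;

        spin_lock_irq(&pool_lock);
restart:
        list_for_each_entry(p, &pools, list) {
                switch (pool_grow(p)) {
                case 0:
                        break;          /* lock held throughout, iteration still valid */
                case 1:
                        goto restart;   /* lock was dropped: rescan from the top */
                default:
                        goto fail;      /* -errno */
                }

                off = pool_carve(p, size);
                if (off >= 0) {
                        spin_unlock_irq(&pool_lock);
                        return off;
                }
        }
fail:
        spin_unlock_irq(&pool_lock);
        return -1;
}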
@@ -794,27 +857,36 @@ static void *pcpu_alloc(size_t size, size_t align, bool reserved)
         }
 
         /* hmmm... no space left, create a new chunk */
+        spin_unlock_irq(&pcpu_lock);
+
         chunk = alloc_pcpu_chunk();
         if (!chunk)
-                goto out_unlock;
+                goto fail_unlock_mutex;
+
+        spin_lock_irq(&pcpu_lock);
         pcpu_chunk_relocate(chunk, -1);
         pcpu_chunk_addr_insert(chunk);
-
-        off = pcpu_alloc_area(chunk, size, align);
-        if (off < 0)
-                goto out_unlock;
+        goto restart;
 
 area_found:
+        spin_unlock_irq(&pcpu_lock);
+
         /* populate, map and clear the area */
         if (pcpu_populate_chunk(chunk, off, size)) {
+                spin_lock_irq(&pcpu_lock);
                 pcpu_free_area(chunk, off);
-                goto out_unlock;
+                goto fail_unlock;
         }
 
-        ptr = __addr_to_pcpu_ptr(chunk->vm->addr + off);
-out_unlock:
-        mutex_unlock(&pcpu_mutex);
-        return ptr;
+        mutex_unlock(&pcpu_alloc_mutex);
+
+        return __addr_to_pcpu_ptr(chunk->vm->addr + off);
+
+fail_unlock:
+        spin_unlock_irq(&pcpu_lock);
+fail_unlock_mutex:
+        mutex_unlock(&pcpu_alloc_mutex);
+        return NULL;
 }
 
 /**
@@ -825,6 +897,9 @@ out_unlock:
  * Allocate percpu area of @size bytes aligned at @align. Might
  * sleep. Might trigger writeouts.
  *
+ * CONTEXT:
+ * Does GFP_KERNEL allocation.
+ *
  * RETURNS:
  * Percpu pointer to the allocated area on success, NULL on failure.
  */
@@ -843,6 +918,9 @@ EXPORT_SYMBOL_GPL(__alloc_percpu);
  * percpu area if arch has set it up; otherwise, allocation is served
  * from the same dynamic area. Might sleep. Might trigger writeouts.
  *
+ * CONTEXT:
+ * Does GFP_KERNEL allocation.
+ *
  * RETURNS:
  * Percpu pointer to the allocated area on success, NULL on failure.
  */
@@ -856,6 +934,9 @@ void *__alloc_reserved_percpu(size_t size, size_t align)
  * @work: unused
  *
  * Reclaim all fully free chunks except for the first one.
+ *
+ * CONTEXT:
+ * workqueue context.
  */
 static void pcpu_reclaim(struct work_struct *work)
 {
@@ -863,7 +944,8 @@ static void pcpu_reclaim(struct work_struct *work)
         struct list_head *head = &pcpu_slot[pcpu_nr_slots - 1];
         struct pcpu_chunk *chunk, *next;
 
-        mutex_lock(&pcpu_mutex);
+        mutex_lock(&pcpu_alloc_mutex);
+        spin_lock_irq(&pcpu_lock);
 
         list_for_each_entry_safe(chunk, next, head, list) {
                 WARN_ON(chunk->immutable);
@@ -876,7 +958,8 @@ static void pcpu_reclaim(struct work_struct *work)
                 list_move(&chunk->list, &todo);
         }
 
-        mutex_unlock(&pcpu_mutex);
+        spin_unlock_irq(&pcpu_lock);
+        mutex_unlock(&pcpu_alloc_mutex);
 
         list_for_each_entry_safe(chunk, next, &todo, list) {
                 pcpu_depopulate_chunk(chunk, 0, pcpu_unit_size, false);
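pcpu_reclaim() shows the reclaim half of the rules: both locks are held only long enough to unlink the victims onto a private list, and the expensive depopulate/free then runs with no locks held. A stripped-down sketch of the same shape, not part of the patch; reclaim_mutex, index_lock, struct item and free_items are hypothetical:

/* Illustrative sketch only -- not part of the patch. */
#include <linux/list.h>
#include <linux/mutex.h>
#include <linux/slab.h>
#include <linux/spinlock.h>

struct item {
        struct list_head list;
};

static DEFINE_MUTEX(reclaim_mutex);             /* alloc/reclaim path */
static DEFINE_SPINLOCK(index_lock);             /* index data structures */
static LIST_HEAD(free_items);                   /* fully free, reclaimable items */

static void reclaim_items(void)
{
        LIST_HEAD(todo);
        struct item *it, *next;

        /* both locks are needed to pull items out of circulation */
        mutex_lock(&reclaim_mutex);
        spin_lock_irq(&index_lock);

        list_for_each_entry_safe(it, next, &free_items, list)
                list_move(&it->list, &todo);

        spin_unlock_irq(&index_lock);
        mutex_unlock(&reclaim_mutex);

        /* the expensive teardown runs with no locks held */
        list_for_each_entry_safe(it, next, &todo, list) {
                list_del(&it->list);
                kfree(it);
        }
}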
@@ -888,18 +971,22 @@ static void pcpu_reclaim(struct work_struct *work)
  * free_percpu - free percpu area
  * @ptr: pointer to area to free
  *
- * Free percpu area @ptr. Might sleep.
+ * Free percpu area @ptr.
+ *
+ * CONTEXT:
+ * Can be called from atomic context.
  */
 void free_percpu(void *ptr)
 {
         void *addr = __pcpu_ptr_to_addr(ptr);
         struct pcpu_chunk *chunk;
+        unsigned long flags;
         int off;
 
         if (!ptr)
                 return;
 
-        mutex_lock(&pcpu_mutex);
+        spin_lock_irqsave(&pcpu_lock, flags);
 
         chunk = pcpu_chunk_addr_search(addr);
         off = addr - chunk->vm->addr;
@@ -917,7 +1004,7 @@ void free_percpu(void *ptr)
                 }
         }
 
-        mutex_unlock(&pcpu_mutex);
+        spin_unlock_irqrestore(&pcpu_lock, flags);
 }
 EXPORT_SYMBOL_GPL(free_percpu);
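With free_percpu() now taking only pcpu_lock with interrupts saved, percpu areas may be freed from atomic context; returning the backing pages to the system is deferred to pcpu_reclaim() in workqueue context. A hypothetical caller (obj_lock, drop_counters) relying on that:

/* Hypothetical caller -- not part of the patch. */
#include <linux/percpu.h>
#include <linux/spinlock.h>

static DEFINE_SPINLOCK(obj_lock);

static void drop_counters(void **pcpu_ctr)
{
        unsigned long flags;
        void *ctr;

        spin_lock_irqsave(&obj_lock, flags);
        ctr = *pcpu_ctr;
        *pcpu_ctr = NULL;
        free_percpu(ctr);       /* now safe with a spinlock held / IRQs off */
        spin_unlock_irqrestore(&obj_lock, flags);
}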