percpu.c
  1. /*
  2. * linux/mm/percpu.c - percpu memory allocator
  3. *
  4. * Copyright (C) 2009 SUSE Linux Products GmbH
  5. * Copyright (C) 2009 Tejun Heo <tj@kernel.org>
  6. *
  7. * This file is released under the GPLv2.
  8. *
  9. * This is the percpu allocator which can handle both static and dynamic
  10. * areas. Percpu areas are allocated in chunks in the vmalloc area. Each
  11. * chunk consists of nr_cpu_ids units and the first chunk is used
  12. * for static percpu variables in the kernel image (special boot time
  13. * alloc/init handling is necessary as these areas need to be brought up
  14. * before allocation services are running). Units grow as necessary
  15. * and all units grow or shrink in unison. When a chunk is filled up,
  16. * another chunk is allocated, i.e. in the vmalloc area:
  17. *
  18. *  c0                           c1                         c2
  19. *  -------------------          -------------------        ------------
  20. * | u0 | u1 | u2 | u3 |        | u0 | u1 | u2 | u3 |      | u0 | u1 | u
  21. *  -------------------  ......  -------------------  ....  ------------
  22. *
  23. * Allocation is done in offset-size areas of a single unit's space. I.e.,
  24. * an area of 512 bytes at 6k in c1 occupies 512 bytes at 6k of c1:u0,
  25. * c1:u1, c1:u2 and c1:u3. Percpu access can be done by configuring
  26. * percpu base registers pcpu_unit_size apart.
  27. *
  28. * There are usually many small percpu allocations, many of them as
  29. * small as 4 bytes. The allocator organizes chunks into lists
  30. * according to free size and tries to allocate from the fullest one.
  31. * Each chunk keeps the maximum contiguous area size hint which is
  32. * guaranteed to be equal to or larger than the maximum contiguous
  33. * area in the chunk. This helps the allocator not to iterate the
  34. * chunk maps unnecessarily.
  35. *
  36. * Allocation state in each chunk is kept using an array of integers
  37. * on chunk->map. A positive value in the map represents a free
  38. * region and a negative value an allocated one. Allocation inside a chunk is done
  39. * by scanning this map sequentially and serving the first matching
  40. * entry. This is mostly copied from the percpu_modalloc() allocator.
  41. * Chunks can be determined from the address using the index field
  42. * in the page struct. The index field contains a pointer to the chunk.
  43. *
  44. * To use this allocator, arch code should do the following (see the sketch below):
  45. *
  46. * - define CONFIG_HAVE_DYNAMIC_PER_CPU_AREA
  47. *
  48. * - define __addr_to_pcpu_ptr() and __pcpu_ptr_to_addr() to translate
  49. * regular address to percpu pointer and back if they need to be
  50. * different from the default
  51. *
  52. * - use pcpu_setup_first_chunk() during percpu area initialization to
  53. * setup the first chunk containing the kernel static percpu area
  54. */
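/*
 * A minimal sketch of the arch side, for illustration only. It assumes
 * the arch can use the embedding helper at the bottom of this file and
 * that it exposes __per_cpu_offset[]; setup_per_cpu_areas() and the
 * PERCPU_MODULE_RESERVE sizing are assumptions, not requirements:
 *
 *	void __init setup_per_cpu_areas(void)
 *	{
 *		ssize_t unit_size;
 *		unsigned long delta;
 *		unsigned int cpu;
 *
 *		unit_size = pcpu_embed_first_chunk(__per_cpu_end -
 *						   __per_cpu_start,
 *						   PERCPU_MODULE_RESERVE,
 *						   -1, -1);
 *		if (unit_size < 0)
 *			panic("percpu: first chunk setup failed");
 *
 *		delta = (unsigned long)pcpu_base_addr -
 *			(unsigned long)__per_cpu_start;
 *		for_each_possible_cpu(cpu)
 *			__per_cpu_offset[cpu] = delta + cpu * unit_size;
 *	}
 */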
  55. #include <linux/bitmap.h>
  56. #include <linux/bootmem.h>
  57. #include <linux/list.h>
  58. #include <linux/mm.h>
  59. #include <linux/module.h>
  60. #include <linux/mutex.h>
  61. #include <linux/percpu.h>
  62. #include <linux/pfn.h>
  63. #include <linux/slab.h>
  64. #include <linux/spinlock.h>
  65. #include <linux/vmalloc.h>
  66. #include <linux/workqueue.h>
  67. #include <asm/cacheflush.h>
  68. #include <asm/sections.h>
  69. #include <asm/tlbflush.h>
  70. #define PCPU_SLOT_BASE_SHIFT 5 /* 1-31 shares the same slot */
  71. #define PCPU_DFL_MAP_ALLOC 16 /* start a map with 16 ents */
  72. /* default addr <-> pcpu_ptr mapping, override in asm/percpu.h if necessary */
  73. #ifndef __addr_to_pcpu_ptr
  74. #define __addr_to_pcpu_ptr(addr) \
  75. (void *)((unsigned long)(addr) - (unsigned long)pcpu_base_addr \
  76. + (unsigned long)__per_cpu_start)
  77. #endif
  78. #ifndef __pcpu_ptr_to_addr
  79. #define __pcpu_ptr_to_addr(ptr) \
  80. (void *)((unsigned long)(ptr) + (unsigned long)pcpu_base_addr \
  81. - (unsigned long)__per_cpu_start)
  82. #endif
  83. struct pcpu_chunk {
  84. struct list_head list; /* linked to pcpu_slot lists */
  85. int free_size; /* free bytes in the chunk */
  86. int contig_hint; /* max contiguous size hint */
  87. struct vm_struct *vm; /* mapped vmalloc region */
  88. int map_used; /* # of map entries used */
  89. int map_alloc; /* # of map entries allocated */
  90. int *map; /* allocation map */
  91. bool immutable; /* no [de]population allowed */
  92. struct page **page; /* points to page array */
  93. struct page *page_ar[]; /* #cpus * UNIT_PAGES */
  94. };
  95. static int pcpu_unit_pages __read_mostly;
  96. static int pcpu_unit_size __read_mostly;
  97. static int pcpu_chunk_size __read_mostly;
  98. static int pcpu_nr_slots __read_mostly;
  99. static size_t pcpu_chunk_struct_size __read_mostly;
  100. /* the address of the first chunk which starts with the kernel static area */
  101. void *pcpu_base_addr __read_mostly;
  102. EXPORT_SYMBOL_GPL(pcpu_base_addr);
  103. /*
  104. * The first chunk which always exists. Note that unlike other
  105. * chunks, this one can be allocated and mapped in several different
  106. * ways and thus often doesn't live in the vmalloc area.
  107. */
  108. static struct pcpu_chunk *pcpu_first_chunk;
  109. /*
  110. * Optional reserved chunk. This chunk reserves part of the first
  111. * chunk and serves it for reserved allocations. The amount of
  112. * reserved offset is in pcpu_reserved_chunk_limit. When reserved
  113. * area doesn't exist, the following variables contain NULL and 0
  114. * respectively.
  115. */
  116. static struct pcpu_chunk *pcpu_reserved_chunk;
  117. static int pcpu_reserved_chunk_limit;
  118. /*
  119. * Synchronization rules.
  120. *
  121. * There are two locks - pcpu_alloc_mutex and pcpu_lock. The former
  122. * protects allocation/reclaim paths, chunks and chunk->page arrays.
  123. * The latter is a spinlock and protects the index data structures -
  124. * chunk slots, chunks and area maps in chunks.
  125. *
  126. * During allocation, pcpu_alloc_mutex is kept locked all the time and
  127. * pcpu_lock is grabbed and released as necessary. All actual memory
  128. * allocations are done using GFP_KERNEL with pcpu_lock released.
  129. *
  130. * Free path accesses and alters only the index data structures, so it
  131. * can be safely called from atomic context. When memory needs to be
  132. * returned to the system, free path schedules reclaim_work which
  133. * grabs both pcpu_alloc_mutex and pcpu_lock, unlinks chunks to be
  134. * reclaimed, release both locks and frees the chunks. Note that it's
  135. * necessary to grab both locks to remove a chunk from circulation as
  136. * allocation path might be referencing the chunk with only
  137. * pcpu_alloc_mutex locked.
  138. */
  139. static DEFINE_MUTEX(pcpu_alloc_mutex); /* protects whole alloc and reclaim */
  140. static DEFINE_SPINLOCK(pcpu_lock); /* protects index data structures */
  141. static struct list_head *pcpu_slot __read_mostly; /* chunk list slots */
  142. /* reclaim work to release fully free chunks, scheduled from free path */
  143. static void pcpu_reclaim(struct work_struct *work);
  144. static DECLARE_WORK(pcpu_reclaim_work, pcpu_reclaim);
  145. static int __pcpu_size_to_slot(int size)
  146. {
  147. int highbit = fls(size); /* size is in bytes */
  148. return max(highbit - PCPU_SLOT_BASE_SHIFT + 2, 1);
  149. }
  150. static int pcpu_size_to_slot(int size)
  151. {
  152. if (size == pcpu_unit_size)
  153. return pcpu_nr_slots - 1;
  154. return __pcpu_size_to_slot(size);
  155. }
  156. static int pcpu_chunk_slot(const struct pcpu_chunk *chunk)
  157. {
  158. if (chunk->free_size < sizeof(int) || chunk->contig_hint < sizeof(int))
  159. return 0;
  160. return pcpu_size_to_slot(chunk->free_size);
  161. }
  162. static int pcpu_page_idx(unsigned int cpu, int page_idx)
  163. {
  164. return cpu * pcpu_unit_pages + page_idx;
  165. }
  166. static struct page **pcpu_chunk_pagep(struct pcpu_chunk *chunk,
  167. unsigned int cpu, int page_idx)
  168. {
  169. return &chunk->page[pcpu_page_idx(cpu, page_idx)];
  170. }
  171. static unsigned long pcpu_chunk_addr(struct pcpu_chunk *chunk,
  172. unsigned int cpu, int page_idx)
  173. {
  174. return (unsigned long)chunk->vm->addr +
  175. (pcpu_page_idx(cpu, page_idx) << PAGE_SHIFT);
  176. }
  177. static bool pcpu_chunk_page_occupied(struct pcpu_chunk *chunk,
  178. int page_idx)
  179. {
  180. /*
  181. * Any possible cpu id can be used here, so there's no need to
  182. * worry about preemption or cpu hotplug.
  183. */
  184. return *pcpu_chunk_pagep(chunk, raw_smp_processor_id(),
  185. page_idx) != NULL;
  186. }
  187. /* set the pointer to a chunk in a page struct */
  188. static void pcpu_set_page_chunk(struct page *page, struct pcpu_chunk *pcpu)
  189. {
  190. page->index = (unsigned long)pcpu;
  191. }
  192. /* obtain pointer to a chunk from a page struct */
  193. static struct pcpu_chunk *pcpu_get_page_chunk(struct page *page)
  194. {
  195. return (struct pcpu_chunk *)page->index;
  196. }
  197. /**
  198. * pcpu_mem_alloc - allocate memory
  199. * @size: bytes to allocate
  200. *
  201. * Allocate @size bytes. If @size is no larger than PAGE_SIZE,
  202. * kzalloc() is used; otherwise, vmalloc() is used. The returned
  203. * memory is always zeroed.
  204. *
  205. * CONTEXT:
  206. * Does GFP_KERNEL allocation.
  207. *
  208. * RETURNS:
  209. * Pointer to the allocated area on success, NULL on failure.
  210. */
  211. static void *pcpu_mem_alloc(size_t size)
  212. {
  213. if (size <= PAGE_SIZE)
  214. return kzalloc(size, GFP_KERNEL);
  215. else {
  216. void *ptr = vmalloc(size);
  217. if (ptr)
  218. memset(ptr, 0, size);
  219. return ptr;
  220. }
  221. }
  222. /**
  223. * pcpu_mem_free - free memory
  224. * @ptr: memory to free
  225. * @size: size of the area
  226. *
  227. * Free @ptr. @ptr should have been allocated using pcpu_mem_alloc().
  228. */
  229. static void pcpu_mem_free(void *ptr, size_t size)
  230. {
  231. if (size <= PAGE_SIZE)
  232. kfree(ptr);
  233. else
  234. vfree(ptr);
  235. }
  236. /**
  237. * pcpu_chunk_relocate - put chunk in the appropriate chunk slot
  238. * @chunk: chunk of interest
  239. * @oslot: the previous slot it was on
  240. *
  241. * This function is called after an allocation or free changed @chunk.
  242. * New slot according to the changed state is determined and @chunk is
  243. * moved to the slot. Note that the reserved chunk is never put on
  244. * chunk slots.
  245. *
  246. * CONTEXT:
  247. * pcpu_lock.
  248. */
  249. static void pcpu_chunk_relocate(struct pcpu_chunk *chunk, int oslot)
  250. {
  251. int nslot = pcpu_chunk_slot(chunk);
  252. if (chunk != pcpu_reserved_chunk && oslot != nslot) {
  253. if (oslot < nslot)
  254. list_move(&chunk->list, &pcpu_slot[nslot]);
  255. else
  256. list_move_tail(&chunk->list, &pcpu_slot[nslot]);
  257. }
  258. }
  259. /**
  260. * pcpu_chunk_addr_search - determine chunk containing specified address
  261. * @addr: address for which the chunk needs to be determined.
  262. *
  263. * RETURNS:
  264. * The address of the found chunk.
  265. */
  266. static struct pcpu_chunk *pcpu_chunk_addr_search(void *addr)
  267. {
  268. void *first_start = pcpu_first_chunk->vm->addr;
  269. /* is it in the first chunk? */
  270. if (addr >= first_start && addr < first_start + pcpu_chunk_size) {
  271. /* is it in the reserved area? */
  272. if (addr < first_start + pcpu_reserved_chunk_limit)
  273. return pcpu_reserved_chunk;
  274. return pcpu_first_chunk;
  275. }
  276. /*
  277. * The address is relative to unit0 which might be unused and
  278. * thus unmapped. Offset the address to the unit space of the
  279. * current processor before looking it up in the vmalloc
  280. * space. Note that any possible cpu id can be used here, so
  281. * there's no need to worry about preemption or cpu hotplug.
  282. */
  283. addr += raw_smp_processor_id() * pcpu_unit_size;
  284. return pcpu_get_page_chunk(vmalloc_to_page(addr));
  285. }
  286. /**
  287. * pcpu_extend_area_map - extend area map for allocation
  288. * @chunk: target chunk
  289. *
  290. * Extend the area map of @chunk so that it can accommodate an allocation.
  291. * A single allocation can split an area into three areas, so this
  292. * function makes sure that @chunk->map has at least two extra slots.
  293. *
  294. * CONTEXT:
  295. * pcpu_alloc_mutex, pcpu_lock. pcpu_lock is released and reacquired
  296. * if area map is extended.
  297. *
  298. * RETURNS:
  299. * 0 if noop, 1 if successfully extended, -errno on failure.
  300. */
  301. static int pcpu_extend_area_map(struct pcpu_chunk *chunk)
  302. {
  303. int new_alloc;
  304. int *new;
  305. size_t size;
  306. /* has enough? */
  307. if (chunk->map_alloc >= chunk->map_used + 2)
  308. return 0;
  309. spin_unlock_irq(&pcpu_lock);
  310. new_alloc = PCPU_DFL_MAP_ALLOC;
  311. while (new_alloc < chunk->map_used + 2)
  312. new_alloc *= 2;
  313. new = pcpu_mem_alloc(new_alloc * sizeof(new[0]));
  314. if (!new) {
  315. spin_lock_irq(&pcpu_lock);
  316. return -ENOMEM;
  317. }
  318. /*
  319. * Acquire pcpu_lock and switch to new area map. Only free
  320. * could have happened in between, so map_used couldn't have
  321. * grown.
  322. */
  323. spin_lock_irq(&pcpu_lock);
  324. BUG_ON(new_alloc < chunk->map_used + 2);
  325. size = chunk->map_alloc * sizeof(chunk->map[0]);
  326. memcpy(new, chunk->map, size);
  327. /*
  328. * map_alloc < PCPU_DFL_MAP_ALLOC indicates that the chunk is
  329. * one of the first chunks and still using static map.
  330. */
  331. if (chunk->map_alloc >= PCPU_DFL_MAP_ALLOC)
  332. pcpu_mem_free(chunk->map, size);
  333. chunk->map_alloc = new_alloc;
  334. chunk->map = new;
  335. return 1; /* pcpu_lock was dropped and the map extended */
  336. }
  337. /**
  338. * pcpu_split_block - split a map block
  339. * @chunk: chunk of interest
  340. * @i: index of map block to split
  341. * @head: head size in bytes (can be 0)
  342. * @tail: tail size in bytes (can be 0)
  343. *
  344. * Split the @i'th map block into two or three blocks. If @head is
  345. * non-zero, @head bytes block is inserted before block @i moving it
  346. * to @i+1 and reducing its size by @head bytes.
  347. *
  348. * If @tail is non-zero, the target block, which can be @i or @i+1
  349. * depending on @head, is reduced by @tail bytes and @tail byte block
  350. * is inserted after the target block.
  351. *
  352. * @chunk->map must have enough free slots to accommodate the split.
  353. *
  354. * CONTEXT:
  355. * pcpu_lock.
  356. */
  357. static void pcpu_split_block(struct pcpu_chunk *chunk, int i,
  358. int head, int tail)
  359. {
  360. int nr_extra = !!head + !!tail;
  361. BUG_ON(chunk->map_alloc < chunk->map_used + nr_extra);
  362. /* insert new subblocks */
  363. memmove(&chunk->map[i + nr_extra], &chunk->map[i],
  364. sizeof(chunk->map[0]) * (chunk->map_used - i));
  365. chunk->map_used += nr_extra;
  366. if (head) {
  367. chunk->map[i + 1] = chunk->map[i] - head;
  368. chunk->map[i++] = head;
  369. }
  370. if (tail) {
  371. chunk->map[i++] -= tail;
  372. chunk->map[i] = tail;
  373. }
  374. }
  375. /**
  376. * pcpu_alloc_area - allocate area from a pcpu_chunk
  377. * @chunk: chunk of interest
  378. * @size: wanted size in bytes
  379. * @align: wanted align
  380. *
  381. * Try to allocate @size bytes area aligned at @align from @chunk.
  382. * Note that this function only allocates the offset. It doesn't
  383. * populate or map the area.
  384. *
  385. * @chunk->map must have at least two free slots.
  386. *
  387. * CONTEXT:
  388. * pcpu_lock.
  389. *
  390. * RETURNS:
  391. * Allocated offset in @chunk on success, -1 if no matching area is
  392. * found.
  393. */
  394. static int pcpu_alloc_area(struct pcpu_chunk *chunk, int size, int align)
  395. {
  396. int oslot = pcpu_chunk_slot(chunk);
  397. int max_contig = 0;
  398. int i, off;
  399. for (i = 0, off = 0; i < chunk->map_used; off += abs(chunk->map[i++])) {
  400. bool is_last = i + 1 == chunk->map_used;
  401. int head, tail;
  402. /* extra for alignment requirement */
  403. head = ALIGN(off, align) - off;
  404. BUG_ON(i == 0 && head != 0);
  405. if (chunk->map[i] < 0)
  406. continue;
  407. if (chunk->map[i] < head + size) {
  408. max_contig = max(chunk->map[i], max_contig);
  409. continue;
  410. }
  411. /*
  412. * If head is small or the previous block is free,
  413. * merge them. Note that 'small' is defined as smaller
  414. * than sizeof(int), which is very small but isn't too
  415. * uncommon for percpu allocations.
  416. */
  417. if (head && (head < sizeof(int) || chunk->map[i - 1] > 0)) {
  418. if (chunk->map[i - 1] > 0)
  419. chunk->map[i - 1] += head;
  420. else {
  421. chunk->map[i - 1] -= head;
  422. chunk->free_size -= head;
  423. }
  424. chunk->map[i] -= head;
  425. off += head;
  426. head = 0;
  427. }
  428. /* if tail is small, just keep it around */
  429. tail = chunk->map[i] - head - size;
  430. if (tail < sizeof(int))
  431. tail = 0;
  432. /* split if warranted */
  433. if (head || tail) {
  434. pcpu_split_block(chunk, i, head, tail);
  435. if (head) {
  436. i++;
  437. off += head;
  438. max_contig = max(chunk->map[i - 1], max_contig);
  439. }
  440. if (tail)
  441. max_contig = max(chunk->map[i + 1], max_contig);
  442. }
  443. /* update hint and mark allocated */
  444. if (is_last)
  445. chunk->contig_hint = max_contig; /* fully scanned */
  446. else
  447. chunk->contig_hint = max(chunk->contig_hint,
  448. max_contig);
  449. chunk->free_size -= chunk->map[i];
  450. chunk->map[i] = -chunk->map[i];
  451. pcpu_chunk_relocate(chunk, oslot);
  452. return off;
  453. }
  454. chunk->contig_hint = max_contig; /* fully scanned */
  455. pcpu_chunk_relocate(chunk, oslot);
  456. /* tell the upper layer that this chunk has no matching area */
  457. return -1;
  458. }
  459. /**
  460. * pcpu_free_area - free area to a pcpu_chunk
  461. * @chunk: chunk of interest
  462. * @freeme: offset of area to free
  463. *
  464. * Free area starting from @freeme to @chunk. Note that this function
  465. * only modifies the allocation map. It doesn't depopulate or unmap
  466. * the area.
  467. *
  468. * CONTEXT:
  469. * pcpu_lock.
  470. */
  471. static void pcpu_free_area(struct pcpu_chunk *chunk, int freeme)
  472. {
  473. int oslot = pcpu_chunk_slot(chunk);
  474. int i, off;
  475. for (i = 0, off = 0; i < chunk->map_used; off += abs(chunk->map[i++]))
  476. if (off == freeme)
  477. break;
  478. BUG_ON(off != freeme);
  479. BUG_ON(chunk->map[i] > 0);
  480. chunk->map[i] = -chunk->map[i];
  481. chunk->free_size += chunk->map[i];
  482. /* merge with previous? */
  483. if (i > 0 && chunk->map[i - 1] >= 0) {
  484. chunk->map[i - 1] += chunk->map[i];
  485. chunk->map_used--;
  486. memmove(&chunk->map[i], &chunk->map[i + 1],
  487. (chunk->map_used - i) * sizeof(chunk->map[0]));
  488. i--;
  489. }
  490. /* merge with next? */
  491. if (i + 1 < chunk->map_used && chunk->map[i + 1] >= 0) {
  492. chunk->map[i] += chunk->map[i + 1];
  493. chunk->map_used--;
  494. memmove(&chunk->map[i + 1], &chunk->map[i + 2],
  495. (chunk->map_used - (i + 1)) * sizeof(chunk->map[0]));
  496. }
  497. chunk->contig_hint = max(chunk->map[i], chunk->contig_hint);
  498. pcpu_chunk_relocate(chunk, oslot);
  499. }
  500. /**
  501. * pcpu_unmap - unmap pages out of a pcpu_chunk
  502. * @chunk: chunk of interest
  503. * @page_start: page index of the first page to unmap
  504. * @page_end: page index of the last page to unmap + 1
  505. * @flush_tlb: whether to flush tlb or not
  506. *
  507. * For each cpu, unmap pages [@page_start,@page_end) out of @chunk.
  508. * The vcache is flushed before unmapping and, if @flush_tlb is true,
  509. * the tlb is flushed afterwards.
  510. */
  511. static void pcpu_unmap(struct pcpu_chunk *chunk, int page_start, int page_end,
  512. bool flush_tlb)
  513. {
  514. unsigned int last = nr_cpu_ids - 1;
  515. unsigned int cpu;
  516. /* unmap must not be done on immutable chunk */
  517. WARN_ON(chunk->immutable);
  518. /*
  519. * Each flushing trial can be very expensive, issue flush on
  520. * the whole region at once rather than doing it for each cpu.
  521. * This could be overkill but is more scalable.
  522. */
  523. flush_cache_vunmap(pcpu_chunk_addr(chunk, 0, page_start),
  524. pcpu_chunk_addr(chunk, last, page_end));
  525. for_each_possible_cpu(cpu)
  526. unmap_kernel_range_noflush(
  527. pcpu_chunk_addr(chunk, cpu, page_start),
  528. (page_end - page_start) << PAGE_SHIFT);
  529. /* ditto as flush_cache_vunmap() */
  530. if (flush_tlb)
  531. flush_tlb_kernel_range(pcpu_chunk_addr(chunk, 0, page_start),
  532. pcpu_chunk_addr(chunk, last, page_end));
  533. }
  534. /**
  535. * pcpu_depopulate_chunk - depopulate and unmap an area of a pcpu_chunk
  536. * @chunk: chunk to depopulate
  537. * @off: offset to the area to depopulate
  538. * @size: size of the area to depopulate in bytes
  539. * @flush: whether to flush cache and tlb or not
  540. *
  541. * For each cpu, depopulate and unmap the pages covering [@off, @off +
  542. * @size) from @chunk. If @flush is true, the vcache is flushed before
  543. * unmapping and the tlb afterwards.
  544. *
  545. * CONTEXT:
  546. * pcpu_alloc_mutex.
  547. */
  548. static void pcpu_depopulate_chunk(struct pcpu_chunk *chunk, int off, int size,
  549. bool flush)
  550. {
  551. int page_start = PFN_DOWN(off);
  552. int page_end = PFN_UP(off + size);
  553. int unmap_start = -1;
  554. int uninitialized_var(unmap_end);
  555. unsigned int cpu;
  556. int i;
  557. for (i = page_start; i < page_end; i++) {
  558. for_each_possible_cpu(cpu) {
  559. struct page **pagep = pcpu_chunk_pagep(chunk, cpu, i);
  560. if (!*pagep)
  561. continue;
  562. __free_page(*pagep);
  563. /*
  564. * If it's partial depopulation, it might get
  565. * populated or depopulated again. Mark the
  566. * page gone.
  567. */
  568. *pagep = NULL;
  569. unmap_start = unmap_start < 0 ? i : unmap_start;
  570. unmap_end = i + 1;
  571. }
  572. }
  573. if (unmap_start >= 0)
  574. pcpu_unmap(chunk, unmap_start, unmap_end, flush);
  575. }
  576. /**
  577. * pcpu_map - map pages into a pcpu_chunk
  578. * @chunk: chunk of interest
  579. * @page_start: page index of the first page to map
  580. * @page_end: page index of the last page to map + 1
  581. *
  582. * For each cpu, map pages [@page_start,@page_end) into @chunk.
  583. * vcache is flushed afterwards.
  584. */
  585. static int pcpu_map(struct pcpu_chunk *chunk, int page_start, int page_end)
  586. {
  587. unsigned int last = nr_cpu_ids - 1;
  588. unsigned int cpu;
  589. int err;
  590. /* map must not be done on immutable chunk */
  591. WARN_ON(chunk->immutable);
  592. for_each_possible_cpu(cpu) {
  593. err = map_kernel_range_noflush(
  594. pcpu_chunk_addr(chunk, cpu, page_start),
  595. (page_end - page_start) << PAGE_SHIFT,
  596. PAGE_KERNEL,
  597. pcpu_chunk_pagep(chunk, cpu, page_start));
  598. if (err < 0)
  599. return err;
  600. }
  601. /* flush at once, please read comments in pcpu_unmap() */
  602. flush_cache_vmap(pcpu_chunk_addr(chunk, 0, page_start),
  603. pcpu_chunk_addr(chunk, last, page_end));
  604. return 0;
  605. }
  606. /**
  607. * pcpu_populate_chunk - populate and map an area of a pcpu_chunk
  608. * @chunk: chunk of interest
  609. * @off: offset to the area to populate
  610. * @size: size of the area to populate in bytes
  611. *
  612. * For each cpu, populate and map the pages covering [@off, @off + @size)
  613. * into @chunk. The area is cleared on return.
  614. *
  615. * CONTEXT:
  616. * pcpu_alloc_mutex, does GFP_KERNEL allocation.
  617. */
  618. static int pcpu_populate_chunk(struct pcpu_chunk *chunk, int off, int size)
  619. {
  620. const gfp_t alloc_mask = GFP_KERNEL | __GFP_HIGHMEM | __GFP_COLD;
  621. int page_start = PFN_DOWN(off);
  622. int page_end = PFN_UP(off + size);
  623. int map_start = -1;
  624. int uninitialized_var(map_end);
  625. unsigned int cpu;
  626. int i;
  627. for (i = page_start; i < page_end; i++) {
  628. if (pcpu_chunk_page_occupied(chunk, i)) {
  629. if (map_start >= 0) {
  630. if (pcpu_map(chunk, map_start, map_end))
  631. goto err;
  632. map_start = -1;
  633. }
  634. continue;
  635. }
  636. map_start = map_start < 0 ? i : map_start;
  637. map_end = i + 1;
  638. for_each_possible_cpu(cpu) {
  639. struct page **pagep = pcpu_chunk_pagep(chunk, cpu, i);
  640. *pagep = alloc_pages_node(cpu_to_node(cpu),
  641. alloc_mask, 0);
  642. if (!*pagep)
  643. goto err;
  644. pcpu_set_page_chunk(*pagep, chunk);
  645. }
  646. }
  647. if (map_start >= 0 && pcpu_map(chunk, map_start, map_end))
  648. goto err;
  649. for_each_possible_cpu(cpu)
  650. memset(chunk->vm->addr + cpu * pcpu_unit_size + off, 0,
  651. size);
  652. return 0;
  653. err:
  654. /* likely under heavy memory pressure, give memory back */
  655. pcpu_depopulate_chunk(chunk, off, size, true);
  656. return -ENOMEM;
  657. }
  658. static void free_pcpu_chunk(struct pcpu_chunk *chunk)
  659. {
  660. if (!chunk)
  661. return;
  662. if (chunk->vm)
  663. free_vm_area(chunk->vm);
  664. pcpu_mem_free(chunk->map, chunk->map_alloc * sizeof(chunk->map[0]));
  665. kfree(chunk);
  666. }
  667. static struct pcpu_chunk *alloc_pcpu_chunk(void)
  668. {
  669. struct pcpu_chunk *chunk;
  670. chunk = kzalloc(pcpu_chunk_struct_size, GFP_KERNEL);
  671. if (!chunk)
  672. return NULL;
  673. chunk->map = pcpu_mem_alloc(PCPU_DFL_MAP_ALLOC * sizeof(chunk->map[0]));
  674. chunk->map_alloc = PCPU_DFL_MAP_ALLOC;
  675. chunk->map[chunk->map_used++] = pcpu_unit_size;
  676. chunk->page = chunk->page_ar;
  677. chunk->vm = get_vm_area(pcpu_chunk_size, VM_ALLOC);
  678. if (!chunk->vm) {
  679. free_pcpu_chunk(chunk);
  680. return NULL;
  681. }
  682. INIT_LIST_HEAD(&chunk->list);
  683. chunk->free_size = pcpu_unit_size;
  684. chunk->contig_hint = pcpu_unit_size;
  685. return chunk;
  686. }
  687. /**
  688. * pcpu_alloc - the percpu allocator
  689. * @size: size of area to allocate in bytes
  690. * @align: alignment of area (max PAGE_SIZE)
  691. * @reserved: allocate from the reserved chunk if available
  692. *
  693. * Allocate percpu area of @size bytes aligned at @align.
  694. *
  695. * CONTEXT:
  696. * Does GFP_KERNEL allocation.
  697. *
  698. * RETURNS:
  699. * Percpu pointer to the allocated area on success, NULL on failure.
  700. */
  701. static void *pcpu_alloc(size_t size, size_t align, bool reserved)
  702. {
  703. struct pcpu_chunk *chunk;
  704. int slot, off;
  705. if (unlikely(!size || size > PCPU_MIN_UNIT_SIZE || align > PAGE_SIZE)) {
  706. WARN(true, "illegal size (%zu) or align (%zu) for "
  707. "percpu allocation\n", size, align);
  708. return NULL;
  709. }
  710. mutex_lock(&pcpu_alloc_mutex);
  711. spin_lock_irq(&pcpu_lock);
  712. /* serve reserved allocations from the reserved chunk if available */
  713. if (reserved && pcpu_reserved_chunk) {
  714. chunk = pcpu_reserved_chunk;
  715. if (size > chunk->contig_hint ||
  716. pcpu_extend_area_map(chunk) < 0)
  717. goto fail_unlock;
  718. off = pcpu_alloc_area(chunk, size, align);
  719. if (off >= 0)
  720. goto area_found;
  721. goto fail_unlock;
  722. }
  723. restart:
  724. /* search through normal chunks */
  725. for (slot = pcpu_size_to_slot(size); slot < pcpu_nr_slots; slot++) {
  726. list_for_each_entry(chunk, &pcpu_slot[slot], list) {
  727. if (size > chunk->contig_hint)
  728. continue;
  729. switch (pcpu_extend_area_map(chunk)) {
  730. case 0:
  731. break;
  732. case 1:
  733. goto restart; /* pcpu_lock dropped, restart */
  734. default:
  735. goto fail_unlock;
  736. }
  737. off = pcpu_alloc_area(chunk, size, align);
  738. if (off >= 0)
  739. goto area_found;
  740. }
  741. }
  742. /* hmmm... no space left, create a new chunk */
  743. spin_unlock_irq(&pcpu_lock);
  744. chunk = alloc_pcpu_chunk();
  745. if (!chunk)
  746. goto fail_unlock_mutex;
  747. spin_lock_irq(&pcpu_lock);
  748. pcpu_chunk_relocate(chunk, -1);
  749. goto restart;
  750. area_found:
  751. spin_unlock_irq(&pcpu_lock);
  752. /* populate, map and clear the area */
  753. if (pcpu_populate_chunk(chunk, off, size)) {
  754. spin_lock_irq(&pcpu_lock);
  755. pcpu_free_area(chunk, off);
  756. goto fail_unlock;
  757. }
  758. mutex_unlock(&pcpu_alloc_mutex);
  759. return __addr_to_pcpu_ptr(chunk->vm->addr + off);
  760. fail_unlock:
  761. spin_unlock_irq(&pcpu_lock);
  762. fail_unlock_mutex:
  763. mutex_unlock(&pcpu_alloc_mutex);
  764. return NULL;
  765. }
  766. /**
  767. * __alloc_percpu - allocate dynamic percpu area
  768. * @size: size of area to allocate in bytes
  769. * @align: alignment of area (max PAGE_SIZE)
  770. *
  771. * Allocate percpu area of @size bytes aligned at @align. Might
  772. * sleep. Might trigger writeouts.
  773. *
  774. * CONTEXT:
  775. * Does GFP_KERNEL allocation.
  776. *
  777. * RETURNS:
  778. * Percpu pointer to the allocated area on success, NULL on failure.
  779. */
  780. void *__alloc_percpu(size_t size, size_t align)
  781. {
  782. return pcpu_alloc(size, align, false);
  783. }
  784. EXPORT_SYMBOL_GPL(__alloc_percpu);
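/*
 * Typical usage, as a sketch (struct foo_stats and its count field are
 * made-up names for illustration):
 *
 *	struct foo_stats *stats = alloc_percpu(struct foo_stats);
 *	int cpu;
 *
 *	if (!stats)
 *		return -ENOMEM;
 *	cpu = get_cpu();
 *	per_cpu_ptr(stats, cpu)->count++;
 *	put_cpu();
 *	...
 *	free_percpu(stats);
 */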
  785. /**
  786. * __alloc_reserved_percpu - allocate reserved percpu area
  787. * @size: size of area to allocate in bytes
  788. * @align: alignment of area (max PAGE_SIZE)
  789. *
  790. * Allocate percpu area of @size bytes aligned at @align from reserved
  791. * percpu area if arch has set it up; otherwise, allocation is served
  792. * from the same dynamic area. Might sleep. Might trigger writeouts.
  793. *
  794. * CONTEXT:
  795. * Does GFP_KERNEL allocation.
  796. *
  797. * RETURNS:
  798. * Percpu pointer to the allocated area on success, NULL on failure.
  799. */
  800. void *__alloc_reserved_percpu(size_t size, size_t align)
  801. {
  802. return pcpu_alloc(size, align, true);
  803. }
  804. /**
  805. * pcpu_reclaim - reclaim fully free chunks, workqueue function
  806. * @work: unused
  807. *
  808. * Reclaim all fully free chunks except for the first one.
  809. *
  810. * CONTEXT:
  811. * workqueue context.
  812. */
  813. static void pcpu_reclaim(struct work_struct *work)
  814. {
  815. LIST_HEAD(todo);
  816. struct list_head *head = &pcpu_slot[pcpu_nr_slots - 1];
  817. struct pcpu_chunk *chunk, *next;
  818. mutex_lock(&pcpu_alloc_mutex);
  819. spin_lock_irq(&pcpu_lock);
  820. list_for_each_entry_safe(chunk, next, head, list) {
  821. WARN_ON(chunk->immutable);
  822. /* spare the first one */
  823. if (chunk == list_first_entry(head, struct pcpu_chunk, list))
  824. continue;
  825. list_move(&chunk->list, &todo);
  826. }
  827. spin_unlock_irq(&pcpu_lock);
  828. mutex_unlock(&pcpu_alloc_mutex);
  829. list_for_each_entry_safe(chunk, next, &todo, list) {
  830. pcpu_depopulate_chunk(chunk, 0, pcpu_unit_size, false);
  831. free_pcpu_chunk(chunk);
  832. }
  833. }
  834. /**
  835. * free_percpu - free percpu area
  836. * @ptr: pointer to area to free
  837. *
  838. * Free percpu area @ptr.
  839. *
  840. * CONTEXT:
  841. * Can be called from atomic context.
  842. */
  843. void free_percpu(void *ptr)
  844. {
  845. void *addr = __pcpu_ptr_to_addr(ptr);
  846. struct pcpu_chunk *chunk;
  847. unsigned long flags;
  848. int off;
  849. if (!ptr)
  850. return;
  851. spin_lock_irqsave(&pcpu_lock, flags);
  852. chunk = pcpu_chunk_addr_search(addr);
  853. off = addr - chunk->vm->addr;
  854. pcpu_free_area(chunk, off);
  855. /* if there is more than one fully free chunk, wake up the grim reaper */
  856. if (chunk->free_size == pcpu_unit_size) {
  857. struct pcpu_chunk *pos;
  858. list_for_each_entry(pos, &pcpu_slot[pcpu_nr_slots - 1], list)
  859. if (pos != chunk) {
  860. schedule_work(&pcpu_reclaim_work);
  861. break;
  862. }
  863. }
  864. spin_unlock_irqrestore(&pcpu_lock, flags);
  865. }
  866. EXPORT_SYMBOL_GPL(free_percpu);
  867. /**
  868. * pcpu_setup_first_chunk - initialize the first percpu chunk
  869. * @get_page_fn: callback to fetch page pointer
  870. * @static_size: the size of static percpu area in bytes
  871. * @reserved_size: the size of reserved percpu area in bytes
  872. * @dyn_size: free size for dynamic allocation in bytes, -1 for auto
  873. * @unit_size: unit size in bytes, must be multiple of PAGE_SIZE, -1 for auto
  874. * @base_addr: mapped address, NULL for auto
  875. * @populate_pte_fn: callback to allocate pagetable, NULL if unnecessary
  876. *
  877. * Initialize the first percpu chunk which contains the kernel static
  878. * percpu area. This function is to be called from arch percpu area
  879. * setup path. The first two parameters are mandatory. The rest are
  880. * optional.
  881. *
  882. * @get_page_fn() should return pointer to percpu page given cpu
  883. * number and page number. It should at least return enough pages to
  884. * cover the static area. The returned pages for static area should
  885. * have been initialized with valid data. If @unit_size is specified,
  886. * it can also return pages after the static area. NULL return
  887. * indicates end of pages for the cpu. Note that @get_page_fn() must
  888. * return the same number of pages for all cpus.
  889. *
  890. * @reserved_size, if non-zero, specifies the amount of bytes to
  891. * reserve after the static area in the first chunk. This reserves
  892. * the first chunk such that it's available only through reserved
  893. * percpu allocation. This is primarily used to serve module percpu
  894. * static areas on architectures where the addressing model has
  895. * limited offset range for symbol relocations to guarantee module
  896. * percpu symbols fall inside the relocatable range.
  897. *
  898. * @dyn_size, if non-negative, determines the number of bytes
  899. * available for dynamic allocation in the first chunk. Specifying
  900. * non-negative value makes percpu leave alone the area beyond
  901. * @static_size + @reserved_size + @dyn_size.
  902. *
  903. * @unit_size, if non-negative, specifies unit size and must be
  904. * aligned to PAGE_SIZE and equal to or larger than @static_size +
  905. * @reserved_size + @dyn_size (when @dyn_size is non-negative).
  906. *
  907. * Non-null @base_addr means that the caller already allocated virtual
  908. * region for the first chunk and mapped it. percpu must not mess
  909. * with the chunk. Note that @base_addr with a negative (auto) @unit_size or non-NULL
  910. * @populate_pte_fn doesn't make any sense.
  911. *
  912. * @populate_pte_fn is used to populate the pagetable. NULL means the
  913. * caller already populated the pagetable.
  914. *
  915. * If the first chunk ends up with both reserved and dynamic areas, it
  916. * is served by two chunks - one to serve the core static and reserved
  917. * areas and the other for the dynamic area. They share the same vm
  918. * and page map but use different area allocation maps to stay away
  919. * from each other. The latter chunk is circulated in the chunk slots
  920. * and available for dynamic allocation like any other chunks.
  921. *
  922. * RETURNS:
  923. * The determined pcpu_unit_size which can be used to initialize
  924. * percpu access.
  925. */
  926. size_t __init pcpu_setup_first_chunk(pcpu_get_page_fn_t get_page_fn,
  927. size_t static_size, size_t reserved_size,
  928. ssize_t dyn_size, ssize_t unit_size,
  929. void *base_addr,
  930. pcpu_populate_pte_fn_t populate_pte_fn)
  931. {
  932. static struct vm_struct first_vm;
  933. static int smap[2], dmap[2];
  934. size_t size_sum = static_size + reserved_size +
  935. (dyn_size >= 0 ? dyn_size : 0);
  936. struct pcpu_chunk *schunk, *dchunk = NULL;
  937. unsigned int cpu;
  938. int nr_pages;
  939. int err, i;
  940. /* sanity checks */
  941. BUILD_BUG_ON(ARRAY_SIZE(smap) >= PCPU_DFL_MAP_ALLOC ||
  942. ARRAY_SIZE(dmap) >= PCPU_DFL_MAP_ALLOC);
  943. BUG_ON(!static_size);
  944. if (unit_size >= 0) {
  945. BUG_ON(unit_size < size_sum);
  946. BUG_ON(unit_size & ~PAGE_MASK);
  947. BUG_ON(unit_size < PCPU_MIN_UNIT_SIZE);
  948. } else
  949. BUG_ON(base_addr);
  950. BUG_ON(base_addr && populate_pte_fn);
  951. if (unit_size >= 0)
  952. pcpu_unit_pages = unit_size >> PAGE_SHIFT;
  953. else
  954. pcpu_unit_pages = max_t(int, PCPU_MIN_UNIT_SIZE >> PAGE_SHIFT,
  955. PFN_UP(size_sum));
  956. pcpu_unit_size = pcpu_unit_pages << PAGE_SHIFT;
  957. pcpu_chunk_size = nr_cpu_ids * pcpu_unit_size;
  958. pcpu_chunk_struct_size = sizeof(struct pcpu_chunk)
  959. + nr_cpu_ids * pcpu_unit_pages * sizeof(struct page *);
  960. if (dyn_size < 0)
  961. dyn_size = pcpu_unit_size - static_size - reserved_size;
  962. /*
  963. * Allocate chunk slots. The additional last slot is for
  964. * empty chunks.
  965. */
  966. pcpu_nr_slots = __pcpu_size_to_slot(pcpu_unit_size) + 2;
  967. pcpu_slot = alloc_bootmem(pcpu_nr_slots * sizeof(pcpu_slot[0]));
  968. for (i = 0; i < pcpu_nr_slots; i++)
  969. INIT_LIST_HEAD(&pcpu_slot[i]);
  970. /*
  971. * Initialize static chunk. If reserved_size is zero, the
  972. * static chunk covers static area + dynamic allocation area
  973. * in the first chunk. If reserved_size is not zero, it
  974. * covers static area + reserved area (mostly used for module
  975. * static percpu allocation).
  976. */
  977. schunk = alloc_bootmem(pcpu_chunk_struct_size);
  978. INIT_LIST_HEAD(&schunk->list);
  979. schunk->vm = &first_vm;
  980. schunk->map = smap;
  981. schunk->map_alloc = ARRAY_SIZE(smap);
  982. schunk->page = schunk->page_ar;
  983. if (reserved_size) {
  984. schunk->free_size = reserved_size;
  985. pcpu_reserved_chunk = schunk;
  986. pcpu_reserved_chunk_limit = static_size + reserved_size;
  987. } else {
  988. schunk->free_size = dyn_size;
  989. dyn_size = 0; /* dynamic area covered */
  990. }
  991. schunk->contig_hint = schunk->free_size;
  992. schunk->map[schunk->map_used++] = -static_size;
  993. if (schunk->free_size)
  994. schunk->map[schunk->map_used++] = schunk->free_size;
  995. /* init dynamic chunk if necessary */
  996. if (dyn_size) {
  997. dchunk = alloc_bootmem(sizeof(struct pcpu_chunk));
  998. INIT_LIST_HEAD(&dchunk->list);
  999. dchunk->vm = &first_vm;
  1000. dchunk->map = dmap;
  1001. dchunk->map_alloc = ARRAY_SIZE(dmap);
  1002. dchunk->page = schunk->page_ar; /* share page map with schunk */
  1003. dchunk->contig_hint = dchunk->free_size = dyn_size;
  1004. dchunk->map[dchunk->map_used++] = -pcpu_reserved_chunk_limit;
  1005. dchunk->map[dchunk->map_used++] = dchunk->free_size;
  1006. }
  1007. /* allocate vm address */
  1008. first_vm.flags = VM_ALLOC;
  1009. first_vm.size = pcpu_chunk_size;
  1010. if (!base_addr)
  1011. vm_area_register_early(&first_vm, PAGE_SIZE);
  1012. else {
  1013. /*
  1014. * Pages already mapped. No need to remap into
  1015. * vmalloc area. In this case the first chunks can't
  1016. * be mapped or unmapped by percpu and are marked
  1017. * immutable.
  1018. */
  1019. first_vm.addr = base_addr;
  1020. schunk->immutable = true;
  1021. if (dchunk)
  1022. dchunk->immutable = true;
  1023. }
  1024. /* assign pages */
  1025. nr_pages = -1;
  1026. for_each_possible_cpu(cpu) {
  1027. for (i = 0; i < pcpu_unit_pages; i++) {
  1028. struct page *page = get_page_fn(cpu, i);
  1029. if (!page)
  1030. break;
  1031. *pcpu_chunk_pagep(schunk, cpu, i) = page;
  1032. }
  1033. BUG_ON(i < PFN_UP(static_size));
  1034. if (nr_pages < 0)
  1035. nr_pages = i;
  1036. else
  1037. BUG_ON(nr_pages != i);
  1038. }
  1039. /* map them */
  1040. if (populate_pte_fn) {
  1041. for_each_possible_cpu(cpu)
  1042. for (i = 0; i < nr_pages; i++)
  1043. populate_pte_fn(pcpu_chunk_addr(schunk,
  1044. cpu, i));
  1045. err = pcpu_map(schunk, 0, nr_pages);
  1046. if (err)
  1047. panic("failed to setup static percpu area, err=%d\n",
  1048. err);
  1049. }
  1050. /* link the first chunk in */
  1051. pcpu_first_chunk = dchunk ?: schunk;
  1052. pcpu_chunk_relocate(pcpu_first_chunk, -1);
  1053. /* we're done */
  1054. pcpu_base_addr = (void *)pcpu_chunk_addr(schunk, 0, 0);
  1055. return pcpu_unit_size;
  1056. }
  1057. /*
  1058. * Embedding first chunk setup helper.
  1059. */
  1060. static void *pcpue_ptr __initdata;
  1061. static size_t pcpue_size __initdata;
  1062. static size_t pcpue_unit_size __initdata;
  1063. static struct page * __init pcpue_get_page(unsigned int cpu, int pageno)
  1064. {
  1065. size_t off = (size_t)pageno << PAGE_SHIFT;
  1066. if (off >= pcpue_size)
  1067. return NULL;
  1068. return virt_to_page(pcpue_ptr + cpu * pcpue_unit_size + off);
  1069. }
  1070. /**
  1071. * pcpu_embed_first_chunk - embed the first percpu chunk into bootmem
  1072. * @static_size: the size of static percpu area in bytes
  1073. * @reserved_size: the size of reserved percpu area in bytes
  1074. * @dyn_size: free size for dynamic allocation in bytes, -1 for auto
  1075. * @unit_size: unit size in bytes, must be multiple of PAGE_SIZE, -1 for auto
  1076. *
  1077. * This is a helper to ease setting up embedded first percpu chunk and
  1078. * can be called where pcpu_setup_first_chunk() is expected.
  1079. *
  1080. * If this function is used to setup the first chunk, it is allocated
  1081. * as a contiguous area using bootmem allocator and used as-is without
  1082. * being mapped into vmalloc area. This enables the first chunk to
  1083. * piggy back on the linear physical mapping which often uses larger
  1084. * page size.
  1085. *
  1086. * When @dyn_size is positive, dynamic area might be larger than
  1087. * specified to fill page alignment. Also, when @dyn_size is auto,
  1088. * @dyn_size does not fill the whole first chunk but only what's
  1089. * necessary for page alignment after static and reserved areas.
  1090. *
  1091. * If the needed size is smaller than the minimum or specified unit
  1092. * size, the leftover is returned to the bootmem allocator.
  1093. *
  1094. * RETURNS:
  1095. * The determined pcpu_unit_size which can be used to initialize
  1096. * percpu access on success, -errno on failure.
  1097. */
  1098. ssize_t __init pcpu_embed_first_chunk(size_t static_size, size_t reserved_size,
  1099. ssize_t dyn_size, ssize_t unit_size)
  1100. {
  1101. size_t chunk_size;
  1102. unsigned int cpu;
  1103. /* determine parameters and allocate */
  1104. pcpue_size = PFN_ALIGN(static_size + reserved_size +
  1105. (dyn_size >= 0 ? dyn_size : 0));
  1106. if (dyn_size != 0)
  1107. dyn_size = pcpue_size - static_size - reserved_size;
  1108. if (unit_size >= 0) {
  1109. BUG_ON(unit_size < pcpue_size);
  1110. pcpue_unit_size = unit_size;
  1111. } else
  1112. pcpue_unit_size = max_t(size_t, pcpue_size, PCPU_MIN_UNIT_SIZE);
  1113. chunk_size = pcpue_unit_size * nr_cpu_ids;
  1114. pcpue_ptr = __alloc_bootmem_nopanic(chunk_size, PAGE_SIZE,
  1115. __pa(MAX_DMA_ADDRESS));
  1116. if (!pcpue_ptr) {
  1117. pr_warning("PERCPU: failed to allocate %zu bytes for "
  1118. "embedding\n", chunk_size);
  1119. return -ENOMEM;
  1120. }
  1121. /* return the leftover and copy */
  1122. for (cpu = 0; cpu < nr_cpu_ids; cpu++) {
  1123. void *ptr = pcpue_ptr + cpu * pcpue_unit_size;
  1124. if (cpu_possible(cpu)) {
  1125. free_bootmem(__pa(ptr + pcpue_size),
  1126. pcpue_unit_size - pcpue_size);
  1127. memcpy(ptr, __per_cpu_load, static_size);
  1128. } else
  1129. free_bootmem(__pa(ptr), pcpue_unit_size);
  1130. }
  1131. /* we're ready, commit */
  1132. pr_info("PERCPU: Embedded %zu pages at %p, static data %zu bytes\n",
  1133. pcpue_size >> PAGE_SHIFT, pcpue_ptr, static_size);
  1134. return pcpu_setup_first_chunk(pcpue_get_page, static_size,
  1135. reserved_size, dyn_size,
  1136. pcpue_unit_size, pcpue_ptr, NULL);
  1137. }