pid.c

/*
 * Generic pidhash and scalable, time-bounded PID allocator
 *
 * (C) 2002-2003 William Irwin, IBM
 * (C) 2004 William Irwin, Oracle
 * (C) 2002-2004 Ingo Molnar, Red Hat
 *
 * pid-structures are backing objects for tasks sharing a given ID to chain
 * against. There is very little to them aside from hashing them and
 * parking tasks using given IDs on a list.
 *
 * The hash is always changed with the tasklist_lock write-acquired,
 * and the hash is only accessed with the tasklist_lock at least
 * read-acquired, so there's no additional SMP locking needed here.
 *
 * We have a list of bitmap pages, which bitmaps represent the PID space.
 * Allocating and freeing PIDs is completely lockless. In the worst case,
 * when all but one of roughly a million possible PIDs are already
 * allocated, an allocation scans 32 list entries and at most PAGE_SIZE
 * bytes. The typical fastpath is a single successful set_bit(). Freeing is O(1).
 */
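
/*
 * A quick sanity check of the numbers quoted above, assuming
 * PAGE_SIZE == 4096:
 *
 *      BITS_PER_PAGE = 4096 * 8          = 32768 PIDs per bitmap page
 *      ~1M PID space = 1048576 / 32768   = 32 bitmap pages
 *
 * so a full scan of a roughly one-million-PID space touches at most 32
 * pidmap entries, and within the final page at most PAGE_SIZE bytes of
 * bitmap, which is where the worst case above comes from.
 */
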
#include <linux/mm.h>
#include <linux/module.h>
#include <linux/slab.h>
#include <linux/init.h>
#include <linux/bootmem.h>
#include <linux/hash.h>

#define pid_hashfn(nr) hash_long((unsigned long)nr, pidhash_shift)
static struct hlist_head *pid_hash;
static int pidhash_shift;
static kmem_cache_t *pid_cachep;

int pid_max = PID_MAX_DEFAULT;
int last_pid;

#define RESERVED_PIDS 300

int pid_max_min = RESERVED_PIDS + 1;
int pid_max_max = PID_MAX_LIMIT;

#define PIDMAP_ENTRIES ((PID_MAX_LIMIT + 8*PAGE_SIZE - 1)/PAGE_SIZE/8)
#define BITS_PER_PAGE (PAGE_SIZE*8)
#define BITS_PER_PAGE_MASK (BITS_PER_PAGE-1)
#define mk_pid(map, off) (((map) - pidmap_array)*BITS_PER_PAGE + (off))
#define find_next_offset(map, off) \
        find_next_zero_bit((map)->page, BITS_PER_PAGE, off)

/*
 * PID-map pages start out as NULL, they get allocated upon
 * first use and are never deallocated. This way a low pid_max
 * value does not cause lots of bitmaps to be allocated, but
 * the scheme scales to up to 4 million PIDs, runtime.
 */
typedef struct pidmap {
        atomic_t nr_free;
        void *page;
} pidmap_t;

static pidmap_t pidmap_array[PIDMAP_ENTRIES] =
        { [ 0 ... PIDMAP_ENTRIES-1 ] = { ATOMIC_INIT(BITS_PER_PAGE), NULL } };
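
/*
 * Illustration of the pid <-> bitmap mapping implemented by mk_pid() and
 * BITS_PER_PAGE_MASK above, assuming PAGE_SIZE == 4096 (so
 * BITS_PER_PAGE == 32768). For pid 40000:
 *
 *      map    = &pidmap_array[40000 / 32768]  = &pidmap_array[1]
 *      offset = 40000 & (32768 - 1)           = 7232
 *      mk_pid(map, offset) = 1*32768 + 7232   = 40000
 *
 * i.e. each pidmap entry covers one page worth of bits, and the macros
 * convert losslessly between a pid and its (page, bit) pair.
 */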

/*
 * Note: disable interrupts while the pidmap_lock is held as an
 * interrupt might come in and do read_lock(&tasklist_lock).
 *
 * If we don't disable interrupts there is a nasty deadlock between
 * detach_pid()->free_pid() and another cpu that does
 * spin_lock(&pidmap_lock) followed by an interrupt routine that does
 * read_lock(&tasklist_lock);
 *
 * After we clean up the tasklist_lock and know there are no
 * irq handlers that take it we can leave the interrupts enabled.
 * For now it is easier to be safe than to prove it can't happen.
 */
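
/*
 * The interleaving the note above guards against, spelled out as a
 * sketch of what could happen if pidmap_lock were taken with
 * interrupts enabled:
 *
 *      CPU 0                           CPU 1
 *      -----                           -----
 *      spin_lock(&pidmap_lock)
 *                                      write_lock_irq(&tasklist_lock)
 *                                      detach_pid() -> free_pid()
 *                                        spins on pidmap_lock ...
 *      <interrupt>
 *        read_lock(&tasklist_lock)
 *          spins behind CPU 1's writer ...
 *
 * Neither CPU can make progress, hence the _irq/_irqsave variants below.
 */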
static __cacheline_aligned_in_smp DEFINE_SPINLOCK(pidmap_lock);

static fastcall void free_pidmap(int pid)
{
        pidmap_t *map = pidmap_array + pid / BITS_PER_PAGE;
        int offset = pid & BITS_PER_PAGE_MASK;

        clear_bit(offset, map->page);
        atomic_inc(&map->nr_free);
}
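
/*
 * alloc_pidmap - grab the next free pid from the bitmaps.
 *
 * Starts scanning at last_pid + 1, wraps around to RESERVED_PIDS rather
 * than 0, and allocates bitmap pages on demand. Returns the new pid, or
 * -1 if the pid space is exhausted or a bitmap page could not be
 * allocated.
 */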
static int alloc_pidmap(void)
{
        int i, offset, max_scan, pid, last = last_pid;
        pidmap_t *map;

        pid = last + 1;
        if (pid >= pid_max)
                pid = RESERVED_PIDS;
        offset = pid & BITS_PER_PAGE_MASK;
        map = &pidmap_array[pid/BITS_PER_PAGE];
        max_scan = (pid_max + BITS_PER_PAGE - 1)/BITS_PER_PAGE - !offset;
        for (i = 0; i <= max_scan; ++i) {
                if (unlikely(!map->page)) {
                        unsigned long page = get_zeroed_page(GFP_KERNEL);
                        /*
                         * Free the page if someone raced with us
                         * installing it:
                         */
                        spin_lock_irq(&pidmap_lock);
                        if (map->page)
                                free_page(page);
                        else
                                map->page = (void *)page;
                        spin_unlock_irq(&pidmap_lock);
                        if (unlikely(!map->page))
                                break;
                }
                if (likely(atomic_read(&map->nr_free))) {
                        do {
                                if (!test_and_set_bit(offset, map->page)) {
                                        atomic_dec(&map->nr_free);
                                        last_pid = pid;
                                        return pid;
                                }
                                offset = find_next_offset(map, offset);
                                pid = mk_pid(map, offset);
                        /*
                         * find_next_offset() found a bit, the pid from it
                         * is in-bounds, and if we fell back to the last
                         * bitmap block and the final block was the same
                         * as the starting point, pid is before last_pid.
                         */
                        } while (offset < BITS_PER_PAGE && pid < pid_max &&
                                        (i != max_scan || pid < last ||
                                            !((last+1) & BITS_PER_PAGE_MASK)));
                }
                if (map < &pidmap_array[(pid_max-1)/BITS_PER_PAGE]) {
                        ++map;
                        offset = 0;
                } else {
                        map = &pidmap_array[0];
                        offset = RESERVED_PIDS;
                        if (unlikely(last == offset))
                                break;
                }
                pid = mk_pid(map, offset);
        }
        return -1;
}
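
/*
 * put_pid - drop a reference to a struct pid, freeing it when the last
 * reference goes away. The atomic_read() check is a fast path: if we
 * hold the only reference, the object can be freed without the more
 * expensive atomic decrement.
 */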
fastcall void put_pid(struct pid *pid)
{
        if (!pid)
                return;
        if ((atomic_read(&pid->count) == 1) ||
             atomic_dec_and_test(&pid->count))
                kmem_cache_free(pid_cachep, pid);
}

static void delayed_put_pid(struct rcu_head *rhp)
{
        struct pid *pid = container_of(rhp, struct pid, rcu);
        put_pid(pid);
}

fastcall void free_pid(struct pid *pid)
{
        /* We can be called with write_lock_irq(&tasklist_lock) held */
        unsigned long flags;

        spin_lock_irqsave(&pidmap_lock, flags);
        hlist_del_rcu(&pid->pid_chain);
        spin_unlock_irqrestore(&pidmap_lock, flags);

        free_pidmap(pid->nr);
        call_rcu(&pid->rcu, delayed_put_pid);
}
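
/*
 * alloc_pid - allocate a struct pid, assign it a number from the pidmap
 * and add it to the pid hash. Returns NULL if either the slab allocation
 * or the pid number allocation fails.
 */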
struct pid *alloc_pid(void)
{
        struct pid *pid;
        enum pid_type type;
        int nr = -1;

        pid = kmem_cache_alloc(pid_cachep, GFP_KERNEL);
        if (!pid)
                goto out;

        nr = alloc_pidmap();
        if (nr < 0)
                goto out_free;

        atomic_set(&pid->count, 1);
        pid->nr = nr;
        for (type = 0; type < PIDTYPE_MAX; ++type)
                INIT_HLIST_HEAD(&pid->tasks[type]);

        spin_lock_irq(&pidmap_lock);
        hlist_add_head_rcu(&pid->pid_chain, &pid_hash[pid_hashfn(pid->nr)]);
        spin_unlock_irq(&pidmap_lock);

out:
        return pid;

out_free:
        kmem_cache_free(pid_cachep, pid);
        pid = NULL;
        goto out;
}
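
/*
 * find_pid - look up a struct pid by number in the pid hash. The caller
 * must hold rcu_read_lock() or have the tasklist_lock at least
 * read-acquired (see find_task_by_pid_type() below).
 */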
struct pid * fastcall find_pid(int nr)
{
        struct hlist_node *elem;
        struct pid *pid;

        hlist_for_each_entry_rcu(pid, elem,
                        &pid_hash[pid_hashfn(nr)], pid_chain) {
                if (pid->nr == nr)
                        return pid;
        }
        return NULL;
}
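
/*
 * attach_pid()/detach_pid() link and unlink a task on the per-type task
 * list of a struct pid. detach_pid() also releases the pid once no task
 * of any type uses it any more.
 */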
int fastcall attach_pid(struct task_struct *task, enum pid_type type, int nr)
{
        struct pid_link *link;
        struct pid *pid;

        link = &task->pids[type];
        link->pid = pid = find_pid(nr);
        hlist_add_head_rcu(&link->node, &pid->tasks[type]);

        return 0;
}

void fastcall detach_pid(struct task_struct *task, enum pid_type type)
{
        struct pid_link *link;
        struct pid *pid;
        int tmp;

        link = &task->pids[type];
        pid = link->pid;

        hlist_del_rcu(&link->node);
        link->pid = NULL;

        for (tmp = PIDTYPE_MAX; --tmp >= 0; )
                if (!hlist_empty(&pid->tasks[tmp]))
                        return;

        free_pid(pid);
}

/* transfer_pid is an optimization of attach_pid(new), detach_pid(old) */
void fastcall transfer_pid(struct task_struct *old, struct task_struct *new,
                           enum pid_type type)
{
        new->pids[type].pid = old->pids[type].pid;
        hlist_replace_rcu(&old->pids[type].node, &new->pids[type].node);
        old->pids[type].pid = NULL;
}
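
/*
 * pid_task - return the first task using @pid as its @type id, or NULL.
 * The result is only stable while the caller is inside an RCU read-side
 * critical section (or otherwise keeps the task from going away).
 */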
struct task_struct * fastcall pid_task(struct pid *pid, enum pid_type type)
{
        struct task_struct *result = NULL;
        if (pid) {
                struct hlist_node *first;
                first = rcu_dereference(pid->tasks[type].first);
                if (first)
                        result = hlist_entry(first, struct task_struct, pids[(type)].node);
        }
        return result;
}

/*
 * Must be called under rcu_read_lock() or with tasklist_lock read-held.
 */
struct task_struct *find_task_by_pid_type(int type, int nr)
{
        return pid_task(find_pid(nr), type);
}

EXPORT_SYMBOL(find_task_by_pid_type);
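
/*
 * get_pid_task - like pid_task(), but takes its own rcu_read_lock() and
 * returns the task with an elevated reference count, so the result stays
 * valid after the lookup. The caller drops it with put_task_struct().
 */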
struct task_struct *fastcall get_pid_task(struct pid *pid, enum pid_type type)
{
        struct task_struct *result;

        rcu_read_lock();
        result = pid_task(pid, type);
        if (result)
                get_task_struct(result);
        rcu_read_unlock();

        return result;
}
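
/*
 * find_get_pid - look up a pid by number and take a reference on it, so
 * it can be used outside the RCU read-side section. Pairs with put_pid().
 */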
struct pid *find_get_pid(pid_t nr)
{
        struct pid *pid;

        rcu_read_lock();
        pid = get_pid(find_pid(nr));
        rcu_read_unlock();

        return pid;
}
/*
 * The pid hash table is scaled according to the amount of memory in the
 * machine. From a minimum of 16 slots up to 4096 slots at one gigabyte or
 * more.
 */
void __init pidhash_init(void)
{
        int i, pidhash_size;
        unsigned long megabytes = nr_kernel_pages >> (20 - PAGE_SHIFT);

        pidhash_shift = max(4, fls(megabytes * 4));
        pidhash_shift = min(12, pidhash_shift);
        pidhash_size = 1 << pidhash_shift;

        printk("PID hash table entries: %d (order: %d, %Zd bytes)\n",
                pidhash_size, pidhash_shift,
                pidhash_size * sizeof(struct hlist_head));

        pid_hash = alloc_bootmem(pidhash_size * sizeof(*(pid_hash)));
        if (!pid_hash)
                panic("Could not alloc pidhash!\n");
        for (i = 0; i < pidhash_size; i++)
                INIT_HLIST_HEAD(&pid_hash[i]);
}
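
/*
 * pidmap_init - set up the first bitmap page, reserve pid 0 so it is
 * never handed out, and create the slab cache that struct pid objects
 * are allocated from.
 */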
void __init pidmap_init(void)
{
        pidmap_array->page = (void *)get_zeroed_page(GFP_KERNEL);
        /* Reserve PID 0. We never call free_pidmap(0) */
        set_bit(0, pidmap_array->page);
        atomic_dec(&pidmap_array->nr_free);

        pid_cachep = kmem_cache_create("pid", sizeof(struct pid),
                                        __alignof__(struct pid),
                                        SLAB_PANIC, NULL, NULL);
}