rcupreempt.c 32 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637638639640641642643644645646647648649650651652653654655656657658659660661662663664665666667668669670671672673674675676677678679680681682683684685686687688689690691692693694695696697698699700701702703704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799800801802803804805806807808809810811812813814815816817818819820821822823824825826827828829830831832833834835836837838839840841842843844845846847848849850851852853854855856857858859860861862863864865866867868869870871872873874875876877878879880881882883884885886887888889890891892893894895896897898899900901902903904905906907908909910911912913914915916917918919920921922923924925926927928929930931932933934935936937938939940941942943944945946947948949950951952953954955956957958959960961962963964965966967968969970971972973974975976977978979980981982983984985986987988989990991992993994995996997998999100010011002100310041005100610071008100910101011101210131014101510161017101810191020102110221023102410251026102710281029103010311032103310341035103610371038103910401041104210431044104510461047104810491050105110521053105410551056105710581059106010611062106310641065106610671068106910701071107210731074107510761077107810791080108110821083108410851086108710881089109010911092109310941095109610971098109911001101110211031104110511061107110811091110111111121113111411151116111711181119112011211122112311241125112611271128112911301131113211331134113511361137113811391140114111421143114411451146114711481149115011511152115311541155115611571158115911601161116211631164116511661167116811691170
  1. /*
  2. * Read-Copy Update mechanism for mutual exclusion, realtime implementation
  3. *
  4. * This program is free software; you can redistribute it and/or modify
  5. * it under the terms of the GNU General Public License as published by
  6. * the Free Software Foundation; either version 2 of the License, or
  7. * (at your option) any later version.
  8. *
  9. * This program is distributed in the hope that it will be useful,
  10. * but WITHOUT ANY WARRANTY; without even the implied warranty of
  11. * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
  12. * GNU General Public License for more details.
  13. *
  14. * You should have received a copy of the GNU General Public License
  15. * along with this program; if not, write to the Free Software
  16. * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
  17. *
  18. * Copyright IBM Corporation, 2006
  19. *
  20. * Authors: Paul E. McKenney <paulmck@us.ibm.com>
  21. * With thanks to Esben Nielsen, Bill Huey, and Ingo Molnar
  22. * for pushing me away from locks and towards counters, and
  23. * to Suparna Bhattacharya for pushing me completely away
  24. * from atomic instructions on the read side.
  25. *
  26. * - Added handling of Dynamic Ticks
  27. * Copyright 2007 - Paul E. Mckenney <paulmck@us.ibm.com>
  28. * - Steven Rostedt <srostedt@redhat.com>
  29. *
  30. * Papers: http://www.rdrop.com/users/paulmck/RCU
  31. *
  32. * Design Document: http://lwn.net/Articles/253651/
  33. *
  34. * For detailed explanation of Read-Copy Update mechanism see -
  35. * Documentation/RCU/ *.txt
  36. *
  37. */
  38. #include <linux/types.h>
  39. #include <linux/kernel.h>
  40. #include <linux/init.h>
  41. #include <linux/spinlock.h>
  42. #include <linux/smp.h>
  43. #include <linux/rcupdate.h>
  44. #include <linux/interrupt.h>
  45. #include <linux/sched.h>
  46. #include <asm/atomic.h>
  47. #include <linux/bitops.h>
  48. #include <linux/module.h>
  49. #include <linux/completion.h>
  50. #include <linux/moduleparam.h>
  51. #include <linux/percpu.h>
  52. #include <linux/notifier.h>
  53. #include <linux/rcupdate.h>
  54. #include <linux/cpu.h>
  55. #include <linux/random.h>
  56. #include <linux/delay.h>
  57. #include <linux/byteorder/swabb.h>
  58. #include <linux/cpumask.h>
  59. #include <linux/rcupreempt_trace.h>
  60. /*
  61. * Macro that prevents the compiler from reordering accesses, but does
  62. * absolutely -nothing- to prevent CPUs from reordering. This is used
  63. * only to mediate communication between mainline code and hardware
  64. * interrupt and NMI handlers.
  65. */
  66. #define ACCESS_ONCE(x) (*(volatile typeof(x) *)&(x))
  67. /*
  68. * PREEMPT_RCU data structures.
  69. */
  70. /*
  71. * GP_STAGES specifies the number of times the state machine has
  72. * to go through the all the rcu_try_flip_states (see below)
  73. * in a single Grace Period.
  74. *
  75. * GP in GP_STAGES stands for Grace Period ;)
  76. */
  77. #define GP_STAGES 2
  78. struct rcu_data {
  79. spinlock_t lock; /* Protect rcu_data fields. */
  80. long completed; /* Number of last completed batch. */
  81. int waitlistcount;
  82. struct tasklet_struct rcu_tasklet;
  83. struct rcu_head *nextlist;
  84. struct rcu_head **nexttail;
  85. struct rcu_head *waitlist[GP_STAGES];
  86. struct rcu_head **waittail[GP_STAGES];
  87. struct rcu_head *donelist;
  88. struct rcu_head **donetail;
  89. long rcu_flipctr[2];
  90. #ifdef CONFIG_RCU_TRACE
  91. struct rcupreempt_trace trace;
  92. #endif /* #ifdef CONFIG_RCU_TRACE */
  93. };
  94. /*
  95. * States for rcu_try_flip() and friends.
  96. */
  97. enum rcu_try_flip_states {
  98. /*
  99. * Stay here if nothing is happening. Flip the counter if somthing
  100. * starts happening. Denoted by "I"
  101. */
  102. rcu_try_flip_idle_state,
  103. /*
  104. * Wait here for all CPUs to notice that the counter has flipped. This
  105. * prevents the old set of counters from ever being incremented once
  106. * we leave this state, which in turn is necessary because we cannot
  107. * test any individual counter for zero -- we can only check the sum.
  108. * Denoted by "A".
  109. */
  110. rcu_try_flip_waitack_state,
  111. /*
  112. * Wait here for the sum of the old per-CPU counters to reach zero.
  113. * Denoted by "Z".
  114. */
  115. rcu_try_flip_waitzero_state,
  116. /*
  117. * Wait here for each of the other CPUs to execute a memory barrier.
  118. * This is necessary to ensure that these other CPUs really have
  119. * completed executing their RCU read-side critical sections, despite
  120. * their CPUs wildly reordering memory. Denoted by "M".
  121. */
  122. rcu_try_flip_waitmb_state,
  123. };
  124. struct rcu_ctrlblk {
  125. spinlock_t fliplock; /* Protect state-machine transitions. */
  126. long completed; /* Number of last completed batch. */
  127. enum rcu_try_flip_states rcu_try_flip_state; /* The current state of
  128. the rcu state machine */
  129. };
  130. static DEFINE_PER_CPU(struct rcu_data, rcu_data);
  131. static struct rcu_ctrlblk rcu_ctrlblk = {
  132. .fliplock = __SPIN_LOCK_UNLOCKED(rcu_ctrlblk.fliplock),
  133. .completed = 0,
  134. .rcu_try_flip_state = rcu_try_flip_idle_state,
  135. };
  136. #ifdef CONFIG_RCU_TRACE
  137. static char *rcu_try_flip_state_names[] =
  138. { "idle", "waitack", "waitzero", "waitmb" };
  139. #endif /* #ifdef CONFIG_RCU_TRACE */
  140. static cpumask_t rcu_cpu_online_map __read_mostly = CPU_MASK_NONE;
  141. /*
  142. * Enum and per-CPU flag to determine when each CPU has seen
  143. * the most recent counter flip.
  144. */
  145. enum rcu_flip_flag_values {
  146. rcu_flip_seen, /* Steady/initial state, last flip seen. */
  147. /* Only GP detector can update. */
  148. rcu_flipped /* Flip just completed, need confirmation. */
  149. /* Only corresponding CPU can update. */
  150. };
  151. static DEFINE_PER_CPU_SHARED_ALIGNED(enum rcu_flip_flag_values, rcu_flip_flag)
  152. = rcu_flip_seen;
  153. /*
  154. * Enum and per-CPU flag to determine when each CPU has executed the
  155. * needed memory barrier to fence in memory references from its last RCU
  156. * read-side critical section in the just-completed grace period.
  157. */
  158. enum rcu_mb_flag_values {
  159. rcu_mb_done, /* Steady/initial state, no mb()s required. */
  160. /* Only GP detector can update. */
  161. rcu_mb_needed /* Flip just completed, need an mb(). */
  162. /* Only corresponding CPU can update. */
  163. };
  164. static DEFINE_PER_CPU_SHARED_ALIGNED(enum rcu_mb_flag_values, rcu_mb_flag)
  165. = rcu_mb_done;
  166. /*
  167. * RCU_DATA_ME: find the current CPU's rcu_data structure.
  168. * RCU_DATA_CPU: find the specified CPU's rcu_data structure.
  169. */
  170. #define RCU_DATA_ME() (&__get_cpu_var(rcu_data))
  171. #define RCU_DATA_CPU(cpu) (&per_cpu(rcu_data, cpu))
  172. /*
  173. * Helper macro for tracing when the appropriate rcu_data is not
  174. * cached in a local variable, but where the CPU number is so cached.
  175. */
  176. #define RCU_TRACE_CPU(f, cpu) RCU_TRACE(f, &(RCU_DATA_CPU(cpu)->trace));
  177. /*
  178. * Helper macro for tracing when the appropriate rcu_data is not
  179. * cached in a local variable.
  180. */
  181. #define RCU_TRACE_ME(f) RCU_TRACE(f, &(RCU_DATA_ME()->trace));
  182. /*
  183. * Helper macro for tracing when the appropriate rcu_data is pointed
  184. * to by a local variable.
  185. */
  186. #define RCU_TRACE_RDP(f, rdp) RCU_TRACE(f, &((rdp)->trace));
  187. /*
  188. * Return the number of RCU batches processed thus far. Useful
  189. * for debug and statistics.
  190. */
  191. long rcu_batches_completed(void)
  192. {
  193. return rcu_ctrlblk.completed;
  194. }
  195. EXPORT_SYMBOL_GPL(rcu_batches_completed);
  196. void __rcu_read_lock(void)
  197. {
  198. int idx;
  199. struct task_struct *t = current;
  200. int nesting;
  201. nesting = ACCESS_ONCE(t->rcu_read_lock_nesting);
  202. if (nesting != 0) {
  203. /* An earlier rcu_read_lock() covers us, just count it. */
  204. t->rcu_read_lock_nesting = nesting + 1;
  205. } else {
  206. unsigned long flags;
  207. /*
  208. * We disable interrupts for the following reasons:
  209. * - If we get scheduling clock interrupt here, and we
  210. * end up acking the counter flip, it's like a promise
  211. * that we will never increment the old counter again.
  212. * Thus we will break that promise if that
  213. * scheduling clock interrupt happens between the time
  214. * we pick the .completed field and the time that we
  215. * increment our counter.
  216. *
  217. * - We don't want to be preempted out here.
  218. *
  219. * NMIs can still occur, of course, and might themselves
  220. * contain rcu_read_lock().
  221. */
  222. local_irq_save(flags);
  223. /*
  224. * Outermost nesting of rcu_read_lock(), so increment
  225. * the current counter for the current CPU. Use volatile
  226. * casts to prevent the compiler from reordering.
  227. */
  228. idx = ACCESS_ONCE(rcu_ctrlblk.completed) & 0x1;
  229. ACCESS_ONCE(RCU_DATA_ME()->rcu_flipctr[idx])++;
  230. /*
  231. * Now that the per-CPU counter has been incremented, we
  232. * are protected from races with rcu_read_lock() invoked
  233. * from NMI handlers on this CPU. We can therefore safely
  234. * increment the nesting counter, relieving further NMIs
  235. * of the need to increment the per-CPU counter.
  236. */
  237. ACCESS_ONCE(t->rcu_read_lock_nesting) = nesting + 1;
  238. /*
  239. * Now that we have preventing any NMIs from storing
  240. * to the ->rcu_flipctr_idx, we can safely use it to
  241. * remember which counter to decrement in the matching
  242. * rcu_read_unlock().
  243. */
  244. ACCESS_ONCE(t->rcu_flipctr_idx) = idx;
  245. local_irq_restore(flags);
  246. }
  247. }
  248. EXPORT_SYMBOL_GPL(__rcu_read_lock);
  249. void __rcu_read_unlock(void)
  250. {
  251. int idx;
  252. struct task_struct *t = current;
  253. int nesting;
  254. nesting = ACCESS_ONCE(t->rcu_read_lock_nesting);
  255. if (nesting > 1) {
  256. /*
  257. * We are still protected by the enclosing rcu_read_lock(),
  258. * so simply decrement the counter.
  259. */
  260. t->rcu_read_lock_nesting = nesting - 1;
  261. } else {
  262. unsigned long flags;
  263. /*
  264. * Disable local interrupts to prevent the grace-period
  265. * detection state machine from seeing us half-done.
  266. * NMIs can still occur, of course, and might themselves
  267. * contain rcu_read_lock() and rcu_read_unlock().
  268. */
  269. local_irq_save(flags);
  270. /*
  271. * Outermost nesting of rcu_read_unlock(), so we must
  272. * decrement the current counter for the current CPU.
  273. * This must be done carefully, because NMIs can
  274. * occur at any point in this code, and any rcu_read_lock()
  275. * and rcu_read_unlock() pairs in the NMI handlers
  276. * must interact non-destructively with this code.
  277. * Lots of volatile casts, and -very- careful ordering.
  278. *
  279. * Changes to this code, including this one, must be
  280. * inspected, validated, and tested extremely carefully!!!
  281. */
  282. /*
  283. * First, pick up the index.
  284. */
  285. idx = ACCESS_ONCE(t->rcu_flipctr_idx);
  286. /*
  287. * Now that we have fetched the counter index, it is
  288. * safe to decrement the per-task RCU nesting counter.
  289. * After this, any interrupts or NMIs will increment and
  290. * decrement the per-CPU counters.
  291. */
  292. ACCESS_ONCE(t->rcu_read_lock_nesting) = nesting - 1;
  293. /*
  294. * It is now safe to decrement this task's nesting count.
  295. * NMIs that occur after this statement will route their
  296. * rcu_read_lock() calls through this "else" clause, and
  297. * will thus start incrementing the per-CPU counter on
  298. * their own. They will also clobber ->rcu_flipctr_idx,
  299. * but that is OK, since we have already fetched it.
  300. */
  301. ACCESS_ONCE(RCU_DATA_ME()->rcu_flipctr[idx])--;
  302. local_irq_restore(flags);
  303. }
  304. }
  305. EXPORT_SYMBOL_GPL(__rcu_read_unlock);
  306. /*
  307. * If a global counter flip has occurred since the last time that we
  308. * advanced callbacks, advance them. Hardware interrupts must be
  309. * disabled when calling this function.
  310. */
  311. static void __rcu_advance_callbacks(struct rcu_data *rdp)
  312. {
  313. int cpu;
  314. int i;
  315. int wlc = 0;
  316. if (rdp->completed != rcu_ctrlblk.completed) {
  317. if (rdp->waitlist[GP_STAGES - 1] != NULL) {
  318. *rdp->donetail = rdp->waitlist[GP_STAGES - 1];
  319. rdp->donetail = rdp->waittail[GP_STAGES - 1];
  320. RCU_TRACE_RDP(rcupreempt_trace_move2done, rdp);
  321. }
  322. for (i = GP_STAGES - 2; i >= 0; i--) {
  323. if (rdp->waitlist[i] != NULL) {
  324. rdp->waitlist[i + 1] = rdp->waitlist[i];
  325. rdp->waittail[i + 1] = rdp->waittail[i];
  326. wlc++;
  327. } else {
  328. rdp->waitlist[i + 1] = NULL;
  329. rdp->waittail[i + 1] =
  330. &rdp->waitlist[i + 1];
  331. }
  332. }
  333. if (rdp->nextlist != NULL) {
  334. rdp->waitlist[0] = rdp->nextlist;
  335. rdp->waittail[0] = rdp->nexttail;
  336. wlc++;
  337. rdp->nextlist = NULL;
  338. rdp->nexttail = &rdp->nextlist;
  339. RCU_TRACE_RDP(rcupreempt_trace_move2wait, rdp);
  340. } else {
  341. rdp->waitlist[0] = NULL;
  342. rdp->waittail[0] = &rdp->waitlist[0];
  343. }
  344. rdp->waitlistcount = wlc;
  345. rdp->completed = rcu_ctrlblk.completed;
  346. }
  347. /*
  348. * Check to see if this CPU needs to report that it has seen
  349. * the most recent counter flip, thereby declaring that all
  350. * subsequent rcu_read_lock() invocations will respect this flip.
  351. */
  352. cpu = raw_smp_processor_id();
  353. if (per_cpu(rcu_flip_flag, cpu) == rcu_flipped) {
  354. smp_mb(); /* Subsequent counter accesses must see new value */
  355. per_cpu(rcu_flip_flag, cpu) = rcu_flip_seen;
  356. smp_mb(); /* Subsequent RCU read-side critical sections */
  357. /* seen -after- acknowledgement. */
  358. }
  359. }
  360. #ifdef CONFIG_NO_HZ
  361. DEFINE_PER_CPU(long, dynticks_progress_counter) = 1;
  362. static DEFINE_PER_CPU(long, rcu_dyntick_snapshot);
  363. static DEFINE_PER_CPU(int, rcu_update_flag);
  364. /**
  365. * rcu_irq_enter - Called from Hard irq handlers and NMI/SMI.
  366. *
  367. * If the CPU was idle with dynamic ticks active, this updates the
  368. * dynticks_progress_counter to let the RCU handling know that the
  369. * CPU is active.
  370. */
  371. void rcu_irq_enter(void)
  372. {
  373. int cpu = smp_processor_id();
  374. if (per_cpu(rcu_update_flag, cpu))
  375. per_cpu(rcu_update_flag, cpu)++;
  376. /*
  377. * Only update if we are coming from a stopped ticks mode
  378. * (dynticks_progress_counter is even).
  379. */
  380. if (!in_interrupt() &&
  381. (per_cpu(dynticks_progress_counter, cpu) & 0x1) == 0) {
  382. /*
  383. * The following might seem like we could have a race
  384. * with NMI/SMIs. But this really isn't a problem.
  385. * Here we do a read/modify/write, and the race happens
  386. * when an NMI/SMI comes in after the read and before
  387. * the write. But NMI/SMIs will increment this counter
  388. * twice before returning, so the zero bit will not
  389. * be corrupted by the NMI/SMI which is the most important
  390. * part.
  391. *
  392. * The only thing is that we would bring back the counter
  393. * to a postion that it was in during the NMI/SMI.
  394. * But the zero bit would be set, so the rest of the
  395. * counter would again be ignored.
  396. *
  397. * On return from the IRQ, the counter may have the zero
  398. * bit be 0 and the counter the same as the return from
  399. * the NMI/SMI. If the state machine was so unlucky to
  400. * see that, it still doesn't matter, since all
  401. * RCU read-side critical sections on this CPU would
  402. * have already completed.
  403. */
  404. per_cpu(dynticks_progress_counter, cpu)++;
  405. /*
  406. * The following memory barrier ensures that any
  407. * rcu_read_lock() primitives in the irq handler
  408. * are seen by other CPUs to follow the above
  409. * increment to dynticks_progress_counter. This is
  410. * required in order for other CPUs to correctly
  411. * determine when it is safe to advance the RCU
  412. * grace-period state machine.
  413. */
  414. smp_mb(); /* see above block comment. */
  415. /*
  416. * Since we can't determine the dynamic tick mode from
  417. * the dynticks_progress_counter after this routine,
  418. * we use a second flag to acknowledge that we came
  419. * from an idle state with ticks stopped.
  420. */
  421. per_cpu(rcu_update_flag, cpu)++;
  422. /*
  423. * If we take an NMI/SMI now, they will also increment
  424. * the rcu_update_flag, and will not update the
  425. * dynticks_progress_counter on exit. That is for
  426. * this IRQ to do.
  427. */
  428. }
  429. }
  430. /**
  431. * rcu_irq_exit - Called from exiting Hard irq context.
  432. *
  433. * If the CPU was idle with dynamic ticks active, update the
  434. * dynticks_progress_counter to put let the RCU handling be
  435. * aware that the CPU is going back to idle with no ticks.
  436. */
  437. void rcu_irq_exit(void)
  438. {
  439. int cpu = smp_processor_id();
  440. /*
  441. * rcu_update_flag is set if we interrupted the CPU
  442. * when it was idle with ticks stopped.
  443. * Once this occurs, we keep track of interrupt nesting
  444. * because a NMI/SMI could also come in, and we still
  445. * only want the IRQ that started the increment of the
  446. * dynticks_progress_counter to be the one that modifies
  447. * it on exit.
  448. */
  449. if (per_cpu(rcu_update_flag, cpu)) {
  450. if (--per_cpu(rcu_update_flag, cpu))
  451. return;
  452. /* This must match the interrupt nesting */
  453. WARN_ON(in_interrupt());
  454. /*
  455. * If an NMI/SMI happens now we are still
  456. * protected by the dynticks_progress_counter being odd.
  457. */
  458. /*
  459. * The following memory barrier ensures that any
  460. * rcu_read_unlock() primitives in the irq handler
  461. * are seen by other CPUs to preceed the following
  462. * increment to dynticks_progress_counter. This
  463. * is required in order for other CPUs to determine
  464. * when it is safe to advance the RCU grace-period
  465. * state machine.
  466. */
  467. smp_mb(); /* see above block comment. */
  468. per_cpu(dynticks_progress_counter, cpu)++;
  469. WARN_ON(per_cpu(dynticks_progress_counter, cpu) & 0x1);
  470. }
  471. }
  472. static void dyntick_save_progress_counter(int cpu)
  473. {
  474. per_cpu(rcu_dyntick_snapshot, cpu) =
  475. per_cpu(dynticks_progress_counter, cpu);
  476. }
  477. static inline int
  478. rcu_try_flip_waitack_needed(int cpu)
  479. {
  480. long curr;
  481. long snap;
  482. curr = per_cpu(dynticks_progress_counter, cpu);
  483. snap = per_cpu(rcu_dyntick_snapshot, cpu);
  484. smp_mb(); /* force ordering with cpu entering/leaving dynticks. */
  485. /*
  486. * If the CPU remained in dynticks mode for the entire time
  487. * and didn't take any interrupts, NMIs, SMIs, or whatever,
  488. * then it cannot be in the middle of an rcu_read_lock(), so
  489. * the next rcu_read_lock() it executes must use the new value
  490. * of the counter. So we can safely pretend that this CPU
  491. * already acknowledged the counter.
  492. */
  493. if ((curr == snap) && ((curr & 0x1) == 0))
  494. return 0;
  495. /*
  496. * If the CPU passed through or entered a dynticks idle phase with
  497. * no active irq handlers, then, as above, we can safely pretend
  498. * that this CPU already acknowledged the counter.
  499. */
  500. if ((curr - snap) > 2 || (snap & 0x1) == 0)
  501. return 0;
  502. /* We need this CPU to explicitly acknowledge the counter flip. */
  503. return 1;
  504. }
  505. static inline int
  506. rcu_try_flip_waitmb_needed(int cpu)
  507. {
  508. long curr;
  509. long snap;
  510. curr = per_cpu(dynticks_progress_counter, cpu);
  511. snap = per_cpu(rcu_dyntick_snapshot, cpu);
  512. smp_mb(); /* force ordering with cpu entering/leaving dynticks. */
  513. /*
  514. * If the CPU remained in dynticks mode for the entire time
  515. * and didn't take any interrupts, NMIs, SMIs, or whatever,
  516. * then it cannot have executed an RCU read-side critical section
  517. * during that time, so there is no need for it to execute a
  518. * memory barrier.
  519. */
  520. if ((curr == snap) && ((curr & 0x1) == 0))
  521. return 0;
  522. /*
  523. * If the CPU either entered or exited an outermost interrupt,
  524. * SMI, NMI, or whatever handler, then we know that it executed
  525. * a memory barrier when doing so. So we don't need another one.
  526. */
  527. if (curr != snap)
  528. return 0;
  529. /* We need the CPU to execute a memory barrier. */
  530. return 1;
  531. }
  532. #else /* !CONFIG_NO_HZ */
  533. # define dyntick_save_progress_counter(cpu) do { } while (0)
  534. # define rcu_try_flip_waitack_needed(cpu) (1)
  535. # define rcu_try_flip_waitmb_needed(cpu) (1)
  536. #endif /* CONFIG_NO_HZ */
  537. /*
  538. * Get here when RCU is idle. Decide whether we need to
  539. * move out of idle state, and return non-zero if so.
  540. * "Straightforward" approach for the moment, might later
  541. * use callback-list lengths, grace-period duration, or
  542. * some such to determine when to exit idle state.
  543. * Might also need a pre-idle test that does not acquire
  544. * the lock, but let's get the simple case working first...
  545. */
  546. static int
  547. rcu_try_flip_idle(void)
  548. {
  549. int cpu;
  550. RCU_TRACE_ME(rcupreempt_trace_try_flip_i1);
  551. if (!rcu_pending(smp_processor_id())) {
  552. RCU_TRACE_ME(rcupreempt_trace_try_flip_ie1);
  553. return 0;
  554. }
  555. /*
  556. * Do the flip.
  557. */
  558. RCU_TRACE_ME(rcupreempt_trace_try_flip_g1);
  559. rcu_ctrlblk.completed++; /* stands in for rcu_try_flip_g2 */
  560. /*
  561. * Need a memory barrier so that other CPUs see the new
  562. * counter value before they see the subsequent change of all
  563. * the rcu_flip_flag instances to rcu_flipped.
  564. */
  565. smp_mb(); /* see above block comment. */
  566. /* Now ask each CPU for acknowledgement of the flip. */
  567. for_each_cpu_mask(cpu, rcu_cpu_online_map) {
  568. per_cpu(rcu_flip_flag, cpu) = rcu_flipped;
  569. dyntick_save_progress_counter(cpu);
  570. }
  571. return 1;
  572. }
  573. /*
  574. * Wait for CPUs to acknowledge the flip.
  575. */
  576. static int
  577. rcu_try_flip_waitack(void)
  578. {
  579. int cpu;
  580. RCU_TRACE_ME(rcupreempt_trace_try_flip_a1);
  581. for_each_cpu_mask(cpu, rcu_cpu_online_map)
  582. if (rcu_try_flip_waitack_needed(cpu) &&
  583. per_cpu(rcu_flip_flag, cpu) != rcu_flip_seen) {
  584. RCU_TRACE_ME(rcupreempt_trace_try_flip_ae1);
  585. return 0;
  586. }
  587. /*
  588. * Make sure our checks above don't bleed into subsequent
  589. * waiting for the sum of the counters to reach zero.
  590. */
  591. smp_mb(); /* see above block comment. */
  592. RCU_TRACE_ME(rcupreempt_trace_try_flip_a2);
  593. return 1;
  594. }
  595. /*
  596. * Wait for collective ``last'' counter to reach zero,
  597. * then tell all CPUs to do an end-of-grace-period memory barrier.
  598. */
  599. static int
  600. rcu_try_flip_waitzero(void)
  601. {
  602. int cpu;
  603. int lastidx = !(rcu_ctrlblk.completed & 0x1);
  604. int sum = 0;
  605. /* Check to see if the sum of the "last" counters is zero. */
  606. RCU_TRACE_ME(rcupreempt_trace_try_flip_z1);
  607. for_each_cpu_mask(cpu, rcu_cpu_online_map)
  608. sum += RCU_DATA_CPU(cpu)->rcu_flipctr[lastidx];
  609. if (sum != 0) {
  610. RCU_TRACE_ME(rcupreempt_trace_try_flip_ze1);
  611. return 0;
  612. }
  613. /*
  614. * This ensures that the other CPUs see the call for
  615. * memory barriers -after- the sum to zero has been
  616. * detected here
  617. */
  618. smp_mb(); /* ^^^^^^^^^^^^ */
  619. /* Call for a memory barrier from each CPU. */
  620. for_each_cpu_mask(cpu, rcu_cpu_online_map) {
  621. per_cpu(rcu_mb_flag, cpu) = rcu_mb_needed;
  622. dyntick_save_progress_counter(cpu);
  623. }
  624. RCU_TRACE_ME(rcupreempt_trace_try_flip_z2);
  625. return 1;
  626. }
  627. /*
  628. * Wait for all CPUs to do their end-of-grace-period memory barrier.
  629. * Return 0 once all CPUs have done so.
  630. */
  631. static int
  632. rcu_try_flip_waitmb(void)
  633. {
  634. int cpu;
  635. RCU_TRACE_ME(rcupreempt_trace_try_flip_m1);
  636. for_each_cpu_mask(cpu, rcu_cpu_online_map)
  637. if (rcu_try_flip_waitmb_needed(cpu) &&
  638. per_cpu(rcu_mb_flag, cpu) != rcu_mb_done) {
  639. RCU_TRACE_ME(rcupreempt_trace_try_flip_me1);
  640. return 0;
  641. }
  642. smp_mb(); /* Ensure that the above checks precede any following flip. */
  643. RCU_TRACE_ME(rcupreempt_trace_try_flip_m2);
  644. return 1;
  645. }
  646. /*
  647. * Attempt a single flip of the counters. Remember, a single flip does
  648. * -not- constitute a grace period. Instead, the interval between
  649. * at least GP_STAGES consecutive flips is a grace period.
  650. *
  651. * If anyone is nuts enough to run this CONFIG_PREEMPT_RCU implementation
  652. * on a large SMP, they might want to use a hierarchical organization of
  653. * the per-CPU-counter pairs.
  654. */
  655. static void rcu_try_flip(void)
  656. {
  657. unsigned long flags;
  658. RCU_TRACE_ME(rcupreempt_trace_try_flip_1);
  659. if (unlikely(!spin_trylock_irqsave(&rcu_ctrlblk.fliplock, flags))) {
  660. RCU_TRACE_ME(rcupreempt_trace_try_flip_e1);
  661. return;
  662. }
  663. /*
  664. * Take the next transition(s) through the RCU grace-period
  665. * flip-counter state machine.
  666. */
  667. switch (rcu_ctrlblk.rcu_try_flip_state) {
  668. case rcu_try_flip_idle_state:
  669. if (rcu_try_flip_idle())
  670. rcu_ctrlblk.rcu_try_flip_state =
  671. rcu_try_flip_waitack_state;
  672. break;
  673. case rcu_try_flip_waitack_state:
  674. if (rcu_try_flip_waitack())
  675. rcu_ctrlblk.rcu_try_flip_state =
  676. rcu_try_flip_waitzero_state;
  677. break;
  678. case rcu_try_flip_waitzero_state:
  679. if (rcu_try_flip_waitzero())
  680. rcu_ctrlblk.rcu_try_flip_state =
  681. rcu_try_flip_waitmb_state;
  682. break;
  683. case rcu_try_flip_waitmb_state:
  684. if (rcu_try_flip_waitmb())
  685. rcu_ctrlblk.rcu_try_flip_state =
  686. rcu_try_flip_idle_state;
  687. }
  688. spin_unlock_irqrestore(&rcu_ctrlblk.fliplock, flags);
  689. }
  690. /*
  691. * Check to see if this CPU needs to do a memory barrier in order to
  692. * ensure that any prior RCU read-side critical sections have committed
  693. * their counter manipulations and critical-section memory references
  694. * before declaring the grace period to be completed.
  695. */
  696. static void rcu_check_mb(int cpu)
  697. {
  698. if (per_cpu(rcu_mb_flag, cpu) == rcu_mb_needed) {
  699. smp_mb(); /* Ensure RCU read-side accesses are visible. */
  700. per_cpu(rcu_mb_flag, cpu) = rcu_mb_done;
  701. }
  702. }
  703. void rcu_check_callbacks(int cpu, int user)
  704. {
  705. unsigned long flags;
  706. struct rcu_data *rdp = RCU_DATA_CPU(cpu);
  707. rcu_check_mb(cpu);
  708. if (rcu_ctrlblk.completed == rdp->completed)
  709. rcu_try_flip();
  710. spin_lock_irqsave(&rdp->lock, flags);
  711. RCU_TRACE_RDP(rcupreempt_trace_check_callbacks, rdp);
  712. __rcu_advance_callbacks(rdp);
  713. if (rdp->donelist == NULL) {
  714. spin_unlock_irqrestore(&rdp->lock, flags);
  715. } else {
  716. spin_unlock_irqrestore(&rdp->lock, flags);
  717. raise_softirq(RCU_SOFTIRQ);
  718. }
  719. }
  720. /*
  721. * Needed by dynticks, to make sure all RCU processing has finished
  722. * when we go idle:
  723. */
  724. void rcu_advance_callbacks(int cpu, int user)
  725. {
  726. unsigned long flags;
  727. struct rcu_data *rdp = RCU_DATA_CPU(cpu);
  728. if (rcu_ctrlblk.completed == rdp->completed) {
  729. rcu_try_flip();
  730. if (rcu_ctrlblk.completed == rdp->completed)
  731. return;
  732. }
  733. spin_lock_irqsave(&rdp->lock, flags);
  734. RCU_TRACE_RDP(rcupreempt_trace_check_callbacks, rdp);
  735. __rcu_advance_callbacks(rdp);
  736. spin_unlock_irqrestore(&rdp->lock, flags);
  737. }
  738. #ifdef CONFIG_HOTPLUG_CPU
  739. #define rcu_offline_cpu_enqueue(srclist, srctail, dstlist, dsttail) do { \
  740. *dsttail = srclist; \
  741. if (srclist != NULL) { \
  742. dsttail = srctail; \
  743. srclist = NULL; \
  744. srctail = &srclist;\
  745. } \
  746. } while (0)
  747. void rcu_offline_cpu(int cpu)
  748. {
  749. int i;
  750. struct rcu_head *list = NULL;
  751. unsigned long flags;
  752. struct rcu_data *rdp = RCU_DATA_CPU(cpu);
  753. struct rcu_head **tail = &list;
  754. /*
  755. * Remove all callbacks from the newly dead CPU, retaining order.
  756. * Otherwise rcu_barrier() will fail
  757. */
  758. spin_lock_irqsave(&rdp->lock, flags);
  759. rcu_offline_cpu_enqueue(rdp->donelist, rdp->donetail, list, tail);
  760. for (i = GP_STAGES - 1; i >= 0; i--)
  761. rcu_offline_cpu_enqueue(rdp->waitlist[i], rdp->waittail[i],
  762. list, tail);
  763. rcu_offline_cpu_enqueue(rdp->nextlist, rdp->nexttail, list, tail);
  764. spin_unlock_irqrestore(&rdp->lock, flags);
  765. rdp->waitlistcount = 0;
  766. /* Disengage the newly dead CPU from the grace-period computation. */
  767. spin_lock_irqsave(&rcu_ctrlblk.fliplock, flags);
  768. rcu_check_mb(cpu);
  769. if (per_cpu(rcu_flip_flag, cpu) == rcu_flipped) {
  770. smp_mb(); /* Subsequent counter accesses must see new value */
  771. per_cpu(rcu_flip_flag, cpu) = rcu_flip_seen;
  772. smp_mb(); /* Subsequent RCU read-side critical sections */
  773. /* seen -after- acknowledgement. */
  774. }
  775. RCU_DATA_ME()->rcu_flipctr[0] += RCU_DATA_CPU(cpu)->rcu_flipctr[0];
  776. RCU_DATA_ME()->rcu_flipctr[1] += RCU_DATA_CPU(cpu)->rcu_flipctr[1];
  777. RCU_DATA_CPU(cpu)->rcu_flipctr[0] = 0;
  778. RCU_DATA_CPU(cpu)->rcu_flipctr[1] = 0;
  779. cpu_clear(cpu, rcu_cpu_online_map);
  780. spin_unlock_irqrestore(&rcu_ctrlblk.fliplock, flags);
  781. /*
  782. * Place the removed callbacks on the current CPU's queue.
  783. * Make them all start a new grace period: simple approach,
  784. * in theory could starve a given set of callbacks, but
  785. * you would need to be doing some serious CPU hotplugging
  786. * to make this happen. If this becomes a problem, adding
  787. * a synchronize_rcu() to the hotplug path would be a simple
  788. * fix.
  789. */
  790. local_irq_save(flags);
  791. rdp = RCU_DATA_ME();
  792. spin_lock(&rdp->lock);
  793. *rdp->nexttail = list;
  794. if (list)
  795. rdp->nexttail = tail;
  796. spin_unlock_irqrestore(&rdp->lock, flags);
  797. }
  798. void __devinit rcu_online_cpu(int cpu)
  799. {
  800. unsigned long flags;
  801. spin_lock_irqsave(&rcu_ctrlblk.fliplock, flags);
  802. cpu_set(cpu, rcu_cpu_online_map);
  803. spin_unlock_irqrestore(&rcu_ctrlblk.fliplock, flags);
  804. }
  805. #else /* #ifdef CONFIG_HOTPLUG_CPU */
  806. void rcu_offline_cpu(int cpu)
  807. {
  808. }
  809. void __devinit rcu_online_cpu(int cpu)
  810. {
  811. }
  812. #endif /* #else #ifdef CONFIG_HOTPLUG_CPU */
  813. static void rcu_process_callbacks(struct softirq_action *unused)
  814. {
  815. unsigned long flags;
  816. struct rcu_head *next, *list;
  817. struct rcu_data *rdp;
  818. local_irq_save(flags);
  819. rdp = RCU_DATA_ME();
  820. spin_lock(&rdp->lock);
  821. list = rdp->donelist;
  822. if (list == NULL) {
  823. spin_unlock_irqrestore(&rdp->lock, flags);
  824. return;
  825. }
  826. rdp->donelist = NULL;
  827. rdp->donetail = &rdp->donelist;
  828. RCU_TRACE_RDP(rcupreempt_trace_done_remove, rdp);
  829. spin_unlock_irqrestore(&rdp->lock, flags);
  830. while (list) {
  831. next = list->next;
  832. list->func(list);
  833. list = next;
  834. RCU_TRACE_ME(rcupreempt_trace_invoke);
  835. }
  836. }
  837. void call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu))
  838. {
  839. unsigned long flags;
  840. struct rcu_data *rdp;
  841. head->func = func;
  842. head->next = NULL;
  843. local_irq_save(flags);
  844. rdp = RCU_DATA_ME();
  845. spin_lock(&rdp->lock);
  846. __rcu_advance_callbacks(rdp);
  847. *rdp->nexttail = head;
  848. rdp->nexttail = &head->next;
  849. RCU_TRACE_RDP(rcupreempt_trace_next_add, rdp);
  850. spin_unlock(&rdp->lock);
  851. local_irq_restore(flags);
  852. }
  853. EXPORT_SYMBOL_GPL(call_rcu);
  854. /*
  855. * Wait until all currently running preempt_disable() code segments
  856. * (including hardware-irq-disable segments) complete. Note that
  857. * in -rt this does -not- necessarily result in all currently executing
  858. * interrupt -handlers- having completed.
  859. */
  860. void __synchronize_sched(void)
  861. {
  862. cpumask_t oldmask;
  863. int cpu;
  864. if (sched_getaffinity(0, &oldmask) < 0)
  865. oldmask = cpu_possible_map;
  866. for_each_online_cpu(cpu) {
  867. sched_setaffinity(0, &cpumask_of_cpu(cpu));
  868. schedule();
  869. }
  870. sched_setaffinity(0, &oldmask);
  871. }
  872. EXPORT_SYMBOL_GPL(__synchronize_sched);
  873. /*
  874. * Check to see if any future RCU-related work will need to be done
  875. * by the current CPU, even if none need be done immediately, returning
  876. * 1 if so. Assumes that notifiers would take care of handling any
  877. * outstanding requests from the RCU core.
  878. *
  879. * This function is part of the RCU implementation; it is -not-
  880. * an exported member of the RCU API.
  881. */
  882. int rcu_needs_cpu(int cpu)
  883. {
  884. struct rcu_data *rdp = RCU_DATA_CPU(cpu);
  885. return (rdp->donelist != NULL ||
  886. !!rdp->waitlistcount ||
  887. rdp->nextlist != NULL);
  888. }
  889. int rcu_pending(int cpu)
  890. {
  891. struct rcu_data *rdp = RCU_DATA_CPU(cpu);
  892. /* The CPU has at least one callback queued somewhere. */
  893. if (rdp->donelist != NULL ||
  894. !!rdp->waitlistcount ||
  895. rdp->nextlist != NULL)
  896. return 1;
  897. /* The RCU core needs an acknowledgement from this CPU. */
  898. if ((per_cpu(rcu_flip_flag, cpu) == rcu_flipped) ||
  899. (per_cpu(rcu_mb_flag, cpu) == rcu_mb_needed))
  900. return 1;
  901. /* This CPU has fallen behind the global grace-period number. */
  902. if (rdp->completed != rcu_ctrlblk.completed)
  903. return 1;
  904. /* Nothing needed from this CPU. */
  905. return 0;
  906. }
  907. static int __cpuinit rcu_cpu_notify(struct notifier_block *self,
  908. unsigned long action, void *hcpu)
  909. {
  910. long cpu = (long)hcpu;
  911. switch (action) {
  912. case CPU_UP_PREPARE:
  913. case CPU_UP_PREPARE_FROZEN:
  914. rcu_online_cpu(cpu);
  915. break;
  916. case CPU_UP_CANCELED:
  917. case CPU_UP_CANCELED_FROZEN:
  918. case CPU_DEAD:
  919. case CPU_DEAD_FROZEN:
  920. rcu_offline_cpu(cpu);
  921. break;
  922. default:
  923. break;
  924. }
  925. return NOTIFY_OK;
  926. }
  927. static struct notifier_block __cpuinitdata rcu_nb = {
  928. .notifier_call = rcu_cpu_notify,
  929. };
  930. void __init __rcu_init(void)
  931. {
  932. int cpu;
  933. int i;
  934. struct rcu_data *rdp;
  935. printk(KERN_NOTICE "Preemptible RCU implementation.\n");
  936. for_each_possible_cpu(cpu) {
  937. rdp = RCU_DATA_CPU(cpu);
  938. spin_lock_init(&rdp->lock);
  939. rdp->completed = 0;
  940. rdp->waitlistcount = 0;
  941. rdp->nextlist = NULL;
  942. rdp->nexttail = &rdp->nextlist;
  943. for (i = 0; i < GP_STAGES; i++) {
  944. rdp->waitlist[i] = NULL;
  945. rdp->waittail[i] = &rdp->waitlist[i];
  946. }
  947. rdp->donelist = NULL;
  948. rdp->donetail = &rdp->donelist;
  949. rdp->rcu_flipctr[0] = 0;
  950. rdp->rcu_flipctr[1] = 0;
  951. }
  952. register_cpu_notifier(&rcu_nb);
  953. /*
  954. * We don't need protection against CPU-Hotplug here
  955. * since
  956. * a) If a CPU comes online while we are iterating over the
  957. * cpu_online_map below, we would only end up making a
  958. * duplicate call to rcu_online_cpu() which sets the corresponding
  959. * CPU's mask in the rcu_cpu_online_map.
  960. *
  961. * b) A CPU cannot go offline at this point in time since the user
  962. * does not have access to the sysfs interface, nor do we
  963. * suspend the system.
  964. */
  965. for_each_online_cpu(cpu)
  966. rcu_cpu_notify(&rcu_nb, CPU_UP_PREPARE, (void *)(long) cpu);
  967. open_softirq(RCU_SOFTIRQ, rcu_process_callbacks, NULL);
  968. }
  969. /*
  970. * Deprecated, use synchronize_rcu() or synchronize_sched() instead.
  971. */
  972. void synchronize_kernel(void)
  973. {
  974. synchronize_rcu();
  975. }
  976. #ifdef CONFIG_RCU_TRACE
  977. long *rcupreempt_flipctr(int cpu)
  978. {
  979. return &RCU_DATA_CPU(cpu)->rcu_flipctr[0];
  980. }
  981. EXPORT_SYMBOL_GPL(rcupreempt_flipctr);
  982. int rcupreempt_flip_flag(int cpu)
  983. {
  984. return per_cpu(rcu_flip_flag, cpu);
  985. }
  986. EXPORT_SYMBOL_GPL(rcupreempt_flip_flag);
  987. int rcupreempt_mb_flag(int cpu)
  988. {
  989. return per_cpu(rcu_mb_flag, cpu);
  990. }
  991. EXPORT_SYMBOL_GPL(rcupreempt_mb_flag);
  992. char *rcupreempt_try_flip_state_name(void)
  993. {
  994. return rcu_try_flip_state_names[rcu_ctrlblk.rcu_try_flip_state];
  995. }
  996. EXPORT_SYMBOL_GPL(rcupreempt_try_flip_state_name);
  997. struct rcupreempt_trace *rcupreempt_trace_cpu(int cpu)
  998. {
  999. struct rcu_data *rdp = RCU_DATA_CPU(cpu);
  1000. return &rdp->trace;
  1001. }
  1002. EXPORT_SYMBOL_GPL(rcupreempt_trace_cpu);
  1003. #endif /* #ifdef RCU_TRACE */