rcupreempt.c

  1. /*
  2. * Read-Copy Update mechanism for mutual exclusion, realtime implementation
  3. *
  4. * This program is free software; you can redistribute it and/or modify
  5. * it under the terms of the GNU General Public License as published by
  6. * the Free Software Foundation; either version 2 of the License, or
  7. * (at your option) any later version.
  8. *
  9. * This program is distributed in the hope that it will be useful,
  10. * but WITHOUT ANY WARRANTY; without even the implied warranty of
  11. * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
  12. * GNU General Public License for more details.
  13. *
  14. * You should have received a copy of the GNU General Public License
  15. * along with this program; if not, write to the Free Software
  16. * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
  17. *
  18. * Copyright IBM Corporation, 2006
  19. *
  20. * Authors: Paul E. McKenney <paulmck@us.ibm.com>
  21. * With thanks to Esben Nielsen, Bill Huey, and Ingo Molnar
  22. * for pushing me away from locks and towards counters, and
  23. * to Suparna Bhattacharya for pushing me completely away
  24. * from atomic instructions on the read side.
  25. *
  26. * - Added handling of Dynamic Ticks
  27. * Copyright 2007 - Paul E. McKenney <paulmck@us.ibm.com>
  28. * - Steven Rostedt <srostedt@redhat.com>
  29. *
  30. * Papers: http://www.rdrop.com/users/paulmck/RCU
  31. *
  32. * Design Document: http://lwn.net/Articles/253651/
  33. *
  34. * For detailed explanation of Read-Copy Update mechanism see -
  35. * Documentation/RCU/ *.txt
  36. *
  37. */
  38. #include <linux/types.h>
  39. #include <linux/kernel.h>
  40. #include <linux/init.h>
  41. #include <linux/spinlock.h>
  42. #include <linux/smp.h>
  43. #include <linux/rcupdate.h>
  44. #include <linux/interrupt.h>
  45. #include <linux/sched.h>
  46. #include <asm/atomic.h>
  47. #include <linux/bitops.h>
  48. #include <linux/module.h>
  49. #include <linux/kthread.h>
  50. #include <linux/completion.h>
  51. #include <linux/moduleparam.h>
  52. #include <linux/percpu.h>
  53. #include <linux/notifier.h>
  54. #include <linux/cpu.h>
  55. #include <linux/random.h>
  56. #include <linux/delay.h>
  57. #include <linux/cpumask.h>
  58. #include <linux/rcupreempt_trace.h>
  59. #include <asm/byteorder.h>
  60. /*
  61. * PREEMPT_RCU data structures.
  62. */
  63. /*
  64. * GP_STAGES specifies the number of times the state machine has
  65. * to go through all the rcu_try_flip_states (see below)
  66. * in a single Grace Period.
  67. *
  68. * GP in GP_STAGES stands for Grace Period ;)
  69. */
  70. #define GP_STAGES 2
  71. struct rcu_data {
  72. spinlock_t lock; /* Protect rcu_data fields. */
  73. long completed; /* Number of last completed batch. */
  74. int waitlistcount;
  75. struct rcu_head *nextlist;
  76. struct rcu_head **nexttail;
  77. struct rcu_head *waitlist[GP_STAGES];
  78. struct rcu_head **waittail[GP_STAGES];
  79. struct rcu_head *donelist; /* from waitlist & waitschedlist */
  80. struct rcu_head **donetail;
  81. long rcu_flipctr[2];
  82. struct rcu_head *nextschedlist;
  83. struct rcu_head **nextschedtail;
  84. struct rcu_head *waitschedlist;
  85. struct rcu_head **waitschedtail;
  86. int rcu_sched_sleeping;
  87. #ifdef CONFIG_RCU_TRACE
  88. struct rcupreempt_trace trace;
  89. #endif /* #ifdef CONFIG_RCU_TRACE */
  90. };
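/*
 * Illustrative sketch, hypothetical helper only: every callback list in
 * rcu_data above (nextlist, waitlist[], donelist, and the rcu_sched
 * variants) is a NULL-terminated singly linked list paired with a tail
 * pointer that always addresses the terminating NULL pointer, so that
 * enqueue and whole-list splicing are O(1).  A minimal enqueue written
 * against that assumption would look like this; the real code open-codes
 * the same two assignments in call_rcu() and friends below.
 */
static inline void example_enqueue_next(struct rcu_data *rdp,
					struct rcu_head *head)
{
	head->next = NULL;
	*rdp->nexttail = head;		/* overwrite the terminating NULL */
	rdp->nexttail = &head->next;	/* tail now addresses the new NULL */
}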
  91. /*
  92. * States for rcu_try_flip() and friends.
  93. */
  94. enum rcu_try_flip_states {
  95. /*
  96. * Stay here if nothing is happening. Flip the counter if something
  97. * starts happening. Denoted by "I"
  98. */
  99. rcu_try_flip_idle_state,
  100. /*
  101. * Wait here for all CPUs to notice that the counter has flipped. This
  102. * prevents the old set of counters from ever being incremented once
  103. * we leave this state, which in turn is necessary because we cannot
  104. * test any individual counter for zero -- we can only check the sum.
  105. * Denoted by "A".
  106. */
  107. rcu_try_flip_waitack_state,
  108. /*
  109. * Wait here for the sum of the old per-CPU counters to reach zero.
  110. * Denoted by "Z".
  111. */
  112. rcu_try_flip_waitzero_state,
  113. /*
  114. * Wait here for each of the other CPUs to execute a memory barrier.
  115. * This is necessary to ensure that these other CPUs really have
  116. * completed executing their RCU read-side critical sections, despite
  117. * their CPUs wildly reordering memory. Denoted by "M".
  118. */
  119. rcu_try_flip_waitmb_state,
  120. };
  121. /*
  122. * States for rcu_ctrlblk.rcu_sched_sleep.
  123. */
  124. enum rcu_sched_sleep_states {
  125. rcu_sched_not_sleeping, /* Not sleeping, callbacks need GP. */
  126. rcu_sched_sleep_prep, /* Thinking of sleeping, rechecking. */
  127. rcu_sched_sleeping, /* Sleeping, awaken if GP needed. */
  128. };
  129. struct rcu_ctrlblk {
  130. spinlock_t fliplock; /* Protect state-machine transitions. */
  131. long completed; /* Number of last completed batch. */
  132. enum rcu_try_flip_states rcu_try_flip_state; /* The current state of
  133. the rcu state machine */
  134. spinlock_t schedlock; /* Protect rcu_sched sleep state. */
  135. enum rcu_sched_sleep_states sched_sleep; /* rcu_sched state. */
  136. wait_queue_head_t sched_wq; /* Place for rcu_sched to sleep. */
  137. };
  138. struct rcu_dyntick_sched {
  139. int dynticks;
  140. int dynticks_snap;
  141. int sched_qs;
  142. int sched_qs_snap;
  143. int sched_dynticks_snap;
  144. };
  145. static DEFINE_PER_CPU_SHARED_ALIGNED(struct rcu_dyntick_sched, rcu_dyntick_sched) = {
  146. .dynticks = 1,
  147. };
  148. static int rcu_pending(int cpu);
  149. void rcu_sched_qs(int cpu)
  150. {
  151. struct rcu_dyntick_sched *rdssp = &per_cpu(rcu_dyntick_sched, cpu);
  152. rdssp->sched_qs++;
  153. }
  154. #ifdef CONFIG_NO_HZ
  155. void rcu_enter_nohz(void)
  156. {
  157. static DEFINE_RATELIMIT_STATE(rs, 10 * HZ, 1);
  158. smp_mb(); /* CPUs seeing ++ must see prior RCU read-side crit sects */
  159. __get_cpu_var(rcu_dyntick_sched).dynticks++;
  160. WARN_ON_RATELIMIT(__get_cpu_var(rcu_dyntick_sched).dynticks & 0x1, &rs);
  161. }
  162. void rcu_exit_nohz(void)
  163. {
  164. static DEFINE_RATELIMIT_STATE(rs, 10 * HZ, 1);
  165. __get_cpu_var(rcu_dyntick_sched).dynticks++;
  166. smp_mb(); /* CPUs seeing ++ must see later RCU read-side crit sects */
  167. WARN_ON_RATELIMIT(!(__get_cpu_var(rcu_dyntick_sched).dynticks & 0x1),
  168. &rs);
  169. }
  170. #endif /* CONFIG_NO_HZ */
  171. static DEFINE_PER_CPU(struct rcu_data, rcu_data);
  172. static struct rcu_ctrlblk rcu_ctrlblk = {
  173. .fliplock = __SPIN_LOCK_UNLOCKED(rcu_ctrlblk.fliplock),
  174. .completed = 0,
  175. .rcu_try_flip_state = rcu_try_flip_idle_state,
  176. .schedlock = __SPIN_LOCK_UNLOCKED(rcu_ctrlblk.schedlock),
  177. .sched_sleep = rcu_sched_not_sleeping,
  178. .sched_wq = __WAIT_QUEUE_HEAD_INITIALIZER(rcu_ctrlblk.sched_wq),
  179. };
  180. static struct task_struct *rcu_sched_grace_period_task;
  181. #ifdef CONFIG_RCU_TRACE
  182. static char *rcu_try_flip_state_names[] =
  183. { "idle", "waitack", "waitzero", "waitmb" };
  184. #endif /* #ifdef CONFIG_RCU_TRACE */
  185. static DECLARE_BITMAP(rcu_cpu_online_map, NR_CPUS) __read_mostly
  186. = CPU_BITS_NONE;
  187. /*
  188. * Enum and per-CPU flag to determine when each CPU has seen
  189. * the most recent counter flip.
  190. */
  191. enum rcu_flip_flag_values {
  192. rcu_flip_seen, /* Steady/initial state, last flip seen. */
  193. /* Only GP detector can update. */
  194. rcu_flipped /* Flip just completed, need confirmation. */
  195. /* Only corresponding CPU can update. */
  196. };
  197. static DEFINE_PER_CPU_SHARED_ALIGNED(enum rcu_flip_flag_values, rcu_flip_flag)
  198. = rcu_flip_seen;
  199. /*
  200. * Enum and per-CPU flag to determine when each CPU has executed the
  201. * needed memory barrier to fence in memory references from its last RCU
  202. * read-side critical section in the just-completed grace period.
  203. */
  204. enum rcu_mb_flag_values {
  205. rcu_mb_done, /* Steady/initial state, no mb()s required. */
  206. /* Only GP detector can update. */
  207. rcu_mb_needed /* Flip just completed, need an mb(). */
  208. /* Only corresponding CPU can update. */
  209. };
  210. static DEFINE_PER_CPU_SHARED_ALIGNED(enum rcu_mb_flag_values, rcu_mb_flag)
  211. = rcu_mb_done;
  212. /*
  213. * RCU_DATA_ME: find the current CPU's rcu_data structure.
  214. * RCU_DATA_CPU: find the specified CPU's rcu_data structure.
  215. */
  216. #define RCU_DATA_ME() (&__get_cpu_var(rcu_data))
  217. #define RCU_DATA_CPU(cpu) (&per_cpu(rcu_data, cpu))
  218. /*
  219. * Helper macro for tracing when the appropriate rcu_data is not
  220. * cached in a local variable, but where the CPU number is so cached.
  221. */
  222. #define RCU_TRACE_CPU(f, cpu) RCU_TRACE(f, &(RCU_DATA_CPU(cpu)->trace));
  223. /*
  224. * Helper macro for tracing when the appropriate rcu_data is not
  225. * cached in a local variable.
  226. */
  227. #define RCU_TRACE_ME(f) RCU_TRACE(f, &(RCU_DATA_ME()->trace));
  228. /*
  229. * Helper macro for tracing when the appropriate rcu_data is pointed
  230. * to by a local variable.
  231. */
  232. #define RCU_TRACE_RDP(f, rdp) RCU_TRACE(f, &((rdp)->trace));
  233. #define RCU_SCHED_BATCH_TIME (HZ / 50)
  234. /*
  235. * Return the number of RCU batches processed thus far. Useful
  236. * for debug and statistics.
  237. */
  238. long rcu_batches_completed(void)
  239. {
  240. return rcu_ctrlblk.completed;
  241. }
  242. EXPORT_SYMBOL_GPL(rcu_batches_completed);
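/*
 * Hypothetical debug snippet (the example_* name is an assumption, not
 * an API of this file): a caller can bracket an operation with
 * rcu_batches_completed() to see how many counter flips happened in
 * between; GP_STAGES consecutive flips make up one full grace period.
 */
static void example_show_flip_progress(void)
{
	long before = rcu_batches_completed();

	synchronize_rcu();	/* wait for at least one full grace period */
	printk(KERN_DEBUG "counter flips observed: %ld\n",
	       rcu_batches_completed() - before);
}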
  243. void __rcu_read_lock(void)
  244. {
  245. int idx;
  246. struct task_struct *t = current;
  247. int nesting;
  248. nesting = ACCESS_ONCE(t->rcu_read_lock_nesting);
  249. if (nesting != 0) {
  250. /* An earlier rcu_read_lock() covers us, just count it. */
  251. t->rcu_read_lock_nesting = nesting + 1;
  252. } else {
  253. unsigned long flags;
  254. /*
  255. * We disable interrupts for the following reasons:
  256. * - If we get a scheduling clock interrupt here, and we
  257. * end up acking the counter flip, it's like a promise
  258. * that we will never increment the old counter again.
  259. * Thus we will break that promise if that
  260. * scheduling clock interrupt happens between the time
  261. * we pick the .completed field and the time that we
  262. * increment our counter.
  263. *
  264. * - We don't want to be preempted out here.
  265. *
  266. * NMIs can still occur, of course, and might themselves
  267. * contain rcu_read_lock().
  268. */
  269. local_irq_save(flags);
  270. /*
  271. * Outermost nesting of rcu_read_lock(), so increment
  272. * the current counter for the current CPU. Use volatile
  273. * casts to prevent the compiler from reordering.
  274. */
  275. idx = ACCESS_ONCE(rcu_ctrlblk.completed) & 0x1;
  276. ACCESS_ONCE(RCU_DATA_ME()->rcu_flipctr[idx])++;
  277. /*
  278. * Now that the per-CPU counter has been incremented, we
  279. * are protected from races with rcu_read_lock() invoked
  280. * from NMI handlers on this CPU. We can therefore safely
  281. * increment the nesting counter, relieving further NMIs
  282. * of the need to increment the per-CPU counter.
  283. */
  284. ACCESS_ONCE(t->rcu_read_lock_nesting) = nesting + 1;
  285. /*
  286. * Now that we have prevented any NMIs from storing
  287. * to the ->rcu_flipctr_idx, we can safely use it to
  288. * remember which counter to decrement in the matching
  289. * rcu_read_unlock().
  290. */
  291. ACCESS_ONCE(t->rcu_flipctr_idx) = idx;
  292. local_irq_restore(flags);
  293. }
  294. }
  295. EXPORT_SYMBOL_GPL(__rcu_read_lock);
  296. void __rcu_read_unlock(void)
  297. {
  298. int idx;
  299. struct task_struct *t = current;
  300. int nesting;
  301. nesting = ACCESS_ONCE(t->rcu_read_lock_nesting);
  302. if (nesting > 1) {
  303. /*
  304. * We are still protected by the enclosing rcu_read_lock(),
  305. * so simply decrement the counter.
  306. */
  307. t->rcu_read_lock_nesting = nesting - 1;
  308. } else {
  309. unsigned long flags;
  310. /*
  311. * Disable local interrupts to prevent the grace-period
  312. * detection state machine from seeing us half-done.
  313. * NMIs can still occur, of course, and might themselves
  314. * contain rcu_read_lock() and rcu_read_unlock().
  315. */
  316. local_irq_save(flags);
  317. /*
  318. * Outermost nesting of rcu_read_unlock(), so we must
  319. * decrement the current counter for the current CPU.
  320. * This must be done carefully, because NMIs can
  321. * occur at any point in this code, and any rcu_read_lock()
  322. * and rcu_read_unlock() pairs in the NMI handlers
  323. * must interact non-destructively with this code.
  324. * Lots of volatile casts, and -very- careful ordering.
  325. *
  326. * Changes to this code, including this one, must be
  327. * inspected, validated, and tested extremely carefully!!!
  328. */
  329. /*
  330. * First, pick up the index.
  331. */
  332. idx = ACCESS_ONCE(t->rcu_flipctr_idx);
  333. /*
  334. * Now that we have fetched the counter index, it is
  335. * safe to decrement the per-task RCU nesting counter.
  336. * After this, any interrupts or NMIs will increment and
  337. * decrement the per-CPU counters.
  338. */
  339. ACCESS_ONCE(t->rcu_read_lock_nesting) = nesting - 1;
  340. /*
  341. * The per-task nesting count has now dropped to zero, so
  342. * NMIs that occur after this point will route their
  343. * rcu_read_lock() calls through this "else" clause, and
  344. * will thus start incrementing the per-CPU counter on
  345. * their own. They will also clobber ->rcu_flipctr_idx,
  346. * but that is OK, since we have already fetched it.
  347. */
  348. ACCESS_ONCE(RCU_DATA_ME()->rcu_flipctr[idx])--;
  349. local_irq_restore(flags);
  350. }
  351. }
  352. EXPORT_SYMBOL_GPL(__rcu_read_unlock);
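/*
 * Hedged usage sketch, hypothetical example_* names: this is the kind
 * of read-side critical section the flip-counter bookkeeping above
 * protects.  Under CONFIG_PREEMPT_RCU the section may be preempted,
 * but the rcu_flipctr entry chosen at rcu_read_lock() time holds the
 * corresponding grace period off until the matching rcu_read_unlock().
 */
struct example_foo {
	int key;
	int data;
	struct rcu_head rcu;
};
static struct example_foo *example_foo_ptr;

static int example_reader(int key)
{
	struct example_foo *p;
	int ret = -1;

	rcu_read_lock();			/* maps to __rcu_read_lock() here */
	p = rcu_dereference(example_foo_ptr);
	if (p != NULL && p->key == key)
		ret = p->data;
	rcu_read_unlock();			/* maps to __rcu_read_unlock() here */
	return ret;
}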
  353. /*
  354. * If a global counter flip has occurred since the last time that we
  355. * advanced callbacks, advance them. Hardware interrupts must be
  356. * disabled when calling this function.
  357. */
  358. static void __rcu_advance_callbacks(struct rcu_data *rdp)
  359. {
  360. int cpu;
  361. int i;
  362. int wlc = 0;
  363. if (rdp->completed != rcu_ctrlblk.completed) {
  364. if (rdp->waitlist[GP_STAGES - 1] != NULL) {
  365. *rdp->donetail = rdp->waitlist[GP_STAGES - 1];
  366. rdp->donetail = rdp->waittail[GP_STAGES - 1];
  367. RCU_TRACE_RDP(rcupreempt_trace_move2done, rdp);
  368. }
  369. for (i = GP_STAGES - 2; i >= 0; i--) {
  370. if (rdp->waitlist[i] != NULL) {
  371. rdp->waitlist[i + 1] = rdp->waitlist[i];
  372. rdp->waittail[i + 1] = rdp->waittail[i];
  373. wlc++;
  374. } else {
  375. rdp->waitlist[i + 1] = NULL;
  376. rdp->waittail[i + 1] =
  377. &rdp->waitlist[i + 1];
  378. }
  379. }
  380. if (rdp->nextlist != NULL) {
  381. rdp->waitlist[0] = rdp->nextlist;
  382. rdp->waittail[0] = rdp->nexttail;
  383. wlc++;
  384. rdp->nextlist = NULL;
  385. rdp->nexttail = &rdp->nextlist;
  386. RCU_TRACE_RDP(rcupreempt_trace_move2wait, rdp);
  387. } else {
  388. rdp->waitlist[0] = NULL;
  389. rdp->waittail[0] = &rdp->waitlist[0];
  390. }
  391. rdp->waitlistcount = wlc;
  392. rdp->completed = rcu_ctrlblk.completed;
  393. }
  394. /*
  395. * Check to see if this CPU needs to report that it has seen
  396. * the most recent counter flip, thereby declaring that all
  397. * subsequent rcu_read_lock() invocations will respect this flip.
  398. */
  399. cpu = raw_smp_processor_id();
  400. if (per_cpu(rcu_flip_flag, cpu) == rcu_flipped) {
  401. smp_mb(); /* Subsequent counter accesses must see new value */
  402. per_cpu(rcu_flip_flag, cpu) = rcu_flip_seen;
  403. smp_mb(); /* Subsequent RCU read-side critical sections */
  404. /* seen -after- acknowledgement. */
  405. }
  406. }
  407. #ifdef CONFIG_NO_HZ
  408. static DEFINE_PER_CPU(int, rcu_update_flag);
  409. /**
  410. * rcu_irq_enter - Called from Hard irq handlers and NMI/SMI.
  411. *
  412. * If the CPU was idle with dynamic ticks active, this updates the
  413. * rcu_dyntick_sched.dynticks to let the RCU handling know that the
  414. * CPU is active.
  415. */
  416. void rcu_irq_enter(void)
  417. {
  418. int cpu = smp_processor_id();
  419. struct rcu_dyntick_sched *rdssp = &per_cpu(rcu_dyntick_sched, cpu);
  420. if (per_cpu(rcu_update_flag, cpu))
  421. per_cpu(rcu_update_flag, cpu)++;
  422. /*
  423. * Only update if we are coming from a stopped ticks mode
  424. * (rcu_dyntick_sched.dynticks is even).
  425. */
  426. if (!in_interrupt() &&
  427. (rdssp->dynticks & 0x1) == 0) {
  428. /*
  429. * The following might seem like we could have a race
  430. * with NMI/SMIs. But this really isn't a problem.
  431. * Here we do a read/modify/write, and the race happens
  432. * when an NMI/SMI comes in after the read and before
  433. * the write. But NMI/SMIs will increment this counter
  434. * twice before returning, so the zero bit will not
  435. * be corrupted by the NMI/SMI which is the most important
  436. * part.
  437. *
  438. * The only consequence is that we would bring the counter
  439. * back to a position it was in during the NMI/SMI.
  440. * But the zero bit would be set, so the rest of the
  441. * counter would again be ignored.
  442. *
  443. * On return from the IRQ, the zero bit may be 0 and the
  444. * counter may hold the same value it had on return from
  445. * the NMI/SMI. If the state machine were unlucky enough to
  446. * see that, it still doesn't matter, since all
  447. * RCU read-side critical sections on this CPU would
  448. * have already completed.
  449. */
  450. rdssp->dynticks++;
  451. /*
  452. * The following memory barrier ensures that any
  453. * rcu_read_lock() primitives in the irq handler
  454. * are seen by other CPUs to follow the above
  455. * increment to rcu_dyntick_sched.dynticks. This is
  456. * required in order for other CPUs to correctly
  457. * determine when it is safe to advance the RCU
  458. * grace-period state machine.
  459. */
  460. smp_mb(); /* see above block comment. */
  461. /*
  462. * Since we can't determine the dynamic tick mode from
  463. * the rcu_dyntick_sched.dynticks after this routine,
  464. * we use a second flag to acknowledge that we came
  465. * from an idle state with ticks stopped.
  466. */
  467. per_cpu(rcu_update_flag, cpu)++;
  468. /*
  469. * If we take an NMI/SMI now, they will also increment
  470. * the rcu_update_flag, and will not update the
  471. * rcu_dyntick_sched.dynticks on exit. That is for
  472. * this IRQ to do.
  473. */
  474. }
  475. }
  476. /**
  477. * rcu_irq_exit - Called from exiting Hard irq context.
  478. *
  479. * If the CPU was idle with dynamic ticks active, update the
  480. * rcu_dyntick_sched.dynticks to let the RCU handling be
  481. * aware that the CPU is going back to idle with no ticks.
  482. */
  483. void rcu_irq_exit(void)
  484. {
  485. int cpu = smp_processor_id();
  486. struct rcu_dyntick_sched *rdssp = &per_cpu(rcu_dyntick_sched, cpu);
  487. /*
  488. * rcu_update_flag is set if we interrupted the CPU
  489. * when it was idle with ticks stopped.
  490. * Once this occurs, we keep track of interrupt nesting
  491. * because a NMI/SMI could also come in, and we still
  492. * only want the IRQ that started the increment of the
  493. * rcu_dyntick_sched.dynticks to be the one that modifies
  494. * it on exit.
  495. */
  496. if (per_cpu(rcu_update_flag, cpu)) {
  497. if (--per_cpu(rcu_update_flag, cpu))
  498. return;
  499. /* This must match the interrupt nesting */
  500. WARN_ON(in_interrupt());
  501. /*
  502. * If an NMI/SMI happens now we are still
  503. * protected by the rcu_dyntick_sched.dynticks being odd.
  504. */
  505. /*
  506. * The following memory barrier ensures that any
  507. * rcu_read_unlock() primitives in the irq handler
  508. * are seen by other CPUs to precede the following
  509. * increment to rcu_dyntick_sched.dynticks. This
  510. * is required in order for other CPUs to determine
  511. * when it is safe to advance the RCU grace-period
  512. * state machine.
  513. */
  514. smp_mb(); /* see above block comment. */
  515. rdssp->dynticks++;
  516. WARN_ON(rdssp->dynticks & 0x1);
  517. }
  518. }
  519. void rcu_nmi_enter(void)
  520. {
  521. rcu_irq_enter();
  522. }
  523. void rcu_nmi_exit(void)
  524. {
  525. rcu_irq_exit();
  526. }
  527. static void dyntick_save_progress_counter(int cpu)
  528. {
  529. struct rcu_dyntick_sched *rdssp = &per_cpu(rcu_dyntick_sched, cpu);
  530. rdssp->dynticks_snap = rdssp->dynticks;
  531. }
  532. static inline int
  533. rcu_try_flip_waitack_needed(int cpu)
  534. {
  535. long curr;
  536. long snap;
  537. struct rcu_dyntick_sched *rdssp = &per_cpu(rcu_dyntick_sched, cpu);
  538. curr = rdssp->dynticks;
  539. snap = rdssp->dynticks_snap;
  540. smp_mb(); /* force ordering with cpu entering/leaving dynticks. */
  541. /*
  542. * If the CPU remained in dynticks mode for the entire time
  543. * and didn't take any interrupts, NMIs, SMIs, or whatever,
  544. * then it cannot be in the middle of an rcu_read_lock(), so
  545. * the next rcu_read_lock() it executes must use the new value
  546. * of the counter. So we can safely pretend that this CPU
  547. * already acknowledged the counter.
  548. */
  549. if ((curr == snap) && ((curr & 0x1) == 0))
  550. return 0;
  551. /*
  552. * If the CPU passed through or entered a dynticks idle phase with
  553. * no active irq handlers, then, as above, we can safely pretend
  554. * that this CPU already acknowledged the counter.
  555. */
  556. if ((curr - snap) > 2 || (curr & 0x1) == 0)
  557. return 0;
  558. /* We need this CPU to explicitly acknowledge the counter flip. */
  559. return 1;
  560. }
  561. static inline int
  562. rcu_try_flip_waitmb_needed(int cpu)
  563. {
  564. long curr;
  565. long snap;
  566. struct rcu_dyntick_sched *rdssp = &per_cpu(rcu_dyntick_sched, cpu);
  567. curr = rdssp->dynticks;
  568. snap = rdssp->dynticks_snap;
  569. smp_mb(); /* force ordering with cpu entering/leaving dynticks. */
  570. /*
  571. * If the CPU remained in dynticks mode for the entire time
  572. * and didn't take any interrupts, NMIs, SMIs, or whatever,
  573. * then it cannot have executed an RCU read-side critical section
  574. * during that time, so there is no need for it to execute a
  575. * memory barrier.
  576. */
  577. if ((curr == snap) && ((curr & 0x1) == 0))
  578. return 0;
  579. /*
  580. * If the CPU either entered or exited an outermost interrupt,
  581. * SMI, NMI, or whatever handler, then we know that it executed
  582. * a memory barrier when doing so. So we don't need another one.
  583. */
  584. if (curr != snap)
  585. return 0;
  586. /* We need the CPU to execute a memory barrier. */
  587. return 1;
  588. }
  589. static void dyntick_save_progress_counter_sched(int cpu)
  590. {
  591. struct rcu_dyntick_sched *rdssp = &per_cpu(rcu_dyntick_sched, cpu);
  592. rdssp->sched_dynticks_snap = rdssp->dynticks;
  593. }
  594. static int rcu_qsctr_inc_needed_dyntick(int cpu)
  595. {
  596. long curr;
  597. long snap;
  598. struct rcu_dyntick_sched *rdssp = &per_cpu(rcu_dyntick_sched, cpu);
  599. curr = rdssp->dynticks;
  600. snap = rdssp->sched_dynticks_snap;
  601. smp_mb(); /* force ordering with cpu entering/leaving dynticks. */
  602. /*
  603. * If the CPU remained in dynticks mode for the entire time
  604. * and didn't take any interrupts, NMIs, SMIs, or whatever,
  605. * then it cannot be in the middle of an rcu_read_lock(), so
  606. * the next rcu_read_lock() it executes must use the new value
  607. * of the counter. Therefore, this CPU has been in a quiescent
  608. * state the entire time, and we don't need to wait for it.
  609. */
  610. if ((curr == snap) && ((curr & 0x1) == 0))
  611. return 0;
  612. /*
  613. * If the CPU passed through or entered a dynticks idle phase with
  614. * no active irq handlers, then, as above, this CPU has already
  615. * passed through a quiescent state.
  616. */
  617. if ((curr - snap) > 2 || (snap & 0x1) == 0)
  618. return 0;
  619. /* We need this CPU to go through a quiescent state. */
  620. return 1;
  621. }
  622. #else /* !CONFIG_NO_HZ */
  623. # define dyntick_save_progress_counter(cpu) do { } while (0)
  624. # define rcu_try_flip_waitack_needed(cpu) (1)
  625. # define rcu_try_flip_waitmb_needed(cpu) (1)
  626. # define dyntick_save_progress_counter_sched(cpu) do { } while (0)
  627. # define rcu_qsctr_inc_needed_dyntick(cpu) (1)
  628. #endif /* CONFIG_NO_HZ */
  629. static void save_qsctr_sched(int cpu)
  630. {
  631. struct rcu_dyntick_sched *rdssp = &per_cpu(rcu_dyntick_sched, cpu);
  632. rdssp->sched_qs_snap = rdssp->sched_qs;
  633. }
  634. static inline int rcu_qsctr_inc_needed(int cpu)
  635. {
  636. struct rcu_dyntick_sched *rdssp = &per_cpu(rcu_dyntick_sched, cpu);
  637. /*
  638. * If there has been a quiescent state, no more need to wait
  639. * on this CPU.
  640. */
  641. if (rdssp->sched_qs != rdssp->sched_qs_snap) {
  642. smp_mb(); /* force ordering with cpu entering schedule(). */
  643. return 0;
  644. }
  645. /* We need this CPU to go through a quiescent state. */
  646. return 1;
  647. }
  648. /*
  649. * Get here when RCU is idle. Decide whether we need to
  650. * move out of idle state, and return non-zero if so.
  651. * "Straightforward" approach for the moment, might later
  652. * use callback-list lengths, grace-period duration, or
  653. * some such to determine when to exit idle state.
  654. * Might also need a pre-idle test that does not acquire
  655. * the lock, but let's get the simple case working first...
  656. */
  657. static int
  658. rcu_try_flip_idle(void)
  659. {
  660. int cpu;
  661. RCU_TRACE_ME(rcupreempt_trace_try_flip_i1);
  662. if (!rcu_pending(smp_processor_id())) {
  663. RCU_TRACE_ME(rcupreempt_trace_try_flip_ie1);
  664. return 0;
  665. }
  666. /*
  667. * Do the flip.
  668. */
  669. RCU_TRACE_ME(rcupreempt_trace_try_flip_g1);
  670. rcu_ctrlblk.completed++; /* stands in for rcu_try_flip_g2 */
  671. /*
  672. * Need a memory barrier so that other CPUs see the new
  673. * counter value before they see the subsequent change of all
  674. * the rcu_flip_flag instances to rcu_flipped.
  675. */
  676. smp_mb(); /* see above block comment. */
  677. /* Now ask each CPU for acknowledgement of the flip. */
  678. for_each_cpu(cpu, to_cpumask(rcu_cpu_online_map)) {
  679. per_cpu(rcu_flip_flag, cpu) = rcu_flipped;
  680. dyntick_save_progress_counter(cpu);
  681. }
  682. return 1;
  683. }
  684. /*
  685. * Wait for CPUs to acknowledge the flip.
  686. */
  687. static int
  688. rcu_try_flip_waitack(void)
  689. {
  690. int cpu;
  691. RCU_TRACE_ME(rcupreempt_trace_try_flip_a1);
  692. for_each_cpu(cpu, to_cpumask(rcu_cpu_online_map))
  693. if (rcu_try_flip_waitack_needed(cpu) &&
  694. per_cpu(rcu_flip_flag, cpu) != rcu_flip_seen) {
  695. RCU_TRACE_ME(rcupreempt_trace_try_flip_ae1);
  696. return 0;
  697. }
  698. /*
  699. * Make sure our checks above don't bleed into subsequent
  700. * waiting for the sum of the counters to reach zero.
  701. */
  702. smp_mb(); /* see above block comment. */
  703. RCU_TRACE_ME(rcupreempt_trace_try_flip_a2);
  704. return 1;
  705. }
  706. /*
  707. * Wait for collective ``last'' counter to reach zero,
  708. * then tell all CPUs to do an end-of-grace-period memory barrier.
  709. */
  710. static int
  711. rcu_try_flip_waitzero(void)
  712. {
  713. int cpu;
  714. int lastidx = !(rcu_ctrlblk.completed & 0x1);
  715. int sum = 0;
  716. /* Check to see if the sum of the "last" counters is zero. */
  717. RCU_TRACE_ME(rcupreempt_trace_try_flip_z1);
  718. for_each_possible_cpu(cpu)
  719. sum += RCU_DATA_CPU(cpu)->rcu_flipctr[lastidx];
  720. if (sum != 0) {
  721. RCU_TRACE_ME(rcupreempt_trace_try_flip_ze1);
  722. return 0;
  723. }
  724. /*
  725. * This ensures that the other CPUs see the call for
  726. * memory barriers -after- the sum to zero has been
  727. * detected here
  728. */
  729. smp_mb(); /* ^^^^^^^^^^^^ */
  730. /* Call for a memory barrier from each CPU. */
  731. for_each_cpu(cpu, to_cpumask(rcu_cpu_online_map)) {
  732. per_cpu(rcu_mb_flag, cpu) = rcu_mb_needed;
  733. dyntick_save_progress_counter(cpu);
  734. }
  735. RCU_TRACE_ME(rcupreempt_trace_try_flip_z2);
  736. return 1;
  737. }
  738. /*
  739. * Wait for all CPUs to do their end-of-grace-period memory barrier.
  740. * Return 1 once all CPUs have done so.
  741. */
  742. static int
  743. rcu_try_flip_waitmb(void)
  744. {
  745. int cpu;
  746. RCU_TRACE_ME(rcupreempt_trace_try_flip_m1);
  747. for_each_cpu(cpu, to_cpumask(rcu_cpu_online_map))
  748. if (rcu_try_flip_waitmb_needed(cpu) &&
  749. per_cpu(rcu_mb_flag, cpu) != rcu_mb_done) {
  750. RCU_TRACE_ME(rcupreempt_trace_try_flip_me1);
  751. return 0;
  752. }
  753. smp_mb(); /* Ensure that the above checks precede any following flip. */
  754. RCU_TRACE_ME(rcupreempt_trace_try_flip_m2);
  755. return 1;
  756. }
  757. /*
  758. * Attempt a single flip of the counters. Remember, a single flip does
  759. * -not- constitute a grace period. Instead, the interval between
  760. * at least GP_STAGES consecutive flips is a grace period.
  761. *
  762. * If anyone is nuts enough to run this CONFIG_PREEMPT_RCU implementation
  763. * on a large SMP, they might want to use a hierarchical organization of
  764. * the per-CPU-counter pairs.
  765. */
  766. static void rcu_try_flip(void)
  767. {
  768. unsigned long flags;
  769. RCU_TRACE_ME(rcupreempt_trace_try_flip_1);
  770. if (unlikely(!spin_trylock_irqsave(&rcu_ctrlblk.fliplock, flags))) {
  771. RCU_TRACE_ME(rcupreempt_trace_try_flip_e1);
  772. return;
  773. }
  774. /*
  775. * Take the next transition(s) through the RCU grace-period
  776. * flip-counter state machine.
  777. */
  778. switch (rcu_ctrlblk.rcu_try_flip_state) {
  779. case rcu_try_flip_idle_state:
  780. if (rcu_try_flip_idle())
  781. rcu_ctrlblk.rcu_try_flip_state =
  782. rcu_try_flip_waitack_state;
  783. break;
  784. case rcu_try_flip_waitack_state:
  785. if (rcu_try_flip_waitack())
  786. rcu_ctrlblk.rcu_try_flip_state =
  787. rcu_try_flip_waitzero_state;
  788. break;
  789. case rcu_try_flip_waitzero_state:
  790. if (rcu_try_flip_waitzero())
  791. rcu_ctrlblk.rcu_try_flip_state =
  792. rcu_try_flip_waitmb_state;
  793. break;
  794. case rcu_try_flip_waitmb_state:
  795. if (rcu_try_flip_waitmb())
  796. rcu_ctrlblk.rcu_try_flip_state =
  797. rcu_try_flip_idle_state;
  798. }
  799. spin_unlock_irqrestore(&rcu_ctrlblk.fliplock, flags);
  800. }
  801. /*
  802. * Check to see if this CPU needs to do a memory barrier in order to
  803. * ensure that any prior RCU read-side critical sections have committed
  804. * their counter manipulations and critical-section memory references
  805. * before declaring the grace period to be completed.
  806. */
  807. static void rcu_check_mb(int cpu)
  808. {
  809. if (per_cpu(rcu_mb_flag, cpu) == rcu_mb_needed) {
  810. smp_mb(); /* Ensure RCU read-side accesses are visible. */
  811. per_cpu(rcu_mb_flag, cpu) = rcu_mb_done;
  812. }
  813. }
  814. void rcu_check_callbacks(int cpu, int user)
  815. {
  816. unsigned long flags;
  817. struct rcu_data *rdp;
  818. if (!rcu_pending(cpu))
  819. return; /* if nothing for RCU to do. */
  820. /*
  821. * If this CPU took its interrupt from user mode or from the
  822. * idle loop, and this is not a nested interrupt, then
  823. * this CPU has to have exited all prior preempt-disable
  824. * sections of code. So invoke rcu_sched_qs() to note this.
  825. *
  826. * The memory barrier is needed to handle the case where
  827. * writes from a preempt-disable section of code get reordered
  828. * into schedule() by this CPU's write buffer. So the memory
  829. * barrier makes sure that the rcu_sched_qs() is seen by other
  830. * CPUs to happen after any such write.
  831. */
  832. rdp = RCU_DATA_CPU(cpu);
  833. if (user ||
  834. (idle_cpu(cpu) && !in_softirq() &&
  835. hardirq_count() <= (1 << HARDIRQ_SHIFT))) {
  836. smp_mb(); /* Guard against aggressive schedule(). */
  837. rcu_sched_qs(cpu);
  838. }
  839. rcu_check_mb(cpu);
  840. if (rcu_ctrlblk.completed == rdp->completed)
  841. rcu_try_flip();
  842. spin_lock_irqsave(&rdp->lock, flags);
  843. RCU_TRACE_RDP(rcupreempt_trace_check_callbacks, rdp);
  844. __rcu_advance_callbacks(rdp);
  845. if (rdp->donelist == NULL) {
  846. spin_unlock_irqrestore(&rdp->lock, flags);
  847. } else {
  848. spin_unlock_irqrestore(&rdp->lock, flags);
  849. raise_softirq(RCU_SOFTIRQ);
  850. }
  851. }
  852. /*
  853. * Needed by dynticks, to make sure all RCU processing has finished
  854. * when we go idle:
  855. */
  856. void rcu_advance_callbacks(int cpu, int user)
  857. {
  858. unsigned long flags;
  859. struct rcu_data *rdp = RCU_DATA_CPU(cpu);
  860. if (rcu_ctrlblk.completed == rdp->completed) {
  861. rcu_try_flip();
  862. if (rcu_ctrlblk.completed == rdp->completed)
  863. return;
  864. }
  865. spin_lock_irqsave(&rdp->lock, flags);
  866. RCU_TRACE_RDP(rcupreempt_trace_check_callbacks, rdp);
  867. __rcu_advance_callbacks(rdp);
  868. spin_unlock_irqrestore(&rdp->lock, flags);
  869. }
  870. #ifdef CONFIG_HOTPLUG_CPU
  871. #define rcu_offline_cpu_enqueue(srclist, srctail, dstlist, dsttail) do { \
  872. *dsttail = srclist; \
  873. if (srclist != NULL) { \
  874. dsttail = srctail; \
  875. srclist = NULL; \
  876. srctail = &srclist;\
  877. } \
  878. } while (0)
  879. void rcu_offline_cpu(int cpu)
  880. {
  881. int i;
  882. struct rcu_head *list = NULL;
  883. unsigned long flags;
  884. struct rcu_data *rdp = RCU_DATA_CPU(cpu);
  885. struct rcu_head *schedlist = NULL;
  886. struct rcu_head **schedtail = &schedlist;
  887. struct rcu_head **tail = &list;
  888. /*
  889. * Remove all callbacks from the newly dead CPU, retaining order.
  890. * Otherwise rcu_barrier() will fail.
  891. */
  892. spin_lock_irqsave(&rdp->lock, flags);
  893. rcu_offline_cpu_enqueue(rdp->donelist, rdp->donetail, list, tail);
  894. for (i = GP_STAGES - 1; i >= 0; i--)
  895. rcu_offline_cpu_enqueue(rdp->waitlist[i], rdp->waittail[i],
  896. list, tail);
  897. rcu_offline_cpu_enqueue(rdp->nextlist, rdp->nexttail, list, tail);
  898. rcu_offline_cpu_enqueue(rdp->waitschedlist, rdp->waitschedtail,
  899. schedlist, schedtail);
  900. rcu_offline_cpu_enqueue(rdp->nextschedlist, rdp->nextschedtail,
  901. schedlist, schedtail);
  902. rdp->rcu_sched_sleeping = 0;
  903. spin_unlock_irqrestore(&rdp->lock, flags);
  904. rdp->waitlistcount = 0;
  905. /* Disengage the newly dead CPU from the grace-period computation. */
  906. spin_lock_irqsave(&rcu_ctrlblk.fliplock, flags);
  907. rcu_check_mb(cpu);
  908. if (per_cpu(rcu_flip_flag, cpu) == rcu_flipped) {
  909. smp_mb(); /* Subsequent counter accesses must see new value */
  910. per_cpu(rcu_flip_flag, cpu) = rcu_flip_seen;
  911. smp_mb(); /* Subsequent RCU read-side critical sections */
  912. /* seen -after- acknowledgement. */
  913. }
  914. cpumask_clear_cpu(cpu, to_cpumask(rcu_cpu_online_map));
  915. spin_unlock_irqrestore(&rcu_ctrlblk.fliplock, flags);
  916. /*
  917. * Place the removed callbacks on the current CPU's queue.
  918. * Make them all start a new grace period: simple approach,
  919. * in theory could starve a given set of callbacks, but
  920. * you would need to be doing some serious CPU hotplugging
  921. * to make this happen. If this becomes a problem, adding
  922. * a synchronize_rcu() to the hotplug path would be a simple
  923. * fix.
  924. */
  925. local_irq_save(flags); /* disable preempt till we know what lock. */
  926. rdp = RCU_DATA_ME();
  927. spin_lock(&rdp->lock);
  928. *rdp->nexttail = list;
  929. if (list)
  930. rdp->nexttail = tail;
  931. *rdp->nextschedtail = schedlist;
  932. if (schedlist)
  933. rdp->nextschedtail = schedtail;
  934. spin_unlock_irqrestore(&rdp->lock, flags);
  935. }
  936. #else /* #ifdef CONFIG_HOTPLUG_CPU */
  937. void rcu_offline_cpu(int cpu)
  938. {
  939. }
  940. #endif /* #else #ifdef CONFIG_HOTPLUG_CPU */
  941. void __cpuinit rcu_online_cpu(int cpu)
  942. {
  943. unsigned long flags;
  944. struct rcu_data *rdp;
  945. spin_lock_irqsave(&rcu_ctrlblk.fliplock, flags);
  946. cpumask_set_cpu(cpu, to_cpumask(rcu_cpu_online_map));
  947. spin_unlock_irqrestore(&rcu_ctrlblk.fliplock, flags);
  948. /*
  949. * The rcu_sched grace-period processing might have bypassed
  950. * this CPU, given that it was not in the rcu_cpu_online_map
  951. * when the grace-period scan started. This means that the
  952. * grace-period task might sleep. So make sure that if this
  953. * should happen, the first callback posted to this CPU will
  954. * wake up the grace-period task if need be.
  955. */
  956. rdp = RCU_DATA_CPU(cpu);
  957. spin_lock_irqsave(&rdp->lock, flags);
  958. rdp->rcu_sched_sleeping = 1;
  959. spin_unlock_irqrestore(&rdp->lock, flags);
  960. }
  961. static void rcu_process_callbacks(struct softirq_action *unused)
  962. {
  963. unsigned long flags;
  964. struct rcu_head *next, *list;
  965. struct rcu_data *rdp;
  966. local_irq_save(flags);
  967. rdp = RCU_DATA_ME();
  968. spin_lock(&rdp->lock);
  969. list = rdp->donelist;
  970. if (list == NULL) {
  971. spin_unlock_irqrestore(&rdp->lock, flags);
  972. return;
  973. }
  974. rdp->donelist = NULL;
  975. rdp->donetail = &rdp->donelist;
  976. RCU_TRACE_RDP(rcupreempt_trace_done_remove, rdp);
  977. spin_unlock_irqrestore(&rdp->lock, flags);
  978. while (list) {
  979. next = list->next;
  980. list->func(list);
  981. list = next;
  982. RCU_TRACE_ME(rcupreempt_trace_invoke);
  983. }
  984. }
  985. void call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu))
  986. {
  987. unsigned long flags;
  988. struct rcu_data *rdp;
  989. head->func = func;
  990. head->next = NULL;
  991. local_irq_save(flags);
  992. rdp = RCU_DATA_ME();
  993. spin_lock(&rdp->lock);
  994. __rcu_advance_callbacks(rdp);
  995. *rdp->nexttail = head;
  996. rdp->nexttail = &head->next;
  997. RCU_TRACE_RDP(rcupreempt_trace_next_add, rdp);
  998. spin_unlock_irqrestore(&rdp->lock, flags);
  999. }
  1000. EXPORT_SYMBOL_GPL(call_rcu);
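/*
 * Hedged updater sketch, reusing the hypothetical example_foo names
 * from the reader sketch above (kfree() would also need <linux/slab.h>):
 * a typical call_rcu() caller publishes a new version and queues the
 * old one; the callback runs from rcu_process_callbacks() once the
 * element has moved through waitlist[] to donelist.
 */
static void example_reclaim(struct rcu_head *head)
{
	struct example_foo *old = container_of(head, struct example_foo, rcu);

	kfree(old);
}

static void example_update(struct example_foo *newp)
{
	struct example_foo *oldp;

	oldp = example_foo_ptr;		/* assumes the caller serializes updaters */
	rcu_assign_pointer(example_foo_ptr, newp);
	if (oldp != NULL)
		call_rcu(&oldp->rcu, example_reclaim);
}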
  1001. void call_rcu_sched(struct rcu_head *head, void (*func)(struct rcu_head *rcu))
  1002. {
  1003. unsigned long flags;
  1004. struct rcu_data *rdp;
  1005. int wake_gp = 0;
  1006. head->func = func;
  1007. head->next = NULL;
  1008. local_irq_save(flags);
  1009. rdp = RCU_DATA_ME();
  1010. spin_lock(&rdp->lock);
  1011. *rdp->nextschedtail = head;
  1012. rdp->nextschedtail = &head->next;
  1013. if (rdp->rcu_sched_sleeping) {
  1014. /* Grace-period processing might be sleeping... */
  1015. rdp->rcu_sched_sleeping = 0;
  1016. wake_gp = 1;
  1017. }
  1018. spin_unlock_irqrestore(&rdp->lock, flags);
  1019. if (wake_gp) {
  1020. /* Wake up grace-period processing, unless someone beat us. */
  1021. spin_lock_irqsave(&rcu_ctrlblk.schedlock, flags);
  1022. if (rcu_ctrlblk.sched_sleep != rcu_sched_sleeping)
  1023. wake_gp = 0;
  1024. rcu_ctrlblk.sched_sleep = rcu_sched_not_sleeping;
  1025. spin_unlock_irqrestore(&rcu_ctrlblk.schedlock, flags);
  1026. if (wake_gp)
  1027. wake_up_interruptible(&rcu_ctrlblk.sched_wq);
  1028. }
  1029. }
  1030. EXPORT_SYMBOL_GPL(call_rcu_sched);
  1031. /*
  1032. * Wait until all currently running preempt_disable() code segments
  1033. * (including hardware-irq-disable segments) complete. Note that
  1034. * in -rt this does -not- necessarily result in all currently executing
  1035. * interrupt -handlers- having completed.
  1036. */
  1037. void __synchronize_sched(void)
  1038. {
  1039. struct rcu_synchronize rcu;
  1040. if (num_online_cpus() == 1)
  1041. return; /* blocking is gp if only one CPU! */
  1042. init_completion(&rcu.completion);
  1043. /* Will wake me after RCU finished. */
  1044. call_rcu_sched(&rcu.head, wakeme_after_rcu);
  1045. /* Wait for it. */
  1046. wait_for_completion(&rcu.completion);
  1047. }
  1048. EXPORT_SYMBOL_GPL(__synchronize_sched);
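/*
 * Hedged usage sketch (hypothetical, again assuming <linux/slab.h> for
 * kfree()): code that must wait out preempt_disable()/irq-disable
 * regions, rather than rcu_read_lock() sections, can simply block on
 * the sched flavor before reclaiming.
 */
static void example_retire_sched_protected(struct example_foo *oldp)
{
	__synchronize_sched();	/* all prior preempt-disabled regions are done */
	kfree(oldp);
}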
  1049. /*
  1050. * kthread function that manages call_rcu_sched grace periods.
  1051. */
  1052. static int rcu_sched_grace_period(void *arg)
  1053. {
  1054. int couldsleep; /* might sleep after current pass. */
  1055. int couldsleepnext = 0; /* might sleep after next pass. */
  1056. int cpu;
  1057. unsigned long flags;
  1058. struct rcu_data *rdp;
  1059. int ret;
  1060. /*
  1061. * Each pass through the following loop handles one
  1062. * rcu_sched grace period cycle.
  1063. */
  1064. do {
  1065. /* Save each CPU's current state. */
  1066. for_each_online_cpu(cpu) {
  1067. dyntick_save_progress_counter_sched(cpu);
  1068. save_qsctr_sched(cpu);
  1069. }
  1070. /*
  1071. * Sleep for about an RCU grace-period's worth to
  1072. * allow better batching and to consume less CPU.
  1073. */
  1074. schedule_timeout_interruptible(RCU_SCHED_BATCH_TIME);
  1075. /*
  1076. * If there was nothing to do last time, prepare to
  1077. * sleep at the end of the current grace period cycle.
  1078. */
  1079. couldsleep = couldsleepnext;
  1080. couldsleepnext = 1;
  1081. if (couldsleep) {
  1082. spin_lock_irqsave(&rcu_ctrlblk.schedlock, flags);
  1083. rcu_ctrlblk.sched_sleep = rcu_sched_sleep_prep;
  1084. spin_unlock_irqrestore(&rcu_ctrlblk.schedlock, flags);
  1085. }
  1086. /*
  1087. * Wait on each CPU in turn to have either visited
  1088. * a quiescent state or been in dynticks-idle mode.
  1089. */
  1090. for_each_online_cpu(cpu) {
  1091. while (rcu_qsctr_inc_needed(cpu) &&
  1092. rcu_qsctr_inc_needed_dyntick(cpu)) {
  1093. /* resched_cpu(cpu); @@@ */
  1094. schedule_timeout_interruptible(1);
  1095. }
  1096. }
  1097. /* Advance callbacks for each CPU. */
  1098. for_each_online_cpu(cpu) {
  1099. rdp = RCU_DATA_CPU(cpu);
  1100. spin_lock_irqsave(&rdp->lock, flags);
  1101. /*
  1102. * We are running on this CPU irq-disabled, so no
  1103. * CPU can go offline until we re-enable irqs.
  1104. * The current CPU might have already gone
  1105. * offline (between the for_each_online_cpu and
  1106. * the spin_lock_irqsave), but in that case all its
  1107. * callback lists will be empty, so no harm done.
  1108. *
  1109. * Advance the callbacks! We share normal RCU's
  1110. * donelist, since callbacks are invoked the
  1111. * same way in either case.
  1112. */
  1113. if (rdp->waitschedlist != NULL) {
  1114. *rdp->donetail = rdp->waitschedlist;
  1115. rdp->donetail = rdp->waitschedtail;
  1116. /*
  1117. * Next rcu_check_callbacks() will
  1118. * do the required raise_softirq().
  1119. */
  1120. }
  1121. if (rdp->nextschedlist != NULL) {
  1122. rdp->waitschedlist = rdp->nextschedlist;
  1123. rdp->waitschedtail = rdp->nextschedtail;
  1124. couldsleep = 0;
  1125. couldsleepnext = 0;
  1126. } else {
  1127. rdp->waitschedlist = NULL;
  1128. rdp->waitschedtail = &rdp->waitschedlist;
  1129. }
  1130. rdp->nextschedlist = NULL;
  1131. rdp->nextschedtail = &rdp->nextschedlist;
  1132. /* Mark sleep intention. */
  1133. rdp->rcu_sched_sleeping = couldsleep;
  1134. spin_unlock_irqrestore(&rdp->lock, flags);
  1135. }
  1136. /* If we saw callbacks on the last scan, go deal with them. */
  1137. if (!couldsleep)
  1138. continue;
  1139. /* Attempt to block... */
  1140. spin_lock_irqsave(&rcu_ctrlblk.schedlock, flags);
  1141. if (rcu_ctrlblk.sched_sleep != rcu_sched_sleep_prep) {
  1142. /*
  1143. * Someone posted a callback after we scanned.
  1144. * Go take care of it.
  1145. */
  1146. spin_unlock_irqrestore(&rcu_ctrlblk.schedlock, flags);
  1147. couldsleepnext = 0;
  1148. continue;
  1149. }
  1150. /* Block until the next person posts a callback. */
  1151. rcu_ctrlblk.sched_sleep = rcu_sched_sleeping;
  1152. spin_unlock_irqrestore(&rcu_ctrlblk.schedlock, flags);
  1153. ret = 0; /* unused */
  1154. __wait_event_interruptible(rcu_ctrlblk.sched_wq,
  1155. rcu_ctrlblk.sched_sleep != rcu_sched_sleeping,
  1156. ret);
  1157. couldsleepnext = 0;
  1158. } while (!kthread_should_stop());
  1159. return (0);
  1160. }
  1161. /*
  1162. * Check to see if any future RCU-related work will need to be done
  1163. * by the current CPU, even if none need be done immediately, returning
  1164. * 1 if so. Assumes that notifiers would take care of handling any
  1165. * outstanding requests from the RCU core.
  1166. *
  1167. * This function is part of the RCU implementation; it is -not-
  1168. * an exported member of the RCU API.
  1169. */
  1170. int rcu_needs_cpu(int cpu)
  1171. {
  1172. struct rcu_data *rdp = RCU_DATA_CPU(cpu);
  1173. return (rdp->donelist != NULL ||
  1174. !!rdp->waitlistcount ||
  1175. rdp->nextlist != NULL ||
  1176. rdp->nextschedlist != NULL ||
  1177. rdp->waitschedlist != NULL);
  1178. }
  1179. static int rcu_pending(int cpu)
  1180. {
  1181. struct rcu_data *rdp = RCU_DATA_CPU(cpu);
  1182. /* The CPU has at least one callback queued somewhere. */
  1183. if (rdp->donelist != NULL ||
  1184. !!rdp->waitlistcount ||
  1185. rdp->nextlist != NULL ||
  1186. rdp->nextschedlist != NULL ||
  1187. rdp->waitschedlist != NULL)
  1188. return 1;
  1189. /* The RCU core needs an acknowledgement from this CPU. */
  1190. if ((per_cpu(rcu_flip_flag, cpu) == rcu_flipped) ||
  1191. (per_cpu(rcu_mb_flag, cpu) == rcu_mb_needed))
  1192. return 1;
  1193. /* This CPU has fallen behind the global grace-period number. */
  1194. if (rdp->completed != rcu_ctrlblk.completed)
  1195. return 1;
  1196. /* Nothing needed from this CPU. */
  1197. return 0;
  1198. }
  1199. int __cpuinit rcu_cpu_notify(struct notifier_block *self,
  1200. unsigned long action, void *hcpu)
  1201. {
  1202. long cpu = (long)hcpu;
  1203. switch (action) {
  1204. case CPU_UP_PREPARE:
  1205. case CPU_UP_PREPARE_FROZEN:
  1206. rcu_online_cpu(cpu);
  1207. break;
  1208. case CPU_UP_CANCELED:
  1209. case CPU_UP_CANCELED_FROZEN:
  1210. case CPU_DEAD:
  1211. case CPU_DEAD_FROZEN:
  1212. rcu_offline_cpu(cpu);
  1213. break;
  1214. default:
  1215. break;
  1216. }
  1217. return NOTIFY_OK;
  1218. }
  1219. void __init __rcu_init(void)
  1220. {
  1221. int cpu;
  1222. int i;
  1223. struct rcu_data *rdp;
  1224. printk(KERN_NOTICE "Preemptible RCU implementation.\n");
  1225. for_each_possible_cpu(cpu) {
  1226. rdp = RCU_DATA_CPU(cpu);
  1227. spin_lock_init(&rdp->lock);
  1228. rdp->completed = 0;
  1229. rdp->waitlistcount = 0;
  1230. rdp->nextlist = NULL;
  1231. rdp->nexttail = &rdp->nextlist;
  1232. for (i = 0; i < GP_STAGES; i++) {
  1233. rdp->waitlist[i] = NULL;
  1234. rdp->waittail[i] = &rdp->waitlist[i];
  1235. }
  1236. rdp->donelist = NULL;
  1237. rdp->donetail = &rdp->donelist;
  1238. rdp->rcu_flipctr[0] = 0;
  1239. rdp->rcu_flipctr[1] = 0;
  1240. rdp->nextschedlist = NULL;
  1241. rdp->nextschedtail = &rdp->nextschedlist;
  1242. rdp->waitschedlist = NULL;
  1243. rdp->waitschedtail = &rdp->waitschedlist;
  1244. rdp->rcu_sched_sleeping = 0;
  1245. }
  1246. open_softirq(RCU_SOFTIRQ, rcu_process_callbacks);
  1247. }
  1248. /*
  1249. * Late-boot-time RCU initialization that must wait until after the scheduler
  1250. * has been initialized.
  1251. */
  1252. void __init rcu_init_sched(void)
  1253. {
  1254. rcu_sched_grace_period_task = kthread_run(rcu_sched_grace_period,
  1255. NULL,
  1256. "rcu_sched_grace_period");
  1257. WARN_ON(IS_ERR(rcu_sched_grace_period_task));
  1258. }
  1259. #ifdef CONFIG_RCU_TRACE
  1260. long *rcupreempt_flipctr(int cpu)
  1261. {
  1262. return &RCU_DATA_CPU(cpu)->rcu_flipctr[0];
  1263. }
  1264. EXPORT_SYMBOL_GPL(rcupreempt_flipctr);
  1265. int rcupreempt_flip_flag(int cpu)
  1266. {
  1267. return per_cpu(rcu_flip_flag, cpu);
  1268. }
  1269. EXPORT_SYMBOL_GPL(rcupreempt_flip_flag);
  1270. int rcupreempt_mb_flag(int cpu)
  1271. {
  1272. return per_cpu(rcu_mb_flag, cpu);
  1273. }
  1274. EXPORT_SYMBOL_GPL(rcupreempt_mb_flag);
  1275. char *rcupreempt_try_flip_state_name(void)
  1276. {
  1277. return rcu_try_flip_state_names[rcu_ctrlblk.rcu_try_flip_state];
  1278. }
  1279. EXPORT_SYMBOL_GPL(rcupreempt_try_flip_state_name);
  1280. struct rcupreempt_trace *rcupreempt_trace_cpu(int cpu)
  1281. {
  1282. struct rcu_data *rdp = RCU_DATA_CPU(cpu);
  1283. return &rdp->trace;
  1284. }
  1285. EXPORT_SYMBOL_GPL(rcupreempt_trace_cpu);
  1286. #endif /* #ifdef CONFIG_RCU_TRACE */