rcupreempt.c 32 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270370470570670770870971071171271371471571671771871972072172272372472572672772872973073173273373473573673773873974074174274374474574674774874975075175275375475575675775875976076176276376476576676776876977077177277377477577677777877978078178278378478578678778878979079179279379479579679779879980080180280380480580680780880981081181281381481581681781881982082182282382482582682782882983083183283383483583683783883984084184284384484584684784884985085185285385485585685785885986086186286386486586686786886987087187287387487587687787887988088188288388488588688788888989089189289389489589689789889990090190290390490590690790890991091191291391491591691791891992092192292392492592692792892993093193293393493593693793893994094194294394494594694794894995095195295395495595695795895996096196296396496596696796896997097197297397497597697797897998098198298398498598698798898999099199299399499599699799899910001001100210031004100510061007100810091010101110121013101410151016101710181019102010211022102310241025102610271028102910301031103210331034103510361037103810391040104110421043104410451046104710481049105010511052105310541055105610571058105910601061106210631064106510661067106810691070107110721073107410751076107710781079108010811082108310841085108610871088108910901091109210931094109510961097109810991100110111021103110411051106110711081109111011111112111311141115111611171118111911201121112211231124112511261127112811291130113111321133113411351136113711381139114011411142114311441145114611471148114911501151115211531154115511561157115811591160116111621163116411651166
  1. /*
  2. * Read-Copy Update mechanism for mutual exclusion, realtime implementation
  3. *
  4. * This program is free software; you can redistribute it and/or modify
  5. * it under the terms of the GNU General Public License as published by
  6. * the Free Software Foundation; either version 2 of the License, or
  7. * (at your option) any later version.
  8. *
  9. * This program is distributed in the hope that it will be useful,
  10. * but WITHOUT ANY WARRANTY; without even the implied warranty of
  11. * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
  12. * GNU General Public License for more details.
  13. *
  14. * You should have received a copy of the GNU General Public License
  15. * along with this program; if not, write to the Free Software
  16. * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
  17. *
  18. * Copyright IBM Corporation, 2006
  19. *
  20. * Authors: Paul E. McKenney <paulmck@us.ibm.com>
  21. * With thanks to Esben Nielsen, Bill Huey, and Ingo Molnar
  22. * for pushing me away from locks and towards counters, and
  23. * to Suparna Bhattacharya for pushing me completely away
  24. * from atomic instructions on the read side.
  25. *
  26. * - Added handling of Dynamic Ticks
  27. * Copyright 2007 - Paul E. Mckenney <paulmck@us.ibm.com>
  28. * - Steven Rostedt <srostedt@redhat.com>
  29. *
  30. * Papers: http://www.rdrop.com/users/paulmck/RCU
  31. *
  32. * Design Document: http://lwn.net/Articles/253651/
  33. *
  34. * For detailed explanation of Read-Copy Update mechanism see -
  35. * Documentation/RCU/ *.txt
  36. *
  37. */
  38. #include <linux/types.h>
  39. #include <linux/kernel.h>
  40. #include <linux/init.h>
  41. #include <linux/spinlock.h>
  42. #include <linux/smp.h>
  43. #include <linux/rcupdate.h>
  44. #include <linux/interrupt.h>
  45. #include <linux/sched.h>
  46. #include <asm/atomic.h>
  47. #include <linux/bitops.h>
  48. #include <linux/module.h>
  49. #include <linux/completion.h>
  50. #include <linux/moduleparam.h>
  51. #include <linux/percpu.h>
  52. #include <linux/notifier.h>
  53. #include <linux/rcupdate.h>
  54. #include <linux/cpu.h>
  55. #include <linux/random.h>
  56. #include <linux/delay.h>
  57. #include <linux/byteorder/swabb.h>
  58. #include <linux/cpumask.h>
  59. #include <linux/rcupreempt_trace.h>
  60. /*
  61. * Macro that prevents the compiler from reordering accesses, but does
  62. * absolutely -nothing- to prevent CPUs from reordering. This is used
  63. * only to mediate communication between mainline code and hardware
  64. * interrupt and NMI handlers.
  65. */
  66. #define ACCESS_ONCE(x) (*(volatile typeof(x) *)&(x))
  67. /*
  68. * PREEMPT_RCU data structures.
  69. */
  70. /*
  71. * GP_STAGES specifies the number of times the state machine has
  72. * to go through the all the rcu_try_flip_states (see below)
  73. * in a single Grace Period.
  74. *
  75. * GP in GP_STAGES stands for Grace Period ;)
  76. */
  77. #define GP_STAGES 2
  78. struct rcu_data {
  79. spinlock_t lock; /* Protect rcu_data fields. */
  80. long completed; /* Number of last completed batch. */
  81. int waitlistcount;
  82. struct tasklet_struct rcu_tasklet;
  83. struct rcu_head *nextlist;
  84. struct rcu_head **nexttail;
  85. struct rcu_head *waitlist[GP_STAGES];
  86. struct rcu_head **waittail[GP_STAGES];
  87. struct rcu_head *donelist;
  88. struct rcu_head **donetail;
  89. long rcu_flipctr[2];
  90. #ifdef CONFIG_RCU_TRACE
  91. struct rcupreempt_trace trace;
  92. #endif /* #ifdef CONFIG_RCU_TRACE */
  93. };
  94. /*
  95. * States for rcu_try_flip() and friends.
  96. */
  97. enum rcu_try_flip_states {
  98. /*
  99. * Stay here if nothing is happening. Flip the counter if somthing
  100. * starts happening. Denoted by "I"
  101. */
  102. rcu_try_flip_idle_state,
  103. /*
  104. * Wait here for all CPUs to notice that the counter has flipped. This
  105. * prevents the old set of counters from ever being incremented once
  106. * we leave this state, which in turn is necessary because we cannot
  107. * test any individual counter for zero -- we can only check the sum.
  108. * Denoted by "A".
  109. */
  110. rcu_try_flip_waitack_state,
  111. /*
  112. * Wait here for the sum of the old per-CPU counters to reach zero.
  113. * Denoted by "Z".
  114. */
  115. rcu_try_flip_waitzero_state,
  116. /*
  117. * Wait here for each of the other CPUs to execute a memory barrier.
  118. * This is necessary to ensure that these other CPUs really have
  119. * completed executing their RCU read-side critical sections, despite
  120. * their CPUs wildly reordering memory. Denoted by "M".
  121. */
  122. rcu_try_flip_waitmb_state,
  123. };
  124. struct rcu_ctrlblk {
  125. spinlock_t fliplock; /* Protect state-machine transitions. */
  126. long completed; /* Number of last completed batch. */
  127. enum rcu_try_flip_states rcu_try_flip_state; /* The current state of
  128. the rcu state machine */
  129. };
  130. static DEFINE_PER_CPU(struct rcu_data, rcu_data);
  131. static struct rcu_ctrlblk rcu_ctrlblk = {
  132. .fliplock = __SPIN_LOCK_UNLOCKED(rcu_ctrlblk.fliplock),
  133. .completed = 0,
  134. .rcu_try_flip_state = rcu_try_flip_idle_state,
  135. };
  136. #ifdef CONFIG_RCU_TRACE
  137. static char *rcu_try_flip_state_names[] =
  138. { "idle", "waitack", "waitzero", "waitmb" };
  139. #endif /* #ifdef CONFIG_RCU_TRACE */
  140. static cpumask_t rcu_cpu_online_map __read_mostly = CPU_MASK_NONE;
  141. /*
  142. * Enum and per-CPU flag to determine when each CPU has seen
  143. * the most recent counter flip.
  144. */
  145. enum rcu_flip_flag_values {
  146. rcu_flip_seen, /* Steady/initial state, last flip seen. */
  147. /* Only GP detector can update. */
  148. rcu_flipped /* Flip just completed, need confirmation. */
  149. /* Only corresponding CPU can update. */
  150. };
  151. static DEFINE_PER_CPU_SHARED_ALIGNED(enum rcu_flip_flag_values, rcu_flip_flag)
  152. = rcu_flip_seen;
  153. /*
  154. * Enum and per-CPU flag to determine when each CPU has executed the
  155. * needed memory barrier to fence in memory references from its last RCU
  156. * read-side critical section in the just-completed grace period.
  157. */
  158. enum rcu_mb_flag_values {
  159. rcu_mb_done, /* Steady/initial state, no mb()s required. */
  160. /* Only GP detector can update. */
  161. rcu_mb_needed /* Flip just completed, need an mb(). */
  162. /* Only corresponding CPU can update. */
  163. };
  164. static DEFINE_PER_CPU_SHARED_ALIGNED(enum rcu_mb_flag_values, rcu_mb_flag)
  165. = rcu_mb_done;
  166. /*
  167. * RCU_DATA_ME: find the current CPU's rcu_data structure.
  168. * RCU_DATA_CPU: find the specified CPU's rcu_data structure.
  169. */
  170. #define RCU_DATA_ME() (&__get_cpu_var(rcu_data))
  171. #define RCU_DATA_CPU(cpu) (&per_cpu(rcu_data, cpu))
  172. /*
  173. * Helper macro for tracing when the appropriate rcu_data is not
  174. * cached in a local variable, but where the CPU number is so cached.
  175. */
  176. #define RCU_TRACE_CPU(f, cpu) RCU_TRACE(f, &(RCU_DATA_CPU(cpu)->trace));
  177. /*
  178. * Helper macro for tracing when the appropriate rcu_data is not
  179. * cached in a local variable.
  180. */
  181. #define RCU_TRACE_ME(f) RCU_TRACE(f, &(RCU_DATA_ME()->trace));
  182. /*
  183. * Helper macro for tracing when the appropriate rcu_data is pointed
  184. * to by a local variable.
  185. */
  186. #define RCU_TRACE_RDP(f, rdp) RCU_TRACE(f, &((rdp)->trace));
  187. /*
  188. * Return the number of RCU batches processed thus far. Useful
  189. * for debug and statistics.
  190. */
  191. long rcu_batches_completed(void)
  192. {
  193. return rcu_ctrlblk.completed;
  194. }
  195. EXPORT_SYMBOL_GPL(rcu_batches_completed);
  196. void __rcu_read_lock(void)
  197. {
  198. int idx;
  199. struct task_struct *t = current;
  200. int nesting;
  201. nesting = ACCESS_ONCE(t->rcu_read_lock_nesting);
  202. if (nesting != 0) {
  203. /* An earlier rcu_read_lock() covers us, just count it. */
  204. t->rcu_read_lock_nesting = nesting + 1;
  205. } else {
  206. unsigned long flags;
  207. /*
  208. * We disable interrupts for the following reasons:
  209. * - If we get scheduling clock interrupt here, and we
  210. * end up acking the counter flip, it's like a promise
  211. * that we will never increment the old counter again.
  212. * Thus we will break that promise if that
  213. * scheduling clock interrupt happens between the time
  214. * we pick the .completed field and the time that we
  215. * increment our counter.
  216. *
  217. * - We don't want to be preempted out here.
  218. *
  219. * NMIs can still occur, of course, and might themselves
  220. * contain rcu_read_lock().
  221. */
  222. local_irq_save(flags);
  223. /*
  224. * Outermost nesting of rcu_read_lock(), so increment
  225. * the current counter for the current CPU. Use volatile
  226. * casts to prevent the compiler from reordering.
  227. */
  228. idx = ACCESS_ONCE(rcu_ctrlblk.completed) & 0x1;
  229. ACCESS_ONCE(RCU_DATA_ME()->rcu_flipctr[idx])++;
  230. /*
  231. * Now that the per-CPU counter has been incremented, we
  232. * are protected from races with rcu_read_lock() invoked
  233. * from NMI handlers on this CPU. We can therefore safely
  234. * increment the nesting counter, relieving further NMIs
  235. * of the need to increment the per-CPU counter.
  236. */
  237. ACCESS_ONCE(t->rcu_read_lock_nesting) = nesting + 1;
  238. /*
  239. * Now that we have preventing any NMIs from storing
  240. * to the ->rcu_flipctr_idx, we can safely use it to
  241. * remember which counter to decrement in the matching
  242. * rcu_read_unlock().
  243. */
  244. ACCESS_ONCE(t->rcu_flipctr_idx) = idx;
  245. local_irq_restore(flags);
  246. }
  247. }
  248. EXPORT_SYMBOL_GPL(__rcu_read_lock);
  249. void __rcu_read_unlock(void)
  250. {
  251. int idx;
  252. struct task_struct *t = current;
  253. int nesting;
  254. nesting = ACCESS_ONCE(t->rcu_read_lock_nesting);
  255. if (nesting > 1) {
  256. /*
  257. * We are still protected by the enclosing rcu_read_lock(),
  258. * so simply decrement the counter.
  259. */
  260. t->rcu_read_lock_nesting = nesting - 1;
  261. } else {
  262. unsigned long flags;
  263. /*
  264. * Disable local interrupts to prevent the grace-period
  265. * detection state machine from seeing us half-done.
  266. * NMIs can still occur, of course, and might themselves
  267. * contain rcu_read_lock() and rcu_read_unlock().
  268. */
  269. local_irq_save(flags);
  270. /*
  271. * Outermost nesting of rcu_read_unlock(), so we must
  272. * decrement the current counter for the current CPU.
  273. * This must be done carefully, because NMIs can
  274. * occur at any point in this code, and any rcu_read_lock()
  275. * and rcu_read_unlock() pairs in the NMI handlers
  276. * must interact non-destructively with this code.
  277. * Lots of volatile casts, and -very- careful ordering.
  278. *
  279. * Changes to this code, including this one, must be
  280. * inspected, validated, and tested extremely carefully!!!
  281. */
  282. /*
  283. * First, pick up the index.
  284. */
  285. idx = ACCESS_ONCE(t->rcu_flipctr_idx);
  286. /*
  287. * Now that we have fetched the counter index, it is
  288. * safe to decrement the per-task RCU nesting counter.
  289. * After this, any interrupts or NMIs will increment and
  290. * decrement the per-CPU counters.
  291. */
  292. ACCESS_ONCE(t->rcu_read_lock_nesting) = nesting - 1;
  293. /*
  294. * It is now safe to decrement this task's nesting count.
  295. * NMIs that occur after this statement will route their
  296. * rcu_read_lock() calls through this "else" clause, and
  297. * will thus start incrementing the per-CPU counter on
  298. * their own. They will also clobber ->rcu_flipctr_idx,
  299. * but that is OK, since we have already fetched it.
  300. */
  301. ACCESS_ONCE(RCU_DATA_ME()->rcu_flipctr[idx])--;
  302. local_irq_restore(flags);
  303. }
  304. }
  305. EXPORT_SYMBOL_GPL(__rcu_read_unlock);
  306. /*
  307. * If a global counter flip has occurred since the last time that we
  308. * advanced callbacks, advance them. Hardware interrupts must be
  309. * disabled when calling this function.
  310. */
  311. static void __rcu_advance_callbacks(struct rcu_data *rdp)
  312. {
  313. int cpu;
  314. int i;
  315. int wlc = 0;
  316. if (rdp->completed != rcu_ctrlblk.completed) {
  317. if (rdp->waitlist[GP_STAGES - 1] != NULL) {
  318. *rdp->donetail = rdp->waitlist[GP_STAGES - 1];
  319. rdp->donetail = rdp->waittail[GP_STAGES - 1];
  320. RCU_TRACE_RDP(rcupreempt_trace_move2done, rdp);
  321. }
  322. for (i = GP_STAGES - 2; i >= 0; i--) {
  323. if (rdp->waitlist[i] != NULL) {
  324. rdp->waitlist[i + 1] = rdp->waitlist[i];
  325. rdp->waittail[i + 1] = rdp->waittail[i];
  326. wlc++;
  327. } else {
  328. rdp->waitlist[i + 1] = NULL;
  329. rdp->waittail[i + 1] =
  330. &rdp->waitlist[i + 1];
  331. }
  332. }
  333. if (rdp->nextlist != NULL) {
  334. rdp->waitlist[0] = rdp->nextlist;
  335. rdp->waittail[0] = rdp->nexttail;
  336. wlc++;
  337. rdp->nextlist = NULL;
  338. rdp->nexttail = &rdp->nextlist;
  339. RCU_TRACE_RDP(rcupreempt_trace_move2wait, rdp);
  340. } else {
  341. rdp->waitlist[0] = NULL;
  342. rdp->waittail[0] = &rdp->waitlist[0];
  343. }
  344. rdp->waitlistcount = wlc;
  345. rdp->completed = rcu_ctrlblk.completed;
  346. }
  347. /*
  348. * Check to see if this CPU needs to report that it has seen
  349. * the most recent counter flip, thereby declaring that all
  350. * subsequent rcu_read_lock() invocations will respect this flip.
  351. */
  352. cpu = raw_smp_processor_id();
  353. if (per_cpu(rcu_flip_flag, cpu) == rcu_flipped) {
  354. smp_mb(); /* Subsequent counter accesses must see new value */
  355. per_cpu(rcu_flip_flag, cpu) = rcu_flip_seen;
  356. smp_mb(); /* Subsequent RCU read-side critical sections */
  357. /* seen -after- acknowledgement. */
  358. }
  359. }
  360. #ifdef CONFIG_NO_HZ
  361. DEFINE_PER_CPU(long, dynticks_progress_counter) = 1;
  362. static DEFINE_PER_CPU(long, rcu_dyntick_snapshot);
  363. static DEFINE_PER_CPU(int, rcu_update_flag);
  364. /**
  365. * rcu_irq_enter - Called from Hard irq handlers and NMI/SMI.
  366. *
  367. * If the CPU was idle with dynamic ticks active, this updates the
  368. * dynticks_progress_counter to let the RCU handling know that the
  369. * CPU is active.
  370. */
  371. void rcu_irq_enter(void)
  372. {
  373. int cpu = smp_processor_id();
  374. if (per_cpu(rcu_update_flag, cpu))
  375. per_cpu(rcu_update_flag, cpu)++;
  376. /*
  377. * Only update if we are coming from a stopped ticks mode
  378. * (dynticks_progress_counter is even).
  379. */
  380. if (!in_interrupt() &&
  381. (per_cpu(dynticks_progress_counter, cpu) & 0x1) == 0) {
  382. /*
  383. * The following might seem like we could have a race
  384. * with NMI/SMIs. But this really isn't a problem.
  385. * Here we do a read/modify/write, and the race happens
  386. * when an NMI/SMI comes in after the read and before
  387. * the write. But NMI/SMIs will increment this counter
  388. * twice before returning, so the zero bit will not
  389. * be corrupted by the NMI/SMI which is the most important
  390. * part.
  391. *
  392. * The only thing is that we would bring back the counter
  393. * to a postion that it was in during the NMI/SMI.
  394. * But the zero bit would be set, so the rest of the
  395. * counter would again be ignored.
  396. *
  397. * On return from the IRQ, the counter may have the zero
  398. * bit be 0 and the counter the same as the return from
  399. * the NMI/SMI. If the state machine was so unlucky to
  400. * see that, it still doesn't matter, since all
  401. * RCU read-side critical sections on this CPU would
  402. * have already completed.
  403. */
  404. per_cpu(dynticks_progress_counter, cpu)++;
  405. /*
  406. * The following memory barrier ensures that any
  407. * rcu_read_lock() primitives in the irq handler
  408. * are seen by other CPUs to follow the above
  409. * increment to dynticks_progress_counter. This is
  410. * required in order for other CPUs to correctly
  411. * determine when it is safe to advance the RCU
  412. * grace-period state machine.
  413. */
  414. smp_mb(); /* see above block comment. */
  415. /*
  416. * Since we can't determine the dynamic tick mode from
  417. * the dynticks_progress_counter after this routine,
  418. * we use a second flag to acknowledge that we came
  419. * from an idle state with ticks stopped.
  420. */
  421. per_cpu(rcu_update_flag, cpu)++;
  422. /*
  423. * If we take an NMI/SMI now, they will also increment
  424. * the rcu_update_flag, and will not update the
  425. * dynticks_progress_counter on exit. That is for
  426. * this IRQ to do.
  427. */
  428. }
  429. }
  430. /**
  431. * rcu_irq_exit - Called from exiting Hard irq context.
  432. *
  433. * If the CPU was idle with dynamic ticks active, update the
  434. * dynticks_progress_counter to put let the RCU handling be
  435. * aware that the CPU is going back to idle with no ticks.
  436. */
  437. void rcu_irq_exit(void)
  438. {
  439. int cpu = smp_processor_id();
  440. /*
  441. * rcu_update_flag is set if we interrupted the CPU
  442. * when it was idle with ticks stopped.
  443. * Once this occurs, we keep track of interrupt nesting
  444. * because a NMI/SMI could also come in, and we still
  445. * only want the IRQ that started the increment of the
  446. * dynticks_progress_counter to be the one that modifies
  447. * it on exit.
  448. */
  449. if (per_cpu(rcu_update_flag, cpu)) {
  450. if (--per_cpu(rcu_update_flag, cpu))
  451. return;
  452. /* This must match the interrupt nesting */
  453. WARN_ON(in_interrupt());
  454. /*
  455. * If an NMI/SMI happens now we are still
  456. * protected by the dynticks_progress_counter being odd.
  457. */
  458. /*
  459. * The following memory barrier ensures that any
  460. * rcu_read_unlock() primitives in the irq handler
  461. * are seen by other CPUs to preceed the following
  462. * increment to dynticks_progress_counter. This
  463. * is required in order for other CPUs to determine
  464. * when it is safe to advance the RCU grace-period
  465. * state machine.
  466. */
  467. smp_mb(); /* see above block comment. */
  468. per_cpu(dynticks_progress_counter, cpu)++;
  469. WARN_ON(per_cpu(dynticks_progress_counter, cpu) & 0x1);
  470. }
  471. }
  472. static void dyntick_save_progress_counter(int cpu)
  473. {
  474. per_cpu(rcu_dyntick_snapshot, cpu) =
  475. per_cpu(dynticks_progress_counter, cpu);
  476. }
  477. static inline int
  478. rcu_try_flip_waitack_needed(int cpu)
  479. {
  480. long curr;
  481. long snap;
  482. curr = per_cpu(dynticks_progress_counter, cpu);
  483. snap = per_cpu(rcu_dyntick_snapshot, cpu);
  484. smp_mb(); /* force ordering with cpu entering/leaving dynticks. */
  485. /*
  486. * If the CPU remained in dynticks mode for the entire time
  487. * and didn't take any interrupts, NMIs, SMIs, or whatever,
  488. * then it cannot be in the middle of an rcu_read_lock(), so
  489. * the next rcu_read_lock() it executes must use the new value
  490. * of the counter. So we can safely pretend that this CPU
  491. * already acknowledged the counter.
  492. */
  493. if ((curr == snap) && ((curr & 0x1) == 0))
  494. return 0;
  495. /*
  496. * If the CPU passed through or entered a dynticks idle phase with
  497. * no active irq handlers, then, as above, we can safely pretend
  498. * that this CPU already acknowledged the counter.
  499. */
  500. if ((curr - snap) > 2 || (snap & 0x1) == 0)
  501. return 0;
  502. /* We need this CPU to explicitly acknowledge the counter flip. */
  503. return 1;
  504. }
  505. static inline int
  506. rcu_try_flip_waitmb_needed(int cpu)
  507. {
  508. long curr;
  509. long snap;
  510. curr = per_cpu(dynticks_progress_counter, cpu);
  511. snap = per_cpu(rcu_dyntick_snapshot, cpu);
  512. smp_mb(); /* force ordering with cpu entering/leaving dynticks. */
  513. /*
  514. * If the CPU remained in dynticks mode for the entire time
  515. * and didn't take any interrupts, NMIs, SMIs, or whatever,
  516. * then it cannot have executed an RCU read-side critical section
  517. * during that time, so there is no need for it to execute a
  518. * memory barrier.
  519. */
  520. if ((curr == snap) && ((curr & 0x1) == 0))
  521. return 0;
  522. /*
  523. * If the CPU either entered or exited an outermost interrupt,
  524. * SMI, NMI, or whatever handler, then we know that it executed
  525. * a memory barrier when doing so. So we don't need another one.
  526. */
  527. if (curr != snap)
  528. return 0;
  529. /* We need the CPU to execute a memory barrier. */
  530. return 1;
  531. }
  532. #else /* !CONFIG_NO_HZ */
  533. # define dyntick_save_progress_counter(cpu) do { } while (0)
  534. # define rcu_try_flip_waitack_needed(cpu) (1)
  535. # define rcu_try_flip_waitmb_needed(cpu) (1)
  536. #endif /* CONFIG_NO_HZ */
  537. /*
  538. * Get here when RCU is idle. Decide whether we need to
  539. * move out of idle state, and return non-zero if so.
  540. * "Straightforward" approach for the moment, might later
  541. * use callback-list lengths, grace-period duration, or
  542. * some such to determine when to exit idle state.
  543. * Might also need a pre-idle test that does not acquire
  544. * the lock, but let's get the simple case working first...
  545. */
  546. static int
  547. rcu_try_flip_idle(void)
  548. {
  549. int cpu;
  550. RCU_TRACE_ME(rcupreempt_trace_try_flip_i1);
  551. if (!rcu_pending(smp_processor_id())) {
  552. RCU_TRACE_ME(rcupreempt_trace_try_flip_ie1);
  553. return 0;
  554. }
  555. /*
  556. * Do the flip.
  557. */
  558. RCU_TRACE_ME(rcupreempt_trace_try_flip_g1);
  559. rcu_ctrlblk.completed++; /* stands in for rcu_try_flip_g2 */
  560. /*
  561. * Need a memory barrier so that other CPUs see the new
  562. * counter value before they see the subsequent change of all
  563. * the rcu_flip_flag instances to rcu_flipped.
  564. */
  565. smp_mb(); /* see above block comment. */
  566. /* Now ask each CPU for acknowledgement of the flip. */
  567. for_each_cpu_mask(cpu, rcu_cpu_online_map) {
  568. per_cpu(rcu_flip_flag, cpu) = rcu_flipped;
  569. dyntick_save_progress_counter(cpu);
  570. }
  571. return 1;
  572. }
  573. /*
  574. * Wait for CPUs to acknowledge the flip.
  575. */
  576. static int
  577. rcu_try_flip_waitack(void)
  578. {
  579. int cpu;
  580. RCU_TRACE_ME(rcupreempt_trace_try_flip_a1);
  581. for_each_cpu_mask(cpu, rcu_cpu_online_map)
  582. if (rcu_try_flip_waitack_needed(cpu) &&
  583. per_cpu(rcu_flip_flag, cpu) != rcu_flip_seen) {
  584. RCU_TRACE_ME(rcupreempt_trace_try_flip_ae1);
  585. return 0;
  586. }
  587. /*
  588. * Make sure our checks above don't bleed into subsequent
  589. * waiting for the sum of the counters to reach zero.
  590. */
  591. smp_mb(); /* see above block comment. */
  592. RCU_TRACE_ME(rcupreempt_trace_try_flip_a2);
  593. return 1;
  594. }
  595. /*
  596. * Wait for collective ``last'' counter to reach zero,
  597. * then tell all CPUs to do an end-of-grace-period memory barrier.
  598. */
  599. static int
  600. rcu_try_flip_waitzero(void)
  601. {
  602. int cpu;
  603. int lastidx = !(rcu_ctrlblk.completed & 0x1);
  604. int sum = 0;
  605. /* Check to see if the sum of the "last" counters is zero. */
  606. RCU_TRACE_ME(rcupreempt_trace_try_flip_z1);
  607. for_each_cpu_mask(cpu, rcu_cpu_online_map)
  608. sum += RCU_DATA_CPU(cpu)->rcu_flipctr[lastidx];
  609. if (sum != 0) {
  610. RCU_TRACE_ME(rcupreempt_trace_try_flip_ze1);
  611. return 0;
  612. }
  613. /*
  614. * This ensures that the other CPUs see the call for
  615. * memory barriers -after- the sum to zero has been
  616. * detected here
  617. */
  618. smp_mb(); /* ^^^^^^^^^^^^ */
  619. /* Call for a memory barrier from each CPU. */
  620. for_each_cpu_mask(cpu, rcu_cpu_online_map) {
  621. per_cpu(rcu_mb_flag, cpu) = rcu_mb_needed;
  622. dyntick_save_progress_counter(cpu);
  623. }
  624. RCU_TRACE_ME(rcupreempt_trace_try_flip_z2);
  625. return 1;
  626. }
  627. /*
  628. * Wait for all CPUs to do their end-of-grace-period memory barrier.
  629. * Return 0 once all CPUs have done so.
  630. */
  631. static int
  632. rcu_try_flip_waitmb(void)
  633. {
  634. int cpu;
  635. RCU_TRACE_ME(rcupreempt_trace_try_flip_m1);
  636. for_each_cpu_mask(cpu, rcu_cpu_online_map)
  637. if (rcu_try_flip_waitmb_needed(cpu) &&
  638. per_cpu(rcu_mb_flag, cpu) != rcu_mb_done) {
  639. RCU_TRACE_ME(rcupreempt_trace_try_flip_me1);
  640. return 0;
  641. }
  642. smp_mb(); /* Ensure that the above checks precede any following flip. */
  643. RCU_TRACE_ME(rcupreempt_trace_try_flip_m2);
  644. return 1;
  645. }
  646. /*
  647. * Attempt a single flip of the counters. Remember, a single flip does
  648. * -not- constitute a grace period. Instead, the interval between
  649. * at least GP_STAGES consecutive flips is a grace period.
  650. *
  651. * If anyone is nuts enough to run this CONFIG_PREEMPT_RCU implementation
  652. * on a large SMP, they might want to use a hierarchical organization of
  653. * the per-CPU-counter pairs.
  654. */
  655. static void rcu_try_flip(void)
  656. {
  657. unsigned long flags;
  658. RCU_TRACE_ME(rcupreempt_trace_try_flip_1);
  659. if (unlikely(!spin_trylock_irqsave(&rcu_ctrlblk.fliplock, flags))) {
  660. RCU_TRACE_ME(rcupreempt_trace_try_flip_e1);
  661. return;
  662. }
  663. /*
  664. * Take the next transition(s) through the RCU grace-period
  665. * flip-counter state machine.
  666. */
  667. switch (rcu_ctrlblk.rcu_try_flip_state) {
  668. case rcu_try_flip_idle_state:
  669. if (rcu_try_flip_idle())
  670. rcu_ctrlblk.rcu_try_flip_state =
  671. rcu_try_flip_waitack_state;
  672. break;
  673. case rcu_try_flip_waitack_state:
  674. if (rcu_try_flip_waitack())
  675. rcu_ctrlblk.rcu_try_flip_state =
  676. rcu_try_flip_waitzero_state;
  677. break;
  678. case rcu_try_flip_waitzero_state:
  679. if (rcu_try_flip_waitzero())
  680. rcu_ctrlblk.rcu_try_flip_state =
  681. rcu_try_flip_waitmb_state;
  682. break;
  683. case rcu_try_flip_waitmb_state:
  684. if (rcu_try_flip_waitmb())
  685. rcu_ctrlblk.rcu_try_flip_state =
  686. rcu_try_flip_idle_state;
  687. }
  688. spin_unlock_irqrestore(&rcu_ctrlblk.fliplock, flags);
  689. }
  690. /*
  691. * Check to see if this CPU needs to do a memory barrier in order to
  692. * ensure that any prior RCU read-side critical sections have committed
  693. * their counter manipulations and critical-section memory references
  694. * before declaring the grace period to be completed.
  695. */
  696. static void rcu_check_mb(int cpu)
  697. {
  698. if (per_cpu(rcu_mb_flag, cpu) == rcu_mb_needed) {
  699. smp_mb(); /* Ensure RCU read-side accesses are visible. */
  700. per_cpu(rcu_mb_flag, cpu) = rcu_mb_done;
  701. }
  702. }
  703. void rcu_check_callbacks(int cpu, int user)
  704. {
  705. unsigned long flags;
  706. struct rcu_data *rdp = RCU_DATA_CPU(cpu);
  707. rcu_check_mb(cpu);
  708. if (rcu_ctrlblk.completed == rdp->completed)
  709. rcu_try_flip();
  710. spin_lock_irqsave(&rdp->lock, flags);
  711. RCU_TRACE_RDP(rcupreempt_trace_check_callbacks, rdp);
  712. __rcu_advance_callbacks(rdp);
  713. if (rdp->donelist == NULL) {
  714. spin_unlock_irqrestore(&rdp->lock, flags);
  715. } else {
  716. spin_unlock_irqrestore(&rdp->lock, flags);
  717. raise_softirq(RCU_SOFTIRQ);
  718. }
  719. }
  720. /*
  721. * Needed by dynticks, to make sure all RCU processing has finished
  722. * when we go idle:
  723. */
  724. void rcu_advance_callbacks(int cpu, int user)
  725. {
  726. unsigned long flags;
  727. struct rcu_data *rdp = RCU_DATA_CPU(cpu);
  728. if (rcu_ctrlblk.completed == rdp->completed) {
  729. rcu_try_flip();
  730. if (rcu_ctrlblk.completed == rdp->completed)
  731. return;
  732. }
  733. spin_lock_irqsave(&rdp->lock, flags);
  734. RCU_TRACE_RDP(rcupreempt_trace_check_callbacks, rdp);
  735. __rcu_advance_callbacks(rdp);
  736. spin_unlock_irqrestore(&rdp->lock, flags);
  737. }
  738. #ifdef CONFIG_HOTPLUG_CPU
  739. #define rcu_offline_cpu_enqueue(srclist, srctail, dstlist, dsttail) do { \
  740. *dsttail = srclist; \
  741. if (srclist != NULL) { \
  742. dsttail = srctail; \
  743. srclist = NULL; \
  744. srctail = &srclist;\
  745. } \
  746. } while (0)
  747. void rcu_offline_cpu(int cpu)
  748. {
  749. int i;
  750. struct rcu_head *list = NULL;
  751. unsigned long flags;
  752. struct rcu_data *rdp = RCU_DATA_CPU(cpu);
  753. struct rcu_head **tail = &list;
  754. /*
  755. * Remove all callbacks from the newly dead CPU, retaining order.
  756. * Otherwise rcu_barrier() will fail
  757. */
  758. spin_lock_irqsave(&rdp->lock, flags);
  759. rcu_offline_cpu_enqueue(rdp->donelist, rdp->donetail, list, tail);
  760. for (i = GP_STAGES - 1; i >= 0; i--)
  761. rcu_offline_cpu_enqueue(rdp->waitlist[i], rdp->waittail[i],
  762. list, tail);
  763. rcu_offline_cpu_enqueue(rdp->nextlist, rdp->nexttail, list, tail);
  764. spin_unlock_irqrestore(&rdp->lock, flags);
  765. rdp->waitlistcount = 0;
  766. /* Disengage the newly dead CPU from the grace-period computation. */
  767. spin_lock_irqsave(&rcu_ctrlblk.fliplock, flags);
  768. rcu_check_mb(cpu);
  769. if (per_cpu(rcu_flip_flag, cpu) == rcu_flipped) {
  770. smp_mb(); /* Subsequent counter accesses must see new value */
  771. per_cpu(rcu_flip_flag, cpu) = rcu_flip_seen;
  772. smp_mb(); /* Subsequent RCU read-side critical sections */
  773. /* seen -after- acknowledgement. */
  774. }
  775. RCU_DATA_ME()->rcu_flipctr[0] += RCU_DATA_CPU(cpu)->rcu_flipctr[0];
  776. RCU_DATA_ME()->rcu_flipctr[1] += RCU_DATA_CPU(cpu)->rcu_flipctr[1];
  777. RCU_DATA_CPU(cpu)->rcu_flipctr[0] = 0;
  778. RCU_DATA_CPU(cpu)->rcu_flipctr[1] = 0;
  779. cpu_clear(cpu, rcu_cpu_online_map);
  780. spin_unlock_irqrestore(&rcu_ctrlblk.fliplock, flags);
  781. /*
  782. * Place the removed callbacks on the current CPU's queue.
  783. * Make them all start a new grace period: simple approach,
  784. * in theory could starve a given set of callbacks, but
  785. * you would need to be doing some serious CPU hotplugging
  786. * to make this happen. If this becomes a problem, adding
  787. * a synchronize_rcu() to the hotplug path would be a simple
  788. * fix.
  789. */
  790. local_irq_save(flags);
  791. rdp = RCU_DATA_ME();
  792. spin_lock(&rdp->lock);
  793. *rdp->nexttail = list;
  794. if (list)
  795. rdp->nexttail = tail;
  796. spin_unlock_irqrestore(&rdp->lock, flags);
  797. }
  798. #else /* #ifdef CONFIG_HOTPLUG_CPU */
  799. void rcu_offline_cpu(int cpu)
  800. {
  801. }
  802. #endif /* #else #ifdef CONFIG_HOTPLUG_CPU */
  803. void __cpuinit rcu_online_cpu(int cpu)
  804. {
  805. unsigned long flags;
  806. spin_lock_irqsave(&rcu_ctrlblk.fliplock, flags);
  807. cpu_set(cpu, rcu_cpu_online_map);
  808. spin_unlock_irqrestore(&rcu_ctrlblk.fliplock, flags);
  809. }
  810. static void rcu_process_callbacks(struct softirq_action *unused)
  811. {
  812. unsigned long flags;
  813. struct rcu_head *next, *list;
  814. struct rcu_data *rdp;
  815. local_irq_save(flags);
  816. rdp = RCU_DATA_ME();
  817. spin_lock(&rdp->lock);
  818. list = rdp->donelist;
  819. if (list == NULL) {
  820. spin_unlock_irqrestore(&rdp->lock, flags);
  821. return;
  822. }
  823. rdp->donelist = NULL;
  824. rdp->donetail = &rdp->donelist;
  825. RCU_TRACE_RDP(rcupreempt_trace_done_remove, rdp);
  826. spin_unlock_irqrestore(&rdp->lock, flags);
  827. while (list) {
  828. next = list->next;
  829. list->func(list);
  830. list = next;
  831. RCU_TRACE_ME(rcupreempt_trace_invoke);
  832. }
  833. }
  834. void call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu))
  835. {
  836. unsigned long flags;
  837. struct rcu_data *rdp;
  838. head->func = func;
  839. head->next = NULL;
  840. local_irq_save(flags);
  841. rdp = RCU_DATA_ME();
  842. spin_lock(&rdp->lock);
  843. __rcu_advance_callbacks(rdp);
  844. *rdp->nexttail = head;
  845. rdp->nexttail = &head->next;
  846. RCU_TRACE_RDP(rcupreempt_trace_next_add, rdp);
  847. spin_unlock(&rdp->lock);
  848. local_irq_restore(flags);
  849. }
  850. EXPORT_SYMBOL_GPL(call_rcu);
  851. /*
  852. * Wait until all currently running preempt_disable() code segments
  853. * (including hardware-irq-disable segments) complete. Note that
  854. * in -rt this does -not- necessarily result in all currently executing
  855. * interrupt -handlers- having completed.
  856. */
  857. void __synchronize_sched(void)
  858. {
  859. cpumask_t oldmask;
  860. int cpu;
  861. if (sched_getaffinity(0, &oldmask) < 0)
  862. oldmask = cpu_possible_map;
  863. for_each_online_cpu(cpu) {
  864. sched_setaffinity(0, &cpumask_of_cpu(cpu));
  865. schedule();
  866. }
  867. sched_setaffinity(0, &oldmask);
  868. }
  869. EXPORT_SYMBOL_GPL(__synchronize_sched);
  870. /*
  871. * Check to see if any future RCU-related work will need to be done
  872. * by the current CPU, even if none need be done immediately, returning
  873. * 1 if so. Assumes that notifiers would take care of handling any
  874. * outstanding requests from the RCU core.
  875. *
  876. * This function is part of the RCU implementation; it is -not-
  877. * an exported member of the RCU API.
  878. */
  879. int rcu_needs_cpu(int cpu)
  880. {
  881. struct rcu_data *rdp = RCU_DATA_CPU(cpu);
  882. return (rdp->donelist != NULL ||
  883. !!rdp->waitlistcount ||
  884. rdp->nextlist != NULL);
  885. }
  886. int rcu_pending(int cpu)
  887. {
  888. struct rcu_data *rdp = RCU_DATA_CPU(cpu);
  889. /* The CPU has at least one callback queued somewhere. */
  890. if (rdp->donelist != NULL ||
  891. !!rdp->waitlistcount ||
  892. rdp->nextlist != NULL)
  893. return 1;
  894. /* The RCU core needs an acknowledgement from this CPU. */
  895. if ((per_cpu(rcu_flip_flag, cpu) == rcu_flipped) ||
  896. (per_cpu(rcu_mb_flag, cpu) == rcu_mb_needed))
  897. return 1;
  898. /* This CPU has fallen behind the global grace-period number. */
  899. if (rdp->completed != rcu_ctrlblk.completed)
  900. return 1;
  901. /* Nothing needed from this CPU. */
  902. return 0;
  903. }
  904. static int __cpuinit rcu_cpu_notify(struct notifier_block *self,
  905. unsigned long action, void *hcpu)
  906. {
  907. long cpu = (long)hcpu;
  908. switch (action) {
  909. case CPU_UP_PREPARE:
  910. case CPU_UP_PREPARE_FROZEN:
  911. rcu_online_cpu(cpu);
  912. break;
  913. case CPU_UP_CANCELED:
  914. case CPU_UP_CANCELED_FROZEN:
  915. case CPU_DEAD:
  916. case CPU_DEAD_FROZEN:
  917. rcu_offline_cpu(cpu);
  918. break;
  919. default:
  920. break;
  921. }
  922. return NOTIFY_OK;
  923. }
  924. static struct notifier_block __cpuinitdata rcu_nb = {
  925. .notifier_call = rcu_cpu_notify,
  926. };
  927. void __init __rcu_init(void)
  928. {
  929. int cpu;
  930. int i;
  931. struct rcu_data *rdp;
  932. printk(KERN_NOTICE "Preemptible RCU implementation.\n");
  933. for_each_possible_cpu(cpu) {
  934. rdp = RCU_DATA_CPU(cpu);
  935. spin_lock_init(&rdp->lock);
  936. rdp->completed = 0;
  937. rdp->waitlistcount = 0;
  938. rdp->nextlist = NULL;
  939. rdp->nexttail = &rdp->nextlist;
  940. for (i = 0; i < GP_STAGES; i++) {
  941. rdp->waitlist[i] = NULL;
  942. rdp->waittail[i] = &rdp->waitlist[i];
  943. }
  944. rdp->donelist = NULL;
  945. rdp->donetail = &rdp->donelist;
  946. rdp->rcu_flipctr[0] = 0;
  947. rdp->rcu_flipctr[1] = 0;
  948. }
  949. register_cpu_notifier(&rcu_nb);
  950. /*
  951. * We don't need protection against CPU-Hotplug here
  952. * since
  953. * a) If a CPU comes online while we are iterating over the
  954. * cpu_online_map below, we would only end up making a
  955. * duplicate call to rcu_online_cpu() which sets the corresponding
  956. * CPU's mask in the rcu_cpu_online_map.
  957. *
  958. * b) A CPU cannot go offline at this point in time since the user
  959. * does not have access to the sysfs interface, nor do we
  960. * suspend the system.
  961. */
  962. for_each_online_cpu(cpu)
  963. rcu_cpu_notify(&rcu_nb, CPU_UP_PREPARE, (void *)(long) cpu);
  964. open_softirq(RCU_SOFTIRQ, rcu_process_callbacks);
  965. }
  966. /*
  967. * Deprecated, use synchronize_rcu() or synchronize_sched() instead.
  968. */
  969. void synchronize_kernel(void)
  970. {
  971. synchronize_rcu();
  972. }
  973. #ifdef CONFIG_RCU_TRACE
  974. long *rcupreempt_flipctr(int cpu)
  975. {
  976. return &RCU_DATA_CPU(cpu)->rcu_flipctr[0];
  977. }
  978. EXPORT_SYMBOL_GPL(rcupreempt_flipctr);
  979. int rcupreempt_flip_flag(int cpu)
  980. {
  981. return per_cpu(rcu_flip_flag, cpu);
  982. }
  983. EXPORT_SYMBOL_GPL(rcupreempt_flip_flag);
  984. int rcupreempt_mb_flag(int cpu)
  985. {
  986. return per_cpu(rcu_mb_flag, cpu);
  987. }
  988. EXPORT_SYMBOL_GPL(rcupreempt_mb_flag);
  989. char *rcupreempt_try_flip_state_name(void)
  990. {
  991. return rcu_try_flip_state_names[rcu_ctrlblk.rcu_try_flip_state];
  992. }
  993. EXPORT_SYMBOL_GPL(rcupreempt_try_flip_state_name);
  994. struct rcupreempt_trace *rcupreempt_trace_cpu(int cpu)
  995. {
  996. struct rcu_data *rdp = RCU_DATA_CPU(cpu);
  997. return &rdp->trace;
  998. }
  999. EXPORT_SYMBOL_GPL(rcupreempt_trace_cpu);
  1000. #endif /* #ifdef RCU_TRACE */