/* tlb.c */

#include <linux/init.h>

#include <linux/mm.h>
#include <linux/spinlock.h>
#include <linux/smp.h>
#include <linux/interrupt.h>
#include <linux/module.h>
#include <linux/cpu.h>

#include <asm/tlbflush.h>
#include <asm/mmu_context.h>
#include <asm/cache.h>
#include <asm/apic.h>
#include <asm/uv/uv.h>

DEFINE_PER_CPU_SHARED_ALIGNED(struct tlb_state, cpu_tlbstate)
			= { &init_mm, 0, };

/*
 *	Smarter SMP flushing macros.
 *		c/o Linus Torvalds.
 *
 *	These mean you can really definitely utterly forget about
 *	writing to user space from interrupts. (It's not allowed anyway).
 *
 *	Optimizations Manfred Spraul <manfred@colorfullife.com>
 *
 *	More scalable flush, from Andi Kleen
 *
 *	To avoid global state use 8 different call vectors.
 *	Each CPU uses a specific vector to trigger flushes on other
 *	CPUs. Depending on the received vector the target CPUs look into
 *	the right array slot for the flush data.
 *
 *	With more than 8 CPUs they are hashed to the 8 available
 *	vectors. The limited global vector space forces us to this right now.
 *	In future when interrupts are split into per CPU domains this could be
 *	fixed, at the cost of triggering multiple IPIs in some cases.
 */

union smp_flush_state {
	struct {
		struct mm_struct *flush_mm;
		unsigned long flush_start;
		unsigned long flush_end;
		raw_spinlock_t tlbstate_lock;
		DECLARE_BITMAP(flush_cpumask, NR_CPUS);
	};
	char pad[INTERNODE_CACHE_BYTES];
} ____cacheline_internodealigned_in_smp;

/* State is put into the per CPU data section, but padded
   to a full cache line because other CPUs can access it and we don't
   want false sharing in the per cpu data segment. */
static union smp_flush_state flush_state[NUM_INVALIDATE_TLB_VECTORS];

static DEFINE_PER_CPU_READ_MOSTLY(int, tlb_vector_offset);

/*
 * We cannot call mmdrop() because we are in interrupt context,
 * instead update mm->cpu_vm_mask.
 */
void leave_mm(int cpu)
{
	struct mm_struct *active_mm = this_cpu_read(cpu_tlbstate.active_mm);

	if (this_cpu_read(cpu_tlbstate.state) == TLBSTATE_OK)
		BUG();
	if (cpumask_test_cpu(cpu, mm_cpumask(active_mm))) {
		cpumask_clear_cpu(cpu, mm_cpumask(active_mm));
		load_cr3(swapper_pg_dir);
	}
}
EXPORT_SYMBOL_GPL(leave_mm);

/*
 *
 * The flush IPI assumes that a thread switch happens in this order:
 * [cpu0: the cpu that switches]
 * 1) switch_mm() either 1a) or 1b)
 * 1a) thread switch to a different mm
 * 1a1) cpu_clear(cpu, old_mm->cpu_vm_mask);
 *	Stop ipi delivery for the old mm. This is not synchronized with
 *	the other cpus, but smp_invalidate_interrupt ignores flush ipis
 *	for the wrong mm, and in the worst case we perform a superfluous
 *	tlb flush.
 * 1a2) set cpu mmu_state to TLBSTATE_OK
 *	Now the smp_invalidate_interrupt won't call leave_mm if cpu0
 *	was in lazy tlb mode.
 * 1a3) update cpu active_mm
 *	Now cpu0 accepts tlb flushes for the new mm.
 * 1a4) cpu_set(cpu, new_mm->cpu_vm_mask);
 *	Now the other cpus will send tlb flush ipis.
 * 1a5) change cr3.
 * 1b) thread switch without mm change
 *	cpu active_mm is correct, cpu0 already handles
 *	flush ipis.
 * 1b1) set cpu mmu_state to TLBSTATE_OK
 * 1b2) test_and_set the cpu bit in cpu_vm_mask.
 *	Atomically set the bit [other cpus will start sending flush ipis],
 *	and test the bit.
 * 1b3) if the bit was 0: leave_mm was called, flush the tlb.
 * 2) switch %esp, ie current
 *
 * The interrupt must handle 2 special cases:
 * - cr3 is changed before %esp, ie. it cannot use current->{active_,}mm.
 * - the cpu performs speculative tlb reads, i.e. even if the cpu only
 *   runs in kernel space, the cpu could load tlb entries for user space
 *   pages.
 *
 * The good news is that cpu mmu_state is local to each cpu, no
 * write/read ordering problems.
 */

/*
 * TLB flush IPI:
 *
 * 1) Flush the tlb entries if the cpu uses the mm that's being flushed.
 * 2) Leave the mm if we are in the lazy tlb mode.
 *
 * Interrupts are disabled.
 */

/*
 * FIXME: use of asmlinkage is not consistent.  On x86_64 it's a noop
 * but is still used for documentation purposes, and the usage is slightly
 * inconsistent.  On x86_32, asmlinkage is regparm(0) but interrupt
 * entry calls in with the first parameter in %eax.  Maybe define
 * intrlinkage?
 */
#ifdef CONFIG_X86_64
asmlinkage
#endif
void smp_invalidate_interrupt(struct pt_regs *regs)
{
	unsigned int cpu;
	unsigned int sender;
	union smp_flush_state *f;

	cpu = smp_processor_id();
	/*
	 * orig_rax contains the negated interrupt vector.
	 * Use that to determine where the sender put the data.
	 */
	sender = ~regs->orig_ax - INVALIDATE_TLB_VECTOR_START;
	f = &flush_state[sender];

	if (!cpumask_test_cpu(cpu, to_cpumask(f->flush_cpumask)))
		goto out;
		/*
		 * This was a BUG() but until someone can quote me the
		 * line from the intel manual that guarantees an IPI to
		 * multiple CPUs is retried _only_ on the erroring CPUs
		 * it's staying as a return
		 *
		 * BUG();
		 */

	if (f->flush_mm == this_cpu_read(cpu_tlbstate.active_mm)) {
		if (this_cpu_read(cpu_tlbstate.state) == TLBSTATE_OK) {
			if (f->flush_end == TLB_FLUSH_ALL
					|| !cpu_has_invlpg)
				local_flush_tlb();
			else if (!f->flush_end)
				__flush_tlb_single(f->flush_start);
			else {
				unsigned long addr;
				addr = f->flush_start;
				while (addr < f->flush_end) {
					__flush_tlb_single(addr);
					addr += PAGE_SIZE;
				}
			}
		} else
			leave_mm(cpu);
	}
out:
	ack_APIC_irq();
	smp_mb__before_clear_bit();
	cpumask_clear_cpu(cpu, to_cpumask(f->flush_cpumask));
	smp_mb__after_clear_bit();
	inc_irq_stat(irq_tlb_count);
}
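
/*
 * Publish the flush parameters in this CPU's flush_state slot, send the
 * invalidate IPI to the other CPUs in @cpumask, and spin until each of
 * them has cleared its bit in flush_cpumask.  The caller must have
 * preemption disabled; the slot lock is only taken when there are more
 * CPUs than invalidate vectors.
 */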
static void flush_tlb_others_ipi(const struct cpumask *cpumask,
				 struct mm_struct *mm, unsigned long start,
				 unsigned long end)
{
	unsigned int sender;
	union smp_flush_state *f;

	/* Caller has disabled preemption */
	sender = this_cpu_read(tlb_vector_offset);
	f = &flush_state[sender];

	if (nr_cpu_ids > NUM_INVALIDATE_TLB_VECTORS)
		raw_spin_lock(&f->tlbstate_lock);

	f->flush_mm = mm;
	f->flush_start = start;
	f->flush_end = end;
	if (cpumask_andnot(to_cpumask(f->flush_cpumask), cpumask, cpumask_of(smp_processor_id()))) {
		/*
		 * We have to send the IPI only to
		 * CPUs affected.
		 */
		apic->send_IPI_mask(to_cpumask(f->flush_cpumask),
			      INVALIDATE_TLB_VECTOR_START + sender);

		while (!cpumask_empty(to_cpumask(f->flush_cpumask)))
			cpu_relax();
	}

	f->flush_mm = NULL;
	f->flush_start = 0;
	f->flush_end = 0;
	if (nr_cpu_ids > NUM_INVALIDATE_TLB_VECTORS)
		raw_spin_unlock(&f->tlbstate_lock);
}
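
/*
 * On UV systems hand the flush off to uv_flush_tlb_others() first; any
 * CPUs it leaves in the returned mask still get a conventional IPI.
 * Everyone else goes straight through flush_tlb_others_ipi().
 */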
void native_flush_tlb_others(const struct cpumask *cpumask,
				 struct mm_struct *mm, unsigned long start,
				 unsigned long end)
{
	if (is_uv_system()) {
		unsigned int cpu;

		cpu = smp_processor_id();
		cpumask = uv_flush_tlb_others(cpumask, mm, start, end, cpu);
		if (cpumask)
			flush_tlb_others_ipi(cpumask, mm, start, end);
		return;
	}
	flush_tlb_others_ipi(cpumask, mm, start, end);
}
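
/*
 * Spread the invalidate vectors over the online NUMA nodes: each node
 * gets an equal share of NUM_INVALIDATE_TLB_VECTORS and its CPUs cycle
 * through that share, so CPUs on different nodes tend to use different
 * flush_state slots (and therefore different locks).
 */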
static void __cpuinit calculate_tlb_offset(void)
{
	int cpu, node, nr_node_vecs, idx = 0;
	/*
	 * we are changing tlb_vector_offset for each CPU at runtime, but this
	 * will not cause inconsistency, as the write is atomic under x86. we
	 * might see more lock contention in a short time, but after all CPUs'
	 * tlb_vector_offset are changed, everything should go normal
	 *
	 * Note: if NUM_INVALIDATE_TLB_VECTORS % nr_online_nodes != 0, we might
	 * waste some vectors.
	 */
	if (nr_online_nodes > NUM_INVALIDATE_TLB_VECTORS)
		nr_node_vecs = 1;
	else
		nr_node_vecs = NUM_INVALIDATE_TLB_VECTORS/nr_online_nodes;

	for_each_online_node(node) {
		int node_offset = (idx % NUM_INVALIDATE_TLB_VECTORS) *
			nr_node_vecs;
		int cpu_offset = 0;
		for_each_cpu(cpu, cpumask_of_node(node)) {
			per_cpu(tlb_vector_offset, cpu) = node_offset +
				cpu_offset;
			cpu_offset++;
			cpu_offset = cpu_offset % nr_node_vecs;
		}
		idx++;
	}
}
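
/* CPU hotplug callback: redistribute the vectors when a CPU comes or goes. */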
static int __cpuinit tlb_cpuhp_notify(struct notifier_block *n,
		unsigned long action, void *hcpu)
{
	switch (action & 0xf) {
	case CPU_ONLINE:
	case CPU_DEAD:
		calculate_tlb_offset();
	}
	return NOTIFY_OK;
}

static int __cpuinit init_smp_flush(void)
{
	int i;

	for (i = 0; i < ARRAY_SIZE(flush_state); i++)
		raw_spin_lock_init(&flush_state[i].tlbstate_lock);

	calculate_tlb_offset();
	hotcpu_notifier(tlb_cpuhp_notify, 0);
	return 0;
}
core_initcall(init_smp_flush);
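
/*
 * Flush all TLB entries for current->mm: locally right away, and via
 * flush_tlb_others() on every other CPU that has this mm loaded.
 */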
void flush_tlb_current_task(void)
{
	struct mm_struct *mm = current->mm;

	preempt_disable();

	local_flush_tlb();
	if (cpumask_any_but(mm_cpumask(mm), smp_processor_id()) < nr_cpu_ids)
		flush_tlb_others(mm_cpumask(mm), mm, 0UL, TLB_FLUSH_ALL);
	preempt_enable();
}
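
/*
 * Flush all TLB entries for @mm.  If this CPU is using the mm, flush it
 * locally (or drop it via leave_mm() when in lazy TLB mode), then notify
 * the other CPUs in mm_cpumask(mm).
 */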
void flush_tlb_mm(struct mm_struct *mm)
{
	preempt_disable();

	if (current->active_mm == mm) {
		if (current->mm)
			local_flush_tlb();
		else
			leave_mm(smp_processor_id());
	}
	if (cpumask_any_but(mm_cpumask(mm), smp_processor_id()) < nr_cpu_ids)
		flush_tlb_others(mm_cpumask(mm), mm, 0UL, TLB_FLUSH_ALL);

	preempt_enable();
}
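
/*
 * has_large_page - return the first HPAGE_SIZE-aligned address in
 * [start, end) that is mapped by a large pmd, or 0 if there is none
 * (always 0 when transparent hugepages are not configured).
 */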
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
static inline unsigned long has_large_page(struct mm_struct *mm,
				 unsigned long start, unsigned long end)
{
	pgd_t *pgd;
	pud_t *pud;
	pmd_t *pmd;
	unsigned long addr = ALIGN(start, HPAGE_SIZE);
	for (; addr < end; addr += HPAGE_SIZE) {
		pgd = pgd_offset(mm, addr);
		if (likely(!pgd_none(*pgd))) {
			pud = pud_offset(pgd, addr);
			if (likely(!pud_none(*pud))) {
				pmd = pmd_offset(pud, addr);
				if (likely(!pmd_none(*pmd)))
					if (pmd_large(*pmd))
						return addr;
			}
		}
	}
	return 0;
}
#else
static inline unsigned long has_large_page(struct mm_struct *mm,
					 unsigned long start, unsigned long end)
{
	return 0;
}
#endif
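
/*
 * Range flush: for hugetlb VMAs, or when the CPU provides no
 * tlb_flushall_shift hint (-1), simply flush the whole mm.  Otherwise,
 * if the number of pages in [start, end) does not exceed the relevant
 * 4K TLB size (capped at mm->total_vm) shifted down by
 * tlb_flushall_shift, flush the pages one by one with
 * __flush_tlb_single(); a large page in the range forces a full mm
 * flush instead.
 */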
void flush_tlb_range(struct vm_area_struct *vma,
				   unsigned long start, unsigned long end)
{
	struct mm_struct *mm;

	if (vma->vm_flags & VM_HUGETLB || tlb_flushall_shift == -1) {
flush_all:
		flush_tlb_mm(vma->vm_mm);
		return;
	}

	preempt_disable();
	mm = vma->vm_mm;
	if (current->active_mm == mm) {
		if (current->mm) {
			unsigned long addr, vmflag = vma->vm_flags;
			unsigned act_entries, tlb_entries = 0;

			if (vmflag & VM_EXEC)
				tlb_entries = tlb_lli_4k[ENTRIES];
			else
				tlb_entries = tlb_lld_4k[ENTRIES];

			act_entries = tlb_entries > mm->total_vm ?
					mm->total_vm : tlb_entries;
			if ((end - start) >> PAGE_SHIFT >
					act_entries >> tlb_flushall_shift)
				local_flush_tlb();
			else {
				if (has_large_page(mm, start, end)) {
					preempt_enable();
					goto flush_all;
				}
				for (addr = start; addr < end;
						addr += PAGE_SIZE)
					__flush_tlb_single(addr);

				if (cpumask_any_but(mm_cpumask(mm),
					smp_processor_id()) < nr_cpu_ids)
					flush_tlb_others(mm_cpumask(mm), mm,
								start, end);
				preempt_enable();
				return;
			}
		} else {
			leave_mm(smp_processor_id());
		}
	}
	if (cpumask_any_but(mm_cpumask(mm), smp_processor_id()) < nr_cpu_ids)
		flush_tlb_others(mm_cpumask(mm), mm, 0UL, TLB_FLUSH_ALL);
	preempt_enable();
}
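
/*
 * Flush a single page at @start from the TLB of every CPU that is
 * using vma->vm_mm; the remote handlers see flush_end == 0 and flush
 * only that page.
 */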
void flush_tlb_page(struct vm_area_struct *vma, unsigned long start)
{
	struct mm_struct *mm = vma->vm_mm;

	preempt_disable();

	if (current->active_mm == mm) {
		if (current->mm)
			__flush_tlb_one(start);
		else
			leave_mm(smp_processor_id());
	}

	if (cpumask_any_but(mm_cpumask(mm), smp_processor_id()) < nr_cpu_ids)
		flush_tlb_others(mm_cpumask(mm), mm, start, 0UL);

	preempt_enable();
}
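
/*
 * Flush the entire TLB on every online CPU.  CPUs that are in lazy TLB
 * mode additionally drop their borrowed mm via leave_mm().
 */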
static void do_flush_tlb_all(void *info)
{
	__flush_tlb_all();
	if (this_cpu_read(cpu_tlbstate.state) == TLBSTATE_LAZY)
		leave_mm(smp_processor_id());
}

void flush_tlb_all(void)
{
	on_each_cpu(do_flush_tlb_all, NULL, 1);
}