浏览代码

[PATCH] cpu state clean after hot remove

Clean CPU states in order to reuse smp boot code for CPU hotplug.

Signed-off-by: Li Shaohua <shaohua.li@intel.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
Li Shaohua 20 年之前
父节点
当前提交
e1367daf3e
共有 7 个文件被更改,包括 187 次插入43 次删除
  1. 12 0
      arch/i386/kernel/cpu/common.c
  2. 5 0
      arch/i386/kernel/irq.c
  3. 9 11
      arch/i386/kernel/process.c
  4. 144 31
      arch/i386/kernel/smpboot.c
  5. 7 1
      drivers/base/cpu.c
  6. 2 0
      include/asm-i386/irq.h
  7. 8 0
      include/asm-i386/smp.h

+ 12 - 0
arch/i386/kernel/cpu/common.c

@@ -651,3 +651,15 @@ void __devinit cpu_init(void)
 	clear_used_math();
 	clear_used_math();
 	mxcsr_feature_mask_init();
 	mxcsr_feature_mask_init();
 }
 }
+
+#ifdef CONFIG_HOTPLUG_CPU
+void __devinit cpu_uninit(void)
+{
+	int cpu = raw_smp_processor_id();
+	cpu_clear(cpu, cpu_initialized);
+
+	/* lazy TLB state */
+	per_cpu(cpu_tlbstate, cpu).state = 0;
+	per_cpu(cpu_tlbstate, cpu).active_mm = &init_mm;
+}
+#endif

+ 5 - 0
arch/i386/kernel/irq.c

@@ -156,6 +156,11 @@ void irq_ctx_init(int cpu)
 		cpu,hardirq_ctx[cpu],softirq_ctx[cpu]);
 		cpu,hardirq_ctx[cpu],softirq_ctx[cpu]);
 }
 }
 
 
+void irq_ctx_exit(int cpu)
+{
+	hardirq_ctx[cpu] = NULL;
+}
+
 extern asmlinkage void __do_softirq(void);
 extern asmlinkage void __do_softirq(void);
 
 
 asmlinkage void do_softirq(void)
 asmlinkage void do_softirq(void)

+ 9 - 11
arch/i386/kernel/process.c

@@ -152,21 +152,19 @@ static void poll_idle (void)
 /* We don't actually take CPU down, just spin without interrupts. */
 /* We don't actually take CPU down, just spin without interrupts. */
 static inline void play_dead(void)
 static inline void play_dead(void)
 {
 {
+	/* This must be done before dead CPU ack */
+	cpu_exit_clear();
+	wbinvd();
+	mb();
 	/* Ack it */
 	/* Ack it */
 	__get_cpu_var(cpu_state) = CPU_DEAD;
 	__get_cpu_var(cpu_state) = CPU_DEAD;
 
 
-	/* We shouldn't have to disable interrupts while dead, but
-	 * some interrupts just don't seem to go away, and this makes
-	 * it "work" for testing purposes. */
-	/* Death loop */
-	while (__get_cpu_var(cpu_state) != CPU_UP_PREPARE)
-		cpu_relax();
-
+	/*
+	 * With physical CPU hotplug, we should halt the cpu
+	 */
 	local_irq_disable();
 	local_irq_disable();
-	__flush_tlb_all();
-	cpu_set(smp_processor_id(), cpu_online_map);
-	enable_APIC_timer();
-	local_irq_enable();
+	while (1)
+		__asm__ __volatile__("hlt":::"memory");
 }
 }
 #else
 #else
 static inline void play_dead(void)
 static inline void play_dead(void)

+ 144 - 31
arch/i386/kernel/smpboot.c

@@ -90,6 +90,12 @@ cpumask_t cpu_callout_map;
 EXPORT_SYMBOL(cpu_callout_map);
 EXPORT_SYMBOL(cpu_callout_map);
 static cpumask_t smp_commenced_mask;
 static cpumask_t smp_commenced_mask;
 
 
+/* TSC's upper 32 bits can't be written on earlier CPUs (before Prescott), so
+ * there is no way to resync one AP against the BP. TBD: for Prescott and
+ * above, we should use IA64's algorithm.
+ */
+static int __devinitdata tsc_sync_disabled;
+
 /* Per CPU bogomips and other parameters */
 /* Per CPU bogomips and other parameters */
 struct cpuinfo_x86 cpu_data[NR_CPUS] __cacheline_aligned;
 struct cpuinfo_x86 cpu_data[NR_CPUS] __cacheline_aligned;
 EXPORT_SYMBOL(cpu_data);
 EXPORT_SYMBOL(cpu_data);
@@ -427,7 +433,7 @@ static void __devinit smp_callin(void)
 	/*
 	/*
 	 *      Synchronize the TSC with the BP
 	 *      Synchronize the TSC with the BP
 	 */
 	 */
-	if (cpu_has_tsc && cpu_khz)
+	if (cpu_has_tsc && cpu_khz && !tsc_sync_disabled)
 		synchronize_tsc_ap();
 		synchronize_tsc_ap();
 }
 }
 
 
@@ -507,6 +513,7 @@ static void __devinit start_secondary(void *unused)
 	lock_ipi_call_lock();
 	lock_ipi_call_lock();
 	cpu_set(smp_processor_id(), cpu_online_map);
 	cpu_set(smp_processor_id(), cpu_online_map);
 	unlock_ipi_call_lock();
 	unlock_ipi_call_lock();
+	per_cpu(cpu_state, smp_processor_id()) = CPU_ONLINE;
 
 
 	/* We can take interrupts now: we're officially "up". */
 	/* We can take interrupts now: we're officially "up". */
 	local_irq_enable();
 	local_irq_enable();
@@ -816,8 +823,43 @@ wakeup_secondary_cpu(int phys_apicid, unsigned long start_eip)
 #endif	/* WAKE_SECONDARY_VIA_INIT */
 #endif	/* WAKE_SECONDARY_VIA_INIT */
 
 
 extern cpumask_t cpu_initialized;
 extern cpumask_t cpu_initialized;
+static inline int alloc_cpu_id(void)
+{
+	cpumask_t	tmp_map;
+	int cpu;
+	cpus_complement(tmp_map, cpu_present_map);
+	cpu = first_cpu(tmp_map);
+	if (cpu >= NR_CPUS)
+		return -ENODEV;
+	return cpu;
+}
+
+#ifdef CONFIG_HOTPLUG_CPU
+static struct task_struct * __devinitdata cpu_idle_tasks[NR_CPUS];
+static inline struct task_struct * alloc_idle_task(int cpu)
+{
+	struct task_struct *idle;
+
+	if ((idle = cpu_idle_tasks[cpu]) != NULL) {
+		/* initialize thread_struct.  we really want to avoid destroying
+		 * the idle thread
+		 */
+		idle->thread.esp = (unsigned long)(((struct pt_regs *)
+			(THREAD_SIZE + (unsigned long) idle->thread_info)) - 1);
+		init_idle(idle, cpu);
+		return idle;
+	}
+	idle = fork_idle(cpu);
+
+	if (!IS_ERR(idle))
+		cpu_idle_tasks[cpu] = idle;
+	return idle;
+}
+#else
+#define alloc_idle_task(cpu) fork_idle(cpu)
+#endif
 
 
-static int __devinit do_boot_cpu(int apicid)
+static int __devinit do_boot_cpu(int apicid, int cpu)
 /*
 /*
  * NOTE - on most systems this is a PHYSICAL apic ID, but on multiquad
  * NOTE - on most systems this is a PHYSICAL apic ID, but on multiquad
  * (ie clustered apic addressing mode), this is a LOGICAL apic ID.
  * (ie clustered apic addressing mode), this is a LOGICAL apic ID.
@@ -826,16 +868,17 @@ static int __devinit do_boot_cpu(int apicid)
 {
 {
 	struct task_struct *idle;
 	struct task_struct *idle;
 	unsigned long boot_error;
 	unsigned long boot_error;
-	int timeout, cpu;
+	int timeout;
 	unsigned long start_eip;
 	unsigned long start_eip;
 	unsigned short nmi_high = 0, nmi_low = 0;
 	unsigned short nmi_high = 0, nmi_low = 0;
 
 
-	cpu = ++cpucount;
+	++cpucount;
+
 	/*
 	/*
 	 * We can't use kernel_thread since we must avoid to
 	 * We can't use kernel_thread since we must avoid to
 	 * reschedule the child.
 	 * reschedule the child.
 	 */
 	 */
-	idle = fork_idle(cpu);
+	idle = alloc_idle_task(cpu);
 	if (IS_ERR(idle))
 	if (IS_ERR(idle))
 		panic("failed fork for CPU %d", cpu);
 		panic("failed fork for CPU %d", cpu);
 	idle->thread.eip = (unsigned long) start_secondary;
 	idle->thread.eip = (unsigned long) start_secondary;
@@ -902,13 +945,16 @@ static int __devinit do_boot_cpu(int apicid)
 			inquire_remote_apic(apicid);
 			inquire_remote_apic(apicid);
 		}
 		}
 	}
 	}
-	x86_cpu_to_apicid[cpu] = apicid;
+
 	if (boot_error) {
 	if (boot_error) {
 		/* Try to put things back the way they were before ... */
 		/* Try to put things back the way they were before ... */
 		unmap_cpu_to_logical_apicid(cpu);
 		unmap_cpu_to_logical_apicid(cpu);
 		cpu_clear(cpu, cpu_callout_map); /* was set here (do_boot_cpu()) */
 		cpu_clear(cpu, cpu_callout_map); /* was set here (do_boot_cpu()) */
 		cpu_clear(cpu, cpu_initialized); /* was set by cpu_init() */
 		cpu_clear(cpu, cpu_initialized); /* was set by cpu_init() */
 		cpucount--;
 		cpucount--;
+	} else {
+		x86_cpu_to_apicid[cpu] = apicid;
+		cpu_set(cpu, cpu_present_map);
 	}
 	}
 
 
 	/* mark "stuck" area as not stuck */
 	/* mark "stuck" area as not stuck */
@@ -917,6 +963,75 @@ static int __devinit do_boot_cpu(int apicid)
 	return boot_error;
 	return boot_error;
 }
 }
 
 
+#ifdef CONFIG_HOTPLUG_CPU
+void cpu_exit_clear(void)
+{
+	int cpu = raw_smp_processor_id();
+
+	idle_task_exit();
+
+	cpucount --;
+	cpu_uninit();
+	irq_ctx_exit(cpu);
+
+	cpu_clear(cpu, cpu_callout_map);
+	cpu_clear(cpu, cpu_callin_map);
+	cpu_clear(cpu, cpu_present_map);
+
+	cpu_clear(cpu, smp_commenced_mask);
+	unmap_cpu_to_logical_apicid(cpu);
+}
+
+struct warm_boot_cpu_info {
+	struct completion *complete;
+	int apicid;
+	int cpu;
+};
+
+static void __devinit do_warm_boot_cpu(void *p)
+{
+	struct warm_boot_cpu_info *info = p;
+	do_boot_cpu(info->apicid, info->cpu);
+	complete(info->complete);
+}
+
+int __devinit smp_prepare_cpu(int cpu)
+{
+	DECLARE_COMPLETION(done);
+	struct warm_boot_cpu_info info;
+	struct work_struct task;
+	int	apicid, ret;
+
+	lock_cpu_hotplug();
+	apicid = x86_cpu_to_apicid[cpu];
+	if (apicid == BAD_APICID) {
+		ret = -ENODEV;
+		goto exit;
+	}
+
+	info.complete = &done;
+	info.apicid = apicid;
+	info.cpu = cpu;
+	INIT_WORK(&task, do_warm_boot_cpu, &info);
+
+	tsc_sync_disabled = 1;
+
+	/* init low mem mapping */
+	memcpy(swapper_pg_dir, swapper_pg_dir + USER_PGD_PTRS,
+			sizeof(swapper_pg_dir[0]) * KERNEL_PGD_PTRS);
+	flush_tlb_all();
+	schedule_work(&task);
+	wait_for_completion(&done);
+
+	tsc_sync_disabled = 0;
+	zap_low_mappings();
+	ret = 0;
+exit:
+	unlock_cpu_hotplug();
+	return ret;
+}
+#endif
+
 static void smp_tune_scheduling (void)
 static void smp_tune_scheduling (void)
 {
 {
 	unsigned long cachesize;       /* kB   */
 	unsigned long cachesize;       /* kB   */
@@ -1069,7 +1184,7 @@ static void __init smp_boot_cpus(unsigned int max_cpus)
 		if (max_cpus <= cpucount+1)
 		if (max_cpus <= cpucount+1)
 			continue;
 			continue;
 
 
-		if (do_boot_cpu(apicid))
+		if (((cpu = alloc_cpu_id()) <= 0) || do_boot_cpu(apicid, cpu))
 			printk("CPU #%d not responding - cannot use it.\n",
 			printk("CPU #%d not responding - cannot use it.\n",
 								apicid);
 								apicid);
 		else
 		else
@@ -1149,25 +1264,24 @@ void __devinit smp_prepare_boot_cpu(void)
 {
 {
 	cpu_set(smp_processor_id(), cpu_online_map);
 	cpu_set(smp_processor_id(), cpu_online_map);
 	cpu_set(smp_processor_id(), cpu_callout_map);
 	cpu_set(smp_processor_id(), cpu_callout_map);
+	cpu_set(smp_processor_id(), cpu_present_map);
+	per_cpu(cpu_state, smp_processor_id()) = CPU_ONLINE;
 }
 }
 
 
 #ifdef CONFIG_HOTPLUG_CPU
 #ifdef CONFIG_HOTPLUG_CPU
-
-/* must be called with the cpucontrol mutex held */
-static int __devinit cpu_enable(unsigned int cpu)
+static void
+remove_siblinginfo(int cpu)
 {
 {
-	/* get the target out of its holding state */
-	per_cpu(cpu_state, cpu) = CPU_UP_PREPARE;
-	wmb();
-
-	/* wait for the processor to ack it. timeout? */
-	while (!cpu_online(cpu))
-		cpu_relax();
-
-	fixup_irqs(cpu_online_map);
-	/* counter the disable in fixup_irqs() */
-	local_irq_enable();
-	return 0;
+	int sibling;
+
+	for_each_cpu_mask(sibling, cpu_sibling_map[cpu])
+		cpu_clear(cpu, cpu_sibling_map[sibling]);
+	for_each_cpu_mask(sibling, cpu_core_map[cpu])
+		cpu_clear(cpu, cpu_core_map[sibling]);
+	cpus_clear(cpu_sibling_map[cpu]);
+	cpus_clear(cpu_core_map[cpu]);
+	phys_proc_id[cpu] = BAD_APICID;
+	cpu_core_id[cpu] = BAD_APICID;
 }
 }
 
 
 int __cpu_disable(void)
 int __cpu_disable(void)
@@ -1193,6 +1307,8 @@ int __cpu_disable(void)
 	mdelay(1);
 	mdelay(1);
 	local_irq_disable();
 	local_irq_disable();
 
 
+	remove_siblinginfo(cpu);
+
 	cpu_clear(cpu, map);
 	cpu_clear(cpu, map);
 	fixup_irqs(map);
 	fixup_irqs(map);
 	/* It's now safe to remove this processor from the online map */
 	/* It's now safe to remove this processor from the online map */
@@ -1207,8 +1323,10 @@ void __cpu_die(unsigned int cpu)
 
 
 	for (i = 0; i < 10; i++) {
 	for (i = 0; i < 10; i++) {
 		/* They ack this in play_dead by setting CPU_DEAD */
 		/* They ack this in play_dead by setting CPU_DEAD */
-		if (per_cpu(cpu_state, cpu) == CPU_DEAD)
+		if (per_cpu(cpu_state, cpu) == CPU_DEAD) {
+			printk ("CPU %d is now offline\n", cpu);
 			return;
 			return;
+		}
 		current->state = TASK_UNINTERRUPTIBLE;
 		current->state = TASK_UNINTERRUPTIBLE;
 		schedule_timeout(HZ/10);
 		schedule_timeout(HZ/10);
 	}
 	}
@@ -1236,15 +1354,8 @@ int __devinit __cpu_up(unsigned int cpu)
 		return -EIO;
 		return -EIO;
 	}
 	}
 
 
-#ifdef CONFIG_HOTPLUG_CPU
-	/* Already up, and in cpu_quiescent now? */
-	if (cpu_isset(cpu, smp_commenced_mask)) {
-		cpu_enable(cpu);
-		return 0;
-	}
-#endif
-
 	local_irq_enable();
 	local_irq_enable();
+	per_cpu(cpu_state, cpu) = CPU_UP_PREPARE;
 	/* Unleash the CPU! */
 	/* Unleash the CPU! */
 	cpu_set(cpu, smp_commenced_mask);
 	cpu_set(cpu, smp_commenced_mask);
 	while (!cpu_isset(cpu, cpu_online_map))
 	while (!cpu_isset(cpu, cpu_online_map))
@@ -1258,10 +1369,12 @@ void __init smp_cpus_done(unsigned int max_cpus)
 	setup_ioapic_dest();
 	setup_ioapic_dest();
 #endif
 #endif
 	zap_low_mappings();
 	zap_low_mappings();
+#ifndef CONFIG_HOTPLUG_CPU
 	/*
 	/*
 	 * Disable executability of the SMP trampoline:
 	 * Disable executability of the SMP trampoline:
 	 */
 	 */
 	set_kernel_exec((unsigned long)trampoline_base, trampoline_exec);
 	set_kernel_exec((unsigned long)trampoline_base, trampoline_exec);
+#endif
 }
 }
 
 
 void __init smp_intr_init(void)
 void __init smp_intr_init(void)

+ 7 - 1
drivers/base/cpu.c

@@ -16,6 +16,10 @@ struct sysdev_class cpu_sysdev_class = {
 EXPORT_SYMBOL(cpu_sysdev_class);
 EXPORT_SYMBOL(cpu_sysdev_class);
 
 
 #ifdef CONFIG_HOTPLUG_CPU
 #ifdef CONFIG_HOTPLUG_CPU
+#ifndef __HAVE_ARCH_SMP_PREPARE_CPU
+#define smp_prepare_cpu(cpu) (0)
+#endif
+
 static ssize_t show_online(struct sys_device *dev, char *buf)
 static ssize_t show_online(struct sys_device *dev, char *buf)
 {
 {
 	struct cpu *cpu = container_of(dev, struct cpu, sysdev);
 	struct cpu *cpu = container_of(dev, struct cpu, sysdev);
@@ -36,7 +40,9 @@ static ssize_t store_online(struct sys_device *dev, const char *buf,
 			kobject_hotplug(&dev->kobj, KOBJ_OFFLINE);
 			kobject_hotplug(&dev->kobj, KOBJ_OFFLINE);
 		break;
 		break;
 	case '1':
 	case '1':
-		ret = cpu_up(cpu->sysdev.id);
+		ret = smp_prepare_cpu(cpu->sysdev.id);
+		if (ret == 0)
+			ret = cpu_up(cpu->sysdev.id);
 		break;
 		break;
 	default:
 	default:
 		ret = -EINVAL;
 		ret = -EINVAL;

+ 2 - 0
include/asm-i386/irq.h

@@ -29,9 +29,11 @@ extern void release_vm86_irqs(struct task_struct *);
 
 
 #ifdef CONFIG_4KSTACKS
 #ifdef CONFIG_4KSTACKS
   extern void irq_ctx_init(int cpu);
   extern void irq_ctx_init(int cpu);
+  extern void irq_ctx_exit(int cpu);
 # define __ARCH_HAS_DO_SOFTIRQ
 # define __ARCH_HAS_DO_SOFTIRQ
 #else
 #else
 # define irq_ctx_init(cpu) do { } while (0)
 # define irq_ctx_init(cpu) do { } while (0)
+# define irq_ctx_exit(cpu) do { } while (0)
 #endif
 #endif
 
 
 #ifdef CONFIG_IRQBALANCE
 #ifdef CONFIG_IRQBALANCE

+ 8 - 0
include/asm-i386/smp.h

@@ -48,6 +48,14 @@ extern void unlock_ipi_call_lock(void);
 #define MAX_APICID 256
 #define MAX_APICID 256
 extern u8 x86_cpu_to_apicid[];
 extern u8 x86_cpu_to_apicid[];
 
 
+#ifdef CONFIG_HOTPLUG_CPU
+extern void cpu_exit_clear(void);
+extern void cpu_uninit(void);
+
+#define __HAVE_ARCH_SMP_PREPARE_CPU
+extern int smp_prepare_cpu(int cpu);
+#endif
+
 /*
 /*
  * This function is needed by all SMP systems. It must _always_ be valid
  * This function is needed by all SMP systems. It must _always_ be valid
  * from the initial startup. We map APIC_BASE very early in page_setup(),
  * from the initial startup. We map APIC_BASE very early in page_setup(),