Merge branch 'drm-patches' of master.kernel.org:/pub/scm/linux/kernel/git/airlied...
[linux-drm-fsl-dcu.git] / arch / i386 / kernel / smpboot.c
index 8c6c8c52b95c0b574b837c279f84647d3b4ba867..a4b7ad283f49ace7e351fbb45f4917a75157af7f 100644 (file)
  *             Dave Jones      :       Report invalid combinations of Athlon CPUs.
 *              Rusty Russell   :       Hacked into shape for new "hotplug" boot process. */
 
-
-/* SMP boot always wants to use real time delay to allow sufficient time for
- * the APs to come online */
-#define USE_REAL_TIME_DELAY
-
 #include <linux/module.h>
 #include <linux/init.h>
 #include <linux/kernel.h>
@@ -50,6 +45,7 @@
 #include <linux/notifier.h>
 #include <linux/cpu.h>
 #include <linux/percpu.h>
+#include <linux/nmi.h>
 
 #include <linux/delay.h>
 #include <linux/mc146818rtc.h>
 #include <asm/desc.h>
 #include <asm/arch_hooks.h>
 #include <asm/nmi.h>
-#include <asm/pda.h>
-#include <asm/genapic.h>
 
 #include <mach_apic.h>
 #include <mach_wakecpu.h>
 #include <smpboot_hooks.h>
+#include <asm/vmi.h>
+#include <asm/mtrr.h>
 
 /* Set if we find a B stepping CPU */
 static int __devinitdata smp_b_stepping;
@@ -93,12 +89,6 @@ cpumask_t cpu_possible_map;
 EXPORT_SYMBOL(cpu_possible_map);
 static cpumask_t smp_commenced_mask;
 
-/* TSC's upper 32 bits can't be written on earlier CPUs (before Prescott), so
- * there is no way to resync one AP against the BP. TBD: for Prescott and
- * above, we should use IA64's algorithm
- */
-static int __devinitdata tsc_sync_disabled;
-
 /* Per CPU bogomips and other parameters */
 struct cpuinfo_x86 cpu_data[NR_CPUS] __cacheline_aligned;
 EXPORT_SYMBOL(cpu_data);
@@ -109,6 +99,9 @@ EXPORT_SYMBOL(x86_cpu_to_apicid);
 
 u8 apicid_2_node[MAX_APICID];
 
+DEFINE_PER_CPU(unsigned long, this_cpu_off);
+EXPORT_PER_CPU_SYMBOL(this_cpu_off);
+
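
this_cpu_off mirrors __per_cpu_offset[cpu] into the per-cpu area itself;
together with the GDT_ENTRY_PERCPU segment installed later in this patch,
it is what lets the old PDA scheme (note the removed <asm/pda.h> include
above) go away. The underlying address arithmetic, as a hedged sketch
(the real macro lives in include/asm-generic/percpu.h; this illustrates
the idea, it is not the kernel's definition):

        /* A per-CPU variable is its link-time address plus the CPU's offset. */
        #define per_cpu_sketch(var, cpu) \
                (*(__typeof__(var) *)((char *)&(var) + __per_cpu_offset[cpu]))
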
 /*
  * Trampoline 80x86 program as an array.
  */
@@ -165,7 +158,7 @@ static void __cpuinit smp_store_cpu_info(int id)
 
        *c = boot_cpu_data;
        if (id!=0)
-               identify_cpu(c);
+               identify_secondary_cpu(c);
        /*
         * Mask B, Pentium, but not Pentium MMX
         */
@@ -215,151 +208,6 @@ valid_k7:
        ;
 }
 
-/*
- * TSC synchronization.
- *
- * We first check whether all CPUs have their TSC's synchronized,
- * then we print a warning if not, and always resync.
- */
-
-static struct {
-       atomic_t start_flag;
-       atomic_t count_start;
-       atomic_t count_stop;
-       unsigned long long values[NR_CPUS];
-} tsc __cpuinitdata = {
-       .start_flag = ATOMIC_INIT(0),
-       .count_start = ATOMIC_INIT(0),
-       .count_stop = ATOMIC_INIT(0),
-};
-
-#define NR_LOOPS 5
-
-static void __init synchronize_tsc_bp(void)
-{
-       int i;
-       unsigned long long t0;
-       unsigned long long sum, avg;
-       long long delta;
-       unsigned int one_usec;
-       int buggy = 0;
-
-       printk(KERN_INFO "checking TSC synchronization across %u CPUs: ", num_booting_cpus());
-
-       /* convert from kcyc/sec to cyc/usec */
-       one_usec = cpu_khz / 1000;
-
-       atomic_set(&tsc.start_flag, 1);
-       wmb();
-
-       /*
-        * We loop a few times to get a primed instruction cache,
-        * then the last pass is more or less synchronized and
-        * the BP and APs set their cycle counters to zero all at
-        * once. This reduces the chance of having random offsets
-        * between the processors, and guarantees that the maximum
-        * delay between the cycle counters is never bigger than
-        * the latency of information-passing (cachelines) between
-        * two CPUs.
-        */
-       for (i = 0; i < NR_LOOPS; i++) {
-               /*
-                * all APs synchronize but they loop on '== num_cpus'
-                */
-               while (atomic_read(&tsc.count_start) != num_booting_cpus()-1)
-                       cpu_relax();
-               atomic_set(&tsc.count_stop, 0);
-               wmb();
-               /*
-                * this lets the APs save their current TSC:
-                */
-               atomic_inc(&tsc.count_start);
-
-               rdtscll(tsc.values[smp_processor_id()]);
-               /*
-                * We clear the TSC in the last loop:
-                */
-               if (i == NR_LOOPS-1)
-                       write_tsc(0, 0);
-
-               /*
-                * Wait for all APs to leave the synchronization point:
-                */
-               while (atomic_read(&tsc.count_stop) != num_booting_cpus()-1)
-                       cpu_relax();
-               atomic_set(&tsc.count_start, 0);
-               wmb();
-               atomic_inc(&tsc.count_stop);
-       }
-
-       sum = 0;
-       for (i = 0; i < NR_CPUS; i++) {
-               if (cpu_isset(i, cpu_callout_map)) {
-                       t0 = tsc.values[i];
-                       sum += t0;
-               }
-       }
-       avg = sum;
-       do_div(avg, num_booting_cpus());
-
-       for (i = 0; i < NR_CPUS; i++) {
-               if (!cpu_isset(i, cpu_callout_map))
-                       continue;
-               delta = tsc.values[i] - avg;
-               if (delta < 0)
-                       delta = -delta;
-               /*
-                * We report clock differences bigger than 2 microseconds.
-                */
-               if (delta > 2*one_usec) {
-                       long long realdelta;
-
-                       if (!buggy) {
-                               buggy = 1;
-                               printk("\n");
-                       }
-                       realdelta = delta;
-                       do_div(realdelta, one_usec);
-                       if (tsc.values[i] < avg)
-                               realdelta = -realdelta;
-
-                       if (realdelta)
-                               printk(KERN_INFO "CPU#%d had %Ld usecs TSC "
-                                       "skew, fixed it up.\n", i, realdelta);
-               }
-       }
-       if (!buggy)
-               printk("passed.\n");
-}
-
-static void __cpuinit synchronize_tsc_ap(void)
-{
-       int i;
-
-       /*
-        * Not every cpu is online at the time
-        * this gets called, so we first wait for the BP to
-        * finish SMP initialization:
-        */
-       while (!atomic_read(&tsc.start_flag))
-               cpu_relax();
-
-       for (i = 0; i < NR_LOOPS; i++) {
-               atomic_inc(&tsc.count_start);
-               while (atomic_read(&tsc.count_start) != num_booting_cpus())
-                       cpu_relax();
-
-               rdtscll(tsc.values[smp_processor_id()]);
-               if (i == NR_LOOPS-1)
-                       write_tsc(0, 0);
-
-               atomic_inc(&tsc.count_stop);
-               while (atomic_read(&tsc.count_stop) != num_booting_cpus())
-                       cpu_relax();
-       }
-}
-#undef NR_LOOPS
-
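
The synchronize_tsc_bp()/synchronize_tsc_ap() pair deleted above is a
counting rendezvous: every CPU increments count_start, spins until all
have arrived, samples its TSC at (nearly) the same instant, and then
repeats the dance on count_stop so nobody races into the next pass. The
same pattern as a self-contained userspace sketch, with two threads
standing in for the BP and AP and cumulative counters replacing the
kernel's reset-to-zero bookkeeping (illustrative only; build with
gcc -O2 -pthread on x86):

        #include <pthread.h>
        #include <stdatomic.h>
        #include <stdio.h>
        #include <x86intrin.h>

        #define NR_LOOPS   5
        #define NR_THREADS 2

        static atomic_int count_start, count_stop;
        static unsigned long long values[NR_THREADS];

        static void *sync_loop(void *arg)
        {
                int me = (int)(long)arg;
                int i;

                for (i = 0; i < NR_LOOPS; i++) {
                        /* arrive: spin until every thread has checked in */
                        atomic_fetch_add(&count_start, 1);
                        while (atomic_load(&count_start) != NR_THREADS * (i + 1))
                                ;
                        values[me] = __rdtsc();  /* sample the TSC in lockstep */
                        /* depart: nobody may start the next pass early */
                        atomic_fetch_add(&count_stop, 1);
                        while (atomic_load(&count_stop) != NR_THREADS * (i + 1))
                                ;
                }
                return NULL;
        }

        int main(void)
        {
                pthread_t t[NR_THREADS];
                long i;

                for (i = 0; i < NR_THREADS; i++)
                        pthread_create(&t[i], NULL, sync_loop, (void *)i);
                for (i = 0; i < NR_THREADS; i++)
                        pthread_join(t[i], NULL);
                printf("last-pass skew: %lld cycles\n",
                       (long long)(values[1] - values[0]));
                return 0;
        }
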
 extern void calibrate_delay(void);
 
 static atomic_t init_deasserted;
@@ -437,20 +285,12 @@ static void __cpuinit smp_callin(void)
        /*
         * Save our processor parameters
         */
-       smp_store_cpu_info(cpuid);
-
-       disable_APIC_timer();
+       smp_store_cpu_info(cpuid);
 
        /*
         * Allow the master to continue.
         */
        cpu_set(cpuid, cpu_callin_map);
-
-       /*
-        *      Synchronize the TSC with the BP
-        */
-       if (cpu_has_tsc && cpu_khz && !tsc_sync_disabled)
-               synchronize_tsc_ap();
 }
 
 static int cpucount;
@@ -541,22 +381,29 @@ set_cpu_sibling_map(int cpu)
 static void __cpuinit start_secondary(void *unused)
 {
        /*
-        * Don't put *anything* before secondary_cpu_init(), SMP
-        * booting is too fragile that we want to limit the
-        * things done here to the most necessary things.
+        * Don't put *anything* before cpu_init(); SMP booting is so
+        * fragile that we want to limit the things done here to the
+        * most necessary things.
         */
-       secondary_cpu_init();
+#ifdef CONFIG_VMI
+       vmi_bringup();
+#endif
+       cpu_init();
        preempt_disable();
        smp_callin();
        while (!cpu_isset(smp_processor_id(), smp_commenced_mask))
                rep_nop();
-       setup_secondary_APIC_clock();
+       /*
+        * Check TSC synchronization with the BP:
+        */
+       check_tsc_sync_target();
+
+       setup_secondary_clock();
        if (nmi_watchdog == NMI_IO_APIC) {
                disable_8259A_irq(0);
                enable_NMI_through_LVT0(NULL);
                enable_8259A_irq(0);
        }
-       enable_APIC_timer();
        /*
         * low-memory mappings have been cleared, flush them from
         * the local TLBs too.
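
The rep_nop() in the commencement loop above is the spin-wait hint:
"rep; nop" encodes the PAUSE instruction, which throttles the spin and
is friendly to a hyperthreaded sibling (CPUs that predate PAUSE execute
it as a plain NOP). In essence:

        /* What cpu_relax()/rep_nop() boil down to on i386: */
        static inline void sketch_rep_nop(void)
        {
                asm volatile("rep; nop" ::: "memory");
        }
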
@@ -595,12 +442,6 @@ static void __cpuinit start_secondary(void *unused)
  */
 void __devinit initialize_secondary(void)
 {
-       /*
-        * switch to the per CPU GDT we already set up
-        * in do_boot_cpu()
-        */
-       cpu_set_gdt(current_thread_info()->cpu);
-
        /*
         * We don't actually need to load the full TSS,
         * basically just the stack pointer and the eip.
@@ -618,8 +459,6 @@ extern struct {
        void * esp;
        unsigned short ss;
 } stack_start;
-extern struct i386_pda *start_pda;
-extern struct Xgt_desc_struct cpu_gdt_descr;
 
 #ifdef CONFIG_NUMA
 
@@ -677,12 +516,12 @@ static void unmap_cpu_to_logical_apicid(int cpu)
        unmap_cpu_to_node(cpu);
 }
 
-#if APIC_DEBUG
 static inline void __inquire_remote_apic(int apicid)
 {
        int i, regs[] = { APIC_ID >> 4, APIC_LVR >> 4, APIC_SPIV >> 4 };
        char *names[] = { "ID", "VERSION", "SPIV" };
-       int timeout, status;
+       int timeout;
+       unsigned long status;
 
        printk("Inquiring remote APIC #%d...\n", apicid);
 
@@ -692,7 +531,9 @@ static inline void __inquire_remote_apic(int apicid)
                /*
                 * Wait for idle.
                 */
-               apic_wait_icr_idle();
+               status = safe_apic_wait_icr_idle();
+               if (status)
+                       printk("a previous APIC delivery may have failed\n");
 
                apic_write_around(APIC_ICR2, SET_APIC_DEST_FIELD(apicid));
                apic_write_around(APIC_ICR, APIC_DM_REMRD | regs[i]);
@@ -706,14 +547,13 @@ static inline void __inquire_remote_apic(int apicid)
                switch (status) {
                case APIC_ICR_RR_VALID:
                        status = apic_read(APIC_RRR);
-                       printk("%08x\n", status);
+                       printk("%lx\n", status);
                        break;
                default:
                        printk("failed\n");
                }
        }
 }
-#endif
 
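
safe_apic_wait_icr_idle() is the bounded replacement for
apic_wait_icr_idle(): rather than spinning forever on the ICR busy bit,
it gives up after a while and returns the final delivery status, which
is what makes the "may have failed" diagnostic above possible. A sketch
of the contract, assuming the usual <asm/apic.h> accessors (this is an
illustration, not the mainline body):

        /* Sketch: bounded wait on the ICR delivery-status (busy) bit. */
        static unsigned long sketch_safe_apic_wait_icr_idle(void)
        {
                unsigned long send_status;
                int timeout = 0;

                do {
                        send_status = apic_read(APIC_ICR) & APIC_ICR_BUSY;
                        if (!send_status)
                                break;
                        udelay(100);    /* back off instead of hammering */
                } while (timeout++ < 1000);

                return send_status;     /* nonzero: delivery may have failed */
        }
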
 #ifdef WAKE_SECONDARY_VIA_NMI
 /* 
@@ -724,8 +564,8 @@ static inline void __inquire_remote_apic(int apicid)
 static int __devinit
 wakeup_secondary_cpu(int logical_apicid, unsigned long start_eip)
 {
-       unsigned long send_status = 0, accept_status = 0;
-       int timeout, maxlvt;
+       unsigned long send_status, accept_status = 0;
+       int maxlvt;
 
        /* Target chip */
        apic_write_around(APIC_ICR2, SET_APIC_DEST_FIELD(logical_apicid));
@@ -735,12 +575,7 @@ wakeup_secondary_cpu(int logical_apicid, unsigned long start_eip)
        apic_write_around(APIC_ICR, APIC_DM_NMI | APIC_DEST_LOGICAL);
 
        Dprintk("Waiting for send to finish...\n");
-       timeout = 0;
-       do {
-               Dprintk("+");
-               udelay(100);
-               send_status = apic_read(APIC_ICR) & APIC_ICR_BUSY;
-       } while (send_status && (timeout++ < 1000));
+       send_status = safe_apic_wait_icr_idle();
 
        /*
         * Give the other CPU some time to accept the IPI.
@@ -749,7 +584,7 @@ wakeup_secondary_cpu(int logical_apicid, unsigned long start_eip)
        /*
         * Due to the Pentium erratum 3AP.
         */
-       maxlvt = get_maxlvt();
+       maxlvt = lapic_get_maxlvt();
        if (maxlvt > 3) {
                apic_read_around(APIC_SPIV);
                apic_write(APIC_ESR, 0);
@@ -770,8 +605,8 @@ wakeup_secondary_cpu(int logical_apicid, unsigned long start_eip)
 static int __devinit
 wakeup_secondary_cpu(int phys_apicid, unsigned long start_eip)
 {
-       unsigned long send_status = 0, accept_status = 0;
-       int maxlvt, timeout, num_starts, j;
+       unsigned long send_status, accept_status = 0;
+       int maxlvt, num_starts, j;
 
        /*
         * Be paranoid about clearing APIC errors.
@@ -796,12 +631,7 @@ wakeup_secondary_cpu(int phys_apicid, unsigned long start_eip)
                                | APIC_DM_INIT);
 
        Dprintk("Waiting for send to finish...\n");
-       timeout = 0;
-       do {
-               Dprintk("+");
-               udelay(100);
-               send_status = apic_read(APIC_ICR) & APIC_ICR_BUSY;
-       } while (send_status && (timeout++ < 1000));
+       send_status = safe_apic_wait_icr_idle();
 
        mdelay(10);
 
@@ -814,12 +644,7 @@ wakeup_secondary_cpu(int phys_apicid, unsigned long start_eip)
        apic_write_around(APIC_ICR, APIC_INT_LEVELTRIG | APIC_DM_INIT);
 
        Dprintk("Waiting for send to finish...\n");
-       timeout = 0;
-       do {
-               Dprintk("+");
-               udelay(100);
-               send_status = apic_read(APIC_ICR) & APIC_ICR_BUSY;
-       } while (send_status && (timeout++ < 1000));
+       send_status = safe_apic_wait_icr_idle();
 
        atomic_set(&init_deasserted, 1);
 
@@ -834,12 +659,19 @@ wakeup_secondary_cpu(int phys_apicid, unsigned long start_eip)
        else
                num_starts = 0;
 
+       /*
+        * Paravirt / VMI wants a startup IPI hook here to set up the
+        * target processor state.
+        */
+       startup_ipi_hook(phys_apicid, (unsigned long) start_secondary,
+                        (unsigned long) stack_start.esp);
+
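
startup_ipi_hook() comes from the subarch/paravirt layer: on bare
hardware it should cost nothing, while a hypervisor backend such as VMI
can use it to seed the AP's initial instruction pointer and stack
directly instead of having them fetched via the real-mode trampoline. A
plausible native stub (illustrative; the real definition lives in the
mach_wakecpu/paravirt headers):

        /* Sketch: the native hook is a no-op; STARTUP IPI + trampoline suffice. */
        static inline void sketch_startup_ipi_hook(int phys_apicid,
                                                   unsigned long start_eip,
                                                   unsigned long start_esp)
        {
                (void)phys_apicid;
                (void)start_eip;
                (void)start_esp;
        }
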
        /*
         * Run STARTUP IPI loop.
         */
        Dprintk("#startup loops: %d.\n", num_starts);
 
-       maxlvt = get_maxlvt();
+       maxlvt = lapic_get_maxlvt();
 
        for (j = 1; j <= num_starts; j++) {
                Dprintk("Sending STARTUP #%d.\n",j);
@@ -868,12 +700,7 @@ wakeup_secondary_cpu(int phys_apicid, unsigned long start_eip)
                Dprintk("Startup point 1.\n");
 
                Dprintk("Waiting for send to finish...\n");
-               timeout = 0;
-               do {
-                       Dprintk("+");
-                       udelay(100);
-                       send_status = apic_read(APIC_ICR) & APIC_ICR_BUSY;
-               } while (send_status && (timeout++ < 1000));
+               send_status = safe_apic_wait_icr_idle();
 
                /*
                 * Give the other CPU some time to accept the IPI.
@@ -937,6 +764,25 @@ static inline struct task_struct * alloc_idle_task(int cpu)
 #define alloc_idle_task(cpu) fork_idle(cpu)
 #endif
 
+/* Initialize the CPU's GDT.  This is either the boot CPU doing it for
+   itself (still using the master per-cpu area), or a CPU doing it for
+   a secondary which will soon come up. */
+static __cpuinit void init_gdt(int cpu)
+{
+       struct desc_struct *gdt = get_cpu_gdt_table(cpu);
+
+       pack_descriptor((u32 *)&gdt[GDT_ENTRY_PERCPU].a,
+                       (u32 *)&gdt[GDT_ENTRY_PERCPU].b,
+                       __per_cpu_offset[cpu], 0xFFFFF,
+                       0x80 | DESCTYPE_S | 0x2, 0x8);
+
+       per_cpu(this_cpu_off, cpu) = __per_cpu_offset[cpu];
+       per_cpu(cpu_number, cpu) = cpu;
+}
+
+/* Defined in head.S */
+extern struct Xgt_desc_struct early_gdt_descr;
+
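
The pack_descriptor() call above builds an ordinary present, writable,
4K-granular data segment whose base is __per_cpu_offset[cpu] and whose
limit is 4GB, so a %fs-relative access through selector
GDT_ENTRY_PERCPU * 8 lands in this CPU's per-cpu area. The bit-packing
follows the standard i386 descriptor layout; a standalone rendition for
reference (illustrative, not the kernel's helper):

        #include <stdint.h>

        /* Sketch: pack base/limit/type/flags into the two descriptor words. */
        static void sketch_pack_descriptor(uint32_t *a, uint32_t *b,
                                           uint32_t base, uint32_t limit,
                                           uint8_t type, uint8_t flags)
        {
                *a = ((base & 0xffff) << 16) | (limit & 0xffff);
                *b = (base & 0xff000000) | ((base & 0xff0000) >> 16) |
                     (limit & 0x000f0000) |           /* limit bits 16-19 */
                     ((uint32_t)type << 8) |          /* 0x80|S|0x2: present, data, r/w */
                     ((uint32_t)(flags & 0xf) << 20); /* 0x8: 4K granularity */
        }
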
 static int __cpuinit do_boot_cpu(int apicid, int cpu)
 /*
  * NOTE - on most systems this is a PHYSICAL apic ID, but on multiquad
@@ -950,6 +796,12 @@ static int __cpuinit do_boot_cpu(int apicid, int cpu)
        unsigned long start_eip;
        unsigned short nmi_high = 0, nmi_low = 0;
 
+       /*
+        * Save current MTRR state in case it was changed since early boot
+        * (e.g. by the ACPI SMI) to initialize new CPUs with MTRRs in sync:
+        */
+       mtrr_save_state();
+
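
mtrr_save_state() exists because an AP comes up with whatever MTRR
contents the firmware left it, which may no longer match a boot CPU
whose MTRRs were adjusted after early boot. The gist is a snapshot of
the variable-range MTRR MSR pairs that can later be replayed on the AP;
a hedged sketch (MSR numbers per the IA-32 manual, rdmsrl() is the
kernel accessor, the struct and function are illustrative, not the mtrr
driver's interface):

        struct sketch_mtrr_var {
                u64 base;       /* IA32_MTRR_PHYSBASEn, MSR 0x200 + 2n */
                u64 mask;       /* IA32_MTRR_PHYSMASKn, MSR 0x201 + 2n */
        };

        static void sketch_mtrr_save(struct sketch_mtrr_var *saved, int num_var)
        {
                int n;

                for (n = 0; n < num_var; n++) {
                        rdmsrl(0x200 + 2 * n, saved[n].base);
                        rdmsrl(0x201 + 2 * n, saved[n].mask);
                }
        }
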
        /*
         * We can't use kernel_thread since we must avoid
         * rescheduling the child.
@@ -958,13 +810,9 @@ static int __cpuinit do_boot_cpu(int apicid, int cpu)
        if (IS_ERR(idle))
                panic("failed fork for CPU %d", cpu);
 
-       /* Pre-allocate and initialize the CPU's GDT and PDA so it
-          doesn't have to do any memory allocation during the
-          delicate CPU-bringup phase. */
-       if (!init_gdt(cpu, idle)) {
-               printk(KERN_INFO "Couldn't allocate GDT/PDA for CPU %d\n", cpu);
-               return -1;      /* ? */
-       }
+       init_gdt(cpu);
+       per_cpu(current_task, cpu) = idle;
+       early_gdt_descr.address = (unsigned long)get_cpu_gdt_table(cpu);
 
        idle->thread.eip = (unsigned long) start_secondary;
        /* start_eip had better be page-aligned! */
@@ -1090,7 +938,6 @@ static int __cpuinit __smp_prepare_cpu(int cpu)
        DECLARE_COMPLETION_ONSTACK(done);
        struct warm_boot_cpu_info info;
        int     apicid, ret;
-       struct Xgt_desc_struct *cpu_gdt_descr = &per_cpu(cpu_gdt_descr, cpu);
 
        apicid = x86_cpu_to_apicid[cpu];
        if (apicid == BAD_APICID) {
@@ -1098,25 +945,11 @@ static int __cpuinit __smp_prepare_cpu(int cpu)
                goto exit;
        }
 
-       /*
-        * the CPU isn't initialized at boot time, allocate gdt table here.
-        * cpu_init will initialize it
-        */
-       if (!cpu_gdt_descr->address) {
-               cpu_gdt_descr->address = get_zeroed_page(GFP_KERNEL);
-               if (!cpu_gdt_descr->address)
-                       printk(KERN_CRIT "CPU%d failed to allocate GDT\n", cpu);
-                       ret = -ENOMEM;
-                       goto exit;
-       }
-
        info.complete = &done;
        info.apicid = apicid;
        info.cpu = cpu;
        INIT_WORK(&info.task, do_warm_boot_cpu);
 
-       tsc_sync_disabled = 1;
-
        /* init low mem mapping */
        clone_pgd_range(swapper_pg_dir, swapper_pg_dir + USER_PGD_PTRS,
                        min_t(unsigned long, KERNEL_PGD_PTRS, USER_PGD_PTRS));
@@ -1124,7 +957,6 @@ static int __cpuinit __smp_prepare_cpu(int cpu)
        schedule_work(&info.task);
        wait_for_completion(&done);
 
-       tsc_sync_disabled = 0;
        zap_low_mappings();
        ret = 0;
 exit:
@@ -1320,18 +1152,12 @@ static void __init smp_boot_cpus(unsigned int max_cpus)
 
        smpboot_setup_io_apic();
 
-       setup_boot_APIC_clock();
-
-       /*
-        * Synchronize the TSC with the AP
-        */
-       if (cpu_has_tsc && cpucount && cpu_khz)
-               synchronize_tsc_bp();
+       setup_boot_clock();
 }
 
 /* These are wrappers to interface to the new boot process.  Someone
    who understands all this stuff should rewrite it properly. --RR 15/Jul/02 */
-void __init smp_prepare_cpus(unsigned int max_cpus)
+void __init native_smp_prepare_cpus(unsigned int max_cpus)
 {
        smp_commenced_mask = cpumask_of_cpu(0);
        cpu_callin_map = cpumask_of_cpu(0);
@@ -1339,13 +1165,18 @@ void __init smp_prepare_cpus(unsigned int max_cpus)
        smp_boot_cpus(max_cpus);
 }
 
-void __devinit smp_prepare_boot_cpu(void)
+void __init native_smp_prepare_boot_cpu(void)
 {
-       cpu_set(smp_processor_id(), cpu_online_map);
-       cpu_set(smp_processor_id(), cpu_callout_map);
-       cpu_set(smp_processor_id(), cpu_present_map);
-       cpu_set(smp_processor_id(), cpu_possible_map);
-       per_cpu(cpu_state, smp_processor_id()) = CPU_ONLINE;
+       unsigned int cpu = smp_processor_id();
+
+       init_gdt(cpu);
+       switch_to_new_gdt();
+
+       cpu_set(cpu, cpu_online_map);
+       cpu_set(cpu, cpu_callout_map);
+       cpu_set(cpu, cpu_present_map);
+       cpu_set(cpu, cpu_possible_map);
+       __get_cpu_var(cpu_state) = CPU_ONLINE;
 }
 
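
switch_to_new_gdt() makes the boot CPU use the descriptor init_gdt()
just wrote: it repoints the GDTR at this CPU's table and reloads %fs so
that subsequent per-cpu references resolve through GDT_ENTRY_PERCPU.
Roughly (a sketch assuming the i386 helpers of this era; the real
function sits in arch/i386/kernel/cpu/common.c):

        /* Sketch: load this CPU's GDT, then reload the percpu segment. */
        static void sketch_switch_to_new_gdt(void)
        {
                struct Xgt_desc_struct gdt_descr;

                gdt_descr.address = (unsigned long)get_cpu_gdt_table(smp_processor_id());
                gdt_descr.size = GDT_SIZE - 1;
                load_gdt(&gdt_descr);
                asm volatile("mov %0, %%fs" : : "r" (__KERNEL_PERCPU) : "memory");
        }
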
 #ifdef CONFIG_HOTPLUG_CPU
@@ -1435,10 +1266,11 @@ void __cpu_die(unsigned int cpu)
 }
 #endif /* CONFIG_HOTPLUG_CPU */
 
-int __cpuinit __cpu_up(unsigned int cpu)
+int __cpuinit native_cpu_up(unsigned int cpu)
 {
+       unsigned long flags;
 #ifdef CONFIG_HOTPLUG_CPU
-       int ret=0;
+       int ret = 0;
 
        /*
         * We do warm boot only on cpus that had booted earlier
@@ -1456,26 +1288,30 @@ int __cpuinit __cpu_up(unsigned int cpu)
        /* In case one didn't come up */
        if (!cpu_isset(cpu, cpu_callin_map)) {
                printk(KERN_DEBUG "skipping cpu%d, didn't come online\n", cpu);
-               local_irq_enable();
                return -EIO;
        }
 
-       local_irq_enable();
        per_cpu(cpu_state, cpu) = CPU_UP_PREPARE;
        /* Unleash the CPU! */
        cpu_set(cpu, smp_commenced_mask);
-       while (!cpu_isset(cpu, cpu_online_map))
-               cpu_relax();
 
-#ifdef CONFIG_X86_GENERICARCH
-       if (num_online_cpus() > 8 && genapic == &apic_default)
-               panic("Default flat APIC routing can't be used with > 8 cpus\n");
-#endif
+       /*
+        * Check TSC synchronization with the AP (keep irqs disabled
+        * while doing so):
+        */
+       local_irq_save(flags);
+       check_tsc_sync_source(cpu);
+       local_irq_restore(flags);
+
+       while (!cpu_isset(cpu, cpu_online_map)) {
+               cpu_relax();
+               touch_nmi_watchdog();
+       }
 
        return 0;
 }
 
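
check_tsc_sync_source() and check_tsc_sync_target() are the pairwise
replacement for the bulk synchronize_tsc_*() machinery removed earlier:
the BP (source, with interrupts held off across the check, as the
local_irq_save() pair above shows) and the incoming AP rendezvous and
run a short lockstep measurement, warning about skew instead of
forcibly rewriting the TSC. The touch_nmi_watchdog() in the wait loop
keeps the NMI watchdog from mistaking the BP's busy-wait for a lockup.
A skeleton of the handshake (illustrative; the real code is in
arch/i386/kernel/tsc_sync.c):

        /* Sketch of the pairwise handshake; the measurement itself is elided. */
        static atomic_t start_count, stop_count;

        void sketch_check_tsc_sync_source(int cpu)      /* runs on the BP */
        {
                atomic_inc(&start_count);
                while (atomic_read(&start_count) != 2)  /* wait for the AP */
                        cpu_relax();
                /* ... both CPUs run the lockstep TSC measurement here ... */
                atomic_inc(&stop_count);                /* publish the verdict */
        }

        void sketch_check_tsc_sync_target(void)         /* runs on the new AP */
        {
                atomic_inc(&start_count);
                while (atomic_read(&start_count) != 2)  /* wait for the BP */
                        cpu_relax();
                /* ... same measurement loop ... */
                while (atomic_read(&stop_count) != 1)   /* wait for the verdict */
                        cpu_relax();
        }
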
-void __init smp_cpus_done(unsigned int max_cpus)
+void __init native_smp_cpus_done(unsigned int max_cpus)
 {
 #ifdef CONFIG_X86_IO_APIC
        setup_ioapic_dest();