Merge branches 'x86/urgent', 'x86/amd-iommu', 'x86/apic', 'x86/cleanups', 'x86/core...

author Ingo Molnar <mingo@elte.hu>

Mon, 21 Jul 2008 14:37:17 +0000 (16:37 +0200)

committer Ingo Molnar <mingo@elte.hu>

Mon, 21 Jul 2008 14:37:17 +0000 (16:37 +0200)
author Ingo Molnar <mingo@elte.hu>
Mon, 21 Jul 2008 14:37:17 +0000 (16:37 +0200)
committer Ingo Molnar <mingo@elte.hu>
Mon, 21 Jul 2008 14:37:17 +0000 (16:37 +0200)
diff --git a/Documentation/kernel-parameters.txt b/Documentation/kernel-parameters.txt

index 09ad7450647bc81dff32a3eaf7ea3c0858f4a896..25e88cf5d84ececd222ed309a95115fc2c6d3c16 100644 (file)
--- a/Documentation/kernel-parameters.txt
+++ b/Documentation/kernel-parameters.txt
@@ -1206,7 +1206,7 @@ and is between 256 and 4096 characters. It is defined in the file
                                  or
                                  memmap=0x10000$0x18690000
  
-       memtest=        [KNL,X86_64] Enable memtest
+       memtest=        [KNL,X86] Enable memtest
                         Format: <integer>
                         range: 0,4 : pattern number
                         default : 0 <disable>
@@ -2158,6 +2158,10 @@ and is between 256 and 4096 characters. It is defined in the file
                         Note that genuine overcurrent events won't be
                         reported either.
  
+       unknown_nmi_panic
+                       [X86-32,X86-64]
+                       Set unknown_nmi_panic=1 early on boot.
+
         usbcore.autosuspend=
                         [USB] The autosuspend time delay (in seconds) used
                         for newly-detected USB devices (default 2).  This
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig

index 96e0c2ebc3885713a5d6290f5e8eb959d0d0d36e..03980cb042916c6f37bd131c1d43553f51c8ade6 100644 (file)
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -447,7 +447,6 @@ config PARAVIRT_DEBUG
  
  config MEMTEST
         bool "Memtest"
-       depends on X86_64
         help
           This option adds a kernel parameter 'memtest', which allows memtest
           to be set.
diff --git a/arch/x86/Kconfig.cpu b/arch/x86/Kconfig.cpu

index abff1b84ed5bf2e009908110a9de93cd783ca745..54b8c02c71e6bd5d7ca1f1d7fd2e8143f399399e 100644 (file)
--- a/arch/x86/Kconfig.cpu
+++ b/arch/x86/Kconfig.cpu
@@ -362,10 +362,6 @@ config X86_ALIGNMENT_16
         def_bool y
         depends on MWINCHIP3D || MWINCHIP2 || MWINCHIPC6 || MCYRIXIII || X86_ELAN || MK6 || M586MMX || M586TSC || M586 || M486 || MVIAC3_2 || MGEODEGX1
  
-config X86_GOOD_APIC
-       def_bool y
-       depends on MK7 || MPENTIUM4 || MPENTIUMM || MPENTIUMIII || MPENTIUMII || M686 || M586MMX || MK8 || MEFFICEON || MCORE2 || MVIAC7 || X86_64
-
  config X86_INTEL_USERCOPY
         def_bool y
         depends on MPENTIUM4 || MPENTIUMM || MPENTIUMIII || MPENTIUMII || M586MMX || X86_GENERIC || MK8 || MK7 || MEFFICEON || MCORE2
diff --git a/arch/x86/Kconfig.debug b/arch/x86/Kconfig.debug

index 51c8214779513e3c1e1df6b56cda8907d7a9a636..85a87d2ac0c09940c9d0d6654345375417e3d378 100644 (file)
--- a/arch/x86/Kconfig.debug
+++ b/arch/x86/Kconfig.debug
@@ -289,7 +289,6 @@ config CPA_DEBUG
  
  config OPTIMIZE_INLINING
         bool "Allow gcc to uninline functions marked 'inline'"
-       depends on BROKEN
         help
           This option determines if the kernel forces gcc to inline the functions
           developers have marked 'inline'. Doing so takes away freedom from gcc to
@@ -300,5 +299,7 @@ config OPTIMIZE_INLINING
           become the default in the future, until then this option is there to
           test gcc for this.
  
+         If unsure, say N.
+
  endmenu
  
diff --git a/arch/x86/boot/edd.c b/arch/x86/boot/edd.c

index 03399d64013b21459f63942c06bc6c33b803f425..d93cbc6464d0f8aa8aed3479b1b88a66993afa67 100644 (file)
--- a/arch/x86/boot/edd.c
+++ b/arch/x86/boot/edd.c
@@ -167,9 +167,8 @@ void query_edd(void)
                  * Scan the BIOS-supported hard disks and query EDD
                  * information...
                  */
-               get_edd_info(devno, &ei);
-
-               if (boot_params.eddbuf_entries < EDDMAXNR) {
+               if (!get_edd_info(devno, &ei)
+                   && boot_params.eddbuf_entries < EDDMAXNR) {
                         memcpy(edp, &ei, sizeof ei);
                         edp++;
                         boot_params.eddbuf_entries++;
diff --git a/arch/x86/boot/pm.c b/arch/x86/boot/pm.c

index 328956fdb59e79dc354e96f47e98c5cb5dff4a6b..85a1cd8a8ff8a4daee99ee11d86176bd7a97c166 100644 (file)
--- a/arch/x86/boot/pm.c
+++ b/arch/x86/boot/pm.c
@@ -98,12 +98,6 @@ static void reset_coprocessor(void)
  /*
   * Set up the GDT
   */
-#define GDT_ENTRY(flags, base, limit)          \
-       (((u64)(base & 0xff000000) << 32) |     \
-        ((u64)flags << 40) |                   \
-        ((u64)(limit & 0x00ff0000) << 32) |    \
-        ((u64)(base & 0x00ffffff) << 16) |     \
-        ((u64)(limit & 0x0000ffff)))
  
  struct gdt_ptr {
         u16 len;
diff --git a/arch/x86/ia32/ia32_signal.c b/arch/x86/ia32/ia32_signal.c

index cb3856a18c8544e1ffe722d33a7713439043829d..20af4c79579a88ce1da6860e774c7440bc793b56 100644 (file)
--- a/arch/x86/ia32/ia32_signal.c
+++ b/arch/x86/ia32/ia32_signal.c
@@ -36,6 +36,11 @@
  
  #define _BLOCKABLE (~(sigmask(SIGKILL) | sigmask(SIGSTOP)))
  
+#define FIX_EFLAGS     (X86_EFLAGS_AC | X86_EFLAGS_OF | \
+                        X86_EFLAGS_DF | X86_EFLAGS_TF | X86_EFLAGS_SF | \
+                        X86_EFLAGS_ZF | X86_EFLAGS_AF | X86_EFLAGS_PF | \
+                        X86_EFLAGS_CF)
+
  asmlinkage int do_signal(struct pt_regs *regs, sigset_t *oldset);
  void signal_fault(struct pt_regs *regs, void __user *frame, char *where);
  
@@ -248,7 +253,7 @@ static int ia32_restore_sigcontext(struct pt_regs *regs,
         regs->ss |= 3;
  
         err |= __get_user(tmpflags, &sc->flags);
-       regs->flags = (regs->flags & ~0x40DD5) | (tmpflags & 0x40DD5);
+       regs->flags = (regs->flags & ~FIX_EFLAGS) | (tmpflags & FIX_EFLAGS);
         /* disable syscall checks */
         regs->orig_ax = -1;
  
@@ -515,7 +520,6 @@ int ia32_setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info,
                         compat_sigset_t *set, struct pt_regs *regs)
  {
         struct rt_sigframe __user *frame;
-       struct exec_domain *ed = current_thread_info()->exec_domain;
         void __user *restorer;
         int err = 0;
  
@@ -538,8 +542,7 @@ int ia32_setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info,
         if (!access_ok(VERIFY_WRITE, frame, sizeof(*frame)))
                 goto give_sigsegv;
  
-       err |= __put_user((ed && ed->signal_invmap && sig < 32
-                          ? ed->signal_invmap[sig] : sig), &frame->sig);
+       err |= __put_user(sig, &frame->sig);
         err |= __put_user(ptr_to_compat(&frame->info), &frame->pinfo);
         err |= __put_user(ptr_to_compat(&frame->uc), &frame->puc);
         err |= copy_siginfo_to_user32(&frame->info, info);
diff --git a/arch/x86/ia32/ia32entry.S b/arch/x86/ia32/ia32entry.S

index 20371d0635e44975850ea37b5a8a03a2f52f0d58..23d146ce676bc0e1b8c6b65ead1e2971479e1708 100644 (file)
--- a/arch/x86/ia32/ia32entry.S
+++ b/arch/x86/ia32/ia32entry.S
@@ -37,6 +37,11 @@
         movq    %rax,R8(%rsp)
         .endm
  
+       /*
+        * Reload arg registers from stack in case ptrace changed them.
+        * We don't reload %eax because syscall_trace_enter() returned
+        * the value it wants us to use in the table lookup.
+        */
         .macro LOAD_ARGS32 offset
         movl \offset(%rsp),%r11d
         movl \offset+8(%rsp),%r10d
@@ -46,7 +51,6 @@
         movl \offset+48(%rsp),%edx
         movl \offset+56(%rsp),%esi
         movl \offset+64(%rsp),%edi
-       movl \offset+72(%rsp),%eax
         .endm
         
         .macro CFI_STARTPROC32 simple
@@ -137,13 +141,12 @@ ENTRY(ia32_sysenter_target)
         .previous       
         GET_THREAD_INFO(%r10)
         orl    $TS_COMPAT,TI_status(%r10)
-       testl  $(_TIF_SYSCALL_TRACE|_TIF_SYSCALL_AUDIT|_TIF_SECCOMP), \
-                TI_flags(%r10)
+       testl  $_TIF_WORK_SYSCALL_ENTRY,TI_flags(%r10)
         CFI_REMEMBER_STATE
         jnz  sysenter_tracesys
-sysenter_do_call:      
         cmpl    $(IA32_NR_syscalls-1),%eax
         ja      ia32_badsys
+sysenter_do_call:
         IA32_ARG_FIXUP 1
         call    *ia32_sys_call_table(,%rax,8)
         movq    %rax,RAX-ARGOFFSET(%rsp)
@@ -242,8 +245,7 @@ ENTRY(ia32_cstar_target)
         .previous       
         GET_THREAD_INFO(%r10)
         orl   $TS_COMPAT,TI_status(%r10)
-       testl $(_TIF_SYSCALL_TRACE|_TIF_SYSCALL_AUDIT|_TIF_SECCOMP), \
-               TI_flags(%r10)
+       testl $_TIF_WORK_SYSCALL_ENTRY,TI_flags(%r10)
         CFI_REMEMBER_STATE
         jnz   cstar_tracesys
  cstar_do_call: 
@@ -321,6 +323,7 @@ ENTRY(ia32_syscall)
         /*CFI_REL_OFFSET        rflags,EFLAGS-RIP*/
         /*CFI_REL_OFFSET        cs,CS-RIP*/
         CFI_REL_OFFSET  rip,RIP-RIP
+       PARAVIRT_ADJUST_EXCEPTION_FRAME
         SWAPGS
         /*
          * No need to follow this irqs on/off section: the syscall
@@ -336,8 +339,7 @@ ENTRY(ia32_syscall)
         SAVE_ARGS 0,0,1
         GET_THREAD_INFO(%r10)
         orl   $TS_COMPAT,TI_status(%r10)
-       testl $(_TIF_SYSCALL_TRACE|_TIF_SYSCALL_AUDIT|_TIF_SECCOMP), \
-               TI_flags(%r10)
+       testl $_TIF_WORK_SYSCALL_ENTRY,TI_flags(%r10)
         jnz ia32_tracesys
  ia32_do_syscall:       
         cmpl $(IA32_NR_syscalls-1),%eax
diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile

index da140611bb57593ed401a5de6ee63a84cc408349..b78a17b128101c68e97cb345a97403a91c7ac185 100644 (file)
--- a/arch/x86/kernel/Makefile
+++ b/arch/x86/kernel/Makefile
@@ -102,6 +102,7 @@ obj-$(CONFIG_OLPC)          += olpc.o
  # 64 bit specific files
  ifeq ($(CONFIG_X86_64),y)
          obj-y                          += genapic_64.o genapic_flat_64.o genx2apic_uv_x.o tlb_uv.o
+       obj-y                           += bios_uv.o
          obj-$(CONFIG_X86_PM_TIMER)     += pmtimer_64.o
          obj-$(CONFIG_AUDIT)            += audit_64.o
  
diff --git a/arch/x86/kernel/acpi/sleep.c b/arch/x86/kernel/acpi/sleep.c

index 868de3d5c39de9144bfc42574bcb8dd0d24cca3e..a3ddad18aaa35b37c5344c4080b59d3a46c99021 100644 (file)
--- a/arch/x86/kernel/acpi/sleep.c
+++ b/arch/x86/kernel/acpi/sleep.c
@@ -9,6 +9,7 @@
  #include <linux/bootmem.h>
  #include <linux/dmi.h>
  #include <linux/cpumask.h>
+#include <asm/segment.h>
  
  #include "realmode/wakeup.h"
  #include "sleep.h"
@@ -23,15 +24,6 @@ static unsigned long acpi_realmode;
  static char temp_stack[10240];
  #endif
  
-/* XXX: this macro should move to asm-x86/segment.h and be shared with the
-   boot code... */
-#define GDT_ENTRY(flags, base, limit)          \
-       (((u64)(base & 0xff000000) << 32) |     \
-        ((u64)flags << 40) |                   \
-        ((u64)(limit & 0x00ff0000) << 32) |    \
-        ((u64)(base & 0x00ffffff) << 16) |     \
-        ((u64)(limit & 0x0000ffff)))
-
  /**
   * acpi_save_state_mem - save kernel state
   *
diff --git a/arch/x86/kernel/amd_iommu.c b/arch/x86/kernel/amd_iommu.c

index f2766d84c7a00c78c4f24951a7f2912f550ef475..c25210e6ac888e94224b460e6eb82f3c556d7616 100644 (file)
--- a/arch/x86/kernel/amd_iommu.c
+++ b/arch/x86/kernel/amd_iommu.c
@@ -23,7 +23,7 @@
  #include <linux/scatterlist.h>
  #include <linux/iommu-helper.h>
  #include <asm/proto.h>
-#include <asm/gart.h>
+#include <asm/iommu.h>
  #include <asm/amd_iommu_types.h>
  #include <asm/amd_iommu.h>
  
@@ -32,21 +32,37 @@
  #define to_pages(addr, size) \
          (round_up(((addr) & ~PAGE_MASK) + (size), PAGE_SIZE) >> PAGE_SHIFT)
  
+#define EXIT_LOOP_COUNT 10000000
+
  static DEFINE_RWLOCK(amd_iommu_devtable_lock);
  
-struct command {
+/*
+ * general struct to manage commands sent to an IOMMU
+ */
+struct iommu_cmd {
         u32 data[4];
  };
  
  static int dma_ops_unity_map(struct dma_ops_domain *dma_dom,
                              struct unity_map_entry *e);
  
+/* returns !0 if the IOMMU is caching non-present entries in its TLB */
  static int iommu_has_npcache(struct amd_iommu *iommu)
  {
         return iommu->cap & IOMMU_CAP_NPCACHE;
  }
  
-static int __iommu_queue_command(struct amd_iommu *iommu, struct command *cmd)
+/****************************************************************************
+ *
+ * IOMMU command queuing functions
+ *
+ ****************************************************************************/
+
+/*
+ * Writes the command to the IOMMU's command buffer and informs the
+ * hardware about the new command. Must be called with iommu->lock held.
+ */
+static int __iommu_queue_command(struct amd_iommu *iommu, struct iommu_cmd *cmd)
  {
         u32 tail, head;
         u8 *target;
@@ -63,7 +79,11 @@ static int __iommu_queue_command(struct amd_iommu *iommu, struct command *cmd)
         return 0;
  }
  
-static int iommu_queue_command(struct amd_iommu *iommu, struct command *cmd)
+/*
+ * General queuing function for commands. Takes iommu->lock and calls
+ * __iommu_queue_command().
+ */
+static int iommu_queue_command(struct amd_iommu *iommu, struct iommu_cmd *cmd)
  {
         unsigned long flags;
         int ret;
@@ -75,16 +95,24 @@ static int iommu_queue_command(struct amd_iommu *iommu, struct command *cmd)
         return ret;
  }
  
+/*
+ * This function is called whenever we need to ensure that the IOMMU has
+ * completed execution of all commands we sent. It sends a
+ * COMPLETION_WAIT command and waits for it to finish. The IOMMU informs
+ * us about that by writing a value to a physical address we pass with
+ * the command.
+ */
  static int iommu_completion_wait(struct amd_iommu *iommu)
  {
         int ret;
-       struct command cmd;
+       struct iommu_cmd cmd;
         volatile u64 ready = 0;
         unsigned long ready_phys = virt_to_phys(&ready);
+       unsigned long i = 0;
  
         memset(&cmd, 0, sizeof(cmd));
         cmd.data[0] = LOW_U32(ready_phys) | CMD_COMPL_WAIT_STORE_MASK;
-       cmd.data[1] = HIGH_U32(ready_phys);
+       cmd.data[1] = upper_32_bits(ready_phys);
         cmd.data[2] = 1; /* value written to 'ready' */
         CMD_SET_TYPE(&cmd, CMD_COMPL_WAIT);
  
@@ -95,15 +123,23 @@ static int iommu_completion_wait(struct amd_iommu *iommu)
         if (ret)
                 return ret;
  
-       while (!ready)
+       while (!ready && (i < EXIT_LOOP_COUNT)) {
+               ++i;
                 cpu_relax();
+       }
+
+       if (unlikely((i == EXIT_LOOP_COUNT) && printk_ratelimit()))
+               printk(KERN_WARNING "AMD IOMMU: Completion wait loop failed\n");
  
         return 0;
  }
  
+/*
+ * Command send function for invalidating a device table entry
+ */
  static int iommu_queue_inv_dev_entry(struct amd_iommu *iommu, u16 devid)
  {
-       struct command cmd;
+       struct iommu_cmd cmd;
  
         BUG_ON(iommu == NULL);
  
@@ -116,20 +152,23 @@ static int iommu_queue_inv_dev_entry(struct amd_iommu *iommu, u16 devid)
         return iommu_queue_command(iommu, &cmd);
  }
  
+/*
+ * Generic command send function for invalidating TLB entries
+ */
  static int iommu_queue_inv_iommu_pages(struct amd_iommu *iommu,
                 u64 address, u16 domid, int pde, int s)
  {
-       struct command cmd;
+       struct iommu_cmd cmd;
  
         memset(&cmd, 0, sizeof(cmd));
         address &= PAGE_MASK;
         CMD_SET_TYPE(&cmd, CMD_INV_IOMMU_PAGES);
         cmd.data[1] |= domid;
         cmd.data[2] = LOW_U32(address);
-       cmd.data[3] = HIGH_U32(address);
-       if (s)
+       cmd.data[3] = upper_32_bits(address);
+       if (s) /* size bit - we flush more than one 4kb page */
                 cmd.data[2] |= CMD_INV_IOMMU_PAGES_SIZE_MASK;
-       if (pde)
+       if (pde) /* PDE bit - we want to flush everything, not only the PTEs */
                 cmd.data[2] |= CMD_INV_IOMMU_PAGES_PDE_MASK;
  
         iommu->need_sync = 1;
@@ -137,6 +176,11 @@ static int iommu_queue_inv_iommu_pages(struct amd_iommu *iommu,
         return iommu_queue_command(iommu, &cmd);
  }
  
+/*
+ * TLB invalidation function which is called from the mapping functions.
+ * It invalidates a single PTE if the range to flush is within a single
+ * page. Otherwise it flushes the whole TLB of the IOMMU.
+ */
  static int iommu_flush_pages(struct amd_iommu *iommu, u16 domid,
                 u64 address, size_t size)
  {
@@ -159,6 +203,20 @@ static int iommu_flush_pages(struct amd_iommu *iommu, u16 domid,
         return 0;
  }
  
+/****************************************************************************
+ *
+ * The functions below are used to create the page table mappings for
+ * unity mapped regions.
+ *
+ ****************************************************************************/
+
+/*
+ * Generic mapping functions. It maps a physical address into a DMA
+ * address space. It allocates the page table pages if necessary.
+ * In the future it can be extended to a generic mapping function
+ * supporting all features of AMD IOMMU page tables like level skipping
+ * and full 64 bit address spaces.
+ */
  static int iommu_map(struct protection_domain *dom,
                      unsigned long bus_addr,
                      unsigned long phys_addr,
@@ -209,6 +267,10 @@ static int iommu_map(struct protection_domain *dom,
         return 0;
  }
  
+/*
+ * This function checks if a specific unity mapping entry is needed for
+ * this specific IOMMU.
+ */
  static int iommu_for_unity_map(struct amd_iommu *iommu,
                                struct unity_map_entry *entry)
  {
@@ -223,6 +285,12 @@ static int iommu_for_unity_map(struct amd_iommu *iommu,
         return 0;
  }
  
+/*
+ * Init the unity mappings for a specific IOMMU in the system
+ *
+ * Basically iterates over all unity mapping entries and applies them to
+ * the default domain DMA of that IOMMU if necessary.
+ */
  static int iommu_init_unity_mappings(struct amd_iommu *iommu)
  {
         struct unity_map_entry *entry;
@@ -239,6 +307,10 @@ static int iommu_init_unity_mappings(struct amd_iommu *iommu)
         return 0;
  }
  
+/*
+ * This function actually applies the mapping to the page table of the
+ * dma_ops domain.
+ */
  static int dma_ops_unity_map(struct dma_ops_domain *dma_dom,
                              struct unity_map_entry *e)
  {
@@ -261,6 +333,9 @@ static int dma_ops_unity_map(struct dma_ops_domain *dma_dom,
         return 0;
  }
  
+/*
+ * Inits the unity mappings required for a specific device
+ */
  static int init_unity_mappings_for_device(struct dma_ops_domain *dma_dom,
                                           u16 devid)
  {
@@ -278,12 +353,26 @@ static int init_unity_mappings_for_device(struct dma_ops_domain *dma_dom,
         return 0;
  }
  
+/****************************************************************************
+ *
+ * The next functions belong to the address allocator for the dma_ops
+ * interface functions. They work like the allocators in the other IOMMU
+ * drivers. It's basically a bitmap which marks the allocated pages in
+ * the aperture. Maybe it could be enhanced in the future to a more
+ * efficient allocator.
+ *
+ ****************************************************************************/
  static unsigned long dma_mask_to_pages(unsigned long mask)
  {
         return (mask >> PAGE_SHIFT) +
                 (PAGE_ALIGN(mask & ~PAGE_MASK) >> PAGE_SHIFT);
  }
  
+/*
+ * The address allocator core function.
+ *
+ * called with domain->lock held
+ */
  static unsigned long dma_ops_alloc_addresses(struct device *dev,
                                              struct dma_ops_domain *dom,
                                              unsigned int pages)
@@ -317,6 +406,11 @@ static unsigned long dma_ops_alloc_addresses(struct device *dev,
         return address;
  }
  
+/*
+ * The address free function.
+ *
+ * called with domain->lock held
+ */
  static void dma_ops_free_addresses(struct dma_ops_domain *dom,
                                    unsigned long address,
                                    unsigned int pages)
@@ -325,6 +419,16 @@ static void dma_ops_free_addresses(struct dma_ops_domain *dom,
         iommu_area_free(dom->bitmap, address, pages);
  }
  
+/****************************************************************************
+ *
+ * The next functions belong to the domain allocation. A domain is
+ * allocated for every IOMMU as the default domain. If device isolation
+ * is enabled, every device gets its own domain. The most important thing
+ * about domains is the page table mapping the DMA address space they
+ * contain.
+ *
+ ****************************************************************************/
+
  static u16 domain_id_alloc(void)
  {
         unsigned long flags;
@@ -342,6 +446,10 @@ static u16 domain_id_alloc(void)
         return id;
  }
  
+/*
+ * Used to reserve address ranges in the aperture (e.g. for exclusion
+ * ranges).
+ */
  static void dma_ops_reserve_addresses(struct dma_ops_domain *dom,
                                       unsigned long start_page,
                                       unsigned int pages)
@@ -382,6 +490,10 @@ static void dma_ops_free_pagetable(struct dma_ops_domain *dma_dom)
         free_page((unsigned long)p1);
  }
  
+/*
+ * Free a domain, only used if something went wrong in the
+ * allocation path and we need to free an already allocated page table
+ */
  static void dma_ops_domain_free(struct dma_ops_domain *dom)
  {
         if (!dom)
@@ -396,6 +508,11 @@ static void dma_ops_domain_free(struct dma_ops_domain *dom)
         kfree(dom);
  }
  
+/*
+ * Allocates a new protection domain usable for the dma_ops functions.
+ * It also initializes the page table and the address allocator data
+ * structures required for the dma_ops interface
+ */
  static struct dma_ops_domain *dma_ops_domain_alloc(struct amd_iommu *iommu,
                                                    unsigned order)
  {
@@ -436,6 +553,7 @@ static struct dma_ops_domain *dma_ops_domain_alloc(struct amd_iommu *iommu,
         dma_dom->bitmap[0] = 1;
         dma_dom->next_bit = 0;
  
+       /* Initialize the exclusion range if necessary */
         if (iommu->exclusion_start &&
             iommu->exclusion_start < dma_dom->aperture_size) {
                 unsigned long startpage = iommu->exclusion_start >> PAGE_SHIFT;
@@ -444,6 +562,11 @@ static struct dma_ops_domain *dma_ops_domain_alloc(struct amd_iommu *iommu,
                 dma_ops_reserve_addresses(dma_dom, startpage, pages);
         }
  
+       /*
+        * At the last step, build the page tables so we don't need to
+        * allocate page table pages in the dma_ops mapping/unmapping
+        * path.
+        */
         num_pte_pages = dma_dom->aperture_size / (PAGE_SIZE * 512);
         dma_dom->pte_pages = kzalloc(num_pte_pages * sizeof(void *),
                         GFP_KERNEL);
@@ -472,6 +595,10 @@ free_dma_dom:
         return NULL;
  }
  
+/*
+ * Find out the protection domain structure for a given PCI device. This
+ * will give us the pointer to the page table root for example.
+ */
  static struct protection_domain *domain_for_device(u16 devid)
  {
         struct protection_domain *dom;
@@ -484,6 +611,10 @@ static struct protection_domain *domain_for_device(u16 devid)
         return dom;
  }
  
+/*
+ * If a device is not yet associated with a domain, this function
+ * assigns it and makes the assignment visible to the hardware
+ */
  static void set_device_domain(struct amd_iommu *iommu,
                               struct protection_domain *domain,
                               u16 devid)
@@ -508,6 +639,19 @@ static void set_device_domain(struct amd_iommu *iommu,
         iommu->need_sync = 1;
  }
  
+/*****************************************************************************
+ *
+ * The next functions belong to the dma_ops mapping/unmapping code.
+ *
+ *****************************************************************************/
+
+/*
+ * In the dma_ops path we only have the struct device. This function
+ * finds the corresponding IOMMU, the protection domain and the
+ * requestor id for a given device.
+ * If the device is not yet associated with a domain this is also done
+ * in this function.
+ */
  static int get_device_resources(struct device *dev,
                                 struct amd_iommu **iommu,
                                 struct protection_domain **domain,
@@ -520,8 +664,9 @@ static int get_device_resources(struct device *dev,
         BUG_ON(!dev || dev->bus != &pci_bus_type || !dev->dma_mask);
  
         pcidev = to_pci_dev(dev);
-       _bdf = (pcidev->bus->number << 8) | pcidev->devfn;
+       _bdf = calc_devid(pcidev->bus->number, pcidev->devfn);
  
+       /* device not translated by any IOMMU in the system? */
         if (_bdf >= amd_iommu_last_bdf) {
                 *iommu = NULL;
                 *domain = NULL;
@@ -547,6 +692,10 @@ static int get_device_resources(struct device *dev,
         return 1;
  }
  
+/*
+ * This is the generic map function. It maps one 4kb page at paddr to
+ * the given address in the DMA address space for the domain.
+ */
  static dma_addr_t dma_ops_domain_map(struct amd_iommu *iommu,
                                      struct dma_ops_domain *dom,
                                      unsigned long address,
@@ -578,6 +727,9 @@ static dma_addr_t dma_ops_domain_map(struct amd_iommu *iommu,
         return (dma_addr_t)address;
  }
  
+/*
+ * The generic unmapping function for one page in the DMA address space.
+ */
  static void dma_ops_domain_unmap(struct amd_iommu *iommu,
                                  struct dma_ops_domain *dom,
                                  unsigned long address)
@@ -597,6 +749,12 @@ static void dma_ops_domain_unmap(struct amd_iommu *iommu,
         *pte = 0ULL;
  }
  
+/*
+ * This function contains common code for mapping of a physically
+ * contiguous memory region into DMA address space. It is used by all
+ * mapping functions provided by this IOMMU driver.
+ * Must be called with the domain lock held.
+ */
  static dma_addr_t __map_single(struct device *dev,
                                struct amd_iommu *iommu,
                                struct dma_ops_domain *dma_dom,
@@ -628,6 +786,10 @@ out:
         return address;
  }
  
+/*
+ * Does the reverse of the __map_single function. Must be called with
+ * the domain lock held too
+ */
  static void __unmap_single(struct amd_iommu *iommu,
                            struct dma_ops_domain *dma_dom,
                            dma_addr_t dma_addr,
@@ -652,6 +814,9 @@ static void __unmap_single(struct amd_iommu *iommu,
         dma_ops_free_addresses(dma_dom, dma_addr, pages);
  }
  
+/*
+ * The exported map_single function for dma_ops.
+ */
  static dma_addr_t map_single(struct device *dev, phys_addr_t paddr,
                              size_t size, int dir)
  {
@@ -664,6 +829,7 @@ static dma_addr_t map_single(struct device *dev, phys_addr_t paddr,
         get_device_resources(dev, &iommu, &domain, &devid);
  
         if (iommu == NULL || domain == NULL)
+               /* device not handled by any AMD IOMMU */
                 return (dma_addr_t)paddr;
  
         spin_lock_irqsave(&domain->lock, flags);
@@ -683,6 +849,9 @@ out:
         return addr;
  }
  
+/*
+ * The exported unmap_single function for dma_ops.
+ */
  static void unmap_single(struct device *dev, dma_addr_t dma_addr,
                          size_t size, int dir)
  {
@@ -692,6 +861,7 @@ static void unmap_single(struct device *dev, dma_addr_t dma_addr,
         u16 devid;
  
         if (!get_device_resources(dev, &iommu, &domain, &devid))
+               /* device not handled by any AMD IOMMU */
                 return;
  
         spin_lock_irqsave(&domain->lock, flags);
@@ -706,6 +876,10 @@ static void unmap_single(struct device *dev, dma_addr_t dma_addr,
         spin_unlock_irqrestore(&domain->lock, flags);
  }
  
+/*
+ * This is a special map_sg function which is used if we should map a
+ * device which is not handled by an AMD IOMMU in the system.
+ */
  static int map_sg_no_iommu(struct device *dev, struct scatterlist *sglist,
                            int nelems, int dir)
  {
@@ -720,6 +894,10 @@ static int map_sg_no_iommu(struct device *dev, struct scatterlist *sglist,
         return nelems;
  }
  
+/*
+ * The exported map_sg function for dma_ops (handles scatter-gather
+ * lists).
+ */
  static int map_sg(struct device *dev, struct scatterlist *sglist,
                   int nelems, int dir)
  {
@@ -775,6 +953,10 @@ unmap:
         goto out;
  }
  
+/*
+ * The exported unmap_sg function for dma_ops (handles scatter-gather
+ * lists).
+ */
  static void unmap_sg(struct device *dev, struct scatterlist *sglist,
                      int nelems, int dir)
  {
@@ -804,6 +986,9 @@ static void unmap_sg(struct device *dev, struct scatterlist *sglist,
         spin_unlock_irqrestore(&domain->lock, flags);
  }
  
+/*
+ * The exported alloc_coherent function for dma_ops.
+ */
  static void *alloc_coherent(struct device *dev, size_t size,
                             dma_addr_t *dma_addr, gfp_t flag)
  {
@@ -851,6 +1036,11 @@ out:
         return virt_addr;
  }
  
+/*
+ * The exported free_coherent function for dma_ops.
+ * FIXME: fix the generic x86 DMA layer so that it actually calls that
+ *        function.
+ */
  static void free_coherent(struct device *dev, size_t size,
                           void *virt_addr, dma_addr_t dma_addr)
  {
@@ -879,6 +1069,8 @@ free_mem:
  }
  
  /*
+ * The function for pre-allocating protection domains.
+ *
   * If the driver core informs the DMA layer if a driver grabs a device
   * we don't need to preallocate the protection domains anymore.
   * For now we have to.
@@ -921,12 +1113,20 @@ static struct dma_mapping_ops amd_iommu_dma_ops = {
         .unmap_sg = unmap_sg,
  };
  
+/*
+ * The function which glues the AMD IOMMU driver into dma_ops.
+ */
  int __init amd_iommu_init_dma_ops(void)
  {
         struct amd_iommu *iommu;
         int order = amd_iommu_aperture_order;
         int ret;
  
+       /*
+        * first allocate a default protection domain for every IOMMU we
+        * found in the system. Devices not assigned to any other
+        * protection domain will be assigned to the default one.
+        */
         list_for_each_entry(iommu, &amd_iommu_list, list) {
                 iommu->default_dom = dma_ops_domain_alloc(iommu, order);
                 if (iommu->default_dom == NULL)
@@ -936,6 +1136,10 @@ int __init amd_iommu_init_dma_ops(void)
                         goto free_domains;
         }
  
+       /*
+        * If device isolation is enabled, pre-allocate the protection
+        * domains for each device.
+        */
         if (amd_iommu_isolate)
                 prealloc_protection_domains();
  
@@ -947,6 +1151,7 @@ int __init amd_iommu_init_dma_ops(void)
         gart_iommu_aperture = 0;
  #endif
  
+       /* Make the driver finally visible to the drivers */
         dma_ops = &amd_iommu_dma_ops;
  
         return 0;
diff --git a/arch/x86/kernel/amd_iommu_init.c b/arch/x86/kernel/amd_iommu_init.c

index 2a13e430437dc5f1e05793aa995df4befcbc8938..c9d8ff2eb130b3ed384bc525ef13a5c4784161d7 100644 (file)
--- a/arch/x86/kernel/amd_iommu_init.c
+++ b/arch/x86/kernel/amd_iommu_init.c
@@ -25,20 +25,13 @@
  #include <asm/pci-direct.h>
  #include <asm/amd_iommu_types.h>
  #include <asm/amd_iommu.h>
-#include <asm/gart.h>
+#include <asm/iommu.h>
  
  /*
   * definitions for the ACPI scanning code
   */
-#define UPDATE_LAST_BDF(x) do {\
-       if ((x) > amd_iommu_last_bdf) \
-               amd_iommu_last_bdf = (x); \
-       } while (0);
-
-#define DEVID(bus, devfn) (((bus) << 8) | (devfn))
  #define PCI_BUS(x) (((x) >> 8) & 0xff)
  #define IVRS_HEADER_LENGTH 48
-#define TBL_SIZE(x) (1 << (PAGE_SHIFT + get_order(amd_iommu_last_bdf * (x))))
  
  #define ACPI_IVHD_TYPE                  0x10
  #define ACPI_IVMD_TYPE_ALL              0x20
@@ -71,6 +64,17 @@
  #define ACPI_DEVFLAG_LINT1              0x80
  #define ACPI_DEVFLAG_ATSDIS             0x10000000
  
+/*
+ * ACPI table definitions
+ *
+ * These data structures are laid over the table to parse the important values
+ * out of it.
+ */
+
+/*
+ * structure describing one IOMMU in the ACPI table. Typically followed by one
+ * or more ivhd_entry structures.
+ */
  struct ivhd_header {
         u8 type;
         u8 flags;
@@ -83,6 +87,10 @@ struct ivhd_header {
         u32 reserved;
  } __attribute__((packed));
  
+/*
+ * A device entry describing which devices a specific IOMMU translates and
+ * which requestor ids they use.
+ */
  struct ivhd_entry {
         u8 type;
         u16 devid;
@@ -90,6 +98,10 @@ struct ivhd_entry {
         u32 ext;
  } __attribute__((packed));
  
+/*
+ * An AMD IOMMU memory definition structure. It defines things like exclusion
+ * ranges for devices and regions that should be unity mapped.
+ */
  struct ivmd_header {
         u8 type;
         u8 flags;
@@ -103,22 +115,80 @@ struct ivmd_header {
  
  static int __initdata amd_iommu_detected;
  
-u16 amd_iommu_last_bdf;
-struct list_head amd_iommu_unity_map;
-unsigned amd_iommu_aperture_order = 26;
-int amd_iommu_isolate;
+u16 amd_iommu_last_bdf;                        /* largest PCI device id we have
+                                          to handle */
+LIST_HEAD(amd_iommu_unity_map);                /* a list of required unity mappings
+                                          we find in ACPI */
+unsigned amd_iommu_aperture_order = 26; /* size of aperture in power of 2 */
+int amd_iommu_isolate;                 /* if 1, device isolation is enabled */
+
+LIST_HEAD(amd_iommu_list);             /* list of all AMD IOMMUs in the
+                                          system */
  
-struct list_head amd_iommu_list;
+/*
+ * Pointer to the device table which is shared by all AMD IOMMUs
+ * it is indexed by the PCI device id or the HT unit id and contains
+ * information about the domain the device belongs to as well as the
+ * page table root pointer.
+ */
  struct dev_table_entry *amd_iommu_dev_table;
+
+/*
+ * The alias table is a driver specific data structure which contains the
+ * mappings of the PCI device ids to the actual requestor ids on the IOMMU.
+ * More than one device can share the same requestor id.
+ */
  u16 *amd_iommu_alias_table;
+
+/*
+ * The rlookup table is used to find the IOMMU which is responsible
+ * for a specific device. It is also indexed by the PCI device id.
+ */
  struct amd_iommu **amd_iommu_rlookup_table;
+
+/*
+ * The pd table (protection domain table) is used to find the protection domain
+ * data structure a device belongs to. Indexed with the PCI device id too.
+ */
  struct protection_domain **amd_iommu_pd_table;
+
+/*
+ * AMD IOMMU allows up to 2^16 different protection domains. This is a bitmap
+ * to know which ones are already in use.
+ */
  unsigned long *amd_iommu_pd_alloc_bitmap;
  
-static u32 dev_table_size;
-static u32 alias_table_size;
-static u32 rlookup_table_size;
+static u32 dev_table_size;     /* size of the device table */
+static u32 alias_table_size;   /* size of the alias table */
+static u32 rlookup_table_size; /* size of the rlookup table */
  
+static inline void update_last_devid(u16 devid)
+{
+       if (devid > amd_iommu_last_bdf)
+               amd_iommu_last_bdf = devid;
+}
+
+/*
+ * Returns the size in bytes, rounded up to a power-of-two number of pages,
+ * of a table that holds one entry of entry_size bytes per device id.
+ *
+ * amd_iommu_last_bdf is the largest device id and is itself a valid index
+ * into these tables, so amd_iommu_last_bdf + 1 entries are required.
+ */
+static inline unsigned long tbl_size(int entry_size)
+{
+       unsigned shift = PAGE_SHIFT +
+                        get_order((amd_iommu_last_bdf + 1) * entry_size);
+
+       return 1UL << shift;
+}
+
+/****************************************************************************
+ *
+ * AMD IOMMU MMIO register space handling functions
+ *
+ * These functions are used to program the IOMMU device registers in
+ * MMIO space required for that driver.
+ *
+ ****************************************************************************/
+
+/*
+ * This function sets the exclusion range in the IOMMU. DMA accesses to the
+ * exclusion range are passed through untranslated.
+ */
  static void __init iommu_set_exclusion_range(struct amd_iommu *iommu)
  {
         u64 start = iommu->exclusion_start & PAGE_MASK;
@@ -137,6 +207,7 @@ static void __init iommu_set_exclusion_range(struct amd_iommu *iommu)
                         &entry, sizeof(entry));
  }
  
+/* Programs the physical address of the device table into the IOMMU hardware */
  static void __init iommu_set_device_table(struct amd_iommu *iommu)
  {
         u32 entry;
@@ -149,6 +220,7 @@ static void __init iommu_set_device_table(struct amd_iommu *iommu)
                         &entry, sizeof(entry));
  }
  
+/* Generic functions to enable/disable certain features of the IOMMU. */
  static void __init iommu_feature_enable(struct amd_iommu *iommu, u8 bit)
  {
         u32 ctrl;
@@ -167,6 +239,7 @@ static void __init iommu_feature_disable(struct amd_iommu *iommu, u8 bit)
         writel(ctrl, iommu->mmio_base + MMIO_CONTROL_OFFSET);
  }
  
+/* Function to enable the hardware */
  void __init iommu_enable(struct amd_iommu *iommu)
  {
         printk(KERN_INFO "AMD IOMMU: Enabling IOMMU at ");
@@ -176,6 +249,10 @@ void __init iommu_enable(struct amd_iommu *iommu)
         iommu_feature_enable(iommu, CONTROL_IOMMU_EN);
  }
  
+/*
+ * mapping and unmapping functions for the IOMMU MMIO space. Each AMD IOMMU in
+ * the system has one.
+ */
  static u8 * __init iommu_map_mmio_space(u64 address)
  {
         u8 *ret;
@@ -199,16 +276,33 @@ static void __init iommu_unmap_mmio_space(struct amd_iommu *iommu)
         release_mem_region(iommu->mmio_phys, MMIO_REGION_LENGTH);
  }
  
+/****************************************************************************
+ *
+ * The functions below belong to the first pass of AMD IOMMU ACPI table
+ * parsing. In this pass we try to find out the highest device id this
+ * code has to handle. Upon this information the size of the shared data
+ * structures is determined later.
+ *
+ ****************************************************************************/
+
+/*
+ * This function reads the last device id the IOMMU has to handle from the PCI
+ * capability header for this IOMMU
+ */
  static int __init find_last_devid_on_pci(int bus, int dev, int fn, int cap_ptr)
  {
         u32 cap;
  
         cap = read_pci_config(bus, dev, fn, cap_ptr+MMIO_RANGE_OFFSET);
-       UPDATE_LAST_BDF(DEVID(MMIO_GET_BUS(cap), MMIO_GET_LD(cap)));
+       update_last_devid(calc_devid(MMIO_GET_BUS(cap), MMIO_GET_LD(cap)));
  
         return 0;
  }
  
+/*
+ * After reading the highest device id from the IOMMU PCI capability header
+ * this function looks if there is a higher device id defined in the ACPI table
+ */
  static int __init find_last_devid_from_ivhd(struct ivhd_header *h)
  {
         u8 *p = (void *)h, *end = (void *)h;
@@ -229,7 +323,8 @@ static int __init find_last_devid_from_ivhd(struct ivhd_header *h)
                 case IVHD_DEV_RANGE_END:
                 case IVHD_DEV_ALIAS:
                 case IVHD_DEV_EXT_SELECT:
-                       UPDATE_LAST_BDF(dev->devid);
+                       /* all the above subfield types refer to device ids */
+                       update_last_devid(dev->devid);
                         break;
                 default:
                         break;
@@ -242,6 +337,11 @@ static int __init find_last_devid_from_ivhd(struct ivhd_header *h)
         return 0;
  }
  
+/*
+ * Iterate over all IVHD entries in the ACPI table and find the highest device
+ * id which we need to handle. This is the first of three functions which parse
+ * the ACPI table. So we check the checksum here.
+ */
  static int __init find_last_devid_acpi(struct acpi_table_header *table)
  {
         int i;
@@ -277,19 +377,31 @@ static int __init find_last_devid_acpi(struct acpi_table_header *table)
         return 0;
  }
  
+/****************************************************************************
+ *
+ * The following functions belong to the code path which parses the ACPI table
+ * the second time. In this ACPI parsing iteration we allocate IOMMU specific
+ * data structures, initialize the device/alias/rlookup table and also
+ * basically initialize the hardware.
+ *
+ ****************************************************************************/
+
+/*
+ * Allocates the command buffer. This buffer is per AMD IOMMU. We can
+ * write commands to that buffer later and the IOMMU will execute them
+ * asynchronously
+ */
  static u8 * __init alloc_command_buffer(struct amd_iommu *iommu)
  {
-       u8 *cmd_buf = (u8 *)__get_free_pages(GFP_KERNEL,
+       u8 *cmd_buf = (u8 *)__get_free_pages(GFP_KERNEL | __GFP_ZERO,
                         get_order(CMD_BUFFER_SIZE));
-       u64 entry = 0;
+       u64 entry;
  
         if (cmd_buf == NULL)
                 return NULL;
  
         iommu->cmd_buf_size = CMD_BUFFER_SIZE;
  
-       memset(cmd_buf, 0, CMD_BUFFER_SIZE);
-
         entry = (u64)virt_to_phys(cmd_buf);
         entry |= MMIO_CMD_SIZE_512;
         memcpy_toio(iommu->mmio_base + MMIO_CMD_BUF_OFFSET,
@@ -302,11 +414,10 @@ static u8 * __init alloc_command_buffer(struct amd_iommu *iommu)
  
  static void __init free_command_buffer(struct amd_iommu *iommu)
  {
-       if (iommu->cmd_buf)
-               free_pages((unsigned long)iommu->cmd_buf,
-                               get_order(CMD_BUFFER_SIZE));
+       free_pages((unsigned long)iommu->cmd_buf, get_order(CMD_BUFFER_SIZE));
  }
  
+/* sets a specific bit in the device table entry. */
  static void set_dev_entry_bit(u16 devid, u8 bit)
  {
         int i = (bit >> 5) & 0x07;
@@ -315,7 +426,18 @@ static void set_dev_entry_bit(u16 devid, u8 bit)
         amd_iommu_dev_table[devid].data[i] |= (1 << _bit);
  }
  
-static void __init set_dev_entry_from_acpi(u16 devid, u32 flags, u32 ext_flags)
+/* Writes the specific IOMMU for a device into the rlookup table */
+static void __init set_iommu_for_device(struct amd_iommu *iommu, u16 devid)
+{
+       amd_iommu_rlookup_table[devid] = iommu;
+}
+
+/*
+ * This function takes the device specific flags read from the ACPI
+ * table and sets up the device table entry with that information
+ */
+static void __init set_dev_entry_from_acpi(struct amd_iommu *iommu,
+                                          u16 devid, u32 flags, u32 ext_flags)
  {
         if (flags & ACPI_DEVFLAG_INITPASS)
                 set_dev_entry_bit(devid, DEV_ENTRY_INIT_PASS);
@@ -331,13 +453,14 @@ static void __init set_dev_entry_from_acpi(u16 devid, u32 flags, u32 ext_flags)
                 set_dev_entry_bit(devid, DEV_ENTRY_LINT0_PASS);
         if (flags & ACPI_DEVFLAG_LINT1)
                 set_dev_entry_bit(devid, DEV_ENTRY_LINT1_PASS);
-}
  
-static void __init set_iommu_for_device(struct amd_iommu *iommu, u16 devid)
-{
-       amd_iommu_rlookup_table[devid] = iommu;
+       set_iommu_for_device(iommu, devid);
  }
  
+/*
+ * Reads the device exclusion range from ACPI and initializes the IOMMU
+ * with it.
+ */
  static void __init set_device_exclusion_range(u16 devid, struct ivmd_header *m)
  {
         struct amd_iommu *iommu = amd_iommu_rlookup_table[devid];
@@ -346,12 +469,22 @@ static void __init set_device_exclusion_range(u16 devid, struct ivmd_header *m)
                 return;
  
         if (iommu) {
+               /*
+                * We only can configure exclusion ranges per IOMMU, not
+                * per device. But we can enable the exclusion range per
+                * device. This is done here
+                */
                 set_dev_entry_bit(m->devid, DEV_ENTRY_EX);
                 iommu->exclusion_start = m->range_start;
                 iommu->exclusion_length = m->range_length;
         }
  }
  
+/*
+ * This function reads some important data from the IOMMU PCI space and
+ * initializes the driver data structure with it. It reads the hardware
+ * capabilities and the first/last device entries
+ */
  static void __init init_iommu_from_pci(struct amd_iommu *iommu)
  {
         int bus = PCI_BUS(iommu->devid);
@@ -363,10 +496,16 @@ static void __init init_iommu_from_pci(struct amd_iommu *iommu)
         iommu->cap = read_pci_config(bus, dev, fn, cap_ptr+MMIO_CAP_HDR_OFFSET);
  
         range = read_pci_config(bus, dev, fn, cap_ptr+MMIO_RANGE_OFFSET);
-       iommu->first_device = DEVID(MMIO_GET_BUS(range), MMIO_GET_FD(range));
-       iommu->last_device = DEVID(MMIO_GET_BUS(range), MMIO_GET_LD(range));
+       iommu->first_device = calc_devid(MMIO_GET_BUS(range),
+                                        MMIO_GET_FD(range));
+       iommu->last_device = calc_devid(MMIO_GET_BUS(range),
+                                       MMIO_GET_LD(range));
  }
  
+/*
+ * Takes a pointer to an AMD IOMMU entry in the ACPI table and
+ * initializes the hardware and our data structures with it.
+ */
  static void __init init_iommu_from_acpi(struct amd_iommu *iommu,
                                         struct ivhd_header *h)
  {
@@ -374,7 +513,7 @@ static void __init init_iommu_from_acpi(struct amd_iommu *iommu,
         u8 *end = p, flags = 0;
         u16 dev_i, devid = 0, devid_start = 0, devid_to = 0;
         u32 ext_flags = 0;
-       bool alias = 0;
+       bool alias = false;
         struct ivhd_entry *e;
  
         /*
@@ -414,22 +553,23 @@ static void __init init_iommu_from_acpi(struct amd_iommu *iommu,
                 case IVHD_DEV_ALL:
                         for (dev_i = iommu->first_device;
                                         dev_i <= iommu->last_device; ++dev_i)
-                               set_dev_entry_from_acpi(dev_i, e->flags, 0);
+                               set_dev_entry_from_acpi(iommu, dev_i,
+                                                       e->flags, 0);
                         break;
                 case IVHD_DEV_SELECT:
                         devid = e->devid;
-                       set_dev_entry_from_acpi(devid, e->flags, 0);
+                       set_dev_entry_from_acpi(iommu, devid, e->flags, 0);
                         break;
                 case IVHD_DEV_SELECT_RANGE_START:
                         devid_start = e->devid;
                         flags = e->flags;
                         ext_flags = 0;
-                       alias = 0;
+                       alias = false;
                         break;
                 case IVHD_DEV_ALIAS:
                         devid = e->devid;
                         devid_to = e->ext >> 8;
-                       set_dev_entry_from_acpi(devid, e->flags, 0);
+                       set_dev_entry_from_acpi(iommu, devid, e->flags, 0);
                         amd_iommu_alias_table[devid] = devid_to;
                         break;
                 case IVHD_DEV_ALIAS_RANGE:
@@ -437,24 +577,25 @@ static void __init init_iommu_from_acpi(struct amd_iommu *iommu,
                         flags = e->flags;
                         devid_to = e->ext >> 8;
                         ext_flags = 0;
-                       alias = 1;
+                       alias = true;
                         break;
                 case IVHD_DEV_EXT_SELECT:
                         devid = e->devid;
-                       set_dev_entry_from_acpi(devid, e->flags, e->ext);
+                       set_dev_entry_from_acpi(iommu, devid, e->flags,
+                                               e->ext);
                         break;
                 case IVHD_DEV_EXT_SELECT_RANGE:
                         devid_start = e->devid;
                         flags = e->flags;
                         ext_flags = e->ext;
-                       alias = 0;
+                       alias = false;
                         break;
                 case IVHD_DEV_RANGE_END:
                         devid = e->devid;
                         for (dev_i = devid_start; dev_i <= devid; ++dev_i) {
                                 if (alias)
                                         amd_iommu_alias_table[dev_i] = devid_to;
-                               set_dev_entry_from_acpi(
+                               set_dev_entry_from_acpi(iommu,
                                                 amd_iommu_alias_table[dev_i],
                                                 flags, ext_flags);
                         }
@@ -467,6 +608,7 @@ static void __init init_iommu_from_acpi(struct amd_iommu *iommu,
         }
  }
  
+/* Initializes the device->iommu mapping for the driver */
  static int __init init_iommu_devices(struct amd_iommu *iommu)
  {
         u16 i;
@@ -494,6 +636,11 @@ static void __init free_iommu_all(void)
         }
  }
  
+/*
+ * This function glues the initialization functions for one IOMMU
+ * together and also allocates the command buffer and programs the
+ * hardware. It does NOT enable the IOMMU. This is done afterwards.
+ */
  static int __init init_iommu_one(struct amd_iommu *iommu, struct ivhd_header *h)
  {
         spin_lock_init(&iommu->lock);
@@ -521,6 +668,10 @@ static int __init init_iommu_one(struct amd_iommu *iommu, struct ivhd_header *h)
         return 0;
  }
  
+/*
+ * Iterates over all IOMMU entries in the ACPI table, allocates the
+ * IOMMU structure and initializes it with init_iommu_one()
+ */
  static int __init init_iommu_all(struct acpi_table_header *table)
  {
         u8 *p = (u8 *)table, *end = (u8 *)table;
@@ -528,8 +679,6 @@ static int __init init_iommu_all(struct acpi_table_header *table)
         struct amd_iommu *iommu;
         int ret;
  
-       INIT_LIST_HEAD(&amd_iommu_list);
-
         end += table->length;
         p += IVRS_HEADER_LENGTH;
  
@@ -555,6 +704,14 @@ static int __init init_iommu_all(struct acpi_table_header *table)
         return 0;
  }
  
+/****************************************************************************
+ *
+ * The next functions belong to the third pass of parsing the ACPI
+ * table. In this last pass the memory mapping requirements are
+ * gathered (like exclusion and unity mapping ranges).
+ *
+ ****************************************************************************/
+
  static void __init free_unity_maps(void)
  {
         struct unity_map_entry *entry, *next;
@@ -565,6 +722,7 @@ static void __init free_unity_maps(void)
         }
  }
  
+/* called when we find an exclusion range definition in ACPI */
  static int __init init_exclusion_range(struct ivmd_header *m)
  {
         int i;
@@ -588,6 +746,7 @@ static int __init init_exclusion_range(struct ivmd_header *m)
         return 0;
  }
  
+/* called for unity map ACPI definition */
  static int __init init_unity_map_range(struct ivmd_header *m)
  {
         struct unity_map_entry *e = 0;
@@ -619,13 +778,12 @@ static int __init init_unity_map_range(struct ivmd_header *m)
         return 0;
  }
  
+/* iterates over all memory definitions we find in the ACPI table */
  static int __init init_memory_definitions(struct acpi_table_header *table)
  {
         u8 *p = (u8 *)table, *end = (u8 *)table;
         struct ivmd_header *m;
  
-       INIT_LIST_HEAD(&amd_iommu_unity_map);
-
         end += table->length;
         p += IVRS_HEADER_LENGTH;
  
@@ -642,6 +800,10 @@ static int __init init_memory_definitions(struct acpi_table_header *table)
         return 0;
  }
  
+/*
+ * This function finally enables all IOMMUs found in the system after
+ * they have been initialized
+ */
  static void __init enable_iommus(void)
  {
         struct amd_iommu *iommu;
@@ -678,6 +840,34 @@ static struct sys_device device_amd_iommu = {
         .cls = &amd_iommu_sysdev_class,
  };
  
+/*
+ * This is the core init function for AMD IOMMU hardware in the system.
+ * This function is called from the generic x86 DMA layer initialization
+ * code.
+ *
+ * This function basically parses the ACPI table for AMD IOMMU (IVRS)
+ * three times:
+ *
+ *     1 pass) Find the highest PCI device id the driver has to handle.
+ *             Upon this information the size of the data structures is
+ *             determined that needs to be allocated.
+ *
+ *     2 pass) Initialize the data structures just allocated with the
+ *             information in the ACPI table about available AMD IOMMUs
+ *             in the system. It also maps the PCI devices in the
+ *             system to specific IOMMUs
+ *
+ *     3 pass) After the basic data structures are allocated and
+ *             initialized we update them with information about memory
+ *             remapping requirements parsed out of the ACPI table in
+ *             this last pass.
+ *
+ * After that the hardware is initialized and ready to go. In the last
+ * step we do some Linux specific things like registering the driver in
+ * the dma_ops interface and initializing the suspend/resume support
+ * functions. Finally it prints some information about AMD IOMMUs and
+ * the driver state and enables the hardware.
+ */
  int __init amd_iommu_init(void)
  {
         int i, ret = 0;
@@ -699,14 +889,14 @@ int __init amd_iommu_init(void)
         if (acpi_table_parse("IVRS", find_last_devid_acpi) != 0)
                 return -ENODEV;
  
-       dev_table_size     = TBL_SIZE(DEV_TABLE_ENTRY_SIZE);
-       alias_table_size   = TBL_SIZE(ALIAS_TABLE_ENTRY_SIZE);
-       rlookup_table_size = TBL_SIZE(RLOOKUP_TABLE_ENTRY_SIZE);
+       dev_table_size     = tbl_size(DEV_TABLE_ENTRY_SIZE);
+       alias_table_size   = tbl_size(ALIAS_TABLE_ENTRY_SIZE);
+       rlookup_table_size = tbl_size(RLOOKUP_TABLE_ENTRY_SIZE);
  
         ret = -ENOMEM;
  
         /* Device table - directly used by all IOMMUs */
-       amd_iommu_dev_table = (void *)__get_free_pages(GFP_KERNEL,
+       amd_iommu_dev_table = (void *)__get_free_pages(GFP_KERNEL | __GFP_ZERO,
                                       get_order(dev_table_size));
         if (amd_iommu_dev_table == NULL)
                 goto out;
@@ -730,27 +920,23 @@ int __init amd_iommu_init(void)
          * Protection Domain table - maps devices to protection domains
          * This table has the same size as the rlookup_table
          */
-       amd_iommu_pd_table = (void *)__get_free_pages(GFP_KERNEL,
+       amd_iommu_pd_table = (void *)__get_free_pages(GFP_KERNEL | __GFP_ZERO,
                                      get_order(rlookup_table_size));
         if (amd_iommu_pd_table == NULL)
                 goto free;
  
-       amd_iommu_pd_alloc_bitmap = (void *)__get_free_pages(GFP_KERNEL,
+       amd_iommu_pd_alloc_bitmap = (void *)__get_free_pages(
+                                           GFP_KERNEL | __GFP_ZERO,
                                             get_order(MAX_DOMAIN_ID/8));
         if (amd_iommu_pd_alloc_bitmap == NULL)
                 goto free;
  
         /*
-        * memory is allocated now; initialize the device table with all zeroes
-        * and let all alias entries point to itself
+        * let all alias entries point to itself
          */
-       memset(amd_iommu_dev_table, 0, dev_table_size);
         for (i = 0; i < amd_iommu_last_bdf; ++i)
                 amd_iommu_alias_table[i] = i;
  
-       memset(amd_iommu_pd_table, 0, rlookup_table_size);
-       memset(amd_iommu_pd_alloc_bitmap, 0, MAX_DOMAIN_ID / 8);
-
         /*
          * never allocate domain 0 because its used as the non-allocated and
          * error value placeholder
@@ -795,24 +981,19 @@ out:
         return ret;
  
  free:
-       if (amd_iommu_pd_alloc_bitmap)
-               free_pages((unsigned long)amd_iommu_pd_alloc_bitmap, 1);
+       free_pages((unsigned long)amd_iommu_pd_alloc_bitmap, 1);
  
-       if (amd_iommu_pd_table)
-               free_pages((unsigned long)amd_iommu_pd_table,
-                               get_order(rlookup_table_size));
+       free_pages((unsigned long)amd_iommu_pd_table,
+                  get_order(rlookup_table_size));
  
-       if (amd_iommu_rlookup_table)
-               free_pages((unsigned long)amd_iommu_rlookup_table,
-                               get_order(rlookup_table_size));
+       free_pages((unsigned long)amd_iommu_rlookup_table,
+                  get_order(rlookup_table_size));
  
-       if (amd_iommu_alias_table)
-               free_pages((unsigned long)amd_iommu_alias_table,
-                               get_order(alias_table_size));
+       free_pages((unsigned long)amd_iommu_alias_table,
+                  get_order(alias_table_size));
  
-       if (amd_iommu_dev_table)
-               free_pages((unsigned long)amd_iommu_dev_table,
-                               get_order(dev_table_size));
+       free_pages((unsigned long)amd_iommu_dev_table,
+                  get_order(dev_table_size));
  
         free_iommu_all();
  
@@ -821,6 +1002,13 @@ free:
         goto out;
  }
  
+/****************************************************************************
+ *
+ * Early detect code. This code runs at IOMMU detection time in the DMA
+ * layer. It just looks if there is an IVRS ACPI table to detect AMD
+ * IOMMUs
+ *
+ ****************************************************************************/
  static int __init early_amd_iommu_detect(struct acpi_table_header *table)
  {
         return 0;
@@ -828,7 +1016,7 @@ static int __init early_amd_iommu_detect(struct acpi_table_header *table)
  
  void __init amd_iommu_detect(void)
  {
-       if (swiotlb || no_iommu || iommu_detected)
+       if (swiotlb || no_iommu || (iommu_detected && !gart_iommu_aperture))
                 return;
  
         if (acpi_table_parse("IVRS", early_amd_iommu_detect) == 0) {
@@ -841,6 +1029,13 @@ void __init amd_iommu_detect(void)
         }
  }
  
+/****************************************************************************
+ *
+ * Parsing functions for the AMD IOMMU specific kernel command line
+ * options.
+ *
+ ****************************************************************************/
+
  static int __init parse_amd_iommu_options(char *str)
  {
         for (; *str; ++str) {
@@ -853,20 +1048,10 @@ static int __init parse_amd_iommu_options(char *str)
  
  static int __init parse_amd_iommu_size_options(char *str)
  {
-       for (; *str; ++str) {
-               if (strcmp(str, "32M") == 0)
-                       amd_iommu_aperture_order = 25;
-               if (strcmp(str, "64M") == 0)
-                       amd_iommu_aperture_order = 26;
-               if (strcmp(str, "128M") == 0)
-                       amd_iommu_aperture_order = 27;
-               if (strcmp(str, "256M") == 0)
-                       amd_iommu_aperture_order = 28;
-               if (strcmp(str, "512M") == 0)
-                       amd_iommu_aperture_order = 29;
-               if (strcmp(str, "1G") == 0)
-                       amd_iommu_aperture_order = 30;
-       }
+       unsigned order = PAGE_SHIFT + get_order(memparse(str, &str));
+
+       if ((order > 24) && (order < 31))
+               amd_iommu_aperture_order = order;
  
         return 1;
  }
diff --git a/arch/x86/kernel/aperture_64.c b/arch/x86/kernel/aperture_64.c

index 9f907806c1a53d58f9be603bb18bdcb60f14c6c2..44e21826db1145a00659c89c5c45232ff3e70d2a 100644 (file)
--- a/arch/x86/kernel/aperture_64.c
+++ b/arch/x86/kernel/aperture_64.c
@@ -21,6 +21,7 @@
  #include <linux/suspend.h>
  #include <asm/e820.h>
  #include <asm/io.h>
+#include <asm/iommu.h>
  #include <asm/gart.h>
  #include <asm/pci-direct.h>
  #include <asm/dma.h>
diff --git a/arch/x86/kernel/apic_32.c b/arch/x86/kernel/apic_32.c

index a437d027f20b6d8d7ba3dc88400220e796afe41e..d6c8983583713d747790587861318a5fb58eb342 100644 (file)
--- a/arch/x86/kernel/apic_32.c
+++ b/arch/x86/kernel/apic_32.c
@@ -75,7 +75,7 @@ char system_vectors[NR_VECTORS] = { [0 ... NR_VECTORS-1] = SYS_VECTOR_FREE};
  /*
   * Debug level, exported for io_apic.c
   */
-int apic_verbosity;
+unsigned int apic_verbosity;
  
  int pic_mode;
  
@@ -177,7 +177,7 @@ void __cpuinit enable_NMI_through_LVT0(void)
         /* Level triggered for 82489DX */
         if (!lapic_is_integrated())
                 v |= APIC_LVT_LEVEL_TRIGGER;
-       apic_write_around(APIC_LVT0, v);
+       apic_write(APIC_LVT0, v);
  }
  
  /**
@@ -212,9 +212,6 @@ int lapic_get_maxlvt(void)
   * this function twice on the boot CPU, once with a bogus timeout
   * value, second time for real. The other (noncalibrating) CPUs
   * call this function only once, with the real, calibrated value.
- *
- * We do reads before writes even if unnecessary, to get around the
- * P5 APIC double write bug.
   */
  static void __setup_APIC_LVTT(unsigned int clocks, int oneshot, int irqen)
  {
@@ -229,18 +226,18 @@ static void __setup_APIC_LVTT(unsigned int clocks, int oneshot, int irqen)
         if (!irqen)
                 lvtt_value |= APIC_LVT_MASKED;
  
-       apic_write_around(APIC_LVTT, lvtt_value);
+       apic_write(APIC_LVTT, lvtt_value);
  
         /*
          * Divide PICLK by 16
          */
         tmp_value = apic_read(APIC_TDCR);
-       apic_write_around(APIC_TDCR, (tmp_value
-                               & ~(APIC_TDR_DIV_1 | APIC_TDR_DIV_TMBASE))
-                               | APIC_TDR_DIV_16);
+       apic_write(APIC_TDCR,
+                  (tmp_value & ~(APIC_TDR_DIV_1 | APIC_TDR_DIV_TMBASE)) |
+                  APIC_TDR_DIV_16);
  
         if (!oneshot)
-               apic_write_around(APIC_TMICT, clocks/APIC_DIVISOR);
+               apic_write(APIC_TMICT, clocks / APIC_DIVISOR);
  }
  
  /*
@@ -249,7 +246,7 @@ static void __setup_APIC_LVTT(unsigned int clocks, int oneshot, int irqen)
  static int lapic_next_event(unsigned long delta,
                             struct clock_event_device *evt)
  {
-       apic_write_around(APIC_TMICT, delta);
+       apic_write(APIC_TMICT, delta);
         return 0;
  }
  
@@ -278,7 +275,7 @@ static void lapic_timer_setup(enum clock_event_mode mode,
         case CLOCK_EVT_MODE_SHUTDOWN:
                 v = apic_read(APIC_LVTT);
                 v |= (APIC_LVT_MASKED | LOCAL_TIMER_VECTOR);
-               apic_write_around(APIC_LVTT, v);
+               apic_write(APIC_LVTT, v);
                 break;
         case CLOCK_EVT_MODE_RESUME:
                 /* Nothing to do here */
@@ -372,12 +369,7 @@ static void __init lapic_cal_handler(struct clock_event_device *dev)
         }
  }
  
-/*
- * Setup the boot APIC
- *
- * Calibrate and verify the result.
- */
-void __init setup_boot_APIC_clock(void)
+static int __init calibrate_APIC_clock(void)
  {
         struct clock_event_device *levt = &__get_cpu_var(lapic_events);
         const long pm_100ms = PMTMR_TICKS_PER_SEC/10;
@@ -387,24 +379,6 @@ void __init setup_boot_APIC_clock(void)
         long delta, deltapm;
         int pm_referenced = 0;
  
-       /*
-        * The local apic timer can be disabled via the kernel
-        * commandline or from the CPU detection code. Register the lapic
-        * timer as a dummy clock event source on SMP systems, so the
-        * broadcast mechanism is used. On UP systems simply ignore it.
-        */
-       if (local_apic_timer_disabled) {
-               /* No broadcast on UP ! */
-               if (num_possible_cpus() > 1) {
-                       lapic_clockevent.mult = 1;
-                       setup_APIC_timer();
-               }
-               return;
-       }
-
-       apic_printk(APIC_VERBOSE, "Using local APIC timer interrupts.\n"
-                   "calibrating APIC timer ...\n");
-
         local_irq_disable();
  
         /* Replace the global interrupt handler */
@@ -489,8 +463,6 @@ void __init setup_boot_APIC_clock(void)
                     calibration_result / (1000000 / HZ),
                     calibration_result % (1000000 / HZ));
  
-       local_apic_timer_verify_ok = 1;
-
         /*
          * Do a sanity check on the APIC calibration result
          */
@@ -498,12 +470,11 @@ void __init setup_boot_APIC_clock(void)
                 local_irq_enable();
                 printk(KERN_WARNING
                        "APIC frequency too slow, disabling apic timer\n");
-               /* No broadcast on UP ! */
-               if (num_possible_cpus() > 1)
-                       setup_APIC_timer();
-               return;
+               return -1;
         }
  
+       local_apic_timer_verify_ok = 1;
+
         /* We trust the pm timer based calibration */
         if (!pm_referenced) {
                 apic_printk(APIC_VERBOSE, "... verify APIC timer\n");
@@ -543,22 +514,55 @@ void __init setup_boot_APIC_clock(void)
         if (!local_apic_timer_verify_ok) {
                 printk(KERN_WARNING
                        "APIC timer disabled due to verification failure.\n");
+                       return -1;
+       }
+
+       return 0;
+}
+
+/*
+ * Setup the boot APIC
+ *
+ * Calibrate and verify the result.
+ */
+void __init setup_boot_APIC_clock(void)
+{
+       /*
+        * The local apic timer can be disabled via the kernel
+        * commandline or from the CPU detection code. Register the lapic
+        * timer as a dummy clock event source on SMP systems, so the
+        * broadcast mechanism is used. On UP systems simply ignore it.
+        */
+       if (local_apic_timer_disabled) {
                 /* No broadcast on UP ! */
-               if (num_possible_cpus() == 1)
-                       return;
-       } else {
-               /*
-                * If nmi_watchdog is set to IO_APIC, we need the
-                * PIT/HPET going.  Otherwise register lapic as a dummy
-                * device.
-                */
-               if (nmi_watchdog != NMI_IO_APIC)
-                       lapic_clockevent.features &= ~CLOCK_EVT_FEAT_DUMMY;
-               else
-                       printk(KERN_WARNING "APIC timer registered as dummy,"
-                               " due to nmi_watchdog=%d!\n", nmi_watchdog);
+               if (num_possible_cpus() > 1) {
+                       lapic_clockevent.mult = 1;
+                       setup_APIC_timer();
+               }
+               return;
         }
  
+       apic_printk(APIC_VERBOSE, "Using local APIC timer interrupts.\n"
+                   "calibrating APIC timer ...\n");
+
+       if (calibrate_APIC_clock()) {
+               /* No broadcast on UP ! */
+               if (num_possible_cpus() > 1)
+                       setup_APIC_timer();
+               return;
+       }
+
+       /*
+        * If nmi_watchdog is set to IO_APIC, we need the
+        * PIT/HPET going.  Otherwise register lapic as a dummy
+        * device.
+        */
+       if (nmi_watchdog != NMI_IO_APIC)
+               lapic_clockevent.features &= ~CLOCK_EVT_FEAT_DUMMY;
+       else
+               printk(KERN_WARNING "APIC timer registered as dummy,"
+                       " due to nmi_watchdog=%d!\n", nmi_watchdog);
+
         /* Setup the lapic or request the broadcast */
         setup_APIC_timer();
  }
@@ -693,44 +697,44 @@ void clear_local_APIC(void)
          */
         if (maxlvt >= 3) {
                 v = ERROR_APIC_VECTOR; /* any non-zero vector will do */
-               apic_write_around(APIC_LVTERR, v | APIC_LVT_MASKED);
+               apic_write(APIC_LVTERR, v | APIC_LVT_MASKED);
         }
         /*
          * Careful: we have to set masks only first to deassert
          * any level-triggered sources.
          */
         v = apic_read(APIC_LVTT);
-       apic_write_around(APIC_LVTT, v | APIC_LVT_MASKED);
+       apic_write(APIC_LVTT, v | APIC_LVT_MASKED);
         v = apic_read(APIC_LVT0);
-       apic_write_around(APIC_LVT0, v | APIC_LVT_MASKED);
+       apic_write(APIC_LVT0, v | APIC_LVT_MASKED);
         v = apic_read(APIC_LVT1);
-       apic_write_around(APIC_LVT1, v | APIC_LVT_MASKED);
+       apic_write(APIC_LVT1, v | APIC_LVT_MASKED);
         if (maxlvt >= 4) {
                 v = apic_read(APIC_LVTPC);
-               apic_write_around(APIC_LVTPC, v | APIC_LVT_MASKED);
+               apic_write(APIC_LVTPC, v | APIC_LVT_MASKED);
         }
  
         /* lets not touch this if we didn't frob it */
  #ifdef CONFIG_X86_MCE_P4THERMAL
         if (maxlvt >= 5) {
                 v = apic_read(APIC_LVTTHMR);
-               apic_write_around(APIC_LVTTHMR, v | APIC_LVT_MASKED);
+               apic_write(APIC_LVTTHMR, v | APIC_LVT_MASKED);
         }
  #endif
         /*
          * Clean APIC state for other OSs:
          */
-       apic_write_around(APIC_LVTT, APIC_LVT_MASKED);
-       apic_write_around(APIC_LVT0, APIC_LVT_MASKED);
-       apic_write_around(APIC_LVT1, APIC_LVT_MASKED);
+       apic_write(APIC_LVTT, APIC_LVT_MASKED);
+       apic_write(APIC_LVT0, APIC_LVT_MASKED);
+       apic_write(APIC_LVT1, APIC_LVT_MASKED);
         if (maxlvt >= 3)
-               apic_write_around(APIC_LVTERR, APIC_LVT_MASKED);
+               apic_write(APIC_LVTERR, APIC_LVT_MASKED);
         if (maxlvt >= 4)
-               apic_write_around(APIC_LVTPC, APIC_LVT_MASKED);
+               apic_write(APIC_LVTPC, APIC_LVT_MASKED);
  
  #ifdef CONFIG_X86_MCE_P4THERMAL
         if (maxlvt >= 5)
-               apic_write_around(APIC_LVTTHMR, APIC_LVT_MASKED);
+               apic_write(APIC_LVTTHMR, APIC_LVT_MASKED);
  #endif
         /* Integrated APIC (!82489DX) ? */
         if (lapic_is_integrated()) {
@@ -756,7 +760,7 @@ void disable_local_APIC(void)
          */
         value = apic_read(APIC_SPIV);
         value &= ~APIC_SPIV_APIC_ENABLED;
-       apic_write_around(APIC_SPIV, value);
+       apic_write(APIC_SPIV, value);
  
         /*
          * When LAPIC was disabled by the BIOS and enabled by the kernel,
@@ -865,8 +869,8 @@ void __init sync_Arb_IDs(void)
         apic_wait_icr_idle();
  
         apic_printk(APIC_DEBUG, "Synchronizing Arb IDs.\n");
-       apic_write_around(APIC_ICR, APIC_DEST_ALLINC | APIC_INT_LEVELTRIG
-                               | APIC_DM_INIT);
+       apic_write(APIC_ICR,
+                  APIC_DEST_ALLINC | APIC_INT_LEVELTRIG | APIC_DM_INIT);
  }
  
  /*
@@ -902,16 +906,16 @@ void __init init_bsp_APIC(void)
         else
                 value |= APIC_SPIV_FOCUS_DISABLED;
         value |= SPURIOUS_APIC_VECTOR;
-       apic_write_around(APIC_SPIV, value);
+       apic_write(APIC_SPIV, value);
  
         /*
          * Set up the virtual wire mode.
          */
-       apic_write_around(APIC_LVT0, APIC_DM_EXTINT);
+       apic_write(APIC_LVT0, APIC_DM_EXTINT);
         value = APIC_DM_NMI;
         if (!lapic_is_integrated())             /* 82489DX */
                 value |= APIC_LVT_LEVEL_TRIGGER;
-       apic_write_around(APIC_LVT1, value);
+       apic_write(APIC_LVT1, value);
  }
  
  static void __cpuinit lapic_setup_esr(void)
@@ -926,7 +930,7 @@ static void __cpuinit lapic_setup_esr(void)
  
                 /* enables sending errors */
                 value = ERROR_APIC_VECTOR;
-               apic_write_around(APIC_LVTERR, value);
+               apic_write(APIC_LVTERR, value);
                 /*
                  * spec says clear errors after enabling vector.
                  */
@@ -989,7 +993,7 @@ void __cpuinit setup_local_APIC(void)
          */
         value = apic_read(APIC_TASKPRI);
         value &= ~APIC_TPRI_MASK;
-       apic_write_around(APIC_TASKPRI, value);
+       apic_write(APIC_TASKPRI, value);
  
         /*
          * After a crash, we no longer service the interrupts and a pending
@@ -1047,7 +1051,7 @@ void __cpuinit setup_local_APIC(void)
          * Set spurious IRQ vector
          */
         value |= SPURIOUS_APIC_VECTOR;
-       apic_write_around(APIC_SPIV, value);
+       apic_write(APIC_SPIV, value);
  
         /*
          * Set up LVT0, LVT1:
@@ -1069,7 +1073,7 @@ void __cpuinit setup_local_APIC(void)
                 apic_printk(APIC_VERBOSE, "masked ExtINT on CPU#%d\n",
                                 smp_processor_id());
         }
-       apic_write_around(APIC_LVT0, value);
+       apic_write(APIC_LVT0, value);
  
         /*
          * only the BP should see the LINT1 NMI signal, obviously.
@@ -1080,7 +1084,7 @@ void __cpuinit setup_local_APIC(void)
                 value = APIC_DM_NMI | APIC_LVT_MASKED;
         if (!integrated)                /* 82489DX */
                 value |= APIC_LVT_LEVEL_TRIGGER;
-       apic_write_around(APIC_LVT1, value);
+       apic_write(APIC_LVT1, value);
  }
  
  void __cpuinit end_local_APIC_setup(void)
@@ -1091,7 +1095,7 @@ void __cpuinit end_local_APIC_setup(void)
         /* Disable the local apic timer */
         value = apic_read(APIC_LVTT);
         value |= (APIC_LVT_MASKED | LOCAL_TIMER_VECTOR);
-       apic_write_around(APIC_LVTT, value);
+       apic_write(APIC_LVTT, value);
  
         setup_apic_nmi_watchdog(NULL);
         apic_pm_activate();
@@ -1214,9 +1218,6 @@ int apic_version[MAX_APICS];
  
  int __init APIC_init_uniprocessor(void)
  {
-       if (disable_apic)
-               clear_cpu_cap(&boot_cpu_data, X86_FEATURE_APIC);
-
         if (!smp_found_config && !cpu_has_apic)
                 return -1;
  
@@ -1419,7 +1420,7 @@ void disconnect_bsp_APIC(int virt_wire_setup)
                 value &= ~APIC_VECTOR_MASK;
                 value |= APIC_SPIV_APIC_ENABLED;
                 value |= 0xf;
-               apic_write_around(APIC_SPIV, value);
+               apic_write(APIC_SPIV, value);
  
                 if (!virt_wire_setup) {
                         /*
@@ -1432,10 +1433,10 @@ void disconnect_bsp_APIC(int virt_wire_setup)
                                 APIC_LVT_LEVEL_TRIGGER | APIC_LVT_MASKED);
                         value |= APIC_LVT_REMOTE_IRR | APIC_SEND_PENDING;
                         value = SET_APIC_DELIVERY_MODE(value, APIC_MODE_EXTINT);
-                       apic_write_around(APIC_LVT0, value);
+                       apic_write(APIC_LVT0, value);
                 } else {
                         /* Disable LVT0 */
-                       apic_write_around(APIC_LVT0, APIC_LVT_MASKED);
+                       apic_write(APIC_LVT0, APIC_LVT_MASKED);
                 }
  
                 /*
@@ -1449,7 +1450,7 @@ void disconnect_bsp_APIC(int virt_wire_setup)
                         APIC_LVT_LEVEL_TRIGGER | APIC_LVT_MASKED);
                 value |= APIC_LVT_REMOTE_IRR | APIC_SEND_PENDING;
                 value = SET_APIC_DELIVERY_MODE(value, APIC_MODE_NMI);
-               apic_write_around(APIC_LVT1, value);
+               apic_write(APIC_LVT1, value);
         }
  }
  
@@ -1700,7 +1701,7 @@ early_param("lapic", parse_lapic);
  static int __init parse_nolapic(char *arg)
  {
         disable_apic = 1;
-       clear_cpu_cap(&boot_cpu_data, X86_FEATURE_APIC);
+       setup_clear_cpu_cap(X86_FEATURE_APIC);
         return 0;
  }
  early_param("nolapic", parse_nolapic);
diff --git a/arch/x86/kernel/apic_64.c b/arch/x86/kernel/apic_64.c

index 1e3d32e27c14c23a8d48d1dc6bcf8b30faeefae7..7f1f030da7ee4c048990eecf9dc00229472028ca 100644 (file)
--- a/arch/x86/kernel/apic_64.c
+++ b/arch/x86/kernel/apic_64.c
@@ -54,7 +54,7 @@ EXPORT_SYMBOL_GPL(local_apic_timer_c2_ok);
  /*
   * Debug level, exported for io_apic.c
   */
-int apic_verbosity;
+unsigned int apic_verbosity;
  
  /* Have we found an MP table */
  int smp_found_config;
@@ -314,7 +314,7 @@ static void setup_APIC_timer(void)
  
  #define TICK_COUNT 100000000
  
-static void __init calibrate_APIC_clock(void)
+static int __init calibrate_APIC_clock(void)
  {
         unsigned apic, apic_start;
         unsigned long tsc, tsc_start;
@@ -368,6 +368,17 @@ static void __init calibrate_APIC_clock(void)
                 clockevent_delta2ns(0xF, &lapic_clockevent);
  
         calibration_result = result / HZ;
+
+       /*
+        * Do a sanity check on the APIC calibration result
+        */
+       if (calibration_result < (1000000 / HZ)) {
+               printk(KERN_WARNING
+                       "APIC frequency too slow, disabling apic timer\n");
+               return -1;
+       }
+
+       return 0;
  }
  
  /*
@@ -394,14 +405,7 @@ void __init setup_boot_APIC_clock(void)
         }
  
         printk(KERN_INFO "Using local APIC timer interrupts.\n");
-       calibrate_APIC_clock();
-
-       /*
-        * Do a sanity check on the APIC calibration result
-        */
-       if (calibration_result < (1000000 / HZ)) {
-               printk(KERN_WARNING
-                      "APIC frequency too slow, disabling apic timer\n");
+       if (calibrate_APIC_clock()) {
                 /* No broadcast on UP ! */
                 if (num_possible_cpus() > 1)
                         setup_APIC_timer();
@@ -1337,7 +1341,7 @@ early_param("apic", apic_set_verbosity);
  static __init int setup_disableapic(char *str)
  {
         disable_apic = 1;
-       clear_cpu_cap(&boot_cpu_data, X86_FEATURE_APIC);
+       setup_clear_cpu_cap(X86_FEATURE_APIC);
         return 0;
  }
  early_param("disableapic", setup_disableapic);
diff --git a/arch/x86/kernel/asm-offsets_64.c b/arch/x86/kernel/asm-offsets_64.c

index bacf5deeec2d13a16a675e2b6bc44e8f53ed232c..aa89387006fe3a730b9b7eba9b7b55cc3c1beb4d 100644 (file)
--- a/arch/x86/kernel/asm-offsets_64.c
+++ b/arch/x86/kernel/asm-offsets_64.c
@@ -18,6 +18,8 @@
  #include <asm/ia32.h>
  #include <asm/bootparam.h>
  
+#include <xen/interface/xen.h>
+
  #define __NO_STUBS 1
  #undef __SYSCALL
  #undef _ASM_X86_64_UNISTD_H_
@@ -131,5 +133,14 @@ int main(void)
         OFFSET(BP_loadflags, boot_params, hdr.loadflags);
         OFFSET(BP_hardware_subarch, boot_params, hdr.hardware_subarch);
         OFFSET(BP_version, boot_params, hdr.version);
+
+       BLANK();
+       DEFINE(PAGE_SIZE_asm, PAGE_SIZE);
+#ifdef CONFIG_XEN
+       BLANK();
+       OFFSET(XEN_vcpu_info_mask, vcpu_info, evtchn_upcall_mask);
+       OFFSET(XEN_vcpu_info_pending, vcpu_info, evtchn_upcall_pending);
+#undef ENTRY
+#endif
         return 0;
  }
diff --git a/arch/x86/kernel/bios_uv.c b/arch/x86/kernel/bios_uv.c

new file mode 100644 (file)

index 0000000..c639bd5
--- /dev/null
+++ b/arch/x86/kernel/bios_uv.c
@@ -0,0 +1,48 @@
+/*
+ * BIOS run time interface routines.
+ *
+ *  Copyright (c) 2008 Silicon Graphics, Inc.  All Rights Reserved.
+ *
+ *  This program is free software; you can redistribute it and/or modify
+ *  it under the terms of the GNU General Public License as published by
+ *  the Free Software Foundation; either version 2 of the License, or
+ *  (at your option) any later version.
+ *
+ *  This program is distributed in the hope that it will be useful,
+ *  but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *  GNU General Public License for more details.
+ *
+ *  You should have received a copy of the GNU General Public License
+ *  along with this program; if not, write to the Free Software
+ *  Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307 USA
+ */
+
+#include <asm/uv/bios.h>
+
+const char *
+x86_bios_strerror(long status)
+{
+       /* Map a BIOS call status code onto a constant description string. */
+       switch (status) {
+       case  0: return "Call completed without error";
+       case -1: return "Not implemented";
+       case -2: return "Invalid argument";
+       case -3: return "Call completed with error";
+       default: break;
+       }
+       return "Unknown BIOS status code";
+}
+
+long
+x86_bios_freq_base(unsigned long which, unsigned long *ticks_per_second,
+                  unsigned long *drift_info)
+{
+       struct uv_bios_retval rv;       /* result block handed to BIOS_CALL */
+
+       BIOS_CALL(rv, BIOS_FREQ_BASE, which, 0, 0, 0, 0, 0, 0);
+       *ticks_per_second = rv.v0;      /* stored whatever rv.status says */
+       *drift_info = rv.v1;
+       return rv.status;
+}
+EXPORT_SYMBOL_GPL(x86_bios_freq_base);
diff --git a/arch/x86/kernel/cpu/amd.c b/arch/x86/kernel/cpu/amd.c

index 81a07ca65d4487d7f3133619210d287238ac411c..cae9cabc3031f1e3a2a3d6f8085d20b557026ac3 100644 (file)
--- a/arch/x86/kernel/cpu/amd.c
+++ b/arch/x86/kernel/cpu/amd.c
@@ -24,8 +24,6 @@
  extern void vide(void);
  __asm__(".align 4\nvide: ret");
  
-int force_mwait __cpuinitdata;
-
  static void __cpuinit early_init_amd(struct cpuinfo_x86 *c)
  {
         if (cpuid_eax(0x80000000) >= 0x80000007) {
diff --git a/arch/x86/kernel/cpu/amd_64.c b/arch/x86/kernel/cpu/amd_64.c

index 7c36fb8a28d46455c2a51b8adabc8511e1f9386c..d1692b2a41ffac4bf6b9423d38af42741a847b36 100644 (file)
--- a/arch/x86/kernel/cpu/amd_64.c
+++ b/arch/x86/kernel/cpu/amd_64.c
@@ -115,6 +115,8 @@ static void __cpuinit early_init_amd(struct cpuinfo_x86 *c)
         /* c->x86_power is 8000_0007 edx. Bit 8 is constant TSC */
         if (c->x86_power & (1<<8))
                 set_cpu_cap(c, X86_FEATURE_CONSTANT_TSC);
+
+       set_cpu_cap(c, X86_FEATURE_SYSCALL32);
  }
  
  static void __cpuinit init_amd(struct cpuinfo_x86 *c)
diff --git a/arch/x86/kernel/cpu/bugs.c b/arch/x86/kernel/cpu/bugs.c

index 1b1c56bb338f7de23f8ee65f4e53f155c0cc294c..c9b58a806e852d3d4a2ff96e0f48c0e737cff80f 100644 (file)
--- a/arch/x86/kernel/cpu/bugs.c
+++ b/arch/x86/kernel/cpu/bugs.c
@@ -131,13 +131,7 @@ static void __init check_popad(void)
   *   (for due to lack of "invlpg" and working WP on a i386)
   * - In order to run on anything without a TSC, we need to be
   *   compiled for a i486.
- * - In order to support the local APIC on a buggy Pentium machine,
- *   we need to be compiled with CONFIG_X86_GOOD_APIC disabled,
- *   which happens implicitly if compiled for a Pentium or lower
- *   (unless an advanced selection of CPU features is used) as an
- *   otherwise config implies a properly working local APIC without
- *   the need to do extra reads from the APIC.
-*/
+ */
  
  static void __init check_config(void)
  {
@@ -151,21 +145,6 @@ static void __init check_config(void)
         if (boot_cpu_data.x86 == 3)
                 panic("Kernel requires i486+ for 'invlpg' and other features");
  #endif
-
-/*
- * If we were told we had a good local APIC, check for buggy Pentia,
- * i.e. all B steppings and the C2 stepping of P54C when using their
- * integrated APIC (see 11AP erratum in "Pentium Processor
- * Specification Update").
- */
-#if defined(CONFIG_X86_LOCAL_APIC) && defined(CONFIG_X86_GOOD_APIC)
-       if (boot_cpu_data.x86_vendor == X86_VENDOR_INTEL
-           && cpu_has_apic
-           && boot_cpu_data.x86 == 5
-           && boot_cpu_data.x86_model == 2
-           && (boot_cpu_data.x86_mask < 6 || boot_cpu_data.x86_mask == 11))
-               panic("Kernel compiled for PMMX+, assumes a local APIC without the read-before-write bug!");
-#endif
  }
  
  
diff --git a/arch/x86/kernel/cpu/common_64.c b/arch/x86/kernel/cpu/common_64.c

index 7b8cc72feb40e3ed8bfd02437fe3b6324b024f67..dd6e3f15017eb87b04885fd5164681bbdd9a3073 100644 (file)
--- a/arch/x86/kernel/cpu/common_64.c
+++ b/arch/x86/kernel/cpu/common_64.c
@@ -7,15 +7,13 @@
  #include <linux/module.h>
  #include <linux/kgdb.h>
  #include <linux/topology.h>
-#include <linux/string.h>
  #include <linux/delay.h>
  #include <linux/smp.h>
-#include <linux/module.h>
  #include <linux/percpu.h>
-#include <asm/processor.h>
  #include <asm/i387.h>
  #include <asm/msr.h>
  #include <asm/io.h>
+#include <asm/linkage.h>
  #include <asm/mmu_context.h>
  #include <asm/mtrr.h>
  #include <asm/mce.h>
@@ -305,7 +303,6 @@ static void __cpuinit early_identify_cpu(struct cpuinfo_x86 *c)
                         c->x86_capability[2] = cpuid_edx(0x80860001);
         }
  
-       c->extended_cpuid_level = cpuid_eax(0x80000000);
         if (c->extended_cpuid_level >= 0x80000007)
                 c->x86_power = cpuid_edx(0x80000007);
  
@@ -316,18 +313,11 @@ static void __cpuinit early_identify_cpu(struct cpuinfo_x86 *c)
                 c->x86_phys_bits = eax & 0xff;
         }
  
-       /* Assume all 64-bit CPUs support 32-bit syscall */
-       set_cpu_cap(c, X86_FEATURE_SYSCALL32);
-
         if (c->x86_vendor != X86_VENDOR_UNKNOWN &&
             cpu_devs[c->x86_vendor]->c_early_init)
                 cpu_devs[c->x86_vendor]->c_early_init(c);
  
         validate_pat_support(c);
-
-       /* early_param could clear that, but recall get it set again */
-       if (disable_apic)
-               clear_cpu_cap(c, X86_FEATURE_APIC);
  }
  
  /*
@@ -517,8 +507,7 @@ void pda_init(int cpu)
  }
  
  char boot_exception_stacks[(N_EXCEPTION_STACKS - 1) * EXCEPTION_STKSZ +
-                          DEBUG_STKSZ]
-__attribute__((section(".bss.page_aligned")));
+                          DEBUG_STKSZ] __page_aligned_bss;
  
  extern asmlinkage void ignore_sysret(void);
  
diff --git a/arch/x86/kernel/cpu/intel.c b/arch/x86/kernel/cpu/intel.c

index 70609efdf1da3ae7fea3a311ffbb1639e3f5675e..b75f2569b8f8ba1940d55ce3616f25a91c7c6d75 100644 (file)
--- a/arch/x86/kernel/cpu/intel.c
+++ b/arch/x86/kernel/cpu/intel.c
@@ -227,6 +227,16 @@ static void __cpuinit init_intel(struct cpuinfo_x86 *c)
         if (cpu_has_bts)
                 ds_init_intel(c);
  
+       /*
+        * See if we have a good local APIC by checking for buggy Pentia,
+        * i.e. all B steppings and the C2 stepping of P54C when using their
+        * integrated APIC (see 11AP erratum in "Pentium Processor
+        * Specification Update").
+        */
+       if (cpu_has_apic && (c->x86<<8 | c->x86_model<<4) == 0x520 &&
+           (c->x86_mask < 0x6 || c->x86_mask == 0xb))
+               set_cpu_cap(c, X86_FEATURE_11AP);
+
  #ifdef CONFIG_X86_NUMAQ
         numaq_tsc_disable();
  #endif
diff --git a/arch/x86/kernel/cpu/mcheck/p4.c b/arch/x86/kernel/cpu/mcheck/p4.c

index eef001ad3bdee23a5a5585ce75dd7be4378f40ff..9b60fce09f758d5af0520bccb61445a99aaad693 100644 (file)
--- a/arch/x86/kernel/cpu/mcheck/p4.c
+++ b/arch/x86/kernel/cpu/mcheck/p4.c
@@ -102,7 +102,7 @@ static void intel_init_thermal(struct cpuinfo_x86 *c)
         /* The temperature transition interrupt handler setup */
         h = THERMAL_APIC_VECTOR;                /* our delivery vector */
         h |= (APIC_DM_FIXED | APIC_LVT_MASKED); /* we'll mask till we're ready */
-       apic_write_around(APIC_LVTTHMR, h);
+       apic_write(APIC_LVTTHMR, h);
  
         rdmsr(MSR_IA32_THERM_INTERRUPT, l, h);
         wrmsr(MSR_IA32_THERM_INTERRUPT, l | 0x03 , h);
@@ -114,7 +114,7 @@ static void intel_init_thermal(struct cpuinfo_x86 *c)
         wrmsr(MSR_IA32_MISC_ENABLE, l | (1<<3), h);
  
         l = apic_read(APIC_LVTTHMR);
-       apic_write_around(APIC_LVTTHMR, l & ~APIC_LVT_MASKED);
+       apic_write(APIC_LVTTHMR, l & ~APIC_LVT_MASKED);
         printk(KERN_INFO "CPU%d: Thermal monitoring enabled\n", cpu);
  
         /* enable thermal throttle processing */
diff --git a/arch/x86/kernel/e820.c b/arch/x86/kernel/e820.c

index 28c29180b3807e94f14b38a090ebb8d6deb0e9de..9af89078f7bb0cb2b6ce7ed958c9483b24d4402e 100644 (file)
--- a/arch/x86/kernel/e820.c
+++ b/arch/x86/kernel/e820.c
@@ -877,7 +877,8 @@ void __init early_res_to_bootmem(u64 start, u64 end)
         for (i = 0; i < MAX_EARLY_RES && early_res[i].end; i++)
                 count++;
  
-       printk(KERN_INFO "(%d early reservations) ==> bootmem\n", count);
+       printk(KERN_INFO "(%d early reservations) ==> bootmem [%010llx - %010llx]\n",
+                        count, start, end);
         for (i = 0; i < count; i++) {
                 struct early_res *r = &early_res[i];
                 printk(KERN_INFO "  #%d [%010llx - %010llx] %16s", i,
@@ -1298,11 +1299,6 @@ void __init e820_reserve_resources(void)
         }
  }
  
-/*
- * Non-standard memory setup can be specified via this quirk:
- */
-char * (*arch_memory_setup_quirk)(void);
-
  char *__init default_machine_specific_memory_setup(void)
  {
         char *who = "BIOS-e820";
@@ -1343,8 +1339,8 @@ char *__init default_machine_specific_memory_setup(void)
  
  char *__init __attribute__((weak)) machine_specific_memory_setup(void)
  {
-       if (arch_memory_setup_quirk) {
-               char *who = arch_memory_setup_quirk();
+       if (x86_quirks->arch_memory_setup) {
+               char *who = x86_quirks->arch_memory_setup();
  
                 if (who)
                         return who;
@@ -1367,24 +1363,3 @@ void __init setup_memory_map(void)
         printk(KERN_INFO "BIOS-provided physical RAM map:\n");
         e820_print_map(who);
  }
-
-#ifdef CONFIG_X86_64
-int __init arch_get_ram_range(int slot, u64 *addr, u64 *size)
-{
-       int i;
-
-       if (slot < 0 || slot >= e820.nr_map)
-               return -1;
-       for (i = slot; i < e820.nr_map; i++) {
-               if (e820.map[i].type != E820_RAM)
-                       continue;
-               break;
-       }
-       if (i == e820.nr_map || e820.map[i].addr > (max_pfn << PAGE_SHIFT))
-               return -1;
-       *addr = e820.map[i].addr;
-       *size = min_t(u64, e820.map[i].size + e820.map[i].addr,
-               max_pfn << PAGE_SHIFT) - *addr;
-       return i + 1;
-}
-#endif
diff --git a/arch/x86/kernel/early-quirks.c b/arch/x86/kernel/early-quirks.c

index a0e11c0cc872f03b1ca9b85fd9521f19fa8a3269..4353cf5e6fac8b4d329e18def887dadd3f55bbb8 100644 (file)
--- a/arch/x86/kernel/early-quirks.c
+++ b/arch/x86/kernel/early-quirks.c
@@ -16,10 +16,7 @@
  #include <asm/dma.h>
  #include <asm/io_apic.h>
  #include <asm/apic.h>
-
-#ifdef CONFIG_GART_IOMMU
-#include <asm/gart.h>
-#endif
+#include <asm/iommu.h>
  
  static void __init fix_hypertransport_config(int num, int slot, int func)
  {
diff --git a/arch/x86/kernel/entry_32.S b/arch/x86/kernel/entry_32.S

index 6bc07f0f1202eeb1eaac0b55064acad7f355371a..cdfd94cc6b14e4fd1c06c058904e13c2f6575810 100644 (file)
--- a/arch/x86/kernel/entry_32.S
+++ b/arch/x86/kernel/entry_32.S
@@ -332,7 +332,7 @@ sysenter_past_esp:
         GET_THREAD_INFO(%ebp)
  
         /* Note, _TIF_SECCOMP is bit number 8, and so it needs testw and not testb */
-       testw $(_TIF_SYSCALL_EMU|_TIF_SYSCALL_TRACE|_TIF_SECCOMP|_TIF_SYSCALL_AUDIT),TI_flags(%ebp)
+       testw $_TIF_WORK_SYSCALL_ENTRY,TI_flags(%ebp)
         jnz syscall_trace_entry
         cmpl $(nr_syscalls), %eax
         jae syscall_badsys
@@ -370,7 +370,7 @@ ENTRY(system_call)
         GET_THREAD_INFO(%ebp)
                                         # system call tracing in operation / emulation
         /* Note, _TIF_SECCOMP is bit number 8, and so it needs testw and not testb */
-       testw $(_TIF_SYSCALL_EMU|_TIF_SYSCALL_TRACE|_TIF_SECCOMP|_TIF_SYSCALL_AUDIT),TI_flags(%ebp)
+       testw $_TIF_WORK_SYSCALL_ENTRY,TI_flags(%ebp)
         jnz syscall_trace_entry
         cmpl $(nr_syscalls), %eax
         jae syscall_badsys
@@ -383,10 +383,6 @@ syscall_exit:
                                         # setting need_resched or sigpending
                                         # between sampling and the iret
         TRACE_IRQS_OFF
-       testl $X86_EFLAGS_TF,PT_EFLAGS(%esp)    # If tracing set singlestep flag on exit
-       jz no_singlestep
-       orl $_TIF_SINGLESTEP,TI_flags(%ebp)
-no_singlestep:
         movl TI_flags(%ebp), %ecx
         testw $_TIF_ALLWORK_MASK, %cx   # current->work
         jne syscall_exit_work
@@ -514,12 +510,8 @@ END(work_pending)
  syscall_trace_entry:
         movl $-ENOSYS,PT_EAX(%esp)
         movl %esp, %eax
-       xorl %edx,%edx
-       call do_syscall_trace
-       cmpl $0, %eax
-       jne resume_userspace            # ret != 0 -> running under PTRACE_SYSEMU,
-                                       # so must skip actual syscall
-       movl PT_ORIG_EAX(%esp), %eax
+       call syscall_trace_enter
+       /* What it returned is what we'll actually use.  */
         cmpl $(nr_syscalls), %eax
         jnae syscall_call
         jmp syscall_exit
@@ -528,14 +520,13 @@ END(syscall_trace_entry)
         # perform syscall exit tracing
         ALIGN
  syscall_exit_work:
-       testb $(_TIF_SYSCALL_TRACE|_TIF_SYSCALL_AUDIT|_TIF_SINGLESTEP), %cl
+       testb $_TIF_WORK_SYSCALL_EXIT, %cl
         jz work_pending
         TRACE_IRQS_ON
-       ENABLE_INTERRUPTS(CLBR_ANY)     # could let do_syscall_trace() call
+       ENABLE_INTERRUPTS(CLBR_ANY)     # could let syscall_trace_leave() call
                                         # schedule() instead
         movl %esp, %eax
-       movl $1, %edx
-       call do_syscall_trace
+       call syscall_trace_leave
         jmp resume_userspace
  END(syscall_exit_work)
         CFI_ENDPROC
@@ -1024,6 +1015,7 @@ ENDPROC(kernel_thread_helper)
  ENTRY(xen_sysenter_target)
         RING0_INT_FRAME
         addl $5*4, %esp         /* remove xen-provided frame */
+       CFI_ADJUST_CFA_OFFSET -5*4
         jmp sysenter_past_esp
         CFI_ENDPROC
  
diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S

index ae63e584c340cbafd342af95e18bce946b474525..8410e26f418337d7fc37d77dba6a1f0e60e7f525 100644 (file)
--- a/arch/x86/kernel/entry_64.S
+++ b/arch/x86/kernel/entry_64.S
@@ -349,8 +349,7 @@ ENTRY(system_call_after_swapgs)
         movq  %rcx,RIP-ARGOFFSET(%rsp)
         CFI_REL_OFFSET rip,RIP-ARGOFFSET
         GET_THREAD_INFO(%rcx)
-       testl $(_TIF_SYSCALL_TRACE|_TIF_SYSCALL_AUDIT|_TIF_SECCOMP), \
-               TI_flags(%rcx)
+       testl $_TIF_WORK_SYSCALL_ENTRY,TI_flags(%rcx)
         jnz tracesys
         cmpq $__NR_syscall_max,%rax
         ja badsys
@@ -430,7 +429,12 @@ tracesys:
         FIXUP_TOP_OF_STACK %rdi
         movq %rsp,%rdi
         call syscall_trace_enter
-       LOAD_ARGS ARGOFFSET  /* reload args from stack in case ptrace changed it */
+       /*
+        * Reload arg registers from stack in case ptrace changed them.
+        * We don't reload %rax because syscall_trace_enter() returned
+        * the value it wants us to use in the table lookup.
+        */
+       LOAD_ARGS ARGOFFSET, 1
         RESTORE_REST
         cmpq $__NR_syscall_max,%rax
         ja   int_ret_from_sys_call      /* RAX(%rsp) set to -ENOSYS above */
@@ -483,7 +487,7 @@ int_very_careful:
         ENABLE_INTERRUPTS(CLBR_NONE)
         SAVE_REST
         /* Check for syscall exit trace */      
-       testl $(_TIF_SYSCALL_TRACE|_TIF_SYSCALL_AUDIT|_TIF_SINGLESTEP),%edx
+       testl $_TIF_WORK_SYSCALL_EXIT,%edx
         jz int_signal
         pushq %rdi
         CFI_ADJUST_CFA_OFFSET 8
@@ -491,7 +495,7 @@ int_very_careful:
         call syscall_trace_leave
         popq %rdi
         CFI_ADJUST_CFA_OFFSET -8
-       andl $~(_TIF_SYSCALL_TRACE|_TIF_SYSCALL_AUDIT|_TIF_SINGLESTEP),%edi
+       andl $~(_TIF_WORK_SYSCALL_EXIT|_TIF_SYSCALL_EMU),%edi
         jmp int_restore_rest
         
  int_signal:
@@ -1189,6 +1193,7 @@ END(device_not_available)
         /* runs on exception stack */
  KPROBE_ENTRY(debug)
         INTR_FRAME
+       PARAVIRT_ADJUST_EXCEPTION_FRAME
         pushq $0
         CFI_ADJUST_CFA_OFFSET 8         
         paranoidentry do_debug, DEBUG_STACK
@@ -1198,6 +1203,7 @@ KPROBE_END(debug)
         /* runs on exception stack */   
  KPROBE_ENTRY(nmi)
         INTR_FRAME
+       PARAVIRT_ADJUST_EXCEPTION_FRAME
         pushq $-1
         CFI_ADJUST_CFA_OFFSET 8
         paranoidentry do_nmi, 0, 0
@@ -1211,6 +1217,7 @@ KPROBE_END(nmi)
  
  KPROBE_ENTRY(int3)
         INTR_FRAME
+       PARAVIRT_ADJUST_EXCEPTION_FRAME
         pushq $0
         CFI_ADJUST_CFA_OFFSET 8
         paranoidentry do_int3, DEBUG_STACK
@@ -1237,6 +1244,7 @@ END(coprocessor_segment_overrun)
         /* runs on exception stack */
  ENTRY(double_fault)
         XCPT_FRAME
+       PARAVIRT_ADJUST_EXCEPTION_FRAME
         paranoidentry do_double_fault
         jmp paranoid_exit1
         CFI_ENDPROC
@@ -1253,6 +1261,7 @@ END(segment_not_present)
         /* runs on exception stack */
  ENTRY(stack_segment)
         XCPT_FRAME
+       PARAVIRT_ADJUST_EXCEPTION_FRAME
         paranoidentry do_stack_segment
         jmp paranoid_exit1
         CFI_ENDPROC
@@ -1278,6 +1287,7 @@ END(spurious_interrupt_bug)
         /* runs on exception stack */
  ENTRY(machine_check)
         INTR_FRAME
+       PARAVIRT_ADJUST_EXCEPTION_FRAME
         pushq $0
         CFI_ADJUST_CFA_OFFSET 8 
         paranoidentry do_machine_check
@@ -1312,3 +1322,103 @@ KPROBE_ENTRY(ignore_sysret)
         sysret
         CFI_ENDPROC
  ENDPROC(ignore_sysret)
+
+#ifdef CONFIG_XEN
+ENTRY(xen_hypervisor_callback)
+       zeroentry xen_do_hypervisor_callback    /* hand off to xen_do_hypervisor_callback via the zeroentry macro */
+END(xen_hypervisor_callback)
+
+/*
+# A note on the "critical region" in our callback handler.
+# We want to avoid stacking callback handlers due to events occurring
+# during handling of the last event. To do this, we keep events disabled
+# until we've done all processing. HOWEVER, we must enable events before
+# popping the stack frame (can't be done atomically) and so it would still
+# be possible to get enough handler activations to overflow the stack.
+# Although unlikely, bugs of that kind are hard to track down, so we'd
+# like to avoid the possibility.
+# So, on entry to the handler we detect whether we interrupted an
+# existing activation in its critical region -- if so, we pop the current
+# activation and restart the handler using the previous one.
+*/
+ENTRY(xen_do_hypervisor_callback)   # xen_do_hypervisor_callback(struct pt_regs *)
+       CFI_STARTPROC
+/* Since we don't modify %rdi, xen_evtchn_do_upcall(struct pt_regs *) will
+   see the correct pointer to the pt_regs */
+       movq %rdi, %rsp            # we don't return, adjust the stack frame
+       CFI_ENDPROC
+       CFI_DEFAULT_STACK
+11:    incl %gs:pda_irqcount           # ZF is set iff the count just became 0
+       movq %rsp,%rbp                  # keep the old %rsp; movq leaves flags intact
+       CFI_DEF_CFA_REGISTER rbp
+       cmovzq %gs:pda_irqstackptr,%rsp # if ZF, continue on the stack at pda_irqstackptr
+       pushq %rbp                      # backlink for old unwinder
+       call xen_evtchn_do_upcall
+       popq %rsp                       # reload the %rsp saved by the backlink push
+       CFI_DEF_CFA_REGISTER rsp
+       decl %gs:pda_irqcount           # undo the incl above
+       jmp  error_exit
+       CFI_ENDPROC
+END(xen_do_hypervisor_callback)
+
+/*
+# Hypervisor uses this for application faults while it executes.
+# We get here for two reasons:
+#  1. Fault while reloading DS, ES, FS or GS
+#  2. Fault while executing IRET
+# Category 1 we do not need to fix up as Xen has already reloaded all segment
+# registers that could be reloaded and zeroed the others.
+# Category 2 we fix up by killing the current process. We cannot use the
+# normal Linux return path in this case because if we use the IRET hypercall
+# to pop the stack frame we end up in an infinite loop of failsafe callbacks.
+# We distinguish between categories by comparing each saved segment register
+# with its current contents: any discrepancy means we are in category 1.
+*/
+ENTRY(xen_failsafe_callback)
+       framesz = (RIP-0x30)    /* workaround buggy gas */
+       _frame framesz
+       CFI_REL_OFFSET rcx, 0   /* saved %rcx is at 0(%rsp) */
+       CFI_REL_OFFSET r11, 8   /* saved %r11 is at 8(%rsp) */
+       movw %ds,%cx
+       cmpw %cx,0x10(%rsp)     /* live %ds vs. the slot at 0x10(%rsp) */
+       CFI_REMEMBER_STATE
+       jne 1f
+       movw %es,%cx
+       cmpw %cx,0x18(%rsp)     /* live %es vs. the slot at 0x18(%rsp) */
+       jne 1f
+       movw %fs,%cx
+       cmpw %cx,0x20(%rsp)     /* live %fs vs. the slot at 0x20(%rsp) */
+       jne 1f
+       movw %gs,%cx
+       cmpw %cx,0x28(%rsp)     /* live %gs vs. the slot at 0x28(%rsp) */
+       jne 1f
+       /* All segments match their saved values => Category 2 (Bad IRET). */
+       movq (%rsp),%rcx
+       CFI_RESTORE rcx
+       movq 8(%rsp),%r11
+       CFI_RESTORE r11
+       addq $0x30,%rsp         /* drop the six 8-byte slots (rcx, r11, ds, es, fs, gs) */
+       CFI_ADJUST_CFA_OFFSET -0x30
+       pushq $0                /* NOTE(review): presumably the error-code slot general_protection expects -- confirm */
+       CFI_ADJUST_CFA_OFFSET 8
+       pushq %r11
+       CFI_ADJUST_CFA_OFFSET 8
+       pushq %rcx
+       CFI_ADJUST_CFA_OFFSET 8
+       jmp general_protection
+       CFI_RESTORE_STATE
+1:     /* Segment mismatch => Category 1 (Bad segment). Retry the IRET. */
+       movq (%rsp),%rcx
+       CFI_RESTORE rcx
+       movq 8(%rsp),%r11
+       CFI_RESTORE r11
+       addq $0x30,%rsp         /* drop the same six-slot area as on the category 2 path */
+       CFI_ADJUST_CFA_OFFSET -0x30
+       pushq $0
+       CFI_ADJUST_CFA_OFFSET 8
+       SAVE_ALL
+       jmp error_exit
+       CFI_ENDPROC
+END(xen_failsafe_callback)
+
+#endif /* CONFIG_XEN */
diff --git a/arch/x86/kernel/genx2apic_uv_x.c b/arch/x86/kernel/genx2apic_uv_x.c

index 711f11c30b060dd17bc86be21b439d2068040489..3c3929340692fd0d254593bbc712c365929a8902 100644 (file)
--- a/arch/x86/kernel/genx2apic_uv_x.c
+++ b/arch/x86/kernel/genx2apic_uv_x.c
@@ -24,6 +24,7 @@
  #include <asm/pgtable.h>
  #include <asm/uv/uv_mmrs.h>
  #include <asm/uv/uv_hub.h>
+#include <asm/uv/bios.h>
  
  DEFINE_PER_CPU(struct uv_hub_info_s, __uv_hub_info);
  EXPORT_PER_CPU_SYMBOL_GPL(__uv_hub_info);
@@ -40,6 +41,9 @@ EXPORT_SYMBOL_GPL(uv_cpu_to_blade);
  short uv_possible_blades;
  EXPORT_SYMBOL_GPL(uv_possible_blades);
  
+unsigned long sn_rtc_cycles_per_second;
+EXPORT_SYMBOL(sn_rtc_cycles_per_second);
+
  /* Start with all IRQs pointing to boot CPU.  IRQ balancing will shift them. */
  
  static cpumask_t uv_target_cpus(void)
@@ -272,6 +276,23 @@ static __init void map_mmioh_high(int max_pnode)
                 map_high("MMIOH", mmioh.s.base, shift, map_uc);
  }
  
+static __init void uv_rtc_init(void)
+{
+       long status, ticks_per_sec, drift;      /* drift is filled in by the call but not used here */
+
+       status =
+           x86_bios_freq_base(BIOS_FREQ_BASE_REALTIME_CLOCK, &ticks_per_sec,
+                                       &drift);
+       if (status != 0 || ticks_per_sec < 100000) {
+               printk(KERN_WARNING
+                       "unable to determine platform RTC clock frequency, "
+                       "guessing.\n");
+               /* BIOS call failed or reported < 100000 ticks/sec: fall back to 10^12 / 30000 (33333333) */
+               sn_rtc_cycles_per_second = 1000000000000UL / 30000UL;
+       } else
+               sn_rtc_cycles_per_second = ticks_per_sec;
+}
+
  static __init void uv_system_init(void)
  {
         union uvh_si_addr_map_config_u m_n_config;
@@ -326,6 +347,8 @@ static __init void uv_system_init(void)
         gnode_upper = (((unsigned long)node_id.s.node_id) &
                        ~((1 << n_val) - 1)) << m_val;
  
+       uv_rtc_init();
+
         for_each_present_cpu(cpu) {
                 nid = cpu_to_node(cpu);
                 pnode = uv_apicid_to_pnode(per_cpu(x86_cpu_to_apicid, cpu));
diff --git a/arch/x86/kernel/head64.c b/arch/x86/kernel/head64.c

index c9781982914693cbb6ba98bf7cea94b1a291c8a5..1b318e903bf63f14bd4f514ad8fd5916289bb979 100644 (file)
--- a/arch/x86/kernel/head64.c
+++ b/arch/x86/kernel/head64.c
@@ -39,6 +39,13 @@ static struct x8664_pda *__cpu_pda[NR_CPUS] __initdata;
  static struct x8664_pda *__cpu_pda[NR_CPUS] __read_mostly;
  #endif
  
+void __init x86_64_init_pda(void)
+{
+       _cpu_pda = __cpu_pda;           /* use the static __cpu_pda[] array as the PDA pointer table */
+       cpu_pda(0) = &_boot_cpu_pda;    /* CPU 0's entry is _boot_cpu_pda */
+       pda_init(0);                    /* NOTE(review): presumably relies on cpu_pda(0) set just above -- confirm */
+}
+
  static void __init zap_identity_mappings(void)
  {
         pgd_t *pgd = pgd_offset_k(0UL);
@@ -102,9 +109,7 @@ void __init x86_64_start_kernel(char * real_mode_data)
  
         early_printk("Kernel alive\n");
  
-       _cpu_pda = __cpu_pda;
-       cpu_pda(0) = &_boot_cpu_pda;
-       pda_init(0);
+       x86_64_init_pda();
  
         early_printk("Kernel really alive\n");
  
diff --git a/arch/x86/kernel/head_64.S b/arch/x86/kernel/head_64.S

index b07ac7b217cb161e2b4f35b1edf4677fe32f0d64..db3280afe886f1b97a9f2c383afc4ac80993a060 100644 (file)
--- a/arch/x86/kernel/head_64.S
+++ b/arch/x86/kernel/head_64.S
@@ -407,6 +407,7 @@ ENTRY(phys_base)
         /* This must match the first entry in level2_kernel_pgt */
         .quad   0x0000000000000000
  
+#include "../../x86/xen/xen-head.S"
         
         .section .bss, "aw", @nobits
         .align L1_CACHE_BYTES
diff --git a/arch/x86/kernel/io_apic_32.c b/arch/x86/kernel/io_apic_32.c

index 558abf4c796afa0d7dd7ad2622e3bd42f28e8d39..de9aa0e3a9c51e10df0e85403529e3aa285ad395 100644 (file)
--- a/arch/x86/kernel/io_apic_32.c
+++ b/arch/x86/kernel/io_apic_32.c
@@ -756,7 +756,7 @@ void send_IPI_self(int vector)
         /*
          * Send the IPI. The write to APIC_ICR fires this off.
          */
-       apic_write_around(APIC_ICR, cfg);
+       apic_write(APIC_ICR, cfg);
  }
  #endif /* !CONFIG_SMP */
  
@@ -2030,7 +2030,7 @@ static void mask_lapic_irq(unsigned int irq)
         unsigned long v;
  
         v = apic_read(APIC_LVT0);
-       apic_write_around(APIC_LVT0, v | APIC_LVT_MASKED);
+       apic_write(APIC_LVT0, v | APIC_LVT_MASKED);
  }
  
  static void unmask_lapic_irq(unsigned int irq)
@@ -2038,7 +2038,7 @@ static void unmask_lapic_irq(unsigned int irq)
         unsigned long v;
  
         v = apic_read(APIC_LVT0);
-       apic_write_around(APIC_LVT0, v & ~APIC_LVT_MASKED);
+       apic_write(APIC_LVT0, v & ~APIC_LVT_MASKED);
  }
  
  static struct irq_chip lapic_chip __read_mostly = {
@@ -2168,7 +2168,7 @@ static inline void __init check_timer(void)
          * The AEOI mode will finish them in the 8259A
          * automatically.
          */
-       apic_write_around(APIC_LVT0, APIC_LVT_MASKED | APIC_DM_EXTINT);
+       apic_write(APIC_LVT0, APIC_LVT_MASKED | APIC_DM_EXTINT);
         init_8259A(1);
         timer_ack = (nmi_watchdog == NMI_IO_APIC && !APIC_INTEGRATED(ver));
  
@@ -2177,8 +2177,9 @@ static inline void __init check_timer(void)
         pin2  = ioapic_i8259.pin;
         apic2 = ioapic_i8259.apic;
  
-       printk(KERN_INFO "..TIMER: vector=0x%02X apic1=%d pin1=%d apic2=%d pin2=%d\n",
-               vector, apic1, pin1, apic2, pin2);
+       apic_printk(APIC_QUIET, KERN_INFO "..TIMER: vector=0x%02X "
+                   "apic1=%d pin1=%d apic2=%d pin2=%d\n",
+                   vector, apic1, pin1, apic2, pin2);
  
         /*
          * Some BIOS writers are clueless and report the ExtINTA
@@ -2216,12 +2217,13 @@ static inline void __init check_timer(void)
                 }
                 clear_IO_APIC_pin(apic1, pin1);
                 if (!no_pin1)
-                       printk(KERN_ERR "..MP-BIOS bug: "
-                              "8254 timer not connected to IO-APIC\n");
+                       apic_printk(APIC_QUIET, KERN_ERR "..MP-BIOS bug: "
+                                   "8254 timer not connected to IO-APIC\n");
  
-               printk(KERN_INFO "...trying to set up timer (IRQ0) "
-                      "through the 8259A ... ");
-               printk("\n..... (found pin %d) ...", pin2);
+               apic_printk(APIC_QUIET, KERN_INFO "...trying to set up timer "
+                           "(IRQ0) through the 8259A ...\n");
+               apic_printk(APIC_QUIET, KERN_INFO
+                           "..... (found apic %d pin %d) ...\n", apic2, pin2);
                 /*
                  * legacy devices should be connected to IO APIC #0
                  */
@@ -2230,7 +2232,7 @@ static inline void __init check_timer(void)
                 unmask_IO_APIC_irq(0);
                 enable_8259A_irq(0);
                 if (timer_irq_works()) {
-                       printk("works.\n");
+                       apic_printk(APIC_QUIET, KERN_INFO "....... works.\n");
                         timer_through_8259 = 1;
                         if (nmi_watchdog == NMI_IO_APIC) {
                                 disable_8259A_irq(0);
@@ -2244,44 +2246,47 @@ static inline void __init check_timer(void)
                  */
                 disable_8259A_irq(0);
                 clear_IO_APIC_pin(apic2, pin2);
-               printk(" failed.\n");
+               apic_printk(APIC_QUIET, KERN_INFO "....... failed.\n");
         }
  
         if (nmi_watchdog == NMI_IO_APIC) {
-               printk(KERN_WARNING "timer doesn't work through the IO-APIC - disabling NMI Watchdog!\n");
+               apic_printk(APIC_QUIET, KERN_WARNING "timer doesn't work "
+                           "through the IO-APIC - disabling NMI Watchdog!\n");
                 nmi_watchdog = NMI_NONE;
         }
         timer_ack = 0;
  
-       printk(KERN_INFO "...trying to set up timer as Virtual Wire IRQ...");
+       apic_printk(APIC_QUIET, KERN_INFO
+                   "...trying to set up timer as Virtual Wire IRQ...\n");
  
         lapic_register_intr(0, vector);
-       apic_write_around(APIC_LVT0, APIC_DM_FIXED | vector);   /* Fixed mode */
+       apic_write(APIC_LVT0, APIC_DM_FIXED | vector);  /* Fixed mode */
         enable_8259A_irq(0);
  
         if (timer_irq_works()) {
-               printk(" works.\n");
+               apic_printk(APIC_QUIET, KERN_INFO "..... works.\n");
                 goto out;
         }
         disable_8259A_irq(0);
-       apic_write_around(APIC_LVT0, APIC_LVT_MASKED | APIC_DM_FIXED | vector);
-       printk(" failed.\n");
+       apic_write(APIC_LVT0, APIC_LVT_MASKED | APIC_DM_FIXED | vector);
+       apic_printk(APIC_QUIET, KERN_INFO "..... failed.\n");
  
-       printk(KERN_INFO "...trying to set up timer as ExtINT IRQ...");
+       apic_printk(APIC_QUIET, KERN_INFO
+                   "...trying to set up timer as ExtINT IRQ...\n");
  
         init_8259A(0);
         make_8259A_irq(0);
-       apic_write_around(APIC_LVT0, APIC_DM_EXTINT);
+       apic_write(APIC_LVT0, APIC_DM_EXTINT);
  
         unlock_ExtINT_logic();
  
         if (timer_irq_works()) {
-               printk(" works.\n");
+               apic_printk(APIC_QUIET, KERN_INFO "..... works.\n");
                 goto out;
         }
-       printk(" failed :(.\n");
+       apic_printk(APIC_QUIET, KERN_INFO "..... failed :(.\n");
         panic("IO-APIC + timer doesn't work!  Boot with apic=debug and send a "
-               "report.  Then try booting with the 'noapic' option");
+               "report.  Then try booting with the 'noapic' option.\n");
  out:
         local_irq_restore(flags);
  }
diff --git a/arch/x86/kernel/io_apic_64.c b/arch/x86/kernel/io_apic_64.c

index 6510cde36b3549149eabefa4aaf724c72e5a0959..64a46affd85813c95bcc0f0a8b1dc71a142af7d1 100644 (file)
--- a/arch/x86/kernel/io_apic_64.c
+++ b/arch/x86/kernel/io_apic_64.c
@@ -45,6 +45,7 @@
  #include <asm/proto.h>
  #include <asm/acpi.h>
  #include <asm/dma.h>
+#include <asm/i8259.h>
  #include <asm/nmi.h>
  #include <asm/msidef.h>
  #include <asm/hypertransport.h>
@@ -1696,8 +1697,9 @@ static inline void __init check_timer(void)
         pin2  = ioapic_i8259.pin;
         apic2 = ioapic_i8259.apic;
  
-       apic_printk(APIC_VERBOSE,KERN_INFO "..TIMER: vector=0x%02X apic1=%d pin1=%d apic2=%d pin2=%d\n",
-               cfg->vector, apic1, pin1, apic2, pin2);
+       apic_printk(APIC_QUIET, KERN_INFO "..TIMER: vector=0x%02X "
+                   "apic1=%d pin1=%d apic2=%d pin2=%d\n",
+                   cfg->vector, apic1, pin1, apic2, pin2);
  
         /*
          * Some BIOS writers are clueless and report the ExtINTA
@@ -1735,14 +1737,13 @@ static inline void __init check_timer(void)
                 }
                 clear_IO_APIC_pin(apic1, pin1);
                 if (!no_pin1)
-                       apic_printk(APIC_QUIET,KERN_ERR "..MP-BIOS bug: "
+                       apic_printk(APIC_QUIET, KERN_ERR "..MP-BIOS bug: "
                                     "8254 timer not connected to IO-APIC\n");
  
-               apic_printk(APIC_VERBOSE,KERN_INFO
-                       "...trying to set up timer (IRQ0) "
-                       "through the 8259A ... ");
-               apic_printk(APIC_VERBOSE,"\n..... (found apic %d pin %d) ...",
-                       apic2, pin2);
+               apic_printk(APIC_QUIET, KERN_INFO "...trying to set up timer "
+                           "(IRQ0) through the 8259A ...\n");
+               apic_printk(APIC_QUIET, KERN_INFO
+                           "..... (found apic %d pin %d) ...\n", apic2, pin2);
                 /*
                  * legacy devices should be connected to IO APIC #0
                  */
@@ -1751,7 +1752,7 @@ static inline void __init check_timer(void)
                 unmask_IO_APIC_irq(0);
                 enable_8259A_irq(0);
                 if (timer_irq_works()) {
-                       apic_printk(APIC_VERBOSE," works.\n");
+                       apic_printk(APIC_QUIET, KERN_INFO "....... works.\n");
                         timer_through_8259 = 1;
                         if (nmi_watchdog == NMI_IO_APIC) {
                                 disable_8259A_irq(0);
@@ -1765,29 +1766,32 @@ static inline void __init check_timer(void)
                  */
                 disable_8259A_irq(0);
                 clear_IO_APIC_pin(apic2, pin2);
-               apic_printk(APIC_VERBOSE," failed.\n");
+               apic_printk(APIC_QUIET, KERN_INFO "....... failed.\n");
         }
  
         if (nmi_watchdog == NMI_IO_APIC) {
-               printk(KERN_WARNING "timer doesn't work through the IO-APIC - disabling NMI Watchdog!\n");
+               apic_printk(APIC_QUIET, KERN_WARNING "timer doesn't work "
+                           "through the IO-APIC - disabling NMI Watchdog!\n");
                 nmi_watchdog = NMI_NONE;
         }
  
-       apic_printk(APIC_VERBOSE, KERN_INFO "...trying to set up timer as Virtual Wire IRQ...");
+       apic_printk(APIC_QUIET, KERN_INFO
+                   "...trying to set up timer as Virtual Wire IRQ...\n");
  
         lapic_register_intr(0);
         apic_write(APIC_LVT0, APIC_DM_FIXED | cfg->vector);     /* Fixed mode */
         enable_8259A_irq(0);
  
         if (timer_irq_works()) {
-               apic_printk(APIC_VERBOSE," works.\n");
+               apic_printk(APIC_QUIET, KERN_INFO "..... works.\n");
                 goto out;
         }
         disable_8259A_irq(0);
         apic_write(APIC_LVT0, APIC_LVT_MASKED | APIC_DM_FIXED | cfg->vector);
-       apic_printk(APIC_VERBOSE," failed.\n");
+       apic_printk(APIC_QUIET, KERN_INFO "..... failed.\n");
  
-       apic_printk(APIC_VERBOSE, KERN_INFO "...trying to set up timer as ExtINT IRQ...");
+       apic_printk(APIC_QUIET, KERN_INFO
+                   "...trying to set up timer as ExtINT IRQ...\n");
  
         init_8259A(0);
         make_8259A_irq(0);
@@ -1796,11 +1800,12 @@ static inline void __init check_timer(void)
         unlock_ExtINT_logic();
  
         if (timer_irq_works()) {
-               apic_printk(APIC_VERBOSE," works.\n");
+               apic_printk(APIC_QUIET, KERN_INFO "..... works.\n");
                 goto out;
         }
-       apic_printk(APIC_VERBOSE," failed :(.\n");
-       panic("IO-APIC + timer doesn't work! Try using the 'noapic' kernel parameter\n");
+       apic_printk(APIC_QUIET, KERN_INFO "..... failed :(.\n");
+       panic("IO-APIC + timer doesn't work!  Boot with apic=debug and send a "
+               "report.  Then try booting with the 'noapic' option.\n");
  out:
         local_irq_restore(flags);
  }
diff --git a/arch/x86/kernel/io_delay.c b/arch/x86/kernel/io_delay.c

index 5921e5f0a64027745fa2ac3e5f695ceaca9ddeae..1c3a66a67f83d3bb6d7eec2d5e4cfb1560660720 100644 (file)
--- a/arch/x86/kernel/io_delay.c
+++ b/arch/x86/kernel/io_delay.c
@@ -103,6 +103,9 @@ void __init io_delay_init(void)
  
  static int __init io_delay_param(char *s)
  {
+       if (!s)
+               return -EINVAL;
+
         if (!strcmp(s, "0x80"))
                 io_delay_type = CONFIG_IO_DELAY_TYPE_0X80;
         else if (!strcmp(s, "0xed"))
diff --git a/arch/x86/kernel/ipi.c b/arch/x86/kernel/ipi.c

index 9d98cda39ad9ea006729b999daec04c0e1e32d72..3f7537b669d312e08a6d92e92787feb0ae903549 100644 (file)
--- a/arch/x86/kernel/ipi.c
+++ b/arch/x86/kernel/ipi.c
@@ -70,7 +70,7 @@ void __send_IPI_shortcut(unsigned int shortcut, int vector)
         /*
          * Send the IPI. The write to APIC_ICR fires this off.
          */
-       apic_write_around(APIC_ICR, cfg);
+       apic_write(APIC_ICR, cfg);
  }
  
  void send_IPI_self(int vector)
@@ -98,7 +98,7 @@ static inline void __send_IPI_dest_field(unsigned long mask, int vector)
          * prepare target chip field
          */
         cfg = __prepare_ICR2(mask);
-       apic_write_around(APIC_ICR2, cfg);
+       apic_write(APIC_ICR2, cfg);
  
         /*
          * program the ICR
@@ -108,7 +108,7 @@ static inline void __send_IPI_dest_field(unsigned long mask, int vector)
         /*
          * Send the IPI. The write to APIC_ICR fires this off.
          */
-       apic_write_around(APIC_ICR, cfg);
+       apic_write(APIC_ICR, cfg);
  }
  
  /*
diff --git a/arch/x86/kernel/irq_32.c b/arch/x86/kernel/irq_32.c

index 47a6f6f124789a32179172920e4f6a436700ec50..1cf8c1fcc0889c5a066ddb2b7e150ece69dc1729 100644 (file)
--- a/arch/x86/kernel/irq_32.c
+++ b/arch/x86/kernel/irq_32.c
@@ -83,11 +83,8 @@ union irq_ctx {
  static union irq_ctx *hardirq_ctx[NR_CPUS] __read_mostly;
  static union irq_ctx *softirq_ctx[NR_CPUS] __read_mostly;
  
-static char softirq_stack[NR_CPUS * THREAD_SIZE]
-               __attribute__((__section__(".bss.page_aligned")));
-
-static char hardirq_stack[NR_CPUS * THREAD_SIZE]
-               __attribute__((__section__(".bss.page_aligned")));
+static char softirq_stack[NR_CPUS * THREAD_SIZE] __page_aligned_bss;
+static char hardirq_stack[NR_CPUS * THREAD_SIZE] __page_aligned_bss;
  
  static void call_on_stack(void *func, void *stack)
  {
diff --git a/arch/x86/kernel/kdebugfs.c b/arch/x86/kernel/kdebugfs.c

index c03205991718b30da8adfed31d915f7f325b2aa0..f2d43bc7551488a61390c27cd429a9721fefbc26 100644 (file)
--- a/arch/x86/kernel/kdebugfs.c
+++ b/arch/x86/kernel/kdebugfs.c
@@ -12,9 +12,13 @@
  #include <linux/init.h>
  #include <linux/io.h>
  #include <linux/mm.h>
+#include <linux/module.h>
  
  #include <asm/setup.h>
  
+struct dentry *arch_debugfs_dir;
+EXPORT_SYMBOL(arch_debugfs_dir);
+
  #ifdef CONFIG_DEBUG_BOOT_PARAMS
  struct setup_data_node {
         u64 paddr;
@@ -209,6 +213,10 @@ static int __init arch_kdebugfs_init(void)
  {
         int error = 0;
  
+       arch_debugfs_dir = debugfs_create_dir("x86", NULL);
+       if (!arch_debugfs_dir)
+               return -ENOMEM;
+
  #ifdef CONFIG_DEBUG_BOOT_PARAMS
         error = boot_params_kdebugfs_init();
  #endif
diff --git a/arch/x86/kernel/kprobes.c b/arch/x86/kernel/kprobes.c

index b8c6743a13daddad0446d686d3f36436b3ac7531..43c019f85f0db749425e451957bd2279cee47b80 100644 (file)
--- a/arch/x86/kernel/kprobes.c
+++ b/arch/x86/kernel/kprobes.c
@@ -860,7 +860,6 @@ static int __kprobes post_kprobe_handler(struct pt_regs *regs)
  
         resume_execution(cur, regs, kcb);
         regs->flags |= kcb->kprobe_saved_flags;
-       trace_hardirqs_fixup_flags(regs->flags);
  
         if ((kcb->kprobe_status != KPROBE_REENTER) && cur->post_handler) {
                 kcb->kprobe_status = KPROBE_HIT_SSDONE;
diff --git a/arch/x86/kernel/module_64.c b/arch/x86/kernel/module_64.c

index a888e67f5874dad3415220cc9fd0c9f962c4661c..0e867676b5a57ebd14b063bf71cdac852e2b4089 100644 (file)
--- a/arch/x86/kernel/module_64.c
+++ b/arch/x86/kernel/module_64.c
@@ -150,7 +150,8 @@ int module_finalize(const Elf_Ehdr *hdr,
                      const Elf_Shdr *sechdrs,
                      struct module *me)
  {
-       const Elf_Shdr *s, *text = NULL, *alt = NULL, *locks = NULL;
+       const Elf_Shdr *s, *text = NULL, *alt = NULL, *locks = NULL,
+               *para = NULL;
         char *secstrings = (void *)hdr + sechdrs[hdr->e_shstrndx].sh_offset;
  
         for (s = sechdrs; s < sechdrs + hdr->e_shnum; s++) {
@@ -160,6 +161,8 @@ int module_finalize(const Elf_Ehdr *hdr,
                         alt = s;
                 if (!strcmp(".smp_locks", secstrings + s->sh_name))
                         locks= s;
+               if (!strcmp(".parainstructions", secstrings + s->sh_name))
+                       para = s;
         }
  
         if (alt) {
@@ -175,6 +178,11 @@ int module_finalize(const Elf_Ehdr *hdr,
                                             tseg, tseg + text->sh_size);
         }
  
+       if (para) {
+               void *pseg = (void *)para->sh_addr;
+               apply_paravirt(pseg, pseg + para->sh_size);
+       }
+
         return module_bug_finalize(hdr, sechdrs, me);
  }
  
diff --git a/arch/x86/kernel/mpparse.c b/arch/x86/kernel/mpparse.c

index 3b25e49380c6eed8ff660d409afb2deda8c428cd..6ae005ccaed83bc46a6666f1d65b80baf7737f8d 100644 (file)
--- a/arch/x86/kernel/mpparse.c
+++ b/arch/x86/kernel/mpparse.c
@@ -27,6 +27,7 @@
  #include <asm/bios_ebda.h>
  #include <asm/e820.h>
  #include <asm/trampoline.h>
+#include <asm/setup.h>
  
  #include <mach_apic.h>
  #ifdef CONFIG_X86_32
@@ -48,76 +49,6 @@ static int __init mpf_checksum(unsigned char *mp, int len)
         return sum & 0xFF;
  }
  
-#ifdef CONFIG_X86_NUMAQ
-int found_numaq;
-/*
- * Have to match translation table entries to main table entries by counter
- * hence the mpc_record variable .... can't see a less disgusting way of
- * doing this ....
- */
-struct mpc_config_translation {
-       unsigned char mpc_type;
-       unsigned char trans_len;
-       unsigned char trans_type;
-       unsigned char trans_quad;
-       unsigned char trans_global;
-       unsigned char trans_local;
-       unsigned short trans_reserved;
-};
-
-
-static int mpc_record;
-static struct mpc_config_translation *translation_table[MAX_MPC_ENTRY]
-    __cpuinitdata;
-
-static inline int generate_logical_apicid(int quad, int phys_apicid)
-{
-       return (quad << 4) + (phys_apicid ? phys_apicid << 1 : 1);
-}
-
-
-static inline int mpc_apic_id(struct mpc_config_processor *m,
-                       struct mpc_config_translation *translation_record)
-{
-       int quad = translation_record->trans_quad;
-       int logical_apicid = generate_logical_apicid(quad, m->mpc_apicid);
-
-       printk(KERN_DEBUG "Processor #%d %u:%u APIC version %d (quad %d, apic %d)\n",
-              m->mpc_apicid,
-              (m->mpc_cpufeature & CPU_FAMILY_MASK) >> 8,
-              (m->mpc_cpufeature & CPU_MODEL_MASK) >> 4,
-              m->mpc_apicver, quad, logical_apicid);
-       return logical_apicid;
-}
-
-int mp_bus_id_to_node[MAX_MP_BUSSES];
-
-int mp_bus_id_to_local[MAX_MP_BUSSES];
-
-static void mpc_oem_bus_info(struct mpc_config_bus *m, char *name,
-       struct mpc_config_translation *translation)
-{
-       int quad = translation->trans_quad;
-       int local = translation->trans_local;
-
-       mp_bus_id_to_node[m->mpc_busid] = quad;
-       mp_bus_id_to_local[m->mpc_busid] = local;
-       printk(KERN_INFO "Bus #%d is %s (node %d)\n",
-              m->mpc_busid, name, quad);
-}
-
-int quad_local_to_mp_bus_id [NR_CPUS/4][4];
-static void mpc_oem_pci_bus(struct mpc_config_bus *m,
-       struct mpc_config_translation *translation)
-{
-       int quad = translation->trans_quad;
-       int local = translation->trans_local;
-
-       quad_local_to_mp_bus_id[quad][local] = m->mpc_busid;
-}
-
-#endif
-
  static void __cpuinit MP_processor_info(struct mpc_config_processor *m)
  {
         int apicid;
@@ -127,14 +58,12 @@ static void __cpuinit MP_processor_info(struct mpc_config_processor *m)
                 disabled_cpus++;
                 return;
         }
-#ifdef CONFIG_X86_NUMAQ
-       if (found_numaq)
-               apicid = mpc_apic_id(m, translation_table[mpc_record]);
+
+       if (x86_quirks->mpc_apic_id)
+               apicid = x86_quirks->mpc_apic_id(m);
         else
                 apicid = m->mpc_apicid;
-#else
-       apicid = m->mpc_apicid;
-#endif
+
         if (m->mpc_cpuflag & CPU_BOOTPROCESSOR) {
                 bootup_cpu = " (Bootup-CPU)";
                 boot_cpu_physical_apicid = m->mpc_apicid;
@@ -151,12 +80,10 @@ static void __init MP_bus_info(struct mpc_config_bus *m)
         memcpy(str, m->mpc_bustype, 6);
         str[6] = 0;
  
-#ifdef CONFIG_X86_NUMAQ
-       if (found_numaq)
-               mpc_oem_bus_info(m, str, translation_table[mpc_record]);
-#else
-       printk(KERN_INFO "Bus #%d is %s\n", m->mpc_busid, str);
-#endif
+       if (x86_quirks->mpc_oem_bus_info)
+               x86_quirks->mpc_oem_bus_info(m, str);
+       else
+               printk(KERN_INFO "Bus #%d is %s\n", m->mpc_busid, str);
  
  #if MAX_MP_BUSSES < 256
         if (m->mpc_busid >= MAX_MP_BUSSES) {
@@ -173,10 +100,9 @@ static void __init MP_bus_info(struct mpc_config_bus *m)
                 mp_bus_id_to_type[m->mpc_busid] = MP_BUS_ISA;
  #endif
         } else if (strncmp(str, BUSTYPE_PCI, sizeof(BUSTYPE_PCI) - 1) == 0) {
-#ifdef CONFIG_X86_NUMAQ
-               if (found_numaq)
-                       mpc_oem_pci_bus(m, translation_table[mpc_record]);
-#endif
+               if (x86_quirks->mpc_oem_pci_bus)
+                       x86_quirks->mpc_oem_pci_bus(m);
+
                 clear_bit(m->mpc_busid, mp_bus_not_pci);
  #if defined(CONFIG_EISA) || defined (CONFIG_MCA)
                 mp_bus_id_to_type[m->mpc_busid] = MP_BUS_PCI;
@@ -316,83 +242,6 @@ static void __init MP_lintsrc_info(struct mpc_config_lintsrc *m)
                 m->mpc_srcbusirq, m->mpc_destapic, m->mpc_destapiclint);
  }
  
-#ifdef CONFIG_X86_NUMAQ
-static void __init MP_translation_info(struct mpc_config_translation *m)
-{
-       printk(KERN_INFO
-              "Translation: record %d, type %d, quad %d, global %d, local %d\n",
-              mpc_record, m->trans_type, m->trans_quad, m->trans_global,
-              m->trans_local);
-
-       if (mpc_record >= MAX_MPC_ENTRY)
-               printk(KERN_ERR "MAX_MPC_ENTRY exceeded!\n");
-       else
-               translation_table[mpc_record] = m;      /* stash this for later */
-       if (m->trans_quad < MAX_NUMNODES && !node_online(m->trans_quad))
-               node_set_online(m->trans_quad);
-}
-
-/*
- * Read/parse the MPC oem tables
- */
-
-static void __init smp_read_mpc_oem(struct mp_config_oemtable *oemtable,
-                                   unsigned short oemsize)
-{
-       int count = sizeof(*oemtable);  /* the header size */
-       unsigned char *oemptr = ((unsigned char *)oemtable) + count;
-
-       mpc_record = 0;
-       printk(KERN_INFO "Found an OEM MPC table at %8p - parsing it ... \n",
-              oemtable);
-       if (memcmp(oemtable->oem_signature, MPC_OEM_SIGNATURE, 4)) {
-               printk(KERN_WARNING
-                      "SMP mpc oemtable: bad signature [%c%c%c%c]!\n",
-                      oemtable->oem_signature[0], oemtable->oem_signature[1],
-                      oemtable->oem_signature[2], oemtable->oem_signature[3]);
-               return;
-       }
-       if (mpf_checksum((unsigned char *)oemtable, oemtable->oem_length)) {
-               printk(KERN_WARNING "SMP oem mptable: checksum error!\n");
-               return;
-       }
-       while (count < oemtable->oem_length) {
-               switch (*oemptr) {
-               case MP_TRANSLATION:
-                       {
-                               struct mpc_config_translation *m =
-                                   (struct mpc_config_translation *)oemptr;
-                               MP_translation_info(m);
-                               oemptr += sizeof(*m);
-                               count += sizeof(*m);
-                               ++mpc_record;
-                               break;
-                       }
-               default:
-                       {
-                               printk(KERN_WARNING
-                                      "Unrecognised OEM table entry type! - %d\n",
-                                      (int)*oemptr);
-                               return;
-                       }
-               }
-       }
-}
-
-void numaq_mps_oem_check(struct mp_config_table *mpc, char *oem,
-                                char *productid)
-{
-       if (strncmp(oem, "IBM NUMA", 8))
-               printk("Warning!  Not a NUMA-Q system!\n");
-       else
-               found_numaq = 1;
-
-       if (mpc->mpc_oemptr)
-               smp_read_mpc_oem((struct mp_config_oemtable *)mpc->mpc_oemptr,
-                                mpc->mpc_oemsize);
-}
-#endif /* CONFIG_X86_NUMAQ */
-
  /*
   * Read/parse the MPC
   */
@@ -457,7 +306,6 @@ static int __init smp_read_mpc(struct mp_config_table *mpc, unsigned early)
         } else
                 mps_oem_check(mpc, oem, str);
  #endif
-
         /* save the local APIC address, it might be non-default */
         if (!acpi_lapic)
                 mp_lapic_addr = mpc->mpc_lapic;
@@ -465,12 +313,17 @@ static int __init smp_read_mpc(struct mp_config_table *mpc, unsigned early)
         if (early)
                 return 1;
  
+       if (mpc->mpc_oemptr && x86_quirks->smp_read_mpc_oem) {
+               struct mp_config_oemtable *oem_table = (struct mp_config_oemtable *)(unsigned long)mpc->mpc_oemptr;
+               x86_quirks->smp_read_mpc_oem(oem_table, mpc->mpc_oemsize);
+       }
+
         /*
          *      Now process the configuration blocks.
          */
-#ifdef CONFIG_X86_NUMAQ
-       mpc_record = 0;
-#endif
+       if (x86_quirks->mpc_record)
+               *x86_quirks->mpc_record = 0;
+
         while (count < mpc->mpc_length) {
                 switch (*mpt) {
                 case MP_PROCESSOR:
@@ -536,9 +389,8 @@ static int __init smp_read_mpc(struct mp_config_table *mpc, unsigned early)
                         count = mpc->mpc_length;
                         break;
                 }
-#ifdef CONFIG_X86_NUMAQ
-               ++mpc_record;
-#endif
+               if (x86_quirks->mpc_record)
+                       (*x86_quirks->mpc_record)++;
         }
  
  #ifdef CONFIG_X86_GENERICARCH
@@ -725,12 +577,6 @@ static inline void __init construct_default_ISA_mptable(int mpc_default_type)
  
  static struct intel_mp_floating *mpf_found;
  
-/*
- * Machine specific quirk for finding the SMP config before other setup
- * activities destroy the table:
- */
-int (*mach_get_smp_config_quirk)(unsigned int early);
-
  /*
   * Scan the memory blocks for an SMP configuration block.
   */
@@ -738,8 +584,8 @@ static void __init __get_smp_config(unsigned int early)
  {
         struct intel_mp_floating *mpf = mpf_found;
  
-       if (mach_get_smp_config_quirk) {
-               if (mach_get_smp_config_quirk(early))
+       if (x86_quirks->mach_get_smp_config) {
+               if (x86_quirks->mach_get_smp_config(early))
                         return;
         }
         if (acpi_lapic && early)
@@ -899,14 +745,12 @@ static int __init smp_scan_config(unsigned long base, unsigned long length,
         return 0;
  }
  
-int (*mach_find_smp_config_quirk)(unsigned int reserve);
-
  static void __init __find_smp_config(unsigned int reserve)
  {
         unsigned int address;
  
-       if (mach_find_smp_config_quirk) {
-               if (mach_find_smp_config_quirk(reserve))
+       if (x86_quirks->mach_find_smp_config) {
+               if (x86_quirks->mach_find_smp_config(reserve))
                         return;
         }
         /*
diff --git a/arch/x86/kernel/nmi.c b/arch/x86/kernel/nmi.c

index ec024b3baad0764821c036d0aa2552397f76f017..ac6d51222e7d3562abb1e7b5e8bd4a16a1c1c1f4 100644 (file)
--- a/arch/x86/kernel/nmi.c
+++ b/arch/x86/kernel/nmi.c
@@ -263,7 +263,7 @@ late_initcall(init_lapic_nmi_sysfs);
  
  static void __acpi_nmi_enable(void *__unused)
  {
-       apic_write_around(APIC_LVT0, APIC_DM_NMI);
+       apic_write(APIC_LVT0, APIC_DM_NMI);
  }
  
  /*
@@ -277,7 +277,7 @@ void acpi_nmi_enable(void)
  
  static void __acpi_nmi_disable(void *__unused)
  {
-       apic_write_around(APIC_LVT0, APIC_DM_NMI | APIC_LVT_MASKED);
+       apic_write(APIC_LVT0, APIC_DM_NMI | APIC_LVT_MASKED);
  }
  
  /*
@@ -448,6 +448,13 @@ nmi_watchdog_tick(struct pt_regs *regs, unsigned reason)
  
  #ifdef CONFIG_SYSCTL
  
+/*
+ * Handler for the "unknown_nmi_panic" boot parameter: sets the
+ * unknown_nmi_panic flag to 1.  The option text passed in @str is not
+ * examined.  Always returns 1.
+ *
+ * This sits inside the CONFIG_SYSCTL block, so the parameter is only
+ * registered when CONFIG_SYSCTL is enabled.
+ */
+static int __init setup_unknown_nmi_panic(char *str)
+{
+       unknown_nmi_panic = 1;
+       return 1;
+}
+__setup("unknown_nmi_panic", setup_unknown_nmi_panic);
+
  static int unknown_nmi_panic_callback(struct pt_regs *regs, int cpu)
  {
         unsigned char reason = get_nmi_reason();
diff --git a/arch/x86/kernel/numaq_32.c b/arch/x86/kernel/numaq_32.c

index a23e8233b9ac59b8afbb0634be2a0c318defd652..b8c45610b20a89a6f52f9b266d68de042d688168 100644 (file)
--- a/arch/x86/kernel/numaq_32.c
+++ b/arch/x86/kernel/numaq_32.c
@@ -33,6 +33,7 @@
  #include <asm/processor.h>
  #include <asm/mpspec.h>
  #include <asm/e820.h>
+#include <asm/setup.h>
  
  #define        MB_TO_PAGES(addr) ((addr) << (20 - PAGE_SHIFT))
  
@@ -71,6 +72,188 @@ static void __init smp_dump_qct(void)
         }
  }
  
+
+/*
+ * Clear the TSC CPU capability on a multi-node NUMA-Q.
+ *
+ * Does nothing unless a NUMA-Q was detected (found_numaq) and more than
+ * one node is online.
+ */
+void __init numaq_tsc_disable(void)
+{
+       if (!found_numaq)
+               return;
+
+       if (num_online_nodes() > 1) {
+               printk(KERN_DEBUG "NUMAQ: disabling TSC\n");
+               setup_clear_cpu_cap(X86_FEATURE_TSC);
+       }
+}
+
+/*
+ * NUMA-Q implementation of the x86_quirks arch_pre_time_init hook:
+ * runs numaq_tsc_disable() and always returns 0.
+ */
+static int __init numaq_pre_time_init(void)
+{
+       numaq_tsc_disable();
+       return 0;
+}
+
+int found_numaq;
+/*
+ * One entry of the OEM MPC translation table.
+ *
+ * Translation entries are matched to main MP table entries purely by
+ * position, hence the mpc_record counter: entry N of the OEM table is
+ * stashed in translation_table[N] and looked up again while record N of
+ * the main table is processed.  Can't see a less disgusting way of
+ * doing this.
+ */
+struct mpc_config_translation {
+       unsigned char mpc_type;         /* entry type; first byte of the entry */
+       unsigned char trans_len;
+       unsigned char trans_type;
+       unsigned char trans_quad;       /* quad, used as the node number */
+       unsigned char trans_global;
+       unsigned char trans_local;      /* bus number local to the quad */
+       unsigned short trans_reserved;
+};
+
+/* x86_quirks member */
+static int mpc_record;
+static struct mpc_config_translation *translation_table[MAX_MPC_ENTRY]
+    __cpuinitdata;
+
+/*
+ * Build a logical APIC ID from a quad number and a physical APIC ID:
+ * the quad is shifted left by four and added to the physical ID shifted
+ * left by one, or to 1 when the physical ID is 0.
+ */
+static inline int generate_logical_apicid(int quad, int phys_apicid)
+{
+       return (quad << 4) + (phys_apicid ? phys_apicid << 1 : 1);
+}
+
+/*
+ * x86_quirks member.
+ *
+ * Derive the logical APIC ID for processor entry @m from the quad stored
+ * in the translation entry for the current mpc_record, log the processor
+ * details and return the logical ID.
+ *
+ * NOTE(review): translation_table[mpc_record] is dereferenced without a
+ * bounds or NULL check, while MP_translation_info() does not store
+ * entries at or beyond MAX_MPC_ENTRY -- confirm this cannot be reached
+ * with such a record.
+ */
+static int mpc_apic_id(struct mpc_config_processor *m)
+{
+       int quad = translation_table[mpc_record]->trans_quad;
+       int logical_apicid = generate_logical_apicid(quad, m->mpc_apicid);
+
+       printk(KERN_DEBUG "Processor #%d %u:%u APIC version %d (quad %d, apic %d)\n",
+              m->mpc_apicid,
+              (m->mpc_cpufeature & CPU_FAMILY_MASK) >> 8,
+              (m->mpc_cpufeature & CPU_MODEL_MASK) >> 4,
+              m->mpc_apicver, quad, logical_apicid);
+       return logical_apicid;
+}
+
+int mp_bus_id_to_node[MAX_MP_BUSSES];
+
+int mp_bus_id_to_local[MAX_MP_BUSSES];
+
+/*
+ * x86_quirks member.
+ *
+ * Record, for bus entry @m, the quad (node) and quad-local bus number
+ * taken from the translation entry for the current mpc_record, and log
+ * the bus under @name.
+ *
+ * NOTE(review): neither mpc_record nor m->mpc_busid is range-checked
+ * here before being used as an array index -- confirm callers guarantee
+ * valid values.
+ */
+static void mpc_oem_bus_info(struct mpc_config_bus *m, char *name)
+{
+       int quad = translation_table[mpc_record]->trans_quad;
+       int local = translation_table[mpc_record]->trans_local;
+
+       mp_bus_id_to_node[m->mpc_busid] = quad;
+       mp_bus_id_to_local[m->mpc_busid] = local;
+       printk(KERN_INFO "Bus #%d is %s (node %d)\n",
+              m->mpc_busid, name, quad);
+}
+
+int quad_local_to_mp_bus_id [NR_CPUS/4][4];
+
+/*
+ * x86_quirks member.
+ *
+ * Store the MP bus id of bus entry @m in quad_local_to_mp_bus_id[],
+ * indexed by the quad and quad-local bus number of the translation
+ * entry for the current mpc_record.
+ *
+ * NOTE(review): quad and local come straight from the translation entry
+ * and are not checked against the array dimensions (NR_CPUS/4 and 4).
+ */
+static void mpc_oem_pci_bus(struct mpc_config_bus *m)
+{
+       int quad = translation_table[mpc_record]->trans_quad;
+       int local = translation_table[mpc_record]->trans_local;
+
+       quad_local_to_mp_bus_id[quad][local] = m->mpc_busid;
+}
+
+/*
+ * Log translation entry @m and stash it in translation_table[] at the
+ * current mpc_record.  Records at or beyond MAX_MPC_ENTRY are reported
+ * and not stored.  Independently of that, the entry's quad is marked
+ * online as a node when it is below MAX_NUMNODES and not online yet.
+ */
+static void __init MP_translation_info(struct mpc_config_translation *m)
+{
+       printk(KERN_INFO
+              "Translation: record %d, type %d, quad %d, global %d, local %d\n",
+              mpc_record, m->trans_type, m->trans_quad, m->trans_global,
+              m->trans_local);
+
+       if (mpc_record >= MAX_MPC_ENTRY)
+               printk(KERN_ERR "MAX_MPC_ENTRY exceeded!\n");
+       else
+               translation_table[mpc_record] = m;      /* stash this for later */
+       if (m->trans_quad < MAX_NUMNODES && !node_online(m->trans_quad))
+               node_set_online(m->trans_quad);
+}
+
+/*
+ * Add up @len bytes starting at @mp and return the low 8 bits of the
+ * sum.  The caller treats a non-zero result as a checksum error.
+ */
+static int __init mpf_checksum(unsigned char *mp, int len)
+{
+       int sum = 0;
+
+       while (len--)
+               sum += *mp++;
+
+       return sum & 0xFF;
+}
+
+/*
+ * Read/parse the MPC oem tables
+ *
+ * Resets mpc_record, checks the signature and checksum of @oemtable and
+ * then walks the entries that follow the table header.  Every
+ * MP_TRANSLATION entry is passed to MP_translation_info() and advances
+ * mpc_record; any other entry type stops parsing with a warning.
+ *
+ * @oemsize is not used; the walk is bounded by the table's own
+ * oem_length field.
+ *
+ * NOTE(review): the loop only checks that an entry starts inside
+ * oem_length, not that the whole entry fits -- confirm a truncated last
+ * entry cannot occur.
+ */
+
+static void __init smp_read_mpc_oem(struct mp_config_oemtable *oemtable,
+                                   unsigned short oemsize)
+{
+       int count = sizeof(*oemtable);  /* the header size */
+       unsigned char *oemptr = ((unsigned char *)oemtable) + count;
+
+       mpc_record = 0;
+       printk(KERN_INFO "Found an OEM MPC table at %8p - parsing it ... \n",
+              oemtable);
+       if (memcmp(oemtable->oem_signature, MPC_OEM_SIGNATURE, 4)) {
+               printk(KERN_WARNING
+                      "SMP mpc oemtable: bad signature [%c%c%c%c]!\n",
+                      oemtable->oem_signature[0], oemtable->oem_signature[1],
+                      oemtable->oem_signature[2], oemtable->oem_signature[3]);
+               return;
+       }
+       if (mpf_checksum((unsigned char *)oemtable, oemtable->oem_length)) {
+               printk(KERN_WARNING "SMP oem mptable: checksum error!\n");
+               return;
+       }
+       while (count < oemtable->oem_length) {
+               switch (*oemptr) {
+               case MP_TRANSLATION:
+                       {
+                               struct mpc_config_translation *m =
+                                   (struct mpc_config_translation *)oemptr;
+                               MP_translation_info(m);
+                               oemptr += sizeof(*m);
+                               count += sizeof(*m);
+                               ++mpc_record;
+                               break;
+                       }
+               default:
+                       {
+                               printk(KERN_WARNING
+                                      "Unrecognised OEM table entry type! - %d\n",
+                                      (int)*oemptr);
+                               return;
+                       }
+               }
+       }
+}
+
+/*
+ * Quirk table for NUMA-Q.  early_check_numaq() points x86_quirks at it
+ * once a NUMA-Q has been detected.  Hooks set to NULL are not provided
+ * by NUMA-Q.
+ */
+static struct x86_quirks numaq_x86_quirks __initdata = {
+       .arch_pre_time_init     = numaq_pre_time_init,
+       .arch_time_init         = NULL,
+       .arch_pre_intr_init     = NULL,
+       .arch_memory_setup      = NULL,
+       .arch_intr_init         = NULL,
+       .arch_trap_init         = NULL,
+       .mach_get_smp_config    = NULL,
+       .mach_find_smp_config   = NULL,
+       .mpc_record             = &mpc_record,
+       .mpc_apic_id            = mpc_apic_id,
+       .mpc_oem_bus_info       = mpc_oem_bus_info,
+       .mpc_oem_pci_bus        = mpc_oem_pci_bus,
+       .smp_read_mpc_oem       = smp_read_mpc_oem,
+};
+
+/*
+ * Decide from the MP table OEM id whether this machine is a NUMA-Q.
+ *
+ * Sets found_numaq when the first 8 characters of @oem are "IBM NUMA";
+ * otherwise logs a warning.  @mpc and @productid are not used.
+ */
+void numaq_mps_oem_check(struct mp_config_table *mpc, char *oem,
+                                char *productid)
+{
+       if (strncmp(oem, "IBM NUMA", 8))
+               printk(KERN_WARNING "Warning!  Not a NUMA-Q system!\n");
+       else
+               found_numaq = 1;
+}
+
  static __init void early_check_numaq(void)
  {
         /*
@@ -82,6 +265,9 @@ static __init void early_check_numaq(void)
          */
         if (smp_found_config)
                 early_get_smp_config();
+
+       if (found_numaq)
+               x86_quirks = &numaq_x86_quirks;
  }
  
  int __init get_memcfg_numaq(void)
@@ -92,14 +278,3 @@ int __init get_memcfg_numaq(void)
         smp_dump_qct();
         return 1;
  }
-
-void __init numaq_tsc_disable(void)
-{
-       if (!found_numaq)
-               return;
-
-       if (num_online_nodes() > 1) {
-               printk(KERN_DEBUG "NUMAQ: disabling TSC\n");
-               setup_clear_cpu_cap(X86_FEATURE_TSC);
-       }
-}
diff --git a/arch/x86/kernel/paravirt.c b/arch/x86/kernel/paravirt.c

index e0f571d58c19c0bfa4eeee39d89972ed2239f55f..b4564d089b43b91bdfcbe36eb432e0cd245fc593 100644 (file)
--- a/arch/x86/kernel/paravirt.c
+++ b/arch/x86/kernel/paravirt.c
@@ -29,6 +29,7 @@
  #include <asm/desc.h>
  #include <asm/setup.h>
  #include <asm/arch_hooks.h>
+#include <asm/pgtable.h>
  #include <asm/time.h>
  #include <asm/pgalloc.h>
  #include <asm/irq.h>
@@ -361,7 +362,6 @@ struct pv_cpu_ops pv_cpu_ops = {
  struct pv_apic_ops pv_apic_ops = {
  #ifdef CONFIG_X86_LOCAL_APIC
         .apic_write = native_apic_write,
-       .apic_write_atomic = native_apic_write_atomic,
         .apic_read = native_apic_read,
         .setup_boot_clock = setup_boot_APIC_clock,
         .setup_secondary_clock = setup_secondary_APIC_clock,
@@ -373,6 +373,9 @@ struct pv_mmu_ops pv_mmu_ops = {
  #ifndef CONFIG_X86_64
         .pagetable_setup_start = native_pagetable_setup_start,
         .pagetable_setup_done = native_pagetable_setup_done,
+#else
+       .pagetable_setup_start = paravirt_nop,
+       .pagetable_setup_done = paravirt_nop,
  #endif
  
         .read_cr2 = native_read_cr2,
diff --git a/arch/x86/kernel/pci-calgary_64.c b/arch/x86/kernel/pci-calgary_64.c

index 6959b5c45df4546f28c2ef962fdc197d7afbfe3a..151f2d171f7c7386b7a14f574d4c7f34edfab0f3 100644 (file)
--- a/arch/x86/kernel/pci-calgary_64.c
+++ b/arch/x86/kernel/pci-calgary_64.c
@@ -36,7 +36,7 @@
  #include <linux/delay.h>
  #include <linux/scatterlist.h>
  #include <linux/iommu-helper.h>
-#include <asm/gart.h>
+#include <asm/iommu.h>
  #include <asm/calgary.h>
  #include <asm/tce.h>
  #include <asm/pci-direct.h>
diff --git a/arch/x86/kernel/pci-dma.c b/arch/x86/kernel/pci-dma.c

index 8467ec2320f178584afb402cfb2b48859a3eb48e..a4213c00dffc355a6a8b8958c85399256a81be40 100644 (file)
--- a/arch/x86/kernel/pci-dma.c
+++ b/arch/x86/kernel/pci-dma.c
@@ -5,12 +5,11 @@
  
  #include <asm/proto.h>
  #include <asm/dma.h>
-#include <asm/gart.h>
+#include <asm/iommu.h>
  #include <asm/calgary.h>
  #include <asm/amd_iommu.h>
  
-int forbid_dac __read_mostly;
-EXPORT_SYMBOL(forbid_dac);
+static int forbid_dac __read_mostly;
  
  const struct dma_mapping_ops *dma_ops;
  EXPORT_SYMBOL(dma_ops);
@@ -114,21 +113,15 @@ void __init pci_iommu_alloc(void)
          * The order of these functions is important for
          * fall-back/fail-over reasons
          */
-#ifdef CONFIG_GART_IOMMU
         gart_iommu_hole_init();
-#endif
  
-#ifdef CONFIG_CALGARY_IOMMU
         detect_calgary();
-#endif
  
         detect_intel_iommu();
  
         amd_iommu_detect();
  
-#ifdef CONFIG_SWIOTLB
         pci_swiotlb_init();
-#endif
  }
  #endif
  
@@ -184,9 +177,7 @@ static __init int iommu_setup(char *p)
                         swiotlb = 1;
  #endif
  
-#ifdef CONFIG_GART_IOMMU
                 gart_parse_options(p);
-#endif
  
  #ifdef CONFIG_CALGARY_IOMMU
                 if (!strncmp(p, "calgary", 7))
@@ -500,17 +491,13 @@ EXPORT_SYMBOL(dma_free_coherent);
  
  static int __init pci_iommu_init(void)
  {
-#ifdef CONFIG_CALGARY_IOMMU
         calgary_iommu_init();
-#endif
  
         intel_iommu_init();
  
         amd_iommu_init();
  
-#ifdef CONFIG_GART_IOMMU
         gart_iommu_init();
-#endif
  
         no_iommu_init();
         return 0;
diff --git a/arch/x86/kernel/pci-gart_64.c b/arch/x86/kernel/pci-gart_64.c

index c3fe78406d1897b40e380f0528cf4db832d0a191..be60961f8695681b2b96dcda25b694933b5f0d47 100644 (file)
--- a/arch/x86/kernel/pci-gart_64.c
+++ b/arch/x86/kernel/pci-gart_64.c
@@ -32,6 +32,7 @@
  #include <asm/mtrr.h>
  #include <asm/pgtable.h>
  #include <asm/proto.h>
+#include <asm/iommu.h>
  #include <asm/gart.h>
  #include <asm/cacheflush.h>
  #include <asm/swiotlb.h>
diff --git a/arch/x86/kernel/pci-nommu.c b/arch/x86/kernel/pci-nommu.c

index aec43d56f49c57cedffaefad7205fbbfe9b984c2..792b9179eff315ecf3ae26e7e1635073e1a42c26 100644 (file)
--- a/arch/x86/kernel/pci-nommu.c
+++ b/arch/x86/kernel/pci-nommu.c
@@ -7,7 +7,7 @@
  #include <linux/dma-mapping.h>
  #include <linux/scatterlist.h>
  
-#include <asm/gart.h>
+#include <asm/iommu.h>
  #include <asm/processor.h>
  #include <asm/dma.h>
  
diff --git a/arch/x86/kernel/pci-swiotlb_64.c b/arch/x86/kernel/pci-swiotlb_64.c

index 82299cd1d04d452fea6ba6069fb3297a8824f7de..20df839b9c2012c12fa082c7b42ada6e63879a87 100644 (file)
--- a/arch/x86/kernel/pci-swiotlb_64.c
+++ b/arch/x86/kernel/pci-swiotlb_64.c
@@ -5,7 +5,7 @@
  #include <linux/module.h>
  #include <linux/dma-mapping.h>
  
-#include <asm/gart.h>
+#include <asm/iommu.h>
  #include <asm/swiotlb.h>
  #include <asm/dma.h>
  
diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c

index 4d629c62f4f8fbb993a49c0d5f12d7e88bbd94b5..7fc4d5b0a6a0f99a4d1d9c4df685a5a4d3a135bb 100644 (file)
--- a/arch/x86/kernel/process.c
+++ b/arch/x86/kernel/process.c
@@ -15,6 +15,7 @@ unsigned long idle_nomwait;
  EXPORT_SYMBOL(idle_nomwait);
  
  struct kmem_cache *task_xstate_cachep;
+/*
+ * NOTE(review): force_mwait is declared a second time further down in
+ * this file, below the comment describing idle=mwait; one of the two
+ * declarations looks redundant.
+ */
+static int force_mwait __cpuinitdata;
  
  int arch_dup_task_struct(struct task_struct *dst, struct task_struct *src)
  {
@@ -199,6 +200,7 @@ static void poll_idle(void)
   *
   * idle=mwait overrides this decision and forces the usage of mwait.
   */
+static int __cpuinitdata force_mwait;
  
  #define MWAIT_INFO                     0x05
  #define MWAIT_ECX_EXTENDED_INFO                0x01
@@ -326,6 +328,9 @@ void __cpuinit select_idle_routine(const struct cpuinfo_x86 *c)
  
  static int __init idle_setup(char *str)
  {
+       if (!str)
+               return -EINVAL;
+
         if (!strcmp(str, "poll")) {
                 printk("using polling idle threads.\n");
                 pm_idle = poll_idle;
diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c

index a8e53626ac9aaf5fc8290908aaf42552556a1b11..e8a8e1b998176fba76076a6a9b0c9d432f6f3922 100644 (file)
--- a/arch/x86/kernel/process_64.c
+++ b/arch/x86/kernel/process_64.c
@@ -537,8 +537,8 @@ static inline void __switch_to_xtra(struct task_struct *prev_p,
  struct task_struct *
  __switch_to(struct task_struct *prev_p, struct task_struct *next_p)
  {
-       struct thread_struct *prev = &prev_p->thread,
-                                *next = &next_p->thread;
+       struct thread_struct *prev = &prev_p->thread;
+       struct thread_struct *next = &next_p->thread;
         int cpu = smp_processor_id();
         struct tss_struct *tss = &per_cpu(init_tss, cpu);
         unsigned fsindex, gsindex;
@@ -586,35 +586,34 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p)
  
         /* 
          * Switch FS and GS.
+        *
+        * Segment register != 0 always requires a reload.  Also
+        * reload when it has changed.  When prev process used 64bit
+        * base always reload to avoid an information leak.
          */
-       { 
-               /* segment register != 0 always requires a reload. 
-                  also reload when it has changed. 
-                  when prev process used 64bit base always reload
-                  to avoid an information leak. */
-               if (unlikely(fsindex | next->fsindex | prev->fs)) {
-                       loadsegment(fs, next->fsindex);
-                       /* check if the user used a selector != 0
-                        * if yes clear 64bit base, since overloaded base
-                         * is always mapped to the Null selector
-                         */
-                       if (fsindex)
+       if (unlikely(fsindex | next->fsindex | prev->fs)) {
+               loadsegment(fs, next->fsindex);
+               /* 
+                * Check if the user used a selector != 0; if yes
+                *  clear 64bit base, since overloaded base is always
+                *  mapped to the Null selector
+                */
+               if (fsindex)
                         prev->fs = 0;                           
-               }
-               /* when next process has a 64bit base use it */
-               if (next->fs) 
-                       wrmsrl(MSR_FS_BASE, next->fs); 
-               prev->fsindex = fsindex;
-
-               if (unlikely(gsindex | next->gsindex | prev->gs)) {
-                       load_gs_index(next->gsindex);
-                       if (gsindex)
+       }
+       /* when next process has a 64bit base use it */
+       if (next->fs)
+               wrmsrl(MSR_FS_BASE, next->fs);
+       prev->fsindex = fsindex;
+
+       if (unlikely(gsindex | next->gsindex | prev->gs)) {
+               load_gs_index(next->gsindex);
+               if (gsindex)
                         prev->gs = 0;                           
-               }
-               if (next->gs)
-                       wrmsrl(MSR_KERNEL_GS_BASE, next->gs); 
-               prev->gsindex = gsindex;
         }
+       if (next->gs)
+               wrmsrl(MSR_KERNEL_GS_BASE, next->gs);
+       prev->gsindex = gsindex;
  
         /* Must be after DS reload */
         unlazy_fpu(prev_p);
@@ -627,7 +626,8 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p)
         write_pda(pcurrent, next_p); 
  
         write_pda(kernelstack,
-       (unsigned long)task_stack_page(next_p) + THREAD_SIZE - PDA_STACKOFFSET);
+                 (unsigned long)task_stack_page(next_p) +
+                 THREAD_SIZE - PDA_STACKOFFSET);
  #ifdef CONFIG_CC_STACKPROTECTOR
         write_pda(stack_canary, next_p->stack_canary);
         /*
diff --git a/arch/x86/kernel/ptrace.c b/arch/x86/kernel/ptrace.c

index 77040b6070e18c00ce71f788efa0d40d5a156229..e37dccce85db5e15922602b833a3821b78ca4ffd 100644 (file)
--- a/arch/x86/kernel/ptrace.c
+++ b/arch/x86/kernel/ptrace.c
@@ -1357,8 +1357,6 @@ const struct user_regset_view *task_user_regset_view(struct task_struct *task)
  #endif
  }
  
-#ifdef CONFIG_X86_32
-
  void send_sigtrap(struct task_struct *tsk, struct pt_regs *regs, int error_code)
  {
         struct siginfo info;
@@ -1377,89 +1375,10 @@ void send_sigtrap(struct task_struct *tsk, struct pt_regs *regs, int error_code)
         force_sig_info(SIGTRAP, &info, tsk);
  }
  
-/* notification of system call entry/exit
- * - triggered by current->work.syscall_trace
- */
-int do_syscall_trace(struct pt_regs *regs, int entryexit)
-{
-       int is_sysemu = test_thread_flag(TIF_SYSCALL_EMU);
-       /*
-        * With TIF_SYSCALL_EMU set we want to ignore TIF_SINGLESTEP for syscall
-        * interception
-        */
-       int is_singlestep = !is_sysemu && test_thread_flag(TIF_SINGLESTEP);
-       int ret = 0;
-
-       /* do the secure computing check first */
-       if (!entryexit)
-               secure_computing(regs->orig_ax);
-
-       if (unlikely(current->audit_context)) {
-               if (entryexit)
-                       audit_syscall_exit(AUDITSC_RESULT(regs->ax),
-                                               regs->ax);
-               /* Debug traps, when using PTRACE_SINGLESTEP, must be sent only
-                * on the syscall exit path. Normally, when TIF_SYSCALL_AUDIT is
-                * not used, entry.S will call us only on syscall exit, not
-                * entry; so when TIF_SYSCALL_AUDIT is used we must avoid
-                * calling send_sigtrap() on syscall entry.
-                *
-                * Note that when PTRACE_SYSEMU_SINGLESTEP is used,
-                * is_singlestep is false, despite his name, so we will still do
-                * the correct thing.
-                */
-               else if (is_singlestep)
-                       goto out;
-       }
-
-       if (!(current->ptrace & PT_PTRACED))
-               goto out;
-
-       /* If a process stops on the 1st tracepoint with SYSCALL_TRACE
-        * and then is resumed with SYSEMU_SINGLESTEP, it will come in
-        * here. We have to check this and return */
-       if (is_sysemu && entryexit)
-               return 0;
-
-       /* Fake a debug trap */
-       if (is_singlestep)
-               send_sigtrap(current, regs, 0);
-
-       if (!test_thread_flag(TIF_SYSCALL_TRACE) && !is_sysemu)
-               goto out;
-
-       /* the 0x80 provides a way for the tracing parent to distinguish
-          between a syscall stop and SIGTRAP delivery */
-       /* Note that the debugger could change the result of test_thread_flag!*/
-       ptrace_notify(SIGTRAP | ((current->ptrace & PT_TRACESYSGOOD) ? 0x80:0));
-
-       /*
-        * this isn't the same as continuing with a signal, but it will do
-        * for normal use.  strace only continues with a signal if the
-        * stopping signal is not SIGTRAP.  -brl
-        */
-       if (current->exit_code) {
-               send_sig(current->exit_code, current, 1);
-               current->exit_code = 0;
-       }
-       ret = is_sysemu;
-out:
-       if (unlikely(current->audit_context) && !entryexit)
-               audit_syscall_entry(AUDIT_ARCH_I386, regs->orig_ax,
-                                   regs->bx, regs->cx, regs->dx, regs->si);
-       if (ret == 0)
-               return 0;
-
-       regs->orig_ax = -1; /* force skip of syscall restarting */
-       if (unlikely(current->audit_context))
-               audit_syscall_exit(AUDITSC_RESULT(regs->ax), regs->ax);
-       return 1;
-}
-
-#else  /* CONFIG_X86_64 */
-
  static void syscall_trace(struct pt_regs *regs)
  {
+       if (!(current->ptrace & PT_PTRACED))
+               return;
  
  #if 0
         printk("trace %s ip %lx sp %lx ax %d origrax %d caller %lx tiflags %x ptrace %x\n",
@@ -1481,39 +1400,81 @@ static void syscall_trace(struct pt_regs *regs)
         }
  }
  
-asmlinkage void syscall_trace_enter(struct pt_regs *regs)
+/*
+ * IS_IA32: non-zero when the system call being traced is to be audited
+ * as AUDIT_ARCH_I386.  Constant 1 on 32-bit kernels, the current
+ * thread's TIF_IA32 flag when IA32 emulation is configured, and
+ * constant 0 otherwise.
+ */
+#ifdef CONFIG_X86_32
+# define IS_IA32       1
+#elif defined CONFIG_IA32_EMULATION
+# define IS_IA32       test_thread_flag(TIF_IA32)
+#else
+# define IS_IA32       0
+#endif
+
+/*
+ * We must return the syscall number to actually look up in the table.
+ * This can be -1L to skip running any syscall at all.
+ */
+asmregparm long syscall_trace_enter(struct pt_regs *regs)
  {
+       long ret = 0;
+
+       /*
+        * If we stepped into a sysenter/syscall insn, it trapped in
+        * kernel mode; do_debug() cleared TF and set TIF_SINGLESTEP.
+        * If user-mode had set TF itself, then it's still clear from
+        * do_debug() and we need to set it again to restore the user
+        * state.  If we entered on the slow path, TF was already set.
+        */
+       if (test_thread_flag(TIF_SINGLESTEP))
+               regs->flags |= X86_EFLAGS_TF;
+
         /* do the secure computing check first */
         secure_computing(regs->orig_ax);
  
-       if (test_thread_flag(TIF_SYSCALL_TRACE)
-           && (current->ptrace & PT_PTRACED))
+       if (unlikely(test_thread_flag(TIF_SYSCALL_EMU)))
+               ret = -1L;
+
+       if (ret || test_thread_flag(TIF_SYSCALL_TRACE))
                 syscall_trace(regs);
  
         if (unlikely(current->audit_context)) {
-               if (test_thread_flag(TIF_IA32)) {
+               if (IS_IA32)
                         audit_syscall_entry(AUDIT_ARCH_I386,
                                             regs->orig_ax,
                                             regs->bx, regs->cx,
                                             regs->dx, regs->si);
-               } else {
+#ifdef CONFIG_X86_64
+               else
                         audit_syscall_entry(AUDIT_ARCH_X86_64,
                                             regs->orig_ax,
                                             regs->di, regs->si,
                                             regs->dx, regs->r10);
-               }
+#endif
         }
+
+       return ret ?: regs->orig_ax;
  }
  
-asmlinkage void syscall_trace_leave(struct pt_regs *regs)
+asmregparm void syscall_trace_leave(struct pt_regs *regs)
  {
         if (unlikely(current->audit_context))
                 audit_syscall_exit(AUDITSC_RESULT(regs->ax), regs->ax);
  
-       if ((test_thread_flag(TIF_SYSCALL_TRACE)
-            || test_thread_flag(TIF_SINGLESTEP))
-           && (current->ptrace & PT_PTRACED))
+       if (test_thread_flag(TIF_SYSCALL_TRACE))
                 syscall_trace(regs);
-}
  
-#endif /* CONFIG_X86_32 */
+       /*
+        * If TIF_SYSCALL_EMU is set, we only get here because of
+        * TIF_SINGLESTEP (i.e. this is PTRACE_SYSEMU_SINGLESTEP).
+        * We already reported this syscall instruction in
+        * syscall_trace_enter(), so don't do any more now.
+        */
+       if (unlikely(test_thread_flag(TIF_SYSCALL_EMU)))
+               return;
+
+       /*
+        * If we are single-stepping, synthesize a trap to follow the
+        * system call instruction.
+        */
+       if (test_thread_flag(TIF_SINGLESTEP) &&
+           (current->ptrace & PT_PTRACED))
+               send_sigtrap(current, regs, 0);
+}
diff --git a/arch/x86/kernel/reboot.c b/arch/x86/kernel/reboot.c

index f8a62160e1513221a379a400cfd30dc6e3eaaf0b..9dcf39c0297260f63f691b67ffbaed4ead8a4390 100644 (file)
--- a/arch/x86/kernel/reboot.c
+++ b/arch/x86/kernel/reboot.c
@@ -177,6 +177,14 @@ static struct dmi_system_id __initdata reboot_dmi_table[] = {
                         DMI_MATCH(DMI_PRODUCT_NAME, "PowerEdge 2400"),
                 },
         },
+       {       /* Handle problems with rebooting on Dell T5400's */
+               .callback = set_bios_reboot,
+               .ident = "Dell Precision T5400",
+               .matches = {
+                       DMI_MATCH(DMI_SYS_VENDOR, "Dell Inc."),
+                       DMI_MATCH(DMI_PRODUCT_NAME, "Precision WorkStation T5400"),
+               },
+       },
         {       /* Handle problems with rebooting on HP laptops */
                 .callback = set_bios_reboot,
                 .ident = "HP Compaq Laptop",
diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c

index 531b55b8e81a1de1827eac5691d5f8aef1d8d10c..ec952aa5394a403a42308de114a370a656968a34 100644 (file)
--- a/arch/x86/kernel/setup.c
+++ b/arch/x86/kernel/setup.c
@@ -57,12 +57,8 @@
  #include <linux/slab.h>
  #include <linux/user.h>
  #include <linux/delay.h>
-#include <linux/highmem.h>
  
  #include <linux/kallsyms.h>
-#include <linux/edd.h>
-#include <linux/iscsi_ibft.h>
-#include <linux/kexec.h>
  #include <linux/cpufreq.h>
  #include <linux/dma-mapping.h>
  #include <linux/ctype.h>
@@ -96,7 +92,7 @@
  #include <asm/smp.h>
  #include <asm/desc.h>
  #include <asm/dma.h>
-#include <asm/gart.h>
+#include <asm/iommu.h>
  #include <asm/mmu_context.h>
  #include <asm/proto.h>
  
@@ -104,7 +100,6 @@
  #include <asm/paravirt.h>
  
  #include <asm/percpu.h>
-#include <asm/sections.h>
  #include <asm/topology.h>
  #include <asm/apicdef.h>
  #ifdef CONFIG_X86_64
@@ -579,6 +574,10 @@ static int __init setup_elfcorehdr(char *arg)
  early_param("elfcorehdr", setup_elfcorehdr);
  #endif
  
+/*
+ * Platform quirk hooks.  x86_quirks starts out pointing at an all-zero
+ * default table, so every hook is NULL until platform code repoints it
+ * at its own table.
+ */
+static struct x86_quirks default_x86_quirks __initdata;
+
+struct x86_quirks *x86_quirks __initdata = &default_x86_quirks;
+
  /*
   * Determine if we were loaded by an EFI loader.  If so, then we have also been
   * passed the efi memmap, systab, etc., so we should use these data structures
@@ -824,7 +823,10 @@ void __init setup_arch(char **cmdline_p)
         vmi_init();
  #endif
  
+       paravirt_pagetable_setup_start(swapper_pg_dir);
         paging_init();
+       paravirt_pagetable_setup_done(swapper_pg_dir);
+       paravirt_post_allocator_init();
  
  #ifdef CONFIG_X86_64
         map_vsyscall();
@@ -854,14 +856,6 @@ void __init setup_arch(char **cmdline_p)
         init_cpu_to_node();
  #endif
  
-#ifdef CONFIG_X86_NUMAQ
-       /*
-        * need to check online nodes num, call it
-        * here before time_init/tsc_init
-        */
-       numaq_tsc_disable();
-#endif
-
         init_apic_mappings();
         ioapic_init_mappings();
  
diff --git a/arch/x86/kernel/signal_32.c b/arch/x86/kernel/signal_32.c

index d92373630963f980fb5471a0d96d903b5d4b18bf..07faaa5109cb78003c4b2e5bbbad15da7136ca34 100644 (file)
--- a/arch/x86/kernel/signal_32.c
+++ b/arch/x86/kernel/signal_32.c
@@ -212,7 +212,7 @@ asmlinkage unsigned long sys_sigreturn(unsigned long __unused)
  
  badframe:
         if (show_unhandled_signals && printk_ratelimit()) {
-               printk(KERN_INFO "%s%s[%d] bad frame in sigreturn frame:"
+               printk("%s%s[%d] bad frame in sigreturn frame:"
                         "%p ip:%lx sp:%lx oeax:%lx",
                     task_pid_nr(current) > 1 ? KERN_INFO : KERN_EMERG,
                     current->comm, task_pid_nr(current), frame, regs->ip,
@@ -657,12 +657,6 @@ static void do_signal(struct pt_regs *regs)
  void
  do_notify_resume(struct pt_regs *regs, void *unused, __u32 thread_info_flags)
  {
-       /* Pending single-step? */
-       if (thread_info_flags & _TIF_SINGLESTEP) {
-               regs->flags |= X86_EFLAGS_TF;
-               clear_thread_flag(TIF_SINGLESTEP);
-       }
-
         /* deal with pending signal delivery */
         if (thread_info_flags & _TIF_SIGPENDING)
                 do_signal(regs);
diff --git a/arch/x86/kernel/signal_64.c b/arch/x86/kernel/signal_64.c

index e53b267662e712681d99b4b5ae6014af9bf790c1..bf87684474f18497e2326d497217ae45b4888df5 100644 (file)
--- a/arch/x86/kernel/signal_64.c
+++ b/arch/x86/kernel/signal_64.c
@@ -487,12 +487,6 @@ static void do_signal(struct pt_regs *regs)
  void do_notify_resume(struct pt_regs *regs, void *unused,
                       __u32 thread_info_flags)
  {
-       /* Pending single-step? */
-       if (thread_info_flags & _TIF_SINGLESTEP) {
-               regs->flags |= X86_EFLAGS_TF;
-               clear_thread_flag(TIF_SINGLESTEP);
-       }
-
  #ifdef CONFIG_X86_MCE
         /* notify userspace of pending MCEs */
         if (thread_info_flags & _TIF_MCE_NOTIFY)
diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c

index 27456574f070eb4b22c1aca6e95feee5949bdf7a..27640196eb7ccadea18f712cc065aab3800e21d0 100644 (file)
--- a/arch/x86/kernel/smpboot.c
+++ b/arch/x86/kernel/smpboot.c
@@ -546,8 +546,8 @@ static inline void __inquire_remote_apic(int apicid)
                         printk(KERN_CONT
                                "a previous APIC delivery may have failed\n");
  
-               apic_write_around(APIC_ICR2, SET_APIC_DEST_FIELD(apicid));
-               apic_write_around(APIC_ICR, APIC_DM_REMRD | regs[i]);
+               apic_write(APIC_ICR2, SET_APIC_DEST_FIELD(apicid));
+               apic_write(APIC_ICR, APIC_DM_REMRD | regs[i]);
  
                 timeout = 0;
                 do {
@@ -579,11 +579,11 @@ wakeup_secondary_cpu(int logical_apicid, unsigned long start_eip)
         int maxlvt;
  
         /* Target chip */
-       apic_write_around(APIC_ICR2, SET_APIC_DEST_FIELD(logical_apicid));
+       apic_write(APIC_ICR2, SET_APIC_DEST_FIELD(logical_apicid));
  
         /* Boot on the stack */
         /* Kick the second */
-       apic_write_around(APIC_ICR, APIC_DM_NMI | APIC_DEST_LOGICAL);
+       apic_write(APIC_ICR, APIC_DM_NMI | APIC_DEST_LOGICAL);
  
         Dprintk("Waiting for send to finish...\n");
         send_status = safe_apic_wait_icr_idle();
@@ -592,14 +592,9 @@ wakeup_secondary_cpu(int logical_apicid, unsigned long start_eip)
          * Give the other CPU some time to accept the IPI.
          */
         udelay(200);
-       /*
-        * Due to the Pentium erratum 3AP.
-        */
         maxlvt = lapic_get_maxlvt();
-       if (maxlvt > 3) {
-               apic_read_around(APIC_SPIV);
+       if (maxlvt > 3)                 /* Due to the Pentium erratum 3AP.  */
                 apic_write(APIC_ESR, 0);
-       }
         accept_status = (apic_read(APIC_ESR) & 0xEF);
         Dprintk("NMI sent.\n");
  
@@ -625,12 +620,14 @@ wakeup_secondary_cpu(int phys_apicid, unsigned long start_eip)
                 return send_status;
         }
  
+       maxlvt = lapic_get_maxlvt();
+
         /*
          * Be paranoid about clearing APIC errors.
          */
         if (APIC_INTEGRATED(apic_version[phys_apicid])) {
-               apic_read_around(APIC_SPIV);
-               apic_write(APIC_ESR, 0);
+               if (maxlvt > 3)         /* Due to the Pentium erratum 3AP.  */
+                       apic_write(APIC_ESR, 0);
                 apic_read(APIC_ESR);
         }
  
@@ -639,13 +636,13 @@ wakeup_secondary_cpu(int phys_apicid, unsigned long start_eip)
         /*
          * Turn INIT on target chip
          */
-       apic_write_around(APIC_ICR2, SET_APIC_DEST_FIELD(phys_apicid));
+       apic_write(APIC_ICR2, SET_APIC_DEST_FIELD(phys_apicid));
  
         /*
          * Send IPI
          */
-       apic_write_around(APIC_ICR, APIC_INT_LEVELTRIG | APIC_INT_ASSERT
-                               | APIC_DM_INIT);
+       apic_write(APIC_ICR,
+                  APIC_INT_LEVELTRIG | APIC_INT_ASSERT | APIC_DM_INIT);
  
         Dprintk("Waiting for send to finish...\n");
         send_status = safe_apic_wait_icr_idle();
@@ -655,10 +652,10 @@ wakeup_secondary_cpu(int phys_apicid, unsigned long start_eip)
         Dprintk("Deasserting INIT.\n");
  
         /* Target chip */
-       apic_write_around(APIC_ICR2, SET_APIC_DEST_FIELD(phys_apicid));
+       apic_write(APIC_ICR2, SET_APIC_DEST_FIELD(phys_apicid));
  
         /* Send IPI */
-       apic_write_around(APIC_ICR, APIC_INT_LEVELTRIG | APIC_DM_INIT);
+       apic_write(APIC_ICR, APIC_INT_LEVELTRIG | APIC_DM_INIT);
  
         Dprintk("Waiting for send to finish...\n");
         send_status = safe_apic_wait_icr_idle();
@@ -689,12 +686,10 @@ wakeup_secondary_cpu(int phys_apicid, unsigned long start_eip)
          */
         Dprintk("#startup loops: %d.\n", num_starts);
  
-       maxlvt = lapic_get_maxlvt();
-
         for (j = 1; j <= num_starts; j++) {
                 Dprintk("Sending STARTUP #%d.\n", j);
-               apic_read_around(APIC_SPIV);
-               apic_write(APIC_ESR, 0);
+               if (maxlvt > 3)         /* Due to the Pentium erratum 3AP.  */
+                       apic_write(APIC_ESR, 0);
                 apic_read(APIC_ESR);
                 Dprintk("After apic_write.\n");
  
@@ -703,12 +698,11 @@ wakeup_secondary_cpu(int phys_apicid, unsigned long start_eip)
                  */
  
                 /* Target chip */
-               apic_write_around(APIC_ICR2, SET_APIC_DEST_FIELD(phys_apicid));
+               apic_write(APIC_ICR2, SET_APIC_DEST_FIELD(phys_apicid));
  
                 /* Boot on the stack */
                 /* Kick the second */
-               apic_write_around(APIC_ICR, APIC_DM_STARTUP
-                                       | (start_eip >> 12));
+               apic_write(APIC_ICR, APIC_DM_STARTUP | (start_eip >> 12));
  
                 /*
                  * Give the other CPU some time to accept the IPI.
@@ -724,13 +718,8 @@ wakeup_secondary_cpu(int phys_apicid, unsigned long start_eip)
                  * Give the other CPU some time to accept the IPI.
                  */
                 udelay(200);
-               /*
-                * Due to the Pentium erratum 3AP.
-                */
-               if (maxlvt > 3) {
-                       apic_read_around(APIC_SPIV);
+               if (maxlvt > 3)         /* Due to the Pentium erratum 3AP.  */
                         apic_write(APIC_ESR, 0);
-               }
                 accept_status = (apic_read(APIC_ESR) & 0xEF);
                 if (send_status || accept_status)
                         break;
@@ -768,7 +757,7 @@ static void __cpuinit do_fork_idle(struct work_struct *work)
   *
   * Must be called after the _cpu_pda pointer table is initialized.
   */
-static int __cpuinit get_local_pda(int cpu)
+int __cpuinit get_local_pda(int cpu)
  {
         struct x8664_pda *oldpda, *newpda;
         unsigned long size = sizeof(struct x8664_pda);
@@ -1390,7 +1379,8 @@ static int __init parse_maxcpus(char *arg)
  {
         extern unsigned int maxcpus;
  
-       maxcpus = simple_strtoul(arg, NULL, 0);
+       if (arg)
+               maxcpus = simple_strtoul(arg, NULL, 0);
         return 0;
  }
  early_param("maxcpus", parse_maxcpus);
diff --git a/arch/x86/kernel/step.c b/arch/x86/kernel/step.c

index 92c20fee6781b5f5d419335e68eee8361b7f960d..e8b9863ef8c4f8d09f10344d868450711add7ee5 100644 (file)
--- a/arch/x86/kernel/step.c
+++ b/arch/x86/kernel/step.c
@@ -105,6 +105,20 @@ static int is_setting_trap_flag(struct task_struct *child, struct pt_regs *regs)
  static int enable_single_step(struct task_struct *child)
  {
         struct pt_regs *regs = task_pt_regs(child);
+       unsigned long oflags;
+
+       /*
+        * If we stepped into a sysenter/syscall insn, it trapped in
+        * kernel mode; do_debug() cleared TF and set TIF_SINGLESTEP.
+        * If user-mode had set TF itself, then it's still clear from
+        * do_debug() and we need to set it again to restore the user
+        * state so we don't wrongly set TIF_FORCED_TF below.
+        * If enable_single_step() was used last and that is what
+        * set TIF_SINGLESTEP, then both TF and TIF_FORCED_TF are
+        * already set and our bookkeeping is fine.
+        */
+       if (unlikely(test_tsk_thread_flag(child, TIF_SINGLESTEP)))
+               regs->flags |= X86_EFLAGS_TF;
  
         /*
          * Always set TIF_SINGLESTEP - this guarantees that
@@ -113,11 +127,7 @@ static int enable_single_step(struct task_struct *child)
          */
         set_tsk_thread_flag(child, TIF_SINGLESTEP);
  
-       /*
-        * If TF was already set, don't do anything else
-        */
-       if (regs->flags & X86_EFLAGS_TF)
-               return 0;
+       oflags = regs->flags;
  
         /* Set TF on the kernel stack.. */
         regs->flags |= X86_EFLAGS_TF;
@@ -126,9 +136,22 @@ static int enable_single_step(struct task_struct *child)
          * ..but if TF is changed by the instruction we will trace,
          * don't mark it as being "us" that set it, so that we
          * won't clear it by hand later.
+        *
+        * Note that if we don't actually execute the popf because
+        * of a signal arriving right now or suchlike, we will lose
+        * track of the fact that it really was "us" that set it.
          */
-       if (is_setting_trap_flag(child, regs))
+       if (is_setting_trap_flag(child, regs)) {
+               clear_tsk_thread_flag(child, TIF_FORCED_TF);
                 return 0;
+       }
+
+       /*
+        * If TF was already set, check whether it was us who set it.
+        * If not, we should never attempt a block step.
+        */
+       if (oflags & X86_EFLAGS_TF)
+               return test_tsk_thread_flag(child, TIF_FORCED_TF);
  
         set_tsk_thread_flag(child, TIF_FORCED_TF);
  
diff --git a/arch/x86/kernel/time_32.c b/arch/x86/kernel/time_32.c

index 059ca6ee59b4f1bbc9d0eafe4c1f2b3977f17e79..ffe3c664afc0aae1318c1b26ab272f858d87812b 100644 (file)
--- a/arch/x86/kernel/time_32.c
+++ b/arch/x86/kernel/time_32.c
@@ -129,6 +129,7 @@ void __init hpet_time_init(void)
   */
  void __init time_init(void)
  {
+       pre_time_init_hook();
         tsc_init();
         late_time_init = choose_time_init();
  }
diff --git a/arch/x86/kernel/traps_32.c b/arch/x86/kernel/traps_32.c

index 8a768973c4f01bdf4b9317d0b23800a233608db9..03df8e45e5a1562e6d63f149a6189eb7964e7335 100644 (file)
--- a/arch/x86/kernel/traps_32.c
+++ b/arch/x86/kernel/traps_32.c
@@ -58,6 +58,7 @@
  #include <asm/nmi.h>
  #include <asm/smp.h>
  #include <asm/io.h>
+#include <asm/traps.h>
  
  #include "mach_traps.h"
  
@@ -77,26 +78,6 @@ char ignore_fpu_irq;
  gate_desc idt_table[256]
         __attribute__((__section__(".data.idt"))) = { { { { 0, 0 } } }, };
  
-asmlinkage void divide_error(void);
-asmlinkage void debug(void);
-asmlinkage void nmi(void);
-asmlinkage void int3(void);
-asmlinkage void overflow(void);
-asmlinkage void bounds(void);
-asmlinkage void invalid_op(void);
-asmlinkage void device_not_available(void);
-asmlinkage void coprocessor_segment_overrun(void);
-asmlinkage void invalid_TSS(void);
-asmlinkage void segment_not_present(void);
-asmlinkage void stack_segment(void);
-asmlinkage void general_protection(void);
-asmlinkage void page_fault(void);
-asmlinkage void coprocessor_error(void);
-asmlinkage void simd_coprocessor_error(void);
-asmlinkage void alignment_check(void);
-asmlinkage void spurious_interrupt_bug(void);
-asmlinkage void machine_check(void);
-
  int panic_on_unrecovered_nmi;
  int kstack_depth_to_print = 24;
  static unsigned int code_bytes = 64;
@@ -256,7 +237,7 @@ static const struct stacktrace_ops print_trace_ops = {
  
  static void
  show_trace_log_lvl(struct task_struct *task, struct pt_regs *regs,
-                  unsigned long *stack, unsigned long bp, char *log_lvl)
+               unsigned long *stack, unsigned long bp, char *log_lvl)
  {
         dump_trace(task, regs, stack, bp, &print_trace_ops, log_lvl);
         printk("%s =======================\n", log_lvl);
@@ -383,6 +364,54 @@ int is_valid_bugaddr(unsigned long ip)
         return ud2 == 0x0b0f;
  }
  
+static raw_spinlock_t die_lock = __RAW_SPIN_LOCK_UNLOCKED;
+static int die_owner = -1;
+static unsigned int die_nest_count;
+
+unsigned __kprobes long oops_begin(void)
+{
+       unsigned long flags;
+
+       oops_enter();
+
+       if (die_owner != raw_smp_processor_id()) {
+               console_verbose();
+               raw_local_irq_save(flags);
+               __raw_spin_lock(&die_lock);
+               die_owner = smp_processor_id();
+               die_nest_count = 0;
+               bust_spinlocks(1);
+       } else {
+               raw_local_irq_save(flags);
+       }
+       die_nest_count++;
+       return flags;
+}
+
+void __kprobes oops_end(unsigned long flags, struct pt_regs *regs, int signr)
+{
+       bust_spinlocks(0);
+       die_owner = -1;
+       add_taint(TAINT_DIE);
+       __raw_spin_unlock(&die_lock);
+       raw_local_irq_restore(flags);
+
+       if (!regs)
+               return;
+
+       if (kexec_should_crash(current))
+               crash_kexec(regs);
+
+       if (in_interrupt())
+               panic("Fatal exception in interrupt");
+
+       if (panic_on_oops)
+               panic("Fatal exception");
+
+       oops_exit();
+       do_exit(signr);
+}
+
  int __kprobes __die(const char *str, struct pt_regs *regs, long err)
  {
         unsigned short ss;
@@ -423,31 +452,9 @@ int __kprobes __die(const char *str, struct pt_regs *regs, long err)
   */
  void die(const char *str, struct pt_regs *regs, long err)
  {
-       static struct {
-               raw_spinlock_t lock;
-               u32 lock_owner;
-               int lock_owner_depth;
-       } die = {
-               .lock =                 __RAW_SPIN_LOCK_UNLOCKED,
-               .lock_owner =           -1,
-               .lock_owner_depth =     0
-       };
-       unsigned long flags;
-
-       oops_enter();
-
-       if (die.lock_owner != raw_smp_processor_id()) {
-               console_verbose();
-               raw_local_irq_save(flags);
-               __raw_spin_lock(&die.lock);
-               die.lock_owner = smp_processor_id();
-               die.lock_owner_depth = 0;
-               bust_spinlocks(1);
-       } else {
-               raw_local_irq_save(flags);
-       }
+       unsigned long flags = oops_begin();
  
-       if (++die.lock_owner_depth < 3) {
+       if (die_nest_count < 3) {
                 report_bug(regs->ip, regs);
  
                 if (__die(str, regs, err))
@@ -456,26 +463,7 @@ void die(const char *str, struct pt_regs *regs, long err)
                 printk(KERN_EMERG "Recursive die() failure, output suppressed\n");
         }
  
-       bust_spinlocks(0);
-       die.lock_owner = -1;
-       add_taint(TAINT_DIE);
-       __raw_spin_unlock(&die.lock);
-       raw_local_irq_restore(flags);
-
-       if (!regs)
-               return;
-
-       if (kexec_should_crash(current))
-               crash_kexec(regs);
-
-       if (in_interrupt())
-               panic("Fatal exception in interrupt");
-
-       if (panic_on_oops)
-               panic("Fatal exception");
-
-       oops_exit();
-       do_exit(SIGSEGV);
+       oops_end(flags, regs, SIGSEGV);
  }
  
  static inline void
diff --git a/arch/x86/kernel/traps_64.c b/arch/x86/kernel/traps_64.c

index 2696a683778204e2f1b417ab4929f27f657836bf..3f18d73f420c414d809a1046dd5676b992e8676e 100644 (file)
--- a/arch/x86/kernel/traps_64.c
+++ b/arch/x86/kernel/traps_64.c
@@ -51,30 +51,10 @@
  #include <asm/pgalloc.h>
  #include <asm/proto.h>
  #include <asm/pda.h>
+#include <asm/traps.h>
  
  #include <mach_traps.h>
  
-asmlinkage void divide_error(void);
-asmlinkage void debug(void);
-asmlinkage void nmi(void);
-asmlinkage void int3(void);
-asmlinkage void overflow(void);
-asmlinkage void bounds(void);
-asmlinkage void invalid_op(void);
-asmlinkage void device_not_available(void);
-asmlinkage void double_fault(void);
-asmlinkage void coprocessor_segment_overrun(void);
-asmlinkage void invalid_TSS(void);
-asmlinkage void segment_not_present(void);
-asmlinkage void stack_segment(void);
-asmlinkage void general_protection(void);
-asmlinkage void page_fault(void);
-asmlinkage void coprocessor_error(void);
-asmlinkage void simd_coprocessor_error(void);
-asmlinkage void alignment_check(void);
-asmlinkage void spurious_interrupt_bug(void);
-asmlinkage void machine_check(void);
-
  int panic_on_unrecovered_nmi;
  int kstack_depth_to_print = 12;
  static unsigned int code_bytes = 64;
@@ -355,17 +335,24 @@ static const struct stacktrace_ops print_trace_ops = {
         .address = print_trace_address,
  };
  
-void show_trace(struct task_struct *task, struct pt_regs *regs,
-               unsigned long *stack, unsigned long bp)
+static void
+show_trace_log_lvl(struct task_struct *task, struct pt_regs *regs,
+               unsigned long *stack, unsigned long bp, char *log_lvl)
  {
         printk("\nCall Trace:\n");
-       dump_trace(task, regs, stack, bp, &print_trace_ops, NULL);
+       dump_trace(task, regs, stack, bp, &print_trace_ops, log_lvl);
         printk("\n");
  }
  
+void show_trace(struct task_struct *task, struct pt_regs *regs,
+               unsigned long *stack, unsigned long bp)
+{
+       show_trace_log_lvl(task, regs, stack, bp, "");
+}
+
  static void
-_show_stack(struct task_struct *task, struct pt_regs *regs,
-               unsigned long *sp, unsigned long bp)
+show_stack_log_lvl(struct task_struct *task, struct pt_regs *regs,
+               unsigned long *sp, unsigned long bp, char *log_lvl)
  {
         unsigned long *stack;
         int i;
@@ -399,12 +386,12 @@ _show_stack(struct task_struct *task, struct pt_regs *regs,
                 printk(" %016lx", *stack++);
                 touch_nmi_watchdog();
         }
-       show_trace(task, regs, sp, bp);
+       show_trace_log_lvl(task, regs, sp, bp, log_lvl);
  }
  
  void show_stack(struct task_struct *task, unsigned long *sp)
  {
-       _show_stack(task, NULL, sp, 0);
+       show_stack_log_lvl(task, NULL, sp, 0, "");
  }
  
  /*
@@ -454,7 +441,8 @@ void show_registers(struct pt_regs *regs)
                 u8 *ip;
  
                 printk("Stack: ");
-               _show_stack(NULL, regs, (unsigned long *)sp, regs->bp);
+               show_stack_log_lvl(NULL, regs, (unsigned long *)sp,
+                               regs->bp, "");
                 printk("\n");
  
                 printk(KERN_EMERG "Code: ");
@@ -518,7 +506,7 @@ unsigned __kprobes long oops_begin(void)
  }
  
  void __kprobes oops_end(unsigned long flags, struct pt_regs *regs, int signr)
-{ 
+{
         die_owner = -1;
         bust_spinlocks(0);
         die_nest_count--;
diff --git a/arch/x86/kernel/visws_quirks.c b/arch/x86/kernel/visws_quirks.c

index e94bdb6add1d335b273eb5613962d686ffc08198..41e01b145c4800c514e07f456a126a0a0b5104e7 100644 (file)
--- a/arch/x86/kernel/visws_quirks.c
+++ b/arch/x86/kernel/visws_quirks.c
@@ -73,7 +73,7 @@ int is_visws_box(void)
         return visws_board_type >= 0;
  }
  
-static int __init visws_time_init_quirk(void)
+static int __init visws_time_init(void)
  {
         printk(KERN_INFO "Starting Cobalt Timer system clock\n");
  
@@ -93,7 +93,7 @@ static int __init visws_time_init_quirk(void)
         return 0;
  }
  
-static int __init visws_pre_intr_init_quirk(void)
+static int __init visws_pre_intr_init(void)
  {
         init_VISWS_APIC_irqs();
  
@@ -114,7 +114,7 @@ EXPORT_SYMBOL(sgivwfb_mem_size);
  
  long long mem_size __initdata = 0;
  
-static char * __init visws_memory_setup_quirk(void)
+static char * __init visws_memory_setup(void)
  {
         long long gfx_mem_size = 8 * MB;
  
@@ -176,7 +176,7 @@ static void visws_machine_power_off(void)
         outl(PIIX_SPECIAL_STOP, 0xCFC);
  }
  
-static int __init visws_get_smp_config_quirk(unsigned int early)
+static int __init visws_get_smp_config(unsigned int early)
  {
         /*
          * Prevent MP-table parsing by the generic code:
@@ -192,7 +192,7 @@ extern unsigned int __cpuinitdata maxcpus;
   * No problem for Linux.
   */
  
-static void __init MP_processor_info (struct mpc_config_processor *m)
+static void __init MP_processor_info(struct mpc_config_processor *m)
  {
         int ver, logical_apicid;
         physid_mask_t apic_cpus;
@@ -232,7 +232,7 @@ static void __init MP_processor_info (struct mpc_config_processor *m)
         apic_version[m->mpc_apicid] = ver;
  }
  
-int __init visws_find_smp_config_quirk(unsigned int reserve)
+static int __init visws_find_smp_config(unsigned int reserve)
  {
         struct mpc_config_processor *mp = phys_to_virt(CO_CPU_TAB_PHYS);
         unsigned short ncpus = readw(phys_to_virt(CO_CPU_NUM_PHYS));
@@ -258,7 +258,17 @@ int __init visws_find_smp_config_quirk(unsigned int reserve)
         return 1;
  }
  
-extern int visws_trap_init_quirk(void);
+static int visws_trap_init(void);
+
+static struct x86_quirks visws_x86_quirks __initdata = {
+       .arch_time_init         = visws_time_init,
+       .arch_pre_intr_init     = visws_pre_intr_init,
+       .arch_memory_setup      = visws_memory_setup,
+       .arch_intr_init         = NULL,
+       .arch_trap_init         = visws_trap_init,
+       .mach_get_smp_config    = visws_get_smp_config,
+       .mach_find_smp_config   = visws_find_smp_config,
+};
  
  void __init visws_early_detect(void)
  {
@@ -272,16 +282,10 @@ void __init visws_early_detect(void)
  
         /*
          * Install special quirks for timer, interrupt and memory setup:
-        */
-       arch_time_init_quirk            = visws_time_init_quirk;
-       arch_pre_intr_init_quirk        = visws_pre_intr_init_quirk;
-       arch_memory_setup_quirk         = visws_memory_setup_quirk;
-
-       /*
          * Fall back to generic behavior for traps:
+        * Override generic MP-table parsing:
          */
-       arch_intr_init_quirk            = NULL;
-       arch_trap_init_quirk            = visws_trap_init_quirk;
+       x86_quirks = &visws_x86_quirks;
  
         /*
          * Install reboot quirks:
@@ -294,12 +298,6 @@ void __init visws_early_detect(void)
          */
         no_broadcast = 0;
  
-       /*
-        * Override generic MP-table parsing:
-        */
-       mach_get_smp_config_quirk       = visws_get_smp_config_quirk;
-       mach_find_smp_config_quirk      = visws_find_smp_config_quirk;
-
  #ifdef CONFIG_X86_IO_APIC
         /*
          * Turn off IO-APIC detection and initialization:
@@ -426,7 +424,7 @@ static __init void cobalt_init(void)
                 co_apic_read(CO_APIC_ID));
  }
  
-int __init visws_trap_init_quirk(void)
+static int __init visws_trap_init(void)
  {
         lithium_init();
         cobalt_init();
diff --git a/arch/x86/kernel/vmi_32.c b/arch/x86/kernel/vmi_32.c

index b15346092b7b72aa66e75621f4e31f72698ccbce..0a1b1a9d922df7f4380a40d4b210330dbc8db17a 100644 (file)
--- a/arch/x86/kernel/vmi_32.c
+++ b/arch/x86/kernel/vmi_32.c
@@ -906,7 +906,6 @@ static inline int __init activate_vmi(void)
  #ifdef CONFIG_X86_LOCAL_APIC
         para_fill(pv_apic_ops.apic_read, APICRead);
         para_fill(pv_apic_ops.apic_write, APICWrite);
-       para_fill(pv_apic_ops.apic_write_atomic, APICWrite);
  #endif
  
         /*
diff --git a/arch/x86/lguest/boot.c b/arch/x86/lguest/boot.c

index 50dad44fb54234e2725c833fafd15b59b3db0283..0313a5eec4125620016299cca616400821f7bd41 100644 (file)
--- a/arch/x86/lguest/boot.c
+++ b/arch/x86/lguest/boot.c
@@ -991,7 +991,6 @@ __init void lguest_init(void)
  #ifdef CONFIG_X86_LOCAL_APIC
         /* apic read/write intercepts */
         pv_apic_ops.apic_write = lguest_apic_write;
-       pv_apic_ops.apic_write_atomic = lguest_apic_write;
         pv_apic_ops.apic_read = lguest_apic_read;
  #endif
  
diff --git a/arch/x86/mach-default/setup.c b/arch/x86/mach-default/setup.c

index 48278fa7d3dee05eeaf2ccfa9bb8c726f6dd2f84..3d317836be9ed9e1739e92461d0912dc37c65ffb 100644 (file)
--- a/arch/x86/mach-default/setup.c
+++ b/arch/x86/mach-default/setup.c
@@ -10,14 +10,6 @@
  #include <asm/e820.h>
  #include <asm/setup.h>
  
-/*
- * Any quirks to be performed to initialize timers/irqs/etc?
- */
-int (*arch_time_init_quirk)(void);
-int (*arch_pre_intr_init_quirk)(void);
-int (*arch_intr_init_quirk)(void);
-int (*arch_trap_init_quirk)(void);
-
  #ifdef CONFIG_HOTPLUG_CPU
  #define DEFAULT_SEND_IPI       (1)
  #else
@@ -37,8 +29,8 @@ int no_broadcast=DEFAULT_SEND_IPI;
   **/
  void __init pre_intr_init_hook(void)
  {
-       if (arch_pre_intr_init_quirk) {
-               if (arch_pre_intr_init_quirk())
+       if (x86_quirks->arch_pre_intr_init) {
+               if (x86_quirks->arch_pre_intr_init())
                         return;
         }
         init_ISA_irqs();
@@ -64,8 +56,8 @@ static struct irqaction irq2 = {
   **/
  void __init intr_init_hook(void)
  {
-       if (arch_intr_init_quirk) {
-               if (arch_intr_init_quirk())
+       if (x86_quirks->arch_intr_init) {
+               if (x86_quirks->arch_intr_init())
                         return;
         }
  #ifdef CONFIG_X86_LOCAL_APIC
@@ -97,8 +89,8 @@ void __init pre_setup_arch_hook(void)
   **/
  void __init trap_init_hook(void)
  {
-       if (arch_trap_init_quirk) {
-               if (arch_trap_init_quirk())
+       if (x86_quirks->arch_trap_init) {
+               if (x86_quirks->arch_trap_init())
                         return;
         }
  }
@@ -110,6 +102,16 @@ static struct irqaction irq0  = {
         .name = "timer"
  };
  
+/**
+ * pre_time_init_hook - do any specific initialisations before the rest of time_init() runs.
+ *
+ **/
+void __init pre_time_init_hook(void)
+{
+       if (x86_quirks->arch_pre_time_init)
+               x86_quirks->arch_pre_time_init();
+}
+
  /**
   * time_init_hook - do any specific initialisations for the system timer.
   *
@@ -119,13 +121,13 @@ static struct irqaction irq0  = {
   **/
  void __init time_init_hook(void)
  {
-       if (arch_time_init_quirk) {
+       if (x86_quirks->arch_time_init) {
                 /*
                  * A nonzero return code does not mean failure, it means
                  * that the architecture quirk does not want any
                  * generic (timer) setup to be performed after this:
                  */
-               if (arch_time_init_quirk())
+               if (x86_quirks->arch_time_init())
                         return;
         }
  
diff --git a/arch/x86/mm/Makefile b/arch/x86/mm/Makefile

index 9873716e9f764bcd7c5bc0f369a0269efdffb35a..1fbb844c3d7afdc0b35723d96ba6578c09584de1 100644 (file)
--- a/arch/x86/mm/Makefile
+++ b/arch/x86/mm/Makefile
@@ -21,3 +21,4 @@ obj-$(CONFIG_K8_NUMA)         += k8topology_64.o
  endif
  obj-$(CONFIG_ACPI_NUMA)                += srat_$(BITS).o
  
+obj-$(CONFIG_MEMTEST)          += memtest.o
diff --git a/arch/x86/mm/init_32.c b/arch/x86/mm/init_32.c

index 9689a5138e6472e33c6d0862b3ae56194ffcedb4..d37f29376b0ce455ae3907051a58779f4b995a25 100644 (file)
--- a/arch/x86/mm/init_32.c
+++ b/arch/x86/mm/init_32.c
@@ -844,6 +844,9 @@ unsigned long __init_refok init_memory_mapping(unsigned long start,
                 reserve_early(table_start << PAGE_SHIFT,
                                  table_end << PAGE_SHIFT, "PGTABLE");
  
+       if (!after_init_bootmem)
+               early_memtest(start, end);
+
         return end >> PAGE_SHIFT;
  }
  
@@ -868,8 +871,6 @@ void __init paging_init(void)
          */
         sparse_init();
         zone_sizes_init();
-
-       paravirt_post_allocator_init();
  }
  
  /*
diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c

index 306049edd55322a3c5aec4c7acdf3e38e224eda6..ec37121f67092b8c996b9b99e104d17e642dc650 100644 (file)
--- a/arch/x86/mm/init_64.c
+++ b/arch/x86/mm/init_64.c
@@ -517,118 +517,6 @@ static void __init init_gbpages(void)
                 direct_gbpages = 0;
  }
  
-#ifdef CONFIG_MEMTEST
-
-static void __init memtest(unsigned long start_phys, unsigned long size,
-                                unsigned pattern)
-{
-       unsigned long i;
-       unsigned long *start;
-       unsigned long start_bad;
-       unsigned long last_bad;
-       unsigned long val;
-       unsigned long start_phys_aligned;
-       unsigned long count;
-       unsigned long incr;
-
-       switch (pattern) {
-       case 0:
-               val = 0UL;
-               break;
-       case 1:
-               val = -1UL;
-               break;
-       case 2:
-               val = 0x5555555555555555UL;
-               break;
-       case 3:
-               val = 0xaaaaaaaaaaaaaaaaUL;
-               break;
-       default:
-               return;
-       }
-
-       incr = sizeof(unsigned long);
-       start_phys_aligned = ALIGN(start_phys, incr);
-       count = (size - (start_phys_aligned - start_phys))/incr;
-       start = __va(start_phys_aligned);
-       start_bad = 0;
-       last_bad = 0;
-
-       for (i = 0; i < count; i++)
-               start[i] = val;
-       for (i = 0; i < count; i++, start++, start_phys_aligned += incr) {
-               if (*start != val) {
-                       if (start_phys_aligned == last_bad + incr) {
-                               last_bad += incr;
-                       } else {
-                               if (start_bad) {
-                                       printk(KERN_CONT "\n  %016lx bad mem addr %016lx - %016lx reserved",
-                                               val, start_bad, last_bad + incr);
-                                       reserve_early(start_bad, last_bad - start_bad, "BAD RAM");
-                               }
-                               start_bad = last_bad = start_phys_aligned;
-                       }
-               }
-       }
-       if (start_bad) {
-               printk(KERN_CONT "\n  %016lx bad mem addr %016lx - %016lx reserved",
-                       val, start_bad, last_bad + incr);
-               reserve_early(start_bad, last_bad - start_bad, "BAD RAM");
-       }
-
-}
-
-/* default is disabled */
-static int memtest_pattern __initdata;
-
-static int __init parse_memtest(char *arg)
-{
-       if (arg)
-               memtest_pattern = simple_strtoul(arg, NULL, 0);
-       return 0;
-}
-
-early_param("memtest", parse_memtest);
-
-static void __init early_memtest(unsigned long start, unsigned long end)
-{
-       u64 t_start, t_size;
-       unsigned pattern;
-
-       if (!memtest_pattern)
-               return;
-
-       printk(KERN_INFO "early_memtest: pattern num %d", memtest_pattern);
-       for (pattern = 0; pattern < memtest_pattern; pattern++) {
-               t_start = start;
-               t_size = 0;
-               while (t_start < end) {
-                       t_start = find_e820_area_size(t_start, &t_size, 1);
-
-                       /* done ? */
-                       if (t_start >= end)
-                               break;
-                       if (t_start + t_size > end)
-                               t_size = end - t_start;
-
-                       printk(KERN_CONT "\n  %016llx - %016llx pattern %d",
-                               (unsigned long long)t_start,
-                               (unsigned long long)t_start + t_size, pattern);
-
-                       memtest(t_start, t_size, pattern);
-
-                       t_start += t_size;
-               }
-       }
-       printk(KERN_CONT "\n");
-}
-#else
-static void __init early_memtest(unsigned long start, unsigned long end)
-{
-}
-#endif
-
  static unsigned long __init kernel_physical_mapping_init(unsigned long start,
                                                 unsigned long end,
                                                 unsigned long page_size_mask)
diff --git a/arch/x86/mm/memtest.c b/arch/x86/mm/memtest.c

new file mode 100644 (file)

index 0000000..672e17f
--- /dev/null
+++ b/arch/x86/mm/memtest.c
@@ -0,0 +1,123 @@
+#include <linux/kernel.h>
+#include <linux/errno.h>
+#include <linux/string.h>
+#include <linux/types.h>
+#include <linux/mm.h>
+#include <linux/smp.h>
+#include <linux/init.h>
+#include <linux/pfn.h>
+
+#include <asm/e820.h>
+
+/*
+ * memtest - fill one physical range with a test pattern and verify it.
+ * @start_phys: physical start address of the range
+ * @size:       length of the range in bytes
+ * @pattern:    pattern selector: 0 = all zeroes, 1 = all ones,
+ *              2 = 0x55.. , 3 = 0xaa.. ; any other value is a no-op
+ *
+ * The range is accessed through __va() one unsigned long at a time,
+ * starting at the first word-aligned address at or above @start_phys.
+ * Every run of consecutive words that do not read back the pattern is
+ * printed and passed to reserve_early() under the name "BAD RAM".
+ *
+ * NOTE: start_bad == 0 doubles as "no bad run recorded yet", so a bad run
+ * that starts at physical address 0 is neither printed nor reserved.
+ */
+static void __init memtest(unsigned long start_phys, unsigned long size,
+                                unsigned pattern)
+{
+       unsigned long i;
+       unsigned long *start;
+       unsigned long start_bad;
+       unsigned long last_bad;
+       unsigned long val;
+       unsigned long start_phys_aligned;
+       unsigned long count;
+       unsigned long incr;
+
+       switch (pattern) {
+       case 0:
+               val = 0UL;
+               break;
+       case 1:
+               val = -1UL;
+               break;
+       case 2:
+#ifdef CONFIG_X86_64
+               val = 0x5555555555555555UL;
+#else
+               val = 0x55555555UL;
+#endif
+               break;
+       case 3:
+#ifdef CONFIG_X86_64
+               val = 0xaaaaaaaaaaaaaaaaUL;
+#else
+               val = 0xaaaaaaaaUL;
+#endif
+               break;
+       default:
+               return;
+       }
+
+       incr = sizeof(unsigned long);
+       start_phys_aligned = ALIGN(start_phys, incr);
+       /* whole words that fit after skipping the unaligned head */
+       count = (size - (start_phys_aligned - start_phys))/incr;
+       start = __va(start_phys_aligned);
+       start_bad = 0;
+       last_bad = 0;
+
+       /* pass 1: write the pattern everywhere */
+       for (i = 0; i < count; i++)
+               start[i] = val;
+       /* pass 2: read back, merging adjacent bad words into one run */
+       for (i = 0; i < count; i++, start++, start_phys_aligned += incr) {
+               if (*start != val) {
+                       if (start_phys_aligned == last_bad + incr) {
+                               /* directly follows the previous bad word */
+                               last_bad += incr;
+                       } else {
+                               /* a new run begins: flush the previous one */
+                               if (start_bad) {
+                                       printk(KERN_CONT "\n  %010lx bad mem addr %010lx - %010lx reserved",
+                                               val, start_bad, last_bad + incr);
+                                       /*
+                                        * The second argument is an end
+                                        * address, so pass the same end
+                                        * that was just printed.
+                                        */
+                                       reserve_early(start_bad, last_bad + incr, "BAD RAM");
+                               }
+                               start_bad = last_bad = start_phys_aligned;
+                       }
+               }
+       }
+       /* flush the run that was still open when the scan ended */
+       if (start_bad) {
+               printk(KERN_CONT "\n  %016lx bad mem addr %010lx - %010lx reserved",
+                       val, start_bad, last_bad + incr);
+               reserve_early(start_bad, last_bad + incr, "BAD RAM");
+       }
+
+}
+
+/* default is disabled */
+static int memtest_pattern __initdata;
+
+/*
+ * Handler for the "memtest" early parameter: store the numeric argument
+ * in memtest_pattern.  A missing argument leaves the value unchanged.
+ * Always returns 0.
+ */
+static int __init parse_memtest(char *arg)
+{
+       if (!arg)
+               return 0;
+
+       memtest_pattern = simple_strtoul(arg, NULL, 0);
+       return 0;
+}
+
+early_param("memtest", parse_memtest);
+
+/*
+ * early_memtest - run the boot-time memory test over [start, end).
+ * @start: lower bound of the range to test
+ * @end:   upper bound (exclusive) of the range to test
+ *
+ * Does nothing when memtest_pattern is 0.  Otherwise patterns
+ * 0 .. memtest_pattern - 1 are each run over every area that
+ * find_e820_area_size() returns below @end, clipped to @end.
+ */
+void __init early_memtest(unsigned long start, unsigned long end)
+{
+       u64 cur, len;
+       unsigned pass;
+
+       if (memtest_pattern == 0)
+               return;
+
+       printk(KERN_INFO "early_memtest: pattern num %d", memtest_pattern);
+       for (pass = 0; pass < memtest_pattern; pass++) {
+               len = 0;
+               for (cur = start; cur < end; cur += len) {
+                       cur = find_e820_area_size(cur, &len, 1);
+
+                       /* nothing left below end */
+                       if (cur >= end)
+                               break;
+                       /* clip the area to the requested range */
+                       if (cur + len > end)
+                               len = end - cur;
+
+                       printk(KERN_CONT "\n  %010llx - %010llx pattern %d",
+                               (unsigned long long)cur,
+                               (unsigned long long)cur + len, pass);
+
+                       memtest(cur, len, pass);
+               }
+       }
+       printk(KERN_CONT "\n");
+}
diff --git a/arch/x86/mm/pat.c b/arch/x86/mm/pat.c

index 6bb597f4d70133e4aefb9db3d8e4b1bec259c804..2fe30916d4b66aba6ea1121a9d5bfadb60f1c82f 100644 (file)
--- a/arch/x86/mm/pat.c
+++ b/arch/x86/mm/pat.c
@@ -12,6 +12,8 @@
  #include <linux/gfp.h>
  #include <linux/fs.h>
  #include <linux/bootmem.h>
+#include <linux/debugfs.h>
+#include <linux/seq_file.h>
  
  #include <asm/msr.h>
  #include <asm/tlbflush.h>
@@ -489,3 +491,89 @@ void unmap_devmem(unsigned long pfn, unsigned long size, pgprot_t vma_prot)
  
         free_memtype(addr, addr + size);
  }
+
+#if defined(CONFIG_DEBUG_FS)
+
+/*
+ * Return a kmalloc()ed copy of the pos-th entry (counting from 1) of
+ * memtype_list, or NULL if the allocation fails or the list has fewer
+ * than pos entries.  The entry is copied while memtype_lock is held;
+ * the caller owns the returned copy.
+ */
+static struct memtype *memtype_get_idx(loff_t pos)
+{
+       struct memtype *cur, *copy;
+       int idx = 0;
+
+       copy = kmalloc(sizeof(struct memtype), GFP_KERNEL);
+       if (!copy)
+               return NULL;
+
+       spin_lock(&memtype_lock);
+       list_for_each_entry(cur, &memtype_list, nd) {
+               if (++idx != pos)
+                       continue;
+
+               *copy = *cur;
+               spin_unlock(&memtype_lock);
+               return copy;
+       }
+       spin_unlock(&memtype_lock);
+
+       /* ran off the end of the list: the copy buffer is unused */
+       kfree(copy);
+       return NULL;
+}
+
+/*
+ * seq_file start callback.  At position 0 the header line is emitted
+ * and *pos moves to 1; the copy of the entry at *pos is then returned
+ * (NULL when there is none).
+ */
+static void *memtype_seq_start(struct seq_file *seq, loff_t *pos)
+{
+       if (!*pos) {
+               seq_printf(seq, "PAT memtype list:\n");
+               *pos = 1;
+       }
+
+       return memtype_get_idx(*pos);
+}
+
+/* seq_file next callback: advance *pos and return a copy of that entry. */
+static void *memtype_seq_next(struct seq_file *seq, void *v, loff_t *pos)
+{
+       *pos += 1;
+
+       return memtype_get_idx(*pos);
+}
+
+/*
+ * seq_file stop callback: intentionally empty.  The per-entry copy is
+ * freed in memtype_seq_show(), so there is nothing to release here.
+ */
+static void memtype_seq_stop(struct seq_file *seq, void *v)
+{
+}
+
+/*
+ * seq_file show callback: print one "<type> @ 0x<start>-0x<end>" line
+ * for the entry copy passed in @v, then free that copy.  Returns 0.
+ */
+static int memtype_seq_show(struct seq_file *seq, void *v)
+{
+       struct memtype *entry = v;
+
+       seq_printf(seq, "%s @ 0x%Lx-0x%Lx\n", cattr_name(entry->type),
+                       entry->start, entry->end);
+
+       /* the copy came from memtype_get_idx() and is ours to release */
+       kfree(entry);
+       return 0;
+}
+
+/* Iterator callbacks used by seq_file to walk the memtype list. */
+static struct seq_operations memtype_seq_ops = {
+       .start = memtype_seq_start,
+       .next  = memtype_seq_next,
+       .stop  = memtype_seq_stop,
+       .show  = memtype_seq_show,
+};
+
+/* open() handler: attach the memtype iterator to @file via seq_open(). */
+static int memtype_seq_open(struct inode *inode, struct file *file)
+{
+       return seq_open(file, &memtype_seq_ops);
+}
+
+/* File operations for the debugfs file; all but open are seq_file helpers. */
+static const struct file_operations memtype_fops = {
+       .open    = memtype_seq_open,
+       .read    = seq_read,
+       .llseek  = seq_lseek,
+       .release = seq_release,
+};
+
+/*
+ * Create the "pat_memtype_list" file (mode S_IRUSR) under
+ * arch_debugfs_dir.  The result of debugfs_create_file() is not
+ * checked; this initcall always returns 0.
+ */
+static int __init pat_memtype_list_init(void)
+{
+       debugfs_create_file("pat_memtype_list", S_IRUSR, arch_debugfs_dir,
+                               NULL, &memtype_fops);
+       return 0;
+}
+
+late_initcall(pat_memtype_list_init);
+
+#endif /* CONFIG_DEBUG_FS */
diff --git a/arch/x86/pci/Makefile b/arch/x86/pci/Makefile

index e515e8db842a0aa4a78ffe617abc7df08b110659..d49202e740eaf5a224897e4991c9eec951657918 100644 (file)
--- a/arch/x86/pci/Makefile
+++ b/arch/x86/pci/Makefile
@@ -5,13 +5,13 @@ obj-$(CONFIG_PCI_MMCONFIG)    += mmconfig_$(BITS).o direct.o mmconfig-shared.o
  obj-$(CONFIG_PCI_DIRECT)       += direct.o
  obj-$(CONFIG_PCI_OLPC)         += olpc.o
  
-pci-y                          := fixup.o
-pci-$(CONFIG_ACPI)             += acpi.o
-pci-y                          += legacy.o irq.o
+obj-y                          += fixup.o
+obj-$(CONFIG_ACPI)             += acpi.o
+obj-y                          += legacy.o irq.o
  
-pci-$(CONFIG_X86_VISWS)                += visws.o
+obj-$(CONFIG_X86_VISWS)                += visws.o
  
-pci-$(CONFIG_X86_NUMAQ)                += numa.o
+obj-$(CONFIG_X86_NUMAQ)                += numaq_32.o
  
-obj-y                          += $(pci-y) common.o early.o
+obj-y                          += common.o early.o
  obj-y                          += amd_bus.o
diff --git a/arch/x86/pci/legacy.c b/arch/x86/pci/legacy.c

index 132876cc6fca3e9d42cd836b2378ee26b4cdf7c7..ec9ce35e44d6afe51b2c63bafbe9d5e3053d68c4 100644 (file)
--- a/arch/x86/pci/legacy.c
+++ b/arch/x86/pci/legacy.c
@@ -57,14 +57,17 @@ static int __init pci_legacy_init(void)
  
  int __init pci_subsys_init(void)
  {
+#ifdef CONFIG_X86_NUMAQ
+       pci_numaq_init();
+#endif
  #ifdef CONFIG_ACPI
         pci_acpi_init();
+#endif
+#ifdef CONFIG_X86_VISWS
+       pci_visws_init();
  #endif
         pci_legacy_init();
         pcibios_irq_init();
-#ifdef CONFIG_X86_NUMAQ
-       pci_numa_init();
-#endif
         pcibios_init();
  
         return 0;
diff --git a/arch/x86/pci/numa.c b/arch/x86/pci/numa.c

deleted file mode 100644 (file)

index 8b5ca19..0000000
--- a/arch/x86/pci/numa.c
+++ /dev/null
@@ -1,178 +0,0 @@
-/*
- * numa.c - Low-level PCI access for NUMA-Q machines
- */
-
-#include <linux/pci.h>
-#include <linux/init.h>
-#include <linux/nodemask.h>
-#include <mach_apic.h>
-#include <asm/mpspec.h>
-#include "pci.h"
-
-#define XQUAD_PORTIO_BASE 0xfe400000
-#define XQUAD_PORTIO_QUAD 0x40000  /* 256k per quad. */
-
-#define BUS2QUAD(global) (mp_bus_id_to_node[global])
-
-#define BUS2LOCAL(global) (mp_bus_id_to_local[global])
-
-#define QUADLOCAL2BUS(quad,local) (quad_local_to_mp_bus_id[quad][local])
-
-/* Where the IO area was mapped on multiquad, always 0 otherwise */
-void *xquad_portio;
-EXPORT_SYMBOL(xquad_portio);
-
-#define XQUAD_PORT_ADDR(port, quad) (xquad_portio + (XQUAD_PORTIO_QUAD*quad) + port)
-
-#define PCI_CONF1_MQ_ADDRESS(bus, devfn, reg) \
-       (0x80000000 | (BUS2LOCAL(bus) << 16) | (devfn << 8) | (reg & ~3))
-
-static void write_cf8(unsigned bus, unsigned devfn, unsigned reg)
-{
-       unsigned val = PCI_CONF1_MQ_ADDRESS(bus, devfn, reg);
-       if (xquad_portio)
-               writel(val, XQUAD_PORT_ADDR(0xcf8, BUS2QUAD(bus)));
-       else
-               outl(val, 0xCF8);
-}
-
-static int pci_conf1_mq_read(unsigned int seg, unsigned int bus,
-                            unsigned int devfn, int reg, int len, u32 *value)
-{
-       unsigned long flags;
-       void *adr __iomem = XQUAD_PORT_ADDR(0xcfc, BUS2QUAD(bus));
-
-       if (!value || (bus >= MAX_MP_BUSSES) || (devfn > 255) || (reg > 255))
-               return -EINVAL;
-
-       spin_lock_irqsave(&pci_config_lock, flags);
-
-       write_cf8(bus, devfn, reg);
-
-       switch (len) {
-       case 1:
-               if (xquad_portio)
-                       *value = readb(adr + (reg & 3));
-               else
-                       *value = inb(0xCFC + (reg & 3));
-               break;
-       case 2:
-               if (xquad_portio)
-                       *value = readw(adr + (reg & 2));
-               else
-                       *value = inw(0xCFC + (reg & 2));
-               break;
-       case 4:
-               if (xquad_portio)
-                       *value = readl(adr);
-               else
-                       *value = inl(0xCFC);
-               break;
-       }
-
-       spin_unlock_irqrestore(&pci_config_lock, flags);
-
-       return 0;
-}
-
-static int pci_conf1_mq_write(unsigned int seg, unsigned int bus,
-                             unsigned int devfn, int reg, int len, u32 value)
-{
-       unsigned long flags;
-       void *adr __iomem = XQUAD_PORT_ADDR(0xcfc, BUS2QUAD(bus));
-
-       if ((bus >= MAX_MP_BUSSES) || (devfn > 255) || (reg > 255)) 
-               return -EINVAL;
-
-       spin_lock_irqsave(&pci_config_lock, flags);
-
-       write_cf8(bus, devfn, reg);
-
-       switch (len) {
-       case 1:
-               if (xquad_portio)
-                       writeb(value, adr + (reg & 3));
-               else
-                       outb((u8)value, 0xCFC + (reg & 3));
-               break;
-       case 2:
-               if (xquad_portio)
-                       writew(value, adr + (reg & 2));
-               else
-                       outw((u16)value, 0xCFC + (reg & 2));
-               break;
-       case 4:
-               if (xquad_portio)
-                       writel(value, adr + reg);
-               else
-                       outl((u32)value, 0xCFC);
-               break;
-       }
-
-       spin_unlock_irqrestore(&pci_config_lock, flags);
-
-       return 0;
-}
-
-#undef PCI_CONF1_MQ_ADDRESS
-
-static struct pci_raw_ops pci_direct_conf1_mq = {
-       .read   = pci_conf1_mq_read,
-       .write  = pci_conf1_mq_write
-};
-
-
-static void __devinit pci_fixup_i450nx(struct pci_dev *d)
-{
-       /*
-        * i450NX -- Find and scan all secondary buses on all PXB's.
-        */
-       int pxb, reg;
-       u8 busno, suba, subb;
-       int quad = BUS2QUAD(d->bus->number);
-
-       printk("PCI: Searching for i450NX host bridges on %s\n", pci_name(d));
-       reg = 0xd0;
-       for(pxb=0; pxb<2; pxb++) {
-               pci_read_config_byte(d, reg++, &busno);
-               pci_read_config_byte(d, reg++, &suba);
-               pci_read_config_byte(d, reg++, &subb);
-               DBG("i450NX PXB %d: %02x/%02x/%02x\n", pxb, busno, suba, subb);
-               if (busno) {
-                       /* Bus A */
-                       pci_scan_bus_with_sysdata(QUADLOCAL2BUS(quad, busno));
-               }
-               if (suba < subb) {
-                       /* Bus B */
-                       pci_scan_bus_with_sysdata(QUADLOCAL2BUS(quad, suba+1));
-               }
-       }
-       pcibios_last_bus = -1;
-}
-DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_82451NX, pci_fixup_i450nx);
-
-int __init pci_numa_init(void)
-{
-       int quad;
-
-       if (!found_numaq)
-               return 0;
-
-       raw_pci_ops = &pci_direct_conf1_mq;
-
-       if (pcibios_scanned++)
-               return 0;
-
-       pci_root_bus = pcibios_scan_root(0);
-       if (pci_root_bus)
-               pci_bus_add_devices(pci_root_bus);
-       if (num_online_nodes() > 1)
-               for_each_online_node(quad) {
-                       if (quad == 0)
-                               continue;
-                       printk("Scanning PCI bus %d for quad %d\n", 
-                               QUADLOCAL2BUS(quad,0), quad);
-                       pci_scan_bus_with_sysdata(QUADLOCAL2BUS(quad, 0));
-               }
-       return 0;
-}
diff --git a/arch/x86/pci/numaq_32.c b/arch/x86/pci/numaq_32.c

new file mode 100644 (file)

index 0000000..f4b16dc
--- /dev/null
+++ b/arch/x86/pci/numaq_32.c
@@ -0,0 +1,178 @@
+/*
+ * numaq_32.c - Low-level PCI access for NUMA-Q machines
+ */
+
+#include <linux/pci.h>
+#include <linux/init.h>
+#include <linux/nodemask.h>
+#include <mach_apic.h>
+#include <asm/mpspec.h>
+#include "pci.h"
+
+#define XQUAD_PORTIO_BASE 0xfe400000
+#define XQUAD_PORTIO_QUAD 0x40000  /* 256k per quad. */
+
+#define BUS2QUAD(global) (mp_bus_id_to_node[global])
+
+#define BUS2LOCAL(global) (mp_bus_id_to_local[global])
+
+#define QUADLOCAL2BUS(quad,local) (quad_local_to_mp_bus_id[quad][local])
+
+/* Where the IO area was mapped on multiquad, always 0 otherwise */
+void *xquad_portio;
+EXPORT_SYMBOL(xquad_portio);
+
+#define XQUAD_PORT_ADDR(port, quad) (xquad_portio + (XQUAD_PORTIO_QUAD*quad) + port)
+
+#define PCI_CONF1_MQ_ADDRESS(bus, devfn, reg) \
+       (0x80000000 | (BUS2LOCAL(bus) << 16) | (devfn << 8) | (reg & ~3))
+
+/*
+ * Write the CONF1 address word for (bus, devfn, reg) to the 0xCF8
+ * address port.  When xquad_portio is set the write goes through the
+ * memory-mapped port window of the quad BUS2QUAD() gives for @bus;
+ * otherwise it goes to the I/O port directly.
+ */
+static void write_cf8(unsigned bus, unsigned devfn, unsigned reg)
+{
+       unsigned cf8 = PCI_CONF1_MQ_ADDRESS(bus, devfn, reg);
+
+       if (!xquad_portio) {
+               outl(cf8, 0xCF8);
+               return;
+       }
+
+       writel(cf8, XQUAD_PORT_ADDR(0xcf8, BUS2QUAD(bus)));
+}
+
+/*
+ * pci_conf1_mq_read - read a PCI config register on a NUMA-Q machine.
+ * @seg:   unused here
+ * @bus:   global bus number, must be below MAX_MP_BUSSES
+ * @devfn: device/function, 0..255
+ * @reg:   register offset, at most 255
+ * @len:   access width in bytes: 1, 2 or 4
+ * @value: where the result is stored
+ *
+ * Returns -EINVAL for a NULL @value or an out-of-range @bus, @devfn or
+ * @reg, otherwise 0.  For a @len other than 1, 2 or 4 nothing is read
+ * and *@value is left untouched.  The address/data port pair is accessed
+ * under pci_config_lock.
+ */
+static int pci_conf1_mq_read(unsigned int seg, unsigned int bus,
+                            unsigned int devfn, int reg, int len, u32 *value)
+{
+       unsigned long flags;
+       void *adr __iomem;
+
+       if (!value || (bus >= MAX_MP_BUSSES) || (devfn > 255) || (reg > 255))
+               return -EINVAL;
+
+       /*
+        * BUS2QUAD() indexes a per-bus table, so only evaluate it once
+        * @bus has passed the range check above.
+        */
+       adr = XQUAD_PORT_ADDR(0xcfc, BUS2QUAD(bus));
+
+       spin_lock_irqsave(&pci_config_lock, flags);
+
+       write_cf8(bus, devfn, reg);
+
+       switch (len) {
+       case 1:
+               if (xquad_portio)
+                       *value = readb(adr + (reg & 3));
+               else
+                       *value = inb(0xCFC + (reg & 3));
+               break;
+       case 2:
+               if (xquad_portio)
+                       *value = readw(adr + (reg & 2));
+               else
+                       *value = inw(0xCFC + (reg & 2));
+               break;
+       case 4:
+               if (xquad_portio)
+                       *value = readl(adr);
+               else
+                       *value = inl(0xCFC);
+               break;
+       }
+
+       spin_unlock_irqrestore(&pci_config_lock, flags);
+
+       return 0;
+}
+
+/*
+ * pci_conf1_mq_write - write a PCI config register on a NUMA-Q machine.
+ * @seg:   unused here
+ * @bus:   global bus number, must be below MAX_MP_BUSSES
+ * @devfn: device/function, 0..255
+ * @reg:   register offset, at most 255
+ * @len:   access width in bytes: 1, 2 or 4
+ * @value: value to write
+ *
+ * Returns -EINVAL for an out-of-range @bus, @devfn or @reg, otherwise 0.
+ * For a @len other than 1, 2 or 4 only the address port is written.
+ * The address/data port pair is accessed under pci_config_lock.
+ */
+static int pci_conf1_mq_write(unsigned int seg, unsigned int bus,
+                             unsigned int devfn, int reg, int len, u32 value)
+{
+       unsigned long flags;
+       void *adr __iomem;
+
+       if ((bus >= MAX_MP_BUSSES) || (devfn > 255) || (reg > 255))
+               return -EINVAL;
+
+       /*
+        * BUS2QUAD() indexes a per-bus table, so only evaluate it once
+        * @bus has passed the range check above.
+        */
+       adr = XQUAD_PORT_ADDR(0xcfc, BUS2QUAD(bus));
+
+       spin_lock_irqsave(&pci_config_lock, flags);
+
+       write_cf8(bus, devfn, reg);
+
+       switch (len) {
+       case 1:
+               if (xquad_portio)
+                       writeb(value, adr + (reg & 3));
+               else
+                       outb((u8)value, 0xCFC + (reg & 3));
+               break;
+       case 2:
+               if (xquad_portio)
+                       writew(value, adr + (reg & 2));
+               else
+                       outw((u16)value, 0xCFC + (reg & 2));
+               break;
+       case 4:
+               /*
+                * A dword access targets the 0xCFC data port itself: the
+                * register number was already latched by write_cf8().
+                * This matches readl(adr) in pci_conf1_mq_read() and the
+                * outl() fallback below.
+                */
+               if (xquad_portio)
+                       writel(value, adr);
+               else
+                       outl((u32)value, 0xCFC);
+               break;
+       }
+
+       spin_unlock_irqrestore(&pci_config_lock, flags);
+
+       return 0;
+}
+
+#undef PCI_CONF1_MQ_ADDRESS
+
+/* Raw config-space accessors installed by pci_numaq_init(). */
+static struct pci_raw_ops pci_direct_conf1_mq = {
+       .read   = pci_conf1_mq_read,
+       .write  = pci_conf1_mq_write
+};
+
+
+/*
+ * Header fixup for the device @d: for each of two PXBs, read three
+ * consecutive config bytes starting at offset 0xd0 (busno, suba, subb)
+ * and scan the buses they name.  Bus numbers read from the device are
+ * quad-local and are translated to global numbers with QUADLOCAL2BUS()
+ * using the quad of @d's own bus.  pcibios_last_bus is set to -1 at
+ * the end.
+ */
+static void __devinit pci_fixup_i450nx(struct pci_dev *d)
+{
+       /*
+        * i450NX -- Find and scan all secondary buses on all PXB's.
+        */
+       int pxb, reg;
+       u8 busno, suba, subb;
+       int quad = BUS2QUAD(d->bus->number);
+
+       printk("PCI: Searching for i450NX host bridges on %s\n", pci_name(d));
+       reg = 0xd0;
+       for(pxb=0; pxb<2; pxb++) {
+               /* reg advances by one per read: 3 bytes per PXB */
+               pci_read_config_byte(d, reg++, &busno);
+               pci_read_config_byte(d, reg++, &suba);
+               pci_read_config_byte(d, reg++, &subb);
+               DBG("i450NX PXB %d: %02x/%02x/%02x\n", pxb, busno, suba, subb);
+               if (busno) {
+                       /* Bus A */
+                       pci_scan_bus_with_sysdata(QUADLOCAL2BUS(quad, busno));
+               }
+               if (suba < subb) {
+                       /* Bus B */
+                       pci_scan_bus_with_sysdata(QUADLOCAL2BUS(quad, suba+1));
+               }
+       }
+       pcibios_last_bus = -1;
+}
+DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_82451NX, pci_fixup_i450nx);
+
+/*
+ * pci_numaq_init - set up PCI access and scan root buses on NUMA-Q.
+ *
+ * Does nothing unless found_numaq is set.  Otherwise it installs
+ * pci_direct_conf1_mq as raw_pci_ops, then (only if no scan has been
+ * recorded in pcibios_scanned yet) scans root bus 0 and, when more than
+ * one node is online, local bus 0 of every other online quad.
+ * Always returns 0.
+ */
+int __init pci_numaq_init(void)
+{
+       int quad;
+
+       if (!found_numaq)
+               return 0;
+
+       raw_pci_ops = &pci_direct_conf1_mq;
+
+       /* bail out if a scan was already done; mark it as done otherwise */
+       if (pcibios_scanned++)
+               return 0;
+
+       pci_root_bus = pcibios_scan_root(0);
+       if (pci_root_bus)
+               pci_bus_add_devices(pci_root_bus);
+       if (num_online_nodes() > 1)
+               for_each_online_node(quad) {
+                       /* quad 0 was covered by the root scan above */
+                       if (quad == 0)
+                               continue;
+                       printk("Scanning PCI bus %d for quad %d\n", 
+                               QUADLOCAL2BUS(quad,0), quad);
+                       pci_scan_bus_with_sysdata(QUADLOCAL2BUS(quad, 0));
+               }
+       return 0;
+}
diff --git a/arch/x86/pci/pci.h b/arch/x86/pci/pci.h

index 3e25deb821ac9152feb484db2414565c541f18de..15b9cf6be729c0c7cddee3ff54d1ce9809f32d58 100644 (file)
--- a/arch/x86/pci/pci.h
+++ b/arch/x86/pci/pci.h
@@ -108,7 +108,8 @@ extern void __init dmi_check_skip_isa_align(void);
  /* some common used subsys_initcalls */
  extern int __init pci_acpi_init(void);
  extern int __init pcibios_irq_init(void);
-extern int __init pci_numa_init(void);
+extern int __init pci_visws_init(void);
+extern int __init pci_numaq_init(void);
  extern int __init pcibios_init(void);
  
  /* pci-mmconfig.c */
diff --git a/arch/x86/pci/visws.c b/arch/x86/pci/visws.c

index 1a7bed492bb15647b2f1ce0df3740cd074094781..42f4cb19facab8a47b70c85fcfd819cb086ade3b 100644 (file)
--- a/arch/x86/pci/visws.c
+++ b/arch/x86/pci/visws.c
@@ -86,8 +86,14 @@ void __init pcibios_update_irq(struct pci_dev *dev, int irq)
         pci_write_config_byte(dev, PCI_INTERRUPT_LINE, irq);
  }
  
-static int __init pci_visws_init(void)
+int __init pci_visws_init(void)
  {
+       if (!is_visws_box())
+               return -1;
+
+       pcibios_enable_irq = &pci_visws_enable_irq;
+       pcibios_disable_irq = &pci_visws_disable_irq;
+
         /* The VISWS supports configuration access type 1 only */
         pci_probe = (pci_probe | PCI_PROBE_CONF1) &
                     ~(PCI_PROBE_BIOS | PCI_PROBE_CONF2);
@@ -105,18 +111,3 @@ static int __init pci_visws_init(void)
         pcibios_resource_survey();
         return 0;
  }
-
-static __init int pci_subsys_init(void)
-{
-       if (!is_visws_box())
-               return -1;
-
-       pcibios_enable_irq = &pci_visws_enable_irq;
-       pcibios_disable_irq = &pci_visws_disable_irq;
-
-       pci_visws_init();
-       pcibios_init();
-
-       return 0;
-}
-subsys_initcall(pci_subsys_init);
diff --git a/arch/x86/vdso/Makefile b/arch/x86/vdso/Makefile

index b7ad9f89d21f8898dbf792de05b0dddd21f3a803..4d6ef0a336d6d3499580b32958d59c93639bc647 100644 (file)
--- a/arch/x86/vdso/Makefile
+++ b/arch/x86/vdso/Makefile
@@ -62,7 +62,7 @@ $(obj)/%-syms.lds: $(obj)/%.so.dbg FORCE
  # Build multiple 32-bit vDSO images to choose from at boot time.
  #
  obj-$(VDSO32-y)                        += vdso32-syms.lds
-vdso32.so-$(CONFIG_X86_32)     += int80
+vdso32.so-$(VDSO32-y)          += int80
  vdso32.so-$(CONFIG_COMPAT)     += syscall
  vdso32.so-$(VDSO32-y)          += sysenter
  
diff --git a/arch/x86/vdso/vdso32-setup.c b/arch/x86/vdso/vdso32-setup.c

index 0bce5429a51546de1fab7f0acd29e921d38a9870..513f330c58326b2126e6cfe4c78214eaa77226e8 100644 (file)
--- a/arch/x86/vdso/vdso32-setup.c
+++ b/arch/x86/vdso/vdso32-setup.c
@@ -193,17 +193,12 @@ static __init void relocate_vdso(Elf32_Ehdr *ehdr)
         }
  }
  
-/*
- * These symbols are defined by vdso32.S to mark the bounds
- * of the ELF DSO images included therein.
- */
-extern const char vdso32_default_start, vdso32_default_end;
-extern const char vdso32_sysenter_start, vdso32_sysenter_end;
  static struct page *vdso32_pages[1];
  
  #ifdef CONFIG_X86_64
  
  #define        vdso32_sysenter()       (boot_cpu_has(X86_FEATURE_SYSENTER32))
+#define        vdso32_syscall()        (boot_cpu_has(X86_FEATURE_SYSCALL32))
  
  /* May not be __init: called during resume */
  void syscall32_cpu_init(void)
@@ -226,6 +221,7 @@ static inline void map_compat_vdso(int map)
  #else  /* CONFIG_X86_32 */
  
  #define vdso32_sysenter()      (boot_cpu_has(X86_FEATURE_SEP))
+#define vdso32_syscall()       (0)
  
  void enable_sep_cpu(void)
  {
@@ -296,12 +292,15 @@ int __init sysenter_setup(void)
         gate_vma_init();
  #endif
  
-       if (!vdso32_sysenter()) {
-               vsyscall = &vdso32_default_start;
-               vsyscall_len = &vdso32_default_end - &vdso32_default_start;
-       } else {
+       if (vdso32_syscall()) {
+               vsyscall = &vdso32_syscall_start;
+               vsyscall_len = &vdso32_syscall_end - &vdso32_syscall_start;
+       } else if (vdso32_sysenter()){
                 vsyscall = &vdso32_sysenter_start;
                 vsyscall_len = &vdso32_sysenter_end - &vdso32_sysenter_start;
+       } else {
+               vsyscall = &vdso32_int80_start;
+               vsyscall_len = &vdso32_int80_end - &vdso32_int80_start;
         }
  
         memcpy(syscall_page, vsyscall, vsyscall_len);
diff --git a/arch/x86/vdso/vdso32.S b/arch/x86/vdso/vdso32.S

index 1e36f72cab865e450cc3f17a9fdf53a7da485a14..2ce5f82c333b15c255b621e63d7bc57b7278b361 100644 (file)
--- a/arch/x86/vdso/vdso32.S
+++ b/arch/x86/vdso/vdso32.S
@@ -2,14 +2,17 @@
  
  __INITDATA
  
-       .globl vdso32_default_start, vdso32_default_end
-vdso32_default_start:
-#ifdef CONFIG_X86_32
+       .globl vdso32_int80_start, vdso32_int80_end
+vdso32_int80_start:
         .incbin "arch/x86/vdso/vdso32-int80.so"
-#else
+vdso32_int80_end:
+
+       .globl vdso32_syscall_start, vdso32_syscall_end
+vdso32_syscall_start:
+#ifdef CONFIG_COMPAT
         .incbin "arch/x86/vdso/vdso32-syscall.so"
  #endif
-vdso32_default_end:
+vdso32_syscall_end:
  
         .globl vdso32_sysenter_start, vdso32_sysenter_end
  vdso32_sysenter_start:
diff --git a/arch/x86/vdso/vma.c b/arch/x86/vdso/vma.c

index 19a6cfaf5db9c2ff89e60ff7c8fa3f4485d9a14b..257ba4a10abf0740ee908d2ff1369c77d7d31b50 100644 (file)
--- a/arch/x86/vdso/vma.c
+++ b/arch/x86/vdso/vma.c
@@ -21,7 +21,8 @@ unsigned int __read_mostly vdso_enabled = 1;
  extern char vdso_start[], vdso_end[];
  extern unsigned short vdso_sync_cpuid;
  
-struct page **vdso_pages;
+static struct page **vdso_pages;
+static unsigned vdso_size;
  
  static inline void *var_ref(void *p, char *name)
  {
@@ -38,6 +39,7 @@ static int __init init_vdso_vars(void)
         int i;
         char *vbase;
  
+       vdso_size = npages << PAGE_SHIFT;
         vdso_pages = kmalloc(sizeof(struct page *) * npages, GFP_KERNEL);
         if (!vdso_pages)
                 goto oom;
@@ -101,20 +103,19 @@ int arch_setup_additional_pages(struct linux_binprm *bprm, int exstack)
         struct mm_struct *mm = current->mm;
         unsigned long addr;
         int ret;
-       unsigned len = round_up(vdso_end - vdso_start, PAGE_SIZE);
  
         if (!vdso_enabled)
                 return 0;
  
         down_write(&mm->mmap_sem);
-       addr = vdso_addr(mm->start_stack, len);
-       addr = get_unmapped_area(NULL, addr, len, 0, 0);
+       addr = vdso_addr(mm->start_stack, vdso_size);
+       addr = get_unmapped_area(NULL, addr, vdso_size, 0, 0);
         if (IS_ERR_VALUE(addr)) {
                 ret = addr;
                 goto up_fail;
         }
  
-       ret = install_special_mapping(mm, addr, len,
+       ret = install_special_mapping(mm, addr, vdso_size,
                                       VM_READ|VM_EXEC|
                                       VM_MAYREAD|VM_MAYWRITE|VM_MAYEXEC|
                                       VM_ALWAYSDUMP,
diff --git a/arch/x86/xen/Kconfig b/arch/x86/xen/Kconfig

index c2cc9958087109f9e8e8aa749c0e0b8d693612cd..3815e425f4702f11a1693ba3a43a49e9dabc05bb 100644 (file)
--- a/arch/x86/xen/Kconfig
+++ b/arch/x86/xen/Kconfig
@@ -6,8 +6,8 @@ config XEN
         bool "Xen guest support"
         select PARAVIRT
         select PARAVIRT_CLOCK
-       depends on X86_32
-       depends on X86_CMPXCHG && X86_TSC && X86_PAE && !(X86_VISWS || X86_VOYAGER)
+       depends on X86_64 || (X86_32 && X86_PAE && !(X86_VISWS || X86_VOYAGER))
+       depends on X86_CMPXCHG && X86_TSC
         help
           This is the Linux Xen port.  Enabling this will allow the
           kernel to boot in a paravirtualized environment under the
@@ -15,10 +15,16 @@ config XEN
  
  config XEN_MAX_DOMAIN_MEMORY
         int "Maximum allowed size of a domain in gigabytes"
-       default 8
+       default 8 if X86_32
+       default 32 if X86_64
         depends on XEN
         help
           The pseudo-physical to machine address array is sized
           according to the maximum possible memory size of a Xen
           domain.  This array uses 1 page per gigabyte, so there's no
-         need to be too stingy here.
-\ No newline at end of file
+         need to be too stingy here.
+
+config XEN_SAVE_RESTORE
+       bool
+       depends on PM
+       default y
+\ No newline at end of file
diff --git a/arch/x86/xen/Makefile b/arch/x86/xen/Makefile

index 2ba2d1649131a1b0151911179679b599a98b9272..59c1e539aed28b9d2c8395671be4a459097df7e7 100644 (file)
--- a/arch/x86/xen/Makefile
+++ b/arch/x86/xen/Makefile
@@ -1,4 +1,4 @@
  obj-y          := enlighten.o setup.o multicalls.o mmu.o \
-                       time.o xen-asm.o grant-table.o suspend.o
+                       time.o xen-asm_$(BITS).o grant-table.o suspend.o
  
  obj-$(CONFIG_SMP)      += smp.o
diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c

index bb508456ef523e1fa50f77a2993bf481b06f03f0..194bbd6e32410dd4c00c1aa360bca80b312d41b1 100644 (file)
--- a/arch/x86/xen/enlighten.c
+++ b/arch/x86/xen/enlighten.c
@@ -33,6 +33,7 @@
  #include <xen/interface/sched.h>
  #include <xen/features.h>
  #include <xen/page.h>
+#include <xen/hvc-console.h>
  
  #include <asm/paravirt.h>
  #include <asm/page.h>
@@ -40,12 +41,12 @@
  #include <asm/xen/hypervisor.h>
  #include <asm/fixmap.h>
  #include <asm/processor.h>
+#include <asm/msr-index.h>
  #include <asm/setup.h>
  #include <asm/desc.h>
  #include <asm/pgtable.h>
  #include <asm/tlbflush.h>
  #include <asm/reboot.h>
-#include <asm/pgalloc.h>
  
  #include "xen-ops.h"
  #include "mmu.h"
@@ -56,6 +57,18 @@ EXPORT_SYMBOL_GPL(hypercall_page);
  DEFINE_PER_CPU(struct vcpu_info *, xen_vcpu);
  DEFINE_PER_CPU(struct vcpu_info, xen_vcpu_info);
  
+/*
+ * Identity map, in addition to plain kernel map.  This needs to be
+ * large enough to allocate page table pages to allocate the rest.
+ * Each page can map 2MB.
+ */
+static pte_t level1_ident_pgt[PTRS_PER_PTE * 4] __page_aligned_bss;
+
+#ifdef CONFIG_X86_64
+/* l3 pud for userspace vsyscall mapping */
+static pud_t level3_user_vsyscall[PTRS_PER_PUD] __page_aligned_bss;
+#endif /* CONFIG_X86_64 */
+
  /*
   * Note about cr3 (pagetable base) values:
   *
@@ -167,10 +180,14 @@ void xen_vcpu_restore(void)
  
  static void __init xen_banner(void)
  {
+       unsigned version = HYPERVISOR_xen_version(XENVER_version, NULL);
+       struct xen_extraversion extra;
+       HYPERVISOR_xen_version(XENVER_extraversion, &extra);
+
         printk(KERN_INFO "Booting paravirtualized kernel on %s\n",
                pv_info.name);
-       printk(KERN_INFO "Hypervisor signature: %s%s\n",
-              xen_start_info->magic,
+       printk(KERN_INFO "Xen version: %d.%d%s%s\n",
+              version >> 16, version & 0xffff, extra.extraversion,
                xen_feature(XENFEAT_mmu_pt_update_preserve_ad) ? " (preserve-AD)" : "");
  }
  
@@ -363,14 +380,6 @@ static void load_TLS_descriptor(struct thread_struct *t,
  
  static void xen_load_tls(struct thread_struct *t, unsigned int cpu)
  {
-       xen_mc_batch();
-
-       load_TLS_descriptor(t, cpu, 0);
-       load_TLS_descriptor(t, cpu, 1);
-       load_TLS_descriptor(t, cpu, 2);
-
-       xen_mc_issue(PARAVIRT_LAZY_CPU);
-
         /*
          * XXX sleazy hack: If we're being called in a lazy-cpu zone,
          * it means we're in a context switch, and %gs has just been
@@ -379,10 +388,39 @@ static void xen_load_tls(struct thread_struct *t, unsigned int cpu)
          * Either way, it has been saved, and the new value will get
          * loaded properly.  This will go away as soon as Xen has been
          * modified to not save/restore %gs for normal hypercalls.
+        *
+        * On x86_64, this hack is not used for %gs, because gs points
+        * to KERNEL_GS_BASE (and uses it for PDA references), so we
+        * must not zero %gs on x86_64
+        *
+        * For x86_64, we need to zero %fs, otherwise we may get an
+        * exception between the new %fs descriptor being loaded and
+        * %fs being effectively cleared at __switch_to().
          */
-       if (paravirt_get_lazy_mode() == PARAVIRT_LAZY_CPU)
+       if (paravirt_get_lazy_mode() == PARAVIRT_LAZY_CPU) {
+#ifdef CONFIG_X86_32
                 loadsegment(gs, 0);
+#else
+               loadsegment(fs, 0);
+#endif
+       }
+
+       xen_mc_batch();
+
+       load_TLS_descriptor(t, cpu, 0);
+       load_TLS_descriptor(t, cpu, 1);
+       load_TLS_descriptor(t, cpu, 2);
+
+       xen_mc_issue(PARAVIRT_LAZY_CPU);
+}
+
+#ifdef CONFIG_X86_64
+/*
+ * Ask the hypervisor to set SEGBASE_GS_USER_SEL to @idx.  A non-zero
+ * return from the hypercall is treated as fatal (BUG()).
+ */
+static void xen_load_gs_index(unsigned int idx)
+{
+       if (HYPERVISOR_set_segment_base(SEGBASE_GS_USER_SEL, idx))
+               BUG();
+}
+#endif
  
  static void xen_write_ldt_entry(struct desc_struct *dt, int entrynum,
                                 const void *ptr)
@@ -400,23 +438,18 @@ static void xen_write_ldt_entry(struct desc_struct *dt, int entrynum,
         preempt_enable();
  }
  
-static int cvt_gate_to_trap(int vector, u32 low, u32 high,
+static int cvt_gate_to_trap(int vector, const gate_desc *val,
                             struct trap_info *info)
  {
-       u8 type, dpl;
-
-       type = (high >> 8) & 0x1f;
-       dpl = (high >> 13) & 3;
-
-       if (type != 0xf && type != 0xe)
+       if (val->type != 0xf && val->type != 0xe)
                 return 0;
  
         info->vector = vector;
-       info->address = (high & 0xffff0000) | (low & 0x0000ffff);
-       info->cs = low >> 16;
-       info->flags = dpl;
+       info->address = gate_offset(*val);
+       info->cs = gate_segment(*val);
+       info->flags = val->dpl;
         /* interrupt gates clear IF */
-       if (type == 0xe)
+       if (val->type == 0xe)
                 info->flags |= 4;
  
         return 1;
@@ -443,11 +476,10 @@ static void xen_write_idt_entry(gate_desc *dt, int entrynum, const gate_desc *g)
  
         if (p >= start && (p + 8) <= end) {
                 struct trap_info info[2];
-               u32 *desc = (u32 *)g;
  
                 info[1].address = 0;
  
-               if (cvt_gate_to_trap(entrynum, desc[0], desc[1], &info[0]))
+               if (cvt_gate_to_trap(entrynum, g, &info[0]))
                         if (HYPERVISOR_set_trap_table(info))
                                 BUG();
         }
@@ -460,13 +492,13 @@ static void xen_convert_trap_info(const struct desc_ptr *desc,
  {
         unsigned in, out, count;
  
-       count = (desc->size+1) / 8;
+       count = (desc->size+1) / sizeof(gate_desc);
         BUG_ON(count > 256);
  
         for (in = out = 0; in < count; in++) {
-               const u32 *entry = (u32 *)(desc->address + in * 8);
+               gate_desc *entry = (gate_desc*)(desc->address) + in;
  
-               if (cvt_gate_to_trap(in, entry[0], entry[1], &traps[out]))
+               if (cvt_gate_to_trap(in, entry, &traps[out]))
                         out++;
         }
         traps[out].address = 0;
@@ -695,33 +727,89 @@ static void set_current_cr3(void *v)
         x86_write_percpu(xen_current_cr3, (unsigned long)v);
  }
  
-static void xen_write_cr3(unsigned long cr3)
+static void __xen_write_cr3(bool kernel, unsigned long cr3)
  {
         struct mmuext_op *op;
         struct multicall_space mcs;
-       unsigned long mfn = pfn_to_mfn(PFN_DOWN(cr3));
+       unsigned long mfn;
  
-       BUG_ON(preemptible());
+       if (cr3)
+               mfn = pfn_to_mfn(PFN_DOWN(cr3));
+       else
+               mfn = 0;
  
-       mcs = xen_mc_entry(sizeof(*op));  /* disables interrupts */
+       WARN_ON(mfn == 0 && kernel);
  
-       /* Update while interrupts are disabled, so its atomic with
-          respect to ipis */
-       x86_write_percpu(xen_cr3, cr3);
+       mcs = __xen_mc_entry(sizeof(*op));
  
         op = mcs.args;
-       op->cmd = MMUEXT_NEW_BASEPTR;
+       op->cmd = kernel ? MMUEXT_NEW_BASEPTR : MMUEXT_NEW_USER_BASEPTR;
         op->arg1.mfn = mfn;
  
         MULTI_mmuext_op(mcs.mc, op, 1, NULL, DOMID_SELF);
  
-       /* Update xen_update_cr3 once the batch has actually
-          been submitted. */
-       xen_mc_callback(set_current_cr3, (void *)cr3);
+       if (kernel) {
+               x86_write_percpu(xen_cr3, cr3);
+
+               /* Update xen_current_cr3 once the batch has actually
+                  been submitted. */
+               xen_mc_callback(set_current_cr3, (void *)cr3);
+       }
+}
+
+static void xen_write_cr3(unsigned long cr3)
+{
+       BUG_ON(preemptible());
+
+       xen_mc_batch();  /* disables interrupts */
+
+       /* Update while interrupts are disabled, so it's atomic with
+          respect to ipis */
+       x86_write_percpu(xen_cr3, cr3);
+
+       __xen_write_cr3(true, cr3);
+
+#ifdef CONFIG_X86_64
+       {
+               pgd_t *user_pgd = xen_get_user_pgd(__va(cr3));
+               if (user_pgd)
+                       __xen_write_cr3(false, __pa(user_pgd));
+               else
+                       __xen_write_cr3(false, 0);
+       }
+#endif
  
         xen_mc_issue(PARAVIRT_LAZY_CPU);  /* interrupts restored */
  }
  
+static int xen_write_msr_safe(unsigned int msr, unsigned low, unsigned high)
+{
+       int ret;
+
+       ret = 0;
+
+       switch(msr) {
+#ifdef CONFIG_X86_64
+               unsigned which;
+               u64 base;
+
+       case MSR_FS_BASE:               which = SEGBASE_FS; goto set;
+       case MSR_KERNEL_GS_BASE:        which = SEGBASE_GS_USER; goto set;
+       case MSR_GS_BASE:               which = SEGBASE_GS_KERNEL; goto set;
+
+       set:
+               base = ((u64)high << 32) | low;
+               if (HYPERVISOR_set_segment_base(which, base) != 0)
+                       ret = -EFAULT;
+               break;
+#endif
+       default:
+               ret = native_write_msr_safe(msr, low, high);
+       }
+
+       return ret;
+}
+
  /* Early in boot, while setting up the initial pagetable, assume
     everything is pinned. */
  static __init void xen_alloc_pte_init(struct mm_struct *mm, u32 pfn)
@@ -778,6 +866,48 @@ static void xen_alloc_pmd(struct mm_struct *mm, u32 pfn)
         xen_alloc_ptpage(mm, pfn, PT_PMD);
  }
  
+static int xen_pgd_alloc(struct mm_struct *mm)
+{
+       pgd_t *pgd = mm->pgd;
+       int ret = 0;
+
+       BUG_ON(PagePinned(virt_to_page(pgd)));
+
+#ifdef CONFIG_X86_64
+       {
+               struct page *page = virt_to_page(pgd);
+               pgd_t *user_pgd;
+
+               BUG_ON(page->private != 0);
+
+               ret = -ENOMEM;
+
+               user_pgd = (pgd_t *)__get_free_page(GFP_KERNEL | __GFP_ZERO);
+               page->private = (unsigned long)user_pgd;
+
+               if (user_pgd != NULL) {
+                       user_pgd[pgd_index(VSYSCALL_START)] =
+                               __pgd(__pa(level3_user_vsyscall) | _PAGE_TABLE);
+                       ret = 0;
+               }
+
+               BUG_ON(PagePinned(virt_to_page(xen_get_user_pgd(pgd))));
+       }
+#endif
+
+       return ret;
+}
+
+static void xen_pgd_free(struct mm_struct *mm, pgd_t *pgd)
+{
+#ifdef CONFIG_X86_64
+       pgd_t *user_pgd = xen_get_user_pgd(pgd);
+
+       if (user_pgd)
+               free_page((unsigned long)user_pgd);
+#endif
+}
+
  /* This should never happen until we're OK to use struct page */
  static void xen_release_ptpage(u32 pfn, unsigned level)
  {
@@ -803,6 +933,18 @@ static void xen_release_pmd(u32 pfn)
         xen_release_ptpage(pfn, PT_PMD);
  }
  
+#if PAGETABLE_LEVELS == 4
+static void xen_alloc_pud(struct mm_struct *mm, u32 pfn)
+{
+       xen_alloc_ptpage(mm, pfn, PT_PUD);
+}
+
+static void xen_release_pud(u32 pfn)
+{
+       xen_release_ptpage(pfn, PT_PUD);
+}
+#endif
+
  #ifdef CONFIG_HIGHPTE
  static void *xen_kmap_atomic_pte(struct page *page, enum km_type type)
  {
@@ -841,68 +983,16 @@ static __init void xen_set_pte_init(pte_t *ptep, pte_t pte)
  
  static __init void xen_pagetable_setup_start(pgd_t *base)
  {
-       pgd_t *xen_pgd = (pgd_t *)xen_start_info->pt_base;
-       int i;
-
-       /* special set_pte for pagetable initialization */
-       pv_mmu_ops.set_pte = xen_set_pte_init;
-
-       init_mm.pgd = base;
-       /*
-        * copy top-level of Xen-supplied pagetable into place.  This
-        * is a stand-in while we copy the pmd pages.
-        */
-       memcpy(base, xen_pgd, PTRS_PER_PGD * sizeof(pgd_t));
-
-       /*
-        * For PAE, need to allocate new pmds, rather than
-        * share Xen's, since Xen doesn't like pmd's being
-        * shared between address spaces.
-        */
-       for (i = 0; i < PTRS_PER_PGD; i++) {
-               if (pgd_val_ma(xen_pgd[i]) & _PAGE_PRESENT) {
-                       pmd_t *pmd = (pmd_t *)alloc_bootmem_low_pages(PAGE_SIZE);
-
-                       memcpy(pmd, (void *)pgd_page_vaddr(xen_pgd[i]),
-                              PAGE_SIZE);
-
-                       make_lowmem_page_readonly(pmd);
-
-                       set_pgd(&base[i], __pgd(1 + __pa(pmd)));
-               } else
-                       pgd_clear(&base[i]);
-       }
-
-       /* make sure zero_page is mapped RO so we can use it in pagetables */
-       make_lowmem_page_readonly(empty_zero_page);
-       make_lowmem_page_readonly(base);
-       /*
-        * Switch to new pagetable.  This is done before
-        * pagetable_init has done anything so that the new pages
-        * added to the table can be prepared properly for Xen.
-        */
-       xen_write_cr3(__pa(base));
-
-       /* Unpin initial Xen pagetable */
-       pin_pagetable_pfn(MMUEXT_UNPIN_TABLE,
-                         PFN_DOWN(__pa(xen_start_info->pt_base)));
  }
  
  void xen_setup_shared_info(void)
  {
         if (!xen_feature(XENFEAT_auto_translated_physmap)) {
-               unsigned long addr = fix_to_virt(FIX_PARAVIRT_BOOTMAP);
-
-               /*
-                * Create a mapping for the shared info page.
-                * Should be set_fixmap(), but shared_info is a machine
-                * address with no corresponding pseudo-phys address.
-                */
-               set_pte_mfn(addr,
-                           PFN_DOWN(xen_start_info->shared_info),
-                           PAGE_KERNEL);
-
-               HYPERVISOR_shared_info = (struct shared_info *)addr;
+               set_fixmap(FIX_PARAVIRT_BOOTMAP,
+                          xen_start_info->shared_info);
+
+               HYPERVISOR_shared_info =
+                       (struct shared_info *)fix_to_virt(FIX_PARAVIRT_BOOTMAP);
         } else
                 HYPERVISOR_shared_info =
                         (struct shared_info *)__va(xen_start_info->shared_info);
@@ -917,26 +1007,32 @@ void xen_setup_shared_info(void)
  
  static __init void xen_pagetable_setup_done(pgd_t *base)
  {
-       /* This will work as long as patching hasn't happened yet
-          (which it hasn't) */
-       pv_mmu_ops.alloc_pte = xen_alloc_pte;
-       pv_mmu_ops.alloc_pmd = xen_alloc_pmd;
-       pv_mmu_ops.release_pte = xen_release_pte;
-       pv_mmu_ops.release_pmd = xen_release_pmd;
-       pv_mmu_ops.set_pte = xen_set_pte;
-
         xen_setup_shared_info();
-
-       /* Actually pin the pagetable down, but we can't set PG_pinned
-          yet because the page structures don't exist yet. */
-       pin_pagetable_pfn(MMUEXT_PIN_L3_TABLE, PFN_DOWN(__pa(base)));
  }
  
  static __init void xen_post_allocator_init(void)
  {
+       pv_mmu_ops.set_pte = xen_set_pte;
         pv_mmu_ops.set_pmd = xen_set_pmd;
         pv_mmu_ops.set_pud = xen_set_pud;
+#if PAGETABLE_LEVELS == 4
+       pv_mmu_ops.set_pgd = xen_set_pgd;
+#endif
+
+       /* This will work as long as patching hasn't happened yet
+          (which it hasn't) */
+       pv_mmu_ops.alloc_pte = xen_alloc_pte;
+       pv_mmu_ops.alloc_pmd = xen_alloc_pmd;
+       pv_mmu_ops.release_pte = xen_release_pte;
+       pv_mmu_ops.release_pmd = xen_release_pmd;
+#if PAGETABLE_LEVELS == 4
+       pv_mmu_ops.alloc_pud = xen_alloc_pud;
+       pv_mmu_ops.release_pud = xen_release_pud;
+#endif
  
+#ifdef CONFIG_X86_64
+       SetPagePinned(virt_to_page(level3_user_vsyscall));
+#endif
         xen_mark_init_mm_pinned();
  }
  
@@ -950,6 +1046,7 @@ void xen_setup_vcpu_info_placement(void)
  
         /* xen_vcpu_setup managed to place the vcpu_info within the
            percpu area for all cpus, so make use of it */
+#ifdef CONFIG_X86_32
         if (have_vcpu_info_placement) {
                 printk(KERN_INFO "Xen: using vcpu_info placement\n");
  
@@ -959,6 +1056,7 @@ void xen_setup_vcpu_info_placement(void)
                 pv_irq_ops.irq_enable = xen_irq_enable_direct;
                 pv_mmu_ops.read_cr2 = xen_read_cr2_direct;
         }
+#endif
  }
  
  static unsigned xen_patch(u8 type, u16 clobbers, void *insnbuf,
@@ -979,10 +1077,12 @@ static unsigned xen_patch(u8 type, u16 clobbers, void *insnbuf,
         goto patch_site
  
         switch (type) {
+#ifdef CONFIG_X86_32
                 SITE(pv_irq_ops, irq_enable);
                 SITE(pv_irq_ops, irq_disable);
                 SITE(pv_irq_ops, save_fl);
                 SITE(pv_irq_ops, restore_fl);
+#endif /* CONFIG_X86_32 */
  #undef SITE
  
         patch_site:
@@ -1025,8 +1125,15 @@ static void xen_set_fixmap(unsigned idx, unsigned long phys, pgprot_t prot)
  #ifdef CONFIG_X86_F00F_BUG
         case FIX_F00F_IDT:
  #endif
+#ifdef CONFIG_X86_32
         case FIX_WP_TEST:
         case FIX_VDSO:
+# ifdef CONFIG_HIGHMEM
+       case FIX_KMAP_BEGIN ... FIX_KMAP_END:
+# endif
+#else
+       case VSYSCALL_LAST_PAGE ... VSYSCALL_FIRST_PAGE:
+#endif
  #ifdef CONFIG_X86_LOCAL_APIC
         case FIX_APIC_BASE:     /* maps dummy local APIC */
  #endif
@@ -1039,6 +1146,15 @@ static void xen_set_fixmap(unsigned idx, unsigned long phys, pgprot_t prot)
         }
  
         __native_set_fixmap(idx, pte);
+
+#ifdef CONFIG_X86_64
+       /* Replicate changes to map the vsyscall page into the user
+          pagetable vsyscall mapping. */
+       if (idx >= VSYSCALL_LAST_PAGE && idx <= VSYSCALL_FIRST_PAGE) {
+               unsigned long vaddr = __fix_to_virt(idx);
+               set_pte_vaddr_pud(level3_user_vsyscall, vaddr, pte);
+       }
+#endif
  }
  
  static const struct pv_info xen_info __initdata = {
@@ -1084,18 +1200,25 @@ static const struct pv_cpu_ops xen_cpu_ops __initdata = {
         .wbinvd = native_wbinvd,
  
         .read_msr = native_read_msr_safe,
-       .write_msr = native_write_msr_safe,
+       .write_msr = xen_write_msr_safe,
         .read_tsc = native_read_tsc,
         .read_pmc = native_read_pmc,
  
         .iret = xen_iret,
         .irq_enable_sysexit = xen_sysexit,
+#ifdef CONFIG_X86_64
+       .usergs_sysret32 = xen_sysret32,
+       .usergs_sysret64 = xen_sysret64,
+#endif
  
         .load_tr_desc = paravirt_nop,
         .set_ldt = xen_set_ldt,
         .load_gdt = xen_load_gdt,
         .load_idt = xen_load_idt,
         .load_tls = xen_load_tls,
+#ifdef CONFIG_X86_64
+       .load_gs_index = xen_load_gs_index,
+#endif
  
         .store_gdt = native_store_gdt,
         .store_idt = native_store_idt,
@@ -1109,14 +1232,34 @@ static const struct pv_cpu_ops xen_cpu_ops __initdata = {
         .set_iopl_mask = xen_set_iopl_mask,
         .io_delay = xen_io_delay,
  
+       /* Xen takes care of %gs when switching to usermode for us */
+       .swapgs = paravirt_nop,
+
         .lazy_mode = {
                 .enter = paravirt_enter_lazy_cpu,
                 .leave = xen_leave_lazy,
         },
  };
  
+static void __init __xen_init_IRQ(void)
+{
+#ifdef CONFIG_X86_64
+       int i;
+
+       /* Create identity vector->irq map */
+       for(i = 0; i < NR_VECTORS; i++) {
+               int cpu;
+
+               for_each_possible_cpu(cpu)
+                       per_cpu(vector_irq, cpu)[i] = i;
+       }
+#endif /* CONFIG_X86_64 */
+
+       xen_init_IRQ();
+}
+
  static const struct pv_irq_ops xen_irq_ops __initdata = {
-       .init_IRQ = xen_init_IRQ,
+       .init_IRQ = __xen_init_IRQ,
         .save_fl = xen_save_fl,
         .restore_fl = xen_restore_fl,
         .irq_disable = xen_irq_disable,
@@ -1124,14 +1267,13 @@ static const struct pv_irq_ops xen_irq_ops __initdata = {
         .safe_halt = xen_safe_halt,
         .halt = xen_halt,
  #ifdef CONFIG_X86_64
-       .adjust_exception_frame = paravirt_nop,
+       .adjust_exception_frame = xen_adjust_exception_frame,
  #endif
  };
  
  static const struct pv_apic_ops xen_apic_ops __initdata = {
  #ifdef CONFIG_X86_LOCAL_APIC
         .apic_write = xen_apic_write,
-       .apic_write_atomic = xen_apic_write,
         .apic_read = xen_apic_read,
         .setup_boot_clock = paravirt_nop,
         .setup_secondary_clock = paravirt_nop,
@@ -1157,8 +1299,8 @@ static const struct pv_mmu_ops xen_mmu_ops __initdata = {
         .pte_update = paravirt_nop,
         .pte_update_defer = paravirt_nop,
  
-       .pgd_alloc = __paravirt_pgd_alloc,
-       .pgd_free = paravirt_nop,
+       .pgd_alloc = xen_pgd_alloc,
+       .pgd_free = xen_pgd_free,
  
         .alloc_pte = xen_alloc_pte_init,
         .release_pte = xen_release_pte_init,
@@ -1170,7 +1312,11 @@ static const struct pv_mmu_ops xen_mmu_ops __initdata = {
         .kmap_atomic_pte = xen_kmap_atomic_pte,
  #endif
  
-       .set_pte = NULL,        /* see xen_pagetable_setup_* */
+#ifdef CONFIG_X86_64
+       .set_pte = xen_set_pte,
+#else
+       .set_pte = xen_set_pte_init,
+#endif
         .set_pte_at = xen_set_pte_at,
         .set_pmd = xen_set_pmd_hyper,
  
@@ -1184,15 +1330,26 @@ static const struct pv_mmu_ops xen_mmu_ops __initdata = {
         .make_pte = xen_make_pte,
         .make_pgd = xen_make_pgd,
  
+#ifdef CONFIG_X86_PAE
         .set_pte_atomic = xen_set_pte_atomic,
         .set_pte_present = xen_set_pte_at,
-       .set_pud = xen_set_pud_hyper,
         .pte_clear = xen_pte_clear,
         .pmd_clear = xen_pmd_clear,
+#endif /* CONFIG_X86_PAE */
+       .set_pud = xen_set_pud_hyper,
  
         .make_pmd = xen_make_pmd,
         .pmd_val = xen_pmd_val,
  
+#if PAGETABLE_LEVELS == 4
+       .pud_val = xen_pud_val,
+       .make_pud = xen_make_pud,
+       .set_pgd = xen_set_pgd_hyper,
+
+       .alloc_pud = xen_alloc_pte_init,
+       .release_pud = xen_release_pte_init,
+#endif /* PAGETABLE_LEVELS == 4 */
+
         .activate_mm = xen_activate_mm,
         .dup_mmap = xen_dup_mmap,
         .exit_mmap = xen_exit_mmap,
@@ -1205,21 +1362,6 @@ static const struct pv_mmu_ops xen_mmu_ops __initdata = {
         .set_fixmap = xen_set_fixmap,
  };
  
-#ifdef CONFIG_SMP
-static const struct smp_ops xen_smp_ops __initdata = {
-       .smp_prepare_boot_cpu = xen_smp_prepare_boot_cpu,
-       .smp_prepare_cpus = xen_smp_prepare_cpus,
-       .cpu_up = xen_cpu_up,
-       .smp_cpus_done = xen_smp_cpus_done,
-
-       .smp_send_stop = xen_smp_send_stop,
-       .smp_send_reschedule = xen_smp_send_reschedule,
-
-       .send_call_func_ipi = xen_smp_send_call_function_ipi,
-       .send_call_func_single_ipi = xen_smp_send_call_function_single_ipi,
-};
-#endif /* CONFIG_SMP */
-
  static void xen_reboot(int reason)
  {
         struct sched_shutdown r = { .reason = reason };
@@ -1264,6 +1406,7 @@ static const struct machine_ops __initdata xen_machine_ops = {
  
  static void __init xen_reserve_top(void)
  {
+#ifdef CONFIG_X86_32
         unsigned long top = HYPERVISOR_VIRT_START;
         struct xen_platform_parameters pp;
  
@@ -1271,8 +1414,248 @@ static void __init xen_reserve_top(void)
                 top = pp.virt_start;
  
         reserve_top_address(-top + 2 * PAGE_SIZE);
+#endif /* CONFIG_X86_32 */
+}
+
+/*
+ * Like __va(), but returns address in the kernel mapping (which is
+ * all we have until the physical memory mapping has been set up).
+ */
+static void *__ka(phys_addr_t paddr)
+{
+#ifdef CONFIG_X86_64
+       return (void *)(paddr + __START_KERNEL_map);
+#else
+       return __va(paddr);
+#endif
  }
  
+/* Convert a machine address to physical address */
+static unsigned long m2p(phys_addr_t maddr)
+{
+       phys_addr_t paddr;
+
+       maddr &= PTE_MASK;
+       paddr = mfn_to_pfn(maddr >> PAGE_SHIFT) << PAGE_SHIFT;
+
+       return paddr;
+}
+
+/* Convert a machine address to kernel virtual */
+static void *m2v(phys_addr_t maddr)
+{
+       return __ka(m2p(maddr));
+}
+
+#ifdef CONFIG_X86_64
+static void walk(pgd_t *pgd, unsigned long addr)
+{
+       unsigned l4idx = pgd_index(addr);
+       unsigned l3idx = pud_index(addr);
+       unsigned l2idx = pmd_index(addr);
+       unsigned l1idx = pte_index(addr);
+       pgd_t l4;
+       pud_t l3;
+       pmd_t l2;
+       pte_t l1;
+
+       xen_raw_printk("walk %p, %lx -> %d %d %d %d\n",
+                      pgd, addr, l4idx, l3idx, l2idx, l1idx);
+
+       l4 = pgd[l4idx];
+       xen_raw_printk("  l4: %016lx\n", l4.pgd);
+       xen_raw_printk("      %016lx\n", pgd_val(l4));
+
+       l3 = ((pud_t *)(m2v(l4.pgd)))[l3idx];
+       xen_raw_printk("  l3: %016lx\n", l3.pud);
+       xen_raw_printk("      %016lx\n", pud_val(l3));
+
+       l2 = ((pmd_t *)(m2v(l3.pud)))[l2idx];
+       xen_raw_printk("  l2: %016lx\n", l2.pmd);
+       xen_raw_printk("      %016lx\n", pmd_val(l2));
+
+       l1 = ((pte_t *)(m2v(l2.pmd)))[l1idx];
+       xen_raw_printk("  l1: %016lx\n", l1.pte);
+       xen_raw_printk("      %016lx\n", pte_val(l1));
+}
+#endif
+
+static void set_page_prot(void *addr, pgprot_t prot)
+{
+       unsigned long pfn = __pa(addr) >> PAGE_SHIFT;
+       pte_t pte = pfn_pte(pfn, prot);
+
+       xen_raw_printk("addr=%p pfn=%lx mfn=%lx prot=%016llx pte=%016llx\n",
+                      addr, pfn, get_phys_to_machine(pfn),
+                      pgprot_val(prot), pte.pte);
+
+       if (HYPERVISOR_update_va_mapping((unsigned long)addr, pte, 0))
+               BUG();
+}
+
+static __init void xen_map_identity_early(pmd_t *pmd, unsigned long max_pfn)
+{
+       unsigned pmdidx, pteidx;
+       unsigned ident_pte;
+       unsigned long pfn;
+
+       ident_pte = 0;
+       pfn = 0;
+       for(pmdidx = 0; pmdidx < PTRS_PER_PMD && pfn < max_pfn; pmdidx++) {
+               pte_t *pte_page;
+
+               /* Reuse or allocate a page of ptes */
+               if (pmd_present(pmd[pmdidx]))
+                       pte_page = m2v(pmd[pmdidx].pmd);
+               else {
+                       /* Check for free pte pages */
+                       if (ident_pte == ARRAY_SIZE(level1_ident_pgt))
+                               break;
+
+                       pte_page = &level1_ident_pgt[ident_pte];
+                       ident_pte += PTRS_PER_PTE;
+
+                       pmd[pmdidx] = __pmd(__pa(pte_page) | _PAGE_TABLE);
+               }
+
+               /* Install mappings */
+               for(pteidx = 0; pteidx < PTRS_PER_PTE; pteidx++, pfn++) {
+                       pte_t pte;
+
+                       if (pfn > max_pfn_mapped)
+                               max_pfn_mapped = pfn;
+
+                       if (!pte_none(pte_page[pteidx]))
+                               continue;
+
+                       pte = pfn_pte(pfn, PAGE_KERNEL_EXEC);
+                       pte_page[pteidx] = pte;
+               }
+       }
+
+       for(pteidx = 0; pteidx < ident_pte; pteidx += PTRS_PER_PTE)
+               set_page_prot(&level1_ident_pgt[pteidx], PAGE_KERNEL_RO);
+
+       set_page_prot(pmd, PAGE_KERNEL_RO);
+}
+
+#ifdef CONFIG_X86_64
+static void convert_pfn_mfn(void *v)
+{
+       pte_t *pte = v;
+       int i;
+
+       /* All levels are converted the same way, so just treat them
+          as ptes. */
+       for(i = 0; i < PTRS_PER_PTE; i++)
+               pte[i] = xen_make_pte(pte[i].pte);
+}
+
+/*
+ * Set up the initial kernel pagetable.
+ *
+ * We can construct this by grafting the Xen provided pagetable into
+ * head_64.S's preconstructed pagetables.  We copy the Xen L2's into
+ * level2_ident_pgt, level2_kernel_pgt and level2_fixmap_pgt.  This
+ * means that only the kernel has a physical mapping to start with -
+ * but that's enough to get __va working.  We need to fill in the rest
+ * of the physical mapping once some sort of allocator has been set
+ * up.
+ */
+static __init pgd_t *xen_setup_kernel_pagetable(pgd_t *pgd, unsigned long max_pfn)
+{
+       pud_t *l3;
+       pmd_t *l2;
+
+       /* Zap identity mapping */
+       init_level4_pgt[0] = __pgd(0);
+
+       /* Pre-constructed entries are in pfn, so convert to mfn */
+       convert_pfn_mfn(init_level4_pgt);
+       convert_pfn_mfn(level3_ident_pgt);
+       convert_pfn_mfn(level3_kernel_pgt);
+
+       l3 = m2v(pgd[pgd_index(__START_KERNEL_map)].pgd);
+       l2 = m2v(l3[pud_index(__START_KERNEL_map)].pud);
+
+       memcpy(level2_ident_pgt, l2, sizeof(pmd_t) * PTRS_PER_PMD);
+       memcpy(level2_kernel_pgt, l2, sizeof(pmd_t) * PTRS_PER_PMD);
+
+       l3 = m2v(pgd[pgd_index(__START_KERNEL_map + PMD_SIZE)].pgd);
+       l2 = m2v(l3[pud_index(__START_KERNEL_map + PMD_SIZE)].pud);
+       memcpy(level2_fixmap_pgt, l2, sizeof(pmd_t) * PTRS_PER_PMD);
+
+       /* Set up identity map */
+       xen_map_identity_early(level2_ident_pgt, max_pfn);
+
+       /* Make pagetable pieces RO */
+       set_page_prot(init_level4_pgt, PAGE_KERNEL_RO);
+       set_page_prot(level3_ident_pgt, PAGE_KERNEL_RO);
+       set_page_prot(level3_kernel_pgt, PAGE_KERNEL_RO);
+       set_page_prot(level3_user_vsyscall, PAGE_KERNEL_RO);
+       set_page_prot(level2_kernel_pgt, PAGE_KERNEL_RO);
+       set_page_prot(level2_fixmap_pgt, PAGE_KERNEL_RO);
+
+       /* Pin down new L4 */
+       pin_pagetable_pfn(MMUEXT_PIN_L4_TABLE,
+                         PFN_DOWN(__pa_symbol(init_level4_pgt)));
+
+       /* Unpin Xen-provided one */
+       pin_pagetable_pfn(MMUEXT_UNPIN_TABLE, PFN_DOWN(__pa(pgd)));
+
+       /* Switch over */
+       pgd = init_level4_pgt;
+
+       /*
+        * At this stage there can be no user pgd, and no page
+        * structure to attach it to, so make sure we just set kernel
+        * pgd.
+        */
+       xen_mc_batch();
+       __xen_write_cr3(true, __pa(pgd));
+       xen_mc_issue(PARAVIRT_LAZY_CPU);
+
+       reserve_early(__pa(xen_start_info->pt_base),
+                     __pa(xen_start_info->pt_base +
+                          xen_start_info->nr_pt_frames * PAGE_SIZE),
+                     "XEN PAGETABLES");
+
+       return pgd;
+}
+#else  /* !CONFIG_X86_64 */
+static pmd_t level2_kernel_pgt[PTRS_PER_PMD] __page_aligned_bss;
+
+static __init pgd_t *xen_setup_kernel_pagetable(pgd_t *pgd, unsigned long max_pfn)
+{
+       pmd_t *kernel_pmd;
+
+       init_pg_tables_start = __pa(pgd);
+       init_pg_tables_end = __pa(pgd) + xen_start_info->nr_pt_frames*PAGE_SIZE;
+       max_pfn_mapped = PFN_DOWN(init_pg_tables_end + 512*1024);
+
+       kernel_pmd = m2v(pgd[KERNEL_PGD_BOUNDARY].pgd);
+       memcpy(level2_kernel_pgt, kernel_pmd, sizeof(pmd_t) * PTRS_PER_PMD);
+
+       xen_map_identity_early(level2_kernel_pgt, max_pfn);
+
+       memcpy(swapper_pg_dir, pgd, sizeof(pgd_t) * PTRS_PER_PGD);
+       set_pgd(&swapper_pg_dir[KERNEL_PGD_BOUNDARY],
+                       __pgd(__pa(level2_kernel_pgt) | _PAGE_PRESENT));
+
+       set_page_prot(level2_kernel_pgt, PAGE_KERNEL_RO);
+       set_page_prot(swapper_pg_dir, PAGE_KERNEL_RO);
+       set_page_prot(empty_zero_page, PAGE_KERNEL_RO);
+
+       pin_pagetable_pfn(MMUEXT_UNPIN_TABLE, PFN_DOWN(__pa(pgd)));
+
+       xen_write_cr3(__pa(swapper_pg_dir));
+
+       pin_pagetable_pfn(MMUEXT_PIN_L3_TABLE, PFN_DOWN(__pa(swapper_pg_dir)));
+
+       return swapper_pg_dir;
+}
+#endif /* CONFIG_X86_64 */
+
  /* First C function to be called on Xen boot */
  asmlinkage void __init xen_start_kernel(void)
  {
@@ -1301,53 +1684,56 @@ asmlinkage void __init xen_start_kernel(void)
  
         machine_ops = xen_machine_ops;
  
-#ifdef CONFIG_SMP
-       smp_ops = xen_smp_ops;
+#ifdef CONFIG_X86_64
+       /* Disable until direct per-cpu data access. */
+       have_vcpu_info_placement = 0;
+       x86_64_init_pda();
  #endif
  
+       xen_smp_init();
+
         /* Get mfn list */
         if (!xen_feature(XENFEAT_auto_translated_physmap))
                 xen_build_dynamic_phys_to_machine();
  
         pgd = (pgd_t *)xen_start_info->pt_base;
  
-       init_pg_tables_start = __pa(pgd);
-       init_pg_tables_end = __pa(pgd) + xen_start_info->nr_pt_frames*PAGE_SIZE;
-       max_pfn_mapped = (init_pg_tables_end + 512*1024) >> PAGE_SHIFT;
-
-       init_mm.pgd = pgd; /* use the Xen pagetables to start */
-
-       /* keep using Xen gdt for now; no urgent need to change it */
-
-       x86_write_percpu(xen_cr3, __pa(pgd));
-       x86_write_percpu(xen_current_cr3, __pa(pgd));
+       /* Prevent unwanted bits from being set in PTEs. */
+       __supported_pte_mask &= ~_PAGE_GLOBAL;
+       if (!is_initial_xendomain())
+               __supported_pte_mask &= ~(_PAGE_PWT | _PAGE_PCD);
  
         /* Don't do the full vcpu_info placement stuff until we have a
            possible map and a non-dummy shared_info. */
         per_cpu(xen_vcpu, 0) = &HYPERVISOR_shared_info->vcpu_info[0];
  
+       xen_raw_console_write("mapping kernel into physical memory\n");
+       pgd = xen_setup_kernel_pagetable(pgd, xen_start_info->nr_pages);
+
+       init_mm.pgd = pgd;
+
+       /* keep using Xen gdt for now; no urgent need to change it */
+
         pv_info.kernel_rpl = 1;
         if (xen_feature(XENFEAT_supervisor_mode_kernel))
                 pv_info.kernel_rpl = 0;
  
-       /* Prevent unwanted bits from being set in PTEs. */
-       __supported_pte_mask &= ~_PAGE_GLOBAL;
-       if (!is_initial_xendomain())
-               __supported_pte_mask &= ~(_PAGE_PWT | _PAGE_PCD);
-
         /* set the limit of our address space */
         xen_reserve_top();
  
+#ifdef CONFIG_X86_32
         /* set up basic CPUID stuff */
         cpu_detect(&new_cpu_data);
         new_cpu_data.hard_math = 1;
         new_cpu_data.x86_capability[0] = cpuid_edx(1);
+#endif
  
         /* Poke various useful things into boot_params */
         boot_params.hdr.type_of_loader = (9 << 4) | 0;
         boot_params.hdr.ramdisk_image = xen_start_info->mod_start
                 ? __pa(xen_start_info->mod_start) : 0;
         boot_params.hdr.ramdisk_size = xen_start_info->mod_len;
+       boot_params.hdr.cmd_line_ptr = __pa(xen_start_info->cmd_line);
  
         if (!is_initial_xendomain()) {
                 add_preferred_console("xenboot", 0, NULL);
@@ -1355,6 +1741,21 @@ asmlinkage void __init xen_start_kernel(void)
                 add_preferred_console("hvc", 0, NULL);
         }
  
+       xen_raw_console_write("about to get started...\n");
+
+#if 0
+       xen_raw_printk("&boot_params=%p __pa(&boot_params)=%lx __va(__pa(&boot_params))=%lx\n",
+                      &boot_params, __pa_symbol(&boot_params),
+                      __va(__pa_symbol(&boot_params)));
+
+       walk(pgd, &boot_params);
+       walk(pgd, __va(__pa(&boot_params)));
+#endif
+
         /* Start the world */
+#ifdef CONFIG_X86_32
         i386_start_kernel();
+#else
+       x86_64_start_reservations((char *)__pa_symbol(&boot_params));
+#endif
  }
diff --git a/arch/x86/xen/mmu.c b/arch/x86/xen/mmu.c

index ff0aa74afaa1ae01ce1a3bd45a907af1cd04e3ea..a44d56e38bd1725ce891627249cb9cbf314a546d 100644 (file)
--- a/arch/x86/xen/mmu.c
+++ b/arch/x86/xen/mmu.c
@@ -44,8 +44,10 @@
  
  #include <asm/pgtable.h>
  #include <asm/tlbflush.h>
+#include <asm/fixmap.h>
  #include <asm/mmu_context.h>
  #include <asm/paravirt.h>
+#include <asm/linkage.h>
  
  #include <asm/xen/hypercall.h>
  #include <asm/xen/hypervisor.h>
@@ -56,26 +58,29 @@
  #include "multicalls.h"
  #include "mmu.h"
  
+/*
+ * Just beyond the highest usermode address.  STACK_TOP_MAX has a
+ * redzone above it, so round it up to a PGD boundary.
+ */
+#define USER_LIMIT     ((STACK_TOP_MAX + PGDIR_SIZE - 1) & PGDIR_MASK)
+
+
  #define P2M_ENTRIES_PER_PAGE   (PAGE_SIZE / sizeof(unsigned long))
  #define TOP_ENTRIES            (MAX_DOMAIN_PAGES / P2M_ENTRIES_PER_PAGE)
  
  /* Placeholder for holes in the address space */
-static unsigned long p2m_missing[P2M_ENTRIES_PER_PAGE]
-       __attribute__((section(".data.page_aligned"))) =
+static unsigned long p2m_missing[P2M_ENTRIES_PER_PAGE] __page_aligned_data =
                 { [ 0 ... P2M_ENTRIES_PER_PAGE-1 ] = ~0UL };
  
   /* Array of pointers to pages containing p2m entries */
-static unsigned long *p2m_top[TOP_ENTRIES]
-       __attribute__((section(".data.page_aligned"))) =
+static unsigned long *p2m_top[TOP_ENTRIES] __page_aligned_data =
                 { [ 0 ... TOP_ENTRIES - 1] = &p2m_missing[0] };
  
  /* Arrays of p2m arrays expressed in mfns used for save/restore */
-static unsigned long p2m_top_mfn[TOP_ENTRIES]
-       __attribute__((section(".bss.page_aligned")));
+static unsigned long p2m_top_mfn[TOP_ENTRIES] __page_aligned_bss;
  
-static unsigned long p2m_top_mfn_list[
-                       PAGE_ALIGN(TOP_ENTRIES / P2M_ENTRIES_PER_PAGE)]
-       __attribute__((section(".bss.page_aligned")));
+static unsigned long p2m_top_mfn_list[TOP_ENTRIES / P2M_ENTRIES_PER_PAGE]
+       __page_aligned_bss;
  
  static inline unsigned p2m_top_index(unsigned long pfn)
  {
@@ -181,15 +186,16 @@ void set_phys_to_machine(unsigned long pfn, unsigned long mfn)
         p2m_top[topidx][idx] = mfn;
  }
  
-xmaddr_t arbitrary_virt_to_machine(unsigned long address)
+xmaddr_t arbitrary_virt_to_machine(void *vaddr)
  {
+       unsigned long address = (unsigned long)vaddr;
         unsigned int level;
         pte_t *pte = lookup_address(address, &level);
         unsigned offset = address & ~PAGE_MASK;
  
         BUG_ON(pte == NULL);
  
-       return XMADDR((pte_mfn(*pte) << PAGE_SHIFT) + offset);
+       return XMADDR(((phys_addr_t)pte_mfn(*pte) << PAGE_SHIFT) + offset);
  }
  
  void make_lowmem_page_readonly(void *vaddr)
@@ -256,7 +262,8 @@ void xen_set_pmd_hyper(pmd_t *ptr, pmd_t val)
  
         xen_mc_batch();
  
-       u.ptr = virt_to_machine(ptr).maddr;
+       /* ptr may be ioremapped for 64-bit pagetable setup */
+       u.ptr = arbitrary_virt_to_machine(ptr).maddr;
         u.val = pmd_val_ma(val);
         extend_mmu_update(&u);
  
@@ -283,35 +290,7 @@ void xen_set_pmd(pmd_t *ptr, pmd_t val)
   */
  void set_pte_mfn(unsigned long vaddr, unsigned long mfn, pgprot_t flags)
  {
-       pgd_t *pgd;
-       pud_t *pud;
-       pmd_t *pmd;
-       pte_t *pte;
-
-       pgd = swapper_pg_dir + pgd_index(vaddr);
-       if (pgd_none(*pgd)) {
-               BUG();
-               return;
-       }
-       pud = pud_offset(pgd, vaddr);
-       if (pud_none(*pud)) {
-               BUG();
-               return;
-       }
-       pmd = pmd_offset(pud, vaddr);
-       if (pmd_none(*pmd)) {
-               BUG();
-               return;
-       }
-       pte = pte_offset_kernel(pmd, vaddr);
-       /* <mfn,flags> stored as-is, to permit clearing entries */
-       xen_set_pte(pte, mfn_pte(mfn, flags));
-
-       /*
-        * It's enough to flush this one mapping.
-        * (PGE mappings get flushed as well)
-        */
-       __flush_tlb_one(vaddr);
+       set_pte_vaddr(vaddr, mfn_pte(mfn, flags));
  }
  
  void xen_set_pte_at(struct mm_struct *mm, unsigned long addr,
@@ -418,7 +397,8 @@ void xen_set_pud_hyper(pud_t *ptr, pud_t val)
  
         xen_mc_batch();
  
-       u.ptr = virt_to_machine(ptr).maddr;
+       /* ptr may be ioremapped for 64-bit pagetable setup */
+       u.ptr = arbitrary_virt_to_machine(ptr).maddr;
         u.val = pud_val_ma(val);
         extend_mmu_update(&u);
  
@@ -441,14 +421,19 @@ void xen_set_pud(pud_t *ptr, pud_t val)
  
  void xen_set_pte(pte_t *ptep, pte_t pte)
  {
+#ifdef CONFIG_X86_PAE
         ptep->pte_high = pte.pte_high;
         smp_wmb();
         ptep->pte_low = pte.pte_low;
+#else
+       *ptep = pte;
+#endif
  }
  
+#ifdef CONFIG_X86_PAE
  void xen_set_pte_atomic(pte_t *ptep, pte_t pte)
  {
-       set_64bit((u64 *)ptep, pte_val_ma(pte));
+       set_64bit((u64 *)ptep, native_pte_val(pte));
  }
  
  void xen_pte_clear(struct mm_struct *mm, unsigned long addr, pte_t *ptep)
@@ -462,6 +447,7 @@ void xen_pmd_clear(pmd_t *pmdp)
  {
         set_pmd(pmdp, __pmd(0));
  }
+#endif /* CONFIG_X86_PAE */
  
  pmd_t xen_make_pmd(pmdval_t pmd)
  {
@@ -469,78 +455,189 @@ pmd_t xen_make_pmd(pmdval_t pmd)
         return native_make_pmd(pmd);
  }
  
+#if PAGETABLE_LEVELS == 4
+pudval_t xen_pud_val(pud_t pud)
+{
+       return pte_mfn_to_pfn(pud.pud);
+}
+
+pud_t xen_make_pud(pudval_t pud)
+{
+       pud = pte_pfn_to_mfn(pud);
+
+       return native_make_pud(pud);
+}
+
+pgd_t *xen_get_user_pgd(pgd_t *pgd)
+{
+       pgd_t *pgd_page = (pgd_t *)(((unsigned long)pgd) & PAGE_MASK);
+       unsigned offset = pgd - pgd_page;
+       pgd_t *user_ptr = NULL;
+
+       if (offset < pgd_index(USER_LIMIT)) {
+               struct page *page = virt_to_page(pgd_page);
+               user_ptr = (pgd_t *)page->private;
+               if (user_ptr)
+                       user_ptr += offset;
+       }
+
+       return user_ptr;
+}
+
+static void __xen_set_pgd_hyper(pgd_t *ptr, pgd_t val)
+{
+       struct mmu_update u;
+
+       u.ptr = virt_to_machine(ptr).maddr;
+       u.val = pgd_val_ma(val);
+       extend_mmu_update(&u);
+}
+
+/*
+ * Raw hypercall-based set_pgd, intended for use in early boot before
+ * there's a page structure.  This implies:
+ *  1. The only existing pagetable is the kernel's
+ *  2. It is always pinned
+ *  3. It has no user pagetable attached to it
+ */
+void __init xen_set_pgd_hyper(pgd_t *ptr, pgd_t val)
+{
+       preempt_disable();
+
+       xen_mc_batch();
+
+       __xen_set_pgd_hyper(ptr, val);
+
+       xen_mc_issue(PARAVIRT_LAZY_MMU);
+
+       preempt_enable();
+}
+
+void xen_set_pgd(pgd_t *ptr, pgd_t val)
+{
+       pgd_t *user_ptr = xen_get_user_pgd(ptr);
+
+       /* If page is not pinned, we can just update the entry
+          directly */
+       if (!page_pinned(ptr)) {
+               *ptr = val;
+               if (user_ptr) {
+                       WARN_ON(page_pinned(user_ptr));
+                       *user_ptr = val;
+               }
+               return;
+       }
+
+       /* If it's pinned, then we can at least batch the kernel and
+          user updates together. */
+       xen_mc_batch();
+
+       __xen_set_pgd_hyper(ptr, val);
+       if (user_ptr)
+               __xen_set_pgd_hyper(user_ptr, val);
+
+       xen_mc_issue(PARAVIRT_LAZY_MMU);
+}
+#endif /* PAGETABLE_LEVELS == 4 */
+
  /*
-  (Yet another) pagetable walker.  This one is intended for pinning a
-  pagetable.  This means that it walks a pagetable and calls the
-  callback function on each page it finds making up the page table,
-  at every level.  It walks the entire pagetable, but it only bothers
-  pinning pte pages which are below pte_limit.  In the normal case
-  this will be TASK_SIZE, but at boot we need to pin up to
-  FIXADDR_TOP.  But the important bit is that we don't pin beyond
-  there, because then we start getting into Xen's ptes.
-*/
-static int pgd_walk(pgd_t *pgd_base, int (*func)(struct page *, enum pt_level),
+ * (Yet another) pagetable walker.  This one is intended for pinning a
+ * pagetable.  This means that it walks a pagetable and calls the
+ * callback function on each page it finds making up the page table,
+ * at every level.  It walks the entire pagetable, but it only bothers
+ * pinning pte pages which are below limit.  In the normal case this
+ * will be USER_LIMIT, but at boot we need to pin up to
+ * FIXADDR_TOP.
+ *
+ * For 32-bit the important bit is that we don't pin beyond there,
+ * because then we start getting into Xen's ptes.
+ *
+ * For 64-bit, we must skip the Xen hole in the middle of the address
+ * space, just after the big x86-64 virtual hole.
+ */
+static int pgd_walk(pgd_t *pgd, int (*func)(struct page *, enum pt_level),
                     unsigned long limit)
  {
-       pgd_t *pgd = pgd_base;
         int flush = 0;
-       unsigned long addr = 0;
-       unsigned long pgd_next;
+       unsigned hole_low, hole_high;
+       unsigned pgdidx_limit, pudidx_limit, pmdidx_limit;
+       unsigned pgdidx, pudidx, pmdidx;
  
-       BUG_ON(limit > FIXADDR_TOP);
+       /* The limit is the last byte to be touched */
+       limit--;
+       BUG_ON(limit >= FIXADDR_TOP);
  
         if (xen_feature(XENFEAT_auto_translated_physmap))
                 return 0;
  
-       for (; addr != FIXADDR_TOP; pgd++, addr = pgd_next) {
+       /*
+        * 64-bit has a great big hole in the middle of the address
+        * space, which contains the Xen mappings.  On 32-bit the hole
+        * ends up zero-sized, so skipping it is a no-op.
+        */
+       hole_low = pgd_index(USER_LIMIT);
+       hole_high = pgd_index(PAGE_OFFSET);
+
+       pgdidx_limit = pgd_index(limit);
+#if PTRS_PER_PUD > 1
+       pudidx_limit = pud_index(limit);
+#else
+       pudidx_limit = 0;
+#endif
+#if PTRS_PER_PMD > 1
+       pmdidx_limit = pmd_index(limit);
+#else
+       pmdidx_limit = 0;
+#endif
+
+       flush |= (*func)(virt_to_page(pgd), PT_PGD);
+
+       for (pgdidx = 0; pgdidx <= pgdidx_limit; pgdidx++) {
                 pud_t *pud;
-               unsigned long pud_limit, pud_next;
  
-               pgd_next = pud_limit = pgd_addr_end(addr, FIXADDR_TOP);
+               if (pgdidx >= hole_low && pgdidx < hole_high)
+                       continue;
  
-               if (!pgd_val(*pgd))
+               if (!pgd_val(pgd[pgdidx]))
                         continue;
  
-               pud = pud_offset(pgd, 0);
+               pud = pud_offset(&pgd[pgdidx], 0);
  
                 if (PTRS_PER_PUD > 1) /* not folded */
                         flush |= (*func)(virt_to_page(pud), PT_PUD);
  
-               for (; addr != pud_limit; pud++, addr = pud_next) {
+               for (pudidx = 0; pudidx < PTRS_PER_PUD; pudidx++) {
                         pmd_t *pmd;
-                       unsigned long pmd_limit;
  
-                       pud_next = pud_addr_end(addr, pud_limit);
-
-                       if (pud_next < limit)
-                               pmd_limit = pud_next;
-                       else
-                               pmd_limit = limit;
+                       if (pgdidx == pgdidx_limit &&
+                           pudidx > pudidx_limit)
+                               goto out;
  
-                       if (pud_none(*pud))
+                       if (pud_none(pud[pudidx]))
                                 continue;
  
-                       pmd = pmd_offset(pud, 0);
+                       pmd = pmd_offset(&pud[pudidx], 0);
  
                         if (PTRS_PER_PMD > 1) /* not folded */
                                 flush |= (*func)(virt_to_page(pmd), PT_PMD);
  
-                       for (; addr != pmd_limit; pmd++) {
-                               addr += (PAGE_SIZE * PTRS_PER_PTE);
-                               if ((pmd_limit-1) < (addr-1)) {
-                                       addr = pmd_limit;
-                                       break;
-                               }
+                       for (pmdidx = 0; pmdidx < PTRS_PER_PMD; pmdidx++) {
+                               struct page *pte;
+
+                               if (pgdidx == pgdidx_limit &&
+                                   pudidx == pudidx_limit &&
+                                   pmdidx > pmdidx_limit)
+                                       goto out;
  
-                               if (pmd_none(*pmd))
+                               if (pmd_none(pmd[pmdidx]))
                                         continue;
  
-                               flush |= (*func)(pmd_page(*pmd), PT_PTE);
+                               pte = pmd_page(pmd[pmdidx]);
+                               flush |= (*func)(pte, PT_PTE);
                         }
                 }
         }
-
-       flush |= (*func)(virt_to_page(pgd_base), PT_PGD);
+out:
  
         return flush;
  }
@@ -622,14 +719,31 @@ void xen_pgd_pin(pgd_t *pgd)
  {
         xen_mc_batch();
  
-       if (pgd_walk(pgd, pin_page, TASK_SIZE)) {
+       if (pgd_walk(pgd, pin_page, USER_LIMIT)) {
                 /* re-enable interrupts for kmap_flush_unused */
                 xen_mc_issue(0);
                 kmap_flush_unused();
                 xen_mc_batch();
         }
  
+#ifdef CONFIG_X86_64
+       {
+               pgd_t *user_pgd = xen_get_user_pgd(pgd);
+
+               xen_do_pin(MMUEXT_PIN_L4_TABLE, PFN_DOWN(__pa(pgd)));
+
+               if (user_pgd) {
+                       pin_page(virt_to_page(user_pgd), PT_PGD);
+                       xen_do_pin(MMUEXT_PIN_L4_TABLE, PFN_DOWN(__pa(user_pgd)));
+               }
+       }
+#else /* CONFIG_X86_32 */
+#ifdef CONFIG_X86_PAE
+       /* Need to make sure unshared kernel PMD is pinnable */
+       pin_page(virt_to_page(pgd_page(pgd[pgd_index(TASK_SIZE)])), PT_PMD);
+#endif
         xen_do_pin(MMUEXT_PIN_L3_TABLE, PFN_DOWN(__pa(pgd)));
+#endif /* CONFIG_X86_64 */
         xen_mc_issue(0);
  }
  
@@ -656,9 +770,11 @@ void xen_mm_pin_all(void)
         spin_unlock_irqrestore(&pgd_lock, flags);
  }
  
-/* The init_mm pagetable is really pinned as soon as its created, but
-   that's before we have page structures to store the bits.  So do all
-   the book-keeping now. */
+/*
+ * The init_mm pagetable is really pinned as soon as it's created, but
+ * that's before we have page structures to store the bits.  So do all
+ * the book-keeping now.
+ */
  static __init int mark_pinned(struct page *page, enum pt_level level)
  {
         SetPagePinned(page);
@@ -708,7 +824,23 @@ static void xen_pgd_unpin(pgd_t *pgd)
  
         xen_do_pin(MMUEXT_UNPIN_TABLE, PFN_DOWN(__pa(pgd)));
  
-       pgd_walk(pgd, unpin_page, TASK_SIZE);
+#ifdef CONFIG_X86_64
+       {
+               pgd_t *user_pgd = xen_get_user_pgd(pgd);
+
+               if (user_pgd) {
+                       xen_do_pin(MMUEXT_UNPIN_TABLE, PFN_DOWN(__pa(user_pgd)));
+                       unpin_page(virt_to_page(user_pgd), PT_PGD);
+               }
+       }
+#endif
+
+#ifdef CONFIG_X86_PAE
+       /* Need to make sure unshared kernel PMD is unpinned */
+       unpin_page(virt_to_page(pgd_page(pgd[pgd_index(TASK_SIZE)])), PT_PMD);
+#endif
+
+       pgd_walk(pgd, unpin_page, USER_LIMIT);
  
         xen_mc_issue(0);
  }
@@ -727,7 +859,6 @@ void xen_mm_unpin_all(void)
         list_for_each_entry(page, &pgd_list, lru) {
                 if (PageSavePinned(page)) {
                         BUG_ON(!PagePinned(page));
-                       printk("unpinning pinned %p\n", page_address(page));
                         xen_pgd_unpin((pgd_t *)page_address(page));
                         ClearPageSavePinned(page);
                 }
@@ -757,8 +888,15 @@ void xen_dup_mmap(struct mm_struct *oldmm, struct mm_struct *mm)
  static void drop_other_mm_ref(void *info)
  {
         struct mm_struct *mm = info;
+       struct mm_struct *active_mm;
+
+#ifdef CONFIG_X86_64
+       active_mm = read_pda(active_mm);
+#else
+       active_mm = __get_cpu_var(cpu_tlbstate).active_mm;
+#endif
  
-       if (__get_cpu_var(cpu_tlbstate).active_mm == mm)
+       if (active_mm == mm)
                 leave_mm(smp_processor_id());
  
         /* If this cpu still has a stale cr3 reference, then make sure
diff --git a/arch/x86/xen/mmu.h b/arch/x86/xen/mmu.h

index 297bf9f5b8bc3c5a709ff811078c874ffe9b2bab..0f59bd03f9e31f9ac8316e452b21729b0d4de9cb 100644 (file)
--- a/arch/x86/xen/mmu.h
+++ b/arch/x86/xen/mmu.h
@@ -10,18 +10,6 @@ enum pt_level {
         PT_PTE
  };
  
-/*
- * Page-directory addresses above 4GB do not fit into architectural %cr3.
- * When accessing %cr3, or equivalent field in vcpu_guest_context, guests
- * must use the following accessor macros to pack/unpack valid MFNs.
- *
- * Note that Xen is using the fact that the pagetable base is always
- * page-aligned, and putting the 12 MSB of the address into the 12 LSB
- * of cr3.
- */
-#define xen_pfn_to_cr3(pfn) (((unsigned)(pfn) << 12) | ((unsigned)(pfn) >> 20))
-#define xen_cr3_to_pfn(cr3) (((unsigned)(cr3) >> 12) | ((unsigned)(cr3) << 20))
-
  
  void set_pte_mfn(unsigned long vaddr, unsigned long pfn, pgprot_t flags);
  
@@ -44,13 +32,26 @@ pgd_t xen_make_pgd(pgdval_t);
  void xen_set_pte(pte_t *ptep, pte_t pteval);
  void xen_set_pte_at(struct mm_struct *mm, unsigned long addr,
                     pte_t *ptep, pte_t pteval);
+
+#ifdef CONFIG_X86_PAE
  void xen_set_pte_atomic(pte_t *ptep, pte_t pte);
+void xen_pte_clear(struct mm_struct *mm, unsigned long addr, pte_t *ptep);
+void xen_pmd_clear(pmd_t *pmdp);
+#endif /* CONFIG_X86_PAE */
+
  void xen_set_pmd(pmd_t *pmdp, pmd_t pmdval);
  void xen_set_pud(pud_t *ptr, pud_t val);
  void xen_set_pmd_hyper(pmd_t *pmdp, pmd_t pmdval);
  void xen_set_pud_hyper(pud_t *ptr, pud_t val);
-void xen_pte_clear(struct mm_struct *mm, unsigned long addr, pte_t *ptep);
-void xen_pmd_clear(pmd_t *pmdp);
+
+#if PAGETABLE_LEVELS == 4
+pudval_t xen_pud_val(pud_t pud);
+pud_t xen_make_pud(pudval_t pudval);
+void xen_set_pgd(pgd_t *pgdp, pgd_t pgd);
+void xen_set_pgd_hyper(pgd_t *pgdp, pgd_t pgd);
+#endif
+
+pgd_t *xen_get_user_pgd(pgd_t *pgd);
  
  pte_t xen_ptep_modify_prot_start(struct mm_struct *mm, unsigned long addr, pte_t *ptep);
  void  xen_ptep_modify_prot_commit(struct mm_struct *mm, unsigned long addr,
diff --git a/arch/x86/xen/multicalls.c b/arch/x86/xen/multicalls.c

index 3c63c4da7ed19cbe4ae43234fe6f6068eb638a79..9efd1c6c9776c5f9e8973026dd03b111c6e9e585 100644 (file)
--- a/arch/x86/xen/multicalls.c
+++ b/arch/x86/xen/multicalls.c
@@ -76,6 +76,7 @@ void xen_mc_flush(void)
                 if (ret) {
                         printk(KERN_ERR "%d multicall(s) failed: cpu %d\n",
                                ret, smp_processor_id());
+                       dump_stack();
                         for (i = 0; i < b->mcidx; i++) {
                                 printk("  call %2d/%d: op=%lu arg=[%lx] result=%ld\n",
                                        i+1, b->mcidx,
diff --git a/arch/x86/xen/setup.c b/arch/x86/xen/setup.c

index e0a39595bde3b342c8364a4306f3b8ea61d88def..b6acc3a0af46d0adeb44a99a8b8d78e1ef8c5140 100644 (file)
--- a/arch/x86/xen/setup.c
+++ b/arch/x86/xen/setup.c
@@ -83,30 +83,72 @@ static void xen_idle(void)
  
  /*
   * Set the bit indicating "nosegneg" library variants should be used.
+ * We only need to bother in pure 32-bit mode; compat 32-bit processes
+ * can have un-truncated segments, so wrapping around is allowed.
   */
  static void __init fiddle_vdso(void)
  {
-       extern const char vdso32_default_start;
-       u32 *mask = VDSO32_SYMBOL(&vdso32_default_start, NOTE_MASK);
+#ifdef CONFIG_X86_32
+       u32 *mask;
+       mask = VDSO32_SYMBOL(&vdso32_int80_start, NOTE_MASK);
         *mask |= 1 << VDSO_NOTE_NONEGSEG_BIT;
+       mask = VDSO32_SYMBOL(&vdso32_sysenter_start, NOTE_MASK);
+       *mask |= 1 << VDSO_NOTE_NONEGSEG_BIT;
+#endif
  }
  
-void xen_enable_sysenter(void)
+static __cpuinit int register_callback(unsigned type, const void *func)
  {
-       int cpu = smp_processor_id();
-       extern void xen_sysenter_target(void);
-       /* Mask events on entry, even though they get enabled immediately */
-       static struct callback_register sysenter = {
-               .type = CALLBACKTYPE_sysenter,
-               .address = { __KERNEL_CS, (unsigned long)xen_sysenter_target },
+       struct callback_register callback = {
+               .type = type,
+               .address = XEN_CALLBACK(__KERNEL_CS, func),
                 .flags = CALLBACKF_mask_events,
         };
  
-       if (!boot_cpu_has(X86_FEATURE_SEP) ||
-           HYPERVISOR_callback_op(CALLBACKOP_register, &sysenter) != 0) {
-               clear_cpu_cap(&cpu_data(cpu), X86_FEATURE_SEP);
-               clear_cpu_cap(&boot_cpu_data, X86_FEATURE_SEP);
+       return HYPERVISOR_callback_op(CALLBACKOP_register, &callback);
+}
+
+void __cpuinit xen_enable_sysenter(void)
+{
+       extern void xen_sysenter_target(void);
+       int ret;
+       unsigned sysenter_feature;
+
+#ifdef CONFIG_X86_32
+       sysenter_feature = X86_FEATURE_SEP;
+#else
+       sysenter_feature = X86_FEATURE_SYSENTER32;
+#endif
+
+       if (!boot_cpu_has(sysenter_feature))
+               return;
+
+       ret = register_callback(CALLBACKTYPE_sysenter, xen_sysenter_target);
+       if(ret != 0)
+               setup_clear_cpu_cap(sysenter_feature);
+}
+
+void __cpuinit xen_enable_syscall(void)
+{
+#ifdef CONFIG_X86_64
+       int ret;
+       extern void xen_syscall_target(void);
+       extern void xen_syscall32_target(void);
+
+       ret = register_callback(CALLBACKTYPE_syscall, xen_syscall_target);
+       if (ret != 0) {
+               printk(KERN_ERR "Failed to set syscall callback: %d\n", ret);
+               /* Pretty fatal; 64-bit userspace has no other
+                  mechanism for syscalls. */
         }
+
+       if (boot_cpu_has(X86_FEATURE_SYSCALL32)) {
+               ret = register_callback(CALLBACKTYPE_syscall32,
+                                       xen_syscall32_target);
+               if (ret != 0)
+                       setup_clear_cpu_cap(X86_FEATURE_SYSCALL32);
+       }
+#endif /* CONFIG_X86_64 */
  }
  
  void __init xen_arch_setup(void)
@@ -120,10 +162,12 @@ void __init xen_arch_setup(void)
         if (!xen_feature(XENFEAT_auto_translated_physmap))
                 HYPERVISOR_vm_assist(VMASST_CMD_enable, VMASST_TYPE_pae_extended_cr3);
  
-       HYPERVISOR_set_callbacks(__KERNEL_CS, (unsigned long)xen_hypervisor_callback,
-                                __KERNEL_CS, (unsigned long)xen_failsafe_callback);
+       if (register_callback(CALLBACKTYPE_event, xen_hypervisor_callback) ||
+           register_callback(CALLBACKTYPE_failsafe, xen_failsafe_callback))
+               BUG();
  
         xen_enable_sysenter();
+       xen_enable_syscall();
  
         set_iopl.iopl = 1;
         rc = HYPERVISOR_physdev_op(PHYSDEVOP_set_iopl, &set_iopl);
@@ -143,11 +187,6 @@ void __init xen_arch_setup(void)
  
         pm_idle = xen_idle;
  
-#ifdef CONFIG_SMP
-       /* fill cpus_possible with all available cpus */
-       xen_fill_possible_map();
-#endif
-
         paravirt_disable_iospace();
  
         fiddle_vdso();
diff --git a/arch/x86/xen/smp.c b/arch/x86/xen/smp.c

index 233156f39b7f39f605fdb2c0cbd266da60783e28..f702199312a5b82c5d580f5987d6dc33e4c50e4d 100644 (file)
--- a/arch/x86/xen/smp.c
+++ b/arch/x86/xen/smp.c
@@ -66,13 +66,22 @@ static __cpuinit void cpu_bringup_and_idle(void)
         int cpu = smp_processor_id();
  
         cpu_init();
+       preempt_disable();
+
         xen_enable_sysenter();
+       xen_enable_syscall();
  
-       preempt_disable();
-       per_cpu(cpu_state, cpu) = CPU_ONLINE;
+       cpu = smp_processor_id();
+       smp_store_cpu_info(cpu);
+       cpu_data(cpu).x86_max_cores = 1;
+       set_cpu_sibling_map(cpu);
  
         xen_setup_cpu_clockevents();
  
+       cpu_set(cpu, cpu_online_map);
+       x86_write_percpu(cpu_state, CPU_ONLINE);
+       wmb();
+
         /* We can take interrupts now: we're officially "up". */
         local_irq_enable();
  
@@ -141,56 +150,37 @@ static int xen_smp_intr_init(unsigned int cpu)
         return rc;
  }
  
-void __init xen_fill_possible_map(void)
+static void __init xen_fill_possible_map(void)
  {
         int i, rc;
  
         for (i = 0; i < NR_CPUS; i++) {
                 rc = HYPERVISOR_vcpu_op(VCPUOP_is_up, i, NULL);
-               if (rc >= 0)
+               if (rc >= 0) {
+                       num_processors++;
                         cpu_set(i, cpu_possible_map);
+               }
         }
  }
  
-void __init xen_smp_prepare_boot_cpu(void)
+static void __init xen_smp_prepare_boot_cpu(void)
  {
-       int cpu;
-
         BUG_ON(smp_processor_id() != 0);
         native_smp_prepare_boot_cpu();
  
         /* We've switched to the "real" per-cpu gdt, so make sure the
            old memory can be recycled */
-       make_lowmem_page_readwrite(&per_cpu__gdt_page);
-
-       for_each_possible_cpu(cpu) {
-               cpus_clear(per_cpu(cpu_sibling_map, cpu));
-               /*
-                * cpu_core_map lives in a per cpu area that is cleared
-                * when the per cpu array is allocated.
-                *
-                * cpus_clear(per_cpu(cpu_core_map, cpu));
-                */
-       }
+       make_lowmem_page_readwrite(&per_cpu_var(gdt_page));
  
         xen_setup_vcpu_info_placement();
  }
  
-void __init xen_smp_prepare_cpus(unsigned int max_cpus)
+static void __init xen_smp_prepare_cpus(unsigned int max_cpus)
  {
         unsigned cpu;
  
-       for_each_possible_cpu(cpu) {
-               cpus_clear(per_cpu(cpu_sibling_map, cpu));
-               /*
-                * cpu_core_ map will be zeroed when the per
-                * cpu area is allocated.
-                *
-                * cpus_clear(per_cpu(cpu_core_map, cpu));
-                */
-       }
-
         smp_store_cpu_info(0);
+       cpu_data(0).x86_max_cores = 1;
         set_cpu_sibling_map(0);
  
         if (xen_smp_intr_init(0))
@@ -225,7 +215,7 @@ static __cpuinit int
  cpu_initialize_context(unsigned int cpu, struct task_struct *idle)
  {
         struct vcpu_guest_context *ctxt;
-       struct gdt_page *gdt = &per_cpu(gdt_page, cpu);
+       struct desc_struct *gdt;
  
         if (cpu_test_and_set(cpu, xen_cpu_initialized_map))
                 return 0;
@@ -234,12 +224,15 @@ cpu_initialize_context(unsigned int cpu, struct task_struct *idle)
         if (ctxt == NULL)
                 return -ENOMEM;
  
+       gdt = get_cpu_gdt_table(cpu);
+
         ctxt->flags = VGCF_IN_KERNEL;
         ctxt->user_regs.ds = __USER_DS;
         ctxt->user_regs.es = __USER_DS;
-       ctxt->user_regs.fs = __KERNEL_PERCPU;
-       ctxt->user_regs.gs = 0;
         ctxt->user_regs.ss = __KERNEL_DS;
+#ifdef CONFIG_X86_32
+       ctxt->user_regs.fs = __KERNEL_PERCPU;
+#endif
         ctxt->user_regs.eip = (unsigned long)cpu_bringup_and_idle;
         ctxt->user_regs.eflags = 0x1000; /* IOPL_RING1 */
  
@@ -249,11 +242,11 @@ cpu_initialize_context(unsigned int cpu, struct task_struct *idle)
  
         ctxt->ldt_ents = 0;
  
-       BUG_ON((unsigned long)gdt->gdt & ~PAGE_MASK);
-       make_lowmem_page_readonly(gdt->gdt);
+       BUG_ON((unsigned long)gdt & ~PAGE_MASK);
+       make_lowmem_page_readonly(gdt);
  
-       ctxt->gdt_frames[0] = virt_to_mfn(gdt->gdt);
-       ctxt->gdt_ents      = ARRAY_SIZE(gdt->gdt);
+       ctxt->gdt_frames[0] = virt_to_mfn(gdt);
+       ctxt->gdt_ents      = GDT_ENTRIES;
  
         ctxt->user_regs.cs = __KERNEL_CS;
         ctxt->user_regs.esp = idle->thread.sp0 - sizeof(struct pt_regs);
@@ -261,9 +254,11 @@ cpu_initialize_context(unsigned int cpu, struct task_struct *idle)
         ctxt->kernel_ss = __KERNEL_DS;
         ctxt->kernel_sp = idle->thread.sp0;
  
+#ifdef CONFIG_X86_32
         ctxt->event_callback_cs     = __KERNEL_CS;
-       ctxt->event_callback_eip    = (unsigned long)xen_hypervisor_callback;
         ctxt->failsafe_callback_cs  = __KERNEL_CS;
+#endif
+       ctxt->event_callback_eip    = (unsigned long)xen_hypervisor_callback;
         ctxt->failsafe_callback_eip = (unsigned long)xen_failsafe_callback;
  
         per_cpu(xen_cr3, cpu) = __pa(swapper_pg_dir);
@@ -276,7 +271,7 @@ cpu_initialize_context(unsigned int cpu, struct task_struct *idle)
         return 0;
  }
  
-int __cpuinit xen_cpu_up(unsigned int cpu)
+static int __cpuinit xen_cpu_up(unsigned int cpu)
  {
         struct task_struct *idle = idle_task(cpu);
         int rc;
@@ -287,11 +282,28 @@ int __cpuinit xen_cpu_up(unsigned int cpu)
                 return rc;
  #endif
  
+#ifdef CONFIG_X86_64
+       /* Allocate node local memory for AP pdas */
+       WARN_ON(cpu == 0);
+       if (cpu > 0) {
+               rc = get_local_pda(cpu);
+               if (rc)
+                       return rc;
+       }
+#endif
+
+#ifdef CONFIG_X86_32
         init_gdt(cpu);
         per_cpu(current_task, cpu) = idle;
         irq_ctx_init(cpu);
+#else
+       cpu_pda(cpu)->pcurrent = idle;
+       clear_tsk_thread_flag(idle, TIF_FORK);
+#endif
         xen_setup_timer(cpu);
  
+       per_cpu(cpu_state, cpu) = CPU_UP_PREPARE;
+
         /* make sure interrupts start blocked */
         per_cpu(xen_vcpu, cpu)->evtchn_upcall_mask = 1;
  
@@ -306,20 +318,18 @@ int __cpuinit xen_cpu_up(unsigned int cpu)
         if (rc)
                 return rc;
  
-       smp_store_cpu_info(cpu);
-       set_cpu_sibling_map(cpu);
-       /* This must be done before setting cpu_online_map */
-       wmb();
-
-       cpu_set(cpu, cpu_online_map);
-
         rc = HYPERVISOR_vcpu_op(VCPUOP_up, cpu, NULL);
         BUG_ON(rc);
  
+       while(per_cpu(cpu_state, cpu) != CPU_ONLINE) {
+               HYPERVISOR_sched_op(SCHEDOP_yield, 0);
+               barrier();
+       }
+
         return 0;
  }
  
-void xen_smp_cpus_done(unsigned int max_cpus)
+static void xen_smp_cpus_done(unsigned int max_cpus)
  {
  }
  
@@ -335,12 +345,12 @@ static void stop_self(void *v)
         BUG();
  }
  
-void xen_smp_send_stop(void)
+static void xen_smp_send_stop(void)
  {
         smp_call_function(stop_self, NULL, 0);
  }
  
-void xen_smp_send_reschedule(int cpu)
+static void xen_smp_send_reschedule(int cpu)
  {
         xen_send_IPI_one(cpu, XEN_RESCHEDULE_VECTOR);
  }
@@ -355,7 +365,7 @@ static void xen_send_IPI_mask(cpumask_t mask, enum ipi_vector vector)
                 xen_send_IPI_one(cpu, vector);
  }
  
-void xen_smp_send_call_function_ipi(cpumask_t mask)
+static void xen_smp_send_call_function_ipi(cpumask_t mask)
  {
         int cpu;
  
@@ -370,7 +380,7 @@ void xen_smp_send_call_function_ipi(cpumask_t mask)
         }
  }
  
-void xen_smp_send_call_function_single_ipi(int cpu)
+static void xen_smp_send_call_function_single_ipi(int cpu)
  {
         xen_send_IPI_mask(cpumask_of_cpu(cpu), XEN_CALL_FUNCTION_SINGLE_VECTOR);
  }
@@ -379,7 +389,11 @@ static irqreturn_t xen_call_function_interrupt(int irq, void *dev_id)
  {
         irq_enter();
         generic_smp_call_function_interrupt();
+#ifdef CONFIG_X86_32
         __get_cpu_var(irq_stat).irq_call_count++;
+#else
+       add_pda(irq_call_count, 1);
+#endif
         irq_exit();
  
         return IRQ_HANDLED;
@@ -389,8 +403,31 @@ static irqreturn_t xen_call_function_single_interrupt(int irq, void *dev_id)
  {
         irq_enter();
         generic_smp_call_function_single_interrupt();
+#ifdef CONFIG_X86_32
         __get_cpu_var(irq_stat).irq_call_count++;
+#else
+       add_pda(irq_call_count, 1);
+#endif
         irq_exit();
  
         return IRQ_HANDLED;
  }
+
+static const struct smp_ops xen_smp_ops __initdata = {
+       .smp_prepare_boot_cpu = xen_smp_prepare_boot_cpu,
+       .smp_prepare_cpus = xen_smp_prepare_cpus,
+       .cpu_up = xen_cpu_up,
+       .smp_cpus_done = xen_smp_cpus_done,
+
+       .smp_send_stop = xen_smp_send_stop,
+       .smp_send_reschedule = xen_smp_send_reschedule,
+
+       .send_call_func_ipi = xen_smp_send_call_function_ipi,
+       .send_call_func_single_ipi = xen_smp_send_call_function_single_ipi,
+};
+
+void __init xen_smp_init(void)
+{
+       smp_ops = xen_smp_ops;
+       xen_fill_possible_map();
+}
diff --git a/arch/x86/xen/suspend.c b/arch/x86/xen/suspend.c

index 251669a932d4c440f39cc361374ddf59816d0676..2a234db5949beb39e21e5fa09089c3740d802940 100644 (file)
--- a/arch/x86/xen/suspend.c
+++ b/arch/x86/xen/suspend.c
@@ -38,8 +38,11 @@ void xen_post_suspend(int suspend_cancelled)
                 xen_cpu_initialized_map = cpu_online_map;
  #endif
                 xen_vcpu_restore();
-               xen_timer_resume();
         }
  
  }
  
+void xen_arch_resume(void)
+{
+       /* nothing */
+}
diff --git a/arch/x86/xen/xen-asm.S b/arch/x86/xen/xen-asm.S

deleted file mode 100644 (file)

index 2497a30..0000000
--- a/arch/x86/xen/xen-asm.S
+++ /dev/null
@@ -1,305 +0,0 @@
-/*
-       Asm versions of Xen pv-ops, suitable for either direct use or inlining.
-       The inline versions are the same as the direct-use versions, with the
-       pre- and post-amble chopped off.
-
-       This code is encoded for size rather than absolute efficiency,
-       with a view to being able to inline as much as possible.
-
-       We only bother with direct forms (ie, vcpu in pda) of the operations
-       here; the indirect forms are better handled in C, since they're
-       generally too large to inline anyway.
- */
-
-#include <linux/linkage.h>
-
-#include <asm/asm-offsets.h>
-#include <asm/thread_info.h>
-#include <asm/percpu.h>
-#include <asm/processor-flags.h>
-#include <asm/segment.h>
-
-#include <xen/interface/xen.h>
-
-#define RELOC(x, v)    .globl x##_reloc; x##_reloc=v
-#define ENDPATCH(x)    .globl x##_end; x##_end=.
-
-/* Pseudo-flag used for virtual NMI, which we don't implement yet */
-#define XEN_EFLAGS_NMI 0x80000000
-
-/*
-       Enable events.  This clears the event mask and tests the pending
-       event status with one and operation.  If there are pending
-       events, then enter the hypervisor to get them handled.
- */
-ENTRY(xen_irq_enable_direct)
-       /* Unmask events */
-       movb $0, PER_CPU_VAR(xen_vcpu_info)+XEN_vcpu_info_mask
-
-       /* Preempt here doesn't matter because that will deal with
-          any pending interrupts.  The pending check may end up being
-          run on the wrong CPU, but that doesn't hurt. */
-
-       /* Test for pending */
-       testb $0xff, PER_CPU_VAR(xen_vcpu_info)+XEN_vcpu_info_pending
-       jz 1f
-
-2:     call check_events
-1:
-ENDPATCH(xen_irq_enable_direct)
-       ret
-       ENDPROC(xen_irq_enable_direct)
-       RELOC(xen_irq_enable_direct, 2b+1)
-
-
-/*
-       Disabling events is simply a matter of making the event mask
-       non-zero.
- */
-ENTRY(xen_irq_disable_direct)
-       movb $1, PER_CPU_VAR(xen_vcpu_info)+XEN_vcpu_info_mask
-ENDPATCH(xen_irq_disable_direct)
-       ret
-       ENDPROC(xen_irq_disable_direct)
-       RELOC(xen_irq_disable_direct, 0)
-
-/*
-       (xen_)save_fl is used to get the current interrupt enable status.
-       Callers expect the status to be in X86_EFLAGS_IF, and other bits
-       may be set in the return value.  We take advantage of this by
-       making sure that X86_EFLAGS_IF has the right value (and other bits
-       in that byte are 0), but other bits in the return value are
-       undefined.  We need to toggle the state of the bit, because
-       Xen and x86 use opposite senses (mask vs enable).
- */
-ENTRY(xen_save_fl_direct)
-       testb $0xff, PER_CPU_VAR(xen_vcpu_info)+XEN_vcpu_info_mask
-       setz %ah
-       addb %ah,%ah
-ENDPATCH(xen_save_fl_direct)
-       ret
-       ENDPROC(xen_save_fl_direct)
-       RELOC(xen_save_fl_direct, 0)
-
-
-/*
-       In principle the caller should be passing us a value return
-       from xen_save_fl_direct, but for robustness sake we test only
-       the X86_EFLAGS_IF flag rather than the whole byte. After
-       setting the interrupt mask state, it checks for unmasked
-       pending events and enters the hypervisor to get them delivered
-       if so.
- */
-ENTRY(xen_restore_fl_direct)
-       testb $X86_EFLAGS_IF>>8, %ah
-       setz PER_CPU_VAR(xen_vcpu_info)+XEN_vcpu_info_mask
-       /* Preempt here doesn't matter because that will deal with
-          any pending interrupts.  The pending check may end up being
-          run on the wrong CPU, but that doesn't hurt. */
-
-       /* check for unmasked and pending */
-       cmpw $0x0001, PER_CPU_VAR(xen_vcpu_info)+XEN_vcpu_info_pending
-       jz 1f
-2:     call check_events
-1:
-ENDPATCH(xen_restore_fl_direct)
-       ret
-       ENDPROC(xen_restore_fl_direct)
-       RELOC(xen_restore_fl_direct, 2b+1)
-
-/*
-       We can't use sysexit directly, because we're not running in ring0.
-       But we can easily fake it up using iret.  Assuming xen_sysexit
-       is jumped to with a standard stack frame, we can just strip it
-       back to a standard iret frame and use iret.
- */
-ENTRY(xen_sysexit)
-       movl PT_EAX(%esp), %eax                 /* Shouldn't be necessary? */
-       orl $X86_EFLAGS_IF, PT_EFLAGS(%esp)
-       lea PT_EIP(%esp), %esp
-
-       jmp xen_iret
-ENDPROC(xen_sysexit)
-
-/*
-       This is run where a normal iret would be run, with the same stack setup:
-             8: eflags
-             4: cs
-       esp-> 0: eip
-
-       This attempts to make sure that any pending events are dealt
-       with on return to usermode, but there is a small window in
-       which an event can happen just before entering usermode.  If
-       the nested interrupt ends up setting one of the TIF_WORK_MASK
-       pending work flags, they will not be tested again before
-       returning to usermode. This means that a process can end up
-       with pending work, which will be unprocessed until the process
-       enters and leaves the kernel again, which could be an
-       unbounded amount of time.  This means that a pending signal or
-       reschedule event could be indefinitely delayed.
-
-       The fix is to notice a nested interrupt in the critical
-       window, and if one occurs, then fold the nested interrupt into
-       the current interrupt stack frame, and re-process it
-       iteratively rather than recursively.  This means that it will
-       exit via the normal path, and all pending work will be dealt
-       with appropriately.
-
-       Because the nested interrupt handler needs to deal with the
-       current stack state in whatever form its in, we keep things
-       simple by only using a single register which is pushed/popped
-       on the stack.
- */
-ENTRY(xen_iret)
-       /* test eflags for special cases */
-       testl $(X86_EFLAGS_VM | XEN_EFLAGS_NMI), 8(%esp)
-       jnz hyper_iret
-
-       push %eax
-       ESP_OFFSET=4    # bytes pushed onto stack
-
-       /* Store vcpu_info pointer for easy access.  Do it this
-          way to avoid having to reload %fs */
-#ifdef CONFIG_SMP
-       GET_THREAD_INFO(%eax)
-       movl TI_cpu(%eax),%eax
-       movl __per_cpu_offset(,%eax,4),%eax
-       mov per_cpu__xen_vcpu(%eax),%eax
-#else
-       movl per_cpu__xen_vcpu, %eax
-#endif
-
-       /* check IF state we're restoring */
-       testb $X86_EFLAGS_IF>>8, 8+1+ESP_OFFSET(%esp)
-
-       /* Maybe enable events.  Once this happens we could get a
-          recursive event, so the critical region starts immediately
-          afterwards.  However, if that happens we don't end up
-          resuming the code, so we don't have to be worried about
-          being preempted to another CPU. */
-       setz XEN_vcpu_info_mask(%eax)
-xen_iret_start_crit:
-
-       /* check for unmasked and pending */
-       cmpw $0x0001, XEN_vcpu_info_pending(%eax)
-
-       /* If there's something pending, mask events again so we
-          can jump back into xen_hypervisor_callback */
-       sete XEN_vcpu_info_mask(%eax)
-
-       popl %eax
-
-       /* From this point on the registers are restored and the stack
-          updated, so we don't need to worry about it if we're preempted */
-iret_restore_end:
-
-       /* Jump to hypervisor_callback after fixing up the stack.
-          Events are masked, so jumping out of the critical
-          region is OK. */
-       je xen_hypervisor_callback
-
-1:     iret
-xen_iret_end_crit:
-.section __ex_table,"a"
-       .align 4
-       .long 1b,iret_exc
-.previous
-
-hyper_iret:
-       /* put this out of line since its very rarely used */
-       jmp hypercall_page + __HYPERVISOR_iret * 32
-
-       .globl xen_iret_start_crit, xen_iret_end_crit
-
-/*
-   This is called by xen_hypervisor_callback in entry.S when it sees
-   that the EIP at the time of interrupt was between xen_iret_start_crit
-   and xen_iret_end_crit.  We're passed the EIP in %eax so we can do
-   a more refined determination of what to do.
-
-   The stack format at this point is:
-       ----------------
-        ss             : (ss/esp may be present if we came from usermode)
-        esp            :
-        eflags         }  outer exception info
-        cs             }
-        eip            }
-       ---------------- <- edi (copy dest)
-        eax            :  outer eax if it hasn't been restored
-       ----------------
-        eflags         }  nested exception info
-        cs             }   (no ss/esp because we're nested
-        eip            }    from the same ring)
-        orig_eax       }<- esi (copy src)
-        - - - - - - - -
-        fs             }
-        es             }
-        ds             }  SAVE_ALL state
-        eax            }
-         :             :
-        ebx            }<- esp
-       ----------------
-
-   In order to deliver the nested exception properly, we need to shift
-   everything from the return addr up to the error code so it
-   sits just under the outer exception info.  This means that when we
-   handle the exception, we do it in the context of the outer exception
-   rather than starting a new one.
-
-   The only caveat is that if the outer eax hasn't been
-   restored yet (ie, it's still on stack), we need to insert
-   its value into the SAVE_ALL state before going on, since
-   it's usermode state which we eventually need to restore.
- */
-ENTRY(xen_iret_crit_fixup)
-       /*
-          Paranoia: Make sure we're really coming from kernel space.
-          One could imagine a case where userspace jumps into the
-          critical range address, but just before the CPU delivers a GP,
-          it decides to deliver an interrupt instead.  Unlikely?
-          Definitely.  Easy to avoid?  Yes.  The Intel documents
-          explicitly say that the reported EIP for a bad jump is the
-          jump instruction itself, not the destination, but some virtual
-          environments get this wrong.
-        */
-       movl PT_CS(%esp), %ecx
-       andl $SEGMENT_RPL_MASK, %ecx
-       cmpl $USER_RPL, %ecx
-       je 2f
-
-       lea PT_ORIG_EAX(%esp), %esi
-       lea PT_EFLAGS(%esp), %edi
-
-       /* If eip is before iret_restore_end then stack
-          hasn't been restored yet. */
-       cmp $iret_restore_end, %eax
-       jae 1f
-
-       movl 0+4(%edi),%eax             /* copy EAX (just above top of frame) */
-       movl %eax, PT_EAX(%esp)
-
-       lea ESP_OFFSET(%edi),%edi       /* move dest up over saved regs */
-
-       /* set up the copy */
-1:     std
-       mov $PT_EIP / 4, %ecx           /* saved regs up to orig_eax */
-       rep movsl
-       cld
-
-       lea 4(%edi),%esp                /* point esp to new frame */
-2:     jmp xen_do_upcall
-
-
-/*
-       Force an event check by making a hypercall,
-       but preserve regs before making the call.
- */
-check_events:
-       push %eax
-       push %ecx
-       push %edx
-       call force_evtchn_callback
-       pop %edx
-       pop %ecx
-       pop %eax
-       ret
diff --git a/arch/x86/xen/xen-asm_32.S b/arch/x86/xen/xen-asm_32.S

new file mode 100644 (file)

index 0000000..2497a30
--- /dev/null
+++ b/arch/x86/xen/xen-asm_32.S
@@ -0,0 +1,305 @@
+/*
+       Asm versions of Xen pv-ops, suitable for either direct use or inlining.
+       The inline versions are the same as the direct-use versions, with the
+       pre- and post-amble chopped off.
+
+       This code is encoded for size rather than absolute efficiency,
+       with a view to being able to inline as much as possible.
+
+       We only bother with direct forms (ie, vcpu in pda) of the operations
+       here; the indirect forms are better handled in C, since they're
+       generally too large to inline anyway.
+ */
+
+#include <linux/linkage.h>
+
+#include <asm/asm-offsets.h>
+#include <asm/thread_info.h>
+#include <asm/percpu.h>
+#include <asm/processor-flags.h>
+#include <asm/segment.h>
+
+#include <xen/interface/xen.h>
+
+#define RELOC(x, v)    .globl x##_reloc; x##_reloc=v
+#define ENDPATCH(x)    .globl x##_end; x##_end=.
+
+/* Pseudo-flag used for virtual NMI, which we don't implement yet */
+#define XEN_EFLAGS_NMI 0x80000000
+
+/*
+       Enable events.  This clears the event mask and tests the pending
+       event status with one and operation.  If there are pending
+       events, then enter the hypervisor to get them handled.
+ */
+ENTRY(xen_irq_enable_direct)
+       /* Unmask events */
+       movb $0, PER_CPU_VAR(xen_vcpu_info)+XEN_vcpu_info_mask
+
+       /* Preempt here doesn't matter because that will deal with
+          any pending interrupts.  The pending check may end up being
+          run on the wrong CPU, but that doesn't hurt. */
+
+       /* Test for pending */
+       testb $0xff, PER_CPU_VAR(xen_vcpu_info)+XEN_vcpu_info_pending
+       jz 1f
+
+2:     call check_events
+1:
+ENDPATCH(xen_irq_enable_direct)
+       ret
+       ENDPROC(xen_irq_enable_direct)
+       RELOC(xen_irq_enable_direct, 2b+1)
+
+
+/*
+       Disabling events is simply a matter of making the event mask
+       non-zero.
+ */
+ENTRY(xen_irq_disable_direct)
+       movb $1, PER_CPU_VAR(xen_vcpu_info)+XEN_vcpu_info_mask
+ENDPATCH(xen_irq_disable_direct)
+       ret
+       ENDPROC(xen_irq_disable_direct)
+       RELOC(xen_irq_disable_direct, 0)
+
+/*
+       (xen_)save_fl is used to get the current interrupt enable status.
+       Callers expect the status to be in X86_EFLAGS_IF, and other bits
+       may be set in the return value.  We take advantage of this by
+       making sure that X86_EFLAGS_IF has the right value (and other bits
+       in that byte are 0), but other bits in the return value are
+       undefined.  We need to toggle the state of the bit, because
+       Xen and x86 use opposite senses (mask vs enable).
+ */
+ENTRY(xen_save_fl_direct)
+       testb $0xff, PER_CPU_VAR(xen_vcpu_info)+XEN_vcpu_info_mask
+       setz %ah
+       addb %ah,%ah
+ENDPATCH(xen_save_fl_direct)
+       ret
+       ENDPROC(xen_save_fl_direct)
+       RELOC(xen_save_fl_direct, 0)
+
+
+/*
+       In principle the caller should be passing us a value return
+       from xen_save_fl_direct, but for robustness sake we test only
+       the X86_EFLAGS_IF flag rather than the whole byte. After
+       setting the interrupt mask state, it checks for unmasked
+       pending events and enters the hypervisor to get them delivered
+       if so.
+ */
+ENTRY(xen_restore_fl_direct)
+       testb $X86_EFLAGS_IF>>8, %ah
+       setz PER_CPU_VAR(xen_vcpu_info)+XEN_vcpu_info_mask
+       /* Preempt here doesn't matter because that will deal with
+          any pending interrupts.  The pending check may end up being
+          run on the wrong CPU, but that doesn't hurt. */
+
+       /* check for unmasked and pending */
+       cmpw $0x0001, PER_CPU_VAR(xen_vcpu_info)+XEN_vcpu_info_pending
+       jz 1f
+2:     call check_events
+1:
+ENDPATCH(xen_restore_fl_direct)
+       ret
+       ENDPROC(xen_restore_fl_direct)
+       RELOC(xen_restore_fl_direct, 2b+1)
+
+/*
+       We can't use sysexit directly, because we're not running in ring0.
+       But we can easily fake it up using iret.  Assuming xen_sysexit
+       is jumped to with a standard stack frame, we can just strip it
+       back to a standard iret frame and use iret.
+ */
+ENTRY(xen_sysexit)
+       movl PT_EAX(%esp), %eax                 /* Shouldn't be necessary? */
+       orl $X86_EFLAGS_IF, PT_EFLAGS(%esp)
+       lea PT_EIP(%esp), %esp
+
+       jmp xen_iret
+ENDPROC(xen_sysexit)
+
+/*
+       This is run where a normal iret would be run, with the same stack setup:
+             8: eflags
+             4: cs
+       esp-> 0: eip
+
+       This attempts to make sure that any pending events are dealt
+       with on return to usermode, but there is a small window in
+       which an event can happen just before entering usermode.  If
+       the nested interrupt ends up setting one of the TIF_WORK_MASK
+       pending work flags, they will not be tested again before
+       returning to usermode. This means that a process can end up
+       with pending work, which will be unprocessed until the process
+       enters and leaves the kernel again, which could be an
+       unbounded amount of time.  This means that a pending signal or
+       reschedule event could be indefinitely delayed.
+
+       The fix is to notice a nested interrupt in the critical
+       window, and if one occurs, then fold the nested interrupt into
+       the current interrupt stack frame, and re-process it
+       iteratively rather than recursively.  This means that it will
+       exit via the normal path, and all pending work will be dealt
+       with appropriately.
+
+       Because the nested interrupt handler needs to deal with the
+       current stack state in whatever form its in, we keep things
+       simple by only using a single register which is pushed/popped
+       on the stack.
+ */
+ENTRY(xen_iret)
+       /* test eflags for special cases */
+       testl $(X86_EFLAGS_VM | XEN_EFLAGS_NMI), 8(%esp)
+       jnz hyper_iret
+
+       push %eax
+       ESP_OFFSET=4    # bytes pushed onto stack
+
+       /* Store vcpu_info pointer for easy access.  Do it this
+          way to avoid having to reload %fs */
+#ifdef CONFIG_SMP
+       GET_THREAD_INFO(%eax)
+       movl TI_cpu(%eax),%eax
+       movl __per_cpu_offset(,%eax,4),%eax
+       mov per_cpu__xen_vcpu(%eax),%eax
+#else
+       movl per_cpu__xen_vcpu, %eax
+#endif
+
+       /* check IF state we're restoring */
+       testb $X86_EFLAGS_IF>>8, 8+1+ESP_OFFSET(%esp)
+
+       /* Maybe enable events.  Once this happens we could get a
+          recursive event, so the critical region starts immediately
+          afterwards.  However, if that happens we don't end up
+          resuming the code, so we don't have to be worried about
+          being preempted to another CPU. */
+       setz XEN_vcpu_info_mask(%eax)
+xen_iret_start_crit:
+
+       /* check for unmasked and pending */
+       cmpw $0x0001, XEN_vcpu_info_pending(%eax)
+
+       /* If there's something pending, mask events again so we
+          can jump back into xen_hypervisor_callback */
+       sete XEN_vcpu_info_mask(%eax)
+
+       popl %eax
+
+       /* From this point on the registers are restored and the stack
+          updated, so we don't need to worry about it if we're preempted */
+iret_restore_end:
+
+       /* Jump to hypervisor_callback after fixing up the stack.
+          Events are masked, so jumping out of the critical
+          region is OK. */
+       je xen_hypervisor_callback
+
+1:     iret
+xen_iret_end_crit:
+.section __ex_table,"a"
+       .align 4
+       .long 1b,iret_exc
+.previous
+
+hyper_iret:
+       /* put this out of line since its very rarely used */
+       jmp hypercall_page + __HYPERVISOR_iret * 32
+
+       .globl xen_iret_start_crit, xen_iret_end_crit
+
+/*
+   This is called by xen_hypervisor_callback in entry.S when it sees
+   that the EIP at the time of interrupt was between xen_iret_start_crit
+   and xen_iret_end_crit.  We're passed the EIP in %eax so we can do
+   a more refined determination of what to do.
+
+   The stack format at this point is:
+       ----------------
+        ss             : (ss/esp may be present if we came from usermode)
+        esp            :
+        eflags         }  outer exception info
+        cs             }
+        eip            }
+       ---------------- <- edi (copy dest)
+        eax            :  outer eax if it hasn't been restored
+       ----------------
+        eflags         }  nested exception info
+        cs             }   (no ss/esp because we're nested
+        eip            }    from the same ring)
+        orig_eax       }<- esi (copy src)
+        - - - - - - - -
+        fs             }
+        es             }
+        ds             }  SAVE_ALL state
+        eax            }
+         :             :
+        ebx            }<- esp
+       ----------------
+
+   In order to deliver the nested exception properly, we need to shift
+   everything from the return addr up to the error code so it
+   sits just under the outer exception info.  This means that when we
+   handle the exception, we do it in the context of the outer exception
+   rather than starting a new one.
+
+   The only caveat is that if the outer eax hasn't been
+   restored yet (ie, it's still on stack), we need to insert
+   its value into the SAVE_ALL state before going on, since
+   it's usermode state which we eventually need to restore.
+ */
+ENTRY(xen_iret_crit_fixup)
+       /*
+          Paranoia: Make sure we're really coming from kernel space.
+          One could imagine a case where userspace jumps into the
+          critical range address, but just before the CPU delivers a GP,
+          it decides to deliver an interrupt instead.  Unlikely?
+          Definitely.  Easy to avoid?  Yes.  The Intel documents
+          explicitly say that the reported EIP for a bad jump is the
+          jump instruction itself, not the destination, but some virtual
+          environments get this wrong.
+        */
+       movl PT_CS(%esp), %ecx
+       andl $SEGMENT_RPL_MASK, %ecx
+       cmpl $USER_RPL, %ecx
+       je 2f
+
+       lea PT_ORIG_EAX(%esp), %esi
+       lea PT_EFLAGS(%esp), %edi
+
+       /* If eip is before iret_restore_end then stack
+          hasn't been restored yet. */
+       cmp $iret_restore_end, %eax
+       jae 1f
+
+       movl 0+4(%edi),%eax             /* copy EAX (just above top of frame) */
+       movl %eax, PT_EAX(%esp)
+
+       lea ESP_OFFSET(%edi),%edi       /* move dest up over saved regs */
+
+       /* set up the copy */
+1:     std
+       mov $PT_EIP / 4, %ecx           /* saved regs up to orig_eax */
+       rep movsl
+       cld
+
+       lea 4(%edi),%esp                /* point esp to new frame */
+2:     jmp xen_do_upcall
+
+
+/*
+       Force an event check by making a hypercall,
+       but preserve regs before making the call.
+ */
+check_events:
+       push %eax
+       push %ecx
+       push %edx
+       call force_evtchn_callback
+       pop %edx
+       pop %ecx
+       pop %eax
+       ret
diff --git a/arch/x86/xen/xen-asm_64.S b/arch/x86/xen/xen-asm_64.S

new file mode 100644 (file)

index 0000000..4038cbf
--- /dev/null
+++ b/arch/x86/xen/xen-asm_64.S
@@ -0,0 +1,271 @@
+/*
+       Asm versions of Xen pv-ops, suitable for either direct use or inlining.
+       The inline versions are the same as the direct-use versions, with the
+       pre- and post-amble chopped off.
+
+       This code is encoded for size rather than absolute efficiency,
+       with a view to being able to inline as much as possible.
+
+       We only bother with direct forms (ie, vcpu in pda) of the operations
+       here; the indirect forms are better handled in C, since they're
+       generally too large to inline anyway.
+ */
+
+#include <linux/linkage.h>
+
+#include <asm/asm-offsets.h>
+#include <asm/processor-flags.h>
+#include <asm/errno.h>
+#include <asm/segment.h>
+
+#include <xen/interface/xen.h>
+
+#define RELOC(x, v)    .globl x##_reloc; x##_reloc=v
+#define ENDPATCH(x)    .globl x##_end; x##_end=.
+
+/* Pseudo-flag used for virtual NMI, which we don't implement yet */
+#define XEN_EFLAGS_NMI 0x80000000
+
+#if 0
+#include <asm/percpu.h>
+
+/*
+       Enable events.  This clears the event mask, then tests the
+       pending event status as a separate step.  If there are pending
+       events, then enter the hypervisor to get them handled.
+ */
+ENTRY(xen_irq_enable_direct)
+       /* Unmask events */
+       movb $0, PER_CPU_VAR(xen_vcpu_info, XEN_vcpu_info_mask)
+
+       /* Preempt here doesn't matter because that will deal with
+          any pending interrupts.  The pending check may end up being
+          run on the wrong CPU, but that doesn't hurt. */
+
+       /* Test for pending */
+       testb $0xff, PER_CPU_VAR(xen_vcpu_info, XEN_vcpu_info_pending)
+       jz 1f
+
+2:     call check_events
+1:
+ENDPATCH(xen_irq_enable_direct)
+       ret
+       ENDPROC(xen_irq_enable_direct)
+       RELOC(xen_irq_enable_direct, 2b+1)
+
+/*
+       Disabling events is simply a matter of making the event mask
+       non-zero.
+ */
+ENTRY(xen_irq_disable_direct)
+       movb $1, PER_CPU_VAR(xen_vcpu_info, XEN_vcpu_info_mask)
+ENDPATCH(xen_irq_disable_direct)
+       ret
+       ENDPROC(xen_irq_disable_direct)
+       RELOC(xen_irq_disable_direct, 0) /* no call here; _reloc is just 0 */
+
+/*
+       (xen_)save_fl is used to get the current interrupt enable status.
+       Callers expect the status to be in X86_EFLAGS_IF, and other bits
+       may be set in the return value.  We take advantage of this by
+       making sure that X86_EFLAGS_IF has the right value (and other bits
+       in that byte are 0), but other bits in the return value are
+       undefined.  We need to toggle the state of the bit, because
+       Xen and x86 use opposite senses (mask vs enable).
+ */
+ENTRY(xen_save_fl_direct)
+       testb $0xff, PER_CPU_VAR(xen_vcpu_info, XEN_vcpu_info_mask)
+       setz %ah                /* %ah = 1 when the mask byte is zero, else 0 */
+       addb %ah,%ah            /* double it: 1 -> 2, i.e. X86_EFLAGS_IF>>8 */
+ENDPATCH(xen_save_fl_direct)
+       ret
+       ENDPROC(xen_save_fl_direct)
+       RELOC(xen_save_fl_direct, 0)
+
+/*
+       In principle the caller should be passing us a value returned
+       from xen_save_fl_direct, but for robustness' sake we test only
+       the X86_EFLAGS_IF flag rather than the whole byte. After
+       setting the interrupt mask state, it checks for unmasked
+       pending events and enters the hypervisor to get them delivered
+       if so.
+ */
+ENTRY(xen_restore_fl_direct)
+       testb $X86_EFLAGS_IF>>8, %ah
+       setz PER_CPU_VAR(xen_vcpu_info, XEN_vcpu_info_mask)
+       /* Preempt here doesn't matter because that will deal with
+          any pending interrupts.  The pending check may end up being
+          run on the wrong CPU, but that doesn't hurt. */
+
+       /* check for unmasked and pending */
+       cmpw $0x0001, PER_CPU_VAR(xen_vcpu_info, XEN_vcpu_info_pending)
+       jz 1f
+2:     call check_events
+1:
+ENDPATCH(xen_restore_fl_direct)
+       ret
+       ENDPROC(xen_restore_fl_direct)
+       RELOC(xen_restore_fl_direct, 2b+1)
+
+
+/*
+       Force an event check by making a hypercall,
+       but preserve regs before making the call.
+ */
+check_events:
+       push %rax               /* save rax, rcx, rdx, rsi, rdi and r8-r11 ... */
+       push %rcx
+       push %rdx
+       push %rsi
+       push %rdi
+       push %r8
+       push %r9
+       push %r10
+       push %r11
+       call force_evtchn_callback
+       pop %r11                /* ... and restore them in reverse order */
+       pop %r10
+       pop %r9
+       pop %r8
+       pop %rdi
+       pop %rsi
+       pop %rdx
+       pop %rcx
+       pop %rax
+       ret
+#endif
+
+ENTRY(xen_adjust_exception_frame)
+       mov 8+0(%rsp),%rcx      /* reload %rcx from the slot above the return address */
+       mov 8+8(%rsp),%r11      /* reload %r11 from the slot after that */
+       ret $16                 /* return, also dropping those two slots */
+
+hypercall_iret = hypercall_page + __HYPERVISOR_iret * 32 /* entry index * 32 bytes */
+/*
+       Xen64 iret frame:
+
+       ss
+       rsp
+       rflags
+       cs
+       rip             <-- standard iret frame
+
+       flags
+
+       rcx             }
+       r11             }<-- pushed by hypercall page
+rsp -> rax             }
+ */
+ENTRY(xen_iret)
+       pushq $0                /* the "flags" slot of the frame above: nothing set */
+1:     jmp hypercall_iret
+ENDPATCH(xen_iret)
+RELOC(xen_iret, 1b+1)
+
+/*
+       sysexit is not used for 64-bit processes, so it's
+       only ever used to return to 32-bit compat userspace.
+ */
+ENTRY(xen_sysexit)
+       pushq $__USER32_DS      /* ss */
+       pushq %rcx              /* rsp */
+       pushq $X86_EFLAGS_IF    /* rflags */
+       pushq $__USER32_CS      /* cs */
+       pushq %rdx              /* rip */
+
+       pushq $VGCF_in_syscall  /* Xen iret frame "flags" slot */
+1:     jmp hypercall_iret
+ENDPATCH(xen_sysexit)
+RELOC(xen_sysexit, 1b+1)
+
+ENTRY(xen_sysret64)
+       /* We're already on the usermode stack at this point, but still
+          with the kernel gs, so we can easily switch back */
+       movq %rsp, %gs:pda_oldrsp
+       movq %gs:pda_kernelstack,%rsp
+
+       pushq $__USER_DS        /* ss */
+       pushq %gs:pda_oldrsp    /* rsp: the user stack pointer saved above */
+       pushq %r11              /* rflags */
+       pushq $__USER_CS        /* cs */
+       pushq %rcx              /* rip */
+
+       pushq $VGCF_in_syscall  /* Xen iret frame "flags" slot */
+1:     jmp hypercall_iret
+ENDPATCH(xen_sysret64)
+RELOC(xen_sysret64, 1b+1)
+
+ENTRY(xen_sysret32)
+       /* We're already on the usermode stack at this point, but still
+          with the kernel gs, so we can easily switch back */
+       movq %rsp, %gs:pda_oldrsp
+       movq %gs:pda_kernelstack, %rsp
+
+       pushq $__USER32_DS      /* ss */
+       pushq %gs:pda_oldrsp    /* rsp: the user stack pointer saved above */
+       pushq %r11              /* rflags */
+       pushq $__USER32_CS      /* cs */
+       pushq %rcx              /* rip */
+
+       pushq $VGCF_in_syscall  /* Xen iret frame "flags" slot */
+1:     jmp hypercall_iret
+ENDPATCH(xen_sysret32)
+RELOC(xen_sysret32, 1b+1)
+
+/*
+       Xen handles syscall callbacks much like ordinary exceptions,
+       which means we have:
+        - kernel gs
+        - kernel rsp
+        - an iret-like stack frame on the stack (including rcx and r11):
+               ss
+               rsp
+               rflags
+               cs
+               rip
+               r11
+       rsp->   rcx
+
+       In all the entrypoints, we undo all that to make it look
+       like a CPU-generated syscall/sysenter and jump to the normal
+       entrypoint.
+ */
+
+.macro undo_xen_syscall
+       mov 0*8(%rsp),%rcx      /* slot 0 of the frame: rcx */
+       mov 1*8(%rsp),%r11      /* slot 1: r11 */
+       mov 5*8(%rsp),%rsp      /* slot 5: the saved rsp; switch to it */
+.endm
+
+/* Normal 64-bit system call target */
+ENTRY(xen_syscall_target)
+       undo_xen_syscall        /* reload rcx, r11 and rsp from the Xen frame */
+       jmp system_call_after_swapgs
+ENDPROC(xen_syscall_target)
+
+#ifdef CONFIG_IA32_EMULATION
+
+/* 32-bit compat syscall target */
+ENTRY(xen_syscall32_target)
+       undo_xen_syscall        /* reload rcx, r11 and rsp from the Xen frame */
+       jmp ia32_cstar_target
+ENDPROC(xen_syscall32_target)
+
+/* 32-bit compat sysenter target */
+ENTRY(xen_sysenter_target)
+       undo_xen_syscall        /* reload rcx, r11 and rsp from the Xen frame */
+       jmp ia32_sysenter_target
+ENDPROC(xen_sysenter_target)
+
+#else /* !CONFIG_IA32_EMULATION */
+
+ENTRY(xen_syscall32_target)
+ENTRY(xen_sysenter_target)
+       lea 16(%rsp), %rsp      /* strip %rcx,%r11 */
+       mov $-ENOSYS, %rax      /* no compat entry points in this configuration */
+       pushq $VGCF_in_syscall  /* Xen iret frame "flags" slot */
+       jmp hypercall_iret
+ENDPROC(xen_syscall32_target)
+ENDPROC(xen_sysenter_target)
+
+#endif /* CONFIG_IA32_EMULATION */
diff --git a/arch/x86/xen/xen-head.S b/arch/x86/xen/xen-head.S

index 7c0cf6320a0aa0ed55d56b5c627856dcb579d377..63d49a523ed307f9407e805ca507bf293ad0dfca 100644 (file)
--- a/arch/x86/xen/xen-head.S
+++ b/arch/x86/xen/xen-head.S
@@ -5,15 +5,24 @@
  
  #include <linux/elfnote.h>
  #include <linux/init.h>
+
  #include <asm/boot.h>
+#include <asm/asm.h>
+#include <asm/page.h>
+
  #include <xen/interface/elfnote.h>
  #include <asm/xen/interface.h>
  
         __INIT
  ENTRY(startup_xen)
-       movl %esi,xen_start_info
         cld
-       movl $(init_thread_union+THREAD_SIZE),%esp
+#ifdef CONFIG_X86_32
+       mov %esi,xen_start_info
+       mov $init_thread_union+THREAD_SIZE,%esp
+#else
+       mov %rsi,xen_start_info
+       mov $init_thread_union+THREAD_SIZE,%rsp
+#endif
         jmp xen_start_kernel
  
         __FINIT
@@ -21,21 +30,26 @@ ENTRY(startup_xen)
  .pushsection .text
         .align PAGE_SIZE_asm
  ENTRY(hypercall_page)
-       .skip 0x1000
+       .skip PAGE_SIZE_asm
  .popsection
  
         ELFNOTE(Xen, XEN_ELFNOTE_GUEST_OS,       .asciz "linux")
         ELFNOTE(Xen, XEN_ELFNOTE_GUEST_VERSION,  .asciz "2.6")
         ELFNOTE(Xen, XEN_ELFNOTE_XEN_VERSION,    .asciz "xen-3.0")
-       ELFNOTE(Xen, XEN_ELFNOTE_VIRT_BASE,      .long  __PAGE_OFFSET)
-       ELFNOTE(Xen, XEN_ELFNOTE_ENTRY,          .long  startup_xen)
-       ELFNOTE(Xen, XEN_ELFNOTE_HYPERCALL_PAGE, .long  hypercall_page)
+#ifdef CONFIG_X86_32
+       ELFNOTE(Xen, XEN_ELFNOTE_VIRT_BASE,      _ASM_PTR __PAGE_OFFSET)
+#else
+       ELFNOTE(Xen, XEN_ELFNOTE_VIRT_BASE,      _ASM_PTR __START_KERNEL_map)
+#endif
+       ELFNOTE(Xen, XEN_ELFNOTE_ENTRY,          _ASM_PTR startup_xen)
+       ELFNOTE(Xen, XEN_ELFNOTE_HYPERCALL_PAGE, _ASM_PTR hypercall_page)
         ELFNOTE(Xen, XEN_ELFNOTE_FEATURES,       .asciz "!writable_page_tables|pae_pgdir_above_4gb")
         ELFNOTE(Xen, XEN_ELFNOTE_PAE_MODE,       .asciz "yes")
         ELFNOTE(Xen, XEN_ELFNOTE_LOADER,         .asciz "generic")
         ELFNOTE(Xen, XEN_ELFNOTE_L1_MFN_VALID,
                 .quad _PAGE_PRESENT; .quad _PAGE_PRESENT)
         ELFNOTE(Xen, XEN_ELFNOTE_SUSPEND_CANCEL, .long 1)
-       ELFNOTE(Xen, XEN_ELFNOTE_HV_START_LOW,   .long __HYPERVISOR_VIRT_START)
+       ELFNOTE(Xen, XEN_ELFNOTE_HV_START_LOW,   _ASM_PTR __HYPERVISOR_VIRT_START)
+       ELFNOTE(Xen, XEN_ELFNOTE_PADDR_OFFSET,   _ASM_PTR 0)
  
  #endif /*CONFIG_XEN */
diff --git a/arch/x86/xen/xen-ops.h b/arch/x86/xen/xen-ops.h

index 6f4b1045c1c20768015d1fc7987f9ee3a73b4475..dd3c23152a2e618f733192c86aac1ad04de61ed9 100644 (file)
--- a/arch/x86/xen/xen-ops.h
+++ b/arch/x86/xen/xen-ops.h
@@ -26,6 +26,7 @@ char * __init xen_memory_setup(void);
  void __init xen_arch_setup(void);
  void __init xen_init_IRQ(void);
  void xen_enable_sysenter(void);
+void xen_enable_syscall(void);
  void xen_vcpu_restore(void);
  
  void __init xen_build_dynamic_phys_to_machine(void);
@@ -37,7 +38,6 @@ void __init xen_time_init(void);
  unsigned long xen_get_wallclock(void);
  int xen_set_wallclock(unsigned long time);
  unsigned long long xen_sched_clock(void);
-void xen_timer_resume(void);
  
  irqreturn_t xen_debug_interrupt(int irq, void *dev_id);
  
@@ -45,20 +45,15 @@ bool xen_vcpu_stolen(int vcpu);
  
  void xen_mark_init_mm_pinned(void);
  
-void __init xen_fill_possible_map(void);
-
  void __init xen_setup_vcpu_info_placement(void);
-void xen_smp_prepare_boot_cpu(void);
-void xen_smp_prepare_cpus(unsigned int max_cpus);
-int xen_cpu_up(unsigned int cpu);
-void xen_smp_cpus_done(unsigned int max_cpus);
  
-void xen_smp_send_stop(void);
-void xen_smp_send_reschedule(int cpu);
-void xen_smp_send_call_function_ipi(cpumask_t mask);
-void xen_smp_send_call_function_single_ipi(int cpu);
+#ifdef CONFIG_SMP
+void xen_smp_init(void);
  
  extern cpumask_t xen_cpu_initialized_map;
+#else
+static inline void xen_smp_init(void) {}
+#endif
  
  
  /* Declare an asm function, along with symbols needed to make it
@@ -73,7 +68,11 @@ DECL_ASM(void, xen_irq_disable_direct, void);
  DECL_ASM(unsigned long, xen_save_fl_direct, void);
  DECL_ASM(void, xen_restore_fl_direct, unsigned long);
  
+/* These are not functions, and cannot be called normally */
  void xen_iret(void);
  void xen_sysexit(void);
+void xen_sysret32(void);
+void xen_sysret64(void);
+void xen_adjust_exception_frame(void);
  
  #endif /* XEN_OPS_H */
diff --git a/drivers/net/xen-netfront.c b/drivers/net/xen-netfront.c

index ef671d1a3bf08441719b5f0bda8338648b1d1ee6..902bbe7882158d5c040f8847de1ddb84bad1d0e9 100644 (file)
--- a/drivers/net/xen-netfront.c
+++ b/drivers/net/xen-netfront.c
@@ -92,7 +92,7 @@ struct netfront_info {
          */
         union skb_entry {
                 struct sk_buff *skb;
-               unsigned link;
+               unsigned long link;
         } tx_skbs[NET_TX_RING_SIZE];
         grant_ref_t gref_tx_head;
         grant_ref_t grant_tx_ref[NET_TX_RING_SIZE];
@@ -125,6 +125,17 @@ struct netfront_rx_info {
         struct xen_netif_extra_info extras[XEN_NETIF_EXTRA_TYPE_MAX - 1];
  };
  
+static void skb_entry_set_link(union skb_entry *list, unsigned short id)
+{
+       list->link = id;
+}
+
+static int skb_entry_is_link(const union skb_entry *list)
+{
+       BUILD_BUG_ON(sizeof(list->skb) != sizeof(list->link));
+       return ((unsigned long)list->skb < PAGE_OFFSET);
+}
+
  /*
   * Access macros for acquiring freeing slots in tx_skbs[].
   */
@@ -132,7 +143,7 @@ struct netfront_rx_info {
  static void add_id_to_freelist(unsigned *head, union skb_entry *list,
                                unsigned short id)
  {
-       list[id].link = *head;
+       skb_entry_set_link(&list[id], *head);
         *head = id;
  }
  
@@ -993,7 +1004,7 @@ static void xennet_release_tx_bufs(struct netfront_info *np)
  
         for (i = 0; i < NET_TX_RING_SIZE; i++) {
                 /* Skip over entries which are actually freelist references */
-               if ((unsigned long)np->tx_skbs[i].skb < PAGE_OFFSET)
+               if (skb_entry_is_link(&np->tx_skbs[i]))
                         continue;
  
                 skb = np->tx_skbs[i].skb;
@@ -1123,7 +1134,7 @@ static struct net_device * __devinit xennet_create_dev(struct xenbus_device *dev
         /* Initialise tx_skbs as a free chain containing every entry. */
         np->tx_skb_freelist = 0;
         for (i = 0; i < NET_TX_RING_SIZE; i++) {
-               np->tx_skbs[i].link = i+1;
+               skb_entry_set_link(&np->tx_skbs[i], i+1);
                 np->grant_tx_ref[i] = GRANT_INVALID_REF;
         }
  
diff --git a/drivers/pci/intel-iommu.c b/drivers/pci/intel-iommu.c

index 3f7b81c065d25188e17d82665b63575a91c92e19..8d0e60ac849cb5f34e609fdaf9215688091728a7 100644 (file)
--- a/drivers/pci/intel-iommu.c
+++ b/drivers/pci/intel-iommu.c
@@ -37,7 +37,7 @@
  #include "intel-iommu.h"
  #include <asm/proto.h> /* force_iommu in this header in x86-64*/
  #include <asm/cacheflush.h>
-#include <asm/gart.h>
+#include <asm/iommu.h>
  #include "pci.h"
  
  #define IS_GFX_DEVICE(pdev) ((pdev->class >> 16) == PCI_BASE_CLASS_DISPLAY)
diff --git a/drivers/xen/manage.c b/drivers/xen/manage.c

index 5b546e365f007d028843eea0ee84e77a0def1f58..a5bc91ae6ff69f5d5d4b2e9f49e308acd9126b20 100644 (file)
--- a/drivers/xen/manage.c
+++ b/drivers/xen/manage.c
@@ -63,11 +63,12 @@ static int xen_suspend(void *data)
         gnttab_resume();
         xen_mm_unpin_all();
  
-       device_power_up();
+       device_power_up(PMSG_RESUME);
  
         if (!*cancelled) {
                 xen_irq_resume();
                 xen_console_resume();
+               xen_timer_resume();
         }
  
         return 0;
@@ -107,12 +108,13 @@ static void do_suspend(void)
                 goto out;
         }
  
-       if (!cancelled)
+       if (!cancelled) {
+               xen_arch_resume();
                 xenbus_resume();
-       else
+       } else
                 xenbus_suspend_cancel();
  
-       device_resume();
+       device_resume(PMSG_RESUME);
  
         /* Make sure timer events get retriggered on all CPUs */
         clock_was_set();
diff --git a/include/asm-x86/amd_iommu_types.h b/include/asm-x86/amd_iommu_types.h

index 7bfcb47cc452d5dc27ce2bf34c7cb22acddec7b7..22aa58ca1991c3003565f8fbc0313f3cf37b9d92 100644 (file)
--- a/include/asm-x86/amd_iommu_types.h
+++ b/include/asm-x86/amd_iommu_types.h
@@ -27,13 +27,12 @@
  /*
   * some size calculation constants
   */
-#define DEV_TABLE_ENTRY_SIZE           256
+#define DEV_TABLE_ENTRY_SIZE           32
  #define ALIAS_TABLE_ENTRY_SIZE         2
  #define RLOOKUP_TABLE_ENTRY_SIZE       (sizeof(void *))
  
  /* helper macros */
  #define LOW_U32(x) ((x) & ((1ULL << 32)-1))
-#define HIGH_U32(x) (LOW_U32((x) >> 32))
  
  /* Length of the MMIO region for the AMD IOMMU */
  #define MMIO_REGION_LENGTH       0x4000
@@ -158,78 +157,170 @@
  
  #define MAX_DOMAIN_ID 65536
  
+/*
+ * This structure contains generic data for IOMMU protection domains
+ * independent of their use.
+ */
  struct protection_domain {
-       spinlock_t lock;
-       u16 id;
-       int mode;
-       u64 *pt_root;
-       void *priv;
+       spinlock_t lock; /* mostly used to lock the page table*/
+       u16 id;          /* the domain id written to the device table */
+       int mode;        /* paging mode (0-6 levels) */
+       u64 *pt_root;    /* page table root pointer */
+       void *priv;      /* private data */
  };
  
+/*
+ * Data container for a dma_ops specific protection domain
+ */
  struct dma_ops_domain {
         struct list_head list;
+
+       /* generic protection domain information */
         struct protection_domain domain;
+
+       /* size of the aperture for the mappings */
         unsigned long aperture_size;
+
+       /* address we start to search for free addresses */
         unsigned long next_bit;
+
+       /* address allocation bitmap */
         unsigned long *bitmap;
+
+       /*
+        * Array of PTE pages for the aperture. In this array we save all the
+        * leaf pages of the domain page table used for the aperture. This way
+        * we don't need to walk the page table to find a specific PTE. We can
+        * just calculate its address in constant time.
+        */
         u64 **pte_pages;
  };
  
+/*
+ * Structure where we save information about one hardware AMD IOMMU in the
+ * system.
+ */
  struct amd_iommu {
         struct list_head list;
+
+       /* locks the accesses to the hardware */
         spinlock_t lock;
  
+       /* device id of this IOMMU */
         u16 devid;
+       /*
+        * Capability pointer. There could be more than one IOMMU per PCI
+        * device function if there is more than one AMD IOMMU capability
+        * pointer.
+        */
         u16 cap_ptr;
  
+       /* physical address of MMIO space */
         u64 mmio_phys;
+       /* virtual address of MMIO space */
         u8 *mmio_base;
+
+       /* capabilities of that IOMMU read from ACPI */
         u32 cap;
+
+       /* first device this IOMMU handles. read from PCI */
         u16 first_device;
+       /* last device this IOMMU handles. read from PCI */
         u16 last_device;
+
+       /* start of exclusion range of that IOMMU */
         u64 exclusion_start;
+       /* length of exclusion range of that IOMMU */
         u64 exclusion_length;
  
+       /* command buffer virtual address */
         u8 *cmd_buf;
+       /* size of command buffer */
         u32 cmd_buf_size;
  
+       /* if one, we need to send a completion wait command */
         int need_sync;
  
+       /* default dma_ops domain for that IOMMU */
         struct dma_ops_domain *default_dom;
  };
  
+/*
+ * List with all IOMMUs in the system. This list is not locked because it is
+ * only written and read at driver initialization or suspend time
+ */
  extern struct list_head amd_iommu_list;
  
+/*
+ * Structure defining one entry in the device table
+ */
  struct dev_table_entry {
         u32 data[8];
  };
  
+/*
+ * One entry for unity mappings parsed out of the ACPI table.
+ */
  struct unity_map_entry {
         struct list_head list;
+
+       /* starting device id this entry is used for (including) */
         u16 devid_start;
+       /* end device id this entry is used for (including) */
         u16 devid_end;
+
+       /* start address to unity map (including) */
         u64 address_start;
+       /* end address to unity map (including) */
         u64 address_end;
+
+       /* required protection */
         int prot;
  };
  
+/*
+ * List of all unity mappings. It is not locked because at runtime it is only
+ * read. It is created at ACPI table parsing time.
+ */
  extern struct list_head amd_iommu_unity_map;
  
-/* data structures for device handling */
+/*
+ * Data structures for device handling
+ */
+
+/*
+ * Device table used by hardware. Read and write accesses by software are
+ * locked with the amd_iommu_pd_table lock.
+ */
  extern struct dev_table_entry *amd_iommu_dev_table;
+
+/*
+ * Alias table to find requestor ids to device ids. Not locked because only
+ * read at runtime.
+ */
  extern u16 *amd_iommu_alias_table;
+
+/*
+ * Reverse lookup table to find the IOMMU which translates a specific device.
+ */
  extern struct amd_iommu **amd_iommu_rlookup_table;
  
+/* size of the dma_ops aperture as power of 2 */
  extern unsigned amd_iommu_aperture_order;
  
+/* largest PCI device id we expect translation requests for */
  extern u16 amd_iommu_last_bdf;
  
  /* data structures for protection domain handling */
  extern struct protection_domain **amd_iommu_pd_table;
+
+/* allocation bitmap for domain ids */
  extern unsigned long *amd_iommu_pd_alloc_bitmap;
  
+/* will be 1 if device isolation is enabled */
  extern int amd_iommu_isolate;
  
+/* takes a PCI device id and prints it out in a readable form */
  static inline void print_devid(u16 devid, int nl)
  {
         int bus = devid >> 8;
@@ -241,4 +332,11 @@ static inline void print_devid(u16 devid, int nl)
                 printk("\n");
  }
  
+/* takes bus and device/function and returns the device id
+ * FIXME: should that be in generic PCI code? */
+static inline u16 calc_devid(u8 bus, u8 devfn)
+{
+       return (((u16)bus) << 8) | devfn;
+}
+
  #endif
diff --git a/include/asm-x86/apic.h b/include/asm-x86/apic.h

index 4e2c1e517f0652fb1877c378f760fdb03fe6e8bb..b96460a7190daf74ba155f5e552f8f6c7b2f8e35 100644 (file)
--- a/include/asm-x86/apic.h
+++ b/include/asm-x86/apic.h
@@ -3,6 +3,8 @@
  
  #include <linux/pm.h>
  #include <linux/delay.h>
+
+#include <asm/alternative.h>
  #include <asm/fixmap.h>
  #include <asm/apicdef.h>
  #include <asm/processor.h>
@@ -10,7 +12,7 @@
  
  #define ARCH_APICTIMER_STOPS_ON_C3     1
  
-#define Dprintk(x...)
+#define Dprintk printk
  
  /*
   * Debugging macros
@@ -35,7 +37,7 @@ extern void generic_apic_probe(void);
  
  #ifdef CONFIG_X86_LOCAL_APIC
  
-extern int apic_verbosity;
+extern unsigned int apic_verbosity;
  extern int local_apic_timer_c2_ok;
  
  extern int ioapic_force;
@@ -48,7 +50,6 @@ extern int disable_apic;
  #include <asm/paravirt.h>
  #else
  #define apic_write native_apic_write
-#define apic_write_atomic native_apic_write_atomic
  #define apic_read native_apic_read
  #define setup_boot_clock setup_boot_APIC_clock
  #define setup_secondary_clock setup_secondary_APIC_clock
@@ -58,12 +59,11 @@ extern int is_vsmp_box(void);
  
  static inline void native_apic_write(unsigned long reg, u32 v)
  {
-       *((volatile u32 *)(APIC_BASE + reg)) = v;
-}
+       volatile u32 *addr = (volatile u32 *)(APIC_BASE + reg);
  
-static inline void native_apic_write_atomic(unsigned long reg, u32 v)
-{
-       (void)xchg((u32 *)(APIC_BASE + reg), v);
+       alternative_io("movl %0, %1", "xchgl %0, %1", X86_FEATURE_11AP,
+                      ASM_OUTPUT2("=r" (v), "=m" (*addr)),
+                      ASM_OUTPUT2("0" (v), "m" (*addr)));
  }
  
  static inline u32 native_apic_read(unsigned long reg)
@@ -75,16 +75,6 @@ extern void apic_wait_icr_idle(void);
  extern u32 safe_apic_wait_icr_idle(void);
  extern int get_physical_broadcast(void);
  
-#ifdef CONFIG_X86_GOOD_APIC
-# define FORCE_READ_AROUND_WRITE 0
-# define apic_read_around(x)
-# define apic_write_around(x, y) apic_write((x), (y))
-#else
-# define FORCE_READ_AROUND_WRITE 1
-# define apic_read_around(x) apic_read(x)
-# define apic_write_around(x, y) apic_write_atomic((x), (y))
-#endif
-
  static inline void ack_APIC_irq(void)
  {
         /*
@@ -95,7 +85,7 @@ static inline void ack_APIC_irq(void)
          */
  
         /* Docs say use 0 for future compatibility */
-       apic_write_around(APIC_EOI, 0);
+       apic_write(APIC_EOI, 0);
  }
  
  extern int lapic_get_maxlvt(void);
diff --git a/include/asm-x86/arch_hooks.h b/include/asm-x86/arch_hooks.h

index 768aee8a04ef85c5cf1c6ee1b22fa0ebcf3dc60d..8411750ceb633763ad2fda66d8f154f9b87cfd7f 100644 (file)
--- a/include/asm-x86/arch_hooks.h
+++ b/include/asm-x86/arch_hooks.h
@@ -21,6 +21,7 @@ extern void intr_init_hook(void);
  extern void pre_intr_init_hook(void);
  extern void pre_setup_arch_hook(void);
  extern void trap_init_hook(void);
+extern void pre_time_init_hook(void);
  extern void time_init_hook(void);
  extern void mca_nmi_hook(void);
  
diff --git a/include/asm-x86/bitops.h b/include/asm-x86/bitops.h

index 96b1829cea15aa0fbc00177e394a9bab47a5a5b7..cfb2b64f76e7d645a8418426f71239b80250c5bb 100644 (file)
--- a/include/asm-x86/bitops.h
+++ b/include/asm-x86/bitops.h
@@ -356,7 +356,7 @@ static inline unsigned long ffz(unsigned long word)
   * __fls: find last set bit in word
   * @word: The word to search
   *
- * Undefined if no zero exists, so code should check against ~0UL first.
+ * Undefined if no set bit exists, so code should check against 0 first.
   */
  static inline unsigned long __fls(unsigned long word)
  {
diff --git a/include/asm-x86/calling.h b/include/asm-x86/calling.h

index f13e62e2cb3e75160297459ef182921b170d0850..2bc162e0ec6eb3bd02c685d80e99c24762a002b7 100644 (file)
--- a/include/asm-x86/calling.h
+++ b/include/asm-x86/calling.h
@@ -104,7 +104,7 @@
         .endif
         .endm
  
-       .macro LOAD_ARGS offset
+       .macro LOAD_ARGS offset, skiprax=0
         movq \offset(%rsp),    %r11
         movq \offset+8(%rsp),  %r10
         movq \offset+16(%rsp), %r9
@@ -113,7 +113,10 @@
         movq \offset+48(%rsp), %rdx
         movq \offset+56(%rsp), %rsi
         movq \offset+64(%rsp), %rdi
+       .if \skiprax
+       .else
         movq \offset+72(%rsp), %rax
+       .endif
         .endm
  
  #define REST_SKIP      6*8
@@ -165,4 +168,3 @@
         .macro icebp
         .byte 0xf1
         .endm
-
diff --git a/include/asm-x86/cpufeature.h b/include/asm-x86/cpufeature.h

index 75ef959db32921922db8f38bac44f8ac960cbc4a..2f5a792b0accafba2c93abc31e28157124a4a322 100644 (file)
--- a/include/asm-x86/cpufeature.h
+++ b/include/asm-x86/cpufeature.h
@@ -79,6 +79,7 @@
  #define X86_FEATURE_REP_GOOD   (3*32+16) /* rep microcode works well on this CPU */
  #define X86_FEATURE_MFENCE_RDTSC (3*32+17) /* Mfence synchronizes RDTSC */
  #define X86_FEATURE_LFENCE_RDTSC (3*32+18) /* Lfence synchronizes RDTSC */
+#define X86_FEATURE_11AP       (3*32+19)  /* Bad local APIC aka 11AP */
  
  /* Intel-defined CPU features, CPUID level 0x00000001 (ecx), word 4 */
  #define X86_FEATURE_XMM3       (4*32+ 0) /* Streaming SIMD Extensions-3 */
diff --git a/include/asm-x86/dma-mapping.h b/include/asm-x86/dma-mapping.h

index a1a4dc7fe6ece75087cc33e33d3c60e2477cd586..c2ddd3d1b8831a98fe139639e131f458bfc41d99 100644 (file)
--- a/include/asm-x86/dma-mapping.h
+++ b/include/asm-x86/dma-mapping.h
@@ -14,7 +14,6 @@ extern dma_addr_t bad_dma_address;
  extern int iommu_merge;
  extern struct device fallback_dev;
  extern int panic_on_overflow;
-extern int forbid_dac;
  extern int force_iommu;
  
  struct dma_mapping_ops {
diff --git a/include/asm-x86/e820.h b/include/asm-x86/e820.h

index 06633b01dd5b73685d3bcb0698ffe8442fc5f88b..16a31e2c7c5747888b0c75178bb0683859dbbac1 100644 (file)
--- a/include/asm-x86/e820.h
+++ b/include/asm-x86/e820.h
@@ -90,6 +90,14 @@ static inline void e820_mark_nosave_regions(unsigned long limit_pfn)
  }
  #endif
  
+#ifdef CONFIG_MEMTEST
+extern void early_memtest(unsigned long start, unsigned long end);
+#else
+static inline void early_memtest(unsigned long start, unsigned long end)
+{
+}
+#endif
+
  extern unsigned long end_user_pfn;
  
  extern u64 find_e820_area(u64 start, u64 end, u64 size, u64 align);
diff --git a/include/asm-x86/fixmap_32.h b/include/asm-x86/fixmap_32.h

index aae2f0501a4006a145ca82e9d65f7999c89aaf5f..f1ac2b2167d7fdf0ba27c450d2c688f05c3b9e02 100644 (file)
--- a/include/asm-x86/fixmap_32.h
+++ b/include/asm-x86/fixmap_32.h
@@ -90,13 +90,13 @@ enum fixed_addresses {
          * 256 temporary boot-time mappings, used by early_ioremap(),
          * before ioremap() is functional.
          *
-        * We round it up to the next 512 pages boundary so that we
+        * We round it up to the next 256-page boundary so that we
          * can have a single pgd entry and a single pte table:
          */
  #define NR_FIX_BTMAPS          64
  #define FIX_BTMAPS_NESTING     4
-       FIX_BTMAP_END = __end_of_permanent_fixed_addresses + 512 -
-                       (__end_of_permanent_fixed_addresses & 511),
+       FIX_BTMAP_END = __end_of_permanent_fixed_addresses + 256 -
+                       (__end_of_permanent_fixed_addresses & 255),
         FIX_BTMAP_BEGIN = FIX_BTMAP_END + NR_FIX_BTMAPS*FIX_BTMAPS_NESTING - 1,
         FIX_WP_TEST,
  #ifdef CONFIG_ACPI
diff --git a/include/asm-x86/gart.h b/include/asm-x86/gart.h

index 33b9aeeb35a2367ba5deb1092622c579c6431c5a..3f62a83887f32c21c0919dd5c255847a8e6c2740 100644 (file)
--- a/include/asm-x86/gart.h
+++ b/include/asm-x86/gart.h
@@ -2,7 +2,6 @@
  #define _ASM_X8664_GART_H 1
  
  #include <asm/e820.h>
-#include <asm/iommu.h>
  
  extern void set_up_gart_resume(u32, u32);
  
diff --git a/include/asm-x86/iommu.h b/include/asm-x86/iommu.h

index 068c9a40aa5b10c81f5123bf80c3b212cda32840..d63166fb3ab705a3e294b0b5f4c75bd544d17411 100644 (file)
--- a/include/asm-x86/iommu.h
+++ b/include/asm-x86/iommu.h
@@ -25,10 +25,18 @@ extern void gart_iommu_hole_init(void);
  static inline void early_gart_iommu_check(void)
  {
  }
-
+static inline void gart_iommu_init(void)
+{
+}
  static inline void gart_iommu_shutdown(void)
  {
  }
+static inline void gart_parse_options(char *options)
+{
+}
+static inline void gart_iommu_hole_init(void)
+{
+}
  #endif
  
  #endif
diff --git a/include/asm-x86/mach-bigsmp/mach_apic.h b/include/asm-x86/mach-bigsmp/mach_apic.h

index 017c8c19ad8f874e0cc461c388f58716545eda34..c3b9dc6970c95726a3adaafe5502334b9d64c68c 100644 (file)
--- a/include/asm-x86/mach-bigsmp/mach_apic.h
+++ b/include/asm-x86/mach-bigsmp/mach_apic.h
@@ -63,9 +63,9 @@ static inline void init_apic_ldr(void)
         unsigned long val;
         int cpu = smp_processor_id();
  
-       apic_write_around(APIC_DFR, APIC_DFR_VALUE);
+       apic_write(APIC_DFR, APIC_DFR_VALUE);
         val = calculate_ldr(cpu);
-       apic_write_around(APIC_LDR, val);
+       apic_write(APIC_LDR, val);
  }
  
  static inline void setup_apic_routing(void)
diff --git a/include/asm-x86/mach-default/mach_apic.h b/include/asm-x86/mach-default/mach_apic.h

index 0b2cde5e1b74b38641c4cea560784be0d7716a9b..f3226b9a6b823459935a4de2c5b8e82165776e57 100644 (file)
--- a/include/asm-x86/mach-default/mach_apic.h
+++ b/include/asm-x86/mach-default/mach_apic.h
@@ -46,10 +46,10 @@ static inline void init_apic_ldr(void)
  {
         unsigned long val;
  
-       apic_write_around(APIC_DFR, APIC_DFR_VALUE);
+       apic_write(APIC_DFR, APIC_DFR_VALUE);
         val = apic_read(APIC_LDR) & ~APIC_LDR_MASK;
         val |= SET_APIC_LOGICAL_ID(1UL << smp_processor_id());
-       apic_write_around(APIC_LDR, val);
+       apic_write(APIC_LDR, val);
  }
  
  static inline int apic_id_registered(void)
diff --git a/include/asm-x86/mach-es7000/mach_apic.h b/include/asm-x86/mach-es7000/mach_apic.h

index fbc8ad256f5aadda25deaeb7d7564515ec1e12ca..0a3fdf93067253e448ffa6a39ca8bcc0d1c0ed90 100644 (file)
--- a/include/asm-x86/mach-es7000/mach_apic.h
+++ b/include/asm-x86/mach-es7000/mach_apic.h
@@ -66,9 +66,9 @@ static inline void init_apic_ldr(void)
         unsigned long val;
         int cpu = smp_processor_id();
  
-       apic_write_around(APIC_DFR, APIC_DFR_VALUE);
+       apic_write(APIC_DFR, APIC_DFR_VALUE);
         val = calculate_ldr(cpu);
-       apic_write_around(APIC_LDR, val);
+       apic_write(APIC_LDR, val);
  }
  
  #ifndef CONFIG_X86_GENERICARCH
diff --git a/include/asm-x86/mach-generic/mach_mpspec.h b/include/asm-x86/mach-generic/mach_mpspec.h

index 9ef0b941bb22ba96559d2653a71ac30ac4e19f72..c83c120be538504f5139228bc53182e3040f43ea 100644 (file)
--- a/include/asm-x86/mach-generic/mach_mpspec.h
+++ b/include/asm-x86/mach-generic/mach_mpspec.h
@@ -7,4 +7,6 @@
  /* Maximum 256 PCI busses, plus 1 ISA bus in each of 4 cabinets. */
  #define MAX_MP_BUSSES 260
  
+extern void numaq_mps_oem_check(struct mp_config_table *mpc, char *oem,
+                               char *productid);
  #endif /* __ASM_MACH_MPSPEC_H */
diff --git a/include/asm-x86/mach-summit/mach_apic.h b/include/asm-x86/mach-summit/mach_apic.h

index 1f76c2e7023226656aeb48539c4985a25af339c9..75d2c95005d74b8eab2c2fce764dfc925f5aa876 100644 (file)
--- a/include/asm-x86/mach-summit/mach_apic.h
+++ b/include/asm-x86/mach-summit/mach_apic.h
@@ -63,10 +63,10 @@ static inline void init_apic_ldr(void)
          * BIOS puts 5 CPUs in one APIC cluster, we're hosed. */
         BUG_ON(count >= XAPIC_DEST_CPUS_SHIFT);
         id = my_cluster | (1UL << count);
-       apic_write_around(APIC_DFR, APIC_DFR_VALUE);
+       apic_write(APIC_DFR, APIC_DFR_VALUE);
         val = apic_read(APIC_LDR) & ~APIC_LDR_MASK;
         val |= SET_APIC_LOGICAL_ID(id);
-       apic_write_around(APIC_LDR, val);
+       apic_write(APIC_LDR, val);
  }
  
  static inline int multi_timer_check(int apic, int irq)
diff --git a/include/asm-x86/mach-visws/entry_arch.h b/include/asm-x86/mach-visws/entry_arch.h

deleted file mode 100644 (file)

index 86be554..0000000
--- a/include/asm-x86/mach-visws/entry_arch.h
+++ /dev/null
@@ -1,5 +0,0 @@
-/*
- * VISWS uses the standard Linux entry points:
- */
-
-#include "../mach-default/entry_arch.h"
diff --git a/include/asm-x86/mach-visws/mach_apic.h b/include/asm-x86/mach-visws/mach_apic.h

deleted file mode 100644 (file)

index 6943e7a..0000000
--- a/include/asm-x86/mach-visws/mach_apic.h
+++ /dev/null
@@ -1 +0,0 @@
-#include "../mach-default/mach_apic.h"
diff --git a/include/asm-x86/mach-visws/mach_apicdef.h b/include/asm-x86/mach-visws/mach_apicdef.h

deleted file mode 100644 (file)

index 42711d1..0000000
--- a/include/asm-x86/mach-visws/mach_apicdef.h
+++ /dev/null
@@ -1 +0,0 @@
-#include "../mach-default/mach_apicdef.h"
diff --git a/include/asm-x86/mach-visws/setup_arch.h b/include/asm-x86/mach-visws/setup_arch.h

deleted file mode 100644 (file)

index fa4766c..0000000
--- a/include/asm-x86/mach-visws/setup_arch.h
+++ /dev/null
@@ -1 +0,0 @@
-#include "../mach-default/setup_arch.h"
diff --git a/include/asm-x86/mach-visws/smpboot_hooks.h b/include/asm-x86/mach-visws/smpboot_hooks.h

deleted file mode 100644 (file)

index e4433ca..0000000
--- a/include/asm-x86/mach-visws/smpboot_hooks.h
+++ /dev/null
@@ -1 +0,0 @@
-#include "../mach-default/smpboot_hooks.h"
diff --git a/include/asm-x86/paravirt.h b/include/asm-x86/paravirt.h

index ef5e8ec6a6ab7431f0c799a2b56a4cd5f18fe5e9..695ce9383f52620590c627ace65d63b2d47912fe 100644 (file)
--- a/include/asm-x86/paravirt.h
+++ b/include/asm-x86/paravirt.h
@@ -205,7 +205,6 @@ struct pv_apic_ops {
          * these shouldn't be in this interface.
          */
         void (*apic_write)(unsigned long reg, u32 v);
-       void (*apic_write_atomic)(unsigned long reg, u32 v);
         u32 (*apic_read)(unsigned long reg);
         void (*setup_boot_clock)(void);
         void (*setup_secondary_clock)(void);
@@ -896,11 +895,6 @@ static inline void apic_write(unsigned long reg, u32 v)
         PVOP_VCALL2(pv_apic_ops.apic_write, reg, v);
  }
  
-static inline void apic_write_atomic(unsigned long reg, u32 v)
-{
-       PVOP_VCALL2(pv_apic_ops.apic_write_atomic, reg, v);
-}
-
  static inline u32 apic_read(unsigned long reg)
  {
         return PVOP_CALL1(unsigned long, pv_apic_ops.apic_read, reg);
@@ -1396,8 +1390,8 @@ extern struct paravirt_patch_site __parainstructions[],
   * caller saved registers but the argument parameter */
  #define PV_SAVE_REGS "pushq %%rdi;"
  #define PV_RESTORE_REGS "popq %%rdi;"
-#define PV_EXTRA_CLOBBERS EXTRA_CLOBBERS, "rcx" , "rdx"
-#define PV_VEXTRA_CLOBBERS EXTRA_CLOBBERS, "rdi", "rcx" , "rdx"
+#define PV_EXTRA_CLOBBERS EXTRA_CLOBBERS, "rcx" , "rdx", "rsi"
+#define PV_VEXTRA_CLOBBERS EXTRA_CLOBBERS, "rdi", "rcx" , "rdx", "rsi"
  #define PV_FLAGS_ARG "D"
  #endif
  
@@ -1489,8 +1483,26 @@ static inline unsigned long __raw_local_irq_save(void)
  
  
  #ifdef CONFIG_X86_64
-#define PV_SAVE_REGS   pushq %rax; pushq %rdi; pushq %rcx; pushq %rdx
-#define PV_RESTORE_REGS popq %rdx; popq %rcx; popq %rdi; popq %rax
+#define PV_SAVE_REGS                           \
+       push %rax;                              \
+       push %rcx;                              \
+       push %rdx;                              \
+       push %rsi;                              \
+       push %rdi;                              \
+       push %r8;                               \
+       push %r9;                               \
+       push %r10;                              \
+       push %r11
+#define PV_RESTORE_REGS                                \
+       pop %r11;                               \
+       pop %r10;                               \
+       pop %r9;                                \
+       pop %r8;                                \
+       pop %rdi;                               \
+       pop %rsi;                               \
+       pop %rdx;                               \
+       pop %rcx;                               \
+       pop %rax
  #define PARA_PATCH(struct, off)        ((PARAVIRT_PATCH_##struct + (off)) / 8)
  #define PARA_SITE(ptype, clobbers, ops) _PVSITE(ptype, clobbers, ops, .quad, 8)
  #define PARA_INDIRECT(addr)    *addr(%rip)
diff --git a/include/asm-x86/percpu.h b/include/asm-x86/percpu.h

index 912a3a17b9db85958abee1464d58e64d798f5f31..4e91ee1e37aa170140e92814a08232fef4e6dfd4 100644 (file)
--- a/include/asm-x86/percpu.h
+++ b/include/asm-x86/percpu.h
@@ -22,6 +22,32 @@
  
  DECLARE_PER_CPU(struct x8664_pda, pda);
  
+/*
+ * These are supposed to be implemented as a single instruction which
+ * operates on the per-cpu data base segment.  x86-64 doesn't have
+ * that yet, so this is a fairly inefficient workaround for the
+ * meantime.  The single instruction is atomic with respect to
+ * preemption and interrupts, so strictly we would need to disable
+ * interrupts here to achieve the same effect.  However, because they
+ * can be used from within interrupt-disable/enable, we can't actually
+ * disable interrupts; disabling preemption is enough.
+ */
+#define x86_read_percpu(var)                                           \
+       ({                                                              \
+               typeof(per_cpu_var(var)) __tmp;                         \
+               preempt_disable();                                      \
+               __tmp = __get_cpu_var(var);                             \
+               preempt_enable();                                       \
+               __tmp;                                                  \
+       })
+
+#define x86_write_percpu(var, val)                                     \
+       do {                                                            \
+               preempt_disable();                                      \
+               __get_cpu_var(var) = (val);                             \
+               preempt_enable();                                       \
+       } while(0)
+
  #else /* CONFIG_X86_64 */
  
  #ifdef __ASSEMBLY__
diff --git a/include/asm-x86/pgtable.h b/include/asm-x86/pgtable.h

index 49cbd76b9547ed3d0bf3075f4f6980a0143858c7..96aa76e691d8f4d0e1611b295e3d8db56633101c 100644 (file)
--- a/include/asm-x86/pgtable.h
+++ b/include/asm-x86/pgtable.h
@@ -302,6 +302,14 @@ int phys_mem_access_prot_allowed(struct file *file, unsigned long pfn,
  /* Install a pte for a particular vaddr in kernel space. */
  void set_pte_vaddr(unsigned long vaddr, pte_t pte);
  
+#ifdef CONFIG_X86_32
+extern void native_pagetable_setup_start(pgd_t *base);
+extern void native_pagetable_setup_done(pgd_t *base);
+#else
+static inline void native_pagetable_setup_start(pgd_t *base) {}
+static inline void native_pagetable_setup_done(pgd_t *base) {}
+#endif
+
  #ifdef CONFIG_PARAVIRT
  #include <asm/paravirt.h>
  #else  /* !CONFIG_PARAVIRT */
@@ -333,6 +341,16 @@ void set_pte_vaddr(unsigned long vaddr, pte_t pte);
  
  #define pte_update(mm, addr, ptep)              do { } while (0)
  #define pte_update_defer(mm, addr, ptep)        do { } while (0)
+
+static inline void __init paravirt_pagetable_setup_start(pgd_t *base)
+{
+       native_pagetable_setup_start(base);
+}
+
+static inline void __init paravirt_pagetable_setup_done(pgd_t *base)
+{
+       native_pagetable_setup_done(base);
+}
  #endif /* CONFIG_PARAVIRT */
  
  #endif /* __ASSEMBLY__ */
diff --git a/include/asm-x86/pgtable_32.h b/include/asm-x86/pgtable_32.h

index ec871c420d7e5c511a41c098c3ffb9895de25a20..0611abf96a5e397be364f10814340c48c821fb2d 100644 (file)
--- a/include/asm-x86/pgtable_32.h
+++ b/include/asm-x86/pgtable_32.h
@@ -171,21 +171,6 @@ do {                                               \
   */
  #define update_mmu_cache(vma, address, pte) do { } while (0)
  
-extern void native_pagetable_setup_start(pgd_t *base);
-extern void native_pagetable_setup_done(pgd_t *base);
-
-#ifndef CONFIG_PARAVIRT
-static inline void __init paravirt_pagetable_setup_start(pgd_t *base)
-{
-       native_pagetable_setup_start(base);
-}
-
-static inline void __init paravirt_pagetable_setup_done(pgd_t *base)
-{
-       native_pagetable_setup_done(base);
-}
-#endif /* !CONFIG_PARAVIRT */
-
  #endif /* !__ASSEMBLY__ */
  
  /*
diff --git a/include/asm-x86/pgtable_64.h b/include/asm-x86/pgtable_64.h

index fa7208b483cada7c9d735f8381c503b289a02d86..805d3128bfc4682683b900fd40190a05d355d2aa 100644 (file)
--- a/include/asm-x86/pgtable_64.h
+++ b/include/asm-x86/pgtable_64.h
@@ -16,6 +16,8 @@
  extern pud_t level3_kernel_pgt[512];
  extern pud_t level3_ident_pgt[512];
  extern pmd_t level2_kernel_pgt[512];
+extern pmd_t level2_fixmap_pgt[512];
+extern pmd_t level2_ident_pgt[512];
  extern pgd_t init_level4_pgt[];
  
  #define swapper_pg_dir init_level4_pgt
diff --git a/include/asm-x86/processor.h b/include/asm-x86/processor.h

index 55402d2ab9380e3f621c889503af1cec0abd9851..15cb82a44e89197c294e1ad59b07f1a8978c4e1d 100644 (file)
--- a/include/asm-x86/processor.h
+++ b/include/asm-x86/processor.h
@@ -722,8 +722,6 @@ static inline void __sti_mwait(unsigned long eax, unsigned long ecx)
  
  extern void mwait_idle_with_hints(unsigned long eax, unsigned long ecx);
  
-extern int                     force_mwait;
-
  extern void select_idle_routine(const struct cpuinfo_x86 *c);
  
  extern unsigned long           boot_option_idle_override;
diff --git a/include/asm-x86/ptrace-abi.h b/include/asm-x86/ptrace-abi.h

index f224eb3c3157591ac69a53466609ea0841beefbc..72e7b9db29bba0e2cb736d86281ab4f7c24cdd1d 100644 (file)
--- a/include/asm-x86/ptrace-abi.h
+++ b/include/asm-x86/ptrace-abi.h
@@ -73,11 +73,11 @@
  
  #ifdef __x86_64__
  # define PTRACE_ARCH_PRCTL       30
-#else
-# define PTRACE_SYSEMU           31
-# define PTRACE_SYSEMU_SINGLESTEP 32
  #endif
  
+#define PTRACE_SYSEMU            31
+#define PTRACE_SYSEMU_SINGLESTEP  32
+
  #define PTRACE_SINGLEBLOCK     33      /* resume execution until next branch */
  
  #ifndef __ASSEMBLY__
diff --git a/include/asm-x86/segment.h b/include/asm-x86/segment.h

index dfc8601c08922f26e6b19a4a28d998a0b1a7c274..646452ea9ea3ede3d7847f5be78c8ca5a0feee48 100644 (file)
--- a/include/asm-x86/segment.h
+++ b/include/asm-x86/segment.h
@@ -1,6 +1,15 @@
  #ifndef _ASM_X86_SEGMENT_H_
  #define _ASM_X86_SEGMENT_H_
  
+/* Constructor for a conventional segment GDT (or LDT) entry */
+/* This is a macro so it can be used in initializers */
+#define GDT_ENTRY(flags, base, limit)                  \
+       ((((base)  & 0xff000000ULL) << (56-24)) |       \
+        (((flags) & 0x0000f0ffULL) << 40) |            \
+        (((limit) & 0x000f0000ULL) << (48-16)) |       \
+        (((base)  & 0x00ffffffULL) << 16) |            \
+        (((limit) & 0x0000ffffULL)))
+
  /* Simple and small GDT entries for booting only */
  
  #define GDT_ENTRY_BOOT_CS      2
diff --git a/include/asm-x86/setup.h b/include/asm-x86/setup.h

index 90ab2225e71bbefe9fb06c9d85127c555e8a8445..a07c6f1c01e15b9480f14fcca8dc13d875334fc9 100644 (file)
--- a/include/asm-x86/setup.h
+++ b/include/asm-x86/setup.h
@@ -19,13 +19,28 @@ static inline int is_visws_box(void) { return 0; }
  /*
   * Any setup quirks to be performed?
   */
-extern int (*arch_time_init_quirk)(void);
-extern int (*arch_pre_intr_init_quirk)(void);
-extern int (*arch_intr_init_quirk)(void);
-extern int (*arch_trap_init_quirk)(void);
-extern char * (*arch_memory_setup_quirk)(void);
-extern int (*mach_get_smp_config_quirk)(unsigned int early);
-extern int (*mach_find_smp_config_quirk)(unsigned int reserve);
+struct mpc_config_processor;
+struct mpc_config_bus;
+struct mp_config_oemtable;
+struct x86_quirks {
+       int (*arch_pre_time_init)(void);
+       int (*arch_time_init)(void);
+       int (*arch_pre_intr_init)(void);
+       int (*arch_intr_init)(void);
+       int (*arch_trap_init)(void);
+       char * (*arch_memory_setup)(void);
+       int (*mach_get_smp_config)(unsigned int early);
+       int (*mach_find_smp_config)(unsigned int reserve);
+
+       int *mpc_record;
+       int (*mpc_apic_id)(struct mpc_config_processor *m);
+       void (*mpc_oem_bus_info)(struct mpc_config_bus *m, char *name);
+       void (*mpc_oem_pci_bus)(struct mpc_config_bus *m);
+       void (*smp_read_mpc_oem)(struct mp_config_oemtable *oemtable,
+                                    unsigned short oemsize);
+};
+
+extern struct x86_quirks *x86_quirks;
  
  #ifndef CONFIG_PARAVIRT
  #define paravirt_post_allocator_init() do {} while (0)
@@ -76,6 +91,7 @@ extern unsigned long init_pg_tables_start;
  extern unsigned long init_pg_tables_end;
  
  #else
+void __init x86_64_init_pda(void);
  void __init x86_64_start_kernel(char *real_mode);
  void __init x86_64_start_reservations(char *real_mode_data);
  
diff --git a/include/asm-x86/smp.h b/include/asm-x86/smp.h

index c2784b3e0b77e23269a1c61407aaeb2db39cdc9d..3c877f74f279454cd579cf71530ef3bd051b75ff 100644 (file)
--- a/include/asm-x86/smp.h
+++ b/include/asm-x86/smp.h
@@ -25,6 +25,8 @@ extern cpumask_t cpu_callin_map;
  extern void (*mtrr_hook)(void);
  extern void zap_low_mappings(void);
  
+extern int __cpuinit get_local_pda(int cpu);
+
  extern int smp_num_siblings;
  extern unsigned int num_processors;
  extern cpumask_t cpu_initialized;
diff --git a/include/asm-x86/swiotlb.h b/include/asm-x86/swiotlb.h

index f5d9e74b1e4ab70a718886961113cafbdb2f4fcc..c706a7442633f1f3fdf051598f06326157d51b7a 100644 (file)
--- a/include/asm-x86/swiotlb.h
+++ b/include/asm-x86/swiotlb.h
@@ -45,12 +45,14 @@ extern int swiotlb_force;
  
  #ifdef CONFIG_SWIOTLB
  extern int swiotlb;
+extern void pci_swiotlb_init(void);
  #else
  #define swiotlb 0
+static inline void pci_swiotlb_init(void)
+{
+}
  #endif
  
-extern void pci_swiotlb_init(void);
-
  static inline void dma_mark_clean(void *addr, size_t size) {}
  
  #endif /* _ASM_SWIOTLB_H */
diff --git a/include/asm-x86/thread_info.h b/include/asm-x86/thread_info.h

index 895339d2bc0bae799af368ec10efff5050d5afc3..0a8f27d31d0db4eb25075a01525827e540488867 100644 (file)
--- a/include/asm-x86/thread_info.h
+++ b/include/asm-x86/thread_info.h
@@ -75,9 +75,7 @@ struct thread_info {
  #define TIF_NEED_RESCHED       3       /* rescheduling necessary */
  #define TIF_SINGLESTEP         4       /* reenable singlestep on user return*/
  #define TIF_IRET               5       /* force IRET */
-#ifdef CONFIG_X86_32
  #define TIF_SYSCALL_EMU                6       /* syscall emulation active */
-#endif
  #define TIF_SYSCALL_AUDIT      7       /* syscall auditing active */
  #define TIF_SECCOMP            8       /* secure computing */
  #define TIF_MCE_NOTIFY         10      /* notify userspace of an MCE */
@@ -100,11 +98,7 @@ struct thread_info {
  #define _TIF_SINGLESTEP                (1 << TIF_SINGLESTEP)
  #define _TIF_NEED_RESCHED      (1 << TIF_NEED_RESCHED)
  #define _TIF_IRET              (1 << TIF_IRET)
-#ifdef CONFIG_X86_32
  #define _TIF_SYSCALL_EMU       (1 << TIF_SYSCALL_EMU)
-#else
-#define _TIF_SYSCALL_EMU       0
-#endif
  #define _TIF_SYSCALL_AUDIT     (1 << TIF_SYSCALL_AUDIT)
  #define _TIF_SECCOMP           (1 << TIF_SECCOMP)
  #define _TIF_MCE_NOTIFY                (1 << TIF_MCE_NOTIFY)
@@ -121,18 +115,27 @@ struct thread_info {
  #define _TIF_DS_AREA_MSR       (1 << TIF_DS_AREA_MSR)
  #define _TIF_BTS_TRACE_TS      (1 << TIF_BTS_TRACE_TS)
  
+/* work to do in syscall_trace_enter() */
+#define _TIF_WORK_SYSCALL_ENTRY        \
+       (_TIF_SYSCALL_TRACE | _TIF_SYSCALL_EMU | \
+        _TIF_SYSCALL_AUDIT | _TIF_SECCOMP | _TIF_SINGLESTEP)
+
+/* work to do in syscall_trace_leave() */
+#define _TIF_WORK_SYSCALL_EXIT \
+       (_TIF_SYSCALL_TRACE | _TIF_SYSCALL_AUDIT | _TIF_SINGLESTEP)
+
  /* work to do on interrupt/exception return */
  #define _TIF_WORK_MASK                                                 \
         (0x0000FFFF &                                                   \
-        ~(_TIF_SYSCALL_TRACE|_TIF_SYSCALL_AUDIT|_TIF_SINGLESTEP|       \
-        _TIF_SECCOMP|_TIF_SYSCALL_EMU))
+        ~(_TIF_SYSCALL_TRACE|_TIF_SYSCALL_AUDIT|                       \
+          _TIF_SINGLESTEP|_TIF_SECCOMP|_TIF_SYSCALL_EMU))
  
  /* work to do on any return to user space */
  #define _TIF_ALLWORK_MASK (0x0000FFFF & ~_TIF_SECCOMP)
  
  /* Only used for 64 bit */
  #define _TIF_DO_NOTIFY_MASK                                            \
-       (_TIF_SIGPENDING|_TIF_SINGLESTEP|_TIF_MCE_NOTIFY|_TIF_HRTICK_RESCHED)
+       (_TIF_SIGPENDING|_TIF_MCE_NOTIFY|_TIF_HRTICK_RESCHED)
  
  /* flags to check in __switch_to() */
  #define _TIF_WORK_CTXSW                                                        \
diff --git a/include/asm-x86/traps.h b/include/asm-x86/traps.h

new file mode 100644 (file)

index 0000000..a4b65a7
--- /dev/null
+++ b/include/asm-x86/traps.h
@@ -0,0 +1,66 @@
+#ifndef _ASM_X86_TRAPS_H
+#define _ASM_X86_TRAPS_H
+
+/* Common in X86_32 and X86_64 */
+asmlinkage void divide_error(void);
+asmlinkage void debug(void);
+asmlinkage void nmi(void);
+asmlinkage void int3(void);
+asmlinkage void overflow(void);
+asmlinkage void bounds(void);
+asmlinkage void invalid_op(void);
+asmlinkage void device_not_available(void);
+asmlinkage void coprocessor_segment_overrun(void);
+asmlinkage void invalid_TSS(void);
+asmlinkage void segment_not_present(void);
+asmlinkage void stack_segment(void);
+asmlinkage void general_protection(void);
+asmlinkage void page_fault(void);
+asmlinkage void coprocessor_error(void);
+asmlinkage void simd_coprocessor_error(void);
+asmlinkage void alignment_check(void);
+asmlinkage void spurious_interrupt_bug(void);
+#ifdef CONFIG_X86_MCE
+asmlinkage void machine_check(void);
+#endif /* CONFIG_X86_MCE */
+
+void do_divide_error(struct pt_regs *, long);
+void do_overflow(struct pt_regs *, long);
+void do_bounds(struct pt_regs *, long);
+void do_coprocessor_segment_overrun(struct pt_regs *, long);
+void do_invalid_TSS(struct pt_regs *, long);
+void do_segment_not_present(struct pt_regs *, long);
+void do_stack_segment(struct pt_regs *, long);
+void do_alignment_check(struct pt_regs *, long);
+void do_invalid_op(struct pt_regs *, long);
+void do_general_protection(struct pt_regs *, long);
+void do_nmi(struct pt_regs *, long);
+
+extern int panic_on_unrecovered_nmi;
+extern int kstack_depth_to_print;
+
+#ifdef CONFIG_X86_32
+
+void do_iret_error(struct pt_regs *, long);
+void do_int3(struct pt_regs *, long);
+void do_debug(struct pt_regs *, long);
+void math_error(void __user *);
+void do_coprocessor_error(struct pt_regs *, long);
+void do_simd_coprocessor_error(struct pt_regs *, long);
+void do_spurious_interrupt_bug(struct pt_regs *, long);
+unsigned long patch_espfix_desc(unsigned long, unsigned long);
+asmlinkage void math_emulate(long);
+
+#else /* CONFIG_X86_32 */
+
+asmlinkage void double_fault(void);
+
+asmlinkage void do_int3(struct pt_regs *, long);
+asmlinkage void do_stack_segment(struct pt_regs *, long);
+asmlinkage void do_debug(struct pt_regs *, unsigned long);
+asmlinkage void do_coprocessor_error(struct pt_regs *);
+asmlinkage void do_simd_coprocessor_error(struct pt_regs *);
+asmlinkage void do_spurious_interrupt_bug(struct pt_regs *);
+
+#endif /* CONFIG_X86_32 */
+#endif /* _ASM_X86_TRAPS_H */
diff --git a/include/asm-x86/uv/bios.h b/include/asm-x86/uv/bios.h

new file mode 100644 (file)

index 0000000..aa73362
--- /dev/null
+++ b/include/asm-x86/uv/bios.h
@@ -0,0 +1,68 @@
+#ifndef _ASM_X86_BIOS_H
+#define _ASM_X86_BIOS_H
+
+/*
+ * BIOS layer definitions.
+ *
+ *  Copyright (c) 2008 Silicon Graphics, Inc.  All Rights Reserved.
+ *
+ *  This program is free software; you can redistribute it and/or modify
+ *  it under the terms of the GNU General Public License as published by
+ *  the Free Software Foundation; either version 2 of the License, or
+ *  (at your option) any later version.
+ *
+ *  This program is distributed in the hope that it will be useful,
+ *  but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *  GNU General Public License for more details.
+ *
+ *  You should have received a copy of the GNU General Public License
+ *  along with this program; if not, write to the Free Software
+ *  Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307 USA
+ */
+
+#include <linux/rtc.h>
+
+#define BIOS_FREQ_BASE                 0x01000001
+
+enum {
+       BIOS_FREQ_BASE_PLATFORM = 0,
+       BIOS_FREQ_BASE_INTERVAL_TIMER = 1,
+       BIOS_FREQ_BASE_REALTIME_CLOCK = 2
+};
+
+# define BIOS_CALL(result, a0, a1, a2, a3, a4, a5, a6, a7)             \
+       do {                                                            \
+               /* XXX - the real call goes here */                     \
+               result.status = BIOS_STATUS_UNIMPLEMENTED;              \
+               isrv.v0 = 0;                                            \
+               isrv.v1 = 0;                                            \
+       } while (0)
+
+enum {
+       BIOS_STATUS_SUCCESS             =  0,
+       BIOS_STATUS_UNIMPLEMENTED       = -1,
+       BIOS_STATUS_EINVAL              = -2,
+       BIOS_STATUS_ERROR               = -3
+};
+
+struct uv_bios_retval {
+       /*
+        * A zero status value indicates call completed without error.
+        * A negative status value indicates the reason for the call failure.
+        * A positive status value indicates success but an
+        * informational value should be printed (e.g., "reboot for
+        * change to take effect").
+        */
+       s64 status;
+       u64 v0;
+       u64 v1;
+       u64 v2;
+};
+
+extern long
+x86_bios_freq_base(unsigned long which, unsigned long *ticks_per_second,
+                  unsigned long *drift_info);
+extern const char *x86_bios_strerror(long status);
+
+#endif /* _ASM_X86_BIOS_H */
diff --git a/include/asm-x86/vdso.h b/include/asm-x86/vdso.h

index 86e085e003d2b6b4ea46c7591d7db8ae0e9c244b..8e18fb80f5e641ac3683b8c84c68cb511c4ab30f 100644 (file)
--- a/include/asm-x86/vdso.h
+++ b/include/asm-x86/vdso.h
@@ -36,4 +36,12 @@ extern const char VDSO32_PRELINK[];
  extern void __user __kernel_sigreturn;
  extern void __user __kernel_rt_sigreturn;
  
+/*
+ * These symbols are defined by vdso32.S to mark the bounds
+ * of the ELF DSO images included therein.
+ */
+extern const char vdso32_int80_start, vdso32_int80_end;
+extern const char vdso32_syscall_start, vdso32_syscall_end;
+extern const char vdso32_sysenter_start, vdso32_sysenter_end;
+
  #endif /* asm-x86/vdso.h */
diff --git a/include/asm-x86/xen/hypercall.h b/include/asm-x86/xen/hypercall.h

index 2a4f9b41d68499d39004fb238a49a528b1a282be..91cb7fd5c1234be0f549b44f8a93208c7fedd92d 100644 (file)
--- a/include/asm-x86/xen/hypercall.h
+++ b/include/asm-x86/xen/hypercall.h
@@ -40,83 +40,157 @@
  #include <xen/interface/sched.h>
  #include <xen/interface/physdev.h>
  
+/*
+ * The hypercall asms have to meet several constraints:
+ * - Work on 32- and 64-bit.
+ *    The two architectures put their arguments in different sets of
+ *    registers.
+ *
+ * - Work around asm syntax quirks
+ *    It isn't possible to specify one of the rNN registers in a
+ *    constraint, so we use explicit register variables to get the
+ *    args into the right place.
+ *
+ * - Mark all registers as potentially clobbered
+ *    Even unused parameters can be clobbered by the hypervisor, so we
+ *    need to make sure gcc knows it.
+ *
+ * - Avoid compiler bugs.
+ *    This is the tricky part.  Because x86_32 has such a constrained
+ *    register set, gcc versions below 4.3 have trouble generating
+ *    code when all the arg registers and memory are trashed by the
+ *    asm.  There are syntactically simpler ways of achieving the
+ *    semantics below, but they cause the compiler to crash.
+ *
+ *    The only combination I found which works is:
+ *     - assign the __argX variables first
+ *     - list all actually used parameters as "+r" (__argX)
+ *     - clobber the rest
+ *
+ * The result certainly isn't pretty, and it really shows up cpp's
+ * weakness as a macro language.  Sorry.  (But let's just give thanks
+ * there aren't more than 5 arguments...)
+ */
+
  extern struct { char _entry[32]; } hypercall_page[];
  
+#define __HYPERCALL            "call hypercall_page+%c[offset]"
+#define __HYPERCALL_ENTRY(x)                                           \
+       [offset] "i" (__HYPERVISOR_##x * sizeof(hypercall_page[0]))
+
+#ifdef CONFIG_X86_32
+#define __HYPERCALL_RETREG     "eax"
+#define __HYPERCALL_ARG1REG    "ebx"
+#define __HYPERCALL_ARG2REG    "ecx"
+#define __HYPERCALL_ARG3REG    "edx"
+#define __HYPERCALL_ARG4REG    "esi"
+#define __HYPERCALL_ARG5REG    "edi"
+#else
+#define __HYPERCALL_RETREG     "rax"
+#define __HYPERCALL_ARG1REG    "rdi"
+#define __HYPERCALL_ARG2REG    "rsi"
+#define __HYPERCALL_ARG3REG    "rdx"
+#define __HYPERCALL_ARG4REG    "r10"
+#define __HYPERCALL_ARG5REG    "r8"
+#endif
+
+#define __HYPERCALL_DECLS                                              \
+       register unsigned long __res  asm(__HYPERCALL_RETREG);          \
+       register unsigned long __arg1 asm(__HYPERCALL_ARG1REG) = __arg1; \
+       register unsigned long __arg2 asm(__HYPERCALL_ARG2REG) = __arg2; \
+       register unsigned long __arg3 asm(__HYPERCALL_ARG3REG) = __arg3; \
+       register unsigned long __arg4 asm(__HYPERCALL_ARG4REG) = __arg4; \
+       register unsigned long __arg5 asm(__HYPERCALL_ARG5REG) = __arg5;
+
+#define __HYPERCALL_0PARAM     "=r" (__res)
+#define __HYPERCALL_1PARAM     __HYPERCALL_0PARAM, "+r" (__arg1)
+#define __HYPERCALL_2PARAM     __HYPERCALL_1PARAM, "+r" (__arg2)
+#define __HYPERCALL_3PARAM     __HYPERCALL_2PARAM, "+r" (__arg3)
+#define __HYPERCALL_4PARAM     __HYPERCALL_3PARAM, "+r" (__arg4)
+#define __HYPERCALL_5PARAM     __HYPERCALL_4PARAM, "+r" (__arg5)
+
+#define __HYPERCALL_0ARG()
+#define __HYPERCALL_1ARG(a1)                                           \
+       __HYPERCALL_0ARG()              __arg1 = (unsigned long)(a1);
+#define __HYPERCALL_2ARG(a1,a2)                                                \
+       __HYPERCALL_1ARG(a1)            __arg2 = (unsigned long)(a2);
+#define __HYPERCALL_3ARG(a1,a2,a3)                                     \
+       __HYPERCALL_2ARG(a1,a2)         __arg3 = (unsigned long)(a3);
+#define __HYPERCALL_4ARG(a1,a2,a3,a4)                                  \
+       __HYPERCALL_3ARG(a1,a2,a3)      __arg4 = (unsigned long)(a4);
+#define __HYPERCALL_5ARG(a1,a2,a3,a4,a5)                               \
+       __HYPERCALL_4ARG(a1,a2,a3,a4)   __arg5 = (unsigned long)(a5);
+
+#define __HYPERCALL_CLOBBER5   "memory"
+#define __HYPERCALL_CLOBBER4   __HYPERCALL_CLOBBER5, __HYPERCALL_ARG5REG
+#define __HYPERCALL_CLOBBER3   __HYPERCALL_CLOBBER4, __HYPERCALL_ARG4REG
+#define __HYPERCALL_CLOBBER2   __HYPERCALL_CLOBBER3, __HYPERCALL_ARG3REG
+#define __HYPERCALL_CLOBBER1   __HYPERCALL_CLOBBER2, __HYPERCALL_ARG2REG
+#define __HYPERCALL_CLOBBER0   __HYPERCALL_CLOBBER1, __HYPERCALL_ARG1REG
+
  #define _hypercall0(type, name)                                                \
  ({                                                                     \
-       long __res;                                                     \
-       asm volatile (                                                  \
-               "call %[call]"                                          \
-               : "=a" (__res)                                          \
-               : [call] "m" (hypercall_page[__HYPERVISOR_##name])      \
-               : "memory" );                                           \
+       __HYPERCALL_DECLS;                                              \
+       __HYPERCALL_0ARG();                                             \
+       asm volatile (__HYPERCALL                                       \
+                     : __HYPERCALL_0PARAM                              \
+                     : __HYPERCALL_ENTRY(name)                         \
+                     : __HYPERCALL_CLOBBER0);                          \
         (type)__res;                                                    \
  })
  
  #define _hypercall1(type, name, a1)                                    \
  ({                                                                     \
-       long __res, __ign1;                                             \
-       asm volatile (                                                  \
-               "call %[call]"                                          \
-               : "=a" (__res), "=b" (__ign1)                           \
-               : "1" ((long)(a1)),                                     \
-                 [call] "m" (hypercall_page[__HYPERVISOR_##name])      \
-               : "memory" );                                           \
+       __HYPERCALL_DECLS;                                              \
+       __HYPERCALL_1ARG(a1);                                           \
+       asm volatile (__HYPERCALL                                       \
+                     : __HYPERCALL_1PARAM                              \
+                     : __HYPERCALL_ENTRY(name)                         \
+                     : __HYPERCALL_CLOBBER1);                          \
         (type)__res;                                                    \
  })
  
  #define _hypercall2(type, name, a1, a2)                                        \
  ({                                                                     \
-       long __res, __ign1, __ign2;                                     \
-       asm volatile (                                                  \
-               "call %[call]"                                          \
-               : "=a" (__res), "=b" (__ign1), "=c" (__ign2)            \
-               : "1" ((long)(a1)), "2" ((long)(a2)),                   \
-                 [call] "m" (hypercall_page[__HYPERVISOR_##name])      \
-               : "memory" );                                           \
+       __HYPERCALL_DECLS;                                              \
+       __HYPERCALL_2ARG(a1, a2);                                       \
+       asm volatile (__HYPERCALL                                       \
+                     : __HYPERCALL_2PARAM                              \
+                     : __HYPERCALL_ENTRY(name)                         \
+                     : __HYPERCALL_CLOBBER2);                          \
         (type)__res;                                                    \
  })
  
  #define _hypercall3(type, name, a1, a2, a3)                            \
  ({                                                                     \
-       long __res, __ign1, __ign2, __ign3;                             \
-       asm volatile (                                                  \
-               "call %[call]"                                          \
-               : "=a" (__res), "=b" (__ign1), "=c" (__ign2),           \
-               "=d" (__ign3)                                           \
-               : "1" ((long)(a1)), "2" ((long)(a2)),                   \
-                 "3" ((long)(a3)),                                     \
-                 [call] "m" (hypercall_page[__HYPERVISOR_##name])      \
-               : "memory" );                                           \
+       __HYPERCALL_DECLS;                                              \
+       __HYPERCALL_3ARG(a1, a2, a3);                                   \
+       asm volatile (__HYPERCALL                                       \
+                     : __HYPERCALL_3PARAM                              \
+                     : __HYPERCALL_ENTRY(name)                         \
+                     : __HYPERCALL_CLOBBER3);                          \
         (type)__res;                                                    \
  })
  
  #define _hypercall4(type, name, a1, a2, a3, a4)                                \
  ({                                                                     \
-       long __res, __ign1, __ign2, __ign3, __ign4;                     \
-       asm volatile (                                                  \
-               "call %[call]"                                          \
-               : "=a" (__res), "=b" (__ign1), "=c" (__ign2),           \
-               "=d" (__ign3), "=S" (__ign4)                            \
-               : "1" ((long)(a1)), "2" ((long)(a2)),                   \
-                 "3" ((long)(a3)), "4" ((long)(a4)),                   \
-                 [call] "m" (hypercall_page[__HYPERVISOR_##name])      \
-               : "memory" );                                           \
+       __HYPERCALL_DECLS;                                              \
+       __HYPERCALL_4ARG(a1, a2, a3, a4);                               \
+       asm volatile (__HYPERCALL                                       \
+                     : __HYPERCALL_4PARAM                              \
+                     : __HYPERCALL_ENTRY(name)                         \
+                     : __HYPERCALL_CLOBBER4);                          \
         (type)__res;                                                    \
  })
  
  #define _hypercall5(type, name, a1, a2, a3, a4, a5)                    \
  ({                                                                     \
-       long __res, __ign1, __ign2, __ign3, __ign4, __ign5;             \
-       asm volatile (                                                  \
-               "call %[call]"                                          \
-               : "=a" (__res), "=b" (__ign1), "=c" (__ign2),           \
-               "=d" (__ign3), "=S" (__ign4), "=D" (__ign5)             \
-               : "1" ((long)(a1)), "2" ((long)(a2)),                   \
-                 "3" ((long)(a3)), "4" ((long)(a4)),                   \
-                 "5" ((long)(a5)),                                     \
-                 [call] "m" (hypercall_page[__HYPERVISOR_##name])      \
-               : "memory" );                                           \
+       __HYPERCALL_DECLS;                                              \
+       __HYPERCALL_5ARG(a1, a2, a3, a4, a5);                           \
+       asm volatile (__HYPERCALL                                       \
+                     : __HYPERCALL_5PARAM                              \
+                     : __HYPERCALL_ENTRY(name)                         \
+                     : __HYPERCALL_CLOBBER5);                          \
         (type)__res;                                                    \
  })
  
@@ -152,6 +226,7 @@ HYPERVISOR_stack_switch(unsigned long ss, unsigned long esp)
         return _hypercall2(int, stack_switch, ss, esp);
  }
  
+#ifdef CONFIG_X86_32
  static inline int
  HYPERVISOR_set_callbacks(unsigned long event_selector,
                          unsigned long event_address,
@@ -162,6 +237,17 @@ HYPERVISOR_set_callbacks(unsigned long event_selector,
                            event_selector, event_address,
                            failsafe_selector, failsafe_address);
  }
+#else  /* CONFIG_X86_64 */
+static inline int
+HYPERVISOR_set_callbacks(unsigned long event_address,
+                       unsigned long failsafe_address,
+                       unsigned long syscall_address)
+{
+       return _hypercall3(int, set_callbacks,
+                          event_address, failsafe_address,
+                          syscall_address);
+}
+#endif  /* CONFIG_X86_{32,64} */
  
  static inline int
  HYPERVISOR_callback_op(int cmd, void *arg)
@@ -223,12 +309,12 @@ static inline int
  HYPERVISOR_update_va_mapping(unsigned long va, pte_t new_val,
                              unsigned long flags)
  {
-       unsigned long pte_hi = 0;
-#ifdef CONFIG_X86_PAE
-       pte_hi = new_val.pte_high;
-#endif
-       return _hypercall4(int, update_va_mapping, va,
-                          new_val.pte_low, pte_hi, flags);
+       if (sizeof(new_val) == sizeof(long))
+               return _hypercall3(int, update_va_mapping, va,
+                                  new_val.pte, flags);
+       else
+               return _hypercall4(int, update_va_mapping, va,
+                                  new_val.pte, new_val.pte >> 32, flags);
  }
  
  static inline int
@@ -281,12 +367,13 @@ static inline int
  HYPERVISOR_update_va_mapping_otherdomain(unsigned long va, pte_t new_val,
                                          unsigned long flags, domid_t domid)
  {
-       unsigned long pte_hi = 0;
-#ifdef CONFIG_X86_PAE
-       pte_hi = new_val.pte_high;
-#endif
-       return _hypercall5(int, update_va_mapping_otherdomain, va,
-                          new_val.pte_low, pte_hi, flags, domid);
+       if (sizeof(new_val) == sizeof(long))
+               return _hypercall4(int, update_va_mapping_otherdomain, va,
+                                  new_val.pte, flags, domid);
+       else
+               return _hypercall5(int, update_va_mapping_otherdomain, va,
+                                  new_val.pte, new_val.pte >> 32,
+                                  flags, domid);
  }
  
  static inline int
@@ -301,6 +388,14 @@ HYPERVISOR_vcpu_op(int cmd, int vcpuid, void *extra_args)
         return _hypercall3(int, vcpu_op, cmd, vcpuid, extra_args);
  }
  
+#ifdef CONFIG_X86_64
+static inline int
+HYPERVISOR_set_segment_base(int reg, unsigned long value)
+{
+       return _hypercall2(int, set_segment_base, reg, value);
+}
+#endif
+
  static inline int
  HYPERVISOR_suspend(unsigned long srec)
  {
@@ -327,14 +422,14 @@ MULTI_update_va_mapping(struct multicall_entry *mcl, unsigned long va,
  {
         mcl->op = __HYPERVISOR_update_va_mapping;
         mcl->args[0] = va;
-#ifdef CONFIG_X86_PAE
-       mcl->args[1] = new_val.pte_low;
-       mcl->args[2] = new_val.pte_high;
-#else
-       mcl->args[1] = new_val.pte_low;
-       mcl->args[2] = 0;
-#endif
-       mcl->args[3] = flags;
+       if (sizeof(new_val) == sizeof(long)) {
+               mcl->args[1] = new_val.pte;
+               mcl->args[2] = flags;
+       } else {
+               mcl->args[1] = new_val.pte;
+               mcl->args[2] = new_val.pte >> 32;
+               mcl->args[3] = flags;
+       }
  }
  
  static inline void
@@ -354,15 +449,16 @@ MULTI_update_va_mapping_otherdomain(struct multicall_entry *mcl, unsigned long v
  {
         mcl->op = __HYPERVISOR_update_va_mapping_otherdomain;
         mcl->args[0] = va;
-#ifdef CONFIG_X86_PAE
-       mcl->args[1] = new_val.pte_low;
-       mcl->args[2] = new_val.pte_high;
-#else
-       mcl->args[1] = new_val.pte_low;
-       mcl->args[2] = 0;
-#endif
-       mcl->args[3] = flags;
-       mcl->args[4] = domid;
+       if (sizeof(new_val) == sizeof(long)) {
+               mcl->args[1] = new_val.pte;
+               mcl->args[2] = flags;
+               mcl->args[3] = domid;
+       } else {
+               mcl->args[1] = new_val.pte;
+               mcl->args[2] = new_val.pte >> 32;
+               mcl->args[3] = flags;
+               mcl->args[4] = domid;
+       }
  }
  
  static inline void
@@ -370,10 +466,15 @@ MULTI_update_descriptor(struct multicall_entry *mcl, u64 maddr,
                         struct desc_struct desc)
  {
         mcl->op = __HYPERVISOR_update_descriptor;
-       mcl->args[0] = maddr;
-       mcl->args[1] = maddr >> 32;
-       mcl->args[2] = desc.a;
-       mcl->args[3] = desc.b;
+       if (sizeof(maddr) == sizeof(long)) {
+               mcl->args[0] = maddr;
+               mcl->args[1] = *(unsigned long *)&desc;
+       } else {
+               mcl->args[0] = maddr;
+               mcl->args[1] = maddr >> 32;
+               mcl->args[2] = desc.a;
+               mcl->args[3] = desc.b;
+       }
  }
  
  static inline void
diff --git a/include/asm-x86/xen/interface.h b/include/asm-x86/xen/interface.h

index 6227000a1e840780512b31e1e246bf7cd14ee716..9d810f2538a2227083bbe8b6bc7a7b753c9add7b 100644 (file)
--- a/include/asm-x86/xen/interface.h
+++ b/include/asm-x86/xen/interface.h
@@ -1,13 +1,13 @@
  /******************************************************************************
   * arch-x86_32.h
   *
- * Guest OS interface to x86 32-bit Xen.
+ * Guest OS interface to x86 Xen.
   *
   * Copyright (c) 2004, K A Fraser
   */
  
-#ifndef __XEN_PUBLIC_ARCH_X86_32_H__
-#define __XEN_PUBLIC_ARCH_X86_32_H__
+#ifndef __ASM_X86_XEN_INTERFACE_H
+#define __ASM_X86_XEN_INTERFACE_H
  
  #ifdef __XEN__
  #define __DEFINE_GUEST_HANDLE(name, type) \
@@ -57,6 +57,17 @@ DEFINE_GUEST_HANDLE(long);
  DEFINE_GUEST_HANDLE(void);
  #endif
  
+#ifndef HYPERVISOR_VIRT_START
+#define HYPERVISOR_VIRT_START mk_unsigned_long(__HYPERVISOR_VIRT_START)
+#endif
+
+#ifndef machine_to_phys_mapping
+#define machine_to_phys_mapping ((unsigned long *)HYPERVISOR_VIRT_START)
+#endif
+
+/* Maximum number of virtual CPUs in multi-processor guests. */
+#define MAX_VIRT_CPUS 32
+
  /*
   * SEGMENT DESCRIPTOR TABLES
   */
@@ -70,59 +81,22 @@ DEFINE_GUEST_HANDLE(void);
  #define FIRST_RESERVED_GDT_BYTE  (FIRST_RESERVED_GDT_PAGE * 4096)
  #define FIRST_RESERVED_GDT_ENTRY (FIRST_RESERVED_GDT_BYTE / 8)
  
-/*
- * These flat segments are in the Xen-private section of every GDT. Since these
- * are also present in the initial GDT, many OSes will be able to avoid
- * installing their own GDT.
- */
-#define FLAT_RING1_CS 0xe019    /* GDT index 259 */
-#define FLAT_RING1_DS 0xe021    /* GDT index 260 */
-#define FLAT_RING1_SS 0xe021    /* GDT index 260 */
-#define FLAT_RING3_CS 0xe02b    /* GDT index 261 */
-#define FLAT_RING3_DS 0xe033    /* GDT index 262 */
-#define FLAT_RING3_SS 0xe033    /* GDT index 262 */
-
-#define FLAT_KERNEL_CS FLAT_RING1_CS
-#define FLAT_KERNEL_DS FLAT_RING1_DS
-#define FLAT_KERNEL_SS FLAT_RING1_SS
-#define FLAT_USER_CS    FLAT_RING3_CS
-#define FLAT_USER_DS    FLAT_RING3_DS
-#define FLAT_USER_SS    FLAT_RING3_SS
-
-/* And the trap vector is... */
-#define TRAP_INSTR "int $0x82"
-
-/*
- * Virtual addresses beyond this are not modifiable by guest OSes. The
- * machine->physical mapping table starts at this address, read-only.
- */
-#ifdef CONFIG_X86_PAE
-#define __HYPERVISOR_VIRT_START 0xF5800000
-#else
-#define __HYPERVISOR_VIRT_START 0xFC000000
-#endif
-
-#ifndef HYPERVISOR_VIRT_START
-#define HYPERVISOR_VIRT_START mk_unsigned_long(__HYPERVISOR_VIRT_START)
-#endif
-
-#ifndef machine_to_phys_mapping
-#define machine_to_phys_mapping ((unsigned long *)HYPERVISOR_VIRT_START)
-#endif
-
-/* Maximum number of virtual CPUs in multi-processor guests. */
-#define MAX_VIRT_CPUS 32
-
-#ifndef __ASSEMBLY__
-
  /*
   * Send an array of these to HYPERVISOR_set_trap_table()
+ * The privilege level specifies which modes may enter a trap via a software
+ * interrupt. On x86/64, since rings 1 and 2 are unavailable, we allocate
+ * privilege levels as follows:
+ *  Level == 0: No one may enter
+ *  Level == 1: Kernel may enter
+ *  Level == 2: Kernel may enter
+ *  Level == 3: Everyone may enter
   */
  #define TI_GET_DPL(_ti)                ((_ti)->flags & 3)
  #define TI_GET_IF(_ti)         ((_ti)->flags & 4)
  #define TI_SET_DPL(_ti, _dpl)  ((_ti)->flags |= (_dpl))
  #define TI_SET_IF(_ti, _if)    ((_ti)->flags |= ((!!(_if))<<2))
  
+#ifndef __ASSEMBLY__
  struct trap_info {
      uint8_t       vector;  /* exception vector                              */
      uint8_t       flags;   /* 0-3: privilege level; 4: clear event enable?  */
@@ -131,32 +105,21 @@ struct trap_info {
  };
  DEFINE_GUEST_HANDLE_STRUCT(trap_info);
  
-struct cpu_user_regs {
-    uint32_t ebx;
-    uint32_t ecx;
-    uint32_t edx;
-    uint32_t esi;
-    uint32_t edi;
-    uint32_t ebp;
-    uint32_t eax;
-    uint16_t error_code;    /* private */
-    uint16_t entry_vector;  /* private */
-    uint32_t eip;
-    uint16_t cs;
-    uint8_t  saved_upcall_mask;
-    uint8_t  _pad0;
-    uint32_t eflags;        /* eflags.IF == !saved_upcall_mask */
-    uint32_t esp;
-    uint16_t ss, _pad1;
-    uint16_t es, _pad2;
-    uint16_t ds, _pad3;
-    uint16_t fs, _pad4;
-    uint16_t gs, _pad5;
+struct arch_shared_info {
+    unsigned long max_pfn;                  /* max pfn that appears in table */
+    /* Frame containing list of mfns containing list of mfns containing p2m. */
+    unsigned long pfn_to_mfn_frame_list_list;
+    unsigned long nmi_reason;
  };
-DEFINE_GUEST_HANDLE_STRUCT(cpu_user_regs);
+#endif /* !__ASSEMBLY__ */
  
-typedef uint64_t tsc_timestamp_t; /* RDTSC timestamp */
+#ifdef CONFIG_X86_32
+#include "interface_32.h"
+#else
+#include "interface_64.h"
+#endif
  
+#ifndef __ASSEMBLY__
  /*
   * The following is all CPU context. Note that the fpu_ctxt block is filled
   * in by FXSAVE if the CPU has feature FXSR; otherwise FSAVE is used.
@@ -173,33 +136,29 @@ struct vcpu_guest_context {
      unsigned long ldt_base, ldt_ents;       /* LDT (linear address, # ents) */
      unsigned long gdt_frames[16], gdt_ents; /* GDT (machine frames, # ents) */
      unsigned long kernel_ss, kernel_sp;     /* Virtual TSS (only SS1/SP1)   */
+    /* NB. User pagetable on x86/64 is placed in ctrlreg[1]. */
      unsigned long ctrlreg[8];               /* CR0-CR7 (control registers)  */
      unsigned long debugreg[8];              /* DB0-DB7 (debug registers)    */
+#ifdef __i386__
      unsigned long event_callback_cs;        /* CS:EIP of event callback     */
      unsigned long event_callback_eip;
      unsigned long failsafe_callback_cs;     /* CS:EIP of failsafe callback  */
      unsigned long failsafe_callback_eip;
+#else
+    unsigned long event_callback_eip;
+    unsigned long failsafe_callback_eip;
+    unsigned long syscall_callback_eip;
+#endif
      unsigned long vm_assist;                /* VMASST_TYPE_* bitmap */
+#ifdef __x86_64__
+    /* Segment base addresses. */
+    uint64_t      fs_base;
+    uint64_t      gs_base_kernel;
+    uint64_t      gs_base_user;
+#endif
  };
  DEFINE_GUEST_HANDLE_STRUCT(vcpu_guest_context);
-
-struct arch_shared_info {
-    unsigned long max_pfn;                  /* max pfn that appears in table */
-    /* Frame containing list of mfns containing list of mfns containing p2m. */
-    unsigned long pfn_to_mfn_frame_list_list;
-    unsigned long nmi_reason;
-};
-
-struct arch_vcpu_info {
-    unsigned long cr2;
-    unsigned long pad[5]; /* sizeof(struct vcpu_info) == 64 */
-};
-
-struct xen_callback {
-       unsigned long cs;
-       unsigned long eip;
-};
-#endif /* !__ASSEMBLY__ */
+#endif /* !__ASSEMBLY__ */
  
  /*
   * Prefix forces emulation of some non-trapping instructions.
@@ -213,4 +172,4 @@ struct xen_callback {
  #define XEN_CPUID          XEN_EMULATE_PREFIX "cpuid"
  #endif
  
-#endif
+#endif /* __ASM_X86_XEN_INTERFACE_H */
diff --git a/include/asm-x86/xen/interface_32.h b/include/asm-x86/xen/interface_32.h

new file mode 100644 (file)

index 0000000..d8ac41d
--- /dev/null
+++ b/include/asm-x86/xen/interface_32.h
@@ -0,0 +1,97 @@
+/******************************************************************************
+ * arch-x86_32.h
+ *
+ * Guest OS interface to x86 32-bit Xen.
+ *
+ * Copyright (c) 2004, K A Fraser
+ */
+
+#ifndef __ASM_X86_XEN_INTERFACE_32_H
+#define __ASM_X86_XEN_INTERFACE_32_H
+
+
+/*
+ * These flat segments are in the Xen-private section of every GDT. Since these
+ * are also present in the initial GDT, many OSes will be able to avoid
+ * installing their own GDT.
+ */
+#define FLAT_RING1_CS 0xe019    /* GDT index 259 */
+#define FLAT_RING1_DS 0xe021    /* GDT index 260 */
+#define FLAT_RING1_SS 0xe021    /* GDT index 260 */
+#define FLAT_RING3_CS 0xe02b    /* GDT index 261 */
+#define FLAT_RING3_DS 0xe033    /* GDT index 262 */
+#define FLAT_RING3_SS 0xe033    /* GDT index 262 */
+
+#define FLAT_KERNEL_CS FLAT_RING1_CS
+#define FLAT_KERNEL_DS FLAT_RING1_DS
+#define FLAT_KERNEL_SS FLAT_RING1_SS
+#define FLAT_USER_CS    FLAT_RING3_CS
+#define FLAT_USER_DS    FLAT_RING3_DS
+#define FLAT_USER_SS    FLAT_RING3_SS
+
+/* And the trap vector is... */
+#define TRAP_INSTR "int $0x82"
+
+/*
+ * Virtual addresses beyond this are not modifiable by guest OSes. The
+ * machine->physical mapping table starts at this address, read-only.
+ */
+#define __HYPERVISOR_VIRT_START 0xF5800000
+
+#ifndef __ASSEMBLY__
+
+struct cpu_user_regs {
+    uint32_t ebx;
+    uint32_t ecx;
+    uint32_t edx;
+    uint32_t esi;
+    uint32_t edi;
+    uint32_t ebp;
+    uint32_t eax;
+    uint16_t error_code;    /* private */
+    uint16_t entry_vector;  /* private */
+    uint32_t eip;
+    uint16_t cs;
+    uint8_t  saved_upcall_mask;
+    uint8_t  _pad0;
+    uint32_t eflags;        /* eflags.IF == !saved_upcall_mask */
+    uint32_t esp;
+    uint16_t ss, _pad1;
+    uint16_t es, _pad2;
+    uint16_t ds, _pad3;
+    uint16_t fs, _pad4;
+    uint16_t gs, _pad5;
+};
+DEFINE_GUEST_HANDLE_STRUCT(cpu_user_regs);
+
+typedef uint64_t tsc_timestamp_t; /* RDTSC timestamp */
+
+struct arch_vcpu_info {
+    unsigned long cr2;
+    unsigned long pad[5]; /* sizeof(struct vcpu_info) == 64 */
+};
+
+struct xen_callback {
+       unsigned long cs;
+       unsigned long eip;
+};
+typedef struct xen_callback xen_callback_t;
+
+#define XEN_CALLBACK(__cs, __eip)                              \
+       ((struct xen_callback){ .cs = (__cs), .eip = (unsigned long)(__eip) })
+#endif /* !__ASSEMBLY__ */
+
+
+/*
+ * Page-directory addresses above 4GB do not fit into architectural %cr3.
+ * When accessing %cr3, or equivalent field in vcpu_guest_context, guests
+ * must use the following accessor macros to pack/unpack valid MFNs.
+ *
+ * Note that Xen is using the fact that the pagetable base is always
+ * page-aligned, and putting the 12 MSB of the address into the 12 LSB
+ * of cr3.
+ */
+#define xen_pfn_to_cr3(pfn) (((unsigned)(pfn) << 12) | ((unsigned)(pfn) >> 20))
+#define xen_cr3_to_pfn(cr3) (((unsigned)(cr3) >> 12) | ((unsigned)(cr3) << 20))
+
+#endif /* __ASM_X86_XEN_INTERFACE_32_H */
diff --git a/include/asm-x86/xen/interface_64.h b/include/asm-x86/xen/interface_64.h

new file mode 100644 (file)

index 0000000..842266c
--- /dev/null
+++ b/include/asm-x86/xen/interface_64.h
@@ -0,0 +1,159 @@
+#ifndef __ASM_X86_XEN_INTERFACE_64_H
+#define __ASM_X86_XEN_INTERFACE_64_H
+
+/*
+ * 64-bit segment selectors
+ * These flat segments are in the Xen-private section of every GDT. Since these
+ * are also present in the initial GDT, many OSes will be able to avoid
+ * installing their own GDT.
+ */
+
+#define FLAT_RING3_CS32 0xe023  /* GDT index 260 */
+#define FLAT_RING3_CS64 0xe033  /* GDT index 261 */
+#define FLAT_RING3_DS32 0xe02b  /* GDT index 262 */
+#define FLAT_RING3_DS64 0x0000  /* NULL selector */
+#define FLAT_RING3_SS32 0xe02b  /* GDT index 262 */
+#define FLAT_RING3_SS64 0xe02b  /* GDT index 262 */
+
+#define FLAT_KERNEL_DS64 FLAT_RING3_DS64
+#define FLAT_KERNEL_DS32 FLAT_RING3_DS32
+#define FLAT_KERNEL_DS   FLAT_KERNEL_DS64
+#define FLAT_KERNEL_CS64 FLAT_RING3_CS64
+#define FLAT_KERNEL_CS32 FLAT_RING3_CS32
+#define FLAT_KERNEL_CS   FLAT_KERNEL_CS64
+#define FLAT_KERNEL_SS64 FLAT_RING3_SS64
+#define FLAT_KERNEL_SS32 FLAT_RING3_SS32
+#define FLAT_KERNEL_SS   FLAT_KERNEL_SS64
+
+#define FLAT_USER_DS64 FLAT_RING3_DS64
+#define FLAT_USER_DS32 FLAT_RING3_DS32
+#define FLAT_USER_DS   FLAT_USER_DS64
+#define FLAT_USER_CS64 FLAT_RING3_CS64
+#define FLAT_USER_CS32 FLAT_RING3_CS32
+#define FLAT_USER_CS   FLAT_USER_CS64
+#define FLAT_USER_SS64 FLAT_RING3_SS64
+#define FLAT_USER_SS32 FLAT_RING3_SS32
+#define FLAT_USER_SS   FLAT_USER_SS64
+
+#define __HYPERVISOR_VIRT_START 0xFFFF800000000000
+#define __HYPERVISOR_VIRT_END   0xFFFF880000000000
+#define __MACH2PHYS_VIRT_START  0xFFFF800000000000
+#define __MACH2PHYS_VIRT_END    0xFFFF804000000000
+
+#ifndef HYPERVISOR_VIRT_START
+#define HYPERVISOR_VIRT_START mk_unsigned_long(__HYPERVISOR_VIRT_START)
+#define HYPERVISOR_VIRT_END   mk_unsigned_long(__HYPERVISOR_VIRT_END)
+#endif
+
+#define MACH2PHYS_VIRT_START  mk_unsigned_long(__MACH2PHYS_VIRT_START)
+#define MACH2PHYS_VIRT_END    mk_unsigned_long(__MACH2PHYS_VIRT_END)
+#define MACH2PHYS_NR_ENTRIES  ((MACH2PHYS_VIRT_END-MACH2PHYS_VIRT_START)>>3)
+#ifndef machine_to_phys_mapping
+#define machine_to_phys_mapping ((unsigned long *)HYPERVISOR_VIRT_START)
+#endif
+
+/*
+ * int HYPERVISOR_set_segment_base(unsigned int which, unsigned long base)
+ *  @which == SEGBASE_*  ;  @base == 64-bit base address
+ * Returns 0 on success.
+ */
+#define SEGBASE_FS          0
+#define SEGBASE_GS_USER     1
+#define SEGBASE_GS_KERNEL   2
+#define SEGBASE_GS_USER_SEL 3 /* Set user %gs specified in base[15:0] */
+
+/*
+ * int HYPERVISOR_iret(void)
+ * All arguments are on the kernel stack, in the following format.
+ * Never returns if successful. Current kernel context is lost.
+ * The saved CS is mapped as follows:
+ *   RING0 -> RING3 kernel mode.
+ *   RING1 -> RING3 kernel mode.
+ *   RING2 -> RING3 kernel mode.
+ *   RING3 -> RING3 user mode.
+ * However RING0 indicates that the guest kernel should return to itself
+ * directly with
+ *      orb   $3,1*8(%rsp)
+ *      iretq
+ * If flags contains VGCF_in_syscall:
+ *   Restore RAX, RIP, RFLAGS, RSP.
+ *   Discard R11, RCX, CS, SS.
+ * Otherwise:
+ *   Restore RAX, R11, RCX, CS:RIP, RFLAGS, SS:RSP.
+ * All other registers are saved on hypercall entry and restored to user.
+ */
+/* Guest exited in SYSCALL context? Return to guest with SYSRET? */
+#define _VGCF_in_syscall 8
+#define VGCF_in_syscall  (1<<_VGCF_in_syscall)
+#define VGCF_IN_SYSCALL  VGCF_in_syscall
+
+#ifndef __ASSEMBLY__
+
+struct iret_context {
+    /* Top of stack (%rsp at point of hypercall). */
+    uint64_t rax, r11, rcx, flags, rip, cs, rflags, rsp, ss;
+    /* Bottom of iret stack frame. */
+};
+
+#if defined(__GNUC__) && !defined(__STRICT_ANSI__)
+/* Anonymous union includes both 32- and 64-bit names (e.g., eax/rax). */
+#define __DECL_REG(name) union { \
+    uint64_t r ## name, e ## name; \
+    uint32_t _e ## name; \
+}
+#else
+/* Non-gcc sources must always use the proper 64-bit name (e.g., rax). */
+#define __DECL_REG(name) uint64_t r ## name
+#endif
+
+struct cpu_user_regs {
+    uint64_t r15;
+    uint64_t r14;
+    uint64_t r13;
+    uint64_t r12;
+    __DECL_REG(bp);
+    __DECL_REG(bx);
+    uint64_t r11;
+    uint64_t r10;
+    uint64_t r9;
+    uint64_t r8;
+    __DECL_REG(ax);
+    __DECL_REG(cx);
+    __DECL_REG(dx);
+    __DECL_REG(si);
+    __DECL_REG(di);
+    uint32_t error_code;    /* private */
+    uint32_t entry_vector;  /* private */
+    __DECL_REG(ip);
+    uint16_t cs, _pad0[1];
+    uint8_t  saved_upcall_mask;
+    uint8_t  _pad1[3];
+    __DECL_REG(flags);      /* rflags.IF == !saved_upcall_mask */
+    __DECL_REG(sp);
+    uint16_t ss, _pad2[3];
+    uint16_t es, _pad3[3];
+    uint16_t ds, _pad4[3];
+    uint16_t fs, _pad5[3]; /* Non-zero => takes precedence over fs_base.      */
+    uint16_t gs, _pad6[3]; /* Non-zero => takes precedence over gs_base_user. */
+};
+DEFINE_GUEST_HANDLE_STRUCT(cpu_user_regs);
+
+#undef __DECL_REG
+
+#define xen_pfn_to_cr3(pfn) ((unsigned long)(pfn) << 12)
+#define xen_cr3_to_pfn(cr3) ((unsigned long)(cr3) >> 12)
+
+struct arch_vcpu_info {
+    unsigned long cr2;
+    unsigned long pad; /* sizeof(vcpu_info_t) == 64 */
+};
+
+typedef unsigned long xen_callback_t;
+
+#define XEN_CALLBACK(__cs, __rip)                              \
+       ((unsigned long)(__rip))
+
+#endif /* !__ASSEMBLY__ */
+
+
+#endif /* __ASM_X86_XEN_INTERFACE_64_H */
diff --git a/include/asm-x86/xen/page.h b/include/asm-x86/xen/page.h

index 377c04591c15e7b7d7a2584a176e6d2cb4d8bb77..05e678a86628b5a7c53563f766eeb57652ed4014 100644 (file)
--- a/include/asm-x86/xen/page.h
+++ b/include/asm-x86/xen/page.h
@@ -148,13 +148,17 @@ static inline pte_t __pte_ma(pteval_t x)
  }
  
  #define pmd_val_ma(v) ((v).pmd)
+#ifdef __PAGETABLE_PUD_FOLDED
  #define pud_val_ma(v) ((v).pgd.pgd)
+#else
+#define pud_val_ma(v) ((v).pud)
+#endif
  #define __pmd_ma(x)    ((pmd_t) { (x) } )
  
  #define pgd_val_ma(x)  ((x).pgd)
  
  
-xmaddr_t arbitrary_virt_to_machine(unsigned long address);
+xmaddr_t arbitrary_virt_to_machine(void *address);
  void make_lowmem_page_readonly(void *vaddr);
  void make_lowmem_page_readwrite(void *vaddr);
  
diff --git a/include/linux/debugfs.h b/include/linux/debugfs.h

index 7266124361b44f24259db727c7dc23d0694fb8de..32755cdf68db6512daeea31a16e18048167b7212 100644 (file)
--- a/include/linux/debugfs.h
+++ b/include/linux/debugfs.h
@@ -26,6 +26,8 @@ struct debugfs_blob_wrapper {
         unsigned long size;
  };
  
+extern struct dentry *arch_debugfs_dir;
+
  #if defined(CONFIG_DEBUG_FS)
  
  /* declared over in file.c */
diff --git a/include/xen/hvc-console.h b/include/xen/hvc-console.h

index 98b79bc404ddef529acbab64a6588c1c5998afc5..c3adde32669b49f958b816a5b3b6e757b25a078e 100644 (file)
--- a/include/xen/hvc-console.h
+++ b/include/xen/hvc-console.h
@@ -5,11 +5,12 @@ extern struct console xenboot_console;
  
  #ifdef CONFIG_HVC_XEN
  void xen_console_resume(void);
+void xen_raw_console_write(const char *str);
+void xen_raw_printk(const char *fmt, ...);
  #else
  static inline void xen_console_resume(void) { }
+static inline void xen_raw_console_write(const char *str) { }
+static inline void xen_raw_printk(const char *fmt, ...) { }
  #endif
  
-void xen_raw_console_write(const char *str);
-void xen_raw_printk(const char *fmt, ...);
-
  #endif /* XEN_HVC_CONSOLE_H */
diff --git a/include/xen/interface/callback.h b/include/xen/interface/callback.h

index 4aadcba31af9810f973760e4ae5f9fc30ada070a..2ae3cd243264c0652cbd3d3e8e815b6363ef756c 100644 (file)
--- a/include/xen/interface/callback.h
+++ b/include/xen/interface/callback.h
@@ -82,9 +82,9 @@
   */
  #define CALLBACKOP_register                0
  struct callback_register {
-    uint16_t type;
-    uint16_t flags;
-    struct xen_callback address;
+       uint16_t type;
+       uint16_t flags;
+       xen_callback_t address;
  };
  
  /*
diff --git a/include/xen/xen-ops.h b/include/xen/xen-ops.h

index a706d6a7896016c8698698ad1472a26b7551cab9..883a21bba24bac3991e79533f586679de6926a4a 100644 (file)
--- a/include/xen/xen-ops.h
+++ b/include/xen/xen-ops.h
@@ -11,4 +11,7 @@ void xen_post_suspend(int suspend_cancelled);
  void xen_mm_pin_all(void);
  void xen_mm_unpin_all(void);
  
+void xen_timer_resume(void);
+void xen_arch_resume(void);
+
  #endif /* INCLUDE_XEN_OPS_H */
diff --git a/kernel/power/Kconfig b/kernel/power/Kconfig

index b45da40e8d25f6a480fb8366004fcd4de2607ef4..59dfdf1e1d2071ba29b711fbb2b34d28a57cd998 100644 (file)
--- a/kernel/power/Kconfig
+++ b/kernel/power/Kconfig
@@ -82,7 +82,7 @@ config PM_SLEEP_SMP
  
  config PM_SLEEP
         bool
-       depends on SUSPEND || HIBERNATION
+       depends on SUSPEND || HIBERNATION || XEN_SAVE_RESTORE
         default y
  
  config SUSPEND
author	Ingo Molnar <mingo@elte.hu>
	Mon, 21 Jul 2008 14:37:17 +0000 (16:37 +0200)
committer	Ingo Molnar <mingo@elte.hu>
	Mon, 21 Jul 2008 14:37:17 +0000 (16:37 +0200)
Documentation/kernel-parameters.txt		patch \| blob \| blame \| history
arch/x86/Kconfig		patch \| blob \| blame \| history
arch/x86/Kconfig.cpu		patch \| blob \| blame \| history
arch/x86/Kconfig.debug		patch \| blob \| blame \| history
arch/x86/boot/edd.c		patch \| blob \| blame \| history
arch/x86/boot/pm.c		patch \| blob \| blame \| history
arch/x86/ia32/ia32_signal.c		patch \| blob \| blame \| history
arch/x86/ia32/ia32entry.S		patch \| blob \| blame \| history
arch/x86/kernel/Makefile		patch \| blob \| blame \| history
arch/x86/kernel/acpi/sleep.c		patch \| blob \| blame \| history
arch/x86/kernel/amd_iommu.c		patch \| blob \| blame \| history
arch/x86/kernel/amd_iommu_init.c		patch \| blob \| blame \| history
arch/x86/kernel/aperture_64.c		patch \| blob \| blame \| history
arch/x86/kernel/apic_32.c		patch \| blob \| blame \| history
arch/x86/kernel/apic_64.c		patch \| blob \| blame \| history
arch/x86/kernel/asm-offsets_64.c		patch \| blob \| blame \| history
arch/x86/kernel/bios_uv.c	[new file with mode: 0644]	patch \| blob
arch/x86/kernel/cpu/amd.c		patch \| blob \| blame \| history
arch/x86/kernel/cpu/amd_64.c		patch \| blob \| blame \| history
arch/x86/kernel/cpu/bugs.c		patch \| blob \| blame \| history
arch/x86/kernel/cpu/common_64.c		patch \| blob \| blame \| history
arch/x86/kernel/cpu/intel.c		patch \| blob \| blame \| history
arch/x86/kernel/cpu/mcheck/p4.c		patch \| blob \| blame \| history
arch/x86/kernel/e820.c		patch \| blob \| blame \| history
arch/x86/kernel/early-quirks.c		patch \| blob \| blame \| history
arch/x86/kernel/entry_32.S		patch \| blob \| blame \| history
arch/x86/kernel/entry_64.S		patch \| blob \| blame \| history
arch/x86/kernel/genx2apic_uv_x.c		patch \| blob \| blame \| history
arch/x86/kernel/head64.c		patch \| blob \| blame \| history
arch/x86/kernel/head_64.S		patch \| blob \| blame \| history
arch/x86/kernel/io_apic_32.c		patch \| blob \| blame \| history
arch/x86/kernel/io_apic_64.c		patch \| blob \| blame \| history
arch/x86/kernel/io_delay.c		patch \| blob \| blame \| history
arch/x86/kernel/ipi.c		patch \| blob \| blame \| history
arch/x86/kernel/irq_32.c		patch \| blob \| blame \| history
arch/x86/kernel/kdebugfs.c		patch \| blob \| blame \| history
arch/x86/kernel/kprobes.c		patch \| blob \| blame \| history
arch/x86/kernel/module_64.c		patch \| blob \| blame \| history
arch/x86/kernel/mpparse.c		patch \| blob \| blame \| history
arch/x86/kernel/nmi.c		patch \| blob \| blame \| history
arch/x86/kernel/numaq_32.c		patch \| blob \| blame \| history
arch/x86/kernel/paravirt.c		patch \| blob \| blame \| history
arch/x86/kernel/pci-calgary_64.c		patch \| blob \| blame \| history
arch/x86/kernel/pci-dma.c		patch \| blob \| blame \| history
arch/x86/kernel/pci-gart_64.c		patch \| blob \| blame \| history
arch/x86/kernel/pci-nommu.c		patch \| blob \| blame \| history
arch/x86/kernel/pci-swiotlb_64.c		patch \| blob \| blame \| history
arch/x86/kernel/process.c		patch \| blob \| blame \| history
arch/x86/kernel/process_64.c		patch \| blob \| blame \| history
arch/x86/kernel/ptrace.c		patch \| blob \| blame \| history
arch/x86/kernel/reboot.c		patch \| blob \| blame \| history
arch/x86/kernel/setup.c		patch \| blob \| blame \| history
arch/x86/kernel/signal_32.c		patch \| blob \| blame \| history
arch/x86/kernel/signal_64.c		patch \| blob \| blame \| history
arch/x86/kernel/smpboot.c		patch \| blob \| blame \| history
arch/x86/kernel/step.c		patch \| blob \| blame \| history
arch/x86/kernel/time_32.c		patch \| blob \| blame \| history
arch/x86/kernel/traps_32.c		patch \| blob \| blame \| history
arch/x86/kernel/traps_64.c		patch \| blob \| blame \| history
arch/x86/kernel/visws_quirks.c		patch \| blob \| blame \| history
arch/x86/kernel/vmi_32.c		patch \| blob \| blame \| history
arch/x86/lguest/boot.c		patch \| blob \| blame \| history
arch/x86/mach-default/setup.c		patch \| blob \| blame \| history
arch/x86/mm/Makefile		patch \| blob \| blame \| history
arch/x86/mm/init_32.c		patch \| blob \| blame \| history
arch/x86/mm/init_64.c		patch \| blob \| blame \| history
arch/x86/mm/memtest.c	[new file with mode: 0644]	patch \| blob
arch/x86/mm/pat.c		patch \| blob \| blame \| history
arch/x86/pci/Makefile		patch \| blob \| blame \| history
arch/x86/pci/legacy.c		patch \| blob \| blame \| history
arch/x86/pci/numa.c	[deleted file]	patch \| blob \| blame \| history
arch/x86/pci/numaq_32.c	[new file with mode: 0644]	patch \| blob
arch/x86/pci/pci.h		patch \| blob \| blame \| history
arch/x86/pci/visws.c		patch \| blob \| blame \| history
arch/x86/vdso/Makefile		patch \| blob \| blame \| history
arch/x86/vdso/vdso32-setup.c		patch \| blob \| blame \| history
arch/x86/vdso/vdso32.S		patch \| blob \| blame \| history
arch/x86/vdso/vma.c		patch \| blob \| blame \| history
arch/x86/xen/Kconfig		patch \| blob \| blame \| history
arch/x86/xen/Makefile		patch \| blob \| blame \| history
arch/x86/xen/enlighten.c		patch \| blob \| blame \| history
arch/x86/xen/mmu.c		patch \| blob \| blame \| history
arch/x86/xen/mmu.h		patch \| blob \| blame \| history
arch/x86/xen/multicalls.c		patch \| blob \| blame \| history
arch/x86/xen/setup.c		patch \| blob \| blame \| history
arch/x86/xen/smp.c		patch \| blob \| blame \| history
arch/x86/xen/suspend.c		patch \| blob \| blame \| history
arch/x86/xen/xen-asm.S	[deleted file]	patch \| blob \| blame \| history
arch/x86/xen/xen-asm_32.S	[new file with mode: 0644]	patch \| blob
arch/x86/xen/xen-asm_64.S	[new file with mode: 0644]	patch \| blob
arch/x86/xen/xen-head.S		patch \| blob \| blame \| history
arch/x86/xen/xen-ops.h		patch \| blob \| blame \| history
drivers/net/xen-netfront.c		patch \| blob \| blame \| history
drivers/pci/intel-iommu.c		patch \| blob \| blame \| history
drivers/xen/manage.c		patch \| blob \| blame \| history
include/asm-x86/amd_iommu_types.h		patch \| blob \| blame \| history
include/asm-x86/apic.h		patch \| blob \| blame \| history
include/asm-x86/arch_hooks.h		patch \| blob \| blame \| history
include/asm-x86/bitops.h		patch \| blob \| blame \| history
include/asm-x86/calling.h		patch \| blob \| blame \| history
include/asm-x86/cpufeature.h		patch \| blob \| blame \| history
include/asm-x86/dma-mapping.h		patch \| blob \| blame \| history
include/asm-x86/e820.h		patch \| blob \| blame \| history
include/asm-x86/fixmap_32.h		patch \| blob \| blame \| history
include/asm-x86/gart.h		patch \| blob \| blame \| history
include/asm-x86/iommu.h		patch \| blob \| blame \| history
include/asm-x86/mach-bigsmp/mach_apic.h		patch \| blob \| blame \| history
include/asm-x86/mach-default/mach_apic.h		patch \| blob \| blame \| history
include/asm-x86/mach-es7000/mach_apic.h		patch \| blob \| blame \| history
include/asm-x86/mach-generic/mach_mpspec.h		patch \| blob \| blame \| history
include/asm-x86/mach-summit/mach_apic.h		patch \| blob \| blame \| history
include/asm-x86/mach-visws/entry_arch.h	[deleted file]	patch \| blob \| blame \| history
include/asm-x86/mach-visws/mach_apic.h	[deleted file]	patch \| blob \| blame \| history
include/asm-x86/mach-visws/mach_apicdef.h	[deleted file]	patch \| blob \| blame \| history
include/asm-x86/mach-visws/setup_arch.h	[deleted file]	patch \| blob \| blame \| history
include/asm-x86/mach-visws/smpboot_hooks.h	[deleted file]	patch \| blob \| blame \| history
include/asm-x86/paravirt.h		patch \| blob \| blame \| history
include/asm-x86/percpu.h		patch \| blob \| blame \| history
include/asm-x86/pgtable.h		patch \| blob \| blame \| history
include/asm-x86/pgtable_32.h		patch \| blob \| blame \| history
include/asm-x86/pgtable_64.h		patch \| blob \| blame \| history
include/asm-x86/processor.h		patch \| blob \| blame \| history
include/asm-x86/ptrace-abi.h		patch \| blob \| blame \| history
include/asm-x86/segment.h		patch \| blob \| blame \| history
include/asm-x86/setup.h		patch \| blob \| blame \| history
include/asm-x86/smp.h		patch \| blob \| blame \| history
include/asm-x86/swiotlb.h		patch \| blob \| blame \| history
include/asm-x86/thread_info.h		patch \| blob \| blame \| history
include/asm-x86/traps.h	[new file with mode: 0644]	patch \| blob
include/asm-x86/uv/bios.h	[new file with mode: 0644]	patch \| blob
include/asm-x86/vdso.h		patch \| blob \| blame \| history
include/asm-x86/xen/hypercall.h		patch \| blob \| blame \| history
include/asm-x86/xen/interface.h		patch \| blob \| blame \| history
include/asm-x86/xen/interface_32.h	[new file with mode: 0644]	patch \| blob
include/asm-x86/xen/interface_64.h	[new file with mode: 0644]	patch \| blob
include/asm-x86/xen/page.h		patch \| blob \| blame \| history
include/linux/debugfs.h		patch \| blob \| blame \| history
include/xen/hvc-console.h		patch \| blob \| blame \| history
include/xen/interface/callback.h		patch \| blob \| blame \| history
include/xen/xen-ops.h		patch \| blob \| blame \| history
kernel/power/Kconfig		patch \| blob \| blame \| history