| author    | Andi Kleen <ak@muc.de>                   | 2003-04-29 17:32:05 -0700 |
|-----------|------------------------------------------|---------------------------|
| committer | Christoph Hellwig <hch@lst.de>           | 2003-04-29 17:32:05 -0700 |
| commit    | a31a4dea7f948c13e24f84fa310cec3814401dfd |                           |
| tree      | c5b0cd5f19baa1bf1e3e512afc217b27e3d71ff7 |                           |
| parent    | 946f68b907b7637253a2ed7636ed5e9d55a93e1b |                           |
[PATCH] Update alt_instr to handle SSE2 prefetch and better nops
| -rw-r--r-- | arch/i386/Kconfig             | 19 |
| -rw-r--r-- | arch/i386/kernel/cpu/amd.c    |  9 |
| -rw-r--r-- | arch/i386/kernel/cpu/intel.c  |  5 |
| -rw-r--r-- | arch/i386/kernel/setup.c      | 88 |
| -rw-r--r-- | arch/i386/vmlinux.lds.S       |  1 |
| -rw-r--r-- | include/asm-i386/cpufeature.h |  5 |
| -rw-r--r-- | include/asm-i386/processor.h  | 88 |
| -rw-r--r-- | include/asm-i386/system.h     | 40 |

8 files changed, 207 insertions(+), 48 deletions(-)
```diff
diff --git a/arch/i386/Kconfig b/arch/i386/Kconfig
index 234e92cbf4ba..729fab09cd2b 100644
--- a/arch/i386/Kconfig
+++ b/arch/i386/Kconfig
@@ -273,6 +273,13 @@ config MVIAC3_2
 
 endchoice
 
+config X86_GENERIC
+	bool "Generic x86 support"
+	help
+	  Include some tuning for non-selected x86 CPUs too, where it
+	  has moderate overhead. This is intended for generic
+	  distribution kernels.
+
 #
 # Define implied options from the CPU selection here
 #
@@ -288,10 +295,10 @@ config X86_XADD
 
 config X86_L1_CACHE_SHIFT
 	int
+	default "7" if MPENTIUM4 || X86_GENERIC
 	default "4" if MELAN || M486 || M386
 	default "5" if MWINCHIP3D || MWINCHIP2 || MWINCHIPC6 || MCRUSOE || MCYRIXIII || MK6 || MPENTIUMIII || MPENTIUMII || M686 || M586MMX || M586TSC || M586 || MVIAC3_2
 	default "6" if MK7 || MK8
-	default "7" if MPENTIUM4
 
 config RWSEM_GENERIC_SPINLOCK
 	bool
@@ -363,16 +370,6 @@ config X86_OOSTORE
 	depends on MWINCHIP3D || MWINCHIP2 || MWINCHIPC6
 	default y
 
-config X86_PREFETCH
-	bool
-	depends on MPENTIUMIII || MPENTIUM4 || MVIAC3_2
-	default y
-
-config X86_SSE2
-	bool
-	depends on MK8 || MPENTIUM4
-	default y
-
 config HUGETLB_PAGE
 	bool "Huge TLB Page Support"
 	help
diff --git a/arch/i386/kernel/cpu/amd.c b/arch/i386/kernel/cpu/amd.c
index ff810509b8fe..091b98ae93b6 100644
--- a/arch/i386/kernel/cpu/amd.c
+++ b/arch/i386/kernel/cpu/amd.c
@@ -178,6 +178,15 @@ static void __init init_amd(struct cpuinfo_x86 *c)
 			break;
 	}
 
+	switch (c->x86) {
+	case 15:
+		set_bit(X86_FEATURE_K8, c->x86_capability);
+		break;
+	case 6:
+		set_bit(X86_FEATURE_K7, c->x86_capability);
+		break;
+	}
+
 	display_cacheinfo(c);
 }
 
diff --git a/arch/i386/kernel/cpu/intel.c b/arch/i386/kernel/cpu/intel.c
index 69aa8304f797..1736d1a2115b 100644
--- a/arch/i386/kernel/cpu/intel.c
+++ b/arch/i386/kernel/cpu/intel.c
@@ -353,6 +353,11 @@ too_many_siblings:
 		break;
 	}
 #endif
+
+	if (c->x86 == 15)
+		set_bit(X86_FEATURE_P4, c->x86_capability);
+	if (c->x86 == 6)
+		set_bit(X86_FEATURE_P3, c->x86_capability);
 }
 
diff --git a/arch/i386/kernel/setup.c b/arch/i386/kernel/setup.c
index 9c20d88208ae..26efeeb167c2 100644
--- a/arch/i386/kernel/setup.c
+++ b/arch/i386/kernel/setup.c
@@ -795,41 +795,91 @@ static void __init register_memory(unsigned long max_low_pfn)
 	pci_mem_start = low_mem_size;
 }
 
+/* Use inline assembly to define this because the nops are defined
+   as inline assembly strings in the include files and we cannot
+   get them easily into strings. */
+asm("intelnops: "
+	GENERIC_NOP1 GENERIC_NOP2 GENERIC_NOP3 GENERIC_NOP4 GENERIC_NOP5 GENERIC_NOP6
+	GENERIC_NOP7 GENERIC_NOP8);
+asm("k8nops: "
+	K8_NOP1 K8_NOP2 K8_NOP3 K8_NOP4 K8_NOP5 K8_NOP6
+	K8_NOP7 K8_NOP8);
+asm("k7nops: "
+	K7_NOP1 K7_NOP2 K7_NOP3 K7_NOP4 K7_NOP5 K7_NOP6
+	K7_NOP7 K7_NOP8);
+
+extern unsigned char intelnops[], k8nops[], k7nops[];
+static unsigned char *intel_nops[ASM_NOP_MAX+1] = {
+	NULL,
+	intelnops,
+	intelnops + 1,
+	intelnops + 1 + 2,
+	intelnops + 1 + 2 + 3,
+	intelnops + 1 + 2 + 3 + 4,
+	intelnops + 1 + 2 + 3 + 4 + 5,
+	intelnops + 1 + 2 + 3 + 4 + 5 + 6,
+	intelnops + 1 + 2 + 3 + 4 + 5 + 6 + 7,
+};
+static unsigned char *k8_nops[ASM_NOP_MAX+1] = {
+	NULL,
+	k8nops,
+	k8nops + 1,
+	k8nops + 1 + 2,
+	k8nops + 1 + 2 + 3,
+	k8nops + 1 + 2 + 3 + 4,
+	k8nops + 1 + 2 + 3 + 4 + 5,
+	k8nops + 1 + 2 + 3 + 4 + 5 + 6,
+	k8nops + 1 + 2 + 3 + 4 + 5 + 6 + 7,
+};
+static unsigned char *k7_nops[ASM_NOP_MAX+1] = {
+	NULL,
+	k7nops,
+	k7nops + 1,
+	k7nops + 1 + 2,
+	k7nops + 1 + 2 + 3,
+	k7nops + 1 + 2 + 3 + 4,
+	k7nops + 1 + 2 + 3 + 4 + 5,
+	k7nops + 1 + 2 + 3 + 4 + 5 + 6,
+	k7nops + 1 + 2 + 3 + 4 + 5 + 6 + 7,
+};
+static struct nop {
+	int cpuid;
+	unsigned char **noptable;
+} noptypes[] = {
+	{ X86_FEATURE_K8, k8_nops },
+	{ X86_FEATURE_K7, k7_nops },
+	{ -1, 0 }
+};
+
 /* Replace instructions with better alternatives for this CPU type.
 
    This runs before SMP is initialized to avoid SMP problems with
    self modifying code. This implies that asymmetric systems where
    APs have less capabilities than the boot processor are not handled.
-   In this case boot with "noreplacement".
  */
 void apply_alternatives(void *start, void *end)
 {
 	struct alt_instr *a;
 	int diff, i, k;
-
-	for (a = start; a < (struct alt_instr *)end;
-	     a = (void *)ALIGN((unsigned long)(a + 1) + a->instrlen, 4)) {
+	unsigned char **noptable = intel_nops;
+	for (i = 0; noptypes[i].cpuid >= 0; i++) {
+		if (boot_cpu_has(noptypes[i].cpuid)) {
+			noptable = noptypes[i].noptable;
+			break;
+		}
+	}
+	for (a = start; (void *)a < end; a++) {
 		if (!boot_cpu_has(a->cpuid))
 			continue;
 		BUG_ON(a->replacementlen > a->instrlen);
 		memcpy(a->instr, a->replacement, a->replacementlen);
 		diff = a->instrlen - a->replacementlen;
+		/* Pad the rest with nops */
 		for (i = a->replacementlen; diff > 0; diff -= k, i += k) {
-			static const char *nops[] = {
-				0,
-				"\x90",
-#if CONFIG_MK7 || CONFIG_MK8
-				"\x66\x90",
-				"\x66\x66\x90",
-				"\x66\x66\x66\x90",
-#else
-				"\x89\xf6",
-				"\x8d\x76\x00",
-				"\x8d\x74\x26\x00",
-#endif
-			};
-			k = min_t(int, diff, ARRAY_SIZE(nops));
-			memcpy(a->instr + i, nops[k], k);
+			k = diff;
+			if (k > ASM_NOP_MAX)
+				k = ASM_NOP_MAX;
+			memcpy(a->instr + i, noptable[k], k);
 		}
 	}
 }
diff --git a/arch/i386/vmlinux.lds.S b/arch/i386/vmlinux.lds.S
index b6a3f2de1bcf..56401363b4f6 100644
--- a/arch/i386/vmlinux.lds.S
+++ b/arch/i386/vmlinux.lds.S
@@ -85,6 +85,7 @@ SECTIONS
   __alt_instructions = .;
   .altinstructions : { *(.altinstructions) }
   __alt_instructions_end = .;
+  .altinstr_replacement : { *(.altinstr_replacement) }
   . = ALIGN(4096);
   __initramfs_start = .;
   .init.ramfs : { *(.init.ramfs) }
diff --git a/include/asm-i386/cpufeature.h b/include/asm-i386/cpufeature.h
index 46a85b395db8..040e1f66ea48 100644
--- a/include/asm-i386/cpufeature.h
+++ b/include/asm-i386/cpufeature.h
@@ -63,6 +63,11 @@
 #define X86_FEATURE_K6_MTRR	(3*32+ 1) /* AMD K6 nonstandard MTRRs */
 #define X86_FEATURE_CYRIX_ARR	(3*32+ 2) /* Cyrix ARRs (= MTRRs) */
 #define X86_FEATURE_CENTAUR_MCR	(3*32+ 3) /* Centaur MCRs (= MTRRs) */
+/* cpu types for specific tunings: */
+#define X86_FEATURE_K8		(3*32+ 4) /* Opteron, Athlon64 */
+#define X86_FEATURE_K7		(3*32+ 5) /* Athlon */
+#define X86_FEATURE_P3		(3*32+ 6) /* P3 */
+#define X86_FEATURE_P4		(3*32+ 7) /* P4 */
 
 /* Intel-defined CPU features, CPUID level 0x00000001 (ecx), word 4 */
 
diff --git a/include/asm-i386/processor.h b/include/asm-i386/processor.h
index bc47e152108d..d69cc46d5866 100644
--- a/include/asm-i386/processor.h
+++ b/include/asm-i386/processor.h
@@ -15,6 +15,7 @@
 #include <asm/sigcontext.h>
 #include <asm/cpufeature.h>
 #include <asm/msr.h>
+#include <asm/system.h>
 #include <linux/cache.h>
 #include <linux/config.h>
 #include <linux/threads.h>
@@ -495,32 +496,93 @@ static inline void rep_nop(void)
 
 #define cpu_relax()	rep_nop()
 
-/* Prefetch instructions for Pentium III and AMD Athlon */
-#ifdef CONFIG_X86_PREFETCH
+/* generic versions from gas */
+#define GENERIC_NOP1	".byte 0x90\n"
+#define GENERIC_NOP2	".byte 0x89,0xf6\n"
+#define GENERIC_NOP3	".byte 0x8d,0x76,0x00\n"
+#define GENERIC_NOP4	".byte 0x8d,0x74,0x26,0x00\n"
+#define GENERIC_NOP5	GENERIC_NOP1 GENERIC_NOP4
+#define GENERIC_NOP6	".byte 0x8d,0xb6,0x00,0x00,0x00,0x00\n"
+#define GENERIC_NOP7	".byte 0x8d,0xb4,0x26,0x00,0x00,0x00,0x00\n"
+#define GENERIC_NOP8	GENERIC_NOP1 GENERIC_NOP7
+
+/* Opteron nops */
+#define K8_NOP1	GENERIC_NOP1
+#define K8_NOP2	".byte 0x66,0x90\n"
+#define K8_NOP3	".byte 0x66,0x66,0x90\n"
+#define K8_NOP4	".byte 0x66,0x66,0x66,0x90\n"
+#define K8_NOP5	K8_NOP3 K8_NOP2
+#define K8_NOP6	K8_NOP3 K8_NOP3
+#define K8_NOP7	K8_NOP4 K8_NOP3
+#define K8_NOP8	K8_NOP4 K8_NOP4
+
+/* K7 nops */
+/* uses eax dependencies (arbitrary choice) */
+#define K7_NOP1	GENERIC_NOP1
+#define K7_NOP2	".byte 0x8b,0xc0\n"
+#define K7_NOP3	".byte 0x8d,0x04,0x20\n"
+#define K7_NOP4	".byte 0x8d,0x44,0x20,0x00\n"
+#define K7_NOP5	K7_NOP4 ASM_NOP1
+#define K7_NOP6	".byte 0x8d,0x80,0,0,0,0\n"
+#define K7_NOP7	".byte 0x8D,0x04,0x05,0,0,0,0\n"
+#define K7_NOP8	K7_NOP7 ASM_NOP1
+
+#ifdef CONFIG_MK8
+#define ASM_NOP1 K8_NOP1
+#define ASM_NOP2 K8_NOP2
+#define ASM_NOP3 K8_NOP3
+#define ASM_NOP4 K8_NOP4
+#define ASM_NOP5 K8_NOP5
+#define ASM_NOP6 K8_NOP6
+#define ASM_NOP7 K8_NOP7
+#define ASM_NOP8 K8_NOP8
+#elif CONFIG_MK7
+#define ASM_NOP1 K7_NOP1
+#define ASM_NOP2 K7_NOP2
+#define ASM_NOP3 K7_NOP3
+#define ASM_NOP4 K7_NOP4
+#define ASM_NOP5 K7_NOP5
+#define ASM_NOP6 K7_NOP6
+#define ASM_NOP7 K7_NOP7
+#define ASM_NOP8 K7_NOP8
+#else
+#define ASM_NOP1 GENERIC_NOP1
+#define ASM_NOP2 GENERIC_NOP2
+#define ASM_NOP3 GENERIC_NOP3
+#define ASM_NOP4 GENERIC_NOP4
+#define ASM_NOP5 GENERIC_NOP5
+#define ASM_NOP6 GENERIC_NOP6
+#define ASM_NOP7 GENERIC_NOP7
+#define ASM_NOP8 GENERIC_NOP8
+#endif
+#define ASM_NOP_MAX 8
+
+/* Prefetch instructions for Pentium III and AMD Athlon */
+/* It's not worth caring about 3dnow! prefetches for the K6
+   because they are microcoded there and very slow. */
 #define ARCH_HAS_PREFETCH
 extern inline void prefetch(const void *x)
 {
-	__asm__ __volatile__ ("prefetchnta (%0)" : : "r"(x));
+	alternative_input(ASM_NOP3,
+			  "prefetchnta (%1)",
+			  X86_FEATURE_XMM,
+			  "r" (x));
 }
 
-#elif defined CONFIG_X86_USE_3DNOW
-
 #define ARCH_HAS_PREFETCH
 #define ARCH_HAS_PREFETCHW
 #define ARCH_HAS_SPINLOCK_PREFETCH
 
-extern inline void prefetch(const void *x)
-{
-	__asm__ __volatile__ ("prefetch (%0)" : : "r"(x));
-}
-
+/* 3dnow! prefetch to get an exclusive cache line. Useful for
+   spinlocks, to avoid one state transition in the cache coherency protocol. */
 extern inline void prefetchw(const void *x)
 {
-	__asm__ __volatile__ ("prefetchw (%0)" : : "r"(x));
+	alternative_input(ASM_NOP3,
+			  "prefetchw (%1)",
+			  X86_FEATURE_3DNOW,
+			  "r" (x));
 }
 #define spin_lock_prefetch(x)	prefetchw(x)
 
-#endif
-
 #endif /* __ASM_I386_PROCESSOR_H */
diff --git a/include/asm-i386/system.h b/include/asm-i386/system.h
index 5831f6d34ad7..71b9dd77ed04 100644
--- a/include/asm-i386/system.h
+++ b/include/asm-i386/system.h
@@ -277,13 +277,16 @@ static inline unsigned long __cmpxchg(volatile void *ptr, unsigned long old,
 /* Compiling for a 386 proper. Is it worth implementing via cli/sti? */
 #endif
 
+#ifdef __KERNEL__
 struct alt_instr {
-	u8 *instr;		/* original instruction */
-	u8 cpuid;		/* cpuid bit set for replacement */
-	u8 instrlen;		/* length of original instruction */
-	u8 replacementlen;	/* length of new instruction, <= instrlen */
-	u8 replacement[0];	/* new instruction */
+	__u8 *instr;		/* original instruction */
+	__u8 *replacement;
+	__u8 cpuid;		/* cpuid bit set for replacement */
+	__u8 instrlen;		/* length of original instruction */
+	__u8 replacementlen;	/* length of new instruction, <= instrlen */
+	__u8 pad;
 };
+#endif
 
 /*
  * Alternative instructions for different CPU types or capabilities.
@@ -302,13 +305,40 @@ struct alt_instr {
 	".section .altinstructions,\"a\"\n"			\
 	"  .align 4\n"						\
 	"  .long 661b\n"		/* label */		\
+	"  .long 663f\n"		/* new instruction */	\
 	"  .byte %c0\n"			/* feature bit */	\
 	"  .byte 662b-661b\n"		/* sourcelen */		\
 	"  .byte 664f-663f\n"		/* replacementlen */	\
+	".previous\n"						\
+	".section .altinstr_replacement,\"ax\"\n"		\
 	"663:\n\t" newinstr "\n664:\n"	/* replacement */	\
 	".previous" :: "i" (feature) : "memory")
 
 /*
+ * Alternative inline assembly with input.
+ *
+ * Peculiarities:
+ *   No memory clobber here.
+ *   Argument numbers start with 1.
+ *   Best is to use constraints that are fixed size (like (%1) ... "r")
+ *   If you use variable sized constraints like "m" or "g" in the
+ *   replacement, make sure to pad to the worst case length.
+ */
+#define alternative_input(oldinstr, newinstr, feature, input)		\
+	asm volatile ("661:\n\t" oldinstr "\n662:\n"			\
+		".section .altinstructions,\"a\"\n"			\
+		"  .align 4\n"						\
+		"  .long 661b\n"		/* label */		\
+		"  .long 663f\n"		/* new instruction */	\
+		"  .byte %c0\n"			/* feature bit */	\
+		"  .byte 662b-661b\n"		/* sourcelen */		\
+		"  .byte 664f-663f\n"		/* replacementlen */	\
+		".previous\n"						\
+		".section .altinstr_replacement,\"ax\"\n"		\
+		"663:\n\t" newinstr "\n664:\n"	/* replacement */	\
+		".previous" :: "i" (feature), input)
+
+/*
+ * Force strict CPU ordering.
+ * And yes, this is required on UP too when we're talking
+ * to devices.
```
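The padding loop in apply_alternatives() above is what the new per-CPU nop tables feed: when a replacement is shorter than the original instruction, the remaining bytes are filled with the longest available nops instead of a run of single-byte 0x90s. Below is a standalone userspace sketch of that loop (not part of the patch; the function and variable names here are made up for illustration), using the K8 nop encodings from processor.h:

```c
#include <stdio.h>
#include <string.h>

#define ASM_NOP_MAX 8

/* K8 nop sequences laid out back to back, mirroring the
   asm("k8nops: " K8_NOP1 ... K8_NOP8) blob in setup.c. */
static const unsigned char k8nops[] = {
	0x90,                                     /* K8_NOP1 */
	0x66,0x90,                                /* K8_NOP2 */
	0x66,0x66,0x90,                           /* K8_NOP3 */
	0x66,0x66,0x66,0x90,                      /* K8_NOP4 */
	0x66,0x66,0x90,0x66,0x90,                 /* K8_NOP5 = NOP3 NOP2 */
	0x66,0x66,0x90,0x66,0x66,0x90,            /* K8_NOP6 = NOP3 NOP3 */
	0x66,0x66,0x66,0x90,0x66,0x66,0x90,       /* K8_NOP7 = NOP4 NOP3 */
	0x66,0x66,0x66,0x90,0x66,0x66,0x66,0x90,  /* K8_NOP8 = NOP4 NOP4 */
};

/* noptable[k] points at a k-byte nop, like k8_nops[] in setup.c. */
static const unsigned char *noptable[ASM_NOP_MAX + 1] = {
	NULL,
	k8nops,
	k8nops + 1,
	k8nops + 1 + 2,
	k8nops + 1 + 2 + 3,
	k8nops + 1 + 2 + 3 + 4,
	k8nops + 1 + 2 + 3 + 4 + 5,
	k8nops + 1 + 2 + 3 + 4 + 5 + 6,
	k8nops + 1 + 2 + 3 + 4 + 5 + 6 + 7,
};

/* Same shape as the inner loop of apply_alternatives(): fill `diff`
   bytes at instr + i, taking at most ASM_NOP_MAX bytes per nop. */
static void pad_with_nops(unsigned char *instr, int i, int diff)
{
	int k;

	for (; diff > 0; diff -= k, i += k) {
		k = diff;
		if (k > ASM_NOP_MAX)
			k = ASM_NOP_MAX;
		memcpy(instr + i, noptable[k], k);
	}
}

int main(void)
{
	unsigned char buf[11];
	int j;

	/* An 11-byte hole becomes one 8-byte nop plus one 3-byte nop. */
	pad_with_nops(buf, 0, sizeof(buf));
	for (j = 0; j < (int)sizeof(buf); j++)
		printf("%02x ", buf[j]);
	printf("\n");
	return 0;
}
```

Two long nops instead of eleven one-byte ones is the point of the exercise: the decoder sees two instructions rather than eleven.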

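The other half of the patch is the fixed-size alt_instr record with an out-of-line replacement pointer. The sketch below simulates, in userspace, what apply_alternatives() does with such a table at boot. It is illustrative only: boot_cpu_has() and the record construction (done in the real kernel by the alternative() macros via the .altinstructions and .altinstr_replacement sections) are faked in plain C, and the instruction encodings are hand-written for the example.

```c
#include <stdio.h>
#include <string.h>

typedef unsigned char u8;

/* Fixed-size record matching the patched include/asm-i386/system.h.
   The replacement bytes now live out of line (in .altinstr_replacement
   in the real kernel) instead of trailing the record as replacement[0],
   which is why the table walk can simply be `a++`. */
struct alt_instr {
	u8 *instr;              /* original instruction */
	u8 *replacement;
	u8 cpuid;               /* feature bit that must be set */
	u8 instrlen;
	u8 replacementlen;      /* <= instrlen */
	u8 pad;
};

/* Fake capability word; bit 25 stands in for X86_FEATURE_XMM (SSE). */
static unsigned long capability = 1UL << 25;

static int fake_boot_cpu_has(int bit)
{
	return (capability >> bit) & 1;
}

/* Three bytes of "kernel text": GENERIC_NOP3, the placeholder that
   prefetch() in processor.h emits as its oldinstr. */
static u8 text[] = { 0x8d, 0x76, 0x00 };

/* prefetchnta (%eax): 0f 18 00.  Same length, so no nop padding. */
static u8 repl[] = { 0x0f, 0x18, 0x00 };

static struct alt_instr table[] = {
	{ text, repl, 25, sizeof(text), sizeof(repl), 0 },
};

static void apply(struct alt_instr *start, struct alt_instr *end)
{
	struct alt_instr *a;

	for (a = start; a < end; a++) {
		if (!fake_boot_cpu_has(a->cpuid))
			continue;
		memcpy(a->instr, a->replacement, a->replacementlen);
		/* the real code then nop-pads instrlen - replacementlen bytes */
	}
}

int main(void)
{
	int i;

	apply(table, table + sizeof(table) / sizeof(table[0]));
	for (i = 0; i < (int)sizeof(text); i++)
		printf("%02x ", text[i]);   /* prints: 0f 18 00 */
	printf("\n");
	return 0;
}
```

This also shows what the extra `.long 663f` in the macros buys: with the replacement stored as a pointer rather than as inline trailing bytes, every record has the same size, so the old ALIGN()-based walk over variable-length entries disappears.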