// IA-32 (x86) specific code
// Copyright (c) 2003 Jan Wassenberg
//
// This program is free software; you can redistribute it and/or
// modify it under the terms of the GNU General Public License as
// published by the Free Software Foundation; either version 2 of the
// License, or (at your option) any later version.
//
// This program is distributed in the hope that it will be useful, but
// WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
// General Public License for more details.
//
// Contact info:
//   Jan.Wassenberg@stud.uni-karlsruhe.de
//   http://www.stud.uni-karlsruhe.de/~urkt/

#include "precompiled.h"

#include "lib.h"
#include "posix.h"
#include "ia32.h"
#include "detect.h"
#include "timer.h"

// HACK (see call to wtime_reset_impl)
#if OS_WIN
#include "lib/sysdep/win/wtime.h"
#endif

#define NO_COLOR

#ifndef NO_COLOR
#include "graphics/Color.h"
#endif

#include <string.h>	// strcmp, strncmp, memmove
#include <stdio.h>	// sscanf
#include <vector>
#include <algorithm>	// std::sort

#if !HAVE_MS_ASM && !HAVE_GNU_ASM
#error ia32.cpp needs inline assembly support!
#endif

#define SELF_TEST_ENABLED 1
#include "self_test.h"


// set by ia32_init, referenced by ia32_memcpy (asm)
extern "C" u32 ia32_memcpy_size_mask = 0;

void ia32_init()
{
	ia32_asm_init();

	// memcpy init: set the mask that is applied to transfer size before
	// choosing the copy technique. this is the mechanism for disabling
	// codepaths that aren't supported on all CPUs; see article for details.
	// .. check for PREFETCHNTA and MOVNTQ support. these are part of the SSE
	//    instruction set, but also supported on older Athlons as part of
	//    the extended AMD MMX set.
	if(ia32_cap(SSE) || ia32_cap(AMD_MMX_EXT))
		ia32_memcpy_size_mask = ~0u;
}


//-----------------------------------------------------------------------------
// fast implementations of some sysdep.h functions; see documentation there
//-----------------------------------------------------------------------------

#if HAVE_MS_ASM

// note: declspec naked is significantly faster: it avoids redundant
// store/load, even though it prevents inlining.

// if this is used on 64-bit systems, [esp+4] will have to change
cassert(sizeof(int)*CHAR_BIT == 32);

__declspec(naked) float ia32_rintf(float)
{
	__asm fld	DWORD PTR [esp+4]
	__asm frndint
	__asm ret
}

__declspec(naked) double ia32_rint(double)
{
	__asm fld	QWORD PTR [esp+4]
	__asm frndint
	__asm ret
}

#endif	// HAVE_MS_ASM


#if USE_IA32_FLOAT_TO_INT	// implies HAVE_MS_ASM

// notes:
// - PTR is necessary because __declspec(naked) means the assembler
//   cannot refer to the parameter's type to deduce the operand size.
// - to conform with the fallback implementation (a C cast), we need to
//   end up with truncate/"chop" rounding. subtracting does the trick,
//   assuming RC is the IA-32 default round-to-nearest mode.

static const float round_bias = 0.4999999f;

__declspec(naked) i32 ia32_i32_from_float(float f)
{
	UNUSED2(f);
__asm{
	push	eax
	fld		DWORD PTR [esp+8]
	fsub	[round_bias]
	fistp	DWORD PTR [esp]
	pop		eax
	ret
}}

__declspec(naked) i32 ia32_i32_from_double(double d)
{
	UNUSED2(d);
__asm{
	push	eax
	fld		QWORD PTR [esp+8]
	fsub	[round_bias]
	fistp	DWORD PTR [esp]
	pop		eax
	ret
}}

__declspec(naked) i64 ia32_i64_from_double(double d)
{
	UNUSED2(d);
__asm{
	push	edx
	push	eax
	fld		QWORD PTR [esp+12]
	fsub	[round_bias]
	fistp	QWORD PTR [esp]
	pop		eax
	pop		edx
	ret
}}

#endif	// USE_IA32_FLOAT_TO_INT
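
// illustration: the round-bias trick above, restated in portable C++.
// under round-to-nearest, subtracting a value just below 0.5 before
// rounding yields truncation for non-negative inputs, which is what
// fsub/fistp accomplish. a minimal sketch with a hypothetical helper;
// assumes C++11 <cmath>; excluded from the build.
#if 0
#include <cmath>
#include <cassert>

static i32 i32_from_float_reference(float f)
{
	// std::nearbyint rounds to nearest (like fistp in the default RC mode);
	// the bias shifts e.g. 5.6 to ~5.1, which then rounds down to 5.
	return (i32)std::nearbyint(f - 0.4999999f);
}

static void illustrate_round_bias()
{
	assert(i32_from_float_reference(5.6f) == 5);		// truncated, not rounded
	assert(i32_from_float_reference(1.0f) == 1);		// exact values unaffected
	assert(i32_from_float_reference(0.99999f) == 0);
}
#endif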

//-----------------------------------------------------------------------------

// rationale: this function should return its output (instead of setting
// out params) to simplify its callers. it is written in inline asm
// (instead of moving to ia32.asm) to insulate it from changing compiler
// calling conventions.
// MSC, ICC and GCC currently return 64 bits in edx:eax, which even
// matches rdtsc output, but we play it safe and return a temporary.
u64 rdtsc()
{
	u64 c;
#if HAVE_MS_ASM
__asm
{
	cpuid
	rdtsc
	mov		dword ptr [c], eax
	mov		dword ptr [c+4], edx
}
#elif HAVE_GNU_ASM
	__asm__ __volatile__ (
		"cpuid; rdtsc"
		: "=A" (c)
		: /* no input */
		: "ebx", "ecx" /* cpuid clobbers ebx and ecx */);
#endif
	return c;
}


void ia32_debug_break()
{
#if HAVE_MS_ASM
	__asm int 3
// note: this probably isn't necessary, since unix_debug_break
// (SIGTRAP) is most likely available if HAVE_GNU_ASM.
// we include it for completeness, though.
#elif HAVE_GNU_ASM
	__asm__ __volatile__ ("int $3");
#endif
}


//-----------------------------------------------------------------------------
// support code for lock-free primitives
//-----------------------------------------------------------------------------

// enforce strong memory ordering.
void mfence()
{
	// Pentium IV
	if(ia32_cap(SSE2))
#if HAVE_MS_ASM
		__asm mfence
#elif HAVE_GNU_ASM
		__asm__ __volatile__ ("mfence" : : : "memory");
#endif
}

void serialize()
{
#if HAVE_MS_ASM
	__asm cpuid
#elif HAVE_GNU_ASM
	// cpuid overwrites all four GPRs; tell the compiler so.
	__asm__ __volatile__ ("cpuid" : : : "eax", "ebx", "ecx", "edx");
#endif
}


//-----------------------------------------------------------------------------
// CPU / feature detect
//-----------------------------------------------------------------------------

bool ia32_cap(CpuCap cap)
{
	// treated as 128 bit field; order: std ecx, std edx, ext ecx, ext edx
	// keep in sync with enum CpuCap!
	static u32 caps[4];

	ONCE(\
		u32 regs[4];\
		if(ia32_cpuid(1, regs))\
		{\
			caps[0] = regs[ECX];\
			caps[1] = regs[EDX];\
		}\
		if(ia32_cpuid(0x80000001, regs))\
		{\
			caps[2] = regs[ECX];\
			caps[3] = regs[EDX];\
		}\
	);

	const uint tbl_idx = cap >> 5;
	const uint bit_idx = cap & 0x1f;
	if(tbl_idx > 3)
	{
		debug_warn("cap invalid");
		return false;
	}
	return (caps[tbl_idx] & BIT(bit_idx)) != 0;
}


// we only store enum Vendor rather than the string because that
// is easier to compare.
enum Vendor { UNKNOWN, INTEL, AMD };
static Vendor vendor = UNKNOWN;

enum MiscCpuCapBits
{
	// AMD PowerNow! flags (returned in edx by CPUID 0x80000007)
	POWERNOW_FREQ_ID_CTRL = 2
};


static void get_cpu_vendor()
{
	u32 regs[4];
	if(!ia32_cpuid(0, regs))
		return;

	// copy regs to string
	// note: the 'strange' ebx,edx,ecx register order is due to ModR/M encoding order.
	char vendor_str[13];
	u32* vendor_str_u32 = (u32*)vendor_str;
	vendor_str_u32[0] = regs[EBX];
	vendor_str_u32[1] = regs[EDX];
	vendor_str_u32[2] = regs[ECX];
	vendor_str[12] = '\0';	// 0-terminate

	if(!strcmp(vendor_str, "AuthenticAMD"))
		vendor = AMD;
	else if(!strcmp(vendor_str, "GenuineIntel"))
		vendor = INTEL;
	else
		debug_warn("unknown vendor");
}
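
// illustration: why get_cpu_vendor above copies ebx, edx, ecx in that
// order - CPUID leaf 0 returns the 12-byte vendor string split across
// those registers ("Genu"/"ineI"/"ntel" for Intel, "Auth"/"enti"/"cAMD"
// for AMD). a minimal sketch with a hypothetical helper; excluded from
// the build.
#if 0
static void illustrate_vendor_string()
{
	u32 regs[4];
	if(!ia32_cpuid(0, regs))
		return;
	char s[13];
	memcpy(s+0, &regs[EBX], 4);	// "Genu" / "Auth"
	memcpy(s+4, &regs[EDX], 4);	// "ineI" / "enti"
	memcpy(s+8, &regs[ECX], 4);	// "ntel" / "cAMD"
	s[12] = '\0';
	debug_printf("CPU vendor: %s\n", s);
}
#endif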

static void get_cpu_type()
{
	// get processor signature
	u32 regs[4];
	if(!ia32_cpuid(1, regs))
		debug_warn("cpuid 1 failed");
	const uint model  = bits(regs[EAX], 4, 7);
	const uint family = bits(regs[EAX], 8, 11);

	// get brand string (if available)
	// note: ia32_cpuid writes 4 u32s directly to cpu_type -
	// be very careful with pointer arithmetic!
	u32* cpu_type_u32 = (u32*)cpu_type;
	bool have_brand_string = false;
	if(ia32_cpuid(0x80000002, cpu_type_u32+0) &&
	   ia32_cpuid(0x80000003, cpu_type_u32+4) &&
	   ia32_cpuid(0x80000004, cpu_type_u32+8))
		have_brand_string = true;

	// note: cpu_type is guaranteed to hold 48+1 chars, since that's the
	// length of the CPU brand string => we can safely copy short literals.
	// (this macro hides us from 'unsafe string code' searches)
#define SAFE_STRCPY str##cpy

	// fall back to manual detection of the CPU type because either:
	// - the CPU doesn't support the brand string (we use a flag to indicate
	//   this rather than comparing against a default value because it is safer);
	// - the brand string is useless, e.g. "Unknown". this happens on
	//   some older boards whose BIOS reprograms the string for CPUs it
	//   doesn't recognize.
	if(!have_brand_string || strncmp(cpu_type, "Unknow", 6) == 0)
	{
		if(vendor == AMD)
		{
			// everything else is either too old, or should have a brand string.
			if(family == 6)
			{
				if(model == 3 || model == 7)
					SAFE_STRCPY(cpu_type, "AMD Duron");
				else if(model <= 5)
					SAFE_STRCPY(cpu_type, "AMD Athlon");
				else
				{
					if(ia32_cap(AMD_MP))
						SAFE_STRCPY(cpu_type, "AMD Athlon MP");
					else
						SAFE_STRCPY(cpu_type, "AMD Athlon XP");
				}
			}
		}
		else if(vendor == INTEL)
		{
			// everything else is either too old, or should have a brand string.
			if(family == 6)
			{
				if(model == 1)
					SAFE_STRCPY(cpu_type, "Intel Pentium Pro");
				else if(model == 3 || model == 5)
					SAFE_STRCPY(cpu_type, "Intel Pentium II");
				else if(model == 6)
					SAFE_STRCPY(cpu_type, "Intel Celeron");
				else
					SAFE_STRCPY(cpu_type, "Intel Pentium III");
			}
		}
	}
	// cpu_type already holds a valid brand string; pretty it up.
	else
	{
		// strip (tm) from Athlon string
		if(!strncmp(cpu_type, "AMD Athlon(tm)", 14))
			memmove(cpu_type+10, cpu_type+14, 35);

		// remove 2x (R) and CPU freq from P4 string
		float freq;
		// we can't use this frequency, because it isn't necessarily correct -
		// the CPU may be overclocked. a variable must be passed, though, since
		// scanf returns the number of fields actually stored.
		if(sscanf(cpu_type, " Intel(R) Pentium(R) 4 CPU %fGHz", &freq) == 1)
			SAFE_STRCPY(cpu_type, "Intel Pentium 4");
	}
}


//-----------------------------------------------------------------------------

static uint log_id_bits;	// bit index; divides APIC ID into log and phys

static const uint INVALID_ID = ~0u;
static uint last_phys_id = INVALID_ID, last_log_id = INVALID_ID;
static uint phys_ids = 0, log_ids = 0;

// count # distinct physical and logical APIC IDs for get_cpu_count.
// called on each OS-visible "CPU" by on_each_cpu.
static void count_ids()
{
	// get APIC id
	u32 regs[4];
	if(!ia32_cpuid(1, regs))
		debug_warn("cpuid 1 failed");
	const uint id = bits(regs[EBX], 24, 31);

	// partition into physical and logical ID
	const uint phys_id = bits(id, 0, log_id_bits-1);
	const uint log_id  = bits(id, log_id_bits, 7);

	// note: APIC IDs are assigned sequentially, so we compare against the
	// last one encountered.
	if(last_phys_id != INVALID_ID && last_phys_id != phys_id)
		phys_ids++;
	if(last_log_id != INVALID_ID && last_log_id != log_id)
		log_ids++;
	last_phys_id = phys_id;
	last_log_id  = log_id;
}
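
// illustration: how bits() extracts CPUID fields, as get_cpu_type and
// count_ids above do. a minimal sketch with a hypothetical helper and
// signature value; excluded from the build.
#if 0
static void illustrate_signature_fields()
{
	const u32 eax = 0x00000673;	// hypothetical CPUID.1 EAX signature
	const uint stepping = bits(eax, 0, 3);	// -> 3
	const uint model    = bits(eax, 4, 7);	// -> 7
	const uint family   = bits(eax, 8, 11);	// -> 6
	// under the Intel table in get_cpu_type, family 6 / model 7 falls
	// through to "Intel Pentium III".
	debug_printf("family %u model %u stepping %u\n", family, model, stepping);
}
#endif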

// fix the CPU count reported by the OS (incorrect if HT is active or on
// multicore systems); also separate it into cpu_ht_units and cpu_cores.
static void get_cpu_count()
{
	debug_assert(cpus > 0 && "must know # 'CPU's (call OS-specific detect first)");

	// get # "logical CPUs" per package (uniform over all packages).
	// TFM is unclear but seems to imply this includes HT units *and* cores!
	u32 regs[4];
	if(!ia32_cpuid(1, regs))
		debug_warn("ia32_cpuid(1) failed");
	const uint log_cpu_per_package = bits(regs[EBX], 16, 23);
	// .. and # cores (CPUID.4 reports cores per package minus 1 in EAX[31:26])
	if(ia32_cpuid(4, regs))
		cpu_cores = bits(regs[EAX], 26, 31)+1;
	else
		cpu_cores = 1;

	// if HT is active (enabled in BIOS and OS), we have a problem:
	// OSes (Windows at least) report # CPUs as packages * cores * HT_units.
	// there is no direct way to determine if HT is actually enabled,
	// so if it is supported, we have to examine all APIC IDs and
	// figure out what kind of "CPU" each one is. *sigh*
	//
	// note: we don't check if it's Intel and P4 or above - HT may be
	// supported on other CPUs in future. all processors should set this
	// feature bit correctly, so it's not a problem.
	if(ia32_cap(HT))
	{
		log_id_bits = log2(log_cpu_per_package);	// see above
		last_phys_id = last_log_id = INVALID_ID;
		phys_ids = log_ids = 0;
		if(sys_on_each_cpu(count_ids) == 0)
		{
			cpus = phys_ids;
			cpu_ht_units = log_ids / cpu_cores;
			return;	// this is authoritative
		}
		// OS apparently doesn't support CPU affinity.
		// HT might be disabled, but return # units anyway.
		else
			cpu_ht_units = log_cpu_per_package / cpu_cores;
	}
	// not HT-capable; return 1 to allow total = cpus * HT_units * cores.
	else
		cpu_ht_units = 1;

	cpus /= cpu_cores;
}


static void check_for_speedstep()
{
	if(vendor == INTEL)
	{
		if(ia32_cap(EST))
			cpu_speedstep = 1;
	}
	else if(vendor == AMD)
	{
		u32 regs[4];
		if(ia32_cpuid(0x80000007, regs))
			if(regs[EDX] & POWERNOW_FREQ_ID_CTRL)
				cpu_speedstep = 1;
	}
}
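
// illustration: the quartile-trimmed averaging ("median filter") that
// measure_cpu_freq below applies to its frequency samples, shown in
// isolation. a minimal sketch with a hypothetical helper; excluded from
// the build.
#if 0
#include <numeric>

static double trimmed_mean(std::vector<double> samples)
{
	// sort, drop the lowest and highest quarter, average the middle half.
	std::sort(samples.begin(), samples.end());
	const size_t lo = samples.size()/4, hi = 3*samples.size()/4;
	const double sum = std::accumulate(samples.begin()+lo, samples.begin()+hi, 0.0);
	return sum / (hi - lo);
}
#endif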

static void measure_cpu_freq()
{
	// set max priority, to reduce interference while measuring.
	int old_policy;
	static sched_param old_param;	// (static => 0-init)
	pthread_getschedparam(pthread_self(), &old_policy, &old_param);
	static sched_param max_param;
	max_param.sched_priority = sched_get_priority_max(SCHED_FIFO);
	pthread_setschedparam(pthread_self(), SCHED_FIFO, &max_param);

	// make sure the TSC is available, because we're going to
	// measure actual CPU clocks per known time interval.
	// counting loop iterations ("bogomips") is unreliable.
	if(ia32_cap(TSC))
	{
		// note: no need to "warm up" cpuid - it will already have been
		// called several times by the time this code is reached.
		// (background: it's used in rdtsc() to serialize instruction flow;
		// the first call is documented to be slower on Intel CPUs)

		int num_samples = 16;
		// if the clock is low-res, take fewer samples so this doesn't take
		// too long. balances measuring time (~ 10 ms) against accuracy
		// (< 1‰ error - ok for using the TSC as a time reference).
		if(timer_res() >= 1e-3)
			num_samples = 8;
		std::vector<double> samples(num_samples);

		int i;
		for(i = 0; i < num_samples; i++)
		{
			double dt;
			i64 dc;	// i64 because VC6 can't convert u64 -> double,
					// and we don't need all 64 bits.

			// count # of clocks in max{1 tick, 1 ms}:
			// .. wait for start of tick.
			const double t0 = get_time();
			u64 c1; double t1;
			do
			{
				// note: get_time effectively has a long delay (up to 5 us)
				// before returning the time. we call it before rdtsc to
				// minimize the delay between actually sampling time / TSC,
				// thus decreasing the chance for interference.
				// (if unavoidable background activity, e.g. interrupts,
				// delays the second reading, inaccuracy is introduced).
				t1 = get_time();
				c1 = rdtsc();
			}
			while(t1 == t0);
			// .. wait until start of next tick and at least 1 ms elapsed.
			do
			{
				const double t2 = get_time();
				const u64 c2 = rdtsc();
				dc = (i64)(c2 - c1);
				dt = t2 - t1;
			}
			while(dt < 1e-3);

			// .. freq = (delta_clocks) / (delta_seconds);
			//    cpuid/rdtsc/timer overhead is negligible.
			const double freq = dc / dt;
			samples[i] = freq;
		}

		std::sort(samples.begin(), samples.end());

		// median filter (remove upper and lower 25% and average the rest).
		// note: don't just take the lowest value! it could conceivably be
		// too low, if background processing delays reading c1 (see above).
		double sum = 0.0;
		const int lo = num_samples/4, hi = 3*num_samples/4;
		for(i = lo; i < hi; i++)
			sum += samples[i];
		cpu_freq = sum / (hi-lo);
	}
	// else: TSC not available, can't measure; cpu_freq remains unchanged.

	// restore previous policy and priority.
	pthread_setschedparam(pthread_self(), old_policy, &old_param);
}


void ia32_get_cpu_info()
{
	get_cpu_vendor();
	get_cpu_type();
	get_cpu_count();
	check_for_speedstep();
	measure_cpu_freq();

	// HACK: on Windows, the HRT makes its final implementation choice
	// in the first calibrate call where cpu info is available.
	// call wtime_reset_impl here to have that happen now,
	// so app code isn't surprised by a timer change, although the HRT
	// does try to keep the timer continuous.
#if OS_WIN
	wtime_reset_impl();
#endif
}


//-----------------------------------------------------------------------------

// checks if there is an IA-32 CALL instruction right before ret_addr.
// returns ERR_OK if so and ERR_FAIL if not.
// also attempts to determine the call target. if that is possible
// (directly addressed relative or indirect jumps), it is stored in
// target, which is otherwise 0.
//
// this is useful for walking the stack manually.
LibError ia32_get_call_target(void* ret_addr, void** target)
{
	*target = 0;

	// points to end of the CALL instruction (which is of unknown length)
	const u8* c = (const u8*)ret_addr;
	// this would allow for avoiding exceptions when accessing ret_addr
	// close to the beginning of the code segment. it's not currently set
	// because this is really unlikely and not worth the trouble.
	const size_t len = ~0u;

	// CALL rel32 (E8 cd)
	if(len >= 5 && c[-5] == 0xE8)
	{
		*target = (u8*)ret_addr + *(i32*)(c-4);
		return ERR_OK;
	}

	// CALL r/m32 (FF /2)
	// .. CALL [r32 + r32*s]          => FF 14 SIB
	if(len >= 3 && c[-3] == 0xFF && c[-2] == 0x14)
		return ERR_OK;
	// .. CALL [disp32]               => FF 15 disp32
	if(len >= 6 && c[-6] == 0xFF && c[-5] == 0x15)
	{
		void* addr_of_target = *(void**)(c-4);
		if(!debug_is_pointer_bogus(addr_of_target))
		{
			*target = *(void**)addr_of_target;
			return ERR_OK;
		}
	}
	// .. CALL [r32]                  => FF 00-3F(!14/15)
	if(len >= 2 && c[-2] == 0xFF && c[-1] < 0x40 && c[-1] != 0x14 && c[-1] != 0x15)
		return ERR_OK;
	// .. CALL [r32 + r32*s + disp8]  => FF 54 SIB disp8
	if(len >= 4 && c[-4] == 0xFF && c[-3] == 0x54)
		return ERR_OK;
	// .. CALL [r32 + disp8]          => FF 50-57(!54) disp8
	if(len >= 3 && c[-3] == 0xFF && (c[-2] & 0xF8) == 0x50 && c[-2] != 0x54)
		return ERR_OK;
	// .. CALL [r32 + r32*s + disp32] => FF 94 SIB disp32
	if(len >= 7 && c[-7] == 0xFF && c[-6] == 0x94)
		return ERR_OK;
	// .. CALL [r32 + disp32]         => FF 90-97(!94) disp32
	if(len >= 6 && c[-6] == 0xFF && (c[-5] & 0xF8) == 0x90 && c[-5] != 0x94)
		return ERR_OK;
	// .. CALL r32                    => FF D0-D7
	if(len >= 2 && c[-2] == 0xFF && (c[-1] & 0xF8) == 0xD0)
		return ERR_OK;

	return ERR_FAIL;
}
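
// illustration: the CALL rel32 case handled first above. the E8 opcode is
// followed by a 32-bit displacement relative to the address of the *next*
// instruction, i.e. the return address. a minimal sketch with a
// hypothetical hand-encoded buffer; excluded from the build.
#if 0
static void illustrate_call_rel32()
{
	// hand-encoded CALL with displacement 0x10: E8 10 00 00 00
	const u8 code[] = { 0xE8, 0x10, 0x00, 0x00, 0x00 };
	const u8* ret_addr = code + 5;	// points just past the CALL
	const i32 disp = *(const i32*)(ret_addr - 4);
	const u8* call_target = ret_addr + disp;	// = code + 5 + 0x10
	debug_assert(call_target == code + 0x15);
}
#endif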

//-----------------------------------------------------------------------------

#ifndef NO_COLOR
// Assembler-optimized function for color conversion
extern "C" {
u32 sse_ConvertRGBColorTo4ub(const RGBColor& src);
}
#endif

void ia32_hook_capabilities()
{
#ifndef NO_COLOR
	if(ia32_cap(SSE))
	{
		ConvertRGBColorTo4ub = sse_ConvertRGBColorTo4ub;
	}
	else
	{
		debug_printf("No SSE available. Slow fallback routines will be used.\n");
	}
#endif
}


//----------------------------------------------------------------------------
// built-in self test
//----------------------------------------------------------------------------

#if SELF_TEST_ENABLED
namespace test {

static void test_float_int()
{
	TEST(i32_from_float(0.99999f) == 0);
	TEST(i32_from_float(1.0f) == 1);
	TEST(i32_from_float(1.01f) == 1);
	TEST(i32_from_float(5.6f) == 5);

	TEST(i32_from_double(0.99999) == 0);
	TEST(i32_from_double(1.0) == 1);
	TEST(i32_from_double(1.01) == 1);
	TEST(i32_from_double(5.6) == 5);

	TEST(i64_from_double(0.99999) == 0ll);
	TEST(i64_from_double(1.0) == 1ll);
	TEST(i64_from_double(1.01) == 1ll);
	TEST(i64_from_double(5.6) == 5ll);
}

static void self_test()
{
	test_float_int();
}

SELF_TEST_RUN;

}	// namespace test
#endif	// #if SELF_TEST_ENABLED
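
// illustration: the capability-hook pattern used by ia32_hook_capabilities
// above - a function pointer defaults to a portable routine and is swapped
// for an optimized variant once the CPU feature is confirmed. a minimal
// sketch with hypothetical names; excluded from the build.
#if 0
typedef u32 (*ConvertFunc)(u32);
static u32 convert_generic(u32 x) { return x; }	// portable fallback
static u32 convert_sse(u32 x)     { return x; }	// stand-in for the asm version
static ConvertFunc convert = convert_generic;

static void hook_example()
{
	if(ia32_cap(SSE))
		convert = convert_sse;	// callers transparently get the fast path
}
#endif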