From 5d80d2ee5d549e809332bf242270c678f6fd68e5 Mon Sep 17 00:00:00 2001 From: janwas Date: Sun, 1 Jun 2008 08:25:12 +0000 Subject: [PATCH] fixes+improvements from work: - add AlignedAllocator - an STL allocator that returns cache-line-aligned objects (required to avoid RFOs when threads write to various independent items in a container) - bits: bit_mask can now be used for N=0..numBits (works around full-word-shifts-are-undefined issue) - precompiled.h: remove scoped_ptr, add function-related stuff from TR1 - numa: . add numa_IsMemoryInterleaved . numa_Allocate is now able to allocate large pages as well (reduces TLB misses) - os_cpu: interface change to support 32-bit apps running on WoW64 systems with > 4 GB of memory - topology: use new x86_x64_EnumerateCaches API; fix detection of cache ID - x86_x64: provide the means of enumerating all caches returned by CPUID and detect L1 cache size This was SVN commit r6004. --- source/lib/allocators/aligned_allocator.cpp | 12 ++ source/lib/allocators/aligned_allocator.h | 130 ++++++++++++++++++++ source/lib/bits.h | 11 +- source/lib/precompiled.h | 11 +- source/lib/rand.h | 2 +- source/lib/sysdep/linux/lcpu.cpp | 8 +- source/lib/sysdep/numa.h | 31 +++-- source/lib/sysdep/os_cpu.h | 4 +- source/lib/sysdep/osx/ocpu.cpp | 2 + source/lib/sysdep/win/wcpu.cpp | 16 +-- source/lib/sysdep/win/wnuma.cpp | 128 +++++++++++-------- source/lib/sysdep/x86_x64/topology.cpp | 93 +++++++------- source/lib/sysdep/x86_x64/x86_x64.cpp | 57 +++++++++ source/lib/sysdep/x86_x64/x86_x64.h | 34 +++++ 14 files changed, 411 insertions(+), 128 deletions(-) create mode 100644 source/lib/allocators/aligned_allocator.cpp create mode 100644 source/lib/allocators/aligned_allocator.h diff --git a/source/lib/allocators/aligned_allocator.cpp b/source/lib/allocators/aligned_allocator.cpp new file mode 100644 index 0000000000..a8a08a1d17 --- /dev/null +++ b/source/lib/allocators/aligned_allocator.cpp @@ -0,0 +1,12 @@ +/** + * ========================================================================= + * File : aligned_allocator.cpp + * Project : 0 A.D. + * Description : STL allocator for aligned memory + * ========================================================================= + */ + +// license: GPL; see lib/license.txt + +#include "precompiled.h" +#include "aligned_allocator.h" diff --git a/source/lib/allocators/aligned_allocator.h b/source/lib/allocators/aligned_allocator.h new file mode 100644 index 0000000000..c1d42b5b6a --- /dev/null +++ b/source/lib/allocators/aligned_allocator.h @@ -0,0 +1,130 @@ +/** + * ========================================================================= + * File : aligned_allocator.h + * Project : 0 A.D. + * Description : STL allocator for aligned memory + * ========================================================================= + */ + +// license: GPL; see lib/license.txt + +#ifndef ALIGNED_ALLOCATOR +#define ALIGNED_ALLOCATOR + +#include "lib/bits.h" // round_up +#include "lib/sysdep/x86_x64/x86_x64.h" // x86_x64_L1CacheLineSize + + +/** + * stateless STL allocator that aligns elements to the L1 cache line size. + * + * note: the alignment is hard-coded to avoid any allocator state. + * this avoids portability problems, which is important since allocators + * are rather poorly specified. 
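 *
 * usage sketch (illustrative; "Item" is a hypothetical element type):
 *   std::vector<Item, AlignedAllocator<Item> > items;
 *   // the vector's storage now starts on an L1 cache line boundary; if
 *   // sizeof(Item) is also a multiple of the line size, each element ends
 *   // up on its own cache line(s), so threads writing to different
 *   // elements avoid RFO transfers.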
+ * + * references: + * http://www.tantalon.com/pete/customallocators.ppt + * http://www.flipcode.com/archives/Aligned_Block_Allocation.shtml + * http://www.josuttis.com/cppcode/allocator.html + * + * derived from code that bears the following copyright notice: + * (C) Copyright Nicolai M. Josuttis 1999. + * Permission to copy, use, modify, sell and distribute this software + * is granted provided this copyright notice appears in all copies. + * This software is provided "as is" without express or implied + * warranty, and with no claim as to its suitability for any purpose. + **/ +template +class AlignedAllocator +{ +public: + // type definitions + typedef T value_type; + typedef T* pointer; + typedef const T* const_pointer; + typedef T& reference; + typedef const T& const_reference; + typedef std::size_t size_type; + typedef std::ptrdiff_t difference_type; + + // rebind allocator to type U + template + struct rebind + { + typedef AlignedAllocator other; + }; + + pointer address(reference value) const + { + return &value; + } + + const_pointer address(const_reference value) const + { + return &value; + } + + AlignedAllocator() throw() + { + } + + AlignedAllocator(const AlignedAllocator&) throw() + { + } + + template + AlignedAllocator (const AlignedAllocator&) throw() + { + } + + ~AlignedAllocator() throw() + { + } + + size_type max_size() const throw() + { + // maximum number of *elements* that can be allocated + return std::numeric_limits::max() / sizeof(T); + } + + // allocate uninitialized storage + pointer allocate(size_type numElements, const void* hint = 0) + { + const size_type alignment = x86_x64_L1CacheLineSize(); + const size_type elementSize = round_up(sizeof(T), alignment); + const size_type size = numElements * elementSize; + pointer p = (pointer)_aligned_malloc(size, alignment); + return p; + } + + // deallocate storage of elements that have been destroyed + void deallocate(pointer p, size_type num) + { + _aligned_free((void*)p); + } + + void construct(pointer p, const T& value) + { + new((void*)p) T(value); + } + + void destroy(pointer p) + { + p->~T(); + } +}; + +// indicate that all specializations of this allocator are interchangeable +template +bool operator==(const AlignedAllocator&, const AlignedAllocator&) throw() +{ + return true; +} + +template +bool operator!=(const AlignedAllocator&, const AlignedAllocator&) throw() +{ + return false; +} + +#endif // #ifndef ALIGNED_ALLOCATOR diff --git a/source/lib/bits.h b/source/lib/bits.h index 62d36bf243..a330e1073d 100644 --- a/source/lib/bits.h +++ b/source/lib/bits.h @@ -44,9 +44,14 @@ bool IsBitSet(T value, size_t index) * @param num_bits number of bits in mask **/ template -T bit_mask(size_t num_bits) +T bit_mask(size_t numBits) { - return (T)(T(1) << num_bits)-1; + if(numBits == 0) // prevent shift count == bitsInT, which would be undefined. + return 0; + // note: the perhaps more intuitive (1 << numBits)-1 cannot + // handle numBits == bitsInT, but this implementation does. 
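+ // worked examples (for T = u8): bit_mask<u8>(3) == 0x07, bit_mask<u8>(8) == 0xFF,
+ // and bit_mask<u8>(0) == 0 via the early-out above.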
+ const T bitsInT = sizeof(T)*CHAR_BIT; + return ~T(0) >> T(bitsInT-numBits); } @@ -64,7 +69,7 @@ template inline T bits(T num, size_t lo_idx, size_t hi_idx) { const size_t count = (hi_idx - lo_idx)+1; // # bits to return - T result = num >> lo_idx; + T result = num >> T(lo_idx); result &= bit_mask(count); return result; } diff --git a/source/lib/precompiled.h b/source/lib/precompiled.h index 2d11e2baf4..e1ab670cd9 100644 --- a/source/lib/precompiled.h +++ b/source/lib/precompiled.h @@ -63,10 +63,15 @@ # define BOOST_ALL_DYN_LINK #endif #include // noncopyable -#include +// the following boost libraries have been included in TR1 and are +// thus deemed usable: #include -#include -using boost::shared_ptr; // has been added to TR1 +using boost::shared_ptr; +#include +using boost::mem_fn; +#include +using boost::function; +#include #include "lib/external_libraries/boost_filesystem.h" // (this must come after boost and common lib headers) diff --git a/source/lib/rand.h b/source/lib/rand.h index 7c6fdf40cf..f709751f11 100644 --- a/source/lib/rand.h +++ b/source/lib/rand.h @@ -16,6 +16,6 @@ * avoids several common pitfalls; see discussion at * http://www.azillionmonkeys.com/qed/random.html **/ -extern size_t rand(size_t min_inclusive, size_t max_exclusive); +LIB_API size_t rand(size_t min_inclusive, size_t max_exclusive); #endif // #ifndef INCLUDED_RAND diff --git a/source/lib/sysdep/linux/lcpu.cpp b/source/lib/sysdep/linux/lcpu.cpp index 34fe56c7aa..305efb847a 100644 --- a/source/lib/sysdep/linux/lcpu.cpp +++ b/source/lib/sysdep/linux/lcpu.cpp @@ -70,7 +70,10 @@ size_t os_cpu_MemorySize() static size_t memorySize; if(!memorySize) - memorySize = sysconf(_SC_PHYS_PAGES) * os_cpu_PageSize(); + { + const uint64_t memorySizeBytes = (uint64_t)sysconf(_SC_PHYS_PAGES) * os_cpu_PageSize(); + memorySize = size_t(memorySizeBytes / MiB); + } return memorySize; } @@ -78,7 +81,8 @@ size_t os_cpu_MemorySize() size_t os_cpu_MemoryAvailable() { - const size_t memoryAvailable = sysconf(_SC_AVPHYS_PAGES) * os_cpu_PageSize(); + const uint64_t memoryAvailableBytes = (uint64_t)sysconf(_SC_AVPHYS_PAGES) * os_cpu_PageSize(); + const size_t memoryAvailable = size_t(memoryAvailableBytes / MiB); return memoryAvailable; } diff --git a/source/lib/sysdep/numa.h b/source/lib/sysdep/numa.h index 934df81d32..1d2a0fbc81 100644 --- a/source/lib/sysdep/numa.h +++ b/source/lib/sysdep/numa.h @@ -36,16 +36,19 @@ LIB_API size_t numa_AvailableMemory(size_t node); **/ LIB_API double numa_Factor(); +/** + * @return an indication of whether memory pages are node-interleaved. + * + * note: this requires ACPI access, which may not be available on + * least-permission accounts. the default is to return false so as + * not to cause callers to panic and trigger performance warnings. + **/ +LIB_API bool numa_IsMemoryInterleaved(); + //----------------------------------------------------------------------------- // allocator -/** - * simple allocator that "does the right thing" on NUMA systems - page frames - * will be taken from the node that first accesses them. - **/ -LIB_API void* numa_Allocate(size_t size); - enum LargePageDisposition { LPD_DEFAULT, @@ -54,15 +57,25 @@ enum LargePageDisposition }; /** - * allocate memory from a specific node. + * simple allocator that "does the right thing" on NUMA systems. * - * @param node node number (zero-based) * @param largePageDisposition - allows forcibly enabling/disabling the use * of large pages; the default decision involves a heuristic. 
* @param pageSize if non-zero, receives the size [bytes] of a single page * out of those used to map the memory. + * + * note: page frames will be taken from the node that first accesses them. **/ -LIB_API void* numa_AllocateOnNode(size_t size, size_t node, LargePageDisposition largePageDisposition = LPD_DEFAULT, size_t* pageSize = 0); +LIB_API void* numa_Allocate(size_t size, LargePageDisposition largePageDisposition = LPD_DEFAULT, size_t* ppageSize = 0); + +/** + * allocate memory from a specific node. + * + * @param node node number (zero-based) + * @param largePageDisposition - see numa_Allocate + * @param pageSize - see numa_Allocate + **/ +LIB_API void* numa_AllocateOnNode(size_t node, size_t size, LargePageDisposition largePageDisposition = LPD_DEFAULT, size_t* pageSize = 0); /** * release memory that had been handed out by one of the above allocators. diff --git a/source/lib/sysdep/os_cpu.h b/source/lib/sysdep/os_cpu.h index cbc2070a44..97b260367c 100644 --- a/source/lib/sysdep/os_cpu.h +++ b/source/lib/sysdep/os_cpu.h @@ -76,12 +76,12 @@ LIB_API size_t os_cpu_PageSize(); LIB_API size_t os_cpu_LargePageSize(); /** - * @return the size [bytes] of physical memory. + * @return the size [MB] of physical memory. **/ LIB_API size_t os_cpu_MemorySize(); /** - * @return the size [bytes] of currently available memory. + * @return the size [MB] of currently available memory. **/ LIB_API size_t os_cpu_MemoryAvailable(); diff --git a/source/lib/sysdep/osx/ocpu.cpp b/source/lib/sysdep/osx/ocpu.cpp index 57cf237edb..713e78b551 100644 --- a/source/lib/sysdep/osx/ocpu.cpp +++ b/source/lib/sysdep/osx/ocpu.cpp @@ -69,6 +69,7 @@ size_t os_cpu_MemorySize() // Argh, the API doesn't seem to be const-correct /*const*/ int mib[2] = { CTL_HW, HW_PHYSMEM }; sysctl(mib, 2, &memorySize, &len, 0, 0); + memorySize /= MiB; } return memorySize; @@ -82,6 +83,7 @@ size_t os_cpu_MemoryAvailable() // Argh, the API doesn't seem to be const-correct /*const*/ int mib[2] = { CTL_HW, HW_USERMEM }; sysctl(mib, 2, &memoryAvailable, &len, 0, 0); + memoryAvailable /= MiB; return memoryAvailable; } diff --git a/source/lib/sysdep/win/wcpu.cpp b/source/lib/sysdep/win/wcpu.cpp index 7bc38e9e8a..0a5bce04fb 100644 --- a/source/lib/sysdep/win/wcpu.cpp +++ b/source/lib/sysdep/win/wcpu.cpp @@ -142,31 +142,33 @@ static void GetMemoryStatus(MEMORYSTATUSEX& mse) size_t os_cpu_MemorySize() { - static size_t memorySize; + static size_t memorySizeMiB; - if(memorySize == 0) + if(memorySizeMiB == 0) { MEMORYSTATUSEX mse; GetMemoryStatus(mse); - memorySize = (size_t)mse.ullTotalPhys; + DWORDLONG memorySize = mse.ullTotalPhys; // Richter, "Programming Applications for Windows": the reported // value doesn't include non-paged pool reserved during boot; // it's not considered available to the kernel. (the amount is // 528 KiB on a 512 MiB WinXP/Win2k machine). we'll round up // to the nearest megabyte to fix this. 
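 // (worked example: 512 MiB - 528 KiB = 536,330,240 bytes would be
 // reported; rounding up to the next MiB restores the full 512 MiB.)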
- memorySize = round_up(memorySize, 1*MiB); + memorySize = round_up(memorySize, DWORDLONG(1*MiB)); + + memorySizeMiB = size_t(memorySize / MiB); } - return memorySize; + return memorySizeMiB; } size_t os_cpu_MemoryAvailable() { MEMORYSTATUSEX mse; GetMemoryStatus(mse); - const size_t memoryAvailable = (size_t)mse.ullAvailPhys; - return memoryAvailable; + const size_t memoryAvailableMiB = size_t(mse.ullAvailPhys / MiB); + return memoryAvailableMiB; } diff --git a/source/lib/sysdep/win/wnuma.cpp b/source/lib/sysdep/win/wnuma.cpp index a8a0368c48..2c78216b65 100644 --- a/source/lib/sysdep/win/wnuma.cpp +++ b/source/lib/sysdep/win/wnuma.cpp @@ -4,6 +4,7 @@ #include "lib/bits.h" // round_up, PopulationCount #include "lib/timer.h" #include "lib/sysdep/os_cpu.h" +#include "lib/sysdep/acpi.h" #include "win.h" #include "wutil.h" #include "wcpu.h" @@ -141,7 +142,8 @@ size_t numa_AvailableMemory(size_t node) ULONGLONG availableBytes; const BOOL ok = pGetNumaAvailableMemoryNode((UCHAR)node, &availableBytes); debug_assert(ok); - return (size_t)availableBytes; + const size_t availableMiB = size_t(availableBytes / MiB); + return availableMiB; } // NUMA not supported - return available system memory else @@ -194,22 +196,34 @@ double numa_Factor() } +bool numa_IsMemoryInterleaved() +{ + WinScopedLock lock(WNUMA_CS); + static int isInterleaved = -1; + if(isInterleaved == -1) + { + if(acpi_Init()) + { + // the BIOS only generates an SRAT (System Resource Affinity Table) + // if node interleaving is disabled. + isInterleaved = acpi_GetTable("SRAT") == 0; + acpi_Shutdown(); + } + else + isInterleaved = 0; // can't tell + } + + return isInterleaved != 0; +} + + //----------------------------------------------------------------------------- // allocator //----------------------------------------------------------------------------- -void* numa_Allocate(size_t size) -{ - void* const mem = VirtualAlloc(0, size, MEM_RESERVE|MEM_COMMIT, PAGE_READWRITE); - if(!mem) - throw std::bad_alloc(); - return mem; -} - - static bool largePageAllocationTookTooLong = false; -static bool ShouldUseLargePages(LargePageDisposition disposition, size_t allocationSize, size_t node) +static bool ShouldUseLargePages(LargePageDisposition disposition, size_t allocationSize) { // can't, OS does not support large pages if(os_cpu_LargePageSize() == 0) @@ -236,7 +250,7 @@ static bool ShouldUseLargePages(LargePageDisposition disposition, size_t allocat // we want there to be plenty of memory available, otherwise the // page frames are going to be terribly fragmented and even a // single allocation would take SECONDS. - if(numa_AvailableMemory(node) < 2*GiB) + if(os_cpu_MemoryAvailable() < 2000) // 2 GB return false; } @@ -244,6 +258,44 @@ static bool ShouldUseLargePages(LargePageDisposition disposition, size_t allocat } +void* numa_Allocate(size_t size, LargePageDisposition largePageDisposition, size_t* ppageSize) +{ + void* mem = 0; + + // try allocating with large pages (reduces TLB misses) + if(ShouldUseLargePages(largePageDisposition, size)) + { + const size_t largePageSize = os_cpu_LargePageSize(); + const size_t paddedSize = round_up(size, largePageSize); // required by MEM_LARGE_PAGES + // note: this call can take SECONDS, which is why several checks are + // undertaken before we even try. these aren't authoritative, so we + // at least prevent future attempts if it takes too long. 
+ const double startTime = timer_Time(); + mem = VirtualAlloc(0, paddedSize, MEM_RESERVE|MEM_COMMIT|MEM_LARGE_PAGES, PAGE_READWRITE); + if(ppageSize) + *ppageSize = largePageSize; + const double elapsedTime = timer_Time() - startTime; + debug_printf("TIMER| NUMA large page allocation: %g\n", elapsedTime); + if(elapsedTime > 1.0) + largePageAllocationTookTooLong = true; + } + + // try (again) with regular pages + if(!mem) + { + mem = VirtualAlloc(0, size, MEM_RESERVE|MEM_COMMIT, PAGE_READWRITE); + if(ppageSize) + *ppageSize = os_cpu_PageSize(); + } + + // all attempts failed - we're apparently out of memory. + if(!mem) + throw std::bad_alloc(); + + return mem; +} + + static bool VerifyPages(void* mem, size_t size, size_t pageSize, size_t node) { typedef BOOL (WINAPI *PQueryWorkingSetEx)(HANDLE hProcess, PVOID buffer, DWORD bufferSize); @@ -294,61 +346,35 @@ static bool VerifyPages(void* mem, size_t size, size_t pageSize, size_t node) } -void* numa_AllocateOnNode(size_t size, size_t node, LargePageDisposition largePageDisposition, size_t* ppageSize) +void* numa_AllocateOnNode(size_t node, size_t size, LargePageDisposition largePageDisposition, size_t* ppageSize) { debug_assert(node < numa_NumNodes()); // see if there will be enough memory (non-authoritative, for debug purposes only) { - const size_t availableBytes = numa_AvailableMemory(node); - if(availableBytes < size) - debug_printf("NUMA: warning: node reports insufficient memory (%d vs %d)\n", availableBytes, size); + const size_t sizeMiB = size/MiB; + const size_t availableMiB = numa_AvailableMemory(node); + if(availableMiB < sizeMiB) + debug_printf("NUMA: warning: node reports insufficient memory (%d vs %d MB)\n", availableMiB, sizeMiB); } - void* mem = 0; - size_t pageSize = 0; - - // try allocating with large pages (reduces TLB misses) - if(ShouldUseLargePages(largePageDisposition, size, node)) - { - const size_t largePageSize = os_cpu_LargePageSize(); - const size_t paddedSize = round_up(size, largePageSize); // required by MEM_LARGE_PAGES - // note: this call can take SECONDS, which is why several checks are - // undertaken before we even try. these aren't authoritative, so we - // at least prevent future attempts if it takes too long. - const double startTime = timer_Time(); - mem = VirtualAlloc(0, paddedSize, MEM_RESERVE|MEM_COMMIT|MEM_LARGE_PAGES, PAGE_READWRITE); - pageSize = largePageSize; - const double elapsedTime = timer_Time() - startTime; - debug_printf("TIMER| NUMA large page allocation: %g\n", elapsedTime); - if(elapsedTime > 1.0) - largePageAllocationTookTooLong = true; - } - - // try (again) with regular pages - if(!mem) - { - mem = VirtualAlloc(0, size, MEM_RESERVE|MEM_COMMIT, PAGE_READWRITE); - pageSize = os_cpu_PageSize(); - } - - // all attempts failed - we're apparently out of memory. - if(!mem) - throw std::bad_alloc(); + size_t pageSize; // (used below even if ppageSize is zero) + void* const mem = numa_Allocate(size, largePageDisposition, &pageSize); + if(ppageSize) + *ppageSize = pageSize; // we can't use VirtualAllocExNuma - it's only available in Vista and Server 2008. // workaround: fault in all pages now to ensure they are allocated from the // current node, then verify page attributes. // (note: VirtualAlloc's MEM_COMMIT only maps virtual pages and does not - // actually allocate page frames. Windows uses a first-touch heuristic - - // the page will be taken from the node whose processor caused the fault.) + // actually allocate page frames. 
Windows XP uses a first-touch heuristic - + // the page will be taken from the node whose processor caused the fault. + // Windows Vista allocates on the "preferred" node, so affinity should be + // set such that this thread is running on .) memset(mem, 0, size); VerifyPages(mem, size, pageSize, node); - if(ppageSize) - *ppageSize = pageSize; - return mem; } diff --git a/source/lib/sysdep/x86_x64/topology.cpp b/source/lib/sysdep/x86_x64/topology.cpp index 2cecbf9601..acf31d5f30 100644 --- a/source/lib/sysdep/x86_x64/topology.cpp +++ b/source/lib/sysdep/x86_x64/topology.cpp @@ -12,7 +12,7 @@ #include "topology.h" #include "lib/bits.h" -#include "lib/sysdep/cpu.h" +#include "lib/sysdep/cpu.h" // ERR::CPU_FEATURE_MISSING #include "lib/sysdep/os_cpu.h" #include "x86_x64.h" @@ -99,36 +99,20 @@ static size_t LogicalPerCache() if(!logicalPerCache) { - logicalPerCache = 1; // caches aren't shared unless we find a descriptor + logicalPerCache = 1; // (default in case DetectL2Sharing fails) - // note: Intel Appnote 485 says the order in which caches are returned is - // undefined, so we need to loop through all of them. - for(u32 count = 0; ; count++) + struct DetectL2Sharing { - // get next cache descriptor - x86_x64_CpuidRegs regs; - regs.eax = 4; - regs.ecx = count; - x86_x64_cpuid(®s); - const u32 type = bits(regs.eax, 0, 4); - if(type == 0) // no more remaining - break; - - struct IsL2DataCache + static void Callback(const x86_x64_CacheParameters* cache) { - bool operator()(u32 type, u32 level) const - { - if(type != 1 && type != 3) // neither data nor unified - return false; - if(level != 2) - return false; - return true; - } - }; - const u32 level = bits(regs.eax, 5, 7); - if(IsL2DataCache()(type, level)) - logicalPerCache = bits(regs.eax, 14, 25)+1; - } + if(cache->type != X86_X64_CACHE_TYPE_DATA && cache->type != X86_X64_CACHE_TYPE_UNIFIED) + return; + if(cache->level != 2) + return; + logicalPerCache = cache->sharedBy; + } + }; + x86_x64_EnumerateCaches(DetectL2Sharing::Callback); } return logicalPerCache; @@ -177,25 +161,18 @@ static const u8* ApicIds() /** - * count the number of unique values assumed by a certain field (i.e. part - * of the APIC ID). - * @param numBits width of the field; must be set to ceil_log2 of the - * maximum value that can be assumed by the field. - * @return number of unique values (one if numBits is zero - this is - * convenient and kind of justified by counting the empty symbol) + * count the number of unique APIC IDs after application of a mask. + * + * this is used to implement NumUniqueValuesInField and also required + * for counting the number of caches. **/ -static size_t NumUniqueValuesInField(const u8* apicIds, size_t offset, size_t numBits) +static size_t NumUniqueMaskedValues(const u8* apicIds, u8 mask) { - if(numBits == 0) - return 1; // see above - const u8 mask = bit_mask(numBits); - - typedef std::set IdSet; - IdSet ids; + std::set ids; for(size_t processor = 0; processor < os_cpu_NumProcessors(); processor++) { const u8 apicId = apicIds[processor]; - const u8 field = u8(apicId >> offset) & mask; + const u8 field = apicId & mask; ids.insert(field); } @@ -203,13 +180,31 @@ static size_t NumUniqueValuesInField(const u8* apicIds, size_t offset, size_t nu } +/** + * count the number of values assumed by a certain field within APIC IDs. + * + * @param offset index of the lowest bit that is part of the field. + * @param numValues number of values that can be assumed by the field. + * if equal to one, the field is zero-width. 
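+ * (example: offset = 1 and numValues = 4 select bits 1..2 of the APIC ID)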
+ * @return number of unique values (for convenience of the topology code, + * this is always at least one) + **/ +static size_t NumUniqueValuesInField(const u8* apicIds, size_t offset, size_t numValues) +{ + if(numValues == 1) + return 1; // see above + const size_t numBits = ceil_log2(numValues); + const u8 mask = u8((bit_mask(numBits) << offset) & 0xFF); + return NumUniqueMaskedValues(apicIds, mask); +} + + static size_t NumPackages(const u8* apicIds) { if(apicIds) { const size_t offset = ceil_log2(CoresPerPackage()) + ceil_log2(LogicalPerCore()); - const size_t numBits = 8; - return NumUniqueValuesInField(apicIds, offset, numBits); + return NumUniqueValuesInField(apicIds, offset, 256); } else { @@ -241,8 +236,7 @@ static size_t CoresPerPackage(const u8* apicIds) if(apicIds) { const size_t offset = ceil_log2(LogicalPerCore()); - const size_t numBits = ceil_log2(CoresPerPackage()); - return NumUniqueValuesInField(apicIds, offset, numBits); + return NumUniqueValuesInField(apicIds, offset, CoresPerPackage()); } else { @@ -257,8 +251,7 @@ static size_t LogicalPerCore(const u8* apicIds) if(apicIds) { const size_t offset = 0; - const size_t numBits = ceil_log2(LogicalPerCore()); - return NumUniqueValuesInField(apicIds, offset, numBits); + return NumUniqueValuesInField(apicIds, offset, LogicalPerCore()); } else { @@ -320,9 +313,9 @@ static size_t NumCaches(const u8* apicIds) { if(apicIds) { - const size_t offset = 0; const size_t numBits = ceil_log2(LogicalPerCache()); - return NumUniqueValuesInField(apicIds, offset, numBits); + const u8 mask = u8((0xFF << numBits) & 0xFF); + return NumUniqueMaskedValues(apicIds, mask); } else { diff --git a/source/lib/sysdep/x86_x64/x86_x64.cpp b/source/lib/sysdep/x86_x64/x86_x64.cpp index fcc966b2b7..1e9bb493d0 100644 --- a/source/lib/sysdep/x86_x64/x86_x64.cpp +++ b/source/lib/sysdep/x86_x64/x86_x64.cpp @@ -223,6 +223,63 @@ size_t x86_x64_Generation() } +//----------------------------------------------------------------------------- +// cache + +void x86_x64_EnumerateCaches(x86_x64_CacheCallback callback) +{ + for(u32 count = 0; ; count++) + { + x86_x64_CpuidRegs regs; + regs.eax = 4; + regs.ecx = count; + x86_x64_cpuid(®s); + + x86_x64_CacheParameters cache; + cache.type = (x86_x64_CacheType)bits(regs.eax, 0, 4); + if(cache.type == X86_X64_CACHE_TYPE_NULL) // no more remaining + break; + cache.level = (size_t)bits(regs.eax, 5, 7); + cache.associativity = (size_t)bits(regs.ebx, 22, 31)+1; + cache.lineSize = (size_t)bits(regs.ebx, 0, 11)+1; // (yes, this also uses +1 encoding) + cache.sharedBy = (size_t)bits(regs.eax, 14, 25)+1; + { + const size_t partitions = (size_t)bits(regs.ebx, 12, 21)+1; + const size_t sets = (size_t)bits(regs.ecx, 0, 31)+1; + cache.size = cache.associativity * partitions * cache.lineSize * sets; + } + + callback(&cache); + } +} + + +size_t x86_x64_L1CacheLineSize() +{ + static size_t l1CacheLineSize; + + if(!l1CacheLineSize) + { + l1CacheLineSize = 64; // (default in case DetectL1CacheLineSize fails) + + struct DetectL1CacheLineSize + { + static void Callback(const x86_x64_CacheParameters* cache) + { + if(cache->type != X86_X64_CACHE_TYPE_DATA && cache->type != X86_X64_CACHE_TYPE_UNIFIED) + return; + if(cache->level != 1) + return; + l1CacheLineSize = cache->lineSize; + } + }; + x86_x64_EnumerateCaches(DetectL1CacheLineSize::Callback); + } + + return l1CacheLineSize; +} + + //----------------------------------------------------------------------------- // identifier string diff --git a/source/lib/sysdep/x86_x64/x86_x64.h 
b/source/lib/sysdep/x86_x64/x86_x64.h index 74146854db..87159e5c21 100644 --- a/source/lib/sysdep/x86_x64/x86_x64.h +++ b/source/lib/sysdep/x86_x64/x86_x64.h @@ -96,6 +96,40 @@ enum x86_x64_Cap LIB_API bool x86_x64_cap(x86_x64_Cap cap); +//----------------------------------------------------------------------------- +// cache + +enum x86_x64_CacheType +{ + X86_X64_CACHE_TYPE_NULL, // never passed to the callback + X86_X64_CACHE_TYPE_DATA, + X86_X64_CACHE_TYPE_INSTRUCTION, + X86_X64_CACHE_TYPE_UNIFIED + // note: further values are "reserved" +}; + +struct x86_x64_CacheParameters +{ + x86_x64_CacheType type; + size_t level; + size_t associativity; + size_t lineSize; + size_t sharedBy; + size_t size; +}; + +typedef void (CALL_CONV *x86_x64_CacheCallback)(const x86_x64_CacheParameters*); + +/** + * call back for each cache reported by CPUID. + * + * note: ordering is undefined (see Intel AP-485) + **/ +LIB_API void x86_x64_EnumerateCaches(x86_x64_CacheCallback callback); + +LIB_API size_t x86_x64_L1CacheLineSize(); + + //----------------------------------------------------------------------------- // stateless
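
Usage sketch for the new cache-enumeration API (minimal and illustrative; the PrintCache/DumpCaches helpers and their output format are hypothetical, not part of this patch):

#include <cstdio>
#include "lib/sysdep/x86_x64/x86_x64.h"

// invoked once per cache reported by CPUID; ordering is undefined (Intel AP-485).
static void CALL_CONV PrintCache(const x86_x64_CacheParameters* cache)
{
	const char* type = (cache->type == X86_X64_CACHE_TYPE_DATA)? "data" :
		(cache->type == X86_X64_CACHE_TYPE_INSTRUCTION)? "instruction" : "unified";
	printf("L%d %s: %d KiB, %d-way, %d B lines, shared by %d logical processor(s)\n",
		(int)cache->level, type, (int)(cache->size/1024), (int)cache->associativity,
		(int)cache->lineSize, (int)cache->sharedBy);
}

static void DumpCaches()
{
	x86_x64_EnumerateCaches(PrintCache);
	printf("L1 line size: %d bytes\n", (int)x86_x64_L1CacheLineSize());
}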