fixes+improvements from work:
- add AlignedAllocator - an STL allocator that returns cache-line-aligned objects (required to avoid RFOs when threads write to various independent items in a container) - bits: bit_mask can now be used for N=0..numBits (works around full-word-shifts-are-undefined issue) - precompiled.h: remove scoped_ptr, add function-related stuff from TR1 - numa: . add numa_IsMemoryInterleaved . numa_Allocate is now able to allocate large pages as well (reduces TLB misses) - os_cpu: interface change to support 32-bit apps running on WoW64 systems with > 4 GB of memory - topology: use new x86_x64_EnumerateCaches API; fix detection of cache ID - x86_x64: provide the means of enumerating all caches returned by CPUID and detect L1 cache size This was SVN commit r6004.
This commit is contained in:
parent
0118bfd634
commit
5d80d2ee5d
12
source/lib/allocators/aligned_allocator.cpp
Normal file
12
source/lib/allocators/aligned_allocator.cpp
Normal file
@ -0,0 +1,12 @@
|
||||
/**
|
||||
* =========================================================================
|
||||
* File : aligned_allocator.cpp
|
||||
* Project : 0 A.D.
|
||||
* Description : STL allocator for aligned memory
|
||||
* =========================================================================
|
||||
*/
|
||||
|
||||
// license: GPL; see lib/license.txt
|
||||
|
||||
#include "precompiled.h"
|
||||
#include "aligned_allocator.h"
|
130
source/lib/allocators/aligned_allocator.h
Normal file
130
source/lib/allocators/aligned_allocator.h
Normal file
@ -0,0 +1,130 @@
|
||||
/**
|
||||
* =========================================================================
|
||||
* File : aligned_allocator.h
|
||||
* Project : 0 A.D.
|
||||
* Description : STL allocator for aligned memory
|
||||
* =========================================================================
|
||||
*/
|
||||
|
||||
// license: GPL; see lib/license.txt
|
||||
|
||||
#ifndef ALIGNED_ALLOCATOR
|
||||
#define ALIGNED_ALLOCATOR
|
||||
|
||||
#include "lib/bits.h" // round_up
|
||||
#include "lib/sysdep/x86_x64/x86_x64.h" // x86_x64_L1CacheLineSize
|
||||
|
||||
|
||||
/**
|
||||
* stateless STL allocator that aligns elements to the L1 cache line size.
|
||||
*
|
||||
* note: the alignment is hard-coded to avoid any allocator state.
|
||||
* this avoids portability problems, which is important since allocators
|
||||
* are rather poorly specified.
|
||||
*
|
||||
* references:
|
||||
* http://www.tantalon.com/pete/customallocators.ppt
|
||||
* http://www.flipcode.com/archives/Aligned_Block_Allocation.shtml
|
||||
* http://www.josuttis.com/cppcode/allocator.html
|
||||
*
|
||||
* derived from code that bears the following copyright notice:
|
||||
* (C) Copyright Nicolai M. Josuttis 1999.
|
||||
* Permission to copy, use, modify, sell and distribute this software
|
||||
* is granted provided this copyright notice appears in all copies.
|
||||
* This software is provided "as is" without express or implied
|
||||
* warranty, and with no claim as to its suitability for any purpose.
|
||||
**/
|
||||
template<class T>
|
||||
class AlignedAllocator
|
||||
{
|
||||
public:
|
||||
// type definitions
|
||||
typedef T value_type;
|
||||
typedef T* pointer;
|
||||
typedef const T* const_pointer;
|
||||
typedef T& reference;
|
||||
typedef const T& const_reference;
|
||||
typedef std::size_t size_type;
|
||||
typedef std::ptrdiff_t difference_type;
|
||||
|
||||
// rebind allocator to type U
|
||||
template <class U>
|
||||
struct rebind
|
||||
{
|
||||
typedef AlignedAllocator<U> other;
|
||||
};
|
||||
|
||||
pointer address(reference value) const
|
||||
{
|
||||
return &value;
|
||||
}
|
||||
|
||||
const_pointer address(const_reference value) const
|
||||
{
|
||||
return &value;
|
||||
}
|
||||
|
||||
AlignedAllocator() throw()
|
||||
{
|
||||
}
|
||||
|
||||
AlignedAllocator(const AlignedAllocator&) throw()
|
||||
{
|
||||
}
|
||||
|
||||
template <class U>
|
||||
AlignedAllocator (const AlignedAllocator<U>&) throw()
|
||||
{
|
||||
}
|
||||
|
||||
~AlignedAllocator() throw()
|
||||
{
|
||||
}
|
||||
|
||||
size_type max_size() const throw()
|
||||
{
|
||||
// maximum number of *elements* that can be allocated
|
||||
return std::numeric_limits<std::size_t>::max() / sizeof(T);
|
||||
}
|
||||
|
||||
// allocate uninitialized storage
|
||||
pointer allocate(size_type numElements, const void* hint = 0)
|
||||
{
|
||||
const size_type alignment = x86_x64_L1CacheLineSize();
|
||||
const size_type elementSize = round_up(sizeof(T), alignment);
|
||||
const size_type size = numElements * elementSize;
|
||||
pointer p = (pointer)_aligned_malloc(size, alignment);
|
||||
return p;
|
||||
}
|
||||
|
||||
// deallocate storage of elements that have been destroyed
|
||||
void deallocate(pointer p, size_type num)
|
||||
{
|
||||
_aligned_free((void*)p);
|
||||
}
|
||||
|
||||
void construct(pointer p, const T& value)
|
||||
{
|
||||
new((void*)p) T(value);
|
||||
}
|
||||
|
||||
void destroy(pointer p)
|
||||
{
|
||||
p->~T();
|
||||
}
|
||||
};
|
||||
|
||||
// indicate that all specializations of this allocator are interchangeable
|
||||
template <class T1, class T2>
|
||||
bool operator==(const AlignedAllocator<T1>&, const AlignedAllocator<T2>&) throw()
|
||||
{
|
||||
return true;
|
||||
}
|
||||
|
||||
template <class T1, class T2>
|
||||
bool operator!=(const AlignedAllocator<T1>&, const AlignedAllocator<T2>&) throw()
|
||||
{
|
||||
return false;
|
||||
}
|
||||
|
||||
#endif // #ifndef ALIGNED_ALLOCATOR
|
@ -44,9 +44,14 @@ bool IsBitSet(T value, size_t index)
|
||||
* @param num_bits number of bits in mask
|
||||
**/
|
||||
template<typename T>
T bit_mask(size_t numBits)
{
    const size_t bitsInT = sizeof(T)*CHAR_BIT;

    // all bits set. the cast back to T is essential: for types narrower
    // than int, ~T(0) is promoted to int (i.e. -1) and would otherwise
    // drag set upper bits into any subsequent shift.
    const T allBits = (T)~T(0);

    // a shift count equal to (or exceeding) the width of T is undefined,
    // so the full mask is returned directly.
    if(numBits >= bitsInT)
        return allBits;

    // numBits is now in [0, bitsInT), so the shift is well-defined;
    // numBits == 0 yields (1 << 0)-1 == 0.
    return (T)((T(1) << numBits)-1);
}
|
||||
|
||||
|
||||
@ -64,7 +69,7 @@ template<typename T>
|
||||
inline T bits(T num, size_t lo_idx, size_t hi_idx)
|
||||
{
|
||||
const size_t count = (hi_idx - lo_idx)+1; // # bits to return
|
||||
T result = num >> lo_idx;
|
||||
T result = num >> T(lo_idx);
|
||||
result &= bit_mask<T>(count);
|
||||
return result;
|
||||
}
|
||||
|
@ -63,10 +63,15 @@
|
||||
# define BOOST_ALL_DYN_LINK
|
||||
#endif
|
||||
#include <boost/utility.hpp> // noncopyable
|
||||
#include <boost/shared_array.hpp>
|
||||
// the following boost libraries have been included in TR1 and are
|
||||
// thus deemed usable:
|
||||
#include <boost/shared_ptr.hpp>
|
||||
#include <boost/scoped_ptr.hpp>
|
||||
using boost::shared_ptr; // has been added to TR1
|
||||
using boost::shared_ptr;
|
||||
#include <boost/mem_fn.hpp>
|
||||
using boost::mem_fn;
|
||||
#include <boost/function.hpp>
|
||||
using boost::function;
|
||||
#include <boost/bind.hpp>
|
||||
#include "lib/external_libraries/boost_filesystem.h"
|
||||
|
||||
// (this must come after boost and common lib headers)
|
||||
|
@ -16,6 +16,6 @@
|
||||
* avoids several common pitfalls; see discussion at
|
||||
* http://www.azillionmonkeys.com/qed/random.html
|
||||
**/
|
||||
extern size_t rand(size_t min_inclusive, size_t max_exclusive);
|
||||
LIB_API size_t rand(size_t min_inclusive, size_t max_exclusive);
|
||||
|
||||
#endif // #ifndef INCLUDED_RAND
|
||||
|
@ -70,7 +70,10 @@ size_t os_cpu_MemorySize()
|
||||
static size_t memorySize;
|
||||
|
||||
if(!memorySize)
|
||||
memorySize = sysconf(_SC_PHYS_PAGES) * os_cpu_PageSize();
|
||||
{
|
||||
const uint64_t memorySizeBytes = (uint64_t)sysconf(_SC_PHYS_PAGES) * os_cpu_PageSize();
|
||||
memorySize = size_t(memorySizeBytes / MiB);
|
||||
}
|
||||
|
||||
return memorySize;
|
||||
}
|
||||
@ -78,7 +81,8 @@ size_t os_cpu_MemorySize()
|
||||
|
||||
size_t os_cpu_MemoryAvailable()
|
||||
{
|
||||
const size_t memoryAvailable = sysconf(_SC_AVPHYS_PAGES) * os_cpu_PageSize();
|
||||
const uint64_t memoryAvailableBytes = (uint64_t)sysconf(_SC_AVPHYS_PAGES) * os_cpu_PageSize();
|
||||
const size_t memoryAvailable = size_t(memoryAvailableBytes / MiB);
|
||||
return memoryAvailable;
|
||||
}
|
||||
|
||||
|
@ -36,16 +36,19 @@ LIB_API size_t numa_AvailableMemory(size_t node);
|
||||
**/
|
||||
LIB_API double numa_Factor();
|
||||
|
||||
/**
|
||||
* @return an indication of whether memory pages are node-interleaved.
|
||||
*
|
||||
* note: this requires ACPI access, which may not be available on
|
||||
* least-permission accounts. the default is to return false so as
|
||||
* not to cause callers to panic and trigger performance warnings.
|
||||
**/
|
||||
LIB_API bool numa_IsMemoryInterleaved();
|
||||
|
||||
|
||||
//-----------------------------------------------------------------------------
|
||||
// allocator
|
||||
|
||||
/**
|
||||
* simple allocator that "does the right thing" on NUMA systems - page frames
|
||||
* will be taken from the node that first accesses them.
|
||||
**/
|
||||
LIB_API void* numa_Allocate(size_t size);
|
||||
|
||||
enum LargePageDisposition
|
||||
{
|
||||
LPD_DEFAULT,
|
||||
@ -54,15 +57,25 @@ enum LargePageDisposition
|
||||
};
|
||||
|
||||
/**
|
||||
* allocate memory from a specific node.
|
||||
* simple allocator that "does the right thing" on NUMA systems.
|
||||
*
|
||||
* @param node node number (zero-based)
|
||||
* @param largePageDisposition - allows forcibly enabling/disabling the use
|
||||
* of large pages; the default decision involves a heuristic.
|
||||
* @param pageSize if non-zero, receives the size [bytes] of a single page
|
||||
* out of those used to map the memory.
|
||||
*
|
||||
* note: page frames will be taken from the node that first accesses them.
|
||||
**/
|
||||
LIB_API void* numa_AllocateOnNode(size_t size, size_t node, LargePageDisposition largePageDisposition = LPD_DEFAULT, size_t* pageSize = 0);
|
||||
LIB_API void* numa_Allocate(size_t size, LargePageDisposition largePageDisposition = LPD_DEFAULT, size_t* ppageSize = 0);
|
||||
|
||||
/**
|
||||
* allocate memory from a specific node.
|
||||
*
|
||||
* @param node node number (zero-based)
|
||||
* @param largePageDisposition - see numa_Allocate
|
||||
* @param pageSize - see numa_Allocate
|
||||
**/
|
||||
LIB_API void* numa_AllocateOnNode(size_t node, size_t size, LargePageDisposition largePageDisposition = LPD_DEFAULT, size_t* pageSize = 0);
|
||||
|
||||
/**
|
||||
* release memory that had been handed out by one of the above allocators.
|
||||
|
@ -76,12 +76,12 @@ LIB_API size_t os_cpu_PageSize();
|
||||
LIB_API size_t os_cpu_LargePageSize();
|
||||
|
||||
/**
|
||||
* @return the size [bytes] of physical memory.
|
||||
* @return the size [MB] of physical memory.
|
||||
**/
|
||||
LIB_API size_t os_cpu_MemorySize();
|
||||
|
||||
/**
|
||||
* @return the size [bytes] of currently available memory.
|
||||
* @return the size [MB] of currently available memory.
|
||||
**/
|
||||
LIB_API size_t os_cpu_MemoryAvailable();
|
||||
|
||||
|
@ -69,6 +69,7 @@ size_t os_cpu_MemorySize()
|
||||
// Argh, the API doesn't seem to be const-correct
|
||||
/*const*/ int mib[2] = { CTL_HW, HW_PHYSMEM };
|
||||
sysctl(mib, 2, &memorySize, &len, 0, 0);
|
||||
memorySize /= MiB;
|
||||
}
|
||||
|
||||
return memorySize;
|
||||
@ -82,6 +83,7 @@ size_t os_cpu_MemoryAvailable()
|
||||
// Argh, the API doesn't seem to be const-correct
|
||||
/*const*/ int mib[2] = { CTL_HW, HW_USERMEM };
|
||||
sysctl(mib, 2, &memoryAvailable, &len, 0, 0);
|
||||
memoryAvailable /= MiB;
|
||||
return memoryAvailable;
|
||||
}
|
||||
|
||||
|
@ -142,31 +142,33 @@ static void GetMemoryStatus(MEMORYSTATUSEX& mse)
|
||||
|
||||
size_t os_cpu_MemorySize()
|
||||
{
|
||||
static size_t memorySize;
|
||||
static size_t memorySizeMiB;
|
||||
|
||||
if(memorySize == 0)
|
||||
if(memorySizeMiB == 0)
|
||||
{
|
||||
MEMORYSTATUSEX mse;
|
||||
GetMemoryStatus(mse);
|
||||
memorySize = (size_t)mse.ullTotalPhys;
|
||||
DWORDLONG memorySize = mse.ullTotalPhys;
|
||||
|
||||
// Richter, "Programming Applications for Windows": the reported
|
||||
// value doesn't include non-paged pool reserved during boot;
|
||||
// it's not considered available to the kernel. (the amount is
|
||||
// 528 KiB on a 512 MiB WinXP/Win2k machine). we'll round up
|
||||
// to the nearest megabyte to fix this.
|
||||
memorySize = round_up(memorySize, 1*MiB);
|
||||
memorySize = round_up(memorySize, DWORDLONG(1*MiB));
|
||||
|
||||
memorySizeMiB = size_t(memorySize / MiB);
|
||||
}
|
||||
|
||||
return memorySize;
|
||||
return memorySizeMiB;
|
||||
}
|
||||
|
||||
size_t os_cpu_MemoryAvailable()
|
||||
{
|
||||
MEMORYSTATUSEX mse;
|
||||
GetMemoryStatus(mse);
|
||||
const size_t memoryAvailable = (size_t)mse.ullAvailPhys;
|
||||
return memoryAvailable;
|
||||
const size_t memoryAvailableMiB = size_t(mse.ullAvailPhys / MiB);
|
||||
return memoryAvailableMiB;
|
||||
}
|
||||
|
||||
|
||||
|
@ -4,6 +4,7 @@
|
||||
#include "lib/bits.h" // round_up, PopulationCount
|
||||
#include "lib/timer.h"
|
||||
#include "lib/sysdep/os_cpu.h"
|
||||
#include "lib/sysdep/acpi.h"
|
||||
#include "win.h"
|
||||
#include "wutil.h"
|
||||
#include "wcpu.h"
|
||||
@ -141,7 +142,8 @@ size_t numa_AvailableMemory(size_t node)
|
||||
ULONGLONG availableBytes;
|
||||
const BOOL ok = pGetNumaAvailableMemoryNode((UCHAR)node, &availableBytes);
|
||||
debug_assert(ok);
|
||||
return (size_t)availableBytes;
|
||||
const size_t availableMiB = size_t(availableBytes / MiB);
|
||||
return availableMiB;
|
||||
}
|
||||
// NUMA not supported - return available system memory
|
||||
else
|
||||
@ -194,22 +196,34 @@ double numa_Factor()
|
||||
}
|
||||
|
||||
|
||||
bool numa_IsMemoryInterleaved()
|
||||
{
|
||||
WinScopedLock lock(WNUMA_CS);
|
||||
static int isInterleaved = -1;
|
||||
if(isInterleaved == -1)
|
||||
{
|
||||
if(acpi_Init())
|
||||
{
|
||||
// the BIOS only generates an SRAT (System Resource Affinity Table)
|
||||
// if node interleaving is disabled.
|
||||
isInterleaved = acpi_GetTable("SRAT") == 0;
|
||||
acpi_Shutdown();
|
||||
}
|
||||
else
|
||||
isInterleaved = 0; // can't tell
|
||||
}
|
||||
|
||||
return isInterleaved != 0;
|
||||
}
|
||||
|
||||
|
||||
//-----------------------------------------------------------------------------
|
||||
// allocator
|
||||
//-----------------------------------------------------------------------------
|
||||
|
||||
void* numa_Allocate(size_t size)
|
||||
{
|
||||
void* const mem = VirtualAlloc(0, size, MEM_RESERVE|MEM_COMMIT, PAGE_READWRITE);
|
||||
if(!mem)
|
||||
throw std::bad_alloc();
|
||||
return mem;
|
||||
}
|
||||
|
||||
|
||||
static bool largePageAllocationTookTooLong = false;
|
||||
|
||||
static bool ShouldUseLargePages(LargePageDisposition disposition, size_t allocationSize, size_t node)
|
||||
static bool ShouldUseLargePages(LargePageDisposition disposition, size_t allocationSize)
|
||||
{
|
||||
// can't, OS does not support large pages
|
||||
if(os_cpu_LargePageSize() == 0)
|
||||
@ -236,7 +250,7 @@ static bool ShouldUseLargePages(LargePageDisposition disposition, size_t allocat
|
||||
// we want there to be plenty of memory available, otherwise the
|
||||
// page frames are going to be terribly fragmented and even a
|
||||
// single allocation would take SECONDS.
|
||||
if(numa_AvailableMemory(node) < 2*GiB)
|
||||
if(os_cpu_MemoryAvailable() < 2000) // 2 GB
|
||||
return false;
|
||||
}
|
||||
|
||||
@ -244,6 +258,44 @@ static bool ShouldUseLargePages(LargePageDisposition disposition, size_t allocat
|
||||
}
|
||||
|
||||
|
||||
void* numa_Allocate(size_t size, LargePageDisposition largePageDisposition, size_t* ppageSize)
|
||||
{
|
||||
void* mem = 0;
|
||||
|
||||
// try allocating with large pages (reduces TLB misses)
|
||||
if(ShouldUseLargePages(largePageDisposition, size))
|
||||
{
|
||||
const size_t largePageSize = os_cpu_LargePageSize();
|
||||
const size_t paddedSize = round_up(size, largePageSize); // required by MEM_LARGE_PAGES
|
||||
// note: this call can take SECONDS, which is why several checks are
|
||||
// undertaken before we even try. these aren't authoritative, so we
|
||||
// at least prevent future attempts if it takes too long.
|
||||
const double startTime = timer_Time();
|
||||
mem = VirtualAlloc(0, paddedSize, MEM_RESERVE|MEM_COMMIT|MEM_LARGE_PAGES, PAGE_READWRITE);
|
||||
if(ppageSize)
|
||||
*ppageSize = largePageSize;
|
||||
const double elapsedTime = timer_Time() - startTime;
|
||||
debug_printf("TIMER| NUMA large page allocation: %g\n", elapsedTime);
|
||||
if(elapsedTime > 1.0)
|
||||
largePageAllocationTookTooLong = true;
|
||||
}
|
||||
|
||||
// try (again) with regular pages
|
||||
if(!mem)
|
||||
{
|
||||
mem = VirtualAlloc(0, size, MEM_RESERVE|MEM_COMMIT, PAGE_READWRITE);
|
||||
if(ppageSize)
|
||||
*ppageSize = os_cpu_PageSize();
|
||||
}
|
||||
|
||||
// all attempts failed - we're apparently out of memory.
|
||||
if(!mem)
|
||||
throw std::bad_alloc();
|
||||
|
||||
return mem;
|
||||
}
|
||||
|
||||
|
||||
static bool VerifyPages(void* mem, size_t size, size_t pageSize, size_t node)
|
||||
{
|
||||
typedef BOOL (WINAPI *PQueryWorkingSetEx)(HANDLE hProcess, PVOID buffer, DWORD bufferSize);
|
||||
@ -294,61 +346,35 @@ static bool VerifyPages(void* mem, size_t size, size_t pageSize, size_t node)
|
||||
}
|
||||
|
||||
|
||||
void* numa_AllocateOnNode(size_t size, size_t node, LargePageDisposition largePageDisposition, size_t* ppageSize)
|
||||
void* numa_AllocateOnNode(size_t node, size_t size, LargePageDisposition largePageDisposition, size_t* ppageSize)
|
||||
{
|
||||
debug_assert(node < numa_NumNodes());
|
||||
|
||||
// see if there will be enough memory (non-authoritative, for debug purposes only)
|
||||
{
|
||||
const size_t availableBytes = numa_AvailableMemory(node);
|
||||
if(availableBytes < size)
|
||||
debug_printf("NUMA: warning: node reports insufficient memory (%d vs %d)\n", availableBytes, size);
|
||||
const size_t sizeMiB = size/MiB;
|
||||
const size_t availableMiB = numa_AvailableMemory(node);
|
||||
if(availableMiB < sizeMiB)
|
||||
debug_printf("NUMA: warning: node reports insufficient memory (%d vs %d MB)\n", availableMiB, sizeMiB);
|
||||
}
|
||||
|
||||
void* mem = 0;
|
||||
size_t pageSize = 0;
|
||||
|
||||
// try allocating with large pages (reduces TLB misses)
|
||||
if(ShouldUseLargePages(largePageDisposition, size, node))
|
||||
{
|
||||
const size_t largePageSize = os_cpu_LargePageSize();
|
||||
const size_t paddedSize = round_up(size, largePageSize); // required by MEM_LARGE_PAGES
|
||||
// note: this call can take SECONDS, which is why several checks are
|
||||
// undertaken before we even try. these aren't authoritative, so we
|
||||
// at least prevent future attempts if it takes too long.
|
||||
const double startTime = timer_Time();
|
||||
mem = VirtualAlloc(0, paddedSize, MEM_RESERVE|MEM_COMMIT|MEM_LARGE_PAGES, PAGE_READWRITE);
|
||||
pageSize = largePageSize;
|
||||
const double elapsedTime = timer_Time() - startTime;
|
||||
debug_printf("TIMER| NUMA large page allocation: %g\n", elapsedTime);
|
||||
if(elapsedTime > 1.0)
|
||||
largePageAllocationTookTooLong = true;
|
||||
}
|
||||
|
||||
// try (again) with regular pages
|
||||
if(!mem)
|
||||
{
|
||||
mem = VirtualAlloc(0, size, MEM_RESERVE|MEM_COMMIT, PAGE_READWRITE);
|
||||
pageSize = os_cpu_PageSize();
|
||||
}
|
||||
|
||||
// all attempts failed - we're apparently out of memory.
|
||||
if(!mem)
|
||||
throw std::bad_alloc();
|
||||
size_t pageSize; // (used below even if ppageSize is zero)
|
||||
void* const mem = numa_Allocate(size, largePageDisposition, &pageSize);
|
||||
if(ppageSize)
|
||||
*ppageSize = pageSize;
|
||||
|
||||
// we can't use VirtualAllocExNuma - it's only available in Vista and Server 2008.
|
||||
// workaround: fault in all pages now to ensure they are allocated from the
|
||||
// current node, then verify page attributes.
|
||||
// (note: VirtualAlloc's MEM_COMMIT only maps virtual pages and does not
|
||||
// actually allocate page frames. Windows uses a first-touch heuristic -
|
||||
// the page will be taken from the node whose processor caused the fault.)
|
||||
// actually allocate page frames. Windows XP uses a first-touch heuristic -
|
||||
// the page will be taken from the node whose processor caused the fault.
|
||||
// Windows Vista allocates on the "preferred" node, so affinity should be
|
||||
// set such that this thread is running on <node>.)
|
||||
memset(mem, 0, size);
|
||||
|
||||
VerifyPages(mem, size, pageSize, node);
|
||||
|
||||
if(ppageSize)
|
||||
*ppageSize = pageSize;
|
||||
|
||||
return mem;
|
||||
}
|
||||
|
||||
|
@ -12,7 +12,7 @@
|
||||
#include "topology.h"
|
||||
|
||||
#include "lib/bits.h"
|
||||
#include "lib/sysdep/cpu.h"
|
||||
#include "lib/sysdep/cpu.h" // ERR::CPU_FEATURE_MISSING
|
||||
#include "lib/sysdep/os_cpu.h"
|
||||
#include "x86_x64.h"
|
||||
|
||||
@ -99,36 +99,20 @@ static size_t LogicalPerCache()
|
||||
|
||||
if(!logicalPerCache)
|
||||
{
|
||||
logicalPerCache = 1; // caches aren't shared unless we find a descriptor
|
||||
logicalPerCache = 1; // (default in case DetectL2Sharing fails)
|
||||
|
||||
// note: Intel Appnote 485 says the order in which caches are returned is
|
||||
// undefined, so we need to loop through all of them.
|
||||
for(u32 count = 0; ; count++)
|
||||
struct DetectL2Sharing
|
||||
{
|
||||
// get next cache descriptor
|
||||
x86_x64_CpuidRegs regs;
|
||||
regs.eax = 4;
|
||||
regs.ecx = count;
|
||||
x86_x64_cpuid(®s);
|
||||
const u32 type = bits(regs.eax, 0, 4);
|
||||
if(type == 0) // no more remaining
|
||||
break;
|
||||
|
||||
struct IsL2DataCache
|
||||
static void Callback(const x86_x64_CacheParameters* cache)
|
||||
{
|
||||
bool operator()(u32 type, u32 level) const
|
||||
{
|
||||
if(type != 1 && type != 3) // neither data nor unified
|
||||
return false;
|
||||
if(level != 2)
|
||||
return false;
|
||||
return true;
|
||||
}
|
||||
};
|
||||
const u32 level = bits(regs.eax, 5, 7);
|
||||
if(IsL2DataCache()(type, level))
|
||||
logicalPerCache = bits(regs.eax, 14, 25)+1;
|
||||
}
|
||||
if(cache->type != X86_X64_CACHE_TYPE_DATA && cache->type != X86_X64_CACHE_TYPE_UNIFIED)
|
||||
return;
|
||||
if(cache->level != 2)
|
||||
return;
|
||||
logicalPerCache = cache->sharedBy;
|
||||
}
|
||||
};
|
||||
x86_x64_EnumerateCaches(DetectL2Sharing::Callback);
|
||||
}
|
||||
|
||||
return logicalPerCache;
|
||||
@ -177,25 +161,18 @@ static const u8* ApicIds()
|
||||
|
||||
|
||||
/**
|
||||
* count the number of unique values assumed by a certain field (i.e. part
|
||||
* of the APIC ID).
|
||||
* @param numBits width of the field; must be set to ceil_log2 of the
|
||||
* maximum value that can be assumed by the field.
|
||||
* @return number of unique values (one if numBits is zero - this is
|
||||
* convenient and kind of justified by counting the empty symbol)
|
||||
* count the number of unique APIC IDs after application of a mask.
|
||||
*
|
||||
* this is used to implement NumUniqueValuesInField and also required
|
||||
* for counting the number of caches.
|
||||
**/
|
||||
static size_t NumUniqueValuesInField(const u8* apicIds, size_t offset, size_t numBits)
|
||||
static size_t NumUniqueMaskedValues(const u8* apicIds, u8 mask)
|
||||
{
|
||||
if(numBits == 0)
|
||||
return 1; // see above
|
||||
const u8 mask = bit_mask<u8>(numBits);
|
||||
|
||||
typedef std::set<u8> IdSet;
|
||||
IdSet ids;
|
||||
std::set<u8> ids;
|
||||
for(size_t processor = 0; processor < os_cpu_NumProcessors(); processor++)
|
||||
{
|
||||
const u8 apicId = apicIds[processor];
|
||||
const u8 field = u8(apicId >> offset) & mask;
|
||||
const u8 field = apicId & mask;
|
||||
ids.insert(field);
|
||||
}
|
||||
|
||||
@ -203,13 +180,31 @@ static size_t NumUniqueValuesInField(const u8* apicIds, size_t offset, size_t nu
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* count the number of values assumed by a certain field within APIC IDs.
|
||||
*
|
||||
* @param offset index of the lowest bit that is part of the field.
|
||||
* @param numValues number of values that can be assumed by the field.
|
||||
* if equal to one, the field is zero-width.
|
||||
* @return number of unique values (for convenience of the topology code,
|
||||
* this is always at least one)
|
||||
**/
|
||||
static size_t NumUniqueValuesInField(const u8* apicIds, size_t offset, size_t numValues)
|
||||
{
|
||||
if(numValues == 1)
|
||||
return 1; // see above
|
||||
const size_t numBits = ceil_log2(numValues);
|
||||
const u8 mask = u8((bit_mask<u8>(numBits) << offset) & 0xFF);
|
||||
return NumUniqueMaskedValues(apicIds, mask);
|
||||
}
|
||||
|
||||
|
||||
static size_t NumPackages(const u8* apicIds)
|
||||
{
|
||||
if(apicIds)
|
||||
{
|
||||
const size_t offset = ceil_log2(CoresPerPackage()) + ceil_log2(LogicalPerCore());
|
||||
const size_t numBits = 8;
|
||||
return NumUniqueValuesInField(apicIds, offset, numBits);
|
||||
return NumUniqueValuesInField(apicIds, offset, 256);
|
||||
}
|
||||
else
|
||||
{
|
||||
@ -241,8 +236,7 @@ static size_t CoresPerPackage(const u8* apicIds)
|
||||
if(apicIds)
|
||||
{
|
||||
const size_t offset = ceil_log2(LogicalPerCore());
|
||||
const size_t numBits = ceil_log2(CoresPerPackage());
|
||||
return NumUniqueValuesInField(apicIds, offset, numBits);
|
||||
return NumUniqueValuesInField(apicIds, offset, CoresPerPackage());
|
||||
}
|
||||
else
|
||||
{
|
||||
@ -257,8 +251,7 @@ static size_t LogicalPerCore(const u8* apicIds)
|
||||
if(apicIds)
|
||||
{
|
||||
const size_t offset = 0;
|
||||
const size_t numBits = ceil_log2(LogicalPerCore());
|
||||
return NumUniqueValuesInField(apicIds, offset, numBits);
|
||||
return NumUniqueValuesInField(apicIds, offset, LogicalPerCore());
|
||||
}
|
||||
else
|
||||
{
|
||||
@ -320,9 +313,9 @@ static size_t NumCaches(const u8* apicIds)
|
||||
{
|
||||
if(apicIds)
|
||||
{
|
||||
const size_t offset = 0;
|
||||
const size_t numBits = ceil_log2(LogicalPerCache());
|
||||
return NumUniqueValuesInField(apicIds, offset, numBits);
|
||||
const u8 mask = u8((0xFF << numBits) & 0xFF);
|
||||
return NumUniqueMaskedValues(apicIds, mask);
|
||||
}
|
||||
else
|
||||
{
|
||||
|
@ -223,6 +223,63 @@ size_t x86_x64_Generation()
|
||||
}
|
||||
|
||||
|
||||
//-----------------------------------------------------------------------------
|
||||
// cache
|
||||
|
||||
/**
 * invoke the callback once for every cache descriptor reported by
 * CPUID function 4.
 *
 * descriptors are requested with increasing index (passed in ecx) until
 * one of type X86_X64_CACHE_TYPE_NULL is returned; that terminating
 * descriptor is not passed to the callback.
 *
 * @param callback receives a pointer to the decoded parameters, which
 * refers to a local variable and is only valid during the call.
 **/
void x86_x64_EnumerateCaches(x86_x64_CacheCallback callback)
{
    for(u32 count = 0; ; count++)
    {
        // request descriptor number <count>
        x86_x64_CpuidRegs regs;
        regs.eax = 4;
        regs.ecx = count;
        x86_x64_cpuid(&regs);

        x86_x64_CacheParameters cache;
        cache.type = (x86_x64_CacheType)bits(regs.eax, 0, 4);
        if(cache.type == X86_X64_CACHE_TYPE_NULL)	// no more remaining
            break;
        cache.level = (size_t)bits(regs.eax, 5, 7);
        // the following fields are stored as (value - 1), hence the +1.
        cache.associativity = (size_t)bits(regs.ebx, 22, 31)+1;
        cache.lineSize = (size_t)bits(regs.ebx, 0, 11)+1;	// (yes, this also uses +1 encoding)
        cache.sharedBy = (size_t)bits(regs.eax, 14, 25)+1;
        {
            const size_t partitions = (size_t)bits(regs.ebx, 12, 21)+1;
            const size_t sets = (size_t)bits(regs.ecx, 0, 31)+1;
            cache.size = cache.associativity * partitions * cache.lineSize * sets;
        }

        callback(&cache);
    }
}
|
||||
|
||||
|
||||
size_t x86_x64_L1CacheLineSize()
|
||||
{
|
||||
static size_t l1CacheLineSize;
|
||||
|
||||
if(!l1CacheLineSize)
|
||||
{
|
||||
l1CacheLineSize = 64; // (default in case DetectL1CacheLineSize fails)
|
||||
|
||||
struct DetectL1CacheLineSize
|
||||
{
|
||||
static void Callback(const x86_x64_CacheParameters* cache)
|
||||
{
|
||||
if(cache->type != X86_X64_CACHE_TYPE_DATA && cache->type != X86_X64_CACHE_TYPE_UNIFIED)
|
||||
return;
|
||||
if(cache->level != 1)
|
||||
return;
|
||||
l1CacheLineSize = cache->lineSize;
|
||||
}
|
||||
};
|
||||
x86_x64_EnumerateCaches(DetectL1CacheLineSize::Callback);
|
||||
}
|
||||
|
||||
return l1CacheLineSize;
|
||||
}
|
||||
|
||||
|
||||
//-----------------------------------------------------------------------------
|
||||
// identifier string
|
||||
|
||||
|
@ -96,6 +96,40 @@ enum x86_x64_Cap
|
||||
LIB_API bool x86_x64_cap(x86_x64_Cap cap);
|
||||
|
||||
|
||||
//-----------------------------------------------------------------------------
|
||||
// cache
|
||||
|
||||
// kind of cache described by x86_x64_CacheParameters. the numeric values
// matter: x86_x64_EnumerateCaches casts the type field returned by CPUID
// directly to this enumeration.
enum x86_x64_CacheType
{
    X86_X64_CACHE_TYPE_NULL        = 0,	// never passed to the callback
    X86_X64_CACHE_TYPE_DATA        = 1,
    X86_X64_CACHE_TYPE_INSTRUCTION = 2,
    X86_X64_CACHE_TYPE_UNIFIED     = 3
    // note: further values are "reserved"
};
|
||||
|
||||
struct x86_x64_CacheParameters
|
||||
{
|
||||
x86_x64_CacheType type;
|
||||
size_t level;
|
||||
size_t associativity;
|
||||
size_t lineSize;
|
||||
size_t sharedBy;
|
||||
size_t size;
|
||||
};
|
||||
|
||||
typedef void (CALL_CONV *x86_x64_CacheCallback)(const x86_x64_CacheParameters*);
|
||||
|
||||
/**
|
||||
* call back for each cache reported by CPUID.
|
||||
*
|
||||
* note: ordering is undefined (see Intel AP-485)
|
||||
**/
|
||||
LIB_API void x86_x64_EnumerateCaches(x86_x64_CacheCallback callback);
|
||||
|
||||
LIB_API size_t x86_x64_L1CacheLineSize();
|
||||
|
||||
|
||||
//-----------------------------------------------------------------------------
|
||||
// stateless
|
||||
|
||||
|
Loading…
Reference in New Issue
Block a user