fixes/improvements to lib code from work
- topology.cpp: modify interface due to thread-safety issue. caller is now responsible for ensuring the first _Detect call isn't reentered; everything else is safe. - fix thread-safety issue in wnuma; use winit mechanism to ensure it's ready before use - VFS now takes a cacheSize parameter (required for being able to disable read-only file caches for the image loader at work) - allow dynarray that isn't actually holding memory - debug_stl: VC9 fix (disable this code except on the exact STL version on which it was tested) - zlib, lib_api: changes to macro used to toggle between static and dynamic linking - add boost filesystem header in external_libraries - amd64: cpu_ topology functions are now provided by x86_x64 - cpu: remove cpu_ClockFrequency (dangerous, may be tempting to use during WHRT init which would cause a crash). use x86_x64_ClockFrequency or os_cpu_ClockFrequency instead. - werrno: cope with newer boost versions - wmman: follow SUSv3 in rejecting zero-length mappings This was SVN commit r5954.
This commit is contained in:
parent
bafc8d0cfa
commit
04127c7af3
@ -45,7 +45,7 @@ class TestMeshManager : public CxxTest::TestSuite
|
||||
TS_ASSERT(fs::create_directory(MOD_PATH.external_directory_string()));
|
||||
TS_ASSERT(fs::create_directory(CACHE_PATH.external_directory_string()));
|
||||
|
||||
g_VFS = CreateVfs();
|
||||
g_VFS = CreateVfs(20*MiB);
|
||||
|
||||
TS_ASSERT_OK(g_VFS->Mount("", MOD_PATH));
|
||||
TS_ASSERT_OK(g_VFS->Mount("collada/", "tests/collada"));
|
||||
|
@ -25,14 +25,15 @@ static LibError validate_da(DynArray* da)
|
||||
{
|
||||
if(!da)
|
||||
WARN_RETURN(ERR::INVALID_PARAM);
|
||||
u8* const base = da->base;
|
||||
// u8* const base = da->base;
|
||||
const size_t max_size_pa = da->max_size_pa;
|
||||
const size_t cur_size = da->cur_size;
|
||||
const size_t pos = da->pos;
|
||||
const int prot = da->prot;
|
||||
|
||||
if(debug_is_pointer_bogus(base))
|
||||
WARN_RETURN(ERR::_1);
|
||||
// note: this happens if max_size == 0
|
||||
// if(debug_is_pointer_bogus(base))
|
||||
// WARN_RETURN(ERR::_1);
|
||||
// note: don't check if base is page-aligned -
|
||||
// might not be true for 'wrapped' mem regions.
|
||||
// if(!mem_IsPageMultiple((uintptr_t)base))
|
||||
@ -56,8 +57,9 @@ LibError da_alloc(DynArray* da, size_t max_size)
|
||||
{
|
||||
const size_t max_size_pa = mem_RoundUpToPage(max_size);
|
||||
|
||||
u8* p;
|
||||
RETURN_ERR(mem_Reserve(max_size_pa, &p));
|
||||
u8* p = 0;
|
||||
if(max_size_pa) // (avoid mmap failure)
|
||||
RETURN_ERR(mem_Reserve(max_size_pa, &p));
|
||||
|
||||
da->base = p;
|
||||
da->max_size_pa = max_size_pa;
|
||||
@ -85,7 +87,7 @@ LibError da_free(DynArray* da)
|
||||
// skip mem_Release if <da> was allocated via da_wrap_fixed
|
||||
// (i.e. it doesn't actually own any memory). don't complain;
|
||||
// da_free is supposed to be called even in the above case.
|
||||
if(!was_wrapped)
|
||||
if(!was_wrapped && size_pa)
|
||||
RETURN_ERR(mem_Release(p, size_pa));
|
||||
return INFO::OK;
|
||||
}
|
||||
|
@ -211,7 +211,7 @@ struct ContainerBase : public Container
|
||||
|
||||
struct Any_deque : public ContainerBase<std::deque<int> >
|
||||
{
|
||||
#if STL_DINKUMWARE
|
||||
#if STL_DINKUMWARE == 405
|
||||
|
||||
bool IsValid(size_t el_size) const
|
||||
{
|
||||
@ -277,7 +277,7 @@ struct Any_list : public ContainerBase<std::list<int> >
|
||||
};
|
||||
|
||||
|
||||
#if STL_DINKUMWARE
|
||||
#if STL_DINKUMWARE == 405
|
||||
|
||||
template<class _Traits>
|
||||
struct Any_tree : public std::_Tree<_Traits>
|
||||
@ -385,7 +385,7 @@ struct Any_vector: public ContainerBase<std::vector<int> >
|
||||
return true;
|
||||
}
|
||||
|
||||
#if STL_DINKUMWARE
|
||||
#if STL_DINKUMWARE == 405
|
||||
|
||||
size_t NumElements(size_t el_size) const
|
||||
{
|
||||
@ -416,7 +416,7 @@ struct Any_vector: public ContainerBase<std::vector<int> >
|
||||
};
|
||||
|
||||
|
||||
#if STL_DINKUMWARE
|
||||
#if STL_DINKUMWARE == 405
|
||||
|
||||
struct Any_basic_string : public ContainerBase<std::string>
|
||||
{
|
||||
@ -461,7 +461,7 @@ struct Any_stack : public Any_deque
|
||||
|
||||
struct Any_hash_map: public ContainerBase<STL_HASH_MAP<int,int> >
|
||||
{
|
||||
#if STL_DINKUMWARE
|
||||
#if STL_DINKUMWARE == 405
|
||||
|
||||
bool IsValid(size_t el_size) const
|
||||
{
|
||||
@ -482,7 +482,7 @@ struct Any_hash_multimap : public Any_hash_map
|
||||
|
||||
struct Any_hash_set: public ContainerBase<STL_HASH_SET<int> >
|
||||
{
|
||||
#if STL_DINKUMWARE
|
||||
#if STL_DINKUMWARE == 405
|
||||
|
||||
bool IsValid(size_t el_size) const
|
||||
{
|
||||
@ -610,7 +610,7 @@ LibError debug_stl_get_container_info(const char* type_name, const u8* p, size_t
|
||||
STD_CONTAINER(deque)
|
||||
STD_CONTAINER(list)
|
||||
STD_CONTAINER(vector)
|
||||
#if STL_DINKUMWARE
|
||||
#if STL_DINKUMWARE == 405
|
||||
STD_CONTAINER(map)
|
||||
STD_CONTAINER(multimap)
|
||||
STD_CONTAINER(set)
|
||||
|
26
source/lib/external_libraries/boost_filesystem.h
Normal file
26
source/lib/external_libraries/boost_filesystem.h
Normal file
@ -0,0 +1,26 @@
|
||||
/**
|
||||
* =========================================================================
|
||||
* File : boost_filesystem.h
|
||||
* Project : 0 A.D.
|
||||
* Description : bring in Boost filesystem library
|
||||
* =========================================================================
|
||||
*/
|
||||
|
||||
// license: GPL; see lib/license.txt
|
||||
|
||||
#ifndef INCLUDED_BOOST_FILESYSTEM
|
||||
#define INCLUDED_BOOST_FILESYSTEM
|
||||
|
||||
// not W4-clean
|
||||
#if MSC_VERSION
|
||||
# pragma warning(push, 3)
|
||||
#endif
|
||||
|
||||
#include "boost/filesystem.hpp"
|
||||
namespace fs = boost::filesystem;
|
||||
|
||||
#if MSC_VERSION
|
||||
# pragma warning(pop)
|
||||
#endif
|
||||
|
||||
#endif // #ifndef INCLUDED_BOOST_FILESYSTEM
|
@ -20,7 +20,7 @@
|
||||
# define WINAPIV __cdecl
|
||||
#endif
|
||||
|
||||
#ifndef FOM_ZLIB
|
||||
#ifndef ZLIB_STATIC
|
||||
#define ZLIB_DLL
|
||||
#endif
|
||||
|
||||
@ -28,18 +28,10 @@
|
||||
|
||||
// automatically link against the required library
|
||||
#if MSC_VERSION
|
||||
# ifdef FOM_ZLIB
|
||||
# ifdef NDEBUG
|
||||
# pragma comment(lib, "fom_zlib.lib")
|
||||
# else
|
||||
# pragma comment(lib, "fom_zlib_d.lib")
|
||||
# endif
|
||||
# ifdef NDEBUG
|
||||
# pragma comment(lib, "zlib1.lib")
|
||||
# else
|
||||
# ifdef NDEBUG
|
||||
# pragma comment(lib, "zlib1.lib")
|
||||
# else
|
||||
# pragma comment(lib, "zlib1d.lib")
|
||||
# endif
|
||||
# pragma comment(lib, "zlib1d.lib")
|
||||
# endif
|
||||
#endif
|
||||
|
||||
|
@ -25,8 +25,8 @@
|
||||
class VFS : public IVFS
|
||||
{
|
||||
public:
|
||||
VFS()
|
||||
: m_fileCache(ChooseCacheSize())
|
||||
VFS(size_t cacheSize)
|
||||
: m_cacheSize(cacheSize), m_fileCache(m_cacheSize)
|
||||
, m_trace(CreateTrace(4*MiB))
|
||||
{
|
||||
}
|
||||
@ -106,7 +106,7 @@ public:
|
||||
// safely handle zero-length files
|
||||
if(!size)
|
||||
fileContents = DummySharedPtr((u8*)0);
|
||||
else if(size > ChooseCacheSize())
|
||||
else if(size > m_cacheSize)
|
||||
{
|
||||
fileContents = io_Allocate(size);
|
||||
RETURN_ERR(file->Load(fileContents));
|
||||
@ -152,19 +152,15 @@ public:
|
||||
}
|
||||
|
||||
private:
|
||||
static size_t ChooseCacheSize()
|
||||
{
|
||||
return 96*MiB;
|
||||
}
|
||||
|
||||
mutable VfsDirectory m_rootDirectory;
|
||||
size_t m_cacheSize;
|
||||
FileCache m_fileCache;
|
||||
PITrace m_trace;
|
||||
mutable VfsDirectory m_rootDirectory;
|
||||
};
|
||||
|
||||
//-----------------------------------------------------------------------------
|
||||
|
||||
PIVFS CreateVfs()
|
||||
PIVFS CreateVfs(size_t cacheSize)
|
||||
{
|
||||
return PIVFS(new VFS);
|
||||
return PIVFS(new VFS(cacheSize));
|
||||
}
|
||||
|
@ -100,6 +100,6 @@ struct IVFS
|
||||
};
|
||||
|
||||
typedef shared_ptr<IVFS> PIVFS;
|
||||
LIB_API PIVFS CreateVfs();
|
||||
LIB_API PIVFS CreateVfs(size_t cacheSize);
|
||||
|
||||
#endif // #ifndef INCLUDED_VFS
|
||||
|
@ -3,22 +3,19 @@
|
||||
// note: EXTERN_C cannot be used because shared_ptr is often returned
|
||||
// by value, which requires C++ linkage.
|
||||
|
||||
#ifdef LIB_DLL
|
||||
#ifdef LIB_STATIC_LINK
|
||||
# define LIB_API
|
||||
#else
|
||||
# ifdef LIB_BUILD
|
||||
# define LIB_API __declspec(dllexport)
|
||||
# else
|
||||
# define LIB_API __declspec(dllimport)
|
||||
# endif
|
||||
#else
|
||||
# define LIB_API
|
||||
#endif
|
||||
|
||||
#if defined(LIB_DLL) && !defined(LIB_BUILD)
|
||||
# if MSC_VERSION
|
||||
# ifdef NDEBUG
|
||||
# pragma comment(lib, "lib.lib")
|
||||
# else
|
||||
# pragma comment(lib, "lib_d.lib")
|
||||
# if MSC_VERSION
|
||||
# ifdef NDEBUG
|
||||
# pragma comment(lib, "lib.lib")
|
||||
# else
|
||||
# pragma comment(lib, "lib_d.lib")
|
||||
# endif
|
||||
# endif
|
||||
# endif
|
||||
#endif
|
||||
|
@ -56,19 +56,18 @@
|
||||
#include "lib/code_annotation.h"
|
||||
|
||||
// Boost
|
||||
// .. if this package isn't going to be statically linked, we're better off
|
||||
// using Boost via DLL. (otherwise, we would have to ensure the exact same
|
||||
// compiler is used, which is a pain because MSC8, MSC9 and ICC 10 are in use)
|
||||
#ifndef LIB_STATIC_LINK
|
||||
# define BOOST_ALL_DYN_LINK
|
||||
#endif
|
||||
#include <boost/utility.hpp> // noncopyable
|
||||
#include <boost/shared_array.hpp>
|
||||
#include <boost/shared_ptr.hpp>
|
||||
#include <boost/scoped_ptr.hpp>
|
||||
#if MSC_VERSION
|
||||
# pragma warning(push, 3) // filesystem isn't W4-clean
|
||||
#endif
|
||||
#include <boost/filesystem.hpp>
|
||||
#if MSC_VERSION
|
||||
# pragma warning(pop)
|
||||
#endif
|
||||
using boost::shared_ptr; // has been added to TR1
|
||||
namespace fs = boost::filesystem;
|
||||
#include "lib/external_libraries/boost_filesystem.h"
|
||||
|
||||
// (this must come after boost and common lib headers)
|
||||
#include "lib/posix/posix.h"
|
||||
|
@ -26,16 +26,3 @@ void cpu_AtomicAdd(volatile intptr_t* location, intptr_t increment)
|
||||
}
|
||||
|
||||
#endif
|
||||
|
||||
// note: ACPI processor detection not yet implemented here, so we treat
|
||||
// dual-core systems as multiprocessors.
|
||||
|
||||
size_t cpu_NumPackages()
|
||||
{
|
||||
return cpu_NumProcessors();
|
||||
}
|
||||
|
||||
size_t cpu_CoresPerPackage()
|
||||
{
|
||||
return 1;
|
||||
}
|
||||
|
@ -28,16 +28,6 @@ namespace ERR
|
||||
**/
|
||||
LIB_API const char* cpu_IdentifierString();
|
||||
|
||||
/**
|
||||
* @return a rough estimate of the CPU clock frequency.
|
||||
*
|
||||
* note: the accuracy of this value is not important. while it is used by
|
||||
* the TSC timing backend, thermal drift is an issue that requires
|
||||
* continual recalibration anyway, which makes the initial accuracy moot.
|
||||
* querying frequency via OS is also much faster than ia32's measurement loop.
|
||||
**/
|
||||
LIB_API double cpu_ClockFrequency();
|
||||
|
||||
|
||||
//-----------------------------------------------------------------------------
|
||||
// lock-free support routines
|
||||
|
@ -11,7 +11,7 @@ to add the necessary parts to that generated manifest.
|
||||
ICC 10.1 IPO considers this string to be an input file, hence this
|
||||
is currently disabled there.
|
||||
*/
|
||||
#if MSC_VERSION >= 1400 && !ICC_VERSION
|
||||
#if MSC_VERSION >= 1400 && !ICC_VERSION && defined(LIB_STATIC_LINK)
|
||||
# if ARCH_IA32
|
||||
# pragma comment(linker, "\"/manifestdependency:type='win32' name='Microsoft.Windows.Common-Controls' version='6.0.0.0' processorArchitecture='X86' publicKeyToken='6595b64144ccf1df'\"")
|
||||
# elif ARCH_AMD64
|
||||
|
@ -46,7 +46,7 @@ public:
|
||||
|
||||
/**
|
||||
* initial measurement of the tick rate. not necessarily correct
|
||||
* (e.g. when using TSC: cpu_ClockFrequency isn't exact).
|
||||
* (e.g. when using TSC: os_cpu_ClockFrequency isn't exact).
|
||||
**/
|
||||
virtual double NominalFrequency() const = 0;
|
||||
|
||||
|
@ -113,7 +113,7 @@ size_t CounterHPET::CounterBits() const
|
||||
|
||||
/**
|
||||
* initial measurement of the tick rate. not necessarily correct
|
||||
* (e.g. when using TSC: cpu_ClockFrequency isn't exact).
|
||||
* (e.g. when using TSC: os_cpu_ClockFrequency isn't exact).
|
||||
**/
|
||||
double CounterHPET::NominalFrequency() const
|
||||
{
|
||||
|
@ -42,7 +42,7 @@ public:
|
||||
|
||||
/**
|
||||
* initial measurement of the tick rate. not necessarily correct
|
||||
* (e.g. when using TSC: cpu_ClockFrequency isn't exact).
|
||||
* (e.g. when using TSC: os_cpu_ClockFrequency isn't exact).
|
||||
**/
|
||||
virtual double NominalFrequency() const;
|
||||
|
||||
|
@ -84,7 +84,7 @@ size_t CounterPMT::CounterBits() const
|
||||
|
||||
/**
|
||||
* initial measurement of the tick rate. not necessarily correct
|
||||
* (e.g. when using TSC: cpu_ClockFrequency isn't exact).
|
||||
* (e.g. when using TSC: os_cpu_ClockFrequency isn't exact).
|
||||
**/
|
||||
double CounterPMT::NominalFrequency() const
|
||||
{
|
||||
|
@ -43,7 +43,7 @@ public:
|
||||
|
||||
/**
|
||||
* initial measurement of the tick rate. not necessarily correct
|
||||
* (e.g. when using TSC: cpu_ClockFrequency isn't exact).
|
||||
* (e.g. when using TSC: os_cpu_ClockFrequency isn't exact).
|
||||
**/
|
||||
virtual double NominalFrequency() const;
|
||||
|
||||
|
@ -11,7 +11,7 @@
|
||||
#include "precompiled.h"
|
||||
#include "qpc.h"
|
||||
|
||||
#include "lib/sysdep/cpu.h"
|
||||
#include "lib/sysdep/os_cpu.h"
|
||||
#include "lib/sysdep/win/win.h"
|
||||
#include "lib/sysdep/win/wutil.h" // wutil_argv
|
||||
#include "pit.h" // PIT_FREQ
|
||||
@ -65,10 +65,10 @@ bool CounterQPC::IsSafe() const
|
||||
// used on MP HAL systems and can be detected by comparing QPF with the
|
||||
// CPU clock. we consider it unsafe unless the user promises (via
|
||||
// command line) that it's patched and thus reliable on their system.
|
||||
bool usesTsc = IsSimilarMagnitude(m_frequency, cpu_ClockFrequency());
|
||||
bool usesTsc = IsSimilarMagnitude(m_frequency, os_cpu_ClockFrequency());
|
||||
// unconfirmed reports indicate QPC sometimes uses 1/3 of the
|
||||
// CPU clock frequency, so check that as well.
|
||||
usesTsc |= IsSimilarMagnitude(m_frequency, cpu_ClockFrequency()/3);
|
||||
usesTsc |= IsSimilarMagnitude(m_frequency, os_cpu_ClockFrequency()/3);
|
||||
if(usesTsc)
|
||||
{
|
||||
const bool isTscSafe = wutil_HasCommandLineArgument("-wQpcTscSafe");
|
||||
@ -108,7 +108,7 @@ size_t CounterQPC::CounterBits() const
|
||||
|
||||
/**
|
||||
* initial measurement of the tick rate. not necessarily correct
|
||||
* (e.g. when using TSC: cpu_ClockFrequency isn't exact).
|
||||
* (e.g. when using TSC: os_cpu_ClockFrequency isn't exact).
|
||||
**/
|
||||
double CounterQPC::NominalFrequency() const
|
||||
{
|
||||
|
@ -41,7 +41,7 @@ public:
|
||||
|
||||
/**
|
||||
* initial measurement of the tick rate. not necessarily correct
|
||||
* (e.g. when using TSC: cpu_ClockFrequency isn't exact).
|
||||
* (e.g. when using TSC: os_cpu_ClockFrequency isn't exact).
|
||||
**/
|
||||
virtual double NominalFrequency() const;
|
||||
|
||||
|
@ -69,7 +69,7 @@ size_t CounterTGT::CounterBits() const
|
||||
|
||||
/**
|
||||
* initial measurement of the tick rate. not necessarily correct
|
||||
* (e.g. when using TSC: cpu_ClockFrequency isn't exact).
|
||||
* (e.g. when using TSC: os_cpu_ClockFrequency isn't exact).
|
||||
**/
|
||||
double CounterTGT::NominalFrequency() const
|
||||
{
|
||||
|
@ -36,7 +36,7 @@ public:
|
||||
|
||||
/**
|
||||
* initial measurement of the tick rate. not necessarily correct
|
||||
* (e.g. when using TSC: cpu_ClockFrequency isn't exact).
|
||||
* (e.g. when using TSC: os_cpu_ClockFrequency isn't exact).
|
||||
**/
|
||||
virtual double NominalFrequency() const;
|
||||
|
||||
|
@ -11,9 +11,10 @@
|
||||
#include "precompiled.h"
|
||||
#include "tsc.h"
|
||||
|
||||
#include "lib/sysdep/cpu.h"
|
||||
#include "lib/sysdep/win/win.h"
|
||||
#include "lib/bits.h"
|
||||
#include "lib/sysdep/os_cpu.h"
|
||||
#include "lib/sysdep/win/win.h"
|
||||
#include "lib/sysdep/win/wutil.h"
|
||||
|
||||
#if ARCH_IA32 || ARCH_AMD64
|
||||
# include "lib/sysdep/x86_x64/x86_x64.h" // x86_x64_rdtsc
|
||||
@ -96,8 +97,12 @@ bool CounterTSC::IsSafe() const
|
||||
// per-core counter state and the abovementioned race condition.
|
||||
// however, we won't bother, since such platforms aren't yet widespread
|
||||
// and would surely support the nice and safe HPET, anyway)
|
||||
if(cpu_NumPackages() != 1 || cpu_CoresPerPackage() != 1)
|
||||
return false;
|
||||
{
|
||||
WinScopedLock lock(WHRT_CS);
|
||||
const CpuTopology* topology = cpu_topology_Detect();
|
||||
if(cpu_topology_NumPackages(topology) != 1 || cpu_topology_CoresPerPackage(topology) != 1)
|
||||
return false;
|
||||
}
|
||||
|
||||
#if ARCH_IA32 || ARCH_AMD64
|
||||
// recent CPU:
|
||||
@ -154,9 +159,16 @@ size_t CounterTSC::CounterBits() const
|
||||
|
||||
/**
|
||||
* initial measurement of the tick rate. not necessarily correct
|
||||
* (e.g. when using TSC: cpu_ClockFrequency isn't exact).
|
||||
* (e.g. when using TSC: os_cpu_ClockFrequency isn't exact).
|
||||
**/
|
||||
double CounterTSC::NominalFrequency() const
|
||||
{
|
||||
return cpu_ClockFrequency();
|
||||
// WARNING: do not call x86_x64_ClockFrequency because it uses the
|
||||
// HRT, which we're currently in the process of initializing.
|
||||
// instead query CPU clock frequency via OS.
|
||||
//
|
||||
// note: even here, initial accuracy isn't critical because the
|
||||
// clock is subject to thermal drift and would require continual
|
||||
// recalibration anyway.
|
||||
return os_cpu_ClockFrequency();
|
||||
}
|
||||
|
@ -36,7 +36,7 @@ public:
|
||||
|
||||
/**
|
||||
* initial measurement of the tick rate. not necessarily correct
|
||||
* (e.g. when using TSC: cpu_ClockFrequency isn't exact).
|
||||
* (e.g. when using TSC: os_cpu_ClockFrequency isn't exact).
|
||||
**/
|
||||
virtual double NominalFrequency() const;
|
||||
};
|
||||
|
@ -7,107 +7,113 @@
|
||||
#include "win.h"
|
||||
#include "wutil.h"
|
||||
#include "wcpu.h"
|
||||
#include "winit.h"
|
||||
#include <Psapi.h>
|
||||
|
||||
#ifdef _OPENMP
|
||||
# include <omp.h>
|
||||
#endif
|
||||
|
||||
WINIT_REGISTER_EARLY_INIT(wnuma_Init);
|
||||
|
||||
|
||||
//-----------------------------------------------------------------------------
|
||||
// node topology
|
||||
//-----------------------------------------------------------------------------
|
||||
|
||||
size_t numa_NumNodes()
|
||||
static size_t NumNodes()
|
||||
{
|
||||
static size_t numNodes;
|
||||
|
||||
if(!numNodes)
|
||||
typedef BOOL (WINAPI *PGetNumaHighestNodeNumber)(PULONG highestNode);
|
||||
const HMODULE hKernel32 = GetModuleHandle("kernel32.dll");
|
||||
const PGetNumaHighestNodeNumber pGetNumaHighestNodeNumber = (PGetNumaHighestNodeNumber)GetProcAddress(hKernel32, "GetNumaHighestNodeNumber");
|
||||
if(pGetNumaHighestNodeNumber)
|
||||
{
|
||||
typedef BOOL (WINAPI *PGetNumaHighestNodeNumber)(PULONG highestNode);
|
||||
const HMODULE hKernel32 = GetModuleHandle("kernel32.dll");
|
||||
const PGetNumaHighestNodeNumber pGetNumaHighestNodeNumber = (PGetNumaHighestNodeNumber)GetProcAddress(hKernel32, "GetNumaHighestNodeNumber");
|
||||
if(pGetNumaHighestNodeNumber)
|
||||
{
|
||||
ULONG highestNode;
|
||||
const BOOL ok = pGetNumaHighestNodeNumber(&highestNode);
|
||||
debug_assert(ok);
|
||||
debug_assert(highestNode < os_cpu_NumProcessors()); // #nodes <= #processors
|
||||
numNodes = highestNode+1;
|
||||
}
|
||||
// NUMA not supported
|
||||
else
|
||||
numNodes = 1;
|
||||
ULONG highestNode;
|
||||
const BOOL ok = pGetNumaHighestNodeNumber(&highestNode);
|
||||
debug_assert(ok);
|
||||
debug_assert(highestNode < os_cpu_NumProcessors()); // #nodes <= #processors
|
||||
return highestNode+1;
|
||||
}
|
||||
|
||||
return numNodes;
|
||||
// NUMA not supported
|
||||
else
|
||||
return 1;
|
||||
}
|
||||
|
||||
|
||||
// note: it is easier to implement this in terms of numa_ProcessorMaskFromNode
|
||||
static void FillNodesProcessorMask(uintptr_t* nodesProcessorMask)
|
||||
{
|
||||
typedef BOOL (WINAPI *PGetNumaNodeProcessorMask)(UCHAR node, PULONGLONG affinity);
|
||||
const HMODULE hKernel32 = GetModuleHandle("kernel32.dll");
|
||||
const PGetNumaNodeProcessorMask pGetNumaNodeProcessorMask = (PGetNumaNodeProcessorMask)GetProcAddress(hKernel32, "GetNumaNodeProcessorMask");
|
||||
if(pGetNumaNodeProcessorMask)
|
||||
{
|
||||
DWORD_PTR processAffinity, systemAffinity;
|
||||
const BOOL ok = GetProcessAffinityMask(GetCurrentProcess(), &processAffinity, &systemAffinity);
|
||||
debug_assert(ok);
|
||||
|
||||
for(size_t node = 0; node < numa_NumNodes(); node++)
|
||||
{
|
||||
ULONGLONG affinity;
|
||||
const BOOL ok = pGetNumaNodeProcessorMask((UCHAR)node, &affinity);
|
||||
debug_assert(ok);
|
||||
const uintptr_t processorMask = wcpu_ProcessorMaskFromAffinity(processAffinity, (DWORD_PTR)affinity);
|
||||
nodesProcessorMask[node] = processorMask;
|
||||
}
|
||||
}
|
||||
// NUMA not supported - consider node 0 to consist of all system processors
|
||||
else
|
||||
nodesProcessorMask[0] = os_cpu_ProcessorMask();
|
||||
}
|
||||
|
||||
|
||||
// note: it is easier to implement this in terms of nodesProcessorMask
|
||||
// rather than the other way around because wcpu provides the
|
||||
// wcpu_ProcessorMaskFromAffinity helper. there is no similar function to
|
||||
// convert processor to processorNumber.
|
||||
size_t numa_NodeFromProcessor(size_t processor)
|
||||
static void FillProcessorsNode(size_t numNodes, const uintptr_t* nodesProcessorMask, size_t* processorsNode)
|
||||
{
|
||||
debug_assert(processor < os_cpu_NumProcessors());
|
||||
|
||||
static std::vector<size_t> processorsNode;
|
||||
#ifdef _OPENMP
|
||||
#pragma omp critical
|
||||
#endif
|
||||
if(processorsNode.empty())
|
||||
for(size_t node = 0; node < numNodes; node++)
|
||||
{
|
||||
processorsNode.resize(os_cpu_NumProcessors(), 0);
|
||||
for(size_t node = 0; node < numa_NumNodes(); node++)
|
||||
const uintptr_t processorMask = nodesProcessorMask[node];
|
||||
for(size_t processor = 0; processor < os_cpu_NumProcessors(); processor++)
|
||||
{
|
||||
const uintptr_t processorMask = numa_ProcessorMaskFromNode(node);
|
||||
for(size_t processor = 0; processor < os_cpu_NumProcessors(); processor++)
|
||||
{
|
||||
if(IsBitSet(processorMask, processor))
|
||||
processorsNode[processor] = node;
|
||||
}
|
||||
if(IsBitSet(processorMask, processor))
|
||||
processorsNode[processor] = node;
|
||||
}
|
||||
}
|
||||
|
||||
return processorsNode.at(processor);
|
||||
}
|
||||
|
||||
|
||||
//-----------------------------------------------------------------------------
|
||||
// node topology interface
|
||||
|
||||
struct NodeTopology // POD
|
||||
{
|
||||
size_t numNodes;
|
||||
size_t processorsNode[os_cpu_MaxProcessors];
|
||||
uintptr_t nodesProcessorMask[os_cpu_MaxProcessors];
|
||||
};
|
||||
static NodeTopology s_nodeTopology;
|
||||
|
||||
static void DetectNodeTopology()
|
||||
{
|
||||
s_nodeTopology.numNodes = NumNodes();
|
||||
FillNodesProcessorMask(s_nodeTopology.nodesProcessorMask);
|
||||
FillProcessorsNode(s_nodeTopology.numNodes, s_nodeTopology.nodesProcessorMask, s_nodeTopology.processorsNode);
|
||||
}
|
||||
|
||||
size_t numa_NumNodes()
|
||||
{
|
||||
return s_nodeTopology.numNodes;
|
||||
}
|
||||
|
||||
size_t numa_NodeFromProcessor(size_t processor)
|
||||
{
|
||||
debug_assert(processor < os_cpu_NumProcessors());
|
||||
return s_nodeTopology.processorsNode[processor];
|
||||
}
|
||||
|
||||
uintptr_t numa_ProcessorMaskFromNode(size_t node)
|
||||
{
|
||||
debug_assert(node < numa_NumNodes());
|
||||
|
||||
static std::vector<uintptr_t> nodesProcessorMask;
|
||||
#ifdef _OPENMP
|
||||
#pragma omp critical
|
||||
#endif
|
||||
if(nodesProcessorMask.empty())
|
||||
{
|
||||
typedef BOOL (WINAPI *PGetNumaNodeProcessorMask)(UCHAR node, PULONGLONG affinity);
|
||||
const HMODULE hKernel32 = GetModuleHandle("kernel32.dll");
|
||||
const PGetNumaNodeProcessorMask pGetNumaNodeProcessorMask = (PGetNumaNodeProcessorMask)GetProcAddress(hKernel32, "GetNumaNodeProcessorMask");
|
||||
if(pGetNumaNodeProcessorMask)
|
||||
{
|
||||
DWORD_PTR processAffinity, systemAffinity;
|
||||
const BOOL ok = GetProcessAffinityMask(GetCurrentProcess(), &processAffinity, &systemAffinity);
|
||||
debug_assert(ok);
|
||||
|
||||
for(size_t node = 0; node < numa_NumNodes(); node++)
|
||||
{
|
||||
ULONGLONG affinity;
|
||||
const BOOL ok = pGetNumaNodeProcessorMask((UCHAR)node, &affinity);
|
||||
debug_assert(ok);
|
||||
const uintptr_t processorMask = wcpu_ProcessorMaskFromAffinity(processAffinity, (DWORD_PTR)affinity);
|
||||
nodesProcessorMask.push_back(processorMask);
|
||||
}
|
||||
}
|
||||
// NUMA not supported - consider node 0 to consist of all system processors
|
||||
else
|
||||
nodesProcessorMask.push_back(os_cpu_ProcessorMask());
|
||||
}
|
||||
|
||||
return nodesProcessorMask.at(node);
|
||||
debug_assert(node < s_nodeTopology.numNodes);
|
||||
return s_nodeTopology.nodesProcessorMask[node];
|
||||
}
|
||||
|
||||
|
||||
@ -145,16 +151,10 @@ size_t numa_AvailableMemory(size_t node)
|
||||
|
||||
double numa_Factor()
|
||||
{
|
||||
WinScopedLock lock(WNUMA_CS);
|
||||
static double factor;
|
||||
|
||||
static bool initialized;
|
||||
#ifdef _OPENMP
|
||||
#pragma omp critical
|
||||
#endif
|
||||
if(!initialized)
|
||||
if(factor == 0.0)
|
||||
{
|
||||
initialized = true;
|
||||
|
||||
// if non-NUMA, skip the (expensive) measurements below.
|
||||
if(numa_NumNodes() == 1)
|
||||
factor = 1.0;
|
||||
@ -357,3 +357,12 @@ void numa_Deallocate(void* mem)
|
||||
{
|
||||
VirtualFree(mem, 0, MEM_RELEASE);
|
||||
}
|
||||
|
||||
|
||||
//-----------------------------------------------------------------------------
|
||||
|
||||
static LibError wnuma_Init()
|
||||
{
|
||||
DetectNodeTopology();
|
||||
return INFO::OK;
|
||||
}
|
||||
|
@ -58,6 +58,8 @@
|
||||
|
||||
// defined by winsock2 and also Linux (with different values)
|
||||
// (values derived from winsock2 WSA* constants minus WSABASEERR)
|
||||
// update: disabled on newer Boost versions because filesystem drags in boost/cerrno.hpp
|
||||
#if BOOST_VERSION <= 103401
|
||||
#define EWOULDBLOCK 35
|
||||
#define EINPROGRESS 36
|
||||
#define EALREADY 37
|
||||
@ -84,6 +86,7 @@
|
||||
#define EHOSTUNREACH 65
|
||||
#define EDQUOT 69
|
||||
#define ESTALE 70
|
||||
#endif
|
||||
|
||||
// defined by winsock2 but not Linux
|
||||
// (commented out because they're not portable)
|
||||
|
@ -38,7 +38,7 @@ static DWORD win32_prot(int prot)
|
||||
NODEFAULT;
|
||||
}
|
||||
|
||||
UNREACHABLE;
|
||||
return 0; // UNREACHABLE
|
||||
}
|
||||
|
||||
|
||||
@ -176,6 +176,13 @@ static LibError mmap_file(void* start, size_t len, int prot, int flags, int fd,
|
||||
|
||||
void* mmap(void* start, size_t len, int prot, int flags, int fd, off_t ofs)
|
||||
{
|
||||
if(len == 0) // POSIX says this must cause mmap to fail
|
||||
{
|
||||
debug_assert(0);
|
||||
errno = EINVAL;
|
||||
return MAP_FAILED;
|
||||
}
|
||||
|
||||
void* p;
|
||||
LibError err;
|
||||
if(flags & MAP_ANONYMOUS)
|
||||
|
@ -330,7 +330,7 @@ C++ classes. this way is more reliable/documented, but has several drawbacks:
|
||||
|
||||
*/
|
||||
|
||||
#ifndef LIB_DLL
|
||||
#ifdef LIB_STATIC_LINK
|
||||
|
||||
EXTERN_C int mainCRTStartup();
|
||||
|
||||
|
@ -413,7 +413,7 @@ WinScopedDisableWow64Redirection::~WinScopedDisableWow64Redirection()
|
||||
//-----------------------------------------------------------------------------
|
||||
// module handle
|
||||
|
||||
#ifdef LIB_DLL
|
||||
#ifndef LIB_STATIC_LINK
|
||||
|
||||
HMODULE wutil_LibModuleHandle;
|
||||
|
||||
|
@ -35,6 +35,8 @@ enum WinLockId
|
||||
{
|
||||
WAIO_CS,
|
||||
WDBG_SYM_CS, // protects (non-reentrant) dbghelp.dll
|
||||
WHRT_CS,
|
||||
WNUMA_CS,
|
||||
|
||||
NUM_CS
|
||||
};
|
||||
|
@ -16,6 +16,7 @@
|
||||
#include "lib/sysdep/os_cpu.h"
|
||||
#include "x86_x64.h"
|
||||
|
||||
|
||||
//-----------------------------------------------------------------------------
|
||||
// detect *maximum* number of cores/packages/caches.
|
||||
// note: some of them may be disabled by the OS or BIOS.
|
||||
@ -143,20 +144,22 @@ static size_t LogicalPerCache()
|
||||
|
||||
/**
|
||||
* @return an array of the processors' unique APIC IDs or zero if
|
||||
* no APIC is present or process affinity is limited.
|
||||
* no xAPIC is present or process affinity is limited.
|
||||
**/
|
||||
static const u8* ApicIds()
|
||||
{
|
||||
static u8 apicIdStorage[os_cpu_MaxProcessors];
|
||||
static const u8* apicIds;
|
||||
const u8* const uninitialized = (const u8*)1;
|
||||
static const u8* apicIds = uninitialized;
|
||||
|
||||
static volatile uintptr_t initialized = 0;
|
||||
if(cpu_CAS(&initialized, 0, 1))
|
||||
if(apicIds == uninitialized)
|
||||
{
|
||||
// requires 'new' APIC (see x86_x64_ApicId for details)
|
||||
apicIds = 0; // return zero from now on unless the below succeeds
|
||||
|
||||
// requires xAPIC (see x86_x64_ApicId for details)
|
||||
if(x86_x64_Generation() >= 8)
|
||||
{
|
||||
// store each processor's APIC ID in turn
|
||||
static u8 apicIdStorage[os_cpu_MaxProcessors];
|
||||
struct StoreApicId
|
||||
{
|
||||
static void Callback(size_t processor, uintptr_t UNUSED(cbData))
|
||||
@ -200,89 +203,109 @@ static size_t NumUniqueValuesInField(const u8* apicIds, size_t offset, size_t nu
|
||||
}
|
||||
|
||||
|
||||
size_t cpu_NumPackages()
|
||||
static size_t NumPackages(const u8* apicIds)
|
||||
{
|
||||
static size_t numPackages = 0;
|
||||
|
||||
if(!numPackages)
|
||||
if(apicIds)
|
||||
{
|
||||
const u8* apicIds = ApicIds();
|
||||
if(apicIds)
|
||||
{
|
||||
const size_t offset = ceil_log2(CoresPerPackage()) + ceil_log2(LogicalPerCore());
|
||||
const size_t numBits = 8;
|
||||
numPackages = NumUniqueValuesInField(apicIds, offset, numBits);
|
||||
}
|
||||
else
|
||||
{
|
||||
// note: correct results cannot be guaranteed because unreported
|
||||
// and disabled logical units are indistinguishable. the below
|
||||
// assumptions are reasonable because we care most about packages
|
||||
// (i.e. whether the system is truly SMP). in contrast, it is
|
||||
// safe to overestimate the number of cores because that
|
||||
// only determines if memory barriers are needed or not.
|
||||
// note: requiring modern processors featuring an APIC does not
|
||||
// prevent this from being reached (the cause may be lack of
|
||||
// OS support or restricted process affinity).
|
||||
|
||||
// assume cores are enabled and count as processors.
|
||||
const size_t numPackagesTimesLogical = os_cpu_NumProcessors() / CoresPerPackage();
|
||||
debug_assert(numPackagesTimesLogical != 0);
|
||||
// assume hyperthreads are enabled; check if they count as processors.
|
||||
if(numPackagesTimesLogical > LogicalPerCore())
|
||||
numPackages = numPackagesTimesLogical / LogicalPerCore();
|
||||
}
|
||||
const size_t offset = ceil_log2(CoresPerPackage()) + ceil_log2(LogicalPerCore());
|
||||
const size_t numBits = 8;
|
||||
return NumUniqueValuesInField(apicIds, offset, numBits);
|
||||
}
|
||||
else
|
||||
{
|
||||
// note: correct results cannot be guaranteed because unreported
|
||||
// and disabled logical units are indistinguishable. the below
|
||||
// assumptions are reasonable because we care most about packages
|
||||
// (i.e. whether the system is truly SMP). in contrast, it is
|
||||
// safe to overestimate the number of cores because that
|
||||
// only determines if memory barriers are needed or not.
|
||||
// note: requiring modern processors featuring an APIC does not
|
||||
// prevent this from being reached (the cause may be lack of
|
||||
// OS support or restricted process affinity).
|
||||
|
||||
return numPackages;
|
||||
// assume cores are enabled and count as processors.
|
||||
const size_t numPackagesTimesLogical = os_cpu_NumProcessors() / CoresPerPackage();
|
||||
debug_assert(numPackagesTimesLogical != 0);
|
||||
// assume hyperthreads are enabled.
|
||||
size_t numPackages = numPackagesTimesLogical;
|
||||
// if they are reported as processors, remove them from the count.
|
||||
if(numPackages > LogicalPerCore())
|
||||
numPackages /= LogicalPerCore();
|
||||
return numPackages;
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
size_t cpu_CoresPerPackage()
|
||||
static size_t CoresPerPackage(const u8* apicIds)
|
||||
{
|
||||
static size_t enabledCoresPerPackage;
|
||||
|
||||
if(!enabledCoresPerPackage)
|
||||
if(apicIds)
|
||||
{
|
||||
const u8* apicIds = ApicIds();
|
||||
if(apicIds)
|
||||
{
|
||||
const size_t offset = ceil_log2(LogicalPerCore());
|
||||
const size_t numBits = ceil_log2(CoresPerPackage());
|
||||
enabledCoresPerPackage = NumUniqueValuesInField(apicIds, offset, numBits);
|
||||
}
|
||||
else
|
||||
{
|
||||
// guess (must match cpu_NumPackages's assumptions)
|
||||
enabledCoresPerPackage = CoresPerPackage();
|
||||
}
|
||||
const size_t offset = ceil_log2(LogicalPerCore());
|
||||
const size_t numBits = ceil_log2(CoresPerPackage());
|
||||
return NumUniqueValuesInField(apicIds, offset, numBits);
|
||||
}
|
||||
else
|
||||
{
|
||||
// guess (must match NumPackages's assumptions)
|
||||
return CoresPerPackage();
|
||||
}
|
||||
|
||||
return enabledCoresPerPackage;
|
||||
}
|
||||
|
||||
|
||||
size_t cpu_LogicalPerCore()
|
||||
static size_t LogicalPerCore(const u8* apicIds)
|
||||
{
|
||||
static size_t enabledLogicalPerCore;
|
||||
if(apicIds)
|
||||
{
|
||||
const size_t offset = 0;
|
||||
const size_t numBits = ceil_log2(LogicalPerCore());
|
||||
return NumUniqueValuesInField(apicIds, offset, numBits);
|
||||
}
|
||||
else
|
||||
{
|
||||
// guess (must match NumPackages's assumptions)
|
||||
return LogicalPerCore();
|
||||
}
|
||||
}
|
||||
|
||||
if(!enabledLogicalPerCore)
|
||||
|
||||
//-----------------------------------------------------------------------------
|
||||
// CPU topology interface
|
||||
|
||||
struct CpuTopology // POD
|
||||
{
|
||||
size_t numPackages;
|
||||
size_t coresPerPackage;
|
||||
size_t logicalPerCore;
|
||||
};
|
||||
|
||||
const CpuTopology* cpu_topology_Detect()
|
||||
{
|
||||
static CpuTopology topology;
|
||||
|
||||
if(!topology.numPackages)
|
||||
{
|
||||
const u8* apicIds = ApicIds();
|
||||
if(apicIds)
|
||||
{
|
||||
const size_t offset = 0;
|
||||
const size_t numBits = ceil_log2(LogicalPerCore());
|
||||
enabledLogicalPerCore = NumUniqueValuesInField(apicIds, offset, numBits);
|
||||
}
|
||||
else
|
||||
{
|
||||
// guess (must match cpu_NumPackages's assumptions)
|
||||
enabledLogicalPerCore = LogicalPerCore();
|
||||
}
|
||||
topology.numPackages = NumPackages(apicIds);
|
||||
topology.coresPerPackage = CoresPerPackage(apicIds);
|
||||
topology.logicalPerCore = LogicalPerCore(apicIds);
|
||||
}
|
||||
|
||||
return enabledLogicalPerCore;
|
||||
return &topology;
|
||||
}
|
||||
|
||||
size_t cpu_topology_NumPackages(const CpuTopology* topology)
|
||||
{
|
||||
return topology->numPackages;
|
||||
}
|
||||
|
||||
size_t cpu_topology_CoresPerPackage(const CpuTopology* topology)
|
||||
{
|
||||
return topology->coresPerPackage;
|
||||
}
|
||||
|
||||
size_t cpu_topology_LogicalPerCore(const CpuTopology* topology)
|
||||
{
|
||||
return topology->logicalPerCore;
|
||||
}
|
||||
|
||||
|
||||
@ -293,29 +316,22 @@ size_t cpu_LogicalPerCore()
|
||||
// functionality but returns incorrect results. (it claims all cores in
|
||||
// an Intel Core2 Quad processor share a single L2 cache.)
|
||||
|
||||
size_t cpu_NumCaches()
|
||||
static size_t NumCaches(const u8* apicIds)
|
||||
{
|
||||
static size_t numCaches;
|
||||
if(!numCaches)
|
||||
if(apicIds)
|
||||
{
|
||||
const u8* apicIds = ApicIds();
|
||||
if(apicIds)
|
||||
{
|
||||
const size_t offset = 0;
|
||||
const size_t numBits = ceil_log2(LogicalPerCache());
|
||||
numCaches = NumUniqueValuesInField(apicIds, offset, numBits);
|
||||
}
|
||||
else
|
||||
{
|
||||
// assume each processor has its own cache
|
||||
numCaches = os_cpu_NumProcessors();
|
||||
}
|
||||
const size_t offset = 0;
|
||||
const size_t numBits = ceil_log2(LogicalPerCache());
|
||||
return NumUniqueValuesInField(apicIds, offset, numBits);
|
||||
}
|
||||
else
|
||||
{
|
||||
// assume each processor has its own cache
|
||||
return os_cpu_NumProcessors();
|
||||
}
|
||||
|
||||
return numCaches;
|
||||
}
|
||||
|
||||
class CacheTopology
|
||||
class CacheRelations
|
||||
{
|
||||
public:
|
||||
/**
|
||||
@ -388,64 +404,87 @@ private:
|
||||
std::vector<SharedCache> m_caches;
|
||||
};
|
||||
|
||||
uintptr_t cpu_ProcessorMaskFromCache(size_t cache)
|
||||
static void DetermineCachesProcessorMask(const u8* apicIds, uintptr_t* cachesProcessorMask)
|
||||
{
|
||||
static uintptr_t cachesProcessorMask[os_cpu_MaxProcessors];
|
||||
if(apicIds)
|
||||
{
|
||||
const size_t numBits = ceil_log2(LogicalPerCache());
|
||||
const u8 cacheIdMask = u8(0xFF << numBits);
|
||||
|
||||
static volatile uintptr_t initialized = 0;
|
||||
if(cpu_CAS(&initialized, 0, 1))
|
||||
CacheRelations cacheRelations;
|
||||
for(size_t processor = 0; processor < os_cpu_NumProcessors(); processor++)
|
||||
{
|
||||
const u8 apicId = apicIds[processor];
|
||||
const u8 cacheId = apicId & cacheIdMask;
|
||||
cacheRelations.Add(cacheId, processor);
|
||||
}
|
||||
cacheRelations.StoreProcessorMasks(cachesProcessorMask);
|
||||
}
|
||||
else
|
||||
{
|
||||
// assume each processor has exactly one cache with matching IDs
|
||||
for(size_t processor = 0; processor < os_cpu_NumProcessors(); processor++)
|
||||
cachesProcessorMask[processor] = uintptr_t(1) << processor;
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
static void DetermineProcessorsCache(size_t numCaches, const uintptr_t* cachesProcessorMask, size_t* processorsCache)
|
||||
{
|
||||
for(size_t cache = 0; cache < numCaches; cache++)
|
||||
{
|
||||
// write to all entries that share this cache
|
||||
const uintptr_t processorMask = cachesProcessorMask[cache];
|
||||
for(size_t processor = 0; processor < os_cpu_NumProcessors(); processor++)
|
||||
{
|
||||
if(IsBitSet(processorMask, processor))
|
||||
{
|
||||
debug_assert(processorsCache[processor] == 0);
|
||||
processorsCache[processor] = cache;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
//-----------------------------------------------------------------------------
|
||||
// cache topology interface
|
||||
|
||||
struct CacheTopology // POD
|
||||
{
|
||||
size_t numCaches;
|
||||
size_t processorsCache[os_cpu_MaxProcessors];
|
||||
uintptr_t cachesProcessorMask[os_cpu_MaxProcessors];
|
||||
};
|
||||
|
||||
const CacheTopology* cache_topology_Detect()
|
||||
{
|
||||
static CacheTopology topology;
|
||||
|
||||
if(!topology.numCaches)
|
||||
{
|
||||
const u8* apicIds = ApicIds();
|
||||
if(apicIds)
|
||||
{
|
||||
const size_t numBits = ceil_log2(LogicalPerCache());
|
||||
const u8 cacheIdMask = u8(0xFF << numBits);
|
||||
|
||||
CacheTopology cacheManager;
|
||||
for(size_t processor = 0; processor < os_cpu_NumProcessors(); processor++)
|
||||
{
|
||||
const u8 apicId = apicIds[processor];
|
||||
const u8 cacheId = apicId & cacheIdMask;
|
||||
cacheManager.Add(cacheId, processor);
|
||||
}
|
||||
cacheManager.StoreProcessorMasks(cachesProcessorMask);
|
||||
}
|
||||
else
|
||||
{
|
||||
// assume each cache belongs to exactly one processor and
|
||||
// cache index == processor index.
|
||||
for(size_t cache = 0; cache < cpu_NumCaches(); cache++)
|
||||
cachesProcessorMask[cache] = uintptr_t(1) << cache;
|
||||
}
|
||||
topology.numCaches = NumCaches(apicIds);
|
||||
DetermineCachesProcessorMask(apicIds, topology.cachesProcessorMask);
|
||||
DetermineProcessorsCache(topology.numCaches, topology.cachesProcessorMask, topology.processorsCache);
|
||||
}
|
||||
|
||||
debug_assert(cache < cpu_NumCaches());
|
||||
return cachesProcessorMask[cache];
|
||||
return &topology;
|
||||
}
|
||||
|
||||
|
||||
size_t cpu_CacheFromProcessor(size_t processor)
|
||||
size_t cache_topology_NumCaches(const CacheTopology* topology)
|
||||
{
|
||||
static size_t processorsCache[os_cpu_MaxProcessors];
|
||||
|
||||
static volatile uintptr_t initialized = 0;
|
||||
if(cpu_CAS(&initialized, 0, 1))
|
||||
{
|
||||
for(size_t cache = 0; cache < cpu_NumCaches(); cache++)
|
||||
{
|
||||
// write to all entries that share this cache
|
||||
const uintptr_t processorMask = cpu_ProcessorMaskFromCache(cache);
|
||||
for(size_t processor = 0; processor < os_cpu_NumProcessors(); processor++)
|
||||
{
|
||||
if(IsBitSet(processorMask, processor))
|
||||
{
|
||||
debug_assert(processorsCache[processor] == 0);
|
||||
processorsCache[processor] = cache;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
debug_assert(processor < os_cpu_NumProcessors());
|
||||
return processorsCache[processor];
|
||||
return topology->numCaches;
|
||||
}
|
||||
|
||||
size_t cache_topology_CacheFromProcessor(const CacheTopology* topology, size_t processor)
|
||||
{
|
||||
debug_assert(processor < os_cpu_NumProcessors());
|
||||
return topology->processorsCache[processor];
|
||||
}
|
||||
|
||||
uintptr_t cache_topology_ProcessorMaskFromCache(const CacheTopology* topology, size_t cache)
|
||||
{
|
||||
debug_assert(cache < topology->numCaches);
|
||||
return topology->cachesProcessorMask[cache];
|
||||
}
|
||||
|
@ -11,53 +11,91 @@
|
||||
#ifndef INCLUDED_TOPOLOGY
|
||||
#define INCLUDED_TOPOLOGY
|
||||
|
||||
//-----------------------------------------------------------------------------
|
||||
// CPU
|
||||
// interface rationale:
|
||||
// - explicit initialization avoids the difficulty and overhead of
|
||||
// thread-safe lazy initialization checks.
|
||||
// - requiring an opaque struct to be passed in ensures users call the
|
||||
// init function before using the accessors.
|
||||
// - delegating responsibility for thread-safety to the caller of the
|
||||
// first *_Detect invocation avoids overhead and keeps us independent of
|
||||
// the various threading packages (Boost, OpenMP, POSIX, Win32, ..)
|
||||
|
||||
// OSes typically consider both SMT units and cores to be "processors".
|
||||
// the following routines determine how many of each are actually present and
|
||||
// enabled. this information is useful for detecting SMP systems, predicting
|
||||
// performance and dimensioning thread pools.
|
||||
|
||||
//-----------------------------------------------------------------------------
|
||||
// cpu
|
||||
|
||||
/**
|
||||
* stores CPU topology, i.e. how many packages, cores and SMT units are
|
||||
* actually present and enabled. this is useful for detecting SMP systems,
|
||||
* predicting performance and dimensioning thread pools.
|
||||
*
|
||||
* note: OS abstractions usually only mention "processors", which could be
|
||||
* any mix of the above.
|
||||
**/
|
||||
struct CpuTopology;
|
||||
|
||||
/**
|
||||
* initialize static storage from which topology can be retrieved by
|
||||
* means of the following functions.
|
||||
* @return const pointer to a shared instance.
|
||||
*
|
||||
* WARNING: this function must not be reentered before it has returned once.
|
||||
**/
|
||||
LIB_API const CpuTopology* cpu_topology_Detect();
|
||||
|
||||
/**
|
||||
* @return number of *enabled* CPU packages / sockets.
|
||||
**/
|
||||
LIB_API size_t cpu_NumPackages();
|
||||
LIB_API size_t cpu_topology_NumPackages(const CpuTopology*);
|
||||
|
||||
/**
|
||||
* @return number of *enabled* CPU cores per package.
|
||||
* (2 on dual-core systems)
|
||||
**/
|
||||
LIB_API size_t cpu_CoresPerPackage();
|
||||
LIB_API size_t cpu_topology_CoresPerPackage(const CpuTopology*);
|
||||
|
||||
/**
|
||||
* @return number of *enabled* hyperthreading units per core.
|
||||
* (2 on P4 EE)
|
||||
**/
|
||||
LIB_API size_t cpu_LogicalPerCore();
|
||||
LIB_API size_t cpu_topology_LogicalPerCore(const CpuTopology*);
|
||||
|
||||
|
||||
//-----------------------------------------------------------------------------
|
||||
// L2 cache
|
||||
|
||||
// some CPU micro-architectures (e.g. Intel Core2) feature partitioned
|
||||
// L2 caches. if the cores sharing a cache work together on the same
|
||||
// sub-problem, contention may be reduced and effective capacity increased.
|
||||
// the following routines allow discovery of the L2 cache topology:
|
||||
/**
|
||||
* stores L2 cache topology, i.e. the mapping between processor and caches.
|
||||
* this allows cores sharing a cache to work together on the same dataset,
|
||||
* which may reduce contention and increase effective capacity.
|
||||
*
|
||||
* example: some CPU micro-architectures (e.g. Intel Core2) feature
|
||||
* partitioned L2 caches shared by two cores.
|
||||
**/
|
||||
struct CacheTopology;
|
||||
|
||||
/**
|
||||
* initialize static storage from which topology can be retrieved by
|
||||
* means of the following functions.
|
||||
* @return const pointer to a shared instance.
|
||||
*
|
||||
* WARNING: this function must not be reentered before it has returned once.
|
||||
**/
|
||||
LIB_API const CacheTopology* cache_topology_Detect();
|
||||
|
||||
/**
|
||||
* @return number of distinct L2 caches
|
||||
**/
|
||||
LIB_API size_t cpu_NumCaches();
|
||||
LIB_API size_t cache_topology_NumCaches(const CacheTopology*);
|
||||
|
||||
/**
|
||||
* @return L2 cache number (zero-based) to which <processor> belongs.
|
||||
**/
|
||||
LIB_API size_t cpu_CacheFromProcessor(size_t processor);
|
||||
LIB_API size_t cache_topology_CacheFromProcessor(const CacheTopology*, size_t processor);
|
||||
|
||||
/**
|
||||
* @return bit-mask of all processors sharing <cache>.
|
||||
**/
|
||||
LIB_API uintptr_t cpu_ProcessorMaskFromCache(size_t cache);
|
||||
LIB_API uintptr_t cache_topology_ProcessorMaskFromCache(const CacheTopology*, size_t cache);
|
||||
|
||||
#endif // #ifndef INCLUDED_TOPOLOGY
|
||||
|
@ -338,6 +338,65 @@ const char* cpu_IdentifierString()
|
||||
}
|
||||
|
||||
|
||||
//-----------------------------------------------------------------------------
|
||||
// misc stateless functions
|
||||
|
||||
u8 x86_x64_ApicId()
|
||||
{
|
||||
x86_x64_CpuidRegs regs;
|
||||
regs.eax = 1;
|
||||
// note: CPUID function 1 should be available everywhere, but only
|
||||
// processors with an xAPIC (8th generation or above, e.g. P4/Athlon XP)
|
||||
// will return a nonzero value.
|
||||
if(!x86_x64_cpuid(&regs))
|
||||
DEBUG_WARN_ERR(ERR::CPU_FEATURE_MISSING);
|
||||
const u8 apicId = (u8)bits(regs.ebx, 24, 31);
|
||||
return apicId;
|
||||
}
|
||||
|
||||
|
||||
u64 x86_x64_rdtsc()
|
||||
{
|
||||
#if MSC_VERSION
|
||||
return (u64)__rdtsc();
|
||||
#elif GCC_VERSION
|
||||
// GCC supports "portable" assembly for both x86 and x64
|
||||
volatile u32 lo, hi;
|
||||
asm volatile ("rdtsc" : "=a" (lo), "=d" (hi));
|
||||
return u64_from_u32(hi, lo);
|
||||
#endif
|
||||
}
|
||||
|
||||
|
||||
void x86_x64_DebugBreak()
|
||||
{
|
||||
#if MSC_VERSION
|
||||
__debugbreak();
|
||||
#elif GCC_VERSION
|
||||
// note: this probably isn't necessary, since unix_debug_break
|
||||
// (SIGTRAP) is most probably available if GCC_VERSION.
|
||||
// we include it for completeness, though.
|
||||
__asm__ __volatile__ ("int $3");
|
||||
#endif
|
||||
}
|
||||
|
||||
|
||||
// enforce strong memory ordering.
|
||||
void cpu_MemoryFence()
|
||||
{
|
||||
if(x86_x64_cap(X86_X64_CAP_SSE2))
|
||||
_mm_mfence();
|
||||
}
|
||||
|
||||
|
||||
void cpu_Serialize()
|
||||
{
|
||||
x86_x64_CpuidRegs regs;
|
||||
regs.eax = 1;
|
||||
x86_x64_cpuid(&regs); // CPUID serializes execution.
|
||||
}
|
||||
|
||||
|
||||
//-----------------------------------------------------------------------------
|
||||
// CPU frequency
|
||||
|
||||
@ -367,10 +426,8 @@ public:
|
||||
};
|
||||
|
||||
// note: this function uses timer.cpp!timer_Time, which is implemented via
|
||||
// whrt.cpp on Windows, which again calls x86_x64_Init. be careful that
|
||||
// this function isn't called from there as well, else WHRT will be used
|
||||
// before its init completes.
|
||||
double cpu_ClockFrequency()
|
||||
// whrt.cpp on Windows.
|
||||
double x86_x64_ClockFrequency()
|
||||
{
|
||||
// if the TSC isn't available, there's really no good way to count the
|
||||
// actual CPU clocks per known time interval, so bail.
|
||||
@ -447,59 +504,3 @@ double cpu_ClockFrequency()
|
||||
const double clock_frequency = sum / (hi-lo);
|
||||
return clock_frequency;
|
||||
}
|
||||
|
||||
|
||||
//-----------------------------------------------------------------------------
|
||||
// misc stateless functions
|
||||
|
||||
u8 x86_x64_ApicId()
|
||||
{
|
||||
x86_x64_CpuidRegs regs;
|
||||
regs.eax = 1;
|
||||
if(!x86_x64_cpuid(&regs))
|
||||
DEBUG_WARN_ERR(ERR::CPU_FEATURE_MISSING);
|
||||
const u8 apicId = (u8)bits(regs.ebx, 24, 31);
|
||||
return apicId;
|
||||
}
|
||||
|
||||
|
||||
u64 x86_x64_rdtsc()
|
||||
{
|
||||
#if MSC_VERSION
|
||||
return (u64)__rdtsc();
|
||||
#elif GCC_VERSION
|
||||
// GCC supports "portable" assembly for both x86 and x64
|
||||
volatile u32 lo, hi;
|
||||
asm volatile ("rdtsc" : "=a" (lo), "=d" (hi));
|
||||
return u64_from_u32(hi, lo);
|
||||
#endif
|
||||
}
|
||||
|
||||
|
||||
void x86_x64_DebugBreak()
|
||||
{
|
||||
#if MSC_VERSION
|
||||
__debugbreak();
|
||||
#elif GCC_VERSION
|
||||
// note: this probably isn't necessary, since unix_debug_break
|
||||
// (SIGTRAP) is most probably available if GCC_VERSION.
|
||||
// we include it for completeness, though.
|
||||
__asm__ __volatile__ ("int $3");
|
||||
#endif
|
||||
}
|
||||
|
||||
|
||||
// enforce strong memory ordering.
|
||||
void cpu_MemoryFence()
|
||||
{
|
||||
if(x86_x64_cap(X86_X64_CAP_SSE2))
|
||||
_mm_mfence();
|
||||
}
|
||||
|
||||
|
||||
void cpu_Serialize()
|
||||
{
|
||||
x86_x64_CpuidRegs regs;
|
||||
regs.eax = 1;
|
||||
x86_x64_cpuid(&regs); // CPUID serializes execution.
|
||||
}
|
||||
|
@ -100,13 +100,12 @@ LIB_API bool x86_x64_cap(x86_x64_Cap cap);
|
||||
// stateless
|
||||
|
||||
/**
|
||||
* @return APIC ID of the currently executing processor.
|
||||
* @return APIC ID of the currently executing processor or zero if the
|
||||
* platform does not have an xAPIC (i.e. 7th generation x86 or below).
|
||||
*
|
||||
* the implementation uses CPUID.1 and only works on >= 8th generation CPUs;
|
||||
* (P4/Athlon XP); otherwise it returns 0. the alternative of accessing the
|
||||
* APIC mmio registers is not feasible - mahaf_MapPhysicalMemory only works
|
||||
* reliably on WinXP. also, the OS already has the APIC registers mapped and
|
||||
* in constant use, and we don't want to interfere.
|
||||
* rationale: the alternative of accessing the APIC mmio registers is not
|
||||
* feasible - mahaf_MapPhysicalMemory only works reliably on WinXP. we also
|
||||
* don't want to interfere with the OS's constant use of the APIC registers.
|
||||
**/
|
||||
LIB_API u8 x86_x64_ApicId();
|
||||
|
||||
@ -122,4 +121,12 @@ LIB_API u64 x86_x64_rdtsc();
|
||||
**/
|
||||
LIB_API void x86_x64_DebugBreak(void);
|
||||
|
||||
/**
|
||||
* measure the CPU clock frequency via x86_x64_rdtsc and timer_Time.
|
||||
* (it follows that this must not be called from WHRT init.)
|
||||
* this takes several milliseconds (i.e. much longer than
|
||||
* os_cpu_ClockFrequency) but delivers accurate measurements.
|
||||
**/
|
||||
LIB_API double x86_x64_ClockFrequency();
|
||||
|
||||
#endif // #ifndef INCLUDED_X86_X64
|
||||
|
@ -141,7 +141,7 @@ void CNetLogSink::DoSink( const CNetLogEvent& event )
|
||||
//-----------------------------------------------------------------------------
|
||||
void CNetLogSink::DoBulkSink( const CNetLogEvent* pEvents, size_t eventCount )
|
||||
{
|
||||
unsigned* pIndices = NULL;
|
||||
size_t* pIndices = NULL;
|
||||
size_t indexCount = 0;
|
||||
size_t i;
|
||||
|
||||
|
@ -577,7 +577,7 @@ static void InitVfs(const CmdLineArgs& args)
|
||||
// the VFS prevents any accesses to files above this directory.
|
||||
path_SetRoot(args.GetArg0(), "../data");
|
||||
|
||||
g_VFS = CreateVfs();
|
||||
g_VFS = CreateVfs(96*MiB);
|
||||
|
||||
g_VFS->Mount("screenshots/", "screenshots");
|
||||
g_VFS->Mount("config/", "config");
|
||||
|
@ -75,8 +75,9 @@ void WriteSystemInfo()
|
||||
fprintf(f, "OS : %s %s (%s)\n", un.sysname, un.release, un.version);
|
||||
|
||||
// CPU
|
||||
fprintf(f, "CPU : %s, %s (%dx%dx%d)", un.machine, cpu_IdentifierString(), cpu_NumPackages(), cpu_CoresPerPackage(), cpu_LogicalPerCore());
|
||||
const double cpu_freq = cpu_ClockFrequency();
|
||||
const CpuTopology* topology = cpu_topology_Detect();
|
||||
fprintf(f, "CPU : %s, %s (%dx%dx%d)", un.machine, cpu_IdentifierString(), cpu_topology_NumPackages(topology), cpu_topology_CoresPerPackage(topology), cpu_topology_LogicalPerCore(topology));
|
||||
const double cpu_freq = os_cpu_ClockFrequency();
|
||||
if(cpu_freq != 0.0f)
|
||||
{
|
||||
if(cpu_freq < 1e9)
|
||||
|
@ -10,7 +10,7 @@ public:
|
||||
void test_paths()
|
||||
{
|
||||
TS_ASSERT_OK(path_SetRoot(0, "../data"));
|
||||
PIVFS vfs = CreateVfs();
|
||||
PIVFS vfs = CreateVfs(20*MiB);
|
||||
|
||||
TS_ASSERT_OK(vfs->Mount("", "mods/_test.xero"));
|
||||
|
||||
|
Loading…
Reference in New Issue
Block a user