forked from 0ad/0ad

fixes/improvements to lib code from work

- topology.cpp: modify interface due to thread-safety issue. caller is
now responsible for ensuring the first _Detect call isn't reentered;
everything else is safe. (usage sketch below)
- fix thread-safety issue in wnuma; use winit mechanism to ensure it's
ready before use
- VFS now takes a cacheSize parameter (required for being able to
disable read-only file caches for the image loader at work; see sketch below)
- allow dynarray that isn't actually holding memory (sketch below)
- debug_stl: VC9 fix (disable this code except on the exact STL version
on which it was tested)
- zlib, lib_api: changes to macro used to toggle between static and
dynamic linking (illustrated below)
- add boost filesystem header in external_libraries
- amd64: cpu_ topology functions are now provided by x86_x64
- cpu: remove cpu_ClockFrequency (dangerous - it would be tempting to use
during WHRT init, which would cause a crash). use x86_x64_ClockFrequency
or os_cpu_ClockFrequency instead (see below).
- werrno: cope with newer boost versions
- wmman: follow SUSv3 in rejecting zero-length mappings (example below)
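
Usage sketch for the new topology interface (signatures as in the
topology.h diff below). Only the first call must be serialized by the
caller - e.g. via a lock, as the tsc.cpp hunk does with WinScopedLock:

// sketch: ensure the first _Detect call isn't reentered, then read freely
const CpuTopology* topology = cpu_topology_Detect();
const size_t numPackages = cpu_topology_NumPackages(topology);
const size_t coresPerPackage = cpu_topology_CoresPerPackage(topology);
const size_t logicalPerCore = cpu_topology_LogicalPerCore(topology);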
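
How the new cacheSize parameter is used (per the vfs.cpp/vfs.h and main.cpp
hunks below). Reads larger than cacheSize bypass the cache entirely, so a
zero cacheSize - presumably what the image-loader use case passes - disables
read-only caching altogether:

PIVFS vfs = CreateVfs(96*MiB); // the game's default, formerly hard-wired inside VFS
PIVFS uncachedVfs = CreateVfs(0); // hypothetical: every nonzero-size file bypasses the cache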
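
What the dynarray change permits - a sketch of the zero-size case that the
cache-disabling scenario relies on:

DynArray da;
LibError ret = da_alloc(&da, 0); // now succeeds: mem_Reserve is skipped, da.base stays 0
// ... the resulting dynarray is valid but never holds memory ...
ret = da_free(&da); // also fine: mem_Release is skipped because size_pa is 0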
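
Illustration of the renamed link-mode toggles (zlib.h and lib_api.h hunks
below); dynamic linking is now the default, and the defines - normally set
by the build system, shown inline here only for illustration - select the
static libraries:

#define LIB_STATIC_LINK // otherwise LIB_API uses dllimport and auto-links lib.lib / lib_d.lib
#define ZLIB_STATIC // otherwise ZLIB_DLL is defined and zlib1.lib / zlib1d.lib is auto-linked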
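
The two sanctioned replacements for cpu_ClockFrequency, as spelled out by
the comments in the tsc.cpp and x86_x64.h hunks below:

const double roughHz = os_cpu_ClockFrequency(); // fast OS query; safe even during WHRT init
const double exactHz = x86_x64_ClockFrequency(); // accurate rdtsc/timer_Time measurement;
// takes several milliseconds and must not be called from WHRT init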
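
Resulting mmap behavior (per the wmman.cpp hunk below; fd stands for an
already-open file descriptor):

void* p = mmap(0, 0, PROT_READ, MAP_PRIVATE, fd, 0);
// now fails as SUSv3 requires: returns MAP_FAILED and sets errno = EINVAL
// (debug builds additionally trigger a debug_assert)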

This was SVN commit r5954.
janwas 2008-05-13 19:43:02 +00:00
parent bafc8d0cfa
commit 04127c7af3
37 changed files with 520 additions and 412 deletions

View File

@@ -45,7 +45,7 @@ class TestMeshManager : public CxxTest::TestSuite
 TS_ASSERT(fs::create_directory(MOD_PATH.external_directory_string()));
 TS_ASSERT(fs::create_directory(CACHE_PATH.external_directory_string()));
-g_VFS = CreateVfs();
+g_VFS = CreateVfs(20*MiB);
 TS_ASSERT_OK(g_VFS->Mount("", MOD_PATH));
 TS_ASSERT_OK(g_VFS->Mount("collada/", "tests/collada"));

View File

@@ -25,14 +25,15 @@ static LibError validate_da(DynArray* da)
 {
 if(!da)
 WARN_RETURN(ERR::INVALID_PARAM);
-u8* const base = da->base;
+// u8* const base = da->base;
 const size_t max_size_pa = da->max_size_pa;
 const size_t cur_size = da->cur_size;
 const size_t pos = da->pos;
 const int prot = da->prot;
-if(debug_is_pointer_bogus(base))
-WARN_RETURN(ERR::_1);
+// note: this happens if max_size == 0
+// if(debug_is_pointer_bogus(base))
+// WARN_RETURN(ERR::_1);
 // note: don't check if base is page-aligned -
 // might not be true for 'wrapped' mem regions.
 // if(!mem_IsPageMultiple((uintptr_t)base))
@@ -56,8 +57,9 @@ LibError da_alloc(DynArray* da, size_t max_size)
 {
 const size_t max_size_pa = mem_RoundUpToPage(max_size);
-u8* p;
-RETURN_ERR(mem_Reserve(max_size_pa, &p));
+u8* p = 0;
+if(max_size_pa) // (avoid mmap failure)
+RETURN_ERR(mem_Reserve(max_size_pa, &p));
 da->base = p;
 da->max_size_pa = max_size_pa;
@@ -85,7 +87,7 @@ LibError da_free(DynArray* da)
 // skip mem_Release if <da> was allocated via da_wrap_fixed
 // (i.e. it doesn't actually own any memory). don't complain;
 // da_free is supposed to be called even in the above case.
-if(!was_wrapped)
+if(!was_wrapped && size_pa)
 RETURN_ERR(mem_Release(p, size_pa));
 return INFO::OK;
 }

View File

@@ -211,7 +211,7 @@ struct ContainerBase : public Container
 struct Any_deque : public ContainerBase<std::deque<int> >
 {
-#if STL_DINKUMWARE
+#if STL_DINKUMWARE == 405
 bool IsValid(size_t el_size) const
 {
@@ -277,7 +277,7 @@ struct Any_list : public ContainerBase<std::list<int> >
 };
-#if STL_DINKUMWARE
+#if STL_DINKUMWARE == 405
 template<class _Traits>
 struct Any_tree : public std::_Tree<_Traits>
@@ -385,7 +385,7 @@ struct Any_vector: public ContainerBase<std::vector<int> >
 return true;
 }
-#if STL_DINKUMWARE
+#if STL_DINKUMWARE == 405
 size_t NumElements(size_t el_size) const
 {
@@ -416,7 +416,7 @@ struct Any_vector: public ContainerBase<std::vector<int> >
 };
-#if STL_DINKUMWARE
+#if STL_DINKUMWARE == 405
 struct Any_basic_string : public ContainerBase<std::string>
 {
@@ -461,7 +461,7 @@ struct Any_stack : public Any_deque
 struct Any_hash_map: public ContainerBase<STL_HASH_MAP<int,int> >
 {
-#if STL_DINKUMWARE
+#if STL_DINKUMWARE == 405
 bool IsValid(size_t el_size) const
 {
@@ -482,7 +482,7 @@ struct Any_hash_multimap : public Any_hash_map
 struct Any_hash_set: public ContainerBase<STL_HASH_SET<int> >
 {
-#if STL_DINKUMWARE
+#if STL_DINKUMWARE == 405
 bool IsValid(size_t el_size) const
 {
@@ -610,7 +610,7 @@ LibError debug_stl_get_container_info(const char* type_name, const u8* p, size_t
 STD_CONTAINER(deque)
 STD_CONTAINER(list)
 STD_CONTAINER(vector)
-#if STL_DINKUMWARE
+#if STL_DINKUMWARE == 405
 STD_CONTAINER(map)
 STD_CONTAINER(multimap)
 STD_CONTAINER(set)

View File

@@ -0,0 +1,26 @@
+/**
+* =========================================================================
+* File : boost_filesystem.h
+* Project : 0 A.D.
+* Description : bring in Boost filesystem library
+* =========================================================================
+*/
+// license: GPL; see lib/license.txt
+#ifndef INCLUDED_BOOST_FILESYSTEM
+#define INCLUDED_BOOST_FILESYSTEM
+// not W4-clean
+#if MSC_VERSION
+# pragma warning(push, 3)
+#endif
+#include "boost/filesystem.hpp"
+namespace fs = boost::filesystem;
+#if MSC_VERSION
+# pragma warning(pop)
+#endif
+#endif // #ifndef INCLUDED_BOOST_FILESYSTEM

View File

@@ -20,7 +20,7 @@
 # define WINAPIV __cdecl
 #endif
-#ifndef FOM_ZLIB
+#ifndef ZLIB_STATIC
 #define ZLIB_DLL
 #endif
@@ -28,18 +28,10 @@
 // automatically link against the required library
 #if MSC_VERSION
-# ifdef FOM_ZLIB
-# ifdef NDEBUG
-# pragma comment(lib, "fom_zlib.lib")
-# else
-# pragma comment(lib, "fom_zlib_d.lib")
-# endif
+# ifdef NDEBUG
+# pragma comment(lib, "zlib1.lib")
 # else
-# ifdef NDEBUG
-# pragma comment(lib, "zlib1.lib")
-# else
-# pragma comment(lib, "zlib1d.lib")
-# endif
+# pragma comment(lib, "zlib1d.lib")
 # endif
 #endif

View File

@@ -25,8 +25,8 @@
 class VFS : public IVFS
 {
 public:
-VFS()
-: m_fileCache(ChooseCacheSize())
+VFS(size_t cacheSize)
+: m_cacheSize(cacheSize), m_fileCache(m_cacheSize)
 , m_trace(CreateTrace(4*MiB))
 {
 }
@@ -106,7 +106,7 @@ public:
 // safely handle zero-length files
 if(!size)
 fileContents = DummySharedPtr((u8*)0);
-else if(size > ChooseCacheSize())
+else if(size > m_cacheSize)
 {
 fileContents = io_Allocate(size);
 RETURN_ERR(file->Load(fileContents));
@@ -152,19 +152,15 @@
 }
 private:
-static size_t ChooseCacheSize()
-{
-return 96*MiB;
-}
-mutable VfsDirectory m_rootDirectory;
+size_t m_cacheSize;
 FileCache m_fileCache;
 PITrace m_trace;
+mutable VfsDirectory m_rootDirectory;
 };
 //-----------------------------------------------------------------------------
-PIVFS CreateVfs()
+PIVFS CreateVfs(size_t cacheSize)
 {
-return PIVFS(new VFS);
+return PIVFS(new VFS(cacheSize));
 }

View File

@@ -100,6 +100,6 @@ struct IVFS
 };
 typedef shared_ptr<IVFS> PIVFS;
-LIB_API PIVFS CreateVfs();
+LIB_API PIVFS CreateVfs(size_t cacheSize);
 #endif // #ifndef INCLUDED_VFS

View File

@@ -3,22 +3,19 @@
 // note: EXTERN_C cannot be used because shared_ptr is often returned
 // by value, which requires C++ linkage.
-#ifdef LIB_DLL
+#ifdef LIB_STATIC_LINK
+# define LIB_API
+#else
 # ifdef LIB_BUILD
 # define LIB_API __declspec(dllexport)
 # else
 # define LIB_API __declspec(dllimport)
-# endif
-#else
-# define LIB_API
-#endif
-#if defined(LIB_DLL) && !defined(LIB_BUILD)
-# if MSC_VERSION
-# ifdef NDEBUG
-# pragma comment(lib, "lib.lib")
-# else
-# pragma comment(lib, "lib_d.lib")
+# if MSC_VERSION
+# ifdef NDEBUG
+# pragma comment(lib, "lib.lib")
+# else
+# pragma comment(lib, "lib_d.lib")
+# endif
 # endif
 # endif
 #endif

View File

@@ -56,19 +56,18 @@
 #include "lib/code_annotation.h"
 // Boost
+// .. if this package isn't going to be statically linked, we're better off
+// using Boost via DLL. (otherwise, we would have to ensure the exact same
+// compiler is used, which is a pain because MSC8, MSC9 and ICC 10 are in use)
+#ifndef LIB_STATIC_LINK
+# define BOOST_ALL_DYN_LINK
+#endif
 #include <boost/utility.hpp> // noncopyable
 #include <boost/shared_array.hpp>
 #include <boost/shared_ptr.hpp>
 #include <boost/scoped_ptr.hpp>
-#if MSC_VERSION
-# pragma warning(push, 3) // filesystem isn't W4-clean
-#endif
-#include <boost/filesystem.hpp>
-#if MSC_VERSION
-# pragma warning(pop)
-#endif
 using boost::shared_ptr; // has been added to TR1
-namespace fs = boost::filesystem;
+#include "lib/external_libraries/boost_filesystem.h"
 // (this must come after boost and common lib headers)
 #include "lib/posix/posix.h"

View File

@@ -26,16 +26,3 @@ void cpu_AtomicAdd(volatile intptr_t* location, intptr_t increment)
 }
 #endif
-// note: ACPI processor detection not yet implemented here, so we treat
-// dual-core systems as multiprocessors.
-size_t cpu_NumPackages()
-{
-return cpu_NumProcessors();
-}
-size_t cpu_CoresPerPackage()
-{
-return 1;
-}

View File

@@ -28,16 +28,6 @@ namespace ERR
 **/
 LIB_API const char* cpu_IdentifierString();
-/**
-* @return a rough estimate of the CPU clock frequency.
-*
-* note: the accuracy of this value is not important. while it is used by
-* the TSC timing backend, thermal drift is an issue that requires
-* continual recalibration anyway, which makes the initial accuracy moot.
-* querying frequency via OS is also much faster than ia32's measurement loop.
-**/
-LIB_API double cpu_ClockFrequency();
 //-----------------------------------------------------------------------------
 // lock-free support routines

View File

@@ -11,7 +11,7 @@ to add the necessary parts to that generated manifest.
 ICC 10.1 IPO considers this string to be an input file, hence this
 is currently disabled there.
 */
-#if MSC_VERSION >= 1400 && !ICC_VERSION
+#if MSC_VERSION >= 1400 && !ICC_VERSION && defined(LIB_STATIC_LINK)
 # if ARCH_IA32
 # pragma comment(linker, "\"/manifestdependency:type='win32' name='Microsoft.Windows.Common-Controls' version='6.0.0.0' processorArchitecture='X86' publicKeyToken='6595b64144ccf1df'\"")
 # elif ARCH_AMD64

View File

@@ -46,7 +46,7 @@ public:
 /**
 * initial measurement of the tick rate. not necessarily correct
-* (e.g. when using TSC: cpu_ClockFrequency isn't exact).
+* (e.g. when using TSC: os_cpu_ClockFrequency isn't exact).
 **/
 virtual double NominalFrequency() const = 0;

View File

@@ -113,7 +113,7 @@ size_t CounterHPET::CounterBits() const
 /**
 * initial measurement of the tick rate. not necessarily correct
-* (e.g. when using TSC: cpu_ClockFrequency isn't exact).
+* (e.g. when using TSC: os_cpu_ClockFrequency isn't exact).
 **/
 double CounterHPET::NominalFrequency() const
 {

View File

@@ -42,7 +42,7 @@ public:
 /**
 * initial measurement of the tick rate. not necessarily correct
-* (e.g. when using TSC: cpu_ClockFrequency isn't exact).
+* (e.g. when using TSC: os_cpu_ClockFrequency isn't exact).
 **/
 virtual double NominalFrequency() const;

View File

@@ -84,7 +84,7 @@ size_t CounterPMT::CounterBits() const
 /**
 * initial measurement of the tick rate. not necessarily correct
-* (e.g. when using TSC: cpu_ClockFrequency isn't exact).
+* (e.g. when using TSC: os_cpu_ClockFrequency isn't exact).
 **/
 double CounterPMT::NominalFrequency() const
 {

View File

@@ -43,7 +43,7 @@ public:
 /**
 * initial measurement of the tick rate. not necessarily correct
-* (e.g. when using TSC: cpu_ClockFrequency isn't exact).
+* (e.g. when using TSC: os_cpu_ClockFrequency isn't exact).
 **/
 virtual double NominalFrequency() const;

View File

@@ -11,7 +11,7 @@
 #include "precompiled.h"
 #include "qpc.h"
-#include "lib/sysdep/cpu.h"
+#include "lib/sysdep/os_cpu.h"
 #include "lib/sysdep/win/win.h"
 #include "lib/sysdep/win/wutil.h" // wutil_argv
 #include "pit.h" // PIT_FREQ
@@ -65,10 +65,10 @@ bool CounterQPC::IsSafe() const
 // used on MP HAL systems and can be detected by comparing QPF with the
 // CPU clock. we consider it unsafe unless the user promises (via
 // command line) that it's patched and thus reliable on their system.
-bool usesTsc = IsSimilarMagnitude(m_frequency, cpu_ClockFrequency());
+bool usesTsc = IsSimilarMagnitude(m_frequency, os_cpu_ClockFrequency());
 // unconfirmed reports indicate QPC sometimes uses 1/3 of the
 // CPU clock frequency, so check that as well.
-usesTsc |= IsSimilarMagnitude(m_frequency, cpu_ClockFrequency()/3);
+usesTsc |= IsSimilarMagnitude(m_frequency, os_cpu_ClockFrequency()/3);
 if(usesTsc)
 {
 const bool isTscSafe = wutil_HasCommandLineArgument("-wQpcTscSafe");
@@ -108,7 +108,7 @@ size_t CounterQPC::CounterBits() const
 /**
 * initial measurement of the tick rate. not necessarily correct
-* (e.g. when using TSC: cpu_ClockFrequency isn't exact).
+* (e.g. when using TSC: os_cpu_ClockFrequency isn't exact).
 **/
 double CounterQPC::NominalFrequency() const
 {

View File

@@ -41,7 +41,7 @@ public:
 /**
 * initial measurement of the tick rate. not necessarily correct
-* (e.g. when using TSC: cpu_ClockFrequency isn't exact).
+* (e.g. when using TSC: os_cpu_ClockFrequency isn't exact).
 **/
 virtual double NominalFrequency() const;

View File

@@ -69,7 +69,7 @@ size_t CounterTGT::CounterBits() const
 /**
 * initial measurement of the tick rate. not necessarily correct
-* (e.g. when using TSC: cpu_ClockFrequency isn't exact).
+* (e.g. when using TSC: os_cpu_ClockFrequency isn't exact).
 **/
 double CounterTGT::NominalFrequency() const
 {

View File

@@ -36,7 +36,7 @@ public:
 /**
 * initial measurement of the tick rate. not necessarily correct
-* (e.g. when using TSC: cpu_ClockFrequency isn't exact).
+* (e.g. when using TSC: os_cpu_ClockFrequency isn't exact).
 **/
 virtual double NominalFrequency() const;

View File

@@ -11,9 +11,10 @@
 #include "precompiled.h"
 #include "tsc.h"
-#include "lib/sysdep/cpu.h"
-#include "lib/sysdep/win/win.h"
 #include "lib/bits.h"
+#include "lib/sysdep/os_cpu.h"
+#include "lib/sysdep/win/win.h"
 #include "lib/sysdep/win/wutil.h"
 #if ARCH_IA32 || ARCH_AMD64
 # include "lib/sysdep/x86_x64/x86_x64.h" // x86_x64_rdtsc
@@ -96,8 +97,12 @@ bool CounterTSC::IsSafe() const
 // per-core counter state and the abovementioned race condition.
 // however, we won't bother, since such platforms aren't yet widespread
 // and would surely support the nice and safe HPET, anyway)
-if(cpu_NumPackages() != 1 || cpu_CoresPerPackage() != 1)
-return false;
+{
+WinScopedLock lock(WHRT_CS);
+const CpuTopology* topology = cpu_topology_Detect();
+if(cpu_topology_NumPackages(topology) != 1 || cpu_topology_CoresPerPackage(topology) != 1)
+return false;
+}
 #if ARCH_IA32 || ARCH_AMD64
 // recent CPU:
@@ -154,9 +159,16 @@ size_t CounterTSC::CounterBits() const
 /**
 * initial measurement of the tick rate. not necessarily correct
-* (e.g. when using TSC: cpu_ClockFrequency isn't exact).
+* (e.g. when using TSC: os_cpu_ClockFrequency isn't exact).
 **/
 double CounterTSC::NominalFrequency() const
 {
-return cpu_ClockFrequency();
+// WARNING: do not call x86_x64_ClockFrequency because it uses the
+// HRT, which we're currently in the process of initializing.
+// instead query CPU clock frequency via OS.
+//
+// note: even here, initial accuracy isn't critical because the
+// clock is subject to thermal drift and would require continual
+// recalibration anyway.
+return os_cpu_ClockFrequency();
 }

View File

@@ -36,7 +36,7 @@ public:
 /**
 * initial measurement of the tick rate. not necessarily correct
-* (e.g. when using TSC: cpu_ClockFrequency isn't exact).
+* (e.g. when using TSC: os_cpu_ClockFrequency isn't exact).
 **/
 virtual double NominalFrequency() const;
 };

View File

@@ -7,107 +7,113 @@
 #include "win.h"
 #include "wutil.h"
 #include "wcpu.h"
+#include "winit.h"
 #include <Psapi.h>
-#ifdef _OPENMP
-# include <omp.h>
-#endif
+WINIT_REGISTER_EARLY_INIT(wnuma_Init);
 //-----------------------------------------------------------------------------
 // node topology
 //-----------------------------------------------------------------------------
-size_t numa_NumNodes()
+static size_t NumNodes()
 {
-static size_t numNodes;
-if(!numNodes)
+typedef BOOL (WINAPI *PGetNumaHighestNodeNumber)(PULONG highestNode);
+const HMODULE hKernel32 = GetModuleHandle("kernel32.dll");
+const PGetNumaHighestNodeNumber pGetNumaHighestNodeNumber = (PGetNumaHighestNodeNumber)GetProcAddress(hKernel32, "GetNumaHighestNodeNumber");
+if(pGetNumaHighestNodeNumber)
 {
-typedef BOOL (WINAPI *PGetNumaHighestNodeNumber)(PULONG highestNode);
-const HMODULE hKernel32 = GetModuleHandle("kernel32.dll");
-const PGetNumaHighestNodeNumber pGetNumaHighestNodeNumber = (PGetNumaHighestNodeNumber)GetProcAddress(hKernel32, "GetNumaHighestNodeNumber");
-if(pGetNumaHighestNodeNumber)
-{
-ULONG highestNode;
-const BOOL ok = pGetNumaHighestNodeNumber(&highestNode);
-debug_assert(ok);
-debug_assert(highestNode < os_cpu_NumProcessors()); // #nodes <= #processors
-numNodes = highestNode+1;
-}
-// NUMA not supported
-else
-numNodes = 1;
+ULONG highestNode;
+const BOOL ok = pGetNumaHighestNodeNumber(&highestNode);
+debug_assert(ok);
+debug_assert(highestNode < os_cpu_NumProcessors()); // #nodes <= #processors
+return highestNode+1;
 }
-return numNodes;
+// NUMA not supported
+else
+return 1;
 }
-// note: it is easier to implement this in terms of numa_ProcessorMaskFromNode
+static void FillNodesProcessorMask(uintptr_t* nodesProcessorMask)
+{
+typedef BOOL (WINAPI *PGetNumaNodeProcessorMask)(UCHAR node, PULONGLONG affinity);
+const HMODULE hKernel32 = GetModuleHandle("kernel32.dll");
+const PGetNumaNodeProcessorMask pGetNumaNodeProcessorMask = (PGetNumaNodeProcessorMask)GetProcAddress(hKernel32, "GetNumaNodeProcessorMask");
+if(pGetNumaNodeProcessorMask)
+{
+DWORD_PTR processAffinity, systemAffinity;
+const BOOL ok = GetProcessAffinityMask(GetCurrentProcess(), &processAffinity, &systemAffinity);
+debug_assert(ok);
+for(size_t node = 0; node < numa_NumNodes(); node++)
+{
+ULONGLONG affinity;
+const BOOL ok = pGetNumaNodeProcessorMask((UCHAR)node, &affinity);
+debug_assert(ok);
+const uintptr_t processorMask = wcpu_ProcessorMaskFromAffinity(processAffinity, (DWORD_PTR)affinity);
+nodesProcessorMask[node] = processorMask;
+}
+}
+// NUMA not supported - consider node 0 to consist of all system processors
+else
+nodesProcessorMask[0] = os_cpu_ProcessorMask();
+}
+// note: it is easier to implement this in terms of nodesProcessorMask
 // rather than the other way around because wcpu provides the
 // wcpu_ProcessorMaskFromAffinity helper. there is no similar function to
 // convert processor to processorNumber.
-size_t numa_NodeFromProcessor(size_t processor)
+static void FillProcessorsNode(size_t numNodes, const uintptr_t* nodesProcessorMask, size_t* processorsNode)
 {
-debug_assert(processor < os_cpu_NumProcessors());
-static std::vector<size_t> processorsNode;
-#ifdef _OPENMP
-#pragma omp critical
-#endif
-if(processorsNode.empty())
+for(size_t node = 0; node < numNodes; node++)
 {
-processorsNode.resize(os_cpu_NumProcessors(), 0);
-for(size_t node = 0; node < numa_NumNodes(); node++)
+const uintptr_t processorMask = nodesProcessorMask[node];
+for(size_t processor = 0; processor < os_cpu_NumProcessors(); processor++)
 {
-const uintptr_t processorMask = numa_ProcessorMaskFromNode(node);
-for(size_t processor = 0; processor < os_cpu_NumProcessors(); processor++)
-{
-if(IsBitSet(processorMask, processor))
-processorsNode[processor] = node;
-}
+if(IsBitSet(processorMask, processor))
+processorsNode[processor] = node;
 }
 }
-return processorsNode.at(processor);
 }
 //-----------------------------------------------------------------------------
+// node topology interface
+struct NodeTopology // POD
+{
+size_t numNodes;
+size_t processorsNode[os_cpu_MaxProcessors];
+uintptr_t nodesProcessorMask[os_cpu_MaxProcessors];
+};
+static NodeTopology s_nodeTopology;
+static void DetectNodeTopology()
+{
+s_nodeTopology.numNodes = NumNodes();
+FillNodesProcessorMask(s_nodeTopology.nodesProcessorMask);
+FillProcessorsNode(s_nodeTopology.numNodes, s_nodeTopology.nodesProcessorMask, s_nodeTopology.processorsNode);
+}
+size_t numa_NumNodes()
+{
+return s_nodeTopology.numNodes;
+}
+size_t numa_NodeFromProcessor(size_t processor)
+{
+debug_assert(processor < os_cpu_NumProcessors());
+return s_nodeTopology.processorsNode[processor];
+}
 uintptr_t numa_ProcessorMaskFromNode(size_t node)
 {
-debug_assert(node < numa_NumNodes());
-static std::vector<uintptr_t> nodesProcessorMask;
-#ifdef _OPENMP
-#pragma omp critical
-#endif
-if(nodesProcessorMask.empty())
-{
-typedef BOOL (WINAPI *PGetNumaNodeProcessorMask)(UCHAR node, PULONGLONG affinity);
-const HMODULE hKernel32 = GetModuleHandle("kernel32.dll");
-const PGetNumaNodeProcessorMask pGetNumaNodeProcessorMask = (PGetNumaNodeProcessorMask)GetProcAddress(hKernel32, "GetNumaNodeProcessorMask");
-if(pGetNumaNodeProcessorMask)
-{
-DWORD_PTR processAffinity, systemAffinity;
-const BOOL ok = GetProcessAffinityMask(GetCurrentProcess(), &processAffinity, &systemAffinity);
-debug_assert(ok);
-for(size_t node = 0; node < numa_NumNodes(); node++)
-{
-ULONGLONG affinity;
-const BOOL ok = pGetNumaNodeProcessorMask((UCHAR)node, &affinity);
-debug_assert(ok);
-const uintptr_t processorMask = wcpu_ProcessorMaskFromAffinity(processAffinity, (DWORD_PTR)affinity);
-nodesProcessorMask.push_back(processorMask);
-}
-}
-// NUMA not supported - consider node 0 to consist of all system processors
-else
-nodesProcessorMask.push_back(os_cpu_ProcessorMask());
-}
-return nodesProcessorMask.at(node);
+debug_assert(node < s_nodeTopology.numNodes);
+return s_nodeTopology.nodesProcessorMask[node];
 }
@@ -145,16 +151,10 @@ size_t numa_AvailableMemory(size_t node)
 double numa_Factor()
 {
+WinScopedLock lock(WNUMA_CS);
 static double factor;
-static bool initialized;
-#ifdef _OPENMP
-#pragma omp critical
-#endif
-if(!initialized)
+if(factor == 0.0)
 {
-initialized = true;
 // if non-NUMA, skip the (expensive) measurements below.
 if(numa_NumNodes() == 1)
 factor = 1.0;
@@ -357,3 +357,12 @@ void numa_Deallocate(void* mem)
 {
 VirtualFree(mem, 0, MEM_RELEASE);
 }
+//-----------------------------------------------------------------------------
+static LibError wnuma_Init()
+{
+DetectNodeTopology();
+return INFO::OK;
+}

View File

@@ -58,6 +58,8 @@
 // defined by winsock2 and also Linux (with different values)
 // (values derived from winsock2 WSA* constants minus WSABASEERR)
+// update: disabled on newer Boost versions because filesystem drags in boost/cerrno.hpp
+#if BOOST_VERSION <= 103401
 #define EWOULDBLOCK 35
 #define EINPROGRESS 36
 #define EALREADY 37
@@ -84,6 +86,7 @@
 #define EHOSTUNREACH 65
 #define EDQUOT 69
 #define ESTALE 70
+#endif
 // defined by winsock2 but not Linux
 // (commented out because they're not portable)
View File

@@ -38,7 +38,7 @@ static DWORD win32_prot(int prot)
 NODEFAULT;
 }
-UNREACHABLE
+return 0; // UNREACHABLE
 }
@@ -176,6 +176,13 @@ static LibError mmap_file(void* start, size_t len, int prot, int flags, int fd,
 void* mmap(void* start, size_t len, int prot, int flags, int fd, off_t ofs)
 {
+if(len == 0) // POSIX says this must cause mmap to fail
+{
+debug_assert(0);
+errno = EINVAL;
+return MAP_FAILED;
+}
 void* p;
 LibError err;
 if(flags & MAP_ANONYMOUS)

View File

@@ -330,7 +330,7 @@ C++ classes. this way is more reliable/documented, but has several drawbacks:
 */
-#ifndef LIB_DLL
+#ifdef LIB_STATIC_LINK
 EXTERN_C int mainCRTStartup();

View File

@@ -413,7 +413,7 @@ WinScopedDisableWow64Redirection::~WinScopedDisableWow64Redirection()
 //-----------------------------------------------------------------------------
 // module handle
-#ifdef LIB_DLL
+#ifndef LIB_STATIC_LINK
 HMODULE wutil_LibModuleHandle;

View File

@@ -35,6 +35,8 @@ enum WinLockId
 {
 WAIO_CS,
 WDBG_SYM_CS, // protects (non-reentrant) dbghelp.dll
+WHRT_CS,
+WNUMA_CS,
 NUM_CS
 };

View File

@@ -16,6 +16,7 @@
 #include "lib/sysdep/os_cpu.h"
 #include "x86_x64.h"
 //-----------------------------------------------------------------------------
 // detect *maximum* number of cores/packages/caches.
 // note: some of them may be disabled by the OS or BIOS.
@@ -143,20 +144,22 @@ static size_t LogicalPerCache()
 /**
 * @return an array of the processors' unique APIC IDs or zero if
-* no APIC is present or process affinity is limited.
+* no xAPIC is present or process affinity is limited.
 **/
 static const u8* ApicIds()
 {
-static u8 apicIdStorage[os_cpu_MaxProcessors];
-static const u8* apicIds;
+const u8* const uninitialized = (const u8*)1;
+static const u8* apicIds = uninitialized;
-static volatile uintptr_t initialized = 0;
-if(cpu_CAS(&initialized, 0, 1))
+if(apicIds == uninitialized)
 {
-// requires 'new' APIC (see x86_x64_ApicId for details)
+apicIds = 0; // return zero from now on unless the below succeeds
+// requires xAPIC (see x86_x64_ApicId for details)
 if(x86_x64_Generation() >= 8)
 {
 // store each processor's APIC ID in turn
+static u8 apicIdStorage[os_cpu_MaxProcessors];
 struct StoreApicId
 {
 static void Callback(size_t processor, uintptr_t UNUSED(cbData))
@@ -200,89 +203,109 @@ static size_t NumUniqueValuesInField(const u8* apicIds, size_t offset, size_t num
 }
-size_t cpu_NumPackages()
+static size_t NumPackages(const u8* apicIds)
 {
-static size_t numPackages = 0;
-if(!numPackages)
+if(apicIds)
 {
-const u8* apicIds = ApicIds();
-if(apicIds)
-{
-const size_t offset = ceil_log2(CoresPerPackage()) + ceil_log2(LogicalPerCore());
-const size_t numBits = 8;
-numPackages = NumUniqueValuesInField(apicIds, offset, numBits);
-}
-else
-{
-// note: correct results cannot be guaranteed because unreported
-// and disable logical units are indistinguishable. the below
-// assumptions are reasonable because we care most about packages
-// (i.e. whether the system is truly SMP). in contrast, it is
-// safe to overestimate the number of cores because that
-// only determines if memory barriers are needed or not.
-// note: requiring modern processors featuring an APIC does not
-// prevent this from being reached (the cause may be lack of
-// OS support or restricted process affinity).
-// assume cores are enabled and count as processors.
-const size_t numPackagesTimesLogical = os_cpu_NumProcessors() / CoresPerPackage();
-debug_assert(numPackagesTimesLogical != 0);
-// assume hyperthreads are enabled; check if they count as processors.
-if(numPackagesTimesLogical > LogicalPerCore())
-numPackages = numPackagesTimesLogical / LogicalPerCore();
-}
+const size_t offset = ceil_log2(CoresPerPackage()) + ceil_log2(LogicalPerCore());
+const size_t numBits = 8;
+return NumUniqueValuesInField(apicIds, offset, numBits);
 }
+else
+{
+// note: correct results cannot be guaranteed because unreported
+// and disable logical units are indistinguishable. the below
+// assumptions are reasonable because we care most about packages
+// (i.e. whether the system is truly SMP). in contrast, it is
+// safe to overestimate the number of cores because that
+// only determines if memory barriers are needed or not.
+// note: requiring modern processors featuring an APIC does not
+// prevent this from being reached (the cause may be lack of
+// OS support or restricted process affinity).
-return numPackages;
+// assume cores are enabled and count as processors.
+const size_t numPackagesTimesLogical = os_cpu_NumProcessors() / CoresPerPackage();
+debug_assert(numPackagesTimesLogical != 0);
+// assume hyperthreads are enabled.
+size_t numPackages = numPackagesTimesLogical;
+// if they are reported as processors, remove them from the count.
+if(numPackages > LogicalPerCore())
+numPackages /= LogicalPerCore();
+return numPackages;
+}
 }
-size_t cpu_CoresPerPackage()
+static size_t CoresPerPackage(const u8* apicIds)
 {
-static size_t enabledCoresPerPackage;
-if(!enabledCoresPerPackage)
+if(apicIds)
 {
-const u8* apicIds = ApicIds();
-if(apicIds)
-{
-const size_t offset = ceil_log2(LogicalPerCore());
-const size_t numBits = ceil_log2(CoresPerPackage());
-enabledCoresPerPackage = NumUniqueValuesInField(apicIds, offset, numBits);
-}
-else
-{
-// guess (must match cpu_NumPackages's assumptions)
-enabledCoresPerPackage = CoresPerPackage();
-}
+const size_t offset = ceil_log2(LogicalPerCore());
+const size_t numBits = ceil_log2(CoresPerPackage());
+return NumUniqueValuesInField(apicIds, offset, numBits);
 }
+else
+{
+// guess (must match NumPackages's assumptions)
+return CoresPerPackage();
+}
-return enabledCoresPerPackage;
 }
-size_t cpu_LogicalPerCore()
+static size_t LogicalPerCore(const u8* apicIds)
 {
-static size_t enabledLogicalPerCore;
+if(apicIds)
+{
+const size_t offset = 0;
+const size_t numBits = ceil_log2(LogicalPerCore());
+return NumUniqueValuesInField(apicIds, offset, numBits);
+}
+else
+{
+// guess (must match NumPackages's assumptions)
+return LogicalPerCore();
+}
+}
-if(!enabledLogicalPerCore)
+//-----------------------------------------------------------------------------
+// CPU topology interface
+struct CpuTopology // POD
+{
+size_t numPackages;
+size_t coresPerPackage;
+size_t logicalPerCore;
+};
+const CpuTopology* cpu_topology_Detect()
+{
+static CpuTopology topology;
+if(!topology.numPackages)
 {
 const u8* apicIds = ApicIds();
-if(apicIds)
-{
-const size_t offset = 0;
-const size_t numBits = ceil_log2(LogicalPerCore());
-enabledLogicalPerCore = NumUniqueValuesInField(apicIds, offset, numBits);
-}
-else
-{
-// guess (must match cpu_NumPackages's assumptions)
-enabledLogicalPerCore = LogicalPerCore();
-}
+topology.numPackages = NumPackages(apicIds);
+topology.coresPerPackage = CoresPerPackage(apicIds);
+topology.logicalPerCore = LogicalPerCore(apicIds);
 }
-return enabledLogicalPerCore;
+return &topology;
 }
+size_t cpu_topology_NumPackages(const CpuTopology* topology)
+{
+return topology->numPackages;
+}
+size_t cpu_topology_CoresPerPackage(const CpuTopology* topology)
+{
+return topology->coresPerPackage;
+}
+size_t cpu_topology_LogicalPerCore(const CpuTopology* topology)
+{
+return topology->logicalPerCore;
+}
@@ -293,29 +316,22 @@ size_t cpu_LogicalPerCore()
 // functionality but returns incorrect results. (it claims all cores in
 // an Intel Core2 Quad processor share a single L2 cache.)
-size_t cpu_NumCaches()
+static size_t NumCaches(const u8* apicIds)
 {
-static size_t numCaches;
-if(!numCaches)
+if(apicIds)
 {
-const u8* apicIds = ApicIds();
-if(apicIds)
-{
-const size_t offset = 0;
-const size_t numBits = ceil_log2(LogicalPerCache());
-numCaches = NumUniqueValuesInField(apicIds, offset, numBits);
-}
-else
-{
-// assume each processor has its own cache
-numCaches = os_cpu_NumProcessors();
-}
+const size_t offset = 0;
+const size_t numBits = ceil_log2(LogicalPerCache());
+return NumUniqueValuesInField(apicIds, offset, numBits);
 }
+else
+{
+// assume each processor has its own cache
+return os_cpu_NumProcessors();
+}
-return numCaches;
 }
-class CacheTopology
+class CacheRelations
 {
 public:
 /**
@@ -388,64 +404,87 @@ private:
 std::vector<SharedCache> m_caches;
 };
-uintptr_t cpu_ProcessorMaskFromCache(size_t cache)
+static void DetermineCachesProcessorMask(const u8* apicIds, uintptr_t* cachesProcessorMask)
 {
-static uintptr_t cachesProcessorMask[os_cpu_MaxProcessors];
+if(apicIds)
+{
+const size_t numBits = ceil_log2(LogicalPerCache());
+const u8 cacheIdMask = u8(0xFF << numBits);
-static volatile uintptr_t initialized = 0;
-if(cpu_CAS(&initialized, 0, 1))
+CacheRelations cacheRelations;
+for(size_t processor = 0; processor < os_cpu_NumProcessors(); processor++)
+{
+const u8 apicId = apicIds[processor];
+const u8 cacheId = apicId & cacheIdMask;
+cacheRelations.Add(cacheId, processor);
+}
+cacheRelations.StoreProcessorMasks(cachesProcessorMask);
+}
+else
+{
+// assume each processor has exactly one cache with matching IDs
+for(size_t processor = 0; processor < os_cpu_NumProcessors(); processor++)
+cachesProcessorMask[processor] = uintptr_t(1) << processor;
+}
+}
+static void DetermineProcessorsCache(size_t numCaches, const uintptr_t* cachesProcessorMask, size_t* processorsCache)
+{
+for(size_t cache = 0; cache < numCaches; cache++)
+{
+// write to all entries that share this cache
+const uintptr_t processorMask = cachesProcessorMask[cache];
+for(size_t processor = 0; processor < os_cpu_NumProcessors(); processor++)
+{
+if(IsBitSet(processorMask, processor))
+{
+debug_assert(processorsCache[processor] == 0);
+processorsCache[processor] = cache;
+}
+}
+}
+}
+//-----------------------------------------------------------------------------
+// cache topology interface
+struct CacheTopology // POD
+{
+size_t numCaches;
+size_t processorsCache[os_cpu_MaxProcessors];
+uintptr_t cachesProcessorMask[os_cpu_MaxProcessors];
+};
+const CacheTopology* cache_topology_Detect()
+{
+static CacheTopology topology;
+if(!topology.numCaches)
 {
 const u8* apicIds = ApicIds();
-if(apicIds)
-{
-const size_t numBits = ceil_log2(LogicalPerCache());
-const u8 cacheIdMask = u8(0xFF << numBits);
-CacheTopology cacheManager;
-for(size_t processor = 0; processor < os_cpu_NumProcessors(); processor++)
-{
-const u8 apicId = apicIds[processor];
-const u8 cacheId = apicId & cacheIdMask;
-cacheManager.Add(cacheId, processor);
-}
-cacheManager.StoreProcessorMasks(cachesProcessorMask);
-}
-else
-{
-// assume each cache belongs to exactly one processor and
-// cache index == processor index.
-for(size_t cache = 0; cache < cpu_NumCaches(); cache++)
-cachesProcessorMask[cache] = uintptr_t(1) << cache;
-}
+topology.numCaches = NumCaches(apicIds);
+DetermineCachesProcessorMask(apicIds, topology.cachesProcessorMask);
+DetermineProcessorsCache(topology.numCaches, topology.cachesProcessorMask, topology.processorsCache);
 }
-debug_assert(cache < cpu_NumCaches());
-return cachesProcessorMask[cache];
+return &topology;
 }
-size_t cpu_CacheFromProcessor(size_t processor)
+size_t cache_topology_NumCaches(const CacheTopology* topology)
 {
-static size_t processorsCache[os_cpu_MaxProcessors];
-static volatile uintptr_t initialized = 0;
-if(cpu_CAS(&initialized, 0, 1))
-{
-for(size_t cache = 0; cache < cpu_NumCaches(); cache++)
-{
-// write to all entries that share this cache
-const uintptr_t processorMask = cpu_ProcessorMaskFromCache(cache);
-for(size_t processor = 0; processor < os_cpu_NumProcessors(); processor++)
-{
-if(IsBitSet(processorMask, processor))
-{
-debug_assert(processorsCache[processor] == 0);
-processorsCache[processor] = cache;
-}
-}
-}
-}
-debug_assert(processor < os_cpu_NumProcessors());
-return processorsCache[processor];
+return topology->numCaches;
 }
+size_t cache_topology_CacheFromProcessor(const CacheTopology* topology, size_t processor)
+{
+debug_assert(processor < os_cpu_NumProcessors());
+return topology->processorsCache[processor];
+}
+uintptr_t cache_topology_ProcessorMaskFromCache(const CacheTopology* topology, size_t cache)
+{
+debug_assert(cache < topology->numCaches);
+return topology->cachesProcessorMask[cache];
+}

View File

@@ -11,53 +11,91 @@
 #ifndef INCLUDED_TOPOLOGY
 #define INCLUDED_TOPOLOGY
 //-----------------------------------------------------------------------------
-// CPU
+// interface rationale:
+// - explicit initialization avoids the difficulty and overhead of
+// thread-safe lazy initialization checks.
+// - requiring an opaque struct to be passed in ensures users call the
+// init function before using the accessors.
+// - delegating responsibility for thread-safety to the caller of the
+// first *_Detect invocation avoids overhead and keeps us independent of
+// the various threading packages (Boost, OpenMP, POSIX, Win32, ..)
-// OSes typically consider both SMT units and cores to be "processors".
-// the following routines determine how many of each are actually present and
-// enabled. this information is useful for detecting SMP systems, predicting
-// performance and dimensioning thread pools.
+//-----------------------------------------------------------------------------
+// cpu
+/**
+* stores CPU topology, i.e. how many packages, cores and SMT units are
+* actually present and enabled. this is useful for detecting SMP systems,
+* predicting performance and dimensioning thread pools.
+*
+* note: OS abstractions usually only mention "processors", which could be
+* any mix of the above.
+**/
+struct CpuTopology;
+/**
+* initialize static storage from which topology can be retrieved by
+* means of the following functions.
+* @return const pointer to a shared instance.
+*
+* WARNING: this function must not be reentered before it has returned once.
+**/
+LIB_API const CpuTopology* cpu_topology_Detect();
 /**
 * @return number of *enabled* CPU packages / sockets.
 **/
-LIB_API size_t cpu_NumPackages();
+LIB_API size_t cpu_topology_NumPackages(const CpuTopology*);
 /**
 * @return number of *enabled* CPU cores per package.
 * (2 on dual-core systems)
 **/
-LIB_API size_t cpu_CoresPerPackage();
+LIB_API size_t cpu_topology_CoresPerPackage(const CpuTopology*);
 /**
 * @return number of *enabled* hyperthreading units per core.
 * (2 on P4 EE)
 **/
-LIB_API size_t cpu_LogicalPerCore();
+LIB_API size_t cpu_topology_LogicalPerCore(const CpuTopology*);
 //-----------------------------------------------------------------------------
 // L2 cache
-// some CPU micro-architectures (e.g. Intel Core2) feature partitioned
-// L2 caches. if the cores sharing a cache work together on the same
-// sub-problem, contention may be reduced and effective capacity increased.
-// the following routines allow discovery of the L2 cache topology:
+/**
+* stores L2 cache topology, i.e. the mapping between processor and caches.
+* this allows cores sharing a cache to work together on the same dataset,
+* which may reduce contention and increase effective capacity.
+*
+* example: Intel Core2 micro-architectures (e.g. Intel Core2) feature
+* partitioned L2 caches shared by two cores.
+**/
+struct CacheTopology;
+/**
+* initialize static storage from which topology can be retrieved by
+* means of the following functions.
+* @return const pointer to a shared instance.
+*
+* WARNING: this function must not be reentered before it has returned once.
+**/
+LIB_API const CacheTopology* cache_topology_Detect();
 /**
 * @return number of distinct L2 caches
 **/
-LIB_API size_t cpu_NumCaches();
+LIB_API size_t cache_topology_NumCaches(const CacheTopology*);
 /**
 * @return L2 cache number (zero-based) to which <processor> belongs.
 **/
-LIB_API size_t cpu_CacheFromProcessor(size_t processor);
+LIB_API size_t cache_topology_CacheFromProcessor(const CacheTopology*, size_t processor);
 /**
 * @return bit-mask of all processors sharing <cache>.
 **/
-LIB_API uintptr_t cpu_ProcessorMaskFromCache(size_t cache);
+LIB_API uintptr_t cache_topology_ProcessorMaskFromCache(const CacheTopology*, size_t cache);
 #endif // #ifndef INCLUDED_TOPOLOGY

View File

@@ -338,6 +338,65 @@ const char* cpu_IdentifierString()
 }
 //-----------------------------------------------------------------------------
+// misc stateless functions
+u8 x86_x64_ApicId()
+{
+x86_x64_CpuidRegs regs;
+regs.eax = 1;
+// note: CPUID function 1 should be available everywhere, but only
+// processors with an xAPIC (8th generation or above, e.g. P4/Athlon XP)
+// will return a nonzero value.
+if(!x86_x64_cpuid(&regs))
+DEBUG_WARN_ERR(ERR::CPU_FEATURE_MISSING);
+const u8 apicId = (u8)bits(regs.ebx, 24, 31);
+return apicId;
+}
+u64 x86_x64_rdtsc()
+{
+#if MSC_VERSION
+return (u64)__rdtsc();
+#elif GCC_VERSION
+// GCC supports "portable" assembly for both x86 and x64
+volatile u32 lo, hi;
+asm volatile ("rdtsc" : "=a" (lo), "=d" (hi));
+return u64_from_u32(hi, lo);
+#endif
+}
+void x86_x64_DebugBreak()
+{
+#if MSC_VERSION
+__debugbreak();
+#elif GCC_VERSION
+// note: this probably isn't necessary, since unix_debug_break
+// (SIGTRAP) is most probably available if GCC_VERSION.
+// we include it for completeness, though.
+__asm__ __volatile__ ("int $3");
+#endif
+}
+// enforce strong memory ordering.
+void cpu_MemoryFence()
+{
+if(x86_x64_cap(X86_X64_CAP_SSE2))
+_mm_mfence();
+}
+void cpu_Serialize()
+{
+x86_x64_CpuidRegs regs;
+regs.eax = 1;
+x86_x64_cpuid(&regs); // CPUID serializes execution.
+}
+//-----------------------------------------------------------------------------
 // CPU frequency
@@ -367,10 +426,8 @@
 };
 // note: this function uses timer.cpp!timer_Time, which is implemented via
-// whrt.cpp on Windows, which again calls x86_x64_Init. be careful that
-// this function isn't called from there as well, else WHRT will be used
-// before its init completes.
-double cpu_ClockFrequency()
+// whrt.cpp on Windows.
+double x86_x64_ClockFrequency()
 {
 // if the TSC isn't available, there's really no good way to count the
 // actual CPU clocks per known time interval, so bail.
@@ -447,59 +504,3 @@
 const double clock_frequency = sum / (hi-lo);
 return clock_frequency;
 }
-//-----------------------------------------------------------------------------
-// misc stateless functions
-u8 x86_x64_ApicId()
-{
-x86_x64_CpuidRegs regs;
-regs.eax = 1;
-if(!x86_x64_cpuid(&regs))
-DEBUG_WARN_ERR(ERR::CPU_FEATURE_MISSING);
-const u8 apicId = (u8)bits(regs.ebx, 24, 31);
-return apicId;
-}
-u64 x86_x64_rdtsc()
-{
-#if MSC_VERSION
-return (u64)__rdtsc();
-#elif GCC_VERSION
-// GCC supports "portable" assembly for both x86 and x64
-volatile u32 lo, hi;
-asm volatile ("rdtsc" : "=a" (lo), "=d" (hi));
-return u64_from_u32(hi, lo);
-#endif
-}
-void x86_x64_DebugBreak()
-{
-#if MSC_VERSION
-__debugbreak();
-#elif GCC_VERSION
-// note: this probably isn't necessary, since unix_debug_break
-// (SIGTRAP) is most probably available if GCC_VERSION.
-// we include it for completeness, though.
-__asm__ __volatile__ ("int $3");
-#endif
-}
-// enforce strong memory ordering.
-void cpu_MemoryFence()
-{
-if(x86_x64_cap(X86_X64_CAP_SSE2))
-_mm_mfence();
-}
-void cpu_Serialize()
-{
-x86_x64_CpuidRegs regs;
-regs.eax = 1;
-x86_x64_cpuid(&regs); // CPUID serializes execution.
-}

View File

@@ -100,13 +100,12 @@ LIB_API bool x86_x64_cap(x86_x64_Cap cap);
 // stateless
 /**
-* @return APIC ID of the currently executing processor.
+* @return APIC ID of the currently executing processor or zero if the
+* platform does not have an xAPIC (i.e. 7th generation x86 or below).
 *
-* the implementation uses CPUID.1 and only works on >= 8th generation CPUs;
-* (P4/Athlon XP); otherwise it returns 0. the alternative of accessing the
-* APIC mmio registers is not feasible - mahaf_MapPhysicalMemory only works
-* reliably on WinXP. also, the OS already has the APIC registers mapped and
-* in constant use, and we don't want to interfere.
+* rationale: the alternative of accessing the APIC mmio registers is not
+* feasible - mahaf_MapPhysicalMemory only works reliably on WinXP. we also
+* don't want to intefere with the OS's constant use of the APIC registers.
 **/
 LIB_API u8 x86_x64_ApicId();
@@ -122,4 +121,12 @@ LIB_API u64 x86_x64_rdtsc();
 **/
 LIB_API void x86_x64_DebugBreak(void);
+/**
+* measure the CPU clock frequency via x86_x64_rdtsc and timer_Time.
+* (it follows that this must not be called from WHRT init.)
+* this takes several milliseconds (i.e. much longer than
+* os_cpu_ClockFrequency) but delivers accurate measurements.
+**/
+LIB_API double x86_x64_ClockFrequency();
 #endif // #ifndef INCLUDED_X86_X64

View File

@@ -141,7 +141,7 @@ void CNetLogSink::DoSink( const CNetLogEvent& event )
 //-----------------------------------------------------------------------------
 void CNetLogSink::DoBulkSink( const CNetLogEvent* pEvents, size_t eventCount )
 {
-unsigned* pIndices = NULL;
+size_t* pIndices = NULL;
 size_t indexCount = 0;
 size_t i;

View File

@@ -577,7 +577,7 @@ static void InitVfs(const CmdLineArgs& args)
 // the VFS prevents any accesses to files above this directory.
 path_SetRoot(args.GetArg0(), "../data");
-g_VFS = CreateVfs();
+g_VFS = CreateVfs(96*MiB);
 g_VFS->Mount("screenshots/", "screenshots");
 g_VFS->Mount("config/", "config");

View File

@@ -75,8 +75,9 @@ void WriteSystemInfo()
 fprintf(f, "OS : %s %s (%s)\n", un.sysname, un.release, un.version);
 // CPU
-fprintf(f, "CPU : %s, %s (%dx%dx%d)", un.machine, cpu_IdentifierString(), cpu_NumPackages(), cpu_CoresPerPackage(), cpu_LogicalPerCore());
-const double cpu_freq = cpu_ClockFrequency();
+const CpuTopology* topology = cpu_topology_Detect();
+fprintf(f, "CPU : %s, %s (%dx%dx%d)", un.machine, cpu_IdentifierString(), cpu_topology_NumPackages(topology), cpu_topology_CoresPerPackage(topology), cpu_topology_LogicalPerCore(topology));
+const double cpu_freq = os_cpu_ClockFrequency();
 if(cpu_freq != 0.0f)
 {
 if(cpu_freq < 1e9)

View File

@@ -10,7 +10,7 @@ public:
 void test_paths()
 {
 TS_ASSERT_OK(path_SetRoot(0, "../data"));
-PIVFS vfs = CreateVfs();
+PIVFS vfs = CreateVfs(20*MiB);
 TS_ASSERT_OK(vfs->Mount("", "mods/_test.xero"));