improvements/additions from work
bits: fix bit_mask for signed types, add SetBitsTo, LeastSignificantBit, ClearLeastSignificantBit. add MSR support (read/write via mahaf in kernel mode) x86_x64: expose family/model topology: add support for determining core/package from APIC ID. TSC: report actual frequency for nehalem invariant TSC. improved UNREACHABLE/ASSUME_UNREACHABLE (avoid ICC warning, add GCC4.5 support) This was SVN commit r7860.
This commit is contained in:
parent
3a0123b7b4
commit
3d45069b3f
@ -40,7 +40,7 @@ template<typename T>
|
||||
T Bit(size_t n)
|
||||
{
|
||||
const T one = T(1);
|
||||
return (one << n);
|
||||
return (T)(one << n);
|
||||
}
|
||||
|
||||
/**
|
||||
@ -71,16 +71,14 @@ bool IsBitSet(T value, size_t index)
|
||||
template<typename T>
|
||||
T bit_mask(size_t numBits)
|
||||
{
|
||||
if(numBits == 0) // prevent shift count == bitsInT, which would be undefined.
|
||||
return 0;
|
||||
// notes:
|
||||
// - the perhaps more intuitive (1 << numBits)-1 cannot
|
||||
// handle numBits == bitsInT, but this implementation does.
|
||||
// - though bulky, the below statements avoid sign-conversion warnings.
|
||||
const T bitsInT = sizeof(T)*CHAR_BIT;
|
||||
T mask(0);
|
||||
mask = T(~mask);
|
||||
mask >>= T(bitsInT-numBits);
|
||||
const T allBits = (T)~T(0);
|
||||
// (shifts of at least bitsInT are undefined)
|
||||
if(numBits >= bitsInT)
|
||||
return allBits;
|
||||
// (note: the previous allBits >> (bitsInT-numBits) is not safe
|
||||
// because right-shifts of negative numbers are undefined.)
|
||||
const T mask = T(T(1) << numBits)-1;
|
||||
return mask;
|
||||
}
|
||||
|
||||
@ -98,12 +96,31 @@ T bit_mask(size_t numBits)
|
||||
template<typename T>
|
||||
inline T bits(T num, size_t lo_idx, size_t hi_idx)
|
||||
{
|
||||
const size_t count = (hi_idx - lo_idx)+1; // # bits to return
|
||||
const size_t numBits = (hi_idx - lo_idx)+1; // # bits to return
|
||||
T result = T(num >> lo_idx);
|
||||
result = T(result & bit_mask<T>(count));
|
||||
result = T(result & bit_mask<T>(numBits));
|
||||
return result;
|
||||
}
|
||||
|
||||
/**
|
||||
* set the value of bits hi_idx:lo_idx
|
||||
*
|
||||
* @param lo_idx bit index of lowest bit to include
|
||||
* @param hi_idx bit index of highest bit to include
|
||||
* @param value new value to be assigned to these bits
|
||||
**/
|
||||
template<typename T>
|
||||
inline T SetBitsTo(T num, size_t lo_idx, size_t hi_idx, size_t value)
|
||||
{
|
||||
const size_t numBits = (hi_idx - lo_idx)+1;
|
||||
debug_assert(value < (T(1) << numBits));
|
||||
const T mask = bit_mask<T>(numBits) << lo_idx;
|
||||
T result = num & ~mask;
|
||||
result = T(result | (value << lo_idx));
|
||||
return result;
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* @return number of 1-bits in mask
|
||||
**/
|
||||
@ -127,7 +144,7 @@ size_t PopulationCount(T mask)
|
||||
* @return whether the given number is a power of two.
|
||||
**/
|
||||
template<typename T>
|
||||
bool is_pow2(T n)
|
||||
inline bool is_pow2(T n)
|
||||
{
|
||||
// 0 would pass the test below but isn't a POT.
|
||||
if(n == 0)
|
||||
@ -135,6 +152,19 @@ bool is_pow2(T n)
|
||||
return (n & (n-1)) == 0;
|
||||
}
|
||||
|
||||
template<typename T>
|
||||
inline T LeastSignificantBit(T x)
|
||||
{
|
||||
const T negX = T(~x + 1); // 2's complement (avoids 'negating unsigned type' warning)
|
||||
return x & negX;
|
||||
}
|
||||
|
||||
template<typename T>
|
||||
inline T ClearLeastSignificantBit(T x)
|
||||
{
|
||||
return x & (x-1);
|
||||
}
|
||||
|
||||
/**
|
||||
* ceil(log2(x))
|
||||
*
|
||||
|
@ -43,60 +43,48 @@
|
||||
|
||||
|
||||
/**
|
||||
"unreachable code" helpers
|
||||
|
||||
unreachable lines of code are often the source or symptom of subtle bugs.
|
||||
they are flagged by compiler warnings; however, the opposite problem -
|
||||
erroneously reaching certain spots (e.g. due to missing return statement)
|
||||
is worse and not detected automatically.
|
||||
|
||||
to defend against this, the programmer can annotate their code to
|
||||
indicate to humans that a particular spot should never be reached.
|
||||
however, that isn't much help; better is a sentinel that raises an
|
||||
error if it is actually reached. hence, the UNREACHABLE macro.
|
||||
|
||||
ironically, if the code guarded by UNREACHABLE works as it should,
|
||||
compilers may flag the macro's code as unreachable. this would
|
||||
distract from genuine warnings, which is unacceptable.
|
||||
|
||||
even worse, compilers differ in their code checking: GCC only complains if
|
||||
non-void functions end without returning a value (i.e. missing return
|
||||
statement), while VC checks if lines are unreachable (e.g. if they are
|
||||
preceded by a return on all paths).
|
||||
|
||||
our implementation of UNREACHABLE solves this dilemma as follows:
|
||||
- on GCC: call abort(); since it has the noreturn attributes, the
|
||||
"non-void" warning disappears.
|
||||
- on VC: avoid generating any code. we allow the compiler to assume the
|
||||
spot is actually unreachable, which incidentally helps optimization.
|
||||
if reached after all, a crash usually results. in that case, compile with
|
||||
CONFIG_PARANOIA, which will cause an error message to be displayed.
|
||||
|
||||
this approach still allows for the possibility of automated
|
||||
checking, but does not cause any compiler warnings.
|
||||
**/
|
||||
* "unreachable code" helpers
|
||||
*
|
||||
* unreachable lines of code are often the source or symptom of subtle bugs.
|
||||
* they are flagged by compiler warnings; however, the opposite problem -
|
||||
* erroneously reaching certain spots (e.g. due to missing return statement)
|
||||
* is worse and not detected automatically.
|
||||
*
|
||||
* to defend against this, the programmer can annotate their code to
|
||||
* indicate to humans that a particular spot should never be reached.
|
||||
* however, that isn't much help; better is a sentinel that raises an
|
||||
* error if it is actually reached. hence, the UNREACHABLE macro.
|
||||
*
|
||||
* ironically, if the code guarded by UNREACHABLE works as it should,
|
||||
* compilers may flag the macro's code as unreachable. this would
|
||||
* distract from genuine warnings, which is unacceptable.
|
||||
*
|
||||
* even worse, compilers differ in their code checking: GCC only complains if
|
||||
* non-void functions end without returning a value (i.e. missing return
|
||||
* statement), while VC checks if lines are unreachable (e.g. if they are
|
||||
* preceded by a return on all paths).
|
||||
*
|
||||
* the implementation below enables optimization and automated checking
|
||||
* without raising warnings.
|
||||
**/
|
||||
#define UNREACHABLE // actually defined below.. this is for
|
||||
# undef UNREACHABLE // CppDoc's benefit only.
|
||||
|
||||
// 1) final build: optimize assuming this location cannot be reached.
|
||||
// may crash if that turns out to be untrue, but removes checking overhead.
|
||||
#if CONFIG_FINAL
|
||||
// compiler supports ASSUME_UNREACHABLE => allow it to assume the code is
|
||||
// never reached (improves optimization at the cost of undefined behavior
|
||||
// if the annotation turns out to be incorrect).
|
||||
#if HAVE_ASSUME_UNREACHABLE && !CONFIG_PARANOIA
|
||||
# define UNREACHABLE ASSUME_UNREACHABLE
|
||||
// 2) normal build:
|
||||
// otherwise (or if CONFIG_PARANOIA is set), add a user-visible
|
||||
// warning if the code is reached. note that abort() fails to stop
|
||||
// ICC from warning about the lack of a return statement, so we
|
||||
// use an infinite loop instead.
|
||||
#else
|
||||
// a) normal implementation: includes "abort", which is declared with
|
||||
// noreturn attribute and therefore avoids GCC's "execution reaches
|
||||
// end of non-void function" warning.
|
||||
# if !MSC_VERSION || ICC_VERSION || CONFIG_PARANOIA
|
||||
# define UNREACHABLE\
|
||||
# define UNREACHABLE\
|
||||
STMT(\
|
||||
debug_assert(0); /* hit supposedly unreachable code */\
|
||||
abort();\
|
||||
for(;;){};\
|
||||
)
|
||||
// b) VC only: don't generate any code; squelch the warning and optimize.
|
||||
# else
|
||||
# define UNREACHABLE ASSUME_UNREACHABLE
|
||||
# endif
|
||||
#endif
|
||||
|
||||
/**
|
||||
|
@ -116,7 +116,7 @@ static size_t MaxLogicalPerCache()
|
||||
|
||||
|
||||
//-----------------------------------------------------------------------------
|
||||
// determination of enabled cores/HTs
|
||||
// APIC IDs
|
||||
|
||||
// APIC IDs consist of variable-length fields identifying the logical unit,
|
||||
// core, package and shared cache. if they are available, we can determine
|
||||
@ -174,106 +174,102 @@ const u8* ApicIds()
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* count the number of unique APIC IDs after application of a mask.
|
||||
*
|
||||
* this is used to implement NumUniqueValuesInField and also required
|
||||
* for counting the number of caches.
|
||||
**/
|
||||
static size_t NumUniqueMaskedValues(const u8* apicIds, u8 mask)
|
||||
// (if maxValues == 1, the field is zero-width and thus zero)
|
||||
static size_t ApicField(size_t apicId, size_t indexOfLowestBit, size_t maxValues)
|
||||
{
|
||||
std::set<u8> ids;
|
||||
for(size_t processor = 0; processor < os_cpu_NumProcessors(); processor++)
|
||||
{
|
||||
const u8 apicId = apicIds[processor];
|
||||
const u8 field = u8(apicId & mask);
|
||||
ids.insert(field);
|
||||
}
|
||||
|
||||
return ids.size();
|
||||
const size_t numBits = ceil_log2(maxValues);
|
||||
const size_t mask = bit_mask<size_t>(numBits);
|
||||
return (apicId >> indexOfLowestBit) & mask;
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* Count the number of values assumed by a certain field within APIC IDs.
|
||||
*
|
||||
* @param apicIds
|
||||
* @param offset Index of the lowest bit that is part of the field.
|
||||
* @param numValues Number of values that can be assumed by the field.
|
||||
* If equal to one, the field is zero-width.
|
||||
* @return number of unique values (for convenience of the topology code,
|
||||
* this is always at least one)
|
||||
**/
|
||||
static size_t NumUniqueValuesInField(const u8* apicIds, size_t offset, size_t numValues)
|
||||
{
|
||||
if(numValues == 1) // see parameter description above
|
||||
return 1;
|
||||
const size_t numBits = ceil_log2(numValues);
|
||||
const u8 mask = u8((bit_mask<u8>(numBits) << offset) & 0xFF);
|
||||
return NumUniqueMaskedValues(apicIds, mask);
|
||||
}
|
||||
|
||||
|
||||
static size_t MinPackages(size_t maxCoresPerPackage, size_t maxLogicalPerCore)
|
||||
{
|
||||
const size_t numNodes = numa_NumNodes();
|
||||
const size_t logicalPerNode = PopulationCount(numa_ProcessorMaskFromNode(0));
|
||||
// NB: some cores or logical processors may be disabled.
|
||||
const size_t maxLogicalPerPackage = maxCoresPerPackage*maxLogicalPerCore;
|
||||
const size_t minPackagesPerNode = DivideRoundUp(logicalPerNode, maxLogicalPerPackage);
|
||||
return minPackagesPerNode*numNodes;
|
||||
}
|
||||
|
||||
//-----------------------------------------------------------------------------
|
||||
// CPU topology interface
|
||||
|
||||
struct CpuTopology // POD
|
||||
{
|
||||
size_t numPackages;
|
||||
size_t coresPerPackage;
|
||||
size_t maxLogicalPerCore;
|
||||
size_t maxCoresPerPackage;
|
||||
|
||||
size_t logicalOffset;
|
||||
size_t coreOffset;
|
||||
size_t packageOffset;
|
||||
|
||||
// how many are actually enabled
|
||||
size_t logicalPerCore;
|
||||
size_t coresPerPackage;
|
||||
size_t numPackages;
|
||||
};
|
||||
static CpuTopology cpuTopology;
|
||||
static ModuleInitState cpuInitState;
|
||||
|
||||
static LibError InitCpuTopology()
|
||||
{
|
||||
const size_t numProcessors = os_cpu_NumProcessors();
|
||||
const size_t maxCoresPerPackage = MaxCoresPerPackage();
|
||||
const size_t maxLogicalPerCore = MaxLogicalPerCore();
|
||||
cpuTopology.maxLogicalPerCore = MaxLogicalPerCore();
|
||||
cpuTopology.maxCoresPerPackage = MaxCoresPerPackage();
|
||||
|
||||
cpuTopology.logicalOffset = 0;
|
||||
cpuTopology.coreOffset = ceil_log2(cpuTopology.maxLogicalPerCore);
|
||||
cpuTopology.packageOffset = cpuTopology.coreOffset + ceil_log2(cpuTopology.maxCoresPerPackage);
|
||||
|
||||
const u8* apicIds = ApicIds();
|
||||
if(apicIds)
|
||||
{
|
||||
const size_t packageOffset = ceil_log2(maxCoresPerPackage) + ceil_log2(maxLogicalPerCore);
|
||||
const size_t coreOffset = ceil_log2(maxLogicalPerCore);
|
||||
const size_t logicalOffset = 0;
|
||||
cpuTopology.numPackages = NumUniqueValuesInField(apicIds, packageOffset, 256);
|
||||
cpuTopology.coresPerPackage = NumUniqueValuesInField(apicIds, coreOffset, maxCoresPerPackage);
|
||||
cpuTopology.logicalPerCore = NumUniqueValuesInField(apicIds, logicalOffset, maxLogicalPerCore);
|
||||
struct NumUniqueValuesInField
|
||||
{
|
||||
size_t operator()(const u8* apicIds, size_t indexOfLowestBit, size_t numValues) const
|
||||
{
|
||||
std::set<size_t> values;
|
||||
for(size_t processor = 0; processor < os_cpu_NumProcessors(); processor++)
|
||||
{
|
||||
const size_t value = ApicField(apicIds[processor], numValues, indexOfLowestBit);
|
||||
values.insert(value);
|
||||
}
|
||||
return values.size();
|
||||
}
|
||||
};
|
||||
|
||||
cpuTopology.logicalPerCore = NumUniqueValuesInField()(apicIds, cpuTopology.logicalOffset, cpuTopology.maxLogicalPerCore);
|
||||
cpuTopology.coresPerPackage = NumUniqueValuesInField()(apicIds, cpuTopology.coreOffset, cpuTopology.maxCoresPerPackage);
|
||||
cpuTopology.numPackages = NumUniqueValuesInField()(apicIds, cpuTopology.packageOffset, 256);
|
||||
}
|
||||
else // the processor lacks an xAPIC, or the IDs are invalid
|
||||
{
|
||||
struct MinPackages
|
||||
{
|
||||
size_t operator()(size_t maxCoresPerPackage, size_t maxLogicalPerCore) const
|
||||
{
|
||||
const size_t numNodes = numa_NumNodes();
|
||||
const size_t logicalPerNode = PopulationCount(numa_ProcessorMaskFromNode(0));
|
||||
// NB: some cores or logical processors may be disabled.
|
||||
const size_t maxLogicalPerPackage = maxCoresPerPackage*maxLogicalPerCore;
|
||||
const size_t minPackagesPerNode = DivideRoundUp(logicalPerNode, maxLogicalPerPackage);
|
||||
return minPackagesPerNode*numNodes;
|
||||
}
|
||||
};
|
||||
|
||||
// we can't differentiate between cores and logical processors.
|
||||
// since the former are less likely to be disabled, we seek the
|
||||
// maximum feasible number of cores and minimal number of packages:
|
||||
const size_t minPackages = MinPackages(maxCoresPerPackage, maxLogicalPerCore);
|
||||
const size_t maxPackages = numProcessors;
|
||||
for(size_t numPackages = minPackages; numPackages <= maxPackages; numPackages++)
|
||||
const size_t minPackages = MinPackages()(cpuTopology.maxCoresPerPackage, cpuTopology.maxLogicalPerCore);
|
||||
const size_t numProcessors = os_cpu_NumProcessors();
|
||||
for(size_t numPackages = minPackages; numPackages <= numProcessors; numPackages++)
|
||||
{
|
||||
if(numProcessors % numPackages != 0)
|
||||
continue;
|
||||
const size_t logicalPerPackage = numProcessors / numPackages;
|
||||
const size_t minCoresPerPackage = DivideRoundUp(logicalPerPackage, maxLogicalPerCore);
|
||||
for(size_t coresPerPackage = maxCoresPerPackage; coresPerPackage >= minCoresPerPackage; coresPerPackage--)
|
||||
const size_t minCoresPerPackage = DivideRoundUp(logicalPerPackage, cpuTopology.maxLogicalPerCore);
|
||||
for(size_t coresPerPackage = cpuTopology.maxCoresPerPackage; coresPerPackage >= minCoresPerPackage; coresPerPackage--)
|
||||
{
|
||||
if(logicalPerPackage % coresPerPackage != 0)
|
||||
continue;
|
||||
const size_t logicalPerCore = logicalPerPackage / coresPerPackage;
|
||||
if(logicalPerCore <= maxLogicalPerCore)
|
||||
if(logicalPerCore <= cpuTopology.maxLogicalPerCore)
|
||||
{
|
||||
debug_assert(numProcessors == numPackages*coresPerPackage*logicalPerCore);
|
||||
cpuTopology.numPackages = numPackages;
|
||||
cpuTopology.coresPerPackage = coresPerPackage;
|
||||
cpuTopology.logicalPerCore = logicalPerCore;
|
||||
cpuTopology.coresPerPackage = coresPerPackage;
|
||||
cpuTopology.numPackages = numPackages;
|
||||
return INFO::OK;
|
||||
}
|
||||
}
|
||||
@ -303,6 +299,24 @@ size_t cpu_topology_LogicalPerCore()
|
||||
return cpuTopology.logicalPerCore;
|
||||
}
|
||||
|
||||
size_t cpu_topology_LogicalFromId(size_t apicId)
|
||||
{
|
||||
ModuleInit(&cpuInitState, InitCpuTopology);
|
||||
return ApicField(apicId, cpuTopology.logicalOffset, cpuTopology.maxLogicalPerCore);
|
||||
}
|
||||
|
||||
size_t cpu_topology_CoreFromId(size_t apicId)
|
||||
{
|
||||
ModuleInit(&cpuInitState, InitCpuTopology);
|
||||
return ApicField(apicId, cpuTopology.coreOffset, cpuTopology.maxCoresPerPackage);
|
||||
}
|
||||
|
||||
size_t cpu_topology_PackageFromId(size_t apicId)
|
||||
{
|
||||
ModuleInit(&cpuInitState, InitCpuTopology);
|
||||
return ApicField(apicId, cpuTopology.packageOffset, 256);
|
||||
}
|
||||
|
||||
|
||||
//-----------------------------------------------------------------------------
|
||||
// cache topology
|
||||
|
@ -65,6 +65,11 @@ LIB_API size_t cpu_topology_CoresPerPackage();
|
||||
LIB_API size_t cpu_topology_LogicalPerCore();
|
||||
|
||||
|
||||
LIB_API size_t cpu_topology_LogicalFromId(size_t apicId);
|
||||
LIB_API size_t cpu_topology_CoreFromId(size_t apicId);
|
||||
LIB_API size_t cpu_topology_PackageFromId(size_t apicId);
|
||||
|
||||
|
||||
//-----------------------------------------------------------------------------
|
||||
// L2 cache
|
||||
|
||||
|
@ -157,7 +157,7 @@ bool x86_x64_cap(x86_x64_Cap cap)
|
||||
|
||||
|
||||
//-----------------------------------------------------------------------------
|
||||
// CPU identification
|
||||
// vendor
|
||||
|
||||
static x86_x64_Vendors vendor;
|
||||
|
||||
@ -197,10 +197,14 @@ x86_x64_Vendors x86_x64_Vendor()
|
||||
}
|
||||
|
||||
|
||||
//-----------------------------------------------------------------------------
|
||||
// signature
|
||||
|
||||
static size_t model;
|
||||
static size_t family;
|
||||
static ModuleInitState signatureInitState;
|
||||
|
||||
static void InitModelAndFamily()
|
||||
static LibError InitSignature()
|
||||
{
|
||||
x86_x64_CpuidRegs regs = { 0 };
|
||||
regs.eax = 1;
|
||||
@ -214,71 +218,19 @@ static void InitModelAndFamily()
|
||||
family += extendedFamily;
|
||||
if(family == 0xF || (x86_x64_Vendor() == X86_X64_VENDOR_INTEL && family == 6))
|
||||
model += extendedModel << 4;
|
||||
}
|
||||
|
||||
|
||||
static size_t generation;
|
||||
|
||||
static LibError InitGeneration()
|
||||
{
|
||||
InitModelAndFamily();
|
||||
|
||||
switch(x86_x64_Vendor())
|
||||
{
|
||||
case X86_X64_VENDOR_AMD:
|
||||
switch(family)
|
||||
{
|
||||
case 5:
|
||||
if(model < 6)
|
||||
generation = 5; // K5
|
||||
else
|
||||
generation = 6; // K6
|
||||
break;
|
||||
|
||||
case 6:
|
||||
generation = 7; // K7 (Athlon)
|
||||
break;
|
||||
|
||||
case 0xF:
|
||||
case 0x10:
|
||||
generation = 8; // K8 (Opteron)
|
||||
break;
|
||||
}
|
||||
break;
|
||||
|
||||
case X86_X64_VENDOR_INTEL:
|
||||
switch(family)
|
||||
{
|
||||
case 5:
|
||||
generation = 5; // Pentium
|
||||
break;
|
||||
|
||||
case 6:
|
||||
if(model < 0xF)
|
||||
generation = 6; // Pentium Pro/II/III/M
|
||||
else
|
||||
generation = 8; // Core2Duo
|
||||
break;
|
||||
|
||||
case 0xF:
|
||||
if(model <= 6)
|
||||
generation = 7; // Pentium 4/D
|
||||
break;
|
||||
}
|
||||
if(family >= 0x10)
|
||||
generation = 9;
|
||||
break;
|
||||
}
|
||||
|
||||
debug_assert(generation != 0);
|
||||
return INFO::OK;
|
||||
}
|
||||
|
||||
size_t x86_x64_Generation()
|
||||
size_t x86_x64_Model()
|
||||
{
|
||||
static ModuleInitState initState;
|
||||
ModuleInit(&initState, InitGeneration);
|
||||
return generation;
|
||||
ModuleInit(&signatureInitState, InitSignature);
|
||||
return model;
|
||||
}
|
||||
|
||||
size_t x86_x64_Family()
|
||||
{
|
||||
ModuleInit(&signatureInitState, InitSignature);
|
||||
return family;
|
||||
}
|
||||
|
||||
|
||||
@ -832,7 +784,8 @@ static LibError InitIdentifierString()
|
||||
// doesn't recognize.
|
||||
if(!gotBrandString || strncmp(identifierString, "Unknow", 6) == 0)
|
||||
{
|
||||
InitModelAndFamily();
|
||||
const size_t family = x86_x64_Family();
|
||||
const size_t model = x86_x64_Model();
|
||||
switch(x86_x64_Vendor())
|
||||
{
|
||||
case X86_X64_VENDOR_AMD:
|
||||
|
@ -73,6 +73,11 @@ enum x86_x64_Vendors
|
||||
LIB_API x86_x64_Vendors x86_x64_Vendor();
|
||||
|
||||
|
||||
LIB_API size_t x86_x64_Model();
|
||||
|
||||
LIB_API size_t x86_x64_Family();
|
||||
|
||||
|
||||
/**
|
||||
* @return the colloquial processor generation
|
||||
* (5 = Pentium, 6 = Pentium Pro/II/III / K6, 7 = Pentium4 / Athlon, 8 = Core / Opteron)
|
||||
@ -96,6 +101,7 @@ enum x86_x64_Cap
|
||||
// standard (edx)
|
||||
X86_X64_CAP_FPU = 32+0, // Floating Point Unit
|
||||
X86_X64_CAP_TSC = 32+4, // TimeStamp Counter
|
||||
X86_X64_CAP_MSR = 32+5, // Model Specific Registers
|
||||
X86_X64_CAP_CMOV = 32+15, // Conditional MOVe
|
||||
X86_X64_CAP_TM_SCC = 32+22, // Thermal Monitoring and Software Controlled Clock
|
||||
X86_X64_CAP_MMX = 32+23, // MultiMedia eXtensions
|
||||
|
@ -175,10 +175,15 @@
|
||||
// this macro should not generate any fallback code; it is merely the
|
||||
// compiler-specific backend for lib.h's UNREACHABLE.
|
||||
// #define it to nothing if the compiler doesn't support such a hint.
|
||||
#if MSC_VERSION
|
||||
#define HAVE_ASSUME_UNREACHABLE 1
|
||||
#if MSC_VERSION && !ICC_VERSION // (ICC ignores this)
|
||||
# define ASSUME_UNREACHABLE __assume(0)
|
||||
#elif GCC_VERSION >= 450
|
||||
# define ASSUME_UNREACHABLE __builtin_unreachable()
|
||||
#else
|
||||
# define ASSUME_UNREACHABLE
|
||||
# undef HAVE_ASSUME_UNREACHABLE
|
||||
# define HAVE_ASSUME_UNREACHABLE 0
|
||||
#endif
|
||||
|
||||
|
||||
|
@ -41,47 +41,75 @@
|
||||
#define IOCTL_AKEN_WRITE_PORT CTL_CODE(FILE_DEVICE_AKEN, AKEN_IOCTL+1, METHOD_BUFFERED, FILE_ANY_ACCESS)
|
||||
#define IOCTL_AKEN_MAP CTL_CODE(FILE_DEVICE_AKEN, AKEN_IOCTL+2, METHOD_BUFFERED, FILE_ANY_ACCESS)
|
||||
#define IOCTL_AKEN_UNMAP CTL_CODE(FILE_DEVICE_AKEN, AKEN_IOCTL+3, METHOD_BUFFERED, FILE_ANY_ACCESS)
|
||||
#define IOCTL_AKEN_READ_MSR CTL_CODE(FILE_DEVICE_AKEN, AKEN_IOCTL+4, METHOD_BUFFERED, FILE_ANY_ACCESS)
|
||||
#define IOCTL_AKEN_WRITE_MSR CTL_CODE(FILE_DEVICE_AKEN, AKEN_IOCTL+5, METHOD_BUFFERED, FILE_ANY_ACCESS)
|
||||
#define IOCTL_AKEN_READ_PMC CTL_CODE(FILE_DEVICE_AKEN, AKEN_IOCTL+6, METHOD_BUFFERED, FILE_ANY_ACCESS)
|
||||
|
||||
|
||||
// input and output data structures for the IOCTLs
|
||||
|
||||
#pragma pack(push, 1)
|
||||
|
||||
struct AkenReadPortIn
|
||||
typedef struct AkenReadPortIn_
|
||||
{
|
||||
USHORT port;
|
||||
UCHAR numBytes;
|
||||
};
|
||||
}
|
||||
AkenReadPortIn;
|
||||
|
||||
struct AkenReadPortOut
|
||||
typedef struct AkenReadPortOut_
|
||||
{
|
||||
DWORD32 value;
|
||||
};
|
||||
}
|
||||
AkenReadPortOut;
|
||||
|
||||
struct AkenWritePortIn
|
||||
typedef struct AkenWritePortIn_
|
||||
{
|
||||
DWORD32 value;
|
||||
USHORT port;
|
||||
UCHAR numBytes;
|
||||
};
|
||||
}
|
||||
AkenWritePortIn;
|
||||
|
||||
struct AkenMapIn
|
||||
typedef struct AkenMapIn_
|
||||
{
|
||||
// note: fixed-width types allow the 32 or 64-bit Mahaf wrapper to
|
||||
// interoperate with the 32 or 64-bit Aken driver.
|
||||
DWORD64 physicalAddress;
|
||||
DWORD64 numBytes;
|
||||
};
|
||||
}
|
||||
AkenMapIn;
|
||||
|
||||
struct AkenMapOut
|
||||
typedef struct AkenMapOut_
|
||||
{
|
||||
DWORD64 virtualAddress;
|
||||
};
|
||||
}
|
||||
AkenMapOut;
|
||||
|
||||
struct AkenUnmapIn
|
||||
typedef struct AkenUnmapIn_
|
||||
{
|
||||
DWORD64 virtualAddress;
|
||||
};
|
||||
}
|
||||
AkenUnmapIn;
|
||||
|
||||
typedef struct AkenReadRegisterIn_
|
||||
{
|
||||
DWORD64 reg;
|
||||
}
|
||||
AkenReadRegisterIn;
|
||||
|
||||
typedef struct AkenReadRegisterOut_
|
||||
{
|
||||
DWORD64 value;
|
||||
}
|
||||
AkenReadRegisterOut;
|
||||
|
||||
typedef struct AkenWriteRegisterIn_
|
||||
{
|
||||
DWORD64 reg;
|
||||
DWORD64 value;
|
||||
}
|
||||
AkenWriteRegisterIn;
|
||||
|
||||
#pragma pack(pop)
|
||||
|
||||
|
@ -25,6 +25,7 @@
|
||||
*/
|
||||
|
||||
#include "precompiled.h"
|
||||
#include "lib/sysdep/os/win/mahaf.h"
|
||||
|
||||
#include "lib/sysdep/os/win/win.h"
|
||||
#include <winioctl.h>
|
||||
@ -56,8 +57,7 @@ static u32 ReadPort(u16 port, u8 numBytes)
|
||||
}
|
||||
|
||||
debug_assert(bytesReturned == sizeof(out));
|
||||
const u32 value = out.value;
|
||||
return value;
|
||||
return out.value;
|
||||
}
|
||||
|
||||
u8 mahaf_ReadPort8(u16 port)
|
||||
@ -159,6 +159,48 @@ void mahaf_UnmapPhysicalMemory(volatile void* virtualAddress)
|
||||
}
|
||||
|
||||
|
||||
static u64 ReadRegister(DWORD ioctl, u64 reg)
|
||||
{
|
||||
AkenReadRegisterIn in;
|
||||
in.reg = reg;
|
||||
AkenReadRegisterOut out;
|
||||
|
||||
DWORD bytesReturned;
|
||||
LPOVERLAPPED ovl = 0; // synchronous
|
||||
BOOL ok = DeviceIoControl(hAken, ioctl, &in, sizeof(in), &out, sizeof(out), &bytesReturned, ovl);
|
||||
if(!ok)
|
||||
{
|
||||
WARN_WIN32_ERR;
|
||||
return 0;
|
||||
}
|
||||
|
||||
debug_assert(bytesReturned == sizeof(out));
|
||||
return out.value;
|
||||
}
|
||||
|
||||
u64 mahaf_ReadModelSpecificRegister(u64 reg)
|
||||
{
|
||||
return ReadRegister((DWORD)IOCTL_AKEN_READ_MSR, reg);
|
||||
}
|
||||
|
||||
u64 mahaf_ReadPerformanceMonitoringCounter(u64 reg)
|
||||
{
|
||||
return ReadRegister((DWORD)IOCTL_AKEN_READ_PMC, reg);
|
||||
}
|
||||
|
||||
void mahaf_WriteModelSpecificRegister(u64 reg, u64 value)
|
||||
{
|
||||
AkenWriteRegisterIn in;
|
||||
in.reg = reg;
|
||||
in.value = value;
|
||||
|
||||
DWORD bytesReturned; // unused but must be passed to DeviceIoControl
|
||||
LPOVERLAPPED ovl = 0; // synchronous
|
||||
BOOL ok = DeviceIoControl(hAken, (DWORD)IOCTL_AKEN_WRITE_MSR, &in, sizeof(in), 0, 0u, &bytesReturned, ovl);
|
||||
WARN_IF_FALSE(ok);
|
||||
}
|
||||
|
||||
|
||||
//-----------------------------------------------------------------------------
|
||||
// driver installation
|
||||
//-----------------------------------------------------------------------------
|
||||
|
@ -39,20 +39,26 @@
|
||||
* note: mahaf_MapPhysicalMemory will complain if it
|
||||
* is called despite this function having returned true.
|
||||
**/
|
||||
extern bool mahaf_IsPhysicalMappingDangerous();
|
||||
LIB_API bool mahaf_IsPhysicalMappingDangerous();
|
||||
|
||||
|
||||
extern LibError mahaf_Init();
|
||||
extern void mahaf_Shutdown();
|
||||
LIB_API LibError mahaf_Init();
|
||||
LIB_API void mahaf_Shutdown();
|
||||
|
||||
extern u8 mahaf_ReadPort8 (u16 port);
|
||||
extern u16 mahaf_ReadPort16(u16 port);
|
||||
extern u32 mahaf_ReadPort32(u16 port);
|
||||
extern void mahaf_WritePort8 (u16 port, u8 value);
|
||||
extern void mahaf_WritePort16(u16 port, u16 value);
|
||||
extern void mahaf_WritePort32(u16 port, u32 value);
|
||||
LIB_API u8 mahaf_ReadPort8 (u16 port);
|
||||
LIB_API u16 mahaf_ReadPort16(u16 port);
|
||||
LIB_API u32 mahaf_ReadPort32(u16 port);
|
||||
LIB_API void mahaf_WritePort8 (u16 port, u8 value);
|
||||
LIB_API void mahaf_WritePort16(u16 port, u16 value);
|
||||
LIB_API void mahaf_WritePort32(u16 port, u32 value);
|
||||
|
||||
extern volatile void* mahaf_MapPhysicalMemory(uintptr_t physicalAddress, size_t numBytes);
|
||||
extern void mahaf_UnmapPhysicalMemory(volatile void* virtualAddress);
|
||||
LIB_API volatile void* mahaf_MapPhysicalMemory(uintptr_t physicalAddress, size_t numBytes);
|
||||
LIB_API void mahaf_UnmapPhysicalMemory(volatile void* virtualAddress);
|
||||
|
||||
LIB_API u64 mahaf_ReadModelSpecificRegister(u64 reg);
|
||||
LIB_API void mahaf_WriteModelSpecificRegister(u64 reg, u64 value);
|
||||
|
||||
// must be done in the driver because Windows clears CR4.PCE[8]
|
||||
LIB_API u64 mahaf_ReadPerformanceMonitoringCounter(u64 reg);
|
||||
|
||||
#endif // INCLUDED_MAHAF
|
||||
|
@ -38,6 +38,7 @@
|
||||
#if ARCH_X86_X64
|
||||
# include "lib/sysdep/arch/x86_x64/x86_x64.h" // x86_x64_rdtsc
|
||||
# include "lib/sysdep/arch/x86_x64/topology.h"
|
||||
# include "lib/sysdep/arch/x86_x64/msr.h"
|
||||
#endif
|
||||
|
||||
|
||||
@ -173,7 +174,7 @@ public:
|
||||
|
||||
#if ARCH_X86_X64
|
||||
// recent CPU:
|
||||
if(x86_x64_Generation() >= 7)
|
||||
//if(x86_x64_Generation() >= 7)
|
||||
{
|
||||
// note: 8th generation CPUs support C1-clock ramping, which causes
|
||||
// drift on multi-core systems, but those were excluded above.
|
||||
@ -183,7 +184,7 @@ public:
|
||||
// the chipset thinks the system is dangerously overheated; the
|
||||
// OS isn't even notified. this may be rare, but could cause
|
||||
// incorrect results => unsafe.
|
||||
return false;
|
||||
//return false;
|
||||
}
|
||||
#endif
|
||||
|
||||
@ -217,6 +218,15 @@ public:
|
||||
// note: even here, initial accuracy isn't critical because the
|
||||
// clock is subject to thermal drift and would require continual
|
||||
// recalibration anyway.
|
||||
#if ARCH_X86_X64
|
||||
if(MSR::HasNehalem())
|
||||
{
|
||||
const u64 platformInfo = MSR::Read(MSR::PLATFORM_INFO);
|
||||
const u8 maxNonTurboRatio = bits(platformInfo, 8, 15);
|
||||
return maxNonTurboRatio * 133.33e6f;
|
||||
}
|
||||
else
|
||||
#endif
|
||||
return os_cpu_ClockFrequency();
|
||||
}
|
||||
|
||||
|
Loading…
Reference in New Issue
Block a user