
config: additional macro is checked

lib: add bits() function (actually moved it from zip.cpp)
ia32: start moving code out to an external asm file (will be assembled by
NASM in a pre-build step; this avoids needing MSC- and GCC-specific inline
asm). Move memcpy test code out to a separate file.

This was SVN commit r2706.
janwas 2005-09-13 04:00:41 +00:00
parent 0b72d0f86c
commit 808a0f7cbe
7 changed files with 302 additions and 990 deletions


@@ -109,7 +109,7 @@
 # define OS_WIN 0
 #endif
 // .. Linux
-#if defined(linux) || defined(__linux)
+#if defined(linux) || defined(__linux) || defined(__linux__)
 # define OS_LINUX 1
 #else
 # define OS_LINUX 0
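
A note on this one-line change: in strict ISO modes (e.g. gcc -ansi or -std=c99) the non-reserved spelling "linux" is not predefined, and some compilers provide only the reserved "__linux__" form, so checking all three spellings keeps OS_LINUX reliable. A quick probe, assuming a Linux-targeting compiler:

// probe which Linux macros this compiler predefines; build once in the
// default mode and once in a strict mode (e.g. g++ -ansi -pedantic) to
// see the non-reserved spelling disappear.
#include <cstdio>

int main()
{
#ifdef linux
	puts("linux defined");
#endif
#ifdef __linux
	puts("__linux defined");
#endif
#ifdef __linux__
	puts("__linux__ defined");
#endif
	return 0;
}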


@@ -212,6 +212,17 @@ int ilog2(const float x)
 }
+
+uint bits(uint num, uint lo_idx, uint hi_idx)
+{
+	uint result = num;
+	result >>= lo_idx;
+	const uint count = (hi_idx - lo_idx)+1;	// # bits to return
+	result &= (1u << count)-1;
+	return result;
+}
+
 // multiple must be a power of two.
 uintptr_t round_up(const uintptr_t n, const uintptr_t multiple)
 {
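
The new bits() returns bits lo_idx..hi_idx of num, shifted down so the field starts at bit 0. One limitation implied by the body: for the full 32-bit field (lo_idx = 0, hi_idx = 31), count is 32 and the shift 1u << 32 is undefined behavior, so it is only safe for sub-word fields. A small self-checking example, with bits() copied from the hunk above:

#include <cassert>
typedef unsigned int uint;

// copy of bits() from lib.cpp above, for standalone testing
static uint bits(uint num, uint lo_idx, uint hi_idx)
{
	uint result = num;
	result >>= lo_idx;
	const uint count = (hi_idx - lo_idx)+1;	// # bits to return
	result &= (1u << count)-1;
	return result;
}

int main()
{
	assert(bits(0xABCDu,  0,  3) == 0xDu );	// lowest nibble
	assert(bits(0xABCDu,  8, 15) == 0xABu);	// second byte
	assert(bits(0xABCDu, 12, 15) == 0xAu );	// highest nibble of the u16
	return 0;
}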


@@ -347,7 +347,7 @@ const size_t GiB = 1ul << 30;
 #define BIT(n) (1ul << (n))
+extern uint bits(uint x, uint from, uint to);


@@ -187,17 +187,6 @@ found_ecdr:
 //
 ///////////////////////////////////////////////////////////////////////////////
-
-static uint bits(uint num, uint lo_idx, uint hi_idx)
-{
-	uint result = num;
-	result >>= lo_idx;
-	const uint count = (hi_idx - lo_idx)+1;	// number of bits to return
-	result &= (1u << count)-1;
-	return result;
-}
-
 static time_t convert_dos_date(u16 fatdate, u16 fattime)
 {
 	struct tm t;	// struct tm format:
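
convert_dos_date, visible at the end of this hunk, is the original caller of bits(): it unpacks the packed FAT/DOS timestamp. Its body is not shown in the diff, so the following is only a sketch of that unpacking under the standard FAT field layout (date: day in bits 0..4, month in 5..8, years-since-1980 in 9..15; time: seconds/2 in 0..4, minutes in 5..10, hours in 11..15):

#include <ctime>
typedef unsigned int uint;
typedef unsigned short u16;

static uint bits(uint num, uint lo_idx, uint hi_idx)	// as in lib.cpp
{
	num >>= lo_idx;
	return num & ((1u << (hi_idx - lo_idx + 1)) - 1);
}

// hypothetical reconstruction; the real body lives in zip.cpp
static time_t convert_dos_date_sketch(u16 fatdate, u16 fattime)
{
	struct tm t = {};
	t.tm_sec   = (int)bits(fattime,  0,  4) * 2;	// stored with 2 s granularity
	t.tm_min   = (int)bits(fattime,  5, 10);
	t.tm_hour  = (int)bits(fattime, 11, 15);
	t.tm_mday  = (int)bits(fatdate,  0,  4);
	t.tm_mon   = (int)bits(fatdate,  5,  8) - 1;	// tm_mon is 0-based
	t.tm_year  = (int)bits(fatdate,  9, 15) + 80;	// tm_year counts from 1900
	t.tm_isdst = -1;	// let mktime determine DST
	return mktime(&t);
}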

source/lib/sysdep/ia32.asm Normal file

@@ -0,0 +1,268 @@
CACHEBLOCK equ 128			; 64-byte blocks per prefetch chunk (8 KiB)
BP_MIN_THRESHOLD_64 equ 192*1024	; min 64B-block count for block prefetch
MOVNTQ_MIN_THRESHOLD_64 equ 64*1024	; min 64B-block count for movntq streaming
; copy the tail: jump into the unrolled movsd/movsb runs so that exactly
; (ebx & 63) remaining bytes are copied without a loop.
%macro MC_UNROLLED_MOVSD 0
and ebx, 63
mov edx, ebx
shr edx, 2 ; dword count
neg edx
add edx, %%movsd_table_end
jmp edx
align 8
movsd
movsd
movsd
movsd
movsd
movsd
movsd
movsd
movsd
movsd
movsd
movsd
movsd
movsd
movsd
movsd
%%movsd_table_end:
mov eax, ebx
and eax, 3
neg eax
add eax, %%movsb_table_end
jmp eax
movsb
movsb
movsb
%%movsb_table_end:
%endm
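
What MC_UNROLLED_MOVSD does, in C terms: movsd and movsb are one byte of code each, so jumping (count) bytes before the table end executes exactly that many copies. A loop-based equivalent (names are mine):

#include <cstddef>
#include <cstdint>
#include <cstring>

// equivalent of MC_UNROLLED_MOVSD: copy the final (nbytes & 63) bytes,
// one dword at a time, then the 0..3 leftover single bytes.
static void copy_tail(uint8_t* dst, const uint8_t* src, size_t nbytes)
{
	const size_t tail = nbytes & 63;
	for(size_t i = 0; i < tail/4; i++)	// one movsd each
	{
		memcpy(dst, src, 4);
		dst += 4; src += 4;
	}
	for(size_t i = 0; i < (tail & 3); i++)	// one movsb each
		*dst++ = *src++;
}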
; align edi to an 8-byte boundary: copy up to 7 bytes (clamped to the
; total count in ecx) via a computed jump into a movsb run.
%macro MC_ALIGN 0
mov eax, 8
sub eax, edi
and eax, 7
cmp eax, ecx
cmova eax, ecx
sub ecx, eax
neg eax
add eax, %%align_table_end
jmp eax
align 4
movsb
movsb
movsb
movsb
movsb
movsb
movsb
movsb
%%align_table_end:
%endm
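
MC_ALIGN in C terms: (8 - dst) & 7 is the distance to the next 8-byte boundary, and the cmova clamps it to the total count so short copies stay in bounds. A sketch (hypothetical helper name):

#include <cstddef>
#include <cstdint>

// equivalent of MC_ALIGN: copy just enough bytes, clamped to the total
// count, to make dst 8-byte aligned; returns the remaining byte count
// (what ecx holds afterwards).
static size_t align_dst8(uint8_t*& dst, const uint8_t*& src, size_t nbytes)
{
	size_t misalign = (8 - (uintptr_t)dst) & 7;
	if(misalign > nbytes)
		misalign = nbytes;	// the cmova clamp
	for(size_t i = 0; i < misalign; i++)
		*dst++ = *src++;
	return nbytes - misalign;
}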
; copy ecx 64-byte blocks through the cache with paired MMX movq
; loads/stores, prefetching (non-temporally) well ahead of the reads.
%macro MC_MOVQ 0
align 16
%%1:
prefetchnta [esi + (200*64/34+192)]
movq mm0, [esi+0]
movq mm1, [esi+8]
movq [edi+0], mm0
movq [edi+8], mm1
movq mm2, [esi+16]
movq mm3, [esi+24]
movq [edi+16], mm2
movq [edi+24], mm3
movq mm0, [esi+32]
movq mm1, [esi+40]
movq [edi+32], mm0
movq [edi+40], mm1
movq mm2, [esi+48]
movq mm3, [esi+56]
movq [edi+48], mm2
movq [edi+56], mm3
add esi, 64
add edi, 64
dec ecx
jnz %%1
%endm
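
The same inner loop, sketched with MMX/SSE intrinsics for readers who prefer C (32-bit x86 builds only; _mm_prefetch with _MM_HINT_NTA corresponds to prefetchnta, _mm_empty to the emms executed after the loop):

#include <mmintrin.h>
#include <xmmintrin.h>
#include <cstddef>

// copy `blocks` 64-byte blocks through the cache, 8 bytes per movq
static void copy_blocks_movq(char* dst, const char* src, size_t blocks)
{
	for(; blocks != 0; blocks--)
	{
		_mm_prefetch(src + (200*64/34+192), _MM_HINT_NTA);
		for(int ofs = 0; ofs < 64; ofs += 8)
			*(__m64*)(dst+ofs) = *(const __m64*)(src+ofs);
		src += 64; dst += 64;
	}
	_mm_empty();	// emms: re-enable x87 FPU code
}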
; we have >= 8 KiB: copy 8 KiB chunks until fewer than 8 KiB remain.
%macro MC_BP_MOVNTQ 0
%%prefetch_and_copy_chunk:
mov eax, CACHEBLOCK / 2 ; block prefetch loop, unrolled 2X
add esi, CACHEBLOCK * 64 ; move to the top of the block
align 16
; touch each cache line in reverse order (prevents HW prefetch)
%%prefetch_chunk:
mov edx, [esi-64]
mov edx, [esi-128]
sub esi, 128
dec eax
jnz %%prefetch_chunk
mov eax, CACHEBLOCK ; now that it's in cache, do the copy
align 16
%%copy_block:
movq mm0, [esi+ 0]
movq mm1, [esi+ 8]
movq mm2, [esi+16]
movq mm3, [esi+24]
movq mm4, [esi+32]
movq mm5, [esi+40]
movq mm6, [esi+48]
movq mm7, [esi+56]
add esi, 64
movntq [edi+ 0], mm0
movntq [edi+ 8], mm1
movntq [edi+16], mm2
movntq [edi+24], mm3
movntq [edi+32], mm4
movntq [edi+40], mm5
movntq [edi+48], mm6
movntq [edi+56], mm7
add edi, 64
dec eax
jnz %%copy_block
sub ecx, CACHEBLOCK ; update the 64-byte block count
cmp ecx, CACHEBLOCK
jl %%prefetch_and_copy_chunk
%endm
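
The idea behind MC_BP_MOVNTQ (block prefetch): reading one word per cache line of an 8 KiB chunk in reverse order keeps the hardware prefetcher out of the way while pulling the whole chunk into cache; the chunk is then written with non-temporal movntq stores so the (likely never re-read) destination does not evict useful cache lines. A sketch with intrinsics:

#include <mmintrin.h>
#include <xmmintrin.h>
#include <cstddef>
#include <cstdint>

// `blocks` is the 64-byte block count; the < 128-block remainder is
// returned for the caller to finish (as the asm does via _movntq/tiny).
static size_t copy_chunks_bp(char* dst, const char* src, size_t blocks)
{
	const size_t CHUNK = 128;	// CACHEBLOCK: 128 blocks = 8 KiB
	while(blocks >= CHUNK)
	{
		volatile uint32_t sink;
		for(ptrdiff_t ofs = CHUNK*64 - 64; ofs >= 0; ofs -= 64)
			sink = *(const uint32_t*)(src + ofs);	// touch one cache line
		(void)sink;
		for(size_t ofs = 0; ofs < CHUNK*64; ofs += 8)
			_mm_stream_pi((__m64*)(dst+ofs), *(const __m64*)(src+ofs));
		src += CHUNK*64; dst += CHUNK*64; blocks -= CHUNK;
	}
	_mm_sfence();	// order the non-temporal stores
	_mm_empty();
	return blocks;	// whole blocks still to copy
}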
; we have >= 64 64-byte blocks: stream them with non-temporal movntq.
%macro MC_MOVNTQ 0
align 16
%%1:
prefetchnta [esi + (200*64/34+192)]
movq mm0,[esi+0]
add edi,64
movq mm1,[esi+8]
add esi,64
movq mm2,[esi-48]
movntq [edi-64], mm0
movq mm0,[esi-40]
movntq [edi-56], mm1
movq mm1,[esi-32]
movntq [edi-48], mm2
movq mm2,[esi-24]
movntq [edi-40], mm0
movq mm0,[esi-16]
movntq [edi-32], mm1
movq mm1,[esi-8]
movntq [edi-24], mm2
movntq [edi-16], mm0
dec ecx
movntq [edi-8], mm1
jnz %%1
%endm
; void __declspec(naked) ia32_memcpy(void* dst, const void* src, size_t nbytes)
global _ia32_memcpy
_ia32_memcpy:
mov ecx, [esp+4+8] ; nbytes
mov esi, [esp+4+4] ; src
mov edi, [esp+4+0] ; dst
MC_ALIGN
mov ebx, ecx
shr ecx, 6 ; # blocks
mov eax, _bp
cmp ecx, BP_MIN_THRESHOLD_64
mov edx, _movntq
cmovb eax, edx
cmp ecx, MOVNTQ_MIN_THRESHOLD_64
mov edx, _mmx
cmovb eax, edx
cmp ecx, 64
jbe tiny
jmp eax
tiny:
MC_UNROLLED_MOVSD
ret
_mmx:
MC_MOVQ
emms
jmp tiny
_bp:
MC_BP_MOVNTQ
sfence
emms
cmp ecx, 0	; any whole 64-byte blocks left?
jz tiny		; if not, just copy the tail (protects the routine below from ecx = 0)
_movntq:
MC_MOVNTQ
sfence
emms
jmp tiny
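
The dispatch at the top of ia32_memcpy, restated in C: after shr ecx, 6, ecx holds the 64-byte block count, and the cmp/cmov chain selects a routine per size class. A sketch with hypothetical stand-in names for the four paths:

#include <cstddef>

// thresholds as defined at the top of the file; per the cmp instructions,
// they are compared against the 64-byte block count in ecx.
const size_t MOVNTQ_MIN_THRESHOLD_64 = 64*1024;
const size_t BP_MIN_THRESHOLD_64 = 192*1024;

// stand-ins for the asm paths
static void copy_tiny(void*, const void*, size_t) {}		// movsd/movsb only
static void copy_mmx(void*, const void*, size_t) {}		// MC_MOVQ
static void copy_movntq(void*, const void*, size_t) {}		// MC_MOVNTQ
static void copy_bp_movntq(void*, const void*, size_t) {}	// MC_BP_MOVNTQ

// mirrors the cmp/cmov chain that selects eax before `jmp eax`
static void ia32_memcpy_sketch(void* dst, const void* src, size_t nbytes)
{
	const size_t blocks = nbytes / 64;	// shr ecx, 6
	if(blocks <= 64)
		copy_tiny(dst, src, nbytes);
	else if(blocks < MOVNTQ_MIN_THRESHOLD_64)
		copy_mmx(dst, src, nbytes);
	else if(blocks < BP_MIN_THRESHOLD_64)
		copy_movntq(dst, src, nbytes);
	else
		copy_bp_movntq(dst, src, nbytes);
}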
; extern "C" int __cdecl get_cur_processor_id();
global _get_cur_processor_id
_get_cur_processor_id:
push ebx
push 1
pop eax
cpuid
shr ebx, 24
mov eax, ebx	; return the initial APIC ID (was in ebx[31:24])
pop ebx
ret
; extern "C" uint __cdecl ia32_control87(uint new_cw, uint mask)
global _ia32_control87
_ia32_control87:
push eax
fnstcw [esp]
pop eax ; old_cw
mov ecx, [esp+4] ; new_cw
mov edx, [esp+8] ; mask
and ecx, edx ; new_cw & mask
not edx ; ~mask
and eax, edx ; old_cw & ~mask
or eax, ecx ; (old_cw & ~mask) | (new_cw & mask)
push eax	; load the merged control word
fldcw [esp]
pop eax
xor eax, eax	; return value
ret

File diff suppressed because it is too large.


@@ -24,6 +24,12 @@
 #include "lib/types.h"
+
+// some of these are implemented in asm, so make sure name mangling is
+// disabled.
+#ifdef __cplusplus
+extern "C" {
+#endif
 
 extern double _ceil(double);
@@ -38,7 +44,7 @@ extern u64 rdtsc(void);
 #endif
 #define _control87 ia32_control87
-extern uint ia32_control87(uint new_cw, uint mask);
+extern uint ia32_control87(uint new_cw, uint mask);	// asm
 extern void ia32_debug_break(void);
@@ -78,4 +84,11 @@ extern bool ia32_cap(CpuCap cap);
 extern void ia32_get_cpu_info(void);
+
+// internal use only
+extern int get_cur_processor_id();
+
+#ifdef __cplusplus
+}
+#endif
 
 #endif	// #ifndef IA32_H
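
Why the extern "C" wrapper added above matters: C++ mangles ordinary declarations, so without it a C++ caller would reference a mangled symbol that ia32.asm never defines, and linking would fail. A minimal caller, assuming the assembled ia32.asm object is linked in:

typedef unsigned int uint;

#ifdef __cplusplus
extern "C" {
#endif
extern uint ia32_control87(uint new_cw, uint mask);	// defined in ia32.asm
extern int get_cur_processor_id();			// defined in ia32.asm
#ifdef __cplusplus
}
#endif

#include <cstdio>

int main()
{
	ia32_control87(0, 0);	// mask = 0: reloads the current control word, a no-op
	printf("initial APIC ID: %d\n", get_cur_processor_id());
	return 0;
}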