Profiling indicates memcpy is a bottleneck, so globally replace all calls to it with memcpy2 (hand-optimized). Depending on transfer size, this is 10%..300% faster. Also made memcpy2 a macro instead of a thunk function, because VC7 was still generating a jump for the thunk.

This was SVN commit r3057.
This commit is contained in:
janwas 2005-10-30 16:19:20 +00:00
parent 252bd478c4
commit 20d038efda
23 changed files with 43 additions and 47 deletions

View File

@ -209,7 +209,7 @@ int CMapReader::ApplyData()
if (unit)
{
CMatrix3D transform;
memcpy(&transform._11, m_Objects[i].m_Transform, sizeof(float)*16);
memcpy2(&transform._11, m_Objects[i].m_Transform, sizeof(float)*16);
unit->GetModel()->SetTransform(transform);
}
}

View File

@ -143,12 +143,12 @@ void CParticleEmitter::Update()
newParticle->m_gravity = m_gravity;
// calculate and assign colour
memcpy(colour, m_startColour, sizeof(float) * 4);
memcpy2(colour, m_startColour, sizeof(float) * 4);
colour[0] += (rand() % (int)((m_endColour[0] - m_startColour[0]) * 1000.0f + 1)) / 1000.0f;
colour[1] += (rand() % (int)((m_endColour[1] - m_startColour[1]) * 1000.0f + 1)) / 1000.0f;
colour[2] += (rand() % (int)((m_endColour[2] - m_startColour[2]) * 1000.0f + 1)) / 1000.0f;
colour[3] += (rand() % (int)((m_endColour[3] - m_startColour[3]) * 1000.0f + 1)) / 1000.0f;
memcpy(newParticle->m_colour, colour, sizeof(float) * 4);
memcpy2(newParticle->m_colour, colour, sizeof(float) * 4);
// assign sprite
newParticle->m_sprite = m_sprite;

View File

@ -58,7 +58,7 @@ bool CTerrain::Initialize(u32 size,const u16* data)
// given a heightmap?
if (data) {
// yes; keep a copy of it
memcpy(m_Heightmap,data,m_MapSize*m_MapSize*sizeof(u16));
memcpy2(m_Heightmap,data,m_MapSize*m_MapSize*sizeof(u16));
} else {
// build a flat terrain
memset(m_Heightmap,0,m_MapSize*m_MapSize*sizeof(u16));
@ -263,7 +263,7 @@ void CTerrain::Resize(u32 size)
u16* dst=newHeightmap;
u32 copysize=newMapSize>m_MapSize ? m_MapSize : newMapSize;
for (j=0;j<copysize;j++) {
memcpy(dst,src,copysize*sizeof(u16));
memcpy2(dst,src,copysize*sizeof(u16));
dst+=copysize;
src+=m_MapSize;
if (newMapSize>m_MapSize) {
@ -280,7 +280,7 @@ void CTerrain::Resize(u32 size)
src=newHeightmap+((m_MapSize-1)*newMapSize);
dst=src+newMapSize;
for (u32 i=0;i<newMapSize-m_MapSize;i++) {
memcpy(dst,src,newMapSize*sizeof(u16));
memcpy2(dst,src,newMapSize*sizeof(u16));
dst+=newMapSize;
}
}
@ -290,7 +290,7 @@ void CTerrain::Resize(u32 size)
for (u32 i=0;i<size;i++) {
// copy over texture data from existing tiles, if possible
if (i<m_MapSizePatches && j<m_MapSizePatches) {
memcpy(newPatches[j*size+i].m_MiniPatches,m_Patches[j*m_MapSizePatches+i].m_MiniPatches,sizeof(CMiniPatch)*PATCH_SIZE*PATCH_SIZE);
memcpy2(newPatches[j*size+i].m_MiniPatches,m_Patches[j*m_MapSizePatches+i].m_MiniPatches,sizeof(CMiniPatch)*PATCH_SIZE*PATCH_SIZE);
}
}
@ -361,7 +361,7 @@ void CTerrain::InitialisePatches()
void CTerrain::SetHeightMap(u16* heightmap)
{
// keep a copy of the given heightmap
memcpy(m_Heightmap,heightmap,m_MapSize*m_MapSize*sizeof(u16));
memcpy2(m_Heightmap,heightmap,m_MapSize*m_MapSize*sizeof(u16));
// recalculate patch bounds, invalidate vertices
for (u32 j=0;j<m_MapSizePatches;j++) {

View File

@ -35,7 +35,7 @@ namespace I18n
ref = new strImW_data;
size_t len = wcslen(s)+1;
ref->data = new wchar_t[len];
memcpy((void*)ref->data, s, len*sizeof(wchar_t));
memcpy2((void*)ref->data, s, len*sizeof(wchar_t));
}
StrImW(const char* s)

View File

@ -47,7 +47,7 @@ void debug_wprintf_mem(const wchar_t* fmt, ...)
{
const size_t copy_size = sizeof(wchar_t) * LOG_CHARS/2;
wchar_t* const middle = &debug_log[LOG_CHARS/2];
memcpy(debug_log, middle, copy_size);
memcpy2(debug_log, middle, copy_size);
memset(middle, 0, copy_size);
debug_log_pos -= LOG_CHARS/2; // don't assign middle (may leave gap)
}

View File

@ -1173,7 +1173,7 @@ void* realloc_dbg(const void* user_p, size_t user_size, AllocType type, const ch
// old_size should only be non-zero if the Alloc security checks all passed
// If the old buffer was actually zero bytes large, do nothing :P
if (old_size)
memcpy(ret, user_p, old_size);
memcpy2(ret, user_p, old_size);
if(user_p)
free_dbg(user_p, AT_FREE, file,line,func, stack_frames+1);

View File

@ -1048,7 +1048,7 @@ int file_invalidate_cache(const char* fn)
// the underlying aio implementation likes buffer and offset to be
// sector-aligned; if not, the transfer goes through an align buffer,
// and requires an extra memcpy.
// and requires an extra memcpy2.
//
// if the user specifies an unaligned buffer, there's not much we can
// do - we can't assume the buffer contains padding. therefore,
@ -1226,7 +1226,7 @@ ssize_t file_io(File* f, off_t data_ofs, size_t data_size, void* data_buf,
// we have useable data from a previous temp buffer,
// but it needs to be copied into the user's buffer
if(from_cache && !temp)
memcpy((char*)data_buf+raw_transferred_cnt, data, size);
memcpy2((char*)data_buf+raw_transferred_cnt, data, size);
//// if size comes out short, we must be at EOF

View File

@ -462,7 +462,7 @@ static int lookup_add_file_cb(uintptr_t user, i32 idx,
char* fn_copy = (char*)malloc(fn_len+1);
if(!fn_copy)
return ERR_NO_MEM;
memcpy(fn_copy, fn, fn_len);
memcpy2(fn_copy, fn, fn_len);
fn_copy[fn_len] = '\0';
ent->fn = fn_copy;

View File

@ -123,7 +123,7 @@ static void create_level(uint level, uint level_w, uint level_h,
if(level == 0)
{
debug_assert(level_data_size == cld->prev_level_data_size);
memcpy(dst, src, level_data_size);
memcpy2(dst, src, level_data_size);
}
else
{
@ -224,7 +224,7 @@ TIMER_ACCRUE(tc_plain_transform);
clone_data = mem_alloc(data_size, 4*KiB);
if(!clone_data)
return ERR_NO_MEM;
memcpy(clone_data, data, data_size);
memcpy2(clone_data, data, data_size);
src = (const u8*)clone_data+data_size-pitch; // last row
row_ofs = -(ssize_t)pitch;
}
@ -234,7 +234,7 @@ TIMER_ACCRUE(tc_plain_transform);
{
for(uint y = 0; y < h; y++)
{
memcpy(dst, src, pitch);
memcpy2(dst, src, pitch);
dst += pitch;
src += row_ofs;
}

View File

@ -63,7 +63,7 @@ size_t read_func(void* ptr, size_t elements, size_t el_size, void* datasource)
Buf& b = incoming_bufs->front();
size_t copy_size = std::min(b.left, size);
memcpy(ptr, (char*)b.p+b.pos, copy_size);
memcpy2(ptr, (char*)b.p+b.pos, copy_size);
total_read += copy_size;
b.pos += copy_size;
b.left -= copy_size;
@ -109,7 +109,7 @@ void ogg_give_raw(void* _o, void* p, size_t size)
IncomingBufs* incoming_bufs = &o->incoming_bufs;
void* copy = malloc(size);
memcpy(copy, p, size);
memcpy2(copy, p, size);
incoming_bufs->push_back(Buf(copy, size));
}

View File

@ -53,16 +53,6 @@ inline double rint(double d)
#endif // !HAVE_C99
// memcpy2: copy nbytes bytes from src to dst.
// Forwards to ia32_memcpy when CPU_IA32 is non-zero; otherwise falls back
// to the C library's memcpy. Nothing is returned in either case.
void memcpy2(void* dst, const void* src, size_t nbytes)
{
#if !CPU_IA32
	// portable path: plain library copy (its return value is not needed)
	(void)memcpy(dst, src, nbytes);
#else
	// IA32 path: project-provided copy routine
	ia32_memcpy(dst, src, nbytes);
#endif
}
// not possible with POSIX calls.
// called from ia32.cpp get_cpu_count
int on_each_cpu(void(*cb)())

View File

@ -50,6 +50,13 @@ extern int vsnprintf2(char* buffer, size_t count, const char* format, va_list ar
extern void* alloca(size_t size);
#endif
// memcpy2: replacement for memcpy at call sites that only need the copy.
// It is a macro rather than a thunk function so that no extra jump is
// generated at the call site.
//
// The test is on the VALUE of CPU_IA32 (#if, not #ifdef), matching how the
// previous function implementation of memcpy2 selected its code path; with
// #ifdef, a CPU_IA32 defined as 0 would wrongly select ia32_memcpy.
//
// NOTE: ia32_memcpy returns void, whereas memcpy returns dst. Portable code
// must therefore never use the result of memcpy2.
#if CPU_IA32
extern void ia32_memcpy(void* dst, const void* src, size_t nbytes);
# define memcpy2 ia32_memcpy
#else
# define memcpy2 memcpy
#endif
// rint: round float to nearest integer.
// provided by C99, otherwise:
#if !HAVE_C99
@ -190,7 +197,6 @@ wchar_t* get_module_filename(void* addr, wchar_t* path);
extern int pick_directory(char* n_path, size_t buf_size);
extern void memcpy2(void* dst, const void* src, size_t nbytes);
// not possible with POSIX calls.
// called from ia32.cpp get_cpu_count

View File

@ -481,7 +481,7 @@ static int aio_rw(struct aiocb* cb)
// unaligned buffer: copy to align buffer and write from there.
if(buf_misaligned)
{
memcpy(r->buf, buf, size);
memcpy2(r->buf, buf, size);
memset((char*)r->buf + size, 0, actual_size - size);
// clear previous contents at end of align buf
actual_buf = r->buf;
@ -580,7 +580,7 @@ ssize_t aio_return(struct aiocb* cb)
// we read into align buffer - copy to user's buffer
if(r->read_into_align_buffer)
memcpy((void*)cb->aio_buf, (u8*)r->buf + r->pad, cb->aio_nbytes);
memcpy2((void*)cb->aio_buf, (u8*)r->buf + r->pad, cb->aio_nbytes);
// TODO: this copies data back into original buffer from align buffer
// when writing from unaligned buffer. unnecessarily slow.

View File

@ -239,7 +239,7 @@ PinhFromImageBase(HMODULE hmod) {
// Overwrite the thunk array at pitdDst with the contents of pitdSrc.
// The number of bytes copied is CountOfImports(pitdDst) entries of
// IMAGE_THUNK_DATA, i.e. the count is taken from the destination table.
// NOTE(review): this listing appears to be a diff view; the memcpy/memcpy2
// pair below looks like the before/after form of one statement - confirm
// that only one of them exists in the actual file.
static inline void WINAPI
OverlayIAT(PImgThunkData pitdDst, PCImgThunkData pitdSrc) {
memcpy(pitdDst, pitdSrc, CountOfImports(pitdDst) * sizeof IMAGE_THUNK_DATA);
memcpy2(pitdDst, pitdSrc, CountOfImports(pitdDst) * sizeof IMAGE_THUNK_DATA);
}
static inline DWORD WINAPI

View File

@ -67,7 +67,7 @@ void CFilePacker::PackRaw(const void* rawdata,u32 rawdatalen)
{
u32 start=(u32)m_Data.size();
m_Data.resize(m_Data.size()+rawdatalen);
memcpy(&m_Data[start],rawdata,rawdatalen);
memcpy2(&m_Data[start],rawdata,rawdatalen);
*(u32*)&m_Data[8] += rawdatalen; // FIXME byte order?
}

View File

@ -146,7 +146,7 @@ void CFileUnpacker::UnpackRaw(void* rawdata,u32 rawdatalen)
{
// yes .. copy over
void* src = (char*)m_Buf + m_UnpackPos;
memcpy(rawdata, src, rawdatalen);
memcpy2(rawdata, src, rawdatalen);
m_UnpackPos += rawdatalen;
}
else

View File

@ -71,7 +71,7 @@ CSocketAddress::CSocketAddress(int port, ESocketProtocol proto)
break;
case IPv6:
m_Union.m_IPv6.sin6_family=PF_INET6;
memcpy(&m_Union.m_IPv6.sin6_addr, &in6addr_any, sizeof(in6addr_any));
memcpy2(&m_Union.m_IPv6.sin6_addr, &in6addr_any, sizeof(in6addr_any));
m_Union.m_IPv6.sin6_port=htons(port);
break;
default:
@ -91,7 +91,7 @@ CSocketAddress CSocketAddress::Loopback(int port, ESocketProtocol proto)
break;
case IPv6:
ret.m_Union.m_IPv6.sin6_family=PF_INET6;
memcpy(&ret.m_Union.m_IPv6.sin6_addr, &in6addr_loopback, sizeof(in6addr_loopback));
memcpy2(&ret.m_Union.m_IPv6.sin6_addr, &in6addr_loopback, sizeof(in6addr_loopback));
ret.m_Union.m_IPv6.sin6_port=htons(port);
break;
default:
@ -109,7 +109,7 @@ PS_RESULT CSocketAddress::Resolve(const char *name, int port, CSocketAddress &ad
if (res == 0)
{
if (ai->ai_addrlen < sizeof(addr.m_Union))
memcpy(&addr.m_Union, ai->ai_addr, ai->ai_addrlen);
memcpy2(&addr.m_Union, ai->ai_addr, ai->ai_addrlen);
switch (addr.m_Union.m_Family)
{
case IPv4:

View File

@ -303,7 +303,7 @@ void WriteBigScreenshot(const char* extension, int tiles)
{
void* dest = (char*)img + ((tile_y*tile_h + y) * img_w + (tile_x*tile_w)) * bpp/8;
void* src = (char*)tile_data + y * tile_w * bpp/8;
memcpy(dest, src, tile_w * bpp/8);
memcpy2(dest, src, tile_w * bpp/8);
}
}
}

View File

@ -149,7 +149,7 @@ InputSource *CVFSEntityResolver::resolveEntity(const XMLCh *const UNUSED(publicI
const ptrdiff_t prefixlen=end-m_DocName;
memcpy(abspath, m_DocName, prefixlen);
memcpy2(abspath, m_DocName, prefixlen);
strncpy(abspath+prefixlen, path, VFS_MAX_PATH-prefixlen);
// strncpy might not have terminated, if path was too long
abspath[VFS_MAX_PATH-1]=0;

View File

@ -43,14 +43,14 @@ public:
// Append `size` bytes from `data` at position `length` of `buffer`,
// then advance `length` by `size`.
// NOTE(review): this listing appears to be a diff view; the memcpy/memcpy2
// pair below looks like the before/after form of one statement - confirm.
void write(const void* data, int size)
{
// call grow() repeatedly until length + size is strictly below allocated
while (length + size >= allocated) grow();
memcpy(&buffer[length], data, size);
memcpy2(&buffer[length], data, size);
length += size;
}
// Overwrite `size` bytes of `buffer` starting at `offset` with `data`.
// `length` is not modified; the target range must lie entirely within
// the first `length` bytes (checked by the assertion below).
// NOTE(review): this listing appears to be a diff view; the memcpy/memcpy2
// pair below looks like the before/after form of one statement - confirm.
void write(const void* data, int size, int offset)
{
// range check: offset must be non-negative and offset+size within length
debug_assert(offset >= 0 && offset+size <= length);
memcpy(&buffer[offset], data, size);
memcpy2(&buffer[offset], data, size);
}
int tell()

View File

@ -71,7 +71,7 @@ namespace std {
// Copy n characters (n*sizeof(char_type) bytes) from s2 to s1.
// Returns s1, the destination pointer.
static char_type* copy(char_type* s1, const char_type* s2, size_t n)
{
	// memcpy2 may expand to ia32_memcpy, which is declared as returning
	// void, so its result must not be cast or returned; perform the copy
	// and hand back the destination pointer explicitly.
	memcpy2(s1, s2, n*sizeof(char_type));
	return s1;
}
static char_type* assign(char_type* s, size_t n, char_type a)

View File

@ -181,7 +181,7 @@ void CVertexBuffer::AppendBatch(VBChunk* UNUSED(chunk),Handle texture,size_t num
// resize the chunk's batch to fit its indices
batch->m_IndexData.push_back(std::pair<size_t,u16*>(numIndices,indices));
// memcpy(&batch->m_Indices[0]+cursize,indices,sizeof(u16)*numIndices);
// memcpy2(&batch->m_Indices[0]+cursize,indices,sizeof(u16)*numIndices);
}
@ -197,7 +197,7 @@ void CVertexBuffer::UpdateChunkVertices(VBChunk* chunk,void* data)
if (glGetError() != GL_NO_ERROR) throw PSERROR_Renderer_VBOFailed();
} else {
debug_assert(m_SysMem);
memcpy(m_SysMem+chunk->m_Index*m_VertexSize,data,chunk->m_Count*m_VertexSize);
memcpy2(m_SysMem+chunk->m_Index*m_VertexSize,data,chunk->m_Count*m_VertexSize);
}
}

View File

@ -35,7 +35,7 @@ BEGIN_COMMAND(AlterElevation)
int verts = terrain->GetVerticesPerSide()*terrain->GetVerticesPerSide();
OldTerrain = new u16[verts];
memcpy(OldTerrain, terrain->GetHeightMap(), verts*sizeof(u16));
memcpy2(OldTerrain, terrain->GetHeightMap(), verts*sizeof(u16));
int amount = (int)d->amount;
@ -75,7 +75,7 @@ BEGIN_COMMAND(AlterElevation)
{
int verts = terrain->GetVerticesPerSide()*terrain->GetVerticesPerSide();
NewTerrain = new u16[verts];
memcpy(NewTerrain, terrain->GetHeightMap(), verts*sizeof(u16));
memcpy2(NewTerrain, terrain->GetHeightMap(), verts*sizeof(u16));
}
terrain->SetHeightMap(OldTerrain); // CTerrain duplicates the data
}