Profiling indicates memcpy is a bottleneck, so globally replace all calls to it with memcpy2 (hand-optimized). Depending on transfer size, this is 10% to 300% faster. Also made memcpy2 a macro instead of a thunk function, because VC7 was still generating a jump for the thunk.
This was SVN commit r3057.
This commit is contained in:
parent
252bd478c4
commit
20d038efda
@ -209,7 +209,7 @@ int CMapReader::ApplyData()
|
||||
if (unit)
|
||||
{
|
||||
CMatrix3D transform;
|
||||
memcpy(&transform._11, m_Objects[i].m_Transform, sizeof(float)*16);
|
||||
memcpy2(&transform._11, m_Objects[i].m_Transform, sizeof(float)*16);
|
||||
unit->GetModel()->SetTransform(transform);
|
||||
}
|
||||
}
|
||||
|
@ -143,12 +143,12 @@ void CParticleEmitter::Update()
|
||||
newParticle->m_gravity = m_gravity;
|
||||
|
||||
// calculate and assign colour
|
||||
memcpy(colour, m_startColour, sizeof(float) * 4);
|
||||
memcpy2(colour, m_startColour, sizeof(float) * 4);
|
||||
colour[0] += (rand() % (int)((m_endColour[0] - m_startColour[0]) * 1000.0f + 1)) / 1000.0f;
|
||||
colour[1] += (rand() % (int)((m_endColour[1] - m_startColour[1]) * 1000.0f + 1)) / 1000.0f;
|
||||
colour[2] += (rand() % (int)((m_endColour[2] - m_startColour[2]) * 1000.0f + 1)) / 1000.0f;
|
||||
colour[3] += (rand() % (int)((m_endColour[3] - m_startColour[3]) * 1000.0f + 1)) / 1000.0f;
|
||||
memcpy(newParticle->m_colour, colour, sizeof(float) * 4);
|
||||
memcpy2(newParticle->m_colour, colour, sizeof(float) * 4);
|
||||
|
||||
// assign sprite
|
||||
newParticle->m_sprite = m_sprite;
|
||||
|
@ -58,7 +58,7 @@ bool CTerrain::Initialize(u32 size,const u16* data)
|
||||
// given a heightmap?
|
||||
if (data) {
|
||||
// yes; keep a copy of it
|
||||
memcpy(m_Heightmap,data,m_MapSize*m_MapSize*sizeof(u16));
|
||||
memcpy2(m_Heightmap,data,m_MapSize*m_MapSize*sizeof(u16));
|
||||
} else {
|
||||
// build a flat terrain
|
||||
memset(m_Heightmap,0,m_MapSize*m_MapSize*sizeof(u16));
|
||||
@ -263,7 +263,7 @@ void CTerrain::Resize(u32 size)
|
||||
u16* dst=newHeightmap;
|
||||
u32 copysize=newMapSize>m_MapSize ? m_MapSize : newMapSize;
|
||||
for (j=0;j<copysize;j++) {
|
||||
memcpy(dst,src,copysize*sizeof(u16));
|
||||
memcpy2(dst,src,copysize*sizeof(u16));
|
||||
dst+=copysize;
|
||||
src+=m_MapSize;
|
||||
if (newMapSize>m_MapSize) {
|
||||
@ -280,7 +280,7 @@ void CTerrain::Resize(u32 size)
|
||||
src=newHeightmap+((m_MapSize-1)*newMapSize);
|
||||
dst=src+newMapSize;
|
||||
for (u32 i=0;i<newMapSize-m_MapSize;i++) {
|
||||
memcpy(dst,src,newMapSize*sizeof(u16));
|
||||
memcpy2(dst,src,newMapSize*sizeof(u16));
|
||||
dst+=newMapSize;
|
||||
}
|
||||
}
|
||||
@ -290,7 +290,7 @@ void CTerrain::Resize(u32 size)
|
||||
for (u32 i=0;i<size;i++) {
|
||||
// copy over texture data from existing tiles, if possible
|
||||
if (i<m_MapSizePatches && j<m_MapSizePatches) {
|
||||
memcpy(newPatches[j*size+i].m_MiniPatches,m_Patches[j*m_MapSizePatches+i].m_MiniPatches,sizeof(CMiniPatch)*PATCH_SIZE*PATCH_SIZE);
|
||||
memcpy2(newPatches[j*size+i].m_MiniPatches,m_Patches[j*m_MapSizePatches+i].m_MiniPatches,sizeof(CMiniPatch)*PATCH_SIZE*PATCH_SIZE);
|
||||
}
|
||||
}
|
||||
|
||||
@ -361,7 +361,7 @@ void CTerrain::InitialisePatches()
|
||||
void CTerrain::SetHeightMap(u16* heightmap)
|
||||
{
|
||||
// keep a copy of the given heightmap
|
||||
memcpy(m_Heightmap,heightmap,m_MapSize*m_MapSize*sizeof(u16));
|
||||
memcpy2(m_Heightmap,heightmap,m_MapSize*m_MapSize*sizeof(u16));
|
||||
|
||||
// recalculate patch bounds, invalidate vertices
|
||||
for (u32 j=0;j<m_MapSizePatches;j++) {
|
||||
|
@ -35,7 +35,7 @@ namespace I18n
|
||||
ref = new strImW_data;
|
||||
size_t len = wcslen(s)+1;
|
||||
ref->data = new wchar_t[len];
|
||||
memcpy((void*)ref->data, s, len*sizeof(wchar_t));
|
||||
memcpy2((void*)ref->data, s, len*sizeof(wchar_t));
|
||||
}
|
||||
|
||||
StrImW(const char* s)
|
||||
|
@ -47,7 +47,7 @@ void debug_wprintf_mem(const wchar_t* fmt, ...)
|
||||
{
|
||||
const size_t copy_size = sizeof(wchar_t) * LOG_CHARS/2;
|
||||
wchar_t* const middle = &debug_log[LOG_CHARS/2];
|
||||
memcpy(debug_log, middle, copy_size);
|
||||
memcpy2(debug_log, middle, copy_size);
|
||||
memset(middle, 0, copy_size);
|
||||
debug_log_pos -= LOG_CHARS/2; // don't assign middle (may leave gap)
|
||||
}
|
||||
|
@ -1173,7 +1173,7 @@ void* realloc_dbg(const void* user_p, size_t user_size, AllocType type, const ch
|
||||
// old_size should only be non-zero if the Alloc security checks all passed
|
||||
// If the old buffer was actually zero bytes large, do nothing :P
|
||||
if (old_size)
|
||||
memcpy(ret, user_p, old_size);
|
||||
memcpy2(ret, user_p, old_size);
|
||||
|
||||
if(user_p)
|
||||
free_dbg(user_p, AT_FREE, file,line,func, stack_frames+1);
|
||||
|
@ -1048,7 +1048,7 @@ int file_invalidate_cache(const char* fn)
|
||||
|
||||
// the underlying aio implementation likes buffer and offset to be
|
||||
// sector-aligned; if not, the transfer goes through an align buffer,
|
||||
// and requires an extra memcpy.
|
||||
// and requires an extra memcpy2.
|
||||
//
|
||||
// if the user specifies an unaligned buffer, there's not much we can
|
||||
// do - we can't assume the buffer contains padding. therefore,
|
||||
@ -1226,7 +1226,7 @@ ssize_t file_io(File* f, off_t data_ofs, size_t data_size, void* data_buf,
|
||||
// we have useable data from a previous temp buffer,
|
||||
// but it needs to be copied into the user's buffer
|
||||
if(from_cache && !temp)
|
||||
memcpy((char*)data_buf+raw_transferred_cnt, data, size);
|
||||
memcpy2((char*)data_buf+raw_transferred_cnt, data, size);
|
||||
|
||||
|
||||
//// if size comes out short, we must be at EOF
|
||||
|
@ -462,7 +462,7 @@ static int lookup_add_file_cb(uintptr_t user, i32 idx,
|
||||
char* fn_copy = (char*)malloc(fn_len+1);
|
||||
if(!fn_copy)
|
||||
return ERR_NO_MEM;
|
||||
memcpy(fn_copy, fn, fn_len);
|
||||
memcpy2(fn_copy, fn, fn_len);
|
||||
fn_copy[fn_len] = '\0';
|
||||
ent->fn = fn_copy;
|
||||
|
||||
|
@ -123,7 +123,7 @@ static void create_level(uint level, uint level_w, uint level_h,
|
||||
if(level == 0)
|
||||
{
|
||||
debug_assert(level_data_size == cld->prev_level_data_size);
|
||||
memcpy(dst, src, level_data_size);
|
||||
memcpy2(dst, src, level_data_size);
|
||||
}
|
||||
else
|
||||
{
|
||||
@ -224,7 +224,7 @@ TIMER_ACCRUE(tc_plain_transform);
|
||||
clone_data = mem_alloc(data_size, 4*KiB);
|
||||
if(!clone_data)
|
||||
return ERR_NO_MEM;
|
||||
memcpy(clone_data, data, data_size);
|
||||
memcpy2(clone_data, data, data_size);
|
||||
src = (const u8*)clone_data+data_size-pitch; // last row
|
||||
row_ofs = -(ssize_t)pitch;
|
||||
}
|
||||
@ -234,7 +234,7 @@ TIMER_ACCRUE(tc_plain_transform);
|
||||
{
|
||||
for(uint y = 0; y < h; y++)
|
||||
{
|
||||
memcpy(dst, src, pitch);
|
||||
memcpy2(dst, src, pitch);
|
||||
dst += pitch;
|
||||
src += row_ofs;
|
||||
}
|
||||
|
@ -63,7 +63,7 @@ size_t read_func(void* ptr, size_t elements, size_t el_size, void* datasource)
|
||||
Buf& b = incoming_bufs->front();
|
||||
size_t copy_size = std::min(b.left, size);
|
||||
|
||||
memcpy(ptr, (char*)b.p+b.pos, copy_size);
|
||||
memcpy2(ptr, (char*)b.p+b.pos, copy_size);
|
||||
total_read += copy_size;
|
||||
b.pos += copy_size;
|
||||
b.left -= copy_size;
|
||||
@ -109,7 +109,7 @@ void ogg_give_raw(void* _o, void* p, size_t size)
|
||||
IncomingBufs* incoming_bufs = &o->incoming_bufs;
|
||||
|
||||
void* copy = malloc(size);
|
||||
memcpy(copy, p, size);
|
||||
memcpy2(copy, p, size);
|
||||
incoming_bufs->push_back(Buf(copy, size));
|
||||
}
|
||||
|
||||
|
@ -53,16 +53,6 @@ inline double rint(double d)
|
||||
#endif // !HAVE_C99
|
||||
|
||||
|
||||
void memcpy2(void* dst, const void* src, size_t nbytes)
|
||||
{
|
||||
#if CPU_IA32
|
||||
ia32_memcpy(dst, src, nbytes);
|
||||
#else
|
||||
memcpy(dst, src, nbytes);
|
||||
#endif
|
||||
}
|
||||
|
||||
|
||||
// not possible with POSIX calls.
|
||||
// called from ia32.cpp get_cpu_count
|
||||
int on_each_cpu(void(*cb)())
|
||||
|
@ -50,6 +50,13 @@ extern int vsnprintf2(char* buffer, size_t count, const char* format, va_list ar
|
||||
extern void* alloca(size_t size);
|
||||
#endif
|
||||
|
||||
#ifdef CPU_IA32
|
||||
# define memcpy2 ia32_memcpy
|
||||
extern void ia32_memcpy(void* dst, const void* src, size_t nbytes);
|
||||
#else
|
||||
# define memcpy2 memcpy
|
||||
#endif
|
||||
|
||||
// rint: round float to nearest integer.
|
||||
// provided by C99, otherwise:
|
||||
#if !HAVE_C99
|
||||
@ -190,7 +197,6 @@ wchar_t* get_module_filename(void* addr, wchar_t* path);
|
||||
|
||||
extern int pick_directory(char* n_path, size_t buf_size);
|
||||
|
||||
extern void memcpy2(void* dst, const void* src, size_t nbytes);
|
||||
|
||||
// not possible with POSIX calls.
|
||||
// called from ia32.cpp get_cpu_count
|
||||
|
@ -481,7 +481,7 @@ static int aio_rw(struct aiocb* cb)
|
||||
// unaligned buffer: copy to align buffer and write from there.
|
||||
if(buf_misaligned)
|
||||
{
|
||||
memcpy(r->buf, buf, size);
|
||||
memcpy2(r->buf, buf, size);
|
||||
memset((char*)r->buf + size, 0, actual_size - size);
|
||||
// clear previous contents at end of align buf
|
||||
actual_buf = r->buf;
|
||||
@ -580,7 +580,7 @@ ssize_t aio_return(struct aiocb* cb)
|
||||
|
||||
// we read into align buffer - copy to user's buffer
|
||||
if(r->read_into_align_buffer)
|
||||
memcpy((void*)cb->aio_buf, (u8*)r->buf + r->pad, cb->aio_nbytes);
|
||||
memcpy2((void*)cb->aio_buf, (u8*)r->buf + r->pad, cb->aio_nbytes);
|
||||
|
||||
// TODO: this copies data back into original buffer from align buffer
|
||||
// when writing from unaligned buffer. unnecessarily slow.
|
||||
|
@ -239,7 +239,7 @@ PinhFromImageBase(HMODULE hmod) {
|
||||
|
||||
static inline void WINAPI
|
||||
OverlayIAT(PImgThunkData pitdDst, PCImgThunkData pitdSrc) {
|
||||
memcpy(pitdDst, pitdSrc, CountOfImports(pitdDst) * sizeof IMAGE_THUNK_DATA);
|
||||
memcpy2(pitdDst, pitdSrc, CountOfImports(pitdDst) * sizeof IMAGE_THUNK_DATA);
|
||||
}
|
||||
|
||||
static inline DWORD WINAPI
|
||||
|
@ -67,7 +67,7 @@ void CFilePacker::PackRaw(const void* rawdata,u32 rawdatalen)
|
||||
{
|
||||
u32 start=(u32)m_Data.size();
|
||||
m_Data.resize(m_Data.size()+rawdatalen);
|
||||
memcpy(&m_Data[start],rawdata,rawdatalen);
|
||||
memcpy2(&m_Data[start],rawdata,rawdatalen);
|
||||
|
||||
*(u32*)&m_Data[8] += rawdatalen; // FIXME byte order?
|
||||
}
|
||||
|
@ -146,7 +146,7 @@ void CFileUnpacker::UnpackRaw(void* rawdata,u32 rawdatalen)
|
||||
{
|
||||
// yes .. copy over
|
||||
void* src = (char*)m_Buf + m_UnpackPos;
|
||||
memcpy(rawdata, src, rawdatalen);
|
||||
memcpy2(rawdata, src, rawdatalen);
|
||||
m_UnpackPos += rawdatalen;
|
||||
}
|
||||
else
|
||||
|
@ -71,7 +71,7 @@ CSocketAddress::CSocketAddress(int port, ESocketProtocol proto)
|
||||
break;
|
||||
case IPv6:
|
||||
m_Union.m_IPv6.sin6_family=PF_INET6;
|
||||
memcpy(&m_Union.m_IPv6.sin6_addr, &in6addr_any, sizeof(in6addr_any));
|
||||
memcpy2(&m_Union.m_IPv6.sin6_addr, &in6addr_any, sizeof(in6addr_any));
|
||||
m_Union.m_IPv6.sin6_port=htons(port);
|
||||
break;
|
||||
default:
|
||||
@ -91,7 +91,7 @@ CSocketAddress CSocketAddress::Loopback(int port, ESocketProtocol proto)
|
||||
break;
|
||||
case IPv6:
|
||||
ret.m_Union.m_IPv6.sin6_family=PF_INET6;
|
||||
memcpy(&ret.m_Union.m_IPv6.sin6_addr, &in6addr_loopback, sizeof(in6addr_loopback));
|
||||
memcpy2(&ret.m_Union.m_IPv6.sin6_addr, &in6addr_loopback, sizeof(in6addr_loopback));
|
||||
ret.m_Union.m_IPv6.sin6_port=htons(port);
|
||||
break;
|
||||
default:
|
||||
@ -109,7 +109,7 @@ PS_RESULT CSocketAddress::Resolve(const char *name, int port, CSocketAddress &ad
|
||||
if (res == 0)
|
||||
{
|
||||
if (ai->ai_addrlen < sizeof(addr.m_Union))
|
||||
memcpy(&addr.m_Union, ai->ai_addr, ai->ai_addrlen);
|
||||
memcpy2(&addr.m_Union, ai->ai_addr, ai->ai_addrlen);
|
||||
switch (addr.m_Union.m_Family)
|
||||
{
|
||||
case IPv4:
|
||||
|
@ -303,7 +303,7 @@ void WriteBigScreenshot(const char* extension, int tiles)
|
||||
{
|
||||
void* dest = (char*)img + ((tile_y*tile_h + y) * img_w + (tile_x*tile_w)) * bpp/8;
|
||||
void* src = (char*)tile_data + y * tile_w * bpp/8;
|
||||
memcpy(dest, src, tile_w * bpp/8);
|
||||
memcpy2(dest, src, tile_w * bpp/8);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
@ -149,7 +149,7 @@ InputSource *CVFSEntityResolver::resolveEntity(const XMLCh *const UNUSED(publicI
|
||||
|
||||
const ptrdiff_t prefixlen=end-m_DocName;
|
||||
|
||||
memcpy(abspath, m_DocName, prefixlen);
|
||||
memcpy2(abspath, m_DocName, prefixlen);
|
||||
strncpy(abspath+prefixlen, path, VFS_MAX_PATH-prefixlen);
|
||||
// strncpy might not have terminated, if path was too long
|
||||
abspath[VFS_MAX_PATH-1]=0;
|
||||
|
@ -43,14 +43,14 @@ public:
|
||||
void write(const void* data, int size)
|
||||
{
|
||||
while (length + size >= allocated) grow();
|
||||
memcpy(&buffer[length], data, size);
|
||||
memcpy2(&buffer[length], data, size);
|
||||
length += size;
|
||||
}
|
||||
|
||||
void write(const void* data, int size, int offset)
|
||||
{
|
||||
debug_assert(offset >= 0 && offset+size <= length);
|
||||
memcpy(&buffer[offset], data, size);
|
||||
memcpy2(&buffer[offset], data, size);
|
||||
}
|
||||
|
||||
int tell()
|
||||
|
@ -71,7 +71,7 @@ namespace std {
|
||||
|
||||
static char_type* copy(char_type* s1, const char_type* s2, size_t n)
|
||||
{
|
||||
return (char_type *)memcpy(s1, s2, n*sizeof(char_type));
|
||||
return (char_type *)memcpy2(s1, s2, n*sizeof(char_type));
|
||||
}
|
||||
|
||||
static char_type* assign(char_type* s, size_t n, char_type a)
|
||||
|
@ -181,7 +181,7 @@ void CVertexBuffer::AppendBatch(VBChunk* UNUSED(chunk),Handle texture,size_t num
|
||||
|
||||
// resize the chunk's batch to fit its indices
|
||||
batch->m_IndexData.push_back(std::pair<size_t,u16*>(numIndices,indices));
|
||||
// memcpy(&batch->m_Indices[0]+cursize,indices,sizeof(u16)*numIndices);
|
||||
// memcpy2(&batch->m_Indices[0]+cursize,indices,sizeof(u16)*numIndices);
|
||||
}
|
||||
|
||||
|
||||
@ -197,7 +197,7 @@ void CVertexBuffer::UpdateChunkVertices(VBChunk* chunk,void* data)
|
||||
if (glGetError() != GL_NO_ERROR) throw PSERROR_Renderer_VBOFailed();
|
||||
} else {
|
||||
debug_assert(m_SysMem);
|
||||
memcpy(m_SysMem+chunk->m_Index*m_VertexSize,data,chunk->m_Count*m_VertexSize);
|
||||
memcpy2(m_SysMem+chunk->m_Index*m_VertexSize,data,chunk->m_Count*m_VertexSize);
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -35,7 +35,7 @@ BEGIN_COMMAND(AlterElevation)
|
||||
|
||||
int verts = terrain->GetVerticesPerSide()*terrain->GetVerticesPerSide();
|
||||
OldTerrain = new u16[verts];
|
||||
memcpy(OldTerrain, terrain->GetHeightMap(), verts*sizeof(u16));
|
||||
memcpy2(OldTerrain, terrain->GetHeightMap(), verts*sizeof(u16));
|
||||
|
||||
int amount = (int)d->amount;
|
||||
|
||||
@ -75,7 +75,7 @@ BEGIN_COMMAND(AlterElevation)
|
||||
{
|
||||
int verts = terrain->GetVerticesPerSide()*terrain->GetVerticesPerSide();
|
||||
NewTerrain = new u16[verts];
|
||||
memcpy(NewTerrain, terrain->GetHeightMap(), verts*sizeof(u16));
|
||||
memcpy2(NewTerrain, terrain->GetHeightMap(), verts*sizeof(u16));
|
||||
}
|
||||
terrain->SetHeightMap(OldTerrain); // CTerrain duplicates the data
|
||||
}
|
||||
|
Loading…
Reference in New Issue
Block a user