From e07622b56aad56191819e42deedeb7573b38822a Mon Sep 17 00:00:00 2001 From: janwas Date: Sat, 28 Jan 2006 22:19:42 +0000 Subject: [PATCH] Cache: implement meat of landlord algorithm and add remove() allocators: add freelist capability to Bucket; add provision for variable XOR fixed size allocs archive: re-tag file buffers if reading uncompressed from archive; improve LFH fixup logic file_cache: add cache line invalidation; lock down pages (readonly) when IO finished file_io: cleanup+docs; properly cut off at EOF without breaking alignment. file_stats: add seek accounting (WIP) vfs_optimizer: also record file_buf_free in the trace. initial implementation of archive builder (WIP) zip: lfh_fixup now more efficient (does not involve buffer manager - instead it grabs LFH from temp blocks) tex: plug FileIOBuf leak. avoid writing to tex.hm because that is a read-only file_buf. This was SVN commit r3428. --- source/lib/adts.h | 57 ++- source/lib/allocators.cpp | 154 +++++--- source/lib/allocators.h | 106 +++--- source/lib/res/file/archive.cpp | 14 +- source/lib/res/file/archive.h | 2 +- source/lib/res/file/compression.cpp | 3 +- source/lib/res/file/compression.h | 1 + source/lib/res/file/file.cpp | 1 + source/lib/res/file/file_cache.cpp | 458 +++++++++++++++--------- source/lib/res/file/file_cache.h | 28 +- source/lib/res/file/file_internal.h | 4 +- source/lib/res/file/file_io.cpp | 180 +++++----- source/lib/res/file/file_io.h | 3 +- source/lib/res/file/file_stats.cpp | 21 +- source/lib/res/file/file_stats.h | 9 +- source/lib/res/file/vfs.cpp | 13 +- source/lib/res/file/vfs_optimizer.cpp | 485 ++++++++++++++++---------- source/lib/res/file/vfs_optimizer.h | 32 +- source/lib/res/file/zip.cpp | 103 ++++-- source/lib/res/file/zip.h | 7 + source/lib/res/graphics/tex.cpp | 49 ++- 21 files changed, 1117 insertions(+), 613 deletions(-) diff --git a/source/lib/adts.h b/source/lib/adts.h index e081734e39..4ccbd596ba 100755 --- a/source/lib/adts.h +++ b/source/lib/adts.h @@ -230,7 +230,19 @@ public: debug_assert(ret.second); // must not already be in map } - T retrieve(Key key, size_t* psize = 0) + // remove the entry identified by . expected usage is to check + // if present and determine size via retrieve(), so no need to + // do anything else here. + // useful for invalidating single cache entries. + void remove(Key key) + { + map.erase(key); + } + + // if there is no entry for in the cache, return 0 with + // psize unchanged. otherwise, return its item and + // optionally pass back its size. + T retrieve(Key key, size_t* psize = 0, bool refill_credit = true) { CacheMapIt it = map.find(key); if(it == map.end()) @@ -238,22 +250,54 @@ public: CacheEntry& entry = it->second; if(psize) *psize = entry.size; -// increase credit + + if(refill_credit) + { + // Landlord algorithm calls for credit to be reset to anything + // between its current value and the cost. + const float gain = 0.75f; // restore most credit + entry.credit = gain*entry.cost + (1.0f-gain)*entry.credit; + } + return entry.item; } + // remove the least valuable item and optionally indicate + // how big it was (useful for statistics). T remove_least_valuable(size_t* psize = 0) { CacheMapIt it; -again: // until we find someone to evict + // one iteration ought to suffice to evict someone due to + // definition of min_density, but we provide for repeating + // in case of floating-point imprecision. + // (goto vs. 
loop avoids nesting and emphasizes rarity) +again: - // foreach entry: decrease credit and evict if <= 0 + // find minimum credit density (needed for charge step) + float min_density = 1e10; // = \delta in [Young02] for( it = map.begin(); it != map.end(); ++it) { CacheEntry& entry = it->second; - // found someone we can evict + const float density = entry.credit / entry.size; + min_density = MIN(density, min_density); + } + + // .. charge everyone rent (proportional to min_density and size) + for( it = map.begin(); it != map.end(); ++it) + { + CacheEntry& entry = it->second; + entry.credit -= min_density * entry.size; + + // evict immediately if credit is exhausted + // (note: Landlord algorithm calls for 'any subset' of + // these items to be evicted. since we need to return + // information about the item, we can only discard one.) + // + // this means every call will end up charging more than + // intended, but we compensate by resetting credit + // fairly high upon cache hit. if(entry.credit <= 0.0f) { T item = entry.item; @@ -264,8 +308,7 @@ again: // until we find someone to evict } } - // none were evicted -// charge rent + // none were evicted - do it all again. goto again; } diff --git a/source/lib/allocators.cpp b/source/lib/allocators.cpp index 7afbc8cde1..a547e3d36b 100644 --- a/source/lib/allocators.cpp +++ b/source/lib/allocators.cpp @@ -364,6 +364,7 @@ LibError da_append(DynArray* da, const void* data, size_t size) // - doesn't preallocate the entire pool; // - returns sequential addresses. + // "freelist" is a pointer to the first unused element (0 if there are none); // its memory holds a pointer to the next free one in list. @@ -386,7 +387,8 @@ static void* freelist_pop(void** pfreelist) } -static const size_t POOL_CHUNK = 4*KiB; +// elements returned are aligned to this many bytes: +static const size_t ALIGN = 8; // ready
<p>
for use. is the upper limit [bytes] on @@ -396,15 +398,10 @@ static const size_t POOL_CHUNK = 4*KiB; // (which cannot be freed individually); // otherwise, it specifies the number of bytes that will be // returned by pool_alloc (whose size parameter is then ignored). -// in the latter case, size must at least be enough for a pointer -// (due to freelist implementation). LibError pool_create(Pool* p, size_t max_size, size_t el_size) { - if(el_size != 0 && el_size < sizeof(void*)) - WARN_RETURN(ERR_INVALID_PARAM); - + p->el_size = round_up(el_size, ALIGN); RETURN_ERR(da_alloc(&p->da, max_size)); - p->el_size = el_size; return ERR_OK; } @@ -446,7 +443,7 @@ void* pool_alloc(Pool* p, size_t size) { // if pool allows variable sizes, go with the size parameter, // otherwise the pool el_size setting. - const size_t el_size = p->el_size? p->el_size : size; + const size_t el_size = p->el_size? p->el_size : round_up(size, ALIGN); // note: this can never happen in pools with variable-sized elements // because they disallow pool_free. @@ -470,17 +467,19 @@ have_el: } -// make available for reuse in the given pool. +// make available for reuse in the given Pool. // -// this is not allowed if the pool was set up for variable-size elements. -// (copying with fragmentation would defeat the point of a pool - simplicity) -// we could allow this, but instead warn and bail to make sure it -// never happens inadvertently (leaking memory in the pool). +// this is not allowed if created for variable-size elements. +// rationale: avoids having to pass el_size here and compare with size when +// allocating; also prevents fragmentation and leaking memory. void pool_free(Pool* p, void* el) { + // only allowed to free items if we were initialized with + // fixed el_size. (this avoids having to pass el_size here and + // check if requested_size matches that when allocating) if(p->el_size == 0) { - debug_warn("pool is set up for variable-size items"); + debug_warn("cannot free variable-size items"); return; } @@ -506,9 +505,8 @@ void pool_free_all(Pool* p) //----------------------------------------------------------------------------- // design goals: -// - variable-sized allocations; -// - no reuse of allocations, can only free all at once; -// - no init necessary; +// - fixed- XOR variable-sized blocks; +// - allow freeing individual blocks if they are all fixed-size; // - never relocates; // - no fixed limit. @@ -518,46 +516,41 @@ void pool_free_all(Pool* p) // basically a combination of region and heap, where frees go to the heap and // allocs exhaust that memory first and otherwise use the region. -// must be constant and power-of-2 to allow fast modulo. -const size_t BUCKET_SIZE = 4*KiB; +// power-of-2 isn't required; value is arbitrary. +const size_t BUCKET_SIZE = 4000; -// allocate bytes of memory from the given Bucket object. -// must initially be zeroed (e.g. by defining it as static data). -void* bucket_alloc(Bucket* b, size_t size) +// ready for use. +// +// can be 0 to allow variable-sized allocations +// (which cannot be freed individually); +// otherwise, it specifies the number of bytes that will be +// returned by bucket_alloc (whose size parameter is then ignored). +LibError bucket_create(Bucket* b, size_t el_size) { - // would overflow a bucket - if(size > BUCKET_SIZE-sizeof(u8*)) + b->freelist = 0; + b->el_size = round_up(el_size, ALIGN); + + // note: allocating here avoids the is-this-the-first-time check + // in bucket_alloc, which speeds things up. 
+ b->bucket = (u8*)malloc(BUCKET_SIZE); + if(!b->bucket) { - debug_warn("size doesn't fit in a bucket"); - return 0; + // cause next bucket_alloc to retry the allocation + b->pos = BUCKET_SIZE; + b->num_buckets = 0; + return ERR_NO_MEM; } - // make sure the next item will be aligned - size = round_up(size, 8); - - // if there's not enough space left or no bucket yet (first call), - // close it and allocate another. - if(b->pos+size > BUCKET_SIZE || !b->bucket) - { - u8* bucket = (u8*)malloc(BUCKET_SIZE); - if(!bucket) - return 0; - *(u8**)bucket = b->bucket; - b->bucket = bucket; - // skip bucket list field and align to 8 bytes (note: malloc already - // aligns to at least 8 bytes, so don't take b->bucket into account) - b->pos = round_up(sizeof(u8*), 8); - b->num_buckets++; - } - - void* ret = b->bucket+b->pos; - b->pos += size; - return ret; + *(u8**)b->bucket = 0; // terminate list + b->pos = round_up(sizeof(u8*), ALIGN); + b->num_buckets = 1; + return ERR_OK; } -// free all allocations that ensued from the given Bucket. -void bucket_free_all(Bucket* b) +// free all memory that ensued from . +// future alloc and free calls on this Bucket will fail. +void bucket_destroy(Bucket* b) { while(b->bucket) { @@ -568,6 +561,69 @@ void bucket_free_all(Bucket* b) } debug_assert(b->num_buckets == 0); + + // poison pill: cause subsequent alloc and free to fail + b->freelist = 0; + b->el_size = BUCKET_SIZE; +} + + +// return an entry from the bucket, or 0 if another would have to be +// allocated and there isn't enough memory to do so. +// exhausts the freelist before returning new entries to improve locality. +// +// if the bucket was set up with fixed-size elements, is ignored; +// otherwise, bytes are allocated. +void* bucket_alloc(Bucket* b, size_t size) +{ + size_t el_size = b->el_size? b->el_size : round_up(size, ALIGN); + // must fit in a bucket + debug_assert(el_size <= BUCKET_SIZE-sizeof(u8*)); + + // try to satisfy alloc from freelist + void* el = freelist_pop(&b->freelist); + if(el) + return el; + + // if there's not enough space left, close current bucket and + // allocate another. + if(b->pos+el_size > BUCKET_SIZE) + { + u8* bucket = (u8*)malloc(BUCKET_SIZE); + if(!bucket) + return 0; + *(u8**)bucket = b->bucket; + b->bucket = bucket; + // skip bucket list field and align (note: malloc already + // aligns to at least 8 bytes, so don't take b->bucket into account) + b->pos = round_up(sizeof(u8*), ALIGN); + b->num_buckets++; + } + + void* ret = b->bucket+b->pos; + b->pos += el_size; + return ret; +} + + +// make available for reuse in . +// +// this is not allowed if created for variable-size elements. +// rationale: avoids having to pass el_size here and compare with size when +// allocating; also prevents fragmentation and leaking memory. +void bucket_free(Bucket* b, void* el) +{ + if(b->el_size == 0) + { + debug_warn("cannot free variable-size items"); + return; + } + + freelist_push(&b->freelist, el); + + // note: checking if was actually allocated from is difficult: + // it may not be in the currently open bucket, so we'd have to + // iterate over the list - too much work. } diff --git a/source/lib/allocators.h b/source/lib/allocators.h index 0d82eb716b..6cf19073a5 100644 --- a/source/lib/allocators.h +++ b/source/lib/allocators.h @@ -164,8 +164,6 @@ const size_t POOL_VARIABLE_ALLOCS = 0; // (which cannot be freed individually); // otherwise, it specifies the number of bytes that will be // returned by pool_alloc (whose size parameter is then ignored). 
-// in the latter case, size must at least be enough for a pointer -// (due to freelist implementation). extern LibError pool_create(Pool* p, size_t max_size, size_t el_size); // free all memory that ensued from
<p>
. all elements are made unusable @@ -185,12 +183,11 @@ extern bool pool_contains(Pool* p, void* el); // otherwise, bytes are allocated. extern void* pool_alloc(Pool* p, size_t size); -// make available for reuse in the given pool. +// make available for reuse in the given Pool. // -// this is not allowed if the pool was set up for variable-size elements. -// (copying with fragmentation would defeat the point of a pool - simplicity) -// we could allow this, but instead warn and bail to make sure it -// never happens inadvertently (leaking memory in the pool). +// this is not allowed if created for variable-size elements. +// rationale: avoids having to pass el_size here and compare with size when +// allocating; also prevents fragmentation and leaking memory. extern void pool_free(Pool* p, void* el); // "free" all allocations that ensued from the given Pool. @@ -204,40 +201,61 @@ extern void pool_free_all(Pool* p); // // design goals: -// - variable-sized allocations; -// - no reuse of allocations, can only free all at once; -// - no init necessary; +// - fixed- XOR variable-sized blocks; +// - allow freeing individual blocks if they are all fixed-size; // - never relocates; // - no fixed limit. // note: this type of allocator is called "region-based" in the literature. // see "Reconsidering Custom Memory Allocation" (Berger, Zorn, McKinley). -// if individual elements must be freeable, consider "reaps": +// if individual variable-size elements must be freeable, consider "reaps": // basically a combination of region and heap, where frees go to the heap and // allocs exhaust that memory first and otherwise use the region. // opaque! do not read/write any fields! struct Bucket { - // currently open bucket. must be initialized to 0. + // currently open bucket. u8* bucket; // offset of free space at end of current bucket (i.e. # bytes in use). - // must be initialized to 0. size_t pos; - // records # buckets allocated; used to check if the list of them - // isn't corrupted. must be initialized to 0. - uint num_buckets; + void* freelist; + + size_t el_size : 16; + + // records # buckets allocated; verifies the list of buckets is correct. + uint num_buckets : 16; }; -// allocate bytes of memory from the given Bucket object. -// must initially be zeroed (e.g. by defining it as static data). +// ready for use. +// +// can be 0 to allow variable-sized allocations +// (which cannot be freed individually); +// otherwise, it specifies the number of bytes that will be +// returned by bucket_alloc (whose size parameter is then ignored). +extern LibError bucket_create(Bucket* b, size_t el_size); + +// free all memory that ensued from . +// future alloc and free calls on this Bucket will fail. +extern void bucket_destroy(Bucket* b); + +// return an entry from the bucket, or 0 if another would have to be +// allocated and there isn't enough memory to do so. +// exhausts the freelist before returning new entries to improve locality. +// +// if the bucket was set up with fixed-size elements, is ignored; +// otherwise, bytes are allocated. extern void* bucket_alloc(Bucket* b, size_t size); -// free all allocations that ensued from the given Bucket. -extern void bucket_free_all(Bucket* b); +// make available for reuse in . +// +// this is not allowed if created for variable-size elements. +// rationale: avoids having to pass el_size here and compare with size when +// allocating; also prevents fragmentation and leaking memory. 
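// example usage (fixed-size elements; <Node> is a hypothetical element type
// used only for illustration, and error handling is abbreviated):
//
//   struct Node { int data; Node* next; };
//   Bucket b;
//   if(bucket_create(&b, sizeof(Node)) == ERR_OK)
//   {
//       Node* n = (Node*)bucket_alloc(&b, 0); // size param ignored (fixed el_size)
//       // .. use <n>
//       bucket_free(&b, n);     // element goes onto the freelist for reuse
//       bucket_destroy(&b);     // releases all buckets at once
//   }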
+extern void bucket_free(Bucket* b, void* el); // @@ -267,25 +285,29 @@ extern void matrix_free(void** matrix); // overrun protection // -// this class wraps an arbitrary object in DynArray memory and can detect -// inadvertent writes to it. this is useful for tracking down memory overruns. -// -// the basic idea is to require users to request access to the object and -// notify us when done; memory access permission is temporarily granted. -// (similar in principle to Software Transaction Memory). -// -// since this is quite slow, the protection is disabled unless -// CONFIG_OVERRUN_PROTECTION == 1; this avoids having to remove the -// wrapper code in release builds and re-write when looking for overruns. -// -// example usage: -// OverrunProtector your_class_wrapper; -// .. -// your_class* yc = your_class_wrapper.get(); -// if(!yc) abort(); // not enough memory to allocate a your_class instance -// // access/write to -// your_class_wrapper.lock(); // disallow further access -// .. +/* +OverrunProtector wraps an arbitrary object in DynArray memory and can detect +inadvertent writes to it. this is useful for tracking down memory overruns. + +the basic idea is to require users to request access to the object and +notify us when done; memory access permission is temporarily granted. +(similar in principle to Software Transaction Memory). + +since this is quite slow, the protection is disabled unless +CONFIG_OVERRUN_PROTECTION == 1; this avoids having to remove the +wrapper code in release builds and re-write when looking for overruns. + +example usage: +OverrunProtector your_class_wrapper; +.. +your_class* yc = your_class_wrapper.get(); // unlock, make ready for use +if(!yc) // your_class_wrapper's one-time alloc of a your_class- + abort(); // instance had failed - can't continue. +doSomethingWith(yc); // read/write access +your_class_wrapper.lock(); // disallow further access until next .get() +.. +*/ + template class OverrunProtector { DynArray da; @@ -322,11 +344,9 @@ private: void init() { - const size_t size = 4096; - cassert(sizeof(T) <= size); - if(da_alloc(&da, size) < 0) + if(da_alloc(&da, sizeof(T)) < 0) goto fail; - if(da_set_size(&da, size) < 0) + if(da_set_size(&da, sizeof(T)) < 0) goto fail; #include "nommgr.h" diff --git a/source/lib/res/file/archive.cpp b/source/lib/res/file/archive.cpp index 2fef7da27a..d9cbb8b337 100644 --- a/source/lib/res/file/archive.cpp +++ b/source/lib/res/file/archive.cpp @@ -281,11 +281,7 @@ LibError afile_open(const Handle ha, const char* fn, uintptr_t memento, int flag // => need to copy ArchiveEntry fields into AFile. RETURN_ERR(archive_get_file_info(a, atom_fn, memento, ent)); - if(ent->flags & ZIP_LFH_FIXUP_NEEDED) - { - zip_fixup_lfh(&a->f, ent); - ent->flags &= ~ZIP_LFH_FIXUP_NEEDED; - } + zip_fixup_lfh(&a->f, ent); uintptr_t ctx = 0; // slight optimization: do not allocate context if not compressed @@ -517,8 +513,14 @@ ssize_t afile_read(AFile* af, off_t ofs, size_t size, FileIOBuf* pbuf, FileIOCB H_DEREF(af->ha, Archive, a); if(!is_compressed(af)) + { + bool we_allocated = (pbuf != FILE_BUF_TEMP) && (*pbuf == FILE_BUF_ALLOC); // no need to set last_cofs - only checked if compressed. 
- return file_io(&a->f, af->ofs+ofs, size, pbuf, cb, cb_ctx); + RETURN_ERR(file_io(&a->f, af->ofs+ofs, size, pbuf, cb, cb_ctx)); + if(we_allocated) + (void)file_buf_set_real_fn(*pbuf, af->fc.atom_fn); + return ERR_OK; + } debug_assert(af->ctx != 0); diff --git a/source/lib/res/file/archive.h b/source/lib/res/file/archive.h index 79a7ec17ae..7967f179e3 100644 --- a/source/lib/res/file/archive.h +++ b/source/lib/res/file/archive.h @@ -174,7 +174,7 @@ struct ArchiveEntry time_t mtime; // used in IO - off_t ofs; // bit 31 set if fixup needed + off_t ofs; off_t csize; CompressionMethod method; diff --git a/source/lib/res/file/compression.cpp b/source/lib/res/file/compression.cpp index e411aed2f0..9c43658b2e 100644 --- a/source/lib/res/file/compression.cpp +++ b/source/lib/res/file/compression.cpp @@ -392,9 +392,9 @@ uintptr_t comp_alloc(ContextType type, CompressionMethod method) return 0; Compressor* c; +#include "nommgr.h" // protect placement new and free() from macros switch(method) { -#include "nommgr.h" #ifndef NO_ZLIB case CM_DEFLATE: cassert(sizeof(ZLibCompressor) <= MAX_COMPRESSOR_SIZE); @@ -407,6 +407,7 @@ uintptr_t comp_alloc(ContextType type, CompressionMethod method) return 0; #include "mmgr.h" } +#include "mmgr.h" c->init(); return (uintptr_t)c; diff --git a/source/lib/res/file/compression.h b/source/lib/res/file/compression.h index 59d7630963..b9b5e0b1b2 100644 --- a/source/lib/res/file/compression.h +++ b/source/lib/res/file/compression.h @@ -26,6 +26,7 @@ extern ssize_t comp_feed(uintptr_t ctx, const void* in, size_t in_size); extern LibError comp_finish(uintptr_t ctx, void** out, size_t* out_size); +extern LibError comp_reset(uintptr_t ctx); extern void comp_free(uintptr_t ctx); #endif // #ifndef COMPRESSION_H__ diff --git a/source/lib/res/file/file.cpp b/source/lib/res/file/file.cpp index 700e77d0f8..6a36a2c600 100755 --- a/source/lib/res/file/file.cpp +++ b/source/lib/res/file/file.cpp @@ -824,6 +824,7 @@ LibError file_init() { atom_init(); file_cache_init(); + file_io_init(); return ERR_OK; } diff --git a/source/lib/res/file/file_cache.cpp b/source/lib/res/file/file_cache.cpp index 1c9fd75f5f..49bb1e1b80 100644 --- a/source/lib/res/file/file_cache.cpp +++ b/source/lib/res/file/file_cache.cpp @@ -7,17 +7,214 @@ #include "lib/adts.h" #include "file_internal.h" -// strategy: -// policy: -// - allocation: use all available mem first, then look at freelist -// - freelist: good fit, address-ordered, always split -// - free: immediately coalesce -// mechanism: -// - coalesce: boundary tags in freed memory -// - freelist: 2**n segregated doubly-linked, address-ordered +//----------------------------------------------------------------------------- + +// block cache: intended to cache raw compressed data, since files aren't aligned +// in the archive; alignment code would force a read of the whole block, +// which would be a slowdown unless we keep them in memory. +// +// keep out of async code (although extra work for sync: must not issue/wait +// if was cached) to simplify things. disadvantage: problems if same block +// is issued twice, before the first call completes (via wait_io). +// that won't happen though unless we have threaded file_ios => +// rare enough not to worry about performance. +// +// since sync code allocates the (temp) buffer, it's guaranteed +// to remain valid. 
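// usage sketch of the block cache API (hypothetical caller; <atom_fn> and
// <ofs> are assumed to be supplied by it):
//
//   BlockId id = block_cache_make_id(atom_fn, ofs);
//   void* block = block_cache_find(id);     // hit: also takes a reference
//   if(block)
//   {
//       // .. decompress/copy out of <block>, then drop the reference:
//       block_cache_release(id);
//   }
//   else
//   {
//       block = block_cache_alloc(id);      // recycles oldest block if full
//       // .. issue and wait for IO into <block>; once the data is valid:
//       block_cache_mark_completed(id);     // now visible to block_cache_find
//   }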
+// + +class BlockMgr +{ + static const size_t MAX_BLOCKS = 32; + enum BlockStatus + { + BS_PENDING, + BS_COMPLETE, + BS_INVALID + }; + struct Block + { + BlockId id; + void* mem; + BlockStatus status; + int refs; + + Block() {} // for RingBuf + Block(BlockId id_, void* mem_) + : id(id_), mem(mem_), status(BS_PENDING), refs(0) {} + }; + RingBuf blocks; + typedef RingBuf::iterator BlockIt; + + // use Pool to allocate mem for all blocks because it guarantees + // page alignment (required for IO) and obviates manually aligning. + Pool pool; + +public: + void init() + { + (void)pool_create(&pool, MAX_BLOCKS*FILE_BLOCK_SIZE, FILE_BLOCK_SIZE); + } + + void shutdown() + { + (void)pool_destroy(&pool); + } + + void* alloc(BlockId id) + { + if(blocks.size() == MAX_BLOCKS) + { + Block& b = blocks.front(); + // if this block is still locked, big trouble.. + // (someone forgot to free it and we can't reuse it) + debug_assert(b.status != BS_PENDING && b.refs == 0); + pool_free(&pool, b.mem); + blocks.pop_front(); + } + void* mem = pool_alloc(&pool, FILE_BLOCK_SIZE); // can't fail + blocks.push_back(Block(id, mem)); + return mem; + } + + void mark_completed(BlockId id) + { + for(BlockIt it = blocks.begin(); it != blocks.end(); ++it) + { + if(block_eq(it->id, id)) + it->status = BS_COMPLETE; + } + } + + void* find(BlockId id) + { + // linear search is ok, since we only keep a few blocks. + for(BlockIt it = blocks.begin(); it != blocks.end(); ++it) + { + if(block_eq(it->id, id) && it->status == BS_COMPLETE) + { + it->refs++; + return it->mem; + } + } + return 0; // not found + } + + void release(BlockId id) + { + for(BlockIt it = blocks.begin(); it != blocks.end(); ++it) + { + if(block_eq(it->id, id)) + { + it->refs--; + debug_assert(it->refs >= 0); + return; + } + } + debug_warn("release: block not found, but ought still to be in cache"); + } + + + void invalidate(const char* atom_fn) + { + for(BlockIt it = blocks.begin(); it != blocks.end(); ++it) + { + if(it->id.atom_fn == atom_fn) + { + if(it->refs) + debug_warn("invalidating block that is currently in-use"); + it->status = BS_INVALID; + } + } + } +}; +static BlockMgr block_mgr; + + +bool block_eq(BlockId b1, BlockId b2) +{ + return b1.atom_fn == b2.atom_fn && b1.block_num == b2.block_num; +} + +// create an id for use with the cache that uniquely identifies +// the block from the file starting at . +BlockId block_cache_make_id(const char* atom_fn, const off_t ofs) +{ + // is guaranteed to be unique (see file_make_unique_fn_copy). + // block_num should always fit in 32 bits (assuming maximum file size + // = 2^32 * FILE_BLOCK_SIZE ~= 2^48 -- plenty). we don't bother + // checking this. + const u32 block_num = (u32)(ofs / FILE_BLOCK_SIZE); + BlockId id = { atom_fn, block_num }; + return id; +} + +void* block_cache_alloc(BlockId id) +{ + return block_mgr.alloc(id); +} + +void block_cache_mark_completed(BlockId id) +{ + block_mgr.mark_completed(id); +} + +void* block_cache_find(BlockId id) +{ + return block_mgr.find(id); +} + +void block_cache_release(BlockId id) +{ + return block_mgr.release(id); +} + + +//----------------------------------------------------------------------------- + +// >= AIO_SECTOR_SIZE or else waio will have to realign. +// chosen as exactly 1 page: this allows write-protecting file buffers +// without worrying about their (non-page-aligned) borders. +// internal fragmentation is considerable but acceptable. +static const size_t BUF_ALIGN = 4*KiB; + +/* +CacheAllocator + +the biggest worry of a file cache is fragmentation. 
there are 2 +basic approaches to combat this: +1) 'defragment' periodically - move blocks around to increase + size of available 'holes'. +2) prevent fragmentation from occurring at all via + deliberate alloc/free policy. + +file_io returns cache blocks directly to the user (zero-copy IO), +so only currently unreferenced blocks can be moved (while holding a +lock, to boot). it is believed that this would severely hamper +defragmentation; we therefore go with the latter approach. + +basic insight is: fragmentation occurs when a block is freed whose +neighbors are not free (thus preventing coalescing). this can be +prevented by allocating objects of similar lifetimes together. +typical workloads (uniform access frequency) already show such behavior: +the Landlord cache manager evicts files in an LRU manner, which matches +the allocation policy. + +references: +"The Memory Fragmentation Problem - Solved?" (Johnstone and Wilson) +"Dynamic Storage Allocation - A Survey and Critical Review" (Johnstone and Wilson) + +policy: +- allocation: use all available mem first, then look at freelist +- freelist: good fit, address-ordered, always split blocks +- free: immediately coalesce +mechanism: +- coalesce: boundary tags in freed memory with magic value +- freelist: 2**n segregated doubly-linked, address-ordered +*/ class CacheAllocator { - static const size_t MAX_CACHE_SIZE = 64*MiB; + static const size_t MAX_CACHE_SIZE = 32*MiB; public: void init() @@ -34,27 +231,41 @@ public: void* alloc(size_t size) { - const size_t size_pa = round_up(size, AIO_SECTOR_SIZE); - - // use all available space first - void* p = pool_alloc(&pool, size_pa); - if(p) - return p; + const size_t size_pa = round_up(size, BUF_ALIGN); + void* p; // try to reuse a freed entry const uint size_class = size_class_of(size_pa); p = alloc_from_class(size_class, size_pa); if(p) - return p; + goto have_p; + + // grab more space from pool + p = pool_alloc(&pool, size_pa); + if(p) + goto have_p; + + // last resort: split a larger element p = alloc_from_larger_class(size_class, size_pa); if(p) - return p; + goto have_p; // failed - can no longer expand and nothing big enough was // found in freelists. // file cache will decide which elements are least valuable, // free() those and call us again. return 0; + +have_p: + // make sure range is writable + (void)mprotect(p, size_pa, PROT_READ|PROT_WRITE); + return p; + } + + void make_read_only(u8* p, size_t size) + { + const size_t size_pa = round_up(size, BUF_ALIGN); + (void)mprotect(p, size_pa, PROT_READ); } #include "nommgr.h" @@ -63,11 +274,11 @@ public: { if(!pool_contains(&pool, p)) { - debug_warn("not in arena"); + debug_warn("invalid pointer"); return; } - size_t size_pa = round_up(size, AIO_SECTOR_SIZE); + size_t size_pa = round_up(size, BUF_ALIGN); coalesce(p, size_pa); freelist_add(p, size_pa); } @@ -92,8 +303,8 @@ private: u32 magic1; u32 magic2; }; - // must be enough room to stash header+footer in the freed page. - cassert(AIO_SECTOR_SIZE >= 2*sizeof(FreePage)); + // must be enough room to stash 2 FreePage instances in the freed page. 
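// (illustrative layout of one freed region, as the boundary-tag scheme
// described above implies: a FreePage record is stashed at both the start
// and the end of the region so coalesce() can examine the preceding and
// following neighbors. the exact footer offset is an assumption - only the
// "room for 2 instances" requirement is verified below.)
//
//   | FreePage (header) | ................ | FreePage (footer) |
//   p                                        p + size_pa - sizeof(FreePage)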
+ cassert(BUF_ALIGN >= 2*sizeof(FreePage)); FreePage* freed_page_at(u8* p, size_t ofs) { @@ -105,7 +316,7 @@ private: FreePage* page = (FreePage*)p; if(page->magic1 != MAGIC1 || page->magic2 != MAGIC2) return 0; - debug_assert(page->size_pa % AIO_SECTOR_SIZE == 0); + debug_assert(page->size_pa % BUF_ALIGN == 0); return page; } @@ -275,19 +486,19 @@ public: extant_bufs.push_back(ExtantBuf(buf, size, atom_fn)); } - bool includes(FileIOBuf buf) + const char* get_owner_filename(FileIOBuf buf) { debug_assert(buf != 0); for(size_t i = 0; i < extant_bufs.size(); i++) { ExtantBuf& eb = extant_bufs[i]; if(matches(eb, buf)) - return true; + return eb.atom_fn; } - return false; + return 0; } - void find_and_remove(FileIOBuf buf, size_t* size) + void find_and_remove(FileIOBuf buf, size_t* size, const char** atom_fn) { debug_assert(buf != 0); for(size_t i = 0; i < extant_bufs.size(); i++) @@ -296,6 +507,7 @@ public: if(matches(eb, buf)) { *size = eb.size; + *atom_fn = eb.atom_fn; eb.buf = 0; eb.size = 0; eb.atom_fn = 0; @@ -356,7 +568,7 @@ FileIOBuf file_buf_alloc(size_t size, const char* atom_fn) extant_bufs.add(buf, size, atom_fn); - stats_buf_alloc(size, round_up(size, AIO_SECTOR_SIZE)); + stats_buf_alloc(size, round_up(size, BUF_ALIGN)); return buf; } @@ -395,38 +607,69 @@ LibError file_buf_free(FileIOBuf buf) if(!buf) return ERR_OK; - stats_buf_free(); + size_t size; const char* atom_fn; + extant_bufs.find_and_remove(buf, &size, &atom_fn); + + stats_buf_free(); + trace_notify_free(atom_fn); - size_t size; - extant_bufs.find_and_remove(buf, &size); return ERR_OK; } +// mark as belonging to the file . this is done after +// reading uncompressed data from archive: file_io.cpp must allocate the +// buffer, since only it knows how much padding is needed; however, +// archive.cpp knows the real filename (as opposed to that of the archive, +// which is what the file buffer is associated with). therefore, +// we fix up the filename afterwards. +LibError file_buf_set_real_fn(FileIOBuf buf, const char* atom_fn) +{ + // remove and reinsert into list instead of replacing atom_fn + // in-place for simplicity (speed isn't critical, since there + // should only be a few active bufs). + size_t size; const char* old_atom_fn; + extant_bufs.find_and_remove(buf, &size, &old_atom_fn); + extant_bufs.add(buf, size, atom_fn); + return ERR_OK; +} + + + + LibError file_cache_add(FileIOBuf buf, size_t size, const char* atom_fn) { // decide (based on flags) if buf is to be cached; set cost uint cost = 1; + cache_allocator.make_read_only((u8*)buf, size); file_cache.add(atom_fn, buf, size, cost); return ERR_OK; } -FileIOBuf file_cache_retrieve(const char* atom_fn, size_t* size) +FileIOBuf file_cache_find(const char* atom_fn, size_t* size) +{ + return file_cache.retrieve(atom_fn, size, false); +} + + +FileIOBuf file_cache_retrieve(const char* atom_fn, size_t* psize) { // note: do not query extant_bufs - reusing that doesn't make sense // (why would someone issue a second IO for the entire file while // still referencing the previous instance?) - return file_cache.retrieve(atom_fn, size); + FileIOBuf buf = file_cache.retrieve(atom_fn, psize); + + CacheRet cr = buf? CR_HIT : CR_MISS; + stats_cache(cr, *psize, atom_fn); + + return buf; } - - - /* a) FileIOBuf is opaque type with getter FileIOBuf buf; <--------------------- how to initialize?? 
@@ -459,147 +702,24 @@ file_buf_free and there are only a few active at a time ( < 10) -//----------------------------------------------------------------------------- - -// block cache: intended to cache raw compressed data, since files aren't aligned -// in the archive; alignment code would force a read of the whole block, -// which would be a slowdown unless we keep them in memory. -// -// keep out of async code (although extra work for sync: must not issue/wait -// if was cached) to simplify things. disadvantage: problems if same block -// is issued twice, before the first call completes (via wait_io). -// that won't happen though unless we have threaded file_ios => -// rare enough not to worry about performance. -// -// since sync code allocates the (temp) buffer, it's guaranteed -// to remain valid. -// - -class BlockMgr -{ - static const size_t MAX_BLOCKS = 32; - enum BlockStatus - { - BS_PENDING, - BS_COMPLETE, - BS_INVALID - }; - struct Block - { - BlockId id; - void* mem; - BlockStatus status; - - Block() {} // for RingBuf - Block(BlockId id_, void* mem_) - : id(id_), mem(mem_), status(BS_PENDING) {} - }; - RingBuf blocks; - typedef RingBuf::iterator BlockIt; - - // use Pool to allocate mem for all blocks because it guarantees - // page alignment (required for IO) and obviates manually aligning. - Pool pool; - -public: - void init() - { - (void)pool_create(&pool, MAX_BLOCKS*FILE_BLOCK_SIZE, FILE_BLOCK_SIZE); - } - - void shutdown() - { - (void)pool_destroy(&pool); - } - - void* alloc(BlockId id) - { - if(blocks.size() == MAX_BLOCKS) - { - Block& b = blocks.front(); - // if this block is still locked, big trouble.. - // (someone forgot to free it and we can't reuse it) - debug_assert(b.status != BS_PENDING); - pool_free(&pool, b.mem); - blocks.pop_front(); - } - void* mem = pool_alloc(&pool, FILE_BLOCK_SIZE); // can't fail - blocks.push_back(Block(id, mem)); - return mem; - } - - void mark_completed(BlockId id) - { - for(BlockIt it = blocks.begin(); it != blocks.end(); ++it) - { - if(it->id == id) - it->status = BS_COMPLETE; - } - } - - void* find(BlockId id) - { - // linear search is ok, since we only keep a few blocks. - for(BlockIt it = blocks.begin(); it != blocks.end(); ++it) - { - if(it->status == BS_COMPLETE && it->id == id) - return it->mem; - } - return 0; // not found - } - - void invalidate(const char* atom_fn) - { - for(BlockIt it = blocks.begin(); it != blocks.end(); ++it) - if((const char*)(it->id >> 32) == atom_fn) - it->status = BS_INVALID; - } -}; -static BlockMgr block_mgr; - - -// create an id for use with the cache that uniquely identifies -// the block from the file starting at (aligned). -BlockId block_cache_make_id(const char* atom_fn, const off_t ofs) -{ - cassert(sizeof(atom_fn) == 4); - // format: filename atom | block number - // 63 32 31 0 - // - // is guaranteed to be unique (see file_make_unique_fn_copy). - // - // block_num should always fit in 32 bits (assuming maximum file size - // = 2^32 * FILE_BLOCK_SIZE ~= 2^48 -- plenty). we don't bother - // checking this. - - const size_t block_num = ofs / FILE_BLOCK_SIZE; - return u64_from_u32((u32)(uintptr_t)atom_fn, (u32)block_num); -} - -void* block_cache_alloc(BlockId id) -{ - return block_mgr.alloc(id); -} - -void block_cache_mark_completed(BlockId id) -{ - block_mgr.mark_completed(id); -} - -void* block_cache_find(BlockId id) -{ - return block_mgr.find(id); -} - - -//----------------------------------------------------------------------------- // remove all blocks loaded from the file . 
used when reloading the file. LibError file_cache_invalidate(const char* P_fn) { const char* atom_fn = file_make_unique_fn_copy(P_fn, 0); + + // mark all blocks from the file as invalid block_mgr.invalidate(atom_fn); + // file was cached: remove it and free that memory + size_t size; + FileIOBuf cached_buf = file_cache.retrieve(atom_fn, &size); + if(cached_buf) + { + file_cache.remove(atom_fn); + cache_allocator.free((u8*)cached_buf, size); + } + return ERR_OK; } diff --git a/source/lib/res/file/file_cache.h b/source/lib/res/file/file_cache.h index 269ea7369b..d562643822 100644 --- a/source/lib/res/file/file_cache.h +++ b/source/lib/res/file/file_cache.h @@ -1,15 +1,14 @@ -extern LibError file_buf_get(FileIOBuf* pbuf, size_t size, - const char* atom_fn, bool is_write, FileIOCB cb); +struct BlockId +{ + const char* atom_fn; + u32 block_num; +}; -extern FileIOBuf file_cache_retrieve(const char* atom_fn, size_t* size); -extern LibError file_cache_add(FileIOBuf buf, size_t size, const char* atom_fn); - - -typedef u64 BlockId; +extern bool block_eq(BlockId b1, BlockId b2); // create an id for use with the cache that uniquely identifies -// the block from the file starting at (aligned). +// the block from the file starting at . extern BlockId block_cache_make_id(const char* atom_fn, const off_t ofs); extern void* block_cache_alloc(BlockId id); @@ -17,6 +16,19 @@ extern void* block_cache_alloc(BlockId id); extern void block_cache_mark_completed(BlockId id); extern void* block_cache_find(BlockId id); +extern void block_cache_release(BlockId id); + + + + +extern LibError file_buf_get(FileIOBuf* pbuf, size_t size, + const char* atom_fn, bool is_write, FileIOCB cb); + +extern LibError file_buf_set_real_fn(FileIOBuf buf, const char* atom_fn); + +extern FileIOBuf file_cache_find(const char* atom_fn, size_t* size); +extern FileIOBuf file_cache_retrieve(const char* atom_fn, size_t* size); +extern LibError file_cache_add(FileIOBuf buf, size_t size, const char* atom_fn); extern void file_cache_init(); diff --git a/source/lib/res/file/file_internal.h b/source/lib/res/file/file_internal.h index 2e7eed65eb..b1692ce68d 100644 --- a/source/lib/res/file/file_internal.h +++ b/source/lib/res/file/file_internal.h @@ -1,9 +1,9 @@ -#include "file_stats.h" - #include "file.h" #include "file_cache.h" #include "file_io.h" +#include "file_stats.h" // must come after file and file_cache + #include "compression.h" #include "zip.h" #include "archive.h" diff --git a/source/lib/res/file/file_io.cpp b/source/lib/res/file/file_io.cpp index 0c98b803f8..1d02ef5158 100644 --- a/source/lib/res/file/file_io.cpp +++ b/source/lib/res/file/file_io.cpp @@ -13,88 +13,90 @@ // async I/O //----------------------------------------------------------------------------- +// we don't do any caching or alignment here - this is just a thin AIO wrapper. // rationale: -// asynchronous IO routines don't cache; they're just a thin AIO wrapper. -// it's taken care of by file_io, which splits transfers into blocks -// and keeps temp buffers in memory (not user-allocated, because they -// might pull the rug out from under us at any time). -// -// caching here would be more complicated: would have to handle "forwarding", -// i.e. recognizing that the desired block has been issued, but isn't yet -// complete. file_io also knows more about whether a block should be cached. +// - aligning the transfer isn't possible here since we have no control +// over the buffer, i.e. we cannot read more data than requested. +// instead, this is done in file_io. 
+// - transfer sizes here are arbitrary (viz. not block-aligned); +// that means the cache would have to handle this or also split them up +// into blocks, which is redundant (already done by file_io). +// - if caching here, we'd also have to handle "forwarding" (i.e. +// desired block has been issued but isn't yet complete). again, it +// is easier to let the synchronous file_io manager handle this. +// - finally, file_io knows more about whether the block should be cached +// (e.g. whether another block request will follow), but we don't +// currently make use of this. // // disadvantages: // - streamed data will always be read from disk. no problem, because // such data (e.g. music, long speech) is unlikely to be used again soon. -// - prefetching (issuing the next few blocks from an archive during idle -// time, so that future out-of-order reads don't need to seek) isn't -// possible in the background (unless via thread, but that's discouraged). -// the utility is questionable, though: how to prefetch so as not to delay -// real IOs? can't determine "idle time" without completion notification, -// which is hard. -// we could get the same effect by bridging small gaps in file_io, -// and rearranging files in the archive in order of access. +// - prefetching (issuing the next few blocks from archive/file during +// idle time to satisfy potential future IOs) requires extra buffers; +// this is a bit more complicated than just using the cache as storage. - -static Pool aiocb_pool; - -static inline void aiocb_pool_init() +// FileIO must reference an aiocb, which is used to pass IO params to the OS. +// unfortunately it is 144 bytes on Linux - too much to put in FileIO, +// since that is stored in a 'resource control block' (see h_mgr.h). +// we therefore allocate dynamically, but via suballocator to avoid +// hitting the heap on every IO. +class AiocbAllocator { - (void)pool_create(&aiocb_pool, 32*sizeof(aiocb), sizeof(aiocb)); -} - -static inline void aiocb_pool_shutdown() -{ - (void)pool_destroy(&aiocb_pool); -} - -static inline aiocb* aiocb_pool_alloc() -{ - ONCE(aiocb_pool_init()); - return (aiocb*)pool_alloc(&aiocb_pool, 0); -} - -static inline void aiocb_pool_free(void* cb) -{ - pool_free(&aiocb_pool, cb); -} + Pool pool; +public: + void init() + { + (void)pool_create(&pool, 32*sizeof(aiocb), sizeof(aiocb)); + } + void shutdown() + { + (void)pool_destroy(&pool); + } + aiocb* alloc() + { + return (aiocb*)pool_alloc(&pool, 0); + } + // weird name to avoid trouble with mem tracker macros + // (renaming is less annoying than #include "nommgr.h") + void free_(void* cb) + { + pool_free(&pool, cb); + } +}; +static AiocbAllocator aiocb_allocator; // starts transferring to/from the given buffer. // no attempt is made at aligning or padding the transfer. LibError file_io_issue(File* f, off_t ofs, size_t size, void* p, FileIo* io) { + debug_printf("FILE| issue ofs=%d size=%d\n", ofs, size); + // zero output param in case we fail below. memset(io, 0, sizeof(FileIo)); - debug_printf("FILE| issue ofs=%d size=%d\n", ofs, size); - - - // // check params - // - CHECK_FILE(f); - if(!size || !p || !io) WARN_RETURN(ERR_INVALID_PARAM); - const bool is_write = (f->fc.flags & FILE_WRITE) != 0; - - // cut off at EOF. 
- if(!is_write) - { - const off_t bytes_left = f->fc.size - ofs; - if(bytes_left < 0) - WARN_RETURN(ERR_EOF); - size = MIN(size, (size_t)bytes_left); - size = round_up(size, AIO_SECTOR_SIZE); - } + // note: cutting off at EOF is necessary to avoid transfer errors, + // but makes size no longer sector-aligned, which would force + // waio to realign (slow). we want to pad back to sector boundaries + // afterwards (to avoid realignment), but that is not possible here + // since we have no control over the buffer (there might not be + // enough room in it). hence, do cut-off in IOManager. + // + // example: 200-byte file. IOManager issues 16KB chunks; that is way + // beyond EOF, so ReadFile fails. limiting size to 200 bytes works, + // but causes waio to pad the transfer and use align buffer (slow). + // rounding up to 512 bytes avoids realignment and does not fail + // (apparently since NTFS files are sector-padded anyway?) // (we can't store the whole aiocb directly - glibc's version is // 144 bytes large) - aiocb* cb = aiocb_pool_alloc(); + aiocb* cb = aiocb_allocator.alloc(); io->cb = cb; if(!cb) return ERR_NO_MEM; @@ -153,10 +155,12 @@ LibError file_io_wait(FileIo* io, void*& p, size_t& size) const ssize_t bytes_transferred = aio_return(cb); debug_printf("FILE| bytes_transferred=%d aio_nbytes=%u\n", bytes_transferred, cb->aio_nbytes); -// disabled: we no longer clamp to EOF -// // (size was clipped to EOF in file_io => this is an actual IO error) -// if(bytes_transferred < (ssize_t)cb->aio_nbytes) -// return ERR_IO; + // see if actual transfer count matches requested size. + // note: most callers clamp to EOF but round back up to sector size + // (see explanation in file_io_issue). since we're not sure what + // the exact sector size is (only waio knows), we can only warn of + // too small transfer counts (not return error). + debug_assert(bytes_transferred >= (ssize_t)(cb->aio_nbytes-AIO_SECTOR_SIZE)); p = (void*)cb->aio_buf; // cast from volatile void* size = bytes_transferred; @@ -167,7 +171,7 @@ LibError file_io_wait(FileIo* io, void*& p, size_t& size) LibError file_io_discard(FileIo* io) { memset(io->cb, 0, sizeof(aiocb)); // prevent further use. - aiocb_pool_free(io->cb); + aiocb_allocator.free_(io->cb); io->cb = 0; return ERR_OK; } @@ -239,7 +243,7 @@ class IOManager const void* cached_block; - u64 block_id; + BlockId block_id; // needed so that we can add the block to the cache when // its IO is complete. if we add it when issuing, we'd no longer be // thread-safe: someone else might find it in the cache before its @@ -257,7 +261,7 @@ class IOManager { memset(&io, 0, sizeof(io)); temp_buf = 0; - block_id = 0; + memset(&block_id, 0, sizeof(block_id)); cached_block = 0; } }; @@ -350,6 +354,16 @@ class IOManager ofs_misalign = start_ofs % FILE_BLOCK_SIZE; start_ofs -= (off_t)ofs_misalign; size = round_up(ofs_misalign + user_size, FILE_BLOCK_SIZE); + + // but cut off at EOF (necessary to prevent IO error). + const off_t bytes_left = f->fc.size - start_ofs; + if(bytes_left < 0) + WARN_RETURN(ERR_EOF); + size = MIN(size, (size_t)bytes_left); + + // and round back up to sector size. + // see rationale in file_io_issue. + size = round_up(size, AIO_SECTOR_SIZE); } RETURN_ERR(file_buf_get(pbuf, size, f->fc.atom_fn, is_write, cb)); @@ -360,16 +374,11 @@ class IOManager void issue(IOSlot& slot) { const off_t ofs = start_ofs+(off_t)total_issued; - size_t issue_size; - - // write: must not issue beyond end of data. 
- if(is_write) - issue_size = MIN(FILE_BLOCK_SIZE, size - total_issued); - // read: always grab whole blocks so we can put them in the cache. - // any excess data (can only be within first or last block) is - // discarded in wait(). - else - issue_size = FILE_BLOCK_SIZE; + // for both reads and writes, do not issue beyond end of file/data + const size_t issue_size = MIN(FILE_BLOCK_SIZE, size - total_issued); +// try to grab whole blocks (so we can put them in the cache). +// any excess data (can only be within first or last) is +// discarded in wait(). // check if in cache slot.block_id = block_cache_make_id(f->fc.atom_fn, ofs); @@ -441,11 +450,14 @@ class IOManager // pending transfers to complete. } - if(!slot.cached_block) + if(slot.cached_block) + block_cache_release(slot.block_id); + else + { file_io_discard(&slot.io); - - if(!slot.cached_block && pbuf == FILE_BUF_TEMP) - block_cache_mark_completed(slot.block_id); + if(pbuf == FILE_BUF_TEMP) + block_cache_mark_completed(slot.block_id); + } } @@ -539,9 +551,11 @@ ssize_t file_io(File* f, off_t ofs, size_t size, FileIOBuf* pbuf, FileIOCB cb, uintptr_t ctx) // optional { debug_printf("FILE| io: size=%u ofs=%u fn=%s\n", size, ofs, f->fc.atom_fn); - CHECK_FILE(f); + // note: do not update stats/trace here: this includes Zip IOs, + // which shouldn't be reported. + IOManager mgr(f, ofs, size, pbuf, cb, ctx); return mgr.run(); } @@ -549,7 +563,13 @@ ssize_t file_io(File* f, off_t ofs, size_t size, FileIOBuf* pbuf, +void file_io_init() +{ + aiocb_allocator.init(); +} + + void file_io_shutdown() { - aiocb_pool_shutdown(); + aiocb_allocator.shutdown(); } diff --git a/source/lib/res/file/file_io.h b/source/lib/res/file/file_io.h index 158f2a49df..dd3ca45616 100644 --- a/source/lib/res/file/file_io.h +++ b/source/lib/res/file/file_io.h @@ -1 +1,2 @@ -extern void file_io_shutdown(); +extern void file_io_init(); +extern void file_io_shutdown(); diff --git a/source/lib/res/file/file_stats.cpp b/source/lib/res/file/file_stats.cpp index 41feaf777a..8894c67818 100644 --- a/source/lib/res/file/file_stats.cpp +++ b/source/lib/res/file/file_stats.cpp @@ -48,7 +48,8 @@ static uint user_ios; static double user_io_size_total; static double io_actual_size_total[FI_MAX_IDX][2]; static double io_elapsed_time[FI_MAX_IDX][2]; -static BlockId io_disk_head_pos; +static double io_process_time_total; +static BlockId io_disk_pos_cur; static uint io_seeks; // file_cache @@ -148,13 +149,19 @@ void stats_user_io(size_t user_size) user_io_size_total += user_size; } -void stats_io_start(FileIOImplentation fi, FileOp fo, size_t actual_size, double* start_time_storage) +void stats_io_start(FileIOImplentation fi, FileOp fo, size_t actual_size, + BlockId disk_pos, double* start_time_storage) { debug_assert(fi < FI_MAX_IDX); debug_assert(fo == FO_READ || FO_WRITE); io_actual_size_total[fi][fo] += actual_size; + if(disk_pos.atom_fn != io_disk_pos_cur.atom_fn || + disk_pos.block_num != io_disk_pos_cur.block_num+1) + io_seeks++; + io_disk_pos_cur = disk_pos; + timer_start(start_time_storage); } @@ -166,6 +173,16 @@ void stats_io_finish(FileIOImplentation fi, FileOp fo, double* start_time_storag io_elapsed_time[fi][fo] += timer_reset(start_time_storage); } +void stats_cb_start() +{ + timer_start(); +} + +void stats_cb_finish() +{ + io_process_time_total += timer_reset(); +} + // // file_cache diff --git a/source/lib/res/file/file_stats.h b/source/lib/res/file/file_stats.h index bb5100d131..328ef8d504 100644 --- a/source/lib/res/file/file_stats.h +++ 
b/source/lib/res/file/file_stats.h @@ -28,8 +28,11 @@ extern void stats_buf_free(); // file_io extern void stats_user_io(size_t user_size); -extern void stats_io_start(FileIOImplentation fi, FileOp fo, size_t actual_size, double* start_time_storage); +extern void stats_io_start(FileIOImplentation fi, FileOp fo, + size_t actual_size, BlockId disk_pos, double* start_time_storage); extern void stats_io_finish(FileIOImplentation fi, FileOp fo, double* start_time_storage); +extern void stats_cb_start(); +extern void stats_cb_finish(); // file_cache extern void stats_cache(CacheRet cr, size_t size, const char* atom_fn); @@ -49,8 +52,10 @@ extern void stats_dump(); #define stats_buf_alloc(user_size, padded_size) #define stats_buf_free() #define stats_user_io(user_size) -#define stats_io_start(fi, fo, actual_size, start_time_storage) +#define stats_io_start(fi, fo, actual_size, disk_pos, start_time_storage) #define stats_io_finish(fi, fo, start_time_storage) +#define stats_cb_start() +#define stats_cb_finish() #define stats_cache(cr, size, atom_fn) #define stats_block_cache(cr) #define stats_dump() diff --git a/source/lib/res/file/vfs.cpp b/source/lib/res/file/vfs.cpp index 05e94aa3f2..a90027e477 100755 --- a/source/lib/res/file/vfs.cpp +++ b/source/lib/res/file/vfs.cpp @@ -326,8 +326,6 @@ static LibError VFile_reload(VFile* vf, const char* V_path, Handle) if(x_is_open(&vf->xf)) return ERR_OK; - trace_add(V_path); - TFile* tf; uint lf = (flags & FILE_WRITE)? LF_CREATE_MISSING : 0; LibError err = tree_lookup(V_path, &tf, lf); @@ -425,6 +423,10 @@ ssize_t vfs_io(const Handle hf, const size_t size, FileIOBuf* pbuf, debug_printf("VFS| io: size=%d\n", size); H_DEREF(hf, VFile, vf); + FileCommon* fc = &vf->xf.u.fc; + + stats_user_io(size); + trace_notify_load(fc->atom_fn, fc->flags); off_t ofs = vf->ofs; vf->ofs += (off_t)size; @@ -445,7 +447,8 @@ LibError vfs_load(const char* V_fn, FileIOBuf& buf, size_t& size, uint flags /* buf = file_cache_retrieve(atom_fn, &size); if(buf) { - stats_cache(CR_HIT, size, atom_fn); + stats_user_io(size); + trace_notify_load(atom_fn, flags); return ERR_OK; } @@ -459,10 +462,6 @@ LibError vfs_load(const char* V_fn, FileIOBuf& buf, size_t& size, uint flags /* H_DEREF(hf, VFile, vf); size = x_size(&vf->xf); - // only now can we report misses, since we need to know the size for - // statistics purposes. that means vfs_load on nonexistant files will - // not show up in cache misses, which is fine. - stats_cache(CR_MISS, size, atom_fn); buf = FILE_BUF_ALLOC; ssize_t nread = vfs_io(hf, size, &buf); diff --git a/source/lib/res/file/vfs_optimizer.cpp b/source/lib/res/file/vfs_optimizer.cpp index 071677137f..41e225f2a5 100644 --- a/source/lib/res/file/vfs_optimizer.cpp +++ b/source/lib/res/file/vfs_optimizer.cpp @@ -4,114 +4,58 @@ #include "lib/timer.h" #include "file_internal.h" - -enum TraceState -{ - TS_UNINITIALIZED, - TS_DISABLED, - TS_ENABLED, - TS_ERROR, - TS_SHUTDOWN -}; -static uintptr_t trace_state = TS_UNINITIALIZED; // values from TraceState; type for use with CAS - - +static uintptr_t trace_initialized; // set via CAS static Pool trace_pool; - +// call at before using trace_pool. no-op if called more than once. 
+static inline void trace_init() +{ + if(CAS(&trace_initialized, 0, 1)) + (void)pool_create(&trace_pool, 4*MiB, sizeof(TraceEntry)); +} void trace_shutdown() { - if(trace_state == TS_DISABLED || trace_state == TS_ENABLED) - { + if(CAS(&trace_initialized, 1, 2)) (void)pool_destroy(&trace_pool); - trace_state = TS_SHUTDOWN; - } } + +static bool trace_enabled; + void trace_enable(bool want_enabled) { - if(trace_state == TS_SHUTDOWN || trace_state == TS_ERROR) - WARN_ERR_RETURN(ERR_LOGIC); - - if(CAS(&trace_state, TS_UNINITIALIZED, TS_ERROR)) - { - if(pool_create(&trace_pool, 4*MiB, sizeof(TraceEntry)) < 0) - return; // leave trace_state set to TS_ERROR - } - - trace_state = want_enabled? TS_ENABLED : TS_DISABLED; + trace_enabled = want_enabled; } -void trace_add(const char* P_fn) +static void trace_add(TraceOp op, const char* P_fn, uint flags = 0, double timestamp = 0.0) { - if(trace_state == TS_DISABLED || trace_state == TS_UNINITIALIZED) + trace_init(); + if(!trace_enabled) return; - if(trace_state != TS_ENABLED) - WARN_ERR_RETURN(ERR_LOGIC); + + if(timestamp == 0.0) + timestamp = get_time(); TraceEntry* t = (TraceEntry*)pool_alloc(&trace_pool, 0); if(!t) return; - t->timestamp = get_time(); + t->timestamp = timestamp; t->atom_fn = file_make_unique_fn_copy(P_fn, 0); + t->op = op; + t->flags = flags; } -LibError trace_write_to_file(const char* trace_filename) +void trace_notify_load(const char* P_fn, uint flags) { - if(trace_state == TS_UNINITIALIZED) - return ERR_OK; - if(trace_state != TS_ENABLED && trace_state != TS_DISABLED) - WARN_RETURN(ERR_LOGIC); - - char N_fn[PATH_MAX]; - RETURN_ERR(file_make_full_native_path(trace_filename, N_fn)); - FILE* f = fopen(N_fn, "wt"); - if(!f) - return ERR_FILE_ACCESS; - - Trace t; - trace_get(&t); - for(size_t i = 0; i < t.num_ents; i++) - fprintf(f, "%#010f: %s\n", t.ents[i].timestamp, t.ents[i].atom_fn); - - (void)fclose(f); - return ERR_OK; + trace_add(TO_LOAD, P_fn, flags); } - -LibError trace_load_from_file(const char* trace_filename) +void trace_notify_free(const char* P_fn) { - char N_fn[PATH_MAX]; - RETURN_ERR(file_make_full_native_path(trace_filename, N_fn)); - FILE* f = fopen(N_fn, "rt"); - if(!f) - return ERR_FILE_NOT_FOUND; - - // parse lines and stuff them in trace_pool - // (as if they had been trace_add-ed; replaces any existing data) - pool_free_all(&trace_pool); - char fmt[20]; - snprintf(fmt, ARRAY_SIZE(fmt), "%%f: %%%ds\n", PATH_MAX); - for(;;) - { - double timestamp; char P_path[PATH_MAX]; - int ret = fscanf(f, fmt, ×tamp, P_path); - if(ret == EOF) - break; - if(ret != 2) - debug_warn("invalid line in trace file"); - - TraceEntry* ent = (TraceEntry*)pool_alloc(&trace_pool, 0); - debug_assert(ent != 0); // was written to file from same pool => must fit - ent->timestamp = timestamp; - ent->atom_fn = file_make_unique_fn_copy(P_path, 0); - } - - fclose(f); - return ERR_OK; + trace_add(TO_FREE, P_fn); } @@ -121,123 +65,314 @@ void trace_get(Trace* t) t->num_ents = (uint)(trace_pool.da.pos / sizeof(TraceEntry)); } - -/////////////////////////////////////////////////////////////////////////////// - - - - -#if 0 - -struct FileList +LibError trace_write_to_file(const char* trace_filename) { - const char* atom_fns; - size_t num_files; + char N_fn[PATH_MAX]; + RETURN_ERR(file_make_full_native_path(trace_filename, N_fn)); + FILE* f = fopen(N_fn, "wt"); + if(!f) + WARN_RETURN(ERR_FILE_ACCESS); + + Trace t; + trace_get(&t); + const TraceEntry* ent = t.ents; + for(size_t i = 0; i < t.num_ents; i++, ent++) + { + char opcode = '?'; + switch(ent->op) 
+ { + case TO_LOAD: opcode = 'L'; break; + case TO_FREE: opcode = 'F'; break; + default: debug_warn("invalid TraceOp"); + } + + if(ent->op == TO_LOAD) + fprintf(f, "%#010f: %c %s %d\n", ent->timestamp, opcode, ent->atom_fn, ent->flags); + else + { + debug_assert(ent->op == TO_FREE); + fprintf(f, "%#010f: %c %s\n", ent->timestamp, opcode, ent->atom_fn); + } + } + + (void)fclose(f); + return ERR_OK; +} + + +LibError trace_read_from_file(const char* trace_filename, Trace* t) +{ + char N_fn[PATH_MAX]; + RETURN_ERR(file_make_full_native_path(trace_filename, N_fn)); + FILE* f = fopen(N_fn, "rt"); + if(!f) + WARN_RETURN(ERR_FILE_NOT_FOUND); + + // parse lines and stuff them in trace_pool + // (as if they had been trace_add-ed; replaces any existing data) + pool_free_all(&trace_pool); + char fmt[20]; + snprintf(fmt, ARRAY_SIZE(fmt), "%%f: %%c %%%ds %%02x\n", PATH_MAX); + for(;;) + { + double timestamp; char opcode; char P_path[PATH_MAX]; + uint flags = 0; // optional + int ret = fscanf(f, fmt, ×tamp, &opcode, P_path); + if(ret == EOF) + break; + if(ret != 3 && ret != 4) + debug_warn("invalid line in trace file"); + + TraceOp op = TO_LOAD; // default in case file is garbled + switch(opcode) + { + case 'L': op = TO_LOAD; break; + case 'F': op = TO_FREE; break; + default: debug_warn("invalid TraceOp"); + } + + trace_add(op, P_path, flags, timestamp); + } + + fclose(f); + + trace_get(t); + return ERR_OK; +} + + +enum SimulateFlags +{ + SF_SYNC_TO_TIMESTAMP = 1 }; -static LibError filelist_build(Trace* t, FileList* fl) +LibError trace_simulate(const char* trace_filename, uint flags) { -} + // prevent the actions we carry out below from generating + // trace_add-s. + trace_enabled = false; -static LibError filelist_get(FileList* fl, uint i, const char* path) -{ - return ERR_DIR_END; -} + Trace t; + RETURN_ERR(trace_read_from_file(trace_filename, &t)); + const double start_time = get_time(); + const double first_timestamp = t.ents[0].timestamp; + const TraceEntry* ent = t.ents; + for(uint i = 0; i < t.num_ents; i++, ent++) + { + // wait until time for next entry if caller requested this + if(flags & SF_SYNC_TO_TIMESTAMP) + { + while(get_time()-start_time < ent->timestamp-first_timestamp) + { + // busy-wait (don't sleep - can skew results) + } + } -static LibError compress_cb(uintptr_t cb_ctx, const void* block, size_t size, size_t* bytes_processed) -{ - uintptr_t ctx = cb_ctx; - - *bytes_processed = comp_feed(ctx, block, size); - return INFO_CB_CONTINUE; -} - -static LibError read_and_compress_file(uintptr_t ctx, ZipEntry* ze) -{ - const char* fn = ze->path; - - struct stat s; - RETURN_ERR(file_stat(fn, &s)); - const size_t ucsize = s.st_size; - - RETURN_ERR(comp_reset(ctx)); - RETURN_ERR(comp_alloc_output(ctx, ucsize)); - - File f; - RETURN_ERR(file_open(fn, 0, &f)); - FileIOBuf buf = FILE_BUF_ALLOC; - uintptr_t cb_ctx = ctx; - ssize_t cbytes_output = file_io(&f, 0, ucsize, &buf, compress_cb, cb_ctx); - (void)file_close(&f); - - void* cdata; size_t csize; - RETURN_ERR(comp_finish(ctx, &cdata, &csize)); - debug_assert(cbytes_output <= csize); - - RETURN_ERR(cbytes_output); - -// decide if it was better compressed or not - - ze->ucsize = ucsize; - ze->mtime = s.st_mtime; - ze->method = CM_DEFLATE; - ze->csize = csize; - ze->cdata = cdata; - - zip_archive_add(&za, &ze); + // carry out this entry's operation + FileIOBuf buf; size_t size; + switch(ent->op) + { + case TO_LOAD: + (void)vfs_load(ent->atom_fn, buf, size, ent->flags); + break; + case TO_FREE: + buf = file_cache_find(ent->atom_fn, &size); + 
-static void build_optimized_archive(const char* trace_file, const char* zip_filename)
+
+//-----------------------------------------------------------------------------
+
+struct FileList
+{
+    const char** atom_fns;
+    size_t num_files;
+    size_t i;
+};
+
+
+static LibError filelist_build(Trace* t, FileList* fl)
+{
+    // count # files
+    fl->num_files = 0;
+    for(size_t i = 0; i < t->num_ents; i++)
+        if(t->ents[i].op == TO_LOAD)
+            fl->num_files++;
+
+    fl->atom_fns = new const char*[fl->num_files];
+
+    size_t ti = 0;
+    for(size_t i = 0; i < fl->num_files; i++)
+    {
+        // find next trace entry that is a load (must exist)
+        while(t->ents[ti].op != TO_LOAD)
+            ti++;
+        fl->atom_fns[i] = t->ents[ti++].atom_fn;
+    }
+
+    fl->i = 0;
+    return ERR_OK;
+}
+
+
+static const char* filelist_get_next(FileList* fl)
+{
+    if(fl->i == fl->num_files)
+        return 0;
+    return fl->atom_fns[fl->i++];
+}
+
+
+//-----------------------------------------------------------------------------
+
+static inline bool file_type_is_uncompressible(const char* fn)
+{
+    const char* ext = strrchr(fn, '.');
+    // no extension? bail; don't waste time attempting to compress it.
+    if(!ext)
+        return true;
+
+    // this is a selection of file types that are certainly not
+    // further compressible. we need not include every type under the sun -
+    // this is only a slight optimization that avoids wasting time
+    // compressing files. the real decision as to cmethod is made based
+    // on attained compression ratio.
+    static const char* uncompressible_exts[] =
+    {
+        "zip", "rar",
+        "jpg", "jpeg", "png",
+        "ogg", "mp3"
+    };
+
+    for(uint i = 0; i < ARRAY_SIZE(uncompressible_exts); i++)
+    {
+        if(!stricmp(ext+1, uncompressible_exts[i]))
+            return true;
+    }
+
+    return false;
+}
+
+
+struct CompressParams
+{
+    bool attempt_compress;
+    uintptr_t ctx;
+};
+
+static LibError compress_cb(uintptr_t cb_ctx, const void* block, size_t size, size_t* bytes_processed)
+{
+    const CompressParams* p = (const CompressParams*)cb_ctx;
+
+    // comp_feed already makes note of total #bytes fed, and we need
+    // vfs_io to return the uc size (to check if all data was read).
+    *bytes_processed = size;
+
+    if(p->attempt_compress)
+        (void)comp_feed(p->ctx, block, size);
+    return INFO_CB_CONTINUE;
+}
+
+
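compress_cb above and lfh_copier_cb in zip.cpp follow the same io-callback contract: the callback is handed each block as it arrives, reports how much of it it consumed via bytes_processed, and returns INFO_CB_CONTINUE to keep the transfer going. A minimal sketch of that contract (byte_count_cb is hypothetical and not part of this patch):

    static LibError byte_count_cb(uintptr_t cb_ctx, const void* UNUSED(block), size_t size, size_t* bytes_processed)
    {
        size_t* total = (size_t*)cb_ctx;    // caller passes &total as cb_ctx
        *total += size;
        *bytes_processed = size;            // as in compress_cb: "consume" the whole block
        return INFO_CB_CONTINUE;            // keep feeding us subsequent blocks
    }
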
+static LibError read_and_compress_file(const char* atom_fn, uintptr_t ctx,
+    ArchiveEntry& ent, void*& file_contents, FileIOBuf& buf)    // out
+{
+    struct stat s;
+    RETURN_ERR(file_stat(atom_fn, &s));
+    const size_t ucsize = s.st_size;
+
+    const bool attempt_compress = !file_type_is_uncompressible(atom_fn);
+    if(attempt_compress)
+    {
+        RETURN_ERR(comp_reset(ctx));
+        RETURN_ERR(comp_alloc_output(ctx, ucsize));
+    }
+
+    // read file into newly allocated buffer. if attempt_compress, also
+    // compress the file into another buffer while waiting for IOs.
+    Handle hf = vfs_open(atom_fn, 0);
+    RETURN_ERR(hf);
+    buf = FILE_BUF_ALLOC;
+    const CompressParams params = { attempt_compress, ctx };
+    ssize_t ucsize_read = vfs_io(hf, ucsize, &buf, compress_cb, (uintptr_t)&params);
+    debug_assert(ucsize_read == (ssize_t)ucsize);
+    (void)vfs_close(hf);
+
+    // if we compressed the file trial-wise, check results and
+    // decide whether to store as such or not (based on compression ratio)
+    bool store_compressed = false;
+    void* cdata = 0; size_t csize = 0;
+    if(attempt_compress)
+    {
+        RETURN_ERR(comp_finish(ctx, &cdata, &csize));
+
+        const float ratio = (float)ucsize / csize;
+        const ssize_t bytes_saved = (ssize_t)ucsize - (ssize_t)csize;
+        if(ratio > 1.05f && bytes_saved > 200)
+            store_compressed = true;
+    }
+
+    // store file info
+    ent.ucsize = (off_t)ucsize;
+    ent.mtime = s.st_mtime;
+    // .. ent.ofs is set by zip_archive_add_file
+    ent.flags = 0;
+    ent.atom_fn = atom_fn;
+    if(store_compressed)
+    {
+        ent.method = CM_DEFLATE;
+        ent.csize = (off_t)csize;
+        file_contents = cdata;
+    }
+    else
+    {
+        ent.method = CM_NONE;
+        ent.csize = (off_t)ucsize;
+        file_contents = (void*)buf;
+    }
+
+    // note: no need to free cdata - it is owned by the
+    // compression context and can be reused.
+
+    return ERR_OK;
+}
+
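The accept/reject rule above reads: keep the compressed form only if it is at least ~5% smaller and saves more than 200 bytes, so tiny files and already-compressed data fall back to CM_NONE. Pulled out for illustration (should_store_compressed is hypothetical; the logic lives inline above):

    static bool should_store_compressed(size_t ucsize, size_t csize)
    {
        const float ratio = (float)ucsize / csize;
        const ssize_t bytes_saved = (ssize_t)ucsize - (ssize_t)csize;
        // e.g. 100000 -> 90000 B: ratio 1.11, 10000 B saved => store compressed;
        //         500 ->   450 B: ratio 1.11, but only 50 B saved => store as-is.
        return ratio > 1.05f && bytes_saved > 200;
    }
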
+static LibError build_optimized_archive(const char* trace_filename, const char* zip_filename)
 {
     FileList fl;
     {
         Trace t;
-        RETURN_ERR(trace_load_from_file(trace_filename, &t));
-        filelist_build(&t, &fl);
+        RETURN_ERR(trace_read_from_file(trace_filename, &t));
+        RETURN_ERR(filelist_build(&t, &fl));
     }
 
-    ZipArchive za;
-    zip_archive_create(zip_filename, &za);
-
-    uintptr_t ctx = comp_alloc();
-    uint trace_i = 0;
-    uint queued_files = 0, committed_files = 0;
+    ZipArchive* za;
+    RETURN_ERR(zip_archive_create(zip_filename, &za));
+    uintptr_t ctx = comp_alloc(CT_COMPRESSION, CM_DEFLATE);
 
     for(;;)
     {
-
-/*
-document: zlib layer is ok to allocate. caller shouldnt do so from a pool:
-    when the next file is going to be loaded and decompressed but our pool is full,
-    we need to wait for the archive write to finish and mark pool as reclaimed.
-    this is better done with heap; also, memory isn't bottleneck for readqueue size
-*/
-
-        ZipEntry ze;    // TODO: QUEUE
-        const int max_readqueue_depth = 1;
-        for(uint i = 0; i < max_readqueue_depth; i++)
-        {
-            LibError ret = trace_get_next_file(trace, trace_i, ze.path);
-            if(ret == ERR_DIR_END)
-                break;
-
-            WARN_ERR(read_and_compress_file(ctx, &ze));
-            queued_files++;
-        }
-
-        if(committed_files == queued_files)
+        const char* atom_fn = filelist_get_next(&fl);
+        if(!atom_fn)
             break;
-        zip_archive_add(&za, &ze);
-        committed_files++;
+
+        ArchiveEntry ent; void* file_contents; FileIOBuf buf;
+        if(read_and_compress_file(atom_fn, ctx, ent, file_contents, buf) == ERR_OK)
+        {
+            (void)zip_archive_add_file(za, &ent, file_contents);
+            (void)file_buf_free(buf);
+        }
     }
 
-    comp_free(ctx);
-
-    zip_archive_finish(&za);
+    comp_free(ctx);
+    (void)zip_archive_finish(za);
+    return ERR_OK;
 }
-#endif
diff --git a/source/lib/res/file/vfs_optimizer.h b/source/lib/res/file/vfs_optimizer.h
index c8889b1278..ac19a48d35 100644
--- a/source/lib/res/file/vfs_optimizer.h
+++ b/source/lib/res/file/vfs_optimizer.h
@@ -2,24 +2,42 @@
 #define VFS_OPTIMIZER_H__
 
 extern void trace_enable(bool want_enabled);
-extern void trace_add(const char* P_fn);
+extern void trace_shutdown();
 
-extern LibError trace_write_to_file(const char* trace_filename);
-extern LibError trace_read_from_file(const char* trace_filename);
+extern void trace_notify_load(const char* P_fn, uint flags);
+extern void trace_notify_free(const char* P_fn);
 
+// TraceEntry operation type.
+// note: rather than only a list of accessed files, we also need to
+// know the application's behavior WRT caching (e.g. when it releases
+// cached buffers). this is necessary so that our simulation can
+// yield the same results.
+enum TraceOp
+{
+    TO_LOAD,
+    TO_FREE
+};
+
+// stores one event that is relevant for file IO / caching.
+//
+// size-optimized a bit since these are all kept in memory
+// (to prevent trace file writes from affecting other IOs)
 struct TraceEntry
 {
-    double timestamp;
-    const char* atom_fn;
+    double timestamp;       // returned by get_time before operation starts
+    const char* atom_fn;    // path+name of affected file
+    uint op : 8;            // operation - see TraceOp
+    uint flags : 24;        // misc, e.g. file_io flags.
 };
 
 struct Trace
 {
     const TraceEntry* ents;
-    uint num_ents;
+    size_t num_ents;
 };
 
 extern void trace_get(Trace* t);
 
-extern void trace_shutdown();
+extern LibError trace_write_to_file(const char* trace_filename);
+extern LibError trace_read_from_file(const char* trace_filename, Trace* t);
 
 #endif  // #ifndef VFS_OPTIMIZER_H__
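To make the "size-optimized" comment concrete: op and flags share a single 32-bit uint via bit-fields, so on a typical 32-bit build each entry is 8 (double) + 4 (pointer) + 4 = 16 bytes. A purely illustrative C++03-style compile-time check, valid only under that layout assumption:

    // fails to compile if the 16-byte packing assumption is violated
    typedef int TraceEntry_is_16_bytes[sizeof(TraceEntry) == 16 ? 1 : -1];
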
diff --git a/source/lib/res/file/zip.cpp b/source/lib/res/file/zip.cpp
index a6793ec220..b061ee27a2 100755
--- a/source/lib/res/file/zip.cpp
+++ b/source/lib/res/file/zip.cpp
@@ -266,28 +266,62 @@ static LibError za_extract_cdfh(const CDFH* cdfh,
 }
 
 
+// this code grabs an LFH struct from file block(s) that are
+// passed to the callback. usually, one call copies the whole thing,
+// but the LFH may straddle a block boundary.
+//
+// rationale: this allows using temp buffers for zip_fixup_lfh,
+// which avoids involving the file buffer manager and thus
+// unclutters the trace and cache contents.
+struct LFH_Copier
+{
+    u8* lfh_dst;
+    size_t lfh_bytes_remaining;
+};
+
+static LibError lfh_copier_cb(uintptr_t ctx, const void* block, size_t size, size_t* bytes_processed)
+{
+    LFH_Copier* p = (LFH_Copier*)ctx;
+
-// find corresponding LFH, needed to calculate file offset
-// (its extra field may not match that reported by CDFH!).
+    debug_assert(size <= p->lfh_bytes_remaining);
+    memcpy2(p->lfh_dst, block, size);
+    p->lfh_dst += size;
+    p->lfh_bytes_remaining -= size;
+
+    *bytes_processed = size;
+    return INFO_CB_CONTINUE;
+}
+
+// ensures ent->ofs points to the actual file contents; it is initially
+// the offset of the LFH. we cannot use CDFH filename and extra field
+// lengths to skip past LFH since that may not mirror CDFH (has happened).
+//
+// this is called at file-open time instead of while mounting to
+// reduce seeks: since reading the file will typically follow, the
+// block cache entirely absorbs the IO cost.
 void zip_fixup_lfh(File* f, ArchiveEntry* ent)
 {
-    // improbable that this will be in cache - if this file had already
-    // been read, it would have been fixed up. only in cache if this
-    // file is in the same block as a previously read file (i.e. both small)
-    FileIOBuf buf = FILE_BUF_ALLOC;
-    file_io(f, ent->ofs, LFH_SIZE, &buf);
-    const LFH* lfh = (const LFH*)buf;
+    // already fixed up - done.
+    if(!(ent->flags & ZIP_LFH_FIXUP_NEEDED))
+        return;
 
-    debug_assert(lfh->magic == lfh_magic);
-    const size_t fn_len = read_le16(&lfh->fn_len);
-    const size_t e_len = read_le16(&lfh->e_len);
+    // performance note: this ends up reading one file block, which is
+    // only in the block cache if the file starts in the same block as a
+    // previously read file (i.e. both are small).
+    LFH lfh;
+    LFH_Copier params = { (u8*)&lfh, sizeof(LFH) };
+    ssize_t ret = file_io(f, ent->ofs, LFH_SIZE, FILE_BUF_TEMP, lfh_copier_cb, (uintptr_t)&params);
+    debug_assert(ret == sizeof(LFH));
+
+    debug_assert(lfh.magic == lfh_magic);
+    const size_t fn_len = read_le16(&lfh.fn_len);
+    const size_t e_len = read_le16(&lfh.e_len);
 
     ent->ofs += (off_t)(LFH_SIZE + fn_len + e_len);
     // LFH doesn't have a comment field!
 
-    file_buf_free(buf);
+    ent->flags &= ~ZIP_LFH_FIXUP_NEEDED;
 }
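A sketch of how a reader benefits from the fixup (read_archived_file_contents is hypothetical; the real call sites live elsewhere in the archive code): assuming the standard 30-byte fixed LFH, a 9-character filename and no extra field, ent->ofs advances by 39 and can then be handed straight to file_io.

    static ssize_t read_archived_file_contents(File* f, ArchiveEntry* ent, FileIOBuf* pbuf)
    {
        zip_fixup_lfh(f, ent);  // no-op if ZIP_LFH_FIXUP_NEEDED is already clear
        // ent->ofs now points past LFH + filename + extra field,
        // i.e. at the stored data itself (csize bytes of it).
        return file_io(f, ent->ofs, ent->csize, pbuf);
    }
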
@@ -393,21 +427,24 @@ struct ZipArchive
     uint cd_entries;
 };
 
-struct ZipEntry
-{
-    char path[PATH_MAX];
-    size_t ucsize;
-    time_t mtime;
-    ZipCompressionMethod method;
-    size_t csize;
-    const void* cdata;
-};
+// we don't want to expose ZipArchive to callers, so
+// allocate the storage here and return opaque pointer.
+static SingleAllocator za_mgr;
 
-LibError zip_archive_create(const char* zip_filename, ZipArchive* za)
+
+LibError zip_archive_create(const char* zip_filename, ZipArchive** pza)
 {
-    memset(za, 0, sizeof(*za));
-    RETURN_ERR(file_open(zip_filename, 0, &za->f));
-    RETURN_ERR(pool_create(&za->cdfhs, 10*MiB, 0));
+    // local za_copy simplifies things - if something fails, no cleanup is
+    // needed. upon success, we copy into the newly allocated real za.
+    ZipArchive za_copy;
+    RETURN_ERR(file_open(zip_filename, 0, &za_copy.f));
+    RETURN_ERR(pool_create(&za_copy.cdfhs, 10*MiB, 0));
+
+    ZipArchive* za = (ZipArchive*)za_mgr.alloc();
+    if(!za)
+        WARN_RETURN(ERR_NO_MEM);
+    *za = za_copy;
+    *pza = za;
     return ERR_OK;
 }
 
@@ -424,18 +461,14 @@ static inline u16 u16_from_size_t(size_t x)
     return (u16)(x & 0xFFFF);
 }
 
-
-LibError zip_archive_add(ZipArchive* za, const ZipEntry* ze)
+LibError zip_archive_add_file(ZipArchive* za, const ArchiveEntry* ze, void* file_contents)
 {
-    FileIOBuf buf;
-
-    const char* fn = ze->path;
+    const char* fn = ze->atom_fn;
     const size_t fn_len = strlen(fn);
     const size_t ucsize = ze->ucsize;
     const u32 fat_mtime = FAT_from_time_t(ze->mtime);
     const u16 method = (u16)ze->method;
     const size_t csize = ze->csize;
-    const void* cdata = ze->cdata;
 
     const off_t lfh_ofs = za->cur_file_size;
 
@@ -454,11 +487,12 @@ LibError zip_archive_add(ZipArchive* za, const ZipEntry* ze)
         u16_from_size_t(fn_len), 0  // e_len
     };
+    FileIOBuf buf;
     buf = (FileIOBuf)&lfh;
-    file_io(&za->f, lfh_ofs, lfh_size, &buf);
+    file_io(&za->f, lfh_ofs, lfh_size, &buf);
     buf = (FileIOBuf)fn;
-    file_io(&za->f, lfh_ofs+lfh_size, fn_len, &buf);
-    buf = (FileIOBuf)cdata;
+    file_io(&za->f, lfh_ofs+lfh_size, fn_len, &buf);
+    buf = (FileIOBuf)file_contents;
     file_io(&za->f, lfh_ofs+(off_t)(lfh_size+fn_len), csize, &buf);
 
     za->cur_file_size += (off_t)(lfh_size+fn_len+csize);
 
@@ -511,6 +545,7 @@ LibError zip_archive_finish(ZipArchive* za)
 
     (void)file_close(&za->f);
     (void)pool_destroy(&za->cdfhs);
+    za_mgr.free(za);
     return ERR_OK;
 }
diff --git a/source/lib/res/file/zip.h b/source/lib/res/file/zip.h
index e135c1135e..3d5286c94d 100755
--- a/source/lib/res/file/zip.h
+++ b/source/lib/res/file/zip.h
@@ -8,4 +8,11 @@ extern LibError zip_populate_archive(Archive* a, File* f);
 
 extern void zip_fixup_lfh(File* f, ArchiveEntry* ent);
 
+
+struct ZipArchive;
+extern LibError zip_archive_create(const char* zip_filename, ZipArchive** pza);
+extern LibError zip_archive_add_file(ZipArchive* za, const ArchiveEntry* ze, void* file_contents);
+extern LibError zip_archive_finish(ZipArchive* za);
+
+
 #endif  // #ifndef ZIP_H__
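A minimal usage sketch of the archive-writing API declared above (the wrapper function, the output name and the pre-filled ArchiveEntry are illustrative only):

    static LibError write_single_file_archive(const char* zip_filename,
        const ArchiveEntry* ent, void* file_contents)
    {
        ZipArchive* za;     // opaque; storage comes from zip_archive_create
        RETURN_ERR(zip_archive_create(zip_filename, &za));
        RETURN_ERR(zip_archive_add_file(za, ent, file_contents));
        return zip_archive_finish(za);  // finalizes, closes the file, frees za
    }
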
diff --git a/source/lib/res/graphics/tex.cpp b/source/lib/res/graphics/tex.cpp
index 5f303d3124..ba5cb34901 100755
--- a/source/lib/res/graphics/tex.cpp
+++ b/source/lib/res/graphics/tex.cpp
@@ -213,27 +213,30 @@ TIMER_ACCRUE(tc_plain_transform);
     if(!transforms)
         return ERR_OK;
 
+    // allocate copy of the image data.
+    // rationale: L1 cache is typically A2 => swapping in-place with a
+    // line buffer leads to thrashing. we'll assume the whole texture*2
+    // fits in cache, allocate a copy, and transfer directly from there.
+    //
+    // this is necessary even when not flipping because the initial Tex.hm
+    // (which is a FileIOBuf) is read-only.
+    Handle hm;
+    void* new_data = mem_alloc(data_size, 4*KiB, 0, &hm);
+    if(!new_data)
+        return ERR_NO_MEM;
+    memcpy2(new_data, data, data_size);
+
     // setup row source/destination pointers (simplifies outer loop)
-    u8* dst = data;
-    const u8* src = data;
+    u8* dst = (u8*)new_data;
+    const u8* src = (const u8*)new_data;
     const size_t pitch = w * bpp/8;
+    // .. avoid y*pitch multiply in row loop; instead, add row_ofs.
     ssize_t row_ofs = (ssize_t)pitch;
-    // avoid y*pitch multiply in row loop; instead, add row_ofs.
-    void* clone_data = 0;
 
+    // flipping rows (0,1,2 -> 2,1,0)
     if(transforms & TEX_ORIENTATION)
     {
-        // L1 cache is typically A2 => swapping in-place with a line buffer
-        // leads to thrashing. we'll assume the whole texture*2 fits in cache,
-        // allocate a copy, and transfer directly from there.
-        //
-        // note: we don't want to return a new buffer: the user assumes
-        // buffer address will remain unchanged.
-        clone_data = mem_alloc(data_size, 4*KiB);
-        if(!clone_data)
-            return ERR_NO_MEM;
-        memcpy2(clone_data, data, data_size);
-        src = (const u8*)clone_data+data_size-pitch;    // last row
+        src = (const u8*)data+data_size-pitch;  // last row
         row_ofs = -(ssize_t)pitch;
     }
 
@@ -280,8 +283,9 @@ TIMER_ACCRUE(tc_plain_transform);
         }
     }
 
-    if(clone_data)
-        (void)mem_free(clone_data);
+    mem_free_h(t->hm);
+    t->hm = hm;
+    t->ofs = 0;
 
     if(!(t->flags & TEX_MIPMAPS) && transforms & TEX_MIPMAPS)
     {
@@ -296,10 +300,11 @@ TIMER_ACCRUE(tc_plain_transform);
         const u8* mipmap_data = (const u8*)mem_alloc(mipmap_size, 4*KiB, 0, &hm);
         if(!mipmap_data)
             return ERR_NO_MEM;
-        CreateLevelData cld = { bpp/8, w, h, data, data_size };
+        CreateLevelData cld = { bpp/8, w, h, (const u8*)new_data, data_size };
         tex_util_foreach_mipmap(w, h, bpp, mipmap_data, 0, 1, create_level, &cld);
         mem_free_h(t->hm);
         t->hm = hm;
+        t->ofs = 0;
     }
 
     CHECK_TEX(t);
@@ -450,6 +455,12 @@ static LibError tex_load_impl(FileIOBuf file_, size_t file_size, Tex* t)
 }
 
 
+// MEM_DTOR -> file_buf_free adapter (used for mem_wrap-ping FileIOBuf)
+static void file_buf_dtor(void* p, size_t UNUSED(size), uintptr_t UNUSED(ctx))
+{
+    (void)file_buf_free((FileIOBuf)p);
+}
+
 // load the specified image from file into the given Tex object.
 // currently supports BMP, TGA, JPG, JP2, PNG, DDS.
 LibError tex_load(const char* fn, Tex* t)
 {
@@ -460,7 +471,7 @@ LibError tex_load(const char* fn, Tex* t)
     // must be protected against being accidentally free-d in that case.
     RETURN_ERR(vfs_load(fn, file, file_size));
 
-    Handle hm = mem_wrap((void*)file, file_size, 0, 0, 0, 0, 0, (void*)tex_load);
+    Handle hm = mem_wrap((void*)file, file_size, 0, 0, 0, file_buf_dtor, 0, (void*)tex_load);
     t->hm = hm;
     LibError ret = tex_load_impl(file, file_size, t);
     if(ret < 0)
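For context, a minimal sketch of the row loop those dst/src/row_ofs pointers feed (the real transform loop lives elsewhere in tex.cpp and may also convert pixel formats; this only illustrates how row_ofs lets one loop handle both orientations):

    // dst walks forward through new_data; src walks forward (+pitch) or
    // backward (-pitch) depending on whether rows are being flipped.
    // (when not flipping, src == dst and the copy stands in for the
    // actual per-pixel transform.)
    for(size_t y = 0; y < h; y++)
    {
        memcpy2(dst, src, pitch);
        dst += pitch;
        src += row_ofs;
    }
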