Work in progress: caching moved into the sync IO code (simplifies things).

This was SVN commit r1017.
This commit is contained in:
janwas 2004-08-18 02:12:52 +00:00
parent e2970992fd
commit d58e16e0ff
2 changed files with 270 additions and 325 deletions

View File

@ -509,21 +509,145 @@ int file_close(File* const f)
// starts transferring to/from the given buffer.
// no attempt is made at aligning or padding the transfer.
//
// f: file to transfer from/to; FILE_WRITE in f->flags selects a write,
//    otherwise a read is issued.
// ofs, size: byte range of the transfer. size is clipped so that the
//    request does not extend past f->size.
// p: source/destination buffer for the transfer.
// io: receives the I/O context (pointer to the aiocb allocated here).
// return value: 0 on success; ERR_EOF if ofs lies beyond f->size;
//    ERR_NO_MEM (presumably when the aiocb allocation fails - the guarding
//    condition is not visible in this view); otherwise the negative
//    lio_listio result.
int file_start_io(File* const f, const off_t ofs, size_t size, void* const p, FileIO* io)
// NOTE(review): %d is used for off_t and size_t arguments - confirm these
// are int-sized on all targets, or widen the format specifiers.
debug_out("file_start_io ofs=%d size=%d\n", ofs, size);
// check params
// NOTE(review): the action taken when this check fails is not visible here.
if(!size || !p || !io)
const bool is_write = (f->flags & FILE_WRITE) != 0;
// cut off at EOF.
// avoid min() due to type conversion warnings.
// NOTE(review): ofs == f->size gives bytes_left == 0, which is not treated
// as EOF; size is then clipped to 0 - confirm that is intended.
const off_t bytes_left = f->size - ofs;
if(bytes_left < 0)
debug_warn("file_start_io: EOF");
return ERR_EOF;
if((off_t)size > bytes_left)
size = (size_t)bytes_left;
// guaranteed to fit, since size was > bytes_left
// set the "I/O context", a pointer to the (newly allocated) aiocb.
// we can't store the whole aiocb in a struct - glibc's version is
// 144 bytes large. we don't currently need anything else (since this
// code is only a thin aio wrapper); if that changes, instead return a
// pointer to a struct containing the aiocb*.
// NOTE(review): malloc does not zero the aiocb; only the five fields
// assigned below are initialized - confirm the aio implementation does not
// require the remaining fields to be zero.
aiocb* cb = (aiocb*)malloc(sizeof(aiocb));
return ERR_NO_MEM;
*io = cb;
// send off async read/write request
cb->aio_lio_opcode = is_write? LIO_WRITE : LIO_READ;
cb->aio_buf = p;
cb->aio_fildes = f->fd;
cb->aio_offset = ofs;
cb->aio_nbytes = size;
// a 1-entry list submitted with LIO_NOWAIT and no sigevent: completion
// must be polled or waited for via the aiocb stored in *io.
int err = lio_listio(LIO_NOWAIT, &cb, 1, (struct sigevent*)0);
// NOTE(review): no release of cb is visible on this failure path, and *io
// still points to it - confirm the caller is expected to discard it.
if(err < 0)
return err;
return 0;
// indicates if the IO referenced by <io> has completed.
// io is interpreted as a pointer to an aiocb (the context that
// file_start_io stores in its <io> output parameter).
// return value: 0 if pending, 1 if complete, < 0 on error.
int file_io_complete(FileIO io)
aiocb* cb = (aiocb*)io;
// query the request's status; aio_return is not called here.
int ret = aio_error(cb);
// still in flight
if(ret == EINPROGRESS)
return 0;
// finished successfully
if(ret == 0)
return 1;
// any other status is reported as a generic failure (-1); the specific
// error code in ret is not passed on to the caller.
debug_warn("file_io_complete: unexpected aio_error return");
return -1;
// waits for the IO referenced by <io> to finish and reports its result.
// io: interpreted as a pointer to an aiocb.
// p, size: output; set to the request's buffer and the number of bytes
//    transferred on success, left at 0 on failure.
// return value: 0 on success; -1 if fewer bytes than requested
//    (cb->aio_nbytes) were transferred, which includes a failed transfer.
int file_wait_io(FileIO io, void*& p, size_t& size)
// NOTE(review): 'hio' is not a parameter of this function - looks like a
// leftover from a handle-based version; confirm this line compiles.
debug_out("file_wait_io: hio=%I64x\n", hio);
// zero output params in case something (e.g. H_DEREF) fails.
p = 0;
size = 0;
aiocb* cb = (aiocb*)io;
// wait for transfer to complete.
const aiocb** cbs = (const aiocb**)&cb; // pass in an "array"
// re-check the status after every wakeup, since aio_suspend may return
// before this particular request has finished.
while(aio_error(cb) == EINPROGRESS)
aio_suspend(cbs, 1, (timespec*)0); // wait indefinitely
// query number of bytes transferred (-1 if the transfer failed)
const ssize_t bytes_transferred = aio_return(cb);
// a short transfer is treated the same as a failed one.
if(bytes_transferred < (ssize_t)cb->aio_nbytes)
return -1;
p = (void*)cb->aio_buf; // cast from volatile void*
size = bytes_transferred;
return 0;
// invalidates the I/O context <io> by zeroing the aiocb it points to.
// return value: always 0 in the code visible here.
// NOTE(review): no release of the aiocb allocated by file_start_io is
// visible in this view - confirm it is freed somewhere, otherwise each
// request leaks one aiocb.
int file_discard_io(FileIO io)
memset(io, 0, sizeof(aiocb));
// discourage further use.
return 0;
// L3 cache: intended to cache raw compressed data, since files aren't aligned
// in the archive; alignment code would force a read of the whole block,
// which would be a slowdown unless we keep them in memory.
// it's a part of the async code (instead of the sync, block-splitting code)
// because if cached, the IO must not be issued. also, when waiting,
// we can return directly if in the cache.
// keep out of async code (although extra work for sync: must not issue/wait
// if was cached) to simplify things. disadvantage: problems if same block
// is issued twice, before the first call completes (via wait_io).
// that won't happen though unless we have threaded file_ios =>
// rare enough not to worry about performance.
// only allow caching for buffers allocated by async read code:
// 1) caller may pull the rug out from under us, freeing its buffer after
// calling file_discard_io. don't want to go to the trouble of getting
// the handle; even worse, buffer may be stack-allocated.
// since sync code allocates the (temp) buffer, it's guaranteed
// to remain valid.
// side effect: any async reads into buffers we allocate may be cached
// (if FILE_CACHE_BLOCK specified).
@ -554,280 +678,103 @@ static u64 block_make_id(const u32 fn_hash, const off_t ofs)
struct Block
FileIO* pending_io;
void* p;
pending_io = 0;
p = 0;
typedef std::map<u64, Block> BlockCache;
typedef std::map<u64, void*> BlockCache;
typedef BlockCache::iterator BlockIt;
static BlockCache block_cache;
enum FileIOFlags
ssize_t lowio(int fd, bool is_write, off_t ofs, size_t size, void* buf)
// coming from cache
// aio_return called
// we allocated buffer
lseek(fd, ofs, SEEK_SET);
return write(fd, buf, size);
return read (fd, buf, size);
struct IOSlot
FileIO io;
void* temp_buf;
u64 block_id;
// needed so that we can add the block to the cache when
// its IO is complete. if we add it when issuing, we'd no longer be
// thread-safe: someone else might find it in the cache before its
// transfer has completed. don't want to add an "is_complete" flag,
// because that'd be hard to update (on every wait_io).
void* cached_block;
// != 0 <==> data coming from cache and no IO issued.
// given buffer
// given buffer, will copy from cache
// temp buffer allocated here
// temp buffer taken from cache
// pads the request up to BLOCK_SIZE, and stores the original parameters in IO.
// transfers of more than 1 block (including padding) are allowed, but do not
// go through the cache. don't see any case where that's necessary, though.
int file_start_io(File* const f, const off_t user_ofs, size_t user_size, void* const user_p, FileIO* io)
// don't just use operator[], so that block_cache isn't cluttered
// with IDs associated with 0 (blocks that wouldn't be cached anyway).
static void* block_find(u64 block_id)
int err;
memset(io, 0, sizeof(FileIO));
// check params
const bool is_write = (f->flags & FILE_WRITE) != 0;
const bool alloc_buf = (user_p == 0);
const bool cache_block = (f->flags & FILE_CACHE_BLOCK) != 0 && alloc_buf;
if(user_size == 0)
debug_warn("file_start_io: user_size = 0 - why?");
// cut off at EOF.
// avoid min() due to type conversion warnings.
const off_t bytes_left = f->size - user_ofs;
if(bytes_left < 0)
debug_warn("file_start_io: EOF");
return ERR_EOF;
if((off_t)user_size > bytes_left)
user_size = (size_t)bytes_left;
// guaranteed to fit, since size was > bytes_left
debug_out("file_start_io hio=%I64x ofs=%d size=%d\n", hio, user_ofs, user_size);
size_t padding = 0;
size_t size = user_size;
void* buf = user_p;
off_t ofs = user_ofs;
// we're supposed to allocate the buffer
debug_warn("file_start_io: writing but buffer = 0");
// optimization: pad to eliminate a memcpy if unaligned
ofs = user_ofs;
padding = ofs % SECTOR_SIZE;
ofs -= (off_t)padding;
size = round_up(padding + user_size, SECTOR_SIZE);
buf = mem_alloc(size, SECTOR_SIZE);
return ERR_NO_MEM;
// store request params needed by file_wait_io
io->cb = (aiocb*)calloc(sizeof(aiocb), 1);
// must be zeroed! (waio complains about req_ != 0)
err = ERR_NO_MEM;
goto fail;
io->padding = padding;
io->user_size = user_size;
io->block_id = 0;
io->our_buf = alloc_buf;
// all other members zeroed by memset above.
// note: cb will hold the actual IO request
// (possibly aligned offset and size).
const u64 block_id = block_make_id(f->fn_hash, ofs);
//debug_out("ofs=%x\tid=%I64x", user_ofs, block_id);
// already in cache?
io->block_id = block_id;
Block& b = block_cache[block_id];
// yes; no need to issue
// debug_out(".. hit\n");
io->from_cache = true;
return 0;
io->given_to_cache = true;
// debug_out(".. miss\n");
b.pending_io = io;
b.p = (void*)buf;
//debug_out(".. uncacheable => miss\n");
// send off async read/write request
aiocb* cb = io->cb;
cb->aio_lio_opcode = is_write? LIO_WRITE : LIO_READ;
cb->aio_buf = buf;
cb->aio_fildes = f->fd;
cb->aio_offset = ofs;
cb->aio_nbytes = size;
err = lio_listio(LIO_NOWAIT, &cb, 1, (struct sigevent*)0);
if(err < 0)
return err;
return 0;
// indicates if the IO referenced by <io> has completed.
// return value: 0 if pending, 1 if complete, < 0 on error.
int file_io_complete(FileIO* io)
int ret = aio_error(io->cb);
if(ret == EINPROGRESS)
BlockIt it = block_cache.find(block_id);
if(it == block_cache.end())
return 0;
if(ret == 0)
return 1;
debug_warn("file_io_complete: unexpected aio_error return");
return -1;
return it->second;
int file_wait_io(FileIO* io, void*& p, size_t& size)
static void block_add(u64 block_id, void* block)
debug_out("file_wait_io: hio=%I64x\n", hio);
// zero output params in case something (e.g. H_DEREF) fails.
p = 0;
size = 0;
// aio_return may be called exactly once.
// if user code must be able to call this > 1x, store bytes_transferred
// in FileIO.
debug_warn("file_wait_io: already called");
return -1;
io->return_called = 1;
aiocb* cb = io->cb;
ssize_t bytes_transferred;
Block* b;
if(io->from_cache || io->given_to_cache)
b = &block_cache[io->block_id];
cb = b->pending_io->cb;
// already finished, no wait necessary
cb = 0;
p = b->p;
bytes_transferred = BLOCK_SIZE;
// wait for transfer to complete.
const aiocb** cbs = (const aiocb**)&cb; // pass in an "array"
while(aio_error(cb) == EINPROGRESS)
aio_suspend(cbs, 1, (timespec*)0); // wait indefinitely
// query number of bytes transferred (-1 if the transfer failed)
bytes_transferred = aio_return(cb);
p = (void*)cb->aio_buf; // cast from volatile void*
// mark block's pending IO as complete
if(io->from_cache || io->given_to_cache)
b->pending_io = 0;
if(bytes_transferred < (ssize_t)io->user_size)
return -1;
size = io->user_size;
// padding optimization: we rounded the start offset down
// to avoid a buffer memcpy in waio. skip past that
(char*&)p += io->padding;
return 0;
debug_warn("block_add: already in cache");
block_cache[block_id] = block;
int file_discard_io(FileIO* io)
static ssize_t block_issue(File* f, IOSlot* slot, const off_t issue_ofs, void* buf)
debug_warn("file_discard_io: file_wait_io wasn't called yet");
return -1;
memset(slot, 0, sizeof(IOSlot));
if(io->our_buf && !io->given_to_cache)
ssize_t issue_size = BLOCK_SIZE;
memset(io->cb, 0, sizeof(aiocb));
// check if in cache
slot->block_id = block_make_id(f->fn_hash, issue_ofs);
slot->cached_block = block_find(slot->block_id);
goto skip_issue;
memset(io, 0, sizeof(FileIO));
return 0;
//debug_out("%x miss\n", issue_ofs);
// allocate temp buffer
buf = slot->temp_buf = mem_alloc(BLOCK_SIZE, BLOCK_SIZE);
// if using buffer, set position in it; otherwise, use temp buffer
int err = file_start_io(f, issue_ofs, BLOCK_SIZE, buf, &slot->io);
if(err < 0)
issue_size = (ssize_t)err;
return issue_size;
// transfer modes:
// *p != 0: *p is the source/destination address for the transfer.
@ -858,6 +805,10 @@ int file_discard_io(FileIO* io)
// the read into blocks, so aio's buffer won't have to cover the
// whole file.
ssize_t file_io(File* const f, const off_t data_ofs, size_t data_size, void** const p,
const FileIOCB cb, const uintptr_t ctx) // optional
@ -923,6 +874,7 @@ invalid:
// only align if we allocate the buffer and in AIO mode
const bool do_align = buf_type != USER && !no_aio;
const bool cache = buf_type == TEMP;
@ -966,16 +918,9 @@ invalid:
// transferring via lowio only.
// skip aio code, use lowio
lseek(f->fd, data_ofs, SEEK_SET);
return write(f->fd, data_buf, data_size);
return read(f->fd, data_buf, data_size);
return lowio(f->fd, is_write, data_ofs, data_size, data_buf);
@ -984,7 +929,8 @@ invalid:
const int MAX_IOS = 4;
FileIO ios[MAX_IOS] = { 0 };
IOSlot ios[MAX_IOS] = { {0} };
int head = 0;
int tail = 0;
@ -1002,6 +948,7 @@ invalid:
ssize_t err = +1; // loop terminates if <= 0
size_t remaining_blocks = actual_size / BLOCK_SIZE;
@ -1009,60 +956,66 @@ invalid:
// start transferring next block.
if(pending_ios < MAX_IOS && !all_issued && err > 0)
// calculate issue_size:
// at most, transfer up to the next block boundary.
off_t issue_ofs = (off_t)(actual_ofs + issue_cnt);
size_t issue_size = BLOCK_SIZE;
const size_t left_in_block = BLOCK_SIZE - (issue_ofs % BLOCK_SIZE);
const size_t total_left = data_size - issue_cnt;
issue_size = MIN(left_in_block, total_left);
// get next free IO slot in ring buffer
FileIO* io = &ios[head];
IOSlot* slot = &ios[head];
head = (head + 1) % MAX_IOS;
// if using buffer, set position in it; otherwise, 0 (temp)
void* data = (buf_type == TEMP)? 0 : (char*)actual_buf + issue_cnt;
int ret = file_start_io(f, issue_ofs, issue_size, data, io);
if(ret < 0)
err = (ssize_t)ret;
off_t issue_ofs = (off_t)(actual_ofs + issue_cnt);
void* buf = (buf_type == TEMP)? 0 : (char*)actual_buf + issue_cnt;
ssize_t issued = block_issue(f, slot, issue_ofs, buf);
if(issued < 0)
err = issued;
// transfer failed - loop will now terminate after
// waiting for all pending transfers to complete.
issue_cnt += issue_size;
issue_cnt += issued;
if(issue_cnt >= actual_size)
all_issued = true;
// IO pending: wait for it to complete, and process it.
else if(pending_ios)
FileIO* io = &ios[tail];
IOSlot* slot = &ios[tail];
tail = (tail + 1) % MAX_IOS;
void* block;
size_t size;
int ret = file_wait_io(io, block, size);
if(ret < 0)
err = (ssize_t)ret;
void* block = slot->cached_block;
size_t size = BLOCK_SIZE;
// wasn't in cache; it was issued, so wait for it
bool from_cache;
from_cache = true;
from_cache = false;
// first time; skip past padding
void* data = block;
if(raw_transferred_cnt == 0)
(char*&)data += lead_padding;
size -= lead_padding;
int ret = file_wait_io(slot->io, block, size);
if(ret < 0)
err = (ssize_t)ret;
// first time; skip past padding
void* data = block;
if(raw_transferred_cnt == 0)
(char*&)data += lead_padding;
size -= lead_padding;
// don't include trailing padding
if(raw_transferred_cnt + size > data_size)
size = data_size - raw_transferred_cnt;
// we have usable data from a previous temp buffer,
// but it needs to be copied into the user's buffer
if(from_cache && buf_type != TEMP)
memcpy((char*)data_buf+raw_transferred_cnt, data, size);
// don't include trailing padding
if(raw_transferred_cnt + size > data_size)
size = data_size - raw_transferred_cnt;
//// if size comes out short, we must be at EOF
@ -1085,7 +1038,15 @@ invalid:
actual_transferred_cnt += size;
if(buf_type == TEMP)
// adding is allowed and we didn't take this from the cache already: add
block_add(slot->block_id, slot->temp_buf);
// (all issued OR error) AND no pending transfers - done.

View File

@ -127,33 +127,17 @@ extern int file_unmap(File* f);
// async IO
struct FileIO
u64 block_id;
// set by file_start_io when in block-cache mode, otherwise 0.
aiocb* cb;
// large (144 bytes) on Linux; cannot store here.
// allocated in file_start_io.
size_t padding;
size_t user_size;
bool our_buf;
bool from_cache;
bool given_to_cache;
bool return_called;
typedef void* FileIO;
extern int file_start_io(File* f, off_t ofs, size_t size, void* buf, FileIO* io);
// indicates if the given IO has completed.
// return value: 0 if pending, 1 if complete, < 0 on error.
extern int file_io_complete(FileIO* io);
extern int file_io_complete(FileIO io);
extern int file_wait_io(FileIO* io, void*& p, size_t& size);
extern int file_wait_io(FileIO io, void*& p, size_t& size);
extern int file_discard_io(FileIO* io);
extern int file_discard_io(FileIO io);