
setup new docs dir for thesis and maybe other files (as discussed with stuart, it doesn't fit in binaries or source)

commit rough draft

This was SVN commit r3676.
This commit is contained in:
janwas 2006-03-23 18:17:13 +00:00
parent 3efeeecb84
commit eba3e5fdd6

docs/file_io_thesis.rtf Normal file

@ -0,0 +1,623 @@
Abstract

Slow I/O is widespread, as attested to by splash screens and progress bars; however, it can be done better.
We present a reusable and highly efficient I/O library, discuss design decisions and key algorithms, and analyze the resulting performance.

Introduction

Motivation / Importance of Fast I/O

Since I/O is much slower than CPU or memory, it can very quickly become a bottleneck. An estimate as of 2006 is 60 MB/s for the hard drive vs. 2600 MB/s for memory. Many applications would therefore benefit from faster I/O; example scenarios include:
- slow startup time. The user is inconvenienced by waiting for required files to load; this is often exacerbated by splash screens and other distractions. For a rather extreme example of this problem, see http://www.break.com/index/patiencechild.html.
- on-demand loading. If the data set is too large to fit in memory, it must be loaded in increments as needed. This can cause 'freezes' in the application while waiting for the I/O to finish.
- heavy throughput requirements. Some applications, e.g. video players or editing tools, require high sustained I/O throughput.

Intended Application

The application for which our I/O library has been developed is a Real-Time Strategy computer game [0ad]. Both on-demand streaming of data and bulk loading on startup must be handled efficiently. While we intend for this I/O code to remain useful for a wide range of applications, several consequences arise from this choice and guide our design decisions.
First, much emphasis is placed on real-time behavior. Lag or 'freezing' in-game is not acceptable and must be minimized. This means that the caching algorithm must not have offline performance characteristics, reordering I/Os is probably not acceptable, and any pre-fetching would have to be quite conservative (so as not to penalize time-critical on-demand loads).
Also, the working set is not static; depending on game mode and environment, different files may be needed. Provision must be made for varying access patterns.
Finally, and related to the real-time issue, is the matter of fragmentation. Games can run for several hours; during that time, performance must not degrade to unacceptable levels, e.g. due to memory fragmentation. Given the real-time requirements, offline reorganization is not an option; the algorithms used must be designed accordingly.

Given these central design constraints, we now present the chief ideas behind our fast I/O method.

Techniques

Our approach is five-fold:
1) caching avoids repeating slow I/Os;
2) ordering files according to access patterns minimizes hard-drive seeks;
3) compressing files reduces the amount of data to read;
4) asynchronous I/O maximizes throughput and allows computation to proceed in parallel with I/O;
5) splitting I/Os into blocks simplifies caching and decompression while also avoiding copying buffers for alignment purposes.

We will discuss each of these in detail below, but first cover related theoretical work in this field.

Related Theoretical Work

Cache

For the cache, a central question is which files to keep in memory. This is known as the file- or web-caching problem. In short, given a sequence of file requests (each file with a size and retrieval cost), a cache is maintained such that the total retrieval cost is minimized.

The special case where size and cost are uniform is called "paging", which has been studied extensively. Several algorithms with an optimal competitive ratio are known. In particular, LRU (Least Recently Used, which simply evicts the file whose access time is least recent) is k/(k-h+1)-competitive, which is the best possible for a deterministic algorithm [Sleator/Tarjan].

This model is appealing due to its simplicity, but is not sufficient for our needs. Files are typically not of uniform size, and treating them as such would be monstrously inefficient (much cache space would be wasted by rounding element size up to that of the largest file).

Irani gives two O(log^2 k)-competitive randomized algorithms that can deal with variable-sized files and uniform cost [Irani].

However, we would like to achieve full generality and provide for variable cost as well. This can be used, as the name suggests, to reflect load time more accurately (as will be seen below, this is not solely dependent on file size!), or as a hint from the application that certain files are not to be removed from the cache as early as they otherwise would be.

Young develops such an algorithm and calls it Landlord. Briefly, each file receives 'credit' that is initially set to its cost. When determining which file is to be removed from the cache (i.e. 'evicted'), each one is charged 'rent' proportional to its size and the minimum credit-per-size density currently in the cache. Items are evicted once their credit reaches 0. On every access, credit is increased in an arbitrary manner. This strategy is k/(k-h+1)-competitive, which again is optimal for a deterministic algorithm. [Young02]

We end up using an optimized variant of this Landlord cache management strategy.

Allocation

Another important part of caching is the memory allocation aspect. For reasons that will be discussed below, existing general-purpose allocators are not adequate; an alternative has to be developed. We build on decades of work in this area.

Wilson et al. give a very thorough and helpful overview. A simple but crucial point is made: fragmentation is caused by freeing regions whose neighbors are not free. Allocators are online algorithms whose only tool against this is placement - deciding where to allocate regions. The authors advocate benchmarking by means of traces (records of allocations) from real-world programs, because randomized tests do not necessarily reflect reality. It is emphasized that allocation policy and mechanism must be considered separately. Test results show certain policies, namely address-ordered first (segregated) fit, to perform quite well, wasting only about 14% of memory. Finally, the further discussion of implementation details such as boundary tags was helpful. [DynStorageReview]

Johnstone and Wilson go on to refine their measure of fragmentation and conclude that the previously mentioned AO-first-fit policy actually suffers from only ~1% fragmentation, the best of all techniques considered. [MemFragSolved] This promising result leads us to focus on that policy.

Masmano et al. present a "Two Level Segregated Fit" algorithm with O(1) time complexity. [TLSF]

We end up implementing a simpler variant based on this idea that also avoids the need for block headers, which is the abovementioned problem preventing use of a general allocator.

Ordering - Traveling Salesman Problem

The problem of ordering files according to access patterns can be seen as an instance of the Traveling Salesman Problem. The latter is defined as: given a graph of nodes (cities) and the cost of traveling from one to another (travel distance), compute a path that takes the salesman to each city while incurring minimal cost. In our case, files correspond to cities and hard-disk seek distance to cost.

TSP has perhaps been studied most among all optimization problems; numerous algorithms and heuristics have been developed, each with their strengths and weaknesses. [DIMACS Challenge] gives an extensive listing of algorithms, their relative performance and techniques, and was a valuable reference.

For our purposes, however, a simple greedy heuristic is sufficient.

Detailed Discussion of Techniques

We now cover in detail the individual techniques used to speed up I/O.

Efficient Asynchronous I/O

For an understanding of how to achieve maximum I/O read throughput, we briefly explain how the hard drive is accessed on PC systems.

Early IDE (Integrated Drive Electronics - a marketing-driven name) disks were addressed via Programmed I/O, where the CPU instructs the drive to transfer 2 bytes at a time. Due to significant per-transfer overhead (accessing I/O registers and interrupting the CPU upon completion), throughput only reaches a maximum of 16.7 MB/s (PIO Mode 4) [http://www.pcguide.com/ref/hdd/if/ide/modes_PIO.htm].

Once rising HD platter densities - and the resulting increased transfer speeds - caused this to become a bottleneck, bus-mastering DMA (Direct Memory Access) over the PCI bus became the norm. Here, the disk controller writes directly to memory, bypassing the CPU. The CPU is free to perform other work during this time, so long as the bus is not needed - an important point that will affect our choice of I/O block size below.

Given this information, we now examine the I/O interfaces provided by the operating system. POSIX allows synchronous blocking I/O, blocking I/O in another thread, and asynchronous I/O ("aio").
The first falls from consideration because it does not allow work to proceed in parallel with the I/O. Several implementation details cause us to choose aio over the threaded approach:
- on Windows, aio bypasses the OS file cache, which is key to reaching full throughput.
- aio queues reads so that the disk controller can proceed immediately with the next I/O; the disk is always busy. With threaded blocking I/O, the OS would have to return from and then reenter kernel mode before relaying the next I/O request to the disk. This overhead reduces throughput.
- parallelism between computation and I/O is achieved without having to worry about the OS correctly scheduling all participating threads. Additionally, behavior is predictable and thread-switch overhead is avoided.

As a final detail, the POSIX aio functionality is emulated on Windows in terms of the "overlapped" ReadFile API; this ensures portability to virtually all systems.

To summarize, we use asynchronous I/O to achieve the best possible throughput and allow computation to proceed in parallel. This is made possible by the hard drive's DMA interface.
The validity of this approach is shown by a small test program that reaches maximum rated drive throughput, and by [performance study of sequential I/O on Windows NT 4].
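
To make the interface concrete, here is a minimal sketch of issuing a single asynchronous read via POSIX aio; the filename, buffer size and error handling are placeholder simplifications, not the library's actual code.

    #include <aio.h>
    #include <fcntl.h>
    #include <unistd.h>
    #include <cstdio>
    #include <cstring>

    int main()
    {
        const int fd = open("archive.zip", O_RDONLY);   // placeholder filename
        if(fd < 0)
            return 1;

        alignas(4096) static char buf[64*1024];         // sector-aligned buffer

        aiocb cb;
        memset(&cb, 0, sizeof(cb));
        cb.aio_fildes = fd;
        cb.aio_buf    = buf;
        cb.aio_nbytes = sizeof(buf);
        cb.aio_offset = 0;
        if(aio_read(&cb) != 0)       // queues the read; returns immediately
            return 1;

        // ... computation proceeds here, in parallel with the transfer ...

        const aiocb* list[1] = { &cb };
        aio_suspend(list, 1, 0);     // wait for completion
        printf("read %zd bytes\n", aio_return(&cb));
        close(fd);
        return 0;
    }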

Compression

The next cornerstone of our I/O library is compressing source files. This can dramatically reduce the amount of data to read. Indeed, the current 0ad dataset has been compressed down to 46% of its original size, a savings of 75 MB. (NB: the dataset includes 13 MB of incompressible audio; 3D mesh files with compression ratios of ~3x are chiefly responsible for the reduction.)

The compression algorithm used is Deflate, a combination of LZ77 and Huffman encoding, as defined in [RFC1951] and used in the common Zip file format [ZipAppNote]. Other formats may achieve better compression ratios or feature faster compression/decompression speed, but these are not critical to success. We prefer the advantage of interoperability - tools to work with Zip archives are universally available.

In addition to the abovementioned significant reduction in file size, a further compelling argument for compressing all data files is that it is effectively free!
Since the asynchronous I/O method mentioned above allows parallelizing I/O and decompression, we need only show that the latter takes less time. Indeed, a benchmark shows that a typical Pentium 4 system (as of 2002) manages 40 MB/s I/O throughput and 100 MB/s decompression [http://archive.gamespy.com/hardware/june02/p45331/index2.shtm].
Therefore, any reduction in file size due to compression lessens I/O time at no cost.

Note: this balance is not expected to change in the future for single-disk systems; even if it does, a compression method more suited to real-time decompression can be substituted.

It remains to discuss how exactly I/O and decompression are parallelized. Presuppose that I/Os are split into blocks, the rationale for which is explained below. These blocks are issued asynchronously up to a surely safe queue depth (currently 4). A block whose I/O has finished is then decompressed while the next ones are pending.
Since decompression is faster than I/O (as shown above), this parallelizes perfectly in that decompression is 'hidden' behind I/O cost.
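
A sketch of this pipeline, with hypothetical issue/wait/decompress helpers standing in for the aio wrappers and the inflate step; only the control flow is the point here:

    #include <cstddef>
    #include <queue>

    struct Block { size_t id; /* buffer, aiocb, ... */ };
    static Block issue_block_read(size_t id) { Block b; b.id = id; return b; }  // queue an aio read
    static void  wait_for_block(const Block&)   {}  // e.g. aio_suspend
    static void  decompress_block(const Block&) {}  // e.g. zlib inflate

    static const size_t QUEUE_DEPTH = 4;   // the "surely safe" depth from the text

    void read_and_decompress(size_t total_blocks)
    {
        std::queue<Block> pending;
        size_t next = 0;
        while(next < total_blocks || !pending.empty())
        {
            // keep the disk busy: top the queue up to QUEUE_DEPTH reads
            while(next < total_blocks && pending.size() < QUEUE_DEPTH)
                pending.push(issue_block_read(next++));
            // decompress the oldest block while the queued I/Os proceed
            wait_for_block(pending.front());
            decompress_block(pending.front());
            pending.pop();
        }
    }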

Ordering Files

The abovementioned techniques are not yet sufficient. While good sequential read performance is hereby attained, total throughput is quite poor because files tend to be scattered throughout the disk. This incurs expensive seeks (moving the hard-disk read head); a rough estimate of their cost is the time taken to read 400 KB (assuming a typical 7200 RPM drive with 10 ms seek time and 40 MB/s throughput [www.storagereview.com]). Given that files are often much smaller on average (25 KB for 0ad), seek time dwarfs pure I/O read time.

Throughput can be much improved by arranging files on disk in order of access, thus avoiding seeks. Since we wish to use a standard file system (whose placement strategy we cannot control) for simplicity, files have to be combined into one large OS-visible file - an archive. As mentioned above, we prefer the Zip format for easy interoperability.

Incidentally, storing files in archives has an additional advantage. The FS needs to store metadata and typically sector-aligns files; since sectors are 512 bytes or larger, this is very costly for tiny files. (NB: ReiserFS4 is the only known exception, able to pack several files into one sector.)
In contrast, archives can contain files packed end-to-end with only minimal metadata/header information, thus wasting less space and by extension reducing read time.

It remains to determine the optimal file ordering that minimizes seeks. This will be done once (offline); performance is therefore not of paramount importance.

First, though, we decide whether files may be repeated in the archive. To see the problem, consider the following sequence, where file 'C' is loaded after 'A' 50% of the time and otherwise after 'B': AC...BC...AC...BC. It would seem that 50% of 'C' accesses must incur a seek, but placing two copies of this file in the archive - after 'A' and after 'B' - can avoid them entirely.
However, practical considerations lead us to disallow this: the act of finding a file within the archive would become a good deal more complicated.

Now back to the issue of finding an ordering for the files. Our strategy is as follows:
1) view all files to be added as nodes in a DAG (Directed Acyclic Graph); edges indicate that 2 files are immediate neighbors in the archive.
2) record a "trace" of all file accesses over one or more program runs (recall that access patterns may differ between runs).
3) construct from this a list of possible edges sorted by their frequency (i.e. how often they occurred in the trace).
4) generate a set of 'chains' by committing the above edges as long as no cycle results. These chains are connected portions of the DAG that are known to have been accessed in that order.
5) output the final file ordering by stitching together all chains and then adding any remaining files that were not included in the trace.

Details on these steps follow.

1: Prepare DAG of Files

Each node holds all required information about the file. This includes its filename and the nodes that have been chosen to come before and after it in the final layout. All of these are stored as 16-bit IDs to reduce size and therefore improve locality; mapping from filename to ID is accomplished in logarithmic time via a tree.
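
A sketch of such a node and the filename-to-ID mapping; the names and the 0xFFFF sentinel are our illustrative choices, not necessarily the actual layout:

    #include <stdint.h>
    #include <map>
    #include <string>
    #include <vector>

    typedef uint16_t FileId;
    static const FileId NONE = 0xFFFF;   // sentinel: no neighbor chosen yet

    struct FileNode
    {
        std::string name;
        FileId prev, next;   // committed neighbors in the final layout
        FileNode(const std::string& n) : name(n), prev(NONE), next(NONE) {}
    };

    std::vector<FileNode> nodes;          // indexed by FileId
    std::map<std::string, FileId> id_of;  // filename -> ID, O(log N) via tree

    FileId add_file(const std::string& name)
    {
        std::map<std::string, FileId>::const_iterator it = id_of.find(name);
        if(it != id_of.end())
            return it->second;
        nodes.push_back(FileNode(name));
        const FileId id = (FileId)(nodes.size() - 1);
        id_of[name] = id;
        return id;
    }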

2: Record Trace

The acts of loading a file and releasing the resulting memory are logged (the latter is required by the file cache). Records consist of timestamp, filename, file size and any flags that affect I/O mode. For simplicity, we do not record file offset / transfer size: that would not yield any information, because seeks are incurred by accessing any part of the file. Also, we assume that loading entire files at a time is the dominant model.

Besides the obvious application of determining the optimal archive ordering, the resulting plain-text file can be used to benchmark the I/O implementation under repeatable conditions.
Even when lacking the actual data files, the trace can still be useful for benchmarking the performance of the file cache and ordering. For this, simply map filenames to an integral ID and simulate the cache and I/O parts.

Notes:
- we are careful to ensure that recording a trace does not incur any I/Os, which would skew performance measurements. Records are stored in binary format within an expandable array (no copying or memory waste, thanks to pre-reserved virtual address space).
- trace files may log accesses over several program runs. This will be useful in the following steps because several mutually exclusive but equally probable access patterns may exist, each of which should be considered equally. Program runs are differentiated by examining the timestamp, which starts at 0 on each run.
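
For illustration, a possible layout of one in-memory trace record; the field names, types and the use of a filename-table index rather than an inline string are our assumptions:

    #include <stdint.h>

    // one fixed-size binary record per load/free event
    struct TraceEntry
    {
        float    timestamp;   // seconds since program start; restarts at 0 per run
        uint32_t name_id;     // index into a table of filenames (assumed layout)
        uint32_t size;        // file size in bytes
        uint32_t flags;       // I/O mode flags; also distinguishes load vs. free
    };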

3: Construct Edge List

This step constructs a list of edges from the trace file. First, the trace is split into program runs, which are processed most recent first. In each of these, all adjacent pairs of files are examined; those not already in the list are added, otherwise the existing edge's frequency is incremented.

Important note: presuppose the existence of a file cache, which will be presented in the next section. Since frequent accesses to files will be absorbed by this cache, we do not want this inflated frequency to 'pollute' the edge list. That would displace other edges that might actually turn out to be more important, because they actually would incur seeks, as opposed to the edge whose file I/Os would be satisfied by the cache.
Our solution to this problem is to simulate the file cache while processing trace entries (only within the same program run!); if the file access would not result in an I/O due to the cache, the current edge is ignored.
Under the assumption that access patterns are similar to the trace, this scheme improves the quality of the ordering by making it reflect the trace more strongly (rather than being fooled by frequent I/Os). If not, correctness is not impacted; we merely risk incurring a few more seeks.

Checking whether an edge already exists is accomplished by translating the two filenames into 16-bit IDs (O(log N) time), appending these into a 32-bit number and searching for that in a tree (O(log N) time).
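
A sketch of this edge lookup (illustrative names; FileId as in the step-1 sketch):

    #include <stdint.h>
    #include <map>

    typedef uint16_t FileId;

    inline uint32_t edge_key(FileId first, FileId second)
    {
        return ((uint32_t)first << 16) | second;   // append the two 16-bit IDs
    }

    std::map<uint32_t, unsigned> edge_frequency;   // tree: key -> occurrence count

    void record_edge(FileId first, FileId second)
    {
        ++edge_frequency[edge_key(first, second)]; // inserts with count 1 if new
    }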

Finally, this list is sorted by decreasing frequency (the sort must be stable, i.e. the ordering of edges with identical frequency must not change! See the note below). The result is a list of unique edges (i.e. "file A should be stored after file B" relationships).

4: Generate Chains

This step is the heart of our file ordering strategy. The above edges are now 'committed' into the DAG in order. That means the files are marked to come after one another, i.e. their nodes in the DAG will be connected by an edge (unless a cycle were to result). For simplicity, committed edges are never removed, this being a greedy heuristic.

We check for cycles via "DFS", which here actually simplifies to a list walk, since nodes have only one previous and one next link. These walks are typically quite short, and the overall run time of this entire step is not a problem in practice (7 ms for 5000 files), so we do not attempt more efficient and sophisticated cycle detection schemes. One such approach would be to store a pointer to the current end of the list for each node and perform list jumping.
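
A sketch of the walk and the commit step, continuing the node layout sketched in step 1:

    // committing a -> b would create a cycle iff b's chain leads back to a
    bool would_form_cycle(FileId a, FileId b)
    {
        for(FileId i = b; i != NONE; i = nodes[i].next)
            if(i == a)
                return true;
        return false;
    }

    void commit_edge(FileId a, FileId b)
    {
        // greedy: only commit if both slots are free and no cycle results;
        // committed edges are never removed.
        if(nodes[a].next == NONE && nodes[b].prev == NONE && !would_form_cycle(a, b))
        {
            nodes[a].next = b;
            nodes[b].prev = a;
        }
    }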

The result of this step is a set of disjoint chains, each of which is a series of files that are to be stored immediately after one another. Due to the nature of the edge list, the files that are most frequently accessed after one another are grouped together. As such, we have attained a good approximation of an optimal tour.

Note: now the reason for the most-recent-first ordering of program runs becomes clear. All but the most frequent edges are placed into the list in the order in which they occurred in the trace (due to the stable sort). Since they are also committed to the DAG in this order, they end up mostly as observed in the trace. Since the most recent trace is assumed to be the most accurate and reflective of current behavior, it is given the most weight (by allowing all edges that ensued from it to be committed first).

5: Stitch Chains Together

The final step is to stitch together the disjoint chains and output them into the final ordered list. File nodes are marked once they have been output. We iterate over all nodes and output the entire chain of which each is a part; this is done by following the node's previous links until at the beginning of the chain, then walking forward.
Incidentally, this iteration ensures that all files appear in the output list, even if they were not included in the trace.
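
A sketch of this final pass, again continuing the earlier node sketch:

    #include <vector>

    std::vector<FileId> stitch_chains()
    {
        std::vector<FileId> order;
        std::vector<bool> done(nodes.size(), false);
        for(size_t i = 0; i < nodes.size(); i++)
        {
            if(done[i])
                continue;
            FileId head = (FileId)i;             // rewind to the chain's head
            while(nodes[head].prev != NONE)
                head = nodes[head].prev;
            for(FileId f = head; f != NONE; f = nodes[f].next)
                if(!done[f])
                {
                    done[f] = true;
                    order.push_back(f);          // files absent from the trace included too
                }
        }
        return order;
    }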

We have thus generated an ordering of files that minimizes seeks, assuming application behavior is similar to that which was recorded in the trace(s).

This is an approximation to a variant of the Traveling Salesman Problem; the question as to its quality (i.e. how many seeks are avoided) is interesting and will be examined in <<section 3>>.

Splitting Into Blocks

Splitting I/Os into fixed-size blocks is desirable for two reasons.

First, decompressing entire files at a time cannot be parallelized effectively; to see this, consider a series of alternating large and small files (L and S). The time spent waiting for L's I/O remains unused, and L's decompression cannot be hidden behind S's I/O. This alone requires splitting I/Os into blocks.

One further advantage is that of sector alignment. Due to the end-to-end packing in archives, files often start at unaligned offsets on disk. A limitation in the Windows ReadFile API would require copying such files to/from an alignment buffer. This can be avoided by splitting I/Os into blocks and rounding their offset/size down/up to sector boundaries.
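
For concreteness, the rounding could look as follows (512-byte sectors assumed for illustration; a real implementation would query the sector size):

    #include <stdint.h>

    static const uint64_t SECTOR_SIZE = 512;

    // expand [ofs, ofs+size) to sector boundaries; no align buffer needed
    void align_io(uint64_t ofs, uint64_t size,
                  uint64_t& aligned_ofs, uint64_t& aligned_size)
    {
        aligned_ofs = ofs & ~(SECTOR_SIZE-1);                                // round down
        const uint64_t end = (ofs+size + SECTOR_SIZE-1) & ~(SECTOR_SIZE-1);  // round up
        aligned_size = end - aligned_ofs;
    }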

We now decide on the block size. Many considerations come into play:
+ theoretically, larger sizes are good due to economy of scale (less overhead per transfer).
+ blocks should be aligned to sector sizes.
- blocks should not be too large, or else decompression cannot be done in-cache. That would result in bus accesses, which interfere with the DMA I/O operation. Typical L2 cache sizes are 256 to 512 KiB, which must cover both the compressed source and decompressed destination buffers.
- large blocks risk exceeding the scatter-gather list length, which forces splitting a request into 2 operations anyway. Background: scatter-gather lists are a series of physical pages into which DMA is to occur. Providing support for physically noncontiguous buffers in this manner is helpful if the memory available for DMA is fragmented or the buffer consists of several fragments. For concreteness, the Windows ASPI layer has a default limit of 64 KiB per transfer.
- in practice, there is no difference in aio read throughput for transfer sizes between 4 and 192 KiB [Windows 2000 Disk I/O Performance].
+ however, the aio queue depth (the maximum number of concurrent I/Os that can be queued by the OS) is system-dependent and should not be relied upon. It is therefore better to avoid overly small blocks, because it may not be possible to queue enough buffers to keep the disk continuously busy.

The result of these ruminations was a block size of 16 KiB. However, our measurements have shown 32 KiB to be most efficient.

This concludes the discussion of our I/O techniques. To review, I/Os are automatically split into blocks (of aligned start position and length) and issued asynchronously. Once a block finishes, it is decompressed while the next block's I/O is in progress. Finally, seeks are avoided by having arranged the files within an archive in order of access.

Caching

It's not true that life is one damn thing after another; it is one damn thing over and over.
- Edna St. Vincent Millay

The final step we take in optimizing I/O is caching. By keeping commonly used files in memory, some repeated I/Os can be avoided outright.

There are two 'levels' of cache: entire files and blocks.

The small block cache serves to avoid the overhead caused by sector-aligning I/O transfers. Since files usually start at unaligned offsets within archives, data lying at the beginning of a sector would be read twice (once for the real I/O and then again during the next file's I/O). The block cache absorbs this cost by keeping the last few blocks read in memory; it is organized as LRU.

The per-file caching strategy is due to the assumption that files will usually be loaded in one burst; it simplifies bookkeeping and avoids having to copy pieces of the file into a final buffer.
Our file cache is a system consisting of the following components:
- an allocator doles out variable-sized chunks of a fixed-size memory region.
- the 'extant list' keeps track of which buffers are currently in use by the application.
- a cache manager provides efficient lookup of file contents given the filename, and decides which files to keep in memory.

These are explained in detail below.

Allocator

A general-purpose allocator (e.g. malloc) is not acceptable for this application, because Windows ReadFile requires file buffer addresses to be aligned to a sector boundary. Rounding up returned addresses would waste unacceptable amounts of memory, so a special allocation scheme is needed that always returns aligned regions.

This entails not prefixing allocated regions with a header. Our idea is to transfer ownership of an allocated region from the allocator to the cache and/or extant list; these have to record region address and size anyway for their bookkeeping. When the region is to be freed, the extant list informs the allocator of its size and address, which is typically what a header would have stored.

Having now established the requirement for alignment and how to ensure it, we discuss the main problem of an allocator: fragmentation.
There are basically two ways to deal with this: perform periodic reorganization, or prevent it from happening in the first place.

The former is not feasible due to our real-time requirements and, more importantly, because users receive direct pointers to the cache memory. This allows zero-copy I/O and reduces memory footprint, because multiple users of a file can share its (read-only) contents. However, it is believed that currently in-use and therefore unmovable regions would severely hamper defragmentation. We therefore focus on the latter approach.

As shown by Johnstone and Wilson, fragmentation can be mitigated with a good allocation policy, e.g. Address-Ordered good-fit. However, we also attack the root cause: freeing objects whose neighbors are not free. Provision is made for the application to pass hints as to buffer lifetimes, so that long-lived objects can be placed differently and not cause 'holes' around freed short-lived objects.

With all pieces in place, we now discuss the allocation policy. As shown in [MemFragSolved], A-O good-fit performs well. When freeing, we coalesce regions immediately. This may perform unnecessary work, but is acceptable in light of its simplicity. Allocation first exhausts all available memory before reusing freelist entries. This is fine because the cache size is chosen such that it can and should be used in its entirety. The benefit is reduced freelist splitting, which tends to produce larger coalesced regions.

Implementation Details

A 'good' fit is achieved by searching segregated freelists. These are divided into size classes, where class i (>= 0) holds regions of size (2^(i-1), 2^i]. Determining the size class can be done by taking the base-2 logarithm of the size. If a freelist is empty, the allocation can be satisfied by finding the next-highest non-empty class (O(1) via bit scan) and splitting its first block.
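
A sketch of the size-class computation; a real implementation would use a single bit-scan instruction (e.g. BSR) instead of the portable loop shown:

    #include <stdint.h>

    // class i (>= 0) holds sizes in (2^(i-1), 2^i], i.e. i = ceil(log2(size))
    unsigned size_class(uint32_t size)
    {
        unsigned c = 0;
        for(uint32_t x = size - 1; x != 0; x >>= 1)  // highest set bit of size-1, plus 1
            c++;
        return c;
    }
    // a bitmap of non-empty freelists then lets "next non-empty class >= c"
    // be answered with one more bit scan, keeping the search O(1).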

Total allocation performance can be made O(1) by further splitting size classes into fixed-size subclasses; this is the approach taken by [TLSF]. However, we find that the freelists are typically empty anyway (because the cache is always as full as possible) and therefore omit this for simplicity.

Coalescing works by storing boundary tags within the freed(!) memory. When freeing a block, we check whether the regions that come before and after it bear such tags (identified via distinctive bit patterns that are very unlikely to occur in normal data); if so, they are merged. Note that this is somewhat risky, but the 'magic' bit pattern is long enough to make any mix-up extremely unlikely. This trouble is necessary because the tags cannot be added to the beginning/end of a region due to the alignment requirements.
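
A sketch of such a tag and the check performed when freeing a neighbor; the magic value and exact layout are illustrative assumptions:

    #include <stdint.h>
    #include <cstring>

    static const uint64_t FREED_MAGIC = 0xF1E2D3C4B5A69788ull; // illustrative pattern

    struct BoundaryTag
    {
        uint64_t magic;   // distinctive pattern, very unlikely in live data
        uint64_t size;    // size of the free region the tag belongs to
    };

    // is the region ending just before 'p' free? (it would end with a valid tag)
    bool prev_region_is_free(const unsigned char* p, uint64_t& prev_size)
    {
        BoundaryTag tag;
        std::memcpy(&tag, p - sizeof(tag), sizeof(tag));
        if(tag.magic != FREED_MAGIC)
            return false;
        prev_size = tag.size;
        return true;
    }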
\par }{\f1\insrsid408563 F}{\f1\insrsid13180669 or convenience, memory is doled out from a fixed-size chunk of virtual address space, rather than separate on-demand allocations from the OS. }{\f1\insrsid4401489
This allows easily checking whether a given pointer is valid and was taken from the chunk. Due to on-demand committing of the virtual memory, only }{\f1\insrsid5467766 as}{\f1\insrsid4401489 much physical memory as necessary is used.}{
\f1\insrsid10239896
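\par A sketch of the underlying Win32 mechanism; the 64 MiB figure and the names are illustrative:
\par
\par #include <windows.h>
\par
\par static BYTE* chunkBase;
\par static const SIZE_T chunkSize = 64u*1024u*1024u;   // illustrative size
\par
\par // reserve address space once; physical pages are committed only on use
\par static bool chunk_init()
\par {
\par     chunkBase = (BYTE*)VirtualAlloc(0, chunkSize, MEM_RESERVE, PAGE_NOACCESS);
\par     return chunkBase != 0;
\par }
\par
\par static void* chunk_commit(size_t ofs, size_t size)
\par {
\par     // VirtualAlloc rounds to page granularity
\par     return VirtualAlloc(chunkBase+ofs, size, MEM_COMMIT, PAGE_READWRITE);
\par }
\par
\par // a pointer is valid iff it lies within the chunk
\par static bool is_valid(const void* p)
\par {
\par     return (const BYTE*)p >= chunkBase && (const BYTE*)p < chunkBase+chunkSize;
\par }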
\par }{\f1\insrsid4401489
\par }{\f1\insrsid3629320 Extant List
\par
\par This list tracks all buffers that have been }{\f1\insrsid12547006 handed out}{\f1\insrsid3629320 to the application but not yet freed. Since they are expected to be freed immediately (
before allocating the next, which is enforced by a warning), this list only contains a few entries and therefore need not be organized as a tree.
\par
\par It stores address and size of the allocated regions, which are passed to the allocator when freeing a buffer. This }{\f1\insrsid7437835 avoids the need for}{\f1\insrsid3629320 per-regi}{\f1\insrsid15952639 on headers, as explained above.
\par An alternative would be }{\f1\insrsid3629320 providing a separate data structure }{\f1\insrsid15952639 associating allocated address with its size, but this is redundant since many of these regions are also stored in the cache. There
fore, our approach uses less memory.}{\f1\insrsid3629320
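\par A sketch of such a list (pre-C++11 style to match the codebase; the names are ours):
\par
\par #include <cstddef>
\par #include <vector>
\par
\par struct Extant { void* p; size_t size; };
\par static std::vector<Extant> extant;   // stays tiny; linear search suffices
\par
\par static void extant_add(void* p, size_t size)
\par {
\par     // a warning if the previous buffer has not yet been freed would go here
\par     Extant e = { p, size };
\par     extant.push_back(e);
\par }
\par
\par // returns the size that is passed on to the allocator when freeing
\par static size_t extant_remove(void* p)
\par {
\par     for(size_t i = 0; i < extant.size(); i++)
\par         if(extant[i].p == p)
\par         {
\par             const size_t size = extant[i].size;
\par             extant.erase(extant.begin() + i);
\par             return size;
\par         }
\par     return 0;   // not ours; caller should flag an error
\par }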
\par
\par }{\f1\insrsid15952639 Cache}{\f1\insrsid7437835 Manager}{\f1\insrsid3629320
\par
\par }{\f1\insrsid7437835 The cache manager is the heart of this system; it maps filenames to the files\rquote  cached contents and decides which ones}{\f1\insrsid12547006  to keep in memory. As stated above}{\f1\insrsid7437835
, we use the Landlord algorithm for this purpose.}{\f1\insrsid3629320
\par }{\f1\insrsid7437835
\par }{\f1\insrsid12547006 <}{\f1\insrsid7437835 Pseudocode}{\f1\insrsid12547006 >}{\f1\insrsid7437835
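\par Presumably the published Landlord algorithm is intended here; the following is our rendering, with the two eviction loops named as in the discussion below:
\par
\par request(f):
\par \tab if f is cached: credit[f] = max(credit[f], cost(f)); return contents(f)
\par \tab while free_space < size(f):
\par \tab \tab delta = min over all cached g of credit[g] / size[g]\tab // calcMCD
\par \tab \tab for each cached g: credit[g] -= delta * size[g]\tab // chargeAll
\par \tab \tab evict some g whose credit[g] == 0\tab // at least one now qualifies
\par \tab add f with credit[f] = cost(f); return contents(f)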
\par
\par We see that th}{\f1\insrsid11994078 e na\'efve version of this}{\f1\insrsid7437835  algorithm has a high CPU cost: }{\f1\insrsid11994078
eviction involves two complete loops over all cached items. The first step towards mitigating this cost is to choose a container with good locality, namely a hash map (stdext::hash_map in MSVC) instead of the tree-based std::map.}{\f1\insrsid3629320
\par }{\f1\insrsid14308065
\par }{\f1\insrsid11994078 We have }{\f1\insrsid14308065 also }{\f1\insrsid11994078 developed several improvements:
\par }{\f1\insrsid14308065 1) The costly divisions required to calculate credit density can be replaced by multiplication with a precomputed reciprocal. This trades increased memory use for lower latency (roughly 4 cycles for a multiply vs. 20 for a divide on the Athlon XP); see the sketch after this list.
\par 2}{\f1\insrsid11994078 a) the calcMCD and chargeAll loops can effectively be fused by calculating the next MCD}{\f1\insrsid12547006 value on the side. We therefore avoid iterating over all items twice}{\f1\insrsid11994078 , which is }{
\f1\insrsid12547006 especially }{\f1\insrsid11994078 important for large sets of items that do not fit in cache.
\par }{\f1\insrsid14308065 2}{\f1\insrsid11994078 b) a priority
queue can return and remove the MCD item in O(log N) time; the rent that should be charged to all items can be accumulated and applied in batches. The validity of this approach is not immediately clear: Landlord specifies decreasing all credit by delta *
item.size and removing any subset of items with no credit remaining. By definition of delta (the minimum credit density), at least one item reaches zero credit, and this is exactly the one returned by the priority queue.
\par Note that any pending charges must be committed before adding any items; otherwise, they too would be charged during the next commit cycle, which would be incorrect.
\par }{\f1\insrsid14308065 Implementation note: to avoid duplicating code, the priority queue is separate from the filename->cached contents mapping. Since it is or
dered by the item credit, the queue must be re-sorted after an item is accessed, which increases its credit. Due to limitations in the STL priority_queue, this takes O(N) time on every access. Since }{\f1\insrsid12547006 cache }{\f1\insrsid14308065
hits are fairly rare, time is still saved}{\f1\insrsid12547006 overall}{\f1\insrsid14308065 ; however, this bottleneck should be removed by substituting a heap implementation that allows a logN \'93sift\'94 operation.}{\f1\insrsid3629320
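\par The sketch promised in 1) above; the type and field names are ours:
\par
\par #include <cstddef>
\par
\par struct CacheEntry
\par {
\par     float credit;
\par     size_t size;
\par     float sizeReciprocal;   // 1.0f/size, set once when the item is added
\par };
\par
\par // credit density, used when searching for the minimum (MCD):
\par // was credit/size (about 20 cycles); now a multiply (about 4)
\par static inline float credit_density(const CacheEntry& e)
\par {
\par     return e.credit * e.sizeReciprocal;
\par }
\par
\par Note the cost of this trick: the extra field enlarges each element, which, as the measurements below show, can outweigh the cheaper arithmetic.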
\par
\par }{\f1\insrsid5980580 These improvements are made av}{\f1\insrsid1847956 ailable as template}{\f1\insrsid5980580 policy classes and can therefore easily be enabled for applications where they provide a benefit.}{\f1\insrsid3629320
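\par For illustration, the shape such a policy parameterization might take (the names are ours, not the actual interface):
\par
\par struct McdCalcNaive  { /* two full loops per eviction */ };
\par struct McdCalcCached { /* tracks the next MCD on the side, cf. 2a */ };
\par
\par template<class McdCalc>   // strategy is fixed at compile time
\par class CacheManager
\par {
\par     McdCalc mcdCalc;
\par     // filename -> cached contents mapping etc. as described above
\par };
\par
\par typedef CacheManager<McdCalcNaive>  Landlord;
\par typedef CacheManager<McdCalcCached> Landlord_Cached;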
\par }{\f1\insrsid5980580
\par We examine results of these optimizations in }{\f1\insrsid12547006 <<section 3>>.}{\f1\insrsid5980580
\par }{\f1\insrsid3629320
\par }{\f1\insrsid5980580 This concludes discussion of the cache. To recap, }{\f1\insrsid799807 the}{\f1\insrsid11684990 small block cache absorbs the cost of rounding}{\f1\insrsid6842252 I/Os}{\f1\insrsid11684990 up to block size boundaries. }{
\f1\insrsid799807 A }{\f1\insrsid11684990 file cache managed by the Landlord algorithm caches the contents of entire files}{\f1\insrsid799807 .}{\f1\insrsid3629320
\par }{\f1\insrsid5980580
\par }{\f1\insrsid14619881
\par }{\f1\insrsid5980580 Experimental Results}{\f1\insrsid3629320
\par }{\f1\insrsid5980580
\par System Information
\par }{\f1\insrsid1010827
\par The test system has the following specifications:
\par CPU: Athlon XP 2400+ (2000 MHz)
\par Memory: 768 MB DDR 2100 CL2.5
\par }{\f1\lang1031\langfe1033\langnp1031\insrsid1010827\charrsid1010827 Chipset: NForce2
\par }\pard \ql \li0\ri0\widctlpar\aspalpha\aspnum\faauto\adjustright\rin0\lin0\itap0\pararsid1010827 {\f1\insrsid1010827\charrsid1010827 HD: }{\f1\insrsid1010827 Deskstar }{\f1\insrsid1010827\charrsid1010827 7K250 (160 GB}{\f1\insrsid1010827 ,}{
\f1\insrsid1010827\charrsid1010827 PATA}{\f1\insrsid1010827 ,}{\f1\insrsid1010827\charrsid1010827 8 MB cache, 8.5}{\f1\insrsid1010827 }{\f1\insrsid1010827\charrsid1010827 ms rated seek}{\f1\insrsid1010827 , 30-40 MB/s measured throughput}{
\f1\insrsid11684990 }{\f1\insrsid1010827 )
\par }\pard \ql \li0\ri0\widctlpar\aspalpha\aspnum\faauto\adjustright\rin0\lin0\itap0\pararsid14619881 {\f1\insrsid14619881\charrsid14619881 OS: Windows XP SP2
\par }\pard \ql \li0\ri0\widctlpar\aspalpha\aspnum\faauto\adjustright\rin0\lin0\itap0\pararsid1010827 {\f1\insrsid14619881 Compiler: MS Visual C++ 7.1}{\f1\insrsid14619881\charrsid1010827
\par }\pard \ql \li0\ri0\widctlpar\aspalpha\aspnum\faauto\adjustright\rin0\lin0\itap0\pararsid12675798 {\f1\insrsid1010827
\par }\pard \ql \li0\ri0\widctlpar\aspalpha\aspnum\faauto\adjustright\rin0\lin0\itap0\pararsid9206128 {\f1\insrsid9206128 We now describe methodology and show results of several tests measuring performance of our}{\f1\insrsid6842252 I/O}{\f1\insrsid9206128
library.
\par }\pard \ql \li0\ri0\widctlpar\aspalpha\aspnum\faauto\adjustright\rin0\lin0\itap0\pararsid12675798 {\f1\insrsid9206128
\par }{\f1\insrsid5980580 IO Throughput
\par }{\f1\insrsid12124230
\par }{\f1\insrsid9206128 Methodology
\par
\par }\pard \ql \li0\ri0\widctlpar\aspalpha\aspnum\faauto\adjustright\rin0\lin0\itap0\pararsid9206128 {\f1\insrsid9206128 For all}{\f1\insrsid6842252 I/O}{\f1\insrsid9206128 -related meas
urements, we use a trace file recorded from the startup of 0ad encompassing ~500 file loads. Using the trace simulation feature described above, we issue these}{\f1\insrsid6842252 I/Os}{\f1\insrsid9206128
as fast as possible; this removes the influence of other system-specific conditions such as graphics card performance etc.
\par If a cache is involved, we ensure it is empty so as not to skew results; in the case of the OS file cache, testing takes place after a clean reboot.
\par
\par What is actually measured is the total amount of time elapsed between start and end of}{\f1\insrsid6842252 I/Os}{\f1\insrsid9206128 ; this together with the amount of user data transferred yields effective throughput (\'93effective\'94
because it differs from the actual disk throughput due to compression).
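\par In code form (a trivial but precise restatement; names are ours, and MB is taken as 10**6 bytes):
\par
\par // user bytes delivered per second of wall-clock time, independent of
\par // how many (compressed) bytes the disk actually transferred
\par static double effective_throughput_MBps(double userBytes, double elapsedSeconds)
\par {
\par     return userBytes / elapsedSeconds / 1e6;
\par }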
\par }{\f1\insrsid2424877
\par }{\f1\insrsid9206128 This was chosen as the benchmark measure because it reflects real-world performance of the entire system.
\par }\pard \ql \li0\ri0\widctlpar\aspalpha\aspnum\faauto\adjustright\rin0\lin0\itap0\pararsid12675798 {\f1\insrsid9206128
\par Results and Discussion
\par
\par W}{\f1\insrsid12124230 e are interested in the total improvement yielded by our}{\f1\insrsid6842252 I/O}{\f1\insrsid12124230 library, as compared to throughput reached by the bare OS-provided read() API.
\par According to the above measure, we see }{\f1\insrsid12124230\charrsid12124230 29.3}{\f1\insrsid12124230  MB/s vs. 2.96 MB/s: a staggering, nearly tenfold speedup (990 %)!
\par
\par We now examine which}{\f1\insrsid6842252 I/O}{\f1\insrsid12124230 techniques are chiefly responsible for these gains.
\par
\par When files stored in archives are no longer compressed (everything else unchanged), performance fall}{\f1\insrsid12547006 s}{\f1\insrsid12124230  from 27.2 MB/s to 22.2 MB/s. (Note: this measure differs from the peak performance listed above in that the file block size was not yet set to its optimal value.)
\par This leads us to conclude that }{\f1\insrsid16475960 disk throughput is a limiting factor, which is a good sign: it indicates that seeks are not the bottleneck. This will be discussed further below.}{\f1\insrsid5980580
\par }{\f1\insrsid16475960 As an aside, decompression performance indeed mirrors the previously quoted 100 MB/s figure; we }{\f1\insrsid12547006 observe }{\f1\insrsid16475960 94.5 MB/s}{\f1\insrsid9206128 .}{\f1\insrsid16475960
\par }{\f1\insrsid9206128
\par }{\f1\insrsid16475960 When archives are disabled entirely and}{\f1\insrsid6842252 I/O}{\f1\insrsid16475960 is from loose }{\f1\insrsid12547006 files }{\f1\insrsid16475960 (stored in the }{\f1\insrsid12547006 normal files}{\f1\insrsid16475960
ystem), performance drops to }{\f1\insrsid4937740 2.62 MB/s. The immediate conclusion is that reduced locality (due to poor FS ordering and extra headers) induces many costly seeks.
\par }{\f1\insrsid16475960 We }{\f1\insrsid4937740 also }{\f1\insrsid16475960 notice that }{\f1\insrsid4937740 performance is worse }{\f1\insrsid16475960 than th}{\f1\insrsid4937740 at}{\f1\insrsid16475960
measured for the synchronous API; this could be explained by the increased overhead of the aio APIs. Indeed, }{\f1\insrsid4937740 they do not support the Windows FastIO entry points, which avoid having to build an I/O request packet (IRP) for each request.}{\f1\insrsid16475960
\par }{\f1\insrsid4937740
\par Finally, we revisit the question of file block size. The initial choice of 16 KiB was not optimal; based on the following results, we go with 32 KiB.
\par Block Size (KiB)\tab Th}{\f1\insrsid9206128 r}{\f1\insrsid4937740 oughput (MB/s)
\par }\pard \ql \li0\ri0\widctlpar\aspalpha\aspnum\faauto\adjustright\rin0\lin0\itap0\pararsid4937740 {\f1\insrsid4937740\charrsid4937740 4\tab }{\f1\insrsid4937740 \tab \tab }{\f1\insrsid4937740\charrsid4937740 23.7
\par 16\tab \tab \tab 27.2
\par 32\tab \tab \tab 29.3
\par 64\tab \tab \tab 29.1
\par 128\tab \tab \tab 23.3}{\f1\insrsid4937740
\par It is interesting that performance }{\f1\insrsid2424877 begins to}{\f1\insrsid4937740  fall off }{\f1\insrsid2424877 starting with }{\f1\insrsid4937740 64 KiB blocks. This might be explained by transfers having to b
e split due to the previously mentioned scatter-gather list limit, but this is speculation.}{\f1\insrsid4937740\charrsid4937740
\par }\pard \ql \li0\ri0\widctlpar\aspalpha\aspnum\faauto\adjustright\rin0\lin0\itap0\pararsid12675798 {\f1\insrsid9206128
\par In summary, we have found that bundling files into archives is the most worthwhile improvement, due to reducing seeks. Once these are eliminated, the increased throughput afforded by the (free) data compression step contributes an additional 23 % speedup.
\par }{\f1\insrsid5980580
\par }\pard \ql \li0\ri0\widctlpar\aspalpha\aspnum\faauto\adjustright\rin0\lin0\itap0\pararsid2424877 {\f1\insrsid2424877 Cache Manager Optimizations
\par
\par Of further theoretical and practical interest is how well the various Landlord algorithm optimizations fare.
\par
\par Accounting of CPU cost is done as follows. First, external influences are minimized by running at the highest scheduler priority. }{\f1\insrsid141460 S}{\f1\insrsid2424877
everal thousand iterations of the target code are run while measuring elapsed time via a high-resolution timer (precise to 1 CPU clock!). Each iteration performs an operation (e.g. allocate or free) chosen at random; this avoids measuring characteristics that are specific to a given trace. Note, however, that we control the random distribution (in the example, the ratio of \'93allocate\'94  to \'93free
\'94  operations); these }{\f1\insrsid141460 are }{\f1\insrsid2424877 weighted towards the }{\f1\insrsid141460 most frequent and important }{\f1\insrsid2424877 operations.
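\par A sketch of such a loop; the test hooks, the 70/30 mix and the use of the __rdtsc intrinsic are illustrative (older compilers would read the TSC via inline assembly), and the overhead of rand() itself is ignored:
\par
\par #include <cstdlib>
\par #include <intrin.h>
\par
\par void cache_add_random_item();      // hypothetical test hooks
\par void cache_remove_random_item();
\par
\par // average cycles per operation over a random add/remove mix
\par static unsigned __int64 measure(unsigned iterations)
\par {
\par     const unsigned __int64 t0 = __rdtsc();
\par     for(unsigned i = 0; i < iterations; i++)
\par     {
\par         if(rand() % 100 < 70)      // cf. the 70 % add ratio noted below
\par             cache_add_random_item();
\par         else
\par             cache_remove_random_item();
\par     }
\par     return (__rdtsc() - t0) / iterations;
\par }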
\par }\pard \ql \li0\ri0\widctlpar\aspalpha\aspnum\faauto\adjustright\rin0\lin0\itap0\pararsid12675798 {\f1\insrsid9206128
\par }{\f1\insrsid141460 The first result is that with the na\'efve Landlord implementation, replacing the division by a multiplication with the reciprocal is actually 1.4 % slower! This is likely because the additional storage required for the reciprocal breaks the nice, cache-friendly 16-byte element size. Since this algorithm iterates over all items twice, the memory access cost weighs more heavily than the few extra CPU cycles spent dividing.}
{\f1\insrsid15160866
\par }{\f1\insrsid141460
\par Next, we find that the Landlord_Cached strategy (recall that it calculates minimum credit density while updating and therefore often avoids needing to iterate over all items) performs 21 % faster.
\par However, its divide-via-reciprocal variant is again slower \endash  this time by only 0.6 %. We see that }{\f1\insrsid13000184 the less often all items are iterated over, the smaller the memory-access penalty of storing the reciprocal becomes.}{\f1\insrsid141460
\par }{\f1\insrsid13000184
\par The final variant is Landlord_Lazy (which uses a priority queue to find the least valuable item in O(logN) and thus avoids iterating over all items when wanting to remove one from the cache)
. It performs 19 % better than baseline, which is slightly slower than the previous variant. Note that this result is heavily dependent on the relative frequency of add and remove operations: since the former require iteration over all items (to \lquote
commit\rquote a previous pending charge), decreasing their number from the current (and quite arbitrary) 70 % will cause this implementation to come out far ahead.
\par Applying the reciprocal divider results in further gains of 0}{\f1\insrsid5710756 .}{\f1\insrsid13000184 8 %. Since we rarely iterate over all items}{\f1\insrsid5710756 here}{\f1\insrsid13000184
, the increase in size is outweighed by the faster division.
\par }\pard \ql \li0\ri0\widctlpar\aspalpha\aspnum\faauto\adjustright\rin0\lin0\itap0\pararsid15160866 {\f1\insrsid15160866
\par }{\f1\insrsid13000184 To conclude this section, we find that Landlord_Cached performs best in the current benchmark. Since it is less complex and requires less memory than the possibly faster Landlord_Lazy strategy, it is chosen as the default.
\par However, the implementation via template policy classes allows easily switching strategies in applications where results differ.
\par }\pard \ql \li0\ri0\widctlpar\aspalpha\aspnum\faauto\adjustright\rin0\lin0\itap0\pararsid12675798 {\f1\insrsid15160866
\par }\pard \ql \li0\ri0\widctlpar\aspalpha\aspnum\faauto\adjustright\rin0\lin0\itap0\pararsid5048634 {\f1\insrsid5048634 Allocator Fragmentation
\par }\pard \ql \li0\ri0\widctlpar\aspalpha\aspnum\faauto\adjustright\rin0\lin0\itap0\pararsid12675798 {\f1\insrsid2424877
\par }\pard \ql \li0\ri0\widctlpar\aspalpha\aspnum\faauto\adjustright\rin0\lin0\itap0\pararsid5048634 {\f1\insrsid5048634 The important question of allocator fragmentation is next. We gauge
it in the course of simulating the previous 500-file trace. A simple and adequate measure is to compare the total requested size with how much of the total file cache is actually occupied.}{\f1\insrsid1847956
The result is a total memory waste of 14 %, which is in line with the findings of [Johnstone and Wilson]. While not great, this is acceptable.}{\f1\insrsid5048634
\par }\pard \ql \li0\ri0\widctlpar\aspalpha\aspnum\faauto\adjustright\rin0\lin0\itap0\pararsid12675798 {\f1\insrsid5048634
\par }{\f1\insrsid9206128
\par }{\f1\insrsid5980580 Ordering Quality
\par
\par The number of seeks has already been observed indirectly in the}{\f1\insrsid6842252  I/O}{\f1\insrsid5980580  throughput results; here we sketch how ordering quality could be measured directly.
\par
\par }\pard \ql \li0\ri0\widctlpar\aspalpha\aspnum\faauto\adjustright\rin0\lin0\itap0\pararsid13779256 {\f1\insrsid13779256\charrsid13779256 Candidate metrics are paging volume, the number of disk accesses (which reflects the quality of the cache) and the number of seeks (which reflects the quality of the ordering).
\par Counting seeks is interesting, but it may be more informative to sum the total distance jumped, since short seeks are comparatively cheap.
\par The disk itself complicates such accounting: its 8 MB cache may already have read the data ahead, in which case no physical seek occurs at all. A realistic seek-cost model comprises constant overhead, acceleration up to maximum speed, deceleration, a constant settling phase onto the target track and, only then, rotational delay.
\par \tab Modeling this precisely is more relevant to prefetching and would exceed the scope of this work.}{\f1\insrsid5980580
\par }{\f1\insrsid13779256
\par
\par }{\f1\insrsid13779256\charrsid13779256 To judge ordering quality, one could sum the weights of the TSP edges that the chosen ordering actually satisfies, multiplied by their cost, and compare this against the total weight; the ratio indicates the savings in seeks.
\par It remains to be seen whether this carries over to runs on which the ordering was not trained. A fair test would split the traces into two halves, train on one and test on the other, and repeat this for more than one instance.
\par }\pard \ql \li0\ri0\widctlpar\aspalpha\aspnum\faauto\adjustright\rin0\lin0\itap0\pararsid12675798 {\f1\insrsid8848513 Obviously, performance }{\f1\insrsid16411143 will }{\f1\insrsid8848513
suffer in that case, because the current scheme depends entirely on having seen the correct ordering before.}{\f1\insrsid1847956
\par
\par
\par
\par }{\f1\insrsid13779256 Conclusion}{\f1\insrsid1847956
\par }{\f1\insrsid3546591
\par }\pard \ql \li0\ri0\widctlpar\aspalpha\aspnum\faauto\adjustright\rin0\lin0\itap0\pararsid5968267 {\f1\insrsid5968267 Waiting for slow}{\f1\insrsid6842252  I/O}{\f1\insrsid5968267  is the bane of m
any a computer user; we have shown that this need not be so: it can be mitigated to a large degree.
\par }\pard \ql \li0\ri0\widctlpar\aspalpha\aspnum\faauto\adjustright\rin0\lin0\itap0\pararsid12675798 {\f1\insrsid5968267
\par A method for }{\f1\insrsid16678464 fast}{\f1\insrsid6842252 I/O}{\f1\insrsid5968267 has been presented and analyzed}{\f1\insrsid16678464 .}{\f1\insrsid5968267 }{\f1\insrsid3546591
The main contribution is a combination of techniques that greatly improves effective}{\f1\insrsid6842252 I/O}{\f1\insrsid16678464 }{\f1\insrsid3546591 throughput.
\par By caching file contents, we can avoid repetitive}{\f1\insrsid6842252 I/Os}{\f1\insrsid3546591 ; placing files in archives arranged in order of access reduces costly seeks. }{\f1\insrsid16678464 A}{\f1\insrsid3546591
synchronous access maximizes read throughput and (together with block-splitting) allows }{\f1\insrsid16678464 the data to be compressed, which reduces the amount that must be read.}{\f1\insrsid3546591
\par }{\f1\insrsid5968267 The end result is a measured speedup of nearly 1000 % in the target application; since the filesystem inefficiencies it sidesteps are not specific to our application, this gain is expected to apply widely.
\par }{\f1\insrsid16678464
\par }{\f1\insrsid3546591 Of further interest are the optimizations made to the memory allocation and cache management algorithms.}{\f1\insrsid5968267
 The former allows returning aligned file buffers (as required by the aio implementation) without serious fragmentation; the latter reduces the CPU cost of the cache manager by about 20 %.}{\f1\insrsid3546591
\par }{\f1\insrsid5968267
\par Other applications can build on our work and easily speed up their load times and file accesses.
\par
\par }{\f1\insrsid3546591
\par }\pard \ql \li0\ri0\widctlpar\aspalpha\aspnum\faauto\adjustright\rin0\lin0\itap0\pararsid6826037 {\f1\insrsid6826037 Implementation
\par
\par Our}{\f1\insrsid6842252 I/O}{\f1\insrsid6826037 code has been developed in C++ and also contains a few time-critical assembly language subroutines. It encompasses about 12000 lines of code, about 7K of which are new; the rest was built u
pon previous work.
\par Unfortunately, there are dependencies on another ~30 KLOC, so releasing the code and integrating it into other applications is not as easy as it could be; this is being worked on.
\par }{\f1\insrsid5710756 Eventually releasing the code }{\f1\insrsid6826037 under the GNU General Public License (Free Software)}{\f1\insrsid5710756 is planned}{\f1\insrsid6826037 .
\par }\pard \ql \li0\ri0\widctlpar\aspalpha\aspnum\faauto\adjustright\rin0\lin0\itap0\pararsid12675798 {\f1\insrsid3546591
\par }{\f1\insrsid13779256 Future Direction}{\f1\insrsid3546591 s}{\f1\insrsid13779256
\par }\pard \ql \li0\ri0\widctlpar\aspalpha\aspnum\faauto\adjustright\rin0\lin0\itap0\pararsid13779256 {\f1\insrsid3546591
\par }{\f1\insrsid5710756 We have further ideas for improvement that could not yet be implemented due to time constraints.
\par
\par Prefetching, }{\f1\insrsid16678464 i.e. reading data before it is needed (during idle time)}{\f1\insrsid5710756 , shows promise}{\f1\insrsid16678464 . While requiring more work
and tighter integration with the application, this can improve performance by always keeping the hard disk busy. The downsides that must be mitigated are increased power usage and potential interference with time-critical}{\f1\insrsid6842252  I/Os}{
\f1\insrsid16678464 .
\par }\pard \ql \li0\ri0\widctlpar\aspalpha\aspnum\faauto\adjustright\rin0\lin0\itap0\pararsid12675798 {\f1\insrsid13779256
\par }{\f1\insrsid6826037 Currently, the main bit of \lquote intelligence\rquote  is offline: it consists of finding a good ordering for the files within an archive. We would like to bring more of this into real time, e.g. by making decisions in the file cache based on predicted future behavior. In particular, small files known to be accessed after one another could be removed from the file cache together. This frees up more space (meaning less fragmentation) without hurting performance, because a single file missing from the cache forces reading the entire block in which it is stored anyway.
\par }{\f1\insrsid16678464
\par }{\f1\insrsid6826037 Two approaches are envisaged that could realize these wishes. A Markov chain could be constructed and used to estimate the probability of certain}{\f1\insrsid6842252  I/Os}{\f1\insrsid6826037
 following one another. Alternatively, previous traces could be examined at runtime to determine where in the load sequence we currently are, thus predicting further}{\f1\insrsid6842252  I/Os}{\f1\insrsid6826037 .
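\par A minimal sketch of the first idea, a first-order chain over file IDs (all names are ours; the full scan in predict_next is O(N) and a per-file table would avoid it):
\par
\par #include <map>
\par #include <utility>
\par
\par typedef int FileId;
\par static std::map<std::pair<FileId, FileId>, unsigned> transitions;
\par
\par // count each observed successor of each file
\par static void observe(FileId prev, FileId next)
\par {
\par     transitions[std::make_pair(prev, next)]++;
\par }
\par
\par // most frequently observed successor, or -1 if prev is unknown
\par static FileId predict_next(FileId prev)
\par {
\par     FileId best = -1;
\par     unsigned bestCount = 0;
\par     std::map<std::pair<FileId, FileId>, unsigned>::const_iterator it;
\par     for(it = transitions.begin(); it != transitions.end(); ++it)
\par         if(it->first.first == prev && it->second > bestCount)
\par         {
\par             best = it->first.second;
\par             bestCount = it->second;
\par         }
\par     return best;
\par }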
\par }\pard \ql \li0\ri0\widctlpar\aspalpha\aspnum\faauto\adjustright\rin0\lin0\itap0\pararsid16678464 {\f1\insrsid16678464
\par }{\f1\insrsid6826037 Stay tuned!
\par }{\f1\insrsid16678464
\par }\pard \ql \li0\ri0\widctlpar\aspalpha\aspnum\faauto\adjustright\rin0\lin0\itap0\pararsid13779256 {\f1\insrsid13779256 Bibliography}{\f1\insrsid13779256\charrsid13779256
\par }{\f1\insrsid13779256
\par }{\f1\insrsid13779256\charrsid13779256 On-Line File Caching (the Landlord algorithm)
\par A Survey of Web Cache Replacement Strategies
\par Three-Level Caching for Efficient Query Processing in Large Web Search Engines
\par New Results on Web Caching with Request Reordering
\par The Memory Fragmentation Problem: Solved? [Johnstone and Wilson; cited as MemFragSolved]
\par Dynamic Storage Allocation: A Survey and Critical Review
\par Description of the TLSF Memory Allocator [TLSF]
\par A Performance Study of Sequential I/O on Windows NT 4
\par
\par }\pard \ql \li0\ri0\widctlpar\aspalpha\aspnum\faauto\adjustright\rin0\lin0\itap0\pararsid12675798 {\f1\insrsid1847956
\par
\par
\par
\par
\par }{\f1\insrsid3629320
\par }}