diff --git a/docs/file_io_thesis.rtf b/docs/file_io_thesis.rtf
deleted file mode 100644
index 6893e2d6fc..0000000000
--- a/docs/file_io_thesis.rtf
+++ /dev/null
@@ -1,820 +0,0 @@
Jan Wassenberg


Abstract

Slow I/O is widespread, as attested to by splash screens and progress bars; however, it can be done better. A reusable and highly efficient I/O solution is presented; design decisions and key algorithms are discussed, and the resulting performance is analyzed.


Introduction

Motivation / Importance of Fast I/O

Since I/O is much slower than CPU or memory, it can easily become a bottleneck within the system as a whole.
An estimate as of 2006 puts this at 60 MB/s (hard drive) vs. 2600 MB/s (memory). Many applications would therefore benefit from faster I/O; example scenarios include:
- Slow startup time. The user is inconvenienced by waiting for required files to load; splash screens are one attempt to mitigate this by distracting the user. For a rather extreme illustration of the problem, see http://www.break.com/index/patiencechild.html.
- On-demand loading. If the data set is too large to fit in memory, it must be loaded in increments as needed. This can cause 'freezes' in the application while waiting for the I/O to finish.
- Heavy throughput requirements. Some applications, e.g. video players or editing tools, require high sustained I/O throughput.


Intended Application

The application for which our I/O library has been developed is a Real-Time Strategy computer game [0ad]. It utilizes the traditional method of loading files on startup as well as on-demand streaming of data, so both should be handled efficiently.
While the I/O solution is intended to remain useful for a wide range of applications, several consequences arise from this setting and guide our design decisions.
First, much emphasis is placed on real-time behavior. Lag or 'freezing' in-game is not acceptable and must be minimized. This means that the caching algorithm must not have offline performance characteristics, reordering I/Os is probably not acceptable, and any pre-fetching would have to be quite conservative (so as not to penalize time-critical on-demand loads).
Also, the working set is not static; depending on game mode and environment, different files may be needed. Provision must be made for varying access patterns.
Finally, and related to the real-time issue, there is the matter of fragmentation. Games can run for several hours; during that time, performance must not degrade to unacceptable levels, e.g. due to memory fragmentation. Given the real-time requirements, offline reorganization is not an option; the algorithms used must be designed accordingly.

Given these central design constraints, we now present the chief ideas behind our fast I/O method.


Techniques

Our approach is five-fold:
1) caching avoids repeating slow I/Os;
2) ordering files according to access patterns minimizes hard-drive seeks;
3) compressing files reduces the amount of data to be read;
4) asynchronous I/O maximizes throughput and allows computation to proceed in parallel with I/O;
5) splitting I/Os into blocks simplifies caching and decompression while also avoiding copying buffers for alignment purposes.

We will discuss each of these in detail below, but first cover related theoretical work in this field.


Related Theoretical Work

Cache

For the cache, a central question is which files to keep in memory. This is known as the file- or web-caching problem: given a set of file requests (each with size and retrieval cost), a cache is maintained such that total retrieval cost is minimized.

The special case where size and cost are uniform is called "paging", which has been studied extensively. Several algorithms that have an optimal competitive ratio are known.
In particular, LRU (Least Recently Used, which simply evicts the file whose last access is least recent) is k/(k-h+1)-competitive, the best possible for a deterministic algorithm [Sleator/Tarjan].

This model is appealing due to its simplicity, but it is not sufficient for our needs. Files are typically not of uniform size, and treating them as such would be monstrously inefficient (much cache space would be wasted by rounding up element sizes to that of the largest file).

Irani gives two O(log^2 k)-competitive randomized algorithms that can deal with variable-sized files and uniform cost [Irani].

However, we would like to achieve full generality and provide for variable cost as well. This can be used, as the name suggests, to more accurately reflect load time (as will be seen below, this is not solely dependent on file size!), or as a hint from the application that certain files are not to be removed from the cache as early as they otherwise would be.

Young develops such an algorithm and calls it Landlord. Briefly, each file receives 'credit' that is initially set to its cost. When determining which file is to be removed from the cache (i.e. 'evicted'), each one is charged 'rent' proportional to its size and the minimum credit-per-size density currently in the cache. Items are evicted once their credit reaches 0. On every access, credit is increased in an arbitrary manner. This strategy is k/(k-h+1)-competitive, which again is optimal for a deterministic algorithm [Young02].

We end up using an optimized variant of this Landlord cache management strategy.
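To make this concrete, the following minimal C++ sketch shows one eviction round as just described; the CachedFile record and function name are illustrative, not the library's actual interface:

    #include <algorithm>
    #include <cstddef>
    #include <limits>
    #include <vector>

    struct CachedFile {
        std::size_t size;    // bytes occupied in the cache
        double      credit;  // initialized to the file's retrieval cost
    };

    // Charge 'rent' to every item and return the index of an item to evict.
    std::size_t landlord_evict(std::vector<CachedFile>& cache) {
        // pass 1: delta = minimum credit-per-size density among cached items
        double delta = std::numeric_limits<double>::max();
        for (const CachedFile& f : cache)
            delta = std::min(delta, f.credit / f.size);
        // pass 2: charge rent proportional to size; by the choice of delta,
        // at least one item's credit reaches 0
        std::size_t victim = 0;
        for (std::size_t i = 0; i < cache.size(); i++) {
            cache[i].credit -= delta * cache[i].size;
            if (cache[i].credit <= 0.0)
                victim = i;
        }
        return victim;
    }

Note the two full passes over all items; the optimizations presented later in the Cache Manager section target exactly this cost.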
Allocation

Another important part of caching is the memory allocation aspect. For reasons that will be discussed below, existing general-purpose allocators are not adequate; an alternative will have to be developed. We build on decades of work in this area.

Wilson et al. give a very thorough and helpful overview. A simple but crucial point is made: fragmentation is caused by freeing regions whose neighbors are not free. Allocators are online algorithms whose only tool against this is placement, i.e. deciding where to allocate regions. The authors advocate benchmarking by means of traces (records of allocations) from real-world programs, because randomized tests do not necessarily reflect reality. It is emphasized that allocation policy and mechanism must be considered separately. Their tests show certain policies, namely address-ordered first (segregated) fit, to perform quite well, wasting only about 14% of memory. Finally, their discussion of implementation details such as boundary tags was helpful. [DynStorageReview]

Johnstone and Wilson go on to refine this measure of fragmentation and conclude that the previously mentioned AO-first-fit policy actually suffers from only ~1% fragmentation, the best of all techniques considered. [MemFragSolved] This promising result leads us to focus on that policy.

Masmano et al. present a "Two Level Segregated Fit" allocator with O(1) time complexity. [TLSF]

We end up implementing a simpler variant based on this idea; it also avoids the need for block headers, which (as explained below) is what prevents use of a general allocator.


Ordering - Traveling Salesman Problem

The problem of ordering files according to access patterns can be seen as an instance of the Traveling Salesman Problem. The latter is defined as: given a graph of nodes (cities) and the cost of traveling from one to another (travel distance), compute a path that takes the salesman to each city while incurring minimal cost. In our case, files correspond to cities and hard-disk seek distance to cost.

TSP has perhaps been studied more than any other optimization problem; numerous algorithms and heuristics have been developed, each with their strengths and weaknesses. [DIMACS TSP Challenge] gives an extensive listing of algorithms, their relative performance and techniques, and was a valuable reference.

For our application, less-than-optimal orderings are acceptable due to non-static access patterns. Since varying file accesses (e.g. due to differing modes of play) would invalidate any ordering we establish, it does not make sense to insist on an optimal solution.
The DIMACS Challenge shows several heuristics to perform quite well, coming to within 11% of the Held-Karp bound, a good approximation of the optimal solution to an instance of TSP [D. B. Shmoys and D. P. Williamson. Analyzing the Held-Karp TSP bound: a monotonicity property with application. Info. Proc. Lett., 35(6):281-285, 1990].
We therefore settle on a greedy heuristic for simplicity.


Detailed Discussion of Techniques

We now cover in detail the individual techniques used to speed up I/O.


Efficient Asynchronous I/O

For an understanding of how to achieve maximum I/O read throughput, we briefly explain how the hard drive is accessed on PC systems.

Early IDE (Integrated Drive Electronics, a marketing-driven name) disks were addressed via Programmed I/O, where the CPU instructs the drive to transfer 2 bytes at a time. Due to significant per-transfer overhead (accessing I/O registers and interrupting the CPU on completion), throughput only reaches a maximum of 16.7 MB/s (PIO Mode 4) [http://www.pcguide.com/ref/hdd/if/ide/modes_PIO.htm].

Once rising hard-drive platter densities, and the resulting increased transfer speeds, caused this to become a bottleneck, bus-mastering DMA (Direct Memory Access) over the PCI bus became the norm. Here, the disk controller writes directly to memory, bypassing the CPU. The CPU is free to perform other work during this time, so long as the bus is not needed; this is an important point that will affect our choice of I/O block size below.

Given this information, we now examine the I/O interfaces provided by the operating system. POSIX supports synchronous blocking I/O, blocking I/O in another thread, and asynchronous I/O ("aio"). We remove the first option from consideration because it does not allow work to proceed in parallel with the I/O.
Several implementation details cause us to choose aio over the threaded approach:
- On Windows, aio bypasses the OS file cache. This allows bulk DMA transfers, which achieve higher throughput than the single page-in operations that would be issued by the memory-mapping-based OS file cache.
- aio places pending read requests in a queue, so that the disk controller can proceed immediately with the next I/O; the disk is always busy. With threaded blocking I/O, the OS would have to return from and then re-enter kernel mode before relaying the application's next I/O request to the disk. This overhead reduces throughput.
- Parallelism between computation and I/O is achieved without having to worry about the OS correctly scheduling all participating threads. Additionally, behavior is predictable and thread-switch overhead is avoided.

Note: Linux used to emulate aio by spawning threads, which led to less-than-stellar performance. This is no longer the case; indeed, a decent aio implementation should fare no worse than the threaded blocking I/O approach. In fact, asynchronous I/O performs better on Windows due to the abovementioned issues.

As a final detail, the POSIX aio functionality is emulated on Windows in terms of the "overlapped" ReadFile API. By using the POSIX interface, we ensure portability to virtually all systems.

To summarize, we use asynchronous I/O to achieve the best possible throughput and to allow computation to proceed in parallel; this is made possible by the hard drive's DMA capability. The validity of the approach is shown by a small test program that reaches the drive's maximum rated throughput, and by [performance study of sequential I/O on Windows NT 4].
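As an illustration, a read request might be queued through the portable POSIX interface as follows; this is a minimal sketch, not the library's actual wrapper:

    #include <aio.h>
    #include <sys/types.h>
    #include <cstring>

    // Queue one asynchronous read; completion is checked later via
    // aio_error()/aio_return() on the same control block.
    bool issue_read(int fd, void* buf, size_t size, off_t ofs, aiocb& cb) {
        std::memset(&cb, 0, sizeof(cb));
        cb.aio_fildes = fd;
        cb.aio_buf    = buf;
        cb.aio_nbytes = size;
        cb.aio_offset = ofs;
        return aio_read(&cb) == 0;  // 0 = request queued; the disk stays busy
    }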
Compression

The next cornerstone of our I/O library is compressing the source files. This can dramatically reduce the amount of data to be read; indeed, the current 0ad dataset has been compressed down to 46% of its original size, a savings of 75 MB. (NB: the dataset includes 13 MB of incompressible audio; 3d mesh files with compression ratios of ~3x are chiefly responsible for the reduction.)

The compression algorithm used is Deflate, a combination of LZ77 and Huffman encoding as defined in [RFC1951] and used in the common Zip file format [ZipAppNote]. Other formats may achieve better compression ratios or feature faster compression/decompression speed, but these are not critical to success; we prefer the advantage of interoperability, since tools to work with Zip archives are universally available.

In addition to the abovementioned significant reduction in file size, a further compelling argument for compressing all data files is that it is effectively free! To show this, we must first discuss how exactly I/O and decompression are parallelized.

Presuppose that I/Os are split into fixed-size blocks, the rationale of which will be explained in {Splitting Into Blocks}. These blocks are issued asynchronously up to a surely safe queue depth (currently 4). A block whose I/O has finished is then decompressed while the next ones are pending. This gives perfect parallelization if decompression requires less time than the I/O.
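The shape of this pipeline is sketched below; File, Block and the three helper functions are placeholders for the library's internals, not its real API:

    #include <cstddef>

    struct File;  // archive handle (placeholder)
    struct Block { const void* data; std::size_t size; };  // one fixed-size block

    // Provided elsewhere by the I/O layer (hypothetical names):
    void  issue_block_read(File& f, std::size_t block);  // asynchronous; returns at once
    Block wait_for_block(File& f, std::size_t block);    // blocks until that read is done
    void  inflate_block(const Block& b);                 // Deflate decompression

    const std::size_t QUEUE_DEPTH = 4;  // the 'surely safe' depth mentioned above

    void load_compressed(File& f, std::size_t num_blocks) {
        std::size_t issued = 0, done = 0;
        while (done < num_blocks) {
            // top up the queue of pending reads so the disk stays busy
            while (issued < num_blocks && issued - done < QUEUE_DEPTH)
                issue_block_read(f, issued++);
            // decompress the oldest block while later reads are still in flight
            Block b = wait_for_block(f, done++);
            inflate_block(b);
        }
    }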
Indeed, a benchmark shows that a typical Pentium 4 system (as of 2002) manages 40 MB/s I/O throughput and 100 MB/s decompression [http://archive.gamespy.com/hardware/june02/p45331/index2.shtm].
Note: this balance is not expected to change in the future for single-disk systems; even if it does, a compression method more suited to real-time decompression can be substituted.

Therefore, any reduction in file size due to compression lessens I/O time at no cost.


Ordering Files

The techniques so far are not yet sufficient. They achieve good sequential read performance, but overall throughput is quite poor, because files tend to be scattered throughout the disk. This incurs expensive seeks (moving the hard-disk read head); a rough estimate of their cost is the time taken to read 400 KB, since a typical 7200 RPM drive has about 10 ms seek time and 40 MB/s throughput, and 40 MB/s * 10 ms = 400 KB [www.storagereview.com]. Given that files are often much smaller than this on average (25 KB for 0ad), seek time dwarfs pure I/O read time.

Throughput can be much improved by arranging files on disk in order of access, thus avoiding seeks. Since we wish to use a standard file system (whose placement strategy we cannot control) for simplicity, files have to be combined into one large OS-visible file: an archive. As mentioned above, we prefer the Zip format for easy interoperability.

Incidentally, storing files in archives has an additional advantage. The file system needs to store metadata and typically sector-aligns files; since sectors are 512 bytes or more, this is very costly for tiny files. (NB: ReiserFS4 is the only known exception, able to pack several files into one sector.) In contrast, archives can contain files packed end-to-end with only minimal metadata/header information, thus wasting less space and, by extension, reducing read time.

It remains to determine the optimal file ordering that minimizes seeks. This is done once (offline); performance is therefore not of paramount importance.

First, though, we decide whether files may be repeated in the archive. To see the problem, consider the following sequence, where file 'C' is loaded after 'A' 50% of the time and otherwise after 'B': AC...BC...AC...BC. It would seem that 50% of 'C' accesses must incur a seek, but placing two copies of this file in the archive (after 'A' and after 'B') can avoid them entirely. However, practical considerations lead us to disallow this: the act of finding a file within the archive would become a good deal more complicated.

Now back to the issue of finding an ordering for the files. Our strategy is as follows:
1) view all files to be added as nodes in a DAG (Directed Acyclic Graph); edges indicate that two files are immediate neighbors in the archive;
2) record a "trace" of all file accesses over one or more program runs (recall that access patterns may differ between runs);
3) construct from this a list of possible edges sorted by their frequency (i.e. how often they occurred in the trace);
4) generate a set of 'chains' by committing the above edges as long as no cycle results; these chains are connected portions of the DAG that are known to have been accessed in that order;
5) output the final file ordering by stitching together all chains and then adding any remaining files that were not included in the trace.

Details on these steps follow.

1: Prepare DAG of Files

Each node holds all required information about the file. This includes its filename and the nodes that have been chosen to come before and after it in the final layout. All of these are stored as 16-bit IDs to reduce size and thereby improve locality; mapping from filename to ID is accomplished in logarithmic time via a tree.

2: Record Trace

The acts of loading a file and releasing the resulting memory are logged (the latter is required by the file cache). Records consist of timestamp, filename, file size and any flags that affect I/O mode. For simplicity, we do not record file offset / transfer size: that would not yield any information, because seeks are incurred by accessing any part of the file. Also, we assume that loading entire files at a time is the dominant model.

Besides the obvious application of determining the optimal archive ordering, the resulting plain-text file can be used to benchmark the I/O implementation under repeatable conditions. Even when the actual data files are lacking, the trace can still be used to benchmark the performance of the file cache and ordering: simply map filenames to integral IDs and simulate the cache and I/O parts.

Notes:
- We are careful to ensure that recording a trace does not incur any I/Os, which would skew performance measurements. Records are stored in binary format within an expandable array (no copying or memory waste, thanks to pre-reserved virtual address space).
- Trace files may log accesses over several program runs. This is useful in the following steps, because several mutually exclusive but equally probable access patterns may exist, each of which should be considered equally. Program runs are differentiated by examining the timestamp, which starts at 0 on each run.

3: Construct Edge List

This step constructs a list of edges from the trace file. First, the trace is split into program runs, which are processed most recent first. In each of these, all adjacent pairs of files are examined; pairs not already in the list are added, and otherwise the existing edge's frequency is incremented.

Important note: presuppose the existence of a file cache, which will be presented in the next section. Since frequent accesses to files are absorbed by this cache, we do not want their inflated frequency to 'pollute' the edge list. That would displace other edges that might actually turn out to be more important, because they would really incur seeks, as opposed to an edge whose file I/Os would be satisfied by the cache. Our solution is to simulate the file cache whilst processing trace entries (only within the same program run!); if a file access would not result in an I/O due to the cache, the current edge is ignored. Under the assumption that access patterns are similar to the trace, this scheme improves the quality of the ordering by making it reflect the trace more strongly (rather than being fooled by frequent I/Os). If not, correctness is not impacted; we merely risk incurring a few more seeks.

Checking whether an edge already exists is accomplished by translating the two filenames into 16-bit IDs (O(log N) time), appending these into one 32-bit number and searching for that in a tree (O(log N) time).

Finally, the list is sorted by decreasing frequency. (The sort must be stable, i.e. the ordering of edges with identical frequency must not change; see the note below.) The result is a list of unique edges, i.e. "file A should be stored after file B" relationships.
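The bookkeeping behind this might look as follows; EdgeList and its members are illustrative names, assuming the 16-bit ID scheme from step 1:

    #include <cstdint>
    #include <map>

    typedef std::uint16_t FileId;

    // Edge lookup as described above: two 16-bit file IDs are packed into
    // one 32-bit key and counted in a tree (O(log N) per access).
    struct EdgeList {
        std::map<std::uint32_t, unsigned> freq;  // packed edge -> frequency

        void record(FileId first, FileId second) {
            const std::uint32_t key = (std::uint32_t(first) << 16) | second;
            ++freq[key];  // first sighting inserts a zero count, then increments
        }
    };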
4: Generate Chains

This step is the heart of our file-ordering strategy. The above edges are now 'committed' into the DAG in order. That means the files are marked to come after one another, i.e. their nodes in the DAG are connected by an edge (unless a cycle would result). For simplicity, committed edges are never removed, this being a greedy heuristic.

We check for cycles via 'DFS', which here actually simplifies to a list walk, since nodes have only one previous and one next link. The chains are typically quite short, and the overall run time of this entire step is not a problem in practice (7 ms for 5000 files), so we do not attempt more efficient and sophisticated cycle detection schemes.
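Under these assumptions (at most one previous and one next link per node), the check reduces to the following walk; Node and the function name are illustrative:

    // The degenerate 'DFS': committing edge a -> b creates a cycle iff
    // following next links from b eventually leads back to a.
    struct Node {
        Node* prev;  // file stored immediately before this one (or null)
        Node* next;  // file stored immediately after this one (or null)
    };

    bool would_form_cycle(const Node* a, const Node* b) {
        for (const Node* n = b; n != nullptr; n = n->next)
            if (n == a)
                return true;
        return false;
    }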
One such more efficient approach would be to store, for each node, a pointer to the current end of its list and perform list jumping.

The result of this step is a set of disjoint chains, each a series of files that are to be stored immediately after one another. Due to the nature of the edge list, the files that are most frequently accessed after one another are grouped together. As such, we have attained a good approximation of an optimal tour.

Note: now the reason for the most-recent-first ordering of program runs becomes clear. All but the most frequent edges are placed into the list in the order in which they occurred in the trace (due to the stable sort). Since they are also committed to the DAG in this order, they end up mostly as observed in the trace. Since the most recent trace is assumed to be the most accurate reflection of current behavior, it is given the most weight (by allowing all edges that ensued from it to be committed first).

5: Stitch Chains Together

The final step is to stitch together the disjoint chains and output them into the final ordered list. File nodes are marked once they have been output. We iterate over all nodes and output the entire chain of which each is a part; this is done by following the node's previous links until the beginning of the chain is reached. Incidentally, this iteration ensures that all files appear in the output list, even if they were not included in the trace.
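A sketch of this stitching pass, using the same illustrative Node representation extended with an output flag:

    #include <vector>

    struct Node {
        Node* prev;
        Node* next;
        bool  output_done;  // set once this file has been written out
    };

    // Emit every chain exactly once: walk to the head of each node's chain,
    // then output the chain front to back, skipping already-output nodes.
    void stitch_chains(const std::vector<Node*>& nodes, std::vector<Node*>& ordering) {
        for (Node* n : nodes) {
            Node* head = n;
            while (head->prev)
                head = head->prev;
            for (Node* c = head; c; c = c->next)
                if (!c->output_done) {
                    c->output_done = true;
                    ordering.push_back(c);
                }
        }
    }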
We have thus generated an ordering of files that minimizes seeks, assuming application behavior is similar to that recorded in the trace(s).

This is an approximation to a variant of the Traveling Salesman Problem; the question of its quality (i.e. how many seeks are avoided) is interesting and will be examined in <<section 3>>.

Rough complexity analysis: except for the cycle determination, none of these steps requires more than O(log N) work per file. The expected case is therefore O(N log N), with O(N^2) work in the worst case (if the DFS always scans through very long chains). However, as mentioned above, this is an offline process; performance is entirely adequate, so we do not delve into a complete analysis or optimize the cycle determination step.


Splitting Into Blocks

Splitting I/Os into fixed-size blocks is desirable for two reasons.

First, it allows decompression of large files to proceed immediately and in parallel with the I/O. This is especially important when loading an alternating sequence of large and small files: all decompression can be 'hidden' behind I/O. The alternative, namely only decompressing after having finished loading the entire file, clearly breaks down in this case and does not parallelize well.

The further advantage is that of sector alignment. Due to the end-to-end packing in archives, files often start at unaligned offsets on disk. A limitation in the Windows ReadFile API would require copying such files to/from an alignment buffer. This can be avoided by splitting I/Os into blocks and rounding their offsets/sizes down/up to sector boundaries.
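A minimal sketch of that rounding, assuming 512-byte sectors and an illustrative function name:

    #include <cstddef>

    const std::size_t SECTOR_SIZE = 512;  // assumed; real code would query the drive

    // Round a transfer's offset down and its end up to sector boundaries;
    // the caller later skips the leading padding bytes.
    void align_io(std::size_t ofs, std::size_t size,
                  std::size_t& aligned_ofs, std::size_t& aligned_size) {
        aligned_ofs = ofs & ~(SECTOR_SIZE - 1);  // round down
        const std::size_t end = ofs + size;
        const std::size_t aligned_end = (end + SECTOR_SIZE - 1) & ~(SECTOR_SIZE - 1);  // round up
        aligned_size = aligned_end - aligned_ofs;
    }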
We now decide on the block size. Many considerations come into play:
+ theoretically, larger sizes are good due to economy of scale (less overhead per transfer);
+ the block length should be a multiple of the sector size (required for the sector alignment mentioned above);
- blocks should not be too large, or else decompression cannot be done in-cache. That would result in bus accesses, which interfere with the DMA I/O operation. Typical L2 cache sizes are 256 to 512 KiB, which must cover both the compressed source and decompressed destination buffers;
- I/Os for large blocks may end up being split into several I/O requests; beyond that point, there is no advantage to increasing the block size. (Background: PC DMA requires *physically* contiguous memory, which cannot be guaranteed by user programs because they only see virtual addresses. As a workaround, the OS typically analyzes the buffer and makes a "scatter-gather list" out of it: a list of the contiguous regions, typically only one memory page each due to fragmentation, that constitute the buffer; the driver can DMA into them individually without having to copy from a central DMA buffer. For concreteness, the Windows ASPI layer has a limit of 64 KiB per transfer, because its scatter-gather lists are stored in non-paged pool, a memory region of limited size [http://www.eetkorea.com/ARTICLES/2000APR/2000APR05_CT_ID_AN.PDF]);
- in practice, there is no difference in aio read throughput for transfer sizes between 4 and 192 KiB [Windows 2000 Disk I/O Performance];
+ however, the aio queue depth (the maximum number of concurrent I/Os that can be queued by the OS) is system-dependent and should not be relied upon. It is therefore better to avoid overly small blocks, because it may not be possible to queue enough buffers to keep the disk continuously busy.

The result of these ruminations was a block size of 16 KiB; however, our measurements have since shown 32 KiB to be the most efficient.

This concludes the discussion of our I/O techniques. To review: I/Os are automatically split into blocks (of aligned start position and length) and issued asynchronously. Once a block finishes, it is decompressed while the next block's I/O is in progress. Finally, seeks are avoided by having arranged the files within an archive in order of access.


Caching

    It's not true that life is one damn thing after another; it is one damn thing over and over.
    - Edna St. Vincent Millay

The final step we take in optimizing I/O is caching. By keeping commonly used files in memory, some repeated I/Os can be avoided outright.
There are two 'levels' of cache: entire files and blocks.

The small block cache serves to avoid the overhead due to sector-aligning I/O transfers. Since files usually start at unaligned offsets within archives, data lying at the beginning of a sector would otherwise be read twice (once for the real I/O, and then again during the next file's I/O). The block cache absorbs this cost by keeping in memory the last few blocks read; it is organized as LRU.
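A minimal sketch of such an LRU block cache (capacity and names are illustrative):

    #include <cstddef>
    #include <cstdint>
    #include <list>
    #include <utility>

    typedef std::uint64_t BlockId;  // e.g. archive offset divided by the block size

    // Keeps the last few blocks read; hits move to the front of the list
    // and eviction removes the back, i.e. the least recently used block.
    class BlockCache {
        static const std::size_t MAX_BLOCKS = 16;  // 'last few'; illustrative
        std::list<std::pair<BlockId, const void*>> blocks;  // front = most recent
    public:
        const void* find(BlockId id) {
            for (auto it = blocks.begin(); it != blocks.end(); ++it)
                if (it->first == id) {
                    blocks.splice(blocks.begin(), blocks, it);  // mark recently used
                    return blocks.front().second;
                }
            return nullptr;  // not cached; the caller must issue the I/O
        }
        void add(BlockId id, const void* data) {
            if (blocks.size() == MAX_BLOCKS)
                blocks.pop_back();
            blocks.emplace_front(id, data);
        }
    };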
The per-file caching strategy stems from the assumption that files will usually be loaded in one burst; it simplifies bookkeeping and avoids having to copy pieces of the file into a final buffer.
Our file cache is a system consisting of the following components:
- an allocator, which doles out variable-sized chunks of a fixed-size memory region;
- the 'extant list', which keeps track of which buffers are currently in use by the application;
- a cache manager, which provides efficient lookup of file contents given the filename and decides which files to keep in memory.

These are explained in detail below.

Allocator

A general-purpose allocator (e.g. malloc) is not acceptable for this application, because Windows ReadFile requires file buffer addresses to be aligned to a sector boundary. Rounding up returned addresses would waste unacceptable amounts of memory, so a special allocation scheme that always returns aligned regions is needed.

This entails not prefixing allocated regions with a header. Our idea is to transfer ownership of an allocated region from the allocator to the cache and/or extant list; these have to record region address and size anyway for their bookkeeping. When a region is to be freed, the extant list informs the allocator of its size and address, which is typically what a header would have stored.

Having now established the requirement for alignment and how to ensure it, we discuss the main problem of an allocator: fragmentation. There are basically two ways to deal with it: perform periodic reorganization, or prevent it from happening in the first place.

The former is not feasible due to our real-time requirements and, more importantly, because users receive direct pointers into cache memory. (This enables zero-copy I/O and reduces the memory footprint, because multiple users of a file can share its read-only contents.) It is believed that currently in-use, and therefore unmovable, regions would severely hamper defragmentation; we therefore focus on the latter approach.

With all pieces in place, we now discuss the allocation policy. As shown in [MemFragSolved], address-ordered good-fit performs well. When freeing, we coalesce regions immediately; this may perform unnecessary work, but is acceptable in light of its simplicity. Allocation first exhausts all available memory before reusing freelist entries. This is fine because the cache size is chosen such that it can and should be used in its entirety; the benefit is reduced freelist splitting, which tends to produce larger coalesced regions.

Note: in addition to policy, there is another approach to mitigating fragmentation. Its root cause is freeing objects whose neighbors are not free. We attack this by allowing the application to pass hints as to buffer lifetimes, so that long-lived objects can be placed differently and not cause 'holes' around freed short-lived objects.

Implementation Details

A 'good' fit is achieved by searching in segregated freelists. These are divided into size classes, where class i (i >= 0) holds regions of size (2^(i-1), 2^i]; the size class is determined by taking the base-2 logarithm of the size. If a freelist is empty, the allocation can be satisfied by finding the next-highest non-empty class (O(1) via bit scan) and splitting its first block.
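That size-class computation might look as follows; the GCC/Clang bit-scan intrinsic shown is one common choice, not necessarily what the library uses:

    #include <cstddef>

    // Map a (nonzero) allocation size to its freelist class: class i holds
    // sizes in (2^(i-1), 2^i], i.e. i = ceil(log2(size)).
    unsigned size_class(std::size_t size) {
        // position of the highest set bit; __builtin_clzll is the GCC/Clang
        // spelling of a single bit-scan instruction
        const unsigned log2_floor = 63u - unsigned(__builtin_clzll(size));
        // exact powers of two stay in class log2(size); all other sizes
        // round up to the next class
        const bool is_pow2 = (size & (size - 1)) == 0;
        return is_pow2 ? log2_floor : log2_floor + 1;
    }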
Total allocation performance can be made O(1) by further splitting size classes into fixed-size subclasses; this is the approach taken by [TLSF]. However, we find that freelists are typically empty anyway (because the cache is always as full as possible) and therefore omit this for simplicity.

Coalescing works by storing boundary tags within the freed (!) memory. When freeing a block, we check whether the regions that come before and after it contain such tags (identified via distinctive bit patterns very unlikely to occur in normal data); if so, they are merged. This is somewhat risky, but the 'magic' bit pattern is long enough to make any mix-up extremely unlikely. This trouble is necessary because the tags cannot be added to the beginning/end of a region due to the alignment requirements.
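The tag layout might look like the following sketch; BoundaryTag, TAG_MAGIC and the magic value itself are assumptions for illustration, not the actual structures.

    // Boundary tags live inside the freed memory itself (illustrative sketch).
    #include <cstddef>
    #include <cstdint>
    #include <cstring>

    const uint64_t TAG_MAGIC = 0xF17E5EED0DDBA11Bull;  // arbitrary 64-bit pattern

    struct BoundaryTag
    {
        uint64_t magic;  // == TAG_MAGIC only while the region is free
        size_t   size;   // extent of the free region this tag describes
    };

    // When freeing [p, p+size), see if the region ending at p is also free;
    // if so, the two can be merged into one larger free region.
    bool prev_region_is_free(void* p, size_t& prev_size)
    {
        BoundaryTag tag;
        std::memcpy(&tag, static_cast<char*>(p) - sizeof(tag), sizeof(tag));
        if (tag.magic != TAG_MAGIC)
            return false;  // neighbor is in use (or not a tag) - do not merge
        prev_size = tag.size;
        return true;
    }

A matching tag at p + size identifies a free following region; merging simply extends the free extent and rewrites the tags. With a 64-bit magic, an accidental collision with user data has probability on the order of 2**-64.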
For convenience, memory is doled out from a fixed-size chunk of virtual address space, rather than via separate on-demand allocations from the OS. This allows easily checking whether a given pointer is valid, i.e. was taken from the chunk. Due to on-demand committing of the virtual memory, only as much physical memory as necessary is used.

Extant List

This list tracks all buffers that have been handed out to the application but not yet freed. Since they are expected to be freed immediately (before allocating the next, which is enforced by a warning), this list only contains a few entries and therefore need not be organized as a tree.

It stores the address and size of the allocated regions, which are passed to the allocator when freeing a buffer. This avoids the need for per-region headers, as explained above.
An alternative would be providing a separate data structure associating each allocated address with its size, but this is redundant, since many of these regions are also stored in the cache. Our approach therefore uses less memory.

Cache Manager

The cache manager is the heart of this system; it maps filenames to the files' cached contents and decides which ones to keep in memory. As mentioned in {Related Theoretical Work}, we use the Landlord algorithm for this purpose.
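In outline, Landlord charges each cached item 'rent' proportional to its size and evicts items whose credit is exhausted. The following minimal C++ sketch of the naive eviction step uses assumed structures (it is not the thesis implementation):

    // Naive Landlord eviction (illustrative sketch).
    #include <algorithm>
    #include <cstddef>
    #include <limits>
    #include <vector>

    struct Item { size_t size; double credit; };

    void evict(std::vector<Item>& items)
    {
        // Loop 1 ("calcMCD"): find the minimum credit density delta.
        double delta = std::numeric_limits<double>::infinity();
        for (size_t i = 0; i < items.size(); ++i)
            delta = std::min(delta, items[i].credit / items[i].size);

        // Loop 2 ("chargeAll"): charge rent of delta * size to every item and
        // remove those with no credit remaining (at least one, by choice of delta).
        for (size_t i = 0; i < items.size(); )
        {
            items[i].credit -= delta * items[i].size;
            if (items[i].credit <= 0.0)
            {
                items[i] = items.back();  // evict
                items.pop_back();
            }
            else
                ++i;
        }
    }
    // On a cache hit, the item's credit is restored (up to its cost).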
We see that the naïve version of this algorithm has a high memory access cost: eviction involves 2 complete loops over all cached items.

The first step towards mitigating this is to optimize the manager's item container (used to implement the filename -> cached-file mapping) for good locality. An array-based hash table will perform much better than a tree whose elements are scattered throughout memory.

We have developed several further improvements:
1) The costly divisions required to calculate credit density can be replaced with multiplication by the reciprocal. This trades increased memory use for lower latency (4 vs. 20 cycles on the Athlon XP).
2a) The calcMCD and chargeAll loops can effectively be fused by calculating the next MCD value on the side. We thereby avoid iterating over all items twice, which is especially important for large sets of items that do not fit in cache.
2b) A priority queue can return and remove the MCD item in O(log N) time; the rent that should be charged to all items can be accumulated and applied in batches. The validity of this approach is not immediately clear: Landlord specifies decreasing all credit by delta * item.size and removing any subset of items with no credit remaining. By definition of delta (the minimum credit density), at least one item will be removed, and this is exactly the one returned by the priority queue.
Note that any pending charges must be committed before adding new items; otherwise, they too would be charged during the next commit cycle, which would be incorrect.
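A minimal sketch of this lazy variant follows (assumed names, not the actual code). The key observation is that a uniform rent charge lowers every item's credit density by the same delta, so the density ordering - and hence the heap - remains valid while the pending rent is kept in a single scalar.

    // Lazy Landlord with batched rent (illustrative sketch).
    #include <cstddef>
    #include <queue>
    #include <vector>

    struct Item { size_t size; double credit; };

    struct MinDensity  // makes the priority queue yield the minimum-density item
    {
        bool operator()(const Item* a, const Item* b) const
        {
            return a->credit / a->size > b->credit / b->size;
        }
    };

    struct LazyLandlord
    {
        std::vector<Item*> items;
        std::priority_queue<Item*, std::vector<Item*>, MinDensity> by_density;
        double pending;  // rent accumulated but not yet subtracted

        LazyLandlord() : pending(0.0) {}

        void commit()  // apply all pending rent; required before adding items
        {
            for (size_t i = 0; i < items.size(); ++i)
                items[i]->credit -= pending * items[i]->size;
            pending = 0.0;
        }

        void add(Item* it)
        {
            commit();  // a new item must not be charged back rent
            items.push_back(it);
            by_density.push(it);
        }

        Item* evict()  // O(log N): the victim is the minimum-density item
        {
            Item* victim = by_density.top();
            by_density.pop();
            // New pending = old pending + delta = victim->credit / victim->size,
            // which drives the victim's effective credit to exactly zero.
            pending = victim->credit / victim->size;
            // (also remove victim from `items`; omitted for brevity)
            return victim;
        }
    };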
Implementation note: to avoid duplicating code, the priority queue is kept separate from the filename -> cached-contents mapping. Since it is ordered by item credit, the queue must be re-sorted after an item is accessed (which increases its credit). Due to limitations of the STL priority_queue, this takes O(N) time on every access. Since cache hits are fairly rare, time is still saved overall; however, this bottleneck should be removed by substituting a heap implementation that allows an O(log N) 'sift' operation.

These improvements are made available as template policy classes and can therefore easily be enabled for applications where they provide a benefit.

We examine the results of these optimizations in {Cache Manager Optimizations}.

This concludes the discussion of the cache. To recap, the small block cache absorbs the cost of rounding I/Os up to block-size boundaries, and a file cache managed by the Landlord algorithm caches the contents of entire files.


Experimental Results

System Information

The test system has the following specifications:
CPU: Athlon XP 2400+ (2000 MHz)
Memory: 768 MB DDR 2100 CL2.5
Chipset: NForce2
HD: Deskstar 7K250 (160 GB, PATA, 8 MB cache, 8.5 ms rated seek, 30-40 MB/s throughput measured by MS MemSpeed)
OS: Windows XP SP2
Compiler: MS Visual C++ 7.1 (optimization flags "/Oxgb1y /G6")

We now describe the methodology and show results of several tests measuring the performance of our I/O library.

I/O Throughput

Methodology

The basis for our I/O throughput measurement is a trace file recorded from the startup of 0ad, encompassing ~500 file loads. Using the trace simulation feature described above, we issue these I/Os as quickly as possible; this removes the influence of other system-specific conditions such as graphics card performance.

What is actually measured is the total time elapsed between the start and end of the I/Os; together with the amount of user data transferred, this yields effective throughput ("effective" because it differs from the actual disk throughput due to compression).

This was chosen as the benchmark measure because it reflects real-world performance of the entire system.

Note: if a cache is involved, we ensure it is empty so as not to skew results; in the case of the OS file cache, testing takes place after a clean reboot.

Results and Discussion

We are interested in the total improvement yielded by our I/O library, as compared to the throughput reached by the bare OS-provided read() API. According to the above measure, we see 29.3 MB/s vs. 2.96 MB/s, a staggering speedup of 990 %!

We now examine which I/O techniques are chiefly responsible for these gains.

Leaving everything else the same but no longer compressing files stored in archives, performance falls from 27.2 MB/s to 22.2 MB/s. (Note: this measure differs from the peak performance listed above in that the file block size was not yet set to its optimal value.) This leads us to conclude that disk throughput is a limiting factor - a good sign, indicating that seeks are not the bottleneck. This will be discussed further below.
As an aside, decompression performance indeed mirrors the previously quoted 100 MB/s figure; we observe 94.5 MB/s.

When archives are disabled entirely and I/O is from loose files (stored in the normal filesystem), performance drops to 2.62 MB/s. The immediate conclusion is that reduced locality (due to poor FS ordering and extra headers) induces many costly seeks.
We also notice that performance is worse than that measured for the synchronous API; this could be explained by the increased overhead of the aio APIs. Indeed, they do not support the Windows FastIO driver entry points that avoid having to build an I/O request packet.

Finally, we revisit the question of file block size. The initial choice of 16 KiB was not optimal; based on the following results, we go with 32 KiB.
Block Size (KiB)    Throughput (MB/s)
4                   23.7
16                  27.2
32                  29.3
64                  29.1
128                 23.3

It is interesting that performance begins to fall off starting with 64 KiB blocks. An explanation might be that transfers are split up due to the previously mentioned scatter-gather list limit, but this is speculation.

In summary, we have found that bundling files into archives is the most worthwhile improvement, due to reducing seeks. Once these are eliminated, the increased throughput afforded by the (free) data compression step contributes an additional 23 % speedup.

Ordering Quality

The above result indirectly shows that storing files in archives manages to avoid numerous seeks; otherwise, throughput would not be so high. We now examine exactly how many are incurred, thus evaluating the quality of the archive ordering and its TSP heuristic.

To measure total seek impact, we must first define the cost of a seek. Short seeks may actually be free, because the HD controller has already read the target data into its cache. Conversely, long seeks may be more expensive due to physical limitations of the disk head (it must accelerate to and decelerate from maximum velocity and then settle on the target track).
A good model would be a constant overhead plus a cost proportional to the seek distance, plus rotational latency. However, this is quite disk-dependent and difficult to determine. For simplicity, we currently assume uniform cost and try to avoid all seeks.

Our first step in measuring them is to record a trace of 3 different 0ad startup sequences, each loading a separate map (which share some files but differ in others, e.g. environment textures). This large trace, consisting of ~2300 loads, is used to guide the creation of an archive. We then count the seeks incurred by each of the individual sequences; this is easily done in our code by comparing the current I/O file and offset with the last known values.
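For illustration, the counting logic can be as simple as the following sketch (the types and names are assumptions, not the actual code):

    // Count an access as a seek unless it continues where the last one ended.
    #include <cstddef>
    #include <string>

    std::string last_file;
    size_t      last_end  = 0;
    unsigned    num_seeks = 0;

    void on_io(const std::string& file, size_t offset, size_t size)
    {
        if (file != last_file || offset != last_end)
            ++num_seeks;  // the disk head must move: different file or a gap
        last_file = file;
        last_end  = offset + size;
    }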
For the combined trace, no seeks are observed (2). This is as expected, because the archive was specifically ordered for that sequence.
The individual "Cantabrian Highlands" and "Neareastern" map sequences incur only 49 (9.4 % of total I/O requests) and 60 (10.6 %) seeks, respectively (3).

These positive results justify our decision to use a heuristic to approximate TSP. Because the access patterns induced by separate maps differ widely, insisting on an optimal ordering for one particular pattern does not make sense. Instead, this heuristic produces good results for a variety of maps.

2) We have excluded 4 unavoidable seeks that are unrelated to the archive ordering, namely 1 during the course of opening the Zip file (reading the ECDR and then the Central Directory) and 3 due to files that cannot be added to an archive.

3) It may be surprising that subsequences of the trace incur seeks while the whole does not. The explanation is that our file cache also serves to avoid seeks. To see this, consider a simple example: trace = AB|AC|AD, optimal ordering = ABCD. For the entire trace, the file cache will absorb the latter two A accesses, whereas the subsequences AC and AD each incur a seek.


Caching Efficacy

We now appraise the effectiveness of the cache replacement policy, i.e. its tendency to keep files in memory that will be needed later. To measure this, we simulate cache operation over the combined trace mentioned above. It comprises 57 MB of data, of which 14 MB are repeated and therefore potentially cacheable.

Since the 0ad dataset is as yet relatively small (real-world cache sizes may well be larger), we have artificially limited the cache size to ensure that items will have to be evicted. Without this measure, the cache replacement policy would be irrelevant. A size of 10 MB has been chosen arbitrarily.
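The simulation harness can be sketched as follows; Cache stands for any policy under test (LRU, Landlord, ...), and its interface here is an assumption:

    // Replay a trace against a cache policy and tally hit/miss volume.
    #include <cstddef>
    #include <string>
    #include <vector>

    struct TraceEntry { std::string file; size_t size; };

    template<class Cache>
    void simulate(const std::vector<TraceEntry>& trace, Cache& cache,
                  size_t& hit_bytes, size_t& miss_bytes)
    {
        for (size_t i = 0; i < trace.size(); ++i)
        {
            const TraceEntry& e = trace[i];
            if (cache.contains(e.file))
                hit_bytes += e.size;           // served from memory
            else
            {
                miss_bytes += e.size;          // must be read from disk
                cache.insert(e.file, e.size);  // may trigger evictions
            }
        }
    }

The hit rates below are computed from request counts; the corresponding byte totals are reported alongside.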
We first evaluate the well-known LRU algorithm under these conditions. The cache hit rate is determined to be 19 % (473 hits totaling 6.18 MB vs. 1915 misses totaling 51.22 MB).
Our Landlord implementation more than doubles this to 39 % (945 hits totaling 8.88 MB vs. 1443 misses totaling 48.52 MB).
A more intuitive view of these numbers is that the percentage of non-compulsory misses (i.e. files that were evicted but needed later) drops from 26 % to 2 %.

We are pleasantly surprised by this favorable result. Since our implementation does not yet take advantage of file cost hints from the application, the difference in performance is due solely to the Landlord algorithm's awareness of item size. This apparently leads to more efficient handling of the cache memory: fewer files need be evicted to make enough room for the next item.
Another factor is that the repeated files in this trace are spaced widely apart (e.g. at the start of each of the 3 map loads constituting the trace); LRU would tend to remove exactly these items.

Cache Manager Optimizations

Of further theoretical and practical interest is how much improvement the various Landlord algorithm optimizations yield.

CPU cost is accounted for as follows. First, external influences are minimized by running at the highest scheduler priority. Several thousand iterations of the target code are run while measuring elapsed time via a high-resolution timer (precise to 1 CPU clock!). Each of these iterations performs an operation (e.g. allocate or free) chosen at random; this avoids measuring characteristics that are specific to a given trace. Note, however, that we control the random distribution (in the example, the ratio of 'allocate' to 'free' operations); these are weighted towards the most frequent and important operations.
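The measurement loop might look like this sketch; rdtsc and the operations under test are assumed helpers, and the 0.7 ratio merely mirrors the (arbitrary) 70 % add share mentioned below:

    // Micro-benchmark: weighted random operations timed with a cycle counter.
    #include <cstdint>
    #include <cstdlib>

    uint64_t rdtsc();    // high-resolution timer (1-clock precision); assumed helper
    void op_add();       // operations under test; assumed helpers
    void op_remove();

    double clocks_per_op(unsigned iterations, double add_ratio /* e.g. 0.7 */)
    {
        const uint64_t t0 = rdtsc();
        for (unsigned i = 0; i < iterations; ++i)
        {
            // Weighted random choice avoids measuring trace-specific patterns.
            if (std::rand() < add_ratio * RAND_MAX)
                op_add();
            else
                op_remove();
        }
        return double(rdtsc() - t0) / iterations;
    }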
The first result is that with the naïve Landlord implementation, dividing via multiplication by the reciprocal is actually 1.4 % slower! This is likely because the additional storage required for the reciprocal breaks the nice, cache-friendly 16-byte element size. Since this algorithm iterates over all items twice, the memory access cost weighs more heavily than the few extra CPU cycles spent dividing.

Next, we find that the Landlord_Cached strategy (recall that it calculates the minimum credit density while updating and therefore often avoids iterating over all items) performs 21 % faster.
However, its divide-via-reciprocal variant is again slower - this time by only 0.6 %. We see that iterating less often reduces the penalty of the reciprocal's extra storage.

The final variant is Landlord_Lazy (which uses a priority queue to find the least valuable item in O(log N) time and thus avoids iterating over all items when removing one from the cache). It performs 19 % better than the baseline, i.e. slightly worse than the previous variant. Note that this result depends heavily on the relative frequency of add and remove operations: since the former require iteration over all items (to 'commit' a previous pending charge), decreasing their share from the current (and quite arbitrary) 70 % will cause this implementation to come out far ahead.
Applying the reciprocal divider here results in further gains of 0.8 %. Since we rarely iterate over all items, the increase in element size is outweighed by the faster division.

To conclude this section, we find that Landlord_Cached performs best in the current benchmark. Since it is less complex and requires less memory than the possibly faster Landlord_Lazy strategy, it is chosen as the default.
However, the implementation via template policy classes allows easily switching strategies in applications where results differ.

Allocator Fragmentation

The important question of allocator fragmentation is next. We gauge it in the course of simulating the previous 500-file trace. A simple and adequate measure is to compare the total requested size with how much of the total file cache is actually occupied. The result is a total memory waste of 12 %, which is in line with the findings of [Johnstone and Wilson]. While not great, this is acceptable.
Conclusion

Waiting for slow I/O is the bane of many a computer user; we have shown that this need not be so - it can be mitigated to a large degree.

A method for fast I/O has been presented and analyzed. The main contribution is a combination of techniques that greatly improves effective I/O throughput.
By caching file contents, we avoid repetitive I/Os; placing files in archives, arranged in order of access, reduces costly seeks. Asynchronous access maximizes read throughput and (together with block-splitting) allows the data to be compressed, which reduces the amount that must be read.
The end result is a measured speedup of nearly 1000 % in the target application, which is expected to apply widely because filesystems are generally inefficient in this regard.

Of further interest are the optimizations made to the memory allocation and cache management algorithms. They respectively allow returning aligned file buffers (required by the aio implementation) without serious fragmentation, and reduce the CPU cost of the cache manager by 20 %.

Other applications can build on our work and easily speed up their load times and file accesses.


Implementation

Our I/O code has been developed in C++ and also contains a few time-critical assembly language subroutines. It encompasses about 12000 lines of code, about 7000 of which are new; the rest was built upon previous work.
Unfortunately, there are dependencies on another ~30 KLOC, so releasing it and integrating it into other applications is not as easy as it could be; this is being worked on.
Eventually releasing the code under the GNU General Public License (Free Software) is planned.


Lessons Learned

Experience is what you get when you don't get what you want.
-- Dan Stanford

Despite the application of software engineering best practices such as careful modularization, built-in self-tests and pre/postcondition guards, the defect rate was unpleasantly high. This may be the norm for low-level C++ codebases of the above size, but it was a problem given that file loading is a critical part of the application.
The takeaway is that more self-tests and condition checking can be recommended unreservedly - they exposed many bugs.

The trace functionality (recording all I/Os) was found to be quite valuable. Besides its immediate application of ordering files, it allows testing I/O performance under repeatable conditions and reproducing bugs.

On a final positive note, there was much room for improvement to be had with I/O! The gains achieved were surprising.


Future Directions

We have further ideas for improvement that could not yet be implemented due to time constraints.

Prefetching, i.e. reading data before it is needed (during idle time), shows promise. While requiring more work and tighter integration with the application, it can improve performance by always keeping the hard disk busy. The downsides that must be mitigated are increased power usage and potential interference with time-critical I/Os.

Currently, the main bit of 'intelligence' is offline and consists of finding a good ordering for files within an archive. We would like to bring more of this into real time and e.g. make decisions in the file cache based on predicted future behavior. In particular, small files known to be accessed one after another could be removed from the file cache together, thus freeing up more space (meaning less fragmentation) without hurting performance (because one file not in cache will force reading the block in which the others are stored anyway).

Two approaches are envisaged that could realize these wishes. A Markov chain could be constructed and used to decide the probability of certain I/Os coming after one another. Also, previous traces could be examined at runtime to determine where in the load sequence we are, thus predicting further I/Os.
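As a thought experiment, the first approach might start from nothing more than transition counts between consecutively accessed files; all names in this speculative sketch are hypothetical:

    // First-order Markov chain over file accesses (speculative sketch).
    #include <map>
    #include <string>

    std::map<std::string, std::map<std::string, unsigned> > transitions;
    std::string prev_file;

    void record_access(const std::string& file)
    {
        if (!prev_file.empty())
            ++transitions[prev_file][file];  // observed: prev_file -> file
        prev_file = file;
    }

    // Most frequently observed successor of `file` - a prefetch candidate.
    std::string predict_next(const std::string& file)
    {
        std::string best;
        unsigned best_count = 0;
        const std::map<std::string, unsigned>& succ = transitions[file];
        for (std::map<std::string, unsigned>::const_iterator it = succ.begin();
             it != succ.end(); ++it)
            if (it->second > best_count)
            {
                best = it->first;
                best_count = it->second;
            }
        return best;
    }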
Stay tuned!


Bibliography

0 A.D. www.wildfiregames.com/0ad

@Article{SleTar85,
  author  = "Sleator and Tarjan",
  title   = "Amortized Efficiency of List Update and Paging Rules",
  journal = "CACM: Communications of the ACM",
  volume  = "28",
  year    = "1985",
}

@Article{Irani02,
  author  = "Irani",
  title   = "Page Replacement with Multi-Size Pages and Applications to Web Caching",
  journal = "ALGRTHMICA: Algorithmica",
  volume  = "33",
  year    = "2002",
}

@InProceedings{conf/soda/Young98,
  author    = "Neal E. Young",
  title     = "On-Line File Caching",
  booktitle = "SODA",
  pages     = "82--86",
  year      = "1998",
}

@InProceedings{conf/iwmm/WilsonJNB95,
  author    = "Paul R. Wilson and Mark S. Johnstone and Michael Neely and David Boles",
  title     = "Dynamic Storage Allocation: {A} Survey and Critical Review",
  booktitle = "IWMM",
  pages     = "1--116",
  year      = "1995",
}

@Article{JohWil99,
  author  = "Johnstone and Wilson",
  title   = "The Memory Fragmentation Problem: Solved?",
  journal = "SPNOTICES: ACM SIGPLAN Notices",
  volume  = "34",
  year    = "1999",
}

@InProceedings{conf/ecrts/MasmanoRCR04,
  author    = "Miguel Masmano and Ismael Ripoll and Alfons Crespo and Jorge Real",
  title     = "{TLSF}: {A} New Dynamic Memory Allocator for Real-Time Systems",
  booktitle = "ECRTS",
  pages     = "79--86",
  year      = "2004",
  URL       = "http://doi.ieeecomputersociety.org/10.1109/ECRTS.2004.35",
}

DIMACS TSP Challenge http://public.research.att.com/~dsj/chtsp/

@Article{SW,
  author  = "D. B. Shmoys and D. P. Williamson",
  title   = "Analyzing the Held-Karp {TSP} bound: a monotonicity property with application",
  journal = "Information Processing Letters",
  volume  = "37",
  pages   = "281--285",
  year    = "1991",
}

PIO http://www.pcguide.com/ref/hdd/if/ide/modes_PIO.htm

@TechReport{P117,
  author      = "Jim Gray and Erik Riedel and Catharine Van Ingen",
  title       = "A Study of Windows {NT} Sequential {IO} Performance",
  institution = "Microsoft Research (MSR)",
  number      = "P117",
  month       = sep,
  year        = "1997",
  URL         = "http://research.microsoft.com/barc/Sequential_IO/default.htm",
}

@Misc{oai:CiteSeerPSU:290934,
  author = "Jim Gray and Bruce Worthington and Robert Horst",
  title  = "Windows 2000 Disk {IO} Performance",
  month  = jun # "~05",
  year   = "2000",
  URL    = "http://citeseer.ist.psu.edu/290934.html;
            http://research.microsoft.com/~gray/papers/Win2K_IO_MSTR_2000_55.pdf",
}

RFC1951 http://rfc.net/rfc1951.html

ZipAppNote http://www.pkware.com/business_and_developers/developer/appnote/

ZipBenchmark http://archive.gamespy.com/hardware/june02/p45331/index2.shtm

IO benchmark www.storagereview.com

ASPI FAQ http://www.eetkorea.com/ARTICLES/2000APR/2000APR05_CT_ID_AN.PDF