
# IO thesis: incorporate revisions from 2nd reviewer and add experimental results for cache

This was SVN commit r3683.
janwas 2006-03-25 07:05:04 +00:00
parent 8371f42da9
commit dadd15ef00


Slow I/O is widespread, as attested to by splash screens and progress bars; however, it can be done better. A reusable and highly efficient I/O solution is presented; design decisions and key algorithms are discussed, and the resulting performance analyzed.
Introduction

Motivation / Importance of Fast I/O
Since I/O is much slower than CPU or memory, it can easily become a bottleneck within the system as a whole. An estimate as of 2006 is 60 MB/s vs. 2600 MB/s. Many applications would therefore benefit from faster I/O; example scenarios include:

- slow startup time. The user is inconvenienced by waiting for required files to load; splash screens are one attempt to mitigate this by distracting the user. For a rather extreme illustration of the problem, see http://www.break.com/index/patiencechild.html.

- on-demand loading. If the data set is too large to fit in memory, it must be loaded in increments as needed. This can cause 'freezes' in the application while waiting for the I/O to finish.
Intended Application
The application for which our I/O library has been developed is a Real-Time Strategy computer game [0ad]. It utilizes the traditional method of loading files on startup as well as on-demand streaming of data, so both should be handled efficiently.

While we intend for the I/O solution to remain useful for a wide range of applications, several consequences arise from this application and guide our design decisions.

First, much emphasis is placed on real-time behavior. Lag or 'freezing' in-game is not acceptable and must be minimized. This means that the caching algorithm must not have offline performance characteristics; reordering I/Os is probably not acceptable, and any pre-fetching would have to be quite conservative (so as not to penalize time-critical on-demand loads).

Also, the working set is not static; depending on game mode and environment, different files may be needed. Provision must be made for varying access patterns.

Finally, and related to the real-time issue, is that of fragmentation. Games can run over several hours; during that time, performance must not degrade to unacceptable levels, e.g. due to memory fragmentation. Given the real-time requirements, offline reorganization is not an option; the algorithms used must be designed accordingly.

Given these central design constraints, we now present the chief ideas behind our fast I/O method.
Our approach is five-fold:
1) caching avoids repetitive slow I/Os;
2) ordering files according to access patterns minimizes hard-drive seeks;
3) compressing files reduces the amount of data to be read;
4) asynchronous I/O maximizes throughput and allows computation to proceed in parallel with I/O;
5) splitting I/Os into blocks simplifies caching and decompression while also avoiding copying buffers for alignment purposes.

We will discuss each of these in detail below, but first cover related theoretical work in this field.
Cache

For the cache, a central question is which files to keep in memory. This is known as the file- or web-caching problem. In short, given a set of file requests (each with size and retrieval cost), a cache is maintained such that total retrieval cost is minimized.

The special case where size and cost are uniform is called "paging", which has been studied extensively. Several algorithms that have an optimal competitive ratio are known. In particular, LRU (Least Recently Used, which simply evicts the file whose access time is the least recent) is k/(k-h+1)-competitive, which is the best possible for a deterministic algorithm [Sleator/Tarjan].

This model is appealing due to its simplicity, but is not sufficient for our needs. Files are not typically of uniform size, and treating them as such would be monstrously inefficient (much cache space would be wasted by rounding up element size to that of the largest file).
Irani gives two O(log^2 k)-competitive randomized algorithms that can deal with variable-sized files and uniform cost [Irani].

However, we would like to achieve full generality and provide for variable cost as well. This can be used, as the name suggests, to more accurately reflect load time (as will be seen below, this is not solely dependent on file size!), or as a hint from the application that certain files are not to be removed from the cache as early as they otherwise would be.
Young develops such an algorithm and calls it Landlord. Briefly, each file receives 'credit' that is initially set to its cost. When determining which file is to be removed from cache (i.e. 'evicted'), each one is charged 'rent' proportional to its size and the minimum credit-per-size density currently in the cache. Items are evicted once their credit is 0.
… allocators are not adequate; an alternative will have to be developed. …

Wilson et al. give a very thorough and helpful overview. A simple but crucial point is made: fragmentation is caused by freeing regions whose neighbors are not free. Allocators are online algorithms whose only tool against this is placement - deciding where to allocate regions. The authors advocate benchmarking by means of traces (a record of allocations) from real-world programs, because randomized tests do not necessarily reflect reality. It is emphasized that allocation policy and mechanism must be considered separately. Results of tests show certain policies, namely address-ordered first (segregated) fit, to perform quite well, wasting only about 14% memory. Finally, further discussion of implementation details such as boundary tags was helpful. [DynStorageReview]

Johnstone and Wilson go on to refine their measure of fragmentation and conclude that the previously mentioned AO-first-fit policy actually suffers from only ~1% fragmentation, the best of all techniques considered. [MemFragSolved]
Ordering - Traveling Salesman Problem

The problem of ordering files according to access patterns can be seen as an instance of the Traveling Salesman Problem. The latter is defined as: given a graph of nodes (cities) and the cost of traveling from one to another (travel distance), compute a path that will take the salesman to each city while incurring minimal cost. In our case, files correspond to cities and the hard-disk seek distance to cost.

TSP has perhaps been studied most among all optimization problems; numerous algorithms and heuristics have been developed, each with their strengths and weaknesses. [DIMACS TSP Challenge] gives an extensive listing of algorithms, relative performance and techniques, and was a valuable reference.
For our application, less-than-optimal orderings are acceptable due to non-static access patterns. Since variational file accesses (e.g. due to differing modes of play) would invalidate any ordering we establish, it does not make sense to insist on an optimal solution.

The DIMACS Challenge shows several heuristics to perform quite well, coming to within 11% of the Held-Karp bound (a good approximation of the optimal solution to an instance of TSP). [D. B. Shmoys and D. P. Williamson. Analyzing the Held-Karp TSP bound: a monotonicity property with application. Info. Proc. Lett., 35(6):281-285, 1990.]

We therefore settle on a greedy heuristic for simplicity.
… over the PCI bus became the norm. Here, the disk controller writes directly to memory …

- aio places pending read requests in a queue so that the disk controller can proceed immediately with the next I/O; the disk is always busy. With threaded blocking I/O, the OS would have to return from and then reenter kernel mode before relaying the application's next I/O request to the disk. This overhead reduces throughput.

- parallelism between computation and I/O is achieved without having to worry about the OS correctly scheduling all participating threads. Additionally, behavior is predictable and thread-switch overhead is avoided.

Note: Linux used to emulate aio by spawning threads, which led to less-than-stellar performance. This is no longer the case; indeed, a decent aio implementation should not fare worse than the threaded blocking I/O approach. In fact, asynchronous I/O performs better on Windows due to the abovementioned issues.
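For concreteness, a minimal sketch of one queued read via the POSIX aio interface follows; the file name, buffer size and abbreviated error handling are placeholder simplifications, not taken from the library.

```cpp
// Minimal sketch: a queued read via POSIX aio ("archive.zip" is a placeholder).
#include <aio.h>
#include <fcntl.h>
#include <unistd.h>
#include <cerrno>
#include <cstdio>
#include <cstring>

int main()
{
    const int fd = open("archive.zip", O_RDONLY);
    if (fd < 0)
        return 1;

    alignas(4096) static char buf[64 * 1024];  // sector-aligned buffer

    aiocb cb;
    memset(&cb, 0, sizeof(cb));
    cb.aio_fildes = fd;
    cb.aio_buf    = buf;
    cb.aio_nbytes = sizeof(buf);
    cb.aio_offset = 0;

    // enqueue the read; this returns immediately, so the transfer
    // proceeds while we continue computing (or enqueue more reads).
    if (aio_read(&cb) != 0)
        return 1;

    // ... computation proceeds in parallel with the I/O here ...

    // wait for completion, then retrieve the transfer's result
    const aiocb* list[1] = { &cb };
    while (aio_error(&cb) == EINPROGRESS)
        aio_suspend(list, 1, nullptr);
    const ssize_t bytesRead = aio_return(&cb);
    printf("read %zd bytes\n", bytesRead);

    close(fd);
    return 0;
}
```

The queued-read advantage is visible in the structure: aio_read returns immediately, so further requests can be enqueued before aio_suspend is ever called.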
Compression

The next cornerstone of our I/O library is compressing source files. This can dramatically reduce the amount of data to be read. Indeed, the current 0ad dataset has been compressed down to 46% of the original, a savings of 75 MB. (NB: the dataset includes 13 MB of incompressible audio; 3d mesh files with compression ratios of ~3x are chiefly responsible for the reduction.)

The compression algorithm used is Deflate, a combination of LZ77 and Huffman encoding as defined in [RFC1951] and used in the common Zip file format [ZipAppNote]. Other formats may achieve better compression ratios or feature faster compression/decompression speed …
In addition to the abovementioned significant reduction in file size, a further compelling argument to compress all data files is that it is effectively free! To show this, we must first discuss how exactly I/O and decompression will be parallelized.

Presuppose that I/Os are split into fixed-size blocks, the rationale of which will be explained in {Splitting Into Blocks}. These blocks are issued asynchronously up to a surely safe queue depth (currently 4). A block whose I/O has finished is then decompressed while the next ones are pending. This gives perfect parallelization if decompression requires less time than I/O.

Indeed, a benchmark shows that a typical Pentium IV system (as of 2002) manages 40 MB/s I/O throughput and 100 MB/s decompression [http://archive.gamespy.com/hardware/june02/p45331/index2.shtm].
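A sketch of this pipeline follows. IssueBlockRead and WaitForOldestBlock are hypothetical stand-ins for the asynchronous issue/wait described above, zlib provides the Deflate decompression, and the final partial block is glossed over.

```cpp
// Sketch of the block pipeline: up to QUEUE_DEPTH reads stay in flight, and
// each finished block is inflated while later blocks are still transferring.
#include <zlib.h>
#include <cstring>
#include <vector>

static const size_t BLOCK_SIZE  = 32 * 1024;  // block size (see below)
static const size_t QUEUE_DEPTH = 4;          // 'surely safe' queue depth

// hypothetical helpers wrapping the asynchronous issue/wait shown above:
void  IssueBlockRead(size_t blockIdx, void* buf);  // enqueue read of block i
void* WaitForOldestBlock();                        // block until it finishes

std::vector<unsigned char> LoadCompressedFile(size_t numBlocks, size_t uncompressedSize)
{
    std::vector<std::vector<unsigned char>> slots(QUEUE_DEPTH,
        std::vector<unsigned char>(BLOCK_SIZE));
    std::vector<unsigned char> out(uncompressedSize);

    z_stream zs;
    memset(&zs, 0, sizeof(zs));
    inflateInit2(&zs, -MAX_WBITS);  // raw Deflate, as stored in Zip entries
    zs.next_out  = out.data();
    zs.avail_out = (uInt)out.size();

    size_t issued = 0;
    while (issued < numBlocks && issued < QUEUE_DEPTH)  // prime the queue
    {
        IssueBlockRead(issued, slots[issued % QUEUE_DEPTH].data());
        ++issued;
    }

    for (size_t done = 0; done < numBlocks; ++done)
    {
        // decompress the oldest finished block while later reads are in
        // flight (a full implementation would pass the last block's
        // remaining size rather than BLOCK_SIZE).
        zs.next_in  = (Bytef*)WaitForOldestBlock();
        zs.avail_in = (uInt)BLOCK_SIZE;
        inflate(&zs, Z_SYNC_FLUSH);

        // that slot is free again; reissue immediately to keep the disk busy
        if (issued < numBlocks)
        {
            IssueBlockRead(issued, slots[issued % QUEUE_DEPTH].data());
            ++issued;
        }
    }

    inflateEnd(&zs);
    return out;
}
```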
… I/O read time.

Throughput can be much improved by arranging files on disk in order of access, thus avoiding seeks. Since we wish to use a standard File System (whose placement strategy we cannot control) for simplicity, files will have to be combined into one large OS-visible file - an archive. As mentioned above, we prefer the Zip format for easy interoperability.

Incidentally, storing files in archives has an additional advantage. The FS needs to store metadata and typically sector-aligns files; since sectors are 512 bytes or more, this is very costly for tiny files. (NB: ReiserFS4 is the only known exception, able to pack several files into one sector.)

In contrast, archives can contain files packed end-to-end with only minimal metadata/header information, thus wasting less space and by extension reducing read time.

It remains to determine the optimal file ordering that minimizes seeks. This will be done once (offline); performance is therefore not of paramount importance.
… BC…AC…BC. It would seem that 50% of 'C' accesses must incur a seek …

2) record a "trace" of all file accesses over one or more program runs (recall that access patterns may differ between runs).

3) construct from this a list of possible edges sorted by their frequency (i.e. how often they occurred in the trace).

4) generate a set of 'chains' by committing the above edges as long as no cycle results. These chains are connected portions of the DAG that are known to have been accessed in that order.

5) output the final file ordering by stitching together all chains and then adding any remaining files that were not included in the trace.

Details on these steps follow.
1: Prepare DAG of Files

Each node holds all required information about the file. This includes its filename and the nodes that have been chosen to come before and after it in the final layout. All of these are stored as 16-bit IDs to reduce size and therefore improve locality; mapping from filename to ID is accomplished in logarithmic time via a tree.
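As an illustration, such a node and the filename-to-ID mapping could look as follows; all names here are ours, not the library's.

```cpp
// Illustrative file node and filename -> 16-bit ID mapping.
#include <cstdint>
#include <map>
#include <string>
#include <vector>

typedef uint16_t FileId;  // 16-bit IDs keep nodes small, improving locality

struct FileNode
{
    static const FileId invalid = 0xFFFF;
    std::string name;        // the file's name
    FileId prev = invalid;   // chosen predecessor in the final layout
    FileId next = invalid;   // chosen successor in the final layout
    bool output = false;     // set once written to the final list
};

std::vector<FileNode> nodes;
std::map<std::string, FileId> idForName;  // tree => O(log N) lookup

FileId IdFromName(const std::string& name)
{
    auto it = idForName.find(name);
    if (it != idForName.end())
        return it->second;                   // known file
    const FileId id = (FileId)nodes.size();  // new file: assign next ID
    nodes.push_back(FileNode());
    nodes.back().name = name;
    idForName[name] = id;
    return id;
}
```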
2: Record Trace
… because seeks are incurred by accessing any part of the file. …

… by an edge (unless a cycle were to result). For simplicity, committed edges are never removed, this being a greedy heuristic.

We check for cycles via "DFS", which actually simplifies to a list walk here since nodes have only one previous and one next link. These are typically quite short, and overall run time of this entire step is not a problem in practice (7 ms for 5000 files), so we do not attempt more efficient and sophisticated cycle detection schemes. One such approach would be to store a pointer to the current end of list for each node and perform list jumping.
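Sketched in terms of the illustrative node structure above, the list-walk cycle check and the greedy edge commit might look like this:

```cpp
// List-walk cycle check: committing (a -> b) is allowed only if a is not
// reachable by walking forward from b. Uses FileNode/nodes from above.
bool WouldFormCycle(FileId a, FileId b)
{
    for (FileId i = b; i != FileNode::invalid; i = nodes[i].next)
        if (i == a)
            return true;  // committing the edge would close a loop
    return false;
}

// greedy commit: committed edges are never removed
bool TryCommitEdge(FileId a, FileId b)
{
    if (nodes[a].next != FileNode::invalid || nodes[b].prev != FileNode::invalid)
        return false;     // an endpoint is already linked
    if (WouldFormCycle(a, b))
        return false;
    nodes[a].next = b;
    nodes[b].prev = a;
    return true;
}
```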
The result of this step is a set of disjoint chains, which are each a series of files that are to be stored immediately after one another. Due to the nature of the edge list, the files that are most frequently accessed after one another are grouped together. As such, we have attained a good approximation of an optimal tour.

Note: now the reason for the most-recent-first program run ordering becomes clear. All but the most frequent edges are placed into the list in the order that they occurred in the trace (due to the stable sort). Since they are also committed in the DAG in this order, they end up mostly as observed from the trace. Since the most recent trace is assumed to be the most accurate and reflective of current behavior, it is given the most weight (by allowing all edges that ensued from it to be committed first).
5: Stitch Chains Together

The final step is to stitch together the disjoint chains and output them into the final ordered list. File nodes will be marked once they have been output. We iterate over all nodes and output the entire chain of which each is a part; this is done by following the node's previous link until at the beginning of the chain.

Incidentally, this iteration ensures all files appear in the output list, even if they were not included in the trace.

We have thus generated an ordering of files that minimizes seeks, assuming application behavior is similar to that which was recorded in the trace(s).
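A sketch of this stitching pass, again reusing the node structure from the earlier sketches:

```cpp
// Stitching pass: emit each node's whole chain, head first; 'output' marks
// emitted nodes. Files absent from the trace are single-node chains and are
// therefore included as well.
std::vector<FileId> StitchChains()
{
    std::vector<FileId> ordering;
    for (size_t i = 0; i < nodes.size(); ++i)
    {
        if (nodes[i].output)
            continue;                 // chain already emitted
        FileId head = (FileId)i;      // follow prev links to the chain's head
        while (nodes[head].prev != FileNode::invalid)
            head = nodes[head].prev;
        for (FileId j = head; j != FileNode::invalid; j = nodes[j].next)
        {
            nodes[j].output = true;   // emit the whole chain in order
            ordering.push_back(j);
        }
    }
    return ordering;
}
```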
This is an approximation to a variant of the Traveling Salesman Problem; the question as to its quality (i.e. how many seeks are avoided) is interesting and will be examined in <<section 3>>.

Rough complexity analysis: except for the cycle determination, none of these steps require more than O(log N) work per file. The expected case is therefore O(N log N), with O(N^2) work in the worst case (if DFS always scans through very long chains). However, as mentioned above, this is an offline process; performance is entirely adequate, so we do not delve into a complete analysis or optimize the cycle determination step.

Splitting Into Blocks
… http://www.eetkorea.com/ARTICLES/2000APR/2000APR05_CT_ID_AN.PDF …

The result of these ruminations was a block size of 16 KiB. However, our measurements have shown 32 KiB to be most efficient.

This concludes discussion of our I/O techniques. To review, I/Os are automatically split into blocks (of aligned start position and length) and issued asynchronously. Once a block finishes, it is decompressed while the next block I/O is in progress. Finally, seeks are avoided by having arranged the files within an archive in order of access.
"It's not true that life is one damn thing after another; it is one damn thing over and over."
- Edna St. Vincent Millay

The final step we take in optimizing I/O is caching. By keeping commonly used files in memory, some repeated I/Os can be avoided outright.

There are two 'levels' of cache: entire files and blocks.
The small block cache serves to avoid overhead due to sector-aligning I/Os in transfers. Since files usually start at unaligned offsets within archives, data lying at the beginning of a sector would be read twice (once for the real I/O and then again during the next file's I/O). The block cache absorbs this cost by keeping in memory the last few blocks read; it is organized as LRU.
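For illustration, a minimal LRU block cache along these lines; the key type and fixed capacity are assumptions, not the library's actual layout.

```cpp
// Minimal LRU block cache: the most recently read blocks are retained so the
// sector shared by two adjacent files is only read from disk once.
#include <cstdint>
#include <list>
#include <map>
#include <vector>

typedef uint64_t BlockId;  // e.g. archive offset / block size

class BlockCache
{
public:
    explicit BlockCache(size_t maxBlocks) : maxBlocks(maxBlocks) {}

    const std::vector<unsigned char>* Find(BlockId id)
    {
        auto it = blocks.find(id);
        if (it == blocks.end())
            return nullptr;            // miss
        lru.erase(it->second.pos);     // hit: move to front of the LRU list
        lru.push_front(id);
        it->second.pos = lru.begin();
        return &it->second.data;
    }

    void Add(BlockId id, const std::vector<unsigned char>& data)
    {
        if (blocks.size() >= maxBlocks)  // evict the least recently used
        {
            blocks.erase(lru.back());
            lru.pop_back();
        }
        lru.push_front(id);
        Entry& e = blocks[id];
        e.data = data;
        e.pos = lru.begin();
    }

private:
    struct Entry
    {
        std::vector<unsigned char> data;
        std::list<BlockId>::iterator pos;  // position in the LRU list
    };
    size_t maxBlocks;
    std::list<BlockId> lru;                // front = most recently used
    std::map<BlockId, Entry> blocks;
};
```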
The per-file caching strategy is due to the assumption that files will usually be loaded in one burst; it simplifies bookkeeping and avoids having to copy pieces of the file into a final buffer.
Allocator

A general-purpose allocator (e.g. malloc) is not acceptable for this application because file buffer addresses are required by Windows ReadFile to be aligned to a sector boundary. Rounding up returned addresses would waste unacceptable amounts of memory, so a special allocation scheme is needed that always returns aligned regions.

This entails not prefixing the allocated regions with a header. Our idea is to transfer ownership of an allocated region from the allocator to the cache and/or extant list; these have to record region address and size anyway for their bookkeeping. When the region is to be freed, the extant list informs the allocator of its size and address, which is typically what a header would have stored.

Having now established the requirement for alignment and how to ensure it, we discuss the main problem of an allocator - fragmentation. There are basically two ways to deal with this: perform periodic reorganization, or prevent it from happening in the first place.
The former is not feasible due to our real-time requirements and - more importantly - because users receive direct pointers to the cache memory. This allows zero-copy I/O and reduces memory footprint because multiple users of a file can share its (read-only) contents. However, it is believed that currently in-use and therefore unmovable regions would severely hamper defragmentation. We therefore focus on the latter approach.

With all pieces in place, we now discuss the allocation policy. As shown in [MemFragSolved], Address-Ordered good-fit performs well. When freeing, we coalesce regions immediately. This may perform unnecessary work, but is acceptable in light of its simplicity. Allocation first exhausts all available memory before reusing freelist entries. This is fine because the cache size is chosen such that it can and should be used in its entirety. The benefit is reducing freelist splitting, which tends to produce larger coalesced regions.
Note: in addition to policy, there is another approach to mitigating fragmentation. Its root cause is freeing objects whose neighbors are not free. We attack this by allowing the application to pass hints as to buffer lifetimes, so that long-lived objects can be placed differently and not cause 'holes' around freed short-lived objects.

Implementation Details
… size. If a freelist is empty, the allocation can be satisfied by finding the next highest non-empty class (O(1) due to bit scan) and splitting its first block.
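A sketch of that lookup, assuming power-of-two size classes, a 32-entry occupancy bitmap, and the GCC/Clang __builtin_ctz intrinsic for the bit scan:

```cpp
// Segregated freelists with an occupancy bitmap: one bit per size class
// lets a single bit scan find the next highest non-empty class.
#include <cstddef>
#include <cstdint>

static const int NUM_CLASSES = 32;
void* freelists[NUM_CLASSES];  // head of each class's freelist
uint32_t nonEmptyMask;         // bit i set <=> freelists[i] is non-empty

static int SizeClass(size_t size)
{
    int cls = 0;               // smallest power-of-two class holding 'size'
    while (((size_t)1 << cls) < size)
        ++cls;
    return cls;
}

// returns the class to allocate from, or -1 if memory is exhausted
int FindNonEmptyClass(size_t size)
{
    // mask off all classes smaller than the requested one ...
    const uint32_t candidates =
        nonEmptyMask & ~(((uint32_t)1 << SizeClass(size)) - 1);
    if (candidates == 0)
        return -1;
    // ... and bit-scan for the lowest remaining set bit (O(1)).
    return __builtin_ctz(candidates);
}
```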
Total allocation performance can be made O(1) by further splitting size classes into fixed-size subclasses; this is the approach taken by [TLSF]. However, we find that freelists are typically empty anyway (because the cache is always as full as possible) and therefore omit this for simplicity.

Coalescing works by storing boundary tags within the freed (!) memory. When freeing a block, we check if the regions that come before and after it have such tags (identified via distinctive bit patterns very unlikely to occur in normal data); if so, they are merged. Note that this is somewhat risky, but the …
… This allows easily checking whether a given pointer is valid and was taken from …

Extant List

This list tracks all buffers that have been handed out to the application but not yet freed. Since they are expected to be freed immediately (before allocating the next, which is enforced by a warning), this list only contains a few entries and therefore need not be organized as a tree.

It stores the address and size of the allocated regions, which are passed to the allocator when freeing a buffer. This avoids the need for per-region headers, as explained above. An alternative would be providing a separate data structure associating each allocated address with its size, but this is redundant since many of these regions are also stored in the cache. Therefore, our approach uses less memory.
\par
\par }{\f1\insrsid15952639 Cache}{\f1\insrsid7437835 Manager}{\f1\insrsid3629320
\par
@ -458,13 +463,13 @@ in the cache. Therefore, our approach uses less memory.}{\f1\insrsid3629320
\f1\insrsid14308065 on Athlon XP) for increased memory use.
\par 2}{\f1\insrsid11994078 a) the calcMCD and chargeAll loops can effectively be fused by calculating the next MCD}{\f1\insrsid12547006 value on the side. We therefore avoid iterating over all items twice}{\f1\insrsid11994078 , which is }{
\f1\insrsid12547006 especially }{\f1\insrsid11994078 important for large sets of items that do not fit in cache.
2b) A priority queue can return and remove the MCD item in O(log N) time; the rent that should be charged to all items can be accumulated and applied in batches. The validity of this approach is not immediately clear: Landlord specifies decreasing all credit by delta * item.size and removing any subset of items with no credit remaining. By definition of delta (the minimum credit density), at least one item will be removed, and this is exactly the one returned by the priority queue.
Note that any pending charges must be committed before adding any items; otherwise, they too would be charged during the next commit cycle, which would be incorrect.

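A condensed sketch of this batched-rent bookkeeping (invented names, not the actual classes): the queue yields the minimum-credit-density item in O(log N), and the rent that would zero its credit is merely recorded; a single O(N) pass applies it just before the next add.

    #include <cstddef>
    #include <queue>
    #include <vector>

    struct Item {
        size_t size;
        double credit;
        bool evicted;
        double density() const { return credit / size; }
    };

    struct ByDensity {                   // min-density item at the top;
        // stale densities order identically to true ones, since pending
        // rent shifts every density by the same amount per unit size
        bool operator()(const Item* a, const Item* b) const {
            return a->density() > b->density();
        }
    };

    class LandlordLazy {
        std::vector<Item*> items_;       // all live items (owned elsewhere)
        std::priority_queue<Item*, std::vector<Item*>, ByDensity> pq_;
        double pendingRent_;             // rent per unit size, not yet applied

    public:
        LandlordLazy() : pendingRent_(0) {}

        void add(Item* it) {
            commit();                    // pending charges must precede adds,
                                         // or the new item would be overcharged
            items_.push_back(it);
            pq_.push(it);
        }

        Item* evict() {                  // precondition: at least one live item
            while (pq_.top()->evicted)   // skip stale queue entries
                pq_.pop();
            Item* victim = pq_.top();
            pq_.pop();
            // the cumulative rent that zeroes exactly this item's credit is
            // its (stale) credit density; record it instead of charging all
            pendingRent_ = victim->density();
            victim->evicted = true;
            return victim;
        }

    private:
        void commit() {                  // the single batched O(N) pass
            for (size_t i = 0; i < items_.size(); i++)
                if (!items_[i]->evicted)
                    items_[i]->credit -= pendingRent_ * items_[i]->size;
            pendingRent_ = 0;
        }
    };

Ordering by the stale density is sound because the pending rent shifts every item's density equally; boosting credit on a cache hit, however, breaks this invariant, which is the re-sorting issue discussed in the implementation note below.
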
Implementation note: to avoid duplicating code, the priority queue is separate from the filename -> cached-contents mapping. Since it is ordered by item credit, the queue must be re-sorted after an item is accessed, which increases its credit. Due to limitations of the STL priority_queue, this takes O(N) time on every access. Since cache hits are fairly rare, time is still saved overall; however, this bottleneck should be removed by substituting a heap implementation that allows an O(log N) "sift" operation.

These improvements are made available as template policy classes and can therefore easily be enabled for applications where they provide a benefit.

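By way of illustration (names invented; LandlordLazy is the sketch shown earlier), the packaging might look like this: the charging strategy becomes a template parameter, so an application selects a variant at compile time at no runtime cost.

    template<class Strategy>            // e.g. LandlordPlain, LandlordLazy
    class FileCache {
        Strategy strategy_;
    public:
        void add(Item* it) { strategy_.add(it); }
        Item* makeRoom()   { return strategy_.evict(); }
    };

    // usage: FileCache<LandlordLazy> cache;
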
Methodology

The basis for our I/O throughput measurement is a trace file recorded from the startup of 0ad encompassing ~500 file loads. Using the trace simulation feature described above, we issue these I/Os as fast as possible; this removes the influence of other system-specific conditions such as graphics card performance.
What is actually measured is the total amount of time elapsed between the start and end of the I/Os; this, together with the amount of user data transferred, yields effective throughput ("effective" because it differs from the actual disk throughput due to compression).

This was chosen as the benchmark measure because it reflects the real-world performance of the entire system.

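In symbols, and with purely hypothetical round numbers for illustration (the measured figures follow below):

    \text{effective throughput} = \frac{\text{user bytes delivered}}{t_\text{end} - t_\text{start}},
    \qquad \text{e.g. } \frac{58.6\ \text{MB}}{2\ \text{s}} \approx 29.3\ \text{MB/s}
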
Note: if a cache is involved, we ensure it is empty so as not to skew results; in the case of the OS file cache, testing takes place after a clean reboot.

Results and Discussion

We are interested in the total improvement yielded by our I/O library, as compared to the throughput reached by the bare OS-provided read() API.
According to the above measure, we see 29.3 MB/s vs. 2.96 MB/s, a staggering speedup of 990 %!
When archives are disabled entirely and I/O is from loose files (stored in the normal filesystem), performance drops to 2.62 MB/s. The immediate conclusion is that reduced locality (due to poor FS ordering and extra headers) induces many costly seeks.
We also notice that performance is worse than that measured for the synchronous API; this could be explained by increased overhead of the aio APIs. Indeed, they do not support the Windows FastIO entry points that avoid needing to create a driver request packet.

Finally, we revisit the question of file block size. The initial choice of 16 KiB was not optimal; based on the following results, we go with 32 KiB.
Block Size (KiB)	Throughput (MB/s)
In summary, we have found that bundling files into archives is the most worthwhile improvement, due to reducing seeks. Once these are eliminated, the increased throughput afforded by the (free) data compression step contributes an additional 23 % speedup.

Caching Efficacy

We now appraise the effectiveness of the cache replacement policy, i.e. its tendency to keep items in memory that will be needed later. To measure this, we record a trace of 3 different 0ad startup sequences, each loading a separate map (these share some files but differ in others, e.g. environment textures). It comprises 57 MB of data, of which 14 MB are repeated and therefore potentially cacheable.

Since this trace and the 0ad dataset are as yet relatively small (real-world cache sizes may well be larger), we have artificially limited the cache size to ensure that items will have to be evicted from the cache; without this, the cache replacement policy would be irrelevant. A size of 10 MB has been chosen arbitrarily.

We first evaluate the well-known LRU algorithm under these conditions. The cache hit rate is determined to be 19 % (473 hits totaling 6.18 MB vs. 1915 misses totaling 51.22 MB).
Our Landlord implementation more than doubles this to 39 % (945 hits totaling 8.88 MB vs. 1443 misses totaling 48.52 MB).
A more intuitive view of these numbers is that the percentage of non-compulsory misses (i.e. items that were evicted but referenced later) drops from 26 % to 2 %.

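These percentages follow directly from the reported counts (2388 requests in either run) and are consistent with the rounded figures above:

    \frac{473}{473 + 1915} \approx 19.8\,\%, \qquad \frac{945}{945 + 1443} \approx 39.6\,\%
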
We are pleasantly surprised by this favorable result. Since our implementation does not yet take advantage of file cost hints from the application, the difference in performance is due solely to the Landlord algorithm's awareness of item size. This apparently leads to more efficient handling of the cache memory: fewer files need be evicted to make enough room for the next item.
Another factor is that the repeated files in this trace are spaced widely apart (e.g. at the start of each of the 3 map loads constituting the trace); LRU would tend to remove exactly these items.

Cache Manager Optimizations

Of further theoretical and practical interest is how much improvement the various Landlord algorithm optimizations yield.

Accounting CPU cost is done as follows. First, external influences are minimized by running at highest scheduler priority. Several thousand iterations of the target code are run while measuring elapsed time via a high-resolution timer (precise to 1 CPU clock!). Each of these iterations performs an operation (e.g. allocate or free) chosen randomly; this avoids measuring characteristics that are specific to a given trace. Note, however, that we control the random distribution (in the example, the ratio of "allocate" to "free" operations); these are weighted towards the most frequent and important operations.
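A rough sketch of this measurement loop (structure only; the actual harness and its cycle-accurate timer are not shown here, and cache_allocate/cache_free are hypothetical stand-ins for the operations under test):

    #include <chrono>
    #include <cstdlib>

    int main() {
        // (elevate scheduler priority here; platform-specific, omitted)
        std::srand(1);                          // reproducible operation mix
        const int ITERATIONS = 100000;
        typedef std::chrono::steady_clock Clock;
        Clock::time_point t0 = Clock::now();
        for (int i = 0; i < ITERATIONS; i++) {
            if (std::rand() % 100 < 70) {       // weighted choice: 70 % allocate
                // cache_allocate(randomSize());
            } else {                            // ... 30 % free
                // cache_free(randomExtantBuffer());
            }
        }
        double elapsed = std::chrono::duration<double>(Clock::now() - t0).count();
        (void)elapsed;  // report elapsed / ITERATIONS as cost per operation
        return 0;
    }
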
The first result is that with the naïve Landlord implementation, dividing via multiplying by the reciprocal is actually 1.4 % slower! This is likely because the additional storage required for the reciprocal breaks the nice cache-friendly 16-byte element size. Since this algorithm iterates over all items twice, the memory access cost weighs more heavily than a few extra CPU cycles spent dividing.

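A hedged illustration of the trade-off (the field names are placeholders, not the real layout): precomputing 1/size turns the per-item division into a multiplication, but enlarges the element past the 16 bytes that pack neatly into cache lines.

    #include <cstddef>

    struct ItemPlain {         // said to be 16 bytes in the implementation;
        void*  address;        // the actual fields are not shown here
        size_t size;
        float  credit;
        float  cost;
    };

    struct ItemRecip {         // 4 bytes larger: worse cache behavior
        void*  address;
        size_t size;
        float  credit;
        float  cost;
        float  invSize;        // 1.0f / size, computed once at insertion
    };

    inline float density_div(const ItemPlain& i) { return i.credit / i.size; }
    inline float density_mul(const ItemRecip& i) { return i.credit * i.invSize; }
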
Next, we find that the Landlord_Cached strategy (recall that it calculates the minimum credit density while updating and therefore often avoids needing to iterate over all items) performs 21 % faster.
However, its divide-via-reciprocal variant is again slower, this time by 0.6 %. We see that iterating less often increases the benefit from the reciprocal divider.

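A sketch of the fused pass behind Landlord_Cached (reusing the Item of the earlier sketch; the helper name is invented): rent is charged and the next minimum credit density is computed in the same loop, so the separate calcMCD scan disappears.

    #include <cfloat>
    #include <vector>

    // charges `rent` per unit size to every item and returns the next MCD
    static double charge_all_and_calc_mcd(std::vector<Item*>& items, double rent) {
        double nextMCD = DBL_MAX;
        for (size_t i = 0; i < items.size(); i++) {
            Item* it = items[i];
            it->credit -= rent * it->size;      // chargeAll part
            double d = it->density();           // calcMCD "on the side"
            if (d < nextMCD)
                nextMCD = d;
        }
        return nextMCD;
    }
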
The final variant is Landlord_Lazy (which uses a priority queue to find the least valuable item in O(log N) time and thus avoids iterating over all items when removing one from the cache). It performs 19 % better than baseline, which is slightly slower than the previous variant. Note that this result is heavily dependent on the relative frequency of add and remove operations: since the former require iteration over all items (to 'commit' a previous pending charge), decreasing their share from the current (and quite arbitrary) 70 % will cause this implementation to come out far ahead.
Applying the reciprocal divider results in further gains of 0.8 %. Since we rarely iterate over all items here, the increase in size is outweighed by the faster division.

Allocator Fragmentation

The important question of allocator fragmentation is next. We gauge it in the course of simulating the previous 500-file trace. A simple and adequate measure is to compare the total requested size with how much of the total file cache is actually occupied. The result is a total memory waste of 14 %, which is in line with the findings of [Johnstone and Wilson]. While not great, this is acceptable.

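Reading the measure above as the unused fraction of the occupied cache region (our interpretation of the description), the figure corresponds to:

    \text{waste} = 1 - \frac{\text{total requested bytes}}{\text{file cache bytes occupied}} \approx 14\,\%
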
… asynchronous access maximizes read throughput and (together with block-splitting) allows the data to be compressed, which reduces the amount that must be read.
The end result is a measured speedup of nearly 1000 % in the target application, which is expected to apply widely due to inefficient filesystems.

Of further interest are the optimizations made to the memory allocation and cache management algorithms. They respectively allow returning aligned file buffers (required by the aio implementation) without serious fragmentation, and reduce the CPU cost of the cache manager by 20 %.

Other applications can build on our work and easily speed up their load times and file accesses.

Implementation

Our I/O code has been developed in C++ and also contains a few time-critical assembly-language subroutines. It encompasses about 12000 lines of code, about 7K of which are new; the rest was built upon previous work.
Unfortunately there are dependencies on another ~30 KLOC, so releasing the code and integrating it into other applications is not as easy as it could be; this is being worked on.
Eventually releasing the code under the GNU General Public License (Free Software) is planned.

We have further ideas for improvement that could not yet be implemented due to time constraints.

Prefetching, i.e. reading data before it is needed (during idle time), shows promise. While requiring more work and tighter integration with the application, this can improve performance by always keeping the hard disk busy. The downsides that must be mitigated are increased power usage and potential interference with time-critical I/Os.

Currently, the main bit of 'intelligence' is offline and consists of finding a good ordering for files within an archive. We would like to bring more of this into real time and e.g. make decisions in the file cache based on predicted future behavior. In particular, small files known to be accessed one after another could be removed from the file cache together, thus freeing up more space (meaning less fragmentation) without hurting performance (because one file not in cache will force reading the block in which it is stored anyway).

Two approaches are envisaged that could realize these wishes. A Markov chain could be constructed and used to decide the probability of certain I/Os coming after one another. Also, previous traces could be examined at runtime to determine where in the load sequence we are, thus predicting further I/Os.

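A speculative sketch of the first-order Markov idea (all names invented, not part of the implementation): count observed file-to-file transitions, then propose the most frequent successor of the file just loaded as a prefetch candidate.

    #include <map>
    #include <string>

    class TransitionModel {
        typedef std::pair<std::string, std::string> Edge;
        std::map<Edge, int> counts_;    // how often file B followed file A
        std::string prev_;
    public:
        void onLoad(const std::string& file) {
            if (!prev_.empty())
                counts_[Edge(prev_, file)]++;
            prev_ = file;
        }

        // most frequent successor of `file`, or "" if never observed
        std::string predictNext(const std::string& file) const {
            std::string best;
            int bestCount = 0;
            for (std::map<Edge, int>::const_iterator it = counts_.begin();
                 it != counts_.end(); ++it)
                if (it->first.first == file && it->second > bestCount) {
                    bestCount = it->second;
                    best = it->first.second;
                }
            return best;
        }
    };
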