<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN"
  "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">

<html xmlns="http://www.w3.org/1999/xhtml" lang="en">
  <head>
    <meta http-equiv="Content-Type" content="text/html; charset=utf-8" />
    <title>Resource model changes &#8212; Ganeti 2.16.0~rc2 documentation</title>
    <link rel="stylesheet" href="_static/style.css" type="text/css" />
    <link rel="stylesheet" href="_static/pygments.css" type="text/css" />
    <script type="text/javascript">
      var DOCUMENTATION_OPTIONS = {
        URL_ROOT:    './',
        VERSION:     '2.16.0~rc2',
        COLLAPSE_INDEX: false,
        FILE_SUFFIX: '.html',
        HAS_SOURCE:  true,
        SOURCELINK_SUFFIX: '.txt'
      };
    </script>
    <script type="text/javascript" src="_static/jquery.js"></script>
    <script type="text/javascript" src="_static/underscore.js"></script>
    <script type="text/javascript" src="_static/doctools.js"></script>
    <link rel="search" title="Search" href="search.html" />
    <link rel="next" title="Design for executing commands via RPC" href="design-restricted-commands.html" />
    <link rel="prev" title="Ganeti reason trail" href="design-reason-trail.html" /> 
  </head>
  <body>
    <div class="related" role="navigation" aria-label="related navigation">
      <h3>Navigation</h3>
      <ul>
        <li class="right" style="margin-right: 10px">
          <a href="design-restricted-commands.html" title="Design for executing commands via RPC"
             accesskey="N">next</a></li>
        <li class="right" >
          <a href="design-reason-trail.html" title="Ganeti reason trail"
             accesskey="P">previous</a> |</li>
        <li class="nav-item nav-item-0"><a href="index.html">Ganeti 2.16.0~rc2 documentation</a> &#187;</li> 
      </ul>
    </div>  

    <div class="document">
      <div class="documentwrapper">
        <div class="bodywrapper">
          <div class="body" role="main">
            
  <div class="section" id="resource-model-changes">
<h1>Resource model changes<a class="headerlink" href="#resource-model-changes" title="Permalink to this headline"></a></h1>
<table class="docutils field-list" frame="void" rules="none">
<col class="field-name" />
<col class="field-body" />
<tbody valign="top">
<tr class="field-odd field"><th class="field-name">Created:</th><td class="field-body">2011-Oct-12</td>
</tr>
<tr class="field-even field"><th class="field-name">Status:</th><td class="field-body">Implemented</td>
</tr>
<tr class="field-odd field"><th class="field-name">Ganeti-Version:</th><td class="field-body">2.6.0</td>
</tr>
</tbody>
</table>
<div class="section" id="introduction">
<h2>Introduction<a class="headerlink" href="#introduction" title="Permalink to this headline"></a></h2>
<p>In order to manage virtual machines across the cluster, Ganeti needs to
understand the resources present on the nodes, the hardware and software
limitations of the nodes, and how much can be allocated safely on each
node. Some of these decisions are delegated to IAllocator plugins, for
easier site-level customisation.</p>
<p>Similarly, the HTools suite has an internal model that simulates the
hardware resource changes in response to Ganeti operations, in order to
provide both an iallocator plugin and cluster balancing.</p>
<p>While currently the HTools model is much more advanced than Ganeti’s,
neither one is flexible enough and both are heavily geared toward a
specific Xen model; they fail to work well with (e.g.) KVM or LXC, or
with Xen when <a class="reference internal" href="glossary.html#term-tmem"><span class="xref std std-term">tmem</span></a> is enabled. Furthermore, the set of metrics
contained in the models is limited to historic requirements and fails to
account for (e.g.)  heterogeneity in the I/O performance of the nodes.</p>
</div>
<div class="section" id="current-situation">
<h2>Current situation<a class="headerlink" href="#current-situation" title="Permalink to this headline"></a></h2>
<div class="section" id="ganeti">
<h3>Ganeti<a class="headerlink" href="#ganeti" title="Permalink to this headline"></a></h3>
<p>At this moment, Ganeti itself doesn’t do any static modelling of the
cluster resources. It only does some runtime checks:</p>
<ul class="simple">
<li>when creating instances, for the (current) free disk space</li>
<li>when starting instances, for the (current) free memory</li>
<li>during cluster verify, for enough N+1 memory on the secondaries, based
on the (current) free memory</li>
</ul>
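<p>As an illustration, these checks have roughly the following shape (a minimal sketch with hypothetical helper names, not Ganeti’s actual code):</p>
<div class="highlight-default"><div class="highlight"><pre><span></span>def check_instance_create(node_free_disk, new_disk_size):
    # Disk check at instance creation time, against the current free space only.
    return new_disk_size &lt;= node_free_disk

def check_instance_start(node_free_mem, instance_mem):
    # Memory check at instance start time, against the current free memory only.
    return instance_mem &lt;= node_free_mem
</pre></div>
</div>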
<p>Basically this model is a pure <a class="reference internal" href="glossary.html#term-sow"><span class="xref std std-term">SoW</span></a> one, and it works well when
there are other instances/LVs on the nodes, as it allows Ganeti to deal
with ‘orphan’ resource usage, but on the other hand it has many issues,
described below.</p>
</div>
<div class="section" id="htools">
<h3>HTools<a class="headerlink" href="#htools" title="Permalink to this headline"></a></h3>
<p>Since HTools does a pure in-memory modelling of the cluster changes as
it executes the balancing or allocation steps, it had to introduce a
static (<a class="reference internal" href="glossary.html#term-sor"><span class="xref std std-term">SoR</span></a>) cluster model.</p>
<p>The model is constructed based on the node properties received from
Ganeti (hence it is basically limited to what Ganeti can export).</p>
<div class="section" id="disk">
<h4>Disk<a class="headerlink" href="#disk" title="Permalink to this headline"></a></h4>
<p>For disk it consists of just the total (<code class="docutils literal"><span class="pre">tdsk</span></code>) and the free disk
space (<code class="docutils literal"><span class="pre">fdsk</span></code>); we don’t directly track the used disk space. On top of
this, we compute and warn if the sum of disk sizes used by instances does
not match <code class="docutils literal"><span class="pre">tdsk</span> <span class="pre">-</span> <span class="pre">fdsk</span></code>, but otherwise we do not track this
separately.</p>
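<p>A minimal sketch of the consistency check described above (hypothetical helper name):</p>
<div class="highlight-default"><div class="highlight"><pre><span></span>def check_disk_model(tdsk, fdsk, instance_disk_sizes):
    # Only total (tdsk) and free (fdsk) space are tracked; used space is
    # derived and merely cross-checked against the instances' disk sizes.
    used = tdsk - fdsk
    if sum(instance_disk_sizes) != used:
        print("warning: instance disks do not add up to tdsk - fdsk")
</pre></div>
</div>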
</div>
<div class="section" id="memory">
<h4>Memory<a class="headerlink" href="#memory" title="Permalink to this headline"></a></h4>
<p>For memory, the model is more complex and tracks some variables that
Ganeti itself doesn’t compute. We start from the total (<code class="docutils literal"><span class="pre">tmem</span></code>), free
(<code class="docutils literal"><span class="pre">fmem</span></code>) and node memory (<code class="docutils literal"><span class="pre">nmem</span></code>) as supplied by Ganeti, and
additionally we track:</p>
<dl class="docutils">
<dt>instance memory (<code class="docutils literal"><span class="pre">imem</span></code>)</dt>
<dd>the total memory used by primary instances on the node, computed
as the sum of instance memory</dd>
<dt>reserved memory (<code class="docutils literal"><span class="pre">rmem</span></code>)</dt>
<dd>the memory reserved by peer nodes for N+1 redundancy; this memory is
tracked per peer-node, and the maximum value out of the peer memory
lists is the node’s <code class="docutils literal"><span class="pre">rmem</span></code>; when not using DRBD, this will be
equal to zero</dd>
<dt>missing memory (<code class="docutils literal"><span class="pre">xmem</span></code>)</dt>
<dd><p class="first">memory that cannot be unaccounted for via the Ganeti model; this is
computed at startup as:</p>
<blockquote>
<div>tmem - imem - nmem - fmem</div></blockquote>
<p>if we define state-of-record free mem as:</p>
<blockquote>
<div>tmem - imem - nmem</div></blockquote>
<p class="last">then we can interpret this as the difference between the state-of-record
and state-of-world free memory; it presumed to remain constant irrespective
of any instance moves</p>
</dd>
<dt>unallocated memory (<code class="docutils literal"><span class="pre">umem</span></code>)</dt>
<dd><p class="first">the memory that is guaranteed to be not allocated to existing processes;
in case of a static node model this is simply:</p>
<blockquote>
<div>min(state-of-record_free_mem, fmem)</div></blockquote>
<p>since the state-of-record changes during instance placement simulations,
we can’t use that definition directly (see the above note about the missing
memory being presumed constant); we need to use an equivalent definition:</p>
<blockquote class="last">
<div>state-of-record_free_mem - max(0, missing_memory)</div></blockquote>
</dd>
<dt>available memory (<code class="docutils literal"><span class="pre">amem</span></code>)</dt>
<dd><p class="first">this is defined as a zero bounded difference between unallocated and
reserved memory:</p>
<blockquote>
<div>max(0, umem - rmem)</div></blockquote>
<p class="last">so unless we use DRBD, this will be equal to <code class="docutils literal"><span class="pre">umem</span></code></p>
</dd>
</dl>
<p><code class="docutils literal"><span class="pre">tmem</span></code>, <code class="docutils literal"><span class="pre">nmem</span></code> and <code class="docutils literal"><span class="pre">xmem</span></code> are presumed constant during the
instance moves, whereas the <code class="docutils literal"><span class="pre">fmem</span></code>, <code class="docutils literal"><span class="pre">imem</span></code>, <code class="docutils literal"><span class="pre">rmem</span></code>, <code class="docutils literal"><span class="pre">umem</span></code> and
<code class="docutils literal"><span class="pre">amem</span></code> values are updated according to the executed moves.</p>
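<p>To make the above definitions concrete, a small worked sketch (illustrative numbers only), following the formulas above:</p>
<div class="highlight-default"><div class="highlight"><pre><span></span># Illustrative values in MiB, as reported by Ganeti
tmem, fmem, nmem = 16384, 2048, 1024
imem = 12288                      # sum of primary instances' memory
rmem = 1024                       # max of the per-peer reserved memory (DRBD)

xmem = tmem - imem - nmem - fmem  # missing memory, fixed at startup (1024)
sor_free = tmem - imem - nmem     # state-of-record free memory (3072)
umem = sor_free - max(0, xmem)    # unallocated memory (2048)
amem = max(0, umem - rmem)        # available memory (1024)
</pre></div>
</div>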
</div>
<div class="section" id="cpu">
<h4>CPU<a class="headerlink" href="#cpu" title="Permalink to this headline"></a></h4>
<p>The CPU model is different from the disk/memory models, since it’s the
only one where:</p>
<ol class="arabic simple">
<li>we do oversubscribe physical CPUs</li>
<li>and there is no natural limit for the number of VCPUs we can allocate</li>
</ol>
<p>We therefore track the total number of VCPUs used on the node and the
number of physical CPUs, and we cap the vcpu-to-cpu ratio in order to
make this somewhat more similar to the other resources which are
limited.</p>
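<p>A minimal sketch of such a ratio cap (hypothetical helper name):</p>
<div class="highlight-default"><div class="highlight"><pre><span></span>def vcpu_fits(used_vcpus, phys_cpus, new_vcpus, max_vcpu_ratio):
    # VCPUs have no natural limit, so a configurable vcpu-to-cpu ratio is
    # used to make the resource behave like a bounded one.
    return (used_vcpus + new_vcpus) &lt;= phys_cpus * max_vcpu_ratio
</pre></div>
</div>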
</div>
<div class="section" id="dynamic-load">
<h4>Dynamic load<a class="headerlink" href="#dynamic-load" title="Permalink to this headline"></a></h4>
<p>There is also a model that deals with <em>dynamic load</em> values in
htools. As far as we know, it is not currently used with real load
values, but it is active by default with unitary values for all
instances; it currently tracks these metrics:</p>
<ul class="simple">
<li>disk load</li>
<li>memory load</li>
<li>cpu load</li>
<li>network load</li>
</ul>
<p>Even though we do not assign real values to these load values, the fact
that we at least sum them means that the algorithm tries to equalise
these loads, and especially the network load, which is otherwise not
tracked at all. The practical result (due to a combination of these four
metrics) is that the number of secondaries will be balanced.</p>
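<p>A sketch of how such per-node load sums could be aggregated (hypothetical helper; htools’ real implementation differs):</p>
<div class="highlight-default"><div class="highlight"><pre><span></span>def node_dynamic_load(instances):
    # With unitary per-instance values, each metric effectively counts
    # instances, so equalising these sums also balances (among others)
    # the number of secondaries per node.
    totals = {"disk": 0.0, "mem": 0.0, "cpu": 0.0, "net": 0.0}
    for inst in instances:
        for key in totals:
            totals[key] += inst.get(key, 1.0)
    return totals
</pre></div>
</div>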
</div>
</div>
<div class="section" id="limitations">
<h3>Limitations<a class="headerlink" href="#limitations" title="Permalink to this headline"></a></h3>
<p>There are unfortunately many limitations to the current model.</p>
<div class="section" id="id1">
<h4>Memory<a class="headerlink" href="#id1" title="Permalink to this headline"></a></h4>
<p>The memory model doesn’t work well in the case of KVM. For Xen, the memory
for the node (i.e. <code class="docutils literal"><span class="pre">dom0</span></code>) can be static or dynamic; we don’t support
the latter case, but for the former case, the static value is configured
on the Xen/kernel command line, and can be queried from Xen
itself. Therefore, Ganeti can query the hypervisor for the memory used
for the node; the same model was adopted for the chroot/KVM/LXC
hypervisors, but in these cases there’s no natural value for the memory
used by the base OS/kernel, and we currently try to compute a value for
the node memory based on current consumption. This, being variable,
breaks the assumptions in both Ganeti and HTools.</p>
<p>This problem also shows for the free memory: if the free memory on the
node is not constant (Xen with <a class="reference internal" href="glossary.html#term-tmem"><span class="xref std std-term">tmem</span></a> auto-ballooning enabled), or
if the node and instance memory are pooled together (Linux-based
hypervisors like KVM and LXC), the current value of the free memory is
meaningless and cannot be used for instance checks.</p>
<p>A separate issue related to the free memory tracking is that since we
don’t track memory use but rather memory availability, an instance that
is temporarily down changes Ganeti’s understanding of the memory status of
the node. This can lead to problems such as:</p>
<img src="_images/graphviz-9f1ae213f94d296331aa93e5ffd2b0bf4a63585d.png" alt="digraph &quot;free-mem-issue&quot; {
node  [shape=box];
inst1 [label=&quot;instance1&quot;];
inst2 [label=&quot;instance2&quot;];

node  [shape=note];
nodeA [label=&quot;fmem=0&quot;];
nodeB [label=&quot;fmem=1&quot;];
nodeC [label=&quot;fmem=0&quot;];

node  [shape=ellipse, style=filled, fillcolor=green]

{rank=same; inst1 inst2}

stop    [label=&quot;crash!&quot;, fillcolor=orange];
migrate [label=&quot;migrate/ok&quot;];
start   [style=filled, fillcolor=red, label=&quot;start/fail&quot;];
inst1   -&gt; stop -&gt; start;
stop    -&gt; migrate -&gt; start [style=invis, weight=0];
inst2   -&gt; migrate;

{rank=same; inst1 inst2 nodeA}
{rank=same; stop nodeB}
{rank=same; migrate nodeC}

nodeA -&gt; nodeB -&gt; nodeC [style=invis, weight=1];
}" />
<p>The behaviour here is wrong; the migration of <em>instance2</em> to the node in
question will succeed or fail depending on whether <em>instance1</em> is
running or not. And for <em>instance1</em>, it can lead to cases where, if it
crashes, it cannot restart anymore.</p>
<p>Finally, not a problem but rather an important missing feature is support
for memory over-subscription: both Xen and KVM have supported memory
ballooning, even automatic memory ballooning, for a while now. The
entire memory model is based on a fixed memory size for instances, and
if memory ballooning is enabled, it will “break” the HTools
algorithm. Even the fact that KVM instances do not use all their memory from
the start creates problems (although not as severe, since usage will grow and
stabilise in the end).</p>
</div>
<div class="section" id="disks">
<h4>Disks<a class="headerlink" href="#disks" title="Permalink to this headline"></a></h4>
<p>Because we currently only track disk space, if we have a
cluster of <code class="docutils literal"><span class="pre">N</span></code> otherwise identical nodes but half of them have 10
drives of size <code class="docutils literal"><span class="pre">X</span></code> and the other half 2 drives of size <code class="docutils literal"><span class="pre">5X</span></code>, HTools
will consider them exactly the same. However, in the case of mechanical
drives at least, the I/O performance will differ significantly based on
spindle count, and a “fair” load distribution should take this into
account (a similar comment can be made about processor/memory/network
speed).</p>
<p>Another problem related to the spindle count is the LVM allocation
algorithm. Currently, the algorithm always creates (or tries to create)
striped volumes, with the stripe count being hard-coded to the
<code class="docutils literal"><span class="pre">./configure</span></code> parameter <code class="docutils literal"><span class="pre">--with-lvm-stripecount</span></code>. This creates
problems like:</p>
<ul class="simple">
<li>when installing from a distribution package, all clusters will be
either limited or overloaded due to this fixed value</li>
<li>it is not possible to mix heterogeneous nodes (even in different node
groups) and have optimal settings for all nodes</li>
<li>the striping value applies both to LVM/DRBD data volumes (which are on
the order of gigabytes to hundreds of gigabytes) and to DRBD metadata
volumes (whose size is always fixed at 128MB); when striping such
small volumes over many PVs, their size will increase needlessly (and
this can confuse HTools’ disk computation algorithm)</li>
</ul>
<p>Moreover, allocation is currently based on a ‘most free
space’ algorithm. This balances the free space usage across disks, but on
the other hand it tends to mix the data and metadata volumes of different
instances rather badly. For example, it cannot do the following:</p>
<ul class="simple">
<li>keep DRBD data and metadata volumes on the same drives, in order to
reduce exposure to drive failure in a many-drives system</li>
<li>keep DRBD data and metadata volumes on different drives, to reduce
performance impact of metadata writes</li>
</ul>
<p>Additionally, while Ganeti supports setting the volume group separately for
data and metadata volumes at instance creation, there are no defaults
for this setting.</p>
<p>Similar to the above stripe count problem (which is about insufficient
customisation of Ganeti’s behaviour), we have limited
pass-through customisation of the various options of our storage
backends; while LVM has a system-wide configuration file that can be
used to tweak some of its behaviours, for DRBD we don’t use the
<strong class="command">drbdadm</strong> tool, and instead we call <strong class="command">drbdsetup</strong>
directly, with a fixed/restricted set of options; so for example one
cannot tweak the buffer sizes.</p>
<p>Another current problem is that the support for shared storage in HTools
is still limited, but this problem is outside the scope of this design document.</p>
</div>
<div class="section" id="locking">
<h4>Locking<a class="headerlink" href="#locking" title="Permalink to this headline"></a></h4>
<p>A further problem generated by the “current free” model is that during a
long operation which affects resource usage (e.g. disk replaces,
instance creations) we have to keep the respective objects locked
(sometimes even in exclusive mode), since we don’t want any concurrent
modifications to the <em>free</em> values.</p>
<p>A classic example of the locking problem is the following:</p>
<img src="_images/graphviz-86de5121f077bbeafb4e5fcea921318a9aa3de29.png" alt="digraph &quot;iallocator-lock-issues&quot; {
rankdir=TB;

start [style=invis];
node  [shape=box,width=2];
job1  [label=&quot;add instance\niallocator run\nchoose A,B&quot;];
job1e [label=&quot;finish add&quot;];
job2  [label=&quot;add instance\niallocator run\nwait locks&quot;];
job2s [label=&quot;acquire locks\nchoose C,D&quot;];
job2e [label=&quot;finish add&quot;];

job1  -&gt; job1e;
job2  -&gt; job2s -&gt; job2e;
edge [style=invis,weight=0];
start -&gt; {job1; job2}
job1  -&gt; job2;
job2  -&gt; job1e;
job1e -&gt; job2s [style=dotted,label=&quot;release locks&quot;];
}" />
<p>In the above example, the second IAllocator run will wait for locks for
nodes <code class="docutils literal"><span class="pre">A</span></code> and <code class="docutils literal"><span class="pre">B</span></code>, even though in the end the second instance will
be placed on another set of nodes (<code class="docutils literal"><span class="pre">C</span></code> and <code class="docutils literal"><span class="pre">D</span></code>). This wait shouldn’t
be needed, since right after the first IAllocator run has finished,
<strong class="command">hail</strong> knows the status of the cluster after the allocation,
and it could answer the question for the second run too; however, Ganeti
doesn’t have such visibility into the cluster state and thus it is
forced to wait with the second job.</p>
<p>Similar examples can be given for replace-disks (another long-running
opcode).</p>
</div>
<div class="section" id="policies">
<span id="label-policies"></span><h4>Policies<a class="headerlink" href="#policies" title="Permalink to this headline"></a></h4>
<p>For most of the resources, we have metrics defined by policy: e.g. the
over-subscription ratio for CPUs, the amount of space to reserve,
etc. Furthermore, although Ganeti has no definitions such
as minimum/maximum instance size, a real deployment will need to have
them, especially in a fully-automated workflow where end-users can
request instances via an automated interface (that talks to the cluster
via RAPI, LUXI or command line). However, such an automated interface
will also need to take into account cluster capacity, and if the
<strong class="command">hspace</strong> tool is used for the capacity computation, it needs to
be told the maximum instance size; it has, however, a built-in minimum
instance size which is not customisable.</p>
<p>It is clear that this situation leads to duplicate definitions of
resource policies, which makes it hard to change the respective policies
per-cluster (or globally), and furthermore creates
inconsistencies if such policies are not enforced at the source (i.e. in
Ganeti).</p>
</div>
<div class="section" id="balancing-algorithm">
<h4>Balancing algorithm<a class="headerlink" href="#balancing-algorithm" title="Permalink to this headline"></a></h4>
<p>The balancing algorithm, as documented in the HTools <code class="docutils literal"><span class="pre">README</span></code> file,
tries to minimise the cluster score; this score is based on a set of
metrics that describe both exceptional conditions and how spread the
instances are across the nodes. In order to achieve this goal, it moves
the instances around, with a series of moves of various types:</p>
<ul class="simple">
<li>disk replaces (for DRBD-based instances)</li>
<li>instance failover/migrations (for all types)</li>
</ul>
<p>However, the algorithm only looks at the cluster score, and not at the
<em>“cost”</em> of the moves. In other words, the following can and will happen
on a cluster:</p>
<img src="_images/graphviz-a5e06308192cb06bd14e53778380fe6f4ed30197.png" alt="digraph &quot;balancing-cost-issues&quot; {
rankdir=LR;
ranksep=1;

start     [label=&quot;score α&quot;, shape=hexagon];

node      [shape=box, width=2];
replace1  [label=&quot;replace_disks 500G\nscore α-3ε\ncost 3&quot;];
replace2a [label=&quot;replace_disks 20G\nscore α-2ε\ncost 2&quot;];
migrate1  [label=&quot;migrate\nscore α-ε\ncost 1&quot;];

choose    [shape=ellipse,label=&quot;choose min(score)=α-3ε\ncost 3&quot;];

start -&gt; {replace1; replace2a; migrate1} -&gt; choose;
}" />
<p>Even though a migration is much, much cheaper than a disk replace (in
terms of network and disk traffic on the cluster), if the disk replace
results in a score infinitesimally smaller, then it will be
chosen. Similarly, between two disk replaces, one moving e.g. <code class="docutils literal"><span class="pre">500GiB</span></code>
and one moving <code class="docutils literal"><span class="pre">20GiB</span></code>, the first one will be chosen if it results in
a score smaller than the second one. Furthermore, even if the resulting
scores are equal, the first computed solution will be kept, whichever it
is.</p>
<p>Fixing this algorithmic problem is doable, but currently Ganeti doesn’t
export enough information about nodes to make an informed decision; in
the above example, if the <code class="docutils literal"><span class="pre">500GiB</span></code> move is between nodes having fast
I/O (both disks and network), it makes sense to execute it over a disk
replace of <code class="docutils literal"><span class="pre">100GiB</span></code> between nodes with slow I/O, so simply looking at
the properties of the move itself is not enough; we need more node
information for cost computation.</p>
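<p>The issue can be sketched as follows (illustrative score/cost values, not the htools implementation):</p>
<div class="highlight-default"><div class="highlight"><pre><span></span># Candidate moves as (resulting_score, cost) pairs, mirroring the figure
moves = [(0.97, 3),   # replace_disks 500G
         (0.98, 2),   # replace_disks 20G
         (0.99, 1)]   # migrate

# Current behaviour: only the score is minimised; cost is ignored.
best = min(moves, key=lambda m: m[0])

# A cost-aware variant would need extra node data (disk and network
# speed) in order to trade a slightly worse score for a much cheaper move.
</pre></div>
</div>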
</div>
<div class="section" id="allocation-algorithm">
<h4>Allocation algorithm<a class="headerlink" href="#allocation-algorithm" title="Permalink to this headline"></a></h4>
<div class="admonition note">
<p class="first admonition-title">Note</p>
<p class="last">This design document will not address this limitation, but it
is worth mentioning as it directly related to the resource model.</p>
</div>
<p>The current allocation/capacity algorithm works as follows (per
node-group):</p>
<div class="highlight-default"><div class="highlight"><pre><span></span><span class="n">repeat</span><span class="p">:</span>
    <span class="n">allocate</span> <span class="n">instance</span> <span class="n">without</span> <span class="n">failing</span> <span class="n">N</span><span class="o">+</span><span class="mi">1</span>
</pre></div>
</div>
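<p>Expanded slightly, the loop can be sketched as follows (hypothetical helpers; the real logic lives in hspace/hail):</p>
<div class="highlight-default"><div class="highlight"><pre><span></span>def capacity(cluster, std_instance):
    # Sketch of the documented loop: keep allocating the standard
    # instance until no N+1-compliant placement remains.
    allocated = 0
    while True:
        placement = find_placement(cluster, std_instance)  # hypothetical helper
        if placement is None or violates_n1(cluster, placement):
            break
        apply_placement(cluster, placement)
        allocated += 1
    return allocated
</pre></div>
</div>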
<p>This simple algorithm, and its use of the <code class="docutils literal"><span class="pre">N+1</span></code> criterion, has a built-in
limit of 1 machine failure in the case of DRBD. This means the algorithm
guarantees that, if using DRBD storage, there are enough resources to
(re)start all affected instances in case of one machine failure. This
relates mostly to memory; there is no accounting for CPU over-subscription
(i.e. in case of failure, making sure we can fail over while still not
going over CPU limits), or for any other resource.</p>
<p>In case of shared storage, there’s not even the memory guarantee, as the
N+1 protection doesn’t work for shared storage.</p>
<p>If a given cluster administrator wants to survive up to two machine
failures, or wants to ensure CPU limits too for DRBD, there is no
possibility to configure this in HTools (neither in <strong class="command">hail</strong> nor
in <strong class="command">hspace</strong>). Current workaround employ for example deducting a
certain number of instances from the size computed by <strong class="command">hspace</strong>,
but this is a very crude method, and requires that instance creations
are limited before Ganeti (otherwise <strong class="command">hail</strong> would allocate
until the cluster is full).</p>
</div>
</div>
</div>
<div class="section" id="proposed-architecture">
<h2>Proposed architecture<a class="headerlink" href="#proposed-architecture" title="Permalink to this headline"></a></h2>
<p>There are two main changes proposed:</p>
<ul class="simple">
<li>changing the resource model from a pure <a class="reference internal" href="glossary.html#term-sow"><span class="xref std std-term">SoW</span></a> to a hybrid
<a class="reference internal" href="glossary.html#term-sor"><span class="xref std std-term">SoR</span></a>/<a class="reference internal" href="glossary.html#term-sow"><span class="xref std std-term">SoW</span></a> one, where the <a class="reference internal" href="glossary.html#term-sor"><span class="xref std std-term">SoR</span></a> component is
heavily emphasised</li>
<li>extending the resource model to cover additional properties,
completing the “holes” in the current coverage</li>
</ul>
<p>The second change is rather straightforward, but will add more
complexity in the modelling of the cluster. The first change, however,
represents a significant shift from the current model, which Ganeti has had
since its beginnings.</p>
<div class="section" id="lock-improved-resource-model">
<h3>Lock-improved resource model<a class="headerlink" href="#lock-improved-resource-model" title="Permalink to this headline"></a></h3>
<div class="section" id="hybrid-sor-sow-model">
<h4>Hybrid SoR/SoW model<a class="headerlink" href="#hybrid-sor-sow-model" title="Permalink to this headline"></a></h4>
<p>The resources of a node can be characterised in two broad classes:</p>
<ul class="simple">
<li>mostly static resources</li>
<li>dynamically changing resources</li>
</ul>
<p>In the first category, we have things such as total core count, total
memory size, total disk size, number of network interfaces etc. In the
second category we have things such as free disk space, free memory, CPU
load, etc. Note that nowadays we no longer have fully-static
resources: features like CPU and memory hot-plug, online disk replace,
etc. mean that theoretically all resources can change (there are some
practical limitations, of course).</p>
<p>Even though the rate of change of the two resource types is wildly
different, right now Ganeti handles both in the same way. Given that the
interval of change of the semi-static ones is much longer than most
Ganeti operations, even longer than lengthy sequences of Ganeti jobs, it
makes sense to treat them separately.</p>
<p>The proposal is then to move the following resources into the
configuration and treat the configuration as the authoritative source
for them (a <a class="reference internal" href="glossary.html#term-sor"><span class="xref std std-term">SoR</span></a> model):</p>
<ul class="simple">
<li><dl class="first docutils">
<dt>CPU resources:</dt>
<dd><ul class="first last">
<li>total core count</li>
<li>node core usage (<em>new</em>)</li>
</ul>
</dd>
</dl>
</li>
<li><dl class="first docutils">
<dt>memory resources:</dt>
<dd><ul class="first last">
<li>total memory size</li>
<li>node memory size</li>
<li>hypervisor overhead (<em>new</em>)</li>
</ul>
</dd>
</dl>
</li>
<li><dl class="first docutils">
<dt>disk resources:</dt>
<dd><ul class="first last">
<li>total disk size</li>
<li>disk overhead (<em>new</em>)</li>
</ul>
</dd>
</dl>
</li>
</ul>
<p>Since these resources can nevertheless change at run-time, we will need
functionality to update the recorded values.</p>
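<p>As an illustration, the recorded values could be grouped per node along these lines (a sketch only; the field names anticipate the node parameters described later, not a final schema):</p>
<div class="highlight-default"><div class="highlight"><pre><span></span>node_static = {
    "cpu": {"cpu_total": 16, "cpu_node": 1},              # node core usage is new
    "memory": {"mem_total": 65536, "mem_node": 1024,
               "mem_hv": 512},                            # hypervisor overhead is new
    "disk": {"disk_total": 2048000, "disk_overhead": 0},  # disk overhead is new
}
</pre></div>
</div>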
</div>
<div class="section" id="pre-computing-dynamic-resource-values">
<h4>Pre-computing dynamic resource values<a class="headerlink" href="#pre-computing-dynamic-resource-values" title="Permalink to this headline"></a></h4>
<p>Remember that the resource model used by HTools models the clusters as
obeying the following equations:</p>
<blockquote>
<div><p>disk<sub>free</sub> = disk<sub>total</sub> - ∑ disk<sub>instances</sub></p>
<p>mem<sub>free</sub> = mem<sub>total</sub> - ∑ mem<sub>instances</sub> - mem<sub>node</sub> - mem<sub>overhead</sub></p>
</div></blockquote>
<p>As this model worked fine for HTools, we can consider it valid and adopt
it in Ganeti. Furthermore, note that all values on the right-hand side
now come from the configuration:</p>
<ul class="simple">
<li>the per-instance usage values were already stored in the configuration</li>
<li>the other values are moved to the configuration as per the previous
section</li>
</ul>
<p>This means that we can now compute the free values without having to
actually live-query the nodes, which brings a significant advantage.</p>
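<p>A sketch of the resulting computation, using only configuration values (field and helper names are illustrative):</p>
<div class="highlight-default"><div class="highlight"><pre><span></span>def free_disk(node, instances):
    return node["disk_total"] - sum(i["disk_size"] for i in instances)

def free_mem(node, instances):
    return (node["mem_total"]
            - sum(i["mem_size"] for i in instances)
            - node["mem_node"]
            - node["mem_hv"])        # the overhead term in the formula above
</pre></div>
</div>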
<p>There are a couple of caveats to this model though. First, since the
run-time state of the instance is no longer taken into consideration, we
have to introduce a new <em>offline</em> state for an instance
(similar to the node one). In this state, the instance’s runtime
resources (memory and VCPUs) are no longer reserved for it, and can be
reused by other instances. Static resources like disk and MAC addresses
are still reserved though. Transitioning into and out of this reserved
state will be more involved than simply stopping/starting the instance
(e.g. de-offlining can fail due to missing resources). This complexity
is compensated by the increased consistency of what guarantees we have
in the stopped state (we always guarantee resource reservation), and the
potential for management tools to restrict which users can transition
into/out of this state separately from which users can stop/start the
instance.</p>
</div>
<div class="section" id="separating-per-node-resource-locks">
<h4>Separating per-node resource locks<a class="headerlink" href="#separating-per-node-resource-locks" title="Permalink to this headline"></a></h4>
<p>Many of the current node locks in Ganeti exist in order to guarantee
correct resource state computation, whereas others are designed to
guarantee reasonable run-time performance of nodes (e.g. by not
overloading the I/O subsystem). This is an unfortunate coupling, since
it means for example that the following two operations conflict in
practice even though they are orthogonal:</p>
<ul class="simple">
<li>replacing an instance’s disk on a node</li>
<li>computing node disk/memory free for an IAllocator run</li>
</ul>
<p>This conflict significantly increases the lock contention on a big/busy
cluster and is at odds with the goal of increasing the cluster size.</p>
<p>The proposal is therefore to add a new level of locking that is only
used to prevent concurrent modification to the resource states (either
node properties or instance properties) and not for long-term
operations:</p>
<ul class="simple">
<li>instance creation needs to acquire and keep this lock until adding the
instance to the configuration</li>
<li>instance modification needs to acquire and keep this lock until
updating the instance</li>
<li>node property changes will need to acquire this lock for the
modification</li>
</ul>
<p>The new lock level will sit before the instance level (right after BGL)
and could either be single-valued (like the “Big Ganeti Lock”), in which
case we won’t be able to modify two nodes at the same time, or per-node,
in which case the list of locks at this level needs to be synchronised
with the node lock level. To be determined.</p>
</div>
<div class="section" id="lock-contention-reduction">
<h4>Lock contention reduction<a class="headerlink" href="#lock-contention-reduction" title="Permalink to this headline"></a></h4>
<p>Based on the above, the locking contention will be reduced as follows:
IAllocator calls will no longer need the <code class="docutils literal"><span class="pre">LEVEL_NODE:</span> <span class="pre">ALL_SET</span></code> lock,
only the resource lock (in exclusive mode). Hence allocating/computing
evacuation targets will no longer conflict for longer than the time to
compute the allocation solution.</p>
<p>The remaining long-running locks will be the DRBD replace-disks ones
(exclusive mode). These can also be removed, or changed into shared
locks, but that is a separate design change.</p>
<div class="admonition-fixme admonition">
<p class="first admonition-title">FIXME</p>
<p class="last">Need to rework instance replace disks. I don’t think we need exclusive
locks for replacing disks: it is safe to stop/start the instance while
it’s doing a replace disks. Only modify would need exclusive, and only
for transitioning into/out of offline state.</p>
</div>
</div>
</div>
<div class="section" id="instance-memory-model">
<h3>Instance memory model<a class="headerlink" href="#instance-memory-model" title="Permalink to this headline"></a></h3>
<p>In order to support ballooning, the instance memory model needs to be
changed from a “memory size” one to a “min/max memory size”. This
interacts with the new static resource model, however, and thus we need
to declare a priori the expected oversubscription ratio on the cluster.</p>
<p>The new minimum memory size parameter will be similar to the current
memory size; the cluster will guarantee that in all circumstances, all
instances will have available their minimum memory size. The maximum
memory size will permit burst usage of more memory by instances, with
the restriction that the sum of maximum memory usage will not be more
than the free memory times the oversubscription factor:</p>
<blockquote>
<div><p>∑ memory<sub>min</sub> ≤ memory<sub>available</sub></p>
<p>∑ memory<sub>max</sub> ≤ memory<sub>free</sub> * oversubscription_ratio</p>
</div></blockquote>
<p>The hypervisor will have the possibility of adjusting the instance’s
memory size dynamically between these two boundaries.</p>
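<p>A sketch of the two constraints as a single check (hypothetical helper; memory<sub>available</sub> and memory<sub>free</sub> as defined in the memory model earlier):</p>
<div class="highlight-default"><div class="highlight"><pre><span></span>def memory_constraints_hold(instances, memory_available, memory_free, ratio):
    # Both inequalities from the design must hold for the node.
    min_sum = sum(i["min_mem"] for i in instances)
    max_sum = sum(i["max_mem"] for i in instances)
    return min_sum &lt;= memory_available and max_sum &lt;= memory_free * ratio
</pre></div>
</div>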
<p>Note that the minimum memory is related to the available memory on the
node, whereas the maximum memory is related to the free memory. On
DRBD-enabled clusters, this will have the advantage of using the
reserved memory for N+1 failover for burst usage, instead of having it
completely idle.</p>
<div class="admonition-fixme admonition">
<p class="first admonition-title">FIXME</p>
<p class="last">Need to document how Ganeti forces minimum size at runtime, overriding
the hypervisor, in cases of failover/lack of resources.</p>
</div>
</div>
<div class="section" id="new-parameters">
<h3>New parameters<a class="headerlink" href="#new-parameters" title="Permalink to this headline"></a></h3>
<p>Unfortunately the design will add a significant number of new
parameters, and change the meaning of some of the current ones.</p>
<div class="section" id="instance-size-limits">
<h4>Instance size limits<a class="headerlink" href="#instance-size-limits" title="Permalink to this headline"></a></h4>
<p>As described in <a class="reference internal" href="#label-policies"><span class="std std-ref">Policies</span></a>, we currently lack a clear
definition of the supported instance sizes (minimum, maximum and
standard). As such, we will add the following structure to the cluster
parameters:</p>
<ul class="simple">
<li><code class="docutils literal"><span class="pre">min_ispec</span></code>, <code class="docutils literal"><span class="pre">max_ispec</span></code>: minimum and maximum acceptable instance
specs</li>
<li><code class="docutils literal"><span class="pre">std_ispec</span></code>: standard instance size, which will be used for capacity
computations and for default parameters on the instance creation
request</li>
</ul>
<p>Ganeti will by default reject non-standard instance sizes (lower than
<code class="docutils literal"><span class="pre">min_ispec</span></code> or greater than <code class="docutils literal"><span class="pre">max_ispec</span></code>), but as usual a
<code class="docutils literal"><span class="pre">--ignore-ipolicy</span></code> option on the command line or in the RAPI request
will override these constraints. The <code class="docutils literal"><span class="pre">std_ispec</span></code> structure will be used
to fill in missing instance specifications at instance creation.</p>
<p>Each of the ispec structures will be a dictionary, since the contents
can change over time. Initially, we will define the following variables
in these structures:</p>
<table border="1" class="docutils">
<colgroup>
<col width="24%" />
<col width="54%" />
<col width="22%" />
</colgroup>
<thead valign="bottom">
<tr class="row-odd"><th class="head">Name</th>
<th class="head">Description</th>
<th class="head">Type</th>
</tr>
</thead>
<tbody valign="top">
<tr class="row-even"><td>mem_size</td>
<td>Allowed memory size</td>
<td>int</td>
</tr>
<tr class="row-odd"><td>cpu_count</td>
<td>Allowed vCPU count</td>
<td>int</td>
</tr>
<tr class="row-even"><td>disk_count</td>
<td>Allowed disk count</td>
<td>int</td>
</tr>
<tr class="row-odd"><td>disk_size</td>
<td>Allowed disk size</td>
<td>int</td>
</tr>
<tr class="row-even"><td>nic_count</td>
<td>Allowed NIC count</td>
<td>int</td>
</tr>
</tbody>
</table>
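<p>A sketch of how an instance spec could be validated against these structures (hypothetical helper; real enforcement also honours the override described above):</p>
<div class="highlight-default"><div class="highlight"><pre><span></span>ISPEC_KEYS = ("mem_size", "cpu_count", "disk_count", "disk_size", "nic_count")

def spec_in_range(spec, min_ispec, max_ispec):
    # Reject specs outside [min_ispec, max_ispec].
    return all(min_ispec[k] &lt;= spec[k] &lt;= max_ispec[k] for k in ISPEC_KEYS)
</pre></div>
</div>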
<div class="section" id="inheritance">
<h5>Inheritance<a class="headerlink" href="#inheritance" title="Permalink to this headline"></a></h5>
<p>In a single-group cluster, the above structure is sufficient. However,
on a multi-group cluster, it could be that the hardware specifications
differ across node groups, and thus the following problem appears: how
can Ganeti present unified specifications over RAPI?</p>
<p>Since the set of instance specs is only partially ordered (as opposed to
the sets of values of individual variables in the spec, which are totally
ordered), it follows that we can’t present unified specs. As such, the
proposed approach is to allow the <code class="docutils literal"><span class="pre">min_ispec</span></code> and <code class="docutils literal"><span class="pre">max_ispec</span></code> to be
customised per node-group (and export them as a list of specifications),
and a single <code class="docutils literal"><span class="pre">std_ispec</span></code> at cluster level (exported as a single value).</p>
</div>
</div>
<div class="section" id="allocation-parameters">
<h4>Allocation parameters<a class="headerlink" href="#allocation-parameters" title="Permalink to this headline"></a></h4>
<p>Besides the min/max instance size limits, there are other parameters
related to capacity and allocation limits. These mostly concern the
problem of over-allocation.</p>
<table border="1" class="docutils">
<colgroup>
<col width="24%" />
<col width="14%" />
<col width="39%" />
<col width="14%" />
<col width="9%" />
</colgroup>
<thead valign="bottom">
<tr class="row-odd"><th class="head">Name</th>
<th class="head">Level(s)</th>
<th class="head">Description</th>
<th class="head">Current
value</th>
<th class="head">Type</th>
</tr>
</thead>
<tbody valign="top">
<tr class="row-even"><td>vcpu_ratio</td>
<td>cluster,
node group</td>
<td>Maximum ratio of virtual to
physical CPUs</td>
<td>64 (only
in htools)</td>
<td>float</td>
</tr>
<tr class="row-odd"><td>spindle_ratio</td>
<td>cluster,
node group</td>
<td>Maximum ratio of instances
to spindles; when the I/O
model doesn’t map directly
to spindles, another
measure of I/O should be
used instead</td>
<td>none</td>
<td>float</td>
</tr>
<tr class="row-even"><td>max_node_failures</td>
<td>cluster,
node group</td>
<td>Cap allocation/capacity so
that the cluster can
survive this many node
failures</td>
<td>1
(hardcoded
in htools)</td>
<td>int</td>
</tr>
</tbody>
</table>
<p>Since these are used mostly internally (in htools), they will be
exported as-is from Ganeti, without explicit handling of node-group
grouping.</p>
<p>Regarding <code class="docutils literal"><span class="pre">spindle_ratio</span></code>, in this context spindles do not necessarily
have to mean actual mechanical hard drives; it’s rather a measure of
I/O performance for internal storage.</p>
</div>
<div class="section" id="disk-parameters">
<h4>Disk parameters<a class="headerlink" href="#disk-parameters" title="Permalink to this headline"></a></h4>
<p>The proposed model for the new disk parameters is a simple free-form one
based on dictionaries, indexed per disk template and parameter name.
Only the disk template parameters are visible to the user, and those are
internally translated to logical disk level parameters.</p>
<p>This is a simplification, because each parameter is applied to a whole
nested structure and there is no way of fine-tuning each level’s
parameters, but it is good enough for the current parameter set. This
model may need to be expanded, e.g., if support for three-node stacked
DRBD setups is added to Ganeti.</p>
<p>At the JSON level, since object keys have to be strings, the keys can be
encoded via a separator (e.g. a slash), or by having two dict levels.</p>
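<p>The two encodings mentioned above would look roughly like this (illustrative values only):</p>
<div class="highlight-default"><div class="highlight"><pre><span></span># Separator-based keys ...
disk_params_flat = {"plain/stripes": 1,
                    "drbd/data-stripes": 2,
                    "drbd/metavg": "xenvg"}

# ... or two dictionary levels
disk_params_nested = {"plain": {"stripes": 1},
                      "drbd": {"data-stripes": 2, "metavg": "xenvg"}}
</pre></div>
</div>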
<p>When needed, the unit of measurement is expressed inside square
brackets.</p>
<table border="1" class="docutils">
<colgroup>
<col width="11%" />
<col width="19%" />
<col width="34%" />
<col width="28%" />
<col width="8%" />
</colgroup>
<thead valign="bottom">
<tr class="row-odd"><th class="head">Disk
template</th>
<th class="head">Name</th>
<th class="head">Description</th>
<th class="head">Current status</th>
<th class="head">Type</th>
</tr>
</thead>
<tbody valign="top">
<tr class="row-even"><td>plain</td>
<td>stripes</td>
<td>How many stripes to use
for newly created (plain)
logical volumes</td>
<td>Configured at
./configure time, not
overridable at
runtime</td>
<td>int</td>
</tr>
<tr class="row-odd"><td>drbd</td>
<td>data-stripes</td>
<td>How many stripes to use
for data volumes</td>
<td>Same as for
plain/stripes</td>
<td>int</td>
</tr>
<tr class="row-even"><td>drbd</td>
<td>metavg</td>
<td>Default volume group for
the metadata LVs</td>
<td>Same as the main
volume group,
overridable via
‘metavg’ key</td>
<td>string</td>
</tr>
<tr class="row-odd"><td>drbd</td>
<td>meta-stripes</td>
<td>How many stripes to use
for meta volumes</td>
<td>Same as for lvm
‘stripes’, suboptimal
as the meta LVs are
small</td>
<td>int</td>
</tr>
<tr class="row-even"><td>drbd</td>
<td>disk-barriers</td>
<td>What kind of barriers to
<em>disable</em> for disks;
either “n” or a string
containing a subset of
“bfd”</td>
<td>Either all enabled or
all disabled, per
./configure time
option</td>
<td>string</td>
</tr>
<tr class="row-odd"><td>drbd</td>
<td>meta-barriers</td>
<td>Whether to disable or not
the barriers for the meta
volume</td>
<td>Handled together with
disk-barriers</td>
<td>bool</td>
</tr>
<tr class="row-even"><td>drbd</td>
<td>resync-rate</td>
<td>The (static) resync rate
for drbd, when using the
static syncer, in KiB/s</td>
<td>Hardcoded in
constants.py, not
changeable via Ganeti</td>
<td>int</td>
</tr>
<tr class="row-odd"><td>drbd</td>
<td>dynamic-resync</td>
<td>Whether to use the
dynamic resync speed
controller or not. If
enabled, c-plan-ahead
must be non-zero and all
the c-* parameters will
be used by DRBD.
Otherwise, the value of
resync-rate will be used
as a static resync speed.</td>
<td>Not supported.</td>
<td>bool</td>
</tr>
<tr class="row-even"><td>drbd</td>
<td>c-plan-ahead</td>
<td>Agility factor of the
dynamic resync speed
controller. (the higher,
the slower the algorithm
will adapt the resync
speed). A value of 0
(that is the default)
disables the controller
[ds]</td>
<td>Not supported.</td>
<td>int</td>
</tr>
<tr class="row-odd"><td>drbd</td>
<td>c-fill-target</td>
<td>Maximum amount of
in-flight resync data
for the dynamic resync
speed controller
[sectors]</td>
<td>Not supported.</td>
<td>int</td>
</tr>
<tr class="row-even"><td>drbd</td>
<td>c-delay-target</td>
<td>Maximum estimated peer
response latency for the
dynamic resync speed
controller [ds]</td>
<td>Not supported.</td>
<td>int</td>
</tr>
<tr class="row-odd"><td>drbd</td>
<td>c-max-rate</td>
<td>Upper bound on resync
speed for the dynamic
resync speed controller
[KiB/s]</td>
<td>Not supported.</td>
<td>int</td>
</tr>
<tr class="row-even"><td>drbd</td>
<td>c-min-rate</td>
<td>Minimum resync speed for
the dynamic resync speed
controller [KiB/s]</td>
<td>Not supported.</td>
<td>int</td>
</tr>
<tr class="row-odd"><td>drbd</td>
<td>disk-custom</td>
<td>Free-form string that
will be appended to the
drbdsetup disk command
line, for custom options
not supported by Ganeti
itself</td>
<td>Not supported</td>
<td>string</td>
</tr>
<tr class="row-even"><td>drbd</td>
<td>net-custom</td>
<td>Free-form string for
custom net setup options</td>
<td>Not supported</td>
<td>string</td>
</tr>
</tbody>
</table>
<p>Currently Ganeti supports only DRBD 8.0.x, 8.2.x, 8.3.x.  It will refuse
to work with DRBD 8.4 since the <strong class="command">drbdsetup</strong> syntax has changed
significantly.</p>
<p>The barriers-related parameters have been introduced in different DRBD
versions; please make sure that your version supports all the barrier
parameters that you pass to Ganeti. Any version later than 8.3.0
implements all of them.</p>
<p>The minimum DRBD version for using the dynamic resync speed controller
is 8.3.9, since previous versions implement different parameters.</p>
<p>A more detailed discussion of the dynamic resync speed controller
parameters is outside the scope of the present document. Please refer to
the <code class="docutils literal"><span class="pre">drbdsetup</span></code> man page
(<a class="reference external" href="http://www.drbd.org/users-guide-8.3/re-drbdsetup.html">8.3</a> and
<a class="reference external" href="http://www.drbd.org/users-guide/re-drbdsetup.html">8.4</a>). An
interesting discussion about them can also be found in a
<a class="reference external" href="http://lists.linbit.com/pipermail/drbd-user/2011-August/016739.html">drbd-user mailing list post</a>.</p>
<p>All the above parameters are at cluster and node group level; as in
other parts of the code, the intention is that all nodes in a node group
should be equal. It will later be decided to which node group to give
precedence in the case of instances split over node groups.</p>
<div class="admonition-fixme admonition">
<p class="first admonition-title">FIXME</p>
<p class="last">Add details about when each parameter change takes effect (device
creation vs. activation)</p>
</div>
</div>
<div class="section" id="node-parameters">
<h4>Node parameters<a class="headerlink" href="#node-parameters" title="Permalink to this headline"></a></h4>
<p>For the new memory model, we’ll add the following parameters, in a
dictionary indexed by the hypervisor name (node attribute
<code class="docutils literal"><span class="pre">hv_state</span></code>). The rationale is that, even though multi-hypervisor
clusters are rare, they make sense sometimes, and thus we need to
support multiple node states (one per hypervisor).</p>
<p>Since usually only one of the multiple hypervisors is the ‘main’ one
(and the others are used sparingly), capacity computation will still only
use the first hypervisor, and not all of them. Thus we avoid possible
inconsistencies.</p>
<table border="1" class="docutils">
<colgroup>
<col width="15%" />
<col width="52%" />
<col width="22%" />
<col width="10%" />
</colgroup>
<thead valign="bottom">
<tr class="row-odd"><th class="head">Name</th>
<th class="head">Description</th>
<th class="head">Current state</th>
<th class="head">Type</th>
</tr>
</thead>
<tbody valign="top">
<tr class="row-even"><td>mem_total</td>
<td>Total node memory, as discovered by
this hypervisor</td>
<td>Queried at
runtime</td>
<td>int</td>
</tr>
<tr class="row-odd"><td>mem_node</td>
<td>Memory used by, or reserved for,
the node itself; note that some
hypervisors can report this in an
authoritative way, others cannot</td>
<td>Queried at
runtime</td>
<td>int</td>
</tr>
<tr class="row-even"><td>mem_hv</td>
<td>Memory used either by the
hypervisor itself or lost due to
instance allocation rounding;
usually this cannot be precisely
computed, but only roughly
estimated</td>
<td>Not used,
htools computes
it internally</td>
<td>int</td>
</tr>
<tr class="row-odd"><td>cpu_total</td>
<td>Total node cpu (core) count;
usually this can be discovered
automatically</td>
<td>Queried at
runtime</td>
<td>int</td>
</tr>
<tr class="row-even"><td>cpu_node</td>
<td>Number of cores reserved for the
node itself; this can either be
discovered or set manually. Only
used for estimating how many VCPUs
are left for instances</td>
<td>Not used at all</td>
<td>int</td>
</tr>
</tbody>
</table>
<p>Of the above parameters, only the <code class="docutils literal"><span class="pre">_total</span></code> ones are straightforward. The
others sometimes have strange semantics:</p>
<ul class="simple">
<li>Xen can report <code class="docutils literal"><span class="pre">mem_node</span></code>, if configured statically (as we
recommend); but Linux-based hypervisors (KVM, chroot, LXC) do not, and
for these this value needs to be configured statically</li>
<li><code class="docutils literal"><span class="pre">mem_hv</span></code>, representing unaccounted for memory, is not directly
computable; on Xen, it can be seen that on a N GB machine, with 1 GB
for dom0 and N-2 GB for instances, there’s just a few MB left, instead
fo a full 1 GB of RAM; however, the exact value varies with the total
memory size (at least)</li>
<li><code class="docutils literal"><span class="pre">cpu_node</span></code> only makes sense on Xen (currently), in the case when we
restrict dom0; for Linux-based hypervisors, the node itself cannot be
easily restricted, so it should be set as an estimate of how “heavy”
the node loads will be</li>
</ul>
<p>Since these last two values cannot be auto-computed from the node, we
need to be able to declare defaults for them at cluster level (how
useful they are at node group level is debatable); the proposal is to
do this via a cluster-level <code class="docutils literal"><span class="pre">hv_state</span></code> dict (per hypervisor). A possible
layout is sketched below.</p>
<p>Besides the per-hypervisor attributes, we also have disk attributes,
which are queried directly on the node (without hypervisor
involvement). They are stored in a separate attribute (<code class="docutils literal"><span class="pre">disk_state</span></code>),
which is indexed per storage type and name; currently this will be just
<code class="docutils literal"><span class="pre">DT_PLAIN</span></code> and the volume name as the key (see the sketch after the
table below).</p>
<table border="1" class="docutils">
<colgroup>
<col width="20%" />
<col width="38%" />
<col width="30%" />
<col width="12%" />
</colgroup>
<thead valign="bottom">
<tr class="row-odd"><th class="head">Name</th>
<th class="head">Description</th>
<th class="head">Current state</th>
<th class="head">Type</th>
</tr>
</thead>
<tbody valign="top">
<tr class="row-even"><td>disk_total</td>
<td>Total disk size</td>
<td>Queried at runtime</td>
<td>int</td>
</tr>
<tr class="row-odd"><td>disk_reserved</td>
<td>Reserved disk size; this
is a lower limit on the
free space, if such a
limit is desired</td>
<td>None used in Ganeti;
htools has a
parameter for this</td>
<td>int</td>
</tr>
<tr class="row-even"><td>disk_overhead</td>
<td>Disk space expected to
be used by other volumes
(set via
<code class="docutils literal"><span class="pre">reserved_lvs</span></code>);
usually this should be zero</td>
<td>None used in Ganeti;
htools detects this
at runtime</td>
<td>int</td>
</tr>
</tbody>
</table>
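<p>Again as an illustration only, a <code class="docutils literal"><span class="pre">disk_state</span></code> structure following the
above description could look as below; the storage-type constant and the
volume name are examples, and the helper is hypothetical.</p>
<div class="highlight-python"><div class="highlight"><pre>
# Illustrative only: disk_state indexed by storage type and then by name;
# all sizes in MiB.
NODE_DISK_STATE = {
    "DT_PLAIN": {
        "xenvg": {
            "disk_total": 1048576,   # total size of this volume group
            "disk_reserved": 51200,  # desired lower limit on free space
            "disk_overhead": 0,      # space taken by reserved_lvs volumes
        },
    },
}

def allocatable_disk(state, already_allocated):
    """Rough estimate of space still allocatable to instances, under the
    semantics assumed above."""
    return (state["disk_total"]
            - already_allocated
            - state["disk_overhead"]
            - state["disk_reserved"])
</pre></div></div>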
</div>
<div class="section" id="instance-parameters">
<h4>Instance parameters<a class="headerlink" href="#instance-parameters" title="Permalink to this headline"></a></h4>
<p>New instance parameters, needed especially for supporting the new memory
model:</p>
<table border="1" class="docutils">
<colgroup>
<col width="20%" />
<col width="48%" />
<col width="24%" />
<col width="8%" />
</colgroup>
<thead valign="bottom">
<tr class="row-odd"><th class="head">Name</th>
<th class="head">Description</th>
<th class="head">Current status</th>
<th class="head">Type</th>
</tr>
</thead>
<tbody valign="top">
<tr class="row-even"><td>offline</td>
<td>Whether the instance is in
“permanent” offline mode; this is
stronger than the “admin_down”
state, and is similar to the node
offline attribute</td>
<td>Not supported</td>
<td>bool</td>
</tr>
<tr class="row-odd"><td>be/max_memory</td>
<td>The maximum memory the instance is
allowed to use</td>
<td>Nonexistent, but
virtually
identical to
memory</td>
<td>int</td>
</tr>
</tbody>
</table>
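<p>The sketch below illustrates how these two parameters could enter a
node-level memory reservation; the rule that a permanently offline
instance reserves no memory is an assumption of this sketch (consistent
with the intent of the offline flag), not a normative part of the
design, and the helper and data layout are hypothetical.</p>
<div class="highlight-python"><div class="highlight"><pre>
# Illustrative only: summing up memory reservations from the new
# per-instance parameters; values in MiB.
INSTANCES = [
    {"name": "inst1", "offline": False, "be/max_memory": 4096},
    {"name": "inst2", "offline": False, "be/max_memory": 2048},
    {"name": "inst3", "offline": True,  "be/max_memory": 8192},
]

def reserved_instance_memory(instances):
    """Sum be/max_memory over instances that are not permanently offline
    (assumption of this sketch)."""
    return sum(i["be/max_memory"] for i in instances if not i["offline"])

print(reserved_instance_memory(INSTANCES))  # 6144
</pre></div></div>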
</div>
</div>
<div class="section" id="htools-changes">
<h3>HTools changes<a class="headerlink" href="#htools-changes" title="Permalink to this headline"></a></h3>
<p>All the new parameters (the node, instance and cluster ones; the disk
ones less so) will need to be taken into account by HTools, both in
balancing and in capacity computation.</p>
<p>Since Ganeti’s cluster model is much enhanced, Ganeti can also export
its own reserved/overhead values, and as such HTools can make fewer
“guesses” about the differences in values; a sketch of the resulting
free-memory computation is given below.</p>
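<p>A minimal sketch of how per-node free memory could then be derived
from the exported values; the formula is an assumption consistent with
the parameter descriptions above, not a normative definition.</p>
<div class="highlight-python"><div class="highlight"><pre>
# Illustrative only: free memory once mem_node, mem_hv and the instance
# reservations are exported by Ganeti itself (all values in MiB).
def node_free_memory(hv_state, reserved_instance_memory):
    return (hv_state["mem_total"]
            - hv_state["mem_node"]          # the node's own memory (e.g. dom0)
            - hv_state["mem_hv"]            # estimated hypervisor overhead
            - reserved_instance_memory)     # sum of be/max_memory (online only)
</pre></div></div>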
<div class="admonition-fixme admonition">
<p class="first admonition-title">FIXME</p>
<p class="last">Need to detail more the htools changes; the model is clear to me, but
need to write it down.</p>
</div>
</div>
</div>
</div>


          </div>
        </div>
      </div>
      <div class="sphinxsidebar" role="navigation" aria-label="main navigation">
        <div class="sphinxsidebarwrapper">
  <h3><a href="index.html">Table Of Contents</a></h3>
  <ul>
<li><a class="reference internal" href="#">Resource model changes</a><ul>
<li><a class="reference internal" href="#introduction">Introduction</a></li>
<li><a class="reference internal" href="#current-situation">Current situation</a><ul>
<li><a class="reference internal" href="#ganeti">Ganeti</a></li>
<li><a class="reference internal" href="#htools">HTools</a><ul>
<li><a class="reference internal" href="#disk">Disk</a></li>
<li><a class="reference internal" href="#memory">Memory</a></li>
<li><a class="reference internal" href="#cpu">CPU</a></li>
<li><a class="reference internal" href="#dynamic-load">Dynamic load</a></li>
</ul>
</li>
<li><a class="reference internal" href="#limitations">Limitations</a><ul>
<li><a class="reference internal" href="#id1">Memory</a></li>
<li><a class="reference internal" href="#disks">Disks</a></li>
<li><a class="reference internal" href="#locking">Locking</a></li>
<li><a class="reference internal" href="#policies">Policies</a></li>
<li><a class="reference internal" href="#balancing-algorithm">Balancing algorithm</a></li>
<li><a class="reference internal" href="#allocation-algorithm">Allocation algorithm</a></li>
</ul>
</li>
</ul>
</li>
<li><a class="reference internal" href="#proposed-architecture">Proposed architecture</a><ul>
<li><a class="reference internal" href="#lock-improved-resource-model">Lock-improved resource model</a><ul>
<li><a class="reference internal" href="#hybrid-sor-sow-model">Hybrid SoR/SoW model</a></li>
<li><a class="reference internal" href="#pre-computing-dynamic-resource-values">Pre-computing dynamic resource values</a></li>
<li><a class="reference internal" href="#separating-per-node-resource-locks">Separating per-node resource locks</a></li>
<li><a class="reference internal" href="#lock-contention-reduction">Lock contention reduction</a></li>
</ul>
</li>
<li><a class="reference internal" href="#instance-memory-model">Instance memory model</a></li>
<li><a class="reference internal" href="#new-parameters">New parameters</a><ul>
<li><a class="reference internal" href="#instance-size-limits">Instance size limits</a><ul>
<li><a class="reference internal" href="#inheritance">Inheritance</a></li>
</ul>
</li>
<li><a class="reference internal" href="#allocation-parameters">Allocation parameters</a></li>
<li><a class="reference internal" href="#disk-parameters">Disk parameters</a></li>
<li><a class="reference internal" href="#node-parameters">Node parameters</a></li>
<li><a class="reference internal" href="#instance-parameters">Instance parameters</a></li>
</ul>
</li>
<li><a class="reference internal" href="#htools-changes">HTools changes</a></li>
</ul>
</li>
</ul>
</li>
</ul>

  <h4>Previous topic</h4>
  <p class="topless"><a href="design-reason-trail.html"
                        title="previous chapter">Ganeti reason trail</a></p>
  <h4>Next topic</h4>
  <p class="topless"><a href="design-restricted-commands.html"
                        title="next chapter">Design for executing commands via RPC</a></p>
  <div role="note" aria-label="source link">
    <h3>This Page</h3>
    <ul class="this-page-menu">
      <li><a href="_sources/design-resource-model.rst.txt"
            rel="nofollow">Show Source</a></li>
    </ul>
   </div>
<div id="searchbox" style="display: none" role="search">
  <h3>Quick search</h3>
    <form class="search" action="search.html" method="get">
      <div><input type="text" name="q" /></div>
      <div><input type="submit" value="Go" /></div>
      <input type="hidden" name="check_keywords" value="yes" />
      <input type="hidden" name="area" value="default" />
    </form>
</div>
<script type="text/javascript">$('#searchbox').show(0);</script>
        </div>
      </div>
      <div class="clearer"></div>
    </div>
    <div class="related" role="navigation" aria-label="related navigation">
      <h3>Navigation</h3>
      <ul>
        <li class="right" style="margin-right: 10px">
          <a href="design-restricted-commands.html" title="Design for executing commands via RPC"
             >next</a></li>
        <li class="right" >
          <a href="design-reason-trail.html" title="Ganeti reason trail"
             >previous</a> |</li>
        <li class="nav-item nav-item-0"><a href="index.html">Ganeti 2.16.0~rc2 documentation</a> &#187;</li> 
      </ul>
    </div>
    <div class="footer" role="contentinfo">
        &#169; Copyright 2018, 2007, 2008, 2009, 2010, 2011, 2012, 2013, 2014, 2015 Google Inc.
      Created using <a href="http://sphinx-doc.org/">Sphinx</a> 1.6.7.
    </div>
  </body>
</html>