Skip to content

Commit

Permalink
Add more Rome models (#292)
Browse files Browse the repository at this point in the history
  • Loading branch information
wenkaidu committed Dec 6, 2020
1 parent d46295d commit e87f28e
Show file tree
Hide file tree
Showing 6 changed files with 307 additions and 1 deletion.
33 changes: 33 additions & 0 deletions src/graph/rome_models.h
Original file line number Diff line number Diff line change
Expand Up @@ -166,6 +166,36 @@ static struct rcclRomeModel rome_model_28 = {
.ringBase = "0 3 2 1 4 5 6 7|7 6 5 4 1 2 3 0|0 2 5 7 4 6 3 1|1 3 6 4 7 5 2 0",
};

// Rome topology model 40: 8 GPUs spread over 4 CPU NUMA nodes, 1 NIC, 3 links per GPU.
// The values agree with tools/topo_expl/models/topo_4p3l_n2.xml.
static struct rcclRomeModel rome_model_40 = {
  .nGpus  = 8,
  .nCpus  = 4,
  .nNics  = 1,
  .nLinks = 3,
  // GPU ids in device order; each is the PCI bus id with separators dropped
  // (e.g. 0000:43:00.0 -> 0x43000).
  .gpuIds = {
    0x43000, 0x23000, 0x26000, 0x3000,
    0xc3000, 0xc6000, 0xa3000, 0x83000,
  },
  // NUMA node of each GPU, same order as gpuIds.
  .gpuNuma = { 0, 1, 1, 1, 2, 2, 3, 3, },
  // NUMA node of the single NIC.
  .nicNuma = { 2, },
  // 8x8 matrix stored row-major: row i, column j is 1 when GPU i lists GPU j
  // as a link target. GPUs {0,2,3,5} and {1,4,6,7} each form a fully
  // connected group with no links between the two groups.
  .connMatrix = {
    0, 0, 1, 1, 0, 1, 0, 0,
    0, 0, 0, 0, 1, 0, 1, 1,
    1, 0, 0, 1, 0, 1, 0, 0,
    1, 0, 1, 0, 0, 1, 0, 0,
    0, 1, 0, 0, 0, 0, 1, 1,
    1, 0, 1, 1, 0, 0, 0, 0,
    0, 1, 0, 0, 1, 0, 0, 1,
    0, 1, 0, 0, 1, 0, 1, 0,
  },
  // Two digits per CPU: GPU count then NIC count (1/0, 3/0, 2/1, 2/0),
  // matching gpuNuma and nicNuma above.
  .pattern = "10302120",
  // Two orderings of the 8 GPU indices, separated by '|'
  // (presumably the base rings for this topology -- confirm against the consumer).
  .ringBase = "6 7 1 4 0 5 3 2|7 6 4 1 0 2 3 5",
};

// Rome topology model 42: 8 GPUs, 7 CPU NUMA nodes, 1 NIC, 3 links per GPU.
// GPU ids, NUMA placement and link matrix agree with
// tools/topo_expl/models/topo_4p3l_n4.xml.
static struct rcclRomeModel rome_model_42 = {
  .nGpus  = 8,
  .nCpus  = 7,
  .nNics  = 1,
  .nLinks = 3,
  // GPU ids in device order; each is the PCI bus id with separators dropped
  // (e.g. 0000:43:00.0 -> 0x43000).
  .gpuIds = {
    0x43000, 0x23000, 0x26000, 0x3000,
    0xc3000, 0xc6000, 0xa3000, 0x83000,
  },
  // NUMA node of each GPU, same order as gpuIds.
  .gpuNuma = { 1, 2, 2, 3, 5, 5, 6, 7, },
  // NUMA node of the single NIC; no GPU shares it.
  .nicNuma = { 4, },
  // 8x8 matrix stored row-major: row i, column j is 1 when GPU i lists GPU j
  // as a link target. GPUs {0,2,3,5} and {1,4,6,7} each form a fully
  // connected group with no links between the two groups.
  .connMatrix = {
    0, 0, 1, 1, 0, 1, 0, 0,
    0, 0, 0, 0, 1, 0, 1, 1,
    1, 0, 0, 1, 0, 1, 0, 0,
    1, 0, 1, 0, 0, 1, 0, 0,
    0, 1, 0, 0, 0, 0, 1, 1,
    1, 0, 1, 1, 0, 0, 0, 0,
    0, 1, 0, 0, 1, 0, 0, 1,
    0, 1, 0, 0, 1, 0, 1, 0,
  },
  // Seven two-digit pairs, one per CPU.
  // NOTE(review): how the pairs map onto NUMA ids 1..7 is not evident from
  // this file -- confirm against the code that generates/matches patterns.
  .pattern = "00102010012010",
  // Two orderings of the 8 GPU indices, separated by '|'
  // (presumably the base rings for this topology -- confirm against the consumer).
  .ringBase = "7 4 6 1 3 0 2 5|6 4 7 1 3 2 5 0",
};

// Rome topology model 44: 8 GPUs, two per CPU NUMA node on 4 nodes, 1 NIC,
// 3 links per GPU. The values agree with
// tools/topo_expl/models/topo_4p3l_n2_1.xml.
static struct rcclRomeModel rome_model_44 = {
  .nGpus  = 8,
  .nCpus  = 4,
  .nNics  = 1,
  .nLinks = 3,
  // GPU ids in device order; each is the PCI bus id with separators dropped
  // (e.g. 0000:63:00.0 -> 0x63000).
  .gpuIds = {
    0x63000, 0x43000, 0x27000, 0x3000,
    0xe3000, 0xc3000, 0xa3000, 0x83000,
  },
  // NUMA node of each GPU, same order as gpuIds.
  .gpuNuma = { 0, 0, 1, 1, 2, 2, 3, 3, },
  // NUMA node of the single NIC.
  .nicNuma = { 2, },
  // 8x8 matrix stored row-major: row i, column j is 1 when GPU i lists GPU j
  // as a link target. GPUs 0-3 are fully connected to each other, as are
  // GPUs 4-7; there are no links between the two halves.
  .connMatrix = {
    0, 1, 1, 1, 0, 0, 0, 0,
    1, 0, 1, 1, 0, 0, 0, 0,
    1, 1, 0, 1, 0, 0, 0, 0,
    1, 1, 1, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 1, 1, 1,
    0, 0, 0, 0, 1, 0, 1, 1,
    0, 0, 0, 0, 1, 1, 0, 1,
    0, 0, 0, 0, 1, 1, 1, 0,
  },
  // Two digits per CPU: GPU count then NIC count (2/0, 2/0, 2/1, 2/0),
  // matching gpuNuma and nicNuma above.
  .pattern = "20202120",
  // Two orderings of the 8 GPU indices, separated by '|'
  // (presumably the base rings for this topology -- confirm against the consumer).
  .ringBase = "4 5 7 6 2 1 3 0|7 4 5 6 2 0 1 3",
};

static struct rcclRomeModel romeTopoModels[] = {
rome_model_22,
rome_model_25,
Expand All @@ -180,4 +210,7 @@ static struct rcclRomeModel romeTopoModels[] = {
rome_model_23,
rome_model_38,
rome_model_28,
rome_model_40,
rome_model_42,
rome_model_44,
};
2 changes: 1 addition & 1 deletion tools/scripts/topo_val.sh
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,7 @@

DIR="$(cd -P "$(dirname "${BASH_SOURCE[0]}")" && pwd)"

for i in {0..38}
for i in {0..44}
do
$DIR/../topo_expl/topo_expl -m $i > "topo_m$i.log"
$DIR/../TopoVisual/topo_visual.sh -i "topo_m$i.log"
Expand Down
87 changes: 87 additions & 0 deletions tools/topo_expl/models/topo_4p3l_n2.xml
Original file line number Diff line number Diff line change
@@ -0,0 +1,87 @@
<system version="2">
<cpu numaid="0" affinity="00000000,00000000,00000000,ffffffff" arch="x86_64" vendor="AuthenticAMD" familyid="143" modelid="49">
<pci busid="0000:41:00.0" class="0x060400" link_speed="16 GT/s" link_width="16">
<pci busid="0000:43:00.0" class="0x038000" link_speed="16 GT/s" link_width="16">
<gpu dev="0" sm="90" gcn="908" arch="38911" rank="0" gdr="1">
<xgmi target="0000:26:00.0" count="1" tclass="0x038000"/>
<xgmi target="0000:03:00.0" count="1" tclass="0x038000"/>
<xgmi target="0000:c6:00.0" count="1" tclass="0x038000"/>
</gpu>
</pci>
</pci>
</cpu>
<cpu numaid="1" affinity="00000000,00000000,ffffffff,00000000" arch="x86_64" vendor="AuthenticAMD" familyid="143" modelid="49">
<pci busid="0000:21:00.0" class="0x060400" link_speed="16 GT/s" link_width="16">
<pci busid="0000:23:00.0" class="0x038000" link_speed="16 GT/s" link_width="16">
<gpu dev="1" sm="90" gcn="908" arch="38911" rank="1" gdr="1">
<xgmi target="0000:c3:00.0" count="1" tclass="0x038000"/>
<xgmi target="0000:a3:00.0" count="1" tclass="0x038000"/>
<xgmi target="0000:83:00.0" count="1" tclass="0x038000"/>
</gpu>
</pci>
</pci>
<pci busid="0000:24:00.0" class="0x060400" link_speed="16 GT/s" link_width="16">
<pci busid="0000:26:00.0" class="0x038000" link_speed="16 GT/s" link_width="16">
<gpu dev="2" sm="90" gcn="908" arch="38911" rank="2" gdr="1">
<xgmi target="0000:43:00.0" count="1" tclass="0x038000"/>
<xgmi target="0000:03:00.0" count="1" tclass="0x038000"/>
<xgmi target="0000:c6:00.0" count="1" tclass="0x038000"/>
</gpu>
</pci>
</pci>
<pci busid="0000:01:00.0" class="0x060400" link_speed="16 GT/s" link_width="16">
<pci busid="0000:03:00.0" class="0x038000" link_speed="16 GT/s" link_width="16">
<gpu dev="3" sm="90" gcn="908" arch="38911" rank="3" gdr="1">
<xgmi target="0000:43:00.0" count="1" tclass="0x038000"/>
<xgmi target="0000:26:00.0" count="1" tclass="0x038000"/>
<xgmi target="0000:c6:00.0" count="1" tclass="0x038000"/>
</gpu>
</pci>
</pci>
</cpu>
<cpu numaid="2" affinity="00000000,ffffffff,00000000,00000000" arch="x86_64" vendor="AuthenticAMD" familyid="143" modelid="49">
<pci busid="0000:c1:00.0" class="0x060400" link_speed="16 GT/s" link_width="16">
<pci busid="0000:c3:00.0" class="0x038000" link_speed="16 GT/s" link_width="16">
<gpu dev="4" sm="90" gcn="908" arch="38911" rank="4" gdr="1">
<xgmi target="0000:23:00.0" count="1" tclass="0x038000"/>
<xgmi target="0000:a3:00.0" count="1" tclass="0x038000"/>
<xgmi target="0000:83:00.0" count="1" tclass="0x038000"/>
</gpu>
</pci>
</pci>
<pci busid="0000:c4:00.0" class="0x060400" link_speed="16 GT/s" link_width="16">
<pci busid="0000:c6:00.0" class="0x038000" link_speed="16 GT/s" link_width="16">
<gpu dev="5" sm="90" gcn="908" arch="38911" rank="5" gdr="1">
<xgmi target="0000:43:00.0" count="1" tclass="0x038000"/>
<xgmi target="0000:26:00.0" count="1" tclass="0x038000"/>
<xgmi target="0000:03:00.0" count="1" tclass="0x038000"/>
</gpu>
</pci>
</pci>
<pci busid="0000:e1:00.0" class="0x020700" link_speed="16 GT/s" link_width="16">
<nic>
<net name="mlx5_0" dev="0" speed="200000" port="1" guid="0x70cd600003da341c" maxconn="262144" gdr="1"/>
</nic>
</pci>
</cpu>
<cpu numaid="3" affinity="ffffffff,00000000,00000000,00000000" arch="x86_64" vendor="AuthenticAMD" familyid="143" modelid="49">
<pci busid="0000:a1:00.0" class="0x060400" link_speed="16 GT/s" link_width="16">
<pci busid="0000:a3:00.0" class="0x038000" link_speed="16 GT/s" link_width="16">
<gpu dev="6" sm="90" gcn="908" arch="38911" rank="6" gdr="1">
<xgmi target="0000:23:00.0" count="1" tclass="0x038000"/>
<xgmi target="0000:c3:00.0" count="1" tclass="0x038000"/>
<xgmi target="0000:83:00.0" count="1" tclass="0x038000"/>
</gpu>
</pci>
</pci>
<pci busid="0000:81:00.0" class="0x060400" link_speed="16 GT/s" link_width="16">
<pci busid="0000:83:00.0" class="0x038000" link_speed="16 GT/s" link_width="16">
<gpu dev="7" sm="90" gcn="908" arch="38911" rank="7" gdr="1">
<xgmi target="0000:23:00.0" count="1" tclass="0x038000"/>
<xgmi target="0000:c3:00.0" count="1" tclass="0x038000"/>
<xgmi target="0000:a3:00.0" count="1" tclass="0x038000"/>
</gpu>
</pci>
</pci>
</cpu>
</system>
87 changes: 87 additions & 0 deletions tools/topo_expl/models/topo_4p3l_n2_1.xml
Original file line number Diff line number Diff line change
@@ -0,0 +1,87 @@
<system version="2">
<cpu numaid="0" affinity="00000000,00000000,00ffffff,00000000,00000000,00ffffff" arch="x86_64" vendor="AuthenticAMD" familyid="143" modelid="49">
<pci busid="0000:61:00.0" class="0x060400" link_speed="16 GT/s" link_width="16">
<pci busid="0000:63:00.0" class="0x038000" link_speed="16 GT/s" link_width="16">
<gpu dev="0" sm="90" gcn="908" arch="38911" rank="0" gdr="1">
<xgmi target="0000:43:00.0" count="1" tclass="0x038000"/>
<xgmi target="0000:27:00.0" count="1" tclass="0x038000"/>
<xgmi target="0000:03:00.0" count="1" tclass="0x038000"/>
</gpu>
</pci>
</pci>
<pci busid="0000:41:00.0" class="0x060400" link_speed="16 GT/s" link_width="16">
<pci busid="0000:43:00.0" class="0x038000" link_speed="16 GT/s" link_width="16">
<gpu dev="1" sm="90" gcn="908" arch="38911" rank="1" gdr="1">
<xgmi target="0000:63:00.0" count="1" tclass="0x038000"/>
<xgmi target="0000:27:00.0" count="1" tclass="0x038000"/>
<xgmi target="0000:03:00.0" count="1" tclass="0x038000"/>
</gpu>
</pci>
</pci>
</cpu>
<cpu numaid="1" affinity="00000000,0000ffff,ff000000,00000000,0000ffff,ff000000" arch="x86_64" vendor="AuthenticAMD" familyid="143" modelid="49">
<pci busid="0000:25:00.0" class="0x060400" link_speed="16 GT/s" link_width="16">
<pci busid="0000:27:00.0" class="0x038000" link_speed="16 GT/s" link_width="16">
<gpu dev="2" sm="90" gcn="908" arch="38911" rank="2" gdr="1">
<xgmi target="0000:63:00.0" count="1" tclass="0x038000"/>
<xgmi target="0000:43:00.0" count="1" tclass="0x038000"/>
<xgmi target="0000:03:00.0" count="1" tclass="0x038000"/>
</gpu>
</pci>
</pci>
<pci busid="0000:01:00.0" class="0x060400" link_speed="16 GT/s" link_width="16">
<pci busid="0000:03:00.0" class="0x038000" link_speed="16 GT/s" link_width="16">
<gpu dev="3" sm="90" gcn="908" arch="38911" rank="3" gdr="1">
<xgmi target="0000:63:00.0" count="1" tclass="0x038000"/>
<xgmi target="0000:43:00.0" count="1" tclass="0x038000"/>
<xgmi target="0000:27:00.0" count="1" tclass="0x038000"/>
</gpu>
</pci>
</pci>
</cpu>
<cpu numaid="2" affinity="000000ff,ffff0000,00000000,000000ff,ffff0000,00000000" arch="x86_64" vendor="AuthenticAMD" familyid="143" modelid="49">
<pci busid="0000:e1:00.0" class="0x060400" link_speed="16 GT/s" link_width="16">
<pci busid="0000:e3:00.0" class="0x038000" link_speed="16 GT/s" link_width="16">
<gpu dev="4" sm="90" gcn="908" arch="38911" rank="4" gdr="1">
<xgmi target="0000:c3:00.0" count="1" tclass="0x038000"/>
<xgmi target="0000:a3:00.0" count="1" tclass="0x038000"/>
<xgmi target="0000:83:00.0" count="1" tclass="0x038000"/>
</gpu>
</pci>
</pci>
<pci busid="0000:c1:00.0" class="0x060400" link_speed="16 GT/s" link_width="16">
<pci busid="0000:c3:00.0" class="0x038000" link_speed="16 GT/s" link_width="16">
<gpu dev="5" sm="90" gcn="908" arch="38911" rank="5" gdr="1">
<xgmi target="0000:e3:00.0" count="1" tclass="0x038000"/>
<xgmi target="0000:a3:00.0" count="1" tclass="0x038000"/>
<xgmi target="0000:83:00.0" count="1" tclass="0x038000"/>
</gpu>
</pci>
</pci>
<pci busid="0000:c4:00.0" class="0x020700" link_speed="16 GT/s" link_width="16">
<nic>
<net name="mlx5_0" dev="0" speed="200000" port="1" guid="0x22fd9f00039b0398" maxconn="262144" gdr="1"/>
</nic>
</pci>
</cpu>
<cpu numaid="3" affinity="ffffff00,00000000,00000000,ffffff00,00000000,00000000" arch="x86_64" vendor="AuthenticAMD" familyid="143" modelid="49">
<pci busid="0000:a1:00.0" class="0x060400" link_speed="16 GT/s" link_width="16">
<pci busid="0000:a3:00.0" class="0x038000" link_speed="16 GT/s" link_width="16">
<gpu dev="6" sm="90" gcn="908" arch="38911" rank="6" gdr="1">
<xgmi target="0000:e3:00.0" count="1" tclass="0x038000"/>
<xgmi target="0000:c3:00.0" count="1" tclass="0x038000"/>
<xgmi target="0000:83:00.0" count="1" tclass="0x038000"/>
</gpu>
</pci>
</pci>
<pci busid="0000:81:00.0" class="0x060400" link_speed="16 GT/s" link_width="16">
<pci busid="0000:83:00.0" class="0x038000" link_speed="16 GT/s" link_width="16">
<gpu dev="7" sm="90" gcn="908" arch="38911" rank="7" gdr="1">
<xgmi target="0000:e3:00.0" count="1" tclass="0x038000"/>
<xgmi target="0000:c3:00.0" count="1" tclass="0x038000"/>
<xgmi target="0000:a3:00.0" count="1" tclass="0x038000"/>
</gpu>
</pci>
</pci>
</cpu>
</system>
93 changes: 93 additions & 0 deletions tools/topo_expl/models/topo_4p3l_n4.xml
Original file line number Diff line number Diff line change
@@ -0,0 +1,93 @@
<system version="2">
<cpu numaid="1" affinity="00000000,00000000,00000000,ffff0000" arch="x86_64" vendor="AuthenticAMD" familyid="143" modelid="49">
<pci busid="0000:41:00.0" class="0x060400" link_speed="16 GT/s" link_width="16">
<pci busid="0000:43:00.0" class="0x038000" link_speed="16 GT/s" link_width="16">
<gpu dev="0" sm="90" gcn="908" arch="38911" rank="0" gdr="1">
<xgmi target="0000:26:00.0" count="1" tclass="0x038000"/>
<xgmi target="0000:03:00.0" count="1" tclass="0x038000"/>
<xgmi target="0000:c6:00.0" count="1" tclass="0x038000"/>
</gpu>
</pci>
</pci>
</cpu>
<cpu numaid="2" affinity="00000000,00000000,0000ffff,00000000" arch="x86_64" vendor="AuthenticAMD" familyid="143" modelid="49">
<pci busid="0000:21:00.0" class="0x060400" link_speed="16 GT/s" link_width="16">
<pci busid="0000:23:00.0" class="0x038000" link_speed="16 GT/s" link_width="16">
<gpu dev="1" sm="90" gcn="908" arch="38911" rank="1" gdr="1">
<xgmi target="0000:c3:00.0" count="1" tclass="0x038000"/>
<xgmi target="0000:a3:00.0" count="1" tclass="0x038000"/>
<xgmi target="0000:83:00.0" count="1" tclass="0x038000"/>
</gpu>
</pci>
</pci>
<pci busid="0000:24:00.0" class="0x060400" link_speed="16 GT/s" link_width="16">
<pci busid="0000:26:00.0" class="0x038000" link_speed="16 GT/s" link_width="16">
<gpu dev="2" sm="90" gcn="908" arch="38911" rank="2" gdr="1">
<xgmi target="0000:43:00.0" count="1" tclass="0x038000"/>
<xgmi target="0000:03:00.0" count="1" tclass="0x038000"/>
<xgmi target="0000:c6:00.0" count="1" tclass="0x038000"/>
</gpu>
</pci>
</pci>
</cpu>
<cpu numaid="3" affinity="00000000,00000000,ffff0000,00000000" arch="x86_64" vendor="AuthenticAMD" familyid="143" modelid="49">
<pci busid="0000:01:00.0" class="0x060400" link_speed="16 GT/s" link_width="16">
<pci busid="0000:03:00.0" class="0x038000" link_speed="16 GT/s" link_width="16">
<gpu dev="3" sm="90" gcn="908" arch="38911" rank="3" gdr="1">
<xgmi target="0000:43:00.0" count="1" tclass="0x038000"/>
<xgmi target="0000:26:00.0" count="1" tclass="0x038000"/>
<xgmi target="0000:c6:00.0" count="1" tclass="0x038000"/>
</gpu>
</pci>
</pci>
</cpu>
<cpu numaid="5" affinity="00000000,ffff0000,00000000,00000000" arch="x86_64" vendor="AuthenticAMD" familyid="143" modelid="49">
<pci busid="0000:c1:00.0" class="0x060400" link_speed="16 GT/s" link_width="16">
<pci busid="0000:c3:00.0" class="0x038000" link_speed="16 GT/s" link_width="16">
<gpu dev="4" sm="90" gcn="908" arch="38911" rank="4" gdr="1">
<xgmi target="0000:23:00.0" count="1" tclass="0x038000"/>
<xgmi target="0000:a3:00.0" count="1" tclass="0x038000"/>
<xgmi target="0000:83:00.0" count="1" tclass="0x038000"/>
</gpu>
</pci>
</pci>
<pci busid="0000:c4:00.0" class="0x060400" link_speed="16 GT/s" link_width="16">
<pci busid="0000:c6:00.0" class="0x038000" link_speed="16 GT/s" link_width="16">
<gpu dev="5" sm="90" gcn="908" arch="38911" rank="5" gdr="1">
<xgmi target="0000:43:00.0" count="1" tclass="0x038000"/>
<xgmi target="0000:26:00.0" count="1" tclass="0x038000"/>
<xgmi target="0000:03:00.0" count="1" tclass="0x038000"/>
</gpu>
</pci>
</pci>
</cpu>
<cpu numaid="6" affinity="0000ffff,00000000,00000000,00000000" arch="x86_64" vendor="AuthenticAMD" familyid="143" modelid="49">
<pci busid="0000:a1:00.0" class="0x060400" link_speed="16 GT/s" link_width="16">
<pci busid="0000:a3:00.0" class="0x038000" link_speed="16 GT/s" link_width="16">
<gpu dev="6" sm="90" gcn="908" arch="38911" rank="6" gdr="1">
<xgmi target="0000:23:00.0" count="1" tclass="0x038000"/>
<xgmi target="0000:c3:00.0" count="1" tclass="0x038000"/>
<xgmi target="0000:83:00.0" count="1" tclass="0x038000"/>
</gpu>
</pci>
</pci>
</cpu>
<cpu numaid="7" affinity="ffff0000,00000000,00000000,00000000" arch="x86_64" vendor="AuthenticAMD" familyid="143" modelid="49">
<pci busid="0000:81:00.0" class="0x060400" link_speed="16 GT/s" link_width="16">
<pci busid="0000:83:00.0" class="0x038000" link_speed="16 GT/s" link_width="16">
<gpu dev="7" sm="90" gcn="908" arch="38911" rank="7" gdr="1">
<xgmi target="0000:23:00.0" count="1" tclass="0x038000"/>
<xgmi target="0000:c3:00.0" count="1" tclass="0x038000"/>
<xgmi target="0000:a3:00.0" count="1" tclass="0x038000"/>
</gpu>
</pci>
</pci>
</cpu>
<cpu numaid="4" affinity="00000000,0000ffff,00000000,00000000" arch="x86_64" vendor="AuthenticAMD" familyid="143" modelid="49">
<pci busid="0000:e1:00.0" class="0x020700" link_speed="16 GT/s" link_width="16">
<nic>
<net name="mlx5_0" dev="0" speed="200000" port="1" guid="0x70cd600003da341c" maxconn="262144" gdr="1"/>
</nic>
</pci>
</cpu>
</system>
6 changes: 6 additions & 0 deletions tools/topo_expl/topo_expl.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -108,6 +108,12 @@ NodeModelDesc model_descs[] = {
{4, "topo_8p_rome_n2_2.xml", "4 nodes 8 VEGA20 Rome NPS=2 Alt. Model 2 NET/IF"},
{4, "topo_8p_ts1_n4_2.xml", "4 nodes 8 VEGA20 TS1 NPS=4 3 NET/IF"},
{1, "topo_8p_rome_n4.xml", "single node 8 VEGA20 Rome NPS=4"},
{1, "topo_4p3l_n2.xml", "single node 8 gfx908 Rome"},
{4, "topo_4p3l_n2.xml", "4 nodes 8 gfx908 Rome"},
{1, "topo_4p3l_n4.xml", "single node 8 gfx908 Rome NPS=4"},
{4, "topo_4p3l_n4.xml", "4 nodes 8 gfx908 Rome NPS=4"},
{1, "topo_4p3l_n2_1.xml", "single node 8 gfx908 Rome"},
{4, "topo_4p3l_n2_1.xml", "4 nodes 8 gfx908 Rome"},
};

int main(int argc,char* argv[])
Expand Down

0 comments on commit e87f28e

Please sign in to comment.