Skip to content

Commit

Permalink
Add NPS4 support on some models (#256)
Browse files Browse the repository at this point in the history
* Add NPS4 support on some models

* Add XML models
  • Loading branch information
wenkaidu committed Aug 19, 2020
1 parent ec9af40 commit 391bbf3
Show file tree
Hide file tree
Showing 5 changed files with 215 additions and 5 deletions.
30 changes: 26 additions & 4 deletions src/graph/search.cc
Original file line number Diff line number Diff line change
Expand Up @@ -882,6 +882,8 @@ static ncclResult_t parseRome4P2H(struct ncclTopoSystem* system, char **str) {
static const char *ringBase_10302120_2 = "6 4 7 5 0 1 3 2|6 5 7 4 2 3 1 0";
static const char *ringBase_11303011_1 = "2 1 0 3 6 7 5 4|7 6 4 5 1 2 3 0";
static const char *ringBase_11303011_2 = "0 6 2 3 1 7 5 4|7 1 4 5 6 0 3 2";
static const char *ringBase_0110201010200110_1 = "1 2 3 0 6 4 5 7|4 6 7 5 2 1 0 3";
static const char *ringBase_0110201010200110_2 = "3 0 6 2 1 4 5 7|4 1 0 3 2 6 7 5";
static const char *ringBase;
static char ringRemap[64];
int id[8], dist[8];
Expand All @@ -891,22 +893,22 @@ static ncclResult_t parseRome4P2H(struct ncclTopoSystem* system, char **str) {
int ngpus = system->nodes[GPU].count;
int ncpus = system->nodes[CPU].count;
// 8 GPUs and 4 or 8 numa nodes only
if (ngpus != 8 || ncpus != 4)
if (ngpus != 8 || (ncpus != 4 && ncpus != 8))
return ncclSuccess;
// only valid on Rome
int arch, vendor, model;
NCCLCHECK(ncclTopoCpuType(system, &arch, &vendor, &model));
if (arch != NCCL_TOPO_CPU_ARCH_X86 || vendor != NCCL_TOPO_CPU_VENDOR_AMD || model != NCCL_TOPO_CPU_TYPE_ROME)
return ncclSuccess;
// number of GPUs and NICs on each numa node is used as first screening pattern
char pattern[9];
for (int i = 0; i < ncpus; i++) {
char pattern[256];
for (i = 0; i < ncpus; i++) {
int g, n;
if (!getGpuNetCount(system, i, &g, &n)) return ncclSuccess;
pattern[i*2] = '0' + g;
pattern[i*2+1] = '0' + n;
}
pattern[8] = 0;
pattern[i*2] = 0;
int g[8], h1[4], h2[4];
for (int i = 0; i <8; i++) g[i] = -1;
if (strcmp(pattern, "10302120") == 0) {
Expand Down Expand Up @@ -961,6 +963,26 @@ static ncclResult_t parseRome4P2H(struct ncclTopoSystem* system, char **str) {
ringBase = ringBase_11303011_1;
}
}
else if (strcmp(pattern, "0110201010200110") == 0) {
if (findGpuByXGMI(system, 2, 5, &g[2], &g[6], 1, -1, -1)) {
if (!findGpuByXGMI(system, 4, 2, &g[4], &g[1], 1, g[6], g[2])) return ncclSuccess;
if (!findGpuByXGMI(system, 1, 3, &g[0], &g[3], 0, -1, -1)) return ncclSuccess;
if (!findGpuByXGMI(system, 7, 5, &g[7], &g[5], 1, -1, -1)) return ncclSuccess;
h1[0] = g[0]; h1[1] = g[3]; h1[2] = g[2]; h1[3] = g[6];
h2[0] = g[1]; h2[1] = g[4]; h2[2] = g[5]; h2[3] = g[7];
ringBase = ringBase_0110201010200110_2;
} else {
if (!findGpuByXGMI(system, 1, 2, &g[0], &g[1], 1, -1, -1)) return ncclSuccess;
if (!findGpuByXGMI(system, 1, 3, &g[0], &g[3], 0, -1, -1)) return ncclSuccess;
if (!findGpuByXGMI(system, 2, 2, &g[1], &g[2], -1, -1, -1)) return ncclSuccess;
if (!findGpuByXGMI(system, 7, 5, &g[7], &g[5], -1, -1, -1)) return ncclSuccess;
if (!findGpuByXGMI(system, 7, 5, &g[7], &g[6], -1, -1, g[5])) return ncclSuccess;
if (!findGpuByXGMI(system, 4, 5, &g[4], &g[5], -1, -1, -1)) return ncclSuccess;
h1[0] = g[0]; h1[1] = g[1]; h1[2] = g[2]; h1[3] = g[3];
h2[0] = g[4]; h2[1] = g[5]; h2[2] = g[7]; h2[3] = g[6];
ringBase = ringBase_0110201010200110_1;
}
}
else
return ncclSuccess;

Expand Down
2 changes: 1 addition & 1 deletion tools/scripts/topo_val.sh
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,7 @@

DIR="$(cd -P "$(dirname "${BASH_SOURCE[0]}")" && pwd)"

for i in {0..29}
for i in {0..33}
do
$DIR/../topo_expl/topo_expl -m $i > "topo_m$i.log"
$DIR/../TopoVisual/topo_visual.sh -i "topo_m$i.log"
Expand Down
92 changes: 92 additions & 0 deletions tools/topo_expl/models/topo_8p_ts1_n4.xml
Original file line number Diff line number Diff line change
@@ -0,0 +1,92 @@
<!--
  Topology model of a single system with 8 GPUs and 2 NICs spread over
  8 NUMA domains (numaid 0..7).
  GPU placement: numaid 1, 3, 4 and 7 hold one GPU each; numaid 2 and 5 hold
  two GPUs each; numaid 0 and 6 hold no GPU, only one NIC each.
  Every GPU declares two XGMI links. Following the link targets, the GPUs form
  two separate 4-GPU rings: dev 0 / 1 / 2 / 3 and dev 4 / 5 / 7 / 6.
-->
<system version="2">
<cpu numaid="1" affinity="00000000,00000000,00000000,ffff0000" arch="x86_64" vendor="AuthenticAMD" familyid="143" modelid="49">
<pci busid="0000:41:00.0" class="0x060400" link_speed="16 GT/s" link_width="16">
<pci busid="0000:43:00.0" class="0x038000" link_speed="16 GT/s" link_width="16">
<gpu dev="0" sm="96" gcn="906" arch="38911" rank="0" gdr="1">
<xgmi target="0000:23:00.0" count="1" tclass="0x038000"/>
<xgmi target="0000:03:00.0" count="1" tclass="0x038000"/>
</gpu>
</pci>
</pci>
</cpu>
<cpu numaid="2" affinity="00000000,00000000,0000ffff,00000000" arch="x86_64" vendor="AuthenticAMD" familyid="143" modelid="49">
<pci busid="0000:21:00.0" class="0x060400" link_speed="16 GT/s" link_width="16">
<pci busid="0000:23:00.0" class="0x038000" link_speed="16 GT/s" link_width="16">
<gpu dev="1" sm="96" gcn="906" arch="38911" rank="1" gdr="1">
<xgmi target="0000:43:00.0" count="1" tclass="0x038000"/>
<xgmi target="0000:26:00.0" count="1" tclass="0x038000"/>
</gpu>
</pci>
</pci>
<pci busid="0000:24:00.0" class="0x060400" link_speed="16 GT/s" link_width="16">
<pci busid="0000:26:00.0" class="0x038000" link_speed="16 GT/s" link_width="16">
<gpu dev="2" sm="96" gcn="906" arch="38911" rank="2" gdr="1">
<xgmi target="0000:23:00.0" count="1" tclass="0x038000"/>
<xgmi target="0000:03:00.0" count="1" tclass="0x038000"/>
</gpu>
</pci>
</pci>
</cpu>
<cpu numaid="3" affinity="00000000,00000000,ffff0000,00000000" arch="x86_64" vendor="AuthenticAMD" familyid="143" modelid="49">
<pci busid="0000:01:00.0" class="0x060400" link_speed="16 GT/s" link_width="16">
<pci busid="0000:03:00.0" class="0x038000" link_speed="16 GT/s" link_width="16">
<gpu dev="3" sm="96" gcn="906" arch="38911" rank="3" gdr="1">
<xgmi target="0000:43:00.0" count="1" tclass="0x038000"/>
<xgmi target="0000:26:00.0" count="1" tclass="0x038000"/>
</gpu>
</pci>
</pci>
</cpu>
<cpu numaid="4" affinity="00000000,0000ffff,00000000,00000000" arch="x86_64" vendor="AuthenticAMD" familyid="143" modelid="49">
<pci busid="0000:e1:00.0" class="0x060400" link_speed="16 GT/s" link_width="16">
<pci busid="0000:e3:00.0" class="0x038000" link_speed="16 GT/s" link_width="16">
<gpu dev="4" sm="96" gcn="906" arch="38911" rank="4" gdr="1">
<xgmi target="0000:c3:00.0" count="1" tclass="0x038000"/>
<xgmi target="0000:c6:00.0" count="1" tclass="0x038000"/>
</gpu>
</pci>
</pci>
</cpu>
<cpu numaid="5" affinity="00000000,ffff0000,00000000,00000000" arch="x86_64" vendor="AuthenticAMD" familyid="143" modelid="49">
<pci busid="0000:c1:00.0" class="0x060400" link_speed="16 GT/s" link_width="16">
<pci busid="0000:c3:00.0" class="0x038000" link_speed="16 GT/s" link_width="16">
<gpu dev="5" sm="96" gcn="906" arch="38911" rank="5" gdr="1">
<xgmi target="0000:e3:00.0" count="1" tclass="0x038000"/>
<xgmi target="0000:83:00.0" count="1" tclass="0x038000"/>
</gpu>
</pci>
</pci>
<pci busid="0000:c4:00.0" class="0x060400" link_speed="16 GT/s" link_width="16">
<pci busid="0000:c6:00.0" class="0x038000" link_speed="16 GT/s" link_width="16">
<gpu dev="6" sm="96" gcn="906" arch="38911" rank="6" gdr="1">
<xgmi target="0000:e3:00.0" count="1" tclass="0x038000"/>
<xgmi target="0000:83:00.0" count="1" tclass="0x038000"/>
</gpu>
</pci>
</pci>
</cpu>
<cpu numaid="7" affinity="ffff0000,00000000,00000000,00000000" arch="x86_64" vendor="AuthenticAMD" familyid="143" modelid="49">
<pci busid="0000:81:00.0" class="0x060400" link_speed="16 GT/s" link_width="16">
<pci busid="0000:83:00.0" class="0x038000" link_speed="16 GT/s" link_width="16">
<gpu dev="7" sm="96" gcn="906" arch="38911" rank="7" gdr="1">
<xgmi target="0000:c3:00.0" count="1" tclass="0x038000"/>
<xgmi target="0000:c6:00.0" count="1" tclass="0x038000"/>
</gpu>
</pci>
</pci>
</cpu>
<!-- NUMA domain with no GPU: hosts only the first NIC (mlx5_0). -->
<cpu numaid="0" affinity="00000000,00000000,00000000,0000ffff" arch="x86_64" vendor="AuthenticAMD" familyid="143" modelid="49">
<pci busid="0000:61:00.0" class="0x020700" link_speed="16 GT/s" link_width="16">
<nic>
<net name="mlx5_0" dev="0" speed="200000" port="1" guid="0x18815600039f59b8" maxconn="262144" gdr="1"/>
</nic>
</pci>
</cpu>
<!-- NUMA domain with no GPU: hosts only the second NIC (mlx5_2). -->
<cpu numaid="6" affinity="0000ffff,00000000,00000000,00000000" arch="x86_64" vendor="AuthenticAMD" familyid="143" modelid="49">
<pci busid="0000:a1:00.0" class="0x020700" link_speed="16 GT/s" link_width="16">
<nic>
<net name="mlx5_2" dev="1" speed="200000" port="1" guid="0x70815600039f59b8" maxconn="262144" gdr="1"/>
</nic>
</pci>
</cpu>
</system>
92 changes: 92 additions & 0 deletions tools/topo_expl/models/topo_8p_ts1_n4_1.xml
Original file line number Diff line number Diff line change
@@ -0,0 +1,92 @@
<!--
  Alternate topology model of a single system with 8 GPUs and 2 NICs spread
  over 8 NUMA domains (numaid 0..7).
  GPU placement: numaid 1, 3, 4 and 7 hold one GPU each; numaid 2 and 5 hold
  two GPUs each; numaid 0 and 6 hold no GPU, only one NIC each.
  Every GPU declares two XGMI links. Following the link targets, the GPUs form
  two separate 4-GPU rings that interleave across the NUMA domains:
  dev 0 / 3 / 2 / 6 and dev 1 / 4 / 5 / 7.
-->
<system version="2">
<cpu numaid="1" affinity="00000000,00000000,00000000,ffff0000" arch="x86_64" vendor="AuthenticAMD" familyid="143" modelid="49">
<pci busid="0000:41:00.0" class="0x060400" link_speed="16 GT/s" link_width="16">
<pci busid="0000:43:00.0" class="0x038000" link_speed="16 GT/s" link_width="16">
<gpu dev="0" sm="96" gcn="906" arch="38911" rank="0" gdr="1">
<xgmi target="0000:03:00.0" count="1" tclass="0x038000"/>
<xgmi target="0000:c6:00.0" count="1" tclass="0x038000"/>
</gpu>
</pci>
</pci>
</cpu>
<cpu numaid="2" affinity="00000000,00000000,0000ffff,00000000" arch="x86_64" vendor="AuthenticAMD" familyid="143" modelid="49">
<pci busid="0000:21:00.0" class="0x060400" link_speed="16 GT/s" link_width="16">
<pci busid="0000:23:00.0" class="0x038000" link_speed="16 GT/s" link_width="16">
<gpu dev="1" sm="96" gcn="906" arch="38911" rank="1" gdr="1">
<xgmi target="0000:e3:00.0" count="1" tclass="0x038000"/>
<xgmi target="0000:83:00.0" count="1" tclass="0x038000"/>
</gpu>
</pci>
</pci>
<pci busid="0000:24:00.0" class="0x060400" link_speed="16 GT/s" link_width="16">
<pci busid="0000:26:00.0" class="0x038000" link_speed="16 GT/s" link_width="16">
<gpu dev="2" sm="96" gcn="906" arch="38911" rank="2" gdr="1">
<xgmi target="0000:03:00.0" count="1" tclass="0x038000"/>
<xgmi target="0000:c6:00.0" count="1" tclass="0x038000"/>
</gpu>
</pci>
</pci>
</cpu>
<cpu numaid="3" affinity="00000000,00000000,ffff0000,00000000" arch="x86_64" vendor="AuthenticAMD" familyid="143" modelid="49">
<pci busid="0000:01:00.0" class="0x060400" link_speed="16 GT/s" link_width="16">
<pci busid="0000:03:00.0" class="0x038000" link_speed="16 GT/s" link_width="16">
<gpu dev="3" sm="96" gcn="906" arch="38911" rank="3" gdr="1">
<xgmi target="0000:43:00.0" count="1" tclass="0x038000"/>
<xgmi target="0000:26:00.0" count="1" tclass="0x038000"/>
</gpu>
</pci>
</pci>
</cpu>
<cpu numaid="4" affinity="00000000,0000ffff,00000000,00000000" arch="x86_64" vendor="AuthenticAMD" familyid="143" modelid="49">
<pci busid="0000:e1:00.0" class="0x060400" link_speed="16 GT/s" link_width="16">
<pci busid="0000:e3:00.0" class="0x038000" link_speed="16 GT/s" link_width="16">
<gpu dev="4" sm="96" gcn="906" arch="38911" rank="4" gdr="1">
<xgmi target="0000:23:00.0" count="1" tclass="0x038000"/>
<xgmi target="0000:c3:00.0" count="1" tclass="0x038000"/>
</gpu>
</pci>
</pci>
</cpu>
<cpu numaid="5" affinity="00000000,ffff0000,00000000,00000000" arch="x86_64" vendor="AuthenticAMD" familyid="143" modelid="49">
<pci busid="0000:c1:00.0" class="0x060400" link_speed="16 GT/s" link_width="16">
<pci busid="0000:c3:00.0" class="0x038000" link_speed="16 GT/s" link_width="16">
<gpu dev="5" sm="96" gcn="906" arch="38911" rank="5" gdr="1">
<xgmi target="0000:e3:00.0" count="1" tclass="0x038000"/>
<xgmi target="0000:83:00.0" count="1" tclass="0x038000"/>
</gpu>
</pci>
</pci>
<pci busid="0000:c4:00.0" class="0x060400" link_speed="16 GT/s" link_width="16">
<pci busid="0000:c6:00.0" class="0x038000" link_speed="16 GT/s" link_width="16">
<gpu dev="6" sm="96" gcn="906" arch="38911" rank="6" gdr="1">
<xgmi target="0000:43:00.0" count="1" tclass="0x038000"/>
<xgmi target="0000:26:00.0" count="1" tclass="0x038000"/>
</gpu>
</pci>
</pci>
</cpu>
<cpu numaid="7" affinity="ffff0000,00000000,00000000,00000000" arch="x86_64" vendor="AuthenticAMD" familyid="143" modelid="49">
<pci busid="0000:81:00.0" class="0x060400" link_speed="16 GT/s" link_width="16">
<pci busid="0000:83:00.0" class="0x038000" link_speed="16 GT/s" link_width="16">
<gpu dev="7" sm="96" gcn="906" arch="38911" rank="7" gdr="1">
<xgmi target="0000:23:00.0" count="1" tclass="0x038000"/>
<xgmi target="0000:c3:00.0" count="1" tclass="0x038000"/>
</gpu>
</pci>
</pci>
</cpu>
<!-- NUMA domain with no GPU: hosts only the first NIC (mlx5_0). -->
<cpu numaid="0" affinity="00000000,00000000,00000000,0000ffff" arch="x86_64" vendor="AuthenticAMD" familyid="143" modelid="49">
<pci busid="0000:61:00.0" class="0x020700" link_speed="16 GT/s" link_width="16">
<nic>
<net name="mlx5_0" dev="0" speed="200000" port="1" guid="0xa8134300039f59b8" maxconn="262144" gdr="1"/>
</nic>
</pci>
</cpu>
<!-- NUMA domain with no GPU: hosts only the second NIC (mlx5_2). -->
<cpu numaid="6" affinity="0000ffff,00000000,00000000,00000000" arch="x86_64" vendor="AuthenticAMD" familyid="143" modelid="49">
<pci busid="0000:a1:00.0" class="0x020700" link_speed="16 GT/s" link_width="16">
<nic>
<net name="mlx5_2" dev="1" speed="200000" port="1" guid="0x38815600039f59b8" maxconn="262144" gdr="1"/>
</nic>
</pci>
</cpu>
</system>
4 changes: 4 additions & 0 deletions tools/topo_expl/topo_expl.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -99,6 +99,10 @@ NodeModelDesc model_descs[] = {
{4, "topo_8p_ts1_1.xml", "4 nodes 8 VEGA20 TS1 Alt. Model"},
{1, "topo_4p3l_2h.xml", "single node 8 gfx908 Rome"},
{4, "topo_4p3l_2h.xml", "4 nodes 8 gfx908 Rome"},
{1, "topo_8p_ts1_n4.xml", "single node 8 VEGA20 TS1 NPS=4"},
{4, "topo_8p_ts1_n4.xml", "4 nodes 8 VEGA20 TS1 NPS=4"},
{1, "topo_8p_ts1_n4_1.xml", "single node 8 VEGA20 TS1 NPS=4 Alt. Model"},
{4, "topo_8p_ts1_n4_1.xml", "4 nodes 8 VEGA20 TS1 NPS=4 Alt. Model"},
};

int main(int argc,char* argv[])
Expand Down

0 comments on commit 391bbf3

Please sign in to comment.