Skip to content

Commit 2733e93

Browse files
ericjbohmEric Bohmebohm
authored
bugfix: improve CXI support for ALCF Aurora configuration (#3855)
* bugfix: improve CXI support for ALCF Aurora configuration * !fixup: clean up logic for cxi defaults and clarifying whitespace --------- Co-authored-by: Eric Bohm <bohm@aurora-uan-0009.hostmgmt.cm.aurora.alcf.anl.gov> Co-authored-by: Eric Bohm <ebohm@illinois.edu>
1 parent 0880840 commit 2733e93

File tree

1 file changed

+21
-7
lines changed

1 file changed

+21
-7
lines changed

src/arch/ofi/machine.C

Lines changed: 21 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -696,6 +696,7 @@ void LrtsInit(int *argc, char ***argv, int *numNodes, int *myNodeID)
696696
* should not be considered predictive of proximity. That
697697
* relationship has to be detected by other means.
698698
699+
699700
* 2. HWLOC doesn't have a hwloc_get_closest_nic because... NIC
700701
* doesn't even rate an object type in their ontology, let
701702
* alone get first class treatment. Given that PCI devices
@@ -714,7 +715,7 @@ void LrtsInit(int *argc, char ***argv, int *numNodes, int *myNodeID)
714715
* do *not* have such convenient labeling as something special
715716
* needs to happen to get their linuxfs utilities to inject
716717
* that derived information into your topology object. As an
717-
* interim solution we allow the user to map their cxi[0..3]
718+
* interim solution we allow the user to map their cxi[0..7]
718719
* selection using command line arguments.
719720
720721
* 2b. Likewise the 1:1 relationship we assume here between
@@ -741,6 +742,8 @@ void LrtsInit(int *argc, char ***argv, int *numNodes, int *myNodeID)
741742
* CPU nodes. The user could easily be confused, so we can't
742743
* rely on them telling us. This has to be determined at
743744
* run time.
745+
746+
* 6. Aurora nodes can apparently expose up to eight CXI devices (cxi0..cxi7).
744747
*/
745748

746749
char *cximap=NULL;
@@ -812,23 +815,34 @@ void LrtsInit(int *argc, char ***argv, int *numNodes, int *myNodeID)
812815

813816

814817
/// short hsnOrder[numcxi]={2,1,3,0};
815-
if(numcxi==4)
818+
if(numcxi == 4)
816819
{
817820
short hsnOrder[4]= {1,3,0,2};
818-
if(myRank%quad>numcxi)
821+
if(myRank % quad > numcxi)
822+
{
823+
CmiPrintf("Error: myrank %d quad %d myrank/quad %n",myRank,quad, myRank/quad);
824+
CmiAbort("cxi mapping failure");
825+
}
826+
myNet = hsnOrder[myRank % quad];
827+
}
828+
else if(numcxi == 8)
829+
{
830+
// this appears to be a good ordering on aurora
831+
short hsnOrder[8]= {0,1,2,3,4,5,6,7};
832+
if(myRank % quad > numcxi)
819833
{
820834
CmiPrintf("Error: myrank %d quad %d myrank/quad %n",myRank,quad, myRank/quad);
821835
CmiAbort("cxi mapping failure");
822836
}
823-
myNet=hsnOrder[myRank%quad];
837+
myNet = hsnOrder[myRank % quad];
824838
}
825839
else
826840
{
827-
CmiAssert(numcxi==1);
828-
//theoretically there are cases other than 4 and 1, but
841+
CmiAssert(numcxi == 1);
842+
//theoretically there are cases other than 8, 4 and 1, but
829843
//until someone sights such an incrayptid on a machine floor,
830844
//we're just going to assume they don't exist.
831-
myNet=0;
845+
myNet = 0;
832846
}
833847
}
834848
snprintf(myDomainName,5, "cxi%d", myNet);

0 commit comments

Comments
 (0)