@@ -6959,11 +6959,123 @@ int VMK::alltoall(void *in, int inCount, void *out, int outCount,
      &req);
  MPI_Wait(&req, MPI_STATUS_IGNORE);
#endif
- #if 1
+ #if 0
  for (int i=0; i<npets; i++){
    MPI_Scatter(in, inCount, mpitype, out+outCount*i*size, outCount, mpitype,
      i, mpi_c);
  }
+ #endif
+ #if 1
+   {
+     // Hierarchical Alltoall implementation, with the SSI roots as
+     // intermediaries.
+     // Step-0: SSI-local exchange via alltoallv, placing data for
+     // SSI-local peers directly at its final offsets.
+     std::vector<int> inCounts(ssiLocalPetCount, inCount);
+     std::vector<int> outCounts(ssiLocalPetCount, outCount);
+     std::vector<int> inOffsets(ssiLocalPetCount);
+     std::vector<int> outOffsets(ssiLocalPetCount);
+     std::set<int> ssiLocalPetSet;
+     for (int i=0; i<ssiLocalPetCount; i++){
+       ssiLocalPetSet.insert(ssiLocalPetList[i]);
+       inOffsets[i] = ssiLocalPetList[i] * inCount;
+       outOffsets[i] = ssiLocalPetList[i] * outCount;
+     }
+     MPI_Alltoallv(in, &(inCounts[0]), &(inOffsets[0]), mpitype,
+       out, &(outCounts[0]), &(outOffsets[0]), mpitype, mpi_c_ssi);
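+     // Data for SSI-local peers is now in place in "out"; only data for
+     // PETs on other SSIs remains to be moved.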
+     // Only go up the hierarchy if there is more than one SSI in the VM
+     if (ssiCount > 1){
+       // Multiple SSIs in the VM; within each mpi_c_ssi communicator,
+       // task 0 serves as the SSI root PET below.
+       // Step-1: SSI root PETs gather xfer data on their SSI.
+       // - Each PET prepares an xferBuffer to send its "in" data that is
+       //   destined for PETs outside the local SSI to its SSI root PET.
+       std::vector<char> xferBuffer((npets-ssiLocalPetCount)*inCount*size);
+       char *xferBC = (char *)&(xferBuffer[0]);
+       char *inC = (char *)in;
+       int j=0;
+       for (int i=0; i<npets; i++){
+         if (ssiLocalPetSet.find(i) != ssiLocalPetSet.end()) continue; // skip
+         memcpy(xferBC+outCount*size*j, inC+inCount*size*i, inCount*size);
+         ++j;
+       }
+       // - SSI roots gather the xfer data from their SSI PETs that is
+       //   headed for the other SSIs.
+       char *xferSsiBC = NULL;
+       std::vector<char> xferSsiBuffer;
+       if (mpi_c_ssi_roots != MPI_COMM_NULL){
+         xferSsiBuffer.resize((npets-ssiLocalPetCount)
+           *inCount*size*ssiLocalPetCount);
+         xferSsiBC = (char *)&(xferSsiBuffer[0]);
+       }
+       MPI_Gather(xferBC, (npets-ssiLocalPetCount)*inCount, mpitype,
+         xferSsiBC, (npets-ssiLocalPetCount)*outCount, mpitype, 0, mpi_c_ssi);
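+       // On the SSI root, xferSsiBuffer now holds one such block per
+       // SSI-local PET, in SSI-local rank order; matching send and
+       // receive counts here relies on inCount == outCount.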
+       // Total exchange between SSI roots
+       if (mpi_c_ssi_roots != MPI_COMM_NULL){
+         // - SSI roots exchange their ssiLocalPetCount
+         std::vector<int> ssiLocalPetCounts(ssiCount);
+         MPI_Allgather(&ssiLocalPetCount, 1, MPI_INT,
+           &(ssiLocalPetCounts[0]), 1, MPI_INT, mpi_c_ssi_roots);
+         // - Construct offsets array to match the received ssiLocalPetCounts
+         std::vector<int> offsets(ssiCount);
+         offsets[0] = 0;
+         for (int i=1; i<ssiCount; i++)
+           offsets[i] = offsets[i-1] + ssiLocalPetCounts[i-1];
+         // - SSI roots exchange their ssiLocalPetList information
+         std::vector<int> ssiLocalPetLists(npets);
+         MPI_Allgatherv(ssiLocalPetList, ssiLocalPetCount, MPI_INT,
+           &(ssiLocalPetLists[0]), &(ssiLocalPetCounts[0]), &(offsets[0]),
+           MPI_INT, mpi_c_ssi_roots);
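+         // Every root now knows the global PET ids of SSI i's members:
+         // ssiLocalPetLists[offsets[i]] .. [offsets[i]+ssiLocalPetCounts[i]-1]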
+         // - SSI roots keep track of all the PETs in the other SSIs
+         std::vector<std::set<int> > ssiLocalPetSets(ssiCount);
+         j=0;
+         for (int i=0; i<ssiCount; i++){
+           for (int k=0; k<ssiLocalPetCounts[i]; k++){
+             ssiLocalPetSets[i].insert(ssiLocalPetLists[j]);
+             ++j;
+           }
+         }
+         // - SSI roots collate data into SSI blocks for sending
+         std::vector<char> xferSsiSendBuffer((npets-ssiLocalPetCount)
+           *inCount*size*ssiLocalPetCount);
+         char *xferSsiSBC = (char *)&(xferSsiSendBuffer[0]);
+         int localSsi; // rank of local SSI's root, same as SSI index
+         MPI_Comm_rank(mpi_c_ssi_roots, &localSsi);
+         std::vector<int> xferInCounts(ssiCount);
+         std::vector<int> xferInOffsets(ssiCount);
+         std::vector<int> xferOutCounts(ssiCount);
+         std::vector<int> xferOutOffsets(ssiCount);
+         xferInOffsets[0] = xferOutOffsets[0] = 0;
+         j=0;
+         for (int i=0; i<ssiCount; i++){
+           // prep the data block to send to the PETs in SSI i
+           if (i > 0){
+             xferInOffsets[i] = xferInOffsets[i-1] + xferInCounts[i-1];
+             xferOutOffsets[i] = xferOutOffsets[i-1] + xferOutCounts[i-1];
+           }
+           if (i == localSsi){
+             xferInCounts[i] = xferOutCounts[i] = 0;
+             continue; // no self communication to the local SSI
+           }else{
+             xferInCounts[i] = ssiLocalPetCounts[i]*ssiLocalPetCount*inCount;
+             xferOutCounts[i] = ssiLocalPetCounts[i]*ssiLocalPetCount*outCount;
+           }
+           for (int k=0; k<ssiLocalPetCounts[i]; k++){
+             // prepare the block going to SSI-local PET k on SSI i
+             for (int l=0; l<ssiLocalPetCount; l++){
+               // block from SSI-local source PET l to PET k on SSI i
+               memcpy(xferSsiSBC+inCount*size*j, xferSsiBC
+                 +inCount*size*(npets-ssiLocalPetCount)*l
+                 +inCount*size*ssiLocalPetLists[offsets[i]+k],
+                 inCount*size);
+               ++j;
+             }
+           }
+         }
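+         // xferSsiSendBuffer layout: for each remote SSI i, one block per
+         // destination PET, each holding the messages from all
+         // ssiLocalPetCount local source PETs.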
+         // - Alltoallv xferSsiSendBuffer -> xferSsiBuffer
+         MPI_Alltoallv(xferSsiSBC, &(xferInCounts[0]), &(xferInOffsets[0]),
+           mpitype, xferSsiBC, &(xferOutCounts[0]), &(xferOutOffsets[0]),
+           mpitype, mpi_c_ssi_roots);
+       }
+     }
+   }
#endif
  }else{
    // This is a very simplistic implementation, with probably very bad
    // performance.
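The block added above is a node-aware alltoall: PETs first exchange within their shared-memory node (the SSI), inter-node data is funneled through one root PET per SSI, and the roots then exchange aggregated blocks. The sketch below shows the same pattern as a self-contained program, independent of the VMK code. For brevity it assumes one MPI_INT per source/destination pair, equal-sized nodes, and global ranks numbered consecutively within each node; nodeComm, rootComm, and all other names are illustrative, not ESMF API.

#include <mpi.h>
#include <vector>
#include <cstdio>

int main(int argc, char **argv){
  MPI_Init(&argc, &argv);
  int rank, npets;
  MPI_Comm_rank(MPI_COMM_WORLD, &rank);
  MPI_Comm_size(MPI_COMM_WORLD, &npets);

  // Each PET sends one distinct int to every PET
  std::vector<int> in(npets), out(npets);
  for (int i=0; i<npets; i++) in[i] = rank*npets + i;

  // Node-local communicator (the role mpi_c_ssi plays in VMK)
  MPI_Comm nodeComm;
  MPI_Comm_split_type(MPI_COMM_WORLD, MPI_COMM_TYPE_SHARED, 0,
                      MPI_INFO_NULL, &nodeComm);
  int nodeRank, nodeSize;
  MPI_Comm_rank(nodeComm, &nodeRank);
  MPI_Comm_size(nodeComm, &nodeSize);
  int nodeCount = npets / nodeSize;  // assumes equal-sized nodes

  // One root per node (the role mpi_c_ssi_roots plays in VMK)
  MPI_Comm rootComm;
  MPI_Comm_split(MPI_COMM_WORLD, nodeRank==0 ? 0 : MPI_UNDEFINED,
                 rank, &rootComm);

  // Step 1: funnel every PET's full send vector to its node root
  std::vector<int> gathered(nodeRank==0 ? npets*nodeSize : 0);
  MPI_Gather(in.data(), npets, MPI_INT,
             gathered.data(), npets, MPI_INT, 0, nodeComm);

  std::vector<int> scatterBuf(nodeRank==0 ? npets*nodeSize : 0);
  if (nodeRank == 0){
    // Step 2: pack per-destination-node blocks, then exchange among roots
    std::vector<int> sendBuf(npets*nodeSize), recvBuf(npets*nodeSize);
    for (int d=0; d<nodeCount; d++)        // destination node
      for (int s=0; s<nodeSize; s++)       // node-local source PET
        for (int k=0; k<nodeSize; k++)     // destination PET within node d
          sendBuf[(d*nodeSize + s)*nodeSize + k] =
            gathered[s*npets + d*nodeSize + k];
    MPI_Alltoall(sendBuf.data(), nodeSize*nodeSize, MPI_INT,
                 recvBuf.data(), nodeSize*nodeSize, MPI_INT, rootComm);
    // Step 3: reorder received blocks into one contiguous slice per
    // node-local destination PET
    for (int k=0; k<nodeSize; k++)         // node-local destination PET
      for (int g=0; g<npets; g++)          // global source PET
        scatterBuf[k*npets + g] =
          recvBuf[(g/nodeSize)*nodeSize*nodeSize + (g%nodeSize)*nodeSize + k];
  }
  // Step 4: node roots scatter the final slices back down
  MPI_Scatter(scatterBuf.data(), npets, MPI_INT,
              out.data(), npets, MPI_INT, 0, nodeComm);

  // out[g] now holds what global PET g sent to this PET
  if (out[0] != rank) printf("PET %d: unexpected data %d\n", rank, out[0]);

  if (rootComm != MPI_COMM_NULL) MPI_Comm_free(&rootComm);
  MPI_Comm_free(&nodeComm);
  MPI_Finalize();
  return 0;
}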
@@ -7043,7 +7155,7 @@ int VMK::alltoallv(void *in, int *inCounts, int *inOffsets, void *out,
      mpitype = MPI_LOGICAL;
      break;
  }
- #if 0
+ #if 1
  localrc = MPI_Alltoallv(in, inCounts, inOffsets, mpitype, out, outCounts,
    outOffsets, mpitype, mpi_c);
#else
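This second hunk switches VMK::alltoallv over to the native MPI_Alltoallv path; since the arguments are passed straight through, the counts and offsets are per-PET element counts and displacements in units of mpitype, not bytes. A standalone toy call with that convention (illustrative only, not VMK code):

#include <mpi.h>
#include <vector>

int main(int argc, char **argv){
  MPI_Init(&argc, &argv);
  int rank, npets;
  MPI_Comm_rank(MPI_COMM_WORLD, &rank);
  MPI_Comm_size(MPI_COMM_WORLD, &npets);

  // PET r sends (r+1) ints to every PET, so PET p receives (i+1) ints
  // from each source PET i
  std::vector<int> sendCounts(npets, rank+1), recvCounts(npets);
  for (int i=0; i<npets; i++) recvCounts[i] = i+1;

  // Displacements are running sums of the counts, in elements
  std::vector<int> sendOffs(npets, 0), recvOffs(npets, 0);
  for (int i=1; i<npets; i++){
    sendOffs[i] = sendOffs[i-1] + sendCounts[i-1];
    recvOffs[i] = recvOffs[i-1] + recvCounts[i-1];
  }

  std::vector<int> in(sendOffs[npets-1] + sendCounts[npets-1], rank);
  std::vector<int> out(recvOffs[npets-1] + recvCounts[npets-1]);

  MPI_Alltoallv(in.data(), sendCounts.data(), sendOffs.data(), MPI_INT,
                out.data(), recvCounts.data(), recvOffs.data(), MPI_INT,
                MPI_COMM_WORLD);
  // out now holds, for each source PET i, (i+1) copies of the value i,
  // in rank order
  MPI_Finalize();
  return 0;
}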