Skip to content

Commit

Permalink
Only detect overflow in col_on_comm(), report, and exit.
Browse files Browse the repository at this point in the history
Fixes runtime failures caught by CI in in the underlying MPI_Gatherv().
  • Loading branch information
benkirk committed Jan 18, 2025
1 parent 05be2ee commit 13b94c2
Showing 1 changed file with 31 additions and 37 deletions.
68 changes: 31 additions & 37 deletions frame/collect_on_comm.c
Original file line number Diff line number Diff line change
Expand Up @@ -69,7 +69,6 @@ col_on_comm ( int * Fcomm, int * typesize ,
int root_task ;
MPI_Comm *comm, dummy_comm ;
int ierr ;
int dtype = -1;

comm = &dummy_comm ;
*comm = MPI_Comm_f2c( *Fcomm ) ;
Expand All @@ -91,18 +90,6 @@ col_on_comm ( int * Fcomm, int * typesize ,
for ( p = 1 , displace[0] = 0 , noutbuf_loc = recvcounts[0] ; p < ntasks ; p++ ) {
displace[p] = displace[p-1]+recvcounts[p-1] ;
noutbuf_loc = noutbuf_loc + recvcounts[p] ;

/* check for overflow: displace is the partial sum of sendcounts, which can overflow for large problems. */
/* it is sufficient to check displace[p] since it is greater than sendcounts[p] */
if ( (displace[p] < 0) || (noutbuf_loc < 0) )
{
#ifndef MS_SUA
fprintf(stderr,"%s %d buffer offset overflow!!\n",__FILE__,__LINE__) ;
fprintf(stderr," ---> p = %d,\n ---> displace[%d] = %d,\n ---> noutbuf_loc = %d,\n ---> typesize = %d\n",
p, p, displace[p], noutbuf_loc, *typesize);
#endif
MPI_Abort(MPI_COMM_WORLD,1) ;
}
}

if ( noutbuf_loc > * noutbuf )
Expand All @@ -114,23 +101,27 @@ col_on_comm ( int * Fcomm, int * typesize ,
#endif
MPI_Abort(MPI_COMM_WORLD,1) ;
}
}

/* handle different sized data types appropriately. */
ierr = MPI_Type_match_size (MPI_TYPECLASS_REAL, *typesize, &dtype);
if (MPI_SUCCESS != ierr) {
ierr = MPI_Type_match_size (MPI_TYPECLASS_INTEGER, *typesize, &dtype);
}
if (MPI_SUCCESS != ierr) {
/* multiply everything by the size of the type */
for ( p = 0 ; p < ntasks ; p++ ) {
displace[p] *= *typesize ;
recvcounts[p] *= *typesize ;

/* check for overflow: displace is the partial sum of recvcounts, which can overflow for large problems. */
if (displace[p] < 0) {
#ifndef MS_SUA
fprintf(stderr,"%s %d FATAL ERROR: unhandled typesize = %d!!\n", __FILE__,__LINE__,*typesize) ;
fprintf(stderr,"%s %d buffer offset overflow!!\n",__FILE__,__LINE__) ;
fprintf(stderr," ---> p = %d,\n ---> displace[%d] = %d,\n ---> typesize = %d\n",
p, p, displace[p], *typesize);
#endif
MPI_Abort(MPI_COMM_WORLD,1) ;
MPI_Abort(MPI_COMM_WORLD,1) ;
}
}
}

ierr = MPI_Gatherv( inbuf, *ninbuf, dtype,
outbuf, recvcounts, displace, dtype,
root_task, *comm ) ;
ierr = MPI_Gatherv( inbuf , *ninbuf * *typesize , MPI_CHAR ,
outbuf , recvcounts , displace, MPI_CHAR ,
root_task , *comm ) ;
#ifndef MS_SUA
if ( ierr != 0 ) fprintf(stderr,"%s %d MPI_Gatherv returns %d\n",__FILE__,__LINE__,ierr ) ;
#endif
Expand Down Expand Up @@ -194,29 +185,32 @@ dst_on_comm ( int * Fcomm, int * typesize ,
noutbuf_loc = noutbuf_loc + sendcounts[p] ;

/* check for overflow: displace is the partial sum of sendcounts, which can overflow for large problems. */
/* it is sufficient to check displace[p] since it is greater than sendcounts[p] */
if ( (displace[p] < 0) || (noutbuf_loc < 0) )
{
if ( (displace[p] < 0) || (noutbuf_loc < 0) ) {
#ifndef MS_SUA
fprintf(stderr,"%s %d buffer offset overflow!!\n",__FILE__,__LINE__) ;
fprintf(stderr," ---> p = %d,\n ---> displace[%d] = %d,\n ---> noutbuf_loc = %d,\n ---> typesize = %d\n",
p, p, displace[p], noutbuf_loc, *typesize);
fprintf(stderr,"%s %d buffer offset overflow!!\n",__FILE__,__LINE__) ;
fprintf(stderr," ---> p = %d,\n ---> displace[%d] = %d,\n ---> noutbuf_loc = %d,\n ---> typesize = %d\n",
p, p, displace[p], noutbuf_loc, *typesize);
#endif
MPI_Abort(MPI_COMM_WORLD,1) ;
}
MPI_Abort(MPI_COMM_WORLD,1) ;
}
}
}

/* handle different sized data types appropriately. */
ierr = MPI_Type_match_size (MPI_TYPECLASS_REAL, *typesize, &dtype);
if (MPI_SUCCESS != ierr) {
ierr = MPI_Type_match_size (MPI_TYPECLASS_INTEGER, *typesize, &dtype);
}
if (MPI_SUCCESS != ierr) {
if (MPI_SUCCESS != ierr) {
if (1 == *typesize) {
dtype = MPI_CHAR;
}
else {
#ifndef MS_SUA
fprintf(stderr,"%s %d FATAL ERROR: unhandled typesize = %d!!\n", __FILE__,__LINE__,*typesize) ;
fprintf(stderr,"%s %d FATAL ERROR: unhandled typesize = %d!!\n", __FILE__,__LINE__,*typesize) ;
#endif
MPI_Abort(MPI_COMM_WORLD,1) ;
MPI_Abort(MPI_COMM_WORLD,1) ;
}
}
}

MPI_Scatterv( inbuf, sendcounts, displace, dtype,
Expand Down

0 comments on commit 13b94c2

Please sign in to comment.