Skip to content

Commit 05be2ee

Browse files
committed
Determine MPI Data Types in col_on_comm() & dst_on_comm() to prevent displacements overflow.
The MPI_Gatherv() and MPI_Scatterv() operations require integer displacements into the communications buffers. Historically everything is passed as an MPI_CHAR, causing these displacements to be larger than otherwise necessary. For large domain sizes this can cause the displace[] offsets to exceed the maximum int, wrapping to negative values. This change introduces additional error checking and then uses the function MPI_Type_match_size() (available since MPI-2.0) to determine a suitable MPI_Datatype given the input *typesize. The result then is that the displace[] offsets are in terms of data type extents, rather than bytes, and less likely to overflow.
1 parent 13bbd1c commit 05be2ee

File tree

1 file changed

+55
-16
lines changed

1 file changed

+55
-16
lines changed

frame/collect_on_comm.c

Lines changed: 55 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -69,6 +69,7 @@ col_on_comm ( int * Fcomm, int * typesize ,
6969
int root_task ;
7070
MPI_Comm *comm, dummy_comm ;
7171
int ierr ;
72+
int dtype = -1;
7273

7374
comm = &dummy_comm ;
7475
*comm = MPI_Comm_f2c( *Fcomm ) ;
@@ -90,6 +91,18 @@ col_on_comm ( int * Fcomm, int * typesize ,
9091
for ( p = 1 , displace[0] = 0 , noutbuf_loc = recvcounts[0] ; p < ntasks ; p++ ) {
9192
displace[p] = displace[p-1]+recvcounts[p-1] ;
9293
noutbuf_loc = noutbuf_loc + recvcounts[p] ;
94+
95+
/* check for overflow: displace is the partial sum of sendcounts, which can overflow for large problems. */
96+
/* it is sufficient to check displace[p] since it is greater than sendcounts[p] */
97+
if ( (displace[p] < 0) || (noutbuf_loc < 0) )
98+
{
99+
#ifndef MS_SUA
100+
fprintf(stderr,"%s %d buffer offset overflow!!\n",__FILE__,__LINE__) ;
101+
fprintf(stderr," ---> p = %d,\n ---> displace[%d] = %d,\n ---> noutbuf_loc = %d,\n ---> typesize = %d\n",
102+
p, p, displace[p], noutbuf_loc, *typesize);
103+
#endif
104+
MPI_Abort(MPI_COMM_WORLD,1) ;
105+
}
93106
}
94107

95108
if ( noutbuf_loc > * noutbuf )
@@ -101,17 +114,23 @@ col_on_comm ( int * Fcomm, int * typesize ,
101114
#endif
102115
MPI_Abort(MPI_COMM_WORLD,1) ;
103116
}
117+
}
104118

105-
/* multiply everything by the size of the type */
106-
for ( p = 0 ; p < ntasks ; p++ ) {
107-
displace[p] *= *typesize ;
108-
recvcounts[p] *= *typesize ;
109-
}
119+
/* handle different sized data types appropriately. */
120+
ierr = MPI_Type_match_size (MPI_TYPECLASS_REAL, *typesize, &dtype);
121+
if (MPI_SUCCESS != ierr) {
122+
ierr = MPI_Type_match_size (MPI_TYPECLASS_INTEGER, *typesize, &dtype);
123+
}
124+
if (MPI_SUCCESS != ierr) {
125+
#ifndef MS_SUA
126+
fprintf(stderr,"%s %d FATAL ERROR: unhandled typesize = %d!!\n", __FILE__,__LINE__,*typesize) ;
127+
#endif
128+
MPI_Abort(MPI_COMM_WORLD,1) ;
110129
}
111130

112-
ierr = MPI_Gatherv( inbuf , *ninbuf * *typesize , MPI_CHAR ,
113-
outbuf , recvcounts , displace, MPI_CHAR ,
114-
root_task , *comm ) ;
131+
ierr = MPI_Gatherv( inbuf, *ninbuf, dtype,
132+
outbuf, recvcounts, displace, dtype,
133+
root_task, *comm ) ;
115134
#ifndef MS_SUA
116135
if ( ierr != 0 ) fprintf(stderr,"%s %d MPI_Gatherv returns %d\n",__FILE__,__LINE__,ierr ) ;
117136
#endif
@@ -152,6 +171,8 @@ dst_on_comm ( int * Fcomm, int * typesize ,
152171
int *displace ;
153172
int noutbuf_loc ;
154173
int root_task ;
174+
int dtype = -1;
175+
int ierr = -1;
155176
MPI_Comm *comm, dummy_comm ;
156177

157178
comm = &dummy_comm ;
@@ -171,18 +192,36 @@ dst_on_comm ( int * Fcomm, int * typesize ,
171192
for ( p = 1 , displace[0] = 0 , noutbuf_loc = sendcounts[0] ; p < ntasks ; p++ ) {
172193
displace[p] = displace[p-1]+sendcounts[p-1] ;
173194
noutbuf_loc = noutbuf_loc + sendcounts[p] ;
174-
}
175195

176-
/* multiply everything by the size of the type */
177-
for ( p = 0 ; p < ntasks ; p++ ) {
178-
displace[p] *= *typesize ;
179-
sendcounts[p] *= *typesize ;
196+
/* check for overflow: displace is the partial sum of sendcounts, which can overflow for large problems. */
197+
/* it is sufficient to check displace[p] since it is greater than sendcounts[p] */
198+
if ( (displace[p] < 0) || (noutbuf_loc < 0) )
199+
{
200+
#ifndef MS_SUA
201+
fprintf(stderr,"%s %d buffer offset overflow!!\n",__FILE__,__LINE__) ;
202+
fprintf(stderr," ---> p = %d,\n ---> displace[%d] = %d,\n ---> noutbuf_loc = %d,\n ---> typesize = %d\n",
203+
p, p, displace[p], noutbuf_loc, *typesize);
204+
#endif
205+
MPI_Abort(MPI_COMM_WORLD,1) ;
206+
}
180207
}
181208
}
182209

183-
MPI_Scatterv( inbuf , sendcounts , displace, MPI_CHAR ,
184-
outbuf , *noutbuf * *typesize , MPI_CHAR ,
185-
root_task , *comm ) ;
210+
/* handle different sized data types appropriately. */
211+
ierr = MPI_Type_match_size (MPI_TYPECLASS_REAL, *typesize, &dtype);
212+
if (MPI_SUCCESS != ierr) {
213+
ierr = MPI_Type_match_size (MPI_TYPECLASS_INTEGER, *typesize, &dtype);
214+
}
215+
if (MPI_SUCCESS != ierr) {
216+
#ifndef MS_SUA
217+
fprintf(stderr,"%s %d FATAL ERROR: unhandled typesize = %d!!\n", __FILE__,__LINE__,*typesize) ;
218+
#endif
219+
MPI_Abort(MPI_COMM_WORLD,1) ;
220+
}
221+
222+
MPI_Scatterv( inbuf, sendcounts, displace, dtype,
223+
outbuf, *noutbuf, dtype,
224+
root_task, *comm ) ;
186225

187226
free(sendcounts) ;
188227
free(displace) ;

0 commit comments

Comments
 (0)