bt.f90 (465 lines, 396 loc, 21.3 KB)
!
! Copyright (C) 2013, Northwestern University
! See COPYRIGHT notice in top-level directory.
!
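!
! BT-IO benchmark: measures write or read performance of the BT
! (Block Tridiagonal) solution arrays through MPI-IO or Parallel
! netCDF (PnetCDF).
!
! A minimal run sketch (executable name and paths are hypothetical):
!   mpiexec -n 16 ./btio inputbt.data
! The number of MPI processes must be a square number; the input
! file name argument is optional and defaults to 'inputbt.data'.
!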
!----< main >----------------------------------------------------
program main
use mpi
use header
use mpiio_m
use pnetcdf_m
implicit none
character io_mode
character(LEN=128) filename, cmd
integer i, err, argc, iargc, fstatus, io_method
integer*8 n3
integer striping_factor, striping_unit
double precision navg, t, t1, t2, t3, t4, tmax, t_total
integer(kind=MPI_OFFSET_KIND) malloc_size, sum_size
call MPI_Init(err)
call MPI_Comm_size(MPI_COMM_WORLD, nprocs, err)
call MPI_Comm_rank(MPI_COMM_WORLD, rank, err)
! check if number of processes is a square number
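! (dsqrt of a perfect square may come back just below the exact root,
! so a small epsilon is added to nprocs before the root is truncated
! to an integer; e.g. nprocs = 16 gives ncells = 4)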
ncells = dint(dsqrt(dble(nprocs) + 0.00001d0))
if (nprocs .NE. ncells*ncells) then
print*, 'Number of processes must be a square number. exit ...'
goto 999
endif
root = 0
!---------------------------------------------------------------------
! Root reads input file (if it exists) else takes
! defaults from parameters
!---------------------------------------------------------------------
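! The input file holds five records, read in this order (the sample
! values below are illustrative only, not defaults):
!   w               <- io_mode: 'w' to write, 'r' to read
!   2               <- io_method (see the encoding noted before the I/O setup below)
!   40              <- niter: number of I/O iterations
!   512 512 512     <- grid_points(1:3): global array sizes
!   /scratch/btio   <- dir_path: output directory/file path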
if (rank .eq. root) then
! take filename from command-line argument if there is any
call getarg(0, cmd)
argc = IARGC()
if (argc .GT. 1) then
print*,'Usage: ',trim(cmd),' [filename]'
niter = -1
goto 777
endif
filename = 'inputbt.data'
if (argc .EQ. 1) call getarg(1, filename)
open(unit=2,file=trim(filename),status='old', iostat=fstatus)
if (fstatus .eq. 0) then
233 FORMAT(' Reading from input file ',A)
write(*,233) trim(filename)
read (2,*) io_mode
read (2,*) io_method
read (2,*) niter
read (2,*) grid_points(1), grid_points(2), grid_points(3)
read (2,'(a)') dir_path
close(2)
else
234 format(' No input file inputbt.data. Exiting ...')
write(*,234)
niter = -1
goto 777
endif
endif
777 call MPI_Bcast(io_mode, 1, MPI_CHARACTER, root, MPI_COMM_WORLD, err)
call MPI_Bcast(io_method, 1, MPI_INTEGER, root, MPI_COMM_WORLD, err)
call MPI_Bcast(niter, 1, MPI_INTEGER, root, MPI_COMM_WORLD, err)
call MPI_Bcast(grid_points, 3, MPI_INTEGER8, root, MPI_COMM_WORLD, err)
call MPI_Bcast(dir_path, 128, MPI_CHARACTER, root, MPI_COMM_WORLD, err)
if (niter .EQ. -1) goto 999
call allocate_variables
call make_set
!---------------------------------------------------------------------
! Synchronize before placing time stamp
!---------------------------------------------------------------------
call MPI_Barrier(MPI_COMM_WORLD, err)
t1 = MPI_Wtime()
t_total = MPI_Wtime()
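! timer segments: t2-t1 = file create/open, t3-t2 = write/read loop,
! t4-t3 = cleanup/close (passed to report_io_performance below as
! init_time, rw_time and close_time)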
num_io = 0
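! io_method selects the I/O path:
!   0 : MPI-IO, collective          1 : MPI-IO, independent
!   2 : PnetCDF, blocking           3 : PnetCDF, non-blocking
! (report_io_performance below also treats 4 and 5 as independent
! PnetCDF variants, blocking and non-blocking respectively; that
! mapping is inferred from its io_method tests, not stated here)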
if (io_method .LT. 2) then ! 0: collective I/O, 1: independent I/O
err = mpiio_setup(io_mode)
else
err = pnetcdf_setup(io_mode, io_method)
endif
if (err .EQ. 0) goto 999
t2 = MPI_Wtime()
do i=1, niter
if (io_mode .EQ. 'w') then
if (io_method .LT. 2) then ! 0: collective I/O, 1: independent I/O
call mpiio_write(io_method)
else
call pnetcdf_write
endif
else
if (io_method .LT. 2) then ! 0: collective I/O, 1: independent I/O
call mpiio_read(io_method)
else
call pnetcdf_read
endif
endif
end do
t3 = MPI_Wtime()
if (io_method .LT. 2) then ! 0: collective I/O, 1: independent I/O
call mpiio_cleanup
else
call pnetcdf_cleanup
endif
call deallocate_variables
t4 = MPI_Wtime()
t = t4 - t1
call MPI_Reduce(t, tmax, 1, MPI_DOUBLE_PRECISION, MPI_MAX, &
root, MPI_COMM_WORLD, err)
call report_io_performance(t, t2 - t1, t3 - t2, t4 - t3, rank, nprocs, root, io_method)
t_total = MPI_Wtime() - t_total
call MPI_Reduce(t_total, tmax, 1, MPI_DOUBLE_PRECISION, MPI_MAX, &
root, MPI_COMM_WORLD, err)
t_total = tmax
if (io_mode .EQ. 'w') then
call MPI_Reduce(t_create, tmax, 1, MPI_DOUBLE_PRECISION, &
MPI_MAX, root, MPI_COMM_WORLD, err)
t_create = tmax
call MPI_Reduce(t_post_w, tmax, 1, MPI_DOUBLE_PRECISION, &
MPI_MAX, root, MPI_COMM_WORLD, err)
t_post_w = tmax
call MPI_Reduce(t_wait_w, tmax, 1, MPI_DOUBLE_PRECISION, &
MPI_MAX, root, MPI_COMM_WORLD, err)
t_wait_w = tmax
else
call MPI_Reduce(t_open, tmax, 1, MPI_DOUBLE_PRECISION, &
MPI_MAX, root, MPI_COMM_WORLD, err)
t_open = tmax
call MPI_Reduce(t_post_r, tmax, 1, MPI_DOUBLE_PRECISION, &
MPI_MAX, root, MPI_COMM_WORLD, err)
t_post_r = tmax
call MPI_Reduce(t_wait_r, tmax, 1, MPI_DOUBLE_PRECISION, &
MPI_MAX, root, MPI_COMM_WORLD, err)
t_wait_r = tmax
endif
if ( rank .eq. root ) then
striping_factor = 0
striping_unit = 0
call get_file_striping(info_used, striping_factor, striping_unit)
call print_io_hints(info_used)
n3 = grid_points(1)*grid_points(2)*grid_points(3)
navg = n3 * 5 * 8 ! I/O amount per write/read
navg = navg * num_io ! I/O amount for all write/read
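! e.g. (illustrative numbers) a 512 x 512 x 512 grid gives
! 512^3 * 5 * 8 bytes = 5 GiB per iteration; with num_io = 40
! iterations the total I/O amount is 200 GiB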
3000 format('#%$: ', A, ': ', A)
3001 format('#%$: ', A, ': ', F16.2)
3002 format('#%$: ', A, ': ', I13)
print 3002,' number_of_cells', ncells
print 3002,' number_of_iterations', niter
if (io_mode .EQ. 'w') then
print 3000,'io_operation', 'write'
else
print 3000,'io_operation', 'read'
endif
print 3002,'n_proc', nprocs
print 3002,'size_x', grid_points(1)
print 3002,'size_y', grid_points(2)
print 3002,'size_z', grid_points(3)
print 3002,'n_itr', niter
print 3001,'io_size', navg
if (io_method > 1) then ! 2 and above are PnetCDF methods
print 3000,'api', 'pnetcdf'
else
print 3000,'api', 'mpiio'
endif
print 3000,'file_name',trim(dir_path)
print 3002,'stripe_count',striping_factor
print 3002,'stripe_size',striping_unit
navg = navg / 1048576.0 ! I/O amount in MiB
2000 FORMAT(A)
2001 FORMAT(A, A)
2002 FORMAT(A, I9)
2003 FORMAT(A, I9, A)
2004 FORMAT(A, F12.2)
2005 FORMAT(A, F12.2, A)
if (io_mode .EQ. 'w') then
print 2000,'-- BT-IO Benchmark (write operation only) --'
else
print 2000,'-- BT-IO Benchmark (read operation only) --'
endif
print 2002,'Number of MPI processes : ',nprocs
print 2002,'Global array size X : ',grid_points(1)
print 2002,'Global array size Y : ',grid_points(2)
print 2002,'Global array size Z : ',grid_points(3)
print 2002,'Number of I/O iterations : ',niter
print 2005,'Total I/O amount : ',navg, ' MiB'
print 2004,'Total I/O time in sec : ',t_total
if (io_mode .EQ. 'w') then
print 2004,'Max file create time in sec : ',t_create
print 2004,'Max write post time in sec : ',t_post_w
print 2004,'Max write wait time in sec : ',t_wait_w
else
print 2004,'Max file open time in sec : ',t_open
print 2004,'Max read post time in sec : ',t_post_r
print 2004,'Max read wait time in sec : ',t_wait_r
endif
navg = navg / t_total ! I/O bandwidth in MiB/s
print 2005,'I/O bandwidth : ',navg, ' MiB/s'
print 2000, '------------------------------------------'
if (io_method .EQ. 0) then ! 0: collective I/O, 1: independent I/O
print 2000,'Using MPI collective I/O method'
elseif (io_method .EQ. 1) then
print 2000,'Using MPI independent I/O method'
elseif (io_method .EQ. 2) then
print 2000,'Using Parallel netCDF blocking I/O method'
else
print 2000,'Using Parallel netCDF non-blocking I/O method'
endif
print 2001,'output file path : ',trim(dir_path)
print 2002,'file striping count : ',striping_factor
print 2003,'file striping size : ',striping_unit,' bytes'
endif
if (info_used .NE. MPI_INFO_NULL) &
call MPI_Info_free(info_used, err)
if (io_method .LT. 2) goto 999 ! MPI-IO methods skip the PnetCDF malloc check
! print info about PnetCDF internal malloc usage
err = nfmpi_inq_malloc_max_size(malloc_size)
if (err .EQ. NF_NOERR) then
call MPI_Reduce(malloc_size, sum_size, 1, MPI_OFFSET, MPI_SUM, &
0, MPI_COMM_WORLD, err)
if (rank .EQ. 0) then
print 2000, '------------------------------------------'
print 2003, &
'maximum heap memory allocated by PnetCDF internally is', &
sum_size/1048576, ' MiB'
endif
err = nfmpi_inq_malloc_size(malloc_size)
call MPI_Reduce(malloc_size, sum_size, 1, MPI_OFFSET, MPI_SUM, &
0, MPI_COMM_WORLD, err)
if (rank .EQ. 0 .AND. sum_size .GT. 0_MPI_OFFSET_KIND) &
print 2003, &
'heap memory allocated by PnetCDF internally has ', &
sum_size, ' bytes yet to be freed'
endif
999 err = nf90mpi_mpi_finalize()
end program main
!---------------------------------------------------------------------------
! print I/O performance numbers
!---------------------------------------------------------------------------
subroutine report_io_performance(btio_time, init_time, rw_time, close_time, rank, NumPEs, root, io_method)
use pnetcdf
use mpi
integer root, NumPEs, io_method, rank
double precision btio_time, init_time, rw_time, close_time
! local variables
integer i, err, ierr ! err receives nfmpi_* return codes; i is a loop index
integer(kind=MPI_OFFSET_KIND) bb_data, bb_meta, bb_buffer
integer(kind=MPI_OFFSET_KIND) bb_meta_all, bb_data_all, bb_buffer_all
double precision time_io_max, time_io_min, time_io_mean, time_io_var
double precision time_init_max, time_init_min, time_init_mean, time_init_var
double precision time_rw_max, time_rw_min, time_rw_mean, time_rw_var
double precision time_close_max, time_close_min, time_close_mean, time_close_var
double precision bb_time(13), bb_time_max(13), bb_time_min(13), bb_time_mean(13), bb_time_var(13)
double precision var(13), total_max, total_min, total_mean, total_var
double precision time_staging
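! bb_time(1:13) collects the burst-buffer driver timers, in the order
! they are printed below: total, create, enddef, put, flush, close,
! put_data_wr, put_meta_wr, put_num_wr, flush_replay, flush_data_rd,
! flush_put, flush_wait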
err = nfmpi_inq_bb_time( bb_time(1), bb_time(2), bb_time(3), bb_time(4), bb_time(5), bb_time(6))
err = nfmpi_inq_bb_time_put(bb_time(7), bb_time(8), bb_time(9))
err = nfmpi_inq_bb_time_flush(bb_time(10), bb_time(11), bb_time(12), bb_time(13))
err = nfmpi_inq_bb_size(bb_data, bb_meta, bb_buffer)
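! Each timer follows the same pattern: MPI_Reduce gives the max and
! min on root, MPI_Allreduce gives every rank the sum (hence the
! mean, which each rank needs locally to form its squared deviation),
! and a final MPI_Reduce of the squared deviations yields the
! population variance  var = (1/N) * sum_i (x_i - mean)^2  on root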
call MPI_Reduce(btio_time, time_io_max, 1, MPI_DOUBLE_PRECISION, MPI_MAX, root, MPI_COMM_WORLD, ierr)
call MPI_Reduce(btio_time, time_io_min, 1, MPI_DOUBLE_PRECISION, MPI_MIN, root, MPI_COMM_WORLD, ierr)
call MPI_Allreduce(btio_time, time_io_mean, 1, MPI_DOUBLE_PRECISION, MPI_SUM, MPI_COMM_WORLD, ierr)
time_io_mean = time_io_mean / NumPEs
var(1) = (btio_time - time_io_mean) * (btio_time - time_io_mean)
call MPI_Reduce(var(1), time_io_var, 1, MPI_DOUBLE_PRECISION, MPI_SUM, root, MPI_COMM_WORLD, ierr)
time_io_var = time_io_var / NumPEs
call MPI_Reduce(init_time, time_init_max, 1, MPI_DOUBLE_PRECISION, MPI_MAX, root, MPI_COMM_WORLD, ierr)
call MPI_Reduce(init_time, time_init_min, 1, MPI_DOUBLE_PRECISION, MPI_MIN, root, MPI_COMM_WORLD, ierr)
call MPI_Allreduce(init_time, time_init_mean, 1, MPI_DOUBLE_PRECISION, MPI_SUM, MPI_COMM_WORLD, ierr)
time_init_mean = time_init_mean / NumPEs
var(1) = (init_time - time_init_mean) * (init_time - time_init_mean)
call MPI_Reduce(var(1), time_init_var, 1, MPI_DOUBLE_PRECISION, MPI_SUM, root, MPI_COMM_WORLD, ierr)
time_init_var = time_init_var / NumPEs
call MPI_Reduce(rw_time, time_rw_max, 1, MPI_DOUBLE_PRECISION, MPI_MAX, root, MPI_COMM_WORLD, ierr)
call MPI_Reduce(rw_time, time_rw_min, 1, MPI_DOUBLE_PRECISION, MPI_MIN, root, MPI_COMM_WORLD, ierr)
call MPI_Allreduce(rw_time, time_rw_mean, 1, MPI_DOUBLE_PRECISION, MPI_SUM, MPI_COMM_WORLD, ierr)
time_rw_mean = time_rw_mean / NumPEs
var(1) = (rw_time - time_rw_mean) * (rw_time - time_rw_mean)
call MPI_Reduce(var(1), time_rw_var, 1, MPI_DOUBLE_PRECISION, MPI_SUM, root, MPI_COMM_WORLD, ierr)
time_rw_var = time_rw_var / NumPEs
call MPI_Reduce(close_time, time_close_max, 1, MPI_DOUBLE_PRECISION, MPI_MAX, root, MPI_COMM_WORLD, ierr)
call MPI_Reduce(close_time, time_close_min, 1, MPI_DOUBLE_PRECISION, MPI_MIN, root, MPI_COMM_WORLD, ierr)
call MPI_Allreduce(close_time, time_close_mean, 1, MPI_DOUBLE_PRECISION, MPI_SUM, MPI_COMM_WORLD, ierr)
time_close_mean = time_close_mean / NumPEs
var(1) = (close_time - time_close_mean) * (close_time - time_close_mean)
call MPI_Reduce(var(1), time_close_var, 1, MPI_DOUBLE_PRECISION, MPI_SUM, root, MPI_COMM_WORLD, ierr)
time_close_var = time_close_var / NumPEs
call MPI_Reduce(bb_time, bb_time_max, 13, MPI_DOUBLE_PRECISION, MPI_MAX, root, MPI_COMM_WORLD, ierr)
call MPI_Reduce(bb_time, bb_time_min, 13, MPI_DOUBLE_PRECISION, MPI_MIN, root, MPI_COMM_WORLD, ierr)
call MPI_Allreduce(bb_time, bb_time_mean, 13, MPI_DOUBLE_PRECISION, MPI_SUM, MPI_COMM_WORLD, ierr)
do i = 1, 13
bb_time_mean(i) = bb_time_mean(i) / NumPEs
var(i) = (bb_time(i) - bb_time_mean(i)) * (bb_time(i) - bb_time_mean(i))
end do
call MPI_Reduce(var, bb_time_var, 13, MPI_DOUBLE_PRECISION, MPI_SUM, root, MPI_COMM_WORLD, ierr)
do i = 1, 13
bb_time_var(i) = bb_time_var(i) / NumPEs
end do
call MPI_Reduce(bb_meta, bb_meta_all, 1, MPI_OFFSET, MPI_SUM, root, MPI_COMM_WORLD, ierr)
call MPI_Reduce(bb_data, bb_data_all, 1, MPI_OFFSET, MPI_SUM, root, MPI_COMM_WORLD, ierr)
call MPI_Reduce(bb_buffer, bb_buffer_all, 1, MPI_OFFSET, MPI_MAX, root, MPI_COMM_WORLD, ierr)
if (rank .EQ. root) then
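! nfmpi_stage_out_env comes from the burst-buffer PnetCDF branch; it
! presumably returns the time spent staging the log files out, which
! is added to the total times printed below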
call nfmpi_stage_out_env(time_staging)
1002 format(A,I13,A)
1007 format(A,A)
1008 format('#%$: ', A, ': ', F16.2)
1009 format('#%$: ', A, ': ', I13)
1010 format('#%$: ', A, ': ', A)
1011 format('#%$: ', A, ': ', F16.4)
print 1009,' number_of_processes', NumPEs
if ((io_method .EQ. 3) .OR. (io_method .EQ. 5)) then
print 1010,' nonblocking_io', '1'
else
print 1010,' nonblocking_io', '0'
endif
if (io_method > 3) then
print 1010,' indep_io','1'
else
print 1010,' indep_io', '0'
endif
print 1008,' init_time_max ', time_init_max
print 1008,' init_time_min ', time_init_min
print 1008,' init_time_mean ', time_init_mean
print 1008,' init_time_var ', time_init_var
print 1008,' rw_time_max ', time_rw_max
print 1008,' rw_time_min ', time_rw_min
print 1008,' rw_time_mean ', time_rw_mean
print 1008,' rw_time_var ', time_rw_var
print 1008,' close_time_max ', time_close_max
print 1008,' close_time_min ', time_close_min
print 1008,' close_time_mean ', time_close_mean
print 1008,' close_time_var ', time_close_var
print 1008,' btio_time_max ', time_io_max
print 1008,' btio_time_min ', time_io_min
print 1008,' btio_time_mean ', time_io_mean
print 1008,' btio_time_var ', time_io_var
print 1008,' total_time_max ', time_io_max + time_staging
print 1008,' total_time_min ', time_io_min + time_staging
print 1008,' total_time_mean ', time_io_mean + time_staging
print 1008,' total_time_var ', time_io_var
print 1008,' stage_time ', time_staging
print 1008,' bb_total_time_max ', bb_time_max(1)
print 1008,' bb_create_time_max ', bb_time_max(2)
print 1008,' bb_enddef_time_max ', bb_time_max(3)
print 1008,' bb_put_time_max ', bb_time_max(4)
print 1008,' bb_flush_time_max ', bb_time_max(5)
print 1008,' bb_close_time_max ', bb_time_max(6)
print 1008,' bb_put_data_wr_time_max ', bb_time_max(7)
print 1008,' bb_put_meta_wr_time_max ', bb_time_max(8)
print 1008,' bb_put_num_wr_time_max ', bb_time_max(9)
print 1008,' bb_flush_replay_time_max ', bb_time_max(10)
print 1008,' bb_flush_data_rd_time_max ', bb_time_max(11)
print 1008,' bb_flush_put_time_max ', bb_time_max(12)
print 1008,' bb_flush_wait_time_max ', bb_time_max(13)
print 1008,' bb_total_time_min ', bb_time_min(1)
print 1008,' bb_create_time_min ', bb_time_min(2)
print 1008,' bb_enddef_time_min ', bb_time_min(3)
print 1008,' bb_put_time_min ', bb_time_min(4)
print 1008,' bb_flush_time_min ', bb_time_min(5)
print 1008,' bb_close_time_min ', bb_time_min(6)
print 1008,' bb_put_data_wr_time_min ', bb_time_min(7)
print 1008,' bb_put_meta_wr_time_min ', bb_time_min(8)
print 1008,' bb_put_num_wr_time_min ', bb_time_min(9)
print 1008,' bb_flush_replay_time_min ', bb_time_min(10)
print 1008,' bb_flush_data_rd_time_min ', bb_time_min(11)
print 1008,' bb_flush_put_time_min ', bb_time_min(12)
print 1008,' bb_flush_wait_time_min ', bb_time_min(13)
print 1008,' bb_total_time_mean ', bb_time_mean(1)
print 1008,' bb_create_time_mean ', bb_time_mean(2)
print 1008,' bb_enddef_time_mean ', bb_time_mean(3)
print 1008,' bb_put_time_mean ', bb_time_mean(4)
print 1008,' bb_flush_time_mean ', bb_time_mean(5)
print 1008,' bb_close_time_mean ', bb_time_mean(6)
print 1008,' bb_put_data_wr_time_mean ', bb_time_mean(7)
print 1008,' bb_put_meta_wr_time_mean ', bb_time_mean(8)
print 1008,' bb_put_num_wr_time_mean ', bb_time_mean(9)
print 1008,' bb_flush_replay_time_mean ', bb_time_mean(10)
print 1008,' bb_flush_data_rd_time_mean ', bb_time_mean(11)
print 1008,' bb_flush_put_time_mean ', bb_time_mean(12)
print 1008,' bb_flush_wait_time_mean ', bb_time_mean(13)
print 1011,' bb_total_time_var ', bb_time_var(1)
print 1011,' bb_create_time_var ', bb_time_var(2)
print 1011,' bb_enddef_time_var ', bb_time_var(3)
print 1011,' bb_put_time_var ', bb_time_var(4)
print 1011,' bb_flush_time_var ', bb_time_var(5)
print 1011,' bb_close_time_var ', bb_time_var(6)
print 1011,' bb_put_data_wr_time_var ', bb_time_var(7)
print 1011,' bb_put_meta_wr_time_var ', bb_time_var(8)
print 1011,' bb_put_num_wr_time_var ', bb_time_var(9)
print 1011,' bb_flush_replay_time_var ', bb_time_var(10)
print 1011,' bb_flush_data_rd_time_var ', bb_time_var(11)
print 1011,' bb_flush_put_time_var ', bb_time_var(12)
print 1011,' bb_flush_wait_time_var ', bb_time_var(13)
print 1009,' bb_metadata_size ', bb_meta_all
print 1009,' bb_data_size ', bb_data_all
print 1009,' bb_flush_buffer_size ', bb_buffer_all
endif
end subroutine report_io_performance