-
Notifications
You must be signed in to change notification settings - Fork 0
/
dying.c
98 lines (80 loc) · 1.83 KB
/
dying.c
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
#include <assert.h>
#include <stdio.h>
#include <stdlib.h>
#include <signal.h>
#include <unistd.h>
#include <string.h>
#include "dying.h"
#include "util.h"
#include "corrected_tree.h"
/* Communicator filled in by the PMPI_Comm_split() call in die_if_needed();
   replace_comm_world() returns it in place of MPI_COMM_WORLD. */
static MPI_Comm MPI_COMM_LIVING_WORLD;
/* Non-zero when read_env_will_die() reported this rank as listed in
   DYING_LIST. Assigned in die_if_needed() and tested in die_in_bcast(). */
int will_die = 0;
/*
 * Tell whether `rank` is listed in the DYING_LIST environment variable.
 *
 * The value is obtained through read_env_or_fail() and scanned as a sequence
 * of decimal integers separated by ';' (for example "1;4;7"). Fields from
 * which no integer can be parsed (such as an empty field produced by a
 * trailing ';') are skipped. A NULL list is treated as empty.
 *
 * rank: the rank to look for.
 * Returns 1 if rank appears in the list, 0 otherwise.
 */
int read_env_will_die(int rank)
{
    const char *dying_list = read_env_or_fail("DYING_LIST");

    while (dying_list) {
        int dying_rank;

        /* sscanf returns EOF (a non-zero value) on an empty field, so only a
           result of exactly 1 means dying_rank was actually assigned. */
        if (sscanf(dying_list, "%d", &dying_rank) == 1 && dying_rank == rank) {
            return 1;
        }

        /* No further ';' means this was the last field of the list. */
        dying_list = strchr(dying_list, ';');
        if (dying_list) {
            dying_list++;
        }
    }
    return 0;
}
/*
 * Look up whether this process is marked as dying, split MPI_COMM_WORLD
 * accordingly, and terminate the process if it is one of the dying ranks.
 *
 * Side effects: sets the global will_die and stores the communicator obtained
 * from the split in MPI_COMM_LIVING_WORLD. Aborts the whole job if the split
 * fails. Does not return on dying ranks.
 */
void die_if_needed()
{
    int my_rank;
    PMPI_Comm_rank(MPI_COMM_WORLD, &my_rank);

    will_die = read_env_will_die(my_rank);

    /* Split the world in two: one communicator for everything except the
       broadcast, while the original MPI_COMM_WORLD stays in use for the
       corrected broadcast. Dying ranks get color 1, all others color 0. */
    const int split_color = will_die ? 1 : 0;
    if (PMPI_Comm_split(MPI_COMM_WORLD, split_color, my_rank,
                        &MPI_COMM_LIVING_WORLD) != MPI_SUCCESS) {
        fprintf(stderr, "Failed to split communicator @ %d\n", my_rank);
        PMPI_Abort(MPI_COMM_WORLD, -1);
    }

    if (!will_die) {
        return;
    }

    /* This rank is marked as dying: shut down MPI and leave. */
    PMPI_Finalize();
    exit(0);
}
/*
 * Terminate the calling process at the end of a run.
 *
 * Flushes stdout and stderr first. Rank 0 of MPI_COMM_WORLD then calls
 * PMPI_Abort() on MPI_COMM_SELF with error code 0; every other rank simply
 * calls exit(0). PMPI_Finalize() is not called here.
 * NOTE(review): presumably a regular finalize is avoided because some ranks
 * have already exited - confirm.
 */
void dying_finalize()
{
    int my_rank;
    PMPI_Comm_rank(MPI_COMM_WORLD, &my_rank);

    fflush(stdout);
    fflush(stderr);

    if (my_rank != 0) {
        exit(0);
    }
    PMPI_Abort(MPI_COMM_SELF, 0);
}
/*
 * Map a communicator onto the one that should actually be used.
 *
 * comm: communicator passed in by the caller.
 * Returns MPI_COMM_LIVING_WORLD when comm is MPI_COMM_WORLD (some ranks of
 * the original world are potentially dead); any other communicator is
 * returned unchanged.
 */
MPI_Comm replace_comm_world(MPI_Comm comm)
{
    return (comm == MPI_COMM_WORLD) ? MPI_COMM_LIVING_WORLD : comm;
}
/*
 * Terminate the process if it has been marked as dying.
 *
 * Returns 0 when will_die is not set. When will_die is set, MPI is finalized
 * and the process exits with status 0, so the function does not return.
 * NOTE(review): the name suggests this is invoked from the broadcast path -
 * confirm against callers.
 */
int die_in_bcast()
{
    if (!will_die) {
        return 0;
    }

    PMPI_Finalize();
    exit(0);
    /* not reached */
    return 1;
}