-
Notifications
You must be signed in to change notification settings - Fork 0
/
bvg.cc
368 lines (328 loc) · 10.7 KB
/
bvg.cc
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
// Read a mutation probability and gene data source file name from the
// command line. Initialize a bitvector population from the data
// stream, and calculate the relation distance between all pairs of
// vectors in the population.
//
// Find an undirected graph that spans all the bitvectors and
// minimizes the bit difference between adjacent bitvectors normalized
// to the number of expected mutations.
//
// Orient the graph into a rooted tree by first transforming the
// spanning graph's vector and edge representation into a neighborhood
// representation. The leaves of the tree are the bitvectors with
// only a single neighbor. Find the leaves, note their parents, then
// trim them from the graph, and repeat until there is a single
// bitvector left, the "progenitor".
//
// Run 'make test' to build the executables, run the tests, and check
// the results. This code was developed on MacOSX, but should run on
// any standardly-endowed Unix system with a Makefile tweak or two.
//
// Or just run 'make bitvectors-parents.data' for the "solution".
#include <algorithm>
#include <bitset>
#include <cassert>
#include <fstream>
#include <iostream>
#include <list>
#include <map>
#include <set>
#include <sstream>
#include <string>
#include <vector>
using namespace std;
static void showUsage(ostream &errs, const char *cmd)
{
errs << endl
<< "Usage: " << cmd << " <prob> <data>" << endl
<< "Where: <prop> is the bitwise probability of mutation " << endl
<< " as an integer percentage (20 for example). " << endl
<< " <data> is a file of " << SCALE
<< " bit strings of length " << SCALE << "." << endl
<< "Each line matches the regular expression "
<< "'^[01]{" << SCALE << "}$', " << endl
<< "and there are " << SCALE << " lines in <data>." << endl
<< endl;
}
// A vector of SCALE bits with mutationPercentage, the bitwise
// probability of mutation each generation.
//
// The difference of two BitVectors is the number of bits different
// between lhs and rhs normalized to the expected mutation count.
//
struct BitVector
{
static int mutationPercentage;
int index;
bitset<SCALE> bits;
friend size_t operator-(const BitVector &lhs, const BitVector &rhs)
{
static const size_t expected = (SCALE * mutationPercentage / 100);
bitset<SCALE> difference(lhs.bits ^ rhs.bits);
const size_t distance = difference.count();
if (distance > expected) return distance - expected;
return expected - distance;
}
BitVector(): index(-1), bits() {}
BitVector(int n, const string &s): index(n), bits(s) {}
};
// A population of BitVectors initialized from the stream s.
// There is a problem with error on line when *this is false.
//
struct Population
{
vector<BitVector> bitVectors;
int line;
string error;
operator bool() { return line == -1; }
Population(istream &s): bitVectors(SCALE), line(-1), error()
{
string bitvector;
int n = 0;
for (; s && n < bitVectors.size(); ++n) {
if (getline(s, bitvector) && bitvector.size() == SCALE) {
bitVectors[n] = BitVector(n, bitvector);
} else {
break;
}
}
if (n < bitVectors.size()) {
line = n; error = bitvector;
}
}
};
// Relation between two BitVectors, with indexes left and right, and
// the normalized bit distance nbd between them.
//
struct Relation
{
size_t nbd;
int left; int right;
// True if lhs is probably a closer relation than rhs.
//
friend bool operator<(const Relation &lhs, const Relation &rhs)
{
return lhs.nbd < rhs.nbd;
}
Relation(): nbd(0), left(0), right(0) {}
Relation(size_t d, int p, int c): nbd(d), left(p), right(c) {}
};
// A connected graph of Relations built while constructing a
// SpanningGraph.
//
struct ConnectedGraph
{
set<int> vertexes;
vector<Relation> edges;
// True if this graph contains all of the bitvectors.
//
bool full() { return vertexes.size() >= SCALE; }
// Add e to this.
//
void add(const Relation &e) {
vertexes.insert(e.left);
vertexes.insert(e.right);
edges.push_back(e);
}
// True if e connects to (has some vertex in common with) this.
// Otherwise false.
//
bool connectsTo(const Relation &e) {
return vertexes.count(e.left) || vertexes.count(e.right);
}
// Merge that with this (when some new Edge connects two subgraphs).
//
void mergeWith(const ConnectedGraph &that) {
vertexes.insert(that.vertexes.begin(), that.vertexes.end());
edges.insert(edges.end(), that.edges.begin(), that.edges.end());
}
ConnectedGraph(): vertexes(), edges() {}
ConnectedGraph(const Relation &e): vertexes(), edges() {
add(e);
}
};
// A undirected graph spanning all bitvectors and minimizing the
// Relation distance between connected bitvectors.
//
struct SpanningGraph
{
typedef list<ConnectedGraph>::iterator Cp;
ConnectedGraph result;
// Merge any two subgraphs in connected joined by r.
//
Cp merge(list<ConnectedGraph> &connected, Cp resultP, const Relation &r) {
for (Cp pC = connected.begin(); pC != connected.end(); ++pC) {
if (pC->connectsTo(r)) {
if (resultP == connected.end()) {
resultP = pC;
} else {
resultP->mergeWith(*pC);
connected.erase(pC);
break;
}
}
}
return resultP;
}
// Add r to some subgraph. Start a new subgraph if no connection
// was already found by merge(). Return true if graph at resultP
// spans all bitvectors. Return false if the graph extraction
// should continue.
//
bool add(list<ConnectedGraph> &connected, Cp resultP, const Relation &r) {
if (resultP == connected.end()) {
ConnectedGraph cg(r);
connected.push_back(cg);
} else {
resultP->add(r);
if (resultP->full()) return true;
}
return false;
}
// Return all the relations in population.
//
static vector<Relation> findAll(const Population &population)
{
const vector<BitVector> &bitVectors = population.bitVectors;
const size_t size = bitVectors.size();
typedef vector<BitVector>::const_iterator BvP;
vector<Relation> result(size * (size - 1) / 2);
vector<Relation>::iterator op = result.begin();
for (BvP lp = bitVectors.begin(); lp != bitVectors.end(); ++lp) {
for (BvP rp = lp + 1; rp < bitVectors.end(); ++op, ++rp) {
*op = Relation(*lp - *rp, lp->index, rp->index);
}
}
return result;
}
// *this is true when the result graph spans all the bitvectors.
// Otherwise false.
//
operator bool() { return result.vertexes.size() == SCALE; }
// Find a graph spanning all the bitvectors in relations that
// minimizes the normalized bit distances between bitvectors.
//
SpanningGraph(const Population &population): result()
{
typedef vector<Relation>::const_iterator Rp;
vector<Relation> relations(findAll(population));
sort(relations.begin(), relations.end());
list<ConnectedGraph> connected;
Cp resultP = connected.end();
for (Rp pR = relations.begin(); pR != relations.end(); ++pR) {
resultP = merge(connected, resultP, *pR);
if (add(connected, resultP, *pR)) break;
resultP = connected.end();
}
if (resultP != connected.end()) result = *resultP;
}
};
// From the undirected connected graph cg, extract a directed tree,
// where result is a 0-based array such that result[n] is the parent
// of child n and result[r] == -1 means that r is the root of the
// tree.
//
struct Genealogy {
typedef map<int, set<int> > Neighbors;
typedef Neighbors::iterator Np;
typedef vector<Np> Leaves;
typedef Leaves::iterator Pl;
vector<int> result;
bool itsOk;
// Return a map of bitvector indexes to the indexes of their
// neighbors from the Relations in edges.
//
static Neighbors discover(const vector<Relation> &edges) {
typedef vector<Relation>::const_iterator Rp;
Neighbors result;
for (Rp pE = edges.begin(); pE != edges.end(); ++pE) {
result[pE->left].insert(pE->right);
result[pE->right].insert(pE->left);
}
return result;
}
// Remove all leaves from the map neighbors. Return true if a leaf
// was removed from neighbors. Otherwise return false.
//
static bool trim(Neighbors &neighbors, Leaves &leaves) {
bool result = false;
for (Pl pL = leaves.begin(); pL != leaves.end(); ++pL) {
neighbors.erase(*pL);
result = true;
}
return result;
}
friend ostream &operator<<(ostream &s, const Genealogy &g) {
typedef vector<int>::const_iterator Pi;
for (Pi pI = g.result.begin(); pI != g.result.end(); ++pI) {
s << *pI << endl;
}
return s;
}
// True if this converged to a rooted tree.
//
operator bool() { return itsOk; }
// Extract neighborhoods from cg. Find the leaves (vertexes with
// only one neighbor). For each leaf, note its parent in result,
// remove it from its parent's neighborhood, then erase it from
// neighbors. Consume neighbors until only one vertex is left.
//
Genealogy(const ConnectedGraph &cg): result(SCALE, -1)
{
Neighbors neighbors(discover(cg.edges));
while (neighbors.size() > 1) {
vector<Np> leaves;
for (Np pC = neighbors.begin(); pC != neighbors.end(); ++pC) {
if (pC->second.size() == 1) {
const size_t child = pC->first;
const size_t parent = *pC->second.begin();
result[child] = parent;
pC->second.erase(pC->second.begin());
leaves.push_back(pC);
neighbors.find(parent)->second.erase(child);
}
}
itsOk = trim(neighbors, leaves);
if (!itsOk) break;
}
}
};
int BitVector::mutationPercentage;
static bool initializeMutationPercentage(char *percentageString)
{
istringstream issMp; issMp.str(percentageString);
return issMp >> BitVector::mutationPercentage
&& BitVector::mutationPercentage >= 0
&& BitVector::mutationPercentage <= 100;
}
int main(int ac, char *av[])
{
if (ac == 3) {
if (initializeMutationPercentage(av[1])) {
ifstream dataStream(av[2]);
Population population(dataStream);
if (population) {
SpanningGraph graph(population);
if (graph) {
Genealogy genealogy(graph.result);
if (genealogy) {
cout << genealogy;
return 0;
} else {
cerr << av[0] << ": Error: The genealogy did not converge." << endl;
}
} else {
cerr << av[0] << ": Error: Cannot relate entire population." << endl;
}
} else {
cerr << av[0] << ": Error on line " << population.line
<< ": " << population.error << endl;
}
} else {
cerr << av[0] << ": Error: First argument '" << av[1]
<< "' should be an integer between 0 and 100." << endl;
}
}
showUsage(cerr, av[0]);
return 1;
}