Skip to content

Commit

Permalink
LD clumping out-of-core should work
Browse files Browse the repository at this point in the history
  • Loading branch information
Zilong-Li committed Sep 25, 2024
1 parent 9d4c246 commit 959e499
Show file tree
Hide file tree
Showing 5 changed files with 57 additions and 129 deletions.
1 change: 1 addition & 0 deletions .github/workflows/linux.yml
Original file line number Diff line number Diff line change
Expand Up @@ -29,4 +29,5 @@ jobs:
- name: test LD module
run: |
make ld_matrix
make ld_clump
make ld_tests
15 changes: 8 additions & 7 deletions Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -190,30 +190,31 @@ ld_prune:

ld_clump:
./PCAone -B adj.residuals --ld-bim adj.kept.bim --clump example/plink.pheno0.assoc --clump-p1 0.01 --clump-p2 0.05 --clump-r2 0.1 --clump-bp 10000000 -m 0 -o adj_clump_m0
./PCAone -B adj.residuals --ld-bim adj.kept.bim --clump example/plink.pheno0.assoc --clump-p1 0.01 --clump-p2 0.05 --clump-r2 0.1 --clump-bp 10000000 -m 1 -o adj_clump_m1
./PCAone -B adj.residuals --ld-bim adj.kept.bim --clump example/plink.pheno0.assoc --clump-p1 0.01 --clump-p2 0.05 --clump-r2 0.1 --clump-bp 10000000 -m 1 -o adj_clump_m1
diff adj_clump_m0.p0.clump adj_clump_m1.p0.clump > /dev/null

ld_tests:
./PCAone -b example/plink -k 3 --ld -o adj -d 0 -m 4
./PCAone -B adj.residuals --ld-bim adj.kept.bim --ld-r2 0.8 --ld-bp 1000000 -o adj_prune_m0 -m 0
./PCAone -B adj.residuals --ld-bim adj.kept.bim --ld-r2 0.8 --ld-bp 1000000 -o adj_prune_m1 -m 2
diff adj_prune_m0.ld.prune.out adj_prune_m1.ld.prune.out > /dev/null
./PCAone -b example/plink -k 3 --ld -o adj -d 0 --maf 0.1
./PCAone -B adj.residuals --ld-bim adj.kept.bim --ld-r2 0.8 --ld-bp 1000000 -o adj_prune_m0 -m 0
./PCAone -B adj.residuals --ld-bim adj.kept.bim --ld-r2 0.8 --ld-bp 1000000 -o adj_prune_m1 -m 2
diff adj_prune_m0.ld.prune.out adj_prune_m1.ld.prune.out > /dev/null
./PCAone -b example/plink -k 3 --ld -o adj -d 1 -m 4
./PCAone -b example/plink -k 3 --ld -o adj -d 0 -m 4
./PCAone -B adj.residuals --ld-bim adj.kept.bim --ld-r2 0.8 --ld-bp 1000000 -o adj_prune_m0 -m 0
./PCAone -B adj.residuals --ld-bim adj.kept.bim --ld-r2 0.8 --ld-bp 1000000 -o adj_prune_m1 -m 2
diff adj_prune_m0.ld.prune.out adj_prune_m1.ld.prune.out > /dev/null
./PCAone -b example/plink -k 3 --ld -o adj -d 1 --maf 0.1
./PCAone -B adj.residuals --ld-bim adj.kept.bim --ld-r2 0.8 --ld-bp 1000000 -o adj_prune_m0 -m 0
./PCAone -B adj.residuals --ld-bim adj.kept.bim --ld-r2 0.8 --ld-bp 1000000 -o adj_prune_m1 -m 2
diff adj_prune_m0.ld.prune.out adj_prune_m1.ld.prune.out > /dev/null
./PCAone -b example/plink -k 3 --ld -o adj -d 2 -m 4
./PCAone -b example/plink -k 3 --ld -o adj -d 1 -m 4
./PCAone -B adj.residuals --ld-bim adj.kept.bim --ld-r2 0.8 --ld-bp 1000000 -o adj_prune_m0 -m 0
./PCAone -B adj.residuals --ld-bim adj.kept.bim --ld-r2 0.8 --ld-bp 1000000 -o adj_prune_m1 -m 2
diff adj_prune_m0.ld.prune.out adj_prune_m1.ld.prune.out > /dev/null
./PCAone -b example/plink -k 3 --ld -o adj -d 2 --maf 0.1
./PCAone -B adj.residuals --ld-bim adj.kept.bim --ld-r2 0.8 --ld-bp 1000000 -o adj_prune_m0 -m 0
./PCAone -B adj.residuals --ld-bim adj.kept.bim --ld-r2 0.8 --ld-bp 1000000 -o adj_prune_m1 -m 2
diff adj_prune_m0.ld.prune.out adj_prune_m1.ld.prune.out > /dev/null
./PCAone -b example/plink -k 3 --ld -o adj -d 2 -m 4
./PCAone -B adj.residuals --ld-bim adj.kept.bim --ld-r2 0.8 --ld-bp 1000000 -o adj_prune_m0 -m 0
./PCAone -B adj.residuals --ld-bim adj.kept.bim --ld-r2 0.8 --ld-bp 1000000 -o adj_prune_m1 -m 2
diff adj_prune_m0.ld.prune.out adj_prune_m1.ld.prune.out > /dev/null
1 change: 1 addition & 0 deletions src/Cmd.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -148,6 +148,7 @@ Param::Param(int argc, char **argv) {
maf = 1 - maf;
}
keepsnp = maf > 0 ? true : false;
if(maf && out_of_core) std::cerr << "does not support --maf filters for out-of-core mode!\n" ;
if (ld_r2 > 0 || !clump.empty()) pca = false;
if (svd_t == SvdType::PCAoneAlg2 && !noshuffle) perm = true;
if(svd_t == SvdType::FULL) out_of_core = false;
Expand Down
160 changes: 42 additions & 118 deletions src/LD.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -266,29 +266,18 @@ std::vector<UMapIntPds> map_index_snps(const std::string& fileassoc,
return vm;
}

void ld_clump_big(std::string fileout, std::string fileassoc,
std::string colnames, int clump_bp, double clump_r2,
double clump_p1, double clump_p2, const SNPld& snp,
const MyMatrix& G) {
if ((long int)snp.pos.size() != G.cols())
cao.error("The number of variants is not matching the LD matrix");
cao.print(tick.date(), "LD-based clumping: p1 =", clump_p1,
", p2 =", clump_p2, ", r2 =", clump_r2, ", bp =", clump_bp,
", associated file:", fileassoc);
// 0: chr, 1: pos, 2: pvalue
auto colidx = valid_assoc_file(fileassoc, colnames);
SNPld snp_t;
std::string head = get_snp_pos_bim(snp_t, fileassoc, true, colidx);

Int2D idx_per_chr, bp_per_chr;
std::tie(idx_per_chr, bp_per_chr) = get_target_snp_idx(snp_t, snp);
const auto pvals_per_chr = map_index_snps(fileassoc, colidx, clump_p2);
void ld_clump_single_pheno(std::string fileout, int clump_bp, double clump_r2,
double clump_p1, double clump_p2, std::string head,
const MyMatrix& Gs, const SNPld& snp_t,
const SNPld& snp, const Int2D& idx_per_chr,
const Int2D& bp_per_chr,
const std::vector<UMapIntPds>& pvals_per_chr) {
// sort by pvalues and get new idx
const MyArray sds = 1.0 / calc_sds(G);
const double df = 1.0 / (G.rows() - 1); // N-1
const MyArray sds = 1.0 / calc_sds(Gs);
const double df = 1.0 / (Gs.rows() - 1); // N-1
std::ofstream ofs(fileout);
ofs << head + "\tSP2" << std::endl;
for (int c = 0; c < (int)idx_per_chr.size(); c++) {
for (int c = 0; c < (int)bp_per_chr.size(); c++) {
const auto idx = idx_per_chr[c];
const auto bp = bp_per_chr[c];
const auto mbp = vector2map(bp);
Expand Down Expand Up @@ -327,8 +316,11 @@ void ld_clump_big(std::string fileout, std::string fileassoc,
}
p2 = bp[k];
if (mpp.count(p2) == 0) continue;
double r =
G.col(idx[j]).dot(G.col(idx[k])) * (sds(idx[j]) * sds(idx[k]) * df);
double r = Gs.col(idx[j]).dot(Gs.col(idx[k])) *
(sds(idx[j]) * sds(idx[k]) * df);
// double r =
// G.col(idx[j]).dot(G.col(idx[k])) * (sds(idx[j]) * sds(idx[k]) *
// df);
if (r * r >= clump_r2) {
clumped.push_back(p2);
mpp.erase(p2);
Expand Down Expand Up @@ -370,114 +362,46 @@ void run_ld_stuff(const Param& params, Data* data) {
} else {
const auto assocfiles = split_string(params.clump, ",");
for (size_t i = 0; i < assocfiles.size(); i++) {
// TODO: subset G for each pheno and compared with it
// cao.print(tick.date(), "LD-based clumping: p1 =", clump_p1,
// ", p2 =", clump_p2, ", r2 =", clump_r2, ", bp =", clump_bp,
// ", associated file:", fileassoc);
auto colidx = valid_assoc_file(assocfiles[i], params.assoc_colnames);
SNPld snp_t;
std::string head = get_snp_pos_bim(snp_t, assocfiles[i], true, colidx);
const auto pvals_per_chr =
map_index_snps(assocfiles[i], colidx, params.clump_p2);
Int2D idx_per_chr, bp_per_chr;
std::tie(idx_per_chr, bp_per_chr) = get_target_snp_idx(snp_t, snp);
if (params.out_of_core) {
auto colidx = valid_assoc_file(assocfiles[i], params.assoc_colnames);
Int2D idx_per_chr, bp_per_chr;
SNPld snp_t;
std::string head = get_snp_pos_bim(snp_t, assocfiles[i], true, colidx);
std::tie(idx_per_chr, bp_per_chr) = get_target_snp_idx(snp_t, snp);
const auto pvals_per_chr =
map_index_snps(assocfiles[i], colidx, params.clump_p2);
MyMatrix G(data->nsamples, snp_t.pos.size());
int b = 0, sidx = 0;
int b = 0;
data->check_file_offset_first_var();
data->read_block_initial(data->start[b], data->stop[b], false);
Int1D cumlen(idx_per_chr.size());
int cl = 0;
for (int c = 0; c < (int)idx_per_chr.size(); c++) {
for (int sidx = 0, c = 0; c < (int)idx_per_chr.size(); c++) {
int i = 0;
for (auto icol : idx_per_chr[c]) {
if (icol >= data->start[b] && icol <= data->stop[b]) {
G.col(sidx) = data->G.col(icol - data->start[b]);
} else {
if (!(icol >= data->start[b] && icol <= data->stop[b])) {
b++;
data->read_block_initial(data->start[b], data->stop[b], false);
G.col(sidx) = data->G.col(icol - data->start[b]);
}
G.col(sidx) = data->G.col(icol - data->start[b]);
idx_per_chr[c][i] = sidx;
sidx++;
i++;
}
cl += idx_per_chr[c].size();
cumlen[c] = cl;
}
const MyArray sds = 1.0 / calc_sds(G);
const double df = 1.0 / (G.rows() - 1); // N-1
std::string fileout =
params.fileout + ".p" + std::to_string(i) + ".clump";
std::ofstream ofs(fileout);
ofs << head + "\tSP2" << std::endl;
for (int c = 0; c < (int)idx_per_chr.size(); c++) {
const auto bp = bp_per_chr[c];
const auto mbp = vector2map(bp);
// greedy clumping algorithm
auto mpp = pvals_per_chr[c]; // key: pos, val: pval
Double1D pp;
Int1D ps;
for (auto it = mpp.begin(); it != mpp.end(); it++) {
if (it->second.first <= params.clump_p1) {
ps.push_back(it->first);
pp.push_back(it->second.first);
}
}
int p, p2, j, k;
for (auto i : sortidx(pp)) { // snps sorted by p value
p = ps[i];
if (mpp.count(p) == 0)
continue; // if snps with pval < clump_p1 are already clumped
// with
// previous snps
Int1D clumped;
bool backward = true;
if (mbp.count(p) == 0) continue;
j = mbp.at(p); // j:current
k = j; // k:forward or backward
while (true) {
if (backward) {
--k;
if (k < 0 || (bp[k] < p - params.clump_bp)) {
backward = false;
k = j;
continue;
}
} else {
++k;
if (k >= (int)bp.size() || (bp[k] > p + params.clump_bp)) break;
}
p2 = bp[k];
if (mpp.count(p2) == 0) continue;
int cj = c > 0 ? cumlen[c - 1] + j - 1 : j;
int ck = c > 0 ? cumlen[c - 1] + k - 1 : k;
double r = G.col(cj).dot(G.col(ck)) * (sds(cj) * sds(ck) * df);
if (r * r >= params.clump_r2) {
clumped.push_back(p2);
mpp.erase(p2);
}
}
// what we do with clumped SNPs. sort them by pval?
ofs << pvals_per_chr[c].at(p).second << "\t";
if (clumped.empty()) {
ofs << "NONE";
} else {
Double1D opp;
for (auto op : clumped)
opp.push_back(pvals_per_chr[c].at(op).first);
k = 0;
for (auto oi : sortidx(opp)) {
if (k == opp.size() - 1)
ofs << clumped[oi];
else
ofs << clumped[oi] << ",";
k++;
}
}
ofs << std::endl;
}
// end current chr
cao.print("snp_t size", snp_t.pos.size(), "sidx", sidx);
}

ld_clump_single_pheno(
params.fileout + ".p" + std::to_string(i) + ".clump",
params.clump_bp, params.clump_r2, params.clump_p1, params.clump_p2,
head, G, snp_t, snp, idx_per_chr, bp_per_chr, pvals_per_chr);

} else {
ld_clump_big(params.fileout + ".p" + std::to_string(i) + ".clump",
assocfiles[i], params.assoc_colnames, params.clump_bp,
params.clump_r2, params.clump_p1, params.clump_p2, snp,
data->G);
ld_clump_single_pheno(
params.fileout + ".p" + std::to_string(i) + ".clump",
params.clump_bp, params.clump_r2, params.clump_p1, params.clump_p2,
head, data->G, snp_t, snp, idx_per_chr, bp_per_chr, pvals_per_chr);
}
}
}
Expand Down
9 changes: 5 additions & 4 deletions src/LD.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -24,10 +24,11 @@ void ld_prune_small(Data* data, const std::string& fileout,
void ld_prune_big(const std::string& fileout, const std::string& filebim,
const MyMatrix& G, const SNPld& snp, double r2_tol);

void ld_clump_big(std::string fileout, std::string fileassoc,
std::string colnames, int clump_bp, double clump_r2,
double clump_p1, double clump_p2, const SNPld& snp,
const MyMatrix& G);
void ld_clump_single_pheno(std::string fileout, int clump_bp, double clump_r2,
double clump_p1, double clump_p2, std::string head,
const MyMatrix& G, const SNPld& snp_t, const SNPld& snp,
const Int2D& idx_per_chr, const Int2D& bp_per_chr,
const std::vector<UMapIntPds>& pvals_per_chr);

Int1D valid_assoc_file(const std::string& fileassoc,
const std::string& colnames);
Expand Down

0 comments on commit 959e499

Please sign in to comment.