diff --git a/.github/workflows/linux.yml b/.github/workflows/linux.yml index 8527470..9e92b31 100644 --- a/.github/workflows/linux.yml +++ b/.github/workflows/linux.yml @@ -29,4 +29,5 @@ jobs: - name: test LD module run: | make ld_matrix + make ld_clump make ld_tests diff --git a/Makefile b/Makefile index 0a5fa29..9318ee9 100644 --- a/Makefile +++ b/Makefile @@ -190,18 +190,15 @@ ld_prune: ld_clump: ./PCAone -B adj.residuals --ld-bim adj.kept.bim --clump example/plink.pheno0.assoc --clump-p1 0.01 --clump-p2 0.05 --clump-r2 0.1 --clump-bp 10000000 -m 0 -o adj_clump_m0 - ./PCAone -B adj.residuals --ld-bim adj.kept.bim --clump example/plink.pheno0.assoc --clump-p1 0.01 --clump-p2 0.05 --clump-r2 0.1 --clump-bp 10000000 -m 1 -o adj_clump_m1 + ./PCAone -B adj.residuals --ld-bim adj.kept.bim --clump example/plink.pheno0.assoc --clump-p1 0.01 --clump-p2 0.05 --clump-r2 0.1 --clump-bp 10000000 -m 1 -o adj_clump_m1 + diff adj_clump_m0.p0.clump adj_clump_m1.p0.clump > /dev/null ld_tests: - ./PCAone -b example/plink -k 3 --ld -o adj -d 0 -m 4 - ./PCAone -B adj.residuals --ld-bim adj.kept.bim --ld-r2 0.8 --ld-bp 1000000 -o adj_prune_m0 -m 0 - ./PCAone -B adj.residuals --ld-bim adj.kept.bim --ld-r2 0.8 --ld-bp 1000000 -o adj_prune_m1 -m 2 - diff adj_prune_m0.ld.prune.out adj_prune_m1.ld.prune.out > /dev/null ./PCAone -b example/plink -k 3 --ld -o adj -d 0 --maf 0.1 ./PCAone -B adj.residuals --ld-bim adj.kept.bim --ld-r2 0.8 --ld-bp 1000000 -o adj_prune_m0 -m 0 ./PCAone -B adj.residuals --ld-bim adj.kept.bim --ld-r2 0.8 --ld-bp 1000000 -o adj_prune_m1 -m 2 diff adj_prune_m0.ld.prune.out adj_prune_m1.ld.prune.out > /dev/null - ./PCAone -b example/plink -k 3 --ld -o adj -d 1 -m 4 + ./PCAone -b example/plink -k 3 --ld -o adj -d 0 -m 4 ./PCAone -B adj.residuals --ld-bim adj.kept.bim --ld-r2 0.8 --ld-bp 1000000 -o adj_prune_m0 -m 0 ./PCAone -B adj.residuals --ld-bim adj.kept.bim --ld-r2 0.8 --ld-bp 1000000 -o adj_prune_m1 -m 2 diff adj_prune_m0.ld.prune.out adj_prune_m1.ld.prune.out > /dev/null @@ -209,7 +206,7 @@ ld_tests: ./PCAone -B adj.residuals --ld-bim adj.kept.bim --ld-r2 0.8 --ld-bp 1000000 -o adj_prune_m0 -m 0 ./PCAone -B adj.residuals --ld-bim adj.kept.bim --ld-r2 0.8 --ld-bp 1000000 -o adj_prune_m1 -m 2 diff adj_prune_m0.ld.prune.out adj_prune_m1.ld.prune.out > /dev/null - ./PCAone -b example/plink -k 3 --ld -o adj -d 2 -m 4 + ./PCAone -b example/plink -k 3 --ld -o adj -d 1 -m 4 ./PCAone -B adj.residuals --ld-bim adj.kept.bim --ld-r2 0.8 --ld-bp 1000000 -o adj_prune_m0 -m 0 ./PCAone -B adj.residuals --ld-bim adj.kept.bim --ld-r2 0.8 --ld-bp 1000000 -o adj_prune_m1 -m 2 diff adj_prune_m0.ld.prune.out adj_prune_m1.ld.prune.out > /dev/null @@ -217,3 +214,7 @@ ld_tests: ./PCAone -B adj.residuals --ld-bim adj.kept.bim --ld-r2 0.8 --ld-bp 1000000 -o adj_prune_m0 -m 0 ./PCAone -B adj.residuals --ld-bim adj.kept.bim --ld-r2 0.8 --ld-bp 1000000 -o adj_prune_m1 -m 2 diff adj_prune_m0.ld.prune.out adj_prune_m1.ld.prune.out > /dev/null + ./PCAone -b example/plink -k 3 --ld -o adj -d 2 -m 4 + ./PCAone -B adj.residuals --ld-bim adj.kept.bim --ld-r2 0.8 --ld-bp 1000000 -o adj_prune_m0 -m 0 + ./PCAone -B adj.residuals --ld-bim adj.kept.bim --ld-r2 0.8 --ld-bp 1000000 -o adj_prune_m1 -m 2 + diff adj_prune_m0.ld.prune.out adj_prune_m1.ld.prune.out > /dev/null diff --git a/src/Cmd.cpp b/src/Cmd.cpp index 47f8fb7..b37100e 100644 --- a/src/Cmd.cpp +++ b/src/Cmd.cpp @@ -148,6 +148,7 @@ Param::Param(int argc, char **argv) { maf = 1 - maf; } keepsnp = maf > 0 ? true : false; + if(maf && out_of_core) std::cerr << "does not support --maf filters for out-of-core mode!\n" ; if (ld_r2 > 0 || !clump.empty()) pca = false; if (svd_t == SvdType::PCAoneAlg2 && !noshuffle) perm = true; if(svd_t == SvdType::FULL) out_of_core = false; diff --git a/src/LD.cpp b/src/LD.cpp index 6a6c968..b695d22 100644 --- a/src/LD.cpp +++ b/src/LD.cpp @@ -266,29 +266,18 @@ std::vector map_index_snps(const std::string& fileassoc, return vm; } -void ld_clump_big(std::string fileout, std::string fileassoc, - std::string colnames, int clump_bp, double clump_r2, - double clump_p1, double clump_p2, const SNPld& snp, - const MyMatrix& G) { - if ((long int)snp.pos.size() != G.cols()) - cao.error("The number of variants is not matching the LD matrix"); - cao.print(tick.date(), "LD-based clumping: p1 =", clump_p1, - ", p2 =", clump_p2, ", r2 =", clump_r2, ", bp =", clump_bp, - ", associated file:", fileassoc); - // 0: chr, 1: pos, 2: pvalue - auto colidx = valid_assoc_file(fileassoc, colnames); - SNPld snp_t; - std::string head = get_snp_pos_bim(snp_t, fileassoc, true, colidx); - - Int2D idx_per_chr, bp_per_chr; - std::tie(idx_per_chr, bp_per_chr) = get_target_snp_idx(snp_t, snp); - const auto pvals_per_chr = map_index_snps(fileassoc, colidx, clump_p2); +void ld_clump_single_pheno(std::string fileout, int clump_bp, double clump_r2, + double clump_p1, double clump_p2, std::string head, + const MyMatrix& Gs, const SNPld& snp_t, + const SNPld& snp, const Int2D& idx_per_chr, + const Int2D& bp_per_chr, + const std::vector& pvals_per_chr) { // sort by pvalues and get new idx - const MyArray sds = 1.0 / calc_sds(G); - const double df = 1.0 / (G.rows() - 1); // N-1 + const MyArray sds = 1.0 / calc_sds(Gs); + const double df = 1.0 / (Gs.rows() - 1); // N-1 std::ofstream ofs(fileout); ofs << head + "\tSP2" << std::endl; - for (int c = 0; c < (int)idx_per_chr.size(); c++) { + for (int c = 0; c < (int)bp_per_chr.size(); c++) { const auto idx = idx_per_chr[c]; const auto bp = bp_per_chr[c]; const auto mbp = vector2map(bp); @@ -327,8 +316,11 @@ void ld_clump_big(std::string fileout, std::string fileassoc, } p2 = bp[k]; if (mpp.count(p2) == 0) continue; - double r = - G.col(idx[j]).dot(G.col(idx[k])) * (sds(idx[j]) * sds(idx[k]) * df); + double r = Gs.col(idx[j]).dot(Gs.col(idx[k])) * + (sds(idx[j]) * sds(idx[k]) * df); + // double r = + // G.col(idx[j]).dot(G.col(idx[k])) * (sds(idx[j]) * sds(idx[k]) * + // df); if (r * r >= clump_r2) { clumped.push_back(p2); mpp.erase(p2); @@ -370,114 +362,46 @@ void run_ld_stuff(const Param& params, Data* data) { } else { const auto assocfiles = split_string(params.clump, ","); for (size_t i = 0; i < assocfiles.size(); i++) { - // TODO: subset G for each pheno and compared with it + // cao.print(tick.date(), "LD-based clumping: p1 =", clump_p1, + // ", p2 =", clump_p2, ", r2 =", clump_r2, ", bp =", clump_bp, + // ", associated file:", fileassoc); + auto colidx = valid_assoc_file(assocfiles[i], params.assoc_colnames); + SNPld snp_t; + std::string head = get_snp_pos_bim(snp_t, assocfiles[i], true, colidx); + const auto pvals_per_chr = + map_index_snps(assocfiles[i], colidx, params.clump_p2); + Int2D idx_per_chr, bp_per_chr; + std::tie(idx_per_chr, bp_per_chr) = get_target_snp_idx(snp_t, snp); if (params.out_of_core) { - auto colidx = valid_assoc_file(assocfiles[i], params.assoc_colnames); - Int2D idx_per_chr, bp_per_chr; - SNPld snp_t; - std::string head = get_snp_pos_bim(snp_t, assocfiles[i], true, colidx); - std::tie(idx_per_chr, bp_per_chr) = get_target_snp_idx(snp_t, snp); - const auto pvals_per_chr = - map_index_snps(assocfiles[i], colidx, params.clump_p2); MyMatrix G(data->nsamples, snp_t.pos.size()); - int b = 0, sidx = 0; + int b = 0; data->check_file_offset_first_var(); data->read_block_initial(data->start[b], data->stop[b], false); - Int1D cumlen(idx_per_chr.size()); - int cl = 0; - for (int c = 0; c < (int)idx_per_chr.size(); c++) { + for (int sidx = 0, c = 0; c < (int)idx_per_chr.size(); c++) { + int i = 0; for (auto icol : idx_per_chr[c]) { - if (icol >= data->start[b] && icol <= data->stop[b]) { - G.col(sidx) = data->G.col(icol - data->start[b]); - } else { + if (!(icol >= data->start[b] && icol <= data->stop[b])) { b++; data->read_block_initial(data->start[b], data->stop[b], false); - G.col(sidx) = data->G.col(icol - data->start[b]); } + G.col(sidx) = data->G.col(icol - data->start[b]); + idx_per_chr[c][i] = sidx; sidx++; + i++; } - cl += idx_per_chr[c].size(); - cumlen[c] = cl; - } - const MyArray sds = 1.0 / calc_sds(G); - const double df = 1.0 / (G.rows() - 1); // N-1 - std::string fileout = - params.fileout + ".p" + std::to_string(i) + ".clump"; - std::ofstream ofs(fileout); - ofs << head + "\tSP2" << std::endl; - for (int c = 0; c < (int)idx_per_chr.size(); c++) { - const auto bp = bp_per_chr[c]; - const auto mbp = vector2map(bp); - // greedy clumping algorithm - auto mpp = pvals_per_chr[c]; // key: pos, val: pval - Double1D pp; - Int1D ps; - for (auto it = mpp.begin(); it != mpp.end(); it++) { - if (it->second.first <= params.clump_p1) { - ps.push_back(it->first); - pp.push_back(it->second.first); - } - } - int p, p2, j, k; - for (auto i : sortidx(pp)) { // snps sorted by p value - p = ps[i]; - if (mpp.count(p) == 0) - continue; // if snps with pval < clump_p1 are already clumped - // with - // previous snps - Int1D clumped; - bool backward = true; - if (mbp.count(p) == 0) continue; - j = mbp.at(p); // j:current - k = j; // k:forward or backward - while (true) { - if (backward) { - --k; - if (k < 0 || (bp[k] < p - params.clump_bp)) { - backward = false; - k = j; - continue; - } - } else { - ++k; - if (k >= (int)bp.size() || (bp[k] > p + params.clump_bp)) break; - } - p2 = bp[k]; - if (mpp.count(p2) == 0) continue; - int cj = c > 0 ? cumlen[c - 1] + j - 1 : j; - int ck = c > 0 ? cumlen[c - 1] + k - 1 : k; - double r = G.col(cj).dot(G.col(ck)) * (sds(cj) * sds(ck) * df); - if (r * r >= params.clump_r2) { - clumped.push_back(p2); - mpp.erase(p2); - } - } - // what we do with clumped SNPs. sort them by pval? - ofs << pvals_per_chr[c].at(p).second << "\t"; - if (clumped.empty()) { - ofs << "NONE"; - } else { - Double1D opp; - for (auto op : clumped) - opp.push_back(pvals_per_chr[c].at(op).first); - k = 0; - for (auto oi : sortidx(opp)) { - if (k == opp.size() - 1) - ofs << clumped[oi]; - else - ofs << clumped[oi] << ","; - k++; - } - } - ofs << std::endl; - } - // end current chr + cao.print("snp_t size", snp_t.pos.size(), "sidx", sidx); } + + ld_clump_single_pheno( + params.fileout + ".p" + std::to_string(i) + ".clump", + params.clump_bp, params.clump_r2, params.clump_p1, params.clump_p2, + head, G, snp_t, snp, idx_per_chr, bp_per_chr, pvals_per_chr); + } else { - ld_clump_big(params.fileout + ".p" + std::to_string(i) + ".clump", - assocfiles[i], params.assoc_colnames, params.clump_bp, - params.clump_r2, params.clump_p1, params.clump_p2, snp, - data->G); + ld_clump_single_pheno( + params.fileout + ".p" + std::to_string(i) + ".clump", + params.clump_bp, params.clump_r2, params.clump_p1, params.clump_p2, + head, data->G, snp_t, snp, idx_per_chr, bp_per_chr, pvals_per_chr); } } } diff --git a/src/LD.hpp b/src/LD.hpp index 31252f7..be167a8 100644 --- a/src/LD.hpp +++ b/src/LD.hpp @@ -24,10 +24,11 @@ void ld_prune_small(Data* data, const std::string& fileout, void ld_prune_big(const std::string& fileout, const std::string& filebim, const MyMatrix& G, const SNPld& snp, double r2_tol); -void ld_clump_big(std::string fileout, std::string fileassoc, - std::string colnames, int clump_bp, double clump_r2, - double clump_p1, double clump_p2, const SNPld& snp, - const MyMatrix& G); +void ld_clump_single_pheno(std::string fileout, int clump_bp, double clump_r2, + double clump_p1, double clump_p2, std::string head, + const MyMatrix& G, const SNPld& snp_t, const SNPld& snp, + const Int2D& idx_per_chr, const Int2D& bp_per_chr, + const std::vector& pvals_per_chr); Int1D valid_assoc_file(const std::string& fileassoc, const std::string& colnames);