Skip to content

Commit

Permalink
add stuff
Browse files Browse the repository at this point in the history
  • Loading branch information
Zilong-Li committed Dec 20, 2024
1 parent 32c2e05 commit 6d3e3e3
Show file tree
Hide file tree
Showing 5 changed files with 91 additions and 49 deletions.
34 changes: 32 additions & 2 deletions src/Data.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -288,7 +288,7 @@ void Data::standardize_E() {

void Data::pcangsd_standardize_E(const Mat2D &U, const Mat1D &svals, const Mat2D &VT) {
cao.print(tick.date(), "begin to standardize the matrix for pcangsd");
uint ks = svals.size();
uint nk = svals.size();
Dc = Mat1D::Zero(nsamples);
#pragma omp parallel
{
Expand All @@ -301,7 +301,7 @@ void Data::pcangsd_standardize_E(const Mat2D &U, const Mat1D &svals, const Mat2D
for (uint i = 0; i < nsamples; i++) {
// Rescale individual allele frequencies
pt = 0.0;
for (uint k = 0; k < ks; ++k) {
for (uint k = 0; k < nk; ++k) {
pt += U(i, k) * svals(k) * VT(k, j);
}
pt = (pt + 2.0 * F(j)) / 2.0;
Expand Down Expand Up @@ -329,3 +329,33 @@ void Data::pcangsd_standardize_E(const Mat2D &U, const Mat1D &svals, const Mat2D
}
}
}

void Data::predict_block_E(uint64 start_idx, uint64 stop_idx, const Mat2D &U) {
// assume full size G is in memory
// Eigen::ColPivHouseholderQR<Mat2D> qr(U); // nsamples x nk
const uint nk = U.cols();
// get beta by linear regresion on data without NAs
// Ux = g, x is the beta
#pragma omp parallel for
for (uint j = start_idx; j <= stop_idx; j++) {
// find non-missing samples
Int1D idx;
for (uint i = 0; i < nsamples; i++) {
if (!C(j * nsamples + i)) idx.push_back(i);
}
// cao.print("predict", idx.size());
// what if idx.size()==0
Mat1D v = U(idx, Eigen::all).colPivHouseholderQr().solve(G(idx, j));
// predict for NAs in E
for (uint i = 0; i < nsamples; i++) {
if (C(j * nsamples + i)) // bool & 1
{ // sites need to be predicted
G(i, j) = 0.0;
for (uint k = 0; k < nk; ++k) {
G(i, j) += U(i, k) * v(k);
}
G(i, j) = fmin(fmax(G(i, j), -F(j)), 1 - F(j));
}
}
}
}
2 changes: 2 additions & 0 deletions src/Data.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,8 @@ class Data {
// for blockwise
void calcu_vt_initial(const Mat2D& T, Mat2D& VT, bool standardize);
void calcu_vt_update(const Mat2D& T, const Mat2D& U, const Mat1D& svals, Mat2D& VT, bool standardize);
// given PCs, predict the missing values, then update in place by block
void predict_block_E(uint64 start_idx, uint64 stop_idx, const Mat2D& U);

const Param& params;

Expand Down
92 changes: 51 additions & 41 deletions src/Halko.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -8,37 +8,52 @@

using namespace std;

void RsvdOpData::computeB(Mat2D& B, Mat2D& G, const Mat2D& H) {
Mat2D RsvdOpData::computeU(const Mat2D& G, const Mat2D& H) {
const Index size{ranks() + oversamples()};
const Index nrow{rows()};
Mat2D R(size, size), Rt(size, size);
const Index nrow{G.rows()};
const Index nk{ranks()};
Mat2D R(size, size), Rt(size, size), Gt(nrow, G.cols());
Eigen::HouseholderQR<Mat2D> qr(G);
R.noalias() = Mat2D::Identity(size, nrow) * qr.matrixQR().triangularView<Eigen::Upper>(); // get R1
Gt.noalias() = qr.householderQ() * Mat2D::Identity(nrow, size);
{
Eigen::HouseholderQR<Eigen::Ref<Mat2D>> qr(G);
R.noalias() = Mat2D::Identity(size, nrow) * qr.matrixQR().triangularView<Eigen::Upper>(); // get R1
G.noalias() = qr.householderQ() * Mat2D::Identity(nrow, size); // hold Q1 in G
}
{
Eigen::HouseholderQR<Eigen::Ref<Mat2D>> qr(G);
Eigen::HouseholderQR<Eigen::Ref<Mat2D>> qr(Gt);
Rt.noalias() = Mat2D::Identity(size, nrow) * qr.matrixQR().triangularView<Eigen::Upper>(); // get R2
G.noalias() = qr.householderQ() * Mat2D::Identity(nrow, size); // hold Q2 in G
Gt.noalias() = qr.householderQ() * Mat2D::Identity(nrow, size); // hold Q2 in Gt
}
R = Rt * R; // get R = R1R2;
R = Rt * R; // get R = Rt * R
// B is size x ncol
// R.T * B = H.T
B.noalias() = R.transpose().fullPivHouseholderQr().solve(H.transpose());
Mat2D B = R.transpose().fullPivHouseholderQr().solve(H.transpose());
Eigen::BDCSVD<Mat2D> svd(B, Eigen::ComputeThinU | Eigen::ComputeThinV);
// B is K x nsamples, thus we use matrixV() for PCs
return svd.matrixV().leftCols(nk);
}

void RsvdOpData::computeUSV(int p, double tol) {
const Index size{ranks() + oversamples()};
const Index k{ranks()};
const Index nrow{rows()};
const Index ncol{cols()};
Mat2D Upre, H(ncol, size), G(nrow, size), B(size, ncol);
const Index nrow{rows()}; // nsnps
const Index ncol{cols()}; // nsamples
Mat2D Upre, H(ncol, size), G(nrow, size), B(size, ncol), R(size, size), Rt(size, size);
double diff;
for (int pi = 0; pi <= p; ++pi) {
computeGandH(G, H, pi);
computeB(B, G, H);
// check if converged
{
Eigen::HouseholderQR<Eigen::Ref<Mat2D>> qr(G);
R.noalias() = Mat2D::Identity(size, nrow) * qr.matrixQR().triangularView<Eigen::Upper>(); // get R1
G.noalias() = qr.householderQ() * Mat2D::Identity(nrow, size); // hold Q1 in G
}
{
Eigen::HouseholderQR<Eigen::Ref<Mat2D>> qr(G);
Rt.noalias() = Mat2D::Identity(size, nrow) * qr.matrixQR().triangularView<Eigen::Upper>(); // get R2
G.noalias() = qr.householderQ() * Mat2D::Identity(nrow, size); // hold Q2 in G
}
R = Rt * R; // get R = R1R2;
// B is size x ncol
// R.T * B = H.T
B.noalias() = R.transpose().fullPivHouseholderQr().solve(H.transpose());
Eigen::BDCSVD<Mat2D> svd(B, Eigen::ComputeThinU | Eigen::ComputeThinV);
U = svd.matrixV().leftCols(k);
if (data->params.printu) {
Expand Down Expand Up @@ -160,7 +175,7 @@ void FancyRsvdOpData::computeGandH(Mat2D& G, Mat2D& H, int pi) {
data->standardize_E();
}
}
band = 1;
bandsize = 1;
blocksize = (unsigned int)ceil((double)data->nsnps / data->params.bands);
if (blocksize < data->params.bands)
cao.warn("block size < window size. please consider the IRAM method");
Expand All @@ -169,8 +184,8 @@ void FancyRsvdOpData::computeGandH(Mat2D& G, Mat2D& H, int pi) {
if (data->params.perm) PCAone::permute_matrix(data->G, data->perm);
}
{
// band : 2, 4, 8, 16, 32, 64, 128
band = fmin(band * 2, data->params.bands);
// bandsize : 2, 4, 8, 16, 32, 64, 128
bandsize = fmin(bandsize * 2, data->params.bands);
for (uint b = 0, i = 1, j = 1; b < data->params.bands; ++b, ++i, ++j) {
start_idx = b * blocksize;
stop_idx = (b + 1) * blocksize >= data->nsnps ? data->nsnps - 1 : (b + 1) * blocksize - 1;
Expand All @@ -188,23 +203,23 @@ void FancyRsvdOpData::computeGandH(Mat2D& G, Mat2D& H, int pi) {
flip_Omg(Omg2, Omg);
H2.setZero();
}
} else if (i <= band / 2) {
} else if (i <= bandsize / 2) {
// continues to add in data based on current band
H1.noalias() +=
data->G.middleCols(start_idx, actual_block_size) * G.middleRows(start_idx, actual_block_size);
} else if (i > band / 2 && i <= band) {
} else if (i > bandsize / 2 && i <= bandsize) {
H2.noalias() +=
data->G.middleCols(start_idx, actual_block_size) * G.middleRows(start_idx, actual_block_size);
}
if ((b + 1) >= band) {
if (i == band) {
if ((b + 1) >= bandsize) {
if (i == bandsize) {
H = H1 + H2;
Eigen::HouseholderQR<Mat2D> qr(H);
Omg.noalias() = qr.householderQ() * Mat2D::Identity(cols(), size);
flip_Omg(Omg2, Omg);
H1.setZero();
i = 0;
} else if (i == band / 2) {
} else if (i == bandsize / 2) {
H = H1 + H2;
Eigen::HouseholderQR<Mat2D> qr(H);
Omg.noalias() = qr.householderQ() * Mat2D::Identity(cols(), size);
Expand All @@ -217,28 +232,23 @@ void FancyRsvdOpData::computeGandH(Mat2D& G, Mat2D& H, int pi) {
Omg.noalias() = qr.householderQ() * Mat2D::Identity(cols(), size);
flip_Omg(Omg2, Omg);
}
// update estimates rightaway
if (update && data->params.fancyem) {
cao.print(b, ",", i, ",", j, ",", band);
Mat2D B(size, H.rows());
computeB(B, G, H);
Eigen::BDCSVD<Mat2D> svd(B, Eigen::ComputeThinU | Eigen::ComputeThinV);
U = svd.matrixV().leftCols(nk);
V.noalias() = G * svd.matrixU().leftCols(nk);
S = svd.singularValues().head(nk);
data->update_batch_E(U, S, V.transpose());
// update estimates rightaway
if (data->params.fancyem) {
U = computeU(G.middleRows(start_idx, actual_block_size), H); // only use G in band
if (data->params.verbose > 1) cao.print(b, ",", i, ",", j, ",", bandsize);
data->predict_block_E(start_idx, stop_idx, U);
}
}
}
}
} else {
if (pi == 0) {
band = data->bandFactor;
bandsize = data->bandFactor;
}
{
data->check_file_offset_first_var();
// band : 2, 4, 8, 16, 32, 64
band = fmin(band * 2, data->nblocks);
bandsize = fmin(bandsize * 2, data->nblocks);
for (uint b = 0, i = 1, j = 1; b < data->nblocks; ++b, ++i, ++j) {
start_idx = data->start[b];
stop_idx = data->stop[b];
Expand All @@ -261,20 +271,20 @@ void FancyRsvdOpData::computeGandH(Mat2D& G, Mat2D& H, int pi) {
flip_Omg(Omg2, Omg);
H2.setZero();
}
} else if (i <= band / 2) {
} else if (i <= bandsize / 2) {
H1.noalias() += data->G * G.middleRows(start_idx, actual_block_size);
} else if (i > band / 2 && i <= band) {
} else if (i > bandsize / 2 && i <= bandsize) {
H2.noalias() += data->G * G.middleRows(start_idx, actual_block_size);
}
if ((b + 1) >= band) {
if (i == band) {
if ((b + 1) >= bandsize) {
if (i == bandsize) {
H = H1 + H2;
Eigen::HouseholderQR<Mat2D> qr(H);
Omg.noalias() = qr.householderQ() * Mat2D::Identity(cols(), size);
flip_Omg(Omg2, Omg);
H1 = Mat2D::Zero(cols(), size);
i = 0;
} else if (i == band / 2) {
} else if (i == bandsize / 2) {
H = H1 + H2;
Eigen::HouseholderQR<Mat2D> qr(H);
Omg.noalias() = qr.householderQ() * Mat2D::Identity(cols(), size);
Expand Down
4 changes: 2 additions & 2 deletions src/Halko.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -32,7 +32,7 @@ class RsvdOpData {

void computeUSV(int p, double tol);

void computeB(Mat2D& B, Mat2D& G, const Mat2D& H);
Mat2D computeU(const Mat2D& G, const Mat2D& H);
};

class NormalRsvdOpData : public RsvdOpData {
Expand All @@ -57,7 +57,7 @@ class NormalRsvdOpData : public RsvdOpData {
class FancyRsvdOpData : public RsvdOpData {
private:
const Index nk, os, size;
uint64 band, blocksize, actual_block_size, start_idx, stop_idx;
uint64 bandsize, blocksize, actual_block_size, start_idx, stop_idx;
Mat2D Omg, Omg2, H1, H2;

public:
Expand Down
8 changes: 4 additions & 4 deletions src/Projection.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -44,11 +44,11 @@ void run_projection(Data* data, const Param& params) {

if (p_miss == 0.0) {
cao.warn("there is no missing genotypes");
Eigen::BDCSVD<Mat2D> svd(V, Eigen::ComputeThinU | Eigen::ComputeThinV);
Eigen::ColPivHouseholderQR<Mat2D> qr(V);

for (uint i = 0; i < data->nsamples; i++) {
// Vx = g
U.row(i) = svd.solve(data->G.row(i).transpose());
U.row(i) = qr.solve(data->G.row(i).transpose());
}
} else {
Int1D idx;
Expand All @@ -58,10 +58,10 @@ void run_projection(Data* data, const Param& params) {
for (int j = 0; j < V.rows(); j++) {
if (!data->C(j * data->nsamples + i)) idx.push_back(j);
}
Eigen::BDCSVD<Mat2D> svd(V(idx, Eigen::all), Eigen::ComputeThinU | Eigen::ComputeThinV);
Eigen::ColPivHouseholderQR<Mat2D> qr(V(idx, Eigen::all));
Mat1D g = data->G(i, idx);
// Vx = g
U.row(i) = svd.solve(g);
U.row(i) = qr.solve(g);
}
}

Expand Down

0 comments on commit 6d3e3e3

Please sign in to comment.