Skip to content

Commit

Permalink
update popl and cmd
Browse files Browse the repository at this point in the history
  • Loading branch information
Zilong-Li committed Nov 4, 2024
1 parent 5f58e28 commit 68c779f
Show file tree
Hide file tree
Showing 2 changed files with 56 additions and 41 deletions.
17 changes: 12 additions & 5 deletions external/popl/popl.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -3,9 +3,11 @@
( _ \ / \( _ \( )
) __/( O )) __// (_/\
(__) \__/(__) \____/
version 1.3.0
version 1.4.0
https://github.com/badaix/popl
modified by Zilong-Li Nov 03, 2024
This file is part of popl (program options parser lib)
Copyright (C) 2015-2021 Johannes Pohl
Expand Down Expand Up @@ -41,7 +43,7 @@
namespace popl
{

#define POPL_VERSION "1.3.0"
#define POPL_VERSION "1.4.0"


/// Option's argument type
Expand All @@ -62,15 +64,17 @@ enum class Argument
/**
* inactive: Option is not set and will not be parsed
* hidden: Option is active, but will not show up in the help message
* headline: Option is a headline that marks the start of a new section
* required: Option must be set on the command line. Otherwise an exception will be thrown
* optional: Option does not have to be set. Default attribute.
* advanced: Option is advanced and will only show up in the advanced help message
* expert: Option is expert and will only show up in the expert help message
*/
enum class Attribute
{
inactive = 0,
hidden = 1,
inactive = -1,
hidden = 0,
headline = 1,
required = 2,
optional = 3,
advanced = 4,
Expand Down Expand Up @@ -1183,7 +1187,10 @@ inline std::string ConsoleOptionPrinter::print(const Attribute& max_attribute) c
optionStr.resize(optionRightMargin, ' ');
else
optionStr += "\n" + std::string(optionRightMargin, ' ');
s << optionStr;
if (option->attribute() != Attribute::headline)
s << optionStr;
else
s << "\n";

std::string line;
std::vector<std::string> lines;
Expand Down
80 changes: 44 additions & 36 deletions src/Cmd.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -24,40 +24,64 @@ Param::Param(int argc, char **argv) {
" use csv file as input and apply the Implicitly Restarted Arnoldi Method\n" +
" PCAone --csv csv.zst --svd 0 \n" +
"\033[0m\n"};
OptionParser opts(copyr + "Main options");
OptionParser opts(copyr + "Help options");
auto help_opt = opts.add<Switch>("h", "help", "print all options including hidden advanced options");
opts.add<Value<std::string>, Attribute::headline>("","PCA","PCA algorithms:");
auto svd_opt = opts.add<Value<uint>>("d", "svd", "SVD method to be applied. default 2 is recommended for big data.\n"
"0: the Implicitly Restarted Arnoldi Method (IRAM)\n"
"1: the Yu's single-pass Randomized SVD with power iterations\n"
"2: the accurate window-based Randomized SVD method (PCAone)\n"
"3: the full Singular Value Decomposition.", 2);
auto plinkfile = opts.add<Value<std::string>>("b", "bfile", "prefix to PLINK .bed/.bim/.fam files", "", &filein);
auto binfile = opts.add<Value<std::string>>("B", "binary", "path of binary file", "", &filein);
auto csvfile = opts.add<Value<std::string>>("c", "csv", "path of comma seperated CSV file compressed by zstd", "", &filein);
auto bgenfile = opts.add<Value<std::string>>("g", "bgen", "path of BGEN file compressed by gzip/zstd", "", &filein);
auto beaglefile = opts.add<Value<std::string>>("G", "beagle", "path of BEAGLE file compressed by gzip", "", &filein);
auto usvprefix = opts.add<Value<std::string>>("", "USV", "prefix to PCAone .eigvecs/.eigvals/.loadings/.mbim");
opts.add<Value<std::string>>("", "read-U", "path of file with left singular vectors (.eigvecs)", "", &fileU);
opts.add<Value<std::string>>("", "read-V", "path of file with right singular vectors (.loadings)", "", &fileV);
opts.add<Value<std::string>>("", "read-S", "path of file with eigen values (.eigvals)", "", &fileS);
opts.add<Value<uint>>("k", "pc", "top k principal components (PCs) to be calculated", k, &k);
opts.add<Value<double>>("m", "memory", "RAM usage in GB unit for out-of-core mode. default is in-core mode", memory, &memory);
opts.add<Value<uint>>("n", "threads", "the number of threads to be used", threads, &threads);
opts.add<Value<std::string>>("o", "out", "prefix to output files. default [pcaone]", fileout, &fileout);
opts.add<Value<uint>>("p", "maxp", "maximum number of power iterations for RSVD algorithm", maxp, &maxp);
opts.add<Switch>("S", "no-shuffle", "do not shuffle columns of data for --svd 2 (if not locally correlated)", &noshuffle);
opts.add<Switch>("v", "verbose", "verbose message output", &verbose);
opts.add<Switch>("V", "printv", "output the right eigenvectors with suffix .loadings", &printv);
opts.add<Value<uint>, Attribute::advanced>("w", "batches", "the number of mini-batches used by --svd 2", bands, &bands);
opts.add<Value<uint>>("C", "scale", "do scaling for input file.\n"
"0: do just centering\n"
"1: do log transformation eg. log(x+0.01) for RNA-seq data\n"
"2: do count per median log transformation (CPMED) for scRNAs",
scale, &scale);
opts.add<Value<uint>>("p", "maxp", "maximum number of power iterations for RSVD algorithm", maxp, &maxp);
opts.add<Switch>("S", "no-shuffle", "do not shuffle columns of data for --svd 2 (if not locally correlated)", &noshuffle);
opts.add<Value<uint>, Attribute::advanced>("w", "batches", "the number of mini-batches used by --svd 2", bands, &bands);
opts.add<Switch>("", "emu", "use EMU algorithm for genotype input with missingness", &emu);
opts.add<Switch>("", "pcangsd", "use PCAngsd algorithm for genotype likelihood input", &pcangsd);
opts.add<Value<double>>("", "maf", "exclude variants with MAF lower than this value", maf, &maf);
opts.add<Value<double>>("m", "memory", "RAM usage in GB unit for out-of-core mode. default is in-core mode", memory, &memory);
opts.add<Value<uint>>("n", "threads", "the number of threads to be used", threads, &threads);
opts.add<Value<uint>, Attribute::advanced>("", "M", "the number of features (eg. SNPs) if already known", 0, &nsnps);
opts.add<Value<uint>, Attribute::advanced>("", "N", "the number of samples if already known", 0, &nsamples);
// opts.add<Switch, Attribute::advanced>("", "debug", "turn on debugging mode", &debug);
opts.add<Value<uint>, Attribute::advanced>("", "buffer", "memory buffer in GB unit for permuting the data", buffer, &buffer);
opts.add<Value<uint>, Attribute::advanced>("", "imaxiter", "maximum number of IRAM iterations", imaxiter, &imaxiter);
opts.add<Value<double>, Attribute::advanced>("", "itol", "stopping tolerance for IRAM algorithm", itol, &itol);
opts.add<Value<uint>, Attribute::advanced>("", "ncv", "the number of Lanzcos basis vectors for IRAM", ncv, &ncv);
opts.add<Value<uint>, Attribute::advanced>("", "oversamples", "the number of oversampling columns for RSVD", oversamples, &oversamples);
opts.add<Value<uint>, Attribute::advanced>("", "rand", "the random matrix type. 0: uniform, 1: guassian", rand, &rand);
opts.add<Value<uint>, Attribute::advanced>("", "maxiter", "maximum number of EM iterations", maxiter, &maxiter);
opts.add<Value<double>, Attribute::advanced>("", "tol-rsvd", "tolerance for RSVD algorithm", tol, &tol);
opts.add<Value<double>, Attribute::advanced>("", "tol-em", "tolerance for EMU/PCAngsd algorithm", tolem, &tolem);
opts.add<Value<double>, Attribute::advanced>("", "tol-maf", "tolerance for MAF estimation by EM", tolmaf, &tolmaf);
opts.add<Switch, Attribute::hidden>("", "printu", "output eigen vector of each epoch (for tests)", &printu);

opts.add<Value<std::string>, Attribute::headline>("","INPUT","Input options:");
auto plinkfile = opts.add<Value<std::string>>("b", "bfile", "prefix of PLINK .bed/.bim/.fam files", "", &filein);
opts.add<Switch, Attribute::advanced>("", "haploid", "the plink format represents haploid data", &haploid);
auto binfile = opts.add<Value<std::string>>("B", "binary", "path of binary file", "", &filein);
auto csvfile = opts.add<Value<std::string>>("c", "csv", "path of comma seperated CSV file compressed by zstd", "", &filein);
auto bgenfile = opts.add<Value<std::string>>("g", "bgen", "path of BGEN file compressed by gzip/zstd", "", &filein);
auto beaglefile = opts.add<Value<std::string>>("G", "beagle", "path of BEAGLE file compressed by gzip", "", &filein);
opts.add<Value<std::string>>("", "match-bim", "the .mbim file to be matched, where the 7th column is allele frequency", "", &filebim);
auto usvprefix = opts.add<Value<std::string>>("", "USV", "prefix of PCAone .eigvecs/.eigvals/.loadings/.mbim");
opts.add<Value<std::string>, Attribute::advanced>("", "read-U", "path of file with left singular vectors (.eigvecs)", "", &fileU);
opts.add<Value<std::string>, Attribute::advanced>("", "read-V", "path of file with right singular vectors (.loadings)", "", &fileV);
opts.add<Value<std::string>, Attribute::advanced>("", "read-S", "path of file with eigen values (.eigvals)", "", &fileS);

opts.add<Value<std::string>, Attribute::headline>("","OUTPUT","Output options:");
opts.add<Value<std::string>>("o", "out", "prefix of output files. default [pcaone]", fileout, &fileout);
opts.add<Switch>("v", "verbose", "verbose message output", &verbose);
opts.add<Switch>("V", "printv", "output the right eigenvectors with suffix .loadings", &printv);
opts.add<Switch>("", "ld", "output a binary matrix for downstream LD related analysis", &ld);
opts.add<Switch>("", "print-r2", "print LD r2 to *.ld.gz file for pairwise SNPs within a window", &print_r2);

opts.add<Value<std::string>, Attribute::headline>("","MISC","Misc options:");
opts.add<Value<double>>("", "maf", "exclude variants with MAF lower than this value", maf, &maf);
opts.add<Value<int>>("", "project", "project the new samples onto the existing PCs.\n"
"0: disabled\n"
"1: by multiplying the loadings with mean imputation for missing genotypes\n"
Expand All @@ -68,36 +92,20 @@ Param::Param(int argc, char **argv) {
"0: disabled\n"
"1: compute per-site inbreeding coefficient and HWE test\n",
inbreed, &inbreed);
opts.add<Switch>("", "ld", "output a binary matrix for downstream LD related analysis", &ld);
opts.add<Value<double>>("", "ld-r2", "r2 cutoff for LD-based pruning. (usually 0.2)", ld_r2, &ld_r2);
opts.add<Value<uint>>("", "ld-bp", "physical distance threshold in bases for LD. (usually 1000000)", ld_bp, &ld_bp);
opts.add<Value<int>>("", "ld-stats", "statistics to calculate LD r2 for pairwise SNPs.\n"
"0: the ancestry adjusted, i.e. correlation between residuals\n"
"1: the standard, i.e. correlation between two alleles\n",
ld_stats, &ld_stats);
opts.add<Switch>("", "print-r2", "print LD r2 to *.ld.gz file for pairwise SNPs within a window", &print_r2);
auto clumpfile = opts.add<Value<std::string>>("", "clump", "assoc-like file with target variants and pvalues for clumping", "", &clump);
auto assocnames = opts.add<Value<std::string>>("", "clump-names", "column names in assoc-like file for locating chr, pos and pvalue", "CHR,BP,P", &assoc_colnames);
opts.add<Value<double>>("", "clump-p1", "significance threshold for index SNPs", clump_p1, &clump_p1);
opts.add<Value<double>>("", "clump-p2", "secondary significance threshold for clumped SNPs", clump_p2, &clump_p2);
opts.add<Value<double>>("", "clump-r2", "r2 cutoff for LD-based clumping", clump_r2, &clump_r2);
opts.add<Value<uint>>("", "clump-bp", "physical distance threshold in bases for clumping", clump_bp, &clump_bp);
opts.add<Switch, Attribute::advanced>("", "printu", "output eigen vector of each epoch (for tests)", &printu);
opts.add<Value<uint>, Attribute::advanced>("", "M", "the number of features (eg. SNPs) if already known", 0, &nsnps);
opts.add<Value<uint>, Attribute::advanced>("", "N", "the number of samples if already known", 0, &nsamples);
// opts.add<Switch, Attribute::advanced>("", "debug", "turn on debugging mode", &debug);
opts.add<Switch, Attribute::advanced>("", "haploid", "the plink format represents haploid data", &haploid);
opts.add<Value<uint>, Attribute::advanced>("", "buffer", "memory buffer in GB unit for permuting the data", buffer, &buffer);
opts.add<Value<uint>, Attribute::advanced>("", "imaxiter", "maximum number of IRAM iterations", imaxiter, &imaxiter);
opts.add<Value<double>, Attribute::advanced>("", "itol", "stopping tolerance for IRAM algorithm", itol, &itol);
opts.add<Value<uint>, Attribute::advanced>("", "ncv", "the number of Lanzcos basis vectors for IRAM", ncv, &ncv);
opts.add<Value<uint>, Attribute::advanced>("", "oversamples", "the number of oversampling columns for RSVD", oversamples, &oversamples);
opts.add<Value<uint>, Attribute::advanced>("", "rand", "the random matrix type. 0: uniform, 1: guassian", rand, &rand);
opts.add<Value<uint>, Attribute::advanced>("", "maxiter", "maximum number of EM iterations", maxiter, &maxiter);
opts.add<Value<double>, Attribute::advanced>("", "tol-rsvd", "tolerance for RSVD algorithm", tol, &tol);
opts.add<Value<double>, Attribute::advanced>("", "tol-em", "tolerance for EMU/PCAngsd algorithm", tolem, &tolem);
opts.add<Value<double>, Attribute::advanced>("", "tol-maf", "tolerance for MAF estimation by EM", tolmaf, &tolmaf);
opts.add<Switch, Attribute::hidden>("", "groff", "print groff formatted help message", &groff);

// collect command line options actually in effect
ss << (std::string) "PCAone (v" + VERSION + ") https://github.com/Zilong-Li/PCAone\n";
ss << "Options in effect:\n";
Expand Down

0 comments on commit 68c779f

Please sign in to comment.