From b36bc67bcad63fdf6b757aa023434694eb07b4c0 Mon Sep 17 00:00:00 2001 From: Zilong-Li Date: Wed, 11 Dec 2024 15:57:41 +0100 Subject: [PATCH] update popl and cmd --- external/popl/popl.hpp | 17 ++++++--- src/Cmd.cpp | 80 +++++++++++++++++++++++------------------- 2 files changed, 56 insertions(+), 41 deletions(-) diff --git a/external/popl/popl.hpp b/external/popl/popl.hpp index 8fd668b..9cce497 100644 --- a/external/popl/popl.hpp +++ b/external/popl/popl.hpp @@ -3,9 +3,11 @@ ( _ \ / \( _ \( ) ) __/( O )) __// (_/\ (__) \__/(__) \____/ - version 1.3.0 + version 1.4.0 https://github.com/badaix/popl + modified by Zilong-Li Nov 03, 2024 + This file is part of popl (program options parser lib) Copyright (C) 2015-2021 Johannes Pohl @@ -41,7 +43,7 @@ namespace popl { -#define POPL_VERSION "1.3.0" +#define POPL_VERSION "1.4.0" /// Option's argument type @@ -62,6 +64,7 @@ enum class Argument /** * inactive: Option is not set and will not be parsed * hidden: Option is active, but will not show up in the help message + * headline: Option is a head line as start of a new section * required: Option must be set on the command line. Otherwise an exception will be thrown * optional: Option must not be set. Default attribute. * advanced: Option is advanced and will only show up in the advanced help message @@ -69,8 +72,9 @@ enum class Argument */ enum class Attribute { - inactive = 0, - hidden = 1, + inactive = -1, + hidden = 0, + headline = 1, required = 2, optional = 3, advanced = 4, @@ -1183,7 +1187,10 @@ inline std::string ConsoleOptionPrinter::print(const Attribute& max_attribute) c optionStr.resize(optionRightMargin, ' '); else optionStr += "\n" + std::string(optionRightMargin, ' '); - s << optionStr; + if (option->attribute() != Attribute::headline) + s << optionStr; + else + s << "\n"; std::string line; std::vector lines; diff --git a/src/Cmd.cpp b/src/Cmd.cpp index 3dc7724..d7981e4 100644 --- a/src/Cmd.cpp +++ b/src/Cmd.cpp @@ -24,40 +24,64 @@ Param::Param(int argc, char **argv) { " use csv file as input and apply the Implicitly Restarted Arnoldi Method\n" + " PCAone --csv csv.zst --svd 0 \n" + "\033[0m\n"}; - OptionParser opts(copyr + "Main options"); + OptionParser opts(copyr + "General options"); auto help_opt = opts.add("h", "help", "print all options including hidden advanced options"); + opts.add>("m", "memory", "RAM usage in GB unit for out-of-core mode. default is in-core mode", memory, &memory); + opts.add>("n", "threads", "the number of threads to be used", threads, &threads); + opts.add("v", "verbose", "verbose message output", &verbose); + opts.add, Attribute::headline>("","PCA","PCA algorithms:"); auto svd_opt = opts.add>("d", "svd", "SVD method to be applied. default 2 is recommended for big data.\n" "0: the Implicitly Restarted Arnoldi Method (IRAM)\n" "1: the Yu's single-pass Randomized SVD with power iterations\n" "2: the accurate window-based Randomized SVD method (PCAone)\n" "3: the full Singular Value Decomposition.", 2); - auto plinkfile = opts.add>("b", "bfile", "prefix to PLINK .bed/.bim/.fam files", "", &filein); - auto binfile = opts.add>("B", "binary", "path of binary file", "", &filein); - auto csvfile = opts.add>("c", "csv", "path of comma seperated CSV file compressed by zstd", "", &filein); - auto bgenfile = opts.add>("g", "bgen", "path of BGEN file compressed by gzip/zstd", "", &filein); - auto beaglefile = opts.add>("G", "beagle", "path of BEAGLE file compressed by gzip", "", &filein); - auto usvprefix = opts.add>("", "USV", "prefix to PCAone .eigvecs/.eigvals/.loadings/.mbim"); - opts.add>("", "read-U", "path of file with left singular vectors (.eigvecs)", "", &fileU); - opts.add>("", "read-V", "path of file with right singular vectors (.loadings)", "", &fileV); - opts.add>("", "read-S", "path of file with eigen values (.eigvals)", "", &fileS); opts.add>("k", "pc", "top k principal components (PCs) to be calculated", k, &k); - opts.add>("m", "memory", "RAM usage in GB unit for out-of-core mode. default is in-core mode", memory, &memory); - opts.add>("n", "threads", "the number of threads to be used", threads, &threads); - opts.add>("o", "out", "prefix to output files. default [pcaone]", fileout, &fileout); - opts.add>("p", "maxp", "maximum number of power iterations for RSVD algorithm", maxp, &maxp); - opts.add("S", "no-shuffle", "do not shuffle columns of data for --svd 2 (if not locally correlated)", &noshuffle); - opts.add("v", "verbose", "verbose message output", &verbose); - opts.add("V", "printv", "output the right eigenvectors with suffix .loadings", &printv); - opts.add, Attribute::advanced>("w", "batches", "the number of mini-batches used by --svd 2", bands, &bands); opts.add>("C", "scale", "do scaling for input file.\n" "0: do just centering\n" "1: do log transformation eg. log(x+0.01) for RNA-seq data\n" "2: do count per median log transformation (CPMED) for scRNAs", scale, &scale); + opts.add>("p", "maxp", "maximum number of power iterations for RSVD algorithm", maxp, &maxp); + opts.add("S", "no-shuffle", "do not shuffle columns of data for --svd 2 (if not locally correlated)", &noshuffle); + opts.add, Attribute::advanced>("w", "batches", "the number of mini-batches used by --svd 2", bands, &bands); opts.add("", "emu", "use EMU algorithm for genotype input with missingness", &emu); opts.add("", "pcangsd", "use PCAngsd algorithm for genotype likelihood input", &pcangsd); - opts.add>("", "maf", "exclude variants with MAF lower than this value", maf, &maf); + opts.add, Attribute::advanced>("", "M", "the number of features (eg. SNPs) if already known", 0, &nsnps); + opts.add, Attribute::advanced>("", "N", "the number of samples if already known", 0, &nsamples); + // opts.add("", "debug", "turn on debugging mode", &debug); + opts.add, Attribute::advanced>("", "buffer", "memory buffer in GB unit for permuting the data", buffer, &buffer); + opts.add, Attribute::advanced>("", "imaxiter", "maximum number of IRAM iterations", imaxiter, &imaxiter); + opts.add, Attribute::advanced>("", "itol", "stopping tolerance for IRAM algorithm", itol, &itol); + opts.add, Attribute::advanced>("", "ncv", "the number of Lanzcos basis vectors for IRAM", ncv, &ncv); + opts.add, Attribute::advanced>("", "oversamples", "the number of oversampling columns for RSVD", oversamples, &oversamples); + opts.add, Attribute::advanced>("", "rand", "the random matrix type. 0: uniform, 1: guassian", rand, &rand); + opts.add, Attribute::advanced>("", "maxiter", "maximum number of EM iterations", maxiter, &maxiter); + opts.add, Attribute::advanced>("", "tol-rsvd", "tolerance for RSVD algorithm", tol, &tol); + opts.add, Attribute::advanced>("", "tol-em", "tolerance for EMU/PCAngsd algorithm", tolem, &tolem); + opts.add, Attribute::advanced>("", "tol-maf", "tolerance for MAF estimation by EM", tolmaf, &tolmaf); + opts.add("", "printu", "output eigen vector of each epoch (for tests)", &printu); + + opts.add, Attribute::headline>("","INPUT","Input options:"); + auto plinkfile = opts.add>("b", "bfile", "prefix of PLINK .bed/.bim/.fam files", "", &filein); + opts.add("", "haploid", "the plink format represents haploid data", &haploid); + auto binfile = opts.add>("B", "binary", "path of binary file", "", &filein); + auto csvfile = opts.add>("c", "csv", "path of comma seperated CSV file compressed by zstd", "", &filein); + auto bgenfile = opts.add>("g", "bgen", "path of BGEN file compressed by gzip/zstd", "", &filein); + auto beaglefile = opts.add>("G", "beagle", "path of BEAGLE file compressed by gzip", "", &filein); opts.add>("", "match-bim", "the .mbim file to be matched, where the 7th column is allele frequency", "", &filebim); + auto usvprefix = opts.add>("", "USV", "prefix of PCAone .eigvecs/.eigvals/.loadings/.mbim"); + opts.add, Attribute::advanced>("", "read-U", "path of file with left singular vectors (.eigvecs)", "", &fileU); + opts.add, Attribute::advanced>("", "read-V", "path of file with right singular vectors (.loadings)", "", &fileV); + opts.add, Attribute::advanced>("", "read-S", "path of file with eigen values (.eigvals)", "", &fileS); + + opts.add, Attribute::headline>("","OUTPUT","Output options:"); + opts.add>("o", "out", "prefix of output files. default [pcaone]", fileout, &fileout); + opts.add("V", "printv", "output the right eigenvectors with suffix .loadings", &printv); + opts.add("", "ld", "output a binary matrix for downstream LD related analysis", &ld); + opts.add("", "print-r2", "print LD r2 to *.ld.gz file for pairwise SNPs within a window", &print_r2); + + opts.add, Attribute::headline>("","MISC","Misc options:"); + opts.add>("", "maf", "exclude variants with MAF lower than this value", maf, &maf); opts.add>("", "project", "project the new samples onto the existing PCs.\n" "0: disabled\n" "1: by multiplying the loadings with mean imputation for missing genotypes\n" @@ -68,36 +92,20 @@ Param::Param(int argc, char **argv) { "0: disabled\n" "1: compute per-site inbreeding coefficient and HWE test\n", inbreed, &inbreed); - opts.add("", "ld", "output a binary matrix for downstream LD related analysis", &ld); opts.add>("", "ld-r2", "r2 cutoff for LD-based pruning. (usually 0.2)", ld_r2, &ld_r2); opts.add>("", "ld-bp", "physical distance threshold in bases for LD. (usually 1000000)", ld_bp, &ld_bp); opts.add>("", "ld-stats", "statistics to calculate LD r2 for pairwise SNPs.\n" "0: the ancestry adjusted, i.e. correlation between residuals\n" "1: the standard, i.e. correlation between two alleles\n", ld_stats, &ld_stats); - opts.add("", "print-r2", "print LD r2 to *.ld.gz file for pairwise SNPs within a window", &print_r2); auto clumpfile = opts.add>("", "clump", "assoc-like file with target variants and pvalues for clumping", "", &clump); auto assocnames = opts.add>("", "clump-names", "column names in assoc-like file for locating chr, pos and pvalue", "CHR,BP,P", &assoc_colnames); opts.add>("", "clump-p1", "significance threshold for index SNPs", clump_p1, &clump_p1); opts.add>("", "clump-p2", "secondary significance threshold for clumped SNPs", clump_p2, &clump_p2); opts.add>("", "clump-r2", "r2 cutoff for LD-based clumping", clump_r2, &clump_r2); opts.add>("", "clump-bp", "physical distance threshold in bases for clumping", clump_bp, &clump_bp); - opts.add("", "printu", "output eigen vector of each epoch (for tests)", &printu); - opts.add, Attribute::advanced>("", "M", "the number of features (eg. SNPs) if already known", 0, &nsnps); - opts.add, Attribute::advanced>("", "N", "the number of samples if already known", 0, &nsamples); - // opts.add("", "debug", "turn on debugging mode", &debug); - opts.add("", "haploid", "the plink format represents haploid data", &haploid); - opts.add, Attribute::advanced>("", "buffer", "memory buffer in GB unit for permuting the data", buffer, &buffer); - opts.add, Attribute::advanced>("", "imaxiter", "maximum number of IRAM iterations", imaxiter, &imaxiter); - opts.add, Attribute::advanced>("", "itol", "stopping tolerance for IRAM algorithm", itol, &itol); - opts.add, Attribute::advanced>("", "ncv", "the number of Lanzcos basis vectors for IRAM", ncv, &ncv); - opts.add, Attribute::advanced>("", "oversamples", "the number of oversampling columns for RSVD", oversamples, &oversamples); - opts.add, Attribute::advanced>("", "rand", "the random matrix type. 0: uniform, 1: guassian", rand, &rand); - opts.add, Attribute::advanced>("", "maxiter", "maximum number of EM iterations", maxiter, &maxiter); - opts.add, Attribute::advanced>("", "tol-rsvd", "tolerance for RSVD algorithm", tol, &tol); - opts.add, Attribute::advanced>("", "tol-em", "tolerance for EMU/PCAngsd algorithm", tolem, &tolem); - opts.add, Attribute::advanced>("", "tol-maf", "tolerance for MAF estimation by EM", tolmaf, &tolmaf); opts.add("", "groff", "print groff formatted help message", &groff); + // collect command line options acutal in effect ss << (std::string) "PCAone (v" + VERSION + ") https://github.com/Zilong-Li/PCAone\n"; ss << "Options in effect:\n";